summaryrefslogtreecommitdiffstats
path: root/libglusterfs
diff options
context:
space:
mode:
authorRajesh Amaravathi <rajesh@redhat.com>2012-02-20 12:31:10 +0530
committerVijay Bellur <vijay@gluster.com>2012-02-20 00:30:54 -0800
commit975933a25d14cbac861e809b40c6edd01acaa28d (patch)
tree3fa06856f1fd2f094be33e7b4f20deefc72baaa3 /libglusterfs
parent27e51951bc53f36b2286c70eb2263173b29d7a85 (diff)
glusterd: auth allow enhancements
* PROBLEM: When address-based authentication is enabled on a volume, the gNfs server, self-heal daemon (shd), and other operations such as quota, rebalance, replace-brick and geo-replication either stop working or the services are not started if all the peers' ipv{4,6} addresses or hostnames are not added in the "set auth.allow" operation, breaking the functionality of several operations. E.g: volume vol in a cluster of two peers: /mnt/brick1 in 192.168.1.4 /mnt/brick2 in 192.168.1.5 option auth.allow 192.168.1.6 (allow connection requests only from 192.168.1.6) This will disrupt the nfs servers on 192.168.1.{4,5}. brick server processes reject connection requests from both nfs servers (on 4,5), because the peer addresses are not in the auth.allow list. Same holds true for local mounts (on peer machines), self-heal daemon, and other operations which perform a glusterfs mount on one of the peers. * SOLUTION: Login-based authentication (username/password pairs, henceforth referred to as "keys") for gluster services and operations. These *per-volume* keys can be used to by-pass the addr-based authentication, provided none of the peers' addresses are put in the auth.reject list, to enable gluster services like gNfs, self-heal daemon and internal operations on volumes when auth.allow option is exercised. * IMPLEMENTATION: 1. Glusterd generates keys for each volume and stores it in memory as well as in respective volfiles. A new TRUSTED-FUSE volfile is generated which is fuse volfile + keys in protocol/client, and is named trusted-<volname>-fuse.vol. This is used by all local mounts. ANY local mount (on any peer) is granted the trusted-fuse volfile instead of fuse volfile via getspec. non-local mounts are NOT granted the trusted fuse volfile. 2. The keys generated for the volume is written to each server volfile telling servers to allow users with these keys. 3. NFS, self-heal daemon and replace-brick volfiles are updated with the volume's authentication keys. 4. The keys are NOT written to fuse volfiles for obvious reasons. 5. The ownership of volfiles and logfiles is restricted to root users. 6. Merging two identical definitions of peer_info_t in auth/addr and rpc-lib, throwing away the one in auth/addr. 7. Code cleanup in numerous places as appropriate. * IMPORTANT NOTES: 1. One SHOULD NOT put any of the peer addresses in the auth.reject list if one wants any of the glusterd services and features such as gNfs, self-heal, rebalance, geo-rep and quota. 2. If one wants to use username/password based authentication to volumes, one shall append to the server, nfs and shd volfiles, the keys one wants to use for authentication, *while_retaining those_generated_by_glusterd*. See doc/authentication.txt file for details. Change-Id: Ie0331d625ad000d63090e2d622fe1728fbfcc453 BUG: 789942 Signed-off-by: Rajesh Amaravathi <rajesh@redhat.com> Reviewed-on: http://review.gluster.com/2733 Tested-by: Gluster Build System <jenkins@build.gluster.com> Reviewed-by: Vijay Bellur <vijay@gluster.com>
Diffstat (limited to 'libglusterfs')
-rw-r--r--libglusterfs/src/logging.c41
1 files changed, 37 insertions, 4 deletions
diff --git a/libglusterfs/src/logging.c b/libglusterfs/src/logging.c
index 5485260bc10..55ef087a868 100644
--- a/libglusterfs/src/logging.c
+++ b/libglusterfs/src/logging.c
@@ -131,6 +131,8 @@ gf_log_globals_init (void)
int
gf_log_init (const char *file)
{
+ int fd = -1;
+
if (!file){
fprintf (stderr, "ERROR: no filename specified\n");
return -1;
@@ -149,6 +151,14 @@ gf_log_init (const char *file)
return -1;
}
+ fd = open (file, O_CREAT | O_RDONLY, S_IRUSR | S_IWUSR);
+ if (fd < 0) {
+ fprintf (stderr, "ERROR: failed to create logfile \"%s\" (%s)\n",
+ file, strerror (errno));
+ return -1;
+ }
+ close (fd);
+
logfile = fopen (file, "a");
if (!logfile){
fprintf (stderr, "ERROR: failed to open logfile \"%s\" (%s)\n",
@@ -475,6 +485,7 @@ _gf_log (const char *domain, const char *file, const char *function, int line,
char *msg = NULL;
size_t len = 0;
int ret = 0;
+ int fd = -1;
xlator_t *this = NULL;
this = THIS;
@@ -509,6 +520,14 @@ _gf_log (const char *domain, const char *file, const char *function, int line,
if (logrotate) {
logrotate = 0;
+ fd = open (filename, O_CREAT | O_RDONLY, S_IRUSR | S_IWUSR);
+ if (fd < 0) {
+ gf_log ("logrotate", GF_LOG_ERROR,
+ "%s", strerror (errno));
+ return -1;
+ }
+ close (fd);
+
new_logfile = fopen (filename, "a");
if (!new_logfile) {
gf_log ("logrotate", GF_LOG_CRITICAL,
@@ -601,16 +620,21 @@ out:
int
gf_cmd_log_init (const char *filename)
{
+ int fd = -1;
+ xlator_t *this = NULL;
+
+ this = THIS;
+
if (!filename){
- gf_log ("glusterd", GF_LOG_CRITICAL, "gf_cmd_log_init: no "
+ gf_log (this->name, GF_LOG_CRITICAL, "gf_cmd_log_init: no "
"filename specified\n");
return -1;
}
cmd_log_filename = gf_strdup (filename);
if (!cmd_log_filename) {
- gf_log ("glusterd", GF_LOG_CRITICAL, "gf_cmd_log_init: strdup"
- " error\n");
+ gf_log (this->name, GF_LOG_CRITICAL,
+ "gf_cmd_log_init: strdup error\n");
return -1;
}
/* close and reopen cmdlogfile for log rotate*/
@@ -618,9 +642,18 @@ gf_cmd_log_init (const char *filename)
fclose (cmdlogfile);
cmdlogfile = NULL;
}
+
+ fd = open (cmd_log_filename, O_CREAT | O_RDONLY, S_IRUSR | S_IWUSR);
+ if (fd < 0) {
+ gf_log (this->name, GF_LOG_CRITICAL,
+ "%s", strerror (errno));
+ return -1;
+ }
+ close (fd);
+
cmdlogfile = fopen (cmd_log_filename, "a");
if (!cmdlogfile){
- gf_log ("glusterd", GF_LOG_CRITICAL,
+ gf_log (this->name, GF_LOG_CRITICAL,
"gf_cmd_log_init: failed to open logfile \"%s\" "
"(%s)\n", cmd_log_filename, strerror (errno));
return -1;
td class='upd'>xlators/cluster/afr/src/afr-transaction.h63
-rw-r--r--xlators/cluster/afr/src/afr.c1891
-rw-r--r--xlators/cluster/afr/src/afr.h1897
-rw-r--r--xlators/cluster/afr/src/pump.c2470
-rw-r--r--xlators/cluster/afr/src/pump.h81
-rw-r--r--xlators/cluster/dht/src/Makefile.am27
-rw-r--r--xlators/cluster/dht/src/dht-common.c16146
-rw-r--r--xlators/cluster/dht/src/dht-common.h2058
-rw-r--r--xlators/cluster/dht/src/dht-diskusage.c805
-rw-r--r--xlators/cluster/dht/src/dht-hashfn.c139
-rw-r--r--xlators/cluster/dht/src/dht-helper.c3622
-rw-r--r--xlators/cluster/dht/src/dht-inode-read.c2260
-rw-r--r--xlators/cluster/dht/src/dht-inode-write.c2080
-rw-r--r--xlators/cluster/dht/src/dht-layout.c1282
-rw-r--r--xlators/cluster/dht/src/dht-linkfile.c522
-rw-r--r--xlators/cluster/dht/src/dht-lock.c1392
-rw-r--r--xlators/cluster/dht/src/dht-lock.h91
-rw-r--r--xlators/cluster/dht/src/dht-mem-types.h47
-rw-r--r--xlators/cluster/dht/src/dht-messages.h1404
-rw-r--r--xlators/cluster/dht/src/dht-rebalance.c7203
-rw-r--r--xlators/cluster/dht/src/dht-rename.c2841
-rw-r--r--xlators/cluster/dht/src/dht-selfheal.c4061
-rw-r--r--xlators/cluster/dht/src/dht-shared.c1764
-rw-r--r--xlators/cluster/dht/src/dht.c156
-rw-r--r--xlators/cluster/dht/src/dht.sym8
-rw-r--r--xlators/cluster/dht/src/nufa.c1100
-rw-r--r--xlators/cluster/dht/src/nufa.sym8
-rw-r--r--xlators/cluster/dht/src/switch.c1479
-rw-r--r--xlators/cluster/dht/src/switch.sym8
-rw-r--r--xlators/cluster/dht/src/tier-common.c1086
-rw-r--r--xlators/cluster/dht/src/tier-common.h62
-rw-r--r--xlators/cluster/dht/src/tier.c2436
-rw-r--r--xlators/cluster/dht/src/tier.h104
-rw-r--r--xlators/cluster/dht/src/tier.sym9
-rw-r--r--xlators/cluster/dht/src/unittest/dht_layout_mock.c39
-rw-r--r--xlators/cluster/dht/src/unittest/dht_layout_unittest.c26
-rw-r--r--xlators/cluster/ec/src/Makefile.am36
-rw-r--r--xlators/cluster/ec/src/ec-code-avx.c109
-rw-r--r--xlators/cluster/ec/src/ec-code-avx.h (renamed from xlators/cluster/dht/src/dht-helper.h)15
-rw-r--r--xlators/cluster/ec/src/ec-code-c.c11679
-rw-r--r--xlators/cluster/ec/src/ec-code-c.h27
-rw-r--r--xlators/cluster/ec/src/ec-code-intel.c594
-rw-r--r--xlators/cluster/ec/src/ec-code-intel.h191
-rw-r--r--xlators/cluster/ec/src/ec-code-sse.c101
-rw-r--r--xlators/cluster/ec/src/ec-code-sse.h18
-rw-r--r--xlators/cluster/ec/src/ec-code-x64.c144
-rw-r--r--xlators/cluster/ec/src/ec-code-x64.h18
-rw-r--r--xlators/cluster/ec/src/ec-code.c1060
-rw-r--r--xlators/cluster/ec/src/ec-code.h44
-rw-r--r--xlators/cluster/ec/src/ec-combine.c760
-rw-r--r--xlators/cluster/ec/src/ec-combine.h38
-rw-r--r--xlators/cluster/ec/src/ec-common.c2126
-rw-r--r--xlators/cluster/ec/src/ec-common.h286
-rw-r--r--xlators/cluster/ec/src/ec-data.c217
-rw-r--r--xlators/cluster/ec/src/ec-data.h329
-rw-r--r--xlators/cluster/ec/src/ec-dir-read.c398
-rw-r--r--xlators/cluster/ec/src/ec-dir-write.c701
-rw-r--r--xlators/cluster/ec/src/ec-fops.h420
-rw-r--r--xlators/cluster/ec/src/ec-galois.c183
-rw-r--r--xlators/cluster/ec/src/ec-galois.h32
-rw-r--r--xlators/cluster/ec/src/ec-generic.c944
-rw-r--r--xlators/cluster/ec/src/ec-gf.c11635
-rw-r--r--xlators/cluster/ec/src/ec-gf8.c5882
-rw-r--r--xlators/cluster/ec/src/ec-gf8.h (renamed from xlators/cluster/ec/src/ec-gf.h)11
-rw-r--r--xlators/cluster/ec/src/ec-heal.c4763
-rw-r--r--xlators/cluster/ec/src/ec-heald.c980
-rw-r--r--xlators/cluster/ec/src/ec-heald.h41
-rw-r--r--xlators/cluster/ec/src/ec-helpers.c482
-rw-r--r--xlators/cluster/ec/src/ec-helpers.h222
-rw-r--r--xlators/cluster/ec/src/ec-inode-read.c1210
-rw-r--r--xlators/cluster/ec/src/ec-inode-write.c1985
-rw-r--r--xlators/cluster/ec/src/ec-locks.c701
-rw-r--r--xlators/cluster/ec/src/ec-mem-types.h11
-rw-r--r--xlators/cluster/ec/src/ec-messages.h557
-rw-r--r--xlators/cluster/ec/src/ec-method.c488
-rw-r--r--xlators/cluster/ec/src/ec-method.h34
-rw-r--r--xlators/cluster/ec/src/ec-types.h690
-rw-r--r--xlators/cluster/ec/src/ec.c1955
-rw-r--r--xlators/cluster/ec/src/ec.h75
-rw-r--r--xlators/cluster/ha/src/Makefile.am17
-rw-r--r--xlators/cluster/ha/src/ha-helpers.c194
-rw-r--r--xlators/cluster/ha/src/ha-mem-types.h26
-rw-r--r--xlators/cluster/ha/src/ha.c4008
-rw-r--r--xlators/cluster/ha/src/ha.h53
-rw-r--r--xlators/cluster/map/src/Makefile.am17
-rw-r--r--xlators/cluster/map/src/map-helper.c343
-rw-r--r--xlators/cluster/map/src/map-mem-types.h24
-rw-r--r--xlators/cluster/map/src/map.c2561
-rw-r--r--xlators/cluster/map/src/map.h67
-rw-r--r--xlators/cluster/stripe/src/Makefile.am20
-rw-r--r--xlators/cluster/stripe/src/stripe-helpers.c677
-rw-r--r--xlators/cluster/stripe/src/stripe-mem-types.h31
-rw-r--r--xlators/cluster/stripe/src/stripe.c5775
-rw-r--r--xlators/cluster/stripe/src/stripe.h281
-rw-r--r--xlators/debug/Makefile.am2
-rw-r--r--xlators/debug/delay-gen/Makefile.am (renamed from xlators/experimental/fdl/Makefile.am)0
-rw-r--r--xlators/debug/delay-gen/src/Makefile.am11
-rw-r--r--xlators/debug/delay-gen/src/delay-gen-mem-types.h21
-rw-r--r--xlators/debug/delay-gen/src/delay-gen-messages.h26
-rw-r--r--xlators/debug/delay-gen/src/delay-gen.c697
-rw-r--r--xlators/debug/delay-gen/src/delay-gen.h27
-rw-r--r--xlators/debug/error-gen/src/Makefile.am3
-rw-r--r--xlators/debug/error-gen/src/error-gen-mem-types.h6
-rw-r--r--xlators/debug/error-gen/src/error-gen.c2992
-rw-r--r--xlators/debug/error-gen/src/error-gen.h26
-rw-r--r--xlators/debug/io-stats/src/Makefile.am4
-rw-r--r--xlators/debug/io-stats/src/io-stats-mem-types.h17
-rw-r--r--xlators/debug/io-stats/src/io-stats.c6647
-rw-r--r--xlators/debug/sink/Makefile.am (renamed from xlators/features/qemu-block/Makefile.am)1
-rw-r--r--xlators/debug/sink/src/Makefile.am14
-rw-r--r--xlators/debug/sink/src/sink.c94
-rw-r--r--xlators/debug/trace/src/Makefile.am3
-rw-r--r--xlators/debug/trace/src/trace-mem-types.h7
-rw-r--r--xlators/debug/trace/src/trace.c5413
-rw-r--r--xlators/debug/trace/src/trace.h65
-rw-r--r--xlators/encryption/Makefile.am3
-rw-r--r--xlators/encryption/crypt/src/Makefile.am24
-rw-r--r--xlators/encryption/crypt/src/atom.c957
-rw-r--r--xlators/encryption/crypt/src/crypt-common.h141
-rw-r--r--xlators/encryption/crypt/src/crypt-mem-types.h45
-rw-r--r--xlators/encryption/crypt/src/crypt.c4526
-rw-r--r--xlators/encryption/crypt/src/crypt.h900
-rw-r--r--xlators/encryption/crypt/src/data.c764
-rw-r--r--xlators/encryption/crypt/src/keys.c297
-rw-r--r--xlators/encryption/crypt/src/metadata.c614
-rw-r--r--xlators/encryption/crypt/src/metadata.h74
-rw-r--r--xlators/encryption/rot-13/src/rot-13.c196
-rw-r--r--xlators/experimental/Makefile.am3
-rw-r--r--xlators/experimental/README.md107
-rw-r--r--xlators/experimental/fdl/src/Makefile.am42
-rw-r--r--xlators/experimental/fdl/src/dump-tmpl.c156
-rw-r--r--xlators/experimental/fdl/src/fdl-tmpl.c506
-rwxr-xr-xxlators/experimental/fdl/src/gen_dumper.py116
-rwxr-xr-xxlators/experimental/fdl/src/gen_fdl.py328
-rwxr-xr-xxlators/experimental/fdl/src/gen_recon.py191
-rw-r--r--xlators/experimental/fdl/src/jnl-types.h14
-rw-r--r--xlators/experimental/fdl/src/logdump.c50
-rw-r--r--xlators/experimental/fdl/src/recon-tmpl.c305
-rw-r--r--xlators/experimental/fdl/src/recon.c89
-rw-r--r--xlators/experimental/nsr-client/src/Makefile.am32
-rw-r--r--xlators/experimental/nsr-client/src/fop-template.c113
-rwxr-xr-xxlators/experimental/nsr-client/src/gen-fops.py57
-rw-r--r--xlators/experimental/nsr-client/src/nsr-messages.h105
-rw-r--r--xlators/experimental/nsr-client/src/nsrc.c236
-rw-r--r--xlators/experimental/nsr-client/src/nsrc.h24
-rw-r--r--xlators/experimental/nsr-server/src/Makefile.am35
-rw-r--r--xlators/experimental/nsr-server/src/all-templates.c429
-rwxr-xr-xxlators/experimental/nsr-server/src/gen-fops.py138
-rw-r--r--xlators/experimental/nsr-server/src/nsr-internal.h114
-rw-r--r--xlators/experimental/nsr-server/src/nsr.c1066
-rw-r--r--xlators/features/Makefile.am16
-rw-r--r--xlators/features/arbiter/src/Makefile.am6
-rw-r--r--xlators/features/arbiter/src/arbiter-mem-types.h7
-rw-r--r--xlators/features/arbiter/src/arbiter.c503
-rw-r--r--xlators/features/arbiter/src/arbiter.h6
-rw-r--r--xlators/features/barrier/src/Makefile.am3
-rw-r--r--xlators/features/barrier/src/barrier-mem-types.h6
-rw-r--r--xlators/features/barrier/src/barrier.c1076
-rw-r--r--xlators/features/barrier/src/barrier.h129
-rw-r--r--xlators/features/bit-rot/src/bitd/Makefile.am17
-rw-r--r--xlators/features/bit-rot/src/bitd/bit-rot-bitd-messages.h493
-rw-r--r--xlators/features/bit-rot/src/bitd/bit-rot-scrub-status.c78
-rw-r--r--xlators/features/bit-rot/src/bitd/bit-rot-scrub-status.h50
-rw-r--r--xlators/features/bit-rot/src/bitd/bit-rot-scrub.c2978
-rw-r--r--xlators/features/bit-rot/src/bitd/bit-rot-scrub.h34
-rw-r--r--xlators/features/bit-rot/src/bitd/bit-rot-ssm.c137
-rw-r--r--xlators/features/bit-rot/src/bitd/bit-rot-ssm.h28
-rw-r--r--xlators/features/bit-rot/src/bitd/bit-rot-tbf.c306
-rw-r--r--xlators/features/bit-rot/src/bitd/bit-rot-tbf.h70
-rw-r--r--xlators/features/bit-rot/src/bitd/bit-rot.c3283
-rw-r--r--xlators/features/bit-rot/src/bitd/bit-rot.h342
-rw-r--r--xlators/features/bit-rot/src/stub/Makefile.am7
-rw-r--r--xlators/features/bit-rot/src/stub/bit-rot-common.h181
-rw-r--r--xlators/features/bit-rot/src/stub/bit-rot-object-version.h12
-rw-r--r--xlators/features/bit-rot/src/stub/bit-rot-stub-helpers.c1125
-rw-r--r--xlators/features/bit-rot/src/stub/bit-rot-stub-mem-types.h38
-rw-r--r--xlators/features/bit-rot/src/stub/bit-rot-stub-messages.h350
-rw-r--r--xlators/features/bit-rot/src/stub/bit-rot-stub.c4939
-rw-r--r--xlators/features/bit-rot/src/stub/bit-rot-stub.h572
-rw-r--r--xlators/features/changelog/lib/examples/c/get-changes-multi.c80
-rw-r--r--xlators/features/changelog/lib/examples/c/get-changes.c102
-rw-r--r--xlators/features/changelog/lib/examples/c/get-history.c148
-rwxr-xr-x[-rw-r--r--]xlators/features/changelog/lib/examples/python/changes.py13
-rw-r--r--xlators/features/changelog/lib/examples/python/libgfchangelog.py3
-rw-r--r--xlators/features/changelog/lib/src/Makefile.am34
-rw-r--r--xlators/features/changelog/lib/src/changelog-lib-messages.h327
-rw-r--r--xlators/features/changelog/lib/src/gf-changelog-api.c320
-rw-r--r--xlators/features/changelog/lib/src/gf-changelog-helpers.c255
-rw-r--r--xlators/features/changelog/lib/src/gf-changelog-helpers.h244
-rw-r--r--xlators/features/changelog/lib/src/gf-changelog-journal-handler.c1642
-rw-r--r--xlators/features/changelog/lib/src/gf-changelog-journal.h94
-rw-r--r--xlators/features/changelog/lib/src/gf-changelog-reborp.c583
-rw-r--r--xlators/features/changelog/lib/src/gf-changelog-rpc.c93
-rw-r--r--xlators/features/changelog/lib/src/gf-changelog-rpc.h10
-rw-r--r--xlators/features/changelog/lib/src/gf-changelog.c883
-rw-r--r--xlators/features/changelog/lib/src/gf-history-changelog.c1492
-rw-r--r--xlators/features/changelog/src/Makefile.am25
-rw-r--r--xlators/features/changelog/src/changelog-barrier.c141
-rw-r--r--xlators/features/changelog/src/changelog-encoders.c298
-rw-r--r--xlators/features/changelog/src/changelog-encoders.h42
-rw-r--r--xlators/features/changelog/src/changelog-ev-handle.c576
-rw-r--r--xlators/features/changelog/src/changelog-ev-handle.h110
-rw-r--r--xlators/features/changelog/src/changelog-helpers.c2878
-rw-r--r--xlators/features/changelog/src/changelog-helpers.h893
-rw-r--r--xlators/features/changelog/src/changelog-mem-types.h33
-rw-r--r--xlators/features/changelog/src/changelog-messages.h590
-rw-r--r--xlators/features/changelog/src/changelog-misc.h164
-rw-r--r--xlators/features/changelog/src/changelog-rpc-common.c516
-rw-r--r--xlators/features/changelog/src/changelog-rpc-common.h59
-rw-r--r--xlators/features/changelog/src/changelog-rpc.c588
-rw-r--r--xlators/features/changelog/src/changelog-rpc.h10
-rw-r--r--xlators/features/changelog/src/changelog-rt.c63
-rw-r--r--xlators/features/changelog/src/changelog-rt.h14
-rw-r--r--xlators/features/changelog/src/changelog.c4626
-rw-r--r--xlators/features/changetimerecorder/src/Makefile.am23
-rw-r--r--xlators/features/changetimerecorder/src/changetimerecorder.c2307
-rw-r--r--xlators/features/changetimerecorder/src/changetimerecorder.h21
-rw-r--r--xlators/features/changetimerecorder/src/ctr-helper.c308
-rw-r--r--xlators/features/changetimerecorder/src/ctr-helper.h923
-rw-r--r--xlators/features/changetimerecorder/src/ctr-xlator-ctx.c409
-rw-r--r--xlators/features/changetimerecorder/src/ctr-xlator-ctx.h90
-rw-r--r--xlators/features/cloudsync/Makefile.am (renamed from xlators/experimental/nsr-client/Makefile.am)0
-rw-r--r--xlators/features/cloudsync/src/Makefile.am46
-rw-r--r--xlators/features/cloudsync/src/cloudsync-autogen-fops-tmpl.c30
-rw-r--r--xlators/features/cloudsync/src/cloudsync-autogen-fops-tmpl.h (renamed from xlators/features/changetimerecorder/src/ctr_mem_types.h)20
-rw-r--r--xlators/features/cloudsync/src/cloudsync-common.c60
-rw-r--r--xlators/features/cloudsync/src/cloudsync-common.h134
-rwxr-xr-xxlators/features/cloudsync/src/cloudsync-fops-c.py324
-rwxr-xr-xxlators/features/cloudsync/src/cloudsync-fops-h.py31
-rw-r--r--xlators/features/cloudsync/src/cloudsync-mem-types.h22
-rw-r--r--xlators/features/cloudsync/src/cloudsync-messages.h16
-rw-r--r--xlators/features/cloudsync/src/cloudsync-plugins/Makefile.am (renamed from xlators/experimental/nsr-server/Makefile.am)0
-rw-r--r--xlators/features/cloudsync/src/cloudsync-plugins/src/Makefile.am11
-rw-r--r--xlators/features/cloudsync/src/cloudsync-plugins/src/cloudsyncs3/Makefile.am (renamed from xlators/features/changetimerecorder/Makefile.am)0
-rw-r--r--xlators/features/cloudsync/src/cloudsync-plugins/src/cloudsyncs3/src/Makefile.am12
-rw-r--r--xlators/features/cloudsync/src/cloudsync-plugins/src/cloudsyncs3/src/libcloudsyncs3-mem-types.h19
-rw-r--r--xlators/features/cloudsync/src/cloudsync-plugins/src/cloudsyncs3/src/libcloudsyncs3.c584
-rw-r--r--xlators/features/cloudsync/src/cloudsync-plugins/src/cloudsyncs3/src/libcloudsyncs3.h50
-rw-r--r--xlators/features/cloudsync/src/cloudsync-plugins/src/cloudsyncs3/src/libcloudsyncs3.sym1
-rw-r--r--xlators/features/cloudsync/src/cloudsync-plugins/src/cvlt/Makefile.am (renamed from xlators/features/ganesha/Makefile.am)0
-rw-r--r--xlators/features/cloudsync/src/cloudsync-plugins/src/cvlt/src/Makefile.am12
-rw-r--r--xlators/features/cloudsync/src/cloudsync-plugins/src/cvlt/src/archivestore.h203
-rw-r--r--xlators/features/cloudsync/src/cloudsync-plugins/src/cvlt/src/cvlt-messages.h30
-rw-r--r--xlators/features/cloudsync/src/cloudsync-plugins/src/cvlt/src/libcloudsynccvlt.sym1
-rw-r--r--xlators/features/cloudsync/src/cloudsync-plugins/src/cvlt/src/libcvlt-mem-types.h19
-rw-r--r--xlators/features/cloudsync/src/cloudsync-plugins/src/cvlt/src/libcvlt.c842
-rw-r--r--xlators/features/cloudsync/src/cloudsync-plugins/src/cvlt/src/libcvlt.h84
-rw-r--r--xlators/features/cloudsync/src/cloudsync.c2076
-rw-r--r--xlators/features/cloudsync/src/cloudsync.h123
-rw-r--r--xlators/features/compress/src/Makefile.am6
-rw-r--r--xlators/features/compress/src/cdc-helper.c762
-rw-r--r--xlators/features/compress/src/cdc-mem-types.h10
-rw-r--r--xlators/features/compress/src/cdc.c578
-rw-r--r--xlators/features/compress/src/cdc.h76
-rw-r--r--xlators/features/filter/Makefile.am3
-rw-r--r--xlators/features/filter/src/Makefile.am16
-rw-r--r--xlators/features/filter/src/filter.c1729
-rw-r--r--xlators/features/ganesha/src/Makefile.am18
-rw-r--r--xlators/features/ganesha/src/ganesha.c90
-rw-r--r--xlators/features/ganesha/src/ganesha.h18
-rw-r--r--xlators/features/gfid-access/src/Makefile.am3
-rw-r--r--xlators/features/gfid-access/src/gfid-access-mem-types.h9
-rw-r--r--xlators/features/gfid-access/src/gfid-access.c2155
-rw-r--r--xlators/features/gfid-access/src/gfid-access.h132
-rw-r--r--xlators/features/glupy/Makefile.am3
-rw-r--r--xlators/features/glupy/doc/README.md44
-rw-r--r--xlators/features/glupy/doc/TESTING9
-rw-r--r--xlators/features/glupy/doc/test.vol10
-rw-r--r--xlators/features/glupy/examples/Makefile.am5
-rw-r--r--xlators/features/glupy/examples/debug-trace.py775
-rw-r--r--xlators/features/glupy/examples/helloworld.py19
-rw-r--r--xlators/features/glupy/examples/negative.py91
-rw-r--r--xlators/features/glupy/src/Makefile.am29
-rw-r--r--xlators/features/glupy/src/__init__.py.in2
-rw-r--r--xlators/features/glupy/src/glupy.c2496
-rw-r--r--xlators/features/glupy/src/glupy.h56
-rw-r--r--xlators/features/glupy/src/glupy.sym101
-rw-r--r--xlators/features/glupy/src/glupy/Makefile.am5
-rw-r--r--xlators/features/glupy/src/glupy/__init__.py852
-rw-r--r--xlators/features/glupy/src/setup.py.in24
-rw-r--r--xlators/features/index/src/Makefile.am8
-rw-r--r--xlators/features/index/src/index-mem-types.h15
-rw-r--r--xlators/features/index/src/index-messages.h33
-rw-r--r--xlators/features/index/src/index.c3704
-rw-r--r--xlators/features/index/src/index.h96
-rw-r--r--xlators/features/leases/Makefile.am (renamed from xlators/storage/bd/Makefile.am)0
-rw-r--r--xlators/features/leases/src/Makefile.am20
-rw-r--r--xlators/features/leases/src/leases-internal.c1412
-rw-r--r--xlators/features/leases/src/leases-mem-types.h27
-rw-r--r--xlators/features/leases/src/leases-messages.h33
-rw-r--r--xlators/features/leases/src/leases.c1168
-rw-r--r--xlators/features/leases/src/leases.h259
-rw-r--r--xlators/features/locks/src/Makefile.am12
-rw-r--r--xlators/features/locks/src/clear.c684
-rw-r--r--xlators/features/locks/src/clear.h66
-rw-r--r--xlators/features/locks/src/common.c1948
-rw-r--r--xlators/features/locks/src/common.h258
-rw-r--r--xlators/features/locks/src/entrylk.c1489
-rw-r--r--xlators/features/locks/src/inodelk.c1580
-rw-r--r--xlators/features/locks/src/locks-mem-types.h23
-rw-r--r--xlators/features/locks/src/locks.h327
-rw-r--r--xlators/features/locks/src/pl-messages.h29
-rw-r--r--xlators/features/locks/src/posix.c6352
-rw-r--r--xlators/features/locks/src/reservelk.c580
-rw-r--r--xlators/features/locks/tests/unit-test.c101
-rw-r--r--xlators/features/mac-compat/Makefile.am3
-rw-r--r--xlators/features/mac-compat/src/Makefile.am15
-rw-r--r--xlators/features/mac-compat/src/mac-compat.c344
-rw-r--r--xlators/features/mac-compat/src/mac-compat.h41
-rw-r--r--xlators/features/marker/src/Makefile.am11
-rw-r--r--xlators/features/marker/src/marker-common.c74
-rw-r--r--xlators/features/marker/src/marker-common.h7
-rw-r--r--xlators/features/marker/src/marker-mem-types.h23
-rw-r--r--xlators/features/marker/src/marker-quota-helper.c606
-rw-r--r--xlators/features/marker/src/marker-quota-helper.h81
-rw-r--r--xlators/features/marker/src/marker-quota.c3419
-rw-r--r--xlators/features/marker/src/marker-quota.h200
-rw-r--r--xlators/features/marker/src/marker.c4679
-rw-r--r--xlators/features/marker/src/marker.h227
-rw-r--r--xlators/features/metadisp/Makefile.am (renamed from xlators/encryption/rot-13/Makefile.am)2
-rw-r--r--xlators/features/metadisp/src/Makefile.am38
-rw-r--r--xlators/features/metadisp/src/backend.c45
-rw-r--r--xlators/features/metadisp/src/fops-tmpl.c10
-rw-r--r--xlators/features/metadisp/src/gen-fops.py160
-rw-r--r--xlators/features/metadisp/src/metadisp-create.c101
-rw-r--r--xlators/features/metadisp/src/metadisp-fops.h51
-rw-r--r--xlators/features/metadisp/src/metadisp-fsync.c54
-rw-r--r--xlators/features/metadisp/src/metadisp-lookup.c90
-rw-r--r--xlators/features/metadisp/src/metadisp-open.c70
-rw-r--r--xlators/features/metadisp/src/metadisp-readdir.c65
-rw-r--r--xlators/features/metadisp/src/metadisp-setattr.c90
-rw-r--r--xlators/features/metadisp/src/metadisp-stat.c124
-rw-r--r--xlators/features/metadisp/src/metadisp-unlink.c160
-rw-r--r--xlators/features/metadisp/src/metadisp.c46
-rw-r--r--xlators/features/metadisp/src/metadisp.h45
-rw-r--r--xlators/features/namespace/Makefile.am (renamed from xlators/cluster/map/Makefile.am)2
-rw-r--r--xlators/features/namespace/src/Makefile.am17
-rw-r--r--xlators/features/namespace/src/namespace.c1344
-rw-r--r--xlators/features/namespace/src/namespace.h23
-rw-r--r--xlators/features/path-convertor/Makefile.am3
-rw-r--r--xlators/features/path-convertor/src/Makefile.am15
-rw-r--r--xlators/features/path-convertor/src/path-mem-types.h22
-rw-r--r--xlators/features/path-convertor/src/path.c1223
-rw-r--r--xlators/features/protect/Makefile.am3
-rw-r--r--xlators/features/protect/src/Makefile.am21
-rw-r--r--xlators/features/protect/src/prot_client.c213
-rw-r--r--xlators/features/protect/src/prot_dht.c163
-rw-r--r--xlators/features/protect/src/prot_server.c46
-rw-r--r--xlators/features/qemu-block/src/Makefile.am156
-rw-r--r--xlators/features/qemu-block/src/bdrv-xlator.c386
-rw-r--r--xlators/features/qemu-block/src/bh-syncop.c43
-rw-r--r--xlators/features/qemu-block/src/clock-timer.c55
-rw-r--r--xlators/features/qemu-block/src/coroutine-synctask.c111
-rw-r--r--xlators/features/qemu-block/src/monitor-logging.c45
-rw-r--r--xlators/features/qemu-block/src/qb-coroutines.c662
-rw-r--r--xlators/features/qemu-block/src/qb-coroutines.h30
-rw-r--r--xlators/features/qemu-block/src/qemu-block-memory-types.h25
-rw-r--r--xlators/features/qemu-block/src/qemu-block.c1134
-rw-r--r--xlators/features/qemu-block/src/qemu-block.h109
-rw-r--r--xlators/features/quiesce/src/Makefile.am5
-rw-r--r--xlators/features/quiesce/src/quiesce-mem-types.h7
-rw-r--r--xlators/features/quiesce/src/quiesce-messages.h28
-rw-r--r--xlators/features/quiesce/src/quiesce.c3365
-rw-r--r--xlators/features/quiesce/src/quiesce.h68
-rw-r--r--xlators/features/quota/src/Makefile.am22
-rw-r--r--xlators/features/quota/src/quota-enforcer-client.c724
-rw-r--r--xlators/features/quota/src/quota-mem-types.h28
-rw-r--r--xlators/features/quota/src/quota-messages.h252
-rw-r--r--xlators/features/quota/src/quota.c8247
-rw-r--r--xlators/features/quota/src/quota.h419
-rw-r--r--xlators/features/quota/src/quotad-aggregator.c757
-rw-r--r--xlators/features/quota/src/quotad-aggregator.h29
-rw-r--r--xlators/features/quota/src/quotad-helpers.c116
-rw-r--r--xlators/features/quota/src/quotad-helpers.h4
-rw-r--r--xlators/features/quota/src/quotad.c315
-rw-r--r--xlators/features/quota/src/quotad.sym7
-rw-r--r--xlators/features/read-only/src/Makefile.am7
-rw-r--r--xlators/features/read-only/src/read-only-common.c545
-rw-r--r--xlators/features/read-only/src/read-only-common.h112
-rw-r--r--xlators/features/read-only/src/read-only-mem-types.h6
-rw-r--r--xlators/features/read-only/src/read-only.c176
-rw-r--r--xlators/features/read-only/src/read-only.h22
-rw-r--r--xlators/features/read-only/src/worm-helper.c395
-rw-r--r--xlators/features/read-only/src/worm-helper.h44
-rw-r--r--xlators/features/read-only/src/worm.c746
-rw-r--r--xlators/features/sdfs/Makefile.am (renamed from xlators/encryption/crypt/Makefile.am)2
-rw-r--r--xlators/features/sdfs/src/Makefile.am19
-rw-r--r--xlators/features/sdfs/src/sdfs-messages.h67
-rw-r--r--xlators/features/sdfs/src/sdfs.c1479
-rw-r--r--xlators/features/sdfs/src/sdfs.h49
-rw-r--r--xlators/features/selinux/Makefile.am (renamed from xlators/cluster/stripe/Makefile.am)2
-rw-r--r--xlators/features/selinux/src/Makefile.am20
-rw-r--r--xlators/features/selinux/src/selinux-mem-types.h (renamed from xlators/features/filter/src/filter-mem-types.h)15
-rw-r--r--xlators/features/selinux/src/selinux-messages.h30
-rw-r--r--xlators/features/selinux/src/selinux.c323
-rw-r--r--xlators/features/selinux/src/selinux.h24
-rw-r--r--xlators/features/shard/src/Makefile.am3
-rw-r--r--xlators/features/shard/src/shard-mem-types.h15
-rw-r--r--xlators/features/shard/src/shard-messages.h193
-rw-r--r--xlators/features/shard/src/shard.c10042
-rw-r--r--xlators/features/shard/src/shard.h518
-rw-r--r--xlators/features/snapview-client/src/Makefile.am5
-rw-r--r--xlators/features/snapview-client/src/snapview-client-mem-types.h12
-rw-r--r--xlators/features/snapview-client/src/snapview-client-messages.h71
-rw-r--r--xlators/features/snapview-client/src/snapview-client.c4082
-rw-r--r--xlators/features/snapview-client/src/snapview-client.h176
-rw-r--r--xlators/features/snapview-server/src/Makefile.am23
-rw-r--r--xlators/features/snapview-server/src/snapview-server-helpers.c900
-rw-r--r--xlators/features/snapview-server/src/snapview-server-mem-types.h15
-rw-r--r--xlators/features/snapview-server/src/snapview-server-messages.h54
-rw-r--r--xlators/features/snapview-server/src/snapview-server-mgmt.c840
-rw-r--r--xlators/features/snapview-server/src/snapview-server.c4135
-rw-r--r--xlators/features/snapview-server/src/snapview-server.h294
-rw-r--r--xlators/features/thin-arbiter/Makefile.am3
-rw-r--r--xlators/features/thin-arbiter/src/Makefile.am22
-rw-r--r--xlators/features/thin-arbiter/src/thin-arbiter-mem-types.h19
-rw-r--r--xlators/features/thin-arbiter/src/thin-arbiter-messages.h28
-rw-r--r--xlators/features/thin-arbiter/src/thin-arbiter.c661
-rw-r--r--xlators/features/thin-arbiter/src/thin-arbiter.h59
-rw-r--r--xlators/features/trash/src/Makefile.am5
-rw-r--r--xlators/features/trash/src/trash-mem-types.h13
-rw-r--r--xlators/features/trash/src/trash.c4292
-rw-r--r--xlators/features/trash/src/trash.h100
-rw-r--r--xlators/features/upcall/src/Makefile.am10
-rw-r--r--xlators/features/upcall/src/upcall-cache-invalidation.h38
-rw-r--r--xlators/features/upcall/src/upcall-internal.c1028
-rw-r--r--xlators/features/upcall/src/upcall-mem-types.h13
-rw-r--r--xlators/features/upcall/src/upcall-messages.h54
-rw-r--r--xlators/features/upcall/src/upcall.c3061
-rw-r--r--xlators/features/upcall/src/upcall.h199
-rw-r--r--xlators/features/utime/Makefile.am3
-rw-r--r--xlators/features/utime/src/Makefile.am41
-rw-r--r--xlators/features/utime/src/utime-autogen-fops-tmpl.c28
-rw-r--r--xlators/features/utime/src/utime-autogen-fops-tmpl.h22
-rwxr-xr-xxlators/features/utime/src/utime-gen-fops-c.py147
-rwxr-xr-xxlators/features/utime/src/utime-gen-fops-h.py35
-rw-r--r--xlators/features/utime/src/utime-helpers.c110
-rw-r--r--xlators/features/utime/src/utime-helpers.h25
-rw-r--r--xlators/features/utime/src/utime-mem-types.h21
-rw-r--r--xlators/features/utime/src/utime-messages.h29
-rw-r--r--xlators/features/utime/src/utime.c392
-rw-r--r--xlators/features/utime/src/utime.h23
-rw-r--r--xlators/lib/src/libxlator.c796
-rw-r--r--xlators/lib/src/libxlator.h112
-rw-r--r--xlators/meta/Makefile.am2
-rw-r--r--xlators/meta/src/Makefile.am3
-rw-r--r--xlators/meta/src/active-link.c25
-rw-r--r--xlators/meta/src/cmdline-file.c32
-rw-r--r--xlators/meta/src/frames-file.c164
-rw-r--r--xlators/meta/src/graph-dir.c129
-rw-r--r--xlators/meta/src/graphs-dir.c87
-rw-r--r--xlators/meta/src/history-file.c33
-rw-r--r--xlators/meta/src/logfile-link.c25
-rw-r--r--xlators/meta/src/logging-dir.c42
-rw-r--r--xlators/meta/src/loglevel-file.c40
-rw-r--r--xlators/meta/src/mallinfo-file.c25
-rw-r--r--xlators/meta/src/measure-file.c37
-rw-r--r--xlators/meta/src/meminfo-file.c33
-rw-r--r--xlators/meta/src/meta-defaults.c775
-rw-r--r--xlators/meta/src/meta-helpers.c445
-rw-r--r--xlators/meta/src/meta-hooks.h6
-rw-r--r--xlators/meta/src/meta-mem-types.h17
-rw-r--r--xlators/meta/src/meta.c293
-rw-r--r--xlators/meta/src/meta.h181
-rw-r--r--xlators/meta/src/name-file.c34
-rw-r--r--xlators/meta/src/option-file.c36
-rw-r--r--xlators/meta/src/options-dir.c62
-rw-r--r--xlators/meta/src/private-file.c33
-rw-r--r--xlators/meta/src/process_uuid-file.c28
-rw-r--r--xlators/meta/src/profile-file.c33
-rw-r--r--xlators/meta/src/root-dir.c105
-rw-r--r--xlators/meta/src/subvolume-link.c67
-rw-r--r--xlators/meta/src/subvolumes-dir.c65
-rw-r--r--xlators/meta/src/top-link.c31
-rw-r--r--xlators/meta/src/type-file.c34
-rw-r--r--xlators/meta/src/version-file.c29
-rw-r--r--xlators/meta/src/view-dir.c27
-rw-r--r--xlators/meta/src/volfile-file.c71
-rw-r--r--xlators/meta/src/xlator-dir.c130
-rw-r--r--xlators/mgmt/glusterd/src/Makefile.am58
-rw-r--r--xlators/mgmt/glusterd/src/glusterd-bitd-svc.c287
-rw-r--r--xlators/mgmt/glusterd/src/glusterd-bitd-svc.h16
-rw-r--r--xlators/mgmt/glusterd/src/glusterd-bitrot.c1265
-rw-r--r--xlators/mgmt/glusterd/src/glusterd-brick-ops.c4876
-rw-r--r--xlators/mgmt/glusterd/src/glusterd-conn-helper.c4
-rw-r--r--xlators/mgmt/glusterd/src/glusterd-conn-helper.h2
-rw-r--r--xlators/mgmt/glusterd/src/glusterd-conn-mgmt.c223
-rw-r--r--xlators/mgmt/glusterd/src/glusterd-conn-mgmt.h38
-rw-r--r--xlators/mgmt/glusterd/src/glusterd-errno.h33
-rw-r--r--xlators/mgmt/glusterd/src/glusterd-ganesha.c1473
-rw-r--r--xlators/mgmt/glusterd/src/glusterd-geo-rep.c10887
-rw-r--r--xlators/mgmt/glusterd/src/glusterd-geo-rep.h38
-rw-r--r--xlators/mgmt/glusterd/src/glusterd-gfproxyd-svc-helper.c235
-rw-r--r--xlators/mgmt/glusterd/src/glusterd-gfproxyd-svc-helper.h51
-rw-r--r--xlators/mgmt/glusterd/src/glusterd-gfproxyd-svc.c478
-rw-r--r--xlators/mgmt/glusterd/src/glusterd-gfproxyd-svc.h47
-rw-r--r--xlators/mgmt/glusterd/src/glusterd-handler.c10161
-rw-r--r--xlators/mgmt/glusterd/src/glusterd-handshake.c4036
-rw-r--r--xlators/mgmt/glusterd/src/glusterd-hooks.c984
-rw-r--r--xlators/mgmt/glusterd/src/glusterd-hooks.h80
-rw-r--r--xlators/mgmt/glusterd/src/glusterd-locks.c1258
-rw-r--r--xlators/mgmt/glusterd/src/glusterd-locks.h38
-rw-r--r--xlators/mgmt/glusterd/src/glusterd-log-ops.c455
-rw-r--r--xlators/mgmt/glusterd/src/glusterd-mem-types.h101
-rw-r--r--xlators/mgmt/glusterd/src/glusterd-messages.h5060
-rw-r--r--xlators/mgmt/glusterd/src/glusterd-mgmt-handler.c1759
-rw-r--r--xlators/mgmt/glusterd/src/glusterd-mgmt.c4830
-rw-r--r--xlators/mgmt/glusterd/src/glusterd-mgmt.h86
-rw-r--r--xlators/mgmt/glusterd/src/glusterd-mountbroker.c1188
-rw-r--r--xlators/mgmt/glusterd/src/glusterd-mountbroker.h35
-rw-r--r--xlators/mgmt/glusterd/src/glusterd-nfs-svc.c313
-rw-r--r--xlators/mgmt/glusterd/src/glusterd-nfs-svc.h17
-rw-r--r--xlators/mgmt/glusterd/src/glusterd-op-sm.c13449
-rw-r--r--xlators/mgmt/glusterd/src/glusterd-op-sm.h327
-rw-r--r--xlators/mgmt/glusterd/src/glusterd-peer-utils.c1559
-rw-r--r--xlators/mgmt/glusterd/src/glusterd-peer-utils.h67
-rw-r--r--xlators/mgmt/glusterd/src/glusterd-pmap.c877
-rw-r--r--xlators/mgmt/glusterd/src/glusterd-pmap.h58
-rw-r--r--xlators/mgmt/glusterd/src/glusterd-proc-mgmt.c181
-rw-r--r--xlators/mgmt/glusterd/src/glusterd-proc-mgmt.h34
-rw-r--r--xlators/mgmt/glusterd/src/glusterd-quota.c3549
-rw-r--r--xlators/mgmt/glusterd/src/glusterd-quota.h (renamed from xlators/features/ganesha/src/ganesha-mem-types.h)16
-rw-r--r--xlators/mgmt/glusterd/src/glusterd-quotad-svc.c323
-rw-r--r--xlators/mgmt/glusterd/src/glusterd-quotad-svc.h10
-rw-r--r--xlators/mgmt/glusterd/src/glusterd-rcu.h6
-rw-r--r--xlators/mgmt/glusterd/src/glusterd-rebalance.c2169
-rw-r--r--xlators/mgmt/glusterd/src/glusterd-replace-brick.c1504
-rw-r--r--xlators/mgmt/glusterd/src/glusterd-reset-brick.c376
-rw-r--r--xlators/mgmt/glusterd/src/glusterd-rpc-ops.c4144
-rw-r--r--xlators/mgmt/glusterd/src/glusterd-scrub-svc.c284
-rw-r--r--xlators/mgmt/glusterd/src/glusterd-scrub-svc.h22
-rw-r--r--xlators/mgmt/glusterd/src/glusterd-server-quorum.c703
-rw-r--r--xlators/mgmt/glusterd/src/glusterd-server-quorum.h26
-rw-r--r--xlators/mgmt/glusterd/src/glusterd-shd-svc-helper.c153
-rw-r--r--xlators/mgmt/glusterd/src/glusterd-shd-svc-helper.h42
-rw-r--r--xlators/mgmt/glusterd/src/glusterd-shd-svc.c920
-rw-r--r--xlators/mgmt/glusterd/src/glusterd-shd-svc.h25
-rw-r--r--xlators/mgmt/glusterd/src/glusterd-sm.c2566
-rw-r--r--xlators/mgmt/glusterd/src/glusterd-sm.h253
-rw-r--r--xlators/mgmt/glusterd/src/glusterd-snapd-svc-helper.c66
-rw-r--r--xlators/mgmt/glusterd/src/glusterd-snapd-svc-helper.h16
-rw-r--r--xlators/mgmt/glusterd/src/glusterd-snapd-svc.c750
-rw-r--r--xlators/mgmt/glusterd/src/glusterd-snapd-svc.h20
-rw-r--r--xlators/mgmt/glusterd/src/glusterd-snapshot-utils.c7071
-rw-r--r--xlators/mgmt/glusterd/src/glusterd-snapshot-utils.h157
-rw-r--r--xlators/mgmt/glusterd/src/glusterd-snapshot.c17370
-rw-r--r--xlators/mgmt/glusterd/src/glusterd-statedump.c367
-rw-r--r--xlators/mgmt/glusterd/src/glusterd-statedump.h4
-rw-r--r--xlators/mgmt/glusterd/src/glusterd-store.c8053
-rw-r--r--xlators/mgmt/glusterd/src/glusterd-store.h257
-rw-r--r--xlators/mgmt/glusterd/src/glusterd-svc-helper.c1112
-rw-r--r--xlators/mgmt/glusterd/src/glusterd-svc-helper.h50
-rw-r--r--xlators/mgmt/glusterd/src/glusterd-svc-mgmt.c647
-rw-r--r--xlators/mgmt/glusterd/src/glusterd-svc-mgmt.h100
-rw-r--r--xlators/mgmt/glusterd/src/glusterd-syncop.c3291
-rw-r--r--xlators/mgmt/glusterd/src/glusterd-syncop.h102
-rw-r--r--xlators/mgmt/glusterd/src/glusterd-tierd-svc-helper.c207
-rw-r--r--xlators/mgmt/glusterd/src/glusterd-utils.c21824
-rw-r--r--xlators/mgmt/glusterd/src/glusterd-utils.h824
-rw-r--r--xlators/mgmt/glusterd/src/glusterd-volgen.c10353
-rw-r--r--xlators/mgmt/glusterd/src/glusterd-volgen.h345
-rw-r--r--xlators/mgmt/glusterd/src/glusterd-volume-ops.c5305
-rw-r--r--xlators/mgmt/glusterd/src/glusterd-volume-set.c5515
-rw-r--r--xlators/mgmt/glusterd/src/glusterd.c3588
-rw-r--r--xlators/mgmt/glusterd/src/glusterd.h1772
-rw-r--r--xlators/mount/fuse/src/Makefile.am7
-rw-r--r--xlators/mount/fuse/src/fuse-bridge.c10392
-rw-r--r--xlators/mount/fuse/src/fuse-bridge.h792
-rw-r--r--xlators/mount/fuse/src/fuse-helpers.c1052
-rw-r--r--xlators/mount/fuse/src/fuse-mem-types.h26
-rw-r--r--xlators/mount/fuse/src/fuse-resolve.c1033
-rwxr-xr-xxlators/mount/fuse/utils/mount.glusterfs.in195
-rwxr-xr-xxlators/mount/fuse/utils/mount_glusterfs.in33
-rw-r--r--xlators/nfs/server/src/Makefile.am19
-rw-r--r--xlators/nfs/server/src/acl3.c1585
-rw-r--r--xlators/nfs/server/src/acl3.h18
-rw-r--r--xlators/nfs/server/src/auth-cache.c535
-rw-r--r--xlators/nfs/server/src/auth-cache.h26
-rw-r--r--xlators/nfs/server/src/exports.c1632
-rw-r--r--xlators/nfs/server/src/exports.h69
-rw-r--r--xlators/nfs/server/src/mount3-auth.c682
-rw-r--r--xlators/nfs/server/src/mount3-auth.h28
-rw-r--r--xlators/nfs/server/src/mount3.c6357
-rw-r--r--xlators/nfs/server/src/mount3.h209
-rw-r--r--xlators/nfs/server/src/mount3udp_svc.c344
-rw-r--r--xlators/nfs/server/src/netgroups.c1012
-rw-r--r--xlators/nfs/server/src/netgroups.h31
-rw-r--r--xlators/nfs/server/src/nfs-common.c653
-rw-r--r--xlators/nfs/server/src/nfs-common.h51
-rw-r--r--xlators/nfs/server/src/nfs-fops.c2366
-rw-r--r--xlators/nfs/server/src/nfs-fops.h305
-rw-r--r--xlators/nfs/server/src/nfs-generics.c336
-rw-r--r--xlators/nfs/server/src/nfs-generics.h146
-rw-r--r--xlators/nfs/server/src/nfs-inodes.c768
-rw-r--r--xlators/nfs/server/src/nfs-inodes.h60
-rw-r--r--xlators/nfs/server/src/nfs-mem-types.h70
-rw-r--r--xlators/nfs/server/src/nfs-messages.h1743
-rw-r--r--xlators/nfs/server/src/nfs.c3431
-rw-r--r--xlators/nfs/server/src/nfs.h173
-rw-r--r--xlators/nfs/server/src/nfs3-fh.c216
-rw-r--r--xlators/nfs/server/src/nfs3-fh.h105
-rw-r--r--xlators/nfs/server/src/nfs3-helpers.c5064
-rw-r--r--xlators/nfs/server/src/nfs3-helpers.h280
-rw-r--r--xlators/nfs/server/src/nfs3.c9193
-rw-r--r--xlators/nfs/server/src/nfs3.h382
-rw-r--r--xlators/nfs/server/src/nfsserver.sym10
-rw-r--r--xlators/nfs/server/src/nlm4.c4333
-rw-r--r--xlators/nfs/server/src/nlm4.h126
-rw-r--r--xlators/nfs/server/src/nlmcbk_svc.c176
-rw-r--r--xlators/performance/Makefile.am3
-rw-r--r--xlators/performance/io-cache/src/Makefile.am1
-rw-r--r--xlators/performance/io-cache/src/io-cache-messages.h170
-rw-r--r--xlators/performance/io-cache/src/io-cache.c3099
-rw-r--r--xlators/performance/io-cache/src/io-cache.h400
-rw-r--r--xlators/performance/io-cache/src/ioc-inode.c292
-rw-r--r--xlators/performance/io-cache/src/ioc-mem-types.h24
-rw-r--r--xlators/performance/io-cache/src/page.c1456
-rw-r--r--xlators/performance/io-threads/src/Makefile.am3
-rw-r--r--xlators/performance/io-threads/src/io-threads-messages.h108
-rw-r--r--xlators/performance/io-threads/src/io-threads.c1968
-rw-r--r--xlators/performance/io-threads/src/io-threads.h115
-rw-r--r--xlators/performance/io-threads/src/iot-mem-types.h9
-rw-r--r--xlators/performance/md-cache/src/Makefile.am1
-rw-r--r--xlators/performance/md-cache/src/md-cache-mem-types.h13
-rw-r--r--xlators/performance/md-cache/src/md-cache-messages.h57
-rw-r--r--xlators/performance/md-cache/src/md-cache.c4708
-rw-r--r--xlators/performance/nl-cache/Makefile.am3
-rw-r--r--xlators/performance/nl-cache/src/Makefile.am12
-rw-r--r--xlators/performance/nl-cache/src/nl-cache-helper.c1201
-rw-r--r--xlators/performance/nl-cache/src/nl-cache-mem-types.h27
-rw-r--r--xlators/performance/nl-cache/src/nl-cache-messages.h29
-rw-r--r--xlators/performance/nl-cache/src/nl-cache.c840
-rw-r--r--xlators/performance/nl-cache/src/nl-cache.h175
-rw-r--r--xlators/performance/open-behind/src/Makefile.am3
-rw-r--r--xlators/performance/open-behind/src/open-behind-mem-types.h9
-rw-r--r--xlators/performance/open-behind/src/open-behind-messages.h77
-rw-r--r--xlators/performance/open-behind/src/open-behind.c1623
-rw-r--r--xlators/performance/quick-read/src/Makefile.am3
-rw-r--r--xlators/performance/quick-read/src/quick-read-mem-types.h16
-rw-r--r--xlators/performance/quick-read/src/quick-read-messages.h121
-rw-r--r--xlators/performance/quick-read/src/quick-read.c2140
-rw-r--r--xlators/performance/quick-read/src/quick-read.h83
-rw-r--r--xlators/performance/read-ahead/src/Makefile.am3
-rw-r--r--xlators/performance/read-ahead/src/page.c847
-rw-r--r--xlators/performance/read-ahead/src/read-ahead-mem-types.h17
-rw-r--r--xlators/performance/read-ahead/src/read-ahead-messages.h104
-rw-r--r--xlators/performance/read-ahead/src/read-ahead.c1839
-rw-r--r--xlators/performance/read-ahead/src/read-ahead.h178
-rw-r--r--xlators/performance/readdir-ahead/src/Makefile.am7
-rw-r--r--xlators/performance/readdir-ahead/src/readdir-ahead-mem-types.h12
-rw-r--r--xlators/performance/readdir-ahead/src/readdir-ahead-messages.h97
-rw-r--r--xlators/performance/readdir-ahead/src/readdir-ahead.c1581
-rw-r--r--xlators/performance/readdir-ahead/src/readdir-ahead.h94
-rw-r--r--xlators/performance/symlink-cache/Makefile.am3
-rw-r--r--xlators/performance/symlink-cache/src/Makefile.am15
-rw-r--r--xlators/performance/symlink-cache/src/symlink-cache-messages.h93
-rw-r--r--xlators/performance/symlink-cache/src/symlink-cache.c402
-rw-r--r--xlators/performance/write-behind/src/Makefile.am3
-rw-r--r--xlators/performance/write-behind/src/write-behind-mem-types.h16
-rw-r--r--xlators/performance/write-behind/src/write-behind-messages.h114
-rw-r--r--xlators/performance/write-behind/src/write-behind.c4395
-rw-r--r--xlators/playground/rot-13/Makefile.am (renamed from xlators/cluster/ha/Makefile.am)0
-rw-r--r--xlators/playground/rot-13/src/Makefile.am (renamed from xlators/encryption/rot-13/src/Makefile.am)3
-rw-r--r--xlators/playground/rot-13/src/rot-13.c166
-rw-r--r--xlators/playground/rot-13/src/rot-13.h (renamed from xlators/encryption/rot-13/src/rot-13.h)4
-rw-r--r--xlators/playground/template/src/Makefile.am5
-rw-r--r--xlators/playground/template/src/template.c182
-rw-r--r--xlators/playground/template/src/template.h36
-rw-r--r--xlators/protocol/auth/addr/src/Makefile.am6
-rw-r--r--xlators/protocol/auth/addr/src/addr.c470
-rw-r--r--xlators/protocol/auth/login/src/Makefile.am5
-rw-r--r--xlators/protocol/auth/login/src/login.c318
-rw-r--r--xlators/protocol/client/src/Makefile.am11
-rw-r--r--xlators/protocol/client/src/client-callback.c330
-rw-r--r--xlators/protocol/client/src/client-common.c3589
-rw-r--r--xlators/protocol/client/src/client-common.h630
-rw-r--r--xlators/protocol/client/src/client-handshake.c2641
-rw-r--r--xlators/protocol/client/src/client-helpers.c1029
-rw-r--r--xlators/protocol/client/src/client-lk.c755
-rw-r--r--xlators/protocol/client/src/client-mem-types.h17
-rw-r--r--xlators/protocol/client/src/client-messages.h768
-rw-r--r--xlators/protocol/client/src/client-rpc-fops.c10563
-rw-r--r--xlators/protocol/client/src/client-rpc-fops_v2.c6177
-rw-r--r--xlators/protocol/client/src/client.c4304
-rw-r--r--xlators/protocol/client/src/client.h572
-rw-r--r--xlators/protocol/server/src/Makefile.am18
-rw-r--r--xlators/protocol/server/src/authenticate.c327
-rw-r--r--xlators/protocol/server/src/authenticate.h34
-rw-r--r--xlators/protocol/server/src/server-common.c842
-rw-r--r--xlators/protocol/server/src/server-common.h199
-rw-r--r--xlators/protocol/server/src/server-handshake.c1437
-rw-r--r--xlators/protocol/server/src/server-helpers.c2164
-rw-r--r--xlators/protocol/server/src/server-helpers.h94
-rw-r--r--xlators/protocol/server/src/server-mem-types.h23
-rw-r--r--xlators/protocol/server/src/server-messages.h986
-rw-r--r--xlators/protocol/server/src/server-resolve.c960
-rw-r--r--xlators/protocol/server/src/server-rpc-fops.c9261
-rw-r--r--xlators/protocol/server/src/server-rpc-fops_v2.c6031
-rw-r--r--xlators/protocol/server/src/server.c2856
-rw-r--r--xlators/protocol/server/src/server.h279
-rw-r--r--xlators/storage/Makefile.am4
-rw-r--r--xlators/storage/bd/src/Makefile.am20
-rw-r--r--xlators/storage/bd/src/bd-aio.c523
-rw-r--r--xlators/storage/bd/src/bd-aio.h36
-rw-r--r--xlators/storage/bd/src/bd-helper.c1020
-rw-r--r--xlators/storage/bd/src/bd-mem-types.h27
-rw-r--r--xlators/storage/bd/src/bd.c2448
-rw-r--r--xlators/storage/bd/src/bd.h168
-rw-r--r--xlators/storage/posix/src/Makefile.am18
-rw-r--r--xlators/storage/posix/src/posix-aio.c954
-rw-r--r--xlators/storage/posix/src/posix-aio.h22
-rw-r--r--xlators/storage/posix/src/posix-common.c1524
-rw-r--r--xlators/storage/posix/src/posix-entry-ops.c2496
-rw-r--r--xlators/storage/posix/src/posix-gfid-path.c243
-rw-r--r--xlators/storage/posix/src/posix-gfid-path.h28
-rw-r--r--xlators/storage/posix/src/posix-handle.c1606
-rw-r--r--xlators/storage/posix/src/posix-handle.h427
-rw-r--r--xlators/storage/posix/src/posix-helpers.c4889
-rw-r--r--xlators/storage/posix/src/posix-inode-fd-ops.c6004
-rw-r--r--xlators/storage/posix/src/posix-inode-handle.h118
-rw-r--r--xlators/storage/posix/src/posix-mem-types.h21
-rw-r--r--xlators/storage/posix/src/posix-messages.h967
-rw-r--r--xlators/storage/posix/src/posix-metadata-disk.h31
-rw-r--r--xlators/storage/posix/src/posix-metadata.c916
-rw-r--r--xlators/storage/posix/src/posix-metadata.h71
-rw-r--r--xlators/storage/posix/src/posix.c7207
-rw-r--r--xlators/storage/posix/src/posix.h756
-rw-r--r--xlators/system/posix-acl/src/Makefile.am6
-rw-r--r--xlators/system/posix-acl/src/posix-acl-mem-types.h13
-rw-r--r--xlators/system/posix-acl/src/posix-acl-messages.h28
-rw-r--r--xlators/system/posix-acl/src/posix-acl-xattr.c229
-rw-r--r--xlators/system/posix-acl/src/posix-acl-xattr.h17
-rw-r--r--xlators/system/posix-acl/src/posix-acl.c3311
-rw-r--r--xlators/system/posix-acl/src/posix-acl.h33
-rw-r--r--xlators/xlator.sym10
761 files changed, 364442 insertions, 328117 deletions
diff --git a/xlators/Makefile.am b/xlators/Makefile.am
index ea1be844ef4..ef20cbb64fa 100644
--- a/xlators/Makefile.am
+++ b/xlators/Makefile.am
@@ -1,9 +1,12 @@
-if ENABLE_EXPERIMENTAL
- EXPERIMENTAL = experimental
+if BUILD_GNFS
+ GNFS_DIR = nfs
endif
-SUBDIRS = cluster storage protocol performance debug features encryption mount nfs mgmt system \
- playground meta $(EXPERIMENTAL)
+DIST_SUBDIRS = cluster storage protocol performance debug features \
+ mount nfs mgmt system playground meta
+
+SUBDIRS = cluster storage protocol performance debug features \
+ mount ${GNFS_DIR} mgmt system playground meta
EXTRA_DIST = xlator.sym
diff --git a/xlators/cluster/Makefile.am b/xlators/cluster/Makefile.am
index 903fbb39f12..8e067d5ab58 100644
--- a/xlators/cluster/Makefile.am
+++ b/xlators/cluster/Makefile.am
@@ -1,3 +1,3 @@
-SUBDIRS = stripe afr dht ec
+SUBDIRS = afr dht ec
CLEANFILES =
diff --git a/xlators/cluster/afr/src/Makefile.am b/xlators/cluster/afr/src/Makefile.am
index 5612733d3ed..610819b28fc 100644
--- a/xlators/cluster/afr/src/Makefile.am
+++ b/xlators/cluster/afr/src/Makefile.am
@@ -1,4 +1,4 @@
-xlator_LTLIBRARIES = afr.la pump.la
+xlator_LTLIBRARIES = afr.la
xlatordir = $(libdir)/glusterfs/$(PACKAGE_VERSION)/xlator/cluster
afr_common_source = afr-dir-read.c afr-dir-write.c afr-inode-read.c \
@@ -14,18 +14,15 @@ afr_la_LDFLAGS = -module $(GF_XLATOR_DEFAULT_LDFLAGS)
afr_la_SOURCES = $(afr_common_source) $(AFR_SELFHEAL_SOURCES) afr.c
afr_la_LIBADD = $(top_builddir)/libglusterfs/src/libglusterfs.la
-pump_la_LDFLAGS = -module $(GF_XLATOR_DEFAULT_LDFLAGS)
-pump_la_SOURCES = $(afr_common_source) $(AFR_SELFHEAL_SOURCES) pump.c
-pump_la_LIBADD = $(top_builddir)/libglusterfs/src/libglusterfs.la
-
noinst_HEADERS = afr.h afr-transaction.h afr-inode-write.h afr-inode-read.h \
afr-dir-read.h afr-dir-write.h afr-self-heal.h afr-mem-types.h \
- afr-common.c afr-self-heald.h pump.h \
+ afr-common.c afr-self-heald.h \
$(top_builddir)/xlators/lib/src/libxlator.h afr-messages.h
AM_CPPFLAGS = $(GF_CPPFLAGS) \
-I$(top_srcdir)/libglusterfs/src -I$(top_srcdir)/xlators/lib/src \
- -I$(top_srcdir)/rpc/rpc-lib/src
+ -I$(top_srcdir)/rpc/rpc-lib/src \
+ -I$(top_srcdir)/rpc/xdr/src -I$(top_builddir)/rpc/xdr/src
AM_CFLAGS = -Wall $(GF_CFLAGS)
diff --git a/xlators/cluster/afr/src/afr-common.c b/xlators/cluster/afr/src/afr-common.c
index 457f7865cec..032ab5c8001 100644
--- a/xlators/cluster/afr/src/afr-common.c
+++ b/xlators/cluster/afr/src/afr-common.c
@@ -15,24 +15,20 @@
#include <stdlib.h>
#include <signal.h>
-#include "glusterfs.h"
+#include <glusterfs/glusterfs.h>
#include "afr.h"
-#include "dict.h"
-#include "xlator.h"
-#include "hashfn.h"
-#include "logging.h"
-#include "stack.h"
-#include "list.h"
-#include "call-stub.h"
-#include "defaults.h"
-#include "common-utils.h"
-#include "compat-errno.h"
-#include "compat.h"
-#include "byte-order.h"
-#include "statedump.h"
-#include "inode.h"
-
-#include "fd.h"
+#include <glusterfs/dict.h>
+#include <glusterfs/hashfn.h>
+#include <glusterfs/list.h>
+#include <glusterfs/call-stub.h>
+#include <glusterfs/defaults.h>
+#include <glusterfs/common-utils.h>
+#include <glusterfs/compat-errno.h>
+#include <glusterfs/compat.h>
+#include <glusterfs/byte-order.h>
+#include <glusterfs/statedump.h>
+#include <glusterfs/events.h>
+#include <glusterfs/upcall-utils.h>
#include "afr-inode-read.h"
#include "afr-inode-write.h"
@@ -43,56 +39,799 @@
#include "afr-self-heald.h"
#include "afr-messages.h"
-call_frame_t *
-afr_copy_frame (call_frame_t *base)
+int32_t
+afr_quorum_errno(afr_private_t *priv)
+{
+ return ENOTCONN;
+}
+
+gf_boolean_t
+afr_is_private_directory(afr_private_t *priv, uuid_t pargfid, const char *name,
+ pid_t pid)
{
- afr_local_t *local = NULL;
- call_frame_t *frame = NULL;
- int op_errno = 0;
+ if (!__is_root_gfid(pargfid)) {
+ return _gf_false;
+ }
+
+ if (strcmp(name, GF_REPLICATE_TRASH_DIR) == 0) {
+ /*For backward compatibility /.landfill is private*/
+ return _gf_true;
+ }
- frame = copy_frame (base);
- if (!frame)
- return NULL;
- local = AFR_FRAME_INIT (frame, op_errno);
- if (!local) {
- AFR_STACK_DESTROY (frame);
- return NULL;
- }
+ if (pid == GF_CLIENT_PID_GSYNCD) {
+ /*geo-rep needs to create/sync private directory on slave because
+ * it appears in changelog*/
+ return _gf_false;
+ }
- return frame;
+ if (pid == GF_CLIENT_PID_GLFS_HEAL || pid == GF_CLIENT_PID_SELF_HEALD) {
+ if (strcmp(name, priv->anon_inode_name) == 0) {
+ /* anonymous-inode dir is private*/
+ return _gf_true;
+ }
+ } else {
+ if (strncmp(name, AFR_ANON_DIR_PREFIX, strlen(AFR_ANON_DIR_PREFIX)) ==
+ 0) {
+ /* anonymous-inode dir prefix is private for geo-rep to work*/
+ return _gf_true;
+ }
+ }
+
+ return _gf_false;
}
-int
-__afr_inode_ctx_get (xlator_t *this, inode_t *inode, afr_inode_ctx_t **ctx)
+void
+afr_fill_success_replies(afr_local_t *local, afr_private_t *priv,
+ unsigned char *replies)
{
- uint64_t ctx_int = 0;
- int ret = -1;
- afr_inode_ctx_t *tmp_ctx = NULL;
+ int i = 0;
- ret = __inode_ctx_get (inode, this, &ctx_int);
- if (ret) {
- tmp_ctx = GF_CALLOC (1, sizeof (afr_inode_ctx_t),
- gf_afr_mt_inode_ctx_t);
- if (!tmp_ctx)
- goto out;
-
- ctx_int = (long) tmp_ctx;
- ret = __inode_ctx_set (inode, this, &ctx_int);
- if (ret) {
- GF_FREE (tmp_ctx);
- goto out;
- }
- tmp_ctx->spb_choice = -1;
- tmp_ctx->read_subvol = 0;
+ for (i = 0; i < priv->child_count; i++) {
+ if (local->replies[i].valid && local->replies[i].op_ret == 0) {
+ replies[i] = 1;
} else {
- tmp_ctx = (afr_inode_ctx_t *) ctx_int;
+ replies[i] = 0;
}
+ }
+}
- *ctx = tmp_ctx;
- ret = 0;
+int
+afr_fav_child_reset_sink_xattrs(void *opaque);
+
+int
+afr_fav_child_reset_sink_xattrs_cbk(int ret, call_frame_t *frame, void *opaque);
+
+static void
+afr_discover_done(call_frame_t *frame, xlator_t *this);
+
+int
+afr_dom_lock_acquire_cbk(call_frame_t *frame, void *cookie, xlator_t *this,
+ int op_ret, int op_errno, dict_t *xdata)
+{
+ afr_local_t *local = frame->local;
+ afr_private_t *priv = this->private;
+ int i = (long)cookie;
+
+ local->cont.lk.dom_lock_op_ret[i] = op_ret;
+ local->cont.lk.dom_lock_op_errno[i] = op_errno;
+ if (op_ret < 0) {
+ gf_msg(this->name, GF_LOG_ERROR, op_errno, AFR_MSG_LK_HEAL_DOM,
+ "%s: Failed to acquire %s on %s",
+ uuid_utoa(local->fd->inode->gfid), AFR_LK_HEAL_DOM,
+ priv->children[i]->name);
+ } else {
+ local->cont.lk.dom_locked_nodes[i] = 1;
+ }
+
+ syncbarrier_wake(&local->barrier);
+
+ return 0;
+}
+
+int
+afr_dom_lock_acquire(call_frame_t *frame)
+{
+ afr_local_t *local = NULL;
+ afr_private_t *priv = NULL;
+ struct gf_flock flock = {
+ 0,
+ };
+ int i = 0;
+
+ priv = frame->this->private;
+ local = frame->local;
+ local->cont.lk.dom_locked_nodes = GF_CALLOC(
+ priv->child_count, sizeof(*local->cont.lk.locked_nodes),
+ gf_afr_mt_char);
+ if (!local->cont.lk.dom_locked_nodes) {
+ return -ENOMEM;
+ }
+ local->cont.lk.dom_lock_op_ret = GF_CALLOC(
+ priv->child_count, sizeof(*local->cont.lk.dom_lock_op_ret),
+ gf_afr_mt_int32_t);
+ if (!local->cont.lk.dom_lock_op_ret) {
+ return -ENOMEM; /* CALLOC'd members are freed in afr_local_cleanup. */
+ }
+ local->cont.lk.dom_lock_op_errno = GF_CALLOC(
+ priv->child_count, sizeof(*local->cont.lk.dom_lock_op_errno),
+ gf_afr_mt_int32_t);
+ if (!local->cont.lk.dom_lock_op_errno) {
+ return -ENOMEM; /* CALLOC'd members are freed in afr_local_cleanup. */
+ }
+ flock.l_type = F_WRLCK;
+
+ AFR_ONALL(frame, afr_dom_lock_acquire_cbk, finodelk, AFR_LK_HEAL_DOM,
+ local->fd, F_SETLK, &flock, NULL);
+
+ if (!afr_has_quorum(local->cont.lk.dom_locked_nodes, frame->this, NULL))
+ goto blocking_lock;
+
+ /*If any of the bricks returned EAGAIN, we still need blocking locks.*/
+ if (AFR_COUNT(local->cont.lk.dom_locked_nodes, priv->child_count) !=
+ priv->child_count) {
+ for (i = 0; i < priv->child_count; i++) {
+ if (local->cont.lk.dom_lock_op_ret[i] == -1 &&
+ local->cont.lk.dom_lock_op_errno[i] == EAGAIN)
+ goto blocking_lock;
+ }
+ }
+
+ return 0;
+
+blocking_lock:
+ afr_dom_lock_release(frame);
+ AFR_ONALL(frame, afr_dom_lock_acquire_cbk, finodelk, AFR_LK_HEAL_DOM,
+ local->fd, F_SETLKW, &flock, NULL);
+ if (!afr_has_quorum(local->cont.lk.dom_locked_nodes, frame->this, NULL)) {
+ afr_dom_lock_release(frame);
+ return -afr_quorum_errno(priv);
+ }
+
+ return 0;
+}
+
+int
+afr_dom_lock_release_cbk(call_frame_t *frame, void *cookie, xlator_t *this,
+ int op_ret, int op_errno, dict_t *xdata)
+{
+ afr_local_t *local = frame->local;
+ afr_private_t *priv = this->private;
+ int i = (long)cookie;
+
+ if (op_ret < 0) {
+ gf_msg(this->name, GF_LOG_ERROR, op_errno, AFR_MSG_LK_HEAL_DOM,
+ "%s: Failed to release %s on %s", local->loc.path,
+ AFR_LK_HEAL_DOM, priv->children[i]->name);
+ }
+ local->cont.lk.dom_locked_nodes[i] = 0;
+
+ syncbarrier_wake(&local->barrier);
+
+ return 0;
+}
+
+void
+afr_dom_lock_release(call_frame_t *frame)
+{
+ afr_local_t *local = NULL;
+ afr_private_t *priv = NULL;
+ unsigned char *locked_on = NULL;
+ struct gf_flock flock = {
+ 0,
+ };
+
+ local = frame->local;
+ priv = frame->this->private;
+ locked_on = local->cont.lk.dom_locked_nodes;
+ if (AFR_COUNT(locked_on, priv->child_count) == 0)
+ return;
+ flock.l_type = F_UNLCK;
+
+ AFR_ONLIST(locked_on, frame, afr_dom_lock_release_cbk, finodelk,
+ AFR_LK_HEAL_DOM, local->fd, F_SETLK, &flock, NULL);
+
+ return;
+}
+
+static void
+afr_lk_heal_info_cleanup(afr_lk_heal_info_t *info)
+{
+ if (!info)
+ return;
+ if (info->xdata_req)
+ dict_unref(info->xdata_req);
+ if (info->fd)
+ fd_unref(info->fd);
+ GF_FREE(info->locked_nodes);
+ GF_FREE(info->child_up_event_gen);
+ GF_FREE(info->child_down_event_gen);
+ GF_FREE(info);
+}
+
+static int
+afr_add_lock_to_saved_locks(call_frame_t *frame, xlator_t *this)
+{
+ afr_private_t *priv = this->private;
+ afr_local_t *local = frame->local;
+ afr_lk_heal_info_t *info = NULL;
+ afr_fd_ctx_t *fd_ctx = NULL;
+ int ret = -ENOMEM;
+
+ info = GF_CALLOC(sizeof(*info), 1, gf_afr_mt_lk_heal_info_t);
+ if (!info) {
+ goto cleanup;
+ }
+ INIT_LIST_HEAD(&info->pos);
+ info->fd = fd_ref(local->fd);
+ info->cmd = local->cont.lk.cmd;
+ info->pid = frame->root->pid;
+ info->flock = local->cont.lk.user_flock;
+ info->xdata_req = dict_copy_with_ref(local->xdata_req, NULL);
+ if (!info->xdata_req) {
+ goto cleanup;
+ }
+ info->lk_owner = frame->root->lk_owner;
+ info->locked_nodes = GF_MALLOC(
+ sizeof(*info->locked_nodes) * priv->child_count, gf_afr_mt_char);
+ if (!info->locked_nodes) {
+ goto cleanup;
+ }
+ memcpy(info->locked_nodes, local->cont.lk.locked_nodes,
+ sizeof(*info->locked_nodes) * priv->child_count);
+ info->child_up_event_gen = GF_CALLOC(sizeof(*info->child_up_event_gen),
+ priv->child_count, gf_afr_mt_int32_t);
+ if (!info->child_up_event_gen) {
+ goto cleanup;
+ }
+ info->child_down_event_gen = GF_CALLOC(sizeof(*info->child_down_event_gen),
+ priv->child_count,
+ gf_afr_mt_int32_t);
+ if (!info->child_down_event_gen) {
+ goto cleanup;
+ }
+
+ LOCK(&local->fd->lock);
+ {
+ fd_ctx = __afr_fd_ctx_get(local->fd, this);
+ if (fd_ctx)
+ fd_ctx->lk_heal_info = info;
+ }
+ UNLOCK(&local->fd->lock);
+ if (!fd_ctx) {
+ goto cleanup;
+ }
+
+ LOCK(&priv->lock);
+ {
+ list_add_tail(&info->pos, &priv->saved_locks);
+ }
+ UNLOCK(&priv->lock);
+
+ return 0;
+cleanup:
+ gf_msg(this->name, GF_LOG_ERROR, -ret, AFR_MSG_LK_HEAL_DOM,
+ "%s: Failed to add lock to healq",
+ uuid_utoa(local->fd->inode->gfid));
+ if (info) {
+ afr_lk_heal_info_cleanup(info);
+ if (fd_ctx) {
+ LOCK(&local->fd->lock);
+ {
+ fd_ctx->lk_heal_info = NULL;
+ }
+ UNLOCK(&local->fd->lock);
+ }
+ }
+ return ret;
+}
+
+static int
+afr_remove_lock_from_saved_locks(afr_local_t *local, xlator_t *this)
+{
+ afr_private_t *priv = this->private;
+ struct gf_flock flock = local->cont.lk.user_flock;
+ afr_lk_heal_info_t *info = NULL;
+ afr_fd_ctx_t *fd_ctx = NULL;
+ int ret = -EINVAL;
+
+ fd_ctx = afr_fd_ctx_get(local->fd, this);
+ if (!fd_ctx || !fd_ctx->lk_heal_info) {
+ goto out;
+ }
+
+ info = fd_ctx->lk_heal_info;
+ if ((info->flock.l_start != flock.l_start) ||
+ (info->flock.l_whence != flock.l_whence) ||
+ (info->flock.l_len != flock.l_len)) {
+ /*TODO: Compare lkowners too.*/
+ goto out;
+ }
+
+ LOCK(&priv->lock);
+ {
+ list_del(&fd_ctx->lk_heal_info->pos);
+ }
+ UNLOCK(&priv->lock);
+
+ afr_lk_heal_info_cleanup(info);
+ fd_ctx->lk_heal_info = NULL;
+ ret = 0;
out:
- return ret;
+ if (ret)
+ gf_msg(this->name, GF_LOG_ERROR, -ret, AFR_MSG_LK_HEAL_DOM,
+ "%s: Failed to remove lock from healq",
+ uuid_utoa(local->fd->inode->gfid));
+ return ret;
+}
+
+int
+afr_lock_heal_cbk(call_frame_t *frame, void *cookie, xlator_t *this,
+ int32_t op_ret, int32_t op_errno, struct gf_flock *lock,
+ dict_t *xdata)
+{
+ afr_local_t *local = frame->local;
+ int i = (long)cookie;
+
+ local->replies[i].valid = 1;
+ local->replies[i].op_ret = op_ret;
+ local->replies[i].op_errno = op_errno;
+ if (op_ret != 0) {
+ gf_msg(this->name, GF_LOG_ERROR, op_errno, AFR_MSG_LK_HEAL_DOM,
+ "Failed to heal lock on child %d for %s", i,
+ uuid_utoa(local->fd->inode->gfid));
+ }
+ syncbarrier_wake(&local->barrier);
+ return 0;
+}
+
+int
+afr_getlk_cbk(call_frame_t *frame, void *cookie, xlator_t *this, int32_t op_ret,
+ int32_t op_errno, struct gf_flock *lock, dict_t *xdata)
+{
+ afr_local_t *local = frame->local;
+ int i = (long)cookie;
+
+ local->replies[i].valid = 1;
+ local->replies[i].op_ret = op_ret;
+ local->replies[i].op_errno = op_errno;
+ if (op_ret != 0) {
+ gf_msg(this->name, GF_LOG_ERROR, op_errno, AFR_MSG_LK_HEAL_DOM,
+ "Failed getlk for %s", uuid_utoa(local->fd->inode->gfid));
+ } else {
+ local->cont.lk.getlk_rsp[i] = *lock;
+ }
+
+ syncbarrier_wake(&local->barrier);
+ return 0;
+}
+
+static gf_boolean_t
+afr_does_lk_owner_match(call_frame_t *frame, afr_private_t *priv,
+ afr_lk_heal_info_t *info)
+{
+ int i = 0;
+ afr_local_t *local = frame->local;
+ struct gf_flock flock = {
+ 0,
+ };
+ gf_boolean_t ret = _gf_true;
+ char *wind_on = alloca0(priv->child_count);
+ unsigned char *success_replies = alloca0(priv->child_count);
+ local->cont.lk.getlk_rsp = GF_CALLOC(sizeof(*local->cont.lk.getlk_rsp),
+ priv->child_count, gf_afr_mt_gf_lock);
+
+ flock = info->flock;
+ for (i = 0; i < priv->child_count; i++) {
+ if (info->locked_nodes[i])
+ wind_on[i] = 1;
+ }
+
+ AFR_ONLIST(wind_on, frame, afr_getlk_cbk, lk, info->fd, F_GETLK, &flock,
+ info->xdata_req);
+
+ afr_fill_success_replies(local, priv, success_replies);
+ if (AFR_COUNT(success_replies, priv->child_count) == 0) {
+ ret = _gf_false;
+ goto out;
+ }
+
+ for (i = 0; i < priv->child_count; i++) {
+ if (!local->replies[i].valid || local->replies[i].op_ret != 0)
+ continue;
+ if (local->cont.lk.getlk_rsp[i].l_type == F_UNLCK)
+ continue;
+ /*TODO: Do we really need to compare lkowner if F_UNLCK is true?*/
+ if (!is_same_lkowner(&local->cont.lk.getlk_rsp[i].l_owner,
+ &info->lk_owner)) {
+ ret = _gf_false;
+ break;
+ }
+ }
+out:
+ afr_local_replies_wipe(local, priv);
+ GF_FREE(local->cont.lk.getlk_rsp);
+ local->cont.lk.getlk_rsp = NULL;
+ return ret;
+}
+
+static void
+afr_mark_fd_bad(fd_t *fd, xlator_t *this)
+{
+ afr_fd_ctx_t *fd_ctx = NULL;
+
+ if (!fd)
+ return;
+ LOCK(&fd->lock);
+ {
+ fd_ctx = __afr_fd_ctx_get(fd, this);
+ if (fd_ctx) {
+ fd_ctx->is_fd_bad = _gf_true;
+ fd_ctx->lk_heal_info = NULL;
+ }
+ }
+ UNLOCK(&fd->lock);
+}
+
+static void
+afr_add_lock_to_lkhealq(afr_private_t *priv, afr_lk_heal_info_t *info)
+{
+ LOCK(&priv->lock);
+ {
+ list_del(&info->pos);
+ list_add_tail(&info->pos, &priv->lk_healq);
+ }
+ UNLOCK(&priv->lock);
+}
+
+static void
+afr_lock_heal_do(call_frame_t *frame, afr_private_t *priv,
+ afr_lk_heal_info_t *info)
+{
+ int i = 0;
+ int op_errno = 0;
+ int32_t *current_event_gen = NULL;
+ afr_local_t *local = frame->local;
+ xlator_t *this = frame->this;
+ char *wind_on = alloca0(priv->child_count);
+ gf_boolean_t retry = _gf_true;
+
+ frame->root->pid = info->pid;
+ lk_owner_copy(&frame->root->lk_owner, &info->lk_owner);
+
+ op_errno = -afr_dom_lock_acquire(frame);
+ if ((op_errno != 0)) {
+ goto release;
+ }
+
+ if (!afr_does_lk_owner_match(frame, priv, info)) {
+ gf_msg(this->name, GF_LOG_WARNING, 0, AFR_MSG_LK_HEAL_DOM,
+ "Ignoring lock heal for %s since lk-onwers mismatch. "
+ "Lock possibly pre-empted by another client.",
+ uuid_utoa(info->fd->inode->gfid));
+ goto release;
+ }
+
+ for (i = 0; i < priv->child_count; i++) {
+ if (info->locked_nodes[i])
+ continue;
+ wind_on[i] = 1;
+ }
+
+ current_event_gen = alloca(priv->child_count);
+ memcpy(current_event_gen, info->child_up_event_gen,
+ priv->child_count * sizeof *current_event_gen);
+ AFR_ONLIST(wind_on, frame, afr_lock_heal_cbk, lk, info->fd, info->cmd,
+ &info->flock, info->xdata_req);
+
+ LOCK(&priv->lock);
+ {
+ for (i = 0; i < priv->child_count; i++) {
+ if (!wind_on[i])
+ continue;
+ if ((!local->replies[i].valid) || (local->replies[i].op_ret != 0)) {
+ continue;
+ }
+
+ if ((current_event_gen[i] == info->child_up_event_gen[i]) &&
+ (current_event_gen[i] > info->child_down_event_gen[i])) {
+ info->locked_nodes[i] = 1;
+ retry = _gf_false;
+ list_del_init(&info->pos);
+ list_add_tail(&info->pos, &priv->saved_locks);
+ } else {
+ /*We received subsequent child up/down events while heal was in
+ * progress; don't mark child as healed. Attempt again on the
+ * new child up*/
+ gf_msg(this->name, GF_LOG_ERROR, 0, AFR_MSG_LK_HEAL_DOM,
+ "Event gen mismatch: skipped healing lock on child %d "
+ "for %s.",
+ i, uuid_utoa(info->fd->inode->gfid));
+ }
+ }
+ }
+ UNLOCK(&priv->lock);
+
+release:
+ afr_dom_lock_release(frame);
+ if (retry)
+ afr_add_lock_to_lkhealq(priv, info);
+ return;
+}
+
+static int
+afr_lock_heal_done(int ret, call_frame_t *frame, void *opaque)
+{
+ STACK_DESTROY(frame->root);
+ return 0;
+}
+
+static int
+afr_lock_heal(void *opaque)
+{
+ call_frame_t *frame = (call_frame_t *)opaque;
+ call_frame_t *iter_frame = NULL;
+ xlator_t *this = frame->this;
+ afr_private_t *priv = this->private;
+ afr_lk_heal_info_t *info = NULL;
+ afr_lk_heal_info_t *tmp = NULL;
+ struct list_head healq = {
+ 0,
+ };
+ int ret = 0;
+
+ iter_frame = afr_copy_frame(frame);
+ if (!iter_frame) {
+ return ENOMEM;
+ }
+
+ INIT_LIST_HEAD(&healq);
+ LOCK(&priv->lock);
+ {
+ list_splice_init(&priv->lk_healq, &healq);
+ }
+ UNLOCK(&priv->lock);
+
+ list_for_each_entry_safe(info, tmp, &healq, pos)
+ {
+ GF_ASSERT((AFR_COUNT(info->locked_nodes, priv->child_count) <
+ priv->child_count));
+ ((afr_local_t *)(iter_frame->local))->fd = fd_ref(info->fd);
+ afr_lock_heal_do(iter_frame, priv, info);
+ AFR_STACK_RESET(iter_frame);
+ if (iter_frame->local == NULL) {
+ ret = ENOTCONN;
+ gf_msg(frame->this->name, GF_LOG_ERROR, ENOTCONN,
+ AFR_MSG_LK_HEAL_DOM,
+ "Aborting processing of lk_healq."
+ "Healing will be reattempted on next child up for locks "
+ "that are still in quorum.");
+ LOCK(&priv->lock);
+ {
+ list_add_tail(&healq, &priv->lk_healq);
+ }
+ UNLOCK(&priv->lock);
+ break;
+ }
+ }
+
+ AFR_STACK_DESTROY(iter_frame);
+ return ret;
+}
+
+static int
+__afr_lock_heal_synctask(xlator_t *this, afr_private_t *priv, int child)
+{
+ int ret = 0;
+ call_frame_t *frame = NULL;
+ afr_lk_heal_info_t *info = NULL;
+ afr_lk_heal_info_t *tmp = NULL;
+
+ if (priv->shd.iamshd)
+ return 0;
+
+ list_for_each_entry_safe(info, tmp, &priv->saved_locks, pos)
+ {
+ info->child_up_event_gen[child] = priv->event_generation;
+ list_del_init(&info->pos);
+ list_add_tail(&info->pos, &priv->lk_healq);
+ }
+
+ frame = create_frame(this, this->ctx->pool);
+ if (!frame)
+ return -1;
+
+ ret = synctask_new(this->ctx->env, afr_lock_heal, afr_lock_heal_done, frame,
+ frame);
+ if (ret)
+ gf_msg(this->name, GF_LOG_ERROR, ENOMEM, AFR_MSG_LK_HEAL_DOM,
+ "Failed to launch lock heal synctask");
+
+ return ret;
+}
+
+static int
+__afr_mark_pending_lk_heal(xlator_t *this, afr_private_t *priv, int child)
+{
+ afr_lk_heal_info_t *info = NULL;
+ afr_lk_heal_info_t *tmp = NULL;
+
+ if (priv->shd.iamshd)
+ return 0;
+ list_for_each_entry_safe(info, tmp, &priv->saved_locks, pos)
+ {
+ info->child_down_event_gen[child] = priv->event_generation;
+ if (info->locked_nodes[child] == 1)
+ info->locked_nodes[child] = 0;
+ if (!afr_has_quorum(info->locked_nodes, this, NULL)) {
+ /* Since the lock was lost on quorum no. of nodes, we should
+ * not attempt to heal it anymore. Some other client could have
+ * acquired the lock, modified data and released it and this
+ * client wouldn't know about it if we heal it.*/
+ afr_mark_fd_bad(info->fd, this);
+ list_del(&info->pos);
+ afr_lk_heal_info_cleanup(info);
+ /* We're not winding an unlock on the node where the lock is still
+ * present because when fencing logic switches over to the new
+ * client (since we marked the fd bad), it should preempt any
+ * existing lock. */
+ }
+ }
+ return 0;
+}
+
+gf_boolean_t
+afr_is_consistent_io_possible(afr_local_t *local, afr_private_t *priv,
+ int32_t *op_errno)
+{
+ if (priv->consistent_io && local->call_count != priv->child_count) {
+ gf_msg(THIS->name, GF_LOG_INFO, 0, AFR_MSG_SUBVOLS_DOWN,
+ "All subvolumes are not up");
+ if (op_errno)
+ *op_errno = ENOTCONN;
+ return _gf_false;
+ }
+ return _gf_true;
+}
+
+gf_boolean_t
+afr_is_lock_mode_mandatory(dict_t *xdata)
+{
+ int ret = 0;
+ uint32_t lk_mode = GF_LK_ADVISORY;
+
+ ret = dict_get_uint32(xdata, GF_LOCK_MODE, &lk_mode);
+ if (!ret && lk_mode == GF_LK_MANDATORY)
+ return _gf_true;
+
+ return _gf_false;
+}
+
+call_frame_t *
+afr_copy_frame(call_frame_t *base)
+{
+ afr_local_t *local = NULL;
+ call_frame_t *frame = NULL;
+ int op_errno = 0;
+
+ frame = copy_frame(base);
+ if (!frame)
+ return NULL;
+ local = AFR_FRAME_INIT(frame, op_errno);
+ if (!local) {
+ AFR_STACK_DESTROY(frame);
+ return NULL;
+ }
+
+ return frame;
+}
+
+/* Check if an entry or inode could be undergoing a transaction. */
+gf_boolean_t
+afr_is_possibly_under_txn(afr_transaction_type type, afr_local_t *local,
+ xlator_t *this)
+{
+ int i = 0;
+ int tmp = 0;
+ afr_private_t *priv = NULL;
+ GF_UNUSED char *key = NULL;
+ int keylen = 0;
+
+ priv = this->private;
+
+ if (type == AFR_ENTRY_TRANSACTION) {
+ key = GLUSTERFS_PARENT_ENTRYLK;
+ keylen = SLEN(GLUSTERFS_PARENT_ENTRYLK);
+ } else if (type == AFR_DATA_TRANSACTION) {
+ /*FIXME: Use GLUSTERFS_INODELK_DOM_COUNT etc. once
+ * pl_inodelk_xattr_fill supports separate keys for different
+ * domains.*/
+ key = GLUSTERFS_INODELK_COUNT;
+ keylen = SLEN(GLUSTERFS_INODELK_COUNT);
+ }
+ for (i = 0; i < priv->child_count; i++) {
+ if (!local->replies[i].xdata)
+ continue;
+ if (dict_get_int32n(local->replies[i].xdata, key, keylen, &tmp) == 0)
+ if (tmp)
+ return _gf_true;
+ }
+
+ return _gf_false;
+}
+
+static void
+afr_inode_ctx_destroy(afr_inode_ctx_t *ctx)
+{
+ int i = 0;
+
+ if (!ctx)
+ return;
+
+ for (i = 0; i < AFR_NUM_CHANGE_LOGS; i++) {
+ GF_FREE(ctx->pre_op_done[i]);
+ }
+
+ GF_FREE(ctx);
+}
+
+int
+__afr_inode_ctx_get(xlator_t *this, inode_t *inode, afr_inode_ctx_t **ctx)
+{
+ uint64_t ctx_int = 0;
+ int ret = -1;
+ int i = -1;
+ int num_locks = -1;
+ afr_inode_ctx_t *ictx = NULL;
+ afr_lock_t *lock = NULL;
+ afr_private_t *priv = this->private;
+
+ ret = __inode_ctx_get(inode, this, &ctx_int);
+ if (ret == 0) {
+ *ctx = (afr_inode_ctx_t *)(uintptr_t)ctx_int;
+ return 0;
+ }
+
+ ictx = GF_CALLOC(1, sizeof(afr_inode_ctx_t), gf_afr_mt_inode_ctx_t);
+ if (!ictx)
+ goto out;
+
+ for (i = 0; i < AFR_NUM_CHANGE_LOGS; i++) {
+ ictx->pre_op_done[i] = GF_CALLOC(sizeof *ictx->pre_op_done[i],
+ priv->child_count, gf_afr_mt_int32_t);
+ if (!ictx->pre_op_done[i]) {
+ ret = -ENOMEM;
+ goto out;
+ }
+ }
+
+ num_locks = sizeof(ictx->lock) / sizeof(afr_lock_t);
+ for (i = 0; i < num_locks; i++) {
+ lock = &ictx->lock[i];
+ INIT_LIST_HEAD(&lock->post_op);
+ INIT_LIST_HEAD(&lock->frozen);
+ INIT_LIST_HEAD(&lock->waiting);
+ INIT_LIST_HEAD(&lock->owners);
+ }
+
+ ctx_int = (uint64_t)(uintptr_t)ictx;
+ ret = __inode_ctx_set(inode, this, &ctx_int);
+ if (ret) {
+ goto out;
+ }
+
+ ictx->spb_choice = -1;
+ ictx->read_subvol = 0;
+ ictx->write_subvol = 0;
+ ictx->lock_count = 0;
+ ret = 0;
+ *ctx = ictx;
+out:
+ if (ret) {
+ afr_inode_ctx_destroy(ictx);
+ }
+ return ret;
}
+
/*
* INODE CTX 64-bit VALUE FORMAT FOR SMALL (<= 16) SUBVOL COUNTS:
*
@@ -124,1677 +863,2258 @@ out:
*/
int
-__afr_inode_read_subvol_get_small (inode_t *inode, xlator_t *this,
- unsigned char *data, unsigned char *metadata,
- int *event_p)
-{
- afr_private_t *priv = NULL;
- int ret = -1;
- uint16_t datamap = 0;
- uint16_t metadatamap = 0;
- uint32_t event = 0;
- uint64_t val = 0;
- int i = 0;
- afr_inode_ctx_t *ctx = NULL;
+__afr_set_in_flight_sb_status(xlator_t *this, afr_local_t *local,
+ inode_t *inode)
+{
+ int i = 0;
+ int txn_type = 0;
+ int count = 0;
+ int index = -1;
+ uint16_t datamap_old = 0;
+ uint16_t metadatamap_old = 0;
+ uint16_t datamap = 0;
+ uint16_t metadatamap = 0;
+ uint16_t tmp_map = 0;
+ uint16_t mask = 0;
+ uint32_t event = 0;
+ uint64_t val = 0;
+ afr_private_t *priv = NULL;
+
+ priv = this->private;
+ txn_type = local->transaction.type;
+
+ if (txn_type == AFR_DATA_TRANSACTION)
+ val = local->inode_ctx->write_subvol;
+ else
+ val = local->inode_ctx->read_subvol;
+
+ metadatamap_old = metadatamap = (val & 0x000000000000ffff);
+ datamap_old = datamap = (val & 0x00000000ffff0000) >> 16;
+ event = (val & 0xffffffff00000000) >> 32;
+
+ if (txn_type == AFR_DATA_TRANSACTION)
+ tmp_map = datamap;
+ else if (txn_type == AFR_METADATA_TRANSACTION)
+ tmp_map = metadatamap;
+
+ count = gf_bits_count(tmp_map);
+
+ for (i = 0; i < priv->child_count; i++) {
+ if (!local->transaction.failed_subvols[i])
+ continue;
+
+ mask = 1 << i;
+ if (txn_type == AFR_METADATA_TRANSACTION)
+ metadatamap &= ~mask;
+ else if (txn_type == AFR_DATA_TRANSACTION)
+ datamap &= ~mask;
+ }
+
+ switch (txn_type) {
+ case AFR_METADATA_TRANSACTION:
+ if ((metadatamap_old != 0) && (metadatamap == 0) && (count == 1)) {
+ index = gf_bits_index(tmp_map);
+ local->transaction.in_flight_sb_errno = local->replies[index]
+ .op_errno;
+ local->transaction.in_flight_sb = _gf_true;
+ metadatamap |= (1 << index);
+ }
+ if (metadatamap_old != metadatamap) {
+ __afr_inode_need_refresh_set(inode, this);
+ }
+ break;
+
+ case AFR_DATA_TRANSACTION:
+ if ((datamap_old != 0) && (datamap == 0) && (count == 1)) {
+ index = gf_bits_index(tmp_map);
+ local->transaction.in_flight_sb_errno = local->replies[index]
+ .op_errno;
+ local->transaction.in_flight_sb = _gf_true;
+ datamap |= (1 << index);
+ }
+ if (datamap_old != datamap)
+ __afr_inode_need_refresh_set(inode, this);
+ break;
- priv = this->private;
+ default:
+ break;
+ }
- ret = __afr_inode_ctx_get (this, inode, &ctx);
- if (ret < 0)
- return ret;
+ val = ((uint64_t)metadatamap) | (((uint64_t)datamap) << 16) |
+ (((uint64_t)event) << 32);
- val = ctx->read_subvol;
+ if (txn_type == AFR_DATA_TRANSACTION)
+ local->inode_ctx->write_subvol = val;
+ local->inode_ctx->read_subvol = val;
- metadatamap = (val & 0x000000000000ffff);
- datamap = (val & 0x00000000ffff0000) >> 16;
- event = (val & 0xffffffff00000000) >> 32;
+ return 0;
+}
- for (i = 0; i < priv->child_count; i++) {
- if (metadata)
- metadata[i] = (metadatamap >> i) & 1;
- if (data)
- data[i] = (datamap >> i) & 1;
- }
+gf_boolean_t
+afr_is_symmetric_error(call_frame_t *frame, xlator_t *this)
+{
+ afr_local_t *local = NULL;
+ afr_private_t *priv = NULL;
+ int op_errno = 0;
+ int i_errno = 0;
+ gf_boolean_t matching_errors = _gf_true;
+ int i = 0;
- if (event_p)
- *event_p = event;
- return ret;
-}
+ priv = this->private;
+ local = frame->local;
+
+ for (i = 0; i < priv->child_count; i++) {
+ if (!local->replies[i].valid)
+ continue;
+ if (local->replies[i].op_ret != -1) {
+ /* Operation succeeded on at least one subvol,
+ so it is not a failed-everywhere situation.
+ */
+ matching_errors = _gf_false;
+ break;
+ }
+ i_errno = local->replies[i].op_errno;
+
+ if (i_errno == ENOTCONN) {
+ /* ENOTCONN is not a symmetric error. We do not
+ know if the operation was performed on the
+ backend or not.
+ */
+ matching_errors = _gf_false;
+ break;
+ }
+ if (!op_errno) {
+ op_errno = i_errno;
+ } else if (op_errno != i_errno) {
+ /* Mismatching op_errno's */
+ matching_errors = _gf_false;
+ break;
+ }
+ }
+
+ return matching_errors;
+}
int
-__afr_inode_read_subvol_set_small (inode_t *inode, xlator_t *this,
- unsigned char *data, unsigned char *metadata,
- int event)
+afr_set_in_flight_sb_status(xlator_t *this, call_frame_t *frame, inode_t *inode)
{
- afr_private_t *priv = NULL;
- uint16_t datamap = 0;
- uint16_t metadatamap = 0;
- uint64_t val = 0;
- int i = 0;
- int ret = -1;
- afr_inode_ctx_t *ctx = NULL;
+ int ret = -1;
+ afr_private_t *priv = NULL;
+ afr_local_t *local = NULL;
- priv = this->private;
+ priv = this->private;
+ local = frame->local;
- ret = __afr_inode_ctx_get (this, inode, &ctx);
- if (ret)
- goto out;
+ /* If this transaction saw no failures, then exit. */
+ if (AFR_COUNT(local->transaction.failed_subvols, priv->child_count) == 0)
+ return 0;
- for (i = 0; i < priv->child_count; i++) {
- if (data[i])
- datamap |= (1 << i);
- if (metadata[i])
- metadatamap |= (1 << i);
- }
+ if (afr_is_symmetric_error(frame, this))
+ return 0;
- val = ((uint64_t) metadatamap) |
- (((uint64_t) datamap) << 16) |
- (((uint64_t) event) << 32);
+ LOCK(&inode->lock);
+ {
+ ret = __afr_set_in_flight_sb_status(this, local, inode);
+ }
+ UNLOCK(&inode->lock);
- ctx->read_subvol = val;
+ return ret;
+}
- ret = 0;
-out:
+int
+__afr_inode_read_subvol_get_small(inode_t *inode, xlator_t *this,
+ unsigned char *data, unsigned char *metadata,
+ int *event_p)
+{
+ afr_private_t *priv = NULL;
+ int ret = -1;
+ uint16_t datamap = 0;
+ uint16_t metadatamap = 0;
+ uint32_t event = 0;
+ uint64_t val = 0;
+ int i = 0;
+ afr_inode_ctx_t *ctx = NULL;
+
+ priv = this->private;
+
+ ret = __afr_inode_ctx_get(this, inode, &ctx);
+ if (ret < 0)
return ret;
+
+ val = ctx->read_subvol;
+
+ metadatamap = (val & 0x000000000000ffff);
+ datamap = (val & 0x00000000ffff0000) >> 16;
+ event = (val & 0xffffffff00000000) >> 32;
+
+ for (i = 0; i < priv->child_count; i++) {
+ if (metadata)
+ metadata[i] = (metadatamap >> i) & 1;
+ if (data)
+ data[i] = (datamap >> i) & 1;
+ }
+
+ if (event_p)
+ *event_p = event;
+ return ret;
}
int
-__afr_inode_read_subvol_reset_small (inode_t *inode, xlator_t *this)
+__afr_inode_read_subvol_set_small(inode_t *inode, xlator_t *this,
+ unsigned char *data, unsigned char *metadata,
+ int event)
{
- int ret = -1;
- uint16_t datamap = 0;
- uint16_t metadatamap = 0;
- uint32_t event = 0;
- uint64_t val = 0;
- afr_inode_ctx_t *ctx = NULL;
+ afr_private_t *priv = NULL;
+ uint16_t datamap = 0;
+ uint16_t metadatamap = 0;
+ uint64_t val = 0;
+ int i = 0;
+ int ret = -1;
+ afr_inode_ctx_t *ctx = NULL;
- ret = __afr_inode_ctx_get (this, inode, &ctx);
- if (ret)
- return ret;
+ priv = this->private;
- val = ctx->read_subvol;
+ ret = __afr_inode_ctx_get(this, inode, &ctx);
+ if (ret)
+ goto out;
- metadatamap = (val & 0x000000000000ffff) >> 0;
- datamap = (val & 0x00000000ffff0000) >> 16;
- event = 0;
+ for (i = 0; i < priv->child_count; i++) {
+ if (data[i])
+ datamap |= (1 << i);
+ if (metadata[i])
+ metadatamap |= (1 << i);
+ }
- val = ((uint64_t) metadatamap) |
- (((uint64_t) datamap) << 16) |
- (((uint64_t) event) << 32);
+ val = ((uint64_t)metadatamap) | (((uint64_t)datamap) << 16) |
+ (((uint64_t)event) << 32);
- ctx->read_subvol = val;
+ ctx->read_subvol = val;
- return ret;
+ ret = 0;
+out:
+ return ret;
}
-
int
-__afr_inode_read_subvol_get (inode_t *inode, xlator_t *this,
- unsigned char *data, unsigned char *metadata,
- int *event_p)
+__afr_inode_read_subvol_get(inode_t *inode, xlator_t *this, unsigned char *data,
+ unsigned char *metadata, int *event_p)
{
- afr_private_t *priv = NULL;
- int ret = -1;
+ afr_private_t *priv = NULL;
+ int ret = -1;
- priv = this->private;
+ priv = this->private;
- if (priv->child_count <= 16)
- ret = __afr_inode_read_subvol_get_small (inode, this, data,
- metadata, event_p);
- else
- /* TBD: allocate structure with array and read from it */
- ret = -1;
+ if (priv->child_count <= 16)
+ ret = __afr_inode_read_subvol_get_small(inode, this, data, metadata,
+ event_p);
+ else
+ /* TBD: allocate structure with array and read from it */
+ ret = -1;
- return ret;
+ return ret;
}
int
-__afr_inode_split_brain_choice_get (inode_t *inode, xlator_t *this,
- int *spb_choice)
+__afr_inode_split_brain_choice_get(inode_t *inode, xlator_t *this,
+ int *spb_choice)
{
- afr_inode_ctx_t *ctx = NULL;
- int ret = -1;
+ afr_inode_ctx_t *ctx = NULL;
+ int ret = -1;
- ret = __afr_inode_ctx_get (this, inode, &ctx);
- if (ret < 0)
- return ret;
+ ret = __afr_inode_ctx_get(this, inode, &ctx);
+ if (ret < 0)
+ return ret;
- *spb_choice = ctx->spb_choice;
- return 0;
+ *spb_choice = ctx->spb_choice;
+ return 0;
}
int
-__afr_inode_read_subvol_set (inode_t *inode, xlator_t *this, unsigned char *data,
- unsigned char *metadata, int event)
+__afr_inode_read_subvol_set(inode_t *inode, xlator_t *this, unsigned char *data,
+ unsigned char *metadata, int event)
{
- afr_private_t *priv = NULL;
- int ret = -1;
+ afr_private_t *priv = NULL;
+ int ret = -1;
- priv = this->private;
+ priv = this->private;
- if (priv->child_count <= 16)
- ret = __afr_inode_read_subvol_set_small (inode, this, data,
- metadata, event);
- else
- ret = -1;
+ if (priv->child_count <= 16)
+ ret = __afr_inode_read_subvol_set_small(inode, this, data, metadata,
+ event);
+ else
+ ret = -1;
- return ret;
+ return ret;
}
int
-__afr_inode_split_brain_choice_set (inode_t *inode, xlator_t *this,
- int spb_choice)
+__afr_inode_split_brain_choice_set(inode_t *inode, xlator_t *this,
+ int spb_choice)
{
- afr_inode_ctx_t *ctx = NULL;
- int ret = -1;
+ afr_inode_ctx_t *ctx = NULL;
+ int ret = -1;
- ret = __afr_inode_ctx_get (this, inode, &ctx);
- if (ret)
- goto out;
+ ret = __afr_inode_ctx_get(this, inode, &ctx);
+ if (ret)
+ goto out;
- ctx->spb_choice = spb_choice;
+ ctx->spb_choice = spb_choice;
- ret = 0;
+ ret = 0;
out:
- return ret;
+ return ret;
}
int
-__afr_inode_read_subvol_reset (inode_t *inode, xlator_t *this)
+afr_inode_read_subvol_get(inode_t *inode, xlator_t *this, unsigned char *data,
+ unsigned char *metadata, int *event_p)
{
- afr_private_t *priv = NULL;
- int ret = -1;
-
- priv = this->private;
+ int ret = -1;
- if (priv->child_count <= 16)
- ret = __afr_inode_read_subvol_reset_small (inode, this);
- else
- ret = -1;
+ GF_VALIDATE_OR_GOTO(this->name, inode, out);
- return ret;
+ LOCK(&inode->lock);
+ {
+ ret = __afr_inode_read_subvol_get(inode, this, data, metadata, event_p);
+ }
+ UNLOCK(&inode->lock);
+out:
+ return ret;
}
-
int
-afr_inode_read_subvol_get (inode_t *inode, xlator_t *this, unsigned char *data,
- unsigned char *metadata, int *event_p)
+afr_inode_get_readable(call_frame_t *frame, inode_t *inode, xlator_t *this,
+ unsigned char *readable, int *event_p, int type)
{
- int ret = -1;
+ afr_private_t *priv = this->private;
+ afr_local_t *local = frame->local;
+ unsigned char *data = alloca0(priv->child_count);
+ unsigned char *metadata = alloca0(priv->child_count);
+ int data_count = 0;
+ int metadata_count = 0;
+ int event_generation = 0;
+ int ret = 0;
+
+ ret = afr_inode_read_subvol_get(inode, this, data, metadata,
+ &event_generation);
+ if (ret == -1)
+ return -EIO;
+
+ data_count = AFR_COUNT(data, priv->child_count);
+ metadata_count = AFR_COUNT(metadata, priv->child_count);
+
+ if (inode->ia_type == IA_IFDIR) {
+ /* For directories, allow even if it is in data split-brain. */
+ if (type == AFR_METADATA_TRANSACTION || local->op == GF_FOP_STAT ||
+ local->op == GF_FOP_FSTAT) {
+ if (!metadata_count)
+ return -EIO;
+ }
+ } else {
+ /* For files, abort in case of data/metadata split-brain. */
+ if (!data_count || !metadata_count) {
+ return -EIO;
+ }
+ }
- GF_VALIDATE_OR_GOTO (this->name, inode, out);
+ if (type == AFR_METADATA_TRANSACTION && readable)
+ memcpy(readable, metadata, priv->child_count * sizeof *metadata);
+ if (type == AFR_DATA_TRANSACTION && readable) {
+ if (!data_count)
+ memcpy(readable, local->child_up,
+ priv->child_count * sizeof *readable);
+ else
+ memcpy(readable, data, priv->child_count * sizeof *data);
+ }
+ if (event_p)
+ *event_p = event_generation;
+ return 0;
+}
+
+static int
+afr_inode_split_brain_choice_get(inode_t *inode, xlator_t *this,
+ int *spb_choice)
+{
+ int ret = -1;
+ GF_VALIDATE_OR_GOTO(this->name, inode, out);
- LOCK(&inode->lock);
- {
- ret = __afr_inode_read_subvol_get (inode, this, data,
- metadata, event_p);
- }
- UNLOCK(&inode->lock);
+ LOCK(&inode->lock);
+ {
+ ret = __afr_inode_split_brain_choice_get(inode, this, spb_choice);
+ }
+ UNLOCK(&inode->lock);
out:
- return ret;
+ return ret;
}
+/*
+ * frame is used to get the favourite policy. Since
+ * afr_inode_split_brain_choice_get was called with afr_open, it is possible to
+ * have a frame with out local->replies. So in that case, frame is passed as
+ * null, hence this function will handle the frame NULL case.
+ */
int
-afr_inode_get_readable (call_frame_t *frame, inode_t *inode, xlator_t *this,
- unsigned char *readable, int *event_p, int type)
+afr_split_brain_read_subvol_get(inode_t *inode, xlator_t *this,
+ call_frame_t *frame, int *spb_subvol)
{
+ int ret = -1;
+ afr_local_t *local = NULL;
+ afr_private_t *priv = NULL;
- afr_private_t *priv = this->private;
- afr_local_t *local = frame->local;
- unsigned char *data = alloca0 (priv->child_count);
- unsigned char *metadata = alloca0 (priv->child_count);
- int data_count = 0;
- int metadata_count = 0;
- int event_generation = 0;
- int ret = 0;
+ GF_VALIDATE_OR_GOTO("afr", this, out);
+ GF_VALIDATE_OR_GOTO(this->name, this->private, out);
+ GF_VALIDATE_OR_GOTO(this->name, inode, out);
+ GF_VALIDATE_OR_GOTO(this->name, spb_subvol, out);
- /* We don't care about split-brains for entry transactions. */
- if (type == AFR_ENTRY_TRANSACTION || type == AFR_ENTRY_RENAME_TRANSACTION)
- return 0;
+ priv = this->private;
- ret = afr_inode_read_subvol_get (inode, this, data, metadata,
- &event_generation);
- if (ret == -1)
- return -EIO;
-
- data_count = AFR_COUNT (data, priv->child_count);
- metadata_count = AFR_COUNT (metadata, priv->child_count);
+ ret = afr_inode_split_brain_choice_get(inode, this, spb_subvol);
+ if (*spb_subvol < 0 && priv->fav_child_policy && frame && frame->local) {
+ local = frame->local;
+ *spb_subvol = afr_sh_get_fav_by_policy(this, local->replies, inode,
+ NULL);
+ if (*spb_subvol >= 0) {
+ ret = 0;
+ }
+ }
- if (inode->ia_type == IA_IFDIR) {
- /* For directories, allow even if it is in data split-brain. */
- if (type == AFR_METADATA_TRANSACTION ||
- local->op == GF_FOP_STAT || local->op == GF_FOP_FSTAT) {
- if (!metadata_count)
- return -EIO;
- }
- } else {
- /* For files, abort in case of data/metadata split-brain. */
- if (!data_count || !metadata_count)
- return -EIO;
- }
-
- if (type == AFR_METADATA_TRANSACTION && readable)
- memcpy (readable, metadata, priv->child_count * sizeof *metadata);
- if (type == AFR_DATA_TRANSACTION && readable) {
- if (!data_count)
- memcpy (readable, local->child_up,
- priv->child_count * sizeof *readable);
- else
- memcpy (readable, data, priv->child_count * sizeof *data);
- }
- if (event_p)
- *event_p = event_generation;
- return 0;
+out:
+ return ret;
}
-
int
-afr_inode_split_brain_choice_get (inode_t *inode, xlator_t *this,
- int *spb_choice)
+afr_inode_read_subvol_set(inode_t *inode, xlator_t *this, unsigned char *data,
+ unsigned char *metadata, int event)
{
- int ret = -1;
+ int ret = -1;
- GF_VALIDATE_OR_GOTO (this->name, inode, out);
+ GF_VALIDATE_OR_GOTO(this->name, inode, out);
- LOCK(&inode->lock);
- {
- ret = __afr_inode_split_brain_choice_get (inode, this,
- spb_choice);
- }
- UNLOCK(&inode->lock);
+ LOCK(&inode->lock);
+ {
+ ret = __afr_inode_read_subvol_set(inode, this, data, metadata, event);
+ }
+ UNLOCK(&inode->lock);
out:
- return ret;
+ return ret;
}
-
int
-afr_inode_read_subvol_set (inode_t *inode, xlator_t *this, unsigned char *data,
- unsigned char *metadata, int event)
+afr_inode_split_brain_choice_set(inode_t *inode, xlator_t *this, int spb_choice)
{
- int ret = -1;
+ int ret = -1;
- GF_VALIDATE_OR_GOTO (this->name, inode, out);
+ GF_VALIDATE_OR_GOTO(this->name, inode, out);
- LOCK(&inode->lock);
- {
- ret = __afr_inode_read_subvol_set (inode, this, data, metadata,
- event);
- }
- UNLOCK(&inode->lock);
+ LOCK(&inode->lock);
+ {
+ ret = __afr_inode_split_brain_choice_set(inode, this, spb_choice);
+ }
+ UNLOCK(&inode->lock);
out:
- return ret;
+ return ret;
}
-
-int
-afr_inode_split_brain_choice_set (inode_t *inode, xlator_t *this,
- int spb_choice)
+/* The caller of this should perform afr_inode_refresh, if this function
+ * returns _gf_true
+ */
+gf_boolean_t
+afr_is_inode_refresh_reqd(inode_t *inode, xlator_t *this, int event_gen1,
+ int event_gen2)
{
- int ret = -1;
+ gf_boolean_t need_refresh = _gf_false;
+ afr_inode_ctx_t *ctx = NULL;
+ int ret = -1;
+
+ GF_VALIDATE_OR_GOTO(this->name, inode, out);
- GF_VALIDATE_OR_GOTO (this->name, inode, out);
+ LOCK(&inode->lock);
+ {
+ ret = __afr_inode_ctx_get(this, inode, &ctx);
+ if (ret)
+ goto unlock;
+
+ need_refresh = ctx->need_refresh;
+ /* Hoping that the caller will do inode_refresh followed by
+ * this, hence setting the need_refresh to false */
+ ctx->need_refresh = _gf_false;
+ }
+unlock:
+ UNLOCK(&inode->lock);
- LOCK(&inode->lock);
- {
- ret = __afr_inode_split_brain_choice_set (inode, this,
- spb_choice);
- }
- UNLOCK(&inode->lock);
+ if (event_gen1 != event_gen2)
+ need_refresh = _gf_true;
out:
- return ret;
+ return need_refresh;
}
+int
+__afr_inode_need_refresh_set(inode_t *inode, xlator_t *this)
+{
+ int ret = -1;
+ afr_inode_ctx_t *ctx = NULL;
+
+ ret = __afr_inode_ctx_get(this, inode, &ctx);
+ if (ret == 0) {
+ ctx->need_refresh = _gf_true;
+ }
+
+ return ret;
+}
int
-afr_inode_read_subvol_reset (inode_t *inode, xlator_t *this)
+afr_inode_need_refresh_set(inode_t *inode, xlator_t *this)
{
- int ret = -1;
+ int ret = -1;
- GF_VALIDATE_OR_GOTO (this->name, inode, out);
+ GF_VALIDATE_OR_GOTO(this->name, inode, out);
- LOCK(&inode->lock);
- {
- ret = __afr_inode_read_subvol_reset (inode, this);
- }
- UNLOCK(&inode->lock);
+ LOCK(&inode->lock);
+ {
+ ret = __afr_inode_need_refresh_set(inode, this);
+ }
+ UNLOCK(&inode->lock);
out:
- return ret;
+ return ret;
}
int
-afr_spb_choice_timeout_cancel (xlator_t *this, inode_t *inode)
+afr_spb_choice_timeout_cancel(xlator_t *this, inode_t *inode)
{
- afr_inode_ctx_t *ctx = NULL;
- int ret = -1;
+ afr_inode_ctx_t *ctx = NULL;
+ int ret = -1;
- if (!inode)
- return ret;
+ if (!inode)
+ return ret;
- LOCK(&inode->lock);
- {
- __afr_inode_ctx_get (this, inode, &ctx);
- if (!ctx) {
- gf_msg (this->name, GF_LOG_WARNING, 0,
- AFR_MSG_SPLIT_BRAIN_CHOICE_ERROR,
- "Failed to cancel split-brain choice timer.");
- goto out;
- }
- ctx->spb_choice = -1;
- if (ctx->timer) {
- gf_timer_call_cancel (this->ctx, ctx->timer);
- ctx->timer = NULL;
- }
- ret = 0;
+ LOCK(&inode->lock);
+ {
+ ret = __afr_inode_ctx_get(this, inode, &ctx);
+ if (ret < 0 || !ctx) {
+ UNLOCK(&inode->lock);
+ gf_msg(this->name, GF_LOG_WARNING, 0,
+ AFR_MSG_SPLIT_BRAIN_CHOICE_ERROR,
+ "Failed to cancel split-brain choice timer.");
+ goto out;
+ }
+ ctx->spb_choice = -1;
+ if (ctx->timer) {
+ gf_timer_call_cancel(this->ctx, ctx->timer);
+ ctx->timer = NULL;
}
+ ret = 0;
+ }
+ UNLOCK(&inode->lock);
out:
- UNLOCK(&inode->lock);
- return ret;
+ return ret;
}
void
-afr_set_split_brain_choice_cbk (void *data)
+afr_set_split_brain_choice_cbk(void *data)
{
- inode_t *inode = data;
- xlator_t *this = THIS;
+ inode_t *inode = data;
+ xlator_t *this = THIS;
- afr_spb_choice_timeout_cancel (this, inode);
- inode_unref (inode);
- return;
+ afr_spb_choice_timeout_cancel(this, inode);
+ inode_invalidate(inode);
+ inode_unref(inode);
+ return;
}
-
int
-afr_set_split_brain_choice (int ret, call_frame_t *frame, void *opaque)
-{
- int op_errno = ENOMEM;
- afr_private_t *priv = NULL;
- afr_inode_ctx_t *ctx = NULL;
- inode_t *inode = NULL;
- loc_t *loc = NULL;
- xlator_t *this = NULL;
- afr_spbc_timeout_t *data = opaque;
- struct timespec delta = {0, };
-
- if (ret)
- goto out;
-
- frame = data->frame;
- loc = data->loc;
- this = frame->this;
- priv = this->private;
-
- delta.tv_sec = priv->spb_choice_timeout;
- delta.tv_nsec = 0;
-
- inode = loc->inode;
- if (!inode)
- goto out;
-
- if (!(data->d_spb || data->m_spb)) {
- gf_msg (this->name, GF_LOG_WARNING, 0,
- AFR_MSG_SPLIT_BRAIN_CHOICE_ERROR, "Cannot set "
- "replica.split-brain-choice on %s. File is"
- " not in data/metadata split-brain.",
- uuid_utoa (loc->gfid));
- ret = -1;
- op_errno = EINVAL;
- goto out;
+afr_set_split_brain_choice(int ret, call_frame_t *frame, void *opaque)
+{
+ int op_errno = ENOMEM;
+ afr_private_t *priv = NULL;
+ afr_inode_ctx_t *ctx = NULL;
+ inode_t *inode = NULL;
+ loc_t *loc = NULL;
+ xlator_t *this = NULL;
+ afr_spbc_timeout_t *data = opaque;
+ struct timespec delta = {
+ 0,
+ };
+ gf_boolean_t timer_set = _gf_false;
+ gf_boolean_t timer_cancelled = _gf_false;
+ gf_boolean_t timer_reset = _gf_false;
+ int old_spb_choice = -1;
+
+ frame = data->frame;
+ loc = data->loc;
+ this = frame->this;
+ priv = this->private;
+
+ if (ret) {
+ op_errno = -ret;
+ ret = -1;
+ goto out;
+ }
+
+ delta.tv_sec = priv->spb_choice_timeout;
+ delta.tv_nsec = 0;
+
+ if (!loc->inode) {
+ ret = -1;
+ op_errno = EINVAL;
+ goto out;
+ }
+
+ if (!(data->d_spb || data->m_spb)) {
+ gf_msg(this->name, GF_LOG_WARNING, 0, AFR_MSG_SPLIT_BRAIN_CHOICE_ERROR,
+ "Cannot set "
+ "replica.split-brain-choice on %s. File is"
+ " not in data/metadata split-brain.",
+ uuid_utoa(loc->gfid));
+ ret = -1;
+ op_errno = EINVAL;
+ goto out;
+ }
+
+ /*
+ * we're ref'ing the inode before LOCK like it is done elsewhere in the
+ * code. If we ref after LOCK, coverity complains of possible deadlocks.
+ */
+ inode = inode_ref(loc->inode);
+
+ LOCK(&inode->lock);
+ {
+ ret = __afr_inode_ctx_get(this, inode, &ctx);
+ if (ret) {
+ UNLOCK(&inode->lock);
+ gf_msg(this->name, GF_LOG_ERROR, 0,
+ AFR_MSG_SPLIT_BRAIN_CHOICE_ERROR,
+ "Failed to get inode_ctx for %s", loc->name);
+ goto post_unlock;
}
- LOCK(&inode->lock);
- {
- ret = __afr_inode_ctx_get (this, inode, &ctx);
- if (ret) {
- gf_msg (this->name, GF_LOG_ERROR, 0,
- AFR_MSG_SPLIT_BRAIN_CHOICE_ERROR,
- "Failed to get inode_ctx for %s", loc->name);
- goto unlock;
- }
-
- ctx->spb_choice = data->spb_child_index;
+ old_spb_choice = ctx->spb_choice;
+ ctx->spb_choice = data->spb_child_index;
- /* Possible changes in spb-choice :
- * -1 to valid : ref and inject timer
- *
- * valid to valid : cancel timer and inject new one
- *
- * valid to -1 : cancel timer and unref
- *
- * -1 to -1 : do not do anything
- */
+ /* Possible changes in spb-choice :
+ * valid to -1 : cancel timer and unref
+ * valid to valid : cancel timer and inject new one
+ * -1 to -1 : unref and do not do anything
+ * -1 to valid : inject timer
+ */
- /* ctx->timer is NULL iff previous value of
- * ctx->spb_choice is -1
- */
- if (ctx->timer) {
- if (ctx->spb_choice == -1) {
- gf_timer_call_cancel (this->ctx, ctx->timer);
- ctx->timer = NULL;
- inode_unref (inode);
- goto unlock;
- }
- goto reset_timer;
- } else {
- if (ctx->spb_choice == -1)
- goto unlock;
+ /* ctx->timer is NULL iff previous value of
+ * ctx->spb_choice is -1
+ */
+ if (ctx->timer) {
+ if (ctx->spb_choice == -1) {
+ if (!gf_timer_call_cancel(this->ctx, ctx->timer)) {
+ ctx->timer = NULL;
+ timer_cancelled = _gf_true;
}
-
- inode = inode_ref (loc->inode);
- goto set_timer;
-
-reset_timer:
- gf_timer_call_cancel (this->ctx, ctx->timer);
- ctx->timer = NULL;
-
-set_timer:
- ctx->timer = gf_timer_call_after (this->ctx, delta,
- afr_set_split_brain_choice_cbk,
- inode);
+ /* If timer cancel failed here it means that the
+ * previous cbk will be executed which will set
+ * spb_choice to -1. So we can consider the
+ * 'valid to -1' case to be a success
+ * (i.e. ret = 0) and goto unlock.
+ */
+ goto unlock;
+ }
+ goto reset_timer;
+ } else {
+ if (ctx->spb_choice == -1)
+ goto unlock;
+ goto set_timer;
}
+
+ reset_timer:
+ ret = gf_timer_call_cancel(this->ctx, ctx->timer);
+ if (ret != 0) {
+ /* We need to bail out now instead of launching a new
+ * timer. Otherwise the cbk of the previous timer event
+ * will cancel the new ctx->timer.
+ */
+ ctx->spb_choice = old_spb_choice;
+ ret = -1;
+ op_errno = EAGAIN;
+ goto unlock;
+ }
+ ctx->timer = NULL;
+ timer_reset = _gf_true;
+
+ set_timer:
+ ctx->timer = gf_timer_call_after(this->ctx, delta,
+ afr_set_split_brain_choice_cbk, inode);
+ if (!ctx->timer) {
+ ctx->spb_choice = old_spb_choice;
+ ret = -1;
+ op_errno = ENOMEM;
+ }
+ if (!timer_reset && ctx->timer)
+ timer_set = _gf_true;
+ if (timer_reset && !ctx->timer)
+ timer_cancelled = _gf_true;
+ }
unlock:
- UNLOCK(&inode->lock);
- inode_invalidate (inode);
+ UNLOCK(&inode->lock);
+post_unlock:
+ if (!timer_set)
+ inode_unref(inode);
+ if (timer_cancelled)
+ inode_unref(inode);
+ /*
+ * We need to invalidate the inode to prevent the kernel from serving
+ * reads from an older cached value despite a change in spb_choice to
+ * a new value.
+ */
+ inode_invalidate(inode);
out:
- if (data)
- GF_FREE (data);
- AFR_STACK_UNWIND (setxattr, frame, ret, op_errno, NULL);
- return 0;
+ GF_FREE(data);
+ AFR_STACK_UNWIND(setxattr, frame, ret, op_errno, NULL);
+ return 0;
}
int
-afr_accused_fill (xlator_t *this, dict_t *xdata, unsigned char *accused,
- afr_transaction_type type)
+afr_accused_fill(xlator_t *this, dict_t *xdata, unsigned char *accused,
+ afr_transaction_type type)
{
- afr_private_t *priv = NULL;
- int i = 0;
- int idx = afr_index_for_transaction_type (type);
- void *pending_raw = NULL;
- int pending[3];
- int ret = 0;
+ afr_private_t *priv = NULL;
+ int i = 0;
+ int idx = afr_index_for_transaction_type(type);
+ void *pending_raw = NULL;
+ int pending[3];
+ int ret = 0;
- priv = this->private;
+ priv = this->private;
- for (i = 0; i < priv->child_count; i++) {
- ret = dict_get_ptr (xdata, priv->pending_key[i],
- &pending_raw);
- if (ret) /* no pending flags */
- continue;
- memcpy (pending, pending_raw, sizeof(pending));
+ for (i = 0; i < priv->child_count; i++) {
+ ret = dict_get_ptr(xdata, priv->pending_key[i], &pending_raw);
+ if (ret) /* no pending flags */
+ continue;
+ memcpy(pending, pending_raw, sizeof(pending));
- if (ntoh32 (pending[idx]))
- accused[i] = 1;
- }
+ if (ntoh32(pending[idx]))
+ accused[i] = 1;
+ }
- return 0;
+ return 0;
}
-
int
-afr_accuse_smallfiles (xlator_t *this, struct afr_reply *replies,
- unsigned char *data_accused)
+afr_accuse_smallfiles(xlator_t *this, struct afr_reply *replies,
+ unsigned char *data_accused)
{
- int i = 0;
- afr_private_t *priv = NULL;
- uint64_t maxsize = 0;
+ int i = 0;
+ afr_private_t *priv = NULL;
+ uint64_t maxsize = 0;
- priv = this->private;
+ priv = this->private;
- for (i = 0; i < priv->child_count; i++) {
- if (replies[i].valid && replies[i].xdata &&
- dict_get (replies[i].xdata, GLUSTERFS_BAD_INODE))
- continue;
- if (data_accused[i])
- continue;
- if (replies[i].poststat.ia_size > maxsize)
- maxsize = replies[i].poststat.ia_size;
- }
+ for (i = 0; i < priv->child_count; i++) {
+ if (replies[i].valid && replies[i].xdata &&
+ dict_get_sizen(replies[i].xdata, GLUSTERFS_BAD_INODE))
+ continue;
+ if (data_accused[i])
+ continue;
+ if (replies[i].poststat.ia_size > maxsize)
+ maxsize = replies[i].poststat.ia_size;
+ }
- for (i = 0; i < priv->child_count; i++) {
- if (data_accused[i])
- continue;
- if (AFR_IS_ARBITER_BRICK(priv, i))
- continue;
- if (replies[i].poststat.ia_size < maxsize)
- data_accused[i] = 1;
- }
+ for (i = 0; i < priv->child_count; i++) {
+ if (data_accused[i])
+ continue;
+ if (AFR_IS_ARBITER_BRICK(priv, i))
+ continue;
+ if (replies[i].poststat.ia_size < maxsize)
+ data_accused[i] = 1;
+ }
- return 0;
+ return 0;
}
-
int
-afr_replies_interpret (call_frame_t *frame, xlator_t *this, inode_t *inode)
-{
- afr_local_t *local = NULL;
- afr_private_t *priv = NULL;
- struct afr_reply *replies = NULL;
- int event_generation = 0;
- int i = 0;
- unsigned char *data_accused = NULL;
- unsigned char *metadata_accused = NULL;
- unsigned char *data_readable = NULL;
- unsigned char *metadata_readable = NULL;
- int ret = 0;
-
- local = frame->local;
- priv = this->private;
- replies = local->replies;
- event_generation = local->event_generation;
-
- data_accused = alloca0 (priv->child_count);
- data_readable = alloca0 (priv->child_count);
- metadata_accused = alloca0 (priv->child_count);
- metadata_readable = alloca0 (priv->child_count);
-
- for (i = 0; i < priv->child_count; i++) {
- data_readable[i] = 1;
- metadata_readable[i] = 1;
- }
-
- for (i = 0; i < priv->child_count; i++) {
- if (!replies[i].valid) {
- data_readable[i] = 0;
- metadata_readable[i] = 0;
- continue;
- }
-
- if (replies[i].op_ret == -1) {
- data_readable[i] = 0;
- metadata_readable[i] = 0;
- continue;
- }
-
- if (replies[i].xdata &&
- dict_get (replies[i].xdata, GLUSTERFS_BAD_INODE)) {
- data_readable[i] = 0;
- metadata_readable[i] = 0;
- continue;
- }
-
- afr_accused_fill (this, replies[i].xdata, data_accused,
- (replies[i].poststat.ia_type == IA_IFDIR) ?
- AFR_ENTRY_TRANSACTION : AFR_DATA_TRANSACTION);
-
- afr_accused_fill (this, replies[i].xdata,
- metadata_accused, AFR_METADATA_TRANSACTION);
-
- }
-
- if (inode->ia_type != IA_IFDIR)
- afr_accuse_smallfiles (this, replies, data_accused);
-
- for (i = 0; i < priv->child_count; i++) {
- if (data_accused[i]) {
- data_readable[i] = 0;
- ret = 1;
- }
- if (metadata_accused[i]) {
- metadata_readable[i] = 0;
- ret = 1;
- }
- }
-
- afr_inode_read_subvol_set (inode, this, data_readable,
- metadata_readable, event_generation);
- return ret;
+afr_readables_fill(call_frame_t *frame, xlator_t *this, inode_t *inode,
+ unsigned char *data_accused, unsigned char *metadata_accused,
+ unsigned char *data_readable,
+ unsigned char *metadata_readable, struct afr_reply *replies)
+{
+ afr_local_t *local = NULL;
+ afr_private_t *priv = NULL;
+ dict_t *xdata = NULL;
+ int i = 0;
+ int ret = 0;
+ ia_type_t ia_type = IA_INVAL;
+
+ local = frame->local;
+ priv = this->private;
+
+ for (i = 0; i < priv->child_count; i++) {
+ data_readable[i] = 1;
+ metadata_readable[i] = 1;
+ }
+ if (AFR_IS_ARBITER_BRICK(priv, ARBITER_BRICK_INDEX)) {
+ data_readable[ARBITER_BRICK_INDEX] = 0;
+ metadata_readable[ARBITER_BRICK_INDEX] = 0;
+ }
+
+ for (i = 0; i < priv->child_count; i++) {
+ if (replies) { /* Lookup */
+ if (!replies[i].valid || replies[i].op_ret == -1 ||
+ (replies[i].xdata &&
+ dict_get_sizen(replies[i].xdata, GLUSTERFS_BAD_INODE))) {
+ data_readable[i] = 0;
+ metadata_readable[i] = 0;
+ continue;
+ }
+
+ xdata = replies[i].xdata;
+ ia_type = replies[i].poststat.ia_type;
+ } else { /* pre-op xattrop */
+ xdata = local->transaction.changelog_xdata[i];
+ ia_type = inode->ia_type;
+ }
+
+ if (!xdata)
+ continue; /* mkdir_cbk sends NULL xdata_rsp. */
+ afr_accused_fill(this, xdata, data_accused,
+ (ia_type == IA_IFDIR) ? AFR_ENTRY_TRANSACTION
+ : AFR_DATA_TRANSACTION);
+
+ afr_accused_fill(this, xdata, metadata_accused,
+ AFR_METADATA_TRANSACTION);
+ }
+
+ if (replies && ia_type != IA_INVAL && ia_type != IA_IFDIR &&
+ /* We want to accuse small files only when we know for
+ * sure that there is no IO happening. Otherwise, the
+ * ia_sizes obtained in post-refresh replies may
+ * mismatch due to a race between inode-refresh and
+ * ongoing writes, causing spurious heal launches*/
+ !afr_is_possibly_under_txn(AFR_DATA_TRANSACTION, local, this)) {
+ afr_accuse_smallfiles(this, replies, data_accused);
+ }
+
+ for (i = 0; i < priv->child_count; i++) {
+ if (data_accused[i]) {
+ data_readable[i] = 0;
+ ret = 1;
+ }
+ if (metadata_accused[i]) {
+ metadata_readable[i] = 0;
+ ret = 1;
+ }
+ }
+ return ret;
}
-
-
int
-afr_refresh_selfheal_done (int ret, call_frame_t *heal, void *opaque)
-{
- if (heal)
- STACK_DESTROY (heal->root);
- return 0;
+afr_replies_interpret(call_frame_t *frame, xlator_t *this, inode_t *inode,
+ gf_boolean_t *start_heal)
+{
+ afr_local_t *local = NULL;
+ afr_private_t *priv = NULL;
+ struct afr_reply *replies = NULL;
+ int event_generation = 0;
+ int i = 0;
+ unsigned char *data_accused = NULL;
+ unsigned char *metadata_accused = NULL;
+ unsigned char *data_readable = NULL;
+ unsigned char *metadata_readable = NULL;
+ int ret = 0;
+
+ local = frame->local;
+ priv = this->private;
+ replies = local->replies;
+ event_generation = local->event_generation;
+
+ data_accused = alloca0(priv->child_count);
+ data_readable = alloca0(priv->child_count);
+ metadata_accused = alloca0(priv->child_count);
+ metadata_readable = alloca0(priv->child_count);
+
+ ret = afr_readables_fill(frame, this, inode, data_accused, metadata_accused,
+ data_readable, metadata_readable, replies);
+
+ for (i = 0; i < priv->child_count; i++) {
+ if (start_heal && priv->child_up[i] &&
+ (data_accused[i] || metadata_accused[i])) {
+ *start_heal = _gf_true;
+ break;
+ }
+ }
+ afr_inode_read_subvol_set(inode, this, data_readable, metadata_readable,
+ event_generation);
+ return ret;
}
int
-afr_inode_refresh_err (call_frame_t *frame, xlator_t *this)
+afr_refresh_selfheal_done(int ret, call_frame_t *heal, void *opaque)
{
- afr_local_t *local = NULL;
- afr_private_t *priv = NULL;
- int i = 0;
- int err = 0;
-
- local = frame->local;
- priv = this->private;
-
- for (i = 0; i < priv->child_count; i++) {
- if (local->replies[i].valid && !local->replies[i].op_ret) {
- err = 0;
- goto ret;
- }
- }
-
- err = afr_final_errno (local, priv);
-ret:
- return -err;
+ if (heal)
+ AFR_STACK_DESTROY(heal);
+ return 0;
}
-
int
-afr_refresh_selfheal_wrap (void *opaque)
+afr_inode_refresh_err(call_frame_t *frame, xlator_t *this)
{
- call_frame_t *frame = opaque;
- afr_local_t *local = NULL;
- xlator_t *this = NULL;
- int err = 0;
-
- local = frame->local;
- this = frame->this;
-
- afr_selfheal (frame->this, local->refreshinode->gfid);
+ afr_local_t *local = NULL;
+ afr_private_t *priv = NULL;
+ int i = 0;
+ int err = 0;
- afr_selfheal_unlocked_discover (frame, local->refreshinode,
- local->refreshinode->gfid,
- local->replies);
+ local = frame->local;
+ priv = this->private;
- afr_replies_interpret (frame, this, local->refreshinode);
-
- err = afr_inode_refresh_err (frame, this);
-
- afr_local_replies_wipe (local, this->private);
-
- local->refreshfn (frame, this, err);
+ for (i = 0; i < priv->child_count; i++) {
+ if (local->replies[i].valid && !local->replies[i].op_ret) {
+ err = 0;
+ goto ret;
+ }
+ }
- return 0;
+ err = afr_final_errno(local, priv);
+ret:
+ return err;
}
-
gf_boolean_t
-afr_selfheal_enabled (xlator_t *this)
+afr_selfheal_enabled(const xlator_t *this)
{
- afr_private_t *priv = NULL;
- gf_boolean_t data = _gf_false;
- int ret = 0;
+ const afr_private_t *priv = this->private;
- priv = this->private;
-
- ret = gf_string2boolean (priv->data_self_heal, &data);
- GF_ASSERT (!ret);
-
- return data || priv->metadata_self_heal || priv->entry_self_heal;
+ return priv->data_self_heal || priv->metadata_self_heal ||
+ priv->entry_self_heal;
}
-
int
-afr_inode_refresh_done (call_frame_t *frame, xlator_t *this)
-{
- call_frame_t *heal = NULL;
- afr_local_t *local = NULL;
- int ret = 0;
- int err = 0;
-
- local = frame->local;
-
- ret = afr_replies_interpret (frame, this, local->refreshinode);
-
- err = afr_inode_refresh_err (frame, this);
+afr_txn_refresh_done(call_frame_t *frame, xlator_t *this, int err)
+{
+ call_frame_t *heal_frame = NULL;
+ afr_local_t *heal_local = NULL;
+ afr_local_t *local = NULL;
+ afr_private_t *priv = NULL;
+ inode_t *inode = NULL;
+ int event_generation = 0;
+ int read_subvol = -1;
+ int ret = 0;
+
+ local = frame->local;
+ inode = local->inode;
+ priv = this->private;
+
+ if (err)
+ goto refresh_done;
+
+ if (local->op == GF_FOP_LOOKUP)
+ goto refresh_done;
+
+ ret = afr_inode_get_readable(frame, inode, this, local->readable,
+ &event_generation, local->transaction.type);
+
+ if (ret == -EIO) {
+ /* No readable subvolume even after refresh ==> splitbrain.*/
+ if (!priv->fav_child_policy) {
+ err = EIO;
+ goto refresh_done;
+ }
+ read_subvol = afr_sh_get_fav_by_policy(this, local->replies, inode,
+ NULL);
+ if (read_subvol == -1) {
+ err = EIO;
+ goto refresh_done;
+ }
+
+ heal_frame = afr_frame_create(this, NULL);
+ if (!heal_frame) {
+ err = EIO;
+ goto refresh_done;
+ }
+ heal_local = heal_frame->local;
+ heal_local->xdata_req = dict_new();
+ if (!heal_local->xdata_req) {
+ err = EIO;
+ AFR_STACK_DESTROY(heal_frame);
+ goto refresh_done;
+ }
+ heal_local->heal_frame = frame;
+ ret = synctask_new(this->ctx->env, afr_fav_child_reset_sink_xattrs,
+ afr_fav_child_reset_sink_xattrs_cbk, heal_frame,
+ heal_frame);
+ return 0;
+ }
- afr_local_replies_wipe (local, this->private);
+refresh_done:
+ afr_local_replies_wipe(local, this->private);
+ local->refreshfn(frame, this, err);
- if (ret && afr_selfheal_enabled (this)) {
- heal = copy_frame (frame);
- if (heal)
- heal->root->pid = GF_CLIENT_PID_SELF_HEALD;
- ret = synctask_new (this->ctx->env, afr_refresh_selfheal_wrap,
- afr_refresh_selfheal_done, heal, frame);
- if (ret)
- goto refresh_done;
- } else {
- refresh_done:
- local->refreshfn (frame, this, err);
- }
+ return 0;
+}
- return 0;
+int
+afr_inode_refresh_done(call_frame_t *frame, xlator_t *this, int error)
+{
+ call_frame_t *heal_frame = NULL;
+ afr_local_t *local = NULL;
+ afr_private_t *priv = NULL;
+ gf_boolean_t start_heal = _gf_false;
+ afr_local_t *heal_local = NULL;
+ unsigned char *success_replies = NULL;
+ int ret = 0;
+
+ if (error != 0) {
+ goto refresh_done;
+ }
+
+ local = frame->local;
+ priv = this->private;
+ success_replies = alloca0(priv->child_count);
+ afr_fill_success_replies(local, priv, success_replies);
+
+ if (priv->thin_arbiter_count && local->is_read_txn &&
+ AFR_COUNT(success_replies, priv->child_count) != priv->child_count) {
+ /* We need to query the good bricks and/or thin-arbiter.*/
+ if (success_replies[0]) {
+ local->read_txn_query_child = AFR_CHILD_ZERO;
+ } else if (success_replies[1]) {
+ local->read_txn_query_child = AFR_CHILD_ONE;
+ }
+ error = EINVAL;
+ goto refresh_done;
+ }
+
+ if (!afr_has_quorum(success_replies, this, frame)) {
+ error = afr_final_errno(frame->local, this->private);
+ if (!error)
+ error = afr_quorum_errno(priv);
+ goto refresh_done;
+ }
+
+ ret = afr_replies_interpret(frame, this, local->refreshinode, &start_heal);
+
+ if (ret && afr_selfheal_enabled(this) && start_heal) {
+ heal_frame = afr_frame_create(this, NULL);
+ if (!heal_frame)
+ goto refresh_done;
+ heal_local = heal_frame->local;
+ heal_local->refreshinode = inode_ref(local->refreshinode);
+ heal_local->heal_frame = heal_frame;
+ if (!afr_throttled_selfheal(heal_frame, this)) {
+ AFR_STACK_DESTROY(heal_frame);
+ goto refresh_done;
+ }
+ }
+
+refresh_done:
+ afr_txn_refresh_done(frame, this, error);
+
+ return 0;
}
void
-afr_inode_refresh_subvol_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
- int op_ret, int op_errno, struct iatt *buf,
- dict_t *xdata, struct iatt *par)
-{
- afr_local_t *local = NULL;
- int call_child = (long) cookie;
- int8_t need_heal = 1;
- int call_count = 0;
- GF_UNUSED int ret = 0;
+afr_inode_refresh_subvol_cbk(call_frame_t *frame, void *cookie, xlator_t *this,
+ int op_ret, int op_errno, struct iatt *buf,
+ dict_t *xdata, struct iatt *par)
+{
+ afr_local_t *local = NULL;
+ int call_child = (long)cookie;
+ int8_t need_heal = 1;
+ int call_count = 0;
+ int ret = 0;
+
+ local = frame->local;
+ local->replies[call_child].valid = 1;
+ local->replies[call_child].op_ret = op_ret;
+ local->replies[call_child].op_errno = op_errno;
+ if (op_ret != -1) {
+ local->replies[call_child].poststat = *buf;
+ if (par)
+ local->replies[call_child].postparent = *par;
+ if (xdata)
+ local->replies[call_child].xdata = dict_ref(xdata);
+ }
- local = frame->local;
- local->replies[call_child].valid = 1;
- local->replies[call_child].op_ret = op_ret;
- local->replies[call_child].op_errno = op_errno;
- if (op_ret != -1) {
- local->replies[call_child].poststat = *buf;
- if (par)
- local->replies[call_child].postparent = *par;
- if (xdata)
- local->replies[call_child].xdata = dict_ref (xdata);
- }
- if (xdata) {
- ret = dict_get_int8 (xdata, "link-count", &need_heal);
- local->replies[call_child].need_heal = need_heal;
- } else {
- local->replies[call_child].need_heal = need_heal;
+ if (xdata) {
+ ret = dict_get_int8(xdata, "link-count", &need_heal);
+ if (ret) {
+ gf_msg_debug(this->name, -ret, "Unable to get link count");
}
+ }
- call_count = afr_frame_return (frame);
- if (call_count == 0) {
- afr_set_need_heal (this, local);
- afr_inode_refresh_done (frame, this);
+ local->replies[call_child].need_heal = need_heal;
+ call_count = afr_frame_return(frame);
+ if (call_count == 0) {
+ afr_set_need_heal(this, local);
+ ret = afr_inode_refresh_err(frame, this);
+ if (ret) {
+ gf_msg_debug(this->name, ret, "afr_inode_refresh_err failed");
}
-
+ afr_inode_refresh_done(frame, this, ret);
+ }
}
int
-afr_inode_refresh_subvol_with_lookup_cbk (call_frame_t *frame, void *cookie,
- xlator_t *this, int op_ret,
- int op_errno, inode_t *inode,
- struct iatt *buf, dict_t *xdata,
- struct iatt *par)
+afr_inode_refresh_subvol_with_lookup_cbk(call_frame_t *frame, void *cookie,
+ xlator_t *this, int op_ret,
+ int op_errno, inode_t *inode,
+ struct iatt *buf, dict_t *xdata,
+ struct iatt *par)
{
- afr_inode_refresh_subvol_cbk (frame, cookie, this, op_ret, op_errno,
- buf, xdata, par);
- return 0;
+ afr_inode_refresh_subvol_cbk(frame, cookie, this, op_ret, op_errno, buf,
+ xdata, par);
+ return 0;
}
-
int
-afr_inode_refresh_subvol_with_lookup (call_frame_t *frame, xlator_t *this,
- int i, inode_t *inode, dict_t *xdata)
+afr_inode_refresh_subvol_with_lookup(call_frame_t *frame, xlator_t *this, int i,
+ inode_t *inode, uuid_t gfid, dict_t *xdata)
{
- loc_t loc = {0, };
- afr_private_t *priv = NULL;
+ loc_t loc = {
+ 0,
+ };
+ afr_private_t *priv = NULL;
- priv = this->private;
+ priv = this->private;
- loc.inode = inode;
- gf_uuid_copy (loc.gfid, inode->gfid);
+ loc.inode = inode;
+ if (gf_uuid_is_null(inode->gfid) && gfid) {
+ /* To handle setattr/setxattr on yet to be linked inode from
+ * dht */
+ gf_uuid_copy(loc.gfid, gfid);
+ } else {
+ gf_uuid_copy(loc.gfid, inode->gfid);
+ }
- STACK_WIND_COOKIE (frame, afr_inode_refresh_subvol_with_lookup_cbk,
- (void *) (long) i, priv->children[i],
- priv->children[i]->fops->lookup, &loc, xdata);
- return 0;
+ STACK_WIND_COOKIE(frame, afr_inode_refresh_subvol_with_lookup_cbk,
+ (void *)(long)i, priv->children[i],
+ priv->children[i]->fops->lookup, &loc, xdata);
+ return 0;
}
int
-afr_inode_refresh_subvol_with_fstat_cbk (call_frame_t *frame,
- void *cookie, xlator_t *this,
- int32_t op_ret, int32_t op_errno,
- struct iatt *buf, dict_t *xdata)
+afr_inode_refresh_subvol_with_fstat_cbk(call_frame_t *frame, void *cookie,
+ xlator_t *this, int32_t op_ret,
+ int32_t op_errno, struct iatt *buf,
+ dict_t *xdata)
{
- afr_inode_refresh_subvol_cbk (frame, cookie, this, op_ret, op_errno,
- buf, xdata, NULL);
- return 0;
+ afr_inode_refresh_subvol_cbk(frame, cookie, this, op_ret, op_errno, buf,
+ xdata, NULL);
+ return 0;
}
int
-afr_inode_refresh_subvol_with_fstat (call_frame_t *frame, xlator_t *this, int i,
- dict_t *xdata)
+afr_inode_refresh_subvol_with_fstat(call_frame_t *frame, xlator_t *this, int i,
+ dict_t *xdata)
{
- afr_private_t *priv = NULL;
- afr_local_t *local = NULL;
+ afr_private_t *priv = NULL;
+ afr_local_t *local = NULL;
- priv = this->private;
- local = frame->local;
+ priv = this->private;
+ local = frame->local;
- STACK_WIND_COOKIE (frame, afr_inode_refresh_subvol_with_fstat_cbk,
- (void *) (long) i, priv->children[i],
- priv->children[i]->fops->fstat, local->fd, xdata);
- return 0;
+ STACK_WIND_COOKIE(frame, afr_inode_refresh_subvol_with_fstat_cbk,
+ (void *)(long)i, priv->children[i],
+ priv->children[i]->fops->fstat, local->fd, xdata);
+ return 0;
}
int
-afr_inode_refresh_do (call_frame_t *frame, xlator_t *this)
-{
- afr_local_t *local = NULL;
- afr_private_t *priv = NULL;
- int call_count = 0;
- int i = 0;
- int ret = 0;
- dict_t *xdata = NULL;
- afr_fd_ctx_t *fd_ctx = NULL;
- unsigned char *wind_subvols = NULL;
-
- priv = this->private;
- local = frame->local;
- wind_subvols = alloca0 (priv->child_count);
-
- afr_local_replies_wipe (local, priv);
-
- if (local->fd) {
- fd_ctx = afr_fd_ctx_get (local->fd, this);
- if (!fd_ctx) {
- afr_inode_refresh_done (frame, this);
- return 0;
- }
- }
+afr_inode_refresh_do(call_frame_t *frame, xlator_t *this)
+{
+ afr_local_t *local = NULL;
+ afr_private_t *priv = NULL;
+ int call_count = 0;
+ int i = 0;
+ int ret = 0;
+ dict_t *xdata = NULL;
+ afr_fd_ctx_t *fd_ctx = NULL;
+ unsigned char *wind_subvols = NULL;
- xdata = dict_new ();
- if (!xdata) {
- afr_inode_refresh_done (frame, this);
- return 0;
- }
+ priv = this->private;
+ local = frame->local;
+ wind_subvols = alloca0(priv->child_count);
- if (afr_xattr_req_prepare (this, xdata) != 0) {
- dict_unref (xdata);
- afr_inode_refresh_done (frame, this);
- return 0;
- }
+ afr_local_replies_wipe(local, priv);
- ret = dict_set_str (xdata, "link-count", GF_XATTROP_INDEX_COUNT);
- if (ret) {
- gf_msg_debug (this->name, -ret,
- "Unable to set link-count in dict ");
+ if (local->fd) {
+ fd_ctx = afr_fd_ctx_get(local->fd, this);
+ if (!fd_ctx) {
+ afr_inode_refresh_done(frame, this, EINVAL);
+ return 0;
}
+ }
- if (local->fd) {
- for (i = 0; i < priv->child_count; i++) {
- if (local->child_up[i] &&
- fd_ctx->opened_on[i] == AFR_FD_OPENED)
- wind_subvols[i] = 1;
- }
- } else {
- memcpy (wind_subvols, local->child_up,
- sizeof (*local->child_up) * priv->child_count);
- }
+ xdata = dict_new();
+ if (!xdata) {
+ afr_inode_refresh_done(frame, this, ENOMEM);
+ return 0;
+ }
- local->call_count = AFR_COUNT (wind_subvols, priv->child_count);
+ ret = afr_xattr_req_prepare(this, xdata);
+ if (ret != 0) {
+ dict_unref(xdata);
+ afr_inode_refresh_done(frame, this, -ret);
+ return 0;
+ }
- call_count = local->call_count;
- if (!call_count) {
- dict_unref (xdata);
- afr_inode_refresh_done (frame, this);
- return 0;
- }
- for (i = 0; i < priv->child_count; i++) {
- if (!wind_subvols[i])
- continue;
+ ret = dict_set_sizen_str_sizen(xdata, "link-count", GF_XATTROP_INDEX_COUNT);
+ if (ret) {
+ gf_msg_debug(this->name, -ret, "Unable to set link-count in dict ");
+ }
- if (local->fd)
- afr_inode_refresh_subvol_with_fstat (frame, this, i,
- xdata);
- else
- afr_inode_refresh_subvol_with_lookup (frame, this, i,
- local->refreshinode, xdata);
+ ret = dict_set_str_sizen(xdata, GLUSTERFS_INODELK_DOM_COUNT, this->name);
+ if (ret) {
+ gf_msg_debug(this->name, -ret,
+ "Unable to set inodelk-dom-count in dict ");
+ }
- if (!--call_count)
- break;
- }
+ if (local->fd) {
+ for (i = 0; i < priv->child_count; i++) {
+ if (local->child_up[i] && fd_ctx->opened_on[i] == AFR_FD_OPENED)
+ wind_subvols[i] = 1;
+ }
+ } else {
+ memcpy(wind_subvols, local->child_up,
+ sizeof(*local->child_up) * priv->child_count);
+ }
+
+ local->call_count = AFR_COUNT(wind_subvols, priv->child_count);
+
+ call_count = local->call_count;
+ if (!call_count) {
+ dict_unref(xdata);
+ if (local->fd && AFR_COUNT(local->child_up, priv->child_count))
+ afr_inode_refresh_done(frame, this, EBADFD);
+ else
+ afr_inode_refresh_done(frame, this, ENOTCONN);
+ return 0;
+ }
+ for (i = 0; i < priv->child_count; i++) {
+ if (!wind_subvols[i])
+ continue;
- dict_unref (xdata);
+ if (local->fd)
+ afr_inode_refresh_subvol_with_fstat(frame, this, i, xdata);
+ else
+ afr_inode_refresh_subvol_with_lookup(
+ frame, this, i, local->refreshinode, local->refreshgfid, xdata);
- return 0;
-}
+ if (!--call_count)
+ break;
+ }
+
+ dict_unref(xdata);
+ return 0;
+}
int
-afr_inode_refresh (call_frame_t *frame, xlator_t *this, inode_t *inode,
- afr_inode_refresh_cbk_t refreshfn)
+afr_inode_refresh(call_frame_t *frame, xlator_t *this, inode_t *inode,
+ uuid_t gfid, afr_inode_refresh_cbk_t refreshfn)
{
- afr_local_t *local = NULL;
+ afr_local_t *local = NULL;
- local = frame->local;
+ local = frame->local;
- local->refreshfn = refreshfn;
+ local->refreshfn = refreshfn;
- if (local->refreshinode) {
- inode_unref (local->refreshinode);
- local->refreshinode = NULL;
- }
+ if (local->refreshinode) {
+ inode_unref(local->refreshinode);
+ local->refreshinode = NULL;
+ }
- local->refreshinode = inode_ref (inode);
+ local->refreshinode = inode_ref(inode);
- afr_inode_refresh_do (frame, this);
+ if (gfid)
+ gf_uuid_copy(local->refreshgfid, gfid);
+ else
+ gf_uuid_clear(local->refreshgfid);
- return 0;
-}
+ afr_inode_refresh_do(frame, this);
+ return 0;
+}
int
-afr_xattr_req_prepare (xlator_t *this, dict_t *xattr_req)
+afr_xattr_req_prepare(xlator_t *this, dict_t *xattr_req)
{
- int i = 0;
- afr_private_t *priv = NULL;
- int ret = 0;
+ int i = 0;
+ afr_private_t *priv = NULL;
+ int ret = 0;
- priv = this->private;
+ priv = this->private;
- for (i = 0; i < priv->child_count; i++) {
- ret = dict_set_uint64 (xattr_req, priv->pending_key[i],
- AFR_NUM_CHANGE_LOGS * sizeof(int));
- if (ret < 0)
- gf_msg (this->name, GF_LOG_WARNING,
- -ret, AFR_MSG_DICT_SET_FAILED,
- "Unable to set dict value for %s",
- priv->pending_key[i]);
- /* 3 = data+metadata+entry */
- }
- ret = dict_set_uint64 (xattr_req, AFR_DIRTY,
- AFR_NUM_CHANGE_LOGS * sizeof(int));
- if (ret) {
- gf_msg_debug (this->name, -ret, "failed to set dirty "
- "query flag");
- }
-
- ret = dict_set_int32 (xattr_req, "list-xattr", 1);
- if (ret) {
- gf_msg_debug (this->name, -ret,
- "Unable to set list-xattr in dict ");
- }
+ for (i = 0; i < priv->child_count; i++) {
+ ret = dict_set_uint64(xattr_req, priv->pending_key[i],
+ AFR_NUM_CHANGE_LOGS * sizeof(int));
+ if (ret < 0)
+ gf_msg(this->name, GF_LOG_WARNING, -ret, AFR_MSG_DICT_SET_FAILED,
+ "Unable to set dict value for %s", priv->pending_key[i]);
+ /* 3 = data+metadata+entry */
+ }
+ ret = dict_set_uint64(xattr_req, AFR_DIRTY,
+ AFR_NUM_CHANGE_LOGS * sizeof(int));
+ if (ret) {
+ gf_msg_debug(this->name, -ret,
+ "failed to set dirty "
+ "query flag");
+ }
+
+ ret = dict_set_int32_sizen(xattr_req, "list-xattr", 1);
+ if (ret) {
+ gf_msg_debug(this->name, -ret, "Unable to set list-xattr in dict ");
+ }
+
+ return ret;
+}
- return ret;
+int
+afr_lookup_xattr_req_prepare(afr_local_t *local, xlator_t *this,
+ dict_t *xattr_req, loc_t *loc)
+{
+ int ret = -ENOMEM;
+
+ if (!local->xattr_req)
+ local->xattr_req = dict_new();
+
+ if (!local->xattr_req)
+ goto out;
+
+ if (xattr_req && (xattr_req != local->xattr_req))
+ dict_copy(xattr_req, local->xattr_req);
+
+ ret = afr_xattr_req_prepare(this, local->xattr_req);
+
+ ret = dict_set_uint64(local->xattr_req, GLUSTERFS_INODELK_COUNT, 0);
+ if (ret < 0) {
+ gf_msg(this->name, GF_LOG_WARNING, -ret, AFR_MSG_DICT_SET_FAILED,
+ "%s: Unable to set dict value for %s", loc->path,
+ GLUSTERFS_INODELK_COUNT);
+ }
+ ret = dict_set_uint64(local->xattr_req, GLUSTERFS_ENTRYLK_COUNT, 0);
+ if (ret < 0) {
+ gf_msg(this->name, GF_LOG_WARNING, -ret, AFR_MSG_DICT_SET_FAILED,
+ "%s: Unable to set dict value for %s", loc->path,
+ GLUSTERFS_ENTRYLK_COUNT);
+ }
+
+ ret = dict_set_uint32(local->xattr_req, GLUSTERFS_PARENT_ENTRYLK, 0);
+ if (ret < 0) {
+ gf_msg(this->name, GF_LOG_WARNING, -ret, AFR_MSG_DICT_SET_FAILED,
+ "%s: Unable to set dict value for %s", loc->path,
+ GLUSTERFS_PARENT_ENTRYLK);
+ }
+
+ ret = dict_set_sizen_str_sizen(local->xattr_req, "link-count",
+ GF_XATTROP_INDEX_COUNT);
+ if (ret) {
+ gf_msg_debug(this->name, -ret, "Unable to set link-count in dict ");
+ }
+
+ ret = 0;
+out:
+ return ret;
}
int
-afr_lookup_xattr_req_prepare (afr_local_t *local, xlator_t *this,
- dict_t *xattr_req, loc_t *loc)
+afr_least_pending_reads_child(afr_private_t *priv, unsigned char *readable)
{
- int ret = -ENOMEM;
-
- if (!local->xattr_req)
- local->xattr_req = dict_new ();
+ int i = 0;
+ int child = -1;
+ int64_t read_iter = -1;
+ int64_t pending_read = -1;
- if (!local->xattr_req)
- goto out;
-
- if (xattr_req && (xattr_req != local->xattr_req))
- dict_copy (xattr_req, local->xattr_req);
+ for (i = 0; i < priv->child_count; i++) {
+ if (AFR_IS_ARBITER_BRICK(priv, i) || !readable[i])
+ continue;
+ read_iter = GF_ATOMIC_GET(priv->pending_reads[i]);
+ if (child == -1 || read_iter < pending_read) {
+ pending_read = read_iter;
+ child = i;
+ }
+ }
- ret = afr_xattr_req_prepare (this, local->xattr_req);
+ return child;
+}
- ret = dict_set_uint64 (local->xattr_req, GLUSTERFS_INODELK_COUNT, 0);
- if (ret < 0) {
- gf_msg (this->name, GF_LOG_WARNING,
- -ret, AFR_MSG_DICT_SET_FAILED,
- "%s: Unable to set dict value for %s",
- loc->path, GLUSTERFS_INODELK_COUNT);
- }
- ret = dict_set_uint64 (local->xattr_req, GLUSTERFS_ENTRYLK_COUNT, 0);
- if (ret < 0) {
- gf_msg (this->name, GF_LOG_WARNING,
- -ret, AFR_MSG_DICT_SET_FAILED,
- "%s: Unable to set dict value for %s",
- loc->path, GLUSTERFS_ENTRYLK_COUNT);
- }
+static int32_t
+afr_least_latency_child(afr_private_t *priv, unsigned char *readable)
+{
+ int32_t i = 0;
+ int child = -1;
- ret = dict_set_uint32 (local->xattr_req, GLUSTERFS_PARENT_ENTRYLK, 0);
- if (ret < 0) {
- gf_msg (this->name, GF_LOG_WARNING,
- -ret, AFR_MSG_DICT_SET_FAILED,
- "%s: Unable to set dict value for %s",
- loc->path, GLUSTERFS_PARENT_ENTRYLK);
- }
+ for (i = 0; i < priv->child_count; i++) {
+ if (AFR_IS_ARBITER_BRICK(priv, i) || !readable[i] ||
+ priv->child_latency[i] < 0)
+ continue;
- ret = dict_set_str (xattr_req, "link-count", GF_XATTROP_INDEX_COUNT);
- if (ret) {
- gf_msg_debug (this->name, -ret,
- "Unable to set link-count in dict ");
+ if (child == -1 ||
+ priv->child_latency[i] < priv->child_latency[child]) {
+ child = i;
}
-
- ret = 0;
-out:
- return ret;
+ }
+ return child;
}
-
-int
-afr_hash_child (afr_read_subvol_args_t *args, int32_t child_count, int hashmode)
+static int32_t
+afr_least_latency_times_pending_reads_child(afr_private_t *priv,
+ unsigned char *readable)
{
- uuid_t gfid_copy = {0,};
- pid_t pid;
+ int32_t i = 0;
+ int child = -1;
+ int64_t pending_read = 0;
+ int64_t latency = -1;
+ int64_t least_latency = -1;
- if (!hashmode) {
- return -1;
- }
+ for (i = 0; i < priv->child_count; i++) {
+ if (AFR_IS_ARBITER_BRICK(priv, i) || !readable[i] ||
+ priv->child_latency[i] < 0)
+ continue;
+
+ pending_read = GF_ATOMIC_GET(priv->pending_reads[i]);
+ latency = (pending_read + 1) * priv->child_latency[i];
- gf_uuid_copy (gfid_copy, args->gfid);
+ if (child == -1 || latency < least_latency) {
+ least_latency = latency;
+ child = i;
+ }
+ }
+ return child;
+}
- if ((hashmode > 1) && (args->ia_type != IA_IFDIR)) {
+int
+afr_hash_child(afr_read_subvol_args_t *args, afr_private_t *priv,
+ unsigned char *readable)
+{
+ uuid_t gfid_copy = {
+ 0,
+ };
+ pid_t pid;
+ int child = -1;
+
+ switch (priv->hash_mode) {
+ case AFR_READ_POLICY_FIRST_UP:
+ break;
+ case AFR_READ_POLICY_GFID_HASH:
+ gf_uuid_copy(gfid_copy, args->gfid);
+ child = SuperFastHash((char *)gfid_copy, sizeof(gfid_copy)) %
+ priv->child_count;
+ break;
+ case AFR_READ_POLICY_GFID_PID_HASH:
+ if (args->ia_type != IA_IFDIR) {
/*
* Why getpid? Because it's one of the cheapest calls
- * available - faster than gethostname etc. - and returns a
- * constant-length value that's sure to be shorter than a UUID.
- * It's still very unlikely to be the same across clients, so
- * it still provides good mixing. We're not trying for
- * perfection here. All we need is a low probability that
- * multiple clients won't converge on the same subvolume.
+ * available - faster than gethostname etc. - and
+ * returns a constant-length value that's sure to be
+ * shorter than a UUID. It's still very unlikely to be
+ * the same across clients, so it still provides good
+ * mixing. We're not trying for perfection here. All we
+ * need is a low probability that multiple clients
+ * won't converge on the same subvolume.
*/
+ gf_uuid_copy(gfid_copy, args->gfid);
pid = getpid();
- memcpy (gfid_copy, &pid, sizeof(pid));
- }
-
- return SuperFastHash((char *)gfid_copy,
- sizeof(gfid_copy)) % child_count;
+ *(pid_t *)gfid_copy ^= pid;
+ }
+ child = SuperFastHash((char *)gfid_copy, sizeof(gfid_copy)) %
+ priv->child_count;
+ break;
+ case AFR_READ_POLICY_LESS_LOAD:
+ child = afr_least_pending_reads_child(priv, readable);
+ break;
+ case AFR_READ_POLICY_LEAST_LATENCY:
+ child = afr_least_latency_child(priv, readable);
+ break;
+ case AFR_READ_POLICY_LOAD_LATENCY_HYBRID:
+ child = afr_least_latency_times_pending_reads_child(priv, readable);
+ break;
+ }
+
+ return child;
}
-
int
-afr_read_subvol_select_by_policy (inode_t *inode, xlator_t *this,
- unsigned char *readable,
- afr_read_subvol_args_t *args)
+afr_read_subvol_select_by_policy(inode_t *inode, xlator_t *this,
+ unsigned char *readable,
+ afr_read_subvol_args_t *args)
{
- int i = 0;
- int read_subvol = -1;
- afr_private_t *priv = NULL;
- afr_read_subvol_args_t local_args = {0,};
+ int i = 0;
+ int read_subvol = -1;
+ afr_private_t *priv = NULL;
+ afr_read_subvol_args_t local_args = {
+ 0,
+ };
- priv = this->private;
+ priv = this->private;
- /* first preference - explicitly specified or local subvolume */
- if (priv->read_child >= 0 && readable[priv->read_child])
- return priv->read_child;
+ /* first preference - explicitly specified or local subvolume */
+ if (priv->read_child >= 0 && readable[priv->read_child])
+ return priv->read_child;
- if (inode_is_linked (inode)) {
- gf_uuid_copy (local_args.gfid, inode->gfid);
- local_args.ia_type = inode->ia_type;
- } else if (args) {
- local_args = *args;
- }
+ if (inode_is_linked(inode)) {
+ gf_uuid_copy(local_args.gfid, inode->gfid);
+ local_args.ia_type = inode->ia_type;
+ } else if (args) {
+ local_args = *args;
+ }
- /* second preference - use hashed mode */
- read_subvol = afr_hash_child (&local_args, priv->child_count,
- priv->hash_mode);
- if (read_subvol >= 0 && readable[read_subvol])
- return read_subvol;
+ /* second preference - use hashed mode */
+ read_subvol = afr_hash_child(&local_args, priv, readable);
+ if (read_subvol >= 0 && readable[read_subvol])
+ return read_subvol;
- for (i = 0; i < priv->child_count; i++) {
- if (readable[i])
- return i;
- }
+ for (i = 0; i < priv->child_count; i++) {
+ if (readable[i])
+ return i;
+ }
- /* no readable subvolumes, either split brain or all subvols down */
+ /* no readable subvolumes, either split brain or all subvols down */
- return -1;
+ return -1;
}
-
int
-afr_inode_read_subvol_type_get (inode_t *inode, xlator_t *this,
- unsigned char *readable, int *event_p,
- int type)
+afr_inode_read_subvol_type_get(inode_t *inode, xlator_t *this,
+ unsigned char *readable, int *event_p, int type)
{
- int ret = -1;
+ int ret = -1;
- if (type == AFR_METADATA_TRANSACTION)
- ret = afr_inode_read_subvol_get (inode, this, 0, readable,
- event_p);
- else
- ret = afr_inode_read_subvol_get (inode, this, readable, 0,
- event_p);
- return ret;
+ if (type == AFR_METADATA_TRANSACTION)
+ ret = afr_inode_read_subvol_get(inode, this, 0, readable, event_p);
+ else
+ ret = afr_inode_read_subvol_get(inode, this, readable, 0, event_p);
+ return ret;
}
+void
+afr_readables_intersect_get(inode_t *inode, xlator_t *this, int *event,
+ unsigned char *intersection)
+{
+ afr_private_t *priv = NULL;
+ unsigned char *data_readable = NULL;
+ unsigned char *metadata_readable = NULL;
+ unsigned char *intersect = NULL;
+
+ priv = this->private;
+ data_readable = alloca0(priv->child_count);
+ metadata_readable = alloca0(priv->child_count);
+ intersect = alloca0(priv->child_count);
+
+ afr_inode_read_subvol_get(inode, this, data_readable, metadata_readable,
+ event);
+
+ AFR_INTERSECT(intersect, data_readable, metadata_readable,
+ priv->child_count);
+ if (intersection)
+ memcpy(intersection, intersect,
+ sizeof(*intersection) * priv->child_count);
+}
int
-afr_read_subvol_get (inode_t *inode, xlator_t *this, int *subvol_p,
- int *event_p, afr_transaction_type type,
- afr_read_subvol_args_t *args)
+afr_read_subvol_get(inode_t *inode, xlator_t *this, int *subvol_p,
+ unsigned char *readables, int *event_p,
+ afr_transaction_type type, afr_read_subvol_args_t *args)
{
- afr_private_t *priv = NULL;
- unsigned char *data_readable = NULL;
- unsigned char *metadata_readable = NULL;
- unsigned char *readable = NULL;
- unsigned char *intersection = NULL;
- int subvol = -1;
- int event = 0;
+ afr_private_t *priv = NULL;
+ unsigned char *readable = NULL;
+ unsigned char *intersection = NULL;
+ int subvol = -1;
+ int event = 0;
- priv = this->private;
+ priv = this->private;
- readable = alloca0 (priv->child_count);
- data_readable = alloca0 (priv->child_count);
- metadata_readable = alloca0 (priv->child_count);
- intersection = alloca0 (priv->child_count);
+ readable = alloca0(priv->child_count);
+ intersection = alloca0(priv->child_count);
- afr_inode_read_subvol_type_get (inode, this, readable, &event, type);
+ afr_inode_read_subvol_type_get(inode, this, readable, &event, type);
- afr_inode_read_subvol_get (inode, this, data_readable, metadata_readable,
- &event);
+ afr_readables_intersect_get(inode, this, &event, intersection);
- AFR_INTERSECT (intersection, data_readable, metadata_readable,
- priv->child_count);
-
- if (AFR_COUNT (intersection, priv->child_count) > 0)
- subvol = afr_read_subvol_select_by_policy (inode, this,
- intersection, args);
- else
- subvol = afr_read_subvol_select_by_policy (inode, this,
- readable, args);
- if (subvol_p)
- *subvol_p = subvol;
- if (event_p)
- *event_p = event;
- return subvol;
+ if (AFR_COUNT(intersection, priv->child_count) > 0)
+ subvol = afr_read_subvol_select_by_policy(inode, this, intersection,
+ args);
+ else
+ subvol = afr_read_subvol_select_by_policy(inode, this, readable, args);
+ if (subvol_p)
+ *subvol_p = subvol;
+ if (event_p)
+ *event_p = event;
+ if (readables)
+ memcpy(readables, readable, sizeof(*readables) * priv->child_count);
+ return subvol;
}
-
void
-afr_local_transaction_cleanup (afr_local_t *local, xlator_t *this)
+afr_local_transaction_cleanup(afr_local_t *local, xlator_t *this)
{
- afr_private_t *priv = NULL;
- int i = 0;
-
- priv = this->private;
+ afr_private_t *priv = NULL;
+ int i = 0;
- afr_matrix_cleanup (local->pending, priv->child_count);
+ priv = this->private;
- GF_FREE (local->internal_lock.locked_nodes);
-
- for (i = 0; local->internal_lock.inodelk[i].domain; i++) {
- GF_FREE (local->internal_lock.inodelk[i].locked_nodes);
- }
+ afr_matrix_cleanup(local->pending, priv->child_count);
- GF_FREE (local->internal_lock.lower_locked_nodes);
+ GF_FREE(local->internal_lock.lower_locked_nodes);
- afr_entry_lockee_cleanup (&local->internal_lock);
+ afr_lockees_cleanup(&local->internal_lock);
- GF_FREE (local->transaction.pre_op);
+ GF_FREE(local->transaction.pre_op);
- GF_FREE (local->transaction.pre_op_sources);
- if (local->transaction.pre_op_xdata) {
- for (i = 0; i < priv->child_count; i++) {
- if (!local->transaction.pre_op_xdata[i])
- continue;
- dict_unref (local->transaction.pre_op_xdata[i]);
- }
- GF_FREE (local->transaction.pre_op_xdata);
+ GF_FREE(local->transaction.pre_op_sources);
+ if (local->transaction.changelog_xdata) {
+ for (i = 0; i < priv->child_count; i++) {
+ if (!local->transaction.changelog_xdata[i])
+ continue;
+ dict_unref(local->transaction.changelog_xdata[i]);
}
+ GF_FREE(local->transaction.changelog_xdata);
+ }
- GF_FREE (local->transaction.eager_lock);
- GF_FREE (local->transaction.failed_subvols);
-
- GF_FREE (local->transaction.basename);
- GF_FREE (local->transaction.new_basename);
+ GF_FREE(local->transaction.failed_subvols);
- loc_wipe (&local->transaction.parent_loc);
- loc_wipe (&local->transaction.new_parent_loc);
+ GF_FREE(local->transaction.basename);
+ GF_FREE(local->transaction.new_basename);
+ loc_wipe(&local->transaction.parent_loc);
+ loc_wipe(&local->transaction.new_parent_loc);
}
-
void
-afr_replies_wipe (struct afr_reply *replies, int count)
+afr_reply_wipe(struct afr_reply *reply)
{
- int i = 0;
+ if (reply->xdata) {
+ dict_unref(reply->xdata);
+ reply->xdata = NULL;
+ }
- for (i = 0; i < count; i++) {
- if (replies[i].xdata) {
- dict_unref (replies[i].xdata);
- replies[i].xdata = NULL;
- }
+ if (reply->xattr) {
+ dict_unref(reply->xattr);
+ reply->xattr = NULL;
+ }
+}
- if (replies[i].xattr) {
- dict_unref (replies[i].xattr);
- replies[i].xattr = NULL;
- }
- }
+void
+afr_replies_wipe(struct afr_reply *replies, int count)
+{
+ int i = 0;
+
+ for (i = 0; i < count; i++) {
+ afr_reply_wipe(&replies[i]);
+ }
}
void
-afr_local_replies_wipe (afr_local_t *local, afr_private_t *priv)
+afr_local_replies_wipe(afr_local_t *local, afr_private_t *priv)
{
+ if (!local->replies)
+ return;
- if (!local->replies)
- return;
+ afr_replies_wipe(local->replies, priv->child_count);
- afr_replies_wipe (local->replies, priv->child_count);
+ memset(local->replies, 0, sizeof(*local->replies) * priv->child_count);
+}
- memset (local->replies, 0, sizeof(*local->replies) * priv->child_count);
+static gf_boolean_t
+afr_fop_lock_is_unlock(call_frame_t *frame)
+{
+ afr_local_t *local = frame->local;
+ switch (local->op) {
+ case GF_FOP_INODELK:
+ case GF_FOP_FINODELK:
+ if ((F_UNLCK == local->cont.inodelk.in_flock.l_type) &&
+ (local->cont.inodelk.in_cmd == F_SETLKW ||
+ local->cont.inodelk.in_cmd == F_SETLK))
+ return _gf_true;
+ break;
+ case GF_FOP_ENTRYLK:
+ case GF_FOP_FENTRYLK:
+ if (ENTRYLK_UNLOCK == local->cont.entrylk.in_cmd)
+ return _gf_true;
+ break;
+ default:
+ return _gf_false;
+ }
+ return _gf_false;
}
-void
-afr_remove_eager_lock_stub (afr_local_t *local)
-{
- LOCK (&local->fd->lock);
- {
- list_del_init (&local->transaction.eager_locked);
- }
- UNLOCK (&local->fd->lock);
+static gf_boolean_t
+afr_lk_is_unlock(int32_t cmd, struct gf_flock *flock)
+{
+ switch (cmd) {
+ case F_RESLK_UNLCK:
+ return _gf_true;
+ break;
+
+#if F_SETLKW != F_SETLKW64
+ case F_SETLKW64:
+#endif
+ case F_SETLKW:
+
+#if F_SETLK != F_SETLK64
+ case F_SETLK64:
+#endif
+ case F_SETLK:
+ if (F_UNLCK == flock->l_type)
+ return _gf_true;
+ break;
+ default:
+ return _gf_false;
+ }
+ return _gf_false;
}
void
-afr_local_cleanup (afr_local_t *local, xlator_t *this)
+afr_handle_inconsistent_fop(call_frame_t *frame, int32_t *op_ret,
+ int32_t *op_errno)
{
- afr_private_t * priv = NULL;
+ afr_private_t *priv = NULL;
+ afr_local_t *local = NULL;
- if (!local)
- return;
+ if (!frame || !frame->this || !frame->local || !frame->this->private)
+ return;
- syncbarrier_destroy (&local->barrier);
+ if (*op_ret < 0)
+ return;
- if (local->transaction.eager_lock_on &&
- !list_empty (&local->transaction.eager_locked))
- afr_remove_eager_lock_stub (local);
+ /* Failing inodelk/entrylk/lk here is not a good idea because we
+ * need to cleanup the locks on the other bricks if we choose to fail
+ * the fop here. The brick may go down just after unwind happens as well
+ * so anyways the fop will fail when the next fop is sent so leaving
+ * it like this for now.*/
+ local = frame->local;
+ switch (local->op) {
+ case GF_FOP_LOOKUP:
+ case GF_FOP_INODELK:
+ case GF_FOP_FINODELK:
+ case GF_FOP_ENTRYLK:
+ case GF_FOP_FENTRYLK:
+ case GF_FOP_LK:
+ return;
+ default:
+ break;
+ }
- afr_local_transaction_cleanup (local, this);
+ priv = frame->this->private;
+ if (!priv->consistent_io)
+ return;
- priv = this->private;
+ if (local->event_generation &&
+ (local->event_generation != priv->event_generation))
+ goto inconsistent;
- loc_wipe (&local->loc);
- loc_wipe (&local->newloc);
+ return;
+inconsistent:
+ *op_ret = -1;
+ *op_errno = ENOTCONN;
+}
- if (local->fd)
- fd_unref (local->fd);
+void
+afr_local_cleanup(afr_local_t *local, xlator_t *this)
+{
+ afr_private_t *priv = NULL;
- if (local->xattr_req)
- dict_unref (local->xattr_req);
+ if (!local)
+ return;
- if (local->xattr_rsp)
- dict_unref (local->xattr_rsp);
+ syncbarrier_destroy(&local->barrier);
- if (local->dict)
- dict_unref (local->dict);
+ afr_local_transaction_cleanup(local, this);
- afr_local_replies_wipe (local, priv);
- GF_FREE(local->replies);
+ priv = this->private;
- GF_FREE (local->child_up);
+ loc_wipe(&local->loc);
+ loc_wipe(&local->newloc);
- GF_FREE (local->read_attempted);
+ if (local->fd)
+ fd_unref(local->fd);
- GF_FREE (local->readable);
+ if (local->xattr_req)
+ dict_unref(local->xattr_req);
- if (local->inode)
- inode_unref (local->inode);
+ if (local->xattr_rsp)
+ dict_unref(local->xattr_rsp);
- if (local->parent)
- inode_unref (local->parent);
+ if (local->dict)
+ dict_unref(local->dict);
- if (local->parent2)
- inode_unref (local->parent2);
+ afr_local_replies_wipe(local, priv);
+ GF_FREE(local->replies);
- if (local->refreshinode)
- inode_unref (local->refreshinode);
+ GF_FREE(local->child_up);
- { /* getxattr */
- GF_FREE (local->cont.getxattr.name);
- }
+ GF_FREE(local->read_attempted);
+
+ GF_FREE(local->readable);
+ GF_FREE(local->readable2);
+
+ if (local->inode)
+ inode_unref(local->inode);
- { /* lk */
- GF_FREE (local->cont.lk.locked_nodes);
- }
+ if (local->parent)
+ inode_unref(local->parent);
- { /* create */
- if (local->cont.create.fd)
- fd_unref (local->cont.create.fd);
- if (local->cont.create.params)
- dict_unref (local->cont.create.params);
- }
+ if (local->parent2)
+ inode_unref(local->parent2);
- { /* mknod */
- if (local->cont.mknod.params)
- dict_unref (local->cont.mknod.params);
- }
+ if (local->refreshinode)
+ inode_unref(local->refreshinode);
- { /* mkdir */
- if (local->cont.mkdir.params)
- dict_unref (local->cont.mkdir.params);
- }
+ { /* getxattr */
+ GF_FREE(local->cont.getxattr.name);
+ }
- { /* symlink */
- if (local->cont.symlink.params)
- dict_unref (local->cont.symlink.params);
- }
+ { /* lk */
+ GF_FREE(local->cont.lk.locked_nodes);
+ GF_FREE(local->cont.lk.dom_locked_nodes);
+ GF_FREE(local->cont.lk.dom_lock_op_ret);
+ GF_FREE(local->cont.lk.dom_lock_op_errno);
+ }
- { /* writev */
- GF_FREE (local->cont.writev.vector);
- if (local->cont.writev.iobref)
- iobref_unref (local->cont.writev.iobref);
- }
+ { /* create */
+ if (local->cont.create.fd)
+ fd_unref(local->cont.create.fd);
+ if (local->cont.create.params)
+ dict_unref(local->cont.create.params);
+ }
- { /* setxattr */
- if (local->cont.setxattr.dict)
- dict_unref (local->cont.setxattr.dict);
- }
+ { /* mknod */
+ if (local->cont.mknod.params)
+ dict_unref(local->cont.mknod.params);
+ }
- { /* fsetxattr */
- if (local->cont.fsetxattr.dict)
- dict_unref (local->cont.fsetxattr.dict);
- }
+ { /* mkdir */
+ if (local->cont.mkdir.params)
+ dict_unref(local->cont.mkdir.params);
+ }
- { /* removexattr */
- GF_FREE (local->cont.removexattr.name);
- }
- { /* xattrop */
- if (local->cont.xattrop.xattr)
- dict_unref (local->cont.xattrop.xattr);
- }
- { /* symlink */
- GF_FREE (local->cont.symlink.linkpath);
- }
+ { /* symlink */
+ if (local->cont.symlink.params)
+ dict_unref(local->cont.symlink.params);
+ }
- { /* opendir */
- GF_FREE (local->cont.opendir.checksum);
- }
+ { /* writev */
+ GF_FREE(local->cont.writev.vector);
+ if (local->cont.writev.iobref)
+ iobref_unref(local->cont.writev.iobref);
+ }
- { /* readdirp */
- if (local->cont.readdir.dict)
- dict_unref (local->cont.readdir.dict);
- }
+ { /* setxattr */
+ if (local->cont.setxattr.dict)
+ dict_unref(local->cont.setxattr.dict);
+ }
- { /* inodelk */
- GF_FREE (local->cont.inodelk.volume);
- }
+ { /* fsetxattr */
+ if (local->cont.fsetxattr.dict)
+ dict_unref(local->cont.fsetxattr.dict);
+ }
- if (local->xdata_req)
- dict_unref (local->xdata_req);
+ { /* removexattr */
+ GF_FREE(local->cont.removexattr.name);
+ }
+ { /* xattrop */
+ if (local->cont.xattrop.xattr)
+ dict_unref(local->cont.xattrop.xattr);
+ }
+ { /* symlink */
+ GF_FREE(local->cont.symlink.linkpath);
+ }
- if (local->xdata_rsp)
- dict_unref (local->xdata_rsp);
+ { /* opendir */
+ GF_FREE(local->cont.opendir.checksum);
+ }
+
+ { /* open */
+ if (local->cont.open.fd)
+ fd_unref(local->cont.open.fd);
+ }
+
+ { /* readdirp */
+ if (local->cont.readdir.dict)
+ dict_unref(local->cont.readdir.dict);
+ }
+
+ { /* inodelk */
+ GF_FREE(local->cont.inodelk.volume);
+ if (local->cont.inodelk.xdata)
+ dict_unref(local->cont.inodelk.xdata);
+ }
+
+ { /* entrylk */
+ GF_FREE(local->cont.entrylk.volume);
+ GF_FREE(local->cont.entrylk.basename);
+ if (local->cont.entrylk.xdata)
+ dict_unref(local->cont.entrylk.xdata);
+ }
+
+ if (local->xdata_req)
+ dict_unref(local->xdata_req);
+
+ if (local->xdata_rsp)
+ dict_unref(local->xdata_rsp);
}
-
int
-afr_frame_return (call_frame_t *frame)
+afr_frame_return(call_frame_t *frame)
{
- afr_local_t *local = NULL;
- int call_count = 0;
+ afr_local_t *local = NULL;
+ int call_count = 0;
- local = frame->local;
+ local = frame->local;
- LOCK (&frame->lock);
- {
- call_count = --local->call_count;
- }
- UNLOCK (&frame->lock);
+ LOCK(&frame->lock);
+ {
+ call_count = --local->call_count;
+ }
+ UNLOCK(&frame->lock);
- return call_count;
+ return call_count;
}
+static char *afr_ignore_xattrs[] = {GF_SELINUX_XATTR_KEY, QUOTA_SIZE_KEY, NULL};
gf_boolean_t
-afr_is_entry_possibly_under_txn (afr_local_t *local, xlator_t *this)
+afr_is_xattr_ignorable(char *key)
{
- int i = 0;
- int tmp = 0;
- afr_private_t *priv = NULL;
-
- priv = this->private;
-
- for (i = 0; i < priv->child_count; i++) {
- if (!local->replies[i].xdata)
- continue;
- if (dict_get_int32 (local->replies[i].xdata,
- GLUSTERFS_PARENT_ENTRYLK,
- &tmp) == 0)
- if (tmp)
- return _gf_true;
- }
+ int i = 0;
- return _gf_false;
+ if (!strncmp(key, AFR_XATTR_PREFIX, SLEN(AFR_XATTR_PREFIX)))
+ return _gf_true;
+ for (i = 0; afr_ignore_xattrs[i]; i++) {
+ if (!strcmp(key, afr_ignore_xattrs[i]))
+ return _gf_true;
+ }
+ return _gf_false;
}
-
-static char *afr_ignore_xattrs[] = {
- GLUSTERFS_OPEN_FD_COUNT,
- GLUSTERFS_PARENT_ENTRYLK,
- GLUSTERFS_ENTRYLK_COUNT,
- GLUSTERFS_INODELK_COUNT,
- GF_SELINUX_XATTR_KEY,
- QUOTA_SIZE_KEY,
- NULL
-};
-
-gf_boolean_t
-afr_is_xattr_ignorable (char *key)
+static gf_boolean_t
+afr_xattr_match_needed(dict_t *this, char *key1, data_t *value1, void *data)
{
- int i = 0;
-
- if (!strncmp (key, AFR_XATTR_PREFIX, strlen(AFR_XATTR_PREFIX)))
- return _gf_true;
- for (i = 0; afr_ignore_xattrs[i]; i++) {
- if (!strcmp (key, afr_ignore_xattrs[i]))
- return _gf_true;
- }
+ /* Ignore all non-disk (i.e. virtual) xattrs right away. */
+ if (!gf_is_valid_xattr_namespace(key1))
return _gf_false;
-}
-static gf_boolean_t
-afr_xattr_match (dict_t *this, char *key1, data_t *value1, void *data)
-{
- if (!afr_is_xattr_ignorable (key1))
- return _gf_true;
+ /* Ignore on-disk xattrs that AFR doesn't need to heal. */
+ if (!afr_is_xattr_ignorable(key1))
+ return _gf_true;
- return _gf_false;
+ return _gf_false;
}
gf_boolean_t
-afr_xattrs_are_equal (dict_t *dict1, dict_t *dict2)
+afr_xattrs_are_equal(dict_t *dict1, dict_t *dict2)
{
- return are_dicts_equal (dict1, dict2, afr_xattr_match, NULL);
+ return are_dicts_equal(dict1, dict2, afr_xattr_match_needed, NULL);
}
static int
-afr_get_parent_read_subvol (xlator_t *this, inode_t *parent,
- struct afr_reply *replies, unsigned char *readable)
+afr_get_parent_read_subvol(xlator_t *this, inode_t *parent,
+ struct afr_reply *replies, unsigned char *readable)
{
- int i = 0;
- int par_read_subvol = -1;
- int par_read_subvol_iter = -1;
- afr_private_t *priv = NULL;
+ int i = 0;
+ int par_read_subvol = -1;
+ int par_read_subvol_iter = -1;
+ afr_private_t *priv = NULL;
- priv = this->private;
+ priv = this->private;
- if (parent)
- par_read_subvol = afr_data_subvol_get (parent, this, 0, 0,
- NULL);
+ if (parent)
+ par_read_subvol = afr_data_subvol_get(parent, this, NULL, NULL, NULL,
+ NULL);
- for (i = 0; i < priv->child_count; i++) {
- if (!replies[i].valid)
- continue;
-
- if (replies[i].op_ret < 0)
- continue;
-
- if (par_read_subvol_iter == -1) {
- par_read_subvol_iter = i;
- continue;
- }
+ for (i = 0; i < priv->child_count; i++) {
+ if (!replies[i].valid)
+ continue;
- if ((par_read_subvol_iter != par_read_subvol) && readable[i])
- par_read_subvol_iter = i;
+ if (replies[i].op_ret < 0)
+ continue;
- if (i == par_read_subvol)
- par_read_subvol_iter = i;
+ if (par_read_subvol_iter == -1) {
+ par_read_subvol_iter = i;
+ continue;
}
- /* At the end of the for-loop, the only reason why @par_read_subvol_iter
- * could be -1 is when this LOOKUP has failed on all sub-volumes.
- * So it is okay to send an arbitrary subvolume (0 in this case)
- * as parent read subvol.
- */
- if (par_read_subvol_iter == -1)
- par_read_subvol_iter = 0;
- return par_read_subvol_iter;
+ if ((par_read_subvol_iter != par_read_subvol) && readable[i])
+ par_read_subvol_iter = i;
+
+ if (i == par_read_subvol)
+ par_read_subvol_iter = i;
+ }
+ /* At the end of the for-loop, the only reason why @par_read_subvol_iter
+ * could be -1 is when this LOOKUP has failed on all sub-volumes.
+ * So it is okay to send an arbitrary subvolume (0 in this case)
+ * as parent read subvol.
+ */
+ if (par_read_subvol_iter == -1)
+ par_read_subvol_iter = 0;
+ return par_read_subvol_iter;
}
int
-afr_read_subvol_decide (inode_t *inode, xlator_t *this,
- afr_read_subvol_args_t *args)
+afr_read_subvol_decide(inode_t *inode, xlator_t *this,
+ afr_read_subvol_args_t *args, unsigned char *readable)
{
- int data_subvol = -1;
- int mdata_subvol = -1;
+ int event = 0;
+ afr_private_t *priv = NULL;
+ unsigned char *intersection = NULL;
- data_subvol = afr_data_subvol_get (inode, this,
- 0, 0, args);
- mdata_subvol = afr_metadata_subvol_get (inode, this,
- 0, 0, args);
- if (data_subvol == -1 || mdata_subvol == -1)
- return -1;
+ priv = this->private;
+ intersection = alloca0(priv->child_count);
- return data_subvol;
+ afr_readables_intersect_get(inode, this, &event, intersection);
+
+ if (AFR_COUNT(intersection, priv->child_count) <= 0) {
+ /* TODO: If we have one brick with valid data_readable and
+ * another with metadata_readable, try to send an iatt with
+ * valid bits from both.*/
+ return -1;
+ }
+
+ memcpy(readable, intersection, sizeof(*readable) * priv->child_count);
+
+ return afr_read_subvol_select_by_policy(inode, this, intersection, args);
}
static inline int
-afr_first_up_child (call_frame_t *frame, xlator_t *this)
+afr_first_up_child(call_frame_t *frame, xlator_t *this)
{
- afr_private_t *priv = NULL;
- afr_local_t *local = NULL;
- int i = 0;
+ afr_private_t *priv = NULL;
+ afr_local_t *local = NULL;
+ int i = 0;
- local = frame->local;
- priv = this->private;
+ local = frame->local;
+ priv = this->private;
- for (i = 0; i < priv->child_count; i++)
- if (local->replies[i].valid &&
- local->replies[i].op_ret == 0)
- return i;
- return 0;
+ for (i = 0; i < priv->child_count; i++)
+ if (local->replies[i].valid && local->replies[i].op_ret == 0)
+ return i;
+ return -1;
}
static void
-afr_lookup_done (call_frame_t *frame, xlator_t *this)
-{
- afr_private_t *priv = NULL;
- afr_local_t *local = NULL;
- int i = -1;
- int op_errno = 0;
- int read_subvol = 0;
- int par_read_subvol = 0;
- unsigned char *readable = NULL;
- int event = 0;
- struct afr_reply *replies = NULL;
- uuid_t read_gfid = {0, };
- gf_boolean_t locked_entry = _gf_false;
- gf_boolean_t can_interpret = _gf_true;
- inode_t *parent = NULL;
- int spb_choice = -1;
- ia_type_t ia_type = IA_INVAL;
- afr_read_subvol_args_t args = {0,};
-
- priv = this->private;
- local = frame->local;
- replies = local->replies;
- parent = local->loc.parent;
-
- locked_entry = afr_is_entry_possibly_under_txn (local, this);
-
- readable = alloca0 (priv->child_count);
-
- afr_inode_read_subvol_get (parent, this, readable, NULL, &event);
-
- afr_inode_split_brain_choice_get (local->inode, this,
- &spb_choice);
- /* First, check if we have a gfid-change from somewhere,
- If so, propagate that so that a fresh lookup can be
- issued
- */
- if (local->cont.lookup.needs_fresh_lookup) {
- local->op_ret = -1;
- local->op_errno = ESTALE;
- goto unwind;
- }
-
- op_errno = afr_final_errno (frame->local, this->private);
- local->op_errno = op_errno;
-
- read_subvol = -1;
- for (i = 0; i < priv->child_count; i++) {
- if (!replies[i].valid)
- continue;
-
- if (locked_entry && replies[i].op_ret == -1 &&
- replies[i].op_errno == ENOENT) {
- /* Second, check entry is still
- "underway" in creation */
- local->op_ret = -1;
- local->op_errno = ENOENT;
- goto unwind;
- }
-
- if (replies[i].op_ret == -1)
- continue;
-
- if (read_subvol == -1 || !readable[read_subvol]) {
- read_subvol = i;
- gf_uuid_copy (read_gfid, replies[i].poststat.ia_gfid);
- ia_type = replies[i].poststat.ia_type;
- local->op_ret = 0;
- }
- }
-
- if (read_subvol == -1)
- goto unwind;
- /* We now have a read_subvol, which is readable[] (if there
- were any). Next we look for GFID mismatches. We don't
- consider a GFID mismatch as an error if read_subvol is
- readable[] but the mismatching GFID subvol is not.
- */
- for (i = 0; i < priv->child_count; i++) {
- if (!replies[i].valid || replies[i].op_ret == -1) {
- if (priv->child_up[i])
- can_interpret = _gf_false;
- continue;
- }
-
- if (!gf_uuid_compare (replies[i].poststat.ia_gfid, read_gfid))
- continue;
-
- can_interpret = _gf_false;
-
- if (locked_entry)
- continue;
-
- /* Now GFIDs mismatch. It's OK as long as this subvol
- is not readable[] but read_subvol is */
- if (readable[read_subvol] && !readable[i])
- continue;
-
- /* LOG ERROR */
- local->op_ret = -1;
- local->op_errno = EIO;
- goto unwind;
- }
-
- /* Forth, for the finalized GFID, pick the best subvolume
- to return stats from.
- */
- if (can_interpret) {
- /* It is safe to call afr_replies_interpret() because we have
- a response from all the UP subvolumes and all of them resolved
- to the same GFID
- */
- gf_uuid_copy (args.gfid, read_gfid);
- args.ia_type = ia_type;
- if (afr_replies_interpret (frame, this, local->inode)) {
- read_subvol = afr_read_subvol_decide (local->inode,
- this, &args);
- afr_inode_read_subvol_reset (local->inode, this);
- goto cant_interpret;
- } else {
- read_subvol = afr_data_subvol_get (local->inode, this,
- 0, 0, &args);
- }
- } else {
- cant_interpret:
- if (read_subvol == -1) {
- if (spb_choice >= 0)
- read_subvol = spb_choice;
- else
- read_subvol = afr_first_up_child (frame, this);
- }
- dict_del (replies[read_subvol].xdata, GF_CONTENT_KEY);
- }
+afr_attempt_readsubvol_set(call_frame_t *frame, xlator_t *this,
+ unsigned char *success_replies,
+ unsigned char *data_readable, int *read_subvol)
+{
+ afr_private_t *priv = NULL;
+ afr_local_t *local = NULL;
+ int spb_subvol = -1;
+ int child_count = -1;
- afr_handle_quota_size (frame, this);
+ if (*read_subvol != -1)
+ return;
-unwind:
- afr_set_need_heal (this, local);
+ priv = this->private;
+ local = frame->local;
+ child_count = priv->child_count;
+
+ afr_split_brain_read_subvol_get(local->inode, this, frame, &spb_subvol);
+ if ((spb_subvol >= 0) &&
+ (AFR_COUNT(success_replies, child_count) == child_count)) {
+ *read_subvol = spb_subvol;
+ } else if (!priv->quorum_count ||
+ frame->root->pid == GF_CLIENT_PID_GLFS_HEAL) {
+ *read_subvol = afr_first_up_child(frame, this);
+ } else if (priv->quorum_count &&
+ afr_has_quorum(data_readable, this, NULL)) {
+ /* read_subvol is guaranteed to be valid if we hit this path. */
+ *read_subvol = afr_first_up_child(frame, this);
+ } else {
+ /* If quorum is enabled and we do not have a
+ readable yet, it means all good copies are down.
+ */
+ local->op_ret = -1;
+ local->op_errno = ENOTCONN;
+ gf_msg(this->name, GF_LOG_WARNING, 0, AFR_MSG_READ_SUBVOL_ERROR,
+ "no read "
+ "subvols for %s",
+ local->loc.path);
+ }
+ if (*read_subvol >= 0)
+ dict_del_sizen(local->replies[*read_subvol].xdata, GF_CONTENT_KEY);
+}
+
+static void
+afr_lookup_done(call_frame_t *frame, xlator_t *this)
+{
+ afr_private_t *priv = NULL;
+ afr_local_t *local = NULL;
+ int i = -1;
+ int op_errno = 0;
+ int read_subvol = 0;
+ int par_read_subvol = 0;
+ int ret = -1;
+ unsigned char *readable = NULL;
+ unsigned char *success_replies = NULL;
+ int event = 0;
+ struct afr_reply *replies = NULL;
+ uuid_t read_gfid = {
+ 0,
+ };
+ gf_boolean_t locked_entry = _gf_false;
+ gf_boolean_t in_flight_create = _gf_false;
+ gf_boolean_t can_interpret = _gf_true;
+ inode_t *parent = NULL;
+ ia_type_t ia_type = IA_INVAL;
+ afr_read_subvol_args_t args = {
+ 0,
+ };
+ char *gfid_heal_msg = NULL;
+
+ priv = this->private;
+ local = frame->local;
+ replies = local->replies;
+ parent = local->loc.parent;
+
+ locked_entry = afr_is_possibly_under_txn(AFR_ENTRY_TRANSACTION, local,
+ this);
+
+ readable = alloca0(priv->child_count);
+ success_replies = alloca0(priv->child_count);
+
+ afr_inode_read_subvol_get(parent, this, readable, NULL, &event);
+ par_read_subvol = afr_get_parent_read_subvol(this, parent, replies,
+ readable);
+
+ /* First, check if we have a gfid-change from somewhere,
+ If so, propagate that so that a fresh lookup can be
+ issued
+ */
+ if (local->cont.lookup.needs_fresh_lookup) {
+ local->op_ret = -1;
+ local->op_errno = ESTALE;
+ goto error;
+ }
+
+ op_errno = afr_final_errno(frame->local, this->private);
+ local->op_errno = op_errno;
+
+ read_subvol = -1;
+ afr_fill_success_replies(local, priv, success_replies);
+
+ for (i = 0; i < priv->child_count; i++) {
+ if (!replies[i].valid)
+ continue;
+
+ if (replies[i].op_ret == -1) {
+ if (locked_entry && replies[i].op_errno == ENOENT) {
+ in_flight_create = _gf_true;
+ }
+ continue;
+ }
+
+ if (read_subvol == -1 || !readable[read_subvol]) {
+ read_subvol = i;
+ gf_uuid_copy(read_gfid, replies[i].poststat.ia_gfid);
+ ia_type = replies[i].poststat.ia_type;
+ local->op_ret = 0;
+ }
+ }
+
+ if (in_flight_create && !afr_has_quorum(success_replies, this, NULL)) {
+ local->op_ret = -1;
+ local->op_errno = ENOENT;
+ goto error;
+ }
+
+ if (read_subvol == -1)
+ goto error;
+ /* We now have a read_subvol, which is readable[] (if there
+ were any). Next we look for GFID mismatches. We don't
+ consider a GFID mismatch as an error if read_subvol is
+ readable[] but the mismatching GFID subvol is not.
+ */
+ for (i = 0; i < priv->child_count; i++) {
+ if (!replies[i].valid || replies[i].op_ret == -1) {
+ continue;
+ }
+
+ if (!gf_uuid_compare(replies[i].poststat.ia_gfid, read_gfid))
+ continue;
+
+ can_interpret = _gf_false;
+
+ if (locked_entry)
+ continue;
+
+ /* Now GFIDs mismatch. It's OK as long as this subvol
+ is not readable[] but read_subvol is */
+ if (readable[read_subvol] && !readable[i])
+ continue;
+
+ /* If we were called from glfsheal and there is still a gfid
+ * mismatch, succeed the lookup and let glfsheal print the
+ * response via gfid-heal-msg.*/
+ if (!dict_get_str_sizen(local->xattr_req, "gfid-heal-msg",
+ &gfid_heal_msg))
+ goto cant_interpret;
+
+ /* LOG ERROR */
+ local->op_ret = -1;
+ local->op_errno = EIO;
+ goto error;
+ }
+
+ /* Forth, for the finalized GFID, pick the best subvolume
+ to return stats from.
+ */
+ read_subvol = -1;
+ memset(readable, 0, sizeof(*readable) * priv->child_count);
+ if (can_interpret) {
+ if (!afr_has_quorum(success_replies, this, NULL))
+ goto cant_interpret;
+ /* It is safe to call afr_replies_interpret() because we have
+ a response from all the UP subvolumes and all of them resolved
+ to the same GFID
+ */
+ gf_uuid_copy(args.gfid, read_gfid);
+ args.ia_type = ia_type;
+ ret = afr_replies_interpret(frame, this, local->inode, NULL);
+ read_subvol = afr_read_subvol_decide(local->inode, this, &args,
+ readable);
+ if (read_subvol == -1)
+ goto cant_interpret;
+ if (ret) {
+ afr_inode_need_refresh_set(local->inode, this);
+ dict_del_sizen(local->replies[read_subvol].xdata, GF_CONTENT_KEY);
+ }
+ } else {
+ cant_interpret:
+ afr_attempt_readsubvol_set(frame, this, success_replies, readable,
+ &read_subvol);
if (read_subvol == -1) {
- if (spb_choice >= 0)
- read_subvol = spb_choice;
- else
- read_subvol = afr_first_up_child (frame, this);
+ goto error;
}
- par_read_subvol = afr_get_parent_read_subvol (this, parent, replies,
- readable);
+ }
- AFR_STACK_UNWIND (lookup, frame, local->op_ret, local->op_errno,
- local->inode, &local->replies[read_subvol].poststat,
- local->replies[read_subvol].xdata,
- &local->replies[par_read_subvol].postparent);
+ afr_handle_quota_size(frame, this);
+
+ afr_set_need_heal(this, local);
+ if (AFR_IS_ARBITER_BRICK(priv, read_subvol) && local->op_ret == 0) {
+ local->op_ret = -1;
+ local->op_errno = ENOTCONN;
+ gf_msg_debug(this->name, 0,
+ "Arbiter cannot be a read subvol "
+ "for %s",
+ local->loc.path);
+ goto error;
+ }
+
+ ret = dict_get_str_sizen(local->xattr_req, "gfid-heal-msg", &gfid_heal_msg);
+ if (!ret) {
+ ret = dict_set_str_sizen(local->replies[read_subvol].xdata,
+ "gfid-heal-msg", gfid_heal_msg);
+ if (ret) {
+ gf_msg(this->name, GF_LOG_ERROR, 0, AFR_MSG_DICT_SET_FAILED,
+ "Error setting gfid-heal-msg dict");
+ local->op_ret = -1;
+ local->op_errno = ENOMEM;
+ }
+ }
+
+ AFR_STACK_UNWIND(lookup, frame, local->op_ret, local->op_errno,
+ local->inode, &local->replies[read_subvol].poststat,
+ local->replies[read_subvol].xdata,
+ &local->replies[par_read_subvol].postparent);
+ return;
+
+error:
+ AFR_STACK_UNWIND(lookup, frame, local->op_ret, local->op_errno, NULL, NULL,
+ NULL, NULL);
}
/*
@@ -1802,650 +3122,845 @@ unwind:
* others in that they must be given higher priority while
* returning to the user.
*
- * The hierarchy is ENODATA > ENOENT > ESTALE > others
+ * The hierarchy is ENODATA > ENOENT > ESTALE > ENOSPC others
*/
int
-afr_higher_errno (int32_t old_errno, int32_t new_errno)
+afr_higher_errno(int32_t old_errno, int32_t new_errno)
{
- if (old_errno == ENODATA || new_errno == ENODATA)
- return ENODATA;
- if (old_errno == ENOENT || new_errno == ENOENT)
- return ENOENT;
- if (old_errno == ESTALE || new_errno == ESTALE)
- return ESTALE;
+ if (old_errno == ENODATA || new_errno == ENODATA)
+ return ENODATA;
+ if (old_errno == ENOENT || new_errno == ENOENT)
+ return ENOENT;
+ if (old_errno == ESTALE || new_errno == ESTALE)
+ return ESTALE;
+ if (old_errno == ENOSPC || new_errno == ENOSPC)
+ return ENOSPC;
- return new_errno;
+ return new_errno;
}
-
int
-afr_final_errno (afr_local_t *local, afr_private_t *priv)
+afr_final_errno(afr_local_t *local, afr_private_t *priv)
{
- int i = 0;
- int op_errno = 0;
- int tmp_errno = 0;
+ int i = 0;
+ int op_errno = 0;
+ int tmp_errno = 0;
- for (i = 0; i < priv->child_count; i++) {
- if (!local->replies[i].valid)
- continue;
- if (local->replies[i].op_ret >= 0)
- continue;
- tmp_errno = local->replies[i].op_errno;
- op_errno = afr_higher_errno (op_errno, tmp_errno);
- }
+ for (i = 0; i < priv->child_count; i++) {
+ if (!local->replies[i].valid)
+ continue;
+ if (local->replies[i].op_ret >= 0)
+ continue;
+ tmp_errno = local->replies[i].op_errno;
+ op_errno = afr_higher_errno(op_errno, tmp_errno);
+ }
- return op_errno;
+ return op_errno;
}
static int32_t
-afr_local_discovery_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
- int32_t op_ret, int32_t op_errno, dict_t *dict,
- dict_t *xdata)
-{
- int ret = 0;
- char *pathinfo = NULL;
- gf_boolean_t is_local = _gf_false;
- afr_private_t *priv = NULL;
- int32_t child_index = -1;
-
- if (op_ret != 0) {
- goto out;
- }
-
- priv = this->private;
- child_index = (int32_t)(long)cookie;
-
- ret = dict_get_str (dict, GF_XATTR_PATHINFO_KEY, &pathinfo);
- if (ret != 0) {
- goto out;
- }
-
- ret = glusterfs_is_local_pathinfo (pathinfo, &is_local);
- if (ret) {
- goto out;
- }
-
- /*
- * Note that one local subvolume will override another here. The only
- * way to avoid that would be to retain extra information about whether
- * the previous read_child is local, and it's just not worth it. Even
- * the slowest local subvolume is far preferable to a remote one.
- */
- if (is_local) {
- /* Don't set arbiter as read child. */
- if (AFR_IS_ARBITER_BRICK(priv, child_index))
- goto out;
- gf_msg (this->name, GF_LOG_INFO, 0,
- AFR_MSG_LOCAL_CHILD, "selecting local read_child %s",
- priv->children[child_index]->name);
-
- priv->read_child = child_index;
- }
+afr_local_discovery_cbk(call_frame_t *frame, void *cookie, xlator_t *this,
+ int32_t op_ret, int32_t op_errno, dict_t *dict,
+ dict_t *xdata)
+{
+ int ret = 0;
+ char *pathinfo = NULL;
+ gf_boolean_t is_local = _gf_false;
+ afr_private_t *priv = NULL;
+ int32_t child_index = -1;
+
+ if (op_ret != 0) {
+ goto out;
+ }
+
+ priv = this->private;
+ child_index = (int32_t)(long)cookie;
+
+ ret = dict_get_str_sizen(dict, GF_XATTR_PATHINFO_KEY, &pathinfo);
+ if (ret != 0) {
+ goto out;
+ }
+
+ ret = glusterfs_is_local_pathinfo(pathinfo, &is_local);
+ if (ret) {
+ goto out;
+ }
+
+ /*
+ * Note that one local subvolume will override another here. The only
+ * way to avoid that would be to retain extra information about whether
+ * the previous read_child is local, and it's just not worth it. Even
+ * the slowest local subvolume is far preferable to a remote one.
+ */
+ if (is_local) {
+ priv->local[child_index] = 1;
+ /* Don't set arbiter as read child. */
+ if (AFR_IS_ARBITER_BRICK(priv, child_index))
+ goto out;
+ gf_msg(this->name, GF_LOG_INFO, 0, AFR_MSG_LOCAL_CHILD,
+ "selecting local read_child %s",
+ priv->children[child_index]->name);
+
+ priv->read_child = child_index;
+ }
out:
- STACK_DESTROY(frame->root);
- return 0;
+ STACK_DESTROY(frame->root);
+ return 0;
}
static void
-afr_attempt_local_discovery (xlator_t *this, int32_t child_index)
+afr_attempt_local_discovery(xlator_t *this, int32_t child_index)
{
- call_frame_t *newframe = NULL;
- loc_t tmploc = {0,};
- afr_private_t *priv = this->private;
+ call_frame_t *newframe = NULL;
+ loc_t tmploc = {
+ 0,
+ };
+ afr_private_t *priv = this->private;
- newframe = create_frame(this,this->ctx->pool);
- if (!newframe) {
- return;
- }
+ newframe = create_frame(this, this->ctx->pool);
+ if (!newframe) {
+ return;
+ }
- tmploc.gfid[sizeof(tmploc.gfid)-1] = 1;
- STACK_WIND_COOKIE (newframe, afr_local_discovery_cbk,
- (void *)(long)child_index,
- priv->children[child_index],
- priv->children[child_index]->fops->getxattr,
- &tmploc, GF_XATTR_PATHINFO_KEY, NULL);
+ tmploc.gfid[sizeof(tmploc.gfid) - 1] = 1;
+ STACK_WIND_COOKIE(newframe, afr_local_discovery_cbk,
+ (void *)(long)child_index, priv->children[child_index],
+ priv->children[child_index]->fops->getxattr, &tmploc,
+ GF_XATTR_PATHINFO_KEY, NULL);
}
int
-afr_lookup_sh_metadata_wrap (void *opaque)
-{
- call_frame_t *frame = opaque;
- afr_local_t *local = NULL;
- xlator_t *this = NULL;
- inode_t *inode = NULL;
- afr_private_t *priv = NULL;
- struct afr_reply *replies = NULL;
- int i= 0, first = -1;
- int ret = -1;
- dict_t *dict = NULL;
+afr_lookup_sh_metadata_wrap(void *opaque)
+{
+ call_frame_t *frame = opaque;
+ afr_local_t *local = NULL;
+ xlator_t *this = NULL;
+ inode_t *inode = NULL;
+ afr_private_t *priv = NULL;
+ struct afr_reply *replies = NULL;
+ int i = 0, first = -1;
+ int ret = -1;
+ dict_t *dict = NULL;
+
+ local = frame->local;
+ this = frame->this;
+ priv = this->private;
+ replies = local->replies;
+
+ for (i = 0; i < priv->child_count; i++) {
+ if (!replies[i].valid || replies[i].op_ret == -1)
+ continue;
+ first = i;
+ break;
+ }
+ if (first == -1)
+ goto out;
+
+ if (afr_selfheal_metadata_by_stbuf(this, &replies[first].poststat))
+ goto out;
+
+ afr_local_replies_wipe(local, this->private);
+
+ dict = dict_new();
+ if (!dict)
+ goto out;
+ if (local->xattr_req) {
+ dict_copy(local->xattr_req, dict);
+ }
+
+ ret = dict_set_sizen_str_sizen(dict, "link-count", GF_XATTROP_INDEX_COUNT);
+ if (ret) {
+ gf_msg_debug(this->name, -ret, "Unable to set link-count in dict ");
+ }
+
+ if (loc_is_nameless(&local->loc)) {
+ ret = afr_selfheal_unlocked_discover_on(frame, local->inode,
+ local->loc.gfid, local->replies,
+ local->child_up, dict);
+ } else {
+ inode = afr_selfheal_unlocked_lookup_on(frame, local->loc.parent,
+ local->loc.name, local->replies,
+ local->child_up, dict);
+ }
+ if (inode)
+ inode_unref(inode);
+out:
+ if (loc_is_nameless(&local->loc))
+ afr_discover_done(frame, this);
+ else
+ afr_lookup_done(frame, this);
- local = frame->local;
- this = frame->this;
- priv = this->private;
- replies = local->replies;
-
- for (i =0; i < priv->child_count; i++) {
- if(!replies[i].valid || replies[i].op_ret == -1)
- continue;
- first = i;
- break;
- }
- if (first == -1)
- goto out;
+ if (dict)
+ dict_unref(dict);
- inode = afr_inode_link (local->inode,&replies[first].poststat);
- if(!inode)
- goto out;
+ return 0;
+}
- afr_selfheal_metadata (frame, this, inode);
- inode_forget (inode, 1);
- inode_unref (inode);
+gf_boolean_t
+afr_is_pending_set(xlator_t *this, dict_t *xdata, int type)
+{
+ int idx = -1;
+ afr_private_t *priv = NULL;
+ void *pending_raw = NULL;
+ int *pending_int = NULL;
+ int i = 0;
- afr_local_replies_wipe (local, this->private);
+ priv = this->private;
+ idx = afr_index_for_transaction_type(type);
- dict = dict_new ();
- if (!dict)
- goto out;
- ret = dict_set_str (dict, "link-count", GF_XATTROP_INDEX_COUNT);
- if (ret) {
- gf_msg_debug (this->name, -ret,
- "Unable to set link-count in dict ");
+ if (dict_get_ptr(xdata, AFR_DIRTY, &pending_raw) == 0) {
+ if (pending_raw) {
+ pending_int = pending_raw;
+
+ if (ntoh32(pending_int[idx]))
+ return _gf_true;
}
+ }
- inode = afr_selfheal_unlocked_lookup_on (frame, local->loc.parent,
- local->loc.name, local->replies,
- local->child_up, dict);
- if (inode)
- inode_unref (inode);
-out:
- afr_lookup_done (frame, this);
+ for (i = 0; i < priv->child_count; i++) {
+ if (dict_get_ptr(xdata, priv->pending_key[i], &pending_raw))
+ continue;
+ if (!pending_raw)
+ continue;
+ pending_int = pending_raw;
- if (dict)
- dict_unref (dict);
+ if (ntoh32(pending_int[idx]))
+ return _gf_true;
+ }
- return 0;
+ return _gf_false;
}
static gf_boolean_t
afr_can_start_metadata_self_heal(call_frame_t *frame, xlator_t *this)
{
- afr_local_t *local = NULL;
- afr_private_t *priv = NULL;
- struct afr_reply *replies = NULL;
- int i = 0, first = -1;
- gf_boolean_t start = _gf_false;
- struct iatt stbuf = {0, };
+ afr_local_t *local = NULL;
+ afr_private_t *priv = NULL;
+ struct afr_reply *replies = NULL;
+ int i = 0, first = -1;
+ gf_boolean_t start = _gf_false;
+ struct iatt stbuf = {
+ 0,
+ };
- local = frame->local;
- replies = local->replies;
- priv = this->private;
+ local = frame->local;
+ replies = local->replies;
+ priv = this->private;
- if (!priv->metadata_self_heal)
- return _gf_false;
+ if (!priv->metadata_self_heal)
+ return _gf_false;
- for (i = 0; i < priv->child_count; i++) {
- if(!replies[i].valid || replies[i].op_ret == -1)
- continue;
- if (first == -1) {
- first = i;
- stbuf = replies[i].poststat;
- continue;
- }
+ for (i = 0; i < priv->child_count; i++) {
+ if (!replies[i].valid || replies[i].op_ret == -1)
+ continue;
+ if (first == -1) {
+ first = i;
+ stbuf = replies[i].poststat;
+ continue;
+ }
- if (gf_uuid_compare (stbuf.ia_gfid, replies[i].poststat.ia_gfid)) {
- start = _gf_false;
- break;
- }
- if (!IA_EQUAL (stbuf, replies[i].poststat, type)) {
- start = _gf_false;
- break;
- }
+ if (afr_is_pending_set(this, replies[i].xdata,
+ AFR_METADATA_TRANSACTION)) {
+ /* Let shd do the heal so that lookup is not blocked
+ * on getting metadata lock/doing the heal */
+ start = _gf_false;
+ break;
+ }
- /*Check if iattrs need heal*/
- if ((!IA_EQUAL (stbuf, replies[i].poststat, uid)) ||
- (!IA_EQUAL (stbuf, replies[i].poststat, gid)) ||
- (!IA_EQUAL (stbuf, replies[i].poststat, prot))) {
- start = _gf_true;
- continue;
- }
+ if (gf_uuid_compare(stbuf.ia_gfid, replies[i].poststat.ia_gfid)) {
+ start = _gf_false;
+ break;
+ }
+ if (!IA_EQUAL(stbuf, replies[i].poststat, type)) {
+ start = _gf_false;
+ break;
+ }
- /*Check if xattrs need heal*/
- if (!afr_xattrs_are_equal (replies[first].xdata,
- replies[i].xdata))
- start = _gf_true;
+ /*Check if iattrs need heal*/
+ if ((!IA_EQUAL(stbuf, replies[i].poststat, uid)) ||
+ (!IA_EQUAL(stbuf, replies[i].poststat, gid)) ||
+ (!IA_EQUAL(stbuf, replies[i].poststat, prot))) {
+ start = _gf_true;
+ continue;
}
- return start;
+ /*Check if xattrs need heal*/
+ if (!afr_xattrs_are_equal(replies[first].xdata, replies[i].xdata))
+ start = _gf_true;
+ }
+
+ return start;
}
int
-afr_lookup_metadata_heal_check (call_frame_t *frame, xlator_t *this)
+afr_lookup_metadata_heal_check(call_frame_t *frame, xlator_t *this)
{
- call_frame_t *heal = NULL;
- int ret = 0;
+ call_frame_t *heal = NULL;
+ afr_local_t *local = NULL;
+ int ret = 0;
- if (!afr_can_start_metadata_self_heal (frame, this))
- goto out;
+ local = frame->local;
+ if (!afr_can_start_metadata_self_heal(frame, this))
+ goto out;
- heal = copy_frame (frame);
- if (heal)
- heal->root->pid = GF_CLIENT_PID_SELF_HEALD;
- ret = synctask_new (this->ctx->env, afr_lookup_sh_metadata_wrap,
- afr_refresh_selfheal_done, heal, frame);
- if(ret)
- goto out;
- return ret;
+ heal = afr_frame_create(this, &ret);
+ if (!heal) {
+ ret = -ret;
+ goto out;
+ }
+
+ ret = synctask_new(this->ctx->env, afr_lookup_sh_metadata_wrap,
+ afr_refresh_selfheal_done, heal, frame);
+ if (ret)
+ goto out;
+ return ret;
out:
- afr_lookup_done (frame, this);
- return ret;
+ if (loc_is_nameless(&local->loc))
+ afr_discover_done(frame, this);
+ else
+ afr_lookup_done(frame, this);
+ if (heal)
+ AFR_STACK_DESTROY(heal);
+ return ret;
}
int
-afr_lookup_selfheal_wrap (void *opaque)
+afr_lookup_selfheal_wrap(void *opaque)
{
- int ret = 0;
- call_frame_t *frame = opaque;
- afr_local_t *local = NULL;
- xlator_t *this = NULL;
- inode_t *inode = NULL;
+ int ret = 0;
+ call_frame_t *frame = opaque;
+ afr_local_t *local = NULL;
+ xlator_t *this = NULL;
+ inode_t *inode = NULL;
+ uuid_t pargfid = {
+ 0,
+ };
- local = frame->local;
- this = frame->this;
+ local = frame->local;
+ this = frame->this;
+ loc_pargfid(&local->loc, pargfid);
- ret = afr_selfheal_name (frame->this, local->loc.pargfid,
- local->loc.name, &local->cont.lookup.gfid_req);
- if (ret == -EIO)
- goto unwind;
+ ret = afr_selfheal_name(frame->this, pargfid, local->loc.name,
+ &local->cont.lookup.gfid_req, local->xattr_req);
+ if (ret == -EIO)
+ goto unwind;
- afr_local_replies_wipe (local, this->private);
+ afr_local_replies_wipe(local, this->private);
- inode = afr_selfheal_unlocked_lookup_on (frame, local->loc.parent,
- local->loc.name, local->replies,
- local->child_up, NULL);
- if (inode)
- inode_unref (inode);
+ inode = afr_selfheal_unlocked_lookup_on(frame, local->loc.parent,
+ local->loc.name, local->replies,
+ local->child_up, local->xattr_req);
+ if (inode)
+ inode_unref(inode);
- afr_lookup_metadata_heal_check(frame, this);
- return 0;
+ afr_lookup_metadata_heal_check(frame, this);
+ return 0;
unwind:
- AFR_STACK_UNWIND (lookup, frame, -1, EIO, NULL, NULL, NULL, NULL);
- return 0;
+ AFR_STACK_UNWIND(lookup, frame, -1, EIO, NULL, NULL, NULL, NULL);
+ return 0;
}
-
int
-afr_lookup_entry_heal (call_frame_t *frame, xlator_t *this)
-{
- afr_local_t *local = NULL;
- afr_private_t *priv = NULL;
- call_frame_t *heal = NULL;
- int i = 0, first = -1;
- gf_boolean_t need_heal = _gf_false;
- struct afr_reply *replies = NULL;
- int ret = 0;
-
- local = frame->local;
- replies = local->replies;
- priv = this->private;
-
- for (i = 0; i < priv->child_count; i++) {
- if (!replies[i].valid)
- continue;
-
- if ((replies[i].op_ret == -1) &&
- (replies[i].op_errno == ENODATA))
- need_heal = _gf_true;
-
- if (first == -1) {
- first = i;
- continue;
- }
-
- if (replies[i].op_ret != replies[first].op_ret) {
- need_heal = _gf_true;
- break;
- }
-
- if (gf_uuid_compare (replies[i].poststat.ia_gfid,
- replies[first].poststat.ia_gfid)) {
- need_heal = _gf_true;
- break;
- }
- }
-
- if (need_heal) {
- heal = copy_frame (frame);
- if (heal)
- heal->root->pid = GF_CLIENT_PID_SELF_HEALD;
- ret = synctask_new (this->ctx->env, afr_lookup_selfheal_wrap,
- afr_refresh_selfheal_done, heal, frame);
- if (ret)
- goto metadata_heal;
- return ret;
- }
-metadata_heal:
- ret = afr_lookup_metadata_heal_check (frame, this);
-
- return ret;
-}
+afr_lookup_entry_heal(call_frame_t *frame, xlator_t *this)
+{
+ afr_local_t *local = NULL;
+ afr_private_t *priv = NULL;
+ call_frame_t *heal = NULL;
+ int i = 0, first = -1;
+ gf_boolean_t name_state_mismatch = _gf_false;
+ struct afr_reply *replies = NULL;
+ int ret = 0;
+ unsigned char *par_readables = NULL;
+ unsigned char *success = NULL;
+ int32_t op_errno = 0;
+ uuid_t gfid = {0};
+
+ local = frame->local;
+ replies = local->replies;
+ priv = this->private;
+ par_readables = alloca0(priv->child_count);
+ success = alloca0(priv->child_count);
+
+ ret = afr_inode_read_subvol_get(local->loc.parent, this, par_readables,
+ NULL, NULL);
+ if (ret < 0 || AFR_COUNT(par_readables, priv->child_count) == 0) {
+ /* In this case set par_readables to all 1 so that name_heal
+ * need checks at the end of this function will flag missing
+ * entry when name state mismatches*/
+ memset(par_readables, 1, priv->child_count);
+ }
+
+ for (i = 0; i < priv->child_count; i++) {
+ if (!replies[i].valid)
+ continue;
+
+ if (replies[i].op_ret == 0) {
+ if (gf_uuid_is_null(gfid)) {
+ gf_uuid_copy(gfid, replies[i].poststat.ia_gfid);
+ }
+ success[i] = 1;
+ } else {
+ if ((replies[i].op_errno != ENOTCONN) &&
+ (replies[i].op_errno != ENOENT) &&
+ (replies[i].op_errno != ESTALE)) {
+ op_errno = replies[i].op_errno;
+ }
+ }
+ /*gfid is missing, needs heal*/
+ if ((replies[i].op_ret == -1) && (replies[i].op_errno == ENODATA)) {
+ goto name_heal;
+ }
-int
-afr_lookup_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
- int op_ret, int op_errno, inode_t *inode, struct iatt *buf,
- dict_t *xdata, struct iatt *postparent)
-{
- afr_local_t * local = NULL;
- int call_count = -1;
- int child_index = -1;
- GF_UNUSED int ret = 0;
- int8_t need_heal = 1;
-
- child_index = (long) cookie;
-
- local = frame->local;
-
- local->replies[child_index].valid = 1;
- local->replies[child_index].op_ret = op_ret;
- local->replies[child_index].op_errno = op_errno;
- /*
- * On revalidate lookup if the gfid-changed, afr should unwind the fop
- * with ESTALE so that a fresh lookup will be sent by the top xlator.
- * So remember it.
- */
- if (xdata && dict_get (xdata, "gfid-changed"))
- local->cont.lookup.needs_fresh_lookup = _gf_true;
+ if (first == -1) {
+ first = i;
+ continue;
+ }
- if (xdata) {
- ret = dict_get_int8 (xdata, "link-count", &need_heal);
- local->replies[child_index].need_heal = need_heal;
- } else {
- local->replies[child_index].need_heal = need_heal;
+ if (replies[i].op_ret != replies[first].op_ret) {
+ name_state_mismatch = _gf_true;
}
- if (op_ret != -1) {
- local->replies[child_index].poststat = *buf;
- local->replies[child_index].postparent = *postparent;
- if (xdata)
- local->replies[child_index].xdata = dict_ref (xdata);
- }
- call_count = afr_frame_return (frame);
- if (call_count == 0) {
- afr_set_need_heal (this, local);
- afr_lookup_entry_heal (frame, this);
+ if (replies[i].op_ret == 0) {
+ /* Rename after this lookup may succeed if we don't do
+ * a name-heal and the destination may not have pending xattrs
+ * to indicate which name is good and which is bad so always do
+ * this heal*/
+ if (gf_uuid_compare(replies[i].poststat.ia_gfid, gfid)) {
+ goto name_heal;
+ }
}
+ }
- return 0;
+ if (name_state_mismatch) {
+ if (!priv->quorum_count)
+ goto name_heal;
+ if (!afr_has_quorum(success, this, NULL))
+ goto name_heal;
+ if (op_errno)
+ goto name_heal;
+ for (i = 0; i < priv->child_count; i++) {
+ if (!replies[i].valid)
+ continue;
+ if (par_readables[i] && replies[i].op_ret < 0 &&
+ replies[i].op_errno != ENOTCONN) {
+ goto name_heal;
+ }
+ }
+ }
+
+ goto metadata_heal;
+
+name_heal:
+ heal = afr_frame_create(this, NULL);
+ if (!heal)
+ goto metadata_heal;
+
+ ret = synctask_new(this->ctx->env, afr_lookup_selfheal_wrap,
+ afr_refresh_selfheal_done, heal, frame);
+ if (ret) {
+ AFR_STACK_DESTROY(heal);
+ goto metadata_heal;
+ }
+ return ret;
+
+metadata_heal:
+ ret = afr_lookup_metadata_heal_check(frame, this);
+
+ return ret;
}
+int
+afr_lookup_cbk(call_frame_t *frame, void *cookie, xlator_t *this, int op_ret,
+ int op_errno, inode_t *inode, struct iatt *buf, dict_t *xdata,
+ struct iatt *postparent)
+{
+ afr_local_t *local = NULL;
+ int call_count = -1;
+ int child_index = -1;
+ GF_UNUSED int ret = 0;
+ int8_t need_heal = 1;
+
+ child_index = (long)cookie;
+
+ local = frame->local;
+
+ local->replies[child_index].valid = 1;
+ local->replies[child_index].op_ret = op_ret;
+ local->replies[child_index].op_errno = op_errno;
+ /*
+ * On revalidate lookup if the gfid-changed, afr should unwind the fop
+ * with ESTALE so that a fresh lookup will be sent by the top xlator.
+ * So remember it.
+ */
+ if (xdata && dict_get_sizen(xdata, "gfid-changed"))
+ local->cont.lookup.needs_fresh_lookup = _gf_true;
+
+ if (xdata) {
+ ret = dict_get_int8(xdata, "link-count", &need_heal);
+ local->replies[child_index].need_heal = need_heal;
+ } else {
+ local->replies[child_index].need_heal = need_heal;
+ }
+ if (op_ret != -1) {
+ local->replies[child_index].poststat = *buf;
+ local->replies[child_index].postparent = *postparent;
+ if (xdata)
+ local->replies[child_index].xdata = dict_ref(xdata);
+ }
+ call_count = afr_frame_return(frame);
+ if (call_count == 0) {
+ afr_set_need_heal(this, local);
+ afr_lookup_entry_heal(frame, this);
+ }
+
+ return 0;
+}
static void
-afr_discover_done (call_frame_t *frame, xlator_t *this)
+afr_discover_unwind(call_frame_t *frame, xlator_t *this)
{
- afr_private_t *priv = NULL;
- afr_local_t *local = NULL;
- int i = -1;
- int op_errno = 0;
- int spb_choice = -1;
- int read_subvol = -1;
+ afr_private_t *priv = NULL;
+ afr_local_t *local = NULL;
+ int read_subvol = -1;
+ int ret = 0;
+ unsigned char *data_readable = NULL;
+ unsigned char *success_replies = NULL;
- priv = this->private;
- local = frame->local;
+ priv = this->private;
+ local = frame->local;
+ data_readable = alloca0(priv->child_count);
+ success_replies = alloca0(priv->child_count);
- afr_inode_split_brain_choice_get (local->inode, this,
- &spb_choice);
+ afr_fill_success_replies(local, priv, success_replies);
+ if (AFR_COUNT(success_replies, priv->child_count) > 0)
+ local->op_ret = 0;
- for (i = 0; i < priv->child_count; i++) {
- if (!local->replies[i].valid)
- continue;
- if (local->replies[i].op_ret == 0)
- local->op_ret = 0;
- }
+ if (local->op_ret < 0) {
+ local->op_ret = -1;
+ local->op_errno = afr_final_errno(frame->local, this->private);
+ goto error;
+ }
- op_errno = afr_final_errno (frame->local, this->private);
+ if (!afr_has_quorum(success_replies, this, frame))
+ goto unwind;
- if (local->op_ret < 0) {
- local->op_errno = op_errno;
- local->op_ret = -1;
- goto unwind;
- }
+ ret = afr_replies_interpret(frame, this, local->inode, NULL);
+ if (ret) {
+ afr_inode_need_refresh_set(local->inode, this);
+ }
- afr_replies_interpret (frame, this, local->inode);
+ read_subvol = afr_read_subvol_decide(local->inode, this, NULL,
+ data_readable);
- read_subvol = afr_read_subvol_decide (local->inode, this, NULL);
- if (read_subvol == -1) {
- gf_msg (this->name, GF_LOG_WARNING, 0,
- AFR_MSG_READ_SUBVOL_ERROR, "no read subvols for %s",
- local->loc.path);
+unwind:
+ afr_attempt_readsubvol_set(frame, this, success_replies, data_readable,
+ &read_subvol);
+ if (read_subvol == -1)
+ goto error;
- if (spb_choice >= 0) {
- read_subvol = spb_choice;
- } else {
- read_subvol = afr_first_up_child (frame, this);
- }
- }
+ if (AFR_IS_ARBITER_BRICK(priv, read_subvol) && local->op_ret == 0) {
+ local->op_ret = -1;
+ local->op_errno = ENOTCONN;
+ gf_msg_debug(this->name, 0,
+ "Arbiter cannot be a read subvol "
+ "for %s",
+ local->loc.path);
+ }
-unwind:
- if (read_subvol == -1) {
- if (spb_choice >= 0)
- read_subvol = spb_choice;
- else
- read_subvol = afr_first_up_child (frame, this);
- }
+ AFR_STACK_UNWIND(lookup, frame, local->op_ret, local->op_errno,
+ local->inode, &local->replies[read_subvol].poststat,
+ local->replies[read_subvol].xdata,
+ &local->replies[read_subvol].postparent);
+ return;
- AFR_STACK_UNWIND (lookup, frame, local->op_ret, local->op_errno,
- local->inode, &local->replies[read_subvol].poststat,
- local->replies[read_subvol].xdata,
- &local->replies[read_subvol].postparent);
+error:
+ AFR_STACK_UNWIND(lookup, frame, local->op_ret, local->op_errno, NULL, NULL,
+ NULL, NULL);
}
+static int
+afr_ta_id_file_check(void *opaque)
+{
+ afr_private_t *priv = NULL;
+ xlator_t *this = NULL;
+ loc_t loc = {
+ 0,
+ };
+ struct iatt stbuf = {
+ 0,
+ };
+ dict_t *dict = NULL;
+ uuid_t gfid = {
+ 0,
+ };
+ fd_t *fd = NULL;
+ int ret = 0;
+
+ this = opaque;
+ priv = this->private;
+
+ ret = afr_fill_ta_loc(this, &loc, _gf_false);
+ if (ret) {
+ gf_msg(this->name, GF_LOG_ERROR, -ret, AFR_MSG_THIN_ARB,
+ "Failed to populate thin-arbiter loc for: %s.", loc.name);
+ goto out;
+ }
+
+ ret = syncop_lookup(priv->children[THIN_ARBITER_BRICK_INDEX], &loc, &stbuf,
+ 0, 0, 0);
+ if (ret == 0) {
+ goto out;
+ } else if (ret == -ENOENT) {
+ fd = fd_create(loc.inode, getpid());
+ if (!fd)
+ goto out;
+ dict = dict_new();
+ if (!dict)
+ goto out;
+ gf_uuid_generate(gfid);
+ ret = dict_set_gfuuid(dict, "gfid-req", gfid, true);
+ ret = syncop_create(priv->children[THIN_ARBITER_BRICK_INDEX], &loc,
+ O_RDWR, 0664, fd, &stbuf, dict, NULL);
+ }
-int
-afr_discover_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
- int op_ret, int op_errno, inode_t *inode, struct iatt *buf,
- dict_t *xdata, struct iatt *postparent)
-{
- afr_local_t * local = NULL;
- int call_count = -1;
- int child_index = -1;
- GF_UNUSED int ret = 0;
- int8_t need_heal = 1;
-
- child_index = (long) cookie;
-
- local = frame->local;
-
- local->replies[child_index].valid = 1;
- local->replies[child_index].op_ret = op_ret;
- local->replies[child_index].op_errno = op_errno;
- if (op_ret != -1) {
- local->replies[child_index].poststat = *buf;
- local->replies[child_index].postparent = *postparent;
- if (xdata)
- local->replies[child_index].xdata = dict_ref (xdata);
- }
-
- if (local->do_discovery && (op_ret == 0))
- afr_attempt_local_discovery (this, child_index);
-
- if (xdata) {
- ret = dict_get_int8 (xdata, "link-count", &need_heal);
- local->replies[child_index].need_heal = need_heal;
- } else {
- local->replies[child_index].need_heal = need_heal;
- }
-
- call_count = afr_frame_return (frame);
- if (call_count == 0) {
- afr_set_need_heal (this, local);
- afr_discover_done (frame, this);
- }
+out:
+ if (ret == 0) {
+ gf_uuid_copy(priv->ta_gfid, stbuf.ia_gfid);
+ } else {
+ gf_msg(this->name, GF_LOG_ERROR, -ret, AFR_MSG_THIN_ARB,
+ "Failed to lookup/create thin-arbiter id file.");
+ }
+ if (dict)
+ dict_unref(dict);
+ if (fd)
+ fd_unref(fd);
+ loc_wipe(&loc);
- return 0;
+ return 0;
}
+static int
+afr_ta_id_file_check_cbk(int ret, call_frame_t *ta_frame, void *opaque)
+{
+ return 0;
+}
-int
-afr_discover_do (call_frame_t *frame, xlator_t *this, int err)
+static void
+afr_discover_done(call_frame_t *frame, xlator_t *this)
{
- int ret = 0;
- int i = 0;
- afr_local_t *local = NULL;
- afr_private_t *priv = NULL;
- int call_count = 0;
+ int ret = 0;
+ afr_private_t *priv = NULL;
- local = frame->local;
- priv = this->private;
+ priv = this->private;
+ if (!priv->thin_arbiter_count)
+ goto unwind;
+ if (!gf_uuid_is_null(priv->ta_gfid))
+ goto unwind;
- if (err) {
- local->op_errno = -err;
- ret = -1;
- goto out;
- }
+ ret = synctask_new(this->ctx->env, afr_ta_id_file_check,
+ afr_ta_id_file_check_cbk, NULL, this);
+ if (ret)
+ goto unwind;
+unwind:
+ afr_discover_unwind(frame, this);
+}
- call_count = local->call_count = AFR_COUNT (local->child_up,
- priv->child_count);
+int
+afr_discover_cbk(call_frame_t *frame, void *cookie, xlator_t *this, int op_ret,
+ int op_errno, inode_t *inode, struct iatt *buf, dict_t *xdata,
+ struct iatt *postparent)
+{
+ afr_local_t *local = NULL;
+ int call_count = -1;
+ int child_index = -1;
+ GF_UNUSED int ret = 0;
+ int8_t need_heal = 1;
+
+ child_index = (long)cookie;
+
+ local = frame->local;
+
+ local->replies[child_index].valid = 1;
+ local->replies[child_index].op_ret = op_ret;
+ local->replies[child_index].op_errno = op_errno;
+ if (op_ret != -1) {
+ local->replies[child_index].poststat = *buf;
+ local->replies[child_index].postparent = *postparent;
+ if (xdata)
+ local->replies[child_index].xdata = dict_ref(xdata);
+ }
+
+ if (local->do_discovery && (op_ret == 0))
+ afr_attempt_local_discovery(this, child_index);
+
+ if (xdata) {
+ ret = dict_get_int8(xdata, "link-count", &need_heal);
+ local->replies[child_index].need_heal = need_heal;
+ } else {
+ local->replies[child_index].need_heal = need_heal;
+ }
+
+ call_count = afr_frame_return(frame);
+ if (call_count == 0) {
+ afr_set_need_heal(this, local);
+ afr_lookup_metadata_heal_check(frame, this);
+ }
- ret = afr_lookup_xattr_req_prepare (local, this, local->xattr_req,
- &local->loc);
- if (ret) {
- local->op_errno = -ret;
- ret = -1;
- goto out;
- }
+ return 0;
+}
- for (i = 0; i < priv->child_count; i++) {
- if (local->child_up[i]) {
- STACK_WIND_COOKIE (frame, afr_discover_cbk,
- (void *) (long) i,
- priv->children[i],
- priv->children[i]->fops->lookup,
- &local->loc, local->xattr_req);
- if (!--call_count)
- break;
- }
+int
+afr_discover_do(call_frame_t *frame, xlator_t *this, int err)
+{
+ int ret = 0;
+ int i = 0;
+ afr_local_t *local = NULL;
+ afr_private_t *priv = NULL;
+ int call_count = 0;
+
+ local = frame->local;
+ priv = this->private;
+
+ if (err) {
+ local->op_errno = err;
+ goto out;
+ }
+
+ call_count = local->call_count = AFR_COUNT(local->child_up,
+ priv->child_count);
+
+ ret = afr_lookup_xattr_req_prepare(local, this, local->xattr_req,
+ &local->loc);
+ if (ret) {
+ local->op_errno = -ret;
+ goto out;
+ }
+
+ for (i = 0; i < priv->child_count; i++) {
+ if (local->child_up[i]) {
+ STACK_WIND_COOKIE(
+ frame, afr_discover_cbk, (void *)(long)i, priv->children[i],
+ priv->children[i]->fops->lookup, &local->loc, local->xattr_req);
+ if (!--call_count)
+ break;
}
+ }
- return 0;
+ return 0;
out:
- AFR_STACK_UNWIND (lookup, frame, -1, local->op_errno, 0, 0, 0, 0);
- return 0;
+ AFR_STACK_UNWIND(lookup, frame, -1, local->op_errno, 0, 0, 0, 0);
+ return 0;
}
-
int
-afr_discover (call_frame_t *frame, xlator_t *this, loc_t *loc, dict_t *xattr_req)
+afr_discover(call_frame_t *frame, xlator_t *this, loc_t *loc, dict_t *xattr_req)
{
- int op_errno = ENOMEM;
- afr_private_t *priv = NULL;
- afr_local_t *local = NULL;
- int event = 0;
+ int op_errno = ENOMEM;
+ afr_private_t *priv = NULL;
+ afr_local_t *local = NULL;
+ int event = 0;
- priv = this->private;
+ priv = this->private;
- local = AFR_FRAME_INIT (frame, op_errno);
- if (!local)
- goto out;
+ local = AFR_FRAME_INIT(frame, op_errno);
+ if (!local)
+ goto out;
- if (!local->call_count) {
- op_errno = ENOTCONN;
- goto out;
- }
+ if (!local->call_count) {
+ op_errno = ENOTCONN;
+ goto out;
+ }
- if (__is_root_gfid (loc->inode->gfid)) {
- if (!this->itable)
- this->itable = loc->inode->table;
- if (!priv->root_inode)
- priv->root_inode = inode_ref (loc->inode);
+ if (__is_root_gfid(loc->inode->gfid)) {
+ if (!priv->root_inode)
+ priv->root_inode = inode_ref(loc->inode);
- if (priv->choose_local && !priv->did_discovery) {
- /* Logic to detect which subvolumes of AFR are
- local, in order to prefer them for reads
- */
- local->do_discovery = _gf_true;
- priv->did_discovery = _gf_true;
- }
- }
+ if (priv->choose_local && !priv->did_discovery) {
+ /* Logic to detect which subvolumes of AFR are
+ local, in order to prefer them for reads
+ */
+ local->do_discovery = _gf_true;
+ priv->did_discovery = _gf_true;
+ }
+ }
- local->op = GF_FOP_LOOKUP;
+ local->op = GF_FOP_LOOKUP;
- loc_copy (&local->loc, loc);
+ loc_copy(&local->loc, loc);
- local->inode = inode_ref (loc->inode);
+ local->inode = inode_ref(loc->inode);
- if (xattr_req)
- /* If xattr_req was null, afr_lookup_xattr_req_prepare() will
- allocate one for us */
- local->xattr_req = dict_ref (xattr_req);
+ if (xattr_req) {
+ /* If xattr_req was null, afr_lookup_xattr_req_prepare() will
+ allocate one for us */
+ local->xattr_req = dict_copy_with_ref(xattr_req, NULL);
+ if (!local->xattr_req) {
+ op_errno = ENOMEM;
+ goto out;
+ }
+ }
- if (gf_uuid_is_null (loc->inode->gfid)) {
- afr_discover_do (frame, this, 0);
- return 0;
- }
+ if (gf_uuid_is_null(loc->inode->gfid)) {
+ afr_discover_do(frame, this, 0);
+ return 0;
+ }
- afr_read_subvol_get (loc->inode, this, NULL, &event,
- AFR_DATA_TRANSACTION, NULL);
+ afr_read_subvol_get(loc->inode, this, NULL, NULL, &event,
+ AFR_DATA_TRANSACTION, NULL);
- if (event != local->event_generation)
- afr_inode_refresh (frame, this, loc->inode, afr_discover_do);
- else
- afr_discover_do (frame, this, 0);
+ afr_discover_do(frame, this, 0);
- return 0;
+ return 0;
out:
- AFR_STACK_UNWIND (lookup, frame, -1, op_errno, NULL, NULL, NULL, NULL);
- return 0;
+ AFR_STACK_UNWIND(lookup, frame, -1, op_errno, NULL, NULL, NULL, NULL);
+ return 0;
}
-
int
-afr_lookup_do (call_frame_t *frame, xlator_t *this, int err)
-{
- int ret = 0;
- int i = 0;
- afr_local_t *local = NULL;
- afr_private_t *priv = NULL;
- int call_count = 0;
-
- local = frame->local;
- priv = this->private;
-
- if (err < 0) {
- local->op_errno = -err;
- ret = -1;
- goto out;
- }
-
- call_count = local->call_count = AFR_COUNT (local->child_up,
- priv->child_count);
-
- ret = afr_lookup_xattr_req_prepare (local, this, local->xattr_req,
- &local->loc);
- if (ret) {
- local->op_errno = -ret;
- ret = -1;
- goto out;
- }
-
- for (i = 0; i < priv->child_count; i++) {
- if (local->child_up[i]) {
- STACK_WIND_COOKIE (frame, afr_lookup_cbk,
- (void *) (long) i,
- priv->children[i],
- priv->children[i]->fops->lookup,
- &local->loc, local->xattr_req);
- if (!--call_count)
- break;
- }
+afr_lookup_do(call_frame_t *frame, xlator_t *this, int err)
+{
+ int ret = 0;
+ int i = 0;
+ afr_local_t *local = NULL;
+ afr_private_t *priv = NULL;
+ int call_count = 0;
+
+ local = frame->local;
+ priv = this->private;
+
+ if (err < 0) {
+ local->op_errno = err;
+ goto out;
+ }
+
+ call_count = local->call_count = AFR_COUNT(local->child_up,
+ priv->child_count);
+
+ ret = afr_lookup_xattr_req_prepare(local, this, local->xattr_req,
+ &local->loc);
+ if (ret) {
+ local->op_errno = -ret;
+ goto out;
+ }
+
+ for (i = 0; i < priv->child_count; i++) {
+ if (local->child_up[i]) {
+ STACK_WIND_COOKIE(
+ frame, afr_lookup_cbk, (void *)(long)i, priv->children[i],
+ priv->children[i]->fops->lookup, &local->loc, local->xattr_req);
+ if (!--call_count)
+ break;
}
- return 0;
+ }
+ return 0;
out:
- AFR_STACK_UNWIND (lookup, frame, -1, local->op_errno, 0, 0, 0, 0);
- return 0;
+ AFR_STACK_UNWIND(lookup, frame, -1, local->op_errno, 0, 0, 0, 0);
+ return 0;
}
/*
@@ -2485,2602 +4000,3879 @@ out:
*/
int
-afr_lookup (call_frame_t *frame, xlator_t *this, loc_t *loc, dict_t *xattr_req)
-{
- afr_local_t *local = NULL;
- int32_t op_errno = 0;
- int event = 0;
- void *gfid_req = NULL;
- int ret = 0;
-
- if (!loc->parent && gf_uuid_is_null (loc->pargfid)) {
- if (xattr_req)
- dict_del (xattr_req, "gfid-req");
- afr_discover (frame, this, loc, xattr_req);
- return 0;
- }
-
- if (__is_root_gfid (loc->parent->gfid)) {
- if (!strcmp (loc->name, GF_REPLICATE_TRASH_DIR)) {
- op_errno = EPERM;
- goto out;
- }
- }
-
- local = AFR_FRAME_INIT (frame, op_errno);
- if (!local)
- goto out;
-
- if (!local->call_count) {
- op_errno = ENOTCONN;
- goto out;
- }
-
- local->op = GF_FOP_LOOKUP;
-
- loc_copy (&local->loc, loc);
-
- local->inode = inode_ref (loc->inode);
+afr_lookup(call_frame_t *frame, xlator_t *this, loc_t *loc, dict_t *xattr_req)
+{
+ afr_local_t *local = NULL;
+ int32_t op_errno = 0;
+ int event = 0;
+ int ret = 0;
- if (xattr_req) {
- /* If xattr_req was null, afr_lookup_xattr_req_prepare() will
- allocate one for us */
- ret = dict_get_ptr (xattr_req, "gfid-req", &gfid_req);
- if (ret == 0) {
- gf_uuid_copy (local->cont.lookup.gfid_req, gfid_req);
- dict_del (xattr_req, "gfid-req");
- }
- local->xattr_req = dict_ref (xattr_req);
- }
+ if (loc_is_nameless(loc)) {
+ if (xattr_req)
+ dict_del_sizen(xattr_req, "gfid-req");
+ afr_discover(frame, this, loc, xattr_req);
+ return 0;
+ }
- afr_read_subvol_get (loc->parent, this, NULL, &event,
- AFR_DATA_TRANSACTION, NULL);
+ if (afr_is_private_directory(this->private, loc->parent->gfid, loc->name,
+ frame->root->pid)) {
+ op_errno = EPERM;
+ goto out;
+ }
- if (event != local->event_generation)
- afr_inode_refresh (frame, this, loc->parent, afr_lookup_do);
- else
- afr_lookup_do (frame, this, 0);
+ local = AFR_FRAME_INIT(frame, op_errno);
+ if (!local)
+ goto out;
- return 0;
-out:
- AFR_STACK_UNWIND (lookup, frame, -1, op_errno, NULL, NULL, NULL, NULL);
+ if (!local->call_count) {
+ op_errno = ENOTCONN;
+ goto out;
+ }
- return 0;
-}
+ local->op = GF_FOP_LOOKUP;
+ loc_copy(&local->loc, loc);
-/* {{{ open */
+ local->inode = inode_ref(loc->inode);
-afr_fd_ctx_t *
-__afr_fd_ctx_get (fd_t *fd, xlator_t *this)
-{
- uint64_t ctx = 0;
- int ret = 0;
- afr_fd_ctx_t *fd_ctx = NULL;
+ if (xattr_req) {
+ /* If xattr_req was null, afr_lookup_xattr_req_prepare() will
+ allocate one for us */
+ local->xattr_req = dict_copy_with_ref(xattr_req, NULL);
+ if (!local->xattr_req) {
+ op_errno = ENOMEM;
+ goto out;
+ }
+ ret = dict_get_gfuuid(local->xattr_req, "gfid-req",
+ &local->cont.lookup.gfid_req);
+ if (ret == 0) {
+ dict_del_sizen(local->xattr_req, "gfid-req");
+ }
+ }
- ret = __fd_ctx_get (fd, this, &ctx);
+ afr_read_subvol_get(loc->parent, this, NULL, NULL, &event,
+ AFR_DATA_TRANSACTION, NULL);
- if (ret < 0) {
- ret = __afr_fd_ctx_set (this, fd);
- if (ret < 0)
- goto out;
+ afr_lookup_do(frame, this, 0);
- ret = __fd_ctx_get (fd, this, &ctx);
- if (ret < 0)
- goto out;
- }
-
- fd_ctx = (afr_fd_ctx_t *)(long) ctx;
+ return 0;
out:
- return fd_ctx;
-}
+ AFR_STACK_UNWIND(lookup, frame, -1, op_errno, NULL, NULL, NULL, NULL);
+ return 0;
+}
-afr_fd_ctx_t *
-afr_fd_ctx_get (fd_t *fd, xlator_t *this)
+void
+_afr_cleanup_fd_ctx(xlator_t *this, afr_fd_ctx_t *fd_ctx)
{
- afr_fd_ctx_t *fd_ctx = NULL;
+ afr_private_t *priv = this->private;
- LOCK(&fd->lock);
+ if (fd_ctx->lk_heal_info) {
+ LOCK(&priv->lock);
{
- fd_ctx = __afr_fd_ctx_get (fd, this);
+ list_del(&fd_ctx->lk_heal_info->pos);
}
- UNLOCK(&fd->lock);
-
- return fd_ctx;
+ afr_lk_heal_info_cleanup(fd_ctx->lk_heal_info);
+ fd_ctx->lk_heal_info = NULL;
+ }
+ GF_FREE(fd_ctx->opened_on);
+ GF_FREE(fd_ctx);
+ return;
}
-
int
-__afr_fd_ctx_set (xlator_t *this, fd_t *fd)
+afr_cleanup_fd_ctx(xlator_t *this, fd_t *fd)
{
- afr_private_t * priv = NULL;
- int ret = -1;
- uint64_t ctx = 0;
- afr_fd_ctx_t * fd_ctx = NULL;
- int i = 0;
-
- VALIDATE_OR_GOTO (this->private, out);
- VALIDATE_OR_GOTO (fd, out);
-
- priv = this->private;
-
- ret = __fd_ctx_get (fd, this, &ctx);
-
- if (ret == 0)
- goto out;
+ uint64_t ctx = 0;
+ afr_fd_ctx_t *fd_ctx = NULL;
+ int ret = 0;
- fd_ctx = GF_CALLOC (1, sizeof (afr_fd_ctx_t),
- gf_afr_mt_afr_fd_ctx_t);
- if (!fd_ctx) {
- ret = -ENOMEM;
- goto out;
- }
+ ret = fd_ctx_get(fd, this, &ctx);
+ if (ret < 0)
+ goto out;
- for (i = 0; i < AFR_NUM_CHANGE_LOGS; i++) {
- fd_ctx->pre_op_done[i] = GF_CALLOC (sizeof (*fd_ctx->pre_op_done[i]),
- priv->child_count,
- gf_afr_mt_int32_t);
- if (!fd_ctx->pre_op_done[i]) {
- ret = -ENOMEM;
- goto out;
- }
- }
-
- fd_ctx->opened_on = GF_CALLOC (sizeof (*fd_ctx->opened_on),
- priv->child_count,
- gf_afr_mt_int32_t);
- if (!fd_ctx->opened_on) {
- ret = -ENOMEM;
- goto out;
- }
+ fd_ctx = (afr_fd_ctx_t *)(long)ctx;
- for (i = 0; i < priv->child_count; i++) {
- if (fd_is_anonymous (fd))
- fd_ctx->opened_on[i] = AFR_FD_OPENED;
- else
- fd_ctx->opened_on[i] = AFR_FD_NOT_OPENED;
- }
+ if (fd_ctx) {
+ _afr_cleanup_fd_ctx(this, fd_ctx);
+ }
- fd_ctx->lock_piggyback = GF_CALLOC (sizeof (*fd_ctx->lock_piggyback),
- priv->child_count,
- gf_afr_mt_char);
- if (!fd_ctx->lock_piggyback) {
- ret = -ENOMEM;
- goto out;
- }
-
- fd_ctx->lock_acquired = GF_CALLOC (sizeof (*fd_ctx->lock_acquired),
- priv->child_count,
- gf_afr_mt_char);
- if (!fd_ctx->lock_acquired) {
- ret = -ENOMEM;
- goto out;
- }
-
- fd_ctx->readdir_subvol = -1;
-
- pthread_mutex_init (&fd_ctx->delay_lock, NULL);
-
- INIT_LIST_HEAD (&fd_ctx->eager_locked);
-
- ret = __fd_ctx_set (fd, this, (uint64_t)(long) fd_ctx);
- if (ret)
- gf_msg_debug (this->name, 0,
- "failed to set fd ctx (%p)", fd);
out:
- return ret;
+ return 0;
}
-
int
-afr_fd_ctx_set (xlator_t *this, fd_t *fd)
+afr_release(xlator_t *this, fd_t *fd)
{
- int ret = -1;
-
- LOCK (&fd->lock);
- {
- ret = __afr_fd_ctx_set (this, fd);
- }
- UNLOCK (&fd->lock);
+ afr_cleanup_fd_ctx(this, fd);
- return ret;
+ return 0;
}
-/* {{{ flush */
-
-int
-afr_flush_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
- int32_t op_ret, int32_t op_errno, dict_t *xdata)
+afr_fd_ctx_t *
+__afr_fd_ctx_get(fd_t *fd, xlator_t *this)
{
- afr_local_t *local = NULL;
- int call_count = -1;
+ uint64_t ctx = 0;
+ int ret = 0;
+ afr_fd_ctx_t *fd_ctx = NULL;
- local = frame->local;
+ ret = __fd_ctx_get(fd, this, &ctx);
- LOCK (&frame->lock);
- {
- if (op_ret != -1) {
- local->op_ret = op_ret;
- if (!local->xdata_rsp && xdata)
- local->xdata_rsp = dict_ref (xdata);
- } else {
- local->op_errno = op_errno;
- }
- }
- UNLOCK (&frame->lock);
-
- call_count = afr_frame_return (frame);
+ if (ret < 0) {
+ ret = __afr_fd_ctx_set(this, fd);
+ if (ret < 0)
+ goto out;
- if (call_count == 0)
- AFR_STACK_UNWIND (flush, frame, local->op_ret,
- local->op_errno, local->xdata_rsp);
+ ret = __fd_ctx_get(fd, this, &ctx);
+ if (ret < 0)
+ goto out;
+ }
- return 0;
+ fd_ctx = (afr_fd_ctx_t *)(long)ctx;
+out:
+ return fd_ctx;
}
-static int
-afr_flush_wrapper (call_frame_t *frame, xlator_t *this, fd_t *fd, dict_t *xdata)
+afr_fd_ctx_t *
+afr_fd_ctx_get(fd_t *fd, xlator_t *this)
{
- int i = 0;
- afr_local_t *local = NULL;
- afr_private_t *priv = NULL;
- int call_count = -1;
+ afr_fd_ctx_t *fd_ctx = NULL;
- priv = this->private;
- local = frame->local;
- call_count = local->call_count;
+ LOCK(&fd->lock);
+ {
+ fd_ctx = __afr_fd_ctx_get(fd, this);
+ }
+ UNLOCK(&fd->lock);
- for (i = 0; i < priv->child_count; i++) {
- if (local->child_up[i]) {
- STACK_WIND_COOKIE (frame, afr_flush_cbk,
- (void *) (long) i,
- priv->children[i],
- priv->children[i]->fops->flush,
- local->fd, xdata);
- if (!--call_count)
- break;
-
- }
- }
-
- return 0;
+ return fd_ctx;
}
int
-afr_flush (call_frame_t *frame, xlator_t *this, fd_t *fd, dict_t *xdata)
+__afr_fd_ctx_set(xlator_t *this, fd_t *fd)
{
- afr_local_t *local = NULL;
- call_stub_t *stub = NULL;
- int op_errno = ENOMEM;
+ afr_private_t *priv = NULL;
+ int ret = -1;
+ uint64_t ctx = 0;
+ afr_fd_ctx_t *fd_ctx = NULL;
+ int i = 0;
- local = AFR_FRAME_INIT (frame, op_errno);
- if (!local)
- goto out;
+ VALIDATE_OR_GOTO(this->private, out);
+ VALIDATE_OR_GOTO(fd, out);
- if (!local->call_count) {
- op_errno = ENOTCONN;
- goto out;
- }
+ priv = this->private;
- local->fd = fd_ref(fd);
+ ret = __fd_ctx_get(fd, this, &ctx);
- stub = fop_flush_stub (frame, afr_flush_wrapper, fd, xdata);
- if (!stub)
- goto out;
+ if (ret == 0)
+ goto out;
- afr_delayed_changelog_wake_resume (this, fd, stub);
+ fd_ctx = GF_CALLOC(1, sizeof(afr_fd_ctx_t), gf_afr_mt_afr_fd_ctx_t);
+ if (!fd_ctx) {
+ ret = -ENOMEM;
+ goto out;
+ }
- return 0;
+ fd_ctx->opened_on = GF_CALLOC(sizeof(*fd_ctx->opened_on), priv->child_count,
+ gf_afr_mt_int32_t);
+ if (!fd_ctx->opened_on) {
+ ret = -ENOMEM;
+ goto out;
+ }
+
+ for (i = 0; i < priv->child_count; i++) {
+ if (fd_is_anonymous(fd))
+ fd_ctx->opened_on[i] = AFR_FD_OPENED;
+ else
+ fd_ctx->opened_on[i] = AFR_FD_NOT_OPENED;
+ }
+
+ fd_ctx->readdir_subvol = -1;
+ fd_ctx->lk_heal_info = NULL;
+
+ ret = __fd_ctx_set(fd, this, (uint64_t)(long)fd_ctx);
+ if (ret)
+ gf_msg_debug(this->name, 0, "failed to set fd ctx (%p)", fd);
out:
- AFR_STACK_UNWIND (flush, frame, -1, op_errno, NULL);
- return 0;
+ if (ret && fd_ctx)
+ _afr_cleanup_fd_ctx(this, fd_ctx);
+ return ret;
}
-/* }}} */
-
+/* {{{ flush */
int
-afr_cleanup_fd_ctx (xlator_t *this, fd_t *fd)
+afr_flush_cbk(call_frame_t *frame, void *cookie, xlator_t *this, int32_t op_ret,
+ int32_t op_errno, dict_t *xdata)
{
- uint64_t ctx = 0;
- afr_fd_ctx_t *fd_ctx = NULL;
- int ret = 0;
- int i = 0;
+ afr_local_t *local = NULL;
+ int call_count = -1;
- ret = fd_ctx_get (fd, this, &ctx);
- if (ret < 0)
- goto out;
+ local = frame->local;
- fd_ctx = (afr_fd_ctx_t *)(long) ctx;
-
- if (fd_ctx) {
- //no need to take any locks
- if (!list_empty (&fd_ctx->eager_locked))
- gf_msg (this->name, GF_LOG_WARNING, 0,
- AFR_MSG_INVALID_DATA, "%s: Stale "
- "Eager-lock stubs found",
- uuid_utoa (fd->inode->gfid));
-
- for (i = 0; i < AFR_NUM_CHANGE_LOGS; i++)
- GF_FREE (fd_ctx->pre_op_done[i]);
-
- GF_FREE (fd_ctx->opened_on);
-
- GF_FREE (fd_ctx->lock_piggyback);
+ LOCK(&frame->lock);
+ {
+ if (op_ret != -1) {
+ local->op_ret = op_ret;
+ if (!local->xdata_rsp && xdata)
+ local->xdata_rsp = dict_ref(xdata);
+ } else {
+ local->op_errno = op_errno;
+ }
+ call_count = --local->call_count;
+ }
+ UNLOCK(&frame->lock);
- GF_FREE (fd_ctx->lock_acquired);
+ if (call_count == 0)
+ AFR_STACK_UNWIND(flush, frame, local->op_ret, local->op_errno,
+ local->xdata_rsp);
- pthread_mutex_destroy (&fd_ctx->delay_lock);
+ return 0;
+}
- GF_FREE (fd_ctx);
+static int
+afr_flush_wrapper(call_frame_t *frame, xlator_t *this, fd_t *fd, dict_t *xdata)
+{
+ int i = 0;
+ afr_local_t *local = NULL;
+ afr_private_t *priv = NULL;
+ int call_count = -1;
+
+ priv = this->private;
+ local = frame->local;
+ call_count = local->call_count;
+
+ for (i = 0; i < priv->child_count; i++) {
+ if (local->child_up[i]) {
+ STACK_WIND_COOKIE(frame, afr_flush_cbk, (void *)(long)i,
+ priv->children[i], priv->children[i]->fops->flush,
+ local->fd, xdata);
+ if (!--call_count)
+ break;
}
+ }
-out:
- return 0;
+ return 0;
}
-
-int
-afr_release (xlator_t *this, fd_t *fd)
+afr_local_t *
+afr_wakeup_same_fd_delayed_op(xlator_t *this, afr_lock_t *lock, fd_t *fd)
{
- afr_cleanup_fd_ctx (this, fd);
-
- return 0;
-}
+ afr_local_t *local = NULL;
+ if (lock->delay_timer) {
+ local = list_entry(lock->post_op.next, afr_local_t,
+ transaction.owner_list);
+ if (fd == local->fd) {
+ if (gf_timer_call_cancel(this->ctx, lock->delay_timer)) {
+ local = NULL;
+ } else {
+ lock->delay_timer = NULL;
+ }
+ } else {
+ local = NULL;
+ }
+ }
-/* {{{ fsync */
+ return local;
+}
-int
-afr_fsync_unwind_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
- int32_t op_ret, int32_t op_errno, struct iatt *prebuf,
- struct iatt *postbuf, dict_t *xdata)
-{
- AFR_STACK_UNWIND (fsync, frame, op_ret, op_errno, prebuf, postbuf,
- xdata);
- return 0;
+void
+afr_delayed_changelog_wake_resume(xlator_t *this, inode_t *inode,
+ call_stub_t *stub)
+{
+ afr_inode_ctx_t *ctx = NULL;
+ afr_lock_t *lock = NULL;
+ afr_local_t *metadata_local = NULL;
+ afr_local_t *data_local = NULL;
+ LOCK(&inode->lock);
+ {
+ (void)__afr_inode_ctx_get(this, inode, &ctx);
+ lock = &ctx->lock[AFR_DATA_TRANSACTION];
+ data_local = afr_wakeup_same_fd_delayed_op(this, lock, stub->args.fd);
+ lock = &ctx->lock[AFR_METADATA_TRANSACTION];
+ metadata_local = afr_wakeup_same_fd_delayed_op(this, lock,
+ stub->args.fd);
+ }
+ UNLOCK(&inode->lock);
+
+ if (data_local) {
+ data_local->transaction.resume_stub = stub;
+ } else if (metadata_local) {
+ metadata_local->transaction.resume_stub = stub;
+ } else {
+ call_resume(stub);
+ }
+ if (data_local) {
+ afr_delayed_changelog_wake_up_cbk(data_local);
+ }
+ if (metadata_local) {
+ afr_delayed_changelog_wake_up_cbk(metadata_local);
+ }
}
int
-afr_fsync_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
- int32_t op_ret, int32_t op_errno, struct iatt *prebuf,
- struct iatt *postbuf, dict_t *xdata)
+afr_flush(call_frame_t *frame, xlator_t *this, fd_t *fd, dict_t *xdata)
{
- afr_local_t *local = NULL;
- int call_count = -1;
- int child_index = (long) cookie;
- int read_subvol = 0;
- call_stub_t *stub = NULL;
+ afr_local_t *local = NULL;
+ call_stub_t *stub = NULL;
+ int op_errno = ENOMEM;
- local = frame->local;
+ AFR_ERROR_OUT_IF_FDCTX_INVALID(fd, this, op_errno, out);
+ local = AFR_FRAME_INIT(frame, op_errno);
+ if (!local)
+ goto out;
- read_subvol = afr_data_subvol_get (local->inode, this, 0, 0, NULL);
+ local->op = GF_FOP_FLUSH;
+ if (!afr_is_consistent_io_possible(local, this->private, &op_errno))
+ goto out;
- LOCK (&frame->lock);
- {
- if (op_ret == 0) {
- if (local->op_ret == -1) {
- local->op_ret = 0;
-
- local->cont.inode_wfop.prebuf = *prebuf;
- local->cont.inode_wfop.postbuf = *postbuf;
-
- if (xdata)
- local->xdata_rsp = dict_ref (xdata);
- }
-
- if (child_index == read_subvol) {
- local->cont.inode_wfop.prebuf = *prebuf;
- local->cont.inode_wfop.postbuf = *postbuf;
- if (xdata) {
- if (local->xdata_rsp)
- dict_unref (local->xdata_rsp);
- local->xdata_rsp = dict_ref (xdata);
- }
- }
- } else {
- local->op_errno = op_errno;
- }
- }
- UNLOCK (&frame->lock);
-
- call_count = afr_frame_return (frame);
-
- if (call_count == 0) {
- /* Make a stub out of the frame, and register it
- with the waking up post-op. When the call-stub resumes,
- we are guaranteed that there was no post-op pending
- (i.e changelogs were unset in the server). This is an
- essential "guarantee", that fsync() returns only after
- completely finishing EVERYTHING, including the delayed
- post-op. This guarantee is expected by FUSE graph switching
- for example.
- */
- stub = fop_fsync_cbk_stub (frame, afr_fsync_unwind_cbk,
- local->op_ret, local->op_errno,
- &local->cont.inode_wfop.prebuf,
- &local->cont.inode_wfop.postbuf,
- local->xdata_rsp);
- if (!stub) {
- AFR_STACK_UNWIND (fsync, frame, -1, ENOMEM, 0, 0, 0);
- return 0;
- }
-
- /* If no new unstable writes happened between the
- time we cleared the unstable write witness flag in afr_fsync
- and now, calling afr_delayed_changelog_wake_up() should
- wake up and skip over the fsync phase and go straight to
- afr_changelog_post_op_now()
- */
- afr_delayed_changelog_wake_resume (this, local->fd, stub);
- }
+ local->fd = fd_ref(fd);
- return 0;
-}
+ stub = fop_flush_stub(frame, afr_flush_wrapper, fd, xdata);
+ if (!stub)
+ goto out;
+ afr_delayed_changelog_wake_resume(this, fd->inode, stub);
+
+ return 0;
+out:
+ AFR_STACK_UNWIND(flush, frame, -1, op_errno, NULL);
+ return 0;
+}
int
-afr_fsync (call_frame_t *frame, xlator_t *this, fd_t *fd, int32_t datasync,
- dict_t *xdata)
+afr_fsyncdir_cbk(call_frame_t *frame, void *cookie, xlator_t *this,
+ int32_t op_ret, int32_t op_errno, dict_t *xdata)
{
- afr_private_t *priv = NULL;
- afr_local_t *local = NULL;
- int i = 0;
- int32_t call_count = 0;
- int32_t op_errno = ENOMEM;
-
- priv = this->private;
-
- local = AFR_FRAME_INIT (frame, op_errno);
- if (!local)
- goto out;
+ afr_local_t *local = NULL;
+ int call_count = -1;
- call_count = local->call_count;
- if (!call_count) {
- op_errno = ENOTCONN;
- goto out;
- }
+ local = frame->local;
- local->fd = fd_ref (fd);
+ LOCK(&frame->lock);
+ {
+ if (op_ret == 0) {
+ local->op_ret = 0;
+ if (!local->xdata_rsp && xdata)
+ local->xdata_rsp = dict_ref(xdata);
+ } else {
+ local->op_errno = op_errno;
+ }
+ call_count = --local->call_count;
+ }
+ UNLOCK(&frame->lock);
- if (afr_fd_has_witnessed_unstable_write (this, fd)) {
- /* don't care. we only wanted to CLEAR the bit */
- }
+ if (call_count == 0)
+ AFR_STACK_UNWIND(fsyncdir, frame, local->op_ret, local->op_errno,
+ local->xdata_rsp);
- local->inode = inode_ref (fd->inode);
+ return 0;
+}
- for (i = 0; i < priv->child_count; i++) {
- if (local->child_up[i]) {
- STACK_WIND_COOKIE (frame, afr_fsync_cbk,
- (void *) (long) i,
- priv->children[i],
- priv->children[i]->fops->fsync,
- fd, datasync, xdata);
- if (!--call_count)
- break;
- }
+int
+afr_fsyncdir(call_frame_t *frame, xlator_t *this, fd_t *fd, int32_t datasync,
+ dict_t *xdata)
+{
+ afr_private_t *priv = NULL;
+ afr_local_t *local = NULL;
+ int i = 0;
+ int32_t call_count = 0;
+ int32_t op_errno = ENOMEM;
+
+ priv = this->private;
+
+ local = AFR_FRAME_INIT(frame, op_errno);
+ if (!local)
+ goto out;
+
+ local->op = GF_FOP_FSYNCDIR;
+ if (!afr_is_consistent_io_possible(local, priv, &op_errno))
+ goto out;
+
+ call_count = local->call_count;
+ for (i = 0; i < priv->child_count; i++) {
+ if (local->child_up[i]) {
+ STACK_WIND(frame, afr_fsyncdir_cbk, priv->children[i],
+ priv->children[i]->fops->fsyncdir, fd, datasync, xdata);
+ if (!--call_count)
+ break;
}
+ }
- return 0;
+ return 0;
out:
- AFR_STACK_UNWIND (fsync, frame, -1, op_errno, NULL, NULL, NULL);
+ AFR_STACK_UNWIND(fsyncdir, frame, -1, op_errno, NULL);
- return 0;
+ return 0;
}
/* }}} */
-/* {{{ fsync */
+static int
+afr_serialized_lock_wind(call_frame_t *frame, xlator_t *this);
-int
-afr_fsyncdir_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
- int32_t op_ret, int32_t op_errno, dict_t *xdata)
+static gf_boolean_t
+afr_is_conflicting_lock_present(int32_t op_ret, int32_t op_errno)
{
- afr_local_t *local = NULL;
- int call_count = -1;
+ if (op_ret == -1 && op_errno == EAGAIN)
+ return _gf_true;
+ return _gf_false;
+}
- local = frame->local;
+static void
+afr_fop_lock_unwind(call_frame_t *frame, glusterfs_fop_t op, int32_t op_ret,
+ int32_t op_errno, dict_t *xdata)
+{
+ switch (op) {
+ case GF_FOP_INODELK:
+ AFR_STACK_UNWIND(inodelk, frame, op_ret, op_errno, xdata);
+ break;
+ case GF_FOP_FINODELK:
+ AFR_STACK_UNWIND(finodelk, frame, op_ret, op_errno, xdata);
+ break;
+ case GF_FOP_ENTRYLK:
+ AFR_STACK_UNWIND(entrylk, frame, op_ret, op_errno, xdata);
+ break;
+ case GF_FOP_FENTRYLK:
+ AFR_STACK_UNWIND(fentrylk, frame, op_ret, op_errno, xdata);
+ break;
+ default:
+ break;
+ }
+}
- LOCK (&frame->lock);
- {
- if (op_ret == 0) {
- local->op_ret = 0;
- if (!local->xdata_rsp && xdata)
- local->xdata_rsp = dict_ref (xdata);
- } else {
- local->op_errno = op_errno;
- }
- }
- UNLOCK (&frame->lock);
+static void
+afr_fop_lock_wind(call_frame_t *frame, xlator_t *this, int child_index,
+ int32_t (*lock_cbk)(call_frame_t *, void *, xlator_t *,
+ int32_t, int32_t, dict_t *))
+{
+ afr_local_t *local = frame->local;
+ afr_private_t *priv = this->private;
+ int i = child_index;
+
+ switch (local->op) {
+ case GF_FOP_INODELK:
+ STACK_WIND_COOKIE(
+ frame, lock_cbk, (void *)(long)i, priv->children[i],
+ priv->children[i]->fops->inodelk,
+ (const char *)local->cont.inodelk.volume, &local->loc,
+ local->cont.inodelk.cmd, &local->cont.inodelk.flock,
+ local->cont.inodelk.xdata);
+ break;
+ case GF_FOP_FINODELK:
+ STACK_WIND_COOKIE(
+ frame, lock_cbk, (void *)(long)i, priv->children[i],
+ priv->children[i]->fops->finodelk,
+ (const char *)local->cont.inodelk.volume, local->fd,
+ local->cont.inodelk.cmd, &local->cont.inodelk.flock,
+ local->cont.inodelk.xdata);
+ break;
+ case GF_FOP_ENTRYLK:
+ STACK_WIND_COOKIE(
+ frame, lock_cbk, (void *)(long)i, priv->children[i],
+ priv->children[i]->fops->entrylk, local->cont.entrylk.volume,
+ &local->loc, local->cont.entrylk.basename,
+ local->cont.entrylk.cmd, local->cont.entrylk.type,
+ local->cont.entrylk.xdata);
+ break;
+ case GF_FOP_FENTRYLK:
+ STACK_WIND_COOKIE(
+ frame, lock_cbk, (void *)(long)i, priv->children[i],
+ priv->children[i]->fops->fentrylk, local->cont.entrylk.volume,
+ local->fd, local->cont.entrylk.basename,
+ local->cont.entrylk.cmd, local->cont.entrylk.type,
+ local->cont.entrylk.xdata);
+ break;
+ default:
+ break;
+ }
+}
- call_count = afr_frame_return (frame);
+void
+afr_fop_lock_proceed(call_frame_t *frame)
+{
+ afr_local_t *local = NULL;
+ afr_private_t *priv = NULL;
- if (call_count == 0)
- AFR_STACK_UNWIND (fsyncdir, frame, local->op_ret,
- local->op_errno, local->xdata_rsp);
+ local = frame->local;
+ priv = frame->this->private;
- return 0;
+ if (local->fop_lock_state != AFR_FOP_LOCK_PARALLEL) {
+ afr_fop_lock_unwind(frame, local->op, local->op_ret, local->op_errno,
+ local->xdata_rsp);
+ return;
+ }
+ /* At least one child is up */
+ /*
+ * Non-blocking locks also need to be serialized. Otherwise there is
+ * a chance that both the mounts which issued same non-blocking inodelk
+ * may endup not acquiring the lock on any-brick.
+ * Ex: Mount1 and Mount2
+ * request for full length lock on file f1. Mount1 afr may acquire the
+ * partial lock on brick-1 and may not acquire the lock on brick-2
+ * because Mount2 already got the lock on brick-2, vice versa. Since
+ * both the mounts only got partial locks, afr treats them as failure in
+ * gaining the locks and unwinds with EAGAIN errno.
+ */
+ local->op_ret = -1;
+ local->op_errno = EUCLEAN;
+ local->fop_lock_state = AFR_FOP_LOCK_SERIAL;
+ afr_local_replies_wipe(local, priv);
+ if (local->xdata_rsp)
+ dict_unref(local->xdata_rsp);
+ local->xdata_rsp = NULL;
+ switch (local->op) {
+ case GF_FOP_INODELK:
+ case GF_FOP_FINODELK:
+ local->cont.inodelk.cmd = local->cont.inodelk.in_cmd;
+ local->cont.inodelk.flock = local->cont.inodelk.in_flock;
+ if (local->cont.inodelk.xdata)
+ dict_unref(local->cont.inodelk.xdata);
+ local->cont.inodelk.xdata = NULL;
+ if (local->xdata_req)
+ local->cont.inodelk.xdata = dict_ref(local->xdata_req);
+ break;
+ case GF_FOP_ENTRYLK:
+ case GF_FOP_FENTRYLK:
+ local->cont.entrylk.cmd = local->cont.entrylk.in_cmd;
+ if (local->cont.entrylk.xdata)
+ dict_unref(local->cont.entrylk.xdata);
+ local->cont.entrylk.xdata = NULL;
+ if (local->xdata_req)
+ local->cont.entrylk.xdata = dict_ref(local->xdata_req);
+ break;
+ default:
+ break;
+ }
+ afr_serialized_lock_wind(frame, frame->this);
}
+static int32_t
+afr_unlock_partial_lock_cbk(call_frame_t *frame, void *cookie, xlator_t *this,
+ int32_t op_ret, int32_t op_errno, dict_t *xdata)
-int
-afr_fsyncdir (call_frame_t *frame, xlator_t *this, fd_t *fd, int32_t datasync,
- dict_t *xdata)
{
- afr_private_t *priv = NULL;
- afr_local_t *local = NULL;
- int i = 0;
- int32_t call_count = 0;
- int32_t op_errno = ENOMEM;
-
- priv = this->private;
-
- local = AFR_FRAME_INIT (frame, op_errno);
- if (!local)
- goto out;
+ afr_local_t *local = NULL;
+ afr_private_t *priv = NULL;
+ int call_count = -1;
+ int child_index = (long)cookie;
+ uuid_t gfid = {0};
- call_count = local->call_count;
- if (!call_count) {
- op_errno = ENOTCONN;
- goto out;
- }
+ local = frame->local;
+ priv = this->private;
- for (i = 0; i < priv->child_count; i++) {
- if (local->child_up[i]) {
- STACK_WIND (frame, afr_fsyncdir_cbk,
- priv->children[i],
- priv->children[i]->fops->fsyncdir,
- fd, datasync, xdata);
- if (!--call_count)
- break;
- }
- }
+ if (op_ret < 0 && op_errno != ENOTCONN) {
+ if (local->fd)
+ gf_uuid_copy(gfid, local->fd->inode->gfid);
+ else
+ loc_gfid(&local->loc, gfid);
+ gf_msg(this->name, GF_LOG_ERROR, op_errno, AFR_MSG_UNLOCK_FAIL,
+ "%s: Failed to unlock %s on %s "
+ "with lk_owner: %s",
+ uuid_utoa(gfid), gf_fop_list[local->op],
+ priv->children[child_index]->name,
+ lkowner_utoa(&frame->root->lk_owner));
+ }
- return 0;
-out:
- AFR_STACK_UNWIND (fsyncdir, frame, -1, op_errno, NULL);
+ call_count = afr_frame_return(frame);
+ if (call_count == 0)
+ afr_fop_lock_proceed(frame);
- return 0;
+ return 0;
}
-/* }}} */
-
-int32_t
-afr_unlock_partial_inodelk_cbk (call_frame_t *frame, void *cookie,
- xlator_t *this, int32_t op_ret,
- int32_t op_errno, dict_t *xdata)
+static int32_t
+afr_unlock_locks_and_proceed(call_frame_t *frame, xlator_t *this,
+ int call_count)
+{
+ int i = 0;
+ afr_private_t *priv = NULL;
+ afr_local_t *local = NULL;
+
+ if (call_count == 0) {
+ afr_fop_lock_proceed(frame);
+ goto out;
+ }
+
+ local = frame->local;
+ priv = this->private;
+ local->call_count = call_count;
+ switch (local->op) {
+ case GF_FOP_INODELK:
+ case GF_FOP_FINODELK:
+ local->cont.inodelk.flock.l_type = F_UNLCK;
+ local->cont.inodelk.cmd = F_SETLK;
+ if (local->cont.inodelk.xdata)
+ dict_unref(local->cont.inodelk.xdata);
+ local->cont.inodelk.xdata = NULL;
+ break;
+ case GF_FOP_ENTRYLK:
+ case GF_FOP_FENTRYLK:
+ local->cont.entrylk.cmd = ENTRYLK_UNLOCK;
+ if (local->cont.entrylk.xdata)
+ dict_unref(local->cont.entrylk.xdata);
+ local->cont.entrylk.xdata = NULL;
+ break;
+ default:
+ break;
+ }
-{
- afr_local_t *local = NULL;
- afr_private_t *priv = NULL;
- int call_count = -1;
- int child_index = (long)cookie;
- uuid_t gfid = {0};
+ for (i = 0; i < priv->child_count; i++) {
+ if (!local->replies[i].valid)
+ continue;
- local = frame->local;
- priv = this->private;
+ if (local->replies[i].op_ret == -1)
+ continue;
- if (op_ret < 0 && op_errno != ENOTCONN) {
- loc_gfid (&local->loc, gfid);
- gf_msg (this->name, GF_LOG_ERROR, 0,
- AFR_MSG_INODE_UNLOCK_FAIL,
- "%s: Failed to unlock %s "
- "with lk_owner: %s (%s)", uuid_utoa (gfid),
- priv->children[child_index]->name,
- lkowner_utoa (&frame->root->lk_owner),
- strerror (op_errno));
- }
+ afr_fop_lock_wind(frame, this, i, afr_unlock_partial_lock_cbk);
- call_count = afr_frame_return (frame);
- if (call_count == 0) {
- AFR_STACK_UNWIND (inodelk, frame, local->op_ret,
- local->op_errno, local->xdata_rsp);
- }
+ if (!--call_count)
+ break;
+ }
- return 0;
+out:
+ return 0;
}
int32_t
-afr_unlock_inodelks_and_unwind (call_frame_t *frame, xlator_t *this,
- int call_count)
+afr_fop_lock_done(call_frame_t *frame, xlator_t *this)
{
- int i = 0;
- afr_private_t *priv = NULL;
- afr_local_t *local = NULL;
+ int i = 0;
+ int lock_count = 0;
+ unsigned char *success = NULL;
- local = frame->local;
- priv = this->private;
- local->call_count = call_count;
- local->cont.inodelk.flock.l_type = F_UNLCK;
+ afr_local_t *local = NULL;
+ afr_private_t *priv = NULL;
- for (i = 0; i < priv->child_count; i++) {
- if (!local->replies[i].valid)
- continue;
+ local = frame->local;
+ priv = this->private;
+ success = alloca0(priv->child_count);
- if (local->replies[i].op_ret == -1)
- continue;
+ for (i = 0; i < priv->child_count; i++) {
+ if (!local->replies[i].valid)
+ continue;
- STACK_WIND_COOKIE (frame, afr_unlock_partial_inodelk_cbk,
- (void*) (long) i,
- priv->children[i],
- priv->children[i]->fops->inodelk,
- local->cont.inodelk.volume,
- &local->loc, local->cont.inodelk.cmd,
- &local->cont.inodelk.flock, 0);
+ if (local->replies[i].op_ret == 0) {
+ lock_count++;
+ success[i] = 1;
+ }
+
+ if (local->op_ret == -1 && local->op_errno == EAGAIN)
+ continue;
- if (!--call_count)
- break;
+ if ((local->replies[i].op_ret == -1) &&
+ (local->replies[i].op_errno == EAGAIN)) {
+ local->op_ret = -1;
+ local->op_errno = EAGAIN;
+ continue;
}
- return 0;
+ if (local->replies[i].op_ret == 0)
+ local->op_ret = 0;
+
+ local->op_errno = local->replies[i].op_errno;
+ }
+
+ if (afr_fop_lock_is_unlock(frame))
+ goto unwind;
+
+ if (afr_is_conflicting_lock_present(local->op_ret, local->op_errno)) {
+ afr_unlock_locks_and_proceed(frame, this, lock_count);
+ } else if (priv->quorum_count && !afr_has_quorum(success, this, NULL)) {
+ local->fop_lock_state = AFR_FOP_LOCK_QUORUM_FAILED;
+ local->op_ret = -1;
+ local->op_errno = afr_final_errno(local, priv);
+ if (local->op_errno == 0)
+ local->op_errno = afr_quorum_errno(priv);
+ afr_unlock_locks_and_proceed(frame, this, lock_count);
+ } else {
+ goto unwind;
+ }
+
+ return 0;
+unwind:
+ afr_fop_lock_unwind(frame, local->op, local->op_ret, local->op_errno,
+ local->xdata_rsp);
+ return 0;
}
-int32_t
-afr_inodelk_done (call_frame_t *frame, xlator_t *this)
+static int
+afr_common_lock_cbk(call_frame_t *frame, void *cookie, xlator_t *this,
+ int32_t op_ret, int32_t op_errno, dict_t *xdata)
{
- int i = 0;
- int lock_count = 0;
-
- afr_local_t *local = NULL;
- afr_private_t *priv = NULL;
+ afr_local_t *local = NULL;
+ int child_index = (long)cookie;
- local = frame->local;
- priv = this->private;
+ local = frame->local;
- for (i = 0; i < priv->child_count; i++) {
- if (!local->replies[i].valid)
- continue;
+ local->replies[child_index].valid = 1;
+ local->replies[child_index].op_ret = op_ret;
+ local->replies[child_index].op_errno = op_errno;
+ if (op_ret == 0 && xdata) {
+ local->replies[child_index].xdata = dict_ref(xdata);
+ LOCK(&frame->lock);
+ {
+ if (!local->xdata_rsp)
+ local->xdata_rsp = dict_ref(xdata);
+ }
+ UNLOCK(&frame->lock);
+ }
+ return 0;
+}
- if (local->replies[i].op_ret == 0)
- lock_count++;
+static int32_t
+afr_serialized_lock_cbk(call_frame_t *frame, void *cookie, xlator_t *this,
+ int32_t op_ret, int32_t op_errno, dict_t *xdata)
- if (local->op_ret == -1 && local->op_errno == EAGAIN)
- continue;
+{
+ afr_local_t *local = NULL;
+ afr_private_t *priv = NULL;
+ int child_index = (long)cookie;
+ int next_child = 0;
- if ((local->replies[i].op_ret == -1) &&
- (local->replies[i].op_errno == EAGAIN)) {
- local->op_ret = -1;
- local->op_errno = EAGAIN;
- continue;
- }
+ local = frame->local;
+ priv = this->private;
- if (local->replies[i].op_ret == 0)
- local->op_ret = 0;
+ afr_common_lock_cbk(frame, cookie, this, op_ret, op_errno, xdata);
- local->op_errno = local->replies[i].op_errno;
- }
+ for (next_child = child_index + 1; next_child < priv->child_count;
+ next_child++) {
+ if (local->child_up[next_child])
+ break;
+ }
- if (lock_count && local->cont.inodelk.flock.l_type != F_UNLCK &&
- (local->op_ret == -1 && local->op_errno == EAGAIN)) {
- afr_unlock_inodelks_and_unwind (frame, this,
- lock_count);
- } else {
- AFR_STACK_UNWIND (inodelk, frame, local->op_ret,
- local->op_errno, local->xdata_rsp);
- }
+ if (afr_is_conflicting_lock_present(op_ret, op_errno) ||
+ (next_child == priv->child_count)) {
+ afr_fop_lock_done(frame, this);
+ } else {
+ afr_fop_lock_wind(frame, this, next_child, afr_serialized_lock_cbk);
+ }
- return 0;
+ return 0;
}
-int
-afr_common_inodelk_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
- int32_t op_ret, int32_t op_errno, dict_t *xdata)
+static int
+afr_serialized_lock_wind(call_frame_t *frame, xlator_t *this)
{
- afr_local_t *local = NULL;
- int child_index = (long)cookie;
+ afr_private_t *priv = NULL;
+ afr_local_t *local = NULL;
+ int i = 0;
- local = frame->local;
+ priv = this->private;
+ local = frame->local;
- local->replies[child_index].valid = 1;
- local->replies[child_index].op_ret = op_ret;
- local->replies[child_index].op_errno = op_errno;
- if (op_ret == 0 && xdata) {
- local->replies[child_index].xdata = dict_ref (xdata);
- LOCK (&frame->lock);
- {
- if (!local->xdata_rsp)
- local->xdata_rsp = dict_ref (xdata);
- }
- UNLOCK (&frame->lock);
+ for (i = 0; i < priv->child_count; i++) {
+ if (local->child_up[i]) {
+ afr_fop_lock_wind(frame, this, i, afr_serialized_lock_cbk);
+ break;
}
- return 0;
+ }
+ return 0;
}
static int32_t
-afr_parallel_inodelk_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
- int32_t op_ret, int32_t op_errno, dict_t *xdata)
+afr_parallel_lock_cbk(call_frame_t *frame, void *cookie, xlator_t *this,
+ int32_t op_ret, int32_t op_errno, dict_t *xdata)
{
- int call_count = 0;
+ int call_count = 0;
- afr_common_inodelk_cbk (frame, cookie, this, op_ret, op_errno, xdata);
+ afr_common_lock_cbk(frame, cookie, this, op_ret, op_errno, xdata);
- call_count = afr_frame_return (frame);
- if (call_count == 0)
- afr_inodelk_done (frame, this);
+ call_count = afr_frame_return(frame);
+ if (call_count == 0)
+ afr_fop_lock_done(frame, this);
- return 0;
+ return 0;
}
-static gf_boolean_t
-afr_is_conflicting_lock_present (int32_t op_ret, int32_t op_errno)
+static int
+afr_parallel_lock_wind(call_frame_t *frame, xlator_t *this)
{
- if (op_ret == -1 && op_errno == EAGAIN)
- return _gf_true;
- return _gf_false;
-}
+ afr_private_t *priv = NULL;
+ afr_local_t *local = NULL;
+ int call_count = 0;
+ int i = 0;
-static int32_t
-afr_serialized_inodelk_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
- int32_t op_ret, int32_t op_errno, dict_t *xdata)
+ priv = this->private;
+ local = frame->local;
+ call_count = local->call_count;
-{
- afr_local_t *local = NULL;
- afr_private_t *priv = NULL;
- int child_index = (long)cookie;
- int next_child = 0;
+ for (i = 0; i < priv->child_count; i++) {
+ if (!local->child_up[i])
+ continue;
+ afr_fop_lock_wind(frame, this, i, afr_parallel_lock_cbk);
+ if (!--call_count)
+ break;
+ }
+ return 0;
+}
- local = frame->local;
- priv = this->private;
+static int
+afr_fop_handle_lock(call_frame_t *frame, xlator_t *this)
+{
+ afr_local_t *local = frame->local;
+ int op_errno = 0;
- afr_common_inodelk_cbk (frame, cookie, this, op_ret, op_errno, xdata);
+ if (!afr_fop_lock_is_unlock(frame)) {
+ if (!afr_is_consistent_io_possible(local, this->private, &op_errno))
+ goto out;
- for (next_child = child_index + 1; next_child < priv->child_count;
- next_child++) {
- if (local->child_up[next_child])
- break;
+ switch (local->op) {
+ case GF_FOP_INODELK:
+ case GF_FOP_FINODELK:
+ local->cont.inodelk.cmd = F_SETLK;
+ break;
+ case GF_FOP_ENTRYLK:
+ case GF_FOP_FENTRYLK:
+ local->cont.entrylk.cmd = ENTRYLK_LOCK_NB;
+ break;
+ default:
+ break;
}
+ }
- if (afr_is_conflicting_lock_present (op_ret, op_errno) ||
- (next_child == priv->child_count)) {
- afr_inodelk_done (frame, this);
- } else {
- STACK_WIND_COOKIE (frame, afr_serialized_inodelk_cbk,
- (void *) (long) next_child,
- priv->children[next_child],
- priv->children[next_child]->fops->inodelk,
- (const char *)local->cont.inodelk.volume,
- &local->loc, local->cont.inodelk.cmd,
- &local->cont.inodelk.flock,
- local->xdata_req);
+ if (local->xdata_req) {
+ switch (local->op) {
+ case GF_FOP_INODELK:
+ case GF_FOP_FINODELK:
+ local->cont.inodelk.xdata = dict_ref(local->xdata_req);
+ break;
+ case GF_FOP_ENTRYLK:
+ case GF_FOP_FENTRYLK:
+ local->cont.entrylk.xdata = dict_ref(local->xdata_req);
+ break;
+ default:
+ break;
}
+ }
- return 0;
+ local->fop_lock_state = AFR_FOP_LOCK_PARALLEL;
+ afr_parallel_lock_wind(frame, this);
+out:
+ return -op_errno;
}
-static int
-afr_parallel_inodelk_wind (call_frame_t *frame, xlator_t *this)
-{
- afr_private_t *priv = NULL;
- afr_local_t *local = NULL;
- int call_count = 0;
- int i = 0;
+static int32_t
+afr_handle_inodelk(call_frame_t *frame, xlator_t *this, glusterfs_fop_t fop,
+ const char *volume, loc_t *loc, fd_t *fd, int32_t cmd,
+ struct gf_flock *flock, dict_t *xdata)
+{
+ afr_local_t *local = NULL;
+ int32_t op_errno = ENOMEM;
+
+ local = AFR_FRAME_INIT(frame, op_errno);
+ if (!local)
+ goto out;
+
+ local->op = fop;
+ if (loc)
+ loc_copy(&local->loc, loc);
+ if (fd && (flock->l_type != F_UNLCK)) {
+ AFR_ERROR_OUT_IF_FDCTX_INVALID(fd, this, op_errno, out);
+ local->fd = fd_ref(fd);
+ }
+
+ local->cont.inodelk.volume = gf_strdup(volume);
+ if (!local->cont.inodelk.volume) {
+ op_errno = ENOMEM;
+ goto out;
+ }
+
+ local->cont.inodelk.in_cmd = cmd;
+ local->cont.inodelk.cmd = cmd;
+ local->cont.inodelk.in_flock = *flock;
+ local->cont.inodelk.flock = *flock;
+ if (xdata)
+ local->xdata_req = dict_ref(xdata);
+
+ op_errno = -afr_fop_handle_lock(frame, frame->this);
+ if (op_errno)
+ goto out;
+ return 0;
+out:
+ afr_fop_lock_unwind(frame, fop, -1, op_errno, NULL);
- priv = this->private;
- local = frame->local;
- call_count = local->call_count;
+ return 0;
+}
- for (i = 0; i < priv->child_count; i++) {
- if (!local->child_up[i])
- continue;
- STACK_WIND_COOKIE (frame, afr_parallel_inodelk_cbk,
- (void *) (long) i,
- priv->children[i],
- priv->children[i]->fops->inodelk,
- (const char *)local->cont.inodelk.volume,
- &local->loc, local->cont.inodelk.cmd,
- &local->cont.inodelk.flock,
- local->xdata_req);
- if (!--call_count)
- break;
- }
- return 0;
+int32_t
+afr_inodelk(call_frame_t *frame, xlator_t *this, const char *volume, loc_t *loc,
+ int32_t cmd, struct gf_flock *flock, dict_t *xdata)
+{
+ afr_handle_inodelk(frame, this, GF_FOP_INODELK, volume, loc, NULL, cmd,
+ flock, xdata);
+ return 0;
+}
+
+int32_t
+afr_finodelk(call_frame_t *frame, xlator_t *this, const char *volume, fd_t *fd,
+ int32_t cmd, struct gf_flock *flock, dict_t *xdata)
+{
+ afr_handle_inodelk(frame, this, GF_FOP_FINODELK, volume, NULL, fd, cmd,
+ flock, xdata);
+ return 0;
}
static int
-afr_serialized_inodelk_wind (call_frame_t *frame, xlator_t *this)
+afr_handle_entrylk(call_frame_t *frame, xlator_t *this, glusterfs_fop_t fop,
+ const char *volume, loc_t *loc, fd_t *fd,
+ const char *basename, entrylk_cmd cmd, entrylk_type type,
+ dict_t *xdata)
{
- afr_private_t *priv = NULL;
- afr_local_t *local = NULL;
- int i = 0;
+ afr_local_t *local = NULL;
+ int32_t op_errno = ENOMEM;
+
+ local = AFR_FRAME_INIT(frame, op_errno);
+ if (!local)
+ goto out;
+
+ local->op = fop;
+ if (loc)
+ loc_copy(&local->loc, loc);
+ if (fd && (cmd != ENTRYLK_UNLOCK)) {
+ AFR_ERROR_OUT_IF_FDCTX_INVALID(fd, this, op_errno, out);
+ local->fd = fd_ref(fd);
+ }
+ local->cont.entrylk.cmd = cmd;
+ local->cont.entrylk.in_cmd = cmd;
+ local->cont.entrylk.type = type;
+ local->cont.entrylk.volume = gf_strdup(volume);
+ local->cont.entrylk.basename = gf_strdup(basename);
+ if (!local->cont.entrylk.volume || !local->cont.entrylk.basename) {
+ op_errno = ENOMEM;
+ goto out;
+ }
+ if (xdata)
+ local->xdata_req = dict_ref(xdata);
+ op_errno = -afr_fop_handle_lock(frame, frame->this);
+ if (op_errno)
+ goto out;
+
+ return 0;
+out:
+ afr_fop_lock_unwind(frame, fop, -1, op_errno, NULL);
+ return 0;
+}
- priv = this->private;
- local = frame->local;
+int
+afr_entrylk(call_frame_t *frame, xlator_t *this, const char *volume, loc_t *loc,
+ const char *basename, entrylk_cmd cmd, entrylk_type type,
+ dict_t *xdata)
+{
+ afr_handle_entrylk(frame, this, GF_FOP_ENTRYLK, volume, loc, NULL, basename,
+ cmd, type, xdata);
+ return 0;
+}
- for (i = 0; i < priv->child_count; i++) {
- if (local->child_up[i]) {
- STACK_WIND_COOKIE (frame, afr_serialized_inodelk_cbk,
- (void *) (long) i,
- priv->children[i],
- priv->children[i]->fops->inodelk,
- (const char *)local->cont.inodelk.volume,
- &local->loc, local->cont.inodelk.cmd,
- &local->cont.inodelk.flock,
- local->xdata_req);
- break;
- }
- }
- return 0;
+int
+afr_fentrylk(call_frame_t *frame, xlator_t *this, const char *volume, fd_t *fd,
+ const char *basename, entrylk_cmd cmd, entrylk_type type,
+ dict_t *xdata)
+{
+ afr_handle_entrylk(frame, this, GF_FOP_FENTRYLK, volume, NULL, fd, basename,
+ cmd, type, xdata);
+ return 0;
}
-int32_t
-afr_inodelk (call_frame_t *frame, xlator_t *this,
- const char *volume, loc_t *loc, int32_t cmd,
- struct gf_flock *flock, dict_t *xdata)
+int
+afr_statfs_cbk(call_frame_t *frame, void *cookie, xlator_t *this, int op_ret,
+ int op_errno, struct statvfs *statvfs, dict_t *xdata)
{
- afr_local_t *local = NULL;
- int32_t op_errno = ENOMEM;
+ afr_local_t *local = NULL;
+ int call_count = 0;
+ struct statvfs *buf = NULL;
- local = AFR_FRAME_INIT (frame, op_errno);
- if (!local)
- goto out;
+ local = frame->local;
- loc_copy (&local->loc, loc);
- local->cont.inodelk.volume = gf_strdup (volume);
- if (!local->cont.inodelk.volume) {
- op_errno = ENOMEM;
- goto out;
+ LOCK(&frame->lock);
+ {
+ if (op_ret != 0) {
+ local->op_errno = op_errno;
+ goto unlock;
}
- local->cont.inodelk.cmd = cmd;
- local->cont.inodelk.flock = *flock;
- if (xdata)
- local->xdata_req = dict_ref (xdata);
-
- /* At least one child is up */
- /*
- * Non-blocking locks also need to be serialized. Otherwise there is
- * a chance that both the mounts which issued same non-blocking inodelk
- * may endup not acquiring the lock on any-brick.
- * Ex: Mount1 and Mount2
- * request for full length lock on file f1. Mount1 afr may acquire the
- * partial lock on brick-1 and may not acquire the lock on brick-2
- * because Mount2 already got the lock on brick-2, vice versa. Since
- * both the mounts only got partial locks, afr treats them as failure in
- * gaining the locks and unwinds with EAGAIN errno.
- */
- if (flock->l_type == F_UNLCK) {
- afr_parallel_inodelk_wind (frame, this);
+ local->op_ret = op_ret;
+
+ buf = &local->cont.statfs.buf;
+ if (local->cont.statfs.buf_set) {
+ if (statvfs->f_bavail < buf->f_bavail) {
+ *buf = *statvfs;
+ if (xdata) {
+ if (local->xdata_rsp)
+ dict_unref(local->xdata_rsp);
+ local->xdata_rsp = dict_ref(xdata);
+ }
+ }
} else {
- afr_serialized_inodelk_wind (frame, this);
+ *buf = *statvfs;
+ local->cont.statfs.buf_set = 1;
+ if (xdata)
+ local->xdata_rsp = dict_ref(xdata);
}
+ }
+unlock:
+ call_count = --local->call_count;
+ UNLOCK(&frame->lock);
- return 0;
-out:
- AFR_STACK_UNWIND (inodelk, frame, -1, op_errno, NULL);
+ if (call_count == 0)
+ AFR_STACK_UNWIND(statfs, frame, local->op_ret, local->op_errno,
+ &local->cont.statfs.buf, local->xdata_rsp);
- return 0;
+ return 0;
}
+int
+afr_statfs(call_frame_t *frame, xlator_t *this, loc_t *loc, dict_t *xdata)
+{
+ afr_local_t *local = NULL;
+ afr_private_t *priv = NULL;
+ int i = 0;
+ int call_count = 0;
+ int32_t op_errno = ENOMEM;
+
+ priv = this->private;
+
+ local = AFR_FRAME_INIT(frame, op_errno);
+ if (!local)
+ goto out;
+
+ local->op = GF_FOP_STATFS;
+ if (!afr_is_consistent_io_possible(local, priv, &op_errno))
+ goto out;
+
+ if (priv->arbiter_count == 1 && local->child_up[ARBITER_BRICK_INDEX])
+ local->call_count--;
+ call_count = local->call_count;
+ if (!call_count) {
+ op_errno = ENOTCONN;
+ goto out;
+ }
+
+ for (i = 0; i < priv->child_count; i++) {
+ if (local->child_up[i]) {
+ if (AFR_IS_ARBITER_BRICK(priv, i))
+ continue;
+ STACK_WIND(frame, afr_statfs_cbk, priv->children[i],
+ priv->children[i]->fops->statfs, loc, xdata);
+ if (!--call_count)
+ break;
+ }
+ }
-int32_t
-afr_finodelk_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
- int32_t op_ret, int32_t op_errno, dict_t *xdata)
-
-{
- afr_local_t *local = NULL;
- int call_count = -1;
+ return 0;
+out:
+ AFR_STACK_UNWIND(statfs, frame, -1, op_errno, NULL, NULL);
- local = frame->local;
+ return 0;
+}
- LOCK (&frame->lock);
- {
- if (op_ret == 0)
- local->op_ret = 0;
+int32_t
+afr_lk_unlock_cbk(call_frame_t *frame, void *cookie, xlator_t *this,
+ int32_t op_ret, int32_t op_errno, struct gf_flock *lock,
+ dict_t *xdata)
+{
+ afr_local_t *local = NULL;
+ afr_private_t *priv = this->private;
+ int call_count = -1;
+ int child_index = (long)cookie;
- local->op_errno = op_errno;
- }
- UNLOCK (&frame->lock);
+ local = frame->local;
- call_count = afr_frame_return (frame);
+ if (op_ret < 0 && op_errno != ENOTCONN && op_errno != EBADFD) {
+ gf_msg(this->name, GF_LOG_ERROR, op_errno, AFR_MSG_UNLOCK_FAIL,
+ "gfid=%s: unlock failed on subvolume %s "
+ "with lock owner %s",
+ uuid_utoa(local->fd->inode->gfid),
+ priv->children[child_index]->name,
+ lkowner_utoa(&frame->root->lk_owner));
+ }
- if (call_count == 0)
- AFR_STACK_UNWIND (finodelk, frame, local->op_ret,
- local->op_errno, xdata);
+ call_count = afr_frame_return(frame);
+ if (call_count == 0) {
+ AFR_STACK_UNWIND(lk, frame, local->op_ret, local->op_errno, NULL,
+ local->xdata_rsp);
+ }
- return 0;
+ return 0;
}
-
int32_t
-afr_finodelk (call_frame_t *frame, xlator_t *this, const char *volume, fd_t *fd,
- int32_t cmd, struct gf_flock *flock, dict_t *xdata)
+afr_lk_unlock(call_frame_t *frame, xlator_t *this)
{
- afr_private_t *priv = NULL;
- afr_local_t *local = NULL;
- int i = 0;
- int32_t call_count = 0;
- int32_t op_errno = ENOMEM;
+ afr_local_t *local = NULL;
+ afr_private_t *priv = NULL;
+ int i = 0;
+ int call_count = 0;
- priv = this->private;
+ local = frame->local;
+ priv = this->private;
- local = AFR_FRAME_INIT (frame, op_errno);
- if (!local)
- goto out;
+ call_count = afr_locked_nodes_count(local->cont.lk.locked_nodes,
+ priv->child_count);
- call_count = local->call_count;
- if (!call_count) {
- op_errno = ENOTCONN;
- goto out;
- }
+ if (call_count == 0) {
+ AFR_STACK_UNWIND(lk, frame, local->op_ret, local->op_errno, NULL,
+ local->xdata_rsp);
+ return 0;
+ }
- for (i = 0; i < priv->child_count; i++) {
- if (local->child_up[i]) {
- STACK_WIND (frame, afr_finodelk_cbk,
- priv->children[i],
- priv->children[i]->fops->finodelk,
- volume, fd, cmd, flock, xdata);
-
- if (!--call_count)
- break;
- }
- }
+ local->call_count = call_count;
- return 0;
-out:
- AFR_STACK_UNWIND (finodelk, frame, -1, op_errno, NULL);
+ local->cont.lk.user_flock.l_type = F_UNLCK;
- return 0;
-}
+ for (i = 0; i < priv->child_count; i++) {
+ if (local->cont.lk.locked_nodes[i]) {
+ STACK_WIND_COOKIE(frame, afr_lk_unlock_cbk, (void *)(long)i,
+ priv->children[i], priv->children[i]->fops->lk,
+ local->fd, F_SETLK, &local->cont.lk.user_flock,
+ NULL);
+ if (!--call_count)
+ break;
+ }
+ }
+
+ return 0;
+}
int32_t
-afr_entrylk_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
- int32_t op_ret, int32_t op_errno, dict_t *xdata)
+afr_lk_cbk(call_frame_t *frame, void *cookie, xlator_t *this, int32_t op_ret,
+ int32_t op_errno, struct gf_flock *lock, dict_t *xdata)
{
- afr_local_t *local = NULL;
- int call_count = -1;
+ afr_local_t *local = NULL;
+ afr_private_t *priv = NULL;
+ int child_index = -1;
- local = frame->local;
+ local = frame->local;
+ priv = this->private;
- LOCK (&frame->lock);
- {
- if (op_ret == 0)
- local->op_ret = 0;
+ child_index = (long)cookie;
- local->op_errno = op_errno;
- }
- UNLOCK (&frame->lock);
+ afr_common_lock_cbk(frame, cookie, this, op_ret, op_errno, xdata);
+ if (op_ret < 0 && op_errno == EAGAIN) {
+ local->op_ret = -1;
+ local->op_errno = EAGAIN;
- call_count = afr_frame_return (frame);
+ afr_lk_unlock(frame, this);
+ return 0;
+ }
+
+ if (op_ret == 0) {
+ local->op_ret = 0;
+ local->op_errno = 0;
+ local->cont.lk.locked_nodes[child_index] = 1;
+ local->cont.lk.ret_flock = *lock;
+ }
+
+ child_index++;
+
+ if (child_index < priv->child_count) {
+ STACK_WIND_COOKIE(frame, afr_lk_cbk, (void *)(long)child_index,
+ priv->children[child_index],
+ priv->children[child_index]->fops->lk, local->fd,
+ local->cont.lk.cmd, &local->cont.lk.user_flock,
+ local->xdata_req);
+ } else if (priv->quorum_count &&
+ !afr_has_quorum(local->cont.lk.locked_nodes, this, NULL)) {
+ local->op_ret = -1;
+ local->op_errno = afr_final_errno(local, priv);
- if (call_count == 0)
- AFR_STACK_UNWIND (entrylk, frame, local->op_ret,
- local->op_errno, xdata);
+ afr_lk_unlock(frame, this);
+ } else {
+ if (local->op_ret < 0)
+ local->op_errno = afr_final_errno(local, priv);
- return 0;
-}
+ AFR_STACK_UNWIND(lk, frame, local->op_ret, local->op_errno,
+ &local->cont.lk.ret_flock, local->xdata_rsp);
+ }
+ return 0;
+}
int
-afr_entrylk (call_frame_t *frame, xlator_t *this, const char *volume,
- loc_t *loc, const char *basename, entrylk_cmd cmd,
- entrylk_type type, dict_t *xdata)
+afr_lk_transaction_cbk(int ret, call_frame_t *frame, void *opaque)
{
- afr_private_t *priv = NULL;
- afr_local_t *local = NULL;
- int i = 0;
- int32_t call_count = 0;
- int32_t op_errno = 0;
+ return 0;
+}
- priv = this->private;
+int
+afr_lk_txn_wind_cbk(call_frame_t *frame, void *cookie, xlator_t *this,
+ int32_t op_ret, int32_t op_errno, struct gf_flock *lock,
+ dict_t *xdata)
+{
+ afr_local_t *local = NULL;
+ int child_index = -1;
+
+ local = frame->local;
+ child_index = (long)cookie;
+ afr_common_lock_cbk(frame, cookie, this, op_ret, op_errno, xdata);
+ if (op_ret == 0) {
+ local->op_ret = 0;
+ local->op_errno = 0;
+ local->cont.lk.locked_nodes[child_index] = 1;
+ local->cont.lk.ret_flock = *lock;
+ }
+ syncbarrier_wake(&local->barrier);
+ return 0;
+}
- local = AFR_FRAME_INIT (frame, op_errno);
- if (!local)
- goto out;
+int
+afr_lk_txn_unlock_cbk(call_frame_t *frame, void *cookie, xlator_t *this,
+ int32_t op_ret, int32_t op_errno, struct gf_flock *lock,
+ dict_t *xdata)
+{
+ afr_local_t *local = frame->local;
+ afr_private_t *priv = this->private;
+ int child_index = (long)cookie;
+
+ if (op_ret < 0 && op_errno != ENOTCONN && op_errno != EBADFD) {
+ gf_msg(this->name, GF_LOG_ERROR, op_errno, AFR_MSG_UNLOCK_FAIL,
+ "gfid=%s: unlock failed on subvolume %s "
+ "with lock owner %s",
+ uuid_utoa(local->fd->inode->gfid),
+ priv->children[child_index]->name,
+ lkowner_utoa(&frame->root->lk_owner));
+ }
+ return 0;
+}
+int
+afr_lk_transaction(void *opaque)
+{
+ call_frame_t *frame = NULL;
+ xlator_t *this = NULL;
+ afr_private_t *priv = NULL;
+ afr_local_t *local = NULL;
+ char *wind_on = NULL;
+ int op_errno = 0;
+ int i = 0;
+ int ret = 0;
+
+ frame = (call_frame_t *)opaque;
+ local = frame->local;
+ this = frame->this;
+ priv = this->private;
+ wind_on = alloca0(priv->child_count);
+
+ if (priv->arbiter_count || priv->child_count != 3) {
+ op_errno = ENOTSUP;
+ gf_msg(frame->this->name, GF_LOG_ERROR, op_errno, AFR_MSG_LK_HEAL_DOM,
+ "%s: Lock healing supported only for replica 3 volumes.",
+ uuid_utoa(local->fd->inode->gfid));
+ goto err;
+ }
+
+ op_errno = -afr_dom_lock_acquire(frame); // Released during
+ // AFR_STACK_UNWIND
+ if (op_errno != 0) {
+ goto err;
+ }
+ if (priv->quorum_count &&
+ !afr_has_quorum(local->cont.lk.dom_locked_nodes, this, NULL)) {
+ op_errno = afr_final_errno(local, priv);
+ goto err;
+ }
+
+ for (i = 0; i < priv->child_count; i++) {
+ if (priv->child_up[i] && local->cont.lk.dom_locked_nodes[i])
+ wind_on[i] = 1;
+ }
+ AFR_ONLIST(wind_on, frame, afr_lk_txn_wind_cbk, lk, local->fd,
+ local->cont.lk.cmd, &local->cont.lk.user_flock,
+ local->xdata_req);
+
+ if (priv->quorum_count &&
+ !afr_has_quorum(local->cont.lk.locked_nodes, this, NULL)) {
+ local->op_ret = -1;
+ local->op_errno = afr_final_errno(local, priv);
+ goto unlock;
+ } else {
+ if (local->cont.lk.user_flock.l_type == F_UNLCK)
+ ret = afr_remove_lock_from_saved_locks(local, this);
+ else
+ ret = afr_add_lock_to_saved_locks(frame, this);
+ if (ret) {
+ local->op_ret = -1;
+ local->op_errno = -ret;
+ goto unlock;
+ }
+ AFR_STACK_UNWIND(lk, frame, local->op_ret, local->op_errno,
+ &local->cont.lk.ret_flock, local->xdata_rsp);
+ }
- call_count = local->call_count;
- if (!call_count) {
- op_errno = ENOTCONN;
- goto out;
- }
+ return 0;
- for (i = 0; i < priv->child_count; i++) {
- if (local->child_up[i]) {
- STACK_WIND (frame, afr_entrylk_cbk,
- priv->children[i],
- priv->children[i]->fops->entrylk,
- volume, loc, basename, cmd, type, xdata);
-
- if (!--call_count)
- break;
- }
+unlock:
+ local->cont.lk.user_flock.l_type = F_UNLCK;
+ AFR_ONLIST(local->cont.lk.locked_nodes, frame, afr_lk_txn_unlock_cbk, lk,
+ local->fd, F_SETLK, &local->cont.lk.user_flock, NULL);
+err:
+ AFR_STACK_UNWIND(lk, frame, -1, op_errno, NULL, NULL);
+ return -1;
+}
+
+int
+afr_lk(call_frame_t *frame, xlator_t *this, fd_t *fd, int32_t cmd,
+ struct gf_flock *flock, dict_t *xdata)
+{
+ afr_private_t *priv = NULL;
+ afr_local_t *local = NULL;
+ int ret = 0;
+ int i = 0;
+ int32_t op_errno = ENOMEM;
+
+ priv = this->private;
+
+ local = AFR_FRAME_INIT(frame, op_errno);
+ if (!local)
+ goto out;
+
+ local->op = GF_FOP_LK;
+ if (!afr_lk_is_unlock(cmd, flock)) {
+ AFR_ERROR_OUT_IF_FDCTX_INVALID(fd, this, op_errno, out);
+ if (!afr_is_consistent_io_possible(local, priv, &op_errno))
+ goto out;
+ }
+
+ local->cont.lk.locked_nodes = GF_CALLOC(
+ priv->child_count, sizeof(*local->cont.lk.locked_nodes),
+ gf_afr_mt_char);
+
+ if (!local->cont.lk.locked_nodes) {
+ op_errno = ENOMEM;
+ goto out;
+ }
+
+ local->fd = fd_ref(fd);
+ local->cont.lk.cmd = cmd;
+ local->cont.lk.user_flock = *flock;
+ local->cont.lk.ret_flock = *flock;
+ if (xdata)
+ local->xdata_req = dict_ref(xdata);
+
+ if (afr_is_lock_mode_mandatory(xdata)) {
+ ret = synctask_new(this->ctx->env, afr_lk_transaction,
+ afr_lk_transaction_cbk, frame, frame);
+ if (ret) {
+ op_errno = ENOMEM;
+ goto out;
}
+ return 0;
+ }
+
+ STACK_WIND_COOKIE(frame, afr_lk_cbk, (void *)(long)0, priv->children[i],
+ priv->children[i]->fops->lk, fd, cmd, flock,
+ local->xdata_req);
- return 0;
+ return 0;
out:
- AFR_STACK_UNWIND (entrylk, frame, -1, op_errno, NULL);
+ AFR_STACK_UNWIND(lk, frame, -1, op_errno, NULL, NULL);
- return 0;
+ return 0;
}
+int32_t
+afr_lease_unlock_cbk(call_frame_t *frame, void *cookie, xlator_t *this,
+ int32_t op_ret, int32_t op_errno, struct gf_lease *lease,
+ dict_t *xdata)
+{
+ afr_local_t *local = NULL;
+ int call_count = -1;
+
+ local = frame->local;
+ call_count = afr_frame_return(frame);
+ if (call_count == 0)
+ AFR_STACK_UNWIND(lease, frame, local->op_ret, local->op_errno, lease,
+ xdata);
-int
-afr_fentrylk_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
- int32_t op_ret, int32_t op_errno, dict_t *xdata)
+ return 0;
+}
+int32_t
+afr_lease_unlock(call_frame_t *frame, xlator_t *this)
{
- afr_local_t *local = NULL;
- int call_count = -1;
+ afr_local_t *local = NULL;
+ afr_private_t *priv = NULL;
+ int i = 0;
+ int call_count = 0;
- local = frame->local;
+ local = frame->local;
+ priv = this->private;
- LOCK (&frame->lock);
- {
- if (op_ret == 0)
- local->op_ret = 0;
+ call_count = afr_locked_nodes_count(local->cont.lease.locked_nodes,
+ priv->child_count);
- local->op_errno = op_errno;
- }
- UNLOCK (&frame->lock);
+ if (call_count == 0) {
+ AFR_STACK_UNWIND(lease, frame, local->op_ret, local->op_errno,
+ &local->cont.lease.ret_lease, NULL);
+ return 0;
+ }
- call_count = afr_frame_return (frame);
+ local->call_count = call_count;
- if (call_count == 0)
- AFR_STACK_UNWIND (fentrylk, frame, local->op_ret,
- local->op_errno, xdata);
+ local->cont.lease.user_lease.cmd = GF_UNLK_LEASE;
- return 0;
-}
+ for (i = 0; i < priv->child_count; i++) {
+ if (local->cont.lease.locked_nodes[i]) {
+ STACK_WIND(frame, afr_lease_unlock_cbk, priv->children[i],
+ priv->children[i]->fops->lease, &local->loc,
+ &local->cont.lease.user_lease, NULL);
+ if (!--call_count)
+ break;
+ }
+ }
-int
-afr_fentrylk (call_frame_t *frame, xlator_t *this, const char *volume, fd_t *fd,
- const char *basename, entrylk_cmd cmd, entrylk_type type,
- dict_t *xdata)
+ return 0;
+}
+
+int32_t
+afr_lease_cbk(call_frame_t *frame, void *cookie, xlator_t *this, int32_t op_ret,
+ int32_t op_errno, struct gf_lease *lease, dict_t *xdata)
{
- afr_private_t *priv = NULL;
- afr_local_t *local = NULL;
- int i = 0;
- int32_t call_count = 0;
- int32_t op_errno = ENOMEM;
+ afr_local_t *local = NULL;
+ afr_private_t *priv = NULL;
+ int child_index = -1;
- priv = this->private;
+ local = frame->local;
+ priv = this->private;
- local = AFR_FRAME_INIT (frame, op_errno);
- if (!local)
- goto out;
+ child_index = (long)cookie;
- call_count = local->call_count;
- if (!call_count) {
- op_errno = ENOTCONN;
- goto out;
- }
+ afr_common_lock_cbk(frame, cookie, this, op_ret, op_errno, xdata);
+ if (op_ret < 0 && op_errno == EAGAIN) {
+ local->op_ret = -1;
+ local->op_errno = EAGAIN;
- for (i = 0; i < priv->child_count; i++) {
- if (local->child_up[i]) {
- STACK_WIND (frame, afr_fentrylk_cbk,
- priv->children[i],
- priv->children[i]->fops->fentrylk,
- volume, fd, basename, cmd, type, xdata);
-
- if (!--call_count)
- break;
- }
- }
+ afr_lease_unlock(frame, this);
+ return 0;
+ }
+
+ if (op_ret == 0) {
+ local->op_ret = 0;
+ local->op_errno = 0;
+ local->cont.lease.locked_nodes[child_index] = 1;
+ local->cont.lease.ret_lease = *lease;
+ }
+
+ child_index++;
+ if (child_index < priv->child_count) {
+ STACK_WIND_COOKIE(frame, afr_lease_cbk, (void *)(long)child_index,
+ priv->children[child_index],
+ priv->children[child_index]->fops->lease, &local->loc,
+ &local->cont.lease.user_lease, xdata);
+ } else if (priv->quorum_count &&
+ !afr_has_quorum(local->cont.lease.locked_nodes, this, NULL)) {
+ local->op_ret = -1;
+ local->op_errno = afr_final_errno(local, priv);
- return 0;
-out:
- AFR_STACK_UNWIND (fentrylk, frame, -1, op_errno, NULL);
+ afr_lease_unlock(frame, this);
+ } else {
+ if (local->op_ret < 0)
+ local->op_errno = afr_final_errno(local, priv);
+ AFR_STACK_UNWIND(lease, frame, local->op_ret, local->op_errno,
+ &local->cont.lease.ret_lease, NULL);
+ }
- return 0;
+ return 0;
}
-
int
-afr_statfs_cbk (call_frame_t *frame, void *cookie, xlator_t *this, int op_ret,
- int op_errno, struct statvfs *statvfs, dict_t *xdata)
+afr_lease(call_frame_t *frame, xlator_t *this, loc_t *loc,
+ struct gf_lease *lease, dict_t *xdata)
{
- afr_local_t *local = NULL;
- int call_count = 0;
- struct statvfs *buf = NULL;
+ afr_private_t *priv = NULL;
+ afr_local_t *local = NULL;
+ int32_t op_errno = ENOMEM;
- LOCK (&frame->lock);
- {
- local = frame->local;
-
- if (op_ret != 0) {
- local->op_errno = op_errno;
- goto unlock;
- }
-
- local->op_ret = op_ret;
-
- buf = &local->cont.statfs.buf;
- if (local->cont.statfs.buf_set) {
- if (statvfs->f_bavail < buf->f_bavail) {
- *buf = *statvfs;
- if (xdata) {
- if (local->xdata_rsp)
- dict_unref (local->xdata_rsp);
- local->xdata_rsp = dict_ref (xdata);
- }
- }
- } else {
- *buf = *statvfs;
- local->cont.statfs.buf_set = 1;
- if (xdata)
- local->xdata_rsp = dict_ref (xdata);
- }
- }
-unlock:
- UNLOCK (&frame->lock);
+ priv = this->private;
- call_count = afr_frame_return (frame);
+ local = AFR_FRAME_INIT(frame, op_errno);
+ if (!local)
+ goto out;
- if (call_count == 0)
- AFR_STACK_UNWIND (statfs, frame, local->op_ret, local->op_errno,
- &local->cont.statfs.buf, local->xdata_rsp);
+ local->op = GF_FOP_LEASE;
+ local->cont.lease.locked_nodes = GF_CALLOC(
+ priv->child_count, sizeof(*local->cont.lease.locked_nodes),
+ gf_afr_mt_char);
- return 0;
+ if (!local->cont.lease.locked_nodes) {
+ op_errno = ENOMEM;
+ goto out;
+ }
+
+ loc_copy(&local->loc, loc);
+ local->cont.lease.user_lease = *lease;
+ local->cont.lease.ret_lease = *lease;
+
+ STACK_WIND_COOKIE(frame, afr_lease_cbk, (void *)(long)0, priv->children[0],
+ priv->children[0]->fops->lease, loc, lease, xdata);
+
+ return 0;
+out:
+ AFR_STACK_UNWIND(lease, frame, -1, op_errno, NULL, NULL);
+
+ return 0;
}
+int
+afr_ipc_cbk(call_frame_t *frame, void *cookie, xlator_t *this, int32_t op_ret,
+ int32_t op_errno, dict_t *xdata)
+{
+ afr_local_t *local = NULL;
+ int child_index = (long)cookie;
+ int call_count = 0;
+ gf_boolean_t failed = _gf_false;
+ gf_boolean_t succeeded = _gf_false;
+ int i = 0;
+ afr_private_t *priv = NULL;
+
+ local = frame->local;
+ priv = this->private;
+
+ local->replies[child_index].valid = 1;
+ local->replies[child_index].op_ret = op_ret;
+ local->replies[child_index].op_errno = op_errno;
+ if (xdata)
+ local->replies[child_index].xdata = dict_ref(xdata);
+
+ call_count = afr_frame_return(frame);
+ if (call_count)
+ goto out;
+ /* If any of the subvolumes failed with other than ENOTCONN
+ * return error else return success unless all the subvolumes
+ * failed.
+ * TODO: In case of failure, we need to unregister the xattrs
+ * from the other subvolumes where it succeeded (once upcall
+ * fixes the Bz-1371622)*/
+ for (i = 0; i < priv->child_count; i++) {
+ if (!local->replies[i].valid)
+ continue;
+ if (local->replies[i].op_ret < 0 &&
+ local->replies[i].op_errno != ENOTCONN) {
+ local->op_ret = local->replies[i].op_ret;
+ local->op_errno = local->replies[i].op_errno;
+ if (local->xdata_rsp)
+ dict_unref(local->xdata_rsp);
+ local->xdata_rsp = NULL;
+ if (local->replies[i].xdata) {
+ local->xdata_rsp = dict_ref(local->replies[i].xdata);
+ }
+ failed = _gf_true;
+ break;
+ }
+ if (local->replies[i].op_ret == 0) {
+ succeeded = _gf_true;
+ local->op_ret = 0;
+ local->op_errno = 0;
+ if (!local->xdata_rsp && local->replies[i].xdata) {
+ local->xdata_rsp = dict_ref(local->replies[i].xdata);
+ }
+ }
+ }
+
+ if (!succeeded && !failed) {
+ local->op_ret = -1;
+ local->op_errno = ENOTCONN;
+ }
+
+ AFR_STACK_UNWIND(ipc, frame, local->op_ret, local->op_errno,
+ local->xdata_rsp);
+
+out:
+ return 0;
+}
int
-afr_statfs (call_frame_t *frame, xlator_t *this, loc_t *loc, dict_t *xdata)
+afr_ipc(call_frame_t *frame, xlator_t *this, int32_t op, dict_t *xdata)
{
- afr_local_t * local = NULL;
- afr_private_t *priv = NULL;
- int i = 0;
- int call_count = 0;
- int32_t op_errno = ENOMEM;
+ afr_local_t *local = NULL;
+ int32_t op_errno = -1;
+ afr_private_t *priv = NULL;
+ int i = 0;
+ int call_cnt = -1;
+
+ VALIDATE_OR_GOTO(frame, err);
+ VALIDATE_OR_GOTO(this, err);
+
+ if (op != GF_IPC_TARGET_UPCALL)
+ goto wind_default;
- priv = this->private;
+ VALIDATE_OR_GOTO(this->private, err);
+ priv = this->private;
- local = AFR_FRAME_INIT (frame, op_errno);
- if (!local)
- goto out;
+ local = AFR_FRAME_INIT(frame, op_errno);
+ if (!local)
+ goto err;
- if (priv->arbiter_count == 1 && local->child_up[ARBITER_BRICK_INDEX])
- local->call_count--;
- call_count = local->call_count;
- if (!call_count) {
- op_errno = ENOTCONN;
- goto out;
- }
+ call_cnt = local->call_count;
+ if (xdata) {
for (i = 0; i < priv->child_count; i++) {
- if (local->child_up[i]) {
- if (AFR_IS_ARBITER_BRICK(priv, i))
- continue;
- STACK_WIND (frame, afr_statfs_cbk,
- priv->children[i],
- priv->children[i]->fops->statfs,
- loc, xdata);
- if (!--call_count)
- break;
- }
+ if (dict_set_int8(xdata, priv->pending_key[i], 0) < 0)
+ goto err;
}
+ }
- return 0;
-out:
- AFR_STACK_UNWIND (statfs, frame, -1, op_errno, NULL, NULL);
+ for (i = 0; i < priv->child_count; i++) {
+ if (!local->child_up[i])
+ continue;
- return 0;
-}
+ STACK_WIND_COOKIE(frame, afr_ipc_cbk, (void *)(long)i,
+ priv->children[i], priv->children[i]->fops->ipc, op,
+ xdata);
+ if (!--call_cnt)
+ break;
+ }
+ return 0;
+err:
+ if (op_errno == -1)
+ op_errno = errno;
+ AFR_STACK_UNWIND(ipc, frame, -1, op_errno, NULL);
-int32_t
-afr_lk_unlock_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
- int32_t op_ret, int32_t op_errno, struct gf_flock *lock,
- dict_t *xdata)
-{
- afr_local_t * local = NULL;
- int call_count = -1;
+ return 0;
- local = frame->local;
- call_count = afr_frame_return (frame);
+wind_default:
+ STACK_WIND(frame, default_ipc_cbk, FIRST_CHILD(this),
+ FIRST_CHILD(this)->fops->ipc, op, xdata);
+ return 0;
+}
- if (call_count == 0)
- AFR_STACK_UNWIND (lk, frame, local->op_ret, local->op_errno,
- lock, xdata);
+int
+afr_forget(xlator_t *this, inode_t *inode)
+{
+ uint64_t ctx_int = 0;
+ afr_inode_ctx_t *ctx = NULL;
+ afr_spb_choice_timeout_cancel(this, inode);
+ inode_ctx_del(inode, this, &ctx_int);
+ if (!ctx_int)
return 0;
+
+ ctx = (afr_inode_ctx_t *)(uintptr_t)ctx_int;
+ afr_inode_ctx_destroy(ctx);
+ return 0;
}
+int
+afr_priv_dump(xlator_t *this)
+{
+ afr_private_t *priv = NULL;
+ char key_prefix[GF_DUMP_MAX_BUF_LEN];
+ char key[GF_DUMP_MAX_BUF_LEN];
+ int i = 0;
+
+ GF_ASSERT(this);
+ priv = this->private;
+
+ GF_ASSERT(priv);
+ snprintf(key_prefix, GF_DUMP_MAX_BUF_LEN, "%s.%s", this->type, this->name);
+ gf_proc_dump_add_section("%s", key_prefix);
+ gf_proc_dump_write("child_count", "%u", priv->child_count);
+ for (i = 0; i < priv->child_count; i++) {
+ sprintf(key, "child_up[%d]", i);
+ gf_proc_dump_write(key, "%d", priv->child_up[i]);
+ sprintf(key, "pending_key[%d]", i);
+ gf_proc_dump_write(key, "%s", priv->pending_key[i]);
+ sprintf(key, "pending_reads[%d]", i);
+ gf_proc_dump_write(key, "%" PRId64,
+ GF_ATOMIC_GET(priv->pending_reads[i]));
+ sprintf(key, "child_latency[%d]", i);
+ gf_proc_dump_write(key, "%" PRId64, priv->child_latency[i]);
+ sprintf(key, "halo_child_up[%d]", i);
+ gf_proc_dump_write(key, "%d", priv->halo_child_up[i]);
+ }
+ gf_proc_dump_write("data_self_heal", "%d", priv->data_self_heal);
+ gf_proc_dump_write("metadata_self_heal", "%d", priv->metadata_self_heal);
+ gf_proc_dump_write("entry_self_heal", "%d", priv->entry_self_heal);
+ gf_proc_dump_write("read_child", "%d", priv->read_child);
+ gf_proc_dump_write("wait_count", "%u", priv->wait_count);
+ gf_proc_dump_write("heal-wait-queue-length", "%d", priv->heal_wait_qlen);
+ gf_proc_dump_write("heal-waiters", "%d", priv->heal_waiters);
+ gf_proc_dump_write("background-self-heal-count", "%d",
+ priv->background_self_heal_count);
+ gf_proc_dump_write("healers", "%d", priv->healers);
+ gf_proc_dump_write("read-hash-mode", "%d", priv->hash_mode);
+ gf_proc_dump_write("use-anonymous-inode", "%d", priv->use_anon_inode);
+ if (priv->quorum_count == AFR_QUORUM_AUTO) {
+ gf_proc_dump_write("quorum-type", "auto");
+ } else if (priv->quorum_count == 0) {
+ gf_proc_dump_write("quorum-type", "none");
+ } else {
+ gf_proc_dump_write("quorum-type", "fixed");
+ gf_proc_dump_write("quorum-count", "%d", priv->quorum_count);
+ }
+ gf_proc_dump_write("up", "%u", afr_has_quorum(priv->child_up, this, NULL));
+ if (priv->thin_arbiter_count) {
+ gf_proc_dump_write("ta_child_up", "%d", priv->ta_child_up);
+ gf_proc_dump_write("ta_bad_child_index", "%d",
+ priv->ta_bad_child_index);
+ gf_proc_dump_write("ta_notify_dom_lock_offset", "%" PRId64,
+ priv->ta_notify_dom_lock_offset);
+ }
+
+ return 0;
+}
-int32_t
-afr_lk_unlock (call_frame_t *frame, xlator_t *this)
+/**
+ * find_child_index - find the child's index in the array of subvolumes
+ * @this: AFR
+ * @child: child
+ */
+
+static int
+afr_find_child_index(xlator_t *this, xlator_t *child)
{
- afr_local_t * local = NULL;
- afr_private_t * priv = NULL;
- int i = 0;
- int call_count = 0;
+ afr_private_t *priv = NULL;
+ int child_count = -1;
+ int i = -1;
- local = frame->local;
- priv = this->private;
+ priv = this->private;
+ child_count = priv->child_count;
+ if (priv->thin_arbiter_count) {
+ child_count++;
+ }
- call_count = afr_locked_nodes_count (local->cont.lk.locked_nodes,
- priv->child_count);
+ for (i = 0; i < child_count; i++) {
+ if ((xlator_t *)child == priv->children[i])
+ break;
+ }
- if (call_count == 0) {
- AFR_STACK_UNWIND (lk, frame, local->op_ret, local->op_errno,
- &local->cont.lk.ret_flock, NULL);
- return 0;
- }
+ return i;
+}
+
+int
+__afr_get_up_children_count(afr_private_t *priv)
+{
+ int up_children = 0;
+ int i = 0;
- local->call_count = call_count;
+ for (i = 0; i < priv->child_count; i++)
+ if (priv->child_up[i] == 1)
+ up_children++;
- local->cont.lk.user_flock.l_type = F_UNLCK;
+ return up_children;
+}
- for (i = 0; i < priv->child_count; i++) {
- if (local->cont.lk.locked_nodes[i]) {
- STACK_WIND (frame, afr_lk_unlock_cbk,
- priv->children[i],
- priv->children[i]->fops->lk,
- local->fd, F_SETLK,
- &local->cont.lk.user_flock, NULL);
-
- if (!--call_count)
- break;
- }
- }
+static int
+__get_heard_from_all_status(xlator_t *this)
+{
+ afr_private_t *priv = this->private;
+ int i;
+ for (i = 0; i < priv->child_count; i++) {
+ if (!priv->last_event[i]) {
+ return 0;
+ }
+ }
+ if (priv->thin_arbiter_count && !priv->ta_child_up) {
return 0;
+ }
+ return 1;
}
-
-int32_t
-afr_lk_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
- int32_t op_ret, int32_t op_errno, struct gf_flock *lock, dict_t *xdata)
+glusterfs_event_t
+__afr_transform_event_from_state(xlator_t *this)
{
- afr_local_t *local = NULL;
- afr_private_t *priv = NULL;
- int child_index = -1;
-/* int ret = 0; */
+ int i = 0;
+ int up_children = 0;
+ afr_private_t *priv = this->private;
+ if (__get_heard_from_all_status(this))
+ /* have_heard_from_all. Let afr_notify() do the propagation. */
+ return GF_EVENT_MAXVAL;
- local = frame->local;
- priv = this->private;
+ up_children = __afr_get_up_children_count(priv);
+ /* Treat the children with pending notification, as having sent a
+ * GF_EVENT_CHILD_DOWN. i.e. set the event as GF_EVENT_SOME_DESCENDENT_DOWN,
+ * as done in afr_notify() */
+ for (i = 0; i < priv->child_count; i++) {
+ if (priv->last_event[i])
+ continue;
+ priv->last_event[i] = GF_EVENT_SOME_DESCENDENT_DOWN;
+ priv->child_up[i] = 0;
+ }
- child_index = (long) cookie;
+ if (up_children)
+ /* We received at least one child up */
+ return GF_EVENT_CHILD_UP;
+ else
+ return GF_EVENT_CHILD_DOWN;
- if (!child_went_down (op_ret, op_errno) && (op_ret == -1)) {
- local->op_ret = -1;
- local->op_errno = op_errno;
+ return GF_EVENT_MAXVAL;
+}
- afr_lk_unlock (frame, this);
- return 0;
- }
+static void
+afr_notify_cbk(void *data)
+{
+ xlator_t *this = data;
+ afr_private_t *priv = this->private;
+ glusterfs_event_t event = GF_EVENT_MAXVAL;
+ gf_boolean_t propagate = _gf_false;
+
+ LOCK(&priv->lock);
+ {
+ if (!priv->timer) {
+ /*
+ * Either child_up/child_down is already sent to parent.
+ * This is a spurious wake up.
+ */
+ goto unlock;
+ }
+ priv->timer = NULL;
+ event = __afr_transform_event_from_state(this);
+ if (event != GF_EVENT_MAXVAL)
+ propagate = _gf_true;
+ }
+unlock:
+ UNLOCK(&priv->lock);
+ if (propagate)
+ default_notify(this, event, NULL);
+}
- if (op_ret == 0) {
- local->op_ret = 0;
- local->op_errno = 0;
- local->cont.lk.locked_nodes[child_index] = 1;
- local->cont.lk.ret_flock = *lock;
- }
+static void
+__afr_launch_notify_timer(xlator_t *this, afr_private_t *priv)
+{
+ struct timespec delay = {
+ 0,
+ };
+
+ gf_msg_debug(this->name, 0, "Initiating child-down timer");
+ delay.tv_sec = 10;
+ delay.tv_nsec = 0;
+ priv->timer = gf_timer_call_after(this->ctx, delay, afr_notify_cbk, this);
+ if (priv->timer == NULL) {
+ gf_msg(this->name, GF_LOG_ERROR, 0, AFR_MSG_TIMER_CREATE_FAIL,
+ "Cannot create timer for delayed initialization");
+ }
+}
- child_index++;
+static int
+find_best_down_child(xlator_t *this)
+{
+ afr_private_t *priv = NULL;
+ int i = -1;
+ int32_t best_child = -1;
+ int64_t best_latency = INT64_MAX;
- if (child_index < priv->child_count) {
- STACK_WIND_COOKIE (frame, afr_lk_cbk, (void *) (long) child_index,
- priv->children[child_index],
- priv->children[child_index]->fops->lk,
- local->fd, local->cont.lk.cmd,
- &local->cont.lk.user_flock, xdata);
- } else if (local->op_ret == -1) {
- /* all nodes have gone down */
+ priv = this->private;
- AFR_STACK_UNWIND (lk, frame, -1, ENOTCONN,
- &local->cont.lk.ret_flock, NULL);
- } else {
- AFR_STACK_UNWIND (lk, frame, local->op_ret, local->op_errno,
- &local->cont.lk.ret_flock, NULL);
+ for (i = 0; i < priv->child_count; i++) {
+ if (!priv->child_up[i] && priv->child_latency[i] >= 0 &&
+ priv->child_latency[i] < best_latency) {
+ best_child = i;
+ best_latency = priv->child_latency[i];
}
-
- return 0;
+ }
+ if (best_child >= 0) {
+ gf_msg_debug(this->name, 0,
+ "Found best down child (%d) @ %" PRId64 " ms latency",
+ best_child, best_latency);
+ }
+ return best_child;
}
-
int
-afr_lk (call_frame_t *frame, xlator_t *this,
- fd_t *fd, int32_t cmd, struct gf_flock *flock, dict_t *xdata)
+find_worst_up_child(xlator_t *this)
{
- afr_private_t *priv = NULL;
- afr_local_t *local = NULL;
- int i = 0;
- int32_t op_errno = ENOMEM;
+ afr_private_t *priv = NULL;
+ int i = -1;
+ int32_t worst_child = -1;
+ int64_t worst_latency = INT64_MIN;
- priv = this->private;
+ priv = this->private;
- local = AFR_FRAME_INIT (frame, op_errno);
- if (!local)
- goto out;
-
- local->cont.lk.locked_nodes = GF_CALLOC (priv->child_count,
- sizeof (*local->cont.lk.locked_nodes),
- gf_afr_mt_char);
-
- if (!local->cont.lk.locked_nodes) {
- op_errno = ENOMEM;
- goto out;
+ for (i = 0; i < priv->child_count; i++) {
+ if (priv->child_up[i] && priv->child_latency[i] >= 0 &&
+ priv->child_latency[i] > worst_latency) {
+ worst_child = i;
+ worst_latency = priv->child_latency[i];
}
+ }
+ if (worst_child >= 0) {
+ gf_msg_debug(this->name, 0,
+ "Found worst up child (%d) @ %" PRId64 " ms latency",
+ worst_child, worst_latency);
+ }
+ return worst_child;
+}
- local->fd = fd_ref (fd);
- local->cont.lk.cmd = cmd;
- local->cont.lk.user_flock = *flock;
- local->cont.lk.ret_flock = *flock;
+void
+__afr_handle_ping_event(xlator_t *this, xlator_t *child_xlator, const int idx,
+ int64_t halo_max_latency_msec, int32_t *event,
+ int64_t child_latency_msec)
+{
+ afr_private_t *priv = NULL;
+ int up_children = 0;
- STACK_WIND_COOKIE (frame, afr_lk_cbk, (void *) (long) 0,
- priv->children[i],
- priv->children[i]->fops->lk,
- fd, cmd, flock, xdata);
+ priv = this->private;
- return 0;
-out:
- AFR_STACK_UNWIND (lk, frame, -1, op_errno, NULL, NULL);
+ priv->child_latency[idx] = child_latency_msec;
+ gf_msg_debug(child_xlator->name, 0, "Client ping @ %" PRId64 " ms",
+ child_latency_msec);
+ if (priv->shd.iamshd)
+ return;
- return 0;
+ up_children = __afr_get_up_children_count(priv);
+
+ if (child_latency_msec > halo_max_latency_msec &&
+ priv->child_up[idx] == 1 && up_children > priv->halo_min_replicas) {
+ if ((up_children - 1) < priv->halo_min_replicas) {
+ gf_log(child_xlator->name, GF_LOG_INFO,
+ "Overriding halo threshold, "
+ "min replicas: %d",
+ priv->halo_min_replicas);
+ } else {
+ gf_log(child_xlator->name, GF_LOG_INFO,
+ "Child latency (%" PRId64
+ " ms) "
+ "exceeds halo threshold (%" PRId64
+ "), "
+ "marking child down.",
+ child_latency_msec, halo_max_latency_msec);
+ if (priv->halo_child_up[idx]) {
+ *event = GF_EVENT_CHILD_DOWN;
+ }
+ }
+ } else if (child_latency_msec < halo_max_latency_msec &&
+ priv->child_up[idx] == 0) {
+ if (up_children < priv->halo_max_replicas) {
+ gf_log(child_xlator->name, GF_LOG_INFO,
+ "Child latency (%" PRId64
+ " ms) "
+ "below halo threshold (%" PRId64
+ "), "
+ "marking child up.",
+ child_latency_msec, halo_max_latency_msec);
+ if (priv->halo_child_up[idx]) {
+ *event = GF_EVENT_CHILD_UP;
+ }
+ } else {
+ gf_log(child_xlator->name, GF_LOG_INFO,
+ "Not marking child %d up, "
+ "max replicas (%d) reached.",
+ idx, priv->halo_max_replicas);
+ }
+ }
}
-int
-afr_forget (xlator_t *this, inode_t *inode)
+static int64_t
+afr_get_halo_latency(xlator_t *this)
{
- uint64_t ctx_int = 0;
- afr_inode_ctx_t *ctx = NULL;
+ afr_private_t *priv = NULL;
+ int64_t halo_max_latency_msec = 0;
- afr_spb_choice_timeout_cancel (this, inode);
- inode_ctx_del (inode, this, &ctx_int);
- if (!ctx_int)
- return 0;
+ priv = this->private;
- ctx = (afr_inode_ctx_t *)ctx_int;
- GF_FREE (ctx);
- return 0;
+ if (priv->shd.iamshd) {
+ halo_max_latency_msec = priv->shd.halo_max_latency_msec;
+ } else if (priv->nfsd.iamnfsd) {
+ halo_max_latency_msec = priv->nfsd.halo_max_latency_msec;
+ } else {
+ halo_max_latency_msec = priv->halo_max_latency_msec;
+ }
+ gf_msg_debug(this->name, 0, "Using halo latency %" PRId64,
+ halo_max_latency_msec);
+ return halo_max_latency_msec;
}
-int
-afr_priv_dump (xlator_t *this)
-{
- afr_private_t *priv = NULL;
- char key_prefix[GF_DUMP_MAX_BUF_LEN];
- char key[GF_DUMP_MAX_BUF_LEN];
- int i = 0;
+void
+__afr_handle_child_up_event(xlator_t *this, xlator_t *child_xlator,
+ const int idx, int64_t child_latency_msec,
+ int32_t *event, int32_t *call_psh,
+ int32_t *up_child)
+{
+ afr_private_t *priv = NULL;
+ int up_children = 0;
+ int worst_up_child = -1;
+ int64_t halo_max_latency_msec = afr_get_halo_latency(this);
+
+ priv = this->private;
+
+ /*
+ * This only really counts if the child was never up
+ * (value = -1) or had been down (value = 0). See
+ * comment at GF_EVENT_CHILD_DOWN for a more detailed
+ * explanation.
+ */
+ if (priv->child_up[idx] != 1) {
+ priv->event_generation++;
+ }
+ priv->child_up[idx] = 1;
+
+ *call_psh = 1;
+ *up_child = idx;
+ up_children = __afr_get_up_children_count(priv);
+ /*
+ * If this is an _actual_ CHILD_UP event, we
+ * want to set the child_latency to MAX to indicate
+ * the child needs ping data to be available before doing child-up
+ */
+ if (!priv->halo_enabled)
+ goto out;
+
+ if (child_latency_msec < 0) {
+ /*set to INT64_MAX-1 so that it is found for best_down_child*/
+ priv->halo_child_up[idx] = 1;
+ if (priv->child_latency[idx] < 0) {
+ priv->child_latency[idx] = AFR_HALO_MAX_LATENCY;
+ }
+ }
+
+ /*
+ * Handle the edge case where we exceed
+ * halo_min_replicas and we've got a child which is
+ * marked up as it was helping to satisfy the
+ * halo_min_replicas even though it's latency exceeds
+ * halo_max_latency_msec.
+ */
+ if (up_children > priv->halo_min_replicas) {
+ worst_up_child = find_worst_up_child(this);
+ if (worst_up_child >= 0 &&
+ priv->child_latency[worst_up_child] > halo_max_latency_msec) {
+ gf_msg_debug(this->name, 0,
+ "Marking child %d down, "
+ "doesn't meet halo threshold (%" PRId64
+ "), and > "
+ "halo_min_replicas (%d)",
+ worst_up_child, halo_max_latency_msec,
+ priv->halo_min_replicas);
+ priv->child_up[worst_up_child] = 0;
+ up_children--;
+ }
+ }
+
+ if (up_children > priv->halo_max_replicas && !priv->shd.iamshd) {
+ worst_up_child = find_worst_up_child(this);
+ if (worst_up_child < 0) {
+ worst_up_child = idx;
+ }
+ priv->child_up[worst_up_child] = 0;
+ up_children--;
+ gf_msg_debug(this->name, 0,
+ "Marking child %d down, "
+ "up_children (%d) > halo_max_replicas (%d)",
+ worst_up_child, up_children, priv->halo_max_replicas);
+ }
+out:
+ if (up_children == 1) {
+ gf_msg(this->name, GF_LOG_INFO, 0, AFR_MSG_SUBVOL_UP,
+ "Subvolume '%s' came back up; "
+ "going online.",
+ child_xlator->name);
+ gf_event(EVENT_AFR_SUBVOL_UP, "client-pid=%d; subvol=%s",
+ this->ctx->cmd_args.client_pid, this->name);
+ } else {
+ *event = GF_EVENT_SOME_DESCENDENT_UP;
+ }
+ priv->last_event[idx] = *event;
+}
- GF_ASSERT (this);
- priv = this->private;
+void
+__afr_handle_child_down_event(xlator_t *this, xlator_t *child_xlator, int idx,
+ int64_t child_latency_msec, int32_t *event,
+ int32_t *call_psh, int32_t *up_child)
+{
+ afr_private_t *priv = NULL;
+ int i = 0;
+ int up_children = 0;
+ int down_children = 0;
+ int best_down_child = -1;
+
+ priv = this->private;
+
+ /*
+ * If a brick is down when we start, we'll get a
+ * CHILD_DOWN to indicate its initial state. There
+ * was never a CHILD_UP in this case, so if we
+ * increment "down_count" the difference between than
+ * and "up_count" will no longer be the number of
+ * children that are currently up. This has serious
+ * implications e.g. for quorum enforcement, so we
+ * don't increment these values unless the event
+ * represents an actual state transition between "up"
+ * (value = 1) and anything else.
+ */
+ if (priv->child_up[idx] == 1) {
+ priv->event_generation++;
+ }
+
+ /*
+ * If this is an _actual_ CHILD_DOWN event, we
+ * want to set the child_latency to < 0 to indicate
+ * the child is really disconnected.
+ */
+ if (child_latency_msec < 0) {
+ priv->child_latency[idx] = child_latency_msec;
+ priv->halo_child_up[idx] = 0;
+ }
+ priv->child_up[idx] = 0;
+
+ up_children = __afr_get_up_children_count(priv);
+ /*
+ * Handle the edge case where we need to find the
+ * next best child (to mark up) as marking this child
+ * down would cause us to fall below halo_min_replicas.
+ * We will also force the SHD to heal this child _now_
+ * as we want it to be up to date if we are going to
+ * begin using it synchronously.
+ */
+ if (priv->halo_enabled && up_children < priv->halo_min_replicas) {
+ best_down_child = find_best_down_child(this);
+ if (best_down_child >= 0) {
+ gf_msg_debug(this->name, 0,
+ "Swapping out child %d for "
+ "child %d to satisfy halo_min_replicas (%d).",
+ idx, best_down_child, priv->halo_min_replicas);
+ priv->child_up[best_down_child] = 1;
+ *call_psh = 1;
+ *up_child = best_down_child;
+ }
+ }
+ for (i = 0; i < priv->child_count; i++)
+ if (priv->child_up[i] == 0)
+ down_children++;
+ if (down_children == priv->child_count) {
+ gf_msg(this->name, GF_LOG_ERROR, 0, AFR_MSG_SUBVOLS_DOWN,
+ "All subvolumes are down. Going "
+ "offline until at least one of them "
+ "comes back up.");
+ gf_event(EVENT_AFR_SUBVOLS_DOWN, "client-pid=%d; subvol=%s",
+ this->ctx->cmd_args.client_pid, this->name);
+ } else {
+ *event = GF_EVENT_SOME_DESCENDENT_DOWN;
+ }
+ priv->last_event[idx] = *event;
+}
- GF_ASSERT (priv);
- snprintf(key_prefix, GF_DUMP_MAX_BUF_LEN, "%s.%s", this->type, this->name);
- gf_proc_dump_add_section(key_prefix);
- gf_proc_dump_write("child_count", "%u", priv->child_count);
- for (i = 0; i < priv->child_count; i++) {
- sprintf (key, "child_up[%d]", i);
- gf_proc_dump_write(key, "%d", priv->child_up[i]);
- sprintf (key, "pending_key[%d]", i);
- gf_proc_dump_write(key, "%s", priv->pending_key[i]);
- }
- gf_proc_dump_write("data_self_heal", "%s", priv->data_self_heal);
- gf_proc_dump_write("metadata_self_heal", "%d", priv->metadata_self_heal);
- gf_proc_dump_write("entry_self_heal", "%d", priv->entry_self_heal);
- gf_proc_dump_write("data_change_log", "%d", priv->data_change_log);
- gf_proc_dump_write("metadata_change_log", "%d", priv->metadata_change_log);
- gf_proc_dump_write("entry-change_log", "%d", priv->entry_change_log);
- gf_proc_dump_write("read_child", "%d", priv->read_child);
- gf_proc_dump_write("favorite_child", "%d", priv->favorite_child);
- gf_proc_dump_write("wait_count", "%u", priv->wait_count);
- gf_proc_dump_write("quorum-reads", "%d", priv->quorum_reads);
+void
+afr_ta_lock_release_synctask(xlator_t *this)
+{
+ call_frame_t *ta_frame = NULL;
+ int ret = 0;
- return 0;
+ ta_frame = afr_ta_frame_create(this);
+ if (!ta_frame) {
+ gf_msg(this->name, GF_LOG_ERROR, ENOMEM, AFR_MSG_THIN_ARB,
+ "Failed to create ta_frame");
+ return;
+ }
+
+ ret = synctask_new(this->ctx->env, afr_release_notify_lock_for_ta,
+ afr_ta_lock_release_done, ta_frame, this);
+ if (ret) {
+ STACK_DESTROY(ta_frame->root);
+ gf_msg(this->name, GF_LOG_ERROR, ENOMEM, AFR_MSG_THIN_ARB,
+ "Failed to release "
+ "AFR_TA_DOM_NOTIFY lock.");
+ }
}
+static void
+afr_handle_inodelk_contention(xlator_t *this, struct gf_upcall *upcall)
+{
+ struct gf_upcall_inodelk_contention *lc = NULL;
+ unsigned int inmem_count = 0;
+ unsigned int onwire_count = 0;
+ afr_private_t *priv = this->private;
-/**
- * find_child_index - find the child's index in the array of subvolumes
- * @this: AFR
- * @child: child
- */
+ lc = upcall->data;
-static int
-find_child_index (xlator_t *this, xlator_t *child)
-{
- afr_private_t *priv = NULL;
- int i = -1;
+ if (strcmp(lc->domain, AFR_TA_DOM_NOTIFY) != 0)
+ return;
- priv = this->private;
+ if (priv->shd.iamshd) {
+ /* shd should ignore AFR_TA_DOM_NOTIFY release requests. */
+ return;
+ }
+ LOCK(&priv->lock);
+ {
+ if (priv->release_ta_notify_dom_lock == _gf_true) {
+ /* Ignore multiple release requests from shds.*/
+ UNLOCK(&priv->lock);
+ return;
+ }
+ priv->release_ta_notify_dom_lock = _gf_true;
+ inmem_count = priv->ta_in_mem_txn_count;
+ onwire_count = priv->ta_on_wire_txn_count;
+ }
+ UNLOCK(&priv->lock);
+ if (inmem_count || onwire_count)
+ /* lock release will happen in txn code path after
+ * in-memory or on-wire txns are over.*/
+ return;
- for (i = 0; i < priv->child_count; i++) {
- if ((xlator_t *) child == priv->children[i])
- break;
- }
+ afr_ta_lock_release_synctask(this);
+}
- return i;
+static void
+afr_handle_upcall_event(xlator_t *this, struct gf_upcall *upcall)
+{
+ struct gf_upcall_cache_invalidation *up_ci = NULL;
+ afr_private_t *priv = this->private;
+ inode_t *inode = NULL;
+ inode_table_t *itable = NULL;
+ int i = 0;
+
+ switch (upcall->event_type) {
+ case GF_UPCALL_INODELK_CONTENTION:
+ afr_handle_inodelk_contention(this, upcall);
+ break;
+ case GF_UPCALL_CACHE_INVALIDATION:
+ up_ci = (struct gf_upcall_cache_invalidation *)upcall->data;
+
+ /* Since md-cache will be aggressively filtering
+ * lookups, the stale read issue will be more
+ * pronounced. Hence when a pending xattr is set notify
+ * all the md-cache clients to invalidate the existing
+ * stat cache and send the lookup next time */
+ if (!up_ci->dict)
+ break;
+ for (i = 0; i < priv->child_count; i++) {
+ if (!dict_get(up_ci->dict, priv->pending_key[i]))
+ continue;
+ up_ci->flags |= UP_INVAL_ATTR;
+ itable = ((xlator_t *)this->graph->top)->itable;
+ /*Internal processes may not have itable for
+ *top xlator*/
+ if (itable)
+ inode = inode_find(itable, upcall->gfid);
+ if (inode)
+ afr_inode_need_refresh_set(inode, this);
+ break;
+ }
+ break;
+ default:
+ break;
+ }
}
int32_t
-afr_notify (xlator_t *this, int32_t event,
- void *data, void *data2)
-{
- afr_private_t *priv = NULL;
- int i = -1;
- int up_children = 0;
- int down_children = 0;
- int propagate = 0;
- int had_heard_from_all = 0;
- int have_heard_from_all = 0;
- int idx = -1;
- int ret = -1;
- int call_psh = 0;
- dict_t *input = NULL;
- dict_t *output = NULL;
- gf_boolean_t had_quorum = _gf_false;
- gf_boolean_t has_quorum = _gf_false;
-
- priv = this->private;
-
- if (!priv)
- return 0;
-
- /*
- * We need to reset this in case children come up in "staggered"
- * fashion, so that we discover a late-arriving local subvolume. Note
- * that we could end up issuing N lookups to the first subvolume, and
- * O(N^2) overall, but N is small for AFR so it shouldn't be an issue.
- */
- priv->did_discovery = _gf_false;
+afr_notify(xlator_t *this, int32_t event, void *data, void *data2)
+{
+ afr_private_t *priv = NULL;
+ xlator_t *child_xlator = NULL;
+ int i = -1;
+ int propagate = 0;
+ int had_heard_from_all = 0;
+ int have_heard_from_all = 0;
+ int idx = -1;
+ int ret = -1;
+ int call_psh = 0;
+ int up_child = -1;
+ dict_t *input = NULL;
+ dict_t *output = NULL;
+ gf_boolean_t had_quorum = _gf_false;
+ gf_boolean_t has_quorum = _gf_false;
+ int64_t halo_max_latency_msec = 0;
+ int64_t child_latency_msec = -1;
+
+ child_xlator = (xlator_t *)data;
+
+ priv = this->private;
+
+ if (!priv)
+ return 0;
- had_heard_from_all = 1;
- for (i = 0; i < priv->child_count; i++) {
- if (!priv->last_event[i]) {
- had_heard_from_all = 0;
- }
+ /*
+ * We need to reset this in case children come up in "staggered"
+ * fashion, so that we discover a late-arriving local subvolume. Note
+ * that we could end up issuing N lookups to the first subvolume, and
+ * O(N^2) overall, but N is small for AFR so it shouldn't be an issue.
+ */
+ priv->did_discovery = _gf_false;
+
+ /* parent xlators don't need to know about every child_up, child_down
+ * because of afr ha. If all subvolumes go down, child_down has
+ * to be triggered. In that state when 1 subvolume comes up child_up
+ * needs to be triggered. dht optimizes revalidate lookup by sending
+ * it only to one of its subvolumes. When child up/down happens
+ * for afr's subvolumes dht should be notified by child_modified. The
+ * subsequent revalidate lookup happens on all the dht's subvolumes
+ * which triggers afr self-heals if any.
+ */
+ idx = afr_find_child_index(this, child_xlator);
+ if (idx < 0) {
+ gf_msg(this->name, GF_LOG_ERROR, 0, AFR_MSG_INVALID_CHILD_UP,
+ "Received child_up from invalid subvolume");
+ goto out;
+ }
+
+ had_quorum = priv->quorum_count &&
+ afr_has_quorum(priv->child_up, this, NULL);
+ if (event == GF_EVENT_CHILD_PING) {
+ child_latency_msec = (int64_t)(uintptr_t)data2;
+ if (priv->halo_enabled) {
+ halo_max_latency_msec = afr_get_halo_latency(this);
+
+ /* Calculates the child latency and sets event
+ */
+ LOCK(&priv->lock);
+ {
+ __afr_handle_ping_event(this, child_xlator, idx,
+ halo_max_latency_msec, &event,
+ child_latency_msec);
+ }
+ UNLOCK(&priv->lock);
+ } else {
+ LOCK(&priv->lock);
+ {
+ priv->child_latency[idx] = child_latency_msec;
+ }
+ UNLOCK(&priv->lock);
}
+ }
- /* parent xlators dont need to know about every child_up, child_down
- * because of afr ha. If all subvolumes go down, child_down has
- * to be triggered. In that state when 1 subvolume comes up child_up
- * needs to be triggered. dht optimizes revalidate lookup by sending
- * it only to one of its subvolumes. When child up/down happens
- * for afr's subvolumes dht should be notified by child_modified. The
- * subsequent revalidate lookup happens on all the dht's subvolumes
- * which triggers afr self-heals if any.
+ if (event == GF_EVENT_CHILD_PING) {
+ /* This is the only xlator that handles PING, no reason to
+ * propagate.
*/
- idx = find_child_index (this, data);
- if (idx < 0) {
- gf_msg (this->name, GF_LOG_ERROR, 0, AFR_MSG_INVALID_CHILD_UP,
- "Received child_up from invalid subvolume");
- goto out;
+ goto out;
+ }
+
+ if (event == GF_EVENT_TRANSLATOR_OP) {
+ LOCK(&priv->lock);
+ {
+ had_heard_from_all = __get_heard_from_all_status(this);
}
+ UNLOCK(&priv->lock);
- had_quorum = priv->quorum_count && afr_has_quorum (priv->child_up,
- this);
+ if (!had_heard_from_all) {
+ ret = -1;
+ } else {
+ input = data;
+ output = data2;
+ ret = afr_xl_op(this, input, output);
+ }
+ goto out;
+ }
+
+ if (event == GF_EVENT_UPCALL) {
+ afr_handle_upcall_event(this, data);
+ }
+
+ LOCK(&priv->lock);
+ {
+ had_heard_from_all = __get_heard_from_all_status(this);
switch (event) {
- case GF_EVENT_CHILD_UP:
- LOCK (&priv->lock);
- {
- /*
- * This only really counts if the child was never up
- * (value = -1) or had been down (value = 0). See
- * comment at GF_EVENT_CHILD_DOWN for a more detailed
- * explanation.
- */
- if (priv->child_up[idx] != 1) {
- priv->up_count++;
- priv->event_generation++;
- }
- priv->child_up[idx] = 1;
-
- call_psh = 1;
- for (i = 0; i < priv->child_count; i++)
- if (priv->child_up[i] == 1)
- up_children++;
- if (up_children == 1) {
- gf_msg (this->name, GF_LOG_INFO, 0,
- AFR_MSG_SUBVOL_UP,
- "Subvolume '%s' came back up; "
- "going online.", ((xlator_t *)data)->name);
- } else {
- event = GF_EVENT_CHILD_MODIFIED;
- }
-
- priv->last_event[idx] = event;
+ case GF_EVENT_PARENT_UP:
+ __afr_launch_notify_timer(this, priv);
+ propagate = 1;
+ break;
+ case GF_EVENT_CHILD_UP:
+ if (priv->thin_arbiter_count &&
+ (idx == AFR_CHILD_THIN_ARBITER)) {
+ priv->ta_child_up = 1;
+ priv->ta_event_gen++;
+ break;
}
- UNLOCK (&priv->lock);
-
+ __afr_handle_child_up_event(this, child_xlator, idx,
+ child_latency_msec, &event,
+ &call_psh, &up_child);
+ __afr_lock_heal_synctask(this, priv, idx);
break;
- case GF_EVENT_CHILD_DOWN:
- LOCK (&priv->lock);
- {
- /*
- * If a brick is down when we start, we'll get a
- * CHILD_DOWN to indicate its initial state. There
- * was never a CHILD_UP in this case, so if we
- * increment "down_count" the difference between than
- * and "up_count" will no longer be the number of
- * children that are currently up. This has serious
- * implications e.g. for quorum enforcement, so we
- * don't increment these values unless the event
- * represents an actual state transition between "up"
- * (value = 1) and anything else.
- */
- if (priv->child_up[idx] == 1) {
- priv->down_count++;
- priv->event_generation++;
- }
- priv->child_up[idx] = 0;
-
- for (i = 0; i < priv->child_count; i++)
- if (priv->child_up[i] == 0)
- down_children++;
- if (down_children == priv->child_count) {
- gf_msg (this->name, GF_LOG_ERROR, 0,
- AFR_MSG_ALL_SUBVOLS_DOWN,
- "All subvolumes are down. Going offline "
- "until atleast one of them comes back up.");
- } else {
- event = GF_EVENT_SOME_CHILD_DOWN;
- }
-
- priv->last_event[idx] = event;
+ case GF_EVENT_CHILD_DOWN:
+ if (priv->thin_arbiter_count &&
+ (idx == AFR_CHILD_THIN_ARBITER)) {
+ priv->ta_child_up = 0;
+ priv->ta_event_gen++;
+ afr_ta_locked_priv_invalidate(priv);
+ break;
}
- UNLOCK (&priv->lock);
-
+ __afr_handle_child_down_event(this, child_xlator, idx,
+ child_latency_msec, &event,
+ &call_psh, &up_child);
+ __afr_mark_pending_lk_heal(this, priv, idx);
break;
- case GF_EVENT_CHILD_CONNECTING:
- LOCK (&priv->lock);
- {
- priv->last_event[idx] = event;
- }
- UNLOCK (&priv->lock);
+ case GF_EVENT_CHILD_CONNECTING:
+ priv->last_event[idx] = event;
break;
- case GF_EVENT_TRANSLATOR_OP:
- input = data;
- output = data2;
- if (!had_heard_from_all) {
- ret = -1;
- goto out;
- }
- ret = afr_xl_op (this, input, output);
- goto out;
+ case GF_EVENT_SOME_DESCENDENT_DOWN:
+ priv->last_event[idx] = event;
break;
-
- default:
+ default:
propagate = 1;
break;
}
-
- if (priv->quorum_count) {
- has_quorum = afr_has_quorum (priv->child_up, this);
- if (!had_quorum && has_quorum)
- gf_msg (this->name, GF_LOG_INFO, 0, AFR_MSG_QUORUM_MET,
- "Client-quorum is met");
- if (had_quorum && !has_quorum)
- gf_msg (this->name, GF_LOG_WARNING, 0,
- AFR_MSG_QUORUM_FAIL,
- "Client-quorum is not met");
- }
-
- /* have all subvolumes reported status once by now? */
- have_heard_from_all = 1;
- for (i = 0; i < priv->child_count; i++) {
- if (!priv->last_event[i])
- have_heard_from_all = 0;
- }
-
- /* if all subvols have reported status, no need to hide anything
- or wait for anything else. Just propagate blindly */
- if (have_heard_from_all)
- propagate = 1;
-
+ have_heard_from_all = __get_heard_from_all_status(this);
if (!had_heard_from_all && have_heard_from_all) {
- /* This is the first event which completes aggregation
- of events from all subvolumes. If at least one subvol
- had come up, propagate CHILD_UP, but only this time
- */
- event = GF_EVENT_CHILD_DOWN;
-
- LOCK (&priv->lock);
- {
- up_children = AFR_COUNT (priv->child_up, priv->child_count);
- for (i = 0; i < priv->child_count; i++) {
- if (priv->last_event[i] == GF_EVENT_CHILD_UP) {
- event = GF_EVENT_CHILD_UP;
- break;
- }
-
- if (priv->last_event[i] ==
- GF_EVENT_CHILD_CONNECTING) {
- event = GF_EVENT_CHILD_CONNECTING;
- /* continue to check other events for CHILD_UP */
- }
- }
+ if (priv->timer) {
+ gf_timer_call_cancel(this->ctx, priv->timer);
+ priv->timer = NULL;
+ }
+ /* This is the first event which completes aggregation
+ of events from all subvolumes. If at least one subvol
+ had come up, propagate CHILD_UP, but only this time
+ */
+ event = GF_EVENT_CHILD_DOWN;
+ for (i = 0; i < priv->child_count; i++) {
+ if (priv->last_event[i] == GF_EVENT_CHILD_UP) {
+ event = GF_EVENT_CHILD_UP;
+ break;
}
- UNLOCK (&priv->lock);
- }
- ret = 0;
- if (propagate)
- ret = default_notify (this, event, data);
-
- if ((!had_heard_from_all) || call_psh) {
- /* Launch self-heal on all local subvolumes if:
- * a) We have_heard_from_all for the first time
- * b) Already heard from everyone, but we now got a child-up
- * event.
- */
- if (have_heard_from_all && priv->shd.iamshd) {
- for (i = 0; i < priv->child_count; i++)
- if (priv->child_up[i])
- afr_selfheal_childup (this, i);
+ if (priv->last_event[i] == GF_EVENT_CHILD_CONNECTING) {
+ event = GF_EVENT_CHILD_CONNECTING;
+ /* continue to check other events for CHILD_UP */
}
+ }
+ }
+ }
+ UNLOCK(&priv->lock);
+
+ if (priv->quorum_count) {
+ has_quorum = afr_has_quorum(priv->child_up, this, NULL);
+ if (!had_quorum && has_quorum) {
+ gf_msg(this->name, GF_LOG_INFO, 0, AFR_MSG_QUORUM_MET,
+ "Client-quorum is met");
+ gf_event(EVENT_AFR_QUORUM_MET, "client-pid=%d; subvol=%s",
+ this->ctx->cmd_args.client_pid, this->name);
+ }
+ if (had_quorum && !has_quorum) {
+ gf_msg(this->name, GF_LOG_WARNING, 0, AFR_MSG_QUORUM_FAIL,
+ "Client-quorum is not met");
+ gf_event(EVENT_AFR_QUORUM_FAIL, "client-pid=%d; subvol=%s",
+ this->ctx->cmd_args.client_pid, this->name);
+ }
+ }
+
+ /* if all subvols have reported status, no need to hide anything
+ or wait for anything else. Just propagate blindly */
+ if (have_heard_from_all)
+ propagate = 1;
+
+ ret = 0;
+ if (propagate)
+ ret = default_notify(this, event, data);
+
+ if ((!had_heard_from_all) || call_psh) {
+ /* Launch self-heal on all local subvolumes if:
+ * a) We have_heard_from_all for the first time
+ * b) Already heard from everyone, but we now got a child-up
+ * event.
+ */
+ if (have_heard_from_all) {
+ afr_selfheal_childup(this, priv);
}
+ }
out:
- return ret;
+ return ret;
}
-
int
-afr_local_init (afr_local_t *local, afr_private_t *priv, int32_t *op_errno)
-{
- local->op_ret = -1;
- local->op_errno = EUCLEAN;
-
- syncbarrier_init (&local->barrier);
-
- local->child_up = GF_CALLOC (priv->child_count,
- sizeof (*local->child_up),
- gf_afr_mt_char);
- if (!local->child_up) {
- if (op_errno)
- *op_errno = ENOMEM;
- goto out;
- }
-
- memcpy (local->child_up, priv->child_up,
- sizeof (*local->child_up) * priv->child_count);
- local->call_count = AFR_COUNT (local->child_up, priv->child_count);
- if (local->call_count == 0) {
- gf_msg (THIS->name, GF_LOG_INFO, 0,
- AFR_MSG_ALL_SUBVOLS_DOWN, "no subvolumes up");
- if (op_errno)
- *op_errno = ENOTCONN;
- goto out;
- }
- local->event_generation = priv->event_generation;
-
- local->read_attempted = GF_CALLOC (priv->child_count, sizeof (char),
- gf_afr_mt_char);
- if (!local->read_attempted) {
- if (op_errno)
- *op_errno = ENOMEM;
- goto out;
- }
-
- local->readable = GF_CALLOC (priv->child_count, sizeof (char),
- gf_afr_mt_char);
- if (!local->readable) {
- if (op_errno)
- *op_errno = ENOMEM;
- goto out;
- }
-
- local->replies = GF_CALLOC(priv->child_count, sizeof(*local->replies),
- gf_afr_mt_reply_t);
- if (!local->replies) {
- if (op_errno)
- *op_errno = ENOMEM;
- goto out;
- }
-
- return 0;
+afr_local_init(afr_local_t *local, afr_private_t *priv, int32_t *op_errno)
+{
+ int __ret = -1;
+ local->op_ret = -1;
+ local->op_errno = EUCLEAN;
+
+ __ret = syncbarrier_init(&local->barrier);
+ if (__ret) {
+ if (op_errno)
+ *op_errno = __ret;
+ goto out;
+ }
+
+ local->child_up = GF_MALLOC(priv->child_count * sizeof(*local->child_up),
+ gf_afr_mt_char);
+ if (!local->child_up) {
+ if (op_errno)
+ *op_errno = ENOMEM;
+ goto out;
+ }
+
+ memcpy(local->child_up, priv->child_up,
+ sizeof(*local->child_up) * priv->child_count);
+ local->call_count = AFR_COUNT(local->child_up, priv->child_count);
+ if (local->call_count == 0) {
+ gf_msg(THIS->name, GF_LOG_INFO, 0, AFR_MSG_SUBVOLS_DOWN,
+ "no subvolumes up");
+ if (op_errno)
+ *op_errno = ENOTCONN;
+ goto out;
+ }
+
+ local->event_generation = priv->event_generation;
+
+ local->read_attempted = GF_CALLOC(priv->child_count, sizeof(char),
+ gf_afr_mt_char);
+ if (!local->read_attempted) {
+ if (op_errno)
+ *op_errno = ENOMEM;
+ goto out;
+ }
+
+ local->readable = GF_CALLOC(priv->child_count, sizeof(char),
+ gf_afr_mt_char);
+ if (!local->readable) {
+ if (op_errno)
+ *op_errno = ENOMEM;
+ goto out;
+ }
+
+ local->readable2 = GF_CALLOC(priv->child_count, sizeof(char),
+ gf_afr_mt_char);
+ if (!local->readable2) {
+ if (op_errno)
+ *op_errno = ENOMEM;
+ goto out;
+ }
+
+ local->read_subvol = -1;
+
+ local->replies = GF_CALLOC(priv->child_count, sizeof(*local->replies),
+ gf_afr_mt_reply_t);
+ if (!local->replies) {
+ if (op_errno)
+ *op_errno = ENOMEM;
+ goto out;
+ }
+
+ local->need_full_crawl = _gf_false;
+ if (priv->thin_arbiter_count) {
+ local->ta_child_up = priv->ta_child_up;
+ local->ta_failed_subvol = AFR_CHILD_UNKNOWN;
+ local->read_txn_query_child = AFR_CHILD_UNKNOWN;
+ local->ta_event_gen = priv->ta_event_gen;
+ local->fop_state = TA_SUCCESS;
+ }
+ local->is_new_entry = _gf_false;
+
+ INIT_LIST_HEAD(&local->healer);
+ return 0;
out:
- return -1;
+ return -1;
}
int
-afr_internal_lock_init (afr_internal_lock_t *lk, size_t child_count,
- transaction_lk_type_t lk_type)
+afr_internal_lock_init(afr_internal_lock_t *lk, size_t child_count)
{
- int ret = -ENOMEM;
+ int ret = -ENOMEM;
- lk->locked_nodes = GF_CALLOC (sizeof (*lk->locked_nodes),
- child_count, gf_afr_mt_char);
- if (NULL == lk->locked_nodes)
- goto out;
+ lk->lower_locked_nodes = GF_CALLOC(sizeof(*lk->lower_locked_nodes),
+ child_count, gf_afr_mt_char);
+ if (NULL == lk->lower_locked_nodes)
+ goto out;
- lk->lower_locked_nodes = GF_CALLOC (sizeof (*lk->lower_locked_nodes),
- child_count, gf_afr_mt_char);
- if (NULL == lk->lower_locked_nodes)
- goto out;
+ lk->lock_op_ret = -1;
+ lk->lock_op_errno = EUCLEAN;
- lk->lock_op_ret = -1;
- lk->lock_op_errno = EUCLEAN;
- lk->transaction_lk_type = lk_type;
-
- ret = 0;
+ ret = 0;
out:
- return ret;
+ return ret;
}
void
-afr_matrix_cleanup (int32_t **matrix, unsigned int m)
+afr_matrix_cleanup(int32_t **matrix, unsigned int m)
{
- int i = 0;
+ int i = 0;
- if (!matrix)
- goto out;
- for (i = 0; i < m; i++) {
- GF_FREE (matrix[i]);
- }
+ if (!matrix)
+ goto out;
+ for (i = 0; i < m; i++) {
+ GF_FREE(matrix[i]);
+ }
- GF_FREE (matrix);
+ GF_FREE(matrix);
out:
- return;
+ return;
}
-int32_t**
-afr_matrix_create (unsigned int m, unsigned int n)
+int32_t **
+afr_matrix_create(unsigned int m, unsigned int n)
{
- int32_t **matrix = NULL;
- int i = 0;
+ int32_t **matrix = NULL;
+ int i = 0;
- matrix = GF_CALLOC (sizeof (*matrix), m, gf_afr_mt_int32_t);
- if (!matrix)
- goto out;
+ matrix = GF_CALLOC(sizeof(*matrix), m, gf_afr_mt_int32_t);
+ if (!matrix)
+ goto out;
- for (i = 0; i < m; i++) {
- matrix[i] = GF_CALLOC (sizeof (*matrix[i]), n,
- gf_afr_mt_int32_t);
- if (!matrix[i])
- goto out;
- }
- return matrix;
+ for (i = 0; i < m; i++) {
+ matrix[i] = GF_CALLOC(sizeof(*matrix[i]), n, gf_afr_mt_int32_t);
+ if (!matrix[i])
+ goto out;
+ }
+ return matrix;
out:
- afr_matrix_cleanup (matrix, m);
- return NULL;
+ afr_matrix_cleanup(matrix, m);
+ return NULL;
}
int
-afr_inodelk_init (afr_inodelk_t *lk, char *dom, size_t child_count)
-{
- int ret = -ENOMEM;
-
- lk->domain = dom;
- lk->locked_nodes = GF_CALLOC (sizeof (*lk->locked_nodes),
- child_count, gf_afr_mt_char);
- if (NULL == lk->locked_nodes)
- goto out;
- ret = 0;
+afr_transaction_local_init(afr_local_t *local, xlator_t *this)
+{
+ int ret = -ENOMEM;
+ afr_private_t *priv = NULL;
+
+ priv = this->private;
+ INIT_LIST_HEAD(&local->transaction.wait_list);
+ INIT_LIST_HEAD(&local->transaction.owner_list);
+ INIT_LIST_HEAD(&local->ta_waitq);
+ INIT_LIST_HEAD(&local->ta_onwireq);
+ ret = afr_internal_lock_init(&local->internal_lock, priv->child_count);
+ if (ret < 0)
+ goto out;
+
+ ret = -ENOMEM;
+ local->pre_op_compat = priv->pre_op_compat;
+
+ local->transaction.pre_op = GF_CALLOC(sizeof(*local->transaction.pre_op),
+ priv->child_count, gf_afr_mt_char);
+ if (!local->transaction.pre_op)
+ goto out;
+
+ local->transaction.changelog_xdata = GF_CALLOC(
+ sizeof(*local->transaction.changelog_xdata), priv->child_count,
+ gf_afr_mt_dict_t);
+ if (!local->transaction.changelog_xdata)
+ goto out;
+
+ if (priv->arbiter_count == 1) {
+ local->transaction.pre_op_sources = GF_CALLOC(
+ sizeof(*local->transaction.pre_op_sources), priv->child_count,
+ gf_afr_mt_char);
+ if (!local->transaction.pre_op_sources)
+ goto out;
+ }
+
+ local->transaction.failed_subvols = GF_CALLOC(
+ sizeof(*local->transaction.failed_subvols), priv->child_count,
+ gf_afr_mt_char);
+ if (!local->transaction.failed_subvols)
+ goto out;
+
+ local->pending = afr_matrix_create(priv->child_count, AFR_NUM_CHANGE_LOGS);
+ if (!local->pending)
+ goto out;
+
+ ret = 0;
out:
- return ret;
+ return ret;
}
-int
-afr_transaction_local_init (afr_local_t *local, xlator_t *this)
+void
+afr_set_low_priority(call_frame_t *frame)
{
- int child_up_count = 0;
- int ret = -ENOMEM;
- afr_private_t *priv = NULL;
+ frame->root->pid = LOW_PRIO_PROC_PID;
+}
- priv = this->private;
- ret = afr_internal_lock_init (&local->internal_lock, priv->child_count,
- AFR_TRANSACTION_LK);
- if (ret < 0)
- goto out;
+void
+afr_priv_destroy(afr_private_t *priv)
+{
+ int i = 0;
+ int child_count = -1;
+
+ if (!priv)
+ goto out;
+
+ GF_FREE(priv->sh_domain);
+ GF_FREE(priv->last_event);
+
+ child_count = priv->child_count;
+ if (priv->thin_arbiter_count) {
+ child_count++;
+ }
+ if (priv->pending_key) {
+ for (i = 0; i < child_count; i++)
+ GF_FREE(priv->pending_key[i]);
+ }
+
+ GF_FREE(priv->pending_reads);
+ GF_FREE(priv->local);
+ GF_FREE(priv->pending_key);
+ GF_FREE(priv->children);
+ GF_FREE(priv->anon_inode);
+ GF_FREE(priv->child_up);
+ GF_FREE(priv->halo_child_up);
+ GF_FREE(priv->child_latency);
+ LOCK_DESTROY(&priv->lock);
+
+ GF_FREE(priv);
+out:
+ return;
+}
- if ((local->transaction.type == AFR_DATA_TRANSACTION) ||
- (local->transaction.type == AFR_METADATA_TRANSACTION)) {
- ret = afr_inodelk_init (&local->internal_lock.inodelk[0],
- this->name, priv->child_count);
- if (ret < 0)
- goto out;
- }
+int **
+afr_mark_pending_changelog(afr_private_t *priv, unsigned char *pending,
+ dict_t *xattr, ia_type_t iat)
+{
+ int i = 0;
+ int **changelog = NULL;
+ int idx = -1;
+ int m_idx = 0;
+ int d_idx = 0;
+ int ret = 0;
- ret = -ENOMEM;
- child_up_count = AFR_COUNT (local->child_up, priv->child_count);
- if (priv->optimistic_change_log && child_up_count == priv->child_count)
- local->optimistic_change_log = 1;
+ m_idx = afr_index_for_transaction_type(AFR_METADATA_TRANSACTION);
+ d_idx = afr_index_for_transaction_type(AFR_DATA_TRANSACTION);
- local->pre_op_compat = priv->pre_op_compat;
+ idx = afr_index_from_ia_type(iat);
- local->transaction.eager_lock =
- GF_CALLOC (sizeof (*local->transaction.eager_lock),
- priv->child_count,
- gf_afr_mt_int32_t);
+ changelog = afr_matrix_create(priv->child_count, AFR_NUM_CHANGE_LOGS);
+ if (!changelog)
+ goto out;
- if (!local->transaction.eager_lock)
- goto out;
+ for (i = 0; i < priv->child_count; i++) {
+ if (!pending[i])
+ continue;
- local->transaction.pre_op = GF_CALLOC (sizeof (*local->transaction.pre_op),
- priv->child_count,
- gf_afr_mt_char);
- if (!local->transaction.pre_op)
- goto out;
+ changelog[i][m_idx] = hton32(1);
+ if (idx != -1)
+ changelog[i][idx] = hton32(1);
+ /* If the newentry marking is on a newly created directory,
+ * then mark it with the full-heal indicator.
+ */
+ if ((IA_ISDIR(iat)) && (priv->esh_granular))
+ changelog[i][d_idx] = hton32(1);
+ }
+ ret = afr_set_pending_dict(priv, xattr, changelog);
+ if (ret < 0) {
+ afr_matrix_cleanup(changelog, priv->child_count);
+ return NULL;
+ }
+out:
+ return changelog;
+}
- if (priv->arbiter_count == 1) {
- local->transaction.pre_op_xdata =
- GF_CALLOC (sizeof (*local->transaction.pre_op_xdata),
- priv->child_count, gf_afr_mt_dict_t);
- if (!local->transaction.pre_op_xdata)
- goto out;
+static dict_t *
+afr_set_heal_info(char *status)
+{
+ dict_t *dict = NULL;
+ int ret = -1;
- local->transaction.pre_op_sources =
- GF_CALLOC (sizeof (*local->transaction.pre_op_sources),
- priv->child_count, gf_afr_mt_char);
- if (!local->transaction.pre_op_sources)
- goto out;
- }
+ dict = dict_new();
+ if (!dict) {
+ ret = -ENOMEM;
+ goto out;
+ }
+
+ ret = dict_set_dynstr_sizen(dict, "heal-info", status);
+ if (ret)
+ gf_msg("", GF_LOG_WARNING, -ret, AFR_MSG_DICT_SET_FAILED,
+ "Failed to set heal-info key to "
+ "%s",
+ status);
+out:
+ /* Any error other than EINVAL, dict_set_dynstr frees status */
+ if (ret == -ENOMEM || ret == -EINVAL) {
+ GF_FREE(status);
+ }
- local->transaction.failed_subvols = GF_CALLOC (sizeof (*local->transaction.failed_subvols),
- priv->child_count,
- gf_afr_mt_char);
- if (!local->transaction.failed_subvols)
- goto out;
+ if (ret && dict) {
+ dict_unref(dict);
+ dict = NULL;
+ }
+ return dict;
+}
- local->pending = afr_matrix_create (priv->child_count,
- AFR_NUM_CHANGE_LOGS);
- if (!local->pending)
- goto out;
+static gf_boolean_t
+afr_is_dirty_count_non_unary_for_txn(xlator_t *this, struct afr_reply *replies,
+ afr_transaction_type type)
+{
+ afr_private_t *priv = this->private;
+ int *dirty = alloca0(priv->child_count * sizeof(int));
+ int i = 0;
- INIT_LIST_HEAD (&local->transaction.eager_locked);
+ afr_selfheal_extract_xattr(this, replies, type, dirty, NULL);
+ for (i = 0; i < priv->child_count; i++) {
+ if (dirty[i] > 1)
+ return _gf_true;
+ }
- ret = 0;
-out:
- return ret;
+ return _gf_false;
}
+static gf_boolean_t
+afr_is_dirty_count_non_unary(xlator_t *this, struct afr_reply *replies,
+ ia_type_t ia_type)
+{
+ gf_boolean_t data_chk = _gf_false;
+ gf_boolean_t mdata_chk = _gf_false;
+ gf_boolean_t entry_chk = _gf_false;
+
+ switch (ia_type) {
+ case IA_IFDIR:
+ mdata_chk = _gf_true;
+ entry_chk = _gf_true;
+ break;
+ case IA_IFREG:
+ mdata_chk = _gf_true;
+ data_chk = _gf_true;
+ break;
+ default:
+ /*IA_IFBLK, IA_IFCHR, IA_IFLNK, IA_IFIFO, IA_IFSOCK*/
+ mdata_chk = _gf_true;
+ break;
+ }
-void
-afr_set_low_priority (call_frame_t *frame)
-{
- frame->root->pid = LOW_PRIO_PROC_PID;
+ if (data_chk && afr_is_dirty_count_non_unary_for_txn(
+ this, replies, AFR_DATA_TRANSACTION)) {
+ return _gf_true;
+ } else if (mdata_chk && afr_is_dirty_count_non_unary_for_txn(
+ this, replies, AFR_METADATA_TRANSACTION)) {
+ return _gf_true;
+ } else if (entry_chk && afr_is_dirty_count_non_unary_for_txn(
+ this, replies, AFR_ENTRY_TRANSACTION)) {
+ return _gf_true;
+ }
+
+ return _gf_false;
}
+static int
+afr_update_heal_status(xlator_t *this, struct afr_reply *replies,
+ ia_type_t ia_type, gf_boolean_t *esh, gf_boolean_t *dsh,
+ gf_boolean_t *msh, unsigned char pending)
+{
+ int ret = -1;
+ GF_UNUSED int ret1 = 0;
+ int i = 0;
+ int io_domain_lk_count = 0;
+ int shd_domain_lk_count = 0;
+ afr_private_t *priv = NULL;
+ char *key1 = NULL;
+ char *key2 = NULL;
+
+ priv = this->private;
+ key1 = alloca0(strlen(GLUSTERFS_INODELK_DOM_PREFIX) + 2 +
+ strlen(this->name));
+ key2 = alloca0(strlen(GLUSTERFS_INODELK_DOM_PREFIX) + 2 +
+ strlen(priv->sh_domain));
+ sprintf(key1, "%s:%s", GLUSTERFS_INODELK_DOM_PREFIX, this->name);
+ sprintf(key2, "%s:%s", GLUSTERFS_INODELK_DOM_PREFIX, priv->sh_domain);
+
+ for (i = 0; i < priv->child_count; i++) {
+ if ((replies[i].valid != 1) || (replies[i].op_ret != 0))
+ continue;
+ if (!io_domain_lk_count) {
+ ret1 = dict_get_int32(replies[i].xdata, key1, &io_domain_lk_count);
+ }
+ if (!shd_domain_lk_count) {
+ ret1 = dict_get_int32(replies[i].xdata, key2, &shd_domain_lk_count);
+ }
+ }
+
+ if (!pending) {
+ if ((afr_is_dirty_count_non_unary(this, replies, ia_type)) ||
+ (!io_domain_lk_count)) {
+ /* Needs heal. */
+ ret = 0;
+ } else {
+ /* No heal needed. */
+ *dsh = *esh = *msh = 0;
+ }
+ } else {
+ if (shd_domain_lk_count) {
+ ret = -EAGAIN; /*For 'possibly-healing'. */
+ } else {
+ ret = 0; /*needs heal. Just set a non -ve value so that it is
+ assumed as the source index.*/
+ }
+ }
+ return ret;
+}
-gf_boolean_t
-afr_have_quorum (char *logname, afr_private_t *priv)
-{
- unsigned int quorum = 0;
+/*return EIO, EAGAIN or pending*/
+int
+afr_lockless_inspect(call_frame_t *frame, xlator_t *this, uuid_t gfid,
+ inode_t **inode, gf_boolean_t *entry_selfheal,
+ gf_boolean_t *data_selfheal,
+ gf_boolean_t *metadata_selfheal, unsigned char *pending)
+{
+ int ret = -1;
+ int i = 0;
+ afr_private_t *priv = NULL;
+ struct afr_reply *replies = NULL;
+ gf_boolean_t dsh = _gf_false;
+ gf_boolean_t msh = _gf_false;
+ gf_boolean_t esh = _gf_false;
+ unsigned char *sources = NULL;
+ unsigned char *sinks = NULL;
+ unsigned char *valid_on = NULL;
+ uint64_t *witness = NULL;
+
+ priv = this->private;
+ replies = alloca0(sizeof(*replies) * priv->child_count);
+ sources = alloca0(sizeof(*sources) * priv->child_count);
+ sinks = alloca0(sizeof(*sinks) * priv->child_count);
+ witness = alloca0(sizeof(*witness) * priv->child_count);
+ valid_on = alloca0(sizeof(*valid_on) * priv->child_count);
+
+ ret = afr_selfheal_unlocked_inspect(frame, this, gfid, inode, &dsh, &msh,
+ &esh, replies);
+ if (ret)
+ goto out;
+ for (i = 0; i < priv->child_count; i++) {
+ if (replies[i].valid && replies[i].op_ret == 0) {
+ valid_on[i] = 1;
+ }
+ }
+ if (msh) {
+ ret = afr_selfheal_find_direction(frame, this, replies,
+ AFR_METADATA_TRANSACTION, valid_on,
+ sources, sinks, witness, pending);
+ if (*pending & PFLAG_SBRAIN)
+ ret = -EIO;
+ if (ret)
+ goto out;
+ }
+ if (dsh) {
+ ret = afr_selfheal_find_direction(frame, this, replies,
+ AFR_DATA_TRANSACTION, valid_on,
+ sources, sinks, witness, pending);
+ if (*pending & PFLAG_SBRAIN)
+ ret = -EIO;
+ if (ret)
+ goto out;
+ }
+ if (esh) {
+ ret = afr_selfheal_find_direction(frame, this, replies,
+ AFR_ENTRY_TRANSACTION, valid_on,
+ sources, sinks, witness, pending);
+ if (*pending & PFLAG_SBRAIN)
+ ret = -EIO;
+ if (ret)
+ goto out;
+ }
- GF_VALIDATE_OR_GOTO(logname,priv,out);
+ ret = afr_update_heal_status(this, replies, (*inode)->ia_type, &esh, &dsh,
+ &msh, *pending);
+out:
+ *data_selfheal = dsh;
+ *entry_selfheal = esh;
+ *metadata_selfheal = msh;
+ if (replies)
+ afr_replies_wipe(replies, priv->child_count);
+ return ret;
+}
- quorum = priv->quorum_count;
- if (quorum != AFR_QUORUM_AUTO) {
- return (priv->up_count >= (priv->down_count + quorum));
+int
+afr_get_heal_info(call_frame_t *frame, xlator_t *this, loc_t *loc)
+{
+ gf_boolean_t data_selfheal = _gf_false;
+ gf_boolean_t metadata_selfheal = _gf_false;
+ gf_boolean_t entry_selfheal = _gf_false;
+ unsigned char pending = 0;
+ dict_t *dict = NULL;
+ int ret = -1;
+ int op_errno = ENOMEM;
+ inode_t *inode = NULL;
+ char *substr = NULL;
+ char *status = NULL;
+ call_frame_t *heal_frame = NULL;
+ afr_local_t *heal_local = NULL;
+
+ /*Use frame with lk-owner set*/
+ heal_frame = afr_frame_create(frame->this, &op_errno);
+ if (!heal_frame) {
+ ret = -1;
+ goto out;
+ }
+ heal_local = heal_frame->local;
+ heal_frame->local = frame->local;
+
+ ret = afr_lockless_inspect(heal_frame, this, loc->gfid, &inode,
+ &entry_selfheal, &data_selfheal,
+ &metadata_selfheal, &pending);
+
+ if (ret == -ENOMEM) {
+ ret = -1;
+ goto out;
+ }
+
+ if (pending & PFLAG_PENDING) {
+ gf_asprintf(&substr, "-pending");
+ if (!substr)
+ goto out;
+ }
+
+ if (ret == -EIO) {
+ ret = gf_asprintf(&status, "split-brain%s", substr ? substr : "");
+ if (ret < 0) {
+ goto out;
}
-
- quorum = priv->child_count / 2 + 1;
- if (priv->up_count >= (priv->down_count + quorum)) {
- return _gf_true;
+ dict = afr_set_heal_info(status);
+ if (!dict) {
+ ret = -1;
+ goto out;
}
-
- /*
- * Special case for even numbers of nodes: if we have exactly half
- * and that includes the first ("senior-most") node, then that counts
- * as quorum even if it wouldn't otherwise. This supports e.g. N=2
- * while preserving the critical property that there can only be one
- * such group.
+ } else if (ret == -EAGAIN) {
+ ret = gf_asprintf(&status, "possibly-healing%s", substr ? substr : "");
+ if (ret < 0) {
+ goto out;
+ }
+ dict = afr_set_heal_info(status);
+ if (!dict) {
+ ret = -1;
+ goto out;
+ }
+ } else if (ret >= 0) {
+ /* value of ret = source index
+ * so ret >= 0 and at least one of the 3 booleans set to
+ * true means a source is identified; heal is required.
*/
- if ((priv->child_count % 2) == 0) {
- quorum = priv->child_count / 2;
- if (priv->up_count >= (priv->down_count + quorum)) {
- if (priv->child_up[0]) {
- return _gf_true;
- }
- }
+ if (!data_selfheal && !entry_selfheal && !metadata_selfheal) {
+ status = gf_strdup("no-heal");
+ if (!status) {
+ ret = -1;
+ goto out;
+ }
+ dict = afr_set_heal_info(status);
+ if (!dict) {
+ ret = -1;
+ goto out;
+ }
+ } else {
+ ret = gf_asprintf(&status, "heal%s", substr ? substr : "");
+ if (ret < 0) {
+ goto out;
+ }
+ dict = afr_set_heal_info(status);
+ if (!dict) {
+ ret = -1;
+ goto out;
+ }
+ }
+ } else if (ret < 0) {
+ /* Apart from above checked -ve ret values, there are
+ * other possible ret values like ENOTCONN
+ * (returned when number of valid replies received are
+ * less than 2)
+ * in which case heal is required when one of the
+ * selfheal booleans is set.
+ */
+ if (data_selfheal || entry_selfheal || metadata_selfheal) {
+ ret = gf_asprintf(&status, "heal%s", substr ? substr : "");
+ if (ret < 0) {
+ goto out;
+ }
+ dict = afr_set_heal_info(status);
+ if (!dict) {
+ ret = -1;
+ goto out;
+ }
}
+ }
+
+ ret = 0;
+ op_errno = 0;
out:
- return _gf_false;
+ if (heal_frame) {
+ heal_frame->local = heal_local;
+ AFR_STACK_DESTROY(heal_frame);
+ }
+ AFR_STACK_UNWIND(getxattr, frame, ret, op_errno, dict, NULL);
+ if (dict)
+ dict_unref(dict);
+ if (inode)
+ inode_unref(inode);
+ GF_FREE(substr);
+ return ret;
}
-void
-afr_priv_destroy (afr_private_t *priv)
-{
- int i = 0;
+int
+_afr_is_split_brain(call_frame_t *frame, xlator_t *this,
+ struct afr_reply *replies, afr_transaction_type type,
+ gf_boolean_t *spb)
+{
+ afr_private_t *priv = NULL;
+ uint64_t *witness = NULL;
+ unsigned char *sources = NULL;
+ unsigned char *sinks = NULL;
+ int sources_count = 0;
+ int ret = 0;
+
+ priv = this->private;
+
+ sources = alloca0(priv->child_count);
+ sinks = alloca0(priv->child_count);
+ witness = alloca0(priv->child_count * sizeof(*witness));
+
+ ret = afr_selfheal_find_direction(frame, this, replies, type,
+ priv->child_up, sources, sinks, witness,
+ NULL);
+ if (ret)
+ return ret;
- if (!priv)
- goto out;
- GF_FREE (priv->last_event);
- if (priv->pending_key) {
- for (i = 0; i < priv->child_count; i++)
- GF_FREE (priv->pending_key[i]);
- }
- GF_FREE (priv->pending_key);
- GF_FREE (priv->children);
- GF_FREE (priv->child_up);
- LOCK_DESTROY (&priv->lock);
+ sources_count = AFR_COUNT(sources, priv->child_count);
+ if (!sources_count)
+ *spb = _gf_true;
- GF_FREE (priv);
-out:
- return;
+ return ret;
}
-void
-afr_handle_open_fd_count (call_frame_t *frame, xlator_t *this)
+int
+afr_is_split_brain(call_frame_t *frame, xlator_t *this, inode_t *inode,
+ uuid_t gfid, gf_boolean_t *d_spb, gf_boolean_t *m_spb)
{
- afr_local_t *local = NULL;
- afr_fd_ctx_t *fd_ctx = NULL;
+ int ret = -1;
+ afr_private_t *priv = NULL;
+ struct afr_reply *replies = NULL;
- local = frame->local;
+ priv = this->private;
+
+ replies = alloca0(sizeof(*replies) * priv->child_count);
+
+ ret = afr_selfheal_unlocked_discover(frame, inode, gfid, replies);
+ if (ret)
+ goto out;
- if (!local->fd)
- return;
+ if (!afr_can_decide_split_brain_source_sinks(replies, priv->child_count)) {
+ ret = -EAGAIN;
+ goto out;
+ }
- fd_ctx = afr_fd_ctx_get (local->fd, this);
- if (!fd_ctx)
- return;
+ ret = _afr_is_split_brain(frame, this, replies, AFR_DATA_TRANSACTION,
+ d_spb);
+ if (ret)
+ goto out;
- fd_ctx->open_fd_count = local->open_fd_count;
+ ret = _afr_is_split_brain(frame, this, replies, AFR_METADATA_TRANSACTION,
+ m_spb);
+out:
+ if (replies) {
+ afr_replies_wipe(replies, priv->child_count);
+ replies = NULL;
+ }
+ return ret;
}
-int**
-afr_mark_pending_changelog (afr_private_t *priv, unsigned char *pending,
- dict_t *xattr, ia_type_t iat)
+int
+afr_get_split_brain_status_cbk(int ret, call_frame_t *frame, void *opaque)
{
- int i = 0;
- int **changelog = NULL;
- int idx = -1;
- int m_idx = 0;
- int ret = 0;
+ GF_FREE(opaque);
+ return 0;
+}
- m_idx = afr_index_for_transaction_type (AFR_METADATA_TRANSACTION);
+int
+afr_get_split_brain_status(void *opaque)
+{
+ gf_boolean_t d_spb = _gf_false;
+ gf_boolean_t m_spb = _gf_false;
+ int ret = -1;
+ int op_errno = 0;
+ int i = 0;
+ char *choices = NULL;
+ char *status = NULL;
+ dict_t *dict = NULL;
+ inode_t *inode = NULL;
+ afr_private_t *priv = NULL;
+ xlator_t **children = NULL;
+ call_frame_t *frame = NULL;
+ xlator_t *this = NULL;
+ loc_t *loc = NULL;
+ afr_spb_status_t *data = NULL;
+
+ data = opaque;
+ frame = data->frame;
+ this = frame->this;
+ loc = data->loc;
+ priv = this->private;
+ children = priv->children;
+
+ inode = afr_inode_find(this, loc->gfid);
+ if (!inode)
+ goto out;
+
+ dict = dict_new();
+ if (!dict) {
+ op_errno = ENOMEM;
+ ret = -1;
+ goto out;
+ }
+
+ /* Calculation for string length :
+ * (child_count X length of child-name) + SLEN(" Choices :")
+ * child-name consists of :
+ * a) 251 = max characters for volname according to GD_VOLUME_NAME_MAX
+ * b) strlen("-client-00,") assuming 16 replicas
+ */
+ choices = alloca0(priv->child_count * (256 + SLEN("-client-00,")) +
+ SLEN(" Choices:"));
+
+ ret = afr_is_split_brain(frame, this, inode, loc->gfid, &d_spb, &m_spb);
+ if (ret) {
+ op_errno = -ret;
+ if (ret == -EAGAIN) {
+ ret = dict_set_sizen_str_sizen(dict, GF_AFR_SBRAIN_STATUS,
+ SBRAIN_HEAL_NO_GO_MSG);
+ if (ret) {
+ gf_msg(this->name, GF_LOG_WARNING, -ret,
+ AFR_MSG_DICT_SET_FAILED,
+ "Failed to set GF_AFR_SBRAIN_STATUS in dict");
+ }
+ }
+ ret = -1;
+ goto out;
+ }
+
+ if (d_spb || m_spb) {
+ sprintf(choices, " Choices:");
+ for (i = 0; i < priv->child_count; i++) {
+ strcat(choices, children[i]->name);
+ strcat(choices, ",");
+ }
+ choices[strlen(choices) - 1] = '\0';
- idx = afr_index_from_ia_type (iat);
+ ret = gf_asprintf(&status,
+ "data-split-brain:%s "
+ "metadata-split-brain:%s%s",
+ (d_spb) ? "yes" : "no", (m_spb) ? "yes" : "no",
+ choices);
- changelog = afr_matrix_create (priv->child_count, AFR_NUM_CHANGE_LOGS);
- if (!changelog)
- goto out;
+ if (-1 == ret) {
+ op_errno = ENOMEM;
+ goto out;
+ }
+ ret = dict_set_dynstr_sizen(dict, GF_AFR_SBRAIN_STATUS, status);
+ if (ret) {
+ op_errno = -ret;
+ ret = -1;
+ goto out;
+ }
+ } else {
+ ret = dict_set_sizen_str_sizen(dict, GF_AFR_SBRAIN_STATUS,
+ SFILE_NOT_UNDER_DATA);
+ if (ret) {
+ op_errno = -ret;
+ ret = -1;
+ goto out;
+ }
+ }
- for (i = 0; i < priv->child_count; i++) {
- if (!pending[i])
- continue;
-
- changelog[i][m_idx] = hton32(1);
- if (idx != -1)
- changelog[i][idx] = hton32(1);
- }
- ret = afr_set_pending_dict (priv, xattr, changelog);
- if (ret < 0) {
- afr_matrix_cleanup (changelog, priv->child_count);
- return NULL;
- }
+ ret = 0;
out:
- return changelog;
+ AFR_STACK_UNWIND(getxattr, frame, ret, op_errno, dict, NULL);
+ if (dict)
+ dict_unref(dict);
+ if (inode)
+ inode_unref(inode);
+ return ret;
}
-gf_boolean_t
-afr_decide_heal_info (afr_private_t *priv, unsigned char *sources, int ret)
+int32_t
+afr_heal_splitbrain_file(call_frame_t *frame, xlator_t *this, loc_t *loc)
{
- int sources_count = 0;
-
+ int ret = 0;
+ int op_errno = 0;
+ dict_t *dict = NULL;
+ afr_local_t *local = NULL;
+ afr_local_t *heal_local = NULL;
+ call_frame_t *heal_frame = NULL;
+
+ local = frame->local;
+ dict = dict_new();
+ if (!dict) {
+ op_errno = ENOMEM;
+ ret = -1;
+ goto out;
+ }
+
+ heal_frame = afr_frame_create(this, &op_errno);
+ if (!heal_frame) {
+ ret = -1;
+ goto out;
+ }
+ heal_local = heal_frame->local;
+ heal_frame->local = frame->local;
+ /*Initiate heal with heal_frame with lk-owner set so that inodelk/entrylk
+ * work correctly*/
+ ret = afr_selfheal_do(heal_frame, this, loc->gfid);
+
+ if (ret == 1 || ret == 2) {
+ ret = dict_set_sizen_str_sizen(dict, "sh-fail-msg",
+ SFILE_NOT_IN_SPLIT_BRAIN);
if (ret)
- goto out;
+ gf_msg(this->name, GF_LOG_WARNING, -ret, AFR_MSG_DICT_SET_FAILED,
+ "Failed to set sh-fail-msg in dict");
+ ret = 0;
+ goto out;
+ } else {
+ if (local->xdata_rsp) {
+ /* 'sh-fail-msg' has been set in the dict during self-heal.*/
+ dict_copy(local->xdata_rsp, dict);
+ ret = 0;
+ } else if (ret < 0) {
+ op_errno = -ret;
+ ret = -1;
+ }
+ }
- sources_count = AFR_COUNT (sources, priv->child_count);
- if (sources_count == priv->child_count)
- return _gf_false;
out:
- return _gf_true;
+ if (heal_frame) {
+ heal_frame->local = heal_local;
+ AFR_STACK_DESTROY(heal_frame);
+ }
+ if (local->op == GF_FOP_GETXATTR)
+ AFR_STACK_UNWIND(getxattr, frame, ret, op_errno, dict, NULL);
+ else if (local->op == GF_FOP_SETXATTR)
+ AFR_STACK_UNWIND(setxattr, frame, ret, op_errno, NULL);
+ if (dict)
+ dict_unref(dict);
+ return ret;
}
int
-afr_selfheal_locked_metadata_inspect (call_frame_t *frame, xlator_t *this,
- inode_t *inode, gf_boolean_t *msh,
- gf_boolean_t *pending)
+afr_get_child_index_from_name(xlator_t *this, char *name)
{
- int ret = -1;
- unsigned char *locked_on = NULL;
- unsigned char *sources = NULL;
- unsigned char *sinks = NULL;
- unsigned char *healed_sinks = NULL;
- struct afr_reply *locked_replies = NULL;
-
- afr_private_t *priv = this->private;
-
- locked_on = alloca0 (priv->child_count);
- sources = alloca0 (priv->child_count);
- sinks = alloca0 (priv->child_count);
- healed_sinks = alloca0 (priv->child_count);
+ afr_private_t *priv = this->private;
+ int index = -1;
- locked_replies = alloca0 (sizeof (*locked_replies) * priv->child_count);
-
- ret = afr_selfheal_inodelk (frame, this, inode, this->name,
- LLONG_MAX - 1, 0, locked_on);
- {
- if (ret == 0) {
- /* Not a single lock */
- ret = -afr_final_errno (frame->local, priv);
- if (ret == 0)
- ret = -ENOTCONN;/* all invalid responses */
- goto out;
- }
- ret = __afr_selfheal_metadata_prepare (frame, this, inode,
- locked_on, sources,
- sinks, healed_sinks,
- locked_replies,
- pending);
- *msh = afr_decide_heal_info (priv, sources, ret);
- }
- afr_selfheal_uninodelk (frame, this, inode, this->name,
- LLONG_MAX - 1, 0, locked_on);
+ for (index = 0; index < priv->child_count; index++) {
+ if (!strcmp(priv->children[index]->name, name))
+ goto out;
+ }
+ index = -1;
out:
- if (locked_replies)
- afr_replies_wipe (locked_replies, priv->child_count);
- return ret;
+ return index;
}
-int
-afr_selfheal_locked_data_inspect (call_frame_t *frame, xlator_t *this,
- inode_t *inode, gf_boolean_t *dsh,
- gf_boolean_t *pflag)
-{
- int ret = -1;
- unsigned char *locked_on = NULL;
- unsigned char *data_lock = NULL;
- unsigned char *sources = NULL;
- unsigned char *sinks = NULL;
- unsigned char *healed_sinks = NULL;
- afr_private_t *priv = NULL;
- fd_t *fd = NULL;
- struct afr_reply *locked_replies = NULL;
-
- priv = this->private;
- locked_on = alloca0 (priv->child_count);
- data_lock = alloca0 (priv->child_count);
- sources = alloca0 (priv->child_count);
- sinks = alloca0 (priv->child_count);
- healed_sinks = alloca0 (priv->child_count);
-
- /* Heal-info does an open() on the file being examined so that the
- * current eager-lock holding client, if present, at some point sees
- * open-fd count being > 1 and releases the eager-lock so that heal-info
- * doesn't remain blocked forever until IO completes.
- */
- ret = afr_selfheal_data_open (this, inode, &fd);
- if (ret < 0) {
- gf_msg_debug (this->name, -ret, "%s: Failed to open",
- uuid_utoa (inode->gfid));
- goto out;
- }
+void
+afr_priv_need_heal_set(afr_private_t *priv, gf_boolean_t need_heal)
+{
+ LOCK(&priv->lock);
+ {
+ priv->need_heal = need_heal;
+ }
+ UNLOCK(&priv->lock);
+}
- locked_replies = alloca0 (sizeof (*locked_replies) * priv->child_count);
+void
+afr_set_need_heal(xlator_t *this, afr_local_t *local)
+{
+ int i = 0;
+ afr_private_t *priv = this->private;
+ gf_boolean_t need_heal = _gf_false;
- ret = afr_selfheal_tryinodelk (frame, this, inode, priv->sh_domain,
- 0, 0, locked_on);
- {
- if (ret == 0) {
- ret = -afr_final_errno (frame->local, priv);
- if (ret == 0)
- ret = -ENOTCONN;/* all invalid responses */
- goto out;
- }
- ret = afr_selfheal_inodelk (frame, this, inode, this->name,
- 0, 0, data_lock);
- {
- if (ret == 0) {
- ret = -afr_final_errno (frame->local, priv);
- if (ret == 0)
- ret = -ENOTCONN;
- /* all invalid responses */
- goto unlock;
- }
- ret = __afr_selfheal_data_prepare (frame, this, inode,
- data_lock, sources,
- sinks, healed_sinks,
- locked_replies,
- pflag);
- *dsh = afr_decide_heal_info (priv, sources, ret);
- }
- afr_selfheal_uninodelk (frame, this, inode, this->name, 0, 0,
- data_lock);
+ for (i = 0; i < priv->child_count; i++) {
+ if (local->replies[i].valid && local->replies[i].need_heal) {
+ need_heal = _gf_true;
+ break;
}
-unlock:
- afr_selfheal_uninodelk (frame, this, inode, priv->sh_domain, 0, 0,
- locked_on);
-out:
- if (locked_replies)
- afr_replies_wipe (locked_replies, priv->child_count);
- if (fd)
- fd_unref (fd);
- return ret;
+ }
+ afr_priv_need_heal_set(priv, need_heal);
+ return;
}
-int
-afr_selfheal_locked_entry_inspect (call_frame_t *frame, xlator_t *this,
- inode_t *inode,
- gf_boolean_t *esh, gf_boolean_t *pflag)
-{
- int ret = -1;
- int source = -1;
- afr_private_t *priv = NULL;
- unsigned char *locked_on = NULL;
- unsigned char *data_lock = NULL;
- unsigned char *sources = NULL;
- unsigned char *sinks = NULL;
- unsigned char *healed_sinks = NULL;
- struct afr_reply *locked_replies = NULL;
-
- priv = this->private;
- locked_on = alloca0 (priv->child_count);
- data_lock = alloca0 (priv->child_count);
- sources = alloca0 (priv->child_count);
- sinks = alloca0 (priv->child_count);
- healed_sinks = alloca0 (priv->child_count);
-
- locked_replies = alloca0 (sizeof (*locked_replies) * priv->child_count);
-
- ret = afr_selfheal_tryentrylk (frame, this, inode, priv->sh_domain,
- NULL, locked_on);
- {
- if (ret == 0) {
- ret = -afr_final_errno (frame->local, priv);
- if (ret == 0)
- ret = -ENOTCONN;/* all invalid responses */
- goto out;
- }
+gf_boolean_t
+afr_get_need_heal(xlator_t *this)
+{
+ afr_private_t *priv = this->private;
+ gf_boolean_t need_heal = _gf_true;
- ret = afr_selfheal_entrylk (frame, this, inode, this->name,
- NULL, data_lock);
- {
- if (ret == 0) {
- ret = -afr_final_errno (frame->local, priv);
- if (ret == 0)
- ret = -ENOTCONN;
- /* all invalid responses */
- goto unlock;
- }
- ret = __afr_selfheal_entry_prepare (frame, this, inode,
- data_lock, sources,
- sinks, healed_sinks,
- locked_replies,
- &source, pflag);
- if ((ret == 0) && source < 0)
- ret = -EIO;
- *esh = afr_decide_heal_info (priv, sources, ret);
- }
- afr_selfheal_unentrylk (frame, this, inode, this->name, NULL,
- data_lock);
- }
-unlock:
- afr_selfheal_unentrylk (frame, this, inode, priv->sh_domain, NULL,
- locked_on);
-out:
- if (locked_replies)
- afr_replies_wipe (locked_replies, priv->child_count);
- return ret;
+ LOCK(&priv->lock);
+ {
+ need_heal = priv->need_heal;
+ }
+ UNLOCK(&priv->lock);
+ return need_heal;
}
int
-afr_selfheal_locked_inspect (call_frame_t *frame, xlator_t *this, uuid_t gfid,
- inode_t **inode,
- gf_boolean_t *entry_selfheal,
- gf_boolean_t *data_selfheal,
- gf_boolean_t *metadata_selfheal,
- gf_boolean_t *pending)
+afr_get_msg_id(char *op_type)
+{
+ if (!strcmp(op_type, GF_AFR_REPLACE_BRICK))
+ return AFR_MSG_REPLACE_BRICK_STATUS;
+ else if (!strcmp(op_type, GF_AFR_ADD_BRICK))
+ return AFR_MSG_ADD_BRICK_STATUS;
+ return -1;
+}
+int
+afr_fav_child_reset_sink_xattrs_cbk(int ret, call_frame_t *heal_frame,
+ void *opaque)
{
- int ret = -1;
- gf_boolean_t dsh = _gf_false;
- gf_boolean_t msh = _gf_false;
- gf_boolean_t esh = _gf_false;
+ call_frame_t *txn_frame = NULL;
+ afr_local_t *local = NULL;
+ afr_local_t *heal_local = NULL;
+ xlator_t *this = NULL;
- ret = afr_selfheal_unlocked_inspect (frame, this, gfid, inode,
- &dsh, &msh, &esh);
- if (ret)
- goto out;
+ heal_local = heal_frame->local;
+ txn_frame = heal_local->heal_frame;
+ local = txn_frame->local;
+ this = txn_frame->this;
- /* For every heal type hold locks and check if it indeed needs heal */
+ /* Refresh the inode agan and proceed with the transaction.*/
+ afr_inode_refresh(txn_frame, this, local->inode, NULL, local->refreshfn);
- if (msh) {
- ret = afr_selfheal_locked_metadata_inspect (frame, this,
- *inode, &msh,
- pending);
- if (ret == -EIO)
- goto out;
- }
+ AFR_STACK_DESTROY(heal_frame);
- if (dsh) {
- ret = afr_selfheal_locked_data_inspect (frame, this, *inode,
- &dsh, pending);
- if (ret == -EIO || (ret == -EAGAIN))
- goto out;
- }
+ return 0;
+}
- if (esh) {
- ret = afr_selfheal_locked_entry_inspect (frame, this, *inode,
- &esh, pending);
+int
+afr_fav_child_reset_sink_xattrs(void *opaque)
+{
+ call_frame_t *heal_frame = NULL;
+ call_frame_t *txn_frame = NULL;
+ xlator_t *this = NULL;
+ gf_boolean_t d_spb = _gf_false;
+ gf_boolean_t m_spb = _gf_false;
+ afr_local_t *heal_local = NULL;
+ afr_local_t *txn_local = NULL;
+ afr_private_t *priv = NULL;
+ inode_t *inode = NULL;
+ unsigned char *locked_on = NULL;
+ unsigned char *sources = NULL;
+ unsigned char *sinks = NULL;
+ unsigned char *healed_sinks = NULL;
+ unsigned char *undid_pending = NULL;
+ struct afr_reply *locked_replies = NULL;
+ int ret = 0;
+
+ heal_frame = (call_frame_t *)opaque;
+ heal_local = heal_frame->local;
+ txn_frame = heal_local->heal_frame;
+ txn_local = txn_frame->local;
+ this = txn_frame->this;
+ inode = txn_local->inode;
+ priv = this->private;
+ locked_on = alloca0(priv->child_count);
+ sources = alloca0(priv->child_count);
+ sinks = alloca0(priv->child_count);
+ healed_sinks = alloca0(priv->child_count);
+ undid_pending = alloca0(priv->child_count);
+ locked_replies = alloca0(sizeof(*locked_replies) * priv->child_count);
+
+ ret = _afr_is_split_brain(txn_frame, this, txn_local->replies,
+ AFR_DATA_TRANSACTION, &d_spb);
+
+ ret = _afr_is_split_brain(txn_frame, this, txn_local->replies,
+ AFR_METADATA_TRANSACTION, &m_spb);
+
+ /* Take appropriate locks and reset sink xattrs. */
+ if (d_spb) {
+ ret = afr_selfheal_inodelk(heal_frame, this, inode, this->name, 0, 0,
+ locked_on);
+ {
+ if (ret < priv->child_count)
+ goto data_unlock;
+ ret = __afr_selfheal_data_prepare(
+ heal_frame, this, inode, locked_on, sources, sinks,
+ healed_sinks, undid_pending, locked_replies, NULL);
+ }
+ data_unlock:
+ afr_selfheal_uninodelk(heal_frame, this, inode, this->name, 0, 0,
+ locked_on);
+ }
+
+ if (m_spb) {
+ memset(locked_on, 0, sizeof(*locked_on) * priv->child_count);
+ memset(undid_pending, 0, sizeof(*undid_pending) * priv->child_count);
+ ret = afr_selfheal_inodelk(heal_frame, this, inode, this->name,
+ LLONG_MAX - 1, 0, locked_on);
+ {
+ if (ret < priv->child_count)
+ goto mdata_unlock;
+ ret = __afr_selfheal_metadata_prepare(
+ heal_frame, this, inode, locked_on, sources, sinks,
+ healed_sinks, undid_pending, locked_replies, NULL);
}
+ mdata_unlock:
+ afr_selfheal_uninodelk(heal_frame, this, inode, this->name,
+ LLONG_MAX - 1, 0, locked_on);
+ }
-out:
- *data_selfheal = dsh;
- *entry_selfheal = esh;
- *metadata_selfheal = msh;
- return ret;
+ return ret;
}
-dict_t*
-afr_set_heal_info (char *status)
-{
- dict_t *dict = NULL;
- int ret = -1;
-
- dict = dict_new ();
- if (!dict) {
- ret = -ENOMEM;
+/*
+ * Concatenates the xattrs in local->replies separated by a delimiter.
+ */
+int
+afr_serialize_xattrs_with_delimiter(call_frame_t *frame, xlator_t *this,
+ char *buf, const char *default_str,
+ int32_t *serz_len, char delimiter)
+{
+ afr_private_t *priv = NULL;
+ afr_local_t *local = NULL;
+ char *xattr = NULL;
+ int i = 0;
+ int len = 0;
+ int keylen = 0;
+ size_t str_len = 0;
+ int ret = -1;
+
+ priv = this->private;
+ local = frame->local;
+
+ keylen = strlen(local->cont.getxattr.name);
+ for (i = 0; i < priv->child_count; i++) {
+ if (!local->replies[i].valid || local->replies[i].op_ret) {
+ str_len = strlen(default_str);
+ buf = strncat(buf, default_str, str_len);
+ len += str_len;
+ buf[len++] = delimiter;
+ buf[len] = '\0';
+ } else {
+ ret = dict_get_strn(local->replies[i].xattr,
+ local->cont.getxattr.name, keylen, &xattr);
+ if (ret) {
+ gf_msg("TEST", GF_LOG_ERROR, -ret, AFR_MSG_DICT_GET_FAILED,
+ "Failed to get the node_uuid of brick "
+ "%d",
+ i);
goto out;
- }
+ }
+ str_len = strlen(xattr);
+ buf = strncat(buf, xattr, str_len);
+ len += str_len;
+ buf[len++] = delimiter;
+ buf[len] = '\0';
+ }
+ }
+ buf[--len] = '\0'; /*remove the last delimiter*/
+ if (serz_len)
+ *serz_len = ++len;
+ ret = 0;
- ret = dict_set_str (dict, "heal-info", status);
- if (ret)
- gf_msg ("", GF_LOG_WARNING, -ret,
- AFR_MSG_DICT_SET_FAILED,
- "Failed to set heal-info key to "
- "%s", status);
out:
- return dict;
+ return ret;
}
-int
-afr_get_heal_info (call_frame_t *frame, xlator_t *this, loc_t *loc)
-{
- gf_boolean_t data_selfheal = _gf_false;
- gf_boolean_t metadata_selfheal = _gf_false;
- gf_boolean_t entry_selfheal = _gf_false;
- gf_boolean_t pending = _gf_false;
- dict_t *dict = NULL;
- int ret = -1;
- int op_errno = 0;
- int size = 0;
- inode_t *inode = NULL;
- char *substr = NULL;
- char *status = NULL;
-
- ret = afr_selfheal_locked_inspect (frame, this, loc->gfid, &inode,
- &entry_selfheal,
- &data_selfheal, &metadata_selfheal,
- &pending);
-
- if (ret == -ENOMEM) {
- op_errno = -ret;
- ret = -1;
- goto out;
- }
-
- if (pending) {
- size = strlen ("-pending") + 1;
- gf_asprintf (&substr, "-pending");
- if (!substr)
- goto out;
- }
+uint64_t
+afr_write_subvol_get(call_frame_t *frame, xlator_t *this)
+{
+ afr_local_t *local = NULL;
+ uint64_t write_subvol = 0;
- if (ret == -EIO) {
- size += strlen ("split-brain") + 1;
- ret = gf_asprintf (&status, "split-brain%s",
- substr? substr : "");
- if (ret < 0)
- goto out;
- dict = afr_set_heal_info (status);
- } else if (ret == -EAGAIN) {
- size += strlen ("possibly-healing") + 1;
- ret = gf_asprintf (&status, "possibly-healing%s",
- substr? substr : "");
- if (ret < 0)
- goto out;
- dict = afr_set_heal_info (status);
- } else if (ret >= 0) {
- /* value of ret = source index
- * so ret >= 0 and at least one of the 3 booleans set to
- * true means a source is identified; heal is required.
- */
- if (!data_selfheal && !entry_selfheal &&
- !metadata_selfheal) {
- dict = afr_set_heal_info ("no-heal");
- } else {
- size += strlen ("heal") + 1;
- ret = gf_asprintf (&status, "heal%s",
- substr? substr : "");
- if (ret < 0)
- goto out;
- dict = afr_set_heal_info (status);
- }
- } else if (ret < 0) {
- /* Apart from above checked -ve ret values, there are
- * other possible ret values like ENOTCONN
- * (returned when number of valid replies received are
- * less than 2)
- * in which case heal is required when one of the
- * selfheal booleans is set.
- */
- if (data_selfheal || entry_selfheal ||
- metadata_selfheal) {
- size += strlen ("heal") + 1;
- ret = gf_asprintf (&status, "heal%s",
- substr? substr : "");
- if (ret < 0)
- goto out;
- dict = afr_set_heal_info (status);
- }
- }
- ret = 0;
+ local = frame->local;
+ LOCK(&local->inode->lock);
+ write_subvol = local->inode_ctx->write_subvol;
+ UNLOCK(&local->inode->lock);
-out:
- AFR_STACK_UNWIND (getxattr, frame, ret, op_errno, dict, NULL);
- if (dict)
- dict_unref (dict);
- if (inode) {
- inode_forget (inode, 1);
- inode_unref (inode);
- }
- GF_FREE (substr);
- return ret;
+ return write_subvol;
}
int
-_afr_is_split_brain (call_frame_t *frame, xlator_t *this,
- struct afr_reply *replies,
- afr_transaction_type type,
- gf_boolean_t *spb)
-{
- afr_private_t *priv = NULL;
- uint64_t *witness = NULL;
- unsigned char *sources = NULL;
- unsigned char *sinks = NULL;
- int sources_count = 0;
- int ret = 0;
-
- priv = this->private;
-
- sources = alloca0 (priv->child_count);
- sinks = alloca0 (priv->child_count);
- witness = alloca0(priv->child_count * sizeof (*witness));
-
- ret = afr_selfheal_find_direction (frame, this, replies,
- type, priv->child_up, sources,
- sinks, witness, NULL);
- if (ret)
- return ret;
-
- sources_count = AFR_COUNT (sources, priv->child_count);
- if (!sources_count)
- *spb = _gf_true;
-
- return ret;
+afr_write_subvol_set(call_frame_t *frame, xlator_t *this)
+{
+ afr_local_t *local = NULL;
+ afr_private_t *priv = NULL;
+ unsigned char *data_accused = NULL;
+ unsigned char *metadata_accused = NULL;
+ unsigned char *data_readable = NULL;
+ unsigned char *metadata_readable = NULL;
+ uint16_t datamap = 0;
+ uint16_t metadatamap = 0;
+ uint64_t val = 0;
+ int event = 0;
+ int i = 0;
+
+ local = frame->local;
+ priv = this->private;
+ data_accused = alloca0(priv->child_count);
+ metadata_accused = alloca0(priv->child_count);
+ data_readable = alloca0(priv->child_count);
+ metadata_readable = alloca0(priv->child_count);
+ event = local->event_generation;
+
+ afr_readables_fill(frame, this, local->inode, data_accused,
+ metadata_accused, data_readable, metadata_readable,
+ NULL);
+
+ for (i = 0; i < priv->child_count; i++) {
+ if (data_readable[i])
+ datamap |= (1 << i);
+ if (metadata_readable[i])
+ metadatamap |= (1 << i);
+ }
+
+ val = ((uint64_t)metadatamap) | (((uint64_t)datamap) << 16) |
+ (((uint64_t)event) << 32);
+
+ LOCK(&local->inode->lock);
+ {
+ if (local->inode_ctx->write_subvol == 0 &&
+ local->transaction.type == AFR_DATA_TRANSACTION) {
+ local->inode_ctx->write_subvol = val;
+ }
+ }
+ UNLOCK(&local->inode->lock);
+
+ return 0;
}
int
-afr_is_split_brain (call_frame_t *frame, xlator_t *this, inode_t *inode,
- uuid_t gfid, gf_boolean_t *d_spb, gf_boolean_t *m_spb)
+afr_write_subvol_reset(call_frame_t *frame, xlator_t *this)
{
- int ret = -1;
- afr_private_t *priv = NULL;
- struct afr_reply *replies = NULL;
+ afr_local_t *local = NULL;
- priv = this->private;
+ local = frame->local;
+ LOCK(&local->inode->lock);
+ {
+ GF_ASSERT(local->inode_ctx->lock_count > 0);
+ local->inode_ctx->lock_count--;
- replies = alloca0 (sizeof (*replies) * priv->child_count);
+ if (!local->inode_ctx->lock_count)
+ local->inode_ctx->write_subvol = 0;
+ }
+ UNLOCK(&local->inode->lock);
- ret = afr_selfheal_unlocked_discover (frame, inode, gfid, replies);
- if (ret)
- goto out;
-
- ret = _afr_is_split_brain (frame, this, replies,
- AFR_DATA_TRANSACTION, d_spb);
- if (ret)
- goto out;
-
- ret = _afr_is_split_brain (frame, this, replies,
- AFR_METADATA_TRANSACTION, m_spb);
-out:
- if (replies) {
- afr_replies_wipe (replies, priv->child_count);
- replies = NULL;
- }
- return ret;
+ return 0;
}
int
-afr_get_split_brain_status_cbk (int ret, call_frame_t *frame, void *opaque)
+afr_set_inode_local(xlator_t *this, afr_local_t *local, inode_t *inode)
{
- GF_FREE (opaque);
- return 0;
+ int ret = 0;
+
+ local->inode = inode_ref(inode);
+ LOCK(&local->inode->lock);
+ {
+ ret = __afr_inode_ctx_get(this, local->inode, &local->inode_ctx);
+ }
+ UNLOCK(&local->inode->lock);
+ if (ret < 0) {
+ gf_msg_callingfn(
+ this->name, GF_LOG_ERROR, ENOMEM, AFR_MSG_INODE_CTX_GET_FAILED,
+ "Error getting inode ctx %s", uuid_utoa(local->inode->gfid));
+ }
+ return ret;
}
-int
-afr_get_split_brain_status (void *opaque)
-{
- gf_boolean_t d_spb = _gf_false;
- gf_boolean_t m_spb = _gf_false;
- int ret = -1;
- int op_errno = 0;
- int i = 0;
- char *choices = NULL;
- char *status = NULL;
- dict_t *dict = NULL;
- inode_t *inode = NULL;
- afr_private_t *priv = NULL;
- xlator_t **children = NULL;
- call_frame_t *frame = NULL;
- xlator_t *this = NULL;
- loc_t *loc = NULL;
- afr_spb_status_t *data = NULL;
-
- data = opaque;
- frame = data->frame;
- this = frame->this;
- loc = data->loc;
- priv = this->private;
- children = priv->children;
-
- inode = afr_inode_find (this, loc->gfid);
- if (!inode)
- goto out;
+gf_boolean_t
+afr_ta_is_fop_called_from_synctask(xlator_t *this)
+{
+ struct synctask *task = NULL;
+ gf_lkowner_t tmp_owner = {
+ 0,
+ };
- /* Calculation for string length :
- * (child_count X length of child-name) + strlen (" Choices :")
- * child-name consists of :
- * a) 256 = max characters for volname according to GD_VOLUME_NAME_MAX
- * b) strlen ("-client-00,") assuming 16 replicas
- */
- choices = alloca0 (priv->child_count * (256 + strlen ("-client-00,")) +
- strlen (" Choices:"));
+ task = synctask_get();
+ if (!task)
+ return _gf_false;
- ret = afr_is_split_brain (frame, this, inode, loc->gfid, &d_spb,
- &m_spb);
- if (ret) {
- op_errno = -ret;
- ret = -1;
- goto out;
- }
+ set_lk_owner_from_ptr(&tmp_owner, (void *)this);
- dict = dict_new ();
- if (!dict) {
- op_errno = ENOMEM;
- ret = -1;
- goto out;
- }
-
- if (d_spb || m_spb) {
- sprintf (choices, " Choices:");
- for (i = 0; i < priv->child_count; i++) {
- strcat (choices, children[i]->name);
- strcat (choices, ",");
- }
- choices[strlen (choices) - 1] = '\0';
+ if (!is_same_lkowner(&tmp_owner, &task->frame->root->lk_owner))
+ return _gf_false;
- ret = gf_asprintf (&status, "data-split-brain:%s "
- "metadata-split-brain:%s%s",
- (d_spb) ? "yes" : "no",
- (m_spb) ? "yes" : "no", choices);
+ return _gf_true;
+}
- if (-1 == ret) {
- op_errno = ENOMEM;
- goto out;
- }
- ret = dict_set_dynstr (dict, GF_AFR_SBRAIN_STATUS, status);
- if (ret) {
- op_errno = -ret;
- ret = -1;
- goto out;
- }
+int
+afr_ta_post_op_lock(xlator_t *this, loc_t *loc)
+{
+ int ret = 0;
+ uuid_t gfid = {
+ 0,
+ };
+ afr_private_t *priv = this->private;
+ gf_boolean_t locked = _gf_false;
+ struct gf_flock flock1 = {
+ 0,
+ };
+ struct gf_flock flock2 = {
+ 0,
+ };
+ int32_t cmd = 0;
+
+ /* Clients must take AFR_TA_DOM_NOTIFY lock only when the previous lock
+ * has been released in afr_notify due to upcall notification from shd.
+ */
+ GF_ASSERT(priv->ta_notify_dom_lock_offset == 0);
+
+ if (!priv->shd.iamshd)
+ GF_ASSERT(afr_ta_is_fop_called_from_synctask(this));
+ flock1.l_type = F_WRLCK;
+
+ while (!locked) {
+ if (priv->shd.iamshd) {
+ cmd = F_SETLKW;
+ flock1.l_start = 0;
+ flock1.l_len = 0;
} else {
- ret = dict_set_str (dict, GF_AFR_SBRAIN_STATUS,
- "The file is not under data or"
- " metadata split-brain");
- if (ret) {
- op_errno = -ret;
- ret = -1;
- goto out;
- }
- }
+ cmd = F_SETLK;
+ gf_uuid_generate(gfid);
+ flock1.l_start = gfid_to_ino(gfid);
+ if (flock1.l_start < 0)
+ flock1.l_start = -flock1.l_start;
+ flock1.l_len = 1;
+ }
+ ret = syncop_inodelk(priv->children[THIN_ARBITER_BRICK_INDEX],
+ AFR_TA_DOM_NOTIFY, loc, cmd, &flock1, NULL, NULL);
+ if (!ret) {
+ locked = _gf_true;
+ priv->ta_notify_dom_lock_offset = flock1.l_start;
+ } else if (ret == -EAGAIN) {
+ continue;
+ } else {
+ gf_msg(this->name, GF_LOG_ERROR, -ret, AFR_MSG_THIN_ARB,
+ "Failed to get "
+ "AFR_TA_DOM_NOTIFY lock on %s.",
+ loc->name);
+ goto out;
+ }
+ }
+
+ flock2.l_type = F_WRLCK;
+ flock2.l_start = 0;
+ flock2.l_len = 0;
+ ret = syncop_inodelk(priv->children[THIN_ARBITER_BRICK_INDEX],
+ AFR_TA_DOM_MODIFY, loc, F_SETLKW, &flock2, NULL, NULL);
+ if (ret) {
+ gf_msg(this->name, GF_LOG_ERROR, -ret, AFR_MSG_THIN_ARB,
+ "Failed to get AFR_TA_DOM_MODIFY lock on %s.", loc->name);
+ flock1.l_type = F_UNLCK;
+ ret = syncop_inodelk(priv->children[THIN_ARBITER_BRICK_INDEX],
+ AFR_TA_DOM_NOTIFY, loc, F_SETLK, &flock1, NULL,
+ NULL);
+ }
+out:
+ return ret;
+}
- ret = 0;
+int
+afr_ta_post_op_unlock(xlator_t *this, loc_t *loc)
+{
+ afr_private_t *priv = this->private;
+ struct gf_flock flock = {
+ 0,
+ };
+ int ret = 0;
+
+ if (!priv->shd.iamshd)
+ GF_ASSERT(afr_ta_is_fop_called_from_synctask(this));
+ flock.l_type = F_UNLCK;
+ flock.l_start = 0;
+ flock.l_len = 0;
+
+ ret = syncop_inodelk(priv->children[THIN_ARBITER_BRICK_INDEX],
+ AFR_TA_DOM_MODIFY, loc, F_SETLK, &flock, NULL, NULL);
+ if (ret) {
+ gf_msg(this->name, GF_LOG_ERROR, -ret, AFR_MSG_THIN_ARB,
+ "Failed to unlock AFR_TA_DOM_MODIFY lock.");
+ goto out;
+ }
+
+ if (!priv->shd.iamshd)
+ /* Mounts (clients) will not release the AFR_TA_DOM_NOTIFY lock
+ * in post-op as they use it as a notification mechanism. When
+ * shd sends a lock request on TA during heal, the clients will
+ * receive a lock-contention upcall notification upon which they
+ * will release the AFR_TA_DOM_NOTIFY lock after completing the
+ * in flight I/O.*/
+ goto out;
+
+ ret = syncop_inodelk(priv->children[THIN_ARBITER_BRICK_INDEX],
+ AFR_TA_DOM_NOTIFY, loc, F_SETLK, &flock, NULL, NULL);
+ if (ret) {
+ gf_msg(this->name, GF_LOG_ERROR, -ret, AFR_MSG_THIN_ARB,
+ "Failed to unlock AFR_TA_DOM_NOTIFY lock.");
+ }
out:
- AFR_STACK_UNWIND (getxattr, frame, ret, op_errno, dict, NULL);
- if (dict)
- dict_unref (dict);
- if (inode)
- inode_unref (inode);
- return ret;
+ return ret;
}
-int32_t
-afr_heal_splitbrain_file(call_frame_t *frame, xlator_t *this, loc_t *loc)
+call_frame_t *
+afr_ta_frame_create(xlator_t *this)
{
- int ret = 0;
- int op_errno = 0;
- dict_t *dict = NULL;
- afr_local_t *local = NULL;
+ call_frame_t *frame = NULL;
+ void *lk_owner = NULL;
- local = frame->local;
- dict = dict_new ();
- if (!dict) {
- op_errno = ENOMEM;
- ret = -1;
- goto out;
- }
+ frame = create_frame(this, this->ctx->pool);
+ if (!frame)
+ return NULL;
+ lk_owner = (void *)this;
+ afr_set_lk_owner(frame, this, lk_owner);
+ return frame;
+}
- ret = afr_selfheal_do (frame, this, loc->gfid);
+gf_boolean_t
+afr_ta_has_quorum(afr_private_t *priv, afr_local_t *local)
+{
+ int data_count = 0;
- if (ret == 1 || ret == 2) {
- ret = dict_set_str (dict, "sh-fail-msg",
- "File not in split-brain");
- if (ret)
- gf_msg (this->name, GF_LOG_WARNING,
- -ret, AFR_MSG_DICT_SET_FAILED,
- "Failed to set sh-fail-msg in dict");
- ret = 0;
- goto out;
- } else {
- if (local->xdata_rsp) {
- /* 'sh-fail-msg' has been set in the dict during self-heal.*/
- dict_copy (local->xdata_rsp, dict);
- ret = 0;
- } else if (ret < 0) {
- op_errno = -ret;
- ret = -1;
- }
- }
+ data_count = AFR_COUNT(local->child_up, priv->child_count);
+ if (data_count == 2) {
+ return _gf_true;
+ } else if (data_count == 1 && local->ta_child_up) {
+ return _gf_true;
+ }
-out:
- if (local->op == GF_FOP_GETXATTR)
- AFR_STACK_UNWIND (getxattr, frame, ret, op_errno, dict, NULL);
- else if (local->op == GF_FOP_SETXATTR)
- AFR_STACK_UNWIND (setxattr, frame, ret, op_errno, NULL);
- if (dict)
- dict_unref(dict);
- return ret;
+ return _gf_false;
}
-int
-afr_get_child_index_from_name (xlator_t *this, char *name)
+static gf_boolean_t
+afr_is_add_replica_mount_lookup_on_root(call_frame_t *frame)
{
- afr_private_t *priv = this->private;
- int index = -1;
+ afr_local_t *local = NULL;
- for (index = 0; index < priv->child_count; index++) {
- if (!strcmp (priv->children[index]->name, name))
- goto out;
- }
- index = -1;
-out:
- return index;
+ if (frame->root->pid != GF_CLIENT_PID_ADD_REPLICA_MOUNT)
+ return _gf_false;
+
+ local = frame->local;
+
+ if (local->op != GF_FOP_LOOKUP)
+ /* TODO:If the replica count is being increased on a plain distribute
+ * volume that was never mounted, we need to allow setxattr on '/' with
+ * GF_CLIENT_PID_NO_ROOT_SQUASH to accomodate for DHT layout setting */
+ return _gf_false;
+
+ if (local->inode == NULL)
+ return _gf_false;
+
+ if (!__is_root_gfid(local->inode->gfid))
+ return _gf_false;
+
+ return _gf_true;
}
-void
-afr_priv_need_heal_set (afr_private_t *priv, gf_boolean_t need_heal)
+gf_boolean_t
+afr_lookup_has_quorum(call_frame_t *frame, const unsigned int up_children_count)
{
- LOCK (&priv->lock);
- {
- priv->need_heal = need_heal;
- }
- UNLOCK (&priv->lock);
+ if (frame && (up_children_count > 0) &&
+ afr_is_add_replica_mount_lookup_on_root(frame))
+ return _gf_true;
+
+ return _gf_false;
}
void
-afr_set_need_heal (xlator_t *this, afr_local_t *local)
+afr_handle_replies_quorum(call_frame_t *frame, xlator_t *this)
{
- int i = 0;
- afr_private_t *priv = this->private;
- gf_boolean_t need_heal = _gf_false;
+ afr_local_t *local = frame->local;
+ afr_private_t *priv = this->private;
+ unsigned char *success_replies = NULL;
- for (i = 0; i < priv->child_count; i++) {
- if (local->replies[i].valid && local->replies[i].need_heal) {
- need_heal = _gf_true;
- break;
- }
- }
- afr_priv_need_heal_set (priv, need_heal);
- return;
+ success_replies = alloca0(priv->child_count);
+ afr_fill_success_replies(local, priv, success_replies);
+
+ if (priv->quorum_count && !afr_has_quorum(success_replies, this, NULL)) {
+ local->op_errno = afr_final_errno(local, priv);
+ if (!local->op_errno)
+ local->op_errno = afr_quorum_errno(priv);
+ local->op_ret = -1;
+ }
}
gf_boolean_t
-afr_get_need_heal (xlator_t *this)
-{
- afr_private_t *priv = this->private;
- gf_boolean_t need_heal = _gf_true;
-
- LOCK (&priv->lock);
- {
- need_heal = priv->need_heal;
+afr_ta_dict_contains_pending_xattr(dict_t *dict, afr_private_t *priv, int child)
+{
+ int *pending = NULL;
+ int ret = 0;
+ int i = 0;
+
+ ret = dict_get_ptr(dict, priv->pending_key[child], (void *)&pending);
+ if (ret == 0) {
+ for (i = 0; i < AFR_NUM_CHANGE_LOGS; i++) {
+ /* Not doing a ntoh32(pending) as we just want to check
+ * if it is non-zero or not. */
+ if (pending[i]) {
+ return _gf_true;
+ }
}
- UNLOCK (&priv->lock);
- return need_heal;
+ }
+
+ return _gf_false;
}
diff --git a/xlators/cluster/afr/src/afr-dir-read.c b/xlators/cluster/afr/src/afr-dir-read.c
index 841c64361cf..f8bf8340dab 100644
--- a/xlators/cluster/afr/src/afr-dir-read.c
+++ b/xlators/cluster/afr/src/afr-dir-read.c
@@ -8,344 +8,339 @@
cases as published by the Free Software Foundation.
*/
-
#include <libgen.h>
#include <unistd.h>
-#include <fnmatch.h>
#include <sys/time.h>
#include <stdlib.h>
#include <signal.h>
#include <string.h>
-#include "glusterfs.h"
-#include "dict.h"
-#include "xlator.h"
-#include "hashfn.h"
-#include "logging.h"
-#include "stack.h"
-#include "list.h"
-#include "call-stub.h"
-#include "defaults.h"
-#include "common-utils.h"
-#include "compat-errno.h"
-#include "compat.h"
-#include "checksum.h"
+#include <glusterfs/glusterfs.h>
+#include <glusterfs/dict.h>
+#include <glusterfs/list.h>
+#include <glusterfs/common-utils.h>
+#include <glusterfs/compat-errno.h>
+#include <glusterfs/compat.h>
#include "afr.h"
#include "afr-transaction.h"
-
int32_t
-afr_opendir_cbk (call_frame_t *frame, void *cookie,
- xlator_t *this, int32_t op_ret, int32_t op_errno,
- fd_t *fd, dict_t *xdata)
+afr_opendir_cbk(call_frame_t *frame, void *cookie, xlator_t *this,
+ int32_t op_ret, int32_t op_errno, fd_t *fd, dict_t *xdata)
{
- afr_local_t *local = NULL;
- int call_count = -1;
- int32_t child_index = 0;
- afr_fd_ctx_t *fd_ctx = NULL;
-
- local = frame->local;
- fd_ctx = local->fd_ctx;
- child_index = (long) cookie;
-
- LOCK (&frame->lock);
- {
- if (op_ret == -1) {
- local->op_errno = op_errno;
- fd_ctx->opened_on[child_index] = AFR_FD_NOT_OPENED;
- } else {
- local->op_ret = op_ret;
- fd_ctx->opened_on[child_index] = AFR_FD_OPENED;
- if (!local->xdata_rsp && xdata)
- local->xdata_rsp = dict_ref (xdata);
- }
+ afr_local_t *local = NULL;
+ int call_count = -1;
+ int32_t child_index = 0;
+ afr_fd_ctx_t *fd_ctx = NULL;
+
+ local = frame->local;
+ fd_ctx = local->fd_ctx;
+ child_index = (long)cookie;
+
+ local->replies[child_index].valid = 1;
+ local->replies[child_index].op_ret = op_ret;
+ local->replies[child_index].op_errno = op_errno;
+
+ LOCK(&frame->lock);
+ {
+ if (op_ret == -1) {
+ local->op_errno = op_errno;
+ fd_ctx->opened_on[child_index] = AFR_FD_NOT_OPENED;
+ } else {
+ local->op_ret = op_ret;
+ fd_ctx->opened_on[child_index] = AFR_FD_OPENED;
+ if (!local->xdata_rsp && xdata)
+ local->xdata_rsp = dict_ref(xdata);
}
- UNLOCK (&frame->lock);
+ call_count = --local->call_count;
+ }
+ UNLOCK(&frame->lock);
- call_count = afr_frame_return (frame);
+ if (call_count == 0) {
+ afr_handle_replies_quorum(frame, this);
+ AFR_STACK_UNWIND(opendir, frame, local->op_ret, local->op_errno,
+ local->fd, NULL);
+ }
- if (call_count == 0)
- AFR_STACK_UNWIND (opendir, frame, local->op_ret,
- local->op_errno, local->fd, NULL);
- return 0;
+ return 0;
}
-
int
-afr_opendir (call_frame_t *frame, xlator_t *this, loc_t *loc, fd_t *fd)
+afr_opendir(call_frame_t *frame, xlator_t *this, loc_t *loc, fd_t *fd,
+ dict_t *xdata)
{
- afr_private_t * priv = NULL;
- afr_local_t * local = NULL;
- int i = 0;
- int call_count = -1;
- int32_t op_errno = ENOMEM;
- afr_fd_ctx_t *fd_ctx = NULL;
+ afr_private_t *priv = NULL;
+ afr_local_t *local = NULL;
+ int i = 0;
+ int call_count = -1;
+ int32_t op_errno = ENOMEM;
+ afr_fd_ctx_t *fd_ctx = NULL;
- priv = this->private;
+ priv = this->private;
- local = AFR_FRAME_INIT (frame, op_errno);
- if (!local)
- goto out;
+ local = AFR_FRAME_INIT(frame, op_errno);
+ if (!local)
+ goto out;
- fd_ctx = afr_fd_ctx_get (fd, this);
- if (!fd_ctx)
- goto out;
+ local->op = GF_FOP_OPENDIR;
- loc_copy (&local->loc, loc);
+ if (priv->quorum_count && !afr_has_quorum(local->child_up, this, NULL)) {
+ op_errno = afr_quorum_errno(priv);
+ goto out;
+ }
- local->fd = fd_ref (fd);
- local->fd_ctx = fd_ctx;
+ if (!afr_is_consistent_io_possible(local, priv, &op_errno))
+ goto out;
- call_count = local->call_count;
+ fd_ctx = afr_fd_ctx_get(fd, this);
+ if (!fd_ctx)
+ goto out;
- for (i = 0; i < priv->child_count; i++) {
- if (local->child_up[i]) {
- STACK_WIND_COOKIE (frame, afr_opendir_cbk,
- (void*) (long) i,
- priv->children[i],
- priv->children[i]->fops->opendir,
- loc, fd, NULL);
+ loc_copy(&local->loc, loc);
- if (!--call_count)
- break;
- }
+ local->fd = fd_ref(fd);
+ local->fd_ctx = fd_ctx;
+
+ call_count = local->call_count;
+
+ for (i = 0; i < priv->child_count; i++) {
+ if (local->child_up[i]) {
+ STACK_WIND_COOKIE(frame, afr_opendir_cbk, (void *)(long)i,
+ priv->children[i],
+ priv->children[i]->fops->opendir, loc, fd, NULL);
+
+ if (!--call_count)
+ break;
}
+ }
- return 0;
+ return 0;
out:
- AFR_STACK_UNWIND (opendir, frame, -1, op_errno, fd, NULL);
- return 0;
+ AFR_STACK_UNWIND(opendir, frame, -1, op_errno, fd, NULL);
+ return 0;
}
static int
-afr_validate_read_subvol (inode_t *inode, xlator_t *this, int par_read_subvol)
+afr_validate_read_subvol(inode_t *inode, xlator_t *this, int par_read_subvol)
{
- int gen = 0;
- int entry_read_subvol = 0;
- unsigned char *data_readable = NULL;
- unsigned char *metadata_readable = NULL;
- afr_private_t *priv = NULL;
-
- priv = this->private;
- data_readable = alloca0 (priv->child_count);
- metadata_readable = alloca0 (priv->child_count);
-
- afr_inode_read_subvol_get (inode, this, data_readable,
- metadata_readable, &gen);
-
- if (gen != priv->event_generation ||
- !data_readable[par_read_subvol] ||
- !metadata_readable[par_read_subvol])
- return -1;
-
- /* Once the control reaches the following statement, it means that the
- * parent's read subvol is perfectly readable. So calling
- * either afr_data_subvol_get() or afr_metadata_subvol_get() would
- * yield the same result. Hence, choosing afr_data_subvol_get() below.
- */
-
- if (!priv->consistent_metadata)
- return 0;
-
- /* For an inode fetched through readdirp which is yet to be linked,
- * inode ctx would not be initialised (yet). So this function returns
- * -1 above due to gen being 0, which is why it is OK to pass NULL for
- * read_subvol_args here.
- */
- entry_read_subvol = afr_data_subvol_get (inode, this, 0, 0, NULL);
- if (entry_read_subvol != par_read_subvol)
- return -1;
-
+ int gen = 0;
+ int entry_read_subvol = 0;
+ unsigned char *data_readable = NULL;
+ unsigned char *metadata_readable = NULL;
+ afr_private_t *priv = NULL;
+
+ priv = this->private;
+ data_readable = alloca0(priv->child_count);
+ metadata_readable = alloca0(priv->child_count);
+
+ afr_inode_read_subvol_get(inode, this, data_readable, metadata_readable,
+ &gen);
+
+ if (gen != priv->event_generation || !data_readable[par_read_subvol] ||
+ !metadata_readable[par_read_subvol])
+ return -1;
+
+ /* Once the control reaches the following statement, it means that the
+ * parent's read subvol is perfectly readable. So calling
+ * either afr_data_subvol_get() or afr_metadata_subvol_get() would
+ * yield the same result. Hence, choosing afr_data_subvol_get() below.
+ */
+
+ if (!priv->consistent_metadata)
return 0;
+ /* For an inode fetched through readdirp which is yet to be linked,
+ * inode ctx would not be initialised (yet). So this function returns
+ * -1 above due to gen being 0, which is why it is OK to pass NULL for
+ * read_subvol_args here.
+ */
+ entry_read_subvol = afr_data_subvol_get(inode, this, NULL, NULL, NULL,
+ NULL);
+ if (entry_read_subvol != par_read_subvol)
+ return -1;
+
+ return 0;
}
static void
-afr_readdir_transform_entries (gf_dirent_t *subvol_entries, int subvol,
- gf_dirent_t *entries, fd_t *fd)
+afr_readdir_transform_entries(call_frame_t *frame, gf_dirent_t *subvol_entries,
+ int subvol, gf_dirent_t *entries, fd_t *fd)
{
- int ret = -1;
- gf_dirent_t *entry = NULL;
- gf_dirent_t *tmp = NULL;
- xlator_t *this = NULL;
- afr_private_t *priv = NULL;
- gf_boolean_t need_heal = _gf_false;
- gf_boolean_t validate_subvol = _gf_false;
-
- this = THIS;
- priv = this->private;
-
- need_heal = afr_get_need_heal (this);
- validate_subvol = need_heal | priv->consistent_metadata;
-
- list_for_each_entry_safe (entry, tmp, &subvol_entries->list, list) {
- if (__is_root_gfid (fd->inode->gfid) &&
- !strcmp (entry->d_name, GF_REPLICATE_TRASH_DIR)) {
- continue;
- }
-
- list_del_init (&entry->list);
- list_add_tail (&entry->list, &entries->list);
-
- if (!validate_subvol)
- continue;
-
- if (entry->inode) {
- ret = afr_validate_read_subvol (entry->inode, this,
- subvol);
- if (ret == -1) {
- inode_unref (entry->inode);
- entry->inode = NULL;
- continue;
- }
- }
+ int ret = -1;
+ gf_dirent_t *entry = NULL;
+ gf_dirent_t *tmp = NULL;
+ xlator_t *this = NULL;
+ afr_private_t *priv = NULL;
+ gf_boolean_t need_heal = _gf_false;
+ gf_boolean_t validate_subvol = _gf_false;
+
+ this = THIS;
+ priv = this->private;
+
+ need_heal = afr_get_need_heal(this);
+ validate_subvol = need_heal | priv->consistent_metadata;
+
+ list_for_each_entry_safe(entry, tmp, &subvol_entries->list, list)
+ {
+ if (afr_is_private_directory(priv, fd->inode->gfid, entry->d_name,
+ frame->root->pid)) {
+ continue;
}
-}
+ list_del_init(&entry->list);
+ list_add_tail(&entry->list, &entries->list);
+
+ if (!validate_subvol)
+ continue;
+
+ if (entry->inode) {
+ ret = afr_validate_read_subvol(entry->inode, this, subvol);
+ if (ret == -1) {
+ inode_unref(entry->inode);
+ entry->inode = NULL;
+ continue;
+ }
+ }
+ }
+}
int32_t
-afr_readdir_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
- int32_t op_ret, int32_t op_errno, gf_dirent_t *subvol_entries,
- dict_t *xdata)
+afr_readdir_cbk(call_frame_t *frame, void *cookie, xlator_t *this,
+ int32_t op_ret, int32_t op_errno, gf_dirent_t *subvol_entries,
+ dict_t *xdata)
{
- afr_local_t *local = NULL;
- gf_dirent_t entries;
+ afr_local_t *local = NULL;
+ gf_dirent_t entries;
- INIT_LIST_HEAD (&entries.list);
+ INIT_LIST_HEAD(&entries.list);
- local = frame->local;
+ local = frame->local;
- if (op_ret < 0 && !local->cont.readdir.offset) {
- /* failover only if this was first readdir, detected
- by offset == 0 */
- local->op_ret = op_ret;
- local->op_errno = op_errno;
+ if (op_ret < 0 && !local->cont.readdir.offset) {
+ /* failover only if this was first readdir, detected
+ by offset == 0 */
+ local->op_ret = op_ret;
+ local->op_errno = op_errno;
- afr_read_txn_continue (frame, this, (long) cookie);
- return 0;
- }
+ afr_read_txn_continue(frame, this, (long)cookie);
+ return 0;
+ }
- if (op_ret >= 0)
- afr_readdir_transform_entries (subvol_entries, (long) cookie,
- &entries, local->fd);
+ if (op_ret >= 0)
+ afr_readdir_transform_entries(frame, subvol_entries, (long)cookie,
+ &entries, local->fd);
- AFR_STACK_UNWIND (readdir, frame, op_ret, op_errno, &entries, xdata);
+ AFR_STACK_UNWIND(readdir, frame, op_ret, op_errno, &entries, xdata);
- gf_dirent_free (&entries);
+ gf_dirent_free(&entries);
- return 0;
+ return 0;
}
-
int
-afr_readdir_wind (call_frame_t *frame, xlator_t *this, int subvol)
+afr_readdir_wind(call_frame_t *frame, xlator_t *this, int subvol)
{
- afr_local_t *local = NULL;
- afr_private_t *priv = NULL;
- afr_fd_ctx_t *fd_ctx = NULL;
-
- priv = this->private;
- local = frame->local;
- fd_ctx = afr_fd_ctx_get (local->fd, this);
-
- if (subvol == -1) {
- AFR_STACK_UNWIND (readdir, frame, local->op_ret,
- local->op_errno, 0, 0);
- return 0;
- }
-
- fd_ctx->readdir_subvol = subvol;
-
- if (local->op == GF_FOP_READDIR)
- STACK_WIND_COOKIE (frame, afr_readdir_cbk,
- (void *) (long) subvol,
- priv->children[subvol],
- priv->children[subvol]->fops->readdir,
- local->fd, local->cont.readdir.size,
- local->cont.readdir.offset,
- local->xdata_req);
- else
- STACK_WIND_COOKIE (frame, afr_readdir_cbk,
- (void *) (long) subvol,
- priv->children[subvol],
- priv->children[subvol]->fops->readdirp,
- local->fd, local->cont.readdir.size,
- local->cont.readdir.offset,
- local->xdata_req);
- return 0;
+ afr_local_t *local = NULL;
+ afr_private_t *priv = NULL;
+ afr_fd_ctx_t *fd_ctx = NULL;
+
+ priv = this->private;
+ local = frame->local;
+ fd_ctx = afr_fd_ctx_get(local->fd, this);
+ if (!fd_ctx) {
+ local->op_errno = EINVAL;
+ local->op_ret = -1;
+ }
+
+ if (subvol == -1 || !fd_ctx) {
+ AFR_STACK_UNWIND(readdir, frame, local->op_ret, local->op_errno, 0, 0);
+ return 0;
+ }
+
+ fd_ctx->readdir_subvol = subvol;
+
+ if (local->op == GF_FOP_READDIR)
+ STACK_WIND_COOKIE(frame, afr_readdir_cbk, (void *)(long)subvol,
+ priv->children[subvol],
+ priv->children[subvol]->fops->readdir, local->fd,
+ local->cont.readdir.size, local->cont.readdir.offset,
+ local->xdata_req);
+ else
+ STACK_WIND_COOKIE(frame, afr_readdir_cbk, (void *)(long)subvol,
+ priv->children[subvol],
+ priv->children[subvol]->fops->readdirp, local->fd,
+ local->cont.readdir.size, local->cont.readdir.offset,
+ local->xdata_req);
+ return 0;
}
-
int
-afr_do_readdir (call_frame_t *frame, xlator_t *this, fd_t *fd, size_t size,
- off_t offset, int whichop, dict_t *dict)
+afr_do_readdir(call_frame_t *frame, xlator_t *this, fd_t *fd, size_t size,
+ off_t offset, int whichop, dict_t *dict)
{
- afr_local_t *local = NULL;
- int32_t op_errno = 0;
- int subvol = -1;
- afr_fd_ctx_t *fd_ctx = NULL;
-
- local = AFR_FRAME_INIT (frame, op_errno);
- if (!local)
- goto out;
-
- fd_ctx = afr_fd_ctx_get (fd, this);
- if (!fd_ctx) {
- op_errno = EINVAL;
- goto out;
- }
-
- local->op = whichop;
- local->fd = fd_ref (fd);
- local->cont.readdir.size = size;
- local->cont.readdir.offset = offset;
- local->xdata_req = (dict)? dict_ref (dict) : NULL;
-
- subvol = fd_ctx->readdir_subvol;
-
- if (offset == 0 || subvol == -1) {
- /* First readdir has option of failing over and selecting
- an appropriate read subvolume */
- afr_read_txn (frame, this, fd->inode, afr_readdir_wind,
- AFR_DATA_TRANSACTION);
- } else {
- /* But continued readdirs MUST stick to the same subvolume
- without an option to failover */
- afr_readdir_wind (frame, this, subvol);
- }
-
- return 0;
+ afr_local_t *local = NULL;
+ int32_t op_errno = 0;
+ int subvol = -1;
+ afr_fd_ctx_t *fd_ctx = NULL;
+
+ local = AFR_FRAME_INIT(frame, op_errno);
+ if (!local)
+ goto out;
+
+ fd_ctx = afr_fd_ctx_get(fd, this);
+ if (!fd_ctx) {
+ op_errno = EINVAL;
+ goto out;
+ }
+
+ local->op = whichop;
+ local->fd = fd_ref(fd);
+ local->cont.readdir.size = size;
+ local->cont.readdir.offset = offset;
+ local->xdata_req = (dict) ? dict_ref(dict) : NULL;
+
+ subvol = fd_ctx->readdir_subvol;
+
+ if (offset == 0 || subvol == -1) {
+ /* First readdir has option of failing over and selecting
+ an appropriate read subvolume */
+ afr_read_txn(frame, this, fd->inode, afr_readdir_wind,
+ AFR_DATA_TRANSACTION);
+ } else {
+ /* But continued readdirs MUST stick to the same subvolume
+ without an option to failover */
+ afr_readdir_wind(frame, this, subvol);
+ }
+
+ return 0;
out:
- AFR_STACK_UNWIND (readdir, frame, -1, op_errno, NULL, NULL);
- return 0;
+ AFR_STACK_UNWIND(readdir, frame, -1, op_errno, NULL, NULL);
+ return 0;
}
-
int32_t
-afr_readdir (call_frame_t *frame, xlator_t *this, fd_t *fd, size_t size,
- off_t offset, dict_t *xdata)
+afr_readdir(call_frame_t *frame, xlator_t *this, fd_t *fd, size_t size,
+ off_t offset, dict_t *xdata)
{
- afr_do_readdir (frame, this, fd, size, offset, GF_FOP_READDIR, xdata);
+ afr_do_readdir(frame, this, fd, size, offset, GF_FOP_READDIR, xdata);
- return 0;
+ return 0;
}
-
int32_t
-afr_readdirp (call_frame_t *frame, xlator_t *this, fd_t *fd, size_t size,
- off_t offset, dict_t *dict)
+afr_readdirp(call_frame_t *frame, xlator_t *this, fd_t *fd, size_t size,
+ off_t offset, dict_t *dict)
{
- afr_do_readdir (frame, this, fd, size, offset, GF_FOP_READDIRP, dict);
+ afr_do_readdir(frame, this, fd, size, offset, GF_FOP_READDIRP, dict);
- return 0;
+ return 0;
}
-
int32_t
-afr_releasedir (xlator_t *this, fd_t *fd)
+afr_releasedir(xlator_t *this, fd_t *fd)
{
- afr_cleanup_fd_ctx (this, fd);
+ afr_cleanup_fd_ctx(this, fd);
- return 0;
+ return 0;
}
diff --git a/xlators/cluster/afr/src/afr-dir-read.h b/xlators/cluster/afr/src/afr-dir-read.h
index 09456d15949..773e925ec6c 100644
--- a/xlators/cluster/afr/src/afr-dir-read.h
+++ b/xlators/cluster/afr/src/afr-dir-read.h
@@ -11,26 +11,23 @@
#ifndef __DIR_READ_H__
#define __DIR_READ_H__
-
int32_t
-afr_opendir (call_frame_t *frame, xlator_t *this,
- loc_t *loc, fd_t *fd, dict_t *xdata);
+afr_opendir(call_frame_t *frame, xlator_t *this, loc_t *loc, fd_t *fd,
+ dict_t *xdata);
int32_t
-afr_releasedir (xlator_t *this, fd_t *fd);
+afr_releasedir(xlator_t *this, fd_t *fd);
int32_t
-afr_readdir (call_frame_t *frame, xlator_t *this,
- fd_t *fd, size_t size, off_t offset, dict_t *xdata);
-
+afr_readdir(call_frame_t *frame, xlator_t *this, fd_t *fd, size_t size,
+ off_t offset, dict_t *xdata);
int32_t
-afr_readdirp (call_frame_t *frame, xlator_t *this,
- fd_t *fd, size_t size, off_t offset, dict_t *dict);
+afr_readdirp(call_frame_t *frame, xlator_t *this, fd_t *fd, size_t size,
+ off_t offset, dict_t *dict);
int32_t
-afr_checksum (call_frame_t *frame, xlator_t *this,
- loc_t *loc, int32_t flags, dict_t *xdata);
-
+afr_checksum(call_frame_t *frame, xlator_t *this, loc_t *loc, int32_t flags,
+ dict_t *xdata);
#endif /* __DIR_READ_H__ */
diff --git a/xlators/cluster/afr/src/afr-dir-write.c b/xlators/cluster/afr/src/afr-dir-write.c
index 3f4ad246992..b7cceb79158 100644
--- a/xlators/cluster/afr/src/afr-dir-write.c
+++ b/xlators/cluster/afr/src/afr-dir-write.c
@@ -8,504 +8,493 @@
cases as published by the Free Software Foundation.
*/
-
#include <libgen.h>
#include <unistd.h>
-#include <fnmatch.h>
#include <sys/time.h>
#include <stdlib.h>
#include <signal.h>
-#include "glusterfs.h"
+#include <glusterfs/glusterfs.h>
#include "afr.h"
-#include "dict.h"
-#include "xlator.h"
-#include "hashfn.h"
-#include "logging.h"
-#include "stack.h"
-#include "list.h"
-#include "call-stub.h"
-#include "defaults.h"
-#include "common-utils.h"
-#include "compat-errno.h"
-#include "compat.h"
-#include "byte-order.h"
+#include <glusterfs/dict.h>
+#include <glusterfs/logging.h>
+#include <glusterfs/list.h>
+#include <glusterfs/defaults.h>
+#include <glusterfs/common-utils.h>
+#include <glusterfs/compat-errno.h>
+#include <glusterfs/compat.h>
+#include <glusterfs/byte-order.h>
#include "afr.h"
#include "afr-transaction.h"
void
-afr_mark_entry_pending_changelog (call_frame_t *frame, xlator_t *this);
+afr_mark_entry_pending_changelog(call_frame_t *frame, xlator_t *this);
int
-afr_build_parent_loc (loc_t *parent, loc_t *child, int32_t *op_errno)
+afr_build_parent_loc(loc_t *parent, loc_t *child, int32_t *op_errno)
{
- int ret = -1;
- char *child_path = NULL;
+ int ret = -1;
+ char *child_path = NULL;
+
+ if (!child->parent) {
+ if (op_errno)
+ *op_errno = EINVAL;
+ goto out;
+ }
+
+ child_path = gf_strdup(child->path);
+ if (!child_path) {
+ if (op_errno)
+ *op_errno = ENOMEM;
+ goto out;
+ }
+
+ parent->path = gf_strdup(dirname(child_path));
+ if (!parent->path) {
+ if (op_errno)
+ *op_errno = ENOMEM;
+ goto out;
+ }
+
+ parent->inode = inode_ref(child->parent);
+ gf_uuid_copy(parent->gfid, child->pargfid);
+
+ ret = 0;
+out:
+ GF_FREE(child_path);
- if (!child->parent) {
- if (op_errno)
- *op_errno = EINVAL;
- goto out;
- }
+ return ret;
+}
- child_path = gf_strdup (child->path);
- if (!child_path) {
- if (op_errno)
- *op_errno = ENOMEM;
- goto out;
+static void
+__afr_dir_write_finalize(call_frame_t *frame, xlator_t *this)
+{
+ afr_local_t *local = NULL;
+ afr_private_t *priv = NULL;
+ int inode_read_subvol = -1;
+ int parent_read_subvol = -1;
+ int parent2_read_subvol = -1;
+ int i = 0;
+ afr_read_subvol_args_t args = {
+ 0,
+ };
+
+ local = frame->local;
+ priv = this->private;
+
+ for (i = 0; i < priv->child_count; i++) {
+ if (!local->replies[i].valid)
+ continue;
+ if (local->replies[i].op_ret == -1)
+ continue;
+ gf_uuid_copy(args.gfid, local->replies[i].poststat.ia_gfid);
+ args.ia_type = local->replies[i].poststat.ia_type;
+ break;
+ }
+
+ if (local->inode) {
+ if (local->op != GF_FOP_RENAME && local->op != GF_FOP_LINK)
+ afr_replies_interpret(frame, this, local->inode, NULL);
+
+ inode_read_subvol = afr_data_subvol_get(local->inode, this, NULL, NULL,
+ NULL, &args);
+ }
+
+ if (local->parent)
+ parent_read_subvol = afr_data_subvol_get(local->parent, this, NULL,
+ local->readable, NULL, NULL);
+
+ if (local->parent2)
+ parent2_read_subvol = afr_data_subvol_get(local->parent2, this, NULL,
+ local->readable2, NULL, NULL);
+
+ local->op_ret = -1;
+ local->op_errno = afr_final_errno(local, priv);
+ afr_pick_error_xdata(local, priv, local->parent, local->readable,
+ local->parent2, local->readable2);
+
+ for (i = 0; i < priv->child_count; i++) {
+ if (!local->replies[i].valid)
+ continue;
+ if (local->replies[i].op_ret < 0) {
+ if (local->inode)
+ afr_inode_need_refresh_set(local->inode, this);
+ if (local->parent)
+ afr_inode_need_refresh_set(local->parent, this);
+ if (local->parent2)
+ afr_inode_need_refresh_set(local->parent2, this);
+ continue;
}
- parent->path = gf_strdup (dirname (child_path));
- if (!parent->path) {
- if (op_errno)
- *op_errno = ENOMEM;
- goto out;
+ if (local->op_ret == -1) {
+ local->op_ret = local->replies[i].op_ret;
+ local->op_errno = local->replies[i].op_errno;
+
+ local->cont.dir_fop.buf = local->replies[i].poststat;
+ local->cont.dir_fop.preparent = local->replies[i].preparent;
+ local->cont.dir_fop.postparent = local->replies[i].postparent;
+ local->cont.dir_fop.prenewparent = local->replies[i].preparent2;
+ local->cont.dir_fop.postnewparent = local->replies[i].postparent2;
+ if (local->xdata_rsp) {
+ dict_unref(local->xdata_rsp);
+ local->xdata_rsp = NULL;
+ }
+
+ if (local->replies[i].xdata)
+ local->xdata_rsp = dict_ref(local->replies[i].xdata);
+ continue;
}
- parent->inode = inode_ref (child->parent);
- gf_uuid_copy (parent->gfid, child->pargfid);
-
- ret = 0;
-out:
- GF_FREE (child_path);
-
- return ret;
-}
+ if (i == inode_read_subvol) {
+ local->cont.dir_fop.buf = local->replies[i].poststat;
+ if (local->replies[i].xdata) {
+ if (local->xdata_rsp)
+ dict_unref(local->xdata_rsp);
+ local->xdata_rsp = dict_ref(local->replies[i].xdata);
+ }
+ }
+ if (i == parent_read_subvol) {
+ local->cont.dir_fop.preparent = local->replies[i].preparent;
+ local->cont.dir_fop.postparent = local->replies[i].postparent;
+ }
-static void
-__afr_dir_write_finalize (call_frame_t *frame, xlator_t *this)
-{
- afr_local_t *local = NULL;
- afr_private_t *priv = NULL;
- int inode_read_subvol = -1;
- int parent_read_subvol = -1;
- int parent2_read_subvol = -1;
- int i = 0;
-
- local = frame->local;
- priv = this->private;
-
- if (local->inode) {
- afr_replies_interpret (frame, this, local->inode);
- inode_read_subvol = afr_data_subvol_get (local->inode, this,
- NULL, NULL, NULL);
- }
- if (local->parent)
- parent_read_subvol = afr_data_subvol_get (local->parent, this,
- NULL, NULL, NULL);
- if (local->parent2)
- parent2_read_subvol = afr_data_subvol_get (local->parent2, this,
- NULL, NULL, NULL);
-
- local->op_ret = -1;
- local->op_errno = afr_final_errno (local, priv);
-
- for (i = 0; i < priv->child_count; i++) {
- if (!local->replies[i].valid)
- continue;
- if (local->replies[i].op_ret < 0) {
- if (local->inode)
- afr_inode_read_subvol_reset (local->inode,
- this);
- if (local->parent)
- afr_inode_read_subvol_reset (local->parent,
- this);
- if (local->parent2)
- afr_inode_read_subvol_reset (local->parent2,
- this);
- continue;
- }
-
- if (local->op_ret == -1) {
- local->op_ret = local->replies[i].op_ret;
- local->op_errno = local->replies[i].op_errno;
-
- local->cont.dir_fop.buf =
- local->replies[i].poststat;
- local->cont.dir_fop.preparent =
- local->replies[i].preparent;
- local->cont.dir_fop.postparent =
- local->replies[i].postparent;
- local->cont.dir_fop.prenewparent =
- local->replies[i].preparent2;
- local->cont.dir_fop.postnewparent =
- local->replies[i].postparent2;
- if (local->replies[i].xdata)
- local->xdata_rsp =
- dict_ref (local->replies[i].xdata);
- continue;
- }
-
- if (i == inode_read_subvol) {
- local->cont.dir_fop.buf =
- local->replies[i].poststat;
- if (local->replies[i].xdata) {
- if (local->xdata_rsp)
- dict_unref (local->xdata_rsp);
- local->xdata_rsp =
- dict_ref (local->replies[i].xdata);
- }
- }
-
- if (i == parent_read_subvol) {
- local->cont.dir_fop.preparent =
- local->replies[i].preparent;
- local->cont.dir_fop.postparent =
- local->replies[i].postparent;
- }
-
- if (i == parent2_read_subvol) {
- local->cont.dir_fop.prenewparent =
- local->replies[i].preparent2;
- local->cont.dir_fop.postnewparent =
- local->replies[i].postparent2;
- }
- }
-
- afr_txn_arbitrate_fop_cbk (frame, this);
+ if (i == parent2_read_subvol) {
+ local->cont.dir_fop.prenewparent = local->replies[i].preparent2;
+ local->cont.dir_fop.postnewparent = local->replies[i].postparent2;
+ }
+ }
}
-
static void
-__afr_dir_write_fill (call_frame_t *frame, xlator_t *this, int child_index,
- int op_ret, int op_errno, struct iatt *poststat,
- struct iatt *preparent, struct iatt *postparent,
- struct iatt *preparent2, struct iatt *postparent2,
- dict_t *xdata)
+__afr_dir_write_fill(call_frame_t *frame, xlator_t *this, int child_index,
+ int op_ret, int op_errno, struct iatt *poststat,
+ struct iatt *preparent, struct iatt *postparent,
+ struct iatt *preparent2, struct iatt *postparent2,
+ dict_t *xdata)
{
- afr_local_t *local = NULL;
- afr_fd_ctx_t *fd_ctx = NULL;
-
- local = frame->local;
- fd_ctx = local->fd_ctx;
-
- local->replies[child_index].valid = 1;
- local->replies[child_index].op_ret = op_ret;
- local->replies[child_index].op_errno = op_errno;
-
- if (op_ret >= 0) {
- if (poststat)
- local->replies[child_index].poststat = *poststat;
- if (preparent)
- local->replies[child_index].preparent = *preparent;
- if (postparent)
- local->replies[child_index].postparent = *postparent;
- if (preparent2)
- local->replies[child_index].preparent2 = *preparent2;
- if (postparent2)
- local->replies[child_index].postparent2 = *postparent2;
- if (xdata)
- local->replies[child_index].xdata = dict_ref (xdata);
-
- if (fd_ctx)
- fd_ctx->opened_on[child_index] = AFR_FD_OPENED;
- } else {
- if (op_errno != ENOTEMPTY)
- afr_transaction_fop_failed (frame, this, child_index);
- if (fd_ctx)
- fd_ctx->opened_on[child_index] = AFR_FD_NOT_OPENED;
- }
-
- return;
+ afr_local_t *local = NULL;
+ afr_fd_ctx_t *fd_ctx = NULL;
+
+ local = frame->local;
+ fd_ctx = local->fd_ctx;
+
+ local->replies[child_index].valid = 1;
+ local->replies[child_index].op_ret = op_ret;
+ local->replies[child_index].op_errno = op_errno;
+ if (xdata)
+ local->replies[child_index].xdata = dict_ref(xdata);
+
+ if (op_ret >= 0) {
+ if (poststat)
+ local->replies[child_index].poststat = *poststat;
+ if (preparent)
+ local->replies[child_index].preparent = *preparent;
+ if (postparent)
+ local->replies[child_index].postparent = *postparent;
+ if (preparent2)
+ local->replies[child_index].preparent2 = *preparent2;
+ if (postparent2)
+ local->replies[child_index].postparent2 = *postparent2;
+ if (fd_ctx)
+ fd_ctx->opened_on[child_index] = AFR_FD_OPENED;
+ } else {
+ if (op_errno != ENOTEMPTY)
+ afr_transaction_fop_failed(frame, this, child_index);
+ if (fd_ctx)
+ fd_ctx->opened_on[child_index] = AFR_FD_NOT_OPENED;
+ }
+
+ return;
}
-
static int
-__afr_dir_write_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
- int op_ret, int op_errno, struct iatt *buf,
- struct iatt *preparent, struct iatt *postparent,
- struct iatt *preparent2, struct iatt *postparent2,
- dict_t *xdata)
+__afr_dir_write_cbk(call_frame_t *frame, void *cookie, xlator_t *this,
+ int op_ret, int op_errno, struct iatt *buf,
+ struct iatt *preparent, struct iatt *postparent,
+ struct iatt *preparent2, struct iatt *postparent2,
+ dict_t *xdata)
{
- afr_local_t *local = NULL;
- int child_index = (long) cookie;
- int call_count = -1;
-
- local = frame->local;
-
- LOCK (&frame->lock);
- {
- __afr_dir_write_fill (frame, this, child_index, op_ret,
- op_errno, buf, preparent, postparent,
- preparent2, postparent2, xdata);
- }
- UNLOCK (&frame->lock);
- call_count = afr_frame_return (frame);
-
- if (call_count == 0) {
- __afr_dir_write_finalize (frame, this);
-
- if (afr_txn_nothing_failed (frame, this))
- local->transaction.unwind (frame, this);
+ afr_local_t *local = NULL;
+ int child_index = (long)cookie;
+ int call_count = -1;
+ afr_private_t *priv = NULL;
+
+ priv = this->private;
+ local = frame->local;
+
+ LOCK(&frame->lock);
+ {
+ __afr_dir_write_fill(frame, this, child_index, op_ret, op_errno, buf,
+ preparent, postparent, preparent2, postparent2,
+ xdata);
+ call_count = --local->call_count;
+ }
+ UNLOCK(&frame->lock);
+
+ if (call_count == 0) {
+ __afr_dir_write_finalize(frame, this);
+
+ if (afr_txn_nothing_failed(frame, this)) {
+ /*if it did pre-op, it will do post-op changing ctime*/
+ if (priv->consistent_metadata && afr_needs_changelog_update(local))
+ afr_zero_fill_stat(local);
+ local->transaction.unwind(frame, this);
+ }
- afr_mark_entry_pending_changelog (frame, this);
+ afr_mark_entry_pending_changelog(frame, this);
- local->transaction.resume (frame, this);
- }
+ afr_transaction_resume(frame, this);
+ }
- return 0;
+ return 0;
}
-
int
-afr_mark_new_entry_changelog_cbk (call_frame_t *frame, void *cookie,
- xlator_t *this, int op_ret, int op_errno,
- dict_t *xattr, dict_t *xdata)
+afr_mark_new_entry_changelog_cbk(call_frame_t *frame, void *cookie,
+ xlator_t *this, int op_ret, int op_errno,
+ dict_t *xattr, dict_t *xdata)
{
- int call_count = 0;
+ int call_count = 0;
- call_count = afr_frame_return (frame);
+ call_count = afr_frame_return(frame);
- if (call_count == 0)
- AFR_STACK_DESTROY (frame);
+ if (call_count == 0)
+ AFR_STACK_DESTROY(frame);
- return 0;
+ return 0;
}
-
void
-afr_mark_new_entry_changelog (call_frame_t *frame, xlator_t *this)
+afr_mark_new_entry_changelog(call_frame_t *frame, xlator_t *this)
{
- call_frame_t *new_frame = NULL;
- afr_local_t *local = NULL;
- afr_local_t *new_local = NULL;
- afr_private_t *priv = NULL;
- dict_t *xattr = NULL;
- int32_t **changelog = NULL;
- int i = 0;
- int op_errno = ENOMEM;
- unsigned char *pending = NULL;
- int call_count = 0;
-
- local = frame->local;
- priv = this->private;
-
- new_frame = copy_frame (frame);
- if (!new_frame)
- goto out;
-
- new_local = AFR_FRAME_INIT (new_frame, op_errno);
- if (!new_local)
- goto out;
-
- xattr = dict_new ();
- if (!xattr)
- goto out;
-
- pending = alloca0 (priv->child_count);
-
- for (i = 0; i < priv->child_count; i++) {
- if (local->transaction.pre_op[i] &&
- !local->transaction.failed_subvols[i]) {
- call_count ++;
- continue;
- }
- pending[i] = 1;
- }
-
- changelog = afr_mark_pending_changelog (priv, pending, xattr,
- local->cont.dir_fop.buf.ia_type);
- if (!changelog)
- goto out;
-
- new_local->pending = changelog;
- gf_uuid_copy (new_local->loc.gfid, local->cont.dir_fop.buf.ia_gfid);
- new_local->loc.inode = inode_ref (local->inode);
-
- new_local->call_count = call_count;
-
- for (i = 0; i < priv->child_count; i++) {
- if (pending[i])
- continue;
-
- STACK_WIND_COOKIE (new_frame, afr_mark_new_entry_changelog_cbk,
- (void *) (long) i, priv->children[i],
- priv->children[i]->fops->xattrop,
- &new_local->loc, GF_XATTROP_ADD_ARRAY,
- xattr, NULL);
- if (!--call_count)
- break;
+ call_frame_t *new_frame = NULL;
+ afr_local_t *local = NULL;
+ afr_local_t *new_local = NULL;
+ afr_private_t *priv = NULL;
+ dict_t *xattr = NULL;
+ int32_t **changelog = NULL;
+ int i = 0;
+ int op_errno = ENOMEM;
+ unsigned char *pending = NULL;
+ int call_count = 0;
+
+ local = frame->local;
+ priv = this->private;
+
+ new_frame = copy_frame(frame);
+ if (!new_frame)
+ goto out;
+
+ new_local = AFR_FRAME_INIT(new_frame, op_errno);
+ if (!new_local)
+ goto out;
+
+ xattr = dict_new();
+ if (!xattr)
+ goto out;
+
+ pending = alloca0(priv->child_count);
+
+ for (i = 0; i < priv->child_count; i++) {
+ if (local->transaction.pre_op[i] &&
+ !local->transaction.failed_subvols[i]) {
+ call_count++;
+ continue;
}
+ pending[i] = 1;
+ }
+
+ changelog = afr_mark_pending_changelog(priv, pending, xattr,
+ local->cont.dir_fop.buf.ia_type);
+ if (!changelog)
+ goto out;
+
+ new_local->pending = changelog;
+ gf_uuid_copy(new_local->loc.gfid, local->cont.dir_fop.buf.ia_gfid);
+ new_local->loc.inode = inode_ref(local->inode);
+
+ new_local->call_count = call_count;
+
+ for (i = 0; i < priv->child_count; i++) {
+ if (pending[i])
+ continue;
- new_frame = NULL;
+ STACK_WIND_COOKIE(new_frame, afr_mark_new_entry_changelog_cbk,
+ (void *)(long)i, priv->children[i],
+ priv->children[i]->fops->xattrop, &new_local->loc,
+ GF_XATTROP_ADD_ARRAY, xattr, NULL);
+ if (!--call_count)
+ break;
+ }
+
+ new_frame = NULL;
out:
- if (new_frame)
- AFR_STACK_DESTROY (new_frame);
- if (xattr)
- dict_unref (xattr);
- return;
+ if (new_frame)
+ AFR_STACK_DESTROY(new_frame);
+ if (xattr)
+ dict_unref(xattr);
+ return;
}
-
void
-afr_mark_entry_pending_changelog (call_frame_t *frame, xlator_t *this)
+afr_mark_entry_pending_changelog(call_frame_t *frame, xlator_t *this)
{
- afr_local_t *local = NULL;
- afr_private_t *priv = NULL;
- int pre_op_count = 0;
- int failed_count = 0;
+ afr_local_t *local = NULL;
+ afr_private_t *priv = NULL;
+ int pre_op_count = 0;
+ int failed_count = 0;
+ unsigned char *success_replies = NULL;
- local = frame->local;
- priv = this->private;
+ local = frame->local;
+ priv = this->private;
- if (local->op_ret < 0)
- return;
+ if (local->op_ret < 0)
+ return;
- if (local->op != GF_FOP_CREATE && local->op != GF_FOP_MKNOD)
- return;
+ if (local->op != GF_FOP_CREATE && local->op != GF_FOP_MKNOD &&
+ local->op != GF_FOP_MKDIR)
+ return;
- pre_op_count = AFR_COUNT (local->transaction.pre_op, priv->child_count);
- failed_count = AFR_COUNT (local->transaction.failed_subvols,
- priv->child_count);
+ pre_op_count = AFR_COUNT(local->transaction.pre_op, priv->child_count);
+ failed_count = AFR_COUNT(local->transaction.failed_subvols,
+ priv->child_count);
- if (pre_op_count == priv->child_count && !failed_count)
- return;
+ /* FOP succeeded on all bricks. */
+ if (pre_op_count == priv->child_count && !failed_count)
+ return;
- afr_mark_new_entry_changelog (frame, this);
+ /* FOP did not suceed on quorum no. of bricks. */
+ success_replies = alloca0(priv->child_count);
+ afr_fill_success_replies(local, priv, success_replies);
+ if (!afr_has_quorum(success_replies, this, NULL))
+ return;
+ if (priv->thin_arbiter_count) {
+ /*Mark new entry using ta file*/
+ local->is_new_entry = _gf_true;
return;
-}
+ }
+ afr_mark_new_entry_changelog(frame, this);
+
+ return;
+}
/* {{{ create */
int
-afr_create_unwind (call_frame_t *frame, xlator_t *this)
+afr_create_unwind(call_frame_t *frame, xlator_t *this)
{
- call_frame_t *main_frame = NULL;
- afr_local_t *local = NULL;
+ call_frame_t *main_frame = NULL;
+ afr_local_t *local = NULL;
- local = frame->local;
+ local = frame->local;
- main_frame = afr_transaction_detach_fop_frame (frame);
+ main_frame = afr_transaction_detach_fop_frame(frame);
- if (!main_frame)
- return 0;
-
- AFR_STACK_UNWIND (create, main_frame, local->op_ret, local->op_errno,
- local->cont.create.fd, local->inode,
- &local->cont.dir_fop.buf,
- &local->cont.dir_fop.preparent,
- &local->cont.dir_fop.postparent, local->xdata_rsp);
+ if (!main_frame)
return 0;
-}
+ AFR_STACK_UNWIND(create, main_frame, local->op_ret, local->op_errno,
+ local->cont.create.fd, local->inode,
+ &local->cont.dir_fop.buf, &local->cont.dir_fop.preparent,
+ &local->cont.dir_fop.postparent, local->xdata_rsp);
+ return 0;
+}
int
-afr_create_wind_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
- int32_t op_ret, int32_t op_errno,
- fd_t *fd, inode_t *inode, struct iatt *buf,
- struct iatt *preparent, struct iatt *postparent,
- dict_t *xdata)
+afr_create_wind_cbk(call_frame_t *frame, void *cookie, xlator_t *this,
+ int32_t op_ret, int32_t op_errno, fd_t *fd, inode_t *inode,
+ struct iatt *buf, struct iatt *preparent,
+ struct iatt *postparent, dict_t *xdata)
{
- return __afr_dir_write_cbk (frame, cookie, this, op_ret, op_errno, buf,
- preparent, postparent, NULL, NULL, xdata);
+ return __afr_dir_write_cbk(frame, cookie, this, op_ret, op_errno, buf,
+ preparent, postparent, NULL, NULL, xdata);
}
-
int
-afr_create_wind (call_frame_t *frame, xlator_t *this, int subvol)
+afr_create_wind(call_frame_t *frame, xlator_t *this, int subvol)
{
- afr_local_t *local = NULL;
- afr_private_t *priv = NULL;
-
- local = frame->local;
- priv = this->private;
-
- STACK_WIND_COOKIE (frame, afr_create_wind_cbk, (void *) (long) subvol,
- priv->children[subvol],
- priv->children[subvol]->fops->create,
- &local->loc, local->cont.create.flags,
- local->cont.create.mode, local->umask,
- local->cont.create.fd, local->xdata_req);
- return 0;
+ afr_local_t *local = NULL;
+ afr_private_t *priv = NULL;
+
+ local = frame->local;
+ priv = this->private;
+
+ STACK_WIND_COOKIE(frame, afr_create_wind_cbk, (void *)(long)subvol,
+ priv->children[subvol],
+ priv->children[subvol]->fops->create, &local->loc,
+ local->cont.create.flags, local->cont.create.mode,
+ local->umask, local->cont.create.fd, local->xdata_req);
+ return 0;
}
-
int
-afr_create (call_frame_t *frame, xlator_t *this, loc_t *loc, int32_t flags,
- mode_t mode, mode_t umask, fd_t *fd, dict_t *xdata)
+afr_create(call_frame_t *frame, xlator_t *this, loc_t *loc, int32_t flags,
+ mode_t mode, mode_t umask, fd_t *fd, dict_t *xdata)
{
- afr_private_t *priv = NULL;
- afr_local_t *local = NULL;
- afr_internal_lock_t *int_lock = NULL;
- call_frame_t *transaction_frame = NULL;
- int ret = -1;
- int op_errno = ENOMEM;
-
- priv = this->private;
-
- transaction_frame = copy_frame (frame);
- if (!transaction_frame)
- goto out;
-
- local = AFR_FRAME_INIT (transaction_frame, op_errno);
- if (!local)
- goto out;
-
- loc_copy (&local->loc, loc);
-
- local->fd_ctx = afr_fd_ctx_get (fd, this);
- if (!local->fd_ctx)
- goto out;
-
- local->inode = inode_ref (loc->inode);
- local->parent = inode_ref (loc->parent);
-
- local->op = GF_FOP_CREATE;
- local->cont.create.flags = flags;
- local->fd_ctx->flags = flags;
- local->cont.create.mode = mode;
- local->cont.create.fd = fd_ref (fd);
- local->umask = umask;
-
- if (xdata)
- local->xdata_req = dict_copy_with_ref (xdata, NULL);
- else
- local->xdata_req = dict_new ();
-
- if (!local->xdata_req)
- goto out;
-
- local->transaction.wind = afr_create_wind;
- local->transaction.fop = __afr_txn_write_fop;
- local->transaction.done = __afr_txn_write_done;
- local->transaction.unwind = afr_create_unwind;
-
- ret = afr_build_parent_loc (&local->transaction.parent_loc, loc,
- &op_errno);
- if (ret)
- goto out;
-
- local->transaction.main_frame = frame;
- local->transaction.basename = AFR_BASENAME (loc->path);
- int_lock = &local->internal_lock;
-
- int_lock->lockee_count = 0;
- ret = afr_init_entry_lockee (&int_lock->lockee[0], local,
- &local->transaction.parent_loc,
- local->transaction.basename,
- priv->child_count);
- if (ret)
- goto out;
-
- int_lock->lockee_count++;
- ret = afr_transaction (transaction_frame, this, AFR_ENTRY_TRANSACTION);
- if (ret < 0) {
- op_errno = -ret;
- goto out;
- }
-
- return 0;
+ afr_local_t *local = NULL;
+ call_frame_t *transaction_frame = NULL;
+ int ret = -1;
+ int op_errno = ENOMEM;
+
+ transaction_frame = copy_frame(frame);
+ if (!transaction_frame)
+ goto out;
+
+ local = AFR_FRAME_INIT(transaction_frame, op_errno);
+ if (!local)
+ goto out;
+
+ loc_copy(&local->loc, loc);
+
+ local->fd_ctx = afr_fd_ctx_get(fd, this);
+ if (!local->fd_ctx)
+ goto out;
+
+ local->inode = inode_ref(loc->inode);
+ local->parent = inode_ref(loc->parent);
+
+ local->op = GF_FOP_CREATE;
+ local->cont.create.flags = flags;
+ local->fd_ctx->flags = flags;
+ local->cont.create.mode = mode;
+ local->cont.create.fd = fd_ref(fd);
+ local->umask = umask;
+
+ if (xdata)
+ local->xdata_req = dict_copy_with_ref(xdata, NULL);
+ else
+ local->xdata_req = dict_new();
+
+ if (!local->xdata_req)
+ goto out;
+
+ local->transaction.wind = afr_create_wind;
+ local->transaction.unwind = afr_create_unwind;
+
+ ret = afr_build_parent_loc(&local->transaction.parent_loc, loc, &op_errno);
+ if (ret)
+ goto out;
+
+ local->transaction.main_frame = frame;
+ local->transaction.basename = AFR_BASENAME(loc->path);
+ ret = afr_transaction(transaction_frame, this, AFR_ENTRY_TRANSACTION);
+ if (ret < 0) {
+ op_errno = -ret;
+ goto out;
+ }
+
+ return 0;
out:
- if (transaction_frame)
- AFR_STACK_DESTROY (transaction_frame);
+ if (transaction_frame)
+ AFR_STACK_DESTROY(transaction_frame);
- AFR_STACK_UNWIND (create, frame, -1, op_errno, NULL, NULL, NULL, NULL,
- NULL, NULL);
- return 0;
+ AFR_STACK_UNWIND(create, frame, -1, op_errno, NULL, NULL, NULL, NULL, NULL,
+ NULL);
+ return 0;
}
/* }}} */
@@ -513,518 +502,436 @@ out:
/* {{{ mknod */
int
-afr_mknod_unwind (call_frame_t *frame, xlator_t *this)
+afr_mknod_unwind(call_frame_t *frame, xlator_t *this)
{
- call_frame_t *main_frame = NULL;
- afr_local_t *local = NULL;
+ call_frame_t *main_frame = NULL;
+ afr_local_t *local = NULL;
- local = frame->local;
+ local = frame->local;
- main_frame = afr_transaction_detach_fop_frame (frame);
- if (!main_frame)
- return 0;
-
- AFR_STACK_UNWIND (mknod, main_frame, local->op_ret, local->op_errno,
- local->inode, &local->cont.dir_fop.buf,
- &local->cont.dir_fop.preparent,
- &local->cont.dir_fop.postparent, local->xdata_rsp);
+ main_frame = afr_transaction_detach_fop_frame(frame);
+ if (!main_frame)
return 0;
-}
+ AFR_STACK_UNWIND(mknod, main_frame, local->op_ret, local->op_errno,
+ local->inode, &local->cont.dir_fop.buf,
+ &local->cont.dir_fop.preparent,
+ &local->cont.dir_fop.postparent, local->xdata_rsp);
+ return 0;
+}
int
-afr_mknod_wind_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
- int32_t op_ret, int32_t op_errno, inode_t *inode,
- struct iatt *buf, struct iatt *preparent,
- struct iatt *postparent, dict_t *xdata)
+afr_mknod_wind_cbk(call_frame_t *frame, void *cookie, xlator_t *this,
+ int32_t op_ret, int32_t op_errno, inode_t *inode,
+ struct iatt *buf, struct iatt *preparent,
+ struct iatt *postparent, dict_t *xdata)
{
- return __afr_dir_write_cbk (frame, cookie, this, op_ret, op_errno, buf,
- preparent, postparent, NULL, NULL, xdata);
+ return __afr_dir_write_cbk(frame, cookie, this, op_ret, op_errno, buf,
+ preparent, postparent, NULL, NULL, xdata);
}
-
int
-afr_mknod_wind (call_frame_t *frame, xlator_t *this, int subvol)
+afr_mknod_wind(call_frame_t *frame, xlator_t *this, int subvol)
{
- afr_local_t *local = NULL;
- afr_private_t *priv = NULL;
-
- local = frame->local;
- priv = this->private;
-
- STACK_WIND_COOKIE (frame, afr_mknod_wind_cbk, (void *) (long) subvol,
- priv->children[subvol],
- priv->children[subvol]->fops->mknod,
- &local->loc, local->cont.mknod.mode,
- local->cont.mknod.dev, local->umask,
- local->xdata_req);
- return 0;
+ afr_local_t *local = NULL;
+ afr_private_t *priv = NULL;
+
+ local = frame->local;
+ priv = this->private;
+
+ STACK_WIND_COOKIE(frame, afr_mknod_wind_cbk, (void *)(long)subvol,
+ priv->children[subvol],
+ priv->children[subvol]->fops->mknod, &local->loc,
+ local->cont.mknod.mode, local->cont.mknod.dev,
+ local->umask, local->xdata_req);
+ return 0;
}
int
-afr_mknod (call_frame_t *frame, xlator_t *this, loc_t *loc, mode_t mode,
- dev_t dev, mode_t umask, dict_t *xdata)
+afr_mknod(call_frame_t *frame, xlator_t *this, loc_t *loc, mode_t mode,
+ dev_t dev, mode_t umask, dict_t *xdata)
{
- afr_private_t *priv = NULL;
- afr_local_t *local = NULL;
- afr_internal_lock_t *int_lock = NULL;
- call_frame_t *transaction_frame = NULL;
- int ret = -1;
- int op_errno = ENOMEM;
-
- priv = this->private;
-
- transaction_frame = copy_frame (frame);
- if (!transaction_frame)
- goto out;
-
- local = AFR_FRAME_INIT (transaction_frame, op_errno);
- if (!local)
- goto out;
-
- loc_copy (&local->loc, loc);
- local->inode = inode_ref (loc->inode);
- local->parent = inode_ref (loc->parent);
-
- local->op = GF_FOP_MKNOD;
- local->cont.mknod.mode = mode;
- local->cont.mknod.dev = dev;
- local->umask = umask;
-
- if (xdata)
- local->xdata_req = dict_copy_with_ref (xdata, NULL);
- else
- local->xdata_req = dict_new ();
-
- if (!local->xdata_req)
- goto out;
-
- local->transaction.wind = afr_mknod_wind;
- local->transaction.fop = __afr_txn_write_fop;
- local->transaction.done = __afr_txn_write_done;
- local->transaction.unwind = afr_mknod_unwind;
-
- ret = afr_build_parent_loc (&local->transaction.parent_loc, loc,
- &op_errno);
- if (ret)
- goto out;
-
- local->transaction.main_frame = frame;
- local->transaction.basename = AFR_BASENAME (loc->path);
- int_lock = &local->internal_lock;
-
- int_lock->lockee_count = 0;
- ret = afr_init_entry_lockee (&int_lock->lockee[0], local,
- &local->transaction.parent_loc,
- local->transaction.basename,
- priv->child_count);
- if (ret)
- goto out;
-
- int_lock->lockee_count++;
- ret = afr_transaction (transaction_frame, this, AFR_ENTRY_TRANSACTION);
- if (ret < 0) {
- op_errno = -ret;
- goto out;
- }
-
- return 0;
+ afr_local_t *local = NULL;
+ call_frame_t *transaction_frame = NULL;
+ int ret = -1;
+ int op_errno = ENOMEM;
+
+ transaction_frame = copy_frame(frame);
+ if (!transaction_frame)
+ goto out;
+
+ local = AFR_FRAME_INIT(transaction_frame, op_errno);
+ if (!local)
+ goto out;
+
+ loc_copy(&local->loc, loc);
+ local->inode = inode_ref(loc->inode);
+ local->parent = inode_ref(loc->parent);
+
+ local->op = GF_FOP_MKNOD;
+ local->cont.mknod.mode = mode;
+ local->cont.mknod.dev = dev;
+ local->umask = umask;
+
+ if (xdata)
+ local->xdata_req = dict_copy_with_ref(xdata, NULL);
+ else
+ local->xdata_req = dict_new();
+
+ if (!local->xdata_req)
+ goto out;
+
+ local->transaction.wind = afr_mknod_wind;
+ local->transaction.unwind = afr_mknod_unwind;
+
+ ret = afr_build_parent_loc(&local->transaction.parent_loc, loc, &op_errno);
+ if (ret)
+ goto out;
+
+ local->transaction.main_frame = frame;
+ local->transaction.basename = AFR_BASENAME(loc->path);
+ ret = afr_transaction(transaction_frame, this, AFR_ENTRY_TRANSACTION);
+ if (ret < 0) {
+ op_errno = -ret;
+ goto out;
+ }
+
+ return 0;
out:
- if (transaction_frame)
- AFR_STACK_DESTROY (transaction_frame);
+ if (transaction_frame)
+ AFR_STACK_DESTROY(transaction_frame);
- AFR_STACK_UNWIND (mknod, frame, -1, op_errno, NULL, NULL, NULL, NULL,
- NULL);
- return 0;
+ AFR_STACK_UNWIND(mknod, frame, -1, op_errno, NULL, NULL, NULL, NULL, NULL);
+ return 0;
}
/* }}} */
/* {{{ mkdir */
-
int
-afr_mkdir_unwind (call_frame_t *frame, xlator_t *this)
+afr_mkdir_unwind(call_frame_t *frame, xlator_t *this)
{
- call_frame_t *main_frame = NULL;
- afr_local_t *local = NULL;
-
- local = frame->local;
+ call_frame_t *main_frame = NULL;
+ afr_local_t *local = NULL;
- main_frame = afr_transaction_detach_fop_frame (frame);
- if (!main_frame)
- return 0;
+ local = frame->local;
- AFR_STACK_UNWIND (mkdir, main_frame, local->op_ret, local->op_errno,
- local->inode, &local->cont.dir_fop.buf,
- &local->cont.dir_fop.preparent,
- &local->cont.dir_fop.postparent, local->xdata_rsp);
+ main_frame = afr_transaction_detach_fop_frame(frame);
+ if (!main_frame)
return 0;
-}
+ AFR_STACK_UNWIND(mkdir, main_frame, local->op_ret, local->op_errno,
+ local->inode, &local->cont.dir_fop.buf,
+ &local->cont.dir_fop.preparent,
+ &local->cont.dir_fop.postparent, local->xdata_rsp);
+ return 0;
+}
int
-afr_mkdir_wind_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
- int32_t op_ret, int32_t op_errno, inode_t *inode,
- struct iatt *buf, struct iatt *preparent,
- struct iatt *postparent, dict_t *xdata)
+afr_mkdir_wind_cbk(call_frame_t *frame, void *cookie, xlator_t *this,
+ int32_t op_ret, int32_t op_errno, inode_t *inode,
+ struct iatt *buf, struct iatt *preparent,
+ struct iatt *postparent, dict_t *xdata)
{
- return __afr_dir_write_cbk (frame, cookie, this, op_ret, op_errno, buf,
- preparent, postparent, NULL, NULL, xdata);
+ return __afr_dir_write_cbk(frame, cookie, this, op_ret, op_errno, buf,
+ preparent, postparent, NULL, NULL, xdata);
}
-
int
-afr_mkdir_wind (call_frame_t *frame, xlator_t *this, int subvol)
+afr_mkdir_wind(call_frame_t *frame, xlator_t *this, int subvol)
{
- afr_local_t *local = NULL;
- afr_private_t *priv = NULL;
+ afr_local_t *local = NULL;
+ afr_private_t *priv = NULL;
- local = frame->local;
- priv = this->private;
+ local = frame->local;
+ priv = this->private;
- STACK_WIND_COOKIE (frame, afr_mkdir_wind_cbk, (void *) (long) subvol,
- priv->children[subvol],
- priv->children[subvol]->fops->mkdir, &local->loc,
- local->cont.mkdir.mode, local->umask,
- local->xdata_req);
- return 0;
+ STACK_WIND_COOKIE(frame, afr_mkdir_wind_cbk, (void *)(long)subvol,
+ priv->children[subvol],
+ priv->children[subvol]->fops->mkdir, &local->loc,
+ local->cont.mkdir.mode, local->umask, local->xdata_req);
+ return 0;
}
-
int
-afr_mkdir (call_frame_t *frame, xlator_t *this, loc_t *loc, mode_t mode,
- mode_t umask, dict_t *xdata)
+afr_mkdir(call_frame_t *frame, xlator_t *this, loc_t *loc, mode_t mode,
+ mode_t umask, dict_t *xdata)
{
- afr_private_t *priv = NULL;
- afr_local_t *local = NULL;
- afr_internal_lock_t *int_lock = NULL;
- call_frame_t *transaction_frame = NULL;
- int ret = -1;
- int op_errno = ENOMEM;
-
- priv = this->private;
-
- transaction_frame = copy_frame (frame);
- if (!transaction_frame)
- goto out;
-
- local = AFR_FRAME_INIT (transaction_frame, op_errno);
- if (!local)
- goto out;
-
- loc_copy (&local->loc, loc);
- local->inode = inode_ref (loc->inode);
- local->parent = inode_ref (loc->parent);
-
- local->cont.mkdir.mode = mode;
- local->umask = umask;
-
- if (xdata)
- local->xdata_req = dict_copy_with_ref (xdata, NULL);
- else
- local->xdata_req = dict_new ();
-
- if (!local->xdata_req)
- goto out;
-
- local->op = GF_FOP_MKDIR;
- local->transaction.wind = afr_mkdir_wind;
- local->transaction.fop = __afr_txn_write_fop;
- local->transaction.done = __afr_txn_write_done;
- local->transaction.unwind = afr_mkdir_unwind;
-
- ret = afr_build_parent_loc (&local->transaction.parent_loc, loc,
- &op_errno);
- if (ret)
- goto out;
-
- local->transaction.main_frame = frame;
- local->transaction.basename = AFR_BASENAME (loc->path);
- int_lock = &local->internal_lock;
-
- int_lock->lockee_count = 0;
- ret = afr_init_entry_lockee (&int_lock->lockee[0], local,
- &local->transaction.parent_loc,
- local->transaction.basename,
- priv->child_count);
- if (ret)
- goto out;
-
- int_lock->lockee_count++;
- ret = afr_transaction (transaction_frame, this, AFR_ENTRY_TRANSACTION);
- if (ret < 0) {
- op_errno = -ret;
- goto out;
- }
-
- return 0;
+ afr_local_t *local = NULL;
+ call_frame_t *transaction_frame = NULL;
+ int ret = -1;
+ int op_errno = ENOMEM;
+
+ transaction_frame = copy_frame(frame);
+ if (!transaction_frame)
+ goto out;
+
+ local = AFR_FRAME_INIT(transaction_frame, op_errno);
+ if (!local)
+ goto out;
+
+ loc_copy(&local->loc, loc);
+ local->inode = inode_ref(loc->inode);
+ local->parent = inode_ref(loc->parent);
+
+ local->cont.mkdir.mode = mode;
+ local->umask = umask;
+
+ if (!xdata || !dict_get_sizen(xdata, "gfid-req")) {
+ op_errno = EPERM;
+ gf_msg_callingfn(this->name, GF_LOG_WARNING, op_errno,
+ AFR_MSG_GFID_NULL,
+ "mkdir: %s is received "
+ "without gfid-req %p",
+ loc->path, xdata);
+ goto out;
+ }
+
+ local->xdata_req = dict_copy_with_ref(xdata, NULL);
+ if (!local->xdata_req) {
+ op_errno = ENOMEM;
+ goto out;
+ }
+
+ local->op = GF_FOP_MKDIR;
+ local->transaction.wind = afr_mkdir_wind;
+ local->transaction.unwind = afr_mkdir_unwind;
+
+ ret = afr_build_parent_loc(&local->transaction.parent_loc, loc, &op_errno);
+ if (ret)
+ goto out;
+
+ local->transaction.main_frame = frame;
+ local->transaction.basename = AFR_BASENAME(loc->path);
+ ret = afr_transaction(transaction_frame, this, AFR_ENTRY_TRANSACTION);
+ if (ret < 0) {
+ op_errno = -ret;
+ goto out;
+ }
+
+ return 0;
out:
- if (transaction_frame)
- AFR_STACK_DESTROY (transaction_frame);
+ if (transaction_frame)
+ AFR_STACK_DESTROY(transaction_frame);
- AFR_STACK_UNWIND (mkdir, frame, -1, op_errno, NULL, NULL, NULL, NULL,
- NULL);
- return 0;
+ AFR_STACK_UNWIND(mkdir, frame, -1, op_errno, NULL, NULL, NULL, NULL, NULL);
+ return 0;
}
/* }}} */
/* {{{ link */
-
int
-afr_link_unwind (call_frame_t *frame, xlator_t *this)
+afr_link_unwind(call_frame_t *frame, xlator_t *this)
{
- call_frame_t *main_frame = NULL;
- afr_local_t *local = NULL;
-
- local = frame->local;
+ call_frame_t *main_frame = NULL;
+ afr_local_t *local = NULL;
- main_frame = afr_transaction_detach_fop_frame (frame);
- if (!main_frame)
- return 0;
+ local = frame->local;
- AFR_STACK_UNWIND (link, main_frame, local->op_ret, local->op_errno,
- local->inode, &local->cont.dir_fop.buf,
- &local->cont.dir_fop.preparent,
- &local->cont.dir_fop.postparent, local->xdata_rsp);
+ main_frame = afr_transaction_detach_fop_frame(frame);
+ if (!main_frame)
return 0;
-}
+ AFR_STACK_UNWIND(link, main_frame, local->op_ret, local->op_errno,
+ local->inode, &local->cont.dir_fop.buf,
+ &local->cont.dir_fop.preparent,
+ &local->cont.dir_fop.postparent, local->xdata_rsp);
+ return 0;
+}
int
-afr_link_wind_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
- int32_t op_ret, int32_t op_errno, inode_t *inode,
- struct iatt *buf, struct iatt *preparent,
- struct iatt *postparent, dict_t *xdata)
+afr_link_wind_cbk(call_frame_t *frame, void *cookie, xlator_t *this,
+ int32_t op_ret, int32_t op_errno, inode_t *inode,
+ struct iatt *buf, struct iatt *preparent,
+ struct iatt *postparent, dict_t *xdata)
{
- return __afr_dir_write_cbk (frame, cookie, this, op_ret, op_errno, buf,
- preparent, postparent, NULL, NULL, xdata);
+ return __afr_dir_write_cbk(frame, cookie, this, op_ret, op_errno, buf,
+ preparent, postparent, NULL, NULL, xdata);
}
-
int
-afr_link_wind (call_frame_t *frame, xlator_t *this, int subvol)
+afr_link_wind(call_frame_t *frame, xlator_t *this, int subvol)
{
- afr_local_t *local = NULL;
- afr_private_t *priv = NULL;
+ afr_local_t *local = NULL;
+ afr_private_t *priv = NULL;
- local = frame->local;
- priv = this->private;
+ local = frame->local;
+ priv = this->private;
- STACK_WIND_COOKIE (frame, afr_link_wind_cbk, (void *) (long) subvol,
- priv->children[subvol],
- priv->children[subvol]->fops->link,
- &local->loc, &local->newloc, local->xdata_req);
- return 0;
+ STACK_WIND_COOKIE(frame, afr_link_wind_cbk, (void *)(long)subvol,
+ priv->children[subvol],
+ priv->children[subvol]->fops->link, &local->loc,
+ &local->newloc, local->xdata_req);
+ return 0;
}
-
int
-afr_link (call_frame_t *frame, xlator_t *this, loc_t *oldloc, loc_t *newloc,
- dict_t *xdata)
+afr_link(call_frame_t *frame, xlator_t *this, loc_t *oldloc, loc_t *newloc,
+ dict_t *xdata)
{
- afr_private_t *priv = NULL;
- afr_local_t *local = NULL;
- afr_internal_lock_t *int_lock = NULL;
- call_frame_t *transaction_frame = NULL;
- int ret = -1;
- int op_errno = ENOMEM;
-
- priv = this->private;
-
- transaction_frame = copy_frame (frame);
- if (!transaction_frame)
- goto out;
-
- local = AFR_FRAME_INIT (transaction_frame, op_errno);
- if (!local)
- goto out;
-
- loc_copy (&local->loc, oldloc);
- loc_copy (&local->newloc, newloc);
-
- local->inode = inode_ref (oldloc->inode);
- local->parent = inode_ref (newloc->parent);
-
- if (xdata)
- local->xdata_req = dict_copy_with_ref (xdata, NULL);
- else
- local->xdata_req = dict_new ();
-
- if (!local->xdata_req)
- goto out;
-
- local->op = GF_FOP_LINK;
-
- local->transaction.wind = afr_link_wind;
- local->transaction.fop = __afr_txn_write_fop;
- local->transaction.done = __afr_txn_write_done;
- local->transaction.unwind = afr_link_unwind;
-
- ret = afr_build_parent_loc (&local->transaction.parent_loc, newloc,
- &op_errno);
- if (ret)
- goto out;
-
- local->transaction.main_frame = frame;
- local->transaction.basename = AFR_BASENAME (newloc->path);
- int_lock = &local->internal_lock;
-
- int_lock->lockee_count = 0;
- ret = afr_init_entry_lockee (&int_lock->lockee[0], local,
- &local->transaction.parent_loc,
- local->transaction.basename,
- priv->child_count);
- if (ret)
- goto out;
-
- int_lock->lockee_count++;
- ret = afr_transaction (transaction_frame, this, AFR_ENTRY_TRANSACTION);
- if (ret < 0) {
- op_errno = -ret;
- goto out;
- }
+ afr_local_t *local = NULL;
+ call_frame_t *transaction_frame = NULL;
+ int ret = -1;
+ int op_errno = ENOMEM;
+
+ transaction_frame = copy_frame(frame);
+ if (!transaction_frame)
+ goto out;
+
+ local = AFR_FRAME_INIT(transaction_frame, op_errno);
+ if (!local)
+ goto out;
+
+ loc_copy(&local->loc, oldloc);
+ loc_copy(&local->newloc, newloc);
- return 0;
+ local->inode = inode_ref(oldloc->inode);
+ local->parent = inode_ref(newloc->parent);
+
+ if (xdata)
+ local->xdata_req = dict_copy_with_ref(xdata, NULL);
+ else
+ local->xdata_req = dict_new();
+
+ if (!local->xdata_req)
+ goto out;
+
+ local->op = GF_FOP_LINK;
+
+ local->transaction.wind = afr_link_wind;
+ local->transaction.unwind = afr_link_unwind;
+
+ ret = afr_build_parent_loc(&local->transaction.parent_loc, newloc,
+ &op_errno);
+ if (ret)
+ goto out;
+
+ local->transaction.main_frame = frame;
+ local->transaction.basename = AFR_BASENAME(newloc->path);
+ ret = afr_transaction(transaction_frame, this, AFR_ENTRY_TRANSACTION);
+ if (ret < 0) {
+ op_errno = -ret;
+ goto out;
+ }
+
+ return 0;
out:
- if (transaction_frame)
- AFR_STACK_DESTROY (transaction_frame);
+ if (transaction_frame)
+ AFR_STACK_DESTROY(transaction_frame);
- AFR_STACK_UNWIND (link, frame, -1, op_errno, NULL, NULL, NULL, NULL,
- NULL);
- return 0;
+ AFR_STACK_UNWIND(link, frame, -1, op_errno, NULL, NULL, NULL, NULL, NULL);
+ return 0;
}
/* }}} */
/* {{{ symlink */
-
int
-afr_symlink_unwind (call_frame_t *frame, xlator_t *this)
+afr_symlink_unwind(call_frame_t *frame, xlator_t *this)
{
- call_frame_t *main_frame = NULL;
- afr_local_t *local = NULL;
-
- local = frame->local;
+ call_frame_t *main_frame = NULL;
+ afr_local_t *local = NULL;
- main_frame = afr_transaction_detach_fop_frame (frame);
- if (!main_frame)
- return 0;
+ local = frame->local;
- AFR_STACK_UNWIND (symlink, main_frame, local->op_ret, local->op_errno,
- local->inode, &local->cont.dir_fop.buf,
- &local->cont.dir_fop.preparent,
- &local->cont.dir_fop.postparent, local->xdata_rsp);
+ main_frame = afr_transaction_detach_fop_frame(frame);
+ if (!main_frame)
return 0;
-}
+ AFR_STACK_UNWIND(symlink, main_frame, local->op_ret, local->op_errno,
+ local->inode, &local->cont.dir_fop.buf,
+ &local->cont.dir_fop.preparent,
+ &local->cont.dir_fop.postparent, local->xdata_rsp);
+ return 0;
+}
int
-afr_symlink_wind_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
- int32_t op_ret, int32_t op_errno, inode_t *inode,
- struct iatt *buf, struct iatt *preparent,
- struct iatt *postparent, dict_t *xdata)
+afr_symlink_wind_cbk(call_frame_t *frame, void *cookie, xlator_t *this,
+ int32_t op_ret, int32_t op_errno, inode_t *inode,
+ struct iatt *buf, struct iatt *preparent,
+ struct iatt *postparent, dict_t *xdata)
{
- return __afr_dir_write_cbk (frame, cookie, this, op_ret, op_errno, buf,
- preparent, postparent, NULL, NULL, xdata);
+ return __afr_dir_write_cbk(frame, cookie, this, op_ret, op_errno, buf,
+ preparent, postparent, NULL, NULL, xdata);
}
-
int
-afr_symlink_wind (call_frame_t *frame, xlator_t *this, int subvol)
+afr_symlink_wind(call_frame_t *frame, xlator_t *this, int subvol)
{
- afr_local_t *local = NULL;
- afr_private_t *priv = NULL;
-
- local = frame->local;
- priv = this->private;
-
- STACK_WIND_COOKIE (frame, afr_symlink_wind_cbk, (void *) (long) subvol,
- priv->children[subvol],
- priv->children[subvol]->fops->symlink,
- local->cont.symlink.linkpath, &local->loc,
- local->umask, local->xdata_req);
- return 0;
+ afr_local_t *local = NULL;
+ afr_private_t *priv = NULL;
+
+ local = frame->local;
+ priv = this->private;
+
+ STACK_WIND_COOKIE(frame, afr_symlink_wind_cbk, (void *)(long)subvol,
+ priv->children[subvol],
+ priv->children[subvol]->fops->symlink,
+ local->cont.symlink.linkpath, &local->loc, local->umask,
+ local->xdata_req);
+ return 0;
}
-
int
-afr_symlink (call_frame_t *frame, xlator_t *this, const char *linkpath,
- loc_t *loc, mode_t umask, dict_t *xdata)
+afr_symlink(call_frame_t *frame, xlator_t *this, const char *linkpath,
+ loc_t *loc, mode_t umask, dict_t *xdata)
{
- afr_private_t *priv = NULL;
- afr_local_t *local = NULL;
- afr_internal_lock_t *int_lock = NULL;
- call_frame_t *transaction_frame = NULL;
- int ret = -1;
- int op_errno = ENOMEM;
-
- priv = this->private;
-
- transaction_frame = copy_frame (frame);
- if (!transaction_frame)
- goto out;
-
- local = AFR_FRAME_INIT (transaction_frame, op_errno);
- if (!local)
- goto out;
-
- loc_copy (&local->loc, loc);
- local->inode = inode_ref (loc->inode);
- local->parent = inode_ref (loc->parent);
-
- local->cont.symlink.linkpath = gf_strdup (linkpath);
- local->umask = umask;
-
- if (xdata)
- local->xdata_req = dict_copy_with_ref (xdata, NULL);
- else
- local->xdata_req = dict_new ();
-
- if (!local->xdata_req)
- goto out;
-
- local->op = GF_FOP_SYMLINK;
- local->transaction.wind = afr_symlink_wind;
- local->transaction.fop = __afr_txn_write_fop;
- local->transaction.done = __afr_txn_write_done;
- local->transaction.unwind = afr_symlink_unwind;
-
- ret = afr_build_parent_loc (&local->transaction.parent_loc, loc,
- &op_errno);
- if (ret)
- goto out;
-
- local->transaction.main_frame = frame;
- local->transaction.basename = AFR_BASENAME (loc->path);
- int_lock = &local->internal_lock;
-
- int_lock->lockee_count = 0;
- ret = afr_init_entry_lockee (&int_lock->lockee[0], local,
- &local->transaction.parent_loc,
- local->transaction.basename,
- priv->child_count);
- if (ret)
- goto out;
-
- int_lock->lockee_count++;
- ret = afr_transaction (transaction_frame, this, AFR_ENTRY_TRANSACTION);
- if (ret < 0) {
- op_errno = -ret;
- goto out;
- }
-
- return 0;
+ afr_local_t *local = NULL;
+ call_frame_t *transaction_frame = NULL;
+ int ret = -1;
+ int op_errno = ENOMEM;
+
+ transaction_frame = copy_frame(frame);
+ if (!transaction_frame)
+ goto out;
+
+ local = AFR_FRAME_INIT(transaction_frame, op_errno);
+ if (!local)
+ goto out;
+
+ loc_copy(&local->loc, loc);
+ local->inode = inode_ref(loc->inode);
+ local->parent = inode_ref(loc->parent);
+
+ local->cont.symlink.linkpath = gf_strdup(linkpath);
+ local->umask = umask;
+
+ if (xdata)
+ local->xdata_req = dict_copy_with_ref(xdata, NULL);
+ else
+ local->xdata_req = dict_new();
+
+ if (!local->xdata_req)
+ goto out;
+
+ local->op = GF_FOP_SYMLINK;
+ local->transaction.wind = afr_symlink_wind;
+ local->transaction.unwind = afr_symlink_unwind;
+
+ ret = afr_build_parent_loc(&local->transaction.parent_loc, loc, &op_errno);
+ if (ret)
+ goto out;
+
+ local->transaction.main_frame = frame;
+ local->transaction.basename = AFR_BASENAME(loc->path);
+ ret = afr_transaction(transaction_frame, this, AFR_ENTRY_TRANSACTION);
+ if (ret < 0) {
+ op_errno = -ret;
+ goto out;
+ }
+
+ return 0;
out:
- if (transaction_frame)
- AFR_STACK_DESTROY (transaction_frame);
+ if (transaction_frame)
+ AFR_STACK_DESTROY(transaction_frame);
- AFR_STACK_UNWIND (symlink, frame, -1, op_errno, NULL, NULL, NULL,
- NULL, NULL);
- return 0;
+ AFR_STACK_UNWIND(symlink, frame, -1, op_errno, NULL, NULL, NULL, NULL,
+ NULL);
+ return 0;
}
/* }}} */
@@ -1032,161 +939,118 @@ out:
/* {{{ rename */
int
-afr_rename_unwind (call_frame_t *frame, xlator_t *this)
+afr_rename_unwind(call_frame_t *frame, xlator_t *this)
{
- call_frame_t *main_frame = NULL;
- afr_local_t *local = NULL;
-
- local = frame->local;
+ call_frame_t *main_frame = NULL;
+ afr_local_t *local = NULL;
- main_frame = afr_transaction_detach_fop_frame (frame);
- if (!main_frame)
- return 0;
+ local = frame->local;
- AFR_STACK_UNWIND (rename, main_frame, local->op_ret, local->op_errno,
- &local->cont.dir_fop.buf,
- &local->cont.dir_fop.preparent,
- &local->cont.dir_fop.postparent,
- &local->cont.dir_fop.prenewparent,
- &local->cont.dir_fop.postnewparent, local->xdata_rsp);
+ main_frame = afr_transaction_detach_fop_frame(frame);
+ if (!main_frame)
return 0;
-}
+ AFR_STACK_UNWIND(rename, main_frame, local->op_ret, local->op_errno,
+ &local->cont.dir_fop.buf, &local->cont.dir_fop.preparent,
+ &local->cont.dir_fop.postparent,
+ &local->cont.dir_fop.prenewparent,
+ &local->cont.dir_fop.postnewparent, local->xdata_rsp);
+ return 0;
+}
int
-afr_rename_wind_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
- int32_t op_ret, int32_t op_errno, struct iatt *buf,
- struct iatt *preoldparent, struct iatt *postoldparent,
- struct iatt *prenewparent, struct iatt *postnewparent,
- dict_t *xdata)
+afr_rename_wind_cbk(call_frame_t *frame, void *cookie, xlator_t *this,
+ int32_t op_ret, int32_t op_errno, struct iatt *buf,
+ struct iatt *preoldparent, struct iatt *postoldparent,
+ struct iatt *prenewparent, struct iatt *postnewparent,
+ dict_t *xdata)
{
- return __afr_dir_write_cbk (frame, cookie, this, op_ret, op_errno, buf,
- preoldparent, postoldparent, prenewparent,
- postnewparent, xdata);
+ return __afr_dir_write_cbk(frame, cookie, this, op_ret, op_errno, buf,
+ preoldparent, postoldparent, prenewparent,
+ postnewparent, xdata);
}
-
int
-afr_rename_wind (call_frame_t *frame, xlator_t *this, int subvol)
+afr_rename_wind(call_frame_t *frame, xlator_t *this, int subvol)
{
- afr_local_t *local = NULL;
- afr_private_t *priv = NULL;
+ afr_local_t *local = NULL;
+ afr_private_t *priv = NULL;
- local = frame->local;
- priv = this->private;
+ local = frame->local;
+ priv = this->private;
- STACK_WIND_COOKIE (frame, afr_rename_wind_cbk, (void *) (long) subvol,
- priv->children[subvol],
- priv->children[subvol]->fops->rename,
- &local->loc, &local->newloc, local->xdata_req);
- return 0;
+ STACK_WIND_COOKIE(frame, afr_rename_wind_cbk, (void *)(long)subvol,
+ priv->children[subvol],
+ priv->children[subvol]->fops->rename, &local->loc,
+ &local->newloc, local->xdata_req);
+ return 0;
}
-
int
-afr_rename (call_frame_t *frame, xlator_t *this, loc_t *oldloc, loc_t *newloc,
- dict_t *xdata)
+afr_rename(call_frame_t *frame, xlator_t *this, loc_t *oldloc, loc_t *newloc,
+ dict_t *xdata)
{
- afr_private_t *priv = NULL;
- afr_local_t *local = NULL;
- afr_internal_lock_t *int_lock = NULL;
- call_frame_t *transaction_frame = NULL;
- int ret = -1;
- int op_errno = ENOMEM;
- int nlockee = 0;
-
- priv = this->private;
-
- transaction_frame = copy_frame (frame);
- if (!transaction_frame) {
- op_errno = ENOMEM;
- goto out;
- }
-
- local = AFR_FRAME_INIT (transaction_frame, op_errno);
- if (!local)
- goto out;
-
- loc_copy (&local->loc, oldloc);
- loc_copy (&local->newloc, newloc);
-
- local->inode = inode_ref (oldloc->inode);
- local->parent = inode_ref (oldloc->parent);
- local->parent2 = inode_ref (newloc->parent);
-
- if (xdata)
- local->xdata_req = dict_copy_with_ref (xdata, NULL);
- else
- local->xdata_req = dict_new ();
-
- if (!local->xdata_req)
- goto out;
-
- local->op = GF_FOP_RENAME;
- local->transaction.wind = afr_rename_wind;
- local->transaction.fop = __afr_txn_write_fop;
- local->transaction.done = __afr_txn_write_done;
- local->transaction.unwind = afr_rename_unwind;
-
- ret = afr_build_parent_loc (&local->transaction.parent_loc, oldloc,
- &op_errno);
- if (ret)
- goto out;
- ret = afr_build_parent_loc (&local->transaction.new_parent_loc, newloc,
- &op_errno);
- if (ret)
- goto out;
-
- local->transaction.main_frame = frame;
- local->transaction.basename = AFR_BASENAME (oldloc->path);
- local->transaction.new_basename = AFR_BASENAME (newloc->path);
- int_lock = &local->internal_lock;
-
- int_lock->lockee_count = nlockee = 0;
- ret = afr_init_entry_lockee (&int_lock->lockee[nlockee], local,
- &local->transaction.new_parent_loc,
- local->transaction.new_basename,
- priv->child_count);
- if (ret)
- goto out;
-
- nlockee++;
- ret = afr_init_entry_lockee (&int_lock->lockee[nlockee], local,
- &local->transaction.parent_loc,
- local->transaction.basename,
- priv->child_count);
- if (ret)
- goto out;
-
- nlockee++;
- if (local->newloc.inode && IA_ISDIR (local->newloc.inode->ia_type)) {
- ret = afr_init_entry_lockee (&int_lock->lockee[nlockee], local,
- &local->newloc,
- NULL,
- priv->child_count);
- if (ret)
- goto out;
-
- nlockee++;
- }
- qsort (int_lock->lockee, nlockee, sizeof (*int_lock->lockee),
- afr_entry_lockee_cmp);
- int_lock->lockee_count = nlockee;
-
- ret = afr_transaction (transaction_frame, this, AFR_ENTRY_RENAME_TRANSACTION);
- if (ret < 0) {
- op_errno = -ret;
- goto out;
- }
-
- return 0;
+ afr_local_t *local = NULL;
+ call_frame_t *transaction_frame = NULL;
+ int ret = -1;
+ int op_errno = ENOMEM;
+
+ transaction_frame = copy_frame(frame);
+ if (!transaction_frame) {
+ op_errno = ENOMEM;
+ goto out;
+ }
+
+ local = AFR_FRAME_INIT(transaction_frame, op_errno);
+ if (!local)
+ goto out;
+
+ loc_copy(&local->loc, oldloc);
+ loc_copy(&local->newloc, newloc);
+
+ local->inode = inode_ref(oldloc->inode);
+ local->parent = inode_ref(oldloc->parent);
+ local->parent2 = inode_ref(newloc->parent);
+
+ if (xdata)
+ local->xdata_req = dict_copy_with_ref(xdata, NULL);
+ else
+ local->xdata_req = dict_new();
+
+ if (!local->xdata_req)
+ goto out;
+
+ local->op = GF_FOP_RENAME;
+ local->transaction.wind = afr_rename_wind;
+ local->transaction.unwind = afr_rename_unwind;
+
+ ret = afr_build_parent_loc(&local->transaction.parent_loc, oldloc,
+ &op_errno);
+ if (ret)
+ goto out;
+ ret = afr_build_parent_loc(&local->transaction.new_parent_loc, newloc,
+ &op_errno);
+ if (ret)
+ goto out;
+
+ local->transaction.main_frame = frame;
+ local->transaction.basename = AFR_BASENAME(oldloc->path);
+ local->transaction.new_basename = AFR_BASENAME(newloc->path);
+ ret = afr_transaction(transaction_frame, this,
+ AFR_ENTRY_RENAME_TRANSACTION);
+ if (ret < 0) {
+ op_errno = -ret;
+ goto out;
+ }
+
+ return 0;
out:
- if (transaction_frame)
- AFR_STACK_DESTROY (transaction_frame);
+ if (transaction_frame)
+ AFR_STACK_DESTROY(transaction_frame);
- AFR_STACK_UNWIND (rename, frame, -1, op_errno, NULL, NULL, NULL, NULL,
- NULL, NULL);
- return 0;
+ AFR_STACK_UNWIND(rename, frame, -1, op_errno, NULL, NULL, NULL, NULL, NULL,
+ NULL);
+ return 0;
}
/* }}} */
@@ -1194,263 +1058,205 @@ out:
/* {{{ unlink */
int
-afr_unlink_unwind (call_frame_t *frame, xlator_t *this)
+afr_unlink_unwind(call_frame_t *frame, xlator_t *this)
{
- call_frame_t *main_frame = NULL;
- afr_local_t *local = NULL;
-
- local = frame->local;
+ call_frame_t *main_frame = NULL;
+ afr_local_t *local = NULL;
- main_frame = afr_transaction_detach_fop_frame (frame);
- if (!main_frame)
- return 0;
+ local = frame->local;
- AFR_STACK_UNWIND (unlink, main_frame, local->op_ret, local->op_errno,
- &local->cont.dir_fop.preparent,
- &local->cont.dir_fop.postparent, local->xdata_rsp);
+ main_frame = afr_transaction_detach_fop_frame(frame);
+ if (!main_frame)
return 0;
-}
+ AFR_STACK_UNWIND(unlink, main_frame, local->op_ret, local->op_errno,
+ &local->cont.dir_fop.preparent,
+ &local->cont.dir_fop.postparent, local->xdata_rsp);
+ return 0;
+}
int
-afr_unlink_wind_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
- int32_t op_ret, int32_t op_errno, struct iatt *preparent,
- struct iatt *postparent, dict_t *xdata)
+afr_unlink_wind_cbk(call_frame_t *frame, void *cookie, xlator_t *this,
+ int32_t op_ret, int32_t op_errno, struct iatt *preparent,
+ struct iatt *postparent, dict_t *xdata)
{
- return __afr_dir_write_cbk (frame, cookie, this, op_ret, op_errno, NULL,
- preparent, postparent, NULL, NULL, xdata);
+ return __afr_dir_write_cbk(frame, cookie, this, op_ret, op_errno, NULL,
+ preparent, postparent, NULL, NULL, xdata);
}
-
int
-afr_unlink_wind (call_frame_t *frame, xlator_t *this, int subvol)
+afr_unlink_wind(call_frame_t *frame, xlator_t *this, int subvol)
{
- afr_local_t *local = NULL;
- afr_private_t *priv = NULL;
+ afr_local_t *local = NULL;
+ afr_private_t *priv = NULL;
- local = frame->local;
- priv = this->private;
+ local = frame->local;
+ priv = this->private;
- STACK_WIND_COOKIE (frame, afr_unlink_wind_cbk, (void *) (long) subvol,
- priv->children[subvol],
- priv->children[subvol]->fops->unlink,
- &local->loc, local->xflag, local->xdata_req);
- return 0;
+ STACK_WIND_COOKIE(frame, afr_unlink_wind_cbk, (void *)(long)subvol,
+ priv->children[subvol],
+ priv->children[subvol]->fops->unlink, &local->loc,
+ local->xflag, local->xdata_req);
+ return 0;
}
-
int
-afr_unlink (call_frame_t *frame, xlator_t *this, loc_t *loc, int xflag,
- dict_t *xdata)
+afr_unlink(call_frame_t *frame, xlator_t *this, loc_t *loc, int xflag,
+ dict_t *xdata)
{
- afr_private_t *priv = NULL;
- afr_local_t *local = NULL;
- afr_internal_lock_t *int_lock = NULL;
- call_frame_t *transaction_frame = NULL;
- int ret = -1;
- int op_errno = ENOMEM;
-
- priv = this->private;
-
- transaction_frame = copy_frame (frame);
- if (!transaction_frame)
- goto out;
-
- local = AFR_FRAME_INIT (transaction_frame, op_errno);
- if (!local)
- goto out;
-
- loc_copy (&local->loc, loc);
- local->xflag = xflag;
-
- local->inode = inode_ref (loc->inode);
- local->parent = inode_ref (loc->parent);
-
- if (xdata)
- local->xdata_req = dict_copy_with_ref (xdata, NULL);
- else
- local->xdata_req = dict_new ();
-
- if (!local->xdata_req)
- goto out;
-
- local->op = GF_FOP_UNLINK;
- local->transaction.wind = afr_unlink_wind;
- local->transaction.fop = __afr_txn_write_fop;
- local->transaction.done = __afr_txn_write_done;
- local->transaction.unwind = afr_unlink_unwind;
-
- ret = afr_build_parent_loc (&local->transaction.parent_loc, loc,
- &op_errno);
- if (ret)
- goto out;
-
- local->transaction.main_frame = frame;
- local->transaction.basename = AFR_BASENAME (loc->path);
- int_lock = &local->internal_lock;
-
- int_lock->lockee_count = 0;
- ret = afr_init_entry_lockee (&int_lock->lockee[0], local,
- &local->transaction.parent_loc,
- local->transaction.basename,
- priv->child_count);
- if (ret)
- goto out;
-
- int_lock->lockee_count++;
- ret = afr_transaction (transaction_frame, this, AFR_ENTRY_TRANSACTION);
- if (ret < 0) {
- op_errno = -ret;
- goto out;
- }
-
- return 0;
+ afr_local_t *local = NULL;
+ call_frame_t *transaction_frame = NULL;
+ int ret = -1;
+ int op_errno = ENOMEM;
+
+ transaction_frame = copy_frame(frame);
+ if (!transaction_frame)
+ goto out;
+
+ local = AFR_FRAME_INIT(transaction_frame, op_errno);
+ if (!local)
+ goto out;
+
+ loc_copy(&local->loc, loc);
+ local->xflag = xflag;
+
+ local->inode = inode_ref(loc->inode);
+ local->parent = inode_ref(loc->parent);
+
+ if (xdata)
+ local->xdata_req = dict_copy_with_ref(xdata, NULL);
+ else
+ local->xdata_req = dict_new();
+
+ if (!local->xdata_req)
+ goto out;
+
+ local->op = GF_FOP_UNLINK;
+ local->transaction.wind = afr_unlink_wind;
+ local->transaction.unwind = afr_unlink_unwind;
+
+ ret = afr_build_parent_loc(&local->transaction.parent_loc, loc, &op_errno);
+ if (ret)
+ goto out;
+
+ local->transaction.main_frame = frame;
+ local->transaction.basename = AFR_BASENAME(loc->path);
+ ret = afr_transaction(transaction_frame, this, AFR_ENTRY_TRANSACTION);
+ if (ret < 0) {
+ op_errno = -ret;
+ goto out;
+ }
+
+ return 0;
out:
- if (transaction_frame)
- AFR_STACK_DESTROY (transaction_frame);
+ if (transaction_frame)
+ AFR_STACK_DESTROY(transaction_frame);
- AFR_STACK_UNWIND (unlink, frame, -1, op_errno, NULL, NULL, NULL);
- return 0;
+ AFR_STACK_UNWIND(unlink, frame, -1, op_errno, NULL, NULL, NULL);
+ return 0;
}
/* }}} */
/* {{{ rmdir */
-
-
int
-afr_rmdir_unwind (call_frame_t *frame, xlator_t *this)
+afr_rmdir_unwind(call_frame_t *frame, xlator_t *this)
{
- call_frame_t *main_frame = NULL;
- afr_local_t *local = NULL;
+ call_frame_t *main_frame = NULL;
+ afr_local_t *local = NULL;
- local = frame->local;
+ local = frame->local;
- main_frame = afr_transaction_detach_fop_frame (frame);
- if (!main_frame)
- return 0;
-
- AFR_STACK_UNWIND (rmdir, main_frame, local->op_ret, local->op_errno,
- &local->cont.dir_fop.preparent,
- &local->cont.dir_fop.postparent, local->xdata_rsp);
+ main_frame = afr_transaction_detach_fop_frame(frame);
+ if (!main_frame)
return 0;
-}
+ AFR_STACK_UNWIND(rmdir, main_frame, local->op_ret, local->op_errno,
+ &local->cont.dir_fop.preparent,
+ &local->cont.dir_fop.postparent, local->xdata_rsp);
+ return 0;
+}
int
-afr_rmdir_wind_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
- int32_t op_ret, int32_t op_errno, struct iatt *preparent,
- struct iatt *postparent, dict_t *xdata)
+afr_rmdir_wind_cbk(call_frame_t *frame, void *cookie, xlator_t *this,
+ int32_t op_ret, int32_t op_errno, struct iatt *preparent,
+ struct iatt *postparent, dict_t *xdata)
{
- return __afr_dir_write_cbk (frame, cookie, this, op_ret, op_errno, NULL,
- preparent, postparent, NULL, NULL, xdata);
+ return __afr_dir_write_cbk(frame, cookie, this, op_ret, op_errno, NULL,
+ preparent, postparent, NULL, NULL, xdata);
}
-
int
-afr_rmdir_wind (call_frame_t *frame, xlator_t *this, int subvol)
+afr_rmdir_wind(call_frame_t *frame, xlator_t *this, int subvol)
{
- afr_local_t *local = NULL;
- afr_private_t *priv = NULL;
+ afr_local_t *local = NULL;
+ afr_private_t *priv = NULL;
- local = frame->local;
- priv = this->private;
+ local = frame->local;
+ priv = this->private;
- STACK_WIND_COOKIE (frame, afr_rmdir_wind_cbk, (void *) (long) subvol,
- priv->children[subvol],
- priv->children[subvol]->fops->rmdir,
- &local->loc, local->cont.rmdir.flags, local->xdata_req);
- return 0;
+ STACK_WIND_COOKIE(frame, afr_rmdir_wind_cbk, (void *)(long)subvol,
+ priv->children[subvol],
+ priv->children[subvol]->fops->rmdir, &local->loc,
+ local->cont.rmdir.flags, local->xdata_req);
+ return 0;
}
-
int
-afr_rmdir (call_frame_t *frame, xlator_t *this, loc_t *loc, int flags,
- dict_t *xdata)
+afr_rmdir(call_frame_t *frame, xlator_t *this, loc_t *loc, int flags,
+ dict_t *xdata)
{
- afr_private_t *priv = NULL;
- afr_local_t *local = NULL;
- afr_internal_lock_t *int_lock = NULL;
- call_frame_t *transaction_frame = NULL;
- int ret = -1;
- int op_errno = ENOMEM;
- int nlockee = 0;
-
- priv = this->private;
-
- transaction_frame = copy_frame (frame);
- if (!transaction_frame)
- goto out;
-
- local = AFR_FRAME_INIT (transaction_frame, op_errno);
- if (!local)
- goto out;
-
-
- loc_copy (&local->loc, loc);
- local->inode = inode_ref (loc->inode);
- local->parent = inode_ref (loc->parent);
-
- local->cont.rmdir.flags = flags;
-
- if (xdata)
- local->xdata_req = dict_copy_with_ref (xdata, NULL);
- else
- local->xdata_req = dict_new ();
-
- if (!local->xdata_req)
- goto out;
-
- local->op = GF_FOP_RMDIR;
- local->transaction.wind = afr_rmdir_wind;
- local->transaction.fop = __afr_txn_write_fop;
- local->transaction.done = __afr_txn_write_done;
- local->transaction.unwind = afr_rmdir_unwind;
-
- ret = afr_build_parent_loc (&local->transaction.parent_loc, loc,
- &op_errno);
- if (ret)
- goto out;
-
- local->transaction.main_frame = frame;
- local->transaction.basename = AFR_BASENAME (loc->path);
- int_lock = &local->internal_lock;
-
- int_lock->lockee_count = nlockee = 0;
- ret = afr_init_entry_lockee (&int_lock->lockee[nlockee], local,
- &local->transaction.parent_loc,
- local->transaction.basename,
- priv->child_count);
- if (ret)
- goto out;
-
- nlockee++;
- ret = afr_init_entry_lockee (&int_lock->lockee[nlockee], local,
- &local->loc,
- NULL,
- priv->child_count);
- if (ret)
- goto out;
-
- nlockee++;
- qsort (int_lock->lockee, nlockee, sizeof (*int_lock->lockee),
- afr_entry_lockee_cmp);
- int_lock->lockee_count = nlockee;
-
- ret = afr_transaction (transaction_frame, this, AFR_ENTRY_TRANSACTION);
- if (ret < 0) {
- op_errno = -ret;
- goto out;
- }
-
- return 0;
+ afr_local_t *local = NULL;
+ call_frame_t *transaction_frame = NULL;
+ int ret = -1;
+ int op_errno = ENOMEM;
+
+ transaction_frame = copy_frame(frame);
+ if (!transaction_frame)
+ goto out;
+
+ local = AFR_FRAME_INIT(transaction_frame, op_errno);
+ if (!local)
+ goto out;
+
+ loc_copy(&local->loc, loc);
+ local->inode = inode_ref(loc->inode);
+ local->parent = inode_ref(loc->parent);
+
+ local->cont.rmdir.flags = flags;
+
+ if (xdata)
+ local->xdata_req = dict_copy_with_ref(xdata, NULL);
+ else
+ local->xdata_req = dict_new();
+
+ if (!local->xdata_req)
+ goto out;
+
+ local->op = GF_FOP_RMDIR;
+ local->transaction.wind = afr_rmdir_wind;
+ local->transaction.unwind = afr_rmdir_unwind;
+
+ ret = afr_build_parent_loc(&local->transaction.parent_loc, loc, &op_errno);
+ if (ret)
+ goto out;
+
+ local->transaction.main_frame = frame;
+ local->transaction.basename = AFR_BASENAME(loc->path);
+ ret = afr_transaction(transaction_frame, this, AFR_ENTRY_TRANSACTION);
+ if (ret < 0) {
+ op_errno = -ret;
+ goto out;
+ }
+
+ return 0;
out:
- if (transaction_frame)
- AFR_STACK_DESTROY (transaction_frame);
+ if (transaction_frame)
+ AFR_STACK_DESTROY(transaction_frame);
- AFR_STACK_UNWIND (rmdir, frame, -1, op_errno, NULL, NULL, NULL);
- return 0;
+ AFR_STACK_UNWIND(rmdir, frame, -1, op_errno, NULL, NULL, NULL);
+ return 0;
}
/* }}} */
diff --git a/xlators/cluster/afr/src/afr-dir-write.h b/xlators/cluster/afr/src/afr-dir-write.h
index 02f0a3682d9..1d88c3b9b26 100644
--- a/xlators/cluster/afr/src/afr-dir-write.h
+++ b/xlators/cluster/afr/src/afr-dir-write.h
@@ -12,36 +12,35 @@
#define __DIR_WRITE_H__
int32_t
-afr_create (call_frame_t *frame, xlator_t *this,
- loc_t *loc, int32_t flags, mode_t mode,
- mode_t umask, fd_t *fd, dict_t *xdata);
+afr_create(call_frame_t *frame, xlator_t *this, loc_t *loc, int32_t flags,
+ mode_t mode, mode_t umask, fd_t *fd, dict_t *xdata);
int32_t
-afr_mknod (call_frame_t *frame, xlator_t *this,
- loc_t *loc, mode_t mode, dev_t dev, mode_t umask, dict_t *xdata);
+afr_mknod(call_frame_t *frame, xlator_t *this, loc_t *loc, mode_t mode,
+ dev_t dev, mode_t umask, dict_t *xdata);
int32_t
-afr_mkdir (call_frame_t *frame, xlator_t *this,
- loc_t *loc, mode_t mode, mode_t umask, dict_t *xdata);
+afr_mkdir(call_frame_t *frame, xlator_t *this, loc_t *loc, mode_t mode,
+ mode_t umask, dict_t *xdata);
int32_t
-afr_unlink (call_frame_t *frame, xlator_t *this,
- loc_t *loc, int xflag, dict_t *xdata);
+afr_unlink(call_frame_t *frame, xlator_t *this, loc_t *loc, int xflag,
+ dict_t *xdata);
int32_t
-afr_rmdir (call_frame_t *frame, xlator_t *this,
- loc_t *loc, int flags, dict_t *xdata);
+afr_rmdir(call_frame_t *frame, xlator_t *this, loc_t *loc, int flags,
+ dict_t *xdata);
int32_t
-afr_link (call_frame_t *frame, xlator_t *this,
- loc_t *oldloc, loc_t *newloc, dict_t *xdata);
+afr_link(call_frame_t *frame, xlator_t *this, loc_t *oldloc, loc_t *newloc,
+ dict_t *xdata);
int32_t
-afr_rename (call_frame_t *frame, xlator_t *this,
- loc_t *oldloc, loc_t *newloc, dict_t *xdata);
+afr_rename(call_frame_t *frame, xlator_t *this, loc_t *oldloc, loc_t *newloc,
+ dict_t *xdata);
int
-afr_symlink (call_frame_t *frame, xlator_t *this,
- const char *linkpath, loc_t *oldloc, mode_t umask, dict_t *params);
+afr_symlink(call_frame_t *frame, xlator_t *this, const char *linkpath,
+ loc_t *oldloc, mode_t umask, dict_t *params);
#endif /* __DIR_WRITE_H__ */
diff --git a/xlators/cluster/afr/src/afr-inode-read.c b/xlators/cluster/afr/src/afr-inode-read.c
index caa2a97d6d8..c5521704de2 100644
--- a/xlators/cluster/afr/src/afr-inode-read.c
+++ b/xlators/cluster/afr/src/afr-inode-read.c
@@ -8,7 +8,6 @@
cases as published by the Free Software Foundation.
*/
-
#include <libgen.h>
#include <unistd.h>
#include <fnmatch.h>
@@ -16,21 +15,17 @@
#include <stdlib.h>
#include <signal.h>
-#include "glusterfs.h"
+#include <glusterfs/glusterfs.h>
#include "afr.h"
-#include "dict.h"
-#include "xlator.h"
-#include "hashfn.h"
-#include "logging.h"
-#include "stack.h"
-#include "list.h"
-#include "call-stub.h"
-#include "byte-order.h"
-#include "defaults.h"
-#include "common-utils.h"
-#include "compat-errno.h"
-#include "compat.h"
-#include "quota-common-utils.h"
+#include <glusterfs/dict.h>
+#include <glusterfs/logging.h>
+#include <glusterfs/list.h>
+#include <glusterfs/byte-order.h>
+#include <glusterfs/defaults.h>
+#include <glusterfs/common-utils.h>
+#include <glusterfs/compat-errno.h>
+#include <glusterfs/compat.h>
+#include <glusterfs/quota-common-utils.h>
#include "afr-transaction.h"
#include "afr-messages.h"
@@ -45,146 +40,146 @@
* */
int
-afr_handle_quota_size (call_frame_t *frame, xlator_t *this)
+afr_handle_quota_size(call_frame_t *frame, xlator_t *this)
{
- unsigned char *readable = NULL;
- afr_local_t *local = NULL;
- afr_private_t *priv = NULL;
- struct afr_reply *replies = NULL;
- int i = 0;
- int ret = 0;
- quota_meta_t size = {0, };
- quota_meta_t max_size = {0, };
- int readable_cnt = 0;
- int read_subvol = -1;
-
- local = frame->local;
- priv = this->private;
- replies = local->replies;
-
- readable = alloca0 (priv->child_count);
-
- afr_inode_read_subvol_get (local->inode, this, readable, 0, 0);
-
- readable_cnt = AFR_COUNT (readable, priv->child_count);
-
- for (i = 0; i < priv->child_count; i++) {
- if (!replies[i].valid || replies[i].op_ret == -1)
- continue;
- if (readable_cnt && !readable[i])
- continue;
- if (!replies[i].xdata)
- continue;
- ret = quota_dict_get_meta (replies[i].xdata, QUOTA_SIZE_KEY,
- &size);
- if (ret == -1)
- continue;
- if (read_subvol == -1)
- read_subvol = i;
- if (size.size > max_size.size ||
- (size.file_count + size.dir_count) >
- (max_size.file_count + max_size.dir_count))
- read_subvol = i;
-
- if (size.size > max_size.size)
- max_size.size = size.size;
- if (size.file_count > max_size.file_count)
- max_size.file_count = size.file_count;
- if (size.dir_count > max_size.dir_count)
- max_size.dir_count = size.dir_count;
- }
-
- if (max_size.size == 0 && max_size.file_count == 0 &&
- max_size.dir_count == 0)
- return read_subvol;
-
- for (i = 0; i < priv->child_count; i++) {
- if (!replies[i].valid || replies[i].op_ret == -1)
- continue;
- if (readable_cnt && !readable[i])
- continue;
- if (!replies[i].xdata)
- continue;
- quota_dict_set_meta (replies[i].xdata, QUOTA_SIZE_KEY,
- &max_size, IA_IFDIR);
- }
-
+ unsigned char *readable = NULL;
+ afr_local_t *local = NULL;
+ afr_private_t *priv = NULL;
+ struct afr_reply *replies = NULL;
+ int i = 0;
+ int ret = 0;
+ quota_meta_t size = {
+ 0,
+ };
+ quota_meta_t max_size = {
+ 0,
+ };
+ int readable_cnt = 0;
+ int read_subvol = -1;
+
+ local = frame->local;
+ priv = this->private;
+ replies = local->replies;
+
+ readable = alloca0(priv->child_count);
+
+ afr_inode_read_subvol_get(local->inode, this, readable, 0, 0);
+
+ readable_cnt = AFR_COUNT(readable, priv->child_count);
+
+ for (i = 0; i < priv->child_count; i++) {
+ if (!replies[i].valid || replies[i].op_ret == -1)
+ continue;
+ if (readable_cnt && !readable[i])
+ continue;
+ if (!replies[i].xdata)
+ continue;
+ ret = quota_dict_get_meta(replies[i].xdata, QUOTA_SIZE_KEY,
+ SLEN(QUOTA_SIZE_KEY), &size);
+ if (ret == -1)
+ continue;
+ if (read_subvol == -1)
+ read_subvol = i;
+ if (size.size > max_size.size ||
+ (size.file_count + size.dir_count) >
+ (max_size.file_count + max_size.dir_count))
+ read_subvol = i;
+
+ if (size.size > max_size.size)
+ max_size.size = size.size;
+ if (size.file_count > max_size.file_count)
+ max_size.file_count = size.file_count;
+ if (size.dir_count > max_size.dir_count)
+ max_size.dir_count = size.dir_count;
+ }
+
+ if (max_size.size == 0 && max_size.file_count == 0 &&
+ max_size.dir_count == 0)
return read_subvol;
-}
+ for (i = 0; i < priv->child_count; i++) {
+ if (!replies[i].valid || replies[i].op_ret == -1)
+ continue;
+ if (readable_cnt && !readable[i])
+ continue;
+ if (!replies[i].xdata)
+ continue;
+ quota_dict_set_meta(replies[i].xdata, QUOTA_SIZE_KEY, &max_size,
+ IA_IFDIR);
+ }
+
+ return read_subvol;
+}
/* {{{ access */
int
-afr_access_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
- int op_ret, int op_errno, dict_t *xdata)
+afr_access_cbk(call_frame_t *frame, void *cookie, xlator_t *this, int op_ret,
+ int op_errno, dict_t *xdata)
{
- afr_local_t *local = NULL;
+ afr_local_t *local = NULL;
- local = frame->local;
+ local = frame->local;
- if (op_ret < 0) {
- local->op_ret = op_ret;
- local->op_errno = op_errno;
+ if (op_ret < 0) {
+ local->op_ret = op_ret;
+ local->op_errno = op_errno;
- afr_read_txn_continue (frame, this, (long) cookie);
- return 0;
- }
+ afr_read_txn_continue(frame, this, (long)cookie);
+ return 0;
+ }
- AFR_STACK_UNWIND (access, frame, op_ret, op_errno, xdata);
+ AFR_STACK_UNWIND(access, frame, op_ret, op_errno, xdata);
- return 0;
+ return 0;
}
-
int
-afr_access_wind (call_frame_t *frame, xlator_t *this, int subvol)
+afr_access_wind(call_frame_t *frame, xlator_t *this, int subvol)
{
- afr_private_t *priv = NULL;
- afr_local_t *local = NULL;
-
- priv = this->private;
- local = frame->local;
-
- if (subvol == -1) {
- AFR_STACK_UNWIND (access, frame, local->op_ret,
- local->op_errno, 0);
- return 0;
- }
-
- STACK_WIND_COOKIE (frame, afr_access_cbk, (void *) (long) subvol,
- priv->children[subvol],
- priv->children[subvol]->fops->access,
- &local->loc, local->cont.access.mask,
- local->xdata_req);
- return 0;
+ afr_private_t *priv = NULL;
+ afr_local_t *local = NULL;
+
+ priv = this->private;
+ local = frame->local;
+
+ if (subvol == -1) {
+ AFR_STACK_UNWIND(access, frame, local->op_ret, local->op_errno, 0);
+ return 0;
+ }
+
+ STACK_WIND_COOKIE(frame, afr_access_cbk, (void *)(long)subvol,
+ priv->children[subvol],
+ priv->children[subvol]->fops->access, &local->loc,
+ local->cont.access.mask, local->xdata_req);
+ return 0;
}
int
-afr_access (call_frame_t *frame, xlator_t *this, loc_t *loc,
- int mask, dict_t *xdata)
+afr_access(call_frame_t *frame, xlator_t *this, loc_t *loc, int mask,
+ dict_t *xdata)
{
- afr_local_t *local = NULL;
- int op_errno = 0;
+ afr_local_t *local = NULL;
+ int op_errno = 0;
- local = AFR_FRAME_INIT (frame, op_errno);
- if (!local)
- goto out;
+ local = AFR_FRAME_INIT(frame, op_errno);
+ if (!local)
+ goto out;
- local->op = GF_FOP_ACCESS;
- loc_copy (&local->loc, loc);
- local->cont.access.mask = mask;
- if (xdata)
- local->xdata_req = dict_ref (xdata);
+ local->op = GF_FOP_ACCESS;
+ loc_copy(&local->loc, loc);
+ local->cont.access.mask = mask;
+ if (xdata)
+ local->xdata_req = dict_ref(xdata);
- afr_read_txn (frame, this, loc->inode, afr_access_wind,
- AFR_METADATA_TRANSACTION);
+ afr_read_txn(frame, this, loc->inode, afr_access_wind,
+ AFR_METADATA_TRANSACTION);
- return 0;
+ return 0;
out:
- AFR_STACK_UNWIND (access, frame, -1, op_errno, NULL);
+ AFR_STACK_UNWIND(access, frame, -1, op_errno, NULL);
- return 0;
+ return 0;
}
/* }}} */
@@ -192,152 +187,140 @@ out:
/* {{{ stat */
int
-afr_stat_cbk (call_frame_t *frame, void *cookie,
- xlator_t *this, int32_t op_ret, int32_t op_errno,
- struct iatt *buf, dict_t *xdata)
+afr_stat_cbk(call_frame_t *frame, void *cookie, xlator_t *this, int32_t op_ret,
+ int32_t op_errno, struct iatt *buf, dict_t *xdata)
{
- afr_local_t *local = NULL;
+ afr_local_t *local = NULL;
- local = frame->local;
+ local = frame->local;
- if (op_ret < 0) {
- local->op_ret = op_ret;
- local->op_errno = op_errno;
+ if (op_ret < 0) {
+ local->op_ret = op_ret;
+ local->op_errno = op_errno;
- afr_read_txn_continue (frame, this, (long) cookie);
- return 0;
- }
+ afr_read_txn_continue(frame, this, (long)cookie);
+ return 0;
+ }
- AFR_STACK_UNWIND (stat, frame, op_ret, op_errno, buf, xdata);
+ AFR_STACK_UNWIND(stat, frame, op_ret, op_errno, buf, xdata);
- return 0;
+ return 0;
}
-
int
-afr_stat_wind (call_frame_t *frame, xlator_t *this, int subvol)
+afr_stat_wind(call_frame_t *frame, xlator_t *this, int subvol)
{
- afr_private_t *priv = NULL;
- afr_local_t *local = NULL;
-
- priv = this->private;
- local = frame->local;
-
- if (subvol == -1) {
- AFR_STACK_UNWIND (stat, frame, local->op_ret, local->op_errno,
- 0, 0);
- return 0;
- }
-
- STACK_WIND_COOKIE (frame, afr_stat_cbk, (void *) (long) subvol,
- priv->children[subvol],
- priv->children[subvol]->fops->stat,
- &local->loc, local->xdata_req);
- return 0;
+ afr_private_t *priv = NULL;
+ afr_local_t *local = NULL;
+
+ priv = this->private;
+ local = frame->local;
+
+ if (subvol == -1) {
+ AFR_STACK_UNWIND(stat, frame, local->op_ret, local->op_errno, 0, 0);
+ return 0;
+ }
+
+ STACK_WIND_COOKIE(
+ frame, afr_stat_cbk, (void *)(long)subvol, priv->children[subvol],
+ priv->children[subvol]->fops->stat, &local->loc, local->xdata_req);
+ return 0;
}
int
-afr_stat (call_frame_t *frame, xlator_t *this, loc_t *loc, dict_t *xdata)
+afr_stat(call_frame_t *frame, xlator_t *this, loc_t *loc, dict_t *xdata)
{
- afr_local_t *local = NULL;
- int op_errno = 0;
+ afr_local_t *local = NULL;
+ int op_errno = 0;
- local = AFR_FRAME_INIT (frame, op_errno);
- if (!local)
- goto out;
+ local = AFR_FRAME_INIT(frame, op_errno);
+ if (!local)
+ goto out;
- local->op = GF_FOP_STAT;
- loc_copy (&local->loc, loc);
- if (xdata)
- local->xdata_req = dict_ref (xdata);
+ local->op = GF_FOP_STAT;
+ loc_copy(&local->loc, loc);
+ if (xdata)
+ local->xdata_req = dict_ref(xdata);
- afr_read_txn (frame, this, loc->inode, afr_stat_wind,
- AFR_DATA_TRANSACTION);
+ afr_read_txn(frame, this, loc->inode, afr_stat_wind, AFR_DATA_TRANSACTION);
- return 0;
+ return 0;
out:
- AFR_STACK_UNWIND (stat, frame, -1, op_errno, NULL, NULL);
+ AFR_STACK_UNWIND(stat, frame, -1, op_errno, NULL, NULL);
- return 0;
+ return 0;
}
-
/* }}} */
/* {{{ fstat */
int
-afr_fstat_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
- int32_t op_ret, int32_t op_errno, struct iatt *buf,
- dict_t *xdata)
+afr_fstat_cbk(call_frame_t *frame, void *cookie, xlator_t *this, int32_t op_ret,
+ int32_t op_errno, struct iatt *buf, dict_t *xdata)
{
- afr_local_t *local = NULL;
+ afr_local_t *local = NULL;
- local = frame->local;
+ local = frame->local;
- if (op_ret < 0) {
- local->op_ret = op_ret;
- local->op_errno = op_errno;
+ if (op_ret < 0) {
+ local->op_ret = op_ret;
+ local->op_errno = op_errno;
- afr_read_txn_continue (frame, this, (long) cookie);
- return 0;
- }
+ afr_read_txn_continue(frame, this, (long)cookie);
+ return 0;
+ }
- AFR_STACK_UNWIND (fstat, frame, op_ret, op_errno, buf, xdata);
+ AFR_STACK_UNWIND(fstat, frame, op_ret, op_errno, buf, xdata);
- return 0;
+ return 0;
}
-
int
-afr_fstat_wind (call_frame_t *frame, xlator_t *this, int subvol)
+afr_fstat_wind(call_frame_t *frame, xlator_t *this, int subvol)
{
- afr_private_t *priv = NULL;
- afr_local_t *local = NULL;
-
- priv = this->private;
- local = frame->local;
-
- if (subvol == -1) {
- AFR_STACK_UNWIND (fstat, frame, local->op_ret, local->op_errno,
- 0, 0);
- return 0;
- }
-
- STACK_WIND_COOKIE (frame, afr_fstat_cbk, (void *) (long) subvol,
- priv->children[subvol],
- priv->children[subvol]->fops->fstat,
- local->fd, local->xdata_req);
- return 0;
-}
+ afr_private_t *priv = NULL;
+ afr_local_t *local = NULL;
+ priv = this->private;
+ local = frame->local;
+
+ if (subvol == -1) {
+ AFR_STACK_UNWIND(fstat, frame, local->op_ret, local->op_errno, 0, 0);
+ return 0;
+ }
+
+ STACK_WIND_COOKIE(
+ frame, afr_fstat_cbk, (void *)(long)subvol, priv->children[subvol],
+ priv->children[subvol]->fops->fstat, local->fd, local->xdata_req);
+ return 0;
+}
int32_t
-afr_fstat (call_frame_t *frame, xlator_t *this,
- fd_t *fd, dict_t *xdata)
+afr_fstat(call_frame_t *frame, xlator_t *this, fd_t *fd, dict_t *xdata)
{
- afr_local_t *local = NULL;
- int op_errno = 0;
+ afr_local_t *local = NULL;
+ int op_errno = 0;
- local = AFR_FRAME_INIT (frame, op_errno);
- if (!local)
- goto out;
+ AFR_ERROR_OUT_IF_FDCTX_INVALID(fd, this, op_errno, out);
+ local = AFR_FRAME_INIT(frame, op_errno);
+ if (!local)
+ goto out;
- local->op = GF_FOP_FSTAT;
- local->fd = fd_ref (fd);
- if (xdata)
- local->xdata_req = dict_ref (xdata);
+ local->op = GF_FOP_FSTAT;
+ local->fd = fd_ref(fd);
+ if (xdata)
+ local->xdata_req = dict_ref(xdata);
- afr_fix_open (fd, this);
+ afr_fix_open(fd, this);
- afr_read_txn (frame, this, fd->inode, afr_fstat_wind,
- AFR_DATA_TRANSACTION);
+ afr_read_txn(frame, this, fd->inode, afr_fstat_wind, AFR_DATA_TRANSACTION);
- return 0;
+ return 0;
out:
- AFR_STACK_UNWIND (fstat, frame, -1, op_errno, NULL, NULL);
+ AFR_STACK_UNWIND(fstat, frame, -1, op_errno, NULL, NULL);
- return 0;
+ return 0;
}
/* }}} */
@@ -345,1437 +328,1493 @@ out:
/* {{{ readlink */
int
-afr_readlink_cbk (call_frame_t *frame, void *cookie,
- xlator_t *this, int32_t op_ret, int32_t op_errno,
- const char *buf, struct iatt *sbuf, dict_t *xdata)
+afr_readlink_cbk(call_frame_t *frame, void *cookie, xlator_t *this,
+ int32_t op_ret, int32_t op_errno, const char *buf,
+ struct iatt *sbuf, dict_t *xdata)
{
- afr_local_t *local = NULL;
+ afr_local_t *local = NULL;
- local = frame->local;
+ local = frame->local;
- if (op_ret < 0) {
- local->op_ret = -1;
- local->op_errno = op_errno;
+ if (op_ret < 0) {
+ local->op_ret = -1;
+ local->op_errno = op_errno;
- afr_read_txn_continue (frame, this, (long) cookie);
- return 0;
- }
+ afr_read_txn_continue(frame, this, (long)cookie);
+ return 0;
+ }
- AFR_STACK_UNWIND (readlink, frame, op_ret, op_errno,
- buf, sbuf, xdata);
- return 0;
+ AFR_STACK_UNWIND(readlink, frame, op_ret, op_errno, buf, sbuf, xdata);
+ return 0;
}
int
-afr_readlink_wind (call_frame_t *frame, xlator_t *this, int subvol)
+afr_readlink_wind(call_frame_t *frame, xlator_t *this, int subvol)
{
- afr_local_t *local = NULL;
- afr_private_t *priv = NULL;
-
- local = frame->local;
- priv = this->private;
-
- if (subvol == -1) {
- AFR_STACK_UNWIND (readlink, frame, local->op_ret,
- local->op_errno, 0, 0, 0);
- return 0;
- }
-
- STACK_WIND_COOKIE (frame, afr_readlink_cbk, (void *) (long) subvol,
- priv->children[subvol],
- priv->children[subvol]->fops->readlink,
- &local->loc, local->cont.readlink.size,
- local->xdata_req);
- return 0;
-}
+ afr_local_t *local = NULL;
+ afr_private_t *priv = NULL;
+ local = frame->local;
+ priv = this->private;
+
+ if (subvol == -1) {
+ AFR_STACK_UNWIND(readlink, frame, local->op_ret, local->op_errno, 0, 0,
+ 0);
+ return 0;
+ }
+
+ STACK_WIND_COOKIE(frame, afr_readlink_cbk, (void *)(long)subvol,
+ priv->children[subvol],
+ priv->children[subvol]->fops->readlink, &local->loc,
+ local->cont.readlink.size, local->xdata_req);
+ return 0;
+}
int
-afr_readlink (call_frame_t *frame, xlator_t *this,
- loc_t *loc, size_t size, dict_t *xdata)
+afr_readlink(call_frame_t *frame, xlator_t *this, loc_t *loc, size_t size,
+ dict_t *xdata)
{
- afr_local_t * local = NULL;
- int32_t op_errno = 0;
+ afr_local_t *local = NULL;
+ int32_t op_errno = 0;
- local = AFR_FRAME_INIT (frame, op_errno);
- if (!local)
- goto out;
+ local = AFR_FRAME_INIT(frame, op_errno);
+ if (!local)
+ goto out;
- local->op = GF_FOP_READLINK;
- loc_copy (&local->loc, loc);
- local->cont.readlink.size = size;
- if (xdata)
- local->xdata_req = dict_ref (xdata);
+ local->op = GF_FOP_READLINK;
+ loc_copy(&local->loc, loc);
+ local->cont.readlink.size = size;
+ if (xdata)
+ local->xdata_req = dict_ref(xdata);
- afr_read_txn (frame, this, loc->inode, afr_readlink_wind,
- AFR_DATA_TRANSACTION);
+ afr_read_txn(frame, this, loc->inode, afr_readlink_wind,
+ AFR_DATA_TRANSACTION);
- return 0;
+ return 0;
out:
- AFR_STACK_UNWIND(readlink, frame, -1, op_errno, 0, 0, 0);
+ AFR_STACK_UNWIND(readlink, frame, -1, op_errno, 0, 0, 0);
- return 0;
+ return 0;
}
-
/* }}} */
/* {{{ getxattr */
struct _xattr_key {
- char *key;
- struct list_head list;
+ char *key;
+ struct list_head list;
};
-
int
-__gather_xattr_keys (dict_t *dict, char *key, data_t *value,
- void *data)
+__gather_xattr_keys(dict_t *dict, char *key, data_t *value, void *data)
{
- struct list_head * list = data;
- struct _xattr_key * xkey = NULL;
-
- if (!strncmp (key, AFR_XATTR_PREFIX,
- strlen (AFR_XATTR_PREFIX))) {
+ struct list_head *list = data;
+ struct _xattr_key *xkey = NULL;
- xkey = GF_CALLOC (1, sizeof (*xkey), gf_afr_mt_xattr_key);
- if (!xkey)
- return -1;
+ if (!strncmp(key, AFR_XATTR_PREFIX, SLEN(AFR_XATTR_PREFIX))) {
+ xkey = GF_MALLOC(sizeof(*xkey), gf_afr_mt_xattr_key);
+ if (!xkey)
+ return -1;
- xkey->key = key;
- INIT_LIST_HEAD (&xkey->list);
+ xkey->key = key;
+ INIT_LIST_HEAD(&xkey->list);
- list_add_tail (&xkey->list, list);
- }
- return 0;
+ list_add_tail(&xkey->list, list);
+ }
+ return 0;
}
-
void
-afr_filter_xattrs (dict_t *dict)
+afr_filter_xattrs(dict_t *dict)
{
- struct list_head keys = {0,};
- struct _xattr_key *key = NULL;
- struct _xattr_key *tmp = NULL;
+ struct list_head keys = {
+ 0,
+ };
+ struct _xattr_key *key = NULL;
+ struct _xattr_key *tmp = NULL;
- INIT_LIST_HEAD (&keys);
+ INIT_LIST_HEAD(&keys);
- dict_foreach (dict, __gather_xattr_keys,
- (void *) &keys);
+ dict_foreach(dict, __gather_xattr_keys, (void *)&keys);
- list_for_each_entry_safe (key, tmp, &keys, list) {
- dict_del (dict, key->key);
+ list_for_each_entry_safe(key, tmp, &keys, list)
+ {
+ dict_del(dict, key->key);
- list_del_init (&key->list);
+ list_del_init(&key->list);
- GF_FREE (key);
- }
+ GF_FREE(key);
+ }
}
+static gf_boolean_t
+afr_getxattr_ignorable_errnos(int32_t op_errno)
+{
+ if (op_errno == ENODATA || op_errno == ENOTSUP || op_errno == ERANGE ||
+ op_errno == ENAMETOOLONG)
+ return _gf_true;
+ return _gf_false;
+}
int
-afr_getxattr_cbk (call_frame_t *frame, void *cookie,
- xlator_t *this, int32_t op_ret, int32_t op_errno,
- dict_t *dict, dict_t *xdata)
+afr_getxattr_cbk(call_frame_t *frame, void *cookie, xlator_t *this,
+ int32_t op_ret, int32_t op_errno, dict_t *dict, dict_t *xdata)
{
- afr_local_t *local = NULL;
+ afr_local_t *local = NULL;
- local = frame->local;
+ local = frame->local;
- if (op_ret < 0) {
- local->op_ret = op_ret;
- local->op_errno = op_errno;
+ if (op_ret < 0 && !afr_getxattr_ignorable_errnos(op_errno)) {
+ local->op_ret = op_ret;
+ local->op_errno = op_errno;
- afr_read_txn_continue (frame, this, (long) cookie);
- return 0;
- }
+ afr_read_txn_continue(frame, this, (long)cookie);
+ return 0;
+ }
- if (dict)
- afr_filter_xattrs (dict);
+ if (dict)
+ afr_filter_xattrs(dict);
- AFR_STACK_UNWIND (getxattr, frame, op_ret, op_errno, dict, xdata);
+ AFR_STACK_UNWIND(getxattr, frame, op_ret, op_errno, dict, xdata);
- return 0;
+ return 0;
}
-
int
-afr_getxattr_wind (call_frame_t *frame, xlator_t *this, int subvol)
+afr_getxattr_wind(call_frame_t *frame, xlator_t *this, int subvol)
{
- afr_local_t *local = NULL;
- afr_private_t *priv = NULL;
-
- local = frame->local;
- priv = this->private;
-
- if (subvol == -1) {
- AFR_STACK_UNWIND (getxattr, frame, local->op_ret,
- local->op_errno, NULL, NULL);
- return 0;
- }
-
- STACK_WIND_COOKIE (frame, afr_getxattr_cbk, (void *) (long) subvol,
- priv->children[subvol],
- priv->children[subvol]->fops->getxattr,
- &local->loc, local->cont.getxattr.name,
- local->xdata_req);
- return 0;
-}
+ afr_local_t *local = NULL;
+ afr_private_t *priv = NULL;
+ local = frame->local;
+ priv = this->private;
+
+ if (subvol == -1) {
+ AFR_STACK_UNWIND(getxattr, frame, local->op_ret, local->op_errno, NULL,
+ NULL);
+ return 0;
+ }
+
+ STACK_WIND_COOKIE(frame, afr_getxattr_cbk, (void *)(long)subvol,
+ priv->children[subvol],
+ priv->children[subvol]->fops->getxattr, &local->loc,
+ local->cont.getxattr.name, local->xdata_req);
+ return 0;
+}
int32_t
-afr_getxattr_unwind (call_frame_t *frame, int op_ret, int op_errno,
- dict_t *dict, dict_t *xdata)
+afr_getxattr_unwind(call_frame_t *frame, int op_ret, int op_errno, dict_t *dict,
+ dict_t *xdata)
{
- AFR_STACK_UNWIND (getxattr, frame, op_ret, op_errno, dict, xdata);
- return 0;
+ AFR_STACK_UNWIND(getxattr, frame, op_ret, op_errno, dict, xdata);
+ return 0;
}
int32_t
-afr_fgetxattr_clrlk_cbk (call_frame_t *frame, void *cookie,
- xlator_t *this, int32_t op_ret, int32_t op_errno,
- dict_t *dict, dict_t *xdata)
+afr_fgetxattr_clrlk_cbk(call_frame_t *frame, void *cookie, xlator_t *this,
+ int32_t op_ret, int32_t op_errno, dict_t *dict,
+ dict_t *xdata)
{
- afr_local_t *local = NULL;
- afr_private_t *priv = NULL;
- xlator_t **children = NULL;
- dict_t *xattr = NULL;
- char *tmp_report = NULL;
- char lk_summary[1024] = {0,};
- int serz_len = 0;
- int32_t callcnt = 0;
- long int cky = 0;
- int ret = 0;
-
- priv = this->private;
- children = priv->children;
-
- local = frame->local;
- cky = (long) cookie;
-
- LOCK (&frame->lock);
- {
- callcnt = --local->call_count;
- if (op_ret == -1)
- local->replies[cky].op_errno = op_errno;
-
- if (!local->dict)
- local->dict = dict_new ();
- if (local->dict) {
- ret = dict_get_str (dict, local->cont.getxattr.name,
- &tmp_report);
- if (ret)
- goto unlock;
- ret = dict_set_dynstr (local->dict,
- children[cky]->name,
- gf_strdup (tmp_report));
- if (ret)
- goto unlock;
- }
+ afr_local_t *local = NULL;
+ afr_private_t *priv = NULL;
+ xlator_t **children = NULL;
+ dict_t *xattr = NULL;
+ char *tmp_report = NULL;
+ char lk_summary[1024] = {
+ 0,
+ };
+ int serz_len = 0;
+ int32_t callcnt = 0;
+ long int cky = 0;
+ int ret = 0;
+ int keylen = 0;
+ int children_keylen = 0;
+
+ priv = this->private;
+ children = priv->children;
+
+ local = frame->local;
+ cky = (long)cookie;
+ keylen = strlen(local->cont.getxattr.name);
+ children_keylen = strlen(children[cky]->name);
+
+ LOCK(&frame->lock);
+ {
+ callcnt = --local->call_count;
+ if (op_ret == -1)
+ local->replies[cky].op_errno = op_errno;
+
+ if (!local->dict)
+ local->dict = dict_new();
+ if (local->dict) {
+ ret = dict_get_strn(dict, local->cont.getxattr.name, keylen,
+ &tmp_report);
+ if (ret)
+ goto unlock;
+ ret = dict_set_dynstrn(local->dict, children[cky]->name,
+ children_keylen, gf_strdup(tmp_report));
+ if (ret)
+ goto unlock;
}
+ }
unlock:
- UNLOCK (&frame->lock);
-
- if (!callcnt) {
- xattr = dict_new ();
- if (!xattr) {
- op_ret = -1;
- op_errno = ENOMEM;
- goto unwind;
- }
- ret = dict_serialize_value_with_delim (local->dict,
- lk_summary,
- &serz_len, '\n');
- if (ret) {
- op_ret = -1;
- op_errno = ENOMEM;
- goto unwind;
- }
- if (serz_len == -1)
- snprintf (lk_summary, sizeof (lk_summary),
- "No locks cleared.");
- ret = dict_set_dynstr (xattr, local->cont.getxattr.name,
- gf_strdup (lk_summary));
- if (ret) {
- op_ret = -1;
- op_errno = ENOMEM;
- gf_msg (this->name, GF_LOG_ERROR,
- ENOMEM, AFR_MSG_DICT_SET_FAILED,
- "Error setting dictionary");
- goto unwind;
- }
+ UNLOCK(&frame->lock);
+
+ if (!callcnt) {
+ xattr = dict_new();
+ if (!xattr) {
+ op_ret = -1;
+ op_errno = ENOMEM;
+ goto unwind;
+ }
+ ret = dict_serialize_value_with_delim(local->dict, lk_summary,
+ &serz_len, '\n');
+ if (ret) {
+ op_ret = -1;
+ op_errno = ENOMEM;
+ goto unwind;
+ }
+ if (serz_len == -1)
+ snprintf(lk_summary, sizeof(lk_summary), "No locks cleared.");
+ ret = dict_set_dynstrn(xattr, local->cont.getxattr.name, keylen,
+ gf_strdup(lk_summary));
+ if (ret) {
+ op_ret = -1;
+ op_errno = ENOMEM;
+ gf_msg(this->name, GF_LOG_ERROR, ENOMEM, AFR_MSG_DICT_SET_FAILED,
+ "Error setting dictionary");
+ goto unwind;
+ }
- unwind:
- // Updating child_errno with more recent 'events'
- op_errno = afr_final_errno (local, priv);
+ op_errno = afr_final_errno(local, priv);
- AFR_STACK_UNWIND (fgetxattr, frame, op_ret, op_errno, xattr,
- xdata);
- if (xattr)
- dict_unref (xattr);
- }
+ unwind:
+ AFR_STACK_UNWIND(fgetxattr, frame, op_ret, op_errno, xattr, xdata);
+ if (xattr)
+ dict_unref(xattr);
+ }
- return ret;
+ return ret;
}
int32_t
-afr_getxattr_clrlk_cbk (call_frame_t *frame, void *cookie,
- xlator_t *this, int32_t op_ret, int32_t op_errno,
- dict_t *dict, dict_t *xdata)
+afr_getxattr_clrlk_cbk(call_frame_t *frame, void *cookie, xlator_t *this,
+ int32_t op_ret, int32_t op_errno, dict_t *dict,
+ dict_t *xdata)
{
- afr_local_t *local = NULL;
- afr_private_t *priv = NULL;
- xlator_t **children = NULL;
- dict_t *xattr = NULL;
- char *tmp_report = NULL;
- char lk_summary[1024] = {0,};
- int serz_len = 0;
- int32_t callcnt = 0;
- long int cky = 0;
- int ret = 0;
-
- priv = this->private;
- children = priv->children;
-
- local = frame->local;
- cky = (long) cookie;
-
- LOCK (&frame->lock);
- {
- callcnt = --local->call_count;
- if (op_ret == -1)
- local->replies[cky].op_errno = op_errno;
-
- if (!local->dict)
- local->dict = dict_new ();
- if (local->dict) {
- ret = dict_get_str (dict, local->cont.getxattr.name,
- &tmp_report);
- if (ret)
- goto unlock;
- ret = dict_set_dynstr (local->dict,
- children[cky]->name,
- gf_strdup (tmp_report));
- if (ret)
- goto unlock;
- }
+ afr_local_t *local = NULL;
+ afr_private_t *priv = NULL;
+ xlator_t **children = NULL;
+ dict_t *xattr = NULL;
+ char *tmp_report = NULL;
+ char lk_summary[1024] = {
+ 0,
+ };
+ int serz_len = 0;
+ int32_t callcnt = 0;
+ long int cky = 0;
+ int ret = 0;
+ int keylen = 0;
+ int children_keylen = 0;
+
+ priv = this->private;
+ children = priv->children;
+
+ local = frame->local;
+ cky = (long)cookie;
+
+ keylen = strlen(local->cont.getxattr.name);
+ children_keylen = strlen(children[cky]->name);
+
+ LOCK(&frame->lock);
+ {
+ callcnt = --local->call_count;
+ if (op_ret == -1)
+ local->replies[cky].op_errno = op_errno;
+
+ if (!local->dict)
+ local->dict = dict_new();
+ if (local->dict) {
+ ret = dict_get_strn(dict, local->cont.getxattr.name, keylen,
+ &tmp_report);
+ if (ret)
+ goto unlock;
+ ret = dict_set_dynstrn(local->dict, children[cky]->name,
+ children_keylen, gf_strdup(tmp_report));
+ if (ret)
+ goto unlock;
}
+ }
unlock:
- UNLOCK (&frame->lock);
-
- if (!callcnt) {
- xattr = dict_new ();
- if (!xattr) {
- op_ret = -1;
- op_errno = ENOMEM;
- goto unwind;
- }
- ret = dict_serialize_value_with_delim (local->dict,
- lk_summary,
- &serz_len, '\n');
- if (ret) {
- op_ret = -1;
- op_errno = ENOMEM;
- goto unwind;
- }
- if (serz_len == -1)
- snprintf (lk_summary, sizeof (lk_summary),
- "No locks cleared.");
- ret = dict_set_dynstr (xattr, local->cont.getxattr.name,
- gf_strdup (lk_summary));
- if (ret) {
- op_ret = -1;
- op_errno = ENOMEM;
- gf_msg (this->name, GF_LOG_ERROR,
- ENOMEM, AFR_MSG_DICT_SET_FAILED,
- "Error setting dictionary");
- goto unwind;
- }
+ UNLOCK(&frame->lock);
+
+ if (!callcnt) {
+ xattr = dict_new();
+ if (!xattr) {
+ op_ret = -1;
+ op_errno = ENOMEM;
+ goto unwind;
+ }
+ ret = dict_serialize_value_with_delim(local->dict, lk_summary,
+ &serz_len, '\n');
+ if (ret) {
+ op_ret = -1;
+ op_errno = ENOMEM;
+ goto unwind;
+ }
+ if (serz_len == -1)
+ snprintf(lk_summary, sizeof(lk_summary), "No locks cleared.");
+ ret = dict_set_dynstrn(xattr, local->cont.getxattr.name, keylen,
+ gf_strdup(lk_summary));
+ if (ret) {
+ op_ret = -1;
+ op_errno = ENOMEM;
+ gf_msg(this->name, GF_LOG_ERROR, ENOMEM, AFR_MSG_DICT_SET_FAILED,
+ "Error setting dictionary");
+ goto unwind;
+ }
- unwind:
- // Updating child_errno with more recent 'events'
- op_errno = afr_final_errno (local, priv);
+ op_errno = afr_final_errno(local, priv);
- AFR_STACK_UNWIND (getxattr, frame, op_ret, op_errno, xattr, xdata);
+ unwind:
+ AFR_STACK_UNWIND(getxattr, frame, op_ret, op_errno, xattr, xdata);
- if (xattr)
- dict_unref (xattr);
- }
+ if (xattr)
+ dict_unref(xattr);
+ }
- return ret;
+ return ret;
}
/**
* node-uuid cbk uses next child querying mechanism
*/
int32_t
-afr_getxattr_node_uuid_cbk (call_frame_t *frame, void *cookie,
- xlator_t *this, int32_t op_ret, int32_t op_errno,
- dict_t *dict, dict_t *xdata)
+afr_getxattr_node_uuid_cbk(call_frame_t *frame, void *cookie, xlator_t *this,
+ int32_t op_ret, int32_t op_errno, dict_t *dict,
+ dict_t *xdata)
{
- afr_private_t *priv = NULL;
- afr_local_t *local = NULL;
- xlator_t **children = NULL;
- int unwind = 1;
- int curr_call_child = 0;
+ afr_private_t *priv = NULL;
+ afr_local_t *local = NULL;
+ xlator_t **children = NULL;
+ int unwind = 1;
+ int curr_call_child = 0;
- priv = this->private;
- children = priv->children;
+ priv = this->private;
+ children = priv->children;
- local = frame->local;
+ local = frame->local;
+
+ if (op_ret == -1) { /** query the _next_ child */
- if (op_ret == -1) { /** query the _next_ child */
-
- /**
- * _current_ becomes _next_
- * If done with all childs and yet no success; give up !
- */
- curr_call_child = (int) ((long)cookie);
- if (++curr_call_child == priv->child_count)
- goto unwind;
-
- gf_msg_debug (this->name, op_errno,
- "op_ret (-1): Re-querying afr-child (%d/%d)",
- curr_call_child, priv->child_count);
-
- unwind = 0;
- STACK_WIND_COOKIE (frame, afr_getxattr_node_uuid_cbk,
- (void *) (long) curr_call_child,
- children[curr_call_child],
- children[curr_call_child]->fops->getxattr,
- &local->loc,
- local->cont.getxattr.name,
- NULL);
+ /**
+ * _current_ becomes _next_
+ * If done with all children and yet no success; give up !
+ */
+ curr_call_child = (int)((long)cookie);
+ if (++curr_call_child == priv->child_count)
+ goto unwind;
+
+ gf_msg_debug(this->name, op_errno,
+ "op_ret (-1): Re-querying afr-child (%d/%d)",
+ curr_call_child, priv->child_count);
+
+ unwind = 0;
+ STACK_WIND_COOKIE(
+ frame, afr_getxattr_node_uuid_cbk, (void *)(long)curr_call_child,
+ children[curr_call_child],
+ children[curr_call_child]->fops->getxattr, &local->loc,
+ local->cont.getxattr.name, local->xdata_req);
+ }
+
+unwind:
+ if (unwind)
+ AFR_STACK_UNWIND(getxattr, frame, op_ret, op_errno, dict, xdata);
+
+ return 0;
+}
+
+/**
+ * list-node-uuids cbk returns the list of node_uuids for the subvolume.
+ */
+int32_t
+afr_getxattr_list_node_uuids_cbk(call_frame_t *frame, void *cookie,
+ xlator_t *this, int32_t op_ret,
+ int32_t op_errno, dict_t *dict, dict_t *xdata)
+{
+ afr_local_t *local = NULL;
+ afr_private_t *priv = NULL;
+ int32_t callcnt = 0;
+ int ret = 0;
+ char *xattr_serz = NULL;
+ long cky = 0;
+ int32_t tlen = 0;
+
+ local = frame->local;
+ priv = this->private;
+ cky = (long)cookie;
+
+ LOCK(&frame->lock);
+ {
+ callcnt = --local->call_count;
+ local->replies[cky].valid = 1;
+ local->replies[cky].op_ret = op_ret;
+ local->replies[cky].op_errno = op_errno;
+
+ if (op_ret < 0)
+ goto unlock;
+
+ local->op_ret = 0;
+
+ if (!local->xdata_rsp && xdata)
+ local->xdata_rsp = dict_ref(xdata);
+ local->replies[cky].xattr = dict_ref(dict);
+ }
+
+unlock:
+ UNLOCK(&frame->lock);
+
+ if (!callcnt) {
+ if (local->op_ret != 0) {
+ /* All bricks gave an error. */
+ local->op_errno = afr_final_errno(local, priv);
+ goto unwind;
+ }
+
+ /*Since we store the UUID0_STR as node uuid for down bricks and
+ *for non zero op_ret, assigning length to priv->child_count
+ *number of uuids*/
+ local->cont.getxattr.xattr_len = (SLEN(UUID0_STR) + 2) *
+ priv->child_count;
+
+ if (!local->dict)
+ local->dict = dict_new();
+ if (!local->dict) {
+ local->op_ret = -1;
+ local->op_errno = ENOMEM;
+ goto unwind;
}
- unwind:
- if (unwind)
- AFR_STACK_UNWIND (getxattr, frame, op_ret, op_errno, dict,
- NULL);
+ xattr_serz = GF_CALLOC(local->cont.getxattr.xattr_len, sizeof(char),
+ gf_common_mt_char);
- return 0;
+ if (!xattr_serz) {
+ local->op_ret = -1;
+ local->op_errno = ENOMEM;
+ goto unwind;
+ }
+
+ ret = afr_serialize_xattrs_with_delimiter(frame, this, xattr_serz,
+ UUID0_STR, &tlen, ' ');
+ if (ret) {
+ local->op_ret = -1;
+ local->op_errno = ENOMEM;
+ GF_FREE(xattr_serz);
+ goto unwind;
+ }
+ ret = dict_set_dynstr_sizen(local->dict, GF_XATTR_LIST_NODE_UUIDS_KEY,
+ xattr_serz);
+ if (ret) {
+ gf_msg(this->name, GF_LOG_ERROR, -ret, AFR_MSG_DICT_SET_FAILED,
+ "Cannot set node_uuid key in dict");
+ local->op_ret = -1;
+ local->op_errno = ENOMEM;
+ if (ret == -EINVAL)
+ GF_FREE(xattr_serz);
+ } else {
+ local->op_ret = local->cont.getxattr.xattr_len - 1;
+ local->op_errno = 0;
+ }
+
+ unwind:
+ AFR_STACK_UNWIND(getxattr, frame, local->op_ret, local->op_errno,
+ local->dict, local->xdata_rsp);
+ }
+
+ return ret;
}
int32_t
-afr_getxattr_quota_size_cbk (call_frame_t *frame, void *cookie,
- xlator_t *this, int32_t op_ret, int32_t op_errno,
- dict_t *dict, dict_t *xdata)
+afr_getxattr_quota_size_cbk(call_frame_t *frame, void *cookie, xlator_t *this,
+ int32_t op_ret, int32_t op_errno, dict_t *dict,
+ dict_t *xdata)
{
- int idx = (long) cookie;
- int call_count = 0;
- afr_local_t *local = frame->local;
- int read_subvol = -1;
-
- local->replies[idx].valid = 1;
- local->replies[idx].op_ret = op_ret;
- local->replies[idx].op_errno = op_errno;
- if (dict)
- local->replies[idx].xdata = dict_ref (dict);
- call_count = afr_frame_return (frame);
- if (call_count == 0) {
- local->inode = inode_ref (local->loc.inode);
- read_subvol = afr_handle_quota_size (frame, this);
- if (read_subvol != -1) {
- op_ret = local->replies[read_subvol].op_ret;
- op_errno = local->replies[read_subvol].op_errno;
- dict = local->replies[read_subvol].xdata;
- }
- AFR_STACK_UNWIND (getxattr, frame, op_ret, op_errno, dict,
- xdata);
+ int idx = (long)cookie;
+ int call_count = 0;
+ afr_local_t *local = frame->local;
+ int read_subvol = -1;
+
+ local->replies[idx].valid = 1;
+ local->replies[idx].op_ret = op_ret;
+ local->replies[idx].op_errno = op_errno;
+ if (dict)
+ local->replies[idx].xdata = dict_ref(dict);
+ call_count = afr_frame_return(frame);
+ if (call_count == 0) {
+ local->inode = inode_ref(local->loc.inode);
+ read_subvol = afr_handle_quota_size(frame, this);
+ if (read_subvol != -1) {
+ op_ret = local->replies[read_subvol].op_ret;
+ op_errno = local->replies[read_subvol].op_errno;
+ dict = local->replies[read_subvol].xdata;
}
+ AFR_STACK_UNWIND(getxattr, frame, op_ret, op_errno, dict, xdata);
+ }
- return 0;
+ return 0;
}
int32_t
-afr_getxattr_lockinfo_cbk (call_frame_t *frame, void *cookie,
- xlator_t *this, int32_t op_ret, int32_t op_errno,
- dict_t *dict, dict_t *xdata)
+afr_getxattr_lockinfo_cbk(call_frame_t *frame, void *cookie, xlator_t *this,
+ int32_t op_ret, int32_t op_errno, dict_t *dict,
+ dict_t *xdata)
{
- int call_cnt = 0, len = 0;
- char *lockinfo_buf = NULL;
- dict_t *lockinfo = NULL, *newdict = NULL;
- afr_local_t *local = NULL;
+ int call_cnt = 0, len = 0;
+ char *lockinfo_buf = NULL;
+ dict_t *lockinfo = NULL, *newdict = NULL;
+ afr_local_t *local = NULL;
- LOCK (&frame->lock);
- {
- local = frame->local;
-
- call_cnt = --local->call_count;
+ LOCK(&frame->lock);
+ {
+ local = frame->local;
- if ((op_ret < 0) || (!dict && !xdata)) {
- goto unlock;
- }
+ call_cnt = --local->call_count;
- if (xdata) {
- if (!local->xdata_rsp) {
- local->xdata_rsp = dict_new ();
- if (!local->xdata_rsp) {
- local->op_ret = -1;
- local->op_errno = ENOMEM;
- goto unlock;
- }
- }
- }
+ if ((op_ret < 0) || (!dict && !xdata)) {
+ goto unlock;
+ }
- if (!dict) {
- goto unlock;
+ if (xdata) {
+ if (!local->xdata_rsp) {
+ local->xdata_rsp = dict_new();
+ if (!local->xdata_rsp) {
+ local->op_ret = -1;
+ local->op_errno = ENOMEM;
+ goto unlock;
}
+ }
+ }
- op_ret = dict_get_ptr_and_len (dict, GF_XATTR_LOCKINFO_KEY,
- (void **)&lockinfo_buf, &len);
+ if (!dict) {
+ goto unlock;
+ }
- if (!lockinfo_buf) {
- goto unlock;
- }
+ op_ret = dict_get_ptr_and_len(dict, GF_XATTR_LOCKINFO_KEY,
+ (void **)&lockinfo_buf, &len);
- if (!local->dict) {
- local->dict = dict_new ();
- if (!local->dict) {
- local->op_ret = -1;
- local->op_errno = ENOMEM;
- goto unlock;
- }
- }
- }
-unlock:
- UNLOCK (&frame->lock);
-
- if (lockinfo_buf != NULL) {
- lockinfo = dict_new ();
- if (lockinfo == NULL) {
- local->op_ret = -1;
- local->op_errno = ENOMEM;
- } else {
- op_ret = dict_unserialize (lockinfo_buf, len,
- &lockinfo);
-
- if (lockinfo && local->dict) {
- dict_copy (lockinfo, local->dict);
- }
- }
+ if (!lockinfo_buf) {
+ goto unlock;
}
- if (xdata && local->xdata_rsp) {
- dict_copy (xdata, local->xdata_rsp);
+ if (!local->dict) {
+ local->dict = dict_new();
+ if (!local->dict) {
+ local->op_ret = -1;
+ local->op_errno = ENOMEM;
+ goto unlock;
+ }
}
+ }
+unlock:
+ UNLOCK(&frame->lock);
- if (!call_cnt) {
- newdict = dict_new ();
- if (!newdict) {
- local->op_ret = -1;
- local->op_errno = ENOMEM;
- goto unwind;
- }
-
- len = dict_serialized_length (local->dict);
- if (len <= 0) {
- goto unwind;
- }
-
- lockinfo_buf = GF_CALLOC (1, len, gf_common_mt_char);
- if (!lockinfo_buf) {
- local->op_ret = -1;
- local->op_errno = ENOMEM;
- goto unwind;
- }
+ if (lockinfo_buf != NULL) {
+ lockinfo = dict_new();
+ if (lockinfo == NULL) {
+ local->op_ret = -1;
+ local->op_errno = ENOMEM;
+ } else {
+ op_ret = dict_unserialize(lockinfo_buf, len, &lockinfo);
- op_ret = dict_serialize (local->dict, lockinfo_buf);
- if (op_ret < 0) {
- local->op_ret = -1;
- local->op_errno = -op_ret;
- }
+ if (lockinfo && local->dict) {
+ dict_copy(lockinfo, local->dict);
+ }
+ }
+ }
+
+ if (xdata && local->xdata_rsp) {
+ dict_copy(xdata, local->xdata_rsp);
+ }
+
+ if (!call_cnt) {
+ newdict = dict_new();
+ if (!newdict) {
+ local->op_ret = -1;
+ local->op_errno = ENOMEM;
+ goto unwind;
+ }
- op_ret = dict_set_dynptr (newdict, GF_XATTR_LOCKINFO_KEY,
- (void *)lockinfo_buf, len);
- if (op_ret < 0) {
- local->op_ret = -1;
- local->op_errno = -op_ret;
- goto unwind;
- }
+ op_ret = dict_allocate_and_serialize(
+ local->dict, (char **)&lockinfo_buf, (unsigned int *)&len);
+ if (op_ret != 0) {
+ local->op_ret = -1;
+ goto unwind;
+ }
- unwind:
- AFR_STACK_UNWIND (getxattr, frame, op_ret,
- op_errno, newdict,
- local->xdata_rsp);
+ op_ret = dict_set_dynptr(newdict, GF_XATTR_LOCKINFO_KEY,
+ (void *)lockinfo_buf, len);
+ if (op_ret < 0) {
+ local->op_ret = -1;
+ local->op_errno = -op_ret;
+ goto unwind;
}
- dict_unref (lockinfo);
+ unwind:
+ AFR_STACK_UNWIND(getxattr, frame, op_ret, op_errno, newdict,
+ local->xdata_rsp);
+ }
- return 0;
+ dict_unref(lockinfo);
+
+ return 0;
}
int32_t
-afr_fgetxattr_lockinfo_cbk (call_frame_t *frame, void *cookie,
- xlator_t *this, int32_t op_ret, int32_t op_errno,
- dict_t *dict, dict_t *xdata)
+afr_fgetxattr_lockinfo_cbk(call_frame_t *frame, void *cookie, xlator_t *this,
+ int32_t op_ret, int32_t op_errno, dict_t *dict,
+ dict_t *xdata)
{
- int call_cnt = 0, len = 0;
- char *lockinfo_buf = NULL;
- dict_t *lockinfo = NULL, *newdict = NULL;
- afr_local_t *local = NULL;
-
- LOCK (&frame->lock);
- {
- local = frame->local;
+ int call_cnt = 0, len = 0;
+ char *lockinfo_buf = NULL;
+ dict_t *lockinfo = NULL, *newdict = NULL;
+ afr_local_t *local = NULL;
- call_cnt = --local->call_count;
+ LOCK(&frame->lock);
+ {
+ local = frame->local;
- if ((op_ret < 0) || (!dict && !xdata)) {
- goto unlock;
- }
+ call_cnt = --local->call_count;
- if (xdata) {
- if (!local->xdata_rsp) {
- local->xdata_rsp = dict_new ();
- if (!local->xdata_rsp) {
- local->op_ret = -1;
- local->op_errno = ENOMEM;
- goto unlock;
- }
- }
- }
+ if ((op_ret < 0) || (!dict && !xdata)) {
+ goto unlock;
+ }
- if (!dict) {
- goto unlock;
+ if (xdata) {
+ if (!local->xdata_rsp) {
+ local->xdata_rsp = dict_new();
+ if (!local->xdata_rsp) {
+ local->op_ret = -1;
+ local->op_errno = ENOMEM;
+ goto unlock;
}
+ }
+ }
- op_ret = dict_get_ptr_and_len (dict, GF_XATTR_LOCKINFO_KEY,
- (void **)&lockinfo_buf, &len);
+ if (!dict) {
+ goto unlock;
+ }
- if (!lockinfo_buf) {
- goto unlock;
- }
+ op_ret = dict_get_ptr_and_len(dict, GF_XATTR_LOCKINFO_KEY,
+ (void **)&lockinfo_buf, &len);
- if (!local->dict) {
- local->dict = dict_new ();
- if (!local->dict) {
- local->op_ret = -1;
- local->op_errno = ENOMEM;
- goto unlock;
- }
- }
- }
-unlock:
- UNLOCK (&frame->lock);
-
- if (lockinfo_buf != NULL) {
- lockinfo = dict_new ();
- if (lockinfo == NULL) {
- local->op_ret = -1;
- local->op_errno = ENOMEM;
- } else {
- op_ret = dict_unserialize (lockinfo_buf, len,
- &lockinfo);
-
- if (lockinfo && local->dict) {
- dict_copy (lockinfo, local->dict);
- }
- }
+ if (!lockinfo_buf) {
+ goto unlock;
}
- if (xdata && local->xdata_rsp) {
- dict_copy (xdata, local->xdata_rsp);
+ if (!local->dict) {
+ local->dict = dict_new();
+ if (!local->dict) {
+ local->op_ret = -1;
+ local->op_errno = ENOMEM;
+ goto unlock;
+ }
}
+ }
+unlock:
+ UNLOCK(&frame->lock);
- if (!call_cnt) {
- newdict = dict_new ();
- if (!newdict) {
- local->op_ret = -1;
- local->op_errno = ENOMEM;
- goto unwind;
- }
-
- len = dict_serialized_length (local->dict);
- if (len <= 0) {
- goto unwind;
- }
-
- lockinfo_buf = GF_CALLOC (1, len, gf_common_mt_char);
- if (!lockinfo_buf) {
- local->op_ret = -1;
- local->op_errno = ENOMEM;
- goto unwind;
- }
+ if (lockinfo_buf != NULL) {
+ lockinfo = dict_new();
+ if (lockinfo == NULL) {
+ local->op_ret = -1;
+ local->op_errno = ENOMEM;
+ } else {
+ op_ret = dict_unserialize(lockinfo_buf, len, &lockinfo);
- op_ret = dict_serialize (local->dict, lockinfo_buf);
- if (op_ret < 0) {
- local->op_ret = -1;
- local->op_errno = -op_ret;
- }
+ if (lockinfo && local->dict) {
+ dict_copy(lockinfo, local->dict);
+ }
+ }
+ }
+
+ if (xdata && local->xdata_rsp) {
+ dict_copy(xdata, local->xdata_rsp);
+ }
+
+ if (!call_cnt) {
+ newdict = dict_new();
+ if (!newdict) {
+ local->op_ret = -1;
+ local->op_errno = ENOMEM;
+ goto unwind;
+ }
- op_ret = dict_set_dynptr (newdict, GF_XATTR_LOCKINFO_KEY,
- (void *)lockinfo_buf, len);
- if (op_ret < 0) {
- local->op_ret = -1;
- local->op_errno = -op_ret;
- goto unwind;
- }
+ op_ret = dict_allocate_and_serialize(
+ local->dict, (char **)&lockinfo_buf, (unsigned int *)&len);
+ if (op_ret != 0) {
+ local->op_ret = -1;
+ goto unwind;
+ }
- unwind:
- AFR_STACK_UNWIND (fgetxattr, frame, op_ret,
- op_errno, newdict,
- local->xdata_rsp);
+ op_ret = dict_set_dynptr(newdict, GF_XATTR_LOCKINFO_KEY,
+ (void *)lockinfo_buf, len);
+ if (op_ret < 0) {
+ local->op_ret = -1;
+ local->op_errno = -op_ret;
+ goto unwind;
}
- dict_unref (lockinfo);
+ unwind:
+ AFR_STACK_UNWIND(fgetxattr, frame, op_ret, op_errno, newdict,
+ local->xdata_rsp);
+ }
- return 0;
+ dict_unref(lockinfo);
+
+ return 0;
}
int32_t
-afr_fgetxattr_pathinfo_cbk (call_frame_t *frame, void *cookie,
- xlator_t *this, int32_t op_ret, int32_t op_errno,
- dict_t *dict, dict_t *xdata)
+afr_fgetxattr_pathinfo_cbk(call_frame_t *frame, void *cookie, xlator_t *this,
+ int32_t op_ret, int32_t op_errno, dict_t *dict,
+ dict_t *xdata)
{
- afr_local_t *local = NULL;
- int32_t callcnt = 0;
- int ret = 0;
- char *xattr = NULL;
- char *xattr_serz = NULL;
- char xattr_cky[1024] = {0,};
- dict_t *nxattr = NULL;
- long cky = 0;
- int32_t padding = 0;
- int32_t tlen = 0;
-
- if (!frame || !frame->local || !this) {
- gf_msg ("", GF_LOG_ERROR, 0,
- AFR_MSG_INVALID_ARG, "possible NULL deref");
- goto out;
+ afr_local_t *local = NULL;
+ int32_t callcnt = 0;
+ int ret = 0;
+ char *xattr = NULL;
+ char *xattr_serz = NULL;
+ int keylen = 0;
+ char xattr_cky[1024] = {
+ 0,
+ };
+ int xattr_cky_len = 0;
+ dict_t *nxattr = NULL;
+ long cky = 0;
+ int32_t padding = 0;
+ int32_t tlen = 0;
+
+ if (!frame || !frame->local || !this) {
+ gf_msg("", GF_LOG_ERROR, 0, AFR_MSG_INVALID_ARG, "possible NULL deref");
+ goto out;
+ }
+
+ local = frame->local;
+ cky = (long)cookie;
+ keylen = strlen(local->cont.getxattr.name);
+ xattr_cky_len = snprintf(xattr_cky, sizeof(xattr_cky), "%s-%ld",
+ local->cont.getxattr.name, cky);
+ LOCK(&frame->lock);
+ {
+ callcnt = --local->call_count;
+
+ if (op_ret < 0) {
+ local->op_errno = op_errno;
+ } else {
+ local->op_ret = op_ret;
+ if (!local->xdata_rsp && xdata)
+ local->xdata_rsp = dict_ref(xdata);
}
- local = frame->local;
- cky = (long) cookie;
-
- LOCK (&frame->lock);
- {
- callcnt = --local->call_count;
-
- if (op_ret < 0) {
- local->op_errno = op_errno;
- } else {
- local->op_ret = op_ret;
- if (!local->xdata_rsp && xdata)
- local->xdata_rsp = dict_ref (xdata);
- }
+ if (!dict || (op_ret < 0))
+ goto unlock;
- if (!dict || (op_ret < 0))
- goto unlock;
-
- if (!local->dict)
- local->dict = dict_new ();
-
- if (local->dict) {
- ret = dict_get_str (dict,
- local->cont.getxattr.name,
- &xattr);
- if (ret)
- goto unlock;
-
- xattr = gf_strdup (xattr);
-
- (void)snprintf (xattr_cky, 1024, "%s-%ld",
- local->cont.getxattr.name, cky);
- ret = dict_set_dynstr (local->dict,
- xattr_cky, xattr);
- if (ret) {
- gf_msg (this->name, GF_LOG_ERROR,
- -ret, AFR_MSG_DICT_SET_FAILED,
- "Cannot set xattr cookie key");
- goto unlock;
- }
-
- local->cont.getxattr.xattr_len
- += strlen (xattr) + 1;
- }
+ if (!local->dict) {
+ local->dict = dict_new();
+ if (!local->dict)
+ goto unlock;
+ }
+ ret = dict_get_strn(dict, local->cont.getxattr.name, keylen, &xattr);
+ if (ret)
+ goto unlock;
+
+ xattr = gf_strdup(xattr);
+
+ ret = dict_set_dynstrn(local->dict, xattr_cky, xattr_cky_len, xattr);
+ if (ret) {
+ UNLOCK(&frame->lock);
+ gf_msg(this->name, GF_LOG_ERROR, -ret, AFR_MSG_DICT_SET_FAILED,
+ "Cannot set xattr cookie key");
+ goto post_unlock;
}
-unlock:
- UNLOCK (&frame->lock);
-
- if (!callcnt) {
- if (!local->cont.getxattr.xattr_len)
- goto unwind;
-
- nxattr = dict_new ();
- if (!nxattr)
- goto unwind;
-
- /* extra bytes for decorations (brackets and <>'s) */
- padding += strlen (this->name)
- + strlen (AFR_PATHINFO_HEADER) + 4;
- local->cont.getxattr.xattr_len += (padding + 2);
-
- xattr_serz = GF_CALLOC (local->cont.getxattr.xattr_len,
- sizeof (char), gf_common_mt_char);
-
- if (!xattr_serz)
- goto unwind;
-
- /* the xlator info */
- (void) sprintf (xattr_serz, "(<"AFR_PATHINFO_HEADER"%s> ",
- this->name);
-
- /* actual series of pathinfo */
- ret = dict_serialize_value_with_delim (local->dict,
- xattr_serz
- + strlen (xattr_serz),
- &tlen, ' ');
- if (ret) {
- goto unwind;
- }
- /* closing part */
- *(xattr_serz + padding + tlen) = ')';
- *(xattr_serz + padding + tlen + 1) = '\0';
+ local->cont.getxattr.xattr_len += strlen(xattr) + 1;
+ }
+unlock:
+ UNLOCK(&frame->lock);
+post_unlock:
+ if (!callcnt) {
+ if (!local->cont.getxattr.xattr_len)
+ goto unwind;
+
+ nxattr = dict_new();
+ if (!nxattr)
+ goto unwind;
+
+ /* extra bytes for decorations (brackets and <>'s) */
+ padding += strlen(this->name) + SLEN(AFR_PATHINFO_HEADER) + 4;
+ local->cont.getxattr.xattr_len += (padding + 2);
+
+ xattr_serz = GF_MALLOC(local->cont.getxattr.xattr_len,
+ gf_common_mt_char);
+
+ if (!xattr_serz)
+ goto unwind;
+
+ /* the xlator info */
+ int xattr_serz_len = sprintf(
+ xattr_serz, "(<" AFR_PATHINFO_HEADER "%s> ", this->name);
+
+ /* actual series of pathinfo */
+ ret = dict_serialize_value_with_delim(
+ local->dict, xattr_serz + xattr_serz_len, &tlen, ' ');
+ if (ret) {
+ GF_FREE(xattr_serz);
+ goto unwind;
+ }
- ret = dict_set_dynstr (nxattr, local->cont.getxattr.name,
- xattr_serz);
- if (ret)
- gf_msg (this->name, GF_LOG_ERROR,
- -ret, AFR_MSG_DICT_SET_FAILED,
- "Cannot set pathinfo key in dict");
+ /* closing part */
+ *(xattr_serz + padding + tlen) = ')';
+ *(xattr_serz + padding + tlen + 1) = '\0';
+
+ ret = dict_set_dynstrn(nxattr, local->cont.getxattr.name, keylen,
+ xattr_serz);
+ if (ret) {
+ gf_msg(this->name, GF_LOG_ERROR, -ret, AFR_MSG_DICT_SET_FAILED,
+ "Cannot set pathinfo key in dict");
+ if (ret == -EINVAL)
+ GF_FREE(xattr_serz);
+ }
- unwind:
- AFR_STACK_UNWIND (fgetxattr, frame, local->op_ret,
- local->op_errno, nxattr, local->xdata_rsp);
+ unwind:
+ AFR_STACK_UNWIND(fgetxattr, frame, local->op_ret, local->op_errno,
+ nxattr, local->xdata_rsp);
- if (nxattr)
- dict_unref (nxattr);
- }
+ if (nxattr)
+ dict_unref(nxattr);
+ }
out:
- return ret;
+ return ret;
}
int32_t
-afr_getxattr_pathinfo_cbk (call_frame_t *frame, void *cookie,
- xlator_t *this, int32_t op_ret, int32_t op_errno,
- dict_t *dict, dict_t *xdata)
+afr_getxattr_pathinfo_cbk(call_frame_t *frame, void *cookie, xlator_t *this,
+ int32_t op_ret, int32_t op_errno, dict_t *dict,
+ dict_t *xdata)
{
- afr_local_t *local = NULL;
- int32_t callcnt = 0;
- int ret = 0;
- char *xattr = NULL;
- char *xattr_serz = NULL;
- char xattr_cky[1024] = {0,};
- dict_t *nxattr = NULL;
- long cky = 0;
- int32_t padding = 0;
- int32_t tlen = 0;
-
- if (!frame || !frame->local || !this) {
- gf_msg ("", GF_LOG_ERROR, 0,
- AFR_MSG_INVALID_ARG, "possible NULL deref");
- goto out;
- }
+ afr_local_t *local = NULL;
+ int32_t callcnt = 0;
+ int ret = 0;
+ char *xattr = NULL;
+ char *xattr_serz = NULL;
+ char xattr_cky[1024] = {
+ 0,
+ };
+ int keylen = 0;
+ int xattr_cky_len = 0;
+ dict_t *nxattr = NULL;
+ long cky = 0;
+ int32_t padding = 0;
+ int32_t tlen = 0;
+
+ if (!frame || !frame->local || !this) {
+ gf_msg("", GF_LOG_ERROR, 0, AFR_MSG_INVALID_ARG, "possible NULL deref");
+ goto out;
+ }
+
+ local = frame->local;
+ cky = (long)cookie;
+ keylen = strlen(local->cont.getxattr.name);
+ xattr_cky_len = snprintf(xattr_cky, sizeof(xattr_cky), "%s-%ld",
+ local->cont.getxattr.name, cky);
+ LOCK(&frame->lock);
+ {
+ callcnt = --local->call_count;
- local = frame->local;
- cky = (long) cookie;
-
- LOCK (&frame->lock);
- {
- callcnt = --local->call_count;
-
- if (op_ret < 0) {
- local->op_errno = op_errno;
- } else {
- local->op_ret = op_ret;
- if (!local->xdata_rsp && xdata)
- local->xdata_rsp = dict_ref (xdata);
- }
-
- if (!dict || (op_ret < 0))
- goto unlock;
-
- if (!local->dict)
- local->dict = dict_new ();
-
- if (local->dict) {
- ret = dict_get_str (dict,
- local->cont.getxattr.name,
- &xattr);
- if (ret)
- goto unlock;
-
- xattr = gf_strdup (xattr);
-
- (void)snprintf (xattr_cky, 1024, "%s-%ld",
- local->cont.getxattr.name, cky);
- ret = dict_set_dynstr (local->dict,
- xattr_cky, xattr);
- if (ret) {
- gf_msg (this->name, GF_LOG_ERROR,
- -ret,
- AFR_MSG_DICT_SET_FAILED,
- "Cannot set xattr "
- "cookie key");
- goto unlock;
- }
-
- local->cont.getxattr.xattr_len += strlen (xattr) + 1;
- }
- }
-unlock:
- UNLOCK (&frame->lock);
-
- if (!callcnt) {
- if (!local->cont.getxattr.xattr_len)
- goto unwind;
-
- nxattr = dict_new ();
- if (!nxattr)
- goto unwind;
-
- /* extra bytes for decorations (brackets and <>'s) */
- padding += strlen (this->name) + strlen (AFR_PATHINFO_HEADER) + 4;
- local->cont.getxattr.xattr_len += (padding + 2);
-
- xattr_serz = GF_CALLOC (local->cont.getxattr.xattr_len,
- sizeof (char), gf_common_mt_char);
-
- if (!xattr_serz)
- goto unwind;
+ if (op_ret < 0) {
+ local->op_errno = op_errno;
+ } else {
+ local->op_ret = op_ret;
+ if (!local->xdata_rsp && xdata)
+ local->xdata_rsp = dict_ref(xdata);
+ }
- /* the xlator info */
- (void) sprintf (xattr_serz, "(<"AFR_PATHINFO_HEADER"%s> ",
- this->name);
+ if (!dict || (op_ret < 0))
+ goto unlock;
- /* actual series of pathinfo */
- ret = dict_serialize_value_with_delim (local->dict,
- xattr_serz + strlen (xattr_serz),
- &tlen, ' ');
- if (ret) {
- goto unwind;
- }
+ if (!local->dict) {
+ local->dict = dict_new();
+ if (!local->dict)
+ goto unlock;
+ }
+ ret = dict_get_strn(dict, local->cont.getxattr.name, keylen, &xattr);
+ if (ret)
+ goto unlock;
+
+ xattr = gf_strdup(xattr);
+
+ ret = dict_set_dynstrn(local->dict, xattr_cky, xattr_cky_len, xattr);
+ if (ret) {
+ UNLOCK(&frame->lock);
+ gf_msg(this->name, GF_LOG_ERROR, -ret, AFR_MSG_DICT_SET_FAILED,
+ "Cannot set xattr cookie key");
+ goto post_unlock;
+ }
- /* closing part */
- *(xattr_serz + padding + tlen) = ')';
- *(xattr_serz + padding + tlen + 1) = '\0';
+ local->cont.getxattr.xattr_len += strlen(xattr) + 1;
+ }
+unlock:
+ UNLOCK(&frame->lock);
+post_unlock:
+ if (!callcnt) {
+ if (!local->cont.getxattr.xattr_len)
+ goto unwind;
+
+ nxattr = dict_new();
+ if (!nxattr)
+ goto unwind;
+
+ /* extra bytes for decorations (brackets and <>'s) */
+ padding += strlen(this->name) + SLEN(AFR_PATHINFO_HEADER) + 4;
+ local->cont.getxattr.xattr_len += (padding + 2);
+
+ xattr_serz = GF_MALLOC(local->cont.getxattr.xattr_len,
+ gf_common_mt_char);
+
+ if (!xattr_serz)
+ goto unwind;
+
+ /* the xlator info */
+ int xattr_serz_len = sprintf(
+ xattr_serz, "(<" AFR_PATHINFO_HEADER "%s> ", this->name);
+
+ /* actual series of pathinfo */
+ ret = dict_serialize_value_with_delim(
+ local->dict, xattr_serz + xattr_serz_len, &tlen, ' ');
+ if (ret) {
+ GF_FREE(xattr_serz);
+ goto unwind;
+ }
- ret = dict_set_dynstr (nxattr, local->cont.getxattr.name,
- xattr_serz);
- if (ret)
- gf_msg (this->name, GF_LOG_ERROR,
- -ret, AFR_MSG_DICT_SET_FAILED,
- "Cannot set pathinfo key in dict");
+ /* closing part */
+ *(xattr_serz + padding + tlen) = ')';
+ *(xattr_serz + padding + tlen + 1) = '\0';
+
+ ret = dict_set_dynstrn(nxattr, local->cont.getxattr.name, keylen,
+ xattr_serz);
+ if (ret) {
+ gf_msg(this->name, GF_LOG_ERROR, -ret, AFR_MSG_DICT_SET_FAILED,
+ "Cannot set pathinfo key in dict");
+ if (ret == -EINVAL)
+ GF_FREE(xattr_serz);
+ }
- unwind:
- AFR_STACK_UNWIND (getxattr, frame, local->op_ret,
- local->op_errno, nxattr, local->xdata_rsp);
+ unwind:
+ AFR_STACK_UNWIND(getxattr, frame, local->op_ret, local->op_errno,
+ nxattr, local->xdata_rsp);
- if (nxattr)
- dict_unref (nxattr);
- }
+ if (nxattr)
+ dict_unref(nxattr);
+ }
out:
- return ret;
+ return ret;
}
static int
-afr_aggregate_stime_xattr (dict_t *this, char *key, data_t *value, void *data)
+afr_aggregate_stime_xattr(dict_t *this, char *key, data_t *value, void *data)
{
- int ret = 0;
+ int ret = 0;
- if (fnmatch (GF_XATTR_STIME_PATTERN, key, FNM_NOESCAPE) == 0)
- ret = gf_get_max_stime (THIS, data, key, value);
+ if (fnmatch(GF_XATTR_STIME_PATTERN, key, FNM_NOESCAPE) == 0)
+ ret = gf_get_max_stime(THIS, data, key, value);
- return ret;
+ return ret;
}
int32_t
-afr_common_getxattr_stime_cbk (call_frame_t *frame, void *cookie,
- xlator_t *this, int32_t op_ret, int32_t op_errno,
- dict_t *dict, dict_t *xdata)
+afr_common_getxattr_stime_cbk(call_frame_t *frame, void *cookie, xlator_t *this,
+ int32_t op_ret, int32_t op_errno, dict_t *dict,
+ dict_t *xdata)
{
- afr_local_t *local = NULL;
- int32_t callcnt = 0;
+ afr_local_t *local = NULL;
+ int32_t callcnt = 0;
- if (!frame || !frame->local || !this) {
- gf_msg ("", GF_LOG_ERROR, 0,
- AFR_MSG_INVALID_ARG, "possible NULL deref");
- goto out;
- }
+ if (!frame || !frame->local || !this) {
+ gf_msg("", GF_LOG_ERROR, 0, AFR_MSG_INVALID_ARG, "possible NULL deref");
+ goto out;
+ }
- local = frame->local;
+ local = frame->local;
- LOCK (&frame->lock);
- {
- callcnt = --local->call_count;
+ LOCK(&frame->lock);
+ {
+ callcnt = --local->call_count;
- if (!dict || (op_ret < 0)) {
- local->op_errno = op_errno;
- goto cleanup;
- }
-
- if (!local->dict)
- local->dict = dict_copy_with_ref (dict, NULL);
- else
- dict_foreach (dict, afr_aggregate_stime_xattr,
- local->dict);
- local->op_ret = 0;
+ if (!dict || (op_ret < 0)) {
+ local->op_errno = op_errno;
+ goto cleanup;
}
+ if (!local->dict)
+ local->dict = dict_copy_with_ref(dict, NULL);
+ else
+ dict_foreach(dict, afr_aggregate_stime_xattr, local->dict);
+ local->op_ret = 0;
+ }
+
cleanup:
- UNLOCK (&frame->lock);
+ UNLOCK(&frame->lock);
- if (!callcnt) {
- AFR_STACK_UNWIND (getxattr, frame, local->op_ret,
- local->op_errno, local->dict, xdata);
- }
+ if (!callcnt) {
+ AFR_STACK_UNWIND(getxattr, frame, local->op_ret, local->op_errno,
+ local->dict, xdata);
+ }
out:
- return 0;
+ return 0;
}
-
static gf_boolean_t
-afr_is_special_xattr (const char *name, fop_getxattr_cbk_t *cbk,
- gf_boolean_t is_fgetxattr)
+afr_is_special_xattr(const char *name, fop_getxattr_cbk_t *cbk,
+ gf_boolean_t is_fgetxattr)
{
- gf_boolean_t is_spl = _gf_true;
-
- GF_ASSERT (cbk);
- if (!cbk || !name) {
- is_spl = _gf_false;
- goto out;
+ gf_boolean_t is_spl = _gf_true;
+
+ GF_ASSERT(cbk);
+ if (!cbk || !name) {
+ is_spl = _gf_false;
+ goto out;
+ }
+
+ if (!strcmp(name, GF_XATTR_PATHINFO_KEY) ||
+ !strcmp(name, GF_XATTR_USER_PATHINFO_KEY)) {
+ if (is_fgetxattr) {
+ *cbk = afr_fgetxattr_pathinfo_cbk;
+ } else {
+ *cbk = afr_getxattr_pathinfo_cbk;
}
-
- if (!strcmp (name, GF_XATTR_PATHINFO_KEY) ||
- !strcmp (name, GF_XATTR_USER_PATHINFO_KEY)) {
- if (is_fgetxattr) {
- *cbk = afr_fgetxattr_pathinfo_cbk;
- } else {
- *cbk = afr_getxattr_pathinfo_cbk;
- }
- } else if (!strncmp (name, GF_XATTR_CLRLK_CMD,
- strlen (GF_XATTR_CLRLK_CMD))) {
- if (is_fgetxattr) {
- *cbk = afr_fgetxattr_clrlk_cbk;
- } else {
- *cbk = afr_getxattr_clrlk_cbk;
- }
- } else if (!strncmp (name, GF_XATTR_LOCKINFO_KEY,
- strlen (GF_XATTR_LOCKINFO_KEY))) {
- if (is_fgetxattr) {
- *cbk = afr_fgetxattr_lockinfo_cbk;
- } else {
- *cbk = afr_getxattr_lockinfo_cbk;
- }
- } else if (fnmatch (GF_XATTR_STIME_PATTERN, name, FNM_NOESCAPE) == 0) {
- *cbk = afr_common_getxattr_stime_cbk;
- } else if (strcmp (name, QUOTA_SIZE_KEY) == 0) {
- *cbk = afr_getxattr_quota_size_cbk;
+ } else if (!strncmp(name, GF_XATTR_CLRLK_CMD, SLEN(GF_XATTR_CLRLK_CMD))) {
+ if (is_fgetxattr) {
+ *cbk = afr_fgetxattr_clrlk_cbk;
} else {
- is_spl = _gf_false;
+ *cbk = afr_getxattr_clrlk_cbk;
}
+ } else if (!strncmp(name, GF_XATTR_LOCKINFO_KEY,
+ SLEN(GF_XATTR_LOCKINFO_KEY))) {
+ if (is_fgetxattr) {
+ *cbk = afr_fgetxattr_lockinfo_cbk;
+ } else {
+ *cbk = afr_getxattr_lockinfo_cbk;
+ }
+ } else if (fnmatch(GF_XATTR_STIME_PATTERN, name, FNM_NOESCAPE) == 0) {
+ *cbk = afr_common_getxattr_stime_cbk;
+ } else if (strcmp(name, QUOTA_SIZE_KEY) == 0) {
+ *cbk = afr_getxattr_quota_size_cbk;
+ } else if (!strcmp(name, GF_XATTR_LIST_NODE_UUIDS_KEY)) {
+ *cbk = afr_getxattr_list_node_uuids_cbk;
+ } else {
+ is_spl = _gf_false;
+ }
out:
- return is_spl;
+ return is_spl;
}
static void
-afr_getxattr_all_subvols (xlator_t *this, call_frame_t *frame,
- const char *name, loc_t *loc,
- fop_getxattr_cbk_t cbk)
+afr_getxattr_all_subvols(xlator_t *this, call_frame_t *frame, const char *name,
+ loc_t *loc, fop_getxattr_cbk_t cbk)
{
- afr_private_t *priv = NULL;
- afr_local_t *local = NULL;
- int i = 0;
- int call_count = 0;
-
- priv = this->private;
-
- local = frame->local;
- //local->call_count set in afr_local_init
- call_count = local->call_count;
-
- //If up-children count is 0, afr_local_init would have failed already
- //and the call would have unwound so not handling it here.
-
- for (i = 0; i < priv->child_count; i++) {
- if (local->child_up[i]) {
- STACK_WIND_COOKIE (frame, cbk,
- (void *) (long) i, priv->children[i],
- priv->children[i]->fops->getxattr,
- loc, name, NULL);
- if (!--call_count)
- break;
- }
+ afr_private_t *priv = NULL;
+ afr_local_t *local = NULL;
+ int i = 0;
+ int call_count = 0;
+
+ priv = this->private;
+
+ local = frame->local;
+ // local->call_count set in afr_local_init
+ call_count = local->call_count;
+
+ if (!strcmp(name, GF_XATTR_LIST_NODE_UUIDS_KEY)) {
+ GF_FREE(local->cont.getxattr.name);
+ local->cont.getxattr.name = gf_strdup(GF_XATTR_NODE_UUID_KEY);
+ }
+
+ // If up-children count is 0, afr_local_init would have failed already
+ // and the call would have unwound so not handling it here.
+ for (i = 0; i < priv->child_count; i++) {
+ if (local->child_up[i]) {
+ STACK_WIND_COOKIE(frame, cbk, (void *)(long)i, priv->children[i],
+ priv->children[i]->fops->getxattr, loc,
+ local->cont.getxattr.name, NULL);
+ if (!--call_count)
+ break;
}
- return;
+ }
+ return;
}
int
-afr_marker_populate_args (call_frame_t *frame, int type, int *gauge,
- xlator_t **subvols)
+afr_marker_populate_args(call_frame_t *frame, int type, int *gauge,
+ xlator_t **subvols)
{
- xlator_t *this = frame->this;
- afr_private_t *priv = this->private;
+ xlator_t *this = frame->this;
+ afr_private_t *priv = this->private;
- memcpy (subvols, priv->children, sizeof (*subvols) * priv->child_count);
+ memcpy(subvols, priv->children, sizeof(*subvols) * priv->child_count);
- if (type == MARKER_XTIME_TYPE) {
- /*Don't error out on ENOENT/ENOTCONN */
- gauge[MCNT_NOTFOUND] = 0;
- gauge[MCNT_ENOTCONN] = 0;
- }
- return priv->child_count;
+ if (type == MARKER_XTIME_TYPE) {
+ /*Don't error out on ENOENT/ENOTCONN */
+ gauge[MCNT_NOTFOUND] = 0;
+ gauge[MCNT_ENOTCONN] = 0;
+ }
+ return priv->child_count;
}
static int
-afr_handle_heal_xattrs (call_frame_t *frame, xlator_t *this, loc_t *loc,
- const char *heal_op)
+afr_handle_heal_xattrs(call_frame_t *frame, xlator_t *this, loc_t *loc,
+ const char *heal_op)
{
- int ret = -1;
- afr_spb_status_t *data = NULL;
+ int ret = -1;
+ afr_spb_status_t *data = NULL;
- if (!strcmp (heal_op, GF_HEAL_INFO)) {
- afr_get_heal_info (frame, this, loc);
- ret = 0;
- goto out;
- }
+ if (!strcmp(heal_op, GF_HEAL_INFO)) {
+ afr_get_heal_info(frame, this, loc);
+ ret = 0;
+ goto out;
+ }
- if (!strcmp (heal_op, GF_AFR_HEAL_SBRAIN)) {
- afr_heal_splitbrain_file (frame, this, loc);
- ret = 0;
- goto out;
+ if (!strcmp(heal_op, GF_AFR_HEAL_SBRAIN)) {
+ afr_heal_splitbrain_file(frame, this, loc);
+ ret = 0;
+ goto out;
+ }
+
+ if (!strcmp(heal_op, GF_AFR_SBRAIN_STATUS)) {
+ data = GF_CALLOC(1, sizeof(*data), gf_afr_mt_spb_status_t);
+ if (!data) {
+ ret = 1;
+ goto out;
}
-
- if (!strcmp (heal_op, GF_AFR_SBRAIN_STATUS)) {
- data = GF_CALLOC (1, sizeof (*data), gf_afr_mt_spb_status_t);
- if (!data) {
- ret = 1;
- goto out;
- }
- data->frame = frame;
- data->loc = loc;
- ret = synctask_new (this->ctx->env,
- afr_get_split_brain_status,
- afr_get_split_brain_status_cbk,
- NULL, data);
- if (ret) {
- gf_msg (this->name, GF_LOG_ERROR, 0,
- AFR_MSG_SPLIT_BRAIN_STATUS,
- "Failed to create"
- " synctask. Unable to fetch split-brain status"
- " for %s.", loc->name);
- ret = 1;
- goto out;
- }
- goto out;
+ data->frame = frame;
+ data->loc = loc;
+ ret = synctask_new(this->ctx->env, afr_get_split_brain_status,
+ afr_get_split_brain_status_cbk, NULL, data);
+ if (ret) {
+ gf_msg(this->name, GF_LOG_ERROR, 0, AFR_MSG_SPLIT_BRAIN_STATUS,
+ "Failed to create"
+ " synctask. Unable to fetch split-brain status"
+ " for %s.",
+ loc->name);
+ ret = 1;
+ goto out;
}
+ goto out;
+ }
out:
- if (ret == 1) {
- AFR_STACK_UNWIND (getxattr, frame, -1, ENOMEM, NULL, NULL);
- if (data)
- GF_FREE (data);
- ret = 0;
- }
- return ret;
+ if (ret == 1) {
+ AFR_STACK_UNWIND(getxattr, frame, -1, ENOMEM, NULL, NULL);
+ if (data)
+ GF_FREE(data);
+ ret = 0;
+ }
+ return ret;
}
int32_t
-afr_getxattr (call_frame_t *frame, xlator_t *this,
- loc_t *loc, const char *name, dict_t *xdata)
+afr_getxattr(call_frame_t *frame, xlator_t *this, loc_t *loc, const char *name,
+ dict_t *xdata)
{
- afr_private_t *priv = NULL;
- xlator_t **children = NULL;
- afr_local_t *local = NULL;
- int i = 0;
- int32_t op_errno = 0;
- int ret = -1;
- fop_getxattr_cbk_t cbk = NULL;
+ afr_private_t *priv = NULL;
+ afr_local_t *local = NULL;
+ xlator_t **children = NULL;
+ int i = 0;
+ int32_t op_errno = 0;
+ int ret = -1;
+ fop_getxattr_cbk_t cbk = NULL;
+ local = AFR_FRAME_INIT(frame, op_errno);
+ if (!local)
+ goto out;
- local = AFR_FRAME_INIT (frame, op_errno);
- if (!local)
- goto out;
+ priv = this->private;
- priv = this->private;
+ children = priv->children;
- children = priv->children;
+ loc_copy(&local->loc, loc);
- loc_copy (&local->loc, loc);
+ local->op = GF_FOP_GETXATTR;
- local->op = GF_FOP_GETXATTR;
+ if (xdata)
+ local->xdata_req = dict_ref(xdata);
- if (xdata)
- local->xdata_req = dict_ref (xdata);
+ if (!name)
+ goto no_name;
- if (!name)
- goto no_name;
+ local->cont.getxattr.name = gf_strdup(name);
- local->cont.getxattr.name = gf_strdup (name);
+ if (!local->cont.getxattr.name) {
+ op_errno = ENOMEM;
+ goto out;
+ }
- if (!local->cont.getxattr.name) {
- op_errno = ENOMEM;
- goto out;
- }
+ if (!strncmp(name, AFR_XATTR_PREFIX, SLEN(AFR_XATTR_PREFIX))) {
+ op_errno = ENODATA;
+ goto out;
+ }
- if (!strncmp (name, AFR_XATTR_PREFIX,
- strlen (AFR_XATTR_PREFIX))) {
- op_errno = ENODATA;
- goto out;
- }
-
- if (cluster_handle_marker_getxattr (frame, loc, name, priv->vol_uuid,
- afr_getxattr_unwind,
- afr_marker_populate_args) == 0)
- return 0;
+ if (cluster_handle_marker_getxattr(frame, loc, name, priv->vol_uuid,
+ afr_getxattr_unwind,
+ afr_marker_populate_args) == 0)
+ return 0;
- ret = afr_handle_heal_xattrs (frame, this, &local->loc, name);
- if (ret == 0)
- return 0;
+ ret = afr_handle_heal_xattrs(frame, this, &local->loc, name);
+ if (ret == 0)
+ return 0;
- /*
- * Special xattrs which need responses from all subvols
- */
- if (afr_is_special_xattr (name, &cbk, 0)) {
- afr_getxattr_all_subvols (this, frame, name, loc, cbk);
- return 0;
- }
+ /*
+ * Heal daemons don't have IO threads ... and as a result they
+ * send this getxattr down and eventually crash :(
+ */
+ op_errno = -1;
+ GF_CHECK_XATTR_KEY_AND_GOTO(name, IO_THREADS_QUEUE_SIZE_KEY, op_errno, out);
+
+ /*
+ * Special xattrs which need responses from all subvols
+ */
+ if (afr_is_special_xattr(name, &cbk, 0)) {
+ afr_getxattr_all_subvols(this, frame, name, loc, cbk);
+ return 0;
+ }
- if (XATTR_IS_NODE_UUID (name)) {
- i = 0;
- STACK_WIND_COOKIE (frame, afr_getxattr_node_uuid_cbk,
- (void *) (long) i,
- children[i],
- children[i]->fops->getxattr,
- loc, name, xdata);
- return 0;
- }
+ if (XATTR_IS_NODE_UUID(name)) {
+ i = 0;
+ STACK_WIND_COOKIE(frame, afr_getxattr_node_uuid_cbk, (void *)(long)i,
+ children[i], children[i]->fops->getxattr, loc, name,
+ xdata);
+ return 0;
+ }
no_name:
- afr_read_txn (frame, this, local->loc.inode, afr_getxattr_wind,
- AFR_METADATA_TRANSACTION);
+ afr_read_txn(frame, this, local->loc.inode, afr_getxattr_wind,
+ AFR_METADATA_TRANSACTION);
- ret = 0;
+ ret = 0;
out:
- if (ret < 0)
- AFR_STACK_UNWIND (getxattr, frame, -1, op_errno, NULL, NULL);
- return 0;
+ if (ret < 0)
+ AFR_STACK_UNWIND(getxattr, frame, -1, op_errno, NULL, NULL);
+ return 0;
}
/* {{{ fgetxattr */
-
int32_t
-afr_fgetxattr_cbk (call_frame_t *frame, void *cookie,
- xlator_t *this, int32_t op_ret, int32_t op_errno,
- dict_t *dict, dict_t *xdata)
+afr_fgetxattr_cbk(call_frame_t *frame, void *cookie, xlator_t *this,
+ int32_t op_ret, int32_t op_errno, dict_t *dict, dict_t *xdata)
{
- afr_local_t *local = NULL;
+ afr_local_t *local = NULL;
- local = frame->local;
+ local = frame->local;
- if (op_ret < 0) {
- local->op_ret = -1;
- local->op_errno = op_errno;
+ if (op_ret < 0) {
+ local->op_ret = -1;
+ local->op_errno = op_errno;
- afr_read_txn_continue (frame, this, (long) cookie);
- return 0;
- }
+ afr_read_txn_continue(frame, this, (long)cookie);
+ return 0;
+ }
- if (dict)
- afr_filter_xattrs (dict);
+ if (dict)
+ afr_filter_xattrs(dict);
- AFR_STACK_UNWIND (fgetxattr, frame, op_ret, op_errno, dict, xdata);
+ AFR_STACK_UNWIND(fgetxattr, frame, op_ret, op_errno, dict, xdata);
- return 0;
+ return 0;
}
int
-afr_fgetxattr_wind (call_frame_t *frame, xlator_t *this, int subvol)
+afr_fgetxattr_wind(call_frame_t *frame, xlator_t *this, int subvol)
{
- afr_local_t *local = NULL;
- afr_private_t *priv = NULL;
-
- local = frame->local;
- priv = this->private;
-
- if (subvol == -1) {
- AFR_STACK_UNWIND (fgetxattr, frame, local->op_ret,
- local->op_errno, NULL, NULL);
- return 0;
- }
-
- STACK_WIND_COOKIE (frame, afr_fgetxattr_cbk, (void *) (long) subvol,
- priv->children[subvol],
- priv->children[subvol]->fops->fgetxattr,
- local->fd, local->cont.getxattr.name,
- local->xdata_req);
- return 0;
-}
+ afr_local_t *local = NULL;
+ afr_private_t *priv = NULL;
+ local = frame->local;
+ priv = this->private;
-static void
-afr_fgetxattr_all_subvols (xlator_t *this, call_frame_t *frame,
- fop_fgetxattr_cbk_t cbk)
-{
- afr_private_t *priv = NULL;
- afr_local_t *local = NULL;
- int i = 0;
- int call_count = 0;
+ if (subvol == -1) {
+ AFR_STACK_UNWIND(fgetxattr, frame, local->op_ret, local->op_errno, NULL,
+ NULL);
+ return 0;
+ }
- priv = this->private;
+ STACK_WIND_COOKIE(frame, afr_fgetxattr_cbk, (void *)(long)subvol,
+ priv->children[subvol],
+ priv->children[subvol]->fops->fgetxattr, local->fd,
+ local->cont.getxattr.name, local->xdata_req);
+ return 0;
+}
- local = frame->local;
- //local->call_count set in afr_local_init
- call_count = local->call_count;
-
- //If up-children count is 0, afr_local_init would have failed already
- //and the call would have unwound so not handling it here.
-
- for (i = 0; i < priv->child_count; i++) {
- if (local->child_up[i]) {
- STACK_WIND_COOKIE (frame, cbk,
- (void *) (long) i,
- priv->children[i],
- priv->children[i]->fops->fgetxattr,
- local->fd, local->cont.getxattr.name,
- NULL);
- if (!--call_count)
- break;
- }
+static void
+afr_fgetxattr_all_subvols(xlator_t *this, call_frame_t *frame,
+ fop_fgetxattr_cbk_t cbk)
+{
+ afr_private_t *priv = NULL;
+ afr_local_t *local = NULL;
+ int i = 0;
+ int call_count = 0;
+
+ priv = this->private;
+
+ local = frame->local;
+ // local->call_count set in afr_local_init
+ call_count = local->call_count;
+
+ // If up-children count is 0, afr_local_init would have failed already
+ // and the call would have unwound so not handling it here.
+
+ for (i = 0; i < priv->child_count; i++) {
+ if (local->child_up[i]) {
+ STACK_WIND_COOKIE(frame, cbk, (void *)(long)i, priv->children[i],
+ priv->children[i]->fops->fgetxattr, local->fd,
+ local->cont.getxattr.name, NULL);
+ if (!--call_count)
+ break;
}
+ }
- return;
+ return;
}
-
int
-afr_fgetxattr (call_frame_t *frame, xlator_t *this,
- fd_t *fd, const char *name, dict_t *xdata)
+afr_fgetxattr(call_frame_t *frame, xlator_t *this, fd_t *fd, const char *name,
+ dict_t *xdata)
{
- afr_local_t *local = NULL;
- int32_t op_errno = 0;
- fop_fgetxattr_cbk_t cbk = NULL;
-
- local = AFR_FRAME_INIT (frame, op_errno);
- if (!local)
- goto out;
-
- local->op = GF_FOP_FGETXATTR;
- local->fd = fd_ref (fd);
- if (name) {
- local->cont.getxattr.name = gf_strdup (name);
- if (!local->cont.getxattr.name) {
- op_errno = ENOMEM;
- goto out;
- }
- }
- if (xdata)
- local->xdata_req = dict_ref (xdata);
-
- /* pathinfo gets handled only in getxattr(), but we need to handle
- * lockinfo.
- * If we are doing fgetxattr with lockinfo as the key then we
- * collect information from all children.
- */
- if (afr_is_special_xattr (name, &cbk, 1)) {
- afr_fgetxattr_all_subvols (this, frame, cbk);
- return 0;
+ afr_local_t *local = NULL;
+ int32_t op_errno = 0;
+ fop_fgetxattr_cbk_t cbk = NULL;
+
+ AFR_ERROR_OUT_IF_FDCTX_INVALID(fd, this, op_errno, out);
+ local = AFR_FRAME_INIT(frame, op_errno);
+ if (!local)
+ goto out;
+
+ local->op = GF_FOP_FGETXATTR;
+ local->fd = fd_ref(fd);
+ if (name) {
+ local->cont.getxattr.name = gf_strdup(name);
+ if (!local->cont.getxattr.name) {
+ op_errno = ENOMEM;
+ goto out;
}
+ }
+ if (xdata)
+ local->xdata_req = dict_ref(xdata);
+
+ /* pathinfo gets handled only in getxattr(), but we need to handle
+ * lockinfo.
+ * If we are doing fgetxattr with lockinfo as the key then we
+ * collect information from all children.
+ */
+ if (afr_is_special_xattr(name, &cbk, 1)) {
+ afr_fgetxattr_all_subvols(this, frame, cbk);
+ return 0;
+ }
- afr_fix_open (fd, this);
+ afr_fix_open(fd, this);
- afr_read_txn (frame, this, fd->inode, afr_fgetxattr_wind,
- AFR_METADATA_TRANSACTION);
+ afr_read_txn(frame, this, fd->inode, afr_fgetxattr_wind,
+ AFR_METADATA_TRANSACTION);
- return 0;
+ return 0;
out:
- AFR_STACK_UNWIND (fgetxattr, frame, -1, op_errno, NULL, NULL);
+ AFR_STACK_UNWIND(fgetxattr, frame, -1, op_errno, NULL, NULL);
- return 0;
+ return 0;
}
-
/* }}} */
/* {{{ readv */
int
-afr_readv_cbk (call_frame_t *frame, void *cookie,
- xlator_t *this, int32_t op_ret, int32_t op_errno,
- struct iovec *vector, int32_t count, struct iatt *buf,
- struct iobref *iobref, dict_t *xdata)
+afr_readv_cbk(call_frame_t *frame, void *cookie, xlator_t *this, int32_t op_ret,
+ int32_t op_errno, struct iovec *vector, int32_t count,
+ struct iatt *buf, struct iobref *iobref, dict_t *xdata)
{
- afr_local_t *local = NULL;
+ afr_local_t *local = NULL;
- local = frame->local;
+ local = frame->local;
- if (op_ret < 0) {
- local->op_ret = -1;
- local->op_errno = op_errno;
+ if (op_ret < 0) {
+ local->op_ret = -1;
+ local->op_errno = op_errno;
- afr_read_txn_continue (frame, this, (long) cookie);
- return 0;
- }
+ afr_read_txn_continue(frame, this, (long)cookie);
+ return 0;
+ }
- AFR_STACK_UNWIND (readv, frame, op_ret, op_errno,
- vector, count, buf, iobref, xdata);
- return 0;
+ AFR_STACK_UNWIND(readv, frame, op_ret, op_errno, vector, count, buf, iobref,
+ xdata);
+ return 0;
}
-
int
-afr_readv_wind (call_frame_t *frame, xlator_t *this, int subvol)
+afr_readv_wind(call_frame_t *frame, xlator_t *this, int subvol)
{
- afr_local_t *local = NULL;
- afr_private_t *priv = NULL;
-
- local = frame->local;
- priv = this->private;
-
- if (subvol == -1) {
- AFR_STACK_UNWIND (readv, frame, local->op_ret, local->op_errno,
- 0, 0, 0, 0, 0);
- return 0;
- }
-
- STACK_WIND_COOKIE (frame, afr_readv_cbk, (void *) (long) subvol,
- priv->children[subvol],
- priv->children[subvol]->fops->readv,
- local->fd, local->cont.readv.size,
- local->cont.readv.offset, local->cont.readv.flags,
- local->xdata_req);
- return 0;
-}
+ afr_local_t *local = NULL;
+ afr_private_t *priv = NULL;
+
+ local = frame->local;
+ priv = this->private;
+
+ if (subvol == -1) {
+ AFR_STACK_UNWIND(readv, frame, local->op_ret, local->op_errno, 0, 0, 0,
+ 0, 0);
+ return 0;
+ }
+ STACK_WIND_COOKIE(
+ frame, afr_readv_cbk, (void *)(long)subvol, priv->children[subvol],
+ priv->children[subvol]->fops->readv, local->fd, local->cont.readv.size,
+ local->cont.readv.offset, local->cont.readv.flags, local->xdata_req);
+ return 0;
+}
int
-afr_readv (call_frame_t *frame, xlator_t *this, fd_t *fd, size_t size,
- off_t offset, uint32_t flags, dict_t *xdata)
+afr_readv(call_frame_t *frame, xlator_t *this, fd_t *fd, size_t size,
+ off_t offset, uint32_t flags, dict_t *xdata)
{
- afr_local_t * local = NULL;
- int32_t op_errno = 0;
+ afr_local_t *local = NULL;
+ int32_t op_errno = 0;
- local = AFR_FRAME_INIT (frame, op_errno);
- if (!local)
- goto out;
+ AFR_ERROR_OUT_IF_FDCTX_INVALID(fd, this, op_errno, out);
+ local = AFR_FRAME_INIT(frame, op_errno);
+ if (!local)
+ goto out;
- local->op = GF_FOP_READ;
- local->fd = fd_ref (fd);
- local->cont.readv.size = size;
- local->cont.readv.offset = offset;
- local->cont.readv.flags = flags;
- if (xdata)
- local->xdata_req = dict_ref (xdata);
+ local->op = GF_FOP_READ;
+ local->fd = fd_ref(fd);
+ local->cont.readv.size = size;
+ local->cont.readv.offset = offset;
+ local->cont.readv.flags = flags;
+ if (xdata)
+ local->xdata_req = dict_ref(xdata);
- afr_fix_open (fd, this);
+ afr_fix_open(fd, this);
- afr_read_txn (frame, this, fd->inode, afr_readv_wind,
- AFR_DATA_TRANSACTION);
+ afr_read_txn(frame, this, fd->inode, afr_readv_wind, AFR_DATA_TRANSACTION);
- return 0;
+ return 0;
out:
- AFR_STACK_UNWIND(readv, frame, -1, op_errno, 0, 0, 0, 0, 0);
+ AFR_STACK_UNWIND(readv, frame, -1, op_errno, 0, 0, 0, 0, 0);
- return 0;
+ return 0;
}
/* }}} */
@@ -1783,77 +1822,73 @@ out:
/* {{{ seek */
int
-afr_seek_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
- int32_t op_ret, int32_t op_errno, off_t offset, dict_t *xdata)
+afr_seek_cbk(call_frame_t *frame, void *cookie, xlator_t *this, int32_t op_ret,
+ int32_t op_errno, off_t offset, dict_t *xdata)
{
- afr_local_t *local = NULL;
+ afr_local_t *local = NULL;
- local = frame->local;
+ local = frame->local;
- if (op_ret < 0) {
- local->op_ret = -1;
- local->op_errno = op_errno;
-
- afr_read_txn_continue (frame, this, (long) cookie);
- return 0;
- }
+ if (op_ret < 0) {
+ local->op_ret = -1;
+ local->op_errno = op_errno;
- AFR_STACK_UNWIND (seek, frame, op_ret, op_errno, offset, xdata);
+ afr_read_txn_continue(frame, this, (long)cookie);
return 0;
-}
+ }
+ AFR_STACK_UNWIND(seek, frame, op_ret, op_errno, offset, xdata);
+ return 0;
+}
int
-afr_seek_wind (call_frame_t *frame, xlator_t *this, int subvol)
+afr_seek_wind(call_frame_t *frame, xlator_t *this, int subvol)
{
- afr_local_t *local = NULL;
- afr_private_t *priv = NULL;
-
- local = frame->local;
- priv = this->private;
+ afr_local_t *local = NULL;
+ afr_private_t *priv = NULL;
- if (subvol == -1) {
- AFR_STACK_UNWIND (seek, frame, local->op_ret, local->op_errno,
- 0, NULL);
- return 0;
- }
+ local = frame->local;
+ priv = this->private;
- STACK_WIND_COOKIE (frame, afr_seek_cbk, (void *) (long) subvol,
- priv->children[subvol],
- priv->children[subvol]->fops->seek,
- local->fd, local->cont.seek.offset,
- local->cont.seek.what, local->xdata_req);
+ if (subvol == -1) {
+ AFR_STACK_UNWIND(seek, frame, local->op_ret, local->op_errno, 0, NULL);
return 0;
-}
+ }
+ STACK_WIND_COOKIE(
+ frame, afr_seek_cbk, (void *)(long)subvol, priv->children[subvol],
+ priv->children[subvol]->fops->seek, local->fd, local->cont.seek.offset,
+ local->cont.seek.what, local->xdata_req);
+ return 0;
+}
int
-afr_seek (call_frame_t *frame, xlator_t *this, fd_t *fd, off_t offset,
- gf_seek_what_t what, dict_t *xdata)
+afr_seek(call_frame_t *frame, xlator_t *this, fd_t *fd, off_t offset,
+ gf_seek_what_t what, dict_t *xdata)
{
- afr_local_t *local = NULL;
- int32_t op_errno = 0;
+ afr_local_t *local = NULL;
+ int32_t op_errno = 0;
- local = AFR_FRAME_INIT (frame, op_errno);
- if (!local)
- goto out;
+ AFR_ERROR_OUT_IF_FDCTX_INVALID(fd, this, op_errno, out);
+ local = AFR_FRAME_INIT(frame, op_errno);
+ if (!local)
+ goto out;
- local->op = GF_FOP_SEEK;
- local->fd = fd_ref (fd);
- local->cont.seek.offset = offset;
- local->cont.seek.what = what;
- if (xdata)
- local->xdata_req = dict_ref (xdata);
+ local->op = GF_FOP_SEEK;
+ local->fd = fd_ref(fd);
+ local->cont.seek.offset = offset;
+ local->cont.seek.what = what;
+ if (xdata)
+ local->xdata_req = dict_ref(xdata);
- afr_fix_open (fd, this);
+ afr_fix_open(fd, this);
- afr_read_txn (frame, this, fd->inode, afr_seek_wind,
- AFR_DATA_TRANSACTION);
+ afr_read_txn(frame, this, fd->inode, afr_seek_wind, AFR_DATA_TRANSACTION);
- return 0;
+ return 0;
out:
- AFR_STACK_UNWIND (seek, frame, -1, op_errno, 0, NULL);
+ AFR_STACK_UNWIND(seek, frame, -1, op_errno, 0, NULL);
- return 0;
+ return 0;
}
/* }}} */
diff --git a/xlators/cluster/afr/src/afr-inode-read.h b/xlators/cluster/afr/src/afr-inode-read.h
index d128134ef2a..8c982bc7e6f 100644
--- a/xlators/cluster/afr/src/afr-inode-read.h
+++ b/xlators/cluster/afr/src/afr-inode-read.h
@@ -12,34 +12,34 @@
#define __INODE_READ_H__
int32_t
-afr_access (call_frame_t *frame, xlator_t *this,
- loc_t *loc, int32_t mask, dict_t *xdata);
+afr_access(call_frame_t *frame, xlator_t *this, loc_t *loc, int32_t mask,
+ dict_t *xdata);
int32_t
-afr_stat (call_frame_t *frame, xlator_t *this,
- loc_t *loc, dict_t *xdata);
+afr_stat(call_frame_t *frame, xlator_t *this, loc_t *loc, dict_t *xdata);
int32_t
-afr_fstat (call_frame_t *frame, xlator_t *this,
- fd_t *fd, dict_t *xdata);
+afr_fstat(call_frame_t *frame, xlator_t *this, fd_t *fd, dict_t *xdata);
int32_t
-afr_readlink (call_frame_t *frame, xlator_t *this,
- loc_t *loc, size_t size, dict_t *xdata);
+afr_readlink(call_frame_t *frame, xlator_t *this, loc_t *loc, size_t size,
+ dict_t *xdata);
int32_t
-afr_readv (call_frame_t *frame, xlator_t *this,
- fd_t *fd, size_t size, off_t offset, uint32_t flags, dict_t *xdata);
+afr_readv(call_frame_t *frame, xlator_t *this, fd_t *fd, size_t size,
+ off_t offset, uint32_t flags, dict_t *xdata);
int32_t
-afr_getxattr (call_frame_t *frame, xlator_t *this,
- loc_t *loc, const char *name, dict_t *xdata);
+afr_getxattr(call_frame_t *frame, xlator_t *this, loc_t *loc, const char *name,
+ dict_t *xdata);
int32_t
-afr_fgetxattr (call_frame_t *frame, xlator_t *this,
- fd_t *fd, const char *name, dict_t *xdata);
-
+afr_fgetxattr(call_frame_t *frame, xlator_t *this, fd_t *fd, const char *name,
+ dict_t *xdata);
int
-afr_handle_quota_size (call_frame_t *frame, xlator_t *this);
+afr_seek(call_frame_t *frame, xlator_t *this, fd_t *fd, off_t offset,
+ gf_seek_what_t what, dict_t *xdata);
+int
+afr_handle_quota_size(call_frame_t *frame, xlator_t *this);
#endif /* __INODE_READ_H__ */
diff --git a/xlators/cluster/afr/src/afr-inode-write.c b/xlators/cluster/afr/src/afr-inode-write.c
index 6b9d5ee87b0..1d6e4f3570a 100644
--- a/xlators/cluster/afr/src/afr-inode-write.c
+++ b/xlators/cluster/afr/src/afr-inode-write.c
@@ -8,741 +8,779 @@
cases as published by the Free Software Foundation.
*/
-
-#include <libgen.h>
#include <unistd.h>
-#include <fnmatch.h>
#include <sys/time.h>
#include <stdlib.h>
#include <signal.h>
-#include "glusterfs.h"
+#include <glusterfs/glusterfs.h>
#include "afr.h"
-#include "dict.h"
-#include "xlator.h"
-#include "hashfn.h"
-#include "logging.h"
-#include "stack.h"
-#include "list.h"
-#include "call-stub.h"
-#include "defaults.h"
-#include "common-utils.h"
-#include "compat-errno.h"
-#include "compat.h"
+#include <glusterfs/dict.h>
+#include <glusterfs/logging.h>
+#include <glusterfs/defaults.h>
+#include <glusterfs/common-utils.h>
+#include <glusterfs/compat-errno.h>
+#include <glusterfs/compat.h>
#include "protocol-common.h"
-#include "byte-order.h"
+#include <glusterfs/byte-order.h>
#include "afr-transaction.h"
#include "afr-self-heal.h"
#include "afr-messages.h"
static void
-__afr_inode_write_finalize (call_frame_t *frame, xlator_t *this)
-{
- afr_local_t *local = NULL;
- afr_private_t *priv = NULL;
- int read_subvol = 0;
- int i = 0;
-
- local = frame->local;
- priv = this->private;
-
- if (local->inode) {
- if (local->transaction.type == AFR_METADATA_TRANSACTION)
- read_subvol = afr_metadata_subvol_get (local->inode, this,
- NULL, NULL,
- NULL);
- else
- read_subvol = afr_data_subvol_get (local->inode, this,
- NULL, NULL, NULL);
- }
-
- local->op_ret = -1;
- local->op_errno = afr_final_errno (local, priv);
-
- for (i = 0; i < priv->child_count; i++) {
- if (!local->replies[i].valid)
- continue;
- if (local->replies[i].op_ret < 0) {
- afr_inode_read_subvol_reset (local->inode, this);
- continue;
- }
-
- /* Order of checks in the compound conditional
- below is important.
-
- - Highest precedence: largest op_ret
- - Next precendence: if all op_rets are equal, read subvol
- - Least precedence: any succeeded subvol
- */
- if ((local->op_ret < local->replies[i].op_ret) ||
- ((local->op_ret == local->replies[i].op_ret) &&
- (i == read_subvol))) {
-
- local->op_ret = local->replies[i].op_ret;
- local->op_errno = local->replies[i].op_errno;
-
- local->cont.inode_wfop.prebuf =
- local->replies[i].prestat;
- local->cont.inode_wfop.postbuf =
- local->replies[i].poststat;
-
- if (local->replies[i].xdata) {
- if (local->xdata_rsp)
- dict_unref (local->xdata_rsp);
- local->xdata_rsp =
- dict_ref (local->replies[i].xdata);
- }
- if (local->replies[i].xattr) {
- if (local->xattr_rsp)
- dict_unref (local->xattr_rsp);
- local->xattr_rsp =
- dict_ref (local->replies[i].xattr);
- }
- }
- }
-
- afr_txn_arbitrate_fop_cbk (frame, this);
-}
+__afr_inode_write_finalize(call_frame_t *frame, xlator_t *this)
+{
+ int i = 0;
+ int ret = 0;
+ int read_subvol = 0;
+ struct iatt *stbuf = NULL;
+ afr_local_t *local = NULL;
+ afr_private_t *priv = NULL;
+ afr_read_subvol_args_t args = {
+ 0,
+ };
+
+ local = frame->local;
+ priv = this->private;
+ GF_VALIDATE_OR_GOTO(this->name, local->inode, out);
+
+ /*This code needs to stay till DHT sends fops on linked
+ * inodes*/
+ if (!inode_is_linked(local->inode)) {
+ for (i = 0; i < priv->child_count; i++) {
+ if (!local->replies[i].valid)
+ continue;
+ if (local->replies[i].op_ret == -1)
+ continue;
+ if (!gf_uuid_is_null(local->replies[i].poststat.ia_gfid)) {
+ gf_uuid_copy(args.gfid, local->replies[i].poststat.ia_gfid);
+ args.ia_type = local->replies[i].poststat.ia_type;
+ break;
+ } else {
+ ret = dict_get_bin(local->replies[i].xdata,
+ DHT_IATT_IN_XDATA_KEY, (void **)&stbuf);
+ if (ret)
+ continue;
+ gf_uuid_copy(args.gfid, stbuf->ia_gfid);
+ args.ia_type = stbuf->ia_type;
+ break;
+ }
+ }
+ }
+
+ if (local->transaction.type == AFR_METADATA_TRANSACTION) {
+ read_subvol = afr_metadata_subvol_get(local->inode, this, NULL,
+ local->readable, NULL, &args);
+ } else {
+ read_subvol = afr_data_subvol_get(local->inode, this, NULL,
+ local->readable, NULL, &args);
+ }
+
+ local->op_ret = -1;
+ local->op_errno = afr_final_errno(local, priv);
+ afr_pick_error_xdata(local, priv, local->inode, local->readable, NULL,
+ NULL);
+
+ for (i = 0; i < priv->child_count; i++) {
+ if (!local->replies[i].valid)
+ continue;
+ if (local->replies[i].op_ret < 0)
+ continue;
+
+ /* Order of checks in the compound conditional
+ below is important.
+
+ - Highest precedence: largest op_ret
+ - Next precedence: if all op_rets are equal, read subvol
+ - Least precedence: any succeeded subvol
+ */
+ if ((local->op_ret < local->replies[i].op_ret) ||
+ ((local->op_ret == local->replies[i].op_ret) &&
+ (i == read_subvol))) {
+ local->op_ret = local->replies[i].op_ret;
+ local->op_errno = local->replies[i].op_errno;
+
+ local->cont.inode_wfop.prebuf = local->replies[i].prestat;
+ local->cont.inode_wfop.postbuf = local->replies[i].poststat;
+
+ if (local->replies[i].xdata) {
+ if (local->xdata_rsp)
+ dict_unref(local->xdata_rsp);
+ local->xdata_rsp = dict_ref(local->replies[i].xdata);
+ }
+ if (local->replies[i].xattr) {
+ if (local->xattr_rsp)
+ dict_unref(local->xattr_rsp);
+ local->xattr_rsp = dict_ref(local->replies[i].xattr);
+ }
+ }
+ }
+ afr_set_in_flight_sb_status(this, frame, local->inode);
+out:
+ return;
+}
static void
-__afr_inode_write_fill (call_frame_t *frame, xlator_t *this, int child_index,
- int op_ret, int op_errno,
- struct iatt *prebuf, struct iatt *postbuf,
- dict_t *xattr, dict_t *xdata)
+__afr_inode_write_fill(call_frame_t *frame, xlator_t *this, int child_index,
+ int op_ret, int op_errno, struct iatt *prebuf,
+ struct iatt *postbuf, dict_t *xattr, dict_t *xdata)
{
- afr_local_t *local = NULL;
- afr_private_t *priv = NULL;
+ afr_local_t *local = NULL;
+ afr_private_t *priv = NULL;
- local = frame->local;
- priv = this->private;
+ local = frame->local;
+ priv = this->private;
- local->replies[child_index].valid = 1;
+ local->replies[child_index].valid = 1;
- if (AFR_IS_ARBITER_BRICK(priv, child_index) && op_ret == 1)
- op_ret = iov_length (local->cont.writev.vector,
- local->cont.writev.count);
+ if (AFR_IS_ARBITER_BRICK(priv, child_index) && op_ret == 1)
+ op_ret = iov_length(local->cont.writev.vector,
+ local->cont.writev.count);
- local->replies[child_index].op_ret = op_ret;
- local->replies[child_index].op_errno = op_errno;
+ local->replies[child_index].op_ret = op_ret;
+ local->replies[child_index].op_errno = op_errno;
+ if (xdata)
+ local->replies[child_index].xdata = dict_ref(xdata);
- if (op_ret >= 0) {
- if (prebuf)
- local->replies[child_index].prestat = *prebuf;
- if (postbuf)
- local->replies[child_index].poststat = *postbuf;
- if (xattr)
- local->replies[child_index].xattr = dict_ref (xattr);
- if (xdata)
- local->replies[child_index].xdata = dict_ref (xdata);
- } else {
- afr_transaction_fop_failed (frame, this, child_index);
- }
+ if (op_ret >= 0) {
+ if (prebuf)
+ local->replies[child_index].prestat = *prebuf;
+ if (postbuf)
+ local->replies[child_index].poststat = *postbuf;
+ if (xattr)
+ local->replies[child_index].xattr = dict_ref(xattr);
+ } else {
+ afr_transaction_fop_failed(frame, this, child_index);
+ }
- return;
+ return;
}
-
static int
-__afr_inode_write_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
- int32_t op_ret, int32_t op_errno, struct iatt *prebuf,
- struct iatt *postbuf, dict_t *xattr, dict_t *xdata)
-{
- afr_local_t *local = NULL;
- int child_index = (long) cookie;
- int call_count = -1;
-
- local = frame->local;
-
- LOCK (&frame->lock);
- {
- __afr_inode_write_fill (frame, this, child_index, op_ret,
- op_errno, prebuf, postbuf, xattr,
- xdata);
+__afr_inode_write_cbk(call_frame_t *frame, void *cookie, xlator_t *this,
+ int32_t op_ret, int32_t op_errno, struct iatt *prebuf,
+ struct iatt *postbuf, dict_t *xattr, dict_t *xdata)
+{
+ afr_local_t *local = NULL;
+ int child_index = (long)cookie;
+ int call_count = -1;
+ afr_private_t *priv = NULL;
+
+ priv = this->private;
+ local = frame->local;
+
+ LOCK(&frame->lock);
+ {
+ __afr_inode_write_fill(frame, this, child_index, op_ret, op_errno,
+ prebuf, postbuf, xattr, xdata);
+ call_count = --local->call_count;
+ }
+ UNLOCK(&frame->lock);
+
+ if (call_count == 0) {
+ __afr_inode_write_finalize(frame, this);
+
+ if (afr_txn_nothing_failed(frame, this)) {
+ /*if it did pre-op, it will do post-op changing ctime*/
+ if (priv->consistent_metadata && afr_needs_changelog_update(local))
+ afr_zero_fill_stat(local);
+ local->transaction.unwind(frame, this);
}
- UNLOCK (&frame->lock);
-
- call_count = afr_frame_return (frame);
-
- if (call_count == 0) {
- __afr_inode_write_finalize (frame, this);
- if (afr_txn_nothing_failed (frame, this))
- local->transaction.unwind (frame, this);
+ afr_transaction_resume(frame, this);
+ }
- local->transaction.resume (frame, this);
- }
-
- return 0;
+ return 0;
}
/* {{{ writev */
void
-afr_writev_copy_outvars (call_frame_t *src_frame, call_frame_t *dst_frame)
+afr_writev_copy_outvars(call_frame_t *src_frame, call_frame_t *dst_frame)
{
- afr_local_t *src_local = NULL;
- afr_local_t *dst_local = NULL;
+ afr_local_t *src_local = NULL;
+ afr_local_t *dst_local = NULL;
- src_local = src_frame->local;
- dst_local = dst_frame->local;
+ src_local = src_frame->local;
+ dst_local = dst_frame->local;
- dst_local->op_ret = src_local->op_ret;
- dst_local->op_errno = src_local->op_errno;
- dst_local->cont.inode_wfop.prebuf = src_local->cont.inode_wfop.prebuf;
- dst_local->cont.inode_wfop.postbuf = src_local->cont.inode_wfop.postbuf;
- if (src_local->xdata_rsp)
- dst_local->xdata_rsp = dict_ref (src_local->xdata_rsp);
+ dst_local->op_ret = src_local->op_ret;
+ dst_local->op_errno = src_local->op_errno;
+ dst_local->cont.inode_wfop.prebuf = src_local->cont.inode_wfop.prebuf;
+ dst_local->cont.inode_wfop.postbuf = src_local->cont.inode_wfop.postbuf;
+ if (src_local->xdata_rsp)
+ dst_local->xdata_rsp = dict_ref(src_local->xdata_rsp);
}
void
-afr_writev_unwind (call_frame_t *frame, xlator_t *this)
+afr_writev_unwind(call_frame_t *frame, xlator_t *this)
{
- afr_local_t * local = NULL;
- local = frame->local;
+ afr_local_t *local = NULL;
+ afr_private_t *priv = this->private;
- AFR_STACK_UNWIND (writev, frame,
- local->op_ret, local->op_errno,
- &local->cont.inode_wfop.prebuf,
- &local->cont.inode_wfop.postbuf,
- local->xdata_rsp);
-}
+ local = frame->local;
+
+ if (priv->consistent_metadata)
+ afr_zero_fill_stat(local);
+ AFR_STACK_UNWIND(writev, frame, local->op_ret, local->op_errno,
+ &local->cont.inode_wfop.prebuf,
+ &local->cont.inode_wfop.postbuf, local->xdata_rsp);
+}
int
-afr_transaction_writev_unwind (call_frame_t *frame, xlator_t *this)
+afr_transaction_writev_unwind(call_frame_t *frame, xlator_t *this)
{
- call_frame_t *fop_frame = NULL;
+ call_frame_t *fop_frame = NULL;
- fop_frame = afr_transaction_detach_fop_frame (frame);
+ fop_frame = afr_transaction_detach_fop_frame(frame);
- if (fop_frame) {
- afr_writev_copy_outvars (frame, fop_frame);
- afr_writev_unwind (fop_frame, this);
- }
- return 0;
+ if (fop_frame) {
+ afr_writev_copy_outvars(frame, fop_frame);
+ afr_writev_unwind(fop_frame, this);
+ }
+ return 0;
}
static void
-afr_writev_handle_short_writes (call_frame_t *frame, xlator_t *this)
-{
- afr_local_t *local = NULL;
- afr_private_t *priv = NULL;
- int i = 0;
-
- local = frame->local;
- priv = this->private;
- /*
- * We already have the best case result of the writev calls staged
- * as the return value. Any writev that returns some value less
- * than the best case is now out of sync, so mark the fop as
- * failed. Note that fops that have returned with errors have
- * already been marked as failed.
- */
- for (i = 0; i < priv->child_count; i++) {
- if ((!local->replies[i].valid) ||
- (local->replies[i].op_ret == -1))
- continue;
-
- if (local->replies[i].op_ret < local->op_ret)
- afr_transaction_fop_failed (frame, this, i);
- }
+afr_writev_handle_short_writes(call_frame_t *frame, xlator_t *this)
+{
+ afr_local_t *local = NULL;
+ afr_private_t *priv = NULL;
+ int i = 0;
+
+ local = frame->local;
+ priv = this->private;
+ /*
+ * We already have the best case result of the writev calls staged
+ * as the return value. Any writev that returns some value less
+ * than the best case is now out of sync, so mark the fop as
+ * failed. Note that fops that have returned with errors have
+ * already been marked as failed.
+ */
+ for (i = 0; i < priv->child_count; i++) {
+ if ((!local->replies[i].valid) || (local->replies[i].op_ret == -1))
+ continue;
+
+ if (local->replies[i].op_ret < local->op_ret)
+ afr_transaction_fop_failed(frame, this, i);
+ }
}
-int
-afr_writev_wind_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
+void
+afr_inode_write_fill(call_frame_t *frame, xlator_t *this, int child_index,
int32_t op_ret, int32_t op_errno, struct iatt *prebuf,
struct iatt *postbuf, dict_t *xdata)
{
- afr_local_t * local = NULL;
- call_frame_t *fop_frame = NULL;
- int child_index = (long) cookie;
- int call_count = -1;
- int ret = 0;
- uint32_t open_fd_count = 0;
- uint32_t write_is_append = 0;
-
- local = frame->local;
-
- LOCK (&frame->lock);
- {
- __afr_inode_write_fill (frame, this, child_index, op_ret,
- op_errno, prebuf, postbuf, NULL, xdata);
- if (op_ret == -1 || !xdata)
- goto unlock;
-
- write_is_append = 0;
- ret = dict_get_uint32 (xdata, GLUSTERFS_WRITE_IS_APPEND,
- &write_is_append);
- if (ret || !write_is_append)
- local->append_write = _gf_false;
-
- ret = dict_get_uint32 (xdata, GLUSTERFS_OPEN_FD_COUNT,
- &open_fd_count);
- if (ret == -1)
- goto unlock;
- if ((open_fd_count > local->open_fd_count)) {
- local->open_fd_count = open_fd_count;
- local->update_open_fd_count = _gf_true;
- }
+ int ret = 0;
+ afr_local_t *local = frame->local;
+ uint32_t open_fd_count = 0;
+ uint32_t write_is_append = 0;
+ int32_t num_inodelks = 0;
+
+ LOCK(&frame->lock);
+ {
+ __afr_inode_write_fill(frame, this, child_index, op_ret, op_errno,
+ prebuf, postbuf, NULL, xdata);
+ if (op_ret == -1 || !xdata)
+ goto unlock;
+
+ write_is_append = 0;
+ ret = dict_get_uint32(xdata, GLUSTERFS_WRITE_IS_APPEND,
+ &write_is_append);
+ if (ret || !write_is_append)
+ local->append_write = _gf_false;
+
+ ret = dict_get_uint32(xdata, GLUSTERFS_ACTIVE_FD_COUNT, &open_fd_count);
+ if (ret < 0)
+ goto unlock;
+ if (open_fd_count > local->open_fd_count) {
+ local->open_fd_count = open_fd_count;
+ local->update_open_fd_count = _gf_true;
}
-unlock:
- UNLOCK (&frame->lock);
-
- call_count = afr_frame_return (frame);
-
- if (call_count == 0) {
- if (!local->stable_write && !local->append_write)
- /* An appended write removes the necessity to
- fsync() the file. This is because self-heal
- has the logic to check for larger file when
- the xattrs are not reliably pointing at
- a stale file.
- */
- afr_fd_report_unstable_write (this, local->fd);
-
- __afr_inode_write_finalize (frame, this);
-
- afr_writev_handle_short_writes (frame, this);
-
- if (local->update_open_fd_count)
- afr_handle_open_fd_count (frame, this);
-
- if (!afr_txn_nothing_failed (frame, this)) {
- //Don't unwind until post-op is complete
- local->transaction.resume (frame, this);
- } else {
- /*
- * Generally inode-write fops do transaction.unwind then
- * transaction.resume, but writev needs to make sure that
- * delayed post-op frame is placed in fdctx before unwind
- * happens. This prevents the race of flush doing the
- * changelog wakeup first in fuse thread and then this
- * writev placing its delayed post-op frame in fdctx.
- * This helps flush make sure all the delayed post-ops are
- * completed.
- */
-
- fop_frame = afr_transaction_detach_fop_frame (frame);
- afr_writev_copy_outvars (frame, fop_frame);
- local->transaction.resume (frame, this);
- afr_writev_unwind (fop_frame, this);
- }
+
+ ret = dict_get_int32_sizen(xdata, GLUSTERFS_INODELK_COUNT,
+ &num_inodelks);
+ if (ret < 0)
+ goto unlock;
+ if (num_inodelks > local->num_inodelks) {
+ local->num_inodelks = num_inodelks;
+ local->update_num_inodelks = _gf_true;
}
- return 0;
+ }
+unlock:
+ UNLOCK(&frame->lock);
}
-static int
-afr_arbiter_writev_wind (call_frame_t *frame, xlator_t *this, int subvol)
+void
+afr_process_post_writev(call_frame_t *frame, xlator_t *this)
{
- afr_local_t *local = frame->local;
- afr_private_t *priv = this->private;
- static char byte = 0xFF;
- static struct iovec vector = {&byte, 1};
- int32_t count = 1;
+ afr_local_t *local = NULL;
+ afr_lock_t *lock = NULL;
- STACK_WIND_COOKIE (frame, afr_writev_wind_cbk, (void *) (long) subvol,
- priv->children[subvol],
- priv->children[subvol]->fops->writev,
- local->fd, &vector, count, local->cont.writev.offset,
- local->cont.writev.flags, local->cont.writev.iobref,
- local->xdata_req);
+ local = frame->local;
- return 0;
+ if (!local->stable_write && !local->append_write)
+ /* An appended write removes the necessity to
+ fsync() the file. This is because self-heal
+ has the logic to check for larger file when
+ the xattrs are not reliably pointing at
+ a stale file.
+ */
+ afr_fd_report_unstable_write(this, local);
+
+ __afr_inode_write_finalize(frame, this);
+
+ afr_writev_handle_short_writes(frame, this);
+
+ if (local->update_open_fd_count)
+ local->inode_ctx->open_fd_count = local->open_fd_count;
+ if (local->update_num_inodelks &&
+ local->transaction.type == AFR_DATA_TRANSACTION) {
+ lock = &local->inode_ctx->lock[local->transaction.type];
+ lock->num_inodelks = local->num_inodelks;
+ }
}
int
-afr_writev_wind (call_frame_t *frame, xlator_t *this, int subvol)
+afr_writev_wind_cbk(call_frame_t *frame, void *cookie, xlator_t *this,
+ int32_t op_ret, int32_t op_errno, struct iatt *prebuf,
+ struct iatt *postbuf, dict_t *xdata)
{
- afr_local_t *local = NULL;
- afr_private_t *priv = NULL;
+ call_frame_t *fop_frame = NULL;
+ int child_index = (long)cookie;
+ int call_count = -1;
+
+ afr_inode_write_fill(frame, this, child_index, op_ret, op_errno, prebuf,
+ postbuf, xdata);
- local = frame->local;
- priv = this->private;
+ call_count = afr_frame_return(frame);
- if (AFR_IS_ARBITER_BRICK(priv, subvol)) {
- afr_arbiter_writev_wind (frame, this, subvol);
- return 0;
+ if (call_count == 0) {
+ afr_process_post_writev(frame, this);
+
+ if (!afr_txn_nothing_failed(frame, this)) {
+ // Don't unwind until post-op is complete
+ afr_transaction_resume(frame, this);
+ } else {
+ /*
+ * Generally inode-write fops do transaction.unwind then
+ * transaction.resume, but writev needs to make sure that
+ * delayed post-op frame is placed in fdctx before unwind
+ * happens. This prevents the race of flush doing the
+ * changelog wakeup first in fuse thread and then this
+ * writev placing its delayed post-op frame in fdctx.
+ * This helps flush make sure all the delayed post-ops are
+ * completed.
+ */
+
+ fop_frame = afr_transaction_detach_fop_frame(frame);
+ afr_writev_copy_outvars(frame, fop_frame);
+ afr_transaction_resume(frame, this);
+ afr_writev_unwind(fop_frame, this);
}
+ }
+ return 0;
+}
- STACK_WIND_COOKIE (frame, afr_writev_wind_cbk, (void *) (long) subvol,
- priv->children[subvol],
- priv->children[subvol]->fops->writev,
- local->fd, local->cont.writev.vector,
- local->cont.writev.count, local->cont.writev.offset,
- local->cont.writev.flags, local->cont.writev.iobref,
- local->xdata_req);
- return 0;
+static int
+afr_arbiter_writev_wind(call_frame_t *frame, xlator_t *this, int subvol)
+{
+ afr_local_t *local = frame->local;
+ afr_private_t *priv = this->private;
+ static char byte = 0xFF;
+ static struct iovec vector = {&byte, 1};
+ int32_t count = 1;
+
+ STACK_WIND_COOKIE(
+ frame, afr_writev_wind_cbk, (void *)(long)subvol,
+ priv->children[subvol], priv->children[subvol]->fops->writev, local->fd,
+ &vector, count, local->cont.writev.offset, local->cont.writev.flags,
+ local->cont.writev.iobref, local->xdata_req);
+
+ return 0;
}
+int
+afr_writev_wind(call_frame_t *frame, xlator_t *this, int subvol)
+{
+ afr_local_t *local = NULL;
+ afr_private_t *priv = NULL;
+
+ local = frame->local;
+ priv = this->private;
+
+ if (AFR_IS_ARBITER_BRICK(priv, subvol)) {
+ afr_arbiter_writev_wind(frame, this, subvol);
+ return 0;
+ }
+
+ STACK_WIND_COOKIE(frame, afr_writev_wind_cbk, (void *)(long)subvol,
+ priv->children[subvol],
+ priv->children[subvol]->fops->writev, local->fd,
+ local->cont.writev.vector, local->cont.writev.count,
+ local->cont.writev.offset, local->cont.writev.flags,
+ local->cont.writev.iobref, local->xdata_req);
+ return 0;
+}
int
-afr_do_writev (call_frame_t *frame, xlator_t *this)
+afr_do_writev(call_frame_t *frame, xlator_t *this)
{
- call_frame_t *transaction_frame = NULL;
- afr_local_t *local = NULL;
- afr_private_t *priv = NULL;
- int ret = -1;
- int op_errno = ENOMEM;
+ call_frame_t *transaction_frame = NULL;
+ afr_local_t *local = NULL;
+ int ret = -1;
+ int op_errno = ENOMEM;
- transaction_frame = copy_frame (frame);
- if (!transaction_frame)
- goto out;
+ transaction_frame = copy_frame(frame);
+ if (!transaction_frame)
+ goto out;
- local = frame->local;
- priv = this->private;
- transaction_frame->local = local;
- frame->local = NULL;
-
- if (!AFR_FRAME_INIT (frame, op_errno))
- goto out;
-
- local->op = GF_FOP_WRITE;
-
- local->transaction.wind = afr_writev_wind;
- local->transaction.fop = __afr_txn_write_fop;
- local->transaction.done = __afr_txn_write_done;
- local->transaction.unwind = afr_transaction_writev_unwind;
-
- local->transaction.main_frame = frame;
-
- if (local->fd->flags & O_APPEND) {
- /*
- * Backend vfs ignores the 'offset' for append mode fd so
- * locking just the region provided for the writev does not
- * give consistency guarantee. The actual write may happen at a
- * completely different range than the one provided by the
- * offset, len in the fop. So lock the entire file.
- */
- local->transaction.start = 0;
- local->transaction.len = 0;
- } else {
- local->transaction.start = local->cont.writev.offset;
- local->transaction.len = iov_length (local->cont.writev.vector,
- local->cont.writev.count);
-
- /*Lock entire file to avoid network split brains.*/
- if (priv->arbiter_count == 1) {
- local->transaction.start = 0;
- local->transaction.len = 0;
- }
- }
+ local = frame->local;
+ transaction_frame->local = local;
+ frame->local = NULL;
- ret = afr_transaction (transaction_frame, this, AFR_DATA_TRANSACTION);
- if (ret < 0) {
- op_errno = -ret;
- goto out;
- }
+ if (!AFR_FRAME_INIT(frame, op_errno))
+ goto out;
+
+ local->op = GF_FOP_WRITE;
+
+ local->transaction.wind = afr_writev_wind;
+ local->transaction.unwind = afr_transaction_writev_unwind;
- return 0;
+ local->transaction.main_frame = frame;
+
+ if (local->fd->flags & O_APPEND) {
+ /*
+ * Backend vfs ignores the 'offset' for append mode fd so
+ * locking just the region provided for the writev does not
+ * give consistency guarantee. The actual write may happen at a
+ * completely different range than the one provided by the
+ * offset, len in the fop. So lock the entire file.
+ */
+ local->transaction.start = 0;
+ local->transaction.len = 0;
+ } else {
+ local->transaction.start = local->cont.writev.offset;
+ local->transaction.len = iov_length(local->cont.writev.vector,
+ local->cont.writev.count);
+ }
+
+ ret = afr_transaction(transaction_frame, this, AFR_DATA_TRANSACTION);
+ if (ret < 0) {
+ op_errno = -ret;
+ goto out;
+ }
+
+ return 0;
out:
- if (transaction_frame)
- AFR_STACK_DESTROY (transaction_frame);
+ if (transaction_frame)
+ AFR_STACK_DESTROY(transaction_frame);
- AFR_STACK_UNWIND (writev, frame, -1, op_errno, NULL, NULL, NULL);
- return 0;
+ AFR_STACK_UNWIND(writev, frame, -1, op_errno, NULL, NULL, NULL);
+ return 0;
}
-
int
-afr_writev (call_frame_t *frame, xlator_t *this, fd_t *fd,
- struct iovec *vector, int32_t count, off_t offset,
- uint32_t flags, struct iobref *iobref, dict_t *xdata)
+afr_writev(call_frame_t *frame, xlator_t *this, fd_t *fd, struct iovec *vector,
+ int32_t count, off_t offset, uint32_t flags, struct iobref *iobref,
+ dict_t *xdata)
{
- afr_local_t *local = NULL;
- int op_errno = ENOMEM;
+ afr_local_t *local = NULL;
+ int op_errno = ENOMEM;
+ int ret = -1;
+
+ AFR_ERROR_OUT_IF_FDCTX_INVALID(fd, this, op_errno, out);
+ local = AFR_FRAME_INIT(frame, op_errno);
+ if (!local)
+ goto out;
- local = AFR_FRAME_INIT (frame, op_errno);
- if (!local)
- goto out;
+ local->cont.writev.vector = iov_dup(vector, count);
+ if (!local->cont.writev.vector)
+ goto out;
+ local->cont.writev.count = count;
+ local->cont.writev.offset = offset;
+ local->cont.writev.flags = flags;
+ local->cont.writev.iobref = iobref_ref(iobref);
- local->cont.writev.vector = iov_dup (vector, count);
- if (!local->cont.writev.vector)
- goto out;
- local->cont.writev.count = count;
- local->cont.writev.offset = offset;
- local->cont.writev.flags = flags;
- local->cont.writev.iobref = iobref_ref (iobref);
+ if (xdata)
+ local->xdata_req = dict_copy_with_ref(xdata, NULL);
+ else
+ local->xdata_req = dict_new();
- if (xdata)
- local->xdata_req = dict_copy_with_ref (xdata, NULL);
- else
- local->xdata_req = dict_new ();
+ if (!local->xdata_req)
+ goto out;
- if (!local->xdata_req)
- goto out;
+ local->fd = fd_ref(fd);
+ ret = afr_set_inode_local(this, local, fd->inode);
+ if (ret)
+ goto out;
- local->fd = fd_ref (fd);
- local->inode = inode_ref (fd->inode);
+ if (dict_set_uint32(local->xdata_req, GLUSTERFS_ACTIVE_FD_COUNT, 4)) {
+ op_errno = ENOMEM;
+ goto out;
+ }
- if (dict_set_uint32 (local->xdata_req, GLUSTERFS_OPEN_FD_COUNT, 4)) {
- op_errno = ENOMEM;
- goto out;
- }
+ if (dict_set_str_sizen(local->xdata_req, GLUSTERFS_INODELK_DOM_COUNT,
+ this->name)) {
+ op_errno = ENOMEM;
+ goto out;
+ }
- if (dict_set_uint32 (local->xdata_req, GLUSTERFS_WRITE_IS_APPEND, 4)) {
- op_errno = ENOMEM;
- goto out;
- }
+ if (dict_set_uint32(local->xdata_req, GLUSTERFS_WRITE_IS_APPEND, 4)) {
+ op_errno = ENOMEM;
+ goto out;
+ }
- /* Set append_write to be true speculatively. If on any
- server it turns not be true, we unset it in the
- callback.
- */
- local->append_write = _gf_true;
+ /* Set append_write to be true speculatively. If on any
+ server it turns not be true, we unset it in the
+ callback.
+ */
+ local->append_write = _gf_true;
- /* detect here, but set it in writev_wind_cbk *after* the unstable
- write is performed
- */
- local->stable_write = !!((fd->flags|flags)&(O_SYNC|O_DSYNC));
+ /* detect here, but set it in writev_wind_cbk *after* the unstable
+ write is performed
+ */
+ local->stable_write = !!((fd->flags | flags) & (O_SYNC | O_DSYNC));
- afr_fix_open (fd, this);
+ afr_fix_open(fd, this);
- afr_do_writev (frame, this);
+ afr_do_writev(frame, this);
- return 0;
+ return 0;
out:
- AFR_STACK_UNWIND (writev, frame, -1, op_errno, NULL, NULL, NULL);
+ AFR_STACK_UNWIND(writev, frame, -1, op_errno, NULL, NULL, NULL);
- return 0;
+ return 0;
}
-
/* }}} */
/* {{{ truncate */
int
-afr_truncate_unwind (call_frame_t *frame, xlator_t *this)
+afr_truncate_unwind(call_frame_t *frame, xlator_t *this)
{
- afr_local_t * local = NULL;
- call_frame_t *main_frame = NULL;
+ afr_local_t *local = NULL;
+ call_frame_t *main_frame = NULL;
- local = frame->local;
+ local = frame->local;
- main_frame = afr_transaction_detach_fop_frame (frame);
- if (!main_frame)
- return 0;
-
- AFR_STACK_UNWIND (truncate, main_frame, local->op_ret, local->op_errno,
- &local->cont.inode_wfop.prebuf,
- &local->cont.inode_wfop.postbuf, local->xdata_rsp);
+ main_frame = afr_transaction_detach_fop_frame(frame);
+ if (!main_frame)
return 0;
-}
+ AFR_STACK_UNWIND(truncate, main_frame, local->op_ret, local->op_errno,
+ &local->cont.inode_wfop.prebuf,
+ &local->cont.inode_wfop.postbuf, local->xdata_rsp);
+ return 0;
+}
int
-afr_truncate_wind_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
- int32_t op_ret, int32_t op_errno, struct iatt *prebuf,
- struct iatt *postbuf, dict_t *xdata)
+afr_truncate_wind_cbk(call_frame_t *frame, void *cookie, xlator_t *this,
+ int32_t op_ret, int32_t op_errno, struct iatt *prebuf,
+ struct iatt *postbuf, dict_t *xdata)
{
- afr_local_t *local = NULL;
+ afr_local_t *local = NULL;
- local = frame->local;
+ local = frame->local;
- if (op_ret == 0 && prebuf->ia_size != postbuf->ia_size)
- local->stable_write = _gf_false;
+ if (op_ret == 0 && prebuf->ia_size != postbuf->ia_size)
+ local->stable_write = _gf_false;
- return __afr_inode_write_cbk (frame, cookie, this, op_ret, op_errno,
- prebuf, postbuf, NULL, xdata);
+ return __afr_inode_write_cbk(frame, cookie, this, op_ret, op_errno, prebuf,
+ postbuf, NULL, xdata);
}
-
int
-afr_truncate_wind (call_frame_t *frame, xlator_t *this, int subvol)
+afr_truncate_wind(call_frame_t *frame, xlator_t *this, int subvol)
{
- afr_local_t *local = NULL;
- afr_private_t *priv = NULL;
+ afr_local_t *local = NULL;
+ afr_private_t *priv = NULL;
- local = frame->local;
- priv = this->private;
+ local = frame->local;
+ priv = this->private;
- STACK_WIND_COOKIE (frame, afr_truncate_wind_cbk, (void *) (long) subvol,
- priv->children[subvol],
- priv->children[subvol]->fops->truncate,
- &local->loc, local->cont.truncate.offset,
- local->xdata_req);
- return 0;
+ STACK_WIND_COOKIE(frame, afr_truncate_wind_cbk, (void *)(long)subvol,
+ priv->children[subvol],
+ priv->children[subvol]->fops->truncate, &local->loc,
+ local->cont.truncate.offset, local->xdata_req);
+ return 0;
}
-
int
-afr_truncate (call_frame_t *frame, xlator_t *this,
- loc_t *loc, off_t offset, dict_t *xdata)
+afr_truncate(call_frame_t *frame, xlator_t *this, loc_t *loc, off_t offset,
+ dict_t *xdata)
{
- afr_local_t * local = NULL;
- call_frame_t *transaction_frame = NULL;
- int ret = -1;
- int op_errno = ENOMEM;
+ afr_local_t *local = NULL;
+ call_frame_t *transaction_frame = NULL;
+ int ret = -1;
+ int op_errno = ENOMEM;
- transaction_frame = copy_frame (frame);
- if (!transaction_frame)
- goto out;
+ transaction_frame = copy_frame(frame);
+ if (!transaction_frame)
+ goto out;
- local = AFR_FRAME_INIT (transaction_frame, op_errno);
- if (!local)
- goto out;
+ local = AFR_FRAME_INIT(transaction_frame, op_errno);
+ if (!local)
+ goto out;
- local->cont.truncate.offset = offset;
- if (xdata)
- local->xdata_req = dict_copy_with_ref (xdata, NULL);
- else
- local->xdata_req = dict_new ();
+ local->cont.truncate.offset = offset;
+ if (xdata)
+ local->xdata_req = dict_copy_with_ref(xdata, NULL);
+ else
+ local->xdata_req = dict_new();
- if (!local->xdata_req)
- goto out;
+ if (!local->xdata_req)
+ goto out;
- local->transaction.wind = afr_truncate_wind;
- local->transaction.fop = __afr_txn_write_fop;
- local->transaction.done = __afr_txn_write_done;
- local->transaction.unwind = afr_truncate_unwind;
+ local->transaction.wind = afr_truncate_wind;
+ local->transaction.unwind = afr_truncate_unwind;
- loc_copy (&local->loc, loc);
- local->inode = inode_ref (loc->inode);
+ loc_copy(&local->loc, loc);
+ ret = afr_set_inode_local(this, local, loc->inode);
+ if (ret)
+ goto out;
- local->op = GF_FOP_TRUNCATE;
+ local->op = GF_FOP_TRUNCATE;
- local->transaction.main_frame = frame;
- local->transaction.start = offset;
- local->transaction.len = 0;
+ local->transaction.main_frame = frame;
+ local->transaction.start = offset;
+ local->transaction.len = 0;
- /* Set it true speculatively, will get reset in afr_truncate_wind_cbk
- if truncate was not a NOP */
- local->stable_write = _gf_true;
+ /* Set it true speculatively, will get reset in afr_truncate_wind_cbk
+ if truncate was not a NOP */
+ local->stable_write = _gf_true;
- ret = afr_transaction (transaction_frame, this, AFR_DATA_TRANSACTION);
- if (ret < 0) {
- op_errno = -ret;
- goto out;
- }
+ ret = afr_transaction(transaction_frame, this, AFR_DATA_TRANSACTION);
+ if (ret < 0) {
+ op_errno = -ret;
+ goto out;
+ }
- return 0;
+ return 0;
out:
- if (transaction_frame)
- AFR_STACK_DESTROY (transaction_frame);
+ if (transaction_frame)
+ AFR_STACK_DESTROY(transaction_frame);
- AFR_STACK_UNWIND (truncate, frame, -1, op_errno, NULL, NULL, NULL);
- return 0;
+ AFR_STACK_UNWIND(truncate, frame, -1, op_errno, NULL, NULL, NULL);
+ return 0;
}
-
/* }}} */
/* {{{ ftruncate */
-
int
-afr_ftruncate_unwind (call_frame_t *frame, xlator_t *this)
+afr_ftruncate_unwind(call_frame_t *frame, xlator_t *this)
{
- afr_local_t * local = NULL;
- call_frame_t *main_frame = NULL;
-
- local = frame->local;
+ afr_local_t *local = NULL;
+ call_frame_t *main_frame = NULL;
- main_frame = afr_transaction_detach_fop_frame (frame);
- if (!main_frame)
- return 0;
+ local = frame->local;
- AFR_STACK_UNWIND (ftruncate, main_frame, local->op_ret, local->op_errno,
- &local->cont.inode_wfop.prebuf,
- &local->cont.inode_wfop.postbuf, local->xdata_rsp);
+ main_frame = afr_transaction_detach_fop_frame(frame);
+ if (!main_frame)
return 0;
-}
+ AFR_STACK_UNWIND(ftruncate, main_frame, local->op_ret, local->op_errno,
+ &local->cont.inode_wfop.prebuf,
+ &local->cont.inode_wfop.postbuf, local->xdata_rsp);
+ return 0;
+}
int
-afr_ftruncate_wind_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
- int32_t op_ret, int32_t op_errno, struct iatt *prebuf,
- struct iatt *postbuf, dict_t *xdata)
+afr_ftruncate_wind_cbk(call_frame_t *frame, void *cookie, xlator_t *this,
+ int32_t op_ret, int32_t op_errno, struct iatt *prebuf,
+ struct iatt *postbuf, dict_t *xdata)
{
- afr_local_t *local = NULL;
+ afr_local_t *local = NULL;
- local = frame->local;
+ local = frame->local;
- if (op_ret == 0 && prebuf->ia_size != postbuf->ia_size)
- local->stable_write = _gf_false;
+ if (op_ret == 0 && prebuf->ia_size != postbuf->ia_size)
+ local->stable_write = _gf_false;
- return __afr_inode_write_cbk (frame, cookie, this, op_ret, op_errno,
- prebuf, postbuf, NULL, xdata);
+ return __afr_inode_write_cbk(frame, cookie, this, op_ret, op_errno, prebuf,
+ postbuf, NULL, xdata);
}
-
int
-afr_ftruncate_wind (call_frame_t *frame, xlator_t *this, int subvol)
+afr_ftruncate_wind(call_frame_t *frame, xlator_t *this, int subvol)
{
- afr_local_t *local = NULL;
- afr_private_t *priv = NULL;
+ afr_local_t *local = NULL;
+ afr_private_t *priv = NULL;
- local = frame->local;
- priv = this->private;
+ local = frame->local;
+ priv = this->private;
- STACK_WIND_COOKIE (frame, afr_ftruncate_wind_cbk, (void *) (long) subvol,
- priv->children[subvol],
- priv->children[subvol]->fops->ftruncate,
- local->fd, local->cont.ftruncate.offset,
- local->xdata_req);
- return 0;
+ STACK_WIND_COOKIE(frame, afr_ftruncate_wind_cbk, (void *)(long)subvol,
+ priv->children[subvol],
+ priv->children[subvol]->fops->ftruncate, local->fd,
+ local->cont.ftruncate.offset, local->xdata_req);
+ return 0;
}
-
int
-afr_ftruncate (call_frame_t *frame, xlator_t *this, fd_t *fd, off_t offset,
- dict_t *xdata)
+afr_ftruncate(call_frame_t *frame, xlator_t *this, fd_t *fd, off_t offset,
+ dict_t *xdata)
{
- afr_local_t *local = NULL;
- call_frame_t *transaction_frame = NULL;
- int ret = -1;
- int op_errno = ENOMEM;
+ afr_local_t *local = NULL;
+ call_frame_t *transaction_frame = NULL;
+ int ret = -1;
+ int op_errno = ENOMEM;
- transaction_frame = copy_frame (frame);
- if (!transaction_frame)
- goto out;
+ AFR_ERROR_OUT_IF_FDCTX_INVALID(fd, this, op_errno, out);
+ transaction_frame = copy_frame(frame);
+ if (!transaction_frame)
+ goto out;
- local = AFR_FRAME_INIT (transaction_frame, op_errno);
- if (!local)
- goto out;
+ local = AFR_FRAME_INIT(transaction_frame, op_errno);
+ if (!local)
+ goto out;
- local->cont.ftruncate.offset = offset;
- if (xdata)
- local->xdata_req = dict_copy_with_ref (xdata, NULL);
- else
- local->xdata_req = dict_new ();
+ local->cont.ftruncate.offset = offset;
+ if (xdata)
+ local->xdata_req = dict_copy_with_ref(xdata, NULL);
+ else
+ local->xdata_req = dict_new();
- if (!local->xdata_req)
- goto out;
+ if (!local->xdata_req)
+ goto out;
- local->fd = fd_ref (fd);
- local->inode = inode_ref (fd->inode);
+ local->fd = fd_ref(fd);
+ ret = afr_set_inode_local(this, local, fd->inode);
+ if (ret)
+ goto out;
- local->op = GF_FOP_FTRUNCATE;
+ local->op = GF_FOP_FTRUNCATE;
- local->transaction.wind = afr_ftruncate_wind;
- local->transaction.fop = __afr_txn_write_fop;
- local->transaction.done = __afr_txn_write_done;
- local->transaction.unwind = afr_ftruncate_unwind;
+ local->transaction.wind = afr_ftruncate_wind;
+ local->transaction.unwind = afr_ftruncate_unwind;
- local->transaction.main_frame = frame;
+ local->transaction.main_frame = frame;
- local->transaction.start = local->cont.ftruncate.offset;
- local->transaction.len = 0;
+ local->transaction.start = local->cont.ftruncate.offset;
+ local->transaction.len = 0;
- afr_fix_open (fd, this);
+ afr_fix_open(fd, this);
- /* Set it true speculatively, will get reset in afr_ftruncate_wind_cbk
- if truncate was not a NOP */
- local->stable_write = _gf_true;
+ /* Set it true speculatively, will get reset in afr_ftruncate_wind_cbk
+ if truncate was not a NOP */
+ local->stable_write = _gf_true;
- ret = afr_transaction (transaction_frame, this, AFR_DATA_TRANSACTION);
- if (ret < 0) {
- op_errno = -ret;
- goto out;
- }
+ ret = afr_transaction(transaction_frame, this, AFR_DATA_TRANSACTION);
+ if (ret < 0) {
+ op_errno = -ret;
+ goto out;
+ }
- return 0;
+ return 0;
out:
- AFR_STACK_UNWIND (ftruncate, frame, -1, op_errno, NULL, NULL, NULL);
+ AFR_STACK_UNWIND(ftruncate, frame, -1, op_errno, NULL, NULL, NULL);
- return 0;
+ return 0;
}
/* }}} */
@@ -750,1665 +788,1778 @@ out:
/* {{{ setattr */
int
-afr_setattr_unwind (call_frame_t *frame, xlator_t *this)
+afr_setattr_unwind(call_frame_t *frame, xlator_t *this)
{
- afr_local_t *local = NULL;
- call_frame_t *main_frame = NULL;
-
- local = frame->local;
+ afr_local_t *local = NULL;
+ call_frame_t *main_frame = NULL;
- main_frame = afr_transaction_detach_fop_frame (frame);
- if (!main_frame)
- return 0;
+ local = frame->local;
- AFR_STACK_UNWIND (setattr, main_frame, local->op_ret, local->op_errno,
- &local->cont.inode_wfop.prebuf,
- &local->cont.inode_wfop.postbuf,
- local->xdata_rsp);
+ main_frame = afr_transaction_detach_fop_frame(frame);
+ if (!main_frame)
return 0;
-}
+ AFR_STACK_UNWIND(setattr, main_frame, local->op_ret, local->op_errno,
+ &local->cont.inode_wfop.prebuf,
+ &local->cont.inode_wfop.postbuf, local->xdata_rsp);
+ return 0;
+}
int
-afr_setattr_wind_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
- int op_ret, int op_errno,
- struct iatt *preop, struct iatt *postop, dict_t *xdata)
+afr_setattr_wind_cbk(call_frame_t *frame, void *cookie, xlator_t *this,
+ int op_ret, int op_errno, struct iatt *preop,
+ struct iatt *postop, dict_t *xdata)
{
- return __afr_inode_write_cbk (frame, cookie, this, op_ret, op_errno,
- preop, postop, NULL, xdata);
+ return __afr_inode_write_cbk(frame, cookie, this, op_ret, op_errno, preop,
+ postop, NULL, xdata);
}
-
int
-afr_setattr_wind (call_frame_t *frame, xlator_t *this, int subvol)
+afr_setattr_wind(call_frame_t *frame, xlator_t *this, int subvol)
{
- afr_local_t *local = NULL;
- afr_private_t *priv = NULL;
+ afr_local_t *local = NULL;
+ afr_private_t *priv = NULL;
- local = frame->local;
- priv = this->private;
+ local = frame->local;
+ priv = this->private;
- STACK_WIND_COOKIE (frame, afr_setattr_wind_cbk, (void *) (long) subvol,
- priv->children[subvol],
- priv->children[subvol]->fops->setattr,
- &local->loc, &local->cont.setattr.in_buf,
- local->cont.setattr.valid, local->xdata_req);
- return 0;
+ STACK_WIND_COOKIE(frame, afr_setattr_wind_cbk, (void *)(long)subvol,
+ priv->children[subvol],
+ priv->children[subvol]->fops->setattr, &local->loc,
+ &local->cont.setattr.in_buf, local->cont.setattr.valid,
+ local->xdata_req);
+ return 0;
}
-
int
-afr_setattr (call_frame_t *frame, xlator_t *this, loc_t *loc, struct iatt *buf,
- int32_t valid, dict_t *xdata)
+afr_setattr(call_frame_t *frame, xlator_t *this, loc_t *loc, struct iatt *buf,
+ int32_t valid, dict_t *xdata)
{
- afr_local_t *local = NULL;
- call_frame_t *transaction_frame = NULL;
- int ret = -1;
- int op_errno = ENOMEM;
+ afr_local_t *local = NULL;
+ call_frame_t *transaction_frame = NULL;
+ int ret = -1;
+ int op_errno = ENOMEM;
- transaction_frame = copy_frame (frame);
- if (!transaction_frame)
- goto out;
+ transaction_frame = copy_frame(frame);
+ if (!transaction_frame)
+ goto out;
- local = AFR_FRAME_INIT (transaction_frame, op_errno);
- if (!local)
- goto out;
+ local = AFR_FRAME_INIT(transaction_frame, op_errno);
+ if (!local)
+ goto out;
- local->cont.setattr.in_buf = *buf;
- local->cont.setattr.valid = valid;
- if (xdata)
- local->xdata_req = dict_copy_with_ref (xdata, NULL);
- else
- local->xdata_req = dict_new ();
+ local->cont.setattr.in_buf = *buf;
+ local->cont.setattr.valid = valid;
+ if (xdata)
+ local->xdata_req = dict_copy_with_ref(xdata, NULL);
+ else
+ local->xdata_req = dict_new();
- if (!local->xdata_req)
- goto out;
+ if (!local->xdata_req)
+ goto out;
- local->transaction.wind = afr_setattr_wind;
- local->transaction.fop = __afr_txn_write_fop;
- local->transaction.done = __afr_txn_write_done;
- local->transaction.unwind = afr_setattr_unwind;
+ local->transaction.wind = afr_setattr_wind;
+ local->transaction.unwind = afr_setattr_unwind;
- loc_copy (&local->loc, loc);
- local->inode = inode_ref (loc->inode);
+ loc_copy(&local->loc, loc);
+ ret = afr_set_inode_local(this, local, loc->inode);
+ if (ret)
+ goto out;
- local->op = GF_FOP_SETATTR;
+ local->op = GF_FOP_SETATTR;
- local->transaction.main_frame = frame;
- local->transaction.start = LLONG_MAX - 1;
- local->transaction.len = 0;
+ local->transaction.main_frame = frame;
+ local->transaction.start = LLONG_MAX - 1;
+ local->transaction.len = 0;
- ret = afr_transaction (transaction_frame, this, AFR_METADATA_TRANSACTION);
- if (ret < 0) {
- op_errno = -ret;
- goto out;
- }
+ ret = afr_transaction(transaction_frame, this, AFR_METADATA_TRANSACTION);
+ if (ret < 0) {
+ op_errno = -ret;
+ goto out;
+ }
- return 0;
+ return 0;
out:
- if (transaction_frame)
- AFR_STACK_DESTROY (transaction_frame);
+ if (transaction_frame)
+ AFR_STACK_DESTROY(transaction_frame);
- AFR_STACK_UNWIND (setattr, frame, -1, op_errno, NULL, NULL, NULL);
- return 0;
+ AFR_STACK_UNWIND(setattr, frame, -1, op_errno, NULL, NULL, NULL);
+ return 0;
}
/* {{{ fsetattr */
int
-afr_fsetattr_unwind (call_frame_t *frame, xlator_t *this)
+afr_fsetattr_unwind(call_frame_t *frame, xlator_t *this)
{
- afr_local_t * local = NULL;
- call_frame_t *main_frame = NULL;
-
- local = frame->local;
+ afr_local_t *local = NULL;
+ call_frame_t *main_frame = NULL;
- main_frame = afr_transaction_detach_fop_frame (frame);
- if (!main_frame)
- return 0;
+ local = frame->local;
- AFR_STACK_UNWIND (fsetattr, main_frame, local->op_ret, local->op_errno,
- &local->cont.inode_wfop.prebuf,
- &local->cont.inode_wfop.postbuf, local->xdata_rsp);
+ main_frame = afr_transaction_detach_fop_frame(frame);
+ if (!main_frame)
return 0;
-}
+ AFR_STACK_UNWIND(fsetattr, main_frame, local->op_ret, local->op_errno,
+ &local->cont.inode_wfop.prebuf,
+ &local->cont.inode_wfop.postbuf, local->xdata_rsp);
+ return 0;
+}
int
-afr_fsetattr_wind_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
- int32_t op_ret, int32_t op_errno,
- struct iatt *preop, struct iatt *postop, dict_t *xdata)
+afr_fsetattr_wind_cbk(call_frame_t *frame, void *cookie, xlator_t *this,
+ int32_t op_ret, int32_t op_errno, struct iatt *preop,
+ struct iatt *postop, dict_t *xdata)
{
- return __afr_inode_write_cbk (frame, cookie, this, op_ret, op_errno,
- preop, postop, NULL, xdata);
+ return __afr_inode_write_cbk(frame, cookie, this, op_ret, op_errno, preop,
+ postop, NULL, xdata);
}
-
int
-afr_fsetattr_wind (call_frame_t *frame, xlator_t *this, int subvol)
+afr_fsetattr_wind(call_frame_t *frame, xlator_t *this, int subvol)
{
- afr_local_t *local = NULL;
- afr_private_t *priv = NULL;
+ afr_local_t *local = NULL;
+ afr_private_t *priv = NULL;
- local = frame->local;
- priv = this->private;
+ local = frame->local;
+ priv = this->private;
- STACK_WIND_COOKIE (frame, afr_fsetattr_wind_cbk, (void *) (long) subvol,
- priv->children[subvol],
- priv->children[subvol]->fops->fsetattr,
- local->fd, &local->cont.fsetattr.in_buf,
- local->cont.fsetattr.valid, local->xdata_req);
- return 0;
+ STACK_WIND_COOKIE(frame, afr_fsetattr_wind_cbk, (void *)(long)subvol,
+ priv->children[subvol],
+ priv->children[subvol]->fops->fsetattr, local->fd,
+ &local->cont.fsetattr.in_buf, local->cont.fsetattr.valid,
+ local->xdata_req);
+ return 0;
}
-
int
-afr_fsetattr (call_frame_t *frame, xlator_t *this,
- fd_t *fd, struct iatt *buf, int32_t valid, dict_t *xdata)
+afr_fsetattr(call_frame_t *frame, xlator_t *this, fd_t *fd, struct iatt *buf,
+ int32_t valid, dict_t *xdata)
{
- afr_local_t *local = NULL;
- call_frame_t *transaction_frame = NULL;
- int ret = -1;
- int op_errno = ENOMEM;
+ afr_local_t *local = NULL;
+ call_frame_t *transaction_frame = NULL;
+ int ret = -1;
+ int op_errno = ENOMEM;
- transaction_frame = copy_frame (frame);
- if (!transaction_frame)
- goto out;
+ AFR_ERROR_OUT_IF_FDCTX_INVALID(fd, this, op_errno, out);
+ transaction_frame = copy_frame(frame);
+ if (!transaction_frame)
+ goto out;
- local = AFR_FRAME_INIT (transaction_frame, op_errno);
- if (!local)
- goto out;
+ local = AFR_FRAME_INIT(transaction_frame, op_errno);
+ if (!local)
+ goto out;
- local->cont.fsetattr.in_buf = *buf;
- local->cont.fsetattr.valid = valid;
- if (xdata)
- local->xdata_req = dict_copy_with_ref (xdata, NULL);
- else
- local->xdata_req = dict_new ();
+ local->cont.fsetattr.in_buf = *buf;
+ local->cont.fsetattr.valid = valid;
+ if (xdata)
+ local->xdata_req = dict_copy_with_ref(xdata, NULL);
+ else
+ local->xdata_req = dict_new();
- if (!local->xdata_req)
- goto out;
+ if (!local->xdata_req)
+ goto out;
- local->transaction.wind = afr_fsetattr_wind;
- local->transaction.fop = __afr_txn_write_fop;
- local->transaction.done = __afr_txn_write_done;
- local->transaction.unwind = afr_fsetattr_unwind;
+ local->transaction.wind = afr_fsetattr_wind;
+ local->transaction.unwind = afr_fsetattr_unwind;
- local->fd = fd_ref (fd);
- local->inode = inode_ref (fd->inode);
+ local->fd = fd_ref(fd);
+ ret = afr_set_inode_local(this, local, fd->inode);
+ if (ret)
+ goto out;
- local->op = GF_FOP_FSETATTR;
+ local->op = GF_FOP_FSETATTR;
- afr_fix_open (fd, this);
+ afr_fix_open(fd, this);
- local->transaction.main_frame = frame;
- local->transaction.start = LLONG_MAX - 1;
- local->transaction.len = 0;
+ local->transaction.main_frame = frame;
+ local->transaction.start = LLONG_MAX - 1;
+ local->transaction.len = 0;
- ret = afr_transaction (transaction_frame, this, AFR_METADATA_TRANSACTION);
- if (ret < 0) {
- op_errno = -ret;
- goto out;
- }
+ ret = afr_transaction(transaction_frame, this, AFR_METADATA_TRANSACTION);
+ if (ret < 0) {
+ op_errno = -ret;
+ goto out;
+ }
- return 0;
+ return 0;
out:
- if (transaction_frame)
- AFR_STACK_DESTROY (transaction_frame);
+ if (transaction_frame)
+ AFR_STACK_DESTROY(transaction_frame);
- AFR_STACK_UNWIND (fsetattr, frame, -1, op_errno, NULL, NULL, NULL);
- return 0;
+ AFR_STACK_UNWIND(fsetattr, frame, -1, op_errno, NULL, NULL, NULL);
+ return 0;
}
-
/* {{{ setxattr */
-
int
-afr_setxattr_unwind (call_frame_t *frame, xlator_t *this)
+afr_setxattr_unwind(call_frame_t *frame, xlator_t *this)
{
- afr_local_t * local = NULL;
- call_frame_t *main_frame = NULL;
-
- local = frame->local;
+ afr_local_t *local = NULL;
+ call_frame_t *main_frame = NULL;
- main_frame = afr_transaction_detach_fop_frame (frame);
- if (!main_frame)
- return 0;
+ local = frame->local;
- AFR_STACK_UNWIND (setxattr, main_frame, local->op_ret, local->op_errno,
- local->xdata_rsp);
+ main_frame = afr_transaction_detach_fop_frame(frame);
+ if (!main_frame)
return 0;
-}
+ AFR_STACK_UNWIND(setxattr, main_frame, local->op_ret, local->op_errno,
+ local->xdata_rsp);
+ return 0;
+}
int
-afr_setxattr_wind_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
- int32_t op_ret, int32_t op_errno, dict_t *xdata)
+afr_setxattr_wind_cbk(call_frame_t *frame, void *cookie, xlator_t *this,
+ int32_t op_ret, int32_t op_errno, dict_t *xdata)
{
- return __afr_inode_write_cbk (frame, cookie, this, op_ret, op_errno,
- NULL, NULL, NULL, xdata);
+ return __afr_inode_write_cbk(frame, cookie, this, op_ret, op_errno, NULL,
+ NULL, NULL, xdata);
}
-
int
-afr_setxattr_wind (call_frame_t *frame, xlator_t *this, int subvol)
+afr_setxattr_wind(call_frame_t *frame, xlator_t *this, int subvol)
{
- afr_local_t *local = NULL;
- afr_private_t *priv = NULL;
+ afr_local_t *local = NULL;
+ afr_private_t *priv = NULL;
- local = frame->local;
- priv = this->private;
+ local = frame->local;
+ priv = this->private;
- STACK_WIND_COOKIE (frame, afr_setxattr_wind_cbk, (void *) (long) subvol,
- priv->children[subvol],
- priv->children[subvol]->fops->setxattr,
- &local->loc, local->cont.setxattr.dict,
- local->cont.setxattr.flags, local->xdata_req);
- return 0;
+ STACK_WIND_COOKIE(frame, afr_setxattr_wind_cbk, (void *)(long)subvol,
+ priv->children[subvol],
+ priv->children[subvol]->fops->setxattr, &local->loc,
+ local->cont.setxattr.dict, local->cont.setxattr.flags,
+ local->xdata_req);
+ return 0;
}
int
-afr_rb_set_pending_changelog_cbk (call_frame_t *frame, void *cookie,
- xlator_t *this, int op_ret, int op_errno,
- dict_t *xattr, dict_t *xdata)
+afr_emptyb_set_pending_changelog_cbk(call_frame_t *frame, void *cookie,
+ xlator_t *this, int op_ret, int op_errno,
+ dict_t *xattr, dict_t *xdata)
{
- afr_local_t *local = NULL;
- afr_private_t *priv = NULL;
- int i = 0;
-
- local = frame->local;
- priv = this->private;
- i = (long) cookie;
-
- local->replies[i].valid = 1;
- local->replies[i].op_ret = op_ret;
- local->replies[i].op_errno = op_errno;
- gf_msg (this->name, op_ret ? GF_LOG_ERROR : GF_LOG_INFO,
- op_ret ? op_errno : 0,
- AFR_MSG_REPLACE_BRICK_STATUS, "Set of pending xattr %s on"
- " %s.", op_ret ? "failed" : "succeeded",
- priv->children[i]->name);
+ afr_local_t *local = NULL;
+ afr_private_t *priv = NULL;
+ int i, ret = 0;
+ char *op_type = NULL;
- syncbarrier_wake (&local->barrier);
- return 0;
-}
+ local = frame->local;
+ priv = this->private;
+ i = (long)cookie;
-int
-afr_rb_set_pending_changelog (call_frame_t *frame, xlator_t *this,
- unsigned char *locked_nodes)
-{
- afr_local_t *local = NULL;
- afr_private_t *priv = NULL;
- int ret = 0, i = 0;
+ local->replies[i].valid = 1;
+ local->replies[i].op_ret = op_ret;
+ local->replies[i].op_errno = op_errno;
- local = frame->local;
- priv = this->private;
+ ret = dict_get_str_sizen(local->xdata_req, "replicate-brick-op", &op_type);
+ if (ret)
+ goto out;
- AFR_ONLIST (locked_nodes, frame, afr_rb_set_pending_changelog_cbk,
- xattrop, &local->loc, GF_XATTROP_ADD_ARRAY,
- local->xdata_req, NULL);
+ gf_smsg(this->name, op_ret ? GF_LOG_ERROR : GF_LOG_INFO,
+ op_ret ? op_errno : 0, AFR_MSG_SET_PEND_XATTR, "name=%s",
+ priv->children[i]->name, "op_ret=%s",
+ op_ret ? "failed" : "succeeded", NULL);
- /* It is sufficient if xattrop was successful on one child */
- for (i = 0; i < priv->child_count; i++) {
- if (!local->replies[i].valid)
- continue;
-
- if (local->replies[i].op_ret == 0) {
- ret = 0;
- goto out;
- } else {
- ret = afr_higher_errno (ret,
- local->replies[i].op_errno);
- }
- }
out:
- return -ret;
+ syncbarrier_wake(&local->barrier);
+ return 0;
}
int
-_afr_handle_replace_brick_type (xlator_t *this, call_frame_t *frame,
- loc_t *loc, int rb_index,
- afr_transaction_type type)
+afr_emptyb_set_pending_changelog(call_frame_t *frame, xlator_t *this,
+ unsigned char *locked_nodes)
{
- afr_local_t *local = NULL;
- afr_private_t *priv = NULL;
- unsigned char *locked_nodes = NULL;
- int count = 0;
- int ret = -ENOMEM;
- int idx = -1;
-
- priv = this->private;
- local = frame->local;
-
- locked_nodes = alloca0 (priv->child_count);
-
- idx = afr_index_for_transaction_type (type);
-
- local->pending = afr_matrix_create (priv->child_count,
- AFR_NUM_CHANGE_LOGS);
- if (!local->pending)
- goto out;
+ afr_local_t *local = NULL;
+ afr_private_t *priv = NULL;
+ int ret = 0, i = 0;
- local->pending[rb_index][idx] = hton32 (1);
+ local = frame->local;
+ priv = this->private;
- local->xdata_req = dict_new ();
- if (!local->xdata_req)
- goto out;
+ AFR_ONLIST(locked_nodes, frame, afr_emptyb_set_pending_changelog_cbk,
+ xattrop, &local->loc, GF_XATTROP_ADD_ARRAY, local->xattr_req,
+ NULL);
- ret = afr_set_pending_dict (priv, local->xdata_req, local->pending);
- if (ret < 0)
- goto out;
+ /* It is sufficient if xattrop was successful on one child */
+ for (i = 0; i < priv->child_count; i++) {
+ if (!local->replies[i].valid)
+ continue;
- if (AFR_ENTRY_TRANSACTION == type) {
- count = afr_selfheal_entrylk (frame, this, loc->inode,
- this->name, NULL, locked_nodes);
+ if (local->replies[i].op_ret == 0) {
+ ret = 0;
+ goto out;
} else {
- count = afr_selfheal_inodelk (frame, this, loc->inode,
- this->name, LLONG_MAX - 1, 0,
- locked_nodes);
- }
-
- if (!count) {
- gf_msg (this->name, GF_LOG_ERROR, EAGAIN,
- AFR_MSG_REPLACE_BRICK_STATUS, "Couldn't acquire lock on"
- " any child.");
- ret = -EAGAIN;
- goto unlock;
+ ret = afr_higher_errno(ret, local->replies[i].op_errno);
}
+ }
+out:
+ return -ret;
+}
- ret = afr_rb_set_pending_changelog (frame, this, locked_nodes);
- if (ret)
- goto unlock;
- ret = 0;
+static int
+_afr_handle_empty_brick_type(xlator_t *this, call_frame_t *frame, loc_t *loc,
+ int empty_index, afr_transaction_type type,
+ char *op_type, const int op_type_len)
+{
+ int count = 0;
+ int ret = -ENOMEM;
+ int idx = -1;
+ int d_idx = -1;
+ unsigned char *locked_nodes = NULL;
+ afr_local_t *local = NULL;
+ afr_private_t *priv = NULL;
+
+ priv = this->private;
+ local = frame->local;
+
+ locked_nodes = alloca0(priv->child_count);
+
+ idx = afr_index_for_transaction_type(type);
+ d_idx = afr_index_for_transaction_type(AFR_DATA_TRANSACTION);
+
+ local->pending = afr_matrix_create(priv->child_count, AFR_NUM_CHANGE_LOGS);
+ if (!local->pending)
+ goto out;
+
+ local->pending[empty_index][idx] = hton32(1);
+
+ if ((priv->esh_granular) && (type == AFR_ENTRY_TRANSACTION))
+ local->pending[empty_index][d_idx] = hton32(1);
+
+ local->xdata_req = dict_new();
+ if (!local->xdata_req)
+ goto out;
+
+ ret = dict_set_nstrn(local->xdata_req, "replicate-brick-op",
+ SLEN("replicate-brick-op"), op_type, op_type_len);
+ if (ret)
+ goto out;
+
+ local->xattr_req = dict_new();
+ if (!local->xattr_req)
+ goto out;
+
+ ret = afr_set_pending_dict(priv, local->xattr_req, local->pending);
+ if (ret < 0)
+ goto out;
+
+ if (AFR_ENTRY_TRANSACTION == type) {
+ count = afr_selfheal_entrylk(frame, this, loc->inode, this->name, NULL,
+ locked_nodes);
+ } else {
+ count = afr_selfheal_inodelk(frame, this, loc->inode, this->name,
+ LLONG_MAX - 1, 0, locked_nodes);
+ }
+
+ if (!count) {
+ gf_smsg(this->name, GF_LOG_ERROR, EAGAIN, AFR_MSG_REPLACE_BRICK_STATUS,
+ NULL);
+ ret = -EAGAIN;
+ goto unlock;
+ }
+
+ ret = afr_emptyb_set_pending_changelog(frame, this, locked_nodes);
+ if (ret)
+ goto unlock;
+ ret = 0;
unlock:
- if (AFR_ENTRY_TRANSACTION == type) {
- afr_selfheal_unentrylk (frame, this, loc->inode, this->name,
- NULL, locked_nodes);
- } else {
- afr_selfheal_uninodelk (frame, this, loc->inode, this->name,
- LLONG_MAX - 1, 0, locked_nodes);
- }
+ if (AFR_ENTRY_TRANSACTION == type) {
+ afr_selfheal_unentrylk(frame, this, loc->inode, this->name, NULL,
+ locked_nodes, NULL);
+ } else {
+ afr_selfheal_uninodelk(frame, this, loc->inode, this->name,
+ LLONG_MAX - 1, 0, locked_nodes);
+ }
out:
- return ret;
+ return ret;
}
-int
-_afr_handle_replace_brick_cbk (int ret, call_frame_t *frame, void *opaque)
-{
- afr_replace_brick_args_t *data = NULL;
-
- data = opaque;
- loc_wipe (&data->loc);
- GF_FREE (data);
- return 0;
+void
+afr_brick_args_cleanup(void *opaque)
+{
+ afr_empty_brick_args_t *data = NULL;
+
+ data = opaque;
+ loc_wipe(&data->loc);
+ GF_FREE(data);
+}
+
+int
+_afr_handle_empty_brick_cbk(int ret, call_frame_t *frame, void *opaque)
+{
+ afr_brick_args_cleanup(opaque);
+ return 0;
+}
+
+int
+_afr_handle_empty_brick(void *opaque)
+{
+ afr_local_t *local = NULL;
+ afr_private_t *priv = NULL;
+ int empty_index = -1;
+ int ret = -1;
+ int op_errno = ENOMEM;
+ call_frame_t *frame = NULL;
+ xlator_t *this = NULL;
+ char *op_type = NULL;
+ int op_type_len = 0;
+ afr_empty_brick_args_t *data = NULL;
+ call_frame_t *op_frame = NULL;
+
+ data = opaque;
+ frame = data->frame;
+ empty_index = data->empty_index;
+ if (!data->op_type)
+ goto out;
+
+ op_frame = copy_frame(frame);
+ if (!op_frame) {
+ ret = -1;
+ op_errno = ENOMEM;
+ goto out;
+ }
+
+ op_type = data->op_type;
+ op_type_len = strlen(op_type);
+ this = op_frame->this;
+ priv = this->private;
+
+ afr_set_lk_owner(op_frame, this, op_frame->root);
+ local = AFR_FRAME_INIT(op_frame, op_errno);
+ if (!local)
+ goto out;
+
+ loc_copy(&local->loc, &data->loc);
+
+ gf_smsg(this->name, GF_LOG_INFO, 0, AFR_MSG_NEW_BRICK, "name=%s",
+ priv->children[empty_index]->name, NULL);
+
+ ret = _afr_handle_empty_brick_type(this, op_frame, &local->loc, empty_index,
+ AFR_METADATA_TRANSACTION, op_type,
+ op_type_len);
+ if (ret) {
+ op_errno = -ret;
+ ret = -1;
+ goto out;
+ }
+
+ dict_unref(local->xdata_req);
+ dict_unref(local->xattr_req);
+ afr_matrix_cleanup(local->pending, priv->child_count);
+ local->pending = NULL;
+ local->xattr_req = NULL;
+ local->xdata_req = NULL;
+
+ ret = _afr_handle_empty_brick_type(this, op_frame, &local->loc, empty_index,
+ AFR_ENTRY_TRANSACTION, op_type,
+ op_type_len);
+ if (ret) {
+ op_errno = -ret;
+ ret = -1;
+ goto out;
+ }
+ ret = 0;
+out:
+ if (op_frame) {
+ AFR_STACK_DESTROY(op_frame);
+ }
+ AFR_STACK_UNWIND(setxattr, frame, ret, op_errno, NULL);
+ return 0;
+}
+
+int
+afr_split_brain_resolve_do(call_frame_t *frame, xlator_t *this, loc_t *loc,
+ char *data)
+{
+ afr_local_t *local = NULL;
+ int ret = -1;
+ int op_errno = EINVAL;
+
+ local = frame->local;
+ local->xdata_req = dict_new();
+
+ if (!local->xdata_req) {
+ op_errno = ENOMEM;
+ goto out;
+ }
+
+ ret = dict_set_int32_sizen(local->xdata_req, "heal-op",
+ GF_SHD_OP_SBRAIN_HEAL_FROM_BRICK);
+ if (ret) {
+ op_errno = -ret;
+ ret = -1;
+ goto out;
+ }
+ ret = dict_set_str_sizen(local->xdata_req, "child-name", data);
+ if (ret) {
+ op_errno = -ret;
+ ret = -1;
+ goto out;
+ }
+ /* set spb choice to -1 whether heal succeeds or not:
+ * If heal succeeds : spb-choice should be set to -1 as
+ * it is no longer valid; file is not
+ * in split-brain anymore.
+ * If heal doesn't succeed:
+ * spb-choice should be set to -1
+ * otherwise reads will be served
+ * from spb-choice which is misleading.
+ */
+ ret = afr_inode_split_brain_choice_set(loc->inode, this, -1);
+ if (ret)
+ gf_smsg(this->name, GF_LOG_WARNING, 0, AFR_MSG_SPLIT_BRAIN_SET_FAILED,
+ NULL);
+ afr_heal_splitbrain_file(frame, this, loc);
+ ret = 0;
+out:
+ if (ret < 0)
+ AFR_STACK_UNWIND(setxattr, frame, -1, op_errno, NULL);
+ return 0;
}
int
-_afr_handle_replace_brick (void *opaque)
+afr_get_split_brain_child_index(xlator_t *this, void *value, size_t len)
{
+ int spb_child_index = -1;
+ char *spb_child_str = NULL;
- afr_local_t *local = NULL;
- afr_private_t *priv = NULL;
- int rb_index = -1;
- int ret = -1;
- int op_errno = ENOMEM;
- call_frame_t *frame = NULL;
- xlator_t *this = NULL;
- afr_replace_brick_args_t *data = NULL;
+ spb_child_str = alloca0(len + 1);
+ memcpy(spb_child_str, value, len);
- data = opaque;
- frame = data->frame;
- rb_index = data->rb_index;
- this = frame->this;
- priv = this->private;
+ if (!strcmp(spb_child_str, "none"))
+ return -2;
- local = AFR_FRAME_INIT (frame, op_errno);
- if (!local)
- goto out;
-
- loc_copy (&local->loc, &data->loc);
+ spb_child_index = afr_get_child_index_from_name(this, spb_child_str);
+ if (spb_child_index < 0) {
+ gf_smsg(this->name, GF_LOG_ERROR, 0, AFR_MSG_INVALID_SUBVOL,
+ "subvol=%s", spb_child_str, NULL);
+ }
+ return spb_child_index;
+}
- gf_msg_debug (this->name, 0, "Child being replaced is : %s",
- priv->children[rb_index]->name);
+int
+afr_can_set_split_brain_choice(void *opaque)
+{
+ afr_spbc_timeout_t *data = opaque;
+ call_frame_t *frame = NULL;
+ xlator_t *this = NULL;
+ loc_t *loc = NULL;
+ int ret = -1;
- ret = _afr_handle_replace_brick_type (this, frame, &local->loc, rb_index,
- AFR_METADATA_TRANSACTION);
- if (ret) {
- op_errno = -ret;
- ret = -1;
- goto out;
- }
+ frame = data->frame;
+ loc = data->loc;
+ this = frame->this;
- dict_unref (local->xdata_req);
- afr_matrix_cleanup (local->pending, priv->child_count);
- local->pending = NULL;
- local->xdata_req = NULL;
+ ret = afr_is_split_brain(frame, this, loc->inode, loc->gfid, &data->d_spb,
+ &data->m_spb);
- ret = _afr_handle_replace_brick_type (this, frame, &local->loc, rb_index,
- AFR_ENTRY_TRANSACTION);
- if (ret) {
- op_errno = -ret;
- ret = -1;
- goto out;
- }
- ret = 0;
-out:
- AFR_STACK_UNWIND (setxattr, frame, ret, op_errno, NULL);
- return 0;
+ if (ret)
+ gf_smsg(this->name, GF_LOG_ERROR, 0,
+ AFR_MSG_SPLIT_BRAIN_DETERMINE_FAILED, "gfid=%s",
+ uuid_utoa(loc->gfid), NULL);
+ return ret;
}
-
int
-afr_split_brain_resolve_do (call_frame_t *frame, xlator_t *this, loc_t *loc,
- char *data)
+afr_handle_split_brain_commands(xlator_t *this, call_frame_t *frame, loc_t *loc,
+ dict_t *dict)
{
- afr_local_t *local = NULL;
- int ret = -1;
- int op_errno = EINVAL;
+ void *choice_value = NULL;
+ void *resolve_value = NULL;
+ afr_private_t *priv = NULL;
+ afr_local_t *local = NULL;
+ afr_spbc_timeout_t *data = NULL;
+ int len = 0;
+ int spb_child_index = -1;
+ int ret = -1;
+ int op_errno = EINVAL;
+
+ priv = this->private;
- local = frame->local;
- local->xdata_req = dict_new ();
+ ret = dict_get_ptr_and_len(dict, GF_AFR_SBRAIN_CHOICE, &choice_value, &len);
+ ret = dict_get_ptr_and_len(dict, GF_AFR_SBRAIN_RESOLVE, &resolve_value,
+ &len);
+ if (!choice_value && !resolve_value) {
+ ret = -1;
+ goto out;
+ }
- if (!local->xdata_req) {
- op_errno = ENOMEM;
+ local = AFR_FRAME_INIT(frame, op_errno);
+ if (!local) {
+ ret = 1;
+ goto out;
+ }
+
+ local->op = GF_FOP_SETXATTR;
+
+ if (choice_value) {
+ spb_child_index = afr_get_split_brain_child_index(this, choice_value,
+ len);
+ if (spb_child_index < 0) {
+ /* Case where value was "none" */
+ if (spb_child_index == -2)
+ spb_child_index = -1;
+ else {
+ ret = 1;
+ op_errno = EINVAL;
goto out;
+ }
}
- ret = dict_set_int32 (local->xdata_req, "heal-op",
- GF_SHD_OP_SBRAIN_HEAL_FROM_BRICK);
- if (ret) {
- op_errno = -ret;
- ret = -1;
- goto out;
+ data = GF_CALLOC(1, sizeof(*data), gf_afr_mt_spbc_timeout_t);
+ if (!data) {
+ ret = 1;
+ goto out;
}
- ret = dict_set_str (local->xdata_req, "child-name", data);
+ data->spb_child_index = spb_child_index;
+ data->frame = frame;
+ loc_copy(&local->loc, loc);
+ data->loc = &local->loc;
+ ret = synctask_new(this->ctx->env, afr_can_set_split_brain_choice,
+ afr_set_split_brain_choice, NULL, data);
if (ret) {
- op_errno = -ret;
- ret = -1;
- goto out;
+ gf_smsg(this->name, GF_LOG_ERROR, 0, AFR_MSG_SPLIT_BRAIN_STATUS,
+ "name=%s", loc->name, NULL);
+ ret = 1;
+ op_errno = ENOMEM;
+ goto out;
}
- /* set spb choice to -1 whether heal succeeds or not:
- * If heal succeeds : spb-choice should be set to -1 as
- * it is no longer valid; file is not
- * in split-brain anymore.
- * If heal doesn't succeed:
- * spb-choice should be set to -1
- * otherwise reads will be served
- * from spb-choice which is misleading.
- */
- ret = afr_inode_split_brain_choice_set (loc->inode, this, -1);
- if (ret)
- gf_msg (this->name, GF_LOG_WARNING, 0,
- AFR_MSG_SPLIT_BRAIN_CHOICE_ERROR, "Failed to set"
- "split-brain choice to -1");
- afr_heal_splitbrain_file (frame, this, loc);
ret = 0;
-out:
- if (ret < 0)
- AFR_STACK_UNWIND (setxattr, frame, -1, op_errno, NULL);
- return 0;
-}
-
-int
-afr_get_split_brain_child_index (xlator_t *this, void *value, size_t len)
-{
- int spb_child_index = -1;
- char *spb_child_str = NULL;
-
- spb_child_str = alloca0 (len + 1);
- memcpy (spb_child_str, value, len);
-
- if (!strcmp (spb_child_str, "none"))
- return -2;
+ goto out;
+ }
- spb_child_index = afr_get_child_index_from_name (this,
- spb_child_str);
+ if (resolve_value) {
+ spb_child_index = afr_get_split_brain_child_index(this, resolve_value,
+ len);
if (spb_child_index < 0) {
- gf_msg (this->name, GF_LOG_ERROR, 0,
- AFR_MSG_INVALID_SUBVOL, "Invalid subvol: %s",
- spb_child_str);
+ ret = 1;
+ goto out;
}
- return spb_child_index;
+
+ afr_split_brain_resolve_do(frame, this, loc,
+ priv->children[spb_child_index]->name);
+ ret = 0;
+ }
+out:
+ /* key was correct but value was invalid when ret == 1 */
+ if (ret == 1) {
+ AFR_STACK_UNWIND(setxattr, frame, -1, op_errno, NULL);
+ if (data)
+ GF_FREE(data);
+ ret = 0;
+ }
+ return ret;
}
int
-afr_can_set_split_brain_choice (void *opaque)
+afr_handle_spb_choice_timeout(xlator_t *this, call_frame_t *frame, dict_t *dict)
{
- afr_spbc_timeout_t *data = opaque;
- call_frame_t *frame = NULL;
- xlator_t *this = NULL;
- loc_t *loc = NULL;
- int ret = -1;
+ int ret = -1;
+ int op_errno = 0;
+ uint64_t timeout = 0;
+ afr_private_t *priv = NULL;
- frame = data->frame;
- loc = data->loc;
- this = frame->this;
+ priv = this->private;
- ret = afr_is_split_brain (frame, this, loc->inode, loc->gfid,
- &data->d_spb, &data->m_spb);
+ ret = dict_get_uint64(dict, GF_AFR_SPB_CHOICE_TIMEOUT, &timeout);
+ if (!ret) {
+ priv->spb_choice_timeout = timeout * 60;
+ AFR_STACK_UNWIND(setxattr, frame, ret, op_errno, NULL);
+ }
- if (ret)
- gf_msg (this->name, GF_LOG_ERROR, 0,
- AFR_MSG_SPLIT_BRAIN_CHOICE_ERROR,
- "Failed to determine if %s"
- " is in split-brain. "
- "Aborting split-brain-choice set.",
- uuid_utoa (loc->gfid));
- return ret;
+ return ret;
}
int
-afr_handle_split_brain_commands (xlator_t *this, call_frame_t *frame,
- loc_t *loc, dict_t *dict)
+afr_handle_empty_brick(xlator_t *this, call_frame_t *frame, loc_t *loc,
+ dict_t *dict)
{
- void *value = NULL;
- afr_private_t *priv = NULL;
- afr_local_t *local = NULL;
- afr_spbc_timeout_t *data = NULL;
- int len = 0;
- int spb_child_index = -1;
- int ret = -1;
- int op_errno = EINVAL;
+ int ret = -1;
+ int ab_ret = -1;
+ int empty_index = -1;
+ int op_errno = EPERM;
+ char *empty_brick = NULL;
+ char *op_type = NULL;
+ afr_empty_brick_args_t *data = NULL;
- priv = this->private;
+ ret = dict_get_str_sizen(dict, GF_AFR_REPLACE_BRICK, &empty_brick);
+ if (!ret)
+ op_type = GF_AFR_REPLACE_BRICK;
- local = AFR_FRAME_INIT (frame, op_errno);
- if (!local) {
- ret = 1;
- goto out;
- }
+ ab_ret = dict_get_str_sizen(dict, GF_AFR_ADD_BRICK, &empty_brick);
+ if (!ab_ret)
+ op_type = GF_AFR_ADD_BRICK;
- local->op = GF_FOP_SETXATTR;
-
- ret = dict_get_ptr_and_len (dict, GF_AFR_SBRAIN_CHOICE, &value,
- &len);
- if (value) {
- spb_child_index = afr_get_split_brain_child_index (this, value,
- len);
- if (spb_child_index < 0) {
- /* Case where value was "none" */
- if (spb_child_index == -2)
- spb_child_index = -1;
- else {
- ret = 1;
- op_errno = EINVAL;
- goto out;
- }
- }
-
- data = GF_CALLOC (1, sizeof (*data), gf_afr_mt_spbc_timeout_t);
- if (!data) {
- ret = 1;
- goto out;
- }
- data->spb_child_index = spb_child_index;
- data->frame = frame;
- data->loc = loc;
- ret = synctask_new (this->ctx->env,
- afr_can_set_split_brain_choice,
- afr_set_split_brain_choice, NULL, data);
- if (ret) {
- gf_msg (this->name, GF_LOG_ERROR, 0,
- AFR_MSG_SPLIT_BRAIN_CHOICE_ERROR,
- "Failed to create"
- " synctask. Aborting split-brain choice set"
- " for %s", loc->name);
- ret = 1;
- op_errno = ENOMEM;
- goto out;
- }
- ret = 0;
- goto out;
- }
+ if (ret && ab_ret)
+ goto out;
- ret = dict_get_ptr_and_len (dict, GF_AFR_SBRAIN_RESOLVE, &value, &len);
- if (value) {
- spb_child_index = afr_get_split_brain_child_index (this, value,
- len);
- if (spb_child_index < 0) {
- ret = 1;
- goto out;
- }
-
- afr_split_brain_resolve_do (frame, this, loc,
- priv->children[spb_child_index]->name);
- ret = 0;
- }
-out:
- /* key was correct but value was invalid when ret == 1 */
- if (ret == 1) {
- AFR_STACK_UNWIND (setxattr, frame, -1, op_errno, NULL);
- if (data)
- GF_FREE (data);
- ret = 0;
- }
- return ret;
-}
-
-int
-afr_handle_spb_choice_timeout (xlator_t *this, call_frame_t *frame,
- dict_t *dict)
-{
- int ret = -1;
- int op_errno = 0;
- uint64_t timeout = 0;
- afr_private_t *priv = NULL;
-
- priv = this->private;
+ if (frame->root->pid != GF_CLIENT_PID_ADD_REPLICA_MOUNT) {
+ gf_smsg(this->name, GF_LOG_ERROR, EPERM, AFR_MSG_INTERNAL_ATTR,
+ "op_type=%s", op_type, NULL);
+ ret = 1;
+ goto out;
+ }
+ empty_index = afr_get_child_index_from_name(this, empty_brick);
- ret = dict_get_uint64 (dict, GF_AFR_SPB_CHOICE_TIMEOUT, &timeout);
- if (!ret) {
- priv->spb_choice_timeout = timeout * 60;
- AFR_STACK_UNWIND (setxattr, frame, ret, op_errno, NULL);
+ if (empty_index < 0) {
+ /* Didn't belong to this replica pair
+ * Just do a no-op
+ */
+ AFR_STACK_UNWIND(setxattr, frame, 0, 0, NULL);
+ return 0;
+ } else {
+ data = GF_CALLOC(1, sizeof(*data), gf_afr_mt_empty_brick_t);
+ if (!data) {
+ ret = 1;
+ op_errno = ENOMEM;
+ goto out;
}
-
- return ret;
-}
-
-int
-afr_handle_replace_brick (xlator_t *this, call_frame_t *frame, loc_t *loc,
- dict_t *dict)
-{
- int ret = -1;
- int rb_index = -1;
- int op_errno = EPERM;
- char *replace_brick = NULL;
- afr_replace_brick_args_t *data = NULL;
-
- ret = dict_get_str (dict, GF_AFR_REPLACE_BRICK, &replace_brick);
-
- if (!ret) {
- if (frame->root->pid != GF_CLIENT_PID_SELF_HEALD) {
- gf_msg (this->name, GF_LOG_ERROR, EPERM,
- AFR_MSG_REPLACE_BRICK_STATUS, "'%s' is an "
- "internal extended attribute",
- GF_AFR_REPLACE_BRICK);
- ret = 1;
- goto out;
- }
- rb_index = afr_get_child_index_from_name (this, replace_brick);
-
- if (rb_index < 0) {
- /* Didn't belong to this replica pair
- * Just do a no-op
- */
- AFR_STACK_UNWIND (setxattr, frame, 0, 0, NULL);
- return 0;
- } else {
- data = GF_CALLOC (1, sizeof (*data),
- gf_afr_mt_replace_brick_t);
- if (!data) {
- ret = 1;
- op_errno = ENOMEM;
- goto out;
- }
- data->frame = frame;
- loc_copy (&data->loc, loc);
- data->rb_index = rb_index;
- ret = synctask_new (this->ctx->env,
- _afr_handle_replace_brick,
- _afr_handle_replace_brick_cbk,
- NULL, data);
- if (ret) {
- gf_msg (this->name, GF_LOG_ERROR, 0,
- AFR_MSG_REPLACE_BRICK_FAILED,
- "Failed to create synctask. Unable to "
- "perform replace-brick.");
- ret = 1;
- op_errno = ENOMEM;
- loc_wipe (&data->loc);
- GF_FREE (data);
- goto out;
- }
- }
- ret = 0;
+ data->frame = frame;
+ loc_copy(&data->loc, loc);
+ data->empty_index = empty_index;
+ data->op_type = op_type;
+ ret = synctask_new(this->ctx->env, _afr_handle_empty_brick,
+ _afr_handle_empty_brick_cbk, NULL, data);
+ if (ret) {
+ gf_smsg(this->name, GF_LOG_ERROR, 0, AFR_MSG_SPLIT_BRAIN_STATUS,
+ NULL);
+ ret = 1;
+ op_errno = ENOMEM;
+ afr_brick_args_cleanup(data);
+ goto out;
}
+ }
+ ret = 0;
out:
- if (ret == 1) {
- AFR_STACK_UNWIND (setxattr, frame, -1, op_errno, NULL);
- ret = 0;
- }
- return ret;
+ if (ret == 1) {
+ AFR_STACK_UNWIND(setxattr, frame, -1, op_errno, NULL);
+ ret = 0;
+ }
+ return ret;
}
static int
-afr_handle_special_xattr (xlator_t *this, call_frame_t *frame, loc_t *loc,
- dict_t *dict)
+afr_handle_special_xattr(xlator_t *this, call_frame_t *frame, loc_t *loc,
+ dict_t *dict)
{
- int ret = -1;
+ int ret = -1;
- ret = afr_handle_split_brain_commands (this, frame, loc, dict);
- if (ret == 0)
- goto out;
+ ret = afr_handle_split_brain_commands(this, frame, loc, dict);
+ if (ret == 0)
+ goto out;
- ret = afr_handle_spb_choice_timeout (this, frame, dict);
- if (ret == 0)
- goto out;
+ ret = afr_handle_spb_choice_timeout(this, frame, dict);
+ if (ret == 0)
+ goto out;
- ret = afr_handle_replace_brick (this, frame, loc, dict);
+ /* Applicable for replace-brick and add-brick commands */
+ ret = afr_handle_empty_brick(this, frame, loc, dict);
out:
- return ret;
+ return ret;
}
int
-afr_setxattr (call_frame_t *frame, xlator_t *this, loc_t *loc, dict_t *dict,
- int32_t flags, dict_t *xdata)
+afr_setxattr(call_frame_t *frame, xlator_t *this, loc_t *loc, dict_t *dict,
+ int32_t flags, dict_t *xdata)
{
- afr_local_t *local = NULL;
- call_frame_t *transaction_frame = NULL;
- int ret = -1;
- int op_errno = EINVAL;
+ afr_local_t *local = NULL;
+ call_frame_t *transaction_frame = NULL;
+ int ret = -1;
+ int op_errno = EINVAL;
- GF_IF_INTERNAL_XATTR_GOTO ("trusted.afr.*", dict,
- op_errno, out);
+ GF_IF_INTERNAL_XATTR_GOTO("trusted.afr.*", dict, op_errno, out);
- GF_IF_INTERNAL_XATTR_GOTO ("trusted.glusterfs.afr.*", dict,
- op_errno, out);
+ GF_IF_INTERNAL_XATTR_GOTO("trusted.glusterfs.afr.*", dict, op_errno, out);
- ret = afr_handle_special_xattr (this, frame, loc, dict);
- if (ret == 0)
- return 0;
+ ret = afr_handle_special_xattr(this, frame, loc, dict);
+ if (ret == 0)
+ return 0;
- transaction_frame = copy_frame (frame);
- if (!transaction_frame)
- goto out;
+ transaction_frame = copy_frame(frame);
+ if (!transaction_frame)
+ goto out;
- local = AFR_FRAME_INIT (transaction_frame, op_errno);
- if (!local)
- goto out;
+ local = AFR_FRAME_INIT(transaction_frame, op_errno);
+ if (!local)
+ goto out;
- local->cont.setxattr.dict = dict_ref (dict);
- local->cont.setxattr.flags = flags;
- if (xdata)
- local->xdata_req = dict_copy_with_ref (xdata, NULL);
- else
- local->xdata_req = dict_new ();
+ local->cont.setxattr.dict = dict_ref(dict);
+ local->cont.setxattr.flags = flags;
+ if (xdata)
+ local->xdata_req = dict_copy_with_ref(xdata, NULL);
+ else
+ local->xdata_req = dict_new();
- if (!local->xdata_req)
- goto out;
+ if (!local->xdata_req)
+ goto out;
- local->transaction.wind = afr_setxattr_wind;
- local->transaction.fop = __afr_txn_write_fop;
- local->transaction.done = __afr_txn_write_done;
- local->transaction.unwind = afr_setxattr_unwind;
+ local->transaction.wind = afr_setxattr_wind;
+ local->transaction.unwind = afr_setxattr_unwind;
- loc_copy (&local->loc, loc);
- local->inode = inode_ref (loc->inode);
+ loc_copy(&local->loc, loc);
+ ret = afr_set_inode_local(this, local, loc->inode);
+ if (ret)
+ goto out;
- local->transaction.main_frame = frame;
- local->transaction.start = LLONG_MAX - 1;
- local->transaction.len = 0;
+ local->transaction.main_frame = frame;
+ local->transaction.start = LLONG_MAX - 1;
+ local->transaction.len = 0;
- local->op = GF_FOP_SETXATTR;
+ local->op = GF_FOP_SETXATTR;
- ret = afr_transaction (transaction_frame, this, AFR_METADATA_TRANSACTION);
- if (ret < 0) {
- op_errno = -ret;
- goto out;
- }
+ ret = afr_transaction(transaction_frame, this, AFR_METADATA_TRANSACTION);
+ if (ret < 0) {
+ op_errno = -ret;
+ goto out;
+ }
- return 0;
+ return 0;
out:
- if (transaction_frame)
- AFR_STACK_DESTROY (transaction_frame);
+ if (transaction_frame)
+ AFR_STACK_DESTROY(transaction_frame);
- AFR_STACK_UNWIND (setxattr, frame, -1, op_errno, NULL);
+ AFR_STACK_UNWIND(setxattr, frame, -1, op_errno, NULL);
- return 0;
+ return 0;
}
/* {{{ fsetxattr */
-
int
-afr_fsetxattr_unwind (call_frame_t *frame, xlator_t *this)
+afr_fsetxattr_unwind(call_frame_t *frame, xlator_t *this)
{
- afr_local_t *local = NULL;
- call_frame_t *main_frame = NULL;
-
- local = frame->local;
+ afr_local_t *local = NULL;
+ call_frame_t *main_frame = NULL;
- main_frame = afr_transaction_detach_fop_frame (frame);
- if (!main_frame)
- return 0;
+ local = frame->local;
- AFR_STACK_UNWIND (fsetxattr, main_frame, local->op_ret, local->op_errno,
- local->xdata_rsp);
+ main_frame = afr_transaction_detach_fop_frame(frame);
+ if (!main_frame)
return 0;
-}
+ AFR_STACK_UNWIND(fsetxattr, main_frame, local->op_ret, local->op_errno,
+ local->xdata_rsp);
+ return 0;
+}
int
-afr_fsetxattr_wind_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
- int32_t op_ret, int32_t op_errno, dict_t *xdata)
+afr_fsetxattr_wind_cbk(call_frame_t *frame, void *cookie, xlator_t *this,
+ int32_t op_ret, int32_t op_errno, dict_t *xdata)
{
- return __afr_inode_write_cbk (frame, cookie, this, op_ret, op_errno,
- NULL, NULL, NULL, xdata);
+ return __afr_inode_write_cbk(frame, cookie, this, op_ret, op_errno, NULL,
+ NULL, NULL, xdata);
}
-
int
-afr_fsetxattr_wind (call_frame_t *frame, xlator_t *this, int subvol)
+afr_fsetxattr_wind(call_frame_t *frame, xlator_t *this, int subvol)
{
- afr_local_t *local = NULL;
- afr_private_t *priv = NULL;
+ afr_local_t *local = NULL;
+ afr_private_t *priv = NULL;
- local = frame->local;
- priv = this->private;
+ local = frame->local;
+ priv = this->private;
- STACK_WIND_COOKIE (frame, afr_fsetxattr_wind_cbk, (void *) (long) subvol,
- priv->children[subvol],
- priv->children[subvol]->fops->fsetxattr,
- local->fd, local->cont.fsetxattr.dict,
- local->cont.fsetxattr.flags, local->xdata_req);
- return 0;
+ STACK_WIND_COOKIE(frame, afr_fsetxattr_wind_cbk, (void *)(long)subvol,
+ priv->children[subvol],
+ priv->children[subvol]->fops->fsetxattr, local->fd,
+ local->cont.fsetxattr.dict, local->cont.fsetxattr.flags,
+ local->xdata_req);
+ return 0;
}
-
int
-afr_fsetxattr (call_frame_t *frame, xlator_t *this,
- fd_t *fd, dict_t *dict, int32_t flags, dict_t *xdata)
+afr_fsetxattr(call_frame_t *frame, xlator_t *this, fd_t *fd, dict_t *dict,
+ int32_t flags, dict_t *xdata)
{
- afr_local_t *local = NULL;
- call_frame_t *transaction_frame = NULL;
- int ret = -1;
- int op_errno = ENOMEM;
+ afr_local_t *local = NULL;
+ call_frame_t *transaction_frame = NULL;
+ int ret = -1;
+ int op_errno = ENOMEM;
- GF_IF_INTERNAL_XATTR_GOTO ("trusted.afr.*", dict,
- op_errno, out);
+ GF_IF_INTERNAL_XATTR_GOTO("trusted.afr.*", dict, op_errno, out);
- GF_IF_INTERNAL_XATTR_GOTO ("trusted.glusterfs.afr.*", dict,
- op_errno, out);
+ GF_IF_INTERNAL_XATTR_GOTO("trusted.glusterfs.afr.*", dict, op_errno, out);
- transaction_frame = copy_frame (frame);
- if (!transaction_frame)
- goto out;
+ AFR_ERROR_OUT_IF_FDCTX_INVALID(fd, this, op_errno, out);
+ transaction_frame = copy_frame(frame);
+ if (!transaction_frame)
+ goto out;
- local = AFR_FRAME_INIT (transaction_frame, op_errno);
- if (!local)
- goto out;
+ local = AFR_FRAME_INIT(transaction_frame, op_errno);
+ if (!local)
+ goto out;
- local->cont.fsetxattr.dict = dict_ref (dict);
- local->cont.fsetxattr.flags = flags;
+ local->cont.fsetxattr.dict = dict_ref(dict);
+ local->cont.fsetxattr.flags = flags;
- if (xdata)
- local->xdata_req = dict_copy_with_ref (xdata, NULL);
- else
- local->xdata_req = dict_new ();
+ if (xdata)
+ local->xdata_req = dict_copy_with_ref(xdata, NULL);
+ else
+ local->xdata_req = dict_new();
- if (!local->xdata_req)
- goto out;
+ if (!local->xdata_req)
+ goto out;
- local->transaction.wind = afr_fsetxattr_wind;
- local->transaction.fop = __afr_txn_write_fop;
- local->transaction.done = __afr_txn_write_done;
- local->transaction.unwind = afr_fsetxattr_unwind;
+ local->transaction.wind = afr_fsetxattr_wind;
+ local->transaction.unwind = afr_fsetxattr_unwind;
- local->fd = fd_ref (fd);
- local->inode = inode_ref (fd->inode);
+ local->fd = fd_ref(fd);
+ ret = afr_set_inode_local(this, local, fd->inode);
+ if (ret)
+ goto out;
- local->op = GF_FOP_FSETXATTR;
+ local->op = GF_FOP_FSETXATTR;
- local->transaction.main_frame = frame;
- local->transaction.start = LLONG_MAX - 1;
- local->transaction.len = 0;
+ local->transaction.main_frame = frame;
+ local->transaction.start = LLONG_MAX - 1;
+ local->transaction.len = 0;
- ret = afr_transaction (transaction_frame, this, AFR_METADATA_TRANSACTION);
- if (ret < 0) {
- op_errno = -ret;
- goto out;
- }
+ ret = afr_transaction(transaction_frame, this, AFR_METADATA_TRANSACTION);
+ if (ret < 0) {
+ op_errno = -ret;
+ goto out;
+ }
- return 0;
+ return 0;
out:
- if (transaction_frame)
- AFR_STACK_DESTROY (transaction_frame);
+ if (transaction_frame)
+ AFR_STACK_DESTROY(transaction_frame);
- AFR_STACK_UNWIND (fsetxattr, frame, -1, op_errno, NULL);
- return 0;
+ AFR_STACK_UNWIND(fsetxattr, frame, -1, op_errno, NULL);
+ return 0;
}
/* }}} */
-
/* {{{ removexattr */
-
int
-afr_removexattr_unwind (call_frame_t *frame, xlator_t *this)
+afr_removexattr_unwind(call_frame_t *frame, xlator_t *this)
{
- afr_local_t * local = NULL;
- call_frame_t *main_frame = NULL;
+ afr_local_t *local = NULL;
+ call_frame_t *main_frame = NULL;
- local = frame->local;
+ local = frame->local;
- main_frame = afr_transaction_detach_fop_frame (frame);
- if (!main_frame)
- return 0;
-
- AFR_STACK_UNWIND (removexattr, main_frame, local->op_ret, local->op_errno,
- local->xdata_rsp);
+ main_frame = afr_transaction_detach_fop_frame(frame);
+ if (!main_frame)
return 0;
-}
+ AFR_STACK_UNWIND(removexattr, main_frame, local->op_ret, local->op_errno,
+ local->xdata_rsp);
+ return 0;
+}
int
-afr_removexattr_wind_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
- int32_t op_ret, int32_t op_errno, dict_t *xdata)
+afr_removexattr_wind_cbk(call_frame_t *frame, void *cookie, xlator_t *this,
+ int32_t op_ret, int32_t op_errno, dict_t *xdata)
{
- return __afr_inode_write_cbk (frame, cookie, this, op_ret, op_errno,
- NULL, NULL, NULL, xdata);
+ return __afr_inode_write_cbk(frame, cookie, this, op_ret, op_errno, NULL,
+ NULL, NULL, xdata);
}
-
int
-afr_removexattr_wind (call_frame_t *frame, xlator_t *this, int subvol)
+afr_removexattr_wind(call_frame_t *frame, xlator_t *this, int subvol)
{
- afr_local_t *local = NULL;
- afr_private_t *priv = NULL;
+ afr_local_t *local = NULL;
+ afr_private_t *priv = NULL;
- local = frame->local;
- priv = this->private;
+ local = frame->local;
+ priv = this->private;
- STACK_WIND_COOKIE (frame, afr_removexattr_wind_cbk, (void *) (long) subvol,
- priv->children[subvol],
- priv->children[subvol]->fops->removexattr,
- &local->loc, local->cont.removexattr.name,
- local->xdata_req);
- return 0;
+ STACK_WIND_COOKIE(frame, afr_removexattr_wind_cbk, (void *)(long)subvol,
+ priv->children[subvol],
+ priv->children[subvol]->fops->removexattr, &local->loc,
+ local->cont.removexattr.name, local->xdata_req);
+ return 0;
}
-
int
-afr_removexattr (call_frame_t *frame, xlator_t *this,
- loc_t *loc, const char *name, dict_t *xdata)
+afr_removexattr(call_frame_t *frame, xlator_t *this, loc_t *loc,
+ const char *name, dict_t *xdata)
{
- afr_local_t *local = NULL;
- call_frame_t *transaction_frame = NULL;
- int ret = -1;
- int op_errno = ENOMEM;
+ afr_local_t *local = NULL;
+ call_frame_t *transaction_frame = NULL;
+ int ret = -1;
+ int op_errno = ENOMEM;
- GF_IF_NATIVE_XATTR_GOTO ("trusted.afr.*",
- name, op_errno, out);
+ GF_IF_NATIVE_XATTR_GOTO("trusted.afr.*", name, op_errno, out);
- GF_IF_NATIVE_XATTR_GOTO ("trusted.glusterfs.afr.*",
- name, op_errno, out);
+ GF_IF_NATIVE_XATTR_GOTO("trusted.glusterfs.afr.*", name, op_errno, out);
- transaction_frame = copy_frame (frame);
- if (!transaction_frame)
- goto out;
+ transaction_frame = copy_frame(frame);
+ if (!transaction_frame)
+ goto out;
- local = AFR_FRAME_INIT (transaction_frame, op_errno);
- if (!local)
- goto out;
+ local = AFR_FRAME_INIT(transaction_frame, op_errno);
+ if (!local)
+ goto out;
- local->cont.removexattr.name = gf_strdup (name);
+ local->cont.removexattr.name = gf_strdup(name);
- if (xdata)
- local->xdata_req = dict_copy_with_ref (xdata, NULL);
- else
- local->xdata_req = dict_new ();
+ if (xdata)
+ local->xdata_req = dict_copy_with_ref(xdata, NULL);
+ else
+ local->xdata_req = dict_new();
- if (!local->xdata_req)
- goto out;
+ if (!local->xdata_req)
+ goto out;
- local->transaction.wind = afr_removexattr_wind;
- local->transaction.fop = __afr_txn_write_fop;
- local->transaction.done = __afr_txn_write_done;
- local->transaction.unwind = afr_removexattr_unwind;
+ local->transaction.wind = afr_removexattr_wind;
+ local->transaction.unwind = afr_removexattr_unwind;
- loc_copy (&local->loc, loc);
- local->inode = inode_ref (loc->inode);
+ loc_copy(&local->loc, loc);
+ ret = afr_set_inode_local(this, local, loc->inode);
+ if (ret)
+ goto out;
- local->op = GF_FOP_REMOVEXATTR;
+ local->op = GF_FOP_REMOVEXATTR;
- local->transaction.main_frame = frame;
- local->transaction.start = LLONG_MAX - 1;
- local->transaction.len = 0;
+ local->transaction.main_frame = frame;
+ local->transaction.start = LLONG_MAX - 1;
+ local->transaction.len = 0;
- ret = afr_transaction (transaction_frame, this, AFR_METADATA_TRANSACTION);
- if (ret < 0) {
- op_errno = -ret;
- goto out;
- }
+ ret = afr_transaction(transaction_frame, this, AFR_METADATA_TRANSACTION);
+ if (ret < 0) {
+ op_errno = -ret;
+ goto out;
+ }
- return 0;
+ return 0;
out:
- if (transaction_frame)
- AFR_STACK_DESTROY (transaction_frame);
+ if (transaction_frame)
+ AFR_STACK_DESTROY(transaction_frame);
- AFR_STACK_UNWIND (removexattr, frame, -1, op_errno, NULL);
- return 0;
+ AFR_STACK_UNWIND(removexattr, frame, -1, op_errno, NULL);
+ return 0;
}
/* ffremovexattr */
int
-afr_fremovexattr_unwind (call_frame_t *frame, xlator_t *this)
+afr_fremovexattr_unwind(call_frame_t *frame, xlator_t *this)
{
- afr_local_t * local = NULL;
- call_frame_t *main_frame = NULL;
-
- local = frame->local;
+ afr_local_t *local = NULL;
+ call_frame_t *main_frame = NULL;
- main_frame = afr_transaction_detach_fop_frame (frame);
- if (!main_frame)
- return 0;
+ local = frame->local;
- AFR_STACK_UNWIND (fremovexattr, main_frame, local->op_ret, local->op_errno,
- local->xdata_rsp);
+ main_frame = afr_transaction_detach_fop_frame(frame);
+ if (!main_frame)
return 0;
-}
+ AFR_STACK_UNWIND(fremovexattr, main_frame, local->op_ret, local->op_errno,
+ local->xdata_rsp);
+ return 0;
+}
int
-afr_fremovexattr_wind_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
+afr_fremovexattr_wind_cbk(call_frame_t *frame, void *cookie, xlator_t *this,
int32_t op_ret, int32_t op_errno, dict_t *xdata)
{
- return __afr_inode_write_cbk (frame, cookie, this, op_ret, op_errno,
- NULL, NULL, NULL, xdata);
+ return __afr_inode_write_cbk(frame, cookie, this, op_ret, op_errno, NULL,
+ NULL, NULL, xdata);
}
-
int
-afr_fremovexattr_wind (call_frame_t *frame, xlator_t *this, int subvol)
+afr_fremovexattr_wind(call_frame_t *frame, xlator_t *this, int subvol)
{
- afr_local_t *local = NULL;
- afr_private_t *priv = NULL;
+ afr_local_t *local = NULL;
+ afr_private_t *priv = NULL;
- local = frame->local;
- priv = this->private;
+ local = frame->local;
+ priv = this->private;
- STACK_WIND_COOKIE (frame, afr_fremovexattr_wind_cbk, (void *) (long) subvol,
- priv->children[subvol],
- priv->children[subvol]->fops->fremovexattr,
- local->fd, local->cont.removexattr.name,
- local->xdata_req);
- return 0;
+ STACK_WIND_COOKIE(frame, afr_fremovexattr_wind_cbk, (void *)(long)subvol,
+ priv->children[subvol],
+ priv->children[subvol]->fops->fremovexattr, local->fd,
+ local->cont.removexattr.name, local->xdata_req);
+ return 0;
}
-
int
-afr_fremovexattr (call_frame_t *frame, xlator_t *this, fd_t *fd,
- const char *name, dict_t *xdata)
+afr_fremovexattr(call_frame_t *frame, xlator_t *this, fd_t *fd,
+ const char *name, dict_t *xdata)
{
- afr_local_t *local = NULL;
- call_frame_t *transaction_frame = NULL;
- int ret = -1;
- int op_errno = ENOMEM;
+ afr_local_t *local = NULL;
+ call_frame_t *transaction_frame = NULL;
+ int ret = -1;
+ int op_errno = ENOMEM;
- GF_IF_NATIVE_XATTR_GOTO ("trusted.afr.*",
- name, op_errno, out);
+ GF_IF_NATIVE_XATTR_GOTO("trusted.afr.*", name, op_errno, out);
- GF_IF_NATIVE_XATTR_GOTO ("trusted.glusterfs.afr.*",
- name, op_errno, out);
+ GF_IF_NATIVE_XATTR_GOTO("trusted.glusterfs.afr.*", name, op_errno, out);
- transaction_frame = copy_frame (frame);
- if (!transaction_frame)
- goto out;
+ AFR_ERROR_OUT_IF_FDCTX_INVALID(fd, this, op_errno, out);
+ transaction_frame = copy_frame(frame);
+ if (!transaction_frame)
+ goto out;
- local = AFR_FRAME_INIT (transaction_frame, op_errno);
- if (!local)
- goto out;
+ local = AFR_FRAME_INIT(transaction_frame, op_errno);
+ if (!local)
+ goto out;
- local->cont.removexattr.name = gf_strdup (name);
- if (xdata)
- local->xdata_req = dict_copy_with_ref (xdata, NULL);
- else
- local->xdata_req = dict_new ();
+ local->cont.removexattr.name = gf_strdup(name);
+ if (xdata)
+ local->xdata_req = dict_copy_with_ref(xdata, NULL);
+ else
+ local->xdata_req = dict_new();
- if (!local->xdata_req)
- goto out;
+ if (!local->xdata_req)
+ goto out;
- local->transaction.wind = afr_fremovexattr_wind;
- local->transaction.fop = __afr_txn_write_fop;
- local->transaction.done = __afr_txn_write_done;
- local->transaction.unwind = afr_fremovexattr_unwind;
+ local->transaction.wind = afr_fremovexattr_wind;
+ local->transaction.unwind = afr_fremovexattr_unwind;
- local->fd = fd_ref (fd);
- local->inode = inode_ref (fd->inode);
+ local->fd = fd_ref(fd);
+ ret = afr_set_inode_local(this, local, fd->inode);
+ if (ret)
+ goto out;
- local->op = GF_FOP_FREMOVEXATTR;
+ local->op = GF_FOP_FREMOVEXATTR;
- local->transaction.main_frame = frame;
- local->transaction.start = LLONG_MAX - 1;
- local->transaction.len = 0;
+ local->transaction.main_frame = frame;
+ local->transaction.start = LLONG_MAX - 1;
+ local->transaction.len = 0;
- ret = afr_transaction (transaction_frame, this, AFR_METADATA_TRANSACTION);
- if (ret < 0) {
- op_errno = -ret;
- goto out;
- }
+ ret = afr_transaction(transaction_frame, this, AFR_METADATA_TRANSACTION);
+ if (ret < 0) {
+ op_errno = -ret;
+ goto out;
+ }
- return 0;
+ return 0;
out:
- if (transaction_frame)
- AFR_STACK_DESTROY (transaction_frame);
+ if (transaction_frame)
+ AFR_STACK_DESTROY(transaction_frame);
- AFR_STACK_UNWIND (fremovexattr, frame, -1, op_errno, NULL);
+ AFR_STACK_UNWIND(fremovexattr, frame, -1, op_errno, NULL);
- return 0;
+ return 0;
}
-
int
-afr_fallocate_unwind (call_frame_t *frame, xlator_t *this)
+afr_fallocate_unwind(call_frame_t *frame, xlator_t *this)
{
- afr_local_t * local = NULL;
- call_frame_t *main_frame = NULL;
+ afr_local_t *local = NULL;
+ call_frame_t *main_frame = NULL;
- local = frame->local;
+ local = frame->local;
- main_frame = afr_transaction_detach_fop_frame (frame);
- if (!main_frame)
- return 0;
-
- AFR_STACK_UNWIND (fallocate, main_frame, local->op_ret, local->op_errno,
- &local->cont.inode_wfop.prebuf,
- &local->cont.inode_wfop.postbuf, local->xdata_rsp);
+ main_frame = afr_transaction_detach_fop_frame(frame);
+ if (!main_frame)
return 0;
-}
+ AFR_STACK_UNWIND(fallocate, main_frame, local->op_ret, local->op_errno,
+ &local->cont.inode_wfop.prebuf,
+ &local->cont.inode_wfop.postbuf, local->xdata_rsp);
+ return 0;
+}
int
-afr_fallocate_wind_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
- int32_t op_ret, int32_t op_errno, struct iatt *prebuf,
- struct iatt *postbuf, dict_t *xdata)
+afr_fallocate_wind_cbk(call_frame_t *frame, void *cookie, xlator_t *this,
+ int32_t op_ret, int32_t op_errno, struct iatt *prebuf,
+ struct iatt *postbuf, dict_t *xdata)
{
- return __afr_inode_write_cbk (frame, cookie, this, op_ret, op_errno,
- prebuf, postbuf, NULL, xdata);
+ return __afr_inode_write_cbk(frame, cookie, this, op_ret, op_errno, prebuf,
+ postbuf, NULL, xdata);
}
-
int
-afr_fallocate_wind (call_frame_t *frame, xlator_t *this, int subvol)
+afr_fallocate_wind(call_frame_t *frame, xlator_t *this, int subvol)
{
- afr_local_t *local = NULL;
- afr_private_t *priv = NULL;
+ afr_local_t *local = NULL;
+ afr_private_t *priv = NULL;
- local = frame->local;
- priv = this->private;
+ local = frame->local;
+ priv = this->private;
- STACK_WIND_COOKIE (frame, afr_fallocate_wind_cbk, (void *) (long) subvol,
- priv->children[subvol],
- priv->children[subvol]->fops->fallocate,
- local->fd, local->cont.fallocate.mode,
- local->cont.fallocate.offset,
- local->cont.fallocate.len, local->xdata_req);
- return 0;
+ STACK_WIND_COOKIE(frame, afr_fallocate_wind_cbk, (void *)(long)subvol,
+ priv->children[subvol],
+ priv->children[subvol]->fops->fallocate, local->fd,
+ local->cont.fallocate.mode, local->cont.fallocate.offset,
+ local->cont.fallocate.len, local->xdata_req);
+ return 0;
}
-
int
-afr_fallocate (call_frame_t *frame, xlator_t *this, fd_t *fd, int32_t mode,
- off_t offset, size_t len, dict_t *xdata)
+afr_fallocate(call_frame_t *frame, xlator_t *this, fd_t *fd, int32_t mode,
+ off_t offset, size_t len, dict_t *xdata)
{
- call_frame_t *transaction_frame = NULL;
- afr_local_t *local = NULL;
- int ret = -1;
- int op_errno = ENOMEM;
+ call_frame_t *transaction_frame = NULL;
+ afr_local_t *local = NULL;
+ int ret = -1;
+ int op_errno = ENOMEM;
- transaction_frame = copy_frame (frame);
- if (!transaction_frame)
- goto out;
+ AFR_ERROR_OUT_IF_FDCTX_INVALID(fd, this, op_errno, out);
+ transaction_frame = copy_frame(frame);
+ if (!transaction_frame)
+ goto out;
- local = AFR_FRAME_INIT (transaction_frame, op_errno);
- if (!local)
- goto out;
+ local = AFR_FRAME_INIT(transaction_frame, op_errno);
+ if (!local)
+ goto out;
- local->cont.fallocate.mode = mode;
- local->cont.fallocate.offset = offset;
- local->cont.fallocate.len = len;
+ local->cont.fallocate.mode = mode;
+ local->cont.fallocate.offset = offset;
+ local->cont.fallocate.len = len;
- local->fd = fd_ref (fd);
- local->inode = inode_ref (fd->inode);
+ local->fd = fd_ref(fd);
+ ret = afr_set_inode_local(this, local, fd->inode);
+ if (ret)
+ goto out;
- if (xdata)
- local->xdata_req = dict_copy_with_ref (xdata, NULL);
- else
- local->xdata_req = dict_new ();
+ if (xdata)
+ local->xdata_req = dict_copy_with_ref(xdata, NULL);
+ else
+ local->xdata_req = dict_new();
- if (!local->xdata_req)
- goto out;
+ if (!local->xdata_req)
+ goto out;
- local->op = GF_FOP_FALLOCATE;
+ local->op = GF_FOP_FALLOCATE;
- local->transaction.wind = afr_fallocate_wind;
- local->transaction.fop = __afr_txn_write_fop;
- local->transaction.done = __afr_txn_write_done;
- local->transaction.unwind = afr_fallocate_unwind;
+ local->transaction.wind = afr_fallocate_wind;
+ local->transaction.unwind = afr_fallocate_unwind;
- local->transaction.main_frame = frame;
+ local->transaction.main_frame = frame;
- local->transaction.start = local->cont.fallocate.offset;
- local->transaction.len = 0;
+ local->transaction.start = local->cont.fallocate.offset;
+ local->transaction.len = 0;
- afr_fix_open (fd, this);
+ afr_fix_open(fd, this);
- ret = afr_transaction (transaction_frame, this, AFR_DATA_TRANSACTION);
- if (ret < 0) {
- op_errno = -ret;
- goto out;
- }
+ ret = afr_transaction(transaction_frame, this, AFR_DATA_TRANSACTION);
+ if (ret < 0) {
+ op_errno = -ret;
+ goto out;
+ }
- return 0;
+ return 0;
out:
- if (transaction_frame)
- AFR_STACK_DESTROY (transaction_frame);
+ if (transaction_frame)
+ AFR_STACK_DESTROY(transaction_frame);
- AFR_STACK_UNWIND (fallocate, frame, -1, op_errno, NULL, NULL, NULL);
- return 0;
+ AFR_STACK_UNWIND(fallocate, frame, -1, op_errno, NULL, NULL, NULL);
+ return 0;
}
-
/* }}} */
/* {{{ discard */
int
-afr_discard_unwind (call_frame_t *frame, xlator_t *this)
+afr_discard_unwind(call_frame_t *frame, xlator_t *this)
{
- afr_local_t * local = NULL;
- call_frame_t *main_frame = NULL;
-
- local = frame->local;
+ afr_local_t *local = NULL;
+ call_frame_t *main_frame = NULL;
- main_frame = afr_transaction_detach_fop_frame (frame);
- if (!main_frame)
- return 0;
+ local = frame->local;
- AFR_STACK_UNWIND (discard, main_frame, local->op_ret, local->op_errno,
- &local->cont.inode_wfop.prebuf,
- &local->cont.inode_wfop.postbuf, local->xdata_rsp);
+ main_frame = afr_transaction_detach_fop_frame(frame);
+ if (!main_frame)
return 0;
-}
+ AFR_STACK_UNWIND(discard, main_frame, local->op_ret, local->op_errno,
+ &local->cont.inode_wfop.prebuf,
+ &local->cont.inode_wfop.postbuf, local->xdata_rsp);
+ return 0;
+}
int
-afr_discard_wind_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
- int32_t op_ret, int32_t op_errno, struct iatt *prebuf,
- struct iatt *postbuf, dict_t *xdata)
+afr_discard_wind_cbk(call_frame_t *frame, void *cookie, xlator_t *this,
+ int32_t op_ret, int32_t op_errno, struct iatt *prebuf,
+ struct iatt *postbuf, dict_t *xdata)
{
- return __afr_inode_write_cbk (frame, cookie, this, op_ret, op_errno,
- prebuf, postbuf, NULL, xdata);
+ return __afr_inode_write_cbk(frame, cookie, this, op_ret, op_errno, prebuf,
+ postbuf, NULL, xdata);
}
-
int
-afr_discard_wind (call_frame_t *frame, xlator_t *this, int subvol)
+afr_discard_wind(call_frame_t *frame, xlator_t *this, int subvol)
{
- afr_local_t *local = NULL;
- afr_private_t *priv = NULL;
+ afr_local_t *local = NULL;
+ afr_private_t *priv = NULL;
- local = frame->local;
- priv = this->private;
+ local = frame->local;
+ priv = this->private;
- STACK_WIND_COOKIE (frame, afr_discard_wind_cbk, (void *) (long) subvol,
- priv->children[subvol],
- priv->children[subvol]->fops->discard,
- local->fd, local->cont.discard.offset,
- local->cont.discard.len, local->xdata_req);
- return 0;
+ STACK_WIND_COOKIE(frame, afr_discard_wind_cbk, (void *)(long)subvol,
+ priv->children[subvol],
+ priv->children[subvol]->fops->discard, local->fd,
+ local->cont.discard.offset, local->cont.discard.len,
+ local->xdata_req);
+ return 0;
}
-
int
-afr_discard (call_frame_t *frame, xlator_t *this, fd_t *fd, off_t offset,
- size_t len, dict_t *xdata)
+afr_discard(call_frame_t *frame, xlator_t *this, fd_t *fd, off_t offset,
+ size_t len, dict_t *xdata)
{
- afr_local_t *local = NULL;
- call_frame_t *transaction_frame = NULL;
- int ret = -1;
- int op_errno = ENOMEM;
+ afr_local_t *local = NULL;
+ call_frame_t *transaction_frame = NULL;
+ int ret = -1;
+ int op_errno = ENOMEM;
- transaction_frame = copy_frame (frame);
- if (!transaction_frame)
- goto out;
+ AFR_ERROR_OUT_IF_FDCTX_INVALID(fd, this, op_errno, out);
+ transaction_frame = copy_frame(frame);
+ if (!transaction_frame)
+ goto out;
- local = AFR_FRAME_INIT (transaction_frame, op_errno);
- if (!local)
- goto out;
+ local = AFR_FRAME_INIT(transaction_frame, op_errno);
+ if (!local)
+ goto out;
- local->cont.discard.offset = offset;
- local->cont.discard.len = len;
+ local->cont.discard.offset = offset;
+ local->cont.discard.len = len;
- local->fd = fd_ref (fd);
- local->inode = inode_ref (fd->inode);
+ local->fd = fd_ref(fd);
+ ret = afr_set_inode_local(this, local, fd->inode);
+ if (ret)
+ goto out;
- if (xdata)
- local->xdata_req = dict_copy_with_ref (xdata, NULL);
- else
- local->xdata_req = dict_new ();
+ if (xdata)
+ local->xdata_req = dict_copy_with_ref(xdata, NULL);
+ else
+ local->xdata_req = dict_new();
- if (!local->xdata_req)
- goto out;
+ if (!local->xdata_req)
+ goto out;
- local->op = GF_FOP_DISCARD;
+ local->op = GF_FOP_DISCARD;
- local->transaction.wind = afr_discard_wind;
- local->transaction.fop = __afr_txn_write_fop;
- local->transaction.done = __afr_txn_write_done;
- local->transaction.unwind = afr_discard_unwind;
+ local->transaction.wind = afr_discard_wind;
+ local->transaction.unwind = afr_discard_unwind;
- local->transaction.main_frame = frame;
+ local->transaction.main_frame = frame;
- local->transaction.start = local->cont.discard.offset;
- local->transaction.len = 0;
+ local->transaction.start = local->cont.discard.offset;
+ local->transaction.len = 0;
- afr_fix_open (fd, this);
+ afr_fix_open(fd, this);
- ret = afr_transaction (transaction_frame, this, AFR_DATA_TRANSACTION);
- if (ret < 0) {
- op_errno = -ret;
- goto out;
- }
+ ret = afr_transaction(transaction_frame, this, AFR_DATA_TRANSACTION);
+ if (ret < 0) {
+ op_errno = -ret;
+ goto out;
+ }
- return 0;
+ return 0;
out:
- if (transaction_frame)
- AFR_STACK_DESTROY (transaction_frame);
+ if (transaction_frame)
+ AFR_STACK_DESTROY(transaction_frame);
- AFR_STACK_UNWIND (discard, frame, -1, op_errno, NULL, NULL, NULL);
- return 0;
+ AFR_STACK_UNWIND(discard, frame, -1, op_errno, NULL, NULL, NULL);
+ return 0;
}
-
/* {{{ zerofill */
int
-afr_zerofill_unwind (call_frame_t *frame, xlator_t *this)
+afr_zerofill_unwind(call_frame_t *frame, xlator_t *this)
{
- afr_local_t * local = NULL;
- call_frame_t *main_frame = NULL;
-
- local = frame->local;
+ afr_local_t *local = NULL;
+ call_frame_t *main_frame = NULL;
- main_frame = afr_transaction_detach_fop_frame (frame);
- if (!main_frame)
- return 0;
+ local = frame->local;
- AFR_STACK_UNWIND (discard, main_frame, local->op_ret, local->op_errno,
- &local->cont.inode_wfop.prebuf,
- &local->cont.inode_wfop.postbuf, local->xdata_rsp);
+ main_frame = afr_transaction_detach_fop_frame(frame);
+ if (!main_frame)
return 0;
-}
+ AFR_STACK_UNWIND(discard, main_frame, local->op_ret, local->op_errno,
+ &local->cont.inode_wfop.prebuf,
+ &local->cont.inode_wfop.postbuf, local->xdata_rsp);
+ return 0;
+}
int
-afr_zerofill_wind_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
+afr_zerofill_wind_cbk(call_frame_t *frame, void *cookie, xlator_t *this,
int32_t op_ret, int32_t op_errno, struct iatt *prebuf,
struct iatt *postbuf, dict_t *xdata)
{
- return __afr_inode_write_cbk (frame, cookie, this, op_ret, op_errno,
- prebuf, postbuf, NULL, xdata);
+ return __afr_inode_write_cbk(frame, cookie, this, op_ret, op_errno, prebuf,
+ postbuf, NULL, xdata);
}
-
int
-afr_zerofill_wind (call_frame_t *frame, xlator_t *this, int subvol)
+afr_zerofill_wind(call_frame_t *frame, xlator_t *this, int subvol)
{
- afr_local_t *local = NULL;
- afr_private_t *priv = NULL;
+ afr_local_t *local = NULL;
+ afr_private_t *priv = NULL;
- local = frame->local;
- priv = this->private;
+ local = frame->local;
+ priv = this->private;
- STACK_WIND_COOKIE (frame, afr_zerofill_wind_cbk, (void *) (long) subvol,
- priv->children[subvol],
- priv->children[subvol]->fops->zerofill,
- local->fd, local->cont.zerofill.offset,
- local->cont.zerofill.len, local->xdata_req);
- return 0;
+ STACK_WIND_COOKIE(frame, afr_zerofill_wind_cbk, (void *)(long)subvol,
+ priv->children[subvol],
+ priv->children[subvol]->fops->zerofill, local->fd,
+ local->cont.zerofill.offset, local->cont.zerofill.len,
+ local->xdata_req);
+ return 0;
}
int
-afr_zerofill (call_frame_t *frame, xlator_t *this, fd_t *fd, off_t offset,
+afr_zerofill(call_frame_t *frame, xlator_t *this, fd_t *fd, off_t offset,
size_t len, dict_t *xdata)
{
- afr_local_t *local = NULL;
- call_frame_t *transaction_frame = NULL;
- int ret = -1;
- int op_errno = ENOMEM;
+ afr_local_t *local = NULL;
+ call_frame_t *transaction_frame = NULL;
+ int ret = -1;
+ int op_errno = ENOMEM;
- transaction_frame = copy_frame (frame);
- if (!transaction_frame)
- goto out;
+ AFR_ERROR_OUT_IF_FDCTX_INVALID(fd, this, op_errno, out);
+ transaction_frame = copy_frame(frame);
+ if (!transaction_frame)
+ goto out;
- local = AFR_FRAME_INIT (transaction_frame, op_errno);
- if (!local)
- goto out;
+ local = AFR_FRAME_INIT(transaction_frame, op_errno);
+ if (!local)
+ goto out;
- local->cont.zerofill.offset = offset;
- local->cont.zerofill.len = len;
+ local->cont.zerofill.offset = offset;
+ local->cont.zerofill.len = len;
- local->fd = fd_ref (fd);
- local->inode = inode_ref (fd->inode);
+ local->fd = fd_ref(fd);
+ ret = afr_set_inode_local(this, local, fd->inode);
+ if (ret)
+ goto out;
- if (xdata)
- local->xdata_req = dict_copy_with_ref (xdata, NULL);
- else
- local->xdata_req = dict_new ();
+ if (xdata)
+ local->xdata_req = dict_copy_with_ref(xdata, NULL);
+ else
+ local->xdata_req = dict_new();
- if (!local->xdata_req)
- goto out;
+ if (!local->xdata_req)
+ goto out;
- local->op = GF_FOP_ZEROFILL;
+ local->op = GF_FOP_ZEROFILL;
- local->transaction.wind = afr_zerofill_wind;
- local->transaction.fop = __afr_txn_write_fop;
- local->transaction.done = __afr_txn_write_done;
- local->transaction.unwind = afr_zerofill_unwind;
+ local->transaction.wind = afr_zerofill_wind;
+ local->transaction.unwind = afr_zerofill_unwind;
- local->transaction.main_frame = frame;
+ local->transaction.main_frame = frame;
- local->transaction.start = local->cont.discard.offset;
- local->transaction.len = len;
+ local->transaction.start = local->cont.zerofill.offset;
+ local->transaction.len = len;
- afr_fix_open (fd, this);
+ afr_fix_open(fd, this);
- ret = afr_transaction (transaction_frame, this, AFR_DATA_TRANSACTION);
- if (ret < 0) {
- op_errno = -ret;
- goto out;
- }
+ ret = afr_transaction(transaction_frame, this, AFR_DATA_TRANSACTION);
+ if (ret < 0) {
+ op_errno = -ret;
+ goto out;
+ }
- return 0;
+ return 0;
out:
- if (transaction_frame)
- AFR_STACK_DESTROY (transaction_frame);
+ if (transaction_frame)
+ AFR_STACK_DESTROY(transaction_frame);
- AFR_STACK_UNWIND (zerofill, frame, -1, op_errno, NULL, NULL, NULL);
- return 0;
+ AFR_STACK_UNWIND(zerofill, frame, -1, op_errno, NULL, NULL, NULL);
+ return 0;
}
/* }}} */
int32_t
-afr_xattrop_wind_cbk (call_frame_t *frame, void *cookie,
- xlator_t *this, int32_t op_ret, int32_t op_errno,
- dict_t *xattr, dict_t *xdata)
+afr_xattrop_wind_cbk(call_frame_t *frame, void *cookie, xlator_t *this,
+ int32_t op_ret, int32_t op_errno, dict_t *xattr,
+ dict_t *xdata)
{
- return __afr_inode_write_cbk (frame, cookie, this, op_ret, op_errno,
- NULL, NULL, xattr, xdata);
+ return __afr_inode_write_cbk(frame, cookie, this, op_ret, op_errno, NULL,
+ NULL, xattr, xdata);
}
int
-afr_xattrop_wind (call_frame_t *frame, xlator_t *this, int subvol)
+afr_xattrop_wind(call_frame_t *frame, xlator_t *this, int subvol)
{
- afr_local_t *local = NULL;
- afr_private_t *priv = NULL;
+ afr_local_t *local = NULL;
+ afr_private_t *priv = NULL;
- local = frame->local;
- priv = this->private;
+ local = frame->local;
+ priv = this->private;
- STACK_WIND_COOKIE (frame, afr_xattrop_wind_cbk, (void *) (long) subvol,
- priv->children[subvol],
- priv->children[subvol]->fops->xattrop,
- &local->loc, local->cont.xattrop.optype,
- local->cont.xattrop.xattr, local->xdata_req);
- return 0;
+ STACK_WIND_COOKIE(frame, afr_xattrop_wind_cbk, (void *)(long)subvol,
+ priv->children[subvol],
+ priv->children[subvol]->fops->xattrop, &local->loc,
+ local->cont.xattrop.optype, local->cont.xattrop.xattr,
+ local->xdata_req);
+ return 0;
}
int
-afr_xattrop_unwind (call_frame_t *frame, xlator_t *this)
+afr_xattrop_unwind(call_frame_t *frame, xlator_t *this)
{
- afr_local_t *local = NULL;
- call_frame_t *main_frame = NULL;
-
- local = frame->local;
+ afr_local_t *local = NULL;
+ call_frame_t *main_frame = NULL;
- main_frame = afr_transaction_detach_fop_frame (frame);
- if (!main_frame)
- return 0;
+ local = frame->local;
- AFR_STACK_UNWIND (xattrop, main_frame, local->op_ret, local->op_errno,
- local->xattr_rsp, local->xdata_rsp);
+ main_frame = afr_transaction_detach_fop_frame(frame);
+ if (!main_frame)
return 0;
+
+ AFR_STACK_UNWIND(xattrop, main_frame, local->op_ret, local->op_errno,
+ local->xattr_rsp, local->xdata_rsp);
+ return 0;
}
int32_t
-afr_xattrop (call_frame_t *frame, xlator_t *this, loc_t *loc,
- gf_xattrop_flags_t optype, dict_t *xattr, dict_t *xdata)
+afr_xattrop(call_frame_t *frame, xlator_t *this, loc_t *loc,
+ gf_xattrop_flags_t optype, dict_t *xattr, dict_t *xdata)
{
- afr_local_t *local = NULL;
- call_frame_t *transaction_frame = NULL;
- int ret = -1;
- int op_errno = ENOMEM;
+ afr_local_t *local = NULL;
+ call_frame_t *transaction_frame = NULL;
+ int ret = -1;
+ int op_errno = ENOMEM;
- transaction_frame = copy_frame (frame);
- if (!transaction_frame)
- goto out;
+ transaction_frame = copy_frame(frame);
+ if (!transaction_frame)
+ goto out;
- local = AFR_FRAME_INIT (transaction_frame, op_errno);
- if (!local)
- goto out;
+ local = AFR_FRAME_INIT(transaction_frame, op_errno);
+ if (!local)
+ goto out;
- local->cont.xattrop.xattr = dict_ref (xattr);
- local->cont.xattrop.optype = optype;
- if (xdata)
- local->xdata_req = dict_ref (xdata);
+ local->cont.xattrop.xattr = dict_ref(xattr);
+ local->cont.xattrop.optype = optype;
+ if (xdata)
+ local->xdata_req = dict_ref(xdata);
- local->transaction.wind = afr_xattrop_wind;
- local->transaction.fop = __afr_txn_write_fop;
- local->transaction.done = __afr_txn_write_done;
- local->transaction.unwind = afr_xattrop_unwind;
+ local->transaction.wind = afr_xattrop_wind;
+ local->transaction.unwind = afr_xattrop_unwind;
- loc_copy (&local->loc, loc);
- local->inode = inode_ref (loc->inode);
+ loc_copy(&local->loc, loc);
+ ret = afr_set_inode_local(this, local, loc->inode);
+ if (ret)
+ goto out;
- local->op = GF_FOP_XATTROP;
+ local->op = GF_FOP_XATTROP;
- local->transaction.main_frame = frame;
- local->transaction.start = LLONG_MAX - 1;
- local->transaction.len = 0;
+ local->transaction.main_frame = frame;
+ local->transaction.start = LLONG_MAX - 1;
+ local->transaction.len = 0;
- ret = afr_transaction (transaction_frame, this, AFR_METADATA_TRANSACTION);
- if (ret < 0) {
- op_errno = -ret;
- goto out;
- }
+ ret = afr_transaction(transaction_frame, this, AFR_METADATA_TRANSACTION);
+ if (ret < 0) {
+ op_errno = -ret;
+ goto out;
+ }
- return 0;
+ return 0;
out:
- if (transaction_frame)
- AFR_STACK_DESTROY (transaction_frame);
+ if (transaction_frame)
+ AFR_STACK_DESTROY(transaction_frame);
- AFR_STACK_UNWIND (xattrop, frame, -1, op_errno, NULL, NULL);
- return 0;
+ AFR_STACK_UNWIND(xattrop, frame, -1, op_errno, NULL, NULL);
+ return 0;
}
int32_t
-afr_fxattrop_wind_cbk (call_frame_t *frame, void *cookie,
- xlator_t *this, int32_t op_ret, int32_t op_errno,
- dict_t *xattr, dict_t *xdata)
+afr_fxattrop_wind_cbk(call_frame_t *frame, void *cookie, xlator_t *this,
+ int32_t op_ret, int32_t op_errno, dict_t *xattr,
+ dict_t *xdata)
{
- return __afr_inode_write_cbk (frame, cookie, this, op_ret, op_errno,
- NULL, NULL, xattr, xdata);
+ return __afr_inode_write_cbk(frame, cookie, this, op_ret, op_errno, NULL,
+ NULL, xattr, xdata);
}
int
-afr_fxattrop_wind (call_frame_t *frame, xlator_t *this, int subvol)
+afr_fxattrop_wind(call_frame_t *frame, xlator_t *this, int subvol)
{
- afr_local_t *local = NULL;
- afr_private_t *priv = NULL;
+ afr_local_t *local = NULL;
+ afr_private_t *priv = NULL;
- local = frame->local;
- priv = this->private;
+ local = frame->local;
+ priv = this->private;
- STACK_WIND_COOKIE (frame, afr_fxattrop_wind_cbk, (void *) (long) subvol,
- priv->children[subvol],
- priv->children[subvol]->fops->fxattrop,
- local->fd, local->cont.xattrop.optype,
- local->cont.xattrop.xattr, local->xdata_req);
- return 0;
+ STACK_WIND_COOKIE(frame, afr_fxattrop_wind_cbk, (void *)(long)subvol,
+ priv->children[subvol],
+ priv->children[subvol]->fops->fxattrop, local->fd,
+ local->cont.xattrop.optype, local->cont.xattrop.xattr,
+ local->xdata_req);
+ return 0;
}
int
-afr_fxattrop_unwind (call_frame_t *frame, xlator_t *this)
+afr_fxattrop_unwind(call_frame_t *frame, xlator_t *this)
{
- afr_local_t *local = NULL;
- call_frame_t *main_frame = NULL;
-
- local = frame->local;
+ afr_local_t *local = NULL;
+ call_frame_t *main_frame = NULL;
- main_frame = afr_transaction_detach_fop_frame (frame);
- if (!main_frame)
- return 0;
+ local = frame->local;
- AFR_STACK_UNWIND (fxattrop, main_frame, local->op_ret, local->op_errno,
- local->xattr_rsp, local->xdata_rsp);
+ main_frame = afr_transaction_detach_fop_frame(frame);
+ if (!main_frame)
return 0;
+
+ AFR_STACK_UNWIND(fxattrop, main_frame, local->op_ret, local->op_errno,
+ local->xattr_rsp, local->xdata_rsp);
+ return 0;
}
int32_t
-afr_fxattrop (call_frame_t *frame, xlator_t *this, fd_t *fd,
- gf_xattrop_flags_t optype, dict_t *xattr, dict_t *xdata)
+afr_fxattrop(call_frame_t *frame, xlator_t *this, fd_t *fd,
+ gf_xattrop_flags_t optype, dict_t *xattr, dict_t *xdata)
{
- afr_local_t *local = NULL;
- call_frame_t *transaction_frame = NULL;
- int ret = -1;
- int op_errno = ENOMEM;
+ afr_local_t *local = NULL;
+ call_frame_t *transaction_frame = NULL;
+ int ret = -1;
+ int op_errno = ENOMEM;
- transaction_frame = copy_frame (frame);
- if (!transaction_frame)
- goto out;
+ AFR_ERROR_OUT_IF_FDCTX_INVALID(fd, this, op_errno, out);
+ transaction_frame = copy_frame(frame);
+ if (!transaction_frame)
+ goto out;
- local = AFR_FRAME_INIT (transaction_frame, op_errno);
- if (!local)
- goto out;
+ local = AFR_FRAME_INIT(transaction_frame, op_errno);
+ if (!local)
+ goto out;
- local->cont.xattrop.xattr = dict_ref (xattr);
- local->cont.xattrop.optype = optype;
- if (xdata)
- local->xdata_req = dict_ref (xdata);
+ local->cont.xattrop.xattr = dict_ref(xattr);
+ local->cont.xattrop.optype = optype;
+ if (xdata)
+ local->xdata_req = dict_ref(xdata);
- local->transaction.wind = afr_fxattrop_wind;
- local->transaction.fop = __afr_txn_write_fop;
- local->transaction.done = __afr_txn_write_done;
- local->transaction.unwind = afr_fxattrop_unwind;
+ local->transaction.wind = afr_fxattrop_wind;
+ local->transaction.unwind = afr_fxattrop_unwind;
- local->fd = fd_ref (fd);
- local->inode = inode_ref (fd->inode);
+ local->fd = fd_ref(fd);
+ ret = afr_set_inode_local(this, local, fd->inode);
+ if (ret)
+ goto out;
- local->op = GF_FOP_FXATTROP;
+ local->op = GF_FOP_FXATTROP;
- local->transaction.main_frame = frame;
- local->transaction.start = LLONG_MAX - 1;
- local->transaction.len = 0;
+ local->transaction.main_frame = frame;
+ local->transaction.start = LLONG_MAX - 1;
+ local->transaction.len = 0;
- ret = afr_transaction (transaction_frame, this,
- AFR_METADATA_TRANSACTION);
- if (ret < 0) {
- op_errno = -ret;
- goto out;
- }
+ ret = afr_transaction(transaction_frame, this, AFR_METADATA_TRANSACTION);
+ if (ret < 0) {
+ op_errno = -ret;
+ goto out;
+ }
- return 0;
+ return 0;
out:
- if (transaction_frame)
- AFR_STACK_DESTROY (transaction_frame);
+ if (transaction_frame)
+ AFR_STACK_DESTROY(transaction_frame);
+
+ AFR_STACK_UNWIND(fxattrop, frame, -1, op_errno, NULL, NULL);
+ return 0;
+}
+
+int
+afr_fsync_unwind(call_frame_t *frame, xlator_t *this)
+{
+ afr_local_t *local = NULL;
+ call_frame_t *main_frame = NULL;
- AFR_STACK_UNWIND (fxattrop, frame, -1, op_errno, NULL, NULL);
+ local = frame->local;
+
+ main_frame = afr_transaction_detach_fop_frame(frame);
+ if (!main_frame)
return 0;
+
+ AFR_STACK_UNWIND(fsync, main_frame, local->op_ret, local->op_errno,
+ &local->cont.inode_wfop.prebuf,
+ &local->cont.inode_wfop.postbuf, local->xdata_rsp);
+
+ return 0;
+}
+
+int
+afr_fsync_wind_cbk(call_frame_t *frame, void *cookie, xlator_t *this,
+ int32_t op_ret, int32_t op_errno, struct iatt *prebuf,
+ struct iatt *postbuf, dict_t *xdata)
+{
+ return __afr_inode_write_cbk(frame, cookie, this, op_ret, op_errno, prebuf,
+ postbuf, NULL, xdata);
+}
+
+int
+afr_fsync_wind(call_frame_t *frame, xlator_t *this, int subvol)
+{
+ afr_local_t *local = NULL;
+ afr_private_t *priv = NULL;
+
+ local = frame->local;
+ priv = this->private;
+
+ STACK_WIND_COOKIE(frame, afr_fsync_wind_cbk, (void *)(long)subvol,
+ priv->children[subvol],
+ priv->children[subvol]->fops->fsync, local->fd,
+ local->cont.fsync.datasync, local->xdata_req);
+ return 0;
+}
+
+int
+afr_fsync(call_frame_t *frame, xlator_t *this, fd_t *fd, int32_t datasync,
+ dict_t *xdata)
+{
+ afr_local_t *local = NULL;
+ call_frame_t *transaction_frame = NULL;
+ int ret = -1;
+ int32_t op_errno = ENOMEM;
+ int8_t last_fsync = 0;
+
+ AFR_ERROR_OUT_IF_FDCTX_INVALID(fd, this, op_errno, out);
+ transaction_frame = copy_frame(frame);
+ if (!transaction_frame)
+ goto out;
+
+ local = AFR_FRAME_INIT(transaction_frame, op_errno);
+ if (!local)
+ goto out;
+
+ if (xdata) {
+ local->xdata_req = dict_copy_with_ref(xdata, NULL);
+ if (dict_get_int8(xdata, "last-fsync", &last_fsync) == 0) {
+ if (last_fsync) {
+ local->transaction.disable_delayed_post_op = _gf_true;
+ }
+ }
+ } else {
+ local->xdata_req = dict_new();
+ }
+
+ if (!local->xdata_req)
+ goto out;
+
+ local->fd = fd_ref(fd);
+ ret = afr_set_inode_local(this, local, fd->inode);
+ if (ret)
+ goto out;
+
+ local->op = GF_FOP_FSYNC;
+ local->cont.fsync.datasync = datasync;
+
+ if (afr_fd_has_witnessed_unstable_write(this, fd->inode)) {
+ /* don't care. we only wanted to CLEAR the bit */
+ }
+
+ local->transaction.wind = afr_fsync_wind;
+ local->transaction.unwind = afr_fsync_unwind;
+
+ local->transaction.main_frame = frame;
+
+ ret = afr_transaction(transaction_frame, this, AFR_DATA_TRANSACTION);
+ if (ret < 0) {
+ op_errno = -ret;
+ goto out;
+ }
+
+ return 0;
+out:
+ if (transaction_frame)
+ AFR_STACK_DESTROY(transaction_frame);
+
+ AFR_STACK_UNWIND(fsync, frame, -1, op_errno, NULL, NULL, NULL);
+
+ return 0;
}
diff --git a/xlators/cluster/afr/src/afr-inode-write.h b/xlators/cluster/afr/src/afr-inode-write.h
index e174cc2d610..a787069b7a1 100644
--- a/xlators/cluster/afr/src/afr-inode-write.h
+++ b/xlators/cluster/afr/src/afr-inode-write.h
@@ -12,79 +12,83 @@
#define __INODE_WRITE_H__
int32_t
-afr_chmod (call_frame_t *frame, xlator_t *this,
- loc_t *loc, mode_t mode, dict_t *xdata);
+afr_chmod(call_frame_t *frame, xlator_t *this, loc_t *loc, mode_t mode,
+ dict_t *xdata);
int32_t
-afr_chown (call_frame_t *frame, xlator_t *this,
- loc_t *loc, uid_t uid, gid_t gid, dict_t *xdata);
+afr_chown(call_frame_t *frame, xlator_t *this, loc_t *loc, uid_t uid, gid_t gid,
+ dict_t *xdata);
int
-afr_fchown (call_frame_t *frame, xlator_t *this,
- fd_t *fd, uid_t uid, gid_t gid, dict_t *xdata);
+afr_fchown(call_frame_t *frame, xlator_t *this, fd_t *fd, uid_t uid, gid_t gid,
+ dict_t *xdata);
int32_t
-afr_fchmod (call_frame_t *frame, xlator_t *this,
- fd_t *fd, mode_t mode, dict_t *xdata);
+afr_fchmod(call_frame_t *frame, xlator_t *this, fd_t *fd, mode_t mode,
+ dict_t *xdata);
int32_t
-afr_writev (call_frame_t *frame, xlator_t *this, fd_t *fd,
- struct iovec *vector, int32_t count, off_t offset,
- uint32_t flags, struct iobref *iobref, dict_t *xdata);
+afr_writev(call_frame_t *frame, xlator_t *this, fd_t *fd, struct iovec *vector,
+ int32_t count, off_t offset, uint32_t flags, struct iobref *iobref,
+ dict_t *xdata);
int32_t
-afr_truncate (call_frame_t *frame, xlator_t *this,
- loc_t *loc, off_t offset, dict_t *xdata);
+afr_truncate(call_frame_t *frame, xlator_t *this, loc_t *loc, off_t offset,
+ dict_t *xdata);
int32_t
-afr_ftruncate (call_frame_t *frame, xlator_t *this,
- fd_t *fd, off_t offset, dict_t *xdata);
+afr_ftruncate(call_frame_t *frame, xlator_t *this, fd_t *fd, off_t offset,
+ dict_t *xdata);
int32_t
-afr_utimens (call_frame_t *frame, xlator_t *this,
- loc_t *loc, struct timespec tv[2], dict_t *xdata);
+afr_utimens(call_frame_t *frame, xlator_t *this, loc_t *loc,
+ struct timespec tv[2], dict_t *xdata);
int
-afr_setattr (call_frame_t *frame, xlator_t *this,
- loc_t *loc, struct iatt *buf, int32_t valid, dict_t *xdata);
+afr_setattr(call_frame_t *frame, xlator_t *this, loc_t *loc, struct iatt *buf,
+ int32_t valid, dict_t *xdata);
int
-afr_fsetattr (call_frame_t *frame, xlator_t *this,
- fd_t *fd, struct iatt *buf, int32_t valid, dict_t *xdata);
+afr_fsetattr(call_frame_t *frame, xlator_t *this, fd_t *fd, struct iatt *buf,
+ int32_t valid, dict_t *xdata);
int32_t
-afr_setxattr (call_frame_t *frame, xlator_t *this,
- loc_t *loc, dict_t *dict, int32_t flags, dict_t *xdata);
+afr_setxattr(call_frame_t *frame, xlator_t *this, loc_t *loc, dict_t *dict,
+ int32_t flags, dict_t *xdata);
int32_t
-afr_fsetxattr (call_frame_t *frame, xlator_t *this,
- fd_t *fd, dict_t *dict, int32_t flags, dict_t *xdata);
+afr_fsetxattr(call_frame_t *frame, xlator_t *this, fd_t *fd, dict_t *dict,
+ int32_t flags, dict_t *xdata);
int32_t
-afr_removexattr (call_frame_t *frame, xlator_t *this,
- loc_t *loc, const char *name, dict_t *xdata);
+afr_removexattr(call_frame_t *frame, xlator_t *this, loc_t *loc,
+ const char *name, dict_t *xdata);
int32_t
-afr_fremovexattr (call_frame_t *frame, xlator_t *this,
- fd_t *fd, const char *name, dict_t *xdata);
+afr_fremovexattr(call_frame_t *frame, xlator_t *this, fd_t *fd,
+ const char *name, dict_t *xdata);
int
-afr_discard (call_frame_t *frame, xlator_t *this, fd_t *fd, off_t offset,
- size_t len, dict_t *xdata);
+afr_discard(call_frame_t *frame, xlator_t *this, fd_t *fd, off_t offset,
+ size_t len, dict_t *xdata);
int
-afr_fallocate (call_frame_t *frame, xlator_t *this, fd_t *fd, int32_t mode,
- off_t offset, size_t len, dict_t *xdata);
+afr_fallocate(call_frame_t *frame, xlator_t *this, fd_t *fd, int32_t mode,
+ off_t offset, size_t len, dict_t *xdata);
int
afr_zerofill(call_frame_t *frame, xlator_t *this, fd_t *fd, off_t offset,
off_t len, dict_t *xdata);
int32_t
-afr_xattrop (call_frame_t *frame, xlator_t *this, loc_t *loc,
- gf_xattrop_flags_t optype, dict_t *xattr, dict_t *xdata);
+afr_xattrop(call_frame_t *frame, xlator_t *this, loc_t *loc,
+ gf_xattrop_flags_t optype, dict_t *xattr, dict_t *xdata);
int32_t
-afr_fxattrop (call_frame_t *frame, xlator_t *this, fd_t *fd,
- gf_xattrop_flags_t optype, dict_t *xattr, dict_t *xdata);
+afr_fxattrop(call_frame_t *frame, xlator_t *this, fd_t *fd,
+ gf_xattrop_flags_t optype, dict_t *xattr, dict_t *xdata);
+
+int
+afr_fsync(call_frame_t *frame, xlator_t *this, fd_t *fd, int32_t datasync,
+ dict_t *xdata);
#endif /* __INODE_WRITE_H__ */
diff --git a/xlators/cluster/afr/src/afr-lk-common.c b/xlators/cluster/afr/src/afr-lk-common.c
index 53bb7920089..bc8eabe0f43 100644
--- a/xlators/cluster/afr/src/afr-lk-common.c
+++ b/xlators/cluster/afr/src/afr-lk-common.c
@@ -8,9 +8,9 @@
cases as published by the Free Software Foundation.
*/
-#include "dict.h"
-#include "byte-order.h"
-#include "common-utils.h"
+#include <glusterfs/dict.h>
+#include <glusterfs/byte-order.h>
+#include <glusterfs/common-utils.h>
#include "afr.h"
#include "afr-transaction.h"
@@ -18,1669 +18,774 @@
#include <signal.h>
-
-#define LOCKED_NO 0x0 /* no lock held */
-#define LOCKED_YES 0x1 /* for DATA, METADATA, ENTRY and higher_path */
-#define LOCKED_LOWER 0x2 /* for lower path */
-
-#define AFR_TRACE_INODELK_IN(frame, this, params ...) \
- do { \
- afr_private_t *_priv = this->private; \
- if (!_priv->inodelk_trace) \
- break; \
- afr_trace_inodelk_in (frame, this, params); \
- } while (0);
-
-#define AFR_TRACE_INODELK_OUT(frame, this, params ...) \
- do { \
- afr_private_t *_priv = this->private; \
- if (!_priv->inodelk_trace) \
- break; \
- afr_trace_inodelk_out (frame, this, params); \
- } while (0);
-
-#define AFR_TRACE_ENTRYLK_IN(frame, this, params ...) \
- do { \
- afr_private_t *_priv = this->private; \
- if (!_priv->entrylk_trace) \
- break; \
- afr_trace_entrylk_in (frame, this, params); \
- } while (0);
-
-#define AFR_TRACE_ENTRYLK_OUT(frame, this, params ...) \
- do { \
- afr_private_t *_priv = this->private; \
- if (!_priv->entrylk_trace) \
- break; \
- afr_trace_entrylk_out (frame, this, params); \
- } while (0);
-
-int
-afr_entry_lockee_cmp (const void *l1, const void *l2)
-{
- const afr_entry_lockee_t *r1 = l1;
- const afr_entry_lockee_t *r2 = l2;
- int ret = 0;
- uuid_t gfid1 = {0};
- uuid_t gfid2 = {0};
-
- loc_gfid ((loc_t*)&r1->loc, gfid1);
- loc_gfid ((loc_t*)&r2->loc, gfid2);
- ret = gf_uuid_compare (gfid1, gfid2);
- /*Entrylks with NULL basename are the 'smallest'*/
- if (ret == 0) {
- if (!r1->basename)
- return -1;
- if (!r2->basename)
- return 1;
- ret = strcmp (r1->basename, r2->basename);
- }
-
- if (ret <= 0)
- return -1;
- else
- return 1;
-}
-
-int afr_lock_blocking (call_frame_t *frame, xlator_t *this, int child_index);
-
-static int
-afr_copy_locked_nodes (call_frame_t *frame, xlator_t *this);
-
-static uint64_t afr_lock_number = 1;
-
-static uint64_t
-get_afr_lock_number ()
-{
- return (++afr_lock_number);
-}
-
-int
-afr_set_lock_number (call_frame_t *frame, xlator_t *this)
-{
- afr_local_t *local = NULL;
- afr_internal_lock_t *int_lock = NULL;
-
- local = frame->local;
- int_lock = &local->internal_lock;
-
- int_lock->lock_number = get_afr_lock_number ();
-
- return 0;
-}
+#define LOCKED_NO 0x0 /* no lock held */
+#define LOCKED_YES 0x1 /* for DATA, METADATA, ENTRY and higher_path */
+#define LOCKED_LOWER 0x2 /* for lower path */
void
-afr_set_lk_owner (call_frame_t *frame, xlator_t *this, void *lk_owner)
-{
- gf_msg_trace (this->name, 0,
- "Setting lk-owner=%llu",
- (unsigned long long) (unsigned long)lk_owner);
-
- set_lk_owner_from_ptr (&frame->root->lk_owner, lk_owner);
-}
-
-static int
-is_afr_lock_selfheal (afr_local_t *local)
+afr_lockee_cleanup(afr_lockee_t *lockee)
{
- afr_internal_lock_t *int_lock = NULL;
- int ret = -1;
-
- int_lock = &local->internal_lock;
-
- switch (int_lock->selfheal_lk_type) {
- case AFR_DATA_SELF_HEAL_LK:
- case AFR_METADATA_SELF_HEAL_LK:
- ret = 1;
- break;
- case AFR_ENTRY_SELF_HEAL_LK:
- ret = 0;
- break;
- }
+ if (lockee->fd) {
+ fd_unref(lockee->fd);
+ lockee->fd = NULL;
+ } else {
+ loc_wipe(&lockee->loc);
+ }
- return ret;
+ GF_FREE(lockee->basename);
+ lockee->basename = NULL;
+ GF_FREE(lockee->locked_nodes);
+ lockee->locked_nodes = NULL;
-}
-
-int32_t
-internal_lock_count (call_frame_t *frame, xlator_t *this)
-{
- afr_local_t *local = NULL;
- afr_private_t *priv = NULL;
- int32_t call_count = 0;
- int i = 0;
-
- local = frame->local;
- priv = this->private;
-
- for (i = 0; i < priv->child_count; i++) {
- if (local->child_up[i])
- ++call_count;
- }
-
- return call_count;
-}
-
-static void
-afr_print_inodelk (char *str, int size, int cmd,
- struct gf_flock *flock, gf_lkowner_t *owner)
-{
- char *cmd_str = NULL;
- char *type_str = NULL;
-
- switch (cmd) {
-#if F_GETLK != F_GETLK64
- case F_GETLK64:
-#endif
- case F_GETLK:
- cmd_str = "GETLK";
- break;
-
-#if F_SETLK != F_SETLK64
- case F_SETLK64:
-#endif
- case F_SETLK:
- cmd_str = "SETLK";
- break;
-
-#if F_SETLKW != F_SETLKW64
- case F_SETLKW64:
-#endif
- case F_SETLKW:
- cmd_str = "SETLKW";
- break;
-
- default:
- cmd_str = "<null>";
- break;
- }
-
- switch (flock->l_type) {
- case F_RDLCK:
- type_str = "READ";
- break;
- case F_WRLCK:
- type_str = "WRITE";
- break;
- case F_UNLCK:
- type_str = "UNLOCK";
- break;
- default:
- type_str = "UNKNOWN";
- break;
- }
-
- snprintf (str, size, "lock=INODELK, cmd=%s, type=%s, "
- "start=%llu, len=%llu, pid=%llu, lk-owner=%s",
- cmd_str, type_str, (unsigned long long) flock->l_start,
- (unsigned long long) flock->l_len,
- (unsigned long long) flock->l_pid,
- lkowner_utoa (owner));
-
-}
-
-static void
-afr_print_lockee (char *str, int size, loc_t *loc, fd_t *fd,
- int child_index)
-{
- snprintf (str, size, "path=%s, fd=%p, child=%d",
- loc->path ? loc->path : "<nul>",
- fd ? fd : NULL,
- child_index);
+ return;
}
void
-afr_print_entrylk (char *str, int size, const char *basename,
- gf_lkowner_t *owner)
-{
- snprintf (str, size, "Basename=%s, lk-owner=%s",
- basename ? basename : "<nul>",
- lkowner_utoa (owner));
-}
-
-static void
-afr_print_verdict (int op_ret, int op_errno, char *str)
+afr_lockees_cleanup(afr_internal_lock_t *int_lock)
{
- if (op_ret < 0) {
- if (op_errno == EAGAIN)
- strcpy (str, "EAGAIN");
- else
- strcpy (str, "FAILED");
- }
- else
- strcpy (str, "GRANTED");
-}
+ int i = 0;
-static void
-afr_set_lock_call_type (afr_lock_call_type_t lock_call_type,
- char *lock_call_type_str,
- afr_internal_lock_t *int_lock)
-{
- switch (lock_call_type) {
- case AFR_INODELK_TRANSACTION:
- if (int_lock->transaction_lk_type == AFR_TRANSACTION_LK)
- strcpy (lock_call_type_str, "AFR_INODELK_TRANSACTION");
- else
- strcpy (lock_call_type_str, "AFR_INODELK_SELFHEAL");
- break;
- case AFR_INODELK_NB_TRANSACTION:
- if (int_lock->transaction_lk_type == AFR_TRANSACTION_LK)
- strcpy (lock_call_type_str, "AFR_INODELK_NB_TRANSACTION");
- else
- strcpy (lock_call_type_str, "AFR_INODELK_NB_SELFHEAL");
- break;
- case AFR_ENTRYLK_TRANSACTION:
- if (int_lock->transaction_lk_type == AFR_TRANSACTION_LK)
- strcpy (lock_call_type_str, "AFR_ENTRYLK_TRANSACTION");
- else
- strcpy (lock_call_type_str, "AFR_ENTRYLK_SELFHEAL");
- break;
- case AFR_ENTRYLK_NB_TRANSACTION:
- if (int_lock->transaction_lk_type == AFR_TRANSACTION_LK)
- strcpy (lock_call_type_str, "AFR_ENTRYLK_NB_TRANSACTION");
- else
- strcpy (lock_call_type_str, "AFR_ENTRYLK_NB_SELFHEAL");
- break;
- default:
- strcpy (lock_call_type_str, "UNKNOWN");
- break;
- }
+ for (i = 0; i < int_lock->lockee_count; i++) {
+ afr_lockee_cleanup(&int_lock->lockee[i]);
+ }
+ return;
}
-
-static void
-afr_trace_inodelk_out (call_frame_t *frame, xlator_t *this,
- afr_lock_call_type_t lock_call_type,
- afr_lock_op_type_t lk_op_type, struct gf_flock *flock,
- int op_ret, int op_errno, int32_t child_index)
-{
- afr_internal_lock_t *int_lock = NULL;
- afr_local_t *local = NULL;
-
- char lockee[256];
- char lock_call_type_str[256];
- char verdict[16];
-
- local = frame->local;
- int_lock = &local->internal_lock;
-
- afr_print_lockee (lockee, 256, &local->loc, local->fd, child_index);
-
- afr_set_lock_call_type (lock_call_type, lock_call_type_str, int_lock);
-
- afr_print_verdict (op_ret, op_errno, verdict);
-
- gf_msg (this->name, GF_LOG_INFO, 0, AFR_MSG_LOCK_INFO,
- "[%s %s] [%s] lk-owner=%s Lockee={%s} Number={%llu}",
- lock_call_type_str,
- lk_op_type == AFR_LOCK_OP ? "LOCK REPLY" : "UNLOCK REPLY",
- verdict, lkowner_utoa (&frame->root->lk_owner), lockee,
- (unsigned long long) int_lock->lock_number);
-
-}
-
-static void
-afr_trace_inodelk_in (call_frame_t *frame, xlator_t *this,
- afr_lock_call_type_t lock_call_type,
- afr_lock_op_type_t lk_op_type, struct gf_flock *flock,
- int32_t cmd, int32_t child_index)
-{
- afr_local_t *local = NULL;
- afr_internal_lock_t *int_lock = NULL;
-
- char lock[256];
- char lockee[256];
- char lock_call_type_str[256];
-
- local = frame->local;
- int_lock = &local->internal_lock;
-
- afr_print_inodelk (lock, 256, cmd, flock, &frame->root->lk_owner);
- afr_print_lockee (lockee, 256, &local->loc, local->fd, child_index);
-
- afr_set_lock_call_type (lock_call_type, lock_call_type_str, int_lock);
-
- gf_msg (this->name, GF_LOG_INFO, 0, AFR_MSG_LOCK_INFO,
- "[%s %s] Lock={%s} Lockee={%s} Number={%llu}",
- lock_call_type_str,
- lk_op_type == AFR_LOCK_OP ? "LOCK REQUEST" : "UNLOCK REQUEST",
- lock, lockee,
- (unsigned long long) int_lock->lock_number);
-
+int
+afr_entry_lockee_cmp(const void *l1, const void *l2)
+{
+ const afr_lockee_t *r1 = l1;
+ const afr_lockee_t *r2 = l2;
+ int ret = 0;
+ uuid_t gfid1 = {0};
+ uuid_t gfid2 = {0};
+
+ loc_gfid((loc_t *)&r1->loc, gfid1);
+ loc_gfid((loc_t *)&r2->loc, gfid2);
+ ret = gf_uuid_compare(gfid1, gfid2);
+ /*Entrylks with NULL basename are the 'smallest'*/
+ if (ret == 0) {
+ if (!r1->basename)
+ return -1;
+ if (!r2->basename)
+ return 1;
+ ret = strcmp(r1->basename, r2->basename);
+ }
+
+ if (ret <= 0)
+ return -1;
+ else
+ return 1;
}
-static void
-afr_trace_entrylk_in (call_frame_t *frame, xlator_t *this,
- afr_lock_call_type_t lock_call_type,
- afr_lock_op_type_t lk_op_type, const char *basename,
- int32_t cookie)
-{
- afr_local_t *local = NULL;
- afr_internal_lock_t *int_lock = NULL;
- afr_private_t *priv = NULL;
- int child_index = 0;
- int lockee_no = 0;
-
- char lock[256];
- char lockee[256];
- char lock_call_type_str[256];
-
- local = frame->local;
- int_lock = &local->internal_lock;
- priv = this->private;
-
- if (!priv->entrylk_trace) {
- return;
- }
- lockee_no = cookie / priv->child_count;
- child_index = cookie % priv->child_count;
-
- afr_print_entrylk (lock, 256, basename, &frame->root->lk_owner);
- afr_print_lockee (lockee, 256, &int_lock->lockee[lockee_no].loc, local->fd,
- child_index);
-
- afr_set_lock_call_type (lock_call_type, lock_call_type_str, int_lock);
-
- gf_msg (this->name, GF_LOG_INFO, 0, AFR_MSG_LOCK_INFO,
- "[%s %s] Lock={%s} Lockee={%s} Number={%llu}, Cookie={%d}",
- lock_call_type_str,
- lk_op_type == AFR_LOCK_OP ? "LOCK REQUEST" : "UNLOCK REQUEST",
- lock, lockee,
- (unsigned long long) int_lock->lock_number,
- cookie);
-}
+int
+afr_lock_blocking(call_frame_t *frame, xlator_t *this, int child_index);
-static void
-afr_trace_entrylk_out (call_frame_t *frame, xlator_t *this,
- afr_lock_call_type_t lock_call_type,
- afr_lock_op_type_t lk_op_type, const char *basename,
- int op_ret, int op_errno, int32_t cookie)
+void
+afr_set_lk_owner(call_frame_t *frame, xlator_t *this, void *lk_owner)
{
- afr_internal_lock_t *int_lock = NULL;
- afr_local_t *local = NULL;
- afr_private_t *priv = NULL;
- int lockee_no = 0;
- int child_index = 0;
-
- char lock[256];
- char lockee[256];
- char lock_call_type_str[256];
- char verdict[16];
-
- local = frame->local;
- int_lock = &local->internal_lock;
- priv = this->private;
-
- if (!priv->entrylk_trace) {
- return;
- }
- lockee_no = cookie / priv->child_count;
- child_index = cookie % priv->child_count;
-
- afr_print_entrylk (lock, 256, basename, &frame->root->lk_owner);
- afr_print_lockee (lockee, 256, &int_lock->lockee[lockee_no].loc, local->fd,
- child_index);
-
- afr_set_lock_call_type (lock_call_type, lock_call_type_str, int_lock);
-
- afr_print_verdict (op_ret, op_errno, verdict);
-
- gf_msg (this->name, GF_LOG_INFO, 0, AFR_MSG_LOCK_INFO,
- "[%s %s] [%s] Lock={%s} Lockee={%s} Number={%llu} Cookie={%d}",
- lock_call_type_str,
- lk_op_type == AFR_LOCK_OP ? "LOCK REPLY" : "UNLOCK REPLY",
- verdict,
- lock, lockee,
- (unsigned long long) int_lock->lock_number,
- cookie);
+ gf_msg_trace(this->name, 0, "Setting lk-owner=%llu",
+ (unsigned long long)(unsigned long)lk_owner);
+ set_lk_owner_from_ptr(&frame->root->lk_owner, lk_owner);
}
-static int
-transaction_lk_op (afr_local_t *local)
+int32_t
+internal_lock_count(call_frame_t *frame, xlator_t *this)
{
- afr_internal_lock_t *int_lock = NULL;
- int ret = -1;
+ afr_local_t *local = NULL;
+ afr_private_t *priv = NULL;
+ int32_t call_count = 0;
+ int i = 0;
- int_lock = &local->internal_lock;
+ local = frame->local;
+ priv = this->private;
- if (int_lock->transaction_lk_type == AFR_TRANSACTION_LK) {
- gf_msg_debug (THIS->name, 0,
- "lk op is for a transaction");
- ret = 1;
- }
- else if (int_lock->transaction_lk_type == AFR_SELFHEAL_LK) {
- gf_msg_debug (THIS->name, 0,
- "lk op is for a self heal");
-
- ret = 0;
- }
-
- if (ret == -1)
- gf_msg_debug (THIS->name, 0,
- "lk op is not set");
-
- return ret;
+ for (i = 0; i < priv->child_count; i++) {
+ if (local->child_up[i])
+ ++call_count;
+ }
+ return call_count;
}
-static int
-is_afr_lock_transaction (afr_local_t *local)
+int
+afr_add_entry_lockee(afr_local_t *local, loc_t *loc, char *basename,
+ int child_count)
{
- int ret = 0;
+ int ret = -ENOMEM;
+ afr_internal_lock_t *int_lock = &local->internal_lock;
+ afr_lockee_t *lockee = &int_lock->lockee[int_lock->lockee_count];
- switch (local->transaction.type) {
- case AFR_DATA_TRANSACTION:
- case AFR_METADATA_TRANSACTION:
- ret = 1;
- break;
+ GF_ASSERT(int_lock->lockee_count < AFR_LOCKEE_COUNT_MAX);
+ loc_copy(&lockee->loc, loc);
+ lockee->basename = (basename) ? gf_strdup(basename) : NULL;
+ if (basename && !lockee->basename)
+ goto out;
- case AFR_ENTRY_RENAME_TRANSACTION:
- case AFR_ENTRY_TRANSACTION:
- ret = 0;
- break;
+ lockee->locked_count = 0;
+ lockee->locked_nodes = GF_CALLOC(child_count, sizeof(*lockee->locked_nodes),
+ gf_afr_mt_afr_node_character);
- }
+ if (!lockee->locked_nodes)
+ goto out;
- return ret;
+ ret = 0;
+ int_lock->lockee_count++;
+out:
+ if (ret) {
+ afr_lockee_cleanup(lockee);
+ }
+ return ret;
}
int
-afr_init_entry_lockee (afr_entry_lockee_t *lockee, afr_local_t *local,
- loc_t *loc, char *basename, int child_count)
+afr_add_inode_lockee(afr_local_t *local, int child_count)
{
- int ret = -1;
+ int ret = -ENOMEM;
+ afr_internal_lock_t *int_lock = &local->internal_lock;
+ afr_lockee_t *lockee = &int_lock->lockee[int_lock->lockee_count];
- loc_copy (&lockee->loc, loc);
- lockee->basename = (basename)? gf_strdup (basename): NULL;
- if (basename && !lockee->basename)
- goto out;
+ if (local->fd) {
+ lockee->fd = fd_ref(local->fd);
+ } else {
+ loc_copy(&lockee->loc, &local->loc);
+ }
- lockee->locked_count = 0;
- lockee->locked_nodes = GF_CALLOC (child_count,
- sizeof (*lockee->locked_nodes),
- gf_afr_mt_afr_node_character);
+ lockee->locked_count = 0;
+ lockee->locked_nodes = GF_CALLOC(child_count, sizeof(*lockee->locked_nodes),
+ gf_afr_mt_afr_node_character);
- if (!lockee->locked_nodes)
- goto out;
+ if (!lockee->locked_nodes)
+ goto out;
- ret = 0;
+ ret = 0;
+ int_lock->lockee_count++;
out:
- return ret;
-
-}
-
-void
-afr_entry_lockee_cleanup (afr_internal_lock_t *int_lock)
-{
- int i = 0;
-
- for (i = 0; i < int_lock->lockee_count; i++) {
- loc_wipe (&int_lock->lockee[i].loc);
- if (int_lock->lockee[i].basename)
- GF_FREE (int_lock->lockee[i].basename);
- if (int_lock->lockee[i].locked_nodes)
- GF_FREE (int_lock->lockee[i].locked_nodes);
- }
-
- return;
-}
-
-static int
-initialize_entrylk_variables (call_frame_t *frame, xlator_t *this)
-{
- afr_local_t *local = NULL;
- afr_internal_lock_t *int_lock = NULL;
- afr_private_t *priv = NULL;
-
- int i = 0;
-
- priv = this->private;
- local = frame->local;
- int_lock = &local->internal_lock;
-
- int_lock->entrylk_lock_count = 0;
- int_lock->lock_op_ret = -1;
- int_lock->lock_op_errno = 0;
-
- for (i = 0; i < AFR_LOCKEE_COUNT_MAX; i++) {
- if (!int_lock->lockee[i].locked_nodes)
- break;
- int_lock->lockee[i].locked_count = 0;
- memset (int_lock->lockee[i].locked_nodes, 0,
- sizeof (*int_lock->lockee[i].locked_nodes) *
- priv->child_count);
- }
-
- return 0;
+ if (ret) {
+ afr_lockee_cleanup(lockee);
+ }
+ return ret;
}
static int
-initialize_inodelk_variables (call_frame_t *frame, xlator_t *this)
+initialize_internal_lock_variables(call_frame_t *frame, xlator_t *this)
{
- afr_local_t *local = NULL;
- afr_internal_lock_t *int_lock = NULL;
- afr_private_t *priv = NULL;
- afr_inodelk_t *inodelk = NULL;
+ afr_local_t *local = NULL;
+ afr_internal_lock_t *int_lock = NULL;
+ afr_private_t *priv = NULL;
- priv = this->private;
- local = frame->local;
- int_lock = &local->internal_lock;
+ int i = 0;
- inodelk = afr_get_inodelk (int_lock, int_lock->domain);
+ priv = this->private;
+ local = frame->local;
+ int_lock = &local->internal_lock;
- inodelk->lock_count = 0;
- int_lock->lk_attempted_count = 0;
- int_lock->lock_op_ret = -1;
- int_lock->lock_op_errno = 0;
+ int_lock->lock_count = 0;
+ int_lock->lock_op_ret = -1;
+ int_lock->lock_op_errno = 0;
+ int_lock->lk_attempted_count = 0;
- memset (inodelk->locked_nodes, 0,
- sizeof (*inodelk->locked_nodes) * priv->child_count);
- memset (int_lock->locked_nodes, 0,
- sizeof (*int_lock->locked_nodes) * priv->child_count);
+ for (i = 0; i < AFR_LOCKEE_COUNT_MAX; i++) {
+ if (!int_lock->lockee[i].locked_nodes)
+ break;
+ int_lock->lockee[i].locked_count = 0;
+ memset(int_lock->lockee[i].locked_nodes, 0,
+ sizeof(*int_lock->lockee[i].locked_nodes) * priv->child_count);
+ }
- return 0;
+ return 0;
}
int
-afr_lockee_locked_nodes_count (afr_internal_lock_t *int_lock)
+afr_lockee_locked_nodes_count(afr_internal_lock_t *int_lock)
{
- int call_count = 0;
- int i = 0;
+ int call_count = 0;
+ int i = 0;
- for (i = 0; i < int_lock->lockee_count; i++)
- call_count += int_lock->lockee[i].locked_count;
+ for (i = 0; i < int_lock->lockee_count; i++)
+ call_count += int_lock->lockee[i].locked_count;
- return call_count;
+ return call_count;
}
int
-afr_locked_nodes_count (unsigned char *locked_nodes, int child_count)
+afr_locked_nodes_count(unsigned char *locked_nodes, int child_count)
{
- int i = 0;
- int call_count = 0;
+ int i = 0;
+ int call_count = 0;
- for (i = 0; i < child_count; i++) {
- if (locked_nodes[i] & LOCKED_YES)
- call_count++;
- }
+ for (i = 0; i < child_count; i++) {
+ if (locked_nodes[i] & LOCKED_YES)
+ call_count++;
+ }
- return call_count;
+ return call_count;
}
-/* FIXME: What if UNLOCK fails */
-static int32_t
-afr_unlock_common_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
- int32_t op_ret, int32_t op_errno, dict_t *xdata)
+static void
+afr_log_locks_failure(call_frame_t *frame, char *where, char *what,
+ int op_errno)
{
- afr_local_t *local = NULL;
- afr_internal_lock_t *int_lock = NULL;
- int call_count = 0;
-
- local = frame->local;
- int_lock = &local->internal_lock;
-
- LOCK (&frame->lock);
- {
- call_count = --int_lock->lk_call_count;
- }
- UNLOCK (&frame->lock);
-
- if (call_count == 0) {
- gf_msg_trace (this->name, 0,
- "All internal locks unlocked");
+ xlator_t *this = frame->this;
+ gf_lkowner_t *lk_owner = &frame->root->lk_owner;
+ afr_local_t *local = frame->local;
+ const char *fop = NULL;
+ char *gfid = NULL;
+ const char *name = NULL;
- int_lock->lock_cbk (frame, this);
- }
+ fop = gf_fop_list[local->op];
- return 0;
+ switch (local->transaction.type) {
+ case AFR_ENTRY_RENAME_TRANSACTION:
+ case AFR_ENTRY_TRANSACTION:
+ switch (local->op) {
+ case GF_FOP_LINK:
+ gfid = uuid_utoa(local->newloc.pargfid);
+ name = local->newloc.name;
+ break;
+ default:
+ gfid = uuid_utoa(local->loc.pargfid);
+ name = local->loc.name;
+ break;
+ }
+ gf_msg(this->name, GF_LOG_WARNING, op_errno,
+ AFR_MSG_INTERNAL_LKS_FAILED,
+ "Unable to do entry %s with lk-owner:%s on %s "
+ "while attempting %s on {pgfid:%s, name:%s}.",
+ what, lkowner_utoa(lk_owner), where, fop, gfid, name);
+ break;
+ case AFR_DATA_TRANSACTION:
+ case AFR_METADATA_TRANSACTION:
+ gfid = uuid_utoa(local->inode->gfid);
+ gf_msg(this->name, GF_LOG_WARNING, op_errno,
+ AFR_MSG_INTERNAL_LKS_FAILED,
+ "Unable to do inode %s with lk-owner:%s on %s "
+ "while attempting %s on gfid:%s.",
+ what, lkowner_utoa(lk_owner), where, fop, gfid);
+ break;
+ }
}
static int32_t
-afr_unlock_inodelk_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
- int32_t op_ret, int32_t op_errno, dict_t *xdata)
+afr_unlock_common_cbk(call_frame_t *frame, void *cookie, xlator_t *this,
+ int32_t op_ret, int32_t op_errno, dict_t *xdata)
{
- afr_local_t *local = NULL;
- afr_internal_lock_t *int_lock = NULL;
- afr_inodelk_t *inodelk = NULL;
- int32_t child_index = (long)cookie;
- afr_private_t *priv = NULL;
-
- local = frame->local;
- int_lock = &local->internal_lock;
-
- AFR_TRACE_INODELK_OUT (frame, this, AFR_INODELK_TRANSACTION,
- AFR_UNLOCK_OP, NULL, op_ret,
- op_errno, child_index);
-
- priv = this->private;
-
- if (op_ret < 0 && op_errno != ENOTCONN && op_errno != EBADFD) {
- gf_msg (this->name, GF_LOG_ERROR, op_errno,
- AFR_MSG_INODE_UNLOCK_FAIL,
- "path=%s gfid=%s: unlock failed on subvolume %s "
- "with lock owner %s", local->loc.path,
- loc_gfid_utoa (&(local->loc)),
- priv->children[child_index]->name,
- lkowner_utoa (&frame->root->lk_owner));
- }
-
+ afr_local_t *local = NULL;
+ afr_private_t *priv = NULL;
+ afr_internal_lock_t *int_lock = NULL;
+ int lockee_num = 0;
+ int call_count = 0;
+ int child_index = 0;
+ int ret = 0;
- inodelk = afr_get_inodelk (int_lock, int_lock->domain);
- inodelk->locked_nodes[child_index] &= LOCKED_NO;
- if (local->transaction.eager_lock)
- local->transaction.eager_lock[child_index] = 0;
+ local = frame->local;
+ int_lock = &local->internal_lock;
+ priv = this->private;
+ lockee_num = (int)((long)cookie) / priv->child_count;
+ child_index = (int)((long)cookie) % priv->child_count;
- afr_unlock_common_cbk (frame, cookie, this, op_ret, op_errno, xdata);
+ if (op_ret < 0 && op_errno != ENOTCONN && op_errno != EBADFD) {
+ afr_log_locks_failure(frame, priv->children[child_index]->name,
+ "unlock", op_errno);
+ }
- return 0;
-
-}
+ int_lock->lockee[lockee_num].locked_nodes[child_index] &= LOCKED_NO;
+ if (local->transaction.type == AFR_DATA_TRANSACTION && op_ret != 1)
+ ret = afr_write_subvol_reset(frame, this);
-static int
-afr_unlock_inodelk (call_frame_t *frame, xlator_t *this)
-{
- afr_internal_lock_t *int_lock = NULL;
- afr_inodelk_t *inodelk = NULL;
- afr_local_t *local = NULL;
- afr_private_t *priv = NULL;
- struct gf_flock flock = {0,};
- struct gf_flock full_flock = {0,};
- struct gf_flock *flock_use = NULL;
- int call_count = 0;
- int i = 0;
- int piggyback = 0;
- afr_fd_ctx_t *fd_ctx = NULL;
+ LOCK(&frame->lock);
+ {
+ call_count = --int_lock->lk_call_count;
+ }
+ UNLOCK(&frame->lock);
+ if (call_count == 0) {
+ int_lock->lock_cbk(frame, this);
+ }
- local = frame->local;
- int_lock = &local->internal_lock;
- priv = this->private;
-
- inodelk = afr_get_inodelk (int_lock, int_lock->domain);
-
- flock.l_start = inodelk->flock.l_start;
- flock.l_len = inodelk->flock.l_len;
- flock.l_type = F_UNLCK;
-
- full_flock.l_type = F_UNLCK;
- call_count = afr_locked_nodes_count (inodelk->locked_nodes,
- priv->child_count);
-
- int_lock->lk_call_count = call_count;
-
- if (!call_count) {
- gf_msg_trace (this->name, 0,
- "No internal locks unlocked");
-
- int_lock->lock_cbk (frame, this);
- goto out;
- }
-
- if (local->fd)
- fd_ctx = afr_fd_ctx_get (local->fd, this);
-
- for (i = 0; i < priv->child_count; i++) {
- if ((inodelk->locked_nodes[i] & LOCKED_YES) != LOCKED_YES)
- continue;
-
- if (local->fd) {
- flock_use = &flock;
- if (!local->transaction.eager_lock[i]) {
- goto wind;
- }
-
- piggyback = 0;
-
- LOCK (&local->fd->lock);
- {
- if (fd_ctx->lock_piggyback[i]) {
- fd_ctx->lock_piggyback[i]--;
- piggyback = 1;
- } else {
- fd_ctx->lock_acquired[i]--;
- }
- }
- UNLOCK (&local->fd->lock);
-
- if (piggyback) {
- afr_unlock_inodelk_cbk (frame, (void *) (long) i,
- this, 1, 0, NULL);
- if (!--call_count)
- break;
- continue;
- }
-
- flock_use = &full_flock;
- wind:
- AFR_TRACE_INODELK_IN (frame, this,
- AFR_INODELK_TRANSACTION,
- AFR_UNLOCK_OP, flock_use, F_SETLK,
- i);
-
- STACK_WIND_COOKIE (frame, afr_unlock_inodelk_cbk,
- (void *) (long)i,
- priv->children[i],
- priv->children[i]->fops->finodelk,
- int_lock->domain, local->fd,
- F_SETLK, flock_use, NULL);
-
- if (!--call_count)
- break;
-
- } else {
- AFR_TRACE_INODELK_IN (frame, this,
- AFR_INODELK_TRANSACTION,
- AFR_UNLOCK_OP, &flock, F_SETLK, i);
-
- STACK_WIND_COOKIE (frame, afr_unlock_inodelk_cbk,
- (void *) (long)i,
- priv->children[i],
- priv->children[i]->fops->inodelk,
- int_lock->domain, &local->loc,
- F_SETLK, &flock, NULL);
-
- if (!--call_count)
- break;
- }
- }
-out:
- return 0;
+ return ret;
}
-static int32_t
-afr_unlock_entrylk_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
- int32_t op_ret, int32_t op_errno, dict_t *xdata)
-{
- afr_local_t *local = NULL;
- afr_private_t *priv = NULL;
- afr_internal_lock_t *int_lock = NULL;
- int32_t child_index = 0;
- int lockee_no = 0;
-
- priv = this->private;
- lockee_no = (int)((long) cookie) / priv->child_count;
- child_index = (int) ((long) cookie) % priv->child_count;
-
- local = frame->local;
- int_lock = &local->internal_lock;
-
- AFR_TRACE_ENTRYLK_OUT (frame, this, AFR_ENTRYLK_TRANSACTION,
- AFR_UNLOCK_OP,
- int_lock->lockee[lockee_no].basename, op_ret,
- op_errno, (int) ((long)cookie));
-
- if (op_ret < 0) {
- gf_msg (this->name, GF_LOG_ERROR, op_errno,
- AFR_MSG_ENTRY_UNLOCK_FAIL,
- "%s: unlock failed on %s", local->loc.path,
- priv->children[child_index]->name);
- }
-
- int_lock->lockee[lockee_no].locked_nodes[child_index] &= LOCKED_NO;
- afr_unlock_common_cbk (frame, cookie, this, op_ret, op_errno, NULL);
+void
+afr_internal_lock_wind(call_frame_t *frame,
+ int32_t (*cbk)(call_frame_t *, void *, xlator_t *,
+ int32_t, int32_t, dict_t *),
+ void *cookie, int child, int lockee_num,
+ gf_boolean_t blocking, gf_boolean_t unlock)
+{
+ afr_local_t *local = frame->local;
+ xlator_t *this = frame->this;
+ afr_private_t *priv = this->private;
+ afr_internal_lock_t *int_lock = &local->internal_lock;
+ entrylk_cmd cmd = ENTRYLK_LOCK_NB;
+ int32_t cmd1 = F_SETLK;
+ struct gf_flock flock = {
+ 0,
+ };
+
+ switch (local->transaction.type) {
+ case AFR_ENTRY_TRANSACTION:
+ case AFR_ENTRY_RENAME_TRANSACTION:
+ if (unlock) {
+ cmd = ENTRYLK_UNLOCK;
+ } else if (blocking) { /*Doesn't make sense to have blocking
+ unlock*/
+ cmd = ENTRYLK_LOCK;
+ }
+
+ if (local->fd) {
+ STACK_WIND_COOKIE(frame, cbk, cookie, priv->children[child],
+ priv->children[child]->fops->fentrylk,
+ int_lock->domain,
+ int_lock->lockee[lockee_num].fd,
+ int_lock->lockee[lockee_num].basename, cmd,
+ ENTRYLK_WRLCK, NULL);
+ } else {
+ STACK_WIND_COOKIE(frame, cbk, cookie, priv->children[child],
+ priv->children[child]->fops->entrylk,
+ int_lock->domain,
+ &int_lock->lockee[lockee_num].loc,
+ int_lock->lockee[lockee_num].basename, cmd,
+ ENTRYLK_WRLCK, NULL);
+ }
+ break;
- return 0;
+ case AFR_DATA_TRANSACTION:
+ case AFR_METADATA_TRANSACTION:
+ flock = int_lock->lockee[lockee_num].flock;
+ if (unlock) {
+ flock.l_type = F_UNLCK;
+ } else if (blocking) { /*Doesn't make sense to have blocking
+ unlock*/
+ cmd1 = F_SETLKW;
+ }
+
+ if (local->fd) {
+ STACK_WIND_COOKIE(
+ frame, cbk, cookie, priv->children[child],
+ priv->children[child]->fops->finodelk, int_lock->domain,
+ int_lock->lockee[lockee_num].fd, cmd1, &flock, NULL);
+ } else {
+ STACK_WIND_COOKIE(
+ frame, cbk, cookie, priv->children[child],
+ priv->children[child]->fops->inodelk, int_lock->domain,
+ &int_lock->lockee[lockee_num].loc, cmd1, &flock, NULL);
+ }
+ break;
+ }
}
static int
-afr_unlock_entrylk (call_frame_t *frame, xlator_t *this)
-{
- afr_internal_lock_t *int_lock = NULL;
- afr_local_t *local = NULL;
- afr_private_t *priv = NULL;
- int call_count = 0;
- int index = 0;
- int lockee_no = 0;
- int copies = 0;
- int i = -1;
-
- local = frame->local;
- int_lock = &local->internal_lock;
- priv = this->private;
- copies = priv->child_count;
-
- call_count = afr_lockee_locked_nodes_count (int_lock);
-
- int_lock->lk_call_count = call_count;
-
- if (!call_count){
- gf_msg_trace (this->name, 0,
- "No internal locks unlocked");
- int_lock->lock_cbk (frame, this);
- goto out;
- }
-
- for (i = 0; i < int_lock->lockee_count * priv->child_count; i++) {
- lockee_no = i / copies;
- index = i % copies;
- if (int_lock->lockee[lockee_no].locked_nodes[index] & LOCKED_YES) {
- AFR_TRACE_ENTRYLK_IN (frame, this, AFR_ENTRYLK_NB_TRANSACTION,
- AFR_UNLOCK_OP,
- int_lock->lockee[lockee_no].basename,
- i);
-
- STACK_WIND_COOKIE (frame, afr_unlock_entrylk_cbk,
- (void *) (long) i,
- priv->children[index],
- priv->children[index]->fops->entrylk,
- int_lock->domain,
- &int_lock->lockee[lockee_no].loc,
- int_lock->lockee[lockee_no].basename,
- ENTRYLK_UNLOCK, ENTRYLK_WRLCK, NULL);
-
- if (!--call_count)
- break;
- }
+afr_unlock_now(call_frame_t *frame, xlator_t *this)
+{
+ afr_internal_lock_t *int_lock = NULL;
+ afr_local_t *local = NULL;
+ afr_private_t *priv = NULL;
+ int call_count = 0;
+ int child_index = 0;
+ int lockee_num = 0;
+ int i = -1;
+
+ local = frame->local;
+ int_lock = &local->internal_lock;
+ priv = this->private;
+
+ call_count = afr_lockee_locked_nodes_count(int_lock);
+
+ int_lock->lk_call_count = call_count;
+
+ if (!call_count) {
+ gf_msg_trace(this->name, 0, "No internal locks unlocked");
+ int_lock->lock_cbk(frame, this);
+ goto out;
+ }
+
+ for (i = 0; i < int_lock->lockee_count * priv->child_count; i++) {
+ lockee_num = i / priv->child_count;
+ child_index = i % priv->child_count;
+ if (int_lock->lockee[lockee_num].locked_nodes[child_index] &
+ LOCKED_YES) {
+ afr_internal_lock_wind(frame, afr_unlock_common_cbk,
+ (void *)(long)i, child_index, lockee_num,
+ _gf_false, _gf_true);
+ if (!--call_count)
+ break;
}
+ }
out:
- return 0;
-
+ return 0;
}
static int32_t
-afr_lock_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
- int32_t op_ret, int32_t op_errno, dict_t *xdata)
-{
- afr_internal_lock_t *int_lock = NULL;
- afr_local_t *local = NULL;
- afr_private_t *priv = NULL;
- int cky = (long) cookie;
- int child_index = 0;
- int lockee_no = 0;
-
- priv = this->private;
- local = frame->local;
- int_lock = &local->internal_lock;
-
- child_index = ((int)cky) % priv->child_count;
- lockee_no = ((int)cky) / priv->child_count;
-
- LOCK (&frame->lock);
- {
- if (op_ret == -1) {
- if (op_errno == ENOSYS) {
- /* return ENOTSUP */
- gf_msg (this->name, GF_LOG_ERROR, ENOSYS,
- AFR_MSG_LOCK_XLATOR_NOT_LOADED,
- "subvolume does not support locking. "
- "please load features/locks xlator on server");
- local->op_ret = op_ret;
- int_lock->lock_op_ret = op_ret;
- }
-
- local->op_errno = op_errno;
- int_lock->lock_op_errno = op_errno;
- }
-
- int_lock->lk_attempted_count++;
- }
- UNLOCK (&frame->lock);
-
- if ((op_ret == -1) &&
- (op_errno == ENOSYS)) {
- afr_unlock (frame, this);
- } else {
- if (op_ret == 0) {
- if (local->transaction.type == AFR_ENTRY_TRANSACTION ||
- local->transaction.type == AFR_ENTRY_RENAME_TRANSACTION) {
- int_lock->lockee[lockee_no].locked_nodes[child_index] |= LOCKED_YES;
- int_lock->lockee[lockee_no].locked_count++;
- int_lock->entrylk_lock_count++;
- } else {
- int_lock->locked_nodes[child_index] |= LOCKED_YES;
- int_lock->lock_count++;
- }
+afr_lock_cbk(call_frame_t *frame, void *cookie, xlator_t *this, int32_t op_ret,
+ int32_t op_errno, dict_t *xdata)
+{
+ afr_internal_lock_t *int_lock = NULL;
+ afr_local_t *local = NULL;
+ afr_private_t *priv = NULL;
+ int cky = (long)cookie;
+ int child_index = 0;
+ int lockee_num = 0;
+
+ priv = this->private;
+ local = frame->local;
+ int_lock = &local->internal_lock;
+
+ child_index = ((int)cky) % priv->child_count;
+ lockee_num = ((int)cky) / priv->child_count;
+
+ LOCK(&frame->lock);
+ {
+ if (op_ret == -1) {
+ if (op_errno == ENOSYS) {
+ /* return ENOTSUP */
+ gf_msg(this->name, GF_LOG_ERROR, ENOSYS,
+ AFR_MSG_LOCK_XLATOR_NOT_LOADED,
+ "subvolume does not support locking. "
+ "please load features/locks xlator on server");
+ local->op_ret = op_ret;
+ int_lock->lock_op_ret = op_ret;
+ }
+
+ local->op_errno = op_errno;
+ int_lock->lock_op_errno = op_errno;
+ }
+
+ int_lock->lk_attempted_count++;
+ }
+ UNLOCK(&frame->lock);
+
+ if ((op_ret == -1) && (op_errno == ENOSYS)) {
+ afr_unlock_now(frame, this);
+ } else {
+ if (op_ret == 0) {
+ int_lock->lockee[lockee_num]
+ .locked_nodes[child_index] |= LOCKED_YES;
+ int_lock->lockee[lockee_num].locked_count++;
+ int_lock->lock_count++;
+ if (local->transaction.type == AFR_DATA_TRANSACTION) {
+ LOCK(&local->inode->lock);
+ {
+ local->inode_ctx->lock_count++;
}
- afr_lock_blocking (frame, this, cky + 1);
+ UNLOCK(&local->inode->lock);
+ }
}
+ afr_lock_blocking(frame, this, cky + 1);
+ }
- return 0;
-}
-
-static int32_t
-afr_blocking_inodelk_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
- int32_t op_ret, int32_t op_errno, dict_t *xdata)
-{
- AFR_TRACE_INODELK_OUT (frame, this, AFR_INODELK_TRANSACTION,
- AFR_LOCK_OP, NULL, op_ret,
- op_errno, (long) cookie);
-
- afr_lock_cbk (frame, cookie, this, op_ret, op_errno, xdata);
- return 0;
-
-}
-
-static int32_t
-afr_blocking_entrylk_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
- int32_t op_ret, int32_t op_errno, dict_t *xdata)
-{
- AFR_TRACE_ENTRYLK_OUT (frame, this, AFR_ENTRYLK_TRANSACTION,
- AFR_LOCK_OP, NULL, op_ret,
- op_errno, (long)cookie);
-
- afr_lock_cbk (frame, cookie, this, op_ret, op_errno, xdata);
- return 0;
-}
-
-static int
-afr_copy_locked_nodes (call_frame_t *frame, xlator_t *this)
-{
- afr_internal_lock_t *int_lock = NULL;
- afr_inodelk_t *inodelk = NULL;
- afr_local_t *local = NULL;
- afr_private_t *priv = NULL;
-
- priv = this->private;
- local = frame->local;
- int_lock = &local->internal_lock;
-
- switch (local->transaction.type) {
- case AFR_DATA_TRANSACTION:
- case AFR_METADATA_TRANSACTION:
- inodelk = afr_get_inodelk (int_lock, int_lock->domain);
- memcpy (inodelk->locked_nodes, int_lock->locked_nodes,
- sizeof (*inodelk->locked_nodes) * priv->child_count);
- inodelk->lock_count = int_lock->lock_count;
- break;
-
- case AFR_ENTRY_RENAME_TRANSACTION:
- case AFR_ENTRY_TRANSACTION:
- /*entrylk_count is being used in both non-blocking and blocking
- * modes */
- break;
- }
-
- return 0;
-
+ return 0;
}
static gf_boolean_t
-afr_is_entrylk (afr_internal_lock_t *int_lock,
- afr_transaction_type trans_type)
+_is_lock_wind_needed(afr_local_t *local, int child_index)
{
- gf_boolean_t is_entrylk = _gf_false;
-
- if ((int_lock->transaction_lk_type == AFR_SELFHEAL_LK) &&
- int_lock->selfheal_lk_type == AFR_ENTRY_SELF_HEAL_LK) {
-
- is_entrylk = _gf_true;
-
- } else if ((int_lock->transaction_lk_type == AFR_TRANSACTION_LK) &&
- (trans_type == AFR_ENTRY_TRANSACTION ||
- trans_type == AFR_ENTRY_RENAME_TRANSACTION)) {
-
- is_entrylk = _gf_true;
+ if (!local->child_up[child_index])
+ return _gf_false;
- } else {
- is_entrylk = _gf_false;
- }
-
- return is_entrylk;
+ return _gf_true;
}
static gf_boolean_t
-_is_lock_wind_needed (afr_local_t *local, int child_index)
-{
- if (!local->child_up[child_index])
- return _gf_false;
-
- return _gf_true;
+is_blocking_locks_count_sufficient(call_frame_t *frame, xlator_t *this)
+{
+ afr_local_t *local = NULL;
+ afr_private_t *priv = NULL;
+ afr_internal_lock_t *int_lock = NULL;
+ int child = 0;
+ int nlockee = 0;
+ int lockee_count = 0;
+ gf_boolean_t ret = _gf_true;
+
+ local = frame->local;
+ priv = this->private;
+ int_lock = &local->internal_lock;
+ lockee_count = int_lock->lockee_count;
+
+ if (int_lock->lock_count == 0) {
+ afr_log_locks_failure(frame, "any subvolume", "lock",
+ int_lock->lock_op_errno);
+ return _gf_false;
+ }
+ /* For FOPS that take multiple sets of locks (mkdir, rename),
+ * there must be at least one brick on which the locks from
+ * all lock sets were successful. */
+ for (child = 0; child < priv->child_count; child++) {
+ ret = _gf_true;
+ for (nlockee = 0; nlockee < lockee_count; nlockee++) {
+ if (!(int_lock->lockee[nlockee].locked_nodes[child] & LOCKED_YES))
+ ret = _gf_false;
+ }
+ if (ret)
+ return ret;
+ }
+ if (!ret)
+ afr_log_locks_failure(frame, "all", "lock", int_lock->lock_op_errno);
+
+ return ret;
}
int
-afr_lock_blocking (call_frame_t *frame, xlator_t *this, int cookie)
+afr_lock_blocking(call_frame_t *frame, xlator_t *this, int cookie)
{
- afr_internal_lock_t *int_lock = NULL;
- afr_inodelk_t *inodelk = NULL;
- afr_local_t *local = NULL;
- afr_private_t *priv = NULL;
- struct gf_flock flock = {0,};
- uint64_t ctx = 0;
- int ret = 0;
- int child_index = 0;
- int lockee_no = 0;
- gf_boolean_t is_entrylk = _gf_false;
-
- local = frame->local;
- int_lock = &local->internal_lock;
- priv = this->private;
- child_index = cookie % priv->child_count;
- lockee_no = cookie / priv->child_count;
- is_entrylk = afr_is_entrylk (int_lock, local->transaction.type);
-
-
- if (!is_entrylk) {
- inodelk = afr_get_inodelk (int_lock, int_lock->domain);
- flock.l_start = inodelk->flock.l_start;
- flock.l_len = inodelk->flock.l_len;
- flock.l_type = inodelk->flock.l_type;
- }
+ afr_internal_lock_t *int_lock = NULL;
+ afr_local_t *local = NULL;
+ afr_private_t *priv = NULL;
+ uint64_t ctx = 0;
+ int ret = 0;
+ int child_index = 0;
+ int lockee_num = 0;
- if (local->fd) {
- ret = fd_ctx_get (local->fd, this, &ctx);
+ local = frame->local;
+ int_lock = &local->internal_lock;
+ priv = this->private;
+ child_index = cookie % priv->child_count;
+ lockee_num = cookie / priv->child_count;
- if (ret < 0) {
- gf_msg (this->name, GF_LOG_INFO, 0,
- AFR_MSG_FD_CTX_GET_FAILED,
- "unable to get fd ctx for fd=%p",
- local->fd);
+ if (local->fd) {
+ ret = fd_ctx_get(local->fd, this, &ctx);
- local->op_ret = -1;
- int_lock->lock_op_ret = -1;
+ if (ret < 0) {
+ gf_msg(this->name, GF_LOG_INFO, 0, AFR_MSG_FD_CTX_GET_FAILED,
+ "unable to get fd ctx for fd=%p", local->fd);
- afr_copy_locked_nodes (frame, this);
+ local->op_ret = -1;
+ int_lock->lock_op_ret = -1;
- afr_unlock (frame, this);
+ afr_unlock_now(frame, this);
- return 0;
- }
+ return 0;
}
+ }
- if (int_lock->lk_expected_count == int_lock->lk_attempted_count) {
- if ((is_entrylk && int_lock->entrylk_lock_count == 0) ||
- (!is_entrylk && int_lock->lock_count == 0)) {
- gf_msg (this->name, GF_LOG_INFO, 0,
- AFR_MSG_BLOCKING_LKS_FAILED,
- "unable to lock on even one child");
-
- local->op_ret = -1;
- int_lock->lock_op_ret = -1;
-
- afr_copy_locked_nodes (frame, this);
+ if (int_lock->lk_expected_count == int_lock->lk_attempted_count) {
+ if (!is_blocking_locks_count_sufficient(frame, this)) {
+ local->op_ret = -1;
+ int_lock->lock_op_ret = -1;
- afr_unlock(frame, this);
+ afr_unlock_now(frame, this);
- return 0;
- }
+ return 0;
}
+ }
- if (int_lock->lk_expected_count == int_lock->lk_attempted_count) {
- /* we're done locking */
-
- gf_msg_debug (this->name, 0,
- "we're done locking");
+ if (int_lock->lk_expected_count == int_lock->lk_attempted_count) {
+ /* we're done locking */
- afr_copy_locked_nodes (frame, this);
+ gf_msg_debug(this->name, 0, "we're done locking");
- int_lock->lock_op_ret = 0;
- int_lock->lock_cbk (frame, this);
- return 0;
- }
-
- if (!_is_lock_wind_needed (local, child_index)) {
- afr_lock_blocking (frame, this, cookie + 1);
- return 0;
- }
-
- switch (local->transaction.type) {
- case AFR_DATA_TRANSACTION:
- case AFR_METADATA_TRANSACTION:
-
- if (local->fd) {
- AFR_TRACE_INODELK_IN (frame, this,
- AFR_INODELK_TRANSACTION,
- AFR_LOCK_OP, &flock, F_SETLKW,
- child_index);
-
- STACK_WIND_COOKIE (frame, afr_blocking_inodelk_cbk,
- (void *) (long) child_index,
- priv->children[child_index],
- priv->children[child_index]->fops->finodelk,
- int_lock->domain, local->fd,
- F_SETLKW, &flock, NULL);
-
- } else {
- AFR_TRACE_INODELK_IN (frame, this,
- AFR_INODELK_TRANSACTION,
- AFR_LOCK_OP, &flock, F_SETLKW,
- child_index);
-
- STACK_WIND_COOKIE (frame, afr_blocking_inodelk_cbk,
- (void *) (long) child_index,
- priv->children[child_index],
- priv->children[child_index]->fops->inodelk,
- int_lock->domain, &local->loc,
- F_SETLKW, &flock, NULL);
- }
+ int_lock->lock_op_ret = 0;
+ int_lock->lock_cbk(frame, this);
+ return 0;
+ }
- break;
+ if (!_is_lock_wind_needed(local, child_index)) {
+ afr_lock_blocking(frame, this, cookie + 1);
+ return 0;
+ }
- case AFR_ENTRY_RENAME_TRANSACTION:
- case AFR_ENTRY_TRANSACTION:
- /*Accounting for child_index increments on 'down'
- *and 'fd-less' children */
-
- if (local->fd) {
- AFR_TRACE_ENTRYLK_IN (frame, this, AFR_ENTRYLK_TRANSACTION,
- AFR_LOCK_OP,
- int_lock->lockee[lockee_no].basename,
- cookie);
-
- STACK_WIND_COOKIE (frame, afr_blocking_entrylk_cbk,
- (void *) (long) cookie,
- priv->children[child_index],
- priv->children[child_index]->fops->fentrylk,
- int_lock->domain, local->fd,
- int_lock->lockee[lockee_no].basename,
- ENTRYLK_LOCK, ENTRYLK_WRLCK, NULL);
- } else {
- AFR_TRACE_ENTRYLK_IN (frame, this,
- AFR_ENTRYLK_TRANSACTION,
- AFR_LOCK_OP, local->transaction.basename,
- child_index);
-
- STACK_WIND_COOKIE (frame, afr_blocking_entrylk_cbk,
- (void *) (long) cookie,
- priv->children[child_index],
- priv->children[child_index]->fops->entrylk,
- int_lock->domain,
- &int_lock->lockee[lockee_no].loc,
- int_lock->lockee[lockee_no].basename,
- ENTRYLK_LOCK, ENTRYLK_WRLCK, NULL);
- }
+ afr_internal_lock_wind(frame, afr_lock_cbk, (void *)(long)cookie,
+ child_index, lockee_num, _gf_true, _gf_false);
- break;
- }
-
- return 0;
+ return 0;
}
int32_t
-afr_blocking_lock (call_frame_t *frame, xlator_t *this)
+afr_blocking_lock(call_frame_t *frame, xlator_t *this)
{
- afr_internal_lock_t *int_lock = NULL;
- afr_local_t *local = NULL;
- afr_private_t *priv = NULL;
- int up_count = 0;
-
- priv = this->private;
- local = frame->local;
- int_lock = &local->internal_lock;
+ afr_internal_lock_t *int_lock = NULL;
+ afr_local_t *local = NULL;
+ afr_private_t *priv = NULL;
+ int up_count = 0;
- switch (local->transaction.type) {
- case AFR_DATA_TRANSACTION:
- case AFR_METADATA_TRANSACTION:
- initialize_inodelk_variables (frame, this);
- break;
+ priv = this->private;
+ local = frame->local;
+ int_lock = &local->internal_lock;
- case AFR_ENTRY_RENAME_TRANSACTION:
- case AFR_ENTRY_TRANSACTION:
- up_count = AFR_COUNT (local->child_up, priv->child_count);
- int_lock->lk_call_count = int_lock->lk_expected_count
- = (int_lock->lockee_count *
- up_count);
- initialize_entrylk_variables (frame, this);
- break;
- }
+ up_count = AFR_COUNT(local->child_up, priv->child_count);
+ int_lock->lk_call_count = int_lock->lk_expected_count =
+ (int_lock->lockee_count * up_count);
+ initialize_internal_lock_variables(frame, this);
- afr_lock_blocking (frame, this, 0);
+ afr_lock_blocking(frame, this, 0);
- return 0;
+ return 0;
}
static int32_t
-afr_nonblocking_entrylk_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
- int32_t op_ret, int32_t op_errno, dict_t *xdata)
-{
- afr_internal_lock_t *int_lock = NULL;
- afr_local_t *local = NULL;
- int call_count = 0;
- int child_index = (long) cookie;
- int copies = 0;
- int index = 0;
- int lockee_no = 0;
- afr_private_t *priv = NULL;
-
- priv = this->private;
-
- copies = priv->child_count;
- index = child_index % copies;
- lockee_no = child_index / copies;
-
- local = frame->local;
- int_lock = &local->internal_lock;
-
- AFR_TRACE_ENTRYLK_OUT (frame, this, AFR_ENTRYLK_TRANSACTION,
- AFR_LOCK_OP,
- int_lock->lockee[lockee_no].basename, op_ret,
- op_errno, (long) cookie);
-
- LOCK (&frame->lock);
- {
- if (op_ret < 0 ) {
- if (op_errno == ENOSYS) {
- /* return ENOTSUP */
- gf_msg (this->name, GF_LOG_ERROR,
- ENOSYS, AFR_MSG_LOCK_XLATOR_NOT_LOADED,
- "subvolume does not support "
- "locking. please load features/locks"
- " xlator on server");
- local->op_ret = op_ret;
- int_lock->lock_op_ret = op_ret;
-
- int_lock->lock_op_errno = op_errno;
- local->op_errno = op_errno;
- }
- } else if (op_ret == 0) {
- int_lock->lockee[lockee_no].locked_nodes[index] |= \
- LOCKED_YES;
- int_lock->lockee[lockee_no].locked_count++;
- int_lock->entrylk_lock_count++;
- }
-
- call_count = --int_lock->lk_call_count;
- }
- UNLOCK (&frame->lock);
-
- if (call_count == 0) {
- gf_msg_trace (this->name, 0,
- "Last locking reply received");
- /* all locks successful. Proceed to call FOP */
- if (int_lock->entrylk_lock_count ==
- int_lock->lk_expected_count) {
- gf_msg_trace (this->name, 0,
- "All servers locked. Calling the cbk");
- int_lock->lock_op_ret = 0;
- int_lock->lock_cbk (frame, this);
- }
- /* Not all locks were successful. Unlock and try locking
- again, this time with serially blocking locks */
- else {
- gf_msg_trace (this->name, 0,
- "%d servers locked. Trying again "
- "with blocking calls",
- int_lock->lock_count);
-
- afr_unlock(frame, this);
- }
- }
-
- return 0;
-}
-
-int
-afr_nonblocking_entrylk (call_frame_t *frame, xlator_t *this)
+afr_nb_internal_lock_cbk(call_frame_t *frame, void *cookie, xlator_t *this,
+ int32_t op_ret, int32_t op_errno, dict_t *xdata)
{
- afr_internal_lock_t *int_lock = NULL;
- afr_local_t *local = NULL;
- afr_private_t *priv = NULL;
- afr_fd_ctx_t *fd_ctx = NULL;
- int copies = 0;
- int index = 0;
- int lockee_no = 0;
- int32_t call_count = 0;
- int i = 0;
-
- local = frame->local;
- int_lock = &local->internal_lock;
- priv = this->private;
-
- copies = priv->child_count;
- initialize_entrylk_variables (frame, this);
-
- if (local->fd) {
- fd_ctx = afr_fd_ctx_get (local->fd, this);
- if (!fd_ctx) {
- gf_msg (this->name, GF_LOG_INFO, 0,
- AFR_MSG_FD_CTX_GET_FAILED,
- "unable to get fd ctx for fd=%p",
- local->fd);
-
- local->op_ret = -1;
- int_lock->lock_op_ret = -1;
- local->op_errno = EINVAL;
- int_lock->lock_op_errno = EINVAL;
-
- afr_unlock (frame, this);
- return -1;
- }
+ afr_internal_lock_t *int_lock = NULL;
+ afr_local_t *local = NULL;
+ int call_count = 0;
+ int child_index = 0;
+ int lockee_num = 0;
+ afr_private_t *priv = NULL;
- call_count = int_lock->lockee_count * internal_lock_count (frame, this);
- int_lock->lk_call_count = call_count;
- int_lock->lk_expected_count = call_count;
+ priv = this->private;
- if (!call_count) {
- gf_msg (this->name, GF_LOG_INFO, 0,
- AFR_MSG_INFO_COMMON,
- "fd not open on any subvolumes. aborting.");
- afr_unlock (frame, this);
- goto out;
- }
+ child_index = ((long)cookie) % priv->child_count;
+ lockee_num = ((long)cookie) / priv->child_count;
- /* Send non-blocking entrylk calls only on up children
- and where the fd has been opened */
- for (i = 0; i < int_lock->lockee_count*priv->child_count; i++) {
- index = i%copies;
- lockee_no = i/copies;
- if (local->child_up[index]) {
- AFR_TRACE_ENTRYLK_IN (frame, this, AFR_ENTRYLK_NB_TRANSACTION,
- AFR_LOCK_OP,
- int_lock->lockee[lockee_no].basename,
- i);
-
- STACK_WIND_COOKIE (frame, afr_nonblocking_entrylk_cbk,
- (void *) (long) i,
- priv->children[index],
- priv->children[index]->fops->fentrylk,
- this->name, local->fd,
- int_lock->lockee[lockee_no].basename,
- ENTRYLK_LOCK_NB, ENTRYLK_WRLCK,
- NULL);
- if (!--call_count)
- break;
- }
- }
- } else {
- call_count = int_lock->lockee_count * internal_lock_count (frame, this);
- int_lock->lk_call_count = call_count;
- int_lock->lk_expected_count = call_count;
-
- for (i = 0; i < int_lock->lockee_count*priv->child_count; i++) {
- index = i%copies;
- lockee_no = i/copies;
- if (local->child_up[index]) {
- AFR_TRACE_ENTRYLK_IN (frame, this, AFR_ENTRYLK_NB_TRANSACTION,
- AFR_LOCK_OP,
- int_lock->lockee[lockee_no].basename,
- i);
-
- STACK_WIND_COOKIE (frame, afr_nonblocking_entrylk_cbk,
- (void *) (long) i,
- priv->children[index],
- priv->children[index]->fops->entrylk,
- this->name, &int_lock->lockee[lockee_no].loc,
- int_lock->lockee[lockee_no].basename,
- ENTRYLK_LOCK_NB, ENTRYLK_WRLCK,
- NULL);
-
- if (!--call_count)
- break;
- }
- }
- }
-out:
- return 0;
-}
+ local = frame->local;
+ int_lock = &local->internal_lock;
-int32_t
-afr_nonblocking_inodelk_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
- int32_t op_ret, int32_t op_errno, dict_t *xdata)
-{
- afr_internal_lock_t *int_lock = NULL;
- afr_inodelk_t *inodelk = NULL;
- afr_local_t *local = NULL;
- int call_count = 0;
- int child_index = (long) cookie;
- afr_fd_ctx_t *fd_ctx = NULL;
-
-
- local = frame->local;
- int_lock = &local->internal_lock;
- inodelk = afr_get_inodelk (int_lock, int_lock->domain);
-
- AFR_TRACE_INODELK_OUT (frame, this, AFR_INODELK_NB_TRANSACTION,
- AFR_LOCK_OP, NULL, op_ret,
- op_errno, (long) cookie);
-
- if (local->fd)
- fd_ctx = afr_fd_ctx_get (local->fd, this);
-
- LOCK (&frame->lock);
+ if (op_ret == 0 && local->transaction.type == AFR_DATA_TRANSACTION) {
+ LOCK(&local->inode->lock);
{
- if (op_ret < 0) {
- if (op_errno == ENOSYS) {
- /* return ENOTSUP */
- gf_msg (this->name, GF_LOG_ERROR, ENOSYS,
- AFR_MSG_LOCK_XLATOR_NOT_LOADED,
- "subvolume does not support "
- "locking. please load features/locks"
- " xlator on server");
- local->op_ret = op_ret;
- int_lock->lock_op_ret = op_ret;
- int_lock->lock_op_errno = op_errno;
- local->op_errno = op_errno;
- }
- if (local->transaction.eager_lock)
- local->transaction.eager_lock[child_index] = 0;
- } else {
- inodelk->locked_nodes[child_index] |= LOCKED_YES;
- inodelk->lock_count++;
-
- if (local->transaction.eager_lock &&
- local->transaction.eager_lock[child_index] &&
- local->fd) {
- /* piggybacked */
- if (op_ret == 1) {
- /* piggybacked */
- } else if (op_ret == 0) {
- /* lock acquired from server */
- fd_ctx->lock_acquired[child_index]++;
- }
- }
- }
-
- call_count = --int_lock->lk_call_count;
- }
- UNLOCK (&frame->lock);
-
- if (call_count == 0) {
- gf_msg_trace (this->name, 0,
- "Last inode locking reply received");
- /* all locks successful. Proceed to call FOP */
- if (inodelk->lock_count == int_lock->lk_expected_count) {
- gf_msg_trace (this->name, 0,
- "All servers locked. Calling the cbk");
- int_lock->lock_op_ret = 0;
- int_lock->lock_cbk (frame, this);
- }
- /* Not all locks were successful. Unlock and try locking
- again, this time with serially blocking locks */
- else {
- gf_msg_trace (this->name, 0,
- "%d servers locked. "
- "Trying again with blocking calls",
- int_lock->lock_count);
-
- afr_unlock(frame, this);
- }
+ local->inode_ctx->lock_count++;
}
+ UNLOCK(&local->inode->lock);
+ }
- return 0;
+ LOCK(&frame->lock);
+ {
+ if (op_ret < 0) {
+ if (op_errno == ENOSYS) {
+ /* return ENOTSUP */
+ gf_msg(this->name, GF_LOG_ERROR, ENOSYS,
+ AFR_MSG_LOCK_XLATOR_NOT_LOADED,
+ "subvolume does not support "
+ "locking. please load features/locks"
+ " xlator on server");
+ local->op_ret = op_ret;
+ int_lock->lock_op_ret = op_ret;
+
+ int_lock->lock_op_errno = op_errno;
+ local->op_errno = op_errno;
+ }
+ } else if (op_ret == 0) {
+ int_lock->lockee[lockee_num]
+ .locked_nodes[child_index] |= LOCKED_YES;
+ int_lock->lockee[lockee_num].locked_count++;
+ int_lock->lock_count++;
+ }
+
+ call_count = --int_lock->lk_call_count;
+ }
+ UNLOCK(&frame->lock);
+
+ if (call_count == 0) {
+ gf_msg_trace(this->name, 0, "Last locking reply received");
+ /* all locks successful. Proceed to call FOP */
+ if (int_lock->lock_count == int_lock->lk_expected_count) {
+ gf_msg_trace(this->name, 0, "All servers locked. Calling the cbk");
+ int_lock->lock_op_ret = 0;
+ int_lock->lock_cbk(frame, this);
+ }
+ /* Not all locks were successful. Unlock and try locking
+ again, this time with serially blocking locks */
+ else {
+ gf_msg_trace(this->name, 0,
+ "%d servers locked. Trying again "
+ "with blocking calls",
+ int_lock->lock_count);
+
+ afr_unlock_now(frame, this);
+ }
+ }
+
+ return 0;
}
int
-afr_nonblocking_inodelk (call_frame_t *frame, xlator_t *this)
-{
- afr_internal_lock_t *int_lock = NULL;
- afr_inodelk_t *inodelk = NULL;
- afr_local_t *local = NULL;
- afr_private_t *priv = NULL;
- afr_fd_ctx_t *fd_ctx = NULL;
- int32_t call_count = 0;
- int i = 0;
- int ret = 0;
- struct gf_flock flock = {0,};
- struct gf_flock full_flock = {0,};
- struct gf_flock *flock_use = NULL;
- int piggyback = 0;
-
- local = frame->local;
- int_lock = &local->internal_lock;
- priv = this->private;
-
- inodelk = afr_get_inodelk (int_lock, int_lock->domain);
-
- flock.l_start = inodelk->flock.l_start;
- flock.l_len = inodelk->flock.l_len;
- flock.l_type = inodelk->flock.l_type;
-
- full_flock.l_type = inodelk->flock.l_type;
-
- initialize_inodelk_variables (frame, this);
-
- if (local->fd) {
- fd_ctx = afr_fd_ctx_get (local->fd, this);
- if (!fd_ctx) {
- gf_msg (this->name, GF_LOG_INFO, 0,
- AFR_MSG_FD_CTX_GET_FAILED,
- "unable to get fd ctx for fd=%p",
- local->fd);
-
- local->op_ret = -1;
- int_lock->lock_op_ret = -1;
- local->op_errno = EINVAL;
- int_lock->lock_op_errno = EINVAL;
-
- afr_unlock (frame, this);
- ret = -1;
- goto out;
- }
-
- call_count = internal_lock_count (frame, this);
- int_lock->lk_call_count = call_count;
- int_lock->lk_expected_count = call_count;
-
- if (!call_count) {
- gf_msg (this->name, GF_LOG_INFO, 0,
- AFR_MSG_ALL_SUBVOLS_DOWN,
- "All bricks are down, aborting.");
- afr_unlock (frame, this);
- goto out;
- }
-
- /* Send non-blocking inodelk calls only on up children
- and where the fd has been opened */
- for (i = 0; i < priv->child_count; i++) {
- if (!local->child_up[i])
- continue;
-
- flock_use = &flock;
- if (!local->transaction.eager_lock_on) {
- goto wind;
- }
-
- piggyback = 0;
- local->transaction.eager_lock[i] = 1;
-
- afr_set_delayed_post_op (frame, this);
-
- LOCK (&local->fd->lock);
- {
- if (fd_ctx->lock_acquired[i]) {
- fd_ctx->lock_piggyback[i]++;
- piggyback = 1;
- }
- }
- UNLOCK (&local->fd->lock);
-
- if (piggyback) {
- /* (op_ret == 1) => indicate piggybacked lock */
- afr_nonblocking_inodelk_cbk (frame, (void *) (long) i,
- this, 1, 0, NULL);
- if (!--call_count)
- break;
- continue;
- }
- flock_use = &full_flock;
- wind:
- AFR_TRACE_INODELK_IN (frame, this,
- AFR_INODELK_NB_TRANSACTION,
- AFR_LOCK_OP, flock_use, F_SETLK, i);
-
- STACK_WIND_COOKIE (frame, afr_nonblocking_inodelk_cbk,
- (void *) (long) i,
- priv->children[i],
- priv->children[i]->fops->finodelk,
- int_lock->domain, local->fd,
- F_SETLK, flock_use, NULL);
-
- if (!--call_count)
- break;
- }
- } else {
- call_count = internal_lock_count (frame, this);
- int_lock->lk_call_count = call_count;
- int_lock->lk_expected_count = call_count;
-
- for (i = 0; i < priv->child_count; i++) {
- if (!local->child_up[i])
- continue;
- AFR_TRACE_INODELK_IN (frame, this,
- AFR_INODELK_NB_TRANSACTION,
- AFR_LOCK_OP, &flock, F_SETLK, i);
-
- STACK_WIND_COOKIE (frame, afr_nonblocking_inodelk_cbk,
- (void *) (long) i,
- priv->children[i],
- priv->children[i]->fops->inodelk,
- int_lock->domain, &local->loc,
- F_SETLK, &flock, NULL);
-
- if (!--call_count)
- break;
- }
+afr_lock_nonblocking(call_frame_t *frame, xlator_t *this)
+{
+ afr_internal_lock_t *int_lock = NULL;
+ afr_local_t *local = NULL;
+ afr_private_t *priv = NULL;
+ afr_fd_ctx_t *fd_ctx = NULL;
+ int child = 0;
+ int lockee_num = 0;
+ int32_t call_count = 0;
+ int i = 0;
+ int ret = 0;
+
+ local = frame->local;
+ int_lock = &local->internal_lock;
+ priv = this->private;
+
+ initialize_internal_lock_variables(frame, this);
+
+ if (local->fd) {
+ fd_ctx = afr_fd_ctx_get(local->fd, this);
+ if (!fd_ctx) {
+ gf_msg(this->name, GF_LOG_INFO, 0, AFR_MSG_FD_CTX_GET_FAILED,
+ "unable to get fd ctx for fd=%p", local->fd);
+
+ local->op_ret = -1;
+ int_lock->lock_op_ret = -1;
+ local->op_errno = EINVAL;
+ int_lock->lock_op_errno = EINVAL;
+
+ afr_unlock_now(frame, this);
+ ret = -1;
+ goto out;
+ }
+ }
+
+ call_count = int_lock->lockee_count * internal_lock_count(frame, this);
+ int_lock->lk_call_count = call_count;
+ int_lock->lk_expected_count = call_count;
+
+ if (!call_count) {
+ gf_msg(this->name, GF_LOG_INFO, 0, AFR_MSG_INFO_COMMON,
+ "fd not open on any subvolumes. aborting.");
+ afr_unlock_now(frame, this);
+ goto out;
+ }
+
+ /* Send non-blocking lock calls only on up children
+ and where the fd has been opened */
+ for (i = 0; i < int_lock->lockee_count * priv->child_count; i++) {
+ child = i % priv->child_count;
+ lockee_num = i / priv->child_count;
+ if (local->child_up[child]) {
+ afr_internal_lock_wind(frame, afr_nb_internal_lock_cbk,
+ (void *)(long)i, child, lockee_num,
+ _gf_false, _gf_false);
+ if (!--call_count)
+ break;
}
+ }
out:
- return ret;
+ return ret;
}
int32_t
-afr_unlock (call_frame_t *frame, xlator_t *this)
-{
- afr_local_t *local = NULL;
-
- local = frame->local;
-
- if (transaction_lk_op (local)) {
- if (is_afr_lock_transaction (local))
- afr_unlock_inodelk (frame, this);
- else
- afr_unlock_entrylk (frame, this);
-
- } else {
- if (is_afr_lock_selfheal (local))
- afr_unlock_inodelk (frame, this);
- else
- afr_unlock_entrylk (frame, this);
- }
-
+afr_unlock(call_frame_t *frame, xlator_t *this)
+{
+ afr_local_t *local = NULL;
+ afr_lock_t *lock = NULL;
+
+ local = frame->local;
+
+ if (!local->transaction.eager_lock_on)
+ goto out;
+ lock = &local->inode_ctx->lock[local->transaction.type];
+ LOCK(&local->inode->lock);
+ {
+ list_del_init(&local->transaction.owner_list);
+ if (list_empty(&lock->owners) && list_empty(&lock->post_op)) {
+ local->transaction.do_eager_unlock = _gf_true;
+ /*TODO: Need to get metadata use on_disk and inherit/uninherit
+ *GF_ASSERT (!local->inode_ctx->on_disk[local->transaction.type]);
+ *GF_ASSERT (!local->inode_ctx->inherited[local->transaction.type]);
+ */
+ GF_ASSERT(lock->release);
+ }
+ }
+ UNLOCK(&local->inode->lock);
+ if (!local->transaction.do_eager_unlock) {
+ local->internal_lock.lock_cbk(frame, this);
return 0;
-}
-
-int
-afr_lk_transfer_datalock (call_frame_t *dst, call_frame_t *src, char *dom,
- unsigned int child_count)
-{
- afr_local_t *dst_local = NULL;
- afr_local_t *src_local = NULL;
- afr_internal_lock_t *dst_lock = NULL;
- afr_internal_lock_t *src_lock = NULL;
- afr_inodelk_t *dst_inodelk = NULL;
- afr_inodelk_t *src_inodelk = NULL;
- int ret = -1;
-
- src_local = src->local;
- src_lock = &src_local->internal_lock;
- src_inodelk = afr_get_inodelk (src_lock, dom);
- dst_local = dst->local;
- dst_lock = &dst_local->internal_lock;
- dst_inodelk = afr_get_inodelk (dst_lock, dom);
- if (!dst_inodelk || !src_inodelk)
- goto out;
- if (src_inodelk->locked_nodes) {
- memcpy (dst_inodelk->locked_nodes, src_inodelk->locked_nodes,
- sizeof (*dst_inodelk->locked_nodes) * child_count);
- memset (src_inodelk->locked_nodes, 0,
- sizeof (*src_inodelk->locked_nodes) * child_count);
- }
+ }
- dst_lock->transaction_lk_type = src_lock->transaction_lk_type;
- dst_lock->selfheal_lk_type = src_lock->selfheal_lk_type;
- dst_inodelk->lock_count = src_inodelk->lock_count;
- src_inodelk->lock_count = 0;
- ret = 0;
out:
- return ret;
+ afr_unlock_now(frame, this);
+ return 0;
}
diff --git a/xlators/cluster/afr/src/afr-mem-types.h b/xlators/cluster/afr/src/afr-mem-types.h
index 6f1eee95322..816065fb57a 100644
--- a/xlators/cluster/afr/src/afr-mem-types.h
+++ b/xlators/cluster/afr/src/afr-mem-types.h
@@ -8,45 +8,31 @@
cases as published by the Free Software Foundation.
*/
-
#ifndef __AFR_MEM_TYPES_H__
#define __AFR_MEM_TYPES_H__
-#include "mem-types.h"
+#include <glusterfs/mem-types.h>
enum gf_afr_mem_types_ {
- gf_afr_mt_iovec = gf_common_mt_end + 1,
- gf_afr_mt_afr_fd_ctx_t,
- gf_afr_mt_afr_private_t,
- gf_afr_mt_int32_t,
- gf_afr_mt_char,
- gf_afr_mt_xattr_key,
- gf_afr_mt_dict_t,
- gf_afr_mt_xlator_t,
- gf_afr_mt_iatt,
- gf_afr_mt_int,
- gf_afr_mt_afr_node_character,
- gf_afr_mt_sh_diff_loop_state,
- gf_afr_mt_uint8_t,
- gf_afr_mt_loc_t,
- gf_afr_mt_entry_name,
- gf_afr_mt_pump_priv,
- gf_afr_mt_locked_fd,
- gf_afr_mt_inode_ctx_t,
- gf_afr_fd_paused_call_t,
- gf_afr_mt_crawl_data_t,
- gf_afr_mt_brick_pos_t,
- gf_afr_mt_shd_bool_t,
- gf_afr_mt_shd_timer_t,
- gf_afr_mt_shd_event_t,
- gf_afr_mt_time_t,
- gf_afr_mt_pos_data_t,
- gf_afr_mt_reply_t,
- gf_afr_mt_subvol_healer_t,
- gf_afr_mt_spbc_timeout_t,
- gf_afr_mt_spb_status_t,
- gf_afr_mt_replace_brick_t,
- gf_afr_mt_end
+ gf_afr_mt_afr_fd_ctx_t = gf_common_mt_end + 1,
+ gf_afr_mt_afr_private_t,
+ gf_afr_mt_int32_t,
+ gf_afr_mt_char,
+ gf_afr_mt_xattr_key,
+ gf_afr_mt_dict_t,
+ gf_afr_mt_xlator_t,
+ gf_afr_mt_afr_node_character,
+ gf_afr_mt_inode_ctx_t,
+ gf_afr_mt_shd_event_t,
+ gf_afr_mt_reply_t,
+ gf_afr_mt_subvol_healer_t,
+ gf_afr_mt_spbc_timeout_t,
+ gf_afr_mt_spb_status_t,
+ gf_afr_mt_empty_brick_t,
+ gf_afr_mt_child_latency_t,
+ gf_afr_mt_atomic_t,
+ gf_afr_mt_lk_heal_info_t,
+ gf_afr_mt_gf_lock,
+ gf_afr_mt_end
};
#endif
-
diff --git a/xlators/cluster/afr/src/afr-messages.h b/xlators/cluster/afr/src/afr-messages.h
index fca94eeff6b..e73fd997765 100644
--- a/xlators/cluster/afr/src/afr-messages.h
+++ b/xlators/cluster/afr/src/afr-messages.h
@@ -11,345 +11,157 @@
#ifndef _AFR_MESSAGES_H_
#define _AFR_MESSAGES_H_
-#include "glfs-message-id.h"
-
-/*! \file afr-messages.h
- * \brief AFR log-message IDs and their descriptions.
- */
-
-/* NOTE: Rules for message additions
- * 1) Each instance of a message is _better_ left with a unique message ID, even
- * if the message format is the same. Reasoning is that, if the message
- * format needs to change in one instance, the other instances are not
- * impacted or the new change does not change the ID of the instance being
- * modified.
- * 2) Addition of a message,
- * - Should increment the GLFS_NUM_MESSAGES
- * - Append to the list of messages defined, towards the end
- * - Retain macro naming as glfs_msg_X (for redability across developers)
- * NOTE: Rules for message format modifications
- * 3) Check acorss the code if the message ID macro in question is reused
- * anywhere. If reused then then the modifications should ensure correctness
- * everywhere, or needs a new message ID as (1) above was not adhered to. If
- * not used anywhere, proceed with the required modification.
- * NOTE: Rules for message deletion
- * 4) Check (3) and if used anywhere else, then cannot be deleted. If not used
- * anywhere, then can be deleted, but will leave a hole by design, as
- * addition rules specify modification to the end of the list and not filling
- * holes.
- */
-
-#define GLFS_COMP_BASE_AFR GLFS_MSGID_COMP_AFR
-#define GLFS_NUM_MESSAGES 40
-#define GLFS_MSGID_END (GLFS_COMP_BASE_AFR + GLFS_NUM_MESSAGES + 1)
-
-#define glfs_msg_start_x GLFS_COMP_BASE_AFR, "Invalid: Start of messages"
-
-/*!
- * @messageid 108001
- * @diagnosis Client quorum is not met due to which file modification
- * operations are disallowed.
- * @recommendedaction Some brick processes are down/ not visible from the
- * client. Ensure that the bricks are up/ network traffic is not blocked.
- */
-#define AFR_MSG_QUORUM_FAIL (GLFS_COMP_BASE_AFR + 1)
-
-
-/*!
- * @messageid 108002
- * @diagnosis The bricks that were down are now up and quorum is restored.
- * @recommendedaction Possibly check why the bricks went down to begin with.
- */
-#define AFR_MSG_QUORUM_MET (GLFS_COMP_BASE_AFR + 2)
-
-
-/*!
- * @messageid 108003
- * @diagnosis Client quorum-type was set to auto due to which the quorum-count
- * option is no longer valid.
- * @recommendedaction None.
- */
-#define AFR_MSG_QUORUM_OVERRIDE (GLFS_COMP_BASE_AFR + 3)
-
-
-/*!
- * @messageid 108004
- * @diagnosis Replication sub volume witnessed a connection notification
- * from a brick which does not belong to its replica set.
- * @recommendedaction None. This is a safety check in code.
+#include <glusterfs/glfs-message-id.h>
+
+/* To add new message IDs, append new identifiers at the end of the list.
+ *
+ * Never remove a message ID. If it's not used anymore, you can rename it or
+ * leave it as it is, but not delete it. This is to prevent reutilization of
+ * IDs by other messages.
+ *
+ * The component name must match one of the entries defined in
+ * glfs-message-id.h.
*/
-#define AFR_MSG_INVALID_CHILD_UP (GLFS_COMP_BASE_AFR + 4)
-
-
-/*!
- * @messageid 108005
- * @diagnosis A replica set that was inaccessible because all its bricks were
- * down is now accessible because at least one of its bricks came back up.
- * @recommendedaction Possibly check why all the bricks of that replica set
- * went down to begin with.
- */
-#define AFR_MSG_SUBVOL_UP (GLFS_COMP_BASE_AFR + 5)
-
-
-/*!
- * @messageid 108006
- * @diagnosis All bricks of a replica set are down. Data residing in that
- * replica cannot be accessed until one of the bricks come back up.
- * @recommendedaction Ensure that the bricks are up.
- */
-#define AFR_MSG_ALL_SUBVOLS_DOWN (GLFS_COMP_BASE_AFR + 6)
-
-
-/*!
- * @messageid 108007
- * @diagnosis Entry unlocks failed on a brick.
- * @recommendedaction Error number in the log should give the reason why it
- * failed. Also observe brick logs for more information.
-*/
-#define AFR_MSG_ENTRY_UNLOCK_FAIL (GLFS_COMP_BASE_AFR + 7)
-
-
-/*!
- * @messageid 108008
- * @diagnosis There is an inconsistency in the file's data/metadata/gfid
- * amongst the bricks of a replica set.
- * @recommendedaction Resolve the split brain by clearing the AFR changelog
- * attributes from the appropriate brick and trigger self-heal.
- */
-#define AFR_MSG_SPLIT_BRAIN (GLFS_COMP_BASE_AFR + 8)
-
-
-/*!
- * @messageid 108009
- * @diagnosis open/opendir failed on a brick.
- * @recommendedaction Error number in the log should give the reason why it
- * failed. Also observe brick logs for more information.
- */
-#define AFR_MSG_OPEN_FAIL (GLFS_COMP_BASE_AFR + 9)
-
-
-/*!
- * @messageid 108010
- * @diagnosis Inode unlocks failed on a brick.
- * @recommendedaction Error number in the log should give the reason why it
- * failed. Also observe brick logs for more information.
-*/
-#define AFR_MSG_INODE_UNLOCK_FAIL (GLFS_COMP_BASE_AFR + 10)
-
-/*!
- * @messageid 108011
- * @diagnosis Setting of pending xattrs succeeded/failed during replace-brick
- * operation.
- * @recommendedaction In case of failure, error number in the log should give
- * the reason why it failed. Also observe brick logs for more information.
-*/
-#define AFR_MSG_REPLACE_BRICK_STATUS (GLFS_COMP_BASE_AFR + 11)
-
-/*!
- * @messageid 108012
- * @diagnosis
- * @recommendedaction
-*/
-#define AFR_MSG_GFID_NULL (GLFS_COMP_BASE_AFR + 12)
-
-/*!
- * @messageid 108013
- * @diagnosis
- * @recommendedaction
-*/
-#define AFR_MSG_FD_CREATE_FAILED (GLFS_COMP_BASE_AFR + 13)
-
-/*!
- * @messageid 108014
- * @diagnosis
- * @recommendedaction
-*/
-#define AFR_MSG_DICT_SET_FAILED (GLFS_COMP_BASE_AFR + 14)
-
-/*!
- * @messageid 108015
- * @diagnosis
- * @recommendedaction
-*/
-#define AFR_MSG_EXPUNGING_FILE_OR_DIR (GLFS_COMP_BASE_AFR + 15)
-
-/*!
- * @messageid 108016
- * @diagnosis
- * @recommendedaction
-*/
-#define AFR_MSG_MIGRATION_IN_PROGRESS (GLFS_COMP_BASE_AFR + 16)
-
-/*!
- * @messageid 108017
- * @diagnosis
- * @recommendedaction
-*/
-#define AFR_MSG_CHILD_MISCONFIGURED (GLFS_COMP_BASE_AFR + 17)
-
-/*!
- * @messageid 108018
- * @diagnosis
- * @recommendedaction
-*/
-#define AFR_MSG_VOL_MISCONFIGURED (GLFS_COMP_BASE_AFR + 18)
-
-/*!
- * @messageid 108019
- * @diagnosis
- * @recommendedaction
-*/
-#define AFR_MSG_BLOCKING_LKS_FAILED (GLFS_COMP_BASE_AFR + 19)
-
-/*!
- * @messageid 108020
- * @diagnosis
- * @recommendedaction
-*/
-#define AFR_MSG_INVALID_FD (GLFS_COMP_BASE_AFR + 20)
-
-/*!
- * @messageid 108021
- * @diagnosis
- * @recommendedaction
-*/
-#define AFR_MSG_LOCK_INFO (GLFS_COMP_BASE_AFR + 21)
-
-/*!
- * @messageid 108022
- * @diagnosis
- * @recommendedaction
-*/
-#define AFR_MSG_LOCK_XLATOR_NOT_LOADED (GLFS_COMP_BASE_AFR + 22)
-
-/*!
- * @messageid 108023
- * @diagnosis
- * @recommendedaction
-*/
-#define AFR_MSG_FD_CTX_GET_FAILED (GLFS_COMP_BASE_AFR + 23)
-
-/*!
- * @messageid 108024
- * @diagnosis
- * @recommendedaction
-*/
-#define AFR_MSG_INVALID_SUBVOL (GLFS_COMP_BASE_AFR + 24)
-
-/*!
- * @messageid 108025
- * @diagnosis
- * @recommendedaction
-*/
-#define AFR_MSG_PUMP_XLATOR_ERROR (GLFS_COMP_BASE_AFR + 25)
-
-/*!
- * @messageid 108026
- * @diagnosis
- * @recommendedaction
-*/
-#define AFR_MSG_SELF_HEAL_INFO (GLFS_COMP_BASE_AFR + 26)
-
-/*!
- * @messageid 108027
- * @diagnosis
- * @recommendedaction
-*/
-#define AFR_MSG_READ_SUBVOL_ERROR (GLFS_COMP_BASE_AFR + 27)
-
-/*!
- * @messageid 108028
- * @diagnosis
- * @recommendedaction
-*/
-#define AFR_MSG_DICT_GET_FAILED (GLFS_COMP_BASE_AFR + 28)
-
-
-/*!
- * @messageid 108029
- * @diagnosis
- * @recommendedaction
-*/
-#define AFR_MSG_INFO_COMMON (GLFS_COMP_BASE_AFR + 29)
-
-/*!
- * @messageid 108030
- * @diagnosis
- * @recommendedaction
-*/
-#define AFR_MSG_SPLIT_BRAIN_CHOICE_ERROR (GLFS_COMP_BASE_AFR + 30)
-
-/*!
- * @messageid 108031
- * @diagnosis
- * @recommendedaction
-*/
-#define AFR_MSG_LOCAL_CHILD (GLFS_COMP_BASE_AFR + 31)
-
-/*!
- * @messageid 108032
- * @diagnosis
- * @recommendedaction
-*/
-#define AFR_MSG_INVALID_DATA (GLFS_COMP_BASE_AFR + 32)
-
-/*!
- * @messageid 108033
- * @diagnosis
- * @recommendedaction
-*/
-#define AFR_MSG_INVALID_ARG (GLFS_COMP_BASE_AFR + 33)
-
-/*!
- * @messageid 108034
- * @diagnosis
- * @recommendedaction
-*/
-#define AFR_MSG_INDEX_DIR_GET_FAILED (GLFS_COMP_BASE_AFR + 34)
-
-/*!
- * @messageid 108035
- * @diagnosis
- * @recommendedaction
-*/
-#define AFR_MSG_FSYNC_FAILED (GLFS_COMP_BASE_AFR + 35)
-
-/*!
- * @messageid 108036
- * @diagnosis
- * @recommendedaction
-*/
-#define AFR_MSG_FAVORITE_CHILD (GLFS_COMP_BASE_AFR + 36)
-/*!
- * @messageid 108037
- * @diagnosis
- * @recommendedaction
-*/
-#define AFR_MSG_SELF_HEAL_FAILED (GLFS_COMP_BASE_AFR + 37)
-
-/*!
- * @messageid 108038
- * @diagnosis
- * @recommendedaction
-*/
-#define AFR_MSG_SPLIT_BRAIN_STATUS (GLFS_COMP_BASE_AFR + 38)
-
-/*!
- * @messageid 108039
- * @diagnosis
- * @recommendedaction
-*/
-#define AFR_MSG_REPLACE_BRICK_FAILED (GLFS_COMP_BASE_AFR + 39)
-
-
-/*!
- * @messageid 108039
- * @diagnosis AFR was unable to be loaded because the pending-changelog xattrs
- * were not found in the volfile.
- * @recommendedaction Please ensure cluster op-version is atleast 30707 and the
- * volfiles are regenerated.
-*/
-#define AFR_MSG_NO_CHANGELOG (GLFS_COMP_BASE_AFR + 40)
-#define glfs_msg_end_x GLFS_MSGID_END, "Invalid: End of messages"
+GLFS_MSGID(
+ AFR, AFR_MSG_QUORUM_FAIL, AFR_MSG_QUORUM_MET, AFR_MSG_QUORUM_OVERRIDE,
+ AFR_MSG_INVALID_CHILD_UP, AFR_MSG_SUBVOL_UP, AFR_MSG_SUBVOLS_DOWN,
+ AFR_MSG_ENTRY_UNLOCK_FAIL, AFR_MSG_SPLIT_BRAIN, AFR_MSG_OPEN_FAIL,
+ AFR_MSG_UNLOCK_FAIL, AFR_MSG_REPLACE_BRICK_STATUS, AFR_MSG_GFID_NULL,
+ AFR_MSG_FD_CREATE_FAILED, AFR_MSG_DICT_SET_FAILED,
+ AFR_MSG_EXPUNGING_FILE_OR_DIR, AFR_MSG_MIGRATION_IN_PROGRESS,
+ AFR_MSG_CHILD_MISCONFIGURED, AFR_MSG_VOL_MISCONFIGURED,
+ AFR_MSG_INTERNAL_LKS_FAILED, AFR_MSG_INVALID_FD, AFR_MSG_LOCK_INFO,
+ AFR_MSG_LOCK_XLATOR_NOT_LOADED, AFR_MSG_FD_CTX_GET_FAILED,
+ AFR_MSG_INVALID_SUBVOL, AFR_MSG_PUMP_XLATOR_ERROR, AFR_MSG_SELF_HEAL_INFO,
+ AFR_MSG_READ_SUBVOL_ERROR, AFR_MSG_DICT_GET_FAILED, AFR_MSG_INFO_COMMON,
+ AFR_MSG_SPLIT_BRAIN_CHOICE_ERROR, AFR_MSG_LOCAL_CHILD, AFR_MSG_INVALID_DATA,
+ AFR_MSG_INVALID_ARG, AFR_MSG_INDEX_DIR_GET_FAILED, AFR_MSG_FSYNC_FAILED,
+ AFR_MSG_FAVORITE_CHILD, AFR_MSG_SELF_HEAL_FAILED,
+ AFR_MSG_SPLIT_BRAIN_STATUS, AFR_MSG_ADD_BRICK_STATUS, AFR_MSG_NO_CHANGELOG,
+ AFR_MSG_TIMER_CREATE_FAIL, AFR_MSG_SBRAIN_FAV_CHILD_POLICY,
+ AFR_MSG_INODE_CTX_GET_FAILED, AFR_MSG_THIN_ARB,
+ AFR_MSG_THIN_ARB_XATTROP_FAILED, AFR_MSG_THIN_ARB_LOC_POP_FAILED,
+ AFR_MSG_GET_PEND_VAL, AFR_MSG_THIN_ARB_SKIP_SHD, AFR_MSG_UNKNOWN_SET,
+ AFR_MSG_NO_XL_ID, AFR_MSG_SELF_HEAL_INFO_START,
+ AFR_MSG_SELF_HEAL_INFO_FINISH, AFR_MSG_INCRE_COUNT,
+ AFR_MSG_ADD_TO_OUTPUT_FAILED, AFR_MSG_SET_TIME_FAILED,
+ AFR_MSG_GFID_MISMATCH_DETECTED, AFR_MSG_GFID_HEAL_MSG,
+ AFR_MSG_THIN_ARB_LOOKUP_FAILED, AFR_MSG_DICT_CREATE_FAILED,
+ AFR_MSG_NO_MAJORITY_TO_RESOLVE, AFR_MSG_TYPE_MISMATCH,
+ AFR_MSG_SIZE_POLICY_NOT_APPLICABLE, AFR_MSG_NO_CHILD_SELECTED,
+ AFR_MSG_INVALID_CHILD, AFR_MSG_RESOLVE_CONFLICTING_DATA,
+ SERROR_GETTING_SRC_BRICK, SNO_DIFF_IN_MTIME, SNO_BIGGER_FILE,
+ SALL_BRICKS_UP_TO_RESOLVE, AFR_MSG_UNLOCK_FAILED, AFR_MSG_POST_OP_FAILED,
+ AFR_MSG_TA_FRAME_CREATE_FAILED, AFR_MSG_SET_KEY_XATTROP_FAILED,
+ AFR_MSG_BLOCKING_ENTRYLKS_FAILED, AFR_MSG_FOP_FAILED,
+ AFR_MSG_CLEAN_UP_FAILED, AFR_MSG_UNABLE_TO_FETCH, AFR_MSG_XATTR_SET_FAILED,
+ AFR_MSG_SPLIT_BRAIN_REPLICA, AFR_MSG_INODE_CTX_FAILED,
+ AFR_MSG_LOOKUP_FAILED, AFR_MSG_ALL_SUBVOLS_DOWN,
+ AFR_MSG_RELEASE_LOCK_FAILED, AFR_MSG_CLEAR_TIME_SPLIT_BRAIN,
+ AFR_MSG_READ_FAILED, AFR_MSG_LAUNCH_FAILED, AFR_MSG_READ_SUBVOL_NOT_UP,
+ AFR_MSG_LK_HEAL_DOM, AFR_MSG_NEW_BRICK, AFR_MSG_SPLIT_BRAIN_SET_FAILED,
+ AFR_MSG_SPLIT_BRAIN_DETERMINE_FAILED, AFR_MSG_HEALER_SPAWN_FAILED,
+ AFR_MSG_ADD_CRAWL_EVENT_FAILED, AFR_MSG_NULL_DEREF, AFR_MSG_SET_PEND_XATTR,
+ AFR_MSG_INTERNAL_ATTR);
+
+#define AFR_MSG_DICT_GET_FAILED_STR "Dict get failed"
+#define AFR_MSG_DICT_SET_FAILED_STR "Dict set failed"
+#define AFR_MSG_HEALER_SPAWN_FAILED_STR "Healer spawn failed"
+#define AFR_MSG_ADD_CRAWL_EVENT_FAILED_STR "Adding crawl event failed"
+#define AFR_MSG_INVALID_ARG_STR "Invalid argument"
+#define AFR_MSG_INDEX_DIR_GET_FAILED_STR "unable to get index-dir on "
+#define AFR_MSG_THIN_ARB_LOOKUP_FAILED_STR "Failed lookup on file"
+#define AFR_MSG_DICT_CREATE_FAILED_STR "Failed to create dict."
+#define AFR_MSG_THIN_ARB_XATTROP_FAILED_STR "Xattrop failed."
+#define AFR_MSG_THIN_ARB_LOC_POP_FAILED_STR \
+ "Failed to populate loc for thin-arbiter"
+#define AFR_MSG_GET_PEND_VAL_STR "Error getting value of pending"
+#define AFR_MSG_THIN_ARB_SKIP_SHD_STR "I am not the god shd. skipping."
+#define AFR_MSG_UNKNOWN_SET_STR "Unknown set"
+#define AFR_MSG_NO_XL_ID_STR "xl does not have id"
+#define AFR_MSG_SELF_HEAL_INFO_START_STR "starting full sweep on"
+#define AFR_MSG_SELF_HEAL_INFO_FINISH_STR "finished full sweep on"
+#define AFR_MSG_INCRE_COUNT_STR "Could not increment the counter."
+#define AFR_MSG_ADD_TO_OUTPUT_FAILED_STR "Could not add to output"
+#define AFR_MSG_SET_TIME_FAILED_STR "Could not set time"
+#define AFR_MSG_GFID_HEAL_MSG_STR "Error setting gfid-heal-msg dict"
+#define AFR_MSG_NO_MAJORITY_TO_RESOLVE_STR \
+ "No majority to resolve gfid split brain"
+#define AFR_MSG_GFID_MISMATCH_DETECTED_STR "Gfid mismatch dectected"
+#define AFR_MSG_SELF_HEAL_INFO_STR "performing selfheal"
+#define AFR_MSG_TYPE_MISMATCH_STR "TYPE mismatch"
+#define AFR_MSG_SIZE_POLICY_NOT_APPLICABLE_STR \
+ "Size policy is not applicable to directories."
+#define AFR_MSG_NO_CHILD_SELECTED_STR \
+ "No child selected by favorite-child policy"
+#define AFR_MSG_INVALID_CHILD_STR "Invalid child"
+#define AFR_MSG_RESOLVE_CONFLICTING_DATA_STR \
+ "selected as authentic to resolve conflicting data"
+#define SERROR_GETTING_SRC_BRICK_STR "Error getting the source brick"
+#define SNO_DIFF_IN_MTIME_STR "No difference in mtime"
+#define SNO_BIGGER_FILE_STR "No bigger file"
+#define SALL_BRICKS_UP_TO_RESOLVE_STR \
+ "All the bricks should be up to resolve the gfid split brain"
+#define AFR_MSG_UNLOCK_FAILED_STR "Failed to unlock"
+#define AFR_MSG_POST_OP_FAILED_STR "Post-op on thin-arbiter failed"
+#define AFR_MSG_TA_FRAME_CREATE_FAILED_STR "Failed to create ta_frame"
+#define AFR_MSG_SET_KEY_XATTROP_FAILED_STR "Could not set key during xattrop"
+#define AFR_MSG_BLOCKING_ENTRYLKS_FAILED_STR "Blocking entrylks failed"
+#define AFR_MSG_FSYNC_FAILED_STR "fsync failed"
+#define AFR_MSG_QUORUM_FAIL_STR "quorum is not met"
+#define AFR_MSG_FOP_FAILED_STR "Failing Fop"
+#define AFR_MSG_INVALID_SUBVOL_STR "not a subvolume"
+#define AFR_MSG_VOL_MISCONFIGURED_STR "Volume is dangling"
+#define AFR_MSG_CHILD_MISCONFIGURED_STR \
+ "replicate translator needs more than one subvolume defined"
+#define AFR_MSG_CLEAN_UP_FAILED_STR "Failed to clean up healer threads"
+#define AFR_MSG_QUORUM_OVERRIDE_STR "overriding quorum-count"
+#define AFR_MSG_UNABLE_TO_FETCH_STR \
+ "Unable to fetch afr-pending-xattr option from volfile. Falling back to " \
+ "using client translator names"
+#define AFR_MSG_NULL_DEREF_STR "possible NULL deref"
+#define AFR_MSG_XATTR_SET_FAILED_STR "Cannot set xattr cookie key"
+#define AFR_MSG_SPLIT_BRAIN_STATUS_STR "Failed to create synctask"
+#define AFR_MSG_SUBVOLS_DOWN_STR "All subvolumes are not up"
+#define AFR_MSG_SPLIT_BRAIN_CHOICE_ERROR_STR \
+ "Failed to cancel split-brain choice"
+#define AFR_MSG_SPLIT_BRAIN_REPLICA_STR \
+ "Cannot set replica. File is not in data/metadata split-brain"
+#define AFR_MSG_INODE_CTX_FAILED_STR "Failed to get inode_ctx"
+#define AFR_MSG_READ_SUBVOL_ERROR_STR "no read subvols"
+#define AFR_MSG_LOCAL_CHILD_STR "selecting local read-child"
+#define AFR_MSG_LOOKUP_FAILED_STR "Failed to lookup/create thin-arbiter id file"
+#define AFR_MSG_TIMER_CREATE_FAIL_STR \
+ "Cannot create timer for delayed initialization"
+#define AFR_MSG_SUBVOL_UP_STR "Subvolume came back up; going online"
+#define AFR_MSG_ALL_SUBVOLS_DOWN_STR \
+ "All subvolumes are down. Going offline until atleast one of them is up"
+#define AFR_MSG_RELEASE_LOCK_FAILED_STR "Failed to release lock"
+#define AFR_MSG_INVALID_CHILD_UP_STR "Received child_up from invalid subvolume"
+#define AFR_MSG_QUORUM_MET_STR "Client-quorum is met"
+#define AFR_MSG_EXPUNGING_FILE_OR_DIR_STR "expunging file or dir"
+#define AFR_MSG_SELF_HEAL_FAILED_STR "Invalid"
+#define AFR_MSG_SPLIT_BRAIN_STR "Skipping conservative mergeon the file"
+#define AFR_MSG_CLEAR_TIME_SPLIT_BRAIN_STR "clear time split brain"
+#define AFR_MSG_READ_FAILED_STR "Failing read since good brick is down"
+#define AFR_MSG_LAUNCH_FAILED_STR "Failed to launch synctask"
+#define AFR_MSG_READ_SUBVOL_NOT_UP_STR \
+ "read subvolume in this generation is not up"
+#define AFR_MSG_INTERNAL_LKS_FAILED_STR \
+ "Unable to work with lk-owner while attempting fop"
+#define AFR_MSG_LOCK_XLATOR_NOT_LOADED_STR \
+ "subvolume does not support locking. please load features/locks xlator " \
+ "on server."
+#define AFR_MSG_FD_CTX_GET_FAILED_STR "unable to get fd ctx"
+#define AFR_MSG_INFO_COMMON_STR "fd not open on any subvolumes, aborting."
+#define AFR_MSG_REPLACE_BRICK_STATUS_STR "Couldn't acquire lock on any child."
+#define AFR_MSG_NEW_BRICK_STR "New brick"
+#define AFR_MSG_SPLIT_BRAIN_SET_FAILED_STR \
+ "Failed to set split-brain choice to -1"
+#define AFR_MSG_SPLIT_BRAIN_DETERMINE_FAILED_STR \
+ "Failed to determine split-brain. Aborting split-brain-choice set"
+#define AFR_MSG_OPEN_FAIL_STR "Failed to open subvolume"
+#define AFR_MSG_SET_PEND_XATTR_STR "Set of pending xattr"
+#define AFR_MSG_INTERNAL_ATTR_STR "is an internal extended attribute"
#endif /* !_AFR_MESSAGES_H_ */
diff --git a/xlators/cluster/afr/src/afr-open.c b/xlators/cluster/afr/src/afr-open.c
index 059d3f9bd71..64856042b65 100644
--- a/xlators/cluster/afr/src/afr-open.c
+++ b/xlators/cluster/afr/src/afr-open.c
@@ -8,322 +8,346 @@
cases as published by the Free Software Foundation.
*/
-#include <libgen.h>
#include <unistd.h>
-#include <fnmatch.h>
#include <sys/time.h>
#include <stdlib.h>
#include <signal.h>
-#include "glusterfs.h"
+#include <glusterfs/glusterfs.h>
#include "afr.h"
-#include "dict.h"
-#include "xlator.h"
-#include "hashfn.h"
-#include "logging.h"
-#include "stack.h"
-#include "list.h"
-#include "call-stub.h"
-#include "defaults.h"
-#include "common-utils.h"
-#include "compat-errno.h"
-#include "compat.h"
-#include "byte-order.h"
-#include "statedump.h"
-
-#include "fd.h"
-
-#include "afr-inode-read.h"
-#include "afr-inode-write.h"
-#include "afr-dir-read.h"
-#include "afr-dir-write.h"
-#include "afr-transaction.h"
+#include <glusterfs/dict.h>
+#include <glusterfs/logging.h>
+#include <glusterfs/defaults.h>
+#include <glusterfs/common-utils.h>
+#include <glusterfs/compat-errno.h>
+#include <glusterfs/compat.h>
+#include <glusterfs/byte-order.h>
+#include <glusterfs/statedump.h>
+#include "afr-transaction.h"
gf_boolean_t
-afr_is_fd_fixable (fd_t *fd)
+afr_is_fd_fixable(fd_t *fd)
{
- if (!fd || !fd->inode)
- return _gf_false;
- else if (fd_is_anonymous (fd))
- return _gf_false;
- else if (gf_uuid_is_null (fd->inode->gfid))
- return _gf_false;
-
- return _gf_true;
+ if (!fd || !fd->inode)
+ return _gf_false;
+ else if (fd_is_anonymous(fd))
+ return _gf_false;
+ else if (gf_uuid_is_null(fd->inode->gfid))
+ return _gf_false;
+
+ return _gf_true;
}
-
int
-afr_open_ftruncate_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
- int32_t op_ret, int32_t op_errno, struct iatt *prebuf,
- struct iatt *postbuf, dict_t *xdata)
+afr_open_ftruncate_cbk(call_frame_t *frame, void *cookie, xlator_t *this,
+ int32_t op_ret, int32_t op_errno, struct iatt *prebuf,
+ struct iatt *postbuf, dict_t *xdata)
{
- afr_local_t * local = frame->local;
+ afr_local_t *local = frame->local;
- AFR_STACK_UNWIND (open, frame, local->op_ret, local->op_errno,
- local->fd, xdata);
- return 0;
+ AFR_STACK_UNWIND(open, frame, local->op_ret, local->op_errno,
+ local->cont.open.fd, xdata);
+ return 0;
}
-
int
-afr_open_cbk (call_frame_t *frame, void *cookie,
- xlator_t *this, int32_t op_ret, int32_t op_errno,
- fd_t *fd, dict_t *xdata)
+afr_open_cbk(call_frame_t *frame, void *cookie, xlator_t *this, int32_t op_ret,
+ int32_t op_errno, fd_t *fd, dict_t *xdata)
{
- afr_local_t * local = NULL;
- int call_count = -1;
- int child_index = (long) cookie;
- afr_fd_ctx_t *fd_ctx = NULL;
-
- local = frame->local;
- fd_ctx = local->fd_ctx;
-
- LOCK (&frame->lock);
- {
- if (op_ret == -1) {
- local->op_errno = op_errno;
- fd_ctx->opened_on[child_index] = AFR_FD_NOT_OPENED;
- } else {
- local->op_ret = op_ret;
- fd_ctx->opened_on[child_index] = AFR_FD_OPENED;
- if (!local->xdata_rsp && xdata)
- local->xdata_rsp = dict_ref (xdata);
- }
+ afr_local_t *local = NULL;
+ int call_count = -1;
+ int child_index = (long)cookie;
+ afr_fd_ctx_t *fd_ctx = NULL;
+
+ local = frame->local;
+ fd_ctx = local->fd_ctx;
+
+ local->replies[child_index].valid = 1;
+ local->replies[child_index].op_ret = op_ret;
+ local->replies[child_index].op_errno = op_errno;
+
+ LOCK(&frame->lock);
+ {
+ if (op_ret == -1) {
+ local->op_errno = op_errno;
+ fd_ctx->opened_on[child_index] = AFR_FD_NOT_OPENED;
+ } else {
+ local->op_ret = op_ret;
+ fd_ctx->opened_on[child_index] = AFR_FD_OPENED;
+ if (!local->xdata_rsp && xdata)
+ local->xdata_rsp = dict_ref(xdata);
}
- UNLOCK (&frame->lock);
-
- call_count = afr_frame_return (frame);
-
- if (call_count == 0) {
- if ((fd_ctx->flags & O_TRUNC) && (local->op_ret >= 0)) {
- STACK_WIND (frame, afr_open_ftruncate_cbk,
- this, this->fops->ftruncate,
- fd, 0, NULL);
- } else {
- AFR_STACK_UNWIND (open, frame, local->op_ret,
- local->op_errno, local->fd,
- local->xdata_rsp);
- }
+ call_count = --local->call_count;
+ }
+ UNLOCK(&frame->lock);
+
+ if (call_count == 0) {
+ afr_handle_replies_quorum(frame, this);
+ if (local->op_ret == -1) {
+ AFR_STACK_UNWIND(open, frame, local->op_ret, local->op_errno, NULL,
+ NULL);
+ } else if (fd_ctx->flags & O_TRUNC) {
+ STACK_WIND(frame, afr_open_ftruncate_cbk, this,
+ this->fops->ftruncate, fd, 0, NULL);
+ } else {
+ AFR_STACK_UNWIND(open, frame, local->op_ret, local->op_errno,
+ local->cont.open.fd, local->xdata_rsp);
}
+ }
- return 0;
+ return 0;
}
int
-afr_open (call_frame_t *frame, xlator_t *this, loc_t *loc, int32_t flags,
- fd_t *fd, dict_t *xdata)
+afr_open_continue(call_frame_t *frame, xlator_t *this, int err)
{
- afr_private_t * priv = NULL;
- afr_local_t * local = NULL;
- int i = 0;
- int32_t call_count = 0;
- int32_t op_errno = 0;
- afr_fd_ctx_t *fd_ctx = NULL;
-
- //We can't let truncation to happen outside transaction.
-
- priv = this->private;
-
- local = AFR_FRAME_INIT (frame, op_errno);
- if (!local)
- goto out;
-
- fd_ctx = afr_fd_ctx_get (fd, this);
- if (!fd_ctx) {
- op_errno = ENOMEM;
- goto out;
- }
-
- local->fd = fd_ref (fd);
- local->fd_ctx = fd_ctx;
- fd_ctx->flags = flags;
-
+ afr_local_t *local = NULL;
+ afr_private_t *priv = NULL;
+ int call_count = 0;
+ int i = 0;
+
+ local = frame->local;
+ priv = this->private;
+
+ if (err) {
+ AFR_STACK_UNWIND(open, frame, -1, err, NULL, NULL);
+ } else {
+ local->call_count = AFR_COUNT(local->child_up, priv->child_count);
call_count = local->call_count;
- local->cont.open.flags = flags;
-
for (i = 0; i < priv->child_count; i++) {
- if (local->child_up[i]) {
- STACK_WIND_COOKIE (frame, afr_open_cbk, (void *) (long) i,
- priv->children[i],
- priv->children[i]->fops->open,
- loc, (flags & ~O_TRUNC), fd, xdata);
- if (!--call_count)
- break;
- }
+ if (local->child_up[i]) {
+ STACK_WIND_COOKIE(frame, afr_open_cbk, (void *)(long)i,
+ priv->children[i],
+ priv->children[i]->fops->open, &local->loc,
+ (local->cont.open.flags & ~O_TRUNC),
+ local->cont.open.fd, local->xdata_req);
+ if (!--call_count)
+ break;
+ }
}
+ }
+ return 0;
+}
- return 0;
+int
+afr_open(call_frame_t *frame, xlator_t *this, loc_t *loc, int32_t flags,
+ fd_t *fd, dict_t *xdata)
+{
+ afr_private_t *priv = NULL;
+ afr_local_t *local = NULL;
+ int spb_subvol = 0;
+ int event_generation = 0;
+ int ret = 0;
+ int32_t op_errno = 0;
+ afr_fd_ctx_t *fd_ctx = NULL;
+
+ // We can't let truncation to happen outside transaction.
+
+ priv = this->private;
+
+ local = AFR_FRAME_INIT(frame, op_errno);
+ if (!local)
+ goto out;
+
+ local->op = GF_FOP_OPEN;
+ fd_ctx = afr_fd_ctx_get(fd, this);
+ if (!fd_ctx) {
+ op_errno = ENOMEM;
+ goto out;
+ }
+
+ if (priv->quorum_count && !afr_has_quorum(local->child_up, this, NULL)) {
+ op_errno = afr_quorum_errno(priv);
+ goto out;
+ }
+
+ if (!afr_is_consistent_io_possible(local, priv, &op_errno))
+ goto out;
+
+ local->inode = inode_ref(loc->inode);
+ loc_copy(&local->loc, loc);
+ local->fd_ctx = fd_ctx;
+ fd_ctx->flags = flags;
+ if (xdata)
+ local->xdata_req = dict_ref(xdata);
+
+ local->cont.open.flags = flags;
+ local->cont.open.fd = fd_ref(fd);
+
+ ret = afr_inode_get_readable(frame, local->inode, this, NULL,
+ &event_generation, AFR_DATA_TRANSACTION);
+ if ((ret < 0) &&
+ (afr_split_brain_read_subvol_get(local->inode, this, NULL,
+ &spb_subvol) == 0) &&
+ spb_subvol < 0) {
+ afr_inode_refresh(frame, this, local->inode, local->inode->gfid,
+ afr_open_continue);
+ } else {
+ afr_open_continue(frame, this, 0);
+ }
+
+ return 0;
out:
- AFR_STACK_UNWIND (open, frame, -1, op_errno, fd, NULL);
+ AFR_STACK_UNWIND(open, frame, -1, op_errno, fd, NULL);
- return 0;
+ return 0;
}
int
-afr_openfd_fix_open_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
- int32_t op_ret, int32_t op_errno, fd_t *fd,
- dict_t *xdata)
+afr_openfd_fix_open_cbk(call_frame_t *frame, void *cookie, xlator_t *this,
+ int32_t op_ret, int32_t op_errno, fd_t *fd,
+ dict_t *xdata)
{
- afr_local_t *local = NULL;
- afr_private_t *priv = NULL;
- afr_fd_ctx_t *fd_ctx = NULL;
- int call_count = 0;
- int child_index = (long) cookie;
-
- priv = this->private;
- local = frame->local;
-
+ afr_local_t *local = NULL;
+ afr_private_t *priv = NULL;
+ afr_fd_ctx_t *fd_ctx = NULL;
+ int call_count = 0;
+ int child_index = (long)cookie;
+
+ priv = this->private;
+ local = frame->local;
+
+ if (op_ret >= 0) {
+ gf_msg_debug(this->name, 0,
+ "fd for %s opened "
+ "successfully on subvolume %s",
+ local->loc.path, priv->children[child_index]->name);
+ } else {
+ gf_smsg(this->name, fop_log_level(GF_FOP_OPEN, op_errno), op_errno,
+ AFR_MSG_OPEN_FAIL, "path=%s", local->loc.path, "subvolume=%s",
+ priv->children[child_index]->name, NULL);
+ }
+
+ fd_ctx = local->fd_ctx;
+
+ LOCK(&local->fd->lock);
+ {
if (op_ret >= 0) {
- gf_msg_debug (this->name, 0, "fd for %s opened "
- "successfully on subvolume %s", local->loc.path,
- priv->children[child_index]->name);
+ fd_ctx->opened_on[child_index] = AFR_FD_OPENED;
} else {
- gf_msg (this->name, fop_log_level (GF_FOP_OPEN, op_errno),
- op_errno, AFR_MSG_OPEN_FAIL, "Failed to open %s on "
- "subvolume %s", local->loc.path,
- priv->children[child_index]->name);
+ fd_ctx->opened_on[child_index] = AFR_FD_NOT_OPENED;
}
+ }
+ UNLOCK(&local->fd->lock);
- fd_ctx = local->fd_ctx;
+ call_count = afr_frame_return(frame);
+ if (call_count == 0)
+ AFR_STACK_DESTROY(frame);
- LOCK (&local->fd->lock);
- {
- if (op_ret >= 0) {
- fd_ctx->opened_on[child_index] = AFR_FD_OPENED;
- } else {
- fd_ctx->opened_on[child_index] = AFR_FD_NOT_OPENED;
- }
- }
- UNLOCK (&local->fd->lock);
+ return 0;
+}
- call_count = afr_frame_return (frame);
- if (call_count == 0)
- AFR_STACK_DESTROY (frame);
+static int
+afr_fd_ctx_need_open(fd_t *fd, xlator_t *this, unsigned char *need_open)
+{
+ afr_fd_ctx_t *fd_ctx = NULL;
+ afr_private_t *priv = NULL;
+ int i = 0;
+ int count = 0;
+
+ priv = this->private;
+ fd_ctx = afr_fd_ctx_get(fd, this);
+ if (!fd_ctx)
return 0;
-}
+ LOCK(&fd->lock);
+ {
+ for (i = 0; i < priv->child_count; i++) {
+ if (fd_ctx->opened_on[i] == AFR_FD_NOT_OPENED &&
+ priv->child_up[i]) {
+ fd_ctx->opened_on[i] = AFR_FD_OPENING;
+ need_open[i] = 1;
+ count++;
+ } else {
+ need_open[i] = 0;
+ }
+ }
+ }
+ UNLOCK(&fd->lock);
-static int
-afr_fd_ctx_need_open (fd_t *fd, xlator_t *this, unsigned char *need_open)
-{
- afr_fd_ctx_t *fd_ctx = NULL;
- afr_private_t *priv = NULL;
- int i = 0;
- int count = 0;
-
- priv = this->private;
-
- fd_ctx = afr_fd_ctx_get (fd, this);
- if (!fd_ctx)
- return 0;
-
- LOCK (&fd->lock);
- {
- for (i = 0; i < priv->child_count; i++) {
- if (fd_ctx->opened_on[i] == AFR_FD_NOT_OPENED &&
- priv->child_up[i]) {
- fd_ctx->opened_on[i] = AFR_FD_OPENING;
- need_open[i] = 1;
- count++;
- } else {
- need_open[i] = 0;
- }
- }
- }
- UNLOCK (&fd->lock);
-
- return count;
+ return count;
}
-
void
-afr_fix_open (fd_t *fd, xlator_t *this)
+afr_fix_open(fd_t *fd, xlator_t *this)
{
- afr_private_t *priv = NULL;
- int i = 0;
- call_frame_t *frame = NULL;
- afr_local_t *local = NULL;
- int ret = -1;
- int32_t op_errno = 0;
- afr_fd_ctx_t *fd_ctx = NULL;
- unsigned char *need_open = NULL;
- int call_count = 0;
+ afr_private_t *priv = NULL;
+ int i = 0;
+ call_frame_t *frame = NULL;
+ afr_local_t *local = NULL;
+ int ret = -1;
+ int32_t op_errno = 0;
+ afr_fd_ctx_t *fd_ctx = NULL;
+ unsigned char *need_open = NULL;
+ int call_count = 0;
- priv = this->private;
+ priv = this->private;
- if (!afr_is_fd_fixable (fd))
- goto out;
+ if (!afr_is_fd_fixable(fd))
+ goto out;
- fd_ctx = afr_fd_ctx_get (fd, this);
- if (!fd_ctx)
- goto out;
+ fd_ctx = afr_fd_ctx_get(fd, this);
+ if (!fd_ctx)
+ goto out;
- need_open = alloca0 (priv->child_count);
+ need_open = alloca0(priv->child_count);
- call_count = afr_fd_ctx_need_open (fd, this, need_open);
- if (!call_count)
- goto out;
+ call_count = afr_fd_ctx_need_open(fd, this, need_open);
+ if (!call_count)
+ goto out;
- frame = create_frame (this, this->ctx->pool);
- if (!frame)
- goto out;
+ frame = create_frame(this, this->ctx->pool);
+ if (!frame)
+ goto out;
- local = AFR_FRAME_INIT (frame, op_errno);
- if (!local)
- goto out;
+ local = AFR_FRAME_INIT(frame, op_errno);
+ if (!local)
+ goto out;
- local->loc.inode = inode_ref (fd->inode);
- ret = loc_path (&local->loc, NULL);
- if (ret < 0)
- goto out;
+ local->loc.inode = inode_ref(fd->inode);
+ ret = loc_path(&local->loc, NULL);
+ if (ret < 0)
+ goto out;
- local->fd = fd_ref (fd);
- local->fd_ctx = fd_ctx;
+ local->fd = fd_ref(fd);
+ local->fd_ctx = fd_ctx;
- local->call_count = call_count;
+ local->call_count = call_count;
- gf_msg_debug (this->name, 0, "need open count: %d",
- call_count);
+ gf_msg_debug(this->name, 0, "need open count: %d", call_count);
- for (i = 0; i < priv->child_count; i++) {
- if (!need_open[i])
- continue;
-
- if (IA_IFDIR == fd->inode->ia_type) {
- gf_msg_debug (this->name, 0,
- "opening fd for dir %s on subvolume %s",
- local->loc.path, priv->children[i]->name);
-
- STACK_WIND_COOKIE (frame, afr_openfd_fix_open_cbk,
- (void*) (long) i,
- priv->children[i],
- priv->children[i]->fops->opendir,
- &local->loc, local->fd,
- NULL);
- } else {
- gf_msg_debug (this->name, 0,
- "opening fd for file %s on subvolume %s",
- local->loc.path, priv->children[i]->name);
-
- STACK_WIND_COOKIE (frame, afr_openfd_fix_open_cbk,
- (void *)(long) i,
- priv->children[i],
- priv->children[i]->fops->open,
- &local->loc,
- fd_ctx->flags & (~O_TRUNC),
- local->fd, NULL);
- }
-
- if (!--call_count)
- break;
+ for (i = 0; i < priv->child_count; i++) {
+ if (!need_open[i])
+ continue;
+
+ if (IA_IFDIR == fd->inode->ia_type) {
+ gf_msg_debug(this->name, 0, "opening fd for dir %s on subvolume %s",
+ local->loc.path, priv->children[i]->name);
+
+ STACK_WIND_COOKIE(frame, afr_openfd_fix_open_cbk, (void *)(long)i,
+ priv->children[i],
+ priv->children[i]->fops->opendir, &local->loc,
+ local->fd, NULL);
+ } else {
+ gf_msg_debug(this->name, 0,
+ "opening fd for file %s on subvolume %s",
+ local->loc.path, priv->children[i]->name);
+
+ STACK_WIND_COOKIE(frame, afr_openfd_fix_open_cbk, (void *)(long)i,
+ priv->children[i], priv->children[i]->fops->open,
+ &local->loc, fd_ctx->flags & (~O_TRUNC),
+ local->fd, NULL);
}
- return;
+ if (!--call_count)
+ break;
+ }
+
+ return;
out:
- if (frame)
- AFR_STACK_DESTROY (frame);
+ if (frame)
+ AFR_STACK_DESTROY(frame);
}
diff --git a/xlators/cluster/afr/src/afr-read-txn.c b/xlators/cluster/afr/src/afr-read-txn.c
index a70565c37a1..6fc2c75145c 100644
--- a/xlators/cluster/afr/src/afr-read-txn.c
+++ b/xlators/cluster/afr/src/afr-read-txn.c
@@ -12,125 +12,328 @@
#include "afr-transaction.h"
#include "afr-messages.h"
-int
-afr_read_txn_next_subvol (call_frame_t *frame, xlator_t *this)
+void
+afr_pending_read_increment(afr_private_t *priv, int child_index)
+{
+ if (child_index < 0 || child_index > priv->child_count)
+ return;
+
+ GF_ATOMIC_INC(priv->pending_reads[child_index]);
+}
+
+void
+afr_pending_read_decrement(afr_private_t *priv, int child_index)
{
- afr_local_t *local = NULL;
- afr_private_t *priv = NULL;
- int i = 0;
- int subvol = -1;
-
- local = frame->local;
- priv = this->private;
-
-
- for (i = 0; i < priv->child_count; i++) {
- if (!local->readable[i]) {
- /* don't even bother trying here.
- just mark as attempted and move on. */
- local->read_attempted[i] = 1;
- continue;
- }
-
- if (!local->read_attempted[i]) {
- subvol = i;
- break;
- }
- }
-
- /* If no more subvols were available for reading, we leave
- @subvol as -1, which is an indication we have run out of
- readable subvols. */
- if (subvol != -1)
- local->read_attempted[subvol] = 1;
- local->readfn (frame, this, subvol);
-
- return 0;
+ if (child_index < 0 || child_index > priv->child_count)
+ return;
+
+ GF_ATOMIC_DEC(priv->pending_reads[child_index]);
}
-#define AFR_READ_TXN_SET_ERROR_AND_GOTO(ret, errnum, index, label) \
- do { \
- local->op_ret = ret; \
- local->op_errno = errnum; \
- read_subvol = index; \
- gf_msg (this->name, GF_LOG_ERROR, EIO, AFR_MSG_SPLIT_BRAIN,\
- "Failing %s on gfid %s: split-brain observed.",\
- gf_fop_list[local->op], uuid_utoa (inode->gfid));\
- goto label; \
- } while (0)
+void
+afr_read_txn_wind(call_frame_t *frame, xlator_t *this, int subvol)
+{
+ afr_local_t *local = NULL;
+ afr_private_t *priv = NULL;
+
+ local = frame->local;
+ priv = this->private;
+
+ afr_pending_read_decrement(priv, local->read_subvol);
+ local->read_subvol = subvol;
+ afr_pending_read_increment(priv, subvol);
+ local->readfn(frame, this, subvol);
+}
int
-afr_read_txn_refresh_done (call_frame_t *frame, xlator_t *this, int err)
+afr_read_txn_next_subvol(call_frame_t *frame, xlator_t *this)
{
- afr_local_t *local = NULL;
- int read_subvol = 0;
- int event_generation = 0;
- inode_t *inode = NULL;
- int ret = -1;
- int spb_choice = -1;
-
- local = frame->local;
- inode = local->inode;
-
- if (err) {
- local->op_errno = -err;
- local->op_ret = -1;
- read_subvol = -1;
- goto readfn;
+ afr_local_t *local = NULL;
+ afr_private_t *priv = NULL;
+ int i = 0;
+ int subvol = -1;
+
+ local = frame->local;
+ priv = this->private;
+
+ for (i = 0; i < priv->child_count; i++) {
+ if (!local->readable[i]) {
+ /* don't even bother trying here.
+ just mark as attempted and move on. */
+ local->read_attempted[i] = 1;
+ continue;
}
- ret = afr_inode_get_readable (frame, inode, this, local->readable,
- &event_generation,
- local->transaction.type);
+ if (!local->read_attempted[i]) {
+ subvol = i;
+ break;
+ }
+ }
- if (ret == -1 || !event_generation)
- /* Even after refresh, we don't have a good
- read subvolume. Time to bail */
- AFR_READ_TXN_SET_ERROR_AND_GOTO (-1, EIO, -1, readfn);
+ /* If no more subvols were available for reading, we leave
+ @subvol as -1, which is an indication we have run out of
+ readable subvols. */
+ if (subvol != -1)
+ local->read_attempted[subvol] = 1;
+ afr_read_txn_wind(frame, this, subvol);
- read_subvol = afr_read_subvol_select_by_policy (inode, this,
- local->readable, NULL);
- if (read_subvol == -1)
- AFR_READ_TXN_SET_ERROR_AND_GOTO (-1, EIO, -1, readfn);
+ return 0;
+}
- if (local->read_attempted[read_subvol]) {
- afr_read_txn_next_subvol (frame, this);
- return 0;
- }
+static int
+afr_ta_read_txn_done(int ret, call_frame_t *ta_frame, void *opaque)
+{
+ STACK_DESTROY(ta_frame->root);
+ return 0;
+}
- local->read_attempted[read_subvol] = 1;
-readfn:
- if (read_subvol == -1) {
- ret = afr_inode_split_brain_choice_get (inode, this,
- &spb_choice);
- if ((ret == 0) && spb_choice >= 0)
- read_subvol = spb_choice;
- }
- local->readfn (frame, this, read_subvol);
+static int
+afr_ta_read_txn(void *opaque)
+{
+ call_frame_t *frame = NULL;
+ xlator_t *this = NULL;
+ int read_subvol = -1;
+ int query_child = AFR_CHILD_UNKNOWN;
+ int possible_bad_child = AFR_CHILD_UNKNOWN;
+ int ret = 0;
+ int op_errno = ENOMEM;
+ afr_local_t *local = NULL;
+ afr_private_t *priv = NULL;
+ struct gf_flock flock = {
+ 0,
+ };
+ dict_t *xdata_req = NULL;
+ dict_t *xdata_rsp = NULL;
+ int **pending = NULL;
+ loc_t loc = {
+ 0,
+ };
+
+ frame = (call_frame_t *)opaque;
+ this = frame->this;
+ local = frame->local;
+ priv = this->private;
+ query_child = local->read_txn_query_child;
+
+ if (query_child == AFR_CHILD_ZERO) {
+ possible_bad_child = AFR_CHILD_ONE;
+ } else if (query_child == AFR_CHILD_ONE) {
+ possible_bad_child = AFR_CHILD_ZERO;
+ } else {
+ /*read_txn_query_child is AFR_CHILD_UNKNOWN*/
+ goto out;
+ }
+
+ /* Ask the query_child to see if it blames the possibly bad one. */
+ xdata_req = dict_new();
+ if (!xdata_req)
+ goto out;
+
+ pending = afr_matrix_create(priv->child_count, AFR_NUM_CHANGE_LOGS);
+ if (!pending)
+ goto out;
+
+ ret = afr_set_pending_dict(priv, xdata_req, pending);
+ if (ret < 0)
+ goto out;
+
+ if (local->fd) {
+ ret = syncop_fxattrop(priv->children[query_child], local->fd,
+ GF_XATTROP_ADD_ARRAY, xdata_req, NULL, &xdata_rsp,
+ NULL);
+ } else {
+ ret = syncop_xattrop(priv->children[query_child], &local->loc,
+ GF_XATTROP_ADD_ARRAY, xdata_req, NULL, &xdata_rsp,
+ NULL);
+ }
+ if (ret || !xdata_rsp) {
+ gf_msg(this->name, GF_LOG_ERROR, -ret, AFR_MSG_THIN_ARB,
+ "Failed xattrop for gfid %s on %s",
+ uuid_utoa(local->inode->gfid),
+ priv->children[query_child]->name);
+ op_errno = -ret;
+ goto out;
+ }
+
+ if (afr_ta_dict_contains_pending_xattr(xdata_rsp, priv,
+ possible_bad_child)) {
+ read_subvol = query_child;
+ goto out;
+ }
+ dict_unref(xdata_rsp);
+ xdata_rsp = NULL;
+
+ /* It doesn't. So query thin-arbiter to see if it blames any data brick. */
+ ret = afr_fill_ta_loc(this, &loc, _gf_true);
+ if (ret) {
+ gf_msg(this->name, GF_LOG_ERROR, -ret, AFR_MSG_THIN_ARB,
+ "Failed to populate thin-arbiter loc for: %s.", loc.name);
+ goto out;
+ }
+ flock.l_type = F_WRLCK; /*start and length are already zero. */
+ ret = syncop_inodelk(priv->children[THIN_ARBITER_BRICK_INDEX],
+ AFR_TA_DOM_MODIFY, &loc, F_SETLKW, &flock, NULL, NULL);
+ if (ret) {
+ gf_msg(this->name, GF_LOG_ERROR, -ret, AFR_MSG_THIN_ARB,
+ "gfid:%s: Failed to get AFR_TA_DOM_MODIFY lock on %s.",
+ uuid_utoa(local->inode->gfid),
+ priv->pending_key[THIN_ARBITER_BRICK_INDEX]);
+ op_errno = -ret;
+ goto out;
+ }
+
+ ret = syncop_xattrop(priv->children[THIN_ARBITER_BRICK_INDEX], &loc,
+ GF_XATTROP_ADD_ARRAY, xdata_req, NULL, &xdata_rsp,
+ NULL);
+ if (ret || !xdata_rsp) {
+ gf_msg(this->name, GF_LOG_ERROR, -ret, AFR_MSG_THIN_ARB,
+ "gfid:%s: Failed xattrop on %s.", uuid_utoa(local->inode->gfid),
+ priv->pending_key[THIN_ARBITER_BRICK_INDEX]);
+ op_errno = -ret;
+ goto unlock;
+ }
+
+ if (!afr_ta_dict_contains_pending_xattr(xdata_rsp, priv, query_child)) {
+ read_subvol = query_child;
+ } else {
+ gf_msg(this->name, GF_LOG_ERROR, EIO, AFR_MSG_THIN_ARB,
+ "Failing read for gfid %s since good brick %s is down",
+ uuid_utoa(local->inode->gfid),
+ priv->children[possible_bad_child]->name);
+ op_errno = EIO;
+ }
+
+unlock:
+ flock.l_type = F_UNLCK;
+ ret = syncop_inodelk(priv->children[THIN_ARBITER_BRICK_INDEX],
+ AFR_TA_DOM_MODIFY, &loc, F_SETLK, &flock, NULL, NULL);
+ if (ret) {
+ gf_msg(this->name, GF_LOG_ERROR, -ret, AFR_MSG_THIN_ARB,
+ "gfid:%s: Failed to unlock AFR_TA_DOM_MODIFY lock on "
+ "%s.",
+ uuid_utoa(local->inode->gfid),
+ priv->pending_key[THIN_ARBITER_BRICK_INDEX]);
+ }
+out:
+ if (xdata_req)
+ dict_unref(xdata_req);
+ if (xdata_rsp)
+ dict_unref(xdata_rsp);
+ if (pending)
+ afr_matrix_cleanup(pending, priv->child_count);
+ loc_wipe(&loc);
+
+ if (read_subvol == -1) {
+ local->op_ret = -1;
+ local->op_errno = op_errno;
+ }
+ afr_read_txn_wind(frame, this, read_subvol);
+ return ret;
+}
- return 0;
+void
+afr_ta_read_txn_synctask(call_frame_t *frame, xlator_t *this)
+{
+ call_frame_t *ta_frame = NULL;
+ afr_local_t *local = NULL;
+ int ret = 0;
+
+ local = frame->local;
+ ta_frame = afr_ta_frame_create(this);
+ if (!ta_frame) {
+ local->op_ret = -1;
+ local->op_errno = ENOMEM;
+ gf_msg(this->name, GF_LOG_ERROR, ENOMEM, AFR_MSG_THIN_ARB,
+ "Failed to create ta_frame");
+ goto out;
+ }
+ ret = synctask_new(this->ctx->env, afr_ta_read_txn, afr_ta_read_txn_done,
+ ta_frame, frame);
+ if (ret) {
+ gf_msg(this->name, GF_LOG_ERROR, ENOMEM, AFR_MSG_THIN_ARB,
+ "Failed to launch "
+ "afr_ta_read_txn synctask for gfid %s.",
+ uuid_utoa(local->inode->gfid));
+ local->op_ret = -1;
+ local->op_errno = ENOMEM;
+ STACK_DESTROY(ta_frame->root);
+ goto out;
+ }
+ return;
+out:
+ afr_read_txn_wind(frame, this, -1);
}
+int
+afr_read_txn_refresh_done(call_frame_t *frame, xlator_t *this, int err)
+{
+ afr_private_t *priv = NULL;
+ afr_local_t *local = NULL;
+ int read_subvol = -1;
+ inode_t *inode = NULL;
+ int ret = -1;
+ int spb_subvol = -1;
+
+ local = frame->local;
+ inode = local->inode;
+ priv = this->private;
+
+ if (err) {
+ if (!priv->thin_arbiter_count)
+ goto readfn;
+ if (err != EINVAL)
+ goto readfn;
+ /* We need to query the good bricks and/or thin-arbiter.*/
+ afr_ta_read_txn_synctask(frame, this);
+ return 0;
+ }
+
+ read_subvol = afr_read_subvol_select_by_policy(inode, this, local->readable,
+ NULL);
+ if (read_subvol == -1) {
+ err = EIO;
+ goto readfn;
+ }
+
+ if (local->read_attempted[read_subvol]) {
+ afr_read_txn_next_subvol(frame, this);
+ return 0;
+ }
+
+ local->read_attempted[read_subvol] = 1;
+readfn:
+ if (read_subvol == -1) {
+ ret = afr_split_brain_read_subvol_get(inode, this, frame, &spb_subvol);
+ if ((ret == 0) && spb_subvol >= 0)
+ read_subvol = spb_subvol;
+ }
+
+ if (read_subvol == -1) {
+ AFR_SET_ERROR_AND_CHECK_SPLIT_BRAIN(-1, err);
+ }
+ afr_read_txn_wind(frame, this, read_subvol);
+
+ return 0;
+}
int
-afr_read_txn_continue (call_frame_t *frame, xlator_t *this, int subvol)
+afr_read_txn_continue(call_frame_t *frame, xlator_t *this, int subvol)
{
- afr_local_t *local = NULL;
+ afr_local_t *local = NULL;
- local = frame->local;
+ local = frame->local;
- if (!local->refreshed) {
- local->refreshed = _gf_true;
- afr_inode_refresh (frame, this, local->inode,
- afr_read_txn_refresh_done);
- } else {
- afr_read_txn_next_subvol (frame, this);
- }
+ if (!local->refreshed) {
+ local->refreshed = _gf_true;
+ afr_inode_refresh(frame, this, local->inode, NULL,
+ afr_read_txn_refresh_done);
+ } else {
+ afr_read_txn_next_subvol(frame, this);
+ }
- return 0;
+ return 0;
}
-
/* afr_read_txn_wipe:
clean internal variables in @local in order to make
@@ -139,27 +342,26 @@ afr_read_txn_continue (call_frame_t *frame, xlator_t *this, int subvol)
*/
void
-afr_read_txn_wipe (call_frame_t *frame, xlator_t *this)
+afr_read_txn_wipe(call_frame_t *frame, xlator_t *this)
{
- afr_local_t *local = NULL;
- afr_private_t *priv = NULL;
- int i = 0;
+ afr_local_t *local = NULL;
+ afr_private_t *priv = NULL;
+ int i = 0;
- local = frame->local;
- priv = this->private;
+ local = frame->local;
+ priv = this->private;
- local->readfn = NULL;
+ local->readfn = NULL;
- if (local->inode)
- inode_unref (local->inode);
+ if (local->inode)
+ inode_unref(local->inode);
- for (i = 0; i < priv->child_count; i++) {
- local->read_attempted[i] = 0;
- local->readable[i] = 0;
- }
+ for (i = 0; i < priv->child_count; i++) {
+ local->read_attempted[i] = 0;
+ local->readable[i] = 0;
+ }
}
-
/*
afr_read_txn:
@@ -188,87 +390,105 @@ afr_read_txn_wipe (call_frame_t *frame, xlator_t *this)
*/
int
-afr_read_txn (call_frame_t *frame, xlator_t *this, inode_t *inode,
- afr_read_txn_wind_t readfn, afr_transaction_type type)
+afr_read_txn(call_frame_t *frame, xlator_t *this, inode_t *inode,
+ afr_read_txn_wind_t readfn, afr_transaction_type type)
{
- afr_local_t *local = NULL;
- afr_private_t *priv = NULL;
- unsigned char *data = NULL;
- unsigned char *metadata = NULL;
- int read_subvol = -1;
- int event_generation = 0;
- int ret = -1;
-
- priv = this->private;
- local = frame->local;
- data = alloca0 (priv->child_count);
- metadata = alloca0 (priv->child_count);
-
- afr_read_txn_wipe (frame, this);
-
- local->readfn = readfn;
- local->inode = inode_ref (inode);
-
- if (priv->quorum_reads &&
- priv->quorum_count && !afr_has_quorum (priv->child_up, this)) {
- local->op_ret = -1;
- local->op_errno = ENOTCONN;
- read_subvol = -1;
- goto read;
- }
-
- local->transaction.type = type;
- if (local->op == GF_FOP_FSTAT || local->op == GF_FOP_STAT) {
- ret = afr_inode_read_subvol_get (inode, this, data, metadata,
- &event_generation);
- AFR_INTERSECT (local->readable, data, metadata,
- priv->child_count);
- } else {
- ret = afr_inode_read_subvol_type_get (inode, this, local->readable,
- &event_generation, type);
+ afr_local_t *local = NULL;
+ afr_private_t *priv = NULL;
+ unsigned char *data = NULL;
+ unsigned char *metadata = NULL;
+ int read_subvol = -1;
+ int event_generation = 0;
+ int ret = -1;
+
+ priv = this->private;
+ local = frame->local;
+ data = alloca0(priv->child_count);
+ metadata = alloca0(priv->child_count);
+
+ afr_read_txn_wipe(frame, this);
+
+ local->readfn = readfn;
+ local->inode = inode_ref(inode);
+ local->is_read_txn = _gf_true;
+ local->transaction.type = type;
+
+ if (priv->quorum_count && !afr_has_quorum(local->child_up, this, NULL)) {
+ local->op_ret = -1;
+ local->op_errno = afr_quorum_errno(priv);
+ goto read;
+ }
+
+ if (!afr_is_consistent_io_possible(local, priv, &local->op_errno)) {
+ local->op_ret = -1;
+ goto read;
+ }
+
+ if (priv->thin_arbiter_count && !afr_ta_has_quorum(priv, local)) {
+ local->op_ret = -1;
+ local->op_errno = -afr_quorum_errno(priv);
+ goto read;
+ }
+
+ if (priv->thin_arbiter_count &&
+ AFR_COUNT(local->child_up, priv->child_count) != priv->child_count) {
+ if (local->child_up[0]) {
+ local->read_txn_query_child = AFR_CHILD_ZERO;
+ } else if (local->child_up[1]) {
+ local->read_txn_query_child = AFR_CHILD_ONE;
}
- if (ret == -1)
- /* very first transaction on this inode */
- goto refresh;
-
- gf_msg_debug (this->name, 0, "%s: generation now vs cached: %d, "
- "%d", uuid_utoa (inode->gfid), local->event_generation,
- event_generation);
- if (local->event_generation != event_generation)
- /* servers have disconnected / reconnected, and possibly
- rebooted, very likely changing the state of freshness
- of copies */
- goto refresh;
-
- read_subvol = afr_read_subvol_select_by_policy (inode, this,
- local->readable, NULL);
-
- if (read_subvol < 0 || read_subvol > priv->child_count) {
- gf_msg (this->name, GF_LOG_WARNING, 0, AFR_MSG_SPLIT_BRAIN,
- "Unreadable subvolume %d found with event generation "
- "%d for gfid %s. (Possible split-brain)",
- read_subvol, event_generation, uuid_utoa(inode->gfid));
- goto refresh;
- }
-
- if (!local->child_up[read_subvol]) {
- /* should never happen, just in case */
- gf_msg (this->name, GF_LOG_WARNING, 0,
- AFR_MSG_READ_SUBVOL_ERROR, "subvolume %d is the "
- "read subvolume in this generation, but is not up",
- read_subvol);
- goto refresh;
- }
-
- local->read_attempted[read_subvol] = 1;
+ afr_ta_read_txn_synctask(frame, this);
+ return 0;
+ }
+
+ ret = afr_inode_read_subvol_get(inode, this, data, metadata,
+ &event_generation);
+ if (ret == -1)
+ /* very first transaction on this inode */
+ goto refresh;
+ AFR_INTERSECT(local->readable, data, metadata, priv->child_count);
+
+ gf_msg_debug(this->name, 0,
+ "%s: generation now vs cached: %d, "
+ "%d",
+ uuid_utoa(inode->gfid), local->event_generation,
+ event_generation);
+ if (afr_is_inode_refresh_reqd(inode, this, local->event_generation,
+ event_generation))
+ /* servers have disconnected / reconnected, and possibly
+ rebooted, very likely changing the state of freshness
+ of copies */
+ goto refresh;
+
+ read_subvol = afr_read_subvol_select_by_policy(inode, this, local->readable,
+ NULL);
+
+ if (read_subvol < 0 || read_subvol > priv->child_count) {
+ gf_msg_debug(this->name, 0,
+ "Unreadable subvolume %d found "
+ "with event generation %d for gfid %s.",
+ read_subvol, event_generation, uuid_utoa(inode->gfid));
+ goto refresh;
+ }
+
+ if (!local->child_up[read_subvol]) {
+ /* should never happen, just in case */
+ gf_msg(this->name, GF_LOG_WARNING, 0, AFR_MSG_READ_SUBVOL_ERROR,
+ "subvolume %d is the "
+ "read subvolume in this generation, but is not up",
+ read_subvol);
+ goto refresh;
+ }
+
+ local->read_attempted[read_subvol] = 1;
read:
- local->readfn (frame, this, read_subvol);
+ afr_read_txn_wind(frame, this, read_subvol);
- return 0;
+ return 0;
refresh:
- afr_inode_refresh (frame, this, inode, afr_read_txn_refresh_done);
+ afr_inode_refresh(frame, this, inode, NULL, afr_read_txn_refresh_done);
- return 0;
+ return 0;
}
diff --git a/xlators/cluster/afr/src/afr-self-heal-common.c b/xlators/cluster/afr/src/afr-self-heal-common.c
index 49c6bd0cc98..a580a1584cc 100644
--- a/xlators/cluster/afr/src/afr-self-heal-common.c
+++ b/xlators/cluster/afr/src/afr-self-heal-common.c
@@ -8,302 +8,808 @@
cases as published by the Free Software Foundation.
*/
-
#include "afr.h"
#include "afr-self-heal.h"
-#include "byte-order.h"
+#include <glusterfs/byte-order.h>
#include "protocol-common.h"
#include "afr-messages.h"
+#include <glusterfs/events.h>
+
+void
+afr_heal_synctask(xlator_t *this, afr_local_t *local);
int
-afr_selfheal_post_op_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
- int op_ret, int op_errno, dict_t *xattr, dict_t *xdata)
+afr_lookup_and_heal_gfid(xlator_t *this, inode_t *parent, const char *name,
+ inode_t *inode, struct afr_reply *replies, int source,
+ unsigned char *sources, void *gfid, int *gfid_idx)
{
- afr_local_t *local = NULL;
+ afr_private_t *priv = NULL;
+ call_frame_t *frame = NULL;
+ afr_local_t *local = NULL;
+ unsigned char *wind_on = NULL;
+ ia_type_t ia_type = IA_INVAL;
+ dict_t *xdata = NULL;
+ loc_t loc = {
+ 0,
+ };
+ int ret = 0;
+ int i = 0;
+
+ priv = this->private;
+ wind_on = alloca0(priv->child_count);
+ if (source >= 0 && replies[source].valid && replies[source].op_ret == 0)
+ ia_type = replies[source].poststat.ia_type;
+
+ if (ia_type != IA_INVAL)
+ goto heal;
+
+ /* If ia_type is still invalid, it means either
+ * (a)'source' was -1, i.e. parent dir pending xattrs are in split-brain
+ * (or) (b) The parent dir pending xattrs are all zeroes (i.e. all bricks
+ * are sources) and the 'source' we selected earlier might be the one where
+ * the file is not actually present.
+ *
+ * In both cases, let us pick a brick with a successful reply and use its
+ * ia_type.
+ * */
+ for (i = 0; i < priv->child_count; i++) {
+ if (source == -1) {
+ /* case (a) above. */
+ if (replies[i].valid && replies[i].op_ret == 0 &&
+ replies[i].poststat.ia_type != IA_INVAL) {
+ ia_type = replies[i].poststat.ia_type;
+ break;
+ }
+ } else {
+ /* case (b) above. */
+ if (i == source)
+ continue;
+ if (sources[i] && replies[i].valid && replies[i].op_ret == 0 &&
+ replies[i].poststat.ia_type != IA_INVAL) {
+ ia_type = replies[i].poststat.ia_type;
+ break;
+ }
+ }
+ }
+
+heal:
+ /* gfid heal on those subvolumes that do not have gfid associated
+ * with the inode and update those replies.
+ */
+ for (i = 0; i < priv->child_count; i++) {
+ if (!replies[i].valid || replies[i].op_ret != 0)
+ continue;
+
+ if (gf_uuid_is_null(gfid) &&
+ !gf_uuid_is_null(replies[i].poststat.ia_gfid) &&
+ replies[i].poststat.ia_type == ia_type)
+ gfid = replies[i].poststat.ia_gfid;
+
+ if (!gf_uuid_is_null(replies[i].poststat.ia_gfid) ||
+ replies[i].poststat.ia_type != ia_type)
+ continue;
+
+ wind_on[i] = 1;
+ }
+
+ if (AFR_COUNT(wind_on, priv->child_count) == 0)
+ return 0;
+
+ xdata = dict_new();
+ if (!xdata) {
+ ret = -ENOMEM;
+ goto out;
+ }
+
+ ret = dict_set_gfuuid(xdata, "gfid-req", gfid, true);
+ if (ret) {
+ ret = -ENOMEM;
+ goto out;
+ }
+
+ frame = afr_frame_create(this, &ret);
+ if (!frame) {
+ ret = -ret;
+ goto out;
+ }
+
+ local = frame->local;
+ loc.parent = inode_ref(parent);
+ gf_uuid_copy(loc.pargfid, parent->gfid);
+ loc.name = name;
+ loc.inode = inode_ref(inode);
+
+ AFR_ONLIST(wind_on, frame, afr_selfheal_discover_cbk, lookup, &loc, xdata);
+
+ for (i = 0; i < priv->child_count; i++) {
+ if (!wind_on[i])
+ continue;
+ afr_reply_wipe(&replies[i]);
+ afr_reply_copy(&replies[i], &local->replies[i]);
+ }
+ if (gfid_idx && (*gfid_idx == -1)) {
+ /*Pick a brick where the gifd heal was successful.*/
+ for (i = 0; i < priv->child_count; i++) {
+ if (!wind_on[i])
+ continue;
+ if (replies[i].valid && replies[i].op_ret == 0 &&
+ !gf_uuid_is_null(replies[i].poststat.ia_gfid)) {
+ *gfid_idx = i;
+ break;
+ }
+ }
+ }
+out:
+ if (gfid_idx && (*gfid_idx == -1) && (ret == 0) && local) {
+ ret = -afr_final_errno(local, priv);
+ }
+ loc_wipe(&loc);
+ if (frame)
+ AFR_STACK_DESTROY(frame);
+ if (xdata)
+ dict_unref(xdata);
+
+ return ret;
+}
- local = frame->local;
+int
+afr_gfid_sbrain_source_from_src_brick(xlator_t *this, struct afr_reply *replies,
+ char *src_brick)
+{
+ int i = 0;
+ afr_private_t *priv = NULL;
+
+ priv = this->private;
+ for (i = 0; i < priv->child_count; i++) {
+ if (!replies[i].valid || replies[i].op_ret == -1)
+ continue;
+ if (strcmp(priv->children[i]->name, src_brick) == 0)
+ return i;
+ }
+ return -1;
+}
- syncbarrier_wake (&local->barrier);
+int
+afr_selfheal_gfid_mismatch_by_majority(struct afr_reply *replies,
+ int child_count)
+{
+ int j = 0;
+ int i = 0;
+ int votes;
+
+ for (i = 0; i < child_count; i++) {
+ if (!replies[i].valid || replies[i].op_ret == -1)
+ continue;
+
+ votes = 1;
+ for (j = i + 1; j < child_count; j++) {
+ if ((!gf_uuid_compare(replies[i].poststat.ia_gfid,
+ replies[j].poststat.ia_gfid)))
+ votes++;
+ if (votes > child_count / 2)
+ return i;
+ }
+ }
- return 0;
+ return -1;
}
+int
+afr_gfid_sbrain_source_from_bigger_file(struct afr_reply *replies,
+ int child_count)
+{
+ int i = 0;
+ int src = -1;
+ uint64_t size = 0;
+
+ for (i = 0; i < child_count; i++) {
+ if (!replies[i].valid || replies[i].op_ret == -1)
+ continue;
+ if (size < replies[i].poststat.ia_size) {
+ src = i;
+ size = replies[i].poststat.ia_size;
+ } else if (replies[i].poststat.ia_size == size) {
+ src = -1;
+ }
+ }
+ return src;
+}
int
-afr_selfheal_post_op (call_frame_t *frame, xlator_t *this, inode_t *inode,
- int subvol, dict_t *xattr)
+afr_gfid_sbrain_source_from_latest_mtime(struct afr_reply *replies,
+ int child_count)
{
- afr_private_t *priv = NULL;
- afr_local_t *local = NULL;
- loc_t loc = {0, };
+ int i = 0;
+ int src = -1;
+ uint32_t mtime = 0;
+ uint32_t mtime_nsec = 0;
+
+ for (i = 0; i < child_count; i++) {
+ if (!replies[i].valid || replies[i].op_ret != 0)
+ continue;
+ if ((mtime < replies[i].poststat.ia_mtime) ||
+ ((mtime == replies[i].poststat.ia_mtime) &&
+ (mtime_nsec < replies[i].poststat.ia_mtime_nsec))) {
+ src = i;
+ mtime = replies[i].poststat.ia_mtime;
+ mtime_nsec = replies[i].poststat.ia_mtime_nsec;
+ } else if ((mtime == replies[i].poststat.ia_mtime) &&
+ (mtime_nsec == replies[i].poststat.ia_mtime_nsec)) {
+ src = -1;
+ }
+ }
+ return src;
+}
- priv = this->private;
- local = frame->local;
+int
+afr_gfid_split_brain_source(xlator_t *this, struct afr_reply *replies,
+ inode_t *inode, uuid_t pargfid, const char *bname,
+ int src_idx, int child_idx,
+ unsigned char *locked_on, int *src, dict_t *xdata)
+{
+ afr_private_t *priv = NULL;
+ char g1[64] = {
+ 0,
+ };
+ char g2[64] = {
+ 0,
+ };
+ int up_count = 0;
+ int heal_op = -1;
+ int ret = -1;
+ char *src_brick = NULL;
+
+ *src = -1;
+ priv = this->private;
+ up_count = AFR_COUNT(locked_on, priv->child_count);
+ if (up_count != priv->child_count) {
+ gf_msg(this->name, GF_LOG_ERROR, 0, AFR_MSG_SPLIT_BRAIN,
+ "All the bricks should be up to resolve the gfid split "
+ "barin");
+ if (xdata) {
+ ret = dict_set_sizen_str_sizen(xdata, "gfid-heal-msg",
+ SALL_BRICKS_UP_TO_RESOLVE);
+ if (ret)
+ gf_msg(this->name, GF_LOG_ERROR, 0, AFR_MSG_DICT_SET_FAILED,
+ "Error setting"
+ " gfid-heal-msg dict");
+ }
+ goto out;
+ }
- loc.inode = inode_ref (inode);
- gf_uuid_copy (loc.gfid, inode->gfid);
+ if (xdata) {
+ ret = dict_get_int32_sizen(xdata, "heal-op", &heal_op);
+ if (ret)
+ goto fav_child;
+ } else {
+ goto fav_child;
+ }
- STACK_WIND (frame, afr_selfheal_post_op_cbk, priv->children[subvol],
- priv->children[subvol]->fops->xattrop, &loc,
- GF_XATTROP_ADD_ARRAY, xattr, NULL);
+ switch (heal_op) {
+ case GF_SHD_OP_SBRAIN_HEAL_FROM_BIGGER_FILE:
+ *src = afr_gfid_sbrain_source_from_bigger_file(replies,
+ priv->child_count);
+ if (*src == -1) {
+ gf_msg(this->name, GF_LOG_ERROR, 0, AFR_MSG_SPLIT_BRAIN,
+ SNO_BIGGER_FILE);
+ if (xdata) {
+ ret = dict_set_sizen_str_sizen(xdata, "gfid-heal-msg",
+ SNO_BIGGER_FILE);
+ if (ret)
+ gf_msg(this->name, GF_LOG_ERROR, 0,
+ AFR_MSG_DICT_SET_FAILED,
+ "Error"
+ " setting gfid-heal-msg dict");
+ }
+ }
+ break;
+
+ case GF_SHD_OP_SBRAIN_HEAL_FROM_LATEST_MTIME:
+ *src = afr_gfid_sbrain_source_from_latest_mtime(replies,
+ priv->child_count);
+ if (*src == -1) {
+ gf_msg(this->name, GF_LOG_ERROR, 0, AFR_MSG_SPLIT_BRAIN,
+ SNO_DIFF_IN_MTIME);
+ if (xdata) {
+ ret = dict_set_sizen_str_sizen(xdata, "gfid-heal-msg",
+ SNO_DIFF_IN_MTIME);
+ if (ret)
+ gf_msg(this->name, GF_LOG_ERROR, 0,
+ AFR_MSG_DICT_SET_FAILED,
+ "Error"
+ "setting gfid-heal-msg dict");
+ }
+ }
+ break;
- syncbarrier_wait (&local->barrier, 1);
+ case GF_SHD_OP_SBRAIN_HEAL_FROM_BRICK:
+ ret = dict_get_str_sizen(xdata, "child-name", &src_brick);
+ if (ret) {
+ gf_msg(this->name, GF_LOG_ERROR, 0, AFR_MSG_SPLIT_BRAIN,
+ "Error getting the source "
+ "brick");
+ break;
+ }
+ *src = afr_gfid_sbrain_source_from_src_brick(this, replies,
+ src_brick);
+ if (*src == -1) {
+ gf_msg(this->name, GF_LOG_ERROR, 0, AFR_MSG_SPLIT_BRAIN,
+ SERROR_GETTING_SRC_BRICK);
+ if (xdata) {
+ ret = dict_set_sizen_str_sizen(xdata, "gfid-heal-msg",
+ SERROR_GETTING_SRC_BRICK);
+ if (ret)
+ gf_msg(this->name, GF_LOG_ERROR, 0,
+ AFR_MSG_DICT_SET_FAILED,
+ "Error"
+ " setting gfid-heal-msg dict");
+ }
+ }
+ break;
- loc_wipe (&loc);
+ default:
+ break;
+ }
+ goto out;
+
+fav_child:
+ switch (priv->fav_child_policy) {
+ case AFR_FAV_CHILD_BY_SIZE:
+ *src = afr_sh_fav_by_size(this, replies, inode);
+ break;
+ case AFR_FAV_CHILD_BY_MTIME:
+ *src = afr_sh_fav_by_mtime(this, replies, inode);
+ break;
+ case AFR_FAV_CHILD_BY_CTIME:
+ *src = afr_sh_fav_by_ctime(this, replies, inode);
+ break;
+ case AFR_FAV_CHILD_BY_MAJORITY:
+ if (priv->child_count != 2)
+ *src = afr_selfheal_gfid_mismatch_by_majority(
+ replies, priv->child_count);
+ else
+ *src = -1;
+
+ if (*src == -1) {
+ gf_msg(this->name, GF_LOG_ERROR, 0, AFR_MSG_SPLIT_BRAIN,
+ "No majority to resolve "
+ "gfid split brain");
+ }
+ break;
+ default:
+ break;
+ }
- return 0;
+out:
+ if (*src == -1) {
+ gf_msg(this->name, GF_LOG_ERROR, 0, AFR_MSG_SPLIT_BRAIN,
+ "Gfid mismatch detected for <gfid:%s>/%s>, %s on %s and"
+ " %s on %s.",
+ uuid_utoa(pargfid), bname,
+ uuid_utoa_r(replies[child_idx].poststat.ia_gfid, g1),
+ priv->children[child_idx]->name,
+ uuid_utoa_r(replies[src_idx].poststat.ia_gfid, g2),
+ priv->children[src_idx]->name);
+ gf_event(EVENT_AFR_SPLIT_BRAIN,
+ "client-pid=%d;"
+ "subvol=%s;type=gfid;file="
+ "<gfid:%s>/%s>;count=2;child-%d=%s;gfid-%d=%s;"
+ "child-%d=%s;gfid-%d=%s",
+ this->ctx->cmd_args.client_pid, this->name, uuid_utoa(pargfid),
+ bname, child_idx, priv->children[child_idx]->name, child_idx,
+ uuid_utoa_r(replies[child_idx].poststat.ia_gfid, g1), src_idx,
+ priv->children[src_idx]->name, src_idx,
+ uuid_utoa_r(replies[src_idx].poststat.ia_gfid, g2));
+ return -1;
+ }
+ return 0;
}
int
-afr_check_stale_error (struct afr_reply *replies, afr_private_t *priv)
+afr_selfheal_post_op_cbk(call_frame_t *frame, void *cookie, xlator_t *this,
+ int op_ret, int op_errno, dict_t *xattr, dict_t *xdata)
{
- int i = 0;
- int op_errno = 0;
- int tmp_errno = 0;
- int stale_count = 0;
+ afr_local_t *local = NULL;
- for (i = 0; i < priv->child_count; i++) {
- tmp_errno = replies[i].op_errno;
- if (tmp_errno == ENOENT || tmp_errno == ESTALE) {
- op_errno = afr_higher_errno (op_errno, tmp_errno);
- stale_count++;
- }
- }
- if (stale_count != priv->child_count)
- return -ENOTCONN;
- else
- return -op_errno;
+ local = frame->local;
+
+ local->op_ret = op_ret;
+ local->op_errno = op_errno;
+ syncbarrier_wake(&local->barrier);
+
+ return 0;
}
+int
+afr_selfheal_post_op(call_frame_t *frame, xlator_t *this, inode_t *inode,
+ int subvol, dict_t *xattr, dict_t *xdata)
+{
+ afr_private_t *priv = NULL;
+ afr_local_t *local = NULL;
+ loc_t loc = {
+ 0,
+ };
+ int ret = 0;
-dict_t *
-afr_selfheal_output_xattr (xlator_t *this, afr_transaction_type type,
- int *output_dirty, int **output_matrix, int subvol)
-{
- dict_t *xattr = NULL;
- afr_private_t *priv = NULL;
- int j = 0;
- int idx = 0;
- int ret = 0;
- int *raw = 0;
-
- priv = this->private;
- idx = afr_index_for_transaction_type (type);
-
- xattr = dict_new ();
- if (!xattr)
- return NULL;
-
- /* clear dirty */
- raw = GF_CALLOC (sizeof(int), AFR_NUM_CHANGE_LOGS, gf_afr_mt_int32_t);
- if (!raw)
- goto err;
-
- raw[idx] = hton32 (output_dirty[subvol]);
- ret = dict_set_bin (xattr, AFR_DIRTY, raw,
- sizeof(int) * AFR_NUM_CHANGE_LOGS);
- if (ret) {
- GF_FREE (raw);
- goto err;
- }
+ priv = this->private;
+ local = frame->local;
- /* clear/set pending */
- for (j = 0; j < priv->child_count; j++) {
- raw = GF_CALLOC (sizeof(int), AFR_NUM_CHANGE_LOGS,
- gf_afr_mt_int32_t);
- if (!raw)
- goto err;
+ loc.inode = inode_ref(inode);
+ gf_uuid_copy(loc.gfid, inode->gfid);
- raw[idx] = hton32 (output_matrix[subvol][j]);
+ local->op_ret = 0;
- ret = dict_set_bin (xattr, priv->pending_key[j],
- raw, sizeof(int) * AFR_NUM_CHANGE_LOGS);
- if (ret) {
- GF_FREE (raw);
- goto err;
- }
- }
+ STACK_WIND(frame, afr_selfheal_post_op_cbk, priv->children[subvol],
+ priv->children[subvol]->fops->xattrop, &loc,
+ GF_XATTROP_ADD_ARRAY, xattr, xdata);
- return xattr;
-err:
- if (xattr)
- dict_unref (xattr);
- return NULL;
-}
+ syncbarrier_wait(&local->barrier, 1);
+ if (local->op_ret < 0)
+ ret = -local->op_errno;
+
+ loc_wipe(&loc);
+ local->op_ret = 0;
+ return ret;
+}
int
-afr_selfheal_undo_pending (call_frame_t *frame, xlator_t *this, inode_t *inode,
- unsigned char *sources, unsigned char *sinks,
- unsigned char *healed_sinks, afr_transaction_type type,
- struct afr_reply *replies, unsigned char *locked_on)
+afr_check_stale_error(struct afr_reply *replies, afr_private_t *priv)
{
- afr_private_t *priv = NULL;
- int i = 0;
- int j = 0;
- unsigned char *pending = NULL;
- int *input_dirty = NULL;
- int **input_matrix = NULL;
- int *output_dirty = NULL;
- int **output_matrix = NULL;
- dict_t *xattr = NULL;
+ int i = 0;
+ int op_errno = 0;
+ int tmp_errno = 0;
+ int stale_count = 0;
+
+ for (i = 0; i < priv->child_count; i++) {
+ tmp_errno = replies[i].op_errno;
+ if (tmp_errno == ENOENT || tmp_errno == ESTALE) {
+ op_errno = afr_higher_errno(op_errno, tmp_errno);
+ stale_count++;
+ }
+ }
+ if (stale_count != priv->child_count)
+ return -ENOTCONN;
+ else
+ return -op_errno;
+}
- priv = this->private;
+int
+afr_sh_generic_fop_cbk(call_frame_t *frame, void *cookie, xlator_t *this,
+ int op_ret, int op_errno, struct iatt *pre,
+ struct iatt *post, dict_t *xdata)
+{
+ int i = (long)cookie;
+ afr_local_t *local = NULL;
- pending = alloca0 (priv->child_count);
+ local = frame->local;
- input_dirty = alloca0 (priv->child_count * sizeof (int));
- input_matrix = ALLOC_MATRIX (priv->child_count, int);
- output_dirty = alloca0 (priv->child_count * sizeof (int));
- output_matrix = ALLOC_MATRIX (priv->child_count, int);
+ local->replies[i].valid = 1;
+ local->replies[i].op_ret = op_ret;
+ local->replies[i].op_errno = op_errno;
+ if (pre)
+ local->replies[i].prestat = *pre;
+ if (post)
+ local->replies[i].poststat = *post;
+ if (xdata)
+ local->replies[i].xdata = dict_ref(xdata);
- afr_selfheal_extract_xattr (this, replies, type, input_dirty,
- input_matrix);
+ syncbarrier_wake(&local->barrier);
- for (i = 0; i < priv->child_count; i++)
- if (sinks[i] && !healed_sinks[i])
- pending[i] = 1;
+ return 0;
+}
- for (i = 0; i < priv->child_count; i++) {
- for (j = 0; j < priv->child_count; j++) {
- if (pending[j])
- output_matrix[i][j] = 1;
- else
- output_matrix[i][j] = -input_matrix[i][j];
- }
- }
+int
+afr_selfheal_restore_time(call_frame_t *frame, xlator_t *this, inode_t *inode,
+ int source, unsigned char *healed_sinks,
+ struct afr_reply *replies)
+{
+ loc_t loc = {
+ 0,
+ };
- for (i = 0; i < priv->child_count; i++) {
- if (!pending[i])
- output_dirty[i] = -input_dirty[i];
- }
+ loc.inode = inode_ref(inode);
+ gf_uuid_copy(loc.gfid, inode->gfid);
- for (i = 0; i < priv->child_count; i++) {
- if (!locked_on[i])
- /* perform post-op only on subvols we had locked
- and inspected on.
- */
- continue;
+ AFR_ONLIST(healed_sinks, frame, afr_sh_generic_fop_cbk, setattr, &loc,
+ &replies[source].poststat,
+ (GF_SET_ATTR_ATIME | GF_SET_ATTR_MTIME | GF_SET_ATTR_CTIME),
+ NULL);
- xattr = afr_selfheal_output_xattr (this, type, output_dirty,
- output_matrix, i);
- if (!xattr) {
- continue;
- }
+ loc_wipe(&loc);
- afr_selfheal_post_op (frame, this, inode, i, xattr);
+ return 0;
+}
- dict_unref (xattr);
- }
+dict_t *
+afr_selfheal_output_xattr(xlator_t *this, gf_boolean_t is_full_crawl,
+ afr_transaction_type type, int *output_dirty,
+ int **output_matrix, int subvol,
+ int **full_heal_mtx_out)
+{
+ int j = 0;
+ int idx = 0;
+ int d_idx = 0;
+ int ret = 0;
+ int *raw = 0;
+ dict_t *xattr = NULL;
+ afr_private_t *priv = NULL;
+
+ priv = this->private;
+ idx = afr_index_for_transaction_type(type);
+ d_idx = afr_index_for_transaction_type(AFR_DATA_TRANSACTION);
+
+ xattr = dict_new();
+ if (!xattr)
+ return NULL;
+
+ /* clear dirty */
+ raw = GF_CALLOC(sizeof(int), AFR_NUM_CHANGE_LOGS, gf_afr_mt_int32_t);
+ if (!raw)
+ goto err;
+
+ raw[idx] = hton32(output_dirty[subvol]);
+ ret = dict_set_bin(xattr, AFR_DIRTY, raw,
+ sizeof(int) * AFR_NUM_CHANGE_LOGS);
+ if (ret) {
+ GF_FREE(raw);
+ goto err;
+ }
+
+ /* clear/set pending */
+ for (j = 0; j < priv->child_count; j++) {
+ raw = GF_CALLOC(sizeof(int), AFR_NUM_CHANGE_LOGS, gf_afr_mt_int32_t);
+ if (!raw)
+ goto err;
+
+ raw[idx] = hton32(output_matrix[subvol][j]);
+ if (is_full_crawl)
+ raw[d_idx] = hton32(full_heal_mtx_out[subvol][j]);
+
+ ret = dict_set_bin(xattr, priv->pending_key[j], raw,
+ sizeof(int) * AFR_NUM_CHANGE_LOGS);
+ if (ret) {
+ GF_FREE(raw);
+ goto err;
+ }
+ }
- return 0;
+ return xattr;
+err:
+ if (xattr)
+ dict_unref(xattr);
+ return NULL;
}
+int
+afr_selfheal_undo_pending(call_frame_t *frame, xlator_t *this, inode_t *inode,
+ unsigned char *sources, unsigned char *sinks,
+ unsigned char *healed_sinks,
+ unsigned char *undid_pending,
+ afr_transaction_type type, struct afr_reply *replies,
+ unsigned char *locked_on)
+{
+ afr_private_t *priv = NULL;
+ afr_local_t *local = NULL;
+ int i = 0;
+ int j = 0;
+ unsigned char *pending = NULL;
+ int *input_dirty = NULL;
+ int **input_matrix = NULL;
+ int **full_heal_mtx_in = NULL;
+ int **full_heal_mtx_out = NULL;
+ int *output_dirty = NULL;
+ int **output_matrix = NULL;
+ dict_t *xattr = NULL;
+ dict_t *xdata = NULL;
+
+ priv = this->private;
+ local = frame->local;
+
+ pending = alloca0(priv->child_count);
+
+ input_dirty = alloca0(priv->child_count * sizeof(int));
+ input_matrix = ALLOC_MATRIX(priv->child_count, int);
+ full_heal_mtx_in = ALLOC_MATRIX(priv->child_count, int);
+ full_heal_mtx_out = ALLOC_MATRIX(priv->child_count, int);
+ output_dirty = alloca0(priv->child_count * sizeof(int));
+ output_matrix = ALLOC_MATRIX(priv->child_count, int);
+
+ xdata = dict_new();
+ if (!xdata)
+ return -1;
+
+ afr_selfheal_extract_xattr(this, replies, type, input_dirty, input_matrix);
+
+ if (local->need_full_crawl)
+ afr_selfheal_extract_xattr(this, replies, AFR_DATA_TRANSACTION, NULL,
+ full_heal_mtx_in);
+
+ for (i = 0; i < priv->child_count; i++)
+ if (sinks[i] && !healed_sinks[i])
+ pending[i] = 1;
+
+ for (i = 0; i < priv->child_count; i++) {
+ for (j = 0; j < priv->child_count; j++) {
+ if (pending[j]) {
+ output_matrix[i][j] = 1;
+ if (type == AFR_ENTRY_TRANSACTION)
+ full_heal_mtx_out[i][j] = 1;
+ } else if (locked_on[j]) {
+ output_matrix[i][j] = -input_matrix[i][j];
+ if (type == AFR_ENTRY_TRANSACTION)
+ full_heal_mtx_out[i][j] = -full_heal_mtx_in[i][j];
+ }
+ }
+ }
+
+ for (i = 0; i < priv->child_count; i++) {
+ if (!pending[i])
+ output_dirty[i] = -input_dirty[i];
+ }
+
+ for (i = 0; i < priv->child_count; i++) {
+ if (!locked_on[i])
+ /* perform post-op only on subvols we had locked
+ and inspected on.
+ */
+ continue;
+ if (undid_pending[i])
+ /* We already unset the pending xattrs in
+ * _afr_fav_child_reset_sink_xattrs(). */
+ continue;
+
+ xattr = afr_selfheal_output_xattr(this, local->need_full_crawl, type,
+ output_dirty, output_matrix, i,
+ full_heal_mtx_out);
+ if (!xattr) {
+ continue;
+ }
+
+ if ((type == AFR_ENTRY_TRANSACTION) && (priv->esh_granular)) {
+ if (xdata && dict_set_int8(xdata, GF_XATTROP_PURGE_INDEX, 1))
+ gf_msg(this->name, GF_LOG_WARNING, 0, AFR_MSG_DICT_SET_FAILED,
+ "Failed to set"
+ " dict value for %s",
+ GF_XATTROP_PURGE_INDEX);
+ }
+
+ afr_selfheal_post_op(frame, this, inode, i, xattr, xdata);
+ dict_unref(xattr);
+ }
+
+ if (xdata)
+ dict_unref(xdata);
+
+ return 0;
+}
void
-afr_replies_copy (struct afr_reply *dst, struct afr_reply *src, int count)
-{
- int i = 0;
- dict_t *xdata = NULL;
-
- if (dst == src)
- return;
-
- for (i = 0; i < count; i++) {
- dst[i].valid = src[i].valid;
- dst[i].op_ret = src[i].op_ret;
- dst[i].op_errno = src[i].op_errno;
- dst[i].prestat = src[i].prestat;
- dst[i].poststat = src[i].poststat;
- dst[i].preparent = src[i].preparent;
- dst[i].postparent = src[i].postparent;
- dst[i].preparent2 = src[i].preparent2;
- dst[i].postparent2 = src[i].postparent2;
- if (src[i].xdata)
- xdata = dict_ref (src[i].xdata);
- else
- xdata = NULL;
- if (dst[i].xdata)
- dict_unref (dst[i].xdata);
- dst[i].xdata = xdata;
- memcpy (dst[i].checksum, src[i].checksum,
- MD5_DIGEST_LENGTH);
- }
+afr_reply_copy(struct afr_reply *dst, struct afr_reply *src)
+{
+ dict_t *xdata = NULL;
+
+ dst->valid = src->valid;
+ dst->op_ret = src->op_ret;
+ dst->op_errno = src->op_errno;
+ dst->prestat = src->prestat;
+ dst->poststat = src->poststat;
+ dst->preparent = src->preparent;
+ dst->postparent = src->postparent;
+ dst->preparent2 = src->preparent2;
+ dst->postparent2 = src->postparent2;
+ if (src->xdata)
+ xdata = dict_ref(src->xdata);
+ else
+ xdata = NULL;
+ if (dst->xdata)
+ dict_unref(dst->xdata);
+ dst->xdata = xdata;
+ if (xdata && dict_get_str_boolean(xdata, "fips-mode-rchecksum",
+ _gf_false) == _gf_true) {
+ memcpy(dst->checksum, src->checksum, SHA256_DIGEST_LENGTH);
+ } else {
+ memcpy(dst->checksum, src->checksum, MD5_DIGEST_LENGTH);
+ }
+ dst->fips_mode_rchecksum = src->fips_mode_rchecksum;
}
+void
+afr_replies_copy(struct afr_reply *dst, struct afr_reply *src, int count)
+{
+ int i = 0;
+
+ if (dst == src)
+ return;
+
+ for (i = 0; i < count; i++) {
+ afr_reply_copy(&dst[i], &src[i]);
+ }
+}
int
-afr_selfheal_fill_dirty (xlator_t *this, int *dirty, int subvol,
- int idx, dict_t *xdata)
+afr_selfheal_fill_dirty(xlator_t *this, int *dirty, int subvol, int idx,
+ dict_t *xdata)
{
- void *pending_raw = NULL;
- int pending[3] = {0, };
+ void *pending_raw = NULL;
+ int pending[3] = {
+ 0,
+ };
- if (dict_get_ptr (xdata, AFR_DIRTY, &pending_raw))
- return -1;
+ if (!dirty)
+ return 0;
- if (!pending_raw)
- return -1;
+ if (dict_get_ptr(xdata, AFR_DIRTY, &pending_raw))
+ return -1;
- memcpy (pending, pending_raw, sizeof(pending));
+ if (!pending_raw)
+ return -1;
- dirty[subvol] = ntoh32 (pending[idx]);
+ memcpy(pending, pending_raw, sizeof(pending));
- return 0;
-}
+ dirty[subvol] = ntoh32(pending[idx]);
+ return 0;
+}
int
-afr_selfheal_fill_matrix (xlator_t *this, int **matrix, int subvol,
- int idx, dict_t *xdata)
+afr_selfheal_fill_matrix(xlator_t *this, int **matrix, int subvol, int idx,
+ dict_t *xdata)
{
- int i = 0;
- void *pending_raw = NULL;
- int pending[3] = {0, };
- afr_private_t *priv = NULL;
+ int i = 0;
+ void *pending_raw = NULL;
+ int pending[3] = {
+ 0,
+ };
+ afr_private_t *priv = NULL;
- priv = this->private;
+ priv = this->private;
- for (i = 0; i < priv->child_count; i++) {
- if (dict_get_ptr (xdata, priv->pending_key[i], &pending_raw))
- continue;
+ if (!matrix)
+ return 0;
- if (!pending_raw)
- continue;
+ for (i = 0; i < priv->child_count; i++) {
+ if (dict_get_ptr(xdata, priv->pending_key[i], &pending_raw))
+ continue;
- memcpy (pending, pending_raw, sizeof(pending));
+ if (!pending_raw)
+ continue;
- matrix[subvol][i] = ntoh32 (pending[idx]);
- }
+ memcpy(pending, pending_raw, sizeof(pending));
- return 0;
-}
+ matrix[subvol][i] = ntoh32(pending[idx]);
+ }
+ return 0;
+}
int
-afr_selfheal_extract_xattr (xlator_t *this, struct afr_reply *replies,
- afr_transaction_type type, int *dirty, int **matrix)
+afr_selfheal_extract_xattr(xlator_t *this, struct afr_reply *replies,
+ afr_transaction_type type, int *dirty, int **matrix)
{
- afr_private_t *priv = NULL;
- int i = 0;
- dict_t *xdata = NULL;
- int idx = -1;
+ afr_private_t *priv = NULL;
+ int i = 0;
+ dict_t *xdata = NULL;
+ int idx = -1;
- idx = afr_index_for_transaction_type (type);
+ idx = afr_index_for_transaction_type(type);
- priv = this->private;
+ priv = this->private;
- for (i = 0; i < priv->child_count; i++) {
- if (!replies[i].xdata)
- continue;
+ for (i = 0; i < priv->child_count; i++) {
+ if (!replies[i].valid || replies[i].op_ret != 0)
+ continue;
- xdata = replies[i].xdata;
+ if (!replies[i].xdata)
+ continue;
- afr_selfheal_fill_dirty (this, dirty, i, idx, xdata);
- afr_selfheal_fill_matrix (this, matrix, i, idx, xdata);
- }
+ xdata = replies[i].xdata;
- return 0;
+ afr_selfheal_fill_dirty(this, dirty, i, idx, xdata);
+ afr_selfheal_fill_matrix(this, matrix, i, idx, xdata);
+ }
+
+ return 0;
}
/*
@@ -313,70 +819,566 @@ afr_selfheal_extract_xattr (xlator_t *this, struct afr_reply *replies,
* This can happen if data was directly modified in the backend or for snapshots
*/
void
-afr_mark_largest_file_as_source (xlator_t *this, unsigned char *sources,
- struct afr_reply *replies)
+afr_mark_largest_file_as_source(xlator_t *this, unsigned char *sources,
+ struct afr_reply *replies)
{
- int i = 0;
- afr_private_t *priv = NULL;
- uint64_t size = 0;
-
- /* Find source with biggest file size */
- priv = this->private;
- for (i = 0; i < priv->child_count; i++) {
- if (!sources[i])
- continue;
- if (size <= replies[i].poststat.ia_size) {
- size = replies[i].poststat.ia_size;
- }
+ int i = 0;
+ afr_private_t *priv = NULL;
+ uint64_t size = 0;
+
+ /* Find source with biggest file size */
+ priv = this->private;
+ for (i = 0; i < priv->child_count; i++) {
+ if (!sources[i])
+ continue;
+ if (!replies[i].valid || replies[i].op_ret != 0) {
+ sources[i] = 0;
+ continue;
+ }
+ if (size <= replies[i].poststat.ia_size) {
+ size = replies[i].poststat.ia_size;
}
+ }
+
+ /* Mark sources with less size as not source */
+ for (i = 0; i < priv->child_count; i++) {
+ if (!sources[i])
+ continue;
+ if (size > replies[i].poststat.ia_size)
+ sources[i] = 0;
+ }
+}
- /* Mark sources with less size as not source */
- for (i = 0; i < priv->child_count; i++) {
- if (!sources[i])
- continue;
- if (size > replies[i].poststat.ia_size)
- sources[i] = 0;
+void
+afr_mark_latest_mtime_file_as_source(xlator_t *this, unsigned char *sources,
+ struct afr_reply *replies)
+{
+ int i = 0;
+ afr_private_t *priv = NULL;
+ uint32_t mtime = 0;
+ uint32_t mtime_nsec = 0;
+
+ priv = this->private;
+ for (i = 0; i < priv->child_count; i++) {
+ if (!sources[i])
+ continue;
+ if (!replies[i].valid || replies[i].op_ret != 0) {
+ sources[i] = 0;
+ continue;
+ }
+ if ((mtime < replies[i].poststat.ia_mtime) ||
+ ((mtime == replies[i].poststat.ia_mtime) &&
+ (mtime_nsec < replies[i].poststat.ia_mtime_nsec))) {
+ mtime = replies[i].poststat.ia_mtime;
+ mtime_nsec = replies[i].poststat.ia_mtime_nsec;
+ }
+ }
+ for (i = 0; i < priv->child_count; i++) {
+ if (!sources[i])
+ continue;
+ if ((mtime > replies[i].poststat.ia_mtime) ||
+ ((mtime == replies[i].poststat.ia_mtime) &&
+ (mtime_nsec > replies[i].poststat.ia_mtime_nsec))) {
+ sources[i] = 0;
}
+ }
}
void
-afr_mark_active_sinks (xlator_t *this, unsigned char *sources,
- unsigned char *locked_on, unsigned char *sinks)
+afr_mark_active_sinks(xlator_t *this, unsigned char *sources,
+ unsigned char *locked_on, unsigned char *sinks)
{
- int i = 0;
- afr_private_t *priv = NULL;
+ int i = 0;
+ afr_private_t *priv = NULL;
- priv = this->private;
+ priv = this->private;
- memset (sinks, 0, sizeof (*sinks) * priv->child_count);
- for (i = 0; i < priv->child_count; i++) {
- if (!sources[i] && locked_on[i])
- sinks[i] = 1;
+ for (i = 0; i < priv->child_count; i++) {
+ if (!sources[i] && locked_on[i])
+ sinks[i] = 1;
+ else
+ sinks[i] = 0;
+ }
+}
+
+gf_boolean_t
+afr_dict_contains_heal_op(call_frame_t *frame)
+{
+ afr_local_t *local = NULL;
+ dict_t *xdata_req = NULL;
+ int ret = 0;
+ int heal_op = -1;
+
+ local = frame->local;
+ xdata_req = local->xdata_req;
+ ret = dict_get_int32_sizen(xdata_req, "heal-op", &heal_op);
+ if (ret)
+ return _gf_false;
+ if (local->xdata_rsp == NULL) {
+ local->xdata_rsp = dict_new();
+ if (!local->xdata_rsp)
+ return _gf_true;
+ }
+ ret = dict_set_sizen_str_sizen(local->xdata_rsp, "sh-fail-msg",
+ SFILE_NOT_IN_SPLIT_BRAIN);
+
+ return _gf_true;
+}
+
+gf_boolean_t
+afr_can_decide_split_brain_source_sinks(struct afr_reply *replies,
+ int child_count)
+{
+ int i = 0;
+
+ for (i = 0; i < child_count; i++)
+ if (replies[i].valid != 1 || replies[i].op_ret != 0)
+ return _gf_false;
+
+ return _gf_true;
+}
+
+int
+afr_mark_split_brain_source_sinks_by_heal_op(
+ call_frame_t *frame, xlator_t *this, unsigned char *sources,
+ unsigned char *sinks, unsigned char *healed_sinks, unsigned char *locked_on,
+ struct afr_reply *replies, afr_transaction_type type, int heal_op)
+{
+ afr_local_t *local = NULL;
+ afr_private_t *priv = NULL;
+ dict_t *xdata_req = NULL;
+ dict_t *xdata_rsp = NULL;
+ int ret = 0;
+ int i = 0;
+ char *name = NULL;
+ int source = -1;
+
+ local = frame->local;
+ priv = this->private;
+ xdata_req = local->xdata_req;
+
+ for (i = 0; i < priv->child_count; i++) {
+ if (locked_on[i])
+ if (sources[i] || !sinks[i] || !healed_sinks[i]) {
+ ret = -1;
+ goto out;
+ }
+ }
+ if (local->xdata_rsp == NULL) {
+ local->xdata_rsp = dict_new();
+ if (!local->xdata_rsp) {
+ ret = -1;
+ goto out;
+ }
+ }
+ xdata_rsp = local->xdata_rsp;
+
+ if (!afr_can_decide_split_brain_source_sinks(replies, priv->child_count)) {
+ ret = dict_set_sizen_str_sizen(xdata_rsp, "sh-fail-msg",
+ SBRAIN_HEAL_NO_GO_MSG);
+ ret = -1;
+ goto out;
+ }
+
+ for (i = 0; i < priv->child_count; i++)
+ if (locked_on[i])
+ sources[i] = 1;
+ switch (heal_op) {
+ case GF_SHD_OP_SBRAIN_HEAL_FROM_BIGGER_FILE:
+ if (type == AFR_METADATA_TRANSACTION) {
+ ret = dict_set_sizen_str_sizen(xdata_rsp, "sh-fail-msg",
+ SUSE_SOURCE_BRICK_TO_HEAL);
+ if (!ret)
+ ret = -1;
+ goto out;
+ }
+ afr_mark_largest_file_as_source(this, sources, replies);
+ if (AFR_COUNT(sources, priv->child_count) != 1) {
+ ret = dict_set_sizen_str_sizen(xdata_rsp, "sh-fail-msg",
+ SNO_BIGGER_FILE);
+ if (!ret)
+ ret = -1;
+ goto out;
+ }
+ break;
+ case GF_SHD_OP_SBRAIN_HEAL_FROM_LATEST_MTIME:
+ if (type == AFR_METADATA_TRANSACTION) {
+ ret = dict_set_sizen_str_sizen(xdata_rsp, "sh-fail-msg",
+ SUSE_SOURCE_BRICK_TO_HEAL);
+ if (!ret)
+ ret = -1;
+ goto out;
+ }
+ afr_mark_latest_mtime_file_as_source(this, sources, replies);
+ if (AFR_COUNT(sources, priv->child_count) != 1) {
+ ret = dict_set_sizen_str_sizen(xdata_rsp, "sh-fail-msg",
+ SNO_DIFF_IN_MTIME);
+ if (!ret)
+ ret = -1;
+ goto out;
+ }
+ break;
+ case GF_SHD_OP_SBRAIN_HEAL_FROM_BRICK:
+ ret = dict_get_str_sizen(xdata_req, "child-name", &name);
+ if (ret)
+ goto out;
+ source = afr_get_child_index_from_name(this, name);
+ if (source < 0) {
+ ret = dict_set_sizen_str_sizen(xdata_rsp, "sh-fail-msg",
+ SINVALID_BRICK_NAME);
+ if (!ret)
+ ret = -1;
+ goto out;
+ }
+ if (locked_on[source] != 1) {
+ ret = dict_set_sizen_str_sizen(xdata_rsp, "sh-fail-msg",
+ SBRICK_IS_NOT_UP);
+ if (!ret)
+ ret = -1;
+ goto out;
+ }
+ memset(sources, 0, sizeof(*sources) * priv->child_count);
+ sources[source] = 1;
+ break;
+ default:
+ ret = -1;
+ goto out;
+ }
+ for (i = 0; i < priv->child_count; i++) {
+ if (sources[i]) {
+ source = i;
+ break;
+ }
+ }
+ sinks[source] = 0;
+ healed_sinks[source] = 0;
+ ret = source;
+out:
+ if (ret < 0)
+ memset(sources, 0, sizeof(*sources) * priv->child_count);
+ return ret;
+}
+
+int
+afr_sh_fav_by_majority(xlator_t *this, struct afr_reply *replies,
+ inode_t *inode)
+{
+ afr_private_t *priv;
+ int vote_count = -1;
+ int fav_child = -1;
+ int i = 0;
+ int k = 0;
+
+ priv = this->private;
+
+ for (i = 0; i < priv->child_count; i++) {
+ if (replies[i].valid == 1) {
+ gf_msg_debug(this->name, 0,
+ "Child:%s mtime_sec = %" PRId64 ", size = %" PRIu64
+ " for gfid %s",
+ priv->children[i]->name, replies[i].poststat.ia_mtime,
+ replies[i].poststat.ia_size, uuid_utoa(inode->gfid));
+ vote_count = 0;
+ for (k = 0; k < priv->child_count; k++) {
+ if ((replies[k].poststat.ia_mtime ==
+ replies[i].poststat.ia_mtime) &&
+ (replies[k].poststat.ia_size ==
+ replies[i].poststat.ia_size)) {
+ vote_count++;
+ }
+ }
+ if (vote_count > priv->child_count / 2) {
+ fav_child = i;
+ break;
+ }
+ }
+ }
+ return fav_child;
+}
+
+/*
+ * afr_sh_fav_by_mtime: Choose favorite child by mtime.
+ */
+int
+afr_sh_fav_by_mtime(xlator_t *this, struct afr_reply *replies, inode_t *inode)
+{
+ afr_private_t *priv;
+ int fav_child = -1;
+ int i = 0;
+ uint32_t cmp_mtime = 0;
+ uint32_t cmp_mtime_nsec = 0;
+
+ priv = this->private;
+
+ for (i = 0; i < priv->child_count; i++) {
+ if (replies[i].valid == 1) {
+ gf_msg_debug(this->name, 0,
+ "Child:%s mtime = %" PRId64
+ ", mtime_nsec = %d for "
+ "gfid %s",
+ priv->children[i]->name, replies[i].poststat.ia_mtime,
+ replies[i].poststat.ia_mtime_nsec,
+ uuid_utoa(inode->gfid));
+ if (replies[i].poststat.ia_mtime > cmp_mtime) {
+ cmp_mtime = replies[i].poststat.ia_mtime;
+ cmp_mtime_nsec = replies[i].poststat.ia_mtime_nsec;
+ fav_child = i;
+ } else if ((replies[i].poststat.ia_mtime == cmp_mtime) &&
+ (replies[i].poststat.ia_mtime_nsec > cmp_mtime_nsec)) {
+ cmp_mtime = replies[i].poststat.ia_mtime;
+ cmp_mtime_nsec = replies[i].poststat.ia_mtime_nsec;
+ fav_child = i;
+ }
+ }
+ }
+ return fav_child;
+}
+
+/*
+ * afr_sh_fav_by_ctime: Choose favorite child by ctime.
+ */
+int
+afr_sh_fav_by_ctime(xlator_t *this, struct afr_reply *replies, inode_t *inode)
+{
+ afr_private_t *priv;
+ int fav_child = -1;
+ int i = 0;
+ uint32_t cmp_ctime = 0;
+ uint32_t cmp_ctime_nsec = 0;
+
+ priv = this->private;
+
+ for (i = 0; i < priv->child_count; i++) {
+ if (replies[i].valid == 1) {
+ gf_msg_debug(this->name, 0,
+ "Child:%s ctime = %" PRId64
+ ", ctime_nsec = %d for "
+ "gfid %s",
+ priv->children[i]->name, replies[i].poststat.ia_ctime,
+ replies[i].poststat.ia_ctime_nsec,
+ uuid_utoa(inode->gfid));
+ if (replies[i].poststat.ia_ctime > cmp_ctime) {
+ cmp_ctime = replies[i].poststat.ia_ctime;
+ cmp_ctime_nsec = replies[i].poststat.ia_ctime_nsec;
+ fav_child = i;
+ } else if ((replies[i].poststat.ia_ctime == cmp_ctime) &&
+ (replies[i].poststat.ia_ctime_nsec > cmp_ctime_nsec)) {
+ cmp_ctime = replies[i].poststat.ia_ctime;
+ cmp_ctime_nsec = replies[i].poststat.ia_ctime_nsec;
+ fav_child = i;
+ }
+ }
+ }
+ return fav_child;
+}
+
+/*
+ * afr_sh_fav_by_size: Choose favorite child by size
+ * when not all files are of zero size.
+ */
+int
+afr_sh_fav_by_size(xlator_t *this, struct afr_reply *replies, inode_t *inode)
+{
+ afr_private_t *priv;
+ int fav_child = -1;
+ int i = 0;
+ uint64_t cmp_sz = 0;
+
+ priv = this->private;
+ for (i = 0; i < priv->child_count; i++) {
+ if (!replies[i].valid) {
+ continue;
+ }
+ gf_msg_debug(this->name, 0,
+ "Child:%s file size = %" PRIu64 " for gfid %s",
+ priv->children[i]->name, replies[i].poststat.ia_size,
+ uuid_utoa(inode->gfid));
+ if (replies[i].poststat.ia_type == IA_IFDIR) {
+ gf_msg(this->name, GF_LOG_ERROR, 0, AFR_MSG_SBRAIN_FAV_CHILD_POLICY,
+ "Cannot perform selfheal on %s. "
+ "Size policy is not applicable to directories.",
+ uuid_utoa(inode->gfid));
+ break;
}
+ if (replies[i].poststat.ia_size > cmp_sz) {
+ cmp_sz = replies[i].poststat.ia_size;
+ fav_child = i;
+ } else if (replies[i].poststat.ia_size == cmp_sz) {
+ fav_child = -1;
+ }
+ }
+ if (fav_child == -1) {
+ gf_msg(this->name, GF_LOG_ERROR, 0, AFR_MSG_SPLIT_BRAIN,
+ "No bigger file");
+ }
+ return fav_child;
+}
+
+int
+afr_sh_get_fav_by_policy(xlator_t *this, struct afr_reply *replies,
+ inode_t *inode, char **policy_str)
+{
+ afr_private_t *priv = NULL;
+ int fav_child = -1;
+
+ priv = this->private;
+ if (!afr_can_decide_split_brain_source_sinks(replies, priv->child_count)) {
+ return -1;
+ }
+
+ switch (priv->fav_child_policy) {
+ case AFR_FAV_CHILD_BY_SIZE:
+ fav_child = afr_sh_fav_by_size(this, replies, inode);
+ if (policy_str && fav_child >= 0) {
+ *policy_str = "SIZE";
+ }
+ break;
+ case AFR_FAV_CHILD_BY_CTIME:
+ fav_child = afr_sh_fav_by_ctime(this, replies, inode);
+ if (policy_str && fav_child >= 0) {
+ *policy_str = "CTIME";
+ }
+ break;
+ case AFR_FAV_CHILD_BY_MTIME:
+ fav_child = afr_sh_fav_by_mtime(this, replies, inode);
+ if (policy_str && fav_child >= 0) {
+ *policy_str = "MTIME";
+ }
+ break;
+ case AFR_FAV_CHILD_BY_MAJORITY:
+ fav_child = afr_sh_fav_by_majority(this, replies, inode);
+ if (policy_str && fav_child >= 0) {
+ *policy_str = "MAJORITY";
+ }
+ break;
+ case AFR_FAV_CHILD_NONE:
+ default:
+ break;
+ }
+
+ return fav_child;
+}
+
+int
+afr_mark_split_brain_source_sinks_by_policy(
+ call_frame_t *frame, xlator_t *this, inode_t *inode, unsigned char *sources,
+ unsigned char *sinks, unsigned char *healed_sinks, unsigned char *locked_on,
+ struct afr_reply *replies, afr_transaction_type type)
+{
+ afr_private_t *priv = NULL;
+ int fav_child = -1;
+ char mtime_str[256];
+ char ctime_str[256];
+ char *policy_str = NULL;
+ struct tm *tm_ptr;
+ time_t time;
+
+ priv = this->private;
+
+ fav_child = afr_sh_get_fav_by_policy(this, replies, inode, &policy_str);
+ if (fav_child == -1) {
+ gf_msg(this->name, GF_LOG_ERROR, 0, AFR_MSG_SBRAIN_FAV_CHILD_POLICY,
+ "No child selected by favorite-child policy.");
+ } else if (fav_child > priv->child_count - 1) {
+ gf_msg(this->name, GF_LOG_ERROR, 0, AFR_MSG_SBRAIN_FAV_CHILD_POLICY,
+ "Invalid child (%d) "
+ "selected by policy %s.",
+ fav_child, policy_str);
+ } else if (fav_child >= 0) {
+ time = replies[fav_child].poststat.ia_mtime;
+ tm_ptr = localtime(&time);
+ strftime(mtime_str, sizeof(mtime_str), "%Y-%m-%d %H:%M:%S", tm_ptr);
+ time = replies[fav_child].poststat.ia_ctime;
+ tm_ptr = localtime(&time);
+ strftime(ctime_str, sizeof(ctime_str), "%Y-%m-%d %H:%M:%S", tm_ptr);
+
+ gf_msg(this->name, GF_LOG_WARNING, 0, AFR_MSG_SBRAIN_FAV_CHILD_POLICY,
+ "Source %s selected as authentic to resolve conflicting data "
+ "in file (gfid:%s) by %s (%" PRIu64
+ " bytes @ %s mtime, %s "
+ "ctime).",
+ priv->children[fav_child]->name, uuid_utoa(inode->gfid),
+ policy_str, replies[fav_child].poststat.ia_size, mtime_str,
+ ctime_str);
+
+ sources[fav_child] = 1;
+ sinks[fav_child] = 0;
+ healed_sinks[fav_child] = 0;
+ }
+ return fav_child;
}
gf_boolean_t
-afr_dict_contains_heal_op (call_frame_t *frame)
+afr_is_file_empty_on_all_children(afr_private_t *priv,
+ struct afr_reply *replies)
{
- afr_local_t *local = NULL;
- dict_t *xdata_req = NULL;
- int ret = 0;
- int heal_op = -1;
+ int i = 0;
- local = frame->local;
- xdata_req = local->xdata_req;
- ret = dict_get_int32 (xdata_req, "heal-op", &heal_op);
- if (ret)
- return _gf_false;
- if (local->xdata_rsp == NULL) {
- local->xdata_rsp = dict_new();
- if (!local->xdata_rsp)
- return _gf_true;
+ for (i = 0; i < priv->child_count; i++) {
+ if ((!replies[i].valid) || (replies[i].op_ret != 0) ||
+ (replies[i].poststat.ia_size != 0))
+ return _gf_false;
+ }
+
+ return _gf_true;
+}
+
+int
+afr_mark_source_sinks_if_file_empty(xlator_t *this, unsigned char *sources,
+ unsigned char *sinks,
+ unsigned char *healed_sinks,
+ unsigned char *locked_on,
+ struct afr_reply *replies,
+ afr_transaction_type type)
+{
+ int source = -1;
+ int i = 0;
+ afr_private_t *priv = this->private;
+ struct iatt stbuf = {
+ 0,
+ };
+
+ if ((AFR_COUNT(locked_on, priv->child_count) < priv->child_count) ||
+ (afr_success_count(replies, priv->child_count) < priv->child_count))
+ return -1;
+
+ if (type == AFR_DATA_TRANSACTION) {
+ if (!afr_is_file_empty_on_all_children(priv, replies))
+ return -1;
+ goto mark;
+ }
+
+ /*For AFR_METADATA_TRANSACTION, metadata must be same on all bricks.*/
+ stbuf = replies[0].poststat;
+ for (i = 1; i < priv->child_count; i++) {
+ if ((!IA_EQUAL(stbuf, replies[i].poststat, type)) ||
+ (!IA_EQUAL(stbuf, replies[i].poststat, uid)) ||
+ (!IA_EQUAL(stbuf, replies[i].poststat, gid)) ||
+ (!IA_EQUAL(stbuf, replies[i].poststat, prot)))
+ return -1;
+ }
+ for (i = 1; i < priv->child_count; i++) {
+ if (!afr_xattrs_are_equal(replies[0].xdata, replies[i].xdata))
+ return -1;
+ }
+
+mark:
+ /* data/metadata is same on all bricks. Pick one of them as source. Rest
+ * are sinks.*/
+ for (i = 0; i < priv->child_count; i++) {
+ if (source == -1) {
+ source = i;
+ sources[i] = 1;
+ sinks[i] = 0;
+ healed_sinks[i] = 0;
+ continue;
}
- ret = dict_set_str (local->xdata_rsp, "sh-fail-msg",
- "File not in split-brain");
+ sources[i] = 0;
+ sinks[i] = 1;
+ healed_sinks[i] = 1;
+ }
- return _gf_true;
+ return source;
}
/* Return a source depending on the type of heal_op, and set sources[source],
@@ -387,121 +1389,156 @@ afr_dict_contains_heal_op (call_frame_t *frame)
* sinks[node] are 1. This should be the case if the file is in split-brain.
*/
int
-afr_mark_split_brain_source_sinks (call_frame_t *frame, xlator_t *this,
- unsigned char *sources,
- unsigned char *sinks,
- unsigned char *healed_sinks,
- unsigned char *locked_on,
- struct afr_reply *replies,
- afr_transaction_type type)
-{
- afr_local_t *local = NULL;
- afr_private_t *priv = NULL;
- dict_t *xdata_req = NULL;
- dict_t *xdata_rsp = NULL;
- int ret = 0;
- int heal_op = -1;
- int i = 0;
- char *name = NULL;
- int source = -1;
-
- local = frame->local;
- priv = this->private;
- xdata_req = local->xdata_req;
-
- ret = dict_get_int32 (xdata_req, "heal-op", &heal_op);
- if (ret)
- goto out;
-
- for (i = 0; i < priv->child_count; i++) {
- if (locked_on[i])
- if (sources[i] || !sinks[i] || !healed_sinks[i]) {
- ret = -1;
- goto out;
- }
- }
- if (local->xdata_rsp == NULL) {
- local->xdata_rsp = dict_new();
- if (!local->xdata_rsp) {
- ret = -1;
- goto out;
- }
+afr_mark_split_brain_source_sinks(
+ call_frame_t *frame, xlator_t *this, inode_t *inode, unsigned char *sources,
+ unsigned char *sinks, unsigned char *healed_sinks, unsigned char *locked_on,
+ struct afr_reply *replies, afr_transaction_type type)
+{
+ afr_local_t *local = NULL;
+ afr_private_t *priv = NULL;
+ dict_t *xdata_req = NULL;
+ int heal_op = -1;
+ int ret = -1;
+ int source = -1;
+
+ local = frame->local;
+ priv = this->private;
+ xdata_req = local->xdata_req;
+
+ source = afr_mark_source_sinks_if_file_empty(
+ this, sources, sinks, healed_sinks, locked_on, replies, type);
+ if (source >= 0)
+ return source;
+
+ ret = dict_get_int32_sizen(xdata_req, "heal-op", &heal_op);
+ if (ret)
+ goto autoheal;
+
+ source = afr_mark_split_brain_source_sinks_by_heal_op(
+ frame, this, sources, sinks, healed_sinks, locked_on, replies, type,
+ heal_op);
+ return source;
+
+autoheal:
+ /* Automatically heal if fav_child_policy is set. */
+ if (priv->fav_child_policy != AFR_FAV_CHILD_NONE) {
+ source = afr_mark_split_brain_source_sinks_by_policy(
+ frame, this, inode, sources, sinks, healed_sinks, locked_on,
+ replies, type);
+ if (source != -1) {
+ ret = dict_set_int32_sizen(xdata_req, "fav-child-policy", 1);
+ if (ret)
+ return -1;
}
- xdata_rsp = local->xdata_rsp;
+ }
- switch (heal_op) {
- case GF_SHD_OP_SBRAIN_HEAL_FROM_BIGGER_FILE:
- if (type == AFR_METADATA_TRANSACTION) {
- ret = dict_set_str (xdata_rsp, "sh-fail-msg",
- "Use source-brick option to"
- " heal metadata split-brain");
- if (!ret)
- ret = -1;
- goto out;
- }
- for (i = 0 ; i < priv->child_count; i++)
- if (locked_on[i])
- sources[i] = 1;
- afr_mark_largest_file_as_source (this, sources, replies);
- if (AFR_COUNT (sources, priv->child_count) != 1) {
- ret = dict_set_str (xdata_rsp, "sh-fail-msg",
- "No bigger file");
- if (!ret)
- ret = -1;
- goto out;
- }
- for (i = 0 ; i < priv->child_count; i++)
- if (sources[i])
- source = i;
- sinks[source] = 0;
- healed_sinks[source] = 0;
- break;
- case GF_SHD_OP_SBRAIN_HEAL_FROM_BRICK:
- ret = dict_get_str (xdata_req, "child-name", &name);
- if (ret)
- goto out;
- source = afr_get_child_index_from_name (this, name);
- if (source < 0) {
- ret = dict_set_str (xdata_rsp, "sh-fail-msg",
- "Invalid brick name");
- if (!ret)
- ret = -1;
- goto out;
- }
- if (locked_on[source] != 1) {
- ret = dict_set_str (xdata_rsp, "sh-fail-msg",
- "Brick is not up");
- if (!ret)
- ret = -1;
- goto out;
- }
- sources[source] = 1;
- sinks[source] = 0;
- healed_sinks[source] = 0;
- break;
- default:
- ret = -1;
- goto out;
- }
- ret = source;
-out:
- return ret;
+ return source;
+}
+int
+_afr_fav_child_reset_sink_xattrs(call_frame_t *frame, xlator_t *this,
+ inode_t *inode, int source,
+ unsigned char *healed_sinks,
+ unsigned char *undid_pending,
+ afr_transaction_type type,
+ unsigned char *locked_on,
+ struct afr_reply *replies)
+{
+ afr_private_t *priv = NULL;
+ afr_local_t *local = NULL;
+ int *input_dirty = NULL;
+ int **input_matrix = NULL;
+ int *output_dirty = NULL;
+ int **output_matrix = NULL;
+ dict_t *xattr = NULL;
+ dict_t *xdata = NULL;
+ int i = 0;
+
+ priv = this->private;
+ local = frame->local;
+
+ if (!dict_get_sizen(local->xdata_req, "fav-child-policy"))
+ return 0;
+
+ xdata = dict_new();
+ if (!xdata)
+ return -1;
+
+ input_dirty = alloca0(priv->child_count * sizeof(int));
+ input_matrix = ALLOC_MATRIX(priv->child_count, int);
+ output_dirty = alloca0(priv->child_count * sizeof(int));
+ output_matrix = ALLOC_MATRIX(priv->child_count, int);
+
+ afr_selfheal_extract_xattr(this, replies, type, input_dirty, input_matrix);
+
+ for (i = 0; i < priv->child_count; i++) {
+ if (i == source || !healed_sinks[i])
+ continue;
+ output_dirty[i] = -input_dirty[i];
+ output_matrix[i][source] = -input_matrix[i][source];
+ }
+
+ for (i = 0; i < priv->child_count; i++) {
+ if (!healed_sinks[i] || !locked_on[i])
+ continue;
+ xattr = afr_selfheal_output_xattr(this, _gf_false, type, output_dirty,
+ output_matrix, i, NULL);
+
+ afr_selfheal_post_op(frame, this, inode, i, xattr, xdata);
+
+ undid_pending[i] = 1;
+ dict_unref(xattr);
+ }
+
+ if (xdata)
+ dict_unref(xdata);
+
+ return 0;
}
gf_boolean_t
-afr_does_witness_exist (xlator_t *this, uint64_t *witness)
+afr_does_witness_exist(xlator_t *this, uint64_t *witness)
{
- int i = 0;
- afr_private_t *priv = NULL;
+ int i = 0;
+ afr_private_t *priv = NULL;
- priv = this->private;
+ priv = this->private;
- for (i = 0; i < priv->child_count; i++) {
- if (witness[i])
- return _gf_true;
+ for (i = 0; i < priv->child_count; i++) {
+ if (witness[i])
+ return _gf_true;
+ }
+ return _gf_false;
+}
+
+unsigned int
+afr_get_quorum_count(afr_private_t *priv)
+{
+ if (priv->quorum_count == AFR_QUORUM_AUTO) {
+ return priv->child_count / 2 + 1;
+ } else {
+ return priv->quorum_count;
+ }
+}
+
+void
+afr_selfheal_post_op_failure_accounting(afr_private_t *priv, char *accused,
+ unsigned char *sources,
+ unsigned char *locked_on)
+{
+ int i = 0;
+ unsigned int quorum_count = 0;
+
+ if (AFR_COUNT(sources, priv->child_count) != 0)
+ return;
+
+ quorum_count = afr_get_quorum_count(priv);
+ for (i = 0; i < priv->child_count; i++) {
+ if ((accused[i] < quorum_count) && locked_on[i]) {
+ sources[i] = 1;
}
- return _gf_false;
+ }
+ return;
}
/*
@@ -524,585 +1561,713 @@ afr_does_witness_exist (xlator_t *this, uint64_t *witness)
*/
int
-afr_selfheal_find_direction (call_frame_t *frame, xlator_t *this,
- struct afr_reply *replies,
- afr_transaction_type type,
- unsigned char *locked_on, unsigned char *sources,
- unsigned char *sinks, uint64_t *witness,
- gf_boolean_t *pflag)
-{
- afr_private_t *priv = NULL;
- int i = 0;
- int j = 0;
- int *dirty = NULL; /* Denotes if dirty xattr is set */
- int **matrix = NULL;/* Changelog matrix */
- char *accused = NULL;/* Accused others without any self-accusal */
- char *pending = NULL;/* Have pending operations on others */
- char *self_accused = NULL; /* Accused itself */
-
- priv = this->private;
-
- dirty = alloca0 (priv->child_count * sizeof (int));
- accused = alloca0 (priv->child_count);
- pending = alloca0 (priv->child_count);
- self_accused = alloca0 (priv->child_count);
- matrix = ALLOC_MATRIX(priv->child_count, int);
- memset (witness, 0, sizeof (*witness) * priv->child_count);
-
- /* First construct the pending matrix for further analysis */
- afr_selfheal_extract_xattr (this, replies, type, dirty, matrix);
-
- if (pflag) {
- for (i = 0; i < priv->child_count; i++) {
- for (j = 0; j < priv->child_count; j++)
- if (matrix[i][j])
- *pflag = _gf_true;
- if (*pflag)
- break;
- }
+afr_selfheal_find_direction(call_frame_t *frame, xlator_t *this,
+ struct afr_reply *replies,
+ afr_transaction_type type, unsigned char *locked_on,
+ unsigned char *sources, unsigned char *sinks,
+ uint64_t *witness, unsigned char *pflag)
+{
+ afr_private_t *priv = NULL;
+ int i = 0;
+ int j = 0;
+ int *dirty = NULL; /* Denotes if dirty xattr is set */
+ int **matrix = NULL; /* Changelog matrix */
+ char *accused = NULL; /* Accused others without any self-accusal */
+ char *pending = NULL; /* Have pending operations on others */
+ char *self_accused = NULL; /* Accused itself */
+
+ priv = this->private;
+
+ dirty = alloca0(priv->child_count * sizeof(int));
+ accused = alloca0(priv->child_count);
+ pending = alloca0(priv->child_count);
+ self_accused = alloca0(priv->child_count);
+ matrix = ALLOC_MATRIX(priv->child_count, int);
+ memset(witness, 0, sizeof(*witness) * priv->child_count);
+
+ /* First construct the pending matrix for further analysis */
+ afr_selfheal_extract_xattr(this, replies, type, dirty, matrix);
+
+ if (pflag) {
+ for (i = 0; i < priv->child_count; i++) {
+ for (j = 0; j < priv->child_count; j++)
+ if (matrix[i][j])
+ *pflag |= PFLAG_PENDING;
+ if (*pflag)
+ break;
}
-
- if (afr_success_count (replies,
- priv->child_count) < AFR_SH_MIN_PARTICIPANTS) {
- /* Treat this just like locks not being acquired */
- return -ENOTCONN;
+ }
+
+ if (afr_success_count(replies, priv->child_count) < priv->child_count) {
+ /* Treat this just like locks not being acquired */
+ return -ENOTCONN;
+ }
+
+ /* short list all self-accused */
+ for (i = 0; i < priv->child_count; i++) {
+ if (matrix[i][i])
+ self_accused[i] = 1;
+ }
+
+ /* Next short list all accused to exclude them from being sources */
+ /* Self-accused can't accuse others as they are FOOLs */
+ for (i = 0; i < priv->child_count; i++) {
+ for (j = 0; j < priv->child_count; j++) {
+ if (matrix[i][j]) {
+ if (!self_accused[i])
+ accused[j] += 1;
+ if (i != j)
+ pending[i] += 1;
+ }
}
+ }
- /* short list all self-accused */
- for (i = 0; i < priv->child_count; i++) {
- if (matrix[i][i])
- self_accused[i] = 1;
+ /* Short list all non-accused as sources */
+ for (i = 0; i < priv->child_count; i++) {
+ if (!accused[i] && locked_on[i])
+ sources[i] = 1;
+ else
+ sources[i] = 0;
+ }
+
+ /* Everyone accused by non-self-accused sources are sinks */
+ memset(sinks, 0, priv->child_count);
+ for (i = 0; i < priv->child_count; i++) {
+ if (!sources[i])
+ continue;
+ if (self_accused[i])
+ continue;
+ for (j = 0; j < priv->child_count; j++) {
+ if (matrix[i][j])
+ sinks[j] = 1;
}
-
- /* Next short list all accused to exclude them from being sources */
- /* Self-accused can't accuse others as they are FOOLs */
- for (i = 0; i < priv->child_count; i++) {
- for (j = 0; j < priv->child_count; j++) {
- if (matrix[i][j]) {
- if (!self_accused[i])
- accused[j] = 1;
-
- if (i != j)
- pending[i] = 1;
- }
- }
- }
-
- /* Short list all non-accused as sources */
- memset (sources, 0, priv->child_count);
- for (i = 0; i < priv->child_count; i++) {
- if (!accused[i] && locked_on[i])
- sources[i] = 1;
- }
-
- /* Everyone accused by non-self-accused sources are sinks */
- memset (sinks, 0, priv->child_count);
- for (i = 0; i < priv->child_count; i++) {
- if (!sources[i])
- continue;
- if (self_accused[i])
- continue;
- for (j = 0; j < priv->child_count; j++) {
- if (matrix[i][j])
- sinks[j] = 1;
- }
+ }
+
+ /* For breaking ties provide with number of fops they witnessed */
+
+ /*
+ * count the pending fops witnessed from itself to others when it is
+ * self-accused
+ */
+ for (i = 0; i < priv->child_count; i++) {
+ if (!self_accused[i])
+ continue;
+ for (j = 0; j < priv->child_count; j++) {
+ if (i == j)
+ continue;
+ witness[i] += matrix[i][j];
}
+ }
- /* For breaking ties provide with number of fops they witnessed */
+ if (type == AFR_DATA_TRANSACTION || type == AFR_METADATA_TRANSACTION)
+ afr_selfheal_post_op_failure_accounting(priv, accused, sources,
+ locked_on);
- /*
- * count the pending fops witnessed from itself to others when it is
- * self-accused
- */
+ /* If no sources, all locked nodes are sinks - split brain */
+ if (AFR_COUNT(sources, priv->child_count) == 0) {
for (i = 0; i < priv->child_count; i++) {
- if (!self_accused[i])
- continue;
- for (j = 0; j < priv->child_count; j++) {
- if (i == j)
- continue;
- witness[i] += matrix[i][j];
- }
+ if (locked_on[i])
+ sinks[i] = 1;
}
-
- /* If no sources, all locked nodes are sinks - split brain */
- if (AFR_COUNT (sources, priv->child_count) == 0) {
- for (i = 0; i < priv->child_count; i++) {
- if (locked_on[i])
- sinks[i] = 1;
- }
+ if (pflag)
+ *pflag |= PFLAG_SBRAIN;
+ }
+
+ /* One more class of witness similar to dirty in v2 is where no pending
+ * exists but we have self-accusing markers. This can happen in afr-v1
+ * if the brick crashes just after doing xattrop on self but
+ * before xattrop on the other xattrs on the brick in pre-op. */
+ if (AFR_COUNT(pending, priv->child_count) == 0) {
+ for (i = 0; i < priv->child_count; i++) {
+ if (self_accused[i])
+ witness[i] += matrix[i][i];
}
-
- /* In afr-v1 if a file is self-accused but didn't have any pending
+ } else {
+ /* In afr-v1 if a file is self-accused and has pending
* operations on others then it is similar to 'dirty' in afr-v2.
* Consider such cases as witness.
*/
for (i = 0; i < priv->child_count; i++) {
- if (self_accused[i] && !pending[i])
- witness[i] += matrix[i][i];
+ if (self_accused[i] && pending[i])
+ witness[i] += matrix[i][i];
}
+ }
- /* count the number of dirty fops witnessed */
- for (i = 0; i < priv->child_count; i++)
- witness[i] += dirty[i];
+ /* count the number of dirty fops witnessed */
+ for (i = 0; i < priv->child_count; i++)
+ witness[i] += dirty[i];
- return 0;
+ return 0;
}
void
-afr_log_selfheal (uuid_t gfid, xlator_t *this, int ret, char *type,
- int source, unsigned char *healed_sinks)
-{
- char *status = NULL;
- char *sinks_str = NULL;
- char *p = NULL;
- afr_private_t *priv = NULL;
- gf_loglevel_t loglevel = GF_LOG_NONE;
- int i = 0;
-
- priv = this->private;
- sinks_str = alloca0 (priv->child_count * 8);
- p = sinks_str;
- for (i = 0; i < priv->child_count; i++) {
- if (!healed_sinks[i])
- continue;
- p += sprintf (p, "%d ", i);
- }
-
- if (ret < 0) {
- status = "Failed";
- loglevel = GF_LOG_DEBUG;
- } else {
- status = "Completed";
- loglevel = GF_LOG_INFO;
+afr_log_selfheal(uuid_t gfid, xlator_t *this, int ret, char *type, int source,
+ unsigned char *sources, unsigned char *healed_sinks)
+{
+ char *status = NULL;
+ char *sinks_str = NULL;
+ char *p = NULL;
+ char *sources_str = NULL;
+ char *q = NULL;
+ afr_private_t *priv = NULL;
+ gf_loglevel_t loglevel = GF_LOG_NONE;
+ int i = 0;
+
+ priv = this->private;
+ sinks_str = alloca0(priv->child_count * 8);
+ p = sinks_str;
+ sources_str = alloca0(priv->child_count * 8);
+ q = sources_str;
+ for (i = 0; i < priv->child_count; i++) {
+ if (healed_sinks[i])
+ p += sprintf(p, "%d ", i);
+ if (sources[i]) {
+ if (source == i) {
+ q += sprintf(q, "[%d] ", i);
+ } else {
+ q += sprintf(q, "%d ", i);
+ }
}
-
- gf_msg (this->name, loglevel, 0,
- AFR_MSG_SELF_HEAL_INFO, "%s %s selfheal on %s. "
- "source=%d sinks=%s", status, type, uuid_utoa (gfid),
- source, sinks_str);
+ }
+
+ if (ret < 0) {
+ status = "Failed";
+ loglevel = GF_LOG_DEBUG;
+ } else {
+ status = "Completed";
+ loglevel = GF_LOG_INFO;
+ }
+
+ gf_msg(this->name, loglevel, 0, AFR_MSG_SELF_HEAL_INFO,
+ "%s %s selfheal on %s. "
+ "sources=%s sinks=%s",
+ status, type, uuid_utoa(gfid), sources_str, sinks_str);
}
int
-afr_selfheal_discover_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
- int op_ret, int op_errno, inode_t *inode,
- struct iatt *buf, dict_t *xdata, struct iatt *parbuf)
-{
- afr_local_t *local = NULL;
- int i = -1;
- GF_UNUSED int ret = -1;
- int8_t need_heal = 1;
-
- local = frame->local;
- i = (long) cookie;
-
- local->replies[i].valid = 1;
- local->replies[i].op_ret = op_ret;
- local->replies[i].op_errno = op_errno;
- if (buf)
- local->replies[i].poststat = *buf;
- if (parbuf)
- local->replies[i].postparent = *parbuf;
- if (xdata) {
- local->replies[i].xdata = dict_ref (xdata);
- ret = dict_get_int8 (xdata, "link-count", &need_heal);
- local->replies[i].need_heal = need_heal;
- } else {
- local->replies[i].need_heal = need_heal;
- }
-
- syncbarrier_wake (&local->barrier);
-
- return 0;
+afr_selfheal_discover_cbk(call_frame_t *frame, void *cookie, xlator_t *this,
+ int op_ret, int op_errno, inode_t *inode,
+ struct iatt *buf, dict_t *xdata, struct iatt *parbuf)
+{
+ afr_local_t *local = NULL;
+ int i = -1;
+ GF_UNUSED int ret = -1;
+ int8_t need_heal = 1;
+
+ local = frame->local;
+ i = (long)cookie;
+
+ local->replies[i].valid = 1;
+ local->replies[i].op_ret = op_ret;
+ local->replies[i].op_errno = op_errno;
+ if (buf)
+ local->replies[i].poststat = *buf;
+ if (parbuf)
+ local->replies[i].postparent = *parbuf;
+ if (xdata) {
+ local->replies[i].xdata = dict_ref(xdata);
+ ret = dict_get_int8(xdata, "link-count", &need_heal);
+ }
+
+ local->replies[i].need_heal = need_heal;
+ syncbarrier_wake(&local->barrier);
+
+ return 0;
}
-
inode_t *
-afr_selfheal_unlocked_lookup_on (call_frame_t *frame, inode_t *parent,
- const char *name, struct afr_reply *replies,
- unsigned char *lookup_on, dict_t *xattr)
+afr_selfheal_unlocked_lookup_on(call_frame_t *frame, inode_t *parent,
+ const char *name, struct afr_reply *replies,
+ unsigned char *lookup_on, dict_t *xattr)
{
- loc_t loc = {0, };
- dict_t *xattr_req = NULL;
- afr_local_t *local = NULL;
- afr_private_t *priv = NULL;
- inode_t *inode = NULL;
+ loc_t loc = {
+ 0,
+ };
+ dict_t *xattr_req = NULL;
+ afr_local_t *local = NULL;
+ afr_private_t *priv = NULL;
+ inode_t *inode = NULL;
- local = frame->local;
- priv = frame->this->private;
+ local = frame->local;
+ priv = frame->this->private;
- xattr_req = dict_new ();
- if (!xattr_req)
- return NULL;
+ xattr_req = dict_new();
+ if (!xattr_req)
+ return NULL;
- if (xattr)
- dict_copy (xattr, xattr_req);
+ if (xattr)
+ dict_copy(xattr, xattr_req);
- if (afr_xattr_req_prepare (frame->this, xattr_req) != 0) {
- dict_destroy (xattr_req);
- return NULL;
- }
+ if (afr_xattr_req_prepare(frame->this, xattr_req) != 0) {
+ dict_unref(xattr_req);
+ return NULL;
+ }
- inode = inode_new (parent->table);
- if (!inode) {
- dict_destroy (xattr_req);
- return NULL;
- }
+ inode = inode_new(parent->table);
+ if (!inode) {
+ dict_unref(xattr_req);
+ return NULL;
+ }
- loc.parent = inode_ref (parent);
- gf_uuid_copy (loc.pargfid, parent->gfid);
- loc.name = name;
- loc.inode = inode_ref (inode);
+ loc.parent = inode_ref(parent);
+ gf_uuid_copy(loc.pargfid, parent->gfid);
+ loc.name = name;
+ loc.inode = inode_ref(inode);
- AFR_ONLIST (lookup_on, frame, afr_selfheal_discover_cbk, lookup, &loc,
- xattr_req);
+ AFR_ONLIST(lookup_on, frame, afr_selfheal_discover_cbk, lookup, &loc,
+ xattr_req);
- afr_replies_copy (replies, local->replies, priv->child_count);
+ afr_replies_copy(replies, local->replies, priv->child_count);
- loc_wipe (&loc);
- dict_unref (xattr_req);
+ loc_wipe(&loc);
+ dict_unref(xattr_req);
- return inode;
+ return inode;
+}
+
+static int
+afr_set_multi_dom_lock_count_request(xlator_t *this, dict_t *dict)
+{
+ int ret = 0;
+ afr_private_t *priv = NULL;
+ char *key1 = NULL;
+ char *key2 = NULL;
+
+ priv = this->private;
+ key1 = alloca0(strlen(GLUSTERFS_INODELK_DOM_PREFIX) + 2 +
+ strlen(this->name));
+ key2 = alloca0(strlen(GLUSTERFS_INODELK_DOM_PREFIX) + 2 +
+ strlen(priv->sh_domain));
+
+ ret = dict_set_uint32(dict, GLUSTERFS_MULTIPLE_DOM_LK_CNT_REQUESTS, 1);
+ if (ret)
+ return ret;
+
+ sprintf(key1, "%s:%s", GLUSTERFS_INODELK_DOM_PREFIX, this->name);
+ ret = dict_set_uint32(dict, key1, 1);
+ if (ret)
+ return ret;
+
+ sprintf(key2, "%s:%s", GLUSTERFS_INODELK_DOM_PREFIX, priv->sh_domain);
+ ret = dict_set_uint32(dict, key2, 1);
+ if (ret)
+ return ret;
+
+ return 0;
}
int
-afr_selfheal_unlocked_discover_on (call_frame_t *frame, inode_t *inode,
- uuid_t gfid, struct afr_reply *replies,
- unsigned char *discover_on)
+afr_selfheal_unlocked_discover_on(call_frame_t *frame, inode_t *inode,
+ uuid_t gfid, struct afr_reply *replies,
+ unsigned char *discover_on, dict_t *dict)
{
- loc_t loc = {0, };
- dict_t *xattr_req = NULL;
- afr_local_t *local = NULL;
- afr_private_t *priv = NULL;
+ loc_t loc = {
+ 0,
+ };
+ dict_t *xattr_req = NULL;
+ afr_local_t *local = NULL;
+ afr_private_t *priv = NULL;
- local = frame->local;
- priv = frame->this->private;
+ local = frame->local;
+ priv = frame->this->private;
- xattr_req = dict_new ();
- if (!xattr_req)
- return -ENOMEM;
+ xattr_req = dict_new();
+ if (!xattr_req)
+ return -ENOMEM;
+ if (dict)
+ dict_copy(dict, xattr_req);
- if (afr_xattr_req_prepare (frame->this, xattr_req) != 0) {
- dict_destroy (xattr_req);
- return -ENOMEM;
- }
+ if (afr_xattr_req_prepare(frame->this, xattr_req) != 0) {
+ dict_unref(xattr_req);
+ return -ENOMEM;
+ }
- loc.inode = inode_ref (inode);
- gf_uuid_copy (loc.gfid, gfid);
+ if (afr_set_multi_dom_lock_count_request(frame->this, xattr_req)) {
+ dict_unref(xattr_req);
+ return -1;
+ }
- AFR_ONLIST (discover_on, frame, afr_selfheal_discover_cbk, lookup, &loc,
- xattr_req);
+ loc.inode = inode_ref(inode);
+ gf_uuid_copy(loc.gfid, gfid);
- afr_replies_copy (replies, local->replies, priv->child_count);
+ AFR_ONLIST(discover_on, frame, afr_selfheal_discover_cbk, lookup, &loc,
+ xattr_req);
- loc_wipe (&loc);
- dict_unref (xattr_req);
+ afr_replies_copy(replies, local->replies, priv->child_count);
- return 0;
+ loc_wipe(&loc);
+ dict_unref(xattr_req);
+
+ return 0;
}
int
-afr_selfheal_unlocked_discover (call_frame_t *frame, inode_t *inode,
- uuid_t gfid, struct afr_reply *replies)
+afr_selfheal_unlocked_discover(call_frame_t *frame, inode_t *inode, uuid_t gfid,
+ struct afr_reply *replies)
{
- afr_private_t *priv = NULL;
+ afr_local_t *local = NULL;
+ dict_t *dict = NULL;
+
+ local = frame->local;
- priv = frame->this->private;
+ if (local->xattr_req)
+ dict = local->xattr_req;
- return afr_selfheal_unlocked_discover_on (frame, inode, gfid, replies,
- priv->child_up);
+ return afr_selfheal_unlocked_discover_on(frame, inode, gfid, replies,
+ local->child_up, dict);
}
unsigned int
-afr_success_count (struct afr_reply *replies, unsigned int count)
+afr_success_count(struct afr_reply *replies, unsigned int count)
{
- int i = 0;
- unsigned int success = 0;
+ int i = 0;
+ unsigned int success = 0;
- for (i = 0; i < count; i++)
- if (replies[i].valid && replies[i].op_ret == 0)
- success++;
- return success;
+ for (i = 0; i < count; i++)
+ if (replies[i].valid && replies[i].op_ret == 0)
+ success++;
+ return success;
}
int
-afr_selfheal_lock_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
- int op_ret, int op_errno, dict_t *xdata)
+afr_selfheal_lock_cbk(call_frame_t *frame, void *cookie, xlator_t *this,
+ int op_ret, int op_errno, dict_t *xdata)
{
- afr_local_t *local = NULL;
- int i = 0;
+ afr_local_t *local = NULL;
+ int i = 0;
- local = frame->local;
- i = (long) cookie;
+ local = frame->local;
+ i = (long)cookie;
- local->replies[i].valid = 1;
- local->replies[i].op_ret = op_ret;
- local->replies[i].op_errno = op_errno;
+ local->replies[i].valid = 1;
+ local->replies[i].op_ret = op_ret;
+ local->replies[i].op_errno = op_errno;
- syncbarrier_wake (&local->barrier);
+ syncbarrier_wake(&local->barrier);
- return 0;
+ return 0;
}
-
int
-afr_locked_fill (call_frame_t *frame, xlator_t *this,
- unsigned char *locked_on)
+afr_locked_fill(call_frame_t *frame, xlator_t *this, unsigned char *locked_on)
{
- int i = 0;
- afr_private_t *priv = NULL;
- afr_local_t *local = NULL;
- int count = 0;
-
- local = frame->local;
- priv = this->private;
-
- for (i = 0; i < priv->child_count; i++) {
- if (local->replies[i].valid && local->replies[i].op_ret == 0) {
- locked_on[i] = 1;
- count++;
- } else {
- locked_on[i] = 0;
- }
- }
+ int i = 0;
+ afr_private_t *priv = NULL;
+ afr_local_t *local = NULL;
+ int count = 0;
+
+ local = frame->local;
+ priv = this->private;
+
+ for (i = 0; i < priv->child_count; i++) {
+ if (local->replies[i].valid && local->replies[i].op_ret == 0) {
+ locked_on[i] = 1;
+ count++;
+ } else {
+ locked_on[i] = 0;
+ }
+ }
- return count;
+ return count;
}
-
int
-afr_selfheal_tryinodelk (call_frame_t *frame, xlator_t *this, inode_t *inode,
- char *dom, off_t off, size_t size,
- unsigned char *locked_on)
+afr_selfheal_tryinodelk(call_frame_t *frame, xlator_t *this, inode_t *inode,
+ char *dom, off_t off, size_t size,
+ unsigned char *locked_on)
{
- loc_t loc = {0,};
- struct gf_flock flock = {0, };
+ loc_t loc = {
+ 0,
+ };
+ struct gf_flock flock = {
+ 0,
+ };
- loc.inode = inode_ref (inode);
- gf_uuid_copy (loc.gfid, inode->gfid);
+ loc.inode = inode_ref(inode);
+ gf_uuid_copy(loc.gfid, inode->gfid);
- flock.l_type = F_WRLCK;
- flock.l_start = off;
- flock.l_len = size;
+ flock.l_type = F_WRLCK;
+ flock.l_start = off;
+ flock.l_len = size;
- AFR_ONALL (frame, afr_selfheal_lock_cbk, inodelk, dom,
- &loc, F_SETLK, &flock, NULL);
+ AFR_ONALL(frame, afr_selfheal_lock_cbk, inodelk, dom, &loc, F_SETLK, &flock,
+ NULL);
- loc_wipe (&loc);
+ loc_wipe(&loc);
- return afr_locked_fill (frame, this, locked_on);
+ return afr_locked_fill(frame, this, locked_on);
}
-
int
-afr_selfheal_inodelk (call_frame_t *frame, xlator_t *this, inode_t *inode,
- char *dom, off_t off, size_t size,
- unsigned char *locked_on)
+afr_selfheal_inodelk(call_frame_t *frame, xlator_t *this, inode_t *inode,
+ char *dom, off_t off, size_t size,
+ unsigned char *locked_on)
{
- loc_t loc = {0,};
- struct gf_flock flock = {0, };
- afr_local_t *local = NULL;
- int i = 0;
- afr_private_t *priv = NULL;
-
- priv = this->private;
- local = frame->local;
-
- loc.inode = inode_ref (inode);
- gf_uuid_copy (loc.gfid, inode->gfid);
-
- flock.l_type = F_WRLCK;
- flock.l_start = off;
- flock.l_len = size;
-
- AFR_ONALL (frame, afr_selfheal_lock_cbk, inodelk, dom,
- &loc, F_SETLK, &flock, NULL);
-
- for (i = 0; i < priv->child_count; i++) {
- if (local->replies[i].op_ret == -1 &&
- local->replies[i].op_errno == EAGAIN) {
- afr_locked_fill (frame, this, locked_on);
- afr_selfheal_uninodelk (frame, this, inode, dom, off,
- size, locked_on);
-
- AFR_SEQ (frame, afr_selfheal_lock_cbk, inodelk, dom,
- &loc, F_SETLKW, &flock, NULL);
- break;
- }
- }
+ loc_t loc = {
+ 0,
+ };
+ struct gf_flock flock = {
+ 0,
+ };
+ afr_local_t *local = NULL;
+ int i = 0;
+ afr_private_t *priv = NULL;
+
+ priv = this->private;
+ local = frame->local;
+
+ loc.inode = inode_ref(inode);
+ gf_uuid_copy(loc.gfid, inode->gfid);
+
+ flock.l_type = F_WRLCK;
+ flock.l_start = off;
+ flock.l_len = size;
+
+ AFR_ONALL(frame, afr_selfheal_lock_cbk, inodelk, dom, &loc, F_SETLK, &flock,
+ NULL);
+
+ for (i = 0; i < priv->child_count; i++) {
+ if (local->replies[i].op_ret == -1 &&
+ local->replies[i].op_errno == EAGAIN) {
+ afr_locked_fill(frame, this, locked_on);
+ afr_selfheal_uninodelk(frame, this, inode, dom, off, size,
+ locked_on);
+
+ AFR_SEQ(frame, afr_selfheal_lock_cbk, inodelk, dom, &loc, F_SETLKW,
+ &flock, NULL);
+ break;
+ }
+ }
- loc_wipe (&loc);
+ loc_wipe(&loc);
- return afr_locked_fill (frame, this, locked_on);
+ return afr_locked_fill(frame, this, locked_on);
}
+static void
+afr_get_lock_and_eagain_counts(afr_private_t *priv, struct afr_reply *replies,
+ int *lock_count, int *eagain_count)
+{
+ int i = 0;
+
+ for (i = 0; i < priv->child_count; i++) {
+ if (!replies[i].valid)
+ continue;
+ if (replies[i].op_ret == 0) {
+ (*lock_count)++;
+ } else if (replies[i].op_ret == -1 && replies[i].op_errno == EAGAIN) {
+ (*eagain_count)++;
+ }
+ }
+}
+/*Do blocking locks if number of locks acquired is majority and there were some
+ * EAGAINs. Useful for odd-way replication*/
int
-afr_selfheal_uninodelk (call_frame_t *frame, xlator_t *this, inode_t *inode,
- char *dom, off_t off, size_t size,
- const unsigned char *locked_on)
+afr_selfheal_tie_breaker_inodelk(call_frame_t *frame, xlator_t *this,
+ inode_t *inode, char *dom, off_t off,
+ size_t size, unsigned char *locked_on)
{
- loc_t loc = {0,};
- struct gf_flock flock = {0, };
+ loc_t loc = {
+ 0,
+ };
+ struct gf_flock flock = {
+ 0,
+ };
+ afr_local_t *local = NULL;
+ afr_private_t *priv = NULL;
+ int lock_count = 0;
+ int eagain_count = 0;
+ priv = this->private;
+ local = frame->local;
- loc.inode = inode_ref (inode);
- gf_uuid_copy (loc.gfid, inode->gfid);
+ loc.inode = inode_ref(inode);
+ gf_uuid_copy(loc.gfid, inode->gfid);
- flock.l_type = F_UNLCK;
- flock.l_start = off;
- flock.l_len = size;
+ flock.l_type = F_WRLCK;
+ flock.l_start = off;
+ flock.l_len = size;
- AFR_ONLIST (locked_on, frame, afr_selfheal_lock_cbk, inodelk,
- dom, &loc, F_SETLK, &flock, NULL);
+ AFR_ONALL(frame, afr_selfheal_lock_cbk, inodelk, dom, &loc, F_SETLK, &flock,
+ NULL);
- loc_wipe (&loc);
+ afr_get_lock_and_eagain_counts(priv, local->replies, &lock_count,
+ &eagain_count);
- return 0;
-}
+ if (lock_count > priv->child_count / 2 && eagain_count) {
+ afr_locked_fill(frame, this, locked_on);
+ afr_selfheal_uninodelk(frame, this, inode, dom, off, size, locked_on);
+
+ AFR_SEQ(frame, afr_selfheal_lock_cbk, inodelk, dom, &loc, F_SETLKW,
+ &flock, NULL);
+ }
+ loc_wipe(&loc);
+
+ return afr_locked_fill(frame, this, locked_on);
+}
int
-afr_selfheal_tryentrylk (call_frame_t *frame, xlator_t *this, inode_t *inode,
- char *dom, const char *name, unsigned char *locked_on)
+afr_selfheal_uninodelk(call_frame_t *frame, xlator_t *this, inode_t *inode,
+ char *dom, off_t off, size_t size,
+ const unsigned char *locked_on)
{
- loc_t loc = {0,};
+ loc_t loc = {
+ 0,
+ };
+ struct gf_flock flock = {
+ 0,
+ };
- loc.inode = inode_ref (inode);
- gf_uuid_copy (loc.gfid, inode->gfid);
+ loc.inode = inode_ref(inode);
+ gf_uuid_copy(loc.gfid, inode->gfid);
- AFR_ONALL (frame, afr_selfheal_lock_cbk, entrylk, dom,
- &loc, name, ENTRYLK_LOCK_NB, ENTRYLK_WRLCK, NULL);
+ flock.l_type = F_UNLCK;
+ flock.l_start = off;
+ flock.l_len = size;
- loc_wipe (&loc);
+ AFR_ONLIST(locked_on, frame, afr_selfheal_lock_cbk, inodelk, dom, &loc,
+ F_SETLK, &flock, NULL);
- return afr_locked_fill (frame, this, locked_on);
-}
+ loc_wipe(&loc);
+ return 0;
+}
int
-afr_selfheal_entrylk (call_frame_t *frame, xlator_t *this, inode_t *inode,
- char *dom, const char *name, unsigned char *locked_on)
+afr_selfheal_tryentrylk(call_frame_t *frame, xlator_t *this, inode_t *inode,
+ char *dom, const char *name, unsigned char *locked_on)
{
- loc_t loc = {0,};
- afr_local_t *local = NULL;
- int i = 0;
- afr_private_t *priv = NULL;
+ loc_t loc = {
+ 0,
+ };
- priv = this->private;
- local = frame->local;
+ loc.inode = inode_ref(inode);
+ gf_uuid_copy(loc.gfid, inode->gfid);
- loc.inode = inode_ref (inode);
- gf_uuid_copy (loc.gfid, inode->gfid);
+ AFR_ONALL(frame, afr_selfheal_lock_cbk, entrylk, dom, &loc, name,
+ ENTRYLK_LOCK_NB, ENTRYLK_WRLCK, NULL);
- AFR_ONALL (frame, afr_selfheal_lock_cbk, entrylk, dom, &loc,
- name, ENTRYLK_LOCK_NB, ENTRYLK_WRLCK, NULL);
+ loc_wipe(&loc);
- for (i = 0; i < priv->child_count; i++) {
- if (local->replies[i].op_ret == -1 &&
- local->replies[i].op_errno == EAGAIN) {
- afr_locked_fill (frame, this, locked_on);
- afr_selfheal_unentrylk (frame, this, inode, dom, name,
- locked_on);
+ return afr_locked_fill(frame, this, locked_on);
+}
- AFR_SEQ (frame, afr_selfheal_lock_cbk, entrylk, dom,
- &loc, name, ENTRYLK_LOCK, ENTRYLK_WRLCK, NULL);
- break;
- }
- }
+int
+afr_selfheal_entrylk(call_frame_t *frame, xlator_t *this, inode_t *inode,
+ char *dom, const char *name, unsigned char *locked_on)
+{
+ loc_t loc = {
+ 0,
+ };
+ afr_local_t *local = NULL;
+ int i = 0;
+ afr_private_t *priv = NULL;
+
+ priv = this->private;
+ local = frame->local;
+
+ loc.inode = inode_ref(inode);
+ gf_uuid_copy(loc.gfid, inode->gfid);
+
+ AFR_ONALL(frame, afr_selfheal_lock_cbk, entrylk, dom, &loc, name,
+ ENTRYLK_LOCK_NB, ENTRYLK_WRLCK, NULL);
+
+ for (i = 0; i < priv->child_count; i++) {
+ if (local->replies[i].op_ret == -1 &&
+ local->replies[i].op_errno == EAGAIN) {
+ afr_locked_fill(frame, this, locked_on);
+ afr_selfheal_unentrylk(frame, this, inode, dom, name, locked_on,
+ NULL);
+
+ AFR_SEQ(frame, afr_selfheal_lock_cbk, entrylk, dom, &loc, name,
+ ENTRYLK_LOCK, ENTRYLK_WRLCK, NULL);
+ break;
+ }
+ }
- loc_wipe (&loc);
+ loc_wipe(&loc);
- return afr_locked_fill (frame, this, locked_on);
+ return afr_locked_fill(frame, this, locked_on);
}
-
int
-afr_selfheal_unentrylk (call_frame_t *frame, xlator_t *this, inode_t *inode,
- char *dom, const char *name, unsigned char *locked_on)
+afr_selfheal_tie_breaker_entrylk(call_frame_t *frame, xlator_t *this,
+ inode_t *inode, char *dom, const char *name,
+ unsigned char *locked_on)
{
- loc_t loc = {0,};
+ loc_t loc = {
+ 0,
+ };
+ afr_local_t *local = NULL;
+ afr_private_t *priv = NULL;
+ int lock_count = 0;
+ int eagain_count = 0;
- loc.inode = inode_ref (inode);
- gf_uuid_copy (loc.gfid, inode->gfid);
+ priv = this->private;
+ local = frame->local;
- AFR_ONLIST (locked_on, frame, afr_selfheal_lock_cbk, entrylk,
- dom, &loc, name, ENTRYLK_UNLOCK, ENTRYLK_WRLCK, NULL);
+ loc.inode = inode_ref(inode);
+ gf_uuid_copy(loc.gfid, inode->gfid);
- loc_wipe (&loc);
-
- return 0;
-}
+ AFR_ONALL(frame, afr_selfheal_lock_cbk, entrylk, dom, &loc, name,
+ ENTRYLK_LOCK_NB, ENTRYLK_WRLCK, NULL);
+ afr_get_lock_and_eagain_counts(priv, local->replies, &lock_count,
+ &eagain_count);
-gf_boolean_t
-afr_is_pending_set (xlator_t *this, dict_t *xdata, int type)
-{
- int idx = -1;
- afr_private_t *priv = NULL;
- void *pending_raw = NULL;
- int *pending_int = NULL;
- int i = 0;
+ if (lock_count > priv->child_count / 2 && eagain_count) {
+ afr_locked_fill(frame, this, locked_on);
+ afr_selfheal_unentrylk(frame, this, inode, dom, name, locked_on, NULL);
- priv = this->private;
- idx = afr_index_for_transaction_type (type);
+ AFR_SEQ(frame, afr_selfheal_lock_cbk, entrylk, dom, &loc, name,
+ ENTRYLK_LOCK, ENTRYLK_WRLCK, NULL);
+ }
- if (dict_get_ptr (xdata, AFR_DIRTY, &pending_raw) == 0) {
- if (pending_raw) {
- pending_int = pending_raw;
+ loc_wipe(&loc);
- if (ntoh32 (pending_int[idx]))
- return _gf_true;
- }
- }
+ return afr_locked_fill(frame, this, locked_on);
+}
- for (i = 0; i < priv->child_count; i++) {
- if (dict_get_ptr (xdata, priv->pending_key[i],
- &pending_raw))
- continue;
- if (!pending_raw)
- continue;
- pending_int = pending_raw;
+int
+afr_selfheal_unentrylk(call_frame_t *frame, xlator_t *this, inode_t *inode,
+ char *dom, const char *name, unsigned char *locked_on,
+ dict_t *xdata)
+{
+ loc_t loc = {
+ 0,
+ };
- if (ntoh32 (pending_int[idx]))
- return _gf_true;
- }
+ loc.inode = inode_ref(inode);
+ gf_uuid_copy(loc.gfid, inode->gfid);
- return _gf_false;
-}
+ AFR_ONLIST(locked_on, frame, afr_selfheal_lock_cbk, entrylk, dom, &loc,
+ name, ENTRYLK_UNLOCK, ENTRYLK_WRLCK, xdata);
+ loc_wipe(&loc);
-gf_boolean_t
-afr_is_data_set (xlator_t *this, dict_t *xdata)
-{
- return afr_is_pending_set (this, xdata, AFR_DATA_TRANSACTION);
+ return 0;
}
gf_boolean_t
-afr_is_metadata_set (xlator_t *this, dict_t *xdata)
+afr_is_data_set(xlator_t *this, dict_t *xdata)
{
- return afr_is_pending_set (this, xdata, AFR_METADATA_TRANSACTION);
+ return afr_is_pending_set(this, xdata, AFR_DATA_TRANSACTION);
}
gf_boolean_t
-afr_is_entry_set (xlator_t *this, dict_t *xdata)
+afr_is_metadata_set(xlator_t *this, dict_t *xdata)
{
- return afr_is_pending_set (this, xdata, AFR_ENTRY_TRANSACTION);
+ return afr_is_pending_set(this, xdata, AFR_METADATA_TRANSACTION);
}
-
-inode_t*
-afr_inode_link (inode_t *inode, struct iatt *iatt)
+gf_boolean_t
+afr_is_entry_set(xlator_t *this, dict_t *xdata)
{
- inode_t *linked_inode = NULL;
-
- linked_inode = inode_link (inode, NULL, NULL, iatt);
-
- if (linked_inode)
- inode_lookup (linked_inode);
- return linked_inode;
+ return afr_is_pending_set(this, xdata, AFR_ENTRY_TRANSACTION);
}
-
/*
* This function inspects the looked up replies (in an unlocked manner)
* and decides whether a locked verification and possible healing is
@@ -1113,288 +2278,310 @@ afr_inode_link (inode_t *inode, struct iatt *iatt)
*/
int
-afr_selfheal_unlocked_inspect (call_frame_t *frame, xlator_t *this,
- uuid_t gfid, inode_t **link_inode,
- gf_boolean_t *data_selfheal,
- gf_boolean_t *metadata_selfheal,
- gf_boolean_t *entry_selfheal)
-{
- afr_private_t *priv = NULL;
- inode_t *inode = NULL;
- int i = 0;
- int valid_cnt = 0;
- struct iatt first = {0, };
- struct afr_reply *replies = NULL;
- int ret = -1;
-
- priv = this->private;
-
- inode = afr_inode_find (this, gfid);
- if (!inode)
- goto out;
+afr_selfheal_unlocked_inspect(call_frame_t *frame, xlator_t *this, uuid_t gfid,
+ inode_t **link_inode, gf_boolean_t *data_selfheal,
+ gf_boolean_t *metadata_selfheal,
+ gf_boolean_t *entry_selfheal,
+ struct afr_reply *replies_dst)
+{
+ afr_private_t *priv = NULL;
+ inode_t *inode = NULL;
+ int i = 0;
+ int valid_cnt = 0;
+ struct iatt first = {
+ 0,
+ };
+ int first_idx = 0;
+ struct afr_reply *replies = NULL;
+ int ret = -1;
+
+ priv = this->private;
+
+ inode = afr_inode_find(this, gfid);
+ if (!inode)
+ goto out;
+
+ replies = alloca0(sizeof(*replies) * priv->child_count);
+
+ ret = afr_selfheal_unlocked_discover(frame, inode, gfid, replies);
+ if (ret)
+ goto out;
+
+ for (i = 0; i < priv->child_count; i++) {
+ if (!replies[i].valid)
+ continue;
+ if (replies[i].op_ret == -1)
+ continue;
+
+ /* The data segment of the changelog can be non-zero to indicate
+ * the directory needs a full heal. So the check below ensures
+ * it's not a directory before setting the data_selfheal boolean.
+ */
+ if (data_selfheal && !IA_ISDIR(replies[i].poststat.ia_type) &&
+ afr_is_data_set(this, replies[i].xdata))
+ *data_selfheal = _gf_true;
- replies = alloca0 (sizeof (*replies) * priv->child_count);
+ if (metadata_selfheal && afr_is_metadata_set(this, replies[i].xdata))
+ *metadata_selfheal = _gf_true;
- ret = afr_selfheal_unlocked_discover (frame, inode, gfid, replies);
- if (ret)
- goto out;
+ if (entry_selfheal && afr_is_entry_set(this, replies[i].xdata))
+ *entry_selfheal = _gf_true;
- for (i = 0; i < priv->child_count; i++) {
- if (!replies[i].valid)
- continue;
- if (replies[i].op_ret == -1)
- continue;
-
- if (data_selfheal && afr_is_data_set (this, replies[i].xdata))
- *data_selfheal = _gf_true;
-
- if (metadata_selfheal &&
- afr_is_metadata_set (this, replies[i].xdata))
- *metadata_selfheal = _gf_true;
-
- if (entry_selfheal && afr_is_entry_set (this, replies[i].xdata))
- *entry_selfheal = _gf_true;
-
- valid_cnt ++;
- if (valid_cnt == 1) {
- first = replies[i].poststat;
- continue;
- }
-
- if (!IA_EQUAL (first, replies[i].poststat, type)) {
- gf_msg (this->name, GF_LOG_ERROR, 0,
- AFR_MSG_SPLIT_BRAIN,
- "TYPE mismatch %d vs %d on %s for gfid:%s",
- (int) first.ia_type,
- (int) replies[i].poststat.ia_type,
- priv->children[i]->name,
- uuid_utoa (replies[i].poststat.ia_gfid));
- ret = -EIO;
- goto out;
- }
-
- if (!IA_EQUAL (first, replies[i].poststat, uid)) {
- gf_msg_debug (this->name, 0,
- "UID mismatch "
- "%d vs %d on %s for gfid:%s",
- (int) first.ia_uid,
- (int) replies[i].poststat.ia_uid,
- priv->children[i]->name,
- uuid_utoa (replies[i].poststat.ia_gfid));
-
- if (metadata_selfheal)
- *metadata_selfheal = _gf_true;
- }
-
- if (!IA_EQUAL (first, replies[i].poststat, gid)) {
- gf_msg_debug (this->name, 0,
- "GID mismatch "
- "%d vs %d on %s for gfid:%s",
- (int) first.ia_uid,
- (int) replies[i].poststat.ia_uid,
- priv->children[i]->name,
- uuid_utoa (replies[i].poststat.ia_gfid));
-
- if (metadata_selfheal)
- *metadata_selfheal = _gf_true;
- }
-
- if (!IA_EQUAL (first, replies[i].poststat, prot)) {
- gf_msg_debug (this->name, 0,
- "MODE mismatch "
- "%d vs %d on %s for gfid:%s",
- (int) st_mode_from_ia (first.ia_prot, 0),
- (int) st_mode_from_ia
- (replies[i].poststat.ia_prot, 0),
- priv->children[i]->name,
- uuid_utoa (replies[i].poststat.ia_gfid));
-
- if (metadata_selfheal)
- *metadata_selfheal = _gf_true;
- }
-
- if (IA_ISREG(first.ia_type) &&
- !IA_EQUAL (first, replies[i].poststat, size)) {
- gf_msg_debug (this->name, 0,
- "SIZE mismatch "
- "%lld vs %lld on %s for gfid:%s",
- (long long) first.ia_size,
- (long long) replies[i].poststat.ia_size,
- priv->children[i]->name,
- uuid_utoa (replies[i].poststat.ia_gfid));
-
- if (data_selfheal)
- *data_selfheal = _gf_true;
- }
- }
-
- if (valid_cnt > 0 && link_inode) {
- *link_inode = afr_inode_link (inode, &first);
- if (!*link_inode) {
- ret = -EINVAL;
- goto out;
- }
- } else if (valid_cnt < 2) {
- ret = afr_check_stale_error (replies, priv);
- goto out;
+ valid_cnt++;
+ if (valid_cnt == 1) {
+ first = replies[i].poststat;
+ first_idx = i;
+ continue;
}
- ret = 0;
-out:
- if (inode)
- inode_unref (inode);
- if (replies)
- afr_replies_wipe (replies, priv->child_count);
+ if (!IA_EQUAL(first, replies[i].poststat, type)) {
+ gf_msg(this->name, GF_LOG_ERROR, 0, AFR_MSG_SPLIT_BRAIN,
+ "TYPE mismatch %d vs %d on %s for gfid:%s",
+ (int)first.ia_type, (int)replies[i].poststat.ia_type,
+ priv->children[i]->name,
+ uuid_utoa(replies[i].poststat.ia_gfid));
+ gf_event(EVENT_AFR_SPLIT_BRAIN,
+ "client-pid=%d;"
+ "subvol=%s;"
+ "type=file;gfid=%s;"
+ "ia_type-%d=%s;ia_type-%d=%s",
+ this->ctx->cmd_args.client_pid, this->name,
+ uuid_utoa(replies[i].poststat.ia_gfid), first_idx,
+ gf_inode_type_to_str(first.ia_type), i,
+ gf_inode_type_to_str(replies[i].poststat.ia_type));
+ ret = -EIO;
+ goto out;
+ }
- return ret;
-}
+ if (!IA_EQUAL(first, replies[i].poststat, uid)) {
+ gf_msg_debug(this->name, 0,
+ "UID mismatch "
+ "%d vs %d on %s for gfid:%s",
+ (int)first.ia_uid, (int)replies[i].poststat.ia_uid,
+ priv->children[i]->name,
+ uuid_utoa(replies[i].poststat.ia_gfid));
+
+ if (metadata_selfheal)
+ *metadata_selfheal = _gf_true;
+ }
+
+ if (!IA_EQUAL(first, replies[i].poststat, gid)) {
+ gf_msg_debug(this->name, 0,
+ "GID mismatch "
+ "%d vs %d on %s for gfid:%s",
+ (int)first.ia_uid, (int)replies[i].poststat.ia_uid,
+ priv->children[i]->name,
+ uuid_utoa(replies[i].poststat.ia_gfid));
+
+ if (metadata_selfheal)
+ *metadata_selfheal = _gf_true;
+ }
+
+ if (!IA_EQUAL(first, replies[i].poststat, prot)) {
+ gf_msg_debug(this->name, 0,
+ "MODE mismatch "
+ "%d vs %d on %s for gfid:%s",
+ (int)st_mode_from_ia(first.ia_prot, 0),
+ (int)st_mode_from_ia(replies[i].poststat.ia_prot, 0),
+ priv->children[i]->name,
+ uuid_utoa(replies[i].poststat.ia_gfid));
+
+ if (metadata_selfheal)
+ *metadata_selfheal = _gf_true;
+ }
+
+ if (IA_ISREG(first.ia_type) &&
+ !IA_EQUAL(first, replies[i].poststat, size)) {
+ gf_msg_debug(this->name, 0,
+ "SIZE mismatch "
+ "%lld vs %lld on %s for gfid:%s",
+ (long long)first.ia_size,
+ (long long)replies[i].poststat.ia_size,
+ priv->children[i]->name,
+ uuid_utoa(replies[i].poststat.ia_gfid));
+
+ if (data_selfheal)
+ *data_selfheal = _gf_true;
+ }
+ }
+
+ if (valid_cnt > 0 && link_inode) {
+ *link_inode = inode_link(inode, NULL, NULL, &first);
+ if (!*link_inode) {
+ ret = -EINVAL;
+ goto out;
+ }
+ } else if (valid_cnt < 2) {
+ ret = afr_check_stale_error(replies, priv);
+ goto out;
+ }
+ ret = 0;
+out:
+ if (replies && replies_dst)
+ afr_replies_copy(replies_dst, replies, priv->child_count);
+ if (inode)
+ inode_unref(inode);
+ if (replies)
+ afr_replies_wipe(replies, priv->child_count);
+
+ return ret;
+}
inode_t *
-afr_inode_find (xlator_t *this, uuid_t gfid)
+afr_inode_find(xlator_t *this, uuid_t gfid)
{
- inode_table_t *table = NULL;
- inode_t *inode = NULL;
+ inode_table_t *table = NULL;
+ inode_t *inode = NULL;
- table = this->itable;
- if (!table)
- return NULL;
+ table = this->itable;
+ if (!table)
+ return NULL;
- inode = inode_find (table, gfid);
- if (inode)
- return inode;
+ inode = inode_find(table, gfid);
+ if (inode)
+ return inode;
- inode = inode_new (table);
- if (!inode)
- return NULL;
+ inode = inode_new(table);
+ if (!inode)
+ return NULL;
- gf_uuid_copy (inode->gfid, gfid);
+ gf_uuid_copy(inode->gfid, gfid);
- return inode;
+ return inode;
}
-
call_frame_t *
-afr_frame_create (xlator_t *this)
+afr_frame_create(xlator_t *this, int32_t *op_errno)
{
- call_frame_t *frame = NULL;
- afr_local_t *local = NULL;
- int op_errno = 0;
- pid_t pid = GF_CLIENT_PID_SELF_HEALD;
+ call_frame_t *frame = NULL;
+ afr_local_t *local = NULL;
+ pid_t pid = GF_CLIENT_PID_SELF_HEALD;
- frame = create_frame (this, this->ctx->pool);
- if (!frame)
- return NULL;
+ frame = create_frame(this, this->ctx->pool);
+ if (!frame) {
+ if (op_errno)
+ *op_errno = ENOMEM;
+ return NULL;
+ }
- local = AFR_FRAME_INIT (frame, op_errno);
- if (!local) {
- STACK_DESTROY (frame->root);
- return NULL;
- }
+ local = AFR_FRAME_INIT(frame, (*op_errno));
+ if (!local) {
+ STACK_DESTROY(frame->root);
+ return NULL;
+ }
- syncopctx_setfspid (&pid);
+ syncopctx_setfspid(&pid);
- frame->root->pid = pid;
+ frame->root->pid = pid;
- afr_set_lk_owner (frame, this, frame->root);
+ afr_set_lk_owner(frame, this, frame->root);
- return frame;
+ return frame;
}
int
-afr_selfheal_newentry_mark (call_frame_t *frame, xlator_t *this, inode_t *inode,
- int source, struct afr_reply *replies,
- unsigned char *sources, unsigned char *newentry)
+afr_selfheal_newentry_mark(call_frame_t *frame, xlator_t *this, inode_t *inode,
+ int source, struct afr_reply *replies,
+ unsigned char *sources, unsigned char *newentry)
{
- int ret = 0;
- int i = 0;
- afr_private_t *priv = NULL;
- dict_t *xattr = NULL;
- int **changelog = NULL;
+ int ret = 0;
+ int i = 0;
+ afr_private_t *priv = NULL;
+ dict_t *xattr = NULL;
+ int **changelog = NULL;
- priv = this->private;
+ priv = this->private;
- gf_uuid_copy (inode->gfid, replies[source].poststat.ia_gfid);
+ gf_uuid_copy(inode->gfid, replies[source].poststat.ia_gfid);
- xattr = dict_new();
- if (!xattr)
- return -ENOMEM;
+ xattr = dict_new();
+ if (!xattr)
+ return -ENOMEM;
- changelog = afr_mark_pending_changelog (priv, newentry, xattr,
- replies[source].poststat.ia_type);
+ changelog = afr_mark_pending_changelog(priv, newentry, xattr,
+ replies[source].poststat.ia_type);
- if (!changelog)
- goto out;
+ if (!changelog) {
+ ret = -ENOMEM;
+ goto out;
+ }
- for (i = 0; i < priv->child_count; i++) {
- if (!sources[i])
- continue;
- afr_selfheal_post_op (frame, this, inode, i, xattr);
- }
+ for (i = 0; i < priv->child_count; i++) {
+ if (!sources[i])
+ continue;
+ ret |= afr_selfheal_post_op(frame, this, inode, i, xattr, NULL);
+ }
out:
- if (changelog)
- afr_matrix_cleanup (changelog, priv->child_count);
- if (xattr)
- dict_unref (xattr);
- return ret;
+ if (changelog)
+ afr_matrix_cleanup(changelog, priv->child_count);
+ if (xattr)
+ dict_unref(xattr);
+ return ret;
}
int
-afr_selfheal_do (call_frame_t *frame, xlator_t *this, uuid_t gfid)
-{
- int ret = -1;
- int entry_ret = 1;
- int metadata_ret = 1;
- int data_ret = 1;
- int or_ret = 0;
- inode_t *inode = NULL;
- gf_boolean_t data_selfheal = _gf_false;
- gf_boolean_t metadata_selfheal = _gf_false;
- gf_boolean_t entry_selfheal = _gf_false;
- afr_private_t *priv = NULL;
- gf_boolean_t dataheal_enabled = _gf_false;
-
- priv = this->private;
- gf_string2boolean (priv->data_self_heal, &dataheal_enabled);
-
- ret = afr_selfheal_unlocked_inspect (frame, this, gfid, &inode,
- &data_selfheal,
- &metadata_selfheal,
- &entry_selfheal);
- if (ret)
- goto out;
-
- if (!(data_selfheal || metadata_selfheal || entry_selfheal)) {
- ret = 2;
- goto out;
+afr_selfheal_do(call_frame_t *frame, xlator_t *this, uuid_t gfid)
+{
+ int ret = -1;
+ int entry_ret = 1;
+ int metadata_ret = 1;
+ int data_ret = 1;
+ int or_ret = 0;
+ inode_t *inode = NULL;
+ fd_t *fd = NULL;
+ gf_boolean_t data_selfheal = _gf_false;
+ gf_boolean_t metadata_selfheal = _gf_false;
+ gf_boolean_t entry_selfheal = _gf_false;
+ afr_private_t *priv = NULL;
+
+ priv = this->private;
+
+ ret = afr_selfheal_unlocked_inspect(frame, this, gfid, &inode,
+ &data_selfheal, &metadata_selfheal,
+ &entry_selfheal, NULL);
+ if (ret)
+ goto out;
+
+ if (!(data_selfheal || metadata_selfheal || entry_selfheal)) {
+ ret = 2;
+ goto out;
+ }
+
+ if (inode->ia_type == IA_IFREG) {
+ ret = afr_selfheal_data_open(this, inode, &fd);
+ if (!fd) {
+ ret = -EIO;
+ goto out;
}
+ }
- if (data_selfheal && dataheal_enabled)
- data_ret = afr_selfheal_data (frame, this, inode);
+ if (data_selfheal && priv->data_self_heal)
+ data_ret = afr_selfheal_data(frame, this, fd);
- if (metadata_selfheal && priv->metadata_self_heal)
- metadata_ret = afr_selfheal_metadata (frame, this, inode);
+ if (metadata_selfheal && priv->metadata_self_heal)
+ metadata_ret = afr_selfheal_metadata(frame, this, inode);
- if (entry_selfheal && priv->entry_self_heal)
- entry_ret = afr_selfheal_entry (frame, this, inode);
+ if (entry_selfheal && priv->entry_self_heal)
+ entry_ret = afr_selfheal_entry(frame, this, inode);
- or_ret = (data_ret | metadata_ret | entry_ret);
+ or_ret = (data_ret | metadata_ret | entry_ret);
- if (data_ret == -EIO || metadata_ret == -EIO || entry_ret == -EIO)
- ret = -EIO;
- else if (data_ret == 1 && metadata_ret == 1 && entry_ret == 1)
- ret = 1;
- else if (or_ret < 0)
- ret = or_ret;
- else
- ret = 0;
+ if (data_ret == -EIO || metadata_ret == -EIO || entry_ret == -EIO)
+ ret = -EIO;
+ else if (data_ret == 1 && metadata_ret == 1 && entry_ret == 1)
+ ret = 1;
+ else if (or_ret < 0)
+ ret = or_ret;
+ else
+ ret = 0;
out:
- if (inode) {
- inode_forget (inode, 1);
- inode_unref (inode);
- }
- return ret;
+ if (inode)
+ inode_unref(inode);
+ if (fd)
+ fd_unref(fd);
+ return ret;
}
/*
* This is the entry point for healing a given GFID. The return values for this
@@ -1406,19 +2593,342 @@ out:
*/
int
-afr_selfheal (xlator_t *this, uuid_t gfid)
+afr_selfheal(xlator_t *this, uuid_t gfid)
{
- int ret = -1;
- call_frame_t *frame = NULL;
+ int ret = -1;
+ call_frame_t *frame = NULL;
+ afr_local_t *local = NULL;
+
+ frame = afr_frame_create(this, NULL);
+ if (!frame)
+ return ret;
+
+ local = frame->local;
+ local->xdata_req = dict_new();
+
+ ret = afr_selfheal_do(frame, this, gfid);
+
+ if (frame)
+ AFR_STACK_DESTROY(frame);
- frame = afr_frame_create (this);
- if (!frame)
- return ret;
+ return ret;
+}
- ret = afr_selfheal_do (frame, this, gfid);
+afr_local_t *
+__afr_dequeue_heals(afr_private_t *priv)
+{
+ afr_local_t *local = NULL;
+
+ if (list_empty(&priv->heal_waiting))
+ goto none;
+ if ((priv->background_self_heal_count > 0) &&
+ (priv->healers >= priv->background_self_heal_count))
+ goto none;
+
+ local = list_entry(priv->heal_waiting.next, afr_local_t, healer);
+ priv->heal_waiters--;
+ GF_ASSERT(priv->heal_waiters >= 0);
+ list_del_init(&local->healer);
+ list_add(&local->healer, &priv->healing);
+ priv->healers++;
+ return local;
+none:
+ gf_msg_debug(THIS->name, 0,
+ "Nothing dequeued. "
+ "Num healers: %d, Num Waiters: %d",
+ priv->healers, priv->heal_waiters);
+ return NULL;
+}
- if (frame)
- AFR_STACK_DESTROY (frame);
+int
+afr_refresh_selfheal_wrap(void *opaque)
+{
+ call_frame_t *heal_frame = opaque;
+ afr_local_t *local = heal_frame->local;
+ int ret = 0;
- return ret;
+ ret = afr_selfheal(heal_frame->this, local->refreshinode->gfid);
+ return ret;
+}
+
+int
+afr_refresh_heal_done(int ret, call_frame_t *frame, void *opaque)
+{
+ call_frame_t *heal_frame = opaque;
+ xlator_t *this = heal_frame->this;
+ afr_private_t *priv = this->private;
+ afr_local_t *local = heal_frame->local;
+
+ LOCK(&priv->lock);
+ {
+ list_del_init(&local->healer);
+ priv->healers--;
+ GF_ASSERT(priv->healers >= 0);
+ local = __afr_dequeue_heals(priv);
+ }
+ UNLOCK(&priv->lock);
+
+ AFR_STACK_DESTROY(heal_frame);
+
+ if (local)
+ afr_heal_synctask(this, local);
+ return 0;
+}
+
+void
+afr_heal_synctask(xlator_t *this, afr_local_t *local)
+{
+ int ret = 0;
+ call_frame_t *heal_frame = NULL;
+
+ heal_frame = local->heal_frame;
+ ret = synctask_new(this->ctx->env, afr_refresh_selfheal_wrap,
+ afr_refresh_heal_done, heal_frame, heal_frame);
+ if (ret < 0)
+ /* Heal not launched. Will be queued when the next inode
+ * refresh happens and shd hasn't healed it yet. */
+ afr_refresh_heal_done(ret, heal_frame, heal_frame);
+}
+
+gf_boolean_t
+afr_throttled_selfheal(call_frame_t *frame, xlator_t *this)
+{
+ gf_boolean_t can_heal = _gf_true;
+ afr_private_t *priv = this->private;
+ afr_local_t *local = frame->local;
+
+ LOCK(&priv->lock);
+ {
+ if ((priv->background_self_heal_count > 0) &&
+ (priv->heal_wait_qlen + priv->background_self_heal_count) >
+ (priv->heal_waiters + priv->healers)) {
+ list_add_tail(&local->healer, &priv->heal_waiting);
+ priv->heal_waiters++;
+ local = __afr_dequeue_heals(priv);
+ } else {
+ can_heal = _gf_false;
+ }
+ }
+ UNLOCK(&priv->lock);
+
+ if (can_heal) {
+ if (local)
+ afr_heal_synctask(this, local);
+ else
+ gf_msg_debug(this->name, 0,
+ "Max number of heals are "
+ "pending, background self-heal rejected.");
+ }
+
+ return can_heal;
+}
+
+int
+afr_choose_source_by_policy(afr_private_t *priv, unsigned char *sources,
+ afr_transaction_type type)
+{
+ int source = -1;
+ int i = 0;
+
+ /* Give preference to local child to save on bandwidth */
+ for (i = 0; i < priv->child_count; i++) {
+ if (priv->local[i] && sources[i]) {
+ if ((type == AFR_DATA_TRANSACTION) && AFR_IS_ARBITER_BRICK(priv, i))
+ continue;
+
+ source = i;
+ goto out;
+ }
+ }
+
+ for (i = 0; i < priv->child_count; i++) {
+ if (sources[i]) {
+ source = i;
+ goto out;
+ }
+ }
+out:
+ return source;
+}
+
+static int
+afr_anon_inode_mkdir_cbk(call_frame_t *frame, void *cookie, xlator_t *this,
+ int32_t op_ret, int32_t op_errno, inode_t *inode,
+ struct iatt *buf, struct iatt *preparent,
+ struct iatt *postparent, dict_t *xdata)
+{
+ afr_local_t *local = frame->local;
+ int i = (long)cookie;
+
+ local->replies[i].valid = 1;
+ local->replies[i].op_ret = op_ret;
+ local->replies[i].op_errno = op_errno;
+ if (op_ret == 0) {
+ local->op_ret = 0;
+ local->replies[i].poststat = *buf;
+ local->replies[i].preparent = *preparent;
+ local->replies[i].postparent = *postparent;
+ }
+ if (xdata) {
+ local->replies[i].xdata = dict_ref(xdata);
+ }
+
+ syncbarrier_wake(&local->barrier);
+ return 0;
+}
+
+int
+afr_anon_inode_create(xlator_t *this, int child, inode_t **linked_inode)
+{
+ call_frame_t *frame = NULL;
+ afr_local_t *local = NULL;
+ afr_private_t *priv = this->private;
+ unsigned char *mkdir_on = alloca0(priv->child_count);
+ unsigned char *lookup_on = alloca0(priv->child_count);
+ loc_t loc = {0};
+ int32_t op_errno = 0;
+ int32_t child_op_errno = 0;
+ struct iatt iatt = {0};
+ dict_t *xdata = NULL;
+ uuid_t anon_inode_gfid = {0};
+ int mkdir_count = 0;
+ int i = 0;
+
+ /*Try to mkdir everywhere and return success if the dir exists on 'child'
+ */
+
+ if (!priv->use_anon_inode) {
+ op_errno = EINVAL;
+ goto out;
+ }
+
+ frame = afr_frame_create(this, &op_errno);
+ if (op_errno) {
+ goto out;
+ }
+ local = frame->local;
+ if (!local->child_up[child]) {
+ /*Other bricks may need mkdir so don't error out yet*/
+ child_op_errno = ENOTCONN;
+ }
+ gf_uuid_parse(priv->anon_gfid_str, anon_inode_gfid);
+ for (i = 0; i < priv->child_count; i++) {
+ if (!local->child_up[i])
+ continue;
+
+ if (priv->anon_inode[i]) {
+ mkdir_on[i] = 0;
+ } else {
+ mkdir_on[i] = 1;
+ mkdir_count++;
+ }
+ }
+
+ if (mkdir_count == 0) {
+ *linked_inode = inode_find(this->itable, anon_inode_gfid);
+ if (*linked_inode) {
+ op_errno = 0;
+ goto out;
+ }
+ }
+
+ loc.parent = inode_ref(this->itable->root);
+ loc.name = priv->anon_inode_name;
+ loc.inode = inode_new(this->itable);
+ if (!loc.inode) {
+ op_errno = ENOMEM;
+ goto out;
+ }
+
+ xdata = dict_new();
+ if (!xdata) {
+ op_errno = ENOMEM;
+ goto out;
+ }
+
+ op_errno = -dict_set_gfuuid(xdata, "gfid-req", anon_inode_gfid, _gf_true);
+ if (op_errno) {
+ goto out;
+ }
+
+ if (mkdir_count == 0) {
+ memcpy(lookup_on, local->child_up, priv->child_count);
+ goto lookup;
+ }
+
+ AFR_ONLIST(mkdir_on, frame, afr_anon_inode_mkdir_cbk, mkdir, &loc, 0755, 0,
+ xdata);
+
+ for (i = 0; i < priv->child_count; i++) {
+ if (!mkdir_on[i]) {
+ continue;
+ }
+
+ if (local->replies[i].op_ret == 0) {
+ priv->anon_inode[i] = 1;
+ iatt = local->replies[i].poststat;
+ } else if (local->replies[i].op_ret < 0 &&
+ local->replies[i].op_errno == EEXIST) {
+ lookup_on[i] = 1;
+ } else if (i == child) {
+ child_op_errno = local->replies[i].op_errno;
+ }
+ }
+
+ if (AFR_COUNT(lookup_on, priv->child_count) == 0) {
+ goto link;
+ }
+
+lookup:
+ AFR_ONLIST(lookup_on, frame, afr_selfheal_discover_cbk, lookup, &loc,
+ xdata);
+ for (i = 0; i < priv->child_count; i++) {
+ if (!lookup_on[i]) {
+ continue;
+ }
+
+ if (local->replies[i].op_ret == 0) {
+ if (gf_uuid_compare(anon_inode_gfid,
+ local->replies[i].poststat.ia_gfid) == 0) {
+ priv->anon_inode[i] = 1;
+ iatt = local->replies[i].poststat;
+ } else {
+ if (i == child)
+ child_op_errno = EINVAL;
+ gf_msg(this->name, GF_LOG_ERROR, 0, AFR_MSG_INVALID_DATA,
+ "%s has gfid: %s", priv->anon_inode_name,
+ uuid_utoa(local->replies[i].poststat.ia_gfid));
+ }
+ } else if (i == child) {
+ child_op_errno = local->replies[i].op_errno;
+ }
+ }
+link:
+ if (!gf_uuid_is_null(iatt.ia_gfid)) {
+ *linked_inode = inode_link(loc.inode, loc.parent, loc.name, &iatt);
+ if (*linked_inode) {
+ op_errno = 0;
+ inode_lookup(*linked_inode);
+ } else {
+ op_errno = ENOMEM;
+ }
+ goto out;
+ }
+
+out:
+ if (xdata)
+ dict_unref(xdata);
+ loc_wipe(&loc);
+ /*child_op_errno takes precedence*/
+ if (child_op_errno == 0) {
+ child_op_errno = op_errno;
+ }
+
+ if (child_op_errno && *linked_inode) {
+ inode_unref(*linked_inode);
+ *linked_inode = NULL;
+ }
+ if (frame)
+ AFR_STACK_DESTROY(frame);
+ return -child_op_errno;
}
diff --git a/xlators/cluster/afr/src/afr-self-heal-data.c b/xlators/cluster/afr/src/afr-self-heal-data.c
index 27d2849c158..37bcc2b3f9e 100644
--- a/xlators/cluster/afr/src/afr-self-heal-data.c
+++ b/xlators/cluster/afr/src/afr-self-heal-data.c
@@ -8,615 +8,592 @@
cases as published by the Free Software Foundation.
*/
-
#include "afr.h"
#include "afr-self-heal.h"
-#include "byte-order.h"
+#include <glusterfs/byte-order.h>
#include "protocol-common.h"
#include "afr-messages.h"
-
-enum {
- AFR_SELFHEAL_DATA_FULL = 0,
- AFR_SELFHEAL_DATA_DIFF,
-};
-
+#include <glusterfs/events.h>
#define HAS_HOLES(i) ((i->ia_blocks * 512) < (i->ia_size))
static int
-__checksum_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
- int op_ret, int op_errno, uint32_t weak, uint8_t *strong,
- dict_t *xdata)
+__checksum_cbk(call_frame_t *frame, void *cookie, xlator_t *this, int op_ret,
+ int op_errno, uint32_t weak, uint8_t *strong, dict_t *xdata)
{
- afr_local_t *local = NULL;
- struct afr_reply *replies = NULL;
- int i = (long) cookie;
-
- local = frame->local;
- replies = local->replies;
-
- replies[i].valid = 1;
- replies[i].op_ret = op_ret;
- replies[i].op_errno = op_errno;
- if (xdata)
- replies[i].buf_has_zeroes = dict_get_str_boolean (xdata,
- "buf-has-zeroes", _gf_false);
- if (strong)
- memcpy (local->replies[i].checksum, strong, MD5_DIGEST_LENGTH);
-
- syncbarrier_wake (&local->barrier);
- return 0;
-}
-
-
-static int
-attr_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
- int op_ret, int op_errno, struct iatt *pre, struct iatt *post,
- dict_t *xdata)
-{
- int i = (long) cookie;
- afr_local_t *local = NULL;
-
- local = frame->local;
-
- local->replies[i].valid = 1;
- local->replies[i].op_ret = op_ret;
- local->replies[i].op_errno = op_errno;
- if (pre)
- local->replies[i].prestat = *pre;
- if (post)
- local->replies[i].poststat = *post;
- if (xdata)
- local->replies[i].xdata = dict_ref (xdata);
-
- syncbarrier_wake (&local->barrier);
+ afr_local_t *local = NULL;
+ struct afr_reply *replies = NULL;
+ int i = (long)cookie;
+
+ local = frame->local;
+ replies = local->replies;
+
+ replies[i].valid = 1;
+ replies[i].op_ret = op_ret;
+ replies[i].op_errno = op_errno;
+ if (xdata) {
+ replies[i].buf_has_zeroes = dict_get_str_boolean(
+ xdata, "buf-has-zeroes", _gf_false);
+ replies[i].fips_mode_rchecksum = dict_get_str_boolean(
+ xdata, "fips-mode-rchecksum", _gf_false);
+ }
+ if (strong) {
+ if (replies[i].fips_mode_rchecksum) {
+ memcpy(local->replies[i].checksum, strong, SHA256_DIGEST_LENGTH);
+ } else {
+ memcpy(local->replies[i].checksum, strong, MD5_DIGEST_LENGTH);
+ }
+ }
- return 0;
+ syncbarrier_wake(&local->barrier);
+ return 0;
}
-
static gf_boolean_t
-__afr_can_skip_data_block_heal (call_frame_t *frame, xlator_t *this, fd_t *fd,
- int source, unsigned char *healed_sinks,
- off_t offset, size_t size,
- struct iatt *poststat)
+__afr_can_skip_data_block_heal(call_frame_t *frame, xlator_t *this, fd_t *fd,
+ int source, unsigned char *healed_sinks,
+ off_t offset, size_t size, struct iatt *poststat)
{
- afr_private_t *priv = NULL;
- afr_local_t *local = NULL;
- unsigned char *wind_subvols = NULL;
- gf_boolean_t checksum_match = _gf_true;
- dict_t *xdata = NULL;
- int i = 0;
-
- priv = this->private;
- local = frame->local;
- xdata = dict_new();
- if (xdata)
- i = dict_set_int32 (xdata, "check-zero-filled", 1);
- wind_subvols = alloca0 (priv->child_count);
- for (i = 0; i < priv->child_count; i++) {
- if (i == source || healed_sinks[i])
- wind_subvols[i] = 1;
- }
-
- AFR_ONLIST (wind_subvols, frame, __checksum_cbk, rchecksum, fd,
- offset, size, xdata);
- if (xdata)
- dict_unref (xdata);
-
- if (!local->replies[source].valid || local->replies[source].op_ret != 0)
- return _gf_false;
-
- for (i = 0; i < priv->child_count; i++) {
- if (i == source)
- continue;
- if (local->replies[i].valid) {
- if (memcmp (local->replies[source].checksum,
- local->replies[i].checksum,
- MD5_DIGEST_LENGTH)) {
- checksum_match = _gf_false;
- break;
- }
- }
- }
-
- if (checksum_match) {
- if (HAS_HOLES (poststat))
- return _gf_true;
-
- /* For non-sparse files, we might be better off writing the
- * zeroes to sinks to avoid mismatch of disk-usage in bricks. */
- if (local->replies[source].buf_has_zeroes)
- return _gf_false;
- else
- return _gf_true;
+ afr_private_t *priv = NULL;
+ afr_local_t *local = NULL;
+ unsigned char *wind_subvols = NULL;
+ gf_boolean_t checksum_match = _gf_true;
+ struct afr_reply *replies = NULL;
+ dict_t *xdata = NULL;
+ int i = 0;
+
+ priv = this->private;
+ local = frame->local;
+ replies = local->replies;
+
+ xdata = dict_new();
+ if (!xdata)
+ goto out;
+ if (dict_set_int32_sizen(xdata, "check-zero-filled", 1)) {
+ dict_unref(xdata);
+ goto out;
+ }
+
+ wind_subvols = alloca0(priv->child_count);
+ for (i = 0; i < priv->child_count; i++) {
+ if (i == source || healed_sinks[i])
+ wind_subvols[i] = 1;
+ }
+
+ AFR_ONLIST(wind_subvols, frame, __checksum_cbk, rchecksum, fd, offset, size,
+ xdata);
+ if (xdata)
+ dict_unref(xdata);
+
+ if (!replies[source].valid || replies[source].op_ret != 0)
+ return _gf_false;
+
+ for (i = 0; i < priv->child_count; i++) {
+ if (i == source)
+ continue;
+ if (replies[i].valid) {
+ if (memcmp(replies[source].checksum, replies[i].checksum,
+ replies[source].fips_mode_rchecksum
+ ? SHA256_DIGEST_LENGTH
+ : MD5_DIGEST_LENGTH)) {
+ checksum_match = _gf_false;
+ break;
+ }
}
+ }
- return _gf_false;
-}
+ if (checksum_match) {
+ if (HAS_HOLES(poststat))
+ return _gf_true;
+ /* For non-sparse files, we might be better off writing the
+ * zeroes to sinks to avoid mismatch of disk-usage in bricks. */
+ if (local->replies[source].buf_has_zeroes)
+ return _gf_false;
+ else
+ return _gf_true;
+ }
+out:
+ return _gf_false;
+}
static gf_boolean_t
-__afr_is_sink_zero_filled (xlator_t *this, fd_t *fd, size_t size,
- off_t offset, int sink)
+__afr_is_sink_zero_filled(xlator_t *this, fd_t *fd, size_t size, off_t offset,
+ int sink)
{
- afr_private_t *priv = NULL;
- struct iobref *iobref = NULL;
- struct iovec *iovec = NULL;
- int count = 0;
- int ret = 0;
- gf_boolean_t zero_filled = _gf_false;
-
- priv = this->private;
- ret = syncop_readv (priv->children[sink], fd, size, offset, 0, &iovec,
- &count, &iobref, NULL, NULL);
- if (ret < 0)
- goto out;
- ret = iov_0filled (iovec, count);
- if (!ret)
- zero_filled = _gf_true;
+ afr_private_t *priv = NULL;
+ struct iobref *iobref = NULL;
+ struct iovec *iovec = NULL;
+ int count = 0;
+ int ret = 0;
+ gf_boolean_t zero_filled = _gf_false;
+
+ priv = this->private;
+ ret = syncop_readv(priv->children[sink], fd, size, offset, 0, &iovec,
+ &count, &iobref, NULL, NULL, NULL);
+ if (ret < 0)
+ goto out;
+ ret = iov_0filled(iovec, count);
+ if (!ret)
+ zero_filled = _gf_true;
out:
- if (iovec)
- GF_FREE (iovec);
- if (iobref)
- iobref_unref (iobref);
- return zero_filled;
+ if (iovec)
+ GF_FREE(iovec);
+ if (iobref)
+ iobref_unref(iobref);
+ return zero_filled;
}
static int
-__afr_selfheal_data_read_write (call_frame_t *frame, xlator_t *this, fd_t *fd,
- int source, unsigned char *healed_sinks,
- off_t offset, size_t size,
- struct afr_reply *replies, int type)
+__afr_selfheal_data_read_write(call_frame_t *frame, xlator_t *this, fd_t *fd,
+ int source, unsigned char *healed_sinks,
+ off_t offset, size_t size,
+ struct afr_reply *replies, int type)
{
- struct iovec *iovec = NULL;
- int count = 0;
- struct iobref *iobref = NULL;
- int ret = 0;
- int i = 0;
- afr_private_t *priv = NULL;
-
- priv = this->private;
-
- ret = syncop_readv (priv->children[source], fd, size, offset, 0,
- &iovec, &count, &iobref, NULL, NULL);
- if (ret <= 0)
- return ret;
-
- for (i = 0; i < priv->child_count; i++) {
- if (!healed_sinks[i])
- continue;
-
- /*
- * TODO: Use fiemap() and discard() to heal holes
- * in the future.
- *
- * For now,
- *
- * - if the source had any holes at all,
- * AND
- * - if we are writing past the original file size
- * of the sink
- * AND
- * - is NOT the last block of the source file. if
- * the block contains EOF, it has to be written
- * in order to set the file size even if the
- * last block is 0-filled.
- * AND
- * - if the read buffer is filled with only 0's
- *
- * then, skip writing to this source. We don't depend
- * on the write to happen to update the size as we
- * have performed an ftruncate() upfront anyways.
- */
-#define is_last_block(o,b,s) ((s >= o) && (s <= (o + b)))
- if (HAS_HOLES ((&replies[source].poststat)) &&
- offset >= replies[i].poststat.ia_size &&
- !is_last_block (offset, size,
- replies[source].poststat.ia_size) &&
- (iov_0filled (iovec, count) == 0))
- continue;
-
- /* Avoid filling up sparse regions of the sink with 0-filled
- * writes.*/
- if (type == AFR_SELFHEAL_DATA_FULL &&
- HAS_HOLES ((&replies[source].poststat)) &&
- ((offset + size) <= replies[i].poststat.ia_size) &&
- (iov_0filled (iovec, count) == 0) &&
- __afr_is_sink_zero_filled (this, fd, size, offset, i)) {
- continue;
- }
-
- ret = syncop_writev (priv->children[i], fd, iovec, count,
- offset, iobref, 0, NULL, NULL);
- if (ret != iov_length (iovec, count)) {
- /* write() failed on this sink. unset the corresponding
- member in sinks[] (which is healed_sinks[] in the
- caller) so that this server does NOT get considered
- as successfully healed.
- */
- healed_sinks[i] = 0;
- }
- }
- if (iovec)
- GF_FREE (iovec);
- if (iobref)
- iobref_unref (iobref);
-
- return ret;
+ struct iovec *iovec = NULL;
+ int count = 0;
+ struct iobref *iobref = NULL;
+ int ret = 0;
+ int i = 0;
+ afr_private_t *priv = NULL;
+
+ priv = this->private;
+
+ ret = syncop_readv(priv->children[source], fd, size, offset, 0, &iovec,
+ &count, &iobref, NULL, NULL, NULL);
+ if (ret <= 0)
+ return ret;
+
+ for (i = 0; i < priv->child_count; i++) {
+ if (!healed_sinks[i])
+ continue;
+
+ /*
+ * TODO: Use fiemap() and discard() to heal holes
+ * in the future.
+ *
+ * For now,
+ *
+ * - if the source had any holes at all,
+ * AND
+ * - if we are writing past the original file size
+ * of the sink
+ * AND
+ * - is NOT the last block of the source file. if
+ * the block contains EOF, it has to be written
+ * in order to set the file size even if the
+ * last block is 0-filled.
+ * AND
+ * - if the read buffer is filled with only 0's
+ *
+ * then, skip writing to this source. We don't depend
+ * on the write to happen to update the size as we
+ * have performed an ftruncate() upfront anyways.
+ */
+#define is_last_block(o, b, s) ((s >= o) && (s <= (o + b)))
+ if (HAS_HOLES((&replies[source].poststat)) &&
+ offset >= replies[i].poststat.ia_size &&
+ !is_last_block(offset, size, replies[source].poststat.ia_size) &&
+ (iov_0filled(iovec, count) == 0))
+ continue;
+
+ /* Avoid filling up sparse regions of the sink with 0-filled
+ * writes.*/
+ if (type == AFR_SELFHEAL_DATA_FULL &&
+ HAS_HOLES((&replies[source].poststat)) &&
+ ((offset + size) <= replies[i].poststat.ia_size) &&
+ (iov_0filled(iovec, count) == 0) &&
+ __afr_is_sink_zero_filled(this, fd, size, offset, i)) {
+ continue;
+ }
+
+ ret = syncop_writev(priv->children[i], fd, iovec, count, offset, iobref,
+ 0, NULL, NULL, NULL, NULL);
+ if (ret != iov_length(iovec, count)) {
+ /* write() failed on this sink. unset the corresponding
+ member in sinks[] (which is healed_sinks[] in the
+ caller) so that this server does NOT get considered
+ as successfully healed.
+ */
+ healed_sinks[i] = 0;
+ }
+ }
+ if (iovec)
+ GF_FREE(iovec);
+ if (iobref)
+ iobref_unref(iobref);
+
+ return ret;
}
-static int
-afr_selfheal_data_block (call_frame_t *frame, xlator_t *this, fd_t *fd,
- int source, unsigned char *healed_sinks, off_t offset,
- size_t size, int type, struct afr_reply *replies)
+static gf_boolean_t
+afr_source_sinks_locked(xlator_t *this, unsigned char *locked_on, int source,
+ unsigned char *healed_sinks)
{
- int ret = -1;
- int sink_count = 0;
- afr_private_t *priv = NULL;
- unsigned char *data_lock = NULL;
-
- priv = this->private;
- sink_count = AFR_COUNT (healed_sinks, priv->child_count);
- data_lock = alloca0 (priv->child_count);
-
- ret = afr_selfheal_inodelk (frame, this, fd->inode, this->name,
- offset, size, data_lock);
- {
- if (ret < sink_count) {
- ret = -ENOTCONN;
- goto unlock;
- }
-
- if (type == AFR_SELFHEAL_DATA_DIFF &&
- __afr_can_skip_data_block_heal (frame, this, fd, source,
- healed_sinks, offset, size,
- &replies[source].poststat)) {
- ret = 0;
- goto unlock;
- }
-
- ret = __afr_selfheal_data_read_write (frame, this, fd, source,
- healed_sinks, offset, size,
- replies, type);
- }
-unlock:
- afr_selfheal_uninodelk (frame, this, fd->inode, this->name,
- offset, size, data_lock);
- return ret;
-}
+ afr_private_t *priv = this->private;
+ int i = 0;
+ if (!locked_on[source])
+ return _gf_false;
+
+ for (i = 0; i < priv->child_count; i++) {
+ if (healed_sinks[i] && locked_on[i])
+ return _gf_true;
+ }
+ return _gf_false;
+}
static int
-afr_selfheal_data_fsync (call_frame_t *frame, xlator_t *this, fd_t *fd,
- unsigned char *healed_sinks)
+afr_selfheal_data_block(call_frame_t *frame, xlator_t *this, fd_t *fd,
+ int source, unsigned char *healed_sinks, off_t offset,
+ size_t size, int type, struct afr_reply *replies)
{
- afr_local_t *local = NULL;
- afr_private_t *priv = NULL;
- int i = 0;
-
- local = frame->local;
- priv = this->private;
-
- AFR_ONLIST (healed_sinks, frame, attr_cbk, fsync, fd, 0, NULL);
-
- for (i = 0; i < priv->child_count; i++)
- if (healed_sinks[i] && local->replies[i].op_ret != 0)
- /* fsync() failed. Do NOT consider this server
- as successfully healed. Mark it so.
- */
- healed_sinks[i] = 0;
- return 0;
-}
+ int ret = -1;
+ afr_private_t *priv = NULL;
+ unsigned char *data_lock = NULL;
+
+ priv = this->private;
+ data_lock = alloca0(priv->child_count);
+
+ ret = afr_selfheal_inodelk(frame, this, fd->inode, this->name, offset, size,
+ data_lock);
+ {
+ if (!afr_source_sinks_locked(this, data_lock, source, healed_sinks)) {
+ ret = -ENOTCONN;
+ goto unlock;
+ }
+
+ if (type == AFR_SELFHEAL_DATA_DIFF &&
+ __afr_can_skip_data_block_heal(frame, this, fd, source,
+ healed_sinks, offset, size,
+ &replies[source].poststat)) {
+ ret = 0;
+ goto unlock;
+ }
+ ret = __afr_selfheal_data_read_write(
+ frame, this, fd, source, healed_sinks, offset, size, replies, type);
+ }
+unlock:
+ afr_selfheal_uninodelk(frame, this, fd->inode, this->name, offset, size,
+ data_lock);
+ return ret;
+}
static int
-afr_selfheal_data_restore_time (call_frame_t *frame, xlator_t *this,
- inode_t *inode, int source,
- unsigned char *healed_sinks,
- struct afr_reply *replies)
+afr_selfheal_data_fsync(call_frame_t *frame, xlator_t *this, fd_t *fd,
+ unsigned char *healed_sinks)
{
- loc_t loc = {0, };
+ afr_local_t *local = NULL;
+ afr_private_t *priv = NULL;
+ int i = 0;
- loc.inode = inode_ref (inode);
- gf_uuid_copy (loc.gfid, inode->gfid);
+ local = frame->local;
+ priv = this->private;
- AFR_ONLIST (healed_sinks, frame, attr_cbk, setattr, &loc,
- &replies[source].poststat,
- (GF_SET_ATTR_ATIME|GF_SET_ATTR_MTIME), NULL);
+ if (!priv->ensure_durability)
+ return 0;
- loc_wipe (&loc);
+ AFR_ONLIST(healed_sinks, frame, afr_sh_generic_fop_cbk, fsync, fd, 0, NULL);
- return 0;
+ for (i = 0; i < priv->child_count; i++)
+ if (healed_sinks[i] && local->replies[i].op_ret != 0)
+ /* fsync() failed. Do NOT consider this server
+ as successfully healed. Mark it so.
+ */
+ healed_sinks[i] = 0;
+ return 0;
}
static int
-afr_data_self_heal_type_get (afr_private_t *priv, unsigned char *healed_sinks,
- int source, struct afr_reply *replies)
+afr_data_self_heal_type_get(afr_private_t *priv, unsigned char *healed_sinks,
+ int source, struct afr_reply *replies)
{
- int type = AFR_SELFHEAL_DATA_FULL;
- int i = 0;
-
- if (priv->data_self_heal_algorithm == NULL) {
- type = AFR_SELFHEAL_DATA_FULL;
- for (i = 0; i < priv->child_count; i++) {
- if (!healed_sinks[i] && i != source)
- continue;
- if (replies[i].poststat.ia_size) {
- type = AFR_SELFHEAL_DATA_DIFF;
- break;
- }
- }
- } else if (strcmp (priv->data_self_heal_algorithm, "full") == 0) {
- type = AFR_SELFHEAL_DATA_FULL;
- } else if (strcmp (priv->data_self_heal_algorithm, "diff") == 0) {
+ int type = AFR_SELFHEAL_DATA_FULL;
+ int i = 0;
+
+ if (priv->data_self_heal_algorithm == AFR_SELFHEAL_DATA_DYNAMIC) {
+ type = AFR_SELFHEAL_DATA_FULL;
+ for (i = 0; i < priv->child_count; i++) {
+ if (!healed_sinks[i] && i != source)
+ continue;
+ if (replies[i].poststat.ia_size) {
type = AFR_SELFHEAL_DATA_DIFF;
+ break;
+ }
}
- return type;
+ } else {
+ type = priv->data_self_heal_algorithm;
+ }
+ return type;
}
static int
-afr_selfheal_data_do (call_frame_t *frame, xlator_t *this, fd_t *fd,
- int source, unsigned char *healed_sinks,
- struct afr_reply *replies)
+afr_selfheal_data_do(call_frame_t *frame, xlator_t *this, fd_t *fd, int source,
+ unsigned char *healed_sinks, struct afr_reply *replies)
{
- afr_private_t *priv = NULL;
- off_t off = 0;
- size_t block = 128 * 1024;
- int type = AFR_SELFHEAL_DATA_FULL;
- int ret = -1;
- call_frame_t *iter_frame = NULL;
- unsigned char arbiter_sink_status = 0;
-
- priv = this->private;
- if (priv->arbiter_count) {
- arbiter_sink_status = healed_sinks[ARBITER_BRICK_INDEX];
- healed_sinks[ARBITER_BRICK_INDEX] = 0;
+ afr_private_t *priv = NULL;
+ off_t off = 0;
+ size_t block = 0;
+ int type = AFR_SELFHEAL_DATA_FULL;
+ int ret = -1;
+ call_frame_t *iter_frame = NULL;
+ unsigned char arbiter_sink_status = 0;
+
+ gf_msg(this->name, GF_LOG_INFO, 0, AFR_MSG_SELF_HEAL_INFO,
+ "performing data selfheal on %s", uuid_utoa(fd->inode->gfid));
+
+ priv = this->private;
+ if (priv->arbiter_count) {
+ arbiter_sink_status = healed_sinks[ARBITER_BRICK_INDEX];
+ healed_sinks[ARBITER_BRICK_INDEX] = 0;
+ }
+
+ block = 128 * 1024 * priv->data_self_heal_window_size;
+
+ type = afr_data_self_heal_type_get(priv, healed_sinks, source, replies);
+
+ iter_frame = afr_copy_frame(frame);
+ if (!iter_frame) {
+ ret = -ENOMEM;
+ goto out;
+ }
+
+ for (off = 0; off < replies[source].poststat.ia_size; off += block) {
+ if (AFR_COUNT(healed_sinks, priv->child_count) == 0) {
+ ret = -ENOTCONN;
+ goto out;
}
- type = afr_data_self_heal_type_get (priv, healed_sinks, source,
- replies);
+ ret = afr_selfheal_data_block(iter_frame, this, fd, source,
+ healed_sinks, off, block, type, replies);
+ if (ret < 0)
+ goto out;
- iter_frame = afr_copy_frame (frame);
- if (!iter_frame) {
- ret = -ENOMEM;
- goto out;
+ AFR_STACK_RESET(iter_frame);
+ if (iter_frame->local == NULL) {
+ ret = -ENOTCONN;
+ goto out;
}
+ }
- for (off = 0; off < replies[source].poststat.ia_size; off += block) {
- if (AFR_COUNT (healed_sinks, priv->child_count) == 0) {
- ret = -ENOTCONN;
- goto out;
- }
-
- ret = afr_selfheal_data_block (iter_frame, this, fd, source,
- healed_sinks, off, block, type,
- replies);
- if (ret < 0)
- goto out;
-
- AFR_STACK_RESET (iter_frame);
- if (iter_frame->local == NULL) {
- ret = -ENOTCONN;
- goto out;
- }
- }
-
- ret = afr_selfheal_data_fsync (frame, this, fd, healed_sinks);
+ ret = afr_selfheal_data_fsync(frame, this, fd, healed_sinks);
out:
- if (arbiter_sink_status)
- healed_sinks[ARBITER_BRICK_INDEX] = arbiter_sink_status;
+ if (arbiter_sink_status)
+ healed_sinks[ARBITER_BRICK_INDEX] = arbiter_sink_status;
- if (iter_frame)
- AFR_STACK_DESTROY (iter_frame);
- return ret;
+ if (iter_frame)
+ AFR_STACK_DESTROY(iter_frame);
+ return ret;
}
-
static int
-__afr_selfheal_truncate_sinks (call_frame_t *frame, xlator_t *this,
- fd_t *fd, unsigned char *healed_sinks,
- uint64_t size)
+__afr_selfheal_truncate_sinks(call_frame_t *frame, xlator_t *this, fd_t *fd,
+ unsigned char *healed_sinks, uint64_t size)
{
- afr_local_t *local = NULL;
- afr_private_t *priv = NULL;
- unsigned char arbiter_sink_status = 0;
- int i = 0;
-
- local = frame->local;
- priv = this->private;
-
- if (priv->arbiter_count) {
- arbiter_sink_status = healed_sinks[ARBITER_BRICK_INDEX];
- healed_sinks[ARBITER_BRICK_INDEX] = 0;
- }
-
- AFR_ONLIST (healed_sinks, frame, attr_cbk, ftruncate, fd, size, NULL);
-
- for (i = 0; i < priv->child_count; i++)
- if (healed_sinks[i] && local->replies[i].op_ret == -1)
- /* truncate() failed. Do NOT consider this server
- as successfully healed. Mark it so.
- */
- healed_sinks[i] = 0;
-
- if (arbiter_sink_status)
- healed_sinks[ARBITER_BRICK_INDEX] = arbiter_sink_status;
- return 0;
+ afr_local_t *local = NULL;
+ afr_private_t *priv = NULL;
+ int i = 0;
+
+ local = frame->local;
+ priv = this->private;
+
+ /* This will send truncate on the arbiter brick as well if it is marked as
+ * sink. If changelog is enabled on the volume it captures truncate as a
+ * data transactions on the arbiter brick. This will help geo-rep to
+ * properly sync the data from master to slave if arbiter is the ACTIVE
+ * brick during syncing and which had got some entries healed for data as
+ * part of self heal.
+ */
+ AFR_ONLIST(healed_sinks, frame, afr_sh_generic_fop_cbk, ftruncate, fd, size,
+ NULL);
+
+ for (i = 0; i < priv->child_count; i++)
+ if (healed_sinks[i] && local->replies[i].op_ret == -1)
+ /* truncate() failed. Do NOT consider this server
+ as successfully healed. Mark it so.
+ */
+ healed_sinks[i] = 0;
+
+ return 0;
}
gf_boolean_t
-afr_has_source_witnesses (xlator_t *this, unsigned char *sources,
- uint64_t *witness)
+afr_has_source_witnesses(xlator_t *this, unsigned char *sources,
+ uint64_t *witness)
{
- int i = 0;
- afr_private_t *priv = NULL;
+ int i = 0;
+ afr_private_t *priv = NULL;
- priv = this->private;
+ priv = this->private;
- for (i = 0; i < priv->child_count; i++) {
- if (sources[i] && witness[i])
- return _gf_true;
- }
- return _gf_false;
+ for (i = 0; i < priv->child_count; i++) {
+ if (sources[i] && witness[i])
+ return _gf_true;
+ }
+ return _gf_false;
}
static gf_boolean_t
-afr_does_size_mismatch (xlator_t *this, unsigned char *sources,
- struct afr_reply *replies)
+afr_does_size_mismatch(xlator_t *this, unsigned char *sources,
+ struct afr_reply *replies)
{
- int i = 0;
- afr_private_t *priv = NULL;
- struct iatt *min = NULL;
- struct iatt *max = NULL;
+ int i = 0;
+ afr_private_t *priv = NULL;
+ struct iatt *min = NULL;
+ struct iatt *max = NULL;
- priv = this->private;
+ priv = this->private;
- for (i = 0; i < priv->child_count; i++) {
- if (!replies[i].valid)
- continue;
+ for (i = 0; i < priv->child_count; i++) {
+ if (!replies[i].valid)
+ continue;
- if (replies[i].op_ret < 0)
- continue;
+ if (replies[i].op_ret < 0)
+ continue;
- if (!sources[i])
- continue;
+ if (!sources[i])
+ continue;
- if (AFR_IS_ARBITER_BRICK (priv, i) &&
- (replies[i].poststat.ia_size == 0))
- continue;
+ if (AFR_IS_ARBITER_BRICK(priv, i) && (replies[i].poststat.ia_size == 0))
+ continue;
- if (!min)
- min = &replies[i].poststat;
+ if (!min)
+ min = &replies[i].poststat;
- if (!max)
- max = &replies[i].poststat;
+ if (!max)
+ max = &replies[i].poststat;
- if (min->ia_size > replies[i].poststat.ia_size)
- min = &replies[i].poststat;
+ if (min->ia_size > replies[i].poststat.ia_size)
+ min = &replies[i].poststat;
- if (max->ia_size < replies[i].poststat.ia_size)
- max = &replies[i].poststat;
- }
+ if (max->ia_size < replies[i].poststat.ia_size)
+ max = &replies[i].poststat;
+ }
- if (min && max) {
- if (min->ia_size != max->ia_size)
- return _gf_true;
- }
+ if (min && max) {
+ if (min->ia_size != max->ia_size)
+ return _gf_true;
+ }
- return _gf_false;
+ return _gf_false;
}
static void
-afr_mark_biggest_witness_as_source (xlator_t *this, unsigned char *sources,
- uint64_t *witness)
+afr_mark_biggest_witness_as_source(xlator_t *this, unsigned char *sources,
+ uint64_t *witness)
{
- int i = 0;
- afr_private_t *priv = NULL;
- uint64_t biggest_witness = 0;
-
- priv = this->private;
- /* Find source with biggest witness count */
- for (i = 0; i < priv->child_count; i++) {
- if (!sources[i])
- continue;
- if (biggest_witness < witness[i])
- biggest_witness = witness[i];
- }
-
- /* Mark files with less witness count as not source */
- for (i = 0; i < priv->child_count; i++) {
- if (!sources[i])
- continue;
- if (witness[i] < biggest_witness)
- sources[i] = 0;
- }
-
- return;
+ int i = 0;
+ afr_private_t *priv = NULL;
+ uint64_t biggest_witness = 0;
+
+ priv = this->private;
+ /* Find source with biggest witness count */
+ for (i = 0; i < priv->child_count; i++) {
+ if (!sources[i])
+ continue;
+ if (biggest_witness < witness[i])
+ biggest_witness = witness[i];
+ }
+
+ /* Mark files with less witness count as not source */
+ for (i = 0; i < priv->child_count; i++) {
+ if (!sources[i])
+ continue;
+ if (witness[i] < biggest_witness)
+ sources[i] = 0;
+ }
+
+ return;
}
/* This is a tie breaker function. Only one source be assigned here */
static void
-afr_mark_newest_file_as_source (xlator_t *this, unsigned char *sources,
- struct afr_reply *replies)
+afr_mark_newest_file_as_source(xlator_t *this, unsigned char *sources,
+ struct afr_reply *replies)
{
- int i = 0;
- afr_private_t *priv = NULL;
- int source = -1;
- uint32_t max_ctime = 0;
-
- priv = this->private;
- /* Find source with latest ctime */
- for (i = 0; i < priv->child_count; i++) {
- if (!sources[i])
- continue;
-
- if (max_ctime <= replies[i].poststat.ia_ctime) {
- source = i;
- max_ctime = replies[i].poststat.ia_ctime;
- }
+ int i = 0;
+ afr_private_t *priv = NULL;
+ int source = -1;
+ uint32_t max_ctime = 0;
+
+ priv = this->private;
+ /* Find source with latest ctime */
+ for (i = 0; i < priv->child_count; i++) {
+ if (!sources[i])
+ continue;
+
+ if (max_ctime <= replies[i].poststat.ia_ctime) {
+ source = i;
+ max_ctime = replies[i].poststat.ia_ctime;
}
+ }
- /* Only mark one of the files as source to break ties */
- memset (sources, 0, sizeof (*sources) * priv->child_count);
- sources[source] = 1;
+ /* Only mark one of the files as source to break ties */
+ memset(sources, 0, sizeof(*sources) * priv->child_count);
+ sources[source] = 1;
}
static int
-__afr_selfheal_data_finalize_source (call_frame_t *frame, xlator_t *this,
- unsigned char *sources,
- unsigned char *sinks,
- unsigned char *healed_sinks,
- unsigned char *locked_on,
- struct afr_reply *replies,
- uint64_t *witness)
+__afr_selfheal_data_finalize_source(
+ call_frame_t *frame, xlator_t *this, inode_t *inode, unsigned char *sources,
+ unsigned char *sinks, unsigned char *healed_sinks, unsigned char *locked_on,
+ unsigned char *undid_pending, struct afr_reply *replies, uint64_t *witness)
{
- int i = 0;
- afr_private_t *priv = NULL;
- int source = -1;
- int sources_count = 0;
- priv = this->private;
-
- sources_count = AFR_COUNT (sources, priv->child_count);
-
- if ((AFR_CMP (locked_on, healed_sinks, priv->child_count) == 0)
- || !sources_count) {
- /* split brain */
- source = afr_mark_split_brain_source_sinks (frame, this,
- sources, sinks,
- healed_sinks,
- locked_on, replies,
- AFR_DATA_TRANSACTION);
- if (source < 0)
- return -EIO;
- return source;
- }
-
- /* No split brain at this point. If we were called from
- * afr_heal_splitbrain_file(), abort.*/
- if (afr_dict_contains_heal_op(frame))
- return -EIO;
-
- /* If there are no witnesses/size-mismatches on sources we are done*/
- if (!afr_does_size_mismatch (this, sources, replies) &&
- !afr_has_source_witnesses (this, sources, witness))
- goto out;
-
- afr_mark_largest_file_as_source (this, sources, replies);
- afr_mark_biggest_witness_as_source (this, sources, witness);
- afr_mark_newest_file_as_source (this, sources, replies);
+ afr_private_t *priv = NULL;
+ int source = -1;
+ int sources_count = 0;
+ priv = this->private;
+
+ sources_count = AFR_COUNT(sources, priv->child_count);
+
+ if ((AFR_CMP(locked_on, healed_sinks, priv->child_count) == 0) ||
+ !sources_count) {
+ /* split brain */
+ source = afr_mark_split_brain_source_sinks(
+ frame, this, inode, sources, sinks, healed_sinks, locked_on,
+ replies, AFR_DATA_TRANSACTION);
+ if (source < 0) {
+ gf_event(EVENT_AFR_SPLIT_BRAIN,
+ "client-pid=%d;"
+ "subvol=%s;type=data;"
+ "file=%s",
+ this->ctx->cmd_args.client_pid, this->name,
+ uuid_utoa(inode->gfid));
+ return -EIO;
+ }
+
+ _afr_fav_child_reset_sink_xattrs(
+ frame, this, inode, source, healed_sinks, undid_pending,
+ AFR_DATA_TRANSACTION, locked_on, replies);
+ goto out;
+ }
+
+ /* No split brain at this point. If we were called from
+ * afr_heal_splitbrain_file(), abort.*/
+ if (afr_dict_contains_heal_op(frame))
+ return -EIO;
+
+ /* If there are no witnesses/size-mismatches on sources we are done*/
+ if (!afr_does_size_mismatch(this, sources, replies) &&
+ !afr_has_source_witnesses(this, sources, witness))
+ goto out;
+
+ afr_mark_largest_file_as_source(this, sources, replies);
+ afr_mark_biggest_witness_as_source(this, sources, witness);
+ afr_mark_newest_file_as_source(this, sources, replies);
+ if (priv->arbiter_count)
+ /* Choose non-arbiter brick as source for empty files. */
+ afr_mark_source_sinks_if_file_empty(this, sources, sinks, healed_sinks,
+ locked_on, replies,
+ AFR_DATA_TRANSACTION);
out:
- afr_mark_active_sinks (this, sources, locked_on, healed_sinks);
+ afr_mark_active_sinks(this, sources, locked_on, healed_sinks);
+ source = afr_choose_source_by_policy(priv, sources, AFR_DATA_TRANSACTION);
- for (i = 0; i < priv->child_count; i++) {
- if (sources[i]) {
- source = i;
- break;
- }
- }
- return source;
+ return source;
}
/*
@@ -629,246 +606,286 @@ out:
* for self-healing, or -1 if no healing is necessary/split brain.
*/
int
-__afr_selfheal_data_prepare (call_frame_t *frame, xlator_t *this,
- inode_t *inode, unsigned char *locked_on,
- unsigned char *sources, unsigned char *sinks,
- unsigned char *healed_sinks,
- struct afr_reply *replies, gf_boolean_t *pflag)
+__afr_selfheal_data_prepare(call_frame_t *frame, xlator_t *this, inode_t *inode,
+ unsigned char *locked_on, unsigned char *sources,
+ unsigned char *sinks, unsigned char *healed_sinks,
+ unsigned char *undid_pending,
+ struct afr_reply *replies, unsigned char *pflag)
{
- int ret = -1;
- int source = -1;
- afr_private_t *priv = NULL;
- uint64_t *witness = NULL;
-
- priv = this->private;
-
- ret = afr_selfheal_unlocked_discover (frame, inode, inode->gfid,
- replies);
-
- if (ret)
- return ret;
-
- witness = alloca0(priv->child_count * sizeof (*witness));
- ret = afr_selfheal_find_direction (frame, this, replies,
- AFR_DATA_TRANSACTION,
- locked_on, sources, sinks, witness,
- pflag);
- if (ret)
- return ret;
-
- /* Initialize the healed_sinks[] array optimistically to
- the intersection of to-be-healed (i.e sinks[]) and
- the list of servers which are up (i.e locked_on[]).
- As we encounter failures in the healing process, we
- will unmark the respective servers in the healed_sinks[]
- array.
- */
- AFR_INTERSECT (healed_sinks, sinks, locked_on, priv->child_count);
-
- source = __afr_selfheal_data_finalize_source (frame, this, sources,
- sinks, healed_sinks,
- locked_on, replies,
- witness);
- if (source < 0)
- return -EIO;
-
- return source;
+ int ret = -1;
+ int source = -1;
+ afr_private_t *priv = NULL;
+ uint64_t *witness = NULL;
+
+ priv = this->private;
+
+ ret = afr_selfheal_unlocked_discover(frame, inode, inode->gfid, replies);
+
+ if (ret)
+ return ret;
+
+ witness = alloca0(priv->child_count * sizeof(*witness));
+ ret = afr_selfheal_find_direction(frame, this, replies,
+ AFR_DATA_TRANSACTION, locked_on, sources,
+ sinks, witness, pflag);
+ if (ret)
+ return ret;
+
+ /* Initialize the healed_sinks[] array optimistically to
+ the intersection of to-be-healed (i.e sinks[]) and
+ the list of servers which are up (i.e locked_on[]).
+ As we encounter failures in the healing process, we
+ will unmark the respective servers in the healed_sinks[]
+ array.
+ */
+ AFR_INTERSECT(healed_sinks, sinks, locked_on, priv->child_count);
+
+ source = __afr_selfheal_data_finalize_source(
+ frame, this, inode, sources, sinks, healed_sinks, locked_on,
+ undid_pending, replies, witness);
+ if (source < 0)
+ return -EIO;
+
+ return source;
}
-
static int
-__afr_selfheal_data (call_frame_t *frame, xlator_t *this, fd_t *fd,
- unsigned char *locked_on)
+__afr_selfheal_data(call_frame_t *frame, xlator_t *this, fd_t *fd,
+ unsigned char *locked_on)
{
- afr_private_t *priv = NULL;
- int ret = -1;
- unsigned char *sources = NULL;
- unsigned char *sinks = NULL;
- unsigned char *data_lock = NULL;
- unsigned char *healed_sinks = NULL;
- struct afr_reply *locked_replies = NULL;
- int source = -1;
- gf_boolean_t did_sh = _gf_true;
- gf_boolean_t is_arbiter_the_only_sink = _gf_false;
-
- priv = this->private;
-
- sources = alloca0 (priv->child_count);
- sinks = alloca0 (priv->child_count);
- healed_sinks = alloca0 (priv->child_count);
- data_lock = alloca0 (priv->child_count);
-
- locked_replies = alloca0 (sizeof (*locked_replies) * priv->child_count);
-
- ret = afr_selfheal_inodelk (frame, this, fd->inode, this->name, 0, 0,
- data_lock);
- {
- if (ret < AFR_SH_MIN_PARTICIPANTS) {
- gf_msg_debug (this->name, 0, "%s: Skipping "
- "self-heal as only %d number "
- "of subvolumes "
- "could be locked",
- uuid_utoa (fd->inode->gfid),
- ret);
- ret = -ENOTCONN;
- goto unlock;
- }
-
- ret = __afr_selfheal_data_prepare (frame, this, fd->inode,
- data_lock, sources, sinks,
- healed_sinks,
- locked_replies,
- NULL);
- if (ret < 0)
- goto unlock;
-
- if (AFR_COUNT(healed_sinks, priv->child_count) == 0) {
- did_sh = _gf_false;
- goto unlock;
- }
-
- source = ret;
-
- if (AFR_IS_ARBITER_BRICK(priv, source) &&
- AFR_COUNT (sources, priv->child_count) == 1) {
- did_sh = _gf_false;
- goto unlock;
- }
-
- if (priv->arbiter_count &&
- AFR_COUNT (healed_sinks, priv->child_count) == 1 &&
- healed_sinks[ARBITER_BRICK_INDEX]) {
- is_arbiter_the_only_sink = _gf_true;
- goto restore_time;
- }
-
- ret = __afr_selfheal_truncate_sinks (frame, this, fd, healed_sinks,
- locked_replies[source].poststat.ia_size);
- if (ret < 0)
- goto unlock;
-
- ret = 0;
-
- }
-unlock:
- afr_selfheal_uninodelk (frame, this, fd->inode, this->name, 0, 0,
- data_lock);
+ afr_private_t *priv = NULL;
+ int ret = -1;
+ unsigned char *sources = NULL;
+ unsigned char *sinks = NULL;
+ unsigned char *data_lock = NULL;
+ unsigned char *healed_sinks = NULL;
+ unsigned char *undid_pending = NULL;
+ struct afr_reply *locked_replies = NULL;
+ int source = -1;
+ gf_boolean_t did_sh = _gf_true;
+ gf_boolean_t is_arbiter_the_only_sink = _gf_false;
+ gf_boolean_t empty_file = _gf_false;
+
+ priv = this->private;
+
+ sources = alloca0(priv->child_count);
+ sinks = alloca0(priv->child_count);
+ healed_sinks = alloca0(priv->child_count);
+ data_lock = alloca0(priv->child_count);
+ undid_pending = alloca0(priv->child_count);
+
+ locked_replies = alloca0(sizeof(*locked_replies) * priv->child_count);
+
+ ret = afr_selfheal_inodelk(frame, this, fd->inode, this->name, 0, 0,
+ data_lock);
+ {
+ if (ret < priv->child_count) {
+ gf_msg_debug(this->name, 0,
+ "%s: Skipping "
+ "self-heal as only %d number "
+ "of subvolumes "
+ "could be locked",
+ uuid_utoa(fd->inode->gfid), ret);
+ ret = -ENOTCONN;
+ goto unlock;
+ }
+
+ ret = __afr_selfheal_data_prepare(frame, this, fd->inode, data_lock,
+ sources, sinks, healed_sinks,
+ undid_pending, locked_replies, NULL);
if (ret < 0)
- goto out;
+ goto unlock;
+
+ if (AFR_COUNT(healed_sinks, priv->child_count) == 0) {
+ did_sh = _gf_false;
+ goto unlock;
+ }
+
+ source = ret;
- if (!did_sh)
- goto out;
+ if (AFR_IS_ARBITER_BRICK(priv, source)) {
+ empty_file = afr_is_file_empty_on_all_children(priv,
+ locked_replies);
+ if (empty_file)
+ goto restore_time;
- ret = afr_selfheal_data_do (frame, this, fd, source, healed_sinks,
- locked_replies);
- if (ret)
- goto out;
+ did_sh = _gf_false;
+ goto unlock;
+ }
+
+ ret = __afr_selfheal_truncate_sinks(
+ frame, this, fd, healed_sinks,
+ locked_replies[source].poststat.ia_size);
+ if (ret < 0)
+ goto unlock;
+
+ if (priv->arbiter_count &&
+ AFR_COUNT(healed_sinks, priv->child_count) == 1 &&
+ healed_sinks[ARBITER_BRICK_INDEX]) {
+ is_arbiter_the_only_sink = _gf_true;
+ goto restore_time;
+ }
+ ret = 0;
+ }
+unlock:
+ afr_selfheal_uninodelk(frame, this, fd->inode, this->name, 0, 0, data_lock);
+ if (ret < 0)
+ goto out;
+
+ if (!did_sh)
+ goto out;
+
+ ret = afr_selfheal_data_do(frame, this, fd, source, healed_sinks,
+ locked_replies);
+ if (ret)
+ goto out;
restore_time:
- afr_selfheal_data_restore_time (frame, this, fd->inode, source,
- healed_sinks, locked_replies);
-
- if (!is_arbiter_the_only_sink) {
- ret = afr_selfheal_inodelk (frame, this, fd->inode, this->name,
- 0, 0, data_lock);
- if (ret < AFR_SH_MIN_PARTICIPANTS) {
- ret = -ENOTCONN;
- did_sh = _gf_false;
- goto skip_undo_pending;
- }
+ afr_selfheal_restore_time(frame, this, fd->inode, source, healed_sinks,
+ locked_replies);
+
+ if (!is_arbiter_the_only_sink && !empty_file) {
+ ret = afr_selfheal_inodelk(frame, this, fd->inode, this->name, 0, 0,
+ data_lock);
+ if (ret < priv->child_count) {
+ ret = -ENOTCONN;
+ did_sh = _gf_false;
+ goto skip_undo_pending;
}
- ret = afr_selfheal_undo_pending (frame, this, fd->inode,
- sources, sinks, healed_sinks,
- AFR_DATA_TRANSACTION,
- locked_replies, data_lock);
+ }
+ ret = afr_selfheal_undo_pending(
+ frame, this, fd->inode, sources, sinks, healed_sinks, undid_pending,
+ AFR_DATA_TRANSACTION, locked_replies, data_lock);
skip_undo_pending:
- afr_selfheal_uninodelk (frame, this, fd->inode, this->name, 0, 0,
- data_lock);
+ afr_selfheal_uninodelk(frame, this, fd->inode, this->name, 0, 0, data_lock);
out:
- if (did_sh)
- afr_log_selfheal (fd->inode->gfid, this, ret, "data", source,
- healed_sinks);
- else
- ret = 1;
+ if (did_sh)
+ afr_log_selfheal(fd->inode->gfid, this, ret, "data", source, sources,
+ healed_sinks);
+ else
+ ret = 1;
- if (locked_replies)
- afr_replies_wipe (locked_replies, priv->child_count);
+ if (locked_replies)
+ afr_replies_wipe(locked_replies, priv->child_count);
- return ret;
+ return ret;
}
+int
+afr_selfheal_data_open_cbk(call_frame_t *frame, void *cookie, xlator_t *this,
+ int32_t op_ret, int32_t op_errno, fd_t *fd,
+ dict_t *xdata)
+{
+ afr_local_t *local = NULL;
+ int i = (long)cookie;
+
+ local = frame->local;
+
+ local->replies[i].valid = 1;
+ local->replies[i].op_ret = op_ret;
+ local->replies[i].op_errno = op_errno;
+
+ syncbarrier_wake(&local->barrier);
+
+ return 0;
+}
int
-afr_selfheal_data_open (xlator_t *this, inode_t *inode, fd_t **fd)
+afr_selfheal_data_open(xlator_t *this, inode_t *inode, fd_t **fd)
{
- int ret = 0;
- fd_t *fd_tmp = NULL;
- loc_t loc = {0,};
-
- fd_tmp = fd_create (inode, 0);
- if (!fd_tmp)
- return -ENOMEM;
-
- loc.inode = inode_ref (inode);
- gf_uuid_copy (loc.gfid, inode->gfid);
-
- ret = syncop_open (this, &loc, O_RDWR|O_LARGEFILE, fd_tmp, NULL, NULL);
- if (ret < 0) {
- fd_unref (fd_tmp);
- loc_wipe (&loc);
- goto out;
- } else {
- fd_bind (fd_tmp);
- }
-
- *fd = fd_tmp;
+ int ret = 0;
+ fd_t *fd_tmp = NULL;
+ loc_t loc = {
+ 0,
+ };
+ call_frame_t *frame = NULL;
+ afr_local_t *local = NULL;
+ afr_private_t *priv = NULL;
+ int i = 0;
+
+ priv = this->private;
+
+ fd_tmp = fd_create(inode, 0);
+ if (!fd_tmp)
+ return -ENOMEM;
+
+ loc.inode = inode_ref(inode);
+ gf_uuid_copy(loc.gfid, inode->gfid);
+
+ frame = afr_frame_create(this, &ret);
+ if (!frame) {
+ ret = -ret;
+ fd_unref(fd_tmp);
+ goto out;
+ }
+ local = frame->local;
+
+ AFR_ONLIST(local->child_up, frame, afr_selfheal_data_open_cbk, open, &loc,
+ O_RDWR | O_LARGEFILE, fd_tmp, NULL);
+
+ ret = -ENOTCONN;
+ for (i = 0; i < priv->child_count; i++) {
+ if (!local->replies[i].valid)
+ continue;
+
+ if (local->replies[i].op_ret < 0) {
+ ret = -local->replies[i].op_errno;
+ continue;
+ }
+
+ ret = 0;
+ break;
+ }
+
+ if (ret < 0) {
+ fd_unref(fd_tmp);
+ goto out;
+ } else {
+ fd_bind(fd_tmp);
+ }
+
+ *fd = fd_tmp;
out:
- return ret;
+ loc_wipe(&loc);
+ if (frame)
+ AFR_STACK_DESTROY(frame);
+ return ret;
}
int
-afr_selfheal_data (call_frame_t *frame, xlator_t *this, inode_t *inode)
+afr_selfheal_data(call_frame_t *frame, xlator_t *this, fd_t *fd)
{
- afr_private_t *priv = NULL;
- unsigned char *locked_on = NULL;
- int ret = 0;
- fd_t *fd = NULL;
-
- priv = this->private;
-
- ret = afr_selfheal_data_open (this, inode, &fd);
- if (!fd) {
- gf_msg_debug (this->name, -ret, "%s: Failed to open",
- uuid_utoa (inode->gfid));
- return -EIO;
+ afr_private_t *priv = NULL;
+ unsigned char *locked_on = NULL;
+ int ret = 0;
+ inode_t *inode = fd->inode;
+
+ priv = this->private;
+
+ locked_on = alloca0(priv->child_count);
+
+ ret = afr_selfheal_tie_breaker_inodelk(frame, this, inode, priv->sh_domain,
+ 0, 0, locked_on);
+ {
+ if (ret < priv->child_count) {
+ gf_msg_debug(this->name, 0,
+ "%s: Skipping "
+ "self-heal as only %d number of "
+ "subvolumes could be locked",
+ uuid_utoa(fd->inode->gfid), ret);
+ /* Either less than two subvols available, or another
+ selfheal (from another server) is in progress. Skip
+ for now in any case there isn't anything to do.
+ */
+ ret = -ENOTCONN;
+ goto unlock;
}
- locked_on = alloca0 (priv->child_count);
-
- ret = afr_selfheal_tryinodelk (frame, this, inode, priv->sh_domain, 0, 0,
- locked_on);
- {
- if (ret < AFR_SH_MIN_PARTICIPANTS) {
- gf_msg_debug (this->name, 0, "%s: Skipping "
- "self-heal as only %d number of "
- "subvolumes could be locked",
- uuid_utoa (fd->inode->gfid),
- ret);
- /* Either less than two subvols available, or another
- selfheal (from another server) is in progress. Skip
- for now in any case there isn't anything to do.
- */
- ret = -ENOTCONN;
- goto unlock;
- }
-
- ret = __afr_selfheal_data (frame, this, fd, locked_on);
- }
+ ret = __afr_selfheal_data(frame, this, fd, locked_on);
+ }
unlock:
- afr_selfheal_uninodelk (frame, this, inode, priv->sh_domain, 0, 0, locked_on);
-
- if (fd)
- fd_unref (fd);
+ afr_selfheal_uninodelk(frame, this, inode, priv->sh_domain, 0, 0,
+ locked_on);
- return ret;
+ return ret;
}
diff --git a/xlators/cluster/afr/src/afr-self-heal-entry.c b/xlators/cluster/afr/src/afr-self-heal-entry.c
index 42a6c0453b1..64893f441e3 100644
--- a/xlators/cluster/afr/src/afr-self-heal-entry.c
+++ b/xlators/cluster/afr/src/afr-self-heal-entry.c
@@ -8,799 +8,1269 @@
cases as published by the Free Software Foundation.
*/
-
#include "afr.h"
#include "afr-self-heal.h"
-#include "byte-order.h"
+#include <glusterfs/byte-order.h>
#include "afr-transaction.h"
#include "afr-messages.h"
+#include <glusterfs/syncop-utils.h>
+#include <glusterfs/events.h>
-/* Max file name length is 255 this filename is of length 256. No file with
- * this name can ever come, entry-lock with this name is going to prevent
- * self-heals from older versions while the granular entry-self-heal is going
- * on in newer version.*/
-#define LONG_FILENAME "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa"\
- "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa"\
- "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa"\
- "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa"\
- "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa"
-
-static int
-afr_selfheal_entry_delete (xlator_t *this, inode_t *dir, const char *name,
- inode_t *inode, int child, struct afr_reply *replies)
+int
+afr_selfheal_entry_anon_inode(xlator_t *this, inode_t *dir, const char *name,
+ inode_t *inode, int child,
+ struct afr_reply *replies,
+ gf_boolean_t *anon_inode)
{
- afr_private_t *priv = NULL;
- xlator_t *subvol = NULL;
- int ret = 0;
- loc_t loc = {0, };
- char g[64];
-
- priv = this->private;
-
- subvol = priv->children[child];
-
- loc.parent = inode_ref (dir);
- gf_uuid_copy (loc.pargfid, dir->gfid);
- loc.name = name;
- loc.inode = inode_ref (inode);
-
- if (replies[child].valid && replies[child].op_ret == 0) {
- switch (replies[child].poststat.ia_type) {
- case IA_IFDIR:
- gf_msg (this->name, GF_LOG_WARNING, 0,
- AFR_MSG_EXPUNGING_FILE_OR_DIR,
- "expunging dir %s/%s (%s) on %s",
- uuid_utoa (dir->gfid), name,
- uuid_utoa_r (replies[child].poststat.ia_gfid, g),
- subvol->name);
- ret = syncop_rmdir (subvol, &loc, 1, NULL, NULL);
- break;
- default:
- gf_msg (this->name, GF_LOG_WARNING, 0,
- AFR_MSG_EXPUNGING_FILE_OR_DIR,
- "expunging file %s/%s (%s) on %s",
- uuid_utoa (dir->gfid), name,
- uuid_utoa_r (replies[child].poststat.ia_gfid, g),
- subvol->name);
- ret = syncop_unlink (subvol, &loc, NULL, NULL);
- break;
- }
- }
-
- loc_wipe (&loc);
-
- return ret;
-}
+ afr_private_t *priv = NULL;
+ afr_local_t *local = NULL;
+ xlator_t *subvol = NULL;
+ int ret = 0;
+ int i = 0;
+ char g[64] = {0};
+ unsigned char *lookup_success = NULL;
+ call_frame_t *frame = NULL;
+ loc_t loc2 = {
+ 0,
+ };
+ loc_t loc = {
+ 0,
+ };
+
+ priv = this->private;
+ subvol = priv->children[child];
+ lookup_success = alloca0(priv->child_count);
+ uuid_utoa_r(replies[child].poststat.ia_gfid, g);
+ loc.inode = inode_new(inode->table);
+ if (!loc.inode) {
+ ret = -ENOMEM;
+ goto out;
+ }
+
+ if (replies[child].poststat.ia_type == IA_IFDIR) {
+ /* This directory may have sub-directory hierarchy which may need to
+ * be preserved for subsequent heals. So unconditionally move the
+ * directory to anonymous-inode directory*/
+ *anon_inode = _gf_true;
+ goto anon_inode;
+ }
+
+ frame = afr_frame_create(this, &ret);
+ if (!frame) {
+ ret = -ret;
+ goto out;
+ }
+ local = frame->local;
+ gf_uuid_copy(loc.gfid, replies[child].poststat.ia_gfid);
+ AFR_ONLIST(local->child_up, frame, afr_selfheal_discover_cbk, lookup, &loc,
+ NULL);
+ for (i = 0; i < priv->child_count; i++) {
+ if (local->replies[i].op_ret == 0) {
+ lookup_success[i] = 1;
+ } else if (local->replies[i].op_errno != ENOENT &&
+ local->replies[i].op_errno != ESTALE) {
+ ret = -local->replies[i].op_errno;
+ }
+ }
+ if (priv->quorum_count) {
+ if (afr_has_quorum(lookup_success, this, NULL)) {
+ *anon_inode = _gf_true;
+ }
+ } else if (AFR_COUNT(lookup_success, priv->child_count) > 1) {
+ *anon_inode = _gf_true;
+ } else if (ret) {
+ goto out;
+ }
+
+anon_inode:
+ if (!*anon_inode) {
+ ret = 0;
+ goto out;
+ }
+
+ loc.parent = inode_ref(dir);
+ gf_uuid_copy(loc.pargfid, dir->gfid);
+ loc.name = name;
+
+ ret = afr_anon_inode_create(this, child, &loc2.parent);
+ if (ret < 0)
+ goto out;
+
+ loc2.name = g;
+ ret = syncop_rename(subvol, &loc, &loc2, NULL, NULL);
+ if (ret < 0) {
+ gf_msg(this->name, GF_LOG_WARNING, -ret, AFR_MSG_EXPUNGING_FILE_OR_DIR,
+ "Rename to %s dir %s/%s (%s) on %s failed",
+ priv->anon_inode_name, uuid_utoa(dir->gfid), name, g,
+ subvol->name);
+ } else {
+ gf_msg(this->name, GF_LOG_WARNING, 0, AFR_MSG_EXPUNGING_FILE_OR_DIR,
+ "Rename to %s dir %s/%s (%s) on %s successful",
+ priv->anon_inode_name, uuid_utoa(dir->gfid), name, g,
+ subvol->name);
+ }
+
+out:
+ loc_wipe(&loc);
+ loc_wipe(&loc2);
+ if (frame) {
+ AFR_STACK_DESTROY(frame);
+ }
+
+ return ret;
+}
int
-afr_selfheal_recreate_entry (xlator_t *this, int dst, int source, inode_t *dir,
- const char *name, inode_t *inode,
- struct afr_reply *replies,
- unsigned char *newentry)
+afr_selfheal_entry_delete(xlator_t *this, inode_t *dir, const char *name,
+ inode_t *inode, int child, struct afr_reply *replies)
{
- int ret = 0;
- loc_t loc = {0,};
- loc_t srcloc = {0,};
- afr_private_t *priv = NULL;
- dict_t *xdata = NULL;
- struct iatt *iatt = NULL;
- char *linkname = NULL;
- mode_t mode = 0;
- struct iatt newent = {0,};
- priv = this->private;
-
- xdata = dict_new();
- if (!xdata)
- return -ENOMEM;
-
- loc.parent = inode_ref (dir);
- gf_uuid_copy (loc.pargfid, dir->gfid);
- loc.name = name;
- loc.inode = inode_ref (inode);
-
- ret = afr_selfheal_entry_delete (this, dir, name, inode, dst, replies);
- if (ret)
- goto out;
-
- ret = dict_set_static_bin (xdata, "gfid-req",
- replies[source].poststat.ia_gfid, 16);
- if (ret)
- goto out;
-
- iatt = &replies[source].poststat;
-
- srcloc.inode = inode_ref (inode);
- gf_uuid_copy (srcloc.gfid, iatt->ia_gfid);
-
- mode = st_mode_from_ia (iatt->ia_prot, iatt->ia_type);
-
- switch (iatt->ia_type) {
- case IA_IFDIR:
- ret = syncop_mkdir (priv->children[dst], &loc, mode, 0,
- xdata, NULL);
- if (ret == 0)
- newentry[dst] = 1;
- break;
- case IA_IFLNK:
- ret = syncop_lookup (priv->children[dst], &srcloc, 0, 0, 0, 0);
- if (ret == 0) {
- ret = syncop_link (priv->children[dst], &srcloc, &loc,
- &newent, NULL, NULL);
- } else {
- ret = syncop_readlink (priv->children[source], &srcloc,
- &linkname, 4096, NULL, NULL);
- if (ret <= 0)
- goto out;
- ret = syncop_symlink (priv->children[dst], &loc,
- linkname, NULL, xdata, NULL);
- if (ret == 0)
- newentry[dst] = 1;
- }
- break;
- default:
- ret = dict_set_int32 (xdata, GLUSTERFS_INTERNAL_FOP_KEY, 1);
- if (ret)
- goto out;
- ret = syncop_mknod (priv->children[dst], &loc, mode,
- iatt->ia_rdev, &newent, xdata, NULL);
- if (ret == 0 && newent.ia_nlink == 1) {
- /* New entry created. Mark @dst pending on all sources */
- newentry[dst] = 1;
- }
- break;
- }
+ char g[64] = {0};
+ afr_private_t *priv = NULL;
+ xlator_t *subvol = NULL;
+ int ret = 0;
+ loc_t loc = {
+ 0,
+ };
+ gf_boolean_t anon_inode = _gf_false;
+
+ priv = this->private;
+ subvol = priv->children[child];
+
+ if ((!replies[child].valid) || (replies[child].op_ret < 0)) {
+ /*Nothing to do*/
+ ret = 0;
+ goto out;
+ }
+
+ if (priv->use_anon_inode) {
+ ret = afr_selfheal_entry_anon_inode(this, dir, name, inode, child,
+ replies, &anon_inode);
+ if (ret < 0 || anon_inode)
+ goto out;
+ }
+
+ loc.parent = inode_ref(dir);
+ loc.inode = inode_new(inode->table);
+ if (!loc.inode) {
+ ret = -ENOMEM;
+ goto out;
+ }
+ loc.name = name;
+ switch (replies[child].poststat.ia_type) {
+ case IA_IFDIR:
+ gf_msg(this->name, GF_LOG_WARNING, 0, AFR_MSG_EXPUNGING_FILE_OR_DIR,
+ "expunging dir %s/%s (%s) on %s", uuid_utoa(dir->gfid), name,
+ uuid_utoa_r(replies[child].poststat.ia_gfid, g),
+ subvol->name);
+ ret = syncop_rmdir(subvol, &loc, 1, NULL, NULL);
+ break;
+ default:
+ gf_msg(this->name, GF_LOG_WARNING, 0, AFR_MSG_EXPUNGING_FILE_OR_DIR,
+ "expunging file %s/%s (%s) on %s", uuid_utoa(dir->gfid),
+ name, uuid_utoa_r(replies[child].poststat.ia_gfid, g),
+ subvol->name);
+ ret = syncop_unlink(subvol, &loc, NULL, NULL);
+ break;
+ }
out:
- if (xdata)
- dict_unref (xdata);
- GF_FREE (linkname);
- loc_wipe (&loc);
- loc_wipe (&srcloc);
- return ret;
+ loc_wipe(&loc);
+ return ret;
}
+int
+afr_selfheal_recreate_entry(call_frame_t *frame, int dst, int source,
+ unsigned char *sources, inode_t *dir,
+ const char *name, inode_t *inode,
+ struct afr_reply *replies)
+{
+ int ret = 0;
+ loc_t loc = {
+ 0,
+ };
+ loc_t srcloc = {
+ 0,
+ };
+ loc_t anonloc = {
+ 0,
+ };
+ xlator_t *this = frame->this;
+ afr_private_t *priv = NULL;
+ dict_t *xdata = NULL;
+ struct iatt *iatt = NULL;
+ char *linkname = NULL;
+ mode_t mode = 0;
+ struct iatt newent = {
+ 0,
+ };
+ unsigned char *newentry = NULL;
+ char iatt_uuid_str[64] = {0};
+ char dir_uuid_str[64] = {0};
+
+ priv = this->private;
+ iatt = &replies[source].poststat;
+ uuid_utoa_r(iatt->ia_gfid, iatt_uuid_str);
+ if (iatt->ia_type == IA_INVAL || gf_uuid_is_null(iatt->ia_gfid)) {
+ gf_msg(this->name, GF_LOG_ERROR, 0, AFR_MSG_SELF_HEAL_FAILED,
+ "Invalid ia_type (%d) or gfid(%s). source brick=%d, "
+ "pargfid=%s, name=%s",
+ iatt->ia_type, iatt_uuid_str, source,
+ uuid_utoa_r(dir->gfid, dir_uuid_str), name);
+ ret = -EINVAL;
+ goto out;
+ }
+
+ xdata = dict_new();
+ if (!xdata)
+ return -ENOMEM;
+ newentry = alloca0(priv->child_count);
+ loc.parent = inode_ref(dir);
+ gf_uuid_copy(loc.pargfid, dir->gfid);
+ loc.name = name;
+ loc.inode = inode_ref(inode);
+
+ ret = afr_selfheal_entry_delete(this, dir, name, inode, dst, replies);
+ if (ret)
+ goto out;
+
+ ret = dict_set_gfuuid(xdata, "gfid-req", replies[source].poststat.ia_gfid,
+ true);
+ if (ret)
+ goto out;
+
+ srcloc.inode = inode_ref(inode);
+ gf_uuid_copy(srcloc.gfid, iatt->ia_gfid);
+ ret = syncop_lookup(priv->children[dst], &srcloc, 0, 0, 0, 0);
+ if (ret == -ENOENT || ret == -ESTALE) {
+ newentry[dst] = 1;
+ ret = afr_selfheal_newentry_mark(frame, this, inode, source, replies,
+ sources, newentry);
+ if (ret)
+ goto out;
+ } else if (ret == 0 && iatt->ia_type == IA_IFDIR && priv->use_anon_inode) {
+ // Try rename from hidden directory
+ ret = afr_anon_inode_create(this, dst, &anonloc.parent);
+ if (ret < 0)
+ goto out;
+ anonloc.inode = inode_ref(inode);
+ anonloc.name = iatt_uuid_str;
+ ret = syncop_rename(priv->children[dst], &anonloc, &loc, NULL, NULL);
+ if (ret == -ENOENT || ret == -ESTALE)
+ ret = -1; /*This sets 'mismatch' to true*/
+ goto out;
+ }
+
+ mode = st_mode_from_ia(iatt->ia_prot, iatt->ia_type);
+
+ switch (iatt->ia_type) {
+ case IA_IFDIR:
+ ret = syncop_mkdir(priv->children[dst], &loc, mode, 0, xdata, NULL);
+ break;
+ case IA_IFLNK:
+ if (!newentry[dst]) {
+ ret = syncop_link(priv->children[dst], &srcloc, &loc, &newent,
+ NULL, NULL);
+ } else {
+ ret = syncop_readlink(priv->children[source], &srcloc,
+ &linkname, 4096, NULL, NULL);
+ if (ret <= 0)
+ goto out;
+ ret = syncop_symlink(priv->children[dst], &loc, linkname, NULL,
+ xdata, NULL);
+ }
+ break;
+ default:
+ ret = dict_set_int32_sizen(xdata, GLUSTERFS_INTERNAL_FOP_KEY, 1);
+ if (ret)
+ goto out;
+ ret = syncop_mknod(
+ priv->children[dst], &loc, mode,
+ makedev(ia_major(iatt->ia_rdev), ia_minor(iatt->ia_rdev)),
+ &newent, xdata, NULL);
+ break;
+ }
+
+out:
+ if (xdata)
+ dict_unref(xdata);
+ GF_FREE(linkname);
+ loc_wipe(&loc);
+ loc_wipe(&srcloc);
+ loc_wipe(&anonloc);
+ return ret;
+}
static int
-__afr_selfheal_heal_dirent (call_frame_t *frame, xlator_t *this, fd_t *fd,
- char *name, inode_t *inode, int source,
- unsigned char *sources, unsigned char *healed_sinks,
- unsigned char *locked_on, struct afr_reply *replies)
+__afr_selfheal_heal_dirent(call_frame_t *frame, xlator_t *this, fd_t *fd,
+ char *name, inode_t *inode, int source,
+ unsigned char *sources, unsigned char *healed_sinks,
+ unsigned char *locked_on, struct afr_reply *replies)
{
- int ret = 0;
- afr_private_t *priv = NULL;
- int i = 0;
- unsigned char *newentry = NULL;
+ int ret = 0;
+ afr_private_t *priv = NULL;
+ int i = 0;
- priv = this->private;
+ priv = this->private;
- newentry = alloca0 (priv->child_count);
+ if (!replies[source].valid)
+ return -EIO;
- if (!replies[source].valid)
- return -EIO;
+ /* Skip healing this entry if the last lookup on it failed for reasons
+ * other than ENOENT.
+ */
+ if ((replies[source].op_ret < 0) && (replies[source].op_errno != ENOENT))
+ return -replies[source].op_errno;
- /* Skip healing this entry if the last lookup on it failed for reasons
- * other than ENOENT.
- */
- if ((replies[source].op_ret < 0) &&
- (replies[source].op_errno != ENOENT))
- return -replies[source].op_errno;
-
- for (i = 0; i < priv->child_count; i++) {
- if (!healed_sinks[i])
- continue;
- if (replies[source].op_ret == -1 &&
- replies[source].op_errno == ENOENT) {
- ret = afr_selfheal_entry_delete (this, fd->inode, name,
- inode, i, replies);
- } else {
- if (!gf_uuid_compare (replies[i].poststat.ia_gfid,
- replies[source].poststat.ia_gfid))
- continue;
-
- ret = afr_selfheal_recreate_entry (this, i, source,
- fd->inode, name, inode,
- replies, newentry);
- }
- if (ret < 0)
- break;
- }
-
- if (AFR_COUNT (newentry, priv->child_count))
- afr_selfheal_newentry_mark (frame, this, inode, source, replies,
- sources, newentry);
- return ret;
+ if (replies[source].op_ret == 0) {
+ ret = afr_lookup_and_heal_gfid(this, fd->inode, name, inode, replies,
+ source, sources,
+ &replies[source].poststat.ia_gfid, NULL);
+ if (ret)
+ return ret;
+ }
+
+ for (i = 0; i < priv->child_count; i++) {
+ if (!healed_sinks[i])
+ continue;
+ if (replies[source].op_ret == -1 &&
+ replies[source].op_errno == ENOENT) {
+ ret = afr_selfheal_entry_delete(this, fd->inode, name, inode, i,
+ replies);
+ } else {
+ if (!gf_uuid_compare(replies[i].poststat.ia_gfid,
+ replies[source].poststat.ia_gfid))
+ continue;
+
+ ret = afr_selfheal_recreate_entry(frame, i, source, sources,
+ fd->inode, name, inode, replies);
+ }
+ if (ret < 0)
+ break;
+ }
+
+ return ret;
}
static int
-afr_selfheal_detect_gfid_and_type_mismatch (xlator_t *this,
- struct afr_reply *replies,
- uuid_t pargfid, char *bname,
- int src_idx)
+afr_selfheal_detect_gfid_and_type_mismatch(xlator_t *this,
+ struct afr_reply *replies,
+ inode_t *inode, uuid_t pargfid,
+ char *bname, int src_idx,
+ unsigned char *locked_on, int *src)
{
- int i = 0;
- char g1[64] = {0,};
- char g2[64] = {0,};
- afr_private_t *priv = NULL;
+ int i = 0;
+ int ret = -1;
+ afr_private_t *priv = NULL;
+ void *gfid = NULL;
+ ia_type_t ia_type = IA_INVAL;
- priv = this->private;
+ priv = this->private;
+ gfid = &replies[src_idx].poststat.ia_gfid;
+ ia_type = replies[src_idx].poststat.ia_type;
- for (i = 0; i < priv->child_count; i++) {
- if (i == src_idx)
- continue;
-
- if (!replies[i].valid)
- continue;
-
- if (replies[i].op_ret != 0)
- continue;
-
- if (gf_uuid_compare (replies[src_idx].poststat.ia_gfid,
- replies[i].poststat.ia_gfid)) {
- gf_msg (this->name, GF_LOG_ERROR, 0,
- AFR_MSG_SPLIT_BRAIN, "Gfid mismatch "
- "detected for <%s/%s>, %s on %s and %s on %s. "
- "Skipping conservative merge on the file.",
- uuid_utoa (pargfid), bname,
- uuid_utoa_r (replies[i].poststat.ia_gfid, g1),
- priv->children[i]->name,
- uuid_utoa_r (replies[src_idx].poststat.ia_gfid,
- g2), priv->children[src_idx]->name);
- return -1;
- }
-
- if ((replies[src_idx].poststat.ia_type) !=
- (replies[i].poststat.ia_type)) {
- gf_msg (this->name, GF_LOG_ERROR, 0,
- AFR_MSG_SPLIT_BRAIN, "Type mismatch "
- "detected for <%s/%s>, %d on %s and %d on %s. "
- "Skipping conservative merge on the file.",
- uuid_utoa (pargfid), bname,
- replies[i].poststat.ia_type,
- priv->children[i]->name,
- replies[src_idx].poststat.ia_type,
- priv->children[src_idx]->name);
- return -1;
- }
+ for (i = 0; i < priv->child_count; i++) {
+ if (i == src_idx)
+ continue;
+
+ if (!replies[i].valid)
+ continue;
+
+ if (replies[i].op_ret != 0)
+ continue;
+
+ if (gf_uuid_is_null(replies[i].poststat.ia_gfid))
+ continue;
+
+ if (replies[i].poststat.ia_type == IA_INVAL)
+ continue;
+
+ if (ia_type == IA_INVAL || gf_uuid_is_null(gfid)) {
+ src_idx = i;
+ ia_type = replies[src_idx].poststat.ia_type;
+ gfid = &replies[src_idx].poststat.ia_gfid;
+ continue;
}
- return 0;
+ if (gf_uuid_compare(gfid, replies[i].poststat.ia_gfid) &&
+ (ia_type == replies[i].poststat.ia_type)) {
+ ret = afr_gfid_split_brain_source(this, replies, inode, pargfid,
+ bname, src_idx, i, locked_on, src,
+ NULL);
+ if (ret)
+ gf_msg(this->name, GF_LOG_ERROR, 0, AFR_MSG_SPLIT_BRAIN,
+ "Skipping conservative merge on the "
+ "file.");
+ return ret;
+ }
+
+ if (ia_type != replies[i].poststat.ia_type) {
+ gf_msg(this->name, GF_LOG_ERROR, 0, AFR_MSG_SPLIT_BRAIN,
+ "Type mismatch detected "
+ "for <gfid:%s>/%s>, %s on %s and %s on %s. "
+ "Skipping conservative merge on the file.",
+ uuid_utoa(pargfid), bname,
+ gf_inode_type_to_str(replies[i].poststat.ia_type),
+ priv->children[i]->name,
+ gf_inode_type_to_str(replies[src_idx].poststat.ia_type),
+ priv->children[src_idx]->name);
+ gf_event(EVENT_AFR_SPLIT_BRAIN,
+ "client-pid=%d;"
+ "subvol=%s;type=file;"
+ "file=<gfid:%s>/%s>;count=2;child-%d=%s;type-"
+ "%d=%s;child-%d=%s;type-%d=%s",
+ this->ctx->cmd_args.client_pid, this->name,
+ uuid_utoa(pargfid), bname, i, priv->children[i]->name, i,
+ gf_inode_type_to_str(replies[i].poststat.ia_type), src_idx,
+ priv->children[src_idx]->name, src_idx,
+ gf_inode_type_to_str(replies[src_idx].poststat.ia_type));
+ return -1;
+ }
+ }
+
+ return 0;
}
static int
-__afr_selfheal_merge_dirent (call_frame_t *frame, xlator_t *this, fd_t *fd,
- char *name, inode_t *inode, unsigned char *sources,
- unsigned char *healed_sinks, unsigned char *locked_on,
- struct afr_reply *replies)
+__afr_selfheal_merge_dirent(call_frame_t *frame, xlator_t *this, fd_t *fd,
+ char *name, inode_t *inode, unsigned char *sources,
+ unsigned char *healed_sinks,
+ unsigned char *locked_on, struct afr_reply *replies)
{
- int ret = 0;
- int i = 0;
- int source = -1;
- unsigned char *newentry = NULL;
- afr_private_t *priv = NULL;
-
- priv = this->private;
-
- newentry = alloca0 (priv->child_count);
-
- for (i = 0; i < priv->child_count; i++) {
- if (replies[i].valid && replies[i].op_ret == 0) {
- source = i;
- break;
- }
- }
-
- if (source == -1) {
- /* entry got deleted in the mean time? */
- return 0;
- }
-
- /* Set all the sources as 1, otheriwse newentry_mark won't be set */
- for (i = 0; i < priv->child_count; i++) {
- if (replies[i].valid && replies[i].op_ret == 0) {
- sources[i] = 1;
- }
- }
-
- /* In case of a gfid or type mismatch on the entry, return -1.*/
- ret = afr_selfheal_detect_gfid_and_type_mismatch (this, replies,
- fd->inode->gfid,
- name, source);
+ int ret = 0;
+ int i = 0;
+ int source = -1;
+ int src = -1;
+ afr_private_t *priv = NULL;
+
+ priv = this->private;
+
+ for (i = 0; i < priv->child_count; i++) {
+ if (replies[i].valid && replies[i].op_ret == 0) {
+ source = i;
+ break;
+ }
+ }
- if (ret < 0)
- return ret;
+ if (source == -1) {
+ /* entry got deleted in the mean time? */
+ return 0;
+ }
- for (i = 0; i < priv->child_count; i++) {
- if (i == source || !healed_sinks[i])
- continue;
+ /* Set all the sources as 1, otheriwse newentry_mark won't be set */
+ for (i = 0; i < priv->child_count; i++) {
+ if (replies[i].valid && replies[i].op_ret == 0) {
+ sources[i] = 1;
+ }
+ }
+
+ ret = afr_lookup_and_heal_gfid(this, fd->inode, name, inode, replies,
+ source, sources,
+ &replies[source].poststat.ia_gfid, NULL);
+ if (ret)
+ return ret;
+
+ /* In case of type mismatch / unable to resolve gfid mismatch on the
+ * entry, return -1.*/
+ ret = afr_selfheal_detect_gfid_and_type_mismatch(
+ this, replies, inode, fd->inode->gfid, name, source, locked_on, &src);
+
+ if (ret < 0)
+ return ret;
+ if (src != -1) {
+ source = src;
+ for (i = 0; i < priv->child_count; i++) {
+ if (i != src && replies[i].valid &&
+ gf_uuid_compare(replies[src].poststat.ia_gfid,
+ replies[i].poststat.ia_gfid)) {
+ sources[i] = 0;
+ }
+ }
+ }
+
+ for (i = 0; i < priv->child_count; i++) {
+ if (i == source || !healed_sinks[i])
+ continue;
+
+ if (src != -1) {
+ if (!gf_uuid_compare(replies[src].poststat.ia_gfid,
+ replies[i].poststat.ia_gfid))
+ continue;
+ } else if (replies[i].op_errno != ENOENT) {
+ continue;
+ }
- if (replies[i].op_errno != ENOENT)
- continue;
+ ret |= afr_selfheal_recreate_entry(frame, i, source, sources, fd->inode,
+ name, inode, replies);
+ }
- ret = afr_selfheal_recreate_entry (this, i, source, fd->inode,
- name, inode, replies,
- newentry);
- }
+ return ret;
+}
- if (AFR_COUNT (newentry, priv->child_count))
- afr_selfheal_newentry_mark (frame, this, inode, source, replies,
- sources, newentry);
- return ret;
+static int
+__afr_selfheal_entry_dirent(call_frame_t *frame, xlator_t *this, fd_t *fd,
+ char *name, inode_t *inode, int source,
+ unsigned char *sources, unsigned char *healed_sinks,
+ unsigned char *locked_on, struct afr_reply *replies)
+{
+ int ret = -1;
+
+ if (source < 0)
+ ret = __afr_selfheal_merge_dirent(frame, this, fd, name, inode, sources,
+ healed_sinks, locked_on, replies);
+ else
+ ret = __afr_selfheal_heal_dirent(frame, this, fd, name, inode, source,
+ sources, healed_sinks, locked_on,
+ replies);
+ return ret;
}
+static gf_boolean_t
+is_full_heal_marker_present(xlator_t *this, dict_t *xdata, int idx)
+{
+ int i = 0;
+ int pending[3] = {
+ 0,
+ };
+ void *pending_raw = NULL;
+ afr_private_t *priv = NULL;
+
+ priv = this->private;
+
+ if (!xdata)
+ return _gf_false;
+
+ /* Iterate over each of the priv->pending_keys[] elements and then
+ * see if any of them have data segment non-zero. If they do, return
+ * true. Else return false.
+ */
+ for (i = 0; i < priv->child_count; i++) {
+ if (dict_get_ptr(xdata, priv->pending_key[i], &pending_raw))
+ continue;
+
+ if (!pending_raw)
+ continue;
+
+ memcpy(pending, pending_raw, sizeof(pending));
+ if (ntoh32(pending[idx]))
+ return _gf_true;
+ }
+
+ return _gf_false;
+}
-static int
-__afr_selfheal_entry_dirent (call_frame_t *frame, xlator_t *this, fd_t *fd,
- char *name, inode_t *inode, int source,
- unsigned char *sources, unsigned char *healed_sinks,
- unsigned char *locked_on,
- struct afr_reply *replies)
+static gf_boolean_t
+afr_need_full_heal(xlator_t *this, struct afr_reply *replies, int source,
+ unsigned char *healed_sinks, afr_transaction_type type)
{
- int ret = -1;
-
- if (source < 0)
- ret = __afr_selfheal_merge_dirent (frame, this, fd, name, inode,
- sources, healed_sinks,
- locked_on, replies);
- else
- ret = __afr_selfheal_heal_dirent (frame, this, fd, name, inode,
- source, sources, healed_sinks,
- locked_on, replies);
- return ret;
+ int i = 0;
+ int idx = 0;
+ afr_private_t *priv = NULL;
+
+ priv = this->private;
+
+ if (!priv->esh_granular)
+ return _gf_true;
+
+ if (type != AFR_ENTRY_TRANSACTION)
+ return _gf_true;
+
+ priv = this->private;
+ idx = afr_index_for_transaction_type(AFR_DATA_TRANSACTION);
+
+ /* If there is a clear source, check whether the full-heal-indicator
+ * is present in its xdata. Otherwise, we need to examine all the
+ * participating bricks and then figure if *even* one of them has a
+ * full-heal-indicator.
+ */
+
+ if (source != -1) {
+ if (is_full_heal_marker_present(this, replies[source].xdata, idx))
+ return _gf_true;
+ }
+
+ /* else ..*/
+
+ for (i = 0; i < priv->child_count; i++) {
+ if (!healed_sinks[i])
+ continue;
+
+ if (is_full_heal_marker_present(this, replies[i].xdata, idx))
+ return _gf_true;
+ }
+
+ return _gf_false;
}
static int
-__afr_selfheal_entry_finalize_source (xlator_t *this, unsigned char *sources,
- unsigned char *healed_sinks,
- unsigned char *locked_on,
- struct afr_reply *replies,
- uint64_t *witness)
+__afr_selfheal_entry_finalize_source(xlator_t *this, unsigned char *sources,
+ unsigned char *healed_sinks,
+ unsigned char *locked_on,
+ struct afr_reply *replies,
+ uint64_t *witness)
{
- int i = 0;
- afr_private_t *priv = NULL;
- int source = -1;
- int sources_count = 0;
+ afr_private_t *priv = NULL;
+ int source = -1;
+ int sources_count = 0;
+ int i = 0;
- priv = this->private;
+ priv = this->private;
- sources_count = AFR_COUNT (sources, priv->child_count);
+ sources_count = AFR_COUNT(sources, priv->child_count);
- if ((AFR_CMP (locked_on, healed_sinks, priv->child_count) == 0)
- || !sources_count || afr_does_witness_exist (this, witness)) {
+ if ((AFR_CMP(locked_on, healed_sinks, priv->child_count) == 0) ||
+ !sources_count || afr_does_witness_exist(this, witness)) {
+ memset(sources, 0, sizeof(*sources) * priv->child_count);
+ afr_mark_active_sinks(this, sources, locked_on, healed_sinks);
+ return -1;
+ }
- memset (sources, 0, sizeof (*sources) * priv->child_count);
- afr_mark_active_sinks (this, sources, locked_on, healed_sinks);
- return -1;
- }
+ source = afr_choose_source_by_policy(priv, sources, AFR_ENTRY_TRANSACTION);
- for (i = 0; i < priv->child_count; i++) {
- if (sources[i]) {
- source = i;
- break;
- }
- }
+ /*If the selected source does not blame any other brick, then mark
+ * everything as sink to trigger conservative merge.
+ */
+ if (source != -1 && !AFR_COUNT(healed_sinks, priv->child_count)) {
+ for (i = 0; i < priv->child_count; i++) {
+ if (locked_on[i]) {
+ sources[i] = 0;
+ healed_sinks[i] = 1;
+ }
+ }
+ return -1;
+ }
- return source;
+ return source;
}
int
-__afr_selfheal_entry_prepare (call_frame_t *frame, xlator_t *this,
- inode_t *inode, unsigned char *locked_on,
- unsigned char *sources, unsigned char *sinks,
- unsigned char *healed_sinks,
- struct afr_reply *replies, int *source_p,
- gf_boolean_t *pflag)
+__afr_selfheal_entry_prepare(call_frame_t *frame, xlator_t *this,
+ inode_t *inode, unsigned char *locked_on,
+ unsigned char *sources, unsigned char *sinks,
+ unsigned char *healed_sinks,
+ struct afr_reply *replies, int *source_p,
+ unsigned char *pflag)
{
- int ret = -1;
- int source = -1;
- afr_private_t *priv = NULL;
- uint64_t *witness = NULL;
-
- priv = this->private;
-
- ret = afr_selfheal_unlocked_discover (frame, inode, inode->gfid,
- replies);
- if (ret)
- return ret;
-
- witness = alloca0 (sizeof (*witness) * priv->child_count);
- ret = afr_selfheal_find_direction (frame, this, replies,
- AFR_ENTRY_TRANSACTION,
- locked_on, sources, sinks, witness,
- pflag);
- if (ret)
- return ret;
-
- /* Initialize the healed_sinks[] array optimistically to
- the intersection of to-be-healed (i.e sinks[]) and
- the list of servers which are up (i.e locked_on[]).
-
- As we encounter failures in the healing process, we
- will unmark the respective servers in the healed_sinks[]
- array.
- */
- AFR_INTERSECT (healed_sinks, sinks, locked_on, priv->child_count);
-
- source = __afr_selfheal_entry_finalize_source (this, sources,
- healed_sinks,
- locked_on, replies,
- witness);
-
- if (source < 0) {
- /* If source is < 0 (typically split-brain), we perform a
- conservative merge of entries rather than erroring out */
- }
- *source_p = source;
-
- return ret;
+ int ret = -1;
+ int source = -1;
+ afr_private_t *priv = NULL;
+ uint64_t *witness = NULL;
+
+ priv = this->private;
+
+ ret = afr_selfheal_unlocked_discover(frame, inode, inode->gfid, replies);
+ if (ret)
+ return ret;
+
+ witness = alloca0(sizeof(*witness) * priv->child_count);
+ ret = afr_selfheal_find_direction(frame, this, replies,
+ AFR_ENTRY_TRANSACTION, locked_on, sources,
+ sinks, witness, pflag);
+ if (ret)
+ return ret;
+
+ /* Initialize the healed_sinks[] array optimistically to
+ the intersection of to-be-healed (i.e sinks[]) and
+ the list of servers which are up (i.e locked_on[]).
+
+ As we encounter failures in the healing process, we
+ will unmark the respective servers in the healed_sinks[]
+ array.
+ */
+ AFR_INTERSECT(healed_sinks, sinks, locked_on, priv->child_count);
+
+ source = __afr_selfheal_entry_finalize_source(this, sources, healed_sinks,
+ locked_on, replies, witness);
+
+ if (source < 0) {
+ /* If source is < 0 (typically split-brain), we perform a
+ conservative merge of entries rather than erroring out */
+ }
+ *source_p = source;
+
+ return ret;
}
static int
-afr_selfheal_entry_dirent (call_frame_t *frame, xlator_t *this,
- fd_t *fd, char *name)
+afr_selfheal_entry_dirent(call_frame_t *frame, xlator_t *this, fd_t *fd,
+ char *name, inode_t *parent_idx_inode,
+ xlator_t *subvol, gf_boolean_t full_crawl)
{
- int ret = 0;
- int source = -1;
- unsigned char *locked_on = NULL;
- unsigned char *sources = NULL;
- unsigned char *sinks = NULL;
- unsigned char *healed_sinks = NULL;
- inode_t *inode = NULL;
- struct afr_reply *replies = NULL;
- struct afr_reply *par_replies = NULL;
- afr_private_t *priv = NULL;
-
- priv = this->private;
-
- sources = alloca0 (priv->child_count);
- sinks = alloca0 (priv->child_count);
- healed_sinks = alloca0 (priv->child_count);
- locked_on = alloca0 (priv->child_count);
-
- replies = alloca0 (priv->child_count * sizeof(*replies));
- par_replies = alloca0 (priv->child_count * sizeof(*par_replies));
-
- ret = afr_selfheal_entrylk (frame, this, fd->inode, this->name, NULL,
- locked_on);
- {
- if (ret < AFR_SH_MIN_PARTICIPANTS) {
- gf_msg_debug (this->name, 0, "%s: Skipping "
- "entry self-heal as only %d sub-volumes "
- " could be locked in %s domain",
- uuid_utoa (fd->inode->gfid),
- ret, this->name);
- ret = -ENOTCONN;
- goto unlock;
- }
-
- ret = __afr_selfheal_entry_prepare (frame, this, fd->inode,
- locked_on,
- sources, sinks,
- healed_sinks, par_replies,
- &source, NULL);
- if (ret < 0)
- goto unlock;
-
- inode = afr_selfheal_unlocked_lookup_on (frame, fd->inode, name,
- replies, locked_on,
- NULL);
- if (!inode) {
- ret = -ENOMEM;
- goto unlock;
- }
-
- ret = __afr_selfheal_entry_dirent (frame, this, fd, name, inode,
- source, sources, healed_sinks,
- locked_on, replies);
- }
+ int ret = 0;
+ int source = -1;
+ unsigned char *locked_on = NULL;
+ unsigned char *sources = NULL;
+ unsigned char *sinks = NULL;
+ unsigned char *healed_sinks = NULL;
+ inode_t *inode = NULL;
+ struct afr_reply *replies = NULL;
+ struct afr_reply *par_replies = NULL;
+ afr_private_t *priv = NULL;
+ dict_t *xattr = NULL;
+
+ priv = this->private;
+
+ if (afr_is_private_directory(priv, fd->inode->gfid, name,
+ GF_CLIENT_PID_SELF_HEALD)) {
+ return 0;
+ }
+
+ xattr = dict_new();
+ if (!xattr)
+ return -ENOMEM;
+ ret = dict_set_int32_sizen(xattr, GF_GFIDLESS_LOOKUP, 1);
+ if (ret) {
+ dict_unref(xattr);
+ return -1;
+ }
+
+ sources = alloca0(priv->child_count);
+ sinks = alloca0(priv->child_count);
+ healed_sinks = alloca0(priv->child_count);
+ locked_on = alloca0(priv->child_count);
+
+ replies = alloca0(priv->child_count * sizeof(*replies));
+ par_replies = alloca0(priv->child_count * sizeof(*par_replies));
+
+ ret = afr_selfheal_entrylk(frame, this, fd->inode, this->name, NULL,
+ locked_on);
+ {
+ if (ret < priv->child_count) {
+ gf_msg_debug(this->name, 0,
+ "%s: Skipping "
+ "entry self-heal as only %d sub-volumes "
+ " could be locked in %s domain",
+ uuid_utoa(fd->inode->gfid), ret, this->name);
+ ret = -ENOTCONN;
+ goto unlock;
+ }
+
+ ret = __afr_selfheal_entry_prepare(frame, this, fd->inode, locked_on,
+ sources, sinks, healed_sinks,
+ par_replies, &source, NULL);
+ if (ret < 0)
+ goto unlock;
+
+ inode = afr_selfheal_unlocked_lookup_on(frame, fd->inode, name, replies,
+ locked_on, xattr);
+ if (!inode) {
+ ret = -ENOMEM;
+ goto unlock;
+ }
+
+ ret = __afr_selfheal_entry_dirent(frame, this, fd, name, inode, source,
+ sources, healed_sinks, locked_on,
+ replies);
+
+ if ((ret == 0) && (priv->esh_granular) && parent_idx_inode) {
+ ret = afr_shd_entry_purge(subvol, parent_idx_inode, name,
+ inode->ia_type);
+ /* Why is ret force-set to 0? We do not care about
+ * index purge failing for full heal as it is quite
+ * possible during replace-brick that not all files
+ * and directories have their name indices present in
+ * entry-changes/.
+ */
+ ret = 0;
+ }
+ }
+
unlock:
- afr_selfheal_unentrylk (frame, this, fd->inode, this->name, NULL,
- locked_on);
- if (inode)
- inode_unref (inode);
- if (replies)
- afr_replies_wipe (replies, priv->child_count);
- if (par_replies)
- afr_replies_wipe (par_replies, priv->child_count);
-
- return ret;
+ afr_selfheal_unentrylk(frame, this, fd->inode, this->name, NULL, locked_on,
+ NULL);
+ if (inode)
+ inode_unref(inode);
+ if (replies)
+ afr_replies_wipe(replies, priv->child_count);
+ if (par_replies)
+ afr_replies_wipe(par_replies, priv->child_count);
+ if (xattr)
+ dict_unref(xattr);
+
+ return ret;
}
+static inode_t *
+afr_shd_entry_changes_index_inode(xlator_t *this, xlator_t *subvol,
+ uuid_t pargfid)
+{
+ int ret = -1;
+ void *index_gfid = NULL;
+ loc_t rootloc = {
+ 0,
+ };
+ loc_t loc = {
+ 0,
+ };
+ dict_t *xattr = NULL;
+ inode_t *inode = NULL;
+ struct iatt iatt = {
+ 0,
+ };
+
+ rootloc.inode = inode_ref(this->itable->root);
+ gf_uuid_copy(rootloc.gfid, rootloc.inode->gfid);
+
+ ret = syncop_getxattr(subvol, &rootloc, &xattr,
+ GF_XATTROP_ENTRY_CHANGES_GFID, NULL, NULL);
+ if (ret || !xattr) {
+ errno = -ret;
+ goto out;
+ }
+
+ ret = dict_get_ptr(xattr, GF_XATTROP_ENTRY_CHANGES_GFID, &index_gfid);
+ if (ret) {
+ errno = EINVAL;
+ goto out;
+ }
+
+ loc.inode = inode_new(this->itable);
+ if (!loc.inode) {
+ errno = ENOMEM;
+ goto out;
+ }
+
+ gf_uuid_copy(loc.pargfid, index_gfid);
+ loc.name = gf_strdup(uuid_utoa(pargfid));
+
+ ret = syncop_lookup(subvol, &loc, &iatt, NULL, NULL, NULL);
+ if (ret < 0) {
+ errno = -ret;
+ goto out;
+ }
+
+ inode = inode_link(loc.inode, NULL, NULL, &iatt);
+
+out:
+ if (xattr)
+ dict_unref(xattr);
+ loc_wipe(&rootloc);
+ GF_FREE((char *)loc.name);
+ loc_wipe(&loc);
+
+ return inode;
+}
static int
-afr_selfheal_entry_do_subvol (call_frame_t *frame, xlator_t *this,
- fd_t *fd, int child)
+afr_selfheal_entry_do_subvol(call_frame_t *frame, xlator_t *this, fd_t *fd,
+ int child)
{
- int ret = 0;
- gf_dirent_t entries;
- gf_dirent_t *entry = NULL;
- off_t offset = 0;
- call_frame_t *iter_frame = NULL;
- xlator_t *subvol = NULL;
- afr_private_t *priv = NULL;
- gf_boolean_t mismatch = _gf_false;
-
- priv = this->private;
- subvol = priv->children[child];
-
- INIT_LIST_HEAD (&entries.list);
-
- iter_frame = afr_copy_frame (frame);
- if (!iter_frame)
- return -ENOMEM;
-
- while ((ret = syncop_readdir (subvol, fd, 131072, offset, &entries,
- NULL, NULL))) {
- if (ret > 0)
- ret = 0;
- list_for_each_entry (entry, &entries.list, list) {
- offset = entry->d_off;
-
- if (!strcmp (entry->d_name, ".") ||
- !strcmp (entry->d_name, ".."))
- continue;
-
- if (__is_root_gfid (fd->inode->gfid) &&
- !strcmp (entry->d_name, GF_REPLICATE_TRASH_DIR))
- continue;
-
- ret = afr_selfheal_entry_dirent (iter_frame, this, fd,
- entry->d_name);
- AFR_STACK_RESET (iter_frame);
- if (iter_frame->local == NULL) {
- ret = -ENOTCONN;
- break;
- }
-
- if (ret == -1) {
- /* gfid or type mismatch. */
- mismatch = _gf_true;
- ret = 0;
- }
- if (ret)
- break;
- }
-
- gf_dirent_free (&entries);
- if (ret)
- break;
- }
-
- AFR_STACK_DESTROY (iter_frame);
- if (mismatch == _gf_true)
- /* undo pending will be skipped */
- ret = -1;
- return ret;
+ int ret = 0;
+ gf_dirent_t entries;
+ gf_dirent_t *entry = NULL;
+ off_t offset = 0;
+ call_frame_t *iter_frame = NULL;
+ xlator_t *subvol = NULL;
+ afr_private_t *priv = NULL;
+ gf_boolean_t mismatch = _gf_false;
+ afr_local_t *local = NULL;
+ loc_t loc = {
+ 0,
+ };
+
+ priv = this->private;
+ subvol = priv->children[child];
+
+ INIT_LIST_HEAD(&entries.list);
+
+ local = frame->local;
+
+ iter_frame = afr_copy_frame(frame);
+ if (!iter_frame)
+ return -ENOMEM;
+
+ loc.inode = afr_shd_entry_changes_index_inode(this, subvol,
+ fd->inode->gfid);
+
+ while ((ret = syncop_readdir(subvol, fd, 131072, offset, &entries, NULL,
+ NULL))) {
+ if (ret > 0)
+ ret = 0;
+ list_for_each_entry(entry, &entries.list, list)
+ {
+ offset = entry->d_off;
+
+ if (!strcmp(entry->d_name, ".") || !strcmp(entry->d_name, ".."))
+ continue;
+
+ ret = afr_selfheal_entry_dirent(iter_frame, this, fd, entry->d_name,
+ loc.inode, subvol,
+ local->need_full_crawl);
+ AFR_STACK_RESET(iter_frame);
+ if (iter_frame->local == NULL) {
+ ret = -ENOTCONN;
+ break;
+ }
+
+ if (ret == -1) {
+ /* gfid or type mismatch. */
+ mismatch = _gf_true;
+ ret = 0;
+ }
+ if (ret)
+ break;
+ }
+
+ gf_dirent_free(&entries);
+ if (ret)
+ break;
+ }
+
+ loc_wipe(&loc);
+
+ AFR_STACK_DESTROY(iter_frame);
+ if (mismatch == _gf_true)
+ /* undo pending will be skipped */
+ ret = -1;
+ return ret;
}
static int
-afr_selfheal_entry_do (call_frame_t *frame, xlator_t *this, fd_t *fd,
- int source, unsigned char *sources,
- unsigned char *healed_sinks)
+afr_selfheal_entry_granular_dirent(xlator_t *subvol, gf_dirent_t *entry,
+ loc_t *parent, void *data)
{
- int i = 0;
- afr_private_t *priv = NULL;
- gf_boolean_t mismatch = _gf_false;
- int ret = 0;
-
- priv = this->private;
-
- gf_msg (this->name, GF_LOG_INFO, 0,
- AFR_MSG_SELF_HEAL_INFO, "performing entry selfheal on %s",
- uuid_utoa (fd->inode->gfid));
-
- for (i = 0; i < priv->child_count; i++) {
- if (!healed_sinks[i])
- continue;
- ret = afr_selfheal_entry_do_subvol (frame, this, fd, i);
- if (ret == -1) {
- /* gfid or type mismatch. */
- mismatch = _gf_true;
- ret = 0;
- }
- if (ret)
- break;
- }
- if (!ret && source != -1)
- ret = afr_selfheal_entry_do_subvol (frame, this, fd, source);
-
- if (mismatch == _gf_true)
- /* undo pending will be skipped */
- ret = -1;
- return ret;
+ int ret = 0;
+ loc_t loc = {
+ 0,
+ };
+ struct iatt iatt = {
+ 0,
+ };
+ afr_granular_esh_args_t *args = data;
+
+ /* Look up the actual inode associated with entry. If the lookup returns
+ * ESTALE or ENOENT, then it means we have a stale index. Remove it.
+ * This is analogous to the check in afr_shd_index_heal() except that
+ * here it is achieved through LOOKUP and in afr_shd_index_heal() through
+ * a GETXATTR.
+ */
+
+ loc.inode = inode_new(args->xl->itable);
+ loc.parent = inode_ref(args->heal_fd->inode);
+ gf_uuid_copy(loc.pargfid, loc.parent->gfid);
+ loc.name = entry->d_name;
+
+ ret = syncop_lookup(args->xl, &loc, &iatt, NULL, NULL, NULL);
+ if ((ret == -ENOENT) || (ret == -ESTALE)) {
+ /* The name indices under the pgfid index dir are guaranteed
+ * to be regular files. Hence the hardcoding.
+ */
+ afr_shd_entry_purge(subvol, parent->inode, entry->d_name, IA_IFREG);
+ ret = 0;
+ goto out;
+ }
+ /* TBD: afr_shd_zero_xattrop? */
+
+ ret = afr_selfheal_entry_dirent(args->frame, args->xl, args->heal_fd,
+ entry->d_name, parent->inode, subvol,
+ _gf_false);
+ AFR_STACK_RESET(args->frame);
+ if (args->frame->local == NULL)
+ ret = -ENOTCONN;
+
+ if (ret == -1)
+ args->mismatch = _gf_true;
+
+out:
+ loc_wipe(&loc);
+ return 0;
}
static int
-__afr_selfheal_entry (call_frame_t *frame, xlator_t *this, fd_t *fd,
- unsigned char *locked_on)
+afr_selfheal_entry_granular(call_frame_t *frame, xlator_t *this, fd_t *fd,
+ int subvol_idx, gf_boolean_t is_src)
{
- int ret = -1;
- int source = -1;
- unsigned char *sources = NULL;
- unsigned char *sinks = NULL;
- unsigned char *data_lock = NULL;
- unsigned char *postop_lock = NULL;
- unsigned char *healed_sinks = NULL;
- struct afr_reply *locked_replies = NULL;
- afr_private_t *priv = NULL;
- gf_boolean_t did_sh = _gf_true;
-
- priv = this->private;
-
- sources = alloca0 (priv->child_count);
- sinks = alloca0 (priv->child_count);
- healed_sinks = alloca0 (priv->child_count);
- data_lock = alloca0 (priv->child_count);
- postop_lock = alloca0 (priv->child_count);
-
- locked_replies = alloca0 (sizeof (*locked_replies) * priv->child_count);
-
- ret = afr_selfheal_entrylk (frame, this, fd->inode, this->name, NULL,
- data_lock);
- {
- if (ret < AFR_SH_MIN_PARTICIPANTS) {
- gf_msg_debug (this->name, 0, "%s: Skipping "
- "entry self-heal as only %d sub-volumes could "
- "be locked in %s domain",
- uuid_utoa (fd->inode->gfid), ret,
- this->name);
- ret = -ENOTCONN;
- goto unlock;
- }
-
- ret = __afr_selfheal_entry_prepare (frame, this, fd->inode,
- data_lock, sources, sinks,
- healed_sinks,
- locked_replies, &source,
- NULL);
- if (AFR_COUNT(healed_sinks, priv->child_count) == 0) {
- did_sh = _gf_false;
- goto unlock;
- }
- }
-unlock:
- afr_selfheal_unentrylk (frame, this, fd->inode, this->name, NULL,
- data_lock);
- if (ret < 0)
- goto out;
+ int ret = 0;
+ loc_t loc = {
+ 0,
+ };
+ xlator_t *subvol = NULL;
+ afr_private_t *priv = NULL;
+ afr_granular_esh_args_t args = {
+ 0,
+ };
+
+ priv = this->private;
+ subvol = priv->children[subvol_idx];
+
+ args.frame = afr_copy_frame(frame);
+ if (!args.frame)
+ goto out;
+ args.xl = this;
+ /* args.heal_fd represents the fd associated with the original directory
+ * on which entry heal is being attempted.
+ */
+ args.heal_fd = fd;
+
+ /* @subvol here represents the subvolume of AFR where
+ * indices/entry-changes/<pargfid> will be processed
+ */
+ loc.inode = afr_shd_entry_changes_index_inode(this, subvol,
+ fd->inode->gfid);
+ if (!loc.inode) {
+ /* If granular heal failed on the sink (as it might sometimes
+ * because it is the src that would mostly contain the granular
+ * changelogs and the sink's entry-changes would be empty),
+ * do not treat heal as failure.
+ */
+ if (is_src)
+ ret = -errno;
+ else
+ ret = 0;
+ goto out;
+ }
- if (!did_sh)
- goto out;
+ ret = syncop_dir_scan(subvol, &loc, GF_CLIENT_PID_SELF_HEALD, &args,
+ afr_selfheal_entry_granular_dirent);
- ret = afr_selfheal_entry_do (frame, this, fd, source, sources,
- healed_sinks);
- if (ret)
- goto out;
-
- /* Take entrylks in xlator domain before doing post-op (undo-pending) in
- * entry self-heal. This is to prevent a parallel name self-heal on
- * an entry under @fd->inode from reading pending xattrs while it is
- * being modified by SHD after entry sh below, given that
- * name self-heal takes locks ONLY in xlator domain and is free to read
- * pending changelog in the absence of the following locking.
- */
- ret = afr_selfheal_entrylk (frame, this, fd->inode, this->name, NULL,
- postop_lock);
- {
- if (AFR_CMP (data_lock, postop_lock, priv->child_count) != 0) {
- gf_msg_debug (this->name, 0, "%s: Skipping "
- "post-op after entry self-heal as %d "
- "sub-volumes, as opposed to %d, "
- "could be locked in %s domain",
- uuid_utoa (fd->inode->gfid),
- ret, AFR_COUNT (data_lock,
- priv->child_count), this->name);
- ret = -ENOTCONN;
- goto postop_unlock;
- }
-
- ret = afr_selfheal_undo_pending (frame, this, fd->inode,
- sources, sinks, healed_sinks,
- AFR_ENTRY_TRANSACTION,
- locked_replies, postop_lock);
- }
-postop_unlock:
- afr_selfheal_unentrylk (frame, this, fd->inode, this->name, NULL,
- postop_lock);
+ loc_wipe(&loc);
+
+ if (args.mismatch == _gf_true)
+ ret = -1;
out:
- if (did_sh)
- afr_log_selfheal (fd->inode->gfid, this, ret, "entry", source,
- healed_sinks);
+ if (args.frame)
+ AFR_STACK_DESTROY(args.frame);
+ return ret;
+}
+
+static int
+afr_selfheal_entry_do(call_frame_t *frame, xlator_t *this, fd_t *fd, int source,
+ unsigned char *sources, unsigned char *healed_sinks)
+{
+ int i = 0;
+ int ret = 0;
+ gf_boolean_t mismatch = _gf_false;
+ afr_local_t *local = NULL;
+ afr_private_t *priv = NULL;
+
+ priv = this->private;
+ local = frame->local;
+
+ gf_msg(this->name, GF_LOG_INFO, 0, AFR_MSG_SELF_HEAL_INFO,
+ "performing entry selfheal on %s", uuid_utoa(fd->inode->gfid));
+
+ for (i = 0; i < priv->child_count; i++) {
+ /* Expunge */
+ if (!healed_sinks[i])
+ continue;
+
+ if (!local->need_full_crawl)
+ /* Why call afr_selfheal_entry_granular() on a "healed sink",
+ * given that it is the source that contains the granular
+ * indices?
+ * If the index for this directory is non-existent or empty on
+ * this subvol (=> clear sink), then it will return early
+ * without failure status.
+ * If the index is non-empty and it is yet a 'healed sink', then
+ * it is due to a split-brain in which case we anyway need to
+ * crawl the indices/entry-changes/pargfid directory.
+ */
+ ret = afr_selfheal_entry_granular(frame, this, fd, i, _gf_false);
else
- ret = 1;
+ ret = afr_selfheal_entry_do_subvol(frame, this, fd, i);
- if (locked_replies)
- afr_replies_wipe (locked_replies, priv->child_count);
- return ret;
+ if (ret == -1) {
+ /* gfid or type mismatch. */
+ mismatch = _gf_true;
+ ret = 0;
+ }
+ if (ret)
+ break;
+ }
+
+ if (!ret && source != -1) {
+ /* Impunge */
+ if (local->need_full_crawl)
+ ret = afr_selfheal_entry_do_subvol(frame, this, fd, source);
+ else
+ ret = afr_selfheal_entry_granular(frame, this, fd, source,
+ _gf_true);
+ }
+
+ if (mismatch == _gf_true)
+ /* undo pending will be skipped */
+ ret = -1;
+ return ret;
}
+static int
+__afr_selfheal_entry(call_frame_t *frame, xlator_t *this, fd_t *fd,
+ unsigned char *locked_on)
+{
+ int ret = -1;
+ int source = -1;
+ unsigned char *sources = NULL;
+ unsigned char *sinks = NULL;
+ unsigned char *data_lock = NULL;
+ unsigned char *postop_lock = NULL;
+ unsigned char *healed_sinks = NULL;
+ unsigned char *undid_pending = NULL;
+ struct afr_reply *locked_replies = NULL;
+ afr_local_t *local = NULL;
+ afr_private_t *priv = NULL;
+ gf_boolean_t did_sh = _gf_true;
+
+ priv = this->private;
+ local = frame->local;
+
+ sources = alloca0(priv->child_count);
+ sinks = alloca0(priv->child_count);
+ healed_sinks = alloca0(priv->child_count);
+ undid_pending = alloca0(priv->child_count);
+ data_lock = alloca0(priv->child_count);
+ postop_lock = alloca0(priv->child_count);
+
+ locked_replies = alloca0(sizeof(*locked_replies) * priv->child_count);
+
+ ret = afr_selfheal_entrylk(frame, this, fd->inode, this->name, NULL,
+ data_lock);
+ {
+ if (ret < priv->child_count) {
+ gf_msg_debug(this->name, 0,
+ "%s: Skipping "
+ "entry self-heal as only %d sub-volumes could "
+ "be locked in %s domain",
+ uuid_utoa(fd->inode->gfid), ret, this->name);
+ ret = -ENOTCONN;
+ goto unlock;
+ }
+
+ ret = __afr_selfheal_entry_prepare(frame, this, fd->inode, data_lock,
+ sources, sinks, healed_sinks,
+ locked_replies, &source, NULL);
+ if (AFR_COUNT(healed_sinks, priv->child_count) == 0) {
+ did_sh = _gf_false;
+ goto unlock;
+ }
+
+ local->need_full_crawl = afr_need_full_heal(
+ this, locked_replies, source, healed_sinks, AFR_ENTRY_TRANSACTION);
+ }
+unlock:
+ afr_selfheal_unentrylk(frame, this, fd->inode, this->name, NULL, data_lock,
+ NULL);
+ if (ret < 0)
+ goto out;
+
+ if (!did_sh)
+ goto out;
+
+ ret = afr_selfheal_entry_do(frame, this, fd, source, sources, healed_sinks);
+ if (ret)
+ goto out;
+
+ /* Take entrylks in xlator domain before doing post-op (undo-pending) in
+ * entry self-heal. This is to prevent a parallel name self-heal on
+ * an entry under @fd->inode from reading pending xattrs while it is
+ * being modified by SHD after entry sh below, given that
+ * name self-heal takes locks ONLY in xlator domain and is free to read
+ * pending changelog in the absence of the following locking.
+ */
+ ret = afr_selfheal_entrylk(frame, this, fd->inode, this->name, NULL,
+ postop_lock);
+ {
+ if (AFR_CMP(data_lock, postop_lock, priv->child_count) != 0) {
+ gf_msg_debug(this->name, 0,
+ "%s: Skipping "
+ "post-op after entry self-heal as %d "
+ "sub-volumes, as opposed to %d, "
+ "could be locked in %s domain",
+ uuid_utoa(fd->inode->gfid), ret,
+ AFR_COUNT(data_lock, priv->child_count), this->name);
+ ret = -ENOTCONN;
+ goto postop_unlock;
+ }
+
+ afr_selfheal_restore_time(frame, this, fd->inode, source, healed_sinks,
+ locked_replies);
+ ret = afr_selfheal_undo_pending(
+ frame, this, fd->inode, sources, sinks, healed_sinks, undid_pending,
+ AFR_ENTRY_TRANSACTION, locked_replies, postop_lock);
+ }
+postop_unlock:
+ afr_selfheal_unentrylk(frame, this, fd->inode, this->name, NULL,
+ postop_lock, NULL);
+out:
+ if (did_sh)
+ afr_log_selfheal(fd->inode->gfid, this, ret, "entry", source, sources,
+ healed_sinks);
+ else
+ ret = 1;
+
+ if (locked_replies)
+ afr_replies_wipe(locked_replies, priv->child_count);
+ return ret;
+}
static fd_t *
-afr_selfheal_data_opendir (xlator_t *this, inode_t *inode)
+afr_selfheal_data_opendir(xlator_t *this, inode_t *inode)
{
- loc_t loc = {0,};
- int ret = 0;
- fd_t *fd = NULL;
-
- fd = fd_create (inode, 0);
- if (!fd)
- return NULL;
-
- loc.inode = inode_ref (inode);
- gf_uuid_copy (loc.gfid, inode->gfid);
-
- ret = syncop_opendir (this, &loc, fd, NULL, NULL);
- if (ret) {
- fd_unref (fd);
- fd = NULL;
- } else {
- fd_bind (fd);
- }
-
- loc_wipe (&loc);
- return fd;
+ loc_t loc = {
+ 0,
+ };
+ int ret = 0;
+ fd_t *fd = NULL;
+
+ fd = fd_create(inode, 0);
+ if (!fd)
+ return NULL;
+
+ loc.inode = inode_ref(inode);
+ gf_uuid_copy(loc.gfid, inode->gfid);
+
+ ret = syncop_opendir(this, &loc, fd, NULL, NULL);
+ if (ret) {
+ fd_unref(fd);
+ fd = NULL;
+ } else {
+ fd_bind(fd);
+ }
+
+ loc_wipe(&loc);
+ return fd;
}
-
int
-afr_selfheal_entry (call_frame_t *frame, xlator_t *this, inode_t *inode)
+afr_selfheal_entry(call_frame_t *frame, xlator_t *this, inode_t *inode)
{
- afr_private_t *priv = NULL;
- unsigned char *locked_on = NULL;
- unsigned char *long_name_locked = NULL;
- fd_t *fd = NULL;
- int ret = 0;
-
- priv = this->private;
-
- fd = afr_selfheal_data_opendir (this, inode);
- if (!fd)
- return -EIO;
-
- locked_on = alloca0 (priv->child_count);
- long_name_locked = alloca0 (priv->child_count);
-
- ret = afr_selfheal_tryentrylk (frame, this, inode, priv->sh_domain, NULL,
- locked_on);
- {
- if (ret < AFR_SH_MIN_PARTICIPANTS) {
- gf_msg_debug (this->name, 0, "%s: Skipping "
- "entry self-heal as only %d sub-volumes could "
- "be locked in %s domain",
- uuid_utoa (fd->inode->gfid), ret,
- priv->sh_domain);
- /* Either less than two subvols available, or another
- selfheal (from another server) is in progress. Skip
- for now in any case there isn't anything to do.
- */
- ret = -ENOTCONN;
- goto unlock;
- }
-
- ret = afr_selfheal_tryentrylk (frame, this, inode, this->name,
- LONG_FILENAME, long_name_locked);
- {
- if (ret < 1) {
- gf_msg_debug (this->name, 0, "%s: Skipping"
- " entry self-heal as only %d "
- "sub-volumes could be "
- "locked in special-filename "
- "domain",
- uuid_utoa (fd->inode->gfid),
- ret);
- ret = -ENOTCONN;
- goto unlock;
- }
- ret = __afr_selfheal_entry (frame, this, fd, locked_on);
- }
- afr_selfheal_unentrylk (frame, this, inode, this->name,
- LONG_FILENAME, long_name_locked);
- }
+ afr_private_t *priv = NULL;
+ unsigned char *locked_on = NULL;
+ fd_t *fd = NULL;
+ int ret = 0;
+
+ priv = this->private;
+
+ fd = afr_selfheal_data_opendir(this, inode);
+ if (!fd)
+ return -EIO;
+
+ locked_on = alloca0(priv->child_count);
+
+ ret = afr_selfheal_tie_breaker_entrylk(frame, this, inode, priv->sh_domain,
+ NULL, locked_on);
+ {
+ if (ret < priv->child_count) {
+ gf_msg_debug(this->name, 0,
+ "%s: Skipping "
+ "entry self-heal as only %d sub-volumes could "
+ "be locked in %s domain",
+ uuid_utoa(fd->inode->gfid), ret, priv->sh_domain);
+ /* Either less than two subvols available, or another
+ selfheal (from another server) is in progress. Skip
+ for now in any case there isn't anything to do.
+ */
+ ret = -ENOTCONN;
+ goto unlock;
+ }
+
+ ret = __afr_selfheal_entry(frame, this, fd, locked_on);
+ }
unlock:
- afr_selfheal_unentrylk (frame, this, inode, priv->sh_domain, NULL, locked_on);
+ afr_selfheal_unentrylk(frame, this, inode, priv->sh_domain, NULL, locked_on,
+ NULL);
- if (fd)
- fd_unref (fd);
+ if (fd)
+ fd_unref(fd);
- return ret;
+ return ret;
}
diff --git a/xlators/cluster/afr/src/afr-self-heal-metadata.c b/xlators/cluster/afr/src/afr-self-heal-metadata.c
index 24b139ad216..03f43bad16e 100644
--- a/xlators/cluster/afr/src/afr-self-heal-metadata.c
+++ b/xlators/cluster/afr/src/afr-self-heal-metadata.c
@@ -8,106 +8,108 @@
cases as published by the Free Software Foundation.
*/
-
#include "afr.h"
#include "afr-self-heal.h"
-#include "byte-order.h"
+#include <glusterfs/byte-order.h>
#include "protocol-common.h"
+#include <glusterfs/events.h>
-#define AFR_HEAL_ATTR (GF_SET_ATTR_UID|GF_SET_ATTR_GID|GF_SET_ATTR_MODE)
+#define AFR_HEAL_ATTR (GF_SET_ATTR_UID | GF_SET_ATTR_GID | GF_SET_ATTR_MODE)
static gf_boolean_t
-_afr_ignorable_key_match (dict_t *d, char *k, data_t *val, void *mdata)
+_afr_ignorable_key_match(dict_t *d, char *k, data_t *val, void *mdata)
{
- return afr_is_xattr_ignorable (k);
+ return afr_is_xattr_ignorable(k);
}
void
-afr_delete_ignorable_xattrs (dict_t *xattr)
+afr_delete_ignorable_xattrs(dict_t *xattr)
{
- dict_foreach_match (xattr, _afr_ignorable_key_match, NULL,
- dict_remove_foreach_fn, NULL);
+ dict_foreach_match(xattr, _afr_ignorable_key_match, NULL,
+ dict_remove_foreach_fn, NULL);
}
int
-__afr_selfheal_metadata_do (call_frame_t *frame, xlator_t *this, inode_t *inode,
- int source, unsigned char *healed_sinks,
- struct afr_reply *locked_replies)
+__afr_selfheal_metadata_do(call_frame_t *frame, xlator_t *this, inode_t *inode,
+ int source, unsigned char *healed_sinks,
+ struct afr_reply *locked_replies)
{
- int ret = -1;
- loc_t loc = {0,};
- dict_t *xattr = NULL;
- dict_t *old_xattr = NULL;
- afr_private_t *priv = NULL;
- int i = 0;
-
- priv = this->private;
-
- loc.inode = inode_ref (inode);
- gf_uuid_copy (loc.gfid, inode->gfid);
-
- gf_msg (this->name, GF_LOG_INFO, 0,
- AFR_MSG_SELF_HEAL_INFO, "performing metadata selfheal on %s",
- uuid_utoa (inode->gfid));
-
- ret = syncop_getxattr (priv->children[source], &loc, &xattr, NULL,
- NULL, NULL);
- if (ret < 0) {
- ret = -EIO;
- goto out;
- }
-
- afr_delete_ignorable_xattrs (xattr);
-
- for (i = 0; i < priv->child_count; i++) {
- if (old_xattr) {
- dict_unref (old_xattr);
- old_xattr = NULL;
- }
-
- if (!healed_sinks[i])
- continue;
-
- ret = syncop_setattr (priv->children[i], &loc,
- &locked_replies[source].poststat,
- AFR_HEAL_ATTR, NULL, NULL, NULL, NULL);
- if (ret)
- healed_sinks[i] = 0;
-
- ret = syncop_getxattr (priv->children[i], &loc, &old_xattr, 0,
- NULL, NULL);
- if (old_xattr) {
- afr_delete_ignorable_xattrs (old_xattr);
- ret = syncop_removexattr (priv->children[i], &loc, "",
- old_xattr, NULL);
- }
-
- ret = syncop_setxattr (priv->children[i], &loc, xattr, 0, NULL,
- NULL);
- if (ret)
- healed_sinks[i] = 0;
- }
- ret = 0;
+ int ret = -1;
+ loc_t loc = {
+ 0,
+ };
+ dict_t *xattr = NULL;
+ dict_t *old_xattr = NULL;
+ afr_private_t *priv = NULL;
+ int i = 0;
+
+ priv = this->private;
+
+ loc.inode = inode_ref(inode);
+ gf_uuid_copy(loc.gfid, inode->gfid);
+
+ gf_msg(this->name, GF_LOG_INFO, 0, AFR_MSG_SELF_HEAL_INFO,
+ "performing metadata selfheal on %s", uuid_utoa(inode->gfid));
+
+ ret = syncop_getxattr(priv->children[source], &loc, &xattr, NULL, NULL,
+ NULL);
+ if (ret < 0) {
+ ret = -EIO;
+ goto out;
+ }
+
+ afr_delete_ignorable_xattrs(xattr);
+
+ for (i = 0; i < priv->child_count; i++) {
+ if (old_xattr) {
+ dict_unref(old_xattr);
+ old_xattr = NULL;
+ }
+
+ if (!healed_sinks[i])
+ continue;
+
+ ret = syncop_setattr(priv->children[i], &loc,
+ &locked_replies[source].poststat, AFR_HEAL_ATTR,
+ NULL, NULL, NULL, NULL);
+ if (ret)
+ healed_sinks[i] = 0;
+
+ ret = syncop_getxattr(priv->children[i], &loc, &old_xattr, 0, NULL,
+ NULL);
+ if (old_xattr) {
+ afr_delete_ignorable_xattrs(old_xattr);
+ ret = syncop_removexattr(priv->children[i], &loc, "", old_xattr,
+ NULL);
+ if (ret)
+ healed_sinks[i] = 0;
+ }
+
+ ret = syncop_setxattr(priv->children[i], &loc, xattr, 0, NULL, NULL);
+ if (ret)
+ healed_sinks[i] = 0;
+ }
+ ret = 0;
out:
- loc_wipe (&loc);
- if (xattr)
- dict_unref (xattr);
- if (old_xattr)
- dict_unref (old_xattr);
+ loc_wipe(&loc);
+ if (xattr)
+ dict_unref(xattr);
+ if (old_xattr)
+ dict_unref(old_xattr);
- return ret;
+ return ret;
}
static uint64_t
mtime_ns(struct iatt *ia)
{
- uint64_t ret;
+ uint64_t ret;
- ret = (((uint64_t)(ia->ia_mtime)) * 1000000000)
- + (uint64_t)(ia->ia_mtime_nsec);
+ ret = (((uint64_t)(ia->ia_mtime)) * 1000000000) +
+ (uint64_t)(ia->ia_mtime_nsec);
- return ret;
+ return ret;
}
/*
@@ -120,319 +122,425 @@ mtime_ns(struct iatt *ia)
* the source with the most recent modification date.
*/
static int
-afr_dirtime_splitbrain_source (call_frame_t *frame, xlator_t *this,
- struct afr_reply *replies,
- unsigned char *locked_on)
+afr_dirtime_splitbrain_source(call_frame_t *frame, xlator_t *this,
+ struct afr_reply *replies,
+ unsigned char *locked_on)
{
- afr_private_t *priv = NULL;
- int source = -1;
- struct iatt source_ia;
- struct iatt child_ia;
- uint64_t mtime = 0;
- int i;
- int ret = -1;
-
- priv = this->private;
+ afr_private_t *priv = NULL;
+ int source = -1;
+ struct iatt source_ia;
+ struct iatt child_ia;
+ uint64_t mtime = 0;
+ int i;
+ int ret = -1;
+
+ priv = this->private;
+
+ for (i = 0; i < priv->child_count; i++) {
+ if (!locked_on[i])
+ continue;
+
+ if (!replies[i].valid)
+ continue;
+
+ if (replies[i].op_ret != 0)
+ continue;
+
+ if (mtime_ns(&replies[i].poststat) <= mtime)
+ continue;
+
+ mtime = mtime_ns(&replies[i].poststat);
+ source = i;
+ }
+
+ if (source == -1)
+ goto out;
+
+ source_ia = replies[source].poststat;
+ if (source_ia.ia_type != IA_IFDIR)
+ goto out;
+
+ for (i = 0; i < priv->child_count; i++) {
+ if (i == source)
+ continue;
+
+ if (!replies[i].valid)
+ continue;
+
+ if (replies[i].op_ret != 0)
+ continue;
+
+ child_ia = replies[i].poststat;
+
+ if (!IA_EQUAL(source_ia, child_ia, gfid) ||
+ !IA_EQUAL(source_ia, child_ia, type) ||
+ !IA_EQUAL(source_ia, child_ia, prot) ||
+ !IA_EQUAL(source_ia, child_ia, uid) ||
+ !IA_EQUAL(source_ia, child_ia, gid) ||
+ !afr_xattrs_are_equal(replies[source].xdata, replies[i].xdata))
+ goto out;
+ }
+
+ /*
+ * Metadata split brain is just about [amc]time
+ * We return our source.
+ */
+ ret = source;
+out:
+ return ret;
+}
- for (i = 0; i < priv->child_count; i++) {
- if (!locked_on[i])
- continue;
+static int
+__afr_selfheal_metadata_mark_pending_xattrs(call_frame_t *frame, xlator_t *this,
+ inode_t *inode,
+ struct afr_reply *replies,
+ unsigned char *sources)
+{
+ int ret = 0;
+ int i = 0;
+ int m_idx = 0;
+ afr_private_t *priv = NULL;
+ int raw[AFR_NUM_CHANGE_LOGS] = {0};
+ dict_t *xattr = NULL;
+
+ priv = this->private;
+ m_idx = afr_index_for_transaction_type(AFR_METADATA_TRANSACTION);
+ raw[m_idx] = 1;
+
+ xattr = dict_new();
+ if (!xattr)
+ return -ENOMEM;
+
+ for (i = 0; i < priv->child_count; i++) {
+ if (sources[i])
+ continue;
+ ret = dict_set_static_bin(xattr, priv->pending_key[i], raw,
+ sizeof(int) * AFR_NUM_CHANGE_LOGS);
+ if (ret) {
+ ret = -1;
+ goto out;
+ }
+ }
+
+ for (i = 0; i < priv->child_count; i++) {
+ if (!sources[i])
+ continue;
+ ret = afr_selfheal_post_op(frame, this, inode, i, xattr, NULL);
+ if (ret < 0) {
+ gf_msg(this->name, GF_LOG_INFO, -ret, AFR_MSG_SELF_HEAL_INFO,
+ "Failed to set pending metadata xattr on child %d for %s", i,
+ uuid_utoa(inode->gfid));
+ goto out;
+ }
+ }
- if (!replies[i].valid)
- continue;
+ afr_replies_wipe(replies, priv->child_count);
+ ret = afr_selfheal_unlocked_discover(frame, inode, inode->gfid, replies);
- if (replies[i].op_ret != 0)
- continue;
+out:
+ if (xattr)
+ dict_unref(xattr);
+ return ret;
+}
- if (mtime_ns(&replies[i].poststat) <= mtime)
- continue;
+/*
+ * Look for mismatching uid/gid or mode or user xattrs even if
+ * AFR xattrs don't say so, and pick one arbitrarily as winner. */
- mtime = mtime_ns(&replies[i].poststat);
- source = i;
+static int
+__afr_selfheal_metadata_finalize_source(call_frame_t *frame, xlator_t *this,
+ inode_t *inode, unsigned char *sources,
+ unsigned char *sinks,
+ unsigned char *healed_sinks,
+ unsigned char *undid_pending,
+ unsigned char *locked_on,
+ struct afr_reply *replies)
+{
+ int i = 0;
+ afr_private_t *priv = NULL;
+ struct iatt srcstat = {
+ 0,
+ };
+ int source = -1;
+ int sources_count = 0;
+ int ret = 0;
+
+ priv = this->private;
+
+ sources_count = AFR_COUNT(sources, priv->child_count);
+
+ if ((AFR_CMP(locked_on, healed_sinks, priv->child_count) == 0) ||
+ !sources_count) {
+ source = afr_mark_split_brain_source_sinks(
+ frame, this, inode, sources, sinks, healed_sinks, locked_on,
+ replies, AFR_METADATA_TRANSACTION);
+ if (source >= 0) {
+ _afr_fav_child_reset_sink_xattrs(
+ frame, this, inode, source, healed_sinks, undid_pending,
+ AFR_METADATA_TRANSACTION, locked_on, replies);
+ goto out;
}
- if (source == -1)
- goto out;
+ /* If this is a directory mtime/ctime only split brain
+ use the most recent */
+ source = afr_dirtime_splitbrain_source(frame, this, replies, locked_on);
+ if (source != -1) {
+ gf_msg(this->name, GF_LOG_INFO, 0, AFR_MSG_SPLIT_BRAIN,
+ "clear time "
+ "split brain on %s",
+ uuid_utoa(replies[source].poststat.ia_gfid));
+ sources[source] = 1;
+ healed_sinks[source] = 0;
+ goto out;
+ }
- source_ia = replies[source].poststat;
- if (source_ia.ia_type != IA_IFDIR)
- goto out;
+ if (!priv->metadata_splitbrain_forced_heal) {
+ gf_event(EVENT_AFR_SPLIT_BRAIN,
+ "client-pid=%d;"
+ "subvol=%s;"
+ "type=metadata;file=%s",
+ this->ctx->cmd_args.client_pid, this->name,
+ uuid_utoa(inode->gfid));
+ return -EIO;
+ }
+ /* Metadata split brain, select one subvol
+ arbitrarily */
for (i = 0; i < priv->child_count; i++) {
- if (i == source)
- continue;
-
- if (!replies[i].valid)
- continue;
-
- if (replies[i].op_ret != 0)
- continue;
+ if (locked_on[i] && healed_sinks[i]) {
+ sources[i] = 1;
+ healed_sinks[i] = 0;
+ break;
+ }
+ }
+ }
+
+ /* No split brain at this point. If we were called from
+ * afr_heal_splitbrain_file(), abort.*/
+ if (afr_dict_contains_heal_op(frame))
+ return -EIO;
+
+ source = afr_choose_source_by_policy(priv, sources,
+ AFR_METADATA_TRANSACTION);
+ srcstat = replies[source].poststat;
+
+ for (i = 0; i < priv->child_count; i++) {
+ if (!sources[i] || i == source)
+ continue;
+ if (!IA_EQUAL(srcstat, replies[i].poststat, type) ||
+ !IA_EQUAL(srcstat, replies[i].poststat, uid) ||
+ !IA_EQUAL(srcstat, replies[i].poststat, gid) ||
+ !IA_EQUAL(srcstat, replies[i].poststat, prot)) {
+ gf_msg_debug(this->name, 0,
+ "%s: iatt mismatch "
+ "for source(%d) vs (%d)",
+ uuid_utoa(replies[source].poststat.ia_gfid), source,
+ i);
+ sources[i] = 0;
+ healed_sinks[i] = 1;
+ }
+ }
+
+ for (i = 0; i < priv->child_count; i++) {
+ if (!sources[i] || i == source)
+ continue;
+ if (!afr_xattrs_are_equal(replies[source].xdata, replies[i].xdata)) {
+ gf_msg_debug(this->name, 0,
+ "%s: xattr mismatch "
+ "for source(%d) vs (%d)",
+ uuid_utoa(replies[source].poststat.ia_gfid), source,
+ i);
+ sources[i] = 0;
+ healed_sinks[i] = 1;
+ }
+ }
+ if ((sources_count == priv->child_count) && (source > -1) &&
+ (AFR_COUNT(healed_sinks, priv->child_count) != 0)) {
+ ret = __afr_selfheal_metadata_mark_pending_xattrs(frame, this, inode,
+ replies, sources);
+ if (ret < 0)
+ return ret;
+ }
+out:
+ afr_mark_active_sinks(this, sources, locked_on, healed_sinks);
+ return source;
+}
- child_ia = replies[i].poststat;
+int
+__afr_selfheal_metadata_prepare(call_frame_t *frame, xlator_t *this,
+ inode_t *inode, unsigned char *locked_on,
+ unsigned char *sources, unsigned char *sinks,
+ unsigned char *healed_sinks,
+ unsigned char *undid_pending,
+ struct afr_reply *replies, unsigned char *pflag)
+{
+ int ret = -1;
+ int source = -1;
+ afr_private_t *priv = NULL;
+ int i = 0;
+ uint64_t *witness = NULL;
- if (!IA_EQUAL(source_ia, child_ia, gfid) ||
- !IA_EQUAL(source_ia, child_ia, type) ||
- !IA_EQUAL(source_ia, child_ia, prot) ||
- !IA_EQUAL(source_ia, child_ia, uid) ||
- !IA_EQUAL(source_ia, child_ia, gid) ||
- !afr_xattrs_are_equal (replies[source].xdata,
- replies[i].xdata))
- goto out;
- }
+ priv = this->private;
- /*
- * Metadata split brain is just about [amc]time
- * We return our source.
- */
- ret = source;
-out:
+ ret = afr_selfheal_unlocked_discover(frame, inode, inode->gfid, replies);
+ if (ret)
return ret;
-}
+ witness = alloca0(sizeof(*witness) * priv->child_count);
+ ret = afr_selfheal_find_direction(frame, this, replies,
+ AFR_METADATA_TRANSACTION, locked_on,
+ sources, sinks, witness, pflag);
+ if (ret)
+ return ret;
-/*
- * Look for mismatching uid/gid or mode or user xattrs even if
- * AFR xattrs don't say so, and pick one arbitrarily as winner. */
+ /* Initialize the healed_sinks[] array optimistically to
+ the intersection of to-be-healed (i.e sinks[]) and
+ the list of servers which are up (i.e locked_on[]).
+
+ As we encounter failures in the healing process, we
+ will unmark the respective servers in the healed_sinks[]
+ array.
+ */
+ AFR_INTERSECT(healed_sinks, sinks, locked_on, priv->child_count);
+
+ /* If any source has witness, pick first
+ * witness source and make everybody else sinks */
+ for (i = 0; i < priv->child_count; i++) {
+ if (sources[i] && witness[i]) {
+ source = i;
+ break;
+ }
+ }
-static int
-__afr_selfheal_metadata_finalize_source (call_frame_t *frame, xlator_t *this,
- unsigned char *sources,
- unsigned char *sinks,
- unsigned char *healed_sinks,
- unsigned char *locked_on,
- struct afr_reply *replies)
-{
- int i = 0;
- afr_private_t *priv = NULL;
- struct iatt first = {0, };
- int source = -1;
- int sources_count = 0;
-
- priv = this->private;
-
- sources_count = AFR_COUNT (sources, priv->child_count);
-
- if ((AFR_CMP (locked_on, healed_sinks, priv->child_count) == 0)
- || !sources_count) {
-
- source = afr_mark_split_brain_source_sinks (frame, this,
- sources, sinks,
- healed_sinks,
- locked_on, replies,
- AFR_METADATA_TRANSACTION);
- if (source >= 0)
- return source;
-
- /* If this is a directory mtime/ctime only split brain
- use the most recent */
- source = afr_dirtime_splitbrain_source (frame, this,
- replies, locked_on);
- if (source != -1) {
- gf_msg (this->name, GF_LOG_INFO, 0,
- AFR_MSG_SPLIT_BRAIN, "clear time "
- "split brain on %s",
- uuid_utoa (replies[source].poststat.ia_gfid));
- sources[source] = 1;
- healed_sinks[source] = 0;
- return source;
- }
-
- if (!priv->metadata_splitbrain_forced_heal) {
- return -EIO;
- }
-
- /* Metadata split brain, select one subvol
- arbitrarily */
- for (i = 0; i < priv->child_count; i++) {
- if (locked_on[i] && healed_sinks[i]) {
- sources[i] = 1;
- healed_sinks[i] = 0;
- break;
- }
- }
- }
-
- /* No split brain at this point. If we were called from
- * afr_heal_splitbrain_file(), abort.*/
- if (afr_dict_contains_heal_op(frame))
- return -EIO;
-
- for (i = 0; i < priv->child_count; i++) {
- if (!sources[i])
- continue;
- if (source == -1) {
- source = i;
- first = replies[i].poststat;
- break;
- }
- }
-
- for (i = 0; i < priv->child_count; i++) {
- if (!sources[i] || i == source)
- continue;
- if (!IA_EQUAL (first, replies[i].poststat, type) ||
- !IA_EQUAL (first, replies[i].poststat, uid) ||
- !IA_EQUAL (first, replies[i].poststat, gid) ||
- !IA_EQUAL (first, replies[i].poststat, prot)) {
- gf_msg_debug (this->name, 0, "%s: iatt mismatch "
- "for source(%d) vs (%d)",
- uuid_utoa
- (replies[source].poststat.ia_gfid),
- source, i);
- sources[i] = 0;
- healed_sinks[i] = 1;
- }
- }
-
- for (i =0; i < priv->child_count; i++) {
- if (!sources[i] || i == source)
- continue;
- if (!afr_xattrs_are_equal (replies[source].xdata,
- replies[i].xdata)) {
- gf_msg_debug (this->name, 0, "%s: xattr mismatch "
- "for source(%d) vs (%d)",
- uuid_utoa
- (replies[source].poststat.ia_gfid),
- source, i);
- sources[i] = 0;
- healed_sinks[i] = 1;
- }
+ if (source != -1) {
+ for (i = 0; i < priv->child_count; i++) {
+ if (i != source && sources[i]) {
+ sources[i] = 0;
+ healed_sinks[i] = 1;
+ }
}
+ }
- return source;
-}
+ source = __afr_selfheal_metadata_finalize_source(
+ frame, this, inode, sources, sinks, healed_sinks, undid_pending,
+ locked_on, replies);
+ if (source < 0)
+ return -EIO;
+
+ return source;
+}
int
-__afr_selfheal_metadata_prepare (call_frame_t *frame, xlator_t *this, inode_t *inode,
- unsigned char *locked_on, unsigned char *sources,
- unsigned char *sinks, unsigned char *healed_sinks,
- struct afr_reply *replies, gf_boolean_t *pflag)
+afr_selfheal_metadata(call_frame_t *frame, xlator_t *this, inode_t *inode)
{
- int ret = -1;
- int source = -1;
- afr_private_t *priv = NULL;
- int i = 0;
- uint64_t *witness = NULL;
+ afr_private_t *priv = NULL;
+ int ret = -1;
+ unsigned char *sources = NULL;
+ unsigned char *sinks = NULL;
+ unsigned char *data_lock = NULL;
+ unsigned char *healed_sinks = NULL;
+ unsigned char *undid_pending = NULL;
+ struct afr_reply *locked_replies = NULL;
+ gf_boolean_t did_sh = _gf_true;
+ int source = -1;
+
+ priv = this->private;
+
+ sources = alloca0(priv->child_count);
+ sinks = alloca0(priv->child_count);
+ healed_sinks = alloca0(priv->child_count);
+ undid_pending = alloca0(priv->child_count);
+ data_lock = alloca0(priv->child_count);
+
+ locked_replies = alloca0(sizeof(*locked_replies) * priv->child_count);
+
+ ret = afr_selfheal_inodelk(frame, this, inode, this->name, LLONG_MAX - 1, 0,
+ data_lock);
+ {
+ if (ret < priv->child_count) {
+ ret = -ENOTCONN;
+ goto unlock;
+ }
- priv = this->private;
+ ret = __afr_selfheal_metadata_prepare(
+ frame, this, inode, data_lock, sources, sinks, healed_sinks,
+ undid_pending, locked_replies, NULL);
+ if (ret < 0)
+ goto unlock;
- ret = afr_selfheal_unlocked_discover (frame, inode, inode->gfid,
- replies);
- if (ret)
- return ret;
-
- witness = alloca0 (sizeof (*witness) * priv->child_count);
- ret = afr_selfheal_find_direction (frame, this, replies,
- AFR_METADATA_TRANSACTION,
- locked_on, sources, sinks, witness,
- pflag);
- if (ret)
- return ret;
-
- /* Initialize the healed_sinks[] array optimistically to
- the intersection of to-be-healed (i.e sinks[]) and
- the list of servers which are up (i.e locked_on[]).
-
- As we encounter failures in the healing process, we
- will unmark the respective servers in the healed_sinks[]
- array.
- */
- AFR_INTERSECT (healed_sinks, sinks, locked_on, priv->child_count);
-
- /* If any source has witness, pick first
- * witness source and make everybody else sinks */
- for (i = 0; i < priv->child_count; i++) {
- if (sources[i] && witness[i]) {
- source = i;
- break;
- }
- }
+ source = ret;
- if (source != -1) {
- for (i = 0; i < priv->child_count; i++) {
- if (i != source && sources[i]) {
- sources[i] = 0;
- healed_sinks[i] = 1;
- }
- }
+ if (AFR_COUNT(healed_sinks, priv->child_count) == 0) {
+ did_sh = _gf_false;
+ goto unlock;
}
- source = __afr_selfheal_metadata_finalize_source (frame, this, sources,
- sinks, healed_sinks,
- locked_on, replies);
+ ret = __afr_selfheal_metadata_do(frame, this, inode, source,
+ healed_sinks, locked_replies);
+ if (ret)
+ goto unlock;
- if (source < 0)
- return -EIO;
+ afr_selfheal_restore_time(frame, this, inode, source, healed_sinks,
+ locked_replies);
- return source;
+ ret = afr_selfheal_undo_pending(
+ frame, this, inode, sources, sinks, healed_sinks, undid_pending,
+ AFR_METADATA_TRANSACTION, locked_replies, data_lock);
+ }
+unlock:
+ afr_selfheal_uninodelk(frame, this, inode, this->name, LLONG_MAX - 1, 0,
+ data_lock);
+
+ if (did_sh)
+ afr_log_selfheal(inode->gfid, this, ret, "metadata", source, sources,
+ healed_sinks);
+ else
+ ret = 1;
+
+ if (locked_replies)
+ afr_replies_wipe(locked_replies, priv->child_count);
+ return ret;
}
int
-afr_selfheal_metadata (call_frame_t *frame, xlator_t *this, inode_t *inode)
+afr_selfheal_metadata_by_stbuf(xlator_t *this, struct iatt *stbuf)
{
- afr_private_t *priv = NULL;
- int ret = -1;
- unsigned char *sources = NULL;
- unsigned char *sinks = NULL;
- unsigned char *data_lock = NULL;
- unsigned char *healed_sinks = NULL;
- struct afr_reply *locked_replies = NULL;
- gf_boolean_t did_sh = _gf_true;
- int source = -1;
-
- priv = this->private;
-
- sources = alloca0 (priv->child_count);
- sinks = alloca0 (priv->child_count);
- healed_sinks = alloca0 (priv->child_count);
- data_lock = alloca0 (priv->child_count);
-
- locked_replies = alloca0 (sizeof (*locked_replies) * priv->child_count);
-
- ret = afr_selfheal_inodelk (frame, this, inode, this->name,
- LLONG_MAX - 1, 0, data_lock);
- {
- if (ret < AFR_SH_MIN_PARTICIPANTS) {
- ret = -ENOTCONN;
- goto unlock;
- }
-
- ret = __afr_selfheal_metadata_prepare (frame, this, inode,
- data_lock, sources,
- sinks, healed_sinks,
- locked_replies, NULL);
- if (ret < 0)
- goto unlock;
-
- source = ret;
-
- if (AFR_COUNT (healed_sinks, priv->child_count) == 0) {
- did_sh = _gf_false;
- goto unlock;
- }
-
- ret = __afr_selfheal_metadata_do (frame, this, inode, source,
- healed_sinks, locked_replies);
- if (ret)
- goto unlock;
-
- ret = afr_selfheal_undo_pending (frame, this, inode, sources,
- sinks, healed_sinks,
- AFR_METADATA_TRANSACTION,
- locked_replies, data_lock);
- }
-unlock:
- afr_selfheal_uninodelk (frame, this, inode, this->name,
- LLONG_MAX -1, 0, data_lock);
-
- if (did_sh)
- afr_log_selfheal (inode->gfid, this, ret, "metadata", source,
- healed_sinks);
- else
- ret = 1;
-
- if (locked_replies)
- afr_replies_wipe (locked_replies, priv->child_count);
- return ret;
+ inode_t *inode = NULL;
+ inode_t *link_inode = NULL;
+ call_frame_t *frame = NULL;
+ int ret = 0;
+
+ if (gf_uuid_is_null(stbuf->ia_gfid)) {
+ ret = -EINVAL;
+ goto out;
+ }
+
+ inode = inode_new(this->itable);
+ if (!inode) {
+ ret = -ENOMEM;
+ goto out;
+ }
+
+ link_inode = inode_link(inode, NULL, NULL, stbuf);
+ if (!link_inode) {
+ ret = -ENOMEM;
+ goto out;
+ }
+
+ frame = afr_frame_create(this, &ret);
+ if (!frame) {
+ ret = -ret;
+ goto out;
+ }
+
+ ret = afr_selfheal_metadata(frame, this, link_inode);
+out:
+ if (inode)
+ inode_unref(inode);
+ if (link_inode)
+ inode_unref(link_inode);
+ if (frame)
+ AFR_STACK_DESTROY(frame);
+ return ret;
}
diff --git a/xlators/cluster/afr/src/afr-self-heal-name.c b/xlators/cluster/afr/src/afr-self-heal-name.c
index d8ac170f96c..834aac86d48 100644
--- a/xlators/cluster/afr/src/afr-self-heal-name.c
+++ b/xlators/cluster/afr/src/afr-self-heal-name.c
@@ -8,712 +8,609 @@
cases as published by the Free Software Foundation.
*/
-
+#include <glusterfs/events.h>
#include "afr.h"
#include "afr-self-heal.h"
#include "afr-messages.h"
int
-__afr_selfheal_assign_gfid (xlator_t *this, inode_t *parent, uuid_t pargfid,
- const char *bname, inode_t *inode,
- struct afr_reply *replies, void *gfid,
- unsigned char *locked_on,
- gf_boolean_t is_gfid_absent)
+__afr_selfheal_assign_gfid(xlator_t *this, inode_t *parent, uuid_t pargfid,
+ const char *bname, inode_t *inode,
+ struct afr_reply *replies, void *gfid,
+ unsigned char *locked_on, int source,
+ unsigned char *sources, gf_boolean_t is_gfid_absent,
+ int *gfid_idx)
{
- int ret = 0;
- int up_count = 0;
- int locked_count = 0;
- afr_private_t *priv = NULL;
- dict_t *xdata = NULL;
- loc_t loc = {0, };
- call_frame_t *new_frame = NULL;
- afr_local_t *new_local = NULL;
-
- priv = this->private;
-
- new_frame = afr_frame_create (this);
- if (!new_frame) {
- ret = -ENOMEM;
- goto out;
- }
-
- new_local = new_frame->local;
-
- gf_uuid_copy (parent->gfid, pargfid);
+ int ret = 0;
+ int up_count = 0;
+ int locked_count = 0;
+ afr_private_t *priv = NULL;
- xdata = dict_new ();
- if (!xdata) {
- ret = -ENOMEM;
- goto out;
- }
+ priv = this->private;
- ret = dict_set_static_bin (xdata, "gfid-req", gfid, 16);
- if (ret) {
- ret = -ENOMEM;
- goto out;
- }
+ gf_uuid_copy(parent->gfid, pargfid);
- loc.parent = inode_ref (parent);
- loc.inode = inode_ref (inode);
- gf_uuid_copy (loc.pargfid, pargfid);
- loc.name = bname;
+ if (is_gfid_absent) {
+ /* Ensure all children of AFR are up before performing gfid heal, to
+ * guard against the possibility of gfid split brain. */
- if (is_gfid_absent) {
- /* Ensure all children of AFR are up before performing gfid heal, to
- * guard against the possibility of gfid split brain. */
-
- up_count = AFR_COUNT (priv->child_up, priv->child_count);
- if (up_count != priv->child_count) {
- ret = -EIO;
- goto out;
- }
-
- locked_count = AFR_COUNT (locked_on, priv->child_count);
- if (locked_count != priv->child_count) {
- ret = -EIO;
- goto out;
- }
+ up_count = AFR_COUNT(priv->child_up, priv->child_count);
+ if (up_count != priv->child_count) {
+ ret = -EIO;
+ goto out;
}
- /* Clear out old replies here and wind lookup on all locked
- * subvolumes to achieve two things:
- * a. gfid heal on those subvolumes that do not have gfid associated
- * with the inode, and
- * b. refresh replies, which can be consumed by
- * __afr_selfheal_name_impunge().
- */
-
- afr_replies_wipe (replies, priv->child_count);
-
- AFR_ONLIST (locked_on, new_frame, afr_selfheal_discover_cbk, lookup,
- &loc, xdata);
+ locked_count = AFR_COUNT(locked_on, priv->child_count);
+ if (locked_count != priv->child_count) {
+ ret = -EIO;
+ goto out;
+ }
+ }
- afr_replies_copy (replies, new_local->replies, priv->child_count);
+ ret = afr_lookup_and_heal_gfid(this, parent, bname, inode, replies, source,
+ sources, gfid, gfid_idx);
out:
- loc_wipe (&loc);
- if (xdata)
- dict_unref (xdata);
- if (new_frame)
- AFR_STACK_DESTROY (new_frame);
-
- return ret;
+ return ret;
}
int
-__afr_selfheal_name_impunge (call_frame_t *frame, xlator_t *this,
- inode_t *parent, uuid_t pargfid,
- const char *bname, inode_t *inode,
- struct afr_reply *replies, int gfid_idx)
+__afr_selfheal_name_impunge(call_frame_t *frame, xlator_t *this,
+ inode_t *parent, uuid_t pargfid, const char *bname,
+ inode_t *inode, struct afr_reply *replies,
+ int gfid_idx)
{
- int i = 0;
- afr_private_t *priv = NULL;
- int ret = 0;
- unsigned char *newentry = NULL;
- unsigned char *sources = NULL;
-
- priv = this->private;
-
- newentry = alloca0 (priv->child_count);
- sources = alloca0 (priv->child_count);
+ int i = 0;
+ afr_private_t *priv = NULL;
+ int ret = 0;
+ unsigned char *sources = NULL;
- gf_uuid_copy (parent->gfid, pargfid);
+ priv = this->private;
- for (i = 0; i < priv->child_count; i++) {
- if (!replies[i].valid)
- continue;
+ sources = alloca0(priv->child_count);
- if (gf_uuid_compare (replies[i].poststat.ia_gfid,
- replies[gfid_idx].poststat.ia_gfid) == 0) {
- sources[i] = 1;
- continue;
- }
+ gf_uuid_copy(parent->gfid, pargfid);
- ret |= afr_selfheal_recreate_entry (this, i, gfid_idx, parent,
- bname, inode, replies,
- newentry);
- }
+ for (i = 0; i < priv->child_count; i++) {
+ if (!replies[i].valid || replies[i].op_ret != 0)
+ continue;
- if (AFR_COUNT (newentry, priv->child_count))
- afr_selfheal_newentry_mark (frame, this, inode, gfid_idx, replies,
- sources, newentry);
- return ret;
-}
+ if (gf_uuid_compare(replies[i].poststat.ia_gfid,
+ replies[gfid_idx].poststat.ia_gfid) == 0) {
+ sources[i] = 1;
+ continue;
+ }
+ }
+ for (i = 0; i < priv->child_count; i++) {
+ if (sources[i])
+ continue;
-int
-__afr_selfheal_name_expunge (xlator_t *this, inode_t *parent, uuid_t pargfid,
- const char *bname, inode_t *inode,
- struct afr_reply *replies)
-{
- loc_t loc = {0, };
- int i = 0;
- afr_private_t *priv = NULL;
- char g[64];
- int ret = 0;
-
- priv = this->private;
-
- loc.parent = inode_ref (parent);
- gf_uuid_copy (loc.pargfid, pargfid);
- loc.name = bname;
- loc.inode = inode_ref (inode);
-
- for (i = 0; i < priv->child_count; i++) {
- if (!replies[i].valid)
- continue;
-
- if (replies[i].op_ret)
- continue;
-
- switch (replies[i].poststat.ia_type) {
- case IA_IFDIR:
- gf_msg (this->name, GF_LOG_WARNING, 0,
- AFR_MSG_EXPUNGING_FILE_OR_DIR,
- "expunging dir %s/%s (%s) on %s",
- uuid_utoa (pargfid), bname,
- uuid_utoa_r (replies[i].poststat.ia_gfid, g),
- priv->children[i]->name);
-
- ret |= syncop_rmdir (priv->children[i], &loc, 1, NULL,
- NULL);
- break;
- default:
- gf_msg (this->name, GF_LOG_WARNING, 0,
- AFR_MSG_EXPUNGING_FILE_OR_DIR,
- "expunging file %s/%s (%s) on %s",
- uuid_utoa (pargfid), bname,
- uuid_utoa_r (replies[i].poststat.ia_gfid, g),
- priv->children[i]->name);
-
- ret |= syncop_unlink (priv->children[i], &loc, NULL,
- NULL);
- break;
- }
- }
-
- loc_wipe (&loc);
-
- return ret;
+ ret |= afr_selfheal_recreate_entry(frame, i, gfid_idx, sources, parent,
+ bname, inode, replies);
+ }
+ return ret;
}
-/* This function is to be called after ensuring that there is no gfid mismatch
- * for the inode across multiple sources
- */
-static int
-afr_selfheal_gfid_idx_get (xlator_t *this, struct afr_reply *replies,
- unsigned char *sources)
+int
+__afr_selfheal_name_expunge(xlator_t *this, inode_t *parent, uuid_t pargfid,
+ const char *bname, inode_t *inode,
+ struct afr_reply *replies)
{
- int i = 0;
- int gfid_idx = -1;
- afr_private_t *priv = NULL;
+ int i = 0;
+ afr_private_t *priv = NULL;
+ int ret = 0;
- priv = this->private;
+ priv = this->private;
- for (i = 0; i < priv->child_count; i++) {
- if (!replies[i].valid)
- continue;
+ for (i = 0; i < priv->child_count; i++) {
+ if (!replies[i].valid)
+ continue;
- if (!sources[i])
- continue;
+ if (replies[i].op_ret)
+ continue;
- if (gf_uuid_is_null (replies[i].poststat.ia_gfid))
- continue;
+ ret |= afr_selfheal_entry_delete(this, parent, bname, inode, i,
+ replies);
+ }
- gfid_idx = i;
- break;
- }
- return gfid_idx;
+ return ret;
}
static gf_boolean_t
-afr_selfheal_name_need_heal_check (xlator_t *this, struct afr_reply *replies)
+afr_selfheal_name_need_heal_check(xlator_t *this, struct afr_reply *replies)
{
- int i = 0;
- int first_idx = -1;
- gf_boolean_t need_heal = _gf_false;
- afr_private_t *priv = NULL;
-
- priv = this->private;
+ int i = 0;
+ int first_idx = -1;
+ gf_boolean_t need_heal = _gf_false;
+ afr_private_t *priv = NULL;
- for (i = 0; i < priv->child_count; i++) {
- if (!replies[i].valid)
- continue;
+ priv = this->private;
- if ((replies[i].op_ret == -1) &&
- (replies[i].op_errno == ENODATA))
- need_heal = _gf_true;
+ for (i = 0; i < priv->child_count; i++) {
+ if (!replies[i].valid)
+ continue;
- if (first_idx == -1) {
- first_idx = i;
- continue;
- }
+ if ((replies[i].op_ret == -1) && (replies[i].op_errno == ENODATA))
+ need_heal = _gf_true;
- if (replies[i].op_ret != replies[first_idx].op_ret)
- need_heal = _gf_true;
+ if (first_idx == -1) {
+ first_idx = i;
+ continue;
+ }
- if (gf_uuid_compare (replies[i].poststat.ia_gfid,
- replies[first_idx].poststat.ia_gfid))
- need_heal = _gf_true;
+ if (replies[i].op_ret != replies[first_idx].op_ret)
+ need_heal = _gf_true;
- if ((replies[i].op_ret == 0) &&
- (gf_uuid_is_null(replies[i].poststat.ia_gfid)))
- need_heal = _gf_true;
+ if (gf_uuid_compare(replies[i].poststat.ia_gfid,
+ replies[first_idx].poststat.ia_gfid))
+ need_heal = _gf_true;
- }
+ if ((replies[i].op_ret == 0) &&
+ (gf_uuid_is_null(replies[i].poststat.ia_gfid)))
+ need_heal = _gf_true;
+ }
- return need_heal;
+ return need_heal;
}
static int
-afr_selfheal_name_type_mismatch_check (xlator_t *this, struct afr_reply *replies,
- int source, unsigned char *sources,
- uuid_t pargfid, const char *bname)
+afr_selfheal_name_type_mismatch_check(xlator_t *this, struct afr_reply *replies,
+ int source, unsigned char *sources,
+ uuid_t pargfid, const char *bname)
{
- int i = 0;
- int type_idx = -1;
- ia_type_t inode_type = IA_INVAL;
- afr_private_t *priv = NULL;
-
- priv = this->private;
+ int i = 0;
+ int type_idx = -1;
+ ia_type_t inode_type = IA_INVAL;
+ ia_type_t inode_type1 = IA_INVAL;
+ afr_private_t *priv = NULL;
- for (i = 0; i < priv->child_count; i++) {
- if (!replies[i].valid)
- continue;
+ priv = this->private;
- if (replies[i].poststat.ia_type == IA_INVAL)
- continue;
+ for (i = 0; i < priv->child_count; i++) {
+ if (!replies[i].valid || replies[i].op_ret != 0)
+ continue;
- if (inode_type == IA_INVAL) {
- inode_type = replies[i].poststat.ia_type;
- type_idx = i;
- continue;
- }
+ if (replies[i].poststat.ia_type == IA_INVAL)
+ continue;
- if (sources[i] || source == -1) {
- if ((sources[type_idx] || source == -1) &&
- (inode_type != replies[i].poststat.ia_type)) {
- gf_msg (this->name, GF_LOG_WARNING, 0,
- AFR_MSG_SPLIT_BRAIN,
- "Type mismatch for <gfid:%s>/%s: "
- "%d on %s and %d on %s",
- uuid_utoa(pargfid), bname,
- replies[i].poststat.ia_type,
- priv->children[i]->name,
- replies[type_idx].poststat.ia_type,
- priv->children[type_idx]->name);
-
- return -EIO;
- }
- inode_type = replies[i].poststat.ia_type;
- type_idx = i;
- }
+ if (inode_type == IA_INVAL) {
+ inode_type = replies[i].poststat.ia_type;
+ type_idx = i;
+ continue;
}
- return 0;
+ inode_type1 = replies[i].poststat.ia_type;
+ if (sources[i] || source == -1) {
+ if ((sources[type_idx] || source == -1) &&
+ (inode_type != inode_type1)) {
+ gf_msg(this->name, GF_LOG_WARNING, 0, AFR_MSG_SPLIT_BRAIN,
+ "Type mismatch for <gfid:%s>/%s: "
+ "%s on %s and %s on %s",
+ uuid_utoa(pargfid), bname,
+ gf_inode_type_to_str(inode_type1),
+ priv->children[i]->name,
+ gf_inode_type_to_str(inode_type),
+ priv->children[type_idx]->name);
+ gf_event(EVENT_AFR_SPLIT_BRAIN,
+ "client-pid=%d;"
+ "subvol=%s;type=file;"
+ "file=<gfid:%s>/%s;count=2;"
+ "child-%d=%s;type-%d=%s;child-%d=%s;"
+ "type-%d=%s",
+ this->ctx->cmd_args.client_pid, this->name,
+ uuid_utoa(pargfid), bname, i, priv->children[i]->name,
+ i, gf_inode_type_to_str(inode_type1), type_idx,
+ priv->children[type_idx]->name, type_idx,
+ gf_inode_type_to_str(inode_type));
+ return -EIO;
+ }
+ inode_type = replies[i].poststat.ia_type;
+ type_idx = i;
+ }
+ }
+ return 0;
}
static int
-afr_selfheal_name_gfid_mismatch_check (xlator_t *this, struct afr_reply *replies,
- int source, unsigned char *sources,
- int *gfid_idx, uuid_t pargfid,
- const char *bname)
+afr_selfheal_name_gfid_mismatch_check(xlator_t *this, struct afr_reply *replies,
+ int source, unsigned char *sources,
+ int *gfid_idx, uuid_t pargfid,
+ const char *bname, inode_t *inode,
+ unsigned char *locked_on, dict_t *xdata)
{
- int i = 0;
- int gfid_idx_iter = -1;
- void *gfid = NULL;
- afr_private_t *priv = NULL;
- char g1[64], g2[64];
-
- priv = this->private;
-
- for (i = 0; i < priv->child_count; i++) {
- if (!replies[i].valid)
- continue;
-
- if (gf_uuid_is_null (replies[i].poststat.ia_gfid))
- continue;
-
- if (!gfid) {
- gfid = &replies[i].poststat.ia_gfid;
- gfid_idx_iter = i;
- continue;
- }
-
- if (sources[i] || source == -1) {
- if ((sources[gfid_idx_iter] || source == -1) &&
- gf_uuid_compare (gfid, replies[i].poststat.ia_gfid)) {
- gf_msg (this->name, GF_LOG_WARNING, 0,
- AFR_MSG_SPLIT_BRAIN,
- "GFID mismatch for <gfid:%s>/%s "
- "%s on %s and %s on %s",
- uuid_utoa (pargfid), bname,
- uuid_utoa_r (replies[i].poststat.ia_gfid, g1),
- priv->children[i]->name,
- uuid_utoa_r (replies[gfid_idx_iter].poststat.ia_gfid, g2),
- priv->children[gfid_idx_iter]->name);
-
- return -EIO;
- }
-
- gfid = &replies[i].poststat.ia_gfid;
- gfid_idx_iter = i;
- }
- }
-
- *gfid_idx = gfid_idx_iter;
- return 0;
+ int i = 0;
+ int gfid_idx_iter = -1;
+ int ret = -1;
+ void *gfid = NULL;
+ void *gfid1 = NULL;
+ afr_private_t *priv = NULL;
+
+ priv = this->private;
+
+ for (i = 0; i < priv->child_count; i++) {
+ if (!replies[i].valid || replies[i].op_ret != 0)
+ continue;
+
+ if (gf_uuid_is_null(replies[i].poststat.ia_gfid))
+ continue;
+
+ if (!gfid) {
+ gfid = &replies[i].poststat.ia_gfid;
+ gfid_idx_iter = i;
+ continue;
+ }
+
+ gfid1 = &replies[i].poststat.ia_gfid;
+ if (sources[i] || source == -1) {
+ if ((sources[gfid_idx_iter] || source == -1) &&
+ gf_uuid_compare(gfid, gfid1)) {
+ ret = afr_gfid_split_brain_source(this, replies, inode, pargfid,
+ bname, gfid_idx_iter, i,
+ locked_on, gfid_idx, xdata);
+ if (!ret && *gfid_idx >= 0) {
+ ret = dict_set_sizen_str_sizen(xdata, "gfid-heal-msg",
+ "GFID split-brain resolved");
+ if (ret)
+ gf_msg(this->name, GF_LOG_ERROR, 0,
+ AFR_MSG_DICT_SET_FAILED,
+ "Error setting gfid-"
+ "heal-msg dict");
+ }
+ return ret;
+ }
+ gfid = &replies[i].poststat.ia_gfid;
+ gfid_idx_iter = i;
+ }
+ }
+
+ *gfid_idx = gfid_idx_iter;
+ return 0;
}
static gf_boolean_t
-afr_selfheal_name_source_empty_check (xlator_t *this, struct afr_reply *replies,
- unsigned char *sources, int source)
+afr_selfheal_name_source_empty_check(xlator_t *this, struct afr_reply *replies,
+ unsigned char *sources, int source)
{
- int i = 0;
- afr_private_t *priv = NULL;
- gf_boolean_t source_is_empty = _gf_true;
+ int i = 0;
+ afr_private_t *priv = NULL;
+ gf_boolean_t source_is_empty = _gf_true;
- priv = this->private;
+ priv = this->private;
- if (source == -1) {
- source_is_empty = _gf_false;
- goto out;
- }
+ if (source == -1) {
+ source_is_empty = _gf_false;
+ goto out;
+ }
- for (i = 0; i < priv->child_count; i++) {
- if (!sources[i])
- continue;
+ for (i = 0; i < priv->child_count; i++) {
+ if (!sources[i])
+ continue;
- if (replies[i].op_ret == -1 && replies[i].op_errno == ENOENT)
- continue;
+ if (replies[i].op_ret == -1 && replies[i].op_errno == ENOENT)
+ continue;
- source_is_empty = _gf_false;
- break;
- }
+ source_is_empty = _gf_false;
+ break;
+ }
out:
- return source_is_empty;
+ return source_is_empty;
}
int
-__afr_selfheal_name_do (call_frame_t *frame, xlator_t *this, inode_t *parent,
- uuid_t pargfid, const char *bname, inode_t *inode,
- unsigned char *sources, unsigned char *sinks,
- unsigned char *healed_sinks, int source,
- unsigned char *locked_on, struct afr_reply *replies,
- void *gfid_req)
+__afr_selfheal_name_do(call_frame_t *frame, xlator_t *this, inode_t *parent,
+ uuid_t pargfid, const char *bname, inode_t *inode,
+ unsigned char *sources, unsigned char *sinks,
+ unsigned char *healed_sinks, int source,
+ unsigned char *locked_on, struct afr_reply *replies,
+ void *gfid_req, dict_t *xdata)
{
- int gfid_idx = -1;
- int ret = -1;
- void *gfid = NULL;
- gf_boolean_t source_is_empty = _gf_true;
- gf_boolean_t need_heal = _gf_false;
- gf_boolean_t is_gfid_absent = _gf_false;
-
- need_heal = afr_selfheal_name_need_heal_check (this, replies);
- if (!need_heal)
- return 0;
-
- source_is_empty = afr_selfheal_name_source_empty_check (this, replies,
- sources,
- source);
- if (source_is_empty) {
- ret = __afr_selfheal_name_expunge (this, parent, pargfid,
- bname, inode, replies);
- if (ret == -EIO)
- ret = -1;
- return ret;
- }
-
- ret = afr_selfheal_name_type_mismatch_check (this, replies, source,
- sources, pargfid, bname);
- if (ret)
- return ret;
+ int gfid_idx = -1;
+ int ret = -1;
+ void *gfid = NULL;
+ gf_boolean_t source_is_empty = _gf_true;
+ gf_boolean_t need_heal = _gf_false;
+ gf_boolean_t is_gfid_absent = _gf_false;
+
+ need_heal = afr_selfheal_name_need_heal_check(this, replies);
+ if (!need_heal)
+ return 0;
- ret = afr_selfheal_name_gfid_mismatch_check (this, replies, source,
- sources, &gfid_idx,
- pargfid, bname);
- if (ret)
- return ret;
+ source_is_empty = afr_selfheal_name_source_empty_check(this, replies,
+ sources, source);
+ if (source_is_empty) {
+ ret = __afr_selfheal_name_expunge(this, parent, pargfid, bname, inode,
+ replies);
+ if (ret == -EIO)
+ ret = -1;
+ return ret;
+ }
- if (gfid_idx == -1) {
- if (!gfid_req || gf_uuid_is_null (gfid_req))
- return -1;
- gfid = gfid_req;
- } else {
- gfid = &replies[gfid_idx].poststat.ia_gfid;
- }
+ ret = afr_selfheal_name_type_mismatch_check(this, replies, source, sources,
+ pargfid, bname);
+ if (ret)
+ return ret;
- is_gfid_absent = (gfid_idx == -1) ? _gf_true : _gf_false;
- ret = __afr_selfheal_assign_gfid (this, parent, pargfid, bname, inode,
- replies, gfid, locked_on,
- is_gfid_absent);
- if (ret)
- return ret;
+ ret = afr_selfheal_name_gfid_mismatch_check(this, replies, source, sources,
+ &gfid_idx, pargfid, bname,
+ inode, locked_on, xdata);
+ if (ret)
+ return ret;
- if (gfid_idx == -1) {
- gfid_idx = afr_selfheal_gfid_idx_get (this, replies, sources);
- if (gfid_idx == -1)
- return -1;
- }
+ if (gfid_idx == -1) {
+ if (!gfid_req || gf_uuid_is_null(gfid_req))
+ return -1;
+ gfid = gfid_req;
+ } else {
+ gfid = &replies[gfid_idx].poststat.ia_gfid;
+ if (source == -1)
+ /* Either entry split-brain or dirty xattrs are present on parent.*/
+ source = gfid_idx;
+ }
+
+ is_gfid_absent = (gfid_idx == -1) ? _gf_true : _gf_false;
+ ret = __afr_selfheal_assign_gfid(this, parent, pargfid, bname, inode,
+ replies, gfid, locked_on, source, sources,
+ is_gfid_absent, &gfid_idx);
+ if (ret || (gfid_idx < 0))
+ return ret;
- ret = __afr_selfheal_name_impunge (frame, this, parent, pargfid,
- bname, inode,
- replies, gfid_idx);
- if (ret == -EIO)
- ret = -1;
+ ret = __afr_selfheal_name_impunge(frame, this, parent, pargfid, bname,
+ inode, replies, gfid_idx);
+ if (ret == -EIO)
+ ret = -1;
- return ret;
+ return ret;
}
-
int
-__afr_selfheal_name_finalize_source (xlator_t *this, unsigned char *sources,
- unsigned char *healed_sinks,
- unsigned char *locked_on,
- struct afr_reply *replies,
- uint64_t *witness)
+__afr_selfheal_name_finalize_source(xlator_t *this, unsigned char *sources,
+ unsigned char *healed_sinks,
+ unsigned char *locked_on, uint64_t *witness)
{
- int i = 0;
- afr_private_t *priv = NULL;
- int source = -1;
- int sources_count = 0;
-
- priv = this->private;
-
- sources_count = AFR_COUNT (sources, priv->child_count);
-
- if ((AFR_CMP (locked_on, healed_sinks, priv->child_count) == 0)
- || !sources_count || afr_does_witness_exist (this, witness)) {
- memset (sources, 0, sizeof (*sources) * priv->child_count);
- afr_mark_active_sinks (this, sources, locked_on, healed_sinks);
- return -1;
- }
-
- for (i = 0; i < priv->child_count; i++) {
- if (sources[i]) {
- source = i;
- break;
- }
- }
-
- return source;
+ int i = 0;
+ afr_private_t *priv = NULL;
+ int source = -1;
+ int sources_count = 0;
+
+ priv = this->private;
+
+ sources_count = AFR_COUNT(sources, priv->child_count);
+
+ if ((AFR_CMP(locked_on, healed_sinks, priv->child_count) == 0) ||
+ !sources_count || afr_does_witness_exist(this, witness)) {
+ memset(sources, 0, sizeof(*sources) * priv->child_count);
+ afr_mark_active_sinks(this, sources, locked_on, healed_sinks);
+ return -1;
+ }
+
+ for (i = 0; i < priv->child_count; i++) {
+ if (sources[i]) {
+ source = i;
+ break;
+ }
+ }
+
+ return source;
}
int
-__afr_selfheal_name_prepare (call_frame_t *frame, xlator_t *this, inode_t *parent,
- uuid_t pargfid, unsigned char *locked_on,
- unsigned char *sources, unsigned char *sinks,
- unsigned char *healed_sinks, int *source_p)
+__afr_selfheal_name_prepare(call_frame_t *frame, xlator_t *this,
+ inode_t *parent, uuid_t pargfid,
+ unsigned char *locked_on, unsigned char *sources,
+ unsigned char *sinks, unsigned char *healed_sinks,
+ int *source_p)
{
- int ret = -1;
- int source = -1;
- afr_private_t *priv = NULL;
- struct afr_reply *replies = NULL;
- uint64_t *witness = NULL;
-
- priv = this->private;
-
- replies = alloca0 (priv->child_count * sizeof(*replies));
-
- ret = afr_selfheal_unlocked_discover (frame, parent, pargfid, replies);
- if (ret)
- goto out;
-
- witness = alloca0 (sizeof (*witness) * priv->child_count);
- ret = afr_selfheal_find_direction (frame, this, replies,
- AFR_ENTRY_TRANSACTION,
- locked_on, sources, sinks, witness,
- NULL);
- if (ret)
- goto out;
-
- /* Initialize the healed_sinks[] array optimistically to
- the intersection of to-be-healed (i.e sinks[]) and
- the list of servers which are up (i.e locked_on[]).
-
- As we encounter failures in the healing process, we
- will unmark the respective servers in the healed_sinks[]
- array.
- */
- AFR_INTERSECT (healed_sinks, sinks, locked_on, priv->child_count);
-
- source = __afr_selfheal_name_finalize_source (this, sources,
- healed_sinks,
- locked_on, replies,
- witness);
- if (source < 0) {
- /* If source is < 0 (typically split-brain), we perform a
- conservative merge of entries rather than erroring out */
- }
- *source_p = source;
+ int ret = -1;
+ int source = -1;
+ afr_private_t *priv = NULL;
+ struct afr_reply *replies = NULL;
+ uint64_t *witness = NULL;
+
+ priv = this->private;
+
+ replies = alloca0(priv->child_count * sizeof(*replies));
+
+ ret = afr_selfheal_unlocked_discover(frame, parent, pargfid, replies);
+ if (ret)
+ goto out;
+
+ witness = alloca0(sizeof(*witness) * priv->child_count);
+ ret = afr_selfheal_find_direction(frame, this, replies,
+ AFR_ENTRY_TRANSACTION, locked_on, sources,
+ sinks, witness, NULL);
+ if (ret)
+ goto out;
+
+ /* Initialize the healed_sinks[] array optimistically to
+ the intersection of to-be-healed (i.e sinks[]) and
+ the list of servers which are up (i.e locked_on[]).
+
+ As we encounter failures in the healing process, we
+ will unmark the respective servers in the healed_sinks[]
+ array.
+ */
+ AFR_INTERSECT(healed_sinks, sinks, locked_on, priv->child_count);
+
+ source = __afr_selfheal_name_finalize_source(this, sources, healed_sinks,
+ locked_on, witness);
+ if (source < 0) {
+ /* If source is < 0 (typically split-brain), we perform a
+ conservative merge of entries rather than erroring out */
+ }
+ *source_p = source;
out:
- if (replies)
- afr_replies_wipe (replies, priv->child_count);
+ if (replies)
+ afr_replies_wipe(replies, priv->child_count);
- return ret;
+ return ret;
}
-
int
-afr_selfheal_name_do (call_frame_t *frame, xlator_t *this, inode_t *parent,
- uuid_t pargfid, const char *bname, void *gfid_req)
+afr_selfheal_name_do(call_frame_t *frame, xlator_t *this, inode_t *parent,
+ uuid_t pargfid, const char *bname, void *gfid_req,
+ dict_t *xdata)
{
- afr_private_t *priv = NULL;
- unsigned char *sources = NULL;
- unsigned char *sinks = NULL;
- unsigned char *healed_sinks = NULL;
- unsigned char *locked_on = NULL;
- int source = -1;
- struct afr_reply *replies = NULL;
- int ret = -1;
- inode_t *inode = NULL;
- dict_t *xattr = NULL;
-
- xattr = dict_new ();
- if (!xattr)
- return -ENOMEM;
-
- ret = dict_set_int32 (xattr, GF_GFIDLESS_LOOKUP, 1);
- if (ret) {
- dict_destroy (xattr);
- return -1;
+ afr_private_t *priv = NULL;
+ unsigned char *sources = NULL;
+ unsigned char *sinks = NULL;
+ unsigned char *healed_sinks = NULL;
+ unsigned char *locked_on = NULL;
+ int source = -1;
+ struct afr_reply *replies = NULL;
+ int ret = -1;
+ inode_t *inode = NULL;
+ dict_t *xattr = NULL;
+
+ xattr = dict_new();
+ if (!xattr)
+ return -ENOMEM;
+
+ ret = dict_set_int32_sizen(xattr, GF_GFIDLESS_LOOKUP, 1);
+ if (ret) {
+ dict_unref(xattr);
+ return -1;
+ }
+
+ priv = this->private;
+
+ locked_on = alloca0(priv->child_count);
+ sources = alloca0(priv->child_count);
+ sinks = alloca0(priv->child_count);
+ healed_sinks = alloca0(priv->child_count);
+
+ replies = alloca0(priv->child_count * sizeof(*replies));
+
+ ret = afr_selfheal_entrylk(frame, this, parent, this->name, bname,
+ locked_on);
+ {
+ if (ret < priv->child_count) {
+ ret = -ENOTCONN;
+ goto unlock;
+ }
+
+ ret = __afr_selfheal_name_prepare(frame, this, parent, pargfid,
+ locked_on, sources, sinks,
+ healed_sinks, &source);
+ if (ret)
+ goto unlock;
+
+ inode = afr_selfheal_unlocked_lookup_on(frame, parent, bname, replies,
+ locked_on, xattr);
+ if (!inode) {
+ ret = -ENOMEM;
+ goto unlock;
}
- priv = this->private;
-
- locked_on = alloca0 (priv->child_count);
- sources = alloca0 (priv->child_count);
- sinks = alloca0 (priv->child_count);
- healed_sinks = alloca0 (priv->child_count);
-
- replies = alloca0 (priv->child_count * sizeof(*replies));
-
- ret = afr_selfheal_entrylk (frame, this, parent, this->name, bname,
- locked_on);
- {
- if (ret < AFR_SH_MIN_PARTICIPANTS) {
- ret = -ENOTCONN;
- goto unlock;
- }
-
- ret = __afr_selfheal_name_prepare (frame, this, parent, pargfid,
- locked_on, sources, sinks,
- healed_sinks, &source);
- if (ret)
- goto unlock;
-
- inode = afr_selfheal_unlocked_lookup_on (frame, parent, bname,
- replies, locked_on,
- xattr);
- if (!inode) {
- ret = -ENOMEM;
- goto unlock;
- }
-
- ret = __afr_selfheal_name_do (frame, this, parent, pargfid,
- bname, inode, sources, sinks,
- healed_sinks, source, locked_on,
- replies, gfid_req);
- }
+ ret = __afr_selfheal_name_do(frame, this, parent, pargfid, bname, inode,
+ sources, sinks, healed_sinks, source,
+ locked_on, replies, gfid_req, xdata);
+ }
unlock:
- afr_selfheal_unentrylk (frame, this, parent, this->name, bname,
- locked_on);
- if (inode)
- inode_unref (inode);
+ afr_selfheal_unentrylk(frame, this, parent, this->name, bname, locked_on,
+ NULL);
+ if (inode)
+ inode_unref(inode);
- if (replies)
- afr_replies_wipe (replies, priv->child_count);
- if (xattr)
- dict_unref (xattr);
+ if (replies)
+ afr_replies_wipe(replies, priv->child_count);
+ if (xattr)
+ dict_unref(xattr);
- return ret;
+ return ret;
}
-
int
-afr_selfheal_name_unlocked_inspect (call_frame_t *frame, xlator_t *this,
- inode_t *parent, uuid_t pargfid,
- const char *bname, gf_boolean_t *need_heal)
+afr_selfheal_name_unlocked_inspect(call_frame_t *frame, xlator_t *this,
+ inode_t *parent, uuid_t pargfid,
+ const char *bname, gf_boolean_t *need_heal)
{
- afr_private_t *priv = NULL;
- int i = 0;
- struct afr_reply *replies = NULL;
- inode_t *inode = NULL;
- int first_idx = -1;
-
- priv = this->private;
-
- replies = alloca0 (sizeof (*replies) * priv->child_count);
-
- inode = afr_selfheal_unlocked_lookup_on (frame, parent, bname,
- replies, priv->child_up, NULL);
- if (!inode)
- return -ENOMEM;
-
- for (i = 0; i < priv->child_count; i++) {
- if (!replies[i].valid)
- continue;
-
- if ((replies[i].op_ret == -1) &&
- (replies[i].op_errno == ENODATA))
- *need_heal = _gf_true;
-
- if (first_idx == -1) {
- first_idx = i;
- continue;
- }
-
- if (replies[i].op_ret != replies[first_idx].op_ret)
- *need_heal = _gf_true;
-
- if (gf_uuid_compare (replies[i].poststat.ia_gfid,
- replies[first_idx].poststat.ia_gfid))
- *need_heal = _gf_true;
- }
-
- if (inode)
- inode_unref (inode);
- if (replies)
- afr_replies_wipe (replies, priv->child_count);
- return 0;
+ afr_private_t *priv = NULL;
+ int i = 0;
+ struct afr_reply *replies = NULL;
+ inode_t *inode = NULL;
+ int first_idx = -1;
+ afr_local_t *local = NULL;
+
+ priv = this->private;
+ local = frame->local;
+
+ replies = alloca0(sizeof(*replies) * priv->child_count);
+
+ inode = afr_selfheal_unlocked_lookup_on(frame, parent, bname, replies,
+ local->child_up, NULL);
+ if (!inode)
+ return -ENOMEM;
+
+ for (i = 0; i < priv->child_count; i++) {
+ if (!replies[i].valid)
+ continue;
+
+ if ((replies[i].op_ret == -1) && (replies[i].op_errno == ENODATA)) {
+ *need_heal = _gf_true;
+ break;
+ }
+
+ if (first_idx == -1) {
+ first_idx = i;
+ continue;
+ }
+
+ if (replies[i].op_ret != replies[first_idx].op_ret) {
+ *need_heal = _gf_true;
+ break;
+ }
+
+ if (gf_uuid_compare(replies[i].poststat.ia_gfid,
+ replies[first_idx].poststat.ia_gfid)) {
+ *need_heal = _gf_true;
+ break;
+ }
+ }
+
+ if (inode)
+ inode_unref(inode);
+ if (replies)
+ afr_replies_wipe(replies, priv->child_count);
+ return 0;
}
int
-afr_selfheal_name (xlator_t *this, uuid_t pargfid, const char *bname,
- void *gfid_req)
+afr_selfheal_name(xlator_t *this, uuid_t pargfid, const char *bname,
+ void *gfid_req, dict_t *xdata)
{
- inode_t *parent = NULL;
- call_frame_t *frame = NULL;
- int ret = -1;
- gf_boolean_t need_heal = _gf_false;
-
- parent = afr_inode_find (this, pargfid);
- if (!parent)
- goto out;
-
- frame = afr_frame_create (this);
- if (!frame)
- goto out;
-
- ret = afr_selfheal_name_unlocked_inspect (frame, this, parent, pargfid,
- bname, &need_heal);
- if (ret)
- goto out;
-
- if (need_heal) {
- ret = afr_selfheal_name_do (frame, this, parent, pargfid, bname,
- gfid_req);
- if (ret)
- goto out;
- }
+ inode_t *parent = NULL;
+ call_frame_t *frame = NULL;
+ int ret = -1;
+ gf_boolean_t need_heal = _gf_false;
+
+ parent = afr_inode_find(this, pargfid);
+ if (!parent)
+ goto out;
+
+ frame = afr_frame_create(this, NULL);
+ if (!frame)
+ goto out;
+
+ ret = afr_selfheal_name_unlocked_inspect(frame, this, parent, pargfid,
+ bname, &need_heal);
+ if (ret)
+ goto out;
+
+ if (need_heal) {
+ ret = afr_selfheal_name_do(frame, this, parent, pargfid, bname,
+ gfid_req, xdata);
+ if (ret)
+ goto out;
+ }
- ret = 0;
+ ret = 0;
out:
- if (parent)
- inode_unref (parent);
- if (frame)
- AFR_STACK_DESTROY (frame);
+ if (parent)
+ inode_unref(parent);
+ if (frame)
+ AFR_STACK_DESTROY(frame);
- return ret;
+ return ret;
}
diff --git a/xlators/cluster/afr/src/afr-self-heal.h b/xlators/cluster/afr/src/afr-self-heal.h
index 74e852aa038..48e6dbcfb18 100644
--- a/xlators/cluster/afr/src/afr-self-heal.h
+++ b/xlators/cluster/afr/src/afr-self-heal.h
@@ -8,259 +8,370 @@
cases as published by the Free Software Foundation.
*/
-
#ifndef _AFR_SELFHEAL_H
#define _AFR_SELFHEAL_H
-#define AFR_SH_MIN_PARTICIPANTS 2
-
/* Perform fop on all UP subvolumes and wait for all callbacks to return */
-#define AFR_ONALL(frame, rfn, fop, args ...) do { \
- afr_local_t *__local = frame->local; \
- afr_private_t *__priv = frame->this->private; \
- int __i = 0, __count = 0; \
- \
- afr_local_replies_wipe (__local, __priv); \
- \
- for (__i = 0; __i < __priv->child_count; __i++) { \
- if (!__priv->child_up[__i]) continue; \
- STACK_WIND_COOKIE (frame, rfn, (void *)(long) __i, \
- __priv->children[__i], \
- __priv->children[__i]->fops->fop, args); \
- __count++; \
- } \
- syncbarrier_wait (&__local->barrier, __count); \
- } while (0)
-
+#define AFR_ONALL(frame, rfn, fop, args...) \
+ do { \
+ afr_local_t *__local = frame->local; \
+ afr_private_t *__priv = frame->this->private; \
+ int __i = 0, __count = 0; \
+ unsigned char *__child_up = alloca(__priv->child_count); \
+ \
+ memcpy(__child_up, __priv->child_up, \
+ sizeof(*__child_up) * __priv->child_count); \
+ __count = AFR_COUNT(__child_up, __priv->child_count); \
+ \
+ __local->barrier.waitfor = __count; \
+ afr_local_replies_wipe(__local, __priv); \
+ \
+ for (__i = 0; __i < __priv->child_count; __i++) { \
+ if (!__child_up[__i]) \
+ continue; \
+ STACK_WIND_COOKIE(frame, rfn, (void *)(long)__i, \
+ __priv->children[__i], \
+ __priv->children[__i]->fops->fop, args); \
+ } \
+ syncbarrier_wait(&__local->barrier, __count); \
+ } while (0)
/* Perform fop on all subvolumes represented by list[] array and wait
for all callbacks to return */
-#define AFR_ONLIST(list, frame, rfn, fop, args ...) do { \
- afr_local_t *__local = frame->local; \
- afr_private_t *__priv = frame->this->private; \
- int __i = 0, __count = 0; \
- \
- afr_local_replies_wipe (__local, __priv); \
- \
- for (__i = 0; __i < __priv->child_count; __i++) { \
- if (!list[__i]) continue; \
- STACK_WIND_COOKIE (frame, rfn, (void *)(long) __i, \
- __priv->children[__i], \
- __priv->children[__i]->fops->fop, args); \
- __count++; \
- } \
- syncbarrier_wait (&__local->barrier, __count); \
- } while (0)
-
-
-#define AFR_SEQ(frame, rfn, fop, args ...) do { \
- afr_local_t *__local = frame->local; \
- afr_private_t *__priv = frame->this->private; \
- int __i = 0; \
- \
- afr_local_replies_wipe (__local, __priv); \
- \
- for (__i = 0; __i < __priv->child_count; __i++) { \
- if (!__priv->child_up[__i]) continue; \
- STACK_WIND_COOKIE (frame, rfn, (void *)(long) __i, \
- __priv->children[__i], \
- __priv->children[__i]->fops->fop, args); \
- syncbarrier_wait (&__local->barrier, 1); \
- } \
- } while (0)
-
-
-#define ALLOC_MATRIX(n, type) ({type **__ptr = NULL; \
- int __i; \
- __ptr = alloca0 (n * sizeof(type *)); \
- for (__i = 0; __i < n; __i++) __ptr[__i] = alloca0 (n * sizeof(type)); \
- __ptr;})
-
+#define AFR_ONLIST(list, frame, rfn, fop, args...) \
+ do { \
+ afr_local_t *__local = frame->local; \
+ afr_private_t *__priv = frame->this->private; \
+ int __i = 0; \
+ int __count = 0; \
+ unsigned char *__list = alloca(__priv->child_count); \
+ \
+ memcpy(__list, list, sizeof(*__list) * __priv->child_count); \
+ __count = AFR_COUNT(__list, __priv->child_count); \
+ __local->barrier.waitfor = __count; \
+ afr_local_replies_wipe(__local, __priv); \
+ \
+ for (__i = 0; __i < __priv->child_count; __i++) { \
+ if (!__list[__i]) \
+ continue; \
+ STACK_WIND_COOKIE(frame, rfn, (void *)(long)__i, \
+ __priv->children[__i], \
+ __priv->children[__i]->fops->fop, args); \
+ } \
+ syncbarrier_wait(&__local->barrier, __count); \
+ } while (0)
+
+#define AFR_SEQ(frame, rfn, fop, args...) \
+ do { \
+ afr_local_t *__local = frame->local; \
+ afr_private_t *__priv = frame->this->private; \
+ int __i = 0; \
+ \
+ afr_local_replies_wipe(__local, __priv); \
+ \
+ for (__i = 0; __i < __priv->child_count; __i++) { \
+ if (!__priv->child_up[__i]) \
+ continue; \
+ STACK_WIND_COOKIE(frame, rfn, (void *)(long)__i, \
+ __priv->children[__i], \
+ __priv->children[__i]->fops->fop, args); \
+ syncbarrier_wait(&__local->barrier, 1); \
+ } \
+ } while (0)
+
+#define ALLOC_MATRIX(n, type) \
+ ({ \
+ int __i; \
+ type **__ptr = alloca(n * sizeof(type *)); \
+ \
+ for (__i = 0; __i < n; __i++) \
+ __ptr[__i] = alloca0(n * sizeof(type)); \
+ __ptr; \
+ })
+
+#define IA_EQUAL(f, s, field) \
+ (memcmp(&(f.ia_##field), &(s.ia_##field), sizeof(s.ia_##field)) == 0)
+
+#define SBRAIN_HEAL_NO_GO_MSG \
+ "Failed to obtain replies from all bricks of " \
+ "the replica (are they up?). Cannot resolve split-brain."
+#define SFILE_NOT_IN_SPLIT_BRAIN "File not in split-brain"
+#define SNO_BIGGER_FILE "No bigger file"
+#define SNO_DIFF_IN_MTIME "No difference in mtime"
+#define SUSE_SOURCE_BRICK_TO_HEAL \
+ "Use source-brick option to heal metadata" \
+ " split-brain"
+#define SINVALID_BRICK_NAME "Invalid brick name"
+#define SBRICK_IS_NOT_UP "Brick is not up"
+#define SBRICK_NOT_CONNECTED "Brick is not connected"
+#define SLESS_THAN2_BRICKS_in_REP "< 2 bricks in replica are up"
+#define SBRICK_IS_REMOTE "Brick is remote"
+#define SSTARTED_SELF_HEAL "Started self-heal"
+#define SOP_NOT_SUPPORTED "Operation Not Supported"
+#define SFILE_NOT_UNDER_DATA \
+ "The file is not under data or metadata " \
+ "split-brain"
+#define SFILE_NOT_IN_SPLIT_BRAIN "File not in split-brain"
+#define SALL_BRICKS_UP_TO_RESOLVE \
+ "All the bricks should be up to resolve the" \
+ " gfid split brain"
+#define SERROR_GETTING_SRC_BRICK "Error getting the source brick"
+int
+afr_selfheal(xlator_t *this, uuid_t gfid);
-#define IA_EQUAL(f,s,field) (memcmp (&(f.ia_##field), &(s.ia_##field), sizeof (s.ia_##field)) == 0)
+gf_boolean_t
+afr_throttled_selfheal(call_frame_t *frame, xlator_t *this);
+int
+afr_selfheal_name(xlator_t *this, uuid_t gfid, const char *name, void *gfid_req,
+ dict_t *xdata);
int
-afr_selfheal (xlator_t *this, uuid_t gfid);
+afr_selfheal_data(call_frame_t *frame, xlator_t *this, fd_t *fd);
int
-afr_selfheal_name (xlator_t *this, uuid_t gfid, const char *name,
- void *gfid_req);
+afr_selfheal_metadata(call_frame_t *frame, xlator_t *this, inode_t *inode);
int
-afr_selfheal_data (call_frame_t *frame, xlator_t *this, inode_t *inode);
+afr_selfheal_entry(call_frame_t *frame, xlator_t *this, inode_t *inode);
int
-afr_selfheal_metadata (call_frame_t *frame, xlator_t *this, inode_t *inode);
+afr_lookup_and_heal_gfid(xlator_t *this, inode_t *parent, const char *name,
+ inode_t *inode, struct afr_reply *replies, int source,
+ unsigned char *sources, void *gfid, int *gfid_idx);
int
-afr_selfheal_entry (call_frame_t *frame, xlator_t *this, inode_t *inode);
+afr_selfheal_inodelk(call_frame_t *frame, xlator_t *this, inode_t *inode,
+ char *dom, off_t off, size_t size,
+ unsigned char *locked_on);
+int
+afr_selfheal_tryinodelk(call_frame_t *frame, xlator_t *this, inode_t *inode,
+ char *dom, off_t off, size_t size,
+ unsigned char *locked_on);
int
-afr_selfheal_inodelk (call_frame_t *frame, xlator_t *this, inode_t *inode,
- char *dom, off_t off, size_t size,
- unsigned char *locked_on);
+afr_selfheal_tie_breaker_inodelk(call_frame_t *frame, xlator_t *this,
+ inode_t *inode, char *dom, off_t off,
+ size_t size, unsigned char *locked_on);
int
-afr_selfheal_tryinodelk (call_frame_t *frame, xlator_t *this, inode_t *inode,
- char *dom, off_t off, size_t size,
- unsigned char *locked_on);
+afr_selfheal_uninodelk(call_frame_t *frame, xlator_t *this, inode_t *inode,
+ char *dom, off_t off, size_t size,
+ const unsigned char *locked_on);
int
-afr_selfheal_uninodelk (call_frame_t *frame, xlator_t *this, inode_t *inode,
- char *dom, off_t off, size_t size,
- const unsigned char *locked_on);
+afr_selfheal_entrylk(call_frame_t *frame, xlator_t *this, inode_t *inode,
+ char *dom, const char *name, unsigned char *locked_on);
int
-afr_selfheal_entrylk (call_frame_t *frame, xlator_t *this, inode_t *inode,
- char *dom, const char *name, unsigned char *locked_on);
+afr_selfheal_tryentrylk(call_frame_t *frame, xlator_t *this, inode_t *inode,
+ char *dom, const char *name, unsigned char *locked_on);
int
-afr_selfheal_tryentrylk (call_frame_t *frame, xlator_t *this, inode_t *inode,
- char *dom, const char *name, unsigned char *locked_on);
+afr_selfheal_tie_breaker_entrylk(call_frame_t *frame, xlator_t *this,
+ inode_t *inode, char *dom, const char *name,
+ unsigned char *locked_on);
int
-afr_selfheal_unentrylk (call_frame_t *frame, xlator_t *this, inode_t *inode,
- char *dom, const char *name, unsigned char *locked_on);
+afr_selfheal_unentrylk(call_frame_t *frame, xlator_t *this, inode_t *inode,
+ char *dom, const char *name, unsigned char *locked_on,
+ dict_t *xdata);
int
-afr_selfheal_unlocked_discover (call_frame_t *frame, inode_t *inode,
- uuid_t gfid, struct afr_reply *replies);
+afr_selfheal_unlocked_discover(call_frame_t *frame, inode_t *inode, uuid_t gfid,
+ struct afr_reply *replies);
+int
+afr_selfheal_unlocked_discover_on(call_frame_t *frame, inode_t *inode,
+ uuid_t gfid, struct afr_reply *replies,
+ unsigned char *discover_on, dict_t *dict);
inode_t *
-afr_selfheal_unlocked_lookup_on (call_frame_t *frame, inode_t *parent,
- const char *name, struct afr_reply *replies,
- unsigned char *lookup_on, dict_t *xattr);
+afr_selfheal_unlocked_lookup_on(call_frame_t *frame, inode_t *parent,
+ const char *name, struct afr_reply *replies,
+ unsigned char *lookup_on, dict_t *xattr);
int
-afr_selfheal_find_direction (call_frame_t *frame, xlator_t *this,
- struct afr_reply *replies,
- afr_transaction_type type,
- unsigned char *locked_on, unsigned char *sources,
- unsigned char *sinks, uint64_t *witness,
- gf_boolean_t *flag);
+afr_selfheal_find_direction(call_frame_t *frame, xlator_t *this,
+ struct afr_reply *replies,
+ afr_transaction_type type, unsigned char *locked_on,
+ unsigned char *sources, unsigned char *sinks,
+ uint64_t *witness, unsigned char *flag);
int
-afr_selfheal_fill_matrix (xlator_t *this, int **matrix, int subvol, int idx,
- dict_t *xdata);
+afr_selfheal_fill_matrix(xlator_t *this, int **matrix, int subvol, int idx,
+ dict_t *xdata);
int
-afr_selfheal_extract_xattr (xlator_t *this, struct afr_reply *replies,
- afr_transaction_type type, int *dirty, int **matrix);
+afr_selfheal_extract_xattr(xlator_t *this, struct afr_reply *replies,
+ afr_transaction_type type, int *dirty, int **matrix);
int
-afr_selfheal_undo_pending (call_frame_t *frame, xlator_t *this, inode_t *inode,
- unsigned char *sources, unsigned char *sinks,
- unsigned char *healed_sinks, afr_transaction_type type,
- struct afr_reply *replies, unsigned char *locked_on);
+afr_sh_generic_fop_cbk(call_frame_t *frame, void *cookie, xlator_t *this,
+ int op_ret, int op_errno, struct iatt *pre,
+ struct iatt *post, dict_t *xdata);
+
+int
+afr_selfheal_restore_time(call_frame_t *frame, xlator_t *this, inode_t *inode,
+ int source, unsigned char *healed_sinks,
+ struct afr_reply *replies);
+int
+afr_selfheal_undo_pending(call_frame_t *frame, xlator_t *this, inode_t *inode,
+ unsigned char *sources, unsigned char *sinks,
+ unsigned char *healed_sinks,
+ unsigned char *undid_pending,
+ afr_transaction_type type, struct afr_reply *replies,
+ unsigned char *locked_on);
int
-afr_selfheal_recreate_entry (xlator_t *this, int dst, int source, inode_t *dir,
- const char *name, inode_t *inode,
- struct afr_reply *replies,
- unsigned char *newentry);
+afr_selfheal_recreate_entry(call_frame_t *frame, int dst, int source,
+ unsigned char *sources, inode_t *dir,
+ const char *name, inode_t *inode,
+ struct afr_reply *replies);
int
-afr_selfheal_post_op (call_frame_t *frame, xlator_t *this, inode_t *inode,
- int subvol, dict_t *xattr);
+afr_selfheal_post_op(call_frame_t *frame, xlator_t *this, inode_t *inode,
+ int subvol, dict_t *xattr, dict_t *xdata);
call_frame_t *
-afr_frame_create (xlator_t *this);
+afr_frame_create(xlator_t *this, int32_t *op_errno);
inode_t *
-afr_inode_find (xlator_t *this, uuid_t gfid);
+afr_inode_find(xlator_t *this, uuid_t gfid);
int
-afr_selfheal_discover_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
- int op_ret, int op_errno, inode_t *inode,
- struct iatt *buf, dict_t *xdata,
- struct iatt *parbuf);
+afr_selfheal_discover_cbk(call_frame_t *frame, void *cookie, xlator_t *this,
+ int op_ret, int op_errno, inode_t *inode,
+ struct iatt *buf, dict_t *xdata, struct iatt *parbuf);
+void
+afr_reply_copy(struct afr_reply *dst, struct afr_reply *src);
void
-afr_replies_copy (struct afr_reply *dst, struct afr_reply *src, int count);
+afr_replies_copy(struct afr_reply *dst, struct afr_reply *src, int count);
int
-afr_selfheal_newentry_mark (call_frame_t *frame, xlator_t *this, inode_t *inode,
- int source, struct afr_reply *replies,
- unsigned char *sources, unsigned char *newentry);
-
-inode_t*
-afr_inode_link (inode_t *inode, struct iatt *iatt);
+afr_selfheal_newentry_mark(call_frame_t *frame, xlator_t *this, inode_t *inode,
+ int source, struct afr_reply *replies,
+ unsigned char *sources, unsigned char *newentry);
unsigned int
-afr_success_count (struct afr_reply *replies, unsigned int count);
+afr_success_count(struct afr_reply *replies, unsigned int count);
void
-afr_log_selfheal (uuid_t gfid, xlator_t *this, int ret, char *type,
- int source, unsigned char *healed_sinks);
+afr_log_selfheal(uuid_t gfid, xlator_t *this, int ret, char *type, int source,
+ unsigned char *sources, unsigned char *healed_sinks);
void
-afr_mark_largest_file_as_source (xlator_t *this, unsigned char *sources,
- struct afr_reply *replies);
+afr_mark_largest_file_as_source(xlator_t *this, unsigned char *sources,
+ struct afr_reply *replies);
void
-afr_mark_active_sinks (xlator_t *this, unsigned char *sources,
- unsigned char *locked_on, unsigned char *sinks);
+afr_mark_active_sinks(xlator_t *this, unsigned char *sources,
+ unsigned char *locked_on, unsigned char *sinks);
+
+gf_boolean_t
+afr_dict_contains_heal_op(call_frame_t *frame);
gf_boolean_t
-afr_dict_contains_heal_op (call_frame_t *frame);
+afr_can_decide_split_brain_source_sinks(struct afr_reply *replies,
+ int child_count);
+int
+afr_mark_split_brain_source_sinks(
+ call_frame_t *frame, xlator_t *this, inode_t *inode, unsigned char *sources,
+ unsigned char *sinks, unsigned char *healed_sinks, unsigned char *locked_on,
+ struct afr_reply *replies, afr_transaction_type type);
+
+int
+afr_sh_get_fav_by_policy(xlator_t *this, struct afr_reply *replies,
+ inode_t *inode, char **policy_str);
int
-afr_mark_split_brain_source_sinks (call_frame_t *frame, xlator_t *this,
- unsigned char *sources,
- unsigned char *sinks,
- unsigned char *healed_sinks,
- unsigned char *locked_on,
- struct afr_reply *replies,
- afr_transaction_type type);
+_afr_fav_child_reset_sink_xattrs(call_frame_t *frame, xlator_t *this,
+ inode_t *inode, int source,
+ unsigned char *healed_sinks,
+ unsigned char *undid_pending,
+ afr_transaction_type type,
+ unsigned char *locked_on,
+ struct afr_reply *replies);
int
-afr_get_child_index_from_name (xlator_t *this, char *name);
+afr_get_child_index_from_name(xlator_t *this, char *name);
gf_boolean_t
-afr_does_witness_exist (xlator_t *this, uint64_t *witness);
+afr_does_witness_exist(xlator_t *this, uint64_t *witness);
int
-__afr_selfheal_data_prepare (call_frame_t *frame, xlator_t *this,
+__afr_selfheal_data_prepare(call_frame_t *frame, xlator_t *this, inode_t *inode,
+ unsigned char *locked_on, unsigned char *sources,
+ unsigned char *sinks, unsigned char *healed_sinks,
+ unsigned char *undid_pending,
+ struct afr_reply *replies, unsigned char *flag);
+
+int
+__afr_selfheal_metadata_prepare(call_frame_t *frame, xlator_t *this,
+ inode_t *inode, unsigned char *locked_on,
+ unsigned char *sources, unsigned char *sinks,
+ unsigned char *healed_sinks,
+ unsigned char *undid_pending,
+ struct afr_reply *replies, unsigned char *flag);
+int
+__afr_selfheal_entry_prepare(call_frame_t *frame, xlator_t *this,
inode_t *inode, unsigned char *locked_on,
- unsigned char *sources,
- unsigned char *sinks, unsigned char *healed_sinks,
- struct afr_reply *replies,
- gf_boolean_t *flag);
+ unsigned char *sources, unsigned char *sinks,
+ unsigned char *healed_sinks,
+ struct afr_reply *replies, int *source_p,
+ unsigned char *flag);
int
-__afr_selfheal_metadata_prepare (call_frame_t *frame, xlator_t *this,
- inode_t *inode, unsigned char *locked_on,
- unsigned char *sources,
- unsigned char *sinks,
- unsigned char *healed_sinks,
- struct afr_reply *replies,
- gf_boolean_t *flag);
+afr_selfheal_unlocked_inspect(call_frame_t *frame, xlator_t *this, uuid_t gfid,
+ inode_t **link_inode, gf_boolean_t *data_selfheal,
+ gf_boolean_t *metadata_selfheal,
+ gf_boolean_t *entry_selfheal,
+ struct afr_reply *replies);
+
int
-__afr_selfheal_entry_prepare (call_frame_t *frame, xlator_t *this,
- inode_t *inode, unsigned char *locked_on,
- unsigned char *sources,
- unsigned char *sinks,
- unsigned char *healed_sinks,
- struct afr_reply *replies, int *source_p,
- gf_boolean_t *flag);
+afr_selfheal_do(call_frame_t *frame, xlator_t *this, uuid_t gfid);
int
-afr_selfheal_unlocked_inspect (call_frame_t *frame, xlator_t *this,
- uuid_t gfid, inode_t **link_inode,
- gf_boolean_t *data_selfheal,
- gf_boolean_t *metadata_selfheal,
- gf_boolean_t *entry_selfheal);
+afr_selfheal_lock_cbk(call_frame_t *frame, void *cookie, xlator_t *this,
+ int op_ret, int op_errno, dict_t *xdata);
int
-afr_selfheal_do (call_frame_t *frame, xlator_t *this, uuid_t gfid);
+afr_locked_fill(call_frame_t *frame, xlator_t *this, unsigned char *locked_on);
+int
+afr_choose_source_by_policy(afr_private_t *priv, unsigned char *sources,
+ afr_transaction_type type);
int
-afr_selfheal_lock_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
- int op_ret, int op_errno, dict_t *xdata);
+afr_selfheal_metadata_by_stbuf(xlator_t *this, struct iatt *stbuf);
int
-afr_locked_fill (call_frame_t *frame, xlator_t *this,
- unsigned char *locked_on);
+afr_sh_fav_by_size(xlator_t *this, struct afr_reply *replies, inode_t *inode);
+int
+afr_sh_fav_by_mtime(xlator_t *this, struct afr_reply *replies, inode_t *inode);
+int
+afr_sh_fav_by_ctime(xlator_t *this, struct afr_reply *replies, inode_t *inode);
+
+int
+afr_gfid_split_brain_source(xlator_t *this, struct afr_reply *replies,
+ inode_t *inode, uuid_t pargfid, const char *bname,
+ int src_idx, int child_idx,
+ unsigned char *locked_on, int *src, dict_t *xdata);
+int
+afr_mark_source_sinks_if_file_empty(xlator_t *this, unsigned char *sources,
+ unsigned char *sinks,
+ unsigned char *healed_sinks,
+ unsigned char *locked_on,
+ struct afr_reply *replies,
+ afr_transaction_type type);
+
+gf_boolean_t
+afr_is_file_empty_on_all_children(afr_private_t *priv,
+ struct afr_reply *replies);
+
+int
+afr_selfheal_entry_delete(xlator_t *this, inode_t *dir, const char *name,
+ inode_t *inode, int child, struct afr_reply *replies);
+int
+afr_anon_inode_create(xlator_t *this, int child, inode_t **linked_inode);
#endif /* !_AFR_SELFHEAL_H */
diff --git a/xlators/cluster/afr/src/afr-self-heald.c b/xlators/cluster/afr/src/afr-self-heald.c
index c539e117607..109fd4b7421 100644
--- a/xlators/cluster/afr/src/afr-self-heald.c
+++ b/xlators/cluster/afr/src/afr-self-heald.c
@@ -8,1157 +8,1709 @@
cases as published by the Free Software Foundation.
*/
-
#include "afr.h"
#include "afr-self-heal.h"
#include "afr-self-heald.h"
#include "protocol-common.h"
-#include "syncop-utils.h"
+#include <glusterfs/syncop-utils.h>
#include "afr-messages.h"
-
-#define SHD_INODE_LRU_LIMIT 2048
-#define AFR_EH_SPLIT_BRAIN_LIMIT 1024
-#define AFR_STATISTICS_HISTORY_SIZE 50
-
-
-#define ASSERT_LOCAL(this, healer) \
- if (!afr_shd_is_subvol_local(this, healer->subvol)) { \
- healer->local = _gf_false; \
- if (safe_break (healer)) { \
- break; \
- } else { \
- continue; \
- } \
- } else { \
- healer->local = _gf_true; \
- }
-
-
-#define NTH_INDEX_HEALER(this, n) &((((afr_private_t *)this->private))->shd.index_healers[n])
-#define NTH_FULL_HEALER(this, n) &((((afr_private_t *)this->private))->shd.full_healers[n])
+#include <glusterfs/byte-order.h>
+
+#define AFR_EH_SPLIT_BRAIN_LIMIT 1024
+#define AFR_STATISTICS_HISTORY_SIZE 50
+
+#define ASSERT_LOCAL(this, healer) \
+ if (!afr_shd_is_subvol_local(this, healer->subvol)) { \
+ healer->local = _gf_false; \
+ if (safe_break(healer)) { \
+ break; \
+ } else { \
+ continue; \
+ } \
+ } else { \
+ healer->local = _gf_true; \
+ }
+
+#define NTH_INDEX_HEALER(this, n) \
+ &((((afr_private_t *)this->private))->shd.index_healers[n])
+#define NTH_FULL_HEALER(this, n) \
+ &((((afr_private_t *)this->private))->shd.full_healers[n])
char *
-afr_subvol_name (xlator_t *this, int subvol)
+afr_subvol_name(xlator_t *this, int subvol)
{
- afr_private_t *priv = NULL;
+ afr_private_t *priv = NULL;
- priv = this->private;
- if (subvol < 0 || subvol > priv->child_count)
- return NULL;
+ priv = this->private;
+ if (subvol < 0 || subvol > priv->child_count)
+ return NULL;
- return priv->children[subvol]->name;
+ return priv->children[subvol]->name;
}
-
void
-afr_destroy_crawl_event_data (void *data)
+afr_destroy_crawl_event_data(void *data)
{
- return;
+ return;
}
-
void
-afr_destroy_shd_event_data (void *data)
+afr_destroy_shd_event_data(void *data)
{
- shd_event_t *shd_event = data;
-
- if (!shd_event)
- return;
- GF_FREE (shd_event->path);
+ shd_event_t *shd_event = data;
+ if (!shd_event)
return;
-}
+ GF_FREE(shd_event->path);
+ return;
+}
gf_boolean_t
-afr_shd_is_subvol_local (xlator_t *this, int subvol)
+afr_shd_is_subvol_local(xlator_t *this, int subvol)
{
- afr_private_t *priv = NULL;
- gf_boolean_t is_local = _gf_false;
- loc_t loc = {0, };
-
- loc.inode = this->itable->root;
- gf_uuid_copy (loc.gfid, loc.inode->gfid);
- priv = this->private;
- syncop_is_subvol_local(priv->children[subvol], &loc, &is_local);
- return is_local;
+ afr_private_t *priv = NULL;
+ gf_boolean_t is_local = _gf_false;
+ loc_t loc = {
+ 0,
+ };
+
+ loc.inode = this->itable->root;
+ gf_uuid_copy(loc.gfid, loc.inode->gfid);
+ priv = this->private;
+ syncop_is_subvol_local(priv->children[subvol], &loc, &is_local);
+ return is_local;
}
-
int
-__afr_shd_healer_wait (struct subvol_healer *healer)
+__afr_shd_healer_wait(struct subvol_healer *healer)
{
- afr_private_t *priv = NULL;
- struct timespec wait_till = {0, };
- int ret = 0;
+ afr_private_t *priv = NULL;
+ struct timespec wait_till = {
+ 0,
+ };
+ int ret = 0;
- priv = healer->this->private;
+ priv = healer->this->private;
disabled_loop:
- wait_till.tv_sec = time (NULL) + priv->shd.timeout;
+ wait_till.tv_sec = gf_time() + priv->shd.timeout;
- while (!healer->rerun) {
- ret = pthread_cond_timedwait (&healer->cond,
- &healer->mutex,
- &wait_till);
- if (ret == ETIMEDOUT)
- break;
- }
+ while (!healer->rerun) {
+ ret = pthread_cond_timedwait(&healer->cond, &healer->mutex, &wait_till);
+ if (ret == ETIMEDOUT)
+ break;
+ }
- ret = healer->rerun;
- healer->rerun = 0;
+ ret = healer->rerun;
+ healer->rerun = 0;
- if (!priv->shd.enabled)
- goto disabled_loop;
+ if (!priv->shd.enabled)
+ goto disabled_loop;
- return ret;
+ return ret;
}
-
int
-afr_shd_healer_wait (struct subvol_healer *healer)
+afr_shd_healer_wait(struct subvol_healer *healer)
{
- int ret = 0;
+ int ret = 0;
- pthread_mutex_lock (&healer->mutex);
- {
- ret = __afr_shd_healer_wait (healer);
- }
- pthread_mutex_unlock (&healer->mutex);
+ pthread_mutex_lock(&healer->mutex);
+ {
+ ret = __afr_shd_healer_wait(healer);
+ }
+ pthread_mutex_unlock(&healer->mutex);
- return ret;
+ return ret;
}
-
gf_boolean_t
-safe_break (struct subvol_healer *healer)
+safe_break(struct subvol_healer *healer)
{
- gf_boolean_t ret = _gf_false;
+ gf_boolean_t ret = _gf_false;
- pthread_mutex_lock (&healer->mutex);
- {
- if (healer->rerun)
- goto unlock;
+ pthread_mutex_lock(&healer->mutex);
+ {
+ if (healer->rerun)
+ goto unlock;
- healer->running = _gf_false;
- ret = _gf_true;
- }
+ healer->running = _gf_false;
+ ret = _gf_true;
+ }
unlock:
- pthread_mutex_unlock (&healer->mutex);
+ pthread_mutex_unlock(&healer->mutex);
- return ret;
+ return ret;
}
-
inode_t *
-afr_shd_inode_find (xlator_t *this, xlator_t *subvol, uuid_t gfid)
+afr_shd_inode_find(xlator_t *this, xlator_t *subvol, uuid_t gfid)
{
- inode_t *inode = NULL;
- int ret = 0;
- loc_t loc = {0, };
- struct iatt iatt = {0, };
-
- inode = inode_find (this->itable, gfid);
- if (inode) {
- inode_lookup (inode);
- goto out;
- }
-
- loc.inode = inode_new (this->itable);
- if (!loc.inode)
- goto out;
- gf_uuid_copy (loc.gfid, gfid);
-
- ret = syncop_lookup (subvol, &loc, &iatt, NULL, NULL, NULL);
- if (ret < 0)
- goto out;
-
- inode = inode_link (loc.inode, NULL, NULL, &iatt);
- if (inode)
- inode_lookup (inode);
+ int ret = 0;
+ uint64_t val = IA_INVAL;
+ dict_t *xdata = NULL;
+ dict_t *rsp_dict = NULL;
+ inode_t *inode = NULL;
+
+ xdata = dict_new();
+ if (!xdata)
+ goto out;
+
+ ret = dict_set_int8(xdata, GF_INDEX_IA_TYPE_GET_REQ, 1);
+ if (ret)
+ goto out;
+
+ ret = syncop_inode_find(this, subvol, gfid, &inode, xdata, &rsp_dict);
+ if (ret < 0)
+ goto out;
+
+ if (rsp_dict) {
+ ret = dict_get_uint64(rsp_dict, GF_INDEX_IA_TYPE_GET_RSP, &val);
+ if (ret)
+ goto out;
+ }
+ ret = inode_ctx_set2(inode, subvol, 0, &val);
out:
- loc_wipe (&loc);
- return inode;
+ if (ret && inode) {
+ inode_unref(inode);
+ inode = NULL;
+ }
+ if (xdata)
+ dict_unref(xdata);
+ if (rsp_dict)
+ dict_unref(rsp_dict);
+ return inode;
}
-inode_t*
-afr_shd_index_inode (xlator_t *this, xlator_t *subvol, char *vgfid)
+inode_t *
+afr_shd_index_inode(xlator_t *this, xlator_t *subvol, char *vgfid)
{
- loc_t rootloc = {0, };
- inode_t *inode = NULL;
- int ret = 0;
- dict_t *xattr = NULL;
- void *index_gfid = NULL;
+ loc_t rootloc = {
+ 0,
+ };
+ inode_t *inode = NULL;
+ int ret = 0;
+ dict_t *xattr = NULL;
+ void *index_gfid = NULL;
- rootloc.inode = inode_ref (this->itable->root);
- gf_uuid_copy (rootloc.gfid, rootloc.inode->gfid);
+ rootloc.inode = inode_ref(this->itable->root);
+ gf_uuid_copy(rootloc.gfid, rootloc.inode->gfid);
- ret = syncop_getxattr (subvol, &rootloc, &xattr,
- vgfid, NULL, NULL);
- if (ret || !xattr) {
- errno = -ret;
- goto out;
- }
+ ret = syncop_getxattr(subvol, &rootloc, &xattr, vgfid, NULL, NULL);
+ if (ret || !xattr) {
+ errno = -ret;
+ goto out;
+ }
- ret = dict_get_ptr (xattr, vgfid, &index_gfid);
- if (ret)
- goto out;
+ ret = dict_get_ptr(xattr, vgfid, &index_gfid);
+ if (ret)
+ goto out;
- gf_msg_debug (this->name, 0, "%s dir gfid for %s: %s",
- vgfid, subvol->name, uuid_utoa (index_gfid));
+ gf_msg_debug(this->name, 0, "%s dir gfid for %s: %s", vgfid, subvol->name,
+ uuid_utoa(index_gfid));
- inode = afr_shd_inode_find (this, subvol, index_gfid);
+ inode = afr_shd_inode_find(this, subvol, index_gfid);
out:
- loc_wipe (&rootloc);
+ loc_wipe(&rootloc);
- if (xattr)
- dict_unref (xattr);
+ if (xattr)
+ dict_unref(xattr);
- return inode;
+ return inode;
}
int
-afr_shd_index_purge (xlator_t *subvol, inode_t *inode, char *name)
+afr_shd_entry_purge(xlator_t *subvol, inode_t *inode, char *name,
+ ia_type_t type)
{
- loc_t loc = {0, };
- int ret = 0;
+ int ret = 0;
+ loc_t loc = {
+ 0,
+ };
- loc.parent = inode_ref (inode);
- loc.name = name;
+ loc.parent = inode_ref(inode);
+ loc.name = name;
- ret = syncop_unlink (subvol, &loc, NULL, NULL);
+ if (IA_ISDIR(type))
+ ret = syncop_rmdir(subvol, &loc, 1, NULL, NULL);
+ else
+ ret = syncop_unlink(subvol, &loc, NULL, NULL);
- loc_wipe (&loc);
- return ret;
+ loc_wipe(&loc);
+ return ret;
}
void
-afr_shd_zero_xattrop (xlator_t *this, uuid_t gfid)
+afr_shd_zero_xattrop(xlator_t *this, uuid_t gfid)
{
-
- call_frame_t *frame = NULL;
- inode_t *inode = NULL;
- afr_private_t *priv = NULL;
- dict_t *xattr = NULL;
- int ret = 0;
- int i = 0;
- int raw[AFR_NUM_CHANGE_LOGS] = {0};
-
- priv = this->private;
- frame = afr_frame_create (this);
- if (!frame)
- goto out;
- inode = afr_inode_find (this, gfid);
- if (!inode)
- goto out;
- xattr = dict_new();
- if (!xattr)
- goto out;
- ret = dict_set_static_bin (xattr, AFR_DIRTY, raw,
- sizeof(int) * AFR_NUM_CHANGE_LOGS);
+ call_frame_t *frame = NULL;
+ inode_t *inode = NULL;
+ afr_private_t *priv = NULL;
+ dict_t *xattr = NULL;
+ int ret = 0;
+ int i = 0;
+ int raw[AFR_NUM_CHANGE_LOGS] = {0};
+
+ priv = this->private;
+ frame = afr_frame_create(this, NULL);
+ if (!frame)
+ goto out;
+ inode = afr_inode_find(this, gfid);
+ if (!inode)
+ goto out;
+ xattr = dict_new();
+ if (!xattr)
+ goto out;
+ ret = dict_set_static_bin(xattr, AFR_DIRTY, raw,
+ sizeof(int) * AFR_NUM_CHANGE_LOGS);
+ if (ret)
+ goto out;
+ for (i = 0; i < priv->child_count; i++) {
+ ret = dict_set_static_bin(xattr, priv->pending_key[i], raw,
+ sizeof(int) * AFR_NUM_CHANGE_LOGS);
if (ret)
- goto out;
- for (i = 0; i < priv->child_count; i++) {
- ret = dict_set_static_bin (xattr, priv->pending_key[i], raw,
- sizeof(int) * AFR_NUM_CHANGE_LOGS);
- if (ret)
- goto out;
- }
+ goto out;
+ }
- /*Send xattrop to all bricks. Doing a lookup to see if bricks are up or
- * has valid repies for this gfid seems a bit of an overkill.*/
- for (i = 0; i < priv->child_count; i++)
- afr_selfheal_post_op (frame, this, inode, i, xattr);
+ /*Send xattrop to all bricks. Doing a lookup to see if bricks are up or
+ * has valid repies for this gfid seems a bit of an overkill.*/
+ for (i = 0; i < priv->child_count; i++)
+ afr_selfheal_post_op(frame, this, inode, i, xattr, NULL);
out:
- if (frame)
- AFR_STACK_DESTROY (frame);
- if (inode)
- inode_unref (inode);
- if (xattr)
- dict_unref (xattr);
- return;
+ if (frame)
+ AFR_STACK_DESTROY(frame);
+ if (inode)
+ inode_unref(inode);
+ if (xattr)
+ dict_unref(xattr);
+ return;
}
int
-afr_shd_selfheal_name (struct subvol_healer *healer, int child, uuid_t parent,
- const char *bname)
+afr_shd_selfheal_name(struct subvol_healer *healer, int child, uuid_t parent,
+ const char *bname)
{
- int ret = -1;
+ int ret = -1;
- ret = afr_selfheal_name (THIS, parent, bname, NULL);
+ ret = afr_selfheal_name(THIS, parent, bname, NULL, NULL);
- return ret;
+ return ret;
}
int
-afr_shd_selfheal (struct subvol_healer *healer, int child, uuid_t gfid)
+afr_shd_selfheal(struct subvol_healer *healer, int child, uuid_t gfid)
{
- int ret = 0;
- eh_t *eh = NULL;
- afr_private_t *priv = NULL;
- afr_self_heald_t *shd = NULL;
- shd_event_t *shd_event = NULL;
- char *path = NULL;
- xlator_t *subvol = NULL;
- xlator_t *this = NULL;
- crawl_event_t *crawl_event = NULL;
-
- this = healer->this;
- priv = this->private;
- shd = &priv->shd;
- crawl_event = &healer->crawl_event;
-
- subvol = priv->children[child];
-
- //If this fails with ENOENT/ESTALE index is stale
- ret = syncop_gfid_to_path (this->itable, subvol, gfid, &path);
- if (ret < 0)
- return ret;
-
- ret = afr_selfheal (this, gfid);
-
- if (ret == -EIO) {
- eh = shd->split_brain;
- crawl_event->split_brain_count++;
- } else if (ret < 0) {
- crawl_event->heal_failed_count++;
- } else if (ret == 0) {
- crawl_event->healed_count++;
- }
-
- if (eh) {
- shd_event = GF_CALLOC (1, sizeof(*shd_event),
- gf_afr_mt_shd_event_t);
- if (!shd_event)
- goto out;
-
- shd_event->child = child;
- shd_event->path = path;
-
- if (eh_save_history (eh, shd_event) < 0)
- goto out;
-
- shd_event = NULL;
- path = NULL;
- }
+ int ret = 0;
+ eh_t *eh = NULL;
+ afr_private_t *priv = NULL;
+ afr_self_heald_t *shd = NULL;
+ shd_event_t *shd_event = NULL;
+ char *path = NULL;
+ xlator_t *subvol = NULL;
+ xlator_t *this = NULL;
+ crawl_event_t *crawl_event = NULL;
+
+ this = healer->this;
+ priv = this->private;
+ shd = &priv->shd;
+ crawl_event = &healer->crawl_event;
+
+ subvol = priv->children[child];
+
+ // If this fails with ENOENT/ESTALE index is stale
+ ret = syncop_gfid_to_path(this->itable, subvol, gfid, &path);
+ if (ret < 0)
+ return ret;
+
+ ret = afr_selfheal(this, gfid);
+
+ LOCK(&priv->lock);
+ {
+ if (ret == -EIO) {
+ eh = shd->split_brain;
+ crawl_event->split_brain_count++;
+ } else if (ret < 0) {
+ crawl_event->heal_failed_count++;
+ } else if (ret == 0) {
+ crawl_event->healed_count++;
+ }
+ }
+ UNLOCK(&priv->lock);
+
+ if (eh) {
+ shd_event = GF_CALLOC(1, sizeof(*shd_event), gf_afr_mt_shd_event_t);
+ if (!shd_event)
+ goto out;
+
+ shd_event->child = child;
+ shd_event->path = path;
+
+ if (eh_save_history(eh, shd_event) < 0)
+ goto out;
+
+ shd_event = NULL;
+ path = NULL;
+ }
out:
- GF_FREE (shd_event);
- GF_FREE (path);
- return ret;
+ GF_FREE(shd_event);
+ GF_FREE(path);
+ return ret;
}
-
void
-afr_shd_sweep_prepare (struct subvol_healer *healer)
+afr_shd_sweep_prepare(struct subvol_healer *healer)
{
- crawl_event_t *event = NULL;
+ crawl_event_t *event = NULL;
- event = &healer->crawl_event;
+ event = &healer->crawl_event;
- event->healed_count = 0;
- event->split_brain_count = 0;
- event->heal_failed_count = 0;
+ event->healed_count = 0;
+ event->split_brain_count = 0;
+ event->heal_failed_count = 0;
- time (&event->start_time);
- event->end_time = 0;
+ event->start_time = gf_time();
+ event->end_time = 0;
+ _mask_cancellation();
}
-
void
-afr_shd_sweep_done (struct subvol_healer *healer)
+afr_shd_sweep_done(struct subvol_healer *healer)
{
- crawl_event_t *event = NULL;
- crawl_event_t *history = NULL;
- afr_self_heald_t *shd = NULL;
+ crawl_event_t *event = NULL;
+ crawl_event_t *history = NULL;
+ afr_self_heald_t *shd = NULL;
- event = &healer->crawl_event;
- shd = &(((afr_private_t *)healer->this->private)->shd);
+ event = &healer->crawl_event;
+ shd = &(((afr_private_t *)healer->this->private)->shd);
- time (&event->end_time);
- history = memdup (event, sizeof (*event));
- event->start_time = 0;
+ event->end_time = gf_time();
+ history = gf_memdup(event, sizeof(*event));
+ event->start_time = 0;
- if (!history)
- return;
+ if (!history)
+ return;
- if (eh_save_history (shd->statistics[healer->subvol], history) < 0)
- GF_FREE (history);
+ if (eh_save_history(shd->statistics[healer->subvol], history) < 0)
+ GF_FREE(history);
+ _unmask_cancellation();
}
int
-afr_shd_index_heal (xlator_t *subvol, gf_dirent_t *entry, loc_t *parent,
- void *data)
+afr_shd_index_heal(xlator_t *subvol, gf_dirent_t *entry, loc_t *parent,
+ void *data)
{
- struct subvol_healer *healer = data;
- afr_private_t *priv = NULL;
- uuid_t gfid = {0};
- int ret = 0;
+ struct subvol_healer *healer = data;
+ afr_private_t *priv = NULL;
+ uuid_t gfid = {0};
+ int ret = 0;
+ uint64_t val = IA_INVAL;
- priv = healer->this->private;
- if (!priv->shd.enabled)
- return -EBUSY;
+ priv = healer->this->private;
+ if (!priv->shd.enabled)
+ return -EBUSY;
- gf_msg_debug (healer->this->name, 0, "got entry: %s",
- entry->d_name);
+ gf_msg_debug(healer->this->name, 0, "got entry: %s from %s", entry->d_name,
+ priv->children[healer->subvol]->name);
- ret = gf_uuid_parse (entry->d_name, gfid);
- if (ret)
- return 0;
+ ret = gf_uuid_parse(entry->d_name, gfid);
+ if (ret)
+ return 0;
- ret = afr_shd_selfheal (healer, healer->subvol, gfid);
+ inode_ctx_get2(parent->inode, subvol, NULL, &val);
- if (ret == -ENOENT || ret == -ESTALE)
- afr_shd_index_purge (subvol, parent->inode, entry->d_name);
- if (ret == 2)
- /* If bricks crashed in pre-op after creating indices/xattrop
- * link but before setting afr changelogs, we end up with stale
- * xattrop links but zero changelogs. Remove such entries by
- * sending a post-op with zero changelogs.
- */
- afr_shd_zero_xattrop (healer->this, gfid);
+ ret = afr_shd_selfheal(healer, healer->subvol, gfid);
- return 0;
+ if (ret == -ENOENT || ret == -ESTALE)
+ afr_shd_entry_purge(subvol, parent->inode, entry->d_name, val);
+
+ if (ret == 2)
+ /* If bricks crashed in pre-op after creating indices/xattrop
+ * link but before setting afr changelogs, we end up with stale
+ * xattrop links but zero changelogs. Remove such entries by
+ * sending a post-op with zero changelogs.
+ */
+ afr_shd_zero_xattrop(healer->this, gfid);
+
+ return 0;
}
int
-afr_shd_index_sweep (struct subvol_healer *healer, char *vgfid)
+afr_shd_index_sweep(struct subvol_healer *healer, char *vgfid)
{
- loc_t loc = {0};
- afr_private_t *priv = NULL;
- int ret = 0;
- xlator_t *subvol = NULL;
-
- priv = healer->this->private;
- subvol = priv->children[healer->subvol];
-
- loc.inode = afr_shd_index_inode (healer->this, subvol, vgfid);
- if (!loc.inode) {
- gf_msg (healer->this->name, GF_LOG_WARNING,
- 0, AFR_MSG_INDEX_DIR_GET_FAILED,
- "unable to get index-dir on %s", subvol->name);
- return -errno;
- }
-
- ret = syncop_dir_scan (subvol, &loc, GF_CLIENT_PID_SELF_HEALD,
- healer, afr_shd_index_heal);
+ loc_t loc = {0};
+ afr_private_t *priv = NULL;
+ int ret = 0;
+ xlator_t *subvol = NULL;
+ dict_t *xdata = NULL;
+ call_frame_t *frame = NULL;
+
+ priv = healer->this->private;
+ subvol = priv->children[healer->subvol];
+
+ frame = afr_frame_create(healer->this, &ret);
+ if (!frame) {
+ ret = -ret;
+ goto out;
+ }
+
+ loc.inode = afr_shd_index_inode(healer->this, subvol, vgfid);
+ if (!loc.inode) {
+ gf_msg(healer->this->name, GF_LOG_WARNING, 0,
+ AFR_MSG_INDEX_DIR_GET_FAILED, "unable to get index-dir on %s",
+ subvol->name);
+ ret = -errno;
+ goto out;
+ }
+
+ xdata = dict_new();
+ if (!xdata || dict_set_int32_sizen(xdata, "get-gfid-type", 1)) {
+ ret = -ENOMEM;
+ goto out;
+ }
+
+ ret = syncop_mt_dir_scan(frame, subvol, &loc, GF_CLIENT_PID_SELF_HEALD,
+ healer, afr_shd_index_heal, xdata,
+ priv->shd.max_threads, priv->shd.wait_qlength);
+
+ if (ret == 0)
+ ret = healer->crawl_event.healed_count;
- inode_forget (loc.inode, 1);
- loc_wipe (&loc);
+out:
+ loc_wipe(&loc);
- if (ret == 0)
- ret = healer->crawl_event.healed_count;
+ if (xdata)
+ dict_unref(xdata);
+ if (frame)
+ AFR_STACK_DESTROY(frame);
+ return ret;
+}
- return ret;
+int
+afr_shd_index_sweep_all(struct subvol_healer *healer)
+{
+ int ret = 0;
+ int count = 0;
+
+ ret = afr_shd_index_sweep(healer, GF_XATTROP_INDEX_GFID);
+ if (ret < 0)
+ goto out;
+ count = ret;
+
+ ret = afr_shd_index_sweep(healer, GF_XATTROP_DIRTY_GFID);
+ if (ret < 0)
+ goto out;
+ count += ret;
+
+ ret = afr_shd_index_sweep(healer, GF_XATTROP_ENTRY_CHANGES_GFID);
+ if (ret < 0)
+ goto out;
+ count += ret;
+out:
+ if (ret < 0)
+ return ret;
+ else
+ return count;
}
int
-afr_shd_index_sweep_all (struct subvol_healer *healer)
+afr_shd_full_heal(xlator_t *subvol, gf_dirent_t *entry, loc_t *parent,
+ void *data)
{
- int ret = 0;
- int count = 0;
+ struct subvol_healer *healer = data;
+ xlator_t *this = healer->this;
+ afr_private_t *priv = NULL;
- ret = afr_shd_index_sweep (healer, GF_XATTROP_INDEX_GFID);
- if (ret < 0)
- goto out;
- count = ret;
+ priv = this->private;
- ret = afr_shd_index_sweep (healer, GF_XATTROP_DIRTY_GFID);
- if (ret < 0)
- goto out;
- count += ret;
+ if (this->cleanup_starting) {
+ return -ENOTCONN;
+ }
-out:
- if (ret < 0)
- return ret;
- else
- return count;
+ if (!priv->shd.enabled)
+ return -EBUSY;
+
+ afr_shd_selfheal_name(healer, healer->subvol, parent->inode->gfid,
+ entry->d_name);
+
+ afr_shd_selfheal(healer, healer->subvol, entry->d_stat.ia_gfid);
+
+ return 0;
}
int
-afr_shd_full_heal (xlator_t *subvol, gf_dirent_t *entry, loc_t *parent,
- void *data)
+afr_shd_full_sweep(struct subvol_healer *healer, inode_t *inode)
{
- struct subvol_healer *healer = data;
- xlator_t *this = healer->this;
- afr_private_t *priv = NULL;
+ afr_private_t *priv = NULL;
+ loc_t loc = {0};
- priv = this->private;
- if (!priv->shd.enabled)
- return -EBUSY;
+ priv = healer->this->private;
+ loc.inode = inode;
+ return syncop_ftw(priv->children[healer->subvol], &loc,
+ GF_CLIENT_PID_SELF_HEALD, healer, afr_shd_full_heal);
+}
- afr_shd_selfheal_name (healer, healer->subvol,
- parent->inode->gfid, entry->d_name);
+int
+afr_shd_fill_ta_loc(xlator_t *this, loc_t *loc)
+{
+ afr_private_t *priv = NULL;
+ struct iatt stbuf = {
+ 0,
+ };
+ int ret = -1;
+
+ priv = this->private;
+ loc->parent = inode_ref(this->itable->root);
+ gf_uuid_copy(loc->pargfid, loc->parent->gfid);
+ loc->name = priv->pending_key[THIN_ARBITER_BRICK_INDEX];
+ loc->inode = inode_new(loc->parent->table);
+ GF_CHECK_ALLOC(loc->inode, ret, out);
+
+ if (!gf_uuid_is_null(priv->ta_gfid))
+ goto assign_gfid;
+
+ ret = syncop_lookup(priv->children[THIN_ARBITER_BRICK_INDEX], loc, &stbuf,
+ 0, 0, 0);
+ if (ret) {
+ gf_msg(this->name, GF_LOG_ERROR, -ret, AFR_MSG_THIN_ARB,
+ "Failed lookup on file %s.", loc->name);
+ goto out;
+ }
+
+ gf_uuid_copy(priv->ta_gfid, stbuf.ia_gfid);
+
+assign_gfid:
+ gf_uuid_copy(loc->gfid, priv->ta_gfid);
+ ret = 0;
- afr_shd_selfheal (healer, healer->subvol, entry->d_stat.ia_gfid);
+out:
+ if (ret)
+ loc_wipe(loc);
- return 0;
+ return ret;
}
int
-afr_shd_full_sweep (struct subvol_healer *healer, inode_t *inode)
+_afr_shd_ta_get_xattrs(xlator_t *this, loc_t *loc, dict_t **xdata)
{
- afr_private_t *priv = NULL;
- loc_t loc = {0};
-
- priv = healer->this->private;
- loc.inode = inode;
- return syncop_ftw (priv->children[healer->subvol], &loc,
- GF_CLIENT_PID_SELF_HEALD, healer,
- afr_shd_full_heal);
-}
+ afr_private_t *priv = NULL;
+ dict_t *xattr = NULL;
+ int raw[AFR_NUM_CHANGE_LOGS] = {
+ 0,
+ };
+ int ret = -1;
+ int i = 0;
+
+ priv = this->private;
+
+ xattr = dict_new();
+ if (!xattr) {
+ gf_msg(this->name, GF_LOG_ERROR, ENOMEM, AFR_MSG_DICT_GET_FAILED,
+ "Failed to create dict.");
+ goto out;
+ }
+ for (i = 0; i < priv->child_count; i++) {
+ ret = dict_set_static_bin(xattr, priv->pending_key[i], &raw,
+ AFR_NUM_CHANGE_LOGS * sizeof(int));
+ if (ret)
+ goto out;
+ }
+ ret = syncop_xattrop(priv->children[THIN_ARBITER_BRICK_INDEX], loc,
+ GF_XATTROP_ADD_ARRAY, xattr, NULL, xdata, NULL);
+ if (ret || !(*xdata)) {
+ gf_msg(this->name, GF_LOG_ERROR, -ret, AFR_MSG_THIN_ARB,
+ "Xattrop failed on %s.", loc->name);
+ }
-void *
-afr_shd_index_healer (void *data)
-{
- struct subvol_healer *healer = NULL;
- xlator_t *this = NULL;
- int ret = 0;
-
- healer = data;
- THIS = this = healer->this;
-
- for (;;) {
- afr_shd_healer_wait (healer);
-
- ASSERT_LOCAL(this, healer);
-
- do {
- gf_msg_debug (this->name, 0,
- "starting index sweep on subvol %s",
- afr_subvol_name (this, healer->subvol));
-
- afr_shd_sweep_prepare (healer);
-
- ret = afr_shd_index_sweep_all (healer);
-
- afr_shd_sweep_done (healer);
- /*
- As long as at least one gfid was
- healed, keep retrying. We may have
- just healed a directory and thereby
- created entries for other gfids which
- could not be healed thus far.
- */
-
- gf_msg_debug (this->name, 0,
- "finished index sweep on subvol %s",
- afr_subvol_name (this, healer->subvol));
- /*
- Give a pause before retrying to avoid a busy loop
- in case the only entry in index is because of
- an ongoing I/O.
- */
- sleep (1);
- } while (ret > 0);
- }
-
- return NULL;
-}
+out:
+ if (xattr)
+ dict_unref(xattr);
+ return ret;
+}
-void *
-afr_shd_full_healer (void *data)
+void
+afr_shd_ta_get_xattrs(xlator_t *this, loc_t *loc, struct subvol_healer *healer,
+ dict_t **xdata)
{
- struct subvol_healer *healer = NULL;
- xlator_t *this = NULL;
- int run = 0;
+ int ret = 0;
+
+ loc_wipe(loc);
+ if (afr_shd_fill_ta_loc(this, loc)) {
+ gf_msg(this->name, GF_LOG_ERROR, -ret, AFR_MSG_THIN_ARB,
+ "Failed to populate thin-arbiter loc for: %s.", loc->name);
+ ret = -1;
+ goto out;
+ }
+
+ ret = afr_ta_post_op_lock(this, loc);
+ if (ret)
+ goto out;
+
+ ret = _afr_shd_ta_get_xattrs(this, loc, xdata);
+ if (ret) {
+ if (*xdata) {
+ dict_unref(*xdata);
+ *xdata = NULL;
+ }
+ }
- healer = data;
- THIS = this = healer->this;
+ afr_ta_post_op_unlock(this, loc);
- for (;;) {
- pthread_mutex_lock (&healer->mutex);
- {
- run = __afr_shd_healer_wait (healer);
- if (!run)
- healer->running = _gf_false;
- }
- pthread_mutex_unlock (&healer->mutex);
+out:
+ if (ret)
+ healer->rerun = 1;
+}
- if (!run)
- break;
+int
+afr_shd_ta_unset_xattrs(xlator_t *this, loc_t *loc, dict_t **xdata, int healer)
+{
+ afr_private_t *priv = NULL;
+ dict_t *xattr = NULL;
+ gf_boolean_t need_xattrop = _gf_false;
+ void *pending_raw = NULL;
+ int *raw = NULL;
+ int pending[AFR_NUM_CHANGE_LOGS] = {
+ 0,
+ };
+ int i = 0;
+ int j = 0;
+ int val = 0;
+ int ret = -1;
+
+ priv = this->private;
+
+ xattr = dict_new();
+ if (!xattr) {
+ goto out;
+ }
+
+ for (i = 0; i < priv->child_count; i++) {
+ raw = GF_CALLOC(AFR_NUM_CHANGE_LOGS, sizeof(int), gf_afr_mt_int32_t);
+ if (!raw) {
+ goto out;
+ }
- ASSERT_LOCAL(this, healer);
+ ret = dict_get_ptr(*xdata, priv->pending_key[i], &pending_raw);
+ if (ret) {
+ gf_msg(this->name, GF_LOG_ERROR, -ret, AFR_MSG_DICT_GET_FAILED,
+ "Error getting value "
+ "of pending key %s",
+ priv->pending_key[i]);
+ GF_FREE(raw);
+ goto out;
+ }
- gf_msg (this->name, GF_LOG_INFO, 0, AFR_MSG_SELF_HEAL_INFO,
- "starting full sweep on subvol %s",
- afr_subvol_name (this, healer->subvol));
+ memcpy(pending, pending_raw, sizeof(pending));
+ for (j = 0; j < AFR_NUM_CHANGE_LOGS; j++) {
+ val = ntoh32(pending[j]);
+ if (val) {
+ if (i == healer) {
+ gf_msg(this->name, GF_LOG_INFO, 0, AFR_MSG_THIN_ARB,
+ "I am "
+ "not the good shd. Skipping. "
+ "SHD = %d.",
+ healer);
+ ret = 0;
+ GF_FREE(raw);
+ goto out;
+ }
+ need_xattrop = _gf_true;
+ raw[j] = hton32(-val);
+ }
+ }
- afr_shd_sweep_prepare (healer);
+ ret = dict_set_bin(xattr, priv->pending_key[i], raw,
+ AFR_NUM_CHANGE_LOGS * sizeof(int));
+ if (ret) {
+ GF_FREE(raw);
+ goto out;
+ }
- afr_shd_full_sweep (healer, this->itable->root);
+ if (need_xattrop)
+ break;
+ }
- afr_shd_sweep_done (healer);
+ if (!need_xattrop) {
+ ret = 0;
+ goto out;
+ }
- gf_msg (this->name, GF_LOG_INFO, 0, AFR_MSG_SELF_HEAL_INFO,
- "finished full sweep on subvol %s",
- afr_subvol_name (this, healer->subvol));
- }
+ ret = syncop_xattrop(priv->children[THIN_ARBITER_BRICK_INDEX], loc,
+ GF_XATTROP_ADD_ARRAY, xattr, NULL, NULL, NULL);
+ if (ret)
+ gf_msg(this->name, GF_LOG_ERROR, -ret, AFR_MSG_THIN_ARB,
+ "Xattrop failed.");
- return NULL;
-}
+out:
+ if (xattr)
+ dict_unref(xattr);
+ return ret;
+}
-int
-afr_shd_healer_init (xlator_t *this, struct subvol_healer *healer)
+void
+afr_shd_ta_check_and_unset_xattrs(xlator_t *this, loc_t *loc,
+ struct subvol_healer *healer,
+ dict_t *pre_crawl_xdata)
{
- int ret = 0;
+ int ret_lock = 0;
+ int ret = 0;
+ dict_t *post_crawl_xdata = NULL;
- ret = pthread_mutex_init (&healer->mutex, NULL);
- if (ret)
- goto out;
+ ret_lock = afr_ta_post_op_lock(this, loc);
+ if (ret_lock)
+ goto unref;
- ret = pthread_cond_init (&healer->cond, NULL);
- if (ret)
- goto out;
+ ret = _afr_shd_ta_get_xattrs(this, loc, &post_crawl_xdata);
+ if (ret)
+ goto unref;
- healer->this = this;
- healer->running = _gf_false;
- healer->rerun = _gf_false;
- healer->local = _gf_false;
-out:
- return ret;
-}
+ if (!are_dicts_equal(pre_crawl_xdata, post_crawl_xdata, NULL, NULL)) {
+ ret = -1;
+ goto unref;
+ }
+ ret = afr_shd_ta_unset_xattrs(this, loc, &post_crawl_xdata, healer->subvol);
-int
-afr_shd_healer_spawn (xlator_t *this, struct subvol_healer *healer,
- void *(threadfn)(void *))
-{
- int ret = 0;
-
- pthread_mutex_lock (&healer->mutex);
- {
- if (healer->running) {
- pthread_cond_signal (&healer->cond);
- } else {
- ret = gf_thread_create (&healer->thread, NULL,
- threadfn, healer);
- if (ret)
- goto unlock;
- healer->running = 1;
- }
-
- healer->rerun = 1;
- }
-unlock:
- pthread_mutex_unlock (&healer->mutex);
-
- return ret;
-}
+unref:
+ if (post_crawl_xdata) {
+ dict_unref(post_crawl_xdata);
+ post_crawl_xdata = NULL;
+ }
+ if (ret || ret_lock)
+ healer->rerun = 1;
-int
-afr_shd_full_healer_spawn (xlator_t *this, int subvol)
-{
- return afr_shd_healer_spawn (this, NTH_FULL_HEALER (this, subvol),
- afr_shd_full_healer);
+ if (!ret_lock)
+ afr_ta_post_op_unlock(this, loc);
}
-
-int
-afr_shd_index_healer_spawn (xlator_t *this, int subvol)
+gf_boolean_t
+afr_bricks_available_for_heal(afr_private_t *priv)
{
- return afr_shd_healer_spawn (this, NTH_INDEX_HEALER (this, subvol),
- afr_shd_index_healer);
-}
+ int up_children = 0;
+ up_children = __afr_get_up_children_count(priv);
+ if (up_children < 2) {
+ return _gf_false;
+ }
+ return _gf_true;
+}
-int
-afr_shd_dict_add_crawl_event (xlator_t *this, dict_t *output,
- crawl_event_t *crawl_event)
+static gf_boolean_t
+afr_shd_ta_needs_heal(xlator_t *this, struct subvol_healer *healer)
{
- int ret = 0;
- uint64_t count = 0;
- char key[256] = {0};
- int xl_id = 0;
- uint64_t healed_count = 0;
- uint64_t split_brain_count = 0;
- uint64_t heal_failed_count = 0;
- char *start_time_str = 0;
- char *end_time_str = NULL;
- char *crawl_type = NULL;
- int progress = -1;
- int child = -1;
-
- child = crawl_event->child;
- healed_count = crawl_event->healed_count;
- split_brain_count = crawl_event->split_brain_count;
- heal_failed_count = crawl_event->heal_failed_count;
- crawl_type = crawl_event->crawl_type;
-
- if (!crawl_event->start_time)
- goto out;
-
- start_time_str = gf_strdup (ctime (&crawl_event->start_time));
-
- if (crawl_event->end_time)
- end_time_str = gf_strdup (ctime (&crawl_event->end_time));
-
- ret = dict_get_int32 (output, this->name, &xl_id);
- if (ret) {
- gf_msg (this->name, GF_LOG_ERROR, -ret,
- AFR_MSG_DICT_GET_FAILED, "xl does not have id");
- goto out;
+ dict_t *xdata = NULL;
+ afr_private_t *priv = NULL;
+ loc_t loc = {
+ 0,
+ };
+ int ret = -1;
+ int i = 0;
+ gf_boolean_t need_heal = _gf_false;
+
+ priv = this->private;
+
+ ret = afr_shd_fill_ta_loc(this, &loc);
+ if (ret) {
+ gf_msg(this->name, GF_LOG_ERROR, -ret, AFR_MSG_THIN_ARB,
+ "Failed to populate thin-arbiter loc for: %s.", loc.name);
+ healer->rerun = 1;
+ goto out;
+ }
+
+ if (_afr_shd_ta_get_xattrs(this, &loc, &xdata)) {
+ healer->rerun = 1;
+ goto out;
+ }
+
+ for (i = 0; i < priv->child_count; i++) {
+ if (afr_ta_dict_contains_pending_xattr(xdata, priv, i)) {
+ need_heal = _gf_true;
+ break;
}
+ }
- snprintf (key, sizeof (key), "statistics-%d-%d-count", xl_id, child);
- ret = dict_get_uint64 (output, key, &count);
+out:
+ if (xdata)
+ dict_unref(xdata);
+ loc_wipe(&loc);
+ return need_heal;
+}
- snprintf (key, sizeof (key), "statistics_healed_cnt-%d-%d-%"PRIu64,
- xl_id, child, count);
- ret = dict_set_uint64(output, key, healed_count);
- if (ret) {
- gf_msg (this->name, GF_LOG_ERROR,
- -ret, AFR_MSG_DICT_SET_FAILED,
- "Could not add statistics_healed_count to outout");
- goto out;
- }
-
- snprintf (key, sizeof (key), "statistics_sb_cnt-%d-%d-%"PRIu64,
- xl_id, child, count);
- ret = dict_set_uint64 (output, key, split_brain_count);
- if (ret) {
- gf_msg (this->name, GF_LOG_ERROR,
- -ret, AFR_MSG_DICT_SET_FAILED,
- "Could not add statistics_split_brain_count to outout");
- goto out;
+static int
+afr_shd_anon_inode_cleaner(xlator_t *subvol, gf_dirent_t *entry, loc_t *parent,
+ void *data)
+{
+ struct subvol_healer *healer = data;
+ afr_private_t *priv = healer->this->private;
+ call_frame_t *frame = NULL;
+ afr_local_t *local = NULL;
+ int ret = 0;
+ loc_t loc = {0};
+ int count = 0;
+ int i = 0;
+ int op_errno = 0;
+ struct iatt *iatt = NULL;
+ gf_boolean_t multiple_links = _gf_false;
+ unsigned char *gfid_present = alloca0(priv->child_count);
+ unsigned char *entry_present = alloca0(priv->child_count);
+ char *type = "file";
+
+ frame = afr_frame_create(healer->this, &ret);
+ if (!frame) {
+ ret = -ret;
+ goto out;
+ }
+ local = frame->local;
+ if (AFR_COUNT(local->child_up, priv->child_count) != priv->child_count) {
+ gf_msg_debug(healer->this->name, 0,
+ "Not all bricks are up. Skipping "
+ "cleanup of %s on %s",
+ entry->d_name, subvol->name);
+ ret = 0;
+ goto out;
+ }
+
+ loc.inode = inode_new(parent->inode->table);
+ if (!loc.inode) {
+ ret = -ENOMEM;
+ goto out;
+ }
+ ret = gf_uuid_parse(entry->d_name, loc.gfid);
+ if (ret) {
+ ret = 0;
+ goto out;
+ }
+ AFR_ONLIST(local->child_up, frame, afr_selfheal_discover_cbk, lookup, &loc,
+ NULL);
+ for (i = 0; i < priv->child_count; i++) {
+ if (local->replies[i].op_ret == 0) {
+ count++;
+ gfid_present[i] = 1;
+ iatt = &local->replies[i].poststat;
+ if (iatt->ia_type == IA_IFDIR) {
+ type = "dir";
+ }
+
+ if (i == healer->subvol) {
+ if (local->replies[i].poststat.ia_nlink > 1) {
+ multiple_links = _gf_true;
+ }
+ }
+ } else if (local->replies[i].op_errno != ENOENT &&
+ local->replies[i].op_errno != ESTALE) {
+ /*We don't have complete view. Skip the entry*/
+ gf_msg_debug(healer->this->name, local->replies[i].op_errno,
+ "Skipping cleanup of %s on %s", entry->d_name,
+ subvol->name);
+ ret = 0;
+ goto out;
}
-
- snprintf (key, sizeof (key), "statistics_crawl_type-%d-%d-%"PRIu64,
- xl_id, child, count);
- ret = dict_set_str (output, key, crawl_type);
- if (ret) {
- gf_msg (this->name, GF_LOG_ERROR,
- -ret, AFR_MSG_DICT_SET_FAILED,
- "Could not add statistics_crawl_type to output");
- goto out;
+ }
+
+ /*Inode is deleted from subvol*/
+ if (count == 1 || (iatt->ia_type != IA_IFDIR && multiple_links)) {
+ gf_msg(healer->this->name, GF_LOG_WARNING, 0,
+ AFR_MSG_EXPUNGING_FILE_OR_DIR, "expunging %s %s/%s on %s", type,
+ priv->anon_inode_name, entry->d_name, subvol->name);
+ ret = afr_shd_entry_purge(subvol, parent->inode, entry->d_name,
+ iatt->ia_type);
+ if (ret == -ENOENT || ret == -ESTALE)
+ ret = 0;
+ } else if (count > 1) {
+ loc_wipe(&loc);
+ loc.parent = inode_ref(parent->inode);
+ loc.name = entry->d_name;
+ loc.inode = inode_new(parent->inode->table);
+ if (!loc.inode) {
+ ret = -ENOMEM;
+ goto out;
}
-
- snprintf (key, sizeof (key), "statistics_heal_failed_cnt-%d-%d-%"PRIu64,
- xl_id, child, count);
- ret = dict_set_uint64 (output, key, heal_failed_count);
- if (ret) {
- gf_msg (this->name, GF_LOG_ERROR,
- -ret, AFR_MSG_DICT_SET_FAILED,
- "Could not add statistics_healed_failed_count to outout");
+ AFR_ONLIST(local->child_up, frame, afr_selfheal_discover_cbk, lookup,
+ &loc, NULL);
+ count = 0;
+ for (i = 0; i < priv->child_count; i++) {
+ if (local->replies[i].op_ret == 0) {
+ count++;
+ entry_present[i] = 1;
+ iatt = &local->replies[i].poststat;
+ } else if (local->replies[i].op_errno != ENOENT &&
+ local->replies[i].op_errno != ESTALE) {
+ /*We don't have complete view. Skip the entry*/
+ gf_msg_debug(healer->this->name, local->replies[i].op_errno,
+ "Skipping cleanup of %s on %s", entry->d_name,
+ subvol->name);
+ ret = 0;
goto out;
+ }
}
-
- snprintf (key, sizeof (key), "statistics_strt_time-%d-%d-%"PRIu64,
- xl_id, child, count);
- ret = dict_set_dynstr (output, key, start_time_str);
- if (ret) {
- gf_msg (this->name, GF_LOG_ERROR,
- -ret, AFR_MSG_DICT_SET_FAILED,
- "Could not add statistics_crawl_start_time to outout");
- goto out;
- } else {
- start_time_str = NULL;
- }
-
- if (!end_time_str)
- progress = 1;
- else
- progress = 0;
-
- snprintf (key, sizeof (key), "statistics_end_time-%d-%d-%"PRIu64,
- xl_id, child, count);
- if (!end_time_str)
- end_time_str = gf_strdup ("Could not determine the end time");
- ret = dict_set_dynstr (output, key, end_time_str);
- if (ret) {
- gf_msg (this->name, GF_LOG_ERROR,
- -ret, AFR_MSG_DICT_SET_FAILED,
- "Could not add statistics_crawl_end_time to outout");
+ for (i = 0; i < priv->child_count; i++) {
+ if (gfid_present[i] && !entry_present[i]) {
+ /*Entry is not anonymous on at least one subvol*/
+ gf_msg_debug(healer->this->name, 0,
+ "Valid entry present on %s "
+ "Skipping cleanup of %s on %s",
+ priv->children[i]->name, entry->d_name,
+ subvol->name);
+ ret = 0;
goto out;
- } else {
- end_time_str = NULL;
- }
-
- snprintf (key, sizeof (key), "statistics_inprogress-%d-%d-%"PRIu64,
- xl_id, child, count);
+ }
+ }
- ret = dict_set_int32 (output, key, progress);
- if (ret) {
- gf_msg (this->name, GF_LOG_ERROR,
- -ret, AFR_MSG_DICT_SET_FAILED,
- "Could not add statistics_inprogress to outout");
- goto out;
+ gf_msg(healer->this->name, GF_LOG_WARNING, 0,
+ AFR_MSG_EXPUNGING_FILE_OR_DIR,
+ "expunging %s %s/%s on all subvols", type, priv->anon_inode_name,
+ entry->d_name);
+ ret = 0;
+ for (i = 0; i < priv->child_count; i++) {
+ op_errno = -afr_shd_entry_purge(priv->children[i], loc.parent,
+ entry->d_name, iatt->ia_type);
+ if (op_errno != ENOENT && op_errno != ESTALE) {
+ ret |= -op_errno;
+ }
}
+ }
- snprintf (key, sizeof (key), "statistics-%d-%d-count", xl_id, child);
- ret = dict_set_uint64 (output, key, count + 1);
- if (ret) {
- gf_msg (this->name, GF_LOG_ERROR,
- -ret, AFR_MSG_DICT_SET_FAILED,
- "Could not increment the counter.");
- goto out;
- }
out:
- GF_FREE (start_time_str);
- GF_FREE (end_time_str);
- return ret;
+ if (frame)
+ AFR_STACK_DESTROY(frame);
+ loc_wipe(&loc);
+ return ret;
}
-
-int
-afr_shd_dict_add_path (xlator_t *this, dict_t *output, int child, char *path,
- struct timeval *tv)
+static void
+afr_cleanup_anon_inode_dir(struct subvol_healer *healer)
{
- int ret = -1;
- uint64_t count = 0;
- char key[256] = {0};
- int xl_id = 0;
+ int ret = 0;
+ call_frame_t *frame = NULL;
+ afr_private_t *priv = healer->this->private;
+ loc_t loc = {0};
+
+ ret = afr_anon_inode_create(healer->this, healer->subvol, &loc.inode);
+ if (ret)
+ goto out;
+
+ frame = afr_frame_create(healer->this, &ret);
+ if (!frame) {
+ ret = -ret;
+ goto out;
+ }
+
+ ret = syncop_mt_dir_scan(frame, priv->children[healer->subvol], &loc,
+ GF_CLIENT_PID_SELF_HEALD, healer,
+ afr_shd_anon_inode_cleaner, NULL,
+ priv->shd.max_threads, priv->shd.wait_qlength);
+out:
+ if (frame)
+ AFR_STACK_DESTROY(frame);
+ loc_wipe(&loc);
+ return;
+}
- ret = dict_get_int32 (output, this->name, &xl_id);
- if (ret) {
- gf_msg (this->name, GF_LOG_ERROR, -ret,
- AFR_MSG_DICT_GET_FAILED, "xl does not have id");
- goto out;
+void *
+afr_shd_index_healer(void *data)
+{
+ struct subvol_healer *healer = NULL;
+ xlator_t *this = NULL;
+ int ret = 0;
+ afr_private_t *priv = NULL;
+ dict_t *pre_crawl_xdata = NULL;
+ loc_t loc = {
+ 0,
+ };
+
+ healer = data;
+ THIS = this = healer->this;
+ priv = this->private;
+
+ for (;;) {
+ afr_shd_healer_wait(healer);
+
+ if (!afr_bricks_available_for_heal(priv))
+ continue;
+
+ ASSERT_LOCAL(this, healer);
+ priv->local[healer->subvol] = healer->local;
+
+ if (priv->thin_arbiter_count) {
+ if (afr_shd_ta_needs_heal(this, healer))
+ afr_shd_ta_get_xattrs(this, &loc, healer, &pre_crawl_xdata);
}
- snprintf (key, sizeof (key), "%d-%d-count", xl_id, child);
- ret = dict_get_uint64 (output, key, &count);
+ do {
+ gf_msg_debug(this->name, 0, "starting index sweep on subvol %s",
+ afr_subvol_name(this, healer->subvol));
+
+ afr_shd_sweep_prepare(healer);
+
+ ret = afr_shd_index_sweep_all(healer);
+
+ afr_shd_sweep_done(healer);
+ /*
+ As long as at least one gfid was
+ healed, keep retrying. We may have
+ just healed a directory and thereby
+ created entries for other gfids which
+ could not be healed thus far.
+ */
+
+ gf_msg_debug(this->name, 0, "finished index sweep on subvol %s",
+ afr_subvol_name(this, healer->subvol));
+ /*
+ Give a pause before retrying to avoid a busy loop
+ in case the only entry in index is because of
+ an ongoing I/O.
+ */
+ sleep(1);
+ } while (ret > 0);
+
+ if (ret == 0) {
+ afr_cleanup_anon_inode_dir(healer);
+ }
- snprintf (key, sizeof (key), "%d-%d-%"PRIu64, xl_id, child, count);
- ret = dict_set_dynstr (output, key, path);
+ if (ret == 0 && pre_crawl_xdata &&
+ !healer->crawl_event.heal_failed_count) {
+ afr_shd_ta_check_and_unset_xattrs(this, &loc, healer,
+ pre_crawl_xdata);
+ }
- if (ret) {
- gf_msg (this->name, GF_LOG_ERROR, -ret,
- AFR_MSG_DICT_SET_FAILED, "%s: Could not add to output",
- path);
- goto out;
+ if (pre_crawl_xdata) {
+ dict_unref(pre_crawl_xdata);
+ pre_crawl_xdata = NULL;
}
+ }
- if (tv) {
- snprintf (key, sizeof (key), "%d-%d-%"PRIu64"-time", xl_id,
- child, count);
- ret = dict_set_uint32 (output, key, tv->tv_sec);
- if (ret) {
- gf_msg (this->name, GF_LOG_ERROR,
- -ret, AFR_MSG_DICT_SET_FAILED,
- "%s: Could not set time",
- path);
- goto out;
- }
- }
-
- snprintf (key, sizeof (key), "%d-%d-count", xl_id, child);
-
- ret = dict_set_uint64 (output, key, count + 1);
- if (ret) {
- gf_msg (this->name, GF_LOG_ERROR,
- -ret, AFR_MSG_DICT_SET_FAILED,
- "Could not increment count");
- goto out;
+ return NULL;
+}
+
+void *
+afr_shd_full_healer(void *data)
+{
+ struct subvol_healer *healer = NULL;
+ xlator_t *this = NULL;
+ int run = 0;
+
+ healer = data;
+ THIS = this = healer->this;
+
+ for (;;) {
+ pthread_mutex_lock(&healer->mutex);
+ {
+ run = __afr_shd_healer_wait(healer);
+ if (!run)
+ healer->running = _gf_false;
}
+ pthread_mutex_unlock(&healer->mutex);
- ret = 0;
-out:
- return ret;
+ if (!run)
+ break;
+
+ ASSERT_LOCAL(this, healer);
+
+ gf_msg(this->name, GF_LOG_INFO, 0, AFR_MSG_SELF_HEAL_INFO,
+ "starting full sweep on subvol %s",
+ afr_subvol_name(this, healer->subvol));
+
+ afr_shd_sweep_prepare(healer);
+
+ afr_shd_full_sweep(healer, this->itable->root);
+
+ afr_shd_sweep_done(healer);
+
+ gf_msg(this->name, GF_LOG_INFO, 0, AFR_MSG_SELF_HEAL_INFO,
+ "finished full sweep on subvol %s",
+ afr_subvol_name(this, healer->subvol));
+ }
+
+ return NULL;
}
int
-afr_add_shd_event (circular_buffer_t *cb, void *data)
+afr_shd_healer_init(xlator_t *this, struct subvol_healer *healer)
{
- dict_t *output = NULL;
- xlator_t *this = THIS;
- afr_private_t *priv = NULL;
- afr_self_heald_t *shd = NULL;
- shd_event_t *shd_event = NULL;
- char *path = NULL;
-
- output = data;
- priv = this->private;
- shd = &priv->shd;
- shd_event = cb->data;
-
- if (!shd->index_healers[shd_event->child].local)
- return 0;
-
- path = gf_strdup (shd_event->path);
- if (!path)
- return -ENOMEM;
-
- afr_shd_dict_add_path (this, output, shd_event->child, path,
- &cb->tv);
- return 0;
+ int ret = 0;
+
+ ret = pthread_mutex_init(&healer->mutex, NULL);
+ if (ret)
+ goto out;
+
+ ret = pthread_cond_init(&healer->cond, NULL);
+ if (ret)
+ goto out;
+
+ healer->this = this;
+ healer->running = _gf_false;
+ healer->rerun = _gf_false;
+ healer->local = _gf_false;
+out:
+ return ret;
}
int
-afr_add_crawl_event (circular_buffer_t *cb, void *data)
+afr_shd_healer_spawn(xlator_t *this, struct subvol_healer *healer,
+ void *(threadfn)(void *))
{
- dict_t *output = NULL;
- xlator_t *this = THIS;
- afr_private_t *priv = NULL;
- afr_self_heald_t *shd = NULL;
- crawl_event_t *crawl_event = NULL;
+ int ret = 0;
- output = data;
- priv = this->private;
- shd = &priv->shd;
- crawl_event = cb->data;
+ pthread_mutex_lock(&healer->mutex);
+ {
+ if (healer->running) {
+ pthread_cond_signal(&healer->cond);
+ } else {
+ ret = gf_thread_create(&healer->thread, NULL, threadfn, healer,
+ "shdheal");
+ if (ret)
+ goto unlock;
+ healer->running = 1;
+ }
- if (!shd->index_healers[crawl_event->child].local)
- return 0;
+ healer->rerun = 1;
+ }
+unlock:
+ pthread_mutex_unlock(&healer->mutex);
- afr_shd_dict_add_crawl_event (this, output, crawl_event);
+ return ret;
+}
- return 0;
+int
+afr_shd_full_healer_spawn(xlator_t *this, int subvol)
+{
+ return afr_shd_healer_spawn(this, NTH_FULL_HEALER(this, subvol),
+ afr_shd_full_healer);
}
+int
+afr_shd_index_healer_spawn(xlator_t *this, int subvol)
+{
+ return afr_shd_healer_spawn(this, NTH_INDEX_HEALER(this, subvol),
+ afr_shd_index_healer);
+}
int
-afr_selfheal_daemon_init (xlator_t *this)
+afr_shd_dict_add_crawl_event(xlator_t *this, dict_t *output,
+ crawl_event_t *crawl_event)
{
- afr_private_t *priv = NULL;
- afr_self_heald_t *shd = NULL;
- int ret = -1;
- int i = 0;
-
- priv = this->private;
- shd = &priv->shd;
-
- this->itable = inode_table_new (SHD_INODE_LRU_LIMIT, this);
- if (!this->itable)
- goto out;
-
- shd->index_healers = GF_CALLOC (sizeof(*shd->index_healers),
- priv->child_count,
- gf_afr_mt_subvol_healer_t);
- if (!shd->index_healers)
- goto out;
-
- for (i = 0; i < priv->child_count; i++) {
- shd->index_healers[i].subvol = i;
- ret = afr_shd_healer_init (this, &shd->index_healers[i]);
- if (ret)
- goto out;
- }
-
- shd->full_healers = GF_CALLOC (sizeof(*shd->full_healers),
- priv->child_count,
- gf_afr_mt_subvol_healer_t);
- if (!shd->full_healers)
- goto out;
- for (i = 0; i < priv->child_count; i++) {
- shd->full_healers[i].subvol = i;
- ret = afr_shd_healer_init (this, &shd->full_healers[i]);
- if (ret)
- goto out;
- }
-
- shd->split_brain = eh_new (AFR_EH_SPLIT_BRAIN_LIMIT, _gf_false,
- afr_destroy_shd_event_data);
- if (!shd->split_brain)
- goto out;
-
- shd->statistics = GF_CALLOC (sizeof(eh_t *), priv->child_count,
- gf_common_mt_eh_t);
- if (!shd->statistics)
- goto out;
+ int ret = 0;
+ uint64_t count = 0;
+ char key[128] = {0};
+ int keylen = 0;
+ char suffix[64] = {0};
+ int xl_id = 0;
+ uint64_t healed_count = 0;
+ uint64_t split_brain_count = 0;
+ uint64_t heal_failed_count = 0;
+ char *start_time_str = 0;
+ char *end_time_str = NULL;
+ char *crawl_type = NULL;
+ int progress = -1;
+ int child = -1;
+
+ child = crawl_event->child;
+ healed_count = crawl_event->healed_count;
+ split_brain_count = crawl_event->split_brain_count;
+ heal_failed_count = crawl_event->heal_failed_count;
+ crawl_type = crawl_event->crawl_type;
+
+ if (!crawl_event->start_time)
+ goto out;
+
+ start_time_str = gf_strdup(ctime(&crawl_event->start_time));
+
+ if (crawl_event->end_time)
+ end_time_str = gf_strdup(ctime(&crawl_event->end_time));
+
+ ret = dict_get_int32(output, this->name, &xl_id);
+ if (ret) {
+ gf_msg(this->name, GF_LOG_ERROR, -ret, AFR_MSG_DICT_GET_FAILED,
+ "xl does not have id");
+ goto out;
+ }
+
+ snprintf(key, sizeof(key), "statistics-%d-%d-count", xl_id, child);
+ ret = dict_get_uint64(output, key, &count);
+
+ snprintf(suffix, sizeof(suffix), "%d-%d-%" PRIu64, xl_id, child, count);
+ snprintf(key, sizeof(key), "statistics_healed_cnt-%s", suffix);
+ ret = dict_set_uint64(output, key, healed_count);
+ if (ret) {
+ gf_msg(this->name, GF_LOG_ERROR, -ret, AFR_MSG_DICT_SET_FAILED,
+ "Could not add statistics_healed_count to output");
+ goto out;
+ }
+
+ snprintf(key, sizeof(key), "statistics_sb_cnt-%s", suffix);
+ ret = dict_set_uint64(output, key, split_brain_count);
+ if (ret) {
+ gf_msg(this->name, GF_LOG_ERROR, -ret, AFR_MSG_DICT_SET_FAILED,
+ "Could not add statistics_split_brain_count to output");
+ goto out;
+ }
+
+ keylen = snprintf(key, sizeof(key), "statistics_crawl_type-%s", suffix);
+ ret = dict_set_strn(output, key, keylen, crawl_type);
+ if (ret) {
+ gf_msg(this->name, GF_LOG_ERROR, -ret, AFR_MSG_DICT_SET_FAILED,
+ "Could not add statistics_crawl_type to output");
+ goto out;
+ }
+
+ snprintf(key, sizeof(key), "statistics_heal_failed_cnt-%s", suffix);
+ ret = dict_set_uint64(output, key, heal_failed_count);
+ if (ret) {
+ gf_msg(this->name, GF_LOG_ERROR, -ret, AFR_MSG_DICT_SET_FAILED,
+ "Could not add statistics_healed_failed_count to output");
+ goto out;
+ }
+
+ keylen = snprintf(key, sizeof(key), "statistics_strt_time-%s", suffix);
+ ret = dict_set_dynstrn(output, key, keylen, start_time_str);
+ if (ret) {
+ gf_msg(this->name, GF_LOG_ERROR, -ret, AFR_MSG_DICT_SET_FAILED,
+ "Could not add statistics_crawl_start_time to output");
+ goto out;
+ } else {
+ start_time_str = NULL;
+ }
+
+ if (!end_time_str)
+ progress = 1;
+ else
+ progress = 0;
+
+ keylen = snprintf(key, sizeof(key), "statistics_end_time-%s", suffix);
+ if (!end_time_str)
+ end_time_str = gf_strdup("Could not determine the end time");
+ ret = dict_set_dynstrn(output, key, keylen, end_time_str);
+ if (ret) {
+ gf_msg(this->name, GF_LOG_ERROR, -ret, AFR_MSG_DICT_SET_FAILED,
+ "Could not add statistics_crawl_end_time to output");
+ goto out;
+ } else {
+ end_time_str = NULL;
+ }
+
+ keylen = snprintf(key, sizeof(key), "statistics_inprogress-%s", suffix);
+
+ ret = dict_set_int32n(output, key, keylen, progress);
+ if (ret) {
+ gf_msg(this->name, GF_LOG_ERROR, -ret, AFR_MSG_DICT_SET_FAILED,
+ "Could not add statistics_inprogress to output");
+ goto out;
+ }
+
+ snprintf(key, sizeof(key), "statistics-%d-%d-count", xl_id, child);
+ ret = dict_set_uint64(output, key, count + 1);
+ if (ret) {
+ gf_msg(this->name, GF_LOG_ERROR, -ret, AFR_MSG_DICT_SET_FAILED,
+ "Could not increment the counter.");
+ goto out;
+ }
+out:
+ GF_FREE(start_time_str);
+ GF_FREE(end_time_str);
+ return ret;
+}
- for (i = 0; i < priv->child_count ; i++) {
- shd->statistics[i] = eh_new (AFR_STATISTICS_HISTORY_SIZE,
- _gf_false,
- afr_destroy_crawl_event_data);
- if (!shd->statistics[i])
- goto out;
- shd->full_healers[i].crawl_event.child = i;
- shd->full_healers[i].crawl_event.crawl_type = "FULL";
- shd->index_healers[i].crawl_event.child = i;
- shd->index_healers[i].crawl_event.crawl_type = "INDEX";
+int
+afr_shd_dict_add_path(xlator_t *this, dict_t *output, int child, char *path,
+ struct timeval *tv)
+{
+ int ret = -1;
+ uint64_t count = 0;
+ char key[64] = {0};
+ int keylen = 0;
+ char xl_id_child_str[32] = {0};
+ int xl_id = 0;
+
+ ret = dict_get_int32(output, this->name, &xl_id);
+ if (ret) {
+ gf_msg(this->name, GF_LOG_ERROR, -ret, AFR_MSG_DICT_GET_FAILED,
+ "xl does not have id");
+ goto out;
+ }
+
+ snprintf(xl_id_child_str, sizeof(xl_id_child_str), "%d-%d", xl_id, child);
+ snprintf(key, sizeof(key), "%s-count", xl_id_child_str);
+ ret = dict_get_uint64(output, key, &count);
+
+ keylen = snprintf(key, sizeof(key), "%s-%" PRIu64, xl_id_child_str, count);
+ ret = dict_set_dynstrn(output, key, keylen, path);
+
+ if (ret) {
+ gf_msg(this->name, GF_LOG_ERROR, -ret, AFR_MSG_DICT_SET_FAILED,
+ "%s: Could not add to output", path);
+ goto out;
+ }
+
+ if (tv) {
+ snprintf(key, sizeof(key), "%s-%" PRIu64 "-time", xl_id_child_str,
+ count);
+ ret = dict_set_uint32(output, key, tv->tv_sec);
+ if (ret) {
+ gf_msg(this->name, GF_LOG_ERROR, -ret, AFR_MSG_DICT_SET_FAILED,
+ "%s: Could not set time", path);
+ goto out;
}
+ }
+
+ snprintf(key, sizeof(key), "%s-count", xl_id_child_str);
- ret = 0;
+ ret = dict_set_uint64(output, key, count + 1);
+ if (ret) {
+ gf_msg(this->name, GF_LOG_ERROR, -ret, AFR_MSG_DICT_SET_FAILED,
+ "Could not increment count");
+ goto out;
+ }
+
+ ret = 0;
out:
- return ret;
+ return ret;
}
+int
+afr_add_shd_event(circular_buffer_t *cb, void *data)
+{
+ dict_t *output = NULL;
+ xlator_t *this = THIS;
+ afr_private_t *priv = NULL;
+ afr_self_heald_t *shd = NULL;
+ shd_event_t *shd_event = NULL;
+ char *path = NULL;
+
+ output = data;
+ priv = this->private;
+ shd = &priv->shd;
+ shd_event = cb->data;
+
+ if (!shd->index_healers[shd_event->child].local)
+ return 0;
+
+ path = gf_strdup(shd_event->path);
+ if (!path)
+ return -ENOMEM;
+
+ afr_shd_dict_add_path(this, output, shd_event->child, path, &cb->tv);
+ return 0;
+}
int
-afr_selfheal_childup (xlator_t *this, int subvol)
+afr_add_crawl_event(circular_buffer_t *cb, void *data)
{
- afr_shd_index_healer_spawn (this, subvol);
+ dict_t *output = NULL;
+ xlator_t *this = THIS;
+ afr_private_t *priv = NULL;
+ afr_self_heald_t *shd = NULL;
+ crawl_event_t *crawl_event = NULL;
+
+ output = data;
+ priv = this->private;
+ shd = &priv->shd;
+ crawl_event = cb->data;
+
+ if (!shd->index_healers[crawl_event->child].local)
+ return 0;
+
+ afr_shd_dict_add_crawl_event(this, output, crawl_event);
- return 0;
+ return 0;
+}
+
+int
+afr_selfheal_daemon_init(xlator_t *this)
+{
+ afr_private_t *priv = NULL;
+ afr_self_heald_t *shd = NULL;
+ int ret = -1;
+ int i = 0;
+
+ priv = this->private;
+ shd = &priv->shd;
+
+ shd->index_healers = GF_CALLOC(sizeof(*shd->index_healers),
+ priv->child_count,
+ gf_afr_mt_subvol_healer_t);
+ if (!shd->index_healers)
+ goto out;
+
+ for (i = 0; i < priv->child_count; i++) {
+ shd->index_healers[i].subvol = i;
+ ret = afr_shd_healer_init(this, &shd->index_healers[i]);
+ if (ret)
+ goto out;
+ }
+
+ shd->full_healers = GF_CALLOC(sizeof(*shd->full_healers), priv->child_count,
+ gf_afr_mt_subvol_healer_t);
+ if (!shd->full_healers)
+ goto out;
+ for (i = 0; i < priv->child_count; i++) {
+ shd->full_healers[i].subvol = i;
+ ret = afr_shd_healer_init(this, &shd->full_healers[i]);
+ if (ret)
+ goto out;
+ }
+
+ shd->split_brain = eh_new(AFR_EH_SPLIT_BRAIN_LIMIT, _gf_false,
+ afr_destroy_shd_event_data);
+ if (!shd->split_brain)
+ goto out;
+
+ shd->statistics = GF_CALLOC(sizeof(eh_t *), priv->child_count,
+ gf_common_mt_eh_t);
+ if (!shd->statistics)
+ goto out;
+
+ for (i = 0; i < priv->child_count; i++) {
+ shd->statistics[i] = eh_new(AFR_STATISTICS_HISTORY_SIZE, _gf_false,
+ afr_destroy_crawl_event_data);
+ if (!shd->statistics[i])
+ goto out;
+ shd->full_healers[i].crawl_event.child = i;
+ shd->full_healers[i].crawl_event.crawl_type = "FULL";
+ shd->index_healers[i].crawl_event.child = i;
+ shd->index_healers[i].crawl_event.crawl_type = "INDEX";
+ }
+
+ ret = 0;
+out:
+ return ret;
}
+void
+afr_selfheal_childup(xlator_t *this, afr_private_t *priv)
+{
+ int subvol = 0;
+
+ if (!priv->shd.iamshd)
+ return;
+ for (subvol = 0; subvol < priv->child_count; subvol++)
+ if (priv->child_up[subvol])
+ afr_shd_index_healer_spawn(this, subvol);
+
+ return;
+}
int
-afr_shd_get_index_count (xlator_t *this, int i, uint64_t *count)
+afr_shd_get_index_count(xlator_t *this, int i, uint64_t *count)
{
- afr_private_t *priv = NULL;
- xlator_t *subvol = NULL;
- loc_t rootloc = {0, };
- dict_t *xattr = NULL;
- int ret = -1;
+ afr_private_t *priv = NULL;
+ xlator_t *subvol = NULL;
+ loc_t rootloc = {
+ 0,
+ };
+ dict_t *xattr = NULL;
+ int ret = -1;
- priv = this->private;
- subvol = priv->children[i];
+ priv = this->private;
+ subvol = priv->children[i];
- rootloc.inode = inode_ref (this->itable->root);
- gf_uuid_copy (rootloc.gfid, rootloc.inode->gfid);
+ rootloc.inode = inode_ref(this->itable->root);
+ gf_uuid_copy(rootloc.gfid, rootloc.inode->gfid);
- ret = syncop_getxattr (subvol, &rootloc, &xattr,
- GF_XATTROP_INDEX_COUNT, NULL, NULL);
- if (ret < 0)
- goto out;
+ ret = syncop_getxattr(subvol, &rootloc, &xattr, GF_XATTROP_INDEX_COUNT,
+ NULL, NULL);
+ if (ret < 0)
+ goto out;
- ret = dict_get_uint64 (xattr, GF_XATTROP_INDEX_COUNT, count);
- if (ret)
- goto out;
+ ret = dict_get_uint64(xattr, GF_XATTROP_INDEX_COUNT, count);
+ if (ret)
+ goto out;
- ret = 0;
+ ret = 0;
out:
- if (xattr)
- dict_unref (xattr);
- loc_wipe (&rootloc);
+ if (xattr)
+ dict_unref(xattr);
+ loc_wipe(&rootloc);
- return ret;
+ return ret;
}
-
int
-afr_xl_op (xlator_t *this, dict_t *input, dict_t *output)
+afr_xl_op(xlator_t *this, dict_t *input, dict_t *output)
{
- gf_xl_afr_op_t op = GF_SHD_OP_INVALID;
- int ret = 0;
- int xl_id = 0;
- afr_private_t *priv = NULL;
- afr_self_heald_t *shd = NULL;
- struct subvol_healer *healer = NULL;
- int i = 0;
- char key[64];
- int op_ret = 0;
- uint64_t cnt = 0;
-
- priv = this->private;
- shd = &priv->shd;
-
- ret = dict_get_int32 (input, "xl-op", (int32_t*)&op);
- if (ret)
- goto out;
- ret = dict_get_int32 (input, this->name, &xl_id);
- if (ret)
- goto out;
- ret = dict_set_int32 (output, this->name, xl_id);
- if (ret)
- goto out;
- switch (op) {
+ gf_xl_afr_op_t op = GF_SHD_OP_INVALID;
+ int ret = 0;
+ int xl_id = 0;
+ afr_private_t *priv = NULL;
+ afr_self_heald_t *shd = NULL;
+ struct subvol_healer *healer = NULL;
+ int i = 0;
+ char key[64];
+ int keylen = 0;
+ int this_name_len = 0;
+ int op_ret = 0;
+ uint64_t cnt = 0;
+
+#define AFR_SET_DICT_AND_LOG(name, output, key, keylen, dict_str, \
+ dict_str_len) \
+ { \
+ int ret; \
+ \
+ ret = dict_set_nstrn(output, key, keylen, dict_str, dict_str_len); \
+ if (ret) { \
+ gf_smsg(name, GF_LOG_ERROR, -ret, AFR_MSG_DICT_SET_FAILED, \
+ "key=%s", key, "value=%s", dict_str, NULL); \
+ } \
+ }
+
+ priv = this->private;
+ shd = &priv->shd;
+
+ ret = dict_get_int32_sizen(input, "xl-op", (int32_t *)&op);
+ if (ret) {
+ gf_smsg(this->name, GF_LOG_ERROR, -ret, AFR_MSG_DICT_GET_FAILED,
+ "key=xl-op", NULL);
+ goto out;
+ }
+ this_name_len = strlen(this->name);
+ ret = dict_get_int32n(input, this->name, this_name_len, &xl_id);
+ if (ret) {
+ gf_smsg(this->name, GF_LOG_ERROR, -ret, AFR_MSG_DICT_GET_FAILED,
+ "key=%s", this->name, NULL);
+ goto out;
+ }
+ ret = dict_set_int32n(output, this->name, this_name_len, xl_id);
+ if (ret) {
+ gf_smsg(this->name, GF_LOG_ERROR, -ret, AFR_MSG_DICT_SET_FAILED,
+ "key=%s", this->name, NULL);
+ goto out;
+ }
+ switch (op) {
case GF_SHD_OP_HEAL_INDEX:
- op_ret = 0;
-
- for (i = 0; i < priv->child_count; i++) {
- healer = &shd->index_healers[i];
- snprintf (key, sizeof (key), "%d-%d-status", xl_id, i);
-
- if (!priv->child_up[i]) {
- ret = dict_set_str (output, key,
- "Brick is not connected");
- op_ret = -1;
- } else if (AFR_COUNT (priv->child_up,
- priv->child_count) < 2) {
- ret = dict_set_str (output, key,
- "< 2 bricks in replica are up");
- op_ret = -1;
- } else if (!afr_shd_is_subvol_local (this, healer->subvol)) {
- ret = dict_set_str (output, key,
- "Brick is remote");
- } else {
- ret = dict_set_str (output, key,
- "Started self-heal");
- afr_shd_index_healer_spawn (this, i);
- }
- }
- break;
+ op_ret = 0;
+
+ for (i = 0; i < priv->child_count; i++) {
+ healer = &shd->index_healers[i];
+ keylen = snprintf(key, sizeof(key), "%d-%d-status", xl_id, i);
+
+ if (!priv->child_up[i]) {
+ AFR_SET_DICT_AND_LOG(this->name, output, key, keylen,
+ SBRICK_NOT_CONNECTED,
+ SLEN(SBRICK_NOT_CONNECTED));
+ op_ret = -1;
+ } else if (AFR_COUNT(priv->child_up, priv->child_count) < 2) {
+ AFR_SET_DICT_AND_LOG(this->name, output, key, keylen,
+ SLESS_THAN2_BRICKS_in_REP,
+ SLEN(SLESS_THAN2_BRICKS_in_REP));
+ op_ret = -1;
+ } else if (!afr_shd_is_subvol_local(this, healer->subvol)) {
+ AFR_SET_DICT_AND_LOG(this->name, output, key, keylen,
+ SBRICK_IS_REMOTE,
+ SLEN(SBRICK_IS_REMOTE));
+ } else {
+ AFR_SET_DICT_AND_LOG(this->name, output, key, keylen,
+ SSTARTED_SELF_HEAL,
+ SLEN(SSTARTED_SELF_HEAL));
+
+ ret = afr_shd_index_healer_spawn(this, i);
+
+ if (ret) {
+ gf_smsg(this->name, GF_LOG_ERROR, -ret,
+ AFR_MSG_HEALER_SPAWN_FAILED, NULL);
+ }
+ }
+ }
+ break;
case GF_SHD_OP_HEAL_FULL:
- op_ret = -1;
-
- for (i = 0; i < priv->child_count; i++) {
- healer = &shd->full_healers[i];
- snprintf (key, sizeof (key), "%d-%d-status", xl_id, i);
-
- if (!priv->child_up[i]) {
- ret = dict_set_str (output, key,
- "Brick is not connected");
- } else if (AFR_COUNT (priv->child_up,
- priv->child_count) < 2) {
- ret = dict_set_str (output, key,
- "< 2 bricks in replica are up");
- } else if (!afr_shd_is_subvol_local (this, healer->subvol)) {
- ret = dict_set_str (output, key,
- "Brick is remote");
- } else {
- ret = dict_set_str (output, key,
- "Started self-heal");
- afr_shd_full_healer_spawn (this, i);
- op_ret = 0;
- }
- }
- break;
- case GF_SHD_OP_INDEX_SUMMARY:
- /* this case has been handled in glfs-heal.c */
- break;
- case GF_SHD_OP_HEALED_FILES:
- case GF_SHD_OP_HEAL_FAILED_FILES:
- for (i = 0; i < priv->child_count; i++) {
- snprintf (key, sizeof (key), "%d-%d-status", xl_id, i);
- ret = dict_set_str (output, key, "Operation Not "
- "Supported");
+ op_ret = -1;
+
+ for (i = 0; i < priv->child_count; i++) {
+ healer = &shd->full_healers[i];
+ keylen = snprintf(key, sizeof(key), "%d-%d-status", xl_id, i);
+
+ if (!priv->child_up[i]) {
+ AFR_SET_DICT_AND_LOG(this->name, output, key, keylen,
+ SBRICK_NOT_CONNECTED,
+ SLEN(SBRICK_NOT_CONNECTED));
+ } else if (AFR_COUNT(priv->child_up, priv->child_count) < 2) {
+ AFR_SET_DICT_AND_LOG(this->name, output, key, keylen,
+ SLESS_THAN2_BRICKS_in_REP,
+ SLEN(SLESS_THAN2_BRICKS_in_REP));
+ } else if (!afr_shd_is_subvol_local(this, healer->subvol)) {
+ AFR_SET_DICT_AND_LOG(this->name, output, key, keylen,
+ SBRICK_IS_REMOTE,
+ SLEN(SBRICK_IS_REMOTE));
+ } else {
+ AFR_SET_DICT_AND_LOG(this->name, output, key, keylen,
+ SSTARTED_SELF_HEAL,
+ SLEN(SSTARTED_SELF_HEAL));
+
+ ret = afr_shd_full_healer_spawn(this, i);
+
+ if (ret) {
+ gf_smsg(this->name, GF_LOG_ERROR, -ret,
+ AFR_MSG_HEALER_SPAWN_FAILED, NULL);
+ }
+ op_ret = 0;
}
- break;
+ }
+ break;
+ case GF_SHD_OP_INDEX_SUMMARY:
+ /* this case has been handled in glfs-heal.c */
+ break;
case GF_SHD_OP_SPLIT_BRAIN_FILES:
- eh_dump (shd->split_brain, output, afr_add_shd_event);
- break;
+ eh_dump(shd->split_brain, output, afr_add_shd_event);
+ break;
case GF_SHD_OP_STATISTICS:
- for (i = 0; i < priv->child_count; i++) {
- eh_dump (shd->statistics[i], output,
- afr_add_crawl_event);
- afr_shd_dict_add_crawl_event (this, output,
- &shd->index_healers[i].crawl_event);
- afr_shd_dict_add_crawl_event (this, output,
- &shd->full_healers[i].crawl_event);
- }
- break;
+ for (i = 0; i < priv->child_count; i++) {
+ eh_dump(shd->statistics[i], output, afr_add_crawl_event);
+ ret = afr_shd_dict_add_crawl_event(
+ this, output, &shd->index_healers[i].crawl_event);
+ if (ret) {
+ gf_smsg(this->name, GF_LOG_ERROR, -ret,
+ AFR_MSG_ADD_CRAWL_EVENT_FAILED, NULL);
+ }
+
+ ret = afr_shd_dict_add_crawl_event(
+ this, output, &shd->full_healers[i].crawl_event);
+ if (ret) {
+ gf_smsg(this->name, GF_LOG_ERROR, -ret,
+ AFR_MSG_ADD_CRAWL_EVENT_FAILED, NULL);
+ }
+ }
+ break;
case GF_SHD_OP_STATISTICS_HEAL_COUNT:
case GF_SHD_OP_STATISTICS_HEAL_COUNT_PER_REPLICA:
- op_ret = -1;
-
- for (i = 0; i < priv->child_count; i++) {
- if (!priv->child_up[i]) {
- snprintf (key, sizeof (key), "%d-%d-status",
- xl_id, i);
- ret = dict_set_str (output, key,
- "Brick is not connected");
- } else {
- snprintf (key, sizeof (key), "%d-%d-hardlinks",
- xl_id, i);
- ret = afr_shd_get_index_count (this, i, &cnt);
- if (ret == 0) {
- ret = dict_set_uint64 (output, key, cnt);
- }
- op_ret = 0;
- }
- }
-
-// ret = _do_crawl_op_on_local_subvols (this, INDEX_TO_BE_HEALED,
-// STATISTICS_TO_BE_HEALED,
-// output);
- break;
+ op_ret = -1;
+
+ for (i = 0; i < priv->child_count; i++) {
+ if (!priv->child_up[i]) {
+ keylen = snprintf(key, sizeof(key), "%d-%d-status", xl_id,
+ i);
+ AFR_SET_DICT_AND_LOG(this->name, output, key, keylen,
+ SBRICK_NOT_CONNECTED,
+ SLEN(SBRICK_NOT_CONNECTED));
+ } else {
+ snprintf(key, sizeof(key), "%d-%d-hardlinks", xl_id, i);
+ ret = afr_shd_get_index_count(this, i, &cnt);
+ if (ret == 0) {
+ ret = dict_set_uint64(output, key, cnt);
+ }
+ if (ret) {
+ gf_smsg(this->name, GF_LOG_ERROR, -ret,
+ AFR_MSG_DICT_SET_FAILED, NULL);
+ }
+ op_ret = 0;
+ }
+ }
+
+ break;
default:
- gf_msg (this->name, GF_LOG_ERROR, 0,
- AFR_MSG_INVALID_ARG, "Unknown set op %d", op);
- break;
- }
+ gf_smsg(this->name, GF_LOG_ERROR, 0, AFR_MSG_INVALID_ARG, "op=%d",
+ op, NULL);
+ break;
+ }
out:
- dict_del (output, this->name);
- return op_ret;
+ dict_deln(output, this->name, this_name_len);
+ return op_ret;
+
+#undef AFR_SET_DICT_AND_LOG
}
diff --git a/xlators/cluster/afr/src/afr-self-heald.h b/xlators/cluster/afr/src/afr-self-heald.h
index 58b088e4cd7..18db728ea7b 100644
--- a/xlators/cluster/afr/src/afr-self-heald.h
+++ b/xlators/cluster/afr/src/afr-self-heald.h
@@ -8,70 +8,68 @@
cases as published by the Free Software Foundation.
*/
-
#ifndef _AFR_SELF_HEALD_H
#define _AFR_SELF_HEALD_H
#include <pthread.h>
-
typedef struct {
- int child;
- char *path;
+ char *path;
+ int child;
} shd_event_t;
typedef struct {
- int child;
- uint64_t healed_count;
- uint64_t split_brain_count;
- uint64_t heal_failed_count;
-
- /* If start_time is 0, it means crawler is not in progress
- and stats are not valid */
- time_t start_time;
- /* If start_time is NOT 0 and end_time is 0, it means
- cralwer is in progress */
- time_t end_time;
- char *crawl_type;
+ uint64_t healed_count;
+ uint64_t split_brain_count;
+ uint64_t heal_failed_count;
+
+ /* If start_time is 0, it means crawler is not in progress
+ and stats are not valid */
+ time_t start_time;
+ /* If start_time is NOT 0 and end_time is 0, it means
+ cralwer is in progress */
+ time_t end_time;
+ char *crawl_type;
+ int child;
} crawl_event_t;
struct subvol_healer {
- xlator_t *this;
- int subvol;
- gf_boolean_t local;
- gf_boolean_t running;
- gf_boolean_t rerun;
- crawl_event_t crawl_event;
- pthread_mutex_t mutex;
- pthread_cond_t cond;
- pthread_t thread;
+ xlator_t *this;
+ crawl_event_t crawl_event;
+ pthread_mutex_t mutex;
+ pthread_cond_t cond;
+ pthread_t thread;
+ int subvol;
+ gf_boolean_t local;
+ gf_boolean_t running;
+ gf_boolean_t rerun;
};
typedef struct {
- gf_boolean_t iamshd;
- gf_boolean_t enabled;
- int timeout;
- struct subvol_healer *index_healers;
- struct subvol_healer *full_healers;
-
- eh_t *split_brain;
- eh_t **statistics;
+ struct subvol_healer *index_healers;
+ struct subvol_healer *full_healers;
+
+ eh_t *split_brain;
+ eh_t **statistics;
+ int timeout;
+ uint32_t max_threads;
+ uint32_t wait_qlength;
+ uint32_t halo_max_latency_msec;
+ gf_boolean_t iamshd;
+ gf_boolean_t enabled;
} afr_self_heald_t;
-
-int
-afr_selfheal_childup (xlator_t *this, int subvol);
-
int
-afr_selfheal_daemon_init (xlator_t *this);
+afr_selfheal_daemon_init(xlator_t *this);
int
-afr_xl_op (xlator_t *this, dict_t *input, dict_t *output);
+afr_xl_op(xlator_t *this, dict_t *input, dict_t *output);
int
-afr_shd_gfid_to_path (xlator_t *this, xlator_t *subvol, uuid_t gfid,
- char **path_p);
+afr_shd_gfid_to_path(xlator_t *this, xlator_t *subvol, uuid_t gfid,
+ char **path_p);
int
-afr_shd_index_purge (xlator_t *subvol, inode_t *inode, char *name);
+afr_shd_entry_purge(xlator_t *subvol, inode_t *inode, char *name,
+ ia_type_t type);
#endif /* !_AFR_SELF_HEALD_H */
diff --git a/xlators/cluster/afr/src/afr-transaction.c b/xlators/cluster/afr/src/afr-transaction.c
index 92f68b91113..a51f79b1f43 100644
--- a/xlators/cluster/afr/src/afr-transaction.c
+++ b/xlators/cluster/afr/src/afr-transaction.c
@@ -8,10 +8,10 @@
cases as published by the Free Software Foundation.
*/
-#include "dict.h"
-#include "byte-order.h"
-#include "common-utils.h"
-#include "timer.h"
+#include <glusterfs/dict.h>
+#include <glusterfs/byte-order.h>
+#include <glusterfs/common-utils.h>
+#include <glusterfs/timer.h>
#include "afr.h"
#include "afr-transaction.h"
@@ -20,1991 +20,2908 @@
#include <signal.h>
+typedef enum {
+ AFR_TRANSACTION_PRE_OP,
+ AFR_TRANSACTION_POST_OP,
+} afr_xattrop_type_t;
+
+static void
+afr_lock_resume_shared(struct list_head *list);
+
+static void
+afr_post_op_handle_success(call_frame_t *frame, xlator_t *this);
+
+static void
+afr_post_op_handle_failure(call_frame_t *frame, xlator_t *this, int op_errno);
+
+void
+__afr_transaction_wake_shared(afr_local_t *local, struct list_head *shared);
+
+void
+afr_changelog_post_op_do(call_frame_t *frame, xlator_t *this);
+
+int
+afr_changelog_post_op_safe(call_frame_t *frame, xlator_t *this);
+
gf_boolean_t
-afr_changelog_pre_op_uninherit (call_frame_t *frame, xlator_t *this);
+afr_changelog_pre_op_uninherit(call_frame_t *frame, xlator_t *this);
gf_boolean_t
-afr_changelog_pre_op_update (call_frame_t *frame, xlator_t *this);
+afr_changelog_pre_op_update(call_frame_t *frame, xlator_t *this);
int
-afr_changelog_do (call_frame_t *frame, xlator_t *this, dict_t *xattr,
- afr_changelog_resume_t changelog_resume);
+afr_changelog_call_count(afr_transaction_type type,
+ unsigned char *pre_op_subvols,
+ unsigned char *failed_subvols,
+ unsigned int child_count);
+int
+afr_changelog_do(call_frame_t *frame, xlator_t *this, dict_t *xattr,
+ afr_changelog_resume_t changelog_resume,
+ afr_xattrop_type_t op);
-static int32_t
-afr_quorum_errno (afr_private_t *priv)
+static void
+afr_ta_decide_post_op_state(call_frame_t *frame, xlator_t *this);
+
+static int
+afr_ta_post_op_do(void *opaque);
+
+static int
+afr_ta_post_op_synctask(xlator_t *this, afr_local_t *local);
+
+static int
+afr_changelog_post_op_done(call_frame_t *frame, xlator_t *this);
+
+static void
+afr_changelog_post_op_fail(call_frame_t *frame, xlator_t *this, int op_errno);
+
+void
+afr_ta_locked_priv_invalidate(afr_private_t *priv)
+{
+ priv->ta_bad_child_index = AFR_CHILD_UNKNOWN;
+ priv->release_ta_notify_dom_lock = _gf_false;
+ priv->ta_notify_dom_lock_offset = 0;
+}
+
+static void
+afr_ta_process_waitq(xlator_t *this)
{
- if (priv->quorum_reads)
- return ENOTCONN;
- return EROFS;
+ afr_local_t *entry = NULL;
+ afr_private_t *priv = this->private;
+ struct list_head waitq = {
+ 0,
+ };
+
+ INIT_LIST_HEAD(&waitq);
+ LOCK(&priv->lock);
+ list_splice_init(&priv->ta_waitq, &waitq);
+ UNLOCK(&priv->lock);
+ list_for_each_entry(entry, &waitq, ta_waitq)
+ {
+ afr_ta_decide_post_op_state(entry->transaction.frame, this);
+ }
}
int
-__afr_txn_write_fop (call_frame_t *frame, xlator_t *this)
+afr_ta_lock_release_done(int ret, call_frame_t *ta_frame, void *opaque)
{
- afr_local_t *local = NULL;
- afr_private_t *priv = NULL;
- int call_count = -1;
- unsigned char *failed_subvols = NULL;
- int i = 0;
+ afr_ta_process_waitq(ta_frame->this);
+ STACK_DESTROY(ta_frame->root);
+ return 0;
+}
- local = frame->local;
- priv = this->private;
+int
+afr_release_notify_lock_for_ta(void *opaque)
+{
+ xlator_t *this = NULL;
+ afr_private_t *priv = NULL;
+ loc_t loc = {
+ 0,
+ };
+ struct gf_flock flock = {
+ 0,
+ };
+ int ret = -1;
+
+ this = (xlator_t *)opaque;
+ priv = this->private;
+ ret = afr_fill_ta_loc(this, &loc, _gf_true);
+ if (ret) {
+ gf_msg(this->name, GF_LOG_ERROR, -ret, AFR_MSG_THIN_ARB,
+ "Failed to populate loc for thin-arbiter.");
+ goto out;
+ }
+ flock.l_type = F_UNLCK;
+ flock.l_start = priv->ta_notify_dom_lock_offset;
+ flock.l_len = 1;
+ ret = syncop_inodelk(priv->children[THIN_ARBITER_BRICK_INDEX],
+ AFR_TA_DOM_NOTIFY, &loc, F_SETLK, &flock, NULL, NULL);
+ if (ret) {
+ gf_msg(this->name, GF_LOG_ERROR, -ret, AFR_MSG_THIN_ARB,
+ "Failed to unlock AFR_TA_DOM_NOTIFY lock.");
+ }
+
+ LOCK(&priv->lock);
+ {
+ afr_ta_locked_priv_invalidate(priv);
+ }
+ UNLOCK(&priv->lock);
+out:
+ loc_wipe(&loc);
+ return ret;
+}
- failed_subvols = local->transaction.failed_subvols;
+void
+afr_zero_fill_stat(afr_local_t *local)
+{
+ if (!local)
+ return;
+ if (local->transaction.type == AFR_DATA_TRANSACTION ||
+ local->transaction.type == AFR_METADATA_TRANSACTION) {
+ gf_zero_fill_stat(&local->cont.inode_wfop.prebuf);
+ gf_zero_fill_stat(&local->cont.inode_wfop.postbuf);
+ } else if (local->transaction.type == AFR_ENTRY_TRANSACTION ||
+ local->transaction.type == AFR_ENTRY_RENAME_TRANSACTION) {
+ gf_zero_fill_stat(&local->cont.dir_fop.buf);
+ gf_zero_fill_stat(&local->cont.dir_fop.preparent);
+ gf_zero_fill_stat(&local->cont.dir_fop.postparent);
+ if (local->transaction.type == AFR_ENTRY_TRANSACTION)
+ return;
+ gf_zero_fill_stat(&local->cont.dir_fop.prenewparent);
+ gf_zero_fill_stat(&local->cont.dir_fop.postnewparent);
+ }
+}
+
+/* In case of errors afr needs to choose which xdata from lower xlators it needs
+ * to unwind with. The way it is done is by checking if there are
+ * any good subvols which failed. Give preference to errnos other than
+ * ENOTCONN even if the child is source */
+void
+afr_pick_error_xdata(afr_local_t *local, afr_private_t *priv, inode_t *inode1,
+ unsigned char *readable1, inode_t *inode2,
+ unsigned char *readable2)
+{
+ int s = -1; /*selection*/
+ int i = 0;
+ unsigned char *readable = NULL;
+
+ if (local->xdata_rsp) {
+ dict_unref(local->xdata_rsp);
+ local->xdata_rsp = NULL;
+ }
+
+ readable = alloca0(priv->child_count * sizeof(*readable));
+ if (inode2 && readable2) { /*rename fop*/
+ AFR_INTERSECT(readable, readable1, readable2, priv->child_count);
+ } else {
+ memcpy(readable, readable1, sizeof(*readable) * priv->child_count);
+ }
+
+ for (i = 0; i < priv->child_count; i++) {
+ if (!local->replies[i].valid)
+ continue;
+
+ if (local->replies[i].op_ret >= 0)
+ continue;
+
+ if (local->replies[i].op_errno == ENOTCONN)
+ continue;
+
+ /*Order is important in the following condition*/
+ if ((s < 0) || (!readable[s] && readable[i]))
+ s = i;
+ }
+
+ if (s != -1 && local->replies[s].xdata) {
+ local->xdata_rsp = dict_ref(local->replies[s].xdata);
+ } else if (s == -1) {
+ for (i = 0; i < priv->child_count; i++) {
+ if (!local->replies[i].valid)
+ continue;
- call_count = priv->child_count - AFR_COUNT (failed_subvols,
- priv->child_count);
+ if (local->replies[i].op_ret >= 0)
+ continue;
- if (call_count == 0) {
- local->transaction.resume (frame, this);
- return 0;
+ if (!local->replies[i].xdata)
+ continue;
+ local->xdata_rsp = dict_ref(local->replies[i].xdata);
+ break;
}
+ }
+}
+
+gf_boolean_t
+afr_needs_changelog_update(afr_local_t *local)
+{
+ if (local->transaction.type == AFR_DATA_TRANSACTION)
+ return _gf_true;
+ if (!local->optimistic_change_log)
+ return _gf_true;
+ return _gf_false;
+}
- local->call_count = call_count;
+gf_boolean_t
+afr_changelog_has_quorum(afr_local_t *local, xlator_t *this)
+{
+ afr_private_t *priv = NULL;
+ int i = 0;
+ unsigned char *success_children = NULL;
- for (i = 0; i < priv->child_count; i++) {
- if (local->transaction.pre_op[i] && !failed_subvols[i]) {
- local->transaction.wind (frame, this, i);
+ priv = this->private;
+ success_children = alloca0(priv->child_count);
- if (!--call_count)
- break;
- }
+ for (i = 0; i < priv->child_count; i++) {
+ if (!local->transaction.failed_subvols[i]) {
+ success_children[i] = 1;
}
+ }
- return 0;
+ if (afr_has_quorum(success_children, this, NULL)) {
+ return _gf_true;
+ }
+
+ return _gf_false;
}
+gf_boolean_t
+afr_is_write_subvol_valid(call_frame_t *frame, xlator_t *this)
+{
+ int i = 0;
+ afr_local_t *local = NULL;
+ afr_private_t *priv = NULL;
+ uint64_t write_subvol = 0;
+ unsigned char *writable = NULL;
+ uint16_t datamap = 0;
+
+ local = frame->local;
+ priv = this->private;
+ writable = alloca0(priv->child_count);
+
+ write_subvol = afr_write_subvol_get(frame, this);
+ datamap = (write_subvol & 0x00000000ffff0000) >> 16;
+ for (i = 0; i < priv->child_count; i++) {
+ if (datamap & (1 << i))
+ writable[i] = 1;
+
+ if (writable[i] && !local->transaction.failed_subvols[i])
+ return _gf_true;
+ }
+
+ return _gf_false;
+}
int
-__afr_txn_write_done (call_frame_t *frame, xlator_t *this)
+afr_transaction_fop(call_frame_t *frame, xlator_t *this)
{
- afr_local_t *local = NULL;
+ afr_local_t *local = NULL;
+ afr_private_t *priv = NULL;
+ int call_count = -1;
+ unsigned char *failed_subvols = NULL;
+ int i = 0;
+
+ local = frame->local;
+ priv = this->private;
+
+ failed_subvols = local->transaction.failed_subvols;
+ call_count = priv->child_count -
+ AFR_COUNT(failed_subvols, priv->child_count);
+ /* Fail if pre-op did not succeed on quorum no. of bricks. */
+ if (!afr_changelog_has_quorum(local, this) || !call_count) {
+ local->op_ret = -1;
+ /* local->op_errno is already captured in changelog cbk. */
+ afr_transaction_resume(frame, this);
+ return 0;
+ }
- local = frame->local;
+ /* Fail if at least one writeable brick isn't up.*/
+ if (local->transaction.type == AFR_DATA_TRANSACTION &&
+ !afr_is_write_subvol_valid(frame, this)) {
+ local->op_ret = -1;
+ local->op_errno = EIO;
+ afr_transaction_resume(frame, this);
+ return 0;
+ }
- local->transaction.unwind (frame, this);
+ local->call_count = call_count;
+ for (i = 0; i < priv->child_count; i++) {
+ if (local->transaction.pre_op[i] && !failed_subvols[i]) {
+ local->transaction.wind(frame, this, i);
- AFR_STACK_DESTROY (frame);
+ if (!--call_count)
+ break;
+ }
+ }
- return 0;
+ return 0;
}
-
-call_frame_t*
-afr_transaction_detach_fop_frame (call_frame_t *frame)
+int
+afr_transaction_done(call_frame_t *frame, xlator_t *this)
{
- afr_local_t * local = NULL;
- call_frame_t *fop_frame = NULL;
+ afr_local_t *local = NULL;
+ afr_private_t *priv = NULL;
+ gf_boolean_t unwind = _gf_false;
+ afr_lock_t *lock = NULL;
+ afr_local_t *lock_local = NULL;
- local = frame->local;
+ priv = this->private;
+ local = frame->local;
- LOCK (&frame->lock);
+ if (priv->consistent_metadata) {
+ LOCK(&frame->lock);
{
- fop_frame = local->transaction.main_frame;
- local->transaction.main_frame = NULL;
+ unwind = (local->transaction.main_frame != NULL);
}
- UNLOCK (&frame->lock);
-
- return fop_frame;
+ UNLOCK(&frame->lock);
+ if (unwind) /*It definitely did post-op*/
+ afr_zero_fill_stat(local);
+ }
+
+ if (local->transaction.do_eager_unlock) {
+ lock = &local->inode_ctx->lock[local->transaction.type];
+ LOCK(&local->inode->lock);
+ {
+ lock->acquired = _gf_false;
+ lock->release = _gf_false;
+ list_splice_init(&lock->frozen, &lock->waiting);
+ if (list_empty(&lock->waiting))
+ goto unlock;
+ lock_local = list_entry(lock->waiting.next, afr_local_t,
+ transaction.wait_list);
+ list_del_init(&lock_local->transaction.wait_list);
+ list_add(&lock_local->transaction.owner_list, &lock->owners);
+ }
+ unlock:
+ UNLOCK(&local->inode->lock);
+ }
+ if (lock_local) {
+ afr_lock(lock_local->transaction.frame,
+ lock_local->transaction.frame->this);
+ }
+ local->transaction.unwind(frame, this);
+
+ GF_ASSERT(list_empty(&local->transaction.owner_list));
+ GF_ASSERT(list_empty(&local->transaction.wait_list));
+ AFR_STACK_DESTROY(frame);
+
+ return 0;
}
+static void
+afr_lock_fail_shared(afr_local_t *local, struct list_head *list)
+{
+ afr_local_t *each = NULL;
+
+ while (!list_empty(list)) {
+ each = list_entry(list->next, afr_local_t, transaction.wait_list);
+ list_del_init(&each->transaction.wait_list);
+ each->op_ret = -1;
+ each->op_errno = local->op_errno;
+ afr_transaction_done(each->transaction.frame,
+ each->transaction.frame->this);
+ }
+}
static void
-afr_save_lk_owner (call_frame_t *frame)
+afr_handle_lock_acquire_failure(afr_local_t *local)
{
- afr_local_t * local = NULL;
+ struct list_head shared;
+ afr_lock_t *lock = NULL;
+
+ if (!local->transaction.eager_lock_on)
+ goto out;
+
+ lock = &local->inode_ctx->lock[local->transaction.type];
- local = frame->local;
+ INIT_LIST_HEAD(&shared);
+ LOCK(&local->inode->lock);
+ {
+ lock->release = _gf_true;
+ list_splice_init(&lock->waiting, &shared);
+ }
+ UNLOCK(&local->inode->lock);
- local->saved_lk_owner = frame->root->lk_owner;
+ afr_lock_fail_shared(local, &shared);
+ local->transaction.do_eager_unlock = _gf_true;
+out:
+ local->internal_lock.lock_cbk = afr_transaction_done;
+ afr_unlock(local->transaction.frame, local->transaction.frame->this);
}
+call_frame_t *
+afr_transaction_detach_fop_frame(call_frame_t *frame)
+{
+ afr_local_t *local = NULL;
+ call_frame_t *fop_frame = NULL;
+
+ local = frame->local;
+
+ afr_handle_inconsistent_fop(frame, &local->op_ret, &local->op_errno);
+ LOCK(&frame->lock);
+ {
+ fop_frame = local->transaction.main_frame;
+ local->transaction.main_frame = NULL;
+ }
+ UNLOCK(&frame->lock);
+
+ return fop_frame;
+}
static void
-afr_restore_lk_owner (call_frame_t *frame)
+afr_save_lk_owner(call_frame_t *frame)
{
- afr_local_t * local = NULL;
+ afr_local_t *local = NULL;
- local = frame->local;
+ local = frame->local;
- frame->root->lk_owner = local->saved_lk_owner;
+ local->saved_lk_owner = frame->root->lk_owner;
}
-void
-__mark_all_success (call_frame_t *frame, xlator_t *this)
+static void
+afr_restore_lk_owner(call_frame_t *frame)
{
- afr_private_t *priv = NULL;
- afr_local_t *local = NULL;
- int i;
+ afr_local_t *local = NULL;
- local = frame->local;
- priv = this->private;
+ local = frame->local;
- for (i = 0; i < priv->child_count; i++) {
- local->transaction.failed_subvols[i] = 0;
- }
+ frame->root->lk_owner = local->saved_lk_owner;
}
void
-afr_compute_pre_op_sources (call_frame_t *frame, xlator_t *this)
-{
- afr_local_t *local = NULL;
- afr_private_t *priv = NULL;
- afr_transaction_type type = -1;
- dict_t *xdata = NULL;
- int **matrix = NULL;
- int idx = -1;
- int i = 0;
- int j = 0;
-
- priv = this->private;
- local = frame->local;
- type = local->transaction.type;
- idx = afr_index_for_transaction_type (type);
- matrix = ALLOC_MATRIX (priv->child_count, int);
-
- for (i = 0; i < priv->child_count; i++) {
- if (!local->transaction.pre_op_xdata[i])
- continue;
- xdata = local->transaction.pre_op_xdata[i];
- afr_selfheal_fill_matrix (this, matrix, i, idx, xdata);
- }
-
- memset (local->transaction.pre_op_sources, 1, priv->child_count);
+__mark_all_success(call_frame_t *frame, xlator_t *this)
+{
+ afr_private_t *priv = NULL;
+ afr_local_t *local = NULL;
+ int i;
- /*If lock or pre-op failed on a brick, it is not a source. */
- for (i = 0; i < priv->child_count; i++) {
- if (local->transaction.failed_subvols[i])
- local->transaction.pre_op_sources[i] = 0;
- }
+ local = frame->local;
+ priv = this->private;
- /* If brick is blamed by others, it is not a source. */
- for (i = 0; i < priv->child_count; i++)
- for (j = 0; j < priv->child_count; j++)
- if (matrix[i][j] != 0)
- local->transaction.pre_op_sources[j] = 0;
+ for (i = 0; i < priv->child_count; i++) {
+ local->transaction.failed_subvols[i] = 0;
+ }
+}
- /*We don't need the xattrs any more. */
- for (i = 0; i < priv->child_count; i++)
- if (local->transaction.pre_op_xdata[i]) {
- dict_unref (local->transaction.pre_op_xdata[i]);
- local->transaction.pre_op_xdata[i] = NULL;
- }
+void
+afr_compute_pre_op_sources(call_frame_t *frame, xlator_t *this)
+{
+ afr_local_t *local = NULL;
+ afr_private_t *priv = NULL;
+ afr_transaction_type type = -1;
+ dict_t *xdata = NULL;
+ int **matrix = NULL;
+ int idx = -1;
+ int i = 0;
+ int j = 0;
+
+ priv = this->private;
+ local = frame->local;
+ type = local->transaction.type;
+ idx = afr_index_for_transaction_type(type);
+ matrix = ALLOC_MATRIX(priv->child_count, int);
+
+ for (i = 0; i < priv->child_count; i++) {
+ if (!local->transaction.changelog_xdata[i])
+ continue;
+ xdata = local->transaction.changelog_xdata[i];
+ afr_selfheal_fill_matrix(this, matrix, i, idx, xdata);
+ }
+
+ memset(local->transaction.pre_op_sources, 1, priv->child_count);
+
+ /*If lock or pre-op failed on a brick, it is not a source. */
+ for (i = 0; i < priv->child_count; i++) {
+ if (local->transaction.failed_subvols[i])
+ local->transaction.pre_op_sources[i] = 0;
+ }
+
+ /* If brick is blamed by others, it is not a source. */
+ for (i = 0; i < priv->child_count; i++)
+ for (j = 0; j < priv->child_count; j++)
+ if (matrix[i][j] != 0)
+ local->transaction.pre_op_sources[j] = 0;
}
void
-afr_txn_arbitrate_fop_cbk (call_frame_t *frame, xlator_t *this)
+afr_txn_arbitrate_fop(call_frame_t *frame, xlator_t *this)
{
- afr_local_t *local = NULL;
- afr_private_t *priv = NULL;
- gf_boolean_t fop_failed = _gf_false;
- unsigned char *pre_op_sources = NULL;
- int i = 0;
+ afr_local_t *local = NULL;
+ afr_private_t *priv = NULL;
+ int pre_op_sources_count = 0;
+ int i = 0;
- local = frame->local;
- priv = this->private;
- pre_op_sources = local->transaction.pre_op_sources;
+ priv = this->private;
+ local = frame->local;
- if (priv->arbiter_count != 1 || local->op_ret < 0)
- return;
+ afr_compute_pre_op_sources(frame, this);
+ pre_op_sources_count = AFR_COUNT(local->transaction.pre_op_sources,
+ priv->child_count);
- /* If the fop failed on the brick, it is not a source. */
+ /* If arbiter is the only source, do not proceed. */
+ if (pre_op_sources_count < 2 &&
+ local->transaction.pre_op_sources[ARBITER_BRICK_INDEX]) {
+ local->op_ret = -1;
+ local->op_errno = ENOTCONN;
for (i = 0; i < priv->child_count; i++)
- if (local->transaction.failed_subvols[i])
- pre_op_sources[i] = 0;
-
- switch (AFR_COUNT (pre_op_sources, priv->child_count)) {
- case 1:
- if (pre_op_sources[ARBITER_BRICK_INDEX])
- fop_failed = _gf_true;
- break;
- case 0:
- fop_failed = _gf_true;
- break;
- }
+ local->transaction.failed_subvols[i] = 1;
+ }
- if (fop_failed) {
- local->op_ret = -1;
- local->op_errno = ENOTCONN;
- }
+ afr_transaction_fop(frame, this);
- return;
+ return;
}
-void
-afr_txn_arbitrate_fop (call_frame_t *frame, xlator_t *this)
-{
- afr_local_t *local = NULL;
- afr_private_t *priv = NULL;
- int pre_op_sources_count = 0;
-
- priv = this->private;
- local = frame->local;
-
- afr_compute_pre_op_sources (frame, this);
- pre_op_sources_count = AFR_COUNT (local->transaction.pre_op_sources,
- priv->child_count);
-
- /* If arbiter is the only source, do not proceed. */
- if (pre_op_sources_count < 2 &&
- local->transaction.pre_op_sources[ARBITER_BRICK_INDEX]) {
- local->internal_lock.lock_cbk = local->transaction.done;
- local->op_ret = -1;
- local->op_errno = ENOTCONN;
- afr_restore_lk_owner (frame);
- afr_unlock (frame, this);
- } else {
- local->transaction.fop (frame, this);
+int
+afr_transaction_perform_fop(call_frame_t *frame, xlator_t *this)
+{
+ afr_local_t *local = NULL;
+ afr_private_t *priv = NULL;
+ int i = 0;
+ int ret = 0;
+ int failure_count = 0;
+ struct list_head shared;
+ afr_lock_t *lock = NULL;
+
+ local = frame->local;
+ priv = this->private;
+
+ INIT_LIST_HEAD(&shared);
+ if (local->transaction.type == AFR_DATA_TRANSACTION &&
+ !local->transaction.inherited) {
+ ret = afr_write_subvol_set(frame, this);
+ if (ret) {
+ /*act as if operation failed on all subvols*/
+ local->op_ret = -1;
+ local->op_errno = -ret;
+ for (i = 0; i < priv->child_count; i++)
+ local->transaction.failed_subvols[i] = 1;
}
-
- return;
+ }
+
+ if (local->pre_op_compat)
+ /* old mode, pre-op was done as afr_changelog_do()
+ just now, before OP */
+ afr_changelog_pre_op_update(frame, this);
+
+ if (!local->transaction.eager_lock_on || local->transaction.inherited)
+ goto fop;
+ failure_count = AFR_COUNT(local->transaction.failed_subvols,
+ priv->child_count);
+ if (failure_count == priv->child_count) {
+ afr_handle_lock_acquire_failure(local);
+ return 0;
+ } else {
+ lock = &local->inode_ctx->lock[local->transaction.type];
+ LOCK(&local->inode->lock);
+ {
+ lock->acquired = _gf_true;
+ __afr_transaction_wake_shared(local, &shared);
+ }
+ UNLOCK(&local->inode->lock);
+ }
+
+fop:
+ /* Perform fops with the lk-owner from top xlator.
+ * Eg: lk-owner of posix-lk and flush should be same,
+ * flush cant clear the posix-lks without that lk-owner.
+ */
+ afr_save_lk_owner(frame);
+ frame->root->lk_owner = local->transaction.main_frame->root->lk_owner;
+
+ if (priv->arbiter_count == 1) {
+ afr_txn_arbitrate_fop(frame, this);
+ } else {
+ afr_transaction_fop(frame, this);
+ }
+
+ afr_lock_resume_shared(&shared);
+ return 0;
}
int
-afr_transaction_perform_fop (call_frame_t *frame, xlator_t *this)
+afr_set_pending_dict(afr_private_t *priv, dict_t *xattr, int **pending)
{
- afr_local_t *local = NULL;
- afr_private_t *priv = NULL;
- fd_t *fd = NULL;
+ int i = 0;
+ int ret = 0;
- local = frame->local;
- priv = this->private;
- fd = local->fd;
+ for (i = 0; i < priv->child_count; i++) {
+ ret = dict_set_static_bin(xattr, priv->pending_key[i], pending[i],
+ AFR_NUM_CHANGE_LOGS * sizeof(int));
+ /* 3 = data+metadata+entry */
- /* Perform fops with the lk-owner from top xlator.
- * Eg: lk-owner of posix-lk and flush should be same,
- * flush cant clear the posix-lks without that lk-owner.
- */
- afr_save_lk_owner (frame);
- frame->root->lk_owner =
- local->transaction.main_frame->root->lk_owner;
-
- if (local->pre_op_compat)
- /* old mode, pre-op was done as afr_changelog_do()
- just now, before OP */
- afr_changelog_pre_op_update (frame, this);
-
- /* The wake up needs to happen independent of
- what type of fop arrives here. If it was
- a write, then it has already inherited the
- lock and changelog. If it was not a write,
- then the presumption of the optimization (of
- optimizing for successive write operations)
- fails.
- */
- if (fd)
- afr_delayed_changelog_wake_up (this, fd);
- if (priv->arbiter_count == 1) {
- afr_txn_arbitrate_fop (frame, this);
- } else {
- local->transaction.fop (frame, this);
- }
+ if (ret)
+ break;
+ }
- return 0;
+ return ret;
}
-static int
-__changelog_enabled (afr_private_t *priv, afr_transaction_type type)
+static void
+afr_ta_dom_lock_check_and_release(afr_ta_fop_state_t fop_state, xlator_t *this)
{
- int ret = 0;
-
- switch (type) {
- case AFR_DATA_TRANSACTION:
- if (priv->data_change_log)
- ret = 1;
-
+ afr_private_t *priv = this->private;
+ unsigned int inmem_count = 0;
+ unsigned int onwire_count = 0;
+ gf_boolean_t release = _gf_false;
+
+ LOCK(&priv->lock);
+ {
+ /*Once we get notify lock release upcall notification,
+ if any of the fop state counters are non-zero, we will
+ not release the lock.
+ */
+ onwire_count = priv->ta_on_wire_txn_count;
+ inmem_count = priv->ta_in_mem_txn_count;
+ switch (fop_state) {
+ case TA_GET_INFO_FROM_TA_FILE:
+ onwire_count = --priv->ta_on_wire_txn_count;
break;
-
- case AFR_METADATA_TRANSACTION:
- if (priv->metadata_change_log)
- ret = 1;
-
+ case TA_INFO_IN_MEMORY_SUCCESS:
+ case TA_INFO_IN_MEMORY_FAILED:
+ inmem_count = --priv->ta_in_mem_txn_count;
break;
-
- case AFR_ENTRY_TRANSACTION:
- case AFR_ENTRY_RENAME_TRANSACTION:
- if (priv->entry_change_log)
- ret = 1;
-
+ case TA_WAIT_FOR_NOTIFY_LOCK_REL:
+ GF_ASSERT(0);
+ break;
+ case TA_SUCCESS:
break;
}
+ release = priv->release_ta_notify_dom_lock;
+ }
+ UNLOCK(&priv->lock);
- return ret;
-}
+ if (inmem_count != 0 || release == _gf_false || onwire_count != 0)
+ return;
+ afr_ta_lock_release_synctask(this);
+}
-static int
-__fop_changelog_needed (call_frame_t *frame, xlator_t *this)
+static void
+afr_ta_process_onwireq(afr_ta_fop_state_t fop_state, xlator_t *this)
{
- afr_private_t * priv = NULL;
- afr_local_t * local = NULL;
- int op_ret = 0;
- afr_transaction_type type = -1;
-
- priv = this->private;
- local = frame->local;
- type = local->transaction.type;
-
- if (__changelog_enabled (priv, type)) {
- switch (local->op) {
-
- case GF_FOP_WRITE:
- case GF_FOP_FTRUNCATE:
- op_ret = 1;
- break;
-
- case GF_FOP_FLUSH:
- op_ret = 0;
- break;
-
- default:
- op_ret = 1;
- }
+ afr_private_t *priv = this->private;
+ afr_local_t *entry = NULL;
+ int bad_child = AFR_CHILD_UNKNOWN;
+
+ struct list_head onwireq = {
+ 0,
+ };
+ INIT_LIST_HEAD(&onwireq);
+
+ LOCK(&priv->lock);
+ {
+ bad_child = priv->ta_bad_child_index;
+ if (bad_child == AFR_CHILD_UNKNOWN) {
+ /*The previous on-wire ta_post_op was a failure. Just dequeue
+ *one element to wind on-wire again. */
+ entry = list_entry(priv->ta_onwireq.next, afr_local_t, ta_onwireq);
+ list_del_init(&entry->ta_onwireq);
+ } else {
+ /* Prepare to process all fops based on bad_child_index. */
+ list_splice_init(&priv->ta_onwireq, &onwireq);
}
+ }
+ UNLOCK(&priv->lock);
- return op_ret;
+ if (entry) {
+ afr_ta_post_op_synctask(this, entry);
+ return;
+ } else {
+ while (!list_empty(&onwireq)) {
+ entry = list_entry(onwireq.next, afr_local_t, ta_onwireq);
+ list_del_init(&entry->ta_onwireq);
+ if (entry->ta_failed_subvol == bad_child) {
+ afr_post_op_handle_success(entry->transaction.frame, this);
+ } else {
+ afr_post_op_handle_failure(entry->transaction.frame, this, EIO);
+ }
+ }
+ }
}
-
int
-afr_set_pending_dict (afr_private_t *priv, dict_t *xattr, int **pending)
+afr_changelog_post_op_done(call_frame_t *frame, xlator_t *this)
{
- int i = 0;
- int ret = 0;
+ afr_local_t *local = NULL;
+ afr_internal_lock_t *int_lock = NULL;
+ afr_private_t *priv = NULL;
- for (i = 0; i < priv->child_count; i++) {
+ local = frame->local;
+ priv = this->private;
+ int_lock = &local->internal_lock;
- ret = dict_set_static_bin (xattr, priv->pending_key[i],
- pending[i],
- AFR_NUM_CHANGE_LOGS * sizeof (int));
- /* 3 = data+metadata+entry */
+ if (priv->thin_arbiter_count) {
+ /*fop should not come here with TA_WAIT_FOR_NOTIFY_LOCK_REL state */
+ afr_ta_dom_lock_check_and_release(local->fop_state, this);
+ }
- if (ret)
- break;
- }
+ /* Fail the FOP if post-op did not succeed on quorum no. of bricks. */
+ if (!afr_changelog_has_quorum(local, this)) {
+ local->op_ret = -1;
+ /*local->op_errno is already captured in changelog cbk*/
+ }
- return ret;
-}
+ if (local->transaction.resume_stub) {
+ call_resume(local->transaction.resume_stub);
+ local->transaction.resume_stub = NULL;
+ }
-int
-afr_lock_server_count (afr_private_t *priv, afr_transaction_type type)
-{
- int ret = 0;
+ int_lock->lock_cbk = afr_transaction_done;
+ afr_unlock(frame, this);
- switch (type) {
- case AFR_DATA_TRANSACTION:
- ret = priv->child_count;
- break;
+ return 0;
+}
- case AFR_METADATA_TRANSACTION:
- ret = priv->child_count;
- break;
+static void
+afr_changelog_post_op_fail(call_frame_t *frame, xlator_t *this, int op_errno)
+{
+ afr_local_t *local = frame->local;
+ local->op_ret = -1;
+ local->op_errno = op_errno;
- case AFR_ENTRY_TRANSACTION:
- case AFR_ENTRY_RENAME_TRANSACTION:
- ret = priv->child_count;
- break;
- }
+ gf_msg(this->name, GF_LOG_ERROR, op_errno, AFR_MSG_THIN_ARB,
+ "Failing %s for gfid %s. Fop state is:%d", gf_fop_list[local->op],
+ uuid_utoa(local->inode->gfid), local->fop_state);
- return ret;
+ afr_changelog_post_op_done(frame, this);
}
-/* {{{ pending */
-
+unsigned char *
+afr_locked_nodes_get(afr_transaction_type type, afr_internal_lock_t *int_lock)
+{
+ /*Because same set of subvols participate in all lockee
+ * entities*/
+ return int_lock->lockee[0].locked_nodes;
+}
int
-afr_changelog_post_op_done (call_frame_t *frame, xlator_t *this)
+afr_changelog_call_count(afr_transaction_type type,
+ unsigned char *pre_op_subvols,
+ unsigned char *failed_subvols,
+ unsigned int child_count)
{
- afr_local_t *local = NULL;
- afr_private_t *priv = NULL;
- afr_internal_lock_t *int_lock = NULL;
+ int i = 0;
+ int call_count = 0;
- local = frame->local;
- priv = this->private;
- int_lock = &local->internal_lock;
-
- if (local->transaction.resume_stub) {
- call_resume (local->transaction.resume_stub);
- local->transaction.resume_stub = NULL;
- }
+ for (i = 0; i < child_count; i++) {
+ if (pre_op_subvols[i] && !failed_subvols[i]) {
+ call_count++;
+ }
+ }
- if (afr_lock_server_count (priv, local->transaction.type) == 0) {
- local->transaction.done (frame, this);
- } else {
- int_lock->lock_cbk = local->transaction.done;
- afr_unlock (frame, this);
- }
+ if (type == AFR_ENTRY_RENAME_TRANSACTION)
+ call_count *= 2;
- return 0;
+ return call_count;
}
-
-afr_inodelk_t*
-afr_get_inodelk (afr_internal_lock_t *int_lock, char *dom)
+gf_boolean_t
+afr_txn_nothing_failed(call_frame_t *frame, xlator_t *this)
{
- afr_inodelk_t *inodelk = NULL;
- int i = 0;
+ afr_private_t *priv = NULL;
+ afr_local_t *local = NULL;
+ int i = 0;
+
+ local = frame->local;
+ priv = this->private;
+
+ if (priv->thin_arbiter_count) {
+ /* We need to perform post-op even if 1 data brick was down
+ * before the txn started.*/
+ if (AFR_COUNT(local->transaction.failed_subvols, priv->child_count))
+ return _gf_false;
+ }
+
+ for (i = 0; i < priv->child_count; i++) {
+ if (local->transaction.pre_op[i] &&
+ local->transaction.failed_subvols[i])
+ return _gf_false;
+ }
+
+ return _gf_true;
+}
- for (i = 0; int_lock->inodelk[i].domain; i++) {
- inodelk = &int_lock->inodelk[i];
- if (strcmp (dom, inodelk->domain) == 0)
- return inodelk;
- }
- return NULL;
+void
+afr_handle_symmetric_errors(call_frame_t *frame, xlator_t *this)
+{
+ if (afr_is_symmetric_error(frame, this))
+ __mark_all_success(frame, this);
}
-unsigned char*
-afr_locked_nodes_get (afr_transaction_type type, afr_internal_lock_t *int_lock)
+gf_boolean_t
+afr_has_quorum(unsigned char *subvols, xlator_t *this, call_frame_t *frame)
{
- unsigned char *locked_nodes = NULL;
- afr_inodelk_t *inodelk = NULL;
- switch (type) {
- case AFR_DATA_TRANSACTION:
- case AFR_METADATA_TRANSACTION:
- inodelk = afr_get_inodelk (int_lock, int_lock->domain);
- locked_nodes = inodelk->locked_nodes;
- break;
+ unsigned int quorum_count = 0;
+ afr_private_t *priv = NULL;
+ unsigned int up_children_count = 0;
- case AFR_ENTRY_TRANSACTION:
- case AFR_ENTRY_RENAME_TRANSACTION:
- /*Because same set of subvols participate in all lockee
- * entities*/
- locked_nodes = int_lock->lockee[0].locked_nodes;
- break;
+ priv = this->private;
+ up_children_count = AFR_COUNT(subvols, priv->child_count);
+
+ if (afr_lookup_has_quorum(frame, up_children_count))
+ return _gf_true;
+
+ if (priv->quorum_count == AFR_QUORUM_AUTO) {
+ /*
+ * Special case for auto-quorum with an even number of nodes.
+ *
+ * A replica set with even count N can only handle the same
+ * number of failures as odd N-1 before losing "vanilla"
+ * quorum, and the probability of more simultaneous failures is
+ * actually higher. For example, with a 1% chance of failure
+ * we'd have a 0.03% chance of two simultaneous failures with
+ * N=3 but a 0.06% chance with N=4. However, the special case
+ * is necessary for N=2 because there's no real quorum in that
+ * case (i.e. can't normally survive *any* failures). In that
+ * case, we treat the first node as a tie-breaker, allowing
+ * quorum to be retained in some cases while still honoring the
+ * all-important constraint that there can not simultaneously
+ * be two partitioned sets of nodes each believing they have
+ * quorum. Of two equally sized sets, the one without that
+ * first node will lose.
+ *
+ * It turns out that the special case is beneficial for higher
+ * values of N as well. Continuing the example above, the
+ * probability of losing quorum with N=4 and this type of
+ * quorum is (very) slightly lower than with N=3 and vanilla
+ * quorum. The difference becomes even more pronounced with
+ * higher N. Therefore, even though such replica counts are
+ * unlikely to be seen in practice, we might as well use the
+ * "special" quorum then as well.
+ */
+ if ((up_children_count * 2) == priv->child_count) {
+ return subvols[0];
}
- return locked_nodes;
-}
+ }
+ if (priv->quorum_count == AFR_QUORUM_AUTO) {
+ quorum_count = priv->child_count / 2 + 1;
+ } else {
+ quorum_count = priv->quorum_count;
+ }
-int
-afr_changelog_call_count (afr_transaction_type type,
- unsigned char *pre_op_subvols,
- unsigned int child_count)
-{
- int call_count = 0;
+ if (up_children_count >= quorum_count)
+ return _gf_true;
- call_count = AFR_COUNT(pre_op_subvols, child_count);
+ return _gf_false;
+}
- if (type == AFR_ENTRY_RENAME_TRANSACTION)
- call_count *= 2;
+static gf_boolean_t
+afr_has_fop_quorum(call_frame_t *frame)
+{
+ xlator_t *this = frame->this;
+ afr_local_t *local = frame->local;
+ unsigned char *locked_nodes = NULL;
- return call_count;
+ locked_nodes = afr_locked_nodes_get(local->transaction.type,
+ &local->internal_lock);
+ return afr_has_quorum(locked_nodes, this, NULL);
}
+static gf_boolean_t
+afr_has_fop_cbk_quorum(call_frame_t *frame)
+{
+ afr_local_t *local = frame->local;
+ xlator_t *this = frame->this;
+ afr_private_t *priv = this->private;
+ unsigned char *success = alloca0(priv->child_count);
+ int i = 0;
+
+ for (i = 0; i < priv->child_count; i++) {
+ if (local->transaction.pre_op[i])
+ if (!local->transaction.failed_subvols[i])
+ success[i] = 1;
+ }
+
+ return afr_has_quorum(success, this, NULL);
+}
gf_boolean_t
-afr_txn_nothing_failed (call_frame_t *frame, xlator_t *this)
+afr_need_dirty_marking(call_frame_t *frame, xlator_t *this)
{
- afr_private_t *priv = NULL;
- afr_local_t *local = NULL;
- int i = 0;
+ afr_private_t *priv = this->private;
+ afr_local_t *local = NULL;
+ gf_boolean_t need_dirty = _gf_false;
- local = frame->local;
- priv = this->private;
+ local = frame->local;
- for (i = 0; i < priv->child_count; i++) {
- if (local->transaction.pre_op[i] &&
- local->transaction.failed_subvols[i])
- return _gf_false;
- }
+ if (!priv->quorum_count || !local->optimistic_change_log)
+ return _gf_false;
- return _gf_true;
-}
+ if (local->transaction.type == AFR_DATA_TRANSACTION ||
+ local->transaction.type == AFR_METADATA_TRANSACTION)
+ return _gf_false;
+ if (AFR_COUNT(local->transaction.failed_subvols, priv->child_count) ==
+ priv->child_count)
+ return _gf_false;
-void
-afr_handle_symmetric_errors (call_frame_t *frame, xlator_t *this)
-{
- afr_local_t *local = NULL;
- afr_private_t *priv = NULL;
- int op_errno = 0;
- int i_errno = 0;
- gf_boolean_t matching_errors = _gf_true;
- int i = 0;
-
- priv = this->private;
- local = frame->local;
-
- for (i = 0; i < priv->child_count; i++) {
- if (!local->replies[i].valid)
- continue;
- if (local->replies[i].op_ret != -1) {
- /* Operation succeeded on at least on subvol,
- so it is not a failed-everywhere situation.
- */
- matching_errors = _gf_false;
- break;
- }
- i_errno = local->replies[i].op_errno;
-
- if (i_errno == ENOTCONN) {
- /* ENOTCONN is not a symmetric error. We do not
- know if the operation was performed on the
- backend or not.
- */
- matching_errors = _gf_false;
- break;
- }
-
- if (!op_errno) {
- op_errno = i_errno;
- } else if (op_errno != i_errno) {
- /* Mismatching op_errno's */
- matching_errors = _gf_false;
- break;
- }
- }
-
- if (matching_errors)
- __mark_all_success (frame, this);
+ if (!afr_has_fop_cbk_quorum(frame))
+ need_dirty = _gf_true;
+
+ return need_dirty;
}
-gf_boolean_t
-afr_has_quorum (unsigned char *subvols, xlator_t *this)
+void
+afr_handle_quorum(call_frame_t *frame, xlator_t *this)
{
- unsigned int quorum_count = 0;
- afr_private_t *priv = NULL;
- unsigned int up_children_count = 0;
+ afr_local_t *local = NULL;
+ afr_private_t *priv = NULL;
+ int i = 0;
+ const char *file = NULL;
+ uuid_t gfid = {0};
- priv = this->private;
- up_children_count = AFR_COUNT (subvols, priv->child_count);
+ local = frame->local;
+ priv = frame->this->private;
- if (priv->quorum_count == AFR_QUORUM_AUTO) {
- /*
- * Special case for even numbers of nodes in auto-quorum:
- * if we have exactly half children up
- * and that includes the first ("senior-most") node, then that counts
- * as quorum even if it wouldn't otherwise. This supports e.g. N=2
- * while preserving the critical property that there can only be one
- * such group.
- */
- if ((priv->child_count % 2 == 0) &&
- (up_children_count == (priv->child_count/2)))
- return subvols[0];
- }
+ if (priv->quorum_count == 0)
+ return;
- if (priv->quorum_count == AFR_QUORUM_AUTO) {
- quorum_count = priv->child_count/2 + 1;
- } else {
- quorum_count = priv->quorum_count;
- }
+ /* If the fop already failed return right away to preserve errno */
+ if (local->op_ret == -1)
+ return;
- if (up_children_count >= quorum_count)
- return _gf_true;
+ /*
+ * Network split may happen just after the fops are unwound, so check
+ * if the fop succeeded in a way it still follows quorum. If it doesn't,
+ * mark the fop as failure, mark the changelogs so it reflects that
+ * failure.
+ *
+ * Scenario:
+ * There are 3 mounts on 3 machines(node1, node2, node3) all writing to
+ * single file. Network split happened in a way that node1 can't see
+ * node2, node3. Node2, node3 both of them can't see node1. Now at the
+ * time of sending write all the bricks are up. Just after write fop is
+ * wound on node1, network split happens. Node1 thinks write fop failed
+ * on node2, node3 so marks pending changelog for those 2 extended
+ * attributes on node1. Node2, node3 thinks writes failed on node1 so
+ * they mark pending changelog for node1. When the network is stable
+ * again the file already is in split-brain. These checks prevent
+ * marking pending changelog on other subvolumes if the fop doesn't
+ * succeed in a way it is still following quorum. So with this fix what
+ * is happening is, node1 will have all pending changelog(FOOL) because
+ * the write succeeded only on node1 but failed on node2, node3 so
+ * instead of marking pending changelogs on node2, node3 it just treats
+ * the fop as failure and goes into DIRTY state. Where as node2, node3
+ * say they are sources and have pending changelog to node1 so there is
+ * no split-brain with the fix. The problem is eliminated completely.
+ */
+
+ if (afr_has_fop_cbk_quorum(frame))
+ return;
- return _gf_false;
+ if (afr_need_dirty_marking(frame, this))
+ goto set_response;
+
+ for (i = 0; i < priv->child_count; i++) {
+ if (local->transaction.pre_op[i])
+ afr_transaction_fop_failed(frame, frame->this, i);
+ }
+
+set_response:
+ local->op_ret = -1;
+ local->op_errno = afr_final_errno(local, priv);
+ if (local->op_errno == 0)
+ local->op_errno = afr_quorum_errno(priv);
+
+ if (local->fd) {
+ gf_uuid_copy(gfid, local->fd->inode->gfid);
+ file = uuid_utoa(gfid);
+ } else {
+ loc_path(&local->loc, local->loc.name);
+ file = local->loc.path;
+ }
+
+ gf_msg(frame->this->name, GF_LOG_WARNING, local->op_errno,
+ AFR_MSG_QUORUM_FAIL, "%s: Failing %s as quorum is not met", file,
+ gf_fop_list[local->op]);
+
+ switch (local->transaction.type) {
+ case AFR_ENTRY_TRANSACTION:
+ case AFR_ENTRY_RENAME_TRANSACTION:
+ afr_pick_error_xdata(local, priv, local->parent, local->readable,
+ local->parent2, local->readable2);
+ break;
+ default:
+ afr_pick_error_xdata(local, priv, local->inode, local->readable,
+ NULL, NULL);
+ break;
+ }
}
-static gf_boolean_t
-afr_has_fop_quorum (call_frame_t *frame)
+int
+afr_fill_ta_loc(xlator_t *this, loc_t *loc, gf_boolean_t is_gfid_based_fop)
{
- xlator_t *this = frame->this;
- afr_local_t *local = frame->local;
- unsigned char *locked_nodes = NULL;
-
- locked_nodes = afr_locked_nodes_get (local->transaction.type,
- &local->internal_lock);
- return afr_has_quorum (locked_nodes, this);
+ afr_private_t *priv = NULL;
+
+ priv = this->private;
+ loc->parent = inode_ref(priv->root_inode);
+ gf_uuid_copy(loc->pargfid, loc->parent->gfid);
+ loc->name = priv->pending_key[THIN_ARBITER_BRICK_INDEX];
+ if (is_gfid_based_fop && gf_uuid_is_null(priv->ta_gfid)) {
+ /* Except afr_ta_id_file_check() which is path based, all other gluster
+ * FOPS need gfid.*/
+ return -EINVAL;
+ }
+ gf_uuid_copy(loc->gfid, priv->ta_gfid);
+ loc->inode = inode_new(loc->parent->table);
+ if (!loc->inode) {
+ loc_wipe(loc);
+ return -ENOMEM;
+ }
+ return 0;
}
-static gf_boolean_t
-afr_has_fop_cbk_quorum (call_frame_t *frame)
+static int
+afr_ta_post_op_done(int ret, call_frame_t *frame, void *opaque)
{
- afr_local_t *local = frame->local;
- xlator_t *this = frame->this;
- afr_private_t *priv = this->private;
- unsigned char *success = alloca0(priv->child_count);
- int i = 0;
+ xlator_t *this = NULL;
+ afr_local_t *local = NULL;
+ call_frame_t *txn_frame = NULL;
+ afr_ta_fop_state_t fop_state;
+
+ local = (afr_local_t *)opaque;
+ fop_state = local->fop_state;
+ txn_frame = local->transaction.frame;
+ this = frame->this;
+
+ if (ret == 0) {
+ /*Mark pending xattrs on the up data brick.*/
+ afr_post_op_handle_success(txn_frame, this);
+ } else {
+ afr_post_op_handle_failure(txn_frame, this, -ret);
+ }
+
+ STACK_DESTROY(frame->root);
+ afr_ta_process_onwireq(fop_state, this);
+
+ return 0;
+}
+int **
+afr_set_changelog_xattr(afr_private_t *priv, unsigned char *pending,
+ dict_t *xattr, afr_local_t *local)
+{
+ int **changelog = NULL;
+ int idx = 0;
+ int ret = 0;
+ int i;
+
+ if (local->is_new_entry == _gf_true) {
+ changelog = afr_mark_pending_changelog(priv, pending, xattr,
+ local->cont.dir_fop.buf.ia_type);
+ } else {
+ idx = afr_index_for_transaction_type(local->transaction.type);
+ changelog = afr_matrix_create(priv->child_count, AFR_NUM_CHANGE_LOGS);
+ if (!changelog) {
+ goto out;
+ }
for (i = 0; i < priv->child_count; i++) {
- if (local->transaction.pre_op[i])
- if (!local->transaction.failed_subvols[i])
- success[i] = 1;
+ if (local->transaction.failed_subvols[i])
+ changelog[i][idx] = hton32(1);
+ }
+ ret = afr_set_pending_dict(priv, xattr, changelog);
+ if (ret < 0) {
+ afr_matrix_cleanup(changelog, priv->child_count);
+ return NULL;
}
+ }
- return afr_has_quorum (success, this);
+out:
+ return changelog;
}
-void
-afr_handle_quorum (call_frame_t *frame)
+static void
+afr_ta_locked_xattrop_validate(afr_private_t *priv, afr_local_t *local,
+ gf_boolean_t *valid)
+{
+ if (priv->ta_event_gen > local->ta_event_gen) {
+ /* We can't trust the ta's response anymore.*/
+ afr_ta_locked_priv_invalidate(priv);
+ *valid = _gf_false;
+ return;
+ }
+ return;
+}
+
+static int
+afr_ta_post_op_do(void *opaque)
{
- afr_local_t *local = NULL;
- afr_private_t *priv = NULL;
- int i = 0;
- const char *file = NULL;
- uuid_t gfid = {0};
+ afr_local_t *local = NULL;
+ afr_private_t *priv = NULL;
+ xlator_t *this = NULL;
+ dict_t *xattr = NULL;
+ unsigned char *pending = NULL;
+ int **changelog = NULL;
+ int failed_subvol = -1;
+ int success_subvol = -1;
+ loc_t loc = {
+ 0,
+ };
+ int i = 0;
+ int ret = 0;
+ gf_boolean_t valid = _gf_true;
+
+ local = (afr_local_t *)opaque;
+ this = local->transaction.frame->this;
+ priv = this->private;
+
+ ret = afr_fill_ta_loc(this, &loc, _gf_true);
+ if (ret) {
+ gf_msg(this->name, GF_LOG_ERROR, -ret, AFR_MSG_THIN_ARB,
+ "Failed to populate loc for thin-arbiter.");
+ goto out;
+ }
+
+ xattr = dict_new();
+ if (!xattr) {
+ ret = -ENOMEM;
+ goto out;
+ }
+
+ pending = alloca0(priv->child_count);
+
+ for (i = 0; i < priv->child_count; i++) {
+ if (local->transaction.failed_subvols[i]) {
+ pending[i] = 1;
+ failed_subvol = i;
+ } else {
+ success_subvol = i;
+ }
+ }
+
+ changelog = afr_set_changelog_xattr(priv, pending, xattr, local);
+
+ if (!changelog) {
+ ret = -ENOMEM;
+ goto out;
+ }
+
+ ret = afr_ta_post_op_lock(this, &loc);
+ if (ret)
+ goto out;
+
+ ret = syncop_xattrop(priv->children[THIN_ARBITER_BRICK_INDEX], &loc,
+ GF_XATTROP_ADD_ARRAY, xattr, NULL, NULL, NULL);
+ if (ret) {
+ gf_msg(this->name, GF_LOG_ERROR, -ret, AFR_MSG_THIN_ARB,
+ "Post-op on thin-arbiter id file %s failed for gfid %s.",
+ priv->pending_key[THIN_ARBITER_BRICK_INDEX],
+ uuid_utoa(local->inode->gfid));
+ }
+ LOCK(&priv->lock);
+ {
+ if (ret == 0) {
+ priv->ta_bad_child_index = failed_subvol;
+ } else if (ret == -EINVAL) {
+ priv->ta_bad_child_index = success_subvol;
+ ret = -EIO; /* TA failed the fop. Return EIO to application. */
+ }
- local = frame->local;
- priv = frame->this->private;
+ afr_ta_locked_xattrop_validate(priv, local, &valid);
+ }
+ UNLOCK(&priv->lock);
+ if (valid == _gf_false) {
+ gf_msg(this->name, GF_LOG_ERROR, EIO, AFR_MSG_THIN_ARB,
+ "Post-op on thin-arbiter id file %s for gfid %s invalidated due "
+ "to event-gen mismatch.",
+ priv->pending_key[THIN_ARBITER_BRICK_INDEX],
+ uuid_utoa(local->inode->gfid));
+ ret = -EIO;
+ }
+
+ afr_ta_post_op_unlock(this, &loc);
+out:
+ if (xattr)
+ dict_unref(xattr);
- if (priv->quorum_count == 0)
- return;
+ if (changelog)
+ afr_matrix_cleanup(changelog, priv->child_count);
- /* If the fop already failed return right away to preserve errno */
- if (local->op_ret == -1)
- return;
+ loc_wipe(&loc);
- /*
- * Network split may happen just after the fops are unwound, so check
- * if the fop succeeded in a way it still follows quorum. If it doesn't,
- * mark the fop as failure, mark the changelogs so it reflects that
- * failure.
- *
- * Scenario:
- * There are 3 mounts on 3 machines(node1, node2, node3) all writing to
- * single file. Network split happened in a way that node1 can't see
- * node2, node3. Node2, node3 both of them can't see node1. Now at the
- * time of sending write all the bricks are up. Just after write fop is
- * wound on node1, network split happens. Node1 thinks write fop failed
- * on node2, node3 so marks pending changelog for those 2 extended
- * attributes on node1. Node2, node3 thinks writes failed on node1 so
- * they mark pending changelog for node1. When the network is stable
- * again the file already is in split-brain. These checks prevent
- * marking pending changelog on other subvolumes if the fop doesn't
- * succeed in a way it is still following quorum. So with this fix what
- * is happening is, node1 will have all pending changelog(FOOL) because
- * the write succeeded only on node1 but failed on node2, node3 so
- * instead of marking pending changelogs on node2, node3 it just treats
- * the fop as failure and goes into DIRTY state. Where as node2, node3
- * say they are sources and have pending changelog to node1 so there is
- * no split-brain with the fix. The problem is eliminated completely.
- */
+ return ret;
+}
- if (afr_has_fop_cbk_quorum (frame))
- return;
+static int
+afr_ta_post_op_synctask(xlator_t *this, afr_local_t *local)
+{
+ call_frame_t *ta_frame = NULL;
+ int ret = 0;
+
+ ta_frame = afr_ta_frame_create(this);
+ if (!ta_frame) {
+ gf_msg(this->name, GF_LOG_ERROR, ENOMEM, AFR_MSG_THIN_ARB,
+ "Failed to create ta_frame");
+ goto err;
+ }
+ ret = synctask_new(this->ctx->env, afr_ta_post_op_do, afr_ta_post_op_done,
+ ta_frame, local);
+ if (ret) {
+ gf_msg(this->name, GF_LOG_ERROR, ENOMEM, AFR_MSG_THIN_ARB,
+ "Failed to launch post-op on thin arbiter for gfid %s",
+ uuid_utoa(local->inode->gfid));
+ STACK_DESTROY(ta_frame->root);
+ goto err;
+ }
+
+ return ret;
+err:
+ afr_changelog_post_op_fail(local->transaction.frame, this, ENOMEM);
+ return ret;
+}
- if (local->fd) {
- gf_uuid_copy (gfid, local->fd->inode->gfid);
- file = uuid_utoa (gfid);
+static void
+afr_ta_set_fop_state(afr_private_t *priv, afr_local_t *local,
+ int *on_wire_count)
+{
+ LOCK(&priv->lock);
+ {
+ if (priv->release_ta_notify_dom_lock == _gf_true) {
+ /* Put the fop in waitq until notify dom lock is released.*/
+ local->fop_state = TA_WAIT_FOR_NOTIFY_LOCK_REL;
+ list_add_tail(&local->ta_waitq, &priv->ta_waitq);
+ } else if (priv->ta_bad_child_index == AFR_CHILD_UNKNOWN) {
+ /* Post-op on thin-arbiter to decide success/failure. */
+ local->fop_state = TA_GET_INFO_FROM_TA_FILE;
+ *on_wire_count = ++priv->ta_on_wire_txn_count;
+ if (*on_wire_count > 1) {
+ /*Avoid sending multiple on-wire post-ops on TA*/
+ list_add_tail(&local->ta_onwireq, &priv->ta_onwireq);
+ }
+ } else if (local->ta_failed_subvol == priv->ta_bad_child_index) {
+ /* Post-op on TA not needed as the fop failed on the in-memory bad
+ * brick. Just mark pending xattrs on the good data brick.*/
+ local->fop_state = TA_INFO_IN_MEMORY_SUCCESS;
+ priv->ta_in_mem_txn_count++;
} else {
- loc_path (&local->loc, local->loc.name);
- file = local->loc.path;
+ /* Post-op on TA not needed as the fop succeeded only on the
+ * in-memory bad data brick and not the good one. Fail the fop.*/
+ local->fop_state = TA_INFO_IN_MEMORY_FAILED;
+ priv->ta_in_mem_txn_count++;
}
+ }
+ UNLOCK(&priv->lock);
+}
- gf_msg (frame->this->name, GF_LOG_WARNING, 0, AFR_MSG_QUORUM_FAIL,
- "%s: Failing %s as quorum is not met",
- file, gf_fop_list[local->op]);
+static void
+afr_ta_fill_failed_subvol(afr_private_t *priv, afr_local_t *local)
+{
+ int i = 0;
- for (i = 0; i < priv->child_count; i++) {
- if (local->transaction.pre_op[i])
- afr_transaction_fop_failed (frame, frame->this, i);
+ for (i = 0; i < priv->child_count; i++) {
+ if (local->transaction.failed_subvols[i]) {
+ local->ta_failed_subvol = i;
+ break;
}
-
- local->op_ret = -1;
- local->op_errno = afr_quorum_errno (priv);
+ }
}
-int
-afr_changelog_post_op_now (call_frame_t *frame, xlator_t *this)
-{
- afr_private_t * priv = this->private;
- int i = 0;
- int ret = 0;
- int idx = 0;
- afr_local_t * local = NULL;
- dict_t *xattr = NULL;
- int nothing_failed = 1;
- gf_boolean_t need_undirty = _gf_false;
-
- afr_handle_quorum (frame);
- local = frame->local;
- idx = afr_index_for_transaction_type (local->transaction.type);
-
- nothing_failed = afr_txn_nothing_failed (frame, this);
-
- if (afr_changelog_pre_op_uninherit (frame, this))
- need_undirty = _gf_false;
- else
- need_undirty = _gf_true;
-
- if (local->op_ret < 0) {
- afr_changelog_post_op_done (frame, this);
- goto out;
- }
+static void
+afr_post_op_handle_success(call_frame_t *frame, xlator_t *this)
+{
+ afr_local_t *local = NULL;
- if (nothing_failed && !need_undirty) {
- afr_changelog_post_op_done (frame, this);
- goto out;
- }
-
- xattr = dict_new ();
- if (!xattr) {
- local->op_ret = -1;
- local->op_errno = ENOMEM;
- afr_changelog_post_op_done (frame, this);
- goto out;
- }
-
- for (i = 0; i < priv->child_count; i++) {
- if (local->transaction.failed_subvols[i])
- local->pending[i][idx] = hton32(1);
- }
-
- ret = afr_set_pending_dict (priv, xattr, local->pending);
- if (ret < 0) {
- local->op_ret = -1;
- local->op_errno = ENOMEM;
- afr_changelog_post_op_done (frame, this);
- goto out;
- }
-
- if (need_undirty)
- local->dirty[idx] = hton32(-1);
- else
- local->dirty[idx] = hton32(0);
-
- ret = dict_set_static_bin (xattr, AFR_DIRTY, local->dirty,
- sizeof(int) * AFR_NUM_CHANGE_LOGS);
- if (ret) {
- local->op_ret = -1;
- local->op_errno = ENOMEM;
- afr_changelog_post_op_done (frame, this);
- goto out;
- }
-
- afr_changelog_do (frame, this, xattr, afr_changelog_post_op_done);
-out:
- if (xattr)
- dict_unref (xattr);
+ local = frame->local;
+ if (local->is_new_entry == _gf_true) {
+ afr_mark_new_entry_changelog(frame, this);
+ }
+ afr_changelog_post_op_do(frame, this);
- return 0;
+ return;
}
+static void
+afr_post_op_handle_failure(call_frame_t *frame, xlator_t *this, int op_errno)
+{
+ afr_changelog_post_op_fail(frame, this, op_errno);
-gf_boolean_t
-afr_changelog_pre_op_uninherit (call_frame_t *frame, xlator_t *this)
-{
- afr_local_t *local = NULL;
- afr_private_t *priv = NULL;
- fd_t *fd = NULL;
- int i = 0;
- gf_boolean_t ret = _gf_false;
- afr_fd_ctx_t *fd_ctx = NULL;
- int type = 0;
-
- local = frame->local;
- priv = this->private;
- fd = local->fd;
-
- type = afr_index_for_transaction_type (local->transaction.type);
- if (type != AFR_DATA_TRANSACTION)
- return !local->transaction.dirtied;
-
- if (!fd)
- return !local->transaction.dirtied;
-
- fd_ctx = afr_fd_ctx_get (fd, this);
- if (!fd_ctx)
- return _gf_false;
-
- if (local->transaction.no_uninherit)
- return _gf_false;
-
- /* This function must be idempotent. So check if we
- were called before and return the same answer again.
-
- It is important to keep this function idempotent for
- the call in afr_changelog_post_op_safe() to not have
- side effects on the call from afr_changelog_post_op_now()
- */
- if (local->transaction.uninherit_done)
- return local->transaction.uninherit_value;
-
- LOCK(&fd->lock);
- {
- for (i = 0; i < priv->child_count; i++) {
- if (local->transaction.pre_op[i] !=
- fd_ctx->pre_op_done[type][i]) {
- ret = !local->transaction.dirtied;
- goto unlock;
- }
- }
-
- if (fd_ctx->inherited[type]) {
- ret = _gf_true;
- fd_ctx->inherited[type]--;
- } else if (fd_ctx->on_disk[type]) {
- ret = _gf_false;
- fd_ctx->on_disk[type]--;
- } else {
- /* ASSERT */
- ret = _gf_false;
- }
-
- if (!fd_ctx->inherited[type] && !fd_ctx->on_disk[type]) {
- for (i = 0; i < priv->child_count; i++)
- fd_ctx->pre_op_done[type][i] = 0;
- }
- }
-unlock:
- UNLOCK(&fd->lock);
+ return;
+}
- local->transaction.uninherit_done = _gf_true;
- local->transaction.uninherit_value = ret;
+static void
+afr_ta_decide_post_op_state(call_frame_t *frame, xlator_t *this)
+{
+ afr_private_t *priv = NULL;
+ afr_local_t *local = NULL;
+ int on_wire_count = 0;
+
+ priv = this->private;
+ local = frame->local;
+
+ afr_ta_set_fop_state(priv, local, &on_wire_count);
+
+ switch (local->fop_state) {
+ case TA_GET_INFO_FROM_TA_FILE:
+ if (on_wire_count == 1)
+ afr_ta_post_op_synctask(this, local);
+ /*else, fop is queued in ta_onwireq.*/
+ break;
+ case TA_WAIT_FOR_NOTIFY_LOCK_REL:
+ /*Post releasing the notify lock, we will act on this queue*/
+ break;
+ case TA_INFO_IN_MEMORY_SUCCESS:
+ afr_post_op_handle_success(frame, this);
+ break;
+ case TA_INFO_IN_MEMORY_FAILED:
+ afr_post_op_handle_failure(frame, this, EIO);
+ break;
+ default:
+ break;
+ }
+ return;
+}
- return ret;
+static void
+afr_handle_failure_using_thin_arbiter(call_frame_t *frame, xlator_t *this)
+{
+ afr_private_t *priv = this->private;
+ afr_local_t *local = frame->local;
+
+ afr_ta_fill_failed_subvol(priv, local);
+ gf_msg_debug(this->name, 0,
+ "Fop failed on data brick (%s) for gfid=%s. "
+ "ta info needed to decide fop result.",
+ priv->children[local->ta_failed_subvol]->name,
+ uuid_utoa(local->inode->gfid));
+ afr_ta_decide_post_op_state(frame, this);
}
+void
+afr_changelog_post_op_do(call_frame_t *frame, xlator_t *this)
+{
+ afr_private_t *priv = this->private;
+ afr_local_t *local = NULL;
+ dict_t *xattr = NULL;
+ int i = 0;
+ int ret = 0;
+ int idx = 0;
+ int nothing_failed = 1;
+ gf_boolean_t need_undirty = _gf_false;
+
+ afr_handle_quorum(frame, this);
+ local = frame->local;
+ idx = afr_index_for_transaction_type(local->transaction.type);
+
+ xattr = dict_new();
+ if (!xattr) {
+ afr_changelog_post_op_fail(frame, this, ENOMEM);
+ goto out;
+ }
+
+ nothing_failed = afr_txn_nothing_failed(frame, this);
+
+ if (afr_changelog_pre_op_uninherit(frame, this))
+ need_undirty = _gf_false;
+ else
+ need_undirty = _gf_true;
+
+ if (local->op_ret < 0 && !nothing_failed) {
+ if (afr_need_dirty_marking(frame, this)) {
+ local->dirty[idx] = hton32(1);
+ goto set_dirty;
+ }
-gf_boolean_t
-afr_changelog_pre_op_inherit (call_frame_t *frame, xlator_t *this)
-{
- afr_local_t *local = NULL;
- afr_private_t *priv = NULL;
- fd_t *fd = NULL;
- int i = 0;
- gf_boolean_t ret = _gf_false;
- afr_fd_ctx_t *fd_ctx = NULL;
- int type = 0;
-
- local = frame->local;
- priv = this->private;
- fd = local->fd;
-
- if (local->transaction.type != AFR_DATA_TRANSACTION)
- return _gf_false;
-
- type = afr_index_for_transaction_type (local->transaction.type);
-
- if (!fd)
- return _gf_false;
-
- fd_ctx = afr_fd_ctx_get (fd, this);
- if (!fd_ctx)
- return _gf_false;
-
- LOCK(&fd->lock);
- {
- if (!fd_ctx->on_disk[type]) {
- /* nothing to inherit yet */
- ret = _gf_false;
- goto unlock;
- }
-
- for (i = 0; i < priv->child_count; i++) {
- if (local->transaction.pre_op[i] !=
- fd_ctx->pre_op_done[type][i]) {
- /* either inherit exactly, or don't */
- ret = _gf_false;
- goto unlock;
- }
- }
-
- fd_ctx->inherited[type]++;
-
- ret = _gf_true;
-
- local->transaction.inherited = _gf_true;
- }
-unlock:
- UNLOCK(&fd->lock);
+ afr_changelog_post_op_done(frame, this);
+ goto out;
+ }
+
+ if (nothing_failed && !need_undirty) {
+ afr_changelog_post_op_done(frame, this);
+ goto out;
+ }
+
+ if (local->transaction.in_flight_sb) {
+ afr_changelog_post_op_fail(frame, this,
+ local->transaction.in_flight_sb_errno);
+ goto out;
+ }
+
+ for (i = 0; i < priv->child_count; i++) {
+ if (local->transaction.failed_subvols[i])
+ local->pending[i][idx] = hton32(1);
+ }
+
+ ret = afr_set_pending_dict(priv, xattr, local->pending);
+ if (ret < 0) {
+ afr_changelog_post_op_fail(frame, this, ENOMEM);
+ goto out;
+ }
+
+ if (need_undirty)
+ local->dirty[idx] = hton32(-1);
+ else
+ local->dirty[idx] = hton32(0);
+
+set_dirty:
+ ret = dict_set_static_bin(xattr, AFR_DIRTY, local->dirty,
+ sizeof(int) * AFR_NUM_CHANGE_LOGS);
+ if (ret) {
+ afr_changelog_post_op_fail(frame, this, ENOMEM);
+ goto out;
+ }
+
+ afr_changelog_do(frame, this, xattr, afr_changelog_post_op_done,
+ AFR_TRANSACTION_POST_OP);
+out:
+ if (xattr)
+ dict_unref(xattr);
- return ret;
+ return;
}
+static int
+afr_changelog_post_op_now(call_frame_t *frame, xlator_t *this)
+{
+ afr_private_t *priv = NULL;
+ afr_local_t *local = NULL;
+ int failed_count = 0;
+
+ priv = this->private;
+ local = frame->local;
+
+ if (priv->thin_arbiter_count) {
+ failed_count = AFR_COUNT(local->transaction.failed_subvols,
+ priv->child_count);
+ if (failed_count == 1) {
+ afr_handle_failure_using_thin_arbiter(frame, this);
+ return 0;
+ } else {
+ /* Txn either succeeded or failed on both data bricks. Let
+ * post_op_do handle it as the case might be. */
+ }
+ }
-gf_boolean_t
-afr_changelog_pre_op_update (call_frame_t *frame, xlator_t *this)
-{
- afr_local_t *local = NULL;
- afr_private_t *priv = NULL;
- fd_t *fd = NULL;
- afr_fd_ctx_t *fd_ctx = NULL;
- int i = 0;
- gf_boolean_t ret = _gf_false;
- int type = 0;
-
- local = frame->local;
- priv = this->private;
- fd = local->fd;
-
- if (!fd)
- return _gf_false;
-
- fd_ctx = afr_fd_ctx_get (fd, this);
- if (!fd_ctx)
- return _gf_false;
-
- if (local->transaction.inherited)
- /* was already inherited in afr_changelog_pre_op */
- return _gf_false;
-
- if (!local->transaction.dirtied)
- return _gf_false;
-
- if (!afr_txn_nothing_failed (frame, this))
- return _gf_false;
-
- type = afr_index_for_transaction_type (local->transaction.type);
-
- ret = _gf_false;
-
- LOCK(&fd->lock);
- {
- if (!fd_ctx->on_disk[type]) {
- for (i = 0; i < priv->child_count; i++)
- fd_ctx->pre_op_done[type][i] =
- local->transaction.pre_op[i];
- } else {
- for (i = 0; i < priv->child_count; i++)
- if (fd_ctx->pre_op_done[type][i] !=
- local->transaction.pre_op[i]) {
- local->transaction.no_uninherit = 1;
- goto unlock;
- }
- }
- fd_ctx->on_disk[type]++;
-
- ret = _gf_true;
- }
-unlock:
- UNLOCK(&fd->lock);
-
- return ret;
+ afr_changelog_post_op_do(frame, this);
+ return 0;
}
-
-int
-afr_changelog_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
- int op_ret, int op_errno, dict_t *xattr, dict_t *xdata)
+gf_boolean_t
+afr_changelog_pre_op_uninherit(call_frame_t *frame, xlator_t *this)
{
- afr_local_t *local = NULL;
- afr_private_t *priv = NULL;
- int call_count = -1;
- int child_index = -1;
+ afr_local_t *local = NULL;
+ afr_private_t *priv = NULL;
+ afr_inode_ctx_t *ctx = NULL;
+ int i = 0;
+ gf_boolean_t ret = _gf_false;
+ int type = 0;
+
+ local = frame->local;
+ priv = this->private;
+ ctx = local->inode_ctx;
+
+ type = afr_index_for_transaction_type(local->transaction.type);
+ if (type != AFR_DATA_TRANSACTION)
+ return !local->transaction.dirtied;
+
+ if (local->transaction.no_uninherit)
+ return _gf_false;
+
+ /* This function must be idempotent. So check if we
+ were called before and return the same answer again.
- local = frame->local;
- priv = this->private;
- child_index = (long) cookie;
+ It is important to keep this function idempotent for
+ the call in afr_changelog_post_op_safe() to not have
+ side effects on the call from afr_changelog_post_op_now()
+ */
+ if (local->transaction.uninherit_done)
+ return local->transaction.uninherit_value;
- if (op_ret == -1) {
- local->op_errno = op_errno;
- afr_transaction_fop_failed (frame, this, child_index);
+ LOCK(&local->inode->lock);
+ {
+ for (i = 0; i < priv->child_count; i++) {
+ if (local->transaction.pre_op[i] != ctx->pre_op_done[type][i]) {
+ ret = !local->transaction.dirtied;
+ goto unlock;
+ }
}
- if (priv->arbiter_count == 1 && !op_ret) {
- if (xattr)
- local->transaction.pre_op_xdata[child_index] =
- dict_ref (xattr);
+ if (ctx->inherited[type]) {
+ ret = _gf_true;
+ ctx->inherited[type]--;
+ } else if (ctx->on_disk[type]) {
+ ret = _gf_false;
+ ctx->on_disk[type]--;
+ } else {
+ /* ASSERT */
+ ret = _gf_false;
}
- call_count = afr_frame_return (frame);
+ if (!ctx->inherited[type] && !ctx->on_disk[type]) {
+ for (i = 0; i < priv->child_count; i++)
+ ctx->pre_op_done[type][i] = 0;
+ }
+ }
+unlock:
+ UNLOCK(&local->inode->lock);
- if (call_count == 0)
- local->transaction.changelog_resume (frame, this);
+ local->transaction.uninherit_done = _gf_true;
+ local->transaction.uninherit_value = ret;
- return 0;
+ return ret;
}
-
-int
-afr_changelog_do (call_frame_t *frame, xlator_t *this, dict_t *xattr,
- afr_changelog_resume_t changelog_resume)
+gf_boolean_t
+afr_changelog_pre_op_inherit(call_frame_t *frame, xlator_t *this)
{
- afr_local_t *local = NULL;
- afr_private_t *priv = NULL;
- int i = 0;
- int call_count = 0;
-
- local = frame->local;
- priv = this->private;
+ afr_local_t *local = NULL;
+ afr_private_t *priv = NULL;
+ int i = 0;
+ gf_boolean_t ret = _gf_false;
+ int type = 0;
- call_count = afr_changelog_call_count (local->transaction.type,
- local->transaction.pre_op,
- priv->child_count);
+ local = frame->local;
+ priv = this->private;
- if (call_count == 0) {
- changelog_resume (frame, this);
- return 0;
- }
+ if (local->transaction.type != AFR_DATA_TRANSACTION)
+ return _gf_false;
- local->call_count = call_count;
+ type = afr_index_for_transaction_type(local->transaction.type);
- local->transaction.changelog_resume = changelog_resume;
+ LOCK(&local->inode->lock);
+ {
+ if (!local->inode_ctx->on_disk[type]) {
+ /* nothing to inherit yet */
+ ret = _gf_false;
+ goto unlock;
+ }
for (i = 0; i < priv->child_count; i++) {
- if (!local->transaction.pre_op[i])
- continue;
+ if (local->transaction.pre_op[i] !=
+ local->inode_ctx->pre_op_done[type][i]) {
+ /* either inherit exactly, or don't */
+ ret = _gf_false;
+ goto unlock;
+ }
+ }
- switch (local->transaction.type) {
- case AFR_DATA_TRANSACTION:
- case AFR_METADATA_TRANSACTION:
- if (!local->fd) {
- STACK_WIND_COOKIE (frame, afr_changelog_cbk,
- (void *) (long) i,
- priv->children[i],
- priv->children[i]->fops->xattrop,
- &local->loc,
- GF_XATTROP_ADD_ARRAY, xattr,
- NULL);
- } else {
- STACK_WIND_COOKIE (frame, afr_changelog_cbk,
- (void *) (long) i,
- priv->children[i],
- priv->children[i]->fops->fxattrop,
- local->fd,
- GF_XATTROP_ADD_ARRAY, xattr,
- NULL);
- }
- break;
- case AFR_ENTRY_RENAME_TRANSACTION:
-
- STACK_WIND_COOKIE (frame, afr_changelog_cbk,
- (void *) (long) i,
- priv->children[i],
- priv->children[i]->fops->xattrop,
- &local->transaction.new_parent_loc,
- GF_XATTROP_ADD_ARRAY, xattr,
- NULL);
- call_count--;
+ local->inode_ctx->inherited[type]++;
- /* fall through */
+ ret = _gf_true;
- case AFR_ENTRY_TRANSACTION:
- if (local->fd)
- STACK_WIND_COOKIE (frame, afr_changelog_cbk,
- (void *) (long) i,
- priv->children[i],
- priv->children[i]->fops->fxattrop,
- local->fd,
- GF_XATTROP_ADD_ARRAY, xattr,
- NULL);
- else
- STACK_WIND_COOKIE (frame, afr_changelog_cbk,
- (void *) (long) i,
- priv->children[i],
- priv->children[i]->fops->xattrop,
- &local->transaction.parent_loc,
- GF_XATTROP_ADD_ARRAY, xattr,
- NULL);
- break;
- }
-
- if (!--call_count)
- break;
- }
+ local->transaction.inherited = _gf_true;
+ }
+unlock:
+ UNLOCK(&local->inode->lock);
- return 0;
+ return ret;
}
+gf_boolean_t
+afr_changelog_pre_op_update(call_frame_t *frame, xlator_t *this)
+{
+ afr_local_t *local = NULL;
+ afr_private_t *priv = NULL;
+ int i = 0;
+ gf_boolean_t ret = _gf_false;
+ int type = 0;
-int
-afr_changelog_pre_op (call_frame_t *frame, xlator_t *this)
-{
- afr_private_t * priv = this->private;
- int i = 0;
- int ret = 0;
- int call_count = 0;
- int op_errno = 0;
- afr_local_t *local = NULL;
- afr_internal_lock_t *int_lock = NULL;
- unsigned char *locked_nodes = NULL;
- int idx = -1;
- gf_boolean_t pre_nop = _gf_true;
- dict_t *xdata_req = NULL;
-
- local = frame->local;
- int_lock = &local->internal_lock;
- idx = afr_index_for_transaction_type (local->transaction.type);
-
- locked_nodes = afr_locked_nodes_get (local->transaction.type, int_lock);
-
- for (i = 0; i < priv->child_count; i++) {
- if (locked_nodes[i]) {
- local->transaction.pre_op[i] = 1;
- call_count++;
- } else {
- local->transaction.failed_subvols[i] = 1;
- }
- }
-
- /* This condition should not be met with present code, as
- * transaction.done will be called if locks are not acquired on even a
- * single node.
- */
- if (call_count == 0) {
- op_errno = ENOTCONN;
- goto err;
- }
-
- /* Check if the fop can be performed on at least
- * quorum number of nodes.
- */
- if (priv->quorum_count && !afr_has_fop_quorum (frame)) {
- op_errno = afr_quorum_errno (priv);
- goto err;
- }
+ local = frame->local;
+ priv = this->private;
- xdata_req = dict_new();
- if (!xdata_req) {
- op_errno = ENOMEM;
- goto err;
- }
+ if (local->transaction.type == AFR_ENTRY_TRANSACTION ||
+ local->transaction.type == AFR_ENTRY_RENAME_TRANSACTION)
+ return _gf_false;
- if (afr_changelog_pre_op_inherit (frame, this))
- goto next;
+ if (local->transaction.inherited)
+ /* was already inherited in afr_changelog_pre_op */
+ return _gf_false;
- if (call_count < priv->child_count)
- pre_nop = _gf_false;
+ if (!local->transaction.dirtied)
+ return _gf_false;
- /* Set an all-zero pending changelog so that in the cbk, we can get the
- * current on-disk values. In a replica 3 volume with arbiter enabled,
- * these values are needed to arrive at a go/ no-go of the fop phase to
- * avoid ending up in split-brain.*/
+ if (!afr_txn_nothing_failed(frame, this))
+ return _gf_false;
- ret = afr_set_pending_dict (priv, xdata_req, local->pending);
- if (ret < 0) {
- op_errno = ENOMEM;
- goto err;
- }
+ type = afr_index_for_transaction_type(local->transaction.type);
- if ((local->transaction.type == AFR_DATA_TRANSACTION ||
- !local->optimistic_change_log)) {
+ ret = _gf_false;
- local->dirty[idx] = hton32(1);
+ LOCK(&local->inode->lock);
+ {
+ if (!local->inode_ctx->on_disk[type]) {
+ for (i = 0; i < priv->child_count; i++)
+ local->inode_ctx->pre_op_done[type][i] =
+ (!local->transaction.failed_subvols[i]);
+ } else {
+ for (i = 0; i < priv->child_count; i++)
+ if (local->inode_ctx->pre_op_done[type][i] !=
+ (!local->transaction.failed_subvols[i])) {
+ local->transaction.no_uninherit = 1;
+ goto unlock;
+ }
+ }
+ local->inode_ctx->on_disk[type]++;
- ret = dict_set_static_bin (xdata_req, AFR_DIRTY, local->dirty,
- sizeof(int) * AFR_NUM_CHANGE_LOGS);
- if (ret) {
- op_errno = ENOMEM;
- goto err;
- }
+ ret = _gf_true;
+ }
+unlock:
+ UNLOCK(&local->inode->lock);
- pre_nop = _gf_false;
- local->transaction.dirtied = 1;
- }
+ return ret;
+}
- if (pre_nop)
- goto next;
+int
+afr_changelog_cbk(call_frame_t *frame, void *cookie, xlator_t *this, int op_ret,
+ int op_errno, dict_t *xattr, dict_t *xdata)
+{
+ afr_local_t *local = NULL;
+ int call_count = -1;
+ int child_index = -1;
- if (!local->pre_op_compat) {
- dict_copy (xdata_req, local->xdata_req);
- goto next;
- }
+ local = frame->local;
+ child_index = (long)cookie;
- afr_changelog_do (frame, this, xdata_req, afr_transaction_perform_fop);
+ if (op_ret == -1) {
+ local->op_errno = op_errno;
+ afr_transaction_fop_failed(frame, this, child_index);
+ }
- if (xdata_req)
- dict_unref (xdata_req);
+ if (xattr)
+ local->transaction.changelog_xdata[child_index] = dict_ref(xattr);
- return 0;
-next:
- afr_transaction_perform_fop (frame, this);
+ call_count = afr_frame_return(frame);
- if (xdata_req)
- dict_unref (xdata_req);
+ if (call_count == 0) {
+ local->transaction.changelog_resume(frame, this);
+ }
- return 0;
-err:
- local->internal_lock.lock_cbk = local->transaction.done;
- local->op_ret = -1;
- local->op_errno = op_errno;
+ return 0;
+}
- afr_unlock (frame, this);
+void
+afr_changelog_populate_xdata(call_frame_t *frame, afr_xattrop_type_t op,
+ dict_t **xdata, dict_t **newloc_xdata)
+{
+ int i = 0;
+ int ret = 0;
+ char *key = NULL;
+ int keylen = 0;
+ const char *name = NULL;
+ dict_t *xdata1 = NULL;
+ dict_t *xdata2 = NULL;
+ xlator_t *this = NULL;
+ afr_local_t *local = NULL;
+ afr_private_t *priv = NULL;
+ gf_boolean_t need_entry_key_set = _gf_true;
+
+ local = frame->local;
+ this = THIS;
+ priv = this->private;
+
+ if (local->transaction.type == AFR_DATA_TRANSACTION ||
+ local->transaction.type == AFR_METADATA_TRANSACTION)
+ goto out;
+
+ if (!priv->esh_granular)
+ goto out;
+
+ xdata1 = dict_new();
+ if (!xdata1)
+ goto out;
+
+ name = local->loc.name;
+ if (local->op == GF_FOP_LINK)
+ name = local->newloc.name;
+
+ switch (op) {
+ case AFR_TRANSACTION_PRE_OP:
+ key = GF_XATTROP_ENTRY_IN_KEY;
+ break;
+ case AFR_TRANSACTION_POST_OP:
+ if (afr_txn_nothing_failed(frame, this)) {
+ key = GF_XATTROP_ENTRY_OUT_KEY;
+ for (i = 0; i < priv->child_count; i++) {
+ if (!local->transaction.failed_subvols[i])
+ continue;
+ need_entry_key_set = _gf_false;
+ break;
+ }
+ /* If the transaction itself did not fail and there
+ * are no failed subvolumes, check whether the fop
+ * failed due to a symmetric error. If it did, do
+ * not set the ENTRY_OUT xattr which would end up
+ * deleting a name index which was created possibly by
+ * an earlier entry txn that may have failed on some
+ * of the sub-volumes.
+ */
+ if (local->op_ret)
+ need_entry_key_set = _gf_false;
+ } else {
+ key = GF_XATTROP_ENTRY_IN_KEY;
+ }
+ break;
+ }
+
+ if (need_entry_key_set) {
+ keylen = strlen(key);
+ ret = dict_set_strn(xdata1, key, keylen, (char *)name);
+ if (ret)
+ gf_msg(THIS->name, GF_LOG_ERROR, 0, AFR_MSG_DICT_SET_FAILED,
+ "%s/%s: Could not set %s key during xattrop",
+ uuid_utoa(local->loc.pargfid), local->loc.name, key);
+ if (local->transaction.type == AFR_ENTRY_RENAME_TRANSACTION) {
+ xdata2 = dict_new();
+ if (!xdata2)
+ goto out;
- if (xdata_req)
- dict_unref (xdata_req);
+ ret = dict_set_strn(xdata2, key, keylen,
+ (char *)local->newloc.name);
+ if (ret)
+ gf_msg(THIS->name, GF_LOG_ERROR, 0, AFR_MSG_DICT_SET_FAILED,
+ "%s/%s: Could not set %s key during "
+ "xattrop",
+ uuid_utoa(local->newloc.pargfid), local->newloc.name,
+ key);
+ }
+ }
- return 0;
+ *xdata = xdata1;
+ *newloc_xdata = xdata2;
+ xdata1 = xdata2 = NULL;
+out:
+ if (xdata1)
+ dict_unref(xdata1);
+ return;
}
-
int
-afr_post_blocking_inodelk_cbk (call_frame_t *frame, xlator_t *this)
+afr_changelog_prepare(xlator_t *this, call_frame_t *frame, int *call_count,
+ afr_changelog_resume_t changelog_resume,
+ afr_xattrop_type_t op, dict_t **xdata,
+ dict_t **newloc_xdata)
{
- afr_internal_lock_t *int_lock = NULL;
- afr_local_t *local = NULL;
+ afr_private_t *priv = NULL;
+ afr_local_t *local = NULL;
- local = frame->local;
- int_lock = &local->internal_lock;
+ local = frame->local;
+ priv = this->private;
- if (int_lock->lock_op_ret < 0) {
- gf_msg (this->name, GF_LOG_INFO,
- 0, AFR_MSG_BLOCKING_LKS_FAILED,
- "Blocking inodelks failed.");
- local->transaction.done (frame, this);
- } else {
+ *call_count = afr_changelog_call_count(
+ local->transaction.type, local->transaction.pre_op,
+ local->transaction.failed_subvols, priv->child_count);
- gf_msg_debug (this->name, 0,
- "Blocking inodelks done. Proceeding to FOP");
- afr_internal_lock_finish (frame, this);
- }
+ if (*call_count == 0) {
+ changelog_resume(frame, this);
+ return -1;
+ }
- return 0;
-}
+ afr_changelog_populate_xdata(frame, op, xdata, newloc_xdata);
+ local->call_count = *call_count;
+ local->transaction.changelog_resume = changelog_resume;
+ return 0;
+}
int
-afr_post_nonblocking_inodelk_cbk (call_frame_t *frame, xlator_t *this)
+afr_changelog_do(call_frame_t *frame, xlator_t *this, dict_t *xattr,
+ afr_changelog_resume_t changelog_resume, afr_xattrop_type_t op)
{
- afr_internal_lock_t *int_lock = NULL;
- afr_local_t *local = NULL;
-
- local = frame->local;
- int_lock = &local->internal_lock;
-
- /* Initiate blocking locks if non-blocking has failed */
- if (int_lock->lock_op_ret < 0) {
- gf_msg_debug (this->name, 0,
- "Non blocking inodelks failed. Proceeding to blocking");
- int_lock->lock_cbk = afr_post_blocking_inodelk_cbk;
- afr_blocking_lock (frame, this);
- } else {
-
- gf_msg_debug (this->name, 0,
- "Non blocking inodelks done. Proceeding to FOP");
- afr_internal_lock_finish (frame, this);
+ afr_local_t *local = NULL;
+ afr_private_t *priv = NULL;
+ dict_t *xdata = NULL;
+ dict_t *newloc_xdata = NULL;
+ int i = 0;
+ int call_count = 0;
+ int ret = 0;
+
+ local = frame->local;
+ priv = this->private;
+
+ for (i = 0; i < priv->child_count; i++) {
+ if (local->transaction.changelog_xdata[i]) {
+ dict_unref(local->transaction.changelog_xdata[i]);
+ local->transaction.changelog_xdata[i] = NULL;
}
+ }
+
+ ret = afr_changelog_prepare(this, frame, &call_count, changelog_resume, op,
+ &xdata, &newloc_xdata);
+ if (ret)
return 0;
-}
+ for (i = 0; i < priv->child_count; i++) {
+ if (!local->transaction.pre_op[i] ||
+ local->transaction.failed_subvols[i])
+ continue;
-int
-afr_post_blocking_entrylk_cbk (call_frame_t *frame, xlator_t *this)
-{
- afr_internal_lock_t *int_lock = NULL;
- afr_local_t *local = NULL;
+ switch (local->transaction.type) {
+ case AFR_DATA_TRANSACTION:
+ case AFR_METADATA_TRANSACTION:
+ if (!local->fd) {
+ STACK_WIND_COOKIE(
+ frame, afr_changelog_cbk, (void *)(long)i,
+ priv->children[i], priv->children[i]->fops->xattrop,
+ &local->loc, GF_XATTROP_ADD_ARRAY, xattr, xdata);
+ } else {
+ STACK_WIND_COOKIE(
+ frame, afr_changelog_cbk, (void *)(long)i,
+ priv->children[i], priv->children[i]->fops->fxattrop,
+ local->fd, GF_XATTROP_ADD_ARRAY, xattr, xdata);
+ }
+ break;
+ case AFR_ENTRY_RENAME_TRANSACTION:
- local = frame->local;
- int_lock = &local->internal_lock;
+ STACK_WIND_COOKIE(frame, afr_changelog_cbk, (void *)(long)i,
+ priv->children[i],
+ priv->children[i]->fops->xattrop,
+ &local->transaction.new_parent_loc,
+ GF_XATTROP_ADD_ARRAY, xattr, newloc_xdata);
+ call_count--;
- if (int_lock->lock_op_ret < 0) {
- gf_msg (this->name, GF_LOG_INFO, 0,
- AFR_MSG_BLOCKING_LKS_FAILED,
- "Blocking entrylks failed.");
- local->transaction.done (frame, this);
- } else {
+ /* fall through */
- gf_msg_debug (this->name, 0,
- "Blocking entrylks done. Proceeding to FOP");
- afr_internal_lock_finish (frame, this);
+ case AFR_ENTRY_TRANSACTION:
+ if (local->fd)
+ STACK_WIND_COOKIE(
+ frame, afr_changelog_cbk, (void *)(long)i,
+ priv->children[i], priv->children[i]->fops->fxattrop,
+ local->fd, GF_XATTROP_ADD_ARRAY, xattr, xdata);
+ else
+ STACK_WIND_COOKIE(frame, afr_changelog_cbk, (void *)(long)i,
+ priv->children[i],
+ priv->children[i]->fops->xattrop,
+ &local->transaction.parent_loc,
+ GF_XATTROP_ADD_ARRAY, xattr, xdata);
+ break;
}
- return 0;
+ if (!--call_count)
+ break;
+ }
+
+ if (xdata)
+ dict_unref(xdata);
+ if (newloc_xdata)
+ dict_unref(newloc_xdata);
+ return 0;
}
+static void
+afr_init_optimistic_changelog_for_txn(xlator_t *this, afr_local_t *local)
+{
+ int locked_count = 0;
+ afr_private_t *priv = NULL;
+
+ priv = this->private;
+
+ locked_count = AFR_COUNT(local->transaction.pre_op, priv->child_count);
+ if (priv->optimistic_change_log && locked_count == priv->child_count)
+ local->optimistic_change_log = 1;
+
+ return;
+}
int
-afr_post_nonblocking_entrylk_cbk (call_frame_t *frame, xlator_t *this)
+afr_changelog_pre_op(call_frame_t *frame, xlator_t *this)
{
- afr_internal_lock_t *int_lock = NULL;
- afr_local_t *local = NULL;
+ afr_private_t *priv = this->private;
+ int i = 0;
+ int ret = 0;
+ int call_count = 0;
+ int op_errno = 0;
+ afr_local_t *local = NULL;
+ afr_internal_lock_t *int_lock = NULL;
+ unsigned char *locked_nodes = NULL;
+ int idx = -1;
+ gf_boolean_t pre_nop = _gf_true;
+ dict_t *xdata_req = NULL;
+
+ local = frame->local;
+ int_lock = &local->internal_lock;
+ idx = afr_index_for_transaction_type(local->transaction.type);
+
+ locked_nodes = afr_locked_nodes_get(local->transaction.type, int_lock);
+
+ for (i = 0; i < priv->child_count; i++) {
+ if (locked_nodes[i]) {
+ local->transaction.pre_op[i] = 1;
+ call_count++;
+ } else {
+ local->transaction.failed_subvols[i] = 1;
+ }
+ }
+
+ afr_init_optimistic_changelog_for_txn(this, local);
+
+ if (afr_changelog_pre_op_inherit(frame, this))
+ goto next;
+
+ /* This condition should not be met with present code, as
+ * transaction.done will be called if locks are not acquired on even a
+ * single node.
+ */
+ if (call_count == 0) {
+ op_errno = ENOTCONN;
+ goto err;
+ }
+
+ /* Check if the fop can be performed on at least
+ * quorum number of nodes.
+ */
+ if (priv->quorum_count && !afr_has_fop_quorum(frame)) {
+ op_errno = int_lock->lock_op_errno;
+ if (op_errno == 0)
+ op_errno = afr_quorum_errno(priv);
+ goto err;
+ }
+
+ xdata_req = dict_new();
+ if (!xdata_req) {
+ op_errno = ENOMEM;
+ goto err;
+ }
+
+ if (call_count < priv->child_count)
+ pre_nop = _gf_false;
+
+ /* Set an all-zero pending changelog so that in the cbk, we can get the
+ * current on-disk values. In a replica 3 volume with arbiter enabled,
+ * these values are needed to arrive at a go/ no-go of the fop phase to
+ * avoid ending up in split-brain.*/
+
+ ret = afr_set_pending_dict(priv, xdata_req, local->pending);
+ if (ret < 0) {
+ op_errno = ENOMEM;
+ goto err;
+ }
+
+ if (afr_needs_changelog_update(local)) {
+ local->dirty[idx] = hton32(1);
+
+ ret = dict_set_static_bin(xdata_req, AFR_DIRTY, local->dirty,
+ sizeof(int) * AFR_NUM_CHANGE_LOGS);
+ if (ret) {
+ op_errno = ENOMEM;
+ goto err;
+ }
- local = frame->local;
- int_lock = &local->internal_lock;
+ pre_nop = _gf_false;
+ local->transaction.dirtied = 1;
+ }
- /* Initiate blocking locks if non-blocking has failed */
- if (int_lock->lock_op_ret < 0) {
- gf_msg_debug (this->name, 0,
- "Non blocking entrylks failed. Proceeding to blocking");
- int_lock->lock_cbk = afr_post_blocking_entrylk_cbk;
- afr_blocking_lock (frame, this);
- } else {
+ if (pre_nop)
+ goto next;
- gf_msg_debug (this->name, 0,
- "Non blocking entrylks done. Proceeding to FOP");
+ if (!local->pre_op_compat) {
+ dict_copy(xdata_req, local->xdata_req);
+ goto next;
+ }
- afr_internal_lock_finish (frame, this);
- }
+ afr_changelog_do(frame, this, xdata_req, afr_transaction_perform_fop,
+ AFR_TRANSACTION_PRE_OP);
- return 0;
-}
+ if (xdata_req)
+ dict_unref(xdata_req);
+ return 0;
+next:
+ afr_transaction_perform_fop(frame, this);
-int
-afr_post_blocking_rename_cbk (call_frame_t *frame, xlator_t *this)
-{
- afr_internal_lock_t *int_lock = NULL;
- afr_local_t *local = NULL;
+ if (xdata_req)
+ dict_unref(xdata_req);
- local = frame->local;
- int_lock = &local->internal_lock;
+ return 0;
+err:
+ local->internal_lock.lock_cbk = afr_transaction_done;
+ local->op_ret = -1;
+ local->op_errno = op_errno;
- if (int_lock->lock_op_ret < 0) {
- gf_msg (this->name, GF_LOG_INFO, 0,
- AFR_MSG_BLOCKING_LKS_FAILED,
- "Blocking entrylks failed.");
+ afr_handle_lock_acquire_failure(local);
- local->transaction.done (frame, this);
- } else {
+ if (xdata_req)
+ dict_unref(xdata_req);
- gf_msg_debug (this->name, 0,
- "Blocking entrylks done. Proceeding to FOP");
+ return 0;
+}
- afr_internal_lock_finish (frame, this);
- }
- return 0;
+int
+afr_post_nonblocking_lock_cbk(call_frame_t *frame, xlator_t *this)
+{
+ afr_internal_lock_t *int_lock = NULL;
+ afr_local_t *local = NULL;
+
+ local = frame->local;
+ int_lock = &local->internal_lock;
+
+ /* Initiate blocking locks if non-blocking has failed */
+ if (int_lock->lock_op_ret < 0) {
+ gf_msg_debug(this->name, 0,
+ "Non blocking locks failed. Proceeding to blocking");
+ int_lock->lock_cbk = afr_internal_lock_finish;
+ afr_blocking_lock(frame, this);
+ } else {
+ gf_msg_debug(this->name, 0,
+ "Non blocking locks done. Proceeding to FOP");
+
+ afr_internal_lock_finish(frame, this);
+ }
+
+ return 0;
}
int
-afr_post_lower_unlock_cbk (call_frame_t *frame, xlator_t *this)
+afr_post_blocking_rename_cbk(call_frame_t *frame, xlator_t *this)
{
- afr_internal_lock_t *int_lock = NULL;
- afr_local_t *local = NULL;
+ afr_internal_lock_t *int_lock = NULL;
+ afr_local_t *local = NULL;
- local = frame->local;
- int_lock = &local->internal_lock;
+ local = frame->local;
+ int_lock = &local->internal_lock;
- GF_ASSERT (!int_lock->higher_locked);
+ if (int_lock->lock_op_ret < 0) {
+ gf_msg(this->name, GF_LOG_INFO, 0, AFR_MSG_INTERNAL_LKS_FAILED,
+ "Blocking entrylks failed.");
- int_lock->lock_cbk = afr_post_blocking_rename_cbk;
- afr_blocking_lock (frame, this);
+ afr_transaction_done(frame, this);
+ } else {
+ gf_msg_debug(this->name, 0,
+ "Blocking entrylks done. Proceeding to FOP");
- return 0;
+ afr_internal_lock_finish(frame, this);
+ }
+ return 0;
}
-
int
-afr_set_transaction_flock (afr_local_t *local)
+afr_post_lower_unlock_cbk(call_frame_t *frame, xlator_t *this)
{
- afr_internal_lock_t *int_lock = NULL;
- afr_inodelk_t *inodelk = NULL;
+ afr_internal_lock_t *int_lock = NULL;
+ afr_local_t *local = NULL;
- int_lock = &local->internal_lock;
- inodelk = afr_get_inodelk (int_lock, int_lock->domain);
+ local = frame->local;
+ int_lock = &local->internal_lock;
- inodelk->flock.l_len = local->transaction.len;
- inodelk->flock.l_start = local->transaction.start;
- inodelk->flock.l_type = F_WRLCK;
+ GF_ASSERT(!int_lock->higher_locked);
- return 0;
+ int_lock->lock_cbk = afr_post_blocking_rename_cbk;
+ afr_blocking_lock(frame, this);
+
+ return 0;
}
int
-afr_lock_rec (call_frame_t *frame, xlator_t *this)
+afr_set_transaction_flock(xlator_t *this, afr_local_t *local,
+ afr_lockee_t *lockee)
{
- afr_internal_lock_t *int_lock = NULL;
- afr_local_t *local = NULL;
+ afr_private_t *priv = NULL;
+ struct gf_flock *flock = NULL;
+
+ priv = this->private;
+ flock = &lockee->flock;
+
+ if ((priv->arbiter_count || local->transaction.eager_lock_on ||
+ priv->full_lock) &&
+ local->transaction.type == AFR_DATA_TRANSACTION) {
+ /*Lock entire file to avoid network split brains.*/
+ flock->l_len = 0;
+ flock->l_start = 0;
+ } else {
+ flock->l_len = local->transaction.len;
+ flock->l_start = local->transaction.start;
+ }
+ flock->l_type = F_WRLCK;
+
+ return 0;
+}
- local = frame->local;
- int_lock = &local->internal_lock;
+int
+afr_lock(call_frame_t *frame, xlator_t *this)
+{
+ afr_internal_lock_t *int_lock = NULL;
+ afr_local_t *local = NULL;
+ int i = 0;
- int_lock->transaction_lk_type = AFR_TRANSACTION_LK;
- int_lock->domain = this->name;
+ local = frame->local;
+ int_lock = &local->internal_lock;
- switch (local->transaction.type) {
+ int_lock->lock_cbk = afr_post_nonblocking_lock_cbk;
+ int_lock->domain = this->name;
+
+ switch (local->transaction.type) {
case AFR_DATA_TRANSACTION:
case AFR_METADATA_TRANSACTION:
- afr_set_transaction_flock (local);
-
- int_lock->lock_cbk = afr_post_nonblocking_inodelk_cbk;
+ for (i = 0; i < int_lock->lockee_count; i++) {
+ afr_set_transaction_flock(this, local, &int_lock->lockee[i]);
+ }
- afr_nonblocking_inodelk (frame, this);
- break;
-
- case AFR_ENTRY_RENAME_TRANSACTION:
-
- int_lock->lock_cbk = afr_post_nonblocking_entrylk_cbk;
- afr_nonblocking_entrylk (frame, this);
- break;
+ break;
case AFR_ENTRY_TRANSACTION:
- int_lock->lk_basename = local->transaction.basename;
- if (local->transaction.parent_loc.path)
- int_lock->lk_loc = &local->transaction.parent_loc;
- else
- GF_ASSERT (local->fd);
-
- int_lock->lock_cbk = afr_post_nonblocking_entrylk_cbk;
- afr_nonblocking_entrylk (frame, this);
- break;
- }
+ int_lock->lk_basename = local->transaction.basename;
+ if (local->transaction.parent_loc.path)
+ int_lock->lk_loc = &local->transaction.parent_loc;
+ else
+ GF_ASSERT(local->fd);
+ break;
+ case AFR_ENTRY_RENAME_TRANSACTION:
+ break;
+ }
+ afr_lock_nonblocking(frame, this);
- return 0;
+ return 0;
}
-
-int
-afr_lock (call_frame_t *frame, xlator_t *this)
+static gf_boolean_t
+afr_locals_overlap(afr_local_t *local1, afr_local_t *local2)
{
- afr_set_lock_number (frame, this);
-
- return afr_lock_rec (frame, this);
+ uint64_t start1 = local1->transaction.start;
+ uint64_t start2 = local2->transaction.start;
+ uint64_t end1 = 0;
+ uint64_t end2 = 0;
+
+ if (local1->transaction.len)
+ end1 = start1 + local1->transaction.len - 1;
+ else
+ end1 = ULLONG_MAX;
+
+ if (local2->transaction.len)
+ end2 = start2 + local2->transaction.len - 1;
+ else
+ end2 = ULLONG_MAX;
+
+ return ((end1 >= start2) && (end2 >= start1));
}
-
-/* }}} */
-
-int
-afr_internal_lock_finish (call_frame_t *frame, xlator_t *this)
+gf_boolean_t
+afr_has_lock_conflict(afr_local_t *local, gf_boolean_t waitlist_check)
{
- if (__fop_changelog_needed (frame, this)) {
- afr_changelog_pre_op (frame, this);
- } else {
- afr_transaction_perform_fop (frame, this);
+ afr_local_t *each = NULL;
+ afr_lock_t *lock = NULL;
+
+ lock = &local->inode_ctx->lock[local->transaction.type];
+ /*
+ * Once full file lock is acquired in eager-lock phase, overlapping
+ * writes do not compete for inode-locks, instead are transferred to the
+ * next writes. Because of this overlapping writes are not ordered.
+ * This can cause inconsistencies in replication.
+ * Example:
+ * Two overlapping writes w1, w2 are sent in parallel on same fd
+ * in two threads t1, t2.
+ * Both threads can execute afr_writev_wind in the following manner.
+ * t1 winds w1 on brick-0
+ * t2 winds w2 on brick-0
+ * t2 winds w2 on brick-1
+ * t1 winds w1 on brick-1
+ *
+ * This check makes sure the locks are not transferred for
+ * overlapping writes.
+ */
+ list_for_each_entry(each, &lock->owners, transaction.owner_list)
+ {
+ if (afr_locals_overlap(each, local)) {
+ return _gf_true;
}
+ }
- return 0;
+ if (!waitlist_check)
+ return _gf_false;
+ list_for_each_entry(each, &lock->waiting, transaction.wait_list)
+ {
+ if (afr_locals_overlap(each, local)) {
+ return _gf_true;
+ }
+ }
+ return _gf_false;
}
+/* }}} */
+static void
+afr_copy_inodelk_vars(afr_internal_lock_t *dst, afr_internal_lock_t *src,
+ xlator_t *this, int lockee_num)
+{
+ afr_private_t *priv = this->private;
+ afr_lockee_t *sl = &src->lockee[lockee_num];
+ afr_lockee_t *dl = &dst->lockee[lockee_num];
+
+ dst->domain = src->domain;
+ dl->flock.l_len = sl->flock.l_len;
+ dl->flock.l_start = sl->flock.l_start;
+ dl->flock.l_type = sl->flock.l_type;
+ dl->locked_count = sl->locked_count;
+ memcpy(dl->locked_nodes, sl->locked_nodes,
+ priv->child_count * sizeof(*dl->locked_nodes));
+}
void
-afr_set_delayed_post_op (call_frame_t *frame, xlator_t *this)
+__afr_transaction_wake_shared(afr_local_t *local, struct list_head *shared)
{
- afr_local_t *local = NULL;
- afr_private_t *priv = NULL;
-
- /* call this function from any of the related optimizations
- which benefit from delaying post op are enabled, namely:
-
- - changelog piggybacking
- - eager locking
- */
-
- priv = this->private;
- if (!priv)
- return;
-
- if (!priv->post_op_delay_secs)
- return;
-
- local = frame->local;
- if (!local)
- return;
+ gf_boolean_t conflict = _gf_false;
+ afr_local_t *each = NULL;
+ afr_lock_t *lock = &local->inode_ctx->lock[local->transaction.type];
+
+ while (!conflict) {
+ if (list_empty(&lock->waiting))
+ return;
+ each = list_entry(lock->waiting.next, afr_local_t,
+ transaction.wait_list);
+ if (afr_has_lock_conflict(each, _gf_false)) {
+ conflict = _gf_true;
+ }
+ if (conflict && !list_empty(&lock->owners))
+ return;
+ afr_copy_inodelk_vars(&each->internal_lock, &local->internal_lock,
+ each->transaction.frame->this, 0);
+ list_move_tail(&each->transaction.wait_list, shared);
+ list_add_tail(&each->transaction.owner_list, &lock->owners);
+ }
+}
- if (!local->transaction.eager_lock_on)
- return;
+static void
+afr_lock_resume_shared(struct list_head *list)
+{
+ afr_local_t *each = NULL;
+
+ while (!list_empty(list)) {
+ each = list_entry(list->next, afr_local_t, transaction.wait_list);
+ list_del_init(&each->transaction.wait_list);
+ afr_changelog_pre_op(each->transaction.frame,
+ each->transaction.frame->this);
+ }
+}
- if (!local->fd)
- return;
+int
+afr_internal_lock_finish(call_frame_t *frame, xlator_t *this)
+{
+ afr_local_t *local = frame->local;
+ afr_lock_t *lock = NULL;
+
+ local->internal_lock.lock_cbk = NULL;
+ if (!local->transaction.eager_lock_on) {
+ if (local->internal_lock.lock_op_ret < 0) {
+ afr_transaction_done(frame, this);
+ return 0;
+ }
+ afr_changelog_pre_op(frame, this);
+ } else {
+ lock = &local->inode_ctx->lock[local->transaction.type];
+ if (local->internal_lock.lock_op_ret < 0) {
+ afr_handle_lock_acquire_failure(local);
+ } else {
+ lock->event_generation = local->event_generation;
+ afr_changelog_pre_op(frame, this);
+ }
+ }
- if (local->op == GF_FOP_WRITE)
- local->delayed_post_op = _gf_true;
+ return 0;
}
gf_boolean_t
-afr_are_multiple_fds_opened (fd_t *fd, xlator_t *this)
+afr_are_conflicting_ops_waiting(afr_local_t *local, xlator_t *this)
{
- afr_fd_ctx_t *fd_ctx = NULL;
-
- if (!fd) {
- /* If false is returned, it may keep on taking eager-lock
- * which may lead to starvation, so return true to avoid that.
- */
- gf_msg_callingfn (this->name, GF_LOG_ERROR, EBADF,
- AFR_MSG_INVALID_ARG, "Invalid fd");
- return _gf_true;
+ afr_lock_t *lock = NULL;
+ lock = &local->inode_ctx->lock[local->transaction.type];
+
+ /* Lets say mount1 has eager-lock(full-lock) and after the eager-lock
+ * is taken mount2 opened the same file, it won't be able to
+ * perform any {meta,}data operations until mount1 releases eager-lock.
+ * To avoid such scenario do not enable eager-lock for this transaction
+ * if open-fd-count is > 1 for metadata transactions and if num-inodelks > 1
+ * for data transactions
+ */
+
+ if (local->transaction.type == AFR_METADATA_TRANSACTION) {
+ if (local->inode_ctx->open_fd_count > 1) {
+ return _gf_true;
}
- /* Lets say mount1 has eager-lock(full-lock) and after the eager-lock
- * is taken mount2 opened the same file, it won't be able to
- * perform any data operations until mount1 releases eager-lock.
- * To avoid such scenario do not enable eager-lock for this transaction
- * if open-fd-count is > 1
- */
-
- fd_ctx = afr_fd_ctx_get (fd, this);
- if (!fd_ctx)
- return _gf_true;
-
- if (fd_ctx->open_fd_count > 1)
- return _gf_true;
+ } else if (local->transaction.type == AFR_DATA_TRANSACTION) {
+ if (lock->num_inodelks > 1) {
+ return _gf_true;
+ }
+ }
- return _gf_false;
+ return _gf_false;
}
-
gf_boolean_t
-is_afr_delayed_changelog_post_op_needed (call_frame_t *frame, xlator_t *this)
+afr_is_delayed_changelog_post_op_needed(call_frame_t *frame, xlator_t *this,
+ int delay)
{
- afr_local_t *local = NULL;
- gf_boolean_t res = _gf_false;
-
- local = frame->local;
- if (!local)
- goto out;
-
- if (!local->delayed_post_op)
- goto out;
-
- //Mark pending changelog ASAP
- if (!afr_txn_nothing_failed (frame, this))
- goto out;
-
- if (local->fd && afr_are_multiple_fds_opened (local->fd, this))
- goto out;
-
- res = _gf_true;
+ afr_local_t *local = NULL;
+ afr_lock_t *lock = NULL;
+ gf_boolean_t res = _gf_false;
+
+ local = frame->local;
+ lock = &local->inode_ctx->lock[local->transaction.type];
+
+ if (!afr_txn_nothing_failed(frame, this)) {
+ lock->release = _gf_true;
+ goto out;
+ }
+
+ if (afr_are_conflicting_ops_waiting(local, this)) {
+ lock->release = _gf_true;
+ goto out;
+ }
+
+ if (!list_empty(&lock->owners))
+ goto out;
+ else
+ GF_ASSERT(list_empty(&lock->waiting));
+
+ if (lock->release) {
+ goto out;
+ }
+
+ if (!delay) {
+ goto out;
+ }
+
+ if (local->transaction.disable_delayed_post_op) {
+ goto out;
+ }
+
+ if ((local->op != GF_FOP_WRITE) && (local->op != GF_FOP_FXATTROP) &&
+ (local->op != GF_FOP_FSYNC)) {
+ /*Only allow writes/fsyncs but shard does [f]xattrops on writes, so
+ * they are fine too*/
+ goto out;
+ }
+
+ res = _gf_true;
out:
- return res;
+ return res;
}
-
void
-afr_delayed_changelog_post_op (xlator_t *this, call_frame_t *frame, fd_t *fd,
- call_stub_t *stub);
-
-void
-afr_delayed_changelog_wake_up_cbk (void *data)
+afr_delayed_changelog_wake_up_cbk(void *data)
{
- fd_t *fd = NULL;
-
- fd = data;
-
- afr_delayed_changelog_wake_up (THIS, fd);
+ afr_lock_t *lock = NULL;
+ afr_local_t *local = data;
+ afr_local_t *timer_local = NULL;
+ struct list_head shared;
+
+ INIT_LIST_HEAD(&shared);
+ lock = &local->inode_ctx->lock[local->transaction.type];
+ LOCK(&local->inode->lock);
+ {
+ timer_local = list_entry(lock->post_op.next, afr_local_t,
+ transaction.owner_list);
+ if (list_empty(&lock->owners) && (local == timer_local)) {
+ GF_ASSERT(list_empty(&lock->waiting));
+ /*Last owner*/
+ lock->release = _gf_true;
+ lock->delay_timer = NULL;
+ }
+ }
+ UNLOCK(&local->inode->lock);
+ afr_changelog_post_op_now(local->transaction.frame,
+ local->transaction.frame->this);
}
-
/* SET operation */
int
-afr_fd_report_unstable_write (xlator_t *this, fd_t *fd)
+afr_fd_report_unstable_write(xlator_t *this, afr_local_t *local)
{
- afr_fd_ctx_t *fdctx = NULL;
-
- fdctx = afr_fd_ctx_get (fd, this);
+ LOCK(&local->inode->lock);
+ {
+ local->inode_ctx->witnessed_unstable_write = _gf_true;
+ }
+ UNLOCK(&local->inode->lock);
- LOCK(&fd->lock);
- {
- fdctx->witnessed_unstable_write = _gf_true;
- }
- UNLOCK(&fd->lock);
-
- return 0;
+ return 0;
}
/* TEST and CLEAR operation */
gf_boolean_t
-afr_fd_has_witnessed_unstable_write (xlator_t *this, fd_t *fd)
+afr_fd_has_witnessed_unstable_write(xlator_t *this, inode_t *inode)
{
- afr_fd_ctx_t *fdctx = NULL;
- gf_boolean_t witness = _gf_false;
+ afr_inode_ctx_t *ctx = NULL;
+ gf_boolean_t witness = _gf_false;
- fdctx = afr_fd_ctx_get (fd, this);
- if (!fdctx)
- return _gf_true;
+ LOCK(&inode->lock);
+ {
+ (void)__afr_inode_ctx_get(this, inode, &ctx);
- LOCK(&fd->lock);
- {
- if (fdctx->witnessed_unstable_write) {
- witness = _gf_true;
- fdctx->witnessed_unstable_write = _gf_false;
- }
+ if (ctx->witnessed_unstable_write) {
+ witness = _gf_true;
+ ctx->witnessed_unstable_write = _gf_false;
}
- UNLOCK (&fd->lock);
+ }
+ UNLOCK(&inode->lock);
- return witness;
+ return witness;
}
-
int
-afr_changelog_fsync_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
- int op_ret, int op_errno, struct iatt *pre,
- struct iatt *post, dict_t *xdata)
+afr_changelog_fsync_cbk(call_frame_t *frame, void *cookie, xlator_t *this,
+ int op_ret, int op_errno, struct iatt *pre,
+ struct iatt *post, dict_t *xdata)
{
- afr_private_t *priv = NULL;
- int child_index = (long) cookie;
- int call_count = -1;
- afr_local_t *local = NULL;
+ afr_private_t *priv = NULL;
+ int child_index = (long)cookie;
+ int call_count = -1;
+ afr_local_t *local = NULL;
- priv = this->private;
- local = frame->local;
+ priv = this->private;
+ local = frame->local;
- if (op_ret != 0) {
- /* Failure of fsync() is as good as failure of previous
- write(). So treat it like one.
- */
- gf_msg (this->name, GF_LOG_WARNING,
- op_errno, AFR_MSG_FSYNC_FAILED,
- "fsync(%s) failed on subvolume %s. Transaction was %s",
- uuid_utoa (local->fd->inode->gfid),
- priv->children[child_index]->name,
- gf_fop_list[local->op]);
+ if (op_ret != 0) {
+ /* Failure of fsync() is as good as failure of previous
+ write(). So treat it like one.
+ */
+ gf_msg(this->name, GF_LOG_WARNING, op_errno, AFR_MSG_FSYNC_FAILED,
+ "fsync(%s) failed on subvolume %s. Transaction was %s",
+ uuid_utoa(local->fd->inode->gfid),
+ priv->children[child_index]->name, gf_fop_list[local->op]);
- afr_transaction_fop_failed (frame, this, child_index);
- }
+ afr_transaction_fop_failed(frame, this, child_index);
+ }
- call_count = afr_frame_return (frame);
+ call_count = afr_frame_return(frame);
- if (call_count == 0)
- afr_changelog_post_op_now (frame, this);
+ if (call_count == 0)
+ afr_changelog_post_op_now(frame, this);
- return 0;
+ return 0;
}
-
int
-afr_changelog_fsync (call_frame_t *frame, xlator_t *this)
+afr_changelog_fsync(call_frame_t *frame, xlator_t *this)
{
- afr_local_t *local = NULL;
- int i = 0;
- int call_count = 0;
- afr_private_t *priv = NULL;
- dict_t *xdata = NULL;
- GF_UNUSED int ret = -1;
+ afr_local_t *local = NULL;
+ int i = 0;
+ int call_count = 0;
+ afr_private_t *priv = NULL;
+ dict_t *xdata = NULL;
+ GF_UNUSED int ret = -1;
- local = frame->local;
- priv = this->private;
+ local = frame->local;
+ priv = this->private;
- call_count = AFR_COUNT (local->transaction.pre_op, priv->child_count);
+ call_count = AFR_COUNT(local->transaction.pre_op, priv->child_count);
- if (!call_count) {
- /* will go straight to unlock */
- afr_changelog_post_op_now (frame, this);
- return 0;
- }
+ if (!call_count) {
+ /* will go straight to unlock */
+ afr_changelog_post_op_now(frame, this);
+ return 0;
+ }
- local->call_count = call_count;
+ local->call_count = call_count;
- xdata = dict_new();
- if (xdata)
- ret = dict_set_int32 (xdata, "batch-fsync", 1);
+ xdata = dict_new();
+ if (xdata) {
+ ret = dict_set_int32_sizen(xdata, "batch-fsync", 1);
+ ret = dict_set_str(xdata, GLUSTERFS_INTERNAL_FOP_KEY, "yes");
+ }
- for (i = 0; i < priv->child_count; i++) {
- if (!local->transaction.pre_op[i])
- continue;
+ for (i = 0; i < priv->child_count; i++) {
+ if (!local->transaction.pre_op[i])
+ continue;
- STACK_WIND_COOKIE (frame, afr_changelog_fsync_cbk,
- (void *) (long) i, priv->children[i],
- priv->children[i]->fops->fsync, local->fd,
- 1, xdata);
- if (!--call_count)
- break;
- }
+ STACK_WIND_COOKIE(frame, afr_changelog_fsync_cbk, (void *)(long)i,
+ priv->children[i], priv->children[i]->fops->fsync,
+ local->fd, 1, xdata);
+ if (!--call_count)
+ break;
+ }
- if (xdata)
- dict_unref (xdata);
+ if (xdata)
+ dict_unref(xdata);
- return 0;
+ return 0;
}
-
int
-afr_changelog_post_op_safe (call_frame_t *frame, xlator_t *this)
+afr_changelog_post_op_safe(call_frame_t *frame, xlator_t *this)
{
- afr_local_t *local = NULL;
- afr_private_t *priv = NULL;
-
- local = frame->local;
- priv = this->private;
+ afr_local_t *local = NULL;
+ afr_private_t *priv = NULL;
- if (!local->fd || local->transaction.type != AFR_DATA_TRANSACTION) {
- afr_changelog_post_op_now (frame, this);
- return 0;
- }
+ local = frame->local;
+ priv = this->private;
- if (afr_changelog_pre_op_uninherit (frame, this) &&
- afr_txn_nothing_failed (frame, this)) {
- /* just detected that this post-op is about to
- be optimized away as a new write() has
- already piggybacked on this frame's changelog.
- */
- afr_changelog_post_op_now (frame, this);
- return 0;
- }
+ if (!local->fd || local->transaction.type != AFR_DATA_TRANSACTION) {
+ afr_changelog_post_op_now(frame, this);
+ return 0;
+ }
- /* Calling afr_changelog_post_op_now() now will result in
- issuing ->[f]xattrop().
-
- Performing a hard POST-OP (->[f]xattrop() FOP) is a more
- responsible operation that what it might appear on the surface.
-
- The changelog of a file (in the xattr of the file on the server)
- stores information (pending count) about the state of the file
- on the OTHER server. This changelog is blindly trusted, and must
- therefore be updated in such a way it remains trustworthy. This
- implies that decrementing the pending count (essentially "clearing
- the dirty flag") must be done STRICTLY after we are sure that the
- operation on the other server has reached stable storage.
-
- While the backend filesystem on that server will eventually flush
- it to stable storage, we (being in userspace) have no mechanism
- to get notified when the write became "stable".
-
- This means we need take matter into our own hands and issue an
- fsync() EVEN IF THE APPLICATION WAS PERFORMING UNSTABLE WRITES,
- and get an acknowledgement for it. And we need to wait for the
- fsync() acknowledgement before initiating the hard POST-OP.
-
- However if the FD itself was opened in O_SYNC or O_DSYNC then
- we are already guaranteed that the writes were made stable as
- part of the FOP itself. The same holds true for NFS stable
- writes which happen on an anonymous FD with O_DSYNC or O_SYNC
- flag set in the writev() @flags param. For all other write types,
- mark a flag in the fdctx whenever an unstable write is witnessed.
+ if (afr_changelog_pre_op_uninherit(frame, this) &&
+ afr_txn_nothing_failed(frame, this)) {
+ /* just detected that this post-op is about to
+ be optimized away as a new write() has
+ already piggybacked on this frame's changelog.
*/
-
- if (!afr_fd_has_witnessed_unstable_write (this, local->fd)) {
- afr_changelog_post_op_now (frame, this);
- return 0;
- }
-
- /* Check whether users want durability and perform fsync/post-op
- * accordingly.
- */
- if (priv->ensure_durability) {
- /* Time to fsync() */
- afr_changelog_fsync (frame, this);
- } else {
- afr_changelog_post_op_now (frame, this);
- }
-
+ afr_changelog_post_op_now(frame, this);
return 0;
+ }
+
+ /* Calling afr_changelog_post_op_now() now will result in
+ issuing ->[f]xattrop().
+
+ Performing a hard POST-OP (->[f]xattrop() FOP) is a more
+ responsible operation that what it might appear on the surface.
+
+ The changelog of a file (in the xattr of the file on the server)
+ stores information (pending count) about the state of the file
+ on the OTHER server. This changelog is blindly trusted, and must
+ therefore be updated in such a way it remains trustworthy. This
+ implies that decrementing the pending count (essentially "clearing
+ the dirty flag") must be done STRICTLY after we are sure that the
+ operation on the other server has reached stable storage.
+
+ While the backend filesystem on that server will eventually flush
+ it to stable storage, we (being in userspace) have no mechanism
+ to get notified when the write became "stable".
+
+ This means we need take matter into our own hands and issue an
+ fsync() EVEN IF THE APPLICATION WAS PERFORMING UNSTABLE WRITES,
+ and get an acknowledgement for it. And we need to wait for the
+ fsync() acknowledgement before initiating the hard POST-OP.
+
+ However if the FD itself was opened in O_SYNC or O_DSYNC then
+ we are already guaranteed that the writes were made stable as
+ part of the FOP itself. The same holds true for NFS stable
+ writes which happen on an anonymous FD with O_DSYNC or O_SYNC
+ flag set in the writev() @flags param. For all other write types,
+ mark a flag in the fdctx whenever an unstable write is witnessed.
+ */
+
+ if (!afr_fd_has_witnessed_unstable_write(this, local->inode)) {
+ afr_changelog_post_op_now(frame, this);
+ return 0;
+ }
+
+ /* Check whether users want durability and perform fsync/post-op
+ * accordingly.
+ */
+ if (priv->ensure_durability) {
+ /* Time to fsync() */
+ afr_changelog_fsync(frame, this);
+ } else {
+ afr_changelog_post_op_now(frame, this);
+ }
+
+ return 0;
}
-
void
-afr_delayed_changelog_post_op (xlator_t *this, call_frame_t *frame, fd_t *fd,
- call_stub_t *stub)
+afr_changelog_post_op(call_frame_t *frame, xlator_t *this)
{
- afr_fd_ctx_t *fd_ctx = NULL;
- call_frame_t *prev_frame = NULL;
- struct timespec delta = {0, };
- afr_private_t *priv = NULL;
- afr_local_t *local = NULL;
-
- priv = this->private;
-
- fd_ctx = afr_fd_ctx_get (fd, this);
- if (!fd_ctx)
- goto out;
+ struct timespec delta = {
+ 0,
+ };
+ afr_private_t *priv = NULL;
+ afr_local_t *local = frame->local;
+ afr_lock_t *lock = NULL;
+ gf_boolean_t post_op = _gf_true;
+ struct list_head shared;
+
+ priv = this->private;
+ delta.tv_sec = priv->post_op_delay_secs;
+ delta.tv_nsec = 0;
+
+ INIT_LIST_HEAD(&shared);
+ if (!local->transaction.eager_lock_on)
+ goto out;
+
+ lock = &local->inode_ctx->lock[local->transaction.type];
+ LOCK(&local->inode->lock);
+ {
+ list_del_init(&local->transaction.owner_list);
+ list_add(&local->transaction.owner_list, &lock->post_op);
+ __afr_transaction_wake_shared(local, &shared);
+
+ if (!afr_is_delayed_changelog_post_op_needed(frame, this,
+ delta.tv_sec)) {
+ if (list_empty(&lock->owners))
+ lock->release = _gf_true;
+ goto unlock;
+ }
- delta.tv_sec = priv->post_op_delay_secs;
- delta.tv_nsec = 0;
-
- pthread_mutex_lock (&fd_ctx->delay_lock);
- {
- prev_frame = fd_ctx->delay_frame;
- fd_ctx->delay_frame = NULL;
- if (fd_ctx->delay_timer)
- gf_timer_call_cancel (this->ctx, fd_ctx->delay_timer);
- fd_ctx->delay_timer = NULL;
- if (!frame)
- goto unlock;
- fd_ctx->delay_timer = gf_timer_call_after (this->ctx, delta,
- afr_delayed_changelog_wake_up_cbk,
- fd);
- fd_ctx->delay_frame = frame;
- }
+ GF_ASSERT(lock->delay_timer == NULL);
+ lock->delay_timer = gf_timer_call_after(
+ this->ctx, delta, afr_delayed_changelog_wake_up_cbk, local);
+ if (!lock->delay_timer) {
+ lock->release = _gf_true;
+ } else {
+ post_op = _gf_false;
+ }
+ }
unlock:
- pthread_mutex_unlock (&fd_ctx->delay_lock);
-
-out:
- if (prev_frame) {
- local = prev_frame->local;
- local->transaction.resume_stub = stub;
- afr_changelog_post_op_now (prev_frame, this);
- } else if (stub) {
- call_resume (stub);
- }
-}
-
-
-void
-afr_changelog_post_op (call_frame_t *frame, xlator_t *this)
-{
- afr_local_t *local = NULL;
+ UNLOCK(&local->inode->lock);
- local = frame->local;
+ if (!list_empty(&shared)) {
+ afr_lock_resume_shared(&shared);
+ }
- if (is_afr_delayed_changelog_post_op_needed (frame, this))
- afr_delayed_changelog_post_op (this, frame, local->fd, NULL);
- else
- afr_changelog_post_op_safe (frame, this);
-}
-
-
-
-/* Wake up the sleeping/delayed post-op, and also register
- a stub to have it resumed after this transaction
- completely finishes.
-
- The @stub gets saved in @local and gets resumed in
- afr_local_cleanup()
- */
-void
-afr_delayed_changelog_wake_resume (xlator_t *this, fd_t *fd, call_stub_t *stub)
-{
- afr_delayed_changelog_post_op (this, NULL, fd, stub);
-}
-
-
-void
-afr_delayed_changelog_wake_up (xlator_t *this, fd_t *fd)
-{
- afr_delayed_changelog_post_op (this, NULL, fd, NULL);
+out:
+ if (post_op) {
+ if (!local->transaction.eager_lock_on || lock->release) {
+ afr_changelog_post_op_safe(frame, this);
+ } else {
+ afr_changelog_post_op_now(frame, this);
+ }
+ }
}
-
int
-afr_transaction_resume (call_frame_t *frame, xlator_t *this)
+afr_transaction_resume(call_frame_t *frame, xlator_t *this)
{
- afr_local_t *local = NULL;
+ afr_local_t *local = NULL;
- local = frame->local;
-
- if (local->transaction.eager_lock_on) {
- /* We don't need to retain "local" in the
- fd list anymore, writes to all subvols
- are finished by now */
- afr_remove_eager_lock_stub (local);
- }
+ local = frame->local;
- afr_restore_lk_owner (frame);
+ afr_restore_lk_owner(frame);
- afr_handle_symmetric_errors (frame, this);
+ afr_handle_symmetric_errors(frame, this);
- if (!local->pre_op_compat)
- /* new mode, pre-op was done along
- with OP */
- afr_changelog_pre_op_update (frame, this);
+ if (!local->pre_op_compat)
+ /* new mode, pre-op was done along
+ with OP */
+ afr_changelog_pre_op_update(frame, this);
- if (__fop_changelog_needed (frame, this)) {
- afr_changelog_post_op (frame, this);
- } else {
- afr_changelog_post_op_done (frame, this);
- }
+ afr_changelog_post_op(frame, this);
- return 0;
+ return 0;
}
-
/**
* afr_transaction_fop_failed - inform that an fop failed
*/
void
-afr_transaction_fop_failed (call_frame_t *frame, xlator_t *this,
- int child_index)
+afr_transaction_fop_failed(call_frame_t *frame, xlator_t *this, int child_index)
{
- afr_local_t * local = NULL;
+ afr_local_t *local = NULL;
- local = frame->local;
+ local = frame->local;
- local->transaction.failed_subvols[child_index] = 1;
+ local->transaction.failed_subvols[child_index] = 1;
}
-
-
- static gf_boolean_t
-afr_locals_overlap (afr_local_t *local1, afr_local_t *local2)
+static gf_boolean_t
+__need_previous_lock_unlocked(afr_local_t *local)
{
- uint64_t start1 = local1->transaction.start;
- uint64_t start2 = local2->transaction.start;
- uint64_t end1 = 0;
- uint64_t end2 = 0;
-
- if (local1->transaction.len)
- end1 = start1 + local1->transaction.len - 1;
- else
- end1 = ULLONG_MAX;
-
- if (local2->transaction.len)
- end2 = start2 + local2->transaction.len - 1;
- else
- end2 = ULLONG_MAX;
+ afr_lock_t *lock = NULL;
- return ((end1 >= start2) && (end2 >= start1));
+ lock = &local->inode_ctx->lock[local->transaction.type];
+ if (!lock->acquired)
+ return _gf_false;
+ if (lock->acquired && lock->event_generation != local->event_generation)
+ return _gf_true;
+ return _gf_false;
}
void
-afr_transaction_eager_lock_init (afr_local_t *local, xlator_t *this)
+__afr_eager_lock_handle(afr_local_t *local, gf_boolean_t *take_lock,
+ gf_boolean_t *do_pre_op, afr_local_t **timer_local)
{
- afr_private_t *priv = NULL;
- afr_fd_ctx_t *fdctx = NULL;
- afr_local_t *each = NULL;
-
- priv = this->private;
-
- if (!local->fd)
- return;
-
- if (local->transaction.type != AFR_DATA_TRANSACTION)
- return;
-
- if (!priv->eager_lock)
- return;
-
- fdctx = afr_fd_ctx_get (local->fd, this);
- if (!fdctx)
- return;
-
- if (afr_are_multiple_fds_opened (local->fd, this))
- return;
- /*
- * Once full file lock is acquired in eager-lock phase, overlapping
- * writes do not compete for inode-locks, instead are transferred to the
- * next writes. Because of this overlapping writes are not ordered.
- * This can cause inconsistencies in replication.
- * Example:
- * Two overlapping writes w1, w2 are sent in parallel on same fd
- * in two threads t1, t2.
- * Both threads can execute afr_writev_wind in the following manner.
- * t1 winds w1 on brick-0
- * t2 winds w2 on brick-0
- * t2 winds w2 on brick-1
- * t1 winds w1 on brick-1
- *
- * This check makes sure the locks are not transferred for
- * overlapping writes.
- */
- LOCK (&local->fd->lock);
- {
- list_for_each_entry (each, &fdctx->eager_locked,
- transaction.eager_locked) {
- if (afr_locals_overlap (each, local)) {
- local->transaction.eager_lock_on = _gf_false;
- goto unlock;
- }
- }
+ afr_lock_t *lock = NULL;
+ afr_local_t *owner_local = NULL;
+ xlator_t *this = local->transaction.frame->this;
+
+ local->transaction.eager_lock_on = _gf_true;
+ afr_set_lk_owner(local->transaction.frame, this, local->inode);
+
+ lock = &local->inode_ctx->lock[local->transaction.type];
+ if (__need_previous_lock_unlocked(local)) {
+ if (!list_empty(&lock->owners)) {
+ lock->release = _gf_true;
+ } else if (lock->delay_timer) {
+ lock->release = _gf_true;
+ if (gf_timer_call_cancel(this->ctx, lock->delay_timer)) {
+ /* It will be put in frozen list
+ * in the code flow below*/
+ } else {
+ *timer_local = list_entry(lock->post_op.next, afr_local_t,
+ transaction.owner_list);
+ lock->delay_timer = NULL;
+ }
+ }
+ }
+
+ if (lock->release) {
+ list_add_tail(&local->transaction.wait_list, &lock->frozen);
+ *take_lock = _gf_false;
+ goto out;
+ }
+
+ if (lock->delay_timer) {
+ *take_lock = _gf_false;
+ if (gf_timer_call_cancel(this->ctx, lock->delay_timer)) {
+ list_add_tail(&local->transaction.wait_list, &lock->frozen);
+ } else {
+ *timer_local = list_entry(lock->post_op.next, afr_local_t,
+ transaction.owner_list);
+ afr_copy_inodelk_vars(&local->internal_lock,
+ &(*timer_local)->internal_lock, this, 0);
+ lock->delay_timer = NULL;
+ *do_pre_op = _gf_true;
+ list_add_tail(&local->transaction.owner_list, &lock->owners);
+ }
+ goto out;
+ }
- local->transaction.eager_lock_on = _gf_true;
- list_add_tail (&local->transaction.eager_locked,
- &fdctx->eager_locked);
+ if (!list_empty(&lock->owners)) {
+ if (!lock->acquired || afr_has_lock_conflict(local, _gf_true)) {
+ list_add_tail(&local->transaction.wait_list, &lock->waiting);
+ *take_lock = _gf_false;
+ goto out;
}
-unlock:
- UNLOCK (&local->fd->lock);
+ owner_local = list_entry(lock->owners.next, afr_local_t,
+ transaction.owner_list);
+ afr_copy_inodelk_vars(&local->internal_lock,
+ &owner_local->internal_lock, this, 0);
+ *take_lock = _gf_false;
+ *do_pre_op = _gf_true;
+ }
+
+ if (lock->acquired)
+ GF_ASSERT(!(*take_lock));
+ list_add_tail(&local->transaction.owner_list, &lock->owners);
+out:
+ return;
}
+void
+afr_transaction_start(afr_local_t *local, xlator_t *this)
+{
+ afr_private_t *priv = NULL;
+ gf_boolean_t take_lock = _gf_true;
+ gf_boolean_t do_pre_op = _gf_false;
+ afr_local_t *timer_local = NULL;
+
+ priv = this->private;
+
+ if (local->transaction.type != AFR_DATA_TRANSACTION &&
+ local->transaction.type != AFR_METADATA_TRANSACTION)
+ goto lock_phase;
+
+ if (!priv->eager_lock)
+ goto lock_phase;
+
+ LOCK(&local->inode->lock);
+ {
+ __afr_eager_lock_handle(local, &take_lock, &do_pre_op, &timer_local);
+ }
+ UNLOCK(&local->inode->lock);
+lock_phase:
+ if (!local->transaction.eager_lock_on) {
+ afr_set_lk_owner(local->transaction.frame, this,
+ local->transaction.frame->root);
+ }
+
+ if (take_lock) {
+ afr_lock(local->transaction.frame, this);
+ } else if (do_pre_op) {
+ afr_changelog_pre_op(local->transaction.frame, this);
+ }
+ /*Always call delayed_changelog_wake_up_cbk after calling pre-op above
+ * so that any inheriting can happen*/
+ if (timer_local)
+ afr_delayed_changelog_wake_up_cbk(timer_local);
+}
int
-afr_transaction (call_frame_t *frame, xlator_t *this, afr_transaction_type type)
+afr_write_txn_refresh_done(call_frame_t *frame, xlator_t *this, int err)
{
- afr_local_t * local = NULL;
- afr_private_t * priv = NULL;
- fd_t *fd = NULL;
- int ret = -1;
-
- local = frame->local;
- priv = this->private;
+ afr_local_t *local = frame->local;
+
+ if (err) {
+ AFR_SET_ERROR_AND_CHECK_SPLIT_BRAIN(-1, err);
+ goto fail;
+ }
+
+ afr_transaction_start(local, this);
+ return 0;
+fail:
+ local->transaction.unwind(frame, this);
+ AFR_STACK_DESTROY(frame);
+ return 0;
+}
- local->transaction.resume = afr_transaction_resume;
- local->transaction.type = type;
+int
+afr_transaction_lockee_init(call_frame_t *frame)
+{
+ afr_local_t *local = frame->local;
+ afr_internal_lock_t *int_lock = &local->internal_lock;
+ afr_private_t *priv = frame->this->private;
+ int ret = 0;
- ret = afr_transaction_local_init (local, this);
- if (ret < 0)
- goto out;
+ switch (local->transaction.type) {
+ case AFR_DATA_TRANSACTION:
+ case AFR_METADATA_TRANSACTION:
+ ret = afr_add_inode_lockee(local, priv->child_count);
+ break;
- ret = afr_inode_get_readable (frame, local->inode, this, 0, 0, type);
- if (ret) {
- gf_msg (this->name, GF_LOG_ERROR, EIO, AFR_MSG_SPLIT_BRAIN,
- "Failing %s on gfid %s: split-brain observed.",
- gf_fop_list[local->op], uuid_utoa (local->inode->gfid));
+ case AFR_ENTRY_TRANSACTION:
+ case AFR_ENTRY_RENAME_TRANSACTION:
+ ret = afr_add_entry_lockee(local, &local->transaction.parent_loc,
+ local->transaction.basename,
+ priv->child_count);
+ if (ret) {
goto out;
- }
- afr_transaction_eager_lock_init (local, this);
-
- if (local->fd && local->transaction.eager_lock_on)
- afr_set_lk_owner (frame, this, local->fd);
- else
- afr_set_lk_owner (frame, this, frame->root);
-
- if (!local->transaction.eager_lock_on && local->loc.inode) {
- fd = fd_lookup (local->loc.inode, frame->root->pid);
- if (fd == NULL)
- fd = fd_lookup_anonymous (local->loc.inode);
+ }
+ if (local->op == GF_FOP_RENAME) {
+ ret = afr_add_entry_lockee(
+ local, &local->transaction.new_parent_loc,
+ local->transaction.new_basename, priv->child_count);
+ if (ret) {
+ goto out;
+ }
- if (fd) {
- afr_delayed_changelog_wake_up (this, fd);
- fd_unref (fd);
+ if (local->newloc.inode &&
+ IA_ISDIR(local->newloc.inode->ia_type)) {
+ ret = afr_add_entry_lockee(local, &local->newloc, NULL,
+ priv->child_count);
+ if (ret) {
+ goto out;
+ }
}
- }
+ } else if (local->op == GF_FOP_RMDIR) {
+ ret = afr_add_entry_lockee(local, &local->loc, NULL,
+ priv->child_count);
+ if (ret) {
+ goto out;
+ }
+ }
+
+ if (int_lock->lockee_count > 1) {
+ qsort(int_lock->lockee, int_lock->lockee_count,
+ sizeof(*int_lock->lockee), afr_entry_lockee_cmp);
+ }
+ break;
+ }
+out:
+ return ret;
+}
- if (afr_lock_server_count (priv, local->transaction.type) == 0) {
- afr_internal_lock_finish (frame, this);
- } else {
- afr_lock (frame, this);
- }
+int
+afr_transaction(call_frame_t *frame, xlator_t *this, afr_transaction_type type)
+{
+ afr_local_t *local = NULL;
+ afr_private_t *priv = NULL;
+ int ret = -1;
+ int event_generation = 0;
+
+ local = frame->local;
+ priv = this->private;
+ local->transaction.frame = frame;
+
+ local->transaction.type = type;
+
+ if (priv->quorum_count && !afr_has_quorum(local->child_up, this, NULL)) {
+ ret = -afr_quorum_errno(priv);
+ goto out;
+ }
+
+ if (!afr_is_consistent_io_possible(local, priv, &ret)) {
+ ret = -ret; /*op_errno to ret conversion*/
+ goto out;
+ }
+
+ if (priv->thin_arbiter_count && !afr_ta_has_quorum(priv, local)) {
+ ret = -afr_quorum_errno(priv);
+ goto out;
+ }
+
+ ret = afr_transaction_local_init(local, this);
+ if (ret < 0)
+ goto out;
+
+ ret = afr_transaction_lockee_init(frame);
+ if (ret)
+ goto out;
+
+ if (type != AFR_METADATA_TRANSACTION) {
+ goto txn_start;
+ }
+
+ ret = afr_inode_get_readable(frame, local->inode, this, local->readable,
+ &event_generation, type);
+ if (ret < 0 ||
+ afr_is_inode_refresh_reqd(local->inode, this, priv->event_generation,
+ event_generation)) {
+ afr_inode_refresh(frame, this, local->inode, local->loc.gfid,
+ afr_write_txn_refresh_done);
ret = 0;
+ goto out;
+ }
+
+txn_start:
+ ret = 0;
+ afr_transaction_start(local, this);
out:
- return ret;
+ return ret;
}
diff --git a/xlators/cluster/afr/src/afr-transaction.h b/xlators/cluster/afr/src/afr-transaction.h
index 47d43d88991..beefa26f4a6 100644
--- a/xlators/cluster/afr/src/afr-transaction.h
+++ b/xlators/cluster/afr/src/afr-transaction.h
@@ -14,43 +14,62 @@
#include "afr.h"
void
-afr_transaction_fop_failed (call_frame_t *frame, xlator_t *this,
- int child_index);
-void
-afr_txn_arbitrate_fop_cbk (call_frame_t *frame, xlator_t *this);
+afr_transaction_fop_failed(call_frame_t *frame, xlator_t *this,
+ int child_index);
+
+int32_t
+afr_transaction(call_frame_t *frame, xlator_t *this, afr_transaction_type type);
int
-afr_lock_server_count (afr_private_t *priv, afr_transaction_type type);
+afr_set_pending_dict(afr_private_t *priv, dict_t *xattr, int32_t **pending);
-afr_inodelk_t*
-afr_get_inodelk (afr_internal_lock_t *int_lock, char *dom);
+void
+afr_delayed_changelog_wake_up(xlator_t *this, fd_t *fd);
-int32_t
-afr_transaction (call_frame_t *frame, xlator_t *this, afr_transaction_type type);
+void
+__mark_all_success(call_frame_t *frame, xlator_t *this);
+
+gf_boolean_t
+afr_txn_nothing_failed(call_frame_t *frame, xlator_t *this);
int
-afr_set_pending_dict (afr_private_t *priv, dict_t *xattr, int32_t **pending);
+afr_read_txn(call_frame_t *frame, xlator_t *this, inode_t *inode,
+ afr_read_txn_wind_t readfn, afr_transaction_type type);
-void
-afr_set_delayed_post_op (call_frame_t *frame, xlator_t *this);
+int
+afr_read_txn_continue(call_frame_t *frame, xlator_t *this, int subvol);
void
-afr_delayed_changelog_wake_up (xlator_t *this, fd_t *fd);
+afr_pending_read_increment(afr_private_t *priv, int child_index);
void
-__mark_all_success (call_frame_t *frame, xlator_t *this);
+afr_pending_read_decrement(afr_private_t *priv, int child_index);
+call_frame_t *
+afr_transaction_detach_fop_frame(call_frame_t *frame);
gf_boolean_t
-afr_txn_nothing_failed (call_frame_t *frame, xlator_t *this);
+afr_has_quorum(unsigned char *subvols, xlator_t *this, call_frame_t *frame);
+gf_boolean_t
+afr_needs_changelog_update(afr_local_t *local);
+void
+afr_zero_fill_stat(afr_local_t *local);
-int afr_read_txn (call_frame_t *frame, xlator_t *this, inode_t *inode,
- afr_read_txn_wind_t readfn, afr_transaction_type type);
+void
+afr_pick_error_xdata(afr_local_t *local, afr_private_t *priv, inode_t *inode1,
+ unsigned char *readable1, inode_t *inode2,
+ unsigned char *readable2);
+int
+afr_transaction_resume(call_frame_t *frame, xlator_t *this);
-int afr_read_txn_continue (call_frame_t *frame, xlator_t *this, int subvol);
+int
+afr_lock(call_frame_t *frame, xlator_t *this);
-int __afr_txn_write_fop (call_frame_t *frame, xlator_t *this);
-int __afr_txn_write_done (call_frame_t *frame, xlator_t *this);
-call_frame_t *afr_transaction_detach_fop_frame (call_frame_t *frame);
-gf_boolean_t afr_has_quorum (unsigned char *subvols, xlator_t *this);
+void
+afr_delayed_changelog_wake_up_cbk(void *data);
+
+int
+afr_release_notify_lock_for_ta(void *opaque);
+int
+afr_ta_lock_release_done(int ret, call_frame_t *ta_frame, void *opaque);
#endif /* __TRANSACTION_H__ */
diff --git a/xlators/cluster/afr/src/afr.c b/xlators/cluster/afr/src/afr.c
index 98e4d3e2699..df7366f0a65 100644
--- a/xlators/cluster/afr/src/afr.c
+++ b/xlators/cluster/afr/src/afr.c
@@ -20,808 +20,1325 @@
struct volume_options options[];
+static char *afr_favorite_child_policies[AFR_FAV_CHILD_POLICY_MAX + 1] = {
+ [AFR_FAV_CHILD_NONE] = "none",
+ [AFR_FAV_CHILD_BY_SIZE] = "size",
+ [AFR_FAV_CHILD_BY_CTIME] = "ctime",
+ [AFR_FAV_CHILD_BY_MTIME] = "mtime",
+ [AFR_FAV_CHILD_BY_MAJORITY] = "majority",
+ [AFR_FAV_CHILD_POLICY_MAX] = NULL,
+};
+
int32_t
-notify (xlator_t *this, int32_t event,
- void *data, ...)
+notify(xlator_t *this, int32_t event, void *data, ...)
{
- int ret = -1;
- va_list ap;
- void *data2 = NULL;
+ int ret = -1;
+ va_list ap;
+ void *data2 = NULL;
- va_start (ap, data);
- data2 = va_arg (ap, dict_t*);
- va_end (ap);
- ret = afr_notify (this, event, data, data2);
+ va_start(ap, data);
+ data2 = va_arg(ap, dict_t *);
+ va_end(ap);
+ ret = afr_notify(this, event, data, data2);
- return ret;
+ return ret;
}
int32_t
-mem_acct_init (xlator_t *this)
+mem_acct_init(xlator_t *this)
{
- int ret = -1;
+ int ret = -1;
- if (!this)
- return ret;
-
- ret = xlator_mem_acct_init (this, gf_afr_mt_end + 1);
+ if (!this)
+ return ret;
- if (ret != 0) {
- return ret;
- }
+ ret = xlator_mem_acct_init(this, gf_afr_mt_end + 1);
+ if (ret != 0) {
return ret;
-}
+ }
+ return ret;
+}
int
-xlator_subvolume_index (xlator_t *this, xlator_t *subvol)
+xlator_subvolume_index(xlator_t *this, xlator_t *subvol)
{
- int index = -1;
- int i = 0;
- xlator_list_t *list = NULL;
-
- list = this->children;
-
- while (list) {
- if (subvol == list->xlator ||
- strcmp (subvol->name, list->xlator->name) == 0) {
- index = i;
- break;
- }
- list = list->next;
- i++;
+ int index = -1;
+ int i = 0;
+ xlator_list_t *list = NULL;
+
+ list = this->children;
+
+ while (list) {
+ if (subvol == list->xlator ||
+ strcmp(subvol->name, list->xlator->name) == 0) {
+ index = i;
+ break;
}
+ list = list->next;
+ i++;
+ }
- return index;
+ return index;
}
static void
-fix_quorum_options (xlator_t *this, afr_private_t *priv, char *qtype,
- dict_t *options)
+fix_quorum_options(xlator_t *this, afr_private_t *priv, char *qtype,
+ dict_t *options)
{
- if (dict_get (options, "quorum-type") == NULL) {
- /* If user doesn't configure anything enable auto-quorum if the
- * replica has odd number of subvolumes */
- if (priv->child_count % 2)
- qtype = "auto";
- }
-
- if (priv->quorum_count && strcmp (qtype, "fixed")) {
- gf_msg (this->name,GF_LOG_WARNING, 0, AFR_MSG_QUORUM_OVERRIDE,
- "quorum-type %s overriding quorum-count %u",
- qtype, priv->quorum_count);
- }
-
- if (!strcmp (qtype, "none")) {
- priv->quorum_count = 0;
- } else if (!strcmp (qtype, "auto")) {
- priv->quorum_count = AFR_QUORUM_AUTO;
- }
+ if (dict_get_sizen(options, "quorum-type") == NULL) {
+ /* If user doesn't configure anything enable auto-quorum if the
+ * replica has more than two subvolumes */
+ if (priv->child_count > 2)
+ qtype = "auto";
+ }
+
+ if (priv->quorum_count && strcmp(qtype, "fixed")) {
+ gf_msg(this->name, GF_LOG_WARNING, 0, AFR_MSG_QUORUM_OVERRIDE,
+ "quorum-type %s overriding quorum-count %u", qtype,
+ priv->quorum_count);
+ }
+
+ if (!strcmp(qtype, "none")) {
+ priv->quorum_count = 0;
+ } else if (!strcmp(qtype, "auto")) {
+ priv->quorum_count = AFR_QUORUM_AUTO;
+ }
}
int
-reconfigure (xlator_t *this, dict_t *options)
+afr_set_favorite_child_policy(afr_private_t *priv, char *policy)
{
- afr_private_t *priv = NULL;
- xlator_t *read_subvol = NULL;
- int read_subvol_index = -1;
- int ret = -1;
- int index = -1;
- char *qtype = NULL;
+ int index = -1;
- priv = this->private;
+ index = gf_get_index_by_elem(afr_favorite_child_policies, policy);
+ if (index < 0 || index >= AFR_FAV_CHILD_POLICY_MAX)
+ return -1;
- GF_OPTION_RECONF ("afr-dirty-xattr",
- priv->afr_dirty, options, str,
- out);
+ priv->fav_child_policy = index;
- GF_OPTION_RECONF ("metadata-splitbrain-forced-heal",
- priv->metadata_splitbrain_forced_heal, options, bool,
- out);
-
- GF_OPTION_RECONF ("background-self-heal-count",
- priv->background_self_heal_count, options, uint32,
- out);
+ return 0;
+}
- GF_OPTION_RECONF ("metadata-self-heal",
- priv->metadata_self_heal, options, bool, out);
+static void
+set_data_self_heal_algorithm(afr_private_t *priv, char *algo)
+{
+ if (!algo) {
+ priv->data_self_heal_algorithm = AFR_SELFHEAL_DATA_DYNAMIC;
+ } else if (strcmp(algo, "full") == 0) {
+ priv->data_self_heal_algorithm = AFR_SELFHEAL_DATA_FULL;
+ } else if (strcmp(algo, "diff") == 0) {
+ priv->data_self_heal_algorithm = AFR_SELFHEAL_DATA_DIFF;
+ } else {
+ priv->data_self_heal_algorithm = AFR_SELFHEAL_DATA_DYNAMIC;
+ }
+}
- GF_OPTION_RECONF ("data-self-heal", priv->data_self_heal, options, str,
- out);
+void
+afr_handle_anon_inode_options(afr_private_t *priv, dict_t *options)
+{
+ char *volfile_id_str = NULL;
+ uuid_t anon_inode_gfid = {0};
+
+ /*If volume id is not present don't enable anything*/
+ if (dict_get_str(options, "volume-id", &volfile_id_str))
+ return;
+ GF_ASSERT(strlen(AFR_ANON_DIR_PREFIX) + strlen(volfile_id_str) <= NAME_MAX);
+ /*anon_inode_name is not supposed to change once assigned*/
+ if (!priv->anon_inode_name[0]) {
+ snprintf(priv->anon_inode_name, sizeof(priv->anon_inode_name), "%s-%s",
+ AFR_ANON_DIR_PREFIX, volfile_id_str);
+ gf_uuid_parse(volfile_id_str, anon_inode_gfid);
+ /*Flip a bit to make sure volfile-id and anon-gfid are not same*/
+ anon_inode_gfid[0] ^= 1;
+ uuid_utoa_r(anon_inode_gfid, priv->anon_gfid_str);
+ }
+}
- GF_OPTION_RECONF ("entry-self-heal", priv->entry_self_heal, options,
- bool, out);
+int
+reconfigure(xlator_t *this, dict_t *options)
+{
+ afr_private_t *priv = NULL;
+ xlator_t *read_subvol = NULL;
+ int read_subvol_index = -1;
+ int timeout_old = 0;
+ int ret = -1;
+ int index = -1;
+ char *qtype = NULL;
+ char *fav_child_policy = NULL;
+ char *data_self_heal = NULL;
+ char *data_self_heal_algorithm = NULL;
+ char *locking_scheme = NULL;
+ gf_boolean_t consistent_io = _gf_false;
+ gf_boolean_t choose_local_old = _gf_false;
+ gf_boolean_t enabled_old = _gf_false;
- GF_OPTION_RECONF ("data-self-heal-window-size",
- priv->data_self_heal_window_size, options,
- uint32, out);
+ priv = this->private;
- GF_OPTION_RECONF ("data-change-log", priv->data_change_log, options,
- bool, out);
+ GF_OPTION_RECONF("metadata-splitbrain-forced-heal",
+ priv->metadata_splitbrain_forced_heal, options, bool, out);
- GF_OPTION_RECONF ("metadata-change-log",
- priv->metadata_change_log, options, bool, out);
+ GF_OPTION_RECONF("background-self-heal-count",
+ priv->background_self_heal_count, options, uint32, out);
- GF_OPTION_RECONF ("entry-change-log", priv->entry_change_log, options,
- bool, out);
+ GF_OPTION_RECONF("heal-wait-queue-length", priv->heal_wait_qlen, options,
+ uint32, out);
- GF_OPTION_RECONF ("data-self-heal-algorithm",
- priv->data_self_heal_algorithm, options, str, out);
+ GF_OPTION_RECONF("metadata-self-heal", priv->metadata_self_heal, options,
+ bool, out);
- GF_OPTION_RECONF ("read-subvolume", read_subvol, options, xlator, out);
+ GF_OPTION_RECONF("data-self-heal", data_self_heal, options, str, out);
+ if (gf_string2boolean(data_self_heal, &priv->data_self_heal) == -1)
+ goto out;
- GF_OPTION_RECONF ("read-hash-mode", priv->hash_mode,
- options, uint32, out);
+ GF_OPTION_RECONF("entry-self-heal", priv->entry_self_heal, options, bool,
+ out);
- if (read_subvol) {
- index = xlator_subvolume_index (this, read_subvol);
- if (index == -1) {
- gf_msg (this->name, GF_LOG_ERROR, 0,
- AFR_MSG_INVALID_SUBVOL, "%s not a subvolume",
- read_subvol->name);
- goto out;
- }
- priv->read_child = index;
- }
+ GF_OPTION_RECONF("data-self-heal-window-size",
+ priv->data_self_heal_window_size, options, uint32, out);
- GF_OPTION_RECONF ("read-subvolume-index",read_subvol_index, options,int32,out);
-
- if (read_subvol_index >-1) {
- index=read_subvol_index;
- if (index >= priv->child_count) {
- gf_msg (this->name, GF_LOG_ERROR, 0,
- AFR_MSG_INVALID_SUBVOL,
- "%d not a subvolume-index", index);
- goto out;
- }
- priv->read_child = index;
- }
+ GF_OPTION_RECONF("data-self-heal-algorithm", data_self_heal_algorithm,
+ options, str, out);
+ set_data_self_heal_algorithm(priv, data_self_heal_algorithm);
- GF_OPTION_RECONF ("pre-op-compat", priv->pre_op_compat, options, bool, out);
+ GF_OPTION_RECONF("halo-enabled", priv->halo_enabled, options, bool, out);
- GF_OPTION_RECONF ("eager-lock", priv->eager_lock, options, bool, out);
- GF_OPTION_RECONF ("quorum-type", qtype, options, str, out);
- GF_OPTION_RECONF ("quorum-count", priv->quorum_count, options,
- uint32, out);
- fix_quorum_options (this, priv, qtype, options);
- if (priv->quorum_count && !afr_has_quorum (priv->child_up, this))
- gf_msg (this->name, GF_LOG_WARNING, 0, AFR_MSG_QUORUM_FAIL,
- "Client-quorum is not met");
+ GF_OPTION_RECONF("halo-shd-max-latency", priv->shd.halo_max_latency_msec,
+ options, uint32, out);
+ GF_OPTION_RECONF("halo-nfsd-max-latency", priv->nfsd.halo_max_latency_msec,
+ options, uint32, out);
- GF_OPTION_RECONF ("post-op-delay-secs", priv->post_op_delay_secs, options,
- uint32, out);
+ GF_OPTION_RECONF("halo-max-latency", priv->halo_max_latency_msec, options,
+ uint32, out);
- GF_OPTION_RECONF (AFR_SH_READDIR_SIZE_KEY, priv->sh_readdir_size,
- options, size_uint64, out);
- /* Reset this so we re-discover in case the topology changed. */
- GF_OPTION_RECONF ("ensure-durability", priv->ensure_durability, options,
- bool, out);
+ GF_OPTION_RECONF("halo-max-replicas", priv->halo_max_replicas, options,
+ uint32, out);
- GF_OPTION_RECONF ("self-heal-daemon", priv->shd.enabled, options,
- bool, out);
+ GF_OPTION_RECONF("halo-min-replicas", priv->halo_min_replicas, options,
+ uint32, out);
- GF_OPTION_RECONF ("iam-self-heal-daemon", priv->shd.iamshd, options,
- bool, out);
+ GF_OPTION_RECONF("read-subvolume", read_subvol, options, xlator, out);
- GF_OPTION_RECONF ("heal-timeout", priv->shd.timeout, options,
- int32, out);
+ choose_local_old = priv->choose_local;
+ GF_OPTION_RECONF("choose-local", priv->choose_local, options, bool, out);
- GF_OPTION_RECONF ("quorum-reads", priv->quorum_reads, options,
- bool, out);
- GF_OPTION_RECONF ("consistent-metadata", priv->consistent_metadata,
- options, bool, out);
+ if (choose_local_old != priv->choose_local) {
+ priv->read_child = -1;
+ if (choose_local_old == _gf_false)
+ priv->did_discovery = _gf_false;
+ }
+
+ GF_OPTION_RECONF("read-hash-mode", priv->hash_mode, options, uint32, out);
+
+ if (read_subvol) {
+ index = xlator_subvolume_index(this, read_subvol);
+ if (index == -1) {
+ gf_msg(this->name, GF_LOG_ERROR, 0, AFR_MSG_INVALID_SUBVOL,
+ "%s not a subvolume", read_subvol->name);
+ goto out;
+ }
+ priv->read_child = index;
+ }
+
+ GF_OPTION_RECONF("read-subvolume-index", read_subvol_index, options, int32,
+ out);
+
+ if (read_subvol_index > -1) {
+ index = read_subvol_index;
+ if (index >= priv->child_count) {
+ gf_msg(this->name, GF_LOG_ERROR, 0, AFR_MSG_INVALID_SUBVOL,
+ "%d not a subvolume-index", index);
+ goto out;
+ }
+ priv->read_child = index;
+ }
+
+ GF_OPTION_RECONF("pre-op-compat", priv->pre_op_compat, options, bool, out);
+ GF_OPTION_RECONF("locking-scheme", locking_scheme, options, str, out);
+ priv->granular_locks = (strcmp(locking_scheme, "granular") == 0);
+ GF_OPTION_RECONF("full-lock", priv->full_lock, options, bool, out);
+ GF_OPTION_RECONF("granular-entry-heal", priv->esh_granular, options, bool,
+ out);
+
+ GF_OPTION_RECONF("eager-lock", priv->eager_lock, options, bool, out);
+ GF_OPTION_RECONF("optimistic-change-log", priv->optimistic_change_log,
+ options, bool, out);
+ GF_OPTION_RECONF("quorum-type", qtype, options, str, out);
+ GF_OPTION_RECONF("quorum-count", priv->quorum_count, options, uint32, out);
+ fix_quorum_options(this, priv, qtype, options);
+ if (priv->quorum_count && !afr_has_quorum(priv->child_up, this, NULL))
+ gf_msg(this->name, GF_LOG_WARNING, 0, AFR_MSG_QUORUM_FAIL,
+ "Client-quorum is not met");
+
+ GF_OPTION_RECONF("post-op-delay-secs", priv->post_op_delay_secs, options,
+ uint32, out);
+
+ GF_OPTION_RECONF(AFR_SH_READDIR_SIZE_KEY, priv->sh_readdir_size, options,
+ size_uint64, out);
+ /* Reset this so we re-discover in case the topology changed. */
+ GF_OPTION_RECONF("ensure-durability", priv->ensure_durability, options,
+ bool, out);
+
+ enabled_old = priv->shd.enabled;
+ GF_OPTION_RECONF("self-heal-daemon", priv->shd.enabled, options, bool, out);
+
+ GF_OPTION_RECONF("iam-self-heal-daemon", priv->shd.iamshd, options, bool,
+ out);
+
+ timeout_old = priv->shd.timeout;
+ GF_OPTION_RECONF("heal-timeout", priv->shd.timeout, options, int32, out);
+
+ GF_OPTION_RECONF("consistent-metadata", priv->consistent_metadata, options,
+ bool, out);
+
+ GF_OPTION_RECONF("shd-max-threads", priv->shd.max_threads, options, uint32,
+ out);
+
+ GF_OPTION_RECONF("shd-wait-qlength", priv->shd.wait_qlength, options,
+ uint32, out);
+
+ GF_OPTION_RECONF("favorite-child-policy", fav_child_policy, options, str,
+ out);
+ if (afr_set_favorite_child_policy(priv, fav_child_policy) == -1)
+ goto out;
+
+ priv->did_discovery = _gf_false;
+
+ GF_OPTION_RECONF("consistent-io", consistent_io, options, bool, out);
+ if (priv->quorum_count != 0)
+ consistent_io = _gf_false;
+ priv->consistent_io = consistent_io;
+
+ afr_handle_anon_inode_options(priv, options);
+
+ GF_OPTION_RECONF("use-anonymous-inode", priv->use_anon_inode, options, bool,
+ out);
+ if (priv->shd.enabled) {
+ if ((priv->shd.enabled != enabled_old) ||
+ (timeout_old != priv->shd.timeout))
+ afr_selfheal_childup(this, priv);
+ }
+
+ ret = 0;
+out:
+ return ret;
+}
- priv->did_discovery = _gf_false;
+static int
+afr_pending_xattrs_init(afr_private_t *priv, xlator_t *this)
+{
+ int ret = -1;
+ int i = 0;
+ char *ptr = NULL;
+ char *ptr1 = NULL;
+ char *xattrs_list = NULL;
+ xlator_list_t *trav = NULL;
+ int child_count = -1;
+
+ trav = this->children;
+ child_count = priv->child_count;
+ if (priv->thin_arbiter_count) {
+ /* priv->pending_key[THIN_ARBITER_BRICK_INDEX] is used as the
+ * name of the thin arbiter file for persistence across add/
+ * removal of DHT subvols.*/
+ child_count++;
+ }
+
+ GF_OPTION_INIT("afr-pending-xattr", xattrs_list, str, out);
+ priv->pending_key = GF_CALLOC(sizeof(*priv->pending_key), child_count,
+ gf_afr_mt_char);
+ if (!priv->pending_key) {
+ ret = -ENOMEM;
+ goto out;
+ }
+ if (!xattrs_list) {
+ gf_msg(this->name, GF_LOG_WARNING, 0, AFR_MSG_NO_CHANGELOG,
+ "Unable to fetch afr-pending-xattr option from volfile."
+ " Falling back to using client translator names. ");
+ while (i < child_count) {
+ ret = gf_asprintf(&priv->pending_key[i], "%s.%s", AFR_XATTR_PREFIX,
+ trav->xlator->name);
+ if (ret == -1) {
+ ret = -ENOMEM;
+ goto out;
+ }
+ trav = trav->next;
+ i++;
+ }
ret = 0;
-out:
- return ret;
+ goto out;
+ }
+
+ ptr = ptr1 = gf_strdup(xattrs_list);
+ if (!ptr) {
+ ret = -ENOMEM;
+ goto out;
+ }
+ for (i = 0, ptr = strtok(ptr, ","); ptr; ptr = strtok(NULL, ",")) {
+ ret = gf_asprintf(&priv->pending_key[i], "%s.%s", AFR_XATTR_PREFIX,
+ ptr);
+ if (ret == -1) {
+ ret = -ENOMEM;
+ goto out;
+ }
+ i++;
+ }
+ ret = 0;
+out:
+ GF_FREE(ptr1);
+ return ret;
}
-
-static const char *favorite_child_warning_str = "You have specified subvolume '%s' "
- "as the 'favorite child'. This means that if a discrepancy in the content "
- "or attributes (ownership, permission, etc.) of a file is detected among "
- "the subvolumes, the file on '%s' will be considered the definitive "
- "version and its contents will OVERWRITE the contents of the file on other "
- "subvolumes. All versions of the file except that on '%s' "
- "WILL BE LOST.";
-
+void
+afr_ta_init(afr_private_t *priv)
+{
+ priv->thin_arbiter_count = 1;
+ priv->child_count--;
+ priv->ta_child_up = 0;
+ priv->ta_bad_child_index = AFR_CHILD_UNKNOWN;
+ priv->ta_notify_dom_lock_offset = 0;
+ priv->ta_in_mem_txn_count = 0;
+ priv->ta_on_wire_txn_count = 0;
+ priv->release_ta_notify_dom_lock = _gf_false;
+ INIT_LIST_HEAD(&priv->ta_waitq);
+ INIT_LIST_HEAD(&priv->ta_onwireq);
+ gf_uuid_clear(priv->ta_gfid);
+}
int32_t
-init (xlator_t *this)
+init(xlator_t *this)
{
- afr_private_t *priv = NULL;
- int child_count = 0;
- xlator_list_t *trav = NULL;
- int i = 0;
- int ret = -1;
- GF_UNUSED int op_errno = 0;
- xlator_t *read_subvol = NULL;
- int read_subvol_index = -1;
- xlator_t *fav_child = NULL;
- char *qtype = NULL;
- char *xattrs_list = NULL;
- char *ptr = NULL;
-
- if (!this->children) {
- gf_msg (this->name, GF_LOG_ERROR, 0,
- AFR_MSG_CHILD_MISCONFIGURED,
- "replicate translator needs more than one "
- "subvolume defined.");
- return -1;
+ afr_private_t *priv = NULL;
+ int child_count = 0;
+ xlator_list_t *trav = NULL;
+ int i = 0;
+ int ret = -1;
+ GF_UNUSED int op_errno = 0;
+ xlator_t *read_subvol = NULL;
+ int read_subvol_index = -1;
+ char *qtype = NULL;
+ char *fav_child_policy = NULL;
+ char *thin_arbiter = NULL;
+ char *data_self_heal = NULL;
+ char *locking_scheme = NULL;
+ char *data_self_heal_algorithm = NULL;
+
+ if (!this->children) {
+ gf_msg(this->name, GF_LOG_ERROR, 0, AFR_MSG_CHILD_MISCONFIGURED,
+ "replicate translator needs more than one "
+ "subvolume defined.");
+ return -1;
+ }
+
+ if (!this->parents) {
+ gf_msg(this->name, GF_LOG_WARNING, 0, AFR_MSG_VOL_MISCONFIGURED,
+ "Volume is dangling.");
+ }
+
+ this->private = GF_CALLOC(1, sizeof(afr_private_t),
+ gf_afr_mt_afr_private_t);
+ if (!this->private)
+ goto out;
+
+ priv = this->private;
+ INIT_LIST_HEAD(&priv->saved_locks);
+ INIT_LIST_HEAD(&priv->lk_healq);
+ LOCK_INIT(&priv->lock);
+
+ child_count = xlator_subvolume_count(this);
+
+ priv->child_count = child_count;
+
+ priv->read_child = -1;
+
+ GF_OPTION_INIT("arbiter-count", priv->arbiter_count, uint32, out);
+ GF_OPTION_INIT("thin-arbiter", thin_arbiter, str, out);
+ if (thin_arbiter && strlen(thin_arbiter) > 0) {
+ afr_ta_init(priv);
+ }
+ INIT_LIST_HEAD(&priv->healing);
+ INIT_LIST_HEAD(&priv->heal_waiting);
+
+ priv->spb_choice_timeout = AFR_DEFAULT_SPB_CHOICE_TIMEOUT;
+
+ GF_OPTION_INIT("afr-dirty-xattr", priv->afr_dirty, str, out);
+
+ GF_OPTION_INIT("metadata-splitbrain-forced-heal",
+ priv->metadata_splitbrain_forced_heal, bool, out);
+
+ GF_OPTION_INIT("read-subvolume", read_subvol, xlator, out);
+ if (read_subvol) {
+ priv->read_child = xlator_subvolume_index(this, read_subvol);
+ if (priv->read_child == -1) {
+ gf_msg(this->name, GF_LOG_ERROR, 0, AFR_MSG_INVALID_SUBVOL,
+ "%s not a subvolume", read_subvol->name);
+ goto out;
}
-
- if (!this->parents) {
- gf_msg (this->name, GF_LOG_WARNING, 0,
- AFR_MSG_VOL_MISCONFIGURED, "Volume is dangling.");
+ }
+ GF_OPTION_INIT("read-subvolume-index", read_subvol_index, int32, out);
+ if (read_subvol_index > -1) {
+ if (read_subvol_index >= priv->child_count) {
+ gf_msg(this->name, GF_LOG_ERROR, 0, AFR_MSG_INVALID_SUBVOL,
+ "%d not a subvolume-index", read_subvol_index);
+ goto out;
}
+ priv->read_child = read_subvol_index;
+ }
+ GF_OPTION_INIT("choose-local", priv->choose_local, bool, out);
- this->private = GF_CALLOC (1, sizeof (afr_private_t),
- gf_afr_mt_afr_private_t);
- if (!this->private)
- goto out;
+ priv->pending_reads = GF_CALLOC(sizeof(*priv->pending_reads),
+ priv->child_count, gf_afr_mt_atomic_t);
- priv = this->private;
- LOCK_INIT (&priv->lock);
+ GF_OPTION_INIT("read-hash-mode", priv->hash_mode, uint32, out);
- child_count = xlator_subvolume_count (this);
+ priv->favorite_child = -1;
- priv->child_count = child_count;
+ GF_OPTION_INIT("favorite-child-policy", fav_child_policy, str, out);
+ if (afr_set_favorite_child_policy(priv, fav_child_policy) == -1)
+ goto out;
- priv->read_child = -1;
+ GF_OPTION_INIT("shd-max-threads", priv->shd.max_threads, uint32, out);
- GF_OPTION_INIT ("arbiter-count", priv->arbiter_count, uint32, out);
+ GF_OPTION_INIT("shd-wait-qlength", priv->shd.wait_qlength, uint32, out);
- priv->spb_choice_timeout = AFR_DEFAULT_SPB_CHOICE_TIMEOUT;
+ GF_OPTION_INIT("background-self-heal-count",
+ priv->background_self_heal_count, uint32, out);
- GF_OPTION_INIT ("afr-dirty-xattr", priv->afr_dirty, str, out);
+ GF_OPTION_INIT("heal-wait-queue-length", priv->heal_wait_qlen, uint32, out);
- GF_OPTION_INIT ("metadata-splitbrain-forced-heal",
- priv->metadata_splitbrain_forced_heal, bool, out);
+ GF_OPTION_INIT("data-self-heal", data_self_heal, str, out);
+ if (gf_string2boolean(data_self_heal, &priv->data_self_heal) == -1)
+ goto out;
- GF_OPTION_INIT ("read-subvolume", read_subvol, xlator, out);
- if (read_subvol) {
- priv->read_child = xlator_subvolume_index (this, read_subvol);
- if (priv->read_child == -1) {
- gf_msg (this->name, GF_LOG_ERROR, 0,
- AFR_MSG_INVALID_SUBVOL, "%s not a subvolume",
- read_subvol->name);
- goto out;
- }
- }
- GF_OPTION_INIT ("read-subvolume-index",read_subvol_index,int32,out);
- if (read_subvol_index > -1) {
- if (read_subvol_index >= priv->child_count) {
- gf_msg (this->name, GF_LOG_ERROR, 0,
- AFR_MSG_INVALID_SUBVOL,
- "%d not a subvolume-index", read_subvol_index);
- goto out;
- }
- priv->read_child = read_subvol_index;
- }
- GF_OPTION_INIT ("choose-local", priv->choose_local, bool, out);
-
- GF_OPTION_INIT ("read-hash-mode", priv->hash_mode, uint32, out);
-
- priv->favorite_child = -1;
- GF_OPTION_INIT ("favorite-child", fav_child, xlator, out);
- if (fav_child) {
- priv->favorite_child = xlator_subvolume_index (this, fav_child);
- if (priv->favorite_child == -1) {
- gf_msg (this->name, GF_LOG_ERROR, 0,
- AFR_MSG_INVALID_SUBVOL, "%s not a subvolume, "
- "cannot set it as favorite child",
- fav_child->name);
- goto out;
- }
- gf_msg (this->name, GF_LOG_WARNING, 0, AFR_MSG_FAVORITE_CHILD,
- favorite_child_warning_str, fav_child->name,
- fav_child->name, fav_child->name);
- }
+ GF_OPTION_INIT("data-self-heal-algorithm", data_self_heal_algorithm, str,
+ out);
+ set_data_self_heal_algorithm(priv, data_self_heal_algorithm);
+ GF_OPTION_INIT("data-self-heal-window-size",
+ priv->data_self_heal_window_size, uint32, out);
- GF_OPTION_INIT ("background-self-heal-count",
- priv->background_self_heal_count, uint32, out);
+ GF_OPTION_INIT("metadata-self-heal", priv->metadata_self_heal, bool, out);
- GF_OPTION_INIT ("data-self-heal", priv->data_self_heal, str, out);
+ GF_OPTION_INIT("entry-self-heal", priv->entry_self_heal, bool, out);
- GF_OPTION_INIT ("data-self-heal-algorithm",
- priv->data_self_heal_algorithm, str, out);
+ GF_OPTION_INIT("halo-shd-max-latency", priv->shd.halo_max_latency_msec,
+ uint32, out);
- GF_OPTION_INIT ("data-self-heal-window-size",
- priv->data_self_heal_window_size, uint32, out);
+ GF_OPTION_INIT("halo-max-latency", priv->halo_max_latency_msec, uint32,
+ out);
+ GF_OPTION_INIT("halo-max-replicas", priv->halo_max_replicas, uint32, out);
+ GF_OPTION_INIT("halo-min-replicas", priv->halo_min_replicas, uint32, out);
- GF_OPTION_INIT ("metadata-self-heal", priv->metadata_self_heal, bool,
- out);
+ GF_OPTION_INIT("halo-enabled", priv->halo_enabled, bool, out);
- GF_OPTION_INIT ("entry-self-heal", priv->entry_self_heal, bool, out);
+ GF_OPTION_INIT("halo-nfsd-max-latency", priv->nfsd.halo_max_latency_msec,
+ uint32, out);
- GF_OPTION_INIT ("data-change-log", priv->data_change_log, bool, out);
+ GF_OPTION_INIT("iam-nfs-daemon", priv->nfsd.iamnfsd, bool, out);
- GF_OPTION_INIT ("metadata-change-log", priv->metadata_change_log, bool,
- out);
+ GF_OPTION_INIT("optimistic-change-log", priv->optimistic_change_log, bool,
+ out);
- GF_OPTION_INIT ("entry-change-log", priv->entry_change_log, bool, out);
+ GF_OPTION_INIT("pre-op-compat", priv->pre_op_compat, bool, out);
+ GF_OPTION_INIT("locking-scheme", locking_scheme, str, out);
+ priv->granular_locks = (strcmp(locking_scheme, "granular") == 0);
+ GF_OPTION_INIT("full-lock", priv->full_lock, bool, out);
+ GF_OPTION_INIT("granular-entry-heal", priv->esh_granular, bool, out);
- GF_OPTION_INIT ("optimistic-change-log", priv->optimistic_change_log,
- bool, out);
+ GF_OPTION_INIT("eager-lock", priv->eager_lock, bool, out);
+ GF_OPTION_INIT("quorum-type", qtype, str, out);
+ GF_OPTION_INIT("quorum-count", priv->quorum_count, uint32, out);
+ GF_OPTION_INIT(AFR_SH_READDIR_SIZE_KEY, priv->sh_readdir_size, size_uint64,
+ out);
+ fix_quorum_options(this, priv, qtype, this->options);
- GF_OPTION_INIT ("inodelk-trace", priv->inodelk_trace, bool, out);
+ GF_OPTION_INIT("post-op-delay-secs", priv->post_op_delay_secs, uint32, out);
+ GF_OPTION_INIT("ensure-durability", priv->ensure_durability, bool, out);
- GF_OPTION_INIT ("entrylk-trace", priv->entrylk_trace, bool, out);
+ GF_OPTION_INIT("self-heal-daemon", priv->shd.enabled, bool, out);
- GF_OPTION_INIT ("pre-op-compat", priv->pre_op_compat, bool, out);
+ GF_OPTION_INIT("iam-self-heal-daemon", priv->shd.iamshd, bool, out);
+ GF_OPTION_INIT("heal-timeout", priv->shd.timeout, int32, out);
- GF_OPTION_INIT ("eager-lock", priv->eager_lock, bool, out);
- GF_OPTION_INIT ("quorum-type", qtype, str, out);
- GF_OPTION_INIT ("quorum-count", priv->quorum_count, uint32, out);
- GF_OPTION_INIT (AFR_SH_READDIR_SIZE_KEY, priv->sh_readdir_size, size_uint64,
- out);
- fix_quorum_options (this, priv, qtype, this->options);
+ GF_OPTION_INIT("consistent-metadata", priv->consistent_metadata, bool, out);
+ GF_OPTION_INIT("consistent-io", priv->consistent_io, bool, out);
+ afr_handle_anon_inode_options(priv, this->options);
- GF_OPTION_INIT ("post-op-delay-secs", priv->post_op_delay_secs, uint32, out);
- GF_OPTION_INIT ("ensure-durability", priv->ensure_durability, bool,
- out);
+ GF_OPTION_INIT("use-anonymous-inode", priv->use_anon_inode, bool, out);
+ if (priv->quorum_count != 0)
+ priv->consistent_io = _gf_false;
- GF_OPTION_INIT ("self-heal-daemon", priv->shd.enabled, bool, out);
+ priv->wait_count = 1;
- GF_OPTION_INIT ("iam-self-heal-daemon", priv->shd.iamshd, bool, out);
- GF_OPTION_INIT ("heal-timeout", priv->shd.timeout, int32, out);
+ priv->local = GF_CALLOC(sizeof(unsigned char), child_count, gf_afr_mt_char);
+ if (!priv->local) {
+ ret = -ENOMEM;
+ goto out;
+ }
- GF_OPTION_INIT ("quorum-reads", priv->quorum_reads, bool, out);
- GF_OPTION_INIT ("consistent-metadata", priv->consistent_metadata, bool,
- out);
+ priv->anon_inode = GF_CALLOC(sizeof(unsigned char), child_count,
+ gf_afr_mt_char);
- priv->wait_count = 1;
+ priv->child_up = GF_CALLOC(sizeof(unsigned char), child_count,
+ gf_afr_mt_char);
- priv->child_up = GF_CALLOC (sizeof (unsigned char), child_count,
+ priv->child_latency = GF_MALLOC(sizeof(*priv->child_latency) * child_count,
+ gf_afr_mt_child_latency_t);
+ priv->halo_child_up = GF_CALLOC(sizeof(unsigned char), child_count,
gf_afr_mt_char);
- if (!priv->child_up) {
- ret = -ENOMEM;
- goto out;
- }
-
- for (i = 0; i < child_count; i++)
- priv->child_up[i] = -1; /* start with unknown state.
- this initialization needed
- for afr_notify() to work
- reliably
- */
-
- priv->children = GF_CALLOC (sizeof (xlator_t *), child_count,
- gf_afr_mt_xlator_t);
- if (!priv->children) {
- ret = -ENOMEM;
- goto out;
- }
-
- GF_OPTION_INIT ("afr-pending-xattr", xattrs_list, str, out);
- priv->pending_key = GF_CALLOC (sizeof (*priv->pending_key),
- child_count,
- gf_afr_mt_char);
- if (!priv->pending_key) {
- ret = -ENOMEM;
- goto out;
- }
- if (!xattrs_list) {
- ret = -EINVAL;
- gf_msg (this->name, GF_LOG_ERROR, -ret, AFR_MSG_NO_CHANGELOG,
- "Unable to fetch afr pending changelogs. Is op-version"
- " >= 30707?");
- goto out;
- }
- ptr = gf_strdup (xattrs_list);
- if (!ptr) {
- ret = -ENOMEM;
- goto out;
- }
- for (i = 0, ptr = strtok (ptr, ","); ptr; ptr = strtok (NULL, ",")) {
- ret = gf_asprintf (&priv->pending_key[i], "%s.%s",
- AFR_XATTR_PREFIX, ptr);
- if (ret == -1) {
- ret = -ENOMEM;
- goto out;
- }
- i++;
- }
- trav = this->children;
- i = 0;
- while (i < child_count) {
- priv->children[i] = trav->xlator;
- trav = trav->next;
- i++;
+ if (!priv->child_up || !priv->child_latency || !priv->halo_child_up ||
+ !priv->anon_inode) {
+ ret = -ENOMEM;
+ goto out;
+ }
+ /*Initialize to -ve ping timeout so that they are not considered
+ * in child-up events until ping-event comes*/
+ for (i = 0; i < child_count; i++)
+ priv->child_latency[i] = -1;
+
+ priv->children = GF_CALLOC(sizeof(xlator_t *), child_count,
+ gf_afr_mt_xlator_t);
+ if (!priv->children) {
+ ret = -ENOMEM;
+ goto out;
+ }
+
+ ret = afr_pending_xattrs_init(priv, this);
+ if (ret)
+ goto out;
+
+ trav = this->children;
+ i = 0;
+ while (i < child_count) {
+ priv->children[i] = trav->xlator;
+ trav = trav->next;
+ i++;
+ }
+
+ ret = gf_asprintf(&priv->sh_domain, AFR_SH_DATA_DOMAIN_FMT, this->name);
+ if (-1 == ret) {
+ ret = -ENOMEM;
+ goto out;
+ }
+
+ priv->last_event = GF_CALLOC(child_count, sizeof(*priv->last_event),
+ gf_afr_mt_int32_t);
+ if (!priv->last_event) {
+ ret = -ENOMEM;
+ goto out;
+ }
+
+ this->itable = inode_table_new(SHD_INODE_LRU_LIMIT, this);
+ if (!this->itable) {
+ ret = -ENOMEM;
+ goto out;
+ }
+
+ if (priv->shd.iamshd) {
+ ret = afr_selfheal_daemon_init(this);
+ if (ret) {
+ ret = -ENOMEM;
+ goto out;
}
+ }
- ret = gf_asprintf (&priv->sh_domain, AFR_SH_DATA_DOMAIN_FMT,
- this->name);
- if (-1 == ret) {
- ret = -ENOMEM;
- goto out;
- }
+ /* keep more local here as we may need them for self-heal etc */
+ this->local_pool = mem_pool_new(afr_local_t, 512);
+ if (!this->local_pool) {
+ ret = -1;
+ goto out;
+ }
- priv->last_event = GF_CALLOC (child_count, sizeof (*priv->last_event),
- gf_afr_mt_int32_t);
- if (!priv->last_event) {
- ret = -ENOMEM;
- goto out;
- }
+ priv->root_inode = NULL;
- ret = afr_selfheal_daemon_init (this);
- if (ret) {
- ret = -ENOMEM;
- goto out;
- }
+ ret = 0;
+out:
+ return ret;
+}
+void
+afr_destroy_healer_object(xlator_t *this, struct subvol_healer *healer)
+{
+ int ret = -1;
+
+ if (!healer)
+ return;
+
+ if (healer->running) {
+ /*
+ * If there are any resources to cleanup, We need
+ * to do that gracefully using pthread_cleanup_push
+ */
+ ret = gf_thread_cleanup_xint(healer->thread);
+ if (ret)
+ gf_msg(this->name, GF_LOG_WARNING, 0, AFR_MSG_SELF_HEAL_FAILED,
+ "Failed to clean up healer threads.");
+ healer->thread = 0;
+ }
+ pthread_cond_destroy(&healer->cond);
+ pthread_mutex_destroy(&healer->mutex);
+}
- /* keep more local here as we may need them for self-heal etc */
- this->local_pool = mem_pool_new (afr_local_t, 512);
- if (!this->local_pool) {
- ret = -1;
- goto out;
- }
+void
+afr_selfheal_daemon_fini(xlator_t *this)
+{
+ struct subvol_healer *healer = NULL;
+ afr_self_heald_t *shd = NULL;
+ afr_private_t *priv = NULL;
+ int i = 0;
+
+ priv = this->private;
+ if (!priv)
+ return;
+
+ shd = &priv->shd;
+ if (!shd->iamshd)
+ return;
+
+ for (i = 0; i < priv->child_count; i++) {
+ healer = &shd->index_healers[i];
+ afr_destroy_healer_object(this, healer);
+
+ healer = &shd->full_healers[i];
+ afr_destroy_healer_object(this, healer);
+
+ if (shd->statistics[i])
+ eh_destroy(shd->statistics[i]);
+ }
+ GF_FREE(shd->index_healers);
+ GF_FREE(shd->full_healers);
+ GF_FREE(shd->statistics);
+ if (shd->split_brain)
+ eh_destroy(shd->split_brain);
+}
+void
+fini(xlator_t *this)
+{
+ afr_private_t *priv = NULL;
- priv->root_inode = NULL;
+ priv = this->private;
- ret = 0;
-out:
- GF_FREE (ptr);
- return ret;
-}
+ afr_selfheal_daemon_fini(this);
+ GF_ASSERT(list_empty(&priv->saved_locks));
+ LOCK(&priv->lock);
+ if (priv->timer != NULL) {
+ gf_timer_call_cancel(this->ctx, priv->timer);
+ priv->timer = NULL;
+ }
+ UNLOCK(&priv->lock);
-int
-fini (xlator_t *this)
-{
- afr_private_t *priv = NULL;
+ if (this->local_pool != NULL) {
+ mem_pool_destroy(this->local_pool);
+ this->local_pool = NULL;
+ }
- priv = this->private;
- this->private = NULL;
- afr_priv_destroy (priv);
- //if (this->itable);//I dont see any destroy func
+ this->private = NULL;
+ afr_priv_destroy(priv);
+ if (this->itable) {
+ inode_table_destroy(this->itable);
+ this->itable = NULL;
+ }
- return 0;
+ return;
}
-
struct xlator_fops fops = {
- .lookup = afr_lookup,
- .open = afr_open,
- .lk = afr_lk,
- .flush = afr_flush,
- .statfs = afr_statfs,
- .fsync = afr_fsync,
- .fsyncdir = afr_fsyncdir,
- .xattrop = afr_xattrop,
- .fxattrop = afr_fxattrop,
- .inodelk = afr_inodelk,
- .finodelk = afr_finodelk,
- .entrylk = afr_entrylk,
- .fentrylk = afr_fentrylk,
-
- /* inode read */
- .access = afr_access,
- .stat = afr_stat,
- .fstat = afr_fstat,
- .readlink = afr_readlink,
- .getxattr = afr_getxattr,
- .fgetxattr = afr_fgetxattr,
- .readv = afr_readv,
-
- /* inode write */
- .writev = afr_writev,
- .truncate = afr_truncate,
- .ftruncate = afr_ftruncate,
- .setxattr = afr_setxattr,
- .fsetxattr = afr_fsetxattr,
- .setattr = afr_setattr,
- .fsetattr = afr_fsetattr,
- .removexattr = afr_removexattr,
- .fremovexattr = afr_fremovexattr,
- .fallocate = afr_fallocate,
- .discard = afr_discard,
- .zerofill = afr_zerofill,
-
- /* dir read */
- .opendir = afr_opendir,
- .readdir = afr_readdir,
- .readdirp = afr_readdirp,
-
- /* dir write */
- .create = afr_create,
- .mknod = afr_mknod,
- .mkdir = afr_mkdir,
- .unlink = afr_unlink,
- .rmdir = afr_rmdir,
- .link = afr_link,
- .symlink = afr_symlink,
- .rename = afr_rename,
+ .lookup = afr_lookup,
+ .lk = afr_lk,
+ .flush = afr_flush,
+ .statfs = afr_statfs,
+ .fsyncdir = afr_fsyncdir,
+ .inodelk = afr_inodelk,
+ .finodelk = afr_finodelk,
+ .entrylk = afr_entrylk,
+ .fentrylk = afr_fentrylk,
+ .ipc = afr_ipc,
+ .lease = afr_lease,
+
+ /* inode read */
+ .access = afr_access,
+ .stat = afr_stat,
+ .fstat = afr_fstat,
+ .readlink = afr_readlink,
+ .getxattr = afr_getxattr,
+ .fgetxattr = afr_fgetxattr,
+ .readv = afr_readv,
+ .seek = afr_seek,
+
+ /* inode write */
+ .writev = afr_writev,
+ .truncate = afr_truncate,
+ .ftruncate = afr_ftruncate,
+ .setxattr = afr_setxattr,
+ .fsetxattr = afr_fsetxattr,
+ .setattr = afr_setattr,
+ .fsetattr = afr_fsetattr,
+ .removexattr = afr_removexattr,
+ .fremovexattr = afr_fremovexattr,
+ .fallocate = afr_fallocate,
+ .discard = afr_discard,
+ .zerofill = afr_zerofill,
+ .xattrop = afr_xattrop,
+ .fxattrop = afr_fxattrop,
+ .fsync = afr_fsync,
+
+ /*inode open*/
+ .opendir = afr_opendir,
+ .open = afr_open,
+
+ /* dir read */
+ .readdir = afr_readdir,
+ .readdirp = afr_readdirp,
+
+ /* dir write */
+ .create = afr_create,
+ .mknod = afr_mknod,
+ .mkdir = afr_mkdir,
+ .unlink = afr_unlink,
+ .rmdir = afr_rmdir,
+ .link = afr_link,
+ .symlink = afr_symlink,
+ .rename = afr_rename,
};
-
struct xlator_dumpops dumpops = {
- .priv = afr_priv_dump,
+ .priv = afr_priv_dump,
};
-
struct xlator_cbks cbks = {
- .release = afr_release,
- .releasedir = afr_releasedir,
- .forget = afr_forget,
+ .release = afr_release,
+ .releasedir = afr_releasedir,
+ .forget = afr_forget,
};
-
struct volume_options options[] = {
- { .key = {"read-subvolume" },
- .type = GF_OPTION_TYPE_XLATOR,
- .description = "inode-read fops happen only on one of the bricks in "
- "replicate. Afr will prefer the one specified using "
- "this option if it is not stale. Option value must be "
- "one of the xlator names of the children. "
- "Ex: <volname>-client-0 till "
- "<volname>-client-<number-of-bricks - 1>"
- },
- { .key = {"read-subvolume-index" },
- .type = GF_OPTION_TYPE_INT,
- .default_value = "-1",
- .description = "inode-read fops happen only on one of the bricks in "
- "replicate. AFR will prefer the one specified using "
- "this option if it is not stale. allowed options"
- " include -1 till replica-count - 1"
- },
- { .key = {"read-hash-mode" },
- .type = GF_OPTION_TYPE_INT,
- .min = 0,
- .max = 2,
- .default_value = "1",
- .description = "inode-read fops happen only on one of the bricks in "
- "replicate. AFR will prefer the one computed using "
- "the method specified using this option"
- "0 = first up server, "
- "1 = hash by GFID of file (all clients use "
- "same subvolume), "
- "2 = hash by GFID of file and client PID",
- },
- { .key = {"choose-local" },
- .type = GF_OPTION_TYPE_BOOL,
- .default_value = "true",
- .description = "Choose a local subvolume (i.e. Brick) to read from"
- " if read-subvolume is not explicitly set.",
- },
- { .key = {"favorite-child"},
- .type = GF_OPTION_TYPE_XLATOR,
- .description = "If a split-brain happens choose subvol/brick set by "
- "this option as source."
- },
- { .key = {"background-self-heal-count"},
- .type = GF_OPTION_TYPE_INT,
- .min = 0,
- .default_value = "16",
- .validate = GF_OPT_VALIDATE_MIN,
- .description = "This specifies the number of self-heals that can be "
- " performed in background without blocking the fop"
- },
- { .key = {"data-self-heal"},
- .type = GF_OPTION_TYPE_STR,
- .value = {"1", "on", "yes", "true", "enable",
- "0", "off", "no", "false", "disable",
- "open"},
- .default_value = "on",
- .description = "Using this option we can enable/disable data "
- "self-heal on the file. \"open\" means data "
- "self-heal action will only be triggered by file "
- "open operations."
- },
- { .key = {"data-self-heal-algorithm"},
- .type = GF_OPTION_TYPE_STR,
- .description = "Select between \"full\", \"diff\". The "
- "\"full\" algorithm copies the entire file from "
- "source to sink. The \"diff\" algorithm copies to "
- "sink only those blocks whose checksums don't match "
- "with those of source. If no option is configured "
- "the option is chosen dynamically as follows: "
- "If the file does not exist on one of the sinks "
- "or empty file exists or if the source file size is "
- "about the same as page size the entire file will "
- "be read and written i.e \"full\" algo, "
- "otherwise \"diff\" algo is chosen.",
- .value = { "diff", "full"}
- },
- { .key = {"data-self-heal-window-size"},
- .type = GF_OPTION_TYPE_INT,
- .min = 1,
- .max = 1024,
- .default_value = "1",
- .description = "Maximum number blocks per file for which self-heal "
- "process would be applied simultaneously."
- },
- { .key = {"metadata-self-heal"},
- .type = GF_OPTION_TYPE_BOOL,
- .default_value = "on",
- .description = "Using this option we can enable/disable metadata "
- "i.e. Permissions, ownerships, xattrs self-heal on "
- "the file/directory."
- },
- { .key = {"entry-self-heal"},
- .type = GF_OPTION_TYPE_BOOL,
- .default_value = "on",
- .description = "Using this option we can enable/disable entry "
- "self-heal on the directory."
- },
- { .key = {"data-change-log"},
- .type = GF_OPTION_TYPE_BOOL,
- .default_value = "on",
- .description = "Data fops like write/truncate will not perform "
- "pre/post fop changelog operations in afr transaction "
- "if this option is disabled"
- },
- { .key = {"metadata-change-log"},
- .type = GF_OPTION_TYPE_BOOL,
- .default_value = "on",
- .description = "Metadata fops like setattr/setxattr will not perform "
- "pre/post fop changelog operations in afr transaction "
- "if this option is disabled"
- },
- { .key = {"entry-change-log"},
- .type = GF_OPTION_TYPE_BOOL,
- .default_value = "on",
- .description = "Entry fops like create/unlink will not perform "
- "pre/post fop changelog operations in afr transaction "
- "if this option is disabled"
- },
- { .key = {"optimistic-change-log"},
- .type = GF_OPTION_TYPE_BOOL,
- .default_value = "on",
- .description = "Entry/Metadata fops will not perform "
- "pre fop changelog operations in afr transaction "
- "if this option is enabled."
- },
- { .key = {"inodelk-trace"},
- .type = GF_OPTION_TYPE_BOOL,
- .default_value = "off",
- .description = "Enabling this option logs inode lock/unlocks"
- },
- { .key = {"entrylk-trace"},
- .type = GF_OPTION_TYPE_BOOL,
- .default_value = "off",
- .description = "Enabling this option logs entry lock/unlocks"
- },
- { .key = {"pre-op-compat"},
- .type = GF_OPTION_TYPE_BOOL,
- .default_value = "on",
- .description = "Use separate pre-op xattrop() FOP rather than "
- "overloading xdata of the OP"
- },
- { .key = {"eager-lock"},
- .type = GF_OPTION_TYPE_BOOL,
- .default_value = "on",
- .description = "Lock phase of a transaction has two sub-phases. "
- "First is an attempt to acquire locks in parallel by "
- "broadcasting non-blocking lock requests. If lock "
- "acquisition fails on any server, then the held locks "
- "are unlocked and revert to a blocking locked mode "
- "sequentially on one server after another. If this "
- "option is enabled the initial broadcasting lock "
- "request attempt to acquire lock on the entire file. "
- "If this fails, we revert back to the sequential "
- "\"regional\" blocking lock as before. In the case "
- "where such an \"eager\" lock is granted in the "
- "non-blocking phase, it gives rise to an opportunity "
- "for optimization. i.e, if the next write transaction "
- "on the same FD arrives before the unlock phase of "
- "the first transaction, it \"takes over\" the full "
- "file lock. Similarly if yet another data transaction "
- "arrives before the unlock phase of the \"optimized\" "
- "transaction, that in turn \"takes over\" the lock as "
- "well. The actual unlock now happens at the end of "
- "the last \"optimized\" transaction."
-
- },
- { .key = {"self-heal-daemon"},
- .type = GF_OPTION_TYPE_BOOL,
- .default_value = "on",
- .description = "This option applies to only self-heal-daemon. "
- "Index directory crawl and automatic healing of files "
- "will not be performed if this option is turned off."
- },
- { .key = {"iam-self-heal-daemon"},
- .type = GF_OPTION_TYPE_BOOL,
- .default_value = "off",
- .description = "This option differentiates if the replicate "
- "translator is running as part of self-heal-daemon "
- "or not."
- },
- { .key = {"quorum-type"},
- .type = GF_OPTION_TYPE_STR,
- .value = { "none", "auto", "fixed"},
- .default_value = "none",
- .description = "If value is \"fixed\" only allow writes if "
- "quorum-count bricks are present. If value is "
- "\"auto\" only allow writes if more than half of "
- "bricks, or exactly half including the first, are "
- "present.",
- },
- { .key = {"quorum-count"},
- .type = GF_OPTION_TYPE_INT,
- .min = 1,
- .max = INT_MAX,
- .default_value = 0,
- .description = "If quorum-type is \"fixed\" only allow writes if "
- "this many bricks or present. Other quorum types "
- "will OVERWRITE this value.",
- },
- { .key = {"quorum-reads"},
- .type = GF_OPTION_TYPE_BOOL,
- .default_value = "no",
- .description = "If quorum-reads is \"true\" only allow reads if "
- "quorum is met when quorum is enabled.",
- },
- { .key = {"node-uuid"},
- .type = GF_OPTION_TYPE_STR,
- .description = "Local glusterd uuid string, used in starting "
- "self-heal-daemon so that it can crawl only on "
- "local index directories.",
- },
- { .key = {"post-op-delay-secs"},
- .type = GF_OPTION_TYPE_INT,
- .min = 0,
- .max = INT_MAX,
- .default_value = "1",
- .description = "Time interval induced artificially before "
- "post-operation phase of the transaction to "
- "enhance overlap of adjacent write operations.",
- },
- { .key = {AFR_SH_READDIR_SIZE_KEY},
- .type = GF_OPTION_TYPE_SIZET,
- .description = "readdirp size for performing entry self-heal",
- .min = 1024,
- .max = 131072,
- .default_value = "1KB",
- },
- { .key = {"ensure-durability"},
- .type = GF_OPTION_TYPE_BOOL,
- .description = "Afr performs fsyncs for transactions if this "
- "option is on to make sure the changelogs/data is "
- "written to the disk",
- .default_value = "on",
- },
- { .key = {"afr-dirty-xattr"},
- .type = GF_OPTION_TYPE_STR,
- .default_value = AFR_DIRTY_DEFAULT,
- },
- { .key = {"afr-pending-xattr"},
- .type = GF_OPTION_TYPE_STR,
- .description = "Comma seperated list of xattrs that are used to "
- "capture information on pending heals."
- },
- { .key = {"metadata-splitbrain-forced-heal"},
- .type = GF_OPTION_TYPE_BOOL,
- .default_value = "off",
- },
- { .key = {"heal-timeout"},
- .type = GF_OPTION_TYPE_INT,
- .min = 60,
- .max = INT_MAX,
- .default_value = "600",
- .description = "time interval for checking the need to self-heal "
- "in self-heal-daemon"
- },
- { .key = {"consistent-metadata"},
- .type = GF_OPTION_TYPE_BOOL,
- .default_value = "no",
- .description = "If this option is enabled, readdirp will force "
- "lookups on those entries read whose read child is "
- "not the same as that of the parent. This will "
- "guarantee that all read operations on a file serve "
- "attributes from the same subvol as long as it holds "
- " a good copy of the file/dir.",
- },
- { .key = {"arbiter-count"},
- .type = GF_OPTION_TYPE_INT,
- .description = "subset of child_count. Has to be 0 or 1."
- },
- { .key = {NULL} },
+ {.key = {"read-subvolume"},
+ .type = GF_OPTION_TYPE_XLATOR,
+ .op_version = {1},
+ .flags = OPT_FLAG_CLIENT_OPT | OPT_FLAG_SETTABLE | OPT_FLAG_DOC,
+ .tags = {"replicate"},
+ .description = "inode-read fops happen only on one of the bricks in "
+ "replicate. Afr will prefer the one specified using "
+ "this option if it is not stale. Option value must be "
+ "one of the xlator names of the children. "
+ "Ex: <volname>-client-0 till "
+ "<volname>-client-<number-of-bricks - 1>"},
+ {.key = {"read-subvolume-index"},
+ .type = GF_OPTION_TYPE_INT,
+ .default_value = "-1",
+ .op_version = {2},
+ .flags = OPT_FLAG_CLIENT_OPT | OPT_FLAG_SETTABLE | OPT_FLAG_DOC,
+ .tags = {"replicate"},
+ .description = "inode-read fops happen only on one of the bricks in "
+ "replicate. AFR will prefer the one specified using "
+ "this option if it is not stale. allowed options"
+ " include -1 till replica-count - 1"},
+ {.key = {"read-hash-mode"},
+ .type = GF_OPTION_TYPE_INT,
+ .min = 0,
+ .max = 5,
+ .default_value = "1",
+ .op_version = {2},
+ .flags = OPT_FLAG_CLIENT_OPT | OPT_FLAG_SETTABLE | OPT_FLAG_DOC,
+ .tags = {"replicate"},
+ .description =
+ "inode-read fops happen only on one of the bricks in "
+ "replicate. AFR will prefer the one computed using "
+ "the method specified using this option.\n"
+ "0 = first readable child of AFR, starting from 1st child.\n"
+ "1 = hash by GFID of file (all clients use "
+ "same subvolume).\n"
+ "2 = hash by GFID of file and client PID.\n"
+ "3 = brick having the least outstanding read requests.\n"
+ "4 = brick having the least network ping latency.\n"
+ "5 = Hybrid mode between 3 and 4, ie least value among "
+ "network-latency multiplied by outstanding-read-requests."},
+ {
+ .key = {"choose-local"},
+ .type = GF_OPTION_TYPE_BOOL,
+ .default_value = "true",
+ .op_version = {2},
+ .flags = OPT_FLAG_CLIENT_OPT | OPT_FLAG_SETTABLE | OPT_FLAG_DOC,
+ .tags = {"replicate"},
+ .description = "Choose a local subvolume (i.e. Brick) to read from"
+ " if read-subvolume is not explicitly set.",
+ },
+ {.key = {"background-self-heal-count"},
+ .type = GF_OPTION_TYPE_INT,
+ .min = 0,
+ .max = 256,
+ .default_value = "8",
+ .validate = GF_OPT_VALIDATE_MIN,
+ .op_version = {1},
+ .flags = OPT_FLAG_CLIENT_OPT | OPT_FLAG_SETTABLE | OPT_FLAG_DOC,
+ .tags = {"replicate"},
+ .description = "This specifies the number of per client self-heal "
+ "jobs that can perform parallel heals in the "
+ "background."},
+ {.key = {"halo-shd-max-latency"},
+ .type = GF_OPTION_TYPE_INT,
+ .min = 1,
+ .max = 99999,
+ .default_value = "99999",
+ .op_version = {GD_OP_VERSION_3_11_0},
+ .flags = OPT_FLAG_CLIENT_OPT | OPT_FLAG_SETTABLE | OPT_FLAG_DOC,
+ .tags = {"replicate", "halo"},
+ .description = "Maximum latency for shd halo replication in msec."},
+ {.key = {"halo-enabled"},
+ .type = GF_OPTION_TYPE_BOOL,
+ .default_value = "False",
+ .op_version = {GD_OP_VERSION_3_11_0},
+ .flags = OPT_FLAG_CLIENT_OPT | OPT_FLAG_SETTABLE | OPT_FLAG_DOC,
+ .tags = {"replicate", "halo"},
+ .description = "Enable Halo (geo) replication mode."},
+ {.key = {"halo-nfsd-max-latency"},
+ .type = GF_OPTION_TYPE_INT,
+ .min = 1,
+ .max = 99999,
+ .default_value = "5",
+ .op_version = {GD_OP_VERSION_3_11_0},
+ .flags = OPT_FLAG_CLIENT_OPT | OPT_FLAG_SETTABLE | OPT_FLAG_DOC,
+ .tags = {"replicate", "halo"},
+ .description = "Maximum latency for nfsd halo replication in msec."},
+ {.key = {"halo-max-latency"},
+ .type = GF_OPTION_TYPE_INT,
+ .min = 1,
+ .max = AFR_HALO_MAX_LATENCY,
+ .default_value = "5",
+ .op_version = {GD_OP_VERSION_3_11_0},
+ .flags = OPT_FLAG_CLIENT_OPT | OPT_FLAG_SETTABLE | OPT_FLAG_DOC,
+ .tags = {"replicate", "halo"},
+ .description = "Maximum latency for halo replication in msec."},
+ {.key = {"halo-max-replicas"},
+ .type = GF_OPTION_TYPE_INT,
+ .min = 1,
+ .max = 99999,
+ .default_value = "99999",
+ .op_version = {GD_OP_VERSION_3_11_0},
+ .flags = OPT_FLAG_CLIENT_OPT | OPT_FLAG_SETTABLE | OPT_FLAG_DOC,
+ .tags = {"replicate", "halo"},
+ .description = "The maximum number of halo replicas; replicas"
+ " beyond this value will be written asynchronously"
+ "via the SHD."},
+ {.key = {"halo-min-replicas"},
+ .type = GF_OPTION_TYPE_INT,
+ .min = 1,
+ .max = 99999,
+ .default_value = "2",
+ .op_version = {GD_OP_VERSION_3_11_0},
+ .flags = OPT_FLAG_CLIENT_OPT | OPT_FLAG_SETTABLE | OPT_FLAG_DOC,
+ .tags = {"replicate", "halo"},
+ .description = "The minimmum number of halo replicas, before adding "
+ "out of region replicas."},
+ {.key = {"heal-wait-queue-length"},
+ .type = GF_OPTION_TYPE_INT,
+ .min = 0,
+ .max = 10000, /*Around 100MB with sizeof(afr_local_t)= 10496 bytes*/
+ .default_value = "128",
+ .validate = GF_OPT_VALIDATE_MIN,
+ .op_version = {GD_OP_VERSION_3_7_10},
+ .flags = OPT_FLAG_CLIENT_OPT | OPT_FLAG_SETTABLE | OPT_FLAG_DOC,
+ .tags = {"replicate"},
+ .description = "This specifies the number of heals that can be queued"
+ " for the parallel background self heal jobs."},
+ {.key = {"data-self-heal"},
+ .type = GF_OPTION_TYPE_STR,
+ .value = {"1", "on", "yes", "true", "enable", "0", "off", "no", "false",
+ "disable", "open"},
+ .default_value = "off",
+ .op_version = {1},
+ .flags = OPT_FLAG_CLIENT_OPT | OPT_FLAG_SETTABLE | OPT_FLAG_DOC,
+ .tags = {"replicate"},
+ .description = "Using this option we can enable/disable data "
+ "self-heal on the file. \"open\" means data "
+ "self-heal action will only be triggered by file "
+ "open operations."},
+ {.key = {"data-self-heal-algorithm"},
+ .type = GF_OPTION_TYPE_STR,
+ .op_version = {1},
+ .flags = OPT_FLAG_CLIENT_OPT | OPT_FLAG_SETTABLE | OPT_FLAG_DOC,
+ .tags = {"replicate"},
+ .description = "Select between \"full\", \"diff\". The "
+ "\"full\" algorithm copies the entire file from "
+ "source to sink. The \"diff\" algorithm copies to "
+ "sink only those blocks whose checksums don't match "
+ "with those of source. If no option is configured "
+ "the option is chosen dynamically as follows: "
+ "If the file does not exist on one of the sinks "
+ "or empty file exists or if the source file size is "
+ "about the same as page size the entire file will "
+ "be read and written i.e \"full\" algo, "
+ "otherwise \"diff\" algo is chosen.",
+ .value = {"diff", "full"}},
+ {.key = {"data-self-heal-window-size"},
+ .type = GF_OPTION_TYPE_INT,
+ .min = 1,
+ .max = 1024,
+ .default_value = "1",
+ .op_version = {1},
+ .flags = OPT_FLAG_CLIENT_OPT | OPT_FLAG_SETTABLE | OPT_FLAG_DOC,
+ .tags = {"replicate"},
+ .description = "Maximum number blocks per file for which self-heal "
+ "process would be applied simultaneously."},
+ {.key = {"metadata-self-heal"},
+ .type = GF_OPTION_TYPE_BOOL,
+ .default_value = "off",
+ .op_version = {1},
+ .flags = OPT_FLAG_CLIENT_OPT | OPT_FLAG_SETTABLE | OPT_FLAG_DOC,
+ .tags = {"replicate"},
+ /*.validate_fn = validate_replica*/
+ .description = "Using this option we can enable/disable metadata "
+ "i.e. Permissions, ownerships, xattrs self-heal on "
+ "the file/directory."},
+ {.key = {"entry-self-heal"},
+ .type = GF_OPTION_TYPE_BOOL,
+ .default_value = "off",
+ .op_version = {1},
+ .flags = OPT_FLAG_CLIENT_OPT | OPT_FLAG_SETTABLE | OPT_FLAG_DOC,
+ .tags = {"replicate"},
+ /*.validate_fn = validate_replica*/
+ .description = "Using this option we can enable/disable entry "
+ "self-heal on the directory."},
+ {.key = {"data-change-log"},
+ .type = GF_OPTION_TYPE_BOOL,
+ .default_value = "on",
+ .op_version = {1},
+ .flags = OPT_FLAG_CLIENT_OPT | OPT_FLAG_SETTABLE | OPT_FLAG_DOC,
+ .tags = {"replicate"},
+ .description = "This option exists only for backward compatibility "
+ "and configuring it doesn't have any effect"},
+ {.key = {"metadata-change-log"},
+ .type = GF_OPTION_TYPE_BOOL,
+ .default_value = "on",
+ .op_version = {1},
+ .flags = OPT_FLAG_CLIENT_OPT | OPT_FLAG_SETTABLE | OPT_FLAG_DOC,
+ .tags = {"replicate"},
+ .description = "This option exists only for backward compatibility "
+ "and configuring it doesn't have any effect"},
+ {.key = {"entry-change-log"},
+ .type = GF_OPTION_TYPE_BOOL,
+ .default_value = "on",
+ .op_version = {1},
+ .flags = OPT_FLAG_CLIENT_OPT | OPT_FLAG_SETTABLE | OPT_FLAG_DOC,
+ .tags = {"replicate"},
+ .description = "This option exists only for backward compatibility "
+ "and configuring it doesn't have any effect"},
+ {.key = {"optimistic-change-log"},
+ .type = GF_OPTION_TYPE_BOOL,
+ .default_value = "on",
+ .description = "Entry/Metadata fops will not perform "
+ "pre fop changelog operations in afr transaction "
+ "if this option is enabled."},
+ {.key = {"inodelk-trace"},
+ .type = GF_OPTION_TYPE_BOOL,
+ .default_value = "off",
+ .description = "Enabling this option logs inode lock/unlocks"},
+ {.key = {"entrylk-trace"},
+ .type = GF_OPTION_TYPE_BOOL,
+ .default_value = "off",
+ .description = "Enabling this option logs entry lock/unlocks"},
+ {.key = {"pre-op-compat"},
+ .type = GF_OPTION_TYPE_BOOL,
+ .default_value = "on",
+ .description = "Use separate pre-op xattrop() FOP rather than "
+ "overloading xdata of the OP"},
+ {.key = {"eager-lock"},
+ .type = GF_OPTION_TYPE_BOOL,
+ .default_value = "on",
+ .op_version = {1},
+ .flags = OPT_FLAG_CLIENT_OPT | OPT_FLAG_SETTABLE | OPT_FLAG_DOC,
+ .tags = {"replicate"},
+ .description =
+ "Enable/Disable eager lock for replica volume. "
+ "Lock phase of a transaction has two sub-phases. "
+ "First is an attempt to acquire locks in parallel by "
+ "broadcasting non-blocking lock requests. If lock "
+ "acquisition fails on any server, then the held locks "
+ "are unlocked and we revert to a blocking locks mode "
+ "sequentially on one server after another. If this "
+ "option is enabled the initial broadcasting lock "
+ "request attempts to acquire a full lock on the entire file. "
+ "If this fails, we revert back to the sequential "
+ "\"regional\" blocking locks as before. In the case "
+ "where such an \"eager\" lock is granted in the "
+ "non-blocking phase, it gives rise to an opportunity "
+ "for optimization. i.e, if the next write transaction "
+ "on the same FD arrives before the unlock phase of "
+ "the first transaction, it \"takes over\" the full "
+ "file lock. Similarly if yet another data transaction "
+ "arrives before the unlock phase of the \"optimized\" "
+ "transaction, that in turn \"takes over\" the lock as "
+ "well. The actual unlock now happens at the end of "
+ "the last \"optimized\" transaction."
+
+ },
+ {.key = {"self-heal-daemon"},
+ .type = GF_OPTION_TYPE_BOOL,
+ .default_value = "on",
+ .op_version = {1},
+ .flags = OPT_FLAG_CLIENT_OPT | OPT_FLAG_SETTABLE,
+ .tags = {"replicate"},
+ /*.validate_fn = validate_replica_heal_enable_disable*/
+ .description = "This option applies to only self-heal-daemon. "
+ "Index directory crawl and automatic healing of files "
+ "will not be performed if this option is turned off."},
+ {.key = {"iam-self-heal-daemon"},
+ .type = GF_OPTION_TYPE_BOOL,
+ .default_value = "off",
+ .description = "This option differentiates if the replicate "
+ "translator is running as part of self-heal-daemon "
+ "or not."},
+ {.key = {"iam-nfs-daemon"},
+ .type = GF_OPTION_TYPE_BOOL,
+ .default_value = "off",
+ .description = "This option differentiates if the replicate "
+ "translator is running as part of an NFS daemon "
+ "or not."},
+ {
+ .key = {"quorum-type"},
+ .type = GF_OPTION_TYPE_STR,
+ .value = {"none", "auto", "fixed"},
+ .default_value = "none",
+ .op_version = {1},
+ .flags = OPT_FLAG_CLIENT_OPT | OPT_FLAG_SETTABLE | OPT_FLAG_DOC,
+ .tags = {"replicate"},
+ /*.option = quorum-type*/
+ .description = "If value is \"fixed\" only allow writes if "
+ "quorum-count bricks are present. If value is "
+ "\"auto\" only allow writes if more than half of "
+ "bricks, or exactly half including the first, are "
+ "present.",
+ },
+ {
+ .key = {"quorum-count"},
+ .type = GF_OPTION_TYPE_INT,
+ .min = 1,
+ .max = INT_MAX,
+ .default_value = 0,
+ .op_version = {1},
+ .flags = OPT_FLAG_CLIENT_OPT | OPT_FLAG_SETTABLE | OPT_FLAG_DOC,
+ .tags = {"replicate"},
+ /*.option = quorum-count*/
+ /*.validate_fn = validate_quorum_count*/
+ .description = "If quorum-type is \"fixed\" only allow writes if "
+ "this many bricks are present. Other quorum types "
+ "will OVERWRITE this value.",
+ },
+ {
+ .key = {"quorum-reads"},
+ .type = GF_OPTION_TYPE_BOOL,
+ .default_value = "no",
+ .op_version = {GD_OP_VERSION_3_7_0},
+ .flags = OPT_FLAG_CLIENT_OPT | OPT_FLAG_SETTABLE | OPT_FLAG_DOC,
+ .tags = {"replicate"},
+ .description = "This option has been removed. Reads are not allowed "
+ "if quorum is not met.",
+ },
+ {
+ .key = {"node-uuid"},
+ .type = GF_OPTION_TYPE_STR,
+ .description = "Local glusterd uuid string, used in starting "
+ "self-heal-daemon so that it can crawl only on "
+ "local index directories.",
+ },
+ {
+ .key = {"post-op-delay-secs"},
+ .type = GF_OPTION_TYPE_INT,
+ .min = 0,
+ .max = INT_MAX,
+ .default_value = "1",
+ .op_version = {2},
+ .flags = OPT_FLAG_CLIENT_OPT | OPT_FLAG_SETTABLE | OPT_FLAG_DOC,
+ .tags = {"replicate"},
+ .description = "Time interval induced artificially before "
+ "post-operation phase of the transaction to "
+ "enhance overlap of adjacent write operations.",
+ },
+ {
+ .key = {AFR_SH_READDIR_SIZE_KEY},
+ .type = GF_OPTION_TYPE_SIZET,
+ .description = "readdirp size for performing entry self-heal",
+ .min = 1024,
+ .max = 131072,
+ .op_version = {2},
+ .flags = OPT_FLAG_CLIENT_OPT | OPT_FLAG_SETTABLE,
+ .tags = {"replicate"},
+ .default_value = "1KB",
+ },
+ {
+ .key = {"ensure-durability"},
+ .type = GF_OPTION_TYPE_BOOL,
+ .op_version = {3},
+ .flags = OPT_FLAG_CLIENT_OPT | OPT_FLAG_SETTABLE | OPT_FLAG_DOC,
+ .tags = {"replicate"},
+ .description = "Afr performs fsyncs for transactions if this "
+ "option is on to make sure the changelogs/data is "
+ "written to the disk",
+ .default_value = "on",
+ },
+ {
+ .key = {"afr-dirty-xattr"},
+ .type = GF_OPTION_TYPE_STR,
+ .default_value = AFR_DIRTY_DEFAULT,
+ },
+ {.key = {"afr-pending-xattr"},
+ .type = GF_OPTION_TYPE_STR,
+ .description = "Comma separated list of xattrs that are used to "
+ "capture information on pending heals."},
+ {
+ .key = {"metadata-splitbrain-forced-heal"},
+ .type = GF_OPTION_TYPE_BOOL,
+ .default_value = "off",
+ },
+ {.key = {"heal-timeout"},
+ .type = GF_OPTION_TYPE_INT,
+ .min = 5,
+ .max = INT_MAX,
+ .default_value = "600",
+ .op_version = {2},
+ .flags = OPT_FLAG_CLIENT_OPT | OPT_FLAG_SETTABLE | OPT_FLAG_DOC,
+ .tags = {"replicate"},
+ .description = "time interval for checking the need to self-heal "
+ "in self-heal-daemon"},
+ {
+ .key = {"consistent-metadata"},
+ .type = GF_OPTION_TYPE_BOOL,
+ .default_value = "no",
+ .op_version = {GD_OP_VERSION_3_7_0},
+ .flags = OPT_FLAG_CLIENT_OPT | OPT_FLAG_SETTABLE | OPT_FLAG_DOC,
+ .tags = {"replicate"},
+ .description = "If this option is enabled, readdirp will force "
+ "lookups on those entries read whose read child is "
+ "not the same as that of the parent. This will "
+ "guarantee that all read operations on a file serve "
+ "attributes from the same subvol as long as it holds "
+ " a good copy of the file/dir.",
+ },
+ {.key = {"arbiter-count"},
+ .type = GF_OPTION_TYPE_INT,
+ .description = "subset of child_count. Has to be 0 or 1."},
+ {
+ .key = {"thin-arbiter"},
+ .type = GF_OPTION_TYPE_STR,
+ .op_version = {GD_OP_VERSION_4_1_0},
+ .flags = OPT_FLAG_SETTABLE,
+ .tags = {"replicate"},
+ .description = "contains host:path of thin abriter brick",
+ },
+ {.key = {"shd-max-threads"},
+ .type = GF_OPTION_TYPE_INT,
+ .min = 1,
+ .max = 64,
+ .default_value = "1",
+ .op_version = {GD_OP_VERSION_3_7_12},
+ .flags = OPT_FLAG_CLIENT_OPT | OPT_FLAG_SETTABLE | OPT_FLAG_DOC,
+ .tags = {"replicate"},
+ .description = "Maximum number of parallel heals SHD can do per "
+ "local brick. This can substantially lower heal times"
+ ", but can also crush your bricks if you don't have "
+ "the storage hardware to support this."},
+ {
+ .key = {"shd-wait-qlength"},
+ .type = GF_OPTION_TYPE_INT,
+ .min = 1,
+ .max = 655536,
+ .default_value = "1024",
+ .op_version = {GD_OP_VERSION_3_7_12},
+ .flags = OPT_FLAG_CLIENT_OPT | OPT_FLAG_SETTABLE | OPT_FLAG_DOC,
+ .tags = {"replicate"},
+ .description = "This option can be used to control number of heals"
+ " that can wait in SHD per subvolume",
+ },
+ {
+ .key = {"locking-scheme"},
+ .type = GF_OPTION_TYPE_STR,
+ .value = {"full", "granular"},
+ .default_value = "full",
+ .op_version = {GD_OP_VERSION_3_7_12},
+ .flags = OPT_FLAG_CLIENT_OPT | OPT_FLAG_SETTABLE | OPT_FLAG_DOC,
+ .tags = {"replicate"},
+ .description = "If this option is set to granular, self-heal will "
+ "stop being compatible with afr-v1, which helps afr "
+ "be more granular while self-healing",
+ },
+ {.key = {"full-lock"},
+ .type = GF_OPTION_TYPE_BOOL,
+ .default_value = "yes",
+ .op_version = {GD_OP_VERSION_3_13_2},
+ .flags = OPT_FLAG_CLIENT_OPT | OPT_FLAG_SETTABLE,
+ .tags = {"replicate"},
+ .description = "If this option is disabled, then the IOs will take "
+ "range locks same as versions till 3.13.1."},
+ {
+ .key = {"granular-entry-heal"},
+ .type = GF_OPTION_TYPE_BOOL,
+ .default_value = "no",
+ .op_version = {GD_OP_VERSION_3_8_0},
+ .flags = OPT_FLAG_CLIENT_OPT | OPT_FLAG_SETTABLE | OPT_FLAG_DOC,
+ .tags = {"replicate"},
+ .description = "If this option is enabled, self-heal will resort to "
+ "granular way of recording changelogs and doing entry "
+ "self-heal.",
+ },
+ {
+ .key = {"favorite-child-policy"},
+ .type = GF_OPTION_TYPE_STR,
+ .value = {"none", "size", "ctime", "mtime", "majority"},
+ .default_value = "none",
+ .op_version = {GD_OP_VERSION_3_7_12},
+ .flags = OPT_FLAG_CLIENT_OPT | OPT_FLAG_SETTABLE | OPT_FLAG_DOC,
+ .tags = {"replicate"},
+ .description = "This option can be used to automatically resolve "
+ "split-brains using various policies without user "
+ "intervention. \"size\" picks the file with the "
+ "biggest size as the source. \"ctime\" and \"mtime\" "
+ "pick the file with the latest ctime and mtime "
+ "respectively as the source. \"majority\" picks a file"
+ " with identical mtime and size in more than half the "
+ "number of bricks in the replica.",
+ },
+ {
+ .key = {"consistent-io"},
+ .type = GF_OPTION_TYPE_BOOL,
+ .default_value = "no",
+ .description = "If this option is enabled, i/o will fail even if "
+ "one of the bricks is down in the replicas",
+ },
+ {.key = {"use-compound-fops"},
+ .type = GF_OPTION_TYPE_BOOL,
+ .default_value = "no",
+ .op_version = {GD_OP_VERSION_3_8_4},
+ .flags = OPT_FLAG_CLIENT_OPT | OPT_FLAG_SETTABLE | OPT_FLAG_DOC,
+ .tags = {"replicate"},
+ .description = "This option exists only for backward compatibility "
+ "and configuring it doesn't have any effect"},
+ {.key = {"use-anonymous-inode"},
+ .type = GF_OPTION_TYPE_BOOL,
+ .default_value = "no",
+ .op_version = {GD_OP_VERSION_8_0},
+ .flags = OPT_FLAG_CLIENT_OPT | OPT_FLAG_SETTABLE,
+ .tags = {"replicate"},
+ .description = "Setting this option heals directory renames efficiently"},
+
+ {.key = {NULL}},
+};
+
+xlator_api_t xlator_api = {
+ .init = init,
+ .fini = fini,
+ .notify = notify,
+ .reconfigure = reconfigure,
+ .mem_acct_init = mem_acct_init,
+ .op_version = {1}, /* Present from the initial version */
+ .dumpops = &dumpops,
+ .fops = &fops,
+ .cbks = &cbks,
+ .options = options,
+ .identifier = "replicate",
+ .category = GF_MAINTAINED,
};
diff --git a/xlators/cluster/afr/src/afr.h b/xlators/cluster/afr/src/afr.h
index 6f75e07f95c..d62f9a9caf2 100644
--- a/xlators/cluster/afr/src/afr.h
+++ b/xlators/cluster/afr/src/afr.h
@@ -8,162 +8,292 @@
cases as published by the Free Software Foundation.
*/
-
#ifndef __AFR_H__
#define __AFR_H__
-#include "call-stub.h"
-#include "compat-errno.h"
+#include <glusterfs/call-stub.h>
+#include <glusterfs/compat-errno.h>
#include "afr-mem-types.h"
#include "libxlator.h"
-#include "timer.h"
-#include "syncop.h"
+#include <glusterfs/timer.h>
+#include <glusterfs/syncop.h>
#include "afr-self-heald.h"
#include "afr-messages.h"
-#define AFR_XATTR_PREFIX "trusted.afr"
+#define SHD_INODE_LRU_LIMIT 1
#define AFR_PATHINFO_HEADER "REPLICATE:"
#define AFR_SH_READDIR_SIZE_KEY "self-heal-readdir-size"
#define AFR_SH_DATA_DOMAIN_FMT "%s:self-heal"
#define AFR_DIRTY_DEFAULT AFR_XATTR_PREFIX ".dirty"
-#define AFR_DIRTY (((afr_private_t *) (THIS->private))->afr_dirty)
+#define AFR_DIRTY (((afr_private_t *)(THIS->private))->afr_dirty)
-#define AFR_LOCKEE_COUNT_MAX 3
-#define AFR_DOM_COUNT_MAX 3
-#define AFR_NUM_CHANGE_LOGS 3 /*data + metadata + entry*/
+#define AFR_LOCKEE_COUNT_MAX 3
+#define AFR_DOM_COUNT_MAX 3
+#define AFR_NUM_CHANGE_LOGS 3 /*data + metadata + entry*/
#define AFR_DEFAULT_SPB_CHOICE_TIMEOUT 300 /*in seconds*/
#define ARBITER_BRICK_INDEX 2
+#define THIN_ARBITER_BRICK_INDEX 2
+#define AFR_TA_DOM_NOTIFY "afr.ta.dom-notify"
+#define AFR_TA_DOM_MODIFY "afr.ta.dom-modify"
+
+#define AFR_LK_HEAL_DOM "afr.lock-heal.domain"
+
+#define AFR_HALO_MAX_LATENCY 99999
+#define AFR_ANON_DIR_PREFIX ".glusterfs-anonymous-inode"
+
+#define PFLAG_PENDING (1 << 0)
+#define PFLAG_SBRAIN (1 << 1)
+
+typedef int (*afr_lock_cbk_t)(call_frame_t *frame, xlator_t *this);
+
+typedef int (*afr_read_txn_wind_t)(call_frame_t *frame, xlator_t *this,
+ int subvol);
+
+typedef int (*afr_inode_refresh_cbk_t)(call_frame_t *frame, xlator_t *this,
+ int err);
+
+typedef int (*afr_changelog_resume_t)(call_frame_t *frame, xlator_t *this);
+
+#define AFR_COUNT(array, max) \
+ ({ \
+ int __i; \
+ int __res = 0; \
+ for (__i = 0; __i < max; __i++) \
+ if (array[__i]) \
+ __res++; \
+ __res; \
+ })
+#define AFR_INTERSECT(dst, src1, src2, max) \
+ ({ \
+ int __i; \
+ for (__i = 0; __i < max; __i++) \
+ dst[__i] = src1[__i] && src2[__i]; \
+ })
+#define AFR_CMP(a1, a2, len) \
+ ({ \
+ int __cmp = 0; \
+ int __i; \
+ for (__i = 0; __i < len; __i++) \
+ if (a1[__i] != a2[__i]) { \
+ __cmp = 1; \
+ break; \
+ } \
+ __cmp; \
+ })
+#define AFR_IS_ARBITER_BRICK(priv, index) \
+ ((priv->arbiter_count == 1) && (index == ARBITER_BRICK_INDEX))
+
+#define AFR_SET_ERROR_AND_CHECK_SPLIT_BRAIN(ret, errnum) \
+ do { \
+ local->op_ret = ret; \
+ local->op_errno = errnum; \
+ if (local->op_errno == EIO) \
+ gf_msg(this->name, GF_LOG_ERROR, local->op_errno, \
+ AFR_MSG_SPLIT_BRAIN, \
+ "Failing %s on gfid %s: " \
+ "split-brain observed.", \
+ gf_fop_list[local->op], uuid_utoa(local->inode->gfid)); \
+ } while (0)
+
+#define AFR_ERROR_OUT_IF_FDCTX_INVALID(__fd, __this, __error, __label) \
+ do { \
+ afr_fd_ctx_t *__fd_ctx = NULL; \
+ __fd_ctx = afr_fd_ctx_get(__fd, __this); \
+ if (__fd_ctx && __fd_ctx->is_fd_bad) { \
+ __error = EBADF; \
+ goto __label; \
+ } \
+ } while (0)
-typedef int (*afr_lock_cbk_t) (call_frame_t *frame, xlator_t *this);
-
-typedef int (*afr_read_txn_wind_t) (call_frame_t *frame, xlator_t *this, int subvol);
-
-typedef int (*afr_inode_refresh_cbk_t) (call_frame_t *frame, xlator_t *this, int err);
-
-typedef int (*afr_changelog_resume_t) (call_frame_t *frame, xlator_t *this);
-
-#define alloca0(size) ({void *__ptr; __ptr = alloca(size); memset(__ptr, 0, size); __ptr;})
-#define AFR_COUNT(array,max) ({int __i; int __res = 0; for (__i = 0; __i < max; __i++) if (array[__i]) __res++; __res;})
-#define AFR_INTERSECT(dst,src1,src2,max) ({int __i; for (__i = 0; __i < max; __i++) dst[__i] = src1[__i] && src2[__i];})
-#define AFR_CMP(a1,a2,len) ({int __cmp = 0; int __i; for (__i = 0; __i < len; __i++) if (a1[__i] != a2[__i]) { __cmp = 1; break;} __cmp;})
-#define AFR_IS_ARBITER_BRICK(priv, index) ((priv->arbiter_count == 1) && (index == ARBITER_BRICK_INDEX))
-typedef struct _afr_private {
- gf_lock_t lock; /* to guard access to child_count, etc */
- unsigned int child_count; /* total number of children */
- unsigned int arbiter_count; /*subset of child_count.
- Has to be 0 or 1.*/
-
- xlator_t **children;
-
- inode_t *root_inode;
-
- unsigned char *child_up;
-
- char **pending_key;
-
- char *data_self_heal; /* on/off/open */
- char * data_self_heal_algorithm; /* name of algorithm */
- unsigned int data_self_heal_window_size; /* max number of pipelined
- read/writes */
-
- unsigned int background_self_heal_count;
- unsigned int background_self_heals_started;
- gf_boolean_t metadata_self_heal; /* on/off */
- gf_boolean_t entry_self_heal; /* on/off */
-
- gf_boolean_t data_change_log; /* on/off */
- gf_boolean_t metadata_change_log; /* on/off */
- gf_boolean_t entry_change_log; /* on/off */
-
- gf_boolean_t metadata_splitbrain_forced_heal; /* on/off */
- int read_child; /* read-subvolume */
- unsigned int hash_mode; /* for when read_child is not set */
- int favorite_child; /* subvolume to be preferred in resolving
- split-brain cases */
-
- gf_boolean_t inodelk_trace;
- gf_boolean_t entrylk_trace;
-
- unsigned int wait_count; /* # of servers to wait for success */
-
- uint64_t up_count; /* number of CHILD_UPs we have seen */
- uint64_t down_count; /* number of CHILD_DOWNs we have seen */
-
- gf_boolean_t optimistic_change_log;
- gf_boolean_t eager_lock;
- gf_boolean_t pre_op_compat; /* on/off */
- uint32_t post_op_delay_secs;
- unsigned int quorum_count;
- gf_boolean_t quorum_reads;
-
- char vol_uuid[UUID_SIZE + 1];
- int32_t *last_event;
-
- /* @event_generation: Keeps count of number of events received which can
- potentially impact consistency decisions. The events are CHILD_UP
- and CHILD_DOWN, when we have to recalculate the freshness/staleness
- of copies to detect if changes had happened while the other server
- was down. CHILD_DOWN and CHILD_UP can also be received on network
- disconnect/reconnects and not necessarily server going down/up.
- Recalculating freshness/staleness on network events is equally
- important as we might have had a network split brain.
- */
- uint32_t event_generation;
-
- gf_boolean_t choose_local;
- gf_boolean_t did_discovery;
- uint64_t sh_readdir_size;
- gf_boolean_t ensure_durability;
- char *sh_domain;
- char *afr_dirty;
-
- afr_self_heald_t shd;
-
- /* pump dependencies */
- void *pump_private;
- gf_boolean_t use_afr_in_pump;
- gf_boolean_t consistent_metadata;
- uint64_t spb_choice_timeout;
- gf_boolean_t need_heal;
-} afr_private_t;
-
+typedef enum {
+ AFR_READ_POLICY_FIRST_UP,
+ AFR_READ_POLICY_GFID_HASH,
+ AFR_READ_POLICY_GFID_PID_HASH,
+ AFR_READ_POLICY_LESS_LOAD,
+ AFR_READ_POLICY_LEAST_LATENCY,
+ AFR_READ_POLICY_LOAD_LATENCY_HYBRID,
+} afr_read_hash_mode_t;
typedef enum {
- AFR_DATA_TRANSACTION, /* truncate, write, ... */
- AFR_METADATA_TRANSACTION, /* chmod, chown, ... */
- AFR_ENTRY_TRANSACTION, /* create, rmdir, ... */
- AFR_ENTRY_RENAME_TRANSACTION, /* rename */
-} afr_transaction_type;
+ AFR_FAV_CHILD_NONE,
+ AFR_FAV_CHILD_BY_SIZE,
+ AFR_FAV_CHILD_BY_CTIME,
+ AFR_FAV_CHILD_BY_MTIME,
+ AFR_FAV_CHILD_BY_MAJORITY,
+ AFR_FAV_CHILD_POLICY_MAX,
+} afr_favorite_child_policy;
typedef enum {
- AFR_TRANSACTION_LK,
- AFR_SELFHEAL_LK,
-} transaction_lk_type_t;
+ AFR_SELFHEAL_DATA_FULL = 0,
+ AFR_SELFHEAL_DATA_DIFF,
+ AFR_SELFHEAL_DATA_DYNAMIC,
+} afr_data_self_heal_type_t;
typedef enum {
- AFR_LOCK_OP,
- AFR_UNLOCK_OP,
-} afr_lock_op_type_t;
+ AFR_CHILD_UNKNOWN = -1,
+ AFR_CHILD_ZERO,
+ AFR_CHILD_ONE,
+ AFR_CHILD_THIN_ARBITER,
+} afr_child_index;
typedef enum {
- AFR_DATA_SELF_HEAL_LK,
- AFR_METADATA_SELF_HEAL_LK,
- AFR_ENTRY_SELF_HEAL_LK,
-}selfheal_lk_type_t;
+ TA_WAIT_FOR_NOTIFY_LOCK_REL, /*FOP came after notify domain lock upcall
+ notification and waiting for its release.*/
+ TA_GET_INFO_FROM_TA_FILE, /*FOP needs post-op on ta file to get
+ *info about which brick is bad.*/
+ TA_INFO_IN_MEMORY_SUCCESS, /*Bad brick info is in memory and fop failed
+ *on BAD brick - Success*/
+ TA_INFO_IN_MEMORY_FAILED, /*Bad brick info is in memory and fop failed
+ *on GOOD brick - Failed*/
+ TA_SUCCESS, /*FOP succeeded on both data bricks.*/
+} afr_ta_fop_state_t;
+
+struct afr_nfsd {
+ uint32_t halo_max_latency_msec;
+ gf_boolean_t iamnfsd;
+};
+
+typedef struct _afr_lk_heal_info {
+ fd_t *fd;
+ int32_t cmd;
+ struct gf_flock flock;
+ dict_t *xdata_req;
+ unsigned char *locked_nodes;
+ struct list_head pos;
+ gf_lkowner_t lk_owner;
+ pid_t pid;
+ int32_t *child_up_event_gen;
+ int32_t *child_down_event_gen;
+} afr_lk_heal_info_t;
+
+typedef struct _afr_private {
+ gf_lock_t lock; /* to guard access to child_count, etc */
+ unsigned int child_count; /* total number of children */
+ unsigned int arbiter_count; /*subset of child_count.
+ Has to be 0 or 1.*/
+
+ xlator_t **children;
+
+ inode_t *root_inode;
+
+ int favorite_child; /* subvolume to be preferred in resolving
+ split-brain cases */
+ /* For thin-arbiter. */
+ uuid_t ta_gfid;
+ unsigned int thin_arbiter_count; /* 0 or 1 at the moment.*/
+ int ta_bad_child_index;
+ int ta_event_gen;
+ unsigned int ta_in_mem_txn_count;
+ unsigned int ta_on_wire_txn_count;
+ struct list_head ta_waitq;
+ struct list_head ta_onwireq;
+
+ unsigned char *anon_inode;
+ unsigned char *child_up;
+ unsigned char *halo_child_up;
+ int64_t *child_latency;
+ unsigned char *local;
+
+ char **pending_key;
+
+ afr_data_self_heal_type_t data_self_heal_algorithm;
+ unsigned int data_self_heal_window_size; /* max number of pipelined
+ read/writes */
+
+ struct list_head heal_waiting; /*queue for files that need heal*/
+ uint32_t heal_wait_qlen; /*configurable queue length for heal_waiting*/
+ int32_t heal_waiters; /* No. of elements currently in wait queue.*/
+
+ struct list_head healing; /* queue for files that are undergoing
+ background heal*/
+ uint32_t background_self_heal_count; /*configurable queue length for
+ healing queue*/
+ int32_t healers; /* No. of elements currently undergoing background
+ heal*/
+
+ gf_boolean_t release_ta_notify_dom_lock;
+
+ gf_boolean_t metadata_self_heal; /* on/off */
+ gf_boolean_t entry_self_heal; /* on/off */
+
+ gf_boolean_t metadata_splitbrain_forced_heal; /* on/off */
+ int read_child; /* read-subvolume */
+ gf_atomic_t *pending_reads; /*No. of pending read cbks per child.*/
+
+ gf_timer_t *timer; /* launched when parent up is received */
+
+ unsigned int wait_count; /* # of servers to wait for success */
+
+ unsigned char ta_child_up;
+ gf_boolean_t optimistic_change_log;
+ gf_boolean_t eager_lock;
+ gf_boolean_t pre_op_compat; /* on/off */
+ uint32_t post_op_delay_secs;
+ unsigned int quorum_count;
+
+ off_t ta_notify_dom_lock_offset;
+ afr_favorite_child_policy fav_child_policy; /*Policy to use for automatic
+ resolution of split-brains.*/
+ afr_read_hash_mode_t hash_mode; /* for when read_child is not set */
+
+ int32_t *last_event;
+
+ /* @event_generation: Keeps count of number of events received which can
+ potentially impact consistency decisions. The events are CHILD_UP
+ and CHILD_DOWN, when we have to recalculate the freshness/staleness
+ of copies to detect if changes had happened while the other server
+ was down. CHILD_DOWN and CHILD_UP can also be received on network
+ disconnect/reconnects and not necessarily server going down/up.
+ Recalculating freshness/staleness on network events is equally
+ important as we might have had a network split brain.
+ */
+ uint32_t event_generation;
+ char vol_uuid[UUID_SIZE + 1];
+
+ gf_boolean_t choose_local;
+ gf_boolean_t did_discovery;
+ gf_boolean_t ensure_durability;
+ gf_boolean_t halo_enabled;
+ gf_boolean_t consistent_metadata;
+ gf_boolean_t need_heal;
+ gf_boolean_t granular_locks;
+ uint64_t sh_readdir_size;
+ char *sh_domain;
+ char *afr_dirty;
+
+ uint64_t spb_choice_timeout;
+
+ afr_self_heald_t shd;
+ struct afr_nfsd nfsd;
+
+ uint32_t halo_max_latency_msec;
+ uint32_t halo_max_replicas;
+ uint32_t halo_min_replicas;
+
+ gf_boolean_t full_lock;
+ gf_boolean_t esh_granular;
+ gf_boolean_t consistent_io;
+ gf_boolean_t data_self_heal; /* on/off */
+ gf_boolean_t use_anon_inode;
+
+ /*For lock healing.*/
+ struct list_head saved_locks;
+ struct list_head lk_healq;
+
+ /*For anon-inode handling */
+ char anon_inode_name[NAME_MAX + 1];
+ char anon_gfid_str[UUID_SIZE + 1];
+} afr_private_t;
typedef enum {
- AFR_INODELK_TRANSACTION,
- AFR_INODELK_NB_TRANSACTION,
- AFR_ENTRYLK_TRANSACTION,
- AFR_ENTRYLK_NB_TRANSACTION,
- AFR_INODELK_SELFHEAL,
- AFR_INODELK_NB_SELFHEAL,
- AFR_ENTRYLK_SELFHEAL,
- AFR_ENTRYLK_NB_SELFHEAL,
-} afr_lock_call_type_t;
+ AFR_DATA_TRANSACTION, /* truncate, write, ... */
+ AFR_METADATA_TRANSACTION, /* chmod, chown, ... */
+ AFR_ENTRY_TRANSACTION, /* create, rmdir, ... */
+ AFR_ENTRY_RENAME_TRANSACTION, /* rename */
+} afr_transaction_type;
/*
xattr format: trusted.afr.volume = [x y z]
@@ -173,861 +303,940 @@ typedef enum {
*/
static inline int
-afr_index_for_transaction_type (afr_transaction_type type)
+afr_index_for_transaction_type(afr_transaction_type type)
{
- switch (type) {
-
+ switch (type) {
case AFR_DATA_TRANSACTION:
- return 0;
+ return 0;
case AFR_METADATA_TRANSACTION:
- return 1;
+ return 1;
case AFR_ENTRY_TRANSACTION:
case AFR_ENTRY_RENAME_TRANSACTION:
- return 2;
- }
+ return 2;
+ }
- return -1; /* make gcc happy */
+ return -1; /* make gcc happy */
}
static inline int
-afr_index_from_ia_type (ia_type_t type)
+afr_index_from_ia_type(ia_type_t type)
{
- switch (type) {
+ switch (type) {
case IA_IFDIR:
- return afr_index_for_transaction_type (AFR_ENTRY_TRANSACTION);
+ return afr_index_for_transaction_type(AFR_ENTRY_TRANSACTION);
case IA_IFREG:
- return afr_index_for_transaction_type (AFR_DATA_TRANSACTION);
- default: return -1;
- }
+ return afr_index_for_transaction_type(AFR_DATA_TRANSACTION);
+ default:
+ return -1;
+ }
}
typedef struct {
- loc_t loc;
- char *basename;
- unsigned char *locked_nodes;
- int locked_count;
+ struct gf_flock flock;
+ loc_t loc;
+ fd_t *fd;
+ char *basename;
+ unsigned char *locked_nodes;
+ int locked_count;
-} afr_entry_lockee_t;
+} afr_lockee_t;
int
-afr_entry_lockee_cmp (const void *l1, const void *l2);
+afr_entry_lockee_cmp(const void *l1, const void *l2);
typedef struct {
- char *domain; /* Domain on which inodelk is taken */
- struct gf_flock flock;
- unsigned char *locked_nodes;
- int32_t lock_count;
-} afr_inodelk_t;
+ loc_t *lk_loc;
-typedef struct {
- loc_t *lk_loc;
+ afr_lockee_t lockee[AFR_LOCKEE_COUNT_MAX];
- int lockee_count;
- afr_entry_lockee_t lockee[AFR_LOCKEE_COUNT_MAX];
+ const char *lk_basename;
+ const char *lower_basename;
+ const char *higher_basename;
- afr_inodelk_t inodelk[AFR_DOM_COUNT_MAX];
- const char *lk_basename;
- const char *lower_basename;
- const char *higher_basename;
- char lower_locked;
- char higher_locked;
+ unsigned char *lower_locked_nodes;
- unsigned char *locked_nodes;
- unsigned char *lower_locked_nodes;
+ afr_lock_cbk_t lock_cbk;
- selfheal_lk_type_t selfheal_lk_type;
- transaction_lk_type_t transaction_lk_type;
+ int lockee_count;
- int32_t lock_count;
- int32_t entrylk_lock_count;
+ int32_t lk_call_count;
+ int32_t lk_expected_count;
+ int32_t lk_attempted_count;
- uint64_t lock_number;
- int32_t lk_call_count;
- int32_t lk_expected_count;
- int32_t lk_attempted_count;
-
- int32_t lock_op_ret;
- int32_t lock_op_errno;
- afr_lock_cbk_t lock_cbk;
- char *domain; /* Domain on which inode/entry lock/unlock in progress.*/
+ int32_t lock_op_ret;
+ int32_t lock_op_errno;
+ char *domain; /* Domain on which inode/entry lock/unlock in progress.*/
+ int32_t lock_count;
+ char lower_locked;
+ char higher_locked;
} afr_internal_lock_t;
struct afr_reply {
- int valid;
- int32_t op_ret;
- int32_t op_errno;
- dict_t *xattr;/*For xattrop*/
- dict_t *xdata;
- struct iatt poststat;
- struct iatt postparent;
- struct iatt prestat;
- struct iatt preparent;
- struct iatt preparent2;
- struct iatt postparent2;
- /* For rchecksum */
- uint8_t checksum[MD5_DIGEST_LENGTH];
- gf_boolean_t buf_has_zeroes;
- /* For lookup */
- int8_t need_heal;
+ int valid;
+ int32_t op_ret;
+ dict_t *xattr; /*For xattrop*/
+ dict_t *xdata;
+ struct iatt poststat;
+ struct iatt postparent;
+ struct iatt prestat;
+ struct iatt preparent;
+ struct iatt preparent2;
+ struct iatt postparent2;
+ int32_t op_errno;
+ /* For rchecksum */
+ uint8_t checksum[SHA256_DIGEST_LENGTH];
+ gf_boolean_t buf_has_zeroes;
+ gf_boolean_t fips_mode_rchecksum;
+ /* For lookup */
+ int8_t need_heal;
};
typedef enum {
- AFR_FD_NOT_OPENED,
- AFR_FD_OPENED,
- AFR_FD_OPENING
+ AFR_FD_NOT_OPENED,
+ AFR_FD_OPENED,
+ AFR_FD_OPENING
} afr_fd_open_status_t;
typedef struct {
- unsigned int *pre_op_done[AFR_NUM_CHANGE_LOGS];
- int inherited[AFR_NUM_CHANGE_LOGS];
- int on_disk[AFR_NUM_CHANGE_LOGS];
- afr_fd_open_status_t *opened_on; /* which subvolumes the fd is open on */
-
- unsigned int *lock_piggyback;
- unsigned int *lock_acquired;
-
- int flags;
-
- /* used for delayed-post-op optimization */
- pthread_mutex_t delay_lock;
- gf_timer_t *delay_timer;
- call_frame_t *delay_frame;
-
- /* set if any write on this fd was a non stable write
- (i.e, without O_SYNC or O_DSYNC)
- */
- gf_boolean_t witnessed_unstable_write;
-
- /* @open_fd_count:
- Number of open FDs queried from the server, as queried through
- xdata in FOPs. Currently, used to decide if eager-locking must be
- temporarily disabled.
- */
- uint32_t open_fd_count;
-
-
- /* list of frames currently in progress */
- struct list_head eager_locked;
-
- /* the subvolume on which the latest sequence of readdirs (starting
- at offset 0) has begun. Till the next readdir request with 0 offset
- arrives, we continue to read off this subvol.
- */
- int readdir_subvol;
+ afr_fd_open_status_t *opened_on; /* which subvolumes the fd is open on */
+ int flags;
+
+ /* the subvolume on which the latest sequence of readdirs (starting
+ at offset 0) has begun. Till the next readdir request with 0 offset
+ arrives, we continue to read off this subvol.
+ */
+ int readdir_subvol;
+ /* lock-healing related members. */
+ gf_boolean_t is_fd_bad;
+ afr_lk_heal_info_t *lk_heal_info;
+
} afr_fd_ctx_t;
+typedef enum {
+ AFR_FOP_LOCK_PARALLEL,
+ AFR_FOP_LOCK_SERIAL,
+ AFR_FOP_LOCK_QUORUM_FAILED,
+} afr_fop_lock_state_t;
+
+typedef struct _afr_inode_lock_t {
+ /* @num_inodelks:
+ Number of inodelks queried from the server, as queried through
+ xdata in FOPs. Currently, used to decide if eager-locking must be
+ temporarily disabled.
+ */
+ int32_t num_inodelks;
+ unsigned int event_generation;
+ gf_timer_t *delay_timer;
+ struct list_head owners; /*Transactions that are performing fop*/
+ struct list_head post_op; /*Transactions that are done with the fop
+ *So can not conflict with the fops*/
+ struct list_head waiting; /*Transaction that are waiting for
+ *conflicting transactions to complete*/
+ struct list_head frozen; /*Transactions that need to go as part of
+ * next batch of eager-lock*/
+ gf_boolean_t release;
+ gf_boolean_t acquired;
+} afr_lock_t;
+
+typedef struct _afr_inode_ctx {
+ uint64_t read_subvol;
+ uint64_t write_subvol;
+ int lock_count;
+ int spb_choice;
+ gf_timer_t *timer;
+ unsigned int *pre_op_done[AFR_NUM_CHANGE_LOGS];
+ int inherited[AFR_NUM_CHANGE_LOGS];
+ int on_disk[AFR_NUM_CHANGE_LOGS];
+ /*Only 2 types of transactions support eager-locks now. DATA/METADATA*/
+ afr_lock_t lock[2];
+
+ /* @open_fd_count:
+ Number of open FDs queried from the server, as queried through
+ xdata in FOPs. Currently, used to decide if eager-locking must be
+ temporarily disabled.
+ */
+ uint32_t open_fd_count;
+ gf_boolean_t need_refresh;
+
+ /* set if any write on this fd was a non stable write
+ (i.e, without O_SYNC or O_DSYNC)
+ */
+ gf_boolean_t witnessed_unstable_write;
+} afr_inode_ctx_t;
typedef struct _afr_local {
- glusterfs_fop_t op;
- unsigned int call_count;
+ glusterfs_fop_t op;
+ unsigned int call_count;
- /* @event_generation: copy of priv->event_generation taken at the
- time of starting the transaction. The copy is made so that we
- have a stable value through the various phases of the transaction.
- */
- unsigned int event_generation;
+ /* @event_generation: copy of priv->event_generation taken at the
+ time of starting the transaction. The copy is made so that we
+ have a stable value through the various phases of the transaction.
+ */
+ unsigned int event_generation;
- uint32_t open_fd_count;
- gf_boolean_t update_open_fd_count;
+ uint32_t open_fd_count;
+ int32_t num_inodelks;
- gf_lkowner_t saved_lk_owner;
+ int32_t op_ret;
+ int32_t op_errno;
- int32_t op_ret;
- int32_t op_errno;
+ int dirty[AFR_NUM_CHANGE_LOGS];
- int32_t **pending;
+ int32_t **pending;
- int dirty[AFR_NUM_CHANGE_LOGS];
+ loc_t loc;
+ loc_t newloc;
- loc_t loc;
- loc_t newloc;
+ fd_t *fd;
+ afr_fd_ctx_t *fd_ctx;
- fd_t *fd;
- afr_fd_ctx_t *fd_ctx;
+ /* @child_up: copy of priv->child_up taken at the time of transaction
+ start. The copy is taken so that we have a stable child_up array
+ through the phases of the transaction as priv->child_up[i] can keep
+ changing through time.
+ */
+ unsigned char *child_up;
- /* @child_up: copy of priv->child_up taken at the time of transaction
- start. The copy is taken so that we have a stable child_up array
- through the phases of the transaction as priv->child_up[i] can keep
- changing through time.
- */
- unsigned char *child_up;
+ /* @read_attempted:
+ array of flags representing subvolumes where read operations of
+ the read transaction have already been attempted. The array is
+ first pre-filled with down subvolumes, and as reads are performed
+ on other subvolumes, those are set as well. This way if the read
+ operation fails we do not retry on that subvolume again.
+ */
+ unsigned char *read_attempted;
- /* @read_attempted:
- array of flags representing subvolumes where read operations of
- the read transaction have already been attempted. The array is
- first pre-filled with down subvolumes, and as reads are performed
- on other subvolumes, those are set as well. This way if the read
- operation fails we do not retry on that subvolume again.
- */
- unsigned char *read_attempted;
+ /* @readfn:
- /* @readfn:
+ pointer to function which will perform the read operation on a given
+ subvolume. Used in read transactions.
+ */
- pointer to function which will perform the read operation on a given
- subvolume. Used in read transactions.
- */
+ afr_read_txn_wind_t readfn;
- afr_read_txn_wind_t readfn;
+ /* @inode:
- /* @refreshed:
+ the inode on which the read txn is performed on. ref'ed and copied
+ from either fd->inode or loc.inode
+ */
- the inode was "refreshed" (i.e, pending xattrs from all subvols
- freshly inspected and inode ctx updated accordingly) as part of
- this transaction already.
- */
- gf_boolean_t refreshed;
+ inode_t *inode;
- /* @inode:
+ /* @parent[2]:
- the inode on which the read txn is performed on. ref'ed and copied
- from either fd->inode or loc.inode
- */
+ parent inode[s] on which directory transactions are performed.
+ */
- inode_t *inode;
+ inode_t *parent;
+ inode_t *parent2;
- /* @parent[2]:
+ /* @readable:
- parent inode[s] on which directory transactions are performed.
- */
+ array of flags representing servers from which a read can be
+ performed. This is the output of afr_inode_refresh()
+ */
+ unsigned char *readable;
+ unsigned char *readable2; /*For rename transaction*/
- inode_t *parent;
- inode_t *parent2;
+ afr_inode_refresh_cbk_t refreshfn;
- /* @readable:
+ /* @refreshinode:
- array of flags representing servers from which a read can be
- performed. This is the output of afr_inode_refresh()
- */
- unsigned char *readable;
+ Inode currently getting refreshed.
+ */
+ inode_t *refreshinode;
- afr_inode_refresh_cbk_t refreshfn;
+ dict_t *xattr_req;
- /* @refreshinode:
+ dict_t *dict;
- Inode currently getting refreshed.
- */
- inode_t *refreshinode;
+ int read_subvol; /* Current read subvolume */
- /*
- @pre_op_compat:
+ int optimistic_change_log;
- compatibility mode of pre-op. send a separate pre-op and
- op operations as part of transaction, rather than combining
- */
+ afr_internal_lock_t internal_lock;
- gf_boolean_t pre_op_compat;
+ /*To handle setattr/setxattr on yet to be linked inode from dht*/
+ uuid_t refreshgfid;
- dict_t *xattr_req;
+ /* @refreshed:
- afr_internal_lock_t internal_lock;
+ the inode was "refreshed" (i.e, pending xattrs from all subvols
+ freshly inspected and inode ctx updated accordingly) as part of
+ this transaction already.
+ */
+ gf_boolean_t refreshed;
- dict_t *dict;
+ gf_boolean_t update_num_inodelks;
+ gf_boolean_t update_open_fd_count;
- int optimistic_change_log;
- gf_boolean_t delayed_post_op;
+ /*
+ @pre_op_compat:
- /* Is the current writev() going to perform a stable write?
- i.e, is fd->flags or @flags writev param have O_SYNC or
- O_DSYNC?
- */
- gf_boolean_t stable_write;
+ compatibility mode of pre-op. send a separate pre-op and
+ op operations as part of transaction, rather than combining
+ */
- /* This write appended to the file. Nnot necessarily O_APPEND,
- just means the offset of write was at the end of file.
- */
- gf_boolean_t append_write;
+ gf_boolean_t pre_op_compat;
- /*
- This struct contains the arguments for the "continuation"
- (scheme-like) of fops
- */
+ /* Is the current writev() going to perform a stable write?
+ i.e, is fd->flags or @flags writev param have O_SYNC or
+ O_DSYNC?
+ */
+ gf_boolean_t stable_write;
+ /* This write appended to the file. Nnot necessarily O_APPEND,
+ just means the offset of write was at the end of file.
+ */
+ gf_boolean_t append_write;
+
+ /*
+ This struct contains the arguments for the "continuation"
+ (scheme-like) of fops
+ */
+
+ struct {
struct {
- struct {
- gf_boolean_t needs_fresh_lookup;
- uuid_t gfid_req;
- } lookup;
-
- struct {
- unsigned char buf_set;
- struct statvfs buf;
- } statfs;
-
- struct {
- int32_t flags;
- } open;
-
- struct {
- int32_t cmd;
- struct gf_flock user_flock;
- struct gf_flock ret_flock;
- unsigned char *locked_nodes;
- } lk;
-
- /* inode read */
-
- struct {
- int32_t mask;
- int last_index; /* index of the child we tried previously */
- } access;
-
- struct {
- int last_index;
- } stat;
-
- struct {
- int last_index;
- } fstat;
-
- struct {
- size_t size;
- int last_index;
- } readlink;
-
- struct {
- char *name;
- int last_index;
- long xattr_len;
- } getxattr;
-
- struct {
- size_t size;
- off_t offset;
- int last_index;
- uint32_t flags;
- } readv;
-
- /* dir read */
-
- struct {
- int success_count;
- int32_t op_ret;
- int32_t op_errno;
-
- uint32_t *checksum;
- } opendir;
-
- struct {
- int32_t op_ret;
- int32_t op_errno;
- size_t size;
- off_t offset;
- dict_t *dict;
- gf_boolean_t failed;
- int last_index;
- } readdir;
- /* inode write */
-
- struct {
- struct iatt prebuf;
- struct iatt postbuf;
- } inode_wfop; //common structure for all inode-write-fops
-
- struct {
- int32_t op_ret;
-
- struct iovec *vector;
- struct iobref *iobref;
- int32_t count;
- off_t offset;
- uint32_t flags;
- } writev;
-
- struct {
- off_t offset;
- } truncate;
-
- struct {
- off_t offset;
- } ftruncate;
-
- struct {
- struct iatt in_buf;
- int32_t valid;
- } setattr;
-
- struct {
- struct iatt in_buf;
- int32_t valid;
- } fsetattr;
-
- struct {
- dict_t *dict;
- int32_t flags;
- } setxattr;
-
- struct {
- dict_t *dict;
- int32_t flags;
- } fsetxattr;
-
- struct {
- char *name;
- } removexattr;
-
- struct {
- dict_t *xattr;
- gf_xattrop_flags_t optype;
- } xattrop;
-
- /* dir write */
-
- struct {
- inode_t *inode;
- struct iatt buf;
- struct iatt preparent;
- struct iatt postparent;
- struct iatt prenewparent;
- struct iatt postnewparent;
- } dir_fop; //common structure for all dir fops
-
- struct {
- fd_t *fd;
- dict_t *params;
- int32_t flags;
- mode_t mode;
- } create;
-
- struct {
- dev_t dev;
- mode_t mode;
- dict_t *params;
- } mknod;
-
- struct {
- int32_t mode;
- dict_t *params;
- } mkdir;
-
- struct {
- int flags;
- } rmdir;
-
- struct {
- dict_t *params;
- char *linkpath;
- } symlink;
-
- struct {
- int32_t mode;
- off_t offset;
- size_t len;
- } fallocate;
-
- struct {
- off_t offset;
- size_t len;
- } discard;
-
- struct {
- off_t offset;
- off_t len;
- struct iatt prebuf;
- struct iatt postbuf;
- } zerofill;
-
- struct {
- char *volume;
- int32_t cmd;
- struct gf_flock flock;
- } inodelk;
-
- struct {
- off_t offset;
- gf_seek_what_t what;
- } seek;
-
- } cont;
+ struct statvfs buf;
+ unsigned char buf_set;
+ } statfs;
struct {
- off_t start, len;
+ fd_t *fd;
+ int32_t flags;
+ } open;
- gf_boolean_t eager_lock_on;
- int *eager_lock;
+ struct {
+ struct gf_flock user_flock;
+ struct gf_flock ret_flock;
+ unsigned char *locked_nodes;
+ int32_t cmd;
+ /*For lock healing only.*/
+ unsigned char *dom_locked_nodes;
+ int32_t *dom_lock_op_ret;
+ int32_t *dom_lock_op_errno;
+ struct gf_flock *getlk_rsp;
+ } lk;
+
+ /* inode read */
- char *basename;
- char *new_basename;
+ struct {
+ int32_t mask;
+ int last_index; /* index of the child we tried previously */
+ } access;
- loc_t parent_loc;
- loc_t new_parent_loc;
+ struct {
+ int last_index;
+ } stat;
- afr_transaction_type type;
+ struct {
+ int last_index;
+ } fstat;
- /* stub to resume on destruction
- of the transaction frame */
- call_stub_t *resume_stub;
+ struct {
+ size_t size;
+ int last_index;
+ } readlink;
- struct list_head eager_locked;
+ struct {
+ char *name;
+ long xattr_len;
+ int last_index;
+ } getxattr;
- unsigned char *pre_op;
+ struct {
+ size_t size;
+ off_t offset;
+ int last_index;
+ uint32_t flags;
+ } readv;
- /* For arbiter configuration only. */
- dict_t **pre_op_xdata;
- unsigned char *pre_op_sources;
+ /* dir read */
- /* @failed_subvols: subvolumes on which a pre-op or a
- FOP failed. */
- unsigned char *failed_subvols;
+ struct {
+ uint32_t *checksum;
+ int success_count;
+ int32_t op_ret;
+ int32_t op_errno;
+ } opendir;
- /* @dirtied: flag which indicates whether we set dirty flag
- in the OP. Typically true when we are performing operation
- on more than one subvol and optimistic changelog is disabled
+ struct {
+ int32_t op_ret;
+ int32_t op_errno;
+ size_t size;
+ off_t offset;
+ dict_t *dict;
+ int last_index;
+ gf_boolean_t failed;
+ } readdir;
+ /* inode write */
- A 'true' value set in @dirtied flag means an 'undirtying'
- has to be done in POST-OP phase.
- */
- gf_boolean_t dirtied;
+ struct {
+ struct iatt prebuf;
+ struct iatt postbuf;
+ } inode_wfop; // common structure for all inode-write-fops
- /* @inherited: flag which indicates that the dirty flags
- of the previous transaction were inherited
- */
- gf_boolean_t inherited;
+ struct {
+ struct iovec *vector;
+ struct iobref *iobref;
+ off_t offset;
+ int32_t op_ret;
+ int32_t count;
+ uint32_t flags;
+ } writev;
- /*
- @no_uninherit: flag which indicates that a pre_op_uninherit()
- must _not_ be attempted (and returned as failure) always. This
- flag is set when a hard pre-op is performed, but not accounted
- for it in fd_ctx->on_disk[]. Such transactions are "isolated"
- from the pre-op piggybacking entirely and therefore uninherit
- must not be attempted.
- */
- gf_boolean_t no_uninherit;
+ struct {
+ off_t offset;
+ } truncate;
- /* @uninherit_done:
- @uninherit_value:
+ struct {
+ off_t offset;
+ } ftruncate;
- The above pair variables make pre_op_uninherit() idempotent.
- Both are FALSE initially. The first call to pre_op_uninherit
- sets @uninherit_done to TRUE and the return value to
- @uninherit_value. Further calls will check for @uninherit_done
- to be TRUE and if so will simply return @uninherit_value.
- */
- gf_boolean_t uninherit_done;
- gf_boolean_t uninherit_value;
+ struct {
+ struct iatt in_buf;
+ int32_t valid;
+ } setattr;
- /* @changelog_resume: function to be called after changlogging
- (either pre-op or post-op) is done
- */
+ struct {
+ struct iatt in_buf;
+ int32_t valid;
+ } fsetattr;
- afr_changelog_resume_t changelog_resume;
+ struct {
+ dict_t *dict;
+ int32_t flags;
+ } setxattr;
- call_frame_t *main_frame;
+ struct {
+ dict_t *dict;
+ int32_t flags;
+ } fsetxattr;
- int (*wind) (call_frame_t *frame, xlator_t *this, int subvol);
+ struct {
+ char *name;
+ } removexattr;
- int (*fop) (call_frame_t *frame, xlator_t *this);
+ struct {
+ dict_t *xattr;
+ gf_xattrop_flags_t optype;
+ } xattrop;
- int (*done) (call_frame_t *frame, xlator_t *this);
+ /* dir write */
- int (*resume) (call_frame_t *frame, xlator_t *this);
+ struct {
+ inode_t *inode;
+ struct iatt buf;
+ struct iatt preparent;
+ struct iatt postparent;
+ struct iatt prenewparent;
+ struct iatt postnewparent;
+ } dir_fop; // common structure for all dir fops
- int (*unwind) (call_frame_t *frame, xlator_t *this);
+ struct {
+ fd_t *fd;
+ dict_t *params;
+ int32_t flags;
+ mode_t mode;
+ } create;
- /* post-op hook */
- } transaction;
+ struct {
+ dict_t *params;
+ dev_t dev;
+ mode_t mode;
+ } mknod;
- syncbarrier_t barrier;
+ struct {
+ dict_t *params;
+ int32_t mode;
+ } mkdir;
- /* extra data for fops */
- dict_t *xdata_req;
- dict_t *xdata_rsp;
+ struct {
+ dict_t *params;
+ char *linkpath;
+ } symlink;
- dict_t *xattr_rsp; /*for [f]xattrop*/
+ struct {
+ off_t offset;
+ size_t len;
+ int32_t mode;
+ } fallocate;
- mode_t umask;
- int xflag;
- gf_boolean_t do_discovery;
- struct afr_reply *replies;
-} afr_local_t;
+ struct {
+ off_t offset;
+ size_t len;
+ } discard;
+ struct {
+ off_t offset;
+ off_t len;
+ struct iatt prebuf;
+ struct iatt postbuf;
+ } zerofill;
-typedef struct _afr_inode_ctx {
- uint64_t read_subvol;
- int spb_choice;
- gf_timer_t *timer;
-} afr_inode_ctx_t;
+ struct {
+ char *volume;
+ int32_t cmd;
+ int32_t in_cmd;
+ struct gf_flock in_flock;
+ struct gf_flock flock;
+ void *xdata;
+ } inodelk;
+
+ struct {
+ char *volume;
+ char *basename;
+ void *xdata;
+ entrylk_cmd in_cmd;
+ entrylk_cmd cmd;
+ entrylk_type type;
+ } entrylk;
+
+ struct {
+ off_t offset;
+ gf_seek_what_t what;
+ } seek;
+
+ struct {
+ struct gf_lease user_lease;
+ struct gf_lease ret_lease;
+ unsigned char *locked_nodes;
+ } lease;
+
+ struct {
+ int flags;
+ } rmdir;
+
+ struct {
+ int32_t datasync;
+ } fsync;
+
+ struct {
+ uuid_t gfid_req;
+ gf_boolean_t needs_fresh_lookup;
+ } lookup;
+
+ } cont;
+
+ struct {
+ char *basename;
+ char *new_basename;
+
+ loc_t parent_loc;
+ loc_t new_parent_loc;
+
+ /* stub to resume on destruction
+ of the transaction frame */
+ call_stub_t *resume_stub;
+
+ struct list_head owner_list;
+ struct list_head wait_list;
+
+ unsigned char *pre_op;
+
+ /* Changelog xattr dict for [f]xattrop*/
+ dict_t **changelog_xdata;
+ unsigned char *pre_op_sources;
+
+ /* @failed_subvols: subvolumes on which a pre-op or a
+ FOP failed. */
+ unsigned char *failed_subvols;
+
+ call_frame_t *main_frame; /*Fop frame*/
+ call_frame_t *frame; /*Transaction frame*/
+
+ int (*wind)(call_frame_t *frame, xlator_t *this, int subvol);
+
+ int (*unwind)(call_frame_t *frame, xlator_t *this);
+
+ off_t start, len;
+
+ afr_transaction_type type;
+
+ int32_t in_flight_sb_errno; /* This is where the cause of the
+ failure on the last good copy of
+ the file is stored.
+ */
+
+ /* @changelog_resume: function to be called after changlogging
+ (either pre-op or post-op) is done
+ */
+ afr_changelog_resume_t changelog_resume;
+
+ gf_boolean_t eager_lock_on;
+ gf_boolean_t do_eager_unlock;
+
+ /* @dirtied: flag which indicates whether we set dirty flag
+ in the OP. Typically true when we are performing operation
+ on more than one subvol and optimistic changelog is disabled
+
+ A 'true' value set in @dirtied flag means an 'undirtying'
+ has to be done in POST-OP phase.
+ */
+ gf_boolean_t dirtied;
+
+ /* @inherited: flag which indicates that the dirty flags
+ of the previous transaction were inherited
+ */
+ gf_boolean_t inherited;
+
+ /*
+ @no_uninherit: flag which indicates that a pre_op_uninherit()
+ must _not_ be attempted (and returned as failure) always. This
+ flag is set when a hard pre-op is performed, but not accounted
+ for it in fd_ctx->on_disk[]. Such transactions are "isolated"
+ from the pre-op piggybacking entirely and therefore uninherit
+ must not be attempted.
+ */
+ gf_boolean_t no_uninherit;
+
+ gf_boolean_t in_flight_sb; /* Indicator for occurrence of
+ split-brain while in the middle of
+ a txn. */
+
+ /* @uninherit_done:
+ @uninherit_value:
+
+ The above pair variables make pre_op_uninherit() idempotent.
+ Both are FALSE initially. The first call to pre_op_uninherit
+ sets @uninherit_done to TRUE and the return value to
+ @uninherit_value. Further calls will check for @uninherit_done
+ to be TRUE and if so will simply return @uninherit_value.
+ */
+ gf_boolean_t uninherit_done;
+ gf_boolean_t uninherit_value;
+
+ gf_boolean_t disable_delayed_post_op;
+ } transaction;
+
+ syncbarrier_t barrier;
+
+ /* extra data for fops */
+ dict_t *xdata_req;
+ dict_t *xdata_rsp;
+
+ dict_t *xattr_rsp; /*for [f]xattrop*/
+
+ mode_t umask;
+ int xflag;
+ struct afr_reply *replies;
+
+ /* For client side background heals. */
+ struct list_head healer;
+ call_frame_t *heal_frame;
+
+ afr_inode_ctx_t *inode_ctx;
+
+ /*For thin-arbiter transactions.*/
+ int ta_failed_subvol;
+ int ta_event_gen;
+ struct list_head ta_waitq;
+ struct list_head ta_onwireq;
+ afr_ta_fop_state_t fop_state;
+ afr_fop_lock_state_t fop_lock_state;
+ gf_lkowner_t saved_lk_owner;
+ unsigned char read_txn_query_child;
+ unsigned char ta_child_up;
+ gf_boolean_t do_discovery;
+ gf_boolean_t need_full_crawl;
+ gf_boolean_t is_read_txn;
+ gf_boolean_t is_new_entry;
+} afr_local_t;
typedef struct afr_spbc_timeout {
- call_frame_t *frame;
- gf_boolean_t d_spb;
- gf_boolean_t m_spb;
- loc_t *loc;
- int spb_child_index;
+ call_frame_t *frame;
+ loc_t *loc;
+ int spb_child_index;
+ gf_boolean_t d_spb;
+ gf_boolean_t m_spb;
} afr_spbc_timeout_t;
typedef struct afr_spb_status {
- call_frame_t *frame;
- loc_t *loc;
+ call_frame_t *frame;
+ loc_t *loc;
} afr_spb_status_t;
-typedef struct afr_replace_brick_args {
- call_frame_t *frame;
- loc_t loc;
- int rb_index;
-} afr_replace_brick_args_t;
+typedef struct afr_empty_brick_args {
+ call_frame_t *frame;
+ char *op_type;
+ loc_t loc;
+ int empty_index;
+} afr_empty_brick_args_t;
typedef struct afr_read_subvol_args {
- ia_type_t ia_type;
- uuid_t gfid;
+ ia_type_t ia_type;
+ uuid_t gfid;
} afr_read_subvol_args_t;
-/* did a call fail due to a child failing? */
-#define child_went_down(op_ret, op_errno) (((op_ret) < 0) && \
- ((op_errno == ENOTCONN) || \
- (op_errno == EBADFD)))
+typedef struct afr_granular_esh_args {
+ fd_t *heal_fd;
+ xlator_t *xl;
+ call_frame_t *frame;
+ gf_boolean_t mismatch; /* flag to represent occurrence of type/gfid
+ mismatch */
+} afr_granular_esh_args_t;
int
-afr_inode_get_readable (call_frame_t *frame, inode_t *inode, xlator_t *this,
- unsigned char *readable, int *event_p, int type);
+afr_inode_get_readable(call_frame_t *frame, inode_t *inode, xlator_t *this,
+ unsigned char *readable, int *event_p, int type);
int
-afr_inode_read_subvol_get (inode_t *inode, xlator_t *this,
- unsigned char *data_subvols,
- unsigned char *metadata_subvols,
- int *event_generation);
+afr_inode_read_subvol_get(inode_t *inode, xlator_t *this,
+ unsigned char *data_subvols,
+ unsigned char *metadata_subvols,
+ int *event_generation);
int
-__afr_inode_read_subvol_get (inode_t *inode, xlator_t *this,
- unsigned char *data_subvols,
- unsigned char *metadata_subvols,
- int *event_generation);
+__afr_inode_read_subvol_get(inode_t *inode, xlator_t *this,
+ unsigned char *data_subvols,
+ unsigned char *metadata_subvols,
+ int *event_generation);
int
-__afr_inode_read_subvol_set (inode_t *inode, xlator_t *this,
- unsigned char *data_subvols,
- unsigned char *metadata_subvol,
- int event_generation);
+__afr_inode_read_subvol_set(inode_t *inode, xlator_t *this,
+ unsigned char *data_subvols,
+ unsigned char *metadata_subvol,
+ int event_generation);
+int
+afr_inode_read_subvol_set(inode_t *inode, xlator_t *this,
+ unsigned char *data_subvols,
+ unsigned char *metadata_subvols,
+ int event_generation);
+
int
-afr_inode_read_subvol_set (inode_t *inode, xlator_t *this,
- unsigned char *data_subvols,
- unsigned char *metadata_subvols,
- int event_generation);
+__afr_inode_need_refresh_set(inode_t *inode, xlator_t *this);
int
-afr_inode_read_subvol_reset (inode_t *inode, xlator_t *this);
+afr_inode_need_refresh_set(inode_t *inode, xlator_t *this);
int
-afr_read_subvol_select_by_policy (inode_t *inode, xlator_t *this,
- unsigned char *readable,
- afr_read_subvol_args_t *args);
+afr_read_subvol_select_by_policy(inode_t *inode, xlator_t *this,
+ unsigned char *readable,
+ afr_read_subvol_args_t *args);
int
-afr_inode_read_subvol_type_get (inode_t *inode, xlator_t *this,
- unsigned char *readable, int *event_p,
- int type);
+afr_inode_read_subvol_type_get(inode_t *inode, xlator_t *this,
+ unsigned char *readable, int *event_p, int type);
int
-afr_read_subvol_get (inode_t *inode, xlator_t *this, int *subvol_p,
- int *event_p, afr_transaction_type type,
- afr_read_subvol_args_t *args);
+afr_read_subvol_get(inode_t *inode, xlator_t *this, int *subvol_p,
+ unsigned char *readables, int *event_p,
+ afr_transaction_type type, afr_read_subvol_args_t *args);
-#define afr_data_subvol_get(i, t, s, e, a) \
- afr_read_subvol_get(i, t, s, e, AFR_DATA_TRANSACTION, a)
+#define afr_data_subvol_get(i, t, s, r, e, a) \
+ afr_read_subvol_get(i, t, s, r, e, AFR_DATA_TRANSACTION, a)
-#define afr_metadata_subvol_get(i, t, s, e, a) \
- afr_read_subvol_get(i, t, s, e, AFR_METADATA_TRANSACTION, a)
+#define afr_metadata_subvol_get(i, t, s, r, e, a) \
+ afr_read_subvol_get(i, t, s, r, e, AFR_METADATA_TRANSACTION, a)
int
-afr_inode_refresh (call_frame_t *frame, xlator_t *this, inode_t *inode,
- afr_inode_refresh_cbk_t cbk);
+afr_inode_refresh(call_frame_t *frame, xlator_t *this, inode_t *inode,
+ uuid_t gfid, afr_inode_refresh_cbk_t cbk);
int32_t
-afr_notify (xlator_t *this, int32_t event, void *data, void *data2);
+afr_notify(xlator_t *this, int32_t event, void *data, void *data2);
int
-xattr_is_equal (dict_t *this, char *key1, data_t *value1, void *data);
+xattr_is_equal(dict_t *this, char *key1, data_t *value1, void *data);
int
-afr_init_entry_lockee (afr_entry_lockee_t *lockee, afr_local_t *local,
- loc_t *loc, char *basename, int child_count);
+afr_add_entry_lockee(afr_local_t *local, loc_t *loc, char *basename,
+ int child_count);
+
+int
+afr_add_inode_lockee(afr_local_t *local, int child_count);
void
-afr_entry_lockee_cleanup (afr_internal_lock_t *int_lock);
+afr_lockees_cleanup(afr_internal_lock_t *int_lock);
int
-afr_attempt_lock_recovery (xlator_t *this, int32_t child_index);
+afr_attempt_lock_recovery(xlator_t *this, int32_t child_index);
int
-afr_mark_locked_nodes (xlator_t *this, fd_t *fd,
- unsigned char *locked_nodes);
+afr_mark_locked_nodes(xlator_t *this, fd_t *fd, unsigned char *locked_nodes);
void
-afr_set_lk_owner (call_frame_t *frame, xlator_t *this, void *lk_owner);
+afr_set_lk_owner(call_frame_t *frame, xlator_t *this, void *lk_owner);
int
-afr_set_lock_number (call_frame_t *frame, xlator_t *this);
+afr_set_lock_number(call_frame_t *frame, xlator_t *this);
int32_t
-afr_unlock (call_frame_t *frame, xlator_t *this);
-
-int
-afr_nonblocking_entrylk (call_frame_t *frame, xlator_t *this);
-
-int
-afr_nonblocking_inodelk (call_frame_t *frame, xlator_t *this);
+afr_unlock(call_frame_t *frame, xlator_t *this);
int
-afr_blocking_lock (call_frame_t *frame, xlator_t *this);
+afr_lock_nonblocking(call_frame_t *frame, xlator_t *this);
int
-afr_internal_lock_finish (call_frame_t *frame, xlator_t *this);
+afr_blocking_lock(call_frame_t *frame, xlator_t *this);
int
-afr_lk_transfer_datalock (call_frame_t *dst, call_frame_t *src, char *dom,
- unsigned int child_count);
+afr_internal_lock_finish(call_frame_t *frame, xlator_t *this);
int
-__afr_fd_ctx_set (xlator_t *this, fd_t *fd);
-
-int
-afr_fd_ctx_set (xlator_t *this, fd_t *fd);
+__afr_fd_ctx_set(xlator_t *this, fd_t *fd);
afr_fd_ctx_t *
-afr_fd_ctx_get (fd_t *fd, xlator_t *this);
+afr_fd_ctx_get(fd_t *fd, xlator_t *this);
int
-afr_build_parent_loc (loc_t *parent, loc_t *child, int32_t *op_errno);
+afr_build_parent_loc(loc_t *parent, loc_t *child, int32_t *op_errno);
int
-afr_locked_nodes_count (unsigned char *locked_nodes, int child_count);
+afr_locked_nodes_count(unsigned char *locked_nodes, int child_count);
int
-afr_replies_interpret (call_frame_t *frame, xlator_t *this, inode_t *inode);
+afr_replies_interpret(call_frame_t *frame, xlator_t *this, inode_t *inode,
+ gf_boolean_t *start_heal);
void
-afr_local_replies_wipe (afr_local_t *local, afr_private_t *priv);
+afr_local_replies_wipe(afr_local_t *local, afr_private_t *priv);
void
-afr_local_cleanup (afr_local_t *local, xlator_t *this);
+afr_local_cleanup(afr_local_t *local, xlator_t *this);
int
-afr_frame_return (call_frame_t *frame);
+afr_frame_return(call_frame_t *frame);
int
-afr_open (call_frame_t *frame, xlator_t *this, loc_t *loc, int32_t flags,
- fd_t *fd, dict_t *xdata);
+afr_open(call_frame_t *frame, xlator_t *this, loc_t *loc, int32_t flags,
+ fd_t *fd, dict_t *xdata);
void
-afr_local_transaction_cleanup (afr_local_t *local, xlator_t *this);
+afr_local_transaction_cleanup(afr_local_t *local, xlator_t *this);
int
-afr_cleanup_fd_ctx (xlator_t *this, fd_t *fd);
-
-#define AFR_STACK_UNWIND(fop, frame, params ...) \
- do { \
- afr_local_t *__local = NULL; \
- xlator_t *__this = NULL; \
- if (frame) { \
- __local = frame->local; \
- __this = frame->this; \
- frame->local = NULL; \
- } \
- STACK_UNWIND_STRICT (fop, frame, params); \
- if (__local) { \
- afr_local_cleanup (__local, __this); \
- mem_put (__local); \
- } \
- } while (0)
-
-#define AFR_STACK_DESTROY(frame) \
- do { \
- afr_local_t *__local = NULL; \
- xlator_t *__this = NULL; \
- __local = frame->local; \
- __this = frame->this; \
- frame->local = NULL; \
- STACK_DESTROY (frame->root); \
- if (__local) { \
- afr_local_cleanup (__local, __this); \
- mem_put (__local); \
- } \
- } while (0);
-
-#define AFR_FRAME_INIT(frame, op_errno) \
- ({frame->local = mem_get0 (THIS->local_pool); \
- if (afr_local_init (frame->local, THIS->private, &op_errno)) { \
- afr_local_cleanup (frame->local, THIS); \
- mem_put (frame->local); \
- frame->local = NULL; }; \
- frame->local;})
-
-#define AFR_STACK_RESET(frame) \
- do { \
- afr_local_t *__local = NULL; \
- xlator_t *__this = NULL; \
- __local = frame->local; \
- __this = frame->this; \
- frame->local = NULL; \
- int __opr; \
- STACK_RESET (frame->root); \
- if (__local) { \
- afr_local_cleanup (__local, __this); \
- mem_put (__local); \
- } \
- AFR_FRAME_INIT (frame, __opr); \
- } while (0)
+afr_cleanup_fd_ctx(xlator_t *this, fd_t *fd);
+
+#define AFR_STACK_UNWIND(fop, frame, op_ret, op_errno, params...) \
+ do { \
+ afr_local_t *__local = NULL; \
+ xlator_t *__this = NULL; \
+ int32_t __op_ret = 0; \
+ int32_t __op_errno = 0; \
+ \
+ __op_ret = op_ret; \
+ __op_errno = op_errno; \
+ if (frame) { \
+ __local = frame->local; \
+ __this = frame->this; \
+ afr_handle_inconsistent_fop(frame, &__op_ret, &__op_errno); \
+ if (__local && __local->is_read_txn) \
+ afr_pending_read_decrement(__this->private, \
+ __local->read_subvol); \
+ if (__local && __local->xdata_req && \
+ afr_is_lock_mode_mandatory(__local->xdata_req)) \
+ afr_dom_lock_release(frame); \
+ frame->local = NULL; \
+ } \
+ \
+ STACK_UNWIND_STRICT(fop, frame, __op_ret, __op_errno, params); \
+ if (__local) { \
+ afr_local_cleanup(__local, __this); \
+ mem_put(__local); \
+ } \
+ } while (0)
+
+#define AFR_STACK_DESTROY(frame) \
+ do { \
+ afr_local_t *__local = NULL; \
+ xlator_t *__this = NULL; \
+ __local = frame->local; \
+ __this = frame->this; \
+ frame->local = NULL; \
+ STACK_DESTROY(frame->root); \
+ if (__local) { \
+ afr_local_cleanup(__local, __this); \
+ mem_put(__local); \
+ } \
+ } while (0);
+
+#define AFR_FRAME_INIT(frame, op_errno) \
+ ({ \
+ frame->local = mem_get0(THIS->local_pool); \
+ if (afr_local_init(frame->local, frame->this->private, &op_errno)) { \
+ afr_local_cleanup(frame->local, frame->this); \
+ mem_put(frame->local); \
+ frame->local = NULL; \
+ }; \
+ frame->local; \
+ })
+
+#define AFR_STACK_RESET(frame) \
+ do { \
+ afr_local_t *__local = NULL; \
+ xlator_t *__this = NULL; \
+ __local = frame->local; \
+ __this = frame->this; \
+ frame->local = NULL; \
+ int __opr; \
+ STACK_RESET(frame->root); \
+ if (__local) { \
+ afr_local_cleanup(__local, __this); \
+ mem_put(__local); \
+ } \
+ AFR_FRAME_INIT(frame, __opr); \
+ } while (0)
/* allocate and return a string that is the basename of argument */
static inline char *
-AFR_BASENAME (const char *str)
+AFR_BASENAME(const char *str)
{
- char *__tmp_str = NULL;
- char *__basename_str = NULL;
- __tmp_str = gf_strdup (str);
- __basename_str = gf_strdup (basename (__tmp_str));
- GF_FREE (__tmp_str);
- return __basename_str;
+ char *__tmp_str = NULL;
+ char *__basename_str = NULL;
+ __tmp_str = gf_strdup(str);
+ __basename_str = gf_strdup(basename(__tmp_str));
+ GF_FREE(__tmp_str);
+ return __basename_str;
}
call_frame_t *
-afr_copy_frame (call_frame_t *base);
+afr_copy_frame(call_frame_t *base);
int
-afr_transaction_local_init (afr_local_t *local, xlator_t *this);
+afr_transaction_local_init(afr_local_t *local, xlator_t *this);
int32_t
-afr_marker_getxattr (call_frame_t *frame, xlator_t *this,
- loc_t *loc, const char *name,afr_local_t *local, afr_private_t *priv );
+afr_marker_getxattr(call_frame_t *frame, xlator_t *this, loc_t *loc,
+ const char *name, afr_local_t *local, afr_private_t *priv);
int
-afr_local_init (afr_local_t *local, afr_private_t *priv, int32_t *op_errno);
+afr_local_init(afr_local_t *local, afr_private_t *priv, int32_t *op_errno);
int
-afr_internal_lock_init (afr_internal_lock_t *lk, size_t child_count,
- transaction_lk_type_t lk_type);
+afr_internal_lock_init(afr_internal_lock_t *lk, size_t child_count);
int
-afr_higher_errno (int32_t old_errno, int32_t new_errno);
+afr_higher_errno(int32_t old_errno, int32_t new_errno);
int
-afr_final_errno (afr_local_t *local, afr_private_t *priv);
+afr_final_errno(afr_local_t *local, afr_private_t *priv);
int
-afr_xattr_req_prepare (xlator_t *this, dict_t *xattr_req);
+afr_xattr_req_prepare(xlator_t *this, dict_t *xattr_req);
void
-afr_fix_open (fd_t *fd, xlator_t *this);
+afr_fix_open(fd_t *fd, xlator_t *this);
afr_fd_ctx_t *
-afr_fd_ctx_get (fd_t *fd, xlator_t *this);
+afr_fd_ctx_get(fd_t *fd, xlator_t *this);
void
-afr_set_low_priority (call_frame_t *frame);
+afr_set_low_priority(call_frame_t *frame);
int
-afr_child_fd_ctx_set (xlator_t *this, fd_t *fd, int32_t child,
- int flags);
+afr_child_fd_ctx_set(xlator_t *this, fd_t *fd, int32_t child, int flags);
void
-afr_matrix_cleanup (int32_t **pending, unsigned int m);
+afr_matrix_cleanup(int32_t **pending, unsigned int m);
-int32_t**
-afr_matrix_create (unsigned int m, unsigned int n);
+int32_t **
+afr_matrix_create(unsigned int m, unsigned int n);
-int**
-afr_mark_pending_changelog (afr_private_t *priv, unsigned char *pending,
- dict_t *xattr, ia_type_t iat);
+int **
+afr_mark_pending_changelog(afr_private_t *priv, unsigned char *pending,
+ dict_t *xattr, ia_type_t iat);
void
-afr_filter_xattrs (dict_t *xattr);
+afr_filter_xattrs(dict_t *xattr);
/*
* Special value indicating we should use the "auto" quorum method instead of
@@ -1036,69 +1245,179 @@ afr_filter_xattrs (dict_t *xattr);
#define AFR_QUORUM_AUTO INT_MAX
int
-afr_fd_report_unstable_write (xlator_t *this, fd_t *fd);
+afr_fd_report_unstable_write(xlator_t *this, afr_local_t *local);
+
+gf_boolean_t
+afr_fd_has_witnessed_unstable_write(xlator_t *this, inode_t *inode);
+
+void
+afr_reply_wipe(struct afr_reply *reply);
+
+void
+afr_replies_wipe(struct afr_reply *replies, int count);
+
+gf_boolean_t
+afr_xattrs_are_equal(dict_t *dict1, dict_t *dict2);
+
+gf_boolean_t
+afr_is_xattr_ignorable(char *key);
+
+int
+afr_get_heal_info(call_frame_t *frame, xlator_t *this, loc_t *loc);
+
+int
+afr_heal_splitbrain_file(call_frame_t *frame, xlator_t *this, loc_t *loc);
+
+int
+afr_get_split_brain_status(void *opaque);
+
+int
+afr_get_split_brain_status_cbk(int ret, call_frame_t *frame, void *opaque);
+
+int
+afr_inode_split_brain_choice_set(inode_t *inode, xlator_t *this,
+ int spb_choice);
+int
+afr_split_brain_read_subvol_get(inode_t *inode, xlator_t *this,
+ call_frame_t *frame, int *spb_subvol);
+int
+afr_get_child_index_from_name(xlator_t *this, char *name);
+
+int
+afr_is_split_brain(call_frame_t *frame, xlator_t *this, inode_t *inode,
+ uuid_t gfid, gf_boolean_t *d_spb, gf_boolean_t *m_spb);
+int
+afr_spb_choice_timeout_cancel(xlator_t *this, inode_t *inode);
+
+int
+afr_set_split_brain_choice(int ret, call_frame_t *frame, void *opaque);
gf_boolean_t
-afr_fd_has_witnessed_unstable_write (xlator_t *this, fd_t *fd);
+afr_get_need_heal(xlator_t *this);
void
-afr_delayed_changelog_wake_resume (xlator_t *this, fd_t *fd, call_stub_t *stub);
+afr_set_need_heal(xlator_t *this, afr_local_t *local);
+
+int
+afr_selfheal_data_open(xlator_t *this, inode_t *inode, fd_t **fd);
+
+int
+afr_get_msg_id(char *op_type);
int
-afr_inodelk_init (afr_inodelk_t *lk, char *dom, size_t child_count);
+afr_set_in_flight_sb_status(xlator_t *this, call_frame_t *frame,
+ inode_t *inode);
+
+int32_t
+afr_quorum_errno(afr_private_t *priv);
+gf_boolean_t
+afr_is_consistent_io_possible(afr_local_t *local, afr_private_t *priv,
+ int32_t *op_errno);
void
-afr_handle_open_fd_count (call_frame_t *frame, xlator_t *this);
+afr_handle_inconsistent_fop(call_frame_t *frame, int32_t *op_ret,
+ int32_t *op_errno);
void
-afr_remove_eager_lock_stub (afr_local_t *local);
+afr_inode_write_fill(call_frame_t *frame, xlator_t *this, int child_index,
+ int32_t op_ret, int32_t op_errno, struct iatt *prebuf,
+ struct iatt *postbuf, dict_t *xdata);
+void
+afr_process_post_writev(call_frame_t *frame, xlator_t *this);
void
-afr_replies_wipe (struct afr_reply *replies, int count);
+afr_writev_unwind(call_frame_t *frame, xlator_t *this);
-gf_boolean_t
-afr_xattrs_are_equal (dict_t *dict1, dict_t *dict2);
+void
+afr_writev_copy_outvars(call_frame_t *src_frame, call_frame_t *dst_frame);
+
+void
+afr_update_uninodelk(afr_local_t *local, afr_internal_lock_t *int_lock,
+ int32_t child_index);
+afr_fd_ctx_t *
+__afr_fd_ctx_get(fd_t *fd, xlator_t *this);
gf_boolean_t
-afr_is_xattr_ignorable (char *key);
+afr_is_inode_refresh_reqd(inode_t *inode, xlator_t *this, int event_gen1,
+ int event_gen2);
int
-afr_get_heal_info (call_frame_t *frame, xlator_t *this, loc_t *loc);
+afr_serialize_xattrs_with_delimiter(call_frame_t *frame, xlator_t *this,
+ char *buf, const char *default_str,
+ int32_t *serz_len, char delimiter);
+gf_boolean_t
+afr_is_symmetric_error(call_frame_t *frame, xlator_t *this);
int
-afr_heal_splitbrain_file(call_frame_t *frame, xlator_t *this, loc_t *loc);
+__afr_inode_ctx_get(xlator_t *this, inode_t *inode, afr_inode_ctx_t **ctx);
-int
-afr_get_split_brain_status (void *opaque);
+uint64_t
+afr_write_subvol_get(call_frame_t *frame, xlator_t *this);
int
-afr_get_split_brain_status_cbk (int ret, call_frame_t *frame, void *opaque);
+afr_write_subvol_set(call_frame_t *frame, xlator_t *this);
int
-afr_inode_split_brain_choice_set (inode_t *inode, xlator_t *this,
- int spb_choice);
+afr_write_subvol_reset(call_frame_t *frame, xlator_t *this);
+
int
-afr_inode_split_brain_choice_get (inode_t *inode, xlator_t *this,
- int *spb_choice);
+afr_set_inode_local(xlator_t *this, afr_local_t *local, inode_t *inode);
+
int
-afr_get_child_index_from_name (xlator_t *this, char *name);
+afr_fill_ta_loc(xlator_t *this, loc_t *loc, gf_boolean_t is_gfid_based_fop);
int
-afr_is_split_brain (call_frame_t *frame, xlator_t *this, inode_t *inode,
- uuid_t gfid, gf_boolean_t *d_spb, gf_boolean_t *m_spb);
+afr_ta_post_op_lock(xlator_t *this, loc_t *loc);
+
int
-afr_spb_choice_timeout_cancel (xlator_t *this, inode_t *inode);
+afr_ta_post_op_unlock(xlator_t *this, loc_t *loc);
+
+gf_boolean_t
+afr_is_pending_set(xlator_t *this, dict_t *xdata, int type);
int
-afr_set_split_brain_choice (int ret, call_frame_t *frame, void *opaque);
+__afr_get_up_children_count(afr_private_t *priv);
+
+call_frame_t *
+afr_ta_frame_create(xlator_t *this);
gf_boolean_t
-afr_get_need_heal (xlator_t *this);
+afr_ta_has_quorum(afr_private_t *priv, afr_local_t *local);
void
-afr_set_need_heal (xlator_t *this, afr_local_t *local);
+afr_ta_lock_release_synctask(xlator_t *this);
-int
-afr_selfheal_data_open (xlator_t *this, inode_t *inode, fd_t **fd);
+void
+afr_ta_locked_priv_invalidate(afr_private_t *priv);
+
+gf_boolean_t
+afr_lookup_has_quorum(call_frame_t *frame,
+ const unsigned int up_children_count);
+
+void
+afr_mark_new_entry_changelog(call_frame_t *frame, xlator_t *this);
+
+void
+afr_handle_replies_quorum(call_frame_t *frame, xlator_t *this);
+gf_boolean_t
+afr_ta_dict_contains_pending_xattr(dict_t *dict, afr_private_t *priv,
+ int child);
+
+void
+afr_selfheal_childup(xlator_t *this, afr_private_t *priv);
+
+gf_boolean_t
+afr_is_lock_mode_mandatory(dict_t *xdata);
+
+void
+afr_dom_lock_release(call_frame_t *frame);
+
+void
+afr_fill_success_replies(afr_local_t *local, afr_private_t *priv,
+ unsigned char *replies);
+
+gf_boolean_t
+afr_is_private_directory(afr_private_t *priv, uuid_t pargfid, const char *name,
+ pid_t pid);
#endif /* __AFR_H__ */
diff --git a/xlators/cluster/afr/src/pump.c b/xlators/cluster/afr/src/pump.c
deleted file mode 100644
index ef299ec5855..00000000000
--- a/xlators/cluster/afr/src/pump.c
+++ /dev/null
@@ -1,2470 +0,0 @@
-/*
- Copyright (c) 2008-2012 Red Hat, Inc. <http://www.redhat.com>
- This file is part of GlusterFS.
-
- This file is licensed to you under your choice of the GNU Lesser
- General Public License, version 3 or any later version (LGPLv3 or
- later), or the GNU General Public License, version 2 (GPLv2), in all
- cases as published by the Free Software Foundation.
-*/
-
-#include <unistd.h>
-#include <sys/time.h>
-#include <stdlib.h>
-#include <fnmatch.h>
-
-#include "afr-common.c"
-#include "defaults.h"
-#include "glusterfs.h"
-#include "pump.h"
-#include "afr-messages.h"
-
-
-static int
-afr_set_dict_gfid (dict_t *dict, uuid_t gfid)
-{
- int ret = 0;
- uuid_t *pgfid = NULL;
-
- GF_ASSERT (gfid);
-
- pgfid = GF_CALLOC (1, sizeof (uuid_t), gf_common_mt_char);
- if (!pgfid) {
- ret = -1;
- goto out;
- }
-
- gf_uuid_copy (*pgfid, gfid);
-
- ret = dict_set_dynptr (dict, "gfid-req", pgfid, sizeof (uuid_t));
- if (ret)
- gf_msg (THIS->name, GF_LOG_ERROR, -ret,
- AFR_MSG_DICT_SET_FAILED, "gfid set failed");
-
-out:
- if (ret && pgfid)
- GF_FREE (pgfid);
- return ret;
-}
-
-static int
-afr_set_root_gfid (dict_t *dict)
-{
- uuid_t gfid;
- int ret = 0;
-
- memset (gfid, 0, 16);
- gfid[15] = 1;
-
- ret = afr_set_dict_gfid (dict, gfid);
-
- return ret;
-}
-
-static int
-afr_build_child_loc (xlator_t *this, loc_t *child, loc_t *parent, char *name)
-{
- int ret = -1;
- uuid_t pargfid = {0};
-
- if (!child)
- goto out;
-
- if (!gf_uuid_is_null (parent->inode->gfid))
- gf_uuid_copy (pargfid, parent->inode->gfid);
- else if (!gf_uuid_is_null (parent->gfid))
- gf_uuid_copy (pargfid, parent->gfid);
-
- if (gf_uuid_is_null (pargfid))
- goto out;
-
- if (strcmp (parent->path, "/") == 0)
- ret = gf_asprintf ((char **)&child->path, "/%s", name);
- else
- ret = gf_asprintf ((char **)&child->path, "%s/%s", parent->path,
- name);
-
- if (-1 == ret) {
- }
-
- child->name = strrchr (child->path, '/');
- if (child->name)
- child->name++;
-
- child->parent = inode_ref (parent->inode);
- child->inode = inode_new (parent->inode->table);
- gf_uuid_copy (child->pargfid, pargfid);
-
- if (!child->inode) {
- ret = -1;
- goto out;
- }
-
- ret = 0;
-out:
- if ((ret == -1) && child)
- loc_wipe (child);
-
- return ret;
-}
-
-static void
-afr_build_root_loc (xlator_t *this, loc_t *loc)
-{
- afr_private_t *priv = NULL;
-
- priv = this->private;
- loc->path = gf_strdup ("/");
- loc->name = "";
- loc->inode = inode_ref (priv->root_inode);
- gf_uuid_copy (loc->gfid, loc->inode->gfid);
-}
-
-static void
-afr_update_loc_gfids (loc_t *loc, struct iatt *buf, struct iatt *postparent)
-{
- GF_ASSERT (loc);
- GF_ASSERT (buf);
-
- gf_uuid_copy (loc->gfid, buf->ia_gfid);
- if (postparent)
- gf_uuid_copy (loc->pargfid, postparent->ia_gfid);
-}
-
-static uint64_t pump_pid = 0;
-static void
-pump_fill_loc_info (loc_t *loc, struct iatt *iatt, struct iatt *parent)
-{
- afr_update_loc_gfids (loc, iatt, parent);
- gf_uuid_copy (loc->inode->gfid, iatt->ia_gfid);
-}
-
-static int
-pump_mark_start_pending (xlator_t *this)
-{
- afr_private_t *priv = NULL;
- pump_private_t *pump_priv = NULL;
-
- priv = this->private;
- pump_priv = priv->pump_private;
-
- pump_priv->pump_start_pending = 1;
-
- return 0;
-}
-
-static int
-is_pump_start_pending (xlator_t *this)
-{
- afr_private_t *priv = NULL;
- pump_private_t *pump_priv = NULL;
-
- priv = this->private;
- pump_priv = priv->pump_private;
-
- return (pump_priv->pump_start_pending);
-}
-
-static int
-pump_remove_start_pending (xlator_t *this)
-{
- afr_private_t *priv = NULL;
- pump_private_t *pump_priv = NULL;
-
- priv = this->private;
- pump_priv = priv->pump_private;
-
- pump_priv->pump_start_pending = 0;
-
- return 0;
-}
-
-static pump_state_t
-pump_get_state ()
-{
- xlator_t *this = NULL;
- afr_private_t *priv = NULL;
- pump_private_t *pump_priv = NULL;
-
- pump_state_t ret;
-
- this = THIS;
- priv = this->private;
- pump_priv = priv->pump_private;
-
- LOCK (&pump_priv->pump_state_lock);
- {
- ret = pump_priv->pump_state;
- }
- UNLOCK (&pump_priv->pump_state_lock);
-
- return ret;
-}
-
-int
-pump_change_state (xlator_t *this, pump_state_t state)
-{
- afr_private_t *priv = NULL;
- pump_private_t *pump_priv = NULL;
-
- pump_state_t state_old;
- pump_state_t state_new;
-
-
- priv = this->private;
- pump_priv = priv->pump_private;
-
- GF_ASSERT (pump_priv);
-
- LOCK (&pump_priv->pump_state_lock);
- {
- state_old = pump_priv->pump_state;
- state_new = state;
-
- pump_priv->pump_state = state;
-
- }
- UNLOCK (&pump_priv->pump_state_lock);
-
- gf_msg_debug (this->name, 0,
- "Pump changing state from %d to %d",
- state_old, state_new);
-
- return 0;
-}
-
-static int
-pump_set_resume_path (xlator_t *this, const char *path)
-{
- int ret = 0;
-
- afr_private_t *priv = NULL;
- pump_private_t *pump_priv = NULL;
-
- priv = this->private;
- pump_priv = priv->pump_private;
-
- GF_ASSERT (pump_priv);
-
- LOCK (&pump_priv->resume_path_lock);
- {
- strncpy (pump_priv->resume_path, path, strlen (path) + 1);
- }
- UNLOCK (&pump_priv->resume_path_lock);
-
- return ret;
-}
-
-static int
-pump_save_path (xlator_t *this, const char *path)
-{
- afr_private_t *priv = NULL;
- pump_state_t state;
- dict_t *dict = NULL;
- loc_t loc = {0};
- int dict_ret = 0;
- int ret = -1;
-
- state = pump_get_state ();
- if (state == PUMP_STATE_RESUME)
- return 0;
-
- priv = this->private;
-
- GF_ASSERT (priv->root_inode);
-
- afr_build_root_loc (this, &loc);
-
- dict = dict_new ();
- dict_ret = dict_set_str (dict, PUMP_PATH, (char *)path);
- if (dict_ret)
- gf_msg (this->name, GF_LOG_WARNING,
- -dict_ret, AFR_MSG_DICT_SET_FAILED,
- "%s: failed to set the key %s", path, PUMP_PATH);
-
- ret = syncop_setxattr (PUMP_SOURCE_CHILD (this), &loc, dict, 0, NULL,
- NULL);
-
- if (ret < 0) {
- gf_msg (this->name, GF_LOG_INFO, -ret, AFR_MSG_INFO_COMMON,
- "setxattr failed - could not save path=%s", path);
- } else {
- gf_msg_debug (this->name, 0,
- "setxattr succeeded - saved path=%s", path);
- }
-
- dict_unref (dict);
-
- loc_wipe (&loc);
- return 0;
-}
-
-static int
-pump_check_and_update_status (xlator_t *this)
-{
- pump_state_t state;
- int ret = -1;
-
- state = pump_get_state ();
-
- switch (state) {
-
- case PUMP_STATE_RESUME:
- case PUMP_STATE_RUNNING:
- {
- ret = 0;
- break;
- }
- case PUMP_STATE_PAUSE:
- {
- ret = -1;
- break;
- }
- case PUMP_STATE_ABORT:
- {
- pump_save_path (this, "/");
- ret = -1;
- break;
- }
- default:
- {
- gf_msg_debug (this->name, 0,
- "Unknown pump state");
- ret = -1;
- break;
- }
-
- }
-
- return ret;
-}
-
-static const char *
-pump_get_resume_path (xlator_t *this)
-{
- afr_private_t *priv = NULL;
- pump_private_t *pump_priv = NULL;
-
- const char *resume_path = NULL;
-
- priv = this->private;
- pump_priv = priv->pump_private;
-
- resume_path = pump_priv->resume_path;
-
- return resume_path;
-}
-
-static int
-pump_update_resume_state (xlator_t *this, const char *path)
-{
- pump_state_t state;
- const char *resume_path = NULL;
-
- state = pump_get_state ();
-
- if (state == PUMP_STATE_RESUME) {
- resume_path = pump_get_resume_path (this);
- if (strcmp (resume_path, "/") == 0) {
- gf_msg_debug (this->name, 0,
- "Reached the resume path (/). Proceeding to change state"
- " to running");
-
- pump_change_state (this, PUMP_STATE_RUNNING);
- } else if (strcmp (resume_path, path) == 0) {
- gf_msg_debug (this->name, 0,
- "Reached the resume path. Proceeding to change state"
- " to running");
-
- pump_change_state (this, PUMP_STATE_RUNNING);
- } else {
- gf_msg_debug (this->name, 0,
- "Not yet hit the resume path:res-path=%s,path=%s",
- resume_path, path);
- }
- }
-
- return 0;
-}
-
-static gf_boolean_t
-is_pump_traversal_allowed (xlator_t *this, const char *path)
-{
- pump_state_t state;
- const char *resume_path = NULL;
- gf_boolean_t ret = _gf_true;
-
- state = pump_get_state ();
-
- if (state == PUMP_STATE_RESUME) {
- resume_path = pump_get_resume_path (this);
- if (strstr (resume_path, path)) {
- gf_msg_debug (this->name, 0,
- "On the right path to resumption path");
- ret = _gf_true;
- } else {
- gf_msg_debug (this->name, 0,
- "Not the right path to resuming=> ignoring traverse");
- ret = _gf_false;
- }
- }
-
- return ret;
-}
-
-static int
-pump_save_file_stats (xlator_t *this, const char *path)
-{
- afr_private_t *priv = NULL;
- pump_private_t *pump_priv = NULL;
-
- priv = this->private;
- pump_priv = priv->pump_private;
-
- LOCK (&pump_priv->resume_path_lock);
- {
- pump_priv->number_files_pumped++;
-
- strncpy (pump_priv->current_file, path,
- PATH_MAX);
- }
- UNLOCK (&pump_priv->resume_path_lock);
-
- return 0;
-}
-
-static int
-gf_pump_traverse_directory (loc_t *loc)
-{
- xlator_t *this = NULL;
- fd_t *fd = NULL;
- off_t offset = 0;
- loc_t entry_loc = {0};
- gf_dirent_t *entry = NULL;
- gf_dirent_t *tmp = NULL;
- gf_dirent_t entries;
- struct iatt iatt = {0};
- struct iatt parent = {0};
- dict_t *xattr_rsp = NULL;
- int ret = 0;
- gf_boolean_t is_directory_empty = _gf_true;
- gf_boolean_t free_entries = _gf_false;
-
- INIT_LIST_HEAD (&entries.list);
- this = THIS;
-
- GF_ASSERT (loc->inode);
-
- fd = fd_create (loc->inode, pump_pid);
- if (!fd) {
- gf_msg (this->name, GF_LOG_ERROR, 0, AFR_MSG_FD_CREATE_FAILED,
- "Failed to create fd for %s", loc->path);
- goto out;
- }
-
- ret = syncop_opendir (this, loc, fd, NULL, NULL);
- if (ret < 0) {
- gf_msg_debug (this->name, 0,
- "opendir failed on %s", loc->path);
- goto out;
- }
-
- gf_msg_trace (this->name, 0,
- "pump opendir on %s returned=%d",
- loc->path, ret);
-
- while (syncop_readdirp (this, fd, 131072, offset, &entries, NULL,
- NULL)) {
- free_entries = _gf_true;
-
- if (list_empty (&entries.list)) {
- gf_msg_trace (this->name, 0,
- "no more entries in directory");
- goto out;
- }
-
- list_for_each_entry_safe (entry, tmp, &entries.list, list) {
- gf_msg_debug (this->name, 0,
- "found readdir entry=%s", entry->d_name);
-
- offset = entry->d_off;
- if (gf_uuid_is_null (entry->d_stat.ia_gfid)) {
- gf_msg (this->name, GF_LOG_WARNING, 0,
- AFR_MSG_GFID_NULL, "%s/%s: No "
- "gfid present skipping",
- loc->path, entry->d_name);
- continue;
- }
- loc_wipe (&entry_loc);
- ret = afr_build_child_loc (this, &entry_loc, loc,
- entry->d_name);
- if (ret)
- goto out;
-
- if ((strcmp (entry->d_name, ".") == 0) ||
- (strcmp (entry->d_name, "..") == 0))
- continue;
-
- is_directory_empty = _gf_false;
- gf_msg_debug (this->name, 0,
- "lookup %s => %"PRId64,
- entry_loc.path,
- iatt.ia_ino);
-
- ret = syncop_lookup (this, &entry_loc, &iatt, &parent,
- NULL, &xattr_rsp);
-
- if (ret) {
- gf_msg (this->name, GF_LOG_ERROR,
- -ret, AFR_MSG_INFO_COMMON,
- "%s: lookup failed", entry_loc.path);
- continue;
- }
-
- ret = afr_selfheal_name (this, loc->gfid, entry->d_name,
- NULL);
- if (ret) {
- gf_msg (this->name, GF_LOG_ERROR, 0,
- AFR_MSG_SELF_HEAL_FAILED,
- "%s: name self-heal failed (%s/%s)",
- entry_loc.path, uuid_utoa (loc->gfid),
- entry->d_name);
- continue;
- }
-
- ret = afr_selfheal (this, iatt.ia_gfid);
- if (ret < 0) {
- gf_msg (this->name, GF_LOG_ERROR, 0,
- AFR_MSG_SELF_HEAL_FAILED,
- "%s: self-heal failed (%s)",
- entry_loc.path,
- uuid_utoa (iatt.ia_gfid));
- continue;
- }
-
- pump_fill_loc_info (&entry_loc, &iatt, &parent);
-
- pump_update_resume_state (this, entry_loc.path);
-
- pump_save_path (this, entry_loc.path);
- pump_save_file_stats (this, entry_loc.path);
-
- ret = pump_check_and_update_status (this);
- if (ret < 0) {
- gf_msg_debug (this->name, 0,
- "Pump beginning to exit out");
- goto out;
- }
-
- if (IA_ISDIR (iatt.ia_type)) {
- if (is_pump_traversal_allowed (this, entry_loc.path)) {
- gf_msg_trace (this->name, 0,
- "entering dir=%s",
- entry->d_name);
- gf_pump_traverse_directory (&entry_loc);
- }
- }
- }
-
- gf_dirent_free (&entries);
- free_entries = _gf_false;
- gf_msg_trace (this->name, 0, "offset incremented to %d",
- (int32_t) offset);
-
- }
-
- ret = syncop_close (fd);
- if (ret < 0)
- gf_msg_debug (this->name, 0, "closing the fd failed");
-
- if (is_directory_empty && (strcmp (loc->path, "/") == 0)) {
- pump_change_state (this, PUMP_STATE_RUNNING);
- gf_msg (this->name, GF_LOG_INFO, 0, AFR_MSG_INFO_COMMON,
- "Empty source brick. Nothing to be done.");
- }
-
-out:
- if (entry_loc.path)
- loc_wipe (&entry_loc);
- if (free_entries)
- gf_dirent_free (&entries);
- return 0;
-}
-
-static int
-pump_update_resume_path (xlator_t *this)
-{
- const char *resume_path = NULL;
-
- resume_path = pump_get_resume_path (this);
-
- if (resume_path) {
- gf_msg_debug (this->name, 0,
- "Found a path to resume from: %s",
- resume_path);
-
- }else {
- gf_msg_debug (this->name, 0,
- "Did not find a path=> setting to '/'");
- pump_set_resume_path (this, "/");
- }
-
- pump_change_state (this, PUMP_STATE_RESUME);
-
- return 0;
-}
-
-static int32_t
-pump_xattr_cleaner (call_frame_t *frame, void *cookie, xlator_t *this,
- int32_t op_ret, int32_t op_errno, dict_t *xdata)
-{
- afr_private_t *priv = NULL;
- loc_t loc = {0};
- int i = 0;
- int ret = 0;
- int source = 0;
- int sink = 1;
-
- priv = this->private;
-
- afr_build_root_loc (this, &loc);
-
- ret = syncop_removexattr (priv->children[source], &loc,
- PUMP_PATH, 0, NULL);
-
- ret = syncop_removexattr (priv->children[sink], &loc,
- PUMP_SINK_COMPLETE, 0, NULL);
-
- for (i = 0; i < priv->child_count; i++) {
- ret = syncop_removexattr (priv->children[i], &loc,
- PUMP_SOURCE_COMPLETE, 0, NULL);
- if (ret) {
- gf_msg_debug (this->name, 0, "removexattr "
- "failed with %s", strerror (-ret));
- }
- }
-
- loc_wipe (&loc);
- return pump_command_reply (frame, this);
-}
-
-static int
-pump_complete_migration (xlator_t *this)
-{
- afr_private_t *priv = NULL;
- pump_private_t *pump_priv = NULL;
- dict_t *dict = NULL;
- pump_state_t state;
- loc_t loc = {0};
- int dict_ret = 0;
- int ret = -1;
-
- priv = this->private;
- pump_priv = priv->pump_private;
-
- GF_ASSERT (priv->root_inode);
-
- afr_build_root_loc (this, &loc);
-
- dict = dict_new ();
-
- state = pump_get_state ();
- if (state == PUMP_STATE_RUNNING) {
- gf_msg_debug (this->name, 0,
- "Pump finished pumping");
-
- pump_priv->pump_finished = _gf_true;
-
- dict_ret = dict_set_str (dict, PUMP_SOURCE_COMPLETE, "jargon");
- if (dict_ret)
- gf_msg (this->name, GF_LOG_WARNING, -dict_ret,
- AFR_MSG_DICT_SET_FAILED,
- "%s: failed to set the key %s",
- loc.path, PUMP_SOURCE_COMPLETE);
-
- ret = syncop_setxattr (PUMP_SOURCE_CHILD (this), &loc, dict, 0,
- NULL, NULL);
- if (ret < 0) {
- gf_msg_debug (this->name, 0,
- "setxattr failed - while "
- "notifying source complete");
- }
- dict_ret = dict_set_str (dict, PUMP_SINK_COMPLETE, "jargon");
- if (dict_ret)
- gf_msg (this->name, GF_LOG_WARNING, -dict_ret,
- AFR_MSG_DICT_SET_FAILED,
- "%s: failed to set the key %s",
- loc.path, PUMP_SINK_COMPLETE);
-
- ret = syncop_setxattr (PUMP_SINK_CHILD (this), &loc, dict, 0,
- NULL, NULL);
- if (ret < 0) {
- gf_msg_debug (this->name, 0,
- "setxattr failed - while "
- "notifying sink complete");
- }
-
- pump_save_path (this, "/");
-
- } else if (state == PUMP_STATE_ABORT) {
- gf_msg_debug (this->name, 0, "Starting cleanup "
- "of pump internal xattrs");
- call_resume (pump_priv->cleaner);
- }
-
- loc_wipe (&loc);
- return 0;
-}
-
-static int
-pump_lookup_sink (loc_t *loc)
-{
- xlator_t *this = NULL;
- struct iatt iatt, parent;
- dict_t *xattr_rsp;
- dict_t *xattr_req = NULL;
- int ret = 0;
-
- this = THIS;
-
- xattr_req = dict_new ();
-
- ret = afr_set_root_gfid (xattr_req);
- if (ret)
- goto out;
-
- ret = syncop_lookup (PUMP_SINK_CHILD (this), loc, &iatt, &parent,
- xattr_req, &xattr_rsp);
-
- if (ret) {
- gf_msg_debug (this->name, 0,
- "Lookup on sink child failed");
- ret = -1;
- goto out;
- }
-
-out:
- if (xattr_req)
- dict_unref (xattr_req);
-
- return ret;
-}
-
-static int
-pump_task (void *data)
-{
- xlator_t *this = NULL;
- afr_private_t *priv = NULL;
-
-
- loc_t loc = {0};
- struct iatt iatt, parent;
- dict_t *xattr_rsp = NULL;
- dict_t *xattr_req = NULL;
-
- int ret = -1;
-
- this = THIS;
- priv = this->private;
-
- GF_ASSERT (priv->root_inode);
-
- afr_build_root_loc (this, &loc);
- xattr_req = dict_new ();
- if (!xattr_req) {
- gf_msg_debug (this->name, ENOMEM,
- "Out of memory");
- ret = -1;
- goto out;
- }
-
- afr_set_root_gfid (xattr_req);
- ret = syncop_lookup (this, &loc, &iatt, &parent,
- xattr_req, &xattr_rsp);
-
- gf_msg_trace (this->name, 0,
- "lookup: path=%s gfid=%s",
- loc.path, uuid_utoa (loc.inode->gfid));
-
- ret = pump_check_and_update_status (this);
- if (ret < 0) {
- goto out;
- }
-
- pump_update_resume_path (this);
-
- afr_set_root_gfid (xattr_req);
- ret = pump_lookup_sink (&loc);
- if (ret) {
- pump_update_resume_path (this);
- goto out;
- }
-
- gf_pump_traverse_directory (&loc);
-
- pump_complete_migration (this);
-out:
- if (xattr_req)
- dict_unref (xattr_req);
-
- loc_wipe (&loc);
- return 0;
-}
-
-
-static int
-pump_task_completion (int ret, call_frame_t *sync_frame, void *data)
-{
- xlator_t *this = NULL;
- afr_private_t *priv = NULL;
-
- this = THIS;
-
- priv = this->private;
-
- inode_unref (priv->root_inode);
- STACK_DESTROY (sync_frame->root);
-
- gf_msg_debug (this->name, 0,
- "Pump xlator exiting");
- return 0;
-}
-
-int
-pump_start (call_frame_t *pump_frame, xlator_t *this)
-{
- afr_private_t *priv = NULL;
- pump_private_t *pump_priv = NULL;
-
- int ret = -1;
-
- priv = this->private;
- pump_priv = priv->pump_private;
-
- afr_set_lk_owner (pump_frame, this, pump_frame->root);
- pump_pid = (uint64_t) (unsigned long)pump_frame->root;
-
- ret = synctask_new (pump_priv->env, pump_task,
- pump_task_completion,
- pump_frame, NULL);
- if (ret == -1) {
- goto out;
- }
-
- gf_msg_debug (this->name, 0,
- "setting pump as started lk_owner: %s %"PRIu64,
- lkowner_utoa (&pump_frame->root->lk_owner), pump_pid);
-
- priv->use_afr_in_pump = 1;
-out:
- return ret;
-}
-
-static int
-pump_start_synctask (xlator_t *this)
-{
- call_frame_t *frame = NULL;
- int ret = 0;
-
- frame = create_frame (this, this->ctx->pool);
- if (!frame) {
- ret = -1;
- goto out;
- }
-
- pump_change_state (this, PUMP_STATE_RUNNING);
-
- ret = pump_start (frame, this);
-
-out:
- return ret;
-}
-
-int32_t
-pump_cmd_start_setxattr_cbk (call_frame_t *frame,
- void *cookie,
- xlator_t *this,
- int32_t op_ret,
- int32_t op_errno, dict_t *xdata)
-
-{
- call_frame_t *prev = NULL;
- afr_local_t *local = NULL;
- int ret = 0;
-
- local = frame->local;
-
- if (op_ret < 0) {
- gf_msg (this->name, GF_LOG_ERROR, 0,
- AFR_MSG_INFO_COMMON,
- "Could not initiate destination "
- "brick connect");
- ret = op_ret;
- goto out;
- }
-
- gf_msg_debug (this->name, 0,
- "Successfully initiated destination "
- "brick connect");
-
- pump_mark_start_pending (this);
-
- /* send the PARENT_UP as pump is ready now */
- prev = cookie;
- if (prev && prev->this)
- prev->this->notify (prev->this, GF_EVENT_PARENT_UP, this);
-
-out:
- local->op_ret = ret;
- pump_command_reply (frame, this);
-
- return 0;
-}
-
-static int
-pump_initiate_sink_connect (call_frame_t *frame, xlator_t *this)
-{
- afr_local_t *local = NULL;
- afr_private_t *priv = NULL;
- dict_t *dict = NULL;
- data_t *data = NULL;
- char *clnt_cmd = NULL;
- loc_t loc = {0};
-
- int ret = 0;
-
- priv = this->private;
- local = frame->local;
-
- GF_ASSERT (priv->root_inode);
-
- afr_build_root_loc (this, &loc);
-
- data = data_ref (dict_get (local->dict, RB_PUMP_CMD_START));
- if (!data) {
- ret = -1;
- gf_msg (this->name, GF_LOG_ERROR, ENOMEM,
- AFR_MSG_DICT_GET_FAILED,
- "Could not get destination brick value");
- goto out;
- }
-
- dict = dict_new ();
- if (!dict) {
- ret = -1;
- goto out;
- }
-
- clnt_cmd = GF_CALLOC (1, data->len+1, gf_common_mt_char);
- if (!clnt_cmd) {
- ret = -1;
- goto out;
- }
-
- memcpy (clnt_cmd, data->data, data->len);
- clnt_cmd[data->len] = '\0';
- gf_msg_debug (this->name, 0, "Got destination brick %s\n",
- clnt_cmd);
-
- ret = dict_set_dynstr (dict, CLIENT_CMD_CONNECT, clnt_cmd);
- if (ret < 0) {
- gf_msg (this->name, GF_LOG_ERROR, -ret,
- AFR_MSG_DICT_SET_FAILED,
- "Could not inititiate destination brick "
- "connect");
- goto out;
- }
-
- STACK_WIND (frame,
- pump_cmd_start_setxattr_cbk,
- PUMP_SINK_CHILD(this),
- PUMP_SINK_CHILD(this)->fops->setxattr,
- &loc,
- dict,
- 0, NULL);
-
- ret = 0;
-
-out:
- if (dict)
- dict_unref (dict);
-
- if (data)
- data_unref (data);
-
- if (ret && clnt_cmd)
- GF_FREE (clnt_cmd);
-
- loc_wipe (&loc);
- return ret;
-}
-
-static int
-is_pump_aborted (xlator_t *this)
-{
- pump_state_t state;
-
- state = pump_get_state ();
-
- return ((state == PUMP_STATE_ABORT));
-}
-
-int32_t
-pump_cmd_start_getxattr_cbk (call_frame_t *frame,
- void *cookie,
- xlator_t *this,
- int32_t op_ret,
- int32_t op_errno,
- dict_t *dict, dict_t *xdata)
-{
- afr_local_t *local = NULL;
- char *path = NULL;
-
- pump_state_t state;
- int ret = 0;
- int need_unwind = 0;
- int dict_ret = -1;
-
- local = frame->local;
-
- if (op_ret < 0) {
- gf_msg_debug (this->name, 0,
- "getxattr failed - changing pump "
- "state to RUNNING with '/'");
- path = "/";
- ret = op_ret;
- } else {
- gf_msg_trace (this->name, 0,
- "getxattr succeeded");
-
- dict_ret = dict_get_str (dict, PUMP_PATH, &path);
- if (dict_ret < 0)
- path = "/";
- }
-
- state = pump_get_state ();
- if ((state == PUMP_STATE_RUNNING) ||
- (state == PUMP_STATE_RESUME)) {
- gf_msg (this->name, GF_LOG_ERROR,
- 0, AFR_MSG_PUMP_XLATOR_ERROR,
- "Pump is already started");
- ret = -1;
- goto out;
- }
-
- pump_set_resume_path (this, path);
-
- if (is_pump_aborted (this))
- /* We're re-starting pump afresh */
- ret = pump_initiate_sink_connect (frame, this);
- else {
- /* We're re-starting pump from a previous
- pause */
- gf_msg_debug (this->name, 0,
- "about to start synctask");
- ret = pump_start_synctask (this);
- need_unwind = 1;
- }
-
-out:
- if ((ret < 0) || (need_unwind == 1)) {
- local->op_ret = ret;
- pump_command_reply (frame, this);
- }
- return 0;
-}
-
-int
-pump_execute_status (call_frame_t *frame, xlator_t *this)
-{
- afr_private_t *priv = NULL;
- pump_private_t *pump_priv = NULL;
-
- uint64_t number_files = 0;
-
- char filename[PATH_MAX];
- char summary[PATH_MAX+256];
- char *dict_str = NULL;
-
- int32_t op_ret = 0;
- int32_t op_errno = 0;
-
- dict_t *dict = NULL;
- int ret = -1;
-
- priv = this->private;
- pump_priv = priv->pump_private;
-
- LOCK (&pump_priv->resume_path_lock);
- {
- number_files = pump_priv->number_files_pumped;
- strncpy (filename, pump_priv->current_file, PATH_MAX);
- }
- UNLOCK (&pump_priv->resume_path_lock);
-
- dict_str = GF_CALLOC (1, PATH_MAX + 256, gf_afr_mt_char);
- if (!dict_str) {
- op_ret = -1;
- op_errno = ENOMEM;
- goto out;
- }
-
- if (pump_priv->pump_finished) {
- snprintf (summary, PATH_MAX+256,
- "no_of_files=%"PRIu64, number_files);
- } else {
- snprintf (summary, PATH_MAX+256,
- "no_of_files=%"PRIu64":current_file=%s",
- number_files, filename);
- }
- snprintf (dict_str, PATH_MAX+256, "status=%d:%s",
- (pump_priv->pump_finished)?1:0, summary);
-
- dict = dict_new ();
-
- ret = dict_set_dynstr (dict, RB_PUMP_CMD_STATUS, dict_str);
- if (ret < 0) {
- gf_msg_debug (this->name, 0,
- "dict_set_dynstr returned negative value");
- } else {
- dict_str = NULL;
- }
-
- op_ret = 0;
-
-out:
-
- AFR_STACK_UNWIND (getxattr, frame, op_ret, op_errno, dict, NULL);
-
- if (dict)
- dict_unref (dict);
-
- GF_FREE (dict_str);
-
- return 0;
-}
-
-int
-pump_execute_pause (call_frame_t *frame, xlator_t *this)
-{
- afr_local_t *local = NULL;
-
- local = frame->local;
-
- pump_change_state (this, PUMP_STATE_PAUSE);
-
- local->op_ret = 0;
- pump_command_reply (frame, this);
-
- return 0;
-}
-
-int
-pump_execute_start (call_frame_t *frame, xlator_t *this)
-{
- afr_private_t *priv = NULL;
- afr_local_t *local = NULL;
-
- int ret = 0;
- loc_t loc = {0};
-
- priv = this->private;
- local = frame->local;
-
- if (!priv->root_inode) {
- gf_msg (this->name, GF_LOG_ERROR,
- 0, AFR_MSG_PUMP_XLATOR_ERROR,
- "Pump xlator cannot be started without an initial "
- "lookup");
- ret = -1;
- goto out;
- }
-
- GF_ASSERT (priv->root_inode);
-
- afr_build_root_loc (this, &loc);
-
- STACK_WIND (frame,
- pump_cmd_start_getxattr_cbk,
- PUMP_SOURCE_CHILD(this),
- PUMP_SOURCE_CHILD(this)->fops->getxattr,
- &loc,
- PUMP_PATH, NULL);
-
- ret = 0;
-
-out:
- if (ret < 0) {
- local->op_ret = ret;
- pump_command_reply (frame, this);
- }
-
- loc_wipe (&loc);
- return 0;
-}
-
-static int
-pump_cleanup_helper (void *data) {
- call_frame_t *frame = data;
-
- pump_xattr_cleaner (frame, 0, frame->this, 0, 0, NULL);
-
- return 0;
-}
-
-static int
-pump_cleanup_done (int ret, call_frame_t *sync_frame, void *data)
-{
- STACK_DESTROY (sync_frame->root);
-
- return 0;
-}
-
-int
-pump_execute_commit (call_frame_t *frame, xlator_t *this)
-{
- afr_private_t *priv = NULL;
- pump_private_t *pump_priv = NULL;
- afr_local_t *local = NULL;
- call_frame_t *sync_frame = NULL;
- int ret = 0;
-
- priv = this->private;
- pump_priv = priv->pump_private;
- local = frame->local;
-
- local->op_ret = 0;
- if (pump_priv->pump_finished) {
- pump_change_state (this, PUMP_STATE_COMMIT);
- sync_frame = create_frame (this, this->ctx->pool);
- ret = synctask_new (pump_priv->env, pump_cleanup_helper,
- pump_cleanup_done, sync_frame, frame);
- if (ret) {
- gf_msg_debug (this->name, 0, "Couldn't create "
- "synctask for cleaning up xattrs.");
- }
-
- } else {
- gf_msg (this->name, GF_LOG_ERROR, EINPROGRESS,
- AFR_MSG_MIGRATION_IN_PROGRESS,
- "Commit can't proceed. Migration in progress");
- local->op_ret = -1;
- local->op_errno = EINPROGRESS;
- pump_command_reply (frame, this);
- }
-
- return 0;
-}
-int
-pump_execute_abort (call_frame_t *frame, xlator_t *this)
-{
- afr_private_t *priv = NULL;
- pump_private_t *pump_priv = NULL;
- afr_local_t *local = NULL;
- call_frame_t *sync_frame = NULL;
- int ret = 0;
-
- priv = this->private;
- pump_priv = priv->pump_private;
- local = frame->local;
-
- pump_change_state (this, PUMP_STATE_ABORT);
-
- LOCK (&pump_priv->resume_path_lock);
- {
- pump_priv->number_files_pumped = 0;
- pump_priv->current_file[0] = '\0';
- }
- UNLOCK (&pump_priv->resume_path_lock);
-
- local->op_ret = 0;
- if (pump_priv->pump_finished) {
- sync_frame = create_frame (this, this->ctx->pool);
- ret = synctask_new (pump_priv->env, pump_cleanup_helper,
- pump_cleanup_done, sync_frame, frame);
- if (ret) {
- gf_msg_debug (this->name, 0, "Couldn't create "
- "synctask for cleaning up xattrs.");
- }
-
- } else {
- pump_priv->cleaner = fop_setxattr_cbk_stub (frame,
- pump_xattr_cleaner,
- 0, 0, NULL);
- }
-
- return 0;
-}
-
-gf_boolean_t
-pump_command_status (xlator_t *this, dict_t *dict)
-{
- char *cmd = NULL;
- int dict_ret = -1;
- int ret = _gf_true;
-
- dict_ret = dict_get_str (dict, RB_PUMP_CMD_STATUS, &cmd);
- if (dict_ret < 0) {
- gf_msg_debug (this->name, 0,
- "Not a pump status command");
- ret = _gf_false;
- goto out;
- }
-
- gf_msg_debug (this->name, 0,
- "Hit a pump command - status");
- ret = _gf_true;
-
-out:
- return ret;
-
-}
-
-gf_boolean_t
-pump_command_pause (xlator_t *this, dict_t *dict)
-{
- char *cmd = NULL;
- int dict_ret = -1;
- int ret = _gf_true;
-
- dict_ret = dict_get_str (dict, RB_PUMP_CMD_PAUSE, &cmd);
- if (dict_ret < 0) {
- gf_msg_debug (this->name, 0,
- "Not a pump pause command");
- ret = _gf_false;
- goto out;
- }
-
- gf_msg_debug (this->name, 0,
- "Hit a pump command - pause");
- ret = _gf_true;
-
-out:
- return ret;
-
-}
-
-gf_boolean_t
-pump_command_commit (xlator_t *this, dict_t *dict)
-{
- char *cmd = NULL;
- int dict_ret = -1;
- int ret = _gf_true;
-
- dict_ret = dict_get_str (dict, RB_PUMP_CMD_COMMIT, &cmd);
- if (dict_ret < 0) {
- gf_msg_debug (this->name, 0,
- "Not a pump commit command");
- ret = _gf_false;
- goto out;
- }
-
- gf_msg_debug (this->name, 0,
- "Hit a pump command - commit");
- ret = _gf_true;
-
-out:
- return ret;
-
-}
-
-gf_boolean_t
-pump_command_abort (xlator_t *this, dict_t *dict)
-{
- char *cmd = NULL;
- int dict_ret = -1;
- int ret = _gf_true;
-
- dict_ret = dict_get_str (dict, RB_PUMP_CMD_ABORT, &cmd);
- if (dict_ret < 0) {
- gf_msg_debug (this->name, 0,
- "Not a pump abort command");
- ret = _gf_false;
- goto out;
- }
-
- gf_msg_debug (this->name, 0,
- "Hit a pump command - abort");
- ret = _gf_true;
-
-out:
- return ret;
-
-}
-
-gf_boolean_t
-pump_command_start (xlator_t *this, dict_t *dict)
-{
- char *cmd = NULL;
- int dict_ret = -1;
- int ret = _gf_true;
-
- dict_ret = dict_get_str (dict, RB_PUMP_CMD_START, &cmd);
- if (dict_ret < 0) {
- gf_msg_debug (this->name, 0,
- "Not a pump start command");
- ret = _gf_false;
- goto out;
- }
-
- gf_msg_debug (this->name, 0,
- "Hit a pump command - start");
- ret = _gf_true;
-
-out:
- return ret;
-
-}
-
-int
-pump_getxattr (call_frame_t *frame, xlator_t *this, loc_t *loc,
- const char *name, dict_t *xdata)
-{
- afr_private_t *priv = NULL;
- int op_errno = 0;
- int ret = 0;
-
- priv = this->private;
-
- if (!priv->use_afr_in_pump) {
- STACK_WIND (frame, default_getxattr_cbk,
- FIRST_CHILD (this),
- (FIRST_CHILD (this))->fops->getxattr,
- loc, name, xdata);
- return 0;
- }
-
- if (name) {
- if (!strncmp (name, AFR_XATTR_PREFIX,
- strlen (AFR_XATTR_PREFIX))) {
-
- op_errno = ENODATA;
- ret = -1;
- goto out;
- }
-
- if (!strcmp (name, RB_PUMP_CMD_STATUS)) {
- gf_msg_debug (this->name, 0,
- "Hit pump command - status");
- pump_execute_status (frame, this);
- goto out;
- }
- }
-
- afr_getxattr (frame, this, loc, name, xdata);
-
-out:
- if (ret < 0)
- AFR_STACK_UNWIND (getxattr, frame, -1, op_errno, NULL, NULL);
- return 0;
-}
-
-int
-pump_command_reply (call_frame_t *frame, xlator_t *this)
-{
- afr_local_t *local = NULL;
-
- local = frame->local;
-
- if (local->op_ret < 0)
- gf_msg (this->name, GF_LOG_INFO,
- 0, AFR_MSG_INFO_COMMON,
- "Command failed");
- else
- gf_msg (this->name, GF_LOG_INFO,
- 0, AFR_MSG_INFO_COMMON,
- "Command succeeded");
-
- AFR_STACK_UNWIND (setxattr,
- frame,
- local->op_ret,
- local->op_errno, NULL);
-
- return 0;
-}
-
-int
-pump_parse_command (call_frame_t *frame, xlator_t *this, dict_t *dict,
- int *op_errno_p)
-{
- afr_local_t *local = NULL;
- int ret = -1;
- int op_errno = 0;
-
- if (pump_command_start (this, dict)) {
- local = AFR_FRAME_INIT (frame, op_errno);
- if (!local)
- goto out;
- local->dict = dict_ref (dict);
- ret = pump_execute_start (frame, this);
-
- } else if (pump_command_pause (this, dict)) {
- local = AFR_FRAME_INIT (frame, op_errno);
- if (!local)
- goto out;
- local->dict = dict_ref (dict);
- ret = pump_execute_pause (frame, this);
-
- } else if (pump_command_abort (this, dict)) {
- local = AFR_FRAME_INIT (frame, op_errno);
- if (!local)
- goto out;
- local->dict = dict_ref (dict);
- ret = pump_execute_abort (frame, this);
-
- } else if (pump_command_commit (this, dict)) {
- local = AFR_FRAME_INIT (frame, op_errno);
- if (!local)
- goto out;
- local->dict = dict_ref (dict);
- ret = pump_execute_commit (frame, this);
- }
-out:
- if (op_errno_p)
- *op_errno_p = op_errno;
- return ret;
-}
-
-int
-pump_setxattr (call_frame_t *frame, xlator_t *this, loc_t *loc, dict_t *dict,
- int32_t flags, dict_t *xdata)
-{
- afr_private_t *priv = NULL;
- int ret = -1;
- int op_errno = 0;
-
- GF_IF_INTERNAL_XATTR_GOTO ("trusted.glusterfs.pump*", dict, op_errno, out);
-
- priv = this->private;
- if (!priv->use_afr_in_pump) {
- STACK_WIND (frame, default_setxattr_cbk,
- FIRST_CHILD (this),
- (FIRST_CHILD (this))->fops->setxattr,
- loc, dict, flags, xdata);
- return 0;
- }
-
- ret = pump_parse_command (frame, this, dict, &op_errno);
- if (ret >= 0)
- goto out;
-
- afr_setxattr (frame, this, loc, dict, flags, xdata);
-
- ret = 0;
-out:
- if (ret < 0) {
- AFR_STACK_UNWIND (setxattr, frame, -1, op_errno, NULL);
- }
-
- return 0;
-}
-
-/* Defaults */
-static int32_t
-pump_lookup (call_frame_t *frame,
- xlator_t *this,
- loc_t *loc,
- dict_t *xattr_req)
-{
- afr_private_t *priv = NULL;
- priv = this->private;
- if (!priv->use_afr_in_pump) {
- STACK_WIND (frame,
- default_lookup_cbk,
- FIRST_CHILD(this),
- FIRST_CHILD(this)->fops->lookup,
- loc,
- xattr_req);
- return 0;
- }
-
- afr_lookup (frame, this, loc, xattr_req);
- return 0;
-}
-
-
-static int32_t
-pump_truncate (call_frame_t *frame,
- xlator_t *this,
- loc_t *loc,
- off_t offset, dict_t *xdata)
-{
- afr_private_t *priv = NULL;
- priv = this->private;
- if (!priv->use_afr_in_pump) {
- STACK_WIND (frame,
- default_truncate_cbk,
- FIRST_CHILD(this),
- FIRST_CHILD(this)->fops->truncate,
- loc,
- offset, xdata);
- return 0;
- }
-
- afr_truncate (frame, this, loc, offset, xdata);
- return 0;
-}
-
-
-static int32_t
-pump_ftruncate (call_frame_t *frame,
- xlator_t *this,
- fd_t *fd,
- off_t offset, dict_t *xdata)
-{
- afr_private_t *priv = NULL;
- priv = this->private;
- if (!priv->use_afr_in_pump) {
- STACK_WIND (frame,
- default_ftruncate_cbk,
- FIRST_CHILD(this),
- FIRST_CHILD(this)->fops->ftruncate,
- fd,
- offset, xdata);
- return 0;
- }
-
- afr_ftruncate (frame, this, fd, offset, xdata);
- return 0;
-}
-
-
-
-
-int
-pump_mknod (call_frame_t *frame, xlator_t *this,
- loc_t *loc, mode_t mode, dev_t rdev, mode_t umask, dict_t *xdata)
-{
- afr_private_t *priv = NULL;
- priv = this->private;
- if (!priv->use_afr_in_pump) {
- STACK_WIND (frame, default_mknod_cbk,
- FIRST_CHILD(this),
- FIRST_CHILD(this)->fops->mknod,
- loc, mode, rdev, umask, xdata);
- return 0;
- }
- afr_mknod (frame, this, loc, mode, rdev, umask, xdata);
- return 0;
-
-}
-
-
-
-int
-pump_mkdir (call_frame_t *frame, xlator_t *this,
- loc_t *loc, mode_t mode, mode_t umask, dict_t *xdata)
-{
- afr_private_t *priv = NULL;
- priv = this->private;
- if (!priv->use_afr_in_pump) {
- STACK_WIND (frame, default_mkdir_cbk,
- FIRST_CHILD(this),
- FIRST_CHILD(this)->fops->mkdir,
- loc, mode, umask, xdata);
- return 0;
- }
- afr_mkdir (frame, this, loc, mode, umask, xdata);
- return 0;
-
-}
-
-
-static int32_t
-pump_unlink (call_frame_t *frame,
- xlator_t *this,
- loc_t *loc, int xflag, dict_t *xdata)
-{
- afr_private_t *priv = NULL;
- priv = this->private;
- if (!priv->use_afr_in_pump) {
- STACK_WIND (frame,
- default_unlink_cbk,
- FIRST_CHILD(this),
- FIRST_CHILD(this)->fops->unlink,
- loc, xflag, xdata);
- return 0;
- }
- afr_unlink (frame, this, loc, xflag, xdata);
- return 0;
-
-}
-
-
-static int
-pump_rmdir (call_frame_t *frame, xlator_t *this,
- loc_t *loc, int flags, dict_t *xdata)
-{
- afr_private_t *priv = NULL;
-
- priv = this->private;
-
- if (!priv->use_afr_in_pump) {
- STACK_WIND (frame, default_rmdir_cbk,
- FIRST_CHILD(this),
- FIRST_CHILD(this)->fops->rmdir,
- loc, flags, xdata);
- return 0;
- }
-
- afr_rmdir (frame, this, loc, flags, xdata);
- return 0;
-
-}
-
-
-
-int
-pump_symlink (call_frame_t *frame, xlator_t *this,
- const char *linkpath, loc_t *loc, mode_t umask, dict_t *xdata)
-{
- afr_private_t *priv = NULL;
- priv = this->private;
- if (!priv->use_afr_in_pump) {
- STACK_WIND (frame, default_symlink_cbk,
- FIRST_CHILD(this),
- FIRST_CHILD(this)->fops->symlink,
- linkpath, loc, umask, xdata);
- return 0;
- }
- afr_symlink (frame, this, linkpath, loc, umask, xdata);
- return 0;
-
-}
-
-
-static int32_t
-pump_rename (call_frame_t *frame,
- xlator_t *this,
- loc_t *oldloc,
- loc_t *newloc, dict_t *xdata)
-{
- afr_private_t *priv = NULL;
- priv = this->private;
- if (!priv->use_afr_in_pump) {
- STACK_WIND (frame,
- default_rename_cbk,
- FIRST_CHILD(this),
- FIRST_CHILD(this)->fops->rename,
- oldloc, newloc, xdata);
- return 0;
- }
- afr_rename (frame, this, oldloc, newloc, xdata);
- return 0;
-
-}
-
-
-static int32_t
-pump_link (call_frame_t *frame,
- xlator_t *this,
- loc_t *oldloc,
- loc_t *newloc, dict_t *xdata)
-{
- afr_private_t *priv = NULL;
- priv = this->private;
- if (!priv->use_afr_in_pump) {
- STACK_WIND (frame,
- default_link_cbk,
- FIRST_CHILD(this),
- FIRST_CHILD(this)->fops->link,
- oldloc, newloc, xdata);
- return 0;
- }
- afr_link (frame, this, oldloc, newloc, xdata);
- return 0;
-
-}
-
-
-static int32_t
-pump_create (call_frame_t *frame, xlator_t *this,
- loc_t *loc, int32_t flags, mode_t mode,
- mode_t umask, fd_t *fd, dict_t *xdata)
-{
- afr_private_t *priv = NULL;
- priv = this->private;
- if (!priv->use_afr_in_pump) {
- STACK_WIND (frame, default_create_cbk,
- FIRST_CHILD(this),
- FIRST_CHILD(this)->fops->create,
- loc, flags, mode, umask, fd, xdata);
- return 0;
- }
- afr_create (frame, this, loc, flags, mode, umask, fd, xdata);
- return 0;
-
-}
-
-
-static int32_t
-pump_open (call_frame_t *frame,
- xlator_t *this,
- loc_t *loc,
- int32_t flags, fd_t *fd, dict_t *xdata)
-{
- afr_private_t *priv = NULL;
- priv = this->private;
- if (!priv->use_afr_in_pump) {
- STACK_WIND (frame,
- default_open_cbk,
- FIRST_CHILD(this),
- FIRST_CHILD(this)->fops->open,
- loc, flags, fd, xdata);
- return 0;
- }
- afr_open (frame, this, loc, flags, fd, xdata);
- return 0;
-
-}
-
-
-static int32_t
-pump_writev (call_frame_t *frame,
- xlator_t *this,
- fd_t *fd,
- struct iovec *vector,
- int32_t count,
- off_t off, uint32_t flags,
- struct iobref *iobref, dict_t *xdata)
-{
- afr_private_t *priv = NULL;
- priv = this->private;
- if (!priv->use_afr_in_pump) {
- STACK_WIND (frame,
- default_writev_cbk,
- FIRST_CHILD(this),
- FIRST_CHILD(this)->fops->writev,
- fd,
- vector,
- count,
- off, flags,
- iobref, xdata);
- return 0;
- }
-
- afr_writev (frame, this, fd, vector, count, off, flags, iobref, xdata);
- return 0;
-}
-
-
-static int32_t
-pump_flush (call_frame_t *frame,
- xlator_t *this,
- fd_t *fd, dict_t *xdata)
-{
- afr_private_t *priv = NULL;
- priv = this->private;
- if (!priv->use_afr_in_pump) {
- STACK_WIND (frame,
- default_flush_cbk,
- FIRST_CHILD(this),
- FIRST_CHILD(this)->fops->flush,
- fd, xdata);
- return 0;
- }
- afr_flush (frame, this, fd, xdata);
- return 0;
-
-}
-
-
-static int32_t
-pump_fsync (call_frame_t *frame,
- xlator_t *this,
- fd_t *fd,
- int32_t flags, dict_t *xdata)
-{
- afr_private_t *priv = NULL;
- priv = this->private;
- if (!priv->use_afr_in_pump) {
- STACK_WIND (frame,
- default_fsync_cbk,
- FIRST_CHILD(this),
- FIRST_CHILD(this)->fops->fsync,
- fd,
- flags, xdata);
- return 0;
- }
- afr_fsync (frame, this, fd, flags, xdata);
- return 0;
-
-}
-
-
-static int32_t
-pump_opendir (call_frame_t *frame,
- xlator_t *this,
- loc_t *loc, fd_t *fd, dict_t *xdata)
-{
- afr_private_t *priv = NULL;
- priv = this->private;
- if (!priv->use_afr_in_pump) {
- STACK_WIND (frame,
- default_opendir_cbk,
- FIRST_CHILD(this),
- FIRST_CHILD(this)->fops->opendir,
- loc, fd, xdata);
- return 0;
- }
- afr_opendir (frame, this, loc, fd, xdata);
- return 0;
-
-}
-
-
-static int32_t
-pump_fsyncdir (call_frame_t *frame,
- xlator_t *this,
- fd_t *fd,
- int32_t flags, dict_t *xdata)
-{
- afr_private_t *priv = NULL;
- priv = this->private;
- if (!priv->use_afr_in_pump) {
- STACK_WIND (frame,
- default_fsyncdir_cbk,
- FIRST_CHILD(this),
- FIRST_CHILD(this)->fops->fsyncdir,
- fd,
- flags, xdata);
- return 0;
- }
- afr_fsyncdir (frame, this, fd, flags, xdata);
- return 0;
-
-}
-
-
-static int32_t
-pump_xattrop (call_frame_t *frame,
- xlator_t *this,
- loc_t *loc,
- gf_xattrop_flags_t flags,
- dict_t *dict, dict_t *xdata)
-{
- afr_private_t *priv = NULL;
- priv = this->private;
- if (!priv->use_afr_in_pump) {
- STACK_WIND (frame,
- default_xattrop_cbk,
- FIRST_CHILD(this),
- FIRST_CHILD(this)->fops->xattrop,
- loc,
- flags,
- dict, xdata);
- return 0;
- }
- afr_xattrop (frame, this, loc, flags, dict, xdata);
- return 0;
-
-}
-
-static int32_t
-pump_fxattrop (call_frame_t *frame,
- xlator_t *this,
- fd_t *fd,
- gf_xattrop_flags_t flags,
- dict_t *dict, dict_t *xdata)
-{
- afr_private_t *priv = NULL;
- priv = this->private;
- if (!priv->use_afr_in_pump) {
- STACK_WIND (frame,
- default_fxattrop_cbk,
- FIRST_CHILD(this),
- FIRST_CHILD(this)->fops->fxattrop,
- fd,
- flags,
- dict, xdata);
- return 0;
- }
- afr_fxattrop (frame, this, fd, flags, dict, xdata);
- return 0;
-
-}
-
-
-static int32_t
-pump_removexattr (call_frame_t *frame,
- xlator_t *this,
- loc_t *loc,
- const char *name, dict_t *xdata)
-{
- afr_private_t *priv = NULL;
- int op_errno = -1;
-
- VALIDATE_OR_GOTO (this, out);
-
- GF_IF_NATIVE_XATTR_GOTO ("trusted.glusterfs.pump*",
- name, op_errno, out);
-
- op_errno = 0;
- priv = this->private;
- if (!priv->use_afr_in_pump) {
- STACK_WIND (frame,
- default_removexattr_cbk,
- FIRST_CHILD(this),
- FIRST_CHILD(this)->fops->removexattr,
- loc,
- name, xdata);
- return 0;
- }
- afr_removexattr (frame, this, loc, name, xdata);
-
- out:
- if (op_errno)
- AFR_STACK_UNWIND (removexattr, frame, -1, op_errno, NULL);
- return 0;
-
-}
-
-
-
-static int32_t
-pump_readdir (call_frame_t *frame,
- xlator_t *this,
- fd_t *fd,
- size_t size,
- off_t off, dict_t *xdata)
-{
- afr_private_t *priv = NULL;
- priv = this->private;
- if (!priv->use_afr_in_pump) {
- STACK_WIND (frame,
- default_readdir_cbk,
- FIRST_CHILD(this),
- FIRST_CHILD(this)->fops->readdir,
- fd, size, off, xdata);
- return 0;
- }
- afr_readdir (frame, this, fd, size, off, xdata);
- return 0;
-
-}
-
-
-static int32_t
-pump_readdirp (call_frame_t *frame, xlator_t *this, fd_t *fd,
- size_t size, off_t off, dict_t *dict)
-{
- afr_private_t *priv = NULL;
- priv = this->private;
- if (!priv->use_afr_in_pump) {
- STACK_WIND (frame,
- default_readdirp_cbk,
- FIRST_CHILD(this),
- FIRST_CHILD(this)->fops->readdirp,
- fd, size, off, dict);
- return 0;
- }
- afr_readdirp (frame, this, fd, size, off, dict);
- return 0;
-
-}
-
-
-
-static int32_t
-pump_releasedir (xlator_t *this,
- fd_t *fd)
-{
- afr_private_t *priv = NULL;
- priv = this->private;
- if (priv->use_afr_in_pump)
- afr_releasedir (this, fd);
- return 0;
-
-}
-
-static int32_t
-pump_release (xlator_t *this,
- fd_t *fd)
-{
- afr_private_t *priv = NULL;
- priv = this->private;
- if (priv->use_afr_in_pump)
- afr_release (this, fd);
- return 0;
-
-}
-
-static int32_t
-pump_forget (xlator_t *this, inode_t *inode)
-{
- afr_private_t *priv = NULL;
-
- priv = this->private;
- if (priv->use_afr_in_pump)
- afr_forget (this, inode);
-
- return 0;
-}
-
-static int32_t
-pump_setattr (call_frame_t *frame,
- xlator_t *this,
- loc_t *loc,
- struct iatt *stbuf,
- int32_t valid, dict_t *xdata)
-{
- afr_private_t *priv = NULL;
- priv = this->private;
- if (!priv->use_afr_in_pump) {
- STACK_WIND (frame,
- default_setattr_cbk,
- FIRST_CHILD (this),
- FIRST_CHILD (this)->fops->setattr,
- loc, stbuf, valid, xdata);
- return 0;
- }
- afr_setattr (frame, this, loc, stbuf, valid, xdata);
- return 0;
-
-}
-
-
-static int32_t
-pump_fsetattr (call_frame_t *frame,
- xlator_t *this,
- fd_t *fd,
- struct iatt *stbuf,
- int32_t valid, dict_t *xdata)
-{
- afr_private_t *priv = NULL;
- priv = this->private;
- if (!priv->use_afr_in_pump) {
- STACK_WIND (frame,
- default_fsetattr_cbk,
- FIRST_CHILD (this),
- FIRST_CHILD (this)->fops->fsetattr,
- fd, stbuf, valid, xdata);
- return 0;
- }
- afr_fsetattr (frame, this, fd, stbuf, valid, xdata);
- return 0;
-
-}
-
-
-/* End of defaults */
-
-
-int32_t
-mem_acct_init (xlator_t *this)
-{
- int ret = -1;
-
- if (!this)
- return ret;
-
- ret = xlator_mem_acct_init (this, gf_afr_mt_end + 1);
-
- if (ret != 0) {
- return ret;
- }
-
- return ret;
-}
-
-static int
-is_xlator_pump_sink (xlator_t *child)
-{
- return (child == PUMP_SINK_CHILD(THIS));
-}
-
-static int
-is_xlator_pump_source (xlator_t *child)
-{
- return (child == PUMP_SOURCE_CHILD(THIS));
-}
-
-int32_t
-notify (xlator_t *this, int32_t event,
- void *data, ...)
-{
- int ret = -1;
- xlator_t *child_xl = NULL;
-
- child_xl = (xlator_t *) data;
-
- ret = afr_notify (this, event, data, NULL);
-
- switch (event) {
- case GF_EVENT_CHILD_DOWN:
- if (is_xlator_pump_source (child_xl))
- pump_change_state (this, PUMP_STATE_ABORT);
- break;
-
- case GF_EVENT_CHILD_UP:
- if (is_xlator_pump_sink (child_xl))
- if (is_pump_start_pending (this)) {
- gf_msg_debug (this->name, 0,
- "about to start synctask");
- ret = pump_start_synctask (this);
- if (ret < 0)
- gf_msg_debug (this->name, 0,
- "Could not start pump "
- "synctask");
- else
- pump_remove_start_pending (this);
- }
- }
-
- return ret;
-}
-
-int32_t
-init (xlator_t *this)
-{
- afr_private_t * priv = NULL;
- pump_private_t *pump_priv = NULL;
- int child_count = 0;
- xlator_list_t * trav = NULL;
- int i = 0;
- int ret = -1;
- GF_UNUSED int op_errno = 0;
-
- int source_child = 0;
-
- if (!this->children) {
- gf_msg (this->name, GF_LOG_ERROR,
- 0, AFR_MSG_CHILD_MISCONFIGURED,
- "pump translator needs a source and sink"
- "subvolumes defined.");
- return -1;
- }
-
- if (!this->parents) {
- gf_msg (this->name, GF_LOG_WARNING, 0,
- AFR_MSG_VOL_MISCONFIGURED, "Volume is dangling.");
- }
-
- priv = GF_CALLOC (1, sizeof (afr_private_t), gf_afr_mt_afr_private_t);
- if (!priv)
- goto out;
-
- LOCK_INIT (&priv->lock);
-
- child_count = xlator_subvolume_count (this);
- if (child_count != 2) {
- gf_msg (this->name, GF_LOG_ERROR,
- 0, AFR_MSG_CHILD_MISCONFIGURED,
- "There should be exactly 2 children - one source "
- "and one sink");
- return -1;
- }
- priv->child_count = child_count;
-
- priv->read_child = source_child;
- priv->favorite_child = source_child;
- priv->background_self_heal_count = 0;
-
- priv->data_self_heal = "on";
- priv->metadata_self_heal = 1;
- priv->entry_self_heal = 1;
-
- priv->data_self_heal_window_size = 16;
-
- priv->data_change_log = 1;
- priv->metadata_change_log = 1;
- priv->entry_change_log = 1;
- priv->use_afr_in_pump = 1;
- priv->sh_readdir_size = 65536;
-
- /* Locking options */
-
- /* Lock server count infact does not matter. Locks are held
- on all subvolumes, in this case being the source
- and the sink.
- */
-
- priv->child_up = GF_CALLOC (sizeof (unsigned char), child_count,
- gf_afr_mt_char);
- if (!priv->child_up) {
- op_errno = ENOMEM;
- goto out;
- }
-
- priv->children = GF_CALLOC (sizeof (xlator_t *), child_count,
- gf_afr_mt_xlator_t);
- if (!priv->children) {
- op_errno = ENOMEM;
- goto out;
- }
-
- priv->pending_key = GF_CALLOC (sizeof (*priv->pending_key),
- child_count,
- gf_afr_mt_char);
- if (!priv->pending_key) {
- op_errno = ENOMEM;
- goto out;
- }
-
- trav = this->children;
- i = 0;
- while (i < child_count) {
- priv->children[i] = trav->xlator;
-
- ret = gf_asprintf (&priv->pending_key[i], "%s.%s",
- AFR_XATTR_PREFIX,
- trav->xlator->name);
- if (-1 == ret) {
- op_errno = ENOMEM;
- goto out;
- }
-
- trav = trav->next;
- i++;
- }
-
- ret = gf_asprintf (&priv->sh_domain, "%s-self-heal", this->name);
- if (-1 == ret) {
- op_errno = ENOMEM;
- goto out;
- }
-
- priv->root_inode = NULL;
-
- priv->last_event = GF_CALLOC (child_count, sizeof (*priv->last_event),
- gf_afr_mt_int32_t);
- if (!priv->last_event) {
- ret = -ENOMEM;
- goto out;
- }
-
- pump_priv = GF_CALLOC (1, sizeof (*pump_priv),
- gf_afr_mt_pump_priv);
- if (!pump_priv) {
- op_errno = ENOMEM;
- goto out;
- }
-
- LOCK_INIT (&pump_priv->resume_path_lock);
- LOCK_INIT (&pump_priv->pump_state_lock);
-
- pump_priv->resume_path = GF_CALLOC (1, PATH_MAX,
- gf_afr_mt_char);
- if (!pump_priv->resume_path) {
- ret = -1;
- goto out;
- }
-
- pump_priv->env = this->ctx->env;
- if (!pump_priv->env) {
- ret = -1;
- goto out;
- }
-
- /* keep more local here as we may need them for self-heal etc */
- this->local_pool = mem_pool_new (afr_local_t, 128);
- if (!this->local_pool) {
- ret = -1;
- goto out;
- }
-
- priv->pump_private = pump_priv;
- pump_priv = NULL;
-
- this->private = priv;
- priv = NULL;
-
- pump_change_state (this, PUMP_STATE_ABORT);
-
- ret = 0;
-out:
-
- if (pump_priv) {
- GF_FREE (pump_priv->resume_path);
- LOCK_DESTROY (&pump_priv->resume_path_lock);
- LOCK_DESTROY (&pump_priv->pump_state_lock);
- GF_FREE (pump_priv);
- }
-
- if (priv) {
- GF_FREE (priv->child_up);
- GF_FREE (priv->children);
- GF_FREE (priv->pending_key);
- GF_FREE (priv->last_event);
- LOCK_DESTROY (&priv->lock);
- GF_FREE (priv);
- }
-
- return ret;
-}
-
-int
-fini (xlator_t *this)
-{
- afr_private_t * priv = NULL;
- pump_private_t *pump_priv = NULL;
-
- priv = this->private;
- this->private = NULL;
- if (!priv)
- goto out;
-
- pump_priv = priv->pump_private;
- if (!pump_priv)
- goto afr_priv;
-
- GF_FREE (pump_priv->resume_path);
- LOCK_DESTROY (&pump_priv->resume_path_lock);
- LOCK_DESTROY (&pump_priv->pump_state_lock);
- GF_FREE (pump_priv);
-afr_priv:
- afr_priv_destroy (priv);
-out:
- return 0;
-}
-
-
-struct xlator_fops fops = {
- .lookup = pump_lookup,
- .open = pump_open,
- .flush = pump_flush,
- .fsync = pump_fsync,
- .fsyncdir = pump_fsyncdir,
- .xattrop = pump_xattrop,
- .fxattrop = pump_fxattrop,
- .getxattr = pump_getxattr,
-
- /* inode write */
- .writev = pump_writev,
- .truncate = pump_truncate,
- .ftruncate = pump_ftruncate,
- .setxattr = pump_setxattr,
- .setattr = pump_setattr,
- .fsetattr = pump_fsetattr,
- .removexattr = pump_removexattr,
-
- /* dir read */
- .opendir = pump_opendir,
- .readdir = pump_readdir,
- .readdirp = pump_readdirp,
-
- /* dir write */
- .create = pump_create,
- .mknod = pump_mknod,
- .mkdir = pump_mkdir,
- .unlink = pump_unlink,
- .rmdir = pump_rmdir,
- .link = pump_link,
- .symlink = pump_symlink,
- .rename = pump_rename,
-};
-
-struct xlator_dumpops dumpops = {
- .priv = afr_priv_dump,
-};
-
-
-struct xlator_cbks cbks = {
- .release = pump_release,
- .releasedir = pump_releasedir,
- .forget = pump_forget,
-};
-
-struct volume_options options[] = {
- { .key = {NULL} },
-};
diff --git a/xlators/cluster/afr/src/pump.h b/xlators/cluster/afr/src/pump.h
deleted file mode 100644
index 7d5acd02bf6..00000000000
--- a/xlators/cluster/afr/src/pump.h
+++ /dev/null
@@ -1,81 +0,0 @@
-/*
- Copyright (c) 2008-2012 Red Hat, Inc. <http://www.redhat.com>
- This file is part of GlusterFS.
-
- This file is licensed to you under your choice of the GNU Lesser
- General Public License, version 3 or any later version (LGPLv3 or
- later), or the GNU General Public License, version 2 (GPLv2), in all
- cases as published by the Free Software Foundation.
-*/
-
-#ifndef __PUMP_H__
-#define __PUMP_H__
-
-#include "syncop.h"
-
-/* FIXME: Needs to be defined in a common file */
-#define CLIENT_CMD_CONNECT "trusted.glusterfs.client-connect"
-#define CLIENT_CMD_DISCONNECT "trusted.glusterfs.client-disconnect"
-
-#define PUMP_SOURCE_COMPLETE "trusted.glusterfs.pump-source-complete"
-#define PUMP_SINK_COMPLETE "trusted.glusterfs.pump-sink-complete"
-
-#define PUMP_PATH "trusted.glusterfs.pump-path"
-
-#define PUMP_SOURCE_CHILD(xl) (xl->children->xlator)
-#define PUMP_SINK_CHILD(xl) (xl->children->next->xlator)
-
-typedef enum {
- PUMP_STATE_RUNNING, /* Pump is running and migrating files */
- PUMP_STATE_RESUME, /* Pump is resuming from a previous pause */
- PUMP_STATE_PAUSE, /* Pump is paused */
- PUMP_STATE_ABORT, /* Pump is aborted */
- PUMP_STATE_COMMIT, /* Pump is committed */
-} pump_state_t;
-
-typedef struct _pump_private {
- struct syncenv *env; /* The env pointer to the pump synctask */
- char *resume_path; /* path to resume from the last pause */
- gf_lock_t resume_path_lock; /* Synchronize resume_path changes */
- gf_lock_t pump_state_lock; /* Synchronize pump_state changes */
- pump_state_t pump_state; /* State of pump */
- char current_file[PATH_MAX]; /* Current file being pumped */
- uint64_t number_files_pumped; /* Number of files pumped */
- gf_boolean_t pump_finished; /* Boolean to indicate pump termination */
- char pump_start_pending; /* Boolean to mark start pending until
- CHILD_UP */
- call_stub_t *cleaner;
-} pump_private_t;
-
-void
-build_root_loc (inode_t *inode, loc_t *loc);
-int pump_start (call_frame_t *frame, xlator_t *this);
-
-gf_boolean_t
-pump_command_start (xlator_t *this, dict_t *dict);
-
-int
-pump_execute_start (call_frame_t *frame, xlator_t *this);
-
-gf_boolean_t
-pump_command_pause (xlator_t *this, dict_t *dict);
-
-int
-pump_execute_pause (call_frame_t *frame, xlator_t *this);
-
-gf_boolean_t
-pump_command_abort (xlator_t *this, dict_t *dict);
-
-int
-pump_execute_abort (call_frame_t *frame, xlator_t *this);
-
-gf_boolean_t
-pump_command_status (xlator_t *this, dict_t *dict);
-
-int
-pump_execute_status (call_frame_t *frame, xlator_t *this);
-
-int
-pump_command_reply (call_frame_t *frame, xlator_t *this);
-
-#endif /* __PUMP_H__ */
diff --git a/xlators/cluster/dht/src/Makefile.am b/xlators/cluster/dht/src/Makefile.am
index 29be5ce4776..56f1f2ad7c8 100644
--- a/xlators/cluster/dht/src/Makefile.am
+++ b/xlators/cluster/dht/src/Makefile.am
@@ -1,7 +1,4 @@
xlator_LTLIBRARIES = dht.la nufa.la switch.la
-if BUILD_GFDB
- xlator_LTLIBRARIES += tier.la
-endif
AM_CFLAGS = -Wall $(GF_CFLAGS)
@@ -10,40 +7,34 @@ xlatordir = $(libdir)/glusterfs/$(PACKAGE_VERSION)/xlator/cluster
dht_common_source = dht-layout.c dht-helper.c dht-linkfile.c dht-rebalance.c \
dht-selfheal.c dht-rename.c dht-hashfn.c dht-diskusage.c \
dht-common.c dht-inode-write.c dht-inode-read.c dht-shared.c \
- $(top_builddir)/xlators/lib/src/libxlator.c
+ dht-lock.c $(top_builddir)/xlators/lib/src/libxlator.c
dht_la_SOURCES = $(dht_common_source) dht.c
nufa_la_SOURCES = $(dht_common_source) nufa.c
switch_la_SOURCES = $(dht_common_source) switch.c
-tier_la_SOURCES = $(dht_common_source) tier.c tier-common.c
-dht_la_LDFLAGS = -module -avoid-version -export-symbols $(top_srcdir)/xlators/cluster/dht/src/dht.sym
+dht_la_LDFLAGS = -module $(GF_XLATOR_DEFAULT_LDFLAGS)
dht_la_LIBADD = $(top_builddir)/libglusterfs/src/libglusterfs.la
-nufa_la_LDFLAGS = -module -avoid-version -export-symbols $(top_srcdir)/xlators/cluster/dht/src/nufa.sym
+nufa_la_LDFLAGS = -module $(GF_XLATOR_DEFAULT_LDFLAGS)
nufa_la_LIBADD = $(top_builddir)/libglusterfs/src/libglusterfs.la
-switch_la_LDFLAGS = -module -avoid-version -export-symbols $(top_srcdir)/xlators/cluster/dht/src/switch.sym
+switch_la_LDFLAGS = -module $(GF_XLATOR_DEFAULT_LDFLAGS)
switch_la_LIBADD = $(top_builddir)/libglusterfs/src/libglusterfs.la
-tier_la_LDFLAGS = -module -avoid-version -export-symbols $(top_srcdir)/xlators/cluster/dht/src/tier.sym
-tier_la_LIBADD = $(top_builddir)/libglusterfs/src/libglusterfs.la
-
-noinst_HEADERS = dht-common.h dht-mem-types.h dht-messages.h dht-helper.h tier-common.h tier.h\
- $(top_builddir)/xlators/lib/src/libxlator.h
+noinst_HEADERS = dht-common.h dht-mem-types.h dht-messages.h \
+ dht-lock.h $(top_builddir)/xlators/lib/src/libxlator.h
AM_CPPFLAGS = $(GF_CPPFLAGS) -I$(top_srcdir)/libglusterfs/src \
- -I$(top_srcdir)/libglusterfs/src/gfdb \
+ -I$(top_srcdir)/rpc/xdr/src -I$(top_builddir)/rpc/xdr/src \
+ -I$(top_srcdir)/rpc/rpc-lib/src \
-I$(top_srcdir)/xlators/lib/src \
-DDATADIR=\"$(localstatedir)\" \
- -DLIBDIR=\"$(libdir)\" \
- -DLIBGFDB_VERSION=\"$(LIBGFDB_VERSION)\"
+ -DLIBDIR=\"$(libdir)\"
CLEANFILES =
-EXTRA_DIST = dht.sym nufa.sym switch.sym tier.sym
-
uninstall-local:
rm -f $(DESTDIR)$(xlatordir)/distribute.so
diff --git a/xlators/cluster/dht/src/dht-common.c b/xlators/cluster/dht/src/dht-common.c
index 5886886e830..8ba0cc4c732 100644
--- a/xlators/cluster/dht/src/dht-common.c
+++ b/xlators/cluster/dht/src/dht-common.c
@@ -8,163 +8,439 @@
cases as published by the Free Software Foundation.
*/
-
/* TODO: add NS locking */
-#include "glusterfs.h"
-#include "xlator.h"
#include "libxlator.h"
#include "dht-common.h"
-#include "defaults.h"
-#include "byte-order.h"
-#include "glusterfs-acl.h"
-#include "quota-common-utils.h"
+#include "dht-lock.h"
+#include <glusterfs/byte-order.h>
+#include <glusterfs/quota-common-utils.h>
+#include <glusterfs/upcall-utils.h>
+#include "glusterfs/compat-errno.h" // for ENODATA on BSD
+#include <glusterfs/common-utils.h>
#include <sys/time.h>
#include <libgen.h>
#include <signal.h>
-int run_defrag = 0;
+static int
+dht_rmdir_readdirp_cbk(call_frame_t *frame, void *cookie, xlator_t *this,
+ int op_ret, int op_errno, gf_dirent_t *entries,
+ dict_t *xdata);
-int dht_link2 (xlator_t *this, xlator_t *dst_node, call_frame_t *frame,
- int ret);
+static int
+dht_link2(xlator_t *this, xlator_t *subvol, call_frame_t *frame, int ret);
-int
-dht_removexattr2 (xlator_t *this, xlator_t *subvol, call_frame_t *frame,
- int ret);
+static int
+dht_set_dir_xattr_req(xlator_t *this, loc_t *loc, dict_t *xattr_req);
-int
-dht_setxattr2 (xlator_t *this, xlator_t *subvol, call_frame_t *frame,
- int ret);
+static int
+dht_lookup_everywhere_done(call_frame_t *frame, xlator_t *this);
-int
-dht_aggregate_quota_xattr (dict_t *dst, char *key, data_t *value)
-{
- int ret = -1;
- quota_meta_t *meta_dst = NULL;
- quota_meta_t *meta_src = NULL;
- int64_t *size = NULL;
- int64_t dst_dir_count = 0;
- int64_t src_dir_count = 0;
-
- if (value == NULL) {
- gf_msg ("dht", GF_LOG_WARNING, 0,
- DHT_MSG_DATA_NULL, "data value is NULL");
- ret = -1;
- goto out;
- }
+static int
+dht_common_mark_mdsxattr_cbk(call_frame_t *frame, void *cookie, xlator_t *this,
+ int op_ret, int op_errno, dict_t *xdata);
+
+static int
+dht_rmdir_unlock(call_frame_t *frame, xlator_t *this);
- ret = dict_get_bin (dst, key, (void **)&meta_dst);
+static const char *dht_dbg_vxattrs[] = {DHT_DBG_HASHED_SUBVOL_PATTERN, NULL};
+
+/* Check the xdata to make sure EBADF has been set by client xlator */
+int32_t
+dht_check_remote_fd_failed_error(dht_local_t *local, int op_ret, int op_errno)
+{
+ if (op_ret == -1 && (op_errno == EBADF || op_errno == EBADFD) &&
+ !(local->fd_checked)) {
+ return 1;
+ }
+ return 0;
+}
+
+/* Sets the blocks and size values to fixed values. This is to be called
+ * only for dirs. The caller is responsible for checking the type
+ */
+int32_t
+dht_set_fixed_dir_stat(struct iatt *stat)
+{
+ if (stat) {
+ stat->ia_blocks = DHT_DIR_STAT_BLOCKS;
+ stat->ia_size = DHT_DIR_STAT_SIZE;
+ return 0;
+ }
+ return -1;
+}
+
+/* Return true if key exists in array
+ */
+static gf_boolean_t
+dht_match_xattr(const char *key)
+{
+ char **xattrs_to_heal = get_xattrs_to_heal();
+
+ return gf_get_index_by_elem(xattrs_to_heal, (char *)key) >= 0;
+}
+
+static int
+dht_aggregate_quota_xattr(dict_t *dst, char *key, data_t *value)
+{
+ int ret = -1;
+ quota_meta_t *meta_dst = NULL;
+ quota_meta_t *meta_src = NULL;
+ int64_t *size = NULL;
+ int64_t dst_dir_count = 0;
+ int64_t src_dir_count = 0;
+
+ if (value == NULL) {
+ gf_msg("dht", GF_LOG_WARNING, 0, DHT_MSG_DATA_NULL,
+ "data value is NULL");
+ ret = -1;
+ goto out;
+ }
+
+ ret = dict_get_bin(dst, key, (void **)&meta_dst);
+ if (ret < 0) {
+ meta_dst = GF_CALLOC(1, sizeof(quota_meta_t), gf_common_quota_meta_t);
+ if (meta_dst == NULL) {
+ gf_msg("dht", GF_LOG_WARNING, ENOMEM, DHT_MSG_NO_MEMORY,
+ "Memory allocation failed");
+ ret = -1;
+ goto out;
+ }
+ ret = dict_set_bin(dst, key, meta_dst, sizeof(quota_meta_t));
if (ret < 0) {
- meta_dst = GF_CALLOC (1, sizeof (quota_meta_t),
- gf_common_quota_meta_t);
- if (meta_dst == NULL) {
- gf_msg ("dht", GF_LOG_WARNING, ENOMEM,
- DHT_MSG_NO_MEMORY,
- "Memory allocation failed");
- ret = -1;
- goto out;
- }
- ret = dict_set_bin (dst, key, meta_dst,
- sizeof (quota_meta_t));
- if (ret < 0) {
- gf_msg ("dht", GF_LOG_WARNING, EINVAL,
- DHT_MSG_DICT_SET_FAILED,
- "dht aggregate dict set failed");
- GF_FREE (meta_dst);
- ret = -1;
- goto out;
- }
+ gf_msg("dht", GF_LOG_WARNING, EINVAL, DHT_MSG_DICT_SET_FAILED,
+ "dht aggregate dict set failed");
+ GF_FREE(meta_dst);
+ ret = -1;
+ goto out;
}
+ }
- if (value->len > sizeof (int64_t)) {
- meta_src = data_to_bin (value);
+ if (value->len > sizeof(int64_t)) {
+ meta_src = data_to_bin(value);
- meta_dst->size = hton64 (ntoh64 (meta_dst->size) +
- ntoh64 (meta_src->size));
- meta_dst->file_count = hton64 (ntoh64 (meta_dst->file_count) +
- ntoh64 (meta_src->file_count));
+ meta_dst->size = hton64(ntoh64(meta_dst->size) +
+ ntoh64(meta_src->size));
+ meta_dst->file_count = hton64(ntoh64(meta_dst->file_count) +
+ ntoh64(meta_src->file_count));
- if (value->len > (2 * sizeof (int64_t))) {
- dst_dir_count = ntoh64 (meta_dst->dir_count);
- src_dir_count = ntoh64 (meta_src->dir_count);
+ if (value->len > (2 * sizeof(int64_t))) {
+ dst_dir_count = ntoh64(meta_dst->dir_count);
+ src_dir_count = ntoh64(meta_src->dir_count);
- if (src_dir_count > dst_dir_count)
- meta_dst->dir_count = meta_src->dir_count;
- } else {
- meta_dst->dir_count = 0;
- }
+ if (src_dir_count > dst_dir_count)
+ meta_dst->dir_count = meta_src->dir_count;
} else {
- size = data_to_bin (value);
- meta_dst->size = hton64 (ntoh64 (meta_dst->size) +
- ntoh64 (*size));
+ meta_dst->dir_count = 0;
}
+ } else {
+ size = data_to_bin(value);
+ meta_dst->size = hton64(ntoh64(meta_dst->size) + ntoh64(*size));
+ }
- ret = 0;
+ ret = 0;
out:
- return ret;
+ return ret;
}
+static int
+add_opt(char **optsp, const char *opt)
+{
+ char *newopts = NULL;
+ unsigned oldsize = 0;
+ unsigned newsize = 0;
+
+ if (*optsp == NULL)
+ newopts = gf_strdup(opt);
+ else {
+ oldsize = strlen(*optsp);
+ newsize = oldsize + 1 + strlen(opt) + 1;
+ newopts = GF_REALLOC(*optsp, newsize);
+ if (newopts)
+ sprintf(newopts + oldsize, ",%s", opt);
+ }
+ if (newopts == NULL) {
+ gf_msg("dht", GF_LOG_WARNING, 0, DHT_MSG_NO_MEMORY,
+ "Error to add choices in buffer in add_opt");
+ return -1;
+ }
+ *optsp = newopts;
+ return 0;
+}
-
-int
-dht_aggregate (dict_t *this, char *key, data_t *value, void *data)
+/* Return Choice list from Split brain status */
+static char *
+getChoices(const char *value)
{
- dict_t *dst = NULL;
- int32_t ret = -1;
- data_t *dict_data = NULL;
+ int i = 0;
+ char *ptr = NULL;
+ char *tok = NULL;
+ char *result = NULL;
+ char *newval = NULL;
+
+ ptr = strstr(value, "Choices:");
+ if (!ptr) {
+ result = ptr;
+ goto out;
+ }
- dst = data;
+ newval = gf_strdup(ptr);
+ if (!newval) {
+ result = newval;
+ goto out;
+ }
- if (strcmp (key, QUOTA_SIZE_KEY) == 0) {
- ret = dht_aggregate_quota_xattr (dst, key, value);
- if (ret) {
- gf_msg ("dht", GF_LOG_WARNING, 0,
- DHT_MSG_AGGREGATE_QUOTA_XATTR_FAILED,
- "Failed to aggregate quota xattr");
- goto out;
- }
- } else if (fnmatch (GF_XATTR_STIME_PATTERN, key, FNM_NOESCAPE) == 0) {
- ret = gf_get_min_stime (THIS, dst, key, value);
- if (ret < 0)
- goto out;
- } else {
- /* compare user xattrs only */
- if (!strncmp (key, "user.", strlen ("user."))) {
- ret = dict_lookup (dst, key, &dict_data);
- if (!ret && dict_data && value) {
- ret = is_data_equal (dict_data, value);
- if (!ret)
- gf_msg_debug ("dht", 0,
- "xattr mismatch for %s",
- key);
- }
- }
- ret = dict_set (dst, key, value);
- if (ret) {
- gf_msg ("dht", GF_LOG_WARNING, 0,
- DHT_MSG_DICT_SET_FAILED,
- "Failed to set dictionary value: key = %s",
- key);
- }
- }
+ tok = strtok(newval, ":");
+ if (!tok) {
+ result = tok;
+ goto out;
+ }
+
+ while (tok) {
+ i++;
+ if (i == 2)
+ break;
+ tok = strtok(NULL, ":");
+ }
+
+ result = gf_strdup(tok);
- ret = 0;
out:
- return ret;
+ if (newval)
+ GF_FREE(newval);
+
+ return result;
}
+/* This function prepare a list of choices for key
+ (replica.split-brain-status) in case of metadata split brain
+ only on the basis of key-value passed to this function.
+ After prepare the list of choices it update the same key in dict
+ with this value to reflect the same in
+ replica.split-brain-status attr for file.
-void
-dht_aggregate_xattr (dict_t *dst, dict_t *src)
+*/
+
+static int
+dht_aggregate_split_brain_xattr(dict_t *dst, char *key, data_t *value)
{
- if ((dst == NULL) || (src == NULL)) {
+ int ret = 0;
+ char *oldvalue = NULL;
+ char *old_choice = NULL;
+ char *new_choice = NULL;
+ char *full_choice = NULL;
+ char *status = NULL;
+
+ if (value == NULL) {
+ gf_msg("dht", GF_LOG_WARNING, 0, DHT_MSG_DATA_NULL,
+ "GF_AFR_SBRAIN_STATUS value is NULL");
+ ret = -1;
+ goto out;
+ }
+
+ ret = dict_get_str(dst, key, &oldvalue);
+ if (ret)
+ goto out;
+
+ /* skip code that is irrelevant if !oldvalue */
+ if (!oldvalue)
+ goto out;
+
+ if (strstr(oldvalue, "not")) {
+ gf_msg_debug("dht", 0, "Need to update split-brain status in dict");
+ ret = -1;
+ goto out;
+ }
+ if (strstr(oldvalue, "metadata-split-brain:yes") &&
+ (strstr(oldvalue, "data-split-brain:no"))) {
+ if (strstr(value->data, "not")) {
+ gf_msg_debug("dht", 0, "No need to update split-brain status");
+ ret = 0;
+ goto out;
+ }
+ if (strstr(value->data, "yes") &&
+ (strncmp(oldvalue, value->data, strlen(oldvalue)))) {
+ old_choice = getChoices(oldvalue);
+ if (!old_choice) {
+ gf_msg("dht", GF_LOG_WARNING, 0, DHT_MSG_NO_MEMORY,
+ "Error to get choices");
+ ret = -1;
+ goto out;
+ }
+
+ ret = add_opt(&full_choice, old_choice);
+ if (ret) {
+ gf_msg("dht", GF_LOG_WARNING, 0, DHT_MSG_NO_MEMORY,
+ "Error to add choices");
+ ret = -1;
+ goto out;
+ }
+
+ new_choice = getChoices(value->data);
+ if (!new_choice) {
+ gf_msg("dht", GF_LOG_WARNING, 0, DHT_MSG_NO_MEMORY,
+ "Error to get choices");
+ ret = -1;
+ goto out;
+ }
+
+ ret = add_opt(&full_choice, new_choice);
+ if (ret) {
+ gf_msg("dht", GF_LOG_WARNING, 0, DHT_MSG_NO_MEMORY,
+ "Error to add choices ");
+ ret = -1;
goto out;
+ }
+ ret = gf_asprintf(&status,
+ "data-split-brain:%s "
+ "metadata-split-brain:%s Choices:%s",
+ "no", "yes", full_choice);
+
+ if (-1 == ret) {
+ gf_msg("dht", GF_LOG_WARNING, 0, DHT_MSG_NO_MEMORY,
+ "Error to prepare status ");
+ goto out;
+ }
+ ret = dict_set_dynstr(dst, key, status);
+ if (ret) {
+ gf_msg("dht", GF_LOG_WARNING, 0, DHT_MSG_DICT_SET_FAILED,
+ "Failed to set full choice");
+ }
}
+ }
- dict_foreach (src, dht_aggregate, dst);
out:
- return;
+ if (old_choice)
+ GF_FREE(old_choice);
+ if (new_choice)
+ GF_FREE(new_choice);
+ if (full_choice)
+ GF_FREE(full_choice);
+
+ return ret;
+}
+
+static int
+dht_aggregate(dict_t *this, char *key, data_t *value, void *data)
+{
+ dict_t *dst = NULL;
+ int32_t ret = -1;
+ data_t *dict_data = NULL;
+
+ dst = data;
+
+ /* compare split brain xattr only */
+ if (strcmp(key, GF_AFR_SBRAIN_STATUS) == 0) {
+ ret = dht_aggregate_split_brain_xattr(dst, key, value);
+ if (!ret)
+ goto out;
+ } else if (strcmp(key, QUOTA_SIZE_KEY) == 0) {
+ ret = dht_aggregate_quota_xattr(dst, key, value);
+ if (ret) {
+ gf_msg("dht", GF_LOG_WARNING, 0,
+ DHT_MSG_AGGREGATE_QUOTA_XATTR_FAILED,
+ "Failed to aggregate quota xattr");
+ }
+ goto out;
+ } else if (fnmatch(GF_XATTR_STIME_PATTERN, key, FNM_NOESCAPE) == 0) {
+ ret = gf_get_min_stime(THIS, dst, key, value);
+ goto out;
+ } else {
+ /* compare user xattrs only */
+ if (!strncmp(key, "user.", SLEN("user."))) {
+ ret = dict_lookup(dst, key, &dict_data);
+ if (!ret && dict_data && value) {
+ ret = is_data_equal(dict_data, value);
+ if (!ret)
+ gf_msg_debug("dht", 0, "xattr mismatch for %s", key);
+ }
+ }
+ }
+
+ ret = dict_set(dst, key, value);
+ if (ret) {
+ gf_msg("dht", GF_LOG_WARNING, 0, DHT_MSG_DICT_SET_FAILED,
+ "Failed to set dictionary value: key = %s", key);
+ }
+
+out:
+ return ret;
+}
+
+static void
+dht_aggregate_xattr(dict_t *dst, dict_t *src)
+{
+ if ((dst == NULL) || (src == NULL)) {
+ goto out;
+ }
+
+ dict_foreach(src, dht_aggregate, dst);
+out:
+ return;
+}
+
+/* Code to save hashed subvol on inode ctx as a mds subvol
+ */
+int
+dht_inode_ctx_mdsvol_set(inode_t *inode, xlator_t *this, xlator_t *mds_subvol)
+{
+ dht_inode_ctx_t *ctx = NULL;
+ int ret = -1;
+ uint64_t ctx_int = 0;
+ gf_boolean_t ctx_free = _gf_false;
+
+ LOCK(&inode->lock);
+ {
+ ret = __inode_ctx_get(inode, this, &ctx_int);
+ if (ctx_int) {
+ ctx = (dht_inode_ctx_t *)(uintptr_t)ctx_int;
+ ctx->mds_subvol = mds_subvol;
+ } else {
+ ctx = GF_CALLOC(1, sizeof(*ctx), gf_dht_mt_inode_ctx_t);
+ if (!ctx)
+ goto unlock;
+ ctx->mds_subvol = mds_subvol;
+ ctx_free = _gf_true;
+ ctx_int = (long)ctx;
+ ret = __inode_ctx_set(inode, this, &ctx_int);
+ }
+ }
+unlock:
+ UNLOCK(&inode->lock);
+ if (ret && ctx_free)
+ GF_FREE(ctx);
+ return ret;
+}
+
+/*Code to get mds subvol from inode ctx */
+
+int
+dht_inode_ctx_mdsvol_get(inode_t *inode, xlator_t *this, xlator_t **mdsvol)
+{
+ dht_inode_ctx_t *ctx = NULL;
+ int ret = -1;
+
+ if (!mdsvol)
+ return ret;
+
+ if (__is_root_gfid(inode->gfid)) {
+ (*mdsvol) = FIRST_CHILD(this);
+ return 0;
+ }
+
+ ret = dht_inode_ctx_get(inode, this, &ctx);
+
+ if (!ret && ctx) {
+ if (ctx->mds_subvol) {
+ *mdsvol = ctx->mds_subvol;
+ ret = 0;
+ } else {
+ ret = -1;
+ }
+ }
+
+ return ret;
}
/* TODO:
@@ -174,1024 +450,1851 @@ out:
- complete linkfile selfheal
*/
-
-int
-dht_lookup_selfheal_cbk (call_frame_t *frame, void *cookie,
- xlator_t *this,
- int op_ret, int op_errno, dict_t *xdata)
+static int
+dht_lookup_selfheal_cbk(call_frame_t *frame, void *cookie, xlator_t *this,
+ int op_ret, int op_errno, dict_t *xdata)
{
- dht_local_t *local = NULL;
- dht_layout_t *layout = NULL;
- int ret = -1;
+ dht_local_t *local = NULL;
+ dht_layout_t *layout = NULL;
+ dht_conf_t *conf = NULL;
+ int ret = -1;
- GF_VALIDATE_OR_GOTO ("dht", frame, out);
- GF_VALIDATE_OR_GOTO ("dht", this, out);
- GF_VALIDATE_OR_GOTO ("dht", frame->local, out);
+ GF_VALIDATE_OR_GOTO("dht", frame, out);
+ GF_VALIDATE_OR_GOTO("dht", this, out);
+ GF_VALIDATE_OR_GOTO("dht", frame->local, out);
- local = frame->local;
- ret = op_ret;
+ local = frame->local;
+ conf = this->private;
+ ret = op_ret;
- FRAME_SU_UNDO (frame, dht_local_t);
+ FRAME_SU_UNDO(frame, dht_local_t);
- if (ret == 0) {
- layout = local->selfheal.layout;
- ret = dht_layout_set (this, local->inode, layout);
- }
+ if (ret == 0) {
+ layout = local->selfheal.layout;
+ ret = dht_layout_set(this, local->inode, layout);
+ }
- if (local->loc.parent) {
- dht_inode_ctx_time_update (local->loc.parent, this,
- &local->postparent, 1);
- }
+ dht_inode_ctx_time_update(local->inode, this, &local->stbuf, 1);
+ if (local->loc.parent) {
+ dht_inode_ctx_time_update(local->loc.parent, this, &local->postparent,
+ 1);
+ }
- DHT_STRIP_PHASE1_FLAGS (&local->stbuf);
+ DHT_STRIP_PHASE1_FLAGS(&local->stbuf);
+ dht_set_fixed_dir_stat(&local->postparent);
+ /* Delete mds xattr at the time of STACK UNWIND */
+ GF_REMOVE_INTERNAL_XATTR(conf->mds_xattr_key, local->xattr);
- DHT_STACK_UNWIND (lookup, frame, ret, local->op_errno, local->inode,
- &local->stbuf, local->xattr, &local->postparent);
+ DHT_STACK_UNWIND(lookup, frame, ret, local->op_errno, local->inode,
+ &local->stbuf, local->xattr, &local->postparent);
out:
- return ret;
+ return ret;
}
-int
-dht_discover_complete (xlator_t *this, call_frame_t *discover_frame)
-{
- dht_local_t *local = NULL;
- dht_local_t *heal_local = NULL;
- call_frame_t *main_frame = NULL;
- call_frame_t *heal_frame = NULL;
- int op_errno = 0;
- int ret = -1;
- dht_layout_t *layout = NULL;
- dht_conf_t *conf = NULL;
- uint32_t vol_commit_hash = 0;
- xlator_t *source = NULL;
- int heal_path = 0;
- int i = 0;
- loc_t loc = {0 };
-
- local = discover_frame->local;
- layout = local->layout;
- conf = this->private;
+static int
+dht_discover_complete(xlator_t *this, call_frame_t *discover_frame)
+{
+ dht_local_t *local = NULL;
+ dht_local_t *heal_local = NULL;
+ call_frame_t *main_frame = NULL;
+ call_frame_t *heal_frame = NULL;
+ int op_errno = 0;
+ int ret = -1;
+ dht_layout_t *layout = NULL;
+ dht_conf_t *conf = NULL;
+ uint32_t vol_commit_hash = 0;
+ xlator_t *source = NULL;
+ int heal_path = 0;
+ int error_while_marking_mds = 0;
+ int i = 0;
+ loc_t loc = {0};
+ int8_t is_read_only = 0, layout_anomalies = 0;
+ char gfid_local[GF_UUID_BUF_SIZE] = {0};
+
+ local = discover_frame->local;
+ layout = local->layout;
+ conf = this->private;
+ gf_uuid_unparse(local->gfid, gfid_local);
+
+ LOCK(&discover_frame->lock);
+ {
+ main_frame = local->main_frame;
+ local->main_frame = NULL;
+ }
+ UNLOCK(&discover_frame->lock);
+
+ if (!main_frame)
+ return 0;
+
+ /* Code to update all extended attributed from
+ subvol to local->xattr on that internal xattr has found
+ */
+ if (conf->subvolume_cnt == 1)
+ local->need_xattr_heal = 0;
+ if (local->need_xattr_heal && (local->mds_xattr)) {
+ dht_dir_set_heal_xattr(this, local, local->xattr, local->mds_xattr,
+ NULL, NULL);
+ dict_unref(local->mds_xattr);
+ local->mds_xattr = NULL;
+ }
+
+ ret = dict_get_int8(local->xattr_req, QUOTA_READ_ONLY_KEY, &is_read_only);
+ if (ret < 0)
+ gf_msg_debug(this->name, 0, "key = %s not present in dict",
+ QUOTA_READ_ONLY_KEY);
+
+ if (local->file_count && local->dir_count) {
+ gf_msg(this->name, GF_LOG_ERROR, 0, DHT_MSG_FILE_TYPE_MISMATCH,
+ "path %s exists as a file on one subvolume "
+ "and directory on another. "
+ "Please fix it manually",
+ local->loc.path);
+ op_errno = EIO;
+ goto out;
+ }
- LOCK(&discover_frame->lock);
- {
- main_frame = local->main_frame;
- local->main_frame = NULL;
+ if (local->cached_subvol) {
+ ret = dht_layout_preset(this, local->cached_subvol, local->inode);
+ if (ret < 0) {
+ gf_msg(this->name, GF_LOG_WARNING, 0, DHT_MSG_LAYOUT_SET_FAILED,
+ "failed to set layout for subvolume %s",
+ local->cached_subvol ? local->cached_subvol->name : "<nil>");
+ op_errno = EINVAL;
+ goto out;
+ }
+ } else {
+ ret = dht_layout_normalize(this, &local->loc, layout);
+ if ((ret < 0) || ((ret > 0) && (local->op_ret != 0))) {
+ /* either the layout is incorrect or the directory is
+ * not found even in one subvolume.
+ */
+ gf_msg_debug(this->name, 0,
+ "normalizing failed on %s "
+ "(overlaps/holes present: %s, "
+ "ENOENT errors: %d)",
+ local->loc.path, (ret < 0) ? "yes" : "no",
+ (ret > 0) ? ret : 0);
+ layout_anomalies = 1;
+ } else if (local->inode) {
+ dht_layout_set(this, local->inode, layout);
+ }
+ }
+
+ if (!conf->vch_forced) {
+ ret = dict_get_uint32(local->xattr, conf->commithash_xattr_name,
+ &vol_commit_hash);
+ if (ret == 0) {
+ conf->vol_commit_hash = vol_commit_hash;
}
- UNLOCK(&discover_frame->lock);
+ }
- if (!main_frame)
- return 0;
+ if (IA_ISDIR(local->stbuf.ia_type) && !is_read_only) {
+ for (i = 0; i < layout->cnt; i++) {
+ if (!source && !layout->list[i].err)
+ source = layout->list[i].xlator;
+ if (layout->list[i].err == ENOENT ||
+ layout->list[i].err == ESTALE) {
+ heal_path = 1;
+ }
- if (local->file_count && local->dir_count) {
- gf_msg (this->name, GF_LOG_ERROR, 0,
- DHT_MSG_FILE_TYPE_MISMATCH,
- "path %s exists as a file on one subvolume "
- "and directory on another. "
- "Please fix it manually",
- local->loc.path);
- op_errno = EIO;
- goto out;
+ if (source && heal_path)
+ break;
}
+ }
- if (local->cached_subvol) {
- ret = dht_layout_preset (this, local->cached_subvol,
- local->inode);
- if (ret < 0) {
- gf_msg (this->name, GF_LOG_WARNING, 0,
- DHT_MSG_LAYOUT_SET_FAILED,
- "failed to set layout for subvolume %s",
- local->cached_subvol ? local->cached_subvol->name : "<nil>");
- op_errno = EINVAL;
- goto out;
- }
- } else {
- ret = dht_layout_normalize (this, &local->loc, layout);
- if ((ret < 0) || ((ret > 0) && (local->op_ret != 0))) {
- /* either the layout is incorrect or the directory is
- * not found even in one subvolume.
- */
- gf_msg_debug (this->name, 0,
- "normalizing failed on %s "
- "(overlaps/holes present: %s, "
- "ENOENT errors: %d)", local->loc.path,
- (ret < 0) ? "yes" : "no", (ret > 0) ? ret : 0);
- if ((ret > 0) && (ret == conf->subvolume_cnt)) {
- op_errno = ESTALE;
- goto out;
- }
+ if (IA_ISDIR(local->stbuf.ia_type)) {
+ /* Call function to save hashed subvol on inode ctx if
+ internal mds xattr is not present and all subvols are up
+ */
+ if (!local->op_ret && !__is_root_gfid(local->stbuf.ia_gfid))
+ (void)dht_common_mark_mdsxattr(discover_frame,
+ &error_while_marking_mds, 1);
+
+ if (local->need_xattr_heal && !heal_path) {
+ local->need_xattr_heal = 0;
+ ret = dht_dir_xattr_heal(this, local, &op_errno);
+ if (ret) {
+ gf_msg(this->name, GF_LOG_ERROR, op_errno,
+ DHT_MSG_DIR_XATTR_HEAL_FAILED,
+ "xattr heal failed for "
+ "directory gfid is %s ",
+ gfid_local);
+ }
+ }
+ }
- /* For fixing the directory layout, we need to choose
- * the subvolume on which layout will be set first.
- * Because in nameless lookup, we have gfid only,
- * we are dependent on gfid. Therefore if conf->
- * randomize_by_gfid is set, then only we proceed for
- * healing layout of directory otherwise we don't heal.
- */
+ if (source && (heal_path || layout_anomalies || error_while_marking_mds)) {
+ gf_uuid_copy(loc.gfid, local->gfid);
+ if (gf_uuid_is_null(loc.gfid)) {
+ goto done;
+ }
- if (local->inode && conf->randomize_by_gfid)
- goto selfheal;
- }
+ if (local->inode)
+ loc.inode = inode_ref(local->inode);
+ else
+ goto done;
+
+ heal_frame = create_frame(this, this->ctx->pool);
+ if (heal_frame) {
+ heal_local = dht_local_init(heal_frame, &loc, NULL, 0);
+ if (!heal_local)
+ goto cleanup;
+
+ gf_uuid_copy(heal_local->gfid, local->gfid);
+ heal_frame->cookie = source;
+ heal_local->xattr = dict_ref(local->xattr);
+ heal_local->stbuf = local->stbuf;
+ heal_local->postparent = local->postparent;
+ heal_local->inode = inode_ref(loc.inode);
+ heal_local->main_frame = main_frame;
+ FRAME_SU_DO(heal_frame, dht_local_t);
+ ret = synctask_new(this->ctx->env, dht_heal_full_path,
+ dht_heal_full_path_done, heal_frame, heal_frame);
+ if (!ret) {
+ loc_wipe(&loc);
+ return 0;
+ }
+ /*
+ * Failed to spawn the synctask. Returning
+ * with out doing heal.
+ */
+ cleanup:
+ loc_wipe(&loc);
+ DHT_STACK_DESTROY(heal_frame);
+ }
+ }
+done:
+ dht_set_fixed_dir_stat(&local->postparent);
+ /* Delete mds xattr at the time of STACK UNWIND */
+ if (local->xattr)
+ GF_REMOVE_INTERNAL_XATTR(conf->mds_xattr_key, local->xattr);
- if (local->inode)
- dht_layout_set (this, local->inode, layout);
- }
+ DHT_STACK_UNWIND(lookup, main_frame, local->op_ret, local->op_errno,
+ local->inode, &local->stbuf, local->xattr,
+ &local->postparent);
+ return 0;
- if (!conf->vch_forced) {
- ret = dict_get_uint32 (local->xattr,
- conf->commithash_xattr_name,
- &vol_commit_hash);
- if (ret == 0) {
- conf->vol_commit_hash = vol_commit_hash;
- }
- }
+out:
+ DHT_STACK_UNWIND(lookup, main_frame, -1, op_errno, NULL, NULL, NULL, NULL);
- if (IA_ISDIR (local->stbuf.ia_type)) {
- for (i = 0; i < layout->cnt; i++) {
- if (!source && !layout->list[i].err)
- source = layout->list[i].xlator;
- if (layout->list[i].err == ENOENT ||
- layout->list[i].err == ESTALE) {
- heal_path = 1;
- }
- if (source && heal_path)
- break;
- }
- }
- if (source && heal_path) {
- gf_uuid_copy (loc.gfid, local->gfid);
- if (gf_uuid_is_null (loc.gfid)) {
- goto done;
- }
+ return ret;
+}
- if (local->inode)
- loc.inode = inode_ref (local->inode);
- else
- goto done;
-
- heal_frame = create_frame (this, this->ctx->pool);
- if (heal_frame) {
- heal_local = dht_local_init (heal_frame, &loc,
- NULL, 0);
- if (!heal_local)
- goto cleanup;
-
- gf_uuid_copy (heal_local->gfid, local->gfid);
- heal_frame->cookie = source;
- heal_local->xattr = dict_ref (local->xattr);
- heal_local->stbuf = local->stbuf;
- heal_local->postparent = local->postparent;
- heal_local->inode = inode_ref (loc.inode);
- heal_local->main_frame = main_frame;
- FRAME_SU_DO (heal_frame, dht_local_t);
- ret = synctask_new (this->ctx->env,
- dht_heal_full_path,
- dht_heal_full_path_done,
- heal_frame, heal_frame);
- if (!ret) {
- loc_wipe (&loc);
- return 0;
- }
- /*
- * Failed to spawn the synctask. Returning
- * with out doing heal.
- */
-cleanup:
- loc_wipe (&loc);
- DHT_STACK_DESTROY (heal_frame);
- }
+static int
+dht_common_mark_mdsxattr_cbk(call_frame_t *frame, void *cookie, xlator_t *this,
+ int op_ret, int op_errno, dict_t *xdata)
+{
+ dht_local_t *local = NULL;
+ xlator_t *prev = cookie;
+ int ret = -1;
+ dht_conf_t *conf = 0;
+ dht_layout_t *layout = NULL;
+ int32_t mds_heal_fresh_lookup = 0;
+
+ GF_VALIDATE_OR_GOTO(this->name, frame, out);
+ GF_VALIDATE_OR_GOTO(this->name, frame->local, out);
+
+ local = frame->local;
+ conf = this->private;
+ layout = local->selfheal.layout;
+ mds_heal_fresh_lookup = local->mds_heal_fresh_lookup;
+
+ if (op_ret) {
+ gf_msg_debug(this->name, op_ret,
+ "Failed to set %s on the MDS %s for path %s. ",
+ conf->mds_xattr_key, prev->name, local->loc.path);
+ } else {
+ /* Save mds subvol on inode ctx */
+ ret = dht_inode_ctx_mdsvol_set(local->inode, this, prev);
+ if (ret) {
+ gf_msg(this->name, GF_LOG_ERROR, 0, DHT_MSG_SET_INODE_CTX_FAILED,
+ "Failed to set mds subvol on inode ctx"
+ " %s for %s ",
+ prev->name, local->loc.path);
+ }
+ }
+ if (!local->mds_heal_fresh_lookup && layout) {
+ dht_selfheal_dir_setattr(frame, &local->loc, &local->stbuf, 0xffffffff,
+ layout);
+ }
+out:
+ if (mds_heal_fresh_lookup)
+ DHT_STACK_DESTROY(frame);
+ return 0;
+}
+static xlator_t *
+dht_inode_get_hashed_subvol(inode_t *inode, xlator_t *this, loc_t *loc)
+{
+ char *path = NULL;
+ loc_t populate_loc = {
+ 0,
+ };
+ char *name = NULL;
+ xlator_t *hash_subvol = NULL;
+
+ if (!inode)
+ return hash_subvol;
+
+ if (loc && loc->parent && loc->path) {
+ if (!loc->name) {
+ name = strrchr(loc->path, '/');
+ if (name) {
+ loc->name = name + 1;
+ } else {
+ goto out;
+ }
}
-done:
- DHT_STACK_UNWIND (lookup, main_frame, local->op_ret, local->op_errno,
- local->inode, &local->stbuf, local->xattr,
- &local->postparent);
- return 0;
-out:
- DHT_STACK_UNWIND (lookup, main_frame, -1, op_errno, NULL, NULL, NULL,
- NULL);
+ hash_subvol = dht_subvol_get_hashed(this, loc);
+ goto out;
+ }
- return ret;
+ if (!gf_uuid_is_null(inode->gfid)) {
+ populate_loc.inode = inode_ref(inode);
+ populate_loc.parent = inode_parent(populate_loc.inode, NULL, NULL);
+ inode_path(populate_loc.inode, NULL, &path);
-selfheal:
+ if (!path)
+ goto out;
- main_frame->local = local;
- discover_frame->local = NULL;
- FRAME_SU_DO (main_frame, dht_local_t);
- gf_uuid_copy (local->loc.gfid, local->gfid);
- ret = dht_selfheal_directory_for_nameless_lookup (main_frame,
- dht_lookup_selfheal_cbk,
- &local->loc, layout);
- return ret;
+ populate_loc.path = path;
+ if (!populate_loc.name && populate_loc.path) {
+ name = strrchr(populate_loc.path, '/');
+ if (name) {
+ populate_loc.name = name + 1;
+ } else {
+ goto out;
+ }
+ }
+ hash_subvol = dht_subvol_get_hashed(this, &populate_loc);
+ }
+out:
+ if (populate_loc.inode)
+ loc_wipe(&populate_loc);
+ return hash_subvol;
}
+/* Common function call by revalidate/selfheal code path to populate
+ internal xattr if it is not present, mark_during_fresh_lookup value
+ determines either function is call by revalidate_cbk(discover_complete)
+ or call by selfheal code path while fresh lookup.
+ Here we do wind a call serially in case of fresh lookup and
+ for other lookup code path we do wind a call parallel.The reason
+ to wind a call serially is at the time of fresh lookup directory is not
+ discovered and at the time of revalidate_lookup directory is
+ already discovered. So, revalidate codepath can race with setxattr
+ codepath and can get into spurious heals because of an ongoing setxattr.
+ This can slow down revalidates, if healing happens in foreground.
+ However, if healing happens in background, there is no direct performance
+ penalty.
+*/
int
-dht_discover_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
- int op_ret, int op_errno,
- inode_t *inode, struct iatt *stbuf, dict_t *xattr,
- struct iatt *postparent)
-{
- dht_local_t *local = NULL;
- int this_call_cnt = 0;
- call_frame_t *prev = NULL;
- dht_layout_t *layout = NULL;
- int ret = -1;
- int is_dir = 0;
- int is_linkfile = 0;
- int attempt_unwind = 0;
- dht_conf_t *conf = 0;
- char gfid_local[GF_UUID_BUF_SIZE] = {0};
- char gfid_node[GF_UUID_BUF_SIZE] = {0};
-
- GF_VALIDATE_OR_GOTO ("dht", frame, out);
- GF_VALIDATE_OR_GOTO ("dht", this, out);
- GF_VALIDATE_OR_GOTO ("dht", frame->local, out);
- GF_VALIDATE_OR_GOTO ("dht", this->private, out);
- GF_VALIDATE_OR_GOTO ("dht", cookie, out);
-
- local = frame->local;
- prev = cookie;
- conf = this->private;
+dht_common_mark_mdsxattr(call_frame_t *frame, int *errst,
+ int mark_during_fresh_lookup)
+{
+ dht_local_t *local = NULL;
+ xlator_t *this = NULL;
+ xlator_t *hashed_subvol = NULL;
+ int ret = 0;
+ int i = 0;
+ dict_t *xattrs = NULL;
+ char gfid_local[GF_UUID_BUF_SIZE] = {
+ 0,
+ };
+ int32_t zero[1] = {0};
+ dht_conf_t *conf = 0;
+ dht_layout_t *layout = NULL;
+ dht_local_t *copy_local = NULL;
+ call_frame_t *xattr_frame = NULL;
+ gf_boolean_t vol_down = _gf_false;
+
+ GF_VALIDATE_OR_GOTO("dht", frame, out);
+ this = frame->this;
+ GF_VALIDATE_OR_GOTO("dht", this, out);
+ GF_VALIDATE_OR_GOTO(this->name, frame->local, out);
+ GF_VALIDATE_OR_GOTO(this->name, this->private, out);
+
+ local = frame->local;
+ conf = this->private;
+ layout = local->selfheal.layout;
+ local->mds_heal_fresh_lookup = mark_during_fresh_lookup;
+
+ gf_uuid_unparse(local->gfid, gfid_local);
+
+ /* Code to update hashed subvol consider as a mds subvol
+ and wind a setxattr call on hashed subvol to update
+ internal xattr
+ */
+ if (!local->xattr || !dict_get(local->xattr, conf->mds_xattr_key)) {
+ /* It means no internal MDS xattr has been set yet
+ */
+ /* Check the status of all subvol are up while call
+ this function call by lookup code path
+ */
+ if (mark_during_fresh_lookup) {
+ for (i = 0; i < conf->subvolume_cnt; i++) {
+ if (!conf->subvolume_status[i]) {
+ vol_down = _gf_true;
+ break;
+ }
+ }
+ if (vol_down) {
+ gf_msg_debug(this->name, 0,
+ "subvol %s is down. Unable to "
+ " save mds subvol on inode for "
+ " path %s gfid is %s ",
+ conf->subvolumes[i]->name, local->loc.path,
+ gfid_local);
+ goto out;
+ }
+ }
- layout = local->layout;
+ /* Calculate hashed subvol based on inode and parent node
+ */
+ hashed_subvol = dht_inode_get_hashed_subvol(local->inode, this,
+ &local->loc);
+ if (!hashed_subvol) {
+ gf_msg(this->name, GF_LOG_DEBUG, 0,
+ DHT_MSG_HASHED_SUBVOL_GET_FAILED,
+ "Failed to get hashed subvol for path %s"
+ "gfid is %s ",
+ local->loc.path, gfid_local);
+ if (errst)
+ (*errst) = 1;
+ ret = -1;
+ goto out;
+ }
+ xattrs = dict_new();
+ if (!xattrs) {
+ gf_msg(this->name, GF_LOG_ERROR, ENOMEM, DHT_MSG_NO_MEMORY,
+ "dict_new failed");
+ ret = -1;
+ goto out;
+ }
+ /* Add internal MDS xattr on disk for hashed subvol
+ */
+ ret = dht_dict_set_array(xattrs, conf->mds_xattr_key, zero, 1);
+ if (ret) {
+ gf_msg(this->name, GF_LOG_WARNING, ENOMEM, DHT_MSG_DICT_SET_FAILED,
+ "Failed to set dictionary"
+ " value:key = %s for "
+ "path %s",
+ conf->mds_xattr_key, local->loc.path);
+ ret = -1;
+ goto out;
+ }
+ /* Create a new frame to wind a call only while
+ this function call by revalidate_cbk code path
+ To wind a call parallel need to create a new frame
+ */
+ if (mark_during_fresh_lookup) {
+ xattr_frame = create_frame(this, this->ctx->pool);
+ if (!xattr_frame) {
+ ret = -1;
+ goto out;
+ }
+ copy_local = dht_local_init(xattr_frame, &(local->loc), NULL, 0);
+ if (!copy_local) {
+ ret = -1;
+ DHT_STACK_DESTROY(xattr_frame);
+ goto out;
+ }
+ copy_local->stbuf = local->stbuf;
+ copy_local->mds_heal_fresh_lookup = mark_during_fresh_lookup;
+ if (!copy_local->inode)
+ copy_local->inode = inode_ref(local->inode);
+ gf_uuid_copy(copy_local->loc.gfid, local->gfid);
+ FRAME_SU_DO(xattr_frame, dht_local_t);
+ STACK_WIND_COOKIE(xattr_frame, dht_common_mark_mdsxattr_cbk,
+ hashed_subvol, hashed_subvol,
+ hashed_subvol->fops->setxattr, &local->loc,
+ xattrs, 0, NULL);
+ } else {
+ STACK_WIND_COOKIE(frame, dht_common_mark_mdsxattr_cbk,
+ (void *)hashed_subvol, hashed_subvol,
+ hashed_subvol->fops->setxattr, &local->loc,
+ xattrs, 0, NULL);
+ }
+ } else {
+ gf_msg_debug(this->name, 0,
+ "internal xattr %s is present on subvol"
+ "on path %s gfid is %s ",
+ conf->mds_xattr_key, local->loc.path, gfid_local);
+ if (!mark_during_fresh_lookup)
+ dht_selfheal_dir_setattr(frame, &local->loc, &local->stbuf,
+ 0xffffffff, layout);
+ }
+out:
+ if (xattrs)
+ dict_unref(xattrs);
+ return ret;
+}
- /* Check if the gfid is different for file from other node */
- if (!op_ret && gf_uuid_compare (local->gfid, stbuf->ia_gfid)) {
+/* Get the value of key from dict in the bytewise and save in array after
+ convert from network byte order to host byte order
+*/
+static int32_t
+dht_dict_get_array(dict_t *dict, char *key, int32_t value[], int32_t size,
+ int *errst)
+{
+ void *ptr = NULL;
+ int32_t len = -1;
+ int32_t vindex = -1;
+ int32_t err = -1;
+ int ret = 0;
+
+ if (dict == NULL) {
+ (*errst) = -1;
+ return -EINVAL;
+ }
+ err = dict_get_ptr_and_len(dict, key, &ptr, &len);
+ if (err != 0) {
+ (*errst) = -1;
+ return err;
+ }
+
+ if (len != (size * sizeof(int32_t))) {
+ (*errst) = -1;
+ return -EINVAL;
+ }
+
+ for (vindex = 0; vindex < size; vindex++) {
+ value[vindex] = ntoh32(*((int32_t *)ptr + vindex));
+ if (value[vindex] < 0)
+ ret = -1;
+ }
+
+ return ret;
+}
- gf_uuid_unparse(stbuf->ia_gfid, gfid_node);
- gf_uuid_unparse(local->gfid, gfid_local);
+static int
+dht_discover_cbk(call_frame_t *frame, void *cookie, xlator_t *this, int op_ret,
+ int op_errno, inode_t *inode, struct iatt *stbuf,
+ dict_t *xattr, struct iatt *postparent)
+{
+ dht_local_t *local = NULL;
+ int this_call_cnt = 0;
+ xlator_t *prev = NULL;
+ dht_layout_t *layout = NULL;
+ int ret = -1;
+ int is_dir = 0;
+ int32_t check_mds = 0;
+ int is_linkfile = 0;
+ int attempt_unwind = 0;
+ dht_conf_t *conf = 0;
+ char gfid_local[GF_UUID_BUF_SIZE] = {0};
+ char gfid_node[GF_UUID_BUF_SIZE] = {0};
+ int32_t mds_xattr_val[1] = {0};
+ int errst = 0;
+
+ GF_VALIDATE_OR_GOTO("dht", frame, out);
+ GF_VALIDATE_OR_GOTO("dht", this, out);
+ GF_VALIDATE_OR_GOTO("dht", frame->local, out);
+ GF_VALIDATE_OR_GOTO("dht", this->private, out);
+ GF_VALIDATE_OR_GOTO("dht", cookie, out);
+
+ local = frame->local;
+ prev = cookie;
+ conf = this->private;
+
+ layout = local->layout;
+
+ /* Check if the gfid is different for file from other node */
+ if (!op_ret && gf_uuid_compare(local->gfid, stbuf->ia_gfid)) {
+ gf_uuid_unparse(stbuf->ia_gfid, gfid_node);
+ gf_uuid_unparse(local->gfid, gfid_local);
+
+ gf_msg(this->name, GF_LOG_WARNING, 0, DHT_MSG_GFID_MISMATCH,
+ "%s: gfid different on %s, gfid local = %s"
+ "gfid other = %s",
+ local->loc.path, prev->name, gfid_local, gfid_node);
+ }
+
+ LOCK(&frame->lock);
+ {
+ /* TODO: assert equal mode on stbuf->st_mode and
+ local->stbuf->st_mode
+
+ else mkdir/chmod/chown and fix
+ */
- gf_msg (this->name, GF_LOG_WARNING, 0,
- DHT_MSG_GFID_MISMATCH,
- "%s: gfid different on %s, gfid local = %s"
- "gfid other = %s",
- local->loc.path, prev->this->name,
- gfid_local, gfid_node);
- }
+ ret = dht_layout_merge(this, layout, prev, op_ret, op_errno, xattr);
+ if (ret)
+ gf_msg(this->name, GF_LOG_WARNING, 0, DHT_MSG_LAYOUT_MERGE_FAILED,
+ "%s: failed to merge layouts for subvol %s", local->loc.path,
+ prev->name);
+ if (op_ret == -1) {
+ local->op_errno = op_errno;
+ gf_msg_debug(this->name, op_errno,
+ "lookup of %s on %s returned error", local->loc.path,
+ prev->name);
- LOCK (&frame->lock);
- {
- /* TODO: assert equal mode on stbuf->st_mode and
- local->stbuf->st_mode
+ goto unlock;
+ }
- else mkdir/chmod/chown and fix
- */
- ret = dht_layout_merge (this, layout, prev->this,
- op_ret, op_errno, xattr);
- if (ret)
- gf_msg (this->name, GF_LOG_WARNING, 0,
- DHT_MSG_LAYOUT_MERGE_FAILED,
- "%s: failed to merge layouts for subvol %s",
- local->loc.path, prev->this->name);
-
- if (op_ret == -1) {
- local->op_errno = op_errno;
- gf_msg_debug (this->name, op_errno,
- "lookup of %s on %s returned error",
- local->loc.path, prev->this->name);
-
- goto unlock;
- }
+ is_linkfile = check_is_linkfile(inode, stbuf, xattr,
+ conf->link_xattr_name);
+ is_dir = check_is_dir(inode, stbuf, xattr);
- is_linkfile = check_is_linkfile (inode, stbuf, xattr,
- conf->link_xattr_name);
- is_dir = check_is_dir (inode, stbuf, xattr);
+ if (is_dir) {
+ local->dir_count++;
+ } else {
+ local->file_count++;
+
+ if (!is_linkfile && !local->cached_subvol) {
+ /* real file */
+ /* Ok, we somehow managed to find a file on
+ * more than one subvol. ignore this or we
+ * will end up overwriting information while a
+ * a thread is potentially unwinding from
+ * dht_discover_complete
+ */
+ local->cached_subvol = prev;
+ attempt_unwind = 1;
+ } else {
+ goto unlock;
+ }
+ }
- if (is_dir) {
- local->dir_count ++;
- } else {
- local->file_count ++;
-
- if (!is_linkfile) {
- /* real file */
- local->cached_subvol = prev->this;
- attempt_unwind = 1;
- } else {
- goto unlock;
- }
- }
+ local->op_ret = 0;
- local->op_ret = 0;
+ if (local->xattr == NULL) {
+ local->xattr = dict_ref(xattr);
+ } else {
+ /* Don't aggregate for files. See BZ#1484709 */
+ if (is_dir)
+ dht_aggregate_xattr(local->xattr, xattr);
+ }
- if (local->xattr == NULL) {
- local->xattr = dict_ref (xattr);
- } else {
- dht_aggregate_xattr (local->xattr, xattr);
- }
+ if (local->inode == NULL)
+ local->inode = inode_ref(inode);
- if (local->inode == NULL)
- local->inode = inode_ref (inode);
+ dht_iatt_merge(this, &local->stbuf, stbuf);
+ dht_iatt_merge(this, &local->postparent, postparent);
- dht_iatt_merge (this, &local->stbuf, stbuf, prev->this);
- dht_iatt_merge (this, &local->postparent, postparent,
- prev->this);
- }
+ if (!dict_get(xattr, conf->mds_xattr_key)) {
+ goto unlock;
+ } else {
+ gf_msg_debug(this->name, 0,
+ "internal xattr %s is present on subvol"
+ "on path %s gfid is %s ",
+ conf->mds_xattr_key, local->loc.path, gfid_local);
+ }
+ check_mds = dht_dict_get_array(xattr, conf->mds_xattr_key,
+ mds_xattr_val, 1, &errst);
+ /* save mds subvol on inode ctx */
+ ret = dht_inode_ctx_mdsvol_set(local->inode, this, prev);
+ if (ret) {
+ gf_msg(this->name, GF_LOG_ERROR, 0, DHT_MSG_SET_INODE_CTX_FAILED,
+ "Failed to set hashed subvol for %s vol is %s",
+ local->loc.path, prev->name);
+ }
+
+ if ((check_mds < 0) && !errst) {
+ local->mds_xattr = dict_ref(xattr);
+ gf_msg_debug(this->name, 0,
+ "Value of %s is not zero on mds subvol"
+ "so xattr needs to be healed on non mds"
+ " path is %s and vol name is %s "
+ " gfid is %s",
+ conf->mds_xattr_key, local->loc.path, prev->name,
+ gfid_local);
+ local->need_xattr_heal = 1;
+ local->mds_subvol = prev;
+ }
+ }
unlock:
- UNLOCK (&frame->lock);
+ UNLOCK(&frame->lock);
out:
- /* Make sure, the thread executing dht_discover_complete is the one
- * which calls STACK_DESTROY (frame). In the case of "attempt_unwind",
- * this makes sure that the thread don't call dht_frame_return, till
- * call to dht_discover_complete is done.
- */
- if (attempt_unwind) {
- dht_discover_complete (this, frame);
- }
+ /* Make sure, the thread executing dht_discover_complete is the one
+ * which calls STACK_DESTROY (frame). In the case of "attempt_unwind",
+ * this makes sure that the thread don't call dht_frame_return, till
+ * call to dht_discover_complete is done.
+ */
+ if (attempt_unwind) {
+ dht_discover_complete(this, frame);
+ }
- this_call_cnt = dht_frame_return (frame);
+ this_call_cnt = dht_frame_return(frame);
- if (is_last_call (this_call_cnt) && !attempt_unwind) {
- dht_discover_complete (this, frame);
- }
+ if (is_last_call(this_call_cnt) && !attempt_unwind) {
+ dht_discover_complete(this, frame);
+ }
- if (is_last_call (this_call_cnt))
- DHT_STACK_DESTROY (frame);
+ if (is_last_call(this_call_cnt))
+ DHT_STACK_DESTROY(frame);
- return 0;
+ return 0;
}
+static int
+dht_set_file_xattr_req(xlator_t *this, loc_t *loc, dict_t *xattr_req)
+{
+ int ret = -EINVAL;
+ dht_conf_t *conf = NULL;
+
+ conf = this->private;
+ if (!conf) {
+ goto err;
+ }
+
+ if (!xattr_req) {
+ goto err;
+ }
+
+ /* Used to check whether this is a linkto file.
+ */
+ ret = dict_set_uint32(xattr_req, conf->link_xattr_name, 256);
+ if (ret < 0) {
+ gf_msg(this->name, GF_LOG_WARNING, ENOMEM, DHT_MSG_DICT_SET_FAILED,
+ "Failed to set dictionary value:key = %s for "
+ "path %s",
+ conf->link_xattr_name, loc->path);
+ goto err;
+ }
+
+ /* This is used to make sure we don't unlink linkto files
+ * which are the target of an ongoing file migration.
+ */
+ ret = dict_set_uint32(xattr_req, GLUSTERFS_OPEN_FD_COUNT, 4);
+ if (ret) {
+ gf_msg(this->name, GF_LOG_WARNING, ENOMEM, DHT_MSG_DICT_SET_FAILED,
+ "Failed to set dictionary value:key = %s for "
+ "path %s",
+ GLUSTERFS_OPEN_FD_COUNT, loc->path);
+ goto err;
+ }
+
+ ret = 0;
+err:
+ return ret;
+}
-int
-dht_discover (call_frame_t *frame, xlator_t *this, loc_t *loc)
+/* This is a gfid based nameless lookup. Without a name, the hashed subvol
+ * cannot be calculated so a lookup is sent to all subvols.
+ */
+static int
+dht_do_discover(call_frame_t *frame, xlator_t *this, loc_t *loc)
{
- int ret;
- dht_local_t *local = NULL;
- dht_conf_t *conf = NULL;
- int call_cnt = 0;
- int op_errno = EINVAL;
- int i = 0;
- call_frame_t *discover_frame = NULL;
+ int ret;
+ dht_local_t *local = NULL;
+ dht_conf_t *conf = NULL;
+ int call_cnt = 0;
+ int op_errno = EINVAL;
+ int i = 0;
+ call_frame_t *discover_frame = NULL;
+
+ conf = this->private;
+ local = frame->local;
+
+ /* As we do not know if this is a file or directory, request
+ * both file and directory xattrs
+ */
+ ret = dht_set_file_xattr_req(this, loc, local->xattr_req);
+ if (ret) {
+ goto err;
+ }
+
+ ret = dht_set_dir_xattr_req(this, loc, local->xattr_req);
+ if (ret) {
+ goto err;
+ }
+
+ if (loc_is_root(loc)) {
+ /* Request the DHT commit hash xattr (trusted.glusterfs.dht.commithash)
+ * set on the brick root.
+ */
+ ret = dict_set_uint32(local->xattr_req, conf->commithash_xattr_name,
+ sizeof(uint32_t));
+ }
- conf = this->private;
- local = frame->local;
+ call_cnt = conf->subvolume_cnt;
+ local->call_cnt = call_cnt;
- ret = dict_set_uint32 (local->xattr_req, conf->xattr_name, 4 * 4);
- if (ret)
- gf_msg (this->name, GF_LOG_WARNING, 0,
- DHT_MSG_DICT_SET_FAILED,
- "%s: Failed to set dictionary value:key = %s",
- loc->path, conf->xattr_name);
+ local->layout = dht_layout_new(this, conf->subvolume_cnt);
- ret = dict_set_uint32 (local->xattr_req, conf->link_xattr_name, 256);
- if (ret)
- gf_msg (this->name, GF_LOG_WARNING, 0,
- DHT_MSG_DICT_SET_FAILED,
- "%s: Failed to set dictionary value:key = %s",
- loc->path, conf->link_xattr_name);
+ if (!local->layout) {
+ op_errno = ENOMEM;
+ goto err;
+ }
- if (__is_root_gfid(local->loc.gfid)) {
- ret = dict_set_uint32 (local->xattr_req,
- conf->commithash_xattr_name,
- sizeof(uint32_t));
- }
+ gf_uuid_copy(local->gfid, loc->gfid);
- call_cnt = conf->subvolume_cnt;
- local->call_cnt = call_cnt;
+ discover_frame = copy_frame(frame);
+ if (!discover_frame) {
+ op_errno = ENOMEM;
+ goto err;
+ }
- local->layout = dht_layout_new (this, conf->subvolume_cnt);
+ discover_frame->local = local;
+ frame->local = NULL;
+ local->main_frame = frame;
- if (!local->layout) {
- op_errno = ENOMEM;
- goto err;
- }
+ for (i = 0; i < call_cnt; i++) {
+ STACK_WIND_COOKIE(discover_frame, dht_discover_cbk, conf->subvolumes[i],
+ conf->subvolumes[i],
+ conf->subvolumes[i]->fops->lookup, &local->loc,
+ local->xattr_req);
+ }
- gf_uuid_copy (local->gfid, loc->gfid);
+ return 0;
- discover_frame = copy_frame (frame);
- if (!discover_frame) {
- op_errno = ENOMEM;
- goto err;
- }
+err:
+ DHT_STACK_UNWIND(lookup, frame, -1, op_errno, NULL, NULL, NULL, NULL);
- discover_frame->local = local;
- frame->local = NULL;
- local->main_frame = frame;
+ return 0;
+}
- for (i = 0; i < call_cnt; i++) {
- STACK_WIND (discover_frame, dht_discover_cbk,
- conf->subvolumes[i],
- conf->subvolumes[i]->fops->lookup,
- &local->loc, local->xattr_req);
+/* Code to call syntask to heal custom xattr from hashed subvol
+ to non hashed subvol
+*/
+int
+dht_dir_xattr_heal(xlator_t *this, dht_local_t *local, int *op_errno)
+{
+ dht_local_t *copy_local = NULL;
+ call_frame_t *copy = NULL;
+ int ret = -1;
+ char gfid_local[GF_UUID_BUF_SIZE] = {0};
+
+ if (gf_uuid_is_null(local->gfid)) {
+ gf_msg(this->name, GF_LOG_ERROR, 0, DHT_MSG_DIR_XATTR_HEAL_FAILED,
+ "No gfid exists for path %s "
+ "so healing xattr is not possible",
+ local->loc.path);
+ *op_errno = EIO;
+ goto out;
+ }
+
+ gf_uuid_unparse(local->gfid, gfid_local);
+ copy = create_frame(this, this->ctx->pool);
+ if (copy) {
+ copy_local = dht_local_init(copy, &(local->loc), NULL, 0);
+ if (!copy_local) {
+ gf_msg(this->name, GF_LOG_ERROR, ENOMEM,
+ DHT_MSG_DIR_XATTR_HEAL_FAILED,
+ "Memory allocation failed "
+ "for path %s gfid %s ",
+ local->loc.path, gfid_local);
+ *op_errno = ENOMEM;
+ DHT_STACK_DESTROY(copy);
+ } else {
+ copy_local->stbuf = local->stbuf;
+ gf_uuid_copy(copy_local->loc.gfid, local->gfid);
+ copy_local->mds_subvol = local->mds_subvol;
+ FRAME_SU_DO(copy, dht_local_t);
+ ret = synctask_new(this->ctx->env, dht_dir_heal_xattrs,
+ dht_dir_heal_xattrs_done, copy, copy);
+ if (ret) {
+ gf_msg(this->name, GF_LOG_ERROR, ENOMEM,
+ DHT_MSG_DIR_XATTR_HEAL_FAILED,
+ "Synctask creation failed to heal xattr "
+ "for path %s gfid %s ",
+ local->loc.path, gfid_local);
+ *op_errno = ENOMEM;
+ DHT_STACK_DESTROY(copy);
+ }
}
+ }
+out:
+ return ret;
+}
- return 0;
-
-err:
- DHT_STACK_UNWIND (lookup, frame, -1, op_errno, NULL, NULL, NULL,
- NULL);
-
- return 0;
+static int
+dht_needs_selfheal(call_frame_t *frame, xlator_t *this)
+{
+ dht_local_t *local = NULL;
+ dht_layout_t *layout = NULL;
+ int needs_selfheal = 0;
+ int ret = 0;
+
+ local = frame->local;
+ layout = local->layout;
+
+ if (local->need_attrheal || local->need_xattr_heal ||
+ local->need_selfheal) {
+ needs_selfheal = 1;
+ }
+
+ ret = dht_layout_normalize(this, &local->loc, layout);
+
+ if (ret != 0) {
+ gf_msg_debug(this->name, 0, "fixing assignment on %s", local->loc.path);
+ needs_selfheal = 1;
+ }
+ return needs_selfheal;
}
+static int
+is_permission_different(ia_prot_t *prot1, ia_prot_t *prot2)
+{
+ if ((prot1->owner.read != prot2->owner.read) ||
+ (prot1->owner.write != prot2->owner.write) ||
+ (prot1->owner.exec != prot2->owner.exec) ||
+ (prot1->group.read != prot2->group.read) ||
+ (prot1->group.write != prot2->group.write) ||
+ (prot1->group.exec != prot2->group.exec) ||
+ (prot1->other.read != prot2->other.read) ||
+ (prot1->other.write != prot2->other.write) ||
+ (prot1->other.exec != prot2->other.exec) ||
+ (prot1->suid != prot2->suid) || (prot1->sgid != prot2->sgid) ||
+ (prot1->sticky != prot2->sticky)) {
+ return 1;
+ } else {
+ return 0;
+ }
+}
int
-dht_lookup_dir_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
- int op_ret, int op_errno,
- inode_t *inode, struct iatt *stbuf, dict_t *xattr,
- struct iatt *postparent)
-{
- dht_local_t *local = NULL;
- int this_call_cnt = 0;
- call_frame_t *prev = NULL;
- dht_layout_t *layout = NULL;
- int ret = -1;
- int is_dir = 0;
- char gfid_local[GF_UUID_BUF_SIZE] = {0};
- char gfid_node[GF_UUID_BUF_SIZE] = {0};
-
- GF_VALIDATE_OR_GOTO ("dht", frame, out);
- GF_VALIDATE_OR_GOTO ("dht", this, out);
- GF_VALIDATE_OR_GOTO ("dht", frame->local, out);
- GF_VALIDATE_OR_GOTO ("dht", this->private, out);
- GF_VALIDATE_OR_GOTO ("dht", cookie, out);
+dht_lookup_dir_cbk(call_frame_t *frame, void *cookie, xlator_t *this,
+ int op_ret, int op_errno, inode_t *inode, struct iatt *stbuf,
+ dict_t *xattr, struct iatt *postparent)
+{
+ dht_local_t *local = NULL;
+ dht_conf_t *conf = NULL;
+ int this_call_cnt = 0;
+ xlator_t *prev = NULL;
+ dht_layout_t *layout = NULL;
+ int ret = -1;
+ int is_dir = 0;
+ int32_t check_mds = 0;
+ int errst = 0;
+ char gfid_local[GF_UUID_BUF_SIZE] = {0};
+ char gfid_node[GF_UUID_BUF_SIZE] = {0};
+ int32_t mds_xattr_val[1] = {0};
+
+ GF_VALIDATE_OR_GOTO("dht", frame, out);
+ GF_VALIDATE_OR_GOTO("dht", this, out);
+ GF_VALIDATE_OR_GOTO("dht", frame->local, out);
+ GF_VALIDATE_OR_GOTO("dht", this->private, out);
+ GF_VALIDATE_OR_GOTO("dht", cookie, out);
+
+ local = frame->local;
+ prev = cookie;
+ conf = this->private;
+
+ layout = local->layout;
+ gf_msg_debug(this->name, op_errno,
+ "%s: lookup on %s returned with op_ret = %d, op_errno = %d",
+ local->loc.path, prev->name, op_ret, op_errno);
+
+ /* The first successful lookup*/
+ if (!op_ret && gf_uuid_is_null(local->gfid)) {
+ memcpy(local->gfid, stbuf->ia_gfid, 16);
+ }
+ if (!gf_uuid_is_null(local->gfid)) {
+ gf_uuid_unparse(local->gfid, gfid_local);
+ }
+
+ /* Check if the gfid is different for file from other node */
+ if (!op_ret && gf_uuid_compare(local->gfid, stbuf->ia_gfid)) {
+ gf_uuid_unparse(stbuf->ia_gfid, gfid_node);
+
+ gf_msg(this->name, GF_LOG_WARNING, 0, DHT_MSG_GFID_MISMATCH,
+ "%s: gfid different on %s."
+ " gfid local = %s, gfid subvol = %s",
+ local->loc.path, prev->name, gfid_local, gfid_node);
+ }
+
+ LOCK(&frame->lock);
+ {
+ /* TODO: assert equal mode on stbuf->st_mode and
+ local->stbuf->st_mode
+ else mkdir/chmod/chown and fix
+ */
+ ret = dht_layout_merge(this, layout, prev, op_ret, op_errno, xattr);
- local = frame->local;
- prev = cookie;
+ if (op_ret == -1) {
+ local->op_errno = op_errno;
- layout = local->layout;
+ /* The GFID is missing on this subvol. Force a heal. */
+ if (op_errno == ENODATA) {
+ local->need_lookup_everywhere = 1;
+ }
+ goto unlock;
+ }
- if (!op_ret && gf_uuid_is_null (local->gfid))
- memcpy (local->gfid, stbuf->ia_gfid, 16);
+ is_dir = check_is_dir(inode, stbuf, xattr);
+ if (!is_dir) {
+ gf_msg_debug(this->name, 0,
+ "%s: lookup on %s returned non dir 0%o"
+ "calling lookup_everywhere",
+ local->loc.path, prev->name, stbuf->ia_type);
- memcpy (local->loc.gfid, local->gfid, 16);
+ local->need_lookup_everywhere = 1;
+ goto unlock;
+ }
- /* Check if the gfid is different for file from other node */
- if (!op_ret && gf_uuid_compare (local->gfid, stbuf->ia_gfid)) {
+ local->op_ret = 0;
+ if (local->xattr == NULL) {
+ local->xattr = dict_ref(xattr);
+ } else {
+ dht_aggregate_xattr(local->xattr, xattr);
+ }
- gf_uuid_unparse(stbuf->ia_gfid, gfid_node);
- gf_uuid_unparse(local->gfid, gfid_local);
+ if (__is_root_gfid(stbuf->ia_gfid)) {
+ ret = dht_dir_has_layout(xattr, conf->xattr_name);
+ if (ret >= 0) {
+ if (is_greater_time(local->prebuf.ia_ctime,
+ local->prebuf.ia_ctime_nsec,
+ stbuf->ia_ctime, stbuf->ia_ctime_nsec)) {
+ /* Choose source */
+ local->prebuf.ia_gid = stbuf->ia_gid;
+ local->prebuf.ia_uid = stbuf->ia_uid;
- gf_msg (this->name, GF_LOG_WARNING, 0,
- DHT_MSG_GFID_MISMATCH,
- "%s: gfid different on %s."
- " gfid local = %s, gfid subvol = %s",
- local->loc.path, prev->this->name,
- gfid_local, gfid_node);
+ local->prebuf.ia_ctime = stbuf->ia_ctime;
+ local->prebuf.ia_ctime_nsec = stbuf->ia_ctime_nsec;
+ local->prebuf.ia_prot = stbuf->ia_prot;
+ }
+ }
}
- LOCK (&frame->lock);
- {
- /* TODO: assert equal mode on stbuf->st_mode and
- local->stbuf->st_mode
+ if (local->stbuf.ia_type != IA_INVAL) {
+ /* This is not the first subvol to respond
+ * Compare values to see if attrs need to be healed
+ */
+ if ((local->stbuf.ia_gid != stbuf->ia_gid) ||
+ (local->stbuf.ia_uid != stbuf->ia_uid) ||
+ (is_permission_different(&local->stbuf.ia_prot,
+ &stbuf->ia_prot))) {
+ local->need_attrheal = 1;
+ }
+ }
- else mkdir/chmod/chown and fix
- */
- ret = dht_layout_merge (this, layout, prev->this,
- op_ret, op_errno, xattr);
+ if (local->inode == NULL)
+ local->inode = inode_ref(inode);
- if (op_ret == -1) {
- local->op_errno = op_errno;
- gf_msg_debug (this->name, op_errno,
- "lookup of %s on %s returned error",
- local->loc.path, prev->this->name);
+ dht_iatt_merge(this, &local->stbuf, stbuf);
+ dht_iatt_merge(this, &local->postparent, postparent);
- goto unlock;
- }
+ if (!dict_get(xattr, conf->mds_xattr_key)) {
+ gf_msg_debug(this->name, 0,
+ "%s: mds xattr %s is not present "
+ "on %s(gfid = %s)",
+ local->loc.path, conf->mds_xattr_key, prev->name,
+ gfid_local);
+ goto unlock;
+ }
- is_dir = check_is_dir (inode, stbuf, xattr);
- if (!is_dir) {
+ /* Save the mds subvol info and stbuf. This is the value that will
+ * be used for healing
+ */
+ local->mds_subvol = prev;
+ local->mds_stbuf = *stbuf;
- gf_msg_debug (this->name, 0,
- "lookup of %s on %s returned non"
- "dir 0%o"
- "calling lookup_everywhere",
- local->loc.path, prev->this->name,
- stbuf->ia_type);
+ /* Save mds subvol on inode ctx */
- local->need_selfheal = 1;
- goto unlock;
- }
+ ret = dht_inode_ctx_mdsvol_set(local->inode, this, prev);
+ if (ret) {
+ gf_msg(this->name, GF_LOG_ERROR, 0, DHT_MSG_SET_INODE_CTX_FAILED,
+ "%s: Failed to set mds (%s)", local->loc.path, prev->name);
+ }
+ check_mds = dht_dict_get_array(xattr, conf->mds_xattr_key,
+ mds_xattr_val, 1, &errst);
+ if ((check_mds < 0) && !errst) {
+ /* Check if xattrs need to be healed on the directories */
+ local->mds_xattr = dict_ref(xattr);
+ gf_msg_debug(this->name, 0,
+ "%s: %s is not zero on %s. Xattrs need to be healed."
+ "(gfid = %s)",
+ local->loc.path, conf->mds_xattr_key, prev->name,
+ gfid_local);
+ local->need_xattr_heal = 1;
+ }
+ }
- local->op_ret = 0;
- if (local->xattr == NULL) {
- local->xattr = dict_ref (xattr);
- } else {
- dht_aggregate_xattr (local->xattr, xattr);
- }
+unlock:
+ UNLOCK(&frame->lock);
- if (local->inode == NULL)
- local->inode = inode_ref (inode);
+ this_call_cnt = dht_frame_return(frame);
- dht_iatt_merge (this, &local->stbuf, stbuf, prev->this);
- dht_iatt_merge (this, &local->postparent, postparent,
- prev->this);
+ if (is_last_call(this_call_cnt)) {
+ /* If the mds subvol is not set correctly*/
+ if (!__is_root_gfid(local->gfid) &&
+ (!dict_get(local->xattr, conf->mds_xattr_key))) {
+ local->need_selfheal = 1;
}
-unlock:
- UNLOCK (&frame->lock);
+ /* No need to call xattr heal code if volume count is 1
+ */
+ if (conf->subvolume_cnt == 1) {
+ local->need_xattr_heal = 0;
+ }
+
+ if (local->need_selfheal || local->need_lookup_everywhere) {
+ /* Set the gfid-req so posix will set the GFID*/
+ if (!gf_uuid_is_null(local->gfid)) {
+ /* Ok, this should _never_ happen */
+ ret = dict_set_static_bin(local->xattr_req, "gfid-req",
+ local->gfid, 16);
+ } else {
+ if (!gf_uuid_is_null(local->gfid_req))
+ ret = dict_set_static_bin(local->xattr_req, "gfid-req",
+ local->gfid_req, 16);
+ }
+ }
- this_call_cnt = dht_frame_return (frame);
-
- if (is_last_call (this_call_cnt)) {
- if (local->need_selfheal) {
- local->need_selfheal = 0;
- dht_lookup_everywhere (frame, this, &local->loc);
- return 0;
- }
+ if (local->need_lookup_everywhere) {
+ local->need_lookup_everywhere = 0;
+ dht_lookup_everywhere(frame, this, &local->loc);
+ return 0;
+ }
- if (local->op_ret == 0) {
- ret = dht_layout_normalize (this, &local->loc, layout);
+ if (local->op_ret == 0) {
+ if (dht_needs_selfheal(frame, this)) {
+ goto selfheal;
+ }
- if (ret != 0) {
- gf_msg_debug (this->name, 0,
- "fixing assignment on %s",
- local->loc.path);
- goto selfheal;
- }
+ dht_layout_set(this, local->inode, layout);
+ if (local->inode) {
+ dht_inode_ctx_time_update(local->inode, this, &local->stbuf, 1);
+ }
- dht_layout_set (this, local->inode, layout);
- }
+ if (local->loc.parent) {
+ dht_inode_ctx_time_update(local->loc.parent, this,
+ &local->postparent, 1);
+ }
+ }
- if (local->loc.parent) {
- dht_inode_ctx_time_update (local->loc.parent, this,
- &local->postparent, 1);
- }
+ DHT_STRIP_PHASE1_FLAGS(&local->stbuf);
+ dht_set_fixed_dir_stat(&local->postparent);
+ /* Delete mds xattr at the time of STACK UNWIND */
+ if (local->xattr)
+ GF_REMOVE_INTERNAL_XATTR(conf->mds_xattr_key, local->xattr);
- DHT_STRIP_PHASE1_FLAGS (&local->stbuf);
- DHT_STACK_UNWIND (lookup, frame, local->op_ret, local->op_errno,
- local->inode, &local->stbuf, local->xattr,
- &local->postparent);
- }
+ DHT_STACK_UNWIND(lookup, frame, local->op_ret, local->op_errno,
+ local->inode, &local->stbuf, local->xattr,
+ &local->postparent);
+ }
- return 0;
+ return 0;
selfheal:
- FRAME_SU_DO (frame, dht_local_t);
- gf_uuid_copy (local->loc.gfid, local->gfid);
- ret = dht_selfheal_directory (frame, dht_lookup_selfheal_cbk,
- &local->loc, layout);
+ FRAME_SU_DO(frame, dht_local_t);
+ ret = dht_selfheal_directory(frame, dht_lookup_selfheal_cbk, &local->loc,
+ layout);
out:
- return ret;
+ return ret;
}
-int
-dht_revalidate_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
- int op_ret, int op_errno,
- inode_t *inode, struct iatt *stbuf, dict_t *xattr,
- struct iatt *postparent)
-{
- dht_local_t *local = NULL;
- int this_call_cnt = 0;
- call_frame_t *prev = NULL;
- dht_layout_t *layout = NULL;
- dht_conf_t *conf = NULL;
- int ret = -1;
- int is_dir = 0;
- int is_linkfile = 0;
- int follow_link = 0;
- call_frame_t *copy = NULL;
- dht_local_t *copy_local = NULL;
- char gfid[GF_UUID_BUF_SIZE] = {0};
- uint32_t vol_commit_hash = 0;
- xlator_t *subvol = NULL;
-
- GF_VALIDATE_OR_GOTO ("dht", frame, err);
- GF_VALIDATE_OR_GOTO ("dht", this, err);
- GF_VALIDATE_OR_GOTO ("dht", frame->local, err);
- GF_VALIDATE_OR_GOTO ("dht", cookie, err);
-
- local = frame->local;
- prev = cookie;
- conf = this->private;
- if (!conf)
- goto out;
+static int
+dht_lookup_directory(call_frame_t *frame, xlator_t *this, loc_t *loc)
+{
+ int call_cnt = 0;
+ int i = 0;
+ dht_conf_t *conf = NULL;
+ dht_local_t *local = NULL;
+ int ret = 0;
+
+ GF_VALIDATE_OR_GOTO("dht", frame, out);
+ GF_VALIDATE_OR_GOTO("dht", this, unwind);
+ GF_VALIDATE_OR_GOTO("dht", frame->local, unwind);
+ GF_VALIDATE_OR_GOTO("dht", this->private, unwind);
+ GF_VALIDATE_OR_GOTO("dht", loc, unwind);
+
+ conf = this->private;
+ local = frame->local;
+
+ call_cnt = conf->subvolume_cnt;
+ local->call_cnt = call_cnt;
+
+ local->layout = dht_layout_new(this, conf->subvolume_cnt);
+ if (!local->layout) {
+ goto unwind;
+ }
+
+ if (local->xattr != NULL) {
+ dict_unref(local->xattr);
+ local->xattr = NULL;
+ }
+
+ if (!gf_uuid_is_null(local->gfid)) {
+ /* use this gfid in order to heal any missing ones */
+ ret = dict_set_gfuuid(local->xattr_req, "gfid-req", local->gfid, true);
+ if (ret)
+ gf_msg(this->name, GF_LOG_WARNING, 0, DHT_MSG_DICT_SET_FAILED,
+ "%s: Failed to set dictionary value:"
+ " key = gfid-req",
+ local->loc.path);
+ }
+
+ for (i = 0; i < call_cnt; i++) {
+ STACK_WIND_COOKIE(
+ frame, dht_lookup_dir_cbk, conf->subvolumes[i], conf->subvolumes[i],
+ conf->subvolumes[i]->fops->lookup, &local->loc, local->xattr_req);
+ }
+ return 0;
+unwind:
+ DHT_STACK_UNWIND(lookup, frame, -1, ENOMEM, NULL, NULL, NULL, NULL);
+out:
+ return 0;
+}
- if (!conf->vch_forced) {
- ret = dict_get_uint32 (xattr, conf->commithash_xattr_name,
- &vol_commit_hash);
- if (ret == 0) {
- conf->vol_commit_hash = vol_commit_hash;
- }
+int
+dht_revalidate_cbk(call_frame_t *frame, void *cookie, xlator_t *this,
+ int op_ret, int op_errno, inode_t *inode, struct iatt *stbuf,
+ dict_t *xattr, struct iatt *postparent)
+{
+ dht_local_t *local = NULL;
+ int this_call_cnt = 0;
+ xlator_t *prev = NULL;
+ dht_layout_t *layout = NULL;
+ dht_conf_t *conf = NULL;
+ int ret = -1;
+ int is_dir = 0;
+ int is_linkfile = 0;
+ int follow_link = 0;
+ char gfid[GF_UUID_BUF_SIZE] = {0};
+ uint32_t vol_commit_hash = 0;
+ xlator_t *subvol = NULL;
+ int32_t check_mds = 0;
+ int errst = 0, i = 0;
+ int32_t mds_xattr_val[1] = {0};
+
+ GF_VALIDATE_OR_GOTO("dht", frame, err);
+ GF_VALIDATE_OR_GOTO("dht", this, err);
+ GF_VALIDATE_OR_GOTO("dht", frame->local, err);
+ GF_VALIDATE_OR_GOTO("dht", cookie, err);
+ GF_VALIDATE_OR_GOTO("dht", this->private, err);
+
+ local = frame->local;
+ prev = cookie;
+ conf = this->private;
+
+ if (!conf->vch_forced) {
+ /* Update the commithash value if available
+ */
+ ret = dict_get_uint32(xattr, conf->commithash_xattr_name,
+ &vol_commit_hash);
+ if (ret == 0) {
+ conf->vol_commit_hash = vol_commit_hash;
}
+ }
- gf_uuid_unparse (local->loc.gfid, gfid);
+ gf_uuid_unparse(local->loc.gfid, gfid);
- LOCK (&frame->lock);
- {
+ gf_msg_debug(this->name, op_errno,
+ "%s: revalidate lookup on %s returned op_ret %d",
+ local->loc.path, prev->name, op_ret);
- gf_msg_debug (this->name, op_errno,
- "revalidate lookup of %s "
- "returned with op_ret %d",
- local->loc.path, op_ret);
+ LOCK(&frame->lock);
+ {
+ if (gf_uuid_is_null(local->gfid)) {
+ memcpy(local->gfid, local->loc.gfid, 16);
+ }
- if (op_ret == -1) {
- local->op_errno = op_errno;
+ if (op_ret == -1) {
+ local->op_errno = op_errno;
+
+ if ((op_errno != ENOTCONN) && (op_errno != ENOENT) &&
+ (op_errno != ESTALE)) {
+ gf_msg(this->name, GF_LOG_INFO, op_errno,
+ DHT_MSG_REVALIDATE_CBK_INFO,
+ "Revalidate: subvolume %s for %s "
+ "(gfid = %s) returned -1",
+ prev->name, local->loc.path, gfid);
+ }
+ if (op_errno == ESTALE) {
+ /* propagate the ESTALE to parent.
+ * setting local->return_estale would send
+ * ESTALE to parent. */
+ local->return_estale = 1;
+ }
- if ((op_errno != ENOTCONN)
- && (op_errno != ENOENT)
- && (op_errno != ESTALE)) {
- gf_msg (this->name, GF_LOG_INFO, op_errno,
- DHT_MSG_REVALIDATE_CBK_INFO,
- "Revalidate: subvolume %s for %s "
- "(gfid = %s) returned -1",
- prev->this->name, local->loc.path,
- gfid);
- }
- if (op_errno == ESTALE) {
- /* propagate the ESTALE to parent.
- * setting local->return_estale would send
- * ESTALE to parent. */
- local->return_estale = 1;
+ /* if it is ENOENT, we may have to do a
+ * 'lookup_everywhere()' to make sure
+ * the file is not migrated */
+ if (op_errno == ENOENT) {
+ if (IA_ISREG(local->loc.inode->ia_type)) {
+ gf_msg_debug(this->name, 0,
+ "found ENOENT for %s. "
+ "Setting "
+ "need_lookup_everywhere"
+ " flag to 1",
+ local->loc.path);
+
+ local->need_lookup_everywhere = 1;
+ } else if (IA_ISDIR(local->loc.inode->ia_type)) {
+ layout = local->layout;
+ for (i = 0; i < layout->cnt; i++) {
+ if (layout->list[i].xlator == prev) {
+ layout->list[i].err = op_errno;
+ break;
}
+ }
- /* if it is ENOENT, we may have to do a
- * 'lookup_everywhere()' to make sure
- * the file is not migrated */
- if (op_errno == ENOENT) {
- if (IA_ISREG (local->loc.inode->ia_type)) {
-
- gf_msg_debug (this->name, 0,
- "found ENOENT for %s. "
- "Setting "
- "need_lookup_everywhere"
- " flag to 1",
- local->loc.path);
-
- local->need_lookup_everywhere = 1;
- }
- }
- goto unlock;
+ local->need_selfheal = 1;
}
+ }
- if ((!IA_ISINVAL(local->inode->ia_type)) &&
- stbuf->ia_type != local->inode->ia_type) {
- gf_msg (this->name, GF_LOG_WARNING, 0,
- DHT_MSG_FILE_TYPE_MISMATCH,
- "mismatching filetypes 0%o v/s 0%o for %s,"
- " gfid = %s",
- (stbuf->ia_type), (local->inode->ia_type),
- local->loc.path, gfid);
+ /* The GFID is missing on this subvol. Lookup everywhere to force a
+ * gfid heal
+ */
+ if ((op_errno == ENODATA) &&
+ (IA_ISDIR(local->loc.inode->ia_type))) {
+ local->need_lookup_everywhere = 1;
+ }
- local->op_ret = -1;
- local->op_errno = EINVAL;
+ goto unlock;
+ }
- goto unlock;
+ if ((!IA_ISINVAL(local->inode->ia_type)) &&
+ stbuf->ia_type != local->inode->ia_type) {
+ gf_msg(this->name, GF_LOG_WARNING, 0, DHT_MSG_FILE_TYPE_MISMATCH,
+ "mismatching filetypes 0%o v/s 0%o for %s,"
+ " gfid = %s",
+ (stbuf->ia_type), (local->inode->ia_type), local->loc.path,
+ gfid);
- }
+ local->op_ret = -1;
+ local->op_errno = EINVAL;
- layout = local->layout;
+ goto unlock;
+ }
+
+ layout = local->layout;
+
+ is_dir = check_is_dir(inode, stbuf, xattr);
+ is_linkfile = check_is_linkfile(inode, stbuf, xattr,
+ conf->link_xattr_name);
+ if (is_linkfile) {
+ follow_link = 1;
+ goto unlock;
+ }
+ if (is_dir) {
+ ret = dht_dir_has_layout(xattr, conf->xattr_name);
+ if (ret >= 0) {
+ if (is_greater_time(local->prebuf.ia_ctime,
+ local->prebuf.ia_ctime_nsec,
+ stbuf->ia_ctime, stbuf->ia_ctime_nsec)) {
+ /* Choose source */
+ local->prebuf.ia_gid = stbuf->ia_gid;
+ local->prebuf.ia_uid = stbuf->ia_uid;
+
+ local->prebuf.ia_ctime = stbuf->ia_ctime;
+ local->prebuf.ia_ctime_nsec = stbuf->ia_ctime_nsec;
- is_dir = check_is_dir (inode, stbuf, xattr);
- is_linkfile = check_is_linkfile (inode, stbuf, xattr,
- conf->link_xattr_name);
- if (is_linkfile) {
- follow_link = 1;
- goto unlock;
+ if (__is_root_gfid(stbuf->ia_gfid))
+ local->prebuf.ia_prot = stbuf->ia_prot;
}
- if (is_dir) {
- ret = dht_dir_has_layout (xattr, conf->xattr_name);
- if (ret >= 0) {
- if (is_greater_time(local->stbuf.ia_ctime,
- local->stbuf.ia_ctime_nsec,
- stbuf->ia_ctime,
- stbuf->ia_ctime_nsec)) {
- local->prebuf.ia_gid = stbuf->ia_gid;
- local->prebuf.ia_uid = stbuf->ia_uid;
- }
- }
- if (local->stbuf.ia_type != IA_INVAL)
- {
- if ((local->stbuf.ia_gid != stbuf->ia_gid) ||
- (local->stbuf.ia_uid != stbuf->ia_uid)) {
- local->need_selfheal = 1;
- }
- }
- ret = dht_layout_dir_mismatch (this, layout,
- prev->this, &local->loc,
- xattr);
- if (ret != 0) {
- gf_msg (this->name, GF_LOG_INFO, 0,
- DHT_MSG_LAYOUT_MISMATCH,
- "Mismatching layouts for %s, gfid = %s",
- local->loc.path, gfid);
-
- local->layout_mismatch = 1;
-
- goto unlock;
- }
+ }
+
+ if (local->stbuf.ia_type != IA_INVAL) {
+ if ((local->stbuf.ia_gid != stbuf->ia_gid) ||
+ (local->stbuf.ia_uid != stbuf->ia_uid) ||
+ is_permission_different(&local->stbuf.ia_prot,
+ &stbuf->ia_prot)) {
+ local->need_attrheal = 1;
}
+ }
- dht_iatt_merge (this, &local->stbuf, stbuf, prev->this);
- dht_iatt_merge (this, &local->postparent, postparent,
- prev->this);
+ if (!dict_get(xattr, conf->mds_xattr_key)) {
+ gf_msg_debug(this->name, 0,
+ "%s: internal xattr %s is not present"
+ " on subvol %s(gfid is %s)",
+ local->loc.path, conf->mds_xattr_key, prev->name,
+ gfid);
+ } else {
+ check_mds = dht_dict_get_array(xattr, conf->mds_xattr_key,
+ mds_xattr_val, 1, &errst);
+ local->mds_subvol = prev;
+ local->mds_stbuf.ia_gid = stbuf->ia_gid;
+ local->mds_stbuf.ia_uid = stbuf->ia_uid;
+ local->mds_stbuf.ia_prot = stbuf->ia_prot;
+
+ /* save mds subvol on inode ctx */
+ ret = dht_inode_ctx_mdsvol_set(local->inode, this, prev);
+ if (ret) {
+ gf_msg(this->name, GF_LOG_ERROR, 0,
+ DHT_MSG_SET_INODE_CTX_FAILED,
+ "Failed to set MDS subvol for %s vol is %s",
+ local->loc.path, prev->name);
+ }
+ if ((check_mds < 0) && !errst) {
+ /* Check if xattrs need to be healed on the directory
+ */
+ local->mds_xattr = dict_ref(xattr);
+ gf_msg_debug(this->name, 0,
+ "Value of %s is not zero on "
+ "hashed subvol so xattr needs to"
+ " be healed on non hashed"
+ " path is %s and vol name is %s "
+ " gfid is %s",
+ conf->mds_xattr_key, local->loc.path,
+ prev->name, gfid);
+ local->need_xattr_heal = 1;
+ }
+ }
+ ret = dht_layout_dir_mismatch(this, layout, prev, &local->loc,
+ xattr);
+ if (ret != 0) {
+ /* In memory layout does not match on-disk layout.
+ */
+ gf_msg(this->name, GF_LOG_INFO, 0, DHT_MSG_LAYOUT_MISMATCH,
+ "Mismatching layouts for %s, gfid = %s", local->loc.path,
+ gfid);
- local->op_ret = 0;
+ local->layout_mismatch = 1;
- if (!local->xattr) {
- local->xattr = dict_ref (xattr);
- } else if (is_dir) {
- dht_aggregate_xattr (local->xattr, xattr);
- }
+ goto unlock;
+ }
}
+
+ gf_uuid_copy(local->stbuf.ia_gfid, stbuf->ia_gfid);
+ dht_iatt_merge(this, &local->stbuf, stbuf);
+ dht_iatt_merge(this, &local->postparent, postparent);
+
+ local->op_ret = 0;
+
+ if (!local->xattr) {
+ local->xattr = dict_ref(xattr);
+ } else if (is_dir) {
+ dht_aggregate_xattr(local->xattr, xattr);
+ }
+ }
unlock:
- UNLOCK (&frame->lock);
+ UNLOCK(&frame->lock);
- if (follow_link) {
- gf_uuid_copy (local->gfid, stbuf->ia_gfid);
+ if (follow_link) {
+ /* Found a linkto file. Follow it to see if the target file exists
+ */
+ gf_uuid_copy(local->gfid, stbuf->ia_gfid);
- subvol = dht_linkfile_subvol (this, inode, stbuf, xattr);
- if (!subvol) {
- op_errno = ESTALE;
- local->op_ret = -1;
- } else {
+ subvol = dht_linkfile_subvol(this, inode, stbuf, xattr);
+ if (!subvol) {
+ op_errno = ESTALE;
+ local->op_ret = -1;
+ } else {
+ STACK_WIND_COOKIE(frame, dht_lookup_linkfile_cbk, subvol, subvol,
+ subvol->fops->lookup, &local->loc,
+ local->xattr_req);
+ return 0;
+ }
+ }
+
+ this_call_cnt = dht_frame_return(frame);
- STACK_WIND (frame, dht_lookup_linkfile_cbk,
- subvol, subvol->fops->lookup,
- &local->loc, local->xattr_req);
- return 0;
+ if (is_last_call(this_call_cnt)) {
+ if (!IA_ISDIR(local->stbuf.ia_type) &&
+ (local->hashed_subvol != local->cached_subvol) &&
+ (local->stbuf.ia_nlink == 1) &&
+ (conf && conf->unhashed_sticky_bit)) {
+ local->stbuf.ia_prot.sticky = 1;
+ }
+ /* No need to call heal code if volume count is 1
+ */
+ if (conf->subvolume_cnt == 1)
+ local->need_xattr_heal = 0;
+
+ if (IA_ISDIR(local->stbuf.ia_type)) {
+ /* No mds xattr found. Trigger a heal to set it */
+ if (!__is_root_gfid(local->loc.inode->gfid) &&
+ (!dict_get(local->xattr, conf->mds_xattr_key)))
+ local->need_selfheal = 1;
+
+ if (dht_needs_selfheal(frame, this)) {
+ if (!__is_root_gfid(local->loc.inode->gfid)) {
+ if (local->mds_subvol) {
+ local->stbuf.ia_gid = local->mds_stbuf.ia_gid;
+ local->stbuf.ia_uid = local->mds_stbuf.ia_uid;
+ local->stbuf.ia_prot = local->mds_stbuf.ia_prot;
+ }
+ } else {
+ local->stbuf.ia_gid = local->prebuf.ia_gid;
+ local->stbuf.ia_uid = local->prebuf.ia_uid;
+ local->stbuf.ia_prot = local->prebuf.ia_prot;
}
+
+ layout = local->layout;
+ dht_selfheal_directory(frame, dht_lookup_selfheal_cbk,
+ &local->loc, layout);
+ return 0;
+ }
}
-out:
- this_call_cnt = dht_frame_return (frame);
-
- if (is_last_call (this_call_cnt)) {
- if (!IA_ISDIR (local->stbuf.ia_type)
- && (local->hashed_subvol != local->cached_subvol)
- && (local->stbuf.ia_nlink == 1)
- && (conf && conf->unhashed_sticky_bit)) {
- local->stbuf.ia_prot.sticky = 1;
- }
- if (local->need_selfheal) {
- local->need_selfheal = 0;
- gf_uuid_copy (local->gfid, local->stbuf.ia_gfid);
- local->stbuf.ia_gid = local->prebuf.ia_gid;
- local->stbuf.ia_uid = local->prebuf.ia_uid;
- copy = create_frame (this, this->ctx->pool);
- if (copy) {
- copy_local = dht_local_init (copy, &local->loc,
- NULL, 0);
- if (!copy_local)
- goto cont;
- copy_local->stbuf = local->stbuf;
- copy->local = copy_local;
- FRAME_SU_DO (copy, dht_local_t);
- ret = synctask_new (this->ctx->env,
- dht_dir_attr_heal,
- dht_dir_attr_heal_done,
- copy, copy);
- }
- }
-cont:
- if (local->layout_mismatch) {
- /* Found layout mismatch in the directory, need to
- fix this in the inode context */
- dht_layout_unref (this, local->layout);
- local->layout = NULL;
- dht_lookup_directory (frame, this, &local->loc);
- return 0;
- }
+ if (local->layout_mismatch) {
+ /* Found layout mismatch in the directory, need to
+ fix this in the inode context */
+ dht_layout_unref(this, local->layout);
+ local->layout = NULL;
+ dht_lookup_directory(frame, this, &local->loc);
+ return 0;
+ }
- if (local->need_lookup_everywhere) {
- /* As the current layout gave ENOENT error, we would
- need a new layout */
- dht_layout_unref (this, local->layout);
- local->layout = NULL;
-
- /* We know that current cached subvol is no more
- valid, get the new one */
- local->cached_subvol = NULL;
- dht_lookup_everywhere (frame, this, &local->loc);
- return 0;
- }
- if (local->return_estale) {
- local->op_ret = -1;
- local->op_errno = ESTALE;
- }
+ if (local->need_lookup_everywhere) {
+ /* As the current layout gave ENOENT error, we would
+ need a new layout */
+ dht_layout_unref(this, local->layout);
+ local->layout = NULL;
- if (local->loc.parent) {
- dht_inode_ctx_time_update (local->loc.parent, this,
- &local->postparent, 1);
+ /* We know that current cached subvol is no longer
+ valid, get the new one */
+ local->cached_subvol = NULL;
+ if (local->xattr_req) {
+ if (!gf_uuid_is_null(local->gfid)) {
+ ret = dict_set_static_bin(local->xattr_req, "gfid-req",
+ local->gfid, 16);
}
+ }
- DHT_STRIP_PHASE1_FLAGS (&local->stbuf);
- DHT_STACK_UNWIND (lookup, frame, local->op_ret, local->op_errno,
- local->inode, &local->stbuf, local->xattr,
- &local->postparent);
+ dht_lookup_everywhere(frame, this, &local->loc);
+ return 0;
+ }
+ if (local->return_estale) {
+ local->op_ret = -1;
+ local->op_errno = ESTALE;
}
-err:
- return ret;
-}
+ if (local->loc.parent) {
+ dht_inode_ctx_time_update(local->loc.parent, this,
+ &local->postparent, 1);
+ }
+ DHT_STRIP_PHASE1_FLAGS(&local->stbuf);
+ dht_set_fixed_dir_stat(&local->postparent);
-int
-dht_lookup_linkfile_create_cbk (call_frame_t *frame, void *cookie,
- xlator_t *this,
- int32_t op_ret, int32_t op_errno,
- inode_t *inode, struct iatt *stbuf,
- struct iatt *preparent, struct iatt *postparent,
- dict_t *xdata)
-{
- dht_local_t *local = NULL;
- xlator_t *cached_subvol = NULL;
- dht_conf_t *conf = NULL;
- int ret = -1;
- char gfid[GF_UUID_BUF_SIZE] = {0};
-
- GF_VALIDATE_OR_GOTO ("dht", frame, out);
- GF_VALIDATE_OR_GOTO ("dht", this, out);
- GF_VALIDATE_OR_GOTO ("dht", frame->local, out);
- GF_VALIDATE_OR_GOTO ("dht", this->private, out);
- GF_VALIDATE_OR_GOTO ("dht", cookie, out);
+ /* local->stbuf is updated only from subvols which have a layout
+ * The reason is to avoid choosing attr heal source from newly
+ * added bricks. In case e.g we have only one subvol and for
+ * some reason layout is not present on it, then local->stbuf
+ * will be EINVAL. This is an indication that the subvols
+ * active in the cluster do not have layouts on disk.
+ * Unwind with ESTALE to trigger a fresh lookup */
+ if (is_dir && local->stbuf.ia_type == IA_INVAL) {
+ local->op_ret = -1;
+ local->op_errno = ESTALE;
+ }
+ /* Delete mds xattr at the time of STACK UNWIND */
+ if (local->xattr)
+ GF_REMOVE_INTERNAL_XATTR(conf->mds_xattr_key, local->xattr);
- local = frame->local;
- cached_subvol = local->cached_subvol;
- conf = this->private;
+ DHT_STACK_UNWIND(lookup, frame, local->op_ret, local->op_errno,
+ local->inode, &local->stbuf, local->xattr,
+ &local->postparent);
+ }
- gf_uuid_unparse(local->loc.gfid, gfid);
+err:
+ return ret;
+}
- ret = dht_layout_preset (this, local->cached_subvol, local->loc.inode);
- if (ret < 0) {
- gf_msg_debug (this->name, EINVAL,
- "Failed to set layout for subvolume %s, "
- "(gfid = %s)",
- cached_subvol ? cached_subvol->name : "<nil>",
- gfid);
- local->op_ret = -1;
- local->op_errno = EINVAL;
- goto unwind;
- }
+static int
+dht_lookup_linkfile_create_cbk(call_frame_t *frame, void *cooie, xlator_t *this,
+ int32_t op_ret, int32_t op_errno, inode_t *inode,
+ struct iatt *stbuf, struct iatt *preparent,
+ struct iatt *postparent, dict_t *xdata)
+{
+ dht_local_t *local = NULL;
+ xlator_t *cached_subvol = NULL;
+ dht_conf_t *conf = NULL;
+ int ret = -1;
+ char gfid[GF_UUID_BUF_SIZE] = {0};
+
+ GF_VALIDATE_OR_GOTO("dht", frame, out);
+ GF_VALIDATE_OR_GOTO("dht", this, out);
+ GF_VALIDATE_OR_GOTO("dht", frame->local, out);
+ GF_VALIDATE_OR_GOTO("dht", this->private, out);
+
+ local = frame->local;
+ cached_subvol = local->cached_subvol;
+ conf = this->private;
+
+ gf_uuid_unparse(local->loc.gfid, gfid);
+
+ if (local->locked)
+ dht_unlock_namespace(frame, &local->lock[0]);
+
+ ret = dht_layout_preset(this, local->cached_subvol, local->loc.inode);
+ if (ret < 0) {
+ gf_msg_debug(this->name, EINVAL,
+ "Failed to set layout for subvolume %s, "
+ "(gfid = %s)",
+ cached_subvol ? cached_subvol->name : "<nil>", gfid);
+ local->op_ret = -1;
+ local->op_errno = EINVAL;
+ goto unwind;
+ }
- local->op_ret = 0;
- if ((local->stbuf.ia_nlink == 1)
- && (conf && conf->unhashed_sticky_bit)) {
- local->stbuf.ia_prot.sticky = 1;
- }
+ local->op_ret = 0;
+ if ((local->stbuf.ia_nlink == 1) && (conf && conf->unhashed_sticky_bit)) {
+ local->stbuf.ia_prot.sticky = 1;
+ }
- if (local->loc.parent) {
- dht_inode_ctx_time_update (local->loc.parent, this,
- postparent, 1);
- }
+ if (local->loc.parent) {
+ dht_inode_ctx_time_update(local->loc.parent, this, postparent, 1);
+ }
unwind:
- gf_msg_debug (this->name, 0,
- "creation of linkto on hashed subvol:%s, "
- "returned with op_ret %d and op_errno %d: %s",
- local->hashed_subvol->name,
- op_ret, op_errno, uuid_utoa (local->loc.gfid));
+ gf_msg_debug(this->name, 0,
+ "creation of linkto on hashed subvol:%s, "
+ "returned with op_ret %d and op_errno %d: %s",
+ local->hashed_subvol->name, op_ret, op_errno,
+ uuid_utoa(local->loc.gfid));
- if (local->linked == _gf_true)
- dht_linkfile_attr_heal (frame, this);
+ if (local->linked == _gf_true)
+ dht_linkfile_attr_heal(frame, this);
+ dht_set_fixed_dir_stat(&local->postparent);
- DHT_STRIP_PHASE1_FLAGS (&local->stbuf);
- DHT_STACK_UNWIND (lookup, frame, local->op_ret, local->op_errno,
- local->inode, &local->stbuf, local->xattr,
- &local->postparent);
+ DHT_STRIP_PHASE1_FLAGS(&local->stbuf);
+ DHT_STACK_UNWIND(lookup, frame, local->op_ret, local->op_errno,
+ local->inode, &local->stbuf, local->xattr,
+ &local->postparent);
out:
- return ret;
+ return ret;
}
-int
-dht_lookup_unlink_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
- int op_ret, int op_errno,
- struct iatt *preparent, struct iatt *postparent,
- dict_t *xdata)
+static int
+dht_lookup_unlink_cbk(call_frame_t *frame, void *cookie, xlator_t *this,
+ int op_ret, int op_errno, struct iatt *preparent,
+ struct iatt *postparent, dict_t *xdata)
{
- int this_call_cnt = 0;
- dht_local_t *local = NULL;
- const char *path = NULL;
+ int this_call_cnt = 0;
+ dht_local_t *local = NULL;
+ const char *path = NULL;
- local = (dht_local_t*)frame->local;
- path = local->loc.path;
+ local = (dht_local_t *)frame->local;
+ path = local->loc.path;
+ FRAME_SU_UNDO(frame, dht_local_t);
- gf_msg (this->name, GF_LOG_INFO, 0,
- DHT_MSG_UNLINK_LOOKUP_INFO, "lookup_unlink returned with "
- "op_ret -> %d and op-errno -> %d for %s", op_ret, op_errno,
- ((path == NULL)? "null" : path ));
+ gf_msg(this->name, GF_LOG_INFO, 0, DHT_MSG_UNLINK_LOOKUP_INFO,
+ "lookup_unlink returned with "
+ "op_ret -> %d and op-errno -> %d for %s",
+ op_ret, op_errno, ((path == NULL) ? "null" : path));
- this_call_cnt = dht_frame_return (frame);
- if (is_last_call (this_call_cnt)) {
- dht_lookup_everywhere_done (frame, this);
- }
+ this_call_cnt = dht_frame_return(frame);
+ if (is_last_call(this_call_cnt)) {
+ dht_lookup_everywhere_done(frame, this);
+ }
- return 0;
+ return 0;
}
-int
-dht_lookup_unlink_of_false_linkto_cbk (call_frame_t *frame, void *cookie,
- xlator_t *this, int op_ret, int op_errno,
- struct iatt *preparent,
- struct iatt *postparent, dict_t *xdata)
+static int
+dht_lookup_unlink_of_false_linkto_cbk(call_frame_t *frame, void *cookie,
+ xlator_t *this, int op_ret, int op_errno,
+ struct iatt *preparent,
+ struct iatt *postparent, dict_t *xdata)
{
- int this_call_cnt = 0;
- dht_local_t *local = NULL;
- const char *path = NULL;
-
- local = (dht_local_t*)frame->local;
- path = local->loc.path;
+ int this_call_cnt = 0;
+ dht_local_t *local = NULL;
+ const char *path = NULL;
- gf_msg (this->name, GF_LOG_INFO, 0,
- DHT_MSG_UNLINK_LOOKUP_INFO, "lookup_unlink returned with "
- "op_ret -> %d and op-errno -> %d for %s", op_ret, op_errno,
- ((path == NULL)? "null" : path ));
+ local = (dht_local_t *)frame->local;
+ path = local->loc.path;
- this_call_cnt = dht_frame_return (frame);
- if (is_last_call (this_call_cnt)) {
+ FRAME_SU_UNDO(frame, dht_local_t);
- if (op_ret == 0) {
- dht_lookup_everywhere_done (frame, this);
- } else {
- /*When dht_lookup_everywhere is performed, one cached
- *and one hashed file was found and hashed file does
- *not point to the above mentioned cached node. So it
- *was considered as stale and an unlink was performed.
- *But unlink fails. So may be rebalance is in progress.
- *now ideally we have two data-files. One obtained during
- *lookup_everywhere and one where unlink-failed. So
- *at this point in time we cannot decide which one to
- *choose because there are chances of first cached
- *file is truncated after rebalance and if it is chosen
- *as cached node, application will fail. So return EIO.*/
-
- if (op_errno == EBUSY) {
-
- gf_msg (this->name, GF_LOG_ERROR, op_errno,
- DHT_MSG_UNLINK_FAILED,
- "Could not unlink the linkto file as "
- "either fd is open and/or linkto xattr "
- "is set for %s",
- ((path == NULL)? "null":path));
-
- }
- DHT_STACK_UNWIND (lookup, frame, -1, EIO, NULL, NULL,
- NULL, NULL);
+ gf_msg(this->name, GF_LOG_INFO, 0, DHT_MSG_UNLINK_LOOKUP_INFO,
+ "lookup_unlink returned with "
+ "op_ret -> %d and op-errno -> %d for %s",
+ op_ret, op_errno, ((path == NULL) ? "null" : path));
- }
+ this_call_cnt = dht_frame_return(frame);
+ if (is_last_call(this_call_cnt)) {
+ if ((op_ret == 0) || ((op_errno != EBUSY) && (op_errno != ENOTCONN))) {
+ dht_lookup_everywhere_done(frame, this);
+ } else {
+ /*When dht_lookup_everywhere is performed, one cached
+ *and one hashed file was found and hashed file does
+ *not point to the above mentioned cached node. So it
+ *was considered as stale and an unlink was performed.
+ *But unlink fails. So may be rebalance is in progress.
+ *now ideally we have two data-files. One obtained during
+ *lookup_everywhere and one where unlink-failed. So
+ *at this point in time we cannot decide which one to
+ *choose because there are chances of first cached
+ *file is truncated after rebalance and if it is chosen
+ *as cached node, application will fail. So return EIO.*/
+
+ if (op_errno == EBUSY) {
+ gf_msg(this->name, GF_LOG_ERROR, op_errno,
+ DHT_MSG_UNLINK_FAILED,
+ "Could not unlink the linkto file as "
+ "either fd is open and/or linkto xattr "
+ "is set for %s",
+ ((path == NULL) ? "null" : path));
+ }
+ DHT_STACK_UNWIND(lookup, frame, -1, EIO, NULL, NULL, NULL, NULL);
}
+ }
- return 0;
+ return 0;
}
-int
-dht_lookup_unlink_stale_linkto_cbk (call_frame_t *frame, void *cookie,
- xlator_t *this, int op_ret, int op_errno,
- struct iatt *preparent,
- struct iatt *postparent, dict_t *xdata)
+static int
+dht_lookup_unlink_stale_linkto_cbk(call_frame_t *frame, void *cookie,
+ xlator_t *this, int op_ret, int op_errno,
+ struct iatt *preparent,
+ struct iatt *postparent, dict_t *xdata)
{
+ dht_local_t *local = NULL;
+ const char *path = NULL;
- dht_local_t *local = NULL;
- const char *path = NULL;
+ /* NOTE:
+ * If stale file unlink fails either there is an open-fd or is not an
+ * dht-linkto-file then posix_unlink returns EBUSY, which is overwritten
+ * to ENOENT
+ */
- /* NOTE:
- * If stale file unlink fails either there is an open-fd or is not an
- * dht-linkto-file then posix_unlink returns EBUSY, which is overwritten
- * to ENOENT
- */
+ local = frame->local;
- local = frame->local;
+ if (local) {
+ FRAME_SU_UNDO(frame, dht_local_t);
+ if (local->loc.path)
+ path = local->loc.path;
+ }
- if (local && local->loc.path)
- path = local->loc.path;
+ gf_msg(this->name, GF_LOG_INFO, 0, DHT_MSG_UNLINK_LOOKUP_INFO,
+ "Returned with op_ret %d and "
+ "op_errno %d for %s",
+ op_ret, op_errno, ((path == NULL) ? "null" : path));
- gf_msg (this->name, GF_LOG_INFO, 0,
- DHT_MSG_UNLINK_LOOKUP_INFO,
- "Returned with op_ret %d and "
- "op_errno %d for %s", op_ret, op_errno,
- ((path==NULL)?"null":path));
+ DHT_STACK_UNWIND(lookup, frame, -1, ENOENT, NULL, NULL, NULL, NULL);
- DHT_STACK_UNWIND (lookup, frame, -1, ENOENT, NULL, NULL, NULL,
- NULL);
-
- return 0;
+ return 0;
}
-int
-dht_fill_dict_to_avoid_unlink_of_migrating_file (dict_t *dict) {
+static int
+dht_fill_dict_to_avoid_unlink_of_migrating_file(dict_t *dict)
+{
+ int ret = 0;
- int ret = 0;
- xlator_t *this = NULL;
- char *linktoskip_key = NULL;
+ ret = dict_set_int32_sizen(dict, DHT_SKIP_NON_LINKTO_UNLINK, 1);
- this = THIS;
- GF_VALIDATE_OR_GOTO ("dht", this, err);
+ if (ret)
+ return -1;
+
+ ret = dict_set_int32_sizen(dict, DHT_SKIP_OPEN_FD_UNLINK, 1);
- if (dht_is_tier_xlator (this))
- linktoskip_key = TIER_SKIP_NON_LINKTO_UNLINK;
+ if (ret)
+ return -1;
+
+ return 0;
+}
+
+static int32_t
+dht_linkfile_create_lookup_cbk(call_frame_t *frame, void *cookie,
+ xlator_t *this, int32_t op_ret, int32_t op_errno,
+ inode_t *inode, struct iatt *buf, dict_t *xdata,
+ struct iatt *postparent)
+{
+ dht_local_t *local = NULL;
+ int call_cnt = 0, ret = 0;
+ xlator_t *subvol = NULL;
+ uuid_t gfid = {
+ 0,
+ };
+ char gfid_str[GF_UUID_BUF_SIZE] = {0};
+
+ subvol = cookie;
+ local = frame->local;
+
+ if (subvol == local->hashed_subvol) {
+ if ((op_ret == 0) || (op_errno != ENOENT))
+ local->dont_create_linkto = _gf_true;
+ } else {
+ if (gf_uuid_is_null(local->gfid))
+ gf_uuid_copy(gfid, local->loc.gfid);
else
- linktoskip_key = DHT_SKIP_NON_LINKTO_UNLINK;
+ gf_uuid_copy(gfid, local->gfid);
+
+ if ((op_ret == 0) && gf_uuid_compare(gfid, buf->ia_gfid)) {
+ gf_uuid_unparse(gfid, gfid_str);
+ gf_msg_debug(this->name, 0,
+ "gfid (%s) different on cached subvol "
+ "(%s) and looked up inode (%s), not "
+ "creating linkto",
+ uuid_utoa(buf->ia_gfid), subvol->name, gfid_str);
+ local->dont_create_linkto = _gf_true;
+ } else if (op_ret == -1) {
+ local->dont_create_linkto = _gf_true;
+ }
+ }
+
+ call_cnt = dht_frame_return(frame);
+ if (is_last_call(call_cnt)) {
+ if (local->dont_create_linkto)
+ goto no_linkto;
+ else {
+ gf_msg_debug(this->name, 0,
+ "Creating linkto file on %s(hash) to "
+ "%s on %s (gfid = %s)",
+ local->hashed_subvol->name, local->loc.path,
+ local->cached_subvol->name, gfid_str);
+
+ ret = dht_linkfile_create(frame, dht_lookup_linkfile_create_cbk,
+ this, local->cached_subvol,
+ local->hashed_subvol, &local->loc);
+
+ if (ret < 0)
+ goto no_linkto;
+ }
+ }
+
+ return 0;
+
+no_linkto:
+ gf_msg_debug(this->name, 0,
+ "skipped linkto creation (path:%s) (gfid:%s) "
+ "(hashed-subvol:%s) (cached-subvol:%s)",
+ local->loc.path, gfid_str, local->hashed_subvol->name,
+ local->cached_subvol->name);
+
+ dht_lookup_linkfile_create_cbk(frame, NULL, this, 0, 0, local->loc.inode,
+ &local->stbuf, &local->preparent,
+ &local->postparent, local->xattr);
+ return 0;
+}
- ret = dict_set_int32 (dict, linktoskip_key, 1);
+static int32_t
+dht_call_lookup_linkfile_create(call_frame_t *frame, void *cookie,
+ xlator_t *this, int32_t op_ret,
+ int32_t op_errno, dict_t *xdata)
+{
+ dht_local_t *local = NULL;
+ char gfid[GF_UUID_BUF_SIZE] = {0};
+ int i = 0;
+ xlator_t *subvol = NULL;
- if (ret)
- goto err;
+ local = frame->local;
+ if (gf_uuid_is_null(local->gfid))
+ gf_uuid_unparse(local->loc.gfid, gfid);
+ else
+ gf_uuid_unparse(local->gfid, gfid);
- ret = dict_set_int32 (dict, DHT_SKIP_OPEN_FD_UNLINK, 1);
+ if (op_ret < 0) {
+ gf_log(this->name, GF_LOG_WARNING,
+ "protecting namespace failed, skipping linkto "
+ "creation (path:%s)(gfid:%s)(hashed-subvol:%s)"
+ "(cached-subvol:%s)",
+ local->loc.path, gfid, local->hashed_subvol->name,
+ local->cached_subvol->name);
+ goto err;
+ }
- if (ret)
- goto err;
+ local->locked = _gf_true;
+ local->call_cnt = 2;
- return 0;
+ for (i = 0; i < 2; i++) {
+ subvol = (subvol == NULL) ? local->hashed_subvol : local->cached_subvol;
-err:
- return -1;
+ STACK_WIND_COOKIE(frame, dht_linkfile_create_lookup_cbk, subvol, subvol,
+ subvol->fops->lookup, &local->loc, NULL);
+ }
+
+ return 0;
+err:
+ dht_lookup_linkfile_create_cbk(frame, NULL, this, 0, 0, local->loc.inode,
+ &local->stbuf, &local->preparent,
+ &local->postparent, local->xattr);
+ return 0;
}
+
/* Rebalance is performed from cached_node to hashed_node. Initial cached_node
* contains a non-linkto file. After migration it is converted to linkto and
* then unlinked. And at hashed_subvolume, first a linkto file is present,
@@ -1232,6836 +2335,9057 @@ err:
* dht_lookup_everywhere_done takes decision based on any of the above case
*/
-int
-dht_lookup_everywhere_done (call_frame_t *frame, xlator_t *this)
+static int
+dht_lookup_everywhere_done(call_frame_t *frame, xlator_t *this)
{
- int ret = 0;
- dht_local_t *local = NULL;
- xlator_t *hashed_subvol = NULL;
- xlator_t *cached_subvol = NULL;
- dht_layout_t *layout = NULL;
- char gfid[GF_UUID_BUF_SIZE] = {0};
- gf_boolean_t found_non_linkto_on_hashed = _gf_false;
+ int ret = 0;
+ dht_local_t *local = NULL;
+ xlator_t *hashed_subvol = NULL;
+ xlator_t *cached_subvol = NULL;
+ dht_layout_t *layout = NULL;
+ char gfid[GF_UUID_BUF_SIZE] = {0};
+ gf_boolean_t found_non_linkto_on_hashed = _gf_false;
+
+ local = frame->local;
+ hashed_subvol = local->hashed_subvol;
+ cached_subvol = local->cached_subvol;
+
+ gf_uuid_unparse(local->loc.gfid, gfid);
+
+ if (local->file_count && local->dir_count) {
+ gf_msg(this->name, GF_LOG_ERROR, 0, DHT_MSG_FILE_TYPE_MISMATCH,
+ "path %s (gfid = %s)exists as a file on one "
+ "subvolume and directory on another. "
+ "Please fix it manually",
+ local->loc.path, gfid);
+ DHT_STACK_UNWIND(lookup, frame, -1, EIO, NULL, NULL, NULL, NULL);
+ return 0;
+ }
+ if (local->op_ret && local->gfid_missing) {
+ if (gf_uuid_is_null(local->gfid_req)) {
+ DHT_STACK_UNWIND(lookup, frame, -1, ENODATA, NULL, NULL, NULL,
+ NULL);
+ return 0;
+ }
+ /* A hack */
+ dht_lookup_directory(frame, this, &local->loc);
+ return 0;
+ }
+
+ if (local->dir_count) {
+ dht_lookup_directory(frame, this, &local->loc);
+ return 0;
+ }
+
+ gf_msg_debug(this->name, 0,
+ "STATUS: hashed_subvol %s "
+ "cached_subvol %s",
+ (hashed_subvol == NULL) ? "null" : hashed_subvol->name,
+ (cached_subvol == NULL) ? "null" : cached_subvol->name);
+
+ if (!cached_subvol) {
+ if (local->skip_unlink.handle_valid_link && hashed_subvol) {
+ /*Purpose of "DHT_SKIP_NON_LINKTO_UNLINK":
+ * If this lookup is performed by rebalance and this
+ * rebalance process detected hashed file and by
+ * the time it sends the lookup request to cached node,
+ * file got migrated and now at initial hashed_node,
+ * final migrated file is present. With current logic,
+ * because this process fails to find the cached_node,
+ * it will unlink the file at initial hashed_node.
+ *
+ * So we avoid this by setting key, and checking at the
+ * posix_unlink that unlink the file only if file is a
+ * linkto file and not a migrated_file.
+ */
+
+ ret = dht_fill_dict_to_avoid_unlink_of_migrating_file(
+ local->xattr_req);
+
+ if (ret) {
+ /* If for some reason, setting key in the dict
+ * fails, return with ENOENT, as with respect to
+ * this process, it detected only a stale link
+ * file.
+ *
+ * Next lookup will delete it.
+ *
+ * Performing deletion of stale link file when
+ * setting key in dict fails, may cause the data
+ * loss because of the above mentioned race.
+ */
- local = frame->local;
- hashed_subvol = local->hashed_subvol;
- cached_subvol = local->cached_subvol;
+ DHT_STACK_UNWIND(lookup, frame, -1, ENOENT, NULL, NULL, NULL,
+ NULL);
+ } else {
+ local->skip_unlink.handle_valid_link = _gf_false;
+
+ gf_msg_debug(this->name, 0,
+ "No Cached was found and "
+ "unlink on hashed was skipped"
+ " so performing now: %s",
+ local->loc.path);
+ FRAME_SU_DO(frame, dht_local_t);
+ STACK_WIND(frame, dht_lookup_unlink_stale_linkto_cbk,
+ hashed_subvol, hashed_subvol->fops->unlink,
+ &local->loc, 0, local->xattr_req);
+ }
- gf_uuid_unparse (local->loc.gfid, gfid);
-
- if (local->file_count && local->dir_count) {
- gf_msg (this->name, GF_LOG_ERROR, 0,
- DHT_MSG_FILE_TYPE_MISMATCH,
- "path %s (gfid = %s)exists as a file on one "
- "subvolume and directory on another. "
- "Please fix it manually",
- local->loc.path, gfid);
- DHT_STACK_UNWIND (lookup, frame, -1, EIO, NULL, NULL, NULL,
- NULL);
- return 0;
- }
+ } else {
+ gf_msg_debug(this->name, 0,
+ "There was no cached file and "
+ "unlink on hashed is not skipped %s",
+ local->loc.path);
- if (local->dir_count) {
- dht_lookup_directory (frame, this, &local->loc);
- return 0;
+ DHT_STACK_UNWIND(lookup, frame, -1, ENOENT, NULL, NULL, NULL, NULL);
}
+ return 0;
+ }
- gf_msg_debug (this->name, 0, "STATUS: hashed_subvol %s "
- "cached_subvol %s",
- (hashed_subvol == NULL)?"null":hashed_subvol->name,
- (cached_subvol == NULL)?"null":cached_subvol->name);
-
- if (!cached_subvol) {
-
- if (local->skip_unlink.handle_valid_link && hashed_subvol) {
-
- /*Purpose of "DHT_SKIP_NON_LINKTO_UNLINK":
- * If this lookup is performed by rebalance and this
- * rebalance process detected hashed file and by
- * the time it sends the lookup request to cached node,
- * file got migrated and now at initial hashed_node,
- * final migrated file is present. With current logic,
- * because this process fails to find the cached_node,
- * it will unlink the file at initial hashed_node.
- *
- * So we avoid this by setting key, and checking at the
- * posix_unlink that unlink the file only if file is a
- * linkto file and not a migrated_file.
- */
-
-
- ret = dht_fill_dict_to_avoid_unlink_of_migrating_file
- (local->xattr_req);
-
- if (ret) {
- /* If for some reason, setting key in the dict
- * fails, return with ENOENT, as with respect to
- * this process, it detected only a stale link
- * file.
- *
- * Next lookup will delete it.
- *
- * Performing deletion of stale link file when
- * setting key in dict fails, may cause the data
- * loss becase of the above mentioned race.
- */
-
-
- DHT_STACK_UNWIND (lookup, frame, -1, ENOENT,
- NULL, NULL, NULL, NULL);
- } else {
- local->skip_unlink.handle_valid_link = _gf_false;
-
- gf_msg_debug (this->name, 0,
- "No Cached was found and "
- "unlink on hashed was skipped"
- " so performing now: %s",
- local->loc.path);
-
- STACK_WIND (frame,
- dht_lookup_unlink_stale_linkto_cbk,
- hashed_subvol,
- hashed_subvol->fops->unlink,
- &local->loc, 0, local->xattr_req);
- }
+ /* At the time of dht_lookup, no file was found on hashed and that is
+ * why dht_lookup_everywhere is called, but by the time
+ * dht_lookup_everywhere
+ * reached to server, file might have already migrated. In that case we
+ * will find a migrated file at the hashed_node. In this case store the
+ * layout in context and return successfully.
+ */
- } else {
+ if (hashed_subvol || local->need_lookup_everywhere) {
+ if (local->need_lookup_everywhere) {
+ found_non_linkto_on_hashed = _gf_true;
- gf_msg_debug (this->name, 0,
- "There was no cached file and "
- "unlink on hashed is not skipped %s",
- local->loc.path);
+ } else if ((local->file_count == 1) &&
+ (hashed_subvol == cached_subvol)) {
+ gf_msg_debug(this->name, 0,
+ "found cached file on hashed subvolume "
+ "so store in context and return for %s",
+ local->loc.path);
- DHT_STACK_UNWIND (lookup, frame, -1, ENOENT, NULL, NULL,
- NULL, NULL);
- }
- return 0;
+ found_non_linkto_on_hashed = _gf_true;
}
- /* At the time of dht_lookup, no file was found on hashed and that is
- * why dht_lookup_everywhere is called, but by the time
- * dht_lookup_everywhere
- * reached to server, file might have already migrated. In that case we
- * will find a migrated file at the hashed_node. In this case store the
- * layout in context and return successfully.
- */
+ if (found_non_linkto_on_hashed)
+ goto preset_layout;
+ }
- if (hashed_subvol || local->need_lookup_everywhere) {
+ if (hashed_subvol) {
+ if (local->skip_unlink.handle_valid_link == _gf_true) {
+ if (cached_subvol == local->skip_unlink.hash_links_to) {
+ if (gf_uuid_compare(local->skip_unlink.cached_gfid,
+ local->skip_unlink.hashed_gfid)) {
+ /*GFID different, return error*/
+ DHT_STACK_UNWIND(lookup, frame, -1, ESTALE, NULL, NULL,
+ NULL, NULL);
- if (local->need_lookup_everywhere) {
+ return 0;
+ }
- found_non_linkto_on_hashed = _gf_true;
+ ret = dht_layout_preset(this, cached_subvol, local->loc.inode);
+ if (ret) {
+ gf_msg(this->name, GF_LOG_INFO, 0,
+ DHT_MSG_LAYOUT_PRESET_FAILED,
+ "Could not set pre-set layout "
+ "for subvolume %s",
+ cached_subvol->name);
+ }
- } else if ((local->file_count == 1) &&
- (hashed_subvol == cached_subvol)) {
+ local->op_ret = (ret == 0) ? ret : -1;
+ local->op_errno = (ret == 0) ? ret : EINVAL;
- gf_msg_debug (this->name, 0,
- "found cached file on hashed subvolume "
- "so store in context and return for %s",
- local->loc.path);
+ /* Presence of local->cached_subvol validates
+ * that lookup from cached node is successful
+ */
- found_non_linkto_on_hashed = _gf_true;
+ if (!local->op_ret && local->loc.parent) {
+ dht_inode_ctx_time_update(local->loc.parent, this,
+ &local->postparent, 1);
+ }
+
+ gf_msg_debug(this->name, 0,
+ "Skipped unlinking linkto file "
+ "on the hashed subvolume. "
+ "Returning success as it is a "
+ "valid linkto file. Path:%s",
+ local->loc.path);
+
+ goto unwind_hashed_and_cached;
+ } else {
+ local->skip_unlink.handle_valid_link = _gf_false;
+
+ gf_msg_debug(this->name, 0,
+ "Linkto file found on hashed "
+ "subvol "
+ "and data file found on cached "
+ "subvolume. But linkto points to "
+ "different cached subvolume (%s) "
+ "path %s",
+ (local->skip_unlink.hash_links_to
+ ? local->skip_unlink.hash_links_to->name
+ : " <nil>"),
+ local->loc.path);
+
+ if (local->skip_unlink.opend_fd_count == 0) {
+ ret = dht_fill_dict_to_avoid_unlink_of_migrating_file(
+ local->xattr_req);
+
+ if (ret) {
+ DHT_STACK_UNWIND(lookup, frame, -1, EIO, NULL, NULL,
+ NULL, NULL);
+ } else {
+ local->call_cnt = 1;
+ FRAME_SU_DO(frame, dht_local_t);
+ STACK_WIND(frame, dht_lookup_unlink_of_false_linkto_cbk,
+ hashed_subvol, hashed_subvol->fops->unlink,
+ &local->loc, 0, local->xattr_req);
+ }
+
+ return 0;
}
-
- if (found_non_linkto_on_hashed)
- goto preset_layout;
-
+ }
}
+ }
+preset_layout:
- if (hashed_subvol) {
- if (local->skip_unlink.handle_valid_link == _gf_true) {
- if (cached_subvol == local->skip_unlink.hash_links_to) {
-
- if (gf_uuid_compare (local->skip_unlink.cached_gfid,
- local->skip_unlink.hashed_gfid)){
-
- /*GFID different, return error*/
- DHT_STACK_UNWIND (lookup, frame, -1,
- ESTALE, NULL, NULL,
- NULL, NULL);
+ if (found_non_linkto_on_hashed) {
+ if (local->need_lookup_everywhere) {
+ if (gf_uuid_compare(local->gfid, local->inode->gfid)) {
+ /* GFID different, return error */
+ DHT_STACK_UNWIND(lookup, frame, -1, ENOENT, NULL, NULL, NULL,
+ NULL);
+ return 0;
+ }
+ }
- return 0;
- }
+ local->op_ret = 0;
+ local->op_errno = 0;
+ layout = dht_layout_for_subvol(this, cached_subvol);
+ if (!layout) {
+ gf_msg(this->name, GF_LOG_INFO, 0, DHT_MSG_SUBVOL_INFO,
+ "%s: no pre-set layout for subvolume %s,"
+ " gfid = %s",
+ local->loc.path,
+ (cached_subvol ? cached_subvol->name : "<nil>"), gfid);
+ }
- ret = dht_layout_preset (this, cached_subvol,
- local->loc.inode);
- if (ret) {
- gf_msg (this->name, GF_LOG_INFO, 0,
- DHT_MSG_LAYOUT_PRESET_FAILED,
- "Could not set pre-set layout "
- "for subvolume %s",
- cached_subvol->name);
- }
+ ret = dht_layout_set(this, local->inode, layout);
+ if (ret < 0) {
+ gf_msg(this->name, GF_LOG_INFO, 0, DHT_MSG_SUBVOL_INFO,
+ "%s: failed to set layout for subvol %s, "
+ "gfid = %s",
+ local->loc.path,
+ (cached_subvol ? cached_subvol->name : "<nil>"), gfid);
+ }
- local->op_ret = (ret == 0) ? ret : -1;
- local->op_errno = (ret == 0) ? ret : EINVAL;
+ if (local->loc.parent) {
+ dht_inode_ctx_time_update(local->loc.parent, this,
+ &local->postparent, 1);
+ }
- /* Presence of local->cached_subvol validates
- * that lookup from cached node is successful
- */
+ DHT_STRIP_PHASE1_FLAGS(&local->stbuf);
+ dht_set_fixed_dir_stat(&local->postparent);
+ DHT_STACK_UNWIND(lookup, frame, local->op_ret, local->op_errno,
+ local->inode, &local->stbuf, local->xattr,
+ &local->postparent);
+ return 0;
+ }
- if (!local->op_ret && local->loc.parent) {
- dht_inode_ctx_time_update
- (local->loc.parent, this,
- &local->postparent, 1);
- }
+ if (!hashed_subvol) {
+ gf_msg_debug(this->name, 0,
+ "Cannot create linkfile for %s on %s: "
+ "hashed subvolume cannot be found, gfid = %s.",
+ local->loc.path, cached_subvol->name, gfid);
- gf_msg_debug (this->name, 0,
- "Skipped unlinking linkto file "
- "on the hashed subvolume. "
- "Returning success as it is a "
- "valid linkto file. Path:%s"
- ,local->loc.path);
+ local->op_ret = 0;
+ local->op_errno = 0;
- goto unwind_hashed_and_cached;
- } else {
+ ret = dht_layout_preset(frame->this, cached_subvol, local->inode);
+ if (ret < 0) {
+ gf_msg(this->name, GF_LOG_INFO, 0, DHT_MSG_LAYOUT_PRESET_FAILED,
+ "Failed to set layout for subvol %s"
+ ", gfid = %s",
+ cached_subvol ? cached_subvol->name : "<nil>", gfid);
+ local->op_ret = -1;
+ local->op_errno = EINVAL;
+ }
- local->skip_unlink.handle_valid_link = _gf_false;
+ if (local->loc.parent) {
+ dht_inode_ctx_time_update(local->loc.parent, this,
+ &local->postparent, 1);
+ }
- gf_msg_debug (this->name, 0,
- "Linkto file found on hashed "
- "subvol "
- "and data file found on cached "
- "subvolume. But linkto points to "
- "different cached subvolume (%s) "
- "path %s",
- (local->skip_unlink.hash_links_to ?
- local->skip_unlink.hash_links_to->name :
- " <nil>"), local->loc.path);
+ DHT_STRIP_PHASE1_FLAGS(&local->stbuf);
+ dht_set_fixed_dir_stat(&local->postparent);
+ DHT_STACK_UNWIND(lookup, frame, local->op_ret, local->op_errno,
+ local->inode, &local->stbuf, local->xattr,
+ &local->postparent);
+ return 0;
+ }
- if (local->skip_unlink.opend_fd_count == 0) {
+ if (frame->root->op != GF_FOP_RENAME) {
+ local->current = &local->lock[0];
+ ret = dht_protect_namespace(frame, &local->loc, hashed_subvol,
+ &local->current->ns,
+ dht_call_lookup_linkfile_create);
+ } else {
+ gf_msg_debug(this->name, 0,
+ "Creating linkto file on %s(hash) to %s on %s "
+ "(gfid = %s)",
+ hashed_subvol->name, local->loc.path, cached_subvol->name,
+ gfid);
+ ret = dht_linkfile_create(frame, dht_lookup_linkfile_create_cbk, this,
+ cached_subvol, hashed_subvol, &local->loc);
+ }
- ret = dht_fill_dict_to_avoid_unlink_of_migrating_file
- (local->xattr_req);
+ return ret;
+unwind_hashed_and_cached:
+ DHT_STRIP_PHASE1_FLAGS(&local->stbuf);
+ dht_set_fixed_dir_stat(&local->postparent);
+ DHT_STACK_UNWIND(lookup, frame, local->op_ret, local->op_errno,
+ local->inode, &local->stbuf, local->xattr,
+ &local->postparent);
+ return 0;
+}
- if (ret) {
- DHT_STACK_UNWIND (lookup, frame, -1,
- EIO, NULL, NULL,
- NULL, NULL);
- } else {
- local->call_cnt = 1;
- STACK_WIND (frame,
- dht_lookup_unlink_of_false_linkto_cbk,
- hashed_subvol,
- hashed_subvol->fops->unlink,
- &local->loc, 0,
- local->xattr_req);
- }
+static int
+dht_lookup_everywhere_cbk(call_frame_t *frame, void *cookie, xlator_t *this,
+ int32_t op_ret, int32_t op_errno, inode_t *inode,
+ struct iatt *buf, dict_t *xattr,
+ struct iatt *postparent)
+{
+ dht_local_t *local = NULL;
+ int this_call_cnt = 0;
+ xlator_t *prev = NULL;
+ int is_linkfile = 0;
+ int is_dir = 0;
+ loc_t *loc = NULL;
+ xlator_t *link_subvol = NULL;
+ int ret = -1;
+ int32_t fd_count = 0;
+ dht_conf_t *conf = NULL;
+ char gfid[GF_UUID_BUF_SIZE] = {0};
+ dict_t *dict_req = {0};
+
+ GF_VALIDATE_OR_GOTO("dht", frame, out);
+ GF_VALIDATE_OR_GOTO("dht", this, out);
+ GF_VALIDATE_OR_GOTO("dht", frame->local, out);
+ GF_VALIDATE_OR_GOTO("dht", cookie, out);
+ GF_VALIDATE_OR_GOTO("dht", this->private, out);
+
+ local = frame->local;
+ loc = &local->loc;
+ conf = this->private;
+
+ prev = cookie;
+
+ gf_msg_debug(this->name, 0,
+ "returned with op_ret %d and op_errno %d (%s) "
+ "from subvol %s",
+ op_ret, op_errno, loc->path, prev->name);
+
+ LOCK(&frame->lock);
+ {
+ if (op_ret == -1) {
+ if (op_errno != ENOENT)
+ local->op_errno = op_errno;
+ if (op_errno == ENODATA)
+ local->gfid_missing = _gf_true;
+ goto unlock;
+ }
- return 0;
+ if (gf_uuid_is_null(local->gfid))
+ gf_uuid_copy(local->gfid, buf->ia_gfid);
- }
- }
+ gf_uuid_unparse(local->gfid, gfid);
- }
+ if (gf_uuid_compare(local->gfid, buf->ia_gfid)) {
+ gf_msg(this->name, GF_LOG_WARNING, 0, DHT_MSG_GFID_MISMATCH,
+ "%s: gfid differs on subvolume %s,"
+ " gfid local = %s, gfid node = %s",
+ loc->path, prev->name, gfid, uuid_utoa(buf->ia_gfid));
}
+ is_linkfile = check_is_linkfile(inode, buf, xattr,
+ conf->link_xattr_name);
-preset_layout:
-
- if (found_non_linkto_on_hashed) {
-
- if (local->need_lookup_everywhere) {
- if (gf_uuid_compare (local->gfid, local->inode->gfid)) {
- /* GFID different, return error */
- DHT_STACK_UNWIND (lookup, frame, -1, ENOENT,
- NULL, NULL, NULL, NULL);
- return 0;
- }
- }
+ if (is_linkfile) {
+ link_subvol = dht_linkfile_subvol(this, inode, buf, xattr);
+ gf_msg_debug(this->name, 0, "found on %s linkfile %s (-> %s)",
+ prev->name, loc->path,
+ link_subvol ? link_subvol->name : "''");
+ goto unlock;
+ }
- local->op_ret = 0;
- local->op_errno = 0;
- layout = dht_layout_for_subvol (this, cached_subvol);
- if (!layout) {
- gf_msg (this->name, GF_LOG_INFO, 0,
- DHT_MSG_SUBVOL_INFO,
- "%s: no pre-set layout for subvolume %s,"
- " gfid = %s",
- local->loc.path, (cached_subvol ?
- cached_subvol->name :
- "<nil>"), gfid);
- }
+ is_dir = check_is_dir(inode, buf, xattr);
- ret = dht_layout_set (this, local->inode, layout);
- if (ret < 0) {
- gf_msg (this->name, GF_LOG_INFO, 0,
- DHT_MSG_SUBVOL_INFO,
- "%s: failed to set layout for subvol %s, "
- "gfid = %s",
- local->loc.path, (cached_subvol ?
- cached_subvol->name :
- "<nil>"), gfid);
- }
+ /* non linkfile GFID takes precedence but don't overwrite
+ gfid if we have already found a cached file*/
+ if (!local->cached_subvol)
+ gf_uuid_copy(local->gfid, buf->ia_gfid);
- if (local->loc.parent) {
- dht_inode_ctx_time_update (local->loc.parent, this,
- &local->postparent, 1);
- }
+ if (is_dir) {
+ local->dir_count++;
- DHT_STRIP_PHASE1_FLAGS (&local->stbuf);
- DHT_STACK_UNWIND (lookup, frame, local->op_ret,
- local->op_errno, local->inode,
- &local->stbuf, local->xattr,
- &local->postparent);
- return 0;
+ gf_msg_debug(this->name, 0, "found on %s directory %s", prev->name,
+ loc->path);
+ } else {
+ local->file_count++;
+
+ gf_msg_debug(this->name, 0, "found cached file on %s for %s",
+ prev->name, loc->path);
+
+ if (!local->cached_subvol) {
+ /* found one file */
+ dht_iatt_merge(this, &local->stbuf, buf);
+
+ local->xattr = dict_ref(xattr);
+ local->cached_subvol = prev;
+
+ gf_msg_debug(this->name, 0,
+ "storing cached on %s file"
+ " %s",
+ prev->name, loc->path);
+
+ dht_iatt_merge(this, &local->postparent, postparent);
+
+ gf_uuid_copy(local->skip_unlink.cached_gfid, buf->ia_gfid);
+ } else {
+ /* This is where we need 'rename' both entries logic */
+ gf_msg(this->name, GF_LOG_WARNING, 0,
+ DHT_MSG_FILE_ON_MULT_SUBVOL,
+ "multiple subvolumes (%s and %s) have "
+ "file %s (preferably rename the file "
+ "in the backend,and do a fresh lookup)",
+ local->cached_subvol->name, prev->name, local->loc.path);
+ }
}
+ }
+unlock:
+ UNLOCK(&frame->lock);
- if (!hashed_subvol) {
+ if (is_linkfile) {
+ ret = dict_get_int32(xattr, GLUSTERFS_OPEN_FD_COUNT, &fd_count);
- gf_msg_debug (this->name, 0,
- "Cannot create linkfile for %s on %s: "
- "hashed subvolume cannot be found, gfid = %s.",
- local->loc.path, cached_subvol->name, gfid);
+ /* Any linkto file found on the non-hashed subvolume should
+ * be unlinked (performed in the "else if" block below)
+ *
+ * But if a linkto file is found on hashed subvolume, it may be
+ * pointing to valid cached node. So unlinking of linkto
+ * file on hashed subvolume is skipped and inside
+ * dht_lookup_everywhere_done, checks are performed. If this
+ * linkto file is found as stale linkto file, it is deleted
+ * otherwise unlink is skipped.
+ */
- local->op_ret = 0;
- local->op_errno = 0;
+ if (local->hashed_subvol && local->hashed_subvol == prev) {
+ local->skip_unlink.handle_valid_link = _gf_true;
+ local->skip_unlink.opend_fd_count = fd_count;
+ local->skip_unlink.hash_links_to = link_subvol;
+ gf_uuid_copy(local->skip_unlink.hashed_gfid, buf->ia_gfid);
+
+ gf_msg_debug(this->name, 0,
+ "Found"
+ " one linkto file on hashed subvol %s "
+ "for %s: Skipping unlinking till "
+ "everywhere_done",
+ prev->name, loc->path);
+
+ } else if (!ret && (fd_count == 0)) {
+ dict_req = dict_new();
+
+ ret = dht_fill_dict_to_avoid_unlink_of_migrating_file(dict_req);
+
+ if (ret) {
+ /* Skip unlinking for dict_failure
+ *File is found as a linkto file on non-hashed,
+ *subvolume. In the current implementation,
+ *finding a linkto-file on non-hashed does not
+ *always implies that it is stale. So deletion
+ *of file should be done only when both fd is
+ *closed and linkto-xattr is set. In case of
+ *dict_set failure, avoid skipping of file.
+ *NOTE: dht_frame_return should get called for
+ * this block.
+ */
- ret = dht_layout_preset (frame->this, cached_subvol,
- local->inode);
- if (ret < 0) {
- gf_msg (this->name, GF_LOG_INFO, 0,
- DHT_MSG_LAYOUT_PRESET_FAILED,
- "Failed to set layout for subvol %s"
- ", gfid = %s",
- cached_subvol ? cached_subvol->name :
- "<nil>", gfid);
- local->op_ret = -1;
- local->op_errno = EINVAL;
- }
+ dict_unref(dict_req);
+
+ } else {
+ gf_msg(this->name, GF_LOG_INFO, 0, DHT_MSG_SUBVOL_INFO,
+ "attempting deletion of stale linkfile "
+ "%s on %s (hashed subvol is %s)",
+ loc->path, prev->name,
+ (local->hashed_subvol ? local->hashed_subvol->name
+ : "<null>"));
+ /* *
+ * These stale files may be created using root
+ * user. Hence deletion will work only with
+ * root.
+ */
+ FRAME_SU_DO(frame, dht_local_t);
+ STACK_WIND(frame, dht_lookup_unlink_cbk, prev,
+ prev->fops->unlink, loc, 0, dict_req);
- if (local->loc.parent) {
- dht_inode_ctx_time_update (local->loc.parent, this,
- &local->postparent, 1);
- }
+ dict_unref(dict_req);
- DHT_STRIP_PHASE1_FLAGS (&local->stbuf);
- DHT_STACK_UNWIND (lookup, frame, local->op_ret,
- local->op_errno, local->inode,
- &local->stbuf, local->xattr,
- &local->postparent);
return 0;
+ }
}
+ }
- gf_msg_debug (this->name, 0,
- "Creating linkto file on %s(hash) to %s on %s (gfid = %s)",
- hashed_subvol->name, local->loc.path,
- cached_subvol->name, gfid);
+ this_call_cnt = dht_frame_return(frame);
+ if (is_last_call(this_call_cnt)) {
+ dht_lookup_everywhere_done(frame, this);
+ }
- ret = dht_linkfile_create (frame,
- dht_lookup_linkfile_create_cbk, this,
- cached_subvol, hashed_subvol, &local->loc);
-
- return ret;
-
-unwind_hashed_and_cached:
- DHT_STRIP_PHASE1_FLAGS (&local->stbuf);
- DHT_STACK_UNWIND (lookup, frame, local->op_ret, local->op_errno,
- local->loc.inode, &local->stbuf, local->xattr,
- &local->postparent);
- return 0;
+out:
+ return ret;
}
int
-dht_lookup_everywhere_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
- int32_t op_ret, int32_t op_errno,
- inode_t *inode, struct iatt *buf, dict_t *xattr,
- struct iatt *postparent)
-{
- dht_local_t *local = NULL;
- int this_call_cnt = 0;
- call_frame_t *prev = NULL;
- int is_linkfile = 0;
- int is_dir = 0;
- xlator_t *subvol = NULL;
- loc_t *loc = NULL;
- xlator_t *link_subvol = NULL;
- int ret = -1;
- int32_t fd_count = 0;
- dht_conf_t *conf = NULL;
- char gfid[GF_UUID_BUF_SIZE] = {0};
- dict_t *dict_req = {0};
-
- GF_VALIDATE_OR_GOTO ("dht", frame, out);
- GF_VALIDATE_OR_GOTO ("dht", this, out);
- GF_VALIDATE_OR_GOTO ("dht", frame->local, out);
- GF_VALIDATE_OR_GOTO ("dht", cookie, out);
- GF_VALIDATE_OR_GOTO ("dht", this->private, out);
-
- local = frame->local;
- loc = &local->loc;
- conf = this->private;
-
- prev = cookie;
- subvol = prev->this;
-
- gf_msg_debug (this->name, 0,
- "returned with op_ret %d and op_errno %d (%s) "
- "from subvol %s", op_ret, op_errno, loc->path,
- subvol->name);
-
- LOCK (&frame->lock);
- {
- if (op_ret == -1) {
- if (op_errno != ENOENT)
- local->op_errno = op_errno;
- goto unlock;
- }
-
- if (gf_uuid_is_null (local->gfid))
- gf_uuid_copy (local->gfid, buf->ia_gfid);
-
- gf_uuid_unparse(local->gfid, gfid);
-
- if (gf_uuid_compare (local->gfid, buf->ia_gfid)) {
- gf_msg (this->name, GF_LOG_WARNING, 0,
- DHT_MSG_GFID_MISMATCH,
- "%s: gfid differs on subvolume %s,"
- " gfid local = %s, gfid node = %s",
- loc->path, prev->this->name, gfid,
- uuid_utoa(buf->ia_gfid));
- }
-
- is_linkfile = check_is_linkfile (inode, buf, xattr,
- conf->link_xattr_name);
-
- if (is_linkfile) {
- link_subvol = dht_linkfile_subvol (this, inode, buf,
- xattr);
- gf_msg_debug (this->name, 0,
- "found on %s linkfile %s (-> %s)",
- subvol->name, loc->path,
- link_subvol ? link_subvol->name : "''");
- goto unlock;
- }
-
- is_dir = check_is_dir (inode, buf, xattr);
-
- /* non linkfile GFID takes precedence but don't overwrite
- gfid if we have already found a cached file*/
- if (!local->cached_subvol)
- gf_uuid_copy (local->gfid, buf->ia_gfid);
-
- if (is_dir) {
- local->dir_count++;
-
- gf_msg_debug (this->name, 0,
- "found on %s directory %s",
- subvol->name, loc->path);
- } else {
- local->file_count++;
-
- gf_msg_debug (this->name, 0,
- "found cached file on %s for %s",
- subvol->name, loc->path);
-
- if (!local->cached_subvol) {
- /* found one file */
- dht_iatt_merge (this, &local->stbuf, buf,
- subvol);
- local->xattr = dict_ref (xattr);
- local->cached_subvol = subvol;
-
- gf_msg_debug (this->name, 0,
- "storing cached on %s file"
- " %s", subvol->name, loc->path);
-
- dht_iatt_merge (this, &local->postparent,
- postparent, subvol);
-
- gf_uuid_copy (local->skip_unlink.cached_gfid,
- buf->ia_gfid);
- } else {
- /* This is where we need 'rename' both entries logic */
- gf_msg (this->name, GF_LOG_WARNING, 0,
- DHT_MSG_FILE_ON_MULT_SUBVOL,
- "multiple subvolumes (%s and %s) have "
- "file %s (preferably rename the file "
- "in the backend,and do a fresh lookup)",
- local->cached_subvol->name,
- subvol->name, local->loc.path);
- }
- }
- }
-unlock:
- UNLOCK (&frame->lock);
-
- if (is_linkfile) {
- ret = dict_get_int32 (xattr, GLUSTERFS_OPEN_FD_COUNT, &fd_count);
-
- /* Any linkto file found on the non-hashed subvolume should
- * be unlinked (performed in the "else if" block below)
- *
- * But if a linkto file is found on hashed subvolume, it may be
- * pointing to vaild cached node. So unlinking of linkto
- * file on hashed subvolume is skipped and inside
- * dht_lookup_everywhere_done, checks are performed. If this
- * linkto file is found as stale linkto file, it is deleted
- * otherwise unlink is skipped.
- */
+dht_lookup_everywhere(call_frame_t *frame, xlator_t *this, loc_t *loc)
+{
+ dht_conf_t *conf = NULL;
+ dht_local_t *local = NULL;
+ int i = 0;
+ int call_cnt = 0;
- if (local->hashed_subvol && local->hashed_subvol == subvol) {
+ GF_VALIDATE_OR_GOTO("dht", frame, err);
+ GF_VALIDATE_OR_GOTO("dht", this, out);
+ GF_VALIDATE_OR_GOTO("dht", frame->local, out);
+ GF_VALIDATE_OR_GOTO("dht", this->private, out);
+ GF_VALIDATE_OR_GOTO("dht", loc, out);
- local->skip_unlink.handle_valid_link = _gf_true;
- local->skip_unlink.opend_fd_count = fd_count;
- local->skip_unlink.hash_links_to = link_subvol;
- gf_uuid_copy (local->skip_unlink.hashed_gfid,
- buf->ia_gfid);
+ conf = this->private;
+ local = frame->local;
- gf_msg_debug (this->name, 0, "Found"
- " one linkto file on hashed subvol %s "
- "for %s: Skipping unlinking till "
- "everywhere_done", subvol->name,
- loc->path);
+ call_cnt = conf->subvolume_cnt;
+ local->call_cnt = call_cnt;
- } else if (!ret && (fd_count == 0)) {
+ if (!local->inode)
+ local->inode = inode_ref(loc->inode);
- dict_req = dict_new ();
+ gf_msg_debug(this->name, 0, "winding lookup call to %d subvols", call_cnt);
- ret = dht_fill_dict_to_avoid_unlink_of_migrating_file
- (dict_req);
+ for (i = 0; i < call_cnt; i++) {
+ STACK_WIND_COOKIE(frame, dht_lookup_everywhere_cbk, conf->subvolumes[i],
+ conf->subvolumes[i],
+ conf->subvolumes[i]->fops->lookup, loc,
+ local->xattr_req);
+ }
- if (ret) {
+ return 0;
+out:
+ DHT_STACK_UNWIND(lookup, frame, -1, EINVAL, NULL, NULL, NULL, NULL);
+err:
+ return -1;
+}
- /* Skip unlinking for dict_failure
- *File is found as a linkto file on non-hashed,
- *subvolume. In the current implementation,
- *finding a linkto-file on non-hashed does not
- *always implies that it is stale. So deletion
- *of file should be done only when both fd is
- *closed and linkto-xattr is set. In case of
- *dict_set failure, avoid skipping of file.
- *NOTE: dht_frame_return should get called for
- * this block.
- */
+int
+dht_lookup_linkfile_cbk(call_frame_t *frame, void *cookie, xlator_t *this,
+ int op_ret, int op_errno, inode_t *inode,
+ struct iatt *stbuf, dict_t *xattr,
+ struct iatt *postparent)
+{
+ xlator_t *prev = NULL;
+ dht_local_t *local = NULL;
+ xlator_t *subvol = NULL;
+ loc_t *loc = NULL;
+ dht_conf_t *conf = NULL;
+ int ret = 0;
+ char gfid[GF_UUID_BUF_SIZE] = {0};
+
+ GF_VALIDATE_OR_GOTO("dht", frame, out);
+ GF_VALIDATE_OR_GOTO("dht", this, unwind);
+ GF_VALIDATE_OR_GOTO("dht", frame->local, unwind);
+ GF_VALIDATE_OR_GOTO("dht", this->private, unwind);
+ GF_VALIDATE_OR_GOTO("dht", cookie, unwind);
+
+ prev = cookie;
+ subvol = prev;
+ conf = this->private;
+ local = frame->local;
+ loc = &local->loc;
+
+ gf_uuid_unparse(loc->gfid, gfid);
+
+ if (op_ret == -1) {
+ gf_msg(this->name, GF_LOG_INFO, op_errno, DHT_MSG_LINK_FILE_LOOKUP_INFO,
+ "Lookup of %s on %s (following linkfile) failed "
+ ",gfid = %s",
+ local->loc.path, subvol->name, gfid);
+
+ /* If cached subvol returned ENOTCONN, do not do
+ lookup_everywhere. We need to make sure linkfile does not get
+ removed, which can take away the namespace, and subvol is
+ anyways down. */
+
+ local->cached_subvol = NULL;
+ if (op_errno != ENOTCONN)
+ goto err;
+ else
+ goto unwind;
+ }
+
+ if (check_is_dir(inode, stbuf, xattr)) {
+ gf_msg(this->name, GF_LOG_INFO, 0, DHT_MSG_LINK_FILE_LOOKUP_INFO,
+ "Lookup of %s on %s (following linkfile) reached dir,"
+ " gfid = %s",
+ local->loc.path, subvol->name, gfid);
+ goto err;
+ }
+
+ if (check_is_linkfile(inode, stbuf, xattr, conf->link_xattr_name)) {
+ gf_msg(this->name, GF_LOG_INFO, 0, DHT_MSG_LINK_FILE_LOOKUP_INFO,
+ "lookup of %s on %s (following linkfile) reached link,"
+ "gfid = %s",
+ local->loc.path, subvol->name, gfid);
+ goto err;
+ }
+
+ if (gf_uuid_compare(local->gfid, stbuf->ia_gfid)) {
+ gf_msg(this->name, GF_LOG_WARNING, 0, DHT_MSG_GFID_MISMATCH,
+ "%s: gfid different on data file on %s,"
+ " gfid local = %s, gfid node = %s ",
+ local->loc.path, subvol->name, gfid, uuid_utoa(stbuf->ia_gfid));
+ goto err;
+ }
+
+ if ((stbuf->ia_nlink == 1) && (conf && conf->unhashed_sticky_bit)) {
+ stbuf->ia_prot.sticky = 1;
+ }
+
+ ret = dht_layout_preset(this, prev, inode);
+ if (ret < 0) {
+ gf_msg(this->name, GF_LOG_INFO, 0, DHT_MSG_LAYOUT_PRESET_FAILED,
+ "Failed to set layout for subvolume %s,"
+ "gfid = %s",
+ prev->name, gfid);
+ op_ret = -1;
+ op_errno = EINVAL;
+ }
+
+ if (local->loc.parent) {
+ dht_inode_ctx_time_update(local->loc.parent, this, postparent, 1);
+ }
- dict_unref (dict_req);
+unwind:
+ DHT_STRIP_PHASE1_FLAGS(stbuf);
+ dht_set_fixed_dir_stat(postparent);
+ DHT_STACK_UNWIND(lookup, frame, op_ret, op_errno, inode, stbuf, xattr,
+ postparent);
- } else {
- gf_msg (this->name, GF_LOG_INFO, 0,
- DHT_MSG_SUBVOL_INFO,
- "attempting deletion of stale linkfile "
- "%s on %s (hashed subvol is %s)",
- loc->path, subvol->name,
- (local->hashed_subvol?
- local->hashed_subvol->name : "<null>"));
+ return 0;
- STACK_WIND (frame, dht_lookup_unlink_cbk,
- subvol, subvol->fops->unlink, loc,
- 0, dict_req);
+err:
+ dht_lookup_everywhere(frame, this, loc);
+out:
+ return 0;
+}
- dict_unref (dict_req);
+/* Code to get hashed subvol based on inode and loc
+ First it check if loc->parent and loc->path exist then it get
+ hashed subvol based on loc.
+*/
- return 0;
- }
- }
+static gf_boolean_t
+dht_should_lookup_everywhere(xlator_t *this, dht_conf_t *conf, loc_t *loc)
+{
+ dht_layout_t *parent_layout = NULL;
+ int ret = 0;
+ gf_boolean_t lookup_everywhere = _gf_true;
+
+ /* lookup-optimize supersedes lookup-unhashed settings.
+ * If it is set, do not process search_unhashed
+ * If lookup-optimize if enabled, lookup everywhere if:
+ * - this is the rebalance daemon.
+ * - loc->parent is unavailable.
+ * - parent_layout is unavailable
+ * - parent_layout->commit_hash != conf->vol_commit_hash
+ */
+
+ if (conf->lookup_optimize) {
+ if (!conf->defrag && loc->parent) {
+ ret = dht_inode_ctx_layout_get(loc->parent, this, &parent_layout);
+ if (!ret && parent_layout &&
+ (parent_layout->commit_hash == conf->vol_commit_hash)) {
+ lookup_everywhere = _gf_false;
+ }
}
+ goto out;
+ } else {
+ if (conf->search_unhashed == GF_DHT_LOOKUP_UNHASHED_AUTO) {
+ if (loc->parent) {
+ ret = dht_inode_ctx_layout_get(loc->parent, this,
+ &parent_layout);
+ if (ret || !parent_layout ||
+ (!parent_layout->search_unhashed)) {
+ lookup_everywhere = _gf_false;
+ }
+ } else {
+ lookup_everywhere = _gf_false;
+ }
- this_call_cnt = dht_frame_return (frame);
- if (is_last_call (this_call_cnt)) {
- dht_lookup_everywhere_done (frame, this);
+ goto out;
}
-
+ }
out:
- return ret;
+ return lookup_everywhere;
}
-
int
-dht_lookup_everywhere (call_frame_t *frame, xlator_t *this, loc_t *loc)
+dht_lookup_cbk(call_frame_t *frame, void *cookie, xlator_t *this, int op_ret,
+ int op_errno, inode_t *inode, struct iatt *stbuf, dict_t *xattr,
+ struct iatt *postparent)
{
- dht_conf_t *conf = NULL;
- dht_local_t *local = NULL;
- int i = 0;
- int call_cnt = 0;
-
- GF_VALIDATE_OR_GOTO ("dht", frame, err);
- GF_VALIDATE_OR_GOTO ("dht", this, out);
- GF_VALIDATE_OR_GOTO ("dht", frame->local, out);
- GF_VALIDATE_OR_GOTO ("dht", this->private, out);
- GF_VALIDATE_OR_GOTO ("dht", loc, out);
-
- conf = this->private;
- local = frame->local;
-
- call_cnt = conf->subvolume_cnt;
- local->call_cnt = call_cnt;
+ char is_linkfile = 0;
+ char is_dir = 0;
+ xlator_t *subvol = NULL;
+ dht_conf_t *conf = NULL;
+ dht_local_t *local = NULL;
+ loc_t *loc = NULL;
+ xlator_t *prev = NULL;
+ int ret = 0;
+ uint32_t vol_commit_hash = 0;
+
+ GF_VALIDATE_OR_GOTO("dht", frame, err);
+ GF_VALIDATE_OR_GOTO("dht", this, out);
+ GF_VALIDATE_OR_GOTO("dht", frame->local, out);
+ GF_VALIDATE_OR_GOTO("dht", cookie, out);
+ GF_VALIDATE_OR_GOTO("dht", this->private, out);
+
+ conf = this->private;
+
+ prev = cookie;
+ local = frame->local;
+ loc = &local->loc;
+
+ gf_msg_debug(this->name, op_errno,
+ "%s: fresh_lookup on %s returned with op_ret %d", loc->path,
+ prev->name, op_ret);
+
+ if (op_ret == -1) {
+ if (ENTRY_MISSING(op_ret, op_errno)) {
+ if (1 == conf->subvolume_cnt) {
+ /* No need to lookup again */
+ goto out;
+ }
- if (!local->inode)
- local->inode = inode_ref (loc->inode);
+ gf_msg_debug(this->name, 0, "Entry %s missing on subvol %s",
+ loc->path, prev->name);
- gf_msg_debug (this->name, 0,
- "winding lookup call to %d subvols", call_cnt);
+ if (dht_should_lookup_everywhere(this, conf, loc)) {
+ local->op_errno = ENOENT;
+ dht_lookup_everywhere(frame, this, loc);
+ return 0;
+ }
- for (i = 0; i < call_cnt; i++) {
- STACK_WIND (frame, dht_lookup_everywhere_cbk,
- conf->subvolumes[i],
- conf->subvolumes[i]->fops->lookup,
- loc, local->xattr_req);
+ } else {
+ /* posix returns ENODATA if the gfid is not set but the client and
+ * server protocol layers do not send the stbuf. We need to
+ * heal this so check if this is a directory on the other subvols.
+ */
+ if ((op_errno == ENOTCONN) || (op_errno == ENODATA)) {
+ dht_lookup_directory(frame, this, &local->loc);
+ return 0;
+ }
}
+ gf_msg_debug(this->name, op_errno, "%s: Lookup on subvolume %s failed",
+ loc->path, prev->name);
+ goto out;
+ }
+
+ /* Lookup succeeded - op_ret = 0 */
+
+ /* This is required for handling stale linkfile deletion,
+ * or any more call which happens from this 'loc'.
+ */
+ if (gf_uuid_is_null(local->gfid)) {
+ /*This is set from the first successful response*/
+ memcpy(local->gfid, stbuf->ia_gfid, 16);
+ }
+
+ if (!conf->vch_forced) {
+ /* Update the commit hash in conf if it is found */
+ ret = dict_get_uint32(xattr, conf->commithash_xattr_name,
+ &vol_commit_hash);
+ if (ret == 0) {
+ conf->vol_commit_hash = vol_commit_hash;
+ }
+ }
+ is_dir = check_is_dir(inode, stbuf, xattr);
+ if (is_dir) {
+ /* A directory is present on all subvols, send the lookup to
+ * all subvols now */
+ local->inode = inode_ref(inode);
+ local->xattr = dict_ref(xattr);
+ dht_lookup_directory(frame, this, &local->loc);
return 0;
-out:
- DHT_STACK_UNWIND (lookup, frame, -1, EINVAL, NULL, NULL, NULL, NULL);
-err:
- return -1;
-}
+ }
+ is_linkfile = check_is_linkfile(inode, stbuf, xattr, conf->link_xattr_name);
-int
-dht_lookup_linkfile_cbk (call_frame_t *frame, void *cookie,
- xlator_t *this, int op_ret, int op_errno,
- inode_t *inode, struct iatt *stbuf, dict_t *xattr,
- struct iatt *postparent)
-{
- call_frame_t *prev = NULL;
- dht_local_t *local = NULL;
- xlator_t *subvol = NULL;
- loc_t *loc = NULL;
- dht_conf_t *conf = NULL;
- int ret = 0;
- char gfid[GF_UUID_BUF_SIZE] = {0};
-
- GF_VALIDATE_OR_GOTO ("dht", frame, out);
- GF_VALIDATE_OR_GOTO ("dht", this, unwind);
- GF_VALIDATE_OR_GOTO ("dht", frame->local, unwind);
- GF_VALIDATE_OR_GOTO ("dht", this->private, unwind);
- GF_VALIDATE_OR_GOTO ("dht", cookie, unwind);
-
- prev = cookie;
- subvol = prev->this;
- conf = this->private;
- local = frame->local;
- loc = &local->loc;
-
- gf_uuid_unparse(loc->gfid, gfid);
+ if (!is_linkfile) {
+ /* non-directory and not a linkto file. This is a data file
+ * Update the layout to point to the cached subvol
+ */
- if (op_ret == -1) {
- gf_msg (this->name, GF_LOG_INFO, op_errno,
- DHT_MSG_LINK_FILE_LOOKUP_INFO,
- "Lookup of %s on %s (following linkfile) failed "
- ",gfid = %s", local->loc.path, subvol->name, gfid);
-
- /* If cached subvol returned ENOTCONN, do not do
- lookup_everywhere. We need to make sure linkfile does not get
- removed, which can take away the namespace, and subvol is
- anyways down. */
-
- if (op_errno != ENOTCONN)
- goto err;
- else
- goto unwind;
+ ret = dht_layout_preset(this, prev, inode);
+ if (ret < 0) {
+ gf_msg(this->name, GF_LOG_INFO, 0, DHT_MSG_LAYOUT_PRESET_FAILED,
+ "%s: could not set pre-set layout for subvolume %s",
+ loc->path, prev->name);
+ op_ret = -1;
+ op_errno = EINVAL;
+ goto out;
}
+ goto out;
+ }
- if (check_is_dir (inode, stbuf, xattr)) {
- gf_msg (this->name, GF_LOG_INFO, 0,
- DHT_MSG_LINK_FILE_LOOKUP_INFO,
- "Lookup of %s on %s (following linkfile) reached dir,"
- " gfid = %s", local->loc.path, subvol->name, gfid);
- goto err;
- }
+ /* This is a linkto file. Get the value of the target subvol from the
+ * linkto xattr and lookup there to see if the file exists
+ */
+ subvol = dht_linkfile_subvol(this, inode, stbuf, xattr);
+ if (!subvol) {
+ gf_msg(this->name, GF_LOG_INFO, 0, DHT_MSG_SUBVOL_INFO,
+ "%s: No link subvol for linkto", loc->path);
+ dht_lookup_everywhere(frame, this, loc);
+ return 0;
+ }
- if (check_is_linkfile (inode, stbuf, xattr, conf->link_xattr_name)) {
- gf_msg (this->name, GF_LOG_INFO, 0,
- DHT_MSG_LINK_FILE_LOOKUP_INFO,
- "lookup of %s on %s (following linkfile) reached link,"
- "gfid = %s", local->loc.path, subvol->name, gfid);
- goto err;
- }
+ gf_msg_debug(this->name, 0, "%s: Calling lookup on linkto target %s",
+ loc->path, subvol->name);
- if (gf_uuid_compare (local->gfid, stbuf->ia_gfid)) {
- gf_msg (this->name, GF_LOG_WARNING, 0,
- DHT_MSG_GFID_MISMATCH,
- "%s: gfid different on data file on %s,"
- " gfid local = %s, gfid node = %s ",
- local->loc.path, subvol->name, gfid,
- uuid_utoa(stbuf->ia_gfid));
- goto err;
- }
+ STACK_WIND_COOKIE(frame, dht_lookup_linkfile_cbk, subvol, subvol,
+ subvol->fops->lookup, &local->loc, local->xattr_req);
- if ((stbuf->ia_nlink == 1)
- && (conf && conf->unhashed_sticky_bit)) {
- stbuf->ia_prot.sticky = 1;
- }
+ return 0;
- ret = dht_layout_preset (this, prev->this, inode);
- if (ret < 0) {
- gf_msg (this->name, GF_LOG_INFO, 0,
- DHT_MSG_LAYOUT_PRESET_FAILED,
- "Failed to set layout for subvolume %s,"
- "gfid = %s", prev->this->name, gfid);
- op_ret = -1;
- op_errno = EINVAL;
- }
+out:
+ /*
+ * FIXME: postparent->ia_size and postparent->st_blocks do not have
+ * correct values. since, postparent corresponds to a directory these
+ * two members should have values equal to sum of corresponding values
+ * from each of the subvolume. See dht_iatt_merge for reference.
+ */
+
+ if (!op_ret && local && local->loc.parent) {
+ dht_inode_ctx_time_update(local->loc.parent, this, postparent, 1);
+ }
+
+ DHT_STRIP_PHASE1_FLAGS(stbuf);
+ dht_set_fixed_dir_stat(postparent);
+ DHT_STACK_UNWIND(lookup, frame, op_ret, op_errno, inode, stbuf, xattr,
+ postparent);
+err:
+ return 0;
+}
- if (local->loc.parent) {
- dht_inode_ctx_time_update (local->loc.parent, this,
- postparent, 1);
- }
+/* For directories, check if acl xattrs have been requested (by the acl
+ * xlator), if not, request for them. These xattrs are needed for dht dir
+ * self-heal to perform proper self-healing of dirs
+ */
+static void
+dht_check_and_set_acl_xattr_req(xlator_t *this, dict_t *xattr_req)
+{
+ int ret = 0;
-unwind:
- DHT_STRIP_PHASE1_FLAGS (stbuf);
- DHT_STACK_UNWIND (lookup, frame, op_ret, op_errno, inode, stbuf, xattr,
- postparent);
+ GF_ASSERT(xattr_req);
- return 0;
+ if (!dict_get(xattr_req, POSIX_ACL_ACCESS_XATTR)) {
+ ret = dict_set_int8(xattr_req, POSIX_ACL_ACCESS_XATTR, 0);
+ if (ret)
+ gf_msg(this->name, GF_LOG_WARNING, -ret, DHT_MSG_DICT_SET_FAILED,
+ "Failed to set dictionary value:key = %s",
+ POSIX_ACL_ACCESS_XATTR);
+ }
-err:
- dht_lookup_everywhere (frame, this, loc);
-out:
- return 0;
-}
+ if (!dict_get(xattr_req, POSIX_ACL_DEFAULT_XATTR)) {
+ ret = dict_set_int8(xattr_req, POSIX_ACL_DEFAULT_XATTR, 0);
+ if (ret)
+ gf_msg(this->name, GF_LOG_WARNING, -ret, DHT_MSG_DICT_SET_FAILED,
+ "Failed to set dictionary value:key = %s",
+ POSIX_ACL_DEFAULT_XATTR);
+ }
+ return;
+}
-int
-dht_lookup_directory (call_frame_t *frame, xlator_t *this, loc_t *loc)
+/* for directories, we need the following info:
+ * the layout : trusted.glusterfs.dht
+ * the mds information : trusted.glusterfs.dht.mds
+ * the acl info: See above
+ */
+static int
+dht_set_dir_xattr_req(xlator_t *this, loc_t *loc, dict_t *xattr_req)
{
- int call_cnt = 0;
- int i = 0;
- dht_conf_t *conf = NULL;
- dht_local_t *local = NULL;
- int ret = 0;
-
- GF_VALIDATE_OR_GOTO ("dht", frame, out);
- GF_VALIDATE_OR_GOTO ("dht", this, unwind);
- GF_VALIDATE_OR_GOTO ("dht", frame->local, unwind);
- GF_VALIDATE_OR_GOTO ("dht", this->private, unwind);
- GF_VALIDATE_OR_GOTO ("dht", loc, unwind);
+ int ret = -EINVAL;
+ dht_conf_t *conf = NULL;
+
+ conf = this->private;
+ if (!conf) {
+ goto err;
+ }
+
+ if (!xattr_req) {
+ goto err;
+ }
+
+ /* Xattr to get the layout for a directory
+ */
+ ret = dict_set_uint32(xattr_req, conf->xattr_name, 4 * 4);
+ if (ret) {
+ gf_msg(this->name, GF_LOG_WARNING, ENOMEM, DHT_MSG_DICT_SET_FAILED,
+ "Failed to set dictionary value:key = %s for "
+ "path %s",
+ conf->xattr_name, loc->path);
+ goto err;
+ }
+
+ /*Non-fatal failure */
+ ret = dict_set_uint32(xattr_req, conf->mds_xattr_key, 4);
+ if (ret) {
+ gf_msg(this->name, GF_LOG_WARNING, ENOMEM, DHT_MSG_DICT_SET_FAILED,
+ "Failed to set dictionary value:key = %s for "
+ "path %s",
+ conf->mds_xattr_key, loc->path);
+ }
+
+ dht_check_and_set_acl_xattr_req(this, xattr_req);
+ ret = 0;
+err:
+ return ret;
+}
- conf = this->private;
- local = frame->local;
+/* If the hashed subvol is present, send the lookup to only that subvol first.
+ * If no hashed subvol, send a lookup to all subvols and proceed based on the
+ * responses.
+ */
+static int
+dht_do_fresh_lookup(call_frame_t *frame, xlator_t *this, loc_t *loc)
+{
+ int ret = -1;
+ dht_conf_t *conf = NULL;
+ xlator_t *hashed_subvol = NULL;
+ dht_local_t *local = NULL;
+ int op_errno = -1;
+ int call_cnt = 0;
+ int i = 0;
+
+ conf = this->private;
+ if (!conf) {
+ op_errno = EINVAL;
+ goto err;
+ }
+
+ local = frame->local;
+ if (!local) {
+ op_errno = EINVAL;
+ goto err;
+ }
+
+ /* Since we don't know whether this is a file or a directory,
+ * request all xattrs*/
+ ret = dht_set_file_xattr_req(this, loc, local->xattr_req);
+ if (ret) {
+ op_errno = -ret;
+ goto err;
+ }
+
+ ret = dht_set_dir_xattr_req(this, loc, local->xattr_req);
+ if (ret) {
+ op_errno = -ret;
+ goto err;
+ }
+
+ /* Fuse sets a random value in gfid-req. If the gfid is missing
+ * on one or more subvols, posix will set the gfid to this value,
+ * causing GFID mismatches for directories. Remove the value fuse
+ * has sent before sending the lookup.
+ */
+ ret = dict_get_gfuuid(local->xattr_req, "gfid-req", &local->gfid_req);
+ if (ret) {
+ gf_msg_debug(this->name, 0, "%s: No gfid-req available", loc->path);
+ } else {
+ dict_del(local->xattr_req, "gfid-req");
+ }
+ /* This should have been set in dht_lookup */
+ hashed_subvol = local->hashed_subvol;
+
+ if (!hashed_subvol) {
+ gf_msg_debug(this->name, 0,
+ "%s: no subvolume in layout for path, "
+ "checking on all the subvols to see if "
+ "it is a directory",
+ loc->path);
- call_cnt = conf->subvolume_cnt;
+ call_cnt = conf->subvolume_cnt;
local->call_cnt = call_cnt;
- local->layout = dht_layout_new (this, conf->subvolume_cnt);
+ /* Allocate a layout. This will be populated and saved in
+ * the dht inode_ctx on successful lookup
+ */
+ local->layout = dht_layout_new(this, conf->subvolume_cnt);
if (!local->layout) {
- goto unwind;
- }
-
- if (local->xattr != NULL) {
- dict_unref (local->xattr);
- local->xattr = NULL;
+ op_errno = ENOMEM;
+ goto err;
}
- if (!gf_uuid_is_null (local->gfid)) {
- ret = dict_set_static_bin (local->xattr_req, "gfid-req",
- local->gfid, 16);
- if (ret)
- gf_msg (this->name, GF_LOG_WARNING, 0,
- DHT_MSG_DICT_SET_FAILED,
- "%s: Failed to set dictionary value:"
- " key = gfid-req", local->loc.path);
- }
+ gf_msg_debug(this->name, 0,
+ "%s: Found null hashed subvol. Calling lookup"
+ " on all nodes.",
+ loc->path);
for (i = 0; i < call_cnt; i++) {
- STACK_WIND (frame, dht_lookup_dir_cbk,
- conf->subvolumes[i],
- conf->subvolumes[i]->fops->lookup,
- &local->loc, local->xattr_req);
+ STACK_WIND_COOKIE(frame, dht_lookup_dir_cbk, conf->subvolumes[i],
+ conf->subvolumes[i],
+ conf->subvolumes[i]->fops->lookup, &local->loc,
+ local->xattr_req);
}
return 0;
-unwind:
- DHT_STACK_UNWIND (lookup, frame, -1, ENOMEM, NULL, NULL, NULL, NULL);
-out:
- return 0;
+ }
-}
+ /* if the hashed_subvol is non-null, send the lookup there first so
+ * as to see whether we have a file or a directory */
+ gf_msg_debug(this->name, 0, "%s: Calling fresh lookup on %s", loc->path,
+ hashed_subvol->name);
+ STACK_WIND_COOKIE(frame, dht_lookup_cbk, hashed_subvol, hashed_subvol,
+ hashed_subvol->fops->lookup, loc, local->xattr_req);
+ return 0;
+err:
+ op_errno = (op_errno == -1) ? errno : op_errno;
+ DHT_STACK_UNWIND(lookup, frame, -1, op_errno, NULL, NULL, NULL, NULL);
+ return 0;
+}
-int
-dht_lookup_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
- int op_ret, int op_errno,
- inode_t *inode, struct iatt *stbuf, dict_t *xattr,
- struct iatt *postparent)
-{
- char is_linkfile = 0;
- char is_dir = 0;
- xlator_t *subvol = NULL;
- dht_conf_t *conf = NULL;
- dht_local_t *local = NULL;
- loc_t *loc = NULL;
- call_frame_t *prev = NULL;
- int ret = 0;
- dht_layout_t *parent_layout = NULL;
- uint32_t vol_commit_hash = 0;
-
- GF_VALIDATE_OR_GOTO ("dht", frame, err);
- GF_VALIDATE_OR_GOTO ("dht", this, out);
- GF_VALIDATE_OR_GOTO ("dht", frame->local, out);
- GF_VALIDATE_OR_GOTO ("dht", cookie, out);
- GF_VALIDATE_OR_GOTO ("dht", this->private, out);
-
- conf = this->private;
-
- prev = cookie;
- local = frame->local;
- loc = &local->loc;
+static int
+dht_do_revalidate(call_frame_t *frame, xlator_t *this, loc_t *loc)
+{
+ xlator_t *subvol = NULL;
+ xlator_t *mds_subvol = NULL;
+ dht_local_t *local = NULL;
+ dht_conf_t *conf = NULL;
+ int ret = -1;
+ int op_errno = -1;
+ dht_layout_t *layout = NULL;
+ int i = 0;
+ int call_cnt = 0;
+ int gen = 0;
+
+ conf = this->private;
+ if (!conf) {
+ op_errno = EINVAL;
+ goto err;
+ }
+
+ local = frame->local;
+ if (!local) {
+ op_errno = EINVAL;
+ goto err;
+ }
+
+ layout = local->layout;
+ if (!layout) {
+ gf_msg_debug(this->name, 0,
+ "path = %s. No layout found in the inode ctx.", loc->path);
+ op_errno = EINVAL;
+ goto err;
+ }
+
+ /* Generation number has changed. This layout may be stale. */
+ if (layout->gen && (layout->gen < conf->gen)) {
+ gen = layout->gen;
+ dht_layout_unref(this, local->layout);
+ local->layout = NULL;
+ local->cached_subvol = NULL;
+
+ gf_msg_debug(this->name, 0,
+ "path = %s. In memory layout may be stale."
+ "(layout->gen (%d) is less than "
+ "conf->gen (%d)). Calling fresh lookup.",
+ loc->path, gen, conf->gen);
+
+ dht_do_fresh_lookup(frame, this, loc);
+ return 0;
+ }
+
+ local->inode = inode_ref(loc->inode);
+
+ /* Since we don't know whether this has changed,
+ * request all xattrs*/
+ ret = dht_set_file_xattr_req(this, loc, local->xattr_req);
+ if (ret) {
+ op_errno = -ret;
+ goto err;
+ }
+
+ ret = dht_set_dir_xattr_req(this, loc, local->xattr_req);
+ if (ret) {
+ op_errno = -ret;
+ goto err;
+ }
+
+ if (IA_ISDIR(local->inode->ia_type)) {
+ ret = dht_inode_ctx_mdsvol_get(local->inode, this, &mds_subvol);
+ if (ret || !mds_subvol) {
+ gf_msg_debug(this->name, 0, "path = %s. No mds subvol in inode ctx",
+ local->loc.path);
+ }
+ local->mds_subvol = mds_subvol;
+ local->call_cnt = conf->subvolume_cnt;
- /* This is required for handling stale linkfile deletion,
- * or any more call which happens from this 'loc'.
+ /* local->call_cnt will change as responses are processed. Always use a
+ * local copy to loop through the STACK_WIND calls
*/
- if (!op_ret && gf_uuid_is_null (local->gfid))
- memcpy (local->gfid, stbuf->ia_gfid, 16);
-
- gf_msg_debug (this->name, op_errno,
- "fresh_lookup returned for %s with op_ret %d",
- loc->path, op_ret);
-
- if (!conf->vch_forced) {
- ret = dict_get_uint32 (xattr, conf->commithash_xattr_name,
- &vol_commit_hash);
- if (ret == 0) {
- conf->vol_commit_hash = vol_commit_hash;
- }
- }
-
- if (ENTRY_MISSING (op_ret, op_errno)) {
- gf_msg_debug (this->name, 0,
- "Entry %s missing on subvol %s",
- loc->path, prev->this->name);
-
- /* lookup-optimize supercedes lookup-unhashed settings,
- * - so if it is set, do not process search_unhashed
- * - except, in the case of rebalance deamon, we want to
- * force the lookup_everywhere behavior */
- if (!conf->defrag && conf->lookup_optimize && loc->parent) {
- ret = dht_inode_ctx_layout_get (loc->parent, this,
- &parent_layout);
- if (ret || !parent_layout ||
- (parent_layout->commit_hash !=
- conf->vol_commit_hash)) {
- gf_msg_debug (this->name, 0,
- "hashes don't match (ret - %d,"
- " parent_layout - %p, parent_hash - %x,"
- " vol_hash - %x), do global lookup",
- ret, parent_layout,
- (parent_layout ?
- parent_layout->commit_hash : -1),
- conf->vol_commit_hash);
- local->op_errno = ENOENT;
- dht_lookup_everywhere (frame, this, loc);
- return 0;
- }
- } else {
- if (conf->search_unhashed ==
- GF_DHT_LOOKUP_UNHASHED_ON) {
- local->op_errno = ENOENT;
- dht_lookup_everywhere (frame, this, loc);
- return 0;
- }
- if ((conf->search_unhashed ==
- GF_DHT_LOOKUP_UNHASHED_AUTO) &&
- (loc->parent)) {
- ret = dht_inode_ctx_layout_get (loc->parent,
- this,
- &parent_layout);
- if (ret || !parent_layout)
- goto out;
- if (parent_layout->search_unhashed) {
- local->op_errno = ENOENT;
- dht_lookup_everywhere (frame, this,
- loc);
- return 0;
- }
- }
- }
- }
-
- if (op_ret == 0) {
- is_dir = check_is_dir (inode, stbuf, xattr);
- if (is_dir) {
- local->inode = inode_ref (inode);
- local->xattr = dict_ref (xattr);
- }
- }
-
- if (is_dir || (op_ret == -1 && op_errno == ENOTCONN)) {
- dht_lookup_directory (frame, this, &local->loc);
- return 0;
- }
+ call_cnt = local->call_cnt;
- if (op_ret == -1) {
- gf_msg_debug (this->name, op_errno,
- "Lookup of %s for subvolume"
- " %s failed", loc->path,
- prev->this->name);
- goto out;
+ for (i = 0; i < call_cnt; i++) {
+ STACK_WIND_COOKIE(frame, dht_revalidate_cbk, conf->subvolumes[i],
+ conf->subvolumes[i],
+ conf->subvolumes[i]->fops->lookup, loc,
+ local->xattr_req);
}
+ return 0;
+ }
- is_linkfile = check_is_linkfile (inode, stbuf, xattr,
- conf->link_xattr_name);
+ /* If not a dir, this should be 1 */
+ local->call_cnt = layout->cnt;
+ call_cnt = local->call_cnt;
- if (!is_linkfile) {
- /* non-directory and not a linkfile */
+ for (i = 0; i < call_cnt; i++) {
+ subvol = layout->list[i].xlator;
- ret = dht_layout_preset (this, prev->this, inode);
- if (ret < 0) {
- gf_msg (this->name, GF_LOG_INFO, 0,
- DHT_MSG_LAYOUT_PRESET_FAILED,
- "could not set pre-set layout for subvolume %s",
- prev->this->name);
- op_ret = -1;
- op_errno = EINVAL;
- goto out;
- }
- goto out;
- }
+ gf_msg_debug(this->name, 0,
+ "path = %s. Calling "
+ "revalidate lookup on %s",
+ loc->path, subvol->name);
- subvol = dht_linkfile_subvol (this, inode, stbuf, xattr);
- if (!subvol) {
-
- gf_msg (this->name, GF_LOG_INFO, 0,
- DHT_MSG_SUBVOL_INFO, "linkfile not having link "
- "subvol for %s", loc->path);
+ STACK_WIND_COOKIE(frame, dht_revalidate_cbk, subvol, subvol,
+ subvol->fops->lookup, &local->loc, local->xattr_req);
+ }
+ return 0;
+err:
+ op_errno = (op_errno == -1) ? errno : op_errno;
+ DHT_STACK_UNWIND(lookup, frame, -1, op_errno, NULL, NULL, NULL, NULL);
+ return 0;
+}
- gf_msg_debug (this->name, 0,
- "linkfile not having link subvolume. path=%s",
- loc->path);
- dht_lookup_everywhere (frame, this, loc);
- return 0;
- }
+/* Depending on the input, decide if this is a:
+ * fresh-lookup: loc->name is provided but no dht inode ctx
+ * revalidation: loc->name is provided, dht inode ctx is present
+ * discover: gfid based nameless lookup.
+ */
- gf_msg_debug (this->name, 0,
- "Calling lookup on linkto target %s for path %s",
- subvol->name, loc->path);
+int
+dht_lookup(call_frame_t *frame, xlator_t *this, loc_t *loc, dict_t *xattr_req)
+{
+ xlator_t *hashed_subvol = NULL;
+ dht_local_t *local = NULL;
+ dht_conf_t *conf = NULL;
+ int ret = -1;
+ int op_errno = -1;
+ loc_t new_loc = {
+ 0,
+ };
+
+ VALIDATE_OR_GOTO(frame, err);
+ VALIDATE_OR_GOTO(this, err);
+ VALIDATE_OR_GOTO(loc, err);
+ VALIDATE_OR_GOTO(loc->inode, err);
+
+ conf = this->private;
+ if (!conf)
+ goto err;
+
+ local = dht_local_init(frame, loc, NULL, GF_FOP_LOOKUP);
+ if (!local) {
+ op_errno = ENOMEM;
+ goto err;
+ }
+
+ ret = dht_filter_loc_subvol_key(this, loc, &new_loc, &hashed_subvol);
+ if (ret) {
+ loc_wipe(&local->loc);
+ ret = loc_dup(&new_loc, &local->loc);
+
+ /* we no longer need 'new_loc' entries */
+ loc_wipe(&new_loc);
+
+ /* check if loc_dup() is successful */
+ if (ret == -1) {
+ op_errno = errno;
+ gf_msg_debug(this->name, errno,
+ "copying location failed for path=%s", loc->path);
+ goto err;
+ }
+ }
+
+ if (xattr_req) {
+ local->xattr_req = dict_ref(xattr_req);
+ } else {
+ local->xattr_req = dict_new();
+ }
+
+ /* Nameless lookup */
+
+ /* This is usually sent by NFS. Lookups are done based on the gfid and
+ * no name information is available. Without the name, dht cannot calculate
+ * the hash and has to send a lookup to all subvols.
+ */
+ if (gf_uuid_is_null(loc->pargfid) && !gf_uuid_is_null(loc->gfid) &&
+ !__is_root_gfid(loc->inode->gfid)) {
+ local->cached_subvol = NULL;
+ dht_do_discover(frame, this, loc);
+ return 0;
+ }
+
+ if (loc_is_root(loc)) {
+ /* Request the DHT commit hash xattr (trusted.glusterfs.dht.commithash)
+ * set on the brick root.
+ */
+ ret = dict_set_uint32(local->xattr_req, conf->commithash_xattr_name,
+ sizeof(uint32_t));
+ }
- STACK_WIND (frame, dht_lookup_linkfile_cbk,
- subvol, subvol->fops->lookup,
- &local->loc, local->xattr_req);
+ if (!hashed_subvol)
+ hashed_subvol = dht_subvol_get_hashed(this, loc);
+ local->hashed_subvol = hashed_subvol;
+ if (is_revalidate(loc)) {
+ /* The entry has been looked up before and has a dht inode_ctx
+ */
+ dht_do_revalidate(frame, this, loc);
return 0;
-
-out:
- /*
- * FIXME: postparent->ia_size and postparent->st_blocks do not have
- * correct values. since, postparent corresponds to a directory these
- * two members should have values equal to sum of corresponding values
- * from each of the subvolume. See dht_iatt_merge for reference.
+ } else {
+ /* Entry has not been looked up before
*/
+ dht_do_fresh_lookup(frame, this, loc);
+ return 0;
+ }
- if (!op_ret && local && local->loc.parent) {
- dht_inode_ctx_time_update (local->loc.parent, this,
- postparent, 1);
- }
-
- DHT_STRIP_PHASE1_FLAGS (stbuf);
- DHT_STACK_UNWIND (lookup, frame, op_ret, op_errno, inode, stbuf, xattr,
- postparent);
+ return 0;
err:
- return 0;
+ op_errno = (op_errno == -1) ? errno : op_errno;
+ DHT_STACK_UNWIND(lookup, frame, -1, op_errno, NULL, NULL, NULL, NULL);
+ return 0;
}
-/* For directories, check if acl xattrs have been requested (by the acl xlator),
- * if not, request for them. These xattrs are needed for dht dir self-heal to
- * perform proper self-healing of dirs
- */
-void
-dht_check_and_set_acl_xattr_req (inode_t *inode, dict_t *xattr_req)
+static int
+dht_unlink_linkfile_cbk(call_frame_t *frame, void *cookie, xlator_t *this,
+ int op_ret, int op_errno, struct iatt *preparent,
+ struct iatt *postparent, dict_t *xdata)
{
- int ret = 0;
-
- GF_ASSERT (inode);
- GF_ASSERT (xattr_req);
+ dht_local_t *local = NULL;
+ xlator_t *prev = NULL;
- if (inode->ia_type != IA_IFDIR)
- return;
+ local = frame->local;
+ prev = cookie;
- if (!dict_get (xattr_req, POSIX_ACL_ACCESS_XATTR)) {
- ret = dict_set_int8 (xattr_req, POSIX_ACL_ACCESS_XATTR, 0);
- if (ret)
- gf_msg (THIS->name, GF_LOG_WARNING, -ret,
- DHT_MSG_DICT_SET_FAILED,
- "Failed to set dictionary value:key = %s",
- POSIX_ACL_ACCESS_XATTR);
+ LOCK(&frame->lock);
+ {
+ if ((op_ret == -1) &&
+ !((op_errno == ENOENT) || (op_errno == ENOTCONN))) {
+ local->op_errno = op_errno;
+ UNLOCK(&frame->lock);
+ gf_msg_debug(this->name, op_errno,
+ "Unlink link: subvolume %s returned -1", prev->name);
+ goto post_unlock;
}
- if (!dict_get (xattr_req, POSIX_ACL_DEFAULT_XATTR)) {
- ret = dict_set_int8 (xattr_req, POSIX_ACL_DEFAULT_XATTR, 0);
- if (ret)
- gf_msg (THIS->name, GF_LOG_WARNING, -ret,
- DHT_MSG_DICT_SET_FAILED,
- "Failed to set dictionary value:key = %s",
- POSIX_ACL_DEFAULT_XATTR);
- }
-
- return;
+ local->op_ret = 0;
+ }
+ UNLOCK(&frame->lock);
+post_unlock:
+ dht_set_fixed_dir_stat(&local->preparent);
+ dht_set_fixed_dir_stat(&local->postparent);
+ DHT_STACK_UNWIND(unlink, frame, local->op_ret, local->op_errno,
+ &local->preparent, &local->postparent, xdata);
+
+ return 0;
}
-int
-dht_lookup (call_frame_t *frame, xlator_t *this,
- loc_t *loc, dict_t *xattr_req)
-{
- xlator_t *subvol = NULL;
- xlator_t *hashed_subvol = NULL;
- dht_local_t *local = NULL;
- dht_conf_t *conf = NULL;
- int ret = -1;
- int op_errno = -1;
- dht_layout_t *layout = NULL;
- int i = 0;
- int call_cnt = 0;
- loc_t new_loc = {0,};
-
- VALIDATE_OR_GOTO (frame, err);
- VALIDATE_OR_GOTO (this, err);
- VALIDATE_OR_GOTO (loc, err);
- VALIDATE_OR_GOTO (loc->inode, err);
-
- conf = this->private;
- if (!conf)
- goto err;
-
- local = dht_local_init (frame, loc, NULL, GF_FOP_LOOKUP);
- if (!local) {
- op_errno = ENOMEM;
- goto err;
- }
-
- ret = dht_filter_loc_subvol_key (this, loc, &new_loc,
- &hashed_subvol);
- if (ret) {
- loc_wipe (&local->loc);
- ret = loc_dup (&new_loc, &local->loc);
-
- /* we no more need 'new_loc' entries */
- loc_wipe (&new_loc);
-
- /* check if loc_dup() is successful */
- if (ret == -1) {
- op_errno = errno;
- gf_msg_debug (this->name, errno,
- "copying location failed for path=%s",
- loc->path);
- goto err;
- }
- }
-
- if (xattr_req) {
- local->xattr_req = dict_ref (xattr_req);
- } else {
- local->xattr_req = dict_new ();
- }
+static int
+dht_unlink_cbk(call_frame_t *frame, void *cookie, xlator_t *this, int op_ret,
+ int op_errno, struct iatt *preparent, struct iatt *postparent,
+ dict_t *xdata)
+{
+ dht_local_t *local = NULL;
+ xlator_t *prev = NULL;
+ xlator_t *hashed_subvol = NULL;
- if (gf_uuid_is_null (loc->pargfid) && !gf_uuid_is_null (loc->gfid) &&
- !__is_root_gfid (loc->inode->gfid)) {
- local->cached_subvol = NULL;
- dht_discover (frame, this, loc);
- return 0;
- }
+ local = frame->local;
+ prev = cookie;
- if (__is_root_gfid(loc->gfid)) {
- ret = dict_set_uint32 (local->xattr_req,
- conf->commithash_xattr_name,
- sizeof(uint32_t));
+ LOCK(&frame->lock);
+ {
+ if (op_ret == -1) {
+ if (op_errno != ENOENT) {
+ local->op_ret = -1;
+ local->op_errno = op_errno;
+ } else {
+ local->op_ret = 0;
+ }
+ UNLOCK(&frame->lock);
+ gf_msg_debug(this->name, op_errno,
+ "Unlink: subvolume %s returned -1", prev->name);
+ goto post_unlock;
}
- if (!hashed_subvol)
- hashed_subvol = dht_subvol_get_hashed (this, loc);
- local->hashed_subvol = hashed_subvol;
+ local->op_ret = 0;
- if (is_revalidate (loc)) {
- layout = local->layout;
- if (!layout) {
- gf_msg_debug (this->name, 0,
- "Revalidate lookup without cache."
- " path=%s", loc->path);
- op_errno = EINVAL;
- goto err;
- }
+ local->postparent = *postparent;
+ local->preparent = *preparent;
- if (layout->gen && (layout->gen < conf->gen)) {
- gf_msg_trace (this->name, 0,
- "incomplete layout failure for path=%s",
- loc->path);
+ if (local->loc.parent) {
+ dht_inode_ctx_time_update(local->loc.parent, this,
+ &local->preparent, 0);
+ dht_inode_ctx_time_update(local->loc.parent, this,
+ &local->postparent, 1);
+ }
+ }
+ UNLOCK(&frame->lock);
+post_unlock:
+ if (!local->op_ret) {
+ hashed_subvol = dht_subvol_get_hashed(this, &local->loc);
+ if (hashed_subvol && hashed_subvol != local->cached_subvol) {
+ /*
+ * If hashed and cached are different, then we need
+ * to unlink linkfile from hashed subvol if data
+ * file is deleted successfully
+ */
+ STACK_WIND_COOKIE(frame, dht_unlink_linkfile_cbk, hashed_subvol,
+ hashed_subvol, hashed_subvol->fops->unlink,
+ &local->loc, local->flags, xdata);
+ return 0;
+ }
+ }
+
+ dht_set_fixed_dir_stat(&local->preparent);
+ dht_set_fixed_dir_stat(&local->postparent);
+ DHT_STACK_UNWIND(unlink, frame, local->op_ret, local->op_errno,
+ &local->preparent, &local->postparent, xdata);
+
+ return 0;
+}
- dht_layout_unref (this, local->layout);
- local->layout = NULL;
- local->cached_subvol = NULL;
+static int
+dht_common_setxattr_cbk(call_frame_t *frame, void *cookie, xlator_t *this,
+ int32_t op_ret, int32_t op_errno, dict_t *xdata)
+{
+ DHT_STACK_UNWIND(setxattr, frame, op_ret, op_errno, xdata);
+ return 0;
+}
- gf_msg_debug(this->name, 0,
- "Called revalidate lookup for %s, "
- "but layout->gen (%d) is less than "
- "conf->gen (%d), calling fresh_lookup",
- loc->path, layout->gen, conf->gen);
+static int
+dht_fix_layout_setxattr_cbk(call_frame_t *frame, void *cookie, xlator_t *this,
+ int32_t op_ret, int32_t op_errno, dict_t *xdata)
+{
+ dht_local_t *local = NULL;
+ dht_layout_t *layout = NULL;
- goto do_fresh_lookup;
- }
+ if (op_ret == 0) {
+ /* update the layout in the inode ctx */
+ local = frame->local;
+ layout = local->selfheal.layout;
- local->inode = inode_ref (loc->inode);
+ dht_layout_set(this, local->loc.inode, layout);
+ }
- ret = dict_set_uint32 (local->xattr_req,
- conf->xattr_name, 4 * 4);
- if (ret) {
- gf_msg (this->name, GF_LOG_WARNING, ENOMEM,
- DHT_MSG_DICT_SET_FAILED,
- "Failed to set dictionary value:key = %s for "
- "path %s", conf->xattr_name, loc->path);
- goto err;
- }
- /* need it in case file is not found on cached file
- * on revalidate path and we may encounter linkto files on
- * with dht_lookup_everywhere*/
- ret = dict_set_uint32 (local->xattr_req,
- conf->link_xattr_name, 256);
- if (ret < 0) {
- gf_msg (this->name, GF_LOG_WARNING, ENOMEM,
- DHT_MSG_DICT_SET_FAILED,
- "Failed to set dictionary value:key = %s for "
- "path %s", conf->link_xattr_name, loc->path);
- goto err;
- }
- if (IA_ISDIR (local->inode->ia_type)) {
- local->call_cnt = call_cnt = conf->subvolume_cnt;
- for (i = 0; i < call_cnt; i++) {
- STACK_WIND (frame, dht_revalidate_cbk,
- conf->subvolumes[i],
- conf->subvolumes[i]->fops->lookup,
- loc, local->xattr_req);
- }
- return 0;
- }
+ DHT_STACK_UNWIND(setxattr, frame, op_ret, op_errno, xdata);
+ return 0;
+}
- call_cnt = local->call_cnt = layout->cnt;
+static int
+dht_err_cbk(call_frame_t *frame, void *cookie, xlator_t *this, int op_ret,
+ int op_errno, dict_t *xdata)
+{
+ dht_local_t *local = NULL;
+ int this_call_cnt = 0;
+ xlator_t *prev = NULL;
- /* need it for self-healing linkfiles which is
- 'in-migration' state */
- ret = dict_set_uint32 (local->xattr_req,
- GLUSTERFS_OPEN_FD_COUNT, 4);
- if (ret) {
- gf_msg (this->name, GF_LOG_WARNING, ENOMEM,
- DHT_MSG_DICT_SET_FAILED,
- "Failed to set dictionary value:key = %s for "
- "path %s", GLUSTERFS_OPEN_FD_COUNT, loc->path);
- goto err;
- }
- /* need it for dir self-heal */
- dht_check_and_set_acl_xattr_req (loc->inode, local->xattr_req);
+ local = frame->local;
+ prev = cookie;
- for (i = 0; i < call_cnt; i++) {
- subvol = layout->list[i].xlator;
+ LOCK(&frame->lock);
+ {
+ if (op_ret == -1) {
+ local->op_errno = op_errno;
+ UNLOCK(&frame->lock);
+ gf_msg_debug(this->name, op_errno, "subvolume %s returned -1",
+ prev->name);
+ goto post_unlock;
+ }
- gf_msg_debug (this->name, 0, "calling "
- "revalidate lookup for %s at %s",
- loc->path, subvol->name);
+ local->op_ret = 0;
+ }
+ UNLOCK(&frame->lock);
+post_unlock:
+ this_call_cnt = dht_frame_return(frame);
+ if (is_last_call(this_call_cnt)) {
+ if ((local->fop == GF_FOP_SETXATTR) ||
+ (local->fop == GF_FOP_FSETXATTR)) {
+ DHT_STACK_UNWIND(setxattr, frame, local->op_ret, local->op_errno,
+ NULL);
+ /* 'local' itself may not be valid after this */
+ goto out;
+ }
+ if ((local->fop == GF_FOP_REMOVEXATTR) ||
+ (local->fop == GF_FOP_FREMOVEXATTR)) {
+ DHT_STACK_UNWIND(removexattr, frame, local->op_ret, local->op_errno,
+ NULL);
+ }
+ }
- STACK_WIND (frame, dht_revalidate_cbk,
- subvol, subvol->fops->lookup,
- &local->loc, local->xattr_req);
+out:
+ return 0;
+}
- }
- } else {
- do_fresh_lookup:
- /* TODO: remove the hard-coding */
- ret = dict_set_uint32 (local->xattr_req,
- conf->xattr_name, 4 * 4);
- if (ret) {
- gf_msg (this->name, GF_LOG_WARNING, ENOMEM,
- DHT_MSG_DICT_SET_FAILED,
- "Failed to set dictionary value:key = %s for "
- "path %s", conf->xattr_name, loc->path);
- goto err;
- }
+/* Set the value[] of key into dict after convert from
+ host byte order to network byte order
+*/
+int32_t
+dht_dict_set_array(dict_t *dict, char *key, int32_t value[], int32_t size)
+{
+ int ret = -1;
+ int32_t *ptr = NULL;
+ int32_t vindex;
+
+ if (value == NULL) {
+ return -EINVAL;
+ }
+
+ ptr = GF_MALLOC(sizeof(int32_t) * size, gf_common_mt_char);
+ if (ptr == NULL) {
+ return -ENOMEM;
+ }
+ for (vindex = 0; vindex < size; vindex++) {
+ ptr[vindex] = hton32(value[vindex]);
+ }
+ ret = dict_set_bin(dict, key, ptr, sizeof(int32_t) * size);
+ if (ret)
+ GF_FREE(ptr);
+ return ret;
+}
- ret = dict_set_uint32 (local->xattr_req,
- conf->link_xattr_name, 256);
- if (ret) {
- gf_msg (this->name, GF_LOG_WARNING, ENOMEM,
- DHT_MSG_DICT_SET_FAILED,
- "Failed to set dictionary value:key = %s for "
- "path %s", conf->link_xattr_name, loc->path);
- goto err;
- }
- /* need it for self-healing linkfiles which is
- 'in-migration' state */
- ret = dict_set_uint32 (local->xattr_req,
- GLUSTERFS_OPEN_FD_COUNT, 4);
- if (ret) {
- gf_msg (this->name, GF_LOG_WARNING, ENOMEM,
- DHT_MSG_DICT_SET_FAILED,
- "Failed to set dictionary value:key = %s for "
- "path %s", GLUSTERFS_OPEN_FD_COUNT, loc->path);
- goto err;
- }
- /* need it for dir self-heal */
- dht_check_and_set_acl_xattr_req (loc->inode, local->xattr_req);
+static int
+dht_common_mds_xattrop_cbk(call_frame_t *frame, void *cookie, xlator_t *this,
+ int32_t op_ret, int32_t op_errno, dict_t *dict,
+ dict_t *xdata)
+{
+ dht_local_t *local = NULL;
+ call_frame_t *prev = cookie;
- if (!hashed_subvol) {
+ local = frame->local;
- gf_msg_debug (this->name, 0,
- "no subvolume in layout for path=%s, "
- "checking on all the subvols to see if "
- "it is a directory", loc->path);
+ if (op_ret)
+ gf_msg_debug(this->name, op_errno, "subvolume %s returned -1",
+ prev->this->name);
- call_cnt = conf->subvolume_cnt;
- local->call_cnt = call_cnt;
+ if (local->fop == GF_FOP_SETXATTR) {
+ DHT_STACK_UNWIND(setxattr, frame, 0, op_errno, local->xdata);
+ /* 'local' itself may not be valid after this */
+ goto out;
+ }
- local->layout = dht_layout_new (this,
- conf->subvolume_cnt);
- if (!local->layout) {
- op_errno = ENOMEM;
- goto err;
- }
+ if (local->fop == GF_FOP_FSETXATTR) {
+ DHT_STACK_UNWIND(fsetxattr, frame, 0, op_errno, local->xdata);
+ /* 'local' itself may not be valid after this */
+ goto out;
+ }
- gf_msg_debug (this->name, 0,
- "Found null hashed subvol. Calling lookup"
- " on all nodes.");
+ if (local->fop == GF_FOP_REMOVEXATTR) {
+ DHT_STACK_UNWIND(removexattr, frame, 0, op_errno, NULL);
+ /* 'local' itself may not be valid after this */
+ goto out;
+ }
- for (i = 0; i < call_cnt; i++) {
- STACK_WIND (frame, dht_lookup_dir_cbk,
- conf->subvolumes[i],
- conf->subvolumes[i]->fops->lookup,
- &local->loc, local->xattr_req);
- }
- return 0;
- }
+ if (local->fop == GF_FOP_FREMOVEXATTR) {
+ DHT_STACK_UNWIND(fremovexattr, frame, 0, op_errno, NULL);
+ }
- gf_msg_debug (this->name, 0, "Calling fresh lookup for %s on"
- " %s", loc->path, hashed_subvol->name);
+out:
+ return 0;
+}
- STACK_WIND (frame, dht_lookup_cbk,
- hashed_subvol, hashed_subvol->fops->lookup,
- loc, local->xattr_req);
+/* Code to wind a xattrop call to add 1 on current mds internal xattr
+ value
+*/
+static int
+dht_setxattr_non_mds_cbk(call_frame_t *frame, void *cookie, xlator_t *this,
+ int op_ret, int op_errno, dict_t *xdata)
+{
+ dht_local_t *local = NULL;
+ int this_call_cnt = 0;
+ int ret = 0;
+ dict_t *xattrop = NULL;
+ int32_t addone[1] = {1};
+ call_frame_t *prev = NULL;
+ dht_conf_t *conf = NULL;
+
+ local = frame->local;
+ prev = cookie;
+ conf = this->private;
+
+ LOCK(&frame->lock);
+ {
+ if (op_ret && !local->op_ret) {
+ local->op_ret = op_ret;
+ local->op_errno = op_errno;
+ UNLOCK(&frame->lock);
+ gf_msg_debug(this->name, op_errno, "subvolume %s returned -1",
+ prev->this->name);
+ goto post_unlock;
}
+ }
+ UNLOCK(&frame->lock);
+post_unlock:
+ this_call_cnt = dht_frame_return(frame);
- return 0;
-
-err:
- op_errno = (op_errno == -1) ? errno : op_errno;
- DHT_STACK_UNWIND (lookup, frame, -1, op_errno, NULL, NULL, NULL,
- NULL);
- return 0;
-}
+ if (is_last_call(this_call_cnt)) {
+ if (!local->op_ret) {
+ xattrop = dict_new();
+ if (!xattrop) {
+ gf_msg(this->name, GF_LOG_ERROR, DHT_MSG_NO_MEMORY, 0,
+ "dictionary creation failed");
+ ret = -1;
+ goto out;
+ }
+ ret = dht_dict_set_array(xattrop, conf->mds_xattr_key, addone, 1);
+ if (ret != 0) {
+ gf_msg(this->name, GF_LOG_ERROR, 0, DHT_MSG_DICT_SET_FAILED,
+ "dictionary set array failed ");
+ ret = -1;
+ goto out;
+ }
+ if ((local->fop == GF_FOP_SETXATTR) ||
+ (local->fop == GF_FOP_REMOVEXATTR)) {
+ STACK_WIND(frame, dht_common_mds_xattrop_cbk, local->mds_subvol,
+ local->mds_subvol->fops->xattrop, &local->loc,
+ GF_XATTROP_ADD_ARRAY, xattrop, NULL);
+ } else {
+ STACK_WIND(frame, dht_common_mds_xattrop_cbk, local->mds_subvol,
+ local->mds_subvol->fops->fxattrop, local->fd,
+ GF_XATTROP_ADD_ARRAY, xattrop, NULL);
+ }
+ } else {
+ if (local->fop == GF_FOP_SETXATTR) {
+ DHT_STACK_UNWIND(setxattr, frame, 0, 0, local->xdata);
+ /* 'local' itself may not be valid after this */
+ goto just_return;
+ }
-int
-dht_unlink_linkfile_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
- int op_ret, int op_errno, struct iatt *preparent,
- struct iatt *postparent, dict_t *xdata)
-{
- dht_local_t *local = NULL;
- call_frame_t *prev = NULL;
+ if (local->fop == GF_FOP_FSETXATTR) {
+ DHT_STACK_UNWIND(fsetxattr, frame, 0, 0, local->xdata);
+ /* 'local' itself may not be valid after this */
+ goto just_return;
+ }
- local = frame->local;
- prev = cookie;
-
- LOCK (&frame->lock);
- {
- if ((op_ret == -1) && !((op_errno == ENOENT) ||
- (op_errno == ENOTCONN))) {
- local->op_errno = op_errno;
- gf_msg_debug (this->name, op_errno,
- "Unlink link: subvolume %s"
- " returned -1",
- prev->this->name);
- goto unlock;
- }
+ if (local->fop == GF_FOP_REMOVEXATTR) {
+ DHT_STACK_UNWIND(removexattr, frame, 0, 0, NULL);
+ /* 'local' itself may not be valid after this */
+ goto just_return;
+ }
- local->op_ret = 0;
+ if (local->fop == GF_FOP_FREMOVEXATTR) {
+ DHT_STACK_UNWIND(fremovexattr, frame, 0, 0, NULL);
+ /* 'local' itself may not be valid after this */
+ goto just_return;
+ }
+ }
+ }
+out:
+ if (ret) {
+ if (local->fop == GF_FOP_SETXATTR) {
+ DHT_STACK_UNWIND(setxattr, frame, 0, 0, local->xdata);
+ /* 'local' itself may not be valid after this */
+ goto just_return;
}
-unlock:
- UNLOCK (&frame->lock);
+ if (local->fop == GF_FOP_FSETXATTR) {
+ DHT_STACK_UNWIND(fsetxattr, frame, 0, 0, local->xdata);
+ /* 'local' itself may not be valid after this */
+ goto just_return;
+ }
- DHT_STACK_UNWIND (unlink, frame, local->op_ret, local->op_errno,
- &local->preparent, &local->postparent, xdata);
+ if (local->fop == GF_FOP_REMOVEXATTR) {
+ DHT_STACK_UNWIND(removexattr, frame, 0, 0, NULL);
+ /* 'local' itself may not be valid after this */
+ goto just_return;
+ }
- return 0;
+ if (local->fop == GF_FOP_FREMOVEXATTR) {
+ DHT_STACK_UNWIND(fremovexattr, frame, 0, 0, NULL);
+ }
+ }
+just_return:
+ if (xattrop)
+ dict_unref(xattrop);
+ return 0;
}
-int
-dht_unlink_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
- int op_ret, int op_errno, struct iatt *preparent,
- struct iatt *postparent, dict_t *xdata)
+static int
+dht_setxattr_mds_cbk(call_frame_t *frame, void *cookie, xlator_t *this,
+ int op_ret, int op_errno, dict_t *xdata)
{
- dht_local_t *local = NULL;
- call_frame_t *prev = NULL;
- xlator_t *hashed_subvol = NULL;
-
- local = frame->local;
- prev = cookie;
-
- LOCK (&frame->lock);
- {
- if (op_ret == -1) {
- if (op_errno != ENOENT) {
- local->op_ret = -1;
- local->op_errno = op_errno;
- } else {
- local->op_ret = 0;
- }
- gf_msg_debug (this->name, op_errno,
- "Unlink: subvolume %s returned -1",
- prev->this->name);
- goto unlock;
- }
+ dht_local_t *local = NULL;
+ dht_conf_t *conf = NULL;
+ call_frame_t *prev = NULL;
+ xlator_t *mds_subvol = NULL;
+ int i = 0;
+
+ local = frame->local;
+ prev = cookie;
+ conf = this->private;
+ mds_subvol = local->mds_subvol;
+
+ if (op_ret == -1) {
+ local->op_ret = op_ret;
+ local->op_errno = op_errno;
+ gf_msg_debug(this->name, op_errno, "subvolume %s returned -1",
+ prev->this->name);
+ goto out;
+ }
- local->op_ret = 0;
+ local->op_ret = 0;
+ local->call_cnt = conf->subvolume_cnt - 1;
+ local->xdata = dict_ref(xdata);
- local->postparent = *postparent;
- local->preparent = *preparent;
+ for (i = 0; i < conf->subvolume_cnt; i++) {
+ if (mds_subvol && (mds_subvol == conf->subvolumes[i]))
+ continue;
+ if (local->fop == GF_FOP_SETXATTR) {
+ STACK_WIND(frame, dht_setxattr_non_mds_cbk, conf->subvolumes[i],
+ conf->subvolumes[i]->fops->setxattr, &local->loc,
+ local->xattr, local->flags, local->xattr_req);
+ }
- if (local->loc.parent) {
- dht_inode_ctx_time_update (local->loc.parent, this,
- &local->preparent, 0);
- dht_inode_ctx_time_update (local->loc.parent, this,
- &local->postparent, 1);
- }
+ if (local->fop == GF_FOP_FSETXATTR) {
+ STACK_WIND(frame, dht_setxattr_non_mds_cbk, conf->subvolumes[i],
+ conf->subvolumes[i]->fops->fsetxattr, local->fd,
+ local->xattr, local->flags, local->xattr_req);
}
-unlock:
- UNLOCK (&frame->lock);
- if (!local->op_ret) {
- hashed_subvol = dht_subvol_get_hashed (this, &local->loc);
- if (hashed_subvol &&
- hashed_subvol != local->cached_subvol) {
- /*
- * If hashed and cached are different, then we need
- * to unlink linkfile from hashed subvol if data
- * file is deleted successfully
- */
- STACK_WIND (frame, dht_unlink_linkfile_cbk,
- hashed_subvol,
- hashed_subvol->fops->unlink, &local->loc,
- local->flags, xdata);
- return 0;
- }
+ if (local->fop == GF_FOP_REMOVEXATTR) {
+ STACK_WIND(frame, dht_setxattr_non_mds_cbk, conf->subvolumes[i],
+ conf->subvolumes[i]->fops->removexattr, &local->loc,
+ local->key, local->xattr_req);
}
- DHT_STACK_UNWIND (unlink, frame, local->op_ret, local->op_errno,
- &local->preparent, &local->postparent, xdata);
+ if (local->fop == GF_FOP_FREMOVEXATTR) {
+ STACK_WIND(frame, dht_setxattr_non_mds_cbk, conf->subvolumes[i],
+ conf->subvolumes[i]->fops->fremovexattr, local->fd,
+ local->key, local->xattr_req);
+ }
+ }
- return 0;
+ return 0;
+out:
+ if (local->fop == GF_FOP_SETXATTR) {
+ DHT_STACK_UNWIND(setxattr, frame, local->op_ret, local->op_errno,
+ xdata);
+ /* 'local' itself may not be valid after this */
+ goto just_return;
+ }
+
+ if (local->fop == GF_FOP_FSETXATTR) {
+ DHT_STACK_UNWIND(fsetxattr, frame, local->op_ret, local->op_errno,
+ xdata);
+ /* 'local' itself may not be valid after this */
+ goto just_return;
+ }
+
+ if (local->fop == GF_FOP_REMOVEXATTR) {
+ DHT_STACK_UNWIND(removexattr, frame, local->op_ret, local->op_errno,
+ NULL);
+ /* 'local' itself may not be valid after this */
+ goto just_return;
+ }
+
+ if (local->fop == GF_FOP_FREMOVEXATTR) {
+ DHT_STACK_UNWIND(fremovexattr, frame, local->op_ret, local->op_errno,
+ NULL);
+ }
+
+just_return:
+ return 0;
}
-int
-dht_err_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
- int op_ret, int op_errno, dict_t *xdata)
+static int
+dht_xattrop_mds_cbk(call_frame_t *frame, void *cookie, xlator_t *this,
+ int op_ret, int op_errno, dict_t *dict, dict_t *xdata)
{
- dht_local_t *local = NULL;
- int this_call_cnt = 0;
- call_frame_t *prev = NULL;
+ dht_local_t *local = NULL;
+ call_frame_t *prev = NULL;
- local = frame->local;
- prev = cookie;
-
- LOCK (&frame->lock);
- {
- if (op_ret == -1) {
- local->op_errno = op_errno;
- gf_msg_debug (this->name, op_errno,
- "subvolume %s returned -1",
- prev->this->name);
- goto unlock;
- }
-
- local->op_ret = 0;
- }
-unlock:
- UNLOCK (&frame->lock);
+ local = frame->local;
+ prev = cookie;
- this_call_cnt = dht_frame_return (frame);
- if (is_last_call (this_call_cnt)) {
- DHT_STACK_UNWIND (setxattr, frame, local->op_ret,
- local->op_errno, NULL);
- }
-
- return 0;
+ if (op_ret == -1) {
+ local->op_errno = op_errno;
+ local->op_ret = op_ret;
+ gf_msg_debug(this->name, op_errno, "subvolume %s returned -1",
+ prev->this->name);
+ goto out;
+ }
+
+ if (local->fop == GF_FOP_SETXATTR) {
+ STACK_WIND(frame, dht_setxattr_mds_cbk, local->mds_subvol,
+ local->mds_subvol->fops->setxattr, &local->loc, local->xattr,
+ local->flags, local->xattr_req);
+ }
+
+ if (local->fop == GF_FOP_FSETXATTR) {
+ STACK_WIND(frame, dht_setxattr_mds_cbk, local->mds_subvol,
+ local->mds_subvol->fops->fsetxattr, local->fd, local->xattr,
+ local->flags, local->xattr_req);
+ }
+
+ if (local->fop == GF_FOP_REMOVEXATTR) {
+ STACK_WIND(frame, dht_setxattr_mds_cbk, local->mds_subvol,
+ local->mds_subvol->fops->removexattr, &local->loc,
+ local->key, local->xattr_req);
+ }
+
+ if (local->fop == GF_FOP_FREMOVEXATTR) {
+ STACK_WIND(frame, dht_setxattr_mds_cbk, local->mds_subvol,
+ local->mds_subvol->fops->fremovexattr, local->fd, local->key,
+ local->xattr_req);
+ }
+
+ return 0;
+out:
+ if (local->fop == GF_FOP_SETXATTR) {
+ DHT_STACK_UNWIND(setxattr, frame, local->op_ret, local->op_errno,
+ xdata);
+ /* 'local' itself may not be valid after this */
+ goto just_return;
+ }
+
+ if (local->fop == GF_FOP_FSETXATTR) {
+ DHT_STACK_UNWIND(fsetxattr, frame, local->op_ret, local->op_errno,
+ xdata);
+ /* 'local' itself may not be valid after this */
+ goto just_return;
+ }
+
+ if (local->fop == GF_FOP_REMOVEXATTR) {
+ DHT_STACK_UNWIND(removexattr, frame, local->op_ret, local->op_errno,
+ NULL);
+ /* 'local' itself may not be valid after this */
+ goto just_return;
+ }
+
+ if (local->fop == GF_FOP_FREMOVEXATTR) {
+ DHT_STACK_UNWIND(fremovexattr, frame, local->op_ret, local->op_errno,
+ NULL);
+ }
+
+just_return:
+ return 0;
}
static void
-fill_layout_info (dht_layout_t *layout, char *buf)
+fill_layout_info(dht_layout_t *layout, char *buf)
{
- int i = 0;
- char tmp_buf[128] = {0,};
-
- for (i = 0; i < layout->cnt; i++) {
- snprintf (tmp_buf, 128, "(%s %u %u)",
- layout->list[i].xlator->name,
- layout->list[i].start,
- layout->list[i].stop);
- if (i)
- strcat (buf, " ");
- strcat (buf, tmp_buf);
- }
+ int i = 0;
+ char tmp_buf[128] = {
+ 0,
+ };
+
+ for (i = 0; i < layout->cnt; i++) {
+ snprintf(tmp_buf, sizeof(tmp_buf), "(%s %u %u)",
+ layout->list[i].xlator->name, layout->list[i].start,
+ layout->list[i].stop);
+ if (i)
+ strcat(buf, " ");
+ strcat(buf, tmp_buf);
+ }
}
-void
-dht_fill_pathinfo_xattr (xlator_t *this, dht_local_t *local,
- char *xattr_buf, int32_t alloc_len,
- int flag, char *layout_buf)
-{
- if (flag && local->xattr_val)
- snprintf (xattr_buf, alloc_len,
- "((<"DHT_PATHINFO_HEADER"%s> %s) (%s-layout %s))",
- this->name, local->xattr_val, this->name,
- layout_buf);
- else if (local->xattr_val)
- snprintf (xattr_buf, alloc_len,
- "(<"DHT_PATHINFO_HEADER"%s> %s)",
- this->name, local->xattr_val);
- else if (flag)
- snprintf (xattr_buf, alloc_len, "(%s-layout %s)",
- this->name, layout_buf);
+static void
+dht_fill_pathinfo_xattr(xlator_t *this, dht_local_t *local, char *xattr_buf,
+ int32_t alloc_len, int flag, char *layout_buf)
+{
+ if (flag) {
+ if (local->xattr_val) {
+ snprintf(xattr_buf, alloc_len,
+ "((<" DHT_PATHINFO_HEADER "%s> %s) (%s-layout %s))",
+ this->name, local->xattr_val, this->name, layout_buf);
+ } else {
+ snprintf(xattr_buf, alloc_len, "(%s-layout %s)", this->name,
+ layout_buf);
+ }
+ } else if (local->xattr_val) {
+ snprintf(xattr_buf, alloc_len, "(<" DHT_PATHINFO_HEADER "%s> %s)",
+ this->name, local->xattr_val);
+ } else {
+ xattr_buf[0] = '\0';
+ }
}
-int
-dht_vgetxattr_alloc_and_fill (dht_local_t *local, dict_t *xattr, xlator_t *this,
- int op_errno)
+static int
+dht_vgetxattr_alloc_and_fill(dht_local_t *local, dict_t *xattr, xlator_t *this,
+ int op_errno)
{
- int ret = -1;
- char *value = NULL;
- int32_t plen = 0;
+ int ret = -1;
+ char *value = NULL;
- ret = dict_get_str (xattr, local->xsel, &value);
- if (ret) {
- gf_msg (this->name, GF_LOG_ERROR, op_errno,
- DHT_MSG_GET_XATTR_FAILED,
- "Subvolume %s returned -1", this->name);
- local->op_ret = -1;
- local->op_errno = op_errno;
- goto out;
- }
+ ret = dict_get_str(xattr, local->xsel, &value);
+ if (ret) {
+ gf_msg(this->name, GF_LOG_ERROR, op_errno, DHT_MSG_GET_XATTR_FAILED,
+ "Subvolume %s returned -1", this->name);
+ local->op_ret = -1;
+ local->op_errno = op_errno;
+ goto out;
+ }
- local->alloc_len += strlen(value);
+ local->alloc_len += strlen(value);
+ if (!local->xattr_val) {
+ local->alloc_len += (SLEN(DHT_PATHINFO_HEADER) + 10);
+ local->xattr_val = GF_MALLOC(local->alloc_len, gf_common_mt_char);
if (!local->xattr_val) {
- local->alloc_len += (strlen (DHT_PATHINFO_HEADER) + 10);
- local->xattr_val = GF_CALLOC (local->alloc_len, sizeof (char),
- gf_common_mt_char);
- if (!local->xattr_val) {
- ret = -1;
- goto out;
- }
+ ret = -1;
+ goto out;
}
+ local->xattr_val[0] = '\0';
+ }
- if (local->xattr_val) {
- plen = strlen (local->xattr_val);
- if (plen) {
- /* extra byte(s) for \0 to be safe */
- local->alloc_len += (plen + 2);
- local->xattr_val = GF_REALLOC (local->xattr_val,
- local->alloc_len);
- if (!local->xattr_val) {
- ret = -1;
- goto out;
- }
- }
-
- (void) strcat (local->xattr_val, value);
- (void) strcat (local->xattr_val, " ");
- local->op_ret = 0;
+ int plen = strlen(local->xattr_val);
+ if (plen) {
+ /* extra byte(s) for \0 to be safe */
+ local->alloc_len += (plen + 2);
+ local->xattr_val = GF_REALLOC(local->xattr_val, local->alloc_len);
+ if (!local->xattr_val) {
+ ret = -1;
+ goto out;
}
+ }
- ret = 0;
+ (void)strcat(local->xattr_val, value);
+ (void)strcat(local->xattr_val, " ");
+ local->op_ret = 0;
- out:
- return ret;
+ ret = 0;
+
+out:
+ return ret;
}
-int
-dht_vgetxattr_fill_and_set (dht_local_t *local, dict_t **dict, xlator_t *this,
- gf_boolean_t flag)
+static int
+dht_vgetxattr_fill_and_set(dht_local_t *local, dict_t **dict, xlator_t *this,
+ gf_boolean_t flag)
{
- int ret = -1;
- char *xattr_buf = NULL;
- char layout_buf[8192] = {0,};
-
- if (flag)
- fill_layout_info (local->layout, layout_buf);
+ int ret = -1;
+ char *xattr_buf = NULL;
+ char layout_buf[8192] = {
+ 0,
+ };
- *dict = dict_new ();
- if (!*dict)
- goto out;
+ if (flag)
+ fill_layout_info(local->layout, layout_buf);
- local->xattr_val[strlen (local->xattr_val) - 1] = '\0';
+ *dict = dict_new();
+ if (!*dict)
+ goto out;
- /* we would need max this many bytes to create xattr string
- * extra 40 bytes is just an estimated amount of additional
- * space required as we include translator name and some
- * spaces, brackets etc. when forming the pathinfo string.
- *
- * For node-uuid we just don't have all the pretty formatting,
- * but since this is a generic routine for pathinfo & node-uuid
- * we dont have conditional space allocation and try to be
- * generic
- */
- local->alloc_len += (2 * strlen (this->name))
- + strlen (layout_buf)
- + 40;
- xattr_buf = GF_CALLOC (local->alloc_len, sizeof (char),
- gf_common_mt_char);
- if (!xattr_buf)
- goto out;
+ local->xattr_val[strlen(local->xattr_val) - 1] = '\0';
+
+ /* we would need max this many bytes to create xattr string
+ * extra 40 bytes is just an estimated amount of additional
+ * space required as we include translator name and some
+ * spaces, brackets etc. when forming the pathinfo string.
+ *
+ * For node-uuid we just don't have all the pretty formatting,
+ * but since this is a generic routine for pathinfo & node-uuid
+ * we don't have conditional space allocation and try to be
+ * generic
+ */
+ local->alloc_len += (2 * strlen(this->name)) + strlen(layout_buf) + 40;
+ xattr_buf = GF_MALLOC(local->alloc_len, gf_common_mt_char);
+ if (!xattr_buf)
+ goto out;
- if (XATTR_IS_PATHINFO (local->xsel)) {
- (void) dht_fill_pathinfo_xattr (this, local, xattr_buf,
- local->alloc_len, flag,
- layout_buf);
- } else if (XATTR_IS_NODE_UUID (local->xsel)) {
- (void) snprintf (xattr_buf, local->alloc_len, "%s",
- local->xattr_val);
- } else {
- gf_msg (this->name, GF_LOG_WARNING, 0,
- DHT_MSG_GET_XATTR_FAILED,
- "Unknown local->xsel (%s)", local->xsel);
- GF_FREE (xattr_buf);
- goto out;
- }
+ if (XATTR_IS_PATHINFO(local->xsel)) {
+ (void)dht_fill_pathinfo_xattr(this, local, xattr_buf, local->alloc_len,
+ flag, layout_buf);
+ } else if ((XATTR_IS_NODE_UUID(local->xsel)) ||
+ (XATTR_IS_NODE_UUID_LIST(local->xsel))) {
+ (void)snprintf(xattr_buf, local->alloc_len, "%s", local->xattr_val);
+ } else {
+ gf_msg(this->name, GF_LOG_WARNING, 0, DHT_MSG_GET_XATTR_FAILED,
+ "Unknown local->xsel (%s)", local->xsel);
+ GF_FREE(xattr_buf);
+ goto out;
+ }
- ret = dict_set_dynstr (*dict, local->xsel, xattr_buf);
- if (ret)
- GF_FREE (xattr_buf);
- GF_FREE (local->xattr_val);
+ ret = dict_set_dynstr(*dict, local->xsel, xattr_buf);
+ if (ret)
+ GF_FREE(xattr_buf);
+ GF_FREE(local->xattr_val);
- out:
- return ret;
+out:
+ return ret;
}
-int
-dht_find_local_subvol_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
- int op_ret, int op_errno, dict_t *xattr,
- dict_t *xdata)
-{
- dht_local_t *local = NULL;
- dht_conf_t *conf = NULL;
- call_frame_t *prev = NULL;
- int this_call_cnt = 0;
- int ret = 0;
- char *uuid_str = NULL;
- char *uuid_list = NULL;
- char *next_uuid_str = NULL;
- char *saveptr = NULL;
- uuid_t node_uuid = {0,};
+static int
+dht_find_local_subvol_cbk(call_frame_t *frame, void *cookie, xlator_t *this,
+ int op_ret, int op_errno, dict_t *xattr,
+ dict_t *xdata)
+{
+ dht_local_t *local = NULL;
+ dht_conf_t *conf = NULL;
+ xlator_t *prev = NULL;
+ int this_call_cnt = 0;
+ int ret = 0;
+ char *uuid_str = NULL;
+ char *uuid_list = NULL;
+ char *next_uuid_str = NULL;
+ char *saveptr = NULL;
+ uuid_t node_uuid = {
+ 0,
+ };
+ char *uuid_list_copy = NULL;
+ int count = 0;
+ int i = 0;
+ int index = 0;
+ int found = 0;
+ nodeuuid_info_t *tmp_ptr = NULL;
+
+ VALIDATE_OR_GOTO(frame, out);
+ VALIDATE_OR_GOTO(frame->local, out);
+
+ local = frame->local;
+ prev = cookie;
+ conf = this->private;
+
+ VALIDATE_OR_GOTO(conf->defrag, out);
+
+ gf_msg_debug(this->name, 0, "subvol %s returned", prev->name);
+
+ LOCK(&frame->lock);
+ {
+ this_call_cnt = --local->call_cnt;
+ if (op_ret < 0) {
+ local->op_ret = -1;
+ local->op_errno = op_errno;
+ UNLOCK(&frame->lock);
+ if (op_errno == ENODATA)
+ gf_msg_debug(this->name, 0, "failed to get node-uuid");
+ else
+ gf_msg(this->name, GF_LOG_ERROR, op_errno,
+ DHT_MSG_GET_XATTR_FAILED, "failed to get node-uuid");
+ goto post_unlock;
+ }
- VALIDATE_OR_GOTO (frame, out);
- VALIDATE_OR_GOTO (frame->local, out);
+ ret = dict_get_str(xattr, local->xsel, &uuid_list);
- local = frame->local;
- prev = cookie;
- conf = this->private;
-
- LOCK (&frame->lock);
- {
- this_call_cnt = --local->call_cnt;
- if (op_ret < 0) {
- gf_msg (this->name, GF_LOG_ERROR, op_errno,
- DHT_MSG_GET_XATTR_FAILED,
- "getxattr err for dir");
- local->op_ret = -1;
- local->op_errno = op_errno;
- goto unlock;
- }
+ if (ret < 0) {
+ gf_msg(this->name, GF_LOG_ERROR, 0, DHT_MSG_DICT_GET_FAILED,
+ "Failed to get %s", local->xsel);
+ local->op_ret = -1;
+ local->op_errno = EINVAL;
+ goto unlock;
+ }
- ret = dict_get_str (xattr, local->xsel, &uuid_list);
+ /* As DHT will not know details of its child xlators
+ * we need to parse this twice to get the count first
+ * and allocate memory later.
+ */
+ count = 0;
+ index = conf->local_subvols_cnt;
- if (ret < 0) {
- gf_msg (this->name, GF_LOG_ERROR, 0,
- DHT_MSG_DICT_GET_FAILED,
- "Failed to get %s", local->xsel);
- local->op_ret = -1;
- local->op_errno = EINVAL;
- goto unlock;
- }
+ uuid_list_copy = gf_strdup(uuid_list);
+ if (!uuid_list_copy)
+ goto unlock;
- for (uuid_str = strtok_r (uuid_list, " ", &saveptr);
- uuid_str;
- uuid_str = next_uuid_str) {
-
- next_uuid_str = strtok_r (NULL, " ", &saveptr);
- if (gf_uuid_parse (uuid_str, node_uuid)) {
- gf_msg (this->name, GF_LOG_ERROR, 0,
- DHT_MSG_UUID_PARSE_ERROR,
- "Failed to parse uuid"
- " failed for %s", prev->this->name);
- local->op_ret = -1;
- local->op_errno = EINVAL;
- goto unlock;
- }
+ for (uuid_str = strtok_r(uuid_list, " ", &saveptr); uuid_str;
+ uuid_str = next_uuid_str) {
+ next_uuid_str = strtok_r(NULL, " ", &saveptr);
+ if (gf_uuid_parse(uuid_str, node_uuid)) {
+ local->op_ret = -1;
+ local->op_errno = EINVAL;
+ UNLOCK(&frame->lock);
+ gf_msg(this->name, GF_LOG_ERROR, 0, DHT_MSG_UUID_PARSE_ERROR,
+ "Failed to parse uuid for %s", prev->name);
+ goto post_unlock;
+ }
- if (gf_uuid_compare (node_uuid, conf->defrag->node_uuid)) {
- gf_msg_debug (this->name, 0, "subvol %s does not"
- "belong to this node",
- prev->this->name);
- } else {
- conf->local_subvols[(conf->local_subvols_cnt)++]
- = prev->this;
- gf_msg_debug (this->name, 0, "subvol %s belongs to"
- " this node", prev->this->name);
- break;
- }
- }
+ count++;
+ if (gf_uuid_compare(node_uuid, conf->defrag->node_uuid)) {
+ gf_msg_debug(this->name, 0,
+ "subvol %s does not"
+ "belong to this node",
+ prev->name);
+ } else {
+ /* handle multiple bricks of the same replica
+ * on the same node */
+ if (found)
+ continue;
+ conf->local_subvols[(conf->local_subvols_cnt)++] = prev;
+ found = 1;
+ gf_msg_debug(this->name, 0,
+ "subvol %s belongs to"
+ " this node",
+ prev->name);
+ }
}
- local->op_ret = 0;
- unlock:
- UNLOCK (&frame->lock);
-
- if (!is_last_call (this_call_cnt))
- goto out;
-
- if (local->op_ret == -1) {
- goto unwind;
+ if (!found) {
+ local->op_ret = 0;
+ goto unlock;
}
- DHT_STACK_UNWIND (getxattr, frame, 0, 0, xattr, xdata);
- goto out;
+ conf->local_nodeuuids[index].count = count;
+ conf->local_nodeuuids[index].elements = GF_CALLOC(
+ count, sizeof(nodeuuid_info_t), 1);
- unwind:
- DHT_STACK_UNWIND (getxattr, frame, -1, local->op_errno, NULL, xdata);
- out:
- return 0;
-}
-
-int
-dht_vgetxattr_dir_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
- int op_ret, int op_errno, dict_t *xattr, dict_t *xdata)
-{
- int ret = 0;
- dht_local_t *local = NULL;
- int this_call_cnt = 0;
- dict_t *dict = NULL;
-
- VALIDATE_OR_GOTO (frame, out);
- VALIDATE_OR_GOTO (frame->local, out);
-
- local = frame->local;
+ /* The node-uuids are guaranteed to be returned in the same
+ * order as the bricks
+ * A null node-uuid is returned for a brick that is down.
+ */
- LOCK (&frame->lock);
- {
- this_call_cnt = --local->call_cnt;
- if (op_ret < 0) {
- if (op_errno != ENOTCONN) {
- gf_msg (this->name, GF_LOG_ERROR, op_errno,
- DHT_MSG_GET_XATTR_FAILED,
- "getxattr err for dir");
- local->op_ret = -1;
- local->op_errno = op_errno;
- }
+ saveptr = NULL;
+ i = 0;
- goto unlock;
- }
+ for (uuid_str = strtok_r(uuid_list_copy, " ", &saveptr); uuid_str;
+ uuid_str = next_uuid_str) {
+ next_uuid_str = strtok_r(NULL, " ", &saveptr);
+ tmp_ptr = &(conf->local_nodeuuids[index].elements[i]);
+ gf_uuid_parse(uuid_str, tmp_ptr->uuid);
- ret = dht_vgetxattr_alloc_and_fill (local, xattr, this,
- op_errno);
- if (ret)
- gf_msg (this->name, GF_LOG_ERROR, op_errno,
- DHT_MSG_DICT_SET_FAILED,
- "alloc or fill failure");
+ if (!gf_uuid_compare(tmp_ptr->uuid, conf->defrag->node_uuid)) {
+ tmp_ptr->info = REBAL_NODEUUID_MINE;
+ }
+ i++;
+ tmp_ptr = NULL;
}
- unlock:
- UNLOCK (&frame->lock);
+ }
- if (!is_last_call (this_call_cnt))
- goto out;
+ local->op_ret = 0;
+unlock:
+ UNLOCK(&frame->lock);
+post_unlock:
+ if (!is_last_call(this_call_cnt))
+ goto out;
- /* -- last call: do patch ups -- */
+ if (local->op_ret == -1) {
+ goto unwind;
+ }
- if (local->op_ret == -1) {
- goto unwind;
- }
+ DHT_STACK_UNWIND(getxattr, frame, 0, 0, xattr, xdata);
+ goto out;
- ret = dht_vgetxattr_fill_and_set (local, &dict, this, _gf_true);
- if (ret)
- goto unwind;
+unwind:
- DHT_STACK_UNWIND (getxattr, frame, 0, 0, dict, xdata);
- goto cleanup;
+ GF_FREE(conf->local_nodeuuids[index].elements);
+ conf->local_nodeuuids[index].elements = NULL;
- unwind:
- DHT_STACK_UNWIND (getxattr, frame, -1, local->op_errno, NULL, NULL);
- cleanup:
- if (dict)
- dict_unref (dict);
- out:
- return 0;
+ DHT_STACK_UNWIND(getxattr, frame, -1, local->op_errno, NULL, xdata);
+out:
+ GF_FREE(uuid_list_copy);
+ return 0;
}
-int
-dht_vgetxattr_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
- int op_ret, int op_errno, dict_t *xattr, dict_t *xdata)
+static int
+dht_vgetxattr_dir_cbk(call_frame_t *frame, void *cookie, xlator_t *this,
+ int op_ret, int op_errno, dict_t *xattr, dict_t *xdata)
{
- dht_local_t *local = NULL;
- int ret = 0;
- dict_t *dict = NULL;
- call_frame_t *prev = NULL;
- gf_boolean_t flag = _gf_true;
+ int ret = 0;
+ dht_local_t *local = NULL;
+ int this_call_cnt = 0;
+ dict_t *dict = NULL;
- local = frame->local;
- prev = cookie;
+ VALIDATE_OR_GOTO(frame, out);
+ VALIDATE_OR_GOTO(frame->local, out);
+
+ local = frame->local;
+ LOCK(&frame->lock);
+ {
+ this_call_cnt = --local->call_cnt;
if (op_ret < 0) {
+ if (op_errno != ENOTCONN) {
local->op_ret = -1;
local->op_errno = op_errno;
- gf_msg (this->name, GF_LOG_ERROR, op_errno,
- DHT_MSG_GET_XATTR_FAILED,
- "vgetxattr: Subvolume %s returned -1",
- prev->this->name);
- goto unwind;
+ UNLOCK(&frame->lock);
+ gf_msg(this->name, GF_LOG_ERROR, op_errno,
+ DHT_MSG_GET_XATTR_FAILED, "getxattr err for dir");
+ goto post_unlock;
+ }
+
+ goto unlock;
}
- ret = dht_vgetxattr_alloc_and_fill (local, xattr, this,
- op_errno);
+ ret = dht_vgetxattr_alloc_and_fill(local, xattr, this, op_errno);
if (ret) {
- gf_msg (this->name, GF_LOG_ERROR, 0,
- DHT_MSG_NO_MEMORY,
- "Allocation or fill failure");
- goto unwind;
+ UNLOCK(&frame->lock);
+ gf_msg(this->name, GF_LOG_ERROR, op_errno, DHT_MSG_DICT_SET_FAILED,
+ "alloc or fill failure");
+ goto post_unlock;
}
+ }
+unlock:
+ UNLOCK(&frame->lock);
+post_unlock:
+ if (!is_last_call(this_call_cnt))
+ goto out;
- flag = (local->layout->cnt > 1) ? _gf_true : _gf_false;
+ /* -- last call: do patch ups -- */
- ret = dht_vgetxattr_fill_and_set (local, &dict, this, flag);
- if (ret)
- goto unwind;
+ if (local->op_ret == -1) {
+ goto unwind;
+ }
- DHT_STACK_UNWIND (getxattr, frame, 0, 0, dict, xdata);
- goto cleanup;
+ ret = dht_vgetxattr_fill_and_set(local, &dict, this, _gf_true);
+ if (ret)
+ goto unwind;
- unwind:
- DHT_STACK_UNWIND (getxattr, frame, -1, local->op_errno,
- NULL, NULL);
- cleanup:
- if (dict)
- dict_unref (dict);
+ DHT_STACK_UNWIND(getxattr, frame, 0, 0, dict, xdata);
+ goto cleanup;
- return 0;
+unwind:
+ DHT_STACK_UNWIND(getxattr, frame, -1, local->op_errno, NULL, NULL);
+cleanup:
+ if (dict)
+ dict_unref(dict);
+out:
+ return 0;
}
-int
-dht_linkinfo_getxattr_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
- int op_ret, int op_errno, dict_t *xattr,
- dict_t *xdata)
+static int
+dht_vgetxattr_cbk(call_frame_t *frame, void *cookie, xlator_t *this, int op_ret,
+ int op_errno, dict_t *xattr, dict_t *xdata)
{
- int ret = 0;
- char *value = NULL;
-
- if (op_ret != -1) {
- ret = dict_get_str (xattr, GF_XATTR_PATHINFO_KEY, &value);
- if (!ret) {
- ret = dict_set_str (xattr, GF_XATTR_LINKINFO_KEY, value);
- if (!ret)
- gf_msg_trace (this->name, 0,
- "failed to set linkinfo");
- }
- }
+ dht_local_t *local = NULL;
+ int ret = 0;
+ dict_t *dict = NULL;
+ xlator_t *prev = NULL;
+ gf_boolean_t flag = _gf_true;
- DHT_STACK_UNWIND (getxattr, frame, op_ret, op_errno, xattr, xdata);
+ local = frame->local;
+ prev = cookie;
- return 0;
-}
-
-int
-dht_getxattr_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
- int op_ret, int op_errno, dict_t *xattr, dict_t *xdata)
-{
- int this_call_cnt = 0;
- dht_local_t *local = NULL;
- dht_conf_t *conf = NULL;
+ if (op_ret < 0) {
+ local->op_ret = -1;
+ local->op_errno = op_errno;
+ gf_msg(this->name, GF_LOG_ERROR, op_errno, DHT_MSG_GET_XATTR_FAILED,
+ "vgetxattr: Subvolume %s returned -1", prev->name);
+ goto unwind;
+ }
- VALIDATE_OR_GOTO (frame, out);
- VALIDATE_OR_GOTO (frame->local, out);
- VALIDATE_OR_GOTO (this->private, out);
+ ret = dht_vgetxattr_alloc_and_fill(local, xattr, this, op_errno);
+ if (ret) {
+ gf_msg(this->name, GF_LOG_ERROR, 0, DHT_MSG_NO_MEMORY,
+ "Allocation or fill failure");
+ goto unwind;
+ }
- conf = this->private;
- local = frame->local;
+ flag = (local->layout->cnt > 1) ? _gf_true : _gf_false;
- LOCK (&frame->lock);
- {
- if (!xattr || (op_ret == -1)) {
- local->op_ret = op_ret;
- goto unlock;
- }
+ ret = dht_vgetxattr_fill_and_set(local, &dict, this, flag);
+ if (ret)
+ goto unwind;
- if (dict_get (xattr, conf->xattr_name)) {
- dict_del (xattr, conf->xattr_name);
- }
+ DHT_STACK_UNWIND(getxattr, frame, 0, 0, dict, xdata);
+ goto cleanup;
- if (frame->root->pid >= 0) {
- GF_REMOVE_INTERNAL_XATTR
- ("trusted.glusterfs.quota*", xattr);
- GF_REMOVE_INTERNAL_XATTR("trusted.pgfid*", xattr);
- }
+unwind:
+ DHT_STACK_UNWIND(getxattr, frame, -1, local->op_errno, NULL, NULL);
+cleanup:
+ if (dict)
+ dict_unref(dict);
- local->op_ret = 0;
+ return 0;
+}
- if (!local->xattr) {
- local->xattr = dict_copy_with_ref (xattr, NULL);
- } else {
- dht_aggregate_xattr (local->xattr, xattr);
- }
+static int
+dht_linkinfo_getxattr_cbk(call_frame_t *frame, void *cookie, xlator_t *this,
+ int op_ret, int op_errno, dict_t *xattr,
+ dict_t *xdata)
+{
+ int ret = 0;
+ char *value = NULL;
+ if (op_ret != -1) {
+ ret = dict_get_str(xattr, GF_XATTR_PATHINFO_KEY, &value);
+ if (!ret) {
+ ret = dict_set_str(xattr, GF_XATTR_LINKINFO_KEY, value);
+ if (!ret)
+ gf_msg_trace(this->name, 0, "failed to set linkinfo");
}
-unlock:
- UNLOCK (&frame->lock);
+ }
- this_call_cnt = dht_frame_return (frame);
-out:
- if (is_last_call (this_call_cnt)) {
+ DHT_STACK_UNWIND(getxattr, frame, op_ret, op_errno, xattr, xdata);
- /* If we have a valid xattr received from any one of the
- * subvolume, let's return it */
- if (local->xattr) {
- local->op_ret = 0;
- }
-
- DHT_STACK_UNWIND (getxattr, frame, local->op_ret, op_errno,
- local->xattr, NULL);
- }
- return 0;
+ return 0;
}
-int32_t
-dht_getxattr_unwind (call_frame_t *frame,
- int op_ret, int op_errno, dict_t *dict, dict_t *xdata)
+static int
+dht_mds_getxattr_cbk(call_frame_t *frame, void *cookie, xlator_t *this,
+ int op_ret, int op_errno, dict_t *xattr, dict_t *xdata)
{
- DHT_STACK_UNWIND (getxattr, frame, op_ret, op_errno, dict, xdata);
- return 0;
-}
+ dht_local_t *local = NULL;
+ dht_conf_t *conf = NULL;
+
+ VALIDATE_OR_GOTO(frame, err);
+ VALIDATE_OR_GOTO(frame->local, err);
+ VALIDATE_OR_GOTO(this->private, err);
+
+ conf = this->private;
+ local = frame->local;
+
+ if (!xattr || (op_ret == -1)) {
+ local->op_ret = op_ret;
+ goto out;
+ }
+ dict_del(xattr, conf->xattr_name);
+ local->op_ret = 0;
+
+ if (!local->xattr) {
+ local->xattr = dict_copy_with_ref(xattr, NULL);
+ }
+out:
+ DHT_STACK_UNWIND(getxattr, frame, local->op_ret, op_errno, local->xattr,
+ xdata);
+ return 0;
+err:
+ DHT_STACK_UNWIND(getxattr, frame, -1, EINVAL, NULL, NULL);
+ return 0;
+}
int
-dht_getxattr_get_real_filename_cbk (call_frame_t *frame, void *cookie,
- xlator_t *this, int op_ret, int op_errno,
- dict_t *xattr, dict_t *xdata)
+dht_getxattr_cbk(call_frame_t *frame, void *cookie, xlator_t *this, int op_ret,
+ int op_errno, dict_t *xattr, dict_t *xdata)
{
- int this_call_cnt = 0;
- dht_local_t *local = NULL;
-
+ int this_call_cnt = 0;
+ dht_local_t *local = NULL;
+ dht_conf_t *conf = NULL;
+ int ret = 0;
- local = frame->local;
+ VALIDATE_OR_GOTO(frame, err);
+ VALIDATE_OR_GOTO(frame->local, err);
+ VALIDATE_OR_GOTO(this->private, err);
- LOCK (&frame->lock);
- {
- if (local->op_errno == ENODATA ||
- local->op_errno == EOPNOTSUPP) {
- /* Nothing to do here, we have already found
- * a subvol which does not have the get_real_filename
- * optimization. If condition is for simple logic.
- */
- goto unlock;
- }
+ conf = this->private;
+ local = frame->local;
- if (op_ret == -1) {
-
- if (op_errno == ENODATA || op_errno == EOPNOTSUPP) {
- /* This subvol does not have the optimization.
- * Better let the user know we don't support it.
- * Remove previous results if any.
- */
-
- if (local->xattr) {
- dict_unref (local->xattr);
- local->xattr = NULL;
- }
-
- if (local->xattr_req) {
- dict_unref (local->xattr_req);
- local->xattr_req = NULL;
- }
-
- local->op_ret = op_ret;
- local->op_errno = op_errno;
- gf_msg (this->name, GF_LOG_WARNING, op_errno,
- DHT_MSG_UPGRADE_BRICKS, "At least "
- "one of the bricks does not support "
- "this operation. Please upgrade all "
- "bricks.");
- goto unlock;
- }
+ if (dht_check_remote_fd_failed_error(local, op_ret, op_errno)) {
+ ret = dht_check_and_open_fd_on_subvol(this, frame);
+ if (ret)
+ goto err;
+ return 0;
+ }
- if (op_errno == ENOENT) {
- /* Do nothing, our defaults are set to this.
- */
- goto unlock;
- }
+ LOCK(&frame->lock);
+ {
+ if (!xattr || (op_ret == -1)) {
+ local->op_ret = op_ret;
+ goto unlock;
+ }
- /* This is a place holder for every other error
- * case. I am not sure of how to interpret
- * ENOTCONN etc. As of now, choosing to ignore
- * down subvol and return a good result(if any)
- * from other subvol.
- */
- gf_msg (this->name, GF_LOG_WARNING, op_errno,
- DHT_MSG_GET_XATTR_FAILED,
- "Failed to get real filename.");
- goto unlock;
+ dict_del(xattr, conf->xattr_name);
+ dict_del(xattr, conf->mds_xattr_key);
- }
+ dict_del(xattr, conf->commithash_xattr_name);
+ if (frame->root->pid >= 0) {
+ GF_REMOVE_INTERNAL_XATTR("trusted.glusterfs.quota*", xattr);
+ GF_REMOVE_INTERNAL_XATTR("trusted.pgfid*", xattr);
+ }
- /* This subvol has the required file.
- * There could be other subvols which have returned
- * success already, choosing to return the latest good
- * result.
- */
- if (local->xattr)
- dict_unref (local->xattr);
- local->xattr = dict_ref (xattr);
+ local->op_ret = 0;
- if (local->xattr_req) {
- dict_unref (local->xattr_req);
- local->xattr_req = NULL;
- }
- if (xdata)
- local->xattr_req = dict_ref (xdata);
+ if (!local->xattr) {
+ local->xattr = dict_copy_with_ref(xattr, NULL);
+ } else {
+ dht_aggregate_xattr(local->xattr, xattr);
+ }
- local->op_ret = op_ret;
- local->op_errno = 0;
- gf_msg_debug (this->name, 0, "Found a matching "
- "file.");
+ if (!local->xdata) {
+ local->xdata = dict_ref(xdata);
+ } else if ((local->inode && IA_ISDIR(local->inode->ia_type)) ||
+ (local->fd && IA_ISDIR(local->fd->inode->ia_type))) {
+ dht_aggregate_xattr(local->xdata, xdata);
}
+ }
unlock:
- UNLOCK (&frame->lock);
-
+ UNLOCK(&frame->lock);
- this_call_cnt = dht_frame_return (frame);
- if (is_last_call (this_call_cnt)) {
- DHT_STACK_UNWIND (getxattr, frame, local->op_ret,
- local->op_errno, local->xattr,
- local->xattr_req);
+ this_call_cnt = dht_frame_return(frame);
+ if (is_last_call(this_call_cnt)) {
+ /* If we have a valid xattr received from any one of the
+ * subvolume, let's return it */
+ if (local->xattr) {
+ local->op_ret = 0;
}
- return 0;
+ DHT_STACK_UNWIND(getxattr, frame, local->op_ret, op_errno, local->xattr,
+ local->xdata);
+ }
+ return 0;
+err:
+ DHT_STACK_UNWIND(getxattr, frame, -1, EINVAL, NULL, NULL);
+ return 0;
}
+static int32_t
+dht_getxattr_unwind(call_frame_t *frame, int op_ret, int op_errno, dict_t *dict,
+ dict_t *xdata)
+{
+ DHT_STACK_UNWIND(getxattr, frame, op_ret, op_errno, dict, xdata);
+ return 0;
+}
-int
-dht_getxattr_get_real_filename (call_frame_t *frame, xlator_t *this,
- loc_t *loc, const char *key, dict_t *xdata)
+static int
+dht_getxattr_get_real_filename_cbk(call_frame_t *frame, void *cookie,
+ xlator_t *this, int op_ret, int op_errno,
+ dict_t *xattr, dict_t *xdata)
{
- dht_local_t *local = NULL;
- int i = 0;
- dht_layout_t *layout = NULL;
- int cnt = 0;
- xlator_t *subvol = NULL;
+ int this_call_cnt = 0;
+ dht_local_t *local = NULL;
+ local = frame->local;
- local = frame->local;
- layout = local->layout;
+ LOCK(&frame->lock);
+ {
+ if (local->op_errno == EOPNOTSUPP) {
+ /* Nothing to do here, we have already found
+ * a subvol which does not have the get_real_filename
+ * optimization. If condition is for simple logic.
+ */
+ goto unlock;
+ }
- cnt = local->call_cnt = layout->cnt;
+ if (op_ret == -1) {
+ if (op_errno == EOPNOTSUPP) {
+ /* This subvol does not have the optimization.
+ * Better let the user know we don't support it.
+ * Remove previous results if any.
+ */
- local->op_ret = -1;
- local->op_errno = ENOENT;
+ if (local->xattr) {
+ dict_unref(local->xattr);
+ local->xattr = NULL;
+ }
- for (i = 0; i < cnt; i++) {
- subvol = layout->list[i].xlator;
- STACK_WIND (frame, dht_getxattr_get_real_filename_cbk,
- subvol, subvol->fops->getxattr,
- loc, key, xdata);
- }
+ if (local->xattr_req) {
+ dict_unref(local->xattr_req);
+ local->xattr_req = NULL;
+ }
- return 0;
-}
+ local->op_ret = op_ret;
+ local->op_errno = op_errno;
+ UNLOCK(&frame->lock);
+ gf_msg(this->name, GF_LOG_WARNING, op_errno,
+ DHT_MSG_UPGRADE_BRICKS,
+ "At least "
+ "one of the bricks does not support "
+ "this operation. Please upgrade all "
+ "bricks.");
+ goto post_unlock;
+ }
-int
-dht_marker_populate_args (call_frame_t *frame, int type, int *gauge,
- xlator_t **subvols)
-{
- dht_local_t *local = NULL;
- int i = 0;
- dht_layout_t *layout = NULL;
+ if (op_errno == ENOATTR) {
+ /* Do nothing, our defaults are set to this.
+ */
+ goto unlock;
+ }
- local = frame->local;
- layout = local->layout;
+ /* This is a place holder for every other error
+ * case. I am not sure of how to interpret
+ * ENOTCONN etc. As of now, choosing to ignore
+ * down subvol and return a good result(if any)
+ * from other subvol.
+ */
+ UNLOCK(&frame->lock);
+ gf_msg(this->name, GF_LOG_WARNING, op_errno,
+ DHT_MSG_GET_XATTR_FAILED, "Failed to get real filename.");
+ goto post_unlock;
+ }
+
+ /* This subvol has the required file.
+ * There could be other subvols which have returned
+ * success already, choosing to return the latest good
+ * result.
+ */
+ if (local->xattr)
+ dict_unref(local->xattr);
+ local->xattr = dict_ref(xattr);
- for (i = 0; i < layout->cnt; i++)
- subvols[i] = layout->list[i].xlator;
+ if (local->xattr_req) {
+ dict_unref(local->xattr_req);
+ local->xattr_req = NULL;
+ }
+ if (xdata)
+ local->xattr_req = dict_ref(xdata);
- return layout->cnt;
+ local->op_ret = op_ret;
+ local->op_errno = 0;
+ UNLOCK(&frame->lock);
+ gf_msg_debug(this->name, 0, "Found a matching file.");
+ goto post_unlock;
+ }
+unlock:
+ UNLOCK(&frame->lock);
+post_unlock:
+ this_call_cnt = dht_frame_return(frame);
+ if (is_last_call(this_call_cnt)) {
+ DHT_STACK_UNWIND(getxattr, frame, local->op_ret, local->op_errno,
+ local->xattr, local->xattr_req);
+ }
+
+ return 0;
}
-int
-dht_getxattr (call_frame_t *frame, xlator_t *this,
- loc_t *loc, const char *key, dict_t *xdata)
-#define DHT_IS_DIR(layout) (layout->cnt > 1)
-{
-
- xlator_t *subvol = NULL;
- xlator_t *hashed_subvol = NULL;
- xlator_t *cached_subvol = NULL;
- dht_conf_t *conf = NULL;
- dht_local_t *local = NULL;
- dht_layout_t *layout = NULL;
- int op_errno = -1;
- int i = 0;
- int cnt = 0;
- char *node_uuid_key = NULL;
- int ret = -1;
- VALIDATE_OR_GOTO (frame, err);
- VALIDATE_OR_GOTO (this, err);
- VALIDATE_OR_GOTO (loc, err);
- VALIDATE_OR_GOTO (loc->inode, err);
- VALIDATE_OR_GOTO (this->private, err);
-
- conf = this->private;
-
- local = dht_local_init (frame, loc, NULL, GF_FOP_GETXATTR);
- if (!local) {
- op_errno = ENOMEM;
+static int
+dht_getxattr_get_real_filename(call_frame_t *frame, xlator_t *this, loc_t *loc,
+ const char *key, dict_t *xdata)
+{
+ dht_local_t *local = NULL;
+ int i = 0;
+ dht_layout_t *layout = NULL;
+ int cnt = 0;
+ xlator_t *subvol = NULL;
- goto err;
- }
+ local = frame->local;
+ layout = local->layout;
- layout = local->layout;
- if (!layout) {
- gf_msg (this->name, GF_LOG_ERROR, 0,
- DHT_MSG_LAYOUT_NULL,
- "Layout is NULL");
- op_errno = ENOENT;
- goto err;
- }
+ cnt = local->call_cnt = layout->cnt;
- if (key) {
- local->key = gf_strdup (key);
- if (!local->key) {
- op_errno = ENOMEM;
- goto err;
- }
- }
+ local->op_ret = -1;
+ local->op_errno = ENOATTR;
- if (key &&
- (strncmp (key, GF_XATTR_GET_REAL_FILENAME_KEY,
- strlen (GF_XATTR_GET_REAL_FILENAME_KEY)) == 0)
- && DHT_IS_DIR(layout)) {
- dht_getxattr_get_real_filename (frame, this, loc, key, xdata);
- return 0;
- }
+ for (i = 0; i < cnt; i++) {
+ subvol = layout->list[i].xlator;
+ STACK_WIND(frame, dht_getxattr_get_real_filename_cbk, subvol,
+ subvol->fops->getxattr, loc, key, xdata);
+ }
- if (key && DHT_IS_DIR(layout) &&
- (!strcmp (key, GF_REBAL_FIND_LOCAL_SUBVOL))) {
- ret = gf_asprintf
- (&node_uuid_key, "%s", GF_XATTR_NODE_UUID_KEY);
- if (ret == -1 || !node_uuid_key) {
- gf_msg (this->name, GF_LOG_ERROR, 0,
- DHT_MSG_NO_MEMORY,
- "Failed to copy key");
- op_errno = ENOMEM;
- goto err;
- }
- (void) strncpy (local->xsel, node_uuid_key, 256);
- cnt = local->call_cnt = conf->subvolume_cnt;
- for (i = 0; i < cnt; i++) {
- STACK_WIND (frame, dht_find_local_subvol_cbk,
- conf->subvolumes[i],
- conf->subvolumes[i]->fops->getxattr,
- loc, node_uuid_key, xdata);
- }
- if (node_uuid_key)
- GF_FREE (node_uuid_key);
- return 0;
- }
+ return 0;
+}
- /* for file use cached subvolume (obviously!): see if {}
- * below
- * for directory:
- * wind to all subvolumes and exclude subvolumes which
- * return ENOTCONN (in callback)
- *
- * NOTE: Don't trust inode here, as that may not be valid
- * (until inode_link() happens)
- */
+static int
+dht_marker_populate_args(call_frame_t *frame, int type, int *gauge,
+ xlator_t **subvols)
+{
+ dht_local_t *local = NULL;
+ int i = 0;
+ dht_layout_t *layout = NULL;
- if (key && DHT_IS_DIR(layout) &&
- (XATTR_IS_PATHINFO (key)
- || (strcmp (key, GF_XATTR_NODE_UUID_KEY) == 0))) {
- (void) strncpy (local->xsel, key, 256);
- cnt = local->call_cnt = layout->cnt;
- for (i = 0; i < cnt; i++) {
- subvol = layout->list[i].xlator;
- STACK_WIND (frame, dht_vgetxattr_dir_cbk,
- subvol, subvol->fops->getxattr,
- loc, key, xdata);
- }
- return 0;
- }
+ local = frame->local;
+ layout = local->layout;
- /* node-uuid or pathinfo for files */
- if (key && ((strcmp (key, GF_XATTR_NODE_UUID_KEY) == 0)
- || XATTR_IS_PATHINFO (key))) {
- cached_subvol = local->cached_subvol;
- (void) strncpy (local->xsel, key, 256);
+ for (i = 0; i < layout->cnt; i++)
+ subvols[i] = layout->list[i].xlator;
- local->call_cnt = 1;
- STACK_WIND (frame, dht_vgetxattr_cbk, cached_subvol,
- cached_subvol->fops->getxattr, loc, key, xdata);
+ return layout->cnt;
+}
- return 0;
- }
+static int
+dht_is_debug_xattr_key(const char **array, char *key)
+{
+ int i = 0;
- if (key && (strcmp (key, GF_XATTR_LINKINFO_KEY) == 0)) {
+ for (i = 0; array[i]; i++) {
+ if (fnmatch(array[i], key, FNM_NOESCAPE) == 0)
+ return i;
+ }
- hashed_subvol = dht_subvol_get_hashed (this, loc);
- if (!hashed_subvol) {
- gf_msg (this->name, GF_LOG_ERROR, 0,
- DHT_MSG_HASHED_SUBVOL_GET_FAILED,
- "Failed to get hashed subvol for %s",
- loc->path);
- op_errno = EINVAL;
- goto err;
- }
+ return -1;
+}
- cached_subvol = dht_subvol_get_cached (this, loc->inode);
- if (!cached_subvol) {
- gf_msg (this->name, GF_LOG_ERROR, 0,
- DHT_MSG_CACHED_SUBVOL_GET_FAILED,
- "Failed to get cached subvol for %s",
- loc->path);
- op_errno = EINVAL;
- goto err;
- }
+/* Note we already have frame->local initialised here*/
- if (hashed_subvol == cached_subvol) {
- op_errno = ENODATA;
- goto err;
- }
+static int
+dht_handle_debug_getxattr(call_frame_t *frame, xlator_t *this, loc_t *loc,
+ const char *key)
+{
+ dht_local_t *local = NULL;
+ int ret = -1;
+ int op_errno = ENODATA;
+ char *value = NULL;
+ loc_t file_loc = {0};
+ const char *name = NULL;
- STACK_WIND (frame, dht_linkinfo_getxattr_cbk, hashed_subvol,
- hashed_subvol->fops->getxattr, loc,
- GF_XATTR_PATHINFO_KEY, xdata);
- return 0;
- }
+ local = frame->local;
- if (key && (!strcmp (QUOTA_LIMIT_KEY, key) ||
- !strcmp (QUOTA_LIMIT_OBJECTS_KEY, key))) {
- /* quota hardlimit and aggregated size of a directory is stored
- * in inode contexts of each brick. Hence its good enough that
- * we send getxattr for this key to any brick.
- */
- local->call_cnt = 1;
- subvol = dht_first_up_subvol (this);
- STACK_WIND (frame, dht_getxattr_cbk, subvol,
- subvol->fops->getxattr, loc, key, xdata);
- return 0;
- }
+ if (dht_is_debug_xattr_key(dht_dbg_vxattrs, (char *)key) == -1) {
+ goto out;
+ }
- if (cluster_handle_marker_getxattr (frame, loc, key, conf->vol_uuid,
- dht_getxattr_unwind,
- dht_marker_populate_args) == 0)
- return 0;
+ local->xattr = dict_new();
+ if (!local->xattr) {
+ op_errno = ENOMEM;
+ goto out;
+ }
- if (DHT_IS_DIR(layout)) {
- cnt = local->call_cnt = layout->cnt;
- } else {
- cnt = local->call_cnt = 1;
+ if (strncmp(key, DHT_DBG_HASHED_SUBVOL_KEY,
+ SLEN(DHT_DBG_HASHED_SUBVOL_KEY)) == 0) {
+ name = key + strlen(DHT_DBG_HASHED_SUBVOL_KEY);
+ if (strlen(name) == 0) {
+ op_errno = EINVAL;
+ goto out;
}
- for (i = 0; i < cnt; i++) {
- subvol = layout->list[i].xlator;
- STACK_WIND (frame, dht_getxattr_cbk,
- subvol, subvol->fops->getxattr,
- loc, key, xdata);
+ ret = dht_build_child_loc(this, &file_loc, loc, (char *)name);
+ if (ret) {
+ op_errno = ENOMEM;
+ goto out;
}
- return 0;
-
-err:
- op_errno = (op_errno == -1) ? errno : op_errno;
- DHT_STACK_UNWIND (getxattr, frame, -1, op_errno, NULL, NULL);
-
- return 0;
-}
-#undef DHT_IS_DIR
-
-int
-dht_fgetxattr (call_frame_t *frame, xlator_t *this,
- fd_t *fd, const char *key, dict_t *xdata)
-{
- xlator_t *subvol = NULL;
- dht_local_t *local = NULL;
- dht_layout_t *layout = NULL;
- int op_errno = -1;
- int i = 0;
- int cnt = 0;
-
- VALIDATE_OR_GOTO (frame, err);
- VALIDATE_OR_GOTO (this, err);
- VALIDATE_OR_GOTO (fd, err);
- VALIDATE_OR_GOTO (fd->inode, err);
- VALIDATE_OR_GOTO (this->private, err);
-
- local = dht_local_init (frame, NULL, fd, GF_FOP_FGETXATTR);
- if (!local) {
- op_errno = ENOMEM;
- goto err;
+ local->hashed_subvol = dht_subvol_get_hashed(this, &file_loc);
+ if (local->hashed_subvol == NULL) {
+ op_errno = ENODATA;
+ goto out;
}
- layout = local->layout;
- if (!layout) {
- gf_msg (this->name, GF_LOG_ERROR, 0,
- DHT_MSG_LAYOUT_NULL,
- "Layout is NULL");
- op_errno = ENOENT;
- goto err;
+ value = gf_strdup(local->hashed_subvol->name);
+ if (!value) {
+ op_errno = ENOMEM;
+ goto out;
}
- if (key) {
- local->key = gf_strdup (key);
- if (!local->key) {
- op_errno = ENOMEM;
- goto err;
- }
+ ret = dict_set_dynstr(local->xattr, (char *)key, value);
+ if (ret < 0) {
+ op_errno = -ret;
+ ret = -1;
+ goto out;
}
+ ret = 0;
+ goto out;
+ }
- if ((fd->inode->ia_type == IA_IFDIR)
- && key
- && (strncmp (key, GF_XATTR_LOCKINFO_KEY,
- strlen (GF_XATTR_LOCKINFO_KEY)) != 0)) {
- cnt = local->call_cnt = layout->cnt;
- } else {
- cnt = local->call_cnt = 1;
- }
+out:
+ loc_wipe(&file_loc);
+ DHT_STACK_UNWIND(getxattr, frame, ret, op_errno, local->xattr, NULL);
+ return 0;
+}
- for (i = 0; i < cnt; i++) {
- subvol = layout->list[i].xlator;
- STACK_WIND (frame, dht_getxattr_cbk,
- subvol, subvol->fops->fgetxattr,
- fd, key, NULL);
- }
- return 0;
+/* Virtual Xattr which returns 1 if all subvols are up,
+ else returns 0. Geo-rep then uses this virtual xattr
+ after a fresh mount and starts the I/O.
+*/
-err:
- op_errno = (op_errno == -1) ? errno : op_errno;
- DHT_STACK_UNWIND (fgetxattr, frame, -1, op_errno, NULL, NULL);
+enum dht_vxattr_subvol {
+ DHT_VXATTR_SUBVOLS_UP = 1,
+ DHT_VXATTR_SUBVOLS_DOWN = 0,
+};
- return 0;
+int
+dht_vgetxattr_subvol_status(call_frame_t *frame, xlator_t *this,
+ const char *key)
+{
+ dht_local_t *local = NULL;
+ int ret = -1;
+ int op_errno = ENODATA;
+ int value = DHT_VXATTR_SUBVOLS_UP;
+ int i = 0;
+ dht_conf_t *conf = NULL;
+
+ conf = this->private;
+ local = frame->local;
+
+ if (!key) {
+ op_errno = EINVAL;
+ goto out;
+ }
+ local->xattr = dict_new();
+ if (!local->xattr) {
+ op_errno = ENOMEM;
+ goto out;
+ }
+ for (i = 0; i < conf->subvolume_cnt; i++) {
+ if (!conf->subvolume_status[i]) {
+ value = DHT_VXATTR_SUBVOLS_DOWN;
+ gf_msg_debug(this->name, 0, "subvol %s is down ",
+ conf->subvolumes[i]->name);
+ break;
+ }
+ }
+ ret = dict_set_int8(local->xattr, (char *)key, value);
+ if (ret < 0) {
+ op_errno = -ret;
+ ret = -1;
+ goto out;
+ }
+ ret = 0;
+
+out:
+ DHT_STACK_UNWIND(getxattr, frame, ret, op_errno, local->xattr, NULL);
+ return 0;
}
int
-dht_file_setxattr_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
- int op_ret, int op_errno, dict_t *xdata)
+dht_getxattr(call_frame_t *frame, xlator_t *this, loc_t *loc, const char *key,
+ dict_t *xdata)
+#define DHT_IS_DIR(layout) (layout->cnt > 1)
{
- int ret = -1;
- dht_local_t *local = NULL;
- call_frame_t *prev = NULL;
- struct iatt *stbuf = NULL;
- inode_t *inode = NULL;
- xlator_t *subvol1 = NULL, *subvol2 = NULL;
+ xlator_t *subvol = NULL;
+ xlator_t *hashed_subvol = NULL;
+ xlator_t *mds_subvol = NULL;
+ xlator_t *cached_subvol = NULL;
+ dht_conf_t *conf = NULL;
+ dht_local_t *local = NULL;
+ dht_layout_t *layout = NULL;
+ int op_errno = -1;
+ int i = 0;
+ int cnt = 0;
+ char *node_uuid_key = NULL;
+ int ret = -1;
+
+ GF_CHECK_XATTR_KEY_AND_GOTO(key, IO_THREADS_QUEUE_SIZE_KEY, op_errno, err);
+ VALIDATE_OR_GOTO(frame, err);
+ VALIDATE_OR_GOTO(this, err);
+ VALIDATE_OR_GOTO(loc, err);
+ VALIDATE_OR_GOTO(loc->inode, err);
+ VALIDATE_OR_GOTO(this->private, err);
+
+ conf = this->private;
+
+ local = dht_local_init(frame, loc, NULL, GF_FOP_GETXATTR);
+ if (!local) {
+ op_errno = ENOMEM;
+
+ goto err;
+ }
+
+ layout = local->layout;
+ if (!layout) {
+ gf_msg(this->name, GF_LOG_ERROR, 0, DHT_MSG_LAYOUT_NULL,
+ "Layout is NULL");
+ op_errno = ENOENT;
+ goto err;
+ }
+
+ /* skip over code which is irrelevant without a valid key */
+ if (!key)
+ goto no_key;
+
+ local->key = gf_strdup(key);
+ if (!local->key) {
+ op_errno = ENOMEM;
+ goto err;
+ }
+
+ if (strncmp(key, conf->mds_xattr_key, strlen(key)) == 0) {
+ op_errno = ENOTSUP;
+ goto err;
+ }
+
+ if (strncmp(key, DHT_SUBVOL_STATUS_KEY, SLEN(DHT_SUBVOL_STATUS_KEY)) == 0) {
+ dht_vgetxattr_subvol_status(frame, this, key);
+ return 0;
+ }
+
+ /* skip over code which is irrelevant if !DHT_IS_DIR(layout) */
+ if (!DHT_IS_DIR(layout))
+ goto no_dht_is_dir;
+
+ if ((strncmp(key, GF_XATTR_GET_REAL_FILENAME_KEY,
+ SLEN(GF_XATTR_GET_REAL_FILENAME_KEY)) == 0) &&
+ DHT_IS_DIR(layout)) {
+ dht_getxattr_get_real_filename(frame, this, loc, key, xdata);
+ return 0;
+ }
+
+ if (!strcmp(key, GF_REBAL_FIND_LOCAL_SUBVOL)) {
+ ret = gf_asprintf(&node_uuid_key, "%s", GF_XATTR_LIST_NODE_UUIDS_KEY);
+ if (ret == -1 || !node_uuid_key) {
+ gf_msg(this->name, GF_LOG_ERROR, 0, DHT_MSG_NO_MEMORY,
+ "Failed to copy node uuid key");
+ op_errno = ENOMEM;
+ goto err;
+ }
+ (void)snprintf(local->xsel, sizeof(local->xsel), "%s", node_uuid_key);
+ cnt = local->call_cnt = conf->subvolume_cnt;
+ for (i = 0; i < cnt; i++) {
+ STACK_WIND_COOKIE(frame, dht_find_local_subvol_cbk,
+ conf->subvolumes[i], conf->subvolumes[i],
+ conf->subvolumes[i]->fops->getxattr, loc,
+ node_uuid_key, xdata);
+ }
+ if (node_uuid_key)
+ GF_FREE(node_uuid_key);
+ return 0;
+ }
+
+ if (!strcmp(key, GF_REBAL_OLD_FIND_LOCAL_SUBVOL)) {
+ ret = gf_asprintf(&node_uuid_key, "%s", GF_XATTR_NODE_UUID_KEY);
+ if (ret == -1 || !node_uuid_key) {
+ gf_msg(this->name, GF_LOG_ERROR, 0, DHT_MSG_NO_MEMORY,
+ "Failed to copy node uuid key");
+ op_errno = ENOMEM;
+ goto err;
+ }
+ (void)snprintf(local->xsel, sizeof(local->xsel), "%s", node_uuid_key);
+ cnt = local->call_cnt = conf->subvolume_cnt;
+ for (i = 0; i < cnt; i++) {
+ STACK_WIND_COOKIE(frame, dht_find_local_subvol_cbk,
+ conf->subvolumes[i], conf->subvolumes[i],
+ conf->subvolumes[i]->fops->getxattr, loc,
+ node_uuid_key, xdata);
+ }
+ if (node_uuid_key)
+ GF_FREE(node_uuid_key);
+ return 0;
+ }
+
+ /* for file use cached subvolume (obviously!): see if {}
+ * below
+ * for directory:
+ * wind to all subvolumes and exclude subvolumes which
+ * return ENOTCONN (in callback)
+ *
+ * NOTE: Don't trust inode here, as that may not be valid
+ * (until inode_link() happens)
+ */
+
+ if (XATTR_IS_PATHINFO(key) || (strcmp(key, GF_XATTR_NODE_UUID_KEY) == 0) ||
+ (strcmp(key, GF_XATTR_LIST_NODE_UUIDS_KEY) == 0)) {
+ (void)snprintf(local->xsel, sizeof(local->xsel), "%s", key);
+ cnt = local->call_cnt = layout->cnt;
+ for (i = 0; i < cnt; i++) {
+ subvol = layout->list[i].xlator;
+ STACK_WIND(frame, dht_vgetxattr_dir_cbk, subvol,
+ subvol->fops->getxattr, loc, key, xdata);
+ }
+ return 0;
+ }
- local = frame->local;
- prev = cookie;
+no_dht_is_dir:
+ /* node-uuid or pathinfo for files */
+ if (XATTR_IS_PATHINFO(key) || (strcmp(key, GF_XATTR_NODE_UUID_KEY) == 0)) {
+ cached_subvol = local->cached_subvol;
+ (void)snprintf(local->xsel, sizeof(local->xsel), "%s", key);
+ local->call_cnt = 1;
+ STACK_WIND_COOKIE(frame, dht_vgetxattr_cbk, cached_subvol,
+ cached_subvol, cached_subvol->fops->getxattr, loc,
+ key, xdata);
- local->op_errno = op_errno;
+ return 0;
+ }
- if ((op_ret == -1) && !dht_inode_missing (op_errno)) {
- gf_msg_debug (this->name, op_errno,
- "subvolume %s returned -1.",
- prev->this->name);
- goto out;
+ if (strcmp(key, GF_XATTR_LINKINFO_KEY) == 0) {
+ hashed_subvol = dht_subvol_get_hashed(this, loc);
+ if (!hashed_subvol) {
+ gf_msg(this->name, GF_LOG_ERROR, 0,
+ DHT_MSG_HASHED_SUBVOL_GET_FAILED,
+ "Failed to get hashed subvol for %s", loc->path);
+ op_errno = EINVAL;
+ goto err;
}
- if (local->call_cnt != 1)
- goto out;
-
- ret = dict_get_bin (xdata, DHT_IATT_IN_XDATA_KEY, (void **) &stbuf);
+ cached_subvol = dht_subvol_get_cached(this, loc->inode);
+ if (!cached_subvol) {
+ gf_msg(this->name, GF_LOG_ERROR, 0,
+ DHT_MSG_CACHED_SUBVOL_GET_FAILED,
+ "Failed to get cached subvol for %s", loc->path);
+ op_errno = EINVAL;
+ goto err;
+ }
- if ((!op_ret) && !stbuf) {
- goto out;
+ if (hashed_subvol == cached_subvol) {
+ op_errno = ENODATA;
+ goto err;
}
- local->op_ret = op_ret;
- local->rebalance.target_op_fn = dht_setxattr2;
- if (xdata)
- local->rebalance.xdata = dict_ref (xdata);
+ STACK_WIND(frame, dht_linkinfo_getxattr_cbk, hashed_subvol,
+ hashed_subvol->fops->getxattr, loc, GF_XATTR_PATHINFO_KEY,
+ xdata);
+ return 0;
+ }
- /* Phase 2 of migration */
- if ((op_ret == -1) || IS_DHT_MIGRATION_PHASE2 (stbuf)) {
- ret = dht_rebalance_complete_check (this, frame);
- if (!ret)
- return 0;
- }
+ if (dht_is_debug_xattr_key(dht_dbg_vxattrs, (char *)key) >= 0) {
+ dht_handle_debug_getxattr(frame, this, loc, key);
+ return 0;
+ }
- /* Phase 1 of migration */
- if (IS_DHT_MIGRATION_PHASE1 (stbuf)) {
- inode = (local->fd) ? local->fd->inode : local->loc.inode;
+no_key:
+ if (cluster_handle_marker_getxattr(frame, loc, key, conf->vol_uuid,
+ dht_getxattr_unwind,
+ dht_marker_populate_args) == 0)
+ return 0;
- ret = dht_inode_ctx_get_mig_info (this, inode,
- &subvol1, &subvol2);
- if (!dht_mig_info_is_invalid (local->cached_subvol,
- subvol1, subvol2)) {
- dht_setxattr2 (this, subvol2, frame, 0);
- return 0;
+ if (DHT_IS_DIR(layout)) {
+ local->call_cnt = conf->subvolume_cnt;
+ cnt = conf->subvolume_cnt;
+ ret = dht_inode_ctx_mdsvol_get(loc->inode, this, &mds_subvol);
+ if (!mds_subvol) {
+ gf_msg(this->name, GF_LOG_INFO, 0, DHT_MSG_HASHED_SUBVOL_GET_FAILED,
+ "Cannot determine MDS, fetching xattr %s randomly"
+ " from a subvol for path %s ",
+ key, loc->path);
+ } else {
+ /* TODO need to handle it, As of now we are
+ choosing availability instead of chossing
+ consistencty, in case of mds_subvol is
+ down winding a getxattr call on other subvol
+ and return xattr
+ */
+ local->mds_subvol = mds_subvol;
+ for (i = 0; i < cnt; i++) {
+ if (conf->subvolumes[i] == mds_subvol) {
+ if (!conf->subvolume_status[i]) {
+ gf_msg(this->name, GF_LOG_INFO, 0,
+ DHT_MSG_HASHED_SUBVOL_DOWN,
+ "MDS %s is down for path"
+ " path %s so fetching xattr "
+ "%s randomly from a subvol ",
+ local->mds_subvol->name, loc->path, key);
+ ret = 1;
+ }
}
-
- ret = dht_rebalance_in_progress_check (this, frame);
- if (!ret)
- return 0;
+ }
}
-out:
+ if (!ret && key && local->mds_subvol && dht_match_xattr(key)) {
+ STACK_WIND(frame, dht_mds_getxattr_cbk, local->mds_subvol,
+ local->mds_subvol->fops->getxattr, loc, key, xdata);
- if (local->fop == GF_FOP_SETXATTR) {
- DHT_STACK_UNWIND (setxattr, frame, op_ret, op_errno, NULL);
- } else {
- DHT_STACK_UNWIND (fsetxattr, frame, op_ret, op_errno, NULL);
+ return 0;
}
+ } else {
+ cnt = local->call_cnt = 1;
+ }
- return 0;
-}
+ for (i = 0; i < cnt; i++) {
+ subvol = layout->list[i].xlator;
+ STACK_WIND(frame, dht_getxattr_cbk, subvol, subvol->fops->getxattr, loc,
+ key, xdata);
+ }
+ return 0;
+err:
+ op_errno = (op_errno == -1) ? errno : op_errno;
+ DHT_STACK_UNWIND(getxattr, frame, -1, op_errno, NULL, NULL);
+ return 0;
+}
+#undef DHT_IS_DIR
int
-dht_fsetxattr (call_frame_t *frame, xlator_t *this,
- fd_t *fd, dict_t *xattr, int flags, dict_t *xdata)
-{
- xlator_t *subvol = NULL;
- dht_local_t *local = NULL;
- int op_errno = EINVAL;
- dht_conf_t *conf = NULL;
- dht_layout_t *layout = NULL;
- int ret = -1;
- int call_cnt = 0;
- int i = 0;
-
- VALIDATE_OR_GOTO (frame, err);
- VALIDATE_OR_GOTO (this, err);
- VALIDATE_OR_GOTO (fd, err);
- VALIDATE_OR_GOTO (fd->inode, err);
- VALIDATE_OR_GOTO (this->private, err);
-
- conf = this->private;
-
- if (!conf->defrag)
- GF_IF_INTERNAL_XATTR_GOTO (conf->wild_xattr_name, xattr,
- op_errno, err);
-
- local = dht_local_init (frame, NULL, fd, GF_FOP_FSETXATTR);
- if (!local) {
- op_errno = ENOMEM;
- goto err;
+dht_fgetxattr(call_frame_t *frame, xlator_t *this, fd_t *fd, const char *key,
+ dict_t *xdata)
+{
+ xlator_t *subvol = NULL;
+ dht_local_t *local = NULL;
+ dht_layout_t *layout = NULL;
+ int op_errno = -1;
+ int i = 0;
+ int cnt = 0;
+ xlator_t *mds_subvol = NULL;
+ int ret = -1;
+ dht_conf_t *conf = NULL;
+ char gfid[GF_UUID_BUF_SIZE] = {0};
+
+ VALIDATE_OR_GOTO(frame, err);
+ VALIDATE_OR_GOTO(this, err);
+ VALIDATE_OR_GOTO(fd, err);
+ VALIDATE_OR_GOTO(fd->inode, err);
+ VALIDATE_OR_GOTO(this->private, err);
+
+ conf = this->private;
+
+ local = dht_local_init(frame, NULL, fd, GF_FOP_FGETXATTR);
+ if (!local) {
+ op_errno = ENOMEM;
+
+ goto err;
+ }
+
+ layout = local->layout;
+ if (!layout) {
+ gf_msg(this->name, GF_LOG_ERROR, 0, DHT_MSG_LAYOUT_NULL,
+ "Layout is NULL");
+ op_errno = ENOENT;
+ goto err;
+ }
+
+ if (key) {
+ local->key = gf_strdup(key);
+ if (!local->key) {
+ op_errno = ENOMEM;
+ goto err;
+ }
+ }
+
+ gf_uuid_unparse(fd->inode->gfid, gfid);
+
+ if ((fd->inode->ia_type == IA_IFDIR) && key &&
+ (strncmp(key, GF_XATTR_LOCKINFO_KEY, SLEN(GF_XATTR_LOCKINFO_KEY)) !=
+ 0)) {
+ local->call_cnt = conf->subvolume_cnt;
+ cnt = conf->subvolume_cnt;
+ ret = dht_inode_ctx_mdsvol_get(fd->inode, this, &mds_subvol);
+
+ if (!mds_subvol) {
+ gf_msg(this->name, GF_LOG_ERROR, 0,
+ DHT_MSG_HASHED_SUBVOL_GET_FAILED,
+ "cannot determine MDS, fetching xattr %s "
+ " randomly from a subvol for gfid %s ",
+ key, gfid);
+ } else {
+ /* TODO need to handle it, As of now we are
+ choosing availability instead of chossing
+ consistencty, in case of hashed_subvol is
+ down winding a getxattr call on other subvol
+ and return xattr
+ */
+ local->mds_subvol = mds_subvol;
+ for (i = 0; i < cnt; i++) {
+ if (conf->subvolumes[i] == mds_subvol) {
+ if (!conf->subvolume_status[i]) {
+ gf_msg(this->name, GF_LOG_WARNING, 0,
+ DHT_MSG_HASHED_SUBVOL_DOWN,
+ "MDS subvolume %s is down"
+ " for gfid %s so fetching xattr "
+ " %s randomly from a subvol ",
+ local->mds_subvol->name, gfid, key);
+ ret = 1;
+ }
+ }
+ }
}
- subvol = local->cached_subvol;
- if (!subvol) {
- gf_msg_debug (this->name, 0,
- "no cached subvolume for fd=%p", fd);
- op_errno = EINVAL;
- goto err;
- }
+ if (!ret && key && local->mds_subvol && dht_match_xattr(key)) {
+ STACK_WIND(frame, dht_mds_getxattr_cbk, local->mds_subvol,
+ local->mds_subvol->fops->fgetxattr, fd, key, NULL);
- layout = local->layout;
- if (!layout) {
- gf_msg_debug (this->name, 0,
- "no layout for fd=%p", fd);
- op_errno = EINVAL;
- goto err;
+ return 0;
}
- local->call_cnt = call_cnt = layout->cnt;
-
- if (IA_ISDIR (fd->inode->ia_type)) {
- for (i = 0; i < call_cnt; i++) {
- STACK_WIND (frame, dht_err_cbk,
- layout->list[i].xlator,
- layout->list[i].xlator->fops->fsetxattr,
- fd, xattr, flags, NULL);
- }
-
- } else {
-
- local->call_cnt = 1;
- local->rebalance.xattr = dict_ref (xattr);
- local->rebalance.flags = flags;
+ } else {
+ cnt = local->call_cnt = 1;
+ }
- xdata = xdata ? dict_ref (xdata) : dict_new ();
- if (xdata)
- ret = dict_set_dynstr_with_alloc (xdata,
- DHT_IATT_IN_XDATA_KEY, "yes");
- if (ret) {
- gf_msg_debug (this->name, 0,
- "Failed to set dictionary key %s for fd=%p",
- DHT_IATT_IN_XDATA_KEY, fd);
- }
-
- STACK_WIND (frame, dht_file_setxattr_cbk, subvol,
- subvol->fops->fsetxattr, fd, xattr, flags, xdata);
-
- if (xdata)
- dict_unref (xdata);
-
- }
- return 0;
+ for (i = 0; i < cnt; i++) {
+ subvol = layout->list[i].xlator;
+ STACK_WIND(frame, dht_getxattr_cbk, subvol, subvol->fops->fgetxattr, fd,
+ key, NULL);
+ }
+ return 0;
err:
- op_errno = (op_errno == -1) ? errno : op_errno;
- DHT_STACK_UNWIND (fsetxattr, frame, -1, op_errno, NULL);
+ op_errno = (op_errno == -1) ? errno : op_errno;
+ DHT_STACK_UNWIND(fgetxattr, frame, -1, op_errno, NULL, NULL);
- return 0;
+ return 0;
}
static int
-dht_common_setxattr_cbk (call_frame_t *frame, void *cookie,
- xlator_t *this, int32_t op_ret, int32_t op_errno,
- dict_t *xdata)
+dht_setxattr2(xlator_t *this, xlator_t *subvol, call_frame_t *frame, int ret)
{
- DHT_STACK_UNWIND (setxattr, frame, op_ret, op_errno, xdata);
+ dht_local_t *local = NULL;
+ int op_errno = EINVAL;
- return 0;
-}
+ if (!frame || !frame->local)
+ goto err;
+ local = frame->local;
+ op_errno = local->op_errno;
-int
-dht_checking_pathinfo_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
- int op_ret, int op_errno, dict_t *xattr,
- dict_t *xdata)
-{
- int i = -1;
- int ret = -1;
- char *value = NULL;
- dht_local_t *local = NULL;
- dht_conf_t *conf = NULL;
- call_frame_t *prev = NULL;
- int this_call_cnt = 0;
-
- local = frame->local;
- prev = cookie;
- conf = this->private;
-
- if (op_ret == -1)
- goto out;
+ if (we_are_not_migrating(ret)) {
+ /* This dht xlator is not migrating the file. Unwind and
+ * pass on the original mode bits so the higher DHT layer
+ * can handle this.
+ */
+ DHT_STACK_UNWIND(setxattr, frame, local->op_ret, local->op_errno,
+ local->rebalance.xdata);
+ return 0;
+ }
+ if (subvol == NULL)
+ goto err;
- ret = dict_get_str (xattr, GF_XATTR_PATHINFO_KEY, &value);
- if (ret)
- goto out;
+ local->call_cnt = 2; /* This is the second attempt */
- if (!strcmp (value, local->key)) {
- for (i = 0; i < conf->subvolume_cnt; i++) {
- if (conf->subvolumes[i] == prev->this)
- conf->decommissioned_bricks[i] = prev->this;
- }
- }
+ if (local->fop == GF_FOP_SETXATTR) {
+ STACK_WIND_COOKIE(frame, dht_file_setxattr_cbk, subvol, subvol,
+ subvol->fops->setxattr, &local->loc,
+ local->rebalance.xattr, local->rebalance.flags,
+ local->xattr_req);
+ } else {
+ STACK_WIND_COOKIE(frame, dht_file_setxattr_cbk, subvol, subvol,
+ subvol->fops->fsetxattr, local->fd,
+ local->rebalance.xattr, local->rebalance.flags,
+ local->xattr_req);
+ }
-out:
- this_call_cnt = dht_frame_return (frame);
- if (is_last_call (this_call_cnt)) {
- DHT_STACK_UNWIND (setxattr, frame, local->op_ret, ENOTSUP, NULL);
- }
- return 0;
+ return 0;
+err:
+ DHT_STACK_UNWIND(setxattr, frame, (local ? local->op_ret : -1), op_errno,
+ NULL);
+ return 0;
}
-
int
-dht_setxattr2 (xlator_t *this, xlator_t *subvol, call_frame_t *frame, int ret)
+dht_file_setxattr_cbk(call_frame_t *frame, void *cookie, xlator_t *this,
+ int op_ret, int op_errno, dict_t *xdata)
{
- dht_local_t *local = NULL;
- int op_errno = EINVAL;
-
- if (!frame || !frame->local)
- goto err;
-
- local = frame->local;
+ int ret = -1;
+ dht_local_t *local = NULL;
+ xlator_t *prev = NULL;
+ struct iatt *stbuf = NULL;
+ inode_t *inode = NULL;
+ xlator_t *subvol1 = NULL, *subvol2 = NULL;
- if (we_are_not_migrating (ret)) {
- /* This dht xlator is not migrating the file. Unwind and
- * pass on the original mode bits so the higher DHT layer
- * can handle this.
- */
- DHT_STACK_UNWIND (setxattr, frame, local->op_ret,
- local->op_errno, local->rebalance.xdata);
- return 0;
- }
+ local = frame->local;
+ prev = cookie;
- if (subvol == NULL)
- goto err;
+ local->op_errno = op_errno;
- op_errno = local->op_errno;
+ if ((local->fop == GF_FOP_FSETXATTR) &&
+ dht_check_remote_fd_failed_error(local, op_ret, op_errno)) {
+ ret = dht_check_and_open_fd_on_subvol(this, frame);
+ if (ret)
+ goto out;
+ return 0;
+ }
- local->call_cnt = 2; /* This is the second attempt */
+ if ((op_ret == -1) && !dht_inode_missing(op_errno)) {
+ gf_msg_debug(this->name, op_errno, "subvolume %s returned -1.",
+ prev->name);
+ goto out;
+ }
- if (local->fop == GF_FOP_SETXATTR) {
- STACK_WIND (frame, dht_file_setxattr_cbk, subvol,
- subvol->fops->setxattr, &local->loc,
- local->rebalance.xattr, local->rebalance.flags,
- NULL);
- } else {
- STACK_WIND (frame, dht_file_setxattr_cbk, subvol,
- subvol->fops->fsetxattr, local->fd,
- local->rebalance.xattr, local->rebalance.flags,
- NULL);
- }
+ if (local->call_cnt != 1)
+ goto out;
- return 0;
+ ret = dict_get_bin(xdata, DHT_IATT_IN_XDATA_KEY, (void **)&stbuf);
-err:
- DHT_STACK_UNWIND (setxattr, frame, local ? local->op_ret : -1, op_errno, NULL);
- return 0;
-}
+ if ((!op_ret) && !stbuf) {
+ goto out;
+ }
+ local->op_ret = op_ret;
+ local->rebalance.target_op_fn = dht_setxattr2;
+ if (xdata)
+ local->rebalance.xdata = dict_ref(xdata);
-int
-dht_setxattr (call_frame_t *frame, xlator_t *this,
- loc_t *loc, dict_t *xattr, int flags, dict_t *xdata)
-{
- xlator_t *subvol = NULL;
- dht_local_t *local = NULL;
- dht_conf_t *conf = NULL;
- dht_methods_t *methods = NULL;
- dht_layout_t *layout = NULL;
- int i = 0;
- int op_errno = EINVAL;
- int ret = -1;
- data_t *tmp = NULL;
- uint32_t dir_spread = 0;
- char value[4096] = {0,};
- gf_dht_migrate_data_type_t forced_rebalance = GF_DHT_MIGRATE_DATA;
- int call_cnt = 0;
- uint32_t new_hash = 0;
-
- VALIDATE_OR_GOTO (frame, err);
- VALIDATE_OR_GOTO (this, err);
- VALIDATE_OR_GOTO (loc, err);
- VALIDATE_OR_GOTO (loc->inode, err);
-
- conf = this->private;
- GF_VALIDATE_OR_GOTO (this->name, conf, err);
-
- methods = &(conf->methods);
-
- /* Rebalance daemon is allowed to set internal keys */
- if (!conf->defrag)
- GF_IF_INTERNAL_XATTR_GOTO (conf->wild_xattr_name, xattr,
- op_errno, err);
-
- local = dht_local_init (frame, loc, NULL, GF_FOP_SETXATTR);
- if (!local) {
- op_errno = ENOMEM;
- goto err;
- }
+ /* Phase 2 of migration */
+ if ((op_ret == -1) || IS_DHT_MIGRATION_PHASE2(stbuf)) {
+ ret = dht_rebalance_complete_check(this, frame);
+ if (!ret)
+ return 0;
+ }
- subvol = local->cached_subvol;
- if (!subvol) {
- gf_msg_debug (this->name, 0,
- "no cached subvolume for path=%s",
- loc->path);
- op_errno = EINVAL;
- goto err;
- }
+ /* Phase 1 of migration */
+ if (IS_DHT_MIGRATION_PHASE1(stbuf)) {
+ inode = (local->fd) ? local->fd->inode : local->loc.inode;
- layout = local->layout;
- if (!layout) {
- gf_msg_debug (this->name, 0,
- "no layout for path=%s", loc->path);
- op_errno = EINVAL;
- goto err;
+ ret = dht_inode_ctx_get_mig_info(this, inode, &subvol1, &subvol2);
+ if (!dht_mig_info_is_invalid(local->cached_subvol, subvol1, subvol2)) {
+ dht_setxattr2(this, subvol2, frame, 0);
+ return 0;
}
- local->call_cnt = call_cnt = layout->cnt;
+ ret = dht_rebalance_in_progress_check(this, frame);
+ if (!ret)
+ return 0;
+ }
- tmp = dict_get (xattr, GF_XATTR_FILE_MIGRATE_KEY);
- if (tmp) {
-
- if (IA_ISDIR (loc->inode->ia_type)) {
- op_errno = ENOTSUP;
- goto err;
- }
+out:
- /* TODO: need to interpret the 'value' for more meaning
- (ie, 'target' subvolume given there, etc) */
- memcpy (value, tmp->data, tmp->len);
- if (strcmp (value, "force") == 0)
- forced_rebalance =
- GF_DHT_MIGRATE_DATA_EVEN_IF_LINK_EXISTS;
+ if (local->fop == GF_FOP_SETXATTR) {
+ DHT_STACK_UNWIND(setxattr, frame, op_ret, op_errno, xdata);
+ } else {
+ DHT_STACK_UNWIND(fsetxattr, frame, op_ret, op_errno, xdata);
+ }
- if (conf->decommission_in_progress)
- forced_rebalance = GF_DHT_MIGRATE_HARDLINK;
+ return 0;
+}
- if (!loc->path) {
- op_errno = EINVAL;
- goto err;
- }
+/* Function is call by dict_foreach_fnmatch if key is match with
+ user.* and set boolean flag to true
+*/
+static int
+dht_is_user_xattr(dict_t *this, char *key, data_t *value, void *data)
+{
+ gf_boolean_t *user_xattr_found = data;
+ *user_xattr_found = _gf_true;
+ return 0;
+}
- if (!local->loc.name)
- local->loc.name = strrchr (local->loc.path, '/')+1;
+/* Common code to wind a (f)(set|remove)xattr call to set xattr on directory
+ */
+static int
+dht_dir_common_set_remove_xattr(call_frame_t *frame, xlator_t *this, loc_t *loc,
+ fd_t *fd, dict_t *xattr, int flags,
+ dict_t *xdata, int *op_errno)
- if (!local->loc.parent)
- local->loc.parent =
- inode_parent(local->loc.inode, NULL, NULL);
+{
+ dict_t *xattrop = NULL;
+ int32_t subone[1] = {-1};
+ gf_boolean_t uxattr_key_found = _gf_false;
+ xlator_t *mds_subvol = NULL;
+ xlator_t *travvol = NULL;
+ dht_conf_t *conf = NULL;
+ int ret = -1;
+ int i = 0;
+ int call_cnt = 0;
+ dht_local_t *local = NULL;
+ char gfid_local[GF_UUID_BUF_SIZE] = {0};
+ char **xattrs_to_heal;
+
+ conf = this->private;
+ local = frame->local;
+ call_cnt = conf->subvolume_cnt;
+ local->flags = flags;
+ xattrs_to_heal = get_xattrs_to_heal();
+
+ if (!gf_uuid_is_null(local->gfid)) {
+ gf_uuid_unparse(local->gfid, gfid_local);
+ }
+
+ if ((local->fop == GF_FOP_SETXATTR) || (local->fop == GF_FOP_FSETXATTR)) {
+ /* Check if any user xattr present in xattr
+ */
+ dict_foreach_fnmatch(xattr, "user*", dht_is_user_xattr,
+ &uxattr_key_found);
- if ((!local->loc.name) || (!local->loc.parent)) {
- op_errno = EINVAL;
- goto err;
+ /* Check if any custom key xattr present in dict xattr
+ and start index from 1 because user xattr already
+ checked in previous line
+ */
+ for (i = 1; xattrs_to_heal[i]; i++)
+ if (dict_get(xattr, xattrs_to_heal[i]))
+ uxattr_key_found = _gf_true;
+ }
+
+ if ((local->fop == GF_FOP_REMOVEXATTR) ||
+ (local->fop == GF_FOP_FREMOVEXATTR)) {
+ /* Check if any custom key xattr present in local->key
+ */
+ for (i = 0; xattrs_to_heal[i]; i++)
+ if (strstr(local->key, xattrs_to_heal[i]))
+ uxattr_key_found = _gf_true;
+ }
+
+ /* If there is no custom key xattr present or gfid is root
+ or call_cnt is 1 then wind a (f)setxattr call on all subvols
+ */
+ if (!uxattr_key_found || __is_root_gfid(local->gfid) || call_cnt == 1) {
+ for (i = 0; i < conf->subvolume_cnt; i++) {
+ travvol = conf->subvolumes[i];
+ if ((local->fop == GF_FOP_SETXATTR) ||
+ (local->fop == GF_FOP_FSETXATTR)) {
+ if (fd) {
+ STACK_WIND_COOKIE(frame, dht_err_cbk, travvol, travvol,
+ travvol->fops->fsetxattr, fd, xattr,
+ flags, xdata);
+ } else {
+ STACK_WIND_COOKIE(frame, dht_err_cbk, travvol, travvol,
+ travvol->fops->setxattr, loc, xattr,
+ flags, xdata);
}
+ }
- methods->migration_get_dst_subvol(this, local);
-
- if (!local->rebalance.target_node) {
- gf_msg (this->name, GF_LOG_ERROR, 0,
- DHT_MSG_HASHED_SUBVOL_GET_FAILED,
- "Failed to get hashed subvol for %s",
- loc->path);
- op_errno = EINVAL;
- goto err;
+ if ((local->fop == GF_FOP_REMOVEXATTR) ||
+ (local->fop == GF_FOP_FREMOVEXATTR)) {
+ if (fd) {
+ STACK_WIND_COOKIE(frame, dht_err_cbk, travvol, travvol,
+ travvol->fops->fremovexattr, fd,
+ local->key, local->xattr_req);
+ } else {
+ STACK_WIND_COOKIE(frame, dht_err_cbk, travvol, travvol,
+ travvol->fops->removexattr, loc,
+ local->key, local->xattr_req);
}
+ }
+ }
- local->rebalance.from_subvol = local->cached_subvol;
+ return 0;
+ }
- if (local->rebalance.target_node == local->rebalance.from_subvol) {
- op_errno = EEXIST;
- goto err;
- }
- if (local->rebalance.target_node) {
- local->flags = forced_rebalance;
-
- /* Flag to suggest its a tiering migration
- * The reason for this dic key-value is that
- * promotions and demotions are multithreaded
- * so the original frame from gf_defrag_start()
- * is not carried. A new frame will be created when
- * we do syncop_setxattr(). This doesnot have the
- * frame->root->pid of the original frame. So we pass
- * this dic key-value when we do syncop_setxattr() to do
- * data migration and set the frame->root->pid to
- * GF_CLIENT_PID_TIER_DEFRAG in dht_setxattr() just before
- * calling dht_start_rebalance_task() */
- tmp = dict_get (xattr, TIERING_MIGRATION_KEY);
- if (tmp)
- frame->root->pid = GF_CLIENT_PID_TIER_DEFRAG;
- else
- frame->root->pid = GF_CLIENT_PID_DEFRAG;
-
- ret = dht_start_rebalance_task (this, frame);
- if (!ret)
- return 0;
-
- gf_msg (this->name, GF_LOG_ERROR, 0,
- DHT_MSG_REBALANCE_START_FAILED,
- "%s: failed to create a new rebalance synctask",
- loc->path);
- }
- op_errno = EINVAL;
+ /* Calculate hash subvol based on inode and parent inode
+ */
+ if (fd) {
+ ret = dht_inode_ctx_mdsvol_get(fd->inode, this, &mds_subvol);
+ } else {
+ ret = dht_inode_ctx_mdsvol_get(loc->inode, this, &mds_subvol);
+ }
+ if (ret || !mds_subvol) {
+ if (fd) {
+ gf_msg(this->name, GF_LOG_ERROR, 0,
+ DHT_MSG_HASHED_SUBVOL_GET_FAILED,
+ "Failed to get mds subvol for fd %p"
+ "gfid is %s ",
+ fd, gfid_local);
+ } else {
+ gf_msg(this->name, GF_LOG_ERROR, 0,
+ DHT_MSG_HASHED_SUBVOL_GET_FAILED,
+ "%s: Failed to get mds subvol. (gfid is %s)", loc->path,
+ gfid_local);
+ }
+ (*op_errno) = ENOENT;
+ goto err;
+ }
+
+ local->mds_subvol = mds_subvol;
+
+ for (i = 0; i < conf->subvolume_cnt; i++) {
+ if (conf->subvolumes[i] == mds_subvol) {
+ if (!conf->subvolume_status[i]) {
+ gf_msg(this->name, GF_LOG_WARNING, 0,
+ DHT_MSG_HASHED_SUBVOL_DOWN,
+ "MDS subvol is down for path "
+ " %s gfid is %s Unable to set xattr ",
+ local->loc.path, gfid_local);
+ (*op_errno) = ENOTCONN;
goto err;
-
+ }
}
-
- tmp = dict_get (xattr, "decommission-brick");
- if (tmp) {
- /* This operation should happen only on '/' */
- if (!__is_root_gfid (loc->inode->gfid)) {
- op_errno = ENOTSUP;
- goto err;
- }
-
- memcpy (value, tmp->data, ((tmp->len < 4095) ? tmp->len : 4095));
- local->key = gf_strdup (value);
- local->call_cnt = conf->subvolume_cnt;
-
- for (i = 0 ; i < conf->subvolume_cnt; i++) {
- /* Get the pathinfo, and then compare */
- STACK_WIND (frame, dht_checking_pathinfo_cbk,
- conf->subvolumes[i],
- conf->subvolumes[i]->fops->getxattr,
- loc, GF_XATTR_PATHINFO_KEY, NULL);
- }
- return 0;
+ }
+
+ if (uxattr_key_found) {
+ xattrop = dict_new();
+ if (!xattrop) {
+ gf_msg(this->name, GF_LOG_ERROR, DHT_MSG_NO_MEMORY, 0,
+ "dictionary creation failed for path %s "
+ "for gfid is %s ",
+ local->loc.path, gfid_local);
+ (*op_errno) = ENOMEM;
+ goto err;
+ }
+ local->xattr = dict_ref(xattr);
+ /* Subtract current MDS xattr value to -1 , value of MDS
+ xattr represents no. of times xattr modification failed
+ on non MDS subvols.
+ */
+ ret = dht_dict_set_array(xattrop, conf->mds_xattr_key, subone, 1);
+ if (ret != 0) {
+ gf_msg(this->name, GF_LOG_ERROR, 0, DHT_MSG_DICT_SET_FAILED,
+ "dictionary set array failed for path %s "
+ "for gfid is %s ",
+ local->loc.path, gfid_local);
+ if (xattrop)
+ dict_unref(xattrop);
+ (*op_errno) = ret;
+ goto err;
+ }
+ /* Wind a xattrop call to use ref counting approach
+ update mds xattr to -1 before update xattr on
+ hashed subvol and update mds xattr to +1 after update
+ xattr on all non hashed subvol
+ */
+ if (fd) {
+ STACK_WIND(frame, dht_xattrop_mds_cbk, local->mds_subvol,
+ local->mds_subvol->fops->fxattrop, fd,
+ GF_XATTROP_ADD_ARRAY, xattrop, NULL);
+ } else {
+ STACK_WIND(frame, dht_xattrop_mds_cbk, local->mds_subvol,
+ local->mds_subvol->fops->xattrop, loc,
+ GF_XATTROP_ADD_ARRAY, xattrop, NULL);
}
+ if (xattrop)
+ dict_unref(xattrop);
+ }
- tmp = dict_get (xattr, GF_XATTR_FIX_LAYOUT_KEY);
- if (tmp) {
- ret = dict_get_uint32(xattr, "new-commit-hash", &new_hash);
- if (ret == 0) {
- gf_msg_debug (this->name, 0,
- "updating commit hash for %s from %u to %u",
- uuid_utoa(loc->gfid),
- layout->commit_hash, new_hash);
- layout->commit_hash = new_hash;
-
- ret = dht_update_commit_hash_for_layout (frame);
- if (ret) {
- op_errno = ENOTCONN;
- goto err;
- }
- return ret;
- }
+ return 0;
+err:
+ return -1;
+}
- gf_msg (this->name, GF_LOG_INFO, 0,
- DHT_MSG_FIX_LAYOUT_INFO,
- "fixing the layout of %s", loc->path);
+int
+dht_fsetxattr(call_frame_t *frame, xlator_t *this, fd_t *fd, dict_t *xattr,
+ int flags, dict_t *xdata)
+{
+ xlator_t *subvol = NULL;
+ dht_local_t *local = NULL;
+ int op_errno = EINVAL;
+ dht_conf_t *conf = NULL;
+ dht_layout_t *layout = NULL;
+ int ret = -1;
+ int call_cnt = 0;
+
+ VALIDATE_OR_GOTO(frame, err);
+ VALIDATE_OR_GOTO(this, err);
+ VALIDATE_OR_GOTO(fd, err);
+ VALIDATE_OR_GOTO(fd->inode, err);
+ VALIDATE_OR_GOTO(this->private, err);
+
+ conf = this->private;
+
+ if (!conf->defrag)
+ GF_IF_INTERNAL_XATTR_GOTO(conf->wild_xattr_name, xattr, op_errno, err);
+
+ local = dht_local_init(frame, NULL, fd, GF_FOP_FSETXATTR);
+ if (!local) {
+ op_errno = ENOMEM;
+ goto err;
+ }
+
+ subvol = local->cached_subvol;
+ if (!subvol) {
+ gf_msg_debug(this->name, 0, "no cached subvolume for fd=%p", fd);
+ op_errno = EINVAL;
+ goto err;
+ }
+
+ layout = local->layout;
+ if (!layout) {
+ gf_msg_debug(this->name, 0, "no layout for fd=%p", fd);
+ op_errno = EINVAL;
+ goto err;
+ }
+
+ local->xattr_req = xdata ? dict_ref(xdata) : dict_new();
+ local->call_cnt = call_cnt = layout->cnt;
+
+ if (IA_ISDIR(fd->inode->ia_type)) {
+ local->hashed_subvol = NULL;
+ ret = dht_dir_common_set_remove_xattr(frame, this, NULL, fd, xattr,
+ flags, xdata, &op_errno);
+ if (ret)
+ goto err;
+ } else {
+ local->call_cnt = 1;
+ local->rebalance.xattr = dict_ref(xattr);
+ local->rebalance.flags = flags;
- ret = dht_fix_directory_layout (frame, dht_common_setxattr_cbk,
- layout);
- if (ret) {
- op_errno = ENOTCONN;
- goto err;
- }
- return ret;
+ ret = dict_set_int8(local->xattr_req, DHT_IATT_IN_XDATA_KEY, 1);
+ if (ret) {
+ gf_msg_debug(this->name, 0,
+ "Failed to set dictionary key %s for fd=%p",
+ DHT_IATT_IN_XDATA_KEY, fd);
}
- tmp = dict_get (xattr, "distribute.directory-spread-count");
- if (tmp) {
- /* Setxattr value is packed as 'binary', not string */
- memcpy (value, tmp->data, ((tmp->len < 4095)?tmp->len:4095));
- ret = gf_string2uint32 (value, &dir_spread);
- if (!ret && ((dir_spread <= conf->subvolume_cnt) &&
- (dir_spread > 0))) {
- layout->spread_cnt = dir_spread;
-
- ret = dht_fix_directory_layout (frame,
- dht_common_setxattr_cbk,
- layout);
- if (ret) {
- op_errno = ENOTCONN;
- goto err;
- }
- return ret;
- }
- gf_msg (this->name, GF_LOG_ERROR, 0,
- DHT_MSG_OPERATION_NOT_SUP,
- "wrong 'directory-spread-count' value (%s)", value);
- op_errno = ENOTSUP;
- goto err;
- }
+ STACK_WIND_COOKIE(frame, dht_file_setxattr_cbk, subvol, subvol,
+ subvol->fops->fsetxattr, fd, xattr, flags,
+ local->xattr_req);
+ }
+ return 0;
- if (IA_ISDIR (loc->inode->ia_type)) {
+err:
+ op_errno = (op_errno == -1) ? errno : op_errno;
+ DHT_STACK_UNWIND(fsetxattr, frame, -1, op_errno, NULL);
- for (i = 0; i < call_cnt; i++) {
- STACK_WIND (frame, dht_err_cbk,
- layout->list[i].xlator,
- layout->list[i].xlator->fops->setxattr,
- loc, xattr, flags, xdata);
- }
+ return 0;
+}
- } else {
+static int
+dht_checking_pathinfo_cbk(call_frame_t *frame, void *cookie, xlator_t *this,
+ int op_ret, int op_errno, dict_t *xattr,
+ dict_t *xdata)
+{
+ int i = -1;
+ int ret = -1;
+ char *value = NULL;
+ dht_local_t *local = NULL;
+ dht_conf_t *conf = NULL;
+ xlator_t *prev = NULL;
+ int this_call_cnt = 0;
+
+ local = frame->local;
+ prev = cookie;
+ conf = this->private;
+
+ if (op_ret == -1)
+ goto out;
- local->rebalance.xattr = dict_ref (xattr);
- local->rebalance.flags = flags;
- local->call_cnt = 1;
+ ret = dict_get_str(xattr, GF_XATTR_PATHINFO_KEY, &value);
+ if (ret)
+ goto out;
- xdata = xdata ? dict_ref (xdata) : dict_new ();
- if (xdata)
- ret = dict_set_dynstr_with_alloc (xdata,
- DHT_IATT_IN_XDATA_KEY, "yes");
+ if (!strcmp(value, local->key)) {
+ for (i = 0; i < conf->subvolume_cnt; i++) {
+ if (conf->subvolumes[i] == prev)
+ conf->decommissioned_bricks[i] = prev;
+ }
+ }
- STACK_WIND (frame, dht_file_setxattr_cbk,
- subvol, subvol->fops->setxattr,
- loc, xattr, flags, xdata);
+out:
+ this_call_cnt = dht_frame_return(frame);
+ if (is_last_call(this_call_cnt)) {
+ DHT_STACK_UNWIND(setxattr, frame, local->op_ret, ENOTSUP, NULL);
+ }
+ return 0;
+}
- if (xdata)
- dict_unref (xdata);
- }
+static int
+dht_nuke_dir_cbk(call_frame_t *frame, void *cookie, xlator_t *this,
+ int32_t op_ret, int32_t op_errno, struct iatt *preparent,
+ struct iatt *postparent, dict_t *xdata)
+{
+ STACK_UNWIND_STRICT(setxattr, frame, op_ret, op_errno, NULL);
+ return 0;
+}
+static int
+dht_nuke_dir(call_frame_t *frame, xlator_t *this, loc_t *loc, data_t *tmp)
+{
+ if (!IA_ISDIR(loc->inode->ia_type)) {
+ DHT_STACK_UNWIND(setxattr, frame, -1, ENOTSUP, NULL);
return 0;
+ }
-err:
- op_errno = (op_errno == -1) ? errno : op_errno;
- DHT_STACK_UNWIND (setxattr, frame, -1, op_errno, NULL);
-
+ /* Setxattr didn't need the parent, but rmdir does. */
+ loc->parent = inode_parent(loc->inode, NULL, NULL);
+ if (!loc->parent) {
+ DHT_STACK_UNWIND(setxattr, frame, -1, ENOENT, NULL);
return 0;
-}
+ }
+ gf_uuid_copy(loc->pargfid, loc->parent->gfid);
+ if (!loc->name && loc->path) {
+ loc->name = strrchr(loc->path, '/');
+ if (loc->name) {
+ ++(loc->name);
+ }
+ }
+ /*
+ * We do this instead of calling dht_rmdir_do directly for two reasons.
+ * The first is that we want to reuse all of the initialization that
+ * dht_rmdir does, so if it ever changes we'll just follow along. The
+ * second (i.e. why we don't use STACK_WIND_TAIL) is so that we don't
+ * obscure the fact that we came in via this path instead of a genuine
+ * rmdir. That makes debugging just a tiny bit easier.
+ */
+ STACK_WIND(frame, dht_nuke_dir_cbk, this, this->fops->rmdir, loc, 1, NULL);
+ return 0;
+}
int
-dht_file_removexattr_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
- int op_ret, int op_errno, dict_t *xdata)
+dht_setxattr(call_frame_t *frame, xlator_t *this, loc_t *loc, dict_t *xattr,
+ int flags, dict_t *xdata)
{
- int ret = -1;
- dht_local_t *local = NULL;
- call_frame_t *prev = NULL;
- struct iatt *stbuf = NULL;
- inode_t *inode = NULL;
- xlator_t *subvol1 = NULL, *subvol2 = NULL;
+ xlator_t *subvol = NULL;
+ dht_local_t *local = NULL;
+ dht_conf_t *conf = NULL;
+ dht_methods_t *methods = NULL;
+ dht_layout_t *layout = NULL;
+ int i = 0;
+ int op_errno = EINVAL;
+ int ret = -1;
+ data_t *tmp = NULL;
+ uint32_t dir_spread = 0;
+ char value[4096] = {
+ 0,
+ };
+ gf_dht_migrate_data_type_t forced_rebalance = GF_DHT_MIGRATE_DATA;
+ int call_cnt = 0;
+ uint32_t new_hash = 0;
+
+ VALIDATE_OR_GOTO(frame, err);
+ VALIDATE_OR_GOTO(this, err);
+ VALIDATE_OR_GOTO(loc, err);
+ VALIDATE_OR_GOTO(loc->inode, err);
+
+ conf = this->private;
+ GF_VALIDATE_OR_GOTO(this->name, conf, err);
+
+ methods = &(conf->methods);
+
+ /* Rebalance daemon is allowed to set internal keys */
+ if (!conf->defrag)
+ GF_IF_INTERNAL_XATTR_GOTO(conf->wild_xattr_name, xattr, op_errno, err);
+
+ local = dht_local_init(frame, loc, NULL, GF_FOP_SETXATTR);
+ if (!local) {
+ op_errno = ENOMEM;
+ goto err;
+ }
+
+ subvol = local->cached_subvol;
+ if (!subvol) {
+ gf_msg_debug(this->name, 0, "no cached subvolume for path=%s",
+ loc->path);
+ op_errno = EINVAL;
+ goto err;
+ }
+
+ layout = local->layout;
+ if (!layout) {
+ gf_msg_debug(this->name, 0, "no layout for path=%s", loc->path);
+ op_errno = EINVAL;
+ goto err;
+ }
+
+ local->call_cnt = call_cnt = layout->cnt;
+ tmp = dict_get(xattr, conf->mds_xattr_key);
+ if (tmp) {
+ op_errno = ENOTSUP;
+ goto err;
+ }
+
+ tmp = dict_get(xattr, GF_XATTR_FILE_MIGRATE_KEY);
+ if (tmp) {
+ if (IA_ISDIR(loc->inode->ia_type)) {
+ op_errno = ENOTSUP;
+ goto err;
+ }
+
+ /* TODO: need to interpret the 'value' for more meaning
+ (ie, 'target' subvolume given there, etc) */
+ memcpy(value, tmp->data, tmp->len);
+ if (strcmp(value, "force") == 0)
+ forced_rebalance = GF_DHT_MIGRATE_DATA_EVEN_IF_LINK_EXISTS;
+
+ if (conf->decommission_in_progress)
+ forced_rebalance = GF_DHT_MIGRATE_HARDLINK;
+
+ if (!loc->path) {
+ op_errno = EINVAL;
+ goto err;
+ }
+
+ if (!local->loc.name)
+ local->loc.name = strrchr(local->loc.path, '/') + 1;
+
+ if (!local->loc.parent)
+ local->loc.parent = inode_parent(local->loc.inode, NULL, NULL);
+
+ if ((!local->loc.name) || (!local->loc.parent)) {
+ op_errno = EINVAL;
+ goto err;
+ }
+
+ if (gf_uuid_is_null(local->loc.pargfid))
+ gf_uuid_copy(local->loc.pargfid, local->loc.parent->gfid);
+
+ methods->migration_get_dst_subvol(this, local);
+
+ if (!local->rebalance.target_node) {
+ gf_msg(this->name, GF_LOG_ERROR, 0,
+ DHT_MSG_HASHED_SUBVOL_GET_FAILED,
+ "Failed to get hashed subvol for %s", loc->path);
+ op_errno = EINVAL;
+ goto err;
+ }
+
+ local->rebalance.from_subvol = local->cached_subvol;
+
+ if (local->rebalance.target_node == local->rebalance.from_subvol) {
+ op_errno = EEXIST;
+ goto err;
+ }
+ if (local->rebalance.target_node) {
+ local->flags = forced_rebalance;
- local = frame->local;
- prev = cookie;
+ frame->root->pid = GF_CLIENT_PID_DEFRAG;
- local->op_errno = op_errno;
+ ret = dht_start_rebalance_task(this, frame);
+ if (!ret)
+ return 0;
- if ((op_ret == -1) && !dht_inode_missing (op_errno)) {
- gf_msg_debug (this->name, op_errno,
- "subvolume %s returned -1",
- prev->this->name);
- goto out;
+ gf_msg(this->name, GF_LOG_ERROR, 0, DHT_MSG_REBALANCE_START_FAILED,
+ "%s: failed to create a new rebalance synctask", loc->path);
}
+ op_errno = EINVAL;
+ goto err;
+ }
- if (local->call_cnt != 1)
- goto out;
+ tmp = dict_get(xattr, "decommission-brick");
+ if (tmp) {
+ /* This operation should happen only on '/' */
+ if (!__is_root_gfid(loc->inode->gfid)) {
+ op_errno = ENOTSUP;
+ goto err;
+ }
- ret = dict_get_bin (xdata, DHT_IATT_IN_XDATA_KEY, (void **) &stbuf);
+ memcpy(value, tmp->data, min(tmp->len, 4095));
+ local->key = gf_strdup(value);
+ local->call_cnt = conf->subvolume_cnt;
- if ((!op_ret) && !stbuf) {
- goto out;
+ for (i = 0; i < conf->subvolume_cnt; i++) {
+ /* Get the pathinfo, and then compare */
+ STACK_WIND_COOKIE(frame, dht_checking_pathinfo_cbk,
+ conf->subvolumes[i], conf->subvolumes[i],
+ conf->subvolumes[i]->fops->getxattr, loc,
+ GF_XATTR_PATHINFO_KEY, NULL);
}
+ return 0;
+ }
- local->op_ret = 0;
+ tmp = dict_get(xattr, GF_XATTR_FIX_LAYOUT_KEY);
+ if (tmp) {
+ ret = dict_get_uint32(xattr, "new-commit-hash", &new_hash);
+ if (ret == 0) {
+ gf_msg_debug(this->name, 0,
+ "updating commit hash for %s from %u to %u",
+ uuid_utoa(loc->gfid), layout->commit_hash, new_hash);
+ layout->commit_hash = new_hash;
+
+ ret = dht_update_commit_hash_for_layout(frame);
+ if (ret) {
+ op_errno = ENOTCONN;
+ goto err;
+ }
+ return ret;
+ }
- local->rebalance.target_op_fn = dht_removexattr2;
- if (xdata)
- local->rebalance.xdata = dict_ref (xdata);
+ gf_msg(this->name, GF_LOG_INFO, 0, DHT_MSG_FIX_LAYOUT_INFO,
+ "fixing the layout of %s", loc->path);
- /* Phase 2 of migration */
- if ((op_ret == -1) || IS_DHT_MIGRATION_PHASE2 (stbuf)) {
- ret = dht_rebalance_complete_check (this, frame);
- if (!ret)
- return 0;
+ ret = dht_fix_directory_layout(frame, dht_fix_layout_setxattr_cbk,
+ layout);
+ if (ret) {
+ op_errno = ENOTCONN;
+ goto err;
}
+ return ret;
+ }
+
+ tmp = dict_get(xattr, "distribute.directory-spread-count");
+ if (tmp) {
+ /* Setxattr value is packed as 'binary', not string */
+ memcpy(value, tmp->data, min(tmp->len, 4095));
+ ret = gf_string2uint32(value, &dir_spread);
+ if (!ret && ((dir_spread <= conf->subvolume_cnt) && (dir_spread > 0))) {
+ layout->spread_cnt = dir_spread;
+
+ ret = dht_fix_directory_layout(frame, dht_common_setxattr_cbk,
+ layout);
+ if (ret) {
+ op_errno = ENOTCONN;
+ goto err;
+ }
+ return ret;
+ }
+ gf_msg(this->name, GF_LOG_ERROR, 0, DHT_MSG_OPERATION_NOT_SUP,
+ "wrong 'directory-spread-count' value (%s)", value);
+ op_errno = ENOTSUP;
+ goto err;
+ }
+
+ tmp = dict_get(xattr, "glusterfs.dht.nuke");
+ if (tmp) {
+ return dht_nuke_dir(frame, this, loc, tmp);
+ }
+ local->xattr_req = xdata ? dict_ref(xdata) : dict_new();
+
+ if (IA_ISDIR(loc->inode->ia_type)) {
+ local->hashed_subvol = NULL;
+ ret = dht_dir_common_set_remove_xattr(frame, this, loc, NULL, xattr,
+ flags, xdata, &op_errno);
+ if (ret)
+ goto err;
+ } else {
+ local->rebalance.xattr = dict_ref(xattr);
+ local->rebalance.flags = flags;
+ local->call_cnt = 1;
- /* Phase 1 of migration */
- if (IS_DHT_MIGRATION_PHASE1 (stbuf)) {
- inode = (local->fd) ? local->fd->inode : local->loc.inode;
+ ret = dict_set_int8(local->xattr_req, DHT_IATT_IN_XDATA_KEY, 1);
- ret = dht_inode_ctx_get_mig_info (this, inode,
- &subvol1, &subvol2);
- if (!dht_mig_info_is_invalid (local->cached_subvol,
- subvol1, subvol2)) {
- dht_removexattr2 (this, subvol2, frame, 0);
- return 0;
- }
+ STACK_WIND_COOKIE(frame, dht_file_setxattr_cbk, subvol, subvol,
+ subvol->fops->setxattr, loc, xattr, flags,
+ local->xattr_req);
+ }
- ret = dht_rebalance_in_progress_check (this, frame);
- if (!ret)
- return 0;
- }
+ return 0;
-out:
- if (local->fop == GF_FOP_REMOVEXATTR) {
- DHT_STACK_UNWIND (removexattr, frame, op_ret, op_errno, NULL);
- } else {
- DHT_STACK_UNWIND (fremovexattr, frame, op_ret, op_errno, NULL);
- }
- return 0;
+err:
+ op_errno = (op_errno == -1) ? errno : op_errno;
+ DHT_STACK_UNWIND(setxattr, frame, -1, op_errno, NULL);
+ return 0;
}
-int
-dht_removexattr2 (xlator_t *this, xlator_t *subvol, call_frame_t *frame,
- int ret)
+static int
+dht_removexattr2(xlator_t *this, xlator_t *subvol, call_frame_t *frame, int ret)
{
- dht_local_t *local = NULL;
- int op_errno = EINVAL;
+ dht_local_t *local = NULL;
+ int op_errno = EINVAL;
- if (!frame || !frame->local || !subvol)
- goto err;
+ if (!frame || !frame->local)
+ goto err;
- local = frame->local;
+ local = frame->local;
+ op_errno = local->op_errno;
- local->call_cnt = 2; /* This is the second attempt */
+ local->call_cnt = 2; /* This is the second attempt */
- if (we_are_not_migrating (ret)) {
+ if (we_are_not_migrating(ret)) {
+ /* This dht xlator is not migrating the file. Unwind and
+ * pass on the original mode bits so the higher DHT layer
+ * can handle this.
+ */
+ DHT_STACK_UNWIND(removexattr, frame, local->op_ret, local->op_errno,
+ local->rebalance.xdata);
+ return 0;
+ }
- /* This dht xlator is not migrating the file. Unwind and
- * pass on the original mode bits so the higher DHT layer
- * can handle this.
- */
- DHT_STACK_UNWIND (removexattr, frame, local->op_ret,
- local->op_errno, local->rebalance.xdata);
- return 0;
- }
+ if (subvol == NULL)
+ goto err;
- if (local->fop == GF_FOP_REMOVEXATTR) {
- STACK_WIND (frame, dht_file_removexattr_cbk, subvol,
- subvol->fops->removexattr, &local->loc,
- local->key, NULL);
- } else {
- STACK_WIND (frame, dht_file_removexattr_cbk, subvol,
- subvol->fops->fremovexattr, local->fd,
- local->key, NULL);
- }
+ if (local->fop == GF_FOP_REMOVEXATTR) {
+ STACK_WIND_COOKIE(frame, dht_file_removexattr_cbk, subvol, subvol,
+ subvol->fops->removexattr, &local->loc, local->key,
+ local->xattr_req);
+ } else {
+ STACK_WIND_COOKIE(frame, dht_file_removexattr_cbk, subvol, subvol,
+ subvol->fops->fremovexattr, local->fd, local->key,
+ local->xattr_req);
+ }
- return 0;
+ return 0;
err:
- DHT_STACK_UNWIND (removexattr, frame, -1, op_errno, NULL);
- return 0;
+ DHT_STACK_UNWIND(removexattr, frame, -1, op_errno, NULL);
+ return 0;
}
-
int
-dht_removexattr_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
- int op_ret, int op_errno, dict_t *xdata)
+dht_file_removexattr_cbk(call_frame_t *frame, void *cookie, xlator_t *this,
+ int op_ret, int op_errno, dict_t *xdata)
{
- dht_local_t *local = NULL;
- int this_call_cnt = 0;
- call_frame_t *prev = NULL;
-
- local = frame->local;
- prev = cookie;
-
- LOCK (&frame->lock);
- {
- if (op_ret == -1) {
- local->op_errno = op_errno;
- gf_msg_debug (this->name, op_errno,
- "subvolume %s returned -1",
- prev->this->name);
- goto unlock;
- }
-
- local->op_ret = 0;
- }
-unlock:
- UNLOCK (&frame->lock);
+ int ret = -1;
+ dht_local_t *local = NULL;
+ xlator_t *prev = NULL;
+ struct iatt *stbuf = NULL;
+ inode_t *inode = NULL;
+ xlator_t *subvol1 = NULL, *subvol2 = NULL;
+ local = frame->local;
+ prev = cookie;
+ local->op_errno = op_errno;
- this_call_cnt = dht_frame_return (frame);
- if (is_last_call (this_call_cnt)) {
- DHT_STACK_UNWIND (removexattr, frame, local->op_ret,
- local->op_errno, NULL);
- }
-
+ if ((local->fop == GF_FOP_FREMOVEXATTR) &&
+ dht_check_remote_fd_failed_error(local, op_ret, op_errno)) {
+ ret = dht_check_and_open_fd_on_subvol(this, frame);
+ if (ret)
+ goto out;
return 0;
-}
+ }
+ if ((op_ret == -1) && !dht_inode_missing(op_errno)) {
+ gf_msg_debug(this->name, op_errno, "subvolume %s returned -1",
+ prev->name);
+ goto out;
+ }
-int
-dht_removexattr (call_frame_t *frame, xlator_t *this,
- loc_t *loc, const char *key, dict_t *xdata)
-{
- xlator_t *subvol = NULL;
- int op_errno = -1;
- dht_local_t *local = NULL;
- dht_layout_t *layout = NULL;
- int call_cnt = 0;
- dht_conf_t *conf = NULL;
- int i;
- int ret = 0;
+ if (local->call_cnt != 1)
+ goto out;
- VALIDATE_OR_GOTO (this, err);
- VALIDATE_OR_GOTO (this->private, err);
+ ret = dict_get_bin(xdata, DHT_IATT_IN_XDATA_KEY, (void **)&stbuf);
- conf = this->private;
+ if ((!op_ret) && !stbuf) {
+ goto out;
+ }
- GF_IF_NATIVE_XATTR_GOTO (conf->wild_xattr_name, key, op_errno, err);
+ local->op_ret = 0;
- VALIDATE_OR_GOTO (frame, err);
- VALIDATE_OR_GOTO (loc, err);
- VALIDATE_OR_GOTO (loc->inode, err);
+ local->rebalance.target_op_fn = dht_removexattr2;
+ if (xdata)
+ local->rebalance.xdata = dict_ref(xdata);
- local = dht_local_init (frame, loc, NULL, GF_FOP_REMOVEXATTR);
- if (!local) {
- op_errno = ENOMEM;
- goto err;
- }
+ /* Phase 2 of migration */
+ if ((op_ret == -1) || IS_DHT_MIGRATION_PHASE2(stbuf)) {
+ ret = dht_rebalance_complete_check(this, frame);
+ if (!ret)
+ return 0;
+ }
- subvol = local->cached_subvol;
- if (!subvol) {
- gf_msg_debug (this->name, 0,
- "no cached subvolume for path=%s", loc->path);
- op_errno = EINVAL;
- goto err;
- }
+ /* Phase 1 of migration */
+ if (IS_DHT_MIGRATION_PHASE1(stbuf)) {
+ inode = (local->fd) ? local->fd->inode : local->loc.inode;
- layout = local->layout;
- if (!local->layout) {
- gf_msg_debug (this->name, 0,
- "no layout for path=%s", loc->path);
- op_errno = EINVAL;
- goto err;
+ ret = dht_inode_ctx_get_mig_info(this, inode, &subvol1, &subvol2);
+ if (!dht_mig_info_is_invalid(local->cached_subvol, subvol1, subvol2)) {
+ dht_removexattr2(this, subvol2, frame, 0);
+ return 0;
}
- local->call_cnt = call_cnt = layout->cnt;
- local->key = gf_strdup (key);
-
- if (IA_ISDIR (loc->inode->ia_type)) {
- for (i = 0; i < call_cnt; i++) {
- STACK_WIND (frame, dht_removexattr_cbk,
- layout->list[i].xlator,
- layout->list[i].xlator->fops->removexattr,
- loc, key, NULL);
- }
+ ret = dht_rebalance_in_progress_check(this, frame);
+ if (!ret)
+ return 0;
+ }
- } else {
-
- local->call_cnt = 1;
- xdata = xdata ? dict_ref (xdata) : dict_new ();
- if (xdata)
- ret = dict_set_dynstr_with_alloc (xdata,
- DHT_IATT_IN_XDATA_KEY, "yes");
- if (ret) {
- gf_msg (this->name, GF_LOG_ERROR, ENOMEM,
- DHT_MSG_DICT_SET_FAILED, "Failed to "
- "set dictionary key %s for %s",
- DHT_IATT_IN_XDATA_KEY, loc->path);
- }
+out:
+ if (local->fop == GF_FOP_REMOVEXATTR) {
+ DHT_STACK_UNWIND(removexattr, frame, op_ret, op_errno, xdata);
+ } else {
+ DHT_STACK_UNWIND(fremovexattr, frame, op_ret, op_errno, xdata);
+ }
+ return 0;
+}
- STACK_WIND (frame, dht_file_removexattr_cbk,
- subvol, subvol->fops->removexattr,
- loc, key, xdata);
+int
+dht_removexattr(call_frame_t *frame, xlator_t *this, loc_t *loc,
+ const char *key, dict_t *xdata)
+{
+ xlator_t *subvol = NULL;
+ int op_errno = -1;
+ dht_local_t *local = NULL;
+ dht_layout_t *layout = NULL;
+ int call_cnt = 0;
+ dht_conf_t *conf = NULL;
+ int ret = 0;
+
+ VALIDATE_OR_GOTO(this, err);
+ VALIDATE_OR_GOTO(this->private, err);
+
+ conf = this->private;
+
+ GF_IF_NATIVE_XATTR_GOTO(conf->wild_xattr_name, key, op_errno, err);
+
+ VALIDATE_OR_GOTO(frame, err);
+ VALIDATE_OR_GOTO(loc, err);
+ VALIDATE_OR_GOTO(loc->inode, err);
+
+ local = dht_local_init(frame, loc, NULL, GF_FOP_REMOVEXATTR);
+ if (!local) {
+ op_errno = ENOMEM;
+ goto err;
+ }
+
+ subvol = local->cached_subvol;
+ if (!subvol) {
+ gf_msg_debug(this->name, 0, "no cached subvolume for path=%s",
+ loc->path);
+ op_errno = EINVAL;
+ goto err;
+ }
+
+ layout = local->layout;
+ if (!local->layout) {
+ gf_msg_debug(this->name, 0, "no layout for path=%s", loc->path);
+ op_errno = EINVAL;
+ goto err;
+ }
+ local->xattr_req = (xdata) ? dict_ref(xdata) : dict_new();
+
+ local->call_cnt = call_cnt = layout->cnt;
+ local->key = gf_strdup(key);
+
+ if (key && (strncmp(key, conf->mds_xattr_key, strlen(key)) == 0)) {
+ op_errno = ENOTSUP;
+ goto err;
+ }
+
+ if (IA_ISDIR(loc->inode->ia_type)) {
+ local->hashed_subvol = NULL;
+ ret = dht_dir_common_set_remove_xattr(frame, this, loc, NULL, NULL, 0,
+ local->xattr_req, &op_errno);
+ if (ret)
+ goto err;
- if (xdata)
- dict_unref (xdata);
+ } else {
+ local->call_cnt = 1;
+ ret = dict_set_int8(local->xattr_req, DHT_IATT_IN_XDATA_KEY, 1);
+ if (ret) {
+ gf_msg(this->name, GF_LOG_ERROR, ENOMEM, DHT_MSG_DICT_SET_FAILED,
+ "Failed to "
+ "set dictionary key %s for %s",
+ DHT_IATT_IN_XDATA_KEY, loc->path);
}
- return 0;
+ STACK_WIND_COOKIE(frame, dht_file_removexattr_cbk, subvol, subvol,
+ subvol->fops->removexattr, loc, key,
+ local->xattr_req);
+ }
+
+ return 0;
err:
- op_errno = (op_errno == -1) ? errno : op_errno;
- DHT_STACK_UNWIND (removexattr, frame, -1, op_errno, NULL);
+ op_errno = (op_errno == -1) ? errno : op_errno;
+ DHT_STACK_UNWIND(removexattr, frame, -1, op_errno, NULL);
- return 0;
+ return 0;
}
int
-dht_fremovexattr (call_frame_t *frame, xlator_t *this,
- fd_t *fd, const char *key, dict_t *xdata)
+dht_fremovexattr(call_frame_t *frame, xlator_t *this, fd_t *fd, const char *key,
+ dict_t *xdata)
{
- xlator_t *subvol = NULL;
- int op_errno = -1;
- dht_local_t *local = NULL;
- dht_layout_t *layout = NULL;
- int call_cnt = 0;
- dht_conf_t *conf = 0;
- int ret = 0;
-
- int i;
-
- VALIDATE_OR_GOTO (this, err);
- VALIDATE_OR_GOTO (this->private, err);
-
- conf = this->private;
-
- GF_IF_NATIVE_XATTR_GOTO (conf->wild_xattr_name, key, op_errno, err);
-
- VALIDATE_OR_GOTO (frame, err);
-
- local = dht_local_init (frame, NULL, fd, GF_FOP_FREMOVEXATTR);
- if (!local) {
- op_errno = ENOMEM;
- goto err;
- }
-
- subvol = local->cached_subvol;
- if (!subvol) {
- gf_msg_debug (this->name, 0,
- "no cached subvolume for inode=%s",
- uuid_utoa (fd->inode->gfid));
- op_errno = EINVAL;
- goto err;
- }
+ xlator_t *subvol = NULL;
+ int op_errno = -1;
+ dht_local_t *local = NULL;
+ dht_layout_t *layout = NULL;
+ int call_cnt = 0;
+ dht_conf_t *conf = 0;
+ int ret = 0;
+
+ VALIDATE_OR_GOTO(this, err);
+ VALIDATE_OR_GOTO(this->private, err);
+
+ conf = this->private;
+
+ GF_IF_NATIVE_XATTR_GOTO(conf->wild_xattr_name, key, op_errno, err);
+
+ VALIDATE_OR_GOTO(frame, err);
+
+ local = dht_local_init(frame, NULL, fd, GF_FOP_FREMOVEXATTR);
+ if (!local) {
+ op_errno = ENOMEM;
+ goto err;
+ }
+
+ subvol = local->cached_subvol;
+ if (!subvol) {
+ gf_msg_debug(this->name, 0, "no cached subvolume for inode=%s",
+ uuid_utoa(fd->inode->gfid));
+ op_errno = EINVAL;
+ goto err;
+ }
+
+ layout = local->layout;
+ if (!local->layout) {
+ gf_msg_debug(this->name, 0, "no layout for inode=%s",
+ uuid_utoa(fd->inode->gfid));
+ op_errno = EINVAL;
+ goto err;
+ }
+ local->xattr_req = xdata ? dict_ref(xdata) : dict_new();
+
+ local->call_cnt = call_cnt = layout->cnt;
+ local->key = gf_strdup(key);
+
+ if (IA_ISDIR(fd->inode->ia_type)) {
+ local->hashed_subvol = NULL;
+ ret = dht_dir_common_set_remove_xattr(frame, this, NULL, fd, NULL, 0,
+ local->xattr_req, &op_errno);
+ if (ret)
+ goto err;
- layout = local->layout;
- if (!local->layout) {
- gf_msg_debug (this->name, 0,
- "no layout for inode=%s",
- uuid_utoa (fd->inode->gfid));
- op_errno = EINVAL;
- goto err;
+ } else {
+ local->call_cnt = 1;
+ ret = dict_set_int8(local->xattr_req, DHT_IATT_IN_XDATA_KEY, 1);
+ if (ret) {
+ gf_msg(this->name, GF_LOG_ERROR, ENOMEM, DHT_MSG_DICT_SET_FAILED,
+ "Failed to "
+ "set dictionary key %s for fd=%p",
+ DHT_IATT_IN_XDATA_KEY, fd);
}
- local->call_cnt = call_cnt = layout->cnt;
- local->key = gf_strdup (key);
-
- if (IA_ISDIR (fd->inode->ia_type)) {
- for (i = 0; i < call_cnt; i++) {
- STACK_WIND (frame, dht_removexattr_cbk,
- layout->list[i].xlator,
- layout->list[i].xlator->fops->fremovexattr,
- fd, key, NULL);
- }
-
- } else {
-
- local->call_cnt = 1;
- xdata = xdata ? dict_ref (xdata) : dict_new ();
- if (xdata)
- ret = dict_set_dynstr_with_alloc (xdata,
- DHT_IATT_IN_XDATA_KEY, "yes");
- if (ret) {
- gf_msg (this->name, GF_LOG_ERROR, ENOMEM,
- DHT_MSG_DICT_SET_FAILED, "Failed to "
- "set dictionary key %s for fd=%p",
- DHT_IATT_IN_XDATA_KEY, fd);
- }
-
- STACK_WIND (frame, dht_file_removexattr_cbk,
- subvol, subvol->fops->fremovexattr,
- fd, key, xdata);
-
- if (xdata)
- dict_unref (xdata);
- }
+ STACK_WIND_COOKIE(frame, dht_file_removexattr_cbk, subvol, subvol,
+ subvol->fops->fremovexattr, fd, key,
+ local->xattr_req);
+ }
- return 0;
+ return 0;
err:
- op_errno = (op_errno == -1) ? errno : op_errno;
- DHT_STACK_UNWIND (fremovexattr, frame, -1, op_errno, NULL);
+ op_errno = (op_errno == -1) ? errno : op_errno;
+ DHT_STACK_UNWIND(fremovexattr, frame, -1, op_errno, NULL);
- return 0;
+ return 0;
}
-
int
-dht_fd_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
- int op_ret, int op_errno, fd_t *fd, dict_t *xdata)
+dht_fd_cbk(call_frame_t *frame, void *cookie, xlator_t *this, int op_ret,
+ int op_errno, fd_t *fd, dict_t *xdata)
{
- dht_local_t *local = NULL;
- int this_call_cnt = 0;
- call_frame_t *prev = NULL;
+ dht_local_t *local = NULL;
+ int this_call_cnt = 0;
+ xlator_t *prev = NULL;
- local = frame->local;
- prev = cookie;
-
- LOCK (&frame->lock);
- {
- if (op_ret == -1) {
- local->op_errno = op_errno;
- gf_msg_debug (this->name, op_errno,
- "subvolume %s returned -1",
- prev->this->name);
- goto unlock;
- }
+ local = frame->local;
+ prev = cookie;
- local->op_ret = 0;
+ LOCK(&frame->lock);
+ {
+ if (op_ret == -1) {
+ local->op_errno = op_errno;
+ UNLOCK(&frame->lock);
+ gf_msg_debug(this->name, op_errno, "subvolume %s returned -1",
+ prev->name);
+ goto post_unlock;
}
-unlock:
- UNLOCK (&frame->lock);
- this_call_cnt = dht_frame_return (frame);
- if (is_last_call (this_call_cnt))
- DHT_STACK_UNWIND (open, frame, local->op_ret, local->op_errno,
- local->fd, NULL);
-
- return 0;
+ local->op_ret = 0;
+ }
+ UNLOCK(&frame->lock);
+post_unlock:
+ this_call_cnt = dht_frame_return(frame);
+ if (is_last_call(this_call_cnt))
+ DHT_STACK_UNWIND(open, frame, local->op_ret, local->op_errno, local->fd,
+ NULL);
+
+ return 0;
}
/*
* dht_normalize_stats -
*/
static void
-dht_normalize_stats (struct statvfs *buf, unsigned long bsize,
- unsigned long frsize)
+dht_normalize_stats(struct statvfs *buf, unsigned long bsize,
+ unsigned long frsize)
{
- double factor = 0;
+ double factor = 0;
+
+ if (buf->f_bsize != bsize) {
+ buf->f_bsize = bsize;
+ }
+
+ if (buf->f_frsize != frsize) {
+ factor = ((double)buf->f_frsize) / frsize;
+ buf->f_frsize = frsize;
+ buf->f_blocks = (fsblkcnt_t)(factor * buf->f_blocks);
+ buf->f_bfree = (fsblkcnt_t)(factor * buf->f_bfree);
+ buf->f_bavail = (fsblkcnt_t)(factor * buf->f_bavail);
+ }
+}
- if (buf->f_bsize != bsize) {
- buf->f_bsize = bsize;
+static int
+dht_statfs_cbk(call_frame_t *frame, void *cookie, xlator_t *this, int op_ret,
+ int op_errno, struct statvfs *statvfs, dict_t *xdata)
+{
+ gf_boolean_t event = _gf_false;
+ qdstatfs_action_t action = qdstatfs_action_OFF;
+ dht_local_t *local = NULL;
+ int this_call_cnt = 0;
+ int bsize = 0;
+ int frsize = 0;
+ GF_UNUSED int ret = 0;
+ unsigned long new_usage = 0;
+ unsigned long cur_usage = 0;
+
+ local = frame->local;
+ GF_ASSERT(local);
+
+ if (xdata)
+ ret = dict_get_int8(xdata, "quota-deem-statfs", (int8_t *)&event);
+
+ LOCK(&frame->lock);
+ {
+ if (op_ret == -1) {
+ local->op_errno = op_errno;
+ goto unlock;
}
-
- if (buf->f_frsize != frsize) {
- factor = ((double) buf->f_frsize) / frsize;
- buf->f_frsize = frsize;
- buf->f_blocks = (fsblkcnt_t) (factor * buf->f_blocks);
- buf->f_bfree = (fsblkcnt_t) (factor * buf->f_bfree);
- buf->f_bavail = (fsblkcnt_t) (factor * buf->f_bavail);
-
+ if (!statvfs) {
+ op_errno = EINVAL;
+ local->op_ret = -1;
+ goto unlock;
}
-}
+ local->op_ret = 0;
-int
-dht_statfs_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
- int op_ret, int op_errno, struct statvfs *statvfs,
- dict_t *xdata)
-{
+ if (local->quota_deem_statfs) {
+ if (event == _gf_true) {
+ action = qdstatfs_action_COMPARE;
+ } else {
+ action = qdstatfs_action_NEGLECT;
+ }
+ } else {
+ if (event == _gf_true) {
+ action = qdstatfs_action_REPLACE;
+ local->quota_deem_statfs = _gf_true;
+ }
+ }
- gf_boolean_t event = _gf_false;
- qdstatfs_action_t action = qdstatfs_action_OFF;
- dht_local_t * local = NULL;
- int this_call_cnt = 0;
- int bsize = 0;
- int frsize = 0;
- GF_UNUSED int ret = 0;
- unsigned long new_usage = 0;
- unsigned long cur_usage = 0;
+ if (local->quota_deem_statfs) {
+ switch (action) {
+ case qdstatfs_action_NEGLECT:
+ goto unlock;
- local = frame->local;
- GF_ASSERT (local);
+ case qdstatfs_action_REPLACE:
+ local->statvfs = *statvfs;
+ goto unlock;
- if (xdata)
- ret = dict_get_int8 (xdata, "quota-deem-statfs",
- (int8_t *)&event);
-
- LOCK (&frame->lock);
- {
- if (op_ret == -1) {
- local->op_errno = op_errno;
- goto unlock;
- }
- if (!statvfs) {
- op_errno = EINVAL;
- local->op_ret = -1;
- goto unlock;
- }
- local->op_ret = 0;
+ case qdstatfs_action_COMPARE:
+ new_usage = statvfs->f_blocks - statvfs->f_bfree;
+ cur_usage = local->statvfs.f_blocks -
+ local->statvfs.f_bfree;
- switch (local->quota_deem_statfs) {
- case _gf_true:
- if (event == _gf_true)
- action = qdstatfs_action_COMPARE;
- else
- action = qdstatfs_action_NEGLECT;
- break;
-
- case _gf_false:
- if (event == _gf_true) {
- action = qdstatfs_action_REPLACE;
- local->quota_deem_statfs = _gf_true;
- }
- break;
+ /* Take the max of the usage from subvols */
+ if (new_usage >= cur_usage)
+ local->statvfs = *statvfs;
+ goto unlock;
default:
- gf_msg (this->name, GF_LOG_ERROR, 0,
- DHT_MSG_INVALID_VALUE,
- "Encountered third "
- "value for boolean variable %d",
- local->quota_deem_statfs);
- break;
- }
-
- if (local->quota_deem_statfs) {
- switch (action) {
- case qdstatfs_action_NEGLECT:
- goto unlock;
-
- case qdstatfs_action_REPLACE:
- local->statvfs = *statvfs;
- goto unlock;
-
- case qdstatfs_action_COMPARE:
- new_usage = statvfs->f_blocks -
- statvfs->f_bfree;
- cur_usage = local->statvfs.f_blocks -
- local->statvfs.f_bfree;
-
- /* Take the max of the usage from subvols */
- if (new_usage >= cur_usage)
- local->statvfs = *statvfs;
- goto unlock;
-
- default:
- break;
- }
- }
-
- if (local->statvfs.f_bsize != 0) {
- bsize = max(local->statvfs.f_bsize, statvfs->f_bsize);
- frsize = max(local->statvfs.f_frsize, statvfs->f_frsize);
- dht_normalize_stats(&local->statvfs, bsize, frsize);
- dht_normalize_stats(statvfs, bsize, frsize);
- } else {
- local->statvfs.f_bsize = statvfs->f_bsize;
- local->statvfs.f_frsize = statvfs->f_frsize;
- }
-
- local->statvfs.f_blocks += statvfs->f_blocks;
- local->statvfs.f_bfree += statvfs->f_bfree;
- local->statvfs.f_bavail += statvfs->f_bavail;
- local->statvfs.f_files += statvfs->f_files;
- local->statvfs.f_ffree += statvfs->f_ffree;
- local->statvfs.f_favail += statvfs->f_favail;
- local->statvfs.f_fsid = statvfs->f_fsid;
- local->statvfs.f_flag = statvfs->f_flag;
- local->statvfs.f_namemax = statvfs->f_namemax;
-
-
+ break;
+ }
}
-unlock:
- UNLOCK (&frame->lock);
+ if (local->statvfs.f_bsize != 0) {
+ bsize = max(local->statvfs.f_bsize, statvfs->f_bsize);
+ frsize = max(local->statvfs.f_frsize, statvfs->f_frsize);
+ dht_normalize_stats(&local->statvfs, bsize, frsize);
+ dht_normalize_stats(statvfs, bsize, frsize);
+ } else {
+ local->statvfs.f_bsize = statvfs->f_bsize;
+ local->statvfs.f_frsize = statvfs->f_frsize;
+ }
+
+ local->statvfs.f_blocks += statvfs->f_blocks;
+ local->statvfs.f_bfree += statvfs->f_bfree;
+ local->statvfs.f_bavail += statvfs->f_bavail;
+ local->statvfs.f_files += statvfs->f_files;
+ local->statvfs.f_ffree += statvfs->f_ffree;
+ local->statvfs.f_favail += statvfs->f_favail;
+ local->statvfs.f_fsid = statvfs->f_fsid;
+ local->statvfs.f_flag = statvfs->f_flag;
+ local->statvfs.f_namemax = statvfs->f_namemax;
+ }
+unlock:
+ UNLOCK(&frame->lock);
- this_call_cnt = dht_frame_return (frame);
- if (is_last_call (this_call_cnt))
- DHT_STACK_UNWIND (statfs, frame, local->op_ret, local->op_errno,
- &local->statvfs, xdata);
+ this_call_cnt = dht_frame_return(frame);
+ if (is_last_call(this_call_cnt))
+ DHT_STACK_UNWIND(statfs, frame, local->op_ret, local->op_errno,
+ &local->statvfs, xdata);
- return 0;
+ return 0;
}
-
int
-dht_statfs (call_frame_t *frame, xlator_t *this, loc_t *loc, dict_t *xdata)
+dht_statfs(call_frame_t *frame, xlator_t *this, loc_t *loc, dict_t *xdata)
{
- xlator_t *subvol = NULL;
- dht_local_t *local = NULL;
- dht_conf_t *conf = NULL;
- int op_errno = -1;
- int i = -1;
+ dht_local_t *local = NULL;
+ dht_conf_t *conf = NULL;
+ int op_errno = -1;
+ int i = -1;
+ inode_t *inode = NULL;
+ inode_table_t *itable = NULL;
+ static uuid_t root_gfid = {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1};
+ loc_t newloc = {
+ 0,
+ };
- VALIDATE_OR_GOTO (frame, err);
- VALIDATE_OR_GOTO (this, err);
- VALIDATE_OR_GOTO (loc, err);
- VALIDATE_OR_GOTO (this->private, err);
+ VALIDATE_OR_GOTO(frame, err);
+ VALIDATE_OR_GOTO(this, err);
+ VALIDATE_OR_GOTO(loc, err);
+ VALIDATE_OR_GOTO(this->private, err);
- conf = this->private;
+ conf = this->private;
- local = dht_local_init (frame, NULL, NULL, GF_FOP_STATFS);
- if (!local) {
- op_errno = ENOMEM;
- goto err;
- }
+ local = dht_local_init(frame, NULL, NULL, GF_FOP_STATFS);
+ if (!local) {
+ op_errno = ENOMEM;
+ goto err;
+ }
- if (!loc->inode || IA_ISDIR (loc->inode->ia_type)) {
- local->call_cnt = conf->subvolume_cnt;
-
- for (i = 0; i < conf->subvolume_cnt; i++) {
- STACK_WIND (frame, dht_statfs_cbk,
- conf->subvolumes[i],
- conf->subvolumes[i]->fops->statfs, loc,
- xdata);
- }
- return 0;
+ if (loc->inode && !IA_ISDIR(loc->inode->ia_type)) {
+ itable = loc->inode->table;
+ if (!itable) {
+ op_errno = EINVAL;
+ goto err;
}
- subvol = dht_subvol_get_cached (this, loc->inode);
- if (!subvol) {
- gf_msg_debug (this->name, 0,
- "no cached subvolume for path=%s", loc->path);
- op_errno = EINVAL;
- goto err;
+ loc = &local->loc2;
+
+ inode = inode_find(itable, root_gfid);
+ if (!inode) {
+ op_errno = EINVAL;
+ goto err;
}
- local->call_cnt = 1;
+ dht_build_root_loc(inode, &newloc);
+ loc = &newloc;
+ }
- STACK_WIND (frame, dht_statfs_cbk,
- subvol, subvol->fops->statfs, loc, xdata);
+ local->call_cnt = conf->subvolume_cnt;
- return 0;
+ for (i = 0; i < conf->subvolume_cnt; i++) {
+ STACK_WIND(frame, dht_statfs_cbk, conf->subvolumes[i],
+ conf->subvolumes[i]->fops->statfs, loc, xdata);
+ }
+ return 0;
err:
- op_errno = (op_errno == -1) ? errno : op_errno;
- DHT_STACK_UNWIND (statfs, frame, -1, op_errno, NULL, NULL);
+ op_errno = (op_errno == -1) ? errno : op_errno;
+ DHT_STACK_UNWIND(statfs, frame, -1, op_errno, NULL, NULL);
- return 0;
+ return 0;
}
-
int
-dht_opendir (call_frame_t *frame, xlator_t *this, loc_t *loc, fd_t *fd,
- dict_t *xdata)
+dht_opendir(call_frame_t *frame, xlator_t *this, loc_t *loc, fd_t *fd,
+ dict_t *xdata)
{
- dht_local_t *local = NULL;
- dht_conf_t *conf = NULL;
- int op_errno = -1;
- int i = -1;
-
- VALIDATE_OR_GOTO (frame, err);
- VALIDATE_OR_GOTO (this, err);
- VALIDATE_OR_GOTO (fd, err);
- VALIDATE_OR_GOTO (this->private, err);
-
- conf = this->private;
-
- local = dht_local_init (frame, loc, fd, GF_FOP_OPENDIR);
- if (!local) {
- op_errno = ENOMEM;
-
- goto err;
+ dht_local_t *local = NULL;
+ dht_conf_t *conf = NULL;
+ int op_errno = -1;
+ int i = -1;
+ int ret = 0;
+ gf_boolean_t new_xdata = _gf_false;
+ xlator_t **subvolumes = NULL;
+ int call_count = 0;
+
+ VALIDATE_OR_GOTO(frame, err);
+ VALIDATE_OR_GOTO(this, err);
+ VALIDATE_OR_GOTO(fd, err);
+ VALIDATE_OR_GOTO(this->private, err);
+
+ conf = this->private;
+
+ local = dht_local_init(frame, loc, fd, GF_FOP_OPENDIR);
+ if (!local) {
+ op_errno = ENOMEM;
+ goto err;
+ }
+ local->first_up_subvol = dht_first_up_subvol(this);
+
+ if (!xdata) {
+ xdata = dict_new();
+ if (!xdata) {
+ op_errno = ENOMEM;
+ goto err;
+ }
+ new_xdata = _gf_true;
+ }
+
+ ret = dict_set_uint32(xdata, conf->link_xattr_name, 256);
+ if (ret)
+ gf_msg(this->name, GF_LOG_WARNING, 0, DHT_MSG_DICT_SET_FAILED,
+ "Failed to set dictionary value : key = %s",
+ conf->link_xattr_name);
+
+ /* dht_readdirp will wind to all subvols so open has to be sent to
+ * all subvols whether or not conf->local_subvols is set */
+
+ call_count = local->call_cnt = conf->subvolume_cnt;
+ subvolumes = conf->subvolumes;
+
+ /* In case of parallel-readdir, the readdir-ahead will be loaded
+ * below dht, in this case, if we want to enable or disable SKIP_DIRs
+ * it has to be done in opendir, so that prefetching logic in
+ * readdir-ahead, honors it */
+ for (i = 0; i < call_count; i++) {
+ if (conf->readdir_optimize == _gf_true) {
+ if (subvolumes[i] != local->first_up_subvol) {
+ ret = dict_set_int32(xdata, GF_READDIR_SKIP_DIRS, 1);
+ if (ret)
+ gf_msg(this->name, GF_LOG_ERROR, 0, DHT_MSG_DICT_SET_FAILED,
+ "Failed to set dictionary"
+ " value :key = %s, ret:%d",
+ GF_READDIR_SKIP_DIRS, ret);
+ }
}
- if ((conf->defrag && conf->defrag->cmd == GF_DEFRAG_CMD_START_TIER) ||
- (conf->defrag && conf->defrag->cmd ==
- GF_DEFRAG_CMD_START_DETACH_TIER) ||
- (!(conf->local_subvols_cnt) || !conf->defrag)) {
- local->call_cnt = conf->subvolume_cnt;
+ STACK_WIND_COOKIE(frame, dht_fd_cbk, subvolumes[i], subvolumes[i],
+ subvolumes[i]->fops->opendir, loc, fd, xdata);
+ dict_del(xdata, GF_READDIR_SKIP_DIRS);
+ }
- for (i = 0; i < conf->subvolume_cnt; i++) {
- STACK_WIND (frame, dht_fd_cbk,
- conf->subvolumes[i],
- conf->subvolumes[i]->fops->opendir,
- loc, fd, xdata);
-
- }
- } else {
- local->call_cnt = conf->local_subvols_cnt;
- for (i = 0; i < conf->local_subvols_cnt; i++) {
- STACK_WIND (frame, dht_fd_cbk,
- conf->local_subvols[i],
- conf->local_subvols[i]->fops->opendir,
- loc, fd, xdata);
- }
- }
+ if (new_xdata)
+ dict_unref(xdata);
- return 0;
+ return 0;
err:
- op_errno = (op_errno == -1) ? errno : op_errno;
- DHT_STACK_UNWIND (opendir, frame, -1, op_errno, NULL, NULL);
+ op_errno = (op_errno == -1) ? errno : op_errno;
+ DHT_STACK_UNWIND(opendir, frame, -1, op_errno, NULL, NULL);
- return 0;
+ return 0;
}
+/* dht_readdirp_cbk creates a new dentry and dentry->inode is not assigned.
+ This functions assigns an inode if all of the following conditions are
+ true:
-int
-dht_readdirp_cbk (call_frame_t *frame, void *cookie, xlator_t *this, int op_ret,
- int op_errno, gf_dirent_t *orig_entries, dict_t *xdata)
-{
- dht_local_t *local = NULL;
- gf_dirent_t entries;
- gf_dirent_t *orig_entry = NULL;
- gf_dirent_t *entry = NULL;
- call_frame_t *prev = NULL;
- xlator_t *next_subvol = NULL;
- off_t next_offset = 0;
- int count = 0;
- dht_layout_t *layout = 0;
- dht_conf_t *conf = NULL;
- dht_methods_t *methods = NULL;
- xlator_t *subvol = 0;
- xlator_t *hashed_subvol = 0;
- int ret = 0;
- int readdir_optimize = 0;
- inode_table_t *itable = NULL;
- inode_t *inode = NULL;
-
- INIT_LIST_HEAD (&entries.list);
- prev = cookie;
- local = frame->local;
- itable = local->fd ? local->fd->inode->table : NULL;
+ * DHT has only one child. In this case the entire layout is present on
+ this single child and hence we can set complete layout in inode.
+ * backend has complete layout and there are no anomalies in it and from
+ this information layout can be constructed and set in inode.
+*/
- conf = this->private;
- GF_VALIDATE_OR_GOTO(this->name, conf, unwind);
+static void
+dht_populate_inode_for_dentry(xlator_t *this, xlator_t *subvol,
+ gf_dirent_t *entry, gf_dirent_t *orig_entry)
+{
+ dht_layout_t *layout = NULL;
+ int ret = 0;
+ loc_t loc = {
+ 0,
+ };
+
+ if (gf_uuid_is_null(orig_entry->d_stat.ia_gfid)) {
+ /* this skips the '..' entry for the root of the volume */
+ return;
+ }
- methods = &(conf->methods);
+ gf_uuid_copy(loc.gfid, orig_entry->d_stat.ia_gfid);
+ loc.inode = inode_ref(orig_entry->inode);
- if (op_ret < 0)
- goto done;
+ if (is_revalidate(&loc)) {
+ goto out;
+ }
- if (!local->layout)
- local->layout = dht_layout_get (this, local->fd->inode);
+ layout = dht_layout_new(this, 1);
+ if (!layout)
+ goto out;
- layout = local->layout;
+ ret = dht_layout_merge(this, layout, subvol, 0, 0, orig_entry->dict);
+ if (!ret) {
+ ret = dht_layout_normalize(this, &loc, layout);
+ if (ret == 0) {
+ dht_layout_set(this, orig_entry->inode, layout);
+ entry->inode = inode_ref(orig_entry->inode);
+ layout = NULL;
+ }
+ }
- /* We have seen crashes in while running "rm -rf" on tier volumes
- when the layout was NULL on the hot tier. This will skip the
- entries on the subvol without a layout, hence preventing the crash
- but rmdir might fail with "directory not empty" errors*/
+ if (layout)
+ dht_layout_unref(this, layout);
- if (layout == NULL)
- goto done;
+out:
+ loc_wipe(&loc);
+ return;
+}
- if (conf->readdir_optimize == _gf_true)
- readdir_optimize = 1;
+/* Posix returns op_errno = ENOENT to indicate that there are no more
+ * entries
+ */
+static int
+dht_readdirp_cbk(call_frame_t *frame, void *cookie, xlator_t *this, int op_ret,
+ int op_errno, gf_dirent_t *orig_entries, dict_t *xdata)
+{
+ dht_local_t *local = NULL;
+ gf_dirent_t entries;
+ gf_dirent_t *orig_entry = NULL;
+ gf_dirent_t *entry = NULL;
+ xlator_t *prev = NULL;
+ xlator_t *next_subvol = NULL;
+ off_t next_offset = 0;
+ int count = 0;
+ dht_layout_t *layout = NULL;
+ dht_conf_t *conf = NULL;
+ dht_methods_t *methods = NULL;
+ xlator_t *subvol = 0;
+ xlator_t *hashed_subvol = 0;
+ int ret = 0;
+ int readdir_optimize = 0;
+ inode_table_t *itable = NULL;
+ inode_t *inode = NULL;
+ gf_boolean_t skip_hashed_check = _gf_false;
+
+ INIT_LIST_HEAD(&entries.list);
+
+ prev = cookie;
+ local = frame->local;
+ GF_VALIDATE_OR_GOTO(this->name, local->fd, unwind);
+
+ itable = local->fd->inode->table;
+
+ conf = this->private;
+ GF_VALIDATE_OR_GOTO(this->name, conf, unwind);
+
+ methods = &(conf->methods);
+
+ if (op_ret <= 0) {
+ goto done;
+ }
+
+ /* Why aren't we skipping DHT entirely in case of a single subvol?
+ * Because if this was a larger volume earlier and all but one subvol
+ * was removed, there might be stale linkto files on the subvol.
+ */
+ if (conf->subvolume_cnt == 1) {
+ /* return all directory and file entries except
+ * linkto files for a single child DHT
+ */
+ skip_hashed_check = _gf_true;
+ }
- list_for_each_entry (orig_entry, (&orig_entries->list), list) {
- next_offset = orig_entry->d_off;
+ if (!local->layout)
+ local->layout = dht_layout_get(this, local->fd->inode);
- if (IA_ISINVAL(orig_entry->d_stat.ia_type)) {
- /*stat failed somewhere- ignore this entry*/
- continue;
- }
+ layout = local->layout;
- if (check_is_dir (NULL, (&orig_entry->d_stat), NULL)) {
+ /* This will skip the entries on the subvol without a layout,
+ * hence preventing the crash but rmdir might fail with
+ * "directory not empty" errors*/
- /*Directory entries filtering :
- * a) If rebalance is running, pick from first_up_subvol
- * b) (rebalance not running)hashed subvolume is NULL or
- * down then filter in first_up_subvolume. Other wise the
- * corresponding hashed subvolume will take care of the
- * directory entry.
- */
+ if (layout == NULL)
+ goto done;
- if (readdir_optimize) {
- if (prev->this == local->first_up_subvol)
- goto list;
- else
- continue;
+ if (conf->readdir_optimize == _gf_true)
+ readdir_optimize = 1;
- }
+ gf_msg_debug(this->name, 0, "Processing entries from %s", prev->name);
- hashed_subvol = methods->layout_search (this, layout,
- orig_entry->d_name);
+ list_for_each_entry(orig_entry, (&orig_entries->list), list)
+ {
+ next_offset = orig_entry->d_off;
- if (prev->this == hashed_subvol)
- goto list;
- if ((hashed_subvol
- && dht_subvol_status (conf, hashed_subvol))
- ||(prev->this != local->first_up_subvol))
- continue;
+ gf_msg_debug(this->name, 0, "%s: entry = %s, type = %d", prev->name,
+ orig_entry->d_name, orig_entry->d_type);
- goto list;
- }
+ if (IA_ISINVAL(orig_entry->d_stat.ia_type)) {
+ /*stat failed somewhere- display this entry but the data may
+ * be inaccurate.
+ */
+ gf_msg_debug(this->name, EINVAL, "Invalid stat for %s (gfid %s)",
+ orig_entry->d_name,
+ uuid_utoa(orig_entry->d_stat.ia_gfid));
+ }
- if (check_is_linkfile (NULL, (&orig_entry->d_stat),
- orig_entry->dict,
- conf->link_xattr_name)) {
- continue;
- }
-list:
- entry = gf_dirent_for_name (orig_entry->d_name);
- if (!entry) {
+ if (check_is_linkfile(NULL, (&orig_entry->d_stat), orig_entry->dict,
+ conf->link_xattr_name)) {
+ gf_msg_debug(this->name, 0, "%s: %s is a linkto file", prev->name,
+ orig_entry->d_name);
+ continue;
+ }
- goto unwind;
- }
+ if (skip_hashed_check) {
+ goto list;
+ }
- /* Do this if conf->search_unhashed is set to "auto" */
- if (conf->search_unhashed == GF_DHT_LOOKUP_UNHASHED_AUTO) {
- subvol = methods->layout_search (this, layout,
- orig_entry->d_name);
- if (!subvol || (subvol != prev->this)) {
- /* TODO: Count the number of entries which need
- linkfile to prove its existence in fs */
- layout->search_unhashed++;
- }
- }
+ if (check_is_dir(NULL, (&orig_entry->d_stat), NULL)) {
+ /*Directory entries filtering :
+ * a) If rebalance is running, pick from first_up_subvol
+ * b) (rebalance not running)hashed subvolume is NULL or
+ * down then filter in first_up_subvolume. Other wise the
+ * corresponding hashed subvolume will take care of the
+ * directory entry.
+ */
+ if (readdir_optimize) {
+ if (prev == local->first_up_subvol)
+ goto list;
+ else
+ continue;
+ }
- entry->d_off = orig_entry->d_off;
- entry->d_stat = orig_entry->d_stat;
- entry->d_ino = orig_entry->d_ino;
- entry->d_type = orig_entry->d_type;
- entry->d_len = orig_entry->d_len;
-
- if (orig_entry->dict)
- entry->dict = dict_ref (orig_entry->dict);
-
- /* making sure we set the inode ctx right with layout,
- currently possible only for non-directories, so for
- directories don't set entry inodes */
- if (IA_ISDIR(entry->d_stat.ia_type)) {
- if (orig_entry->inode) {
- dht_inode_ctx_time_update (orig_entry->inode,
- this, &entry->d_stat,
- 1);
- }
- } else {
- if (orig_entry->inode) {
- ret = dht_layout_preset (this, prev->this,
- orig_entry->inode);
- if (ret)
- gf_msg (this->name, GF_LOG_WARNING, 0,
- DHT_MSG_LAYOUT_SET_FAILED,
- "failed to link the layout "
- "in inode");
-
- entry->inode = inode_ref (orig_entry->inode);
- } else if (itable) {
- /*
- * orig_entry->inode might be null if any upper
- * layer xlators below client set to null, to
- * force a lookup on the inode even if the inode
- * is present in the inode table. In that case
- * we just update the ctx to make sure we didn't
- * missed anything.
- */
- inode = inode_find (itable,
- orig_entry->d_stat.ia_gfid);
- if (inode) {
- ret = dht_layout_preset
- (this, prev->this,
- inode);
- if (ret)
- gf_msg (this->name,
- GF_LOG_WARNING, 0,
- DHT_MSG_LAYOUT_SET_FAILED,
- "failed to link the layout"
- " in inode");
- inode_unref (inode);
- inode = NULL;
- }
- }
- }
- list_add_tail (&entry->list, &entries.list);
- count++;
- }
- op_ret = count;
- /* We need to ensure that only the last subvolume's end-of-directory
- * notification is respected so that directory reading does not stop
- * before all subvolumes have been read. That could happen because the
- * posix for each subvolume sends a ENOENT on end-of-directory but in
- * distribute we're not concerned only with a posix's view of the
- * directory but the aggregated namespace' view of the directory.
- */
- if (prev->this != dht_last_up_subvol (this))
- op_errno = 0;
+ hashed_subvol = methods->layout_search(this, layout,
+ orig_entry->d_name);
-done:
- if (count == 0) {
- /* non-zero next_offset means that
- EOF is not yet hit on the current subvol
- */
- if (next_offset == 0) {
- next_subvol = dht_subvol_next (this, prev->this);
- } else {
- next_subvol = prev->this;
- }
+ if (prev == hashed_subvol)
+ goto list;
+ if ((hashed_subvol && dht_subvol_status(conf, hashed_subvol)) ||
+ (prev != local->first_up_subvol))
+ continue;
- if (!next_subvol) {
- goto unwind;
- }
+ goto list;
+ }
- if (conf->readdir_optimize == _gf_true) {
- if (next_subvol != local->first_up_subvol) {
- ret = dict_set_int32 (local->xattr,
- GF_READDIR_SKIP_DIRS, 1);
- if (ret)
- gf_msg (this->name, GF_LOG_ERROR, 0,
- DHT_MSG_DICT_SET_FAILED,
- "Failed to set dictionary value"
- ":key = %s",
- GF_READDIR_SKIP_DIRS );
- } else {
- dict_del (local->xattr,
- GF_READDIR_SKIP_DIRS);
- }
- }
+ list:
+ entry = gf_dirent_for_name(orig_entry->d_name);
+ if (!entry) {
+ goto unwind;
+ }
- STACK_WIND (frame, dht_readdirp_cbk,
- next_subvol, next_subvol->fops->readdirp,
- local->fd, local->size, next_offset,
- local->xattr);
- return 0;
+ /* Do this if conf->search_unhashed is set to "auto" */
+ if (conf->search_unhashed == GF_DHT_LOOKUP_UNHASHED_AUTO) {
+ subvol = methods->layout_search(this, layout, orig_entry->d_name);
+ if (!subvol || (subvol != prev)) {
+ /* TODO: Count the number of entries which need
+ linkfile to prove its existence in fs */
+ layout->search_unhashed++;
+ }
}
-unwind:
- if (op_ret < 0)
- op_ret = 0;
+ entry->d_off = orig_entry->d_off;
+ entry->d_stat = orig_entry->d_stat;
+ entry->d_ino = orig_entry->d_ino;
+ entry->d_type = orig_entry->d_type;
+ entry->d_len = orig_entry->d_len;
- DHT_STACK_UNWIND (readdirp, frame, op_ret, op_errno, &entries, NULL);
+ if (orig_entry->dict)
+ entry->dict = dict_ref(orig_entry->dict);
- gf_dirent_free (&entries);
+ /* making sure we set the inode ctx right with layout,
+ currently possible only for non-directories, so for
+ directories don't set entry inodes */
+ if (IA_ISDIR(entry->d_stat.ia_type)) {
+ entry->d_stat.ia_blocks = DHT_DIR_STAT_BLOCKS;
+ entry->d_stat.ia_size = DHT_DIR_STAT_SIZE;
+ if (orig_entry->inode) {
+ dht_inode_ctx_time_update(orig_entry->inode, this,
+ &entry->d_stat, 1);
- return 0;
-}
+ if (conf->subvolume_cnt == 1) {
+ dht_populate_inode_for_dentry(this, prev, entry,
+ orig_entry);
+ }
+ }
+ } else {
+ if (orig_entry->dict &&
+ dict_get(orig_entry->dict, conf->link_xattr_name)) {
+ /* Strip out the S and T flags set by rebalance*/
+ DHT_STRIP_PHASE1_FLAGS(&entry->d_stat);
+ }
+ if (orig_entry->inode) {
+ ret = dht_layout_preset(this, prev, orig_entry->inode);
+ if (ret)
+ gf_msg(this->name, GF_LOG_WARNING, 0,
+ DHT_MSG_LAYOUT_SET_FAILED,
+ "failed to link the layout "
+ "in inode for %s",
+ orig_entry->d_name);
+
+ entry->inode = inode_ref(orig_entry->inode);
+ } else if (itable) {
+ /*
+ * orig_entry->inode might be null if any upper
+ * layer xlators below client set to null, to
+ * force a lookup on the inode even if the inode
+ * is present in the inode table. In that case
+ * we just update the ctx to make sure we didn't
+ * missed anything.
+ */
+ inode = inode_find(itable, orig_entry->d_stat.ia_gfid);
+ if (inode) {
+ ret = dht_layout_preset(this, prev, inode);
+ if (ret)
+ gf_msg(this->name, GF_LOG_WARNING, 0,
+ DHT_MSG_LAYOUT_SET_FAILED,
+ "failed to link the layout"
+ " in inode for %s",
+ orig_entry->d_name);
+ inode_unref(inode);
+ inode = NULL;
+ }
+ }
+ }
+ gf_msg_debug(this->name, 0, "%s: Adding entry = %s", prev->name,
+ entry->d_name);
-int
-dht_readdir_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
- int op_ret, int op_errno, gf_dirent_t *orig_entries,
- dict_t *xdata)
-{
- dht_local_t *local = NULL;
- gf_dirent_t entries;
- gf_dirent_t *orig_entry = NULL;
- gf_dirent_t *entry = NULL;
- call_frame_t *prev = NULL;
- xlator_t *next_subvol = NULL;
- off_t next_offset = 0;
- int count = 0;
- dht_layout_t *layout = 0;
- xlator_t *subvol = 0;
- dht_conf_t *conf = NULL;
- dht_methods_t *methods = NULL;
-
- INIT_LIST_HEAD (&entries.list);
- prev = cookie;
- local = frame->local;
+ list_add_tail(&entry->list, &entries.list);
+ count++;
+ }
- conf = this->private;
- GF_VALIDATE_OR_GOTO (this->name, conf, done);
+done:
- methods = &(conf->methods);
+ /* We need to ensure that only the last subvolume's end-of-directory
+ * notification is respected so that directory reading does not stop
+ * before all subvolumes have been read. That could happen because the
+ * posix for each subvolume sends a ENOENT on end-of-directory but in
+ * distribute we're not concerned only with a posix's view of the
+ * directory but the aggregated namespace' view of the directory.
+ * Possible values:
+ * op_ret == 0 and op_errno != 0
+ * if op_errno != ENOENT : Error.Unwind.
+ * if op_errno == ENOENT : There are no more entries on this subvol.
+ * Move to the next one.
+ * op_ret > 0 and count == 0 :
+ * The subvol returned entries to dht but all were stripped out.
+ * For example, if they were linkto files or dirs where
+ * hashed_subvol != prev. Try to get some entries by winding
+ * to the next subvol. This can be dangerous if parallel readdir
+ * is enabled as it grows the stack.
+ *
+ * op_ret > 0 and count > 0:
+ * We found some entries. Unwind even if the buffer is not full.
+ *
+ */
+
+ op_ret = count;
+ if (count == 0) {
+ /* non-zero next_offset means that
+ * EOF is not yet hit on the current subvol
+ */
+ if ((next_offset == 0) || (op_errno == ENOENT)) {
+ next_offset = 0;
+ next_subvol = dht_subvol_next(this, prev);
+ } else {
+ next_subvol = prev;
+ }
- if (op_ret < 0)
- goto done;
+ if (!next_subvol) {
+ goto unwind;
+ }
- if (!local->layout)
- local->layout = dht_layout_get (this, local->fd->inode);
+ if (conf->readdir_optimize == _gf_true) {
+ if (next_subvol != local->first_up_subvol) {
+ ret = dict_set_int32(local->xattr, GF_READDIR_SKIP_DIRS, 1);
+ if (ret)
+ gf_msg(this->name, GF_LOG_ERROR, 0, DHT_MSG_DICT_SET_FAILED,
+ "Failed to set dictionary value"
+ ":key = %s",
+ GF_READDIR_SKIP_DIRS);
+ } else {
+ dict_del(local->xattr, GF_READDIR_SKIP_DIRS);
+ }
+ }
- layout = local->layout;
+ STACK_WIND_COOKIE(frame, dht_readdirp_cbk, next_subvol, next_subvol,
+ next_subvol->fops->readdirp, local->fd, local->size,
+ next_offset, local->xattr);
+ return 0;
+ }
- list_for_each_entry (orig_entry, (&orig_entries->list), list) {
- next_offset = orig_entry->d_off;
+unwind:
+ /* We need to ensure that only the last subvolume's end-of-directory
+ * notification is respected so that directory reading does not stop
+ * before all subvolumes have been read. That could happen because the
+ * posix for each subvolume sends a ENOENT on end-of-directory but in
+ * distribute we're not concerned only with a posix's view of the
+ * directory but the aggregated namespace' view of the directory.
+ */
+ if (op_ret < 0)
+ op_ret = 0;
+
+ if (prev != dht_last_up_subvol(this))
+ op_errno = 0;
+
+ DHT_STACK_UNWIND(readdirp, frame, op_ret, op_errno, &entries, NULL);
+
+ gf_dirent_free(&entries);
+ return 0;
+}
- subvol = methods->layout_search (this, layout,
- orig_entry->d_name);
+static int
+dht_readdir_cbk(call_frame_t *frame, void *cookie, xlator_t *this, int op_ret,
+ int op_errno, gf_dirent_t *orig_entries, dict_t *xdata)
+{
+ dht_local_t *local = NULL;
+ gf_dirent_t entries;
+ gf_dirent_t *orig_entry = NULL;
+ gf_dirent_t *entry = NULL;
+ xlator_t *prev = NULL;
+ xlator_t *next_subvol = NULL;
+ off_t next_offset = 0;
+ int count = 0;
+ dht_layout_t *layout = 0;
+ xlator_t *subvol = 0;
+ dht_conf_t *conf = NULL;
+ dht_methods_t *methods = NULL;
+ gf_boolean_t skip_hashed_check = _gf_false;
- if (!subvol || (subvol == prev->this)) {
- entry = gf_dirent_for_name (orig_entry->d_name);
- if (!entry) {
- gf_msg (this->name, GF_LOG_ERROR, ENOMEM,
- DHT_MSG_NO_MEMORY,
- "Memory allocation failed ");
- goto unwind;
- }
+ INIT_LIST_HEAD(&entries.list);
- entry->d_off = orig_entry->d_off;
- entry->d_ino = orig_entry->d_ino;
- entry->d_type = orig_entry->d_type;
- entry->d_len = orig_entry->d_len;
+ prev = cookie;
+ local = frame->local;
- list_add_tail (&entry->list, &entries.list);
- count++;
- }
- }
- op_ret = count;
- /* We need to ensure that only the last subvolume's end-of-directory
- * notification is respected so that directory reading does not stop
- * before all subvolumes have been read. That could happen because the
- * posix for each subvolume sends a ENOENT on end-of-directory but in
- * distribute we're not concerned only with a posix's view of the
- * directory but the aggregated namespace' view of the directory.
- */
- if (prev->this != dht_last_up_subvol (this))
- op_errno = 0;
+ conf = this->private;
+ GF_VALIDATE_OR_GOTO(this->name, conf, done);
-done:
- if (count == 0) {
- /* non-zero next_offset means that
- EOF is not yet hit on the current subvol
- */
- if (next_offset == 0) {
- next_subvol = dht_subvol_next (this, prev->this);
- } else {
- next_subvol = prev->this;
- }
+ methods = &(conf->methods);
- if (!next_subvol) {
- goto unwind;
- }
+ if (op_ret <= 0)
+ goto done;
- STACK_WIND (frame, dht_readdir_cbk,
- next_subvol, next_subvol->fops->readdir,
- local->fd, local->size, next_offset, NULL);
- return 0;
- }
+ if (!local->layout)
+ local->layout = dht_layout_get(this, local->fd->inode);
-unwind:
- if (op_ret < 0)
- op_ret = 0;
+ layout = local->layout;
- DHT_STACK_UNWIND (readdir, frame, op_ret, op_errno, &entries, NULL);
+ gf_msg_debug(this->name, 0, "Processing entries from %s", prev->name);
- gf_dirent_free (&entries);
+ if (conf->subvolume_cnt == 1) {
+ /*return everything*/
+ skip_hashed_check = _gf_true;
+ count = op_ret;
+ goto done;
+ }
- return 0;
-}
+ list_for_each_entry(orig_entry, (&orig_entries->list), list)
+ {
+ next_offset = orig_entry->d_off;
+ gf_msg_debug(this->name, 0, "%s: entry = %s, type = %d", prev->name,
+ orig_entry->d_name, orig_entry->d_type);
-int
-dht_do_readdir (call_frame_t *frame, xlator_t *this, fd_t *fd, size_t size,
- off_t yoff, int whichop, dict_t *dict)
-{
- dht_local_t *local = NULL;
- int op_errno = -1;
- xlator_t *xvol = NULL;
- int ret = 0;
- dht_conf_t *conf = NULL;
+ subvol = methods->layout_search(this, layout, orig_entry->d_name);
- VALIDATE_OR_GOTO (frame, err);
- VALIDATE_OR_GOTO (this, err);
- VALIDATE_OR_GOTO (fd, err);
- VALIDATE_OR_GOTO (this->private, err);
+ if (!subvol || (subvol == prev)) {
+ entry = gf_dirent_for_name(orig_entry->d_name);
+ if (!entry) {
+ gf_msg(this->name, GF_LOG_ERROR, ENOMEM, DHT_MSG_NO_MEMORY,
+ "Memory allocation failed ");
+ goto unwind;
+ }
- conf = this->private;
+ entry->d_off = orig_entry->d_off;
+ entry->d_ino = orig_entry->d_ino;
+ entry->d_type = orig_entry->d_type;
+ entry->d_len = orig_entry->d_len;
- local = dht_local_init (frame, NULL, NULL, whichop);
- if (!local) {
- op_errno = ENOMEM;
- goto err;
- }
+ gf_msg_debug(this->name, 0, "%s: Adding = entry %s", prev->name,
+ entry->d_name);
- local->fd = fd_ref (fd);
- local->size = size;
- local->xattr_req = (dict)? dict_ref (dict) : NULL;
- local->first_up_subvol = dht_first_up_subvol (this);
+ list_add_tail(&entry->list, &entries.list);
+ count++;
+ }
+ }
+done:
+ op_ret = count;
+ /* We need to ensure that only the last subvolume's end-of-directory
+ * notification is respected so that directory reading does not stop
+ * before all subvolumes have been read. That could happen because the
+ * posix for each subvolume sends a ENOENT on end-of-directory but in
+ * distribute we're not concerned only with a posix's view of the
+ * directory but the aggregated namespace' view of the directory.
+ */
+ if (count == 0) {
+ if ((next_offset == 0) || (op_errno == ENOENT)) {
+ next_offset = 0;
+ next_subvol = dht_subvol_next(this, prev);
+ } else {
+ next_subvol = prev;
+ }
- dht_deitransform (this, yoff, &xvol);
+ if (!next_subvol) {
+ goto unwind;
+ }
- /* TODO: do proper readdir */
- if (whichop == GF_FOP_READDIRP) {
- if (dict)
- local->xattr = dict_ref (dict);
- else
- local->xattr = dict_new ();
+ STACK_WIND_COOKIE(frame, dht_readdir_cbk, next_subvol, next_subvol,
+ next_subvol->fops->readdir, local->fd, local->size,
+ next_offset, NULL);
+ return 0;
+ }
- if (local->xattr) {
- ret = dict_set_uint32 (local->xattr,
- conf->link_xattr_name, 256);
- if (ret)
- gf_msg (this->name, GF_LOG_WARNING, 0,
- DHT_MSG_DICT_SET_FAILED,
- "Failed to set dictionary value"
- " : key = %s",
- conf->link_xattr_name);
+unwind:
+ /* We need to ensure that only the last subvolume's end-of-directory
+ * notification is respected so that directory reading does not stop
+ * before all subvolumes have been read. That could happen because the
+ * posix for each subvolume sends a ENOENT on end-of-directory but in
+ * distribute we're not concerned only with a posix's view of the
+ * directory but the aggregated namespace' view of the directory.
+ */
+
+ if (prev != dht_last_up_subvol(this))
+ op_errno = 0;
+
+ if (!skip_hashed_check) {
+ DHT_STACK_UNWIND(readdir, frame, op_ret, op_errno, &entries, NULL);
+ gf_dirent_free(&entries);
+
+ } else {
+ DHT_STACK_UNWIND(readdir, frame, op_ret, op_errno, orig_entries, NULL);
+ }
+ return 0;
+}
- if (conf->readdir_optimize == _gf_true) {
- if (xvol != local->first_up_subvol) {
- ret = dict_set_int32 (local->xattr,
- GF_READDIR_SKIP_DIRS, 1);
- if (ret)
- gf_msg (this->name,
- GF_LOG_ERROR, 0,
- DHT_MSG_DICT_SET_FAILED,
- "Failed to set "
- "dictionary value: "
- "key = %s",
- GF_READDIR_SKIP_DIRS);
- } else {
- dict_del (local->xattr,
- GF_READDIR_SKIP_DIRS);
- }
- }
+static int
+dht_do_readdir(call_frame_t *frame, xlator_t *this, fd_t *fd, size_t size,
+ off_t yoff, int whichop, dict_t *dict)
+{
+ dht_local_t *local = NULL;
+ int op_errno = -1;
+ xlator_t *xvol = NULL;
+ int ret = 0;
+ dht_conf_t *conf = NULL;
+
+ VALIDATE_OR_GOTO(frame, err);
+ VALIDATE_OR_GOTO(this, err);
+ VALIDATE_OR_GOTO(fd, err);
+ VALIDATE_OR_GOTO(this->private, err);
+
+ conf = this->private;
+
+ local = dht_local_init(frame, NULL, NULL, whichop);
+ if (!local) {
+ op_errno = ENOMEM;
+ goto err;
+ }
+
+ local->fd = fd_ref(fd);
+ local->size = size;
+ local->xattr_req = (dict) ? dict_ref(dict) : NULL;
+ local->first_up_subvol = dht_first_up_subvol(this);
+ local->op_ret = -1;
+
+ dht_deitransform(this, yoff, &xvol);
+
+ /* TODO: do proper readdir */
+ if (whichop == GF_FOP_READDIRP) {
+ if (dict)
+ local->xattr = dict_ref(dict);
+ else
+ local->xattr = dict_new();
+
+ if (local->xattr) {
+ ret = dict_set_uint32(local->xattr, conf->link_xattr_name, 256);
+ if (ret)
+ gf_msg(this->name, GF_LOG_WARNING, 0, DHT_MSG_DICT_SET_FAILED,
+ "Failed to set dictionary value"
+ " : key = %s",
+ conf->link_xattr_name);
+
+ if (conf->readdir_optimize == _gf_true) {
+ if (xvol != local->first_up_subvol) {
+ ret = dict_set_int32(local->xattr, GF_READDIR_SKIP_DIRS, 1);
+ if (ret)
+ gf_msg(this->name, GF_LOG_ERROR, 0,
+ DHT_MSG_DICT_SET_FAILED,
+ "Failed to set "
+ "dictionary value: "
+ "key = %s",
+ GF_READDIR_SKIP_DIRS);
+ } else {
+ dict_del(local->xattr, GF_READDIR_SKIP_DIRS);
}
+ }
- STACK_WIND (frame, dht_readdirp_cbk, xvol, xvol->fops->readdirp,
- fd, size, yoff, local->xattr);
- } else {
- STACK_WIND (frame, dht_readdir_cbk, xvol, xvol->fops->readdir,
- fd, size, yoff, local->xattr);
+ if (conf->subvolume_cnt == 1) {
+ ret = dict_set_uint32(local->xattr, conf->xattr_name, 4 * 4);
+ if (ret) {
+ gf_msg(this->name, GF_LOG_WARNING, ENOMEM,
+ DHT_MSG_DICT_SET_FAILED,
+ "Failed to set dictionary "
+ "value:key = %s ",
+ conf->xattr_name);
+ }
+ }
}
- return 0;
+ STACK_WIND_COOKIE(frame, dht_readdirp_cbk, xvol, xvol,
+ xvol->fops->readdirp, fd, size, yoff, local->xattr);
+ } else {
+ STACK_WIND_COOKIE(frame, dht_readdir_cbk, xvol, xvol,
+ xvol->fops->readdir, fd, size, yoff, local->xattr);
+ }
+
+ return 0;
err:
- op_errno = (op_errno == -1) ? errno : op_errno;
- DHT_STACK_UNWIND (readdir, frame, -1, op_errno, NULL, NULL);
+ op_errno = (op_errno == -1) ? errno : op_errno;
+ DHT_STACK_UNWIND(readdir, frame, -1, op_errno, NULL, NULL);
- return 0;
+ return 0;
}
-
int
-dht_readdir (call_frame_t *frame, xlator_t *this, fd_t *fd, size_t size,
- off_t yoff, dict_t *xdata)
+dht_readdir(call_frame_t *frame, xlator_t *this, fd_t *fd, size_t size,
+ off_t yoff, dict_t *xdata)
{
- int op = GF_FOP_READDIR;
- dht_conf_t *conf = NULL;
- int i = 0;
+ int op = GF_FOP_READDIR;
+ dht_conf_t *conf = NULL;
+ int i = 0;
- conf = this->private;
- if (!conf)
- goto out;
+ conf = this->private;
+ if (!conf)
+ goto out;
- for (i = 0; i < conf->subvolume_cnt; i++) {
- if (!conf->subvolume_status[i]) {
- op = GF_FOP_READDIRP;
- break;
- }
+ for (i = 0; i < conf->subvolume_cnt; i++) {
+ if (!conf->subvolume_status[i]) {
+ op = GF_FOP_READDIRP;
+ break;
}
+ }
- if (conf->use_readdirp)
- op = GF_FOP_READDIRP;
+ if (conf->use_readdirp)
+ op = GF_FOP_READDIRP;
out:
- dht_do_readdir (frame, this, fd, size, yoff, op, 0);
- return 0;
+ dht_do_readdir(frame, this, fd, size, yoff, op, 0);
+ return 0;
}
int
-dht_readdirp (call_frame_t *frame, xlator_t *this, fd_t *fd, size_t size,
- off_t yoff, dict_t *dict)
+dht_readdirp(call_frame_t *frame, xlator_t *this, fd_t *fd, size_t size,
+ off_t yoff, dict_t *dict)
{
- dht_do_readdir (frame, this, fd, size, yoff, GF_FOP_READDIRP, dict);
- return 0;
+ dht_do_readdir(frame, this, fd, size, yoff, GF_FOP_READDIRP, dict);
+ return 0;
}
-
-
-int
-dht_fsyncdir_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
- int op_ret, int op_errno, dict_t *xdata)
+static int
+dht_fsyncdir_cbk(call_frame_t *frame, void *cookie, xlator_t *this, int op_ret,
+ int op_errno, dict_t *xdata)
{
- dht_local_t *local = NULL;
- int this_call_cnt = 0;
-
+ dht_local_t *local = NULL;
+ int this_call_cnt = 0;
- local = frame->local;
+ local = frame->local;
- LOCK (&frame->lock);
- {
- if (op_ret == -1)
- local->op_errno = op_errno;
-
- if (op_ret == 0)
- local->op_ret = 0;
- }
- UNLOCK (&frame->lock);
-
- this_call_cnt = dht_frame_return (frame);
- if (is_last_call (this_call_cnt))
- DHT_STACK_UNWIND (fsyncdir, frame, local->op_ret,
- local->op_errno, xdata);
-
- return 0;
+ LOCK(&frame->lock);
+ {
+ if (op_ret == -1)
+ local->op_errno = op_errno;
+ else if (op_ret == 0)
+ local->op_ret = 0;
+ }
+ UNLOCK(&frame->lock);
+ this_call_cnt = dht_frame_return(frame);
+ if (is_last_call(this_call_cnt))
+ DHT_STACK_UNWIND(fsyncdir, frame, local->op_ret, local->op_errno,
+ xdata);
+
+ return 0;
}
-
int
-dht_fsyncdir (call_frame_t *frame, xlator_t *this, fd_t *fd,
- int datasync, dict_t *xdata)
+dht_fsyncdir(call_frame_t *frame, xlator_t *this, fd_t *fd, int datasync,
+ dict_t *xdata)
{
- dht_local_t *local = NULL;
- dht_conf_t *conf = NULL;
- int op_errno = -1;
- int i = -1;
+ dht_local_t *local = NULL;
+ dht_conf_t *conf = NULL;
+ int op_errno = -1;
+ int i = -1;
- VALIDATE_OR_GOTO (frame, err);
- VALIDATE_OR_GOTO (this, err);
- VALIDATE_OR_GOTO (fd, err);
- VALIDATE_OR_GOTO (this->private, err);
+ VALIDATE_OR_GOTO(frame, err);
+ VALIDATE_OR_GOTO(this, err);
+ VALIDATE_OR_GOTO(fd, err);
+ VALIDATE_OR_GOTO(this->private, err);
- conf = this->private;
+ conf = this->private;
- local = dht_local_init (frame, NULL, NULL, GF_FOP_FSYNCDIR);
- if (!local) {
- op_errno = ENOMEM;
- goto err;
- }
+ local = dht_local_init(frame, NULL, NULL, GF_FOP_FSYNCDIR);
+ if (!local) {
+ op_errno = ENOMEM;
+ goto err;
+ }
- local->fd = fd_ref (fd);
- local->call_cnt = conf->subvolume_cnt;
+ local->fd = fd_ref(fd);
+ local->call_cnt = conf->subvolume_cnt;
- for (i = 0; i < conf->subvolume_cnt; i++) {
- STACK_WIND (frame, dht_fsyncdir_cbk,
- conf->subvolumes[i],
- conf->subvolumes[i]->fops->fsyncdir,
- fd, datasync, xdata);
- }
+ for (i = 0; i < conf->subvolume_cnt; i++) {
+ STACK_WIND(frame, dht_fsyncdir_cbk, conf->subvolumes[i],
+ conf->subvolumes[i]->fops->fsyncdir, fd, datasync, xdata);
+ }
- return 0;
+ return 0;
err:
- op_errno = (op_errno == -1) ? errno : op_errno;
- DHT_STACK_UNWIND (fsyncdir, frame, -1, op_errno, NULL);
+ op_errno = (op_errno == -1) ? errno : op_errno;
+ DHT_STACK_UNWIND(fsyncdir, frame, -1, op_errno, NULL);
- return 0;
+ return 0;
}
-
int
-dht_newfile_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
- int op_ret, int op_errno,
- inode_t *inode, struct iatt *stbuf, struct iatt *preparent,
- struct iatt *postparent, dict_t *xdata)
+dht_newfile_cbk(call_frame_t *frame, void *cookie, xlator_t *this, int op_ret,
+ int op_errno, inode_t *inode, struct iatt *stbuf,
+ struct iatt *preparent, struct iatt *postparent, dict_t *xdata)
{
- xlator_t *prev = NULL;
- int ret = -1;
- dht_local_t *local = NULL;
-
-
- if (op_ret == -1)
- goto out;
-
- local = frame->local;
- if (!local) {
- op_ret = -1;
- op_errno = EINVAL;
- goto out;
- }
+ xlator_t *prev = NULL;
+ int ret = -1;
+ dht_local_t *local = NULL;
- prev = cookie;
-
- if (local->loc.parent) {
-
- dht_inode_ctx_time_update (local->loc.parent, this,
- preparent, 0);
- dht_inode_ctx_time_update (local->loc.parent, this,
- postparent, 1);
- }
+ if (op_ret == -1)
+ goto out;
- ret = dht_layout_preset (this, prev, inode);
- if (ret < 0) {
- gf_msg_debug (this->name, EINVAL,
- "could not set pre-set layout for subvolume %s",
- prev? prev->name: NULL);
- op_ret = -1;
- op_errno = EINVAL;
- goto out;
- }
- if (local->linked == _gf_true)
- dht_linkfile_attr_heal (frame, this);
+ local = frame->local;
+ if (!local) {
+ op_ret = -1;
+ op_errno = EINVAL;
+ goto out;
+ }
+
+ prev = cookie;
+
+ if (local->loc.parent) {
+ dht_inode_ctx_time_update(local->loc.parent, this, preparent, 0);
+ dht_inode_ctx_time_update(local->loc.parent, this, postparent, 1);
+ }
+
+ ret = dht_layout_preset(this, prev, inode);
+ if (ret < 0) {
+ gf_msg_debug(this->name, EINVAL,
+ "could not set pre-set layout for subvolume %s",
+ prev ? prev->name : NULL);
+ op_ret = -1;
+ op_errno = EINVAL;
+ goto out;
+ }
+ if (local->linked == _gf_true)
+ dht_linkfile_attr_heal(frame, this);
out:
- /*
- * FIXME: ia_size and st_blocks of preparent and postparent do not have
- * correct values. since, preparent and postparent buffers correspond
- * to a directory these two members should have values equal to sum of
- * corresponding values from each of the subvolume.
- * See dht_iatt_merge for reference.
- */
- DHT_STRIP_PHASE1_FLAGS (stbuf);
-
- if (local && local->lock.locks) {
- /* store op_errno for failure case*/
- local->op_errno = op_errno;
- local->refresh_layout_unlock (frame, this, op_ret);
+ /*
+ * FIXME: ia_size and st_blocks of preparent and postparent do not have
+ * correct values. since, preparent and postparent buffers correspond
+ * to a directory these two members should have values equal to sum of
+ * corresponding values from each of the subvolume.
+ * See dht_iatt_merge for reference.
+ */
+ DHT_STRIP_PHASE1_FLAGS(stbuf);
+ dht_set_fixed_dir_stat(postparent);
+ dht_set_fixed_dir_stat(preparent);
+
+ if (local && local->lock[0].layout.parent_layout.locks) {
+ /* store op_errno for failure case*/
+ local->op_errno = op_errno;
+ local->refresh_layout_unlock(frame, this, op_ret, 1);
- if (op_ret == 0) {
- DHT_STACK_UNWIND (mknod, frame, op_ret, op_errno,
- inode, stbuf, preparent, postparent,
- xdata);
- }
- } else {
- DHT_STACK_UNWIND (mknod, frame, op_ret, op_errno, inode,
- stbuf, preparent, postparent, xdata);
+ if (op_ret == 0) {
+ DHT_STACK_UNWIND(mknod, frame, op_ret, op_errno, inode, stbuf,
+ preparent, postparent, xdata);
}
+ } else {
+ DHT_STACK_UNWIND(mknod, frame, op_ret, op_errno, inode, stbuf,
+ preparent, postparent, xdata);
+ }
- return 0;
+ return 0;
}
-int
-dht_mknod_linkfile_create_cbk (call_frame_t *frame, void *cookie,
- xlator_t *this,
- int32_t op_ret, int32_t op_errno,
- inode_t *inode, struct iatt *stbuf,
- struct iatt *preparent, struct iatt *postparent,
- dict_t *xdata)
+static int
+dht_mknod_linkfile_create_cbk(call_frame_t *frame, void *cookie, xlator_t *this,
+ int32_t op_ret, int32_t op_errno, inode_t *inode,
+ struct iatt *stbuf, struct iatt *preparent,
+ struct iatt *postparent, dict_t *xdata)
{
- dht_local_t *local = NULL;
- xlator_t *cached_subvol = NULL;
- dht_conf_t *conf = NULL;
+ dht_local_t *local = NULL;
+ xlator_t *cached_subvol = NULL;
+ dht_conf_t *conf = NULL;
- local = frame->local;
+ local = frame->local;
- if (!local || !local->cached_subvol) {
- op_errno = EINVAL;
- goto err;
- }
+ if (!local || !local->cached_subvol) {
+ op_errno = EINVAL;
+ goto err;
+ }
- if (op_ret == -1) {
- local->op_errno = op_errno;
- goto err;
- }
+ if (op_ret == -1) {
+ local->op_errno = op_errno;
+ goto err;
+ }
- conf = this->private;
- if (!conf) {
- local->op_errno = EINVAL;
- op_errno = EINVAL;
- goto err;
- }
+ conf = this->private;
+ if (!conf) {
+ local->op_errno = EINVAL;
+ op_errno = EINVAL;
+ goto err;
+ }
- cached_subvol = local->cached_subvol;
+ cached_subvol = local->cached_subvol;
- if (local->params) {
- dict_del (local->params, conf->link_xattr_name);
- dict_del (local->params, GLUSTERFS_INTERNAL_FOP_KEY);
- }
+ if (local->params) {
+ dict_del(local->params, conf->link_xattr_name);
+ dict_del(local->params, GLUSTERFS_INTERNAL_FOP_KEY);
+ }
- STACK_WIND_COOKIE (frame, dht_newfile_cbk, (void *)cached_subvol,
- cached_subvol, cached_subvol->fops->mknod,
- &local->loc, local->mode, local->rdev, local->umask,
- local->params);
+ STACK_WIND_COOKIE(frame, dht_newfile_cbk, (void *)cached_subvol,
+ cached_subvol, cached_subvol->fops->mknod, &local->loc,
+ local->mode, local->rdev, local->umask, local->params);
- return 0;
+ return 0;
err:
- if (local && local->lock.locks) {
- local->refresh_layout_unlock (frame, this, -1);
- } else {
- DHT_STACK_UNWIND (mknod, frame, -1,
- op_errno, NULL, NULL, NULL,
- NULL, NULL);
- }
- return 0;
+ if (local && local->lock[0].layout.parent_layout.locks) {
+ local->refresh_layout_unlock(frame, this, -1, 1);
+ } else {
+ DHT_STACK_UNWIND(mknod, frame, -1, op_errno, NULL, NULL, NULL, NULL,
+ NULL);
+ }
+ return 0;
}
-int
-dht_mknod_wind_to_avail_subvol (call_frame_t *frame, xlator_t *this,
- xlator_t *subvol, loc_t *loc, dev_t rdev,
- mode_t mode, mode_t umask, dict_t *params)
+static int
+dht_mknod_wind_to_avail_subvol(call_frame_t *frame, xlator_t *this,
+ xlator_t *subvol, loc_t *loc, dev_t rdev,
+ mode_t mode, mode_t umask, dict_t *params)
{
- dht_local_t *local = NULL;
- xlator_t *avail_subvol = NULL;
-
- local = frame->local;
+ dht_local_t *local = NULL;
+ xlator_t *avail_subvol = NULL;
- if (!dht_is_subvol_filled (this, subvol)) {
- gf_msg_debug (this->name, 0,
- "creating %s on %s", loc->path,
- subvol->name);
+ local = frame->local;
- STACK_WIND_COOKIE (frame, dht_newfile_cbk, (void *)subvol,
- subvol, subvol->fops->mknod, loc, mode,
- rdev, umask, params);
- } else {
- avail_subvol = dht_free_disk_available_subvol (this, subvol, local);
+ if (!dht_is_subvol_filled(this, subvol)) {
+ gf_msg_debug(this->name, 0, "creating %s on %s", loc->path,
+ subvol->name);
- if (avail_subvol != subvol) {
- local->params = dict_ref (params);
- local->rdev = rdev;
- local->mode = mode;
- local->umask = umask;
- local->cached_subvol = avail_subvol;
- local->hashed_subvol = subvol;
+ STACK_WIND_COOKIE(frame, dht_newfile_cbk, (void *)subvol, subvol,
+ subvol->fops->mknod, loc, mode, rdev, umask, params);
+ } else {
+ avail_subvol = dht_free_disk_available_subvol(this, subvol, local);
- gf_msg_debug (this->name, 0,
- "creating %s on %s (link at %s)", loc->path,
- avail_subvol->name, subvol->name);
+ if (avail_subvol != subvol) {
+ local->params = dict_ref(params);
+ local->rdev = rdev;
+ local->mode = mode;
+ local->umask = umask;
+ local->cached_subvol = avail_subvol;
+ local->hashed_subvol = subvol;
- dht_linkfile_create (frame,
- dht_mknod_linkfile_create_cbk,
- this, avail_subvol, subvol, loc);
+ gf_msg_debug(this->name, 0, "creating %s on %s (link at %s)",
+ loc->path, avail_subvol->name, subvol->name);
- goto out;
- }
+ dht_linkfile_create(frame, dht_mknod_linkfile_create_cbk, this,
+ avail_subvol, subvol, loc);
- gf_msg_debug (this->name, 0,
- "creating %s on %s", loc->path, subvol->name);
+ goto out;
+ }
- STACK_WIND_COOKIE (frame, dht_newfile_cbk,
- (void *)subvol, subvol,
- subvol->fops->mknod, loc, mode,
- rdev, umask, params);
+ gf_msg_debug(this->name, 0, "creating %s on %s", loc->path,
+ subvol->name);
- }
+ STACK_WIND_COOKIE(frame, dht_newfile_cbk, (void *)subvol, subvol,
+ subvol->fops->mknod, loc, mode, rdev, umask, params);
+ }
out:
- return 0;
+ return 0;
}
-
-int32_t
-dht_mknod_do (call_frame_t *frame)
+static int32_t
+dht_mknod_do(call_frame_t *frame)
{
- dht_local_t *local = NULL;
- dht_layout_t *refreshed = NULL;
- xlator_t *subvol = NULL;
- xlator_t *this = NULL;
- dht_conf_t *conf = NULL;
- dht_methods_t *methods = NULL;
+ dht_local_t *local = NULL;
+ dht_layout_t *refreshed = NULL;
+ xlator_t *subvol = NULL;
+ xlator_t *this = NULL;
+ dht_conf_t *conf = NULL;
+ dht_methods_t *methods = NULL;
- local = frame->local;
+ local = frame->local;
- this = THIS;
+ this = THIS;
- conf = this->private;
+ conf = this->private;
- GF_VALIDATE_OR_GOTO (this->name, conf, err);
+ GF_VALIDATE_OR_GOTO(this->name, conf, err);
- methods = &(conf->methods);
+ methods = &(conf->methods);
- /* We don't need parent_loc anymore */
- loc_wipe (&local->loc);
+ /* We don't need parent_loc anymore */
+ loc_wipe(&local->loc);
- loc_copy (&local->loc, &local->loc2);
+ loc_copy(&local->loc, &local->loc2);
- loc_wipe (&local->loc2);
+ loc_wipe(&local->loc2);
- refreshed = local->selfheal.refreshed_layout;
+ refreshed = local->selfheal.refreshed_layout;
- subvol = methods->layout_search (this, refreshed, local->loc.name);
+ subvol = methods->layout_search(this, refreshed, local->loc.name);
- if (!subvol) {
- gf_msg (this->name, GF_LOG_ERROR, 0,
- DHT_MSG_HASHED_SUBVOL_GET_FAILED, "no subvolume in "
- "layout for path=%s", local->loc.path);
- local->op_errno = ENOENT;
- goto err;
- }
+ if (!subvol) {
+ gf_msg(this->name, GF_LOG_ERROR, 0, DHT_MSG_HASHED_SUBVOL_GET_FAILED,
+ "no subvolume in "
+ "layout for path=%s",
+ local->loc.path);
+ local->op_errno = ENOENT;
+ goto err;
+ }
- dht_mknod_wind_to_avail_subvol (frame, this, subvol, &local->loc,
- local->rdev, local->mode,
- local->umask, local->params);
- return 0;
+ dht_mknod_wind_to_avail_subvol(frame, this, subvol, &local->loc,
+ local->rdev, local->mode, local->umask,
+ local->params);
+ return 0;
err:
- local->refresh_layout_unlock (frame, this, -1);
+ local->refresh_layout_unlock(frame, this, -1, 1);
- return 0;
+ return 0;
}
+static int32_t
+dht_mknod_unlock_cbk(call_frame_t *frame, void *cookie, xlator_t *this,
+ int32_t op_ret, int32_t op_errno, dict_t *xdata)
+{
+ DHT_STACK_DESTROY(frame);
+ return 0;
+}
-int32_t
-dht_mknod_unlock_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
- int32_t op_ret, int32_t op_errno, dict_t *xdata)
+static int32_t
+dht_mknod_finish(call_frame_t *frame, xlator_t *this, int op_ret,
+ int invoke_cbk)
{
- DHT_STACK_DESTROY (frame);
+ dht_local_t *local = NULL, *lock_local = NULL;
+ call_frame_t *lock_frame = NULL;
+ int lock_count = 0;
+
+ local = frame->local;
+ lock_count = dht_lock_count(local->lock[0].layout.parent_layout.locks,
+ local->lock[0].layout.parent_layout.lk_count);
+ if (lock_count == 0)
+ goto done;
+
+ lock_frame = copy_frame(frame);
+ if (lock_frame == NULL) {
+ goto done;
+ }
+
+ lock_local = dht_local_init(lock_frame, &local->loc, NULL,
+ lock_frame->root->op);
+ if (lock_local == NULL) {
+ goto done;
+ }
+
+ lock_local->lock[0]
+ .layout.parent_layout.locks = local->lock[0].layout.parent_layout.locks;
+ lock_local->lock[0].layout.parent_layout.lk_count =
+ local->lock[0].layout.parent_layout.lk_count;
+
+ local->lock[0].layout.parent_layout.locks = NULL;
+ local->lock[0].layout.parent_layout.lk_count = 0;
+
+ dht_unlock_inodelk(lock_frame,
+ lock_local->lock[0].layout.parent_layout.locks,
+ lock_local->lock[0].layout.parent_layout.lk_count,
+ dht_mknod_unlock_cbk);
+ lock_frame = NULL;
+
+done:
+ if (lock_frame != NULL) {
+ DHT_STACK_DESTROY(lock_frame);
+ }
+
+ if (op_ret == 0)
return 0;
+
+ DHT_STACK_UNWIND(mknod, frame, op_ret, local->op_errno, NULL, NULL, NULL,
+ NULL, NULL);
+ return 0;
}
-int32_t
-dht_mknod_finish (call_frame_t *frame, xlator_t *this, int op_ret)
+static int32_t
+dht_mknod_lock_cbk(call_frame_t *frame, void *cookie, xlator_t *this,
+ int32_t op_ret, int32_t op_errno, dict_t *xdata)
{
- dht_local_t *local = NULL, *lock_local = NULL;
- call_frame_t *lock_frame = NULL;
- int lock_count = 0;
+ dht_local_t *local = NULL;
- local = frame->local;
- lock_count = dht_lock_count (local->lock.locks, local->lock.lk_count);
- if (lock_count == 0)
- goto done;
+ local = frame->local;
- lock_frame = copy_frame (frame);
- if (lock_frame == NULL) {
- goto done;
- }
+ if (!local) {
+ goto err;
+ }
- lock_local = dht_local_init (lock_frame, &local->loc, NULL,
- lock_frame->root->op);
- if (lock_local == NULL) {
- goto done;
- }
+ if (op_ret < 0) {
+ gf_msg("DHT", GF_LOG_ERROR, 0, DHT_MSG_INODE_LK_ERROR,
+ "mknod lock failed for file: %s", local->loc2.name);
- lock_local->lock.locks = local->lock.locks;
- lock_local->lock.lk_count = local->lock.lk_count;
+ local->op_errno = op_errno;
- local->lock.locks = NULL;
- local->lock.lk_count = 0;
+ goto err;
+ }
- dht_unlock_inodelk (lock_frame, lock_local->lock.locks,
- lock_local->lock.lk_count,
- dht_mknod_unlock_cbk);
- lock_frame = NULL;
+ local->refresh_layout_unlock = dht_mknod_finish;
-done:
- if (lock_frame != NULL) {
- DHT_STACK_DESTROY (lock_frame);
- }
+ local->refresh_layout_done = dht_mknod_do;
- if (op_ret == 0)
- return 0;
+ dht_refresh_layout(frame);
- DHT_STACK_UNWIND (mknod, frame, op_ret, local->op_errno, NULL, NULL,
- NULL, NULL, NULL);
- return 0;
+ return 0;
+err:
+ if (local)
+ dht_mknod_finish(frame, this, -1, 0);
+ else
+ DHT_STACK_UNWIND(mknod, frame, -1, EINVAL, NULL, NULL, NULL, NULL,
+ NULL);
+ return 0;
}
-int32_t
-dht_mknod_lock_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
- int32_t op_ret, int32_t op_errno, dict_t *xdata)
+static int32_t
+dht_mknod_lock(call_frame_t *frame, xlator_t *subvol)
{
- dht_local_t *local = NULL;
+ dht_local_t *local = NULL;
+ int count = 1, ret = -1;
+ dht_lock_t **lk_array = NULL;
- local = frame->local;
+ GF_VALIDATE_OR_GOTO("dht", frame, err);
+ GF_VALIDATE_OR_GOTO(frame->this->name, frame->local, err);
- if (!local) {
- goto err;
- }
+ local = frame->local;
- if (op_ret < 0) {
- gf_msg ("DHT", GF_LOG_ERROR, 0, DHT_MSG_INODE_LK_ERROR,
- "mknod lock failed for file: %s", local->loc2.name);
+ lk_array = GF_CALLOC(count, sizeof(*lk_array), gf_common_mt_pointer);
- local->op_errno = op_errno;
+ if (lk_array == NULL)
+ goto err;
- goto err;
- }
+ lk_array[0] = dht_lock_new(frame->this, subvol, &local->loc, F_RDLCK,
+ DHT_LAYOUT_HEAL_DOMAIN, NULL,
+ IGNORE_ENOENT_ESTALE);
- local->refresh_layout_unlock = dht_mknod_finish;
+ if (lk_array[0] == NULL)
+ goto err;
- local->refresh_layout_done = dht_mknod_do;
+ local->lock[0].layout.parent_layout.locks = lk_array;
+ local->lock[0].layout.parent_layout.lk_count = count;
- dht_refresh_layout (frame);
+ ret = dht_blocking_inodelk(frame, lk_array, count, dht_mknod_lock_cbk);
- return 0;
+ if (ret < 0) {
+ local->lock[0].layout.parent_layout.locks = NULL;
+ local->lock[0].layout.parent_layout.lk_count = 0;
+ goto err;
+ }
+
+ return 0;
err:
- dht_mknod_finish (frame, this, -1);
- return 0;
+ if (lk_array != NULL) {
+ dht_lock_array_free(lk_array, count);
+ GF_FREE(lk_array);
+ }
+
+ return -1;
}
-int32_t
-dht_mknod_lock (call_frame_t *frame, xlator_t *subvol)
+static int
+dht_refresh_parent_layout_resume(call_frame_t *frame, xlator_t *this, int ret,
+ int invoke_cbk)
{
- dht_local_t *local = NULL;
- int count = 1, ret = -1;
- dht_lock_t **lk_array = NULL;
+ dht_local_t *local = NULL, *parent_local = NULL;
+ call_stub_t *stub = NULL;
+ call_frame_t *parent_frame = NULL;
- GF_VALIDATE_OR_GOTO ("dht", frame, err);
- GF_VALIDATE_OR_GOTO (frame->this->name, frame->local, err);
+ local = frame->local;
- local = frame->local;
+ stub = local->stub;
+ local->stub = NULL;
- lk_array = GF_CALLOC (count, sizeof (*lk_array), gf_common_mt_char);
+ parent_frame = stub->frame;
+ parent_local = parent_frame->local;
- if (lk_array == NULL)
- goto err;
+ if (ret < 0) {
+ parent_local->op_ret = -1;
+ parent_local->op_errno = local->op_errno ? local->op_errno : EIO;
+ } else {
+ parent_local->op_ret = 0;
+ }
- lk_array[0] = dht_lock_new (frame->this, subvol, &local->loc, F_RDLCK,
- DHT_LAYOUT_HEAL_DOMAIN);
+ call_resume(stub);
- if (lk_array[0] == NULL)
- goto err;
+ DHT_STACK_DESTROY(frame);
- local->lock.locks = lk_array;
- local->lock.lk_count = count;
+ return 0;
+}
- ret = dht_blocking_inodelk (frame, lk_array, count,
- dht_mknod_lock_cbk);
+static int
+dht_refresh_parent_layout_done(call_frame_t *frame)
+{
+ dht_local_t *local = NULL;
+ int ret = 0;
- if (ret < 0) {
- local->lock.locks = NULL;
- local->lock.lk_count = 0;
- goto err;
- }
+ local = frame->local;
- return 0;
-err:
- if (lk_array != NULL) {
- dht_lock_array_free (lk_array, count);
- GF_FREE (lk_array);
- }
+ if (local->op_ret < 0) {
+ ret = -1;
+ goto resume;
+ }
- return -1;
-}
+ dht_layout_set(frame->this, local->loc.inode,
+ local->selfheal.refreshed_layout);
+resume:
+ dht_refresh_parent_layout_resume(frame, frame->this, ret, 1);
+ return 0;
+}
-int
-dht_mknod (call_frame_t *frame, xlator_t *this,
- loc_t *loc, mode_t mode, dev_t rdev, mode_t umask, dict_t *params)
+static int
+dht_handle_parent_layout_change(xlator_t *this, call_stub_t *stub)
{
- xlator_t *subvol = NULL;
- int op_errno = -1;
- int i = 0;
- int ret = 0;
- dht_local_t *local = NULL;
- dht_conf_t *conf = NULL;
-
- VALIDATE_OR_GOTO (frame, err);
- VALIDATE_OR_GOTO (this, err);
- VALIDATE_OR_GOTO (loc, err);
+ call_frame_t *refresh_frame = NULL, *frame = NULL;
+ dht_local_t *refresh_local = NULL, *local = NULL;
- conf = this->private;
+ frame = stub->frame;
+ local = frame->local;
- dht_get_du_info (frame, this, loc);
-
- local = dht_local_init (frame, loc, NULL, GF_FOP_MKNOD);
- if (!local) {
- op_errno = ENOMEM;
- goto err;
- }
+ refresh_frame = copy_frame(frame);
+ if (!refresh_frame) {
+ gf_msg(this->name, GF_LOG_ERROR, ENOMEM, DHT_MSG_NO_MEMORY,
+ "mem allocation failed for refresh_frame");
+ return -1;
+ }
- subvol = dht_subvol_get_hashed (this, loc);
- if (!subvol) {
- gf_msg_debug (this->name, 0,
- "no subvolume in layout for path=%s",
- loc->path);
- op_errno = EIO;
- goto err;
- }
+ refresh_local = dht_local_init(refresh_frame, NULL, NULL, stub->fop);
+ if (!refresh_local) {
+ gf_msg(this->name, GF_LOG_ERROR, ENOMEM, DHT_MSG_NO_MEMORY,
+ "mem allocation failed for refresh_local");
+ return -1;
+ }
- /* Post remove-brick, the client layout may not be in sync with
- * disk layout because of lack of lookup. Hence,a mknod call
- * may fall on the decommissioned brick. Hence, if the
- * hashed_subvol is part of decommissioned bricks list, do a
- * lookup on parent dir. If a fix-layout is already done by the
- * remove-brick process, the parent directory layout will be in
- * sync with that of the disk. If fix-layout is still ending
- * on the parent directory, we can let the file get created on
- * the decommissioned brick which will be eventually migrated to
- * non-decommissioned brick based on the new layout.
- */
+ refresh_local->loc.inode = inode_ref(local->loc.parent);
+ gf_uuid_copy(refresh_local->loc.gfid, local->loc.parent->gfid);
- if (conf->decommission_subvols_cnt) {
- for (i = 0; i < conf->subvolume_cnt; i++) {
- if (conf->decommissioned_bricks[i] &&
- conf->decommissioned_bricks[i] == subvol) {
-
- gf_msg_debug (this->name, 0, "hashed subvol:%s is "
- "part of decommission brick list for "
- "file: %s", subvol->name, loc->path);
-
- /* dht_refresh_layout needs directory info in
- * local->loc. Hence, storing the parent_loc in
- * local->loc and storing the create context in
- * local->loc2. We will restore this information
- * in dht_creation do */
-
- ret = loc_copy (&local->loc2, &local->loc);
- if (ret) {
- gf_msg (this->name, GF_LOG_ERROR, ENOMEM,
- DHT_MSG_NO_MEMORY,
- "loc_copy failed %s", loc->path);
-
- goto err;
- }
+ refresh_local->stub = stub;
- local->params = dict_ref (params);
- local->rdev = rdev;
- local->mode = mode;
- local->umask = umask;
+ refresh_local->refresh_layout_unlock = dht_refresh_parent_layout_resume;
+ refresh_local->refresh_layout_done = dht_refresh_parent_layout_done;
- loc_wipe (&local->loc);
+ dht_refresh_layout(refresh_frame);
+ return 0;
+}
- ret = dht_build_parent_loc (this, &local->loc, loc,
- &op_errno);
+static int32_t
+dht_call_mkdir_stub(call_frame_t *frame, void *cookie, xlator_t *this,
+ int32_t op_ret, int32_t op_errno, dict_t *xdata)
+{
+ dht_local_t *local = NULL;
+ call_stub_t *stub = NULL;
- if (ret) {
- gf_msg (this->name, GF_LOG_ERROR, ENOMEM,
- DHT_MSG_NO_MEMORY,
- "parent loc build failed");
- goto err;
- }
+ local = frame->local;
+ stub = local->stub;
+ local->stub = NULL;
- ret = dht_mknod_lock (frame, subvol);
+ if (op_ret < 0) {
+ local->op_ret = -1;
+ local->op_errno = op_errno;
+ } else {
+ local->op_ret = 0;
+ }
- if (ret < 0) {
- gf_msg (this->name, GF_LOG_ERROR, 0,
- DHT_MSG_INODE_LK_ERROR,
- "locking parent failed");
- goto err;
- }
+ call_resume(stub);
- goto done;
- }
- }
- }
+ return 0;
+}
- dht_mknod_wind_to_avail_subvol (frame, this, subvol, loc, rdev, mode,
- umask, params);
+static int32_t
+dht_guard_parent_layout_and_namespace(xlator_t *subvol, call_stub_t *stub)
+{
+ dht_local_t *local = NULL;
+ int ret = -1;
+ loc_t *loc = NULL;
+ xlator_t *hashed_subvol = NULL, *this = NULL;
+ ;
+ call_frame_t *frame = NULL;
+ char pgfid[GF_UUID_BUF_SIZE] = {0};
+ int32_t *parent_disk_layout = NULL;
+ dht_layout_t *parent_layout = NULL;
+ dht_conf_t *conf = NULL;
+
+ GF_VALIDATE_OR_GOTO("dht", stub, err);
+
+ frame = stub->frame;
+ this = frame->this;
+
+ conf = this->private;
+
+ local = frame->local;
+
+ local->stub = stub;
+
+ /* TODO: recheck whether we should lock on src or dst if we do similar
+ * stale layout checks for rename.
+ */
+ loc = &stub->args.loc;
+
+ gf_uuid_unparse(loc->parent->gfid, pgfid);
+
+ if (local->params == NULL) {
+ local->params = dict_new();
+ if (local->params == NULL) {
+ local->op_errno = ENOMEM;
+ gf_msg(this->name, GF_LOG_WARNING, local->op_errno,
+ DHT_MSG_PARENT_LAYOUT_CHANGED,
+ "%s (%s/%s) (path: %s): "
+ "dict allocation failed",
+ gf_fop_list[stub->fop], pgfid, loc->name, loc->path);
+ goto err;
+ }
+ }
+
+ hashed_subvol = dht_subvol_get_hashed(this, loc);
+ if (hashed_subvol == NULL) {
+ local->op_errno = EINVAL;
+
+ gf_msg(this->name, GF_LOG_WARNING, local->op_errno,
+ DHT_MSG_PARENT_LAYOUT_CHANGED,
+ "%s (%s/%s) (path: %s): "
+ "hashed subvolume not found",
+ gf_fop_list[stub->fop], pgfid, loc->name, loc->path);
+ goto err;
+ }
+
+ parent_layout = dht_layout_get(this, loc->parent);
+
+ ret = dht_disk_layout_extract_for_subvol(this, parent_layout, hashed_subvol,
+ &parent_disk_layout);
+ if (ret == -1) {
+ local->op_errno = EINVAL;
+ gf_msg(this->name, GF_LOG_WARNING, local->op_errno,
+ DHT_MSG_PARENT_LAYOUT_CHANGED,
+ "%s (%s/%s) (path: %s): "
+ "extracting in-memory layout of parent failed. ",
+ gf_fop_list[stub->fop], pgfid, loc->name, loc->path);
+ goto err;
+ }
+
+ memcpy((void *)local->parent_disk_layout, (void *)parent_disk_layout,
+ sizeof(local->parent_disk_layout));
+
+ dht_layout_unref(this, parent_layout);
+ parent_layout = NULL;
+
+ ret = dict_set_str(local->params, GF_PREOP_PARENT_KEY, conf->xattr_name);
+ if (ret < 0) {
+ local->op_errno = -ret;
+ gf_msg(this->name, GF_LOG_WARNING, local->op_errno,
+ DHT_MSG_PARENT_LAYOUT_CHANGED,
+ "%s (%s/%s) (path: %s): "
+ "setting %s key in params dictionary failed. ",
+ gf_fop_list[stub->fop], pgfid, loc->name, loc->path,
+ GF_PREOP_PARENT_KEY);
+ goto err;
+ }
+
+ ret = dict_set_bin(local->params, conf->xattr_name, parent_disk_layout,
+ 4 * 4);
+ if (ret < 0) {
+ local->op_errno = -ret;
+ gf_msg(this->name, GF_LOG_WARNING, local->op_errno,
+ DHT_MSG_PARENT_LAYOUT_CHANGED,
+ "%s (%s/%s) (path: %s): "
+ "setting parent-layout in params dictionary failed. ",
+ gf_fop_list[stub->fop], pgfid, loc->name, loc->path);
+ goto err;
+ }
+
+ parent_disk_layout = NULL;
+ local->hashed_subvol = hashed_subvol;
+
+ local->current = &local->lock[0];
+ ret = dht_protect_namespace(frame, loc, hashed_subvol, &local->current->ns,
+ dht_call_mkdir_stub);
+ if (ret < 0)
+ goto err;
+
+ return 0;
+err:
-done:
- return 0;
+ if (parent_disk_layout != NULL)
+ GF_FREE(parent_disk_layout);
-err:
- op_errno = (op_errno == -1) ? errno : op_errno;
- DHT_STACK_UNWIND (mknod, frame, -1, op_errno,
- NULL, NULL, NULL, NULL, NULL);
+ if (parent_layout != NULL)
+ dht_layout_unref(this, parent_layout);
- return 0;
+ return -1;
}
-
int
-dht_symlink (call_frame_t *frame, xlator_t *this,
- const char *linkname, loc_t *loc, mode_t umask, dict_t *params)
+dht_mknod(call_frame_t *frame, xlator_t *this, loc_t *loc, mode_t mode,
+ dev_t rdev, mode_t umask, dict_t *params)
{
- xlator_t *subvol = NULL;
- int op_errno = -1;
- dht_local_t *local = NULL;
+ xlator_t *subvol = NULL;
+ int op_errno = -1;
+ int i = 0;
+ int ret = 0;
+ dht_local_t *local = NULL;
+ dht_conf_t *conf = NULL;
+
+ VALIDATE_OR_GOTO(frame, err);
+ VALIDATE_OR_GOTO(this, err);
+ VALIDATE_OR_GOTO(loc, err);
+
+ conf = this->private;
+
+ dht_get_du_info(frame, this, loc);
+
+ local = dht_local_init(frame, loc, NULL, GF_FOP_MKNOD);
+ if (!local) {
+ op_errno = ENOMEM;
+ goto err;
+ }
+
+ subvol = dht_subvol_get_hashed(this, loc);
+ if (!subvol) {
+ gf_msg_debug(this->name, 0, "no subvolume in layout for path=%s",
+ loc->path);
+ op_errno = EIO;
+ goto err;
+ }
+
+ /* Post remove-brick, the client layout may not be in sync with
+ * disk layout because of lack of lookup. Hence,a mknod call
+ * may fall on the decommissioned brick. Hence, if the
+ * hashed_subvol is part of decommissioned bricks list, do a
+ * lookup on parent dir. If a fix-layout is already done by the
+ * remove-brick process, the parent directory layout will be in
+ * sync with that of the disk. If fix-layout is still ending
+ * on the parent directory, we can let the file get created on
+ * the decommissioned brick which will be eventually migrated to
+ * non-decommissioned brick based on the new layout.
+ */
+
+ if (conf->decommission_subvols_cnt) {
+ for (i = 0; i < conf->subvolume_cnt; i++) {
+ if (conf->decommissioned_bricks[i] &&
+ conf->decommissioned_bricks[i] == subvol) {
+ gf_msg_debug(this->name, 0,
+ "hashed subvol:%s is "
+ "part of decommission brick list for "
+ "file: %s",
+ subvol->name, loc->path);
+
+ /* dht_refresh_layout needs directory info in
+ * local->loc. Hence, storing the parent_loc in
+ * local->loc and storing the create context in
+ * local->loc2. We will restore this information
+ * in dht_creation do */
+
+ ret = loc_copy(&local->loc2, &local->loc);
+ if (ret) {
+ gf_msg(this->name, GF_LOG_ERROR, ENOMEM, DHT_MSG_NO_MEMORY,
+ "loc_copy failed %s", loc->path);
- VALIDATE_OR_GOTO (frame, err);
- VALIDATE_OR_GOTO (this, err);
- VALIDATE_OR_GOTO (loc, err);
+ goto err;
+ }
- local = dht_local_init (frame, loc, NULL, GF_FOP_SYMLINK);
- if (!local) {
- op_errno = ENOMEM;
- goto err;
- }
+ local->params = dict_ref(params);
+ local->rdev = rdev;
+ local->mode = mode;
+ local->umask = umask;
- subvol = dht_subvol_get_hashed (this, loc);
- if (!subvol) {
- gf_msg_debug (this->name, 0,
- "no subvolume in layout for path=%s",
- loc->path);
- op_errno = EIO;
- goto err;
- }
+ loc_wipe(&local->loc);
- gf_msg_trace (this->name, 0,
- "creating %s on %s", loc->path, subvol->name);
+ ret = dht_build_parent_loc(this, &local->loc, loc, &op_errno);
- STACK_WIND_COOKIE (frame, dht_newfile_cbk, (void *)subvol, subvol,
- subvol->fops->symlink, linkname, loc, umask,
- params);
+ if (ret) {
+ gf_msg(this->name, GF_LOG_ERROR, ENOMEM, DHT_MSG_LOC_FAILED,
+ "parent loc build failed");
+ goto err;
+ }
- return 0;
+ ret = dht_mknod_lock(frame, subvol);
+
+ if (ret < 0) {
+ gf_msg(this->name, GF_LOG_ERROR, 0, DHT_MSG_INODE_LK_ERROR,
+ "locking parent failed");
+ goto err;
+ }
+
+ goto done;
+ }
+ }
+ }
+
+ dht_mknod_wind_to_avail_subvol(frame, this, subvol, loc, rdev, mode, umask,
+ params);
+
+done:
+ return 0;
err:
- op_errno = (op_errno == -1) ? errno : op_errno;
- DHT_STACK_UNWIND (link, frame, -1, op_errno,
- NULL, NULL, NULL, NULL, NULL);
+ op_errno = (op_errno == -1) ? errno : op_errno;
+ DHT_STACK_UNWIND(mknod, frame, -1, op_errno, NULL, NULL, NULL, NULL, NULL);
- return 0;
+ return 0;
}
-
int
-dht_unlink (call_frame_t *frame, xlator_t *this, loc_t *loc, int xflag,
- dict_t *xdata)
+dht_symlink(call_frame_t *frame, xlator_t *this, const char *linkname,
+ loc_t *loc, mode_t umask, dict_t *params)
{
- xlator_t *cached_subvol = NULL;
- int op_errno = -1;
- dht_local_t *local = NULL;
+ xlator_t *subvol = NULL;
+ int op_errno = -1;
+ dht_local_t *local = NULL;
- VALIDATE_OR_GOTO (frame, err);
- VALIDATE_OR_GOTO (this, err);
- VALIDATE_OR_GOTO (loc, err);
+ VALIDATE_OR_GOTO(frame, err);
+ VALIDATE_OR_GOTO(this, err);
+ VALIDATE_OR_GOTO(loc, err);
- local = dht_local_init (frame, loc, NULL, GF_FOP_UNLINK);
- if (!local) {
- op_errno = ENOMEM;
+ local = dht_local_init(frame, loc, NULL, GF_FOP_SYMLINK);
+ if (!local) {
+ op_errno = ENOMEM;
+ goto err;
+ }
- goto err;
- }
+ subvol = dht_subvol_get_hashed(this, loc);
+ if (!subvol) {
+ gf_msg_debug(this->name, 0, "no subvolume in layout for path=%s",
+ loc->path);
+ op_errno = EIO;
+ goto err;
+ }
- cached_subvol = local->cached_subvol;
- if (!cached_subvol) {
- gf_msg_debug (this->name, 0,
- "no cached subvolume for path=%s", loc->path);
- op_errno = EINVAL;
- goto err;
- }
+ gf_msg_trace(this->name, 0, "creating %s on %s", loc->path, subvol->name);
- local->flags = xflag;
- STACK_WIND (frame, dht_unlink_cbk,
- cached_subvol, cached_subvol->fops->unlink, loc,
- xflag, xdata);
+ STACK_WIND_COOKIE(frame, dht_newfile_cbk, (void *)subvol, subvol,
+ subvol->fops->symlink, linkname, loc, umask, params);
+
+ return 0;
- return 0;
err:
- op_errno = (op_errno == -1) ? errno : op_errno;
- DHT_STACK_UNWIND (unlink, frame, -1, op_errno, NULL, NULL, NULL);
+ op_errno = (op_errno == -1) ? errno : op_errno;
+ DHT_STACK_UNWIND(link, frame, -1, op_errno, NULL, NULL, NULL, NULL, NULL);
- return 0;
+ return 0;
}
int
-dht_link_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
- int op_ret, int op_errno,
- inode_t *inode, struct iatt *stbuf, struct iatt *preparent,
- struct iatt *postparent, dict_t *xdata)
+dht_unlink(call_frame_t *frame, xlator_t *this, loc_t *loc, int xflag,
+ dict_t *xdata)
{
- dht_local_t *local = NULL;
- int ret = -1;
- gf_boolean_t stbuf_merged = _gf_false;
- xlator_t *subvol = NULL;
-
- local = frame->local;
+ xlator_t *cached_subvol = NULL;
+ int op_errno = -1;
+ dht_local_t *local = NULL;
+
+ VALIDATE_OR_GOTO(frame, err);
+ VALIDATE_OR_GOTO(this, err);
+ VALIDATE_OR_GOTO(loc, err);
+
+ local = dht_local_init(frame, loc, NULL, GF_FOP_UNLINK);
+ if (!local) {
+ op_errno = ENOMEM;
+
+ goto err;
+ }
+
+ cached_subvol = local->cached_subvol;
+ if (!cached_subvol) {
+ gf_msg_debug(this->name, 0, "no cached subvolume for path=%s",
+ loc->path);
+ op_errno = EINVAL;
+ goto err;
+ }
+
+ local->flags = xflag;
+ STACK_WIND_COOKIE(frame, dht_unlink_cbk, cached_subvol, cached_subvol,
+ cached_subvol->fops->unlink, loc, xflag, xdata);
+
+ return 0;
+err:
+ op_errno = (op_errno == -1) ? errno : op_errno;
+ DHT_STACK_UNWIND(unlink, frame, -1, op_errno, NULL, NULL, NULL);
- if (op_ret == -1) {
- /* No continuation on DHT inode missing errors, as we should
- * then have a good stbuf that states P2 happened. We would
- * get inode missing if, the file completed migrated between
- * the lookup and the link call */
- goto out;
- }
+ return 0;
+}
- /* Update parent on success, even if P1/2 checks are positve.
- * The second call on success will further update the parent */
- if (local->loc.parent) {
- dht_inode_ctx_time_update (local->loc.parent, this,
- preparent, 0);
- dht_inode_ctx_time_update (local->loc.parent, this,
- postparent, 1);
- }
-
- /* Update linkto attrs, if this is the first call and non-P2,
- * if we detect P2 then we need to trust the attrs from the
- * second call, not the first */
- if (local->linked == _gf_true &&
- ((local->call_cnt == 1 && !IS_DHT_MIGRATION_PHASE2 (stbuf))
- || (local->call_cnt != 1 &&
- IS_DHT_MIGRATION_PHASE2 (&local->stbuf)))) {
- dht_iatt_merge (this, &local->stbuf, stbuf, NULL);
- stbuf_merged = _gf_true;
- dht_linkfile_attr_heal (frame, this);
- }
-
- /* No further P1/2 checks if we are in the second iteration of
- * the call */
- if (local->call_cnt != 1) {
- goto out;
- } else {
- /* Preserve the return values, in case the migration decides
- * to recreate the link on the same subvol that the current
- * hased for the link was created on. */
- dht_iatt_merge (this, &local->preparent,
- preparent, NULL);
- dht_iatt_merge (this, &local->postparent,
- postparent, NULL);
- if (!stbuf_merged) {
- dht_iatt_merge (this, &local->stbuf,
- stbuf, NULL);
- stbuf_merged = _gf_true;
- }
+static int
+dht_remove_stale_linkto_cbk(int ret, call_frame_t *sync_frame, void *data)
+{
+ DHT_STACK_DESTROY(sync_frame);
+ return 0;
+}
- local->inode = inode_ref (inode);
- }
+static int
+dht_remove_stale_linkto(void *data)
+{
+ call_frame_t *frame = NULL;
+ dht_local_t *local = NULL;
+ xlator_t *this = NULL;
+ dict_t *xdata_in = NULL;
+ int ret = 0;
+
+ GF_VALIDATE_OR_GOTO("dht", data, out);
+
+ frame = data;
+ local = frame->local;
+ this = frame->this;
+ GF_VALIDATE_OR_GOTO("dht", this, out);
+ GF_VALIDATE_OR_GOTO("dht", local, out);
+ GF_VALIDATE_OR_GOTO("dht", local->link_subvol, out);
+
+ xdata_in = dict_new();
+ if (!xdata_in)
+ goto out;
- local->op_ret = op_ret;
- local->op_errno = op_errno;
- local->rebalance.target_op_fn = dht_link2;
- dht_set_local_rebalance (this, local, stbuf, preparent,
- postparent, xdata);
-
- /* Check if the rebalance phase2 is true */
- if (IS_DHT_MIGRATION_PHASE2 (stbuf)) {
- ret = dht_inode_ctx_get_mig_info (this, local->loc.inode, NULL,
- &subvol);
- if (!subvol) {
- /* Phase 2 of migration */
- ret = dht_rebalance_complete_check (this, frame);
- if (!ret)
- return 0;
- } else {
- dht_link2 (this, subvol, frame, 0);
- return 0;
- }
- }
+ ret = dht_fill_dict_to_avoid_unlink_of_migrating_file(xdata_in);
+ if (ret) {
+ gf_msg(this->name, GF_LOG_WARNING, -ret, 0,
+ "Failed to set keys for stale linkto"
+ "deletion on path %s",
+ local->loc.path);
+ goto out;
+ }
+
+ ret = syncop_unlink(local->link_subvol, &local->loc, xdata_in, NULL);
+ if (ret) {
+ gf_msg(this->name, GF_LOG_WARNING, -ret, 0,
+ "Removal of linkto failed"
+ " on path %s at subvol %s",
+ local->loc.path, local->link_subvol->name);
+ }
+out:
+ if (xdata_in)
+ dict_unref(xdata_in);
+ return ret;
+}
- /* Check if the rebalance phase1 is true */
- if (IS_DHT_MIGRATION_PHASE1 (stbuf)) {
- ret = dht_inode_ctx_get_mig_info (this, local->loc.inode, NULL,
- &subvol);
- if (subvol) {
- dht_link2 (this, subvol, frame, 0);
- return 0;
- }
- ret = dht_rebalance_in_progress_check (this, frame);
- if (!ret)
- return 0;
+static int
+dht_link_cbk(call_frame_t *frame, void *cookie, xlator_t *this, int op_ret,
+ int op_errno, inode_t *inode, struct iatt *stbuf,
+ struct iatt *preparent, struct iatt *postparent, dict_t *xdata)
+{
+ dht_local_t *local = NULL;
+ int ret = -1;
+ gf_boolean_t stbuf_merged = _gf_false;
+ xlator_t *subvol = NULL;
+ call_frame_t *cleanup_frame = NULL;
+ dht_local_t *cleanup_local = NULL;
+
+ local = frame->local;
+
+ if (op_ret == -1) {
+ /* Remove the linkto if exists */
+ if (local->linked) {
+ cleanup_frame = create_frame(this, this->ctx->pool);
+ if (cleanup_frame) {
+ cleanup_local = dht_local_init(cleanup_frame, &local->loc2,
+ NULL, 0);
+ if (!cleanup_local || !local->link_subvol) {
+ DHT_STACK_DESTROY(cleanup_frame);
+ goto out;
+ }
+ cleanup_local->link_subvol = local->link_subvol;
+ FRAME_SU_DO(cleanup_frame, dht_local_t);
+ ret = synctask_new(this->ctx->env, dht_remove_stale_linkto,
+ dht_remove_stale_linkto_cbk, cleanup_frame,
+ cleanup_frame);
+ }
}
+ /* No continuation on DHT inode missing errors, as we should
+ * then have a good stbuf that states P2 happened. We would
+ * get inode missing if, the file completed migrated between
+ * the lookup and the link call */
+ goto out;
+ }
+
+ /* Update parent on success, even if P1/2 checks are positive.
+ * The second call on success will further update the parent */
+ if (local->loc.parent) {
+ dht_inode_ctx_time_update(local->loc.parent, this, preparent, 0);
+ dht_inode_ctx_time_update(local->loc.parent, this, postparent, 1);
+ }
+
+ /* Update linkto attrs, if this is the first call and non-P2,
+ * if we detect P2 then we need to trust the attrs from the
+ * second call, not the first */
+ if (local->linked == _gf_true &&
+ ((local->call_cnt == 1 && !IS_DHT_MIGRATION_PHASE2(stbuf)) ||
+ (local->call_cnt != 1 && IS_DHT_MIGRATION_PHASE2(&local->stbuf)))) {
+ dht_iatt_merge(this, &local->stbuf, stbuf);
+ stbuf_merged = _gf_true;
+ dht_linkfile_attr_heal(frame, this);
+ }
+
+ /* No further P1/2 checks if we are in the second iteration of
+ * the call */
+ if (local->call_cnt != 1) {
+ goto out;
+ } else {
+ /* Preserve the return values, in case the migration decides
+ * to recreate the link on the same subvol that the current
+ * hased for the link was created on. */
+ dht_iatt_merge(this, &local->preparent, preparent);
+ dht_iatt_merge(this, &local->postparent, postparent);
+ if (!stbuf_merged) {
+ dht_iatt_merge(this, &local->stbuf, stbuf);
+ stbuf_merged = _gf_true;
+ }
+
+ local->inode = inode_ref(inode);
+ }
+
+ local->op_ret = op_ret;
+ local->op_errno = op_errno;
+ local->rebalance.target_op_fn = dht_link2;
+ dht_set_local_rebalance(this, local, stbuf, preparent, postparent, xdata);
+
+ /* Check if the rebalance phase2 is true */
+ if (IS_DHT_MIGRATION_PHASE2(stbuf)) {
+ ret = dht_inode_ctx_get_mig_info(this, local->loc.inode, NULL, &subvol);
+ if (!subvol) {
+ /* Phase 2 of migration */
+ ret = dht_rebalance_complete_check(this, frame);
+ if (!ret)
+ return 0;
+ } else {
+ dht_link2(this, subvol, frame, 0);
+ return 0;
+ }
+ }
+
+ /* Check if the rebalance phase1 is true */
+ if (IS_DHT_MIGRATION_PHASE1(stbuf)) {
+ ret = dht_inode_ctx_get_mig_info(this, local->loc.inode, NULL, &subvol);
+ if (subvol) {
+ dht_link2(this, subvol, frame, 0);
+ return 0;
+ }
+ ret = dht_rebalance_in_progress_check(this, frame);
+ if (!ret)
+ return 0;
+ }
out:
- DHT_STRIP_PHASE1_FLAGS (stbuf);
+ DHT_STRIP_PHASE1_FLAGS(stbuf);
- DHT_STACK_UNWIND (link, frame, op_ret, op_errno, inode, stbuf,
- preparent, postparent, NULL);
+ dht_set_fixed_dir_stat(preparent);
+ dht_set_fixed_dir_stat(postparent);
+ DHT_STACK_UNWIND(link, frame, op_ret, op_errno, inode, stbuf, preparent,
+ postparent, NULL);
- return 0;
+ return 0;
}
-
-int
-dht_link2 (xlator_t *this, xlator_t *subvol, call_frame_t *frame, int ret)
+static int
+dht_link2(xlator_t *this, xlator_t *subvol, call_frame_t *frame, int ret)
{
- dht_local_t *local = NULL;
- int op_errno = EINVAL;
+ dht_local_t *local = NULL;
+ int op_errno = EINVAL;
- local = frame->local;
- if (!local)
- goto err;
+ local = frame->local;
+ if (!local)
+ goto err;
- op_errno = local->op_errno;
+ op_errno = local->op_errno;
- if (we_are_not_migrating (ret)) {
- /* This dht xlator is not migrating the file. Unwind and
- * pass on the original mode bits so the higher DHT layer
- * can handle this.
- */
- DHT_STACK_UNWIND (link, frame, local->op_ret, op_errno,
- local->inode,
- &local->stbuf, &local->preparent,
- &local->postparent, NULL);
- return 0;
- }
+ if (we_are_not_migrating(ret)) {
+ /* This dht xlator is not migrating the file. Unwind and
+ * pass on the original mode bits so the higher DHT layer
+ * can handle this.
+ */
+ dht_set_fixed_dir_stat(&local->preparent);
+ dht_set_fixed_dir_stat(&local->postparent);
- if (subvol == NULL) {
- op_errno = EINVAL;
- goto err;
- }
+ DHT_STACK_UNWIND(link, frame, local->op_ret, op_errno, local->inode,
+ &local->stbuf, &local->preparent, &local->postparent,
+ NULL);
+ return 0;
+ }
- /* Second call to create link file could result in EEXIST as the
- * first call created the linkto in the currently
- * migrating subvol, which could be the new hashed subvol */
- if (local->link_subvol == subvol) {
- DHT_STRIP_PHASE1_FLAGS (&local->stbuf);
- DHT_STACK_UNWIND (link, frame, 0, 0, local->inode,
- &local->stbuf, &local->preparent,
- &local->postparent, NULL);
+ if (subvol == NULL) {
+ op_errno = EINVAL;
+ goto err;
+ }
- return 0;
- }
+ /* Second call to create link file could result in EEXIST as the
+ * first call created the linkto in the currently
+ * migrating subvol, which could be the new hashed subvol */
+ if (local->link_subvol == subvol) {
+ DHT_STRIP_PHASE1_FLAGS(&local->stbuf);
+ dht_set_fixed_dir_stat(&local->preparent);
+ dht_set_fixed_dir_stat(&local->postparent);
+ DHT_STACK_UNWIND(link, frame, 0, 0, local->inode, &local->stbuf,
+ &local->preparent, &local->postparent, NULL);
+
+ return 0;
+ }
- local->call_cnt = 2;
+ local->call_cnt = 2;
- STACK_WIND (frame, dht_link_cbk, subvol, subvol->fops->link,
- &local->loc, &local->loc2, NULL);
+ STACK_WIND(frame, dht_link_cbk, subvol, subvol->fops->link, &local->loc,
+ &local->loc2, local->xattr_req);
- return 0;
+ return 0;
err:
- DHT_STACK_UNWIND (link, frame, -1, op_errno, NULL, NULL, NULL,
- NULL, NULL);
+ DHT_STACK_UNWIND(link, frame, -1, op_errno, NULL, NULL, NULL, NULL, NULL);
- return 0;
+ return 0;
}
-int
-dht_link_linkfile_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
- int op_ret, int op_errno,
- inode_t *inode, struct iatt *stbuf,
- struct iatt *preparent, struct iatt *postparent,
- dict_t *xdata)
+static int
+dht_link_linkfile_cbk(call_frame_t *frame, void *cookie, xlator_t *this,
+ int op_ret, int op_errno, inode_t *inode,
+ struct iatt *stbuf, struct iatt *preparent,
+ struct iatt *postparent, dict_t *xdata)
{
- dht_local_t *local = NULL;
- xlator_t *srcvol = NULL;
+ dht_local_t *local = NULL;
+ xlator_t *srcvol = NULL;
- if (op_ret == -1)
- goto err;
+ if (op_ret == -1)
+ goto err;
- local = frame->local;
- srcvol = local->linkfile.srcvol;
+ local = frame->local;
+ srcvol = local->linkfile.srcvol;
- STACK_WIND (frame, dht_link_cbk, srcvol, srcvol->fops->link,
- &local->loc, &local->loc2, xdata);
+ STACK_WIND(frame, dht_link_cbk, srcvol, srcvol->fops->link, &local->loc,
+ &local->loc2, local->xattr_req);
- return 0;
+ return 0;
err:
- DHT_STRIP_PHASE1_FLAGS (stbuf);
- DHT_STACK_UNWIND (link, frame, op_ret, op_errno, inode, stbuf, preparent,
- postparent, NULL);
+ DHT_STRIP_PHASE1_FLAGS(stbuf);
+ dht_set_fixed_dir_stat(preparent);
+ dht_set_fixed_dir_stat(postparent);
+ DHT_STACK_UNWIND(link, frame, op_ret, op_errno, inode, stbuf, preparent,
+ postparent, xdata);
- return 0;
+ return 0;
}
-
int
-dht_link (call_frame_t *frame, xlator_t *this,
- loc_t *oldloc, loc_t *newloc, dict_t *xdata)
+dht_link(call_frame_t *frame, xlator_t *this, loc_t *oldloc, loc_t *newloc,
+ dict_t *xdata)
{
- xlator_t *cached_subvol = NULL;
- xlator_t *hashed_subvol = NULL;
- int op_errno = -1;
- int ret = -1;
- dht_local_t *local = NULL;
+ xlator_t *cached_subvol = NULL;
+ xlator_t *hashed_subvol = NULL;
+ int op_errno = -1;
+ int ret = -1;
+ dht_local_t *local = NULL;
+
+ VALIDATE_OR_GOTO(frame, err);
+ VALIDATE_OR_GOTO(this, err);
+ VALIDATE_OR_GOTO(oldloc, err);
+ VALIDATE_OR_GOTO(newloc, err);
+
+ local = dht_local_init(frame, oldloc, NULL, GF_FOP_LINK);
+ if (!local) {
+ op_errno = ENOMEM;
+
+ goto err;
+ }
+ local->call_cnt = 1;
+
+ cached_subvol = local->cached_subvol;
+ if (!cached_subvol) {
+ gf_msg_debug(this->name, 0, "no cached subvolume for path=%s",
+ oldloc->path);
+ op_errno = ENOENT;
+ goto err;
+ }
+
+ hashed_subvol = dht_subvol_get_hashed(this, newloc);
+ if (!hashed_subvol) {
+ gf_msg_debug(this->name, 0, "no subvolume in layout for path=%s",
+ newloc->path);
+ op_errno = EIO;
+ goto err;
+ }
+
+ ret = loc_copy(&local->loc2, newloc);
+ if (ret == -1) {
+ op_errno = ENOMEM;
+ goto err;
+ }
+ if (xdata)
+ local->xattr_req = dict_ref(xdata);
+
+ if (hashed_subvol != cached_subvol) {
+ gf_uuid_copy(local->gfid, oldloc->inode->gfid);
+ dht_linkfile_create(frame, dht_link_linkfile_cbk, this, cached_subvol,
+ hashed_subvol, newloc);
+ } else {
+ STACK_WIND(frame, dht_link_cbk, cached_subvol,
+ cached_subvol->fops->link, oldloc, newloc, xdata);
+ }
+
+ return 0;
- VALIDATE_OR_GOTO (frame, err);
- VALIDATE_OR_GOTO (this, err);
- VALIDATE_OR_GOTO (oldloc, err);
- VALIDATE_OR_GOTO (newloc, err);
-
- local = dht_local_init (frame, oldloc, NULL, GF_FOP_LINK);
- if (!local) {
- op_errno = ENOMEM;
-
- goto err;
- }
- local->call_cnt = 1;
+err:
+ op_errno = (op_errno == -1) ? errno : op_errno;
+ DHT_STACK_UNWIND(link, frame, -1, op_errno, NULL, NULL, NULL, NULL, NULL);
- cached_subvol = local->cached_subvol;
- if (!cached_subvol) {
- gf_msg_debug (this->name, 0,
- "no cached subvolume for path=%s", oldloc->path);
- op_errno = ENOENT;
- goto err;
- }
+ return 0;
+}
- hashed_subvol = dht_subvol_get_hashed (this, newloc);
- if (!hashed_subvol) {
- gf_msg_debug (this->name, 0,
- "no subvolume in layout for path=%s",
- newloc->path);
- op_errno = EIO;
- goto err;
- }
+int
+dht_create_cbk(call_frame_t *frame, void *cookie, xlator_t *this, int op_ret,
+ int op_errno, fd_t *fd, inode_t *inode, struct iatt *stbuf,
+ struct iatt *preparent, struct iatt *postparent, dict_t *xdata)
+{
+ xlator_t *prev = NULL;
+ int ret = -1;
+ dht_local_t *local = NULL;
+ gf_boolean_t parent_layout_changed = _gf_false;
+ char pgfid[GF_UUID_BUF_SIZE] = {0};
+ xlator_t *subvol = NULL;
+
+ local = frame->local;
+
+ local = frame->local;
+ if (!local) {
+ op_ret = -1;
+ op_errno = EINVAL;
+ goto out;
+ }
- ret = loc_copy (&local->loc2, newloc);
- if (ret == -1) {
- op_errno = ENOMEM;
- goto err;
- }
+ if (op_ret == -1) {
+ local->op_errno = op_errno;
+ parent_layout_changed = (xdata &&
+ dict_get(xdata, GF_PREOP_CHECK_FAILED))
+ ? _gf_true
+ : _gf_false;
+
+ if (parent_layout_changed) {
+ if (local && local->lock[0].layout.parent_layout.locks) {
+ /* Returning failure as the layout could not be fixed even under
+ * the lock */
+ goto out;
+ }
- if (hashed_subvol != cached_subvol) {
- gf_uuid_copy (local->gfid, oldloc->inode->gfid);
- dht_linkfile_create (frame, dht_link_linkfile_cbk, this,
- cached_subvol, hashed_subvol, newloc);
- } else {
- STACK_WIND (frame, dht_link_cbk,
- cached_subvol, cached_subvol->fops->link,
- oldloc, newloc, xdata);
- }
+ gf_uuid_unparse(local->loc.parent->gfid, pgfid);
+ gf_msg(this->name, GF_LOG_INFO, 0, DHT_MSG_PARENT_LAYOUT_CHANGED,
+ "create (%s/%s) (path: %s): parent layout "
+ "changed. Attempting a layout refresh and then a "
+ "retry",
+ pgfid, local->loc.name, local->loc.path);
- return 0;
+ /*
+ dht_refresh_layout needs directory info in local->loc.Hence,
+ storing the parent_loc in local->loc and storing the create
+ context in local->loc2. We will restore this information in
+ dht_creation_do.
+ */
-err:
- op_errno = (op_errno == -1) ? errno : op_errno;
- DHT_STACK_UNWIND (link, frame, -1, op_errno, NULL, NULL, NULL, NULL, NULL);
+ loc_wipe(&local->loc2);
- return 0;
-}
+ ret = loc_copy(&local->loc2, &local->loc);
+ if (ret) {
+ gf_msg(this->name, GF_LOG_ERROR, ENOMEM, DHT_MSG_NO_MEMORY,
+ "loc_copy failed %s", local->loc.path);
+ goto out;
+ }
-int
-dht_create_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
- int op_ret, int op_errno,
- fd_t *fd, inode_t *inode, struct iatt *stbuf,
- struct iatt *preparent, struct iatt *postparent, dict_t *xdata)
-{
- call_frame_t *prev = NULL;
- int ret = -1;
- dht_local_t *local = NULL;
+ loc_wipe(&local->loc);
- local = frame->local;
- if (!local) {
- op_ret = -1;
- op_errno = EINVAL;
- goto out;
- }
+ ret = dht_build_parent_loc(this, &local->loc, &local->loc2,
+ &op_errno);
- if (op_ret == -1)
+ if (ret) {
+ gf_msg(this->name, GF_LOG_ERROR, ENOMEM, DHT_MSG_LOC_FAILED,
+ "parent loc build failed");
goto out;
+ }
- prev = cookie;
+ subvol = dht_subvol_get_hashed(this, &local->loc2);
- if (local->loc.parent) {
- dht_inode_ctx_time_update (local->loc.parent, this,
- preparent, 0);
+ ret = dht_create_lock(frame, subvol);
+ if (ret < 0) {
+ gf_msg(this->name, GF_LOG_ERROR, 0, DHT_MSG_INODE_LK_ERROR,
+ "locking parent failed");
+ goto out;
+ }
- dht_inode_ctx_time_update (local->loc.parent, this,
- postparent, 1);
+ return 0;
}
- ret = dht_layout_preset (this, prev->this, inode);
- if (ret != 0) {
- gf_msg_debug (this->name, 0,
- "could not set preset layout for subvol %s",
- prev->this->name);
- op_ret = -1;
- op_errno = EINVAL;
- goto out;
- }
+ goto out;
+ }
+
+ prev = cookie;
+
+ if (local->loc.parent) {
+ dht_inode_ctx_time_update(local->loc.parent, this, preparent, 0);
+
+ dht_inode_ctx_time_update(local->loc.parent, this, postparent, 1);
+ }
+
+ ret = dht_fd_ctx_set(this, fd, prev);
+ if (ret != 0) {
+ gf_msg_debug(this->name, 0,
+ "Possible fd leak. "
+ "Could not set fd ctx for subvol %s",
+ prev->name);
+ }
+
+ ret = dht_layout_preset(this, prev, inode);
+ if (ret != 0) {
+ gf_msg_debug(this->name, 0, "could not set preset layout for subvol %s",
+ prev->name);
+ op_ret = -1;
+ op_errno = EINVAL;
+ goto out;
+ }
- local->op_errno = op_errno;
+ local->op_errno = op_errno;
- if (local->linked == _gf_true) {
- local->stbuf = *stbuf;
- dht_linkfile_attr_heal (frame, this);
- }
+ if (local->linked == _gf_true) {
+ local->stbuf = *stbuf;
+ dht_linkfile_attr_heal(frame, this);
+ }
out:
- DHT_STRIP_PHASE1_FLAGS (stbuf);
+ DHT_STRIP_PHASE1_FLAGS(stbuf);
+ dht_set_fixed_dir_stat(preparent);
+ dht_set_fixed_dir_stat(postparent);
- if (local && local->lock.locks) {
- /* store op_errno for failure case*/
- local->op_errno = op_errno;
- local->refresh_layout_unlock (frame, this, op_ret);
+ if (local && local->lock[0].layout.parent_layout.locks) {
+ /* store op_errno for failure case*/
+ local->op_errno = op_errno;
+ local->refresh_layout_unlock(frame, this, op_ret, 1);
- if (op_ret == 0) {
- DHT_STACK_UNWIND (create, frame, op_ret, op_errno, fd,
- inode, stbuf, preparent, postparent,
- xdata);
- }
- } else {
- DHT_STACK_UNWIND (create, frame, op_ret, op_errno, fd, inode,
- stbuf, preparent, postparent, xdata);
- }
- return 0;
+ if (op_ret == 0) {
+ DHT_STACK_UNWIND(create, frame, op_ret, op_errno, fd, inode, stbuf,
+ preparent, postparent, xdata);
+ }
+ } else {
+ DHT_STACK_UNWIND(create, frame, op_ret, op_errno, fd, inode, stbuf,
+ preparent, postparent, xdata);
+ }
+ return 0;
}
-int
-dht_create_linkfile_create_cbk (call_frame_t *frame, void *cookie,
- xlator_t *this,
- int32_t op_ret, int32_t op_errno,
- inode_t *inode, struct iatt *stbuf,
- struct iatt *preparent, struct iatt *postparent,
- dict_t *xdata)
+static int
+dht_create_linkfile_create_cbk(call_frame_t *frame, void *cookie,
+ xlator_t *this, int32_t op_ret, int32_t op_errno,
+ inode_t *inode, struct iatt *stbuf,
+ struct iatt *preparent, struct iatt *postparent,
+ dict_t *xdata)
{
- dht_local_t *local = NULL;
- xlator_t *cached_subvol = NULL;
- dht_conf_t *conf = NULL;
+ dht_local_t *local = NULL;
+ xlator_t *cached_subvol = NULL;
+ dht_conf_t *conf = NULL;
- local = frame->local;
- if (!local) {
- op_errno = EINVAL;
- goto err;
- }
+ local = frame->local;
+ if (!local) {
+ op_errno = EINVAL;
+ goto err;
+ }
- if (op_ret == -1) {
- local->op_errno = op_errno;
- goto err;
- }
+ if (op_ret == -1) {
+ local->op_errno = op_errno;
+ goto err;
+ }
- conf = this->private;
- if (!conf) {
- local->op_errno = EINVAL;
- op_errno = EINVAL;
- goto err;
- }
+ conf = this->private;
+ if (!conf) {
+ local->op_errno = EINVAL;
+ op_errno = EINVAL;
+ goto err;
+ }
- cached_subvol = local->cached_subvol;
+ cached_subvol = local->cached_subvol;
- if (local->params) {
- dict_del (local->params, conf->link_xattr_name);
- dict_del (local->params, GLUSTERFS_INTERNAL_FOP_KEY);
- }
+ if (local->params) {
+ dict_del(local->params, conf->link_xattr_name);
+ dict_del(local->params, GLUSTERFS_INTERNAL_FOP_KEY);
+ }
- STACK_WIND (frame, dht_create_cbk,
- cached_subvol, cached_subvol->fops->create,
- &local->loc, local->flags, local->mode,
- local->umask, local->fd, local->params);
+ STACK_WIND_COOKIE(frame, dht_create_cbk, cached_subvol, cached_subvol,
+ cached_subvol->fops->create, &local->loc, local->flags,
+ local->mode, local->umask, local->fd, local->params);
- return 0;
+ return 0;
err:
- if (local && local->lock.locks) {
- local->refresh_layout_unlock (frame, this, -1);
- } else {
- DHT_STACK_UNWIND (create, frame, -1,
- op_errno, NULL, NULL, NULL,
- NULL, NULL, NULL);
- }
- return 0;
+ if (local && local->lock[0].layout.parent_layout.locks) {
+ local->refresh_layout_unlock(frame, this, -1, 1);
+ } else {
+ DHT_STACK_UNWIND(create, frame, -1, op_errno, NULL, NULL, NULL, NULL,
+ NULL, NULL);
+ }
+ return 0;
}
-int
-dht_create_wind_to_avail_subvol (call_frame_t *frame, xlator_t *this,
- xlator_t *subvol, loc_t *loc, int32_t flags,
- mode_t mode, mode_t umask, fd_t *fd,
- dict_t *params)
+static int
+dht_create_wind_to_avail_subvol(call_frame_t *frame, xlator_t *this,
+ xlator_t *subvol, loc_t *loc, int32_t flags,
+ mode_t mode, mode_t umask, fd_t *fd,
+ dict_t *params)
{
- dht_local_t *local = NULL;
- xlator_t *avail_subvol = NULL;
+ dht_local_t *local = NULL;
+ xlator_t *avail_subvol = NULL;
- local = frame->local;
+ local = frame->local;
- if (!dht_is_subvol_filled (this, subvol)) {
- gf_msg_debug (this->name, 0,
- "creating %s on %s", loc->path,
- subvol->name);
+ if (!dht_is_subvol_filled(this, subvol)) {
+ gf_msg_debug(this->name, 0, "creating %s on %s", loc->path,
+ subvol->name);
- STACK_WIND (frame, dht_create_cbk,
- subvol, subvol->fops->create,
- loc, flags, mode, umask, fd, params);
+ dht_set_parent_layout_in_dict(loc, this, local);
- } else {
- avail_subvol = dht_free_disk_available_subvol (this, subvol, local);
+ STACK_WIND_COOKIE(frame, dht_create_cbk, subvol, subvol,
+ subvol->fops->create, loc, flags, mode, umask, fd,
+ params);
- if (avail_subvol != subvol) {
- local->params = dict_ref (params);
- local->flags = flags;
- local->mode = mode;
- local->umask = umask;
- local->cached_subvol = avail_subvol;
- local->hashed_subvol = subvol;
+ } else {
+ avail_subvol = dht_free_disk_available_subvol(this, subvol, local);
- gf_msg_debug (this->name, 0,
- "creating %s on %s (link at %s)", loc->path,
- avail_subvol->name, subvol->name);
+ if (avail_subvol != subvol) {
+ local->cached_subvol = avail_subvol;
+ local->hashed_subvol = subvol;
- dht_linkfile_create (frame, dht_create_linkfile_create_cbk,
- this, avail_subvol, subvol, loc);
-
- goto out;
- }
+ gf_msg_debug(this->name, 0, "creating %s on %s (link at %s)",
+ loc->path, avail_subvol->name, subvol->name);
- gf_msg_debug (this->name, 0,
- "creating %s on %s", loc->path, subvol->name);
+ dht_linkfile_create(frame, dht_create_linkfile_create_cbk, this,
+ avail_subvol, subvol, loc);
- STACK_WIND (frame, dht_create_cbk,
- subvol, subvol->fops->create,
- loc, flags, mode, umask, fd, params);
+ goto out;
}
+
+ gf_msg_debug(this->name, 0, "creating %s on %s", loc->path,
+ subvol->name);
+
+ dht_set_parent_layout_in_dict(loc, this, local);
+
+ STACK_WIND_COOKIE(frame, dht_create_cbk, subvol, subvol,
+ subvol->fops->create, loc, flags, mode, umask, fd,
+ params);
+ }
out:
- return 0;
+ return 0;
}
int
-dht_build_parent_loc (xlator_t *this, loc_t *parent, loc_t *child,
- int32_t *op_errno)
+dht_build_parent_loc(xlator_t *this, loc_t *parent, loc_t *child,
+ int32_t *op_errno)
{
- inode_table_t *table = NULL;
- int ret = -1;
+ inode_table_t *table = NULL;
+ int ret = -1;
- if (!parent || !child) {
- if (op_errno)
- *op_errno = EINVAL;
- goto out;
- }
+ if (!parent || !child) {
+ if (op_errno)
+ *op_errno = EINVAL;
+ goto out;
+ }
- if (child->parent) {
- parent->inode = inode_ref (child->parent);
- if (!parent->inode) {
- if (op_errno)
- *op_errno = EINVAL;
- goto out;
- }
+ if (child->parent) {
+ parent->inode = inode_ref(child->parent);
+ if (!parent->inode) {
+ if (op_errno)
+ *op_errno = EINVAL;
+ goto out;
+ }
- gf_uuid_copy (parent->gfid, child->pargfid);
+ gf_uuid_copy(parent->gfid, child->pargfid);
- ret = 0;
+ ret = 0;
- goto out;
- } else {
- if (gf_uuid_is_null (child->pargfid)) {
- if (op_errno)
- *op_errno = EINVAL;
- goto out;
- }
+ goto out;
+ } else {
+ if (gf_uuid_is_null(child->pargfid)) {
+ if (op_errno)
+ *op_errno = EINVAL;
+ goto out;
+ }
- table = this->itable;
+ table = this->itable;
- if (!table) {
- if (op_errno) {
- *op_errno = EINVAL;
- goto out;
- }
- }
+ if (!table) {
+ if (op_errno) {
+ *op_errno = EINVAL;
+ goto out;
+ }
+ }
- parent->inode = inode_find (table, child->pargfid);
+ parent->inode = inode_find(table, child->pargfid);
- if (!parent->inode) {
- if (op_errno) {
- *op_errno = EINVAL;
- goto out;
- }
- }
+ if (!parent->inode) {
+ if (op_errno) {
+ *op_errno = EINVAL;
+ goto out;
+ }
+ }
- gf_uuid_copy (parent->gfid, child->pargfid);
+ gf_uuid_copy(parent->gfid, child->pargfid);
- ret = 0;
- }
+ ret = 0;
+ }
out:
- return ret;
+ return ret;
}
-
-int32_t
-dht_create_do (call_frame_t *frame)
+static int32_t
+dht_create_do(call_frame_t *frame)
{
- dht_local_t *local = NULL;
- dht_layout_t *refreshed = NULL;
- xlator_t *subvol = NULL;
- xlator_t *this = NULL;
- dht_conf_t *conf = NULL;
- dht_methods_t *methods = NULL;
+ dht_local_t *local = NULL;
+ dht_layout_t *refreshed = NULL;
+ xlator_t *subvol = NULL;
+ xlator_t *this = NULL;
+ dht_conf_t *conf = NULL;
+ dht_methods_t *methods = NULL;
- local = frame->local;
+ local = frame->local;
- this = THIS;
+ this = THIS;
- conf = this->private;
+ conf = this->private;
- GF_VALIDATE_OR_GOTO (this->name, conf, err);
+ GF_VALIDATE_OR_GOTO(this->name, conf, err);
- methods = &(conf->methods);
+ methods = &(conf->methods);
- /* We don't need parent_loc anymore */
- loc_wipe (&local->loc);
+ /* We don't need parent_loc anymore */
+ loc_wipe(&local->loc);
- loc_copy (&local->loc, &local->loc2);
+ loc_copy(&local->loc, &local->loc2);
- loc_wipe (&local->loc2);
+ loc_wipe(&local->loc2);
- refreshed = local->selfheal.refreshed_layout;
+ refreshed = local->selfheal.refreshed_layout;
- subvol = methods->layout_search (this, refreshed, local->loc.name);
+ subvol = methods->layout_search(this, refreshed, local->loc.name);
- if (!subvol) {
- gf_msg (this->name, GF_LOG_ERROR, 0,
- DHT_MSG_HASHED_SUBVOL_GET_FAILED, "no subvolume in "
- "layout for path=%s", local->loc.path);
- local->op_errno = ENOENT;
- goto err;
- }
+ if (!subvol) {
+ gf_msg(this->name, GF_LOG_ERROR, 0, DHT_MSG_HASHED_SUBVOL_GET_FAILED,
+ "no subvolume in "
+ "layout for path=%s",
+ local->loc.path);
+ local->op_errno = ENOENT;
+ goto err;
+ }
- dht_create_wind_to_avail_subvol (frame, this, subvol, &local->loc,
- local->flags, local->mode,
- local->umask, local->fd, local->params);
- return 0;
+ dht_create_wind_to_avail_subvol(frame, this, subvol, &local->loc,
+ local->flags, local->mode, local->umask,
+ local->fd, local->params);
+ return 0;
err:
- local->refresh_layout_unlock (frame, this, -1);
+ local->refresh_layout_unlock(frame, this, -1, 1);
- return 0;
+ return 0;
}
-int32_t
-dht_create_unlock_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
- int32_t op_ret, int32_t op_errno, dict_t *xdata)
+static int32_t
+dht_create_unlock_cbk(call_frame_t *frame, void *cookie, xlator_t *this,
+ int32_t op_ret, int32_t op_errno, dict_t *xdata)
{
- DHT_STACK_DESTROY (frame);
- return 0;
+ DHT_STACK_DESTROY(frame);
+ return 0;
}
-int32_t
-dht_create_finish (call_frame_t *frame, xlator_t *this, int op_ret)
+static int32_t
+dht_create_finish(call_frame_t *frame, xlator_t *this, int op_ret,
+ int invoke_cbk)
{
- dht_local_t *local = NULL, *lock_local = NULL;
- call_frame_t *lock_frame = NULL;
- int lock_count = 0;
-
- local = frame->local;
- lock_count = dht_lock_count (local->lock.locks, local->lock.lk_count);
- if (lock_count == 0)
- goto done;
-
- lock_frame = copy_frame (frame);
- if (lock_frame == NULL) {
- goto done;
- }
-
- lock_local = dht_local_init (lock_frame, &local->loc, NULL,
- lock_frame->root->op);
- if (lock_local == NULL) {
- goto done;
- }
-
- lock_local->lock.locks = local->lock.locks;
- lock_local->lock.lk_count = local->lock.lk_count;
-
- local->lock.locks = NULL;
- local->lock.lk_count = 0;
-
- dht_unlock_inodelk (lock_frame, lock_local->lock.locks,
- lock_local->lock.lk_count,
- dht_create_unlock_cbk);
- lock_frame = NULL;
+ dht_local_t *local = NULL, *lock_local = NULL;
+ call_frame_t *lock_frame = NULL;
+ int lock_count = 0;
+
+ local = frame->local;
+ lock_count = dht_lock_count(local->lock[0].layout.parent_layout.locks,
+ local->lock[0].layout.parent_layout.lk_count);
+ if (lock_count == 0)
+ goto done;
+
+ lock_frame = copy_frame(frame);
+ if (lock_frame == NULL) {
+ goto done;
+ }
+
+ lock_local = dht_local_init(lock_frame, &local->loc, NULL,
+ lock_frame->root->op);
+ if (lock_local == NULL) {
+ goto done;
+ }
+
+ lock_local->lock[0]
+ .layout.parent_layout.locks = local->lock[0].layout.parent_layout.locks;
+ lock_local->lock[0].layout.parent_layout.lk_count =
+ local->lock[0].layout.parent_layout.lk_count;
+
+ local->lock[0].layout.parent_layout.locks = NULL;
+ local->lock[0].layout.parent_layout.lk_count = 0;
+
+ dht_unlock_inodelk(lock_frame,
+ lock_local->lock[0].layout.parent_layout.locks,
+ lock_local->lock[0].layout.parent_layout.lk_count,
+ dht_create_unlock_cbk);
+ lock_frame = NULL;
done:
- if (lock_frame != NULL) {
- DHT_STACK_DESTROY (lock_frame);
- }
+ if (lock_frame != NULL) {
+ DHT_STACK_DESTROY(lock_frame);
+ }
- if (op_ret == 0)
- return 0;
-
- DHT_STACK_UNWIND (create, frame, op_ret, local->op_errno, NULL, NULL,
- NULL, NULL, NULL, NULL);
+ if (op_ret == 0)
return 0;
+
+ DHT_STACK_UNWIND(create, frame, op_ret, local->op_errno, NULL, NULL, NULL,
+ NULL, NULL, NULL);
+ return 0;
}
-int32_t
-dht_create_lock_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
- int32_t op_ret, int32_t op_errno, dict_t *xdata)
+static int32_t
+dht_create_lock_cbk(call_frame_t *frame, void *cookie, xlator_t *this,
+ int32_t op_ret, int32_t op_errno, dict_t *xdata)
{
- dht_local_t *local = NULL;
+ dht_local_t *local = NULL;
- local = frame->local;
+ local = frame->local;
- if (!local) {
- goto err;
- }
+ if (!local) {
+ goto err;
+ }
- if (op_ret < 0) {
- gf_msg ("DHT", GF_LOG_ERROR, 0, DHT_MSG_INODE_LK_ERROR,
- "Create lock failed for file: %s", local->loc2.name);
+ if (op_ret < 0) {
+ gf_msg("DHT", GF_LOG_ERROR, 0, DHT_MSG_INODE_LK_ERROR,
+ "Create lock failed for file: %s", local->loc2.name);
- local->op_errno = op_errno;
+ local->op_errno = op_errno;
- goto err;
- }
+ goto err;
+ }
- local->refresh_layout_unlock = dht_create_finish;
+ local->refresh_layout_unlock = dht_create_finish;
- local->refresh_layout_done = dht_create_do;
+ local->refresh_layout_done = dht_create_do;
- dht_refresh_layout (frame);
+ dht_refresh_layout(frame);
- return 0;
+ return 0;
err:
- dht_create_finish (frame, this, -1);
- return 0;
+ if (local)
+ dht_create_finish(frame, this, -1, 0);
+ else
+ DHT_STACK_UNWIND(create, frame, -1, EINVAL, NULL, NULL, NULL, NULL,
+ NULL, NULL);
+ return 0;
}
int32_t
-dht_create_lock (call_frame_t *frame, xlator_t *subvol)
+dht_create_lock(call_frame_t *frame, xlator_t *subvol)
{
- dht_local_t *local = NULL;
- int count = 1, ret = -1;
- dht_lock_t **lk_array = NULL;
+ dht_local_t *local = NULL;
+ int count = 1, ret = -1;
+ dht_lock_t **lk_array = NULL;
- GF_VALIDATE_OR_GOTO ("dht", frame, err);
- GF_VALIDATE_OR_GOTO (frame->this->name, frame->local, err);
+ GF_VALIDATE_OR_GOTO("dht", frame, err);
+ GF_VALIDATE_OR_GOTO(frame->this->name, frame->local, err);
- local = frame->local;
+ local = frame->local;
- lk_array = GF_CALLOC (count, sizeof (*lk_array), gf_common_mt_char);
+ lk_array = GF_CALLOC(count, sizeof(*lk_array), gf_common_mt_pointer);
- if (lk_array == NULL)
- goto err;
+ if (lk_array == NULL)
+ goto err;
- lk_array[0] = dht_lock_new (frame->this, subvol, &local->loc, F_RDLCK,
- DHT_LAYOUT_HEAL_DOMAIN);
+ lk_array[0] = dht_lock_new(frame->this, subvol, &local->loc, F_RDLCK,
+ DHT_LAYOUT_HEAL_DOMAIN, NULL,
+ IGNORE_ENOENT_ESTALE);
- if (lk_array[0] == NULL)
- goto err;
+ if (lk_array[0] == NULL)
+ goto err;
- local->lock.locks = lk_array;
- local->lock.lk_count = count;
+ local->lock[0].layout.parent_layout.locks = lk_array;
+ local->lock[0].layout.parent_layout.lk_count = count;
- ret = dht_blocking_inodelk (frame, lk_array, count,
- dht_create_lock_cbk);
+ ret = dht_blocking_inodelk(frame, lk_array, count, dht_create_lock_cbk);
- if (ret < 0) {
- local->lock.locks = NULL;
- local->lock.lk_count = 0;
- goto err;
- }
+ if (ret < 0) {
+ local->lock[0].layout.parent_layout.locks = NULL;
+ local->lock[0].layout.parent_layout.lk_count = 0;
+ goto err;
+ }
- return 0;
+ return 0;
err:
- if (lk_array != NULL) {
- dht_lock_array_free (lk_array, count);
- GF_FREE (lk_array);
- }
+ if (lk_array != NULL) {
+ dht_lock_array_free(lk_array, count);
+ GF_FREE(lk_array);
+ }
- return -1;
+ return -1;
}
int
-dht_create (call_frame_t *frame, xlator_t *this,
- loc_t *loc, int32_t flags, mode_t mode,
- mode_t umask, fd_t *fd, dict_t *params)
+dht_set_parent_layout_in_dict(loc_t *loc, xlator_t *this, dht_local_t *local)
{
- int op_errno = -1;
- xlator_t *subvol = NULL;
- dht_local_t *local = NULL;
- int i = 0;
- dht_conf_t *conf = NULL;
- int ret = 0;
-
- VALIDATE_OR_GOTO (frame, err);
- VALIDATE_OR_GOTO (this, err);
- VALIDATE_OR_GOTO (loc, err);
+ dht_conf_t *conf = this->private;
+ dht_layout_t *parent_layout = NULL;
+ int *parent_disk_layout = NULL;
+ xlator_t *hashed_subvol = NULL;
+ char pgfid[GF_UUID_BUF_SIZE] = {0};
+ int ret = 0;
+
+ gf_uuid_unparse(loc->parent->gfid, pgfid);
+
+ parent_layout = dht_layout_get(this, loc->parent);
+ hashed_subvol = dht_subvol_get_hashed(this, loc);
+
+ ret = dht_disk_layout_extract_for_subvol(this, parent_layout, hashed_subvol,
+ &parent_disk_layout);
+ if (ret == -1) {
+ gf_msg(this->name, GF_LOG_WARNING, local->op_errno,
+ DHT_MSG_PARENT_LAYOUT_CHANGED,
+ "%s (%s/%s) (path: %s): "
+ "extracting in-memory layout of parent failed. ",
+ gf_fop_list[local->fop], pgfid, loc->name, loc->path);
+ goto err;
+ }
+
+ ret = dict_set_str_sizen(local->params, GF_PREOP_PARENT_KEY,
+ conf->xattr_name);
+ if (ret < 0) {
+ gf_msg(this->name, GF_LOG_WARNING, local->op_errno,
+ DHT_MSG_PARENT_LAYOUT_CHANGED,
+ "%s (%s/%s) (path: %s): "
+ "setting %s key in params dictionary failed. ",
+ gf_fop_list[local->fop], pgfid, loc->name, loc->path,
+ GF_PREOP_PARENT_KEY);
+ goto err;
+ }
+
+ ret = dict_set_bin(local->params, conf->xattr_name, parent_disk_layout,
+ 4 * 4);
+ if (ret < 0) {
+ gf_msg(this->name, GF_LOG_WARNING, local->op_errno,
+ DHT_MSG_PARENT_LAYOUT_CHANGED,
+ "%s (%s/%s) (path: %s): "
+ "setting parent-layout in params dictionary failed. ",
+ gf_fop_list[local->fop], pgfid, loc->name, loc->path);
+ goto err;
+ }
- conf = this->private;
+err:
+ dht_layout_unref(this, parent_layout);
+ return ret;
+}
- dht_get_du_info (frame, this, loc);
+int
+dht_create(call_frame_t *frame, xlator_t *this, loc_t *loc, int32_t flags,
+ mode_t mode, mode_t umask, fd_t *fd, dict_t *params)
+{
+ int op_errno = -1;
+ xlator_t *subvol = NULL;
+ xlator_t *hashed_subvol = NULL;
+ dht_local_t *local = NULL;
+ int i = 0;
+ dht_conf_t *conf = NULL;
+ int ret = 0;
+
+ VALIDATE_OR_GOTO(frame, err);
+ VALIDATE_OR_GOTO(this, err);
+ VALIDATE_OR_GOTO(loc, err);
+
+ conf = this->private;
+
+ dht_get_du_info(frame, this, loc);
+
+ local = dht_local_init(frame, loc, fd, GF_FOP_CREATE);
+ if (!local) {
+ op_errno = ENOMEM;
+ goto err;
+ }
+
+ local->params = dict_ref(params);
+ local->flags = flags;
+ local->mode = mode;
+ local->umask = umask;
+
+ if (dht_filter_loc_subvol_key(this, loc, &local->loc, &subvol)) {
+ gf_msg(this->name, GF_LOG_INFO, 0, DHT_MSG_SUBVOL_INFO,
+ "creating %s on %s (got create on %s)", local->loc.path,
+ subvol->name, loc->path);
+
+ /* Since lookup-optimize is enabled by default, we need
+ * to create the linkto file if required.
+ * Note this does not check for decommisioned bricks
+ * and min-free-disk limits as this is a debugging tool
+ * and not expected to be used in production.
+ */
+ hashed_subvol = dht_subvol_get_hashed(this, &local->loc);
- local = dht_local_init (frame, loc, fd, GF_FOP_CREATE);
- if (!local) {
- op_errno = ENOMEM;
- goto err;
- }
+ if (hashed_subvol && (hashed_subvol != subvol)) {
+ /* Create the linkto file and then the data file */
+ local->cached_subvol = subvol;
+ local->hashed_subvol = hashed_subvol;
- if (dht_filter_loc_subvol_key (this, loc, &local->loc,
- &subvol)) {
- gf_msg (this->name, GF_LOG_INFO, 0,
- DHT_MSG_SUBVOL_INFO,
- "creating %s on %s (got create on %s)",
- local->loc.path, subvol->name, loc->path);
- STACK_WIND (frame, dht_create_cbk,
- subvol, subvol->fops->create,
- &local->loc, flags, mode, umask, fd, params);
- goto done;
+ dht_linkfile_create(frame, dht_create_linkfile_create_cbk, this,
+ subvol, hashed_subvol, &local->loc);
+ goto done;
}
+ /* We either don't have a hashed subvol or the hashed subvol is
+ * the same as the one specified. No need to create the linkto
+ * file as we expect a lookup everywhere if there are problems
+ * with the parent layout
+ */
- subvol = dht_subvol_get_hashed (this, loc);
- if (!subvol) {
- gf_msg (this->name, GF_LOG_ERROR, 0,
- DHT_MSG_HASHED_SUBVOL_GET_FAILED,
- "no subvolume in layout for path=%s",
- loc->path);
-
- op_errno = EIO;
- goto err;
- }
-
- /* Post remove-brick, the client layout may not be in sync with
- * disk layout because of lack of lookup. Hence,a create call
- * may fall on the decommissioned brick. Hence, if the
- * hashed_subvol is part of decommissioned bricks list, do a
- * lookup on parent dir. If a fix-layout is already done by the
- * remove-brick process, the parent directory layout will be in
- * sync with that of the disk. If fix-layout is still ending
- * on the parent directory, we can let the file get created on
- * the decommissioned brick which will be eventually migrated to
- * non-decommissioned brick based on the new layout.
- */
-
- if (conf->decommission_subvols_cnt) {
- for (i = 0; i < conf->subvolume_cnt; i++) {
- if (conf->decommissioned_bricks[i] &&
- conf->decommissioned_bricks[i] == subvol) {
-
- gf_msg_debug (this->name, 0, "hashed subvol:%s is "
- "part of decommission brick list for "
- "file: %s", subvol->name, loc->path);
-
- /* dht_refresh_layout needs directory info in
- * local->loc. Hence, storing the parent_loc in
- * local->loc and storing the create context in
- * local->loc2. We will restore this information
- * in dht_creation do */
-
- ret = loc_copy (&local->loc2, &local->loc);
- if (ret) {
- gf_msg (this->name, GF_LOG_ERROR, ENOMEM,
- DHT_MSG_NO_MEMORY,
- "loc_copy failed %s", loc->path);
-
- goto err;
- }
+ dht_set_parent_layout_in_dict(loc, this, local);
+
+ STACK_WIND_COOKIE(frame, dht_create_cbk, subvol, subvol,
+ subvol->fops->create, &local->loc, flags, mode, umask,
+ fd, params);
+ goto done;
+ }
+
+ subvol = dht_subvol_get_hashed(this, loc);
+ if (!subvol) {
+ gf_msg(this->name, GF_LOG_ERROR, 0, DHT_MSG_HASHED_SUBVOL_GET_FAILED,
+ "no subvolume in layout for path=%s", loc->path);
+
+ op_errno = EIO;
+ goto err;
+ }
+
+ /* Post remove-brick, the client layout may not be in sync with
+ * disk layout because of lack of lookup. Hence,a create call
+ * may fall on the decommissioned brick. Hence, if the
+ * hashed_subvol is part of decommissioned bricks list, do a
+ * lookup on parent dir. If a fix-layout is already done by the
+ * remove-brick process, the parent directory layout will be in
+ * sync with that of the disk. If fix-layout is still ending
+ * on the parent directory, we can let the file get created on
+ * the decommissioned brick which will be eventually migrated to
+ * non-decommissioned brick based on the new layout.
+ */
+
+ if (conf->decommission_subvols_cnt) {
+ for (i = 0; i < conf->subvolume_cnt; i++) {
+ if (conf->decommissioned_bricks[i] &&
+ conf->decommissioned_bricks[i] == subvol) {
+ gf_msg_debug(this->name, 0,
+ "hashed subvol:%s is "
+ "part of decommission brick list for "
+ "file: %s",
+ subvol->name, loc->path);
+
+ /* dht_refresh_layout needs directory info in
+ * local->loc. Hence, storing the parent_loc in
+ * local->loc and storing the create context in
+ * local->loc2. We will restore this information
+ * in dht_creation do */
+
+ ret = loc_copy(&local->loc2, &local->loc);
+ if (ret) {
+ gf_msg(this->name, GF_LOG_ERROR, ENOMEM, DHT_MSG_NO_MEMORY,
+ "loc_copy failed %s", loc->path);
- local->params = dict_ref (params);
- local->flags = flags;
- local->mode = mode;
- local->umask = umask;
+ goto err;
+ }
- loc_wipe (&local->loc);
+ loc_wipe(&local->loc);
- ret = dht_build_parent_loc (this, &local->loc, loc,
- &op_errno);
+ ret = dht_build_parent_loc(this, &local->loc, loc, &op_errno);
- if (ret) {
- gf_msg (this->name, GF_LOG_ERROR, ENOMEM,
- DHT_MSG_NO_MEMORY,
- "parent loc build failed");
- goto err;
- }
+ if (ret) {
+ gf_msg(this->name, GF_LOG_ERROR, ENOMEM, DHT_MSG_LOC_FAILED,
+ "parent loc build failed");
+ goto err;
+ }
- ret = dht_create_lock (frame, subvol);
+ ret = dht_create_lock(frame, subvol);
- if (ret < 0) {
- gf_msg (this->name, GF_LOG_ERROR, 0,
- DHT_MSG_INODE_LK_ERROR,
- "locking parent failed");
- goto err;
- }
+ if (ret < 0) {
+ gf_msg(this->name, GF_LOG_ERROR, 0, DHT_MSG_INODE_LK_ERROR,
+ "locking parent failed");
+ goto err;
+ }
- goto done;
- }
+ goto done;
}
}
+ }
-
- dht_create_wind_to_avail_subvol (frame, this, subvol, loc, flags, mode,
- umask, fd, params);
+ dht_create_wind_to_avail_subvol(frame, this, subvol, loc, flags, mode,
+ umask, fd, params);
done:
- return 0;
+ return 0;
err:
- op_errno = (op_errno == -1) ? errno : op_errno;
- DHT_STACK_UNWIND (create, frame, -1, op_errno, NULL, NULL, NULL,
- NULL, NULL, NULL);
+ op_errno = (op_errno == -1) ? errno : op_errno;
+ DHT_STACK_UNWIND(create, frame, -1, op_errno, NULL, NULL, NULL, NULL, NULL,
+ NULL);
- return 0;
+ return 0;
}
-
-int
-dht_mkdir_selfheal_cbk (call_frame_t *frame, void *cookie,
- xlator_t *this,
- int32_t op_ret, int32_t op_errno, dict_t *xdata)
+static int
+dht_mkdir_selfheal_cbk(call_frame_t *frame, void *cookie, xlator_t *this,
+ int32_t op_ret, int32_t op_errno, dict_t *xdata)
{
- dht_local_t *local = NULL;
- dht_layout_t *layout = NULL;
+ dht_local_t *local = NULL;
+ dht_layout_t *layout = NULL;
- local = frame->local;
- layout = local->selfheal.layout;
+ local = frame->local;
+ layout = local->selfheal.layout;
- if (op_ret == 0) {
- dht_layout_set (this, local->inode, layout);
- if (local->loc.parent) {
- dht_inode_ctx_time_update (local->loc.parent, this,
- &local->preparent, 0);
+ FRAME_SU_UNDO(frame, dht_local_t);
+ dht_set_fixed_dir_stat(&local->preparent);
+ dht_set_fixed_dir_stat(&local->postparent);
- dht_inode_ctx_time_update (local->loc.parent, this,
- &local->postparent, 1);
- }
+ if (op_ret == 0) {
+ dht_layout_set(this, local->inode, layout);
+
+ dht_inode_ctx_time_update(local->inode, this, &local->stbuf, 1);
+ if (local->loc.parent) {
+ dht_inode_ctx_time_update(local->loc.parent, this,
+ &local->preparent, 0);
+
+ dht_inode_ctx_time_update(local->loc.parent, this,
+ &local->postparent, 1);
}
+ }
- DHT_STACK_UNWIND (mkdir, frame, op_ret, op_errno,
- local->inode, &local->stbuf, &local->preparent,
- &local->postparent, NULL);
+ DHT_STACK_UNWIND(mkdir, frame, op_ret, op_errno, local->inode,
+ &local->stbuf, &local->preparent, &local->postparent,
+ NULL);
- return 0;
+ return 0;
}
-int
-dht_mkdir_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
- int op_ret, int op_errno, inode_t *inode, struct iatt *stbuf,
- struct iatt *preparent, struct iatt *postparent, dict_t *xdata)
+static int
+dht_mkdir_cbk(call_frame_t *frame, void *cookie, xlator_t *this, int op_ret,
+ int op_errno, inode_t *inode, struct iatt *stbuf,
+ struct iatt *preparent, struct iatt *postparent, dict_t *xdata)
{
- dht_local_t *local = NULL;
- int this_call_cnt = 0;
- int ret = -1;
- gf_boolean_t subvol_filled = _gf_false;
- gf_boolean_t dir_exists = _gf_false;
- call_frame_t *prev = NULL;
- dht_layout_t *layout = NULL;
-
- local = frame->local;
- prev = cookie;
- layout = local->layout;
-
- subvol_filled = dht_is_subvol_filled (this, prev->this);
+ dht_local_t *local = NULL;
+ int this_call_cnt = 0;
+ int ret = -1;
+ gf_boolean_t subvol_filled = _gf_false;
+ gf_boolean_t dir_exists = _gf_false;
+ xlator_t *prev = NULL;
+ dht_layout_t *layout = NULL;
+
+ local = frame->local;
+ prev = cookie;
+ layout = local->layout;
+
+ subvol_filled = dht_is_subvol_filled(this, prev);
+
+ LOCK(&frame->lock);
+ {
+ if (subvol_filled && (op_ret != -1)) {
+ ret = dht_layout_merge(this, layout, prev, -1, ENOSPC, NULL);
+ } else {
+ if (op_ret == -1 && op_errno == EEXIST) {
+ /* Very likely just a race between mkdir and
+ self-heal (from lookup of a concurrent mkdir
+ attempt).
+ Ignore error for now. layout setting will
+ anyways fail if this was a different (old)
+ pre-existing different directory.
+ */
+ op_ret = 0;
+ dir_exists = _gf_true;
+ }
+ ret = dht_layout_merge(this, layout, prev, op_ret, op_errno, NULL);
+ }
+ if (ret)
+ gf_msg(this->name, GF_LOG_WARNING, 0, DHT_MSG_LAYOUT_MERGE_FAILED,
+ "%s: failed to merge layouts for subvol %s", local->loc.path,
+ prev->name);
- LOCK (&frame->lock);
- {
- if (subvol_filled && (op_ret != -1)) {
- ret = dht_layout_merge (this, layout, prev->this,
- -1, ENOSPC, NULL);
- } else {
- if (op_ret == -1 && op_errno == EEXIST) {
- /* Very likely just a race between mkdir and
- self-heal (from lookup of a concurrent mkdir
- attempt).
- Ignore error for now. layout setting will
- anyways fail if this was a different (old)
- pre-existing different directory.
- */
- op_ret = 0;
- dir_exists = _gf_true;
- }
- ret = dht_layout_merge (this, layout, prev->this,
- op_ret, op_errno, NULL);
- }
- if (ret)
- gf_msg (this->name, GF_LOG_WARNING, 0,
- DHT_MSG_LAYOUT_MERGE_FAILED,
- "%s: failed to merge layouts for subvol %s",
- local->loc.path, prev->this->name);
-
- if (op_ret == -1) {
- local->op_errno = op_errno;
- goto unlock;
- }
+ if (op_ret == -1) {
+ local->op_errno = op_errno;
+ goto unlock;
+ }
- if (dir_exists)
- goto unlock;
+ if (dir_exists)
+ goto unlock;
- dht_iatt_merge (this, &local->stbuf, stbuf, prev->this);
- dht_iatt_merge (this, &local->preparent, preparent, prev->this);
- dht_iatt_merge (this, &local->postparent, postparent,
- prev->this);
- }
+ dht_iatt_merge(this, &local->stbuf, stbuf);
+ dht_iatt_merge(this, &local->preparent, preparent);
+ dht_iatt_merge(this, &local->postparent, postparent);
+ }
unlock:
- UNLOCK (&frame->lock);
+ UNLOCK(&frame->lock);
- this_call_cnt = dht_frame_return (frame);
- if (is_last_call (this_call_cnt)) {
- dht_selfheal_new_directory (frame, dht_mkdir_selfheal_cbk,
- layout);
- }
+ this_call_cnt = dht_frame_return(frame);
+ if (is_last_call(this_call_cnt)) {
+ /*Unlock entrylk and inodelk once mkdir is done on all subvols*/
+ dht_unlock_namespace(frame, &local->lock[0]);
+ FRAME_SU_DO(frame, dht_local_t);
+ dht_selfheal_new_directory(frame, dht_mkdir_selfheal_cbk, layout);
+ }
- return 0;
+ return 0;
}
-int
-dht_mkdir_hashed_cbk (call_frame_t *frame, void *cookie,
- xlator_t *this, int op_ret, int op_errno,
- inode_t *inode, struct iatt *stbuf,
- struct iatt *preparent, struct iatt *postparent,
- dict_t *xdata)
-{
- dht_local_t *local = NULL;
- int ret = -1;
- call_frame_t *prev = NULL;
- dht_layout_t *layout = NULL;
- dht_conf_t *conf = NULL;
- int i = 0;
- xlator_t *hashed_subvol = NULL;
-
- VALIDATE_OR_GOTO (this->private, err);
-
- local = frame->local;
- prev = cookie;
- layout = local->layout;
- conf = this->private;
- hashed_subvol = local->hashed_subvol;
+static int
+dht_mkdir_hashed_cbk(call_frame_t *frame, void *cookie, xlator_t *this,
+ int op_ret, int op_errno, inode_t *inode,
+ struct iatt *stbuf, struct iatt *preparent,
+ struct iatt *postparent, dict_t *xdata);
- if (gf_uuid_is_null (local->loc.gfid) && !op_ret)
- gf_uuid_copy (local->loc.gfid, stbuf->ia_gfid);
+static int
+dht_mkdir_helper(call_frame_t *frame, xlator_t *this, loc_t *loc, mode_t mode,
+ mode_t umask, dict_t *params)
+{
+ dht_local_t *local = NULL;
+ dht_conf_t *conf = NULL;
+ int op_errno = -1, ret = -1;
+ xlator_t *hashed_subvol = NULL;
+ int32_t *parent_disk_layout = NULL;
+ dht_layout_t *parent_layout = NULL;
+ char pgfid[GF_UUID_BUF_SIZE] = {0};
+
+ VALIDATE_OR_GOTO(frame, err);
+ VALIDATE_OR_GOTO(this, err);
+ VALIDATE_OR_GOTO(loc, err);
+ VALIDATE_OR_GOTO(loc->inode, err);
+ VALIDATE_OR_GOTO(loc->path, err);
+ VALIDATE_OR_GOTO(this->private, err);
+
+ gf_uuid_unparse(loc->parent->gfid, pgfid);
+
+ conf = this->private;
+ local = frame->local;
+
+ if (local->op_ret == -1) {
+ gf_msg(this->name, GF_LOG_WARNING, local->op_errno,
+ DHT_MSG_PARENT_LAYOUT_CHANGED,
+ "mkdir (%s/%s) (path: %s): refreshing parent layout "
+ "failed.",
+ pgfid, loc->name, loc->path);
- if (dht_is_subvol_filled (this, hashed_subvol))
- ret = dht_layout_merge (this, layout, prev->this,
- -1, ENOSPC, NULL);
- else
- ret = dht_layout_merge (this, layout, prev->this,
- op_ret, op_errno, NULL);
+ op_errno = local->op_errno;
+ goto err;
+ }
+
+ local->op_ret = -1;
+
+ hashed_subvol = dht_subvol_get_hashed(this, loc);
+ if (hashed_subvol == NULL) {
+ gf_msg_debug(this->name, 0,
+ "mkdir (%s/%s) (path: %s): hashed subvol not "
+ "found",
+ pgfid, loc->name, loc->path);
+ op_errno = ENOENT;
+ goto err;
+ }
+
+ local->hashed_subvol = hashed_subvol;
+
+ parent_layout = dht_layout_get(this, loc->parent);
+
+ ret = dht_disk_layout_extract_for_subvol(this, parent_layout, hashed_subvol,
+ &parent_disk_layout);
+ if (ret == -1) {
+ gf_msg(this->name, GF_LOG_WARNING, EIO, DHT_MSG_PARENT_LAYOUT_CHANGED,
+ "mkdir (%s/%s) (path: %s): "
+ "extracting in-memory layout of parent failed. ",
+ pgfid, loc->name, loc->path);
+ goto err;
+ }
+
+ if (memcmp(local->parent_disk_layout, parent_disk_layout,
+ sizeof(local->parent_disk_layout)) == 0) {
+ gf_msg(this->name, GF_LOG_WARNING, EIO, DHT_MSG_PARENT_LAYOUT_CHANGED,
+ "mkdir (%s/%s) (path: %s): loop detected. "
+ "parent layout didn't change even though "
+ "previous attempt of mkdir failed because of "
+ "in-memory layout not matching with that on disk.",
+ pgfid, loc->name, loc->path);
+ op_errno = EIO;
+ goto err;
+ }
+
+ memcpy((void *)local->parent_disk_layout, (void *)parent_disk_layout,
+ sizeof(local->parent_disk_layout));
+
+ dht_layout_unref(this, parent_layout);
+ parent_layout = NULL;
+
+ ret = dict_set_str(params, GF_PREOP_PARENT_KEY, conf->xattr_name);
+ if (ret < 0) {
+ local->op_errno = -ret;
+ gf_msg(this->name, GF_LOG_WARNING, local->op_errno,
+ DHT_MSG_PARENT_LAYOUT_CHANGED,
+ "mkdir (%s/%s) (path: %s): "
+ "setting %s key in params dictionary failed. ",
+ pgfid, loc->name, loc->path, GF_PREOP_PARENT_KEY);
+ goto err;
+ }
+
+ ret = dict_set_bin(params, conf->xattr_name, parent_disk_layout, 4 * 4);
+ if (ret < 0) {
+ local->op_errno = -ret;
+ gf_msg(this->name, GF_LOG_WARNING, local->op_errno,
+ DHT_MSG_PARENT_LAYOUT_CHANGED,
+ "setting parent-layout in params dictionary failed. "
+ "mkdir (%s/%s) (path: %s)",
+ pgfid, loc->name, loc->path);
+ goto err;
+ }
+
+ parent_disk_layout = NULL;
+
+ STACK_WIND_COOKIE(frame, dht_mkdir_hashed_cbk, hashed_subvol, hashed_subvol,
+ hashed_subvol->fops->mkdir, loc, mode, umask, params);
+
+ return 0;
- /* TODO: we may have to return from the function
- if layout merge fails. For now, lets just log an error */
- if (ret)
- gf_msg (this->name, GF_LOG_WARNING, 0,
- DHT_MSG_LAYOUT_MERGE_FAILED,
- "%s: failed to merge layouts for subvol %s",
- local->loc.path, prev->this->name);
+err:
+ dht_unlock_namespace(frame, &local->lock[0]);
- if (op_ret == -1) {
- local->op_errno = op_errno;
- goto err;
- }
- local->op_ret = 0;
+ op_errno = local ? local->op_errno : op_errno;
+ DHT_STACK_UNWIND(mkdir, frame, -1, op_errno, NULL, NULL, NULL, NULL, NULL);
- dht_iatt_merge (this, &local->stbuf, stbuf, prev->this);
- dht_iatt_merge (this, &local->preparent, preparent, prev->this);
- dht_iatt_merge (this, &local->postparent, postparent, prev->this);
+ if (parent_disk_layout != NULL)
+ GF_FREE(parent_disk_layout);
- local->call_cnt = conf->subvolume_cnt - 1;
+ if (parent_layout != NULL)
+ dht_layout_unref(this, parent_layout);
- if (gf_uuid_is_null (local->loc.gfid))
- gf_uuid_copy (local->loc.gfid, stbuf->ia_gfid);
- if (local->call_cnt == 0) {
- dht_selfheal_directory (frame, dht_mkdir_selfheal_cbk,
- &local->loc, layout);
- }
- for (i = 0; i < conf->subvolume_cnt; i++) {
- if (conf->subvolumes[i] == hashed_subvol)
- continue;
- STACK_WIND (frame, dht_mkdir_cbk,
- conf->subvolumes[i],
- conf->subvolumes[i]->fops->mkdir, &local->loc,
- local->mode, local->umask, local->params);
- }
- return 0;
-err:
- DHT_STACK_UNWIND (mkdir, frame, -1, op_errno, NULL, NULL, NULL,
- NULL, NULL);
- return 0;
+ return 0;
}
-
-int
-dht_mkdir (call_frame_t *frame, xlator_t *this,
- loc_t *loc, mode_t mode, mode_t umask, dict_t *params)
+static int
+dht_mkdir_hashed_cbk(call_frame_t *frame, void *cookie, xlator_t *this,
+ int op_ret, int op_errno, inode_t *inode,
+ struct iatt *stbuf, struct iatt *preparent,
+ struct iatt *postparent, dict_t *xdata)
{
- dht_local_t *local = NULL;
- dht_conf_t *conf = NULL;
- int op_errno = -1;
- xlator_t *hashed_subvol = NULL;
-
-
- VALIDATE_OR_GOTO (frame, err);
- VALIDATE_OR_GOTO (this, err);
- VALIDATE_OR_GOTO (loc, err);
- VALIDATE_OR_GOTO (loc->inode, err);
- VALIDATE_OR_GOTO (loc->path, err);
- VALIDATE_OR_GOTO (this->private, err);
-
- conf = this->private;
-
- dht_get_du_info (frame, this, loc);
-
- local = dht_local_init (frame, loc, NULL, GF_FOP_MKDIR);
- if (!local) {
- op_errno = ENOMEM;
- goto err;
- }
+ dht_local_t *local = NULL;
+ int ret = -1;
+ xlator_t *prev = NULL;
+ dht_layout_t *layout = NULL;
+ dht_conf_t *conf = NULL;
+ int i = 0;
+ xlator_t *hashed_subvol = NULL;
+ char pgfid[GF_UUID_BUF_SIZE] = {0};
+ gf_boolean_t parent_layout_changed = _gf_false;
+ call_stub_t *stub = NULL;
+
+ local = frame->local;
+ prev = cookie;
+ layout = local->layout;
+ conf = this->private;
+ hashed_subvol = local->hashed_subvol;
+
+ gf_uuid_unparse(local->loc.parent->gfid, pgfid);
+
+ if (gf_uuid_is_null(local->loc.gfid) && !op_ret)
+ gf_uuid_copy(local->loc.gfid, stbuf->ia_gfid);
+
+ if (op_ret == -1) {
+ local->op_errno = op_errno;
- hashed_subvol = dht_subvol_get_hashed (this, loc);
- if (hashed_subvol == NULL) {
- gf_msg_debug (this->name, 0,
- "hashed subvol not found for %s",
- loc->path);
- op_errno = EIO;
+ parent_layout_changed = (xdata &&
+ dict_get(xdata, GF_PREOP_CHECK_FAILED))
+ ? 1
+ : 0;
+ if (parent_layout_changed) {
+ gf_msg(this->name, GF_LOG_INFO, 0, DHT_MSG_PARENT_LAYOUT_CHANGED,
+ "mkdir (%s/%s) (path: %s): parent layout "
+ "changed. Attempting a refresh and then a "
+ "retry",
+ pgfid, local->loc.name, local->loc.path);
+
+ stub = fop_mkdir_stub(frame, dht_mkdir_helper, &local->loc,
+ local->mode, local->umask, local->params);
+ if (stub == NULL) {
goto err;
- }
-
- local->hashed_subvol = hashed_subvol;
- local->mode = mode;
- local->umask = umask;
- local->params = dict_ref (params);
- local->inode = inode_ref (loc->inode);
+ }
- local->layout = dht_layout_new (this, conf->subvolume_cnt);
- if (!local->layout) {
- op_errno = ENOMEM;
+ ret = dht_handle_parent_layout_change(this, stub);
+ if (ret) {
goto err;
- }
+ }
- /* set the newly created directory hash to the commit hash
- * if the configuration option is set. If configuration option
- * is not set, the older clients may still be connecting to the
- * volume and hence we need to preserve the 1 in disk[0] part of the
- * layout xattr */
- if (conf->lookup_optimize)
- local->layout->commit_hash = conf->vol_commit_hash;
- else
- local->layout->commit_hash = DHT_LAYOUT_HASH_INVALID;
+ stub = NULL;
+
+ return 0;
+ }
+
+ goto err;
+ }
+
+ dict_del(local->params, GF_PREOP_PARENT_KEY);
+ dict_del(local->params, conf->xattr_name);
+
+ if (dht_is_subvol_filled(this, hashed_subvol))
+ ret = dht_layout_merge(this, layout, prev, -1, ENOSPC, NULL);
+ else
+ ret = dht_layout_merge(this, layout, prev, op_ret, op_errno, NULL);
+
+ /* TODO: we may have to return from the function
+ if layout merge fails. For now, lets just log an error */
+ if (ret)
+ gf_msg(this->name, GF_LOG_WARNING, 0, DHT_MSG_LAYOUT_MERGE_FAILED,
+ "%s: failed to merge layouts for subvol %s", local->loc.path,
+ prev->name);
+
+ local->op_ret = 0;
+
+ dht_iatt_merge(this, &local->stbuf, stbuf);
+ dht_iatt_merge(this, &local->preparent, preparent);
+ dht_iatt_merge(this, &local->postparent, postparent);
+
+ local->call_cnt = conf->subvolume_cnt - 1;
+ /* Delete internal mds xattr from params dict to avoid store
+ internal mds xattr on other subvols
+ */
+ dict_del(local->params, conf->mds_xattr_key);
+
+ if (gf_uuid_is_null(local->loc.gfid))
+ gf_uuid_copy(local->loc.gfid, stbuf->ia_gfid);
+
+ /* Set hashed subvol as a mds subvol on inode ctx */
+ /*if (!local->inode)
+ local->inode = inode_ref (inode);
+ */
+ ret = dht_inode_ctx_mdsvol_set(local->inode, this, hashed_subvol);
+ if (ret) {
+ gf_msg(this->name, GF_LOG_ERROR, 0, DHT_MSG_SET_INODE_CTX_FAILED,
+ "Failed to set hashed subvol for %s on inode vol is %s",
+ local->loc.path, hashed_subvol->name);
+ }
+
+ if (local->call_cnt == 0) {
+ /*Unlock namespace lock once mkdir is done on all subvols*/
+ dht_unlock_namespace(frame, &local->lock[0]);
+ FRAME_SU_DO(frame, dht_local_t);
+ dht_selfheal_directory(frame, dht_mkdir_selfheal_cbk, &local->loc,
+ layout);
+ return 0;
+ }
+
+ for (i = 0; i < conf->subvolume_cnt; i++) {
+ if (conf->subvolumes[i] == hashed_subvol)
+ continue;
+ STACK_WIND_COOKIE(frame, dht_mkdir_cbk, conf->subvolumes[i],
+ conf->subvolumes[i], conf->subvolumes[i]->fops->mkdir,
+ &local->loc, local->mode, local->umask,
+ local->params);
+ }
+
+ return 0;
+err:
+ if (local->op_ret != 0) {
+ dht_unlock_namespace(frame, &local->lock[0]);
+ }
- STACK_WIND (frame, dht_mkdir_hashed_cbk,
- hashed_subvol,
- hashed_subvol->fops->mkdir,
- loc, mode, umask, params);
+ DHT_STACK_UNWIND(mkdir, frame, -1, op_errno, NULL, NULL, NULL, NULL, NULL);
- return 0;
+ return 0;
+}
+static int
+dht_mkdir_guard_parent_layout_cbk(call_frame_t *frame, xlator_t *this,
+ loc_t *loc, mode_t mode, mode_t umask,
+ dict_t *params)
+{
+ dht_local_t *local = NULL;
+ dht_conf_t *conf = 0;
+ char pgfid[GF_UUID_BUF_SIZE] = {0};
+ int ret = -1;
+ int32_t zero[1] = {0};
+
+ local = frame->local;
+ conf = this->private;
+
+ gf_uuid_unparse(loc->parent->gfid, pgfid);
+
+ if (local->op_ret < 0) {
+ gf_msg(this->name, GF_LOG_WARNING, local->op_errno,
+ DHT_MSG_PARENT_LAYOUT_CHANGED,
+ "mkdir (%s/%s) (path: %s): "
+ "Acquiring lock on parent to guard against "
+ "layout-change failed.",
+ pgfid, loc->name, loc->path);
+ goto err;
+ }
+
+ local->op_ret = -1;
+ /* Add internal MDS xattr on disk for hashed subvol
+ */
+ ret = dht_dict_set_array(params, conf->mds_xattr_key, zero, 1);
+ if (ret) {
+ gf_msg(this->name, GF_LOG_WARNING, ENOMEM, DHT_MSG_DICT_SET_FAILED,
+ "Failed to set dictionary value:key = %s for "
+ "path %s",
+ conf->mds_xattr_key, loc->path);
+ }
+
+ STACK_WIND_COOKIE(frame, dht_mkdir_hashed_cbk, local->hashed_subvol,
+ local->hashed_subvol, local->hashed_subvol->fops->mkdir,
+ loc, mode, umask, params);
+
+ return 0;
err:
- op_errno = (op_errno == -1) ? errno : op_errno;
- DHT_STACK_UNWIND (mkdir, frame, -1, op_errno, NULL, NULL, NULL,
- NULL, NULL);
+ DHT_STACK_UNWIND(mkdir, frame, -1, local->op_errno, NULL, NULL, NULL, NULL,
+ NULL);
- return 0;
+ return 0;
}
-
int
-dht_rmdir_selfheal_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
- int op_ret, int op_errno, dict_t *xdata)
+dht_mkdir(call_frame_t *frame, xlator_t *this, loc_t *loc, mode_t mode,
+ mode_t umask, dict_t *params)
{
- dht_local_t *local = NULL;
-
- local = frame->local;
+ dht_local_t *local = NULL;
+ dht_conf_t *conf = NULL;
+ int op_errno = EINVAL, ret = -1;
+ xlator_t *hashed_subvol = NULL;
+ char pgfid[GF_UUID_BUF_SIZE] = {0};
+ call_stub_t *stub = NULL;
+
+ VALIDATE_OR_GOTO(frame, err);
+ VALIDATE_OR_GOTO(this, err);
+ VALIDATE_OR_GOTO(loc, err);
+ VALIDATE_OR_GOTO(loc->inode, err);
+ VALIDATE_OR_GOTO(loc->path, err);
+ VALIDATE_OR_GOTO(this->private, err);
+
+ gf_uuid_unparse(loc->parent->gfid, pgfid);
+
+ conf = this->private;
+
+ if (!params || !dict_get(params, "gfid-req")) {
+ op_errno = EPERM;
+ gf_msg_callingfn(this->name, GF_LOG_WARNING, op_errno,
+ DHT_MSG_GFID_NULL,
+ "mkdir: %s is received "
+ "without gfid-req %p",
+ loc->path, params);
+ goto err;
+ }
+
+ dht_get_du_info(frame, this, loc);
+
+ local = dht_local_init(frame, loc, NULL, GF_FOP_MKDIR);
+ if (!local) {
+ op_errno = ENOMEM;
+ goto err;
+ }
+
+ hashed_subvol = dht_subvol_get_hashed(this, loc);
+ if (hashed_subvol == NULL) {
+ gf_msg_debug(this->name, 0, "hashed subvol not found for %s",
+ loc->path);
+ local->op_errno = EIO;
+ goto err;
+ }
+
+ local->hashed_subvol = hashed_subvol;
+ local->mode = mode;
+ local->umask = umask;
+ if (params)
+ local->params = dict_ref(params);
+
+ local->inode = inode_ref(loc->inode);
+
+ local->layout = dht_layout_new(this, conf->subvolume_cnt);
+ if (!local->layout) {
+ op_errno = ENOMEM;
+ goto err;
+ }
+
+ /* set the newly created directory hash to the commit hash
+ * if the configuration option is set. If configuration option
+ * is not set, the older clients may still be connecting to the
+ * volume and hence we need to preserve the 1 in disk[0] part of the
+ * layout xattr */
+ if (conf->lookup_optimize)
+ local->layout->commit_hash = conf->vol_commit_hash;
+ else
+ local->layout->commit_hash = DHT_LAYOUT_HASH_INVALID;
+
+ stub = fop_mkdir_stub(frame, dht_mkdir_guard_parent_layout_cbk, loc, mode,
+ umask, params);
+ if (stub == NULL) {
+ gf_msg(this->name, GF_LOG_WARNING, ENOMEM,
+ DHT_MSG_PARENT_LAYOUT_CHANGED,
+ "mkdir (%s/%s) (path: %s): "
+ "creating stub failed.",
+ pgfid, loc->name, loc->path);
+ local->op_errno = ENOMEM;
+ goto err;
+ }
+
+ ret = dht_guard_parent_layout_and_namespace(this, stub);
+ if (ret < 0) {
+ gf_msg(this->name, GF_LOG_WARNING, 0, DHT_MSG_PARENT_LAYOUT_CHANGED,
+ "mkdir (%s/%s) (path: %s) cannot wind lock request to "
+ "guard parent layout",
+ pgfid, loc->name, loc->path);
+ goto err;
+ }
+
+ return 0;
- DHT_STACK_UNWIND (rmdir, frame, local->op_ret, local->op_errno,
- &local->preparent, &local->postparent, NULL);
+err:
+ op_errno = local ? local->op_errno : op_errno;
+ DHT_STACK_UNWIND(mkdir, frame, -1, op_errno, NULL, NULL, NULL, NULL, NULL);
- return 0;
+ return 0;
}
-
-int
-dht_rmdir_hashed_subvol_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
- int op_ret, int op_errno, struct iatt *preparent,
- struct iatt *postparent, dict_t *xdata)
+static int
+dht_rmdir_selfheal_cbk(call_frame_t *heal_frame, void *cookie, xlator_t *this,
+ int op_ret, int op_errno, dict_t *xdata)
{
- dht_local_t *local = NULL;
- dht_conf_t *conf = NULL;
- int this_call_cnt = 0;
- call_frame_t *prev = NULL;
- char gfid[GF_UUID_BUF_SIZE] ={0};
+ dht_local_t *local = NULL;
+ dht_local_t *heal_local = NULL;
+ call_frame_t *main_frame = NULL;
- local = frame->local;
- prev = cookie;
- conf = this->private;
+ heal_local = heal_frame->local;
+ main_frame = heal_local->main_frame;
+ local = main_frame->local;
- gf_uuid_unparse(local->loc.gfid, gfid);
+ DHT_STACK_DESTROY(heal_frame);
+ dht_set_fixed_dir_stat(&local->preparent);
+ dht_set_fixed_dir_stat(&local->postparent);
- LOCK (&frame->lock);
- {
- if (op_ret == -1) {
- local->op_errno = op_errno;
- local->op_ret = -1;
- if (conf->subvolume_cnt != 1) {
- if (op_errno != ENOENT && op_errno != EACCES) {
- local->need_selfheal = 1;
- }
- }
+ DHT_STACK_UNWIND(rmdir, main_frame, local->op_ret, local->op_errno,
+ &local->preparent, &local->postparent, NULL);
- gf_msg_debug (this->name, op_errno,
- "rmdir on %s for %s failed "
- "(gfid = %s)",
- prev->this->name, local->loc.path,
- gfid);
- goto unlock;
- }
+ return 0;
+}
- dht_iatt_merge (this, &local->preparent, preparent, prev->this);
- dht_iatt_merge (this, &local->postparent, postparent,
- prev->this);
+static int
+dht_rmdir_hashed_subvol_cbk(call_frame_t *frame, void *cookie, xlator_t *this,
+ int op_ret, int op_errno, struct iatt *preparent,
+ struct iatt *postparent, dict_t *xdata)
+{
+ dht_local_t *local = NULL;
+ dht_local_t *heal_local = NULL;
+ call_frame_t *heal_frame = NULL;
+ dht_conf_t *conf = NULL;
+ int this_call_cnt = 0;
+ xlator_t *prev = NULL;
+ char gfid[GF_UUID_BUF_SIZE] = {0};
+
+ local = frame->local;
+ prev = cookie;
+ conf = this->private;
+
+ gf_uuid_unparse(local->loc.gfid, gfid);
+
+ LOCK(&frame->lock);
+ {
+ if (op_ret == -1) {
+ local->op_errno = op_errno;
+ local->op_ret = -1;
+ if (conf->subvolume_cnt != 1) {
+ if (op_errno != ENOENT && op_errno != EACCES &&
+ op_errno != ESTALE) {
+ local->need_selfheal = 1;
+ }
+ }
+ gf_msg_debug(this->name, op_errno,
+ "rmdir on %s for %s failed "
+ "(gfid = %s)",
+ prev->name, local->loc.path, gfid);
+ goto unlock;
}
+
+ dht_iatt_merge(this, &local->preparent, preparent);
+ dht_iatt_merge(this, &local->postparent, postparent);
+ }
unlock:
- UNLOCK (&frame->lock);
-
- this_call_cnt = dht_frame_return (frame);
- if (is_last_call (this_call_cnt)) {
- if (local->need_selfheal) {
- local->layout =
- dht_layout_get (this, local->loc.inode);
-
- /* TODO: neater interface needed below */
- local->stbuf.ia_type = local->loc.inode->ia_type;
-
- gf_uuid_copy (local->gfid, local->loc.inode->gfid);
- dht_selfheal_restore (frame, dht_rmdir_selfheal_cbk,
- &local->loc, local->layout);
- } else {
-
- if (local->loc.parent) {
- dht_inode_ctx_time_update (local->loc.parent,
- this,
- &local->preparent,
- 0);
-
- dht_inode_ctx_time_update (local->loc.parent,
- this,
- &local->postparent,
- 1);
- }
+ UNLOCK(&frame->lock);
- DHT_STACK_UNWIND (rmdir, frame, local->op_ret,
- local->op_errno, &local->preparent,
- &local->postparent, NULL);
- }
- }
+ this_call_cnt = dht_frame_return(frame);
+ if (is_last_call(this_call_cnt)) {
+ if (local->need_selfheal) {
+ dht_rmdir_unlock(frame, this);
+ local->layout = dht_layout_get(this, local->loc.inode);
- return 0;
-}
+ /* TODO: neater interface needed below */
+ local->stbuf.ia_type = local->loc.inode->ia_type;
+ gf_uuid_copy(local->gfid, local->loc.inode->gfid);
-int
-dht_rmdir_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
- int op_ret, int op_errno, struct iatt *preparent,
- struct iatt *postparent, dict_t *xdata)
-{
- dht_local_t *local = NULL;
- int this_call_cnt = 0;
- call_frame_t *prev = NULL;
- int done = 0;
- char gfid[GF_UUID_BUF_SIZE] ={0};
+ /* Use a different frame or else the rmdir op_ret is
+ * overwritten by that of the selfheal */
- local = frame->local;
- prev = cookie;
+ heal_frame = copy_frame(frame);
+ if (heal_frame == NULL) {
+ goto err;
+ }
- LOCK (&frame->lock);
- {
- if (op_ret == -1) {
- if ((op_errno != ENOENT) && (op_errno != ESTALE)) {
- local->op_errno = op_errno;
- local->op_ret = -1;
+ heal_local = dht_local_init(heal_frame, &local->loc, NULL, 0);
+ if (!heal_local) {
+ DHT_STACK_DESTROY(heal_frame);
+ goto err;
+ }
- if (op_errno != EACCES)
- local->need_selfheal = 1;
- }
+ heal_local->inode = inode_ref(local->loc.inode);
+ heal_local->main_frame = frame;
+ gf_uuid_copy(heal_local->gfid, local->loc.inode->gfid);
- gf_uuid_unparse(local->loc.gfid, gfid);
+ dht_selfheal_restore(heal_frame, dht_rmdir_selfheal_cbk,
+ &heal_local->loc, heal_local->layout);
+ return 0;
+ } else {
+ if (local->loc.parent) {
+ dht_inode_ctx_time_update(local->loc.parent, this,
+ &local->preparent, 0);
- gf_msg_debug (this->name, op_errno,
- "rmdir on %s for %s failed."
- "(gfid = %s)",
- prev->this->name, local->loc.path,
- gfid);
- goto unlock;
- }
+ dht_inode_ctx_time_update(local->loc.parent, this,
+ &local->postparent, 1);
+ }
+
+ dht_set_fixed_dir_stat(&local->preparent);
+ dht_set_fixed_dir_stat(&local->postparent);
- /* Track if rmdir succeeded on atleast one subvol*/
- local->fop_succeeded = 1;
- dht_iatt_merge (this, &local->preparent, preparent, prev->this);
- dht_iatt_merge (this, &local->postparent, postparent,
- prev->this);
+ dht_rmdir_unlock(frame, this);
+ DHT_STACK_UNWIND(rmdir, frame, local->op_ret, local->op_errno,
+ &local->preparent, &local->postparent, NULL);
}
-unlock:
- UNLOCK (&frame->lock);
+ }
+ return 0;
- this_call_cnt = dht_frame_return (frame);
+err:
+ DHT_STACK_UNWIND(rmdir, frame, local->op_ret, local->op_errno, NULL, NULL,
+ NULL);
+ return 0;
+}
- /* if local->hashed_subvol, we are yet to wind to hashed_subvol. */
- if (local->hashed_subvol && (this_call_cnt == 1)) {
- done = 1;
- } else if (!local->hashed_subvol && !this_call_cnt) {
- done = 1;
- }
+static int
+dht_rmdir_unlock_cbk(call_frame_t *frame, void *cookie, xlator_t *this,
+ int32_t op_ret, int32_t op_errno, dict_t *xdata)
+{
+ DHT_STACK_DESTROY(frame);
+ return 0;
+}
+static int
+dht_rmdir_unlock(call_frame_t *frame, xlator_t *this)
+{
+ dht_local_t *local = NULL, *lock_local = NULL;
+ call_frame_t *lock_frame = NULL;
+ int lock_count = 0;
- if (done) {
- if (local->need_selfheal && local->fop_succeeded) {
- local->layout =
- dht_layout_get (this, local->loc.inode);
+ local = frame->local;
- /* TODO: neater interface needed below */
- local->stbuf.ia_type = local->loc.inode->ia_type;
+ /* Unlock entrylk */
+ dht_unlock_entrylk_wrapper(frame, &local->lock[0].ns.directory_ns);
- gf_uuid_copy (local->gfid, local->loc.inode->gfid);
- dht_selfheal_restore (frame, dht_rmdir_selfheal_cbk,
- &local->loc, local->layout);
- } else if (this_call_cnt) {
- /* If non-hashed subvol's have responded, proceed */
+ /* Unlock inodelk */
+ lock_count = dht_lock_count(local->lock[0].ns.parent_layout.locks,
+ local->lock[0].ns.parent_layout.lk_count);
- local->need_selfheal = 0;
- STACK_WIND (frame, dht_rmdir_hashed_subvol_cbk,
- local->hashed_subvol,
- local->hashed_subvol->fops->rmdir,
- &local->loc, local->flags, NULL);
- } else if (!this_call_cnt) {
- /* All subvol's have responded, proceed */
+ if (lock_count == 0)
+ goto done;
- if (local->loc.parent) {
+ lock_frame = copy_frame(frame);
+ if (lock_frame == NULL)
+ goto done;
- dht_inode_ctx_time_update (local->loc.parent,
- this,
- &local->preparent,
- 0);
+ lock_local = dht_local_init(lock_frame, &local->loc, NULL,
+ lock_frame->root->op);
+ if (lock_local == NULL)
+ goto done;
- dht_inode_ctx_time_update (local->loc.parent,
- this,
- &local->postparent,
- 1);
+ lock_local->lock[0].ns.parent_layout.locks = local->lock[0]
+ .ns.parent_layout.locks;
+ lock_local->lock[0]
+ .ns.parent_layout.lk_count = local->lock[0].ns.parent_layout.lk_count;
- }
+ local->lock[0].ns.parent_layout.locks = NULL;
+ local->lock[0].ns.parent_layout.lk_count = 0;
+ dht_unlock_inodelk(lock_frame, lock_local->lock[0].ns.parent_layout.locks,
+ lock_local->lock[0].ns.parent_layout.lk_count,
+ dht_rmdir_unlock_cbk);
+ lock_frame = NULL;
- DHT_STACK_UNWIND (rmdir, frame, local->op_ret,
- local->op_errno, &local->preparent,
- &local->postparent, NULL);
- }
- }
+done:
+ if (lock_frame != NULL) {
+ DHT_STACK_DESTROY(lock_frame);
+ }
- return 0;
+ return 0;
}
-
-int
-dht_rmdir_do (call_frame_t *frame, xlator_t *this)
+static int
+dht_rmdir_cbk(call_frame_t *frame, void *cookie, xlator_t *this, int op_ret,
+ int op_errno, struct iatt *preparent, struct iatt *postparent,
+ dict_t *xdata)
{
- dht_local_t *local = NULL;
- dht_conf_t *conf = NULL;
- int i = 0;
- xlator_t *hashed_subvol = NULL;
- char gfid[GF_UUID_BUF_SIZE] ={0};
+ dht_local_t *local = NULL;
+ int this_call_cnt = 0;
+ xlator_t *prev = NULL;
+ int done = 0;
+ char gfid[GF_UUID_BUF_SIZE] = {0};
+ dht_local_t *heal_local = NULL;
+ call_frame_t *heal_frame = NULL;
+ int ret = -1;
+
+ local = frame->local;
+ prev = cookie;
+
+ LOCK(&frame->lock);
+ {
+ if (op_ret == -1) {
+ if ((op_errno != ENOENT) && (op_errno != ESTALE)) {
+ local->op_errno = op_errno;
+ local->op_ret = -1;
- VALIDATE_OR_GOTO (this->private, err);
+ if (op_errno != EACCES)
+ local->need_selfheal = 1;
+ }
- conf = this->private;
- local = frame->local;
+ gf_uuid_unparse(local->loc.gfid, gfid);
- if (local->op_ret == -1)
- goto err;
+ gf_msg_debug(this->name, op_errno,
+ "rmdir on %s for %s failed."
+ "(gfid = %s)",
+ prev->name, local->loc.path, gfid);
+ goto unlock;
+ }
- local->call_cnt = conf->subvolume_cnt;
+ /* Track if rmdir succeeded on at least one subvol*/
+ local->fop_succeeded = 1;
+ dht_iatt_merge(this, &local->preparent, preparent);
+ dht_iatt_merge(this, &local->postparent, postparent);
+ }
+unlock:
+ UNLOCK(&frame->lock);
+ this_call_cnt = dht_frame_return(frame);
- /* first remove from non-hashed_subvol */
- hashed_subvol = dht_subvol_get_hashed (this, &local->loc);
+ /* if local->hashed_subvol, we are yet to wind to hashed_subvol. */
+ if (local->hashed_subvol && (this_call_cnt == 1)) {
+ done = 1;
+ } else if (!local->hashed_subvol && !this_call_cnt) {
+ done = 1;
+ }
- if (!hashed_subvol) {
- gf_uuid_unparse(local->loc.gfid, gfid);
+ if (done) {
+ if (local->need_selfheal && local->fop_succeeded) {
+ dht_rmdir_unlock(frame, this);
+ local->layout = dht_layout_get(this, local->loc.inode);
- gf_msg (this->name, GF_LOG_WARNING, 0,
- DHT_MSG_HASHED_SUBVOL_GET_FAILED,
- "Failed to get hashed subvol for %s (gfid = %s)",
- local->loc.path, gfid);
- } else {
- local->hashed_subvol = hashed_subvol;
- }
+ /* TODO: neater interface needed below */
+ local->stbuf.ia_type = local->loc.inode->ia_type;
+
+ gf_uuid_copy(local->gfid, local->loc.inode->gfid);
+ heal_frame = copy_frame(frame);
+ if (heal_frame == NULL) {
+ goto err;
+ }
+
+ heal_local = dht_local_init(heal_frame, &local->loc, NULL, 0);
+ if (!heal_local) {
+ DHT_STACK_DESTROY(heal_frame);
+ goto err;
+ }
+
+ heal_local->inode = inode_ref(local->loc.inode);
+ heal_local->main_frame = frame;
+ gf_uuid_copy(heal_local->gfid, local->loc.inode->gfid);
+ ret = dht_selfheal_restore(heal_frame, dht_rmdir_selfheal_cbk,
+ &heal_local->loc, heal_local->layout);
+ if (ret) {
+ DHT_STACK_DESTROY(heal_frame);
+ goto err;
+ }
+
+ } else if (this_call_cnt) {
+ /* If non-hashed subvol's have responded, proceed */
+ if (local->op_ret == 0) {
+ /* Delete the dir from the hashed subvol if:
+ * The fop succeeded on at least one subvol
+ * and did not fail on any
+ * or
+ * The fop failed with ENOENT/ESTALE on
+ * all subvols */
+
+ STACK_WIND_COOKIE(frame, dht_rmdir_hashed_subvol_cbk,
+ local->hashed_subvol, local->hashed_subvol,
+ local->hashed_subvol->fops->rmdir,
+ &local->loc, local->flags, NULL);
+ } else {
+ /* hashed-subvol was non-NULL and rmdir failed on
+ * all non hashed-subvols. Unwind rmdir with
+ * local->op_ret and local->op_errno. */
+ dht_rmdir_unlock(frame, this);
+ DHT_STACK_UNWIND(rmdir, frame, local->op_ret, local->op_errno,
+ &local->preparent, &local->postparent, NULL);
- /* When DHT has only 1 child */
- if (conf->subvolume_cnt == 1) {
- STACK_WIND (frame, dht_rmdir_hashed_subvol_cbk,
- conf->subvolumes[0],
- conf->subvolumes[0]->fops->rmdir,
- &local->loc, local->flags, NULL);
return 0;
- }
+ }
+ } else if (!this_call_cnt) {
+ /* All subvol's have responded, proceed */
- for (i = 0; i < conf->subvolume_cnt; i++) {
- if (hashed_subvol &&
- (hashed_subvol == conf->subvolumes[i]))
- continue;
+ if (local->loc.parent) {
+ dht_inode_ctx_time_update(local->loc.parent, this,
+ &local->preparent, 0);
- STACK_WIND (frame, dht_rmdir_cbk,
- conf->subvolumes[i],
- conf->subvolumes[i]->fops->rmdir,
- &local->loc, local->flags, NULL);
+ dht_inode_ctx_time_update(local->loc.parent, this,
+ &local->postparent, 1);
+ }
+
+ dht_set_fixed_dir_stat(&local->preparent);
+ dht_set_fixed_dir_stat(&local->postparent);
+
+ dht_rmdir_unlock(frame, this);
+ DHT_STACK_UNWIND(rmdir, frame, local->op_ret, local->op_errno,
+ &local->preparent, &local->postparent, NULL);
}
+ }
- return 0;
+ return 0;
err:
- DHT_STACK_UNWIND (rmdir, frame, local->op_ret, local->op_errno,
- &local->preparent, &local->postparent, NULL);
- return 0;
+ DHT_STACK_UNWIND(rmdir, frame, -1, local->op_errno, NULL, NULL, NULL);
+ return 0;
}
-
-int
-dht_rmdir_linkfile_unlink_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
- int op_ret, int op_errno, struct iatt *preparent,
- struct iatt *postparent, dict_t *xdata)
+static int
+dht_rmdir_lock_cbk(call_frame_t *frame, void *cookie, xlator_t *this,
+ int32_t op_ret, int32_t op_errno, dict_t *xdata)
{
- dht_local_t *local = NULL;
- call_frame_t *prev = NULL;
- xlator_t *src = NULL;
- call_frame_t *main_frame = NULL;
- dht_local_t *main_local = NULL;
- int this_call_cnt = 0;
- char gfid[GF_UUID_BUF_SIZE] ={0};
+ dht_local_t *local = NULL;
+ dht_conf_t *conf = NULL;
+ int i = 0;
+ xlator_t *hashed_subvol;
+ conf = this->private;
+ local = frame->local;
- local = frame->local;
- prev = cookie;
- src = prev->this;
+ if (op_ret < 0) {
+ gf_msg(this->name, GF_LOG_WARNING, op_errno, DHT_MSG_INODE_LK_ERROR,
+ "acquiring entrylk after inodelk failed rmdir for %s)",
+ local->loc.path);
- main_frame = local->main_frame;
- main_local = main_frame->local;
+ local->op_ret = -1;
+ local->op_errno = op_errno;
+ goto err;
+ }
- gf_uuid_unparse(local->loc.gfid, gfid);
+ hashed_subvol = local->hashed_subvol;
+ for (i = 0; i < conf->subvolume_cnt; i++) {
+ if (hashed_subvol && (hashed_subvol == conf->subvolumes[i]))
+ continue;
- if (op_ret == 0) {
- gf_msg_trace (this->name, 0,
- "Unlinked linkfile %s on %s, gfid = %s",
- local->loc.path, src->name, gfid);
- } else {
- main_local->op_ret = -1;
- main_local->op_errno = op_errno;
- gf_msg_debug (this->name, op_errno,
- "Unlink of %s on %s failed. (gfid = %s)",
- local->loc.path, src->name, gfid);
- }
+ STACK_WIND_COOKIE(frame, dht_rmdir_cbk, conf->subvolumes[i],
+ conf->subvolumes[i], conf->subvolumes[i]->fops->rmdir,
+ &local->loc, local->flags, NULL);
+ }
- this_call_cnt = dht_frame_return (main_frame);
- if (is_last_call (this_call_cnt))
- dht_rmdir_do (main_frame, this);
+ return 0;
- DHT_STACK_DESTROY (frame);
- return 0;
-}
+err:
+ DHT_STACK_UNWIND(rmdir, frame, local->op_ret, local->op_errno,
+ &local->preparent, &local->postparent, NULL);
+ return 0;
+}
-int
-dht_rmdir_lookup_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
- int op_ret, int op_errno, inode_t *inode,
- struct iatt *stbuf, dict_t *xattr, struct iatt *parent)
+static int
+dht_rmdir_do(call_frame_t *frame, xlator_t *this)
{
- dht_local_t *local = NULL;
- call_frame_t *prev = NULL;
- xlator_t *src = NULL;
- call_frame_t *main_frame = NULL;
- dht_local_t *main_local = NULL;
- int this_call_cnt = 0;
- dht_conf_t *conf = this->private;
- char gfid[GF_UUID_BUF_SIZE] = {0};
+ dht_local_t *local = NULL;
+ dht_conf_t *conf = NULL;
+ int ret = -1;
+ xlator_t *hashed_subvol = NULL;
+ char gfid[GF_UUID_BUF_SIZE] = {0};
+
+ VALIDATE_OR_GOTO(frame->local, err);
+ local = frame->local;
+ VALIDATE_OR_GOTO(this->private, out);
+ conf = this->private;
+
+ if (local->op_ret == -1)
+ goto out;
- local = frame->local;
- prev = cookie;
- src = prev->this;
+ local->call_cnt = conf->subvolume_cnt;
- main_frame = local->main_frame;
- main_local = main_frame->local;
+ /* first remove from non-hashed_subvol */
+ hashed_subvol = dht_subvol_get_hashed(this, &local->loc);
- if (op_ret != 0)
- goto err;
+ if (!hashed_subvol) {
+ gf_uuid_unparse(local->loc.gfid, gfid);
- if (!check_is_linkfile (inode, stbuf, xattr, conf->link_xattr_name)) {
- main_local->op_ret = -1;
- main_local->op_errno = ENOTEMPTY;
+ gf_msg(this->name, GF_LOG_WARNING, 0, DHT_MSG_HASHED_SUBVOL_GET_FAILED,
+ "Failed to get hashed subvol for %s (gfid = %s)",
+ local->loc.path, gfid);
+ } else {
+ local->hashed_subvol = hashed_subvol;
+ }
- gf_uuid_unparse(local->loc.gfid, gfid);
+ /* When DHT has only 1 child */
+ if (conf->subvolume_cnt == 1) {
+ STACK_WIND_COOKIE(frame, dht_rmdir_hashed_subvol_cbk,
+ conf->subvolumes[0], conf->subvolumes[0],
+ conf->subvolumes[0]->fops->rmdir, &local->loc,
+ local->flags, NULL);
+ return 0;
+ }
- gf_msg (this->name, GF_LOG_WARNING, 0,
- DHT_MSG_NOT_LINK_FILE_ERROR,
- "%s on %s is not a linkfile (type=0%o, gfid = %s)",
- local->loc.path, src->name, stbuf->ia_type, gfid);
- goto err;
- }
+ local->current = &local->lock[0];
+ ret = dht_protect_namespace(frame, &local->loc, local->hashed_subvol,
+ &local->current->ns, dht_rmdir_lock_cbk);
+ if (ret < 0) {
+ local->op_ret = -1;
+ local->op_errno = errno ? errno : EINVAL;
+ goto out;
+ }
- STACK_WIND (frame, dht_rmdir_linkfile_unlink_cbk,
- src, src->fops->unlink, &local->loc, 0, NULL);
- return 0;
+ return 0;
+
+out:
+ dht_set_fixed_dir_stat(&local->preparent);
+ dht_set_fixed_dir_stat(&local->postparent);
+
+ DHT_STACK_UNWIND(rmdir, frame, local->op_ret, local->op_errno,
+ &local->preparent, &local->postparent, NULL);
+ return 0;
err:
+ DHT_STACK_UNWIND(rmdir, frame, -1, EINVAL, NULL, NULL, NULL);
+ return 0;
+}
+
+static void
+dht_rmdir_readdirp_done(call_frame_t *readdirp_frame, xlator_t *this)
+{
+ call_frame_t *main_frame = NULL;
+ dht_local_t *main_local = NULL;
+ dht_local_t *local = NULL;
+ int this_call_cnt = 0;
+
+ local = readdirp_frame->local;
+ main_frame = local->main_frame;
+ main_local = main_frame->local;
+
+ /* At least one readdirp failed.
+ * This is a bit hit or miss - if readdirp failed on more than
+ * one subvol, we don't know which error is returned.
+ */
+ if (local->op_ret == -1) {
+ main_local->op_ret = local->op_ret;
+ main_local->op_errno = local->op_errno;
+ }
+
+ this_call_cnt = dht_frame_return(main_frame);
+
+ if (is_last_call(this_call_cnt))
+ dht_rmdir_do(main_frame, this);
+
+ DHT_STACK_DESTROY(readdirp_frame);
+}
- this_call_cnt = dht_frame_return (main_frame);
- if (is_last_call (this_call_cnt))
- dht_rmdir_do (main_frame, this);
+/* Keep sending readdirp on the subvol until it returns no more entries
+ * It is possible that not all entries will fit in a single readdirp in
+ * which case the rmdir will keep failing with ENOTEMPTY
+ */
+
+static int
+dht_rmdir_readdirp_do(call_frame_t *readdirp_frame, xlator_t *this)
+{
+ dht_local_t *local = NULL;
+
+ local = readdirp_frame->local;
- DHT_STACK_DESTROY (frame);
+ if (local->op_ret == -1) {
+ /* there is no point doing another readdirp on this
+ * subvol . */
+ dht_rmdir_readdirp_done(readdirp_frame, this);
return 0;
-}
+ }
+ STACK_WIND_COOKIE(readdirp_frame, dht_rmdir_readdirp_cbk,
+ local->hashed_subvol, local->hashed_subvol,
+ local->hashed_subvol->fops->readdirp, local->fd, 4096, 0,
+ local->xattr);
-int
-dht_rmdir_cached_lookup_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
- int op_ret, int op_errno, inode_t *inode,
- struct iatt *stbuf, dict_t *xattr,
- struct iatt *parent)
-{
- dht_local_t *local = NULL;
- xlator_t *src = NULL;
- call_frame_t *main_frame = NULL;
- dht_local_t *main_local = NULL;
- int this_call_cnt = 0;
- dht_conf_t *conf = this->private;
- dict_t *xattrs = NULL;
- int ret = 0;
+ return 0;
+}
- local = frame->local;
- src = local->hashed_subvol;
+static int
+dht_rmdir_linkfile_unlink_cbk(call_frame_t *frame, void *cookie, xlator_t *this,
+ int op_ret, int op_errno, struct iatt *preparent,
+ struct iatt *postparent, dict_t *xdata)
+{
+ dht_local_t *local = NULL;
+ xlator_t *prev = NULL;
+ xlator_t *src = NULL;
+ call_frame_t *readdirp_frame = NULL;
+ dht_local_t *readdirp_local = NULL;
+ int this_call_cnt = 0;
+ char gfid[GF_UUID_BUF_SIZE] = {0};
+
+ local = frame->local;
+ prev = cookie;
+ src = prev;
+
+ readdirp_frame = local->main_frame;
+ readdirp_local = readdirp_frame->local;
+
+ gf_uuid_unparse(local->loc.gfid, gfid);
+
+ if (op_ret == 0) {
+ gf_msg_trace(this->name, 0, "Unlinked linkfile %s on %s, gfid = %s",
+ local->loc.path, src->name, gfid);
+ } else {
+ if (op_errno != ENOENT) {
+ readdirp_local->op_ret = -1;
+ readdirp_local->op_errno = op_errno;
+ }
+ gf_msg_debug(this->name, op_errno,
+ "Unlink of %s on %s failed. (gfid = %s)", local->loc.path,
+ src->name, gfid);
+ }
+
+ this_call_cnt = dht_frame_return(readdirp_frame);
+
+ if (is_last_call(this_call_cnt))
+ dht_rmdir_readdirp_do(readdirp_frame, this);
+
+ DHT_STACK_DESTROY(frame);
+ return 0;
+}
- main_frame = local->main_frame;
- main_local = main_frame->local;
+static int
+dht_rmdir_lookup_cbk(call_frame_t *frame, void *cookie, xlator_t *this,
+ int op_ret, int op_errno, inode_t *inode,
+ struct iatt *stbuf, dict_t *xattr, struct iatt *parent)
+{
+ dht_local_t *local = NULL;
+ xlator_t *prev = NULL;
+ xlator_t *src = NULL;
+ call_frame_t *readdirp_frame = NULL;
+ dht_local_t *readdirp_local = NULL;
+ int this_call_cnt = 0;
+ dht_conf_t *conf = this->private;
+ char gfid[GF_UUID_BUF_SIZE] = {0};
+
+ local = frame->local;
+ prev = cookie;
+ src = prev;
+
+ gf_msg_debug(this->name, 0, "dht_rmdir_lookup_cbk %s", local->loc.path);
+
+ readdirp_frame = local->main_frame;
+ readdirp_local = readdirp_frame->local;
+
+ if (op_ret != 0) {
+ gf_msg(this->name, GF_LOG_WARNING, op_errno, DHT_MSG_FILE_LOOKUP_FAILED,
+ "lookup failed for %s on %s", local->loc.path, src->name);
+ goto err;
+ }
+
+ if (!check_is_linkfile(inode, stbuf, xattr, conf->link_xattr_name)) {
+ readdirp_local->op_ret = -1;
+ readdirp_local->op_errno = ENOTEMPTY;
- if (op_ret == 0) {
- main_local->op_ret = -1;
- main_local->op_errno = ENOTEMPTY;
+ gf_uuid_unparse(local->loc.gfid, gfid);
- gf_msg (this->name, GF_LOG_WARNING, 0,
- DHT_MSG_SUBVOL_ERROR,
- "%s found on cached subvol %s",
- local->loc.path, src->name);
- goto err;
- } else if (op_errno != ENOENT) {
- main_local->op_ret = -1;
- main_local->op_errno = op_errno;
- goto err;
- }
+ gf_msg(this->name, GF_LOG_WARNING, 0, DHT_MSG_NOT_LINK_FILE_ERROR,
+ "%s on %s is not a linkfile (type=0%o, gfid = %s)",
+ local->loc.path, src->name, stbuf->ia_type, gfid);
+ goto err;
+ }
- xattrs = dict_new ();
- if (!xattrs) {
- gf_msg (this->name, GF_LOG_ERROR, ENOMEM,
- DHT_MSG_NO_MEMORY, "dict_new failed");
- goto err;
- }
+ STACK_WIND_COOKIE(frame, dht_rmdir_linkfile_unlink_cbk, src, src,
+ src->fops->unlink, &local->loc, 0, NULL);
+ return 0;
+err:
- ret = dict_set_uint32 (xattrs, conf->link_xattr_name, 256);
- if (ret) {
- gf_msg (this->name, GF_LOG_ERROR, 0,
- DHT_MSG_DICT_SET_FAILED,
- "Failed to set dictionary value: key = %s",
- conf->link_xattr_name);
- if (xattrs)
- dict_unref (xattrs);
- goto err;
- }
+ this_call_cnt = dht_frame_return(readdirp_frame);
+ if (is_last_call(this_call_cnt)) {
+ dht_rmdir_readdirp_do(readdirp_frame, this);
+ }
- STACK_WIND (frame, dht_rmdir_lookup_cbk,
- src, src->fops->lookup, &local->loc, xattrs);
- if (xattrs)
- dict_unref (xattrs);
+ DHT_STACK_DESTROY(frame);
+ return 0;
+}
- return 0;
+static int
+dht_rmdir_cached_lookup_cbk(call_frame_t *frame, void *cookie, xlator_t *this,
+ int op_ret, int op_errno, inode_t *inode,
+ struct iatt *stbuf, dict_t *xattr,
+ struct iatt *parent)
+{
+ dht_local_t *local = NULL;
+ xlator_t *src = NULL;
+ call_frame_t *readdirp_frame = NULL;
+ dht_local_t *readdirp_local = NULL;
+ int this_call_cnt = 0;
+ dht_conf_t *conf = this->private;
+ dict_t *xattrs = NULL;
+ int ret = 0;
+
+ local = frame->local;
+ src = local->hashed_subvol;
+
+ /* main_frame here is the readdirp_frame */
+
+ readdirp_frame = local->main_frame;
+ readdirp_local = readdirp_frame->local;
+
+ gf_msg_debug(this->name, 0, "returning for %s ", local->loc.path);
+
+ if (op_ret == 0) {
+ readdirp_local->op_ret = -1;
+ readdirp_local->op_errno = ENOTEMPTY;
+
+ gf_msg(this->name, GF_LOG_WARNING, 0, DHT_MSG_SUBVOL_ERROR,
+ "%s found on cached subvol %s", local->loc.path, src->name);
+ goto err;
+ } else if (op_errno != ENOENT) {
+ readdirp_local->op_ret = -1;
+ readdirp_local->op_errno = op_errno;
+
+ gf_msg(this->name, GF_LOG_WARNING, op_errno, DHT_MSG_SUBVOL_ERROR,
+ "%s not found on cached subvol %s", local->loc.path, src->name);
+ goto err;
+ }
+
+ xattrs = dict_new();
+ if (!xattrs) {
+ gf_msg(this->name, GF_LOG_ERROR, ENOMEM, DHT_MSG_NO_MEMORY,
+ "dict_new failed");
+ goto err;
+ }
+
+ ret = dict_set_uint32(xattrs, conf->link_xattr_name, 256);
+ if (ret) {
+ gf_msg(this->name, GF_LOG_ERROR, 0, DHT_MSG_DICT_SET_FAILED,
+ "Failed to set dictionary value: key = %s",
+ conf->link_xattr_name);
+ if (xattrs)
+ dict_unref(xattrs);
+ goto err;
+ }
+ STACK_WIND_COOKIE(frame, dht_rmdir_lookup_cbk, src, src, src->fops->lookup,
+ &local->loc, xattrs);
+ if (xattrs)
+ dict_unref(xattrs);
+
+ return 0;
err:
- this_call_cnt = dht_frame_return (main_frame);
- if (is_last_call (this_call_cnt))
- dht_rmdir_do (main_frame, this);
+ this_call_cnt = dht_frame_return(readdirp_frame);
- DHT_STACK_DESTROY (frame);
- return 0;
+ /* Once all the lookups/unlinks etc have returned, proceed to wind
+ * readdirp on the subvol again until no entries are returned.
+ * This is required if there are more entries than can be returned
+ * in a single readdirp call.
+ */
+
+ if (is_last_call(this_call_cnt))
+ dht_rmdir_readdirp_do(readdirp_frame, this);
+
+ DHT_STACK_DESTROY(frame);
+ return 0;
}
+static int
+dht_rmdir_is_subvol_empty(call_frame_t *frame, xlator_t *this,
+ gf_dirent_t *entries, xlator_t *src)
+{
+ int ret = 0;
+ int build_ret = 0;
+ gf_dirent_t *trav = NULL;
+ call_frame_t *lookup_frame = NULL;
+ dht_local_t *lookup_local = NULL;
+ dht_local_t *local = NULL;
+ dict_t *xattrs = NULL;
+ dht_conf_t *conf = this->private;
+ xlator_t *subvol = NULL;
+ char gfid[GF_UUID_BUF_SIZE] = {0};
+ int count = 0;
+ gf_boolean_t unwind = _gf_false;
+
+ local = frame->local;
+
+ list_for_each_entry(trav, &entries->list, list)
+ {
+ if (strcmp(trav->d_name, ".") == 0)
+ continue;
+ if (strcmp(trav->d_name, "..") == 0)
+ continue;
+ if (check_is_linkfile(NULL, (&trav->d_stat), trav->dict,
+ conf->link_xattr_name)) {
+ count++;
+ continue;
+ }
+
+ /* this entry is either a directory which is neither "." nor "..",
+ or a non directory which is not a linkfile. the directory is to
+ be treated as non-empty
+ */
+ return 0;
+ }
-int
-dht_rmdir_is_subvol_empty (call_frame_t *frame, xlator_t *this,
- gf_dirent_t *entries, xlator_t *src)
-{
- int ret = 0;
- int build_ret = 0;
- gf_dirent_t *trav = NULL;
- call_frame_t *lookup_frame = NULL;
- dht_local_t *lookup_local = NULL;
- dht_local_t *local = NULL;
- dict_t *xattrs = NULL;
- dht_conf_t *conf = this->private;
- xlator_t *subvol = NULL;
- char gfid[GF_UUID_BUF_SIZE] = {0};
+ xattrs = dict_new();
+ if (!xattrs) {
+ gf_msg(this->name, GF_LOG_ERROR, ENOMEM, DHT_MSG_NO_MEMORY,
+ "dict_new failed");
+ return -1;
+ }
- local = frame->local;
+ ret = dict_set_uint32(xattrs, conf->link_xattr_name, 256);
+ if (ret) {
+ gf_msg(this->name, GF_LOG_ERROR, 0, DHT_MSG_DICT_SET_FAILED,
+ "Failed to set dictionary value: key = %s",
+ conf->link_xattr_name);
- list_for_each_entry (trav, &entries->list, list) {
- if (strcmp (trav->d_name, ".") == 0)
- continue;
- if (strcmp (trav->d_name, "..") == 0)
- continue;
- if (check_is_linkfile (NULL, (&trav->d_stat), trav->dict,
- conf->link_xattr_name)) {
- ret++;
- continue;
- }
+ if (xattrs)
+ dict_unref(xattrs);
+ return -1;
+ }
- /* this entry is either a directory which is neither "." nor "..",
- or a non directory which is not a linkfile. the directory is to
- be treated as non-empty
- */
- return 0;
+ local->call_cnt = count;
+ ret = 0;
+
+ list_for_each_entry(trav, &entries->list, list)
+ {
+ if (strcmp(trav->d_name, ".") == 0)
+ continue;
+ if (strcmp(trav->d_name, "..") == 0)
+ continue;
+
+ lookup_frame = copy_frame(frame);
+
+ if (!lookup_frame) {
+ /* out of memory, let the rmdir fail
+ (as non-empty, unfortunately) */
+ goto err;
}
- xattrs = dict_new ();
- if (!xattrs) {
- gf_msg (this->name, GF_LOG_ERROR, ENOMEM,
- DHT_MSG_NO_MEMORY, "dict_new failed");
- return -1;
+ lookup_local = dht_local_init(lookup_frame, NULL, NULL, GF_FOP_LOOKUP);
+ if (!lookup_local) {
+ goto err;
}
- ret = dict_set_uint32 (xattrs, conf->link_xattr_name, 256);
- if (ret) {
- gf_msg (this->name, GF_LOG_ERROR, 0,
- DHT_MSG_DICT_SET_FAILED,
- "Failed to set dictionary value: key = %s",
- conf->link_xattr_name);
-
- if (xattrs)
- dict_unref (xattrs);
- return -1;
- }
-
- list_for_each_entry (trav, &entries->list, list) {
- if (strcmp (trav->d_name, ".") == 0)
- continue;
- if (strcmp (trav->d_name, "..") == 0)
- continue;
-
- lookup_frame = NULL;
- lookup_local = NULL;
-
- lookup_frame = copy_frame (frame);
- if (!lookup_frame) {
- /* out of memory, let the rmdir fail
- (as non-empty, unfortunately) */
- goto err;
- }
+ lookup_frame->local = lookup_local;
+ lookup_local->main_frame = frame;
+ lookup_local->hashed_subvol = src;
- lookup_local = mem_get0 (this->local_pool);
- if (!lookup_local) {
- goto err;
- }
+ build_ret = dht_build_child_loc(this, &lookup_local->loc, &local->loc,
+ trav->d_name);
+ if (build_ret != 0)
+ goto err;
- lookup_frame->local = lookup_local;
- lookup_local->main_frame = frame;
- lookup_local->hashed_subvol = src;
+ gf_uuid_copy(lookup_local->loc.gfid, trav->d_stat.ia_gfid);
- build_ret = dht_build_child_loc (this, &lookup_local->loc,
- &local->loc, trav->d_name);
- if (build_ret != 0)
- goto err;
+ gf_uuid_unparse(lookup_local->loc.gfid, gfid);
- gf_uuid_copy (lookup_local->loc.gfid, trav->d_stat.ia_gfid);
+ gf_msg_trace(this->name, 0, "looking up %s on subvolume %s, gfid = %s",
+ lookup_local->loc.path, src->name, gfid);
- gf_uuid_unparse(lookup_local->loc.gfid, gfid);
+ subvol = dht_linkfile_subvol(this, NULL, &trav->d_stat, trav->dict);
+ if (!subvol || (subvol == src)) {
+ /* we need to delete the linkto file if it does not have a
+ * valid subvol or it points to itself.
+ */
+ gf_msg(this->name, GF_LOG_INFO, 0, DHT_MSG_INVALID_LINKFILE,
+ "Linkfile does not have link subvolume. "
+ "path = %s, gfid = %s",
+ lookup_local->loc.path, gfid);
- gf_msg_trace (this->name, 0,
- "looking up %s on subvolume %s, gfid = %s",
- lookup_local->loc.path, src->name, gfid);
+ gf_msg_debug(this->name, 0, "looking up %s on subvol %s, gfid = %s",
+ lookup_local->loc.path, src->name, gfid);
- LOCK (&frame->lock);
- {
- local->call_cnt++;
- }
- UNLOCK (&frame->lock);
-
- subvol = dht_linkfile_subvol (this, NULL, &trav->d_stat,
- trav->dict);
- if (!subvol) {
- gf_msg (this->name, GF_LOG_INFO, 0,
- DHT_MSG_INVALID_LINKFILE,
- "Linkfile does not have link subvolume. "
- "path = %s, gfid = %s",
- lookup_local->loc.path, gfid);
- STACK_WIND (lookup_frame, dht_rmdir_lookup_cbk,
- src, src->fops->lookup,
- &lookup_local->loc, xattrs);
- } else {
- STACK_WIND (lookup_frame, dht_rmdir_cached_lookup_cbk,
- subvol, subvol->fops->lookup,
- &lookup_local->loc, xattrs);
- }
- ret++;
+ STACK_WIND_COOKIE(lookup_frame, dht_rmdir_lookup_cbk, src, src,
+ src->fops->lookup, &lookup_local->loc, xattrs);
+ } else {
+ gf_msg_debug(this->name, 0,
+ "Looking up linkfile target %s on "
+ " subvol %s, gfid = %s",
+ lookup_local->loc.path, subvol->name, gfid);
+
+ STACK_WIND(lookup_frame, dht_rmdir_cached_lookup_cbk, subvol,
+ subvol->fops->lookup, &lookup_local->loc, xattrs);
}
+ ret++;
- if (xattrs)
- dict_unref (xattrs);
+ lookup_frame = NULL;
+ lookup_local = NULL;
+ }
- return ret;
+ if (xattrs)
+ dict_unref(xattrs);
+
+ return ret;
err:
- if (xattrs)
- dict_unref (xattrs);
+ if (xattrs)
+ dict_unref(xattrs);
- if (lookup_frame)
- DHT_STACK_DESTROY (lookup_frame);
- return 0;
+ if (lookup_frame)
+ DHT_STACK_DESTROY(lookup_frame);
+
+ /* Handle the case where the wound calls have unwound before the
+ * loop processing is done
+ */
+
+ LOCK(&frame->lock);
+ {
+ local->op_ret = -1;
+ local->op_errno = ENOTEMPTY;
+
+ local->call_cnt -= (count - ret);
+ if (!local->call_cnt)
+ unwind = _gf_true;
+ }
+ UNLOCK(&frame->lock);
+
+ if (!unwind) {
+ return ret;
+ }
+ return 0;
}
+/*
+ * No more entries on this subvol. Proceed to the actual rmdir operation.
+ */
-int
-dht_rmdir_readdirp_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
- int op_ret, int op_errno, gf_dirent_t *entries,
- dict_t *xdata)
+static int
+dht_rmdir_readdirp_cbk(call_frame_t *frame, void *cookie, xlator_t *this,
+ int op_ret, int op_errno, gf_dirent_t *entries,
+ dict_t *xdata)
{
- dht_local_t *local = NULL;
- int this_call_cnt = -1;
- call_frame_t *prev = NULL;
- xlator_t *src = NULL;
- int ret = 0;
+ dht_local_t *local = NULL;
+ xlator_t *prev = NULL;
+ xlator_t *src = NULL;
+ int ret = 0;
+ char *path = NULL;
+
+ local = frame->local;
+ prev = cookie;
+ src = prev;
+
+ if (op_ret > 2) {
+ /* dht_rmdir_is_subvol_empty() may free the frame,
+ * copy path for logging.
+ */
+ path = gf_strdup(local->loc.path);
- local = frame->local;
- prev = cookie;
- src = prev->this;
-
- if (op_ret > 2) {
- ret = dht_rmdir_is_subvol_empty (frame, this, entries, src);
-
- switch (ret) {
- case 0: /* non linkfiles exist */
- gf_msg_trace (this->name, 0,
- "readdir on %s for %s returned %d "
- "entries", prev->this->name,
- local->loc.path, op_ret);
- local->op_ret = -1;
- local->op_errno = ENOTEMPTY;
- break;
- default:
- /* @ret number of linkfiles are getting unlinked */
- gf_msg_trace (this->name, 0,
- "readdir on %s for %s found %d "
- "linkfiles", prev->this->name,
- local->loc.path, ret);
- break;
- }
+ ret = dht_rmdir_is_subvol_empty(frame, this, entries, src);
+
+ switch (ret) {
+ case 0: /* non linkfiles exist */
+ gf_msg_trace(this->name, 0,
+ "readdir on %s for %s returned %d "
+ "entries",
+ prev->name, local->loc.path, op_ret);
+ local->op_ret = -1;
+ local->op_errno = ENOTEMPTY;
+ break;
+ default:
+ /* @ret number of linkfiles are getting unlinked */
+ gf_msg_trace(this->name, 0,
+ "readdir on %s for %s found %d "
+ "linkfiles",
+ prev->name, path, ret);
+ break;
}
+ }
+
+ /* readdirp failed or no linkto files were found on this subvol */
+ if (!ret)
+ dht_rmdir_readdirp_done(frame, this);
- this_call_cnt = dht_frame_return (frame);
+ GF_FREE(path);
+ return 0;
+}
+
+static int
+dht_rmdir_opendir_cbk(call_frame_t *frame, void *cookie, xlator_t *this,
+ int op_ret, int op_errno, fd_t *fd, dict_t *xdata)
+{
+ dht_local_t *local = NULL;
+ int this_call_cnt = -1;
+ xlator_t *prev = NULL;
+ int ret = 0;
+ dht_conf_t *conf = this->private;
+ dict_t *dict = NULL;
+ int i = 0;
+ char gfid[GF_UUID_BUF_SIZE] = {0};
+ dht_local_t *readdirp_local = NULL;
+ call_frame_t *readdirp_frame = NULL;
+ int cnt = 0;
+
+ local = frame->local;
+ prev = cookie;
+
+ this_call_cnt = dht_frame_return(frame);
+ if (op_ret == -1) {
+ gf_uuid_unparse(local->loc.gfid, gfid);
- if (is_last_call (this_call_cnt)) {
- dht_rmdir_do (frame, this);
+ gf_msg_debug(this->name, op_errno,
+ "opendir on %s for %s failed, "
+ "gfid = %s,",
+ prev->name, local->loc.path, gfid);
+ if ((op_errno != ENOENT) && (op_errno != ESTALE)) {
+ local->op_ret = -1;
+ local->op_errno = op_errno;
}
+ goto err;
+ }
+ if (!is_last_call(this_call_cnt))
return 0;
-}
+ if (local->op_ret == -1)
+ goto err;
-int
-dht_rmdir_opendir_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
- int op_ret, int op_errno, fd_t *fd, dict_t *xdata)
-{
- dht_local_t *local = NULL;
- int this_call_cnt = -1;
- call_frame_t *prev = NULL;
- dict_t *dict = NULL;
- int ret = 0;
- dht_conf_t *conf = this->private;
- int i = 0;
- char gfid[GF_UUID_BUF_SIZE] = {0};
+ fd_bind(fd);
- local = frame->local;
- prev = cookie;
+ dict = dict_new();
+ if (!dict) {
+ local->op_ret = -1;
+ local->op_errno = ENOMEM;
+ goto err;
+ }
+ ret = dict_set_uint32(dict, conf->link_xattr_name, 256);
+ if (ret)
+ gf_msg(this->name, GF_LOG_WARNING, 0, DHT_MSG_DICT_SET_FAILED,
+ "%s: Failed to set dictionary value:key = %s", local->loc.path,
+ conf->link_xattr_name);
- this_call_cnt = dht_frame_return (frame);
- if (op_ret == -1) {
- gf_uuid_unparse(local->loc.gfid, gfid);
-
- gf_msg_debug (this->name, op_errno,
- "opendir on %s for %s failed, "
- "gfid = %s,",
- prev->this->name, local->loc.path, gfid);
- if ((op_errno != ENOENT) && (op_errno != ESTALE)) {
- local->op_ret = -1;
- local->op_errno = op_errno;
- }
- goto err;
- }
+ cnt = local->call_cnt = conf->subvolume_cnt;
- if (!is_last_call (this_call_cnt))
- return 0;
+ /* Create a separate frame per subvol as we might need
+ * to resend readdirp multiple times to get all the
+ * entries.
+ */
- if (local->op_ret == -1)
- goto err;
+ for (i = 0; i < conf->subvolume_cnt; i++) {
+ readdirp_frame = copy_frame(frame);
- fd_bind (fd);
- dict = dict_new ();
- if (!dict) {
- local->op_ret = -1;
- local->op_errno = ENOMEM;
- goto err;
+ if (!readdirp_frame) {
+ cnt--;
+ /* Reduce the local->call_cnt as well */
+ (void)dht_frame_return(frame);
+ continue;
}
- ret = dict_set_uint32 (dict, conf->link_xattr_name, 256);
- if (ret)
- gf_msg (this->name, GF_LOG_WARNING, 0,
- DHT_MSG_DICT_SET_FAILED,
- "%s: Failed to set dictionary value:key = %s",
- local->loc.path, conf->link_xattr_name);
+ readdirp_local = dht_local_init(readdirp_frame, &local->loc, local->fd,
+ 0);
- local->call_cnt = conf->subvolume_cnt;
- for (i = 0; i < conf->subvolume_cnt; i++) {
- STACK_WIND (frame, dht_rmdir_readdirp_cbk,
- conf->subvolumes[i],
- conf->subvolumes[i]->fops->readdirp,
- local->fd, 4096, 0, dict);
+ if (!readdirp_local) {
+ DHT_STACK_DESTROY(readdirp_frame);
+ cnt--;
+ /* Reduce the local->call_cnt as well */
+ dht_frame_return(frame);
+ continue;
}
+ readdirp_local->main_frame = frame;
+ readdirp_local->op_ret = 0;
+ readdirp_local->xattr = dict_ref(dict);
+ /* overload this field to save the subvol info */
+ readdirp_local->hashed_subvol = conf->subvolumes[i];
- if (dict)
- dict_unref (dict);
+ STACK_WIND_COOKIE(readdirp_frame, dht_rmdir_readdirp_cbk,
+ conf->subvolumes[i], conf->subvolumes[i],
+ conf->subvolumes[i]->fops->readdirp,
+ readdirp_local->fd, 4096, 0, readdirp_local->xattr);
+ }
- return 0;
+ if (dict)
+ dict_unref(dict);
+
+ /* Could not wind readdirp to any subvol */
+
+ if (!cnt)
+ goto err;
+
+ return 0;
err:
- if (is_last_call (this_call_cnt)) {
- dht_rmdir_do (frame, this);
- }
+ if (is_last_call(this_call_cnt)) {
+ dht_rmdir_do(frame, this);
+ }
- return 0;
+ return 0;
}
-
int
-dht_rmdir (call_frame_t *frame, xlator_t *this, loc_t *loc, int flags,
- dict_t *xdata)
+dht_rmdir(call_frame_t *frame, xlator_t *this, loc_t *loc, int flags,
+ dict_t *xdata)
{
- dht_local_t *local = NULL;
- dht_conf_t *conf = NULL;
- int op_errno = -1;
- int i = -1;
+ dht_local_t *local = NULL;
+ dht_conf_t *conf = NULL;
+ int op_errno = -1;
+ int i = -1;
+ int ret = -1;
+ dict_t *xattr_req = NULL;
+
+ VALIDATE_OR_GOTO(frame, err);
+ VALIDATE_OR_GOTO(this, err);
+ VALIDATE_OR_GOTO(loc, err);
+ VALIDATE_OR_GOTO(loc->inode, err);
+ VALIDATE_OR_GOTO(loc->path, err);
+ VALIDATE_OR_GOTO(this->private, err);
+
+ conf = this->private;
+
+ local = dht_local_init(frame, loc, NULL, GF_FOP_RMDIR);
+ if (!local) {
+ op_errno = ENOMEM;
+ goto err;
+ }
+
+ local->call_cnt = conf->subvolume_cnt;
+ local->op_ret = 0;
+ local->fop_succeeded = 0;
+
+ local->flags = flags;
+
+ local->fd = fd_create(local->loc.inode, frame->root->pid);
+ if (!local->fd) {
+ op_errno = ENOMEM;
+ goto err;
+ }
+
+ if (flags) {
+ return dht_rmdir_do(frame, this);
+ }
+ if (xdata) {
+ xattr_req = dict_ref(xdata);
+ } else {
+ xattr_req = dict_new();
+ }
+ if (xattr_req) {
+ ret = dict_set_uint32(xattr_req, conf->link_xattr_name, 256);
+ /* If parallel-readdir is enabled, this is required
+ * to handle stale linkto files in the directory
+ * being deleted. If this fails, log an error but
+ * do not prevent the operation.
+ */
+ if (ret) {
+ gf_msg(this->name, GF_LOG_ERROR, 0, 0, "%s: failed to set key %s",
+ loc->path, conf->link_xattr_name);
+ }
+ } else {
+ gf_msg(this->name, GF_LOG_ERROR, 0, 0, "%s: failed to set key %s",
+ loc->path, conf->link_xattr_name);
+ }
+
+ for (i = 0; i < conf->subvolume_cnt; i++) {
+ STACK_WIND_COOKIE(frame, dht_rmdir_opendir_cbk, conf->subvolumes[i],
+ conf->subvolumes[i],
+ conf->subvolumes[i]->fops->opendir, loc, local->fd,
+ xattr_req);
+ }
+
+ if (xattr_req) {
+ dict_unref(xattr_req);
+ }
+ return 0;
- VALIDATE_OR_GOTO (frame, err);
- VALIDATE_OR_GOTO (this, err);
- VALIDATE_OR_GOTO (loc, err);
- VALIDATE_OR_GOTO (loc->inode, err);
- VALIDATE_OR_GOTO (loc->path, err);
- VALIDATE_OR_GOTO (this->private, err);
+err:
+ op_errno = (op_errno == -1) ? errno : op_errno;
+ DHT_STACK_UNWIND(rmdir, frame, -1, op_errno, NULL, NULL, NULL);
- conf = this->private;
+ return 0;
+}
- local = dht_local_init (frame, loc, NULL, GF_FOP_RMDIR);
- if (!local) {
- op_errno = ENOMEM;
- goto err;
- }
+static int
+dht_entrylk_cbk(call_frame_t *frame, void *cookie, xlator_t *this,
+ int32_t op_ret, int32_t op_errno, dict_t *xdata)
- local->call_cnt = conf->subvolume_cnt;
- local->op_ret = 0;
- local->fop_succeeded = 0;
+{
+ DHT_STACK_UNWIND(entrylk, frame, op_ret, op_errno, xdata);
+ return 0;
+}
- local->flags = flags;
+/* TODO
+ * Sending entrylk to cached subvol can result in stale lock
+ * as described in the bug 1311002.
+ */
+int
+dht_entrylk(call_frame_t *frame, xlator_t *this, const char *volume, loc_t *loc,
+ const char *basename, entrylk_cmd cmd, entrylk_type type,
+ dict_t *xdata)
+{
+ xlator_t *subvol = NULL;
+ int op_errno = -1;
+ dht_local_t *local = NULL;
+ char gfid[GF_UUID_BUF_SIZE] = {0};
+
+ VALIDATE_OR_GOTO(frame, err);
+ VALIDATE_OR_GOTO(this, err);
+ VALIDATE_OR_GOTO(loc, err);
+ VALIDATE_OR_GOTO(loc->inode, err);
+
+ local = dht_local_init(frame, loc, NULL, GF_FOP_ENTRYLK);
+ if (!local) {
+ op_errno = ENOMEM;
+ goto err;
+ }
+
+ subvol = local->cached_subvol;
+ if (!subvol) {
+ gf_uuid_unparse(loc->gfid, gfid);
- local->fd = fd_create (local->loc.inode, frame->root->pid);
- if (!local->fd) {
+ gf_msg_debug(this->name, 0,
+ "no cached subvolume for path=%s, "
+ "gfid = %s",
+ loc->path, gfid);
+ op_errno = EINVAL;
+ goto err;
+ }
- op_errno = ENOMEM;
- goto err;
- }
+ local->call_cnt = 1;
- for (i = 0; i < conf->subvolume_cnt; i++) {
- STACK_WIND (frame, dht_rmdir_opendir_cbk,
- conf->subvolumes[i],
- conf->subvolumes[i]->fops->opendir,
- loc, local->fd, NULL);
- }
+ STACK_WIND(frame, dht_entrylk_cbk, subvol, subvol->fops->entrylk, volume,
+ loc, basename, cmd, type, xdata);
- return 0;
+ return 0;
err:
- op_errno = (op_errno == -1) ? errno : op_errno;
- DHT_STACK_UNWIND (rmdir, frame, -1, op_errno,
- NULL, NULL, NULL);
+ op_errno = (op_errno == -1) ? errno : op_errno;
+ DHT_STACK_UNWIND(entrylk, frame, -1, op_errno, NULL);
- return 0;
+ return 0;
}
-int
-dht_entrylk_cbk (call_frame_t *frame, void *cookie,
- xlator_t *this, int32_t op_ret, int32_t op_errno, dict_t *xdata)
+static int
+dht_fentrylk_cbk(call_frame_t *frame, void *cookie, xlator_t *this,
+ int32_t op_ret, int32_t op_errno, dict_t *xdata)
{
- DHT_STACK_UNWIND (entrylk, frame, op_ret, op_errno, xdata);
- return 0;
+ DHT_STACK_UNWIND(fentrylk, frame, op_ret, op_errno, NULL);
+ return 0;
}
-
int
-dht_entrylk (call_frame_t *frame, xlator_t *this,
- const char *volume, loc_t *loc, const char *basename,
- entrylk_cmd cmd, entrylk_type type, dict_t *xdata)
-{
- xlator_t *subvol = NULL;
- int op_errno = -1;
- dht_local_t *local = NULL;
- char gfid[GF_UUID_BUF_SIZE] = {0};
-
- VALIDATE_OR_GOTO (frame, err);
- VALIDATE_OR_GOTO (this, err);
- VALIDATE_OR_GOTO (loc, err);
- VALIDATE_OR_GOTO (loc->inode, err);
-
- local = dht_local_init (frame, loc, NULL, GF_FOP_ENTRYLK);
- if (!local) {
- op_errno = ENOMEM;
- goto err;
- }
+dht_fentrylk(call_frame_t *frame, xlator_t *this, const char *volume, fd_t *fd,
+ const char *basename, entrylk_cmd cmd, entrylk_type type,
+ dict_t *xdata)
+{
+ xlator_t *subvol = NULL;
+ int op_errno = -1;
+ char gfid[GF_UUID_BUF_SIZE] = {0};
+ VALIDATE_OR_GOTO(frame, err);
+ VALIDATE_OR_GOTO(this, err);
+ VALIDATE_OR_GOTO(fd, err);
+ VALIDATE_OR_GOTO(fd->inode, err);
- subvol = local->cached_subvol;
- if (!subvol) {
- gf_uuid_unparse(loc->gfid, gfid);
+ gf_uuid_unparse(fd->inode->gfid, gfid);
- gf_msg_debug (this->name, 0,
- "no cached subvolume for path=%s, "
- "gfid = %s", loc->path, gfid);
- op_errno = EINVAL;
- goto err;
- }
+ subvol = dht_subvol_get_cached(this, fd->inode);
+ if (!subvol) {
+ gf_msg_debug(this->name, 0,
+ "No cached subvolume for fd=%p,"
+ " gfid = %s",
+ fd, gfid);
+ op_errno = EINVAL;
+ goto err;
+ }
- local->call_cnt = 1;
+ STACK_WIND(frame, dht_fentrylk_cbk, subvol, subvol->fops->fentrylk, volume,
+ fd, basename, cmd, type, xdata);
- STACK_WIND (frame, dht_entrylk_cbk,
- subvol, subvol->fops->entrylk,
- volume, loc, basename, cmd, type, xdata);
-
- return 0;
+ return 0;
err:
- op_errno = (op_errno == -1) ? errno : op_errno;
- DHT_STACK_UNWIND (entrylk, frame, -1, op_errno, NULL);
+ op_errno = (op_errno == -1) ? errno : op_errno;
+ DHT_STACK_UNWIND(fentrylk, frame, -1, op_errno, NULL);
- return 0;
+ return 0;
}
+static int32_t
+dht_ipc_cbk(call_frame_t *frame, void *cookie, xlator_t *this, int32_t op_ret,
+ int32_t op_errno, dict_t *xdata)
+{
+ dht_local_t *local = NULL;
+ int this_call_cnt = 0;
-int
-dht_fentrylk_cbk (call_frame_t *frame, void *cookie,
- xlator_t *this, int32_t op_ret, int32_t op_errno, dict_t *xdata)
+ GF_VALIDATE_OR_GOTO("dht", frame, out);
+ GF_VALIDATE_OR_GOTO("dht", this, out);
+ GF_VALIDATE_OR_GOTO("dht", frame->local, out);
-{
- DHT_STACK_UNWIND (fentrylk, frame, op_ret, op_errno, NULL);
- return 0;
-}
+ local = frame->local;
+ LOCK(&frame->lock);
+ {
+ if (op_ret < 0 && op_errno != ENOTCONN) {
+ local->op_errno = op_errno;
+ goto unlock;
+ }
+ local->op_ret = 0;
+ }
+unlock:
+ UNLOCK(&frame->lock);
-int
-dht_fentrylk (call_frame_t *frame, xlator_t *this,
- const char *volume, fd_t *fd, const char *basename,
- entrylk_cmd cmd, entrylk_type type, dict_t *xdata)
+ this_call_cnt = dht_frame_return(frame);
+ if (is_last_call(this_call_cnt)) {
+ DHT_STACK_UNWIND(ipc, frame, local->op_ret, local->op_errno, NULL);
+ }
+
+out:
+ return 0;
+}
+
+int32_t
+dht_ipc(call_frame_t *frame, xlator_t *this, int32_t op, dict_t *xdata)
{
- xlator_t *subvol = NULL;
- int op_errno = -1;
- char gfid[GF_UUID_BUF_SIZE] = {0};
+ dht_local_t *local = NULL;
+ int op_errno = EINVAL;
+ dht_conf_t *conf = NULL;
+ int call_cnt = 0;
+ int i = 0;
+ VALIDATE_OR_GOTO(frame, err);
+ VALIDATE_OR_GOTO(this, err);
- VALIDATE_OR_GOTO (frame, err);
- VALIDATE_OR_GOTO (this, err);
- VALIDATE_OR_GOTO (fd, err);
- VALIDATE_OR_GOTO(fd->inode, err);
+ if (op != GF_IPC_TARGET_UPCALL)
+ goto wind_default;
- gf_uuid_unparse(fd->inode->gfid, gfid);
+ VALIDATE_OR_GOTO(this->private, err);
+ conf = this->private;
- subvol = dht_subvol_get_cached (this, fd->inode);
- if (!subvol) {
- gf_msg_debug (this->name, 0,
- "No cached subvolume for fd=%p,"
- " gfid = %s", fd, gfid);
- op_errno = EINVAL;
- goto err;
- }
+ local = dht_local_init(frame, NULL, NULL, GF_FOP_IPC);
+ if (!local) {
+ op_errno = ENOMEM;
+ goto err;
+ }
- STACK_WIND (frame, dht_fentrylk_cbk,
- subvol, subvol->fops->fentrylk,
- volume, fd, basename, cmd, type, xdata);
+ call_cnt = conf->subvolume_cnt;
+ local->call_cnt = call_cnt;
- return 0;
+ if (xdata) {
+ if (dict_set_int8(xdata, conf->xattr_name, 0) < 0)
+ goto err;
+ }
+
+ for (i = 0; i < call_cnt; i++) {
+ STACK_WIND(frame, dht_ipc_cbk, conf->subvolumes[i],
+ conf->subvolumes[i]->fops->ipc, op, xdata);
+ }
+
+ return 0;
err:
- op_errno = (op_errno == -1) ? errno : op_errno;
- DHT_STACK_UNWIND (fentrylk, frame, -1, op_errno, NULL);
+ DHT_STACK_UNWIND(ipc, frame, -1, op_errno, NULL);
- return 0;
-}
+ return 0;
+wind_default:
+ STACK_WIND(frame, default_ipc_cbk, FIRST_CHILD(this),
+ FIRST_CHILD(this)->fops->ipc, op, xdata);
+ return 0;
+}
int
-dht_forget (xlator_t *this, inode_t *inode)
+dht_forget(xlator_t *this, inode_t *inode)
{
- uint64_t ctx_int = 0;
- dht_inode_ctx_t *ctx = NULL;
- dht_layout_t *layout = NULL;
+ uint64_t ctx_int = 0;
+ dht_inode_ctx_t *ctx = NULL;
+ dht_layout_t *layout = NULL;
- inode_ctx_del (inode, this, &ctx_int);
+ inode_ctx_del(inode, this, &ctx_int);
- if (!ctx_int)
- return 0;
+ if (!ctx_int)
+ return 0;
- ctx = (dht_inode_ctx_t *) (long) ctx_int;
+ ctx = (dht_inode_ctx_t *)(long)ctx_int;
- layout = ctx->layout;
- ctx->layout = NULL;
- dht_layout_unref (this, layout);
- GF_FREE (ctx);
+ layout = ctx->layout;
+ ctx->layout = NULL;
+ dht_layout_unref(this, layout);
+ GF_FREE(ctx);
- return 0;
+ return 0;
}
-
int
-dht_notify (xlator_t *this, int event, void *data, ...)
-{
- xlator_t *subvol = NULL;
- int cnt = -1;
- int i = -1;
- dht_conf_t *conf = NULL;
- int ret = -1;
- int propagate = 0;
-
- int had_heard_from_all = 0;
- int have_heard_from_all = 0;
- struct timeval time = {0,};
- gf_defrag_info_t *defrag = NULL;
- dict_t *dict = NULL;
- gf_defrag_type cmd = 0;
- dict_t *output = NULL;
- va_list ap;
- dht_methods_t *methods = NULL;
-
- conf = this->private;
- GF_VALIDATE_OR_GOTO (this->name, conf, out);
-
- methods = &(conf->methods);
-
- /* had all subvolumes reported status once till now? */
- had_heard_from_all = 1;
- for (i = 0; i < conf->subvolume_cnt; i++) {
- if (!conf->last_event[i]) {
- had_heard_from_all = 0;
- }
- }
-
- switch (event) {
+dht_notify(xlator_t *this, int event, void *data, ...)
+{
+ xlator_t *subvol = NULL;
+ int cnt = -1;
+ int i = -1;
+ dht_conf_t *conf = NULL;
+ int ret = -1;
+ int propagate = 0;
+
+ int had_heard_from_all = 0;
+ int have_heard_from_all = 0;
+ gf_defrag_info_t *defrag = NULL;
+ dict_t *dict = NULL;
+ gf_defrag_type cmd = 0;
+ dict_t *output = NULL;
+ va_list ap;
+ struct gf_upcall *up_data = NULL;
+ struct gf_upcall_cache_invalidation *up_ci = NULL;
+
+ conf = this->private;
+ GF_VALIDATE_OR_GOTO(this->name, conf, out);
+
+ /* had all subvolumes reported status once till now? */
+ had_heard_from_all = 1;
+ for (i = 0; i < conf->subvolume_cnt; i++) {
+ if (!conf->last_event[i]) {
+ had_heard_from_all = 0;
+ }
+ }
+
+ switch (event) {
case GF_EVENT_CHILD_UP:
- subvol = data;
+ subvol = data;
- conf->gen++;
+ conf->gen++;
- for (i = 0; i < conf->subvolume_cnt; i++) {
- if (subvol == conf->subvolumes[i]) {
- cnt = i;
- break;
- }
+ for (i = 0; i < conf->subvolume_cnt; i++) {
+ if (subvol == conf->subvolumes[i]) {
+ cnt = i;
+ break;
}
+ }
- if (cnt == -1) {
- gf_msg_debug (this->name, 0,
- "got GF_EVENT_CHILD_UP bad "
- "subvolume %s",
- subvol->name);
- break;
- }
+ if (cnt == -1) {
+ gf_msg_debug(this->name, 0,
+ "got GF_EVENT_CHILD_UP bad "
+ "subvolume %s",
+ subvol->name);
+ break;
+ }
- gettimeofday (&time, NULL);
- LOCK (&conf->subvolume_lock);
- {
- conf->subvolume_status[cnt] = 1;
- conf->last_event[cnt] = event;
- conf->subvol_up_time[cnt] = time.tv_sec;
- }
- UNLOCK (&conf->subvolume_lock);
+ LOCK(&conf->subvolume_lock);
+ {
+ conf->subvolume_status[cnt] = 1;
+ conf->last_event[cnt] = event;
+ conf->subvol_up_time[cnt] = gf_time();
+ }
+ UNLOCK(&conf->subvolume_lock);
- /* one of the node came back up, do a stat update */
- dht_get_du_info_for_subvol (this, cnt);
+ /* one of the node came back up, do a stat update */
+ dht_get_du_info_for_subvol(this, cnt);
- break;
+ break;
- case GF_EVENT_CHILD_MODIFIED:
- subvol = data;
+ case GF_EVENT_SOME_DESCENDENT_UP:
+ subvol = data;
+ conf->gen++;
+ propagate = 1;
- conf->gen++;
- propagate = 1;
+ break;
- break;
+ case GF_EVENT_SOME_DESCENDENT_DOWN:
+ subvol = data;
+ propagate = 1;
- case GF_EVENT_SOME_CHILD_DOWN:
- subvol = data;
- propagate = 1;
-
- break;
+ break;
case GF_EVENT_CHILD_DOWN:
- subvol = data;
-
- if (conf->assert_no_child_down) {
- gf_msg (this->name, GF_LOG_WARNING, 0,
- DHT_MSG_CHILD_DOWN,
- "Received CHILD_DOWN. Exiting");
- if (conf->defrag) {
- gf_defrag_stop (conf->defrag,
- GF_DEFRAG_STATUS_FAILED, NULL);
- } else {
- kill (getpid(), SIGTERM);
- }
- }
-
- for (i = 0; i < conf->subvolume_cnt; i++) {
- if (subvol == conf->subvolumes[i]) {
- cnt = i;
- break;
- }
- }
+ subvol = data;
- if (cnt == -1) {
- gf_msg_debug (this->name, 0,
- "got GF_EVENT_CHILD_DOWN bad "
- "subvolume %s", subvol->name);
- break;
+ if (conf->assert_no_child_down) {
+ gf_msg(this->name, GF_LOG_WARNING, 0, DHT_MSG_CHILD_DOWN,
+ "Received CHILD_DOWN. Exiting");
+ if (conf->defrag) {
+ gf_defrag_stop(conf, GF_DEFRAG_STATUS_FAILED, NULL);
+ } else {
+ kill(getpid(), SIGTERM);
}
+ }
- LOCK (&conf->subvolume_lock);
- {
- conf->subvolume_status[cnt] = 0;
- conf->last_event[cnt] = event;
- conf->subvol_up_time[cnt] = 0;
+ for (i = 0; i < conf->subvolume_cnt; i++) {
+ if (subvol == conf->subvolumes[i]) {
+ cnt = i;
+ break;
}
- UNLOCK (&conf->subvolume_lock);
+ }
- for (i = 0; i < conf->subvolume_cnt; i++)
- if (conf->last_event[i] != event)
- event = GF_EVENT_CHILD_MODIFIED;
+ if (cnt == -1) {
+ gf_msg_debug(this->name, 0,
+ "got GF_EVENT_CHILD_DOWN bad "
+ "subvolume %s",
+ subvol->name);
break;
+ }
- case GF_EVENT_CHILD_CONNECTING:
- subvol = data;
+ LOCK(&conf->subvolume_lock);
+ {
+ conf->subvolume_status[cnt] = 0;
+ conf->last_event[cnt] = event;
+ conf->subvol_up_time[cnt] = 0;
+ }
+ UNLOCK(&conf->subvolume_lock);
- for (i = 0; i < conf->subvolume_cnt; i++) {
- if (subvol == conf->subvolumes[i]) {
- cnt = i;
- break;
- }
- }
+ for (i = 0; i < conf->subvolume_cnt; i++)
+ if (conf->last_event[i] != event)
+ event = GF_EVENT_SOME_DESCENDENT_DOWN;
+ break;
- if (cnt == -1) {
- gf_msg_debug (this->name, 0,
- "got GF_EVENT_CHILD_CONNECTING"
- " bad subvolume %s",
- subvol->name);
- break;
- }
+ case GF_EVENT_CHILD_CONNECTING:
+ subvol = data;
- LOCK (&conf->subvolume_lock);
- {
- conf->last_event[cnt] = event;
+ for (i = 0; i < conf->subvolume_cnt; i++) {
+ if (subvol == conf->subvolumes[i]) {
+ cnt = i;
+ break;
}
- UNLOCK (&conf->subvolume_lock);
+ }
+ if (cnt == -1) {
+ gf_msg_debug(this->name, 0,
+ "got GF_EVENT_CHILD_CONNECTING"
+ " bad subvolume %s",
+ subvol->name);
break;
- case GF_EVENT_VOLUME_DEFRAG:
- {
- if (!conf->defrag) {
- return ret;
- }
- defrag = conf->defrag;
+ }
- dict = data;
- va_start (ap, data);
- output = va_arg (ap, dict_t*);
+ LOCK(&conf->subvolume_lock);
+ {
+ conf->last_event[cnt] = event;
+ }
+ UNLOCK(&conf->subvolume_lock);
- ret = dict_get_int32 (dict, "rebalance-command",
- (int32_t*)&cmd);
- if (ret)
- return ret;
- LOCK (&defrag->lock);
- {
- if (defrag->is_exiting)
- goto unlock;
- if ((cmd == GF_DEFRAG_CMD_STATUS) ||
- (cmd == GF_DEFRAG_CMD_STATUS_TIER))
- gf_defrag_status_get (defrag, output);
- else if (cmd == GF_DEFRAG_CMD_START_DETACH_TIER)
- gf_defrag_start_detach_tier(defrag);
- else if (cmd == GF_DEFRAG_CMD_STOP ||
- cmd == GF_DEFRAG_CMD_STOP_DETACH_TIER)
- gf_defrag_stop (defrag,
- GF_DEFRAG_STATUS_STOPPED, output);
- else if (cmd == GF_DEFRAG_CMD_PAUSE_TIER)
- ret = gf_defrag_pause_tier (this, defrag);
- else if (cmd == GF_DEFRAG_CMD_RESUME_TIER)
- ret = gf_defrag_resume_tier (this, defrag);
- }
-unlock:
- UNLOCK (&defrag->lock);
+ break;
+ case GF_EVENT_VOLUME_DEFRAG: {
+ if (!conf->defrag) {
return ret;
- break;
- }
+ }
+ defrag = conf->defrag;
- default:
- propagate = 1;
- break;
- }
+ dict = data;
+ va_start(ap, data);
+ output = va_arg(ap, dict_t *);
+ ret = dict_get_int32(dict, "rebalance-command", (int32_t *)&cmd);
+ if (ret) {
+ va_end(ap);
+ return ret;
+ }
+ LOCK(&defrag->lock);
+ {
+ if (defrag->is_exiting)
+ goto unlock;
+ if ((cmd == GF_DEFRAG_CMD_STATUS) ||
+ (cmd == GF_DEFRAG_CMD_DETACH_STATUS))
+ gf_defrag_status_get(conf, output);
+ else if (cmd == GF_DEFRAG_CMD_DETACH_START)
+ defrag->cmd = GF_DEFRAG_CMD_DETACH_START;
+ else if (cmd == GF_DEFRAG_CMD_STOP ||
+ cmd == GF_DEFRAG_CMD_DETACH_STOP)
+ gf_defrag_stop(conf, GF_DEFRAG_STATUS_STOPPED, output);
+ }
+ unlock:
+ UNLOCK(&defrag->lock);
+ va_end(ap);
+ return ret;
+ break;
+ }
+ case GF_EVENT_UPCALL:
+ up_data = (struct gf_upcall *)data;
+ if (up_data->event_type != GF_UPCALL_CACHE_INVALIDATION)
+ break;
+ up_ci = (struct gf_upcall_cache_invalidation *)up_data->data;
+
+ /* Since md-cache will be aggressively filtering lookups,
+ * the stale layout issue will be more pronounced. Hence
+ * when a layout xattr is changed by the rebalance process
+ * notify all the md-cache clients to invalidate the existing
+ * stat cache and send the lookup next time*/
+ if (up_ci->dict && dict_get(up_ci->dict, conf->xattr_name))
+ up_ci->flags |= UP_EXPLICIT_LOOKUP;
+
+ /* TODO: Instead of invalidating iatt, update the new
+ * hashed/cached subvolume in dht inode_ctx */
+ if (IS_DHT_LINKFILE_MODE(&up_ci->stat))
+ up_ci->flags |= UP_EXPLICIT_LOOKUP;
+
+ propagate = 1;
+ break;
+ default:
+ propagate = 1;
+ break;
+ }
+
+ /* have all subvolumes reported status once by now? */
+ have_heard_from_all = 1;
+ for (i = 0; i < conf->subvolume_cnt; i++) {
+ if (!conf->last_event[i])
+ have_heard_from_all = 0;
+ }
+
+ /* if all subvols have reported status, no need to hide anything
+ or wait for anything else. Just propagate blindly */
+ if (have_heard_from_all) {
+ propagate = 1;
+ }
+
+ if (!had_heard_from_all && have_heard_from_all) {
+ static int run_defrag = 0;
+ /* This is the first event which completes aggregation
+ of events from all subvolumes. If at least one subvol
+ had come up, propagate CHILD_UP, but only this time
+ */
+ event = GF_EVENT_CHILD_DOWN;
- /* have all subvolumes reported status once by now? */
- have_heard_from_all = 1;
for (i = 0; i < conf->subvolume_cnt; i++) {
- if (!conf->last_event[i])
- have_heard_from_all = 0;
- }
-
- /* if all subvols have reported status, no need to hide anything
- or wait for anything else. Just propagate blindly */
- if (have_heard_from_all) {
- propagate = 1;
+ if (conf->last_event[i] == GF_EVENT_CHILD_UP) {
+ event = GF_EVENT_CHILD_UP;
+ break;
+ }
+ if (conf->last_event[i] == GF_EVENT_CHILD_CONNECTING) {
+ event = GF_EVENT_CHILD_CONNECTING;
+ /* continue to check other events for CHILD_UP */
+ }
}
-
- if (!had_heard_from_all && have_heard_from_all) {
- /* This is the first event which completes aggregation
- of events from all subvolumes. If at least one subvol
- had come up, propagate CHILD_UP, but only this time
- */
- event = GF_EVENT_CHILD_DOWN;
-
- for (i = 0; i < conf->subvolume_cnt; i++) {
- if (conf->last_event[i] == GF_EVENT_CHILD_UP) {
- event = GF_EVENT_CHILD_UP;
- break;
- }
-
- if (conf->last_event[i] == GF_EVENT_CHILD_CONNECTING) {
- event = GF_EVENT_CHILD_CONNECTING;
- /* continue to check other events for CHILD_UP */
- }
- }
-
- /* Rebalance is started with assert_no_child_down. So we do
- * not need to handle CHILD_DOWN event here.
- *
- * If there is a graph switch, we should not restart the
- * rebalance daemon. Use 'run_defrag' to indicate if the
- * thread has already started.
- */
- if (conf->defrag && !run_defrag) {
- if (methods->migration_needed(this)) {
- run_defrag = 1;
- ret = gf_thread_create(&conf->defrag->th,
- NULL,
- gf_defrag_start, this);
- if (ret) {
- GF_FREE (conf->defrag);
- conf->defrag = NULL;
- kill (getpid(), SIGTERM);
- }
- }
- }
+ /* Rebalance is started with assert_no_child_down. So we do
+ * not need to handle CHILD_DOWN event here.
+ *
+ * If there is a graph switch, we should not restart the
+ * rebalance daemon. Use 'run_defrag' to indicate if the
+ * thread has already started.
+ */
+ if (conf->defrag && !run_defrag) {
+ run_defrag = 1;
+ ret = gf_thread_create(&conf->defrag->th, NULL, gf_defrag_start,
+ this, "dhtdg");
+ if (ret) {
+ GF_FREE(conf->defrag);
+ conf->defrag = NULL;
+ kill(getpid(), SIGTERM);
+ }
}
+ }
- ret = 0;
- if (propagate)
- ret = default_notify (this, event, data);
+ ret = 0;
+ if (propagate)
+ ret = default_notify(this, event, data);
out:
- return ret;
+ return ret;
}
int
-dht_inode_ctx_layout_get (inode_t *inode, xlator_t *this, dht_layout_t **layout)
+dht_inode_ctx_layout_get(inode_t *inode, xlator_t *this, dht_layout_t **layout)
{
- dht_inode_ctx_t *ctx = NULL;
- int ret = -1;
+ dht_inode_ctx_t *ctx = NULL;
+ int ret = -1;
- ret = dht_inode_ctx_get (inode, this, &ctx);
+ ret = dht_inode_ctx_get(inode, this, &ctx);
- if (!ret && ctx) {
- if (ctx->layout) {
- if (layout)
- *layout = ctx->layout;
- ret = 0;
- } else {
- ret = -1;
- }
+ if (!ret && ctx) {
+ if (ctx->layout) {
+ if (layout)
+ *layout = ctx->layout;
+ ret = 0;
+ } else {
+ ret = -1;
}
+ }
- return ret;
+ return ret;
}
void
-dht_log_new_layout_for_dir_selfheal (xlator_t *this, loc_t *loc,
- dht_layout_t *layout)
+dht_log_new_layout_for_dir_selfheal(xlator_t *this, loc_t *loc,
+ dht_layout_t *layout)
{
+ char string[2048] = {0};
+ char *output_string = NULL;
+ int len = 0;
+ int off = 0;
+ int i = 0;
+ gf_loglevel_t log_level = gf_log_get_loglevel();
+ int ret = 0;
+
+ if (log_level < GF_LOG_INFO)
+ return;
- char string[2048] = {0};
- char *output_string = NULL;
- int len = 0;
- int off = 0;
- int i = 0;
- gf_loglevel_t log_level = gf_log_get_loglevel();
- int ret = 0;
- int max_string_len = 0;
-
- if (log_level < GF_LOG_INFO)
- return;
+ if (!layout)
+ return;
- if (!layout)
- return;
+ if (!layout->cnt)
+ return;
- if (!layout->cnt)
- return;
+ if (!loc)
+ return;
- if (!loc)
- return;
+ if (!loc->path)
+ return;
- if (!loc->path)
- return;
+ ret = snprintf(string, sizeof(string), "Setting layout of %s with ",
+ loc->path);
- max_string_len = sizeof (string);
+ if (ret < 0)
+ return;
- ret = snprintf (string, max_string_len, "Setting layout of %s with ",
- loc->path);
+ len += ret;
+
+ /* Calculation of total length of the string required to calloc
+ * output_string. Log includes subvolume-name, start-range, end-range
+ * and err value.
+ *
+ * This log will help to debug cases where:
+ * a) Different processes set different layout of a directory.
+ * b) Error captured in lookup, which will be filled in layout->err
+ * (like ENOENT, ESTALE etc)
+ */
+
+ for (i = 0; i < layout->cnt; i++) {
+ ret = snprintf(string, sizeof(string),
+ "[Subvol_name: %s, Err: %d , Start: "
+ "0x%x, Stop: 0x%x, Hash: 0x%x], ",
+ layout->list[i].xlator->name, layout->list[i].err,
+ layout->list[i].start, layout->list[i].stop,
+ layout->list[i].commit_hash);
if (ret < 0)
- return;
+ return;
len += ret;
+ }
- /* Calculation of total length of the string required to calloc
- * output_string. Log includes subvolume-name, start-range, end-range and
- * err value.
- *
- * This log will help to debug cases where:
- * a) Different processes set different layout of a directory.
- * b) Error captured in lookup, which will be filled in layout->err
- * (like ENOENT, ESTALE etc)
- */
-
- for (i = 0; i < layout->cnt; i++) {
+ len++;
- ret = snprintf (string, max_string_len,
- "[Subvol_name: %s, Err: %d , Start: "
- "%"PRIu32 " , Stop: %"PRIu32 " , Hash: %"
- PRIu32 " ], ",
- layout->list[i].xlator->name,
- layout->list[i].err, layout->list[i].start,
- layout->list[i].stop,
- layout->list[i].commit_hash);
+ output_string = GF_MALLOC(len + 1, gf_common_mt_char);
- if (ret < 0)
- return;
-
- len += ret;
-
- }
+ if (!output_string)
+ return;
- len++;
+ ret = snprintf(output_string, len + 1, "Setting layout of %s with ",
+ loc->path);
- output_string = GF_CALLOC (len, sizeof (char), gf_common_mt_char);
+ if (ret < 0)
+ goto err;
- if (!output_string)
- return;
+ off += ret;
- ret = snprintf (output_string, len, "Setting layout of %s with ",
- loc->path);
+ for (i = 0; i < layout->cnt; i++) {
+ ret = snprintf(output_string + off, len - off,
+ "[Subvol_name: %s, Err: %d , Start: "
+ "0x%x, Stop: 0x%x, Hash: 0x%x], ",
+ layout->list[i].xlator->name, layout->list[i].err,
+ layout->list[i].start, layout->list[i].stop,
+ layout->list[i].commit_hash);
if (ret < 0)
- goto err;
+ goto err;
off += ret;
+ }
+ gf_msg(this->name, GF_LOG_DEBUG, 0, DHT_MSG_LOG_FIXED_LAYOUT, "%s",
+ output_string);
- for (i = 0; i < layout->cnt; i++) {
-
- ret = snprintf (output_string + off, len - off,
- "[Subvol_name: %s, Err: %d , Start: "
- "%"PRIu32 " , Stop: %"PRIu32 " , Hash: %"
- PRIu32 " ], ",
- layout->list[i].xlator->name,
- layout->list[i].err, layout->list[i].start,
- layout->list[i].stop,
- layout->list[i].commit_hash);
+err:
+ GF_FREE(output_string);
+}
- if (ret < 0)
- goto err;
+int32_t
+dht_migration_get_dst_subvol(xlator_t *this, dht_local_t *local)
+{
+ int ret = -1;
- off += ret;
+ if (!local)
+ goto out;
- }
+ local->rebalance.target_node = dht_subvol_get_hashed(this, &local->loc);
- gf_msg (this->name, GF_LOG_INFO, 0, DHT_MSG_LOG_FIXED_LAYOUT,
- "%s", output_string);
+ if (local->rebalance.target_node)
+ ret = 0;
-err:
- GF_FREE (output_string);
+out:
+ return ret;
}
-int32_t dht_migration_get_dst_subvol(xlator_t *this, dht_local_t *local)
+/*
+This function should not be called more then once during a FOP
+handling path. It is valid only for for ops on files
+*/
+int32_t
+dht_set_local_rebalance(xlator_t *this, dht_local_t *local, struct iatt *stbuf,
+ struct iatt *prebuf, struct iatt *postbuf,
+ dict_t *xdata)
{
- int ret = -1;
+ if (!local)
+ return -1;
- if (!local)
- goto out;
+ if (local->rebalance.set) {
+ gf_msg(this->name, GF_LOG_WARNING, 0, DHT_MSG_REBAL_STRUCT_SET,
+ "local->rebalance already set");
+ }
- local->rebalance.target_node =
- dht_subvol_get_hashed (this, &local->loc);
+ if (stbuf)
+ memcpy(&local->rebalance.stbuf, stbuf, sizeof(struct iatt));
- if (local->rebalance.target_node)
- ret = 0;
+ if (prebuf)
+ memcpy(&local->rebalance.prebuf, prebuf, sizeof(struct iatt));
-out:
- return ret;
+ if (postbuf)
+ memcpy(&local->rebalance.postbuf, postbuf, sizeof(struct iatt));
+
+ if (xdata)
+ local->rebalance.xdata = dict_ref(xdata);
+
+ local->rebalance.set = 1;
+
+ return 0;
}
-int32_t dht_migration_needed(xlator_t *this)
+int32_t
+dht_release(xlator_t *this, fd_t *fd)
{
- gf_defrag_info_t *defrag = NULL;
- dht_conf_t *conf = NULL;
- int ret = 0;
+ return dht_fd_ctx_destroy(this, fd);
+}
- conf = this->private;
+static int
+dht_pt_mkdir_cbk(call_frame_t *frame, void *cookie, xlator_t *this, int op_ret,
+ int op_errno, inode_t *inode, struct iatt *stbuf,
+ struct iatt *preparent, struct iatt *postparent, dict_t *xdata)
+{
+ dht_local_t *local = NULL;
- GF_VALIDATE_OR_GOTO ("dht", conf, out);
- GF_VALIDATE_OR_GOTO ("dht", conf->defrag, out);
+ local = frame->local;
- defrag = conf->defrag;
+ if (!op_ret) {
+ dht_layout_set(this, inode, local->layout);
+ }
- if ((defrag->cmd != GF_DEFRAG_CMD_START_TIER) &&
- (defrag->cmd != GF_DEFRAG_CMD_START_DETACH_TIER))
- ret = 1;
+ DHT_STACK_UNWIND(mkdir, frame, op_ret, op_errno, inode, stbuf, preparent,
+ postparent, NULL);
-out:
- return ret;
+ return 0;
}
+int32_t
+dht_pt_mkdir(call_frame_t *frame, xlator_t *this, loc_t *loc, mode_t mode,
+ mode_t umask, dict_t *xdata)
+{
+ dht_layout_t *layout = NULL;
+ dht_conf_t *conf = NULL;
+ dht_local_t *local = NULL;
+ bool free_xdata = false;
+ int ret = 0;
+ int op_errno = 0;
+ int32_t *disk_layout_p = NULL;
+
+ conf = this->private;
+
+ local = dht_local_init(frame, loc, NULL, GF_FOP_MKDIR);
+ if (!local) {
+ op_errno = ENOMEM;
+ goto err;
+ }
+
+ layout = dht_layout_new(this, conf->subvolume_cnt);
+ if (!layout)
+ goto wind;
+
+ local->layout = layout;
+
+ if (!xdata) {
+ xdata = dict_new();
+ if (!xdata)
+ goto wind;
+ free_xdata = true;
+ }
+
+ /*Set the xlator or the following will crash*/
+ layout->list[0].xlator = conf->subvolumes[0];
+
+ dht_selfheal_layout_new_directory(frame, loc, layout);
+
+ dht_disk_layout_extract(this, layout, 0, &disk_layout_p);
+
+ ret = dict_set_bin(xdata, conf->xattr_name, disk_layout_p, 4 * 4);
+ if (ret) {
+ gf_msg("dht", GF_LOG_DEBUG, EINVAL, DHT_MSG_DICT_SET_FAILED,
+ "dht layout dict set failed");
+ }
+wind:
+ STACK_WIND(frame, dht_pt_mkdir_cbk, FIRST_CHILD(this),
+ FIRST_CHILD(this)->fops->mkdir, loc, mode, umask, xdata);
+ if (free_xdata)
+ dict_unref(xdata);
+ return 0;
+err:
+ op_errno = local ? local->op_errno : op_errno;
+ DHT_STACK_UNWIND(mkdir, frame, -1, op_errno, NULL, NULL, NULL, NULL, NULL);
-/*
-This function should not be called more then once during a FOP
-handling path. It is valid only for for ops on files
-*/
-int32_t dht_set_local_rebalance (xlator_t *this, dht_local_t *local,
- struct iatt *stbuf,
- struct iatt *prebuf, struct iatt *postbuf,
- dict_t *xdata)
+ return 0;
+}
+
+static int
+dht_pt_getxattr_cbk(call_frame_t *frame, void *cookie, xlator_t *this,
+ int op_ret, int op_errno, dict_t *xattr, dict_t *xdata)
{
+ dht_conf_t *conf = NULL;
- if (!local)
- return -1;
+ conf = this->private;
+ dict_del(xattr, conf->xattr_name);
+ dict_del(xattr, conf->mds_xattr_key);
+ dict_del(xattr, conf->commithash_xattr_name);
- if (local->rebalance.set) {
- gf_msg (this->name, GF_LOG_WARNING, 0,
- DHT_MSG_REBAL_STRUCT_SET,
- "local->rebalance already set");
- }
+ if (frame->root->pid >= 0) {
+ GF_REMOVE_INTERNAL_XATTR("trusted.glusterfs.quota*", xattr);
+ GF_REMOVE_INTERNAL_XATTR("trusted.pgfid*", xattr);
+ }
+ DHT_STACK_UNWIND(getxattr, frame, op_ret, op_errno, xattr, xdata);
+ return 0;
+}
- if (stbuf)
- memcpy (&local->rebalance.stbuf, stbuf, sizeof (struct iatt));
+int
+dht_pt_getxattr(call_frame_t *frame, xlator_t *this, loc_t *loc,
+ const char *key, dict_t *xdata)
+{
+ STACK_WIND(frame, dht_pt_getxattr_cbk, FIRST_CHILD(this),
+ FIRST_CHILD(this)->fops->getxattr, loc, key, xdata);
+ return 0;
+}
- if (prebuf)
- memcpy (&local->rebalance.prebuf, prebuf, sizeof (struct iatt));
+static int
+dht_pt_fgetxattr_cbk(call_frame_t *frame, void *cookie, xlator_t *this,
+ int op_ret, int op_errno, dict_t *xattr, dict_t *xdata)
+{
+ dht_conf_t *conf = NULL;
- if (postbuf)
- memcpy (&local->rebalance.postbuf, postbuf,
- sizeof (struct iatt));
+ conf = this->private;
+ dict_del(xattr, conf->xattr_name);
- if (xdata)
- local->rebalance.xdata = dict_ref (xdata);
+ if (frame->root->pid >= 0) {
+ GF_REMOVE_INTERNAL_XATTR("trusted.glusterfs.quota*", xattr);
+ GF_REMOVE_INTERNAL_XATTR("trusted.pgfid*", xattr);
+ }
- local->rebalance.set = 1;
-
- return 0;
+ DHT_STACK_UNWIND(fgetxattr, frame, op_ret, op_errno, xattr, xdata);
+ return 0;
}
-gf_boolean_t
-dht_is_tier_xlator (xlator_t *this)
+int
+dht_pt_fgetxattr(call_frame_t *frame, xlator_t *this, fd_t *fd, const char *key,
+ dict_t *xdata)
{
-
- if (strcmp (this->type, "cluster/tier") == 0)
- return _gf_true;
- return _gf_false;
+ STACK_WIND(frame, dht_pt_fgetxattr_cbk, FIRST_CHILD(this),
+ FIRST_CHILD(this)->fops->fgetxattr, fd, key, xdata);
+ return 0;
}
-int32_t
-dht_release (xlator_t *this, fd_t *fd)
+/* The job of this function is to check if all the xlators have updated
+ * error in the layout. */
+int
+dht_dir_layout_error_check(xlator_t *this, inode_t *inode)
{
- return dht_fd_ctx_destroy (this, fd);
+ dht_layout_t *layout = NULL;
+ int i = 0;
+
+ layout = dht_layout_get(this, inode);
+ for (i = 0; i < layout->cnt; i++) {
+ if (layout->list[i].err == 0) {
+ return 0;
+ }
+ }
+
+ /* Returning the first xlator error as all xlators have errors */
+ return layout->list[0].err;
}
diff --git a/xlators/cluster/dht/src/dht-common.h b/xlators/cluster/dht/src/dht-common.h
index d9b52e28bb9..fe0dc3db34a 100644
--- a/xlators/cluster/dht/src/dht-common.h
+++ b/xlators/cluster/dht/src/dht-common.h
@@ -9,120 +9,148 @@
*/
#include <regex.h>
-#include <signal.h>
#include "dht-mem-types.h"
#include "dht-messages.h"
+#include <glusterfs/call-stub.h>
#include "libxlator.h"
-#include "syncop.h"
-#include "refcount.h"
-#include "timer.h"
+#include <glusterfs/syncop.h>
+#include <glusterfs/refcount.h>
+#include <glusterfs/timer.h>
+#include "protocol-common.h"
+#include <glusterfs/glusterfs-acl.h>
#ifndef _DHT_H
#define _DHT_H
-#define GF_XATTR_FIX_LAYOUT_KEY "distribute.fix.layout"
-#define GF_XATTR_FILE_MIGRATE_KEY "trusted.distribute.migrate-data"
-#define GF_DHT_LOOKUP_UNHASHED_ON 1
+#define GF_XATTR_FIX_LAYOUT_KEY "distribute.fix.layout"
+#define GF_XATTR_FILE_MIGRATE_KEY "trusted.distribute.migrate-data"
+#define DHT_MDS_STR "mds"
+#define GF_DHT_LOOKUP_UNHASHED_OFF 0
+#define GF_DHT_LOOKUP_UNHASHED_ON 1
#define GF_DHT_LOOKUP_UNHASHED_AUTO 2
-#define DHT_PATHINFO_HEADER "DISTRIBUTE:"
-#define DHT_FILE_MIGRATE_DOMAIN "dht.file.migrate"
-#define DHT_LAYOUT_HEAL_DOMAIN "dht.layout.heal"
-#define TIERING_MIGRATION_KEY "tiering.migration"
-#define DHT_LAYOUT_HASH_INVALID 1
+#define DHT_PATHINFO_HEADER "DISTRIBUTE:"
+#define DHT_FILE_MIGRATE_DOMAIN "dht.file.migrate"
+/* Layout synchronization */
+#define DHT_LAYOUT_HEAL_DOMAIN "dht.layout.heal"
+/* Namespace synchronization */
+#define DHT_ENTRY_SYNC_DOMAIN "dht.entry.sync"
+#define DHT_LAYOUT_HASH_INVALID 1
+#define MAX_REBAL_THREADS sysconf(_SC_NPROCESSORS_ONLN)
-#include <fnmatch.h>
+#define DHT_DIR_STAT_BLOCKS 8
+#define DHT_DIR_STAT_SIZE 4096
-typedef int (*dht_selfheal_dir_cbk_t) (call_frame_t *frame, void *cookie,
- xlator_t *this,
- int32_t op_ret, int32_t op_errno,
- dict_t *xdata);
-typedef int (*dht_defrag_cbk_fn_t) (xlator_t *this, xlator_t *dst_node,
- call_frame_t *frame, int ret);
+/* Virtual xattr for subvols status */
-typedef int (*dht_refresh_layout_unlock) (call_frame_t *frame, xlator_t *this,
- int op_ret);
+#define DHT_SUBVOL_STATUS_KEY "dht.subvol.status"
-typedef int (*dht_refresh_layout_done_handle) (call_frame_t *frame);
+/* Virtual xattrs for debugging */
+
+#define DHT_DBG_HASHED_SUBVOL_PATTERN "dht.file.hashed-subvol.*"
+#define DHT_DBG_HASHED_SUBVOL_KEY "dht.file.hashed-subvol."
+
+/* Rebalance nodeuuid flags */
+#define REBAL_NODEUUID_MINE 0x01
+
+typedef int (*dht_selfheal_dir_cbk_t)(call_frame_t *frame, void *cookie,
+ xlator_t *this, int32_t op_ret,
+ int32_t op_errno, dict_t *xdata);
+typedef int (*dht_defrag_cbk_fn_t)(xlator_t *this, xlator_t *dst_node,
+ call_frame_t *frame, int ret);
+
+typedef int (*dht_refresh_layout_unlock)(call_frame_t *frame, xlator_t *this,
+ int op_ret, int invoke_cbk);
+
+typedef int (*dht_refresh_layout_done_handle)(call_frame_t *frame);
struct dht_layout {
- int spread_cnt; /* layout spread count per directory,
- is controlled by 'setxattr()' with
- special key */
- int cnt;
- int preset;
- /*
- * The last *configuration* state for which this directory was known
- * to be in balance. The corresponding vol_commit_hash changes
- * whenever bricks are added or removed. This value changes when a
- * (full) rebalance is complete. If they match, it's safe to assume
- * that every file is where it should be and there's no need to do
- * lookups for files elsewhere. If they don't, then we have to do a
- * global lookup to be sure.
- */
- uint32_t commit_hash;
- /*
- * The *runtime* state of the volume, changes when connections to
- * bricks are made or lost.
- */
- int gen;
- int type;
- int ref; /* use with dht_conf_t->layout_lock */
- gf_boolean_t search_unhashed;
- struct {
- int err; /* 0 = normal
- -1 = dir exists and no xattr
- >0 = dir lookup failed with errno
- */
- uint32_t start;
- uint32_t stop;
- uint32_t commit_hash;
- xlator_t *xlator;
- } list[];
+ int spread_cnt; /* layout spread count per directory,
+ is controlled by 'setxattr()' with
+ special key */
+ int cnt;
+ int preset;
+ /*
+ * The last *configuration* state for which this directory was known
+ * to be in balance. The corresponding vol_commit_hash changes
+ * whenever bricks are added or removed. This value changes when a
+ * (full) rebalance is complete. If they match, it's safe to assume
+ * that every file is where it should be and there's no need to do
+ * lookups for files elsewhere. If they don't, then we have to do a
+ * global lookup to be sure.
+ */
+ uint32_t commit_hash;
+ /*
+ * The *runtime* state of the volume, changes when connections to
+ * bricks are made or lost.
+ */
+ int gen;
+ int type;
+ gf_atomic_t ref; /* use with dht_conf_t->layout_lock */
+ uint32_t search_unhashed;
+ struct {
+ int err; /* 0 = normal
+ -1 = dir exists and no xattr
+ >0 = dir lookup failed with errno
+ */
+ uint32_t start;
+ uint32_t stop;
+ uint32_t commit_hash;
+ xlator_t *xlator;
+ } list[];
};
-typedef struct dht_layout dht_layout_t;
+typedef struct dht_layout dht_layout_t;
struct dht_stat_time {
- uint32_t atime;
- uint32_t atime_nsec;
- uint32_t ctime;
- uint32_t ctime_nsec;
- uint32_t mtime;
- uint32_t mtime_nsec;
+ uint32_t atime;
+ uint32_t atime_nsec;
+ uint32_t ctime;
+ uint32_t ctime_nsec;
+ uint32_t mtime;
+ uint32_t mtime_nsec;
};
typedef struct dht_stat_time dht_stat_time_t;
struct dht_inode_ctx {
- dht_layout_t *layout;
- dht_stat_time_t time;
+ dht_layout_t *layout;
+ dht_stat_time_t time;
+ xlator_t *lock_subvol;
+ xlator_t *mds_subvol; /* This is only used for directories */
};
typedef struct dht_inode_ctx dht_inode_ctx_t;
-
typedef enum {
- DHT_HASH_TYPE_DM,
- DHT_HASH_TYPE_DM_USER,
+ DHT_HASH_TYPE_DM,
+ DHT_HASH_TYPE_DM_USER,
} dht_hashfn_type_t;
+typedef enum {
+ DHT_INODELK,
+ DHT_ENTRYLK,
+} dht_lock_type_t;
+
/* rebalance related */
struct dht_rebalance_ {
- xlator_t *from_subvol;
- xlator_t *target_node;
- off_t offset;
- size_t size;
- int32_t flags;
- int count;
- struct iobref *iobref;
- struct iovec *vector;
- struct iatt stbuf;
- struct iatt prebuf;
- struct iatt postbuf;
- dht_defrag_cbk_fn_t target_op_fn;
- dict_t *xdata;
- dict_t *xattr;
- int32_t set;
+ xlator_t *from_subvol;
+ xlator_t *target_node;
+ off_t offset;
+ size_t size;
+ int32_t flags;
+ int count;
+ struct iobref *iobref;
+ struct iovec *vector;
+ struct iatt stbuf;
+ struct iatt prebuf;
+ struct iatt postbuf;
+ dht_defrag_cbk_fn_t target_op_fn;
+ dict_t *xdata;
+ dict_t *xattr;
+ dict_t *dict;
+ struct gf_flock flock;
+ int32_t set;
+ int lock_cmd;
};
/**
@@ -130,1041 +158,1227 @@ struct dht_rebalance_ {
* events
**/
typedef enum {
- qdstatfs_action_OFF = 0,
- qdstatfs_action_REPLACE,
- qdstatfs_action_NEGLECT,
- qdstatfs_action_COMPARE,
+ qdstatfs_action_OFF = 0,
+ qdstatfs_action_REPLACE,
+ qdstatfs_action_NEGLECT,
+ qdstatfs_action_COMPARE,
} qdstatfs_action_t;
-struct dht_skip_linkto_unlink {
+typedef enum {
+ REACTION_INVALID,
+ FAIL_ON_ANY_ERROR,
+ IGNORE_ENOENT_ESTALE,
+ IGNORE_ENOENT_ESTALE_EIO,
+} dht_reaction_type_t;
- gf_boolean_t handle_valid_link;
- int opend_fd_count;
- xlator_t *hash_links_to;
- uuid_t cached_gfid;
- uuid_t hashed_gfid;
+struct dht_skip_linkto_unlink {
+ xlator_t *hash_links_to;
+ uuid_t cached_gfid;
+ uuid_t hashed_gfid;
+ int opend_fd_count;
+ gf_boolean_t handle_valid_link;
};
typedef struct {
- xlator_t *xl;
- loc_t loc; /* contains/points to inode to lock on. */
- short type; /* read/write lock. */
- char *domain; /* Only locks within a single domain
- * contend with each other
- */
- gf_lkowner_t lk_owner;
- gf_boolean_t locked;
+ xlator_t *xl;
+ loc_t loc; /* contains/points to inode to lock on. */
+ char *domain; /* Only locks within a single domain
+ * contend with each other
+ */
+ char *basename; /* Required for entrylk */
+ gf_boolean_t locked;
+ dht_reaction_type_t do_on_failure;
+ short type; /* read/write lock. */
+ gf_lkowner_t lk_owner;
} dht_lock_t;
-typedef
-int (*dht_selfheal_layout_t)(call_frame_t *frame, loc_t *loc,
- dht_layout_t *layout);
-
-typedef
-gf_boolean_t (*dht_need_heal_t)(call_frame_t *frame, dht_layout_t **inmem,
- dht_layout_t **ondisk);
-
-struct dht_local {
- int call_cnt;
- loc_t loc;
- loc_t loc2;
- int op_ret;
- int op_errno;
- int layout_mismatch;
- /* Use stbuf as the postbuf, when we require both
- * pre and post attrs */
- struct iatt stbuf;
- struct iatt prebuf;
- struct iatt preoldparent;
- struct iatt postoldparent;
- struct iatt preparent;
- struct iatt postparent;
- struct statvfs statvfs;
- fd_t *fd;
- inode_t *inode;
- dict_t *params;
- dict_t *xattr;
- dict_t *xattr_req;
- dht_layout_t *layout;
- size_t size;
- ino_t ia_ino;
- xlator_t *src_hashed, *src_cached;
- xlator_t *dst_hashed, *dst_cached;
- xlator_t *cached_subvol;
- xlator_t *hashed_subvol;
- char need_selfheal;
- int file_count;
- int dir_count;
- call_frame_t *main_frame;
- int fop_succeeded;
- struct {
- fop_mknod_cbk_t linkfile_cbk;
- struct iatt stbuf;
- loc_t loc;
- inode_t *inode;
- dict_t *xattr;
- xlator_t *srcvol;
- } linkfile;
- struct {
- uint32_t hole_cnt;
- uint32_t overlaps_cnt;
- uint32_t down;
- uint32_t misc;
- dht_selfheal_dir_cbk_t dir_cbk;
- dht_selfheal_layout_t healer;
- dht_need_heal_t should_heal;
- gf_boolean_t force_mkdir;
- dht_layout_t *layout, *refreshed_layout;
- } selfheal;
-
- dht_refresh_layout_unlock refresh_layout_unlock;
- dht_refresh_layout_done_handle refresh_layout_done;
-
- uint32_t uid;
- uint32_t gid;
-
- /* needed by nufa */
- int32_t flags;
- mode_t mode;
- dev_t rdev;
- mode_t umask;
-
- /* need for file-info */
- char *xattr_val;
- char *key;
-
- /* which xattr request? */
- char xsel[256];
- int32_t alloc_len;
-
- char *newpath;
-
- /* gfid related */
- uuid_t gfid;
-
- /* flag used to make sure we need to return estale in
- {lookup,revalidate}_cbk */
- char return_estale;
- char need_lookup_everywhere;
-
- glusterfs_fop_t fop;
-
- gf_boolean_t linked;
- xlator_t *link_subvol;
-
- struct dht_rebalance_ rebalance;
- xlator_t *first_up_subvol;
+/* The lock structure represents inodelk. */
+typedef struct {
+ fop_inodelk_cbk_t inodelk_cbk;
+ dht_lock_t **locks;
+ int lk_count;
+ dht_reaction_type_t reaction;
- gf_boolean_t quota_deem_statfs;
+ /* whether locking failed on _any_ of the "locks" above */
+ int op_ret;
+ int op_errno;
+} dht_ilock_wrap_t;
- gf_boolean_t added_link;
- gf_boolean_t is_linkfile;
+/* The lock structure represents entrylk. */
+typedef struct {
+ fop_entrylk_cbk_t entrylk_cbk;
+ dht_lock_t **locks;
+ int lk_count;
+ dht_reaction_type_t reaction;
+
+ /* whether locking failed on _any_ of the "locks" above */
+ int op_ret;
+ int op_errno;
+} dht_elock_wrap_t;
+
+/* The first member of dht_dir_transaction_t should be of type dht_ilock_wrap_t.
+ * Otherwise it can result in subtle memory corruption issues as in most of the
+ * places we use lock[0].layout.my_layout or lock[0].layout.parent_layout and
+ * lock[0].ns.parent_layout (like in dht_local_wipe).
+ */
+typedef union {
+ union {
+ dht_ilock_wrap_t my_layout;
+ dht_ilock_wrap_t parent_layout;
+ } layout;
+ struct dht_namespace {
+ dht_ilock_wrap_t parent_layout;
+ dht_elock_wrap_t directory_ns;
+ fop_entrylk_cbk_t ns_cbk;
+ } ns;
+} dht_dir_transaction_t;
+
+typedef int (*dht_selfheal_layout_t)(call_frame_t *frame, loc_t *loc,
+ dht_layout_t *layout);
- struct dht_skip_linkto_unlink skip_unlink;
+typedef gf_boolean_t (*dht_need_heal_t)(call_frame_t *frame,
+ dht_layout_t **inmem,
+ dht_layout_t **ondisk);
- struct {
- fop_inodelk_cbk_t inodelk_cbk;
- dht_lock_t **locks;
- int lk_count;
-
- /* whether locking failed on _any_ of the "locks" above */
- int op_ret;
- int op_errno;
- } lock;
+struct dht_local {
+ loc_t loc;
+ loc_t loc2;
+ int call_cnt;
+ int op_ret;
+ int op_errno;
+ int layout_mismatch;
+ /* Use stbuf as the postbuf, when we require both
+ * pre and post attrs */
+ struct iatt stbuf;
+ struct iatt mds_stbuf;
+ struct iatt prebuf;
+ struct iatt preoldparent;
+ struct iatt postoldparent;
+ struct iatt preparent;
+ struct iatt postparent;
+ struct statvfs statvfs;
+ fd_t *fd;
+ inode_t *inode;
+ dict_t *params;
+ dict_t *xattr;
+ dict_t *mds_xattr;
+ dict_t *xdata; /* dict used to save xdata response by xattr fop */
+ dict_t *xattr_req;
+ dht_layout_t *layout;
+ size_t size;
+ ino_t ia_ino;
+ xlator_t *src_hashed, *src_cached;
+ xlator_t *dst_hashed, *dst_cached;
+ xlator_t *cached_subvol;
+ xlator_t *hashed_subvol;
+ xlator_t *mds_subvol; /* This is use for dir only */
+ int file_count;
+ int dir_count;
+ call_frame_t *main_frame;
+ int fop_succeeded;
+ struct {
+ fop_mknod_cbk_t linkfile_cbk;
+ struct iatt stbuf;
+ loc_t loc;
+ inode_t *inode;
+ dict_t *xattr;
+ xlator_t *srcvol;
+ } linkfile;
+ struct {
+ uint32_t hole_cnt;
+ uint32_t overlaps_cnt;
+ uint32_t down;
+ uint32_t misc;
+ dht_selfheal_dir_cbk_t dir_cbk;
+ dht_selfheal_layout_t healer;
+ dht_need_heal_t should_heal;
+ dht_layout_t *layout, *refreshed_layout;
+ uint32_t missing_cnt;
+ gf_boolean_t force_mkdir;
+ } selfheal;
+
+ dht_refresh_layout_unlock refresh_layout_unlock;
+ dht_refresh_layout_done_handle refresh_layout_done;
+
+ uint32_t uid;
+ uint32_t gid;
+ pid_t pid;
+
+ glusterfs_fop_t fop;
+
+ /* need for file-info */
+ char *xattr_val;
+ char *key;
+
+ /* needed by nufa */
+ int32_t flags;
+ mode_t mode;
+ dev_t rdev;
+ mode_t umask;
+
+ /* which xattr request? */
+ char xsel[256];
+ int32_t alloc_len;
+
+ /* gfid related */
+ uuid_t gfid;
+ uuid_t gfid_req;
+
+ xlator_t *link_subvol;
+
+ struct dht_rebalance_ rebalance;
+ xlator_t *first_up_subvol;
+
+ struct dht_skip_linkto_unlink skip_unlink;
+
+ dht_dir_transaction_t lock[2], *current;
+
+ /* inodelks during filerename for backward compatibility */
+ dht_lock_t **rename_inodelk_backward_compatible;
+
+ call_stub_t *stub;
+ int32_t parent_disk_layout[4];
+
+ /* rename rollback */
+ int *ret_cache;
+
+ loc_t loc2_copy;
+
+ int rename_inodelk_bc_count;
+ /* This is use only for directory operation */
+ int32_t valid;
+ int32_t mds_heal_fresh_lookup;
+ short lock_type;
+ char need_selfheal;
+ char need_xattr_heal;
+ char need_attrheal;
+ /* flag used to make sure we need to return estale in
+ {lookup,revalidate}_cbk */
+ char return_estale;
+ char need_lookup_everywhere;
+ /* fd open check */
+ gf_boolean_t fd_checked;
+ gf_boolean_t linked;
+ gf_boolean_t added_link;
+ gf_boolean_t is_linkfile;
+ gf_boolean_t quota_deem_statfs;
+ gf_boolean_t heal_layout;
+ gf_boolean_t locked;
+ gf_boolean_t dont_create_linkto;
+ gf_boolean_t gfid_missing;
};
typedef struct dht_local dht_local_t;
/* du - disk-usage */
struct dht_du {
- double avail_percent;
- double avail_inodes;
- uint64_t avail_space;
- uint32_t log;
- uint32_t chunks;
+ double avail_percent;
+ double avail_inodes;
+ uint64_t avail_space;
+ uint32_t log;
+ uint32_t chunks;
+ uint32_t total_blocks;
+ uint32_t avail_blocks;
+ uint32_t frsize; /*fragment size*/
};
typedef struct dht_du dht_du_t;
enum gf_defrag_type {
- GF_DEFRAG_CMD_START = 1,
- GF_DEFRAG_CMD_STOP = 1 + 1,
- GF_DEFRAG_CMD_STATUS = 1 + 2,
- GF_DEFRAG_CMD_START_LAYOUT_FIX = 1 + 3,
- GF_DEFRAG_CMD_START_FORCE = 1 + 4,
- GF_DEFRAG_CMD_START_TIER = 1 + 5,
- GF_DEFRAG_CMD_STATUS_TIER = 1 + 6,
- GF_DEFRAG_CMD_START_DETACH_TIER = 1 + 7,
- GF_DEFRAG_CMD_STOP_DETACH_TIER = 1 + 8,
- GF_DEFRAG_CMD_PAUSE_TIER = 1 + 9,
- GF_DEFRAG_CMD_RESUME_TIER = 1 + 10,
+ GF_DEFRAG_CMD_NONE = 0,
+ GF_DEFRAG_CMD_START = 1,
+ GF_DEFRAG_CMD_STOP = 1 + 1,
+ GF_DEFRAG_CMD_STATUS = 1 + 2,
+ GF_DEFRAG_CMD_START_LAYOUT_FIX = 1 + 3,
+ GF_DEFRAG_CMD_START_FORCE = 1 + 4,
+ GF_DEFRAG_CMD_DETACH_STATUS = 1 + 11,
+ GF_DEFRAG_CMD_DETACH_START = 1 + 13,
+ GF_DEFRAG_CMD_DETACH_COMMIT = 1 + 14,
+ GF_DEFRAG_CMD_DETACH_COMMIT_FORCE = 1 + 15,
+ GF_DEFRAG_CMD_DETACH_STOP = 1 + 16,
+ /* new labels are used so it will help
+ * while removing old labels by easily differentiating.
+ * A few labels are added so that the count remains same
+ * between this enum and the ones on the xdr file.
+ * different values for the same enum cause errors and
+ * confusion.
+ */
};
typedef enum gf_defrag_type gf_defrag_type;
enum gf_defrag_status_t {
- GF_DEFRAG_STATUS_NOT_STARTED,
- GF_DEFRAG_STATUS_STARTED,
- GF_DEFRAG_STATUS_STOPPED,
- GF_DEFRAG_STATUS_COMPLETE,
- GF_DEFRAG_STATUS_FAILED,
- GF_DEFRAG_STATUS_LAYOUT_FIX_STARTED,
- GF_DEFRAG_STATUS_LAYOUT_FIX_STOPPED,
- GF_DEFRAG_STATUS_LAYOUT_FIX_COMPLETE,
- GF_DEFRAG_STATUS_LAYOUT_FIX_FAILED,
+ GF_DEFRAG_STATUS_NOT_STARTED,
+ GF_DEFRAG_STATUS_STARTED,
+ GF_DEFRAG_STATUS_STOPPED,
+ GF_DEFRAG_STATUS_COMPLETE,
+ GF_DEFRAG_STATUS_FAILED,
+ GF_DEFRAG_STATUS_LAYOUT_FIX_STARTED,
+ GF_DEFRAG_STATUS_LAYOUT_FIX_STOPPED,
+ GF_DEFRAG_STATUS_LAYOUT_FIX_COMPLETE,
+ GF_DEFRAG_STATUS_LAYOUT_FIX_FAILED,
};
typedef enum gf_defrag_status_t gf_defrag_status_t;
typedef struct gf_defrag_pattern_list gf_defrag_pattern_list_t;
struct gf_defrag_pattern_list {
- char path_pattern[256];
- uint64_t size;
- gf_defrag_pattern_list_t *next;
+ char path_pattern[256];
+ uint64_t size;
+ gf_defrag_pattern_list_t *next;
};
struct dht_container {
- union {
- struct list_head list;
- struct {
- struct _gf_dirent_t *next;
- struct _gf_dirent_t *prev;
- };
+ union {
+ struct list_head list;
+ struct {
+ struct _gf_dirent_t *next;
+ struct _gf_dirent_t *prev;
};
- gf_dirent_t *df_entry;
- xlator_t *this;
- loc_t *parent_loc;
- dict_t *migrate_data;
+ };
+ gf_dirent_t *df_entry;
+ xlator_t *this;
+ loc_t *parent_loc;
+ dict_t *migrate_data;
+ int local_subvol_index;
};
-typedef enum tier_mode_ {
- TIER_MODE_NONE = 0,
- TIER_MODE_TEST,
- TIER_MODE_WM
-} tier_mode_t;
-
-typedef enum tier_pause_state_ {
- TIER_RUNNING = 0,
- TIER_REQUEST_PAUSE,
- TIER_PAUSED
-} tier_pause_state_t;
-
-typedef struct gf_tier_conf {
- int is_tier;
- int watermark_hi;
- int watermark_low;
- int watermark_last;
- fsblkcnt_t blocks_total;
- fsblkcnt_t blocks_used;
- int percent_full;
- uint64_t max_migrate_bytes;
- int max_migrate_files;
- tier_mode_t mode;
- int tier_promote_frequency;
- int tier_demote_frequency;
- uint64_t st_last_promoted_size;
- uint64_t st_last_demoted_size;
- tier_pause_state_t pause_state;
- struct synctask *pause_synctask;
- gf_timer_t *pause_timer;
- pthread_mutex_t pause_mutex;
- int promote_in_progress;
- int demote_in_progress;
-} gf_tier_conf_t;
+typedef struct nodeuuid_info {
+ char info; /* Set to 1 is this is my node's uuid*/
+ uuid_t uuid; /* Store the nodeuuid as well for debugging*/
+} nodeuuid_info_t;
+
+typedef struct subvol_nodeuuids_info {
+ nodeuuid_info_t *elements;
+ int count;
+} subvol_nodeuuids_info_t;
struct gf_defrag_info_ {
- uint64_t total_files;
- uint64_t total_data;
- uint64_t num_files_lookedup;
- uint64_t total_failures;
- uint64_t skipped;
- gf_lock_t lock;
- int cmd;
- pthread_t th;
- gf_defrag_status_t defrag_status;
- struct rpc_clnt *rpc;
- uint32_t connected;
- uint32_t is_exiting;
- pid_t pid;
- inode_t *root_inode;
- uuid_t node_uuid;
- struct timeval start_time;
- gf_boolean_t stats;
- uint32_t new_commit_hash;
- gf_defrag_pattern_list_t *defrag_pattern;
- gf_tier_conf_t tier_conf;
-
- /*Data Tiering params for scanner*/
- uint64_t total_files_promoted;
- uint64_t total_files_demoted;
- int write_freq_threshold;
- int read_freq_threshold;
-
- pthread_cond_t parallel_migration_cond;
- pthread_mutex_t dfq_mutex;
- pthread_cond_t rebalance_crawler_alarm;
- int32_t q_entry_count;
- int32_t global_error;
- struct dht_container *queue;
- int32_t crawl_done;
- int32_t abort;
- int32_t wakeup_crawler;
-
- /*Throttle params*/
- /*stands for reconfigured thread count*/
- int32_t recon_thread_count;
- /*stands for current running thread count*/
- int32_t current_thread_count;
- pthread_cond_t df_wakeup_thread;
-
- /* Hard link handle requirement */
- synclock_t link_lock;
+ uint64_t total_files;
+ uint64_t total_data;
+ uint64_t num_files_lookedup;
+ uint64_t total_failures;
+ uint64_t skipped;
+ uint64_t num_dirs_processed;
+ uint64_t size_processed;
+ gf_lock_t lock;
+ pthread_t th;
+ struct rpc_clnt *rpc;
+ uint32_t connected;
+ uint32_t is_exiting;
+ pid_t pid;
+ int cmd;
+ inode_t *root_inode;
+ uuid_t node_uuid;
+ time_t start_time;
+ uint32_t new_commit_hash;
+ gf_defrag_status_t defrag_status;
+ gf_defrag_pattern_list_t *defrag_pattern;
+
+ pthread_cond_t parallel_migration_cond;
+ pthread_mutex_t dfq_mutex;
+ pthread_cond_t rebalance_crawler_alarm;
+ int32_t q_entry_count;
+ int32_t global_error;
+ struct dht_container *queue;
+ int32_t crawl_done;
+ int32_t abort;
+ int32_t wakeup_crawler;
+
+ /*Throttle params*/
+ /*stands for reconfigured thread count*/
+ int32_t recon_thread_count;
+ pthread_cond_t df_wakeup_thread;
+
+ /* backpointer to make it easier to write functions for rebalance */
+ xlator_t *this;
+
+ pthread_cond_t fc_wakeup_cond;
+ pthread_mutex_t fc_mutex;
+
+ /*stands for current running thread count*/
+ int32_t current_thread_count;
+
+ gf_boolean_t stats;
+ /* lock migration flag */
+ gf_boolean_t lock_migration_enabled;
};
typedef struct gf_defrag_info_ gf_defrag_info_t;
struct dht_methods_s {
- int32_t (*migration_get_dst_subvol)(xlator_t *this,
- dht_local_t *local);
- int32_t (*migration_other)(xlator_t *this,
- gf_defrag_info_t *defrag);
- int32_t (*migration_needed)(xlator_t *this);
- xlator_t* (*layout_search)(xlator_t *this,
- dht_layout_t *layout,
- const char *name);
+ int32_t (*migration_get_dst_subvol)(xlator_t *this, dht_local_t *local);
+ int32_t (*migration_other)(xlator_t *this, gf_defrag_info_t *defrag);
+ xlator_t *(*layout_search)(xlator_t *this, dht_layout_t *layout,
+ const char *name);
};
typedef struct dht_methods_s dht_methods_t;
struct dht_conf {
- gf_lock_t subvolume_lock;
- int subvolume_cnt;
- xlator_t **subvolumes;
- char *subvolume_status;
- int *last_event;
- dht_layout_t **file_layouts;
- dht_layout_t **dir_layouts;
- unsigned int search_unhashed;
- gf_boolean_t lookup_optimize;
- int gen;
- dht_du_t *du_stats;
- double min_free_disk;
- double min_free_inodes;
- char disk_unit;
- int32_t refresh_interval;
- gf_boolean_t unhashed_sticky_bit;
- struct timeval last_stat_fetch;
- gf_lock_t layout_lock;
- dict_t *leaf_to_subvol;
- void *private; /* Can be used by wrapper xlators over
- dht */
- gf_boolean_t use_readdirp;
- char vol_uuid[UUID_SIZE + 1];
- gf_boolean_t assert_no_child_down;
- time_t *subvol_up_time;
-
- /* This is the count used as the distribute layout for a directory */
- /* Will be a global flag to control the layout spread count */
- uint32_t dir_spread_cnt;
-
- /* to keep track of nodes which are decommissioned */
- xlator_t **decommissioned_bricks;
- int decommission_in_progress;
- int decommission_subvols_cnt;
-
- /* defrag related */
- gf_defrag_info_t *defrag;
-
- /* Request to filter directory entries in readdir request */
-
- gf_boolean_t readdir_optimize;
-
- /* Support regex-based name reinterpretation. */
- regex_t rsync_regex;
- gf_boolean_t rsync_regex_valid;
- regex_t extra_regex;
- gf_boolean_t extra_regex_valid;
-
- /* Support variable xattr names. */
- char *xattr_name;
- char *link_xattr_name;
- char *commithash_xattr_name;
- char *wild_xattr_name;
-
- /* Support size-weighted rebalancing (heterogeneous bricks). */
- gf_boolean_t do_weighting;
- gf_boolean_t randomize_by_gfid;
- char *dthrottle;
-
- dht_methods_t methods;
-
- struct mem_pool *lock_pool;
-
- /*local subvol storage for rebalance*/
- xlator_t **local_subvols;
- int32_t local_subvols_cnt;
-
- /*
- * "Commit hash" for this volume topology. Changed whenever bricks
- * are added or removed.
- */
- uint32_t vol_commit_hash;
- gf_boolean_t vch_forced;
+ xlator_t **subvolumes;
+ char *subvolume_status;
+ int *last_event;
+ dht_layout_t **file_layouts;
+ dht_layout_t **dir_layouts;
+ unsigned int search_unhashed;
+ int gen;
+ dht_du_t *du_stats;
+ double min_free_disk;
+ double min_free_inodes;
+ int subvolume_cnt;
+ int32_t refresh_interval;
+ gf_lock_t subvolume_lock;
+ time_t last_stat_fetch;
+ gf_lock_t layout_lock;
+ dict_t *leaf_to_subvol;
+ void *private; /* Can be used by wrapper xlators over
+ dht */
+ time_t *subvol_up_time;
+
+ /* to keep track of nodes which are decommissioned */
+ xlator_t **decommissioned_bricks;
+ int decommission_in_progress;
+ int decommission_subvols_cnt;
+
+ /* defrag related */
+ gf_defrag_info_t *defrag;
+
+ /* Support regex-based name reinterpretation. */
+ regex_t rsync_regex;
+ regex_t extra_regex;
+
+ /* Support variable xattr names. */
+ char *xattr_name;
+ char *mds_xattr_key;
+ char *link_xattr_name;
+ char *commithash_xattr_name;
+ char *wild_xattr_name;
+
+ dht_methods_t methods;
+
+ struct mem_pool *lock_pool;
+
+ /*local subvol storage for rebalance*/
+ xlator_t **local_subvols;
+ subvol_nodeuuids_info_t *local_nodeuuids;
+ int32_t local_subvols_cnt;
+
+ int dthrottle;
+
+ /* Hard link handle requirement for migration triggered from client*/
+ synclock_t link_lock;
+
+ /* lock migration */
+ gf_lock_t lock;
+
+ /* This is the count used as the distribute layout for a directory */
+ /* Will be a global flag to control the layout spread count */
+ uint32_t dir_spread_cnt;
+
+ /*
+ * "Commit hash" for this volume topology. Changed whenever bricks
+ * are added or removed.
+ */
+ uint32_t vol_commit_hash;
+
+ char vol_uuid[UUID_SIZE + 1];
+
+ char disk_unit;
+
+ gf_boolean_t lock_migration_enabled;
+
+ gf_boolean_t vch_forced;
+
+ gf_boolean_t use_fallocate;
+
+ gf_boolean_t force_migration;
+
+ gf_boolean_t lookup_optimize;
+
+ gf_boolean_t unhashed_sticky_bit;
+
+ gf_boolean_t assert_no_child_down;
+
+ gf_boolean_t use_readdirp;
+
+ /* Request to filter directory entries in readdir request */
+ gf_boolean_t readdir_optimize;
+
+ gf_boolean_t rsync_regex_valid;
+
+ gf_boolean_t extra_regex_valid;
+
+ /* Support size-weighted rebalancing (heterogeneous bricks). */
+ gf_boolean_t do_weighting;
+
+ gf_boolean_t randomize_by_gfid;
};
typedef struct dht_conf dht_conf_t;
struct dht_dfoffset_ctx {
- xlator_t *this;
- off_t offset;
- int32_t readdir_done;
+ xlator_t *this;
+ off_t offset;
+ int32_t readdir_done;
};
typedef struct dht_dfoffset_ctx dht_dfoffset_ctx_t;
struct dht_disk_layout {
- uint32_t cnt;
- uint32_t type;
- struct {
- uint32_t start;
- uint32_t stop;
- } list[1];
+ uint32_t cnt;
+ uint32_t type;
+ struct {
+ uint32_t start;
+ uint32_t stop;
+ } list[1];
};
typedef struct dht_disk_layout dht_disk_layout_t;
typedef enum {
- GF_DHT_MIGRATE_DATA,
- GF_DHT_MIGRATE_DATA_EVEN_IF_LINK_EXISTS,
- GF_DHT_MIGRATE_HARDLINK,
- GF_DHT_MIGRATE_HARDLINK_IN_PROGRESS
+ GF_DHT_MIGRATE_DATA,
+ GF_DHT_MIGRATE_DATA_EVEN_IF_LINK_EXISTS,
+ GF_DHT_MIGRATE_HARDLINK,
+ GF_DHT_MIGRATE_HARDLINK_IN_PROGRESS
} gf_dht_migrate_data_type_t;
typedef enum {
- GF_DHT_EQUAL_DISTRIBUTION,
- GF_DHT_WEIGHTED_DISTRIBUTION
+ GF_DHT_EQUAL_DISTRIBUTION,
+ GF_DHT_WEIGHTED_DISTRIBUTION
} dht_distribution_type_t;
struct dir_dfmeta {
- gf_dirent_t *equeue;
- dht_dfoffset_ctx_t *offset_var;
- struct list_head **head;
- struct list_head **iterator;
- int *fetch_entries;
+ gf_dirent_t *equeue;
+ dht_dfoffset_ctx_t *offset_var;
+ struct list_head **head;
+ struct list_head **iterator;
+ int *fetch_entries;
+ /* fds corresponding to local subvols only */
+ fd_t **lfd;
};
typedef struct dht_migrate_info {
- xlator_t *src_subvol;
- xlator_t *dst_subvol;
- GF_REF_DECL;
+ xlator_t *src_subvol;
+ xlator_t *dst_subvol;
+ GF_REF_DECL;
} dht_migrate_info_t;
-
-
typedef struct dht_fd_ctx {
- uint64_t opened_on_dst;
- GF_REF_DECL;
+ uint64_t opened_on_dst;
+ GF_REF_DECL;
} dht_fd_ctx_t;
-
#define ENTRY_MISSING(op_ret, op_errno) (op_ret == -1 && op_errno == ENOENT)
-#define is_revalidate(loc) (dht_inode_ctx_layout_get (loc->inode, this, NULL) == 0)
+#define is_revalidate(loc) \
+ (dht_inode_ctx_layout_get((loc)->inode, this, NULL) == 0)
#define is_last_call(cnt) (cnt == 0)
#define DHT_MIGRATION_IN_PROGRESS 1
-#define DHT_MIGRATION_COMPLETED 2
+#define DHT_MIGRATION_COMPLETED 2
-#define check_is_linkfile(i,s,x,n) (IS_DHT_LINKFILE_MODE (s) && dict_get (x, n))
+#define check_is_linkfile(i, s, x, n) \
+ (IS_DHT_LINKFILE_MODE(s) && dict_get(x, n))
-#define IS_DHT_MIGRATION_PHASE2(buf) ( \
- IA_ISREG ((buf)->ia_type) && \
- ((st_mode_from_ia ((buf)->ia_prot, (buf)->ia_type) & \
- ~S_IFMT) == DHT_LINKFILE_MODE))
+#define IS_DHT_MIGRATION_PHASE2(buf) \
+ (IA_ISREG((buf)->ia_type) && \
+ ((st_mode_from_ia((buf)->ia_prot, (buf)->ia_type) & ~S_IFMT) == \
+ DHT_LINKFILE_MODE))
-#define IS_DHT_MIGRATION_PHASE1(buf) ( \
- IA_ISREG ((buf)->ia_type) && \
- ((buf)->ia_prot.sticky == 1) && \
- ((buf)->ia_prot.sgid == 1))
+#define IS_DHT_MIGRATION_PHASE1(buf) \
+ (IA_ISREG((buf)->ia_type) && ((buf)->ia_prot.sticky == 1) && \
+ ((buf)->ia_prot.sgid == 1))
-#define DHT_STRIP_PHASE1_FLAGS(buf) do { \
- if ((buf) && IS_DHT_MIGRATION_PHASE1(buf)) { \
- (buf)->ia_prot.sticky = 0; \
- (buf)->ia_prot.sgid = 0; \
- } \
- } while (0)
+#define DHT_STRIP_PHASE1_FLAGS(buf) \
+ do { \
+ if ((buf) && IS_DHT_MIGRATION_PHASE1(buf)) { \
+ (buf)->ia_prot.sticky = 0; \
+ (buf)->ia_prot.sgid = 0; \
+ } \
+ } while (0)
#define dht_inode_missing(op_errno) (op_errno == ENOENT || op_errno == ESTALE)
-#define check_is_dir(i,s,x) (IA_ISDIR(s->ia_type))
+#define check_is_dir(i, s, x) (IA_ISDIR(s->ia_type))
#define layout_is_sane(layout) ((layout) && (layout->cnt > 0))
-#define we_are_not_migrating(x) ((x) == 1)
-
-#define DHT_STACK_UNWIND(fop, frame, params ...) do { \
- dht_local_t *__local = NULL; \
- xlator_t *__xl = NULL; \
- if (frame) { \
- __xl = frame->this; \
- __local = frame->local; \
- frame->local = NULL; \
- } \
- STACK_UNWIND_STRICT (fop, frame, params); \
- dht_local_wipe (__xl, __local); \
- } while (0)
-
-#define DHT_STACK_DESTROY(frame) do { \
- dht_local_t *__local = NULL; \
- xlator_t *__xl = NULL; \
- __xl = frame->this; \
- __local = frame->local; \
- frame->local = NULL; \
- STACK_DESTROY (frame->root); \
- dht_local_wipe (__xl, __local); \
- } while (0)
-
-#define DHT_UPDATE_TIME(ctx_sec, ctx_nsec, new_sec, new_nsec, inode, post) do {\
- LOCK (&inode->lock); \
- { \
- if (ctx_sec == new_sec) \
- new_nsec = max (new_nsec, ctx_nsec); \
- else if (ctx_sec > new_sec) { \
- new_sec = ctx_sec; \
- new_nsec = ctx_nsec; \
- } \
- if (post) { \
- ctx_sec = new_sec; \
- ctx_nsec = new_nsec; \
- } \
- } \
- UNLOCK (&inode->lock); \
- } while (0)
-
-#define is_greater_time(a, an, b, bn) (((a) < (b)) || (((a) == (b)) && ((an) < (bn))))
-
-#define DHT_MARK_FOP_INTERNAL(xattr) do { \
- int tmp = -1; \
- if (!xattr) { \
- xattr = dict_new (); \
- if (!xattr) \
- break; \
- } \
- tmp = dict_set_str (xattr, GLUSTERFS_INTERNAL_FOP_KEY, "yes"); \
- if (tmp) { \
- gf_msg (this->name, GF_LOG_ERROR, 0, \
- DHT_MSG_DICT_SET_FAILED, \
- "Failed to set dictionary value: key = %s," \
- " path = %s", GLUSTERFS_INTERNAL_FOP_KEY, \
- local->loc.path); \
- } \
- } while (0)
-
-dht_layout_t *dht_layout_new (xlator_t *this, int cnt);
-dht_layout_t *dht_layout_get (xlator_t *this, inode_t *inode);
-dht_layout_t *dht_layout_for_subvol (xlator_t *this, xlator_t *subvol);
-xlator_t *dht_layout_search (xlator_t *this, dht_layout_t *layout,
- const char *name);
+#define we_are_not_migrating(x) ((x) == 1)
+
+#define DHT_STACK_UNWIND(fop, frame, params...) \
+ do { \
+ dht_local_t *__local = NULL; \
+ xlator_t *__xl = NULL; \
+ if (frame) { \
+ __xl = frame->this; \
+ __local = frame->local; \
+ frame->local = NULL; \
+ } \
+ STACK_UNWIND_STRICT(fop, frame, params); \
+ dht_local_wipe(__xl, __local); \
+ } while (0)
+
+#define DHT_STACK_DESTROY(frame) \
+ do { \
+ dht_local_t *__local = NULL; \
+ xlator_t *__xl = NULL; \
+ __xl = frame->this; \
+ __local = frame->local; \
+ frame->local = NULL; \
+ STACK_DESTROY(frame->root); \
+ dht_local_wipe(__xl, __local); \
+ } while (0)
+
+#define DHT_UPDATE_TIME(ctx_sec, ctx_nsec, new_sec, new_nsec, post) \
+ do { \
+ if (ctx_sec == new_sec) \
+ new_nsec = max(new_nsec, ctx_nsec); \
+ else if (ctx_sec > new_sec) { \
+ new_sec = ctx_sec; \
+ new_nsec = ctx_nsec; \
+ } \
+ if (post) { \
+ ctx_sec = new_sec; \
+ ctx_nsec = new_nsec; \
+ } \
+ } while (0)
+
+#define is_greater_time(a, an, b, bn) \
+ (((a) < (b)) || (((a) == (b)) && ((an) < (bn))))
+
+#define DHT_MARK_FOP_INTERNAL(xattr) \
+ do { \
+ int tmp = -1; \
+ if (!xattr) { \
+ xattr = dict_new(); \
+ if (!xattr) \
+ break; \
+ } \
+ tmp = dict_set_str(xattr, GLUSTERFS_INTERNAL_FOP_KEY, "yes"); \
+ if (tmp) { \
+ gf_msg(this->name, GF_LOG_ERROR, 0, DHT_MSG_DICT_SET_FAILED, \
+ "Failed to set dictionary value: key = %s," \
+ " path = %s", \
+ GLUSTERFS_INTERNAL_FOP_KEY, local->loc.path); \
+ } \
+ } while (0)
+
+dht_layout_t *
+dht_layout_new(xlator_t *this, int cnt);
+dht_layout_t *
+dht_layout_get(xlator_t *this, inode_t *inode);
+dht_layout_t *
+dht_layout_for_subvol(xlator_t *this, xlator_t *subvol);
+xlator_t *
+dht_layout_search(xlator_t *this, dht_layout_t *layout, const char *name);
int32_t
-dht_migration_get_dst_subvol(xlator_t *this, dht_local_t *local);
+dht_migration_get_dst_subvol(xlator_t *this, dht_local_t *local);
int32_t
dht_migration_needed(xlator_t *this);
-int dht_layout_normalize (xlator_t *this, loc_t *loc, dht_layout_t *layout);
-int dht_layout_anomalies (xlator_t *this, loc_t *loc, dht_layout_t *layout,
- uint32_t *holes_p, uint32_t *overlaps_p,
- uint32_t *missing_p, uint32_t *down_p,
- uint32_t *misc_p, uint32_t *no_space_p);
-int dht_layout_dir_mismatch (xlator_t *this, dht_layout_t *layout,
- xlator_t *subvol, loc_t *loc, dict_t *xattr);
+int
+dht_layout_normalize(xlator_t *this, loc_t *loc, dht_layout_t *layout);
+void
+dht_layout_anomalies(xlator_t *this, loc_t *loc, dht_layout_t *layout,
+ uint32_t *holes_p, uint32_t *overlaps_p,
+ uint32_t *missing_p, uint32_t *down_p, uint32_t *misc_p,
+ uint32_t *no_space_p);
+int
+dht_layout_dir_mismatch(xlator_t *this, dht_layout_t *layout, xlator_t *subvol,
+ loc_t *loc, dict_t *xattr);
+xlator_t *
+dht_linkfile_subvol(xlator_t *this, inode_t *inode, struct iatt *buf,
+ dict_t *xattr);
+int
+dht_linkfile_unlink(call_frame_t *frame, xlator_t *this, xlator_t *subvol,
+ loc_t *loc);
+
+int
+dht_layouts_init(xlator_t *this, dht_conf_t *conf);
+int
+dht_layout_merge(xlator_t *this, dht_layout_t *layout, xlator_t *subvol,
+ int op_ret, int op_errno, dict_t *xattr);
+
+int
+dht_disk_layout_extract(xlator_t *this, dht_layout_t *layout, int pos,
+ int32_t **disk_layout_p);
+int
+dht_disk_layout_extract_for_subvol(xlator_t *this, dht_layout_t *layout,
+ xlator_t *subvol, int32_t **disk_layout_p);
+
+int
+dht_frame_return(call_frame_t *frame);
+
+int
+dht_deitransform(xlator_t *this, uint64_t y, xlator_t **subvol);
+
+void
+dht_local_wipe(xlator_t *this, dht_local_t *local);
+dht_local_t *
+dht_local_init(call_frame_t *frame, loc_t *loc, fd_t *fd, glusterfs_fop_t fop);
+int
+dht_iatt_merge(xlator_t *this, struct iatt *to, struct iatt *from);
+
+xlator_t *
+dht_subvol_get_hashed(xlator_t *this, loc_t *loc);
+xlator_t *
+dht_subvol_get_cached(xlator_t *this, inode_t *inode);
+xlator_t *
+dht_subvol_next(xlator_t *this, xlator_t *prev);
+xlator_t *
+dht_subvol_next_available(xlator_t *this, xlator_t *prev);
+int
+dht_subvol_cnt(xlator_t *this, xlator_t *subvol);
-xlator_t *dht_linkfile_subvol (xlator_t *this, inode_t *inode,
- struct iatt *buf, dict_t *xattr);
-int dht_linkfile_unlink (call_frame_t *frame, xlator_t *this,
- xlator_t *subvol, loc_t *loc);
+int
+dht_hash_compute(xlator_t *this, int type, const char *name, uint32_t *hash_p);
-int dht_layouts_init (xlator_t *this, dht_conf_t *conf);
-int dht_layout_merge (xlator_t *this, dht_layout_t *layout, xlator_t *subvol,
- int op_ret, int op_errno, dict_t *xattr);
+int
+dht_linkfile_create(call_frame_t *frame, fop_mknod_cbk_t linkfile_cbk,
+ xlator_t *this, xlator_t *tovol, xlator_t *fromvol,
+ loc_t *loc);
+int
+dht_lookup_everywhere(call_frame_t *frame, xlator_t *this, loc_t *loc);
+int
+dht_selfheal_directory(call_frame_t *frame, dht_selfheal_dir_cbk_t cbk,
+ loc_t *loc, dht_layout_t *layout);
+int
+dht_selfheal_new_directory(call_frame_t *frame, dht_selfheal_dir_cbk_t cbk,
+ dht_layout_t *layout);
+int
+dht_selfheal_restore(call_frame_t *frame, dht_selfheal_dir_cbk_t cbk,
+ loc_t *loc, dht_layout_t *layout);
+void
+dht_layout_sort_volname(dht_layout_t *layout);
-int dht_disk_layout_extract (xlator_t *this, dht_layout_t *layout,
- int pos, int32_t **disk_layout_p);
-int dht_disk_layout_merge (xlator_t *this, dht_layout_t *layout,
- int pos, void *disk_layout_raw, int disk_layout_len);
+int
+dht_get_du_info(call_frame_t *frame, xlator_t *this, loc_t *loc);
+gf_boolean_t
+dht_is_subvol_filled(xlator_t *this, xlator_t *subvol);
+xlator_t *
+dht_free_disk_available_subvol(xlator_t *this, xlator_t *subvol,
+ dht_local_t *layout);
+int
+dht_get_du_info_for_subvol(xlator_t *this, int subvol_idx);
-int dht_frame_return (call_frame_t *frame);
+int
+dht_layout_preset(xlator_t *this, xlator_t *subvol, inode_t *inode);
+int
+dht_layout_set(xlator_t *this, inode_t *inode, dht_layout_t *layout);
+;
+void
+dht_layout_unref(xlator_t *this, dht_layout_t *layout);
+dht_layout_t *
+dht_layout_ref(xlator_t *this, dht_layout_t *layout);
+int
+dht_layout_index_for_subvol(dht_layout_t *layout, xlator_t *subvol);
+xlator_t *
+dht_first_up_subvol(xlator_t *this);
+xlator_t *
+dht_last_up_subvol(xlator_t *this);
-int dht_deitransform (xlator_t *this, uint64_t y, xlator_t **subvol);
+int
+dht_build_child_loc(xlator_t *this, loc_t *child, loc_t *parent, char *name);
-void dht_local_wipe (xlator_t *this, dht_local_t *local);
-dht_local_t *dht_local_init (call_frame_t *frame, loc_t *loc, fd_t *fd,
- glusterfs_fop_t fop);
-int dht_iatt_merge (xlator_t *this, struct iatt *to, struct iatt *from,
- xlator_t *subvol);
+int
+dht_filter_loc_subvol_key(xlator_t *this, loc_t *loc, loc_t *new_loc,
+ xlator_t **subvol);
-xlator_t *dht_subvol_get_hashed (xlator_t *this, loc_t *loc);
-xlator_t *dht_subvol_get_cached (xlator_t *this, inode_t *inode);
-xlator_t *dht_subvol_next (xlator_t *this, xlator_t *prev);
-xlator_t *dht_subvol_next_available (xlator_t *this, xlator_t *prev);
-int dht_subvol_cnt (xlator_t *this, xlator_t *subvol);
+int
+dht_rename_cleanup(call_frame_t *frame);
+int
+dht_rename_link_cbk(call_frame_t *frame, void *cookie, xlator_t *this,
+ int32_t op_ret, int32_t op_errno, inode_t *inode,
+ struct iatt *stbuf, struct iatt *preparent,
+ struct iatt *postparent, dict_t *xdata);
-int dht_hash_compute (xlator_t *this, int type, const char *name, uint32_t *hash_p);
+int
+dht_update_commit_hash_for_layout(call_frame_t *frame);
+int
+dht_fix_directory_layout(call_frame_t *frame, dht_selfheal_dir_cbk_t dir_cbk,
+ dht_layout_t *layout);
-int dht_linkfile_create (call_frame_t *frame, fop_mknod_cbk_t linkfile_cbk,
- xlator_t *this, xlator_t *tovol,
- xlator_t *fromvol, loc_t *loc);
-int dht_lookup_directory (call_frame_t *frame, xlator_t *this, loc_t *loc);
-int dht_lookup_everywhere (call_frame_t *frame, xlator_t *this, loc_t *loc);
int
-dht_selfheal_directory (call_frame_t *frame, dht_selfheal_dir_cbk_t cbk,
- loc_t *loc, dht_layout_t *layout);
+dht_init_subvolumes(xlator_t *this, dht_conf_t *conf);
+/* migration/rebalance */
int
-dht_selfheal_directory_for_nameless_lookup (call_frame_t *frame,
- dht_selfheal_dir_cbk_t cbk,
- loc_t *loc, dht_layout_t *layout);
+dht_start_rebalance_task(xlator_t *this, call_frame_t *frame);
int
-dht_selfheal_new_directory (call_frame_t *frame, dht_selfheal_dir_cbk_t cbk,
- dht_layout_t *layout);
+dht_rebalance_in_progress_check(xlator_t *this, call_frame_t *frame);
int
-dht_selfheal_restore (call_frame_t *frame, dht_selfheal_dir_cbk_t cbk,
- loc_t *loc, dht_layout_t *layout);
+dht_rebalance_complete_check(xlator_t *this, call_frame_t *frame);
+
int
-dht_layout_sort_volname (dht_layout_t *layout);
+dht_init_local_subvolumes(xlator_t *this, dht_conf_t *conf);
+
+/* FOPS */
+int32_t
+dht_lookup(call_frame_t *frame, xlator_t *this, loc_t *loc, dict_t *xattr_req);
+
+int32_t
+dht_stat(call_frame_t *frame, xlator_t *this, loc_t *loc, dict_t *xdata);
+
+int32_t
+dht_fstat(call_frame_t *frame, xlator_t *this, fd_t *fd, dict_t *xdata);
+
+int32_t
+dht_truncate(call_frame_t *frame, xlator_t *this, loc_t *loc, off_t offset,
+ dict_t *xdata);
-int dht_get_du_info (call_frame_t *frame, xlator_t *this, loc_t *loc);
+int32_t
+dht_ftruncate(call_frame_t *frame, xlator_t *this, fd_t *fd, off_t offset,
+ dict_t *xdata);
-gf_boolean_t dht_is_subvol_filled (xlator_t *this, xlator_t *subvol);
-xlator_t *dht_free_disk_available_subvol (xlator_t *this, xlator_t *subvol,
- dht_local_t *layout);
-int dht_get_du_info_for_subvol (xlator_t *this, int subvol_idx);
+int32_t
+dht_access(call_frame_t *frame, xlator_t *this, loc_t *loc, int32_t mask,
+ dict_t *xdata);
-int dht_layout_preset (xlator_t *this, xlator_t *subvol, inode_t *inode);
-int dht_layout_index_for_subvol (dht_layout_t *layout, xlator_t *subvol);
-int dht_layout_set (xlator_t *this, inode_t *inode, dht_layout_t *layout);;
-void dht_layout_unref (xlator_t *this, dht_layout_t *layout);
-dht_layout_t *dht_layout_ref (xlator_t *this, dht_layout_t *layout);
-xlator_t *dht_first_up_subvol (xlator_t *this);
-xlator_t *dht_last_up_subvol (xlator_t *this);
+int32_t
+dht_readlink(call_frame_t *frame, xlator_t *this, loc_t *loc, size_t size,
+ dict_t *xdata);
-int dht_build_child_loc (xlator_t *this, loc_t *child, loc_t *parent, char *name);
+int32_t
+dht_mknod(call_frame_t *frame, xlator_t *this, loc_t *loc, mode_t mode,
+ dev_t rdev, mode_t umask, dict_t *xdata);
-int dht_filter_loc_subvol_key (xlator_t *this, loc_t *loc, loc_t *new_loc,
- xlator_t **subvol);
+int32_t
+dht_mkdir(call_frame_t *frame, xlator_t *this, loc_t *loc, mode_t mode,
+ mode_t umask, dict_t *xdata);
-int dht_rename_cleanup (call_frame_t *frame);
-int dht_rename_link_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
- int32_t op_ret, int32_t op_errno,
- inode_t *inode, struct iatt *stbuf,
- struct iatt *preparent, struct iatt *postparent,
- dict_t *xdata);
+int32_t
+dht_unlink(call_frame_t *frame, xlator_t *this, loc_t *loc, int xflag,
+ dict_t *xdata);
-int dht_update_commit_hash_for_layout (call_frame_t *frame);
-int dht_fix_directory_layout (call_frame_t *frame,
- dht_selfheal_dir_cbk_t dir_cbk,
- dht_layout_t *layout);
+int32_t
+dht_rmdir(call_frame_t *frame, xlator_t *this, loc_t *loc, int flags,
+ dict_t *xdata);
-int dht_init_subvolumes (xlator_t *this, dht_conf_t *conf);
+int32_t
+dht_symlink(call_frame_t *frame, xlator_t *this, const char *linkpath,
+ loc_t *loc, mode_t umask, dict_t *xdata);
-/* migration/rebalance */
-int dht_start_rebalance_task (xlator_t *this, call_frame_t *frame);
+int32_t
+dht_rename(call_frame_t *frame, xlator_t *this, loc_t *oldloc, loc_t *newloc,
+ dict_t *xdata);
-int dht_rebalance_in_progress_check (xlator_t *this, call_frame_t *frame);
-int dht_rebalance_complete_check (xlator_t *this, call_frame_t *frame);
+int32_t
+dht_link(call_frame_t *frame, xlator_t *this, loc_t *oldloc, loc_t *newloc,
+ dict_t *xdata);
-int
-dht_init_local_subvolumes (xlator_t *this, dht_conf_t *conf);
+int32_t
+dht_create(call_frame_t *frame, xlator_t *this, loc_t *loc, int32_t flags,
+ mode_t mode, mode_t umask, fd_t *fd, dict_t *params);
-/* FOPS */
-int32_t dht_lookup (call_frame_t *frame,
- xlator_t *this,
- loc_t *loc,
- dict_t *xattr_req);
-
-int32_t dht_stat (call_frame_t *frame,
- xlator_t *this,
- loc_t *loc, dict_t *xdata);
-
-int32_t dht_fstat (call_frame_t *frame,
- xlator_t *this,
- fd_t *fd, dict_t *xdata);
-
-int32_t dht_truncate (call_frame_t *frame,
- xlator_t *this,
- loc_t *loc,
- off_t offset, dict_t *xdata);
-
-int32_t dht_ftruncate (call_frame_t *frame,
- xlator_t *this,
- fd_t *fd,
- off_t offset, dict_t *xdata);
-
-int32_t dht_access (call_frame_t *frame,
- xlator_t *this,
- loc_t *loc,
- int32_t mask, dict_t *xdata);
-
-int32_t dht_readlink (call_frame_t *frame,
- xlator_t *this,
- loc_t *loc,
- size_t size, dict_t *xdata);
-
-int32_t dht_mknod (call_frame_t *frame, xlator_t *this, loc_t *loc,
- mode_t mode, dev_t rdev, mode_t umask, dict_t *xdata);
-
-int32_t dht_mkdir (call_frame_t *frame, xlator_t *this,
- loc_t *loc, mode_t mode, mode_t umask, dict_t *xdata);
-
-int32_t dht_unlink (call_frame_t *frame,
- xlator_t *this,
- loc_t *loc, int xflag, dict_t *xdata);
-
-int32_t dht_rmdir (call_frame_t *frame, xlator_t *this,
- loc_t *loc, int flags, dict_t *xdata);
-
-int32_t dht_symlink (call_frame_t *frame, xlator_t *this,
- const char *linkpath, loc_t *loc, mode_t umask,
- dict_t *xdata);
-
-int32_t dht_rename (call_frame_t *frame,
- xlator_t *this,
- loc_t *oldloc,
- loc_t *newloc, dict_t *xdata);
-
-int32_t dht_link (call_frame_t *frame,
- xlator_t *this,
- loc_t *oldloc,
- loc_t *newloc, dict_t *xdata);
-
-int32_t dht_create (call_frame_t *frame, xlator_t *this,
- loc_t *loc, int32_t flags, mode_t mode,
- mode_t umask, fd_t *fd, dict_t *params);
-
-int32_t dht_open (call_frame_t *frame,
- xlator_t *this,
- loc_t *loc,
- int32_t flags, fd_t *fd, dict_t *xdata);
-
-int32_t dht_readv (call_frame_t *frame,
- xlator_t *this,
- fd_t *fd,
- size_t size,
- off_t offset, uint32_t flags, dict_t *xdata);
-
-int32_t dht_writev (call_frame_t *frame,
- xlator_t *this,
- fd_t *fd,
- struct iovec *vector,
- int32_t count,
- off_t offset,
- uint32_t flags,
- struct iobref *iobref, dict_t *xdata);
-
-int32_t dht_flush (call_frame_t *frame,
- xlator_t *this,
- fd_t *fd, dict_t *xdata);
-
-int32_t dht_fsync (call_frame_t *frame,
- xlator_t *this,
- fd_t *fd,
- int32_t datasync, dict_t *xdata);
-
-int32_t dht_opendir (call_frame_t *frame,
- xlator_t *this,
- loc_t *loc, fd_t *fd, dict_t *xdata);
-
-int32_t dht_fsyncdir (call_frame_t *frame,
- xlator_t *this,
- fd_t *fd,
- int32_t datasync, dict_t *xdata);
-
-int32_t dht_statfs (call_frame_t *frame,
- xlator_t *this,
- loc_t *loc, dict_t *xdata);
-
-int32_t dht_setxattr (call_frame_t *frame,
- xlator_t *this,
- loc_t *loc,
- dict_t *dict,
- int32_t flags, dict_t *xdata);
-
-int32_t dht_getxattr (call_frame_t *frame,
- xlator_t *this,
- loc_t *loc,
- const char *name, dict_t *xdata);
-
-int32_t dht_fsetxattr (call_frame_t *frame,
- xlator_t *this,
- fd_t *fd,
- dict_t *dict,
- int32_t flags, dict_t *xdata);
-
-int32_t dht_fgetxattr (call_frame_t *frame,
- xlator_t *this,
- fd_t *fd,
- const char *name, dict_t *xdata);
-
-int32_t dht_removexattr (call_frame_t *frame,
- xlator_t *this,
- loc_t *loc,
- const char *name, dict_t *xdata);
-int32_t dht_fremovexattr (call_frame_t *frame,
- xlator_t *this,
- fd_t *fd,
- const char *name, dict_t *xdata);
-
-int32_t dht_lk (call_frame_t *frame,
- xlator_t *this,
- fd_t *fd,
- int32_t cmd,
- struct gf_flock *flock, dict_t *xdata);
-
-int32_t dht_inodelk (call_frame_t *frame, xlator_t *this,
- const char *volume, loc_t *loc, int32_t cmd,
- struct gf_flock *flock, dict_t *xdata);
-
-int32_t dht_finodelk (call_frame_t *frame, xlator_t *this,
- const char *volume, fd_t *fd, int32_t cmd,
- struct gf_flock *flock, dict_t *xdata);
-
-int32_t dht_entrylk (call_frame_t *frame, xlator_t *this,
- const char *volume, loc_t *loc, const char *basename,
- entrylk_cmd cmd, entrylk_type type, dict_t *xdata);
-
-int32_t dht_fentrylk (call_frame_t *frame, xlator_t *this,
- const char *volume, fd_t *fd, const char *basename,
- entrylk_cmd cmd, entrylk_type type, dict_t *xdata);
-
-int32_t dht_readdir (call_frame_t *frame,
- xlator_t *this,
- fd_t *fd,
- size_t size, off_t off, dict_t *xdata);
-
-int32_t dht_readdirp (call_frame_t *frame,
- xlator_t *this,
- fd_t *fd,
- size_t size, off_t off, dict_t *dict);
-
-int32_t dht_xattrop (call_frame_t *frame,
- xlator_t *this,
- loc_t *loc,
- gf_xattrop_flags_t flags,
- dict_t *dict, dict_t *xdata);
-
-int32_t dht_fxattrop (call_frame_t *frame,
- xlator_t *this,
- fd_t *fd,
- gf_xattrop_flags_t flags,
- dict_t *dict, dict_t *xdata);
-
-int32_t dht_forget (xlator_t *this, inode_t *inode);
-int32_t dht_setattr (call_frame_t *frame, xlator_t *this, loc_t *loc,
- struct iatt *stbuf, int32_t valid, dict_t *xdata);
-int32_t dht_fsetattr (call_frame_t *frame, xlator_t *this, fd_t *fd,
- struct iatt *stbuf, int32_t valid, dict_t *xdata);
-int32_t dht_fallocate(call_frame_t *frame, xlator_t *this, fd_t *fd,
- int32_t mode, off_t offset, size_t len, dict_t *xdata);
-int32_t dht_discard(call_frame_t *frame, xlator_t *this, fd_t *fd,
- off_t offset, size_t len, dict_t *xdata);
-int32_t dht_zerofill(call_frame_t *frame, xlator_t *this, fd_t *fd,
- off_t offset, off_t len, dict_t *xdata);
+int32_t
+dht_open(call_frame_t *frame, xlator_t *this, loc_t *loc, int32_t flags,
+ fd_t *fd, dict_t *xdata);
+
+int32_t
+dht_readv(call_frame_t *frame, xlator_t *this, fd_t *fd, size_t size,
+ off_t offset, uint32_t flags, dict_t *xdata);
+
+int32_t
+dht_writev(call_frame_t *frame, xlator_t *this, fd_t *fd, struct iovec *vector,
+ int32_t count, off_t offset, uint32_t flags, struct iobref *iobref,
+ dict_t *xdata);
+
+int32_t
+dht_flush(call_frame_t *frame, xlator_t *this, fd_t *fd, dict_t *xdata);
+
+int32_t
+dht_fsync(call_frame_t *frame, xlator_t *this, fd_t *fd, int32_t datasync,
+ dict_t *xdata);
+
+int32_t
+dht_opendir(call_frame_t *frame, xlator_t *this, loc_t *loc, fd_t *fd,
+ dict_t *xdata);
+
+int32_t
+dht_fsyncdir(call_frame_t *frame, xlator_t *this, fd_t *fd, int32_t datasync,
+ dict_t *xdata);
+
+int32_t
+dht_statfs(call_frame_t *frame, xlator_t *this, loc_t *loc, dict_t *xdata);
+
+int32_t
+dht_setxattr(call_frame_t *frame, xlator_t *this, loc_t *loc, dict_t *dict,
+ int32_t flags, dict_t *xdata);
+
+int32_t
+dht_getxattr(call_frame_t *frame, xlator_t *this, loc_t *loc, const char *name,
+ dict_t *xdata);
+
+int32_t
+dht_fsetxattr(call_frame_t *frame, xlator_t *this, fd_t *fd, dict_t *dict,
+ int32_t flags, dict_t *xdata);
+
+int32_t
+dht_fgetxattr(call_frame_t *frame, xlator_t *this, fd_t *fd, const char *name,
+ dict_t *xdata);
+
+int32_t
+dht_removexattr(call_frame_t *frame, xlator_t *this, loc_t *loc,
+ const char *name, dict_t *xdata);
+int32_t
+dht_fremovexattr(call_frame_t *frame, xlator_t *this, fd_t *fd,
+ const char *name, dict_t *xdata);
+
+int32_t
+dht_lk(call_frame_t *frame, xlator_t *this, fd_t *fd, int32_t cmd,
+ struct gf_flock *flock, dict_t *xdata);
+
+int32_t
+dht_lease(call_frame_t *frame, xlator_t *this, loc_t *loc,
+ struct gf_lease *lease, dict_t *xdata);
+
+int32_t
+dht_inodelk(call_frame_t *frame, xlator_t *this, const char *volume, loc_t *loc,
+ int32_t cmd, struct gf_flock *flock, dict_t *xdata);
+
+int32_t
+dht_finodelk(call_frame_t *frame, xlator_t *this, const char *volume, fd_t *fd,
+ int32_t cmd, struct gf_flock *flock, dict_t *xdata);
+
+int32_t
+dht_entrylk(call_frame_t *frame, xlator_t *this, const char *volume, loc_t *loc,
+ const char *basename, entrylk_cmd cmd, entrylk_type type,
+ dict_t *xdata);
+
+int32_t
+dht_fentrylk(call_frame_t *frame, xlator_t *this, const char *volume, fd_t *fd,
+ const char *basename, entrylk_cmd cmd, entrylk_type type,
+ dict_t *xdata);
+
+int32_t
+dht_readdir(call_frame_t *frame, xlator_t *this, fd_t *fd, size_t size,
+ off_t off, dict_t *xdata);
+
+int32_t
+dht_readdirp(call_frame_t *frame, xlator_t *this, fd_t *fd, size_t size,
+ off_t off, dict_t *dict);
+
+int32_t
+dht_xattrop(call_frame_t *frame, xlator_t *this, loc_t *loc,
+ gf_xattrop_flags_t flags, dict_t *dict, dict_t *xdata);
+
+int32_t
+dht_fxattrop(call_frame_t *frame, xlator_t *this, fd_t *fd,
+ gf_xattrop_flags_t flags, dict_t *dict, dict_t *xdata);
+
+int32_t
+dht_forget(xlator_t *this, inode_t *inode);
+int32_t
+dht_setattr(call_frame_t *frame, xlator_t *this, loc_t *loc, struct iatt *stbuf,
+ int32_t valid, dict_t *xdata);
+int32_t
+dht_fsetattr(call_frame_t *frame, xlator_t *this, fd_t *fd, struct iatt *stbuf,
+ int32_t valid, dict_t *xdata);
+int32_t
+dht_fallocate(call_frame_t *frame, xlator_t *this, fd_t *fd, int32_t mode,
+ off_t offset, size_t len, dict_t *xdata);
+int32_t
+dht_discard(call_frame_t *frame, xlator_t *this, fd_t *fd, off_t offset,
+ size_t len, dict_t *xdata);
+int32_t
+dht_zerofill(call_frame_t *frame, xlator_t *this, fd_t *fd, off_t offset,
+ off_t len, dict_t *xdata);
+int32_t
+dht_ipc(call_frame_t *frame, xlator_t *this, int32_t op, dict_t *xdata);
int
dht_set_subvol_range(xlator_t *this);
-int32_t dht_init (xlator_t *this);
-void dht_fini (xlator_t *this);
-int dht_reconfigure (xlator_t *this, dict_t *options);
-int32_t dht_notify (xlator_t *this, int32_t event, void *data, ...);
+int32_t
+dht_init(xlator_t *this);
+void
+dht_fini(xlator_t *this);
+int
+dht_reconfigure(xlator_t *this, dict_t *options);
+int32_t
+dht_notify(xlator_t *this, int32_t event, void *data, ...);
/* definitions for nufa/switch */
-int dht_revalidate_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
- int op_ret, int op_errno, inode_t *inode,
- struct iatt *stbuf, dict_t *xattr,
- struct iatt *postparent);
-int dht_lookup_dir_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
+int
+dht_revalidate_cbk(call_frame_t *frame, void *cookie, xlator_t *this,
+ int op_ret, int op_errno, inode_t *inode, struct iatt *stbuf,
+ dict_t *xattr, struct iatt *postparent);
+int
+dht_lookup_dir_cbk(call_frame_t *frame, void *cookie, xlator_t *this,
+ int op_ret, int op_errno, inode_t *inode, struct iatt *stbuf,
+ dict_t *xattr, struct iatt *postparent);
+int
+dht_lookup_linkfile_cbk(call_frame_t *frame, void *cookie, xlator_t *this,
int op_ret, int op_errno, inode_t *inode,
struct iatt *stbuf, dict_t *xattr,
struct iatt *postparent);
-int dht_lookup_linkfile_cbk (call_frame_t *frame, void *cookie,
- xlator_t *this, int op_ret, int op_errno,
- inode_t *inode, struct iatt *stbuf, dict_t *xattr,
- struct iatt *postparent);
-int dht_lookup_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
- int op_ret, int op_errno,
- inode_t *inode, struct iatt *stbuf, dict_t *xattr,
- struct iatt *postparent);
-int dht_create_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
- int op_ret, int op_errno,
- fd_t *fd, inode_t *inode, struct iatt *stbuf,
- struct iatt *preparent, struct iatt *postparent,
- dict_t *xdata);
-int dht_newfile_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
- int op_ret, int op_errno,
- inode_t *inode, struct iatt *stbuf, struct iatt *preparent,
- struct iatt *postparent, dict_t *xdata);
-
-int
-gf_defrag_status_get (gf_defrag_info_t *defrag, dict_t *dict);
-
-void
-gf_defrag_set_pause_state (gf_tier_conf_t *tier_conf, tier_pause_state_t state);
-
-tier_pause_state_t
-gf_defrag_get_pause_state (gf_tier_conf_t *tier_conf);
-
int
-gf_defrag_pause_tier (xlator_t *this, gf_defrag_info_t *defrag);
+dht_lookup_cbk(call_frame_t *frame, void *cookie, xlator_t *this, int op_ret,
+ int op_errno, inode_t *inode, struct iatt *stbuf, dict_t *xattr,
+ struct iatt *postparent);
+int
+dht_create_cbk(call_frame_t *frame, void *cookie, xlator_t *this, int op_ret,
+ int op_errno, fd_t *fd, inode_t *inode, struct iatt *stbuf,
+ struct iatt *preparent, struct iatt *postparent, dict_t *xdata);
+int
+dht_newfile_cbk(call_frame_t *frame, void *cookie, xlator_t *this, int op_ret,
+ int op_errno, inode_t *inode, struct iatt *stbuf,
+ struct iatt *preparent, struct iatt *postparent, dict_t *xdata);
-tier_pause_state_t
-gf_defrag_check_pause_tier (gf_tier_conf_t *defrag);
+int
+dht_finodelk_cbk(call_frame_t *frame, void *cookie, xlator_t *this,
+ int32_t op_ret, int32_t op_errno, dict_t *xdata);
int
-gf_defrag_resume_tier (xlator_t *this, gf_defrag_info_t *defrag);
+dht_getxattr_cbk(call_frame_t *frame, void *cookie, xlator_t *this, int op_ret,
+ int op_errno, dict_t *xattr, dict_t *xdata);
int
-gf_defrag_start_detach_tier (gf_defrag_info_t *defrag);
+dht_common_xattrop_cbk(call_frame_t *frame, void *cookie, xlator_t *this,
+ int32_t op_ret, int32_t op_errno, dict_t *dict,
+ dict_t *xdata);
+int
+gf_defrag_status_get(dht_conf_t *conf, dict_t *dict);
int
-gf_defrag_stop (gf_defrag_info_t *defrag, gf_defrag_status_t status,
- dict_t *output);
+gf_defrag_stop(dht_conf_t *conf, gf_defrag_status_t status, dict_t *output);
-void*
-gf_defrag_start (void *this);
+void *
+gf_defrag_start(void *this);
int32_t
-gf_defrag_handle_hardlink (xlator_t *this, loc_t *loc, dict_t *xattrs,
- struct iatt *stbuf);
+gf_defrag_handle_hardlink(xlator_t *this, loc_t *loc, int *fop_errno);
int
-dht_migrate_file (xlator_t *this, loc_t *loc, xlator_t *from, xlator_t *to,
- int flag);
+dht_migrate_file(xlator_t *this, loc_t *loc, xlator_t *from, xlator_t *to,
+ int flag, int *fop_errno);
int
-dht_inode_ctx_layout_get (inode_t *inode, xlator_t *this,
- dht_layout_t **layout_int);
+dht_inode_ctx_layout_get(inode_t *inode, xlator_t *this,
+ dht_layout_t **layout_int);
int
-dht_inode_ctx_layout_set (inode_t *inode, xlator_t *this,
- dht_layout_t* layout_int);
+dht_inode_ctx_layout_set(inode_t *inode, xlator_t *this,
+ dht_layout_t *layout_int);
int
-dht_inode_ctx_time_update (inode_t *inode, xlator_t *this, struct iatt *stat,
- int32_t update_ctx);
-void dht_inode_ctx_time_set (inode_t *inode, xlator_t *this, struct iatt *stat);
+dht_inode_ctx_time_update(inode_t *inode, xlator_t *this, struct iatt *stat,
+ int32_t update_ctx);
+void
+dht_inode_ctx_time_set(inode_t *inode, xlator_t *this, struct iatt *stat);
-int dht_inode_ctx_get (inode_t *inode, xlator_t *this, dht_inode_ctx_t **ctx);
-int dht_inode_ctx_set (inode_t *inode, xlator_t *this, dht_inode_ctx_t *ctx);
int
-dht_dir_attr_heal (void *data);
+dht_inode_ctx_get(inode_t *inode, xlator_t *this, dht_inode_ctx_t **ctx);
int
-dht_dir_attr_heal_done (int ret, call_frame_t *sync_frame, void *data);
+dht_inode_ctx_set(inode_t *inode, xlator_t *this, dht_inode_ctx_t *ctx);
int
-dht_dir_has_layout (dict_t *xattr, char *name);
-gf_boolean_t
-dht_is_subvol_in_layout (dht_layout_t *layout, xlator_t *xlator);
+dht_dir_attr_heal(void *data);
+int
+dht_dir_attr_heal_done(int ret, call_frame_t *sync_frame, void *data);
xlator_t *
-dht_subvol_with_free_space_inodes (xlator_t *this, xlator_t *subvol,
- dht_layout_t *layout);
+dht_subvol_with_free_space_inodes(xlator_t *this, xlator_t *subvol,
+ xlator_t *ignore, dht_layout_t *layout,
+ uint64_t filesize);
xlator_t *
-dht_subvol_maxspace_nonzeroinode (xlator_t *this, xlator_t *subvol,
- dht_layout_t *layout);
+dht_subvol_maxspace_nonzeroinode(xlator_t *this, xlator_t *subvol,
+ dht_layout_t *layout);
+int
+dht_dir_has_layout(dict_t *xattr, char *name);
int
-dht_linkfile_attr_heal (call_frame_t *frame, xlator_t *this);
+dht_linkfile_attr_heal(call_frame_t *frame, xlator_t *this);
-void
-dht_layout_dump (dht_layout_t *layout, const char *prefix);
int32_t
-dht_priv_dump (xlator_t *this);
+dht_priv_dump(xlator_t *this);
int32_t
-dht_inodectx_dump (xlator_t *this, inode_t *inode);
+dht_inodectx_dump(xlator_t *this, inode_t *inode);
+
+gf_boolean_t
+dht_is_subvol_in_layout(dht_layout_t *layout, xlator_t *xlator);
int
-dht_inode_ctx_get_mig_info (xlator_t *this, inode_t *inode,
- xlator_t **src_subvol, xlator_t **dst_subvol);
+dht_inode_ctx_get_mig_info(xlator_t *this, inode_t *inode,
+ xlator_t **src_subvol, xlator_t **dst_subvol);
gf_boolean_t
-dht_mig_info_is_invalid (xlator_t *current, xlator_t *src_subvol,
- xlator_t *dst_subvol);
+dht_mig_info_is_invalid(xlator_t *current, xlator_t *src_subvol,
+ xlator_t *dst_subvol);
int
-dht_subvol_status (dht_conf_t *conf, xlator_t *subvol);
+dht_subvol_status(dht_conf_t *conf, xlator_t *subvol);
void
-dht_log_new_layout_for_dir_selfheal (xlator_t *this, loc_t *loc,
- dht_layout_t *layout);
+dht_log_new_layout_for_dir_selfheal(xlator_t *this, loc_t *loc,
+ dht_layout_t *layout);
+
int
-dht_lookup_everywhere_done (call_frame_t *frame, xlator_t *this);
+dht_layout_sort(dht_layout_t *layout);
int
-dht_fill_dict_to_avoid_unlink_of_migrating_file (dict_t *dict);
+dht_heal_full_path(void *data);
+int
+dht_heal_full_path_done(int op_ret, call_frame_t *frame, void *data);
-/* Acquire non-blocking inodelk on a list of xlators.
- *
- * @lk_array: array of lock requests lock on.
- *
- * @lk_count: number of locks in @lk_array
- *
- * @inodelk_cbk: will be called after inodelk replies are received
- *
- * @retval: -1 if stack_winding inodelk fails. 0 otherwise.
- * inodelk_cbk is called with appropriate error on errors.
- * On failure to acquire lock on all members of list, successful
- * locks are unlocked before invoking cbk.
- */
+int
+dht_layout_missing_dirs(dht_layout_t *layout);
int
-dht_nonblocking_inodelk (call_frame_t *frame, dht_lock_t **lk_array,
- int lk_count, fop_inodelk_cbk_t inodelk_cbk);
+dht_refresh_layout(call_frame_t *frame);
-/* same as dht_nonblocking_inodelk, but issues sequential blocking locks on
- * @lk_array directly. locks are issued on some order which remains same
- * for a list of xlators (irrespective of order of xlators within list).
- */
int
-dht_blocking_inodelk (call_frame_t *frame, dht_lock_t **lk_array,
- int lk_count, fop_inodelk_cbk_t inodelk_cbk);
+dht_build_parent_loc(xlator_t *this, loc_t *parent, loc_t *child,
+ int32_t *op_errno);
int32_t
-dht_unlock_inodelk (call_frame_t *frame, dht_lock_t **lk_array, int lk_count,
- fop_inodelk_cbk_t inodelk_cbk);
-
-dht_lock_t *
-dht_lock_new (xlator_t *this, xlator_t *xl, loc_t *loc, short type,
- const char *domain);
+dht_set_local_rebalance(xlator_t *this, dht_local_t *local, struct iatt *stbuf,
+ struct iatt *prebuf, struct iatt *postbuf,
+ dict_t *xdata);
void
-dht_lock_array_free (dht_lock_t **lk_array, int count);
+dht_build_root_loc(inode_t *inode, loc_t *loc);
+
+gf_boolean_t
+dht_fd_open_on_dst(xlator_t *this, fd_t *fd, xlator_t *dst);
+
+int32_t
+dht_fd_ctx_destroy(xlator_t *this, fd_t *fd);
int32_t
-dht_lock_count (dht_lock_t **lk_array, int lk_count);
+dht_release(xlator_t *this, fd_t *fd);
+
+int32_t
+dht_set_fixed_dir_stat(struct iatt *stat);
+
+xlator_t *
+dht_get_lock_subvolume(xlator_t *this, struct gf_flock *lock,
+ dht_local_t *local);
int
-dht_layout_sort (dht_layout_t *layout);
+dht_lk_inode_unref(call_frame_t *frame, int32_t op_ret);
int
-dht_heal_full_path (void *data);
+dht_fd_ctx_set(xlator_t *this, fd_t *fd, xlator_t *subvol);
int
-dht_heal_full_path_done (int op_ret, call_frame_t *frame, void *data);
+dht_check_and_open_fd_on_subvol(xlator_t *this, call_frame_t *frame);
+
+/* FD fop callbacks */
int
-dht_layout_missing_dirs (dht_layout_t *layout);
+dht_writev_cbk(call_frame_t *frame, void *cookie, xlator_t *this, int op_ret,
+ int op_errno, struct iatt *prebuf, struct iatt *postbuf,
+ dict_t *xdata);
int
-dht_refresh_layout (call_frame_t *frame);
+dht_flush_cbk(call_frame_t *frame, void *cookie, xlator_t *this, int op_ret,
+ int op_errno, dict_t *xdata);
-gf_boolean_t
-dht_is_tier_xlator (xlator_t *this);
+int
+dht_file_setattr_cbk(call_frame_t *frame, void *cookie, xlator_t *this,
+ int op_ret, int op_errno, struct iatt *prebuf,
+ struct iatt *postbuf, dict_t *xdata);
+
+int
+dht_zerofill_cbk(call_frame_t *frame, void *cookie, xlator_t *this, int op_ret,
+ int op_errno, struct iatt *prebuf, struct iatt *postbuf,
+ dict_t *xdata);
+
+int
+dht_discard_cbk(call_frame_t *frame, void *cookie, xlator_t *this, int op_ret,
+ int op_errno, struct iatt *prebuf, struct iatt *postbuf,
+ dict_t *xdata);
+
+int
+dht_fallocate_cbk(call_frame_t *frame, void *cookie, xlator_t *this, int op_ret,
+ int op_errno, struct iatt *prebuf, struct iatt *postbuf,
+ dict_t *xdata);
int
-dht_build_parent_loc (xlator_t *this, loc_t *parent, loc_t *child,
- int32_t *op_errno);
+dht_truncate_cbk(call_frame_t *frame, void *cookie, xlator_t *this, int op_ret,
+ int op_errno, struct iatt *prebuf, struct iatt *postbuf,
+ dict_t *xdata);
+
+int
+dht_fsync_cbk(call_frame_t *frame, void *cookie, xlator_t *this, int op_ret,
+ int op_errno, struct iatt *prebuf, struct iatt *postbuf,
+ dict_t *xdata);
+
+int
+dht_readv_cbk(call_frame_t *frame, void *cookie, xlator_t *this, int op_ret,
+ int op_errno, struct iovec *vector, int count, struct iatt *stbuf,
+ struct iobref *iobref, dict_t *xdata);
+
+int
+dht_file_attr_cbk(call_frame_t *frame, void *cookie, xlator_t *this, int op_ret,
+ int op_errno, struct iatt *stbuf, dict_t *xdata);
+
+int
+dht_file_removexattr_cbk(call_frame_t *frame, void *cookie, xlator_t *this,
+ int op_ret, int op_errno, dict_t *xdata);
+
+int
+dht_file_setxattr_cbk(call_frame_t *frame, void *cookie, xlator_t *this,
+ int op_ret, int op_errno, dict_t *xdata);
+
+/* All custom xattr heal functions */
+int
+dht_dir_heal_xattrs(void *data);
+
+int
+dht_dir_heal_xattrs_done(int ret, call_frame_t *sync_frame, void *data);
int32_t
-dht_set_local_rebalance (xlator_t *this, dht_local_t *local,
- struct iatt *stbuf,
- struct iatt *prebuf,
- struct iatt *postbuf, dict_t *xdata);
+dht_dict_set_array(dict_t *dict, char *key, int32_t value[], int32_t size);
+
+int
+dht_set_user_xattr(dict_t *dict, char *k, data_t *v, void *data);
+
void
-dht_build_root_loc (inode_t *inode, loc_t *loc);
+dht_dir_set_heal_xattr(xlator_t *this, dht_local_t *local, dict_t *dst,
+ dict_t *src, int *uret, int *uflag);
-gf_boolean_t
-dht_fd_open_on_dst (xlator_t *this, fd_t *fd, xlator_t *dst);
+int
+dht_dir_xattr_heal(xlator_t *this, dht_local_t *local, int *op_errno);
+
+int
+dht_common_mark_mdsxattr(call_frame_t *frame, int *errst, int flag);
+
+int
+dht_inode_ctx_mdsvol_get(inode_t *inode, xlator_t *this, xlator_t **mdsvol);
+
+int
+dht_selfheal_dir_setattr(call_frame_t *frame, loc_t *loc, struct iatt *stbuf,
+ int32_t valid, dht_layout_t *layout);
+
+/* Abstract out the DHT-IATT-IN-DICT */
+
+void
+dht_selfheal_layout_new_directory(call_frame_t *frame, loc_t *loc,
+ dht_layout_t *new_layout);
+
+int
+dht_pt_fgetxattr(call_frame_t *frame, xlator_t *this, fd_t *fd, const char *key,
+ dict_t *xdata);
+
+int
+dht_pt_getxattr(call_frame_t *frame, xlator_t *this, loc_t *loc,
+ const char *key, dict_t *xdata);
int32_t
-dht_fd_ctx_destroy (xlator_t *this, fd_t *fd);
+dht_pt_mkdir(call_frame_t *frame, xlator_t *this, loc_t *loc, mode_t mode,
+ mode_t umask, dict_t *xdata);
+
+int
+dht_pt_rename(call_frame_t *frame, xlator_t *this, loc_t *oldloc, loc_t *newloc,
+ dict_t *xdata);
int32_t
-dht_release (xlator_t *this, fd_t *fd);
-#endif/* _DHT_H */
+dht_check_remote_fd_failed_error(dht_local_t *local, int op_ret, int op_errno);
+
+int
+dht_common_xattrop_cbk(call_frame_t *frame, void *cookie, xlator_t *this,
+ int32_t op_ret, int32_t op_errno, dict_t *dict,
+ dict_t *xdata);
+
+int32_t
+dht_create_lock(call_frame_t *frame, xlator_t *subvol);
+
+int
+dht_set_parent_layout_in_dict(loc_t *loc, xlator_t *this, dht_local_t *local);
+
+int
+dht_dir_layout_error_check(xlator_t *this, inode_t *inode);
+
+int
+dht_inode_ctx_mdsvol_set(inode_t *inode, xlator_t *this, xlator_t *mds_subvol);
+#endif /* _DHT_H */
diff --git a/xlators/cluster/dht/src/dht-diskusage.c b/xlators/cluster/dht/src/dht-diskusage.c
index 1eb9e63c531..c0588828fdb 100644
--- a/xlators/cluster/dht/src/dht-diskusage.c
+++ b/xlators/cluster/dht/src/dht-diskusage.c
@@ -8,453 +8,480 @@
cases as published by the Free Software Foundation.
*/
-
/* TODO: add NS locking */
-#include "glusterfs.h"
-#include "xlator.h"
#include "dht-common.h"
-#include "dht-messages.h"
-#include "defaults.h"
#include <sys/time.h>
-
+#include <glusterfs/events.h>
int
-dht_du_info_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
- int op_ret, int op_errno, struct statvfs *statvfs,
- dict_t *xdata)
+dht_du_info_cbk(call_frame_t *frame, void *cookie, xlator_t *this, int op_ret,
+ int op_errno, struct statvfs *statvfs, dict_t *xdata)
{
- dht_conf_t *conf = NULL;
- call_frame_t *prev = NULL;
- int this_call_cnt = 0;
- int i = 0;
- double percent = 0;
- double percent_inodes = 0;
- uint64_t bytes = 0;
- uint32_t bpc; /* blocks per chunk */
- uint32_t chunks = 0;
-
- conf = this->private;
- prev = cookie;
-
- if (op_ret == -1) {
- gf_msg (this->name, GF_LOG_WARNING, op_errno,
- DHT_MSG_GET_DISK_INFO_ERROR,
- "failed to get disk info from %s", prev->this->name);
- goto out;
- }
-
- if (statvfs && statvfs->f_blocks) {
- percent = (statvfs->f_bavail * 100) / statvfs->f_blocks;
- bytes = (statvfs->f_bavail * statvfs->f_frsize);
- /*
- * A 32-bit count of 1MB chunks allows a maximum brick size of
- * ~4PB. It's possible that we could see a single local FS
- * bigger than that some day, but this code is likely to be
- * irrelevant by then. Meanwhile, it's more important to keep
- * the chunk size small so the layout-calculation code that
- * uses this value can be tested on normal machines.
- */
- bpc = (1 << 20) / statvfs->f_bsize;
- chunks = (statvfs->f_blocks + bpc - 1) / bpc;
- }
-
- if (statvfs && statvfs->f_files) {
- percent_inodes = (statvfs->f_ffree * 100) / statvfs->f_files;
- } else {
- /*
- * Set percent inodes to 100 for dynamically allocated inode
- * filesystems. The rationale is that distribute need not
- * worry about total inodes; rather, let the 'create()' be
- * scheduled on the hashed subvol regardless of the total
- * inodes.
- */
- percent_inodes = 100;
- }
-
- LOCK (&conf->subvolume_lock);
- {
- for (i = 0; i < conf->subvolume_cnt; i++)
- if (prev->this == conf->subvolumes[i]) {
- conf->du_stats[i].avail_percent = percent;
- conf->du_stats[i].avail_space = bytes;
- conf->du_stats[i].avail_inodes = percent_inodes;
- conf->du_stats[i].chunks = chunks;
- gf_msg_debug (this->name, 0,
- "subvolume '%s': avail_percent "
- "is: %.2f and avail_space "
- "is: %" PRIu64" and avail_inodes"
- " is: %.2f",
- prev->this->name,
- conf->du_stats[i].avail_percent,
- conf->du_stats[i].avail_space,
- conf->du_stats[i].avail_inodes);
- break; /* no point in looping further */
- }
- }
- UNLOCK (&conf->subvolume_lock);
+ dht_conf_t *conf = NULL;
+ xlator_t *prev = NULL;
+ int this_call_cnt = 0;
+ int i = 0;
+ double percent = 0;
+ double percent_inodes = 0;
+ uint64_t bytes = 0;
+ uint32_t bpc; /* blocks per chunk */
+ uint32_t chunks = 0;
+
+ conf = this->private;
+ prev = cookie;
+
+ if (op_ret == -1 || !statvfs) {
+ gf_msg(this->name, GF_LOG_WARNING, op_errno,
+ DHT_MSG_GET_DISK_INFO_ERROR, "failed to get disk info from %s",
+ prev->name);
+ goto out;
+ }
+
+ if (statvfs->f_blocks) {
+ percent = (statvfs->f_bavail * 100) / statvfs->f_blocks;
+ bytes = (statvfs->f_bavail * statvfs->f_frsize);
+ /*
+ * A 32-bit count of 1MB chunks allows a maximum brick size of
+ * ~4PB. It's possible that we could see a single local FS
+ * bigger than that some day, but this code is likely to be
+ * irrelevant by then. Meanwhile, it's more important to keep
+ * the chunk size small so the layout-calculation code that
+ * uses this value can be tested on normal machines.
+ */
+ bpc = (1 << 20) / statvfs->f_bsize;
+ chunks = (statvfs->f_blocks + bpc - 1) / bpc;
+ }
+
+ if (statvfs->f_files) {
+ percent_inodes = (statvfs->f_ffree * 100) / statvfs->f_files;
+ } else {
+ /*
+ * Set percent inodes to 100 for dynamically allocated inode
+ * filesystems. The rationale is that distribute need not
+ * worry about total inodes; rather, let the 'create()' be
+ * scheduled on the hashed subvol regardless of the total
+ * inodes.
+ */
+ percent_inodes = 100;
+ }
+
+ LOCK(&conf->subvolume_lock);
+ {
+ for (i = 0; i < conf->subvolume_cnt; i++)
+ if (prev == conf->subvolumes[i]) {
+ conf->du_stats[i].avail_percent = percent;
+ conf->du_stats[i].avail_space = bytes;
+ conf->du_stats[i].avail_inodes = percent_inodes;
+ conf->du_stats[i].chunks = chunks;
+ conf->du_stats[i].total_blocks = statvfs->f_blocks;
+ conf->du_stats[i].avail_blocks = statvfs->f_bavail;
+ conf->du_stats[i].frsize = statvfs->f_frsize;
+
+ gf_msg_debug(this->name, 0,
+ "subvolume '%s': avail_percent "
+ "is: %.2f and avail_space "
+ "is: %" PRIu64
+ " and avail_inodes"
+ " is: %.2f",
+ prev->name, conf->du_stats[i].avail_percent,
+ conf->du_stats[i].avail_space,
+ conf->du_stats[i].avail_inodes);
+ break; /* no point in looping further */
+ }
+ }
+ UNLOCK(&conf->subvolume_lock);
out:
- this_call_cnt = dht_frame_return (frame);
- if (is_last_call (this_call_cnt))
- DHT_STACK_DESTROY (frame);
+ this_call_cnt = dht_frame_return(frame);
+ if (is_last_call(this_call_cnt))
+ DHT_STACK_DESTROY(frame);
- return 0;
+ return 0;
}
int
-dht_get_du_info_for_subvol (xlator_t *this, int subvol_idx)
+dht_get_du_info_for_subvol(xlator_t *this, int subvol_idx)
{
- dht_conf_t *conf = NULL;
- call_frame_t *statfs_frame = NULL;
- dht_local_t *statfs_local = NULL;
- call_pool_t *pool = NULL;
- loc_t tmp_loc = {0,};
-
- conf = this->private;
- pool = this->ctx->pool;
-
- statfs_frame = create_frame (this, pool);
- if (!statfs_frame) {
- goto err;
- }
-
- /* local->fop value is not used in this case */
- statfs_local = dht_local_init (statfs_frame, NULL, NULL,
- GF_FOP_MAXVALUE);
- if (!statfs_local) {
- goto err;
- }
-
- /* make it root gfid, should be enough to get the proper info back */
- tmp_loc.gfid[15] = 1;
-
- statfs_local->call_cnt = 1;
- STACK_WIND (statfs_frame, dht_du_info_cbk,
- conf->subvolumes[subvol_idx],
- conf->subvolumes[subvol_idx]->fops->statfs,
- &tmp_loc, NULL);
-
- return 0;
+ dht_conf_t *conf = NULL;
+ call_frame_t *statfs_frame = NULL;
+ dht_local_t *statfs_local = NULL;
+ call_pool_t *pool = NULL;
+ loc_t tmp_loc = {
+ 0,
+ };
+
+ conf = this->private;
+ pool = this->ctx->pool;
+
+ statfs_frame = create_frame(this, pool);
+ if (!statfs_frame) {
+ goto err;
+ }
+
+ /* local->fop value is not used in this case */
+ statfs_local = dht_local_init(statfs_frame, NULL, NULL, GF_FOP_MAXVALUE);
+ if (!statfs_local) {
+ goto err;
+ }
+
+ /* make it root gfid, should be enough to get the proper info back */
+ tmp_loc.gfid[15] = 1;
+
+ statfs_local->call_cnt = 1;
+ STACK_WIND_COOKIE(
+ statfs_frame, dht_du_info_cbk, conf->subvolumes[subvol_idx],
+ conf->subvolumes[subvol_idx],
+ conf->subvolumes[subvol_idx]->fops->statfs, &tmp_loc, NULL);
+
+ return 0;
err:
- if (statfs_frame)
- DHT_STACK_DESTROY (statfs_frame);
+ if (statfs_frame)
+ DHT_STACK_DESTROY(statfs_frame);
- return -1;
+ return -1;
}
int
-dht_get_du_info (call_frame_t *frame, xlator_t *this, loc_t *loc)
+dht_get_du_info(call_frame_t *frame, xlator_t *this, loc_t *loc)
{
- int i = 0;
- int ret = -1;
- dht_conf_t *conf = NULL;
- call_frame_t *statfs_frame = NULL;
- dht_local_t *statfs_local = NULL;
- struct timeval tv = {0,};
- loc_t tmp_loc = {0,};
-
- conf = this->private;
-
- gettimeofday (&tv, NULL);
-
- /* make it root gfid, should be enough to get the proper
- info back */
- tmp_loc.gfid[15] = 1;
-
- if (tv.tv_sec > (conf->refresh_interval
- + conf->last_stat_fetch.tv_sec)) {
-
- statfs_frame = copy_frame (frame);
- if (!statfs_frame) {
- goto err;
- }
-
- /* In this case, 'local->fop' is not used */
- statfs_local = dht_local_init (statfs_frame, loc, NULL,
- GF_FOP_MAXVALUE);
- if (!statfs_local) {
- goto err;
- }
-
- statfs_local->params = dict_new ();
- if (!statfs_local->params)
- goto err;
-
- ret = dict_set_int8 (statfs_local->params,
- GF_INTERNAL_IGNORE_DEEM_STATFS, 1);
- if (ret) {
- gf_msg (this->name, GF_LOG_ERROR, 0,
- DHT_MSG_DICT_SET_FAILED,
- "Failed to set "
- GF_INTERNAL_IGNORE_DEEM_STATFS" in dict");
- goto err;
- }
+ int i = 0;
+ int ret = -1;
+ dht_conf_t *conf = NULL;
+ call_frame_t *statfs_frame = NULL;
+ dht_local_t *statfs_local = NULL;
+ loc_t tmp_loc = {
+ 0,
+ };
+ time_t now;
+
+ conf = this->private;
+ now = gf_time();
+ /* make it root gfid, should be enough to get the proper
+ info back */
+ tmp_loc.gfid[15] = 1;
+
+ if (now > (conf->refresh_interval + conf->last_stat_fetch)) {
+ statfs_frame = copy_frame(frame);
+ if (!statfs_frame) {
+ goto err;
+ }
+
+ /* In this case, 'local->fop' is not used */
+ statfs_local = dht_local_init(statfs_frame, loc, NULL, GF_FOP_MAXVALUE);
+ if (!statfs_local) {
+ goto err;
+ }
+
+ statfs_local->params = dict_new();
+ if (!statfs_local->params)
+ goto err;
- statfs_local->call_cnt = conf->subvolume_cnt;
- for (i = 0; i < conf->subvolume_cnt; i++) {
- STACK_WIND (statfs_frame, dht_du_info_cbk,
- conf->subvolumes[i],
- conf->subvolumes[i]->fops->statfs,
- &tmp_loc, statfs_local->params);
- }
-
- conf->last_stat_fetch.tv_sec = tv.tv_sec;
- }
- return 0;
+ ret = dict_set_int8(statfs_local->params,
+ GF_INTERNAL_IGNORE_DEEM_STATFS, 1);
+ if (ret) {
+ gf_msg(this->name, GF_LOG_ERROR, 0, DHT_MSG_DICT_SET_FAILED,
+ "Failed to set " GF_INTERNAL_IGNORE_DEEM_STATFS " in dict");
+ goto err;
+ }
+
+ statfs_local->call_cnt = conf->subvolume_cnt;
+ for (i = 0; i < conf->subvolume_cnt; i++) {
+ STACK_WIND_COOKIE(statfs_frame, dht_du_info_cbk,
+ conf->subvolumes[i], conf->subvolumes[i],
+ conf->subvolumes[i]->fops->statfs, &tmp_loc,
+ statfs_local->params);
+ }
+
+ conf->last_stat_fetch = now;
+ }
+ return 0;
err:
- if (statfs_frame)
- DHT_STACK_DESTROY (statfs_frame);
+ if (statfs_frame)
+ DHT_STACK_DESTROY(statfs_frame);
- return -1;
+ return -1;
}
-
gf_boolean_t
-dht_is_subvol_filled (xlator_t *this, xlator_t *subvol)
+dht_is_subvol_filled(xlator_t *this, xlator_t *subvol)
{
- int i = 0;
- dht_conf_t *conf = NULL;
- gf_boolean_t subvol_filled_inodes = _gf_false;
- gf_boolean_t subvol_filled_space = _gf_false;
- gf_boolean_t is_subvol_filled = _gf_false;
-
- conf = this->private;
-
- /* Check for values above specified percent or free disk */
- LOCK (&conf->subvolume_lock);
- {
- for (i = 0; i < conf->subvolume_cnt; i++) {
- if (subvol == conf->subvolumes[i]) {
- if (conf->disk_unit == 'p') {
- if (conf->du_stats[i].avail_percent <
- conf->min_free_disk) {
- subvol_filled_space = _gf_true;
- break;
- }
-
- } else {
- if (conf->du_stats[i].avail_space <
- conf->min_free_disk) {
- subvol_filled_space = _gf_true;
- break;
- }
- }
- if (conf->du_stats[i].avail_inodes <
- conf->min_free_inodes) {
- subvol_filled_inodes = _gf_true;
- break;
- }
- }
- }
- }
- UNLOCK (&conf->subvolume_lock);
-
- if (subvol_filled_space && conf->subvolume_status[i]) {
- if (!(conf->du_stats[i].log++ % (GF_UNIVERSAL_ANSWER * 10))) {
- gf_msg (this->name, GF_LOG_WARNING, 0,
- DHT_MSG_SUBVOL_INSUFF_SPACE,
- "disk space on subvolume '%s' is getting "
- "full (%.2f %%), consider adding more bricks",
- subvol->name,
- (100 - conf->du_stats[i].avail_percent));
- }
- }
-
- if (subvol_filled_inodes && conf->subvolume_status[i]) {
- if (!(conf->du_stats[i].log++ % (GF_UNIVERSAL_ANSWER * 10))) {
- gf_msg (this->name, GF_LOG_CRITICAL, 0,
- DHT_MSG_SUBVOL_INSUFF_INODES,
- "inodes on subvolume '%s' are at "
- "(%.2f %%), consider adding more bricks",
- subvol->name,
- (100 - conf->du_stats[i].avail_inodes));
- }
- }
-
- is_subvol_filled = (subvol_filled_space || subvol_filled_inodes);
-
- return is_subvol_filled;
-}
+ int i = 0;
+ char vol_name[256];
+ dht_conf_t *conf = NULL;
+ gf_boolean_t subvol_filled_inodes = _gf_false;
+ gf_boolean_t subvol_filled_space = _gf_false;
+ gf_boolean_t is_subvol_filled = _gf_false;
+ double usage = 0;
+
+ conf = this->private;
+
+ /* Check for values above specified percent or free disk */
+ LOCK(&conf->subvolume_lock);
+ {
+ for (i = 0; i < conf->subvolume_cnt; i++) {
+ if (subvol == conf->subvolumes[i]) {
+ if (conf->disk_unit == 'p') {
+ if (conf->du_stats[i].avail_percent < conf->min_free_disk) {
+ subvol_filled_space = _gf_true;
+ break;
+ }
+
+ } else {
+ if (conf->du_stats[i].avail_space < conf->min_free_disk) {
+ subvol_filled_space = _gf_true;
+ break;
+ }
+ }
+ if (conf->du_stats[i].avail_inodes < conf->min_free_inodes) {
+ subvol_filled_inodes = _gf_true;
+ break;
+ }
+ }
+ }
+ }
+ UNLOCK(&conf->subvolume_lock);
+ if (subvol_filled_space && conf->subvolume_status[i]) {
+ if (!(conf->du_stats[i].log++ % (GF_UNIVERSAL_ANSWER * 10))) {
+ usage = 100 - conf->du_stats[i].avail_percent;
-/*Get the best subvolume to create the file in*/
-xlator_t *
-dht_free_disk_available_subvol (xlator_t *this, xlator_t *subvol,
- dht_local_t *local)
-{
- xlator_t *avail_subvol = NULL;
- dht_conf_t *conf = NULL;
- dht_layout_t *layout = NULL;
- loc_t *loc = NULL;
+ gf_msg(this->name, GF_LOG_WARNING, 0, DHT_MSG_SUBVOL_INSUFF_SPACE,
+ "disk space on subvolume '%s' is getting "
+ "full (%.2f %%), consider adding more bricks",
+ subvol->name, usage);
- conf = this->private;
- if (!local)
- goto out;
- loc = &local->loc;
- if (!local->layout) {
- layout = dht_layout_get (this, loc->parent);
-
- if (!layout) {
- gf_msg_debug (this->name, 0,
- "Missing layout. path=%s,"
- " parent gfid = %s", loc->path,
- uuid_utoa (loc->parent->gfid));
- goto out;
- }
- } else {
- layout = dht_layout_ref (this, local->layout);
+ (void)snprintf(vol_name, sizeof(vol_name), "%s", this->name);
+ vol_name[(strlen(this->name) - 4)] = '\0';
+
+ gf_event(EVENT_DHT_DISK_USAGE, "volume=%s;subvol=%s;usage=%.2f %%",
+ vol_name, subvol->name, usage);
+ }
+ }
+
+ if (subvol_filled_inodes && conf->subvolume_status[i]) {
+ if (!(conf->du_stats[i].log++ % (GF_UNIVERSAL_ANSWER * 10))) {
+ usage = 100 - conf->du_stats[i].avail_inodes;
+ gf_msg(this->name, GF_LOG_CRITICAL, 0, DHT_MSG_SUBVOL_INSUFF_INODES,
+ "inodes on subvolume '%s' are at "
+ "(%.2f %%), consider adding more bricks",
+ subvol->name, usage);
+
+ (void)snprintf(vol_name, sizeof(vol_name), "%s", this->name);
+ vol_name[(strlen(this->name) - 4)] = '\0';
+
+ gf_event(EVENT_DHT_INODES_USAGE,
+ "volume=%s;subvol=%s;usage=%.2f %%", vol_name,
+ subvol->name, usage);
}
+ }
- LOCK (&conf->subvolume_lock);
- {
- avail_subvol = dht_subvol_with_free_space_inodes(this, subvol,
- layout);
- if(!avail_subvol)
- {
- avail_subvol = dht_subvol_maxspace_nonzeroinode(this,
- subvol,
- layout);
- }
+ is_subvol_filled = (subvol_filled_space || subvol_filled_inodes);
- }
- UNLOCK (&conf->subvolume_lock);
+ return is_subvol_filled;
+}
+
+/*Get the best subvolume to create the file in*/
+xlator_t *
+dht_free_disk_available_subvol(xlator_t *this, xlator_t *subvol,
+ dht_local_t *local)
+{
+ xlator_t *avail_subvol = NULL;
+ dht_conf_t *conf = NULL;
+ dht_layout_t *layout = NULL;
+ loc_t *loc = NULL;
+
+ conf = this->private;
+ if (!local)
+ goto out;
+ loc = &local->loc;
+ if (!local->layout) {
+ layout = dht_layout_get(this, loc->parent);
+
+ if (!layout) {
+ gf_msg_debug(this->name, 0,
+ "Missing layout. path=%s,"
+ " parent gfid = %s",
+ loc->path, uuid_utoa(loc->parent->gfid));
+ goto out;
+ }
+ } else {
+ layout = dht_layout_ref(this, local->layout);
+ }
+
+ LOCK(&conf->subvolume_lock);
+ {
+ avail_subvol = dht_subvol_with_free_space_inodes(this, subvol, NULL,
+ layout, 0);
+ if (!avail_subvol) {
+ avail_subvol = dht_subvol_maxspace_nonzeroinode(this, subvol,
+ layout);
+ }
+ }
+ UNLOCK(&conf->subvolume_lock);
out:
- if (!avail_subvol) {
- gf_msg_debug (this->name, 0,
- "No subvolume has enough free space \
+ if (!avail_subvol) {
+ gf_msg_debug(this->name, 0,
+ "No subvolume has enough free space \
and/or inodes to create");
- avail_subvol = subvol;
- }
+ avail_subvol = subvol;
+ }
- if (layout)
- dht_layout_unref (this, layout);
- return avail_subvol;
+ if (layout)
+ dht_layout_unref(this, layout);
+ return avail_subvol;
}
-static inline
-int32_t dht_subvol_has_err (dht_conf_t *conf, xlator_t *this,
- dht_layout_t *layout)
+static inline int32_t
+dht_subvol_has_err(dht_conf_t *conf, xlator_t *this, xlator_t *ignore,
+ dht_layout_t *layout)
{
- int ret = -1;
- int i = 0;
-
- if (!this || !layout)
- goto out;
-
- /* check if subvol has layout errors, before selecting it */
- for (i = 0; i < layout->cnt; i++) {
- if (!strcmp (layout->list[i].xlator->name, this->name) &&
- (layout->list[i].err != 0)) {
- ret = -1;
- goto out;
- }
+ int ret = -1;
+ int i = 0;
+
+ if (!this || !layout)
+ goto out;
+
+ /* this check is meant for rebalance process. The source of the file
+ * should be ignored for space check */
+ if (this == ignore) {
+ goto out;
+ }
+
+ /* check if subvol has layout errors, before selecting it */
+ for (i = 0; i < layout->cnt; i++) {
+ if (!strcmp(layout->list[i].xlator->name, this->name) &&
+ (layout->list[i].err != 0)) {
+ ret = -1;
+ goto out;
}
+ }
- /* discard decommissioned subvol */
- if (conf->decommission_subvols_cnt) {
- for (i = 0; i < conf->subvolume_cnt; i++) {
- if (conf->decommissioned_bricks[i] &&
- conf->decommissioned_bricks[i] == this) {
- ret = -1;
- goto out;
- }
- }
+ /* discard decommissioned subvol */
+ if (conf->decommission_subvols_cnt) {
+ for (i = 0; i < conf->subvolume_cnt; i++) {
+ if (conf->decommissioned_bricks[i] &&
+ conf->decommissioned_bricks[i] == this) {
+ ret = -1;
+ goto out;
+ }
}
+ }
- ret = 0;
+ ret = 0;
out:
- return ret;
+ return ret;
}
/*Get subvolume which has both space and inodes more than the min criteria*/
xlator_t *
dht_subvol_with_free_space_inodes(xlator_t *this, xlator_t *subvol,
- dht_layout_t *layout)
+ xlator_t *ignore, dht_layout_t *layout,
+ uint64_t filesize)
{
- int i = 0;
- double max = 0;
- double max_inodes = 0;
- int ignore_subvol = 0;
-
- xlator_t *avail_subvol = NULL;
- dht_conf_t *conf = NULL;
-
- conf = this->private;
-
- for(i=0; i < conf->subvolume_cnt; i++) {
- /* check if subvol has layout errors and also it is not a
- * decommissioned brick, before selecting it */
- ignore_subvol = dht_subvol_has_err (conf, conf->subvolumes[i],
- layout);
- if (ignore_subvol)
- continue;
-
- if ((conf->disk_unit == 'p') &&
- (conf->du_stats[i].avail_percent > conf->min_free_disk) &&
- (conf->du_stats[i].avail_inodes > conf->min_free_inodes)) {
- if ((conf->du_stats[i].avail_inodes > max_inodes) ||
- (conf->du_stats[i].avail_percent > max)) {
- max = conf->du_stats[i].avail_percent;
- max_inodes = conf->du_stats[i].avail_inodes;
- avail_subvol = conf->subvolumes[i];
- }
- }
+ int i = 0;
+ double max = 0;
+ double max_inodes = 0;
+ int ignore_subvol = 0;
+ uint64_t total_blocks = 0;
+ uint64_t avail_blocks = 0;
+ uint64_t frsize = 0;
+ double post_availspace = 0;
+ double post_percent = 0;
+
+ xlator_t *avail_subvol = NULL;
+ dht_conf_t *conf = NULL;
+
+ conf = this->private;
+
+ for (i = 0; i < conf->subvolume_cnt; i++) {
+ /* check if subvol has layout errors and also it is not a
+ * decommissioned brick, before selecting it */
+ ignore_subvol = dht_subvol_has_err(conf, conf->subvolumes[i], ignore,
+ layout);
+ if (ignore_subvol)
+ continue;
+
+ if ((conf->disk_unit == 'p') &&
+ (conf->du_stats[i].avail_percent > conf->min_free_disk) &&
+ (conf->du_stats[i].avail_inodes > conf->min_free_inodes)) {
+ if ((conf->du_stats[i].avail_inodes > max_inodes) ||
+ (conf->du_stats[i].avail_percent > max)) {
+ max = conf->du_stats[i].avail_percent;
+ max_inodes = conf->du_stats[i].avail_inodes;
+ avail_subvol = conf->subvolumes[i];
+ total_blocks = conf->du_stats[i].total_blocks;
+ avail_blocks = conf->du_stats[i].avail_blocks;
+ frsize = conf->du_stats[i].frsize;
+ }
+ }
- if ((conf->disk_unit != 'p') &&
- (conf->du_stats[i].avail_space > conf->min_free_disk) &&
- (conf->du_stats[i].avail_inodes > conf->min_free_inodes)) {
- if ((conf->du_stats[i].avail_inodes > max_inodes) ||
- (conf->du_stats[i].avail_space > max)) {
- max = conf->du_stats[i].avail_space;
- max_inodes = conf->du_stats[i].avail_inodes;
- avail_subvol = conf->subvolumes[i];
- }
- }
+ if ((conf->disk_unit != 'p') &&
+ (conf->du_stats[i].avail_space > conf->min_free_disk) &&
+ (conf->du_stats[i].avail_inodes > conf->min_free_inodes)) {
+ if ((conf->du_stats[i].avail_inodes > max_inodes) ||
+ (conf->du_stats[i].avail_space > max)) {
+ max = conf->du_stats[i].avail_space;
+ max_inodes = conf->du_stats[i].avail_inodes;
+ avail_subvol = conf->subvolumes[i];
+ }
+ }
+ }
+
+ if (avail_subvol) {
+ if (conf->disk_unit == 'p') {
+ post_availspace = (avail_blocks * frsize) - filesize;
+ post_percent = (post_availspace * 100) / (total_blocks * frsize);
+ if (post_percent < conf->min_free_disk)
+ avail_subvol = NULL;
}
+ if (conf->disk_unit != 'p') {
+ if ((max - filesize) < conf->min_free_disk)
+ avail_subvol = NULL;
+ }
+ }
- return avail_subvol;
+ return avail_subvol;
}
-
-/* Get subvol which has atleast one inode and maximum space */
+/* Get subvol which has at least one inode and maximum space */
xlator_t *
-dht_subvol_maxspace_nonzeroinode (xlator_t *this, xlator_t *subvol,
- dht_layout_t *layout)
+dht_subvol_maxspace_nonzeroinode(xlator_t *this, xlator_t *subvol,
+ dht_layout_t *layout)
{
- int i = 0;
- double max = 0;
- int ignore_subvol = 0;
-
- xlator_t *avail_subvol = NULL;
- dht_conf_t *conf = NULL;
-
- conf = this->private;
-
- for (i = 0; i < conf->subvolume_cnt; i++) {
- /* check if subvol has layout errors and also it is not a
- * decommissioned brick, before selecting it*/
-
- ignore_subvol = dht_subvol_has_err (conf, conf->subvolumes[i],
- layout);
- if (ignore_subvol)
- continue;
-
- if (conf->disk_unit == 'p') {
- if ((conf->du_stats[i].avail_percent > max)
- && (conf->du_stats[i].avail_inodes > 0 )) {
- max = conf->du_stats[i].avail_percent;
- avail_subvol = conf->subvolumes[i];
- }
- } else {
- if ((conf->du_stats[i].avail_space > max)
- && (conf->du_stats[i].avail_inodes > 0)) {
- max = conf->du_stats[i].avail_space;
- avail_subvol = conf->subvolumes[i];
- }
- }
+ int i = 0;
+ double max = 0;
+ int ignore_subvol = 0;
+
+ xlator_t *avail_subvol = NULL;
+ dht_conf_t *conf = NULL;
+
+ conf = this->private;
+
+ for (i = 0; i < conf->subvolume_cnt; i++) {
+ /* check if subvol has layout errors and also it is not a
+ * decommissioned brick, before selecting it*/
+
+ ignore_subvol = dht_subvol_has_err(conf, conf->subvolumes[i], NULL,
+ layout);
+ if (ignore_subvol)
+ continue;
+
+ if (conf->disk_unit == 'p') {
+ if ((conf->du_stats[i].avail_percent > max) &&
+ (conf->du_stats[i].avail_inodes > 0)) {
+ max = conf->du_stats[i].avail_percent;
+ avail_subvol = conf->subvolumes[i];
+ }
+ } else {
+ if ((conf->du_stats[i].avail_space > max) &&
+ (conf->du_stats[i].avail_inodes > 0)) {
+ max = conf->du_stats[i].avail_space;
+ avail_subvol = conf->subvolumes[i];
+ }
}
+ }
- return avail_subvol;
+ return avail_subvol;
}
diff --git a/xlators/cluster/dht/src/dht-hashfn.c b/xlators/cluster/dht/src/dht-hashfn.c
index 66e3ede736b..acda67c312a 100644
--- a/xlators/cluster/dht/src/dht-hashfn.c
+++ b/xlators/cluster/dht/src/dht-hashfn.c
@@ -8,92 +8,103 @@
cases as published by the Free Software Foundation.
*/
-
-#include "glusterfs.h"
-#include "xlator.h"
#include "dht-common.h"
-#include "hashfn.h"
-
+#include <glusterfs/hashfn.h>
-int
-dht_hash_compute_internal (int type, const char *name, uint32_t *hash_p)
+static int
+dht_hash_compute_internal(int type, const char *name, const int len,
+ uint32_t *hash_p)
{
- int ret = 0;
- uint32_t hash = 0;
+ int ret = 0;
+ uint32_t hash = 0;
- switch (type) {
+ switch (type) {
case DHT_HASH_TYPE_DM:
case DHT_HASH_TYPE_DM_USER:
- hash = gf_dm_hashfn (name, strlen (name));
- break;
+ hash = gf_dm_hashfn(name, len);
+ break;
default:
- ret = -1;
- break;
- }
+ ret = -1;
+ break;
+ }
- if (ret == 0) {
- *hash_p = hash;
- }
+ if (ret == 0) {
+ *hash_p = hash;
+ }
- return ret;
+ return ret;
}
-
-static
-gf_boolean_t
-dht_munge_name (const char *original, char *modified, size_t len, regex_t *re)
+/* The function returns:
+ * 0 : in case no munge took place
+ * >0 : the length (inc. terminating NULL!) of the newly modified string,
+ * if it was munged.
+ */
+static int
+dht_munge_name(const char *original, char *modified, size_t len, regex_t *re)
{
- regmatch_t matches[2];
- size_t new_len;
-
- if (regexec(re,original,2,matches,0) != REG_NOMATCH) {
- if (matches[1].rm_so != -1) {
- new_len = matches[1].rm_eo - matches[1].rm_so;
- /* Equal would fail due to the NUL at the end. */
- if (new_len < len) {
- memcpy (modified,original+matches[1].rm_so,
- new_len);
- modified[new_len] = '\0';
- return _gf_true;
- }
- }
+ regmatch_t matches[2] = {
+ {0},
+ };
+ size_t new_len = 0;
+ int ret = 0;
+
+ ret = regexec(re, original, 2, matches, 0);
+
+ if (ret != REG_NOMATCH) {
+ if (matches[1].rm_so != -1) {
+ new_len = matches[1].rm_eo - matches[1].rm_so;
+ /* Equal would fail due to the NUL at the end. */
+ if (new_len < len) {
+ memcpy(modified, original + matches[1].rm_so, new_len);
+ modified[new_len] = '\0';
+ return new_len + 1; /* +1 for the terminating NULL */
+ }
}
+ }
- /* This is guaranteed safe because of how the dest was allocated. */
- strcpy(modified,original);
- return _gf_false;
+ /* This is guaranteed safe because of how the dest was allocated. */
+ strcpy(modified, original);
+ return 0;
}
int
-dht_hash_compute (xlator_t *this, int type, const char *name, uint32_t *hash_p)
+dht_hash_compute(xlator_t *this, int type, const char *name, uint32_t *hash_p)
{
- char *rsync_friendly_name = NULL;
- dht_conf_t *priv = this->private;
- size_t len = 0;
- gf_boolean_t munged = _gf_false;
+ char *rsync_friendly_name = NULL;
+ dht_conf_t *priv = NULL;
+ size_t len = 0;
+ int munged = 0;
+ priv = this->private;
+
+ if (name == NULL)
+ return -1;
+
+ len = strlen(name) + 1;
+ rsync_friendly_name = alloca(len);
+
+ LOCK(&priv->lock);
+ {
if (priv->extra_regex_valid) {
- len = strlen(name) + 1;
- rsync_friendly_name = alloca(len);
- munged = dht_munge_name (name, rsync_friendly_name, len,
- &priv->extra_regex);
+ munged = dht_munge_name(name, rsync_friendly_name, len,
+ &priv->extra_regex);
}
if (!munged && priv->rsync_regex_valid) {
- len = strlen(name) + 1;
- rsync_friendly_name = alloca(len);
- gf_msg_trace (this->name, 0, "trying regex for %s", name);
- munged = dht_munge_name (name, rsync_friendly_name, len,
- &priv->rsync_regex);
- if (munged) {
- gf_msg_debug (this->name, 0,
- "munged down to %s", rsync_friendly_name);
- }
+ gf_msg_trace(this->name, 0, "trying regex for %s", name);
+ munged = dht_munge_name(name, rsync_friendly_name, len,
+ &priv->rsync_regex);
}
-
- if (!munged) {
- rsync_friendly_name = (char *)name;
- }
-
- return dht_hash_compute_internal (type, rsync_friendly_name, hash_p);
+ }
+ UNLOCK(&priv->lock);
+ if (munged) {
+ gf_msg_debug(this->name, 0, "munged down to %s", rsync_friendly_name);
+ len = munged;
+ } else {
+ rsync_friendly_name = (char *)name;
+ }
+
+ return dht_hash_compute_internal(type, rsync_friendly_name, len - 1,
+ hash_p);
}
diff --git a/xlators/cluster/dht/src/dht-helper.c b/xlators/cluster/dht/src/dht-helper.c
index 79f6ab795f2..3f2fe43d5f3 100644
--- a/xlators/cluster/dht/src/dht-helper.c
+++ b/xlators/cluster/dht/src/dht-helper.c
@@ -8,1043 +8,1228 @@
cases as published by the Free Software Foundation.
*/
-
-#include "glusterfs.h"
-#include "xlator.h"
#include "dht-common.h"
-#include "dht-helper.h"
-
+#include "dht-lock.h"
+#include "glusterfs/compat-errno.h" // for ENODATA on BSD
-void
-dht_free_fd_ctx (void *data)
+static void
+dht_free_fd_ctx(dht_fd_ctx_t *fd_ctx)
{
- dht_fd_ctx_t *fd_ctx = NULL;
-
- fd_ctx = (dht_fd_ctx_t *)data;
- GF_FREE (fd_ctx);
-
- return;
+ GF_FREE(fd_ctx);
}
-
int32_t
-dht_fd_ctx_destroy (xlator_t *this, fd_t *fd)
+dht_fd_ctx_destroy(xlator_t *this, fd_t *fd)
{
- dht_fd_ctx_t *fd_ctx = NULL;
- uint64_t value = 0;
- int32_t ret = -1;
+ dht_fd_ctx_t *fd_ctx = NULL;
+ uint64_t value = 0;
+ int32_t ret = -1;
- GF_VALIDATE_OR_GOTO ("dht", this, out);
- GF_VALIDATE_OR_GOTO (this->name, fd, out);
+ GF_VALIDATE_OR_GOTO("dht", this, out);
+ GF_VALIDATE_OR_GOTO(this->name, fd, out);
- ret = fd_ctx_del (fd, this, &value);
- if (ret) {
- goto out;
- }
+ ret = fd_ctx_del(fd, this, &value);
+ if (ret) {
+ goto out;
+ }
- fd_ctx = (dht_fd_ctx_t *)value;
- if (fd_ctx) {
- GF_REF_PUT (fd_ctx);
- }
+ fd_ctx = (dht_fd_ctx_t *)(uintptr_t)value;
+ if (fd_ctx) {
+ GF_REF_PUT(fd_ctx);
+ }
out:
- return ret;
+ return ret;
}
-
static int
-__dht_fd_ctx_set (xlator_t *this, fd_t *fd, xlator_t *dst)
+__dht_fd_ctx_set(xlator_t *this, fd_t *fd, xlator_t *dst)
{
- dht_fd_ctx_t *fd_ctx = NULL;
- uint64_t value = 0;
- int ret = -1;
+ dht_fd_ctx_t *fd_ctx = NULL;
+ uint64_t value = 0;
+ int ret = -1;
- GF_VALIDATE_OR_GOTO ("dht", this, out);
- GF_VALIDATE_OR_GOTO (this->name, fd, out);
+ GF_VALIDATE_OR_GOTO("dht", this, out);
+ GF_VALIDATE_OR_GOTO(this->name, fd, out);
- fd_ctx = GF_CALLOC (1, sizeof (*fd_ctx), gf_dht_mt_fd_ctx_t);
+ fd_ctx = GF_CALLOC(1, sizeof(*fd_ctx), gf_dht_mt_fd_ctx_t);
- if (!fd_ctx) {
- goto out;
- }
+ if (!fd_ctx) {
+ goto out;
+ }
- fd_ctx->opened_on_dst = (uint64_t) dst;
- GF_REF_INIT (fd_ctx, dht_free_fd_ctx);
+ fd_ctx->opened_on_dst = (uint64_t)(uintptr_t)dst;
+ GF_REF_INIT(fd_ctx, dht_free_fd_ctx);
- value = (uint64_t) fd_ctx;
+ value = (uint64_t)(uintptr_t)fd_ctx;
- ret = __fd_ctx_set (fd, this, value);
- if (ret < 0) {
- gf_msg (this->name, GF_LOG_WARNING, 0,
- DHT_MSG_FD_CTX_SET_FAILED,
- "Failed to set fd ctx in fd=0x%p", fd);
- GF_REF_PUT (fd_ctx);
- }
+ ret = __fd_ctx_set(fd, this, value);
+ if (ret < 0) {
+ gf_smsg(this->name, GF_LOG_WARNING, 0, DHT_MSG_FD_CTX_SET_FAILED,
+ "fd=0x%p", fd, NULL);
+ GF_REF_PUT(fd_ctx);
+ }
out:
- return ret;
+ return ret;
}
-
-
int
-dht_fd_ctx_set (xlator_t *this, fd_t *fd, xlator_t *dst)
-{
- dht_fd_ctx_t *fd_ctx = NULL;
- uint64_t value = 0;
- int ret = -1;
-
- GF_VALIDATE_OR_GOTO ("dht", this, out);
- GF_VALIDATE_OR_GOTO (this->name, fd, out);
-
- LOCK (&fd->lock);
- {
- ret = __fd_ctx_get (fd, this, &value);
- if (ret && value) {
-
- fd_ctx = (dht_fd_ctx_t *) value;
- if (fd_ctx->opened_on_dst == (uint64_t) dst) {
- /* This could happen due to racing
- * check_progress tasks*/
- goto unlock;
- } else {
- /* This would be a big problem*/
- gf_msg (this->name, GF_LOG_WARNING, 0,
- DHT_MSG_INVALID_VALUE,
- "Different dst found in the fd ctx");
-
- /* Overwrite and hope for the best*/
- fd_ctx->opened_on_dst = (uint64_t)dst;
- goto unlock;
- }
+dht_fd_ctx_set(xlator_t *this, fd_t *fd, xlator_t *dst)
+{
+ dht_fd_ctx_t *fd_ctx = NULL;
+ uint64_t value = 0;
+ int ret = -1;
+
+ GF_VALIDATE_OR_GOTO("dht", this, out);
+ GF_VALIDATE_OR_GOTO(this->name, fd, out);
+
+ LOCK(&fd->lock);
+ {
+ ret = __fd_ctx_get(fd, this, &value);
+ if (ret && value) {
+ fd_ctx = (dht_fd_ctx_t *)(uintptr_t)value;
+ if (fd_ctx->opened_on_dst == (uint64_t)(uintptr_t)dst) {
+ /* This could happen due to racing
+ * check_progress tasks*/
+ goto unlock;
+ } else {
+ /* This would be a big problem*/
+ /* Overwrite and hope for the best*/
+ fd_ctx->opened_on_dst = (uint64_t)(uintptr_t)dst;
+ UNLOCK(&fd->lock);
+ gf_smsg(this->name, GF_LOG_WARNING, 0, DHT_MSG_INVALID_VALUE,
+ NULL);
- }
- ret = __dht_fd_ctx_set (this, fd, dst);
+ goto out;
+ }
}
+ ret = __dht_fd_ctx_set(this, fd, dst);
+ }
unlock:
- UNLOCK (&fd->lock);
+ UNLOCK(&fd->lock);
out:
- return ret;
+ return ret;
}
-
-
-static
-dht_fd_ctx_t *
-dht_fd_ctx_get (xlator_t *this, fd_t *fd)
+static dht_fd_ctx_t *
+dht_fd_ctx_get(xlator_t *this, fd_t *fd)
{
- dht_fd_ctx_t *fd_ctx = NULL;
- int ret = -1;
- uint64_t tmp_val = 0;
-
- GF_VALIDATE_OR_GOTO ("dht", this, out);
- GF_VALIDATE_OR_GOTO (this->name, fd, out);
-
- LOCK (&fd->lock);
- {
- ret = __fd_ctx_get (fd, this, &tmp_val);
- if ((ret < 0) || (tmp_val == 0)) {
- UNLOCK (&fd->lock);
- goto out;
- }
+ dht_fd_ctx_t *fd_ctx = NULL;
+ int ret = -1;
+ uint64_t tmp_val = 0;
- fd_ctx = (dht_fd_ctx_t *)tmp_val;
- GF_REF_GET (fd_ctx);
+ GF_VALIDATE_OR_GOTO("dht", this, out);
+ GF_VALIDATE_OR_GOTO(this->name, fd, out);
+
+ LOCK(&fd->lock);
+ {
+ ret = __fd_ctx_get(fd, this, &tmp_val);
+ if ((ret < 0) || (tmp_val == 0)) {
+ goto unlock;
}
- UNLOCK (&fd->lock);
+
+ fd_ctx = (dht_fd_ctx_t *)(uintptr_t)tmp_val;
+ GF_REF_GET(fd_ctx);
+ }
+unlock:
+ UNLOCK(&fd->lock);
out:
- return fd_ctx;
+ return fd_ctx;
}
gf_boolean_t
-dht_fd_open_on_dst (xlator_t *this, fd_t *fd, xlator_t *dst)
+dht_fd_open_on_dst(xlator_t *this, fd_t *fd, xlator_t *dst)
{
- dht_fd_ctx_t *fd_ctx = NULL;
- gf_boolean_t opened = _gf_false;
+ dht_fd_ctx_t *fd_ctx = NULL;
+ gf_boolean_t opened = _gf_false;
- fd_ctx = dht_fd_ctx_get (this, fd);
+ fd_ctx = dht_fd_ctx_get(this, fd);
- if (fd_ctx) {
- if (fd_ctx->opened_on_dst == (uint64_t) dst) {
- opened = _gf_true;
- }
- GF_REF_PUT (fd_ctx);
+ if (fd_ctx) {
+ if (fd_ctx->opened_on_dst == (uint64_t)(uintptr_t)dst) {
+ opened = _gf_true;
}
+ GF_REF_PUT(fd_ctx);
+ }
- return opened;
+ return opened;
}
-
void
-dht_free_mig_info (void *data)
+dht_free_mig_info(void *data)
{
- dht_migrate_info_t *miginfo = NULL;
+ dht_migrate_info_t *miginfo = NULL;
- miginfo = data;
- GF_FREE (miginfo);
+ miginfo = data;
+ GF_FREE(miginfo);
- return;
+ return;
}
static int
-dht_inode_ctx_set_mig_info (xlator_t *this, inode_t *inode,
- xlator_t *src_subvol, xlator_t *dst_subvol)
+dht_inode_ctx_set_mig_info(xlator_t *this, inode_t *inode, xlator_t *src_subvol,
+ xlator_t *dst_subvol)
{
- dht_migrate_info_t *miginfo = NULL;
- uint64_t value = 0;
- int ret = -1;
+ dht_migrate_info_t *miginfo = NULL;
+ uint64_t value = 0;
+ int ret = -1;
- miginfo = GF_CALLOC (1, sizeof (*miginfo), gf_dht_mt_miginfo_t);
- if (miginfo == NULL)
- goto out;
+ miginfo = GF_CALLOC(1, sizeof(*miginfo), gf_dht_mt_miginfo_t);
+ if (miginfo == NULL)
+ goto out;
- miginfo->src_subvol = src_subvol;
- miginfo->dst_subvol = dst_subvol;
- GF_REF_INIT (miginfo, dht_free_mig_info);
+ miginfo->src_subvol = src_subvol;
+ miginfo->dst_subvol = dst_subvol;
+ GF_REF_INIT(miginfo, dht_free_mig_info);
- value = (uint64_t) miginfo;
+ value = (uint64_t)(uintptr_t)miginfo;
- ret = inode_ctx_set1 (inode, this, &value);
- if (ret < 0) {
- GF_REF_PUT (miginfo);
- }
+ ret = inode_ctx_set1(inode, this, &value);
+ if (ret < 0) {
+ GF_REF_PUT(miginfo);
+ }
out:
- return ret;
+ return ret;
}
-
int
-dht_inode_ctx_get_mig_info (xlator_t *this, inode_t *inode,
- xlator_t **src_subvol, xlator_t **dst_subvol)
+dht_inode_ctx_get_mig_info(xlator_t *this, inode_t *inode,
+ xlator_t **src_subvol, xlator_t **dst_subvol)
{
- int ret = -1;
- uint64_t tmp_miginfo = 0;
- dht_migrate_info_t *miginfo = NULL;
-
- LOCK (&inode->lock);
- {
- ret = __inode_ctx_get1 (inode, this, &tmp_miginfo);
- if ((ret < 0) || (tmp_miginfo == 0)) {
- UNLOCK (&inode->lock);
- goto out;
- }
+ int ret = -1;
+ uint64_t tmp_miginfo = 0;
+ dht_migrate_info_t *miginfo = NULL;
- miginfo = (dht_migrate_info_t *)tmp_miginfo;
- GF_REF_GET (miginfo);
+ LOCK(&inode->lock);
+ {
+ ret = __inode_ctx_get1(inode, this, &tmp_miginfo);
+ if ((ret < 0) || (tmp_miginfo == 0)) {
+ UNLOCK(&inode->lock);
+ goto out;
}
- UNLOCK (&inode->lock);
- if (src_subvol)
- *src_subvol = miginfo->src_subvol;
+ miginfo = (dht_migrate_info_t *)(uintptr_t)tmp_miginfo;
+ GF_REF_GET(miginfo);
+ }
+ UNLOCK(&inode->lock);
+
+ if (src_subvol)
+ *src_subvol = miginfo->src_subvol;
- if (dst_subvol)
- *dst_subvol = miginfo->dst_subvol;
+ if (dst_subvol)
+ *dst_subvol = miginfo->dst_subvol;
- GF_REF_PUT (miginfo);
+ GF_REF_PUT(miginfo);
out:
- return ret;
+ return ret;
}
gf_boolean_t
-dht_mig_info_is_invalid (xlator_t *current, xlator_t *src_subvol,
- xlator_t *dst_subvol)
-{
-
-/* Not set
- */
- if (!src_subvol || !dst_subvol)
- return _gf_true;
-
-/* Invalid scenarios:
- * The src_subvol does not match the subvol on which the current op was sent
- * so the cached subvol has changed between the last mig_info_set and now.
- * src_subvol == dst_subvol. The file was migrated without any FOP detecting
- * a P2 so the old dst is now the current subvol.
+dht_mig_info_is_invalid(xlator_t *current, xlator_t *src_subvol,
+ xlator_t *dst_subvol)
+{
+ /* Not set
+ */
+ if (!src_subvol || !dst_subvol)
+ return _gf_true;
+
+ /* Invalid scenarios:
+ * The src_subvol does not match the subvol on which the current op was sent
+ * so the cached subvol has changed between the last mig_info_set and now.
+ * src_subvol == dst_subvol. The file was migrated without any FOP detecting
+ * a P2 so the old dst is now the current subvol.
+ *
+ * There is still one scenario where the info could be outdated - if
+ * file has undergone multiple migrations and ends up on the same src_subvol
+ * on which the mig_info was first set.
+ */
+ if ((current == dst_subvol) || (current != src_subvol))
+ return _gf_true;
+
+ return _gf_false;
+}
+
+/* Used to check if fd fops have the fd opened on the cached subvol
+ * This is required when:
+ * 1. an fd is opened on FILE1 on subvol1
+ * 2. the file is migrated to subvol2
+ * 3. a lookup updates the cached subvol in the inode_ctx to subvol2
+ * 4. a write comes on the fd
+ * The write is sent to subvol2 on an fd which has been opened only on fd1
+ * Since the migration phase checks don't kick in, the fop fails with EBADF
*
- * There is still one scenario where the info could be outdated - if
- * file has undergone multiple migrations and ends up on the same src_subvol
- * on which the mig_info was first set.
*/
- if ((current == dst_subvol) || (current != src_subvol))
- return _gf_true;
- return _gf_false;
+int
+dht_check_and_open_fd_on_subvol_complete(int ret, call_frame_t *frame,
+ void *data)
+{
+ glusterfs_fop_t fop = 0;
+ dht_local_t *local = NULL;
+ xlator_t *subvol = NULL;
+ xlator_t *this = NULL;
+ fd_t *fd = NULL;
+ int op_errno = -1;
+
+ local = frame->local;
+ this = frame->this;
+ fop = local->fop;
+ subvol = local->cached_subvol;
+ fd = local->fd;
+
+ if (ret) {
+ op_errno = local->op_errno;
+ goto handle_err;
+ }
+
+ switch (fop) {
+ case GF_FOP_WRITE:
+ STACK_WIND_COOKIE(frame, dht_writev_cbk, subvol, subvol,
+ subvol->fops->writev, fd, local->rebalance.vector,
+ local->rebalance.count, local->rebalance.offset,
+ local->rebalance.flags, local->rebalance.iobref,
+ local->xattr_req);
+ break;
+
+ case GF_FOP_FLUSH:
+ STACK_WIND(frame, dht_flush_cbk, subvol, subvol->fops->flush, fd,
+ local->xattr_req);
+ break;
+
+ case GF_FOP_FSETATTR:
+ STACK_WIND_COOKIE(frame, dht_file_setattr_cbk, subvol, subvol,
+ subvol->fops->fsetattr, fd,
+ &local->rebalance.stbuf, local->rebalance.flags,
+ local->xattr_req);
+ break;
+
+ case GF_FOP_ZEROFILL:
+ STACK_WIND_COOKIE(frame, dht_zerofill_cbk, subvol, subvol,
+ subvol->fops->zerofill, fd,
+ local->rebalance.offset, local->rebalance.size,
+ local->xattr_req);
+
+ break;
+
+ case GF_FOP_DISCARD:
+ STACK_WIND_COOKIE(frame, dht_discard_cbk, subvol, subvol,
+ subvol->fops->discard, local->fd,
+ local->rebalance.offset, local->rebalance.size,
+ local->xattr_req);
+ break;
+
+ case GF_FOP_FALLOCATE:
+ STACK_WIND_COOKIE(frame, dht_fallocate_cbk, subvol, subvol,
+ subvol->fops->fallocate, fd,
+ local->rebalance.flags, local->rebalance.offset,
+ local->rebalance.size, local->xattr_req);
+ break;
+
+ case GF_FOP_FTRUNCATE:
+ STACK_WIND_COOKIE(frame, dht_truncate_cbk, subvol, subvol,
+ subvol->fops->ftruncate, fd,
+ local->rebalance.offset, local->xattr_req);
+ break;
+
+ case GF_FOP_FSYNC:
+ STACK_WIND_COOKIE(frame, dht_fsync_cbk, subvol, subvol,
+ subvol->fops->fsync, local->fd,
+ local->rebalance.flags, local->xattr_req);
+ break;
+
+ case GF_FOP_READ:
+ STACK_WIND(frame, dht_readv_cbk, subvol, subvol->fops->readv,
+ local->fd, local->rebalance.size,
+ local->rebalance.offset, local->rebalance.flags,
+ local->xattr_req);
+ break;
+
+ case GF_FOP_FSTAT:
+ STACK_WIND_COOKIE(frame, dht_file_attr_cbk, subvol, subvol,
+ subvol->fops->fstat, fd, local->xattr_req);
+ break;
+
+ case GF_FOP_FSETXATTR:
+ STACK_WIND_COOKIE(frame, dht_file_setxattr_cbk, subvol, subvol,
+ subvol->fops->fsetxattr, local->fd,
+ local->rebalance.xattr, local->rebalance.flags,
+ local->xattr_req);
+ break;
+
+ case GF_FOP_FREMOVEXATTR:
+ STACK_WIND_COOKIE(frame, dht_file_removexattr_cbk, subvol, subvol,
+ subvol->fops->fremovexattr, local->fd, local->key,
+ local->xattr_req);
+
+ break;
+
+ case GF_FOP_FXATTROP:
+ STACK_WIND(frame, dht_common_xattrop_cbk, subvol,
+ subvol->fops->fxattrop, local->fd,
+ local->rebalance.flags, local->rebalance.xattr,
+ local->xattr_req);
+ break;
+
+ case GF_FOP_FGETXATTR:
+ STACK_WIND(frame, dht_getxattr_cbk, subvol, subvol->fops->fgetxattr,
+ local->fd, local->key, NULL);
+ break;
+
+ case GF_FOP_FINODELK:
+ STACK_WIND(frame, dht_finodelk_cbk, subvol, subvol->fops->finodelk,
+ local->key, local->fd, local->rebalance.lock_cmd,
+ &local->rebalance.flock, local->xattr_req);
+ break;
+ default:
+ gf_smsg(this->name, GF_LOG_ERROR, 0, DHT_MSG_UNKNOWN_FOP, "fd=%p",
+ fd, "gfid=%s", uuid_utoa(fd->inode->gfid), "name=%s",
+ subvol->name, NULL);
+ break;
+ }
+
+ goto out;
+
+ /* Could not open the fd on the dst. Unwind */
+
+handle_err:
+
+ switch (fop) {
+ case GF_FOP_WRITE:
+ DHT_STACK_UNWIND(writev, frame, -1, op_errno, NULL, NULL, NULL);
+ break;
+
+ case GF_FOP_FLUSH:
+ DHT_STACK_UNWIND(flush, frame, -1, op_errno, NULL);
+ break;
+
+ case GF_FOP_FSETATTR:
+ DHT_STACK_UNWIND(fsetattr, frame, -1, op_errno, NULL, NULL, NULL);
+ break;
+
+ case GF_FOP_ZEROFILL:
+ DHT_STACK_UNWIND(zerofill, frame, -1, op_errno, NULL, NULL, NULL);
+ break;
+
+ case GF_FOP_DISCARD:
+ DHT_STACK_UNWIND(discard, frame, -1, op_errno, NULL, NULL, NULL);
+ break;
+
+ case GF_FOP_FALLOCATE:
+ DHT_STACK_UNWIND(fallocate, frame, -1, op_errno, NULL, NULL, NULL);
+ break;
+
+ case GF_FOP_FTRUNCATE:
+ DHT_STACK_UNWIND(ftruncate, frame, -1, op_errno, NULL, NULL, NULL);
+ break;
+
+ case GF_FOP_FSYNC:
+ DHT_STACK_UNWIND(fsync, frame, -1, op_errno, NULL, NULL, NULL);
+ break;
+
+ case GF_FOP_READ:
+ DHT_STACK_UNWIND(readv, frame, -1, op_errno, NULL, 0, NULL, NULL,
+ NULL);
+ break;
+
+ case GF_FOP_FSTAT:
+ DHT_STACK_UNWIND(fstat, frame, -1, op_errno, NULL, NULL);
+ break;
+
+ case GF_FOP_FSETXATTR:
+ DHT_STACK_UNWIND(fsetxattr, frame, -1, op_errno, NULL);
+ break;
+
+ case GF_FOP_FREMOVEXATTR:
+ DHT_STACK_UNWIND(fremovexattr, frame, -1, op_errno, NULL);
+ break;
+
+ case GF_FOP_FXATTROP:
+ DHT_STACK_UNWIND(fxattrop, frame, -1, op_errno, NULL, NULL);
+ break;
+
+ case GF_FOP_FGETXATTR:
+ DHT_STACK_UNWIND(fgetxattr, frame, -1, op_errno, NULL, NULL);
+ break;
+
+ case GF_FOP_FINODELK:
+ DHT_STACK_UNWIND(finodelk, frame, -1, op_errno, NULL);
+ break;
+
+ default:
+ gf_smsg(this->name, GF_LOG_ERROR, 0, DHT_MSG_UNKNOWN_FOP, "fd=%p",
+ fd, "gfid=%s", uuid_utoa(fd->inode->gfid), "name=%s",
+ subvol->name, NULL);
+ break;
+ }
+
+out:
+
+ return 0;
}
+/* Check once again if the fd has been opened on the cached subvol.
+ * If not, open and update the fd_ctx.
+ */
+
int
-dht_frame_return (call_frame_t *frame)
-{
- dht_local_t *local = NULL;
- int this_call_cnt = -1;
+dht_check_and_open_fd_on_subvol_task(void *data)
+{
+ loc_t loc = {
+ 0,
+ };
+ int ret = -1;
+ call_frame_t *frame = NULL;
+ dht_local_t *local = NULL;
+ fd_t *fd = NULL;
+ xlator_t *this = NULL;
+ xlator_t *subvol = NULL;
+
+ frame = data;
+ local = frame->local;
+ this = THIS;
+ fd = local->fd;
+ subvol = local->cached_subvol;
+
+ local->fd_checked = _gf_true;
+
+ if (fd_is_anonymous(fd) || dht_fd_open_on_dst(this, fd, subvol)) {
+ ret = 0;
+ goto out;
+ }
- if (!frame)
- return -1;
+ gf_msg_debug(this->name, 0, "Opening fd (%p, flags=0%o) on file %s @ %s",
+ fd, fd->flags, uuid_utoa(fd->inode->gfid), subvol->name);
- local = frame->local;
+ loc.inode = inode_ref(fd->inode);
+ gf_uuid_copy(loc.gfid, fd->inode->gfid);
- LOCK (&frame->lock);
- {
- this_call_cnt = --local->call_cnt;
- }
- UNLOCK (&frame->lock);
+ /* Open this on the dst subvol */
- return this_call_cnt;
-}
+ SYNCTASK_SETID(0, 0);
+ ret = syncop_open(subvol, &loc, (fd->flags & ~(O_CREAT | O_EXCL | O_TRUNC)),
+ fd, NULL, NULL);
-int
-dht_filter_loc_subvol_key (xlator_t *this, loc_t *loc, loc_t *new_loc,
- xlator_t **subvol)
-{
- char *new_name = NULL;
- char *new_path = NULL;
- xlator_list_t *trav = NULL;
- char key[1024] = {0,};
- int ret = 0; /* not found */
-
- /* Why do other tasks if first required 'char' itself is not there */
- if (!new_loc || !loc || !loc->name || !strchr (loc->name, '@'))
- goto out;
+ if (ret < 0) {
+ gf_smsg(this->name, GF_LOG_ERROR, -ret, DHT_MSG_OPEN_FD_ON_DST_FAILED,
+ "fd=%p", fd, "flags=0%o", fd->flags, "gfid=%s",
+ uuid_utoa(fd->inode->gfid), "name=%s", subvol->name, NULL);
+ /* This can happen if the cached subvol was updated in the
+ * inode_ctx and the fd was opened on the new cached suvol
+ * after this fop was wound on the old cached subvol.
+ * As we do not close the fd on the old subvol (a leak)
+ * don't treat ENOENT as an error and allow the phase1/phase2
+ * checks to handle it.
+ */
- trav = this->children;
- while (trav) {
- snprintf (key, 1024, "*@%s:%s", this->name, trav->xlator->name);
- if (fnmatch (key, loc->name, FNM_NOESCAPE) == 0) {
- new_name = GF_CALLOC(strlen (loc->name),
- sizeof (char),
- gf_common_mt_char);
- if (!new_name)
- goto out;
- if (fnmatch (key, loc->path, FNM_NOESCAPE) == 0) {
- new_path = GF_CALLOC(strlen (loc->path),
- sizeof (char),
- gf_common_mt_char);
- if (!new_path)
- goto out;
- strncpy (new_path, loc->path, (strlen (loc->path) -
- strlen (key) + 1));
- }
- strncpy (new_name, loc->name, (strlen (loc->name) -
- strlen (key) + 1));
-
- if (new_loc) {
- new_loc->path = ((new_path) ? new_path:
- gf_strdup (loc->path));
- new_loc->name = new_name;
- new_loc->inode = inode_ref (loc->inode);
- new_loc->parent = inode_ref (loc->parent);
- }
- *subvol = trav->xlator;
- ret = 1; /* success */
- goto out;
- }
- trav = trav->next;
- }
-out:
- if (!ret) {
- /* !success */
- GF_FREE (new_path);
- GF_FREE (new_name);
+ if ((-ret != ENOENT) && (-ret != ESTALE)) {
+ local->op_errno = -ret;
+ ret = -1;
+ } else {
+ ret = 0;
}
- return ret;
-}
-static xlator_t *
-dht_get_subvol_from_id(xlator_t *this, int client_id)
-{
- xlator_t *xl = NULL;
- dht_conf_t *conf = NULL;
- char sid[6] = { 0 };
+ local->op_errno = -ret;
+ ret = -1;
- conf = this->private;
+ } else {
+ dht_fd_ctx_set(this, fd, subvol);
+ }
- sprintf(sid, "%d", client_id);
- if (dict_get_ptr(conf->leaf_to_subvol, sid, (void **) &xl))
- xl = NULL;
+ SYNCTASK_SETID(frame->root->uid, frame->root->gid);
+out:
+ loc_wipe(&loc);
- return xl;
+ return ret;
}
int
-dht_deitransform (xlator_t *this, uint64_t y, xlator_t **subvol_p)
+dht_check_and_open_fd_on_subvol(xlator_t *this, call_frame_t *frame)
{
- int client_id = 0;
- xlator_t *subvol = 0;
- dht_conf_t *conf = NULL;
-
- if (!this->private)
- return -1;
+ int ret = -1;
+ dht_local_t *local = NULL;
- conf = this->private;
+ /*
+ if (dht_fd_open_on_dst (this, fd, subvol))
+ goto out;
+ */
+ local = frame->local;
- client_id = gf_deitransform(this, y);
+ ret = synctask_new(this->ctx->env, dht_check_and_open_fd_on_subvol_task,
+ dht_check_and_open_fd_on_subvol_complete, frame, frame);
- subvol = dht_get_subvol_from_id(this, client_id);
+ if (ret) {
+ gf_smsg(this->name, GF_LOG_ERROR, 0, DHT_MSG_SYNCTASK_CREATE_FAILED,
+ "to-check-and-open fd=%p", local->fd, NULL);
+ }
- if (!subvol)
- subvol = conf->subvolumes[0];
-
- if (subvol_p)
- *subvol_p = subvol;
-
- return 0;
+ return ret;
}
-char *
-dht_lock_asprintf (dht_lock_t *lock)
+int
+dht_frame_return(call_frame_t *frame)
{
- char *lk_buf = NULL;
- char gfid[GF_UUID_BUF_SIZE] = {0, };
+ dht_local_t *local = NULL;
+ int this_call_cnt = -1;
- if (lock == NULL)
- goto out;
+ if (!frame)
+ return -1;
- uuid_utoa_r (lock->loc.gfid, gfid);
+ local = frame->local;
- gf_asprintf (&lk_buf, "%s:%s", lock->xl->name, gfid);
+ LOCK(&frame->lock);
+ {
+ this_call_cnt = --local->call_cnt;
+ }
+ UNLOCK(&frame->lock);
-out:
- return lk_buf;
+ return this_call_cnt;
}
-void
-dht_log_lk_array (char *name, gf_loglevel_t log_level, dht_lock_t **lk_array,
- int count)
-{
- int i = 0;
- char *lk_buf = NULL;
+/*
+ * Use this function to specify which subvol you want the file created
+ * on - this need not be the hashed subvol.
+ * Format: <filename>@<this->name>:<subvol-name>
+ * Eg: file-1@vol1-dht:vol1-client-0
+ * where vol1 is a pure distribute volume
+ * will create file-1 on vol1-client-0
+ */
- if ((lk_array == NULL) || (count == 0))
+int
+dht_filter_loc_subvol_key(xlator_t *this, loc_t *loc, loc_t *new_loc,
+ xlator_t **subvol)
+{
+ char *new_name = NULL;
+ char *new_path = NULL;
+ xlator_list_t *trav = NULL;
+ char key[1024] = {
+ 0,
+ };
+ int ret = 0; /* not found */
+ int keylen = 0;
+ int name_len = 0;
+ int path_len = 0;
+
+ /* Why do other tasks if first required 'char' itself is not there */
+ if (!new_loc || !loc || !loc->name || !strchr(loc->name, '@')) {
+ /* Skip the GF_FREE checks here */
+ return ret;
+ }
+
+ trav = this->children;
+ while (trav) {
+ keylen = snprintf(key, sizeof(key), "*@%s:%s", this->name,
+ trav->xlator->name);
+ /* Ignore '*' */
+ keylen = keylen - 1;
+ if (fnmatch(key, loc->name, FNM_NOESCAPE) == 0) {
+ name_len = strlen(loc->name) - keylen;
+ new_name = GF_MALLOC(name_len + 1, gf_common_mt_char);
+ if (!new_name)
goto out;
-
- for (i = 0; i < count; i++) {
- lk_buf = dht_lock_asprintf (lk_array[i]);
- gf_msg (name, log_level, 0, DHT_MSG_LK_ARRAY_INFO,
- "%d. %s", i, lk_buf);
- GF_FREE (lk_buf);
- }
-
+ if (fnmatch(key, loc->path, FNM_NOESCAPE) == 0) {
+ path_len = strlen(loc->path) - keylen;
+ new_path = GF_MALLOC(path_len + 1, gf_common_mt_char);
+ if (!new_path)
+ goto out;
+ snprintf(new_path, path_len + 1, "%s", loc->path);
+ }
+ snprintf(new_name, name_len + 1, "%s", loc->name);
+
+ if (new_loc) {
+ new_loc->path = ((new_path) ? new_path : gf_strdup(loc->path));
+ new_loc->name = new_name;
+ new_loc->inode = inode_ref(loc->inode);
+ new_loc->parent = inode_ref(loc->parent);
+ }
+ *subvol = trav->xlator;
+ ret = 1; /* success */
+ goto out;
+ }
+ trav = trav->next;
+ }
out:
- return;
+ if (!ret) {
+ /* !success */
+ GF_FREE(new_path);
+ GF_FREE(new_name);
+ }
+ return ret;
}
-void
-dht_lock_stack_destroy (call_frame_t *lock_frame)
+static xlator_t *
+dht_get_subvol_from_id(xlator_t *this, int client_id)
{
- dht_local_t *local = NULL;
+ xlator_t *xl = NULL;
+ dht_conf_t *conf = NULL;
+ char *sid = NULL;
+ int32_t ret = -1;
- local = lock_frame->local;
+ conf = this->private;
- local->lock.locks = NULL;
- local->lock.lk_count = 0;
+ ret = gf_asprintf(&sid, "%d", client_id);
+ if (ret == -1) {
+ gf_smsg(this->name, GF_LOG_ERROR, 0, DHT_MSG_ASPRINTF_FAILED, NULL);
+ goto out;
+ }
- DHT_STACK_DESTROY (lock_frame);
- return;
-}
-
-void
-dht_lock_free (dht_lock_t *lock)
-{
- if (lock == NULL)
- goto out;
+ if (dict_get_ptr(conf->leaf_to_subvol, sid, (void **)&xl))
+ xl = NULL;
- loc_wipe (&lock->loc);
- GF_FREE (lock->domain);
- mem_put (lock);
+ GF_FREE(sid);
out:
- return;
+ return xl;
}
-void
-dht_lock_array_free (dht_lock_t **lk_array, int count)
+int
+dht_deitransform(xlator_t *this, uint64_t y, xlator_t **subvol_p)
{
- int i = 0;
- dht_lock_t *lock = NULL;
-
- if (lk_array == NULL)
- goto out;
-
- for (i = 0; i < count; i++) {
- lock = lk_array[i];
- lk_array[i] = NULL;
- dht_lock_free (lock);
- }
+ int client_id = 0;
+ xlator_t *subvol = 0;
+ dht_conf_t *conf = NULL;
-out:
- return;
-}
+ if (!this->private)
+ return -1;
-dht_lock_t *
-dht_lock_new (xlator_t *this, xlator_t *xl, loc_t *loc, short type,
- const char *domain)
-{
- dht_conf_t *conf = NULL;
- dht_lock_t *lock = NULL;
+ conf = this->private;
- conf = this->private;
+ client_id = gf_deitransform(this, y);
- lock = mem_get0 (conf->lock_pool);
- if (lock == NULL)
- goto out;
+ subvol = dht_get_subvol_from_id(this, client_id);
- lock->xl = xl;
- lock->type = type;
- lock->domain = gf_strdup (domain);
- if (lock->domain == NULL) {
- dht_lock_free (lock);
- lock = NULL;
- goto out;
- }
+ if (!subvol)
+ subvol = conf->subvolumes[0];
- /* Fill only inode and gfid.
- posix and protocol/server give preference to pargfid/basename over
- gfid/inode for resolution if all the three parameters of loc_t are
- present. I want to avoid the following hypothetical situation:
-
- 1. rebalance did a lookup on a dentry and got a gfid.
- 2. rebalance acquires lock on loc_t which was filled with gfid and
- path (pargfid/bname) from step 1.
- 3. somebody deleted and recreated the same file
- 4. rename on the same path acquires lock on loc_t which now points
- to a different inode (and hence gets the lock).
- 5. rebalance continues to migrate file (note that not all fops done
- by rebalance during migration are inode/gfid based Eg., unlink)
- 6. rename continues.
- */
- lock->loc.inode = inode_ref (loc->inode);
- loc_gfid (loc, lock->loc.gfid);
+ if (subvol_p)
+ *subvol_p = subvol;
-out:
- return lock;
+ return 0;
}
-int
-dht_local_lock_init (call_frame_t *frame, dht_lock_t **lk_array,
- int lk_count, fop_inodelk_cbk_t inodelk_cbk)
+void
+dht_local_wipe(xlator_t *this, dht_local_t *local)
{
- int ret = -1;
- dht_local_t *local = NULL;
-
- local = frame->local;
-
- if (local == NULL) {
- local = dht_local_init (frame, NULL, NULL, 0);
- }
+ int i = 0;
- if (local == NULL) {
- goto out;
- }
+ if (!local)
+ return;
- local->lock.inodelk_cbk = inodelk_cbk;
- local->lock.locks = lk_array;
- local->lock.lk_count = lk_count;
+ loc_wipe(&local->loc);
+ loc_wipe(&local->loc2);
+ loc_wipe(&local->loc2_copy);
- ret = dht_lock_order_requests (local->lock.locks,
- local->lock.lk_count);
- if (ret < 0)
- goto out;
+ if (local->xattr)
+ dict_unref(local->xattr);
- ret = 0;
-out:
- return ret;
-}
+ if (local->inode)
+ inode_unref(local->inode);
-void
-dht_local_wipe (xlator_t *this, dht_local_t *local)
-{
- if (!local)
- return;
+ if (local->layout) {
+ dht_layout_unref(this, local->layout);
+ local->layout = NULL;
+ }
- loc_wipe (&local->loc);
- loc_wipe (&local->loc2);
+ loc_wipe(&local->linkfile.loc);
- if (local->xattr)
- dict_unref (local->xattr);
+ if (local->linkfile.xattr)
+ dict_unref(local->linkfile.xattr);
- if (local->inode)
- inode_unref (local->inode);
+ if (local->linkfile.inode)
+ inode_unref(local->linkfile.inode);
- if (local->layout) {
- dht_layout_unref (this, local->layout);
- local->layout = NULL;
- }
+ if (local->fd) {
+ fd_unref(local->fd);
+ local->fd = NULL;
+ }
- loc_wipe (&local->linkfile.loc);
+ if (local->params) {
+ dict_unref(local->params);
+ local->params = NULL;
+ }
- if (local->linkfile.xattr)
- dict_unref (local->linkfile.xattr);
+ if (local->xattr_req)
+ dict_unref(local->xattr_req);
+ if (local->mds_xattr)
+ dict_unref(local->mds_xattr);
+ if (local->xdata)
+ dict_unref(local->xdata);
- if (local->linkfile.inode)
- inode_unref (local->linkfile.inode);
+ if (local->selfheal.layout) {
+ dht_layout_unref(this, local->selfheal.layout);
+ local->selfheal.layout = NULL;
+ }
- if (local->fd) {
- fd_unref (local->fd);
- local->fd = NULL;
- }
+ if (local->selfheal.refreshed_layout) {
+ dht_layout_unref(this, local->selfheal.refreshed_layout);
+ local->selfheal.refreshed_layout = NULL;
+ }
- if (local->params) {
- dict_unref (local->params);
- local->params = NULL;
- }
+ for (i = 0; i < 2; i++) {
+ dht_lock_array_free(local->lock[i].ns.parent_layout.locks,
+ local->lock[i].ns.parent_layout.lk_count);
- if (local->xattr_req)
- dict_unref (local->xattr_req);
+ GF_FREE(local->lock[i].ns.parent_layout.locks);
- if (local->selfheal.layout) {
- dht_layout_unref (this, local->selfheal.layout);
- local->selfheal.layout = NULL;
- }
+ dht_lock_array_free(local->lock[i].ns.directory_ns.locks,
+ local->lock[i].ns.directory_ns.lk_count);
+ GF_FREE(local->lock[i].ns.directory_ns.locks);
+ }
- if (local->selfheal.refreshed_layout) {
- dht_layout_unref (this, local->selfheal.refreshed_layout);
- local->selfheal.refreshed_layout = NULL;
- }
+ GF_FREE(local->key);
- dht_lock_array_free (local->lock.locks, local->lock.lk_count);
- GF_FREE (local->lock.locks);
+ if (local->rebalance.xdata)
+ dict_unref(local->rebalance.xdata);
- GF_FREE (local->newpath);
+ if (local->rebalance.xattr)
+ dict_unref(local->rebalance.xattr);
- GF_FREE (local->key);
+ if (local->rebalance.dict)
+ dict_unref(local->rebalance.dict);
- if (local->rebalance.xdata)
- dict_unref (local->rebalance.xdata);
+ GF_FREE(local->rebalance.vector);
- if (local->rebalance.xattr)
- dict_unref (local->rebalance.xattr);
+ if (local->rebalance.iobref)
+ iobref_unref(local->rebalance.iobref);
- GF_FREE (local->rebalance.vector);
+ if (local->stub) {
+ call_stub_destroy(local->stub);
+ local->stub = NULL;
+ }
- if (local->rebalance.iobref)
- iobref_unref (local->rebalance.iobref);
+ if (local->ret_cache)
+ GF_FREE(local->ret_cache);
- mem_put (local);
+ mem_put(local);
}
-
dht_local_t *
-dht_local_init (call_frame_t *frame, loc_t *loc, fd_t *fd, glusterfs_fop_t fop)
+dht_local_init(call_frame_t *frame, loc_t *loc, fd_t *fd, glusterfs_fop_t fop)
{
- dht_local_t *local = NULL;
- inode_t *inode = NULL;
- int ret = 0;
+ dht_local_t *local = NULL;
+ inode_t *inode = NULL;
+ int ret = 0;
- local = mem_get0 (THIS->local_pool);
- if (!local)
- goto out;
+ local = mem_get0(THIS->local_pool);
+ if (!local)
+ goto out;
- if (loc) {
- ret = loc_copy (&local->loc, loc);
- if (ret)
- goto out;
+ if (loc) {
+ ret = loc_copy(&local->loc, loc);
+ if (ret)
+ goto out;
- inode = loc->inode;
- }
+ inode = loc->inode;
+ }
- if (fd) {
- local->fd = fd_ref (fd);
- if (!inode)
- inode = fd->inode;
- }
+ if (fd) {
+ local->fd = fd_ref(fd);
+ if (!inode)
+ inode = fd->inode;
+ }
- local->op_ret = -1;
- local->op_errno = EUCLEAN;
- local->fop = fop;
+ local->op_ret = -1;
+ local->op_errno = EUCLEAN;
+ local->fop = fop;
- if (inode) {
- local->layout = dht_layout_get (frame->this, inode);
- local->cached_subvol = dht_subvol_get_cached (frame->this,
- inode);
- }
+ if (inode) {
+ local->layout = dht_layout_get(frame->this, inode);
+ local->cached_subvol = dht_subvol_get_cached(frame->this, inode);
+ }
- frame->local = local;
+ frame->local = local;
out:
- if (ret) {
- if (local)
- mem_put (local);
- local = NULL;
- }
- return local;
+ if (ret) {
+ if (local)
+ mem_put(local);
+ local = NULL;
+ }
+ return local;
}
xlator_t *
-dht_first_up_subvol (xlator_t *this)
+dht_first_up_subvol(xlator_t *this)
{
- dht_conf_t *conf = NULL;
- xlator_t *child = NULL;
- int i = 0;
- time_t time = 0;
+ dht_conf_t *conf = NULL;
+ xlator_t *child = NULL;
+ int i = 0;
+ time_t time = 0;
- conf = this->private;
- if (!conf)
- goto out;
+ conf = this->private;
+ if (!conf)
+ goto out;
- LOCK (&conf->subvolume_lock);
- {
- for (i = 0; i < conf->subvolume_cnt; i++) {
- if (conf->subvol_up_time[i]) {
- if (!time) {
- time = conf->subvol_up_time[i];
- child = conf->subvolumes[i];
- } else if (time > conf->subvol_up_time[i]) {
- time = conf->subvol_up_time[i];
- child = conf->subvolumes[i];
- }
- }
+ LOCK(&conf->subvolume_lock);
+ {
+ for (i = 0; i < conf->subvolume_cnt; i++) {
+ if (conf->subvol_up_time[i]) {
+ if (!time) {
+ time = conf->subvol_up_time[i];
+ child = conf->subvolumes[i];
+ } else if (time > conf->subvol_up_time[i]) {
+ time = conf->subvol_up_time[i];
+ child = conf->subvolumes[i];
}
+ }
}
- UNLOCK (&conf->subvolume_lock);
+ }
+ UNLOCK(&conf->subvolume_lock);
out:
- return child;
+ return child;
}
xlator_t *
-dht_last_up_subvol (xlator_t *this)
+dht_last_up_subvol(xlator_t *this)
{
- dht_conf_t *conf = NULL;
- xlator_t *child = NULL;
- int i = 0;
+ dht_conf_t *conf = NULL;
+ xlator_t *child = NULL;
+ int i = 0;
- conf = this->private;
- if (!conf)
- goto out;
+ conf = this->private;
+ if (!conf)
+ goto out;
- LOCK (&conf->subvolume_lock);
- {
- for (i = conf->subvolume_cnt-1; i >= 0; i--) {
- if (conf->subvolume_status[i]) {
- child = conf->subvolumes[i];
- break;
- }
- }
+ LOCK(&conf->subvolume_lock);
+ {
+ for (i = conf->subvolume_cnt - 1; i >= 0; i--) {
+ if (conf->subvolume_status[i]) {
+ child = conf->subvolumes[i];
+ break;
+ }
}
- UNLOCK (&conf->subvolume_lock);
+ }
+ UNLOCK(&conf->subvolume_lock);
out:
- return child;
+ return child;
}
xlator_t *
-dht_subvol_get_hashed (xlator_t *this, loc_t *loc)
+dht_subvol_get_hashed(xlator_t *this, loc_t *loc)
{
- dht_layout_t *layout = NULL;
- xlator_t *subvol = NULL;
- dht_conf_t *conf = NULL;
- dht_methods_t *methods = NULL;
+ dht_layout_t *layout = NULL;
+ xlator_t *subvol = NULL;
+ dht_conf_t *conf = NULL;
+ dht_methods_t *methods = NULL;
- GF_VALIDATE_OR_GOTO ("dht", this, out);
- GF_VALIDATE_OR_GOTO (this->name, loc, out);
+ GF_VALIDATE_OR_GOTO("dht", this, out);
+ GF_VALIDATE_OR_GOTO(this->name, loc, out);
- conf = this->private;
- GF_VALIDATE_OR_GOTO (this->name, conf, out);
+ conf = this->private;
+ GF_VALIDATE_OR_GOTO(this->name, conf, out);
- methods = &(conf->methods);
+ methods = &(conf->methods);
- if (__is_root_gfid (loc->gfid)) {
- subvol = dht_first_up_subvol (this);
- goto out;
- }
+ if (__is_root_gfid(loc->gfid)) {
+ subvol = dht_first_up_subvol(this);
+ goto out;
+ }
- GF_VALIDATE_OR_GOTO (this->name, loc->parent, out);
- GF_VALIDATE_OR_GOTO (this->name, loc->name, out);
+ GF_VALIDATE_OR_GOTO(this->name, loc->parent, out);
+ GF_VALIDATE_OR_GOTO(this->name, loc->name, out);
- layout = dht_layout_get (this, loc->parent);
+ layout = dht_layout_get(this, loc->parent);
- if (!layout) {
- gf_msg_debug (this->name, 0,
- "Missing layout. path=%s, parent gfid =%s",
- loc->path, uuid_utoa (loc->parent->gfid));
- goto out;
- }
+ if (!layout) {
+ gf_msg_debug(this->name, 0, "Missing layout. path=%s, parent gfid =%s",
+ loc->path, uuid_utoa(loc->parent->gfid));
+ goto out;
+ }
- subvol = methods->layout_search (this, layout, loc->name);
+ subvol = methods->layout_search(this, layout, loc->name);
- if (!subvol) {
- gf_msg_debug (this->name, 0,
- "No hashed subvolume for path=%s",
- loc->path);
- goto out;
- }
+ if (!subvol) {
+ gf_msg_debug(this->name, 0, "No hashed subvolume for path=%s",
+ loc->path);
+ goto out;
+ }
out:
- if (layout) {
- dht_layout_unref (this, layout);
- }
+ if (layout) {
+ dht_layout_unref(this, layout);
+ }
- return subvol;
+ return subvol;
}
-
xlator_t *
-dht_subvol_get_cached (xlator_t *this, inode_t *inode)
+dht_subvol_get_cached(xlator_t *this, inode_t *inode)
{
- dht_layout_t *layout = NULL;
- xlator_t *subvol = NULL;
+ dht_layout_t *layout = NULL;
+ xlator_t *subvol = NULL;
- GF_VALIDATE_OR_GOTO (this->name, this, out);
- GF_VALIDATE_OR_GOTO (this->name, inode, out);
+ GF_VALIDATE_OR_GOTO("dht", this, out);
+ GF_VALIDATE_OR_GOTO(this->name, inode, out);
- layout = dht_layout_get (this, inode);
+ layout = dht_layout_get(this, inode);
- if (!layout) {
- goto out;
- }
+ if (!layout) {
+ goto out;
+ }
- subvol = layout->list[0].xlator;
+ subvol = layout->list[0].xlator;
out:
- if (layout) {
- dht_layout_unref (this, layout);
- }
+ if (layout) {
+ dht_layout_unref(this, layout);
+ }
- return subvol;
+ return subvol;
}
-
xlator_t *
-dht_subvol_next (xlator_t *this, xlator_t *prev)
+dht_subvol_next(xlator_t *this, xlator_t *prev)
{
- dht_conf_t *conf = NULL;
- int i = 0;
- xlator_t *next = NULL;
+ dht_conf_t *conf = NULL;
+ int i = 0;
+ xlator_t *next = NULL;
- conf = this->private;
- if (!conf)
- goto out;
+ conf = this->private;
+ if (!conf)
+ goto out;
- for (i = 0; i < conf->subvolume_cnt; i++) {
- if (conf->subvolumes[i] == prev) {
- if ((i + 1) < conf->subvolume_cnt)
- next = conf->subvolumes[i + 1];
- break;
- }
+ for (i = 0; i < conf->subvolume_cnt; i++) {
+ if (conf->subvolumes[i] == prev) {
+ if ((i + 1) < conf->subvolume_cnt)
+ next = conf->subvolumes[i + 1];
+ break;
}
+ }
out:
- return next;
+ return next;
}
/* This func wraps around, if prev is actually the last subvol.
*/
xlator_t *
-dht_subvol_next_available (xlator_t *this, xlator_t *prev)
-{
- dht_conf_t *conf = NULL;
- int i = 0;
- xlator_t *next = NULL;
-
- conf = this->private;
- if (!conf)
- goto out;
-
- for (i = 0; i < conf->subvolume_cnt; i++) {
- if (conf->subvolumes[i] == prev) {
- /* if prev is last in conf->subvolumes, then wrap
- * around.
- */
- if ((i + 1) < conf->subvolume_cnt) {
- next = conf->subvolumes[i + 1];
- } else {
- next = conf->subvolumes[0];
- }
- break;
- }
- }
+dht_subvol_next_available(xlator_t *this, xlator_t *prev)
+{
+ dht_conf_t *conf = NULL;
+ int i = 0;
+ xlator_t *next = NULL;
+
+ conf = this->private;
+ if (!conf)
+ goto out;
+
+ for (i = 0; i < conf->subvolume_cnt; i++) {
+ if (conf->subvolumes[i] == prev) {
+ /* if prev is last in conf->subvolumes, then wrap
+ * around.
+ */
+ if ((i + 1) < conf->subvolume_cnt) {
+ next = conf->subvolumes[i + 1];
+ } else {
+ next = conf->subvolumes[0];
+ }
+ break;
+ }
+ }
out:
- return next;
+ return next;
}
int
-dht_subvol_cnt (xlator_t *this, xlator_t *subvol)
+dht_subvol_cnt(xlator_t *this, xlator_t *subvol)
{
- int i = 0;
- int ret = -1;
- dht_conf_t *conf = NULL;
+ int i = 0;
+ int ret = -1;
+ dht_conf_t *conf = NULL;
- conf = this->private;
- if (!conf)
- goto out;
+ conf = this->private;
+ if (!conf)
+ goto out;
- for (i = 0; i < conf->subvolume_cnt; i++) {
- if (subvol == conf->subvolumes[i]) {
- ret = i;
- break;
- }
+ for (i = 0; i < conf->subvolume_cnt; i++) {
+ if (subvol == conf->subvolumes[i]) {
+ ret = i;
+ break;
}
+ }
out:
- return ret;
+ return ret;
}
+#define set_if_greater(a, b) \
+ do { \
+ if ((a) < (b)) \
+ (a) = (b); \
+ } while (0)
-#define set_if_greater(a, b) do { \
- if ((a) < (b)) \
- (a) = (b); \
- } while (0)
-
-
-#define set_if_greater_time(a, an, b, bn) do { \
- if (((a) < (b)) || (((a) == (b)) && ((an) < (bn)))){ \
- (a) = (b); \
- (an) = (bn); \
- } \
- } while (0) \
-
+#define set_if_greater_time(a, an, b, bn) \
+ do { \
+ if (((a) < (b)) || (((a) == (b)) && ((an) < (bn)))) { \
+ (a) = (b); \
+ (an) = (bn); \
+ } \
+ } while (0)
int
-dht_iatt_merge (xlator_t *this, struct iatt *to,
- struct iatt *from, xlator_t *subvol)
+dht_iatt_merge(xlator_t *this, struct iatt *to, struct iatt *from)
{
- if (!from || !to)
- return 0;
+ if (!from || !to)
+ return 0;
- to->ia_dev = from->ia_dev;
+ to->ia_dev = from->ia_dev;
- gf_uuid_copy (to->ia_gfid, from->ia_gfid);
+ gf_uuid_copy(to->ia_gfid, from->ia_gfid);
- to->ia_ino = from->ia_ino;
- to->ia_prot = from->ia_prot;
- to->ia_type = from->ia_type;
- to->ia_nlink = from->ia_nlink;
- to->ia_rdev = from->ia_rdev;
- to->ia_size += from->ia_size;
- to->ia_blksize = from->ia_blksize;
- to->ia_blocks += from->ia_blocks;
+ to->ia_ino = from->ia_ino;
+ to->ia_prot = from->ia_prot;
+ to->ia_type = from->ia_type;
+ to->ia_nlink = from->ia_nlink;
+ to->ia_rdev = from->ia_rdev;
+ to->ia_size += from->ia_size;
+ to->ia_blksize = from->ia_blksize;
+ to->ia_blocks += from->ia_blocks;
- set_if_greater (to->ia_uid, from->ia_uid);
- set_if_greater (to->ia_gid, from->ia_gid);
+ if (IA_ISDIR(from->ia_type)) {
+ to->ia_blocks = DHT_DIR_STAT_BLOCKS;
+ to->ia_size = DHT_DIR_STAT_SIZE;
+ }
+ set_if_greater(to->ia_uid, from->ia_uid);
+ set_if_greater(to->ia_gid, from->ia_gid);
- set_if_greater_time(to->ia_atime, to->ia_atime_nsec,
- from->ia_atime, from->ia_atime_nsec);
- set_if_greater_time (to->ia_mtime, to->ia_mtime_nsec,
- from->ia_mtime, from->ia_mtime_nsec);
- set_if_greater_time (to->ia_ctime, to->ia_ctime_nsec,
- from->ia_ctime, from->ia_ctime_nsec);
+ set_if_greater_time(to->ia_atime, to->ia_atime_nsec, from->ia_atime,
+ from->ia_atime_nsec);
+ set_if_greater_time(to->ia_mtime, to->ia_mtime_nsec, from->ia_mtime,
+ from->ia_mtime_nsec);
+ set_if_greater_time(to->ia_ctime, to->ia_ctime_nsec, from->ia_ctime,
+ from->ia_ctime_nsec);
- return 0;
+ return 0;
}
int
-dht_build_child_loc (xlator_t *this, loc_t *child, loc_t *parent, char *name)
+dht_build_child_loc(xlator_t *this, loc_t *child, loc_t *parent, char *name)
{
- if (!child) {
- goto err;
- }
+ if (!child) {
+ goto err;
+ }
- if (strcmp (parent->path, "/") == 0)
- gf_asprintf ((char **)&child->path, "/%s", name);
- else
- gf_asprintf ((char **)&child->path, "%s/%s", parent->path, name);
+ if (strcmp(parent->path, "/") == 0)
+ gf_asprintf((char **)&child->path, "/%s", name);
+ else
+ gf_asprintf((char **)&child->path, "%s/%s", parent->path, name);
- if (!child->path) {
- goto err;
- }
+ if (!child->path) {
+ goto err;
+ }
- child->name = strrchr (child->path, '/');
- if (child->name)
- child->name++;
+ child->name = strrchr(child->path, '/');
+ if (child->name)
+ child->name++;
- child->parent = inode_ref (parent->inode);
- child->inode = inode_new (parent->inode->table);
+ child->parent = inode_ref(parent->inode);
+ child->inode = inode_new(parent->inode->table);
- if (!child->inode) {
- goto err;
- }
+ if (!child->inode) {
+ goto err;
+ }
- return 0;
+ return 0;
err:
- loc_wipe (child);
- return -1;
+ if (child) {
+ loc_wipe(child);
+ }
+ return -1;
}
int
-dht_init_local_subvolumes (xlator_t *this, dht_conf_t *conf)
+dht_init_local_subvolumes(xlator_t *this, dht_conf_t *conf)
{
- xlator_list_t *subvols = NULL;
- int cnt = 0;
+ xlator_list_t *subvols = NULL;
+ int cnt = 0;
- if (!conf)
- return -1;
+ if (!conf)
+ return -1;
- for (subvols = this->children; subvols; subvols = subvols->next)
- cnt++;
+ for (subvols = this->children; subvols; subvols = subvols->next)
+ cnt++;
- conf->local_subvols = GF_CALLOC (cnt, sizeof (xlator_t *),
- gf_dht_mt_xlator_t);
- if (!conf->local_subvols) {
- return -1;
- }
+ conf->local_subvols = GF_CALLOC(cnt, sizeof(xlator_t *),
+ gf_dht_mt_xlator_t);
- conf->local_subvols_cnt = 0;
+ /* FIX FIX : do this dynamically*/
+ conf->local_nodeuuids = GF_CALLOC(cnt, sizeof(subvol_nodeuuids_info_t),
+ gf_dht_nodeuuids_t);
- return 0;
+ if (!conf->local_subvols || !conf->local_nodeuuids) {
+ return -1;
+ }
+
+ conf->local_subvols_cnt = 0;
+
+ return 0;
}
int
-dht_init_subvolumes (xlator_t *this, dht_conf_t *conf)
+dht_init_subvolumes(xlator_t *this, dht_conf_t *conf)
{
- xlator_list_t *subvols = NULL;
- int cnt = 0;
+ xlator_list_t *subvols = NULL;
+ int cnt = 0;
- if (!conf)
- return -1;
+ if (!conf)
+ return -1;
- for (subvols = this->children; subvols; subvols = subvols->next)
- cnt++;
+ for (subvols = this->children; subvols; subvols = subvols->next)
+ cnt++;
- conf->subvolumes = GF_CALLOC (cnt, sizeof (xlator_t *),
- gf_dht_mt_xlator_t);
- if (!conf->subvolumes) {
- return -1;
- }
- conf->subvolume_cnt = cnt;
+ conf->subvolumes = GF_CALLOC(cnt, sizeof(xlator_t *), gf_dht_mt_xlator_t);
+ if (!conf->subvolumes) {
+ return -1;
+ }
+ conf->subvolume_cnt = cnt;
+ /* Doesn't make sense to do any dht layer tasks
+ if the subvol count is 1. Set it as pass_through */
+ if (cnt == 1)
+ this->pass_through = _gf_true;
- conf->local_subvols_cnt = 0;
+ conf->local_subvols_cnt = 0;
- dht_set_subvol_range(this);
+ dht_set_subvol_range(this);
- cnt = 0;
- for (subvols = this->children; subvols; subvols = subvols->next)
- conf->subvolumes[cnt++] = subvols->xlator;
+ cnt = 0;
+ for (subvols = this->children; subvols; subvols = subvols->next)
+ conf->subvolumes[cnt++] = subvols->xlator;
- conf->subvolume_status = GF_CALLOC (cnt, sizeof (char),
- gf_dht_mt_char);
- if (!conf->subvolume_status) {
- return -1;
- }
+ conf->subvolume_status = GF_CALLOC(cnt, sizeof(char), gf_dht_mt_char);
+ if (!conf->subvolume_status) {
+ return -1;
+ }
- conf->last_event = GF_CALLOC (cnt, sizeof (int),
- gf_dht_mt_char);
- if (!conf->last_event) {
- return -1;
- }
+ conf->last_event = GF_CALLOC(cnt, sizeof(int), gf_dht_mt_char);
+ if (!conf->last_event) {
+ return -1;
+ }
- conf->subvol_up_time = GF_CALLOC (cnt, sizeof (time_t),
- gf_dht_mt_subvol_time);
- if (!conf->subvol_up_time) {
- return -1;
- }
+ conf->subvol_up_time = GF_CALLOC(cnt, sizeof(time_t),
+ gf_dht_mt_subvol_time);
+ if (!conf->subvol_up_time) {
+ return -1;
+ }
- conf->du_stats = GF_CALLOC (conf->subvolume_cnt, sizeof (dht_du_t),
- gf_dht_mt_dht_du_t);
- if (!conf->du_stats) {
- return -1;
- }
+ conf->du_stats = GF_CALLOC(conf->subvolume_cnt, sizeof(dht_du_t),
+ gf_dht_mt_dht_du_t);
+ if (!conf->du_stats) {
+ return -1;
+ }
- conf->decommissioned_bricks = GF_CALLOC (cnt, sizeof (xlator_t *),
- gf_dht_mt_xlator_t);
- if (!conf->decommissioned_bricks) {
- return -1;
- }
+ conf->decommissioned_bricks = GF_CALLOC(cnt, sizeof(xlator_t *),
+ gf_dht_mt_xlator_t);
+ if (!conf->decommissioned_bricks) {
+ return -1;
+ }
- return 0;
+ return 0;
}
-
/*
op_ret values :
0 : Success.
@@ -1053,239 +1238,274 @@ dht_init_subvolumes (xlator_t *this, dht_conf_t *conf)
*/
static int
-dht_migration_complete_check_done (int op_ret, call_frame_t *frame, void *data)
+dht_migration_complete_check_done(int op_ret, call_frame_t *frame, void *data)
{
- dht_local_t *local = NULL;
- xlator_t *subvol = NULL;
+ dht_local_t *local = NULL;
+ xlator_t *subvol = NULL;
- local = frame->local;
+ local = frame->local;
- if (op_ret != 0)
- goto out;
+ if (op_ret != 0)
+ goto out;
- if (local->cached_subvol == NULL) {
- local->op_errno = EINVAL;
- goto out;
- }
+ if (local->cached_subvol == NULL) {
+ local->op_errno = EINVAL;
+ goto out;
+ }
- subvol = local->cached_subvol;
+ subvol = local->cached_subvol;
out:
- local->rebalance.target_op_fn (THIS, subvol, frame, op_ret);
+ local->rebalance.target_op_fn(THIS, subvol, frame, op_ret);
- return 0;
+ return 0;
}
-
int
-dht_migration_complete_check_task (void *data)
-{
- int ret = -1;
- xlator_t *src_node = NULL;
- xlator_t *dst_node = NULL, *linkto_target = NULL;
- dht_local_t *local = NULL;
- dict_t *dict = NULL;
- struct iatt stbuf = {0,};
- xlator_t *this = NULL;
- call_frame_t *frame = NULL;
- loc_t tmp_loc = {0,};
- char *path = NULL;
- dht_conf_t *conf = NULL;
- inode_t *inode = NULL;
- fd_t *iter_fd = NULL;
- uint64_t tmp_miginfo = 0;
- dht_migrate_info_t *miginfo = NULL;
- int open_failed = 0;
-
- this = THIS;
- frame = data;
- local = frame->local;
- conf = this->private;
-
- src_node = local->cached_subvol;
-
- if (!local->loc.inode && !local->fd) {
- local->op_errno = EINVAL;
- goto out;
- }
-
- inode = (!local->fd) ? local->loc.inode : local->fd->inode;
-
- /* getxattr on cached_subvol for 'linkto' value. Do path based getxattr
- * as root:root. If a fd is already open, access check wont be done*/
-
- if (!local->loc.inode) {
- ret = syncop_fgetxattr (src_node, local->fd, &dict,
- conf->link_xattr_name, NULL, NULL);
- } else {
- SYNCTASK_SETID (0, 0);
- ret = syncop_getxattr (src_node, &local->loc, &dict,
- conf->link_xattr_name, NULL, NULL);
- SYNCTASK_SETID (frame->root->uid, frame->root->gid);
- }
-
-
- /*
- * Each DHT xlator layer has its own name for the linkto xattr.
- * If the file mode bits indicate the the file is being migrated but
- * this layer's linkto xattr is not set, it means that another
- * DHT layer is migrating the file. In this case, return 1 so
- * the mode bits can be passed on to the higher layer for appropriate
- * action.
+dht_migration_complete_check_task(void *data)
+{
+ int ret = -1;
+ xlator_t *src_node = NULL;
+ xlator_t *dst_node = NULL, *linkto_target = NULL;
+ dht_local_t *local = NULL;
+ dict_t *dict = NULL;
+ struct iatt stbuf = {
+ 0,
+ };
+ xlator_t *this = NULL;
+ call_frame_t *frame = NULL;
+ loc_t tmp_loc = {
+ 0,
+ };
+ char *path = NULL;
+ dht_conf_t *conf = NULL;
+ inode_t *inode = NULL;
+ fd_t *iter_fd = NULL;
+ fd_t *tmp = NULL;
+ uint64_t tmp_miginfo = 0;
+ dht_migrate_info_t *miginfo = NULL;
+ gf_boolean_t skip_open = _gf_false;
+ int open_failed = 0;
+
+ this = THIS;
+ frame = data;
+ local = frame->local;
+ conf = this->private;
+
+ src_node = local->cached_subvol;
+
+ if (!local->loc.inode && !local->fd) {
+ local->op_errno = EINVAL;
+ goto out;
+ }
+
+ inode = (!local->fd) ? local->loc.inode : local->fd->inode;
+
+ /* getxattr on cached_subvol for 'linkto' value. Do path based getxattr
+ * as root:root. If a fd is already open, access check won't be done*/
+
+ if (!local->loc.inode) {
+ ret = syncop_fgetxattr(src_node, local->fd, &dict,
+ conf->link_xattr_name, NULL, NULL);
+ } else {
+ SYNCTASK_SETID(0, 0);
+ ret = syncop_getxattr(src_node, &local->loc, &dict,
+ conf->link_xattr_name, NULL, NULL);
+ SYNCTASK_SETID(frame->root->uid, frame->root->gid);
+ }
+
+ /*
+ * Each DHT xlator layer has its own name for the linkto xattr.
+ * If the file mode bits indicate the the file is being migrated but
+ * this layer's linkto xattr is not set, it means that another
+ * DHT layer is migrating the file. In this case, return 1 so
+ * the mode bits can be passed on to the higher layer for appropriate
+ * action.
+ */
+ if (-ret == ENODATA) {
+ /* This DHT translator is not migrating this file */
+
+ ret = inode_ctx_reset1(inode, this, &tmp_miginfo);
+ if (tmp_miginfo) {
+ /* This can be a problem if the file was
+ * migrated by two different layers. Raise
+ * a warning here.
+ */
+ gf_smsg(
+ this->name, GF_LOG_WARNING, 0, DHT_MSG_HAS_MIGINFO, "tmp=%s",
+ tmp_loc.path ? tmp_loc.path : uuid_utoa(tmp_loc.gfid), NULL);
+
+ miginfo = (void *)(uintptr_t)tmp_miginfo;
+ GF_REF_PUT(miginfo);
+ }
+ ret = 1;
+ goto out;
+ }
+
+ if (!ret)
+ linkto_target = dht_linkfile_subvol(this, NULL, NULL, dict);
+
+ if (local->loc.inode) {
+ loc_copy(&tmp_loc, &local->loc);
+ } else {
+ tmp_loc.inode = inode_ref(inode);
+ gf_uuid_copy(tmp_loc.gfid, inode->gfid);
+ }
+
+ ret = syncop_lookup(this, &tmp_loc, &stbuf, 0, 0, 0);
+ if (ret) {
+ gf_smsg(this->name, GF_LOG_ERROR, -ret, DHT_MSG_FILE_LOOKUP_FAILED,
+ "tmp=%s", tmp_loc.path ? tmp_loc.path : uuid_utoa(tmp_loc.gfid),
+ "name=%s", this->name, NULL);
+ local->op_errno = -ret;
+ ret = -1;
+ goto out;
+ }
+
+ dst_node = dht_subvol_get_cached(this, tmp_loc.inode);
+ if (linkto_target && dst_node != linkto_target) {
+ gf_smsg(this->name, GF_LOG_WARNING, 0, DHT_MSG_INVALID_LINKFILE,
+ "linkto_target_name=%s", linkto_target->name, "dst_name=%s",
+ dst_node->name, NULL);
+ }
+
+ if (gf_uuid_compare(stbuf.ia_gfid, tmp_loc.inode->gfid)) {
+ gf_smsg(this->name, GF_LOG_ERROR, 0, DHT_MSG_GFID_MISMATCH, "tmp=%s",
+ tmp_loc.path ? tmp_loc.path : uuid_utoa(tmp_loc.gfid),
+ "dst_name=%s", dst_node->name, NULL);
+ ret = -1;
+ local->op_errno = EIO;
+ goto out;
+ }
+
+ /* update local. A layout is set in inode-ctx in lookup already */
+
+ dht_layout_unref(this, local->layout);
+
+ local->layout = dht_layout_get(frame->this, inode);
+ local->cached_subvol = dst_node;
+
+ ret = 0;
+
+ /* once we detect the migration complete, the inode-ctx2 is no more
+ required.. delete the ctx and also, it means, open() already
+ done on all the fd of inode */
+ ret = inode_ctx_reset1(inode, this, &tmp_miginfo);
+ if (tmp_miginfo) {
+ miginfo = (void *)(uintptr_t)tmp_miginfo;
+ GF_REF_PUT(miginfo);
+ goto out;
+ }
+
+ /* perform 'open()' on all the fd's present on the inode */
+ if (tmp_loc.path == NULL) {
+ inode_path(inode, NULL, &path);
+ if (path)
+ tmp_loc.path = path;
+ }
+
+ LOCK(&inode->lock);
+
+ if (list_empty(&inode->fd_list))
+ goto unlock;
+
+ /* perform open as root:root. There is window between linkfile
+ * creation(root:root) and setattr with the correct uid/gid
+ */
+ SYNCTASK_SETID(0, 0);
+
+ /* It's possible that we are the last user of iter_fd after each
+ * iteration. In this case the fd_unref() of iter_fd at the end of
+ * the loop will cause the destruction of the fd. So we need to
+ * iterate the list safely because iter_fd cannot be trusted.
+ */
+ iter_fd = list_entry((&inode->fd_list)->next, typeof(*iter_fd), inode_list);
+ while (&iter_fd->inode_list != (&inode->fd_list)) {
+ if (fd_is_anonymous(iter_fd) ||
+ (dht_fd_open_on_dst(this, iter_fd, dst_node))) {
+ if (!tmp) {
+ iter_fd = list_entry(iter_fd->inode_list.next, typeof(*iter_fd),
+ inode_list);
+ continue;
+ }
+ skip_open = _gf_true;
+ }
+ /* We need to release the inode->lock before calling
+ * syncop_open() to avoid possible deadlocks. However this
+ * can cause the iter_fd to be released by other threads.
+ * To avoid this, we take a reference before releasing the
+ * lock.
*/
- if (-ret == ENODATA) {
- /* This DHT translator is not migrating this file */
-
- ret = inode_ctx_reset1 (inode, this, &tmp_miginfo);
- if (tmp_miginfo) {
-
- /* This can be a problem if the file was
- * migrated by two different layers. Raise
- * a warning here.
- */
- gf_msg (this->name, GF_LOG_WARNING, 0,
- DHT_MSG_HAS_MIGINFO,
- "%s: Found miginfo in the inode ctx",
- tmp_loc.path ? tmp_loc.path :
- uuid_utoa (tmp_loc.gfid));
-
- miginfo = (void *)tmp_miginfo;
- GF_REF_PUT (miginfo);
- }
- ret = 1;
- goto out;
- }
+ fd_ref(iter_fd);
- if (!ret)
- linkto_target = dht_linkfile_subvol (this, NULL, NULL, dict);
+ UNLOCK(&inode->lock);
- if (local->loc.inode) {
- loc_copy (&tmp_loc, &local->loc);
- } else {
- tmp_loc.inode = inode_ref (inode);
- gf_uuid_copy (tmp_loc.gfid, inode->gfid);
+ if (tmp) {
+ fd_unref(tmp);
+ tmp = NULL;
}
+ if (skip_open)
+ goto next;
- ret = syncop_lookup (this, &tmp_loc, &stbuf, 0, 0, 0);
- if (ret) {
- gf_msg (this->name, GF_LOG_ERROR, -ret,
- DHT_MSG_FILE_LOOKUP_FAILED,
- "%s: failed to lookup the file on %s",
- tmp_loc.path ? tmp_loc.path : uuid_utoa (tmp_loc.gfid),
- this->name);
- local->op_errno = -ret;
- ret = -1;
- goto out;
- }
-
- dst_node = dht_subvol_get_cached (this, tmp_loc.inode);
- if (linkto_target && dst_node != linkto_target) {
- gf_msg (this->name, GF_LOG_WARNING, 0,
- DHT_MSG_INVALID_LINKFILE,
- "linkto target (%s) is "
- "different from cached-subvol (%s). Treating %s as "
- "destination subvol", linkto_target->name,
- dst_node->name, dst_node->name);
- }
-
- if (gf_uuid_compare (stbuf.ia_gfid, tmp_loc.inode->gfid)) {
- gf_msg (this->name, GF_LOG_ERROR, 0,
- DHT_MSG_GFID_MISMATCH,
- "%s: gfid different on the target file on %s",
- tmp_loc.path ? tmp_loc.path :
- uuid_utoa (tmp_loc.gfid), dst_node->name);
- ret = -1;
- local->op_errno = EIO;
- goto out;
- }
-
- /* update local. A layout is set in inode-ctx in lookup already */
-
- dht_layout_unref (this, local->layout);
-
- local->layout = dht_layout_get (frame->this, inode);
- local->cached_subvol = dst_node;
-
- ret = 0;
-
- /* once we detect the migration complete, the inode-ctx2 is no more
- required.. delete the ctx and also, it means, open() already
- done on all the fd of inode */
- ret = inode_ctx_reset1 (inode, this, &tmp_miginfo);
- if (tmp_miginfo) {
- miginfo = (void *)tmp_miginfo;
- GF_REF_PUT (miginfo);
- goto out;
+ /* flags for open are stripped down to allow following the
+ * new location of the file, otherwise we can get EEXIST or
+ * truncate the file again as rebalance is moving the data */
+ ret = syncop_open(dst_node, &tmp_loc,
+ (iter_fd->flags & ~(O_CREAT | O_EXCL | O_TRUNC)),
+ iter_fd, NULL, NULL);
+ if (ret < 0) {
+ gf_smsg(this->name, GF_LOG_ERROR, -ret,
+ DHT_MSG_OPEN_FD_ON_DST_FAILED, "id=%p", iter_fd,
+ "flags=0%o", iter_fd->flags, "path=%s", path, "name=%s",
+ dst_node->name, NULL);
+
+ open_failed = 1;
+ local->op_errno = -ret;
+ ret = -1;
+ } else {
+ dht_fd_ctx_set(this, iter_fd, dst_node);
}
- if (list_empty (&inode->fd_list))
- goto out;
+ next:
+ LOCK(&inode->lock);
+ skip_open = _gf_false;
+ tmp = iter_fd;
+ iter_fd = list_entry(tmp->inode_list.next, typeof(*tmp), inode_list);
+ }
- /* perform open as root:root. There is window between linkfile
- * creation(root:root) and setattr with the correct uid/gid
- */
- SYNCTASK_SETID(0, 0);
+ SYNCTASK_SETID(frame->root->uid, frame->root->gid);
- /* perform 'open()' on all the fd's present on the inode */
- tmp_loc.inode = inode;
- inode_path (inode, NULL, &path);
- if (path)
- tmp_loc.path = path;
-
- list_for_each_entry (iter_fd, &inode->fd_list, inode_list) {
-
- if (fd_is_anonymous (iter_fd))
- continue;
- if (dht_fd_open_on_dst (this, iter_fd, dst_node))
- continue;
-
- /* flags for open are stripped down to allow following the
- * new location of the file, otherwise we can get EEXIST or
- * truncate the file again as rebalance is moving the data */
- ret = syncop_open (dst_node, &tmp_loc,
- (iter_fd->flags &
- ~(O_CREAT | O_EXCL | O_TRUNC)),
- iter_fd, NULL, NULL);
- if (ret < 0) {
- gf_msg (this->name, GF_LOG_ERROR, -ret,
- DHT_MSG_OPEN_FD_ON_DST_FAILED, "failed"
- " to open the fd"
- " (%p, flags=0%o) on file %s @ %s",
- iter_fd, iter_fd->flags, path,
- dst_node->name);
-
- open_failed = 1;
- local->op_errno = -ret;
- ret = -1;
- } else {
- dht_fd_ctx_set (this, iter_fd, dst_node);
- }
- }
+ if (open_failed) {
+ ret = -1;
+ goto unlock;
+ }
+ ret = 0;
- SYNCTASK_SETID (frame->root->uid, frame->root->gid);
+unlock:
+ UNLOCK(&inode->lock);
+ if (tmp) {
+ fd_unref(tmp);
+ tmp = NULL;
+ }
- if (open_failed) {
- ret = -1;
- goto out;
- }
- ret = 0;
out:
+ if (dict) {
+ dict_unref(dict);
+ }
- loc_wipe (&tmp_loc);
+ loc_wipe(&tmp_loc);
- return ret;
+ return ret;
}
int
-dht_rebalance_complete_check (xlator_t *this, call_frame_t *frame)
+dht_rebalance_complete_check(xlator_t *this, call_frame_t *frame)
{
- int ret = -1;
-
- ret = synctask_new (this->ctx->env, dht_migration_complete_check_task,
- dht_migration_complete_check_done,
- frame, frame);
- return ret;
+ int ret = -1;
+ ret = synctask_new(this->ctx->env, dht_migration_complete_check_task,
+ dht_migration_complete_check_done, frame, frame);
+ return ret;
}
/* During 'in-progress' state, both nodes should have the file */
@@ -1296,971 +1516,789 @@ dht_rebalance_complete_check (xlator_t *this, call_frame_t *frame)
1 : File is being migrated but not by this DHT layer.
*/
static int
-dht_inprogress_check_done (int op_ret, call_frame_t *frame, void *data)
+dht_inprogress_check_done(int op_ret, call_frame_t *frame, void *data)
{
- dht_local_t *local = NULL;
- xlator_t *dst_subvol = NULL, *src_subvol = NULL;
- inode_t *inode = NULL;
+ dht_local_t *local = NULL;
+ xlator_t *dst_subvol = NULL, *src_subvol = NULL;
+ inode_t *inode = NULL;
- local = frame->local;
+ local = frame->local;
- if (op_ret != 0)
- goto out;
+ if (op_ret != 0)
+ goto out;
- inode = local->loc.inode ? local->loc.inode : local->fd->inode;
+ inode = local->loc.inode ? local->loc.inode : local->fd->inode;
- dht_inode_ctx_get_mig_info (THIS, inode, &src_subvol, &dst_subvol);
- if (dht_mig_info_is_invalid (local->cached_subvol,
- src_subvol, dst_subvol)) {
- dst_subvol = dht_subvol_get_cached (THIS, inode);
- if (!dst_subvol) {
- local->op_errno = EINVAL;
- goto out;
- }
+ dht_inode_ctx_get_mig_info(THIS, inode, &src_subvol, &dst_subvol);
+ if (dht_mig_info_is_invalid(local->cached_subvol, src_subvol, dst_subvol)) {
+ dst_subvol = dht_subvol_get_cached(THIS, inode);
+ if (!dst_subvol) {
+ local->op_errno = EINVAL;
+ goto out;
}
+ }
out:
- local->rebalance.target_op_fn (THIS, dst_subvol, frame, op_ret);
+ local->rebalance.target_op_fn(THIS, dst_subvol, frame, op_ret);
- return 0;
+ return 0;
}
static int
-dht_rebalance_inprogress_task (void *data)
-{
- int ret = -1;
- xlator_t *src_node = NULL;
- xlator_t *dst_node = NULL;
- dht_local_t *local = NULL;
- dict_t *dict = NULL;
- call_frame_t *frame = NULL;
- xlator_t *this = NULL;
- char *path = NULL;
- struct iatt stbuf = {0,};
- loc_t tmp_loc = {0,};
- dht_conf_t *conf = NULL;
- inode_t *inode = NULL;
- fd_t *iter_fd = NULL;
- int open_failed = 0;
- uint64_t tmp_miginfo = 0;
- dht_migrate_info_t *miginfo = NULL;
-
-
- this = THIS;
- frame = data;
- local = frame->local;
- conf = this->private;
-
- src_node = local->cached_subvol;
-
- if (!local->loc.inode && !local->fd)
- goto out;
-
- inode = (!local->fd) ? local->loc.inode : local->fd->inode;
+dht_rebalance_inprogress_task(void *data)
+{
+ int ret = -1;
+ xlator_t *src_node = NULL;
+ xlator_t *dst_node = NULL;
+ dht_local_t *local = NULL;
+ dict_t *dict = NULL;
+ call_frame_t *frame = NULL;
+ xlator_t *this = NULL;
+ char *path = NULL;
+ struct iatt stbuf = {
+ 0,
+ };
+ loc_t tmp_loc = {
+ 0,
+ };
+ dht_conf_t *conf = NULL;
+ inode_t *inode = NULL;
+ fd_t *iter_fd = NULL;
+ fd_t *tmp = NULL;
+ int open_failed = 0;
+ uint64_t tmp_miginfo = 0;
+ dht_migrate_info_t *miginfo = NULL;
+ gf_boolean_t skip_open = _gf_false;
+
+ this = THIS;
+ frame = data;
+ local = frame->local;
+ conf = this->private;
+
+ src_node = local->cached_subvol;
+
+ if (!local->loc.inode && !local->fd)
+ goto out;
+
+ inode = (!local->fd) ? local->loc.inode : local->fd->inode;
+
+ /* getxattr on cached_subvol for 'linkto' value. Do path based getxattr
+ * as root:root. If a fd is already open, access check won't be done*/
+ if (local->loc.inode) {
+ SYNCTASK_SETID(0, 0);
+ ret = syncop_getxattr(src_node, &local->loc, &dict,
+ conf->link_xattr_name, NULL, NULL);
+ SYNCTASK_SETID(frame->root->uid, frame->root->gid);
+ } else {
+ ret = syncop_fgetxattr(src_node, local->fd, &dict,
+ conf->link_xattr_name, NULL, NULL);
+ }
+
+ /*
+ * Each DHT xlator layer has its own name for the linkto xattr.
+ * If the file mode bits indicate the the file is being migrated but
+ * this layer's linkto xattr is not present, it means that another
+ * DHT layer is migrating the file. In this case, return 1 so
+ * the mode bits can be passed on to the higher layer for appropriate
+ * action.
+ */
+
+ if (-ret == ENODATA) {
+ /* This DHT layer is not migrating this file */
+ ret = inode_ctx_reset1(inode, this, &tmp_miginfo);
+ if (tmp_miginfo) {
+ /* This can be a problem if the file was
+ * migrated by two different layers. Raise
+ * a warning here.
+ */
+ gf_smsg(
+ this->name, GF_LOG_WARNING, 0, DHT_MSG_HAS_MIGINFO, "tmp=%s",
+ tmp_loc.path ? tmp_loc.path : uuid_utoa(tmp_loc.gfid), NULL);
+ miginfo = (void *)(uintptr_t)tmp_miginfo;
+ GF_REF_PUT(miginfo);
+ }
+ ret = 1;
+ goto out;
+ }
+
+ if (ret < 0) {
+ gf_smsg(this->name, GF_LOG_ERROR, -ret, DHT_MSG_GET_XATTR_FAILED,
+ "path=%s", local->loc.path, NULL);
+ ret = -1;
+ goto out;
+ }
+
+ dst_node = dht_linkfile_subvol(this, NULL, NULL, dict);
+ if (!dst_node) {
+ gf_smsg(this->name, GF_LOG_ERROR, 0, DHT_MSG_GET_XATTR_FAILED,
+ "path=%s", local->loc.path, NULL);
+ ret = -1;
+ goto out;
+ }
+
+ local->rebalance.target_node = dst_node;
+
+ if (local->loc.inode) {
+ loc_copy(&tmp_loc, &local->loc);
+ } else {
+ tmp_loc.inode = inode_ref(inode);
+ gf_uuid_copy(tmp_loc.gfid, inode->gfid);
+ }
+
+ /* lookup on dst */
+ ret = syncop_lookup(dst_node, &tmp_loc, &stbuf, NULL, NULL, NULL);
+ if (ret) {
+ gf_smsg(this->name, GF_LOG_ERROR, -ret, DHT_MSG_FILE_LOOKUP_FAILED,
+ "tmp=%s", tmp_loc.path ? tmp_loc.path : uuid_utoa(tmp_loc.gfid),
+ "name=%s", dst_node->name, NULL);
+ ret = -1;
+ goto out;
+ }
+
+ if (gf_uuid_compare(stbuf.ia_gfid, tmp_loc.inode->gfid)) {
+ gf_smsg(this->name, GF_LOG_ERROR, 0, DHT_MSG_GFID_MISMATCH, "tmp=%s",
+ tmp_loc.path ? tmp_loc.path : uuid_utoa(tmp_loc.gfid),
+ "name=%s", dst_node->name, NULL);
+ ret = -1;
+ goto out;
+ }
+ ret = 0;
+
+ if (tmp_loc.path == NULL) {
+ inode_path(inode, NULL, &path);
+ if (path)
+ tmp_loc.path = path;
+ }
+
+ LOCK(&inode->lock);
+
+ if (list_empty(&inode->fd_list))
+ goto unlock;
+
+ /* perform open as root:root. There is window between linkfile
+ * creation(root:root) and setattr with the correct uid/gid
+ */
+ SYNCTASK_SETID(0, 0);
+
+ /* It's possible that we are the last user of iter_fd after each
+ * iteration. In this case the fd_unref() of iter_fd at the end of
+ * the loop will cause the destruction of the fd. So we need to
+ * iterate the list safely because iter_fd cannot be trusted.
+ */
+ iter_fd = list_entry((&inode->fd_list)->next, typeof(*iter_fd), inode_list);
+ while (&iter_fd->inode_list != (&inode->fd_list)) {
+ /* We need to release the inode->lock before calling
+ * syncop_open() to avoid possible deadlocks. However this
+ * can cause the iter_fd to be released by other threads.
+ * To avoid this, we take a reference before releasing the
+ * lock.
+ */
- /* getxattr on cached_subvol for 'linkto' value. Do path based getxattr
- * as root:root. If a fd is already open, access check wont be done*/
- if (local->loc.inode) {
- SYNCTASK_SETID (0, 0);
- ret = syncop_getxattr (src_node, &local->loc, &dict,
- conf->link_xattr_name, NULL, NULL);
- SYNCTASK_SETID (frame->root->uid, frame->root->gid);
- } else {
- ret = syncop_fgetxattr (src_node, local->fd, &dict,
- conf->link_xattr_name, NULL, NULL);
+ if (fd_is_anonymous(iter_fd) ||
+ (dht_fd_open_on_dst(this, iter_fd, dst_node))) {
+ if (!tmp) {
+ iter_fd = list_entry(iter_fd->inode_list.next, typeof(*iter_fd),
+ inode_list);
+ continue;
+ }
+ skip_open = _gf_true;
}
- /*
- * Each DHT xlator layer has its own name for the linkto xattr.
- * If the file mode bits indicate the the file is being migrated but
- * this layer's linkto xattr is not present, it means that another
- * DHT layer is migrating the file. In this case, return 1 so
- * the mode bits can be passed on to the higher layer for appropriate
- * action.
+ /* Yes, this is ugly but there isn't a cleaner way to do this
+ * the fd_ref is an atomic increment so not too bad. We want to
+ * reduce the number of inode locks and unlocks.
*/
- if (-ret == ENODATA) {
- /* This DHT layer is not migrating this file */
- ret = inode_ctx_reset1 (inode, this, &tmp_miginfo);
- if (tmp_miginfo) {
- /* This can be a problem if the file was
- * migrated by two different layers. Raise
- * a warning here.
- */
- gf_msg (this->name, GF_LOG_WARNING, 0,
- DHT_MSG_HAS_MIGINFO,
- "%s: Found miginfo in the inode ctx",
- tmp_loc.path ? tmp_loc.path :
- uuid_utoa (tmp_loc.gfid));
- miginfo = (void *)tmp_miginfo;
- GF_REF_PUT (miginfo);
- }
- ret = 1;
- goto out;
- }
+ fd_ref(iter_fd);
+ UNLOCK(&inode->lock);
- if (ret < 0) {
- gf_msg (this->name, GF_LOG_ERROR, -ret,
- DHT_MSG_GET_XATTR_FAILED,
- "%s: failed to get the 'linkto' xattr",
- local->loc.path);
- ret = -1;
- goto out;
+ if (tmp) {
+ fd_unref(tmp);
+ tmp = NULL;
}
+ if (skip_open)
+ goto next;
- dst_node = dht_linkfile_subvol (this, NULL, NULL, dict);
- if (!dst_node) {
- gf_msg (this->name, GF_LOG_ERROR, 0,
- DHT_MSG_SUBVOL_NOT_FOUND,
- "%s: failed to get the 'linkto' xattr from dict",
- local->loc.path);
- ret = -1;
- goto out;
- }
-
- local->rebalance.target_node = dst_node;
-
- if (local->loc.inode) {
- loc_copy (&tmp_loc, &local->loc);
+ /* flags for open are stripped down to allow following the
+ * new location of the file, otherwise we can get EEXIST or
+ * truncate the file again as rebalance is moving the data */
+ ret = syncop_open(dst_node, &tmp_loc,
+ (iter_fd->flags & ~(O_CREAT | O_EXCL | O_TRUNC)),
+ iter_fd, NULL, NULL);
+ if (ret < 0) {
+ gf_smsg(this->name, GF_LOG_ERROR, -ret,
+ DHT_MSG_OPEN_FD_ON_DST_FAILED, "fd=%p", iter_fd,
+ "flags=0%o", iter_fd->flags, "path=%s", path, "name=%s",
+ dst_node->name, NULL);
+ ret = -1;
+ open_failed = 1;
} else {
- tmp_loc.inode = inode_ref (inode);
- gf_uuid_copy (tmp_loc.gfid, inode->gfid);
- }
-
- /* lookup on dst */
- ret = syncop_lookup (dst_node, &tmp_loc, &stbuf, NULL,
- NULL, NULL);
- if (ret) {
- gf_msg (this->name, GF_LOG_ERROR, -ret,
- DHT_MSG_FILE_LOOKUP_ON_DST_FAILED,
- "%s: failed to lookup the file on %s",
- tmp_loc.path ? tmp_loc.path : uuid_utoa (tmp_loc.gfid),
- dst_node->name);
- ret = -1;
- goto out;
- }
-
- if (gf_uuid_compare (stbuf.ia_gfid, tmp_loc.inode->gfid)) {
- gf_msg (this->name, GF_LOG_ERROR, 0,
- DHT_MSG_GFID_MISMATCH,
- "%s: gfid different on the target file on %s",
- tmp_loc.path ? tmp_loc.path : uuid_utoa (tmp_loc.gfid),
- dst_node->name);
- ret = -1;
- goto out;
- }
- ret = 0;
-
- if (list_empty (&inode->fd_list))
- goto done;
-
- /* perform open as root:root. There is window between linkfile
- * creation(root:root) and setattr with the correct uid/gid
- */
- SYNCTASK_SETID (0, 0);
-
- inode_path (inode, NULL, &path);
- if (path)
- tmp_loc.path = path;
-
- list_for_each_entry (iter_fd, &inode->fd_list, inode_list) {
- if (fd_is_anonymous (iter_fd))
- continue;
-
- if (dht_fd_open_on_dst (this, iter_fd, dst_node))
- continue;
- /* flags for open are stripped down to allow following the
- * new location of the file, otherwise we can get EEXIST or
- * truncate the file again as rebalance is moving the data */
- ret = syncop_open (dst_node, &tmp_loc,
- (iter_fd->flags &
- ~(O_CREAT | O_EXCL | O_TRUNC)),
- iter_fd, NULL, NULL);
- if (ret < 0) {
- gf_msg (this->name, GF_LOG_ERROR, -ret,
- DHT_MSG_OPEN_FD_ON_DST_FAILED,
- "failed to send open "
- "the fd (%p, flags=0%o) on file %s @ %s",
- iter_fd, iter_fd->flags, path,
- dst_node->name);
- ret = -1;
- open_failed = 1;
- } else {
- /* Potential fd leak if this fails here as it will be
- reopened at the next Phase1/2 check */
- dht_fd_ctx_set (this, iter_fd, dst_node);
- }
-
+ /* Potential fd leak if this fails here as it will be
+ reopened at the next Phase1/2 check */
+ dht_fd_ctx_set(this, iter_fd, dst_node);
}
- SYNCTASK_SETID (frame->root->uid, frame->root->gid);
+ next:
+ LOCK(&inode->lock);
+ skip_open = _gf_false;
+ tmp = iter_fd;
+ iter_fd = list_entry(tmp->inode_list.next, typeof(*tmp), inode_list);
+ }
- if (open_failed) {
- ret = -1;
- goto out;
- }
+ SYNCTASK_SETID(frame->root->uid, frame->root->gid);
-done:
- ret = dht_inode_ctx_set_mig_info (this, inode, src_node, dst_node);
- if (ret) {
- gf_msg (this->name, GF_LOG_ERROR, 0,
- DHT_MSG_SET_INODE_CTX_FAILED,
- "%s: failed to set inode-ctx target file at %s",
- local->loc.path, dst_node->name);
- goto out;
- }
-
- ret = 0;
+unlock:
+ UNLOCK(&inode->lock);
+
+ if (tmp) {
+ fd_unref(tmp);
+ tmp = NULL;
+ }
+ if (open_failed) {
+ ret = -1;
+ goto out;
+ }
+
+ ret = dht_inode_ctx_set_mig_info(this, inode, src_node, dst_node);
+ if (ret) {
+ gf_smsg(this->name, GF_LOG_ERROR, 0, DHT_MSG_SET_INODE_CTX_FAILED,
+ "path=%s", local->loc.path, "name=%s", dst_node->name, NULL);
+ goto out;
+ }
+
+ ret = 0;
out:
- loc_wipe (&tmp_loc);
- return ret;
+ if (dict) {
+ dict_unref(dict);
+ }
+
+ loc_wipe(&tmp_loc);
+ return ret;
}
int
-dht_rebalance_in_progress_check (xlator_t *this, call_frame_t *frame)
+dht_rebalance_in_progress_check(xlator_t *this, call_frame_t *frame)
{
+ int ret = -1;
- int ret = -1;
-
- ret = synctask_new (this->ctx->env, dht_rebalance_inprogress_task,
- dht_inprogress_check_done,
- frame, frame);
- return ret;
+ ret = synctask_new(this->ctx->env, dht_rebalance_inprogress_task,
+ dht_inprogress_check_done, frame, frame);
+ return ret;
}
int
-dht_inode_ctx_layout_set (inode_t *inode, xlator_t *this,
- dht_layout_t *layout_int)
+dht_inode_ctx_layout_set(inode_t *inode, xlator_t *this,
+ dht_layout_t *layout_int)
{
- dht_inode_ctx_t *ctx = NULL;
- int ret = -1;
+ dht_inode_ctx_t *ctx = NULL;
+ int ret = -1;
- ret = dht_inode_ctx_get (inode, this, &ctx);
- if (!ret && ctx) {
- ctx->layout = layout_int;
- } else {
- ctx = GF_CALLOC (1, sizeof (*ctx), gf_dht_mt_inode_ctx_t);
- if (!ctx)
- return ret;
- ctx->layout = layout_int;
- }
+ ret = dht_inode_ctx_get(inode, this, &ctx);
+ if (!ret && ctx) {
+ ctx->layout = layout_int;
+ } else {
+ ctx = GF_CALLOC(1, sizeof(*ctx), gf_dht_mt_inode_ctx_t);
+ if (!ctx)
+ return ret;
+ ctx->layout = layout_int;
+ }
- ret = dht_inode_ctx_set (inode, this, ctx);
+ ret = dht_inode_ctx_set(inode, this, ctx);
- return ret;
+ return ret;
}
-
void
-dht_inode_ctx_time_set (inode_t *inode, xlator_t *this, struct iatt *stat)
+dht_inode_ctx_time_set(inode_t *inode, xlator_t *this, struct iatt *stat)
{
- dht_inode_ctx_t *ctx = NULL;
- dht_stat_time_t *time = 0;
- int ret = -1;
+ dht_inode_ctx_t *ctx = NULL;
+ dht_stat_time_t *time = 0;
+ int ret = -1;
- ret = dht_inode_ctx_get (inode, this, &ctx);
+ ret = dht_inode_ctx_get(inode, this, &ctx);
- if (ret)
- return;
+ if (ret)
+ return;
- time = &ctx->time;
+ time = &ctx->time;
- time->mtime = stat->ia_mtime;
- time->mtime_nsec = stat->ia_mtime_nsec;
+ time->mtime = stat->ia_mtime;
+ time->mtime_nsec = stat->ia_mtime_nsec;
- time->ctime = stat->ia_ctime;
- time->ctime_nsec = stat->ia_ctime_nsec;
+ time->ctime = stat->ia_ctime;
+ time->ctime_nsec = stat->ia_ctime_nsec;
- time->atime = stat->ia_atime;
- time->atime_nsec = stat->ia_atime_nsec;
+ time->atime = stat->ia_atime;
+ time->atime_nsec = stat->ia_atime_nsec;
- return;
+ return;
}
-
int
-dht_inode_ctx_time_update (inode_t *inode, xlator_t *this, struct iatt *stat,
- int32_t post)
-{
- dht_inode_ctx_t *ctx = NULL;
- dht_stat_time_t *time = 0;
- int ret = -1;
-
- GF_VALIDATE_OR_GOTO (this->name, stat, out);
- GF_VALIDATE_OR_GOTO (this->name, inode, out);
-
- ret = dht_inode_ctx_get (inode, this, &ctx);
-
- if (ret) {
- ctx = GF_CALLOC (1, sizeof (*ctx), gf_dht_mt_inode_ctx_t);
- if (!ctx)
- return -1;
- }
-
- time = &ctx->time;
-
- DHT_UPDATE_TIME(time->mtime, time->mtime_nsec,
- stat->ia_mtime, stat->ia_mtime_nsec, inode, post);
- DHT_UPDATE_TIME(time->ctime, time->ctime_nsec,
- stat->ia_ctime, stat->ia_ctime_nsec, inode, post);
- DHT_UPDATE_TIME(time->atime, time->atime_nsec,
- stat->ia_atime, stat->ia_atime_nsec, inode, post);
-
- ret = dht_inode_ctx_set (inode, this, ctx);
+dht_inode_ctx_time_update(inode_t *inode, xlator_t *this, struct iatt *stat,
+ int32_t post)
+{
+ dht_inode_ctx_t *ctx = NULL;
+ dht_stat_time_t *time = 0;
+ int ret = -1;
+
+ GF_VALIDATE_OR_GOTO(this->name, stat, out);
+ GF_VALIDATE_OR_GOTO(this->name, inode, out);
+
+ ret = dht_inode_ctx_get(inode, this, &ctx);
+
+ if (ret) {
+ ctx = GF_CALLOC(1, sizeof(*ctx), gf_dht_mt_inode_ctx_t);
+ if (!ctx)
+ return -1;
+ }
+
+ time = &ctx->time;
+
+ LOCK(&inode->lock);
+ {
+ DHT_UPDATE_TIME(time->mtime, time->mtime_nsec, stat->ia_mtime,
+ stat->ia_mtime_nsec, post);
+ DHT_UPDATE_TIME(time->ctime, time->ctime_nsec, stat->ia_ctime,
+ stat->ia_ctime_nsec, post);
+ DHT_UPDATE_TIME(time->atime, time->atime_nsec, stat->ia_atime,
+ stat->ia_atime_nsec, post);
+ }
+ UNLOCK(&inode->lock);
+
+ ret = dht_inode_ctx_set(inode, this, ctx);
out:
- return 0;
+ return 0;
}
int
-dht_inode_ctx_get (inode_t *inode, xlator_t *this, dht_inode_ctx_t **ctx)
+dht_inode_ctx_get(inode_t *inode, xlator_t *this, dht_inode_ctx_t **ctx)
{
- int ret = -1;
- uint64_t ctx_int = 0;
-
- GF_VALIDATE_OR_GOTO ("dht", this, out);
- GF_VALIDATE_OR_GOTO (this->name, inode, out);
+ int ret = -1;
+ uint64_t ctx_int = 0;
- ret = inode_ctx_get (inode, this, &ctx_int);
+ GF_VALIDATE_OR_GOTO("dht", this, out);
+ GF_VALIDATE_OR_GOTO(this->name, inode, out);
- if (ret)
- return ret;
-
- if (ctx)
- *ctx = (dht_inode_ctx_t *) ctx_int;
-out:
- return ret;
-}
+ ret = inode_ctx_get(inode, this, &ctx_int);
-int dht_inode_ctx_set (inode_t *inode, xlator_t *this, dht_inode_ctx_t *ctx)
-{
- int ret = -1;
- uint64_t ctx_int = 0;
-
- GF_VALIDATE_OR_GOTO ("dht", this, out);
- GF_VALIDATE_OR_GOTO (this->name, inode, out);
- GF_VALIDATE_OR_GOTO (this->name, ctx, out);
-
- ctx_int = (long)ctx;
- ret = inode_ctx_set (inode, this, &ctx_int);
-out:
+ if (ret)
return ret;
-}
-
-void
-dht_set_lkowner (dht_lock_t **lk_array, int count, gf_lkowner_t *lkowner)
-{
- int i = 0;
-
- if (!lk_array || !lkowner)
- goto out;
-
- for (i = 0; i < count; i++) {
- lk_array[i]->lk_owner = *lkowner;
- }
+ if (ctx)
+ *ctx = (dht_inode_ctx_t *)(uintptr_t)ctx_int;
out:
- return;
+ return ret;
}
int
-dht_subvol_status (dht_conf_t *conf, xlator_t *subvol)
-{
- int i;
-
- for (i=0 ; i < conf->subvolume_cnt; i++) {
- if (conf->subvolumes[i] == subvol) {
- return conf->subvolume_status[i];
- }
- }
- return 0;
-}
-
-void
-dht_inodelk_done (call_frame_t *lock_frame)
+dht_inode_ctx_set(inode_t *inode, xlator_t *this, dht_inode_ctx_t *ctx)
{
- fop_inodelk_cbk_t inodelk_cbk = NULL;
- call_frame_t *main_frame = NULL;
- dht_local_t *local = NULL;
+ int ret = -1;
+ uint64_t ctx_int = 0;
- local = lock_frame->local;
- main_frame = local->main_frame;
+ GF_VALIDATE_OR_GOTO("dht", this, out);
+ GF_VALIDATE_OR_GOTO(this->name, inode, out);
+ GF_VALIDATE_OR_GOTO(this->name, ctx, out);
- local->lock.locks = NULL;
- local->lock.lk_count = 0;
-
- inodelk_cbk = local->lock.inodelk_cbk;
- local->lock.inodelk_cbk = NULL;
-
- inodelk_cbk (main_frame, NULL, main_frame->this, local->lock.op_ret,
- local->lock.op_errno, NULL);
-
- dht_lock_stack_destroy (lock_frame);
- return;
+ ctx_int = (long)ctx;
+ ret = inode_ctx_set(inode, this, &ctx_int);
+out:
+ return ret;
}
int
-dht_inodelk_cleanup_cbk (call_frame_t *frame, void *cookie,
- xlator_t *this, int32_t op_ret, int32_t op_errno,
- dict_t *xdata)
-{
- dht_inodelk_done (frame);
- return 0;
-}
-
-int32_t
-dht_lock_count (dht_lock_t **lk_array, int lk_count)
-{
- int i = 0, locked = 0;
-
- if ((lk_array == NULL) || (lk_count == 0))
+dht_subvol_status(dht_conf_t *conf, xlator_t *subvol)
+{
+ int i;
+
+ for (i = 0; i < conf->subvolume_cnt; i++) {
+ if (conf->subvolumes[i] == subvol) {
+ return conf->subvolume_status[i];
+ }
+ }
+ return 0;
+}
+
+inode_t *
+dht_heal_path(xlator_t *this, char *path, inode_table_t *itable)
+{
+ int ret = -1;
+ struct iatt iatt = {
+ 0,
+ };
+ inode_t *linked_inode = NULL;
+ loc_t loc = {
+ 0,
+ };
+ char *bname = NULL;
+ char *save_ptr = NULL;
+ static uuid_t gfid = {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1};
+ char *tmp_path = NULL;
+
+ tmp_path = gf_strdup(path);
+ if (!tmp_path) {
+ goto out;
+ }
+
+ gf_uuid_copy(loc.pargfid, gfid);
+ loc.parent = inode_ref(itable->root);
+
+ bname = strtok_r(tmp_path, "/", &save_ptr);
+
+ /* sending a lookup on parent directory,
+ * Eg: if path is like /a/b/c/d/e/f/g/
+ * then we will send a lookup on a first and then b,c,d,etc
+ */
+
+ while (bname) {
+ linked_inode = NULL;
+ loc.inode = inode_grep(itable, loc.parent, bname);
+ if (loc.inode == NULL) {
+ loc.inode = inode_new(itable);
+ if (loc.inode == NULL) {
+ ret = -ENOMEM;
goto out;
-
- for (i = 0; i < lk_count; i++) {
- if (lk_array[i]->locked)
- locked++;
- }
-out:
- return locked;
-}
-
-void
-dht_inodelk_cleanup (call_frame_t *lock_frame)
-{
- dht_lock_t **lk_array = NULL;
- int lk_count = 0, lk_acquired = 0;
- dht_local_t *local = NULL;
-
- local = lock_frame->local;
-
- lk_array = local->lock.locks;
- lk_count = local->lock.lk_count;
-
- lk_acquired = dht_lock_count (lk_array, lk_count);
- if (lk_acquired != 0) {
- dht_unlock_inodelk (lock_frame, lk_array, lk_count,
- dht_inodelk_cleanup_cbk);
- } else {
- dht_inodelk_done (lock_frame);
- }
-
- return;
-}
-
-int32_t
-dht_unlock_inodelk_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
- int32_t op_ret, int32_t op_errno, dict_t *xdata)
-{
- dht_local_t *local = NULL;
- int lk_index = 0, call_cnt = 0;
- char gfid[GF_UUID_BUF_SIZE] = {0};
-
- lk_index = (long) cookie;
-
- local = frame->local;
- if (op_ret < 0) {
- uuid_utoa_r (local->lock.locks[lk_index]->loc.gfid,
- gfid);
-
- gf_msg (this->name, GF_LOG_WARNING, op_errno,
- DHT_MSG_UNLOCKING_FAILED,
- "unlocking failed on %s:%s",
- local->lock.locks[lk_index]->xl->name,
- gfid);
+ }
} else {
- local->lock.locks[lk_index]->locked = 0;
- }
-
- call_cnt = dht_frame_return (frame);
- if (is_last_call (call_cnt)) {
- dht_inodelk_done (frame);
- }
-
- return 0;
-}
-
-call_frame_t *
-dht_lock_frame (call_frame_t *parent_frame)
-{
- call_frame_t *lock_frame = NULL;
-
- lock_frame = copy_frame (parent_frame);
- if (lock_frame == NULL)
+ /*
+ * Inode is already populated in the inode table.
+ * Which means we already looked up the inode and
+ * linked with a dentry. So that we will skip
+ * lookup on this entry, and proceed to next.
+ */
+ linked_inode = loc.inode;
+ bname = strtok_r(NULL, "/", &save_ptr);
+ if (!bname) {
goto out;
-
- set_lk_owner_from_ptr (&lock_frame->root->lk_owner, parent_frame->root);
-
-out:
- return lock_frame;
-}
-
-int32_t
-dht_unlock_inodelk (call_frame_t *frame, dht_lock_t **lk_array, int lk_count,
- fop_inodelk_cbk_t inodelk_cbk)
-{
- dht_local_t *local = NULL;
- struct gf_flock flock = {0,};
- int ret = -1 , i = 0;
- call_frame_t *lock_frame = NULL;
- int call_cnt = 0;
-
- GF_VALIDATE_OR_GOTO ("dht-locks", frame, done);
- GF_VALIDATE_OR_GOTO (frame->this->name, lk_array, done);
- GF_VALIDATE_OR_GOTO (frame->this->name, inodelk_cbk, done);
-
- call_cnt = dht_lock_count (lk_array, lk_count);
- if (call_cnt == 0) {
- ret = 0;
- goto done;
+ }
+ inode_unref(loc.parent);
+ loc.parent = loc.inode;
+ gf_uuid_copy(loc.pargfid, loc.inode->gfid);
+ loc.inode = NULL;
+ continue;
}
- lock_frame = dht_lock_frame (frame);
- if (lock_frame == NULL) {
- gf_msg (frame->this->name, GF_LOG_WARNING, 0,
- DHT_MSG_UNLOCKING_FAILED,
- "cannot allocate a frame, not unlocking following "
- "locks:");
+ loc.name = bname;
+ ret = loc_path(&loc, bname);
- dht_log_lk_array (frame->this->name, GF_LOG_WARNING, lk_array,
- lk_count);
- goto done;
- }
-
- ret = dht_local_lock_init (lock_frame, lk_array, lk_count, inodelk_cbk);
- if (ret < 0) {
- gf_msg (frame->this->name, GF_LOG_WARNING, 0,
- DHT_MSG_UNLOCKING_FAILED,
- "storing locks in local failed, not unlocking "
- "following locks:");
-
- dht_log_lk_array (frame->this->name, GF_LOG_WARNING, lk_array,
- lk_count);
-
- goto done;
- }
-
- local = lock_frame->local;
- local->main_frame = frame;
- local->call_cnt = call_cnt;
-
- flock.l_type = F_UNLCK;
-
- for (i = 0; i < local->lock.lk_count; i++) {
- if (!local->lock.locks[i]->locked)
- continue;
-
- lock_frame->root->lk_owner = local->lock.locks[i]->lk_owner;
- STACK_WIND_COOKIE (lock_frame, dht_unlock_inodelk_cbk,
- (void *)(long)i,
- local->lock.locks[i]->xl,
- local->lock.locks[i]->xl->fops->inodelk,
- local->lock.locks[i]->domain,
- &local->lock.locks[i]->loc, F_SETLK,
- &flock, NULL);
- if (!--call_cnt)
- break;
+ ret = syncop_lookup(this, &loc, &iatt, NULL, NULL, NULL);
+ if (ret) {
+ gf_smsg(this->name, GF_LOG_INFO, -ret, DHT_MSG_DIR_SELFHEAL_FAILED,
+ "path=%s", path, "subvolume=%s", this->name, "bname=%s",
+ bname, NULL);
+ goto out;
}
- return 0;
-
-done:
- if (lock_frame)
- dht_lock_stack_destroy (lock_frame);
-
- /* no locks acquired, invoke inodelk_cbk */
- if (ret == 0)
- inodelk_cbk (frame, NULL, frame->this, 0, 0, NULL);
-
- return ret;
-}
-
-int32_t
-dht_nonblocking_inodelk_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
- int32_t op_ret, int32_t op_errno, dict_t *xdata)
-{
- dht_local_t *local = NULL;
- int lk_index = 0, call_cnt = 0;
- char gfid[GF_UUID_BUF_SIZE] = {0};
-
- local = frame->local;
- lk_index = (long) cookie;
+ linked_inode = inode_link(loc.inode, loc.parent, bname, &iatt);
+ if (!linked_inode)
+ goto out;
- if (op_ret == -1) {
- local->lock.op_ret = -1;
- local->lock.op_errno = op_errno;
-
- if (local && local->lock.locks[lk_index]) {
- uuid_utoa_r (local->lock.locks[lk_index]->loc.inode->gfid,
- gfid);
-
- gf_msg_debug (this->name, op_errno,
- "inodelk failed on gfid: %s "
- "subvolume: %s", gfid,
- local->lock.locks[lk_index]->xl->name);
- }
-
- goto out;
- }
-
- local->lock.locks[lk_index]->locked = _gf_true;
+ loc_wipe(&loc);
+ gf_uuid_copy(loc.pargfid, linked_inode->gfid);
+ loc.inode = NULL;
+ bname = strtok_r(NULL, "/", &save_ptr);
+ if (bname)
+ loc.parent = linked_inode;
+ }
out:
- call_cnt = dht_frame_return (frame);
- if (is_last_call (call_cnt)) {
- if (local->lock.op_ret < 0) {
- dht_inodelk_cleanup (frame);
- return 0;
- }
+ inode_ref(linked_inode);
+ loc_wipe(&loc);
+ GF_FREE(tmp_path);
- dht_inodelk_done (frame);
- }
-
- return 0;
+ return linked_inode;
}
int
-dht_nonblocking_inodelk (call_frame_t *frame, dht_lock_t **lk_array,
- int lk_count, fop_inodelk_cbk_t inodelk_cbk)
-{
- struct gf_flock flock = {0,};
- int i = 0, ret = 0;
- dht_local_t *local = NULL;
- call_frame_t *lock_frame = NULL;
-
- GF_VALIDATE_OR_GOTO ("dht-locks", frame, out);
- GF_VALIDATE_OR_GOTO (frame->this->name, lk_array, out);
- GF_VALIDATE_OR_GOTO (frame->this->name, inodelk_cbk, out);
-
- lock_frame = dht_lock_frame (frame);
- if (lock_frame == NULL)
- goto out;
-
- ret = dht_local_lock_init (lock_frame, lk_array, lk_count, inodelk_cbk);
- if (ret < 0) {
- goto out;
- }
-
- dht_set_lkowner (lk_array, lk_count, &lock_frame->root->lk_owner);
-
- local = lock_frame->local;
- local->main_frame = frame;
-
- local->call_cnt = lk_count;
-
- for (i = 0; i < lk_count; i++) {
- flock.l_type = local->lock.locks[i]->type;
-
- STACK_WIND_COOKIE (lock_frame, dht_nonblocking_inodelk_cbk,
- (void *) (long) i,
- local->lock.locks[i]->xl,
- local->lock.locks[i]->xl->fops->inodelk,
- local->lock.locks[i]->domain,
- &local->lock.locks[i]->loc, F_SETLK,
- &flock, NULL);
- }
-
- return 0;
-
-out:
- if (lock_frame)
- dht_lock_stack_destroy (lock_frame);
-
- return -1;
-}
-
-int32_t
-dht_blocking_inodelk_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
- int32_t op_ret, int32_t op_errno, dict_t *xdata)
-{
- int lk_index = 0;
- dht_local_t *local = NULL;
-
- lk_index = (long) cookie;
-
- local = frame->local;
-
- if (op_ret == 0) {
- local->lock.locks[lk_index]->locked = _gf_true;
+dht_heal_full_path(void *data)
+{
+ call_frame_t *heal_frame = data;
+ dht_local_t *local = NULL;
+ loc_t loc = {
+ 0,
+ };
+ dict_t *dict = NULL;
+ char *path = NULL;
+ int ret = -1;
+ xlator_t *source = NULL;
+ xlator_t *this = NULL;
+ inode_table_t *itable = NULL;
+ inode_t *inode = NULL;
+ inode_t *tmp_inode = NULL;
+
+ GF_VALIDATE_OR_GOTO("DHT", heal_frame, out);
+
+ local = heal_frame->local;
+ this = heal_frame->this;
+ source = heal_frame->cookie;
+ heal_frame->cookie = NULL;
+ gf_uuid_copy(loc.gfid, local->gfid);
+
+ if (local->loc.inode)
+ loc.inode = inode_ref(local->loc.inode);
+ else
+ goto out;
+
+ itable = loc.inode->table;
+ ret = syncop_getxattr(source, &loc, &dict, GET_ANCESTRY_PATH_KEY, NULL,
+ NULL);
+ if (ret) {
+ gf_smsg(this->name, GF_LOG_INFO, -ret, DHT_MSG_DIR_HEAL_ABORT,
+ "subvol=%s", source->name, NULL);
+ goto out;
+ }
+
+ ret = dict_get_str(dict, GET_ANCESTRY_PATH_KEY, &path);
+ if (path) {
+ inode = dht_heal_path(this, path, itable);
+ if (inode && inode != local->inode) {
+ /*
+ * if inode returned by heal function is different
+ * from what we passed, which means a racing thread
+ * already linked a different inode for dentry.
+ * So we will update our local->inode, so that we can
+ * retrurn proper inode.
+ */
+ tmp_inode = local->inode;
+ local->inode = inode;
+ inode_unref(tmp_inode);
+ tmp_inode = NULL;
} else {
- local->lock.op_ret = -1;
- local->lock.op_errno = op_errno;
- goto cleanup;
+ inode_unref(inode);
}
+ }
- if (lk_index == (local->lock.lk_count - 1)) {
- dht_inodelk_done (frame);
- } else {
- dht_blocking_inodelk_rec (frame, ++lk_index);
- }
-
- return 0;
-
-cleanup:
- dht_inodelk_cleanup (frame);
-
- return 0;
-}
-
-void
-dht_blocking_inodelk_rec (call_frame_t *frame, int i)
-{
- dht_local_t *local = NULL;
- struct gf_flock flock = {0,};
-
- local = frame->local;
-
- flock.l_type = local->lock.locks[i]->type;
-
- STACK_WIND_COOKIE (frame, dht_blocking_inodelk_cbk,
- (void *) (long) i,
- local->lock.locks[i]->xl,
- local->lock.locks[i]->xl->fops->inodelk,
- local->lock.locks[i]->domain,
- &local->lock.locks[i]->loc, F_SETLKW, &flock, NULL);
-
- return;
+out:
+ loc_wipe(&loc);
+ if (dict)
+ dict_unref(dict);
+ return 0;
}
int
-dht_lock_request_cmp (const void *val1, const void *val2)
-{
- dht_lock_t *lock1 = NULL;
- dht_lock_t *lock2 = NULL;
- int ret = 0;
-
- lock1 = *(dht_lock_t **)val1;
- lock2 = *(dht_lock_t **)val2;
-
- GF_VALIDATE_OR_GOTO ("dht-locks", lock1, out);
- GF_VALIDATE_OR_GOTO ("dht-locks", lock2, out);
-
- ret = strcmp (lock1->xl->name, lock2->xl->name);
-
- if (ret == 0) {
- ret = gf_uuid_compare (lock1->loc.gfid, lock2->loc.gfid);
+dht_heal_full_path_done(int op_ret, call_frame_t *heal_frame, void *data)
+{
+ call_frame_t *main_frame = NULL;
+ dht_local_t *local = NULL;
+ xlator_t *this = NULL;
+ int ret = -1;
+ int op_errno = 0;
+
+ local = heal_frame->local;
+ main_frame = local->main_frame;
+ local->main_frame = NULL;
+ this = heal_frame->this;
+
+ dht_set_fixed_dir_stat(&local->postparent);
+ if (local->need_xattr_heal) {
+ local->need_xattr_heal = 0;
+ ret = dht_dir_xattr_heal(this, local, &op_errno);
+ if (ret) {
+ gf_smsg(this->name, GF_LOG_ERROR, op_errno,
+ DHT_MSG_DIR_XATTR_HEAL_FAILED, "path=%s", local->loc.path,
+ NULL);
}
+ }
-out:
- return ret;
+ DHT_STACK_UNWIND(lookup, main_frame, 0, 0, local->inode, &local->stbuf,
+ local->xattr, &local->postparent);
+
+ DHT_STACK_DESTROY(heal_frame);
+ return 0;
}
+/* This function must be called inside an inode lock */
int
-dht_lock_order_requests (dht_lock_t **locks, int count)
+__dht_lock_subvol_set(inode_t *inode, xlator_t *this, xlator_t *lock_subvol)
{
- int ret = -1;
+ dht_inode_ctx_t *ctx = NULL;
+ int ret = -1;
+ uint64_t value = 0;
- if (!locks || !count)
- goto out;
+ GF_VALIDATE_OR_GOTO(this->name, inode, out);
- qsort (locks, count, sizeof (*locks), dht_lock_request_cmp);
- ret = 0;
+ ret = __inode_ctx_get0(inode, this, &value);
+ if (ret || !value) {
+ return -1;
+ }
+ ctx = (dht_inode_ctx_t *)(uintptr_t)value;
+ ctx->lock_subvol = lock_subvol;
out:
- return ret;
+ return ret;
}
-int
-dht_blocking_inodelk (call_frame_t *frame, dht_lock_t **lk_array,
- int lk_count, fop_inodelk_cbk_t inodelk_cbk)
+xlator_t *
+dht_get_lock_subvolume(xlator_t *this, struct gf_flock *lock,
+ dht_local_t *local)
{
- int ret = -1;
- call_frame_t *lock_frame = NULL;
- dht_local_t *local = NULL;
-
- GF_VALIDATE_OR_GOTO ("dht-locks", frame, out);
- GF_VALIDATE_OR_GOTO (frame->this->name, lk_array, out);
- GF_VALIDATE_OR_GOTO (frame->this->name, inodelk_cbk, out);
-
- lock_frame = dht_lock_frame (frame);
- if (lock_frame == NULL)
- goto out;
-
- ret = dht_local_lock_init (lock_frame, lk_array, lk_count, inodelk_cbk);
- if (ret < 0) {
- goto out;
- }
-
- dht_set_lkowner (lk_array, lk_count, &lock_frame->root->lk_owner);
-
- local = lock_frame->local;
- local->main_frame = frame;
+ xlator_t *subvol = NULL;
+ inode_t *inode = NULL;
+ int32_t ret = -1;
+ uint64_t value = 0;
+ xlator_t *cached_subvol = NULL;
+ dht_inode_ctx_t *ctx = NULL;
+ char gfid[GF_UUID_BUF_SIZE] = {0};
- dht_blocking_inodelk_rec (lock_frame, 0);
+ GF_VALIDATE_OR_GOTO(this->name, lock, out);
+ GF_VALIDATE_OR_GOTO(this->name, local, out);
- return 0;
-out:
- if (lock_frame)
- dht_lock_stack_destroy (lock_frame);
+ cached_subvol = local->cached_subvol;
- return -1;
-}
-inode_t*
-dht_heal_path (xlator_t *this, char *path, inode_table_t *itable)
-{
- int ret = -1;
- struct iatt iatt = {0, };
- inode_t *linked_inode = NULL;
- loc_t loc = {0, };
- char *bname = NULL;
- char *save_ptr = NULL;
- uuid_t gfid = {0, };
- char *tmp_path = NULL;
-
-
- tmp_path = gf_strdup (path);
- if (!tmp_path) {
- goto out;
- }
-
- memset (gfid, 0, 16);
- gfid[15] = 1;
-
- gf_uuid_copy (loc.pargfid, gfid);
- loc.parent = inode_ref (itable->root);
+ if (local->loc.inode || local->fd) {
+ inode = local->loc.inode ? local->loc.inode : local->fd->inode;
+ }
- bname = strtok_r (tmp_path, "/", &save_ptr);
+ if (!inode)
+ goto out;
- /* sending a lookup on parent directory,
- * Eg: if path is like /a/b/c/d/e/f/g/
- * then we will send a lookup on a first and then b,c,d,etc
+ if (!(IA_ISDIR(inode->ia_type) || IA_ISINVAL(inode->ia_type))) {
+ /*
+ * We may get non-linked inode for directories as part
+ * of the selfheal code path. So checking for IA_INVAL
+ * type also. This will only happen for directory.
*/
+ subvol = local->cached_subvol;
+ goto out;
+ }
- while (bname) {
- linked_inode = NULL;
- loc.inode = inode_grep (itable, loc.parent, bname);
- if (loc.inode == NULL) {
- loc.inode = inode_new (itable);
- if (loc.inode == NULL) {
- ret = -ENOMEM;
- goto out;
- }
- } else {
- /*
- * Inode is already populated in the inode table.
- * Which means we already looked up the inde and
- * linked with a dentry. So that we will skip
- * lookup on this entry, and proceed to next.
- */
- bname = strtok_r (NULL, "/", &save_ptr);
- inode_unref (loc.parent);
- loc.parent = loc.inode;
- gf_uuid_copy (loc.pargfid, loc.inode->gfid);
- loc.inode = NULL;
- continue;
- }
-
- loc.name = bname;
- ret = loc_path (&loc, bname);
-
- ret = syncop_lookup (this, &loc, &iatt, NULL, NULL, NULL);
- if (ret) {
- gf_msg (this->name, GF_LOG_INFO, -ret,
- DHT_MSG_DIR_SELFHEAL_FAILED,
- "Healing of path %s failed on subvolume %s for "
- "directory %s", path, this->name, bname);
- goto out;
- }
-
- linked_inode = inode_link (loc.inode, loc.parent, bname, &iatt);
- if (!linked_inode)
- goto out;
-
- loc_wipe (&loc);
- gf_uuid_copy (loc.pargfid, linked_inode->gfid);
- loc.inode = NULL;
- loc.parent = linked_inode;
-
- bname = strtok_r (NULL, "/", &save_ptr);
- }
+ if (lock->l_type != F_UNLCK) {
+ /*
+ * inode purging might happen on NFS between a lk
+ * and unlk. Due to this lk and unlk might be sent
+ * to different subvols.
+ * So during a lock request, taking a ref on inode
+ * to prevent inode purging. inode unref will happen
+ * in unlock cbk code path.
+ */
+ inode_ref(inode);
+ }
+
+ LOCK(&inode->lock);
+ ret = __inode_ctx_get0(inode, this, &value);
+ if (!ret && value) {
+ ctx = (dht_inode_ctx_t *)(uintptr_t)value;
+ subvol = ctx->lock_subvol;
+ }
+ if (!subvol && lock->l_type != F_UNLCK && cached_subvol) {
+ ret = __dht_lock_subvol_set(inode, this, cached_subvol);
+ if (ret) {
+ gf_uuid_unparse(inode->gfid, gfid);
+ UNLOCK(&inode->lock);
+ gf_smsg(this->name, GF_LOG_WARNING, 0, DHT_MSG_SET_INODE_CTX_FAILED,
+ "lock_subvol gfid=%s", gfid, NULL);
+ goto post_unlock;
+ }
+ subvol = cached_subvol;
+ }
+ UNLOCK(&inode->lock);
+post_unlock:
+ if (!subvol && inode && lock->l_type != F_UNLCK) {
+ inode_unref(inode);
+ }
out:
- inode_ref (linked_inode);
- loc_wipe (&loc);
- GF_FREE (tmp_path);
-
- return linked_inode;
+ return subvol;
}
-
int
-dht_heal_full_path (void *data)
+dht_lk_inode_unref(call_frame_t *frame, int32_t op_ret)
{
- call_frame_t *heal_frame = data;
- dht_local_t *local = NULL;
- loc_t loc = {0, };
- dict_t *dict = NULL;
- char *path = NULL;
- int ret = -1;
- xlator_t *source = NULL;
- xlator_t *this = NULL;
- inode_table_t *itable = NULL;
- inode_t *inode = NULL;
- inode_t *tmp_inode = NULL;
-
- GF_VALIDATE_OR_GOTO ("DHT", heal_frame, out);
-
- local = heal_frame->local;
- this = heal_frame->this;
- source = heal_frame->cookie;
- heal_frame->cookie = NULL;
- gf_uuid_copy (loc.gfid, local->gfid);
-
- if (local->loc.inode)
- loc.inode = inode_ref (local->loc.inode);
- else
- goto out;
-
- itable = loc.inode->table;
- ret = syncop_getxattr (source, &loc, &dict,
- GET_ANCESTRY_PATH_KEY, NULL, NULL);
- if (ret) {
- gf_msg (this->name, GF_LOG_INFO, -ret,
- DHT_MSG_DIR_SELFHEAL_FAILED,
- "Failed to get path from subvol %s. Aborting "
- "directory healing.", source->name);
- goto out;
- }
+ int ret = -1;
+ dht_local_t *local = NULL;
+ inode_t *inode = NULL;
+ xlator_t *this = NULL;
+ char gfid[GF_UUID_BUF_SIZE] = {0};
- ret = dict_get_str (dict, GET_ANCESTRY_PATH_KEY, &path);
- if (path) {
- inode = dht_heal_path (this, path, itable);
- if (inode && inode != local->inode) {
- /*
- * if inode returned by heal function is different
- * from what we passed, which means a racing thread
- * already linked a different inode for dentry.
- * So we will update our local->inode, so that we can
- * retrurn proper inode.
- */
- tmp_inode = local->inode;
- local->inode = inode;
- inode_unref (tmp_inode);
- tmp_inode = NULL;
- } else {
- inode_unref (inode);
- }
- }
+ local = frame->local;
+ this = frame->this;
+ if (local->loc.inode || local->fd) {
+ inode = local->loc.inode ? local->loc.inode : local->fd->inode;
+ }
+ if (!inode) {
+ gf_smsg(this->name, GF_LOG_WARNING, 0, DHT_MSG_LOCK_INODE_UNREF_FAILED,
+ NULL);
+ goto out;
+ }
+
+ if (!(IA_ISDIR(inode->ia_type) || IA_ISINVAL(inode->ia_type))) {
+ ret = 0;
+ goto out;
+ }
+
+ switch (local->lock_type) {
+ case F_RDLCK:
+ case F_WRLCK:
+ if (op_ret) {
+ gf_uuid_unparse(inode->gfid, gfid);
+ gf_msg_debug(this->name, 0, "lock request failed for gfid %s",
+ gfid);
+ inode_unref(inode);
+ goto out;
+ }
+ break;
+
+ case F_UNLCK:
+ if (!op_ret) {
+ inode_unref(inode);
+ } else {
+ gf_uuid_unparse(inode->gfid, gfid);
+ gf_smsg(this->name, GF_LOG_WARNING, 0,
+ DHT_MSG_LOCK_INODE_UNREF_FAILED, "gfid=%s", gfid, NULL);
+ goto out;
+ }
+ default:
+ break;
+ }
+ ret = 0;
out:
- loc_wipe (&loc);
- if (dict)
- dict_unref (dict);
- return 0;
+ return ret;
}
-int
-dht_heal_full_path_done (int op_ret, call_frame_t *heal_frame, void *data)
-{
-
- call_frame_t *main_frame = NULL;
- dht_local_t *local = NULL;
-
- local = heal_frame->local;
- main_frame = local->main_frame;
- local->main_frame = NULL;
-
- DHT_STACK_UNWIND (lookup, main_frame, 0, 0,
- local->inode, &local->stbuf, local->xattr,
- &local->postparent);
-
- DHT_STACK_DESTROY (heal_frame);
- return 0;
+/* Code to update custom extended attributes from src dict to dst dict
+ */
+void
+dht_dir_set_heal_xattr(xlator_t *this, dht_local_t *local, dict_t *dst,
+ dict_t *src, int *uret, int *uflag)
+{
+ int ret = -1;
+ data_t *keyval = NULL;
+ int luret = -1;
+ int luflag = -1;
+ int i = 0;
+ char **xattrs_to_heal;
+
+ if (!src || !dst) {
+ gf_smsg(this->name, GF_LOG_WARNING, EINVAL, DHT_MSG_DST_NULL_SET_FAILED,
+ "path=%s", local->loc.path, NULL);
+ return;
+ }
+ /* Check if any user xattr present in src dict and set
+ it to dst dict
+ */
+ luret = dict_foreach_fnmatch(src, "user.*", dht_set_user_xattr, dst);
+ /* Check if any other custom xattr present in src dict
+ and set it to dst dict, here index start from 1 because
+ user xattr already checked in previous statement
+ */
+
+ xattrs_to_heal = get_xattrs_to_heal();
+
+ for (i = 1; xattrs_to_heal[i]; i++) {
+ keyval = dict_get(src, xattrs_to_heal[i]);
+ if (keyval) {
+ luflag = 1;
+ ret = dict_set(dst, xattrs_to_heal[i], keyval);
+ if (ret)
+ gf_smsg(this->name, GF_LOG_WARNING, ENOMEM,
+ DHT_MSG_DICT_SET_FAILED, "key=%s", xattrs_to_heal[i],
+ "path=%s", local->loc.path, NULL);
+ keyval = NULL;
+ }
+ }
+ if (uret)
+ (*uret) = luret;
+ if (uflag)
+ (*uflag) = luflag;
}
diff --git a/xlators/cluster/dht/src/dht-inode-read.c b/xlators/cluster/dht/src/dht-inode-read.c
index d40ac7d4c61..dbb8070b0da 100644
--- a/xlators/cluster/dht/src/dht-inode-read.c
+++ b/xlators/cluster/dht/src/dht-inode-read.c
@@ -10,1237 +10,1649 @@
#include "dht-common.h"
-int dht_access2 (xlator_t *this, xlator_t *dst_node,
- call_frame_t *frame, int ret);
-int dht_readv2 (xlator_t *this, xlator_t *dst_node,
- call_frame_t *frame, int ret);
-int dht_attr2 (xlator_t *this, xlator_t *dst_node,
- call_frame_t *frame, int ret);
-int dht_open2 (xlator_t *this, xlator_t *dst_node,
- call_frame_t *frame, int ret);
-int dht_flush2 (xlator_t *this, xlator_t *dst_node,
- call_frame_t *frame, int ret);
-int dht_lk2 (xlator_t *this, xlator_t *dst_node,
- call_frame_t *frame, int ret);
-int dht_fsync2 (xlator_t *this, xlator_t *dst_node,
- call_frame_t *frame, int ret);
+static int
+dht_access2(xlator_t *this, xlator_t *dst_node, call_frame_t *frame, int ret);
+static int
+dht_readv2(xlator_t *this, xlator_t *dst_node, call_frame_t *frame, int ret);
+static int
+dht_attr2(xlator_t *this, xlator_t *dst_node, call_frame_t *frame, int ret);
+static int
+dht_open2(xlator_t *this, xlator_t *dst_node, call_frame_t *frame, int ret);
+static int
+dht_flush2(xlator_t *this, xlator_t *dst_node, call_frame_t *frame, int ret);
+static int
+dht_lk2(xlator_t *this, xlator_t *dst_node, call_frame_t *frame, int ret);
+static int
+dht_fsync2(xlator_t *this, xlator_t *dst_node, call_frame_t *frame, int ret);
+static int
+dht_common_xattrop2(xlator_t *this, xlator_t *subvol, call_frame_t *frame,
+ int ret);
+
+static int
+dht_open_cbk(call_frame_t *frame, void *cookie, xlator_t *this, int op_ret,
+ int op_errno, fd_t *fd, dict_t *xdata)
+{
+ dht_local_t *local = NULL;
+ xlator_t *prev = NULL;
+ int ret = 0;
+
+ local = frame->local;
+ prev = cookie;
+
+ local->op_errno = op_errno;
+ if ((op_ret == -1) && !dht_inode_missing(op_errno)) {
+ gf_msg_debug(this->name, op_errno, "subvolume %s returned -1",
+ prev->name);
+ goto out;
+ }
+
+ /* Update ctx if the fd has been opened on the target*/
+ if (!op_ret && (local->call_cnt == 1)) {
+ dht_fd_ctx_set(this, fd, prev);
+ goto out;
+ }
+
+ if (!op_ret || (local->call_cnt != 1))
+ goto out;
+
+ /* rebalance would have happened */
+ local->rebalance.target_op_fn = dht_open2;
+ ret = dht_rebalance_complete_check(this, frame);
+ if (!ret)
+ return 0;
+out:
+ DHT_STACK_UNWIND(open, frame, op_ret, op_errno, local->fd, xdata);
+ return 0;
+}
-int
-dht_open_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
- int op_ret, int op_errno, fd_t *fd, dict_t *xdata)
+static int
+dht_open2(xlator_t *this, xlator_t *subvol, call_frame_t *frame, int ret)
{
- dht_local_t *local = NULL;
- call_frame_t *prev = NULL;
- int ret = 0;
+ dht_local_t *local = NULL;
+ int op_errno = EINVAL;
- local = frame->local;
- prev = cookie;
+ if (!frame || !frame->local)
+ goto out;
- local->op_errno = op_errno;
- if ((op_ret == -1) && !dht_inode_missing(op_errno)) {
- gf_msg_debug (this->name, op_errno,
- "subvolume %s returned -1",
- prev->this->name);
- goto out;
- }
+ local = frame->local;
+ op_errno = local->op_errno;
- if (!op_ret || (local->call_cnt != 1))
- goto out;
+ if (we_are_not_migrating(ret)) {
+ /* This DHT layer is not migrating the file */
+ DHT_STACK_UNWIND(open, frame, -1, local->op_errno, NULL,
+ local->rebalance.xdata);
+ return 0;
+ }
- /* rebalance would have happened */
- local->rebalance.target_op_fn = dht_open2;
- ret = dht_rebalance_complete_check (this, frame);
- if (!ret)
- return 0;
+ if (subvol == NULL)
+ goto out;
-out:
- DHT_STACK_UNWIND (open, frame, op_ret, op_errno, local->fd, xdata);
+ local->call_cnt = 2;
- return 0;
+ STACK_WIND_COOKIE(frame, dht_open_cbk, subvol, subvol, subvol->fops->open,
+ &local->loc, local->rebalance.flags, local->fd,
+ local->xattr_req);
+ return 0;
+
+out:
+ DHT_STACK_UNWIND(open, frame, -1, op_errno, NULL, NULL);
+ return 0;
}
int
-dht_open2 (xlator_t *this, xlator_t *subvol, call_frame_t *frame, int ret)
+dht_open(call_frame_t *frame, xlator_t *this, loc_t *loc, int flags, fd_t *fd,
+ dict_t *xdata)
{
- dht_local_t *local = NULL;
- int op_errno = EINVAL;
+ xlator_t *subvol = NULL;
+ int op_errno = -1;
+ dht_local_t *local = NULL;
- if (!frame || !frame->local)
- goto out;
+ VALIDATE_OR_GOTO(frame, err);
+ VALIDATE_OR_GOTO(this, err);
+ VALIDATE_OR_GOTO(fd, err);
- local = frame->local;
- op_errno = ENOENT;
+ local = dht_local_init(frame, loc, fd, GF_FOP_OPEN);
+ if (!local) {
+ op_errno = ENOMEM;
+ goto err;
+ }
- if (we_are_not_migrating (ret)) {
- /* This DHT layer is not migrating the file */
- DHT_STACK_UNWIND (open, frame, -1, local->op_errno,
- NULL, NULL);
- return 0;
+ subvol = local->cached_subvol;
+ if (!subvol) {
+ gf_msg_debug(this->name, 0, "no cached subvolume for fd=%p", fd);
+ op_errno = EINVAL;
+ goto err;
+ }
+ if (xdata)
+ local->xattr_req = dict_ref(xdata);
- }
+ local->rebalance.flags = flags;
+ local->call_cnt = 1;
- if (subvol == NULL)
- goto out;
+ STACK_WIND_COOKIE(frame, dht_open_cbk, subvol, subvol, subvol->fops->open,
+ loc, flags, fd, xdata);
- local->call_cnt = 2;
+ return 0;
- STACK_WIND (frame, dht_open_cbk, subvol, subvol->fops->open,
- &local->loc, local->rebalance.flags, local->fd,
- NULL);
- return 0;
+err:
+ op_errno = (op_errno == -1) ? errno : op_errno;
+ DHT_STACK_UNWIND(open, frame, -1, op_errno, NULL, NULL);
-out:
- DHT_STACK_UNWIND (open, frame, -1, op_errno, NULL, NULL);
- return 0;
+ return 0;
}
-
int
-dht_open (call_frame_t *frame, xlator_t *this,
- loc_t *loc, int flags, fd_t *fd, dict_t *xdata)
+dht_file_attr_cbk(call_frame_t *frame, void *cookie, xlator_t *this, int op_ret,
+ int op_errno, struct iatt *stbuf, dict_t *xdata)
{
- xlator_t *subvol = NULL;
- int op_errno = -1;
- dht_local_t *local = NULL;
-
- VALIDATE_OR_GOTO (frame, err);
- VALIDATE_OR_GOTO (this, err);
- VALIDATE_OR_GOTO (fd, err);
-
- local = dht_local_init (frame, loc, fd, GF_FOP_OPEN);
- if (!local) {
- op_errno = ENOMEM;
- goto err;
- }
+ xlator_t *subvol1 = 0;
+ xlator_t *subvol2 = 0;
+ dht_local_t *local = NULL;
+ xlator_t *prev = NULL;
+ int ret = -1;
+ inode_t *inode = NULL;
+
+ GF_VALIDATE_OR_GOTO("dht", frame, err);
+ GF_VALIDATE_OR_GOTO("dht", this, out);
+ GF_VALIDATE_OR_GOTO("dht", frame->local, out);
+ GF_VALIDATE_OR_GOTO("dht", cookie, out);
+
+ local = frame->local;
+ prev = cookie;
+
+ if ((local->fop == GF_FOP_FSTAT) &&
+ dht_check_remote_fd_failed_error(local, op_ret, op_errno)) {
+ ret = dht_check_and_open_fd_on_subvol(this, frame);
+ if (ret)
+ goto out;
+ return 0;
+ }
- subvol = local->cached_subvol;
- if (!subvol) {
- gf_msg_debug (this->name, 0,
- "no cached subvolume for fd=%p", fd);
- op_errno = EINVAL;
- goto err;
+ if ((op_ret == -1) && !dht_inode_missing(op_errno)) {
+ local->op_errno = op_errno;
+ gf_msg_debug(this->name, op_errno, "subvolume %s returned -1",
+ prev->name);
+ goto out;
+ }
+
+ if (local->call_cnt != 1)
+ goto out;
+
+ local->op_errno = op_errno;
+ local->op_ret = op_ret;
+
+ /* Check if the rebalance phase2 is true */
+ if ((op_ret == -1) || IS_DHT_MIGRATION_PHASE2(stbuf)) {
+ local->rebalance.target_op_fn = dht_attr2;
+ dht_set_local_rebalance(this, local, NULL, NULL, stbuf, xdata);
+ inode = (local->fd) ? local->fd->inode : local->loc.inode;
+
+ dht_inode_ctx_get_mig_info(this, inode, &subvol1, &subvol2);
+ if (dht_mig_info_is_invalid(local->cached_subvol, subvol1, subvol2)) {
+ /* Phase 2 of migration */
+ ret = dht_rebalance_complete_check(this, frame);
+ if (!ret)
+ return 0;
+ } else {
+ /* it is a non-fd op or it is an fd based Fop and
+ opened on the dst.*/
+ if (local->fd && !dht_fd_open_on_dst(this, local->fd, subvol2)) {
+ ret = dht_rebalance_complete_check(this, frame);
+ if (!ret)
+ return 0;
+ } else {
+ dht_attr2(this, subvol2, frame, 0);
+ return 0;
+ }
}
+ }
- local->rebalance.flags = flags;
- local->call_cnt = 1;
-
- STACK_WIND (frame, dht_open_cbk, subvol, subvol->fops->open,
- loc, flags, fd, xdata);
+out:
+ DHT_STRIP_PHASE1_FLAGS(stbuf);
+ DHT_STACK_UNWIND(stat, frame, op_ret, op_errno, stbuf, xdata);
+err:
+ return 0;
+}
+static int
+dht_attr2(xlator_t *this, xlator_t *subvol, call_frame_t *frame, int ret)
+{
+ dht_local_t *local = NULL;
+ int op_errno = EINVAL;
+
+ local = frame->local;
+ if (!local)
+ goto out;
+
+ op_errno = local->op_errno;
+
+ if (we_are_not_migrating(ret)) {
+ /* This dht xlator is not migrating the file. Unwind and
+ * pass on the original mode bits so the higher DHT layer
+ * can handle this.
+ */
+ DHT_STACK_UNWIND(stat, frame, local->op_ret, op_errno,
+ &local->rebalance.postbuf, local->rebalance.xdata);
return 0;
+ }
-err:
- op_errno = (op_errno == -1) ? errno : op_errno;
- DHT_STACK_UNWIND (open, frame, -1, op_errno, NULL, NULL);
+ if (subvol == NULL)
+ goto out;
- return 0;
+ local->call_cnt = 2;
+
+ if (local->fop == GF_FOP_FSTAT) {
+ STACK_WIND_COOKIE(frame, dht_file_attr_cbk, subvol, subvol,
+ subvol->fops->fstat, local->fd, local->xattr_req);
+ } else {
+ STACK_WIND_COOKIE(frame, dht_file_attr_cbk, subvol, subvol,
+ subvol->fops->stat, &local->loc, local->xattr_req);
+ }
+
+ return 0;
+
+out:
+ DHT_STACK_UNWIND(stat, frame, -1, op_errno, NULL, NULL);
+ return 0;
}
-int
-dht_file_attr_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
- int op_ret, int op_errno, struct iatt *stbuf, dict_t *xdata)
+static int
+dht_attr_cbk(call_frame_t *frame, void *cookie, xlator_t *this, int op_ret,
+ int op_errno, struct iatt *stbuf, dict_t *xdata)
{
- xlator_t *subvol1 = 0;
- xlator_t *subvol2 = 0;
- dht_local_t *local = NULL;
- call_frame_t *prev = NULL;
- int ret = -1;
- inode_t *inode = NULL;
-
- GF_VALIDATE_OR_GOTO ("dht", frame, err);
- GF_VALIDATE_OR_GOTO ("dht", this, out);
- GF_VALIDATE_OR_GOTO ("dht", frame->local, out);
- GF_VALIDATE_OR_GOTO ("dht", cookie, out);
-
- local = frame->local;
- prev = cookie;
-
- if ((op_ret == -1) && !dht_inode_missing(op_errno)) {
- local->op_errno = op_errno;
- gf_msg_debug (this->name, op_errno,
- "subvolume %s returned -1",
- prev->this->name);
- goto out;
+ dht_local_t *local = NULL;
+ int this_call_cnt = 0;
+ xlator_t *prev = NULL;
+ local = frame->local;
+ prev = cookie;
+
+ LOCK(&frame->lock);
+ {
+ if (op_ret == -1) {
+ local->op_errno = op_errno;
+ UNLOCK(&frame->lock);
+ gf_msg_debug(this->name, op_errno, "subvolume %s returned -1",
+ prev->name);
+
+ goto post_unlock;
}
- if (local->call_cnt != 1)
- goto out;
-
- local->op_errno = op_errno;
- local->op_ret = op_ret;
+ dht_iatt_merge(this, &local->stbuf, stbuf);
- /* Check if the rebalance phase2 is true */
- if ((op_ret == -1) || IS_DHT_MIGRATION_PHASE2 (stbuf)) {
-
- local->rebalance.target_op_fn = dht_attr2;
- dht_set_local_rebalance (this, local, NULL, NULL,
- stbuf, xdata);
- inode = (local->fd) ? local->fd->inode : local->loc.inode;
-
- dht_inode_ctx_get_mig_info (this, inode, &subvol1, &subvol2);
- if (dht_mig_info_is_invalid (local->cached_subvol,
- subvol1, subvol2)){
- /* Phase 2 of migration */
- ret = dht_rebalance_complete_check (this, frame);
- if (!ret)
- return 0;
- } else {
- /* it is a non-fd op or it is an fd based Fop and
- opened on the dst.*/
- if (local->fd &&
- !dht_fd_open_on_dst (this, local->fd, subvol2)) {
- ret = dht_rebalance_complete_check (this, frame);
- if (!ret)
- return 0;
- } else {
- dht_attr2 (this, subvol2, frame, 0);
- return 0;
- }
- }
- }
+ local->op_ret = 0;
+ }
+ UNLOCK(&frame->lock);
+post_unlock:
+ this_call_cnt = dht_frame_return(frame);
+ if (is_last_call(this_call_cnt)) {
+ DHT_STACK_UNWIND(stat, frame, local->op_ret, local->op_errno,
+ &local->stbuf, xdata);
+ }
-out:
- DHT_STRIP_PHASE1_FLAGS (stbuf);
- DHT_STACK_UNWIND (stat, frame, op_ret, op_errno, stbuf, xdata);
-err:
- return 0;
+ return 0;
}
int
-dht_attr2 (xlator_t *this, xlator_t *subvol, call_frame_t *frame, int ret)
+dht_stat(call_frame_t *frame, xlator_t *this, loc_t *loc, dict_t *xdata)
{
- dht_local_t *local = NULL;
- int op_errno = EINVAL;
-
- local = frame->local;
- if (!local)
- goto out;
-
- op_errno = local->op_errno;
-
- if (we_are_not_migrating (ret)) {
- /* This dht xlator is not migrating the file. Unwind and
- * pass on the original mode bits so the higher DHT layer
- * can handle this.
- */
- DHT_STACK_UNWIND (stat, frame, local->op_ret, op_errno,
- &local->rebalance.postbuf,
- local->rebalance.xdata);
- return 0;
- }
+ xlator_t *subvol = NULL;
+ int op_errno = -1;
+ dht_local_t *local = NULL;
+ dht_layout_t *layout = NULL;
+ int i = 0;
+ int call_cnt = 0;
+
+ VALIDATE_OR_GOTO(frame, err);
+ VALIDATE_OR_GOTO(this, err);
+ VALIDATE_OR_GOTO(loc, err);
+ VALIDATE_OR_GOTO(loc->inode, err);
+ VALIDATE_OR_GOTO(loc->path, err);
+
+ local = dht_local_init(frame, loc, NULL, GF_FOP_STAT);
+ if (!local) {
+ op_errno = ENOMEM;
+ goto err;
+ }
+
+ layout = local->layout;
+ if (!layout) {
+ gf_msg_debug(this->name, 0, "no layout for path=%s", loc->path);
+ op_errno = EINVAL;
+ goto err;
+ }
+ if (xdata)
+ local->xattr_req = dict_ref(xdata);
+
+ if (IA_ISREG(loc->inode->ia_type)) {
+ local->call_cnt = 1;
+ subvol = local->cached_subvol;
- if (subvol == NULL)
- goto out;
+ STACK_WIND_COOKIE(frame, dht_file_attr_cbk, subvol, subvol,
+ subvol->fops->stat, loc, xdata);
- local->call_cnt = 2;
+ return 0;
+ }
- if (local->fop == GF_FOP_FSTAT) {
- STACK_WIND (frame, dht_file_attr_cbk, subvol,
- subvol->fops->fstat, local->fd, NULL);
- } else {
- STACK_WIND (frame, dht_file_attr_cbk, subvol,
- subvol->fops->stat, &local->loc, NULL);
- }
+ local->call_cnt = call_cnt = layout->cnt;
- return 0;
+ for (i = 0; i < call_cnt; i++) {
+ subvol = layout->list[i].xlator;
-out:
- DHT_STACK_UNWIND (stat, frame, -1, op_errno, NULL, NULL);
- return 0;
+ STACK_WIND_COOKIE(frame, dht_attr_cbk, subvol, subvol,
+ subvol->fops->stat, loc, xdata);
+ }
+
+ return 0;
+
+err:
+ op_errno = (op_errno == -1) ? errno : op_errno;
+ DHT_STACK_UNWIND(stat, frame, -1, op_errno, NULL, NULL);
+
+ return 0;
}
int
-dht_attr_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
- int op_ret, int op_errno, struct iatt *stbuf, dict_t *xdata)
+dht_fstat(call_frame_t *frame, xlator_t *this, fd_t *fd, dict_t *xdata)
{
- dht_local_t *local = NULL;
- int this_call_cnt = 0;
- call_frame_t *prev = NULL;
+ xlator_t *subvol = NULL;
+ int op_errno = -1;
+ dht_local_t *local = NULL;
+ dht_layout_t *layout = NULL;
+ int i = 0;
+ int call_cnt = 0;
+
+ VALIDATE_OR_GOTO(frame, err);
+ VALIDATE_OR_GOTO(this, err);
+ VALIDATE_OR_GOTO(fd, err);
+
+ local = dht_local_init(frame, NULL, fd, GF_FOP_FSTAT);
+ if (!local) {
+ op_errno = ENOMEM;
+ goto err;
+ }
+
+ layout = local->layout;
+ if (!layout) {
+ gf_msg(this->name, GF_LOG_ERROR, 0, 0, "no layout for fd=%p", fd);
+ op_errno = EINVAL;
+ goto err;
+ }
+ if (xdata)
+ local->xattr_req = dict_ref(xdata);
+
+ if (IA_ISREG(fd->inode->ia_type)) {
+ local->call_cnt = 1;
- GF_VALIDATE_OR_GOTO ("dht", frame, err);
- GF_VALIDATE_OR_GOTO ("dht", this, out);
- GF_VALIDATE_OR_GOTO ("dht", frame->local, out);
- GF_VALIDATE_OR_GOTO ("dht", cookie, out);
+ subvol = local->cached_subvol;
- local = frame->local;
- prev = cookie;
+ STACK_WIND_COOKIE(frame, dht_file_attr_cbk, subvol, subvol,
+ subvol->fops->fstat, fd, xdata);
+ return 0;
+ }
- LOCK (&frame->lock);
- {
- if (op_ret == -1) {
- local->op_errno = op_errno;
- gf_msg_debug (this->name, op_errno,
- "subvolume %s returned -1",
- prev->this->name);
+ local->call_cnt = call_cnt = layout->cnt;
- goto unlock;
- }
+ for (i = 0; i < call_cnt; i++) {
+ subvol = layout->list[i].xlator;
+ STACK_WIND_COOKIE(frame, dht_attr_cbk, subvol, subvol,
+ subvol->fops->fstat, fd, xdata);
+ }
- dht_iatt_merge (this, &local->stbuf, stbuf, prev->this);
+ return 0;
- local->op_ret = 0;
- }
-unlock:
- UNLOCK (&frame->lock);
-out:
- this_call_cnt = dht_frame_return (frame);
- if (is_last_call (this_call_cnt)) {
- DHT_STACK_UNWIND (stat, frame, local->op_ret, local->op_errno,
- &local->stbuf, xdata);
- }
err:
- return 0;
+ op_errno = (op_errno == -1) ? errno : op_errno;
+ DHT_STACK_UNWIND(fstat, frame, -1, op_errno, NULL, NULL);
+
+ return 0;
}
int
-dht_stat (call_frame_t *frame, xlator_t *this, loc_t *loc, dict_t *xdata)
+dht_readv_cbk(call_frame_t *frame, void *cookie, xlator_t *this, int op_ret,
+ int op_errno, struct iovec *vector, int count, struct iatt *stbuf,
+ struct iobref *iobref, dict_t *xdata)
{
- xlator_t *subvol = NULL;
- int op_errno = -1;
- dht_local_t *local = NULL;
- dht_layout_t *layout = NULL;
- int i = 0;
- int call_cnt = 0;
-
- VALIDATE_OR_GOTO (frame, err);
- VALIDATE_OR_GOTO (this, err);
- VALIDATE_OR_GOTO (loc, err);
- VALIDATE_OR_GOTO (loc->inode, err);
- VALIDATE_OR_GOTO (loc->path, err);
-
-
- local = dht_local_init (frame, loc, NULL, GF_FOP_STAT);
- if (!local) {
- op_errno = ENOMEM;
- goto err;
- }
-
- layout = local->layout;
- if (!layout) {
- gf_msg_debug (this->name, 0,
- "no layout for path=%s", loc->path);
- op_errno = EINVAL;
- goto err;
- }
-
- if (IA_ISREG (loc->inode->ia_type)) {
- local->call_cnt = 1;
-
- subvol = local->cached_subvol;
+ dht_local_t *local = NULL;
+ int ret = 0;
+ xlator_t *src_subvol = 0;
+ xlator_t *dst_subvol = 0;
+
+ local = frame->local;
+ if (!local) {
+ op_ret = -1;
+ op_errno = EINVAL;
+ goto out;
+ }
+
+ /* This is already second try, no need for re-check */
+ if (local->call_cnt != 1)
+ goto out;
+
+ if (dht_check_remote_fd_failed_error(local, op_ret, op_errno)) {
+ ret = dht_check_and_open_fd_on_subvol(this, frame);
+ if (ret)
+ goto out;
+ return 0;
+ }
- STACK_WIND (frame, dht_file_attr_cbk, subvol,
- subvol->fops->stat, loc, xdata);
+ if ((op_ret == -1) && !dht_inode_missing(op_errno))
+ goto out;
+ local->op_errno = op_errno;
+ if ((op_ret == -1) || IS_DHT_MIGRATION_PHASE2(stbuf)) {
+ local->op_ret = op_ret;
+ local->rebalance.target_op_fn = dht_readv2;
+ dht_set_local_rebalance(this, local, NULL, NULL, stbuf, xdata);
+ /* File would be migrated to other node */
+ ret = dht_inode_ctx_get_mig_info(this, local->fd->inode, &src_subvol,
+ &dst_subvol);
+
+ if (dht_mig_info_is_invalid(local->cached_subvol, src_subvol,
+ dst_subvol) ||
+ !dht_fd_open_on_dst(this, local->fd, dst_subvol)) {
+ ret = dht_rebalance_complete_check(this, frame);
+ if (!ret)
return 0;
+ } else {
+ /* value is already set in fd_ctx, that means no need
+ to check for whether its complete or not. */
+ dht_readv2(this, dst_subvol, frame, 0);
+ return 0;
}
+ }
- local->call_cnt = call_cnt = layout->cnt;
+out:
+ DHT_STRIP_PHASE1_FLAGS(stbuf);
- for (i = 0; i < call_cnt; i++) {
- subvol = layout->list[i].xlator;
+ DHT_STACK_UNWIND(readv, frame, op_ret, op_errno, vector, count, stbuf,
+ iobref, xdata);
- STACK_WIND (frame, dht_attr_cbk,
- subvol, subvol->fops->stat,
- loc, xdata);
- }
+ return 0;
+}
+static int
+dht_readv2(xlator_t *this, xlator_t *subvol, call_frame_t *frame, int ret)
+{
+ dht_local_t *local = NULL;
+ int op_errno = EINVAL;
+
+ local = frame->local;
+ if (!local)
+ goto out;
+
+ op_errno = local->op_errno;
+
+ if (we_are_not_migrating(ret)) {
+ /* This dht xlator is not migrating the file. Unwind and
+ * pass on the original mode bits so the higher DHT layer
+ * can handle this.
+ */
+ DHT_STACK_UNWIND(readv, frame, local->op_ret, op_errno, NULL, 0,
+ &local->rebalance.postbuf, NULL,
+ local->rebalance.xdata);
return 0;
+ }
-err:
- op_errno = (op_errno == -1) ? errno : op_errno;
- DHT_STACK_UNWIND (stat, frame, -1, op_errno, NULL, NULL);
+ if (subvol == NULL)
+ goto out;
- return 0;
-}
+ local->call_cnt = 2;
+
+ STACK_WIND(frame, dht_readv_cbk, subvol, subvol->fops->readv, local->fd,
+ local->rebalance.size, local->rebalance.offset,
+ local->rebalance.flags, local->xattr_req);
+ return 0;
+
+out:
+ DHT_STACK_UNWIND(readv, frame, -1, op_errno, NULL, 0, NULL, NULL, NULL);
+ return 0;
+}
int
-dht_fstat (call_frame_t *frame, xlator_t *this, fd_t *fd, dict_t *xdata)
+dht_readv(call_frame_t *frame, xlator_t *this, fd_t *fd, size_t size, off_t off,
+ uint32_t flags, dict_t *xdata)
{
- xlator_t *subvol = NULL;
- int op_errno = -1;
- dht_local_t *local = NULL;
- dht_layout_t *layout = NULL;
- int i = 0;
- int call_cnt = 0;
-
-
- VALIDATE_OR_GOTO (frame, err);
- VALIDATE_OR_GOTO (this, err);
- VALIDATE_OR_GOTO (fd, err);
-
- local = dht_local_init (frame, NULL, fd, GF_FOP_FSTAT);
- if (!local) {
- op_errno = ENOMEM;
- goto err;
- }
+ xlator_t *subvol = NULL;
+ int op_errno = -1;
+ dht_local_t *local = NULL;
- layout = local->layout;
- if (!layout) {
- gf_msg_debug (this->name, 0,
- "no layout for fd=%p", fd);
- op_errno = EINVAL;
- goto err;
- }
+ VALIDATE_OR_GOTO(frame, err);
+ VALIDATE_OR_GOTO(this, err);
+ VALIDATE_OR_GOTO(fd, err);
- if (IA_ISREG (fd->inode->ia_type)) {
- local->call_cnt = 1;
+ local = dht_local_init(frame, NULL, fd, GF_FOP_READ);
+ if (!local) {
+ op_errno = ENOMEM;
+ goto err;
+ }
- subvol = local->cached_subvol;
+ subvol = local->cached_subvol;
+ if (!subvol) {
+ gf_msg_debug(this->name, 0, "no cached subvolume for fd=%p", fd);
+ op_errno = EINVAL;
+ goto err;
+ }
- STACK_WIND (frame, dht_file_attr_cbk, subvol,
- subvol->fops->fstat, fd, xdata);
+ if (xdata)
+ local->xattr_req = dict_ref(xdata);
- return 0;
- }
+ local->rebalance.offset = off;
+ local->rebalance.size = size;
+ local->rebalance.flags = flags;
+ local->call_cnt = 1;
- local->call_cnt = call_cnt = layout->cnt;
+ STACK_WIND(frame, dht_readv_cbk, subvol, subvol->fops->readv, local->fd,
+ local->rebalance.size, local->rebalance.offset,
+ local->rebalance.flags, local->xattr_req);
- for (i = 0; i < call_cnt; i++) {
- subvol = layout->list[i].xlator;
- STACK_WIND (frame, dht_attr_cbk,
- subvol, subvol->fops->fstat,
- fd, xdata);
- }
-
- return 0;
+ return 0;
err:
- op_errno = (op_errno == -1) ? errno : op_errno;
- DHT_STACK_UNWIND (fstat, frame, -1, op_errno, NULL, NULL);
+ op_errno = (op_errno == -1) ? errno : op_errno;
+ DHT_STACK_UNWIND(readv, frame, -1, op_errno, NULL, 0, NULL, NULL, NULL);
- return 0;
+ return 0;
}
-int
-dht_readv_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
- int op_ret, int op_errno,
- struct iovec *vector, int count, struct iatt *stbuf,
- struct iobref *iobref, dict_t *xdata)
+static int
+dht_access_cbk(call_frame_t *frame, void *cookie, xlator_t *this, int op_ret,
+ int op_errno, dict_t *xdata)
{
- dht_local_t *local = NULL;
- int ret = 0;
- xlator_t *src_subvol = 0;
- xlator_t *dst_subvol = 0;
-
- local = frame->local;
- if (!local) {
- op_ret = -1;
- op_errno = EINVAL;
- goto out;
+ int ret = -1;
+ dht_local_t *local = NULL;
+ xlator_t *subvol = NULL;
+ xlator_t *prev = NULL;
+
+ local = frame->local;
+ prev = cookie;
+
+ if (!prev)
+ goto out;
+ if (local->call_cnt != 1)
+ goto out;
+ if ((op_ret == -1) &&
+ ((op_errno == ENOTCONN) || dht_inode_missing(op_errno)) &&
+ IA_ISDIR(local->loc.inode->ia_type)) {
+ subvol = dht_subvol_next_available(this, prev);
+ if (!subvol)
+ goto out;
+
+ /* check if we are done with visiting every node */
+ if (subvol == local->cached_subvol) {
+ goto out;
}
- /* This is already second try, no need for re-check */
- if (local->call_cnt != 1)
- goto out;
-
- if ((op_ret == -1) && !dht_inode_missing(op_errno))
- goto out;
-
+ STACK_WIND_COOKIE(frame, dht_access_cbk, subvol, subvol,
+ subvol->fops->access, &local->loc,
+ local->rebalance.flags, NULL);
+ return 0;
+ }
+ if ((op_ret == -1) && dht_inode_missing(op_errno) &&
+ !(IA_ISDIR(local->loc.inode->ia_type))) {
+ /* File would be migrated to other node */
local->op_errno = op_errno;
- if ((op_ret == -1) || IS_DHT_MIGRATION_PHASE2 (stbuf)) {
-
- local->op_ret = op_ret;
- local->rebalance.target_op_fn = dht_readv2;
- dht_set_local_rebalance (this, local, NULL, NULL,
- stbuf, xdata);
- /* File would be migrated to other node */
- ret = dht_inode_ctx_get_mig_info (this, local->fd->inode,
- &src_subvol,
- &dst_subvol);
-
- if (dht_mig_info_is_invalid (local->cached_subvol,
- src_subvol, dst_subvol)
- || !dht_fd_open_on_dst(this, local->fd, dst_subvol)) {
-
- ret = dht_rebalance_complete_check (this, frame);
- if (!ret)
- return 0;
- } else {
- /* value is already set in fd_ctx, that means no need
- to check for whether its complete or not. */
- dht_readv2 (this, dst_subvol, frame, 0);
- return 0;
- }
- }
+ local->rebalance.target_op_fn = dht_access2;
+ ret = dht_rebalance_complete_check(frame->this, frame);
+ if (!ret)
+ return 0;
+ }
out:
- DHT_STRIP_PHASE1_FLAGS (stbuf);
-
- DHT_STACK_UNWIND (readv, frame, op_ret, op_errno, vector, count, stbuf,
- iobref, xdata);
-
- return 0;
+ DHT_STACK_UNWIND(access, frame, op_ret, op_errno, xdata);
+ return 0;
}
-int
-dht_readv2 (xlator_t *this, xlator_t *subvol, call_frame_t *frame, int ret)
+static int
+dht_access2(xlator_t *this, xlator_t *subvol, call_frame_t *frame, int ret)
{
- dht_local_t *local = NULL;
- int op_errno = EINVAL;
-
- local = frame->local;
- if (!local)
- goto out;
-
- op_errno = local->op_errno;
-
- if (we_are_not_migrating (ret)) {
- /* This dht xlator is not migrating the file. Unwind and
- * pass on the original mode bits so the higher DHT layer
- * can handle this.
- */
- DHT_STACK_UNWIND (readv, frame, local->op_ret, op_errno,
- NULL, 0, &local->rebalance.postbuf,
- NULL, local->rebalance.xdata);
- return 0;
- }
+ dht_local_t *local = NULL;
+ int op_errno = EINVAL;
- if (subvol == NULL)
- goto out;
+ local = frame->local;
+ if (!local)
+ goto out;
- local->call_cnt = 2;
+ op_errno = local->op_errno;
- STACK_WIND (frame, dht_readv_cbk, subvol, subvol->fops->readv,
- local->fd, local->rebalance.size, local->rebalance.offset,
- local->rebalance.flags, NULL);
+ if (we_are_not_migrating(ret)) {
+ /* This dht xlator is not migrating the file. Unwind and
+ * pass on the original mode bits so the higher DHT layer
+ * can handle this.
+ */
+ DHT_STACK_UNWIND(access, frame, -1, op_errno, NULL);
return 0;
+ }
+
+ if (subvol == NULL)
+ goto out;
+
+ local->call_cnt = 2;
+
+ STACK_WIND_COOKIE(frame, dht_access_cbk, subvol, subvol,
+ subvol->fops->access, &local->loc, local->rebalance.flags,
+ local->xattr_req);
+
+ return 0;
out:
- DHT_STACK_UNWIND (readv, frame, -1, op_errno, NULL, 0, NULL, NULL, NULL);
- return 0;
+ DHT_STACK_UNWIND(access, frame, -1, op_errno, NULL);
+ return 0;
}
+int
+dht_access(call_frame_t *frame, xlator_t *this, loc_t *loc, int32_t mask,
+ dict_t *xdata)
+{
+ xlator_t *subvol = NULL;
+ int op_errno = -1;
+ dht_local_t *local = NULL;
+
+ VALIDATE_OR_GOTO(frame, err);
+ VALIDATE_OR_GOTO(this, err);
+ VALIDATE_OR_GOTO(loc, err);
+ VALIDATE_OR_GOTO(loc->inode, err);
+ VALIDATE_OR_GOTO(loc->path, err);
+
+ local = dht_local_init(frame, loc, NULL, GF_FOP_ACCESS);
+ if (!local) {
+ op_errno = ENOMEM;
+ goto err;
+ }
+
+ local->rebalance.flags = mask;
+ local->call_cnt = 1;
+ subvol = local->cached_subvol;
+ if (!subvol) {
+ gf_msg_debug(this->name, 0, "no cached subvolume for path=%s",
+ loc->path);
+ op_errno = EINVAL;
+ goto err;
+ }
+ if (xdata)
+ local->xattr_req = dict_ref(xdata);
+
+ STACK_WIND_COOKIE(frame, dht_access_cbk, subvol, subvol,
+ subvol->fops->access, loc, mask, xdata);
+
+ return 0;
+
+err:
+ op_errno = (op_errno == -1) ? errno : op_errno;
+ DHT_STACK_UNWIND(access, frame, -1, op_errno, NULL);
+
+ return 0;
+}
int
-dht_readv (call_frame_t *frame, xlator_t *this,
- fd_t *fd, size_t size, off_t off, uint32_t flags, dict_t *xdata)
+dht_flush_cbk(call_frame_t *frame, void *cookie, xlator_t *this, int op_ret,
+ int op_errno, dict_t *xdata)
{
- xlator_t *subvol = NULL;
- int op_errno = -1;
- dht_local_t *local = NULL;
-
- VALIDATE_OR_GOTO (frame, err);
- VALIDATE_OR_GOTO (this, err);
- VALIDATE_OR_GOTO (fd, err);
-
- local = dht_local_init (frame, NULL, fd, GF_FOP_READ);
- if (!local) {
- op_errno = ENOMEM;
- goto err;
- }
+ dht_local_t *local = NULL;
+ xlator_t *subvol = 0;
+ int ret = 0;
- subvol = local->cached_subvol;
- if (!subvol) {
- gf_msg_debug (this->name, 0,
- "no cached subvolume for fd=%p", fd);
- op_errno = EINVAL;
- goto err;
- }
+ local = frame->local;
- local->rebalance.offset = off;
- local->rebalance.size = size;
- local->rebalance.flags = flags;
- local->call_cnt = 1;
+ local->op_errno = op_errno;
- STACK_WIND (frame, dht_readv_cbk,
- subvol, subvol->fops->readv,
- fd, size, off, flags, xdata);
+ if (local->call_cnt != 1)
+ goto out;
+ if (dht_check_remote_fd_failed_error(local, op_ret, op_errno)) {
+ ret = dht_check_and_open_fd_on_subvol(this, frame);
+ if (ret)
+ goto out;
return 0;
+ }
-err:
- op_errno = (op_errno == -1) ? errno : op_errno;
- DHT_STACK_UNWIND (readv, frame, -1, op_errno, NULL, 0, NULL, NULL, NULL);
+ local->rebalance.target_op_fn = dht_flush2;
+
+ local->op_ret = op_ret;
+ local->op_errno = op_errno;
+ /* If context is set, then send flush() it to the destination */
+ dht_inode_ctx_get_mig_info(this, local->fd->inode, NULL, &subvol);
+ if (subvol && dht_fd_open_on_dst(this, local->fd, subvol)) {
+ dht_flush2(this, subvol, frame, 0);
return 0;
-}
+ }
-int
-dht_access_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
- int op_ret, int op_errno, dict_t *xdata)
-{
- int ret = -1;
- dht_local_t *local = NULL;
- xlator_t *subvol = NULL;
- call_frame_t *prev = NULL;
-
- local = frame->local;
- prev = cookie;
-
- if (!prev || !prev->this)
- goto out;
- if (local->call_cnt != 1)
- goto out;
- if ((op_ret == -1) && ((op_errno == ENOTCONN) ||
- dht_inode_missing(op_errno)) &&
- IA_ISDIR(local->loc.inode->ia_type)) {
- subvol = dht_subvol_next_available (this, prev->this);
- if (!subvol)
- goto out;
-
- /* check if we are done with visiting every node */
- if (subvol == local->cached_subvol) {
- goto out;
- }
-
- STACK_WIND (frame, dht_access_cbk, subvol, subvol->fops->access,
- &local->loc, local->rebalance.flags, NULL);
- return 0;
- }
- if ((op_ret == -1) && dht_inode_missing(op_errno) &&
- !(IA_ISDIR(local->loc.inode->ia_type))) {
- /* File would be migrated to other node */
- local->op_errno = op_errno;
- local->rebalance.target_op_fn = dht_access2;
- ret = dht_rebalance_complete_check (frame->this, frame);
- if (!ret)
- return 0;
+ if (op_errno == EREMOTE) {
+ ret = dht_rebalance_complete_check(this, frame);
+ if (!ret) {
+ return 0;
}
+ }
out:
- DHT_STACK_UNWIND (access, frame, op_ret, op_errno, xdata);
- return 0;
+ DHT_STACK_UNWIND(flush, frame, op_ret, op_errno, xdata);
+
+ return 0;
}
-int
-dht_access2 (xlator_t *this, xlator_t *subvol, call_frame_t *frame, int ret)
+static int
+dht_flush2(xlator_t *this, xlator_t *subvol, call_frame_t *frame, int ret)
{
- dht_local_t *local = NULL;
- int op_errno = EINVAL;
-
- local = frame->local;
- if (!local)
- goto out;
+ dht_local_t *local = NULL;
+ int32_t op_errno = EINVAL;
- op_errno = local->op_errno;
+ if ((frame == NULL) || (frame->local == NULL))
+ goto out;
- if (we_are_not_migrating (ret)) {
- /* This dht xlator is not migrating the file. Unwind and
- * pass on the original mode bits so the higher DHT layer
- * can handle this.
- */
+ local = frame->local;
- DHT_STACK_UNWIND (access, frame, -1, op_errno, NULL);
- return 0;
- }
+ op_errno = local->op_errno;
- if (subvol == NULL)
- goto out;
+ if (subvol == NULL)
+ goto out;
- local->call_cnt = 2;
+ local->call_cnt = 2; /* This is the second attempt */
- STACK_WIND (frame, dht_access_cbk, subvol, subvol->fops->access,
- &local->loc, local->rebalance.flags, NULL);
+ STACK_WIND(frame, dht_flush_cbk, subvol, subvol->fops->flush, local->fd,
+ local->xattr_req);
- return 0;
+ return 0;
out:
- DHT_STACK_UNWIND (access, frame, -1, op_errno, NULL);
- return 0;
+ DHT_STACK_UNWIND(flush, frame, -1, op_errno, NULL);
+ return 0;
}
-
int
-dht_access (call_frame_t *frame, xlator_t *this, loc_t *loc, int32_t mask,
- dict_t *xdata)
+dht_flush(call_frame_t *frame, xlator_t *this, fd_t *fd, dict_t *xdata)
{
- xlator_t *subvol = NULL;
- int op_errno = -1;
- dht_local_t *local = NULL;
-
- VALIDATE_OR_GOTO (frame, err);
- VALIDATE_OR_GOTO (this, err);
- VALIDATE_OR_GOTO (loc, err);
- VALIDATE_OR_GOTO (loc->inode, err);
- VALIDATE_OR_GOTO (loc->path, err);
-
- local = dht_local_init (frame, loc, NULL, GF_FOP_ACCESS);
- if (!local) {
- op_errno = ENOMEM;
- goto err;
- }
+ xlator_t *subvol = NULL;
+ int op_errno = -1;
+ dht_local_t *local = NULL;
- local->rebalance.flags = mask;
- local->call_cnt = 1;
- subvol = local->cached_subvol;
- if (!subvol) {
- gf_msg_debug (this->name, 0,
- "no cached subvolume for path=%s", loc->path);
- op_errno = EINVAL;
- goto err;
- }
+ VALIDATE_OR_GOTO(frame, err);
+ VALIDATE_OR_GOTO(this, err);
+ VALIDATE_OR_GOTO(fd, err);
- STACK_WIND (frame, dht_access_cbk, subvol, subvol->fops->access,
- loc, mask, xdata);
+ local = dht_local_init(frame, NULL, fd, GF_FOP_FLUSH);
+ if (!local) {
+ op_errno = ENOMEM;
+ goto err;
+ }
- return 0;
+ subvol = local->cached_subvol;
+ if (!subvol) {
+ gf_msg_debug(this->name, 0, "no cached subvolume for fd=%p", fd);
+ op_errno = EINVAL;
+ goto err;
+ }
+
+ if (xdata)
+ local->xattr_req = dict_ref(xdata);
+
+ local->call_cnt = 1;
+
+ STACK_WIND(frame, dht_flush_cbk, subvol, subvol->fops->flush, fd,
+ local->xattr_req);
+ return 0;
err:
- op_errno = (op_errno == -1) ? errno : op_errno;
- DHT_STACK_UNWIND (access, frame, -1, op_errno, NULL);
+ op_errno = (op_errno == -1) ? errno : op_errno;
+ DHT_STACK_UNWIND(flush, frame, -1, op_errno, NULL);
- return 0;
+ return 0;
}
-
int
-dht_flush_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
- int op_ret, int op_errno, dict_t *xdata)
+dht_fsync_cbk(call_frame_t *frame, void *cookie, xlator_t *this, int op_ret,
+ int op_errno, struct iatt *prebuf, struct iatt *postbuf,
+ dict_t *xdata)
{
- dht_local_t *local = NULL;
- xlator_t *subvol = 0;
+ dht_local_t *local = NULL;
+ xlator_t *prev = NULL;
+ int ret = -1;
+ inode_t *inode = NULL;
+ xlator_t *src_subvol = 0;
+ xlator_t *dst_subvol = 0;
+
+ local = frame->local;
+ prev = cookie;
+
+ local->op_errno = op_errno;
+
+ if (dht_check_remote_fd_failed_error(local, op_ret, op_errno)) {
+ ret = dht_check_and_open_fd_on_subvol(this, frame);
+ if (ret)
+ goto out;
+ return 0;
+ }
+
+ if (op_ret == -1 && !dht_inode_missing(op_errno)) {
+ gf_msg_debug(this->name, op_errno, "subvolume %s returned -1",
+ prev->name);
+ goto out;
+ }
+
+ if (local->call_cnt != 1) {
+ if (local->stbuf.ia_blocks) {
+ dht_iatt_merge(this, postbuf, &local->stbuf);
+ dht_iatt_merge(this, prebuf, &local->prebuf);
+ }
+ goto out;
+ }
- local = frame->local;
+ local->op_ret = op_ret;
+ inode = local->fd->inode;
- local->op_errno = op_errno;
+ local->rebalance.target_op_fn = dht_fsync2;
+ dht_set_local_rebalance(this, local, NULL, prebuf, postbuf, xdata);
+
+ if ((op_ret == -1) || IS_DHT_MIGRATION_PHASE2(postbuf)) {
+ ret = dht_rebalance_complete_check(this, frame);
+ if (!ret)
+ return 0;
+ }
+
+ /* Check if the rebalance phase1 is true */
+ if (IS_DHT_MIGRATION_PHASE1(postbuf)) {
+ dht_iatt_merge(this, &local->stbuf, postbuf);
+ dht_iatt_merge(this, &local->prebuf, prebuf);
- if (local->call_cnt != 1)
- goto out;
+ dht_inode_ctx_get_mig_info(this, inode, &src_subvol, &dst_subvol);
- /* If context is set, then send flush() it to the destination */
- dht_inode_ctx_get_mig_info (this, local->fd->inode, NULL, &subvol);
- if (subvol && dht_fd_open_on_dst (this, local->fd, subvol)) {
- dht_flush2 (this, subvol, frame, 0);
+ if (dht_mig_info_is_invalid(local->cached_subvol, src_subvol,
+ dst_subvol) ||
+ !dht_fd_open_on_dst(this, local->fd, dst_subvol)) {
+ ret = dht_rebalance_in_progress_check(this, frame);
+ if (!ret)
return 0;
+ } else {
+ dht_fsync2(this, dst_subvol, frame, 0);
+ return 0;
}
+ }
out:
- DHT_STACK_UNWIND (flush, frame, op_ret, op_errno, xdata);
+ DHT_STRIP_PHASE1_FLAGS(postbuf);
+ DHT_STRIP_PHASE1_FLAGS(prebuf);
- return 0;
+ DHT_STACK_UNWIND(fsync, frame, op_ret, op_errno, prebuf, postbuf, xdata);
+
+ return 0;
}
-int
-dht_flush2 (xlator_t *this, xlator_t *subvol, call_frame_t *frame, int ret)
+static int
+dht_fsync2(xlator_t *this, xlator_t *subvol, call_frame_t *frame, int ret)
{
- dht_local_t *local = NULL;
- int32_t op_errno = EINVAL;
-
- if ((frame == NULL) || (frame->local == NULL))
- goto out;
-
- local = frame->local;
-
- op_errno = local->op_errno;
+ dht_local_t *local = NULL;
+ int32_t op_errno = EINVAL;
+
+ if ((frame == NULL) || (frame->local == NULL))
+ goto out;
+
+ local = frame->local;
+ op_errno = local->op_errno;
+
+ if (we_are_not_migrating(ret)) {
+ /* This dht xlator is not migrating the file. Unwind and
+ * pass on the original mode bits so the higher DHT layer
+ * can handle this.
+ */
+ DHT_STACK_UNWIND(fsync, frame, local->op_ret, op_errno,
+ &local->rebalance.prebuf, &local->rebalance.postbuf,
+ local->rebalance.xdata);
+ return 0;
+ }
- if (subvol == NULL)
- goto out;
+ if (subvol == NULL)
+ goto out;
- local->call_cnt = 2; /* This is the second attempt */
+ local->call_cnt = 2; /* This is the second attempt */
- STACK_WIND (frame, dht_flush_cbk,
- subvol, subvol->fops->flush, local->fd, NULL);
+ STACK_WIND_COOKIE(frame, dht_fsync_cbk, subvol, subvol, subvol->fops->fsync,
+ local->fd, local->rebalance.flags, local->xattr_req);
- return 0;
+ return 0;
out:
- DHT_STACK_UNWIND (flush, frame, -1, op_errno, NULL);
- return 0;
+ DHT_STACK_UNWIND(fsync, frame, -1, op_errno, NULL, NULL, NULL);
+ return 0;
}
-
int
-dht_flush (call_frame_t *frame, xlator_t *this, fd_t *fd, dict_t *xdata)
+dht_fsync(call_frame_t *frame, xlator_t *this, fd_t *fd, int datasync,
+ dict_t *xdata)
{
- xlator_t *subvol = NULL;
- int op_errno = -1;
- dht_local_t *local = NULL;
-
- VALIDATE_OR_GOTO (frame, err);
- VALIDATE_OR_GOTO (this, err);
- VALIDATE_OR_GOTO (fd, err);
-
- local = dht_local_init (frame, NULL, fd, GF_FOP_FLUSH);
- if (!local) {
- op_errno = ENOMEM;
- goto err;
- }
+ xlator_t *subvol = NULL;
+ int op_errno = -1;
+ dht_local_t *local = NULL;
- subvol = local->cached_subvol;
- if (!subvol) {
- gf_msg_debug (this->name, 0,
- "no cached subvolume for fd=%p", fd);
- op_errno = EINVAL;
- goto err;
- }
+ VALIDATE_OR_GOTO(frame, err);
+ VALIDATE_OR_GOTO(this, err);
+ VALIDATE_OR_GOTO(fd, err);
- local->call_cnt = 1;
+ local = dht_local_init(frame, NULL, fd, GF_FOP_FSYNC);
+ if (!local) {
+ op_errno = ENOMEM;
- STACK_WIND (frame, dht_flush_cbk,
- subvol, subvol->fops->flush, fd, xdata);
+ goto err;
+ }
+ if (xdata)
+ local->xattr_req = dict_ref(xdata);
- return 0;
+ local->call_cnt = 1;
+ local->rebalance.flags = datasync;
+
+ subvol = local->cached_subvol;
+
+ STACK_WIND_COOKIE(frame, dht_fsync_cbk, subvol, subvol, subvol->fops->fsync,
+ local->fd, local->rebalance.flags, local->xattr_req);
+ return 0;
err:
- op_errno = (op_errno == -1) ? errno : op_errno;
- DHT_STACK_UNWIND (flush, frame, -1, op_errno, NULL);
+ op_errno = (op_errno == -1) ? errno : op_errno;
+ DHT_STACK_UNWIND(fsync, frame, -1, op_errno, NULL, NULL, NULL);
- return 0;
+ return 0;
}
-
-int
-dht_fsync_cbk (call_frame_t *frame, void *cookie, xlator_t *this, int op_ret,
- int op_errno, struct iatt *prebuf, struct iatt *postbuf,
- dict_t *xdata)
+/* TODO: for 'lk()' call, we need some other special error, may be ESTALE to
+ indicate that lock migration happened on the fd, so we can consider it as
+ phase 2 of migration */
+static int
+dht_lk_cbk(call_frame_t *frame, void *cookie, xlator_t *this, int op_ret,
+ int op_errno, struct gf_flock *flock, dict_t *xdata)
{
- dht_local_t *local = NULL;
- call_frame_t *prev = NULL;
- int ret = -1;
- inode_t *inode = NULL;
- xlator_t *src_subvol = 0;
- xlator_t *dst_subvol = 0;
+ dht_local_t *local = NULL;
+ int ret = -1;
+ xlator_t *subvol = NULL;
- local = frame->local;
- prev = cookie;
+ local = frame->local;
- local->op_errno = op_errno;
- if (op_ret == -1 && !dht_inode_missing(op_errno)) {
- gf_msg_debug (this->name, op_errno,
- "subvolume %s returned -1",
- prev->this->name);
- goto out;
- }
+ if (!local) {
+ op_ret = -1;
+ op_errno = EINVAL;
+ goto out;
+ }
+
+ if (local->call_cnt != 1)
+ goto out;
+
+ local->rebalance.target_op_fn = dht_lk2;
+
+ local->op_ret = op_ret;
+ local->op_errno = op_errno;
- if (local->call_cnt != 1) {
- if (local->stbuf.ia_blocks) {
- dht_iatt_merge (this, postbuf, &local->stbuf, NULL);
- dht_iatt_merge (this, prebuf, &local->prebuf, NULL);
- }
- goto out;
+ if (xdata)
+ local->rebalance.xdata = dict_ref(xdata);
+
+ if (op_errno == EREMOTE) {
+ dht_inode_ctx_get_mig_info(this, local->fd->inode, NULL, &subvol);
+ if (subvol && dht_fd_open_on_dst(this, local->fd, subvol)) {
+ dht_lk2(this, subvol, frame, 0);
+ return 0;
+ } else {
+ ret = dht_rebalance_complete_check(this, frame);
+ if (!ret) {
+ return 0;
+ }
}
+ }
- local->op_ret = op_ret;
- inode = local->fd->inode;
+out:
+ dht_lk_inode_unref(frame, op_ret);
+ DHT_STACK_UNWIND(lk, frame, op_ret, op_errno, flock, xdata);
- local->rebalance.target_op_fn = dht_fsync2;
- dht_set_local_rebalance (this, local, NULL, prebuf,
- postbuf, xdata);
+ return 0;
+}
- /* Check if the rebalance phase1 is true */
- if (IS_DHT_MIGRATION_PHASE1 (postbuf)) {
+static int
+dht_lk2(xlator_t *this, xlator_t *subvol, call_frame_t *frame, int ret)
+{
+ dht_local_t *local = NULL;
+ int32_t op_errno = EINVAL;
- dht_iatt_merge (this, &local->stbuf, postbuf, NULL);
- dht_iatt_merge (this, &local->prebuf, prebuf, NULL);
+ if ((frame == NULL) || (frame->local == NULL))
+ goto out;
- dht_inode_ctx_get_mig_info (this, inode, &src_subvol, &dst_subvol);
+ local = frame->local;
- if (dht_mig_info_is_invalid (local->cached_subvol, src_subvol,
- dst_subvol) ||
- !dht_fd_open_on_dst (this, local->fd, dst_subvol)) {
+ op_errno = local->op_errno;
- ret = dht_rebalance_in_progress_check (this, frame);
- if (!ret)
- return 0;
- } else {
- dht_fsync2 (this, dst_subvol, frame, 0);
- return 0;
- }
- }
+ if (subvol == NULL)
+ goto out;
- if (IS_DHT_MIGRATION_PHASE2 (postbuf)) {
- ret = dht_rebalance_complete_check (this, frame);
- if (!ret)
- return 0;
- }
+ local->call_cnt = 2; /* This is the second attempt */
-out:
- DHT_STRIP_PHASE1_FLAGS (postbuf);
- DHT_STRIP_PHASE1_FLAGS (prebuf);
+ STACK_WIND(frame, dht_lk_cbk, subvol, subvol->fops->lk, local->fd,
+ local->rebalance.lock_cmd, &local->rebalance.flock,
+ local->xattr_req);
- DHT_STACK_UNWIND (fsync, frame, op_ret, op_errno,
- prebuf, postbuf, xdata);
+ return 0;
- return 0;
+out:
+ DHT_STACK_UNWIND(lk, frame, -1, op_errno, NULL, NULL);
+ return 0;
}
int
-dht_fsync2 (xlator_t *this, xlator_t *subvol, call_frame_t *frame, int ret)
+dht_lk(call_frame_t *frame, xlator_t *this, fd_t *fd, int cmd,
+ struct gf_flock *flock, dict_t *xdata)
{
- dht_local_t *local = NULL;
- int32_t op_errno = EINVAL;
-
- if ((frame == NULL) || (frame->local == NULL))
- goto out;
-
- local = frame->local;
- op_errno = local->op_errno;
-
- if (we_are_not_migrating (ret)) {
- /* This dht xlator is not migrating the file. Unwind and
- * pass on the original mode bits so the higher DHT layer
- * can handle this.
- */
- DHT_STACK_UNWIND (fsync, frame, local->op_ret,
- op_errno, &local->rebalance.prebuf,
- &local->rebalance.postbuf,
- local->rebalance.xdata);
- return 0;
- }
+ xlator_t *lock_subvol = NULL;
+ int op_errno = -1;
+ dht_local_t *local = NULL;
+
+ VALIDATE_OR_GOTO(frame, err);
+ VALIDATE_OR_GOTO(this, err);
+ VALIDATE_OR_GOTO(fd, err);
+
+ local = dht_local_init(frame, NULL, fd, GF_FOP_LK);
+ if (!local) {
+ op_errno = ENOMEM;
+ goto err;
+ }
+
+ local->lock_type = flock->l_type;
+ lock_subvol = dht_get_lock_subvolume(this, flock, local);
+ if (!lock_subvol) {
+ gf_msg_debug(this->name, 0, "no lock subvolume for path=%p", fd);
+ op_errno = EINVAL;
+ goto err;
+ }
+
+ /*
+ local->cached_subvol = lock_subvol;
+ ret = dht_check_and_open_fd_on_subvol (this, frame);
+ if (ret)
+ goto err;
+ */
+ if (xdata)
+ local->xattr_req = dict_ref(xdata);
+
+ local->rebalance.flock = *flock;
+ local->rebalance.lock_cmd = cmd;
+
+ local->call_cnt = 1;
+
+ STACK_WIND(frame, dht_lk_cbk, lock_subvol, lock_subvol->fops->lk, fd, cmd,
+ flock, xdata);
+
+ return 0;
- if (subvol == NULL)
- goto out;
-
- local->call_cnt = 2; /* This is the second attempt */
+err:
+ op_errno = (op_errno == -1) ? errno : op_errno;
+ DHT_STACK_UNWIND(lk, frame, -1, op_errno, NULL, NULL);
- STACK_WIND (frame, dht_fsync_cbk, subvol, subvol->fops->fsync,
- local->fd, local->rebalance.flags, NULL);
+ return 0;
+}
- return 0;
+static int
+dht_lease_cbk(call_frame_t *frame, void *cookie, xlator_t *this, int op_ret,
+ int op_errno, struct gf_lease *lease, dict_t *xdata)
+{
+ DHT_STACK_UNWIND(lease, frame, op_ret, op_errno, lease, xdata);
-out:
- DHT_STACK_UNWIND (fsync, frame, -1, op_errno, NULL, NULL, NULL);
- return 0;
+ return 0;
}
int
-dht_fsync (call_frame_t *frame, xlator_t *this, fd_t *fd, int datasync,
- dict_t *xdata)
+dht_lease(call_frame_t *frame, xlator_t *this, loc_t *loc,
+ struct gf_lease *lease, dict_t *xdata)
{
- xlator_t *subvol = NULL;
- int op_errno = -1;
- dht_local_t *local = NULL;
+ xlator_t *subvol = NULL;
+ int op_errno = -1;
- VALIDATE_OR_GOTO (frame, err);
- VALIDATE_OR_GOTO (this, err);
- VALIDATE_OR_GOTO (fd, err);
+ VALIDATE_OR_GOTO(frame, err);
+ VALIDATE_OR_GOTO(this, err);
+ VALIDATE_OR_GOTO(loc, err);
- local = dht_local_init (frame, NULL, fd, GF_FOP_FSYNC);
- if (!local) {
- op_errno = ENOMEM;
+ subvol = dht_subvol_get_cached(this, loc->inode);
+ if (!subvol) {
+ gf_msg_debug(this->name, 0, "no cached subvolume for path=%s",
+ loc->path);
+ op_errno = EINVAL;
+ goto err;
+ }
- goto err;
- }
+ /* TODO: for rebalance, we need to preserve the fop arguments */
+ STACK_WIND(frame, dht_lease_cbk, subvol, subvol->fops->lease, loc, lease,
+ xdata);
- local->call_cnt = 1;
- local->rebalance.flags = datasync;
+ return 0;
- subvol = local->cached_subvol;
+err:
+ op_errno = (op_errno == -1) ? errno : op_errno;
+ DHT_STACK_UNWIND(lease, frame, -1, op_errno, NULL, NULL);
- STACK_WIND (frame, dht_fsync_cbk, subvol, subvol->fops->fsync,
- fd, datasync, xdata);
+ return 0;
+}
- return 0;
+/* Symlinks are currently not migrated, so no need for any check here */
+static int
+dht_readlink_cbk(call_frame_t *frame, void *cookie, xlator_t *this, int op_ret,
+ int op_errno, const char *path, struct iatt *stbuf,
+ dict_t *xdata)
+{
+ dht_local_t *local = NULL;
+
+ local = frame->local;
+ if (op_ret == -1)
+ goto err;
+
+ if (!local) {
+ op_ret = -1;
+ op_errno = EINVAL;
+ }
err:
- op_errno = (op_errno == -1) ? errno : op_errno;
- DHT_STACK_UNWIND (fsync, frame, -1, op_errno, NULL, NULL, NULL);
+ DHT_STRIP_PHASE1_FLAGS(stbuf);
+ DHT_STACK_UNWIND(readlink, frame, op_ret, op_errno, path, stbuf, xdata);
- return 0;
+ return 0;
}
-
-/* TODO: for 'lk()' call, we need some other special error, may be ESTALE to
- indicate that lock migration happened on the fd, so we can consider it as
- phase 2 of migration */
int
-dht_lk_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
- int op_ret, int op_errno, struct gf_flock *flock, dict_t *xdata)
+dht_readlink(call_frame_t *frame, xlator_t *this, loc_t *loc, size_t size,
+ dict_t *xdata)
{
- DHT_STACK_UNWIND (lk, frame, op_ret, op_errno, flock, xdata);
+ xlator_t *subvol = NULL;
+ int op_errno = -1;
+ dht_local_t *local = NULL;
+
+ VALIDATE_OR_GOTO(frame, err);
+ VALIDATE_OR_GOTO(this, err);
+ VALIDATE_OR_GOTO(loc, err);
+ VALIDATE_OR_GOTO(loc->inode, err);
+ VALIDATE_OR_GOTO(loc->path, err);
+
+ local = dht_local_init(frame, loc, NULL, GF_FOP_READLINK);
+ if (!local) {
+ op_errno = ENOMEM;
+ goto err;
+ }
+
+ subvol = local->cached_subvol;
+ if (!subvol) {
+ gf_msg_debug(this->name, 0, "no cached subvolume for path=%s",
+ loc->path);
+ op_errno = EINVAL;
+ goto err;
+ }
+
+ STACK_WIND(frame, dht_readlink_cbk, subvol, subvol->fops->readlink, loc,
+ size, xdata);
+
+ return 0;
- return 0;
-}
+err:
+ op_errno = (op_errno == -1) ? errno : op_errno;
+ DHT_STACK_UNWIND(readlink, frame, -1, op_errno, NULL, NULL, NULL);
+ return 0;
+}
-int
-dht_lk (call_frame_t *frame, xlator_t *this,
- fd_t *fd, int cmd, struct gf_flock *flock, dict_t *xdata)
+/* Get both DHT_IATT_IN_XDATA_KEY and DHT_MODE_IN_XDATA_KEY
+ * Use DHT_MODE_IN_XDATA_KEY if available, else fall back to
+ * DHT_IATT_IN_XDATA_KEY
+ * This will return a dummy iatt with only the mode and type set
+ */
+static int
+dht_read_iatt_from_xdata(dict_t *xdata, struct iatt *stbuf)
{
- xlator_t *subvol = NULL;
- int op_errno = -1;
+ int ret = -1;
+ int32_t mode = 0;
+ ret = dict_get_int32(xdata, DHT_MODE_IN_XDATA_KEY, &mode);
- VALIDATE_OR_GOTO (frame, err);
- VALIDATE_OR_GOTO (this, err);
- VALIDATE_OR_GOTO (fd, err);
-
- subvol = dht_subvol_get_cached (this, fd->inode);
- if (!subvol) {
- gf_msg_debug (this->name, 0,
- "no cached subvolume for fd=%p", fd);
- op_errno = EINVAL;
- goto err;
- }
+ if (ret) {
+ ret = dict_get_bin(xdata, DHT_IATT_IN_XDATA_KEY, (void **)&stbuf);
+ } else {
+ stbuf->ia_prot = ia_prot_from_st_mode(mode);
+ stbuf->ia_type = ia_type_from_st_mode(mode);
+ }
- /* TODO: for rebalance, we need to preserve the fop arguments */
- STACK_WIND (frame, dht_lk_cbk, subvol, subvol->fops->lk, fd,
- cmd, flock, xdata);
+ return ret;
+}
+int
+dht_common_xattrop_cbk(call_frame_t *frame, void *cookie, xlator_t *this,
+ int32_t op_ret, int32_t op_errno, dict_t *dict,
+ dict_t *xdata)
+{
+ dht_local_t *local = NULL;
+ call_frame_t *call_frame = NULL;
+ xlator_t *prev = NULL;
+ xlator_t *src_subvol = NULL;
+ xlator_t *dst_subvol = NULL;
+ struct iatt stbuf = {
+ 0,
+ };
+ int ret = -1;
+ inode_t *inode = NULL;
+
+ local = frame->local;
+ call_frame = cookie;
+ prev = call_frame->this;
+
+ local->op_errno = op_errno;
+
+ if ((op_ret == -1) && !dht_inode_missing(op_errno)) {
+ gf_msg_debug(this->name, op_errno, "subvolume %s returned -1.",
+ prev->name);
+ goto out;
+ }
+
+ if (local->call_cnt != 1)
+ goto out;
+
+ if (dht_check_remote_fd_failed_error(local, op_ret, op_errno)) {
+ ret = dht_check_and_open_fd_on_subvol(this, frame);
+ if (ret)
+ goto out;
return 0;
+ }
-err:
- op_errno = (op_errno == -1) ? errno : op_errno;
- DHT_STACK_UNWIND (lk, frame, -1, op_errno, NULL, NULL);
+ ret = dht_read_iatt_from_xdata(xdata, &stbuf);
- return 0;
-}
+ if ((!op_ret) && (ret)) {
+ /* This is a potential problem and can cause corruption
+ * with sharding.
+ * Oh well. We tried.
+ */
+ goto out;
+ }
-/* Symlinks are currently not migrated, so no need for any check here */
-int
-dht_readlink_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
- int op_ret, int op_errno, const char *path,
- struct iatt *stbuf, dict_t *xdata)
-{
- dht_local_t *local = NULL;
+ local->op_ret = op_ret;
+ local->rebalance.target_op_fn = dht_common_xattrop2;
+ if (xdata)
+ local->rebalance.xdata = dict_ref(xdata);
- local = frame->local;
- if (op_ret == -1)
- goto err;
+ if (dict)
+ local->rebalance.dict = dict_ref(dict);
- if (!local) {
- op_ret = -1;
- op_errno = EINVAL;
+ /* Phase 2 of migration */
+ if ((op_ret == -1) || IS_DHT_MIGRATION_PHASE2(&stbuf)) {
+ ret = dht_rebalance_complete_check(this, frame);
+ if (!ret)
+ return 0;
+ }
+
+ /* Check if the rebalance phase1 is true */
+ if (IS_DHT_MIGRATION_PHASE1(&stbuf)) {
+ inode = local->loc.inode ? local->loc.inode : local->fd->inode;
+ dht_inode_ctx_get_mig_info(this, inode, &src_subvol, &dst_subvol);
+
+ if (dht_mig_info_is_invalid(local->cached_subvol, src_subvol,
+ dst_subvol) ||
+ !dht_fd_open_on_dst(this, local->fd, dst_subvol)) {
+ ret = dht_rebalance_in_progress_check(this, frame);
+ if (!ret)
+ return 0;
+ } else {
+ dht_common_xattrop2(this, dst_subvol, frame, 0);
+ return 0;
}
+ }
-err:
- DHT_STRIP_PHASE1_FLAGS (stbuf);
- DHT_STACK_UNWIND (readlink, frame, op_ret, op_errno, path, stbuf, xdata);
+out:
+ if (local->fop == GF_FOP_XATTROP) {
+ DHT_STACK_UNWIND(xattrop, frame, op_ret, op_errno, dict, xdata);
+ } else {
+ DHT_STACK_UNWIND(fxattrop, frame, op_ret, op_errno, dict, xdata);
+ }
- return 0;
+ return 0;
}
-
-int
-dht_readlink (call_frame_t *frame, xlator_t *this, loc_t *loc, size_t size,
- dict_t *xdata)
+static int
+dht_common_xattrop2(xlator_t *this, xlator_t *subvol, call_frame_t *frame,
+ int ret)
{
- xlator_t *subvol = NULL;
- int op_errno = -1;
- dht_local_t *local = NULL;
-
- VALIDATE_OR_GOTO (frame, err);
- VALIDATE_OR_GOTO (this, err);
- VALIDATE_OR_GOTO (loc, err);
- VALIDATE_OR_GOTO (loc->inode, err);
- VALIDATE_OR_GOTO (loc->path, err);
-
- local = dht_local_init (frame, loc, NULL, GF_FOP_READLINK);
- if (!local) {
- op_errno = ENOMEM;
- goto err;
+ dht_local_t *local = NULL;
+ int32_t op_errno = EINVAL;
+
+ if ((frame == NULL) || (frame->local == NULL))
+ goto out;
+
+ local = frame->local;
+ op_errno = local->op_errno;
+
+ if (we_are_not_migrating(ret)) {
+ /* This dht xlator is not migrating the file. Unwind and
+ * pass on the original mode bits so the higher DHT layer
+ * can handle this.
+ */
+ if (local->fop == GF_FOP_XATTROP) {
+ DHT_STACK_UNWIND(xattrop, frame, local->op_ret, op_errno,
+ local->rebalance.dict, local->rebalance.xdata);
+ } else {
+ DHT_STACK_UNWIND(fxattrop, frame, local->op_ret, op_errno,
+ local->rebalance.dict, local->rebalance.xdata);
}
- subvol = local->cached_subvol;
- if (!subvol) {
- gf_msg_debug (this->name, 0,
- "no cached subvolume for path=%s", loc->path);
- op_errno = EINVAL;
- goto err;
- }
+ return 0;
+ }
- STACK_WIND (frame, dht_readlink_cbk,
- subvol, subvol->fops->readlink,
- loc, size, xdata);
+ if (subvol == NULL)
+ goto out;
- return 0;
+ local->call_cnt = 2; /* This is the second attempt */
-err:
- op_errno = (op_errno == -1) ? errno : op_errno;
- DHT_STACK_UNWIND (readlink, frame, -1, op_errno, NULL, NULL, NULL);
+ if (local->fop == GF_FOP_XATTROP) {
+ STACK_WIND(frame, dht_common_xattrop_cbk, subvol, subvol->fops->xattrop,
+ &local->loc, local->rebalance.flags, local->rebalance.xattr,
+ local->xattr_req);
+ } else {
+ STACK_WIND(frame, dht_common_xattrop_cbk, subvol,
+ subvol->fops->fxattrop, local->fd, local->rebalance.flags,
+ local->rebalance.xattr, local->xattr_req);
+ }
- return 0;
-}
+ return 0;
-/* Currently no translators on top of 'distribute' will be using
- * below fops, hence not implementing 'migration' related checks
- */
+out:
-int
-dht_xattrop_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
- int32_t op_ret, int32_t op_errno, dict_t *dict, dict_t *xdata)
+ /* If local is unavailable we could be unwinding the wrong
+ * function here */
+
+ if (local && (local->fop == GF_FOP_XATTROP)) {
+ DHT_STACK_UNWIND(xattrop, frame, -1, op_errno, NULL, NULL);
+ } else {
+ DHT_STACK_UNWIND(fxattrop, frame, -1, op_errno, NULL, NULL);
+ }
+ return 0;
+}
+
+static int
+dht_xattrop_cbk(call_frame_t *frame, void *cookie, xlator_t *this,
+ int32_t op_ret, int32_t op_errno, dict_t *dict, dict_t *xdata)
{
- DHT_STACK_UNWIND (xattrop, frame, op_ret, op_errno, dict, xdata);
- return 0;
+ DHT_STACK_UNWIND(xattrop, frame, op_ret, op_errno, dict, xdata);
+ return 0;
}
+/* Set both DHT_IATT_IN_XDATA_KEY and DHT_MODE_IN_XDATA_KEY
+ * Use DHT_MODE_IN_XDATA_KEY if available. Else fall back to
+ * DHT_IATT_IN_XDATA_KEY
+ */
+static int
+dht_request_iatt_in_xdata(dict_t *xattr_req)
+{
+ int ret = -1;
+
+ ret = dict_set_int8(xattr_req, DHT_MODE_IN_XDATA_KEY, 1);
+ ret = dict_set_int8(xattr_req, DHT_IATT_IN_XDATA_KEY, 1);
+
+ /* At least one call succeeded */
+ return ret;
+}
int
-dht_xattrop (call_frame_t *frame, xlator_t *this, loc_t *loc,
- gf_xattrop_flags_t flags, dict_t *dict, dict_t *xdata)
+dht_xattrop(call_frame_t *frame, xlator_t *this, loc_t *loc,
+ gf_xattrop_flags_t flags, dict_t *dict, dict_t *xdata)
{
- xlator_t *subvol = NULL;
- int op_errno = -1;
- dht_local_t *local = NULL;
-
- VALIDATE_OR_GOTO (frame, err);
- VALIDATE_OR_GOTO (this, err);
- VALIDATE_OR_GOTO (loc, err);
- VALIDATE_OR_GOTO (loc->inode, err);
-
- local = dht_local_init (frame, loc, NULL, GF_FOP_XATTROP);
- if (!local) {
- op_errno = ENOMEM;
- goto err;
- }
+ xlator_t *subvol = NULL;
+ int op_errno = -1;
+ dht_local_t *local = NULL;
+ int ret = -1;
+
+ VALIDATE_OR_GOTO(frame, err);
+ VALIDATE_OR_GOTO(this, err);
+ VALIDATE_OR_GOTO(loc, err);
+ VALIDATE_OR_GOTO(loc->inode, err);
+
+ local = dht_local_init(frame, loc, NULL, GF_FOP_XATTROP);
+ if (!local) {
+ op_errno = ENOMEM;
+ goto err;
+ }
+
+ subvol = local->cached_subvol;
+ if (!subvol) {
+ gf_msg_debug(this->name, 0, "no cached subvolume for gfid=%s",
+ uuid_utoa(loc->inode->gfid));
+ op_errno = EINVAL;
+ goto err;
+ }
+
+ /* Todo : Handle dirs as well. At the moment the only xlator above dht
+ * that uses xattrop is sharding and that is only for files */
+
+ if (IA_ISDIR(loc->inode->ia_type)) {
+ STACK_WIND(frame, dht_xattrop_cbk, subvol, subvol->fops->xattrop, loc,
+ flags, dict, xdata);
+
+ } else {
+ local->xattr_req = xdata ? dict_ref(xdata) : dict_new();
+ local->call_cnt = 1;
- subvol = local->cached_subvol;
- if (!subvol) {
- gf_msg_debug (this->name, 0,
- "no cached subvolume for gfid=%s",
- uuid_utoa (loc->inode->gfid));
- op_errno = EINVAL;
- goto err;
- }
+ local->rebalance.xattr = dict_ref(dict);
+ local->rebalance.flags = flags;
- local->call_cnt = 1;
+ ret = dht_request_iatt_in_xdata(local->xattr_req);
- STACK_WIND (frame,
- dht_xattrop_cbk,
- subvol, subvol->fops->xattrop,
- loc, flags, dict, xdata);
+ if (ret) {
+ gf_msg_debug(this->name, 0,
+ "Failed to set dictionary key %s file=%s",
+ DHT_IATT_IN_XDATA_KEY, loc->path);
+ }
- return 0;
+ STACK_WIND(frame, dht_common_xattrop_cbk, subvol, subvol->fops->xattrop,
+ loc, local->rebalance.flags, local->rebalance.xattr,
+ local->xattr_req);
+ }
+
+ return 0;
err:
- op_errno = (op_errno == -1) ? errno : op_errno;
- DHT_STACK_UNWIND (xattrop, frame, -1, op_errno, NULL, NULL);
+ op_errno = (op_errno == -1) ? errno : op_errno;
+ DHT_STACK_UNWIND(xattrop, frame, -1, op_errno, NULL, NULL);
- return 0;
+ return 0;
}
-
-int
-dht_fxattrop_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
- int32_t op_ret, int32_t op_errno, dict_t *dict, dict_t *xdata)
+static int
+dht_fxattrop_cbk(call_frame_t *frame, void *cookie, xlator_t *this,
+ int32_t op_ret, int32_t op_errno, dict_t *dict, dict_t *xdata)
{
- DHT_STACK_UNWIND (fxattrop, frame, op_ret, op_errno, dict, xdata);
- return 0;
+ DHT_STACK_UNWIND(fxattrop, frame, op_ret, op_errno, dict, xdata);
+ return 0;
}
-
int
-dht_fxattrop (call_frame_t *frame, xlator_t *this,
- fd_t *fd, gf_xattrop_flags_t flags, dict_t *dict, dict_t *xdata)
+dht_fxattrop(call_frame_t *frame, xlator_t *this, fd_t *fd,
+ gf_xattrop_flags_t flags, dict_t *dict, dict_t *xdata)
{
- xlator_t *subvol = NULL;
- int op_errno = -1;
-
- VALIDATE_OR_GOTO (frame, err);
- VALIDATE_OR_GOTO (this, err);
- VALIDATE_OR_GOTO (fd, err);
-
- subvol = dht_subvol_get_cached (this, fd->inode);
- if (!subvol) {
- gf_msg_debug (this->name, 0,
- "no cached subvolume for fd=%p", fd);
- op_errno = EINVAL;
- goto err;
+ xlator_t *subvol = NULL;
+ int op_errno = -1;
+ dht_local_t *local = NULL;
+ int ret = -1;
+
+ VALIDATE_OR_GOTO(frame, err);
+ VALIDATE_OR_GOTO(this, err);
+ VALIDATE_OR_GOTO(fd, err);
+
+ subvol = dht_subvol_get_cached(this, fd->inode);
+ if (!subvol) {
+ gf_msg_debug(this->name, 0, "no cached subvolume for fd=%p", fd);
+ op_errno = EINVAL;
+ goto err;
+ }
+
+ local = dht_local_init(frame, NULL, fd, GF_FOP_FXATTROP);
+ if (!local) {
+ op_errno = ENOMEM;
+ goto err;
+ }
+
+ /* Todo : Handle dirs as well. At the moment the only xlator above dht
+ * that uses xattrop is sharding and that is only for files */
+
+ if (IA_ISDIR(fd->inode->ia_type)) {
+ STACK_WIND(frame, dht_fxattrop_cbk, subvol, subvol->fops->fxattrop, fd,
+ flags, dict, xdata);
+
+ } else {
+ local->xattr_req = xdata ? dict_ref(xdata) : dict_new();
+ local->call_cnt = 1;
+
+ local->rebalance.xattr = dict_ref(dict);
+ local->rebalance.flags = flags;
+
+ ret = dht_request_iatt_in_xdata(local->xattr_req);
+
+ if (ret) {
+ gf_msg_debug(this->name, 0, "Failed to set dictionary key %s fd=%p",
+ DHT_IATT_IN_XDATA_KEY, fd);
}
- STACK_WIND (frame,
- dht_fxattrop_cbk,
- subvol, subvol->fops->fxattrop,
- fd, flags, dict, xdata);
+ STACK_WIND(frame, dht_common_xattrop_cbk, subvol,
+ subvol->fops->fxattrop, fd, local->rebalance.flags,
+ local->rebalance.xattr, local->xattr_req);
+ }
- return 0;
+ return 0;
err:
- op_errno = (op_errno == -1) ? errno : op_errno;
- DHT_STACK_UNWIND (fxattrop, frame, -1, op_errno, NULL, NULL);
+ op_errno = (op_errno == -1) ? errno : op_errno;
+ DHT_STACK_UNWIND(fxattrop, frame, -1, op_errno, NULL, NULL);
- return 0;
+ return 0;
}
+/* Currently no translators on top of 'distribute' will be using
+ * below fops, hence not implementing 'migration' related checks
+ */
-int
-dht_inodelk_cbk (call_frame_t *frame, void *cookie,
- xlator_t *this, int32_t op_ret, int32_t op_errno, dict_t *xdata)
+static int
+dht_inodelk_cbk(call_frame_t *frame, void *cookie, xlator_t *this,
+ int32_t op_ret, int32_t op_errno, dict_t *xdata)
{
- DHT_STACK_UNWIND (inodelk, frame, op_ret, op_errno, xdata);
- return 0;
+ dht_lk_inode_unref(frame, op_ret);
+ DHT_STACK_UNWIND(inodelk, frame, op_ret, op_errno, xdata);
+ return 0;
}
-
int32_t
-dht_inodelk (call_frame_t *frame, xlator_t *this, const char *volume,
- loc_t *loc, int32_t cmd, struct gf_flock *lock, dict_t *xdata)
+dht_inodelk(call_frame_t *frame, xlator_t *this, const char *volume, loc_t *loc,
+ int32_t cmd, struct gf_flock *lock, dict_t *xdata)
{
- xlator_t *subvol = NULL;
- int op_errno = -1;
- dht_local_t *local = NULL;
+ xlator_t *lock_subvol = NULL;
+ int op_errno = -1;
+ dht_local_t *local = NULL;
+ VALIDATE_OR_GOTO(frame, err);
+ VALIDATE_OR_GOTO(this, err);
+ VALIDATE_OR_GOTO(loc, err);
+ VALIDATE_OR_GOTO(loc->inode, err);
- VALIDATE_OR_GOTO (frame, err);
- VALIDATE_OR_GOTO (this, err);
- VALIDATE_OR_GOTO (loc, err);
- VALIDATE_OR_GOTO (loc->inode, err);
+ local = dht_local_init(frame, loc, NULL, GF_FOP_INODELK);
+ if (!local) {
+ op_errno = ENOMEM;
+ goto err;
+ }
- local = dht_local_init (frame, loc, NULL, GF_FOP_INODELK);
- if (!local) {
- op_errno = ENOMEM;
- goto err;
- }
+ local->lock_type = lock->l_type;
+ lock_subvol = dht_get_lock_subvolume(this, lock, local);
+ if (!lock_subvol) {
+ gf_msg_debug(this->name, 0, "no lock subvolume for path=%s", loc->path);
+ op_errno = EINVAL;
+ goto err;
+ }
- subvol = local->cached_subvol;
- if (!subvol) {
- gf_msg_debug (this->name, 0,
- "no cached subvolume for path=%s", loc->path);
- op_errno = EINVAL;
- goto err;
- }
+ local->call_cnt = 1;
- local->call_cnt = 1;
+ STACK_WIND(frame, dht_inodelk_cbk, lock_subvol, lock_subvol->fops->inodelk,
+ volume, loc, cmd, lock, xdata);
- STACK_WIND (frame,
- dht_inodelk_cbk,
- subvol, subvol->fops->inodelk,
- volume, loc, cmd, lock, xdata);
-
- return 0;
+ return 0;
err:
- op_errno = (op_errno == -1) ? errno : op_errno;
- DHT_STACK_UNWIND (inodelk, frame, -1, op_errno, NULL);
+ op_errno = (op_errno == -1) ? errno : op_errno;
+ DHT_STACK_UNWIND(inodelk, frame, -1, op_errno, NULL);
- return 0;
+ return 0;
}
-
int
-dht_finodelk_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
- int32_t op_ret, int32_t op_errno, dict_t *xdata)
+dht_finodelk_cbk(call_frame_t *frame, void *cookie, xlator_t *this,
+ int32_t op_ret, int32_t op_errno, dict_t *xdata)
{
- DHT_STACK_UNWIND (finodelk, frame, op_ret, op_errno, xdata);
- return 0;
-}
+ dht_local_t *local = NULL;
+ int ret = 0;
+ GF_VALIDATE_OR_GOTO("dht", frame, out);
+ GF_VALIDATE_OR_GOTO("dht", this, out);
+ GF_VALIDATE_OR_GOTO("dht", frame->local, out);
-int
-dht_finodelk (call_frame_t *frame, xlator_t *this, const char *volume,
- fd_t *fd, int32_t cmd, struct gf_flock *lock, dict_t *xdata)
-{
- xlator_t *subvol = NULL;
- int op_errno = -1;
-
- VALIDATE_OR_GOTO (frame, err);
- VALIDATE_OR_GOTO (this, err);
- VALIDATE_OR_GOTO (fd, err);
-
- subvol = dht_subvol_get_cached (this, fd->inode);
- if (!subvol) {
- gf_msg_debug (this->name, 0,
- "no cached subvolume for fd=%p", fd);
- op_errno = EINVAL;
- goto err;
- }
+ local = frame->local;
+ if (dht_check_remote_fd_failed_error(local, op_ret, op_errno)) {
+ ret = dht_check_and_open_fd_on_subvol(this, frame);
+ if (ret)
+ goto out;
+ return 0;
+ }
- STACK_WIND (frame, dht_finodelk_cbk, subvol, subvol->fops->finodelk,
- volume, fd, cmd, lock, xdata);
+out:
+ dht_lk_inode_unref(frame, op_ret);
+ DHT_STACK_UNWIND(finodelk, frame, op_ret, op_errno, xdata);
- return 0;
+ return 0;
+}
+
+int
+dht_finodelk(call_frame_t *frame, xlator_t *this, const char *volume, fd_t *fd,
+ int32_t cmd, struct gf_flock *lock, dict_t *xdata)
+{
+ xlator_t *lock_subvol = NULL;
+ dht_local_t *local = NULL;
+ int op_errno = -1;
+
+ VALIDATE_OR_GOTO(frame, err);
+ VALIDATE_OR_GOTO(this, err);
+ VALIDATE_OR_GOTO(fd, err);
+
+ local = dht_local_init(frame, NULL, fd, GF_FOP_INODELK);
+ if (!local) {
+ op_errno = ENOMEM;
+ goto err;
+ }
+
+ local->call_cnt = 1;
+ local->lock_type = lock->l_type;
+
+ lock_subvol = dht_get_lock_subvolume(this, lock, local);
+ if (!lock_subvol) {
+ gf_msg_debug(this->name, 0, "no lock subvolume for fd=%p", fd);
+ op_errno = EINVAL;
+ goto err;
+ }
+
+ /*
+ local->cached_subvol = lock_subvol;
+ ret = dht_check_and_open_fd_on_subvol (this, frame);
+ if (ret)
+ goto err;
+ */
+ local->rebalance.flock = *lock;
+ local->rebalance.lock_cmd = cmd;
+ local->key = gf_strdup(volume);
+
+ if (xdata)
+ local->xattr_req = dict_ref(xdata);
+
+ STACK_WIND(frame, dht_finodelk_cbk, lock_subvol,
+ lock_subvol->fops->finodelk, volume, fd, cmd, lock, xdata);
+
+ return 0;
err:
- op_errno = (op_errno == -1) ? errno : op_errno;
- DHT_STACK_UNWIND (finodelk, frame, -1, op_errno, NULL);
+ op_errno = (op_errno == -1) ? errno : op_errno;
+ DHT_STACK_UNWIND(finodelk, frame, -1, op_errno, NULL);
- return 0;
+ return 0;
}
diff --git a/xlators/cluster/dht/src/dht-inode-write.c b/xlators/cluster/dht/src/dht-inode-write.c
index 112685b659e..2f23ce90fbd 100644
--- a/xlators/cluster/dht/src/dht-inode-write.c
+++ b/xlators/cluster/dht/src/dht-inode-write.c
@@ -8,1201 +8,1397 @@
cases as published by the Free Software Foundation.
*/
-
#include "dht-common.h"
-int dht_writev2 (xlator_t *this, xlator_t *subvol,
- call_frame_t *frame, int ret);
-int dht_truncate2 (xlator_t *this, xlator_t *subvol,
- call_frame_t *frame, int ret);
-int dht_setattr2 (xlator_t *this, xlator_t *subvol,
- call_frame_t *frame, int ret);
-int dht_fallocate2 (xlator_t *this, xlator_t *subvol,
- call_frame_t *frame, int ret);
-int dht_discard2 (xlator_t *this, xlator_t *subvol,
- call_frame_t *frame, int ret);
-int dht_zerofill2 (xlator_t *this, xlator_t *subvol,
- call_frame_t *frame, int ret);
+static int
+dht_writev2(xlator_t *this, xlator_t *subvol, call_frame_t *frame, int ret);
+static int
+dht_truncate2(xlator_t *this, xlator_t *subvol, call_frame_t *frame, int ret);
+static int
+dht_setattr2(xlator_t *this, xlator_t *subvol, call_frame_t *frame, int ret);
+static int
+dht_fallocate2(xlator_t *this, xlator_t *subvol, call_frame_t *frame, int ret);
+static int
+dht_discard2(xlator_t *this, xlator_t *subvol, call_frame_t *frame, int ret);
+static int
+dht_zerofill2(xlator_t *this, xlator_t *subvol, call_frame_t *frame, int ret);
int
-dht_writev_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
- int op_ret, int op_errno, struct iatt *prebuf,
- struct iatt *postbuf, dict_t *xdata)
+dht_writev_cbk(call_frame_t *frame, void *cookie, xlator_t *this, int op_ret,
+ int op_errno, struct iatt *prebuf, struct iatt *postbuf,
+ dict_t *xdata)
{
- dht_local_t *local = NULL;
- call_frame_t *prev = NULL;
- int ret = -1;
- xlator_t *subvol1 = NULL;
- xlator_t *subvol2 = NULL;
-
- local = frame->local;
- prev = cookie;
-
- if (!local) {
- op_ret = -1;
- op_errno = EINVAL;
- goto out;
- }
+ dht_local_t *local = NULL;
+ xlator_t *prev = NULL;
+ int ret = -1;
+ xlator_t *subvol1 = NULL;
+ xlator_t *subvol2 = NULL;
+
+ local = frame->local;
+ prev = cookie;
+
+ if (!local) {
+ op_ret = -1;
+ op_errno = EINVAL;
+ goto out;
+ }
+
+ /* writev fails with EBADF if dht has not yet opened the fd
+ * on the cached subvol. This could happen if the file was migrated
+ * and a lookup updated the cached subvol in the inode ctx.
+ * We only check once as this could be a valid bad fd error.
+ */
+
+ if (dht_check_remote_fd_failed_error(local, op_ret, op_errno)) {
+ ret = dht_check_and_open_fd_on_subvol(this, frame);
+ if (ret)
+ goto out;
+ return 0;
+ }
- if (op_ret == -1 && !dht_inode_missing(op_errno)) {
- local->op_errno = op_errno;
+ if (op_ret == -1 && !dht_inode_missing(op_errno)) {
+ local->op_errno = op_errno;
+ local->op_ret = -1;
+ gf_msg_debug(this->name, 0, "subvolume %s returned -1 (%s)", prev->name,
+ strerror(op_errno));
+ goto out;
+ }
+
+ if (local->call_cnt != 1) {
+ /* preserve the modes of source */
+ if (local->stbuf.ia_blocks) {
+ dht_iatt_merge(this, postbuf, &local->stbuf);
+ dht_iatt_merge(this, prebuf, &local->prebuf);
+ }
+ goto out;
+ }
+
+ local->rebalance.target_op_fn = dht_writev2;
+
+ local->op_ret = op_ret;
+ local->op_errno = op_errno;
+
+ /* We might need to pass the stbuf information to the higher DHT
+ * layer for appropriate handling.
+ */
+
+ dht_set_local_rebalance(this, local, NULL, prebuf, postbuf, xdata);
+
+ /* Phase 2 of migration */
+ if ((op_ret == -1) || IS_DHT_MIGRATION_PHASE2(postbuf)) {
+ ret = dht_rebalance_complete_check(this, frame);
+ if (!ret)
+ return 0;
+ }
+
+ /* Check if the rebalance phase1 is true */
+ if (IS_DHT_MIGRATION_PHASE1(postbuf)) {
+ if (!local->xattr_req) {
+ local->xattr_req = dict_new();
+ if (!local->xattr_req) {
+ gf_msg(this->name, GF_LOG_ERROR, DHT_MSG_NO_MEMORY, ENOMEM,
+ "insufficient memory");
+ local->op_errno = ENOMEM;
local->op_ret = -1;
- gf_msg_debug (this->name, 0,
- "subvolume %s returned -1 (%s)",
- prev->this->name, strerror (op_errno));
goto out;
+ }
}
- if (local->call_cnt != 1) {
- /* preserve the modes of source */
- if (local->stbuf.ia_blocks) {
- dht_iatt_merge (this, postbuf, &local->stbuf, NULL);
- dht_iatt_merge (this, prebuf, &local->prebuf, NULL);
- }
- goto out;
+ ret = dict_set_uint32(local->xattr_req, GF_PROTECT_FROM_EXTERNAL_WRITES,
+ 1);
+ if (ret) {
+ gf_msg(this->name, GF_LOG_ERROR, DHT_MSG_DICT_SET_FAILED, 0,
+ "Failed to set key %s in dictionary",
+ GF_PROTECT_FROM_EXTERNAL_WRITES);
+ local->op_errno = ENOMEM;
+ local->op_ret = -1;
+ goto out;
}
- local->rebalance.target_op_fn = dht_writev2;
-
- local->op_ret = op_ret;
- local->op_errno = op_errno;
-
- /* We might need to pass the stbuf information to the higher DHT
- * layer for appropriate handling.
- */
-
- dht_set_local_rebalance (this, local, NULL, prebuf, postbuf, xdata);
-
- /* Phase 2 of migration */
- if ((op_ret == -1) || IS_DHT_MIGRATION_PHASE2 (postbuf)) {
- ret = dht_rebalance_complete_check (this, frame);
- if (!ret)
- return 0;
- }
+ dht_iatt_merge(this, &local->stbuf, postbuf);
+ dht_iatt_merge(this, &local->prebuf, prebuf);
- /* Check if the rebalance phase1 is true */
- if (IS_DHT_MIGRATION_PHASE1 (postbuf)) {
- dht_iatt_merge (this, &local->stbuf, postbuf, NULL);
- dht_iatt_merge (this, &local->prebuf, prebuf, NULL);
-
- ret = dht_inode_ctx_get_mig_info (this, local->fd->inode,
- &subvol1, &subvol2);
- if (!dht_mig_info_is_invalid (local->cached_subvol,
- subvol1, subvol2)) {
- if (dht_fd_open_on_dst (this, local->fd, subvol2)) {
- dht_writev2 (this, subvol2, frame, 0);
- return 0;
- }
- }
- ret = dht_rebalance_in_progress_check (this, frame);
- if (!ret)
- return 0;
+ ret = dht_inode_ctx_get_mig_info(this, local->fd->inode, &subvol1,
+ &subvol2);
+ if (!dht_mig_info_is_invalid(local->cached_subvol, subvol1, subvol2)) {
+ if (dht_fd_open_on_dst(this, local->fd, subvol2)) {
+ dht_writev2(this, subvol2, frame, 0);
+ return 0;
+ }
}
+ ret = dht_rebalance_in_progress_check(this, frame);
+ if (!ret)
+ return 0;
+ }
out:
- DHT_STRIP_PHASE1_FLAGS (postbuf);
- DHT_STRIP_PHASE1_FLAGS (prebuf);
+ DHT_STRIP_PHASE1_FLAGS(postbuf);
+ DHT_STRIP_PHASE1_FLAGS(prebuf);
- DHT_STACK_UNWIND (writev, frame, op_ret, op_errno, prebuf, postbuf,
- xdata);
+ DHT_STACK_UNWIND(writev, frame, op_ret, op_errno, prebuf, postbuf, xdata);
- return 0;
+ return 0;
}
-int
-dht_writev2 (xlator_t *this, xlator_t *subvol, call_frame_t *frame, int ret)
+static int
+dht_writev2(xlator_t *this, xlator_t *subvol, call_frame_t *frame, int ret)
{
- dht_local_t *local = NULL;
- int32_t op_errno = EINVAL;
+ dht_local_t *local = NULL;
+ int32_t op_errno = EINVAL;
- if ((frame == NULL) || (frame->local == NULL))
- goto out;
+ if ((frame == NULL) || (frame->local == NULL))
+ goto out;
- local = frame->local;
- op_errno = local->op_errno;
-
- if (we_are_not_migrating (ret)) {
- /* This dht xlator is not migrating the file. Unwind and
- * pass on the original mode bits so the higher DHT layer
- * can handle this.
- */
- DHT_STACK_UNWIND (writev, frame, local->op_ret,
- local->op_errno, &local->rebalance.prebuf,
- &local->rebalance.postbuf,
- local->rebalance.xdata);
- return 0;
- }
+ local = frame->local;
+ op_errno = local->op_errno;
+ if (we_are_not_migrating(ret)) {
+ /* This dht xlator is not migrating the file. Unwind and
+ * pass on the original mode bits so the higher DHT layer
+ * can handle this.
+ */
+ DHT_STACK_UNWIND(writev, frame, local->op_ret, local->op_errno,
+ &local->rebalance.prebuf, &local->rebalance.postbuf,
+ local->rebalance.xdata);
+ return 0;
+ }
- if (subvol == NULL)
- goto out;
+ if (subvol == NULL)
+ goto out;
- local->call_cnt = 2; /* This is the second attempt */
+ local->call_cnt = 2; /* This is the second attempt */
- STACK_WIND (frame, dht_writev_cbk,
- subvol, subvol->fops->writev,
- local->fd, local->rebalance.vector, local->rebalance.count,
- local->rebalance.offset, local->rebalance.flags,
- local->rebalance.iobref, NULL);
+ STACK_WIND_COOKIE(frame, dht_writev_cbk, subvol, subvol,
+ subvol->fops->writev, local->fd, local->rebalance.vector,
+ local->rebalance.count, local->rebalance.offset,
+ local->rebalance.flags, local->rebalance.iobref,
+ local->xattr_req);
- return 0;
+ return 0;
out:
- DHT_STACK_UNWIND (writev, frame, -1, op_errno, NULL, NULL, NULL);
+ DHT_STACK_UNWIND(writev, frame, -1, op_errno, NULL, NULL, NULL);
- return 0;
+ return 0;
}
int
-dht_writev (call_frame_t *frame, xlator_t *this, fd_t *fd,
- struct iovec *vector, int count, off_t off, uint32_t flags,
- struct iobref *iobref, dict_t *xdata)
+dht_writev(call_frame_t *frame, xlator_t *this, fd_t *fd, struct iovec *vector,
+ int count, off_t off, uint32_t flags, struct iobref *iobref,
+ dict_t *xdata)
{
- xlator_t *subvol = NULL;
- int op_errno = -1;
- dht_local_t *local = NULL;
-
- VALIDATE_OR_GOTO (frame, err);
- VALIDATE_OR_GOTO (this, err);
- VALIDATE_OR_GOTO (fd, err);
-
- local = dht_local_init (frame, NULL, fd, GF_FOP_WRITE);
- if (!local) {
-
- op_errno = ENOMEM;
- goto err;
- }
-
- subvol = local->cached_subvol;
- if (!subvol) {
- gf_msg_debug (this->name, 0,
- "no cached subvolume for fd=%p", fd);
- op_errno = EINVAL;
- goto err;
- }
-
-
- local->rebalance.vector = iov_dup (vector, count);
- local->rebalance.offset = off;
- local->rebalance.count = count;
- local->rebalance.flags = flags;
- local->rebalance.iobref = iobref_ref (iobref);
- local->call_cnt = 1;
-
- STACK_WIND (frame, dht_writev_cbk,
- subvol, subvol->fops->writev,
- fd, vector, count, off, flags, iobref, xdata);
-
- return 0;
+ xlator_t *subvol = NULL;
+ int op_errno = -1;
+ dht_local_t *local = NULL;
+
+ VALIDATE_OR_GOTO(frame, err);
+ VALIDATE_OR_GOTO(this, err);
+ VALIDATE_OR_GOTO(fd, err);
+
+ local = dht_local_init(frame, NULL, fd, GF_FOP_WRITE);
+ if (!local) {
+ op_errno = ENOMEM;
+ goto err;
+ }
+
+ subvol = local->cached_subvol;
+ if (!subvol) {
+ gf_msg_debug(this->name, 0, "no cached subvolume for fd=%p", fd);
+ op_errno = EINVAL;
+ goto err;
+ }
+
+ if (xdata)
+ local->xattr_req = dict_ref(xdata);
+
+ local->rebalance.vector = iov_dup(vector, count);
+ local->rebalance.offset = off;
+ local->rebalance.count = count;
+ local->rebalance.flags = flags;
+ local->rebalance.iobref = iobref_ref(iobref);
+ local->call_cnt = 1;
+
+ STACK_WIND_COOKIE(frame, dht_writev_cbk, subvol, subvol,
+ subvol->fops->writev, fd, local->rebalance.vector,
+ local->rebalance.count, local->rebalance.offset,
+ local->rebalance.flags, local->rebalance.iobref,
+ local->xattr_req);
+
+ return 0;
err:
- op_errno = (op_errno == -1) ? errno : op_errno;
- DHT_STACK_UNWIND (writev, frame, -1, op_errno, NULL, NULL, NULL);
+ op_errno = (op_errno == -1) ? errno : op_errno;
+ DHT_STACK_UNWIND(writev, frame, -1, op_errno, NULL, NULL, NULL);
- return 0;
+ return 0;
}
-
-
int
-dht_truncate_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
- int op_ret, int op_errno, struct iatt *prebuf,
- struct iatt *postbuf, dict_t *xdata)
+dht_truncate_cbk(call_frame_t *frame, void *cookie, xlator_t *this, int op_ret,
+ int op_errno, struct iatt *prebuf, struct iatt *postbuf,
+ dict_t *xdata)
{
- dht_local_t *local = NULL;
- call_frame_t *prev = NULL;
- int ret = -1;
- xlator_t *src_subvol = NULL;
- xlator_t *dst_subvol = NULL;
- inode_t *inode = NULL;
-
- GF_VALIDATE_OR_GOTO ("dht", frame, err);
- GF_VALIDATE_OR_GOTO ("dht", this, out);
- GF_VALIDATE_OR_GOTO ("dht", frame->local, out);
- GF_VALIDATE_OR_GOTO ("dht", cookie, out);
-
- local = frame->local;
- prev = cookie;
-
- if ((op_ret == -1) && !dht_inode_missing(op_errno)) {
- local->op_errno = op_errno;
- local->op_ret = -1;
- gf_msg_debug (this->name, op_errno,
- "subvolume %s returned -1",
- prev->this->name);
+ dht_local_t *local = NULL;
+ xlator_t *prev = NULL;
+ int ret = -1;
+ xlator_t *src_subvol = NULL;
+ xlator_t *dst_subvol = NULL;
+ inode_t *inode = NULL;
+
+ GF_VALIDATE_OR_GOTO("dht", frame, err);
+ GF_VALIDATE_OR_GOTO("dht", this, out);
+ GF_VALIDATE_OR_GOTO("dht", frame->local, out);
+ GF_VALIDATE_OR_GOTO("dht", cookie, out);
+
+ local = frame->local;
+ prev = cookie;
+
+ /* Needs to be checked only for ftruncate.
+ * ftruncate fails with EBADF/EINVAL if dht has not yet opened the fd
+ * on the cached subvol. This could happen if the file was migrated
+ * and a lookup updated the cached subvol in the inode ctx.
+ * We only check once as this could actually be a valid error.
+ */
+
+ if ((local->fop == GF_FOP_FTRUNCATE) &&
+ dht_check_remote_fd_failed_error(local, op_ret, op_errno)) {
+ ret = dht_check_and_open_fd_on_subvol(this, frame);
+ if (ret)
+ goto out;
+ return 0;
+ }
- goto out;
- }
+ if ((op_ret == -1) && !dht_inode_missing(op_errno)) {
+ local->op_errno = op_errno;
+ local->op_ret = -1;
+ gf_msg_debug(this->name, op_errno, "subvolume %s returned -1",
+ prev->name);
- if (local->call_cnt != 1) {
- if (local->stbuf.ia_blocks) {
- dht_iatt_merge (this, postbuf, &local->stbuf, NULL);
- dht_iatt_merge (this, prebuf, &local->prebuf, NULL);
- }
- goto out;
+ goto out;
+ }
+
+ if (local->call_cnt != 1) {
+ if (local->stbuf.ia_blocks) {
+ dht_iatt_merge(this, postbuf, &local->stbuf);
+ dht_iatt_merge(this, prebuf, &local->prebuf);
}
+ goto out;
+ }
- local->rebalance.target_op_fn = dht_truncate2;
+ local->rebalance.target_op_fn = dht_truncate2;
- local->op_ret = op_ret;
- local->op_errno = op_errno;
+ local->op_ret = op_ret;
+ local->op_errno = op_errno;
- /* We might need to pass the stbuf information to the higher DHT
- * layer for appropriate handling.
- */
+ /* We might need to pass the stbuf information to the higher DHT
+ * layer for appropriate handling.
+ */
- dht_set_local_rebalance (this, local, NULL, prebuf, postbuf, xdata);
+ dht_set_local_rebalance(this, local, NULL, prebuf, postbuf, xdata);
- /* Phase 2 of migration */
- if ((op_ret == -1) || IS_DHT_MIGRATION_PHASE2 (postbuf)) {
- ret = dht_rebalance_complete_check (this, frame);
- if (!ret)
- return 0;
- }
+ /* Phase 2 of migration */
+ if ((op_ret == -1) || IS_DHT_MIGRATION_PHASE2(postbuf)) {
+ ret = dht_rebalance_complete_check(this, frame);
+ if (!ret)
+ return 0;
+ }
- /* Check if the rebalance phase1 is true */
- if (IS_DHT_MIGRATION_PHASE1 (postbuf)) {
- dht_iatt_merge (this, &local->stbuf, postbuf, NULL);
- dht_iatt_merge (this, &local->prebuf, prebuf, NULL);
-
- inode = (local->fd) ? local->fd->inode : local->loc.inode;
-
- dht_inode_ctx_get_mig_info (this, inode, &src_subvol,
- &dst_subvol);
- if (!dht_mig_info_is_invalid (local->cached_subvol,
- src_subvol, dst_subvol)) {
- if ((!local->fd) || ((local->fd) &&
- dht_fd_open_on_dst (this, local->fd, dst_subvol))) {
- dht_truncate2 (this, dst_subvol, frame, 0);
- return 0;
- }
- }
- ret = dht_rebalance_in_progress_check (this, frame);
- if (!ret)
- return 0;
+ /* Check if the rebalance phase1 is true */
+ if (IS_DHT_MIGRATION_PHASE1(postbuf)) {
+ dht_iatt_merge(this, &local->stbuf, postbuf);
+ dht_iatt_merge(this, &local->prebuf, prebuf);
+
+ inode = (local->fd) ? local->fd->inode : local->loc.inode;
+
+ dht_inode_ctx_get_mig_info(this, inode, &src_subvol, &dst_subvol);
+ if (!dht_mig_info_is_invalid(local->cached_subvol, src_subvol,
+ dst_subvol)) {
+ if ((!local->fd) ||
+ ((local->fd) &&
+ dht_fd_open_on_dst(this, local->fd, dst_subvol))) {
+ dht_truncate2(this, dst_subvol, frame, 0);
+ return 0;
+ }
}
+ ret = dht_rebalance_in_progress_check(this, frame);
+ if (!ret)
+ return 0;
+ }
out:
- DHT_STRIP_PHASE1_FLAGS (postbuf);
- DHT_STRIP_PHASE1_FLAGS (prebuf);
+ DHT_STRIP_PHASE1_FLAGS(postbuf);
+ DHT_STRIP_PHASE1_FLAGS(prebuf);
- DHT_STACK_UNWIND (truncate, frame, op_ret, op_errno,
- prebuf, postbuf, xdata);
+ DHT_STACK_UNWIND(truncate, frame, op_ret, op_errno, prebuf, postbuf, xdata);
err:
- return 0;
+ return 0;
}
-
-int
-dht_truncate2 (xlator_t *this, xlator_t *subvol, call_frame_t *frame, int ret)
+static int
+dht_truncate2(xlator_t *this, xlator_t *subvol, call_frame_t *frame, int ret)
{
- dht_local_t *local = NULL;
- int32_t op_errno = EINVAL;
+ dht_local_t *local = NULL;
+ int32_t op_errno = EINVAL;
- if (!frame || !frame->local)
- goto out;
-
- local = frame->local;
- op_errno = local->op_errno;
+ if (!frame || !frame->local)
+ goto out;
- /* This dht xlator is not migrating the file */
- if (we_are_not_migrating (ret)) {
+ local = frame->local;
+ op_errno = local->op_errno;
- DHT_STACK_UNWIND (truncate, frame, local->op_ret,
- local->op_errno, &local->rebalance.prebuf,
- &local->rebalance.postbuf,
- local->rebalance.xdata);
- return 0;
- }
+ /* This dht xlator is not migrating the file */
+ if (we_are_not_migrating(ret)) {
+ DHT_STACK_UNWIND(truncate, frame, local->op_ret, local->op_errno,
+ &local->rebalance.prebuf, &local->rebalance.postbuf,
+ local->rebalance.xdata);
+ return 0;
+ }
- if (subvol == NULL)
- goto out;
+ if (subvol == NULL)
+ goto out;
- local->call_cnt = 2; /* This is the second attempt */
+ local->call_cnt = 2; /* This is the second attempt */
- if (local->fop == GF_FOP_TRUNCATE) {
- STACK_WIND (frame, dht_truncate_cbk, subvol,
- subvol->fops->truncate, &local->loc,
- local->rebalance.offset, NULL);
- } else {
- STACK_WIND (frame, dht_truncate_cbk, subvol,
- subvol->fops->ftruncate, local->fd,
- local->rebalance.offset, NULL);
- }
+ if (local->fop == GF_FOP_TRUNCATE) {
+ STACK_WIND_COOKIE(frame, dht_truncate_cbk, subvol, subvol,
+ subvol->fops->truncate, &local->loc,
+ local->rebalance.offset, local->xattr_req);
+ } else {
+ STACK_WIND_COOKIE(frame, dht_truncate_cbk, subvol, subvol,
+ subvol->fops->ftruncate, local->fd,
+ local->rebalance.offset, local->xattr_req);
+ }
- return 0;
+ return 0;
out:
- DHT_STACK_UNWIND (truncate, frame, -1, op_errno, NULL, NULL, NULL);
- return 0;
+ DHT_STACK_UNWIND(truncate, frame, -1, op_errno, NULL, NULL, NULL);
+ return 0;
}
int
-dht_truncate (call_frame_t *frame, xlator_t *this, loc_t *loc, off_t offset,
- dict_t *xdata)
+dht_truncate(call_frame_t *frame, xlator_t *this, loc_t *loc, off_t offset,
+ dict_t *xdata)
{
- xlator_t *subvol = NULL;
- int op_errno = -1;
- dht_local_t *local = NULL;
-
- VALIDATE_OR_GOTO (frame, err);
- VALIDATE_OR_GOTO (this, err);
- VALIDATE_OR_GOTO (loc, err);
- VALIDATE_OR_GOTO (loc->inode, err);
-
- local = dht_local_init (frame, loc, NULL, GF_FOP_TRUNCATE);
- if (!local) {
- op_errno = ENOMEM;
- goto err;
- }
-
- local->rebalance.offset = offset;
- local->call_cnt = 1;
- subvol = local->cached_subvol;
- if (!subvol) {
- gf_msg_debug (this->name, 0,
- "no cached subvolume for gfid=%s",
- uuid_utoa (loc->inode->gfid));
- op_errno = EINVAL;
- goto err;
- }
-
- STACK_WIND (frame, dht_truncate_cbk,
- subvol, subvol->fops->truncate,
- loc, offset, xdata);
-
- return 0;
+ xlator_t *subvol = NULL;
+ int op_errno = -1;
+ dht_local_t *local = NULL;
+
+ VALIDATE_OR_GOTO(frame, err);
+ VALIDATE_OR_GOTO(this, err);
+ VALIDATE_OR_GOTO(loc, err);
+ VALIDATE_OR_GOTO(loc->inode, err);
+
+ local = dht_local_init(frame, loc, NULL, GF_FOP_TRUNCATE);
+ if (!local) {
+ op_errno = ENOMEM;
+ goto err;
+ }
+
+ local->rebalance.offset = offset;
+ local->call_cnt = 1;
+ subvol = local->cached_subvol;
+ if (!subvol) {
+ gf_msg_debug(this->name, 0, "no cached subvolume for gfid=%s",
+ uuid_utoa(loc->inode->gfid));
+ op_errno = EINVAL;
+ goto err;
+ }
+
+ if (xdata)
+ local->xattr_req = dict_ref(xdata);
+
+ STACK_WIND_COOKIE(frame, dht_truncate_cbk, subvol, subvol,
+ subvol->fops->truncate, loc, offset, xdata);
+
+ return 0;
err:
- op_errno = (op_errno == -1) ? errno : op_errno;
- DHT_STACK_UNWIND (truncate, frame, -1, op_errno, NULL, NULL, NULL);
+ op_errno = (op_errno == -1) ? errno : op_errno;
+ DHT_STACK_UNWIND(truncate, frame, -1, op_errno, NULL, NULL, NULL);
- return 0;
+ return 0;
}
int
-dht_ftruncate (call_frame_t *frame, xlator_t *this, fd_t *fd, off_t offset,
- dict_t *xdata)
+dht_ftruncate(call_frame_t *frame, xlator_t *this, fd_t *fd, off_t offset,
+ dict_t *xdata)
{
- xlator_t *subvol = NULL;
- int op_errno = -1;
- dht_local_t *local = NULL;
-
- VALIDATE_OR_GOTO (frame, err);
- VALIDATE_OR_GOTO (this, err);
- VALIDATE_OR_GOTO (fd, err);
-
- local = dht_local_init (frame, NULL, fd, GF_FOP_FTRUNCATE);
- if (!local) {
- op_errno = ENOMEM;
- goto err;
- }
-
- local->rebalance.offset = offset;
- local->call_cnt = 1;
- subvol = local->cached_subvol;
- if (!subvol) {
- gf_msg_debug (this->name, 0,
- "no cached subvolume for fd=%p", fd);
- op_errno = EINVAL;
- goto err;
- }
-
- STACK_WIND (frame, dht_truncate_cbk,
- subvol, subvol->fops->ftruncate,
- fd, offset, xdata);
-
- return 0;
+ xlator_t *subvol = NULL;
+ int op_errno = -1;
+ dht_local_t *local = NULL;
+
+ VALIDATE_OR_GOTO(frame, err);
+ VALIDATE_OR_GOTO(this, err);
+ VALIDATE_OR_GOTO(fd, err);
+
+ local = dht_local_init(frame, NULL, fd, GF_FOP_FTRUNCATE);
+ if (!local) {
+ op_errno = ENOMEM;
+ goto err;
+ }
+
+ local->rebalance.offset = offset;
+ local->call_cnt = 1;
+ subvol = local->cached_subvol;
+ if (!subvol) {
+ gf_msg_debug(this->name, 0, "no cached subvolume for fd=%p", fd);
+ op_errno = EINVAL;
+ goto err;
+ }
+
+ if (xdata)
+ local->xattr_req = dict_ref(xdata);
+
+ STACK_WIND_COOKIE(frame, dht_truncate_cbk, subvol, subvol,
+ subvol->fops->ftruncate, fd, local->rebalance.offset,
+ local->xattr_req);
+ return 0;
err:
- op_errno = (op_errno == -1) ? errno : op_errno;
- DHT_STACK_UNWIND (ftruncate, frame, -1, op_errno, NULL, NULL, NULL);
+ op_errno = (op_errno == -1) ? errno : op_errno;
+ DHT_STACK_UNWIND(ftruncate, frame, -1, op_errno, NULL, NULL, NULL);
- return 0;
+ return 0;
}
-
int
-dht_fallocate_cbk(call_frame_t *frame, void *cookie, xlator_t *this,
- int op_ret, int op_errno, struct iatt *prebuf,
- struct iatt *postbuf, dict_t *xdata)
+dht_fallocate_cbk(call_frame_t *frame, void *cookie, xlator_t *this, int op_ret,
+ int op_errno, struct iatt *prebuf, struct iatt *postbuf,
+ dict_t *xdata)
{
- dht_local_t *local = NULL;
- call_frame_t *prev = NULL;
- int ret = -1;
- xlator_t *src_subvol = NULL;
- xlator_t *dst_subvol = NULL;
-
- GF_VALIDATE_OR_GOTO ("dht", frame, err);
- GF_VALIDATE_OR_GOTO ("dht", this, out);
- GF_VALIDATE_OR_GOTO ("dht", frame->local, out);
- GF_VALIDATE_OR_GOTO ("dht", cookie, out);
-
- local = frame->local;
- prev = cookie;
-
- if ((op_ret == -1) && !dht_inode_missing(op_errno)) {
- local->op_errno = op_errno;
- local->op_ret = -1;
- gf_msg_debug (this->name, op_errno,
- "subvolume %s returned -1",
- prev->this->name);
-
- goto out;
- }
-
- if (local->call_cnt != 1) {
- if (local->stbuf.ia_blocks) {
- dht_iatt_merge (this, postbuf, &local->stbuf, NULL);
- dht_iatt_merge (this, prebuf, &local->prebuf, NULL);
- }
- goto out;
- }
+ dht_local_t *local = NULL;
+ xlator_t *prev = NULL;
+ int ret = -1;
+ xlator_t *src_subvol = NULL;
+ xlator_t *dst_subvol = NULL;
+
+ GF_VALIDATE_OR_GOTO("dht", frame, err);
+ GF_VALIDATE_OR_GOTO("dht", this, out);
+ GF_VALIDATE_OR_GOTO("dht", frame->local, out);
+ GF_VALIDATE_OR_GOTO("dht", cookie, out);
+
+ local = frame->local;
+ prev = cookie;
+
+ /* fallocate fails with EBADF if dht has not yet opened the fd
+ * on the cached subvol. This could happen if the file was migrated
+ * and a lookup updated the cached subvol in the inode ctx.
+ * We only check once as this could actually be a valid error.
+ */
+
+ if (dht_check_remote_fd_failed_error(local, op_ret, op_errno)) {
+ ret = dht_check_and_open_fd_on_subvol(this, frame);
+ if (ret)
+ goto out;
+ return 0;
+ }
- local->op_ret = op_ret;
+ if ((op_ret == -1) && !dht_inode_missing(op_errno)) {
local->op_errno = op_errno;
- local->rebalance.target_op_fn = dht_fallocate2;
+ local->op_ret = -1;
+ gf_msg_debug(this->name, op_errno, "subvolume %s returned -1",
+ prev->name);
- dht_set_local_rebalance (this, local, NULL, prebuf, postbuf, xdata);
+ goto out;
+ }
- /* Phase 2 of migration */
- if ((op_ret == -1) || IS_DHT_MIGRATION_PHASE2 (postbuf)) {
- ret = dht_rebalance_complete_check (this, frame);
- if (!ret)
- return 0;
+ if (local->call_cnt != 1) {
+ if (local->stbuf.ia_blocks) {
+ dht_iatt_merge(this, postbuf, &local->stbuf);
+ dht_iatt_merge(this, prebuf, &local->prebuf);
}
-
- /* Check if the rebalance phase1 is true */
- if (IS_DHT_MIGRATION_PHASE1 (postbuf)) {
- dht_iatt_merge (this, &local->stbuf, postbuf, NULL);
- dht_iatt_merge (this, &local->prebuf, prebuf, NULL);
-
- dht_inode_ctx_get_mig_info (this, local->fd->inode, &src_subvol,
- &dst_subvol);
- if (!dht_mig_info_is_invalid (local->cached_subvol,
- src_subvol, dst_subvol)) {
- if (dht_fd_open_on_dst (this, local->fd, dst_subvol)) {
- dht_fallocate2 (this, dst_subvol, frame, 0);
- return 0;
- }
- }
- ret = dht_rebalance_in_progress_check (this, frame);
- if (!ret)
- return 0;
+ goto out;
+ }
+
+ local->op_ret = op_ret;
+ local->op_errno = op_errno;
+ local->rebalance.target_op_fn = dht_fallocate2;
+
+ dht_set_local_rebalance(this, local, NULL, prebuf, postbuf, xdata);
+
+ /* Phase 2 of migration */
+ if ((op_ret == -1) || IS_DHT_MIGRATION_PHASE2(postbuf)) {
+ ret = dht_rebalance_complete_check(this, frame);
+ if (!ret)
+ return 0;
+ }
+
+ /* Check if the rebalance phase1 is true */
+ if (IS_DHT_MIGRATION_PHASE1(postbuf)) {
+ dht_iatt_merge(this, &local->stbuf, postbuf);
+ dht_iatt_merge(this, &local->prebuf, prebuf);
+
+ dht_inode_ctx_get_mig_info(this, local->fd->inode, &src_subvol,
+ &dst_subvol);
+ if (!dht_mig_info_is_invalid(local->cached_subvol, src_subvol,
+ dst_subvol)) {
+ if (dht_fd_open_on_dst(this, local->fd, dst_subvol)) {
+ dht_fallocate2(this, dst_subvol, frame, 0);
+ return 0;
+ }
}
+ ret = dht_rebalance_in_progress_check(this, frame);
+ if (!ret)
+ return 0;
+ }
out:
- DHT_STRIP_PHASE1_FLAGS (postbuf);
- DHT_STRIP_PHASE1_FLAGS (prebuf);
+ DHT_STRIP_PHASE1_FLAGS(postbuf);
+ DHT_STRIP_PHASE1_FLAGS(prebuf);
- DHT_STACK_UNWIND (fallocate, frame, op_ret, op_errno,
- prebuf, postbuf, xdata);
+ DHT_STACK_UNWIND(fallocate, frame, op_ret, op_errno, prebuf, postbuf,
+ xdata);
err:
- return 0;
+ return 0;
}
-int
-dht_fallocate2 (xlator_t *this, xlator_t *subvol, call_frame_t *frame, int ret)
+static int
+dht_fallocate2(xlator_t *this, xlator_t *subvol, call_frame_t *frame, int ret)
{
- dht_local_t *local = NULL;
- int32_t op_errno = EINVAL;
+ dht_local_t *local = NULL;
+ int32_t op_errno = EINVAL;
- if (!frame || !frame->local)
- goto out;
+ if (!frame || !frame->local)
+ goto out;
- local = frame->local;
- op_errno = local->op_errno;
-
- if (we_are_not_migrating (ret)) {
- /* This dht xlator is not migrating the file. Unwind and
- * pass on the original mode bits so the higher DHT layer
- * can handle this.
- */
- DHT_STACK_UNWIND (fallocate, frame, local->op_ret,
- local->op_errno,
- &local->rebalance.prebuf,
- &local->rebalance.postbuf,
- local->rebalance.xdata);
- return 0;
- }
+ local = frame->local;
+ op_errno = local->op_errno;
- if (subvol == NULL)
- goto out;
+ if (we_are_not_migrating(ret)) {
+ /* This dht xlator is not migrating the file. Unwind and
+ * pass on the original mode bits so the higher DHT layer
+ * can handle this.
+ */
+ DHT_STACK_UNWIND(fallocate, frame, local->op_ret, local->op_errno,
+ &local->rebalance.prebuf, &local->rebalance.postbuf,
+ local->rebalance.xdata);
+ return 0;
+ }
- local->call_cnt = 2; /* This is the second attempt */
+ if (subvol == NULL)
+ goto out;
- STACK_WIND(frame, dht_fallocate_cbk, subvol, subvol->fops->fallocate,
- local->fd, local->rebalance.flags, local->rebalance.offset,
- local->rebalance.size, NULL);
+ local->call_cnt = 2; /* This is the second attempt */
- return 0;
+ STACK_WIND_COOKIE(frame, dht_fallocate_cbk, subvol, subvol,
+ subvol->fops->fallocate, local->fd,
+ local->rebalance.flags, local->rebalance.offset,
+ local->rebalance.size, local->xattr_req);
+
+ return 0;
out:
- DHT_STACK_UNWIND (fallocate, frame, -1, op_errno, NULL, NULL, NULL);
- return 0;
+ DHT_STACK_UNWIND(fallocate, frame, -1, op_errno, NULL, NULL, NULL);
+ return 0;
}
int
dht_fallocate(call_frame_t *frame, xlator_t *this, fd_t *fd, int32_t mode,
- off_t offset, size_t len, dict_t *xdata)
+ off_t offset, size_t len, dict_t *xdata)
{
- xlator_t *subvol = NULL;
- int op_errno = -1;
- dht_local_t *local = NULL;
-
- VALIDATE_OR_GOTO (frame, err);
- VALIDATE_OR_GOTO (this, err);
- VALIDATE_OR_GOTO (fd, err);
-
- local = dht_local_init (frame, NULL, fd, GF_FOP_FALLOCATE);
- if (!local) {
- op_errno = ENOMEM;
- goto err;
- }
-
- local->rebalance.flags = mode;
- local->rebalance.offset = offset;
- local->rebalance.size = len;
-
- local->call_cnt = 1;
- subvol = local->cached_subvol;
- if (!subvol) {
- gf_msg_debug (this->name, 0,
- "no cached subvolume for fd=%p", fd);
- op_errno = EINVAL;
- goto err;
- }
-
- STACK_WIND (frame, dht_fallocate_cbk,
- subvol, subvol->fops->fallocate,
- fd, mode, offset, len, xdata);
-
- return 0;
+ xlator_t *subvol = NULL;
+ int op_errno = -1;
+ dht_local_t *local = NULL;
+
+ VALIDATE_OR_GOTO(frame, err);
+ VALIDATE_OR_GOTO(this, err);
+ VALIDATE_OR_GOTO(fd, err);
+
+ local = dht_local_init(frame, NULL, fd, GF_FOP_FALLOCATE);
+ if (!local) {
+ op_errno = ENOMEM;
+ goto err;
+ }
+
+ local->rebalance.flags = mode;
+ local->rebalance.offset = offset;
+ local->rebalance.size = len;
+
+ local->call_cnt = 1;
+ subvol = local->cached_subvol;
+ if (!subvol) {
+ gf_msg_debug(this->name, 0, "no cached subvolume for fd=%p", fd);
+ op_errno = EINVAL;
+ goto err;
+ }
+
+ if (xdata)
+ local->xattr_req = dict_ref(xdata);
+
+ STACK_WIND_COOKIE(frame, dht_fallocate_cbk, subvol, subvol,
+ subvol->fops->fallocate, fd, local->rebalance.flags,
+ local->rebalance.offset, local->rebalance.size,
+ local->xattr_req);
+
+ return 0;
err:
- op_errno = (op_errno == -1) ? errno : op_errno;
- DHT_STACK_UNWIND (fallocate, frame, -1, op_errno, NULL, NULL, NULL);
+ op_errno = (op_errno == -1) ? errno : op_errno;
+ DHT_STACK_UNWIND(fallocate, frame, -1, op_errno, NULL, NULL, NULL);
- return 0;
+ return 0;
}
-
int
-dht_discard_cbk(call_frame_t *frame, void *cookie, xlator_t *this,
- int op_ret, int op_errno, struct iatt *prebuf,
- struct iatt *postbuf, dict_t *xdata)
+dht_discard_cbk(call_frame_t *frame, void *cookie, xlator_t *this, int op_ret,
+ int op_errno, struct iatt *prebuf, struct iatt *postbuf,
+ dict_t *xdata)
{
- dht_local_t *local = NULL;
- call_frame_t *prev = NULL;
- int ret = -1;
- xlator_t *src_subvol = NULL;
- xlator_t *dst_subvol = NULL;
-
- GF_VALIDATE_OR_GOTO ("dht", frame, err);
- GF_VALIDATE_OR_GOTO ("dht", this, out);
- GF_VALIDATE_OR_GOTO ("dht", frame->local, out);
- GF_VALIDATE_OR_GOTO ("dht", cookie, out);
-
- local = frame->local;
- prev = cookie;
-
- if ((op_ret == -1) && !dht_inode_missing(op_errno)) {
- local->op_errno = op_errno;
- local->op_ret = -1;
- gf_msg_debug (this->name, op_errno,
- "subvolume %s returned -1",
- prev->this->name);
-
- goto out;
- }
-
- if (local->call_cnt != 1) {
- if (local->stbuf.ia_blocks) {
- dht_iatt_merge (this, postbuf, &local->stbuf, NULL);
- dht_iatt_merge (this, prebuf, &local->prebuf, NULL);
- }
- goto out;
- }
+ dht_local_t *local = NULL;
+ xlator_t *prev = NULL;
+ int ret = -1;
+ xlator_t *src_subvol = NULL;
+ xlator_t *dst_subvol = NULL;
+
+ GF_VALIDATE_OR_GOTO("dht", frame, err);
+ GF_VALIDATE_OR_GOTO("dht", this, out);
+ GF_VALIDATE_OR_GOTO("dht", frame->local, out);
+ GF_VALIDATE_OR_GOTO("dht", cookie, out);
+
+ local = frame->local;
+ prev = cookie;
+
+ /* discard fails with EBADF if dht has not yet opened the fd
+ * on the cached subvol. This could happen if the file was migrated
+ * and a lookup updated the cached subvol in the inode ctx.
+ * We only check once as this could actually be a valid error.
+ */
+ if (dht_check_remote_fd_failed_error(local, op_ret, op_errno)) {
+ ret = dht_check_and_open_fd_on_subvol(this, frame);
+ if (ret)
+ goto out;
+ return 0;
+ }
- local->rebalance.target_op_fn = dht_discard2;
- local->op_ret = op_ret;
+ if ((op_ret == -1) && !dht_inode_missing(op_errno)) {
local->op_errno = op_errno;
+ local->op_ret = -1;
+ gf_msg_debug(this->name, op_errno, "subvolume %s returned -1",
+ prev->name);
- dht_set_local_rebalance (this, local, NULL, prebuf, postbuf, xdata);
+ goto out;
+ }
- /* Phase 2 of migration */
- if ((op_ret == -1) || IS_DHT_MIGRATION_PHASE2 (postbuf)) {
- ret = dht_rebalance_complete_check (this, frame);
- if (!ret)
- return 0;
+ if (local->call_cnt != 1) {
+ if (local->stbuf.ia_blocks) {
+ dht_iatt_merge(this, postbuf, &local->stbuf);
+ dht_iatt_merge(this, prebuf, &local->prebuf);
}
-
- /* Check if the rebalance phase1 is true */
- if (IS_DHT_MIGRATION_PHASE1 (postbuf)) {
- dht_iatt_merge (this, &local->stbuf, postbuf, NULL);
- dht_iatt_merge (this, &local->prebuf, prebuf, NULL);
-
- dht_inode_ctx_get_mig_info (this, local->fd->inode, &src_subvol,
- &dst_subvol);
- if (!dht_mig_info_is_invalid(local->cached_subvol,
- src_subvol, dst_subvol)) {
- if (dht_fd_open_on_dst (this, local->fd, dst_subvol)) {
- dht_discard2 (this, dst_subvol, frame, 0);
- return 0;
- }
- }
- ret = dht_rebalance_in_progress_check (this, frame);
- if (!ret)
- return 0;
+ goto out;
+ }
+
+ local->rebalance.target_op_fn = dht_discard2;
+ local->op_ret = op_ret;
+ local->op_errno = op_errno;
+
+ dht_set_local_rebalance(this, local, NULL, prebuf, postbuf, xdata);
+
+ /* Phase 2 of migration */
+ if ((op_ret == -1) || IS_DHT_MIGRATION_PHASE2(postbuf)) {
+ ret = dht_rebalance_complete_check(this, frame);
+ if (!ret)
+ return 0;
+ }
+
+ /* Check if the rebalance phase1 is true */
+ if (IS_DHT_MIGRATION_PHASE1(postbuf)) {
+ dht_iatt_merge(this, &local->stbuf, postbuf);
+ dht_iatt_merge(this, &local->prebuf, prebuf);
+
+ dht_inode_ctx_get_mig_info(this, local->fd->inode, &src_subvol,
+ &dst_subvol);
+ if (!dht_mig_info_is_invalid(local->cached_subvol, src_subvol,
+ dst_subvol)) {
+ if (dht_fd_open_on_dst(this, local->fd, dst_subvol)) {
+ dht_discard2(this, dst_subvol, frame, 0);
+ return 0;
+ }
}
+ ret = dht_rebalance_in_progress_check(this, frame);
+ if (!ret)
+ return 0;
+ }
out:
- DHT_STRIP_PHASE1_FLAGS (postbuf);
- DHT_STRIP_PHASE1_FLAGS (prebuf);
+ DHT_STRIP_PHASE1_FLAGS(postbuf);
+ DHT_STRIP_PHASE1_FLAGS(prebuf);
- DHT_STACK_UNWIND (discard, frame, op_ret, op_errno,
- prebuf, postbuf, xdata);
+ DHT_STACK_UNWIND(discard, frame, op_ret, op_errno, prebuf, postbuf, xdata);
err:
- return 0;
+ return 0;
}
-int
-dht_discard2 (xlator_t *this, xlator_t *subvol, call_frame_t *frame, int ret)
+static int
+dht_discard2(xlator_t *this, xlator_t *subvol, call_frame_t *frame, int ret)
{
- dht_local_t *local = NULL;
- int32_t op_errno = EINVAL;
+ dht_local_t *local = NULL;
+ int32_t op_errno = EINVAL;
- if (!frame || !frame->local)
- goto out;
+ if (!frame || !frame->local)
+ goto out;
- local = frame->local;
- op_errno = local->op_errno;
-
- if (we_are_not_migrating (ret)) {
- /* This dht xlator is not migrating the file. Unwind and
- * pass on the original mode bits so the higher DHT layer
- * can handle this.
- */
- DHT_STACK_UNWIND (discard, frame, local->op_ret,
- local->op_errno,
- &local->rebalance.prebuf,
- &local->rebalance.postbuf,
- local->rebalance.xdata);
- return 0;
- }
+ local = frame->local;
+ op_errno = local->op_errno;
- if (subvol == NULL)
- goto out;
+ if (we_are_not_migrating(ret)) {
+ /* This dht xlator is not migrating the file. Unwind and
+ * pass on the original mode bits so the higher DHT layer
+ * can handle this.
+ */
+ DHT_STACK_UNWIND(discard, frame, local->op_ret, local->op_errno,
+ &local->rebalance.prebuf, &local->rebalance.postbuf,
+ local->rebalance.xdata);
+ return 0;
+ }
- local->call_cnt = 2; /* This is the second attempt */
+ if (subvol == NULL)
+ goto out;
- STACK_WIND(frame, dht_discard_cbk, subvol, subvol->fops->discard,
- local->fd, local->rebalance.offset, local->rebalance.size,
- NULL);
+ local->call_cnt = 2; /* This is the second attempt */
- return 0;
+ STACK_WIND_COOKIE(frame, dht_discard_cbk, subvol, subvol,
+ subvol->fops->discard, local->fd, local->rebalance.offset,
+ local->rebalance.size, local->xattr_req);
+
+ return 0;
out:
- DHT_STACK_UNWIND (discard, frame, -1, op_errno, NULL, NULL, NULL);
- return 0;
+ DHT_STACK_UNWIND(discard, frame, -1, op_errno, NULL, NULL, NULL);
+ return 0;
}
int
dht_discard(call_frame_t *frame, xlator_t *this, fd_t *fd, off_t offset,
- size_t len, dict_t *xdata)
+ size_t len, dict_t *xdata)
{
- xlator_t *subvol = NULL;
- int op_errno = -1;
- dht_local_t *local = NULL;
-
- VALIDATE_OR_GOTO (frame, err);
- VALIDATE_OR_GOTO (this, err);
- VALIDATE_OR_GOTO (fd, err);
-
- local = dht_local_init (frame, NULL, fd, GF_FOP_DISCARD);
- if (!local) {
- op_errno = ENOMEM;
- goto err;
- }
+ xlator_t *subvol = NULL;
+ int op_errno = -1;
+ dht_local_t *local = NULL;
- local->rebalance.offset = offset;
- local->rebalance.size = len;
+ VALIDATE_OR_GOTO(frame, err);
+ VALIDATE_OR_GOTO(this, err);
+ VALIDATE_OR_GOTO(fd, err);
- local->call_cnt = 1;
- subvol = local->cached_subvol;
- if (!subvol) {
- gf_msg_debug (this->name, 0,
- "no cached subvolume for fd=%p", fd);
- op_errno = EINVAL;
- goto err;
- }
+ local = dht_local_init(frame, NULL, fd, GF_FOP_DISCARD);
+ if (!local) {
+ op_errno = ENOMEM;
+ goto err;
+ }
- STACK_WIND (frame, dht_discard_cbk, subvol, subvol->fops->discard,
- fd, offset, len, xdata);
+ local->rebalance.offset = offset;
+ local->rebalance.size = len;
- return 0;
+ local->call_cnt = 1;
+ subvol = local->cached_subvol;
+ if (!subvol) {
+ gf_msg_debug(this->name, 0, "no cached subvolume for fd=%p", fd);
+ op_errno = EINVAL;
+ goto err;
+ }
+
+ if (xdata)
+ local->xattr_req = dict_ref(xdata);
+
+ STACK_WIND_COOKIE(frame, dht_discard_cbk, subvol, subvol,
+ subvol->fops->discard, fd, local->rebalance.offset,
+ local->rebalance.size, local->xattr_req);
+
+ return 0;
err:
- op_errno = (op_errno == -1) ? errno : op_errno;
- DHT_STACK_UNWIND (discard, frame, -1, op_errno, NULL, NULL, NULL);
+ op_errno = (op_errno == -1) ? errno : op_errno;
+ DHT_STACK_UNWIND(discard, frame, -1, op_errno, NULL, NULL, NULL);
- return 0;
+ return 0;
}
int
-dht_zerofill_cbk(call_frame_t *frame, void *cookie, xlator_t *this,
- int op_ret, int op_errno, struct iatt *prebuf,
- struct iatt *postbuf, dict_t *xdata)
+dht_zerofill_cbk(call_frame_t *frame, void *cookie, xlator_t *this, int op_ret,
+ int op_errno, struct iatt *prebuf, struct iatt *postbuf,
+ dict_t *xdata)
{
- dht_local_t *local = NULL;
- call_frame_t *prev = NULL;
- int ret = -1;
- xlator_t *subvol1 = NULL, *subvol2 = NULL;
-
- GF_VALIDATE_OR_GOTO ("dht", frame, err);
- GF_VALIDATE_OR_GOTO ("dht", this, out);
- GF_VALIDATE_OR_GOTO ("dht", frame->local, out);
- GF_VALIDATE_OR_GOTO ("dht", cookie, out);
-
- local = frame->local;
- prev = cookie;
-
- if ((op_ret == -1) && !dht_inode_missing(op_errno)) {
- local->op_errno = op_errno;
- local->op_ret = -1;
- gf_msg_debug (this->name, op_errno,
- "subvolume %s returned -1",
- prev->this->name);
- goto out;
- }
-
- if (local->call_cnt != 1) {
- if (local->stbuf.ia_blocks) {
- dht_iatt_merge (this, postbuf, &local->stbuf, NULL);
- dht_iatt_merge (this, prebuf, &local->prebuf, NULL);
- }
- goto out;
- }
+ dht_local_t *local = NULL;
+ xlator_t *prev = NULL;
+ int ret = -1;
+ xlator_t *subvol1 = NULL, *subvol2 = NULL;
+
+ GF_VALIDATE_OR_GOTO("dht", frame, err);
+ GF_VALIDATE_OR_GOTO("dht", this, out);
+ GF_VALIDATE_OR_GOTO("dht", frame->local, out);
+ GF_VALIDATE_OR_GOTO("dht", cookie, out);
+
+ local = frame->local;
+ prev = cookie;
+
+ /* zerofill fails with EBADF if dht has not yet opened the fd
+ * on the cached subvol. This could happen if the file was migrated
+ * and a lookup updated the cached subvol in the inode ctx.
+ * We only check once as this could actually be a valid error.
+ */
+ if (dht_check_remote_fd_failed_error(local, op_ret, op_errno)) {
+ ret = dht_check_and_open_fd_on_subvol(this, frame);
+ if (ret)
+ goto out;
+ return 0;
+ }
- local->rebalance.target_op_fn = dht_zerofill2;
- local->op_ret = op_ret;
+ if ((op_ret == -1) && !dht_inode_missing(op_errno)) {
local->op_errno = op_errno;
-
- dht_set_local_rebalance (this, local, NULL, prebuf, postbuf, xdata);
-
- /* Phase 2 of migration */
- if ((op_ret == -1) || IS_DHT_MIGRATION_PHASE2 (postbuf)) {
- ret = dht_rebalance_complete_check (this, frame);
- if (!ret)
- return 0;
+ local->op_ret = -1;
+ gf_msg_debug(this->name, op_errno, "subvolume %s returned -1",
+ prev->name);
+ goto out;
+ }
+
+ if (local->call_cnt != 1) {
+ if (local->stbuf.ia_blocks) {
+ dht_iatt_merge(this, postbuf, &local->stbuf);
+ dht_iatt_merge(this, prebuf, &local->prebuf);
}
-
- /* Check if the rebalance phase1 is true */
- if (IS_DHT_MIGRATION_PHASE1 (postbuf)) {
- dht_iatt_merge (this, &local->stbuf, postbuf, NULL);
- dht_iatt_merge (this, &local->prebuf, prebuf, NULL);
-
- ret = dht_inode_ctx_get_mig_info (this, local->fd->inode,
- &subvol1, &subvol2);
- if (!dht_mig_info_is_invalid (local->cached_subvol,
- subvol1, subvol2)) {
- if (dht_fd_open_on_dst (this, local->fd, subvol2)) {
- dht_zerofill2 (this, subvol2, frame, 0);
- return 0;
- }
- }
-
- ret = dht_rebalance_in_progress_check (this, frame);
- if (!ret)
- return 0;
+ goto out;
+ }
+
+ local->rebalance.target_op_fn = dht_zerofill2;
+ local->op_ret = op_ret;
+ local->op_errno = op_errno;
+
+ dht_set_local_rebalance(this, local, NULL, prebuf, postbuf, xdata);
+
+ /* Phase 2 of migration */
+ if ((op_ret == -1) || IS_DHT_MIGRATION_PHASE2(postbuf)) {
+ ret = dht_rebalance_complete_check(this, frame);
+ if (!ret)
+ return 0;
+ }
+
+ /* Check if the rebalance phase1 is true */
+ if (IS_DHT_MIGRATION_PHASE1(postbuf)) {
+ dht_iatt_merge(this, &local->stbuf, postbuf);
+ dht_iatt_merge(this, &local->prebuf, prebuf);
+
+ ret = dht_inode_ctx_get_mig_info(this, local->fd->inode, &subvol1,
+ &subvol2);
+ if (!dht_mig_info_is_invalid(local->cached_subvol, subvol1, subvol2)) {
+ if (dht_fd_open_on_dst(this, local->fd, subvol2)) {
+ dht_zerofill2(this, subvol2, frame, 0);
+ return 0;
+ }
}
+ ret = dht_rebalance_in_progress_check(this, frame);
+ if (!ret)
+ return 0;
+ }
+
out:
- DHT_STRIP_PHASE1_FLAGS (postbuf);
- DHT_STRIP_PHASE1_FLAGS (prebuf);
+ DHT_STRIP_PHASE1_FLAGS(postbuf);
+ DHT_STRIP_PHASE1_FLAGS(prebuf);
- DHT_STACK_UNWIND (zerofill, frame, op_ret, op_errno,
- prebuf, postbuf, xdata);
+ DHT_STACK_UNWIND(zerofill, frame, op_ret, op_errno, prebuf, postbuf, xdata);
err:
- return 0;
+ return 0;
}
-int
-dht_zerofill2 (xlator_t *this, xlator_t *subvol, call_frame_t *frame, int ret)
+static int
+dht_zerofill2(xlator_t *this, xlator_t *subvol, call_frame_t *frame, int ret)
{
- dht_local_t *local = NULL;
- int32_t op_errno = EINVAL;
+ dht_local_t *local = NULL;
+ int32_t op_errno = EINVAL;
- if (!frame || !frame->local)
- goto out;
+ if (!frame || !frame->local)
+ goto out;
- local = frame->local;
+ local = frame->local;
- op_errno = local->op_errno;
+ op_errno = local->op_errno;
- if (we_are_not_migrating (ret)) {
- /* This dht xlator is not migrating the file. Unwind and
- * pass on the original mode bits so the higher DHT layer
- * can handle this.
- */
- DHT_STACK_UNWIND (zerofill, frame, local->op_ret,
- local->op_errno,
- &local->rebalance.prebuf,
- &local->rebalance.postbuf,
- local->rebalance.xdata);
+ if (we_are_not_migrating(ret)) {
+ /* This dht xlator is not migrating the file. Unwind and
+ * pass on the original mode bits so the higher DHT layer
+ * can handle this.
+ */
+ DHT_STACK_UNWIND(zerofill, frame, local->op_ret, local->op_errno,
+ &local->rebalance.prebuf, &local->rebalance.postbuf,
+ local->rebalance.xdata);
- return 0;
- }
+ return 0;
+ }
- if (subvol == NULL)
- goto out;
+ if (subvol == NULL)
+ goto out;
- local->call_cnt = 2; /* This is the second attempt */
+ local->call_cnt = 2; /* This is the second attempt */
- STACK_WIND(frame, dht_zerofill_cbk, subvol, subvol->fops->zerofill,
- local->fd, local->rebalance.offset, local->rebalance.size,
- NULL);
+ STACK_WIND_COOKIE(frame, dht_zerofill_cbk, subvol, subvol,
+ subvol->fops->zerofill, local->fd,
+ local->rebalance.offset, local->rebalance.size,
+ local->xattr_req);
- return 0;
+ return 0;
out:
- DHT_STACK_UNWIND (zerofill, frame, -1, op_errno, NULL, NULL, NULL);
- return 0;
+ DHT_STACK_UNWIND(zerofill, frame, -1, op_errno, NULL, NULL, NULL);
+ return 0;
}
int
dht_zerofill(call_frame_t *frame, xlator_t *this, fd_t *fd, off_t offset,
- off_t len, dict_t *xdata)
+ off_t len, dict_t *xdata)
{
- xlator_t *subvol = NULL;
- int op_errno = -1;
- dht_local_t *local = NULL;
-
- VALIDATE_OR_GOTO (frame, err);
- VALIDATE_OR_GOTO (this, err);
- VALIDATE_OR_GOTO (fd, err);
-
- local = dht_local_init (frame, NULL, fd, GF_FOP_ZEROFILL);
- if (!local) {
- op_errno = ENOMEM;
- goto err;
- }
+ xlator_t *subvol = NULL;
+ int op_errno = -1;
+ dht_local_t *local = NULL;
- local->rebalance.offset = offset;
- local->rebalance.size = len;
+ VALIDATE_OR_GOTO(frame, err);
+ VALIDATE_OR_GOTO(this, err);
+ VALIDATE_OR_GOTO(fd, err);
- local->call_cnt = 1;
- subvol = local->cached_subvol;
- if (!subvol) {
- gf_msg_debug (this->name, 0,
- "no cached subvolume for fd=%p", fd);
- op_errno = EINVAL;
- goto err;
- }
+ local = dht_local_init(frame, NULL, fd, GF_FOP_ZEROFILL);
+ if (!local) {
+ op_errno = ENOMEM;
+ goto err;
+ }
- STACK_WIND (frame, dht_zerofill_cbk, subvol, subvol->fops->zerofill,
- fd, offset, len, xdata);
+ local->rebalance.offset = offset;
+ local->rebalance.size = len;
- return 0;
+ local->call_cnt = 1;
+ subvol = local->cached_subvol;
+ if (!subvol) {
+ gf_msg_debug(this->name, 0, "no cached subvolume for fd=%p", fd);
+ op_errno = EINVAL;
+ goto err;
+ }
-err:
- op_errno = (op_errno == -1) ? errno : op_errno;
- DHT_STACK_UNWIND (zerofill, frame, -1, op_errno, NULL, NULL, NULL);
+ if (xdata)
+ local->xattr_req = dict_ref(xdata);
- return 0;
-}
+ STACK_WIND_COOKIE(frame, dht_zerofill_cbk, subvol, subvol,
+ subvol->fops->zerofill, fd, local->rebalance.offset,
+ local->rebalance.size, local->xattr_req);
+
+ return 0;
+err:
+ op_errno = (op_errno == -1) ? errno : op_errno;
+ DHT_STACK_UNWIND(zerofill, frame, -1, op_errno, NULL, NULL, NULL);
+ return 0;
+}
/* handle cases of migration here for 'setattr()' calls */
int
-dht_file_setattr_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
- int op_ret, int op_errno, struct iatt *prebuf,
- struct iatt *postbuf, dict_t *xdata)
+dht_file_setattr_cbk(call_frame_t *frame, void *cookie, xlator_t *this,
+ int op_ret, int op_errno, struct iatt *prebuf,
+ struct iatt *postbuf, dict_t *xdata)
{
- dht_local_t *local = NULL;
- call_frame_t *prev = NULL;
- int ret = -1;
+ dht_local_t *local = NULL;
+ xlator_t *prev = NULL;
+ int ret = -1;
- local = frame->local;
- prev = cookie;
+ local = frame->local;
+ prev = cookie;
- local->op_errno = op_errno;
- if ((op_ret == -1) && !dht_inode_missing(op_errno)) {
- gf_msg_debug (this->name, op_errno,
- "subvolume %s returned -1",
- prev->this->name);
- goto out;
- }
+ local->op_errno = op_errno;
- if (local->call_cnt != 1)
- goto out;
+ if ((local->fop == GF_FOP_FSETATTR) &&
+ dht_check_remote_fd_failed_error(local, op_ret, op_errno)) {
+ ret = dht_check_and_open_fd_on_subvol(this, frame);
+ if (ret)
+ goto out;
+ return 0;
+ }
- local->op_ret = op_ret;
- local->op_errno = op_errno;
+ if ((op_ret == -1) && !dht_inode_missing(op_errno)) {
+ gf_msg_debug(this->name, op_errno, "subvolume %s returned -1",
+ prev->name);
+ goto out;
+ }
- local->rebalance.target_op_fn = dht_setattr2;
+ if (local->call_cnt != 1)
+ goto out;
+ local->op_ret = op_ret;
+ local->op_errno = op_errno;
- /* Phase 2 of migration */
- if ((op_ret == -1) || IS_DHT_MIGRATION_PHASE2 (postbuf)) {
+ local->rebalance.target_op_fn = dht_setattr2;
- dht_set_local_rebalance (this, local, NULL, prebuf,
- postbuf, xdata);
+ /* Phase 2 of migration */
+ if ((op_ret == -1) || IS_DHT_MIGRATION_PHASE2(postbuf)) {
+ dht_set_local_rebalance(this, local, NULL, prebuf, postbuf, xdata);
- ret = dht_rebalance_complete_check (this, frame);
- if (!ret)
- return 0;
- }
+ ret = dht_rebalance_complete_check(this, frame);
+ if (!ret)
+ return 0;
+ }
- /* At the end of the migration process, whatever 'attr' we
- have on source file will be migrated to destination file
- in one shot, hence we don't need to check for in progress
- state here (ie, PHASE1) */
+ /* At the end of the migration process, whatever 'attr' we
+ have on source file will be migrated to destination file
+ in one shot, hence we don't need to check for in progress
+ state here (ie, PHASE1) */
out:
- DHT_STRIP_PHASE1_FLAGS (postbuf);
- DHT_STRIP_PHASE1_FLAGS (prebuf);
+ DHT_STRIP_PHASE1_FLAGS(postbuf);
+ DHT_STRIP_PHASE1_FLAGS(prebuf);
- DHT_STACK_UNWIND (setattr, frame, op_ret, op_errno,
- prebuf, postbuf, xdata);
+ DHT_STACK_UNWIND(setattr, frame, op_ret, op_errno, prebuf, postbuf, xdata);
- return 0;
+ return 0;
}
-int
-dht_setattr2 (xlator_t *this, xlator_t *subvol, call_frame_t *frame, int ret)
+static int
+dht_setattr2(xlator_t *this, xlator_t *subvol, call_frame_t *frame, int ret)
{
- dht_local_t *local = NULL;
- int32_t op_errno = EINVAL;
+ dht_local_t *local = NULL;
+ int32_t op_errno = EINVAL;
- if (!frame || !frame->local)
- goto out;
+ if (!frame || !frame->local)
+ goto out;
- local = frame->local;
- op_errno = local->op_errno;
-
- if (we_are_not_migrating (ret)) {
- /* This dht xlator is not migrating the file. Unwind and
- * pass on the original mode bits so the higher DHT layer
- * can handle this.
- */
- DHT_STACK_UNWIND (setattr, frame, local->op_ret,
- local->op_errno,
- &local->rebalance.prebuf,
- &local->rebalance.postbuf,
- local->rebalance.xdata);
- return 0;
- }
+ local = frame->local;
+ op_errno = local->op_errno;
- if (subvol == NULL)
- goto out;
+ if (we_are_not_migrating(ret)) {
+ /* This dht xlator is not migrating the file. Unwind and
+ * pass on the original mode bits so the higher DHT layer
+ * can handle this.
+ */
+ DHT_STACK_UNWIND(setattr, frame, local->op_ret, local->op_errno,
+ &local->rebalance.prebuf, &local->rebalance.postbuf,
+ local->rebalance.xdata);
+ return 0;
+ }
- local->call_cnt = 2; /* This is the second attempt */
-
- if (local->fop == GF_FOP_SETATTR) {
- STACK_WIND (frame, dht_file_setattr_cbk, subvol,
- subvol->fops->setattr, &local->loc,
- &local->rebalance.stbuf, local->rebalance.flags,
- NULL);
- } else {
- STACK_WIND (frame, dht_file_setattr_cbk, subvol,
- subvol->fops->fsetattr, local->fd,
- &local->rebalance.stbuf, local->rebalance.flags,
- NULL);
- }
+ if (subvol == NULL)
+ goto out;
- return 0;
+ local->call_cnt = 2; /* This is the second attempt */
+
+ if (local->fop == GF_FOP_SETATTR) {
+ STACK_WIND_COOKIE(frame, dht_file_setattr_cbk, subvol, subvol,
+ subvol->fops->setattr, &local->loc,
+ &local->rebalance.stbuf, local->rebalance.flags,
+ local->xattr_req);
+ } else {
+ STACK_WIND_COOKIE(frame, dht_file_setattr_cbk, subvol, subvol,
+ subvol->fops->fsetattr, local->fd,
+ &local->rebalance.stbuf, local->rebalance.flags,
+ local->xattr_req);
+ }
+
+ return 0;
out:
- DHT_STACK_UNWIND (setattr, frame, -1, op_errno, NULL, NULL, NULL);
- return 0;
+ DHT_STACK_UNWIND(setattr, frame, -1, op_errno, NULL, NULL, NULL);
+ return 0;
}
-
/* Keep the existing code same for all the cases other than regular file */
int
-dht_setattr_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
- int op_ret, int op_errno, struct iatt *statpre,
- struct iatt *statpost, dict_t *xdata)
+dht_setattr_cbk(call_frame_t *frame, void *cookie, xlator_t *this, int op_ret,
+ int op_errno, struct iatt *statpre, struct iatt *statpost,
+ dict_t *xdata)
{
- dht_local_t *local = NULL;
- int this_call_cnt = 0;
- call_frame_t *prev = NULL;
-
-
- local = frame->local;
- prev = cookie;
-
- LOCK (&frame->lock);
- {
- if (op_ret == -1) {
- local->op_errno = op_errno;
- gf_msg_debug (this->name, op_errno,
- "subvolume %s returned -1",
- prev->this->name);
- goto unlock;
- }
-
- dht_iatt_merge (this, &local->prebuf, statpre, prev->this);
- dht_iatt_merge (this, &local->stbuf, statpost, prev->this);
-
- local->op_ret = 0;
+ dht_local_t *local = NULL;
+ int this_call_cnt = 0;
+ xlator_t *prev = NULL;
+
+ local = frame->local;
+ prev = cookie;
+
+ LOCK(&frame->lock);
+ {
+ if (op_ret == -1) {
+ local->op_errno = op_errno;
+ UNLOCK(&frame->lock);
+ gf_msg_debug(this->name, op_errno, "subvolume %s returned -1",
+ prev->name);
+ goto post_unlock;
}
-unlock:
- UNLOCK (&frame->lock);
-
- this_call_cnt = dht_frame_return (frame);
- if (is_last_call (this_call_cnt)) {
- if (local->op_ret == 0)
- dht_inode_ctx_time_set (local->loc.inode, this,
- &local->stbuf);
- DHT_STACK_UNWIND (setattr, frame, local->op_ret, local->op_errno,
- &local->prebuf, &local->stbuf, xdata);
- }
- return 0;
+ dht_iatt_merge(this, &local->prebuf, statpre);
+ dht_iatt_merge(this, &local->stbuf, statpost);
+
+ local->op_ret = 0;
+ local->op_errno = 0;
+ }
+ UNLOCK(&frame->lock);
+post_unlock:
+ this_call_cnt = dht_frame_return(frame);
+ if (is_last_call(this_call_cnt)) {
+ if (local->op_ret == 0)
+ dht_inode_ctx_time_set(local->loc.inode, this, &local->stbuf);
+ DHT_STACK_UNWIND(setattr, frame, local->op_ret, local->op_errno,
+ &local->prebuf, &local->stbuf, xdata);
+ }
+
+ return 0;
}
-
+/* Keep the existing code same for all the cases other than regular file */
int
-dht_setattr (call_frame_t *frame, xlator_t *this, loc_t *loc,
- struct iatt *stbuf, int32_t valid, dict_t *xdata)
+dht_non_mds_setattr_cbk(call_frame_t *frame, void *cookie, xlator_t *this,
+ int op_ret, int op_errno, struct iatt *statpre,
+ struct iatt *statpost, dict_t *xdata)
{
- xlator_t *subvol = NULL;
- dht_layout_t *layout = NULL;
- dht_local_t *local = NULL;
- int op_errno = -1;
- int i = -1;
- int call_cnt = 0;
-
- VALIDATE_OR_GOTO (frame, err);
- VALIDATE_OR_GOTO (this, err);
- VALIDATE_OR_GOTO (loc, err);
- VALIDATE_OR_GOTO (loc->inode, err);
- VALIDATE_OR_GOTO (loc->path, err);
-
- local = dht_local_init (frame, loc, NULL, GF_FOP_SETATTR);
- if (!local) {
- op_errno = ENOMEM;
- goto err;
- }
+ dht_local_t *local = NULL;
+ int this_call_cnt = 0;
+ xlator_t *prev = NULL;
+
+ local = frame->local;
+ prev = cookie;
+
+ if (op_ret == -1) {
+ gf_msg(this->name, op_errno, 0, 0, "subvolume %s returned -1",
+ prev->name);
+ goto post_unlock;
+ }
+
+ LOCK(&frame->lock);
+ {
+ dht_iatt_merge(this, &local->prebuf, statpre);
+ dht_iatt_merge(this, &local->stbuf, statpost);
+
+ local->op_ret = 0;
+ local->op_errno = 0;
+ }
+ UNLOCK(&frame->lock);
+post_unlock:
+ this_call_cnt = dht_frame_return(frame);
+ if (is_last_call(this_call_cnt)) {
+ dht_inode_ctx_time_set(local->loc.inode, this, &local->stbuf);
+ DHT_STACK_UNWIND(setattr, frame, 0, 0, &local->prebuf, &local->stbuf,
+ xdata);
+ }
+
+ return 0;
+}
- layout = local->layout;
- if (!layout) {
- gf_msg_debug (this->name, 0,
- "no layout for path=%s", loc->path);
- op_errno = EINVAL;
- goto err;
- }
+int
+dht_mds_setattr_cbk(call_frame_t *frame, void *cookie, xlator_t *this,
+ int op_ret, int op_errno, struct iatt *statpre,
+ struct iatt *statpost, dict_t *xdata)
- if (!layout_is_sane (layout)) {
- gf_msg_debug (this->name, 0,
- "layout is not sane for path=%s", loc->path);
- op_errno = EINVAL;
- goto err;
- }
+{
+ dht_local_t *local = NULL;
+ dht_conf_t *conf = NULL;
+ xlator_t *prev = NULL;
+ xlator_t *mds_subvol = NULL;
+ struct iatt loc_stbuf = {
+ 0,
+ };
+ int i = 0;
+
+ local = frame->local;
+ prev = cookie;
+ conf = this->private;
+ mds_subvol = local->mds_subvol;
+
+ if (op_ret == -1) {
+ local->op_ret = op_ret;
+ local->op_errno = op_errno;
+ gf_msg_debug(this->name, op_errno, "subvolume %s returned -1",
+ prev->name);
+ goto out;
+ }
+
+ local->op_ret = 0;
+ loc_stbuf = local->stbuf;
+ dht_iatt_merge(this, &local->prebuf, statpre);
+ dht_iatt_merge(this, &local->stbuf, statpost);
+
+ local->call_cnt = conf->subvolume_cnt - 1;
+ for (i = 0; i < conf->subvolume_cnt; i++) {
+ if (mds_subvol == conf->subvolumes[i])
+ continue;
+ STACK_WIND_COOKIE(frame, dht_non_mds_setattr_cbk, conf->subvolumes[i],
+ conf->subvolumes[i],
+ conf->subvolumes[i]->fops->setattr, &local->loc,
+ &loc_stbuf, local->valid, local->xattr_req);
+ }
+
+ return 0;
+out:
+ DHT_STACK_UNWIND(setattr, frame, local->op_ret, local->op_errno,
+ &local->prebuf, &local->stbuf, xdata);
- if (IA_ISREG (loc->inode->ia_type)) {
- /* in the regular file _cbk(), we need to check for
- migration possibilities */
- local->rebalance.stbuf = *stbuf;
- local->rebalance.flags = valid;
- local->call_cnt = 1;
- subvol = local->cached_subvol;
+ return 0;
+}
- STACK_WIND (frame, dht_file_setattr_cbk, subvol,
- subvol->fops->setattr,
- loc, stbuf, valid, xdata);
+int
+dht_setattr(call_frame_t *frame, xlator_t *this, loc_t *loc, struct iatt *stbuf,
+ int32_t valid, dict_t *xdata)
+{
+ xlator_t *subvol = NULL;
+ xlator_t *mds_subvol = NULL;
+ dht_layout_t *layout = NULL;
+ dht_local_t *local = NULL;
+ int op_errno = -1;
+ int i = -1;
+ int ret = -1;
+ int call_cnt = 0;
+ dht_conf_t *conf = NULL;
+
+ VALIDATE_OR_GOTO(frame, err);
+ VALIDATE_OR_GOTO(this, err);
+ VALIDATE_OR_GOTO(loc, err);
+ VALIDATE_OR_GOTO(loc->inode, err);
+ VALIDATE_OR_GOTO(loc->path, err);
+
+ conf = this->private;
+ local = dht_local_init(frame, loc, NULL, GF_FOP_SETATTR);
+ if (!local) {
+ op_errno = ENOMEM;
+ goto err;
+ }
+
+ layout = local->layout;
+ if (!layout) {
+ gf_msg_debug(this->name, 0, "no layout for path=%s", loc->path);
+ op_errno = EINVAL;
+ goto err;
+ }
+
+ if (!layout_is_sane(layout)) {
+ gf_msg_debug(this->name, 0, "layout is not sane for path=%s",
+ loc->path);
+ op_errno = EINVAL;
+ goto err;
+ }
+ if (xdata)
+ local->xattr_req = dict_ref(xdata);
+
+ if (IA_ISREG(loc->inode->ia_type)) {
+ /* in the regular file _cbk(), we need to check for
+ migration possibilities */
+ local->rebalance.stbuf = *stbuf;
+ local->rebalance.flags = valid;
+ local->call_cnt = 1;
+ subvol = local->cached_subvol;
- return 0;
+ STACK_WIND_COOKIE(frame, dht_file_setattr_cbk, subvol, subvol,
+ subvol->fops->setattr, loc, stbuf, valid, xdata);
+
+ return 0;
+ }
+
+ local->call_cnt = call_cnt = layout->cnt;
+
+ if (IA_ISDIR(loc->inode->ia_type) && !__is_root_gfid(loc->inode->gfid) &&
+ call_cnt != 1) {
+ ret = dht_inode_ctx_mdsvol_get(loc->inode, this, &mds_subvol);
+ if (ret || !mds_subvol) {
+ gf_msg(this->name, GF_LOG_ERROR, 0,
+ DHT_MSG_HASHED_SUBVOL_GET_FAILED,
+ "Failed to get mds subvol for path %s", local->loc.path);
+ op_errno = EINVAL;
+ goto err;
}
- local->call_cnt = call_cnt = layout->cnt;
+ local->mds_subvol = mds_subvol;
+ for (i = 0; i < conf->subvolume_cnt; i++) {
+ if (conf->subvolumes[i] == mds_subvol) {
+ if (!conf->subvolume_status[i]) {
+ gf_msg(this->name, GF_LOG_WARNING, layout->list[i].err,
+ DHT_MSG_HASHED_SUBVOL_DOWN,
+ "MDS subvol is down for path "
+ " %s Unable to set attr ",
+ local->loc.path);
+ op_errno = ENOTCONN;
+ goto err;
+ }
+ }
+ }
+ local->valid = valid;
+ local->stbuf = *stbuf;
+ STACK_WIND_COOKIE(frame, dht_mds_setattr_cbk, local->mds_subvol,
+ local->mds_subvol, local->mds_subvol->fops->setattr,
+ loc, stbuf, valid, xdata);
+ return 0;
+ } else {
for (i = 0; i < call_cnt; i++) {
- STACK_WIND (frame, dht_setattr_cbk,
- layout->list[i].xlator,
- layout->list[i].xlator->fops->setattr,
- loc, stbuf, valid, xdata);
+ STACK_WIND_COOKIE(frame, dht_setattr_cbk, layout->list[i].xlator,
+ layout->list[i].xlator,
+ layout->list[i].xlator->fops->setattr, loc, stbuf,
+ valid, xdata);
}
+ }
- return 0;
+ return 0;
err:
- op_errno = (op_errno == -1) ? errno : op_errno;
- DHT_STACK_UNWIND (setattr, frame, -1, op_errno, NULL, NULL, NULL);
+ op_errno = (op_errno == -1) ? errno : op_errno;
+ DHT_STACK_UNWIND(setattr, frame, -1, op_errno, NULL, NULL, NULL);
- return 0;
+ return 0;
}
-
int
-dht_fsetattr (call_frame_t *frame, xlator_t *this, fd_t *fd, struct iatt *stbuf,
- int32_t valid, dict_t *xdata)
+dht_fsetattr(call_frame_t *frame, xlator_t *this, fd_t *fd, struct iatt *stbuf,
+ int32_t valid, dict_t *xdata)
{
- xlator_t *subvol = NULL;
- dht_layout_t *layout = NULL;
- dht_local_t *local = NULL;
- int op_errno = -1;
- int i = -1;
- int call_cnt = 0;
-
-
- VALIDATE_OR_GOTO (frame, err);
- VALIDATE_OR_GOTO (this, err);
- VALIDATE_OR_GOTO (fd, err);
-
- local = dht_local_init (frame, NULL, fd, GF_FOP_FSETATTR);
- if (!local) {
- op_errno = ENOMEM;
- goto err;
- }
-
- layout = local->layout;
- if (!layout) {
- gf_msg_debug (this->name, 0,
- "no layout for fd=%p", fd);
- op_errno = EINVAL;
- goto err;
- }
-
- if (!layout_is_sane (layout)) {
- gf_msg_debug (this->name, 0,
- "layout is not sane for fd=%p", fd);
- op_errno = EINVAL;
- goto err;
- }
-
- if (IA_ISREG (fd->inode->ia_type)) {
- /* in the regular file _cbk(), we need to check for
- migration possibilities */
- local->rebalance.stbuf = *stbuf;
- local->rebalance.flags = valid;
- local->call_cnt = 1;
- subvol = local->cached_subvol;
-
- STACK_WIND (frame, dht_file_setattr_cbk, subvol,
- subvol->fops->fsetattr,
- fd, stbuf, valid, xdata);
+ xlator_t *subvol = NULL;
+ dht_layout_t *layout = NULL;
+ dht_local_t *local = NULL;
+ int op_errno = -1;
+ int i = -1;
+ int call_cnt = 0;
+
+ VALIDATE_OR_GOTO(frame, err);
+ VALIDATE_OR_GOTO(this, err);
+ VALIDATE_OR_GOTO(fd, err);
+
+ local = dht_local_init(frame, NULL, fd, GF_FOP_FSETATTR);
+ if (!local) {
+ op_errno = ENOMEM;
+ goto err;
+ }
+
+ layout = local->layout;
+ if (!layout) {
+ gf_msg_debug(this->name, 0, "no layout for fd=%p", fd);
+ op_errno = EINVAL;
+ goto err;
+ }
+
+ if (!layout_is_sane(layout)) {
+ gf_msg_debug(this->name, 0, "layout is not sane for fd=%p", fd);
+ op_errno = EINVAL;
+ goto err;
+ }
+ if (xdata)
+ local->xattr_req = dict_ref(xdata);
+
+ if (IA_ISREG(fd->inode->ia_type)) {
+ /* in the regular file _cbk(), we need to check for
+ migration possibilities */
+ local->rebalance.stbuf = *stbuf;
+ local->rebalance.flags = valid;
+ local->call_cnt = 1;
+ subvol = local->cached_subvol;
- return 0;
- }
+ STACK_WIND_COOKIE(frame, dht_file_setattr_cbk, subvol, subvol,
+ subvol->fops->fsetattr, fd, &local->rebalance.stbuf,
+ local->rebalance.flags, local->xattr_req);
+ return 0;
+ }
- local->call_cnt = call_cnt = layout->cnt;
+ local->call_cnt = call_cnt = layout->cnt;
- for (i = 0; i < call_cnt; i++) {
- STACK_WIND (frame, dht_setattr_cbk,
- layout->list[i].xlator,
- layout->list[i].xlator->fops->fsetattr,
- fd, stbuf, valid, xdata);
- }
+ for (i = 0; i < call_cnt; i++) {
+ STACK_WIND_COOKIE(frame, dht_setattr_cbk, layout->list[i].xlator,
+ layout->list[i].xlator,
+ layout->list[i].xlator->fops->fsetattr, fd, stbuf,
+ valid, xdata);
+ }
- return 0;
+ return 0;
err:
- op_errno = (op_errno == -1) ? errno : op_errno;
- DHT_STACK_UNWIND (fsetattr, frame, -1, op_errno, NULL, NULL, NULL);
+ op_errno = (op_errno == -1) ? errno : op_errno;
+ DHT_STACK_UNWIND(fsetattr, frame, -1, op_errno, NULL, NULL, NULL);
- return 0;
+ return 0;
}
diff --git a/xlators/cluster/dht/src/dht-layout.c b/xlators/cluster/dht/src/dht-layout.c
index ca600e9618a..fda904c92c9 100644
--- a/xlators/cluster/dht/src/dht-layout.c
+++ b/xlators/cluster/dht/src/dht-layout.c
@@ -8,869 +8,801 @@
cases as published by the Free Software Foundation.
*/
-
-#include "glusterfs.h"
-#include "xlator.h"
#include "dht-common.h"
-#include "byte-order.h"
-#include "dht-messages.h"
+#include <glusterfs/byte-order.h>
#include "unittest/unittest.h"
+#define layout_base_size (sizeof(dht_layout_t))
-#define layout_base_size (sizeof (dht_layout_t))
-
-#define layout_entry_size (sizeof ((dht_layout_t *)NULL)->list[0])
+#define layout_entry_size (sizeof((dht_layout_t *)NULL)->list[0])
#define layout_size(cnt) (layout_base_size + (cnt * layout_entry_size))
dht_layout_t *
-dht_layout_new (xlator_t *this, int cnt)
+dht_layout_new(xlator_t *this, int cnt)
{
- dht_layout_t *layout = NULL;
- dht_conf_t *conf = NULL;
+ dht_layout_t *layout = NULL;
+ dht_conf_t *conf = NULL;
- REQUIRE(NULL != this);
- REQUIRE(cnt >= 0);
+ REQUIRE(NULL != this);
+ REQUIRE(cnt >= 0);
- conf = this->private;
+ conf = this->private;
- layout = GF_CALLOC (1, layout_size (cnt),
- gf_dht_mt_dht_layout_t);
- if (!layout) {
- goto out;
- }
+ layout = GF_CALLOC(1, layout_size(cnt), gf_dht_mt_dht_layout_t);
+ if (!layout) {
+ goto out;
+ }
- layout->type = DHT_HASH_TYPE_DM;
- layout->cnt = cnt;
+ layout->type = DHT_HASH_TYPE_DM;
+ layout->cnt = cnt;
- if (conf) {
- layout->spread_cnt = conf->dir_spread_cnt;
- layout->gen = conf->gen;
- }
+ if (conf) {
+ layout->spread_cnt = conf->dir_spread_cnt;
+ layout->gen = conf->gen;
+ }
- layout->ref = 1;
+ GF_ATOMIC_INIT(layout->ref, 1);
- ENSURE(NULL != layout);
- ENSURE(layout->type == DHT_HASH_TYPE_DM);
- ENSURE(layout->cnt == cnt);
- ENSURE(layout->ref == 1);
+ ENSURE(NULL != layout);
+ ENSURE(layout->type == DHT_HASH_TYPE_DM);
+ ENSURE(layout->cnt == cnt);
+ ENSURE(GF_ATOMIC_GET(layout->ref) == 1);
out:
- return layout;
+ return layout;
}
-
dht_layout_t *
-dht_layout_get (xlator_t *this, inode_t *inode)
+dht_layout_get(xlator_t *this, inode_t *inode)
{
- dht_conf_t *conf = NULL;
- dht_layout_t *layout = NULL;
- int ret = 0;
-
- conf = this->private;
- if (!conf)
- goto out;
-
- LOCK (&conf->layout_lock);
- {
- ret = dht_inode_ctx_layout_get (inode, this, &layout);
- if ((!ret) && layout) {
- layout->ref++;
- }
- }
- UNLOCK (&conf->layout_lock);
-
-out:
- return layout;
+ dht_layout_t *layout = NULL;
+ int ret = 0;
+
+ ret = dht_inode_ctx_layout_get(inode, this, &layout);
+ if ((!ret) && layout) {
+ GF_ATOMIC_INC(layout->ref);
+ }
+ return layout;
}
-
int
-dht_layout_set (xlator_t *this, inode_t *inode, dht_layout_t *layout)
+dht_layout_set(xlator_t *this, inode_t *inode, dht_layout_t *layout)
{
- dht_conf_t *conf = NULL;
- int oldret = -1;
- int ret = -1;
- dht_layout_t *old_layout;
-
- conf = this->private;
- if (!conf || !layout)
- goto out;
-
- LOCK (&conf->layout_lock);
- {
- oldret = dht_inode_ctx_layout_get (inode, this, &old_layout);
- if (layout)
- layout->ref++;
- ret = dht_inode_ctx_layout_set (inode, this, layout);
- }
- UNLOCK (&conf->layout_lock);
-
- if (!oldret) {
- dht_layout_unref (this, old_layout);
- }
+ dht_conf_t *conf = NULL;
+ int oldret = -1;
+ int ret = -1;
+ dht_layout_t *old_layout;
+
+ conf = this->private;
+ if (!conf || !layout)
+ goto out;
+
+ LOCK(&conf->layout_lock);
+ {
+ oldret = dht_inode_ctx_layout_get(inode, this, &old_layout);
+ if (layout)
+ GF_ATOMIC_INC(layout->ref);
+ ret = dht_inode_ctx_layout_set(inode, this, layout);
+ }
+ UNLOCK(&conf->layout_lock);
+
+ if (!oldret) {
+ dht_layout_unref(this, old_layout);
+ }
+ if (ret)
+ GF_ATOMIC_DEC(layout->ref);
out:
- return ret;
+ return ret;
}
-
void
-dht_layout_unref (xlator_t *this, dht_layout_t *layout)
+dht_layout_unref(xlator_t *this, dht_layout_t *layout)
{
- dht_conf_t *conf = NULL;
- int ref = 0;
-
- if (!layout || layout->preset || !this->private)
- return;
+ int ref = 0;
- conf = this->private;
+ if (!layout || layout->preset || !this->private)
+ return;
- LOCK (&conf->layout_lock);
- {
- ref = --layout->ref;
- }
- UNLOCK (&conf->layout_lock);
+ ref = GF_ATOMIC_DEC(layout->ref);
- if (!ref)
- GF_FREE (layout);
+ if (!ref)
+ GF_FREE(layout);
}
-
dht_layout_t *
-dht_layout_ref (xlator_t *this, dht_layout_t *layout)
+dht_layout_ref(xlator_t *this, dht_layout_t *layout)
{
- dht_conf_t *conf = NULL;
-
- if (layout->preset || !this->private)
- return layout;
+ if (layout->preset || !this->private)
+ return layout;
- conf = this->private;
- LOCK (&conf->layout_lock);
- {
- layout->ref++;
- }
- UNLOCK (&conf->layout_lock);
+ GF_ATOMIC_INC(layout->ref);
- return layout;
+ return layout;
}
-
xlator_t *
-dht_layout_search (xlator_t *this, dht_layout_t *layout, const char *name)
+dht_layout_search(xlator_t *this, dht_layout_t *layout, const char *name)
{
- uint32_t hash = 0;
- xlator_t *subvol = NULL;
- int i = 0;
- int ret = 0;
-
- ret = dht_hash_compute (this, layout->type, name, &hash);
- if (ret != 0) {
- gf_msg (this->name, GF_LOG_WARNING, 0,
- DHT_MSG_COMPUTE_HASH_FAILED,
- "hash computation failed for type=%d name=%s",
- layout->type, name);
- goto out;
- }
-
- for (i = 0; i < layout->cnt; i++) {
- if (layout->list[i].start <= hash
- && layout->list[i].stop >= hash) {
- subvol = layout->list[i].xlator;
- break;
- }
- }
-
- if (!subvol) {
- gf_msg (this->name, GF_LOG_WARNING, 0,
- DHT_MSG_HASHED_SUBVOL_GET_FAILED,
- "no subvolume for hash (value) = %u", hash);
- }
+ uint32_t hash = 0;
+ xlator_t *subvol = NULL;
+ int i = 0;
+ int ret = 0;
+
+ ret = dht_hash_compute(this, layout->type, name, &hash);
+ if (ret != 0) {
+ gf_smsg(this->name, GF_LOG_WARNING, 0, DHT_MSG_COMPUTE_HASH_FAILED,
+ "type=%d", layout->type, "name=%s", name, NULL);
+ goto out;
+ }
+
+ for (i = 0; i < layout->cnt; i++) {
+ if (layout->list[i].start <= hash && layout->list[i].stop >= hash) {
+ subvol = layout->list[i].xlator;
+ break;
+ }
+ }
+
+ if (!subvol) {
+ gf_smsg(this->name, GF_LOG_WARNING, 0, DHT_MSG_HASHED_SUBVOL_GET_FAILED,
+ "hash-value=0x%x", hash, NULL);
+ }
out:
- return subvol;
+ return subvol;
}
-
dht_layout_t *
-dht_layout_for_subvol (xlator_t *this, xlator_t *subvol)
+dht_layout_for_subvol(xlator_t *this, xlator_t *subvol)
{
- dht_conf_t *conf = NULL;
- dht_layout_t *layout = NULL;
- int i = 0;
-
- conf = this->private;
- if (!conf)
- goto out;
-
- for (i = 0; i < conf->subvolume_cnt; i++) {
- if (conf->subvolumes[i] == subvol) {
- layout = conf->file_layouts[i];
- break;
- }
+ dht_conf_t *conf = NULL;
+ dht_layout_t *layout = NULL;
+ int i = 0;
+
+ conf = this->private;
+ if (!conf)
+ goto out;
+
+ for (i = 0; i < conf->subvolume_cnt; i++) {
+ if (conf->subvolumes[i] == subvol) {
+ layout = conf->file_layouts[i];
+ break;
}
+ }
out:
- return layout;
+ return layout;
}
-
int
-dht_layouts_init (xlator_t *this, dht_conf_t *conf)
+dht_layouts_init(xlator_t *this, dht_conf_t *conf)
{
- dht_layout_t *layout = NULL;
- int i = 0;
- int ret = -1;
-
- if (!conf)
- goto out;
-
- conf->file_layouts = GF_CALLOC (conf->subvolume_cnt,
- sizeof (dht_layout_t *),
- gf_dht_mt_dht_layout_t);
- if (!conf->file_layouts) {
- goto out;
- }
+ dht_layout_t *layout = NULL;
+ int i = 0;
+ int ret = -1;
- for (i = 0; i < conf->subvolume_cnt; i++) {
- layout = dht_layout_new (this, 1);
-
- if (!layout) {
- goto out;
- }
+ if (!conf)
+ goto out;
- layout->preset = 1;
+ conf->file_layouts = GF_CALLOC(conf->subvolume_cnt, sizeof(dht_layout_t *),
+ gf_dht_mt_dht_layout_t);
+ if (!conf->file_layouts) {
+ goto out;
+ }
- layout->list[0].xlator = conf->subvolumes[i];
+ for (i = 0; i < conf->subvolume_cnt; i++) {
+ layout = dht_layout_new(this, 1);
- conf->file_layouts[i] = layout;
+ if (!layout) {
+ goto out;
}
- ret = 0;
+ layout->preset = 1;
+
+ layout->list[0].xlator = conf->subvolumes[i];
+
+ conf->file_layouts[i] = layout;
+ }
+
+ ret = 0;
out:
- return ret;
+ return ret;
}
-
int
-dht_disk_layout_extract (xlator_t *this, dht_layout_t *layout,
- int pos, int32_t **disk_layout_p)
+dht_disk_layout_extract(xlator_t *this, dht_layout_t *layout, int pos,
+ int32_t **disk_layout_p)
{
- int ret = -1;
- int32_t *disk_layout = NULL;
+ int ret = -1;
+ int32_t *disk_layout = NULL;
- disk_layout = GF_CALLOC (5, sizeof (int),
- gf_dht_mt_int32_t);
- if (!disk_layout) {
- goto out;
- }
+ disk_layout = GF_CALLOC(5, sizeof(int), gf_dht_mt_int32_t);
+ if (!disk_layout) {
+ goto out;
+ }
- disk_layout[0] = hton32 (layout->list[pos].commit_hash);
- disk_layout[1] = hton32 (layout->type);
- disk_layout[2] = hton32 (layout->list[pos].start);
- disk_layout[3] = hton32 (layout->list[pos].stop);
+ disk_layout[0] = hton32(layout->list[pos].commit_hash);
+ disk_layout[1] = hton32(layout->type);
+ disk_layout[2] = hton32(layout->list[pos].start);
+ disk_layout[3] = hton32(layout->list[pos].stop);
- if (disk_layout_p)
- *disk_layout_p = disk_layout;
- else
- GF_FREE (disk_layout);
+ if (disk_layout_p)
+ *disk_layout_p = disk_layout;
+ else
+ GF_FREE(disk_layout);
- ret = 0;
+ ret = 0;
out:
- return ret;
+ return ret;
}
-
int
-dht_disk_layout_merge (xlator_t *this, dht_layout_t *layout,
- int pos, void *disk_layout_raw, int disk_layout_len)
+dht_disk_layout_extract_for_subvol(xlator_t *this, dht_layout_t *layout,
+ xlator_t *subvol, int32_t **disk_layout_p)
+{
+ int i = 0;
+
+ for (i = 0; i < layout->cnt; i++) {
+ if (layout->list[i].xlator == subvol)
+ break;
+ }
+
+ if (i == layout->cnt)
+ return -1;
+
+ return dht_disk_layout_extract(this, layout, i, disk_layout_p);
+}
+
+static int
+dht_disk_layout_merge(xlator_t *this, dht_layout_t *layout, int pos,
+ void *disk_layout_raw, int disk_layout_len)
{
- int type = 0;
- int start_off = 0;
- int stop_off = 0;
- int commit_hash = 0;
- int disk_layout[4];
+ int type = 0;
+ int start_off = 0;
+ int stop_off = 0;
+ int commit_hash = 0;
+ int disk_layout[4];
- if (!disk_layout_raw) {
- gf_msg (this->name, GF_LOG_CRITICAL, 0,
- DHT_MSG_LAYOUT_MERGE_FAILED,
- "error no layout on disk for merge");
- return -1;
- }
+ if (!disk_layout_raw) {
+ gf_smsg(this->name, GF_LOG_CRITICAL, 0, DHT_MSG_LAYOUT_MERGE_FAILED,
+ NULL);
+ return -1;
+ }
- GF_ASSERT (disk_layout_len == sizeof (disk_layout));
+ GF_ASSERT(disk_layout_len == sizeof(disk_layout));
- memcpy (disk_layout, disk_layout_raw, disk_layout_len);
+ memcpy(disk_layout, disk_layout_raw, disk_layout_len);
- type = ntoh32 (disk_layout[1]);
- switch (type) {
+ type = ntoh32(disk_layout[1]);
+ switch (type) {
case DHT_HASH_TYPE_DM_USER:
- gf_msg_debug (this->name, 0, "found user-set layout");
- layout->type = type;
- /* Fall through. */
- case DHT_HASH_TYPE_DM:
- break;
+ gf_msg_debug(this->name, 0, "found user-set layout");
+ layout->type = type;
+ /* Fall through. */
+ case DHT_HASH_TYPE_DM:
+ break;
default:
- gf_msg (this->name, GF_LOG_CRITICAL, 0,
- DHT_MSG_INVALID_DISK_LAYOUT,
- "Invalid disk layout: "
- "Catastrophic error layout with unknown type found %d",
- disk_layout[1]);
- return -1;
- }
-
- commit_hash = ntoh32 (disk_layout[0]);
- start_off = ntoh32 (disk_layout[2]);
- stop_off = ntoh32 (disk_layout[3]);
-
- layout->list[pos].commit_hash = commit_hash;
- layout->list[pos].start = start_off;
- layout->list[pos].stop = stop_off;
-
- gf_msg_trace (this->name, 0,
- "merged to layout: %u - %u (type %d, hash %d) from %s",
- start_off, stop_off, commit_hash, type,
- layout->list[pos].xlator->name);
-
- return 0;
+ gf_smsg(this->name, GF_LOG_CRITICAL, 0, DHT_MSG_INVALID_DISK_LAYOUT,
+ "layout=%d", disk_layout[1], NULL);
+ return -1;
+ }
+
+ commit_hash = ntoh32(disk_layout[0]);
+ start_off = ntoh32(disk_layout[2]);
+ stop_off = ntoh32(disk_layout[3]);
+
+ layout->list[pos].commit_hash = commit_hash;
+ layout->list[pos].start = start_off;
+ layout->list[pos].stop = stop_off;
+
+ gf_msg_trace(this->name, 0,
+ "merged to layout: 0x%x - 0x%x (hash 0x%x, type %d) from %s",
+ start_off, stop_off, commit_hash, type,
+ layout->list[pos].xlator->name);
+
+ return 0;
}
int
-dht_layout_merge (xlator_t *this, dht_layout_t *layout, xlator_t *subvol,
- int op_ret, int op_errno, dict_t *xattr)
+dht_layout_merge(xlator_t *this, dht_layout_t *layout, xlator_t *subvol,
+ int op_ret, int op_errno, dict_t *xattr)
{
- int i = 0;
- int ret = -1;
- int err = -1;
- void *disk_layout_raw = NULL;
- int disk_layout_len = 0;
- dht_conf_t *conf = this->private;
-
- if (op_ret != 0) {
- err = op_errno;
- }
-
- for (i = 0; i < layout->cnt; i++) {
- if (layout->list[i].xlator == NULL) {
- layout->list[i].err = err;
- layout->list[i].xlator = subvol;
- break;
- }
- }
+ int i = 0;
+ int ret = -1;
+ int err = -1;
+ void *disk_layout_raw = NULL;
+ int disk_layout_len = 0;
+ dht_conf_t *conf = this->private;
- if (op_ret != 0) {
- ret = 0;
- goto out;
- }
+ if (op_ret != 0) {
+ err = op_errno;
+ }
- if (xattr) {
- /* during lookup and not mkdir */
- ret = dict_get_ptr_and_len (xattr, conf->xattr_name,
- &disk_layout_raw, &disk_layout_len);
- }
+ if (!layout)
+ goto out;
- if (ret != 0) {
- layout->list[i].err = 0;
- gf_msg_trace (this->name, 0,
- "Missing disk layout on %s. err = %d",
- subvol->name, err);
- ret = 0;
- goto out;
+ for (i = 0; i < layout->cnt; i++) {
+ if (layout->list[i].xlator == NULL) {
+ layout->list[i].err = err;
+ layout->list[i].xlator = subvol;
+ break;
}
+ }
- ret = dht_disk_layout_merge (this, layout, i, disk_layout_raw,
- disk_layout_len);
- if (ret != 0) {
- gf_msg (this->name, GF_LOG_WARNING, 0,
- DHT_MSG_LAYOUT_MERGE_FAILED,
- "layout merge from subvolume %s failed",
- subvol->name);
- goto out;
- }
+ if (op_ret != 0) {
+ ret = 0;
+ goto out;
+ }
- if (layout->commit_hash == 0) {
- layout->commit_hash = layout->list[i].commit_hash;
- } else if (layout->commit_hash != layout->list[i].commit_hash) {
- layout->commit_hash = DHT_LAYOUT_HASH_INVALID;
- }
+ if (xattr) {
+ /* during lookup and not mkdir */
+ ret = dict_get_ptr_and_len(xattr, conf->xattr_name, &disk_layout_raw,
+ &disk_layout_len);
+ }
+ if (ret != 0) {
layout->list[i].err = 0;
+ gf_msg_trace(this->name, 0, "Missing disk layout on %s. err = %d",
+ subvol->name, err);
+ ret = 0;
+ goto out;
+ }
+
+ ret = dht_disk_layout_merge(this, layout, i, disk_layout_raw,
+ disk_layout_len);
+ if (ret != 0) {
+ gf_smsg(this->name, GF_LOG_WARNING, 0, DHT_MSG_LAYOUT_MERGE_FAILED,
+ "subvolume=%s", subvol->name, NULL);
+ goto out;
+ }
+
+ if (layout->commit_hash == 0) {
+ layout->commit_hash = layout->list[i].commit_hash;
+ } else if (layout->commit_hash != layout->list[i].commit_hash) {
+ layout->commit_hash = DHT_LAYOUT_HASH_INVALID;
+ }
+
+ layout->list[i].err = 0;
out:
- return ret;
+ return ret;
}
-
void
-dht_layout_entry_swap (dht_layout_t *layout, int i, int j)
+dht_layout_entry_swap(dht_layout_t *layout, int i, int j)
{
- uint32_t start_swap = 0;
- uint32_t stop_swap = 0;
- uint32_t commit_hash_swap = 0;
- xlator_t *xlator_swap = 0;
- int err_swap = 0;
-
- start_swap = layout->list[i].start;
- stop_swap = layout->list[i].stop;
- xlator_swap = layout->list[i].xlator;
- err_swap = layout->list[i].err;
- commit_hash_swap = layout->list[i].commit_hash;
-
- layout->list[i].start = layout->list[j].start;
- layout->list[i].stop = layout->list[j].stop;
- layout->list[i].xlator = layout->list[j].xlator;
- layout->list[i].err = layout->list[j].err;
- layout->list[i].commit_hash = layout->list[j].commit_hash;
-
- layout->list[j].start = start_swap;
- layout->list[j].stop = stop_swap;
- layout->list[j].xlator = xlator_swap;
- layout->list[j].err = err_swap;
- layout->list[j].commit_hash = commit_hash_swap;
+ uint32_t start_swap = 0;
+ uint32_t stop_swap = 0;
+ uint32_t commit_hash_swap = 0;
+ xlator_t *xlator_swap = 0;
+ int err_swap = 0;
+
+ start_swap = layout->list[i].start;
+ stop_swap = layout->list[i].stop;
+ xlator_swap = layout->list[i].xlator;
+ err_swap = layout->list[i].err;
+ commit_hash_swap = layout->list[i].commit_hash;
+
+ layout->list[i].start = layout->list[j].start;
+ layout->list[i].stop = layout->list[j].stop;
+ layout->list[i].xlator = layout->list[j].xlator;
+ layout->list[i].err = layout->list[j].err;
+ layout->list[i].commit_hash = layout->list[j].commit_hash;
+
+ layout->list[j].start = start_swap;
+ layout->list[j].stop = stop_swap;
+ layout->list[j].xlator = xlator_swap;
+ layout->list[j].err = err_swap;
+ layout->list[j].commit_hash = commit_hash_swap;
}
void
-dht_layout_range_swap (dht_layout_t *layout, int i, int j)
+dht_layout_range_swap(dht_layout_t *layout, int i, int j)
{
- uint32_t start_swap = 0;
- uint32_t stop_swap = 0;
+ uint32_t start_swap = 0;
+ uint32_t stop_swap = 0;
- start_swap = layout->list[i].start;
- stop_swap = layout->list[i].stop;
+ start_swap = layout->list[i].start;
+ stop_swap = layout->list[i].stop;
- layout->list[i].start = layout->list[j].start;
- layout->list[i].stop = layout->list[j].stop;
+ layout->list[i].start = layout->list[j].start;
+ layout->list[i].stop = layout->list[j].stop;
- layout->list[j].start = start_swap;
- layout->list[j].stop = stop_swap;
+ layout->list[j].start = start_swap;
+ layout->list[j].stop = stop_swap;
}
-
-int64_t
-dht_layout_entry_cmp_volname (dht_layout_t *layout, int i, int j)
+static int64_t
+dht_layout_entry_cmp_volname(dht_layout_t *layout, int i, int j)
{
- return (strcmp (layout->list[i].xlator->name,
- layout->list[j].xlator->name));
+ return (strcmp(layout->list[i].xlator->name, layout->list[j].xlator->name));
}
gf_boolean_t
-dht_is_subvol_in_layout (dht_layout_t *layout, xlator_t *xlator)
+dht_is_subvol_in_layout(dht_layout_t *layout, xlator_t *xlator)
{
- int i = 0;
-
- for (i = 0; i < layout->cnt; i++) {
- /* Check if xlator is already part of layout, and layout is
- * non-zero. */
- if (!strcmp (layout->list[i].xlator->name, xlator->name)) {
- if (layout->list[i].start != layout->list[i].stop)
- return _gf_true;
- break;
- }
- }
- return _gf_false;
+ int i = 0;
+
+ for (i = 0; i < layout->cnt; i++) {
+ /* Check if xlator is already part of layout, and layout is
+ * non-zero. */
+ if (!strcmp(layout->list[i].xlator->name, xlator->name)) {
+ if (layout->list[i].start != layout->list[i].stop)
+ return _gf_true;
+ break;
+ }
+ }
+ return _gf_false;
}
-int64_t
-dht_layout_entry_cmp (dht_layout_t *layout, int i, int j)
+static int64_t
+dht_layout_entry_cmp(dht_layout_t *layout, int i, int j)
{
- int64_t diff = 0;
+ int64_t diff = 0;
- /* swap zero'ed out layouts to front, if needed */
- if (!layout->list[j].start && !layout->list[j].stop) {
- diff = (int64_t) layout->list[i].stop
- - (int64_t) layout->list[j].stop;
- goto out;
- }
- diff = (int64_t) layout->list[i].start
- - (int64_t) layout->list[j].start;
+ /* swap zero'ed out layouts to front, if needed */
+ if (!layout->list[j].start && !layout->list[j].stop) {
+ diff = (int64_t)layout->list[i].stop - (int64_t)layout->list[j].stop;
+ goto out;
+ }
+ diff = (int64_t)layout->list[i].start - (int64_t)layout->list[j].start;
out:
- return diff;
+ return diff;
}
-
int
-dht_layout_sort (dht_layout_t *layout)
+dht_layout_sort(dht_layout_t *layout)
{
- int i = 0;
- int j = 0;
- int64_t ret = 0;
+ int i = 0;
+ int j = 0;
+ int64_t ret = 0;
- /* TODO: O(n^2) -- bad bad */
+ /* TODO: O(n^2) -- bad bad */
- for (i = 0; i < layout->cnt - 1; i++) {
- for (j = i + 1; j < layout->cnt; j++) {
- ret = dht_layout_entry_cmp (layout, i, j);
- if (ret > 0)
- dht_layout_entry_swap (layout, i, j);
- }
+ for (i = 0; i < layout->cnt - 1; i++) {
+ for (j = i + 1; j < layout->cnt; j++) {
+ ret = dht_layout_entry_cmp(layout, i, j);
+ if (ret > 0)
+ dht_layout_entry_swap(layout, i, j);
}
+ }
- return 0;
+ return 0;
}
-int
-dht_layout_sort_volname (dht_layout_t *layout)
+void
+dht_layout_sort_volname(dht_layout_t *layout)
{
- int i = 0;
- int j = 0;
- int64_t ret = 0;
+ int i = 0;
+ int j = 0;
+ int64_t ret = 0;
- /* TODO: O(n^2) -- bad bad */
+ /* TODO: O(n^2) -- bad bad */
- for (i = 0; i < layout->cnt - 1; i++) {
- for (j = i + 1; j < layout->cnt; j++) {
- ret = dht_layout_entry_cmp_volname (layout, i, j);
- if (ret > 0)
- dht_layout_entry_swap (layout, i, j);
- }
+ for (i = 0; i < layout->cnt - 1; i++) {
+ for (j = i + 1; j < layout->cnt; j++) {
+ ret = dht_layout_entry_cmp_volname(layout, i, j);
+ if (ret > 0)
+ dht_layout_entry_swap(layout, i, j);
}
-
- return 0;
+ }
}
-
-int
-dht_layout_anomalies (xlator_t *this, loc_t *loc, dht_layout_t *layout,
- uint32_t *holes_p, uint32_t *overlaps_p,
- uint32_t *missing_p, uint32_t *down_p, uint32_t *misc_p,
- uint32_t *no_space_p)
+void
+dht_layout_anomalies(xlator_t *this, loc_t *loc, dht_layout_t *layout,
+ uint32_t *holes_p, uint32_t *overlaps_p,
+ uint32_t *missing_p, uint32_t *down_p, uint32_t *misc_p,
+ uint32_t *no_space_p)
{
- uint32_t overlaps = 0;
- uint32_t missing = 0;
- uint32_t down = 0;
- uint32_t misc = 0;
- uint32_t hole_cnt = 0;
- uint32_t overlap_cnt = 0;
- int i = 0;
- int ret = 0;
- uint32_t prev_stop = 0;
- uint32_t last_stop = 0;
- char is_virgin = 1;
- uint32_t no_space = 0;
-
- /* This function scans through the layout spread of a directory to
- check if there are any anomalies. Prior to calling this function
- the layout entries should be sorted in the ascending order.
-
- If the layout entry has err != 0
- then increment the corresponding anomaly.
- else
- if (start of the current layout entry > stop + 1 of previous
- non erroneous layout entry)
- then it indicates a hole in the layout
- if (start of the current layout entry < stop + 1 of previous
- non erroneous layout entry)
- then it indicates an overlap in the layout
- */
- last_stop = layout->list[0].start - 1;
- prev_stop = last_stop;
-
- for (i = 0; i < layout->cnt; i++) {
- switch (layout->list[i].err) {
- case -1:
- case ENOENT:
- case ESTALE:
- missing++;
- continue;
- case ENOTCONN:
- down++;
- continue;
- case ENOSPC:
- no_space++;
- continue;
- case 0:
- /* if err == 0 and start == stop, then it is a non misc++;
- * participating subvolume(spread-cnt). Then, do not
- * check for anomalies. If start != stop, then treat it
- * as misc err */
- if (layout->list[i].start == layout->list[i].stop) {
- continue;
- }
- break;
- default:
- misc++;
- continue;
- }
-
- is_virgin = 0;
-
- if ((prev_stop + 1) < layout->list[i].start) {
- hole_cnt++;
+ uint32_t overlaps = 0;
+ uint32_t missing = 0;
+ uint32_t down = 0;
+ uint32_t misc = 0;
+ uint32_t hole_cnt = 0;
+ uint32_t overlap_cnt = 0;
+ int i = 0;
+ uint32_t prev_stop = 0;
+ uint32_t last_stop = 0;
+ char is_virgin = 1;
+ uint32_t no_space = 0;
+
+ /* This function scans through the layout spread of a directory to
+ check if there are any anomalies. Prior to calling this function
+ the layout entries should be sorted in the ascending order.
+
+ If the layout entry has err != 0
+ then increment the corresponding anomaly.
+ else
+ if (start of the current layout entry > stop + 1 of previous
+ non erroneous layout entry)
+ then it indicates a hole in the layout
+ if (start of the current layout entry < stop + 1 of previous
+ non erroneous layout entry)
+ then it indicates an overlap in the layout
+ */
+ last_stop = layout->list[0].start - 1;
+ prev_stop = last_stop;
+
+ for (i = 0; i < layout->cnt; i++) {
+ switch (layout->list[i].err) {
+ case -1:
+ case ENOENT:
+ case ESTALE:
+ missing++;
+ continue;
+ case ENOTCONN:
+ down++;
+ continue;
+ case ENOSPC:
+ no_space++;
+ continue;
+ case 0:
+ /* if err == 0 and start == stop, then it is a non misc++;
+ * participating subvolume(spread-cnt). Then, do not
+ * check for anomalies. If start != stop, then treat it
+ * as misc err */
+ if (layout->list[i].start == layout->list[i].stop) {
+ continue;
}
+ break;
+ default:
+ misc++;
+ continue;
+ }
- if ((prev_stop + 1) > layout->list[i].start) {
- overlap_cnt++;
- overlaps += ((prev_stop + 1) - layout->list[i].start);
- }
- prev_stop = layout->list[i].stop;
+ is_virgin = 0;
+
+ if ((prev_stop + 1) < layout->list[i].start) {
+ hole_cnt++;
}
- if ((last_stop - prev_stop) || is_virgin)
- hole_cnt++;
+ if ((prev_stop + 1) > layout->list[i].start) {
+ overlap_cnt++;
+ overlaps += ((prev_stop + 1) - layout->list[i].start);
+ }
+ prev_stop = layout->list[i].stop;
+ }
- if (holes_p)
- *holes_p = hole_cnt;
+ if ((last_stop - prev_stop) || is_virgin)
+ hole_cnt++;
- if (overlaps_p)
- *overlaps_p = overlap_cnt;
+ if (holes_p)
+ *holes_p = hole_cnt;
- if (missing_p)
- *missing_p = missing;
+ if (overlaps_p)
+ *overlaps_p = overlap_cnt;
- if (down_p)
- *down_p = down;
+ if (missing_p)
+ *missing_p = missing;
- if (misc_p)
- *misc_p = misc;
+ if (down_p)
+ *down_p = down;
- if (no_space_p)
- *no_space_p = no_space;
+ if (misc_p)
+ *misc_p = misc;
- return ret;
+ if (no_space_p)
+ *no_space_p = no_space;
}
-
int
-dht_layout_missing_dirs (dht_layout_t *layout)
+dht_layout_missing_dirs(dht_layout_t *layout)
{
- int i = 0, missing = 0;
+ int i = 0, missing = 0;
- if (layout == NULL)
- goto out;
+ if (layout == NULL)
+ goto out;
- for (i = 0; i < layout->cnt; i++) {
- if ((layout->list[i].err == ENOENT)
- || ((layout->list[i].err == -1)
- && (layout->list[i].start == 0)
- && (layout->list[i].stop == 0))) {
- missing++;
- }
+ for (i = 0; i < layout->cnt; i++) {
+ if ((layout->list[i].err == ENOENT) ||
+ ((layout->list[i].err == -1) && (layout->list[i].start == 0) &&
+ (layout->list[i].stop == 0))) {
+ missing++;
}
+ }
out:
- return missing;
+ return missing;
}
-
int
-dht_layout_normalize (xlator_t *this, loc_t *loc, dht_layout_t *layout)
+dht_layout_normalize(xlator_t *this, loc_t *loc, dht_layout_t *layout)
{
- int ret = 0;
- uint32_t holes = 0;
- uint32_t overlaps = 0;
- uint32_t missing = 0;
- uint32_t down = 0;
- uint32_t misc = 0, missing_dirs = 0;
- char gfid[GF_UUID_BUF_SIZE] = {0};
-
- ret = dht_layout_sort (layout);
- if (ret == -1) {
- gf_msg (this->name, GF_LOG_WARNING, 0,
- DHT_MSG_LAYOUT_SORT_FAILED,
- "sort failed?! how the ....");
- goto out;
- }
-
- gf_uuid_unparse(loc->gfid, gfid);
-
- ret = dht_layout_anomalies (this, loc, layout,
- &holes, &overlaps,
- &missing, &down, &misc, NULL);
- if (ret == -1) {
- gf_msg (this->name, GF_LOG_WARNING, 0,
- DHT_MSG_FIND_LAYOUT_ANOMALIES_ERROR,
- "Error finding anomalies in %s, gfid = %s",
- loc->path, gfid);
- goto out;
- }
-
- if (holes || overlaps) {
- if (missing == layout->cnt) {
- gf_msg_debug (this->name, 0,
- "Directory %s looked up first time"
- " gfid = %s", loc->path, gfid);
- } else {
- gf_msg (this->name, GF_LOG_INFO, 0,
- DHT_MSG_ANOMALIES_INFO,
- "Found anomalies in %s (gfid = %s). "
- "Holes=%d overlaps=%d",
- loc->path, gfid, holes, overlaps );
- }
- ret = -1;
- }
-
- if (ret >= 0) {
- missing_dirs = dht_layout_missing_dirs (layout);
- /* TODO During DHT selfheal rewrite (almost) find a better place
- * to detect this - probably in dht_layout_anomalies()
- */
- if (missing_dirs > 0)
- ret += missing_dirs;
- }
+ int ret = 0;
+ uint32_t holes = 0;
+ uint32_t overlaps = 0;
+ uint32_t missing = 0;
+ uint32_t down = 0;
+ uint32_t misc = 0, missing_dirs = 0;
+ char gfid[GF_UUID_BUF_SIZE] = {0};
+
+ ret = dht_layout_sort(layout);
+ if (ret == -1) {
+ gf_smsg(this->name, GF_LOG_WARNING, 0, DHT_MSG_LAYOUT_SORT_FAILED,
+ NULL);
+ goto out;
+ }
+
+ gf_uuid_unparse(loc->gfid, gfid);
+
+ dht_layout_anomalies(this, loc, layout, &holes, &overlaps, &missing, &down,
+ &misc, NULL);
+
+ if (holes || overlaps) {
+ if (missing == layout->cnt) {
+ gf_msg_debug(this->name, 0,
+ "Directory %s looked up first time"
+ " gfid = %s",
+ loc->path, gfid);
+ } else {
+ gf_smsg(this->name, GF_LOG_INFO, 0, DHT_MSG_ANOMALIES_INFO,
+ "path=%s", loc->path, "gfid=%s", gfid, "holes=%d", holes,
+ "overlaps=%d", overlaps, NULL);
+ }
+ ret = -1;
+ }
+
+ if (ret >= 0) {
+ missing_dirs = dht_layout_missing_dirs(layout);
+ /* TODO During DHT selfheal rewrite (almost) find a better place
+ * to detect this - probably in dht_layout_anomalies()
+ */
+ if (missing_dirs > 0)
+ ret += missing_dirs;
+ }
out:
- return ret;
+ return ret;
}
int
-dht_dir_has_layout (dict_t *xattr, char *name)
+dht_dir_has_layout(dict_t *xattr, char *name)
{
+ void *disk_layout_raw = NULL;
- void *disk_layout_raw = NULL;
-
- return dict_get_ptr (xattr, name, &disk_layout_raw);
+ return dict_get_ptr(xattr, name, &disk_layout_raw);
}
int
-dht_layout_dir_mismatch (xlator_t *this, dht_layout_t *layout, xlator_t *subvol,
- loc_t *loc, dict_t *xattr)
+dht_layout_dir_mismatch(xlator_t *this, dht_layout_t *layout, xlator_t *subvol,
+ loc_t *loc, dict_t *xattr)
{
- int idx = 0;
- int pos = -1;
- int ret = 0;
- int err = 0;
- int dict_ret = 0;
- int32_t disk_layout[4];
- void *disk_layout_raw = NULL;
- uint32_t start_off = -1;
- uint32_t stop_off = -1;
- uint32_t commit_hash = -1;
- dht_conf_t *conf = this->private;
- char gfid[GF_UUID_BUF_SIZE] = {0};
-
- if(loc && loc->inode)
- gf_uuid_unparse(loc->inode->gfid, gfid);
-
- for (idx = 0; idx < layout->cnt; idx++) {
- if (layout->list[idx].xlator == subvol) {
- pos = idx;
- break;
- }
- }
-
- if (pos == -1) {
- if (loc) {
- gf_msg_debug (this->name, 0,
- "%s - no layout info for subvolume %s",
- loc ? loc->path : "path not found",
- subvol->name);
- }
- ret = 1;
- goto out;
- }
-
- err = layout->list[pos].err;
-
- if (!xattr) {
- if (err == 0) {
- if (loc) {
- gf_msg (this->name, GF_LOG_INFO, 0,
- DHT_MSG_DICT_GET_FAILED,
- "%s: xattr dictionary is NULL",
- loc->path);
- } else {
- gf_msg (this->name, GF_LOG_INFO, 0,
- DHT_MSG_DICT_GET_FAILED,
- "path not found: "
- "xattr dictionary is NULL");
- }
- ret = -1;
- }
- goto out;
- }
-
- dict_ret = dict_get_ptr (xattr, conf->xattr_name,
- &disk_layout_raw);
-
- if (dict_ret < 0) {
- if (err == 0 && layout->list[pos].stop) {
- if (loc) {
- gf_msg (this->name, GF_LOG_INFO, 0,
- DHT_MSG_DISK_LAYOUT_MISSING,
- "%s: Disk layout missing, gfid = %s",
- loc->path, gfid);
- } else {
- gf_msg (this->name, GF_LOG_INFO, 0,
- DHT_MSG_DISK_LAYOUT_MISSING,
- "path not found: "
- "Disk layout missing, gfid = %s",
- gfid);
- }
- ret = -1;
- }
- goto out;
- }
-
- memcpy (disk_layout, disk_layout_raw, sizeof (disk_layout));
-
- start_off = ntoh32 (disk_layout[2]);
- stop_off = ntoh32 (disk_layout[3]);
- commit_hash = ntoh32 (disk_layout[0]);
-
- if ((layout->list[pos].start != start_off)
- || (layout->list[pos].stop != stop_off)
- || (layout->list[pos].commit_hash != commit_hash)) {
- gf_msg (this->name, GF_LOG_INFO, 0,
- DHT_MSG_LAYOUT_INFO,
- "subvol: %s; inode layout - %"PRIu32" - %"PRIu32
- " - %"PRIu32"; "
- "disk layout - %"PRIu32" - %"PRIu32" - %"PRIu32,
- layout->list[pos].xlator->name,
- layout->list[pos].start, layout->list[pos].stop,
- layout->list[pos].commit_hash,
- start_off, stop_off, commit_hash);
- ret = 1;
- } else {
- ret = 0;
- }
+ int idx = 0;
+ int pos = -1;
+ int ret = 0;
+ int err = 0;
+ int dict_ret = 0;
+ int32_t disk_layout[4];
+ void *disk_layout_raw = NULL;
+ uint32_t start_off = -1;
+ uint32_t stop_off = -1;
+ uint32_t commit_hash = -1;
+ dht_conf_t *conf = this->private;
+ char gfid[GF_UUID_BUF_SIZE] = {0};
+
+ if (loc && loc->inode)
+ gf_uuid_unparse(loc->inode->gfid, gfid);
+
+ for (idx = 0; idx < layout->cnt; idx++) {
+ if (layout->list[idx].xlator == subvol) {
+ pos = idx;
+ break;
+ }
+ }
+
+ if (pos == -1) {
+ if (loc) {
+ gf_msg_debug(this->name, 0, "%s - no layout info for subvolume %s",
+ loc ? loc->path : "path not found", subvol->name);
+ }
+ ret = 1;
+ goto out;
+ }
+
+ err = layout->list[pos].err;
+
+ if (!xattr) {
+ if (err == 0) {
+ if (loc) {
+ gf_smsg(this->name, GF_LOG_INFO, 0, DHT_MSG_XATTR_DICT_NULL,
+ "path=%s", loc->path, NULL);
+ } else {
+ gf_smsg(this->name, GF_LOG_INFO, 0, DHT_MSG_XATTR_DICT_NULL,
+ "path not found", NULL);
+ }
+ ret = -1;
+ }
+ goto out;
+ }
+
+ dict_ret = dict_get_ptr(xattr, conf->xattr_name, &disk_layout_raw);
+
+ if (dict_ret < 0) {
+ if (err == 0 && layout->list[pos].stop) {
+ if (loc) {
+ gf_smsg(this->name, GF_LOG_INFO, 0, DHT_MSG_DISK_LAYOUT_MISSING,
+ "path=%s", loc->path, "gfid=%s", gfid, NULL);
+ } else {
+ gf_smsg(this->name, GF_LOG_INFO, 0, DHT_MSG_DISK_LAYOUT_MISSING,
+ "path not found"
+ "gfid=%s",
+ gfid, NULL);
+ }
+ ret = -1;
+ }
+ goto out;
+ }
+
+ memcpy(disk_layout, disk_layout_raw, sizeof(disk_layout));
+
+ start_off = ntoh32(disk_layout[2]);
+ stop_off = ntoh32(disk_layout[3]);
+ commit_hash = ntoh32(disk_layout[0]);
+
+ if ((layout->list[pos].start != start_off) ||
+ (layout->list[pos].stop != stop_off) ||
+ (layout->list[pos].commit_hash != commit_hash)) {
+ gf_smsg(this->name, GF_LOG_INFO, 0, DHT_MSG_LAYOUT_INFO, "subvol=%s",
+ layout->list[pos].xlator->name, "inode-layout:start=0x%x",
+ layout->list[pos].start, "inode-layout:stop=0x%x",
+ layout->list[pos].stop, "layout-commit-hash=0x%x; ",
+ layout->list[pos].commit_hash, "disk-layout:start-off=0x%x",
+ start_off, "disk-layout:top-off=0x%x", stop_off,
+ "commit-hash=0x%x", commit_hash, NULL);
+ ret = 1;
+ } else {
+ ret = 0;
+ }
out:
- return ret;
+ return ret;
}
-
int
-dht_layout_preset (xlator_t *this, xlator_t *subvol, inode_t *inode)
+dht_layout_preset(xlator_t *this, xlator_t *subvol, inode_t *inode)
{
- dht_layout_t *layout = NULL;
- int ret = -1;
- dht_conf_t *conf = NULL;
+ dht_layout_t *layout = NULL;
+ int ret = -1;
+ dht_conf_t *conf = NULL;
- conf = this->private;
- if (!conf)
- goto out;
+ conf = this->private;
+ if (!conf)
+ goto out;
- layout = dht_layout_for_subvol (this, subvol);
- if (!layout) {
- gf_msg (this->name, GF_LOG_INFO, 0,
- DHT_MSG_SUBVOL_NO_LAYOUT_INFO,
- "no pre-set layout for subvolume %s",
- subvol ? subvol->name : "<nil>");
- ret = -1;
- goto out;
- }
+ layout = dht_layout_for_subvol(this, subvol);
+ if (!layout) {
+ gf_smsg(this->name, GF_LOG_INFO, 0, DHT_MSG_SUBVOL_NO_LAYOUT_INFO,
+ "subvolume=%s", subvol ? subvol->name : "<nil>", NULL);
+ ret = -1;
+ goto out;
+ }
- LOCK (&conf->layout_lock);
- {
- dht_inode_ctx_layout_set (inode, this, layout);
- }
- UNLOCK (&conf->layout_lock);
+ gf_msg_debug(this->name, 0, "file = %s, subvol = %s",
+ uuid_utoa(inode->gfid), subvol ? subvol->name : "<nil>");
- ret = 0;
+ LOCK(&conf->layout_lock);
+ {
+ dht_inode_ctx_layout_set(inode, this, layout);
+ }
+
+ UNLOCK(&conf->layout_lock);
+
+ ret = 0;
out:
- return ret;
+ return ret;
}
int
-dht_layout_index_for_subvol (dht_layout_t *layout, xlator_t *subvol)
+dht_layout_index_for_subvol(dht_layout_t *layout, xlator_t *subvol)
{
- int i = 0, ret = -1;
+ int i = 0, ret = -1;
- for (i = 0; i < layout->cnt; i++) {
- if (layout->list[i].xlator == subvol) {
- ret = i;
- break;
- }
+ for (i = 0; i < layout->cnt; i++) {
+ if (layout->list[i].xlator == subvol) {
+ ret = i;
+ break;
}
+ }
- return ret;
+ return ret;
}
diff --git a/xlators/cluster/dht/src/dht-linkfile.c b/xlators/cluster/dht/src/dht-linkfile.c
index deba2138672..89ec6cca56e 100644
--- a/xlators/cluster/dht/src/dht-linkfile.c
+++ b/xlators/cluster/dht/src/dht-linkfile.c
@@ -8,353 +8,321 @@
cases as published by the Free Software Foundation.
*/
-
-#include "glusterfs.h"
-#include "xlator.h"
-#include "compat.h"
+#include <glusterfs/compat.h>
#include "dht-common.h"
-#include "dht-messages.h"
-int
-dht_linkfile_lookup_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
- int op_ret, int op_errno,
- inode_t *inode, struct iatt *stbuf, dict_t *xattr,
- struct iatt *postparent)
+static int
+dht_linkfile_lookup_cbk(call_frame_t *frame, void *cookie, xlator_t *this,
+ int op_ret, int op_errno, inode_t *inode,
+ struct iatt *stbuf, dict_t *xattr,
+ struct iatt *postparent)
{
- char is_linkfile = 0;
- dht_conf_t *conf = NULL;
- dht_local_t *local = NULL;
- call_frame_t *prev = NULL;
- char gfid[GF_UUID_BUF_SIZE] = {0};
-
- local = frame->local;
- prev = cookie;
- conf = this->private;
-
- if (op_ret)
- goto out;
-
- gf_uuid_unparse(local->loc.gfid, gfid);
-
- is_linkfile = check_is_linkfile (inode, stbuf, xattr,
- conf->link_xattr_name);
- if (!is_linkfile)
- gf_msg (this->name, GF_LOG_WARNING, 0,
- DHT_MSG_NOT_LINK_FILE_ERROR,
- "got non-linkfile %s:%s, gfid = %s",
- prev->this->name, local->loc.path, gfid);
+ char is_linkfile = 0;
+ dht_conf_t *conf = NULL;
+ dht_local_t *local = NULL;
+ xlator_t *prev = NULL;
+ char gfid[GF_UUID_BUF_SIZE] = {0};
+
+ local = frame->local;
+ prev = cookie;
+ conf = this->private;
+
+ if (op_ret)
+ goto out;
+
+ gf_uuid_unparse(local->loc.gfid, gfid);
+
+ is_linkfile = check_is_linkfile(inode, stbuf, xattr, conf->link_xattr_name);
+ if (!is_linkfile)
+ gf_smsg(this->name, GF_LOG_WARNING, 0, DHT_MSG_NOT_LINK_FILE_ERROR,
+ "name=%s", prev->name, "path=%s", local->loc.path, "gfid=%s",
+ gfid, NULL);
out:
- local->linkfile.linkfile_cbk (frame, cookie, this, op_ret, op_errno,
- inode, stbuf, postparent, postparent,
- xattr);
- return 0;
+ local->linkfile.linkfile_cbk(frame, cookie, this, op_ret, op_errno, inode,
+ stbuf, postparent, postparent, xattr);
+ return 0;
}
-#define is_equal(a, b) ((a) == (b))
-int
-dht_linkfile_create_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
- int op_ret, int op_errno, inode_t *inode,
- struct iatt *stbuf, struct iatt *preparent,
- struct iatt *postparent, dict_t *xdata)
+static int
+dht_linkfile_create_cbk(call_frame_t *frame, void *cookie, xlator_t *this,
+ int op_ret, int op_errno, inode_t *inode,
+ struct iatt *stbuf, struct iatt *preparent,
+ struct iatt *postparent, dict_t *xdata)
{
- dht_local_t *local = NULL;
- xlator_t *subvol = NULL;
- call_frame_t *prev = NULL;
- dict_t *xattrs = NULL;
- dht_conf_t *conf = NULL;
- int ret = -1;
-
- local = frame->local;
-
- if (!op_ret)
- local->linked = _gf_true;
-
- FRAME_SU_UNDO (frame, dht_local_t);
-
- if (op_ret && (op_errno == EEXIST)) {
- conf = this->private;
- prev = cookie;
- subvol = prev->this;
- if (!subvol)
- goto out;
- xattrs = dict_new ();
- if (!xattrs)
- goto out;
- ret = dict_set_uint32 (xattrs, conf->link_xattr_name, 256);
- if (ret) {
- gf_msg (this->name, GF_LOG_ERROR, 0,
- DHT_MSG_DICT_SET_FAILED,
- "Failed to set dictionary value. key : %s",
- conf->link_xattr_name);
- goto out;
- }
-
- STACK_WIND (frame, dht_linkfile_lookup_cbk, subvol,
- subvol->fops->lookup, &local->loc, xattrs);
- if (xattrs)
- dict_unref (xattrs);
- return 0;
+ dht_local_t *local = NULL;
+ xlator_t *subvol = NULL;
+ dict_t *xattrs = NULL;
+ dht_conf_t *conf = NULL;
+ int ret = -1;
+
+ local = frame->local;
+
+ if (!op_ret)
+ local->linked = _gf_true;
+
+ FRAME_SU_UNDO(frame, dht_local_t);
+
+ if (op_ret && (op_errno == EEXIST)) {
+ conf = this->private;
+ subvol = cookie;
+ if (!subvol)
+ goto out;
+ xattrs = dict_new();
+ if (!xattrs)
+ goto out;
+ ret = dict_set_uint32(xattrs, conf->link_xattr_name, 256);
+ if (ret) {
+ gf_smsg(this->name, GF_LOG_ERROR, 0, DHT_MSG_DICT_SET_FAILED,
+ "mame=%s", conf->link_xattr_name, NULL);
+ goto out;
}
-out:
- local->linkfile.linkfile_cbk (frame, cookie, this, op_ret, op_errno,
- inode, stbuf, preparent, postparent,
- xdata);
+
+ STACK_WIND_COOKIE(frame, dht_linkfile_lookup_cbk, subvol, subvol,
+ subvol->fops->lookup, &local->linkfile.loc, xattrs);
if (xattrs)
- dict_unref (xattrs);
+ dict_unref(xattrs);
return 0;
+ }
+out:
+ local->linkfile.linkfile_cbk(frame, cookie, this, op_ret, op_errno, inode,
+ stbuf, preparent, postparent, xdata);
+ if (xattrs)
+ dict_unref(xattrs);
+ return 0;
}
-
int
-dht_linkfile_create (call_frame_t *frame, fop_mknod_cbk_t linkfile_cbk,
- xlator_t *this,
- xlator_t *tovol, xlator_t *fromvol, loc_t *loc)
+dht_linkfile_create(call_frame_t *frame, fop_mknod_cbk_t linkfile_cbk,
+ xlator_t *this, xlator_t *tovol, xlator_t *fromvol,
+ loc_t *loc)
{
- dht_local_t *local = NULL;
- dict_t *dict = NULL;
- int need_unref = 0;
- int ret = 0;
- dht_conf_t *conf = this->private;
- char gfid[GF_UUID_BUF_SIZE] = {0};
-
- local = frame->local;
- local->linkfile.linkfile_cbk = linkfile_cbk;
- local->linkfile.srcvol = tovol;
-
- local->linked = _gf_false;
-
- dict = local->params;
- if (!dict) {
- dict = dict_new ();
- if (!dict)
- goto out;
- need_unref = 1;
- }
-
-
- if (!gf_uuid_is_null (local->gfid)) {
- gf_uuid_unparse(local->gfid, gfid);
-
- ret = dict_set_static_bin (dict, "gfid-req", local->gfid, 16);
- if (ret)
- gf_msg ("dht-linkfile", GF_LOG_INFO, 0,
- DHT_MSG_DICT_SET_FAILED,
- "%s: Failed to set dictionary value: "
- "key = gfid-req, gfid = %s ", loc->path, gfid);
- } else {
- gf_uuid_unparse(loc->gfid, gfid);
- }
-
- ret = dict_set_str (dict, GLUSTERFS_INTERNAL_FOP_KEY, "yes");
+ dht_local_t *local = NULL;
+ dict_t *dict = NULL;
+ int need_unref = 0;
+ int ret = 0;
+ dht_conf_t *conf = this->private;
+ char gfid[GF_UUID_BUF_SIZE] = {0};
+
+ local = frame->local;
+ local->linkfile.linkfile_cbk = linkfile_cbk;
+ local->linkfile.srcvol = tovol;
+ loc_copy(&local->linkfile.loc, loc);
+
+ local->linked = _gf_false;
+
+ dict = local->params;
+ if (!dict) {
+ dict = dict_new();
+ if (!dict)
+ goto out;
+ need_unref = 1;
+ }
+
+ if (!gf_uuid_is_null(local->gfid)) {
+ gf_uuid_unparse(local->gfid, gfid);
+
+ ret = dict_set_gfuuid(dict, "gfid-req", local->gfid, true);
if (ret)
- gf_msg ("dht-linkfile", GF_LOG_INFO, 0,
- DHT_MSG_DICT_SET_FAILED,
- "%s: Failed to set dictionary value: key = %s,"
- " gfid = %s", loc->path,
- GLUSTERFS_INTERNAL_FOP_KEY, gfid);
-
- ret = dict_set_str (dict, conf->link_xattr_name, tovol->name);
-
- if (ret < 0) {
- gf_msg (frame->this->name, GF_LOG_INFO, 0,
- DHT_MSG_CREATE_LINK_FAILED,
- "%s: failed to initialize linkfile data, gfid = %s",
- loc->path, gfid);
- goto out;
- }
-
- local->link_subvol = fromvol;
- /* Always create as root:root. dht_linkfile_attr_heal fixes the
- * ownsership */
- FRAME_SU_DO (frame, dht_local_t);
- STACK_WIND (frame, dht_linkfile_create_cbk,
- fromvol, fromvol->fops->mknod, loc,
- S_IFREG | DHT_LINKFILE_MODE, 0, 0, dict);
-
- if (need_unref && dict)
- dict_unref (dict);
-
- return 0;
+ gf_smsg("dht-linkfile", GF_LOG_INFO, 0, DHT_MSG_DICT_SET_FAILED,
+ "path=%s", loc->path, "gfid=%s", gfid, NULL);
+ } else {
+ gf_uuid_unparse(loc->gfid, gfid);
+ }
+
+ ret = dict_set_str(dict, GLUSTERFS_INTERNAL_FOP_KEY, "yes");
+ if (ret)
+ gf_smsg("dht-linkfile", GF_LOG_INFO, 0, DHT_MSG_DICT_SET_FAILED,
+ "path=%s", loc->path, "key=%s", GLUSTERFS_INTERNAL_FOP_KEY,
+ "gfid=%s", gfid, NULL);
+
+ ret = dict_set_str(dict, conf->link_xattr_name, tovol->name);
+
+ if (ret < 0) {
+ gf_smsg(frame->this->name, GF_LOG_INFO, 0, DHT_MSG_CREATE_LINK_FAILED,
+ "path=%s", loc->path, "gfid=%s", gfid, NULL);
+ goto out;
+ }
+
+ local->link_subvol = fromvol;
+ /* Always create as root:root. dht_linkfile_attr_heal fixes the
+ * ownsership */
+ FRAME_SU_DO(frame, dht_local_t);
+ STACK_WIND_COOKIE(frame, dht_linkfile_create_cbk, fromvol, fromvol,
+ fromvol->fops->mknod, loc, S_IFREG | DHT_LINKFILE_MODE, 0,
+ 0, dict);
+
+ if (need_unref && dict)
+ dict_unref(dict);
+
+ return 0;
out:
- local->linkfile.linkfile_cbk (frame, NULL, frame->this, -1, ENOMEM,
- loc->inode, NULL, NULL, NULL, NULL);
+ local->linkfile.linkfile_cbk(frame, frame->this, frame->this, -1, ENOMEM,
+ loc->inode, NULL, NULL, NULL, NULL);
- if (need_unref && dict)
- dict_unref (dict);
+ if (need_unref && dict)
+ dict_unref(dict);
- return 0;
+ return 0;
}
-
int
-dht_linkfile_unlink_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
- int32_t op_ret, int32_t op_errno,
- struct iatt *preparent, struct iatt *postparent,
- dict_t *xdata)
+dht_linkfile_unlink_cbk(call_frame_t *frame, void *cookie, xlator_t *this,
+ int32_t op_ret, int32_t op_errno,
+ struct iatt *preparent, struct iatt *postparent,
+ dict_t *xdata)
{
- dht_local_t *local = NULL;
- call_frame_t *prev = NULL;
- xlator_t *subvol = NULL;
- char gfid[GF_UUID_BUF_SIZE] = {0};
-
- local = frame->local;
- prev = cookie;
- subvol = prev->this;
-
+ dht_local_t *local = NULL;
+ xlator_t *subvol = NULL;
+ char gfid[GF_UUID_BUF_SIZE] = {0};
- if (op_ret == -1) {
+ local = frame->local;
+ subvol = cookie;
- gf_uuid_unparse(local->loc.gfid, gfid);
- gf_msg (this->name, GF_LOG_INFO, op_errno,
- DHT_MSG_UNLINK_FAILED,
- "Unlinking linkfile %s (gfid = %s)on "
- "subvolume %s failed ",
- local->loc.path, gfid, subvol->name);
- }
+ if (op_ret == -1) {
+ gf_uuid_unparse(local->loc.gfid, gfid);
+ gf_smsg(this->name, GF_LOG_INFO, op_errno, DHT_MSG_UNLINK_FAILED,
+ "path=%s", local->loc.path, "gfid=%s", gfid, "subvolume=%s",
+ subvol->name, NULL);
+ }
- DHT_STACK_DESTROY (frame);
+ DHT_STACK_DESTROY(frame);
- return 0;
+ return 0;
}
-
int
-dht_linkfile_unlink (call_frame_t *frame, xlator_t *this,
- xlator_t *subvol, loc_t *loc)
+dht_linkfile_unlink(call_frame_t *frame, xlator_t *this, xlator_t *subvol,
+ loc_t *loc)
{
- call_frame_t *unlink_frame = NULL;
- dht_local_t *unlink_local = NULL;
+ call_frame_t *unlink_frame = NULL;
+ dht_local_t *unlink_local = NULL;
- unlink_frame = copy_frame (frame);
- if (!unlink_frame) {
- goto err;
- }
+ unlink_frame = copy_frame(frame);
+ if (!unlink_frame) {
+ goto err;
+ }
- /* Using non-fop value here, as anyways, 'local->fop' is not used in
- this particular case */
- unlink_local = dht_local_init (unlink_frame, loc, NULL,
- GF_FOP_MAXVALUE);
- if (!unlink_local) {
- goto err;
- }
+ /* Using non-fop value here, as anyways, 'local->fop' is not used in
+ this particular case */
+ unlink_local = dht_local_init(unlink_frame, loc, NULL, GF_FOP_MAXVALUE);
+ if (!unlink_local) {
+ goto err;
+ }
- STACK_WIND (unlink_frame, dht_linkfile_unlink_cbk,
- subvol, subvol->fops->unlink,
- &unlink_local->loc, 0, NULL);
+ STACK_WIND_COOKIE(unlink_frame, dht_linkfile_unlink_cbk, subvol, subvol,
+ subvol->fops->unlink, &unlink_local->loc, 0, NULL);
- return 0;
+ return 0;
err:
- if (unlink_frame)
- DHT_STACK_DESTROY (unlink_frame);
+ if (unlink_frame)
+ DHT_STACK_DESTROY(unlink_frame);
- return -1;
+ return -1;
}
-
xlator_t *
-dht_linkfile_subvol (xlator_t *this, inode_t *inode, struct iatt *stbuf,
- dict_t *xattr)
+dht_linkfile_subvol(xlator_t *this, inode_t *inode, struct iatt *stbuf,
+ dict_t *xattr)
{
- dht_conf_t *conf = NULL;
- xlator_t *subvol = NULL;
- void *volname = NULL;
- int i = 0, ret = 0;
+ dht_conf_t *conf = NULL;
+ xlator_t *subvol = NULL;
+ void *volname = NULL;
+ int i = 0, ret = 0;
- conf = this->private;
+ conf = this->private;
- if (!xattr)
- goto out;
+ if (!xattr)
+ goto out;
- ret = dict_get_ptr (xattr, conf->link_xattr_name, &volname);
+ ret = dict_get_ptr(xattr, conf->link_xattr_name, &volname);
- if ((-1 == ret) || !volname)
- goto out;
+ if ((-1 == ret) || !volname)
+ goto out;
- for (i = 0; i < conf->subvolume_cnt; i++) {
- if (strcmp (conf->subvolumes[i]->name, (char *)volname) == 0) {
- subvol = conf->subvolumes[i];
- break;
- }
+ for (i = 0; i < conf->subvolume_cnt; i++) {
+ if (strcmp(conf->subvolumes[i]->name, (char *)volname) == 0) {
+ subvol = conf->subvolumes[i];
+ break;
}
+ }
out:
- return subvol;
+ return subvol;
}
-int
-dht_linkfile_setattr_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
- int op_ret, int op_errno, struct iatt *statpre,
- struct iatt *statpost, dict_t *xdata)
+static int
+dht_linkfile_setattr_cbk(call_frame_t *frame, void *cookie, xlator_t *this,
+ int op_ret, int op_errno, struct iatt *statpre,
+ struct iatt *statpost, dict_t *xdata)
{
- dht_local_t *local = NULL;
- loc_t *loc = NULL;
+ dht_local_t *local = NULL;
+ loc_t *loc = NULL;
- local = frame->local;
- loc = &local->loc;
+ local = frame->local;
+ loc = &local->loc;
- if (op_ret)
- gf_msg (this->name, GF_LOG_ERROR, op_errno,
- DHT_MSG_SETATTR_FAILED,
- "Failed to set attr uid/gid on %s"
- " :<gfid:%s> ",
- (loc->path? loc->path: "NULL"),
- uuid_utoa(local->gfid));
+ if (op_ret)
+ gf_smsg(this->name, GF_LOG_ERROR, op_errno, DHT_MSG_SETATTR_FAILED,
+ "path=%s", (loc->path ? loc->path : "NULL"), "gfid=%s",
+ uuid_utoa(local->gfid), NULL);
- DHT_STACK_DESTROY (frame);
+ DHT_STACK_DESTROY(frame);
- return 0;
+ return 0;
}
int
-dht_linkfile_attr_heal (call_frame_t *frame, xlator_t *this)
+dht_linkfile_attr_heal(call_frame_t *frame, xlator_t *this)
{
- int ret = -1;
- call_frame_t *copy = NULL;
- dht_local_t *local = NULL;
- dht_local_t *copy_local = NULL;
- xlator_t *subvol = NULL;
- struct iatt stbuf = {0,};
- dict_t *xattr = NULL;
-
- local = frame->local;
-
- GF_VALIDATE_OR_GOTO ("dht", local, out);
- GF_VALIDATE_OR_GOTO ("dht", local->link_subvol, out);
-
- if (local->stbuf.ia_type == IA_INVAL)
- return 0;
+ int ret = -1;
+ call_frame_t *copy = NULL;
+ dht_local_t *local = NULL;
+ dht_local_t *copy_local = NULL;
+ xlator_t *subvol = NULL;
+ struct iatt stbuf = {
+ 0,
+ };
+ dict_t *xattr = NULL;
+
+ local = frame->local;
+
+ GF_VALIDATE_OR_GOTO("dht", local, out);
+ GF_VALIDATE_OR_GOTO("dht", local->link_subvol, out);
+
+ if (local->stbuf.ia_type == IA_INVAL)
+ return 0;
- DHT_MARK_FOP_INTERNAL (xattr);
+ DHT_MARK_FOP_INTERNAL(xattr);
- gf_uuid_copy (local->loc.gfid, local->stbuf.ia_gfid);
+ gf_uuid_copy(local->loc.gfid, local->stbuf.ia_gfid);
- copy = copy_frame (frame);
+ copy = copy_frame(frame);
- if (!copy)
- goto out;
+ if (!copy)
+ goto out;
- copy_local = dht_local_init (copy, &local->loc, NULL, 0);
+ copy_local = dht_local_init(copy, &local->loc, NULL, 0);
- if (!copy_local)
- goto out;
+ if (!copy_local)
+ goto out;
- stbuf = local->stbuf;
- subvol = local->link_subvol;
+ stbuf = local->stbuf;
+ subvol = local->link_subvol;
- copy->local = copy_local;
+ copy->local = copy_local;
- FRAME_SU_DO (copy, dht_local_t);
+ FRAME_SU_DO(copy, dht_local_t);
- STACK_WIND (copy, dht_linkfile_setattr_cbk, subvol,
- subvol->fops->setattr, &copy_local->loc,
- &stbuf, (GF_SET_ATTR_UID | GF_SET_ATTR_GID), xattr);
- ret = 0;
+ STACK_WIND(copy, dht_linkfile_setattr_cbk, subvol, subvol->fops->setattr,
+ &copy_local->loc, &stbuf, (GF_SET_ATTR_UID | GF_SET_ATTR_GID),
+ xattr);
+ ret = 0;
out:
- if ((ret < 0) && (copy))
- DHT_STACK_DESTROY (copy);
+ if ((ret < 0) && (copy))
+ DHT_STACK_DESTROY(copy);
- if (xattr)
- dict_unref (xattr);
+ if (xattr)
+ dict_unref(xattr);
- return ret;
+ return ret;
}
diff --git a/xlators/cluster/dht/src/dht-lock.c b/xlators/cluster/dht/src/dht-lock.c
new file mode 100644
index 00000000000..638821ccee5
--- /dev/null
+++ b/xlators/cluster/dht/src/dht-lock.c
@@ -0,0 +1,1392 @@
+/*
+ Copyright (c) 2016 Red Hat, Inc. <http://www.redhat.com>
+ This file is part of GlusterFS.
+
+ This file is licensed to you under your choice of the GNU Lesser
+ General Public License, version 3 or any later version (LGPLv3 or
+ later), or the GNU General Public License, version 2 (GPLv2), in all
+ cases as published by the Free Software Foundation.
+*/
+
+#include "dht-lock.h"
+
+static char *
+dht_lock_asprintf(dht_lock_t *lock)
+{
+ char *lk_buf = NULL;
+ char gfid[GF_UUID_BUF_SIZE] = {
+ 0,
+ };
+
+ if (lock == NULL)
+ goto out;
+
+ uuid_utoa_r(lock->loc.gfid, gfid);
+
+ gf_asprintf(&lk_buf, "%s:%s", lock->xl->name, gfid);
+
+out:
+ return lk_buf;
+}
+
+static void
+dht_log_lk_array(char *name, gf_loglevel_t log_level, dht_lock_t **lk_array,
+ int count)
+{
+ int i = 0;
+ char *lk_buf = NULL;
+
+ if ((lk_array == NULL) || (count == 0))
+ goto out;
+
+ for (i = 0; i < count; i++) {
+ lk_buf = dht_lock_asprintf(lk_array[i]);
+ if (!lk_buf)
+ goto out;
+
+ gf_smsg(name, log_level, 0, DHT_MSG_LK_ARRAY_INFO, "index=%d", i,
+ "lk_buf=%s", lk_buf, NULL);
+ GF_FREE(lk_buf);
+ }
+
+out:
+ return;
+}
+
+static void
+dht_lock_stack_destroy(call_frame_t *lock_frame, dht_lock_type_t lk)
+{
+ dht_local_t *local = NULL;
+
+ local = lock_frame->local;
+
+ if (lk == DHT_INODELK) {
+ local->lock[0].layout.my_layout.locks = NULL;
+ local->lock[0].layout.my_layout.lk_count = 0;
+ } else {
+ local->lock[0].ns.directory_ns.locks = NULL;
+ local->lock[0].ns.directory_ns.lk_count = 0;
+ }
+
+ DHT_STACK_DESTROY(lock_frame);
+ return;
+}
+
+static void
+dht_lock_free(dht_lock_t *lock)
+{
+ if (lock == NULL)
+ goto out;
+
+ loc_wipe(&lock->loc);
+ GF_FREE(lock->domain);
+ GF_FREE(lock->basename);
+ mem_put(lock);
+
+out:
+ return;
+}
+
+static void
+dht_set_lkowner(dht_lock_t **lk_array, int count, gf_lkowner_t *lkowner)
+{
+ int i = 0;
+
+ if (!lk_array || !lkowner)
+ goto out;
+
+ for (i = 0; i < count; i++) {
+ lk_array[i]->lk_owner = *lkowner;
+ }
+
+out:
+ return;
+}
+
+static int
+dht_lock_request_cmp(const void *val1, const void *val2)
+{
+ dht_lock_t *lock1 = NULL;
+ dht_lock_t *lock2 = NULL;
+ int ret = -1;
+
+ lock1 = *(dht_lock_t **)val1;
+ lock2 = *(dht_lock_t **)val2;
+
+ GF_VALIDATE_OR_GOTO("dht-locks", lock1, out);
+ GF_VALIDATE_OR_GOTO("dht-locks", lock2, out);
+
+ ret = strcmp(lock1->xl->name, lock2->xl->name);
+
+ if (ret == 0) {
+ ret = gf_uuid_compare(lock1->loc.gfid, lock2->loc.gfid);
+ }
+
+out:
+ return ret;
+}
+
+static int
+dht_lock_order_requests(dht_lock_t **locks, int count)
+{
+ int ret = -1;
+
+ if (!locks || !count)
+ goto out;
+
+ qsort(locks, count, sizeof(*locks), dht_lock_request_cmp);
+ ret = 0;
+
+out:
+ return ret;
+}
+
+void
+dht_lock_array_free(dht_lock_t **lk_array, int count)
+{
+ int i = 0;
+ dht_lock_t *lock = NULL;
+
+ if (lk_array == NULL)
+ goto out;
+
+ for (i = 0; i < count; i++) {
+ lock = lk_array[i];
+ lk_array[i] = NULL;
+ dht_lock_free(lock);
+ }
+
+out:
+ return;
+}
+
+int32_t
+dht_lock_count(dht_lock_t **lk_array, int lk_count)
+{
+ int i = 0, locked = 0;
+
+ if ((lk_array == NULL) || (lk_count == 0))
+ goto out;
+
+ for (i = 0; i < lk_count; i++) {
+ if (lk_array[i]->locked)
+ locked++;
+ }
+out:
+ return locked;
+}
+
+static call_frame_t *
+dht_lock_frame(call_frame_t *parent_frame)
+{
+ call_frame_t *lock_frame = NULL;
+
+ lock_frame = copy_frame(parent_frame);
+ if (lock_frame == NULL)
+ goto out;
+
+ set_lk_owner_from_ptr(&lock_frame->root->lk_owner, parent_frame->root);
+
+out:
+ return lock_frame;
+}
+
+dht_lock_t *
+dht_lock_new(xlator_t *this, xlator_t *xl, loc_t *loc, short type,
+ const char *domain, const char *basename,
+ dht_reaction_type_t do_on_failure)
+{
+ dht_conf_t *conf = NULL;
+ dht_lock_t *lock = NULL;
+
+ conf = this->private;
+
+ lock = mem_get0(conf->lock_pool);
+ if (lock == NULL)
+ goto out;
+
+ lock->xl = xl;
+ lock->type = type;
+ lock->do_on_failure = do_on_failure;
+
+ lock->domain = gf_strdup(domain);
+ if (lock->domain == NULL) {
+ dht_lock_free(lock);
+ lock = NULL;
+ goto out;
+ }
+
+ if (basename) {
+ lock->basename = gf_strdup(basename);
+ if (lock->basename == NULL) {
+ dht_lock_free(lock);
+ lock = NULL;
+ goto out;
+ }
+ }
+
+ /* Fill only inode and gfid.
+ posix and protocol/server give preference to pargfid/basename over
+ gfid/inode for resolution if all the three parameters of loc_t are
+ present. I want to avoid the following hypothetical situation:
+
+ 1. rebalance did a lookup on a dentry and got a gfid.
+ 2. rebalance acquires lock on loc_t which was filled with gfid and
+ path (pargfid/bname) from step 1.
+ 3. somebody deleted and recreated the same file
+ 4. rename on the same path acquires lock on loc_t which now points
+ to a different inode (and hence gets the lock).
+ 5. rebalance continues to migrate file (note that not all fops done
+ by rebalance during migration are inode/gfid based Eg., unlink)
+ 6. rename continues.
+ */
+ lock->loc.inode = inode_ref(loc->inode);
+ loc_gfid(loc, lock->loc.gfid);
+
+out:
+ return lock;
+}
+
+static int
+dht_local_entrylk_init(call_frame_t *frame, dht_lock_t **lk_array, int lk_count,
+ fop_entrylk_cbk_t entrylk_cbk)
+{
+ int ret = -1;
+ dht_local_t *local = NULL;
+
+ local = frame->local;
+
+ if (local == NULL) {
+ local = dht_local_init(frame, NULL, NULL, 0);
+ }
+
+ if (local == NULL) {
+ goto out;
+ }
+
+ local->lock[0].ns.directory_ns.entrylk_cbk = entrylk_cbk;
+ local->lock[0].ns.directory_ns.locks = lk_array;
+ local->lock[0].ns.directory_ns.lk_count = lk_count;
+
+ ret = dht_lock_order_requests(local->lock[0].ns.directory_ns.locks,
+ local->lock[0].ns.directory_ns.lk_count);
+ if (ret < 0)
+ goto out;
+
+ ret = 0;
+out:
+ return ret;
+}
+
+static void
+dht_entrylk_done(call_frame_t *lock_frame)
+{
+ fop_entrylk_cbk_t entrylk_cbk = NULL;
+ call_frame_t *main_frame = NULL;
+ dht_local_t *local = NULL;
+
+ local = lock_frame->local;
+ main_frame = local->main_frame;
+
+ local->lock[0].ns.directory_ns.locks = NULL;
+ local->lock[0].ns.directory_ns.lk_count = 0;
+
+ entrylk_cbk = local->lock[0].ns.directory_ns.entrylk_cbk;
+ local->lock[0].ns.directory_ns.entrylk_cbk = NULL;
+
+ entrylk_cbk(main_frame, NULL, main_frame->this,
+ local->lock[0].ns.directory_ns.op_ret,
+ local->lock[0].ns.directory_ns.op_errno, NULL);
+
+ dht_lock_stack_destroy(lock_frame, DHT_ENTRYLK);
+ return;
+}
+
+static int32_t
+dht_unlock_entrylk_done(call_frame_t *frame, void *cookie, xlator_t *this,
+ int32_t op_ret, int32_t op_errno, dict_t *xdata)
+{
+ dht_local_t *local = NULL;
+ char gfid[GF_UUID_BUF_SIZE] = {0};
+
+ local = frame->local;
+ gf_uuid_unparse(local->lock[0].ns.directory_ns.locks[0]->loc.inode->gfid,
+ gfid);
+
+ if (op_ret < 0) {
+ gf_smsg(this->name, GF_LOG_WARNING, op_errno,
+ DHT_MSG_UNLOCK_GFID_FAILED, "gfid=%s", gfid,
+ "DHT_LAYOUT_HEAL_DOMAIN", NULL);
+ }
+
+ DHT_STACK_DESTROY(frame);
+ return 0;
+}
+
+static int32_t
+dht_unlock_entrylk_cbk(call_frame_t *frame, void *cookie, xlator_t *this,
+ int32_t op_ret, int32_t op_errno, dict_t *xdata)
+{
+ dht_local_t *local = NULL;
+ int lk_index = 0, call_cnt = 0;
+ char gfid[GF_UUID_BUF_SIZE] = {0};
+
+ lk_index = (long)cookie;
+
+ local = frame->local;
+
+ uuid_utoa_r(local->lock[0].ns.directory_ns.locks[lk_index]->loc.gfid, gfid);
+
+ if (op_ret < 0) {
+ gf_smsg(this->name, GF_LOG_WARNING, op_errno, DHT_MSG_UNLOCKING_FAILED,
+ "name=%s",
+ local->lock[0].ns.directory_ns.locks[lk_index]->xl->name,
+ "gfid=%s", gfid, NULL);
+ } else {
+ local->lock[0].ns.directory_ns.locks[lk_index]->locked = 0;
+ }
+
+ call_cnt = dht_frame_return(frame);
+ if (is_last_call(call_cnt)) {
+ dht_entrylk_done(frame);
+ }
+
+ return 0;
+}
+
+static int32_t
+dht_unlock_entrylk(call_frame_t *frame, dht_lock_t **lk_array, int lk_count,
+ fop_entrylk_cbk_t entrylk_cbk)
+{
+ dht_local_t *local = NULL;
+ int ret = -1, i = 0;
+ call_frame_t *lock_frame = NULL;
+ int call_cnt = 0;
+
+ GF_VALIDATE_OR_GOTO("dht-locks", frame, done);
+ GF_VALIDATE_OR_GOTO(frame->this->name, lk_array, done);
+ GF_VALIDATE_OR_GOTO(frame->this->name, entrylk_cbk, done);
+
+ call_cnt = dht_lock_count(lk_array, lk_count);
+ if (call_cnt == 0) {
+ ret = 0;
+ goto done;
+ }
+
+ lock_frame = dht_lock_frame(frame);
+ if (lock_frame == NULL) {
+ gf_smsg(frame->this->name, GF_LOG_WARNING, 0,
+ DHT_MSG_ALLOC_FRAME_FAILED_NOT_UNLOCKING_FOLLOWING_ENTRYLKS,
+ NULL);
+
+ dht_log_lk_array(frame->this->name, GF_LOG_WARNING, lk_array, lk_count);
+ goto done;
+ }
+
+ ret = dht_local_entrylk_init(lock_frame, lk_array, lk_count, entrylk_cbk);
+ if (ret < 0) {
+ gf_smsg(frame->this->name, GF_LOG_WARNING, 0,
+ DHT_MSG_LOCAL_LOCKS_STORE_FAILED_UNLOCKING_FOLLOWING_ENTRYLK,
+ NULL);
+
+ dht_log_lk_array(frame->this->name, GF_LOG_WARNING, lk_array, lk_count);
+
+ goto done;
+ }
+
+ local = lock_frame->local;
+ local->main_frame = frame;
+ local->call_cnt = call_cnt;
+
+ for (i = 0; i < local->lock[0].ns.directory_ns.lk_count; i++) {
+ if (!local->lock[0].ns.directory_ns.locks[i]->locked)
+ continue;
+
+ lock_frame->root
+ ->lk_owner = local->lock[0].ns.directory_ns.locks[i]->lk_owner;
+ STACK_WIND_COOKIE(
+ lock_frame, dht_unlock_entrylk_cbk, (void *)(long)i,
+ local->lock[0].ns.directory_ns.locks[i]->xl,
+ local->lock[0].ns.directory_ns.locks[i]->xl->fops->entrylk,
+ local->lock[0].ns.directory_ns.locks[i]->domain,
+ &local->lock[0].ns.directory_ns.locks[i]->loc,
+ local->lock[0].ns.directory_ns.locks[i]->basename, ENTRYLK_UNLOCK,
+ ENTRYLK_WRLCK, NULL);
+ if (!--call_cnt)
+ break;
+ }
+
+ return 0;
+
+done:
+ if (lock_frame)
+ dht_lock_stack_destroy(lock_frame, DHT_ENTRYLK);
+
+ /* no locks acquired, invoke entrylk_cbk */
+ if (ret == 0)
+ entrylk_cbk(frame, NULL, frame->this, 0, 0, NULL);
+
+ return ret;
+}
+
+int32_t
+dht_unlock_entrylk_wrapper(call_frame_t *frame, dht_elock_wrap_t *entrylk)
+{
+ dht_local_t *local = NULL, *lock_local = NULL;
+ call_frame_t *lock_frame = NULL;
+ char pgfid[GF_UUID_BUF_SIZE] = {0};
+ int ret = 0;
+
+ local = frame->local;
+
+ if (!entrylk || !entrylk->locks)
+ goto out;
+
+ gf_uuid_unparse(local->loc.parent->gfid, pgfid);
+
+ lock_frame = copy_frame(frame);
+ if (lock_frame == NULL) {
+ gf_smsg(frame->this->name, GF_LOG_WARNING, ENOMEM,
+ DHT_MSG_COPY_FRAME_FAILED, "pgfid=%s", pgfid, "name=%s",
+ local->loc.name, "path=%s", local->loc.path, NULL);
+ goto done;
+ }
+
+ lock_local = dht_local_init(lock_frame, NULL, NULL, 0);
+ if (lock_local == NULL) {
+ gf_smsg(frame->this->name, GF_LOG_WARNING, ENOMEM,
+ DHT_MSG_CREATE_FAILED, "local", "pgfid=%s", pgfid, "name=%s",
+ local->loc.name, "path=%s", local->loc.path, NULL);
+ goto done;
+ }
+
+ lock_frame->local = lock_local;
+
+ lock_local->lock[0].ns.directory_ns.locks = entrylk->locks;
+ lock_local->lock[0].ns.directory_ns.lk_count = entrylk->lk_count;
+ entrylk->locks = NULL;
+ entrylk->lk_count = 0;
+
+ ret = dht_unlock_entrylk(
+ lock_frame, lock_local->lock[0].ns.directory_ns.locks,
+ lock_local->lock[0].ns.directory_ns.lk_count, dht_unlock_entrylk_done);
+ if (ret)
+ goto done;
+
+ lock_frame = NULL;
+
+done:
+ if (lock_frame != NULL) {
+ DHT_STACK_DESTROY(lock_frame);
+ }
+
+out:
+ return 0;
+}
+
+static int
+dht_entrylk_cleanup_cbk(call_frame_t *frame, void *cookie, xlator_t *this,
+ int32_t op_ret, int32_t op_errno, dict_t *xdata)
+{
+ dht_entrylk_done(frame);
+ return 0;
+}
+
+static void
+dht_entrylk_cleanup(call_frame_t *lock_frame)
+{
+ dht_lock_t **lk_array = NULL;
+ int lk_count = 0, lk_acquired = 0;
+ dht_local_t *local = NULL;
+
+ local = lock_frame->local;
+
+ lk_array = local->lock[0].ns.directory_ns.locks;
+ lk_count = local->lock[0].ns.directory_ns.lk_count;
+
+ lk_acquired = dht_lock_count(lk_array, lk_count);
+ if (lk_acquired != 0) {
+ dht_unlock_entrylk(lock_frame, lk_array, lk_count,
+ dht_entrylk_cleanup_cbk);
+ } else {
+ dht_entrylk_done(lock_frame);
+ }
+
+ return;
+}
+
+static int32_t
+dht_blocking_entrylk_cbk(call_frame_t *frame, void *cookie, xlator_t *this,
+ int32_t op_ret, int32_t op_errno, dict_t *xdata)
+{
+ int lk_index = 0;
+ int i = 0;
+ dht_local_t *local = NULL;
+
+ lk_index = (long)cookie;
+
+ local = frame->local;
+ if (op_ret == 0) {
+ local->lock[0].ns.directory_ns.locks[lk_index]->locked = _gf_true;
+ } else {
+ switch (op_errno) {
+ case ESTALE:
+ case ENOENT:
+ if (local->lock[0]
+ .ns.directory_ns.locks[lk_index]
+ ->do_on_failure != IGNORE_ENOENT_ESTALE) {
+ local->lock[0].ns.directory_ns.op_ret = -1;
+ local->lock[0].ns.directory_ns.op_errno = op_errno;
+ goto cleanup;
+ }
+ break;
+ default:
+ local->lock[0].ns.directory_ns.op_ret = -1;
+ local->lock[0].ns.directory_ns.op_errno = op_errno;
+ goto cleanup;
+ }
+ }
+
+ if (lk_index == (local->lock[0].ns.directory_ns.lk_count - 1)) {
+ for (i = 0; (i < local->lock[0].ns.directory_ns.lk_count) &&
+ (!local->lock[0].ns.directory_ns.locks[i]->locked);
+ i++)
+ ;
+
+ if (i == local->lock[0].ns.directory_ns.lk_count) {
+ local->lock[0].ns.directory_ns.op_ret = -1;
+ local->lock[0].ns.directory_ns.op_errno = op_errno;
+ }
+
+ dht_entrylk_done(frame);
+ } else {
+ dht_blocking_entrylk_rec(frame, ++lk_index);
+ }
+
+ return 0;
+
+cleanup:
+ dht_entrylk_cleanup(frame);
+
+ return 0;
+}
+
+void
+dht_blocking_entrylk_rec(call_frame_t *frame, int i)
+{
+ dht_local_t *local = NULL;
+
+ local = frame->local;
+
+ STACK_WIND_COOKIE(
+ frame, dht_blocking_entrylk_cbk, (void *)(long)i,
+ local->lock[0].ns.directory_ns.locks[i]->xl,
+ local->lock[0].ns.directory_ns.locks[i]->xl->fops->entrylk,
+ local->lock[0].ns.directory_ns.locks[i]->domain,
+ &local->lock[0].ns.directory_ns.locks[i]->loc,
+ local->lock[0].ns.directory_ns.locks[i]->basename, ENTRYLK_LOCK,
+ ENTRYLK_WRLCK, NULL);
+
+ return;
+}
+
+int
+dht_blocking_entrylk(call_frame_t *frame, dht_lock_t **lk_array, int lk_count,
+ fop_entrylk_cbk_t entrylk_cbk)
+{
+ int ret = -1;
+ call_frame_t *lock_frame = NULL;
+ dht_local_t *local = NULL;
+
+ GF_VALIDATE_OR_GOTO("dht-locks", frame, out);
+ GF_VALIDATE_OR_GOTO(frame->this->name, lk_array, out);
+ GF_VALIDATE_OR_GOTO(frame->this->name, entrylk_cbk, out);
+
+ lock_frame = dht_lock_frame(frame);
+ if (lock_frame == NULL)
+ goto out;
+
+ ret = dht_local_entrylk_init(lock_frame, lk_array, lk_count, entrylk_cbk);
+ if (ret < 0) {
+ goto out;
+ }
+
+ dht_set_lkowner(lk_array, lk_count, &lock_frame->root->lk_owner);
+
+ local = lock_frame->local;
+ local->main_frame = frame;
+
+ dht_blocking_entrylk_rec(lock_frame, 0);
+
+ return 0;
+out:
+ if (lock_frame)
+ dht_lock_stack_destroy(lock_frame, DHT_ENTRYLK);
+
+ return -1;
+}
+
+static int
+dht_local_inodelk_init(call_frame_t *frame, dht_lock_t **lk_array, int lk_count,
+ fop_inodelk_cbk_t inodelk_cbk)
+{
+ int ret = -1;
+ dht_local_t *local = NULL;
+
+ local = frame->local;
+
+ if (local == NULL) {
+ local = dht_local_init(frame, NULL, NULL, 0);
+ }
+
+ if (local == NULL) {
+ goto out;
+ }
+
+ local->lock[0].layout.my_layout.inodelk_cbk = inodelk_cbk;
+ local->lock[0].layout.my_layout.locks = lk_array;
+ local->lock[0].layout.my_layout.lk_count = lk_count;
+
+ ret = dht_lock_order_requests(local->lock[0].layout.my_layout.locks,
+ local->lock[0].layout.my_layout.lk_count);
+ if (ret < 0)
+ goto out;
+
+ ret = 0;
+out:
+ return ret;
+}
+
+static void
+dht_inodelk_done(call_frame_t *lock_frame)
+{
+ fop_inodelk_cbk_t inodelk_cbk = NULL;
+ call_frame_t *main_frame = NULL;
+ dht_local_t *local = NULL;
+
+ local = lock_frame->local;
+ main_frame = local->main_frame;
+
+ local->lock[0].layout.my_layout.locks = NULL;
+ local->lock[0].layout.my_layout.lk_count = 0;
+
+ inodelk_cbk = local->lock[0].layout.my_layout.inodelk_cbk;
+ local->lock[0].layout.my_layout.inodelk_cbk = NULL;
+
+ inodelk_cbk(main_frame, NULL, main_frame->this,
+ local->lock[0].layout.my_layout.op_ret,
+ local->lock[0].layout.my_layout.op_errno, NULL);
+
+ dht_lock_stack_destroy(lock_frame, DHT_INODELK);
+ return;
+}
+
+static int32_t
+dht_unlock_inodelk_cbk(call_frame_t *frame, void *cookie, xlator_t *this,
+ int32_t op_ret, int32_t op_errno, dict_t *xdata)
+{
+ dht_local_t *local = NULL;
+ int lk_index = 0, call_cnt = 0;
+ char gfid[GF_UUID_BUF_SIZE] = {0};
+
+ lk_index = (long)cookie;
+
+ local = frame->local;
+ if (op_ret < 0) {
+ uuid_utoa_r(local->lock[0].layout.my_layout.locks[lk_index]->loc.gfid,
+ gfid);
+
+ gf_smsg(this->name, GF_LOG_WARNING, op_errno, DHT_MSG_UNLOCKING_FAILED,
+ "name=%s",
+ local->lock[0].layout.my_layout.locks[lk_index]->xl->name,
+ "gfid=%s", gfid, NULL);
+ } else {
+ local->lock[0].layout.my_layout.locks[lk_index]->locked = 0;
+ }
+
+ call_cnt = dht_frame_return(frame);
+ if (is_last_call(call_cnt)) {
+ dht_inodelk_done(frame);
+ }
+
+ return 0;
+}
+
+static int32_t
+dht_unlock_inodelk_done(call_frame_t *frame, void *cookie, xlator_t *this,
+ int32_t op_ret, int32_t op_errno, dict_t *xdata)
+{
+ dht_local_t *local = NULL;
+ char gfid[GF_UUID_BUF_SIZE] = {0};
+
+ local = frame->local;
+ gf_uuid_unparse(local->lock[0].layout.my_layout.locks[0]->loc.inode->gfid,
+ gfid);
+
+ if (op_ret < 0) {
+ gf_smsg(this->name, GF_LOG_WARNING, op_errno,
+ DHT_MSG_UNLOCK_GFID_FAILED, "DHT_LAYOUT_HEAL_DOMAIN gfid=%s",
+ gfid, NULL);
+ }
+
+ DHT_STACK_DESTROY(frame);
+ return 0;
+}
+
+int32_t
+dht_unlock_inodelk(call_frame_t *frame, dht_lock_t **lk_array, int lk_count,
+ fop_inodelk_cbk_t inodelk_cbk)
+{
+ dht_local_t *local = NULL;
+ struct gf_flock flock = {
+ 0,
+ };
+ int ret = -1, i = 0;
+ call_frame_t *lock_frame = NULL;
+ int call_cnt = 0;
+
+ GF_VALIDATE_OR_GOTO("dht-locks", frame, done);
+ GF_VALIDATE_OR_GOTO(frame->this->name, lk_array, done);
+ GF_VALIDATE_OR_GOTO(frame->this->name, inodelk_cbk, done);
+
+ call_cnt = dht_lock_count(lk_array, lk_count);
+ if (call_cnt == 0) {
+ ret = 0;
+ goto done;
+ }
+
+ lock_frame = dht_lock_frame(frame);
+ if (lock_frame == NULL) {
+ gf_smsg(frame->this->name, GF_LOG_WARNING, 0,
+ DHT_MSG_ALLOC_FRAME_FAILED_NOT_UNLOCKING_FOLLOWING_ENTRYLKS,
+ NULL);
+
+ dht_log_lk_array(frame->this->name, GF_LOG_WARNING, lk_array, lk_count);
+ goto done;
+ }
+
+ ret = dht_local_inodelk_init(lock_frame, lk_array, lk_count, inodelk_cbk);
+ if (ret < 0) {
+ gf_smsg(frame->this->name, GF_LOG_WARNING, 0,
+ DHT_MSG_LOCAL_LOCKS_STORE_FAILED_UNLOCKING_FOLLOWING_ENTRYLK,
+ NULL);
+
+ dht_log_lk_array(frame->this->name, GF_LOG_WARNING, lk_array, lk_count);
+
+ goto done;
+ }
+
+ local = lock_frame->local;
+ local->main_frame = frame;
+ local->call_cnt = call_cnt;
+
+ flock.l_type = F_UNLCK;
+
+ for (i = 0; i < local->lock[0].layout.my_layout.lk_count; i++) {
+ if (!local->lock[0].layout.my_layout.locks[i]->locked)
+ continue;
+
+ lock_frame->root
+ ->lk_owner = local->lock[0].layout.my_layout.locks[i]->lk_owner;
+ STACK_WIND_COOKIE(
+ lock_frame, dht_unlock_inodelk_cbk, (void *)(long)i,
+ local->lock[0].layout.my_layout.locks[i]->xl,
+ local->lock[0].layout.my_layout.locks[i]->xl->fops->inodelk,
+ local->lock[0].layout.my_layout.locks[i]->domain,
+ &local->lock[0].layout.my_layout.locks[i]->loc, F_SETLK, &flock,
+ NULL);
+ if (!--call_cnt)
+ break;
+ }
+
+ return 0;
+
+done:
+ if (lock_frame)
+ dht_lock_stack_destroy(lock_frame, DHT_INODELK);
+
+ /* no locks acquired, invoke inodelk_cbk */
+ if (ret == 0)
+ inodelk_cbk(frame, NULL, frame->this, 0, 0, NULL);
+
+ return ret;
+}
+
+int32_t
+dht_unlock_inodelk_wrapper(call_frame_t *frame, dht_ilock_wrap_t *inodelk)
+{
+ dht_local_t *local = NULL, *lock_local = NULL;
+ call_frame_t *lock_frame = NULL;
+ char pgfid[GF_UUID_BUF_SIZE] = {0};
+ int ret = 0;
+
+ local = frame->local;
+
+ if (!inodelk || !inodelk->locks)
+ goto out;
+
+ gf_uuid_unparse(local->loc.parent->gfid, pgfid);
+
+ lock_frame = copy_frame(frame);
+ if (lock_frame == NULL) {
+ gf_smsg(frame->this->name, GF_LOG_WARNING, ENOMEM,
+ DHT_MSG_COPY_FRAME_FAILED, "pgfid=%s", pgfid, "name=%s",
+ local->loc.name, "path=%s", local->loc.path, NULL);
+ goto done;
+ }
+
+ lock_local = dht_local_init(lock_frame, NULL, NULL, 0);
+ if (lock_local == NULL) {
+ gf_smsg(frame->this->name, GF_LOG_WARNING, ENOMEM,
+ DHT_MSG_CREATE_FAILED, "local", "gfid=%s", pgfid, "name=%s",
+ local->loc.name, "path=%s", local->loc.path, NULL);
+ goto done;
+ }
+
+ lock_frame->local = lock_local;
+
+ lock_local->lock[0].layout.my_layout.locks = inodelk->locks;
+ lock_local->lock[0].layout.my_layout.lk_count = inodelk->lk_count;
+ inodelk->locks = NULL;
+ inodelk->lk_count = 0;
+
+ ret = dht_unlock_inodelk(
+ lock_frame, lock_local->lock[0].layout.my_layout.locks,
+ lock_local->lock[0].layout.my_layout.lk_count, dht_unlock_inodelk_done);
+
+ if (ret)
+ goto done;
+
+ lock_frame = NULL;
+
+done:
+ if (lock_frame != NULL) {
+ DHT_STACK_DESTROY(lock_frame);
+ }
+out:
+ return 0;
+}
+
+static int
+dht_inodelk_cleanup_cbk(call_frame_t *frame, void *cookie, xlator_t *this,
+ int32_t op_ret, int32_t op_errno, dict_t *xdata)
+{
+ dht_inodelk_done(frame);
+ return 0;
+}
+
+static void
+dht_inodelk_cleanup(call_frame_t *lock_frame)
+{
+ dht_lock_t **lk_array = NULL;
+ int lk_count = 0, lk_acquired = 0;
+ dht_local_t *local = NULL;
+
+ local = lock_frame->local;
+
+ lk_array = local->lock[0].layout.my_layout.locks;
+ lk_count = local->lock[0].layout.my_layout.lk_count;
+
+ lk_acquired = dht_lock_count(lk_array, lk_count);
+ if (lk_acquired != 0) {
+ dht_unlock_inodelk(lock_frame, lk_array, lk_count,
+ dht_inodelk_cleanup_cbk);
+ } else {
+ dht_inodelk_done(lock_frame);
+ }
+
+ return;
+}
+
+static int32_t
+dht_nonblocking_inodelk_cbk(call_frame_t *frame, void *cookie, xlator_t *this,
+ int32_t op_ret, int32_t op_errno, dict_t *xdata)
+{
+ dht_local_t *local = NULL;
+ int lk_index = 0, call_cnt = 0;
+ char gfid[GF_UUID_BUF_SIZE] = {0};
+
+ local = frame->local;
+ lk_index = (long)cookie;
+
+ if (op_ret == -1) {
+ local->lock[0].layout.my_layout.op_ret = -1;
+ local->lock[0].layout.my_layout.op_errno = op_errno;
+
+ if (local && local->lock[0].layout.my_layout.locks[lk_index]) {
+ uuid_utoa_r(local->lock[0]
+ .layout.my_layout.locks[lk_index]
+ ->loc.inode->gfid,
+ gfid);
+
+ gf_msg_debug(
+ this->name, op_errno,
+ "inodelk failed on gfid: %s "
+ "subvolume: %s",
+ gfid,
+ local->lock[0].layout.my_layout.locks[lk_index]->xl->name);
+ }
+
+ goto out;
+ }
+
+ local->lock[0].layout.my_layout.locks[lk_index]->locked = _gf_true;
+
+out:
+ call_cnt = dht_frame_return(frame);
+ if (is_last_call(call_cnt)) {
+ if (local->lock[0].layout.my_layout.op_ret < 0) {
+ dht_inodelk_cleanup(frame);
+ return 0;
+ }
+
+ dht_inodelk_done(frame);
+ }
+
+ return 0;
+}
+
+int
+dht_nonblocking_inodelk(call_frame_t *frame, dht_lock_t **lk_array,
+ int lk_count, fop_inodelk_cbk_t inodelk_cbk)
+{
+ struct gf_flock flock = {
+ 0,
+ };
+ int i = 0, ret = 0;
+ dht_local_t *local = NULL;
+ call_frame_t *lock_frame = NULL;
+
+ GF_VALIDATE_OR_GOTO("dht-locks", frame, out);
+ GF_VALIDATE_OR_GOTO(frame->this->name, lk_array, out);
+ GF_VALIDATE_OR_GOTO(frame->this->name, inodelk_cbk, out);
+
+ lock_frame = dht_lock_frame(frame);
+ if (lock_frame == NULL)
+ goto out;
+
+ ret = dht_local_inodelk_init(lock_frame, lk_array, lk_count, inodelk_cbk);
+ if (ret < 0) {
+ goto out;
+ }
+
+ dht_set_lkowner(lk_array, lk_count, &lock_frame->root->lk_owner);
+
+ local = lock_frame->local;
+ local->main_frame = frame;
+
+ local->call_cnt = lk_count;
+
+ for (i = 0; i < lk_count; i++) {
+ flock.l_type = local->lock[0].layout.my_layout.locks[i]->type;
+
+ STACK_WIND_COOKIE(
+ lock_frame, dht_nonblocking_inodelk_cbk, (void *)(long)i,
+ local->lock[0].layout.my_layout.locks[i]->xl,
+ local->lock[0].layout.my_layout.locks[i]->xl->fops->inodelk,
+ local->lock[0].layout.my_layout.locks[i]->domain,
+ &local->lock[0].layout.my_layout.locks[i]->loc, F_SETLK, &flock,
+ NULL);
+ }
+
+ return 0;
+
+out:
+ if (lock_frame)
+ dht_lock_stack_destroy(lock_frame, DHT_INODELK);
+
+ return -1;
+}
+
+static int32_t
+dht_blocking_inodelk_cbk(call_frame_t *frame, void *cookie, xlator_t *this,
+ int32_t op_ret, int32_t op_errno, dict_t *xdata)
+{
+ int lk_index = 0;
+ int i = 0;
+ dht_local_t *local = NULL;
+ char gfid[GF_UUID_BUF_SIZE] = {
+ 0,
+ };
+ dht_reaction_type_t reaction = 0;
+
+ lk_index = (long)cookie;
+
+ local = frame->local;
+ if (op_ret == 0) {
+ local->lock[0].layout.my_layout.locks[lk_index]->locked = _gf_true;
+ } else {
+ switch (op_errno) {
+ case ESTALE:
+ case ENOENT:
+ reaction = local->lock[0]
+ .layout.my_layout.locks[lk_index]
+ ->do_on_failure;
+ if ((reaction != IGNORE_ENOENT_ESTALE) &&
+ (reaction != IGNORE_ENOENT_ESTALE_EIO)) {
+ gf_uuid_unparse(local->lock[0]
+ .layout.my_layout.locks[lk_index]
+ ->loc.gfid,
+ gfid);
+ local->lock[0].layout.my_layout.op_ret = -1;
+ local->lock[0].layout.my_layout.op_errno = op_errno;
+ gf_smsg(this->name, GF_LOG_ERROR, op_errno,
+ DHT_MSG_INODELK_FAILED, "subvol=%s",
+ local->lock[0]
+ .layout.my_layout.locks[lk_index]
+ ->xl->name,
+ "gfid=%s", gfid, NULL);
+ goto cleanup;
+ }
+ break;
+ case EIO:
+ reaction = local->lock[0]
+ .layout.my_layout.locks[lk_index]
+ ->do_on_failure;
+ if (reaction != IGNORE_ENOENT_ESTALE_EIO) {
+ gf_uuid_unparse(local->lock[0]
+ .layout.my_layout.locks[lk_index]
+ ->loc.gfid,
+ gfid);
+ local->lock[0].layout.my_layout.op_ret = -1;
+ local->lock[0].layout.my_layout.op_errno = op_errno;
+ gf_smsg(this->name, GF_LOG_ERROR, op_errno,
+ DHT_MSG_INODELK_FAILED, "subvol=%s",
+ local->lock[0]
+ .layout.my_layout.locks[lk_index]
+ ->xl->name,
+ "gfid=%s", gfid, NULL);
+ goto cleanup;
+ }
+ break;
+
+ default:
+ gf_uuid_unparse(
+ local->lock[0].layout.my_layout.locks[lk_index]->loc.gfid,
+ gfid);
+ local->lock[0].layout.my_layout.op_ret = -1;
+ local->lock[0].layout.my_layout.op_errno = op_errno;
+ gf_smsg(
+ this->name, GF_LOG_ERROR, op_errno, DHT_MSG_INODELK_FAILED,
+ "subvol=%s",
+ local->lock[0].layout.my_layout.locks[lk_index]->xl->name,
+ "gfid=%s", gfid, NULL);
+ goto cleanup;
+ }
+ }
+
+ if (lk_index == (local->lock[0].layout.my_layout.lk_count - 1)) {
+ for (i = 0; (i < local->lock[0].layout.my_layout.lk_count) &&
+ (!local->lock[0].layout.my_layout.locks[i]->locked);
+ i++)
+ ;
+
+ if (i == local->lock[0].layout.my_layout.lk_count) {
+ local->lock[0].layout.my_layout.op_ret = -1;
+ local->lock[0].layout.my_layout.op_errno = op_errno;
+ }
+
+ dht_inodelk_done(frame);
+ } else {
+ dht_blocking_inodelk_rec(frame, ++lk_index);
+ }
+
+ return 0;
+
+cleanup:
+ dht_inodelk_cleanup(frame);
+
+ return 0;
+}
+
+void
+dht_blocking_inodelk_rec(call_frame_t *frame, int i)
+{
+ dht_local_t *local = NULL;
+ struct gf_flock flock = {
+ 0,
+ };
+
+ local = frame->local;
+
+ flock.l_type = local->lock[0].layout.my_layout.locks[i]->type;
+
+ STACK_WIND_COOKIE(
+ frame, dht_blocking_inodelk_cbk, (void *)(long)i,
+ local->lock[0].layout.my_layout.locks[i]->xl,
+ local->lock[0].layout.my_layout.locks[i]->xl->fops->inodelk,
+ local->lock[0].layout.my_layout.locks[i]->domain,
+ &local->lock[0].layout.my_layout.locks[i]->loc, F_SETLKW, &flock, NULL);
+
+ return;
+}
+
+int
+dht_blocking_inodelk(call_frame_t *frame, dht_lock_t **lk_array, int lk_count,
+ fop_inodelk_cbk_t inodelk_cbk)
+{
+ int ret = -1;
+ call_frame_t *lock_frame = NULL;
+ dht_local_t *local = NULL;
+ dht_local_t *tmp_local = NULL;
+ char gfid[GF_UUID_BUF_SIZE] = {
+ 0,
+ };
+
+ GF_VALIDATE_OR_GOTO("dht-locks", frame, out);
+ GF_VALIDATE_OR_GOTO(frame->this->name, lk_array, out);
+ GF_VALIDATE_OR_GOTO(frame->this->name, inodelk_cbk, out);
+
+ tmp_local = frame->local;
+
+ lock_frame = dht_lock_frame(frame);
+ if (lock_frame == NULL) {
+ gf_uuid_unparse(tmp_local->loc.gfid, gfid);
+ gf_smsg("dht", GF_LOG_ERROR, ENOMEM, DHT_MSG_LOCK_FRAME_FAILED,
+ "gfid=%s", gfid, "path=%s", tmp_local->loc.path, NULL);
+ goto out;
+ }
+
+ ret = dht_local_inodelk_init(lock_frame, lk_array, lk_count, inodelk_cbk);
+ if (ret < 0) {
+ gf_uuid_unparse(tmp_local->loc.gfid, gfid);
+ gf_smsg("dht", GF_LOG_ERROR, ENOMEM, DHT_MSG_LOCAL_LOCK_INIT_FAILED,
+ "gfid=%s", gfid, "path=%s", tmp_local->loc.path, NULL);
+ goto out;
+ }
+
+ dht_set_lkowner(lk_array, lk_count, &lock_frame->root->lk_owner);
+
+ local = lock_frame->local;
+ local->main_frame = frame;
+
+ dht_blocking_inodelk_rec(lock_frame, 0);
+
+ return 0;
+out:
+ if (lock_frame)
+ dht_lock_stack_destroy(lock_frame, DHT_INODELK);
+
+ return -1;
+}
+
+void
+dht_unlock_namespace(call_frame_t *frame, dht_dir_transaction_t *lock)
+{
+ GF_VALIDATE_OR_GOTO("dht-locks", frame, out);
+ GF_VALIDATE_OR_GOTO(frame->this->name, lock, out);
+
+ dht_unlock_entrylk_wrapper(frame, &lock->ns.directory_ns);
+ dht_unlock_inodelk_wrapper(frame, &lock->ns.parent_layout);
+
+out:
+ return;
+}
+
+static int32_t
+dht_protect_namespace_cbk(call_frame_t *frame, void *cookie, xlator_t *this,
+ int32_t op_ret, int32_t op_errno, dict_t *xdata)
+{
+ dht_local_t *local = NULL;
+
+ local = frame->local;
+ if (op_ret != 0)
+ dht_unlock_inodelk_wrapper(frame, &local->current->ns.parent_layout);
+
+ local->current->ns.ns_cbk(frame, cookie, this, op_ret, op_errno, xdata);
+ return 0;
+}
+
+int32_t
+dht_blocking_entrylk_after_inodelk(call_frame_t *frame, void *cookie,
+ xlator_t *this, int32_t op_ret,
+ int32_t op_errno, dict_t *xdata)
+{
+ dht_local_t *local = NULL;
+ int ret = -1;
+ loc_t *loc = NULL;
+ dht_lock_t **lk_array = NULL;
+ char pgfid[GF_UUID_BUF_SIZE] = {0};
+ int count = 0;
+ dht_elock_wrap_t *entrylk = NULL;
+
+ local = frame->local;
+ entrylk = &local->current->ns.directory_ns;
+
+ if (op_ret < 0) {
+ local->op_ret = -1;
+ local->op_errno = op_errno;
+ goto err;
+ }
+
+ loc = &entrylk->locks[0]->loc;
+ gf_uuid_unparse(loc->gfid, pgfid);
+
+ local->op_ret = 0;
+ lk_array = entrylk->locks;
+ count = entrylk->lk_count;
+
+ ret = dht_blocking_entrylk(frame, lk_array, count,
+ dht_protect_namespace_cbk);
+
+ if (ret < 0) {
+ local->op_ret = -1;
+ local->op_errno = EIO;
+ gf_smsg(this->name, GF_LOG_WARNING, local->op_errno,
+ DHT_MSG_ENTRYLK_FAILED_AFT_INODELK, "fop=%s",
+ gf_fop_list[local->fop], "pgfid=%s", pgfid, "basename=%s",
+ entrylk->locks[0]->basename, NULL);
+ goto err;
+ }
+
+ return 0;
+
+err:
+ if (lk_array != NULL) {
+ dht_lock_array_free(lk_array, count);
+ GF_FREE(lk_array);
+ entrylk->locks = NULL;
+ entrylk->lk_count = 0;
+ }
+
+ /* Unlock inodelk. No harm calling unlock twice */
+ dht_unlock_inodelk_wrapper(frame, &local->current->ns.parent_layout);
+ /* Call ns_cbk. It will take care of unwinding */
+ local->current->ns.ns_cbk(frame, NULL, this, local->op_ret, local->op_errno,
+ NULL);
+ return 0;
+}
+
+/* Given the loc and the subvol, this routine takes the inodelk on
+ * the parent inode and entrylk on (parent, loc->name). This routine
+ * is specific as it supports only one subvol on which it takes inodelk
+ * and then entrylk serially.
+ */
+int
+dht_protect_namespace(call_frame_t *frame, loc_t *loc, xlator_t *subvol,
+ struct dht_namespace *ns, fop_entrylk_cbk_t ns_cbk)
+{
+ dht_ilock_wrap_t *inodelk = NULL;
+ dht_elock_wrap_t *entrylk = NULL;
+ dht_lock_t **lk_array = NULL;
+ dht_local_t *local = NULL;
+ xlator_t *this = NULL;
+ loc_t parent = {
+ 0,
+ };
+ int ret = -1;
+ char pgfid[GF_UUID_BUF_SIZE] = {0};
+ int32_t op_errno = 0;
+ int count = 1;
+
+ GF_VALIDATE_OR_GOTO("dht-locks", frame, out);
+ GF_VALIDATE_OR_GOTO(frame->this->name, loc, out);
+ GF_VALIDATE_OR_GOTO(frame->this->name, loc->parent, out);
+ GF_VALIDATE_OR_GOTO(frame->this->name, subvol, out);
+
+ local = frame->local;
+ this = frame->this;
+
+ inodelk = &ns->parent_layout;
+ entrylk = &ns->directory_ns;
+
+ /* Initialize entrylk_cbk and parent loc */
+ ns->ns_cbk = ns_cbk;
+
+ ret = dht_build_parent_loc(this, &parent, loc, &op_errno);
+ if (ret) {
+ gf_smsg(this->name, GF_LOG_ERROR, op_errno, DHT_MSG_LOC_FAILED,
+ "gfid=%s", loc->gfid, "name=%s", loc->name, "path=%s",
+ loc->path, NULL);
+ goto out;
+ }
+ gf_uuid_unparse(parent.gfid, pgfid);
+
+ /* Alloc inodelk */
+ inodelk->locks = GF_CALLOC(count, sizeof(*lk_array), gf_common_mt_pointer);
+ if (inodelk->locks == NULL) {
+ local->op_errno = ENOMEM;
+ gf_smsg(this->name, GF_LOG_WARNING, local->op_errno,
+ DHT_MSG_CALLOC_FAILED, "fop=%s", gf_fop_list[local->fop],
+ "pgfid=%s", pgfid, "name=%s", loc->name, "path=%s", loc->path,
+ NULL);
+ goto out;
+ }
+
+ inodelk->locks[0] = dht_lock_new(this, subvol, &parent, F_RDLCK,
+ DHT_LAYOUT_HEAL_DOMAIN, NULL,
+ FAIL_ON_ANY_ERROR);
+ if (inodelk->locks[0] == NULL) {
+ local->op_errno = ENOMEM;
+ gf_smsg(this->name, GF_LOG_WARNING, local->op_errno,
+ DHT_MSG_LOCK_ALLOC_FAILED, "inodelk-fop=%s",
+ gf_fop_list[local->fop], "pgfid=%s", pgfid, "name=%s",
+ loc->name, "path=%s", loc->path, NULL);
+ goto err;
+ }
+ inodelk->lk_count = count;
+
+ /* Allock entrylk */
+ entrylk->locks = GF_CALLOC(count, sizeof(*lk_array), gf_common_mt_pointer);
+ if (entrylk->locks == NULL) {
+ local->op_errno = ENOMEM;
+ gf_smsg(this->name, GF_LOG_WARNING, local->op_errno,
+ DHT_MSG_CALLOC_FAILED, "entrylk-fop=%s",
+ gf_fop_list[local->fop], "pgfid=%s", pgfid, "name=%s",
+ loc->name, "path=%s", loc->path, NULL);
+
+ goto err;
+ }
+
+ entrylk->locks[0] = dht_lock_new(this, subvol, &parent, F_WRLCK,
+ DHT_ENTRY_SYNC_DOMAIN, loc->name,
+ FAIL_ON_ANY_ERROR);
+ if (entrylk->locks[0] == NULL) {
+ local->op_errno = ENOMEM;
+ gf_smsg(this->name, GF_LOG_WARNING, local->op_errno,
+ DHT_MSG_LOCK_ALLOC_FAILED, "entrylk-fop=%s",
+ gf_fop_list[local->fop], "pgfid=%s", pgfid, "name=%s",
+ loc->name, "path=%s", loc->path, NULL);
+
+ goto err;
+ }
+ entrylk->lk_count = count;
+
+ /* Take read inodelk on parent. If it is successful, take write entrylk
+ * on name in cbk.
+ */
+ lk_array = inodelk->locks;
+ ret = dht_blocking_inodelk(frame, lk_array, count,
+ dht_blocking_entrylk_after_inodelk);
+ if (ret < 0) {
+ local->op_errno = EIO;
+ gf_smsg(this->name, GF_LOG_WARNING, local->op_errno,
+ DHT_MSG_BLOCK_INODELK_FAILED, "fop=%s", gf_fop_list[local->fop],
+ "pgfid=%s", pgfid, "name=%s", loc->name, "path=%s", loc->path,
+ NULL);
+
+ goto err;
+ }
+
+ loc_wipe(&parent);
+
+ return 0;
+err:
+ if (entrylk->locks != NULL) {
+ dht_lock_array_free(entrylk->locks, count);
+ GF_FREE(entrylk->locks);
+ entrylk->locks = NULL;
+ entrylk->lk_count = 0;
+ }
+
+ if (inodelk->locks != NULL) {
+ dht_lock_array_free(inodelk->locks, count);
+ GF_FREE(inodelk->locks);
+ inodelk->locks = NULL;
+ inodelk->lk_count = 0;
+ }
+
+ loc_wipe(&parent);
+out:
+ return -1;
+}
diff --git a/xlators/cluster/dht/src/dht-lock.h b/xlators/cluster/dht/src/dht-lock.h
new file mode 100644
index 00000000000..6485c03fb6e
--- /dev/null
+++ b/xlators/cluster/dht/src/dht-lock.h
@@ -0,0 +1,91 @@
+/*
+ Copyright (c) 2016 Red Hat, Inc. <http://www.redhat.com>
+ This file is part of GlusterFS.
+
+ This file is licensed to you under your choice of the GNU Lesser
+ General Public License, version 3 or any later version (LGPLv3 or
+ later), or the GNU General Public License, version 2 (GPLv2), in all
+ cases as published by the Free Software Foundation.
+*/
+
+#ifndef _DHT_LOCK_H
+#define _DHT_LOCK_H
+
+#include "dht-common.h"
+
+void
+dht_lock_array_free(dht_lock_t **lk_array, int count);
+
+int32_t
+dht_lock_count(dht_lock_t **lk_array, int lk_count);
+
+dht_lock_t *
+dht_lock_new(xlator_t *this, xlator_t *xl, loc_t *loc, short type,
+ const char *domain, const char *basename,
+ dht_reaction_type_t do_on_failure);
+
+int32_t
+dht_unlock_entrylk_wrapper(call_frame_t *, dht_elock_wrap_t *);
+
+void
+dht_blocking_entrylk_rec(call_frame_t *frame, int i);
+
+int
+dht_blocking_entrylk(call_frame_t *frame, dht_lock_t **lk_array, int lk_count,
+ fop_inodelk_cbk_t entrylk_cbk);
+
+int32_t
+dht_unlock_inodelk(call_frame_t *frame, dht_lock_t **lk_array, int lk_count,
+ fop_inodelk_cbk_t inodelk_cbk);
+
+int32_t
+dht_unlock_inodelk_wrapper(call_frame_t *, dht_ilock_wrap_t *);
+
+/* Acquire non-blocking inodelk on a list of xlators.
+ *
+ * @lk_array: array of lock requests lock on.
+ *
+ * @lk_count: number of locks in @lk_array
+ *
+ * @inodelk_cbk: will be called after inodelk replies are received
+ *
+ * @retval: -1 if stack_winding inodelk fails. 0 otherwise.
+ * inodelk_cbk is called with appropriate error on errors.
+ * On failure to acquire lock on all members of list, successful
+ * locks are unlocked before invoking cbk.
+ */
+
+int
+dht_nonblocking_inodelk(call_frame_t *frame, dht_lock_t **lk_array,
+ int lk_count, fop_inodelk_cbk_t inodelk_cbk);
+
+void
+dht_blocking_inodelk_rec(call_frame_t *frame, int i);
+
+/* same as dht_nonblocking_inodelk, but issues sequential blocking locks on
+ * @lk_array directly. locks are issued on some order which remains same
+ * for a list of xlators (irrespective of order of xlators within list).
+ */
+
+int
+dht_blocking_inodelk(call_frame_t *frame, dht_lock_t **lk_array, int lk_count,
+ fop_inodelk_cbk_t inodelk_cbk);
+
+int32_t
+dht_blocking_entrylk_after_inodelk(call_frame_t *frame, void *cookie,
+ xlator_t *this, int32_t op_ret,
+ int32_t op_errno, dict_t *xdata);
+
+int32_t
+dht_blocking_entrylk_after_inodelk_rename(call_frame_t *frame, void *cookie,
+ xlator_t *this, int32_t op_ret,
+ int32_t op_errno, dict_t *xdata);
+
+void
+dht_unlock_namespace(call_frame_t *, dht_dir_transaction_t *);
+
+int
+dht_protect_namespace(call_frame_t *frame, loc_t *loc, xlator_t *subvol,
+ struct dht_namespace *ns, fop_entrylk_cbk_t ns_cbk);
+
+#endif /* _DHT_LOCK_H */
diff --git a/xlators/cluster/dht/src/dht-mem-types.h b/xlators/cluster/dht/src/dht-mem-types.h
index 5de5d1838ad..e3c4471334a 100644
--- a/xlators/cluster/dht/src/dht-mem-types.h
+++ b/xlators/cluster/dht/src/dht-mem-types.h
@@ -8,36 +8,31 @@
cases as published by the Free Software Foundation.
*/
-
#ifndef __DHT_MEM_TYPES_H__
#define __DHT_MEM_TYPES_H__
-#include "mem-types.h"
+#include <glusterfs/mem-types.h>
enum gf_dht_mem_types_ {
- gf_dht_mt_dht_du_t = gf_common_mt_end + 1,
- gf_dht_mt_dht_conf_t,
- gf_dht_mt_char,
- gf_dht_mt_int32_t,
- gf_dht_mt_xlator_t,
- gf_dht_mt_dht_layout_t,
- gf_switch_mt_dht_conf_t,
- gf_switch_mt_dht_du_t,
- gf_switch_mt_switch_sched_array,
- gf_switch_mt_switch_struct,
- gf_dht_mt_subvol_time,
- gf_dht_mt_loc_t,
- gf_defrag_info_mt,
- gf_dht_mt_inode_ctx_t,
- gf_dht_mt_ctx_stat_time_t,
- gf_dht_mt_dirent_t,
- gf_dht_mt_container_t,
- gf_dht_mt_octx_t,
- gf_dht_mt_miginfo_t,
- gf_tier_mt_bricklist_t,
- gf_tier_mt_ipc_ctr_params_t,
- gf_dht_mt_fd_ctx_t,
- gf_tier_mt_qfile_array_t,
- gf_dht_mt_end
+ gf_dht_mt_dht_du_t = gf_common_mt_end + 1,
+ gf_dht_mt_dht_conf_t,
+ gf_dht_mt_char,
+ gf_dht_mt_int32_t,
+ gf_dht_mt_xlator_t,
+ gf_dht_mt_dht_layout_t,
+ gf_switch_mt_switch_sched_array,
+ gf_switch_mt_switch_struct,
+ gf_dht_mt_subvol_time,
+ gf_dht_mt_loc_t,
+ gf_defrag_info_mt,
+ gf_dht_mt_inode_ctx_t,
+ gf_dht_mt_dirent_t,
+ gf_dht_mt_container_t,
+ gf_dht_mt_octx_t,
+ gf_dht_mt_miginfo_t,
+ gf_dht_mt_fd_ctx_t,
+ gf_dht_ret_cache_t,
+ gf_dht_nodeuuids_t,
+ gf_dht_mt_end
};
#endif
diff --git a/xlators/cluster/dht/src/dht-messages.h b/xlators/cluster/dht/src/dht-messages.h
index 6ea245d0c15..601f8dad78b 100644
--- a/xlators/cluster/dht/src/dht-messages.h
+++ b/xlators/cluster/dht/src/dht-messages.h
@@ -10,1037 +10,377 @@
#ifndef _DHT_MESSAGES_H_
#define _DHT_MESSAGES_H_
-#include "glfs-message-id.h"
+#include <glusterfs/glfs-message-id.h>
+
+/* To add new message IDs, append new identifiers at the end of the list.
+ *
+ * Never remove a message ID. If it's not used anymore, you can rename it or
+ * leave it as it is, but not delete it. This is to prevent reutilization of
+ * IDs by other messages.
+ *
+ * The component name must match one of the entries defined in
+ * glfs-message-id.h.
+ */
+
+GLFS_MSGID(
+ DHT, DHT_MSG_CACHED_SUBVOL_GET_FAILED, DHT_MSG_CREATE_LINK_FAILED,
+ DHT_MSG_DICT_SET_FAILED, DHT_MSG_DIR_ATTR_HEAL_FAILED,
+ DHT_MSG_DIR_SELFHEAL_FAILED, DHT_MSG_DIR_SELFHEAL_XATTR_FAILED,
+ DHT_MSG_FILE_ON_MULT_SUBVOL, DHT_MSG_FILE_TYPE_MISMATCH,
+ DHT_MSG_GFID_MISMATCH, DHT_MSG_GFID_NULL, DHT_MSG_HASHED_SUBVOL_GET_FAILED,
+ DHT_MSG_INIT_FAILED, DHT_MSG_INVALID_CONFIGURATION,
+ DHT_MSG_INVALID_DISK_LAYOUT, DHT_MSG_INVALID_OPTION,
+ DHT_MSG_LAYOUT_FIX_FAILED, DHT_MSG_LAYOUT_MERGE_FAILED,
+ DHT_MSG_LAYOUT_MISMATCH, DHT_MSG_LAYOUT_NULL, DHT_MSG_MIGRATE_DATA_COMPLETE,
+ DHT_MSG_MIGRATE_DATA_FAILED, DHT_MSG_MIGRATE_FILE_COMPLETE,
+ DHT_MSG_MIGRATE_FILE_FAILED, DHT_MSG_NO_MEMORY, DHT_MSG_OPENDIR_FAILED,
+ DHT_MSG_REBALANCE_FAILED, DHT_MSG_REBALANCE_START_FAILED,
+ DHT_MSG_REBALANCE_STATUS, DHT_MSG_REBALANCE_STOPPED, DHT_MSG_RENAME_FAILED,
+ DHT_MSG_SETATTR_FAILED, DHT_MSG_SUBVOL_INSUFF_INODES,
+ DHT_MSG_SUBVOL_INSUFF_SPACE, DHT_MSG_UNLINK_FAILED,
+ DHT_MSG_LAYOUT_SET_FAILED, DHT_MSG_LOG_FIXED_LAYOUT,
+ DHT_MSG_GET_XATTR_FAILED, DHT_MSG_FILE_LOOKUP_FAILED,
+ DHT_MSG_OPEN_FD_FAILED, DHT_MSG_SET_INODE_CTX_FAILED,
+ DHT_MSG_UNLOCKING_FAILED, DHT_MSG_DISK_LAYOUT_NULL, DHT_MSG_SUBVOL_INFO,
+ DHT_MSG_CHUNK_SIZE_INFO, DHT_MSG_LAYOUT_FORM_FAILED, DHT_MSG_SUBVOL_ERROR,
+ DHT_MSG_LAYOUT_SORT_FAILED, DHT_MSG_REGEX_INFO, DHT_MSG_FOPEN_FAILED,
+ DHT_MSG_SET_HOSTNAME_FAILED, DHT_MSG_BRICK_ERROR, DHT_MSG_SYNCOP_FAILED,
+ DHT_MSG_MIGRATE_INFO, DHT_MSG_SOCKET_ERROR, DHT_MSG_CREATE_FD_FAILED,
+ DHT_MSG_READDIR_ERROR, DHT_MSG_CHILD_LOC_BUILD_FAILED,
+ DHT_MSG_SET_SWITCH_PATTERN_ERROR, DHT_MSG_COMPUTE_HASH_FAILED,
+ DHT_MSG_FIND_LAYOUT_ANOMALIES_ERROR, DHT_MSG_ANOMALIES_INFO,
+ DHT_MSG_LAYOUT_INFO, DHT_MSG_INODE_LK_ERROR, DHT_MSG_RENAME_INFO,
+ DHT_MSG_DATA_NULL, DHT_MSG_AGGREGATE_QUOTA_XATTR_FAILED,
+ DHT_MSG_UNLINK_LOOKUP_INFO, DHT_MSG_LINK_FILE_LOOKUP_INFO,
+ DHT_MSG_OPERATION_NOT_SUP, DHT_MSG_NOT_LINK_FILE_ERROR, DHT_MSG_CHILD_DOWN,
+ DHT_MSG_UUID_PARSE_ERROR, DHT_MSG_GET_DISK_INFO_ERROR,
+ DHT_MSG_INVALID_VALUE, DHT_MSG_SWITCH_PATTERN_INFO,
+ DHT_MSG_SUBVOL_OP_FAILED, DHT_MSG_LAYOUT_PRESET_FAILED,
+ DHT_MSG_INVALID_LINKFILE, DHT_MSG_FIX_LAYOUT_INFO,
+ DHT_MSG_GET_HOSTNAME_FAILED, DHT_MSG_WRITE_FAILED,
+ DHT_MSG_MIGRATE_HARDLINK_FILE_FAILED, DHT_MSG_FSYNC_FAILED,
+ DHT_MSG_SUBVOL_DECOMMISSION_INFO, DHT_MSG_BRICK_QUERY_FAILED,
+ DHT_MSG_SUBVOL_NO_LAYOUT_INFO, DHT_MSG_OPEN_FD_ON_DST_FAILED,
+ DHT_MSG_SUBVOL_NOT_FOUND, DHT_MSG_FILE_LOOKUP_ON_DST_FAILED,
+ DHT_MSG_DISK_LAYOUT_MISSING, DHT_MSG_DICT_GET_FAILED,
+ DHT_MSG_REVALIDATE_CBK_INFO, DHT_MSG_UPGRADE_BRICKS, DHT_MSG_LK_ARRAY_INFO,
+ DHT_MSG_RENAME_NOT_LOCAL, DHT_MSG_RECONFIGURE_INFO,
+ DHT_MSG_INIT_LOCAL_SUBVOL_FAILED, DHT_MSG_SYS_CALL_GET_TIME_FAILED,
+ DHT_MSG_NO_DISK_USAGE_STATUS, DHT_MSG_SUBVOL_DOWN_ERROR,
+ DHT_MSG_REBAL_THROTTLE_INFO, DHT_MSG_COMMIT_HASH_INFO,
+ DHT_MSG_REBAL_STRUCT_SET, DHT_MSG_HAS_MIGINFO, DHT_MSG_SETTLE_HASH_FAILED,
+ DHT_MSG_DEFRAG_PROCESS_DIR_FAILED, DHT_MSG_FD_CTX_SET_FAILED,
+ DHT_MSG_STALE_LOOKUP, DHT_MSG_PARENT_LAYOUT_CHANGED,
+ DHT_MSG_LOCK_MIGRATION_FAILED, DHT_MSG_LOCK_INODE_UNREF_FAILED,
+ DHT_MSG_ASPRINTF_FAILED, DHT_MSG_DIR_LOOKUP_FAILED, DHT_MSG_INODELK_FAILED,
+ DHT_MSG_LOCK_FRAME_FAILED, DHT_MSG_LOCAL_LOCK_INIT_FAILED,
+ DHT_MSG_ENTRYLK_ERROR, DHT_MSG_INODELK_ERROR, DHT_MSG_LOC_FAILED,
+ DHT_MSG_UNKNOWN_FOP, DHT_MSG_MIGRATE_FILE_SKIPPED,
+ DHT_MSG_DIR_XATTR_HEAL_FAILED, DHT_MSG_HASHED_SUBVOL_DOWN,
+ DHT_MSG_NON_HASHED_SUBVOL_DOWN, DHT_MSG_SYNCTASK_CREATE_FAILED,
+ DHT_MSG_DIR_HEAL_ABORT, DHT_MSG_MIGRATE_SKIP, DHT_MSG_FD_CREATE_FAILED,
+ DHT_MSG_DICT_NEW_FAILED, DHT_MSG_FAILED_TO_OPEN, DHT_MSG_CREATE_FAILED,
+ DHT_MSG_FILE_NOT_EXIST, DHT_MSG_CHOWN_FAILED, DHT_MSG_FALLOCATE_FAILED,
+ DHT_MSG_FTRUNCATE_FAILED, DHT_MSG_STATFS_FAILED, DHT_MSG_WRITE_CROSS,
+ DHT_MSG_NEW_TARGET_FOUND, DHT_MSG_INSUFF_MEMORY, DHT_MSG_SET_XATTR_FAILED,
+ DHT_MSG_SET_MODE_FAILED, DHT_MSG_FILE_EXISTS_IN_DEST,
+ DHT_MSG_SYMLINK_FAILED, DHT_MSG_LINKFILE_DEL_FAILED, DHT_MSG_MKNOD_FAILED,
+ DHT_MSG_MIGRATE_CLEANUP_FAILED, DHT_MSG_LOCK_MIGRATE,
+ DHT_MSG_PARENT_BUILD_FAILED, DHT_MSG_HASHED_SUBVOL_NOT_FOUND,
+ DHT_MSG_ACQUIRE_ENTRYLK_FAILED, DHT_MSG_CREATE_DST_FAILED,
+ DHT_MSG_MIGRATION_EXIT, DHT_MSG_CHANGED_DST, DHT_MSG_TRACE_FAILED,
+ DHT_MSG_WRITE_LOCK_FAILED, DHT_MSG_GETACTIVELK_FAILED, DHT_MSG_STAT_FAILED,
+ DHT_MSG_UNLINK_PERFORM_FAILED, DHT_MSG_CLANUP_SOURCE_FILE_FAILED,
+ DHT_MSG_UNLOCK_FILE_FAILED, DHT_MSG_REMOVE_XATTR_FAILED,
+ DHT_MSG_DATA_MIGRATE_ABORT, DHT_MSG_DEFRAG_NULL, DHT_MSG_PARENT_NULL,
+ DHT_MSG_GFID_NOT_PRESENT, DHT_MSG_CHILD_LOC_FAILED,
+ DHT_MSG_SET_LOOKUP_FAILED, DHT_MSG_DIR_REMOVED, DHT_MSG_FIX_NOT_COMP,
+ DHT_MSG_SUBVOL_DETER_FAILED, DHT_MSG_LOCAL_SUBVOL, DHT_MSG_NODE_UUID,
+ DHT_MSG_SIZE_FILE, DHT_MSG_GET_DATA_SIZE_FAILED,
+ DHT_MSG_PTHREAD_JOIN_FAILED, DHT_MSG_COUNTER_THREAD_CREATE_FAILED,
+ DHT_MSG_MIGRATION_INIT_QUEUE_FAILED, DHT_MSG_PAUSED_TIMEOUT, DHT_MSG_WOKE,
+ DHT_MSG_ABORT_REBALANCE, DHT_MSG_CREATE_TASK_REBAL_FAILED,
+ DHT_MSG_REBAL_ESTIMATE_NOT_AVAIL, DHT_MSG_ADD_CHOICES_ERROR,
+ DHT_MSG_GET_CHOICES_ERROR, DHT_MSG_PREPARE_STATUS_ERROR,
+ DHT_MSG_SET_CHOICE_FAILED, DHT_MSG_SET_HASHED_SUBVOL_FAILED,
+ DHT_MSG_XATTR_HEAL_NOT_POSS, DHT_MSG_LINKTO_FILE_FAILED,
+ DHT_MSG_STALE_LINKFILE_DELETE, DHT_MSG_NO_SUBVOL_FOR_LINKTO,
+ DHT_MSG_SUBVOL_RETURNED, DHT_MSG_UNKNOWN_LOCAL_XSEL, DHT_MSG_GET_XATTR_ERR,
+ DHT_MSG_ALLOC_OR_FILL_FAILED, DHT_MSG_GET_REAL_NAME_FAILED,
+ DHT_MSG_COPY_UUID_FAILED, DHT_MSG_MDS_DETER_FAILED,
+ DHT_MSG_CREATE_REBAL_FAILED, DHT_MSG_LINK_LAYOUT_FAILED,
+ DHT_MSG_NO_SUBVOL_IN_LAYOUT, DHT_MSG_MEM_ALLOC_FAILED,
+ DHT_MSG_SET_IN_PARAMS_DICT_FAILED, DHT_MSG_LOC_COPY_FAILED,
+ DHT_MSG_PARENT_LOC_FAILED, DHT_MSG_CREATE_LOCK_FAILED,
+ DHT_MSG_PREV_ATTEMPT_FAILED, DHT_MSG_REFRESH_ATTEMPT,
+ DHT_MSG_ACQUIRE_LOCK_FAILED, DHT_MSG_CREATE_STUB_FAILED,
+ DHT_MSG_WIND_LOCK_REQ_FAILED, DHT_MSG_REFRESH_FAILED,
+ DHT_MSG_CACHED_SUBVOL_ERROR, DHT_MSG_NO_LINK_SUBVOL, DHT_MSG_SET_KEY_FAILED,
+ DHT_MSG_REMOVE_LINKTO_FAILED, DHT_MSG_LAYOUT_DICT_SET_FAILED,
+ DHT_MSG_XATTR_DICT_NULL, DHT_MSG_DUMMY_ALLOC_FAILED, DHT_MSG_DICT_IS_NULL,
+ DHT_MSG_LINK_INODE_FAILED, DHT_MSG_SELFHEAL_FAILED, DHT_MSG_NO_MDS_SUBVOL,
+ DHT_MSG_LIST_XATTRS_FAILED, DHT_MSG_RESET_INTER_XATTR_FAILED,
+ DHT_MSG_MDS_DOWN_UNABLE_TO_SET, DHT_MSG_WIND_UNLOCK_FAILED,
+ DHT_MSG_COMMIT_HASH_FAILED, DHT_MSG_UNLOCK_GFID_FAILED,
+ DHT_MSG_UNLOCK_FOLLOW_ENTRYLK, DHT_MSG_COPY_FRAME_FAILED,
+ DHT_MSG_UNLOCK_FOLLOW_LOCKS, DHT_MSG_ENTRYLK_FAILED_AFT_INODELK,
+ DHT_MSG_CALLOC_FAILED, DHT_MSG_LOCK_ALLOC_FAILED,
+ DHT_MSG_BLOCK_INODELK_FAILED,
+ DHT_MSG_LOCAL_LOCKS_STORE_FAILED_UNLOCKING_FOLLOWING_ENTRYLK,
+ DHT_MSG_ALLOC_FRAME_FAILED_NOT_UNLOCKING_FOLLOWING_ENTRYLKS,
+ DHT_MSG_DST_NULL_SET_FAILED);
+
+#define DHT_MSG_FD_CTX_SET_FAILED_STR "Failed to set fd ctx"
+#define DHT_MSG_INVALID_VALUE_STR "Different dst found in the fd ctx"
+#define DHT_MSG_UNKNOWN_FOP_STR "Unknown FOP on file"
+#define DHT_MSG_OPEN_FD_ON_DST_FAILED_STR "Failed to open the fd on file"
+#define DHT_MSG_SYNCTASK_CREATE_FAILED_STR "Failed to create synctask"
+#define DHT_MSG_ASPRINTF_FAILED_STR \
+ "asprintf failed while fetching subvol from the id"
+#define DHT_MSG_HAS_MIGINFO_STR "Found miginfo in the inode ctx"
+#define DHT_MSG_FILE_LOOKUP_FAILED_STR "failed to lookup the file"
+#define DHT_MSG_INVALID_LINKFILE_STR \
+ "linkto target is different from cached-subvol. treating as destination " \
+ "subvol"
+#define DHT_MSG_GFID_MISMATCH_STR "gfid different on the target file"
+#define DHT_MSG_GET_XATTR_FAILED_STR "failed to get 'linkto' xattr"
+#define DHT_MSG_SET_INODE_CTX_FAILED_STR "failed to set inode-ctx target file"
+#define DHT_MSG_DIR_SELFHEAL_FAILED_STR "Healing of path failed"
+#define DHT_MSG_DIR_HEAL_ABORT_STR \
+ "Failed to get path from subvol. Aborting directory healing"
+#define DHT_MSG_DIR_XATTR_HEAL_FAILED_STR "xattr heal failed for directory"
+#define DHT_MSG_LOCK_INODE_UNREF_FAILED_STR \
+ "Found a NULL inode. Failed to unref the inode"
+#define DHT_MSG_DICT_SET_FAILED_STR "Failed to set dictionary value"
+#define DHT_MSG_NOT_LINK_FILE_ERROR_STR "got non-linkfile"
+#define DHT_MSG_CREATE_LINK_FAILED_STR "failed to initialize linkfile data"
+#define DHT_MSG_UNLINK_FAILED_STR "Unlinking linkfile on subvolume failed"
+#define DHT_MSG_MIGRATE_FILE_FAILED_STR "Migrate file failed"
+#define DHT_MSG_NO_MEMORY_STR "could not allocate memory for dict"
+#define DHT_MSG_SUBVOL_ERROR_STR "Failed to get linkto subvol"
+#define DHT_MSG_MIGRATE_HARDLINK_FILE_FAILED_STR "link failed on subvol"
+#define DHT_MSG_MIGRATE_FILE_SKIPPED_STR "Migration skipped"
+#define DHT_MSG_FD_CREATE_FAILED_STR "fd create failed"
+#define DHT_MSG_DICT_NEW_FAILED_STR "dict_new failed"
+#define DHT_MSG_FAILED_TO_OPEN_STR "failed to open"
+#define DHT_MSG_CREATE_FAILED_STR "failed to create"
+#define DHT_MSG_FILE_NOT_EXIST_STR "file does not exist"
+#define DHT_MSG_CHOWN_FAILED_STR "chown failed"
+#define DHT_MSG_FALLOCATE_FAILED_STR "fallocate failed"
+#define DHT_MSG_FTRUNCATE_FAILED_STR "ftruncate failed"
+#define DHT_MSG_STATFS_FAILED_STR "failed to get statfs"
+#define DHT_MSG_WRITE_CROSS_STR \
+ "write will cross min-fre-disk for file on subvol. looking for new subvol"
+#define DHT_MSG_SUBVOL_INSUFF_SPACE_STR \
+ "Could not find any subvol with space accommodating the file. Cosider " \
+ "adding bricks"
+#define DHT_MSG_NEW_TARGET_FOUND_STR "New target found for file"
+#define DHT_MSG_INSUFF_MEMORY_STR "insufficient memory"
+#define DHT_MSG_SET_XATTR_FAILED_STR "failed to set xattr"
+#define DHT_MSG_SET_MODE_FAILED_STR "failed to set mode"
+#define DHT_MSG_FILE_EXISTS_IN_DEST_STR "file exists in destination"
+#define DHT_MSG_LINKFILE_DEL_FAILED_STR "failed to delete the linkfile"
+#define DHT_MSG_SYMLINK_FAILED_STR "symlink failed"
+#define DHT_MSG_MKNOD_FAILED_STR "mknod failed"
+#define DHT_MSG_SETATTR_FAILED_STR "failed to perform setattr"
+#define DHT_MSG_MIGRATE_CLEANUP_FAILED_STR \
+ "Migrate file cleanup failed: failed to fstat file"
+#define DHT_MSG_LOCK_MIGRATE_STR "locks will be migrated for file"
+#define DHT_MSG_PARENT_BUILD_FAILED_STR \
+ "failed to build parent loc, which is needed to acquire entrylk to " \
+ "synchronize with renames on this path. Skipping migration"
+#define DHT_MSG_HASHED_SUBVOL_NOT_FOUND_STR \
+ "cannot find hashed subvol which is needed to synchronize with renames " \
+ "on this path. Skipping migration"
+#define DHT_MSG_ACQUIRE_ENTRYLK_FAILED_STR "failed to acquire entrylk on subvol"
+#define DHT_MSG_CREATE_DST_FAILED_STR "create dst failed for file"
+#define DHT_MSG_MIGRATION_EXIT_STR "Exiting migration"
+#define DHT_MSG_CHANGED_DST_STR "destination changed fo file"
+#define DHT_MSG_TRACE_FAILED_STR "Trace failed"
+#define DHT_MSG_WRITE_LOCK_FAILED_STR "write lock failed"
+#define DHT_MSG_GETACTIVELK_FAILED_STR "getactivelk failed for file"
+#define DHT_MSG_STAT_FAILED_STR "failed to do a stat"
+#define DHT_MSG_UNLINK_PERFORM_FAILED_STR "failed to perform unlink"
+#define DHT_MSG_MIGRATE_FILE_COMPLETE_STR "completed migration"
+#define DHT_MSG_CLANUP_SOURCE_FILE_FAILED_STR "failed to cleanup source file"
+#define DHT_MSG_UNLOCK_FILE_FAILED_STR "failed to unlock file"
+#define DHT_MSG_REMOVE_XATTR_FAILED_STR "remove xattr failed"
+#define DHT_MSG_SOCKET_ERROR_STR "Failed to unlink listener socket"
+#define DHT_MSG_HASHED_SUBVOL_GET_FAILED_STR "Failed to get hashed subvolume"
+#define DHT_MSG_CACHED_SUBVOL_GET_FAILED_STR "Failed to get cached subvolume"
+#define DHT_MSG_MIGRATE_DATA_FAILED_STR "migrate-data failed"
+#define DHT_MSG_DEFRAG_NULL_STR "defrag is NULL"
+#define DHT_MSG_DATA_MIGRATE_ABORT_STR \
+ "Readdirp failed. Aborting data migration for dict"
+#define DHT_MSG_LAYOUT_FIX_FAILED_STR "fix layout failed"
+#define DHT_MSG_PARENT_NULL_STR "parent is NULL"
+#define DHT_MSG_GFID_NOT_PRESENT_STR "gfid not present"
+#define DHT_MSG_CHILD_LOC_FAILED_STR "Child loc build failed"
+#define DHT_MSG_SET_LOOKUP_FAILED_STR "Failed to set lookup"
+#define DHT_MSG_DIR_LOOKUP_FAILED_STR "lookup failed"
+#define DHT_MSG_DIR_REMOVED_STR "Dir renamed or removed. Skipping"
+#define DHT_MSG_READDIR_ERROR_STR "readdir failed, Aborting fix-layout"
+#define DHT_MSG_SETTLE_HASH_FAILED_STR "Settle hash failed"
+#define DHT_MSG_DEFRAG_PROCESS_DIR_FAILED_STR "gf_defrag_process_dir failed"
+#define DHT_MSG_FIX_NOT_COMP_STR \
+ "Unable to retrieve fixlayout xattr. Assume background fix layout not " \
+ "complete"
+#define DHT_MSG_SUBVOL_DETER_FAILED_STR \
+ "local subvolume determination failed with error"
+#define DHT_MSG_LOCAL_SUBVOL_STR "local subvol"
+#define DHT_MSG_NODE_UUID_STR "node uuid"
+#define DHT_MSG_SIZE_FILE_STR "Total size files"
+#define DHT_MSG_GET_DATA_SIZE_FAILED_STR \
+ "Failed to get the total data size. Unable to estimate time to complete " \
+ "rebalance"
+#define DHT_MSG_PTHREAD_JOIN_FAILED_STR \
+ "file_counter_thread: pthread_join failed"
+#define DHT_MSG_COUNTER_THREAD_CREATE_FAILED_STR \
+ "Failed to create the file counter thread"
+#define DHT_MSG_MIGRATION_INIT_QUEUE_FAILED_STR \
+ "Failed to initialise migration queue"
+#define DHT_MSG_REBALANCE_STOPPED_STR "Received stop command on rebalance"
+#define DHT_MSG_PAUSED_TIMEOUT_STR "Request pause timer timeout"
+#define DHT_MSG_WOKE_STR "woken"
+#define DHT_MSG_ABORT_REBALANCE_STR "Aborting rebalance"
+#define DHT_MSG_REBALANCE_START_FAILED_STR \
+ "Failed to start rebalance: look up on / failed"
+#define DHT_MSG_CREATE_TASK_REBAL_FAILED_STR \
+ "Could not create task for rebalance"
+#define DHT_MSG_REBAL_ESTIMATE_NOT_AVAIL_STR \
+ "Rebalance estimates will not be available"
+#define DHT_MSG_REBALANCE_STATUS_STR "Rebalance status"
+#define DHT_MSG_DATA_NULL_STR "data value is NULL"
+#define DHT_MSG_ADD_CHOICES_ERROR_STR "Error to add choices in buffer"
+#define DHT_MSG_GET_CHOICES_ERROR_STR "Error to get choices"
+#define DHT_MSG_PREPARE_STATUS_ERROR_STR "Error to prepare status"
+#define DHT_MSG_SET_CHOICE_FAILED_STR "Failed to set full choice"
+#define DHT_MSG_AGGREGATE_QUOTA_XATTR_FAILED_STR \
+ "Failed to aggregate quota xattr"
+#define DHT_MSG_FILE_TYPE_MISMATCH_STR \
+ "path exists as a file on one subvolume and directory on another. Please " \
+ "fix it manually"
+#define DHT_MSG_LAYOUT_SET_FAILED_STR "failed to set layout for subvolume"
+#define DHT_MSG_LAYOUT_MERGE_FAILED_STR "failed to merge layouts for subvolume"
+#define DHT_MSG_SET_HASHED_SUBVOL_FAILED_STR "Failed to set hashed subvolume"
+#define DHT_MSG_XATTR_HEAL_NOT_POSS_STR \
+ "No gfid exists for path. so healing xattr is not possible"
+#define DHT_MSG_REVALIDATE_CBK_INFO_STR "Revalidate: subvolume returned -1"
+#define DHT_MSG_LAYOUT_MISMATCH_STR "Mismatching layouts"
+#define DHT_MSG_UNLINK_LOOKUP_INFO_STR "lookup_unlink retuened"
+#define DHT_MSG_LINKTO_FILE_FAILED_STR \
+ "Could not unlink the linkto file as either fd is open and/or linkto " \
+ "xattr is set"
+#define DHT_MSG_LAYOUT_PRESET_FAILED_STR \
+ "Could not set pre-set layout for subvolume"
+#define DHT_MSG_FILE_ON_MULT_SUBVOL_STR \
+ "multiple subvolumes have file (preferably rename the file in the " \
+ "backend, and do a fresh lookup"
+#define DHT_MSG_STALE_LINKFILE_DELETE_STR \
+ "attempting deletion of stale linkfile"
+#define DHT_MSG_LINK_FILE_LOOKUP_INFO_STR "Lookup on following linkfile"
+#define DHT_MSG_NO_SUBVOL_FOR_LINKTO_STR "No link subvolume for linkto"
+#define DHT_MSG_SUBVOL_RETURNED_STR "Subvolume returned -1"
+#define DHT_MSG_UNKNOWN_LOCAL_XSEL_STR "Unknown local->xsel"
+#define DHT_MSG_DICT_GET_FAILED_STR "Failed to get"
+#define DHT_MSG_UUID_PARSE_ERROR_STR "Failed to parse uuid"
+#define DHT_MSG_GET_XATTR_ERR_STR "getxattr err for dir"
+#define DHT_MSG_ALLOC_OR_FILL_FAILED_STR "alloc or fill failed"
+#define DHT_MSG_UPGRADE_BRICKS_STR \
+ "At least one of the bricks does not support this operation. Please " \
+ "upgrade all bricks"
+#define DHT_MSG_GET_REAL_NAME_FAILED_STR "Failed to get real filename"
+#define DHT_MSG_LAYOUT_NULL_STR "Layout is NULL"
+#define DHT_MSG_COPY_UUID_FAILED_STR "Failed to copy node uuid key"
+#define DHT_MSG_MDS_DETER_FAILED_STR \
+ "Cannot determine MDS, fetching xattr randomly from a subvol"
+#define DHT_MSG_HASHED_SUBVOL_DOWN_STR \
+ "MDS is down for path, so fetching xattr randomly from subvol"
+#define DHT_MSG_CREATE_REBAL_FAILED_STR \
+ "failed to create a new rebalance synctask"
+#define DHT_MSG_FIX_LAYOUT_INFO_STR "fixing the layout"
+#define DHT_MSG_OPERATION_NOT_SUP_STR "wrong directory-spread-count value"
+#define DHT_MSG_LINK_LAYOUT_FAILED_STR "failed to link the layout in inode"
+#define DHT_MSG_NO_SUBVOL_IN_LAYOUT_STR "no subvolume in layout for path"
+#define DHT_MSG_INODE_LK_ERROR_STR "mknod lock failed for file"
+#define DHT_MSG_MEM_ALLOC_FAILED_STR "mem allocation failed"
+#define DHT_MSG_PARENT_LAYOUT_CHANGED_STR \
+ "extracting in-memory layout of parent failed"
+#define DHT_MSG_SET_IN_PARAMS_DICT_FAILED_STR \
+ "setting in params dictionary failed"
+#define DHT_MSG_LOC_COPY_FAILED_STR "loc_copy failed"
+#define DHT_MSG_LOC_FAILED_STR "parent loc build failed"
+#define DHT_MSG_PARENT_LOC_FAILED_STR "locking parent failed"
+#define DHT_MSG_CREATE_LOCK_FAILED_STR "Create lock failed"
+#define DHT_MSG_PREV_ATTEMPT_FAILED_STR \
+ "mkdir loop detected. parent layout didn't change even though previous " \
+ "attempt of mkdir failed because of in-memory layout not matching with " \
+ "that on disk."
+#define DHT_MSG_REFRESH_ATTEMPT_STR \
+ "mkdir parent layout changed. Attempting a refresh and then a retry"
+#define DHT_MSG_ACQUIRE_LOCK_FAILED_STR \
+ "Acquiring lock on parent to guard against layout-change failed"
+#define DHT_MSG_CREATE_STUB_FAILED_STR "creating stub failed"
+#define DHT_MSG_WIND_LOCK_REQ_FAILED_STR \
+ "cannot wind lock request to guard parent layout"
+#define DHT_MSG_REFRESH_FAILED_STR "refreshing parent layout failed."
+#define DHT_MSG_CACHED_SUBVOL_ERROR_STR "On cached subvol"
+#define DHT_MSG_NO_LINK_SUBVOL_STR "Linkfile does not have link subvolume"
+#define DHT_MSG_SET_KEY_FAILED_STR "failed to set key"
+#define DHT_MSG_CHILD_DOWN_STR "Received CHILD_DOWN. Exiting"
+#define DHT_MSG_LOG_FIXED_LAYOUT_STR "log layout fixed"
+#define DHT_MSG_REBAL_STRUCT_SET_STR "local->rebalance already set"
+#define DHT_MSG_REMOVE_LINKTO_FAILED_STR "Removal of linkto failed at subvol"
+#define DHT_MSG_LAYOUT_DICT_SET_FAILED_STR "dht layout dict set failed"
+#define DHT_MSG_SUBVOL_INFO_STR "creating subvolume"
+#define DHT_MSG_COMPUTE_HASH_FAILED_STR "hash computation failed"
+#define DHT_MSG_INVALID_DISK_LAYOUT_STR \
+ "Invalid disk layout: Catastrophic error layout with unknown type found"
+#define DHT_MSG_LAYOUT_SORT_FAILED_STR "layout sort failed"
+#define DHT_MSG_ANOMALIES_INFO_STR "Found anomalies"
+#define DHT_MSG_XATTR_DICT_NULL_STR "xattr dictionary is NULL"
+#define DHT_MSG_DISK_LAYOUT_MISSING_STR "Disk layout missing"
+#define DHT_MSG_LAYOUT_INFO_STR "layout info"
+#define DHT_MSG_SUBVOL_NO_LAYOUT_INFO_STR "no pre-set layout for subvol"
+#define DHT_MSG_SELFHEAL_XATTR_FAILED_STR "layout setxattr failed"
+#define DHT_MSG_DIR_SELFHEAL_XATTR_FAILED_STR "Directory self heal xattr failed"
+#define DHT_MSG_DUMMY_ALLOC_FAILED_STR "failed to allocate dummy layout"
+#define DHT_MSG_DICT_IS_NULL_STR \
+ "dict is NULL, need to make sure gfids are same"
+#define DHT_MSG_ENTRYLK_ERROR_STR "acquiring entrylk after inodelk failed"
+#define DHT_MSG_NO_DISK_USAGE_STATUS_STR "no du stats"
+#define DHT_MSG_LINK_INODE_FAILED_STR "linking inode failed"
+#define DHT_MSG_SELFHEAL_FAILED_STR "Directory selfheal failed"
+#define DHT_MSG_NO_MDS_SUBVOL_STR "No mds subvol"
+#define DHT_MSG_LIST_XATTRS_FAILED_STR "failed to list xattrs"
+#define DHT_MSG_RESET_INTER_XATTR_FAILED_STR "Failed to reset internal xattr"
+#define DHT_MSG_MDS_DOWN_UNABLE_TO_SET_STR \
+ "mds subvol is down, unable to set xattr"
+#define DHT_MSG_DIR_ATTR_HEAL_FAILED_STR \
+ "Directory attr heal failed. Failed to set uid/gid"
+#define DHT_MSG_WIND_UNLOCK_FAILED_STR \
+ "Winding unlock failed: stale locks left on brick"
+#define DHT_MSG_COMMIT_HASH_FAILED_STR "Directory commit hash updaten failed"
+#define DHT_MSG_LK_ARRAY_INFO_STR "lk info"
+#define DHT_MSG_UNLOCK_GFID_FAILED_STR \
+ "unlock failed on gfid: stale lock might be left"
+#define DHT_MSG_UNLOCKING_FAILED_STR "unlocking failed"
+#define DHT_MSG_UNLOCK_FOLLOW_ENTRYLK_STR "not unlocking following entrylks"
+#define DHT_MSG_COPY_FRAME_FAILED_STR "copy frame failed"
+#define DHT_MSG_UNLOCK_FOLLOW_LOCKS_STR "not unlocking following locks"
+#define DHT_MSG_INODELK_FAILED_STR "inodelk failed on subvol"
+#define DHT_MSG_LOCK_FRAME_FAILED_STR "memory allocation failed for lock_frame"
+#define DHT_MSG_LOCAL_LOCK_INIT_FAILED_STR "dht_local_lock_init failed"
+#define DHT_MSG_ENTRYLK_FAILED_AFT_INODELK_STR \
+ "dht_blocking_entrylk failed after taking inodelk"
+#define DHT_MSG_BLOCK_INODELK_FAILED_STR "dht_blocking_inodelk failed"
+#define DHT_MSG_CALLOC_FAILED_STR "calloc failed"
+#define DHT_MSG_LOCK_ALLOC_FAILED_STR "lock allocation failed"
+#define DHT_MSG_ALLOC_FRAME_FAILED_NOT_UNLOCKING_FOLLOWING_ENTRYLKS_STR \
+ "cannot allocate a frame, not unlocking following entrylks"
+#define DHT_MSG_LOCAL_LOCKS_STORE_FAILED_UNLOCKING_FOLLOWING_ENTRYLK_STR \
+ "storing locks in local failed, not unlocking following entrylks"
+#define DHT_MSG_DST_NULL_SET_FAILED_STR \
+ "src or dst is NULL, Failed to set dictionary value"
-/*! \file dht-messages.h
- * \brief DHT log-message IDs and their descriptions
- *
- */
-
-/* NOTE: Rules for message additions
- * 1) Each instance of a message is _better_ left with a unique message ID, even
- * if the message format is the same. Reasoning is that, if the message
- * format needs to change in one instance, the other instances are not
- * impacted or the new change does not change the ID of the instance being
- * modified.
- * 2) Addition of a message,
- * - Should increment the GLFS_NUM_MESSAGES
- * - Append to the list of messages defined, towards the end
- * - Retain macro naming as glfs_msg_X (for redability across developers)
- * NOTE: Rules for message format modifications
- * 3) Check acorss the code if the message ID macro in question is reused
- * anywhere. If reused then then the modifications should ensure correctness
- * everywhere, or needs a new message ID as (1) above was not adhered to. If
- * not used anywhere, proceed with the required modification.
- * NOTE: Rules for message deletion
- * 4) Check (3) and if used anywhere else, then cannot be deleted. If not used
- * anywhere, then can be deleted, but will leave a hole by design, as
- * addition rules specify modification to the end of the list and not filling
- * holes.
- */
-
-#define GLFS_DHT_BASE GLFS_MSGID_COMP_DHT
-#define GLFS_DHT_NUM_MESSAGES 112
-#define GLFS_MSGID_END (GLFS_DHT_BASE + GLFS_DHT_NUM_MESSAGES + 1)
-
-/* Messages with message IDs */
-#define glfs_msg_start_x GLFS_DHT_BASE, "Invalid: Start of messages"
-
-
-
-
-/*!
- * @messageid 109001
- * @diagnosis Cached subvolume could not be found for the specified
- * path
- * @recommendedaction None
- *
- */
-
-#define DHT_MSG_CACHED_SUBVOL_GET_FAILED (GLFS_DHT_BASE + 1)
-
-/*!
- * @messageid 109002
- * @diagnosis Linkfile creation failed
- * @recommendedaction None
- *
- */
-
-#define DHT_MSG_CREATE_LINK_FAILED (GLFS_DHT_BASE + 2)
-
-/*!
- * @messageid 109003
- * @diagnosis The value could not be set for the specified key in
- * the dictionary
- *
- * @recommendedaction None
- *
- */
-
-#define DHT_MSG_DICT_SET_FAILED (GLFS_DHT_BASE + 3)
-
-/*!
- * @messageid 109004
- * @diagnosis Directory attributes could not be healed
- * @recommendedaction None
- *
- */
-
-#define DHT_MSG_DIR_ATTR_HEAL_FAILED (GLFS_DHT_BASE + 4)
-
-/*!
- * @messageid 109005
- * @diagnosis Self-heal failed for the specified directory
- * @recommendedaction Ensure that all subvolumes are online
- * and reachable and perform a lookup operation
- * on the directory again.
- *
- */
-
-#define DHT_MSG_DIR_SELFHEAL_FAILED (GLFS_DHT_BASE + 5)
-
-/*!
- * @messageid 109006
- * @diagnosis The extended attributes could not be healed for
- * the specified directory on the specified subvolume
- *
- * @recommendedaction None
- *
- */
-
-#define DHT_MSG_DIR_SELFHEAL_XATTR_FAILED (GLFS_DHT_BASE + 6)
-
-/*!
- * @messageid 109007
- * @diagnosis A lookup operation found the file with the same path
- * on multiple subvolumes.
- * @recommendedaction
- * 1. Create backups of the file on other subvolumes.
- * 2. Inspect the content of the files to identify
- * and retain the most appropriate file.
- *
- */
-
-#define DHT_MSG_FILE_ON_MULT_SUBVOL (GLFS_DHT_BASE + 7)
-
-/*!
- * @messageid 109008
- * @diagnosis A path resolves to a file on one subvolume and a directory
- * on another
- * @recommendedaction
- * 1. Create a backup of the file with a different name
- * and delete the original file.
- * 2. In the newly created back up file, remove the "trusted.gfid"
- * extended attribute.
- * - Command: setfattr -x "trusted.gfid" \<path to the newly created backup file\>
- * 3. Perform a new lookup operation on both the new and old paths.
- * 4. From the mount point, inspect both the paths and retain the
- * relevant file or directory.
- *
- */
-
-#define DHT_MSG_FILE_TYPE_MISMATCH (GLFS_DHT_BASE + 8)
-
-/*!
- * @messageid 109009
- * @diagnosis The GFID of the file/directory is different on different subvolumes
- * @recommendedaction None
- *
- */
-
-#define DHT_MSG_GFID_MISMATCH (GLFS_DHT_BASE + 9)
-
-/*!
- * @messageid 109010
- * @diagnosis The GFID of the specified file/directory is NULL.
- * @recommendedaction None
- *
- */
-
-#define DHT_MSG_GFID_NULL (GLFS_DHT_BASE + 10)
-
-/*
- * @messageid 109011
- * @diagnosis
- * @recommendedaction None
- */
-
-#define DHT_MSG_HASHED_SUBVOL_GET_FAILED (GLFS_DHT_BASE + 11)
-
-/*!
- * @messageid 109012
- * @diagnosis The Distributed Hash Table Translator could not be initiated as the
- * system is out of memory.
- * @recommendedaction None
- *
- */
-
-#define DHT_MSG_INIT_FAILED (GLFS_DHT_BASE + 12)
-
-/*!
- * @messageid 109013
- * @diagnosis Invalid DHT configuration in the volfile
- * @recommendedaction None
- *
- */
-
-#define DHT_MSG_INVALID_CONFIGURATION (GLFS_DHT_BASE + 13)
-
-/*!
- * @messageid 109014
- * @diagnosis Invalid disk layout
- * @recommendedaction None
- *
- */
-
-#define DHT_MSG_INVALID_DISK_LAYOUT (GLFS_DHT_BASE + 14)
-
-/*!
- * @messageid 109015
- * @diagnosis Invalid DHT configuration option.
- * @recommendedaction
- * 1. Reset the option with a valid value using the volume
- * set command.
- * 2. Restart the process that logged the message in the
- * log file.
- *
- */
-
-#define DHT_MSG_INVALID_OPTION (GLFS_DHT_BASE + 15)
-
-/*!
- * @messageid 109016
- * @diagnosis The fix layout operation failed
- * @recommendedaction None
- *
- */
-
-#define DHT_MSG_LAYOUT_FIX_FAILED (GLFS_DHT_BASE + 16)
-
-/*!
- * @messageid 109017
- * @diagnosis Layout merge failed
- * @recommendedaction None
- *
- */
-
-#define DHT_MSG_LAYOUT_MERGE_FAILED (GLFS_DHT_BASE + 17)
-
-/*!
- * @messageid 109018
- * @diagnosis The layout for the specified directory does not match
- that on the disk.
- * @recommendedaction None
- *
- */
-
-#define DHT_MSG_LAYOUT_MISMATCH (GLFS_DHT_BASE + 18)
-
-/*!
- * @messageid 109019
- * @diagnosis No layout is present for the specified file/directory
- * @recommendedaction None
- *
- */
-
-#define DHT_MSG_LAYOUT_NULL (GLFS_DHT_BASE + 19)
-
-/*!
- * @messageid 109020
- * @diagnosis Informational message: Migration of data from the cached
- * subvolume to the hashed subvolume is complete
- * @recommendedaction None
- *
- */
-
-#define DHT_MSG_MIGRATE_DATA_COMPLETE (GLFS_DHT_BASE + 20)
-
-/*!
- * @messageid 109021
- * @diagnosis Migration of data failed during the rebalance operation
- * \n Cause: Directories could not be read to identify the files for the
- * migration process.
- * @recommendedaction
- * The log message would indicate the reason for the failure and
- * the corrective action depends on the specific error that is
- * encountered. The error is one of the standard UNIX errors.
- *
- */
-
-#define DHT_MSG_MIGRATE_DATA_FAILED (GLFS_DHT_BASE + 21)
-
-/*!
- * @messageid 109022
- * @diagnosis Informational message: The file was migrated successfully during
- * the rebalance operation.
- * @recommendedaction None
- *
- */
-
-#define DHT_MSG_MIGRATE_FILE_COMPLETE (GLFS_DHT_BASE + 22)
-
-/*!
- * @messageid 109023
- * @diagnosis File migration failed during the rebalance operation
- * \n Cause: Rebalance moves data from the cached subvolume to
- * the hashed subvolume. Migrating a single file is a multi-step operation
- * which involves opening, reading, and writing the data and metadata.
- * Any failures in this multi-step operation can result in a file
- * migration failure.
- * @recommendedaction The log message would indicate the reason for the failure and the
- * corrective action depends on the specific error that is encountered.
- * The error is one of the standard UNIX errors.
- *
- */
-
-#define DHT_MSG_MIGRATE_FILE_FAILED (GLFS_DHT_BASE + 23)
-
-/*!
- * @messageid 109024
- * @diagnosis Out of memory
- * @recommendedaction None
- *
- */
-
-#define DHT_MSG_NO_MEMORY (GLFS_DHT_BASE + 24)
-
-/*!
- * @messageid 109025
- * @diagnosis The opendir() call failed on the specified directory
- * \n Cause: When a directory is renamed, the Distribute Hash
- * table translator checks whether the destination directory
- * is empty. This message indicates that the opendir() call
- * on the destination directory has failed.
- * @recommendedaction The log message would indicate the reason for the
- * failure and the corrective action depends on the specific
- * error that is encountered. The error is one of the standard
- * UNIX errors.
- *
- */
-
-#define DHT_MSG_OPENDIR_FAILED (GLFS_DHT_BASE + 25)
-
-/*!
- * @messageid 109026
- * @diagnosis The rebalance operation failed.
- * @recommendedaction Check the log file for details about the failure.
- * Possible causes:
- * - A subvolume is down: Restart the rebalance operation after
- * bringing up all subvolumes.
- *
- */
-
-#define DHT_MSG_REBALANCE_FAILED (GLFS_DHT_BASE + 26)
-
-/*!
- * @messageid 109027
- * @diagnosis Failed to start the rebalance process.
- * @recommendedaction Check the log file for details about the failure.
- *
- */
-
-#define DHT_MSG_REBALANCE_START_FAILED (GLFS_DHT_BASE + 27)
-
-/*!
- * @messageid 109028
- * @diagnosis Informational message that indicates the status of the
- * rebalance operation and details as to how many files were
- * migrated, skipped, failed etc
- * @recommendedaction None
- *
- */
-
-#define DHT_MSG_REBALANCE_STATUS (GLFS_DHT_BASE + 28)
-
-/*!
- * @messageid 109029
- * @diagnosis The rebalance operation was aborted by the user.
- * @recommendedaction None
- *
- */
-
-#define DHT_MSG_REBALANCE_STOPPED (GLFS_DHT_BASE + 29)
-
-/*!
- * @messageid 109030
- * @diagnosis The file or directory could not be renamed
- * @recommendedaction Ensure that all the subvolumes are
- * online and reachable and try renaming
- * the file or directory again.
- *
- */
-
-#define DHT_MSG_RENAME_FAILED (GLFS_DHT_BASE + 30)
-
-/*!
- * @messageid 109031
- * @diagnosis Attributes could not be set for the specified file or
- * directory.
- * @recommendedaction None
- *
- */
-
-#define DHT_MSG_SETATTR_FAILED (GLFS_DHT_BASE + 31)
-
-/*!
- * @messageid 109032
- * @diagnosis The specified subvolume is running out of file system inodes.
- If all subvolumes run out of inodes, then new files cannot be created.
- * @recommendedaction Consider adding more nodes to the cluster if all subvolumes
- * run out of inodes
- *
- */
-
-#define DHT_MSG_SUBVOL_INSUFF_INODES (GLFS_DHT_BASE + 32)
-
-/*!
- * @messageid 109033
- * @diagnosis The specified subvolume is running out of disk space. If all
- subvolumes run out of space, new files cannot be created.
- * @recommendedaction Consider adding more bricks to the cluster if all subvolumes
- * run out of disk space.
- *
- */
-
-#define DHT_MSG_SUBVOL_INSUFF_SPACE (GLFS_DHT_BASE + 33)
-
-/*!
- * @messageid 109034
- * @diagnosis Failed to unlink the specified file/directory
- * @recommendedaction The log message would indicate the reason
- for the failure and the corrective action depends on
- the specific error that is encountered.
- */
-
-#define DHT_MSG_UNLINK_FAILED (GLFS_DHT_BASE + 34)
-
-
-
-/*!
- * @messageid 109035
- * @diagnosis The layout information could not be set in the inode
- * @recommendedaction None
- *
- */
-
-#define DHT_MSG_LAYOUT_SET_FAILED (GLFS_DHT_BASE + 35)
-
-/*!
- * @messageid 109036
- * @diagnosis Informational message regarding layout range distribution
- * for a directory across subvolumes
- * @recommendedaction None
- */
-
-#define DHT_MSG_LOG_FIXED_LAYOUT (GLFS_DHT_BASE + 36)
-
-/*
- * @messageid 109037
- * @diagnosis Informational message regarding error in tier operation
- * @recommendedaction None
- */
-
-#define DHT_MSG_LOG_TIER_ERROR (GLFS_DHT_BASE + 37)
-
-/*
- * @messageid 109038
- * @diagnosis Informational message regarding tier operation
- * @recommendedaction None
- */
-
-#define DHT_MSG_LOG_TIER_STATUS (GLFS_DHT_BASE + 38)
-
-/*
- * @messageid 109039
- * @diagnosis
- * @recommendedaction None
- */
-
-#define DHT_MSG_GET_XATTR_FAILED (GLFS_DHT_BASE + 39)
-
-/*
- * @messageid 109040
- * @diagnosis
- * @recommendedaction None
- */
-
-#define DHT_MSG_FILE_LOOKUP_FAILED (GLFS_DHT_BASE + 40)
-
-/*
- * @messageid 109041
- * @diagnosis
- * @recommendedaction None
- */
-
-#define DHT_MSG_OPEN_FD_FAILED (GLFS_DHT_BASE + 41)
-
-/*
- * @messageid 109042
- * @diagnosis
- * @recommendedaction None
- */
-
-#define DHT_MSG_SET_INODE_CTX_FAILED (GLFS_DHT_BASE + 42)
-
-/*
- * @messageid 109043
- * @diagnosis
- * @recommendedaction None
- */
-
-#define DHT_MSG_UNLOCKING_FAILED (GLFS_DHT_BASE + 43)
-
-/*
- * @messageid 109044
- * @diagnosis
- * @recommendedaction None
- */
-
-#define DHT_MSG_DISK_LAYOUT_NULL (GLFS_DHT_BASE + 44)
-
-/*
- * @messageid 109045
- * @diagnosis
- * @recommendedaction None
- */
-
-#define DHT_MSG_SUBVOL_INFO (GLFS_DHT_BASE + 45)
-
-/*
- * @messageid 109046
- * @diagnosis
- * @recommendedaction None
- */
-
-#define DHT_MSG_CHUNK_SIZE_INFO (GLFS_DHT_BASE + 46)
-
-/*
- * @messageid 109047
- * @diagnosis
- * @recommendedaction None
- */
-
-#define DHT_MSG_LAYOUT_FORM_FAILED (GLFS_DHT_BASE + 47)
-
-/*
- * @messageid 109048
- * @diagnosis
- * @recommendedaction None
- */
-
-#define DHT_MSG_SUBVOL_ERROR (GLFS_DHT_BASE + 48)
-
-/*
- * @messageid 109049
- * @diagnosis
- * @recommendedaction None
- */
-
-#define DHT_MSG_LAYOUT_SORT_FAILED (GLFS_DHT_BASE + 49)
-
-/*
- * @messageid 109050
- * @diagnosis
- * @recommendedaction None
- */
-
-#define DHT_MSG_REGEX_INFO (GLFS_DHT_BASE + 50)
-
-/*
- * @messageid 109051
- * @diagnosis
- * @recommendedaction None
- */
-
-#define DHT_MSG_FOPEN_FAILED (GLFS_DHT_BASE + 51)
-
-/*
- * @messageid 109052
- * @diagnosis
- * @recommendedaction None
- */
-
-#define DHT_MSG_SET_HOSTNAME_FAILED (GLFS_DHT_BASE + 52)
-
-/*
- * @messageid 109053
- * @diagnosis
- * @recommendedaction None
- */
-
-#define DHT_MSG_BRICK_ERROR (GLFS_DHT_BASE + 53)
-
-/*
- * @messageid 109054
- * @diagnosis
- * @recommendedaction None
- */
-
-#define DHT_MSG_SYNCOP_FAILED (GLFS_DHT_BASE + 54)
-
-/*
- * @messageid 109055
- * @diagnosis
- * @recommendedaction None
- */
-
-#define DHT_MSG_MIGRATE_INFO (GLFS_DHT_BASE + 55)
-
-/*
- * @messageid 109056
- * @diagnosis
- * @recommendedaction None
- */
-
-#define DHT_MSG_SOCKET_ERROR (GLFS_DHT_BASE + 56)
-
-/*
- * @messageid 109057
- * @diagnosis
- * @recommendedaction None
- */
-
-#define DHT_MSG_CREATE_FD_FAILED (GLFS_DHT_BASE + 57)
-
-/*
- * @messageid 109058
- * @diagnosis
- * @recommendedaction None
- */
-
-#define DHT_MSG_READDIR_ERROR (GLFS_DHT_BASE + 58)
-
-/*
- * @messageid 109059
- * @diagnosis
- * @recommendedaction None
- */
-
-#define DHT_MSG_CHILD_LOC_BUILD_FAILED (GLFS_DHT_BASE + 59)
-
-/*
- * @messageid 109060
- * @diagnosis
- * @recommendedaction None
- */
-
-#define DHT_MSG_SET_SWITCH_PATTERN_ERROR (GLFS_DHT_BASE + 60)
-
-/*
- * @messageid 109061
- * @diagnosis
- * @recommendedaction None
- */
-
-#define DHT_MSG_COMPUTE_HASH_FAILED (GLFS_DHT_BASE + 61)
-
-/*
- * @messageid 109062
- * @diagnosis
- * @recommendedaction None
- */
-
-#define DHT_MSG_FIND_LAYOUT_ANOMALIES_ERROR (GLFS_DHT_BASE + 62)
-
-/*
- * @messageid 109063
- * @diagnosis
- * @recommendedaction None
- */
-
-#define DHT_MSG_ANOMALIES_INFO (GLFS_DHT_BASE + 63)
-
-/*
- * @messageid 109064
- * @diagnosis
- * @recommendedaction None
- */
-
-#define DHT_MSG_LAYOUT_INFO (GLFS_DHT_BASE + 64)
-
-/*
- * @messageid 109065
- * @diagnosis
- * @recommendedaction None
- */
-
-#define DHT_MSG_INODE_LK_ERROR (GLFS_DHT_BASE + 65)
-
-/*
- * @messageid 109066
- * @diagnosis
- * @recommendedaction None
- */
-
-#define DHT_MSG_RENAME_INFO (GLFS_DHT_BASE + 66)
-
-/*
- * @messageid 109067
- * @diagnosis
- * @recommendedaction None
- */
-
-#define DHT_MSG_DATA_NULL (GLFS_DHT_BASE + 67)
-
-/*
- * @messageid 109068
- * @diagnosis
- * @recommendedaction None
- */
-
-#define DHT_MSG_AGGREGATE_QUOTA_XATTR_FAILED (GLFS_DHT_BASE + 68)
-
-/*
- * @messageid 109069
- * @diagnosis
- * @recommendedaction None
- */
-
-#define DHT_MSG_UNLINK_LOOKUP_INFO (GLFS_DHT_BASE + 69)
-
-/*
- * @messageid 109070
- * @diagnosis
- * @recommendedaction None
- */
-
-#define DHT_MSG_LINK_FILE_LOOKUP_INFO (GLFS_DHT_BASE + 70)
-
-/*
- * @messageid 109071
- * @diagnosis
- * @recommendedaction None
- */
-
-#define DHT_MSG_OPERATION_NOT_SUP (GLFS_DHT_BASE + 71)
-
-/*
- * @messageid 109072
- * @diagnosis
- * @recommendedaction None
- */
-
-#define DHT_MSG_NOT_LINK_FILE_ERROR (GLFS_DHT_BASE + 72)
-
-/*
- * @messageid 109073
- * @diagnosis
- * @recommendedaction None
- */
-
-#define DHT_MSG_CHILD_DOWN (GLFS_DHT_BASE + 73)
-
-/*
- * @messageid 109074
- * @diagnosis
- * @recommendedaction None
- */
-
-#define DHT_MSG_UUID_PARSE_ERROR (GLFS_DHT_BASE + 74)
-
-/*
- * @messageid 109075
- * @diagnosis
- * @recommendedaction None
- */
-
-#define DHT_MSG_GET_DISK_INFO_ERROR (GLFS_DHT_BASE + 75)
-
-/*
- * @messageid 109076
- * @diagnosis
- * @recommendedaction None
- */
-
-#define DHT_MSG_INVALID_VALUE (GLFS_DHT_BASE + 76)
-
-/*
- * @messageid 109077
- * @diagnosis
- * @recommendedaction None
- */
-
-#define DHT_MSG_SWITCH_PATTERN_INFO (GLFS_DHT_BASE + 77)
-
-/*
- * @messageid 109078
- * @diagnosis
- * @recommendedaction None
- */
-
-#define DHT_MSG_SUBVOL_OP_FAILED (GLFS_DHT_BASE + 78)
-
-/*
- * @messageid 109079
- * @diagnosis
- * @recommendedaction None
- */
-
-#define DHT_MSG_LAYOUT_PRESET_FAILED (GLFS_DHT_BASE + 79)
-
-/*
- * @messageid 109080
- * @diagnosis
- * @recommendedaction None
- */
-
-#define DHT_MSG_INVALID_LINKFILE (GLFS_DHT_BASE + 80)
-
-/*
- * @messageid 109081
- * @diagnosis
- * @recommendedaction None
- */
-
-#define DHT_MSG_FIX_LAYOUT_INFO (GLFS_DHT_BASE + 81)
-
-/*
- * @messageid 109082
- * @diagnosis
- * @recommendedaction None
- */
-
-#define DHT_MSG_GET_HOSTNAME_FAILED (GLFS_DHT_BASE + 82)
-
-/*
- * @messageid 109083
- * @diagnosis
- * @recommendedaction None
- */
-
-#define DHT_MSG_WRITE_FAILED (GLFS_DHT_BASE + 83)
-
-/*
- * @messageid 109084
- * @diagnosis
- * @recommendedaction None
- */
-
-#define DHT_MSG_MIGRATE_HARDLINK_FILE_FAILED (GLFS_DHT_BASE + 84)
-
-/*
- * @messageid 109085
- * @diagnosis
- * @recommendedaction None
- */
-
-#define DHT_MSG_FSYNC_FAILED (GLFS_DHT_BASE + 85)
-
-/*
- * @messageid 109086
- * @diagnosis
- * @recommendedaction None
- */
-
-#define DHT_MSG_SUBVOL_DECOMMISSION_INFO (GLFS_DHT_BASE + 86)
-
-/*
- * @messageid 109087
- * @diagnosis
- * @recommendedaction None
- */
-
-#define DHT_MSG_BRICK_QUERY_FAILED (GLFS_DHT_BASE + 87)
-
-/*
- * @messageid 109088
- * @diagnosis
- * @recommendedaction None
- */
-
-#define DHT_MSG_SUBVOL_NO_LAYOUT_INFO (GLFS_DHT_BASE + 88)
-
-/*
- * @messageid 109089
- * @diagnosis
- * @recommendedaction None
- */
-
-#define DHT_MSG_OPEN_FD_ON_DST_FAILED (GLFS_DHT_BASE + 89)
-
-/*
- * @messageid 109090
- * @diagnosis
- * @recommendedaction None
- */
-
-#define DHT_MSG_SUBVOL_NOT_FOUND (GLFS_DHT_BASE + 90)
-
-/*
- * @messageid 109190
- * @diagnosis
- * @recommendedaction None
- */
-
-#define DHT_MSG_FILE_LOOKUP_ON_DST_FAILED (GLFS_DHT_BASE + 91)
-
-/*
- * @messageid 109092
- * @diagnosis
- * @recommendedaction None
- */
-
-#define DHT_MSG_DISK_LAYOUT_MISSING (GLFS_DHT_BASE + 92)
-
-/*
- * @messageid 109093
- * @diagnosis
- * @recommendedaction None
- */
-
-#define DHT_MSG_DICT_GET_FAILED (GLFS_DHT_BASE + 93)
-
-/*
- * @messageid 109094
- * @diagnosis
- * @recommendedaction None
- */
-
-#define DHT_MSG_REVALIDATE_CBK_INFO (GLFS_DHT_BASE + 94)
-
-/*
- * @messageid 109095
- * @diagnosis
- * @recommendedaction None
- */
-
-#define DHT_MSG_UPGRADE_BRICKS (GLFS_DHT_BASE + 95)
-
-/*
- * @messageid 109096
- * @diagnosis
- * @recommendedaction None
- */
-
-#define DHT_MSG_LK_ARRAY_INFO (GLFS_DHT_BASE + 96)
-
-/*
- * @messageid 109097
- * @diagnosis
- * @recommendedaction None
- */
-
-#define DHT_MSG_RENAME_NOT_LOCAL (GLFS_DHT_BASE + 97)
-
-/*
- * @messageid 109098
- * @diagnosis
- * @recommendedaction None
- */
-
-#define DHT_MSG_RECONFIGURE_INFO (GLFS_DHT_BASE + 98)
-
-/*
- * @messageid 109099
- * @diagnosis
- * @recommendedaction None
- */
-
-#define DHT_MSG_INIT_LOCAL_SUBVOL_FAILED (GLFS_DHT_BASE + 99)
-
-/*
- * @messageid 109100
- * @diagnosis
- * @recommendedaction None
- */
-
-#define DHT_MSG_SYS_CALL_GET_TIME_FAILED (GLFS_DHT_BASE + 100)
-
-/*
- * @messageid 109101
- * @diagnosis
- * @recommendedaction None
- */
-
-#define DHT_MSG_NO_DISK_USAGE_STATUS (GLFS_DHT_BASE + 101)
-
-/*
- * @messageid 109102
- * @diagnosis
- * @recommendedaction None
- */
-
-#define DHT_MSG_SUBVOL_DOWN_ERROR (GLFS_DHT_BASE + 102)
-
-/*
- * @messageid 109103
- * @diagnosis
- * @recommendedaction None
- */
-
-#define DHT_MSG_REBAL_THROTTLE_INFO (GLFS_DHT_BASE + 103)
-
-/*
- * @messageid 109104
- * @diagnosis
- * @recommendedaction None
- */
-
-#define DHT_MSG_COMMIT_HASH_INFO (GLFS_DHT_BASE + 104)
-
-/*
- * @messageid 109105
- * @diagnosis
- * @recommendedaction None
- */
-
-#define DHT_MSG_REBAL_STRUCT_SET (GLFS_DHT_BASE + 105)
-
-/*
- * @messageid 109106
- * @diagnosis
- * @recommendedaction None
- */
-
-#define DHT_MSG_HAS_MIGINFO (GLFS_DHT_BASE + 106)
-
-/*
- * @messageid 109107
- * @diagnosis
- * @recommendedaction None
- */
-
-#define DHT_MSG_LOG_IPC_TIER_ERROR (GLFS_DHT_BASE + 107)
-
-/*
- * @messageid 109108
- * @diagnosis
- * @recommendedaction None
- */
-
-#define DHT_MSG_TIER_PAUSED (GLFS_DHT_BASE + 108)
-
-/*
- * @messageid 109109
- * @diagnosis
- * @recommendedaction None
- */
-
-#define DHT_MSG_TIER_RESUME (GLFS_DHT_BASE + 109)
-
-
-/* @messageid 109110
- * @diagnosis
- * @recommendedaction None
- */
-
-#define DHT_MSG_SETTLE_HASH_FAILED (GLFS_DHT_BASE + 110)
-
-/*
- * @messageid 109111
- * @diagnosis
- * @recommendedaction None
- */
-
-#define DHT_MSG_DEFRAG_PROCESS_DIR_FAILED (GLFS_DHT_BASE + 111)
-
-/*
- * @messageid 109112
- * @diagnosis
- * @recommendedaction None
- */
-
-#define DHT_MSG_FD_CTX_SET_FAILED (GLFS_DHT_BASE + 112)
-
-#define glfs_msg_end_x GLFS_MSGID_END, "Invalid: End of messages"
#endif /* _DHT_MESSAGES_H_ */
diff --git a/xlators/cluster/dht/src/dht-rebalance.c b/xlators/cluster/dht/src/dht-rebalance.c
index 3fbf091f337..8ba8082bd86 100644
--- a/xlators/cluster/dht/src/dht-rebalance.c
+++ b/xlators/cluster/dht/src/dht-rebalance.c
@@ -8,141 +8,143 @@
cases as published by the Free Software Foundation.
*/
-
-#include "tier.h"
#include "dht-common.h"
-#include "xlator.h"
-#include "syscall.h"
-#include <signal.h>
+#include <glusterfs/syscall.h>
#include <fnmatch.h>
#include <signal.h>
-
-
-#define GF_DISK_SECTOR_SIZE 512
-#define DHT_REBALANCE_PID 4242 /* Change it if required */
-#define DHT_REBALANCE_BLKSIZE (128 * 1024)
-#define MAX_MIGRATOR_THREAD_COUNT 40
-#define MAX_MIGRATE_QUEUE_COUNT 500
-#define MIN_MIGRATE_QUEUE_COUNT 200
-
+#include <glusterfs/events.h>
+#include "glusterfs/compat-errno.h" // for ENODATA on BSD
+
+#define GF_DISK_SECTOR_SIZE 512
+#define DHT_REBALANCE_PID 4242 /* Change it if required */
+#define DHT_REBALANCE_BLKSIZE 1048576 /* 1 MB */
+#define MAX_MIGRATE_QUEUE_COUNT 500
+#define MIN_MIGRATE_QUEUE_COUNT 200
+#define MAX_REBAL_TYPE_SIZE 16
+#define FILE_CNT_INTERVAL 600 /* 10 mins */
+#define ESTIMATE_START_INTERVAL 600 /* 10 mins */
+#define HARDLINK_MIG_INPROGRESS -2
+#define SKIP_MIGRATION_FD_POSITIVE -3
#ifndef MAX
-#define MAX(a, b) (((a) > (b))?(a):(b))
+#define MAX(a, b) (((a) > (b)) ? (a) : (b))
#endif
+#define GF_CRAWL_INDEX_MOVE(idx, sv_cnt) \
+ { \
+ idx++; \
+ idx %= sv_cnt; \
+ }
-#define GF_CRAWL_INDEX_MOVE(idx, sv_cnt) { \
- idx++; \
- idx %= sv_cnt; \
- }
+uint64_t g_totalfiles = 0;
+uint64_t g_totalsize = 0;
-#define GF_FREE_DIR_DFMETA(dir_dfmeta) { \
- if (dir_dfmeta) { \
- GF_FREE (dir_dfmeta->head); \
- GF_FREE (dir_dfmeta->equeue); \
- GF_FREE (dir_dfmeta->iterator); \
- GF_FREE (dir_dfmeta->offset_var); \
- GF_FREE (dir_dfmeta->fetch_entries); \
- GF_FREE (dir_dfmeta); \
- } \
- } \
+void
+gf_defrag_free_dir_dfmeta(struct dir_dfmeta *meta, int local_subvols_cnt)
+{
+ int i = 0;
+
+ if (meta) {
+ for (i = 0; i < local_subvols_cnt; i++) {
+ if (meta->equeue)
+ gf_dirent_free(&meta->equeue[i]);
+ if (meta->lfd && meta->lfd[i])
+ fd_unref(meta->lfd[i]);
+ }
+
+ GF_FREE(meta->equeue);
+ GF_FREE(meta->head);
+ GF_FREE(meta->iterator);
+ GF_FREE(meta->offset_var);
+ GF_FREE(meta->fetch_entries);
+ GF_FREE(meta->lfd);
+ GF_FREE(meta);
+ }
+}
void
-gf_defrag_free_container (struct dht_container *container)
+gf_defrag_free_container(struct dht_container *container)
{
- if (container) {
- gf_dirent_entry_free (container->df_entry);
+ if (container) {
+ gf_dirent_entry_free(container->df_entry);
- if (container->parent_loc) {
- loc_wipe (container->parent_loc);
- }
+ if (container->parent_loc) {
+ loc_wipe(container->parent_loc);
+ }
- GF_FREE (container->parent_loc);
+ GF_FREE(container->parent_loc);
- GF_FREE (container);
- }
+ GF_FREE(container);
+ }
}
void
-dht_set_global_defrag_error (gf_defrag_info_t *defrag, int ret)
+dht_set_global_defrag_error(gf_defrag_info_t *defrag, int ret)
{
- LOCK (&defrag->lock);
- {
- defrag->global_error = ret;
- }
- UNLOCK (&defrag->lock);
- return;
+ LOCK(&defrag->lock);
+ {
+ defrag->global_error = ret;
+ }
+ UNLOCK(&defrag->lock);
+ return;
}
static int
-dht_write_with_holes (xlator_t *to, fd_t *fd, struct iovec *vec, int count,
- int32_t size, off_t offset, struct iobref *iobref)
+dht_send_rebalance_event(xlator_t *this, int cmd, gf_defrag_status_t status)
{
- int i = 0;
- int ret = -1;
- int start_idx = 0;
- int tmp_offset = 0;
- int write_needed = 0;
- int buf_len = 0;
- int size_pending = 0;
- char *buf = NULL;
-
- /* loop through each vector */
- for (i = 0; i < count; i++) {
- buf = vec[i].iov_base;
- buf_len = vec[i].iov_len;
-
- for (start_idx = 0; (start_idx + GF_DISK_SECTOR_SIZE) <= buf_len;
- start_idx += GF_DISK_SECTOR_SIZE) {
-
- if (mem_0filled (buf + start_idx, GF_DISK_SECTOR_SIZE) != 0) {
- write_needed = 1;
- continue;
- }
-
- if (write_needed) {
- ret = syncop_write (to, fd, (buf + tmp_offset),
- (start_idx - tmp_offset),
- (offset + tmp_offset),
- iobref, 0, NULL, NULL);
- /* 'path' will be logged in calling function */
- if (ret < 0) {
- gf_log (THIS->name, GF_LOG_WARNING,
- "failed to write (%s)",
- strerror (-ret));
- ret = -1;
- goto out;
- }
-
- write_needed = 0;
- }
- tmp_offset = start_idx + GF_DISK_SECTOR_SIZE;
- }
+ int ret = -1;
+ char *volname = NULL;
+ char *tmpstr = NULL;
+ char *ptr = NULL;
+ char *suffix = "-dht";
+ int len = 0;
- if ((start_idx < buf_len) || write_needed) {
- /* This means, last chunk is not yet written.. write it */
- ret = syncop_write (to, fd, (buf + tmp_offset),
- (buf_len - tmp_offset),
- (offset + tmp_offset), iobref, 0,
- NULL, NULL);
- if (ret < 0) {
- /* 'path' will be logged in calling function */
- gf_log (THIS->name, GF_LOG_WARNING,
- "failed to write (%s)",
- strerror (-ret));
- ret = -1;
- goto out;
- }
- }
+ eventtypes_t event = EVENT_LAST;
- size_pending = (size - buf_len);
- if (!size_pending)
- break;
- }
-
- ret = size;
-out:
- return ret;
+ switch (status) {
+ case GF_DEFRAG_STATUS_COMPLETE:
+ event = EVENT_VOLUME_REBALANCE_COMPLETE;
+ break;
+ case GF_DEFRAG_STATUS_FAILED:
+ event = EVENT_VOLUME_REBALANCE_FAILED;
+ break;
+ case GF_DEFRAG_STATUS_STOPPED:
+ event = EVENT_VOLUME_REBALANCE_STOP;
+ break;
+ default:
+ break;
+ }
+
+ /* DHT volume */
+ len = strlen(this->name) - strlen(suffix);
+ tmpstr = gf_strdup(this->name);
+ if (tmpstr) {
+ ptr = tmpstr + len;
+ if (!strcmp(ptr, suffix)) {
+ tmpstr[len] = '\0';
+ volname = tmpstr;
+ }
+ }
+
+ if (!volname) {
+ /* Better than nothing */
+ volname = this->name;
+ }
+
+ if (event != EVENT_LAST) {
+ gf_event(event, "volume=%s", volname);
+ }
+
+ GF_FREE(tmpstr);
+ return ret;
+}
+static void
+dht_strip_out_acls(dict_t *dict)
+{
+ if (dict) {
+ dict_del(dict, "trusted.SGI_ACL_FILE");
+ dict_del(dict, POSIX_ACL_ACCESS_XATTR);
+ }
}
/*
@@ -181,200 +183,316 @@ be converted to "0" in dht_migrate_file.
*/
int32_t
-gf_defrag_handle_hardlink (xlator_t *this, loc_t *loc, dict_t *xattrs,
- struct iatt *stbuf)
+gf_defrag_handle_hardlink(xlator_t *this, loc_t *loc, int *fop_errno)
{
- int32_t ret = -1;
- xlator_t *cached_subvol = NULL;
- xlator_t *hashed_subvol = NULL;
- xlator_t *linkto_subvol = NULL;
- data_t *data = NULL;
- struct iatt iatt = {0,};
- int32_t op_errno = 0;
- dht_conf_t *conf = NULL;
- gf_loglevel_t loglevel = 0;
- dict_t *link_xattr = NULL;
-
- GF_VALIDATE_OR_GOTO ("defrag", loc, out);
- GF_VALIDATE_OR_GOTO ("defrag", loc->name, out);
- GF_VALIDATE_OR_GOTO ("defrag", stbuf, out);
- GF_VALIDATE_OR_GOTO ("defrag", this, out);
- GF_VALIDATE_OR_GOTO ("defrag", xattrs, out);
- GF_VALIDATE_OR_GOTO ("defrag", this->private, out);
-
- conf = this->private;
-
- if (gf_uuid_is_null (loc->pargfid)) {
- gf_msg ("", GF_LOG_ERROR, 0,
- DHT_MSG_MIGRATE_FILE_FAILED,
- "Migrate file failed :"
- "loc->pargfid is NULL for %s", loc->path);
- goto out;
- }
+ int32_t ret = -1;
+ xlator_t *cached_subvol = NULL;
+ xlator_t *hashed_subvol = NULL;
+ xlator_t *linkto_subvol = NULL;
+ data_t *data = NULL;
+ struct iatt iatt = {
+ 0,
+ };
+ int32_t op_errno = 0;
+ dht_conf_t *conf = NULL;
+ gf_loglevel_t loglevel = 0;
+ dict_t *link_xattr = NULL;
+ dict_t *dict = NULL;
+ dict_t *xattr_rsp = NULL;
+ struct iatt stbuf = {
+ 0,
+ };
+
+ *fop_errno = EINVAL;
+
+ GF_VALIDATE_OR_GOTO("defrag", loc, out);
+ GF_VALIDATE_OR_GOTO("defrag", loc->name, out);
+ GF_VALIDATE_OR_GOTO("defrag", this, out);
+ GF_VALIDATE_OR_GOTO("defrag", this->private, out);
+
+ conf = this->private;
+
+ if (gf_uuid_is_null(loc->pargfid)) {
+ gf_msg("", GF_LOG_ERROR, 0, DHT_MSG_MIGRATE_FILE_FAILED,
+ "Migrate file failed :"
+ "loc->pargfid is NULL for %s",
+ loc->path);
+ *fop_errno = EINVAL;
+ ret = -1;
+ goto out;
+ }
+
+ if (gf_uuid_is_null(loc->gfid)) {
+ gf_msg("", GF_LOG_ERROR, 0, DHT_MSG_MIGRATE_FILE_FAILED,
+ "Migrate file failed :"
+ "loc->gfid is NULL for %s",
+ loc->path);
+ *fop_errno = EINVAL;
+ ret = -1;
+ goto out;
+ }
- if (gf_uuid_is_null (loc->gfid)) {
- gf_msg ("", GF_LOG_ERROR, 0,
- DHT_MSG_MIGRATE_FILE_FAILED,
- "Migrate file failed :"
- "loc->gfid is NULL for %s", loc->path);
- goto out;
+ link_xattr = dict_new();
+ if (!link_xattr) {
+ ret = -1;
+ *fop_errno = ENOMEM;
+ goto out;
+ }
+
+ /*
+ Parallel migration can lead to migration of the hard link multiple
+ times which can lead to data loss. Hence, adding a fresh lookup to
+ decide whether migration is required or not.
+
+ Elaborating the scenario for let say 10 hardlinks [link{1..10}]:
+ Let say the first hard link "link1" does the setxattr of the
+ new hashed subvolume info on the cached file. As there are multiple
+ threads working, we might have already all the links created on the
+ new hashed by the time we reach hardlink let say link5. Now the
+ number of links on hashed is equal to that of cached. Hence, file
+ migration will happen for link6.
+
+ Cached Hashed
+ --------T link6 rwxrwxrwx link6
+
+ Now post above state all the link file on the cached will be zero
+ byte linkto files. Hence, if we still do migration for the following
+ files link{7..10}, we will end up migrating 0 data leading to data
+ loss.
+ Hence, a lookup can make sure whether we need to migrate the
+ file or not.
+ */
+
+ dict = dict_new();
+ if (!dict) {
+ ret = -1;
+ *fop_errno = ENOMEM;
+ gf_msg(this->name, GF_LOG_ERROR, ENOMEM, DHT_MSG_NO_MEMORY,
+ "could not allocate memory for dict");
+ goto out;
+ }
+
+ ret = dict_set_int32(dict, conf->link_xattr_name, 256);
+ if (ret) {
+ *fop_errno = ENOMEM;
+ ret = -1;
+ gf_msg(this->name, GF_LOG_ERROR, 0, DHT_MSG_MIGRATE_FILE_FAILED,
+ "Migrate file failed:"
+ "%s: failed to set 'linkto' key in dict",
+ loc->path);
+ goto out;
+ }
+
+ ret = syncop_lookup(this, loc, &stbuf, NULL, dict, &xattr_rsp);
+ if (ret) {
+ /*Ignore ENOENT and ESTALE as file might have been
+ migrated already*/
+ if (-ret == ENOENT || -ret == ESTALE) {
+ ret = -2;
+ goto out;
+ }
+ gf_msg(this->name, GF_LOG_ERROR, -ret, DHT_MSG_MIGRATE_FILE_FAILED,
+ "Migrate file failed:%s lookup failed with ret = %d", loc->path,
+ ret);
+ *fop_errno = -ret;
+ ret = -1;
+ goto out;
+ }
+
+ cached_subvol = dht_subvol_get_cached(this, loc->inode);
+ if (!cached_subvol) {
+ gf_msg(this->name, GF_LOG_ERROR, 0, DHT_MSG_MIGRATE_FILE_FAILED,
+ "Migrate file failed :"
+ "Failed to get cached subvol"
+ " for %s on %s",
+ loc->name, this->name);
+ *fop_errno = EINVAL;
+ ret = -1;
+ goto out;
+ }
+
+ hashed_subvol = dht_subvol_get_hashed(this, loc);
+ if (!hashed_subvol) {
+ gf_msg(this->name, GF_LOG_ERROR, 0, DHT_MSG_MIGRATE_FILE_FAILED,
+ "Migrate file failed :"
+ "Failed to get hashed subvol"
+ " for %s on %s",
+ loc->name, this->name);
+ *fop_errno = EINVAL;
+ ret = -1;
+ goto out;
+ }
+
+ /* Hardlink migration happens only with remove-brick. So this condition will
+ * be true only when the migration has happened. In case hardlinks are
+ * migrated for rebalance case, remove this check. Having this check here
+ * avoid redundant calls below*/
+ if (hashed_subvol == cached_subvol) {
+ ret = -2;
+ goto out;
+ }
+
+ gf_log(this->name, GF_LOG_INFO,
+ "Attempting to migrate hardlink %s "
+ "with gfid %s from %s -> %s",
+ loc->name, uuid_utoa(loc->gfid), cached_subvol->name,
+ hashed_subvol->name);
+
+ data = dict_get(xattr_rsp, conf->link_xattr_name);
+ /* set linkto on cached -> hashed if not present, else link it */
+ if (!data) {
+ ret = dict_set_str(link_xattr, conf->link_xattr_name,
+ hashed_subvol->name);
+ if (ret) {
+ gf_msg(this->name, GF_LOG_ERROR, 0, DHT_MSG_MIGRATE_FILE_FAILED,
+ "Migrate file failed :"
+ "Failed to set dictionary value:"
+ " key = %s for %s",
+ conf->link_xattr_name, loc->name);
+ *fop_errno = ENOMEM;
+ ret = -1;
+ goto out;
}
- link_xattr = dict_new ();
- if (!link_xattr) {
- ret = -1;
- errno = ENOMEM;
- goto out;
+ ret = syncop_setxattr(cached_subvol, loc, link_xattr, 0, NULL, NULL);
+ if (ret) {
+ gf_msg(this->name, GF_LOG_ERROR, -ret, DHT_MSG_MIGRATE_FILE_FAILED,
+ "Migrate file failed :"
+ "Linkto setxattr failed %s -> %s",
+ cached_subvol->name, loc->name);
+ *fop_errno = -ret;
+ ret = -1;
+ goto out;
}
- /*
- Parallel migration can lead to migration of the hard link multiple
- times which can lead to data loss. Hence, adding a fresh lookup to
- decide whether migration is required or not.
-
- Elaborating the scenario for let say 10 hardlinks [link{1..10}]:
- Let say the first hard link "link1" does the setxattr of the
- new hashed subvolume info on the cached file. As there are multiple
- threads working, we might have already all the links created on the
- new hashed by the time we reach hardlink let say link5. Now the
- number of links on hashed is equal to that of cached. Hence, file
- migration will happen for link6.
-
- Cached Hashed
- --------T link6 rwxrwxrwx link6
-
- Now post above state all the link file on the cached will be zero
- byte linkto files. Hence, if we still do migration for the following
- files link{7..10}, we will end up migrating 0 data leading to data
- loss.
- Hence, a lookup can make sure whether we need to migrate the
- file or not.
- */
+ gf_msg_debug(this->name, 0,
+ "hardlink target subvol created on %s "
+ ",cached %s, file %s",
+ hashed_subvol->name, cached_subvol->name, loc->path);
- ret = syncop_lookup (this, loc, NULL, NULL,
- NULL, NULL);
- if (ret) {
- /*Ignore ENOENT and ESTALE as file might have been
- migrated already*/
- if (-ret == ENOENT || -ret == ESTALE) {
- ret = -2;
- goto out;
- }
- gf_msg (this->name, GF_LOG_ERROR, 0,
- DHT_MSG_MIGRATE_FILE_FAILED,
- "Migrate file failed:%s lookup failed with ret = %d",
- loc->path, ret);
- ret = -1;
- goto out;
+ ret = -2;
+ goto out;
+ } else {
+ linkto_subvol = dht_linkfile_subvol(this, NULL, NULL, xattr_rsp);
+ if (!linkto_subvol) {
+ gf_msg(this->name, GF_LOG_ERROR, 0, DHT_MSG_SUBVOL_ERROR,
+ "Failed to get "
+ "linkto subvol for %s",
+ loc->name);
+ } else {
+ hashed_subvol = linkto_subvol;
}
- cached_subvol = dht_subvol_get_cached (this, loc->inode);
- if (!cached_subvol) {
- gf_msg (this->name, GF_LOG_ERROR, 0,
- DHT_MSG_MIGRATE_FILE_FAILED,
- "Migrate file failed :"
- "Failed to get cached subvol"
- " for %s on %s", loc->name, this->name);
+ ret = syncop_link(hashed_subvol, loc, loc, &iatt, NULL, NULL);
+ if (ret) {
+ op_errno = -ret;
+ ret = -1;
+
+ loglevel = (op_errno == EEXIST) ? GF_LOG_DEBUG : GF_LOG_ERROR;
+ gf_msg(this->name, loglevel, op_errno,
+ DHT_MSG_MIGRATE_HARDLINK_FILE_FAILED,
+ "link of %s -> %s"
+ " failed on subvol %s",
+ loc->name, uuid_utoa(loc->gfid), hashed_subvol->name);
+ if (op_errno != EEXIST) {
+ *fop_errno = op_errno;
goto out;
+ }
+ } else {
+ gf_msg_debug(this->name, 0,
+ "syncop_link successful for"
+ " hardlink %s on subvol %s, cached %s",
+ loc->path, hashed_subvol->name, cached_subvol->name);
}
+ }
- hashed_subvol = dht_subvol_get_hashed (this, loc);
- if (!hashed_subvol) {
- gf_msg (this->name, GF_LOG_ERROR, 0,
- DHT_MSG_MIGRATE_FILE_FAILED,
- "Migrate file failed :"
- "Failed to get hashed subvol"
- " for %s on %s", loc->name, this->name);
- goto out;
- }
+ ret = syncop_lookup(hashed_subvol, loc, &iatt, NULL, NULL, NULL);
+ if (ret) {
+ gf_msg(this->name, GF_LOG_ERROR, -ret, DHT_MSG_MIGRATE_FILE_FAILED,
+ "Migrate file failed :Failed lookup %s on %s ", loc->name,
+ hashed_subvol->name);
- if (hashed_subvol == cached_subvol) {
- ret = -2;
- goto out;
+ *fop_errno = -ret;
+ ret = -1;
+ goto out;
+ }
+
+ /* There is a race where on the target subvol for the hardlink
+ * (note: hash subvol for the hardlink might differ from this), some
+ * other client(non-rebalance) would have created a linkto file for that
+ * hardlink as part of lookup. So let say there are 10 hardlinks, on the
+ * 5th hardlink it self the hardlinks might have migrated. Now for
+ * (6..10th) hardlinks the cached and target would be same as the file
+ * has already migrated. Hence this check is needed */
+ if (cached_subvol == hashed_subvol) {
+ gf_msg_debug(this->name, 0,
+ "source %s and destination %s "
+ "for hardlink %s are same",
+ cached_subvol->name, hashed_subvol->name, loc->path);
+ ret = -2;
+ goto out;
+ }
+
+ if (iatt.ia_nlink == stbuf.ia_nlink) {
+ ret = dht_migrate_file(this, loc, cached_subvol, hashed_subvol,
+ GF_DHT_MIGRATE_HARDLINK_IN_PROGRESS, fop_errno);
+ if (ret) {
+ goto out;
}
+ }
+ ret = -2;
+out:
+ if (link_xattr)
+ dict_unref(link_xattr);
- gf_log (this->name, GF_LOG_INFO, "Attempting to migrate hardlink %s "
- "with gfid %s from %s -> %s", loc->name, uuid_utoa (loc->gfid),
- cached_subvol->name, hashed_subvol->name);
- data = dict_get (xattrs, conf->link_xattr_name);
- /* set linkto on cached -> hashed if not present, else link it */
- if (!data) {
- ret = dict_set_str (link_xattr, conf->link_xattr_name,
- hashed_subvol->name);
- if (ret) {
- gf_msg (this->name, GF_LOG_ERROR, 0,
- DHT_MSG_MIGRATE_FILE_FAILED,
- "Migrate file failed :"
- "Failed to set dictionary value:"
- " key = %s for %s",
- conf->link_xattr_name, loc->name);
- goto out;
- }
+ if (xattr_rsp)
+ dict_unref(xattr_rsp);
- ret = syncop_setxattr (cached_subvol, loc, link_xattr, 0, NULL,
- NULL);
- if (ret) {
- gf_msg (this->name, GF_LOG_ERROR, 0,
- DHT_MSG_MIGRATE_FILE_FAILED,
- "Migrate file failed :"
- "Linkto setxattr failed %s -> %s (%s)",
- cached_subvol->name,
- loc->name, strerror (-ret));
- ret = -1;
- goto out;
- }
- ret = -2;
- goto out;
- } else {
- linkto_subvol = dht_linkfile_subvol (this, NULL, NULL, xattrs);
- if (!linkto_subvol) {
- gf_msg (this->name, GF_LOG_ERROR, 0,
- DHT_MSG_SUBVOL_ERROR,
- "Failed to get "
- "linkto subvol for %s", loc->name);
- } else {
- hashed_subvol = linkto_subvol;
- }
+ if (dict)
+ dict_unref(dict);
- ret = syncop_link (hashed_subvol, loc, loc, &iatt, NULL, NULL);
- if (ret) {
- op_errno = -ret;
- ret = -1;
+ return ret;
+}
- loglevel = (op_errno == EEXIST) ? GF_LOG_DEBUG : \
- GF_LOG_ERROR;
- gf_msg (this->name, loglevel, op_errno,
- DHT_MSG_MIGRATE_HARDLINK_FILE_FAILED,
- "link of %s -> %s"
- " failed on subvol %s", loc->name,
- uuid_utoa(loc->gfid),
- hashed_subvol->name);
- if (op_errno != EEXIST)
- goto out;
- }
- }
- ret = syncop_lookup (hashed_subvol, loc, &iatt, NULL, NULL, NULL);
- if (ret) {
- gf_msg (this->name, GF_LOG_ERROR, -ret,
- DHT_MSG_MIGRATE_FILE_FAILED,
- "Migrate file failed :Failed lookup %s on %s ",
- loc->name, hashed_subvol->name);
+static int
+__check_file_has_hardlink(xlator_t *this, loc_t *loc, struct iatt *stbuf,
+ dict_t *xattrs, int flags, gf_defrag_info_t *defrag,
+ dht_conf_t *conf, int *fop_errno)
+{
+ int ret = 0;
- ret = -1;
- goto out;
+ if (flags == GF_DHT_MIGRATE_HARDLINK_IN_PROGRESS) {
+ ret = 0;
+ return ret;
+ }
+ if (stbuf->ia_nlink > 1) {
+ /* support for decomission */
+ if (flags == GF_DHT_MIGRATE_HARDLINK) {
+ synclock_lock(&conf->link_lock);
+ ret = gf_defrag_handle_hardlink(this, loc, fop_errno);
+ synclock_unlock(&conf->link_lock);
+ /*
+ Returning zero will force the file to be remigrated.
+ Checkout gf_defrag_handle_hardlink for more information.
+ */
+ if (ret && ret != -2) {
+ gf_msg(this->name, GF_LOG_WARNING, 0,
+ DHT_MSG_MIGRATE_FILE_FAILED,
+ "Migrate file failed:"
+ "%s: failed to migrate file with link",
+ loc->path);
+ }
+ } else {
+ gf_msg(this->name, GF_LOG_WARNING, 0, DHT_MSG_MIGRATE_FILE_FAILED,
+ "Migration skipped for:"
+ "%s: file has hardlinks",
+ loc->path);
+ *fop_errno = ENOTSUP;
+ ret = 1;
}
+ }
- if (iatt.ia_nlink == stbuf->ia_nlink) {
- ret = dht_migrate_file (this, loc, cached_subvol, hashed_subvol,
- GF_DHT_MIGRATE_HARDLINK_IN_PROGRESS);
- if (ret)
- goto out;
- }
- ret = -2;
-out:
- if (link_xattr)
- dict_unref (link_xattr);
- return ret;
+ return ret;
}
/*
@@ -386,783 +504,963 @@ out:
-1 : failure
*/
static int
-__is_file_migratable (xlator_t *this, loc_t *loc,
- struct iatt *stbuf, dict_t *xattrs, int flags,
- gf_defrag_info_t *defrag)
+__is_file_migratable(xlator_t *this, loc_t *loc, struct iatt *stbuf,
+ dict_t *xattrs, int flags, gf_defrag_info_t *defrag,
+ dht_conf_t *conf, int *fop_errno)
{
- int ret = -1;
- int lock_count = 0;
-
- if (IA_ISDIR (stbuf->ia_type)) {
- gf_msg (this->name, GF_LOG_WARNING, 0,
- DHT_MSG_MIGRATE_FILE_FAILED,
- "Migrate file failed:"
- "%s: migrate-file called on directory", loc->path);
- ret = -1;
- goto out;
- }
+ int ret = -1;
+ int lock_count = 0;
+
+ if (IA_ISDIR(stbuf->ia_type)) {
+ gf_msg(this->name, GF_LOG_WARNING, 0, DHT_MSG_MIGRATE_FILE_FAILED,
+ "Migrate file failed:"
+ "%s: migrate-file called on directory",
+ loc->path);
+ *fop_errno = EISDIR;
+ ret = -1;
+ goto out;
+ }
- ret = dict_get_int32 (xattrs, GLUSTERFS_POSIXLK_COUNT, &lock_count);
+ if (!conf->lock_migration_enabled) {
+ ret = dict_get_int32(xattrs, GLUSTERFS_POSIXLK_COUNT, &lock_count);
if (ret) {
- gf_msg (this->name, GF_LOG_WARNING, 0,
- DHT_MSG_MIGRATE_FILE_FAILED,
- "Migrate file failed:"
- "%s: Unable to get lock count for file", loc->path);
- ret = -1;
- goto out;
+ gf_msg(this->name, GF_LOG_WARNING, 0, DHT_MSG_MIGRATE_FILE_FAILED,
+ "Migrate file failed:"
+ "%s: Unable to get lock count for file",
+ loc->path);
+ *fop_errno = EINVAL;
+ ret = -1;
+ goto out;
}
if (lock_count) {
- gf_msg (this->name, GF_LOG_WARNING, 0,
- DHT_MSG_MIGRATE_FILE_FAILED,
- "Migrate file failed: %s: File has locks."
- " Skipping file migration", loc->path);
- ret = -1;
- goto out;
- }
-
- if (flags == GF_DHT_MIGRATE_HARDLINK_IN_PROGRESS) {
- ret = 0;
- goto out;
- }
-
- if (stbuf->ia_nlink > 1) {
- /* support for decomission */
- if (flags == GF_DHT_MIGRATE_HARDLINK) {
- synclock_lock (&defrag->link_lock);
- ret = gf_defrag_handle_hardlink
- (this, loc, xattrs, stbuf);
- synclock_unlock (&defrag->link_lock);
- /*
- Returning zero will force the file to be remigrated.
- Checkout gf_defrag_handle_hardlink for more information.
- */
- if (ret && ret != -2) {
- gf_msg (this->name, GF_LOG_WARNING, 0,
- DHT_MSG_MIGRATE_FILE_FAILED,
- "Migrate file failed:"
- "%s: failed to migrate file with link",
- loc->path);
- }
- } else {
- gf_msg (this->name, GF_LOG_WARNING, 0,
- DHT_MSG_MIGRATE_FILE_FAILED,
- "Migrate file failed:"
- "%s: file has hardlinks", loc->path);
- ret = -ENOTSUP;
- }
- goto out;
- }
-
- ret = 0;
-
+ gf_msg(this->name, GF_LOG_WARNING, 0, DHT_MSG_MIGRATE_FILE_FAILED,
+ "Migrate file failed: %s: File has locks."
+ " Skipping file migration",
+ loc->path);
+ *fop_errno = ENOTSUP;
+ ret = 1;
+ goto out;
+ }
+ }
+
+ /* Check if file has hardlink*/
+ ret = __check_file_has_hardlink(this, loc, stbuf, xattrs, flags, defrag,
+ conf, fop_errno);
out:
- return ret;
+ return ret;
}
-
static int
-__dht_rebalance_create_dst_file (xlator_t *to, xlator_t *from, loc_t *loc, struct iatt *stbuf,
- fd_t **dst_fd, dict_t *xattr)
+__dht_rebalance_create_dst_file(xlator_t *this, xlator_t *to, xlator_t *from,
+ loc_t *loc, struct iatt *stbuf, fd_t **dst_fd,
+ int *fop_errno, int file_has_holes)
{
- xlator_t *this = NULL;
- int ret = -1;
- fd_t *fd = NULL;
- struct iatt new_stbuf = {0,};
- struct iatt check_stbuf= {0,};
- dht_conf_t *conf = NULL;
- dict_t *dict = NULL;
-
- this = THIS;
- conf = this->private;
-
- dict = dict_new ();
- if (!dict)
- goto out;
-
- ret = dict_set_static_bin (dict, "gfid-req", stbuf->ia_gfid, 16);
- if (ret) {
- gf_msg (this->name, GF_LOG_ERROR, 0,
- DHT_MSG_DICT_SET_FAILED,
- "%s: failed to set dictionary value: key = gfid-req",
- loc->path);
- goto out;
- }
-
- ret = dict_set_str (dict, conf->link_xattr_name, from->name);
- if (ret) {
- gf_msg (this->name, GF_LOG_ERROR, 0,
- DHT_MSG_DICT_SET_FAILED,
- "%s: failed to set dictionary value: key = %s ",
- loc->path, conf->link_xattr_name);
- goto out;
- }
-
- fd = fd_create (loc->inode, DHT_REBALANCE_PID);
- if (!fd) {
- gf_msg (this->name, GF_LOG_ERROR, 0,
- DHT_MSG_MIGRATE_FILE_FAILED,
- "%s: fd create failed (destination) (%s)",
- loc->path, strerror (errno));
- ret = -1;
- goto out;
- }
-
- ret = syncop_lookup (to, loc, &new_stbuf, NULL, NULL, NULL);
- if (!ret) {
- /* File exits in the destination, check if gfid matches */
- if (gf_uuid_compare (stbuf->ia_gfid, new_stbuf.ia_gfid) != 0) {
- gf_msg (this->name, GF_LOG_ERROR, 0,
- DHT_MSG_GFID_MISMATCH,
- "file %s exists in %s with different gfid",
- loc->path, to->name);
- ret = -1;
- goto out;
- }
- }
- if ((ret < 0) && (-ret != ENOENT)) {
- /* File exists in destination, but not accessible */
- gf_msg (THIS->name, GF_LOG_WARNING, 0,
- DHT_MSG_MIGRATE_FILE_FAILED,
- "%s: failed to lookup file (%s)",
- loc->path, strerror (-ret));
- ret = -1;
- goto out;
- }
-
- /* Create the destination with LINKFILE mode, and linkto xattr,
- if the linkfile already exists, just open the file */
- if (!ret) {
- /*
- * File already present, just open the file.
- */
- ret = syncop_open (to, loc, O_RDWR, fd, NULL, NULL);
- if (ret < 0) {
- gf_msg (this->name, GF_LOG_ERROR, -ret,
- DHT_MSG_MIGRATE_FILE_FAILED,
- "failed to open %s on %s",
- loc->path, to->name);
- ret = -1;
- goto out;
- }
- } else {
- ret = syncop_create (to, loc, O_RDWR, DHT_LINKFILE_MODE, fd,
- &new_stbuf, dict, NULL);
- if (ret < 0) {
- gf_msg (this->name, GF_LOG_ERROR, -ret,
- DHT_MSG_MIGRATE_FILE_FAILED,
- "failed to create %s on %s",
- loc->path, to->name);
- ret = -1;
- goto out;
- }
-
- }
-
- fd_bind (fd);
+ int ret = -1;
+ int ret2 = -1;
+ fd_t *fd = NULL;
+ struct iatt new_stbuf = {
+ 0,
+ };
+ struct iatt check_stbuf = {
+ 0,
+ };
+ dht_conf_t *conf = NULL;
+ dict_t *dict = NULL;
+ dict_t *xdata = NULL;
+
+ conf = this->private;
+
+ dict = dict_new();
+ if (!dict) {
+ *fop_errno = ENOMEM;
+ ret = -1;
+ gf_msg(this->name, GF_LOG_ERROR, ENOMEM, DHT_MSG_NO_MEMORY,
+ "dictionary allocation failed for"
+ "path:%s",
+ loc->path);
+ goto out;
+ }
+ ret = dict_set_gfuuid(dict, "gfid-req", stbuf->ia_gfid, true);
+ if (ret) {
+ *fop_errno = ENOMEM;
+ ret = -1;
+ gf_msg(this->name, GF_LOG_ERROR, 0, DHT_MSG_DICT_SET_FAILED,
+ "%s: failed to set dictionary value: key = gfid-req", loc->path);
+ goto out;
+ }
+
+ ret = dict_set_str(dict, conf->link_xattr_name, from->name);
+ if (ret) {
+ *fop_errno = ENOMEM;
+ ret = -1;
+ gf_msg(this->name, GF_LOG_ERROR, 0, DHT_MSG_DICT_SET_FAILED,
+ "%s: failed to set dictionary value: key = %s ", loc->path,
+ conf->link_xattr_name);
+ goto out;
+ }
+
+ fd = fd_create(loc->inode, DHT_REBALANCE_PID);
+ if (!fd) {
+ *fop_errno = ENOMEM;
+ ret = -1;
+ gf_msg(this->name, GF_LOG_ERROR, ENOMEM, DHT_MSG_MIGRATE_FILE_FAILED,
+ "%s: fd create failed (destination)", loc->path);
+ goto out;
+ }
+
+ xdata = dict_new();
+ if (!xdata) {
+ *fop_errno = ENOMEM;
+ ret = -1;
+ gf_msg(this->name, GF_LOG_ERROR, ENOMEM, DHT_MSG_MIGRATE_FILE_FAILED,
+ "%s: dict_new failed)", loc->path);
+ goto out;
+ }
+
+ ret = dict_set_int32_sizen(xdata, GF_CLEAN_WRITE_PROTECTION, 1);
+ if (ret) {
+ *fop_errno = ENOMEM;
+ ret = -1;
+ gf_msg(this->name, GF_LOG_ERROR, 0, DHT_MSG_DICT_SET_FAILED,
+ "%s: failed to set dictionary value: key = %s ", loc->path,
+ GF_CLEAN_WRITE_PROTECTION);
+ goto out;
+ }
+
+ ret = syncop_lookup(to, loc, &new_stbuf, NULL, xdata, NULL);
+ if (!ret) {
+ /* File exits in the destination, check if gfid matches */
+ if (gf_uuid_compare(stbuf->ia_gfid, new_stbuf.ia_gfid) != 0) {
+ gf_msg(this->name, GF_LOG_ERROR, 0, DHT_MSG_GFID_MISMATCH,
+ "file %s exists in %s with different gfid", loc->path,
+ to->name);
+ *fop_errno = EINVAL;
+ ret = -1;
+ goto out;
+ }
+ }
+ if ((ret < 0) && (-ret != ENOENT)) {
+ /* File exists in destination, but not accessible */
+ gf_msg(THIS->name, GF_LOG_WARNING, -ret, DHT_MSG_MIGRATE_FILE_FAILED,
+ "%s: failed to lookup file", loc->path);
+ *fop_errno = -ret;
+ ret = -1;
+ goto out;
+ }
- /*Reason of doing lookup after create again:
- *In the create, there is some time-gap between opening fd at the
- *server (posix_layer) and binding it in server (incrementing fd count),
- *so if in that time-gap, if other process sends unlink considering it
- *as a linkto file, because inode->fd count will be 0, so file will be
- *unlinked at the backend. And because furthur operations are performed
- *on fd, so though migration will be done but will end with no file
- *at the backend.
+ /* Create the destination with LINKFILE mode, and linkto xattr,
+ if the linkfile already exists, just open the file */
+ if (!ret) {
+ /*
+ * File already present, just open the file.
*/
-
- ret = syncop_lookup (to, loc, &check_stbuf, NULL, NULL, NULL);
- if (!ret) {
-
- if (gf_uuid_compare (stbuf->ia_gfid, check_stbuf.ia_gfid) != 0) {
- gf_msg (this->name, GF_LOG_ERROR, 0,
- DHT_MSG_GFID_MISMATCH,
- "file %s exists in %s with different gfid,"
- "found in lookup after create",
- loc->path, to->name);
- ret = -1;
- goto out;
+ ret = syncop_open(to, loc, O_RDWR, fd, NULL, NULL);
+ if (ret < 0) {
+ gf_msg(this->name, GF_LOG_ERROR, -ret, DHT_MSG_MIGRATE_FILE_FAILED,
+ "failed to open %s on %s", loc->path, to->name);
+ *fop_errno = -ret;
+ ret = -1;
+ goto out;
+ }
+ } else {
+ ret = syncop_create(to, loc, O_RDWR, DHT_LINKFILE_MODE, fd, &new_stbuf,
+ dict, NULL);
+ if (ret < 0) {
+ gf_msg(this->name, GF_LOG_ERROR, -ret, DHT_MSG_MIGRATE_FILE_FAILED,
+ "failed to create %s on %s", loc->path, to->name);
+ *fop_errno = -ret;
+ ret = -1;
+ goto out;
+ }
+ }
+
+ fd_bind(fd);
+
+ /*Reason of doing lookup after create again:
+ *In the create, there is some time-gap between opening fd at the
+ *server (posix_layer) and binding it in server (incrementing fd count),
+ *so if in that time-gap, if other process sends unlink considering it
+ *as a linkto file, because inode->fd count will be 0, so file will be
+ *unlinked at the backend. And because further operations are performed
+ *on fd, so though migration will be done but will end with no file
+ *at the backend.
+ */
+
+ ret = syncop_lookup(to, loc, &check_stbuf, NULL, NULL, NULL);
+ if (!ret) {
+ if (gf_uuid_compare(stbuf->ia_gfid, check_stbuf.ia_gfid) != 0) {
+ gf_msg(this->name, GF_LOG_ERROR, 0, DHT_MSG_GFID_MISMATCH,
+ "file %s exists in %s with different gfid,"
+ "found in lookup after create",
+ loc->path, to->name);
+ *fop_errno = EINVAL;
+ ret = -1;
+ goto out;
+ }
+ }
+
+ if (-ret == ENOENT) {
+ gf_msg(this->name, GF_LOG_ERROR, -ret, DHT_MSG_MIGRATE_FILE_FAILED,
+ "%s: file does not exist"
+ "on %s",
+ loc->path, to->name);
+ *fop_errno = -ret;
+ ret = -1;
+ goto out;
+ }
+
+ ret = syncop_fsetattr(to, fd, stbuf, (GF_SET_ATTR_UID | GF_SET_ATTR_GID),
+ NULL, NULL, NULL, NULL);
+ if (ret < 0) {
+ *fop_errno = -ret;
+ gf_msg(this->name, GF_LOG_ERROR, -ret, DHT_MSG_MIGRATE_FILE_FAILED,
+ "chown failed for %s on %s", loc->path, to->name);
+ }
+
+ /* No need to bother about 0 byte size files */
+ if (stbuf->ia_size > 0) {
+ if (conf->use_fallocate && !file_has_holes) {
+ ret = syncop_fallocate(to, fd, 0, 0, stbuf->ia_size, NULL, NULL);
+ if (ret < 0) {
+ if (ret == -EOPNOTSUPP || ret == -EINVAL || ret == -ENOSYS) {
+ conf->use_fallocate = _gf_false;
+ } else {
+ gf_msg(this->name, GF_LOG_ERROR, -ret,
+ DHT_MSG_MIGRATE_FILE_FAILED,
+ "fallocate failed for %s on %s", loc->path,
+ to->name);
+
+ *fop_errno = -ret;
+
+ /* fallocate does not release the space
+ * in some cases
+ */
+ ret2 = syncop_ftruncate(to, fd, 0, NULL, NULL, NULL, NULL);
+ if (ret2 < 0) {
+ gf_msg(this->name, GF_LOG_WARNING, -ret2,
+ DHT_MSG_MIGRATE_FILE_FAILED,
+ "ftruncate failed for "
+ "%s on %s",
+ loc->path, to->name);
+ }
+ goto out;
}
-
- }
-
- if (-ret == ENOENT) {
- gf_msg (this->name, GF_LOG_ERROR, 0,
- DHT_MSG_MIGRATE_FILE_FAILED, "%s: file does not exists"
- "on %s (%s)", loc->path, to->name, strerror (-ret));
- ret = -1;
- goto out;
+ }
+ } else {
+ ret = syncop_ftruncate(to, fd, stbuf->ia_size, NULL, NULL, NULL,
+ NULL);
+ if (ret < 0) {
+ *fop_errno = -ret;
+ gf_msg(this->name, GF_LOG_WARNING, -ret,
+ DHT_MSG_MIGRATE_FILE_FAILED,
+ "ftruncate failed for %s on %s", loc->path, to->name);
+ }
}
+ }
- ret = syncop_fsetxattr (to, fd, xattr, 0, NULL, NULL);
- if (ret < 0)
- gf_msg (this->name, GF_LOG_WARNING, 0,
- DHT_MSG_MIGRATE_FILE_FAILED,
- "%s: failed to set xattr on %s (%s)",
- loc->path, to->name, strerror (-ret));
-
- ret = syncop_ftruncate (to, fd, stbuf->ia_size, NULL, NULL);
- if (ret < 0)
- gf_msg (this->name, GF_LOG_ERROR, 0,
- DHT_MSG_MIGRATE_FILE_FAILED,
- "ftruncate failed for %s on %s (%s)",
- loc->path, to->name, strerror (-ret));
-
- ret = syncop_fsetattr (to, fd, stbuf,
- (GF_SET_ATTR_UID | GF_SET_ATTR_GID),
- NULL, NULL, NULL, NULL);
- if (ret < 0)
- gf_msg (this->name, GF_LOG_ERROR, 0,
- DHT_MSG_MIGRATE_FILE_FAILED,
- "chown failed for %s on %s (%s)",
- loc->path, to->name, strerror (-ret));
-
- /* success */
- ret = 0;
+ /* success */
+ ret = 0;
- if (dst_fd)
- *dst_fd = fd;
+ if (dst_fd)
+ *dst_fd = fd;
out:
- if (ret) {
- if (fd) {
- fd_unref (fd);
- }
+ if (ret) {
+ if (fd) {
+ fd_unref(fd);
}
- if (dict)
- dict_unref (dict);
+ }
+ if (dict)
+ dict_unref(dict);
- return ret;
+ if (xdata)
+ dict_unref(xdata);
+
+ return ret;
}
static int
-__dht_check_free_space (xlator_t *to, xlator_t *from, loc_t *loc,
- struct iatt *stbuf, int flag)
+__dht_check_free_space(xlator_t *this, xlator_t *to, xlator_t *from, loc_t *loc,
+ struct iatt *stbuf, int flag, dht_conf_t *conf,
+ gf_boolean_t *target_changed, xlator_t **new_subvol,
+ int *fop_errno)
{
- struct statvfs src_statfs = {0,};
- struct statvfs dst_statfs = {0,};
- int ret = -1;
- xlator_t *this = NULL;
- dict_t *xdata = NULL;
-
- uint64_t src_statfs_blocks = 1;
- uint64_t dst_statfs_blocks = 1;
-
- this = THIS;
-
- xdata = dict_new ();
- if (!xdata) {
- errno = ENOMEM;
- gf_msg (this->name, GF_LOG_ERROR, ENOMEM,
- DHT_MSG_NO_MEMORY,
- "failed to allocate dictionary");
- goto out;
- }
+ struct statvfs src_statfs = {
+ 0,
+ };
+ struct statvfs dst_statfs = {
+ 0,
+ };
+ int ret = -1;
+ dict_t *xdata = NULL;
+ dht_layout_t *layout = NULL;
+ uint64_t src_statfs_blocks = 1;
+ uint64_t dst_statfs_blocks = 1;
+ double dst_post_availspacepercent = 0;
+ double src_post_availspacepercent = 0;
+ uint64_t file_blocks = 0;
+ uint64_t src_total_blocks = 0;
+ uint64_t dst_total_blocks = 0;
+
+ xdata = dict_new();
+ if (!xdata) {
+ *fop_errno = ENOMEM;
+ ret = -1;
+ gf_msg(this->name, GF_LOG_ERROR, ENOMEM, DHT_MSG_NO_MEMORY,
+ "failed to allocate dictionary");
+ goto out;
+ }
+
+ ret = dict_set_int8(xdata, GF_INTERNAL_IGNORE_DEEM_STATFS, 1);
+ if (ret) {
+ gf_log(this->name, GF_LOG_ERROR,
+ "Failed to set " GF_INTERNAL_IGNORE_DEEM_STATFS " in dict");
+ ret = -1;
+ *fop_errno = ENOMEM;
+ goto out;
+ }
+
+ ret = syncop_statfs(from, loc, &src_statfs, xdata, NULL);
+ if (ret) {
+ gf_msg(this->name, GF_LOG_ERROR, -ret, DHT_MSG_MIGRATE_FILE_FAILED,
+ "failed to get statfs of %s on %s", loc->path, from->name);
+ *fop_errno = -ret;
+ ret = -1;
+ goto out;
+ }
+
+ ret = syncop_statfs(to, loc, &dst_statfs, xdata, NULL);
+ if (ret) {
+ gf_msg(this->name, GF_LOG_ERROR, -ret, DHT_MSG_MIGRATE_FILE_FAILED,
+ "failed to get statfs of %s on %s", loc->path, to->name);
+ *fop_errno = -ret;
+ ret = -1;
+ goto out;
+ }
+
+ gf_msg_debug(this->name, 0,
+ "min_free_disk - %f , block available - %" PRId64
+ ", block size - %lu",
+ conf->min_free_disk, dst_statfs.f_bavail, dst_statfs.f_bsize);
+
+ dst_statfs_blocks = dst_statfs.f_bavail *
+ (dst_statfs.f_frsize / GF_DISK_SECTOR_SIZE);
+
+ src_statfs_blocks = src_statfs.f_bavail *
+ (src_statfs.f_frsize / GF_DISK_SECTOR_SIZE);
+
+ dst_total_blocks = dst_statfs.f_blocks *
+ (dst_statfs.f_frsize / GF_DISK_SECTOR_SIZE);
+
+ src_total_blocks = src_statfs.f_blocks *
+ (src_statfs.f_frsize / GF_DISK_SECTOR_SIZE);
+
+ /* if force option is given, do not check for space @ dst.
+ * Check only if space is avail for the file */
+ if (flag != GF_DHT_MIGRATE_DATA)
+ goto check_avail_space;
+
+ /* Check:
+ During rebalance `migrate-data` - Destination subvol experiences
+ a `reduction` in 'blocks' of free space, at the same time source
+ subvol gains certain 'blocks' of free space. A valid check is
+ necessary here to avoid erroneous move to destination where
+ the space could be scantily available.
+ With heterogeneous brick support, an actual space comparison could
+ prevent any files being migrated to newly added bricks if they are
+ smaller then the free space available on the existing bricks.
+ */
+ if (!conf->use_fallocate) {
+ file_blocks = stbuf->ia_size + GF_DISK_SECTOR_SIZE - 1;
+ file_blocks /= GF_DISK_SECTOR_SIZE;
+
+ if (file_blocks >= dst_statfs_blocks) {
+ dst_statfs_blocks = 0;
+ } else {
+ dst_statfs_blocks -= file_blocks;
+ }
+ }
+
+ src_post_availspacepercent = ((src_statfs_blocks + file_blocks) * 100) /
+ src_total_blocks;
+
+ dst_post_availspacepercent = (dst_statfs_blocks * 100) / dst_total_blocks;
+
+ if (dst_post_availspacepercent < src_post_availspacepercent) {
+ gf_msg(this->name, GF_LOG_WARNING, 0, DHT_MSG_MIGRATE_FILE_FAILED,
+ "data movement of file "
+ "{blocks:%" PRIu64
+ " name:(%s)} would result in "
+ "dst node (%s:%" PRIu64
+ ") having lower disk "
+ "space than the source node (%s:%" PRIu64
+ ")"
+ ".Skipping file.",
+ stbuf->ia_blocks, loc->path, to->name, dst_statfs_blocks,
+ from->name, src_statfs_blocks);
+
+ /* this is not a 'failure', but we don't want to
+ consider this as 'success' too :-/ */
+ *fop_errno = ENOSPC;
+ ret = 1;
+ goto out;
+ }
- ret = dict_set_int8 (xdata, GF_INTERNAL_IGNORE_DEEM_STATFS, 1);
- if (ret) {
- gf_log (this->name, GF_LOG_ERROR,
- "Failed to set "
- GF_INTERNAL_IGNORE_DEEM_STATFS" in dict");
- ret = -1;
- goto out;
+check_avail_space:
+ if (conf->disk_unit == 'p' && dst_statfs.f_blocks) {
+ dst_post_availspacepercent = (dst_statfs_blocks * 100) /
+ dst_total_blocks;
+
+ gf_msg_debug(this->name, 0,
+ "file : %s, post_availspacepercent"
+ " : %lf f_bavail : %" PRIu64 " min-free-disk: %lf",
+ loc->path, dst_post_availspacepercent, dst_statfs.f_bavail,
+ conf->min_free_disk);
+
+ if (dst_post_availspacepercent < conf->min_free_disk) {
+ gf_msg(this->name, GF_LOG_WARNING, 0, 0,
+ "Write will cross min-free-disk for "
+ "file - %s on subvol - %s. Looking "
+ "for new subvol",
+ loc->path, to->name);
+
+ goto find_new_subvol;
+ } else {
+ ret = 0;
+ goto out;
}
+ }
- ret = syncop_statfs (from, loc, &src_statfs, xdata, NULL);
- if (ret) {
- gf_msg (this->name, GF_LOG_ERROR, 0,
- DHT_MSG_MIGRATE_FILE_FAILED,
- "failed to get statfs of %s on %s (%s)",
- loc->path, from->name, strerror (-ret));
- ret = -1;
- goto out;
- }
+ if (conf->disk_unit != 'p') {
+ if ((dst_statfs_blocks * GF_DISK_SECTOR_SIZE) < conf->min_free_disk) {
+ gf_msg_debug(this->name, 0,
+ "file : %s, destination frsize: %lu "
+ "f_bavail : %" PRIu64 " min-free-disk: %lf",
+ loc->path, dst_statfs.f_frsize, dst_statfs.f_bavail,
+ conf->min_free_disk);
- ret = syncop_statfs (to, loc, &dst_statfs, xdata, NULL);
- if (ret) {
- gf_msg (this->name, GF_LOG_ERROR, 0,
- DHT_MSG_MIGRATE_FILE_FAILED,
- "failed to get statfs of %s on %s (%s)",
- loc->path, to->name, strerror (-ret));
- ret = -1;
- goto out;
- }
+ gf_msg(this->name, GF_LOG_WARNING, 0, 0,
+ "write will"
+ " cross min-free-disk for file - %s on subvol -"
+ " %s. looking for new subvol",
+ loc->path, to->name);
- /* if force option is given, do not check for space @ dst.
- * Check only if space is avail for the file */
- if (flag != GF_DHT_MIGRATE_DATA)
- goto check_avail_space;
+ goto find_new_subvol;
- /* Check:
- During rebalance `migrate-data` - Destination subvol experiences
- a `reduction` in 'blocks' of free space, at the same time source
- subvol gains certain 'blocks' of free space. A valid check is
- necessary here to avoid errorneous move to destination where
- the space could be scantily available.
- */
- if (stbuf) {
- dst_statfs_blocks = ((dst_statfs.f_bavail *
- dst_statfs.f_bsize) /
- GF_DISK_SECTOR_SIZE);
- src_statfs_blocks = ((src_statfs.f_bavail *
- src_statfs.f_bsize) /
- GF_DISK_SECTOR_SIZE);
- if ((dst_statfs_blocks - stbuf->ia_blocks) <
- (src_statfs_blocks + stbuf->ia_blocks)) {
-
- gf_msg (this->name, GF_LOG_WARNING, 0,
- DHT_MSG_MIGRATE_FILE_FAILED,
- "data movement attempted from node "
- "(%s:%"PRIu64") with higher disk space "
- "to a node (%s:%"PRIu64") with lesser "
- "disk space, file { blocks:%"PRIu64", "
- "name:(%s) }", from->name, src_statfs_blocks,
- to->name, dst_statfs_blocks,
- stbuf->ia_blocks, loc->path);
-
- /* this is not a 'failure', but we don't want to
- consider this as 'success' too :-/ */
- ret = -1;
- goto out;
- }
- }
-check_avail_space:
- if (((dst_statfs.f_bavail * dst_statfs.f_bsize) /
- GF_DISK_SECTOR_SIZE) < stbuf->ia_blocks) {
- gf_msg (this->name, GF_LOG_ERROR, 0,
- DHT_MSG_MIGRATE_FILE_FAILED,
- "data movement attempted from node (%s) to node (%s) "
- "which does not have required free space for (%s)",
- from->name, to->name, loc->path);
- ret = -1;
- goto out;
+ } else {
+ ret = 0;
+ goto out;
}
+ }
+find_new_subvol:
+ layout = dht_layout_get(this, loc->parent);
+ if (!layout) {
+ gf_log(this->name, GF_LOG_ERROR, "Layout is NULL");
+ *fop_errno = EINVAL;
+ ret = -1;
+ goto out;
+ }
+
+ *new_subvol = dht_subvol_with_free_space_inodes(this, to, from, layout,
+ stbuf->ia_size);
+ if ((!(*new_subvol)) || (*new_subvol == from)) {
+ gf_msg(this->name, GF_LOG_WARNING, 0, DHT_MSG_SUBVOL_INSUFF_SPACE,
+ "Could not find any subvol"
+ " with space accommodating the file - %s. Consider "
+ "adding bricks",
+ loc->path);
+
+ *target_changed = _gf_false;
+ *fop_errno = ENOSPC;
+ ret = -1;
+ } else {
+ gf_msg(this->name, GF_LOG_INFO, 0, 0,
+ "new target found - %s"
+ " for file - %s",
+ (*new_subvol)->name, loc->path);
+ *target_changed = _gf_true;
ret = 0;
+ }
+
out:
- if (xdata)
- dict_unref (xdata);
- return ret;
+ if (xdata)
+ dict_unref(xdata);
+ return ret;
}
static int
-__dht_rebalance_migrate_data (xlator_t *from, xlator_t *to, fd_t *src, fd_t *dst,
- uint64_t ia_size, int hole_exists)
+__dht_rebalance_migrate_data(xlator_t *this, gf_defrag_info_t *defrag,
+ xlator_t *from, xlator_t *to, fd_t *src, fd_t *dst,
+ uint64_t ia_size, int hole_exists, int *fop_errno)
{
- int ret = 0;
- int count = 0;
- off_t offset = 0;
- struct iovec *vector = NULL;
- struct iobref *iobref = NULL;
- uint64_t total = 0;
- size_t read_size = 0;
-
- /* if file size is '0', no need to enter this loop */
- while (total < ia_size) {
- read_size = (((ia_size - total) > DHT_REBALANCE_BLKSIZE) ?
- DHT_REBALANCE_BLKSIZE : (ia_size - total));
-
- ret = syncop_readv (from, src, read_size,
- offset, 0, &vector, &count, &iobref, NULL,
- NULL);
- if (!ret || (ret < 0)) {
- break;
- }
+ int ret = 0;
+ int count = 0;
+ off_t offset = 0;
+ off_t data_offset = 0;
+ off_t hole_offset = 0;
+ struct iovec *vector = NULL;
+ struct iobref *iobref = NULL;
+ uint64_t total = 0;
+ size_t read_size = 0;
+ size_t data_block_size = 0;
+ dict_t *xdata = NULL;
+ dht_conf_t *conf = NULL;
+
+ conf = this->private;
+
+ /* if file size is '0', no need to enter this loop */
+ while (total < ia_size) {
+ /* This is a regular file - read it sequentially */
+ if (!hole_exists) {
+ read_size = (((ia_size - total) > DHT_REBALANCE_BLKSIZE)
+ ? DHT_REBALANCE_BLKSIZE
+ : (ia_size - total));
+ } else {
+ /* This is a sparse file - read only the data segments in the file
+ */
+
+ /* If the previous data block is fully copied, find the next data
+ * segment
+ * starting at the offset of the last read and written byte, */
+ if (data_block_size <= 0) {
+ ret = syncop_seek(from, src, offset, GF_SEEK_DATA, NULL,
+ &data_offset);
+ if (ret) {
+ if (ret == -ENXIO)
+ ret = 0; /* No more data segments */
+ else
+ *fop_errno = -ret; /* Error occurred */
- if (hole_exists)
- ret = dht_write_with_holes (to, dst, vector, count,
- ret, offset, iobref);
- else
- ret = syncop_writev (to, dst, vector, count,
- offset, iobref, 0, NULL, NULL);
- if (ret < 0) {
- break;
+ break;
}
- offset += ret;
- total += ret;
-
- GF_FREE (vector);
- if (iobref)
- iobref_unref (iobref);
- iobref = NULL;
- vector = NULL;
- }
- if (iobref)
- iobref_unref (iobref);
- GF_FREE (vector);
-
- if (ret >= 0)
- ret = 0;
- else
- ret = -1;
- return ret;
-}
-
-static int
-__tier_migrate_data (gf_defrag_info_t *defrag, xlator_t *from, xlator_t *to, fd_t *src, fd_t *dst,
- uint64_t ia_size, int hole_exists)
-{
- int ret = 0;
- int count = 0;
- off_t offset = 0;
- struct iovec *vector = NULL;
- struct iobref *iobref = NULL;
- uint64_t total = 0;
- size_t read_size = 0;
-
- /* if file size is '0', no need to enter this loop */
- while (total < ia_size) {
-
- read_size = (((ia_size - total) > DHT_REBALANCE_BLKSIZE) ?
- DHT_REBALANCE_BLKSIZE : (ia_size - total));
-
- ret = syncop_readv (from, src, read_size,
- offset, 0, &vector, &count, &iobref, NULL,
- NULL);
- if (!ret || (ret < 0)) {
+ /* If the position of the current data segment is greater than
+ * the position of the next hole, find the next hole in order to
+ * calculate the length of the new data segment */
+ if (data_offset > hole_offset) {
+ /* Starting at the offset of the last data segment, find the
+ * next hole */
+ ret = syncop_seek(from, src, data_offset, GF_SEEK_HOLE,
+ NULL, &hole_offset);
+ if (ret) {
+ /* If an error occurred here it's a real error because
+ * if the seek for a data segment was successful then
+ * necessarily another hole must exist (EOF is a hole)
+ */
+ *fop_errno = -ret;
break;
- }
+ }
- if (hole_exists)
- ret = dht_write_with_holes (to, dst, vector, count,
- ret, offset, iobref);
- else
- ret = syncop_writev (to, dst, vector, count,
- offset, iobref, 0, NULL, NULL);
- if (gf_defrag_get_pause_state (&defrag->tier_conf) != TIER_RUNNING) {
- gf_msg ("tier", GF_LOG_INFO, 0,
- DHT_MSG_TIER_PAUSED,
- "Migrate file paused");
- ret = -1;
+ /* Calculate the total size of the current data block */
+ data_block_size = hole_offset - data_offset;
+ }
+ } else {
+ /* There is still data in the current segment, move the
+ * data_offset to the position of the last written byte */
+ data_offset = offset;
+ }
+
+ /* Calculate how much data needs to be read and written. If the data
+ * segment's length is bigger than DHT_REBALANCE_BLKSIZE, read and
+ * write DHT_REBALANCE_BLKSIZE data length and the rest in the
+ * next iteration(s) */
+ read_size = ((data_block_size > DHT_REBALANCE_BLKSIZE)
+ ? DHT_REBALANCE_BLKSIZE
+ : data_block_size);
+
+ /* Calculate the remaining size of the data block - maybe there's no
+ * need to seek for data in the next iteration */
+ data_block_size -= read_size;
+
+ /* Set offset to the offset of the data segment so read and write
+ * will have the correct position */
+ offset = data_offset;
+ }
+
+ ret = syncop_readv(from, src, read_size, offset, 0, &vector, &count,
+ &iobref, NULL, NULL, NULL);
+
+ if (!ret || (ret < 0)) {
+ if (!ret) {
+ /* File was probably truncated*/
+ ret = -1;
+ *fop_errno = ENOSPC;
+ } else {
+ *fop_errno = -ret;
+ }
+ break;
+ }
+
+ if (!conf->force_migration) {
+ if (!xdata) {
+ xdata = dict_new();
+ if (!xdata) {
+ gf_msg("dht", GF_LOG_ERROR, 0, DHT_MSG_MIGRATE_FILE_FAILED,
+ "insufficient memory");
+ ret = -1;
+ *fop_errno = ENOMEM;
+ break;
}
- if (ret < 0) {
- break;
+ /* Fail this write and abort rebalance if we
+ * detect a write from client since migration of
+ * this file started. This is done to avoid
+ * potential data corruption due to out of order
+ * writes from rebalance and client to the same
+ * region (as compared between src and dst
+ * files). See
+ * https://github.com/gluster/glusterfs/issues/308
+ * for more details.
+ */
+ ret = dict_set_int32_sizen(xdata, GF_AVOID_OVERWRITE, 1);
+ if (ret) {
+ gf_msg("dht", GF_LOG_ERROR, 0, ENOMEM,
+ "failed to set dict");
+ ret = -1;
+ *fop_errno = ENOMEM;
+ break;
}
- offset += ret;
- total += ret;
+ }
+ }
- GF_FREE (vector);
- if (iobref)
- iobref_unref (iobref);
- iobref = NULL;
- vector = NULL;
+ ret = syncop_writev(to, dst, vector, count, offset, iobref, 0, NULL,
+ NULL, xdata, NULL);
+ if (ret < 0) {
+ *fop_errno = -ret;
+ break;
}
+
+ offset += ret;
+ total += ret;
+
+ GF_FREE(vector);
if (iobref)
- iobref_unref (iobref);
- GF_FREE (vector);
+ iobref_unref(iobref);
+ iobref = NULL;
+ vector = NULL;
+ }
+ if (iobref)
+ iobref_unref(iobref);
+ GF_FREE(vector);
+
+ if (ret >= 0)
+ ret = 0;
+ else
+ ret = -1;
- if (ret >= 0)
- ret = 0;
- else
- ret = -1;
+ if (xdata) {
+ dict_unref(xdata);
+ }
- return ret;
+ return ret;
}
-
static int
-__dht_rebalance_open_src_file (xlator_t *from, xlator_t *to, loc_t *loc,
- struct iatt *stbuf, fd_t **src_fd,
- gf_boolean_t *clean_src)
+__dht_rebalance_open_src_file(xlator_t *this, xlator_t *from, xlator_t *to,
+ loc_t *loc, struct iatt *stbuf, fd_t **src_fd,
+ gf_boolean_t *clean_src, int *fop_errno)
{
- int ret = 0;
- fd_t *fd = NULL;
- dict_t *dict = NULL;
- xlator_t *this = NULL;
- struct iatt iatt = {0,};
- dht_conf_t *conf = NULL;
-
- this = THIS;
- conf = this->private;
-
- *clean_src = _gf_false;
-
- fd = fd_create (loc->inode, DHT_REBALANCE_PID);
- if (!fd) {
- gf_msg (this->name, GF_LOG_ERROR, 0,
- DHT_MSG_MIGRATE_FILE_FAILED,
- "%s: fd create failed (source)", loc->path);
- ret = -1;
- goto out;
- }
-
- ret = syncop_open (from, loc, O_RDWR, fd, NULL, NULL);
- if (ret < 0) {
- gf_msg (this->name, GF_LOG_ERROR, 0,
- DHT_MSG_MIGRATE_FILE_FAILED,
- "failed to open file %s on %s (%s)",
- loc->path, from->name, strerror (-ret));
- ret = -1;
- goto out;
- }
-
- fd_bind (fd);
-
- if (src_fd)
- *src_fd = fd;
-
+ int ret = 0;
+ fd_t *fd = NULL;
+ dict_t *dict = NULL;
+ struct iatt iatt = {
+ 0,
+ };
+ dht_conf_t *conf = NULL;
+
+ conf = this->private;
+
+ *clean_src = _gf_false;
+
+ fd = fd_create(loc->inode, DHT_REBALANCE_PID);
+ if (!fd) {
+ gf_msg(this->name, GF_LOG_ERROR, 0, DHT_MSG_MIGRATE_FILE_FAILED,
+ "%s: fd create failed (source)", loc->path);
+ *fop_errno = ENOMEM;
ret = -1;
- dict = dict_new ();
- if (!dict)
- goto out;
-
- ret = dict_set_str (dict, conf->link_xattr_name, to->name);
- if (ret) {
- gf_log (this->name, GF_LOG_ERROR,
- "failed to set xattr in dict for %s (linkto:%s)",
- loc->path, to->name);
- goto out;
- }
-
- /* Once the migration starts, the source should have 'linkto' key set
- to show which is the target, so other clients can work around it */
- ret = syncop_setxattr (from, loc, dict, 0, NULL, NULL);
- if (ret) {
- gf_msg (this->name, GF_LOG_ERROR, 0,
- DHT_MSG_MIGRATE_FILE_FAILED,
- "failed to set xattr on %s in %s (%s)",
- loc->path, from->name, strerror (-ret));
- ret = -1;
- goto out;
- }
+ goto out;
+ }
+
+ ret = syncop_open(from, loc, O_RDWR, fd, NULL, NULL);
+ if (ret < 0) {
+ gf_msg(this->name, GF_LOG_ERROR, -ret, DHT_MSG_MIGRATE_FILE_FAILED,
+ "failed to open file %s on %s", loc->path, from->name);
+ *fop_errno = -ret;
+ ret = -1;
+ goto out;
+ }
- /* Reset source mode/xattr if migration fails*/
- *clean_src = _gf_true;
+ fd_bind(fd);
- /* mode should be (+S+T) to indicate migration is in progress */
- iatt.ia_prot = stbuf->ia_prot;
- iatt.ia_type = stbuf->ia_type;
- iatt.ia_prot.sticky = 1;
- iatt.ia_prot.sgid = 1;
+ if (src_fd)
+ *src_fd = fd;
- ret = syncop_setattr (from, loc, &iatt, GF_SET_ATTR_MODE, NULL, NULL,
- NULL, NULL);
- if (ret) {
- gf_msg (this->name, GF_LOG_ERROR, 0,
- DHT_MSG_MIGRATE_FILE_FAILED,
- "failed to set mode on %s in %s (%s)",
- loc->path, from->name, strerror (-ret));
- ret = -1;
- goto out;
- }
+ ret = -1;
+ dict = dict_new();
+ if (!dict) {
+ gf_msg(this->name, GF_LOG_ERROR, 0, DHT_MSG_MIGRATE_FILE_FAILED,
+ "%s: Could not allocate memory for dict", loc->path);
+ *fop_errno = ENOMEM;
+ ret = -1;
+ goto out;
+ }
+
+ ret = dict_set_str(dict, conf->link_xattr_name, to->name);
+ if (ret) {
+ gf_log(this->name, GF_LOG_ERROR,
+ "failed to set xattr in dict for %s (linkto:%s)", loc->path,
+ to->name);
+ *fop_errno = ENOMEM;
+ ret = -1;
+ goto out;
+ }
+
+ /* Once the migration starts, the source should have 'linkto' key set
+ to show which is the target, so other clients can work around it */
+ ret = syncop_setxattr(from, loc, dict, 0, NULL, NULL);
+ if (ret) {
+ gf_msg(this->name, GF_LOG_ERROR, -ret, DHT_MSG_MIGRATE_FILE_FAILED,
+ "failed to set xattr on %s in %s", loc->path, from->name);
+ *fop_errno = -ret;
+ ret = -1;
+ goto out;
+ }
+
+ /* Reset source mode/xattr if migration fails*/
+ *clean_src = _gf_true;
+
+ /* mode should be (+S+T) to indicate migration is in progress */
+ iatt.ia_prot = stbuf->ia_prot;
+ iatt.ia_type = stbuf->ia_type;
+ iatt.ia_prot.sticky = 1;
+ iatt.ia_prot.sgid = 1;
+
+ ret = syncop_setattr(from, loc, &iatt, GF_SET_ATTR_MODE, NULL, NULL, NULL,
+ NULL);
+ if (ret) {
+ gf_msg(this->name, GF_LOG_ERROR, -ret, DHT_MSG_MIGRATE_FILE_FAILED,
+ "failed to set mode on %s in %s", loc->path, from->name);
+ *fop_errno = -ret;
+ ret = -1;
+ goto out;
+ }
- /* success */
- ret = 0;
+ /* success */
+ ret = 0;
out:
- if (dict)
- dict_unref (dict);
+ if (dict)
+ dict_unref(dict);
- return ret;
+ return ret;
}
int
-migrate_special_files (xlator_t *this, xlator_t *from, xlator_t *to, loc_t *loc,
- struct iatt *buf)
+migrate_special_files(xlator_t *this, xlator_t *from, xlator_t *to, loc_t *loc,
+ struct iatt *buf, int *fop_errno)
{
- int ret = -1;
- dict_t *rsp_dict = NULL;
- dict_t *dict = NULL;
- char *link = NULL;
- struct iatt stbuf = {0,};
- dht_conf_t *conf = this->private;
-
- dict = dict_new ();
- if (!dict)
- goto out;
-
- ret = dict_set_int32 (dict, conf->link_xattr_name, 256);
- if (ret) {
- gf_log (this->name, GF_LOG_ERROR,
- "%s: failed to set 'linkto' key in dict", loc->path);
- goto out;
- }
-
- /* check in the destination if the file is link file */
- ret = syncop_lookup (to, loc, &stbuf, NULL, dict, &rsp_dict);
- if ((ret < 0) && (-ret != ENOENT)) {
- gf_msg (this->name, GF_LOG_WARNING, 0,
- DHT_MSG_MIGRATE_FILE_FAILED,
- "%s: lookup failed (%s)",
- loc->path, strerror (-ret));
- ret = -1;
- goto out;
- }
-
- /* we no more require this key */
- dict_del (dict, conf->link_xattr_name);
-
- /* file exists in target node, only if it is 'linkfile' its valid,
- otherwise, error out */
- if (!ret) {
- if (!check_is_linkfile (loc->inode, &stbuf, rsp_dict,
- conf->link_xattr_name)) {
- gf_msg (this->name, GF_LOG_WARNING, 0,
- DHT_MSG_MIGRATE_FILE_FAILED,
- "%s: file exists in destination", loc->path);
- ret = -1;
- goto out;
- }
-
- /* as file is linkfile, delete it */
- ret = syncop_unlink (to, loc, NULL, NULL);
- if (ret) {
- gf_msg (this->name, GF_LOG_WARNING, 0,
- DHT_MSG_MIGRATE_FILE_FAILED,
- "%s: failed to delete the linkfile (%s)",
- loc->path, strerror (-ret));
- ret = -1;
- goto out;
- }
- }
-
- /* Set the gfid of the source file in dict */
- ret = dict_set_static_bin (dict, "gfid-req", buf->ia_gfid, 16);
+ int ret = -1;
+ dict_t *rsp_dict = NULL;
+ dict_t *dict = NULL;
+ char *link = NULL;
+ struct iatt stbuf = {
+ 0,
+ };
+ dht_conf_t *conf = this->private;
+
+ dict = dict_new();
+ if (!dict) {
+ *fop_errno = ENOMEM;
+ ret = -1;
+ goto out;
+ }
+ ret = dict_set_int32(dict, conf->link_xattr_name, 256);
+ if (ret) {
+ *fop_errno = ENOMEM;
+ ret = -1;
+ gf_log(this->name, GF_LOG_ERROR,
+ "%s: failed to set 'linkto' key in dict", loc->path);
+ goto out;
+ }
+
+ /* check in the destination if the file is link file */
+ ret = syncop_lookup(to, loc, &stbuf, NULL, dict, &rsp_dict);
+ if ((ret < 0) && (-ret != ENOENT)) {
+ gf_msg(this->name, GF_LOG_WARNING, -ret, DHT_MSG_MIGRATE_FILE_FAILED,
+ "%s: lookup failed", loc->path);
+ *fop_errno = -ret;
+ ret = -1;
+ goto out;
+ }
+
+ /* we no more require this key */
+ dict_del(dict, conf->link_xattr_name);
+
+ /* file exists in target node, only if it is 'linkfile' its valid,
+ otherwise, error out */
+ if (!ret) {
+ if (!check_is_linkfile(loc->inode, &stbuf, rsp_dict,
+ conf->link_xattr_name)) {
+ gf_msg(this->name, GF_LOG_WARNING, 0, DHT_MSG_MIGRATE_FILE_FAILED,
+ "%s: file exists in destination", loc->path);
+ *fop_errno = EINVAL;
+ ret = -1;
+ goto out;
+ }
+
+ /* as file is linkfile, delete it */
+ ret = syncop_unlink(to, loc, NULL, NULL);
if (ret) {
- gf_log (this->name, GF_LOG_ERROR,
- "%s: failed to set gfid in dict for create", loc->path);
- goto out;
- }
-
- /* Create the file in target */
- if (IA_ISLNK (buf->ia_type)) {
- /* Handle symlinks separately */
- ret = syncop_readlink (from, loc, &link, buf->ia_size, NULL,
- NULL);
- if (ret < 0) {
- gf_msg (this->name, GF_LOG_WARNING, 0,
- DHT_MSG_MIGRATE_FILE_FAILED,
- "%s: readlink on symlink failed (%s)",
- loc->path, strerror (-ret));
- ret = -1;
- goto out;
- }
-
- ret = syncop_symlink (to, loc, link, 0, dict, NULL);
- if (ret) {
- gf_msg (this->name, GF_LOG_WARNING, 0,
- DHT_MSG_MIGRATE_FILE_FAILED,
- "%s: creating symlink failed (%s)",
- loc->path, strerror (-ret));
- ret = -1;
- goto out;
- }
-
- goto done;
+ gf_msg(this->name, GF_LOG_WARNING, -ret,
+ DHT_MSG_MIGRATE_FILE_FAILED,
+ "%s: failed to delete the linkfile", loc->path);
+ *fop_errno = -ret;
+ ret = -1;
+ goto out;
+ }
+ }
+
+ /* Set the gfid of the source file in dict */
+ ret = dict_set_gfuuid(dict, "gfid-req", buf->ia_gfid, true);
+ if (ret) {
+ *fop_errno = ENOMEM;
+ ret = -1;
+ gf_log(this->name, GF_LOG_ERROR,
+ "%s: failed to set gfid in dict for create", loc->path);
+ goto out;
+ }
+
+ /* Create the file in target */
+ if (IA_ISLNK(buf->ia_type)) {
+ /* Handle symlinks separately */
+ ret = syncop_readlink(from, loc, &link, buf->ia_size, NULL, NULL);
+ if (ret < 0) {
+ gf_msg(this->name, GF_LOG_WARNING, -ret,
+ DHT_MSG_MIGRATE_FILE_FAILED,
+ "%s: readlink on symlink failed", loc->path);
+ *fop_errno = -ret;
+ ret = -1;
+ goto out;
}
- ret = syncop_mknod (to, loc, st_mode_from_ia (buf->ia_prot,
- buf->ia_type),
- makedev (ia_major (buf->ia_rdev),
- ia_minor (buf->ia_rdev)), 0, dict, NULL);
+ ret = syncop_symlink(to, loc, link, 0, dict, NULL);
if (ret) {
- gf_msg (this->name, GF_LOG_WARNING, 0,
- DHT_MSG_MIGRATE_FILE_FAILED,
- "%s: mknod failed (%s)",
- loc->path, strerror (-ret));
- ret = -1;
- goto out;
- }
+ gf_msg(this->name, GF_LOG_WARNING, -ret,
+ DHT_MSG_MIGRATE_FILE_FAILED, "%s: creating symlink failed",
+ loc->path);
+ *fop_errno = -ret;
+ ret = -1;
+ goto out;
+ }
+
+ goto done;
+ }
+
+ ret = syncop_mknod(to, loc, st_mode_from_ia(buf->ia_prot, buf->ia_type),
+ makedev(ia_major(buf->ia_rdev), ia_minor(buf->ia_rdev)),
+ 0, dict, NULL);
+ if (ret) {
+ gf_msg(this->name, GF_LOG_WARNING, -ret, DHT_MSG_MIGRATE_FILE_FAILED,
+ "%s: mknod failed", loc->path);
+ *fop_errno = -ret;
+ ret = -1;
+ goto out;
+ }
done:
- ret = syncop_setattr (to, loc, buf,
- (GF_SET_ATTR_MTIME |
- GF_SET_ATTR_UID | GF_SET_ATTR_GID |
- GF_SET_ATTR_MODE), NULL, NULL, NULL, NULL);
- if (ret) {
- gf_msg (this->name, GF_LOG_WARNING, 0,
- DHT_MSG_MIGRATE_FILE_FAILED,
- "%s: failed to perform setattr on %s (%s)",
- loc->path, to->name, strerror (-ret));
- ret = -1;
- }
-
- ret = syncop_unlink (from, loc, NULL, NULL);
- if (ret) {
- gf_msg (this->name, GF_LOG_WARNING, 0,
- DHT_MSG_MIGRATE_FILE_FAILED,
- "%s: unlink failed (%s)",
- loc->path, strerror (-ret));
- ret = -1;
- }
+ ret = syncop_setattr(to, loc, buf,
+ (GF_SET_ATTR_MTIME | GF_SET_ATTR_UID |
+ GF_SET_ATTR_GID | GF_SET_ATTR_MODE),
+ NULL, NULL, NULL, NULL);
+ if (ret) {
+ gf_msg(this->name, GF_LOG_WARNING, -ret, DHT_MSG_MIGRATE_FILE_FAILED,
+ "%s: failed to perform setattr on %s", loc->path, to->name);
+ *fop_errno = -ret;
+ }
+
+ ret = syncop_unlink(from, loc, NULL, NULL);
+ if (ret) {
+ gf_msg(this->name, GF_LOG_WARNING, -ret, DHT_MSG_MIGRATE_FILE_FAILED,
+ "%s: unlink failed", loc->path);
+ *fop_errno = -ret;
+ ret = -1;
+ }
out:
- GF_FREE (link);
- if (dict)
- dict_unref (dict);
+ GF_FREE(link);
+ if (dict)
+ dict_unref(dict);
- if (rsp_dict)
- dict_unref (rsp_dict);
+ if (rsp_dict)
+ dict_unref(rsp_dict);
- return ret;
+ return ret;
}
static int
-__dht_migration_cleanup_src_file (xlator_t *this, loc_t *loc, fd_t *fd,
- xlator_t *from, ia_prot_t *src_ia_prot)
+__dht_migration_cleanup_src_file(xlator_t *this, loc_t *loc, fd_t *fd,
+ xlator_t *from, ia_prot_t *src_ia_prot)
{
- int ret = -1;
- dht_conf_t *conf = NULL;
- struct iatt new_stbuf = {0,};
-
- if (!this || !fd || !from || !src_ia_prot) {
- goto out;
- }
-
- conf = this->private;
-
- /*Revert source mode and xattr changes*/
- ret = syncop_fstat (from, fd, &new_stbuf, NULL, NULL);
- if (ret < 0) {
- /* Failed to get the stat info */
- gf_msg (this->name, GF_LOG_ERROR, -ret,
- DHT_MSG_MIGRATE_FILE_FAILED,
- "Migrate file cleanup failed: failed to fstat "
- "file %s on %s ", loc->path, from->name);
- ret = -1;
- goto out;
- }
-
-
- /* Remove the sticky bit and sgid bit set, reset it to 0*/
- if (!src_ia_prot->sticky)
- new_stbuf.ia_prot.sticky = 0;
+ int ret = -1;
+ dht_conf_t *conf = NULL;
+ struct iatt new_stbuf = {
+ 0,
+ };
+
+ if (!this || !fd || !from || !src_ia_prot) {
+ goto out;
+ }
+
+ conf = this->private;
+
+ /*Revert source mode and xattr changes*/
+ ret = syncop_fstat(from, fd, &new_stbuf, NULL, NULL);
+ if (ret < 0) {
+ /* Failed to get the stat info */
+ gf_msg(this->name, GF_LOG_ERROR, -ret, DHT_MSG_MIGRATE_FILE_FAILED,
+ "Migrate file cleanup failed: failed to fstat "
+ "file %s on %s ",
+ loc->path, from->name);
+ ret = -1;
+ goto out;
+ }
- if (!src_ia_prot->sgid)
- new_stbuf.ia_prot.sgid = 0;
+ /* Remove the sticky bit and sgid bit set, reset it to 0*/
+ if (!src_ia_prot->sticky)
+ new_stbuf.ia_prot.sticky = 0;
- ret = syncop_fsetattr (from, fd, &new_stbuf,
- (GF_SET_ATTR_GID | GF_SET_ATTR_MODE),
- NULL, NULL, NULL, NULL);
+ if (!src_ia_prot->sgid)
+ new_stbuf.ia_prot.sgid = 0;
- if (ret) {
- gf_msg (this->name, GF_LOG_WARNING, -ret,
- DHT_MSG_MIGRATE_FILE_FAILED,
- "Migrate file cleanup failed:"
- "%s: failed to perform fsetattr on %s ",
- loc->path, from->name);
- ret = -1;
- goto out;
- }
+ ret = syncop_fsetattr(from, fd, &new_stbuf,
+ (GF_SET_ATTR_GID | GF_SET_ATTR_MODE), NULL, NULL,
+ NULL, NULL);
- ret = syncop_fremovexattr (from, fd, conf->link_xattr_name, 0, NULL);
- if (ret) {
- gf_log (this->name, GF_LOG_WARNING,
- "%s: failed to remove linkto xattr on %s (%s)",
- loc->path, from->name, strerror (-ret));
- ret = -1;
- goto out;
- }
+ if (ret) {
+ gf_msg(this->name, GF_LOG_WARNING, -ret, DHT_MSG_MIGRATE_FILE_FAILED,
+ "Migrate file cleanup failed:"
+ "%s: failed to perform fsetattr on %s ",
+ loc->path, from->name);
+ ret = -1;
+ goto out;
+ }
+
+ ret = syncop_fremovexattr(from, fd, conf->link_xattr_name, 0, NULL);
+ if (ret) {
+ gf_log(this->name, GF_LOG_WARNING,
+ "%s: failed to remove linkto xattr on %s (%s)", loc->path,
+ from->name, strerror(-ret));
+ ret = -1;
+ goto out;
+ }
- ret = 0;
+ ret = 0;
out:
- return ret;
+ return ret;
}
-
-
/*
return values:
@@ -1171,2635 +1469,3234 @@ out:
1 : not a failure, but we can't migrate data as of now
*/
int
-dht_migrate_file (xlator_t *this, loc_t *loc, xlator_t *from, xlator_t *to,
- int flag)
+dht_migrate_file(xlator_t *this, loc_t *loc, xlator_t *from, xlator_t *to,
+ int flag, int *fop_errno)
{
- int ret = -1;
- struct iatt new_stbuf = {0,};
- struct iatt stbuf = {0,};
- struct iatt empty_iatt = {0,};
- ia_prot_t src_ia_prot = {0,};
- fd_t *src_fd = NULL;
- fd_t *dst_fd = NULL;
- dict_t *dict = NULL;
- dict_t *xattr = NULL;
- dict_t *xattr_rsp = NULL;
- int file_has_holes = 0;
- dht_conf_t *conf = this->private;
- int rcvd_enoent_from_src = 0;
- struct gf_flock flock = {0, };
- struct gf_flock plock = {0, };
- loc_t tmp_loc = {0, };
- gf_boolean_t locked = _gf_false;
- gf_boolean_t p_locked = _gf_false;
- int lk_ret = -1;
- gf_defrag_info_t *defrag = NULL;
- gf_boolean_t clean_src = _gf_false;
- gf_boolean_t clean_dst = _gf_false;
- int log_level = GF_LOG_INFO;
- gf_boolean_t delete_src_linkto = _gf_true;
-
- defrag = conf->defrag;
- if (!defrag)
- goto out;
-
- if (defrag->tier_conf.is_tier)
- log_level = GF_LOG_TRACE;
-
- gf_log (this->name,
- log_level, "%s: attempting to move from %s to %s",
- loc->path, from->name, to->name);
+ int ret = -1;
+ struct iatt new_stbuf = {
+ 0,
+ };
+ struct iatt stbuf = {
+ 0,
+ };
+ struct iatt empty_iatt = {
+ 0,
+ };
+ ia_prot_t src_ia_prot = {
+ 0,
+ };
+ fd_t *src_fd = NULL;
+ fd_t *dst_fd = NULL;
+ dict_t *dict = NULL;
+ dict_t *xattr = NULL;
+ dict_t *xattr_rsp = NULL;
+ int file_has_holes = 0;
+ dht_conf_t *conf = this->private;
+ int rcvd_enoent_from_src = 0;
+ struct gf_flock flock = {
+ 0,
+ };
+ struct gf_flock plock = {
+ 0,
+ };
+ loc_t tmp_loc = {
+ 0,
+ };
+ loc_t parent_loc = {
+ 0,
+ };
+ gf_boolean_t inodelk_locked = _gf_false;
+ gf_boolean_t entrylk_locked = _gf_false;
+ gf_boolean_t p_locked = _gf_false;
+ int lk_ret = -1;
+ gf_defrag_info_t *defrag = NULL;
+ gf_boolean_t clean_src = _gf_false;
+ gf_boolean_t clean_dst = _gf_false;
+ int log_level = GF_LOG_INFO;
+ gf_boolean_t delete_src_linkto = _gf_true;
+ lock_migration_info_t locklist;
+ dict_t *meta_dict = NULL;
+ gf_boolean_t meta_locked = _gf_false;
+ gf_boolean_t target_changed = _gf_false;
+ xlator_t *new_target = NULL;
+ xlator_t *old_target = NULL;
+ xlator_t *hashed_subvol = NULL;
+ fd_t *linkto_fd = NULL;
+ dict_t *xdata = NULL;
+
+ if (from == to) {
+ gf_msg_debug(this->name, 0,
+ "destination and source are same. file %s"
+ " might have migrated already",
+ loc->path);
+ ret = 0;
+ goto out;
+ }
- dict = dict_new ();
- if (!dict)
- goto out;
+ gf_log(this->name, log_level, "%s: attempting to move from %s to %s",
+ loc->path, from->name, to->name);
- ret = dict_set_int32 (dict, conf->link_xattr_name, 256);
+ dict = dict_new();
+ if (!dict) {
+ ret = -1;
+ *fop_errno = ENOMEM;
+ gf_msg(this->name, GF_LOG_ERROR, ENOMEM, DHT_MSG_NO_MEMORY,
+ "Could not allocate memory for dict");
+ goto out;
+ }
+ ret = dict_set_int32(dict, conf->link_xattr_name, 256);
+ if (ret) {
+ *fop_errno = ENOMEM;
+ ret = -1;
+ gf_msg(this->name, GF_LOG_ERROR, 0, DHT_MSG_MIGRATE_FILE_FAILED,
+ "Migrate file failed:"
+ "%s: failed to set 'linkto' key in dict",
+ loc->path);
+ goto out;
+ }
+
+ /* Do not migrate file in case lock migration is not enabled on the
+ * volume*/
+ if (!conf->lock_migration_enabled) {
+ ret = dict_set_int32(dict, GLUSTERFS_POSIXLK_COUNT, sizeof(int32_t));
if (ret) {
- gf_msg (this->name, GF_LOG_ERROR, 0,
- DHT_MSG_MIGRATE_FILE_FAILED,
- "Migrate file failed:"
- "%s: failed to set 'linkto' key in dict", loc->path);
- goto out;
- }
-
-
- /* Don't migrate files with POSIX locks */
- ret = dict_set_int32 (dict, GLUSTERFS_POSIXLK_COUNT, sizeof(int32_t));
+ *fop_errno = ENOMEM;
+ ret = -1;
+ gf_msg(this->name, GF_LOG_ERROR, 0, DHT_MSG_MIGRATE_FILE_FAILED,
+ "Migrate file failed: %s: failed to "
+ "set " GLUSTERFS_POSIXLK_COUNT " key in dict",
+ loc->path);
+ goto out;
+ }
+ } else {
+ gf_msg(this->name, GF_LOG_INFO, 0, 0,
+ "locks will be migrated"
+ " for file: %s",
+ loc->path);
+ }
+
+ /* The file is locked to prevent a rename during a migration. Renames
+ * and migrations on the file at the same time can lead to data loss.
+ */
+
+ ret = dht_build_parent_loc(this, &parent_loc, loc, fop_errno);
+ if (ret < 0) {
+ ret = -1;
+ gf_msg(this->name, GF_LOG_WARNING, *fop_errno,
+ DHT_MSG_MIGRATE_FILE_FAILED,
+ "%s: failed to build parent loc, which is needed to "
+ "acquire entrylk to synchronize with renames on this "
+ "path. Skipping migration",
+ loc->path);
+ goto out;
+ }
+
+ hashed_subvol = dht_subvol_get_hashed(this, loc);
+ if (hashed_subvol == NULL) {
+ ret = -1;
+ gf_msg(this->name, GF_LOG_WARNING, EINVAL, DHT_MSG_MIGRATE_FILE_FAILED,
+ "%s: cannot find hashed subvol which is needed to "
+ "synchronize with renames on this path. "
+ "Skipping migration",
+ loc->path);
+ goto out;
+ }
+
+ flock.l_type = F_WRLCK;
+
+ tmp_loc.inode = inode_ref(loc->inode);
+ gf_uuid_copy(tmp_loc.gfid, loc->gfid);
+ tmp_loc.path = gf_strdup(loc->path);
+
+ /* this inodelk happens with flock.owner being zero. But to synchronize
+ * hardlink migration we need to have different lkowner for each migration
+ * Filed a bug here: https://bugzilla.redhat.com/show_bug.cgi?id=1468202 to
+ * track the fix for this. Currently synclock takes care of synchronizing
+ * hardlink migration. Once this bug is fixed we can avoid taking synclock
+ */
+ ret = syncop_inodelk(from, DHT_FILE_MIGRATE_DOMAIN, &tmp_loc, F_SETLKW,
+ &flock, NULL, NULL);
+ if (ret < 0) {
+ *fop_errno = -ret;
+ ret = -1;
+ gf_msg(this->name, GF_LOG_WARNING, *fop_errno,
+ DHT_MSG_MIGRATE_FILE_FAILED,
+ "migrate file failed: "
+ "%s: failed to lock file on %s",
+ loc->path, from->name);
+ goto out;
+ }
+
+ inodelk_locked = _gf_true;
+
+ /* dht_rename has changed to use entrylk on hashed subvol for
+ * synchronization. So, rebalance too has to acquire an entrylk on
+ * hashed subvol.
+ */
+ ret = syncop_entrylk(hashed_subvol, DHT_ENTRY_SYNC_DOMAIN, &parent_loc,
+ loc->name, ENTRYLK_LOCK, ENTRYLK_WRLCK, NULL, NULL);
+ if (ret < 0) {
+ *fop_errno = -ret;
+ ret = -1;
+ gf_msg(this->name, GF_LOG_WARNING, *fop_errno,
+ DHT_MSG_MIGRATE_FILE_FAILED,
+ "%s: failed to acquire entrylk on subvol %s", loc->path,
+ hashed_subvol->name);
+ goto out;
+ }
+
+ entrylk_locked = _gf_true;
+
+ /* Phase 1 - Data migration is in progress from now on */
+ ret = syncop_lookup(from, loc, &stbuf, NULL, dict, &xattr_rsp);
+ if (ret) {
+ *fop_errno = -ret;
+ ret = -1;
+ gf_msg(this->name, GF_LOG_ERROR, *fop_errno,
+ DHT_MSG_MIGRATE_FILE_FAILED,
+ "Migrate file failed:"
+ "%s: lookup failed on %s",
+ loc->path, from->name);
+ goto out;
+ }
+
+ /* preserve source mode, so set the same to the destination */
+ src_ia_prot = stbuf.ia_prot;
+
+ /* Check if file can be migrated */
+ ret = __is_file_migratable(this, loc, &stbuf, xattr_rsp, flag, defrag, conf,
+ fop_errno);
+ if (ret) {
+ if (ret == HARDLINK_MIG_INPROGRESS)
+ ret = 0;
+ goto out;
+ }
+
+ /* Take care of the special files */
+ if (!IA_ISREG(stbuf.ia_type)) {
+ /* Special files */
+ ret = migrate_special_files(this, from, to, loc, &stbuf, fop_errno);
+ goto out;
+ }
+
+ /* Try to preserve 'holes' while migrating data */
+ if (stbuf.ia_size > (stbuf.ia_blocks * GF_DISK_SECTOR_SIZE))
+ file_has_holes = 1;
+
+ /* create the destination, with required modes/xattr */
+ ret = __dht_rebalance_create_dst_file(this, to, from, loc, &stbuf, &dst_fd,
+ fop_errno, file_has_holes);
+ if (ret) {
+ gf_msg(this->name, GF_LOG_ERROR, 0, 0,
+ "Create dst failed"
+ " on - %s for file - %s",
+ to->name, loc->path);
+ goto out;
+ }
+
+ clean_dst = _gf_true;
+
+ ret = __dht_check_free_space(this, to, from, loc, &stbuf, flag, conf,
+ &target_changed, &new_target, fop_errno);
+ if (target_changed) {
+ /* Can't handle for hardlinks. Marking this as failure */
+ if (flag == GF_DHT_MIGRATE_HARDLINK_IN_PROGRESS || stbuf.ia_nlink > 1) {
+ gf_msg(this->name, GF_LOG_ERROR, 0, DHT_MSG_SUBVOL_INSUFF_SPACE,
+ "Exiting migration for"
+ " file - %s. flag - %d, stbuf.ia_nlink - %d",
+ loc->path, flag, stbuf.ia_nlink);
+ ret = -1;
+ goto out;
+ }
+
+ ret = syncop_ftruncate(to, dst_fd, 0, NULL, NULL, NULL, NULL);
if (ret) {
- gf_msg (this->name, GF_LOG_ERROR, 0,
- DHT_MSG_MIGRATE_FILE_FAILED,
- "Migrate file failed: %s: failed to "
- "set "GLUSTERFS_POSIXLK_COUNT" key in dict", loc->path);
- goto out;
+ gf_log(this->name, GF_LOG_WARNING,
+ "%s: failed to perform truncate on %s (%s)", loc->path,
+ to->name, strerror(-ret));
}
- flock.l_type = F_WRLCK;
+ syncop_close(dst_fd);
+ dst_fd = NULL;
- tmp_loc.inode = inode_ref (loc->inode);
- gf_uuid_copy (tmp_loc.gfid, loc->gfid);
- tmp_loc.path = gf_strdup(loc->path);
+ old_target = to;
+ to = new_target;
- ret = syncop_inodelk (from, DHT_FILE_MIGRATE_DOMAIN, &tmp_loc, F_SETLKW,
- &flock, NULL, NULL);
- if (ret < 0) {
- gf_msg (this->name, GF_LOG_WARNING, 0,
- DHT_MSG_MIGRATE_FILE_FAILED,
- "migrate file failed: "
- "%s: failed to lock file on %s (%s)",
- loc->path, from->name, strerror (-ret));
- ret = -1;
- goto out;
- }
-
- locked = _gf_true;
+ clean_dst = _gf_false;
- /* Phase 1 - Data migration is in progress from now on */
- ret = syncop_lookup (from, loc, &stbuf, NULL, dict, &xattr_rsp);
+ /* if the file migration is successful to this new target, then
+ * update the xattr on the old destination to point the new
+ * destination. We need to do update this only post migration
+ * as in case of failure the linkto needs to point to the source
+ * subvol */
+ ret = __dht_rebalance_create_dst_file(
+ this, to, from, loc, &stbuf, &dst_fd, fop_errno, file_has_holes);
if (ret) {
- gf_msg (this->name, GF_LOG_ERROR, 0,
- DHT_MSG_MIGRATE_FILE_FAILED,
- "Migrate file failed:"
- "%s: lookup failed on %s (%s)",
- loc->path, from->name, strerror (-ret));
- ret = -1;
- goto out;
- }
+ gf_log(this->name, GF_LOG_ERROR,
+ "Create dst failed"
+ " on - %s for file - %s",
+ to->name, loc->path);
+ goto out;
+ } else {
+ gf_msg(this->name, GF_LOG_INFO, 0, 0,
+ "destination for file "
+ "- %s is changed to - %s",
+ loc->path, to->name);
+ clean_dst = _gf_true;
+ }
+ }
+
+ if (ret) {
+ goto out;
+ }
+
+ /* Open the source, and also update mode/xattr */
+ ret = __dht_rebalance_open_src_file(this, from, to, loc, &stbuf, &src_fd,
+ &clean_src, fop_errno);
+ if (ret) {
+ gf_msg(this->name, GF_LOG_ERROR, 0, DHT_MSG_MIGRATE_FILE_FAILED,
+ "Migrate file failed: failed to open %s on %s", loc->path,
+ from->name);
+ goto out;
+ }
+
+ /* TODO: move all xattr related operations to fd based operations */
+ ret = syncop_listxattr(from, loc, &xattr, NULL, NULL);
+ if (ret < 0) {
+ *fop_errno = -ret;
+ gf_msg(this->name, GF_LOG_WARNING, *fop_errno,
+ DHT_MSG_MIGRATE_FILE_FAILED,
+ "Migrate file failed:"
+ "%s: failed to get xattr from %s",
+ loc->path, from->name);
+ ret = -1;
+ goto out;
+ }
+
+ /* Copying posix acls to the linkto file messes up the permissions*/
+ dht_strip_out_acls(xattr);
+
+ /* Remove the linkto xattr as we don't want to overwrite the value
+ * set on the dst.
+ */
+ dict_del(xattr, conf->link_xattr_name);
+
+ /* We need to error out if this fails as having the wrong shard xattrs
+ * set on the dst could cause data corruption
+ */
+ ret = syncop_fsetxattr(to, dst_fd, xattr, 0, NULL, NULL);
+ if (ret < 0) {
+ *fop_errno = -ret;
+ gf_msg(this->name, GF_LOG_WARNING, -ret, DHT_MSG_MIGRATE_FILE_FAILED,
+ "%s: failed to set xattr on %s", loc->path, to->name);
+ ret = -1;
+ goto out;
+ }
+ if (xattr_rsp) {
/* we no more require this key */
- dict_del (dict, conf->link_xattr_name);
-
- /* preserve source mode, so set the same to the destination */
- src_ia_prot = stbuf.ia_prot;
-
- /* Check if file can be migrated */
- ret = __is_file_migratable (this, loc, &stbuf, xattr_rsp, flag, defrag);
- if (ret) {
- if (ret == -2)
- ret = 0;
- goto out;
- }
- /* Take care of the special files */
- if (!IA_ISREG (stbuf.ia_type)) {
- /* Special files */
- ret = migrate_special_files (this, from, to, loc, &stbuf);
- goto out;
- }
+ dict_del(dict, conf->link_xattr_name);
+ dict_unref(xattr_rsp);
+ }
+
+ ret = syncop_fstat(from, src_fd, &stbuf, dict, &xattr_rsp);
+ if (ret) {
+ gf_msg(this->name, GF_LOG_ERROR, -ret, DHT_MSG_MIGRATE_FILE_FAILED,
+ "Migrate file failed:failed to lookup %s on %s ", loc->path,
+ from->name);
+ *fop_errno = -ret;
+ ret = -1;
+ goto out;
+ }
+
+ /* Check again if file has hardlink */
+ ret = __check_file_has_hardlink(this, loc, &stbuf, xattr_rsp, flag, defrag,
+ conf, fop_errno);
+ if (ret) {
+ if (ret == HARDLINK_MIG_INPROGRESS)
+ ret = 0;
+ goto out;
+ }
+
+ ret = __dht_rebalance_migrate_data(this, defrag, from, to, src_fd, dst_fd,
+ stbuf.ia_size, file_has_holes,
+ fop_errno);
+ if (ret) {
+ gf_msg(this->name, GF_LOG_ERROR, 0, DHT_MSG_MIGRATE_FILE_FAILED,
+ "Migrate file failed: %s: failed to migrate data", loc->path);
+ ret = -1;
+ goto out;
+ }
+
+ /* TODO: Sync the locks */
+
+ xdata = dict_new();
+ if (!xdata || dict_set_int8(xdata, "last-fsync", 1)) {
+ gf_log(this->name, GF_LOG_ERROR,
+ "%s: failed to set last-fsync flag on "
+ "%s (%s)",
+ loc->path, to->name, strerror(ENOMEM));
+ }
+
+ ret = syncop_fsync(to, dst_fd, 0, NULL, NULL, xdata, NULL);
+ if (ret) {
+ gf_log(this->name, GF_LOG_WARNING, "%s: failed to fsync on %s (%s)",
+ loc->path, to->name, strerror(-ret));
+ *fop_errno = -ret;
+ }
+
+ /* Phase 2 - Data-Migration Complete, Housekeeping updates pending */
+
+ ret = syncop_fstat(from, src_fd, &new_stbuf, NULL, NULL);
+ if (ret < 0) {
+ /* Failed to get the stat info */
+ gf_msg(this->name, GF_LOG_ERROR, -ret, DHT_MSG_MIGRATE_FILE_FAILED,
+ "Migrate file failed: failed to fstat file %s on %s ", loc->path,
+ from->name);
+ *fop_errno = -ret;
+ ret = -1;
+ goto out;
+ }
- /* TODO: move all xattr related operations to fd based operations */
- ret = syncop_listxattr (from, loc, &xattr, NULL, NULL);
- if (ret < 0) {
- gf_msg (this->name, GF_LOG_WARNING, 0,
- DHT_MSG_MIGRATE_FILE_FAILED,
- "Migrate file failed:"
- "%s: failed to get xattr from %s (%s)",
- loc->path, from->name, strerror (-ret));
- ret = -1;
- }
+ /* Lock the entire source file to prevent clients from taking a
+ lock on it as dht_lk does not handle file migration.
- /* create the destination, with required modes/xattr */
- ret = __dht_rebalance_create_dst_file (to, from, loc, &stbuf,
- &dst_fd, xattr);
- if (ret)
- goto out;
+ This still leaves a small window where conflicting locks can
+ be granted to different clients. If client1 requests a blocking
+ lock on the src file, it will be granted after the migrating
+ process releases its lock. If client2 requests a lock on the dst
+ data file, it will also be granted, but all FOPs will be redirected
+ to the dst data file.
+ */
- clean_dst = _gf_true;
+ /* Take meta lock */
- ret = __dht_check_free_space (to, from, loc, &stbuf, flag);
+ if (conf->lock_migration_enabled) {
+ meta_dict = dict_new();
+ if (!meta_dict) {
+ gf_msg(this->name, GF_LOG_ERROR, 0, DHT_MSG_MIGRATE_FILE_FAILED,
+ "dict_new failed");
- if (ret) {
- goto out;
+ *fop_errno = ENOMEM;
+ ret = -1;
+ goto out;
}
- /* Open the source, and also update mode/xattr */
- ret = __dht_rebalance_open_src_file (from, to, loc, &stbuf, &src_fd,
- &clean_src);
+ ret = dict_set_str(meta_dict, GLUSTERFS_INTERNAL_FOP_KEY, "yes");
if (ret) {
- gf_msg (this->name, GF_LOG_ERROR, 0,
- DHT_MSG_MIGRATE_FILE_FAILED,
- "Migrate file failed: failed to open %s on %s",
- loc->path, from->name);
- goto out;
+ gf_msg(this->name, GF_LOG_ERROR, 0, DHT_MSG_DICT_SET_FAILED,
+ "Failed to set dictionary value: key = %s,"
+ " path = %s",
+ GLUSTERFS_INTERNAL_FOP_KEY, loc->path);
+ *fop_errno = ENOMEM;
+ ret = -1;
+ goto out;
}
-
- ret = syncop_fstat (from, src_fd, &stbuf, NULL, NULL);
+ ret = dict_set_int32(meta_dict, GF_META_LOCK_KEY, 1);
if (ret) {
- gf_msg (this->name, GF_LOG_ERROR, -ret,
- DHT_MSG_MIGRATE_FILE_FAILED,
- "Migrate file failed:failed to lookup %s on %s ",
- loc->path, from->name);
- ret = -1;
- goto out;
- }
-
- /* Try to preserve 'holes' while migrating data */
- if (stbuf.ia_size > (stbuf.ia_blocks * GF_DISK_SECTOR_SIZE))
- file_has_holes = 1;
-
-
- /* All I/O happens in this function */
- if (defrag->cmd == GF_DEFRAG_CMD_START_TIER) {
- ret = __tier_migrate_data (defrag, from, to, src_fd, dst_fd,
- stbuf.ia_size, file_has_holes);
- } else {
- ret = __dht_rebalance_migrate_data (from, to, src_fd, dst_fd,
- stbuf.ia_size, file_has_holes);
+ gf_msg(this->name, GF_LOG_ERROR, 0, DHT_MSG_MIGRATE_FILE_FAILED,
+ "Trace dict_set failed");
+ *fop_errno = ENOMEM;
+ ret = -1;
+ goto out;
}
+ ret = syncop_setxattr(from, loc, meta_dict, 0, NULL, NULL);
if (ret) {
- gf_msg (this->name, GF_LOG_ERROR, 0,
- DHT_MSG_MIGRATE_FILE_FAILED,
- "Migrate file failed: %s: failed to migrate data",
- loc->path);
+ gf_msg(this->name, GF_LOG_ERROR, -ret, DHT_MSG_MIGRATE_FILE_FAILED,
+ "Trace syncop_setxattr metalock failed");
- ret = -1;
- goto out;
- }
-
- /* TODO: Sync the locks */
-
- ret = syncop_fsync (to, dst_fd, 0, NULL, NULL);
- if (ret) {
- gf_log (this->name, GF_LOG_WARNING,
- "%s: failed to fsync on %s (%s)",
- loc->path, to->name, strerror (-ret));
- ret = -1;
- }
-
-
- /* Phase 2 - Data-Migration Complete, Housekeeping updates pending */
-
- ret = syncop_fstat (from, src_fd, &new_stbuf, NULL, NULL);
- if (ret < 0) {
- /* Failed to get the stat info */
- gf_msg ( this->name, GF_LOG_ERROR, -ret,
- DHT_MSG_MIGRATE_FILE_FAILED,
- "Migrate file failed: failed to fstat file %s on %s ",
- loc->path, from->name);
- ret = -1;
- goto out;
+ *fop_errno = -ret;
+ ret = -1;
+ goto out;
+ } else {
+ meta_locked = _gf_true;
}
+ }
- /* Lock the entire source file to prevent clients from taking a
- lock on it as dht_lk does not handle file migration.
-
- This still leaves a small window where conflicting locks can
- be granted to different clients. If client1 requests a blocking
- lock on the src file, it will be granted after the migrating
- process releases its lock. If client2 requests a lock on the dst
- data file, it will also be granted, but all FOPs will be redirected
- to the dst data file.
- */
-
+ if (!conf->lock_migration_enabled) {
plock.l_type = F_WRLCK;
plock.l_start = 0;
plock.l_len = 0;
plock.l_whence = SEEK_SET;
- ret = syncop_lk (from, src_fd, F_SETLK, &plock, NULL, NULL);
+ ret = syncop_lk(from, src_fd, F_SETLK, &plock, NULL, NULL);
if (ret) {
- gf_msg (this->name, GF_LOG_ERROR, -ret,
- DHT_MSG_MIGRATE_FILE_FAILED,
- "Migrate file failed:"
- "%s: Failed to lock on %s",
- loc->path, from->name);
- ret = -1;
- goto out;
+ gf_msg(this->name, GF_LOG_ERROR, -ret, DHT_MSG_MIGRATE_FILE_FAILED,
+ "Migrate file failed:"
+ "%s: Failed to lock on %s",
+ loc->path, from->name);
+ *fop_errno = -ret;
+ ret = -1;
+ goto out;
}
p_locked = _gf_true;
- /* source would have both sticky bit and sgid bit set, reset it to 0,
- and set the source permission on destination, if it was not set
- prior to setting rebalance-modes in source */
- if (!src_ia_prot.sticky)
- new_stbuf.ia_prot.sticky = 0;
-
- if (!src_ia_prot.sgid)
- new_stbuf.ia_prot.sgid = 0;
-
- /* TODO: if the source actually had sticky bit, or sgid bit set,
- we are not handling it */
-
- ret = syncop_fsetattr (to, dst_fd, &new_stbuf,
- (GF_SET_ATTR_UID | GF_SET_ATTR_GID |
- GF_SET_ATTR_MODE), NULL, NULL, NULL, NULL);
- if (ret) {
- gf_msg (this->name, GF_LOG_WARNING, -ret,
- DHT_MSG_MIGRATE_FILE_FAILED,
- "Migrate file failed:"
- "%s: failed to perform setattr on %s ",
- loc->path, to->name);
- ret = -1;
- goto out;
- }
-
- /* Because 'futimes' is not portable */
- ret = syncop_setattr (to, loc, &new_stbuf,
- (GF_SET_ATTR_MTIME | GF_SET_ATTR_ATIME),
- NULL, NULL, NULL, NULL);
- if (ret) {
- gf_log (this->name, GF_LOG_WARNING,
- "%s: failed to perform setattr on %s ",
- loc->path, to->name);
- ret = -1;
- }
+ } else {
+ INIT_LIST_HEAD(&locklist.list);
- clean_dst = _gf_false;
+ ret = syncop_getactivelk(from, loc, &locklist, NULL, NULL);
+ if (ret == 0) {
+ gf_log(this->name, GF_LOG_INFO, "No active locks on:%s", loc->path);
- /* Posix acls are not set on DHT linkto files as part of the initial
- * initial xattrs set on the dst file, so these need
- * to be set on the dst file after the linkto attrs are removed.
- * TODO: Optimize this.
- */
- if (xattr) {
- dict_unref (xattr);
- xattr = NULL;
- }
+ } else if (ret > 0) {
+ ret = syncop_setactivelk(to, loc, &locklist, NULL, NULL);
+ if (ret) {
+ gf_msg(this->name, GF_LOG_ERROR, -ret,
+ DHT_MSG_LOCK_MIGRATION_FAILED, "write lock failed on:%s",
+ loc->path);
- ret = syncop_listxattr (from, loc, &xattr, NULL, NULL);
- if (ret < 0) {
- gf_msg (this->name, GF_LOG_WARNING, 0,
- DHT_MSG_MIGRATE_FILE_FAILED,
- "Migrate file failed:"
- "%s: failed to get xattr from %s (%s)",
- loc->path, from->name, strerror (-ret));
+ *fop_errno = -ret;
ret = -1;
+ goto metaunlock;
+ }
} else {
- ret = syncop_setxattr (to, loc, xattr, 0, NULL, NULL);
- if (ret < 0) {
- /* Potential problem here where Posix ACLs will
- * not be set on the target file */
-
- gf_msg (this->name, GF_LOG_WARNING, 0,
- DHT_MSG_MIGRATE_FILE_FAILED,
- "Migrate file failed:"
- "%s: failed to set xattr on %s (%s)",
- loc->path, to->name, strerror (-ret));
- ret = -1;
- }
- }
-
- /* store size of previous migrated file */
- if (defrag->tier_conf.is_tier) {
- if (from != TIER_HASHED_SUBVOL) {
- defrag->tier_conf.st_last_promoted_size = stbuf.ia_size;
- } else {
- /* Don't delete the linkto file on the hashed subvol */
- delete_src_linkto = _gf_false;
- defrag->tier_conf.st_last_demoted_size = stbuf.ia_size;
- }
- }
-
- /* The src file is being unlinked after this so we don't need
- to clean it up */
- clean_src = _gf_false;
-
- /* Make the source as a linkfile first before deleting it */
- empty_iatt.ia_prot.sticky = 1;
- ret = syncop_fsetattr (from, src_fd, &empty_iatt,
- GF_SET_ATTR_MODE, NULL, NULL, NULL, NULL);
+ gf_msg(this->name, GF_LOG_ERROR, -ret,
+ DHT_MSG_LOCK_MIGRATION_FAILED,
+ "getactivelk failed for file: %s", loc->path);
+ *fop_errno = -ret;
+ }
+ }
+
+ /* source would have both sticky bit and sgid bit set, reset it to 0,
+ and set the source permission on destination, if it was not set
+ prior to setting rebalance-modes in source */
+ if (!src_ia_prot.sticky)
+ new_stbuf.ia_prot.sticky = 0;
+
+ if (!src_ia_prot.sgid)
+ new_stbuf.ia_prot.sgid = 0;
+
+ /* TODO: if the source actually had sticky bit, or sgid bit set,
+ we are not handling it */
+
+ ret = syncop_fsetattr(
+ to, dst_fd, &new_stbuf,
+ (GF_SET_ATTR_UID | GF_SET_ATTR_GID | GF_SET_ATTR_MODE), NULL, NULL,
+ NULL, NULL);
+ if (ret) {
+ gf_msg(this->name, GF_LOG_WARNING, -ret, DHT_MSG_MIGRATE_FILE_FAILED,
+ "Migrate file failed:"
+ "%s: failed to perform setattr on %s ",
+ loc->path, to->name);
+ *fop_errno = -ret;
+ ret = -1;
+ goto metaunlock;
+ }
+
+ /* Because 'futimes' is not portable */
+ ret = syncop_setattr(to, loc, &new_stbuf,
+ (GF_SET_ATTR_MTIME | GF_SET_ATTR_ATIME), NULL, NULL,
+ NULL, NULL);
+ if (ret) {
+ gf_log(this->name, GF_LOG_WARNING,
+ "%s: failed to perform setattr on %s ", loc->path, to->name);
+ *fop_errno = -ret;
+ }
+
+ if (target_changed) {
+ dict_del(dict, GLUSTERFS_POSIXLK_COUNT);
+ ret = dict_set_str(dict, conf->link_xattr_name, to->name);
if (ret) {
- gf_msg (this->name, GF_LOG_WARNING, -ret,
- DHT_MSG_MIGRATE_FILE_FAILED,
- "Migrate file failed:"
- "%s: failed to perform setattr on %s ",
- loc->path, from->name);
+ gf_log(this->name, GF_LOG_ERROR,
+ "failed to set xattr in dict for %s (linkto:%s)", loc->path,
+ to->name);
+ *fop_errno = ENOMEM;
+ ret = -1;
+ goto out;
+ }
+
+ ret = syncop_setxattr(old_target, loc, dict, 0, NULL, NULL);
+ if (ret && -ret != ESTALE && -ret != ENOENT) {
+ gf_msg(this->name, GF_LOG_ERROR, -ret, DHT_MSG_MIGRATE_FILE_FAILED,
+ "failed to set xattr on %s in %s", loc->path,
+ old_target->name);
+ *fop_errno = -ret;
+ ret = -1;
+ goto out;
+ } else if (-ret == ESTALE || -ret == ENOENT) {
+ /* The failure ESTALE indicates that the linkto
+ * file on the hashed subvol might have been deleted.
+ * In this case will create a linkto file with new target
+ * as linkto xattr value*/
+ linkto_fd = fd_create(loc->inode, DHT_REBALANCE_PID);
+ if (!linkto_fd) {
+ gf_msg(this->name, GF_LOG_ERROR, errno,
+ DHT_MSG_MIGRATE_FILE_FAILED, "%s: fd create failed",
+ loc->path);
+ *fop_errno = ENOMEM;
ret = -1;
goto out;
- }
-
- /* Free up the data blocks on the source node, as the whole
- file is migrated */
- ret = syncop_ftruncate (from, src_fd, 0, NULL, NULL);
- if (ret) {
- gf_log (this->name, GF_LOG_WARNING,
- "%s: failed to perform truncate on %s (%s)",
- loc->path, from->name, strerror (-ret));
+ }
+ ret = syncop_create(old_target, loc, O_RDWR, DHT_LINKFILE_MODE,
+ linkto_fd, NULL, dict, NULL);
+ if (ret != 0 && -ret != EEXIST && -ret != ESTALE) {
+ *fop_errno = -ret;
ret = -1;
- }
-
- /* remove the 'linkto' xattr from the destination */
- ret = syncop_fremovexattr (to, dst_fd, conf->link_xattr_name, 0, NULL);
+ gf_msg(this->name, GF_LOG_ERROR, -ret,
+ DHT_MSG_MIGRATE_FILE_FAILED,
+ "failed to create linkto file on %s in %s", loc->path,
+ old_target->name);
+ goto out;
+ } else if (ret == 0) {
+ ret = syncop_fsetattr(old_target, linkto_fd, &stbuf,
+ (GF_SET_ATTR_UID | GF_SET_ATTR_GID), NULL,
+ NULL, NULL, NULL);
+ if (ret < 0) {
+ *fop_errno = -ret;
+ gf_msg(this->name, GF_LOG_ERROR, -ret,
+ DHT_MSG_MIGRATE_FILE_FAILED,
+ "chown failed for %s on %s", loc->path,
+ old_target->name);
+ }
+ }
+ }
+ }
+
+ clean_dst = _gf_false;
+
+ /* Posix acls are not set on DHT linkto files as part of the initial
+ * initial xattrs set on the dst file, so these need
+ * to be set on the dst file after the linkto attrs are removed.
+ * TODO: Optimize this.
+ */
+ if (xattr) {
+ dict_unref(xattr);
+ xattr = NULL;
+ }
+
+ /* Set only the Posix ACLs this time */
+ ret = syncop_getxattr(from, loc, &xattr, POSIX_ACL_ACCESS_XATTR, NULL,
+ NULL);
+ if (ret < 0) {
+ if ((-ret != ENODATA) && (-ret != ENOATTR)) {
+ gf_msg(this->name, GF_LOG_WARNING, -ret,
+ DHT_MSG_MIGRATE_FILE_FAILED,
+ "Migrate file failed:"
+ "%s: failed to get xattr from %s",
+ loc->path, from->name);
+ *fop_errno = -ret;
+ }
+ } else {
+ ret = syncop_setxattr(to, loc, xattr, 0, NULL, NULL);
+ if (ret < 0) {
+ /* Potential problem here where Posix ACLs will
+ * not be set on the target file */
+
+ gf_msg(this->name, GF_LOG_WARNING, -ret,
+ DHT_MSG_MIGRATE_FILE_FAILED,
+ "Migrate file failed:"
+ "%s: failed to set xattr on %s",
+ loc->path, to->name);
+ *fop_errno = -ret;
+ }
+ }
+
+ /* The src file is being unlinked after this so we don't need
+ to clean it up */
+ clean_src = _gf_false;
+
+ /* Make the source as a linkfile first before deleting it */
+ empty_iatt.ia_prot.sticky = 1;
+ ret = syncop_fsetattr(from, src_fd, &empty_iatt, GF_SET_ATTR_MODE, NULL,
+ NULL, NULL, NULL);
+ if (ret) {
+ gf_msg(this->name, GF_LOG_WARNING, -ret, DHT_MSG_MIGRATE_FILE_FAILED,
+ "Migrate file failed:"
+ "%s: failed to perform setattr on %s ",
+ loc->path, from->name);
+ *fop_errno = -ret;
+ ret = -1;
+ goto metaunlock;
+ }
+
+ /* Free up the data blocks on the source node, as the whole
+ file is migrated */
+ ret = syncop_ftruncate(from, src_fd, 0, NULL, NULL, NULL, NULL);
+ if (ret) {
+ gf_log(this->name, GF_LOG_WARNING,
+ "%s: failed to perform truncate on %s (%s)", loc->path,
+ from->name, strerror(-ret));
+ *fop_errno = -ret;
+ }
+
+ /* remove the 'linkto' xattr from the destination */
+ ret = syncop_fremovexattr(to, dst_fd, conf->link_xattr_name, 0, NULL);
+ if (ret) {
+ gf_log(this->name, GF_LOG_WARNING,
+ "%s: failed to perform removexattr on %s (%s)", loc->path,
+ to->name, strerror(-ret));
+ *fop_errno = -ret;
+ }
+
+ /* Do a stat and check the gfid before unlink */
+
+ /*
+ * Cached file changes its state from non-linkto to linkto file after
+ * migrating data. If lookup from any other mount-point is performed,
+ * converted-linkto-cached file will be treated as a stale and will be
+ * unlinked. But by this time, file is already migrated. So further
+ * failure because of ENOENT should not be treated as error
+ */
+
+ ret = syncop_stat(from, loc, &empty_iatt, NULL, NULL);
+ if (ret) {
+ gf_msg(this->name, GF_LOG_WARNING, -ret, DHT_MSG_MIGRATE_FILE_FAILED,
+ "%s: failed to do a stat on %s", loc->path, from->name);
+
+ if (-ret != ENOENT) {
+ *fop_errno = -ret;
+ ret = -1;
+ goto metaunlock;
+ }
+
+ rcvd_enoent_from_src = 1;
+ }
+
+ if ((gf_uuid_compare(empty_iatt.ia_gfid, loc->gfid) == 0) &&
+ (!rcvd_enoent_from_src) && delete_src_linkto) {
+ /* take out the source from namespace */
+ ret = syncop_unlink(from, loc, NULL, NULL);
if (ret) {
- gf_log (this->name, GF_LOG_WARNING,
- "%s: failed to perform removexattr on %s (%s)",
- loc->path, to->name, strerror (-ret));
- ret = -1;
+ gf_msg(this->name, GF_LOG_WARNING, -ret,
+ DHT_MSG_MIGRATE_FILE_FAILED,
+ "%s: failed to perform unlink on %s", loc->path, from->name);
+ *fop_errno = -ret;
+ ret = -1;
+ goto metaunlock;
}
+ }
- /* Do a stat and check the gfid before unlink */
-
- /*
- * Cached file changes its state from non-linkto to linkto file after
- * migrating data. If lookup from any other mount-point is performed,
- * converted-linkto-cached file will be treated as a stale and will be
- * unlinked. But by this time, file is already migrated. So further
- * failure because of ENOENT should not be treated as error
- */
-
- ret = syncop_stat (from, loc, &empty_iatt, NULL, NULL);
- if (ret) {
- gf_msg (this->name, GF_LOG_WARNING, 0,
- DHT_MSG_MIGRATE_FILE_FAILED,
- "%s: failed to do a stat on %s (%s)",
- loc->path, from->name, strerror (-ret));
+ ret = syncop_lookup(this, loc, NULL, NULL, NULL, NULL);
+ if (ret) {
+ gf_msg_debug(this->name, -ret,
+ "%s: failed to lookup the file on subvolumes", loc->path);
+ *fop_errno = -ret;
+ }
- if (-ret != ENOENT) {
- ret = -1;
- goto out;
- }
+ gf_msg(this->name, log_level, 0, DHT_MSG_MIGRATE_FILE_COMPLETE,
+ "completed migration of %s from subvolume %s to %s", loc->path,
+ from->name, to->name);
- rcvd_enoent_from_src = 1;
- }
+ ret = 0;
+metaunlock:
- if ((gf_uuid_compare (empty_iatt.ia_gfid, loc->gfid) == 0 ) &&
- (!rcvd_enoent_from_src) && delete_src_linkto) {
- /* take out the source from namespace */
- ret = syncop_unlink (from, loc, NULL, NULL);
- if (ret) {
- gf_msg (this->name, GF_LOG_WARNING, 0,
- DHT_MSG_MIGRATE_FILE_FAILED,
- "%s: failed to perform unlink on %s (%s)",
- loc->path, from->name, strerror (-ret));
- ret = -1;
- goto out;
- }
- }
+ if (conf->lock_migration_enabled && meta_locked) {
+ dict_del(meta_dict, GF_META_LOCK_KEY);
- ret = syncop_lookup (this, loc, NULL, NULL, NULL, NULL);
+ ret = dict_set_int32(meta_dict, GF_META_UNLOCK_KEY, 1);
if (ret) {
- gf_msg_debug (this->name, 0,
- "%s: failed to lookup the file on subvolumes (%s)",
- loc->path, strerror (-ret));
- ret = -1;
- }
+ gf_msg(this->name, GF_LOG_ERROR, 0, DHT_MSG_MIGRATE_FILE_FAILED,
+ "Trace dict_set failed");
- gf_msg (this->name, log_level, 0,
- DHT_MSG_MIGRATE_FILE_COMPLETE,
- "completed migration of %s from subvolume %s to %s",
- loc->path, from->name, to->name);
-
- ret = 0;
-out:
- if (clean_src) {
- /* Revert source mode and xattr changes*/
- lk_ret = __dht_migration_cleanup_src_file (this, loc, src_fd,
- from, &src_ia_prot);
- if (lk_ret) {
- gf_msg (this->name, GF_LOG_WARNING, 0,
- DHT_MSG_MIGRATE_FILE_FAILED,
- "%s: failed to cleanup source file on %s",
- loc->path, from->name);
- }
+ *fop_errno = ENOMEM;
+ ret = -1;
+ goto out;
}
- /* reset the destination back to 0 */
- if (clean_dst) {
- lk_ret = syncop_ftruncate (to, dst_fd, 0, NULL, NULL);
- if (lk_ret) {
- gf_msg (this->name, GF_LOG_ERROR, -lk_ret,
- DHT_MSG_MIGRATE_FILE_FAILED,
- "Migrate file failed: "
- "%s: failed to reset target size back to 0",
- loc->path);
- }
- }
+ if (clean_dst == _gf_false)
+ ret = dict_set_int32(meta_dict, "status", 1);
+ else
+ ret = dict_set_int32(meta_dict, "status", 0);
- if (locked) {
- flock.l_type = F_UNLCK;
+ if (ret) {
+ gf_msg(this->name, GF_LOG_ERROR, 0, DHT_MSG_MIGRATE_FILE_FAILED,
+ "Trace dict_set failed");
- lk_ret = syncop_inodelk (from, DHT_FILE_MIGRATE_DOMAIN,
- &tmp_loc, F_SETLK, &flock, NULL, NULL);
- if (lk_ret < 0) {
- gf_msg (this->name, GF_LOG_WARNING, 0,
- DHT_MSG_MIGRATE_FILE_FAILED,
- "%s: failed to unlock file on %s (%s)",
- loc->path, from->name, strerror (-lk_ret));
- }
+ *fop_errno = ENOMEM;
+ ret = -1;
+ goto out;
}
- if (p_locked) {
- plock.l_type = F_UNLCK;
- lk_ret = syncop_lk (from, src_fd, F_SETLK, &plock, NULL, NULL);
+ ret = syncop_setxattr(from, loc, meta_dict, 0, NULL, NULL);
+ if (ret) {
+ gf_msg(this->name, GF_LOG_ERROR, -ret, DHT_MSG_MIGRATE_FILE_FAILED,
+ "Trace syncop_setxattr meta unlock failed");
- if (lk_ret < 0) {
- gf_msg (this->name, GF_LOG_WARNING, -lk_ret,
- DHT_MSG_MIGRATE_FILE_FAILED,
- "%s: failed to unlock file on %s",
- loc->path, from->name);
- }
+ *fop_errno = -ret;
+ ret = -1;
+ goto out;
}
+ }
- if (dict)
- dict_unref (dict);
-
- if (xattr)
- dict_unref (xattr);
- if (xattr_rsp)
- dict_unref (xattr_rsp);
-
- if (dst_fd)
- syncop_close (dst_fd);
- if (src_fd)
- syncop_close (src_fd);
-
- loc_wipe (&tmp_loc);
-
- return ret;
+out:
+ if (clean_src) {
+ /* Revert source mode and xattr changes*/
+ lk_ret = __dht_migration_cleanup_src_file(this, loc, src_fd, from,
+ &src_ia_prot);
+ if (lk_ret) {
+ gf_msg(this->name, GF_LOG_WARNING, 0, DHT_MSG_MIGRATE_FILE_FAILED,
+ "%s: failed to cleanup source file on %s", loc->path,
+ from->name);
+ }
+ }
+
+ /* reset the destination back to 0 */
+ if (clean_dst) {
+ lk_ret = syncop_ftruncate(to, dst_fd, 0, NULL, NULL, NULL, NULL);
+ if (lk_ret) {
+ gf_msg(this->name, GF_LOG_ERROR, -lk_ret,
+ DHT_MSG_MIGRATE_FILE_FAILED,
+ "Migrate file failed: "
+ "%s: failed to reset target size back to 0",
+ loc->path);
+ }
+ }
+
+ if (inodelk_locked) {
+ flock.l_type = F_UNLCK;
+
+ lk_ret = syncop_inodelk(from, DHT_FILE_MIGRATE_DOMAIN, &tmp_loc,
+ F_SETLK, &flock, NULL, NULL);
+ if (lk_ret < 0) {
+ gf_msg(this->name, GF_LOG_WARNING, -lk_ret,
+ DHT_MSG_MIGRATE_FILE_FAILED,
+ "%s: failed to unlock file on %s", loc->path, from->name);
+ }
+ }
+
+ if (entrylk_locked) {
+ lk_ret = syncop_entrylk(hashed_subvol, DHT_ENTRY_SYNC_DOMAIN,
+ &parent_loc, loc->name, ENTRYLK_UNLOCK,
+ ENTRYLK_UNLOCK, NULL, NULL);
+ if (lk_ret < 0) {
+ gf_msg(this->name, GF_LOG_WARNING, -lk_ret,
+ DHT_MSG_MIGRATE_FILE_FAILED,
+ "%s: failed to unlock entrylk on %s", loc->path,
+ hashed_subvol->name);
+ }
+ }
+
+ if (p_locked) {
+ plock.l_type = F_UNLCK;
+ lk_ret = syncop_lk(from, src_fd, F_SETLK, &plock, NULL, NULL);
+
+ if (lk_ret < 0) {
+ gf_msg(this->name, GF_LOG_WARNING, -lk_ret,
+ DHT_MSG_MIGRATE_FILE_FAILED,
+ "%s: failed to unlock file on %s", loc->path, from->name);
+ }
+ }
+
+ lk_ret = syncop_removexattr(to, loc, GF_PROTECT_FROM_EXTERNAL_WRITES, NULL,
+ NULL);
+ if (lk_ret && (lk_ret != -ENODATA) && (lk_ret != -ENOATTR)) {
+ gf_msg(this->name, GF_LOG_WARNING, -lk_ret, 0,
+ "%s: removexattr failed key %s", loc->path,
+ GF_PROTECT_FROM_EXTERNAL_WRITES);
+ }
+
+ if (dict)
+ dict_unref(dict);
+
+ if (xattr)
+ dict_unref(xattr);
+ if (xattr_rsp)
+ dict_unref(xattr_rsp);
+
+ if (dst_fd)
+ syncop_close(dst_fd);
+
+ if (src_fd)
+ syncop_close(src_fd);
+ if (linkto_fd)
+ syncop_close(linkto_fd);
+
+ if (xdata)
+ dict_unref(xdata);
+
+ loc_wipe(&tmp_loc);
+ loc_wipe(&parent_loc);
+
+ return ret;
}
static int
-rebalance_task (void *data)
+rebalance_task(void *data)
{
- int ret = -1;
- dht_local_t *local = NULL;
- call_frame_t *frame = NULL;
+ int ret = -1;
+ dht_local_t *local = NULL;
+ call_frame_t *frame = NULL;
+ int fop_errno = 0;
- frame = data;
+ frame = data;
- local = frame->local;
+ local = frame->local;
- /* This function is 'synchrounous', hence if it returns,
- we are done with the task */
- ret = dht_migrate_file (THIS, &local->loc, local->rebalance.from_subvol,
- local->rebalance.target_node, local->flags);
+ /* This function is 'synchrounous', hence if it returns,
+ we are done with the task */
+ ret = dht_migrate_file(THIS, &local->loc, local->rebalance.from_subvol,
+ local->rebalance.target_node, local->flags,
+ &fop_errno);
- return ret;
+ return ret;
}
static int
-rebalance_task_completion (int op_ret, call_frame_t *sync_frame, void *data)
+rebalance_task_completion(int op_ret, call_frame_t *sync_frame, void *data)
{
- int ret = -1;
- uint64_t layout_int = 0;
- dht_layout_t *layout = 0;
- xlator_t *this = NULL;
- dht_local_t *local = NULL;
- int32_t op_errno = EINVAL;
-
- this = THIS;
- local = sync_frame->local;
-
- if (!op_ret) {
- /* Make sure we have valid 'layout' in inode ctx
- after the operation */
- ret = inode_ctx_del (local->loc.inode, this, &layout_int);
- if (!ret && layout_int) {
- layout = (dht_layout_t *)(long)layout_int;
- dht_layout_unref (this, layout);
- }
-
- ret = dht_layout_preset (this, local->rebalance.target_node,
- local->loc.inode);
- if (ret)
- gf_log (this->name, GF_LOG_WARNING,
- "%s: failed to set inode ctx", local->loc.path);
- }
-
- if (op_ret == -1) {
- /* Failure of migration process, mostly due to write process.
- as we can't preserve the exact errno, lets say there was
- no space to migrate-data
- */
- op_errno = ENOSPC;
- }
-
- if (op_ret == 1) {
- /* migration didn't happen, but is not a failure, let the user
- understand that he doesn't have permission to migrate the
- file.
- */
- op_ret = -1;
- op_errno = EPERM;
- }
+ int32_t op_errno = EINVAL;
- DHT_STACK_UNWIND (setxattr, sync_frame, op_ret, op_errno, NULL);
- return 0;
+ if (op_ret == -1) {
+ /* Failure of migration process, mostly due to write process.
+ as we can't preserve the exact errno, lets say there was
+ no space to migrate-data
+ */
+ op_errno = ENOSPC;
+ } else if (op_ret == 1) {
+ /* migration didn't happen, but is not a failure, let the user
+ understand that he doesn't have permission to migrate the
+ file.
+ */
+ op_ret = -1;
+ op_errno = EPERM;
+ } else if (op_ret != 0) {
+ op_errno = -op_ret;
+ op_ret = -1;
+ }
+
+ DHT_STACK_UNWIND(setxattr, sync_frame, op_ret, op_errno, NULL);
+ return 0;
}
int
-dht_start_rebalance_task (xlator_t *this, call_frame_t *frame)
+dht_start_rebalance_task(xlator_t *this, call_frame_t *frame)
{
- int ret = -1;
+ int ret = -1;
- ret = synctask_new (this->ctx->env, rebalance_task,
- rebalance_task_completion,
- frame, frame);
- return ret;
+ ret = synctask_new(this->ctx->env, rebalance_task,
+ rebalance_task_completion, frame, frame);
+ return ret;
}
int
-gf_listener_stop (xlator_t *this)
+gf_listener_stop(xlator_t *this)
{
- glusterfs_ctx_t *ctx = NULL;
- cmd_args_t *cmd_args = NULL;
- int ret = 0;
-
- ctx = this->ctx;
- GF_ASSERT (ctx);
- cmd_args = &ctx->cmd_args;
- if (cmd_args->sock_file) {
- ret = sys_unlink (cmd_args->sock_file);
- if (ret && (ENOENT == errno)) {
- ret = 0;
- }
- }
-
- if (ret) {
- gf_msg (this->name, GF_LOG_ERROR, errno,
- DHT_MSG_SOCKET_ERROR,
- "Failed to unlink listener "
- "socket %s", cmd_args->sock_file);
- }
- return ret;
+ glusterfs_ctx_t *ctx = NULL;
+ cmd_args_t *cmd_args = NULL;
+ int ret = 0;
+
+ ctx = this->ctx;
+ GF_ASSERT(ctx);
+ cmd_args = &ctx->cmd_args;
+ if (cmd_args->sock_file) {
+ ret = sys_unlink(cmd_args->sock_file);
+ if (ret && (ENOENT == errno)) {
+ ret = 0;
+ }
+ }
+
+ if (ret) {
+ gf_msg(this->name, GF_LOG_ERROR, errno, DHT_MSG_SOCKET_ERROR,
+ "Failed to unlink listener "
+ "socket %s",
+ cmd_args->sock_file);
+ }
+ return ret;
}
void
-dht_build_root_inode (xlator_t *this, inode_t **inode)
+dht_build_root_inode(xlator_t *this, inode_t **inode)
{
- inode_table_t *itable = NULL;
- uuid_t root_gfid = {0, };
+ inode_table_t *itable = NULL;
+ static uuid_t root_gfid = {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1};
- itable = inode_table_new (0, this);
- if (!itable)
- return;
+ itable = inode_table_new(0, this);
+ if (!itable)
+ return;
- root_gfid[15] = 1;
- *inode = inode_find (itable, root_gfid);
+ *inode = inode_find(itable, root_gfid);
}
void
-dht_build_root_loc (inode_t *inode, loc_t *loc)
+dht_build_root_loc(inode_t *inode, loc_t *loc)
{
- loc->path = "/";
- loc->inode = inode;
- loc->inode->ia_type = IA_IFDIR;
- memset (loc->gfid, 0, 16);
- loc->gfid[15] = 1;
+ loc->path = "/";
+ loc->inode = inode;
+ loc->inode->ia_type = IA_IFDIR;
+ memset(loc->gfid, 0, 16);
+ loc->gfid[15] = 1;
}
-
/* return values: 1 -> error, bug ignore and continue
0 -> proceed
-1 -> error, handle it */
int32_t
-gf_defrag_handle_migrate_error (int32_t op_errno, gf_defrag_info_t *defrag)
+gf_defrag_handle_migrate_error(int32_t op_errno, gf_defrag_info_t *defrag)
{
- /* if errno is not ENOSPC or ENOTCONN, we can still continue
- with rebalance process */
- if ((op_errno != ENOSPC) || (op_errno != ENOTCONN))
- return 1;
-
- if (op_errno == ENOTCONN) {
- /* Most probably mount point went missing (mostly due
- to a brick down), say rebalance failure to user,
- let him restart it if everything is fine */
- defrag->defrag_status = GF_DEFRAG_STATUS_FAILED;
- return -1;
- }
-
- if (op_errno == ENOSPC) {
- /* rebalance process itself failed, may be
- remote brick went down, or write failed due to
- disk full etc etc.. */
- defrag->defrag_status = GF_DEFRAG_STATUS_FAILED;
- return -1;
- }
+ int ret = 0;
+ /* if errno is not ENOTCONN, we can still continue
+ with rebalance process */
+ if (op_errno != ENOTCONN) {
+ ret = 1;
+ goto out;
+ }
+
+ if (op_errno == ENOTCONN) {
+ /* Most probably mount point went missing (mostly due
+ to a brick down), say rebalance failure to user,
+ let him restart it if everything is fine */
+ defrag->defrag_status = GF_DEFRAG_STATUS_FAILED;
+ ret = -1;
+ goto out;
+ }
- return 0;
+out:
+ return ret;
}
static gf_boolean_t
-gf_defrag_pattern_match (gf_defrag_info_t *defrag, char *name, uint64_t size)
+gf_defrag_pattern_match(gf_defrag_info_t *defrag, char *name, uint64_t size)
{
- gf_defrag_pattern_list_t *trav = NULL;
- gf_boolean_t match = _gf_false;
- gf_boolean_t ret = _gf_false;
+ gf_defrag_pattern_list_t *trav = NULL;
+ gf_boolean_t match = _gf_false;
+ gf_boolean_t ret = _gf_false;
- GF_VALIDATE_OR_GOTO ("dht", defrag, out);
+ GF_VALIDATE_OR_GOTO("dht", defrag, out);
- trav = defrag->defrag_pattern;
- while (trav) {
- if (!fnmatch (trav->path_pattern, name, FNM_NOESCAPE)) {
- match = _gf_true;
- break;
- }
- trav = trav->next;
+ trav = defrag->defrag_pattern;
+ while (trav) {
+ if (!fnmatch(trav->path_pattern, name, FNM_NOESCAPE)) {
+ match = _gf_true;
+ break;
}
+ trav = trav->next;
+ }
- if ((match == _gf_true) && (size >= trav->size))
- ret = _gf_true;
+ if ((match == _gf_true) && (size >= trav->size))
+ ret = _gf_true;
- out:
- return ret;
+out:
+ return ret;
}
-int dht_dfreaddirp_done (dht_dfoffset_ctx_t *offset_var, int cnt) {
-
- int i;
- int result = 1;
+int
+dht_dfreaddirp_done(dht_dfoffset_ctx_t *offset_var, int cnt)
+{
+ int i;
+ int result = 1;
- for (i = 0; i < cnt; i++) {
- if (offset_var[i].readdir_done == 0) {
- result = 0;
- break;
- }
+ for (i = 0; i < cnt; i++) {
+ if (offset_var[i].readdir_done == 0) {
+ result = 0;
+ break;
}
- return result;
+ }
+ return result;
}
-int static
-gf_defrag_ctx_subvols_init (dht_dfoffset_ctx_t *offset_var, xlator_t *this) {
-
- int i;
- dht_conf_t *conf = NULL;
+int static gf_defrag_ctx_subvols_init(dht_dfoffset_ctx_t *offset_var,
+ xlator_t *this)
+{
+ int i;
+ dht_conf_t *conf = NULL;
- conf = this->private;
+ conf = this->private;
- if (!conf)
- return -1;
+ if (!conf)
+ return -1;
- for (i = 0; i < conf->local_subvols_cnt; i++) {
- offset_var[i].this = conf->local_subvols[i];
- offset_var[i].offset = (off_t) 0;
- offset_var[i].readdir_done = 0;
- }
+ for (i = 0; i < conf->local_subvols_cnt; i++) {
+ offset_var[i].this = conf->local_subvols[i];
+ offset_var[i].offset = (off_t)0;
+ offset_var[i].readdir_done = 0;
+ }
- return 0;
+ return 0;
}
-int
-gf_defrag_migrate_single_file (void *opaque)
+static int
+dht_get_first_non_null_index(subvol_nodeuuids_info_t *entry)
{
- xlator_t *this = NULL;
- dht_conf_t *conf = NULL;
- gf_defrag_info_t *defrag = NULL;
- int ret = 0;
- gf_dirent_t *entry = NULL;
- struct timeval start = {0,};
- loc_t entry_loc = {0,};
- loc_t *loc = NULL;
- struct iatt iatt = {0,};
- dict_t *migrate_data = NULL;
- int32_t op_errno = 0;
- struct timeval end = {0,};
- double elapsed = {0,};
- struct dht_container *rebal_entry = NULL;
- inode_t *inode = NULL;
-
- rebal_entry = (struct dht_container *)opaque;
- if (!rebal_entry) {
- gf_log (this->name, GF_LOG_ERROR, "rebal_entry is NULL");
- ret = -1;
- goto out;
- }
-
- this = rebal_entry->this;
-
- conf = this->private;
-
- defrag = conf->defrag;
+ int i = 0;
+ int index = 0;
- loc = rebal_entry->parent_loc;
-
- migrate_data = rebal_entry->migrate_data;
-
- entry = rebal_entry->df_entry;
-
- if (defrag->defrag_status != GF_DEFRAG_STATUS_STARTED) {
- ret = -1;
- goto out;
+ for (i = 0; i < entry->count; i++) {
+ if (!gf_uuid_is_null(entry->elements[i].uuid)) {
+ index = i;
+ goto out;
}
+ }
- if (defrag->stats == _gf_true) {
- gettimeofday (&start, NULL);
- }
+ if (i == entry->count) {
+ index = -1;
+ }
+out:
+ return index;
+}
- if (defrag->defrag_pattern &&
- (gf_defrag_pattern_match (defrag, entry->d_name,
- entry->d_stat.ia_size) == _gf_false)) {
- gf_log (this->name, GF_LOG_ERROR, "pattern_match failed");
+/* Return value
+ * 0 : this node does not migrate the file
+ * 1 : this node migrates the file
+ *
+ * Use the hash value of the gfid to determine which node will migrate files.
+ * Using the gfid instead of the name also ensures that the same node handles
+ * all hardlinks.
+ */
+
+gf_boolean_t
+gf_defrag_should_i_migrate(xlator_t *this, int local_subvol_index, uuid_t gfid)
+{
+ gf_boolean_t ret = _gf_false;
+ int i = local_subvol_index;
+ char *str = NULL;
+ uint32_t hashval = 0;
+ int32_t index = 0;
+ dht_conf_t *conf = NULL;
+ char buf[UUID_CANONICAL_FORM_LEN + 1] = {
+ 0,
+ };
+ subvol_nodeuuids_info_t *entry = NULL;
+
+ conf = this->private;
+
+ /* Pure distribute. A subvol in this case
+ will be handled by only one node */
+
+ entry = &(conf->local_nodeuuids[i]);
+ if (entry->count == 1) {
+ return 1;
+ }
+
+ str = uuid_utoa_r(gfid, buf);
+ if (dht_hash_compute(this, 0, str, &hashval) == 0) {
+ index = (hashval % entry->count);
+ if (entry->elements[index].info == REBAL_NODEUUID_MINE) {
+ /* Index matches this node's nodeuuid.*/
+ ret = _gf_true;
+ goto out;
+ }
+
+ /* Brick down - some other node has to migrate these files*/
+ if (gf_uuid_is_null(entry->elements[index].uuid)) {
+ /* Fall back to the first non-null index */
+ index = dht_get_first_non_null_index(entry);
+
+ if (index == -1) {
+ /* None of the bricks in the subvol are up.
+ * CHILD_DOWN will kill the process soon */
+
+ return _gf_false;
+ }
+
+ if (entry->elements[index].info == REBAL_NODEUUID_MINE) {
+ /* Index matches this node's nodeuuid.*/
+ ret = _gf_true;
goto out;
+ }
}
+ }
+out:
+ return ret;
+}
- memset (&entry_loc, 0, sizeof (entry_loc));
-
- ret = dht_build_child_loc (this, &entry_loc, loc, entry->d_name);
- if (ret) {
- LOCK (&defrag->lock);
- {
- defrag->total_failures += 1;
- }
- UNLOCK (&defrag->lock);
-
- ret = 0;
-
- gf_log (this->name, GF_LOG_ERROR, "Child loc build failed");
+int
+gf_defrag_migrate_single_file(void *opaque)
+{
+ xlator_t *this = NULL;
+ dht_conf_t *conf = NULL;
+ gf_defrag_info_t *defrag = NULL;
+ int ret = 0;
+ gf_dirent_t *entry = NULL;
+ struct timeval start = {
+ 0,
+ };
+ loc_t entry_loc = {
+ 0,
+ };
+ loc_t *loc = NULL;
+ struct iatt iatt = {
+ 0,
+ };
+ dict_t *migrate_data = NULL;
+ struct timeval end = {
+ 0,
+ };
+ double elapsed = {
+ 0,
+ };
+ struct dht_container *rebal_entry = NULL;
+ inode_t *inode = NULL;
+ xlator_t *hashed_subvol = NULL;
+ xlator_t *cached_subvol = NULL;
+ call_frame_t *statfs_frame = NULL;
+ xlator_t *old_THIS = NULL;
+ data_t *tmp = NULL;
+ int fop_errno = 0;
+ gf_dht_migrate_data_type_t rebal_type = GF_DHT_MIGRATE_DATA;
+ char value[MAX_REBAL_TYPE_SIZE] = {
+ 0,
+ };
+ struct iatt *iatt_ptr = NULL;
+ gf_boolean_t update_skippedcount = _gf_true;
+ int i = 0;
+ gf_boolean_t should_i_migrate = 0;
+
+ rebal_entry = (struct dht_container *)opaque;
+ if (!rebal_entry) {
+ gf_log("DHT", GF_LOG_ERROR, "rebal_entry is NULL");
+ ret = -1;
+ goto out;
+ }
- goto out;
- }
+ this = rebal_entry->this;
- gf_uuid_copy (entry_loc.gfid, entry->d_stat.ia_gfid);
+ conf = this->private;
- gf_uuid_copy (entry_loc.pargfid, loc->gfid);
+ defrag = conf->defrag;
- ret = syncop_lookup (this, &entry_loc, &iatt, NULL, NULL, NULL);
- if (ret) {
- gf_msg (this->name, GF_LOG_ERROR, 0,
- DHT_MSG_MIGRATE_FILE_FAILED,
- "Migrate file failed: %s lookup failed",
- entry_loc.name);
- ret = 0;
- goto out;
- }
+ loc = rebal_entry->parent_loc;
- inode = inode_link (entry_loc.inode, entry_loc.parent, entry->d_name, &iatt);
- inode_unref (entry_loc.inode);
- /* use the inode returned by inode_link */
- entry_loc.inode = inode;
+ migrate_data = rebal_entry->migrate_data;
- ret = syncop_setxattr (this, &entry_loc, migrate_data, 0, NULL, NULL);
- if (ret < 0) {
- op_errno = -ret;
- /* errno is overloaded. See
- * rebalance_task_completion () */
- if (op_errno == ENOSPC) {
- gf_msg_debug (this->name, 0, "migrate-data skipped for"
- " %s due to space constraints",
- entry_loc.path);
- LOCK (&defrag->lock);
- {
- defrag->skipped += 1;
- }
- UNLOCK (&defrag->lock);
- } else if (op_errno != EEXIST) {
- gf_msg (this->name, GF_LOG_ERROR, 0,
- DHT_MSG_MIGRATE_FILE_FAILED,
- "migrate-data failed for %s", entry_loc.path);
+ entry = rebal_entry->df_entry;
+ iatt_ptr = &entry->d_stat;
- LOCK (&defrag->lock);
- {
- defrag->total_failures += 1;
- }
- UNLOCK (&defrag->lock);
+ if (defrag->defrag_status != GF_DEFRAG_STATUS_STARTED) {
+ ret = -1;
+ goto out;
+ }
- }
+ if (defrag->stats == _gf_true) {
+ gettimeofday(&start, NULL);
+ }
- ret = gf_defrag_handle_migrate_error (op_errno, defrag);
+ if (defrag->defrag_pattern &&
+ (gf_defrag_pattern_match(defrag, entry->d_name,
+ entry->d_stat.ia_size) == _gf_false)) {
+ gf_log(this->name, GF_LOG_ERROR, "pattern_match failed");
+ goto out;
+ }
- if (!ret) {
- gf_msg(this->name, GF_LOG_ERROR, 0,
- DHT_MSG_MIGRATE_FILE_FAILED,
- "migrate-data on %s failed: %s", entry_loc.path,
- strerror (op_errno));
- } else if (ret == 1) {
- ret = 0;
- goto out;
- } else if (ret == -1) {
- goto out;
- }
- } else if (ret > 0) {
- gf_msg (this->name, GF_LOG_ERROR, 0,
- DHT_MSG_MIGRATE_FILE_FAILED,
- "migrate-data failed for %s", entry_loc.path);
- ret = 0;
- LOCK (&defrag->lock);
- {
- defrag->total_failures += 1;
- }
- UNLOCK (&defrag->lock);
- }
+ memset(&entry_loc, 0, sizeof(entry_loc));
- LOCK (&defrag->lock);
+ ret = dht_build_child_loc(this, &entry_loc, loc, entry->d_name);
+ if (ret) {
+ LOCK(&defrag->lock);
{
- defrag->total_files += 1;
- defrag->total_data += iatt.ia_size;
+ defrag->total_failures += 1;
}
- UNLOCK (&defrag->lock);
+ UNLOCK(&defrag->lock);
- if (defrag->stats == _gf_true) {
- gettimeofday (&end, NULL);
- elapsed = (end.tv_sec - start.tv_sec) * 1e6 +
- (end.tv_usec - start.tv_usec);
- gf_log (this->name, GF_LOG_INFO, "Migration of "
- "file:%s size:%"PRIu64" bytes took %.2f"
- "secs and ret: %d", entry_loc.name,
- iatt.ia_size, elapsed/1e6, ret);
- }
-
-out:
- loc_wipe (&entry_loc);
+ ret = 0;
- return ret;
+ gf_log(this->name, GF_LOG_ERROR, "Child loc build failed");
-}
+ goto out;
+ }
-void *
-gf_defrag_task (void *opaque)
-{
- struct list_head *q_head = NULL;
- struct dht_container *iterator = NULL;
- gf_defrag_info_t *defrag = NULL;
- int ret = 0;
+ should_i_migrate = gf_defrag_should_i_migrate(
+ this, rebal_entry->local_subvol_index, entry->d_stat.ia_gfid);
+ gf_uuid_copy(entry_loc.gfid, entry->d_stat.ia_gfid);
- defrag = (gf_defrag_info_t *)opaque;
- if (!defrag) {
- gf_msg ("dht", GF_LOG_ERROR, 0, 0, "defrag is NULL");
- goto out;
- }
+ gf_uuid_copy(entry_loc.pargfid, loc->gfid);
- q_head = &(defrag->queue[0].list);
+ ret = syncop_lookup(this, &entry_loc, &iatt, NULL, NULL, NULL);
- /* The following while loop will dequeue one entry from the defrag->queue
- under lock. We will update the defrag->global_error only when there
- is an error which is critical to stop the rebalance process. The stop
- message will be intimated to other migrator threads by setting the
- defrag->defrag_status to GF_DEFRAG_STATUS_FAILED.
+ if (!should_i_migrate) {
+ /* this node isn't supposed to migrate the file. suppressing any
+ * potential error from lookup as this file is under migration by
+ * another node */
+ if (ret) {
+ gf_msg_debug(this->name, -ret,
+ "Ignoring lookup failure: node isn't migrating %s",
+ entry_loc.path);
+ ret = 0;
+ }
+ gf_msg_debug(this->name, 0, "Don't migrate %s ", entry_loc.path);
+ goto out;
+ }
+
+ if (ret) {
+ gf_msg(this->name, GF_LOG_ERROR, -ret, DHT_MSG_MIGRATE_FILE_FAILED,
+ "Migrate file failed: %s lookup failed", entry_loc.path);
+
+ /* Increase failure count only for remove-brick op, so that
+ * user is warned to check the removed-brick for any files left
+ * unmigrated
+ */
+ if (conf->decommission_subvols_cnt) {
+ LOCK(&defrag->lock);
+ {
+ defrag->total_failures += 1;
+ }
+ UNLOCK(&defrag->lock);
+ }
- In defrag->queue, a low watermark (MIN_MIGRATE_QUEUE_COUNT) is
- maintained so that crawler does not starve the file migration
- workers and a high watermark (MAX_MIGRATE_QUEUE_COUNT) so that
- crawler does not go far ahead in filling up the queue.
- */
+ ret = 0;
+ goto out;
+ }
- while (_gf_true) {
+ iatt_ptr = &iatt;
- if (defrag->defrag_status != GF_DEFRAG_STATUS_STARTED) {
- goto out;
- }
+ hashed_subvol = dht_subvol_get_hashed(this, &entry_loc);
+ if (!hashed_subvol) {
+ gf_msg(this->name, GF_LOG_ERROR, 0, DHT_MSG_HASHED_SUBVOL_GET_FAILED,
+ "Failed to get hashed subvol for %s", entry_loc.path);
+ ret = 0;
+ goto out;
+ }
- pthread_mutex_lock (&defrag->dfq_mutex);
- {
+ cached_subvol = dht_subvol_get_cached(this, entry_loc.inode);
+ if (!cached_subvol) {
+ gf_msg(this->name, GF_LOG_ERROR, 0, DHT_MSG_CACHED_SUBVOL_GET_FAILED,
+ "Failed to get cached subvol for %s", entry_loc.path);
- /*Throttle down:
- If the reconfigured count is less than current thread
- count, then the current thread will sleep */
-
- /*TODO: Need to refactor the following block to work
- *under defrag->lock. For now access
- * defrag->current_thread_count and rthcount under
- * dfq_mutex lock */
- while (!defrag->crawl_done &&
- (defrag->recon_thread_count <
- defrag->current_thread_count)) {
- defrag->current_thread_count--;
- gf_log ("DHT", GF_LOG_INFO,
- "Thread sleeping. "
- "defrag->current_thread_count: %d",
- defrag->current_thread_count);
-
- pthread_cond_wait (
- &defrag->df_wakeup_thread,
- &defrag->dfq_mutex);
-
- defrag->current_thread_count++;
-
- gf_log ("DHT", GF_LOG_INFO,
- "Thread wokeup. "
- "defrag->current_thread_count: %d",
- defrag->current_thread_count);
- }
+ ret = 0;
+ goto out;
+ }
- if (defrag->q_entry_count) {
- iterator = list_entry (q_head->next,
- typeof(*iterator), list);
-
- gf_msg_debug ("DHT", 0, "picking entry "
- "%s", iterator->df_entry->d_name);
-
- list_del_init (&(iterator->list));
-
- defrag->q_entry_count--;
-
- if ((defrag->q_entry_count <
- MIN_MIGRATE_QUEUE_COUNT) &&
- defrag->wakeup_crawler) {
- pthread_cond_broadcast (
- &defrag->rebalance_crawler_alarm);
- }
- pthread_mutex_unlock (&defrag->dfq_mutex);
- ret = gf_defrag_migrate_single_file
- ((void *)iterator);
-
- /*Critical errors: ENOTCONN and ENOSPACE*/
- if (ret) {
- dht_set_global_defrag_error
- (defrag, ret);
-
- defrag->defrag_status =
- GF_DEFRAG_STATUS_FAILED;
- goto out;
- }
-
- gf_defrag_free_container (iterator);
-
- continue;
- } else {
-
- /* defrag->crawl_done flag is set means crawling
- file system is done and hence a list_empty when
- the above flag is set indicates there are no more
- entries to be added to the queue and rebalance is
- finished */
-
- if (!defrag->crawl_done) {
- pthread_cond_wait (
- &defrag->parallel_migration_cond,
- &defrag->dfq_mutex);
- }
-
- if (defrag->crawl_done &&
- !defrag->q_entry_count) {
- pthread_cond_broadcast (
- &defrag->parallel_migration_cond);
- goto unlock;
- } else {
- pthread_mutex_unlock
- (&defrag->dfq_mutex);
- continue;
- }
+ if (hashed_subvol == cached_subvol) {
+ ret = 0;
+ goto out;
+ }
+
+ inode = inode_link(entry_loc.inode, entry_loc.parent, entry->d_name, &iatt);
+ inode_unref(entry_loc.inode);
+ /* use the inode returned by inode_link */
+ entry_loc.inode = inode;
+
+ old_THIS = THIS;
+ THIS = this;
+ statfs_frame = create_frame(this, this->ctx->pool);
+ if (!statfs_frame) {
+ gf_msg(this->name, GF_LOG_ERROR, DHT_MSG_NO_MEMORY, ENOMEM,
+ "Insufficient memory. Frame creation failed");
+ ret = -1;
+ goto out;
+ }
+
+ /* async statfs information for honoring min-free-disk */
+ dht_get_du_info(statfs_frame, this, loc);
+ THIS = old_THIS;
+
+ tmp = dict_get(migrate_data, GF_XATTR_FILE_MIGRATE_KEY);
+ if (tmp) {
+ memcpy(value, tmp->data, tmp->len);
+ if (strcmp(value, "force") == 0)
+ rebal_type = GF_DHT_MIGRATE_DATA_EVEN_IF_LINK_EXISTS;
+
+ if (conf->decommission_in_progress)
+ rebal_type = GF_DHT_MIGRATE_HARDLINK;
+ }
+
+ ret = dht_migrate_file(this, &entry_loc, cached_subvol, hashed_subvol,
+ rebal_type, &fop_errno);
+ if (ret == 1) {
+ if (fop_errno == ENOSPC) {
+ gf_msg_debug(this->name, 0,
+ "migrate-data skipped for"
+ " %s due to space constraints",
+ entry_loc.path);
+
+ /* For remove-brick case if the source is not one of the
+ * removed-brick, do not mark the error as failure */
+ if (conf->decommission_subvols_cnt) {
+ for (i = 0; i < conf->subvolume_cnt; i++) {
+ if (conf->decommissioned_bricks[i] == cached_subvol) {
+ LOCK(&defrag->lock);
+ {
+ defrag->total_failures += 1;
+ update_skippedcount = _gf_false;
}
+ UNLOCK(&defrag->lock);
+ break;
+ }
}
-unlock:
- pthread_mutex_unlock (&defrag->dfq_mutex);
- break;
- }
-out:
- return NULL;
-}
+ }
-int static
-gf_defrag_get_entry (xlator_t *this, int i, struct dht_container **container,
- loc_t *loc, dht_conf_t *conf, gf_defrag_info_t *defrag,
- fd_t *fd, dict_t *migrate_data,
- struct dir_dfmeta *dir_dfmeta, dict_t *xattr_req,
- int *should_commit_hash)
-{
- int ret = -1;
- char is_linkfile = 0;
- gf_dirent_t *df_entry = NULL;
- loc_t entry_loc = {0,};
- dict_t *xattr_rsp = NULL;
- struct iatt iatt = {0,};
- struct dht_container *tmp_container = NULL;
- xlator_t *hashed_subvol = NULL;
- xlator_t *cached_subvol = NULL;
-
- if (dir_dfmeta->offset_var[i].readdir_done == 1) {
- ret = 0;
- goto out;
- }
- if (dir_dfmeta->fetch_entries[i] == 1) {
- ret = syncop_readdirp (conf->local_subvols[i], fd, 131072,
- dir_dfmeta->offset_var[i].offset,
- &(dir_dfmeta->equeue[i]),
- NULL, NULL);
- if (ret == 0) {
- dir_dfmeta->offset_var[i].readdir_done = 1;
- ret = 0;
- goto out;
+ if (update_skippedcount) {
+ LOCK(&defrag->lock);
+ {
+ defrag->skipped += 1;
}
+ UNLOCK(&defrag->lock);
- if (ret < 0) {
- gf_msg (this->name, GF_LOG_ERROR, 0,
- DHT_MSG_MIGRATE_DATA_FAILED,
- "%s: Migrate data failed: Readdir returned"
- " %s. Aborting migrate-data", loc->path,
- strerror(-ret));
- ret = -1;
- goto out;
- }
+ gf_msg(this->name, GF_LOG_INFO, 0, DHT_MSG_MIGRATE_FILE_SKIPPED,
+ "File migration skipped for %s.", entry_loc.path);
+ }
- if (list_empty (&(dir_dfmeta->equeue[i].list))) {
- dir_dfmeta->offset_var[i].readdir_done = 1;
- ret = 0;
- goto out;
- }
+ } else if (fop_errno == ENOTSUP) {
+ gf_msg_debug(this->name, 0,
+ "migrate-data skipped for"
+ " hardlink %s ",
+ entry_loc.path);
+ LOCK(&defrag->lock);
+ {
+ defrag->skipped += 1;
+ }
+ UNLOCK(&defrag->lock);
- dir_dfmeta->fetch_entries[i] = 0;
+ gf_msg(this->name, GF_LOG_INFO, 0, DHT_MSG_MIGRATE_FILE_SKIPPED,
+ "File migration skipped for %s.", entry_loc.path);
}
- while (1) {
+ ret = 0;
+ goto out;
+ } else if (ret < 0) {
+ if (fop_errno != EEXIST) {
+ gf_msg(this->name, GF_LOG_ERROR, fop_errno,
+ DHT_MSG_MIGRATE_FILE_FAILED, "migrate-data failed for %s",
+ entry_loc.path);
- if (defrag->defrag_status != GF_DEFRAG_STATUS_STARTED) {
- ret = -1;
- goto out;
- }
+ LOCK(&defrag->lock);
+ {
+ defrag->total_failures += 1;
+ }
+ UNLOCK(&defrag->lock);
+ }
- df_entry = list_entry (dir_dfmeta->iterator[i]->next,
- typeof (*df_entry), list);
+ ret = gf_defrag_handle_migrate_error(fop_errno, defrag);
- if (&df_entry->list == dir_dfmeta->head[i]) {
- gf_dirent_free (&(dir_dfmeta->equeue[i]));
- INIT_LIST_HEAD (&(dir_dfmeta->equeue[i].list));
- dir_dfmeta->fetch_entries[i] = 1;
- dir_dfmeta->iterator[i] = dir_dfmeta->head[i];
- ret = 0;
- goto out;
- }
+ if (!ret) {
+ gf_msg(this->name, GF_LOG_ERROR, fop_errno,
+ DHT_MSG_MIGRATE_FILE_FAILED,
+ "migrate-data on %s failed:", entry_loc.path);
+ } else if (ret == 1) {
+ ret = 0;
+ }
+
+ goto out;
+ }
+
+ LOCK(&defrag->lock);
+ {
+ defrag->total_files += 1;
+ defrag->total_data += iatt.ia_size;
+ }
+ UNLOCK(&defrag->lock);
+
+ if (defrag->stats == _gf_true) {
+ gettimeofday(&end, NULL);
+ elapsed = gf_tvdiff(&start, &end);
+ gf_log(this->name, GF_LOG_INFO,
+ "Migration of "
+ "file:%s size:%" PRIu64
+ " bytes took %.2f"
+ "secs and ret: %d",
+ entry_loc.name, iatt.ia_size, elapsed / 1e6, ret);
+ }
- dir_dfmeta->iterator[i] = dir_dfmeta->iterator[i]->next;
+out:
+ if (statfs_frame) {
+ STACK_DESTROY(statfs_frame->root);
+ }
- dir_dfmeta->offset_var[i].offset = df_entry->d_off;
- if (!strcmp (df_entry->d_name, ".") ||
- !strcmp (df_entry->d_name, ".."))
- continue;
+ if (iatt_ptr) {
+ LOCK(&defrag->lock);
+ {
+ defrag->size_processed += iatt_ptr->ia_size;
+ }
+ UNLOCK(&defrag->lock);
+ }
+ loc_wipe(&entry_loc);
- if (IA_ISDIR (df_entry->d_stat.ia_type))
- continue;
+ return ret;
+}
- defrag->num_files_lookedup++;
+void *
+gf_defrag_task(void *opaque)
+{
+ struct list_head *q_head = NULL;
+ struct dht_container *iterator = NULL;
+ gf_defrag_info_t *defrag = NULL;
+ int ret = 0;
+ pid_t pid = GF_CLIENT_PID_DEFRAG;
+
+ defrag = (gf_defrag_info_t *)opaque;
+ if (!defrag) {
+ gf_msg("dht", GF_LOG_ERROR, 0, 0, "defrag is NULL");
+ goto out;
+ }
+
+ syncopctx_setfspid(&pid);
+
+ q_head = &(defrag->queue[0].list);
+
+ /* The following while loop will dequeue one entry from the defrag->queue
+ under lock. We will update the defrag->global_error only when there
+ is an error which is critical to stop the rebalance process. The stop
+ message will be intimated to other migrator threads by setting the
+ defrag->defrag_status to GF_DEFRAG_STATUS_FAILED.
+
+ In defrag->queue, a low watermark (MIN_MIGRATE_QUEUE_COUNT) is
+ maintained so that crawler does not starve the file migration
+ workers and a high watermark (MAX_MIGRATE_QUEUE_COUNT) so that
+ crawler does not go far ahead in filling up the queue.
+ */
+
+ while (_gf_true) {
+ if (defrag->defrag_status != GF_DEFRAG_STATUS_STARTED) {
+ pthread_cond_broadcast(&defrag->rebalance_crawler_alarm);
+ pthread_cond_broadcast(&defrag->parallel_migration_cond);
+ goto out;
+ }
- if (defrag->defrag_pattern &&
- (gf_defrag_pattern_match (defrag, df_entry->d_name,
- df_entry->d_stat.ia_size)
- == _gf_false)) {
- continue;
+ pthread_mutex_lock(&defrag->dfq_mutex);
+ {
+ /*Throttle down:
+ If the reconfigured count is less than current thread
+ count, then the current thread will sleep */
+
+ /*TODO: Need to refactor the following block to work
+ *under defrag->lock. For now access
+ * defrag->current_thread_count and rthcount under
+ * dfq_mutex lock */
+ while (!defrag->crawl_done && (defrag->recon_thread_count <
+ defrag->current_thread_count)) {
+ defrag->current_thread_count--;
+ gf_msg_debug("DHT", 0,
+ "Thread sleeping. "
+ "current thread count: %d",
+ defrag->current_thread_count);
+
+ pthread_cond_wait(&defrag->df_wakeup_thread,
+ &defrag->dfq_mutex);
+
+ defrag->current_thread_count++;
+ gf_msg_debug("DHT", 0,
+ "Thread wokeup. "
+ "current thread count: %d",
+ defrag->current_thread_count);
+ }
+
+ if (defrag->q_entry_count) {
+ iterator = list_entry(q_head->next, typeof(*iterator), list);
+
+ gf_msg_debug("DHT", 0,
+ "picking entry "
+ "%s",
+ iterator->df_entry->d_name);
+
+ list_del_init(&(iterator->list));
+
+ defrag->q_entry_count--;
+
+ if ((defrag->q_entry_count < MIN_MIGRATE_QUEUE_COUNT) &&
+ defrag->wakeup_crawler) {
+ pthread_cond_broadcast(&defrag->rebalance_crawler_alarm);
}
+ pthread_mutex_unlock(&defrag->dfq_mutex);
+ ret = gf_defrag_migrate_single_file((void *)iterator);
- loc_wipe (&entry_loc);
- ret = dht_build_child_loc (this, &entry_loc, loc,
- df_entry->d_name);
+ /*Critical errors: ENOTCONN and ENOSPACE*/
if (ret) {
- gf_log (this->name, GF_LOG_ERROR, "Child loc"
- " build failed");
- ret = -1;
- goto out;
- }
+ dht_set_global_defrag_error(defrag, ret);
- if (gf_uuid_is_null (df_entry->d_stat.ia_gfid)) {
- gf_msg (this->name, GF_LOG_ERROR, 0,
- DHT_MSG_GFID_NULL,
- "%s/%s gfid not present", loc->path,
- df_entry->d_name);
- continue;
- }
-
- gf_uuid_copy (entry_loc.gfid, df_entry->d_stat.ia_gfid);
-
- if (gf_uuid_is_null (loc->gfid)) {
- gf_msg (this->name, GF_LOG_ERROR, 0,
- DHT_MSG_GFID_NULL,
- "%s/%s gfid not present", loc->path,
- df_entry->d_name);
- continue;
- }
+ defrag->defrag_status = GF_DEFRAG_STATUS_FAILED;
- gf_uuid_copy (entry_loc.pargfid, loc->gfid);
+ pthread_cond_broadcast(&defrag->rebalance_crawler_alarm);
- entry_loc.inode->ia_type = df_entry->d_stat.ia_type;
+ pthread_cond_broadcast(&defrag->parallel_migration_cond);
- if (xattr_rsp) {
- dict_unref (xattr_rsp);
- xattr_rsp = NULL;
+ goto out;
}
- ret = syncop_lookup (conf->local_subvols[i], &entry_loc,
- &iatt, NULL, xattr_req, &xattr_rsp);
- if (ret) {
- gf_msg (this->name, GF_LOG_ERROR, 0,
- DHT_MSG_MIGRATE_FILE_FAILED,
- "Migrate file failed:%s lookup failed",
- entry_loc.path);
-
- if (-ret != ENOENT && -ret != ESTALE) {
-
- defrag->total_failures++;
-
- if (conf->decommission_in_progress) {
- ret = -1;
- goto out;
- } else {
- *should_commit_hash = 0;
- continue;
- }
- }
-
- continue;
+ gf_defrag_free_container(iterator);
+
+ continue;
+ } else {
+ /* defrag->crawl_done flag is set means crawling
+ file system is done and hence a list_empty when
+ the above flag is set indicates there are no more
+ entries to be added to the queue and rebalance is
+ finished */
+
+ if (!defrag->crawl_done) {
+ defrag->current_thread_count--;
+ gf_msg_debug("DHT", 0,
+ "Thread "
+ "sleeping while waiting "
+ "for migration entries. "
+ "current thread count:%d",
+ defrag->current_thread_count);
+
+ pthread_cond_wait(&defrag->parallel_migration_cond,
+ &defrag->dfq_mutex);
}
+ if (defrag->crawl_done && !defrag->q_entry_count) {
+ defrag->current_thread_count++;
+ gf_msg_debug("DHT", 0, "Exiting thread");
- is_linkfile = check_is_linkfile (NULL, &iatt, xattr_rsp,
- conf->link_xattr_name);
-
- if (is_linkfile) {
- /* No need to add linkto file to the queue for
- migration. Only the actual data file need to
- be checked for migration criteria.
- */
- gf_msg_debug (this->name, 0, "Skipping linkfile"
- " %s on subvol: %s", entry_loc.path,
- conf->local_subvols[i]->name);
- continue;
+ pthread_cond_broadcast(&defrag->parallel_migration_cond);
+ goto unlock;
+ } else {
+ defrag->current_thread_count++;
+ gf_msg_debug("DHT", 0,
+ "Thread woke up"
+ " as found migrating entries. "
+ "current thread count:%d",
+ defrag->current_thread_count);
+
+ pthread_mutex_unlock(&defrag->dfq_mutex);
+ continue;
}
+ }
+ }
+ unlock:
+ pthread_mutex_unlock(&defrag->dfq_mutex);
+ break;
+ }
+out:
+ return NULL;
+}
+int static gf_defrag_get_entry(xlator_t *this, int i,
+ struct dht_container **container, loc_t *loc,
+ dht_conf_t *conf, gf_defrag_info_t *defrag,
+ fd_t *fd, dict_t *migrate_data,
+ struct dir_dfmeta *dir_dfmeta, dict_t *xattr_req,
+ int *perrno)
+{
+ int ret = 0;
+ char is_linkfile = 0;
+ gf_dirent_t *df_entry = NULL;
+ struct dht_container *tmp_container = NULL;
- ret = syncop_lookup (this, &entry_loc, NULL, NULL,
- NULL, NULL);
- if (ret) {
- gf_msg (this->name, GF_LOG_ERROR, 0,
- DHT_MSG_MIGRATE_FILE_FAILED,
- "Migrate file failed:%s lookup failed",
- entry_loc.path);
-
- if (-ret != ENOENT && -ret != ESTALE) {
-
- defrag->total_failures++;
-
- if (conf->decommission_in_progress) {
- ret = -1;
- goto out;
- } else {
- *should_commit_hash = 0;
- continue;
- }
- }
-
- continue;
- }
+ if (defrag->defrag_status != GF_DEFRAG_STATUS_STARTED) {
+ ret = -1;
+ goto out;
+ }
- /* if distribute is present, it will honor this key.
- * -1, ENODATA is returned if distribute is not present
- * or file doesn't have a link-file. If file has
- * link-file, the path of link-file will be the value,
- * and also that guarantees that file has to be mostly
- * migrated */
-
- hashed_subvol = dht_subvol_get_hashed (this, &entry_loc);
- if (!hashed_subvol) {
- gf_msg (this->name, GF_LOG_ERROR, 0,
- DHT_MSG_HASHED_SUBVOL_GET_FAILED,
- "Failed to get hashed subvol for %s",
- loc->path);
- continue;
- }
+ if (dir_dfmeta->offset_var[i].readdir_done == 1) {
+ ret = 0;
+ goto out;
+ }
- cached_subvol = dht_subvol_get_cached (this, entry_loc.inode);
- if (!cached_subvol) {
- gf_msg (this->name, GF_LOG_ERROR, 0,
- DHT_MSG_CACHED_SUBVOL_GET_FAILED,
- "Failed to get cached subvol for %s",
- loc->path);
+ if (dir_dfmeta->fetch_entries[i] == 1) {
+ if (!fd) {
+ dir_dfmeta->fetch_entries[i] = 0;
+ dir_dfmeta->offset_var[i].readdir_done = 1;
+ ret = 0;
+ goto out;
+ }
- continue;
- }
+ ret = syncop_readdirp(conf->local_subvols[i], fd, 131072,
+ dir_dfmeta->offset_var[i].offset,
+ &(dir_dfmeta->equeue[i]), xattr_req, NULL);
+ if (ret == 0) {
+ dir_dfmeta->offset_var[i].readdir_done = 1;
+ ret = 0;
+ goto out;
+ }
- if (hashed_subvol == cached_subvol) {
- continue;
- }
+ if (ret < 0) {
+ gf_msg(this->name, GF_LOG_WARNING, -ret,
+ DHT_MSG_MIGRATE_DATA_FAILED,
+ "Readdirp failed. Aborting data migration for "
+ "directory: %s",
+ loc->path);
+ *perrno = -ret;
+ ret = -1;
+ goto out;
+ }
- /*Build Container Structure */
+ if (list_empty(&(dir_dfmeta->equeue[i].list))) {
+ dir_dfmeta->offset_var[i].readdir_done = 1;
+ ret = 0;
+ goto out;
+ }
- tmp_container = GF_CALLOC (1, sizeof(struct dht_container),
- gf_dht_mt_container_t);
- if (!tmp_container) {
- gf_log (this->name, GF_LOG_ERROR, "Failed to allocate "
- "memory for container");
- ret = -1;
- goto out;
- }
- tmp_container->df_entry = gf_dirent_for_name (df_entry->d_name);
- if (!tmp_container->df_entry) {
- gf_log (this->name, GF_LOG_ERROR, "Failed to allocate "
- "memory for df_entry");
- ret = -1;
- goto out;
- }
+ dir_dfmeta->fetch_entries[i] = 0;
+ }
- tmp_container->df_entry->d_stat = df_entry->d_stat;
+ while (1) {
+ if (defrag->defrag_status != GF_DEFRAG_STATUS_STARTED) {
+ ret = -1;
+ goto out;
+ }
- tmp_container->df_entry->d_ino = df_entry->d_ino;
+ df_entry = list_entry(dir_dfmeta->iterator[i]->next, typeof(*df_entry),
+ list);
- tmp_container->df_entry->d_type = df_entry->d_type;
+ if (&df_entry->list == dir_dfmeta->head[i]) {
+ gf_dirent_free(&(dir_dfmeta->equeue[i]));
+ INIT_LIST_HEAD(&(dir_dfmeta->equeue[i].list));
+ dir_dfmeta->fetch_entries[i] = 1;
+ dir_dfmeta->iterator[i] = dir_dfmeta->head[i];
+ ret = 0;
+ goto out;
+ }
- tmp_container->df_entry->d_len = df_entry->d_len;
+ dir_dfmeta->iterator[i] = dir_dfmeta->iterator[i]->next;
- tmp_container->parent_loc = GF_CALLOC(1, sizeof(*loc),
- gf_dht_mt_loc_t);
- if (!tmp_container->parent_loc) {
- gf_log (this->name, GF_LOG_ERROR, "Failed to allocate "
- "memory for loc");
- ret = -1;
- goto out;
- }
+ dir_dfmeta->offset_var[i].offset = df_entry->d_off;
+ if (!strcmp(df_entry->d_name, ".") || !strcmp(df_entry->d_name, ".."))
+ continue;
+ if (IA_ISDIR(df_entry->d_stat.ia_type)) {
+ defrag->size_processed += df_entry->d_stat.ia_size;
+ continue;
+ }
- ret = loc_copy (tmp_container->parent_loc, loc);
- if (ret) {
- gf_log (this->name, GF_LOG_ERROR,
- "loc_copy failed");
- ret = -1;
- goto out;
- }
+ defrag->num_files_lookedup++;
- tmp_container->migrate_data = migrate_data;
+ if (defrag->defrag_pattern &&
+ (gf_defrag_pattern_match(defrag, df_entry->d_name,
+ df_entry->d_stat.ia_size) == _gf_false)) {
+ defrag->size_processed += df_entry->d_stat.ia_size;
+ continue;
+ }
- tmp_container->this = this;
+ is_linkfile = check_is_linkfile(NULL, &df_entry->d_stat, df_entry->dict,
+ conf->link_xattr_name);
- if (df_entry->dict)
- tmp_container->df_entry->dict =
- dict_ref (df_entry->dict);
+ if (is_linkfile) {
+ /* No need to add linkto file to the queue for
+ migration. Only the actual data file need to
+ be checked for migration criteria.
+ */
- /*Build Container Structue >> END*/
+ gf_msg_debug(this->name, 0,
+ "Skipping linkfile"
+ " %s on subvol: %s",
+ df_entry->d_name, conf->local_subvols[i]->name);
+ continue;
+ }
- ret = 0;
- goto out;
+ /*Build Container Structure */
+ tmp_container = GF_CALLOC(1, sizeof(struct dht_container),
+ gf_dht_mt_container_t);
+ if (!tmp_container) {
+ gf_log(this->name, GF_LOG_ERROR,
+ "Failed to allocate "
+ "memory for container");
+ ret = -1;
+ goto out;
+ }
+ tmp_container->df_entry = gf_dirent_for_name(df_entry->d_name);
+ if (!tmp_container->df_entry) {
+ gf_log(this->name, GF_LOG_ERROR,
+ "Failed to allocate "
+ "memory for df_entry");
+ ret = -1;
+ goto out;
}
-out:
- loc_wipe (&entry_loc);
+ tmp_container->local_subvol_index = i;
- if (ret == 0) {
- *container = tmp_container;
- } else {
- if (tmp_container) {
- gf_defrag_free_container (tmp_container);
- }
- }
+ tmp_container->df_entry->d_stat = df_entry->d_stat;
- if (xattr_rsp)
- dict_unref (xattr_rsp);
- return ret;
-}
+ tmp_container->df_entry->d_ino = df_entry->d_ino;
-int
-gf_defrag_process_dir (xlator_t *this, gf_defrag_info_t *defrag, loc_t *loc,
- dict_t *migrate_data)
-{
- int ret = -1;
- fd_t *fd = NULL;
- dht_conf_t *conf = NULL;
- gf_dirent_t entries;
- dict_t *dict = NULL;
- dict_t *xattr_req = NULL;
- struct timeval dir_start = {0,};
- struct timeval end = {0,};
- double elapsed = {0,};
- int local_subvols_cnt = 0;
- int i = 0;
- int j = 0;
- struct dht_container *container = NULL;
- int ldfq_count = 0;
- int dfc_index = 0;
- int throttle_up = 0;
- struct dir_dfmeta *dir_dfmeta = NULL;
- int should_commit_hash = 1;
-
- gf_log (this->name, GF_LOG_INFO, "migrate data called on %s",
- loc->path);
- gettimeofday (&dir_start, NULL);
-
- conf = this->private;
- local_subvols_cnt = conf->local_subvols_cnt;
-
- if (!local_subvols_cnt) {
- ret = 0;
- goto out;
- }
+ tmp_container->df_entry->d_type = df_entry->d_type;
- fd = fd_create (loc->inode, defrag->pid);
- if (!fd) {
- gf_log (this->name, GF_LOG_ERROR, "Failed to create fd");
- ret = -1;
- goto out;
+ tmp_container->df_entry->d_len = df_entry->d_len;
+
+ tmp_container->parent_loc = GF_CALLOC(1, sizeof(*loc), gf_dht_mt_loc_t);
+ if (!tmp_container->parent_loc) {
+ gf_log(this->name, GF_LOG_ERROR,
+ "Failed to allocate "
+ "memory for loc");
+ ret = -1;
+ goto out;
}
- ret = syncop_opendir (this, loc, fd, NULL, NULL);
+ ret = loc_copy(tmp_container->parent_loc, loc);
if (ret) {
- gf_msg (this->name, GF_LOG_ERROR, 0,
- DHT_MSG_MIGRATE_DATA_FAILED,
- "Migrate data failed: Failed to open dir %s",
- loc->path);
- ret = -1;
- goto out;
+ gf_log(this->name, GF_LOG_ERROR, "loc_copy failed");
+ ret = -1;
+ goto out;
}
- fd_bind (fd);
- dir_dfmeta = GF_CALLOC (1, sizeof (*dir_dfmeta),
- gf_common_mt_pointer);
- if (!dir_dfmeta) {
- gf_log (this->name, GF_LOG_ERROR, "dir_dfmeta is NULL");
- ret = -1;
- goto out;
- }
+ tmp_container->migrate_data = migrate_data;
+ tmp_container->this = this;
- dir_dfmeta->head = GF_CALLOC (local_subvols_cnt,
- sizeof (*(dir_dfmeta->head)),
- gf_common_mt_pointer);
- if (!dir_dfmeta->head) {
- gf_log (this->name, GF_LOG_ERROR, "dir_dfmeta->head is NULL");
- ret = -1;
- goto out;
- }
+ if (df_entry->dict)
+ tmp_container->df_entry->dict = dict_ref(df_entry->dict);
- dir_dfmeta->iterator = GF_CALLOC (local_subvols_cnt,
- sizeof (*(dir_dfmeta->iterator)),
- gf_common_mt_pointer);
- if (!dir_dfmeta->iterator) {
- gf_log (this->name, GF_LOG_ERROR,
- "dir_dfmeta->iterator is NULL");
- ret = -1;
- goto out;
- }
+ /*Build Container Structure >> END*/
- dir_dfmeta->equeue = GF_CALLOC (local_subvols_cnt, sizeof (entries),
- gf_dht_mt_dirent_t);
- if (!dir_dfmeta->equeue) {
- gf_log (this->name, GF_LOG_ERROR, "dir_dfmeta->equeue is NULL");
- ret = -1;
- goto out;
- }
+ ret = 0;
+ goto out;
+ }
- dir_dfmeta->offset_var = GF_CALLOC (local_subvols_cnt,
- sizeof (dht_dfoffset_ctx_t),
- gf_dht_mt_octx_t);
- if (!dir_dfmeta->offset_var) {
- gf_log (this->name, GF_LOG_ERROR,
- "dir_dfmeta->offset_var is NULL");
- ret = -1;
- goto out;
- }
- ret = gf_defrag_ctx_subvols_init (dir_dfmeta->offset_var, this);
- if (ret) {
- gf_log (this->name, GF_LOG_ERROR, "dht_dfoffset_ctx_t"
- "initialization failed");
- ret = -1;
- goto out;
+out:
+ if (ret == 0) {
+ *container = tmp_container;
+ } else {
+ if (tmp_container) {
+ gf_defrag_free_container(tmp_container);
}
+ }
- dir_dfmeta->fetch_entries = GF_CALLOC (local_subvols_cnt,
- sizeof (int), gf_common_mt_int);
- if (!dir_dfmeta->fetch_entries) {
- gf_log (this->name, GF_LOG_ERROR,
- "dir_dfmeta->fetch_entries is NULL");
- ret = -1;
- goto out;
- }
+ return ret;
+}
- for (i = 0; i < local_subvols_cnt ; i++) {
- INIT_LIST_HEAD (&(dir_dfmeta->equeue[i].list));
- dir_dfmeta->head[i] = &(dir_dfmeta->equeue[i].list);
- dir_dfmeta->iterator[i] = dir_dfmeta->head[i];
- dir_dfmeta->fetch_entries[i] = 1;
- }
+int
+gf_defrag_process_dir(xlator_t *this, gf_defrag_info_t *defrag, loc_t *loc,
+ dict_t *migrate_data, int *perrno)
+{
+ int ret = -1;
+ dht_conf_t *conf = NULL;
+ gf_dirent_t entries;
+ dict_t *xattr_req = NULL;
+ struct timeval dir_start = {
+ 0,
+ };
+ struct timeval end = {
+ 0,
+ };
+ double elapsed = {
+ 0,
+ };
+ int local_subvols_cnt = 0;
+ int i = 0;
+ int j = 0;
+ struct dht_container *container = NULL;
+ int ldfq_count = 0;
+ int dfc_index = 0;
+ int throttle_up = 0;
+ struct dir_dfmeta *dir_dfmeta = NULL;
+ xlator_t *old_THIS = NULL;
+
+ gf_log(this->name, GF_LOG_INFO, "migrate data called on %s", loc->path);
+ gettimeofday(&dir_start, NULL);
+
+ conf = this->private;
+ local_subvols_cnt = conf->local_subvols_cnt;
+
+ if (!local_subvols_cnt) {
+ ret = 0;
+ goto out;
+ }
- xattr_req = dict_new ();
- if (!xattr_req) {
- gf_log (this->name, GF_LOG_ERROR, "dict_new failed");
- ret = -1;
- goto out;
- }
+ old_THIS = THIS;
+ THIS = this;
- ret = dict_set_uint32 (xattr_req,
- conf->link_xattr_name, 256);
+ dir_dfmeta = GF_CALLOC(1, sizeof(*dir_dfmeta), gf_common_mt_pointer);
+ if (!dir_dfmeta) {
+ gf_log(this->name, GF_LOG_ERROR, "dir_dfmeta is NULL");
+ ret = -1;
+ goto out;
+ }
+
+ dir_dfmeta->lfd = GF_CALLOC(local_subvols_cnt, sizeof(fd_t *),
+ gf_common_mt_pointer);
+ if (!dir_dfmeta->lfd) {
+ gf_smsg(this->name, GF_LOG_ERROR, ENOMEM, DHT_MSG_INSUFF_MEMORY,
+ "for dir_dfmeta", NULL);
+ ret = -1;
+ *perrno = ENOMEM;
+ goto out;
+ }
+
+ for (i = 0; i < local_subvols_cnt; i++) {
+ dir_dfmeta->lfd[i] = fd_create(loc->inode, defrag->pid);
+ if (!dir_dfmeta->lfd[i]) {
+ gf_smsg(this->name, GF_LOG_ERROR, ENOMEM, DHT_MSG_FD_CREATE_FAILED,
+ NULL);
+ *perrno = ENOMEM;
+ ret = -1;
+ goto out;
+ }
+
+ ret = syncop_opendir(conf->local_subvols[i], loc, dir_dfmeta->lfd[i],
+ NULL, NULL);
if (ret) {
- gf_log (this->name, GF_LOG_ERROR, "failed to set dict for "
- "key: %s", conf->link_xattr_name);
- ret = -1;
- goto out;
+ fd_unref(dir_dfmeta->lfd[i]);
+ dir_dfmeta->lfd[i] = NULL;
+ gf_smsg(this->name, GF_LOG_WARNING, 0, DHT_MSG_FAILED_TO_OPEN,
+ "dir: %s", loc->path, "subvol: %s",
+ conf->local_subvols[i]->name, NULL);
+
+ if (conf->decommission_in_progress) {
+ *perrno = -ret;
+ ret = -1;
+ goto out;
+ }
+ } else {
+ fd_bind(dir_dfmeta->lfd[i]);
}
+ }
- /*
- Job: Read entries from each local subvol and store the entries
- in equeue array of linked list. Now pick one entry from the
- equeue array in a round robin basis and add them to defrag Queue.
- */
-
- while (!dht_dfreaddirp_done(dir_dfmeta->offset_var,
- local_subvols_cnt)) {
-
- pthread_mutex_lock (&defrag->dfq_mutex);
- {
-
- /*Throttle up: If reconfigured count is higher than
- current thread count, wake up the sleeping threads
- TODO: Need to refactor this. Instead of making the
- thread sleep and wake, we should terminate and spawn
- threads on-demand*/
-
- if (defrag->recon_thread_count >
- defrag->current_thread_count) {
- throttle_up =
- (defrag->recon_thread_count -
- defrag->current_thread_count);
- for (j = 0; j < throttle_up; j++) {
- pthread_cond_signal (
- &defrag->df_wakeup_thread);
- }
-
- }
+ dir_dfmeta->head = GF_CALLOC(local_subvols_cnt, sizeof(*(dir_dfmeta->head)),
+ gf_common_mt_pointer);
+ if (!dir_dfmeta->head) {
+ gf_log(this->name, GF_LOG_ERROR, "dir_dfmeta->head is NULL");
+ ret = -1;
+ goto out;
+ }
+
+ dir_dfmeta->iterator = GF_CALLOC(local_subvols_cnt,
+ sizeof(*(dir_dfmeta->iterator)),
+ gf_common_mt_pointer);
+ if (!dir_dfmeta->iterator) {
+ gf_log(this->name, GF_LOG_ERROR, "dir_dfmeta->iterator is NULL");
+ ret = -1;
+ goto out;
+ }
- while (defrag->q_entry_count >
- MAX_MIGRATE_QUEUE_COUNT) {
- defrag->wakeup_crawler = 1;
- pthread_cond_wait (
- &defrag->rebalance_crawler_alarm,
- &defrag->dfq_mutex);
- }
+ dir_dfmeta->equeue = GF_CALLOC(local_subvols_cnt, sizeof(entries),
+ gf_dht_mt_dirent_t);
+ if (!dir_dfmeta->equeue) {
+ gf_log(this->name, GF_LOG_ERROR, "dir_dfmeta->equeue is NULL");
+ ret = -1;
+ goto out;
+ }
- ldfq_count = defrag->q_entry_count;
+ dir_dfmeta->offset_var = GF_CALLOC(
+ local_subvols_cnt, sizeof(dht_dfoffset_ctx_t), gf_dht_mt_octx_t);
+ if (!dir_dfmeta->offset_var) {
+ gf_log(this->name, GF_LOG_ERROR, "dir_dfmeta->offset_var is NULL");
+ ret = -1;
+ goto out;
+ }
+
+ ret = gf_defrag_ctx_subvols_init(dir_dfmeta->offset_var, this);
+ if (ret) {
+ gf_log(this->name, GF_LOG_ERROR,
+ "dht_dfoffset_ctx_t"
+ "initialization failed");
+ ret = -1;
+ goto out;
+ }
+
+ dir_dfmeta->fetch_entries = GF_CALLOC(local_subvols_cnt, sizeof(int),
+ gf_common_mt_int);
+ if (!dir_dfmeta->fetch_entries) {
+ gf_smsg(this->name, GF_LOG_ERROR, ENOMEM, DHT_MSG_INSUFF_MEMORY,
+ "for dir_dfmeta->fetch_entries", NULL);
+ ret = -1;
+ goto out;
+ }
+
+ for (i = 0; i < local_subvols_cnt; i++) {
+ INIT_LIST_HEAD(&(dir_dfmeta->equeue[i].list));
+ dir_dfmeta->head[i] = &(dir_dfmeta->equeue[i].list);
+ dir_dfmeta->iterator[i] = dir_dfmeta->head[i];
+ dir_dfmeta->fetch_entries[i] = 1;
+ }
+
+ xattr_req = dict_new();
+ if (!xattr_req) {
+ gf_log(this->name, GF_LOG_ERROR, "dict_new failed");
+ ret = -1;
+ goto out;
+ }
+
+ ret = dict_set_uint32(xattr_req, conf->link_xattr_name, 256);
+ if (ret) {
+ gf_log(this->name, GF_LOG_ERROR,
+ "failed to set dict for "
+ "key: %s",
+ conf->link_xattr_name);
+ ret = -1;
+ goto out;
+ }
- if (defrag->wakeup_crawler) {
- defrag->wakeup_crawler = 0;
- }
+ /*
+ Job: Read entries from each local subvol and store the entries
+ in equeue array of linked list. Now pick one entry from the
+ equeue array in a round robin basis and add them to defrag Queue.
+ */
+ while (!dht_dfreaddirp_done(dir_dfmeta->offset_var, local_subvols_cnt)) {
+ pthread_mutex_lock(&defrag->dfq_mutex);
+ {
+ /*Throttle up: If reconfigured count is higher than
+ current thread count, wake up the sleeping threads
+ TODO: Need to refactor this. Instead of making the
+ thread sleep and wake, we should terminate and spawn
+ threads on-demand*/
+
+ if (defrag->recon_thread_count > defrag->current_thread_count) {
+ throttle_up = (defrag->recon_thread_count -
+ defrag->current_thread_count);
+ for (j = 0; j < throttle_up; j++) {
+ pthread_cond_signal(&defrag->df_wakeup_thread);
}
- pthread_mutex_unlock (&defrag->dfq_mutex);
-
- while (ldfq_count <= MAX_MIGRATE_QUEUE_COUNT &&
- !dht_dfreaddirp_done(dir_dfmeta->offset_var,
- local_subvols_cnt)) {
-
- ret = gf_defrag_get_entry (this, dfc_index, &container,
- loc, conf, defrag, fd,
- migrate_data, dir_dfmeta,
- xattr_req,
- &should_commit_hash);
- if (ret) {
- gf_log ("DHT", GF_LOG_INFO, "Found critical "
- "error from gf_defrag_get_entry");
- ret = -1;
- goto out;
- }
+ }
- /* Check if we got an entry, else we need to move the
- index to the next subvol */
- if (!container) {
- GF_CRAWL_INDEX_MOVE(dfc_index,
- local_subvols_cnt);
- continue;
- }
+ while (defrag->q_entry_count > MAX_MIGRATE_QUEUE_COUNT) {
+ defrag->wakeup_crawler = 1;
+ pthread_cond_wait(&defrag->rebalance_crawler_alarm,
+ &defrag->dfq_mutex);
+ }
- /* Q this entry in the dfq */
- pthread_mutex_lock (&defrag->dfq_mutex);
- {
- list_add_tail (&container->list,
- &(defrag->queue[0].list));
- defrag->q_entry_count++;
- ldfq_count = defrag->q_entry_count;
-
- gf_msg_debug (this->name, 0, "added "
- "file:%s parent:%s to the queue ",
- container->df_entry->d_name,
- container->parent_loc->path);
-
- pthread_cond_signal (
- &defrag->parallel_migration_cond);
- }
- pthread_mutex_unlock (&defrag->dfq_mutex);
+ ldfq_count = defrag->q_entry_count;
- GF_CRAWL_INDEX_MOVE(dfc_index, local_subvols_cnt);
- }
+ if (defrag->wakeup_crawler) {
+ defrag->wakeup_crawler = 0;
+ }
}
+ pthread_mutex_unlock(&defrag->dfq_mutex);
- gettimeofday (&end, NULL);
- elapsed = (end.tv_sec - dir_start.tv_sec) * 1e6 +
- (end.tv_usec - dir_start.tv_usec);
- gf_log (this->name, GF_LOG_INFO, "Migration operation on dir %s took "
- "%.2f secs", loc->path, elapsed/1e6);
- ret = 0;
-out:
+ while (
+ ldfq_count <= MAX_MIGRATE_QUEUE_COUNT &&
+ !dht_dfreaddirp_done(dir_dfmeta->offset_var, local_subvols_cnt)) {
+ ret = gf_defrag_get_entry(this, dfc_index, &container, loc, conf,
+ defrag, dir_dfmeta->lfd[dfc_index],
+ migrate_data, dir_dfmeta, xattr_req,
+ perrno);
- GF_FREE_DIR_DFMETA (dir_dfmeta);
-
- if (dict)
- dict_unref(dict);
-
- if (xattr_req)
- dict_unref(xattr_req);
-
- if (fd)
- fd_unref (fd);
+ if (defrag->defrag_status == GF_DEFRAG_STATUS_STOPPED) {
+ goto out;
+ }
- if (ret == 0 && should_commit_hash == 0) {
- ret = 2;
- }
+ if (ret) {
+ gf_log(this->name, GF_LOG_WARNING,
+ "Found "
+ "error from gf_defrag_get_entry");
- return ret;
+ ret = -1;
+ goto out;
+ }
+
+ /* Check if we got an entry, else we need to move the
+ index to the next subvol */
+ if (!container) {
+ GF_CRAWL_INDEX_MOVE(dfc_index, local_subvols_cnt);
+ continue;
+ }
+
+ /* Q this entry in the dfq */
+ pthread_mutex_lock(&defrag->dfq_mutex);
+ {
+ list_add_tail(&container->list, &(defrag->queue[0].list));
+ defrag->q_entry_count++;
+ ldfq_count = defrag->q_entry_count;
+
+ gf_msg_debug(this->name, 0,
+ "added "
+ "file:%s parent:%s to the queue ",
+ container->df_entry->d_name,
+ container->parent_loc->path);
+
+ pthread_cond_signal(&defrag->parallel_migration_cond);
+ }
+ pthread_mutex_unlock(&defrag->dfq_mutex);
+
+ GF_CRAWL_INDEX_MOVE(dfc_index, local_subvols_cnt);
+ }
+ }
+
+ gettimeofday(&end, NULL);
+ elapsed = gf_tvdiff(&dir_start, &end);
+ gf_log(this->name, GF_LOG_INFO,
+ "Migration operation on dir %s took "
+ "%.2f secs",
+ loc->path, elapsed / 1e6);
+ ret = 0;
+out:
+ THIS = old_THIS;
+ gf_defrag_free_dir_dfmeta(dir_dfmeta, local_subvols_cnt);
+
+ if (xattr_req)
+ dict_unref(xattr_req);
+
+ /* It does not matter if it errored out - this number is
+ * used to calculate rebalance estimated time to complete.
+ * No locking required as dirs are processed by a single thread.
+ */
+ defrag->num_dirs_processed++;
+ return ret;
}
+
int
-gf_defrag_settle_hash (xlator_t *this, gf_defrag_info_t *defrag,
- loc_t *loc, dict_t *fix_layout)
+gf_defrag_settle_hash(xlator_t *this, gf_defrag_info_t *defrag, loc_t *loc,
+ dict_t *fix_layout)
{
- int ret;
- dht_conf_t *conf = NULL;
- /*
- * Now we're ready to update the directory commit hash for the volume
- * root, so that hash miscompares and broadcast lookups can stop.
- * However, we want to skip that if fix-layout is all we did. In
- * that case, we want the miscompares etc. to continue until a real
- * rebalance is complete.
+ int ret;
+ dht_conf_t *conf = NULL;
+ /*
+ * Now we're ready to update the directory commit hash for the volume
+ * root, so that hash miscompares and broadcast lookups can stop.
+ * However, we want to skip that if fix-layout is all we did. In
+ * that case, we want the miscompares etc. to continue until a real
+ * rebalance is complete.
+ */
+ if (defrag->cmd == GF_DEFRAG_CMD_START_LAYOUT_FIX ||
+ defrag->cmd == GF_DEFRAG_CMD_DETACH_START) {
+ return 0;
+ }
+
+ conf = this->private;
+ if (!conf) {
+ /*Uh oh
*/
- if (defrag->cmd == GF_DEFRAG_CMD_START_LAYOUT_FIX
- || defrag->cmd == GF_DEFRAG_CMD_START_DETACH_TIER) {
- return 0;
- }
+ return -1;
+ }
- conf = this->private;
- if (!conf) {
- /*Uh oh
- */
- return -1;
- }
+ if (conf->local_subvols_cnt == 0 || !conf->lookup_optimize) {
+ /* Commit hash updates are only done on local subvolumes and
+ * only when lookup optimization is needed (for older client
+ * support)
+ */
+ return 0;
+ }
- if (conf->local_subvols_cnt == 0 || !conf->lookup_optimize) {
- /* Commit hash updates are only done on local subvolumes and
- * only when lookup optmization is needed (for older client
- * support)
- */
- return 0;
- }
+ ret = dict_set_uint32(fix_layout, "new-commit-hash",
+ defrag->new_commit_hash);
+ if (ret) {
+ gf_log(this->name, GF_LOG_ERROR, "Failed to set new-commit-hash");
+ return -1;
+ }
- ret = dict_set_uint32 (fix_layout, "new-commit-hash",
- defrag->new_commit_hash);
- if (ret) {
- gf_log (this->name, GF_LOG_ERROR,
- "Failed to set new-commit-hash");
- return -1;
- }
+ ret = syncop_setxattr(this, loc, fix_layout, 0, NULL, NULL);
+ if (ret) {
+ gf_msg(this->name, GF_LOG_ERROR, -ret, DHT_MSG_LAYOUT_FIX_FAILED,
+ "fix layout on %s failed", loc->path);
- ret = syncop_setxattr (this, loc, fix_layout, 0, NULL, NULL);
- if (ret) {
- gf_log (this->name, GF_LOG_ERROR,
- "fix layout on %s failed", loc->path);
- return -1;
+ if (-ret == ENOENT || -ret == ESTALE) {
+ /* Dir most likely is deleted */
+ return 0;
}
- /* TBD: find more efficient solution than adding/deleting every time */
- dict_del(fix_layout, "new-commit-hash");
-
- return 0;
-}
+ return -1;
+ }
+ /* TBD: find more efficient solution than adding/deleting every time */
+ dict_del(fix_layout, "new-commit-hash");
+ return 0;
+}
-/* Function for doing a named lookup on file inodes during an attach tier
- * So that a hardlink lookup heal i.e gfid to parent gfid lookup heal
- * happens on pre-existing data. This is required so that the ctr database has
- * hardlinks of all the exisitng file in the volume. CTR xlator on the
- * brick/server side does db update/insert of the hardlink on a namelookup.
- * Currently the namedlookup is done synchronous to the fixlayout that is
- * triggered by attach tier. This is not performant, adding more time to
- * fixlayout. The performant approach is record the hardlinks on a compressed
- * datastore and then do the namelookup asynchronously later, giving the ctr db
- * eventual consistency
- * */
int
-gf_fix_layout_tier_attach_lookup (xlator_t *this,
- loc_t *parent_loc,
- gf_dirent_t *file_dentry)
+gf_defrag_fix_layout(xlator_t *this, gf_defrag_info_t *defrag, loc_t *loc,
+ dict_t *fix_layout, dict_t *migrate_data)
{
- int ret = -1;
- dict_t *lookup_xdata = NULL;
- dht_conf_t *conf = NULL;
- loc_t file_loc = {0,};
- struct iatt iatt = {0,};
-
- GF_VALIDATE_OR_GOTO ("tier", this, out);
-
- GF_VALIDATE_OR_GOTO (this->name, parent_loc, out);
-
- GF_VALIDATE_OR_GOTO (this->name, file_dentry, out);
+ int ret = -1;
+ loc_t entry_loc = {
+ 0,
+ };
+ fd_t *fd = NULL;
+ gf_dirent_t entries;
+ gf_dirent_t *tmp = NULL;
+ gf_dirent_t *entry = NULL;
+ gf_boolean_t free_entries = _gf_false;
+ off_t offset = 0;
+ struct iatt iatt = {
+ 0,
+ };
+ inode_t *linked_inode = NULL, *inode = NULL;
+ dht_conf_t *conf = NULL;
+ int perrno = 0;
+
+ conf = this->private;
+ if (!conf) {
+ ret = -1;
+ goto out;
+ }
- GF_VALIDATE_OR_GOTO (this->name, this->private, out);
+ ret = syncop_lookup(this, loc, &iatt, NULL, NULL, NULL);
+ if (ret) {
+ if (strcmp(loc->path, "/") == 0) {
+ gf_msg(this->name, GF_LOG_ERROR, -ret, DHT_MSG_DIR_LOOKUP_FAILED,
+ "lookup failed for:%s", loc->path);
- if (!parent_loc->inode) {
- gf_msg (this->name, GF_LOG_ERROR, 0, DHT_MSG_LOG_TIER_ERROR,
- "%s/%s parent is NULL", parent_loc->path,
- file_dentry->d_name);
- goto out;
+ defrag->total_failures++;
+ ret = -1;
+ goto out;
}
+ if (-ret == ENOENT || -ret == ESTALE) {
+ gf_msg(this->name, GF_LOG_INFO, -ret, DHT_MSG_DIR_LOOKUP_FAILED,
+ "Dir:%s renamed or removed. Skipping", loc->path);
+ if (conf->decommission_subvols_cnt) {
+ defrag->total_failures++;
+ }
+ ret = 0;
+ goto out;
+ } else {
+ gf_msg(this->name, GF_LOG_ERROR, -ret, DHT_MSG_DIR_LOOKUP_FAILED,
+ "lookup failed for:%s", loc->path);
- conf = this->private;
-
- loc_wipe (&file_loc);
-
- if (gf_uuid_is_null (file_dentry->d_stat.ia_gfid)) {
- gf_msg (this->name, GF_LOG_ERROR, 0, DHT_MSG_LOG_TIER_ERROR,
- "%s/%s gfid not present", parent_loc->path,
- file_dentry->d_name);
- goto out;
+ defrag->total_failures++;
+ goto out;
}
+ }
- gf_uuid_copy (file_loc.gfid, file_dentry->d_stat.ia_gfid);
+ fd = fd_create(loc->inode, defrag->pid);
+ if (!fd) {
+ gf_log(this->name, GF_LOG_ERROR, "Failed to create fd");
+ ret = -1;
+ goto out;
+ }
- if (gf_uuid_is_null (parent_loc->gfid)) {
- gf_msg (this->name, GF_LOG_ERROR, 0, DHT_MSG_LOG_TIER_ERROR,
- "%s/%s"
- " gfid not present", parent_loc->path,
- file_dentry->d_name);
- goto out;
+ ret = syncop_opendir(this, loc, fd, NULL, NULL);
+ if (ret) {
+ if (-ret == ENOENT || -ret == ESTALE) {
+ if (conf->decommission_subvols_cnt) {
+ defrag->total_failures++;
+ }
+ ret = 0;
+ goto out;
}
- gf_uuid_copy (file_loc.pargfid, parent_loc->gfid);
+ gf_log(this->name, GF_LOG_ERROR,
+ "Failed to open dir %s, "
+ "err:%d",
+ loc->path, -ret);
+ ret = -1;
+ goto out;
+ }
- ret = dht_build_child_loc (this, &file_loc, parent_loc,
- file_dentry->d_name);
- if (ret) {
- gf_msg (this->name, GF_LOG_ERROR, 0, DHT_MSG_LOG_TIER_ERROR,
- "Child loc build failed");
- ret = -1;
- goto out;
- }
+ fd_bind(fd);
+ INIT_LIST_HEAD(&entries.list);
- lookup_xdata = dict_new ();
- if (!lookup_xdata) {
- gf_msg (this->name, GF_LOG_ERROR, 0, DHT_MSG_LOG_TIER_ERROR,
- "Failed creating lookup dict for %s",
- file_dentry->d_name);
+ while ((ret = syncop_readdirp(this, fd, 131072, offset, &entries, NULL,
+ NULL)) != 0) {
+ if (ret < 0) {
+ if (-ret == ENOENT || -ret == ESTALE) {
+ if (conf->decommission_subvols_cnt) {
+ defrag->total_failures++;
+ }
+ ret = 0;
goto out;
- }
+ }
- ret = dict_set_int32 (lookup_xdata, CTR_ATTACH_TIER_LOOKUP, 1);
- if (ret) {
- gf_msg (this->name, GF_LOG_ERROR, 0, DHT_MSG_LOG_TIER_ERROR,
- "Failed to set lookup flag");
- goto out;
+ gf_msg(this->name, GF_LOG_ERROR, -ret, DHT_MSG_READDIR_ERROR,
+ "readdirp failed for "
+ "path %s. Aborting fix-layout",
+ loc->path);
+
+ ret = -1;
+ goto out;
}
- gf_uuid_copy (file_loc.parent->gfid, parent_loc->gfid);
+ if (list_empty(&entries.list))
+ break;
- /* Sending lookup to cold tier only */
- ret = syncop_lookup (conf->subvolumes[0], &file_loc, &iatt,
- NULL, lookup_xdata, NULL);
- if (ret) {
- /* If the file does not exist on the cold tier than it must */
- /* have been discovered on the hot tier. This is not an error. */
- gf_msg (this->name, GF_LOG_INFO, 0, DHT_MSG_LOG_TIER_STATUS,
- "%s lookup to cold tier on attach heal failed", file_loc.path);
+ free_entries = _gf_true;
+
+ list_for_each_entry_safe(entry, tmp, &entries.list, list)
+ {
+ if (defrag->defrag_status != GF_DEFRAG_STATUS_STARTED) {
+ ret = 1;
goto out;
- }
+ }
- ret = 0;
+ offset = entry->d_off;
-out:
+ if (!strcmp(entry->d_name, ".") || !strcmp(entry->d_name, ".."))
+ continue;
+ if (!IA_ISDIR(entry->d_stat.ia_type)) {
+ continue;
+ }
+ loc_wipe(&entry_loc);
- loc_wipe (&file_loc);
+ ret = dht_build_child_loc(this, &entry_loc, loc, entry->d_name);
+ if (ret) {
+ gf_log(this->name, GF_LOG_ERROR,
+ "Child loc"
+ " build failed for entry: %s",
+ entry->d_name);
- if (lookup_xdata)
- dict_unref (lookup_xdata);
+ if (conf->decommission_in_progress) {
+ defrag->defrag_status = GF_DEFRAG_STATUS_FAILED;
- return ret;
-}
+ goto out;
+ } else {
+ continue;
+ }
+ }
+
+ if (gf_uuid_is_null(entry->d_stat.ia_gfid)) {
+ gf_log(this->name, GF_LOG_ERROR,
+ "%s/%s"
+ " gfid not present",
+ loc->path, entry->d_name);
+ continue;
+ }
+
+ gf_uuid_copy(entry_loc.gfid, entry->d_stat.ia_gfid);
+
+ /*In case the gfid stored in the inode by inode_link
+ * and the gfid obtained in the lookup differs, then
+ * client3_3_lookup_cbk will return ESTALE and proper
+ * error will be captured
+ */
+
+ linked_inode = inode_link(entry_loc.inode, loc->inode,
+ entry->d_name, &entry->d_stat);
+
+ inode = entry_loc.inode;
+ entry_loc.inode = linked_inode;
+ inode_unref(inode);
+
+ if (gf_uuid_is_null(loc->gfid)) {
+ gf_log(this->name, GF_LOG_ERROR,
+ "%s/%s"
+ " gfid not present",
+ loc->path, entry->d_name);
+ continue;
+ }
+
+ gf_uuid_copy(entry_loc.pargfid, loc->gfid);
+
+ ret = syncop_lookup(this, &entry_loc, &iatt, NULL, NULL, NULL);
+ if (ret) {
+ if (-ret == ENOENT || -ret == ESTALE) {
+ gf_msg(this->name, GF_LOG_INFO, -ret,
+ DHT_MSG_DIR_LOOKUP_FAILED,
+ "Dir:%s renamed or removed. "
+ "Skipping",
+ loc->path);
+ ret = 0;
+ if (conf->decommission_subvols_cnt) {
+ defrag->total_failures++;
+ }
+ continue;
+ } else {
+ gf_msg(this->name, GF_LOG_ERROR, -ret,
+ DHT_MSG_DIR_LOOKUP_FAILED, "lookup failed for:%s",
+ entry_loc.path);
+ defrag->total_failures++;
-int
-gf_defrag_fix_layout (xlator_t *this, gf_defrag_info_t *defrag, loc_t *loc,
- dict_t *fix_layout, dict_t *migrate_data)
-{
- int ret = -1;
- loc_t entry_loc = {0,};
- fd_t *fd = NULL;
- gf_dirent_t entries;
- gf_dirent_t *tmp = NULL;
- gf_dirent_t *entry = NULL;
- gf_boolean_t free_entries = _gf_false;
- off_t offset = 0;
- struct iatt iatt = {0,};
- inode_t *linked_inode = NULL, *inode = NULL;
- dht_conf_t *conf = NULL;
- int should_commit_hash = 1;
-
- conf = this->private;
- if (!conf) {
- ret = -1;
- goto out;
- }
+ if (conf->decommission_in_progress) {
+ defrag->defrag_status = GF_DEFRAG_STATUS_FAILED;
+ ret = -1;
+ goto out;
+ } else {
+ continue;
+ }
+ }
+ }
+ /* A return value of 2 means, either process_dir or
+ * lookup of a dir failed. Hence, don't commit hash
+ * for the current directory*/
+ ret = gf_defrag_fix_layout(this, defrag, &entry_loc, fix_layout,
+ migrate_data);
- ret = syncop_lookup (this, loc, &iatt, NULL, NULL, NULL);
- if (ret) {
- gf_log (this->name, GF_LOG_ERROR, "Lookup failed on %s",
- loc->path);
- ret = -1;
+ if (defrag->defrag_status == GF_DEFRAG_STATUS_STOPPED ||
+ defrag->defrag_status == GF_DEFRAG_STATUS_FAILED) {
goto out;
- }
-
- if ((defrag->cmd != GF_DEFRAG_CMD_START_TIER) &&
- (defrag->cmd != GF_DEFRAG_CMD_START_LAYOUT_FIX)) {
- ret = gf_defrag_process_dir (this, defrag, loc, migrate_data);
+ }
- if (ret && ret != 2) {
- defrag->total_failures++;
+ if (ret) {
+ gf_msg(this->name, GF_LOG_ERROR, 0, DHT_MSG_LAYOUT_FIX_FAILED,
+ "Fix layout failed for %s", entry_loc.path);
- gf_msg (this->name, GF_LOG_ERROR, 0,
- DHT_MSG_DEFRAG_PROCESS_DIR_FAILED,
- "gf_defrag_process_dir failed for directory: %s"
- , loc->path);
+ defrag->total_failures++;
- if (conf->decommission_in_progress) {
- goto out;
- }
+ if (conf->decommission_in_progress) {
+ defrag->defrag_status = GF_DEFRAG_STATUS_FAILED;
- should_commit_hash = 0;
- } else if (ret == 2) {
- should_commit_hash = 0;
+ goto out;
+ } else {
+ /* Let's not commit-hash if
+ * gf_defrag_fix_layout failed*/
+ continue;
}
- }
+ }
+ }
+
+ gf_dirent_free(&entries);
+ free_entries = _gf_false;
+ INIT_LIST_HEAD(&entries.list);
+ }
+
+ /* A directory layout is fixed only after its subdirs are healed to
+ * any newly added bricks. If the layout is fixed before subdirs are
+ * healed, the newly added brick will get a non-null layout.
+ * Any subdirs which hash to that layout will no longer show up
+ * in a directory listing until they are healed.
+ */
+
+ ret = syncop_setxattr(this, loc, fix_layout, 0, NULL, NULL);
+
+ /* In case of a race where the directory is deleted just before
+ * layout setxattr, the errors are updated in the layout structure.
+ * We can use this information to make a decision whether the directory
+ * is deleted entirely.
+ */
+ if (ret == 0) {
+ ret = dht_dir_layout_error_check(this, loc->inode);
+ ret = -ret;
+ }
+
+ if (ret) {
+ if (-ret == ENOENT || -ret == ESTALE) {
+ gf_msg(this->name, GF_LOG_INFO, -ret, DHT_MSG_LAYOUT_FIX_FAILED,
+ "Setxattr failed. Dir %s "
+ "renamed or removed",
+ loc->path);
+ if (conf->decommission_subvols_cnt) {
+ defrag->total_failures++;
+ }
+ ret = 0;
+ goto out;
+ } else {
+ gf_msg(this->name, GF_LOG_ERROR, -ret, DHT_MSG_LAYOUT_FIX_FAILED,
+ "Setxattr failed for %s", loc->path);
- gf_msg_trace (this->name, 0, "fix layout called on %s", loc->path);
+ defrag->total_failures++;
- fd = fd_create (loc->inode, defrag->pid);
- if (!fd) {
- gf_log (this->name, GF_LOG_ERROR, "Failed to create fd");
+ if (conf->decommission_in_progress) {
+ defrag->defrag_status = GF_DEFRAG_STATUS_FAILED;
ret = -1;
goto out;
+ }
}
+ }
+
+ if (defrag->cmd != GF_DEFRAG_CMD_START_LAYOUT_FIX) {
+ ret = gf_defrag_process_dir(this, defrag, loc, migrate_data, &perrno);
- ret = syncop_opendir (this, loc, fd, NULL, NULL);
if (ret) {
- gf_log (this->name, GF_LOG_ERROR, "Failed to open dir %s",
- loc->path);
- ret = -1;
+ if (perrno == ENOENT || perrno == ESTALE) {
+ ret = 0;
goto out;
- }
+ } else {
+ defrag->total_failures++;
- fd_bind (fd);
- INIT_LIST_HEAD (&entries.list);
- while ((ret = syncop_readdirp (this, fd, 131072, offset, &entries,
- NULL, NULL)) != 0)
- {
+ gf_msg(this->name, GF_LOG_ERROR, 0,
+ DHT_MSG_DEFRAG_PROCESS_DIR_FAILED,
+ "gf_defrag_process_dir failed for "
+ "directory: %s",
+ loc->path);
- if (ret < 0) {
- gf_log (this->name, GF_LOG_ERROR, "Readdir returned %s"
- ". Aborting fix-layout",strerror(-ret));
- ret = -1;
- goto out;
+ if (conf->decommission_in_progress) {
+ goto out;
}
+ }
+ }
+ }
- if (list_empty (&entries.list))
- break;
-
- free_entries = _gf_true;
-
- list_for_each_entry_safe (entry, tmp, &entries.list, list) {
- if (defrag->defrag_status != GF_DEFRAG_STATUS_STARTED) {
- ret = 1;
- goto out;
- }
-
- offset = entry->d_off;
-
- if (!strcmp (entry->d_name, ".") ||
- !strcmp (entry->d_name, ".."))
- continue;
- if (!IA_ISDIR (entry->d_stat.ia_type)) {
-
- /* If its a fix layout during the attach
- * tier operation do lookups on files
- * on cold subvolume so that there is a
- * CTR DB Lookup Heal triggered on existing
- * data.
- * */
- if (defrag->cmd == GF_DEFRAG_CMD_START_TIER) {
- gf_fix_layout_tier_attach_lookup
- (this, loc, entry);
- }
-
- continue;
- }
- loc_wipe (&entry_loc);
+ gf_msg_trace(this->name, 0, "fix layout called on %s", loc->path);
- ret = dht_build_child_loc (this, &entry_loc, loc,
- entry->d_name);
- if (ret) {
- gf_log (this->name, GF_LOG_ERROR, "Child loc"
- " build failed for entry: %s",
- entry->d_name);
+ if (gf_defrag_settle_hash(this, defrag, loc, fix_layout) != 0) {
+ defrag->total_failures++;
- if (conf->decommission_in_progress) {
- defrag->defrag_status =
- GF_DEFRAG_STATUS_FAILED;
+ gf_msg(this->name, GF_LOG_ERROR, 0, DHT_MSG_SETTLE_HASH_FAILED,
+ "Settle hash failed for %s", loc->path);
- goto out;
- } else {
- should_commit_hash = 0;
+ ret = -1;
- continue;
- }
- }
+ if (conf->decommission_in_progress) {
+ defrag->defrag_status = GF_DEFRAG_STATUS_FAILED;
+ goto out;
+ }
+ }
- if (gf_uuid_is_null (entry->d_stat.ia_gfid)) {
- gf_log (this->name, GF_LOG_ERROR, "%s/%s"
- " gfid not present", loc->path,
- entry->d_name);
- continue;
- }
+ ret = 0;
+out:
+ if (free_entries)
+ gf_dirent_free(&entries);
+ loc_wipe(&entry_loc);
- gf_uuid_copy (entry_loc.gfid, entry->d_stat.ia_gfid);
+ if (fd)
+ fd_unref(fd);
- /*In case the gfid stored in the inode by inode_link
- * and the gfid obtained in the lookup differs, then
- * client3_3_lookup_cbk will return ESTALE and proper
- * error will be captured
- */
+ return ret;
+}
- linked_inode = inode_link (entry_loc.inode, loc->inode,
- entry->d_name,
- &entry->d_stat);
+int
+dht_init_local_subvols_and_nodeuuids(xlator_t *this, dht_conf_t *conf,
+ loc_t *loc)
+{
+ dict_t *dict = NULL;
+ uuid_t *uuid_ptr = NULL;
+ int ret = -1;
+ int i = 0;
+ int j = 0;
+
+ /* Find local subvolumes */
+ ret = syncop_getxattr(this, loc, &dict, GF_REBAL_FIND_LOCAL_SUBVOL, NULL,
+ NULL);
+ if (ret && (ret != -ENODATA)) {
+ gf_msg(this->name, GF_LOG_ERROR, -ret, 0,
+ "local "
+ "subvolume determination failed with error: %d",
+ -ret);
+ ret = -1;
+ goto out;
+ }
+
+ if (!ret)
+ goto out;
+
+ ret = syncop_getxattr(this, loc, &dict, GF_REBAL_OLD_FIND_LOCAL_SUBVOL,
+ NULL, NULL);
+ if (ret) {
+ gf_msg(this->name, GF_LOG_ERROR, -ret, 0,
+ "local "
+ "subvolume determination failed with error: %d",
+ -ret);
+ ret = -1;
+ goto out;
+ }
+ ret = 0;
- inode = entry_loc.inode;
- entry_loc.inode = linked_inode;
- inode_unref (inode);
+out:
+ if (ret) {
+ return ret;
+ }
- if (gf_uuid_is_null (loc->gfid)) {
- gf_log (this->name, GF_LOG_ERROR, "%s/%s"
- " gfid not present", loc->path,
- entry->d_name);
- continue;
- }
+ for (i = 0; i < conf->local_subvols_cnt; i++) {
+ gf_msg(this->name, GF_LOG_INFO, 0, 0,
+ "local subvol: "
+ "%s",
+ conf->local_subvols[i]->name);
- gf_uuid_copy (entry_loc.pargfid, loc->gfid);
+ for (j = 0; j < conf->local_nodeuuids[i].count; j++) {
+ uuid_ptr = &(conf->local_nodeuuids[i].elements[j].uuid);
+ gf_msg(this->name, GF_LOG_INFO, 0, 0, "node uuid : %s",
+ uuid_utoa(*uuid_ptr));
+ }
+ }
- ret = syncop_lookup (this, &entry_loc, &iatt, NULL,
- NULL, NULL);
- /*Check whether it is ENOENT or ESTALE*/
- if (ret) {
- gf_log (this->name, GF_LOG_ERROR, "%s"
- " lookup failed with %d",
- entry_loc.path, -ret);
+ return ret;
+}
- if (!conf->decommission_in_progress &&
- -ret != ENOENT && -ret != ESTALE) {
- should_commit_hash = 0;
- }
+/* Functions for the rebalance estimates feature */
- continue;
- }
+uint64_t
+gf_defrag_subvol_file_size(xlator_t *this, loc_t *root_loc)
+{
+ int ret = -1;
+ struct statvfs buf = {
+ 0,
+ };
+
+ ret = syncop_statfs(this, root_loc, &buf, NULL, NULL);
+ if (ret) {
+ /* Aargh! */
+ return 0;
+ }
+ return ((buf.f_blocks - buf.f_bfree) * buf.f_frsize);
+}
- ret = syncop_setxattr (this, &entry_loc, fix_layout,
- 0, NULL, NULL);
- if (ret) {
- gf_log (this->name, GF_LOG_ERROR, "Setxattr "
- "failed for %s", entry_loc.path);
+uint64_t
+gf_defrag_total_file_size(xlator_t *this, loc_t *root_loc)
+{
+ dht_conf_t *conf = NULL;
+ int i = 0;
+ uint64_t size_files = 0;
+ uint64_t total_size = 0;
- defrag->total_failures++;
+ conf = this->private;
+ if (!conf) {
+ return 0;
+ }
+
+ for (i = 0; i < conf->local_subvols_cnt; i++) {
+ size_files = gf_defrag_subvol_file_size(conf->local_subvols[i],
+ root_loc);
+ total_size += size_files;
+ gf_msg(this->name, GF_LOG_INFO, 0, 0,
+ "local subvol: %s,"
+ "cnt = %" PRIu64,
+ conf->local_subvols[i]->name, size_files);
+ }
+
+ gf_msg(this->name, GF_LOG_INFO, 0, 0, "Total size files = %" PRIu64,
+ total_size);
+
+ return total_size;
+}
- /*Don't go for fix-layout of child subtree if"
- fix-layout failed*/
- if (conf->decommission_in_progress) {
- defrag->defrag_status =
- GF_DEFRAG_STATUS_FAILED;
+static void *
+dht_file_counter_thread(void *args)
+{
+ gf_defrag_info_t *defrag = NULL;
+ loc_t root_loc = {
+ 0,
+ };
+ struct timespec time_to_wait = {
+ 0,
+ };
+ uint64_t tmp_size = 0;
+
+ if (!args)
+ return NULL;
- ret = -1;
+ defrag = (gf_defrag_info_t *)args;
+ dht_build_root_loc(defrag->root_inode, &root_loc);
- goto out;
- } else {
- continue;
- }
- }
+ while (defrag->defrag_status == GF_DEFRAG_STATUS_STARTED) {
+ timespec_now(&time_to_wait);
+ time_to_wait.tv_sec += 600;
+ pthread_mutex_lock(&defrag->fc_mutex);
+ pthread_cond_timedwait(&defrag->fc_wakeup_cond, &defrag->fc_mutex,
+ &time_to_wait);
- /* A return value of 2 means, either process_dir or
- * lookup of a dir failed. Hence, don't commit hash
- * for the current directory*/
+ pthread_mutex_unlock(&defrag->fc_mutex);
- ret = gf_defrag_fix_layout (this, defrag, &entry_loc,
- fix_layout, migrate_data);
+ if (defrag->defrag_status != GF_DEFRAG_STATUS_STARTED)
+ break;
- if (ret && ret != 2) {
- gf_msg (this->name, GF_LOG_ERROR, 0,
- DHT_MSG_LAYOUT_FIX_FAILED,
- "Fix layout failed for %s",
- entry_loc.path);
+ tmp_size = gf_defrag_total_file_size(defrag->this, &root_loc);
- defrag->total_failures++;
+ gf_log("dht", GF_LOG_INFO, "tmp data size =%" PRIu64, tmp_size);
- if (conf->decommission_in_progress) {
- defrag->defrag_status =
- GF_DEFRAG_STATUS_FAILED;
+ if (!tmp_size) {
+ gf_msg("dht", GF_LOG_ERROR, 0, 0,
+ "Failed to get "
+ "the total data size. Unable to estimate "
+ "time to complete rebalance.");
+ } else {
+ g_totalsize = tmp_size;
+ gf_msg_debug("dht", 0, "total data size =%" PRIu64, g_totalsize);
+ }
+ }
- ret = -1;
+ return NULL;
+}
- goto out;
- } else {
- /* Let's not commit-hash if
- * gf_defrag_fix_layout failed*/
- continue;
- }
- }
+int
+gf_defrag_estimates_cleanup(xlator_t *this, gf_defrag_info_t *defrag,
+ pthread_t filecnt_thread)
+{
+ int ret = -1;
+
+ /* Wake up the filecounter thread.
+ * By now the defrag status will no longer be
+ * GF_DEFRAG_STATUS_STARTED so the thread will exit the loop.
+ */
+ pthread_mutex_lock(&defrag->fc_mutex);
+ {
+ pthread_cond_broadcast(&defrag->fc_wakeup_cond);
+ }
+ pthread_mutex_unlock(&defrag->fc_mutex);
+
+ ret = pthread_join(filecnt_thread, NULL);
+ if (ret) {
+ gf_msg("dht", GF_LOG_ERROR, ret, 0,
+ "file_counter_thread: pthread_join failed.");
+ ret = -1;
+ }
+ return ret;
+}
- if (ret != 2 &&
- gf_defrag_settle_hash (this, defrag, &entry_loc,
- fix_layout) != 0) {
- defrag->total_failures++;
+int
+gf_defrag_estimates_init(xlator_t *this, loc_t *loc, pthread_t *filecnt_thread)
+{
+ int ret = -1;
+ dht_conf_t *conf = NULL;
+ gf_defrag_info_t *defrag = NULL;
+
+ conf = this->private;
+ defrag = conf->defrag;
+
+ g_totalsize = gf_defrag_total_file_size(this, loc);
+ if (!g_totalsize) {
+ gf_msg(this->name, GF_LOG_ERROR, 0, 0,
+ "Failed to get "
+ "the total data size. Unable to estimate "
+ "time to complete rebalance.");
+ goto out;
+ }
+
+ ret = gf_thread_create(filecnt_thread, NULL, dht_file_counter_thread,
+ (void *)defrag, "dhtfcnt");
+
+ if (ret) {
+ gf_msg(this->name, GF_LOG_ERROR, ret, 0,
+ "Failed to "
+ "create the file counter thread ");
+ ret = -1;
+ goto out;
+ }
+ ret = 0;
+out:
+ return ret;
+}
- gf_msg (this->name, GF_LOG_ERROR, 0,
- DHT_MSG_SETTLE_HASH_FAILED,
- "Settle hash failed for %s",
- entry_loc.path);
+/* Init and cleanup functions for parallel file migration*/
+int
+gf_defrag_parallel_migration_init(xlator_t *this, gf_defrag_info_t *defrag,
+ pthread_t **tid_array, int *thread_index)
+{
+ int ret = -1;
+ int thread_spawn_count = 0;
+ int index = 0;
+ pthread_t *tid = NULL;
- ret = -1;
+ if (!defrag)
+ goto out;
- if (conf->decommission_in_progress) {
- defrag->defrag_status =
- GF_DEFRAG_STATUS_FAILED;
+ /* Initialize global entry queue */
+ defrag->queue = GF_CALLOC(1, sizeof(struct dht_container),
+ gf_dht_mt_container_t);
- goto out;
- }
- }
- }
- gf_dirent_free (&entries);
- free_entries = _gf_false;
- INIT_LIST_HEAD (&entries.list);
- }
+ if (!defrag->queue) {
+ gf_msg(this->name, GF_LOG_ERROR, ENOMEM, 0,
+ "Failed to initialise migration queue");
+ ret = -1;
+ goto out;
+ }
- ret = 0;
-out:
- if (free_entries)
- gf_dirent_free (&entries);
+ INIT_LIST_HEAD(&(defrag->queue[0].list));
- loc_wipe (&entry_loc);
+ thread_spawn_count = MAX(MAX_REBAL_THREADS, 4);
- if (fd)
- fd_unref (fd);
+ gf_msg_debug(this->name, 0, "thread_spawn_count: %d", thread_spawn_count);
- if (ret == 0 && should_commit_hash == 0) {
- ret = 2;
+ tid = GF_CALLOC(thread_spawn_count, sizeof(pthread_t),
+ gf_common_mt_pthread_t);
+ if (!tid) {
+ gf_msg(this->name, GF_LOG_ERROR, ENOMEM, 0,
+ "Failed to create migration threads");
+ ret = -1;
+ goto out;
+ }
+ defrag->current_thread_count = thread_spawn_count;
+
+ /*Spawn Threads Here*/
+ while (index < thread_spawn_count) {
+ ret = gf_thread_create(&(tid[index]), NULL, gf_defrag_task,
+ (void *)defrag, "dhtmig%d", (index + 1) & 0x3ff);
+ if (ret != 0) {
+ gf_msg("DHT", GF_LOG_ERROR, ret, 0, "Thread[%d] creation failed. ",
+ index);
+ ret = -1;
+ goto out;
+ } else {
+ gf_log("DHT", GF_LOG_INFO,
+ "Thread[%d] "
+ "creation successful",
+ index);
}
+ index++;
+ }
- return ret;
+ ret = 0;
+out:
+ *thread_index = index;
+ *tid_array = tid;
+ return ret;
}
int
-gf_defrag_start_crawl (void *data)
+gf_defrag_parallel_migration_cleanup(gf_defrag_info_t *defrag,
+ pthread_t *tid_array, int thread_index)
{
- xlator_t *this = NULL;
- dht_conf_t *conf = NULL;
- gf_defrag_info_t *defrag = NULL;
- int ret = -1;
- loc_t loc = {0,};
- struct iatt iatt = {0,};
- struct iatt parent = {0,};
- dict_t *fix_layout = NULL;
- dict_t *migrate_data = NULL;
- dict_t *status = NULL;
- dict_t *dict = NULL;
- glusterfs_ctx_t *ctx = NULL;
- dht_methods_t *methods = NULL;
- int i = 0;
- int thread_index = 0;
- int err = 0;
- int thread_spawn_count = 0;
- pthread_t tid[MAX_MIGRATOR_THREAD_COUNT];
-
- this = data;
- if (!this)
- goto exit;
-
- ctx = this->ctx;
- if (!ctx)
- goto exit;
-
- conf = this->private;
- if (!conf)
- goto exit;
-
- defrag = conf->defrag;
- if (!defrag)
- goto exit;
-
- gettimeofday (&defrag->start_time, NULL);
- dht_build_root_inode (this, &defrag->root_inode);
- if (!defrag->root_inode)
- goto out;
-
- dht_build_root_loc (defrag->root_inode, &loc);
+ int ret = -1;
+ int i = 0;
- /* fix-layout on '/' first */
+ if (!defrag)
+ goto out;
- ret = syncop_lookup (this, &loc, &iatt, &parent, NULL, NULL);
+ /* Wake up all migration threads */
+ pthread_mutex_lock(&defrag->dfq_mutex);
+ {
+ defrag->crawl_done = 1;
- if (ret) {
- gf_msg (this->name, GF_LOG_ERROR, 0,
- DHT_MSG_REBALANCE_START_FAILED,
- "Failed to start rebalance: look up on / failed");
- ret = -1;
- goto out;
- }
+ pthread_cond_broadcast(&defrag->parallel_migration_cond);
+ pthread_cond_broadcast(&defrag->df_wakeup_thread);
+ }
+ pthread_mutex_unlock(&defrag->dfq_mutex);
- fix_layout = dict_new ();
- if (!fix_layout) {
- ret = -1;
- goto out;
- }
+ /*Wait for all the threads to complete their task*/
+ for (i = 0; i < thread_index; i++) {
+ pthread_join(tid_array[i], NULL);
+ }
- /*
- * Unfortunately, we can't do special xattrs (like fix.layout) and
- * real ones in the same call currently, and changing it seems
- * riskier than just doing two calls.
- */
+ GF_FREE(tid_array);
- gf_log (this->name, GF_LOG_INFO, "%s using commit hash %u",
- __func__, conf->vol_commit_hash);
+ /* Cleanup the migration queue */
+ if (defrag->queue) {
+ gf_dirent_free(defrag->queue[0].df_entry);
+ INIT_LIST_HEAD(&(defrag->queue[0].list));
+ }
- ret = dict_set_uint32 (fix_layout, conf->commithash_xattr_name,
- conf->vol_commit_hash);
- if (ret) {
- gf_log (this->name, GF_LOG_ERROR,
- "Failed to set %s", conf->commithash_xattr_name);
- defrag->total_failures++;
- ret = -1;
- goto out;
- }
+ GF_FREE(defrag->queue);
- ret = syncop_setxattr (this, &loc, fix_layout, 0, NULL, NULL);
- if (ret) {
- gf_log (this->name, GF_LOG_ERROR, "fix layout on %s failed",
- loc.path);
- defrag->total_failures++;
- ret = -1;
- goto out;
- }
+ ret = 0;
+out:
+ return ret;
+}
- /* We now return to our regularly scheduled program. */
+int
+gf_defrag_start_crawl(void *data)
+{
+ xlator_t *this = NULL;
+ dht_conf_t *conf = NULL;
+ gf_defrag_info_t *defrag = NULL;
+ dict_t *fix_layout = NULL;
+ dict_t *migrate_data = NULL;
+ dict_t *status = NULL;
+ glusterfs_ctx_t *ctx = NULL;
+ call_frame_t *statfs_frame = NULL;
+ xlator_t *old_THIS = NULL;
+ int ret = -1;
+ loc_t loc = {
+ 0,
+ };
+ struct iatt iatt = {
+ 0,
+ };
+ struct iatt parent = {
+ 0,
+ };
+ int thread_index = 0;
+ pthread_t *tid = NULL;
+ pthread_t filecnt_thread;
+ gf_boolean_t fc_thread_started = _gf_false;
+
+ this = data;
+ if (!this)
+ goto exit;
+
+ ctx = this->ctx;
+ if (!ctx)
+ goto exit;
+
+ conf = this->private;
+ if (!conf)
+ goto exit;
+
+ defrag = conf->defrag;
+ if (!defrag)
+ goto exit;
+
+ defrag->start_time = gf_time();
+
+ dht_build_root_inode(this, &defrag->root_inode);
+ if (!defrag->root_inode)
+ goto out;
+
+ dht_build_root_loc(defrag->root_inode, &loc);
+
+ /* fix-layout on '/' first */
+
+ ret = syncop_lookup(this, &loc, &iatt, &parent, NULL, NULL);
+
+ if (ret) {
+ gf_msg(this->name, GF_LOG_ERROR, -ret, DHT_MSG_REBALANCE_START_FAILED,
+ "Failed to start rebalance: look up on / failed");
+ ret = -1;
+ goto out;
+ }
- ret = dict_set_str (fix_layout, GF_XATTR_FIX_LAYOUT_KEY, "yes");
- if (ret) {
- gf_msg (this->name, GF_LOG_ERROR, 0,
- DHT_MSG_REBALANCE_START_FAILED,
- "Failed to start rebalance:"
- "Failed to set dictionary value: key = %s",
- GF_XATTR_FIX_LAYOUT_KEY);
- defrag->total_failures++;
- ret = -1;
- goto out;
- }
+ old_THIS = THIS;
+ THIS = this;
- defrag->new_commit_hash = conf->vol_commit_hash;
+ statfs_frame = create_frame(this, this->ctx->pool);
+ if (!statfs_frame) {
+ gf_msg(this->name, GF_LOG_ERROR, DHT_MSG_NO_MEMORY, ENOMEM,
+ "Insufficient memory. Frame creation failed");
+ ret = -1;
+ goto out;
+ }
- ret = syncop_setxattr (this, &loc, fix_layout, 0, NULL, NULL);
- if (ret) {
- gf_msg (this->name, GF_LOG_ERROR, 0,
- DHT_MSG_REBALANCE_FAILED,
- "fix layout on %s failed",
- loc.path);
- defrag->total_failures++;
- ret = -1;
- goto out;
- }
+ /* async statfs update for honoring min-free-disk */
+ dht_get_du_info(statfs_frame, this, &loc);
+ THIS = old_THIS;
- if (defrag->cmd != GF_DEFRAG_CMD_START_LAYOUT_FIX) {
- migrate_data = dict_new ();
- if (!migrate_data) {
- defrag->total_failures++;
- ret = -1;
- goto out;
- }
- ret = dict_set_str (migrate_data, GF_XATTR_FILE_MIGRATE_KEY,
- (defrag->cmd == GF_DEFRAG_CMD_START_FORCE)
- ? "force" : "non-force");
- if (ret) {
- defrag->total_failures++;
- ret = -1;
- goto out;
- }
-
- /* Find local subvolumes */
- ret = syncop_getxattr (this, &loc, &dict,
- GF_REBAL_FIND_LOCAL_SUBVOL,
- NULL, NULL);
- if (ret) {
- gf_msg (this->name, GF_LOG_ERROR, 0, 0, "local "
- "subvolume determination failed with error: %d",
- -ret);
- ret = -1;
- goto out;
- }
+ fix_layout = dict_new();
+ if (!fix_layout) {
+ ret = -1;
+ goto out;
+ }
+
+ /*
+ * Unfortunately, we can't do special xattrs (like fix.layout) and
+ * real ones in the same call currently, and changing it seems
+ * riskier than just doing two calls.
+ */
+
+ gf_log(this->name, GF_LOG_INFO, "%s using commit hash %u", __func__,
+ conf->vol_commit_hash);
+
+ ret = dict_set_uint32(fix_layout, conf->commithash_xattr_name,
+ conf->vol_commit_hash);
+ if (ret) {
+ gf_log(this->name, GF_LOG_ERROR, "Failed to set %s",
+ conf->commithash_xattr_name);
+ defrag->total_failures++;
+ ret = -1;
+ goto out;
+ }
+
+ ret = syncop_setxattr(this, &loc, fix_layout, 0, NULL, NULL);
+ if (ret) {
+ gf_log(this->name, GF_LOG_ERROR,
+ "Failed to set commit hash on %s. "
+ "Rebalance cannot proceed.",
+ loc.path);
+ defrag->total_failures++;
+ ret = -1;
+ goto out;
+ }
+
+ /* We now return to our regularly scheduled program. */
+
+ ret = dict_set_str(fix_layout, GF_XATTR_FIX_LAYOUT_KEY, "yes");
+ if (ret) {
+ gf_msg(this->name, GF_LOG_ERROR, 0, DHT_MSG_REBALANCE_START_FAILED,
+ "Failed to start rebalance:"
+ "Failed to set dictionary value: key = %s",
+ GF_XATTR_FIX_LAYOUT_KEY);
+ defrag->total_failures++;
+ ret = -1;
+ goto out;
+ }
- for (i = 0 ; i < conf->local_subvols_cnt; i++) {
- gf_msg (this->name, GF_LOG_INFO, 0, 0, "local subvols "
- "are %s", conf->local_subvols[i]->name);
- }
+ defrag->new_commit_hash = conf->vol_commit_hash;
- /* Initialize global entry queue */
- defrag->queue = GF_CALLOC (1, sizeof (struct dht_container),
- gf_dht_mt_container_t);
+ ret = syncop_setxattr(this, &loc, fix_layout, 0, NULL, NULL);
+ if (ret) {
+ gf_msg(this->name, GF_LOG_ERROR, -ret, DHT_MSG_REBALANCE_FAILED,
+ "fix layout on %s failed", loc.path);
+ defrag->total_failures++;
+ ret = -1;
+ goto out;
+ }
- if (!defrag->queue) {
- gf_log (this->name, GF_LOG_INFO, "No memory for queue");
- ret = -1;
- goto out;
- }
+ if (defrag->cmd != GF_DEFRAG_CMD_START_LAYOUT_FIX) {
+ /* We need to migrate files */
- INIT_LIST_HEAD (&(defrag->queue[0].list));
-
- thread_spawn_count = MAX ((sysconf(_SC_NPROCESSORS_ONLN) - 4), 4);
-
- gf_msg_debug (this->name, 0, "thread_spawn_count: %d",
- thread_spawn_count);
-
- defrag->current_thread_count = thread_spawn_count;
-
- /*Spawn Threads Here*/
- while (thread_index < thread_spawn_count) {
- err = pthread_create(&(tid[thread_index]), NULL,
- &gf_defrag_task, (void *)defrag);
- if (err != 0) {
- gf_log ("DHT", GF_LOG_ERROR,
- "Thread[%d] creation failed. "
- "Aborting Rebalance",
- thread_index);
- ret = -1;
- goto out;
- } else {
- gf_log ("DHT", GF_LOG_INFO, "Thread[%d] "
- "creation successful", thread_index);
- }
- thread_index++;
- }
+ migrate_data = dict_new();
+ if (!migrate_data) {
+ defrag->total_failures++;
+ ret = -1;
+ goto out;
+ }
+ ret = dict_set_str(
+ migrate_data, GF_XATTR_FILE_MIGRATE_KEY,
+ (defrag->cmd == GF_DEFRAG_CMD_START_FORCE) ? "force" : "non-force");
+ if (ret) {
+ defrag->total_failures++;
+ ret = -1;
+ goto out;
}
- ret = gf_defrag_fix_layout (this, defrag, &loc, fix_layout,
- migrate_data);
- if (ret && ret != 2) {
- defrag->total_failures++;
- ret = -1;
- goto out;
+ ret = dht_init_local_subvols_and_nodeuuids(this, conf, &loc);
+ if (ret) {
+ ret = -1;
+ goto out;
}
- if (ret != 2 &&
- gf_defrag_settle_hash (this, defrag, &loc, fix_layout) != 0) {
- defrag->total_failures++;
- ret = -1;
- goto out;
+ /* Initialise the structures required for parallel migration */
+ ret = gf_defrag_parallel_migration_init(this, defrag, &tid,
+ &thread_index);
+ if (ret) {
+ gf_msg(this->name, GF_LOG_ERROR, 0, 0, "Aborting rebalance.");
+ goto out;
}
- if (defrag->cmd == GF_DEFRAG_CMD_START_TIER) {
- methods = &(conf->methods);
+ ret = gf_defrag_estimates_init(this, &loc, &filecnt_thread);
+ if (ret) {
+ /* Not a fatal error. Allow the rebalance to proceed*/
+ ret = 0;
+ } else {
+ fc_thread_started = _gf_true;
+ }
+ }
- methods->migration_other(this, defrag);
- if (defrag->cmd == GF_DEFRAG_CMD_START_DETACH_TIER) {
+ ret = gf_defrag_fix_layout(this, defrag, &loc, fix_layout, migrate_data);
+ if (ret) {
+ defrag->total_failures++;
+ ret = -1;
+ goto out;
+ }
- ret = dict_set_str (migrate_data,
- GF_XATTR_FILE_MIGRATE_KEY,
- "force");
- if (ret)
- goto out;
+ if (gf_defrag_settle_hash(this, defrag, &loc, fix_layout) != 0) {
+ defrag->total_failures++;
+ ret = -1;
+ goto out;
+ }
- }
- }
- gf_log ("DHT", GF_LOG_INFO, "crawling file-system completed");
+ gf_log("DHT", GF_LOG_INFO, "crawling file-system completed");
out:
- /* We are here means crawling the entire file system is done
- or something failed. Set defrag->crawl_done flag to intimate
- the migrator threads to exhaust the defrag->queue and terminate*/
- if (ret) {
- defrag->defrag_status = GF_DEFRAG_STATUS_FAILED;
- }
+ /* We are here means crawling the entire file system is done
+ or something failed. Set defrag->crawl_done flag to intimate
+ the migrator threads to exhaust the defrag->queue and terminate*/
- pthread_mutex_lock (&defrag->dfq_mutex);
- {
- defrag->crawl_done = 1;
+ if (ret) {
+ defrag->defrag_status = GF_DEFRAG_STATUS_FAILED;
+ }
- pthread_cond_broadcast (
- &defrag->parallel_migration_cond);
- pthread_cond_broadcast (
- &defrag->df_wakeup_thread);
- }
- pthread_mutex_unlock (&defrag->dfq_mutex);
+ gf_defrag_parallel_migration_cleanup(defrag, tid, thread_index);
- /*Wait for all the threads to complete their task*/
- for (i = 0; i < thread_index; i++) {
- pthread_join (tid[i], NULL);
- }
+ if ((defrag->defrag_status != GF_DEFRAG_STATUS_STOPPED) &&
+ (defrag->defrag_status != GF_DEFRAG_STATUS_FAILED)) {
+ defrag->defrag_status = GF_DEFRAG_STATUS_COMPLETE;
+ }
- if (defrag->queue) {
- gf_dirent_free (defrag->queue[0].df_entry);
- INIT_LIST_HEAD (&(defrag->queue[0].list));
- }
+ if (fc_thread_started) {
+ gf_defrag_estimates_cleanup(this, defrag, filecnt_thread);
+ }
- if ((defrag->defrag_status != GF_DEFRAG_STATUS_STOPPED) &&
- (defrag->defrag_status != GF_DEFRAG_STATUS_FAILED)) {
- defrag->defrag_status = GF_DEFRAG_STATUS_COMPLETE;
- }
+ dht_send_rebalance_event(this, defrag->cmd, defrag->defrag_status);
- LOCK (&defrag->lock);
- {
- status = dict_new ();
- gf_defrag_status_get (defrag, status);
- if (ctx && ctx->notify)
- ctx->notify (GF_EN_DEFRAG_STATUS, status);
- if (status)
- dict_unref (status);
- defrag->is_exiting = 1;
- }
- UNLOCK (&defrag->lock);
+ status = dict_new();
+ LOCK(&defrag->lock);
+ {
+ gf_defrag_status_get(conf, status);
+ if (ctx && ctx->notify)
+ ctx->notify(GF_EN_DEFRAG_STATUS, status);
+ if (status)
+ dict_unref(status);
+ defrag->is_exiting = 1;
+ }
+ UNLOCK(&defrag->lock);
- GF_FREE (defrag->queue);
+ GF_FREE(defrag);
+ conf->defrag = NULL;
- GF_FREE (defrag);
- conf->defrag = NULL;
+ if (migrate_data)
+ dict_unref(migrate_data);
- if (dict)
- dict_unref(dict);
+ if (statfs_frame) {
+ STACK_DESTROY(statfs_frame->root);
+ }
exit:
- return ret;
+ return ret;
}
-
static int
-gf_defrag_done (int ret, call_frame_t *sync_frame, void *data)
+gf_defrag_done(int ret, call_frame_t *sync_frame, void *data)
{
- gf_listener_stop (sync_frame->this);
+ gf_listener_stop(sync_frame->this);
- STACK_DESTROY (sync_frame->root);
- kill (getpid(), SIGTERM);
- return 0;
+ STACK_DESTROY(sync_frame->root);
+ kill(getpid(), SIGTERM);
+ return 0;
}
void *
-gf_defrag_start (void *data)
+gf_defrag_start(void *data)
{
- int ret = -1;
- call_frame_t *frame = NULL;
- dht_conf_t *conf = NULL;
- gf_defrag_info_t *defrag = NULL;
- xlator_t *this = NULL;
- xlator_t *old_THIS = NULL;
-
- this = data;
- conf = this->private;
- if (!conf)
- goto out;
+ int ret = -1;
+ call_frame_t *frame = NULL;
+ dht_conf_t *conf = NULL;
+ gf_defrag_info_t *defrag = NULL;
+ xlator_t *this = NULL;
+ xlator_t *old_THIS = NULL;
- defrag = conf->defrag;
- if (!defrag)
- goto out;
+ this = data;
+ conf = this->private;
+ if (!conf)
+ goto out;
- frame = create_frame (this, this->ctx->pool);
- if (!frame)
- goto out;
+ defrag = conf->defrag;
+ if (!defrag)
+ goto out;
- frame->root->pid = GF_CLIENT_PID_DEFRAG;
+ frame = create_frame(this, this->ctx->pool);
+ if (!frame)
+ goto out;
- defrag->pid = frame->root->pid;
+ frame->root->pid = GF_CLIENT_PID_DEFRAG;
- defrag->defrag_status = GF_DEFRAG_STATUS_STARTED;
+ defrag->pid = frame->root->pid;
- old_THIS = THIS;
- THIS = this;
- ret = synctask_new (this->ctx->env, gf_defrag_start_crawl,
- gf_defrag_done, frame, this);
-
- if (ret)
- gf_msg (this->name, GF_LOG_ERROR, 0,
- DHT_MSG_REBALANCE_START_FAILED,
- "Could not create task for rebalance");
- THIS = old_THIS;
-out:
- return NULL;
-}
-
-int
-gf_defrag_status_get (gf_defrag_info_t *defrag, dict_t *dict)
-{
- int ret = 0;
- uint64_t files = 0;
- uint64_t size = 0;
- uint64_t lookup = 0;
- uint64_t failures = 0;
- uint64_t skipped = 0;
- uint64_t promoted = 0;
- uint64_t demoted = 0;
- char *status = "";
- double elapsed = 0;
- struct timeval end = {0,};
-
-
- if (!defrag)
- goto out;
-
- ret = 0;
- if (defrag->defrag_status == GF_DEFRAG_STATUS_NOT_STARTED)
- goto out;
-
- files = defrag->total_files;
- size = defrag->total_data;
- lookup = defrag->num_files_lookedup;
- failures = defrag->total_failures;
- skipped = defrag->skipped;
- promoted = defrag->total_files_promoted;
- demoted = defrag->total_files_demoted;
-
- gettimeofday (&end, NULL);
-
- elapsed = end.tv_sec - defrag->start_time.tv_sec;
-
- if (!dict)
- goto log;
-
- ret = dict_set_uint64 (dict, "promoted", promoted);
- if (ret)
- gf_log (THIS->name, GF_LOG_WARNING,
- "failed to set promoted count");
-
- ret = dict_set_uint64 (dict, "demoted", demoted);
- if (ret)
- gf_log (THIS->name, GF_LOG_WARNING,
- "failed to set demoted count");
-
- ret = dict_set_uint64 (dict, "files", files);
- if (ret)
- gf_log (THIS->name, GF_LOG_WARNING,
- "failed to set file count");
-
- ret = dict_set_uint64 (dict, "size", size);
- if (ret)
- gf_log (THIS->name, GF_LOG_WARNING,
- "failed to set size of xfer");
-
- ret = dict_set_uint64 (dict, "lookups", lookup);
- if (ret)
- gf_log (THIS->name, GF_LOG_WARNING,
- "failed to set lookedup file count");
-
-
- ret = dict_set_int32 (dict, "status", defrag->defrag_status);
- if (ret)
- gf_log (THIS->name, GF_LOG_WARNING,
- "failed to set status");
- if (elapsed) {
- ret = dict_set_double (dict, "run-time", elapsed);
- if (ret)
- gf_log (THIS->name, GF_LOG_WARNING,
- "failed to set run-time");
- }
-
- ret = dict_set_uint64 (dict, "failures", failures);
- if (ret)
- gf_log (THIS->name, GF_LOG_WARNING,
- "failed to set failure count");
-
- ret = dict_set_uint64 (dict, "skipped", skipped);
- if (ret)
- gf_log (THIS->name, GF_LOG_WARNING,
- "failed to set skipped file count");
-log:
- switch (defrag->defrag_status) {
- case GF_DEFRAG_STATUS_NOT_STARTED:
- status = "not started";
- break;
- case GF_DEFRAG_STATUS_STARTED:
- status = "in progress";
- break;
- case GF_DEFRAG_STATUS_STOPPED:
- status = "stopped";
- break;
- case GF_DEFRAG_STATUS_COMPLETE:
- status = "completed";
- break;
- case GF_DEFRAG_STATUS_FAILED:
- status = "failed";
- break;
- default:
- break;
- }
-
- gf_msg (THIS->name, GF_LOG_INFO, 0, DHT_MSG_REBALANCE_STATUS,
- "Rebalance is %s. Time taken is %.2f secs",
- status, elapsed);
- gf_msg (THIS->name, GF_LOG_INFO, 0, DHT_MSG_REBALANCE_STATUS,
- "Files migrated: %"PRIu64", size: %"
- PRIu64", lookups: %"PRIu64", failures: %"PRIu64", skipped: "
- "%"PRIu64, files, size, lookup, failures, skipped);
+ defrag->defrag_status = GF_DEFRAG_STATUS_STARTED;
+ old_THIS = THIS;
+ THIS = this;
+ ret = synctask_new(this->ctx->env, gf_defrag_start_crawl, gf_defrag_done,
+ frame, this);
+ if (ret)
+ gf_msg(this->name, GF_LOG_ERROR, 0, DHT_MSG_REBALANCE_START_FAILED,
+ "Could not create task for rebalance");
+ THIS = old_THIS;
out:
- return 0;
-}
-
-void
-gf_defrag_set_pause_state (gf_tier_conf_t *tier_conf, tier_pause_state_t state)
-{
- pthread_mutex_lock (&tier_conf->pause_mutex);
- tier_conf->pause_state = state;
- pthread_mutex_unlock (&tier_conf->pause_mutex);
+ return NULL;
}
-tier_pause_state_t
-gf_defrag_get_pause_state (gf_tier_conf_t *tier_conf)
+uint64_t
+gf_defrag_get_estimates_based_on_size(dht_conf_t *conf)
{
- int state;
-
- pthread_mutex_lock (&tier_conf->pause_mutex);
- state = tier_conf->pause_state;
- pthread_mutex_unlock (&tier_conf->pause_mutex);
-
- return state;
-}
-
-tier_pause_state_t
-gf_defrag_check_pause_tier (gf_tier_conf_t *tier_conf)
-{
- int woke = 0;
- int state = -1;
-
- pthread_mutex_lock (&tier_conf->pause_mutex);
-
- if (tier_conf->pause_state == TIER_RUNNING)
- goto out;
+ gf_defrag_info_t *defrag = NULL;
+ double rate_processed = 0;
+ uint64_t total_processed = 0;
+ uint64_t tmp_count = 0;
+ uint64_t time_to_complete = 0;
+ double elapsed = 0;
- if (tier_conf->pause_state == TIER_PAUSED)
- goto out;
+ defrag = conf->defrag;
- if (tier_conf->promote_in_progress ||
- tier_conf->demote_in_progress)
- goto out;
+ if (!g_totalsize)
+ goto out;
- tier_conf->pause_state = TIER_PAUSED;
+ elapsed = gf_time() - defrag->start_time;
- if (tier_conf->pause_synctask) {
- synctask_wake (tier_conf->pause_synctask);
- tier_conf->pause_synctask = 0;
- woke = 1;
- }
+ /* Don't calculate the estimates for the first 10 minutes.
+ * It is unlikely to be accurate and estimates are not required
+ * if the process finishes in less than 10 mins.
+ */
- gf_msg ("tier", GF_LOG_DEBUG, 0,
- DHT_MSG_TIER_PAUSED,
- "woken %d", woke);
-out:
- state = tier_conf->pause_state;
+ if (elapsed < ESTIMATE_START_INTERVAL) {
+ gf_msg(THIS->name, GF_LOG_INFO, 0, 0,
+ "Rebalance estimates will not be available for the "
+ "first %d seconds.",
+ ESTIMATE_START_INTERVAL);
- pthread_mutex_unlock (&tier_conf->pause_mutex);
+ goto out;
+ }
- return state;
-}
-
-void
-gf_defrag_pause_tier_timeout (void *data)
-{
- xlator_t *this = NULL;
- dht_conf_t *conf = NULL;
- gf_defrag_info_t *defrag = NULL;
+ total_processed = defrag->size_processed;
- this = (xlator_t *) data;
- GF_VALIDATE_OR_GOTO ("tier", this, out);
+ /* rate at which files processed */
+ rate_processed = (total_processed) / elapsed;
- conf = this->private;
- GF_VALIDATE_OR_GOTO (this->name, conf, out);
+ tmp_count = g_totalsize;
- defrag = conf->defrag;
- GF_VALIDATE_OR_GOTO (this->name, defrag, out);
+ if (rate_processed) {
+ time_to_complete = (tmp_count) / rate_processed;
- gf_msg (this->name, GF_LOG_DEBUG, 0,
- DHT_MSG_TIER_PAUSED,
- "Request pause timer timeout");
+ } else {
+ gf_msg(THIS->name, GF_LOG_ERROR, 0, 0,
+ "Unable to calculate estimated time for rebalance");
+ }
- gf_defrag_check_pause_tier (&defrag->tier_conf);
+ gf_log(THIS->name, GF_LOG_INFO,
+ "TIME: (size) total_processed=%" PRIu64 " tmp_cnt = %" PRIu64
+ ","
+ "rate_processed=%f, elapsed = %f",
+ total_processed, tmp_count, rate_processed, elapsed);
out:
- return;
+ return time_to_complete;
}
int
-gf_defrag_pause_tier (xlator_t *this, gf_defrag_info_t *defrag)
+gf_defrag_status_get(dht_conf_t *conf, dict_t *dict)
{
- int ret = 0;
- struct timespec delta = {0,};
- int delay = 2;
+ int ret = 0;
+ uint64_t files = 0;
+ uint64_t size = 0;
+ uint64_t lookup = 0;
+ uint64_t failures = 0;
+ uint64_t skipped = 0;
+ char *status = "";
+ double elapsed = 0;
+ uint64_t time_to_complete = 0;
+ uint64_t time_left = 0;
+ gf_defrag_info_t *defrag = conf->defrag;
+
+ if (!defrag)
+ goto out;
+
+ ret = 0;
+ if (defrag->defrag_status == GF_DEFRAG_STATUS_NOT_STARTED)
+ goto out;
+
+ files = defrag->total_files;
+ size = defrag->total_data;
+ lookup = defrag->num_files_lookedup;
+ failures = defrag->total_failures;
+ skipped = defrag->skipped;
+
+ elapsed = gf_time() - defrag->start_time;
+
+ /* The rebalance is still in progress */
+
+ if (defrag->defrag_status == GF_DEFRAG_STATUS_STARTED) {
+ time_to_complete = gf_defrag_get_estimates_based_on_size(conf);
+
+ if (time_to_complete && (time_to_complete > elapsed))
+ time_left = time_to_complete - elapsed;
+
+ gf_log(THIS->name, GF_LOG_INFO,
+ "TIME: Estimated total time to complete (size)= %" PRIu64
+ " seconds, seconds left = %" PRIu64 "",
+ time_to_complete, time_left);
+ }
+
+ if (!dict)
+ goto log;
+
+ ret = dict_set_uint64(dict, "files", files);
+ if (ret)
+ gf_log(THIS->name, GF_LOG_WARNING, "failed to set file count");
+
+ ret = dict_set_uint64(dict, "size", size);
+ if (ret)
+ gf_log(THIS->name, GF_LOG_WARNING, "failed to set size of xfer");
+
+ ret = dict_set_uint64(dict, "lookups", lookup);
+ if (ret)
+ gf_log(THIS->name, GF_LOG_WARNING, "failed to set lookedup file count");
+
+ ret = dict_set_int32(dict, "status", defrag->defrag_status);
+ if (ret)
+ gf_log(THIS->name, GF_LOG_WARNING, "failed to set status");
+
+ ret = dict_set_double(dict, "run-time", elapsed);
+ if (ret)
+ gf_log(THIS->name, GF_LOG_WARNING, "failed to set run-time");
+
+ ret = dict_set_uint64(dict, "failures", failures);
+ if (ret)
+ gf_log(THIS->name, GF_LOG_WARNING, "failed to set failure count");
+
+ ret = dict_set_uint64(dict, "skipped", skipped);
+ if (ret)
+ gf_log(THIS->name, GF_LOG_WARNING, "failed to set skipped file count");
+
+ ret = dict_set_uint64(dict, "time-left", time_left);
+ if (ret)
+ gf_log(THIS->name, GF_LOG_WARNING, "failed to set time-left");
- if (defrag->defrag_status != GF_DEFRAG_STATUS_STARTED)
- goto out;
-
- /*
- * Set flag requesting to pause tiering. Wait 'delay' seconds for
- * tiering to actually stop as indicated by the pause state
- * before returning success or failure.
- */
- gf_defrag_set_pause_state (&defrag->tier_conf, TIER_REQUEST_PAUSE);
-
- /*
- * If migration is not underway, can pause immediately.
- */
- gf_defrag_check_pause_tier (&defrag->tier_conf);
- if (gf_defrag_get_pause_state (&defrag->tier_conf) == TIER_PAUSED)
- goto out;
-
- gf_msg (this->name, GF_LOG_DEBUG, 0,
- DHT_MSG_TIER_PAUSED,
- "Request pause tier");
-
- defrag->tier_conf.pause_synctask = synctask_get ();
- delta.tv_sec = delay;
- delta.tv_nsec = 0;
- defrag->tier_conf.pause_timer =
- gf_timer_call_after (this->ctx, delta,
- gf_defrag_pause_tier_timeout,
- this);
-
- synctask_yield (defrag->tier_conf.pause_synctask);
-
- if (gf_defrag_get_pause_state (&defrag->tier_conf) == TIER_PAUSED)
- goto out;
-
- gf_defrag_set_pause_state (&defrag->tier_conf, TIER_RUNNING);
-
- ret = -1;
+log:
+ switch (defrag->defrag_status) {
+ case GF_DEFRAG_STATUS_NOT_STARTED:
+ status = "not started";
+ break;
+ case GF_DEFRAG_STATUS_STARTED:
+ status = "in progress";
+ break;
+ case GF_DEFRAG_STATUS_STOPPED:
+ status = "stopped";
+ break;
+ case GF_DEFRAG_STATUS_COMPLETE:
+ status = "completed";
+ break;
+ case GF_DEFRAG_STATUS_FAILED:
+ status = "failed";
+ break;
+ default:
+ break;
+ }
+
+ gf_msg(THIS->name, GF_LOG_INFO, 0, DHT_MSG_REBALANCE_STATUS,
+ "Rebalance is %s. Time taken is %.2f secs", status, elapsed);
+ gf_msg(THIS->name, GF_LOG_INFO, 0, DHT_MSG_REBALANCE_STATUS,
+ "Files migrated: %" PRIu64 ", size: %" PRIu64 ", lookups: %" PRIu64
+ ", failures: %" PRIu64
+ ", skipped: "
+ "%" PRIu64,
+ files, size, lookup, failures, skipped);
out:
-
- gf_msg (this->name, GF_LOG_DEBUG, 0,
- DHT_MSG_TIER_PAUSED,
- "Pause tiering ret=%d", ret);
-
- return ret;
+ return 0;
}
int
-gf_defrag_resume_tier (xlator_t *this, gf_defrag_info_t *defrag)
+gf_defrag_stop(dht_conf_t *conf, gf_defrag_status_t status, dict_t *output)
{
- gf_msg (this->name, GF_LOG_DEBUG, 0,
- DHT_MSG_TIER_RESUME,
- "Pause end. Resume tiering");
+ /* TODO: set a variable 'stop_defrag' here, it should be checked
+ in defrag loop */
+ int ret = -1;
+ gf_defrag_info_t *defrag = conf->defrag;
- gf_defrag_set_pause_state (&defrag->tier_conf, TIER_RUNNING);
+ GF_ASSERT(defrag);
- return 0;
-}
+ if (defrag->defrag_status == GF_DEFRAG_STATUS_NOT_STARTED) {
+ goto out;
+ }
-int
-gf_defrag_start_detach_tier (gf_defrag_info_t *defrag)
-{
- defrag->cmd = GF_DEFRAG_CMD_START_DETACH_TIER;
+ gf_msg("", GF_LOG_INFO, 0, DHT_MSG_REBALANCE_STOPPED,
+ "Received stop command on rebalance");
+ defrag->defrag_status = status;
- return 0;
-}
-
-int
-gf_defrag_stop (gf_defrag_info_t *defrag, gf_defrag_status_t status,
- dict_t *output)
-{
- /* TODO: set a variable 'stop_defrag' here, it should be checked
- in defrag loop */
- int ret = -1;
- GF_ASSERT (defrag);
-
- if (defrag->defrag_status == GF_DEFRAG_STATUS_NOT_STARTED) {
- goto out;
- }
-
- gf_msg ("", GF_LOG_INFO, 0, DHT_MSG_REBALANCE_STOPPED,
- "Received stop command on rebalance");
- defrag->defrag_status = status;
-
- if (output)
- gf_defrag_status_get (defrag, output);
- ret = 0;
+ if (output)
+ gf_defrag_status_get(conf, output);
+ ret = 0;
out:
- gf_msg_debug ("", 0, "Returning %d", ret);
- return ret;
+ gf_msg_debug("", 0, "Returning %d", ret);
+ return ret;
}
diff --git a/xlators/cluster/dht/src/dht-rename.c b/xlators/cluster/dht/src/dht-rename.c
index 80047264f0e..d9dbf50492f 100644
--- a/xlators/cluster/dht/src/dht-rename.c
+++ b/xlators/cluster/dht/src/dht-rename.c
@@ -11,1420 +11,1987 @@
/* TODO: link(oldpath, newpath) fails if newpath already exists. DHT should
* delete the newpath if it gets EEXISTS from link() call.
*/
-#include "glusterfs.h"
-#include "xlator.h"
#include "dht-common.h"
-#include "defaults.h"
+#include "dht-lock.h"
+#include <glusterfs/defaults.h>
+int
+dht_rename_unlock(call_frame_t *frame, xlator_t *this);
+int32_t
+dht_rename_lock_cbk(call_frame_t *frame, void *cookie, xlator_t *this,
+ int32_t op_ret, int32_t op_errno, dict_t *xdata);
int
-dht_rename_dir_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
- int32_t op_ret, int32_t op_errno, struct iatt *stbuf,
- struct iatt *preoldparent, struct iatt *postoldparent,
- struct iatt *prenewparent, struct iatt *postnewparent,
- dict_t *xdata)
+dht_rename_unlock_cbk(call_frame_t *frame, void *cookie, xlator_t *this,
+ int32_t op_ret, int32_t op_errno, dict_t *xdata)
{
- dht_local_t *local = NULL;
- int this_call_cnt = 0;
- call_frame_t *prev = NULL;
- char gfid[GF_UUID_BUF_SIZE] = {0};
+ dht_local_t *local = NULL;
- local = frame->local;
- prev = cookie;
+ local = frame->local;
+ dht_set_fixed_dir_stat(&local->preoldparent);
+ dht_set_fixed_dir_stat(&local->postoldparent);
+ dht_set_fixed_dir_stat(&local->preparent);
+ dht_set_fixed_dir_stat(&local->postparent);
- if (op_ret == -1) {
- /* TODO: undo the damage */
- gf_uuid_unparse(local->loc.inode->gfid, gfid);
+ if (IA_ISREG(local->stbuf.ia_type))
+ DHT_STRIP_PHASE1_FLAGS(&local->stbuf);
- gf_msg (this->name, GF_LOG_INFO, op_errno,
- DHT_MSG_RENAME_FAILED,
- "Rename %s -> %s on %s failed, (gfid = %s)",
- local->loc.path, local->loc2.path,
- prev->this->name, gfid);
+ DHT_STACK_UNWIND(rename, frame, local->op_ret, local->op_errno,
+ &local->stbuf, &local->preoldparent, &local->postoldparent,
+ &local->preparent, &local->postparent, local->xattr);
+ return 0;
+}
- local->op_ret = op_ret;
- local->op_errno = op_errno;
- goto unwind;
- }
- /* TODO: construct proper stbuf for dir */
- /*
- * FIXME: is this the correct way to build stbuf and
- * parent bufs?
- */
- dht_iatt_merge (this, &local->stbuf, stbuf, prev->this);
- dht_iatt_merge (this, &local->preoldparent, preoldparent,
- prev->this);
- dht_iatt_merge (this, &local->postoldparent, postoldparent,
- prev->this);
- dht_iatt_merge (this, &local->preparent, prenewparent,
- prev->this);
- dht_iatt_merge (this, &local->postparent, postnewparent,
- prev->this);
+static void
+dht_rename_dir_unlock_src(call_frame_t *frame, xlator_t *this)
+{
+ dht_local_t *local = NULL;
+ local = frame->local;
+ dht_unlock_namespace(frame, &local->lock[0]);
+ return;
+}
-unwind:
- this_call_cnt = dht_frame_return (frame);
- if (is_last_call (this_call_cnt)) {
- WIPE (&local->preoldparent);
- WIPE (&local->postoldparent);
- WIPE (&local->preparent);
- WIPE (&local->postparent);
-
- DHT_STRIP_PHASE1_FLAGS (&local->stbuf);
- DHT_STACK_UNWIND (rename, frame, local->op_ret, local->op_errno,
- &local->stbuf, &local->preoldparent,
- &local->postoldparent,
- &local->preparent, &local->postparent, xdata);
- }
+static void
+dht_rename_dir_unlock_dst(call_frame_t *frame, xlator_t *this)
+{
+ dht_local_t *local = NULL;
+ int op_ret = -1;
+ char src_gfid[GF_UUID_BUF_SIZE] = {0};
+ char dst_gfid[GF_UUID_BUF_SIZE] = {0};
+
+ local = frame->local;
+
+ /* Unlock entrylk */
+ dht_unlock_entrylk_wrapper(frame, &local->lock[1].ns.directory_ns);
+
+ /* Unlock inodelk */
+ op_ret = dht_unlock_inodelk(frame, local->lock[1].ns.parent_layout.locks,
+ local->lock[1].ns.parent_layout.lk_count,
+ dht_rename_unlock_cbk);
+ if (op_ret < 0) {
+ uuid_utoa_r(local->loc.inode->gfid, src_gfid);
+
+ if (local->loc2.inode)
+ uuid_utoa_r(local->loc2.inode->gfid, dst_gfid);
+
+ if (IA_ISREG(local->stbuf.ia_type))
+ gf_msg(this->name, GF_LOG_WARNING, 0, DHT_MSG_UNLOCKING_FAILED,
+ "winding unlock inodelk failed "
+ "rename (%s:%s:%s %s:%s:%s), "
+ "stale locks left on bricks",
+ local->loc.path, src_gfid, local->src_cached->name,
+ local->loc2.path, dst_gfid,
+ local->dst_cached ? local->dst_cached->name : NULL);
+ else
+ gf_msg(this->name, GF_LOG_WARNING, 0, DHT_MSG_UNLOCKING_FAILED,
+ "winding unlock inodelk failed "
+ "rename (%s:%s %s:%s), "
+ "stale locks left on bricks",
+ local->loc.path, src_gfid, local->loc2.path, dst_gfid);
- return 0;
-}
+ dht_rename_unlock_cbk(frame, NULL, this, 0, 0, NULL);
+ }
+ return;
+}
+static int
+dht_rename_dir_unlock(call_frame_t *frame, xlator_t *this)
+{
+ dht_rename_dir_unlock_src(frame, this);
+ dht_rename_dir_unlock_dst(frame, this);
+ return 0;
+}
int
-dht_rename_hashed_dir_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
- int32_t op_ret, int32_t op_errno, struct iatt *stbuf,
- struct iatt *preoldparent,
- struct iatt *postoldparent,
- struct iatt *prenewparent,
- struct iatt *postnewparent, dict_t *xdata)
+dht_rename_dir_cbk(call_frame_t *frame, void *cookie, xlator_t *this,
+ int32_t op_ret, int32_t op_errno, struct iatt *stbuf,
+ struct iatt *preoldparent, struct iatt *postoldparent,
+ struct iatt *prenewparent, struct iatt *postnewparent,
+ dict_t *xdata)
{
- dht_conf_t *conf = NULL;
- dht_local_t *local = NULL;
- int call_cnt = 0;
- call_frame_t *prev = NULL;
- int i = 0;
- char gfid[GF_UUID_BUF_SIZE] = {0};
+ dht_conf_t *conf = NULL;
+ dht_local_t *local = NULL;
+ int this_call_cnt = 0;
+ xlator_t *prev = NULL;
+ int i = 0;
+ char gfid[GF_UUID_BUF_SIZE] = {0};
+ int subvol_cnt = -1;
+
+ conf = this->private;
+ local = frame->local;
+ prev = cookie;
+ subvol_cnt = dht_subvol_cnt(this, prev);
+ local->ret_cache[subvol_cnt] = op_ret;
+
+ if (op_ret == -1) {
+ gf_uuid_unparse(local->loc.inode->gfid, gfid);
- conf = this->private;
- local = frame->local;
- prev = cookie;
+ gf_msg(this->name, GF_LOG_INFO, op_errno, DHT_MSG_RENAME_FAILED,
+ "Rename %s -> %s on %s failed, (gfid = %s)", local->loc.path,
+ local->loc2.path, prev->name, gfid);
+
+ local->op_ret = op_ret;
+ local->op_errno = op_errno;
+ goto unwind;
+ }
+ /* TODO: construct proper stbuf for dir */
+ /*
+ * FIXME: is this the correct way to build stbuf and
+ * parent bufs?
+ */
+ dht_iatt_merge(this, &local->stbuf, stbuf);
+ dht_iatt_merge(this, &local->preoldparent, preoldparent);
+ dht_iatt_merge(this, &local->postoldparent, postoldparent);
+ dht_iatt_merge(this, &local->preparent, prenewparent);
+ dht_iatt_merge(this, &local->postparent, postnewparent);
+unwind:
+ this_call_cnt = dht_frame_return(frame);
+ if (is_last_call(this_call_cnt)) {
+ /* We get here with local->call_cnt == 0. Which means
+ * we are the only one executing this code, there is
+ * no contention. Therefore it's safe to manipulate or
+ * deref local->call_cnt directly (without locking).
+ */
+ if (local->ret_cache[conf->subvolume_cnt] == 0) {
+ /* count errant subvols in last field of ret_cache */
+ for (i = 0; i < conf->subvolume_cnt; i++) {
+ if (local->ret_cache[i] != 0)
+ ++local->ret_cache[conf->subvolume_cnt];
+ }
+ if (local->ret_cache[conf->subvolume_cnt]) {
+ /* undoing the damage:
+ * for all subvolumes, where rename
+ * succeeded, we perform the reverse operation
+ */
+ for (i = 0; i < conf->subvolume_cnt; i++) {
+ if (local->ret_cache[i] == 0)
+ ++local->call_cnt;
+ }
+ for (i = 0; i < conf->subvolume_cnt; i++) {
+ if (local->ret_cache[i])
+ continue;
- if (op_ret == -1) {
- /* TODO: undo the damage */
+ STACK_WIND(frame, dht_rename_dir_cbk, conf->subvolumes[i],
+ conf->subvolumes[i]->fops->rename, &local->loc2,
+ &local->loc, NULL);
+ }
- gf_uuid_unparse(local->loc.inode->gfid, gfid);
+ return 0;
+ }
+ }
- gf_msg (this->name, GF_LOG_INFO, op_errno,
- DHT_MSG_RENAME_FAILED,
- "rename %s -> %s on %s failed, (gfid = %s) ",
- local->loc.path, local->loc2.path,
- prev->this->name, gfid );
+ WIPE(&local->preoldparent);
+ WIPE(&local->postoldparent);
+ WIPE(&local->preparent);
+ WIPE(&local->postparent);
- local->op_ret = op_ret;
- local->op_errno = op_errno;
- goto unwind;
- }
- /* TODO: construct proper stbuf for dir */
- /*
- * FIXME: is this the correct way to build stbuf and
- * parent bufs?
- */
- dht_iatt_merge (this, &local->stbuf, stbuf, prev->this);
- dht_iatt_merge (this, &local->preoldparent, preoldparent,
- prev->this);
- dht_iatt_merge (this, &local->postoldparent, postoldparent,
- prev->this);
- dht_iatt_merge (this, &local->preparent, prenewparent,
- prev->this);
- dht_iatt_merge (this, &local->postparent, postnewparent,
- prev->this);
-
- call_cnt = local->call_cnt = conf->subvolume_cnt - 1;
-
- if (!local->call_cnt)
- goto unwind;
-
- for (i = 0; i < conf->subvolume_cnt; i++) {
- if (conf->subvolumes[i] == local->dst_hashed)
- continue;
- STACK_WIND (frame, dht_rename_dir_cbk,
- conf->subvolumes[i],
- conf->subvolumes[i]->fops->rename,
- &local->loc, &local->loc2, NULL);
- if (!--call_cnt)
- break;
- }
+ dht_rename_dir_unlock(frame, this);
+ }
+ return 0;
+}
- return 0;
-unwind:
- WIPE (&local->preoldparent);
- WIPE (&local->postoldparent);
- WIPE (&local->preparent);
- WIPE (&local->postparent);
+int
+dht_rename_hashed_dir_cbk(call_frame_t *frame, void *cookie, xlator_t *this,
+ int32_t op_ret, int32_t op_errno, struct iatt *stbuf,
+ struct iatt *preoldparent, struct iatt *postoldparent,
+ struct iatt *prenewparent, struct iatt *postnewparent,
+ dict_t *xdata)
+{
+ dht_conf_t *conf = NULL;
+ dht_local_t *local = NULL;
+ int call_cnt = 0;
+ xlator_t *prev = NULL;
+ int i = 0;
+ char gfid[GF_UUID_BUF_SIZE] = {0};
+
+ conf = this->private;
+ local = frame->local;
+ prev = cookie;
+
+ if (op_ret == -1) {
+ gf_uuid_unparse(local->loc.inode->gfid, gfid);
- DHT_STRIP_PHASE1_FLAGS (&local->stbuf);
- DHT_STACK_UNWIND (rename, frame, local->op_ret, local->op_errno,
- &local->stbuf, &local->preoldparent,
- &local->postoldparent,
- &local->preparent, &local->postparent, NULL);
+ gf_msg(this->name, GF_LOG_INFO, op_errno, DHT_MSG_RENAME_FAILED,
+ "rename %s -> %s on %s failed, (gfid = %s) ", local->loc.path,
+ local->loc2.path, prev->name, gfid);
+
+ local->op_ret = op_ret;
+ local->op_errno = op_errno;
+ goto unwind;
+ }
+ /* TODO: construct proper stbuf for dir */
+ /*
+ * FIXME: is this the correct way to build stbuf and
+ * parent bufs?
+ */
+ dht_iatt_merge(this, &local->stbuf, stbuf);
+ dht_iatt_merge(this, &local->preoldparent, preoldparent);
+ dht_iatt_merge(this, &local->postoldparent, postoldparent);
+ dht_iatt_merge(this, &local->preparent, prenewparent);
+ dht_iatt_merge(this, &local->postparent, postnewparent);
+
+ call_cnt = local->call_cnt = conf->subvolume_cnt - 1;
+
+ if (!local->call_cnt)
+ goto unwind;
+
+ for (i = 0; i < conf->subvolume_cnt; i++) {
+ if (conf->subvolumes[i] == local->dst_hashed)
+ continue;
+ STACK_WIND_COOKIE(
+ frame, dht_rename_dir_cbk, conf->subvolumes[i], conf->subvolumes[i],
+ conf->subvolumes[i]->fops->rename, &local->loc, &local->loc2, NULL);
+ if (!--call_cnt)
+ break;
+ }
+
+ return 0;
+unwind:
+ WIPE(&local->preoldparent);
+ WIPE(&local->postoldparent);
+ WIPE(&local->preparent);
+ WIPE(&local->postparent);
- return 0;
+ dht_rename_dir_unlock(frame, this);
+ return 0;
}
-
int
-dht_rename_dir_do (call_frame_t *frame, xlator_t *this)
+dht_rename_dir_do(call_frame_t *frame, xlator_t *this)
{
- dht_local_t *local = NULL;
+ dht_local_t *local = NULL;
- local = frame->local;
+ local = frame->local;
- if (local->op_ret == -1)
- goto err;
+ if (local->op_ret == -1)
+ goto err;
- local->op_ret = 0;
+ local->op_ret = 0;
- STACK_WIND (frame, dht_rename_hashed_dir_cbk,
- local->dst_hashed,
- local->dst_hashed->fops->rename,
- &local->loc, &local->loc2, NULL);
- return 0;
+ STACK_WIND_COOKIE(frame, dht_rename_hashed_dir_cbk, local->dst_hashed,
+ local->dst_hashed, local->dst_hashed->fops->rename,
+ &local->loc, &local->loc2, NULL);
+ return 0;
err:
- DHT_STACK_UNWIND (rename, frame, local->op_ret, local->op_errno, NULL, NULL,
- NULL, NULL, NULL, NULL);
- return 0;
+ dht_rename_dir_unlock(frame, this);
+ return 0;
}
-
int
-dht_rename_readdir_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
- int op_ret, int op_errno, gf_dirent_t *entries,
- dict_t *xdata)
+dht_rename_readdir_cbk(call_frame_t *frame, void *cookie, xlator_t *this,
+ int op_ret, int op_errno, gf_dirent_t *entries,
+ dict_t *xdata)
{
- dht_local_t *local = NULL;
- int this_call_cnt = -1;
- call_frame_t *prev = NULL;
-
- local = frame->local;
- prev = cookie;
-
- if (op_ret > 2) {
- gf_msg_trace (this->name, 0,
- "readdir on %s for %s returned %d entries",
- prev->this->name, local->loc.path, op_ret);
- local->op_ret = -1;
- local->op_errno = ENOTEMPTY;
- }
+ dht_local_t *local = NULL;
+ int this_call_cnt = -1;
+ xlator_t *prev = NULL;
- this_call_cnt = dht_frame_return (frame);
+ local = frame->local;
+ prev = cookie;
- if (is_last_call (this_call_cnt)) {
- dht_rename_dir_do (frame, this);
- }
+ if (op_ret > 2) {
+ gf_msg_trace(this->name, 0, "readdir on %s for %s returned %d entries",
+ prev->name, local->loc.path, op_ret);
+ local->op_ret = -1;
+ local->op_errno = ENOTEMPTY;
+ }
- return 0;
-}
+ this_call_cnt = dht_frame_return(frame);
+ if (is_last_call(this_call_cnt)) {
+ dht_rename_dir_do(frame, this);
+ }
+
+ return 0;
+}
int
-dht_rename_opendir_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
- int op_ret, int op_errno, fd_t *fd, dict_t *xdata)
+dht_rename_opendir_cbk(call_frame_t *frame, void *cookie, xlator_t *this,
+ int op_ret, int op_errno, fd_t *fd, dict_t *xdata)
{
- dht_local_t *local = NULL;
- int this_call_cnt = -1;
- call_frame_t *prev = NULL;
- char gfid[GF_UUID_BUF_SIZE] = {0};
-
- local = frame->local;
- prev = cookie;
-
+ dht_local_t *local = NULL;
+ int this_call_cnt = -1;
+ xlator_t *prev = NULL;
+ char gfid[GF_UUID_BUF_SIZE] = {0};
- if (op_ret == -1) {
+ local = frame->local;
+ prev = cookie;
- gf_uuid_unparse(local->loc.inode->gfid, gfid);
- gf_msg (this->name, GF_LOG_INFO, op_errno,
- DHT_MSG_OPENDIR_FAILED,
- "opendir on %s for %s failed,(gfid = %s) ",
- prev->this->name, local->loc.path, gfid);
- goto err;
- }
+ if (op_ret == -1) {
+ gf_uuid_unparse(local->loc.inode->gfid, gfid);
+ gf_msg(this->name, GF_LOG_INFO, op_errno, DHT_MSG_OPENDIR_FAILED,
+ "opendir on %s for %s failed,(gfid = %s) ", prev->name,
+ local->loc.path, gfid);
+ goto err;
+ }
- fd_bind (fd);
- STACK_WIND (frame, dht_rename_readdir_cbk,
- prev->this, prev->this->fops->readdir,
- local->fd, 4096, 0, NULL);
+ fd_bind(fd);
+ STACK_WIND_COOKIE(frame, dht_rename_readdir_cbk, prev, prev,
+ prev->fops->readdir, local->fd, 4096, 0, NULL);
- return 0;
+ return 0;
err:
- this_call_cnt = dht_frame_return (frame);
+ this_call_cnt = dht_frame_return(frame);
- if (is_last_call (this_call_cnt)) {
- dht_rename_dir_do (frame, this);
- }
+ if (is_last_call(this_call_cnt)) {
+ dht_rename_dir_do(frame, this);
+ }
- return 0;
+ return 0;
}
-
int
-dht_rename_dir (call_frame_t *frame, xlator_t *this)
+dht_rename_dir_lock2_cbk(call_frame_t *frame, void *cookie, xlator_t *this,
+ int32_t op_ret, int32_t op_errno, dict_t *xdata)
{
- dht_conf_t *conf = NULL;
- dht_local_t *local = NULL;
- int i = 0;
- int op_errno = -1;
+ dht_local_t *local = NULL;
+ char src_gfid[GF_UUID_BUF_SIZE] = {0};
+ char dst_gfid[GF_UUID_BUF_SIZE] = {0};
+ dht_conf_t *conf = NULL;
+ int i = 0;
+
+ local = frame->local;
+ conf = this->private;
+
+ if (op_ret < 0) {
+ uuid_utoa_r(local->loc.inode->gfid, src_gfid);
+
+ if (local->loc2.inode)
+ uuid_utoa_r(local->loc2.inode->gfid, dst_gfid);
+
+ gf_msg(this->name, GF_LOG_WARNING, op_errno, DHT_MSG_INODE_LK_ERROR,
+ "acquiring entrylk after inodelk failed"
+ "rename (%s:%s:%s %s:%s:%s)",
+ local->loc.path, src_gfid, local->src_cached->name,
+ local->loc2.path, dst_gfid,
+ local->dst_cached ? local->dst_cached->name : NULL);
+
+ local->op_ret = -1;
+ local->op_errno = op_errno;
+ goto err;
+ }
+
+ local->fd = fd_create(local->loc.inode, frame->root->pid);
+ if (!local->fd) {
+ op_errno = ENOMEM;
+ goto err;
+ }
+
+ local->op_ret = 0;
+
+ if (!local->dst_cached) {
+ dht_rename_dir_do(frame, this);
+ return 0;
+ }
+ for (i = 0; i < conf->subvolume_cnt; i++) {
+ STACK_WIND_COOKIE(frame, dht_rename_opendir_cbk, conf->subvolumes[i],
+ conf->subvolumes[i],
+ conf->subvolumes[i]->fops->opendir, &local->loc2,
+ local->fd, NULL);
+ }
- conf = frame->this->private;
- local = frame->local;
+ return 0;
- local->call_cnt = conf->subvolume_cnt;
+err:
+ /* No harm in calling an extra unlock */
+ dht_rename_dir_unlock(frame, this);
+ return 0;
+}
- for (i = 0; i < conf->subvolume_cnt; i++) {
- if (!conf->subvolume_status[i]) {
- gf_msg (this->name, GF_LOG_INFO, 0,
- DHT_MSG_RENAME_FAILED,
- "Rename dir failed: subvolume down (%s)",
- conf->subvolumes[i]->name);
- op_errno = ENOTCONN;
- goto err;
- }
- }
+int
+dht_rename_dir_lock1_cbk(call_frame_t *frame, void *cookie, xlator_t *this,
+ int32_t op_ret, int32_t op_errno, dict_t *xdata)
+{
+ dht_local_t *local = NULL;
+ char src_gfid[GF_UUID_BUF_SIZE] = {0};
+ char dst_gfid[GF_UUID_BUF_SIZE] = {0};
+ int ret = 0;
+ loc_t *loc = NULL;
+ xlator_t *subvol = NULL;
+
+ local = frame->local;
+
+ if (op_ret < 0) {
+ uuid_utoa_r(local->loc.inode->gfid, src_gfid);
+
+ if (local->loc2.inode)
+ uuid_utoa_r(local->loc2.inode->gfid, dst_gfid);
+
+ gf_msg(this->name, GF_LOG_WARNING, op_errno, DHT_MSG_INODE_LK_ERROR,
+ "acquiring entrylk after inodelk failed"
+ "rename (%s:%s:%s %s:%s:%s)",
+ local->loc.path, src_gfid, local->src_cached->name,
+ local->loc2.path, dst_gfid,
+ local->dst_cached ? local->dst_cached->name : NULL);
+
+ local->op_ret = -1;
+ local->op_errno = op_errno;
+ goto err;
+ }
+
+ if (local->current == &local->lock[0]) {
+ loc = &local->loc2;
+ subvol = local->dst_hashed;
+ local->current = &local->lock[1];
+ } else {
+ loc = &local->loc;
+ subvol = local->src_hashed;
+ local->current = &local->lock[0];
+ }
+ ret = dht_protect_namespace(frame, loc, subvol, &local->current->ns,
+ dht_rename_dir_lock2_cbk);
+ if (ret < 0) {
+ op_errno = EINVAL;
+ goto err;
+ }
+
+ return 0;
+err:
+ /* No harm in calling an extra unlock */
+ dht_rename_dir_unlock(frame, this);
+ return 0;
+}
- local->fd = fd_create (local->loc.inode, frame->root->pid);
- if (!local->fd) {
- op_errno = ENOMEM;
- goto err;
+/*
+ * If the hashed subvolumes of both source and dst are the different,
+ * lock in dictionary order of hashed subvol->name. This is important
+ * in case the parent directory is the same for both src and dst to
+ * prevent inodelk deadlocks when racing with a fix-layout op on the parent.
+ *
+ * If the hashed subvols are the same, use the gfid/name to determine
+ * the order of taking locks to prevent entrylk deadlocks when the parent
+ * dirs are the same.
+ *
+ */
+static int
+dht_order_rename_lock(call_frame_t *frame, loc_t **loc, xlator_t **subvol)
+{
+ int ret = 0;
+ int op_ret = 0;
+ dht_local_t *local = NULL;
+ char *src = NULL;
+ char *dst = NULL;
+
+ local = frame->local;
+
+ if (local->src_hashed->name == local->dst_hashed->name) {
+ ret = 0;
+ } else {
+ ret = strcmp(local->src_hashed->name, local->dst_hashed->name);
+ }
+
+ if (ret == 0) {
+ /* hashed subvols are the same for src and dst */
+ /* Entrylks need to be ordered*/
+
+ src = alloca(GF_UUID_BNAME_BUF_SIZE + strlen(local->loc.name) + 1);
+ if (!src) {
+ gf_msg(frame->this->name, GF_LOG_ERROR, ENOMEM, 0,
+ "Insufficient memory for src");
+ op_ret = -1;
+ goto out;
}
- local->op_ret = 0;
+ if (!gf_uuid_is_null(local->loc.pargfid))
+ uuid_utoa_r(local->loc.pargfid, src);
+ else if (local->loc.parent)
+ uuid_utoa_r(local->loc.parent->gfid, src);
+ else
+ src[0] = '\0';
- if (!local->dst_cached) {
- dht_rename_dir_do (frame, this);
- return 0;
- }
+ strcat(src, local->loc.name);
- for (i = 0; i < conf->subvolume_cnt; i++) {
- STACK_WIND (frame, dht_rename_opendir_cbk,
- conf->subvolumes[i],
- conf->subvolumes[i]->fops->opendir,
- &local->loc2, local->fd, NULL);
+ dst = alloca(GF_UUID_BNAME_BUF_SIZE + strlen(local->loc2.name) + 1);
+ if (!dst) {
+ gf_msg(frame->this->name, GF_LOG_ERROR, ENOMEM, 0,
+ "Insufficient memory for dst");
+ op_ret = -1;
+ goto out;
}
- return 0;
-
-err:
- op_errno = (op_errno == -1) ? errno : op_errno;
- DHT_STACK_UNWIND (rename, frame, -1, op_errno, NULL, NULL, NULL, NULL,
- NULL, NULL);
- return 0;
-}
+ if (!gf_uuid_is_null(local->loc2.pargfid))
+ uuid_utoa_r(local->loc2.pargfid, dst);
+ else if (local->loc2.parent)
+ uuid_utoa_r(local->loc2.parent->gfid, dst);
+ else
+ dst[0] = '\0';
+ strcat(dst, local->loc2.name);
+ ret = strcmp(src, dst);
+ }
+ if (ret <= 0) {
+ /*inodelk in dictionary order of hashed subvol names*/
+ /*entrylk in dictionary order of gfid/basename */
+ local->current = &local->lock[0];
+ *loc = &local->loc;
+ *subvol = local->src_hashed;
-static int
-dht_rename_track_for_changelog (xlator_t *this, dict_t *xattr,
- loc_t *oldloc, loc_t *newloc)
-{
- int ret = -1;
- dht_changelog_rename_info_t *info = NULL;
- char *name = NULL;
- int len1 = 0;
- int len2 = 0;
- int size = 0;
-
- if (!xattr || !oldloc || !newloc || !this)
- return ret;
-
- len1 = strlen (oldloc->name) + 1;
- len2 = strlen (newloc->name) + 1;
- size = sizeof (dht_changelog_rename_info_t) + len1 + len2;
-
- info = GF_CALLOC (size, sizeof(char), gf_common_mt_char);
- if (!info) {
- gf_msg (this->name, GF_LOG_ERROR, 0,
- DHT_MSG_DICT_SET_FAILED,
- "Failed to calloc memory");
- return ret;
- }
+ } else {
+ local->current = &local->lock[1];
+ *loc = &local->loc2;
+ *subvol = local->dst_hashed;
+ }
- gf_uuid_copy (info->old_pargfid, oldloc->pargfid);
- gf_uuid_copy (info->new_pargfid, newloc->pargfid);
-
- info->oldname_len = len1;
- info->newname_len = len2;
- strncpy (info->buffer, oldloc->name, len1);
- name = info->buffer + len1;
- strncpy (name, newloc->name, len2);
-
- ret = dict_set_bin (xattr, DHT_CHANGELOG_RENAME_OP_KEY,
- info, size);
- if (ret) {
- gf_msg (this->name, GF_LOG_ERROR, 0, DHT_MSG_DICT_SET_FAILED,
- "Failed to set dictionary value: key = %s,"
- " path = %s", DHT_CHANGELOG_RENAME_OP_KEY,
- oldloc->name);
- GF_FREE (info);
- }
+ op_ret = 0;
- return ret;
+out:
+ return op_ret;
}
+int
+dht_rename_dir(call_frame_t *frame, xlator_t *this)
+{
+ dht_conf_t *conf = NULL;
+ dht_local_t *local = NULL;
+ loc_t *loc = NULL;
+ xlator_t *subvol = NULL;
+ int i = 0;
+ int ret = 0;
+ int op_errno = -1;
+
+ conf = frame->this->private;
+ local = frame->local;
+
+ local->ret_cache = GF_CALLOC(conf->subvolume_cnt + 1, sizeof(int),
+ gf_dht_ret_cache_t);
+
+ if (local->ret_cache == NULL) {
+ op_errno = ENOMEM;
+ goto err;
+ }
+
+ local->call_cnt = conf->subvolume_cnt;
+
+ for (i = 0; i < conf->subvolume_cnt; i++) {
+ if (!conf->subvolume_status[i]) {
+ gf_msg(this->name, GF_LOG_INFO, 0, DHT_MSG_RENAME_FAILED,
+ "Rename dir failed: subvolume down (%s)",
+ conf->subvolumes[i]->name);
+ op_errno = ENOTCONN;
+ goto err;
+ }
+ }
+
+ /* Locks on src and dst needs to ordered which otherwise might cause
+ * deadlocks when rename (src, dst) and rename (dst, src) is done from
+ * two different clients
+ */
+ ret = dht_order_rename_lock(frame, &loc, &subvol);
+ if (ret) {
+ op_errno = ENOMEM;
+ goto err;
+ }
+
+ /* Rename must take locks on src to avoid lookup selfheal from
+ * recreating src on those subvols where the rename was successful.
+ * The locks can't be issued parallel as two different clients might
+ * attempt same rename command and be in dead lock.
+ */
+ ret = dht_protect_namespace(frame, loc, subvol, &local->current->ns,
+ dht_rename_dir_lock1_cbk);
+ if (ret < 0) {
+ op_errno = EINVAL;
+ goto err;
+ }
+
+ return 0;
+err:
+ DHT_STACK_UNWIND(rename, frame, -1, op_errno, NULL, NULL, NULL, NULL, NULL,
+ NULL);
+ return 0;
+}
-#define DHT_MARKER_DONT_ACCOUNT(xattr) do { \
- int tmp = -1; \
- if (!xattr) { \
- xattr = dict_new (); \
- if (!xattr) \
- break; \
- } \
- tmp = dict_set_str (xattr, GLUSTERFS_MARKER_DONT_ACCOUNT_KEY, \
- "yes"); \
- if (tmp) { \
- gf_msg (this->name, GF_LOG_ERROR, 0, \
- DHT_MSG_DICT_SET_FAILED, \
- "Failed to set dictionary value: key = %s," \
- " path = %s",GLUSTERFS_MARKER_DONT_ACCOUNT_KEY, \
- local->loc.path); \
- } \
- }while (0)
-
-
-#define DHT_CHANGELOG_TRACK_AS_RENAME(xattr, oldloc, newloc) do { \
- int tmp = -1; \
- if (!xattr) { \
- xattr = dict_new (); \
- if (!xattr) { \
- gf_msg (this->name, GF_LOG_ERROR, 0, \
- DHT_MSG_DICT_SET_FAILED, \
- "Failed to create dictionary to " \
- "track rename"); \
- break; \
- } \
- } \
- \
- tmp = dht_rename_track_for_changelog (this, xattr, \
- oldloc, newloc); \
- \
- if (tmp) { \
- gf_msg (this->name, GF_LOG_ERROR, 0, \
- DHT_MSG_DICT_SET_FAILED, \
- "Failed to set dictionary value: key = %s," \
- " path = %s", DHT_CHANGELOG_RENAME_OP_KEY, \
- (oldloc)->path); \
- } \
- } while (0)
-
-int
-dht_rename_unlock_cbk (call_frame_t *frame, void *cookie,
- xlator_t *this, int32_t op_ret, int32_t op_errno,
- dict_t *xdata)
+static int
+dht_rename_track_for_changelog(xlator_t *this, dict_t *xattr, loc_t *oldloc,
+ loc_t *newloc)
{
- dht_local_t *local = NULL;
+ int ret = -1;
+ dht_changelog_rename_info_t *info = NULL;
+ char *name = NULL;
+ int len1 = 0;
+ int len2 = 0;
+ int size = 0;
+
+ if (!xattr || !oldloc || !newloc || !this)
+ return ret;
- local = frame->local;
+ len1 = strlen(oldloc->name) + 1;
+ len2 = strlen(newloc->name) + 1;
+ size = sizeof(dht_changelog_rename_info_t) + len1 + len2;
- DHT_STRIP_PHASE1_FLAGS (&local->stbuf);
- DHT_STACK_UNWIND (rename, frame, local->op_ret, local->op_errno,
- &local->stbuf, &local->preoldparent,
- &local->postoldparent, &local->preparent,
- &local->postparent, NULL);
- return 0;
+ info = GF_CALLOC(size, sizeof(char), gf_common_mt_char);
+ if (!info) {
+ gf_msg(this->name, GF_LOG_ERROR, 0, DHT_MSG_DICT_SET_FAILED,
+ "Failed to calloc memory");
+ return ret;
+ }
+
+ gf_uuid_copy(info->old_pargfid, oldloc->pargfid);
+ gf_uuid_copy(info->new_pargfid, newloc->pargfid);
+
+ info->oldname_len = len1;
+ info->newname_len = len2;
+ strncpy(info->buffer, oldloc->name, len1);
+ name = info->buffer + len1;
+ strncpy(name, newloc->name, len2);
+
+ ret = dict_set_bin(xattr, DHT_CHANGELOG_RENAME_OP_KEY, info, size);
+ if (ret) {
+ gf_msg(this->name, GF_LOG_ERROR, 0, DHT_MSG_DICT_SET_FAILED,
+ "Failed to set dictionary value: key = %s,"
+ " path = %s",
+ DHT_CHANGELOG_RENAME_OP_KEY, oldloc->name);
+ GF_FREE(info);
+ }
+
+ return ret;
}
+#define DHT_MARKER_DONT_ACCOUNT(xattr) \
+ do { \
+ int tmp = -1; \
+ if (!xattr) { \
+ xattr = dict_new(); \
+ if (!xattr) \
+ break; \
+ } \
+ tmp = dict_set_str(xattr, GLUSTERFS_MARKER_DONT_ACCOUNT_KEY, "yes"); \
+ if (tmp) { \
+ gf_msg(this->name, GF_LOG_ERROR, 0, DHT_MSG_DICT_SET_FAILED, \
+ "Failed to set dictionary value: key = %s," \
+ " path = %s", \
+ GLUSTERFS_MARKER_DONT_ACCOUNT_KEY, local->loc.path); \
+ } \
+ } while (0)
+
+#define DHT_CHANGELOG_TRACK_AS_RENAME(xattr, oldloc, newloc) \
+ do { \
+ int tmp = -1; \
+ if (!xattr) { \
+ xattr = dict_new(); \
+ if (!xattr) { \
+ gf_msg(this->name, GF_LOG_ERROR, 0, DHT_MSG_DICT_SET_FAILED, \
+ "Failed to create dictionary to " \
+ "track rename"); \
+ break; \
+ } \
+ } \
+ \
+ tmp = dht_rename_track_for_changelog(this, xattr, oldloc, newloc); \
+ \
+ if (tmp) { \
+ gf_msg(this->name, GF_LOG_ERROR, 0, DHT_MSG_DICT_SET_FAILED, \
+ "Failed to set dictionary value: key = %s," \
+ " path = %s", \
+ DHT_CHANGELOG_RENAME_OP_KEY, (oldloc)->path); \
+ } \
+ } while (0)
+
int
-dht_rename_unlock (call_frame_t *frame, xlator_t *this)
+dht_rename_unlock(call_frame_t *frame, xlator_t *this)
{
- dht_local_t *local = NULL;
- int op_ret = -1;
- char src_gfid[GF_UUID_BUF_SIZE] = {0};
- char dst_gfid[GF_UUID_BUF_SIZE] = {0};
-
- local = frame->local;
-
- op_ret = dht_unlock_inodelk (frame, local->lock.locks,
- local->lock.lk_count,
- dht_rename_unlock_cbk);
- if (op_ret < 0) {
- uuid_utoa_r (local->loc.inode->gfid, src_gfid);
-
- if (local->loc2.inode)
- uuid_utoa_r (local->loc2.inode->gfid, dst_gfid);
-
- gf_msg (this->name, GF_LOG_WARNING, 0,
- DHT_MSG_UNLOCKING_FAILED,
- "winding unlock inodelk failed "
- "rename (%s:%s:%s %s:%s:%s), "
- "stale locks left on bricks",
- local->loc.path, src_gfid, local->src_cached->name,
- local->loc2.path, dst_gfid,
- local->dst_cached ? local->dst_cached->name : NULL);
-
- dht_rename_unlock_cbk (frame, NULL, this, 0, 0, NULL);
- }
-
- return 0;
+ dht_local_t *local = NULL;
+ int op_ret = -1;
+ char src_gfid[GF_UUID_BUF_SIZE] = {0};
+ char dst_gfid[GF_UUID_BUF_SIZE] = {0};
+ dht_ilock_wrap_t inodelk_wrapper = {
+ 0,
+ };
+
+ local = frame->local;
+ inodelk_wrapper.locks = local->rename_inodelk_backward_compatible;
+ inodelk_wrapper.lk_count = local->rename_inodelk_bc_count;
+
+ op_ret = dht_unlock_inodelk_wrapper(frame, &inodelk_wrapper);
+ if (op_ret < 0) {
+ uuid_utoa_r(local->loc.inode->gfid, src_gfid);
+
+ if (local->loc2.inode)
+ uuid_utoa_r(local->loc2.inode->gfid, dst_gfid);
+
+ if (IA_ISREG(local->stbuf.ia_type))
+ gf_msg(this->name, GF_LOG_WARNING, 0, DHT_MSG_UNLOCKING_FAILED,
+ "winding unlock inodelk failed "
+ "rename (%s:%s:%s %s:%s:%s), "
+ "stale locks left on bricks",
+ local->loc.path, src_gfid, local->src_cached->name,
+ local->loc2.path, dst_gfid,
+ local->dst_cached ? local->dst_cached->name : NULL);
+ else
+ gf_msg(this->name, GF_LOG_WARNING, 0, DHT_MSG_UNLOCKING_FAILED,
+ "winding unlock inodelk failed "
+ "rename (%s:%s %s:%s), "
+ "stale locks left on bricks",
+ local->loc.path, src_gfid, local->loc2.path, dst_gfid);
+ }
+
+ dht_unlock_namespace(frame, &local->lock[0]);
+ dht_unlock_namespace(frame, &local->lock[1]);
+
+ dht_rename_unlock_cbk(frame, NULL, this, local->op_ret, local->op_errno,
+ NULL);
+ return 0;
}
int
-dht_rename_done (call_frame_t *frame, xlator_t *this)
+dht_rename_done(call_frame_t *frame, xlator_t *this)
{
- dht_local_t *local = NULL;
+ dht_local_t *local = NULL;
- local = frame->local;
+ local = frame->local;
- if (local->linked == _gf_true) {
- local->linked = _gf_false;
- dht_linkfile_attr_heal (frame, this);
- }
+ if (local->linked == _gf_true) {
+ local->linked = _gf_false;
+ dht_linkfile_attr_heal(frame, this);
+ }
- dht_rename_unlock (frame, this);
- return 0;
+ dht_rename_unlock(frame, this);
+ return 0;
}
int
-dht_rename_unlink_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
- int32_t op_ret, int32_t op_errno, struct iatt *preparent,
- struct iatt *postparent, dict_t *xdata)
+dht_rename_unlink_cbk(call_frame_t *frame, void *cookie, xlator_t *this,
+ int32_t op_ret, int32_t op_errno, struct iatt *preparent,
+ struct iatt *postparent, dict_t *xdata)
{
- dht_local_t *local = NULL;
- call_frame_t *prev = NULL;
- int this_call_cnt = 0;
-
- local = frame->local;
- prev = cookie;
-
- if (!local) {
- gf_msg (this->name, GF_LOG_ERROR, 0,
- DHT_MSG_INVALID_VALUE,
- "!local, should not happen");
- goto out;
- }
+ dht_local_t *local = NULL;
+ xlator_t *prev = NULL;
+ int this_call_cnt = 0;
- this_call_cnt = dht_frame_return (frame);
+ local = frame->local;
+ prev = cookie;
- if (op_ret == -1) {
- gf_msg (this->name, GF_LOG_WARNING, op_errno,
- DHT_MSG_UNLINK_FAILED,
- "%s: Rename: unlink on %s failed ",
- local->loc.path, prev->this->name);
- }
+ FRAME_SU_UNDO(frame, dht_local_t);
+ if (!local) {
+ gf_msg(this->name, GF_LOG_ERROR, 0, DHT_MSG_INVALID_VALUE,
+ "!local, should not happen");
+ goto out;
+ }
- WIPE (&local->preoldparent);
- WIPE (&local->postoldparent);
- WIPE (&local->preparent);
- WIPE (&local->postparent);
+ this_call_cnt = dht_frame_return(frame);
- if (is_last_call (this_call_cnt)) {
- dht_rename_done (frame, this);
- }
+ if (op_ret == -1) {
+ gf_msg(this->name, GF_LOG_WARNING, op_errno, DHT_MSG_UNLINK_FAILED,
+ "%s: Rename: unlink on %s failed ", local->loc.path, prev->name);
+ }
+
+ WIPE(&local->preoldparent);
+ WIPE(&local->postoldparent);
+ WIPE(&local->preparent);
+ WIPE(&local->postparent);
+
+ if (is_last_call(this_call_cnt)) {
+ dht_rename_done(frame, this);
+ }
out:
- return 0;
+ return 0;
}
-
int
-dht_rename_cleanup (call_frame_t *frame)
+dht_rename_cleanup(call_frame_t *frame)
{
- dht_local_t *local = NULL;
- xlator_t *this = NULL;
- xlator_t *src_hashed = NULL;
- xlator_t *src_cached = NULL;
- xlator_t *dst_hashed = NULL;
- xlator_t *dst_cached = NULL;
- int call_cnt = 0;
- dict_t *xattr = NULL;
- char gfid[GF_UUID_BUF_SIZE] = {0};
-
- local = frame->local;
- this = frame->this;
-
- src_hashed = local->src_hashed;
- src_cached = local->src_cached;
- dst_hashed = local->dst_hashed;
- dst_cached = local->dst_cached;
-
- if (src_cached == dst_cached)
- goto nolinks;
-
- if (local->linked && (dst_hashed != src_hashed )&&
- (dst_hashed != src_cached)) {
- call_cnt++;
- }
+ dht_local_t *local = NULL;
+ xlator_t *this = NULL;
+ xlator_t *src_hashed = NULL;
+ xlator_t *src_cached = NULL;
+ xlator_t *dst_hashed = NULL;
+ xlator_t *dst_cached = NULL;
+ int call_cnt = 0;
+ dict_t *xattr = NULL;
+ char gfid[GF_UUID_BUF_SIZE] = {0};
- if (local->added_link && (src_cached != dst_hashed)) {
- call_cnt++;
- }
+ local = frame->local;
+ this = frame->this;
- local->call_cnt = call_cnt;
+ src_hashed = local->src_hashed;
+ src_cached = local->src_cached;
+ dst_hashed = local->dst_hashed;
+ dst_cached = local->dst_cached;
- if (!call_cnt)
- goto nolinks;
+ if (src_cached == dst_cached)
+ goto nolinks;
- DHT_MARK_FOP_INTERNAL (xattr);
+ if (local->linked && (dst_hashed != src_hashed) &&
+ (dst_hashed != src_cached)) {
+ call_cnt++;
+ }
- gf_uuid_unparse(local->loc.inode->gfid, gfid);
+ if (local->added_link && (src_cached != dst_hashed)) {
+ call_cnt++;
+ }
- if (local->linked && (dst_hashed != src_hashed) &&
- (dst_hashed != src_cached)) {
- dict_t *xattr_new = NULL;
+ local->call_cnt = call_cnt;
- gf_msg_trace (this->name, 0,
- "unlinking linkfile %s @ %s => %s, (gfid = %s)",
- local->loc.path, dst_hashed->name,
- src_cached->name, gfid);
+ if (!call_cnt)
+ goto nolinks;
- xattr_new = dict_copy_with_ref (xattr, NULL);
+ DHT_MARK_FOP_INTERNAL(xattr);
+ gf_uuid_unparse(local->loc.inode->gfid, gfid);
- DHT_MARKER_DONT_ACCOUNT(xattr_new);
+ if (local->linked && (dst_hashed != src_hashed) &&
+ (dst_hashed != src_cached)) {
+ dict_t *xattr_new = NULL;
- STACK_WIND (frame, dht_rename_unlink_cbk,
- dst_hashed, dst_hashed->fops->unlink,
- &local->loc, 0, xattr_new);
+ gf_msg_trace(this->name, 0,
+ "unlinking linkfile %s @ %s => %s, (gfid = %s)",
+