From 77adf4cd648dce41f89469dd185deec6b6b53a0b Mon Sep 17 00:00:00 2001 From: Vikas Gorur Date: Wed, 18 Feb 2009 17:36:07 +0530 Subject: Added all files --- AUTHORS | 3 + COPYING | 674 ++ ChangeLog | 1 + INSTALL | 32 + Makefile.am | 15 + NEWS | 1 + README | 9 + THANKS | 3 + argp-standalone/Makefile.am | 38 + argp-standalone/acinclude.m4 | 1084 +++ argp-standalone/argp-ba.c | 26 + argp-standalone/argp-eexst.c | 36 + argp-standalone/argp-fmtstream.c | 475 ++ argp-standalone/argp-fmtstream.h | 319 + argp-standalone/argp-help.c | 1849 +++++ argp-standalone/argp-namefrob.h | 96 + argp-standalone/argp-parse.c | 1305 ++++ argp-standalone/argp-pv.c | 25 + argp-standalone/argp-pvh.c | 32 + argp-standalone/argp.h | 602 ++ argp-standalone/autogen.sh | 6 + argp-standalone/configure.ac | 100 + argp-standalone/mempcpy.c | 21 + argp-standalone/strcasecmp.c | 28 + argp-standalone/strchrnul.c | 23 + argp-standalone/strndup.c | 34 + argp-standalone/vsnprintf.c | 839 ++ auth/Makefile.am | 3 + auth/addr/Makefile.am | 3 + auth/addr/src/Makefile.am | 12 + auth/addr/src/addr.c | 208 + auth/login/Makefile.am | 3 + auth/login/src/Makefile.am | 13 + auth/login/src/login.c | 100 + autogen.sh | 8 + booster/Makefile.am | 1 + booster/src/Makefile.am | 17 + booster/src/booster.c | 920 +++ commit.sh | 6 + configure.ac | 554 ++ doc/Makefile.am | 11 + doc/authentication.txt | 112 + doc/booster.txt | 54 + doc/coding-standard.pdf | Bin 0 -> 68627 bytes doc/coding-standard.tex | 361 + doc/errno.list.bsd.txt | 376 + doc/errno.list.linux.txt | 1586 ++++ doc/errno.list.macosx.txt | 1513 ++++ doc/errno.list.solaris.txt | 206 + doc/examples/Makefile.am | 8 + doc/examples/README | 13 + doc/examples/filter.vol | 23 + doc/examples/io-cache.vol | 25 + doc/examples/io-threads.vol | 21 + doc/examples/posix-locks.vol | 20 + doc/examples/protocol-client.vol | 17 + doc/examples/protocol-server.vol | 25 + doc/examples/read-ahead.vol | 22 + doc/examples/replicate.vol | 119 + doc/examples/stripe.vol | 121 + 
doc/examples/trace.vol | 16 + doc/examples/trash.vol | 20 + doc/examples/unify.vol | 178 + doc/examples/write-behind.vol | 26 + doc/get_put_api_using_xattr.txt | 22 + doc/glusterfs.8 | 139 + doc/glusterfs.vol.sample | 61 + doc/glusterfsd.vol.sample | 47 + doc/hacker-guide/Makefile.am | 8 + doc/hacker-guide/adding-fops.txt | 33 + doc/hacker-guide/bdb.txt | 70 + doc/hacker-guide/call-stub.txt | 1033 +++ doc/hacker-guide/hacker-guide.tex | 312 + doc/hacker-guide/posix.txt | 59 + doc/hacker-guide/replicate.txt | 206 + doc/hacker-guide/write-behind.txt | 45 + doc/handling-options.txt | 13 + doc/mac-related-xattrs.txt | 21 + doc/porting_guide.txt | 45 + doc/qa/qa-client.vol | 170 + doc/qa/qa-high-avail-client.vol | 17 + doc/qa/qa-high-avail-server.vol | 346 + doc/qa/qa-server.vol | 284 + doc/replicate.lyx | 797 ++ doc/replicate.pdf | Bin 0 -> 109057 bytes doc/solaris-related-xattrs.txt | 44 + doc/translator-options.txt | 221 + doc/user-guide/Makefile.am | 1 + doc/user-guide/advanced-stripe.odg | Bin 0 -> 12648 bytes doc/user-guide/advanced-stripe.pdf | Bin 0 -> 13382 bytes doc/user-guide/colonO-icon.jpg | Bin 0 -> 779 bytes doc/user-guide/fdl.texi | 454 ++ doc/user-guide/fuse.odg | Bin 0 -> 13190 bytes doc/user-guide/fuse.pdf | Bin 0 -> 14948 bytes doc/user-guide/ha.odg | Bin 0 -> 37290 bytes doc/user-guide/ha.pdf | Bin 0 -> 19403 bytes doc/user-guide/stripe.odg | Bin 0 -> 10188 bytes doc/user-guide/stripe.pdf | Bin 0 -> 11941 bytes doc/user-guide/unify.odg | Bin 0 -> 12955 bytes doc/user-guide/unify.pdf | Bin 0 -> 18969 bytes doc/user-guide/user-guide.info | 2698 +++++++ doc/user-guide/user-guide.pdf | Bin 0 -> 353986 bytes doc/user-guide/user-guide.texi | 2226 ++++++ doc/user-guide/xlator.odg | Bin 0 -> 12169 bytes doc/user-guide/xlator.pdf | Bin 0 -> 14358 bytes extras/Makefile.am | 13 + extras/Portfile | 26 + extras/benchmarking/Makefile.am | 7 + extras/benchmarking/README | 18 + extras/benchmarking/glfs-bm.c | 619 ++ extras/benchmarking/launch-script.sh | 18 + 
extras/benchmarking/local-script.sh | 26 + extras/glusterfs-mode.el | 112 + extras/glusterfs.vim | 211 + extras/init.d/Makefile.am | 9 + extras/init.d/glusterfs-server | 100 + extras/init.d/glusterfs-server.plist.in | 15 + extras/init.d/glusterfsd | 110 + extras/specgen.scm | 98 + extras/stripe-merge.c | 48 + extras/test/Makefile.am | 3 + extras/test/rdd.c | 457 ++ glusterfs-guts/Makefile.am | 1 + glusterfs-guts/src/Makefile.am | 17 + glusterfs-guts/src/fuse-bridge.c | 2724 +++++++ glusterfs-guts/src/fuse-extra.c | 137 + glusterfs-guts/src/fuse-extra.h | 38 + glusterfs-guts/src/fuse_kernel.h | 380 + glusterfs-guts/src/glusterfs-fuse.h | 58 + glusterfs-guts/src/glusterfs-guts.c | 400 + glusterfs-guts/src/glusterfs-guts.h | 62 + glusterfs-guts/src/guts-extra.c | 18 + glusterfs-guts/src/guts-lowlevel.h | 86 + glusterfs-guts/src/guts-parse.c | 217 + glusterfs-guts/src/guts-parse.h | 140 + glusterfs-guts/src/guts-replay.c | 834 ++ glusterfs-guts/src/guts-replay.h | 33 + glusterfs-guts/src/guts-tables.c | 248 + glusterfs-guts/src/guts-tables.h | 80 + glusterfs-guts/src/guts-trace.c | 650 ++ glusterfs-guts/src/guts-trace.h | 54 + glusterfs.spec.in | 256 + glusterfsd/Makefile.am | 3 + glusterfsd/src/Makefile.am | 24 + glusterfsd/src/fetch-spec.c | 266 + glusterfsd/src/glusterfsd.c | 1123 +++ glusterfsd/src/glusterfsd.h | 78 + libglusterfs/Makefile.am | 3 + libglusterfs/src/Makefile.am | 21 + libglusterfs/src/authenticate.c | 240 + libglusterfs/src/authenticate.h | 61 + libglusterfs/src/byte-order.h | 150 + libglusterfs/src/call-stub.c | 3822 ++++++++++ libglusterfs/src/call-stub.h | 1104 +++ libglusterfs/src/common-utils.c | 1349 ++++ libglusterfs/src/common-utils.h | 313 + libglusterfs/src/compat-errno.c | 938 +++ libglusterfs/src/compat-errno.h | 240 + libglusterfs/src/compat.c | 383 + libglusterfs/src/compat.h | 356 + libglusterfs/src/defaults.c | 1388 ++++ libglusterfs/src/defaults.h | 273 + libglusterfs/src/dict.c | 2243 ++++++ libglusterfs/src/dict.h | 179 + 
libglusterfs/src/event.c | 978 +++ libglusterfs/src/event.h | 90 + libglusterfs/src/fd.c | 611 ++ libglusterfs/src/fd.h | 107 + libglusterfs/src/gf-dirent.c | 157 + libglusterfs/src/gf-dirent.h | 60 + libglusterfs/src/glusterfs.h | 277 + libglusterfs/src/hashfn.c | 89 + libglusterfs/src/hashfn.h | 33 + libglusterfs/src/inode.c | 1174 +++ libglusterfs/src/inode.h | 160 + libglusterfs/src/list.h | 154 + libglusterfs/src/locking.h | 49 + libglusterfs/src/logging.c | 207 + libglusterfs/src/logging.h | 132 + libglusterfs/src/mem-pool.c | 174 + libglusterfs/src/mem-pool.h | 54 + libglusterfs/src/protocol.h | 777 ++ libglusterfs/src/revision.h | 1 + libglusterfs/src/scheduler.c | 80 + libglusterfs/src/scheduler.h | 40 + libglusterfs/src/spec.l | 94 + libglusterfs/src/spec.y | 613 ++ libglusterfs/src/stack.h | 266 + libglusterfs/src/timer.c | 220 + libglusterfs/src/timer.h | 68 + libglusterfs/src/transport.c | 339 + libglusterfs/src/transport.h | 85 + libglusterfs/src/xlator.c | 728 ++ libglusterfs/src/xlator.h | 842 +++ libglusterfsclient/Makefile.am | 3 + libglusterfsclient/src/Makefile.am | 16 + .../src/libglusterfsclient-internals.h | 144 + libglusterfsclient/src/libglusterfsclient.c | 3146 ++++++++ libglusterfsclient/src/libglusterfsclient.h | 279 + mod_glusterfs/Makefile.am | 3 + mod_glusterfs/apache/1.3/Makefile.am | 3 + mod_glusterfs/apache/1.3/src/Makefile.am | 30 + mod_glusterfs/apache/1.3/src/README.txt | 107 + mod_glusterfs/apache/1.3/src/mod_glusterfs.c | 514 ++ mod_glusterfs/apache/2.2/Makefile.am | 3 + mod_glusterfs/apache/2.2/src/Makefile.am | 31 + mod_glusterfs/apache/2.2/src/README.txt | 105 + mod_glusterfs/apache/2.2/src/mod_glusterfs.c | 3536 +++++++++ mod_glusterfs/apache/Makefile.am | 10 + mod_glusterfs/lighttpd/1.4/Makefile.am | 3 + mod_glusterfs/lighttpd/1.4/Makefile.am.diff | 29 + mod_glusterfs/lighttpd/1.4/README.txt | 57 + mod_glusterfs/lighttpd/1.4/mod_glusterfs.c | 1709 +++++ mod_glusterfs/lighttpd/1.4/mod_glusterfs.h | 29 + 
mod_glusterfs/lighttpd/1.5/Makefile.am | 3 + mod_glusterfs/lighttpd/1.5/Makefile.am.diff | 29 + mod_glusterfs/lighttpd/1.5/README.txt | 57 + mod_glusterfs/lighttpd/1.5/mod_glusterfs.c | 1476 ++++ mod_glusterfs/lighttpd/1.5/mod_glusterfs.h | 29 + mod_glusterfs/lighttpd/Makefile.am | 3 + scheduler/Makefile.am | 3 + scheduler/alu/Makefile.am | 3 + scheduler/alu/src/Makefile.am | 14 + scheduler/alu/src/alu.c | 993 +++ scheduler/alu/src/alu.h | 89 + scheduler/nufa/Makefile.am | 3 + scheduler/nufa/src/Makefile.am | 12 + scheduler/nufa/src/nufa.c | 403 + scheduler/random/Makefile.am | 3 + scheduler/random/src/Makefile.am | 14 + scheduler/random/src/random.c | 283 + scheduler/random/src/random.h | 46 + scheduler/rr/Makefile.am | 3 + scheduler/rr/src/Makefile.am | 13 + scheduler/rr/src/rr-options.c | 256 + scheduler/rr/src/rr-options.h | 34 + scheduler/rr/src/rr.c | 565 ++ scheduler/rr/src/rr.h | 70 + scheduler/switch/Makefile.am | 3 + scheduler/switch/src/Makefile.am | 12 + scheduler/switch/src/switch.c | 398 + transport/Makefile.am | 3 + transport/ib-verbs/Makefile.am | 1 + transport/ib-verbs/src/Makefile.am | 15 + transport/ib-verbs/src/ib-verbs.c | 2392 ++++++ transport/ib-verbs/src/ib-verbs.h | 215 + transport/ib-verbs/src/name.c | 682 ++ transport/ib-verbs/src/name.h | 47 + transport/socket/Makefile.am | 1 + transport/socket/src/Makefile.am | 14 + transport/socket/src/name.c | 677 ++ transport/socket/src/name.h | 44 + transport/socket/src/socket.c | 1370 ++++ transport/socket/src/socket.h | 106 + xlators/Makefile.am | 3 + xlators/bindings/Makefile.am | 1 + xlators/bindings/python/Makefile.am | 1 + xlators/bindings/python/src/Makefile.am | 19 + xlators/bindings/python/src/gluster.py | 47 + xlators/bindings/python/src/glusterstack.py | 55 + xlators/bindings/python/src/glustertypes.py | 167 + xlators/bindings/python/src/python.c | 235 + xlators/bindings/python/src/testxlator.py | 56 + xlators/cluster/Makefile.am | 3 + xlators/cluster/afr/Makefile.am | 3 + 
xlators/cluster/afr/src/Makefile.am | 20 + xlators/cluster/afr/src/afr-dir-read.c | 345 + xlators/cluster/afr/src/afr-dir-read.h | 47 + xlators/cluster/afr/src/afr-dir-write.c | 1786 +++++ xlators/cluster/afr/src/afr-dir-write.h | 59 + xlators/cluster/afr/src/afr-inode-read.c | 721 ++ xlators/cluster/afr/src/afr-inode-read.h | 47 + xlators/cluster/afr/src/afr-inode-write.c | 2024 +++++ xlators/cluster/afr/src/afr-inode-write.h | 63 + xlators/cluster/afr/src/afr-self-heal-common.c | 1073 +++ xlators/cluster/afr/src/afr-self-heal-common.h | 66 + xlators/cluster/afr/src/afr-self-heal-data.c | 1030 +++ xlators/cluster/afr/src/afr-self-heal-entry.c | 2038 +++++ xlators/cluster/afr/src/afr-self-heal-metadata.c | 791 ++ xlators/cluster/afr/src/afr-self-heal.h | 52 + xlators/cluster/afr/src/afr-transaction.c | 957 +++ xlators/cluster/afr/src/afr-transaction.h | 36 + xlators/cluster/afr/src/afr.c | 2338 ++++++ xlators/cluster/afr/src/afr.h | 523 ++ xlators/cluster/dht/Makefile.am | 1 + xlators/cluster/dht/src/Makefile.am | 30 + xlators/cluster/dht/src/dht-common.c | 3470 +++++++++ xlators/cluster/dht/src/dht-common.h | 212 + xlators/cluster/dht/src/dht-hashfn-tea.c | 146 + xlators/cluster/dht/src/dht-hashfn.c | 88 + xlators/cluster/dht/src/dht-helper.c | 326 + xlators/cluster/dht/src/dht-layout.c | 543 ++ xlators/cluster/dht/src/dht-linkfile.c | 224 + xlators/cluster/dht/src/dht-rename.c | 562 ++ xlators/cluster/dht/src/dht-selfheal.c | 460 ++ xlators/cluster/dht/src/dht.c | 222 + xlators/cluster/dht/src/nufa.c | 684 ++ xlators/cluster/ha/Makefile.am | 3 + xlators/cluster/ha/src/Makefile.am | 15 + xlators/cluster/ha/src/ha-helpers.c | 191 + xlators/cluster/ha/src/ha.c | 3479 +++++++++ xlators/cluster/ha/src/ha.h | 59 + xlators/cluster/map/Makefile.am | 3 + xlators/cluster/map/src/Makefile.am | 15 + xlators/cluster/map/src/map-helper.c | 357 + xlators/cluster/map/src/map.c | 2193 ++++++ xlators/cluster/map/src/map.h | 76 + xlators/cluster/stripe/Makefile.am | 3 + 
xlators/cluster/stripe/src/Makefile.am | 14 + xlators/cluster/stripe/src/stripe.c | 3286 ++++++++ xlators/cluster/unify/Makefile.am | 3 + xlators/cluster/unify/src/Makefile.am | 16 + xlators/cluster/unify/src/unify-self-heal.c | 1225 +++ xlators/cluster/unify/src/unify.c | 4451 +++++++++++ xlators/cluster/unify/src/unify.h | 132 + xlators/debug/Makefile.am | 3 + xlators/debug/error-gen/Makefile.am | 3 + xlators/debug/error-gen/src/Makefile.am | 14 + xlators/debug/error-gen/src/error-gen.c | 1780 +++++ xlators/debug/trace/Makefile.am | 3 + xlators/debug/trace/src/Makefile.am | 14 + xlators/debug/trace/src/trace.c | 2321 ++++++ xlators/encryption/Makefile.am | 3 + xlators/encryption/rot-13/Makefile.am | 3 + xlators/encryption/rot-13/src/Makefile.am | 14 + xlators/encryption/rot-13/src/rot-13.c | 200 + xlators/encryption/rot-13/src/rot-13.h | 33 + xlators/features/Makefile.am | 3 + xlators/features/filter/Makefile.am | 3 + xlators/features/filter/src/Makefile.am | 13 + xlators/features/filter/src/filter.c | 1768 +++++ xlators/features/locks/Makefile.am | 3 + xlators/features/locks/src/Makefile.am | 20 + xlators/features/locks/src/common.c | 561 ++ xlators/features/locks/src/common.h | 59 + xlators/features/locks/src/internal.c | 762 ++ xlators/features/locks/src/locks.h | 111 + xlators/features/locks/src/posix.c | 834 ++ xlators/features/locks/tests/unit-test.c | 75 + xlators/features/path-convertor/Makefile.am | 3 + xlators/features/path-convertor/src/Makefile.am | 14 + xlators/features/path-convertor/src/path.c | 1217 +++ xlators/features/quota/Makefile.am | 3 + xlators/features/quota/src/Makefile.am | 13 + xlators/features/quota/src/quota.c | 1056 +++ xlators/features/trash/Makefile.am | 3 + xlators/features/trash/src/Makefile.am | 13 + xlators/features/trash/src/trash.c | 596 ++ xlators/meta/Makefile.am | 1 + xlators/meta/src/Makefile.am | 10 + xlators/meta/src/meta.c | 1285 ++++ xlators/meta/src/meta.h | 48 + xlators/meta/src/misc.c | 67 + xlators/meta/src/misc.h 
| 31 + xlators/meta/src/tree.c | 176 + xlators/meta/src/tree.h | 35 + xlators/meta/src/view.c | 258 + xlators/meta/src/view.h | 32 + xlators/mount/Makefile.am | 3 + xlators/mount/fuse/Makefile.am | 3 + xlators/mount/fuse/src/Makefile.am | 14 + xlators/mount/fuse/src/fuse-bridge.c | 2859 +++++++ xlators/mount/fuse/src/fuse-extra.c | 137 + xlators/mount/fuse/src/fuse-extra.h | 42 + xlators/mount/fuse/utils/Makefile.am | 10 + xlators/mount/fuse/utils/mount.glusterfs.in | 152 + xlators/mount/fuse/utils/mount_glusterfs.in | 181 + xlators/performance/Makefile.am | 3 + xlators/performance/io-cache/Makefile.am | 3 + xlators/performance/io-cache/src/Makefile.am | 14 + xlators/performance/io-cache/src/io-cache.c | 1478 ++++ xlators/performance/io-cache/src/io-cache.h | 330 + xlators/performance/io-cache/src/ioc-inode.c | 201 + xlators/performance/io-cache/src/page.c | 778 ++ xlators/performance/io-threads/Makefile.am | 3 + xlators/performance/io-threads/src/Makefile.am | 14 + xlators/performance/io-threads/src/io-threads.c | 1254 +++ xlators/performance/io-threads/src/io-threads.h | 99 + xlators/performance/read-ahead/Makefile.am | 3 + xlators/performance/read-ahead/src/Makefile.am | 14 + xlators/performance/read-ahead/src/page.c | 487 ++ xlators/performance/read-ahead/src/read-ahead.c | 890 +++ xlators/performance/read-ahead/src/read-ahead.h | 194 + xlators/performance/stat-prefetch/Makefile.am | 1 + xlators/performance/stat-prefetch/src/Makefile.am | 11 + .../performance/stat-prefetch/src/stat-prefetch.c | 508 ++ .../performance/stat-prefetch/src/stat-prefetch.h | 32 + xlators/performance/symlink-cache/Makefile.am | 3 + xlators/performance/symlink-cache/src/Makefile.am | 12 + .../performance/symlink-cache/src/symlink-cache.c | 399 + xlators/performance/write-behind/Makefile.am | 3 + xlators/performance/write-behind/src/Makefile.am | 12 + .../performance/write-behind/src/write-behind.c | 1444 ++++ xlators/protocol/Makefile.am | 3 + xlators/protocol/client/Makefile.am | 3 + 
xlators/protocol/client/src/Makefile.am | 16 + xlators/protocol/client/src/client-protocol.c | 6671 ++++++++++++++++ xlators/protocol/client/src/client-protocol.h | 173 + xlators/protocol/client/src/saved-frames.c | 178 + xlators/protocol/client/src/saved-frames.h | 74 + xlators/protocol/server/Makefile.am | 3 + xlators/protocol/server/src/Makefile.am | 18 + xlators/protocol/server/src/server-dentry.c | 413 + xlators/protocol/server/src/server-helpers.c | 586 ++ xlators/protocol/server/src/server-helpers.h | 77 + xlators/protocol/server/src/server-protocol.c | 7984 ++++++++++++++++++++ xlators/protocol/server/src/server-protocol.h | 143 + xlators/storage/Makefile.am | 3 + xlators/storage/bdb/Makefile.am | 3 + xlators/storage/bdb/src/Makefile.am | 18 + xlators/storage/bdb/src/bctx.c | 394 + xlators/storage/bdb/src/bdb-ll.c | 1455 ++++ xlators/storage/bdb/src/bdb.c | 3371 +++++++++ xlators/storage/bdb/src/bdb.h | 439 ++ xlators/storage/posix/Makefile.am | 3 + xlators/storage/posix/src/Makefile.am | 17 + xlators/storage/posix/src/posix.c | 3715 +++++++++ xlators/storage/posix/src/posix.h | 110 + xlators/storage/posix/src/xattr-cache.c | 521 ++ xlators/storage/posix/src/xattr-cache.h | 65 + 420 files changed, 164055 insertions(+) create mode 100644 AUTHORS create mode 100644 COPYING create mode 100644 ChangeLog create mode 100644 INSTALL create mode 100644 Makefile.am create mode 100644 NEWS create mode 100644 README create mode 100644 THANKS create mode 100644 argp-standalone/Makefile.am create mode 100644 argp-standalone/acinclude.m4 create mode 100644 argp-standalone/argp-ba.c create mode 100644 argp-standalone/argp-eexst.c create mode 100644 argp-standalone/argp-fmtstream.c create mode 100644 argp-standalone/argp-fmtstream.h create mode 100644 argp-standalone/argp-help.c create mode 100644 argp-standalone/argp-namefrob.h create mode 100644 argp-standalone/argp-parse.c create mode 100644 argp-standalone/argp-pv.c create mode 100644 argp-standalone/argp-pvh.c create 
mode 100644 argp-standalone/argp.h create mode 100755 argp-standalone/autogen.sh create mode 100644 argp-standalone/configure.ac create mode 100644 argp-standalone/mempcpy.c create mode 100644 argp-standalone/strcasecmp.c create mode 100644 argp-standalone/strchrnul.c create mode 100644 argp-standalone/strndup.c create mode 100644 argp-standalone/vsnprintf.c create mode 100644 auth/Makefile.am create mode 100644 auth/addr/Makefile.am create mode 100644 auth/addr/src/Makefile.am create mode 100644 auth/addr/src/addr.c create mode 100644 auth/login/Makefile.am create mode 100644 auth/login/src/Makefile.am create mode 100644 auth/login/src/login.c create mode 100755 autogen.sh create mode 100644 booster/Makefile.am create mode 100644 booster/src/Makefile.am create mode 100644 booster/src/booster.c create mode 100755 commit.sh create mode 100644 configure.ac create mode 100644 doc/Makefile.am create mode 100644 doc/authentication.txt create mode 100644 doc/booster.txt create mode 100644 doc/coding-standard.pdf create mode 100644 doc/coding-standard.tex create mode 100644 doc/errno.list.bsd.txt create mode 100644 doc/errno.list.linux.txt create mode 100644 doc/errno.list.macosx.txt create mode 100644 doc/errno.list.solaris.txt create mode 100644 doc/examples/Makefile.am create mode 100644 doc/examples/README create mode 100644 doc/examples/filter.vol create mode 100644 doc/examples/io-cache.vol create mode 100644 doc/examples/io-threads.vol create mode 100644 doc/examples/posix-locks.vol create mode 100644 doc/examples/protocol-client.vol create mode 100644 doc/examples/protocol-server.vol create mode 100644 doc/examples/read-ahead.vol create mode 100644 doc/examples/replicate.vol create mode 100644 doc/examples/stripe.vol create mode 100644 doc/examples/trace.vol create mode 100644 doc/examples/trash.vol create mode 100644 doc/examples/unify.vol create mode 100644 doc/examples/write-behind.vol create mode 100644 doc/get_put_api_using_xattr.txt create mode 100644 
doc/glusterfs.8 create mode 100644 doc/glusterfs.vol.sample create mode 100644 doc/glusterfsd.vol.sample create mode 100644 doc/hacker-guide/Makefile.am create mode 100644 doc/hacker-guide/adding-fops.txt create mode 100644 doc/hacker-guide/bdb.txt create mode 100644 doc/hacker-guide/call-stub.txt create mode 100644 doc/hacker-guide/hacker-guide.tex create mode 100644 doc/hacker-guide/posix.txt create mode 100644 doc/hacker-guide/replicate.txt create mode 100644 doc/hacker-guide/write-behind.txt create mode 100644 doc/handling-options.txt create mode 100644 doc/mac-related-xattrs.txt create mode 100644 doc/porting_guide.txt create mode 100644 doc/qa/qa-client.vol create mode 100644 doc/qa/qa-high-avail-client.vol create mode 100644 doc/qa/qa-high-avail-server.vol create mode 100644 doc/qa/qa-server.vol create mode 100644 doc/replicate.lyx create mode 100644 doc/replicate.pdf create mode 100644 doc/solaris-related-xattrs.txt create mode 100644 doc/translator-options.txt create mode 100644 doc/user-guide/Makefile.am create mode 100644 doc/user-guide/advanced-stripe.odg create mode 100644 doc/user-guide/advanced-stripe.pdf create mode 100644 doc/user-guide/colonO-icon.jpg create mode 100644 doc/user-guide/fdl.texi create mode 100644 doc/user-guide/fuse.odg create mode 100644 doc/user-guide/fuse.pdf create mode 100644 doc/user-guide/ha.odg create mode 100644 doc/user-guide/ha.pdf create mode 100644 doc/user-guide/stripe.odg create mode 100644 doc/user-guide/stripe.pdf create mode 100644 doc/user-guide/unify.odg create mode 100644 doc/user-guide/unify.pdf create mode 100644 doc/user-guide/user-guide.info create mode 100644 doc/user-guide/user-guide.pdf create mode 100644 doc/user-guide/user-guide.texi create mode 100644 doc/user-guide/xlator.odg create mode 100644 doc/user-guide/xlator.pdf create mode 100644 extras/Makefile.am create mode 100644 extras/Portfile create mode 100644 extras/benchmarking/Makefile.am create mode 100644 extras/benchmarking/README create mode 
100644 extras/benchmarking/glfs-bm.c create mode 100755 extras/benchmarking/launch-script.sh create mode 100755 extras/benchmarking/local-script.sh create mode 100644 extras/glusterfs-mode.el create mode 100644 extras/glusterfs.vim create mode 100644 extras/init.d/Makefile.am create mode 100755 extras/init.d/glusterfs-server create mode 100644 extras/init.d/glusterfs-server.plist.in create mode 100755 extras/init.d/glusterfsd create mode 100755 extras/specgen.scm create mode 100644 extras/stripe-merge.c create mode 100644 extras/test/Makefile.am create mode 100644 extras/test/rdd.c create mode 100644 glusterfs-guts/Makefile.am create mode 100644 glusterfs-guts/src/Makefile.am create mode 100644 glusterfs-guts/src/fuse-bridge.c create mode 100644 glusterfs-guts/src/fuse-extra.c create mode 100644 glusterfs-guts/src/fuse-extra.h create mode 100644 glusterfs-guts/src/fuse_kernel.h create mode 100644 glusterfs-guts/src/glusterfs-fuse.h create mode 100644 glusterfs-guts/src/glusterfs-guts.c create mode 100644 glusterfs-guts/src/glusterfs-guts.h create mode 100644 glusterfs-guts/src/guts-extra.c create mode 100644 glusterfs-guts/src/guts-lowlevel.h create mode 100644 glusterfs-guts/src/guts-parse.c create mode 100644 glusterfs-guts/src/guts-parse.h create mode 100644 glusterfs-guts/src/guts-replay.c create mode 100644 glusterfs-guts/src/guts-replay.h create mode 100644 glusterfs-guts/src/guts-tables.c create mode 100644 glusterfs-guts/src/guts-tables.h create mode 100644 glusterfs-guts/src/guts-trace.c create mode 100644 glusterfs-guts/src/guts-trace.h create mode 100644 glusterfs.spec.in create mode 100644 glusterfsd/Makefile.am create mode 100644 glusterfsd/src/Makefile.am create mode 100644 glusterfsd/src/fetch-spec.c create mode 100644 glusterfsd/src/glusterfsd.c create mode 100644 glusterfsd/src/glusterfsd.h create mode 100644 libglusterfs/Makefile.am create mode 100644 libglusterfs/src/Makefile.am create mode 100644 libglusterfs/src/authenticate.c create mode 
100644 libglusterfs/src/authenticate.h create mode 100644 libglusterfs/src/byte-order.h create mode 100644 libglusterfs/src/call-stub.c create mode 100644 libglusterfs/src/call-stub.h create mode 100644 libglusterfs/src/common-utils.c create mode 100644 libglusterfs/src/common-utils.h create mode 100644 libglusterfs/src/compat-errno.c create mode 100644 libglusterfs/src/compat-errno.h create mode 100644 libglusterfs/src/compat.c create mode 100644 libglusterfs/src/compat.h create mode 100644 libglusterfs/src/defaults.c create mode 100644 libglusterfs/src/defaults.h create mode 100644 libglusterfs/src/dict.c create mode 100644 libglusterfs/src/dict.h create mode 100644 libglusterfs/src/event.c create mode 100644 libglusterfs/src/event.h create mode 100644 libglusterfs/src/fd.c create mode 100644 libglusterfs/src/fd.h create mode 100644 libglusterfs/src/gf-dirent.c create mode 100644 libglusterfs/src/gf-dirent.h create mode 100644 libglusterfs/src/glusterfs.h create mode 100644 libglusterfs/src/hashfn.c create mode 100644 libglusterfs/src/hashfn.h create mode 100644 libglusterfs/src/inode.c create mode 100644 libglusterfs/src/inode.h create mode 100644 libglusterfs/src/list.h create mode 100644 libglusterfs/src/locking.h create mode 100644 libglusterfs/src/logging.c create mode 100644 libglusterfs/src/logging.h create mode 100644 libglusterfs/src/mem-pool.c create mode 100644 libglusterfs/src/mem-pool.h create mode 100644 libglusterfs/src/protocol.h create mode 100644 libglusterfs/src/revision.h create mode 100644 libglusterfs/src/scheduler.c create mode 100644 libglusterfs/src/scheduler.h create mode 100644 libglusterfs/src/spec.l create mode 100644 libglusterfs/src/spec.y create mode 100644 libglusterfs/src/stack.h create mode 100644 libglusterfs/src/timer.c create mode 100644 libglusterfs/src/timer.h create mode 100644 libglusterfs/src/transport.c create mode 100644 libglusterfs/src/transport.h create mode 100644 libglusterfs/src/xlator.c create mode 100644 
libglusterfs/src/xlator.h create mode 100644 libglusterfsclient/Makefile.am create mode 100644 libglusterfsclient/src/Makefile.am create mode 100755 libglusterfsclient/src/libglusterfsclient-internals.h create mode 100755 libglusterfsclient/src/libglusterfsclient.c create mode 100755 libglusterfsclient/src/libglusterfsclient.h create mode 100644 mod_glusterfs/Makefile.am create mode 100644 mod_glusterfs/apache/1.3/Makefile.am create mode 100644 mod_glusterfs/apache/1.3/src/Makefile.am create mode 100644 mod_glusterfs/apache/1.3/src/README.txt create mode 100644 mod_glusterfs/apache/1.3/src/mod_glusterfs.c create mode 100644 mod_glusterfs/apache/2.2/Makefile.am create mode 100644 mod_glusterfs/apache/2.2/src/Makefile.am create mode 100644 mod_glusterfs/apache/2.2/src/README.txt create mode 100644 mod_glusterfs/apache/2.2/src/mod_glusterfs.c create mode 100644 mod_glusterfs/apache/Makefile.am create mode 100644 mod_glusterfs/lighttpd/1.4/Makefile.am create mode 100644 mod_glusterfs/lighttpd/1.4/Makefile.am.diff create mode 100644 mod_glusterfs/lighttpd/1.4/README.txt create mode 100644 mod_glusterfs/lighttpd/1.4/mod_glusterfs.c create mode 100644 mod_glusterfs/lighttpd/1.4/mod_glusterfs.h create mode 100644 mod_glusterfs/lighttpd/1.5/Makefile.am create mode 100644 mod_glusterfs/lighttpd/1.5/Makefile.am.diff create mode 100644 mod_glusterfs/lighttpd/1.5/README.txt create mode 100644 mod_glusterfs/lighttpd/1.5/mod_glusterfs.c create mode 100644 mod_glusterfs/lighttpd/1.5/mod_glusterfs.h create mode 100644 mod_glusterfs/lighttpd/Makefile.am create mode 100644 scheduler/Makefile.am create mode 100644 scheduler/alu/Makefile.am create mode 100644 scheduler/alu/src/Makefile.am create mode 100644 scheduler/alu/src/alu.c create mode 100644 scheduler/alu/src/alu.h create mode 100644 scheduler/nufa/Makefile.am create mode 100644 scheduler/nufa/src/Makefile.am create mode 100644 scheduler/nufa/src/nufa.c create mode 100644 scheduler/random/Makefile.am create mode 100644 
scheduler/random/src/Makefile.am create mode 100644 scheduler/random/src/random.c create mode 100644 scheduler/random/src/random.h create mode 100644 scheduler/rr/Makefile.am create mode 100644 scheduler/rr/src/Makefile.am create mode 100644 scheduler/rr/src/rr-options.c create mode 100644 scheduler/rr/src/rr-options.h create mode 100644 scheduler/rr/src/rr.c create mode 100644 scheduler/rr/src/rr.h create mode 100644 scheduler/switch/Makefile.am create mode 100644 scheduler/switch/src/Makefile.am create mode 100644 scheduler/switch/src/switch.c create mode 100644 transport/Makefile.am create mode 100644 transport/ib-verbs/Makefile.am create mode 100644 transport/ib-verbs/src/Makefile.am create mode 100644 transport/ib-verbs/src/ib-verbs.c create mode 100644 transport/ib-verbs/src/ib-verbs.h create mode 100644 transport/ib-verbs/src/name.c create mode 100644 transport/ib-verbs/src/name.h create mode 100644 transport/socket/Makefile.am create mode 100644 transport/socket/src/Makefile.am create mode 100644 transport/socket/src/name.c create mode 100644 transport/socket/src/name.h create mode 100644 transport/socket/src/socket.c create mode 100644 transport/socket/src/socket.h create mode 100644 xlators/Makefile.am create mode 100644 xlators/bindings/Makefile.am create mode 100644 xlators/bindings/python/Makefile.am create mode 100644 xlators/bindings/python/src/Makefile.am create mode 100644 xlators/bindings/python/src/gluster.py create mode 100644 xlators/bindings/python/src/glusterstack.py create mode 100644 xlators/bindings/python/src/glustertypes.py create mode 100644 xlators/bindings/python/src/python.c create mode 100644 xlators/bindings/python/src/testxlator.py create mode 100644 xlators/cluster/Makefile.am create mode 100644 xlators/cluster/afr/Makefile.am create mode 100644 xlators/cluster/afr/src/Makefile.am create mode 100644 xlators/cluster/afr/src/afr-dir-read.c create mode 100644 xlators/cluster/afr/src/afr-dir-read.h create mode 100644 
xlators/cluster/afr/src/afr-dir-write.c create mode 100644 xlators/cluster/afr/src/afr-dir-write.h create mode 100644 xlators/cluster/afr/src/afr-inode-read.c create mode 100644 xlators/cluster/afr/src/afr-inode-read.h create mode 100644 xlators/cluster/afr/src/afr-inode-write.c create mode 100644 xlators/cluster/afr/src/afr-inode-write.h create mode 100644 xlators/cluster/afr/src/afr-self-heal-common.c create mode 100644 xlators/cluster/afr/src/afr-self-heal-common.h create mode 100644 xlators/cluster/afr/src/afr-self-heal-data.c create mode 100644 xlators/cluster/afr/src/afr-self-heal-entry.c create mode 100644 xlators/cluster/afr/src/afr-self-heal-metadata.c create mode 100644 xlators/cluster/afr/src/afr-self-heal.h create mode 100644 xlators/cluster/afr/src/afr-transaction.c create mode 100644 xlators/cluster/afr/src/afr-transaction.h create mode 100644 xlators/cluster/afr/src/afr.c create mode 100644 xlators/cluster/afr/src/afr.h create mode 100644 xlators/cluster/dht/Makefile.am create mode 100644 xlators/cluster/dht/src/Makefile.am create mode 100644 xlators/cluster/dht/src/dht-common.c create mode 100644 xlators/cluster/dht/src/dht-common.h create mode 100644 xlators/cluster/dht/src/dht-hashfn-tea.c create mode 100644 xlators/cluster/dht/src/dht-hashfn.c create mode 100644 xlators/cluster/dht/src/dht-helper.c create mode 100644 xlators/cluster/dht/src/dht-layout.c create mode 100644 xlators/cluster/dht/src/dht-linkfile.c create mode 100644 xlators/cluster/dht/src/dht-rename.c create mode 100644 xlators/cluster/dht/src/dht-selfheal.c create mode 100644 xlators/cluster/dht/src/dht.c create mode 100644 xlators/cluster/dht/src/nufa.c create mode 100644 xlators/cluster/ha/Makefile.am create mode 100644 xlators/cluster/ha/src/Makefile.am create mode 100644 xlators/cluster/ha/src/ha-helpers.c create mode 100644 xlators/cluster/ha/src/ha.c create mode 100644 xlators/cluster/ha/src/ha.h create mode 100644 xlators/cluster/map/Makefile.am create mode 100644 
xlators/cluster/map/src/Makefile.am create mode 100644 xlators/cluster/map/src/map-helper.c create mode 100644 xlators/cluster/map/src/map.c create mode 100644 xlators/cluster/map/src/map.h create mode 100644 xlators/cluster/stripe/Makefile.am create mode 100644 xlators/cluster/stripe/src/Makefile.am create mode 100644 xlators/cluster/stripe/src/stripe.c create mode 100644 xlators/cluster/unify/Makefile.am create mode 100644 xlators/cluster/unify/src/Makefile.am create mode 100644 xlators/cluster/unify/src/unify-self-heal.c create mode 100644 xlators/cluster/unify/src/unify.c create mode 100644 xlators/cluster/unify/src/unify.h create mode 100644 xlators/debug/Makefile.am create mode 100644 xlators/debug/error-gen/Makefile.am create mode 100644 xlators/debug/error-gen/src/Makefile.am create mode 100644 xlators/debug/error-gen/src/error-gen.c create mode 100644 xlators/debug/trace/Makefile.am create mode 100644 xlators/debug/trace/src/Makefile.am create mode 100644 xlators/debug/trace/src/trace.c create mode 100644 xlators/encryption/Makefile.am create mode 100644 xlators/encryption/rot-13/Makefile.am create mode 100644 xlators/encryption/rot-13/src/Makefile.am create mode 100644 xlators/encryption/rot-13/src/rot-13.c create mode 100644 xlators/encryption/rot-13/src/rot-13.h create mode 100644 xlators/features/Makefile.am create mode 100644 xlators/features/filter/Makefile.am create mode 100644 xlators/features/filter/src/Makefile.am create mode 100644 xlators/features/filter/src/filter.c create mode 100644 xlators/features/locks/Makefile.am create mode 100644 xlators/features/locks/src/Makefile.am create mode 100644 xlators/features/locks/src/common.c create mode 100644 xlators/features/locks/src/common.h create mode 100644 xlators/features/locks/src/internal.c create mode 100644 xlators/features/locks/src/locks.h create mode 100644 xlators/features/locks/src/posix.c create mode 100644 xlators/features/locks/tests/unit-test.c create mode 100644 
xlators/features/path-convertor/Makefile.am create mode 100644 xlators/features/path-convertor/src/Makefile.am create mode 100644 xlators/features/path-convertor/src/path.c create mode 100644 xlators/features/quota/Makefile.am create mode 100644 xlators/features/quota/src/Makefile.am create mode 100644 xlators/features/quota/src/quota.c create mode 100644 xlators/features/trash/Makefile.am create mode 100644 xlators/features/trash/src/Makefile.am create mode 100644 xlators/features/trash/src/trash.c create mode 100644 xlators/meta/Makefile.am create mode 100644 xlators/meta/src/Makefile.am create mode 100644 xlators/meta/src/meta.c create mode 100644 xlators/meta/src/meta.h create mode 100644 xlators/meta/src/misc.c create mode 100644 xlators/meta/src/misc.h create mode 100644 xlators/meta/src/tree.c create mode 100644 xlators/meta/src/tree.h create mode 100644 xlators/meta/src/view.c create mode 100644 xlators/meta/src/view.h create mode 100644 xlators/mount/Makefile.am create mode 100644 xlators/mount/fuse/Makefile.am create mode 100644 xlators/mount/fuse/src/Makefile.am create mode 100644 xlators/mount/fuse/src/fuse-bridge.c create mode 100644 xlators/mount/fuse/src/fuse-extra.c create mode 100644 xlators/mount/fuse/src/fuse-extra.h create mode 100644 xlators/mount/fuse/utils/Makefile.am create mode 100755 xlators/mount/fuse/utils/mount.glusterfs.in create mode 100755 xlators/mount/fuse/utils/mount_glusterfs.in create mode 100644 xlators/performance/Makefile.am create mode 100644 xlators/performance/io-cache/Makefile.am create mode 100644 xlators/performance/io-cache/src/Makefile.am create mode 100644 xlators/performance/io-cache/src/io-cache.c create mode 100644 xlators/performance/io-cache/src/io-cache.h create mode 100644 xlators/performance/io-cache/src/ioc-inode.c create mode 100644 xlators/performance/io-cache/src/page.c create mode 100644 xlators/performance/io-threads/Makefile.am create mode 100644 xlators/performance/io-threads/src/Makefile.am create 
mode 100644 xlators/performance/io-threads/src/io-threads.c create mode 100644 xlators/performance/io-threads/src/io-threads.h create mode 100644 xlators/performance/read-ahead/Makefile.am create mode 100644 xlators/performance/read-ahead/src/Makefile.am create mode 100644 xlators/performance/read-ahead/src/page.c create mode 100644 xlators/performance/read-ahead/src/read-ahead.c create mode 100644 xlators/performance/read-ahead/src/read-ahead.h create mode 100644 xlators/performance/stat-prefetch/Makefile.am create mode 100644 xlators/performance/stat-prefetch/src/Makefile.am create mode 100644 xlators/performance/stat-prefetch/src/stat-prefetch.c create mode 100644 xlators/performance/stat-prefetch/src/stat-prefetch.h create mode 100644 xlators/performance/symlink-cache/Makefile.am create mode 100644 xlators/performance/symlink-cache/src/Makefile.am create mode 100644 xlators/performance/symlink-cache/src/symlink-cache.c create mode 100644 xlators/performance/write-behind/Makefile.am create mode 100644 xlators/performance/write-behind/src/Makefile.am create mode 100644 xlators/performance/write-behind/src/write-behind.c create mode 100644 xlators/protocol/Makefile.am create mode 100644 xlators/protocol/client/Makefile.am create mode 100644 xlators/protocol/client/src/Makefile.am create mode 100644 xlators/protocol/client/src/client-protocol.c create mode 100644 xlators/protocol/client/src/client-protocol.h create mode 100644 xlators/protocol/client/src/saved-frames.c create mode 100644 xlators/protocol/client/src/saved-frames.h create mode 100644 xlators/protocol/server/Makefile.am create mode 100644 xlators/protocol/server/src/Makefile.am create mode 100644 xlators/protocol/server/src/server-dentry.c create mode 100644 xlators/protocol/server/src/server-helpers.c create mode 100644 xlators/protocol/server/src/server-helpers.h create mode 100644 xlators/protocol/server/src/server-protocol.c create mode 100644 xlators/protocol/server/src/server-protocol.h create 
mode 100644 xlators/storage/Makefile.am create mode 100644 xlators/storage/bdb/Makefile.am create mode 100644 xlators/storage/bdb/src/Makefile.am create mode 100644 xlators/storage/bdb/src/bctx.c create mode 100644 xlators/storage/bdb/src/bdb-ll.c create mode 100644 xlators/storage/bdb/src/bdb.c create mode 100644 xlators/storage/bdb/src/bdb.h create mode 100644 xlators/storage/posix/Makefile.am create mode 100644 xlators/storage/posix/src/Makefile.am create mode 100644 xlators/storage/posix/src/posix.c create mode 100644 xlators/storage/posix/src/posix.h create mode 100644 xlators/storage/posix/src/xattr-cache.c create mode 100644 xlators/storage/posix/src/xattr-cache.h diff --git a/AUTHORS b/AUTHORS new file mode 100644 index 000000000..d9abdcd07 --- /dev/null +++ b/AUTHORS @@ -0,0 +1,3 @@ +CORE TEAM: +* Please visit http://www.gluster.org/core-team.php for complete list + of contributors. diff --git a/COPYING b/COPYING new file mode 100644 index 000000000..94a9ed024 --- /dev/null +++ b/COPYING @@ -0,0 +1,674 @@ + GNU GENERAL PUBLIC LICENSE + Version 3, 29 June 2007 + + Copyright (C) 2007 Free Software Foundation, Inc. <http://fsf.org/> + Everyone is permitted to copy and distribute verbatim copies + of this license document, but changing it is not allowed. + + Preamble + + The GNU General Public License is a free, copyleft license for +software and other kinds of works. + + The licenses for most software and other practical works are designed +to take away your freedom to share and change the works. By contrast, +the GNU General Public License is intended to guarantee your freedom to +share and change all versions of a program--to make sure it remains free +software for all its users. We, the Free Software Foundation, use the +GNU General Public License for most of our software; it applies also to +any other work released this way by its authors. You can apply it to +your programs, too. + + When we speak of free software, we are referring to freedom, not +price.
Our General Public Licenses are designed to make sure that you +have the freedom to distribute copies of free software (and charge for +them if you wish), that you receive source code or can get it if you +want it, that you can change the software or use pieces of it in new +free programs, and that you know you can do these things. + + To protect your rights, we need to prevent others from denying you +these rights or asking you to surrender the rights. Therefore, you have +certain responsibilities if you distribute copies of the software, or if +you modify it: responsibilities to respect the freedom of others. + + For example, if you distribute copies of such a program, whether +gratis or for a fee, you must pass on to the recipients the same +freedoms that you received. You must make sure that they, too, receive +or can get the source code. And you must show them these terms so they +know their rights. + + Developers that use the GNU GPL protect your rights with two steps: +(1) assert copyright on the software, and (2) offer you this License +giving you legal permission to copy, distribute and/or modify it. + + For the developers' and authors' protection, the GPL clearly explains +that there is no warranty for this free software. For both users' and +authors' sake, the GPL requires that modified versions be marked as +changed, so that their problems will not be attributed erroneously to +authors of previous versions. + + Some devices are designed to deny users access to install or run +modified versions of the software inside them, although the manufacturer +can do so. This is fundamentally incompatible with the aim of +protecting users' freedom to change the software. The systematic +pattern of such abuse occurs in the area of products for individuals to +use, which is precisely where it is most unacceptable. Therefore, we +have designed this version of the GPL to prohibit the practice for those +products. 
If such problems arise substantially in other domains, we +stand ready to extend this provision to those domains in future versions +of the GPL, as needed to protect the freedom of users. + + Finally, every program is threatened constantly by software patents. +States should not allow patents to restrict development and use of +software on general-purpose computers, but in those that do, we wish to +avoid the special danger that patents applied to a free program could +make it effectively proprietary. To prevent this, the GPL assures that +patents cannot be used to render the program non-free. + + The precise terms and conditions for copying, distribution and +modification follow. + + TERMS AND CONDITIONS + + 0. Definitions. + + "This License" refers to version 3 of the GNU General Public License. + + "Copyright" also means copyright-like laws that apply to other kinds of +works, such as semiconductor masks. + + "The Program" refers to any copyrightable work licensed under this +License. Each licensee is addressed as "you". "Licensees" and +"recipients" may be individuals or organizations. + + To "modify" a work means to copy from or adapt all or part of the work +in a fashion requiring copyright permission, other than the making of an +exact copy. The resulting work is called a "modified version" of the +earlier work or a work "based on" the earlier work. + + A "covered work" means either the unmodified Program or a work based +on the Program. + + To "propagate" a work means to do anything with it that, without +permission, would make you directly or secondarily liable for +infringement under applicable copyright law, except executing it on a +computer or modifying a private copy. Propagation includes copying, +distribution (with or without modification), making available to the +public, and in some countries other activities as well. + + To "convey" a work means any kind of propagation that enables other +parties to make or receive copies. 
Mere interaction with a user through +a computer network, with no transfer of a copy, is not conveying. + + An interactive user interface displays "Appropriate Legal Notices" +to the extent that it includes a convenient and prominently visible +feature that (1) displays an appropriate copyright notice, and (2) +tells the user that there is no warranty for the work (except to the +extent that warranties are provided), that licensees may convey the +work under this License, and how to view a copy of this License. If +the interface presents a list of user commands or options, such as a +menu, a prominent item in the list meets this criterion. + + 1. Source Code. + + The "source code" for a work means the preferred form of the work +for making modifications to it. "Object code" means any non-source +form of a work. + + A "Standard Interface" means an interface that either is an official +standard defined by a recognized standards body, or, in the case of +interfaces specified for a particular programming language, one that +is widely used among developers working in that language. + + The "System Libraries" of an executable work include anything, other +than the work as a whole, that (a) is included in the normal form of +packaging a Major Component, but which is not part of that Major +Component, and (b) serves only to enable use of the work with that +Major Component, or to implement a Standard Interface for which an +implementation is available to the public in source code form. A +"Major Component", in this context, means a major essential component +(kernel, window system, and so on) of the specific operating system +(if any) on which the executable work runs, or a compiler used to +produce the work, or an object code interpreter used to run it. 
+ + The "Corresponding Source" for a work in object code form means all +the source code needed to generate, install, and (for an executable +work) run the object code and to modify the work, including scripts to +control those activities. However, it does not include the work's +System Libraries, or general-purpose tools or generally available free +programs which are used unmodified in performing those activities but +which are not part of the work. For example, Corresponding Source +includes interface definition files associated with source files for +the work, and the source code for shared libraries and dynamically +linked subprograms that the work is specifically designed to require, +such as by intimate data communication or control flow between those +subprograms and other parts of the work. + + The Corresponding Source need not include anything that users +can regenerate automatically from other parts of the Corresponding +Source. + + The Corresponding Source for a work in source code form is that +same work. + + 2. Basic Permissions. + + All rights granted under this License are granted for the term of +copyright on the Program, and are irrevocable provided the stated +conditions are met. This License explicitly affirms your unlimited +permission to run the unmodified Program. The output from running a +covered work is covered by this License only if the output, given its +content, constitutes a covered work. This License acknowledges your +rights of fair use or other equivalent, as provided by copyright law. + + You may make, run and propagate covered works that you do not +convey, without conditions so long as your license otherwise remains +in force. You may convey covered works to others for the sole purpose +of having them make modifications exclusively for you, or provide you +with facilities for running those works, provided that you comply with +the terms of this License in conveying all material for which you do +not control copyright. 
Those thus making or running the covered works +for you must do so exclusively on your behalf, under your direction +and control, on terms that prohibit them from making any copies of +your copyrighted material outside their relationship with you. + + Conveying under any other circumstances is permitted solely under +the conditions stated below. Sublicensing is not allowed; section 10 +makes it unnecessary. + + 3. Protecting Users' Legal Rights From Anti-Circumvention Law. + + No covered work shall be deemed part of an effective technological +measure under any applicable law fulfilling obligations under article +11 of the WIPO copyright treaty adopted on 20 December 1996, or +similar laws prohibiting or restricting circumvention of such +measures. + + When you convey a covered work, you waive any legal power to forbid +circumvention of technological measures to the extent such circumvention +is effected by exercising rights under this License with respect to +the covered work, and you disclaim any intention to limit operation or +modification of the work as a means of enforcing, against the work's +users, your or third parties' legal rights to forbid circumvention of +technological measures. + + 4. Conveying Verbatim Copies. + + You may convey verbatim copies of the Program's source code as you +receive it, in any medium, provided that you conspicuously and +appropriately publish on each copy an appropriate copyright notice; +keep intact all notices stating that this License and any +non-permissive terms added in accord with section 7 apply to the code; +keep intact all notices of the absence of any warranty; and give all +recipients a copy of this License along with the Program. + + You may charge any price or no price for each copy that you convey, +and you may offer support or warranty protection for a fee. + + 5. Conveying Modified Source Versions. 
+ + You may convey a work based on the Program, or the modifications to +produce it from the Program, in the form of source code under the +terms of section 4, provided that you also meet all of these conditions: + + a) The work must carry prominent notices stating that you modified + it, and giving a relevant date. + + b) The work must carry prominent notices stating that it is + released under this License and any conditions added under section + 7. This requirement modifies the requirement in section 4 to + "keep intact all notices". + + c) You must license the entire work, as a whole, under this + License to anyone who comes into possession of a copy. This + License will therefore apply, along with any applicable section 7 + additional terms, to the whole of the work, and all its parts, + regardless of how they are packaged. This License gives no + permission to license the work in any other way, but it does not + invalidate such permission if you have separately received it. + + d) If the work has interactive user interfaces, each must display + Appropriate Legal Notices; however, if the Program has interactive + interfaces that do not display Appropriate Legal Notices, your + work need not make them do so. + + A compilation of a covered work with other separate and independent +works, which are not by their nature extensions of the covered work, +and which are not combined with it such as to form a larger program, +in or on a volume of a storage or distribution medium, is called an +"aggregate" if the compilation and its resulting copyright are not +used to limit the access or legal rights of the compilation's users +beyond what the individual works permit. Inclusion of a covered work +in an aggregate does not cause this License to apply to the other +parts of the aggregate. + + 6. Conveying Non-Source Forms. 
+ + You may convey a covered work in object code form under the terms +of sections 4 and 5, provided that you also convey the +machine-readable Corresponding Source under the terms of this License, +in one of these ways: + + a) Convey the object code in, or embodied in, a physical product + (including a physical distribution medium), accompanied by the + Corresponding Source fixed on a durable physical medium + customarily used for software interchange. + + b) Convey the object code in, or embodied in, a physical product + (including a physical distribution medium), accompanied by a + written offer, valid for at least three years and valid for as + long as you offer spare parts or customer support for that product + model, to give anyone who possesses the object code either (1) a + copy of the Corresponding Source for all the software in the + product that is covered by this License, on a durable physical + medium customarily used for software interchange, for a price no + more than your reasonable cost of physically performing this + conveying of source, or (2) access to copy the + Corresponding Source from a network server at no charge. + + c) Convey individual copies of the object code with a copy of the + written offer to provide the Corresponding Source. This + alternative is allowed only occasionally and noncommercially, and + only if you received the object code with such an offer, in accord + with subsection 6b. + + d) Convey the object code by offering access from a designated + place (gratis or for a charge), and offer equivalent access to the + Corresponding Source in the same way through the same place at no + further charge. You need not require recipients to copy the + Corresponding Source along with the object code. 
If the place to + copy the object code is a network server, the Corresponding Source + may be on a different server (operated by you or a third party) + that supports equivalent copying facilities, provided you maintain + clear directions next to the object code saying where to find the + Corresponding Source. Regardless of what server hosts the + Corresponding Source, you remain obligated to ensure that it is + available for as long as needed to satisfy these requirements. + + e) Convey the object code using peer-to-peer transmission, provided + you inform other peers where the object code and Corresponding + Source of the work are being offered to the general public at no + charge under subsection 6d. + + A separable portion of the object code, whose source code is excluded +from the Corresponding Source as a System Library, need not be +included in conveying the object code work. + + A "User Product" is either (1) a "consumer product", which means any +tangible personal property which is normally used for personal, family, +or household purposes, or (2) anything designed or sold for incorporation +into a dwelling. In determining whether a product is a consumer product, +doubtful cases shall be resolved in favor of coverage. For a particular +product received by a particular user, "normally used" refers to a +typical or common use of that class of product, regardless of the status +of the particular user or of the way in which the particular user +actually uses, or expects or is expected to use, the product. A product +is a consumer product regardless of whether the product has substantial +commercial, industrial or non-consumer uses, unless such uses represent +the only significant mode of use of the product. 
+ + "Installation Information" for a User Product means any methods, +procedures, authorization keys, or other information required to install +and execute modified versions of a covered work in that User Product from +a modified version of its Corresponding Source. The information must +suffice to ensure that the continued functioning of the modified object +code is in no case prevented or interfered with solely because +modification has been made. + + If you convey an object code work under this section in, or with, or +specifically for use in, a User Product, and the conveying occurs as +part of a transaction in which the right of possession and use of the +User Product is transferred to the recipient in perpetuity or for a +fixed term (regardless of how the transaction is characterized), the +Corresponding Source conveyed under this section must be accompanied +by the Installation Information. But this requirement does not apply +if neither you nor any third party retains the ability to install +modified object code on the User Product (for example, the work has +been installed in ROM). + + The requirement to provide Installation Information does not include a +requirement to continue to provide support service, warranty, or updates +for a work that has been modified or installed by the recipient, or for +the User Product in which it has been modified or installed. Access to a +network may be denied when the modification itself materially and +adversely affects the operation of the network or violates the rules and +protocols for communication across the network. + + Corresponding Source conveyed, and Installation Information provided, +in accord with this section must be in a format that is publicly +documented (and with an implementation available to the public in +source code form), and must require no special password or key for +unpacking, reading or copying. + + 7. Additional Terms. 
+ + "Additional permissions" are terms that supplement the terms of this +License by making exceptions from one or more of its conditions. +Additional permissions that are applicable to the entire Program shall +be treated as though they were included in this License, to the extent +that they are valid under applicable law. If additional permissions +apply only to part of the Program, that part may be used separately +under those permissions, but the entire Program remains governed by +this License without regard to the additional permissions. + + When you convey a copy of a covered work, you may at your option +remove any additional permissions from that copy, or from any part of +it. (Additional permissions may be written to require their own +removal in certain cases when you modify the work.) You may place +additional permissions on material, added by you to a covered work, +for which you have or can give appropriate copyright permission. + + Notwithstanding any other provision of this License, for material you +add to a covered work, you may (if authorized by the copyright holders of +that material) supplement the terms of this License with terms: + + a) Disclaiming warranty or limiting liability differently from the + terms of sections 15 and 16 of this License; or + + b) Requiring preservation of specified reasonable legal notices or + author attributions in that material or in the Appropriate Legal + Notices displayed by works containing it; or + + c) Prohibiting misrepresentation of the origin of that material, or + requiring that modified versions of such material be marked in + reasonable ways as different from the original version; or + + d) Limiting the use for publicity purposes of names of licensors or + authors of the material; or + + e) Declining to grant rights under trademark law for use of some + trade names, trademarks, or service marks; or + + f) Requiring indemnification of licensors and authors of that + material by anyone who conveys the 
material (or modified versions of + it) with contractual assumptions of liability to the recipient, for + any liability that these contractual assumptions directly impose on + those licensors and authors. + + All other non-permissive additional terms are considered "further +restrictions" within the meaning of section 10. If the Program as you +received it, or any part of it, contains a notice stating that it is +governed by this License along with a term that is a further +restriction, you may remove that term. If a license document contains +a further restriction but permits relicensing or conveying under this +License, you may add to a covered work material governed by the terms +of that license document, provided that the further restriction does +not survive such relicensing or conveying. + + If you add terms to a covered work in accord with this section, you +must place, in the relevant source files, a statement of the +additional terms that apply to those files, or a notice indicating +where to find the applicable terms. + + Additional terms, permissive or non-permissive, may be stated in the +form of a separately written license, or stated as exceptions; +the above requirements apply either way. + + 8. Termination. + + You may not propagate or modify a covered work except as expressly +provided under this License. Any attempt otherwise to propagate or +modify it is void, and will automatically terminate your rights under +this License (including any patent licenses granted under the third +paragraph of section 11). + + However, if you cease all violation of this License, then your +license from a particular copyright holder is reinstated (a) +provisionally, unless and until the copyright holder explicitly and +finally terminates your license, and (b) permanently, if the copyright +holder fails to notify you of the violation by some reasonable means +prior to 60 days after the cessation. 
+ + Moreover, your license from a particular copyright holder is +reinstated permanently if the copyright holder notifies you of the +violation by some reasonable means, this is the first time you have +received notice of violation of this License (for any work) from that +copyright holder, and you cure the violation prior to 30 days after +your receipt of the notice. + + Termination of your rights under this section does not terminate the +licenses of parties who have received copies or rights from you under +this License. If your rights have been terminated and not permanently +reinstated, you do not qualify to receive new licenses for the same +material under section 10. + + 9. Acceptance Not Required for Having Copies. + + You are not required to accept this License in order to receive or +run a copy of the Program. Ancillary propagation of a covered work +occurring solely as a consequence of using peer-to-peer transmission +to receive a copy likewise does not require acceptance. However, +nothing other than this License grants you permission to propagate or +modify any covered work. These actions infringe copyright if you do +not accept this License. Therefore, by modifying or propagating a +covered work, you indicate your acceptance of this License to do so. + + 10. Automatic Licensing of Downstream Recipients. + + Each time you convey a covered work, the recipient automatically +receives a license from the original licensors, to run, modify and +propagate that work, subject to this License. You are not responsible +for enforcing compliance by third parties with this License. + + An "entity transaction" is a transaction transferring control of an +organization, or substantially all assets of one, or subdividing an +organization, or merging organizations. 
If propagation of a covered +work results from an entity transaction, each party to that +transaction who receives a copy of the work also receives whatever +licenses to the work the party's predecessor in interest had or could +give under the previous paragraph, plus a right to possession of the +Corresponding Source of the work from the predecessor in interest, if +the predecessor has it or can get it with reasonable efforts. + + You may not impose any further restrictions on the exercise of the +rights granted or affirmed under this License. For example, you may +not impose a license fee, royalty, or other charge for exercise of +rights granted under this License, and you may not initiate litigation +(including a cross-claim or counterclaim in a lawsuit) alleging that +any patent claim is infringed by making, using, selling, offering for +sale, or importing the Program or any portion of it. + + 11. Patents. + + A "contributor" is a copyright holder who authorizes use under this +License of the Program or a work on which the Program is based. The +work thus licensed is called the contributor's "contributor version". + + A contributor's "essential patent claims" are all patent claims +owned or controlled by the contributor, whether already acquired or +hereafter acquired, that would be infringed by some manner, permitted +by this License, of making, using, or selling its contributor version, +but do not include claims that would be infringed only as a +consequence of further modification of the contributor version. For +purposes of this definition, "control" includes the right to grant +patent sublicenses in a manner consistent with the requirements of +this License. + + Each contributor grants you a non-exclusive, worldwide, royalty-free +patent license under the contributor's essential patent claims, to +make, use, sell, offer for sale, import and otherwise run, modify and +propagate the contents of its contributor version. 
+ + In the following three paragraphs, a "patent license" is any express +agreement or commitment, however denominated, not to enforce a patent +(such as an express permission to practice a patent or covenant not to +sue for patent infringement). To "grant" such a patent license to a +party means to make such an agreement or commitment not to enforce a +patent against the party. + + If you convey a covered work, knowingly relying on a patent license, +and the Corresponding Source of the work is not available for anyone +to copy, free of charge and under the terms of this License, through a +publicly available network server or other readily accessible means, +then you must either (1) cause the Corresponding Source to be so +available, or (2) arrange to deprive yourself of the benefit of the +patent license for this particular work, or (3) arrange, in a manner +consistent with the requirements of this License, to extend the patent +license to downstream recipients. "Knowingly relying" means you have +actual knowledge that, but for the patent license, your conveying the +covered work in a country, or your recipient's use of the covered work +in a country, would infringe one or more identifiable patents in that +country that you have reason to believe are valid. + + If, pursuant to or in connection with a single transaction or +arrangement, you convey, or propagate by procuring conveyance of, a +covered work, and grant a patent license to some of the parties +receiving the covered work authorizing them to use, propagate, modify +or convey a specific copy of the covered work, then the patent license +you grant is automatically extended to all recipients of the covered +work and works based on it. + + A patent license is "discriminatory" if it does not include within +the scope of its coverage, prohibits the exercise of, or is +conditioned on the non-exercise of one or more of the rights that are +specifically granted under this License. 
You may not convey a covered +work if you are a party to an arrangement with a third party that is +in the business of distributing software, under which you make payment +to the third party based on the extent of your activity of conveying +the work, and under which the third party grants, to any of the +parties who would receive the covered work from you, a discriminatory +patent license (a) in connection with copies of the covered work +conveyed by you (or copies made from those copies), or (b) primarily +for and in connection with specific products or compilations that +contain the covered work, unless you entered into that arrangement, +or that patent license was granted, prior to 28 March 2007. + + Nothing in this License shall be construed as excluding or limiting +any implied license or other defenses to infringement that may +otherwise be available to you under applicable patent law. + + 12. No Surrender of Others' Freedom. + + If conditions are imposed on you (whether by court order, agreement or +otherwise) that contradict the conditions of this License, they do not +excuse you from the conditions of this License. If you cannot convey a +covered work so as to satisfy simultaneously your obligations under this +License and any other pertinent obligations, then as a consequence you may +not convey it at all. For example, if you agree to terms that obligate you +to collect a royalty for further conveying from those to whom you convey +the Program, the only way you could satisfy both those terms and this +License would be to refrain entirely from conveying the Program. + + 13. Use with the GNU Affero General Public License. + + Notwithstanding any other provision of this License, you have +permission to link or combine any covered work with a work licensed +under version 3 of the GNU Affero General Public License into a single +combined work, and to convey the resulting work. 
The terms of this +License will continue to apply to the part which is the covered work, +but the special requirements of the GNU Affero General Public License, +section 13, concerning interaction through a network will apply to the +combination as such. + + 14. Revised Versions of this License. + + The Free Software Foundation may publish revised and/or new versions of +the GNU General Public License from time to time. Such new versions will +be similar in spirit to the present version, but may differ in detail to +address new problems or concerns. + + Each version is given a distinguishing version number. If the +Program specifies that a certain numbered version of the GNU General +Public License "or any later version" applies to it, you have the +option of following the terms and conditions either of that numbered +version or of any later version published by the Free Software +Foundation. If the Program does not specify a version number of the +GNU General Public License, you may choose any version ever published +by the Free Software Foundation. + + If the Program specifies that a proxy can decide which future +versions of the GNU General Public License can be used, that proxy's +public statement of acceptance of a version permanently authorizes you +to choose that version for the Program. + + Later license versions may give you additional or different +permissions. However, no additional obligations are imposed on any +author or copyright holder as a result of your choosing to follow a +later version. + + 15. Disclaimer of Warranty. + + THERE IS NO WARRANTY FOR THE PROGRAM, TO THE EXTENT PERMITTED BY +APPLICABLE LAW. EXCEPT WHEN OTHERWISE STATED IN WRITING THE COPYRIGHT +HOLDERS AND/OR OTHER PARTIES PROVIDE THE PROGRAM "AS IS" WITHOUT WARRANTY +OF ANY KIND, EITHER EXPRESSED OR IMPLIED, INCLUDING, BUT NOT LIMITED TO, +THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +PURPOSE. 
THE ENTIRE RISK AS TO THE QUALITY AND PERFORMANCE OF THE PROGRAM +IS WITH YOU. SHOULD THE PROGRAM PROVE DEFECTIVE, YOU ASSUME THE COST OF +ALL NECESSARY SERVICING, REPAIR OR CORRECTION. + + 16. Limitation of Liability. + + IN NO EVENT UNLESS REQUIRED BY APPLICABLE LAW OR AGREED TO IN WRITING +WILL ANY COPYRIGHT HOLDER, OR ANY OTHER PARTY WHO MODIFIES AND/OR CONVEYS +THE PROGRAM AS PERMITTED ABOVE, BE LIABLE TO YOU FOR DAMAGES, INCLUDING ANY +GENERAL, SPECIAL, INCIDENTAL OR CONSEQUENTIAL DAMAGES ARISING OUT OF THE +USE OR INABILITY TO USE THE PROGRAM (INCLUDING BUT NOT LIMITED TO LOSS OF +DATA OR DATA BEING RENDERED INACCURATE OR LOSSES SUSTAINED BY YOU OR THIRD +PARTIES OR A FAILURE OF THE PROGRAM TO OPERATE WITH ANY OTHER PROGRAMS), +EVEN IF SUCH HOLDER OR OTHER PARTY HAS BEEN ADVISED OF THE POSSIBILITY OF +SUCH DAMAGES. + + 17. Interpretation of Sections 15 and 16. + + If the disclaimer of warranty and limitation of liability provided +above cannot be given local legal effect according to their terms, +reviewing courts shall apply local law that most closely approximates +an absolute waiver of all civil liability in connection with the +Program, unless a warranty or assumption of liability accompanies a +copy of the Program in return for a fee. + + END OF TERMS AND CONDITIONS + + How to Apply These Terms to Your New Programs + + If you develop a new program, and you want it to be of the greatest +possible use to the public, the best way to achieve this is to make it +free software which everyone can redistribute and change under these terms. + + To do so, attach the following notices to the program. It is safest +to attach them to the start of each source file to most effectively +state the exclusion of warranty; and each file should have at least +the "copyright" line and a pointer to where the full notice is found. 
+ + <one line to give the program's name and a brief idea of what it does.> + Copyright (C) <year> <name of author> + + This program is free software: you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation, either version 3 of the License, or + (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program. If not, see <http://www.gnu.org/licenses/>. + +Also add information on how to contact you by electronic and paper mail. + + If the program does terminal interaction, make it output a short +notice like this when it starts in an interactive mode: + + <program> Copyright (C) <year> <name of author> + This program comes with ABSOLUTELY NO WARRANTY; for details type `show w'. + This is free software, and you are welcome to redistribute it + under certain conditions; type `show c' for details. + +The hypothetical commands `show w' and `show c' should show the appropriate +parts of the General Public License. Of course, your program's commands +might be different; for a GUI interface, you would use an "about box". + + You should also get your employer (if you work as a programmer) or school, +if any, to sign a "copyright disclaimer" for the program, if necessary. +For more information on this, and how to apply and follow the GNU GPL, see +<http://www.gnu.org/licenses/>. + + The GNU General Public License does not permit incorporating your program +into proprietary programs. If your program is a subroutine library, you +may consider it more useful to permit linking proprietary applications with +the library. If this is what you want to do, use the GNU Lesser General +Public License instead of this License. But first, please read +<http://www.gnu.org/philosophy/why-not-lgpl.html>.
diff --git a/ChangeLog b/ChangeLog new file mode 100644 index 000000000..800cdf327 --- /dev/null +++ b/ChangeLog @@ -0,0 +1 @@ +ChangeLog is maintained by "tla changelog". diff --git a/INSTALL b/INSTALL new file mode 100644 index 000000000..88e28999d --- /dev/null +++ b/INSTALL @@ -0,0 +1,32 @@ +Installation Instructions +************************* + +Run ./configure after untarring the package. + + bash# ./configure + GlusterFS configure summary + =========================== + FUSE client : yes + Infiniband verbs : yes + epoll IO multiplex : yes + Berkeley-DB : yes + libglusterfsclient : yes + mod_glusterfs : yes + argp-standalone : no + +The configure summary will tell you what all components will be built with +GlusterFS. Other than 'argp-standalone' if something else says 'no', that +feature in GlusterFS will not be built. 'argp-standalone' package will only +be used if the system doesn't have a proper argp package installed. + +Now just run 'make' and later run 'make install' to install the package. + + bash# make + bash# make install + +Installation complete :-) + + bash# glusterfs --version + +Make sure your version is the latest from the release, and the one you +just installed :-) diff --git a/Makefile.am b/Makefile.am new file mode 100644 index 000000000..6c7988a1c --- /dev/null +++ b/Makefile.am @@ -0,0 +1,15 @@ +EXTRA_DIST = autogen.sh COPYING INSTALL README AUTHORS THANKS NEWS glusterfs.spec + +SUBDIRS = argp-standalone libglusterfs $(LIBGLUSTERFSCLIENT_SUBDIR) xlators scheduler transport auth glusterfsd $(MOD_GLUSTERFS_SUBDIR) $(GF_BOOSTER_SUBDIR) doc extras + +CLEANFILES = + +tlaclean: distclean + find . -name Makefile.in -exec rm -f {} \; + find . -name Makefile -exec rm -f {} \; + find .
-name mount.glusterfs -exec rm -f {} \; + rm -fr autom4te.cache + rm -f missing aclocal.m4 config.h.in config.guess config.sub ltmain.sh install-sh configure depcomp + rm -fr argp-standalone/autom4te.cache + rm -f argp-standalone/aclocal.m4 argp-standalone/config.h.in argp-standalone/configure argp-standalone/depcomp argp-standalone/install-sh argp-standalone/missing + rm -fr mod_glusterfs/apache-1.3/src/.deps transport/ib-verbs/src/.deps diff --git a/NEWS b/NEWS new file mode 100644 index 000000000..39a37cd93 --- /dev/null +++ b/NEWS @@ -0,0 +1 @@ +Please visit http://www.gluster.org/news.php for news updates. diff --git a/README b/README new file mode 100644 index 000000000..8c6cb42ce --- /dev/null +++ b/README @@ -0,0 +1,9 @@ +GlusterFS is a powerful network/cluster filesystem. Posix compliant. You +can keep scaling your storage beyond peta bytes as your demand increases. + +GlusterFS natively supports Infiniband verbs calls to do RDMA between remote +machine to get the maximum I/O throughput. + +It has on the fly replication and striping as inbuilt options. + +Please visit http://www.gluster.org/glusterfs.php for more info. diff --git a/THANKS b/THANKS new file mode 100644 index 000000000..5e86b41a7 --- /dev/null +++ b/THANKS @@ -0,0 +1,3 @@ + +* http://www.gluster.org/glusterfs-thanks.php + diff --git a/argp-standalone/Makefile.am b/argp-standalone/Makefile.am new file mode 100644 index 000000000..4775d4876 --- /dev/null +++ b/argp-standalone/Makefile.am @@ -0,0 +1,38 @@ +# From glibc + +# Copyright (C) 1997, 2003, 2004 Free Software Foundation, Inc. +# This file is part of the GNU C Library. + +# The GNU C Library is free software; you can redistribute it and/or +# modify it under the terms of the GNU Library General Public License as +# published by the Free Software Foundation; either version 2 of the +# License, or (at your option) any later version. 
+ +# The GNU C Library is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +# Library General Public License for more details. + +# You should have received a copy of the GNU Library General Public +# License along with the GNU C Library; see the file COPYING.LIB. If +# not, write to the Free Software Foundation, Inc., +# 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. +AUTOMAKE_OPTIONS = foreign +SUBDIRS = . + +LIBOBJS = @LIBOBJS@ + +noinst_LIBRARIES = libargp.a + +noinst_HEADERS = argp.h argp-fmtstream.h argp-namefrob.h + +EXTRA_DIST = mempcpy.c strchrnul.c strndup.c strcasecmp.c vsnprintf.c autogen.sh + +# Leaves out argp-fs-xinl.c and argp-xinl.c +libargp_a_SOURCES = argp-ba.c argp-eexst.c argp-fmtstream.c \ + argp-help.c argp-parse.c argp-pv.c \ + argp-pvh.c + +libargp_a_LIBADD = $(LIBOBJS) + + diff --git a/argp-standalone/acinclude.m4 b/argp-standalone/acinclude.m4 new file mode 100644 index 000000000..fb61e957d --- /dev/null +++ b/argp-standalone/acinclude.m4 @@ -0,0 +1,1084 @@ +dnl Try to detect the type of the third arg to getsockname() et al +AC_DEFUN([LSH_TYPE_SOCKLEN_T], +[AH_TEMPLATE([socklen_t], [Length type used by getsockopt]) +AC_CACHE_CHECK([for socklen_t in sys/socket.h], ac_cv_type_socklen_t, +[AC_EGREP_HEADER(socklen_t, sys/socket.h, + [ac_cv_type_socklen_t=yes], [ac_cv_type_socklen_t=no])]) +if test $ac_cv_type_socklen_t = no; then + AC_MSG_CHECKING(for AIX) + AC_EGREP_CPP(yes, [ +#ifdef _AIX + yes +#endif +],[ +AC_MSG_RESULT(yes) +AC_DEFINE(socklen_t, size_t) +],[ +AC_MSG_RESULT(no) +AC_DEFINE(socklen_t, int) +]) +fi +]) + +dnl Choose cc flags for compiling position independent code +AC_DEFUN([LSH_CCPIC], +[AC_MSG_CHECKING(CCPIC) +AC_CACHE_VAL(lsh_cv_sys_ccpic,[ + if test -z "$CCPIC" ; then + if test "$GCC" = yes ; then + case `uname -sr` in + BSD/OS*) + case `uname -r` in + 4.*) CCPIC="-fPIC";; + *) 
CCPIC="";; + esac + ;; + Darwin*) + CCPIC="-fPIC" + ;; + SunOS\ 5.*) + # Could also use -fPIC, if there are a large number of symbol reference + CCPIC="-fPIC" + ;; + CYGWIN*) + CCPIC="" + ;; + *) + CCPIC="-fpic" + ;; + esac + else + case `uname -sr` in + Darwin*) + CCPIC="-fPIC" + ;; + IRIX*) + CCPIC="-share" + ;; + hp*|HP*) CCPIC="+z"; ;; + FreeBSD*) CCPIC="-fpic";; + SCO_SV*) CCPIC="-KPIC -dy -Bdynamic";; + UnixWare*|OpenUNIX*) CCPIC="-KPIC -dy -Bdynamic";; + Solaris*) CCPIC="-KPIC -Bdynamic";; + Windows_NT*) CCPIC="-shared" ;; + esac + fi + fi + OLD_CFLAGS="$CFLAGS" + CFLAGS="$CFLAGS $CCPIC" + AC_TRY_COMPILE([], [exit(0);], + lsh_cv_sys_ccpic="$CCPIC", lsh_cv_sys_ccpic='') + CFLAGS="$OLD_CFLAGS" +]) +CCPIC="$lsh_cv_sys_ccpic" +AC_MSG_RESULT($CCPIC) +AC_SUBST([CCPIC])]) + +dnl LSH_PATH_ADD(path-id, directory) +AC_DEFUN([LSH_PATH_ADD], +[AC_MSG_CHECKING($2) +ac_exists=no +if test -d "$2/." ; then + ac_real_dir=`cd $2 && pwd` + if test -n "$ac_real_dir" ; then + ac_exists=yes + ac_found=no + for old in [$]$1_REAL_DIRS ; do + if test x$ac_real_dir = x$old ; then + ac_found=yes; + break; + fi + done + if test $ac_found = yes ; then + AC_MSG_RESULT(already added) + else + AC_MSG_RESULT(added) + # LDFLAGS="$LDFLAGS -L $2" + $1_REAL_DIRS="$ac_real_dir [$]$1_REAL_DIRS" + $1_DIRS="$2 [$]$1_DIRS" + fi + fi +fi +if test $ac_exists = no ; then + AC_MSG_RESULT(not found) +fi +]) + +dnl LSH_RPATH_ADD(dir) +AC_DEFUN([LSH_RPATH_ADD], [LSH_PATH_ADD(RPATH_CANDIDATE, $1)]) + +dnl LSH_RPATH_INIT(candidates) +AC_DEFUN([LSH_RPATH_INIT], +[AC_MSG_CHECKING([for -R flag]) +RPATHFLAG='' +case `uname -sr` in + OSF1\ V4.*) + RPATHFLAG="-rpath " + ;; + IRIX\ 6.*) + RPATHFLAG="-rpath " + ;; + IRIX\ 5.*) + RPATHFLAG="-rpath " + ;; + SunOS\ 5.*) + if test "$TCC" = "yes"; then + # tcc doesn't know about -R + RPATHFLAG="-Wl,-R," + else + RPATHFLAG=-R + fi + ;; + Linux\ 2.*) + RPATHFLAG="-Wl,-rpath," + ;; + *) + : + ;; +esac + +if test x$RPATHFLAG = x ; then + AC_MSG_RESULT(none) +else +
AC_MSG_RESULT([using $RPATHFLAG]) +fi + +RPATH_CANDIDATE_REAL_DIRS='' +RPATH_CANDIDATE_DIRS='' + +AC_MSG_RESULT([Searching for libraries]) + +for d in $1 ; do + LSH_RPATH_ADD($d) +done +]) + +dnl Try to execute a main program, and if it fails, try adding some +dnl -R flag. +dnl LSH_RPATH_FIX +AC_DEFUN([LSH_RPATH_FIX], +[if test $cross_compiling = no -a "x$RPATHFLAG" != x ; then + ac_success=no + AC_TRY_RUN([int main(int argc, char **argv) { return 0; }], + ac_success=yes, ac_success=no, :) + + if test $ac_success = no ; then + AC_MSG_CHECKING([Running simple test program failed. Trying -R flags]) +dnl echo RPATH_CANDIDATE_DIRS = $RPATH_CANDIDATE_DIRS + ac_remaining_dirs='' + ac_rpath_save_LDFLAGS="$LDFLAGS" + for d in $RPATH_CANDIDATE_DIRS ; do + if test $ac_success = yes ; then + ac_remaining_dirs="$ac_remaining_dirs $d" + else + LDFLAGS="$RPATHFLAG$d $LDFLAGS" +dnl echo LDFLAGS = $LDFLAGS + AC_TRY_RUN([int main(int argc, char **argv) { return 0; }], + [ac_success=yes + ac_rpath_save_LDFLAGS="$LDFLAGS" + AC_MSG_RESULT([adding $RPATHFLAG$d]) + ], + [ac_remaining_dirs="$ac_remaining_dirs $d"], :) + LDFLAGS="$ac_rpath_save_LDFLAGS" + fi + done + RPATH_CANDIDATE_DIRS=$ac_remaining_dirs + fi + if test $ac_success = no ; then + AC_MSG_RESULT(failed) + fi +fi +]) + +dnl Like AC_CHECK_LIB, but uses $KRB_LIBS rather than $LIBS. +dnl LSH_CHECK_KRB_LIB(LIBRARY, FUNCTION, [, ACTION-IF-FOUND [, +dnl ACTION-IF-NOT-FOUND [, OTHER-LIBRARIES]]]) + +AC_DEFUN([LSH_CHECK_KRB_LIB], +[AC_CHECK_LIB([$1], [$2], + ifelse([$3], , + [[ac_tr_lib=HAVE_LIB`echo $1 | sed -e 's/[^a-zA-Z0-9_]/_/g' \ + -e 'y/abcdefghijklmnopqrstuvwxyz/ABCDEFGHIJKLMNOPQRSTUVWXYZ/'` + AC_DEFINE_UNQUOTED($ac_tr_lib) + KRB_LIBS="-l$1 $KRB_LIBS" + ]], [$3]), + ifelse([$4], , , [$4 +])dnl +, [$5 $KRB_LIBS]) +]) + +dnl LSH_LIB_ARGP(ACTION-IF-OK, ACTION-IF-BAD) +AC_DEFUN([LSH_LIB_ARGP], +[ ac_argp_save_LIBS="$LIBS" + ac_argp_save_LDFLAGS="$LDFLAGS" + ac_argp_ok=no + # First check if we can link with argp. 
+ AC_SEARCH_LIBS(argp_parse, argp, + [ LSH_RPATH_FIX + AC_CACHE_CHECK([for working argp], + lsh_cv_lib_argp_works, + [ AC_TRY_RUN( +[#include <argp.h> +#include <stdlib.h> + +static const struct argp_option +options[] = +{ + { NULL, 0, NULL, 0, NULL, 0 } +}; + +struct child_state +{ + int n; +}; + +static error_t +child_parser(int key, char *arg, struct argp_state *state) +{ + struct child_state *input = (struct child_state *) state->input; + + switch(key) + { + default: + return ARGP_ERR_UNKNOWN; + case ARGP_KEY_END: + if (!input->n) + input->n = 1; + break; + } + return 0; +} + +const struct argp child_argp = +{ + options, + child_parser, + NULL, NULL, NULL, NULL, NULL +}; + +struct main_state +{ + struct child_state child; + int m; +}; + +static error_t +main_parser(int key, char *arg, struct argp_state *state) +{ + struct main_state *input = (struct main_state *) state->input; + + switch(key) + { + default: + return ARGP_ERR_UNKNOWN; + case ARGP_KEY_INIT: + state->child_inputs[0] = &input->child; + break; + case ARGP_KEY_END: + if (!input->m) + input->m = input->child.n; + + break; + } + return 0; +} + +static const struct argp_child +main_children[] = +{ + { &child_argp, 0, "", 0 }, + { NULL, 0, NULL, 0} +}; + +static const struct argp +main_argp = +{ options, main_parser, + NULL, + NULL, + main_children, + NULL, NULL +}; + +int main(int argc, char **argv) +{ + struct main_state input = { { 0 }, 0 }; + char *v[2] = { "foo", NULL }; + + argp_parse(&main_argp, 1, v, 0, NULL, &input); + + if ( (input.m == 1) && (input.child.n == 1) ) + return 0; + else + return 1; +} +], lsh_cv_lib_argp_works=yes, + lsh_cv_lib_argp_works=no, + lsh_cv_lib_argp_works=no)]) + + if test x$lsh_cv_lib_argp_works = xyes ; then + ac_argp_ok=yes + else + # Reset link flags + LIBS="$ac_argp_save_LIBS" + LDFLAGS="$ac_argp_save_LDFLAGS" + fi]) + + if test x$ac_argp_ok = xyes ; then + ifelse([$1],, true, [$1]) + else + ifelse([$2],, true, [$2]) + fi +]) + +dnl LSH_GCC_ATTRIBUTES +dnl Check for gcc's
__attribute__ construction + +AC_DEFUN([LSH_GCC_ATTRIBUTES], +[AC_CACHE_CHECK(for __attribute__, + lsh_cv_c_attribute, +[ AC_TRY_COMPILE([ +#include <stdlib.h> +], +[ +static void foo(void) __attribute__ ((noreturn)); + +static void __attribute__ ((noreturn)) +foo(void) +{ + exit(1); +} +], +lsh_cv_c_attribute=yes, +lsh_cv_c_attribute=no)]) + +AH_TEMPLATE([HAVE_GCC_ATTRIBUTE], [Define if the compiler understands __attribute__]) +if test "x$lsh_cv_c_attribute" = "xyes"; then + AC_DEFINE(HAVE_GCC_ATTRIBUTE) +fi + +AH_BOTTOM( +[#if __GNUC__ || HAVE_GCC_ATTRIBUTE +# define NORETURN __attribute__ ((__noreturn__)) +# define PRINTF_STYLE(f, a) __attribute__ ((__format__ (__printf__, f, a))) +# define UNUSED __attribute__ ((__unused__)) +#else +# define NORETURN +# define PRINTF_STYLE(f, a) +# define UNUSED +#endif +])]) + +AC_DEFUN([LSH_GCC_FUNCTION_NAME], +[# Check for gcc's __FUNCTION__ variable +AH_TEMPLATE([HAVE_GCC_FUNCTION], + [Define if the compiler understands __FUNCTION__]) +AH_BOTTOM( +[#if HAVE_GCC_FUNCTION +# define FUNCTION_NAME __FUNCTION__ +#else +# define FUNCTION_NAME "Unknown" +#endif +]) + +AC_CACHE_CHECK(for __FUNCTION__, + lsh_cv_c_FUNCTION, + [ AC_TRY_COMPILE(, + [ #if __GNUC__ == 3 + # error __FUNCTION__ is broken in gcc-3 + #endif + void foo(void) { char c = __FUNCTION__[0]; } ], + lsh_cv_c_FUNCTION=yes, + lsh_cv_c_FUNCTION=no)]) + +if test "x$lsh_cv_c_FUNCTION" = "xyes"; then + AC_DEFINE(HAVE_GCC_FUNCTION) +fi +]) + +# Check for alloca, and include the standard blurb in config.h +AC_DEFUN([LSH_FUNC_ALLOCA], +[AC_FUNC_ALLOCA +AC_CHECK_HEADERS([malloc.h]) +AH_BOTTOM( +[/* AIX requires this to be the first thing in the file.
*/ +#ifndef __GNUC__ +# if HAVE_ALLOCA_H +# include <alloca.h> +# else +# ifdef _AIX + #pragma alloca +# else +# ifndef alloca /* predefined by HP cc +Olibcalls */ +char *alloca (); +# endif +# endif +# endif +#else /* defined __GNUC__ */ +# if HAVE_ALLOCA_H +# include <alloca.h> +# endif +#endif +/* Needed for alloca on windows */ +#if HAVE_MALLOC_H +# include <malloc.h> +#endif +])]) + +AC_DEFUN([LSH_FUNC_STRERROR], +[AC_CHECK_FUNCS(strerror) +AH_BOTTOM( +[#if HAVE_STRERROR +#define STRERROR strerror +#else +#define STRERROR(x) (sys_errlist[x]) +#endif +])]) + +AC_DEFUN([LSH_FUNC_STRSIGNAL], +[AC_CHECK_FUNCS(strsignal) +AC_CHECK_DECLS([sys_siglist, _sys_siglist]) +AH_BOTTOM( +[#if HAVE_STRSIGNAL +# define STRSIGNAL strsignal +#else /* !HAVE_STRSIGNAL */ +# if HAVE_DECL_SYS_SIGLIST +# define STRSIGNAL(x) (sys_siglist[x]) +# else +# if HAVE_DECL__SYS_SIGLIST +# define STRSIGNAL(x) (_sys_siglist[x]) +# else +# define STRSIGNAL(x) "Unknown signal" +# if __GNUC__ +# warning Using dummy STRSIGNAL +# endif +# endif +# endif +#endif /* !HAVE_STRSIGNAL */ +])]) + +dnl LSH_MAKE_CONDITIONAL(symbol, test) +AC_DEFUN([LSH_MAKE_CONDITIONAL], +[if $2 ; then + IF_$1='' + UNLESS_$1='# ' +else + IF_$1='# ' + UNLESS_$1='' +fi +AC_SUBST(IF_$1) +AC_SUBST(UNLESS_$1)]) + +dnl LSH_DEPENDENCY_TRACKING + +dnl Defines compiler flags DEP_FLAGS to generate dependency +dnl information, and DEP_PROCESS that is any shell commands needed for +dnl massaging the dependency information further. Dependencies are +dnl generated as a side effect of compilation. Dependency files +dnl themselves are not treated as targets. + +AC_DEFUN([LSH_DEPENDENCY_TRACKING], +[AC_ARG_ENABLE(dependency_tracking, + AC_HELP_STRING([--disable-dependency-tracking], + [Disable dependency tracking.
Dependency tracking doesn't work with BSD make]),, + [enable_dependency_tracking=yes]) + +DEP_FLAGS='' +DEP_PROCESS='true' +if test x$enable_dependency_tracking = xyes ; then + if test x$GCC = xyes ; then + gcc_version=`gcc --version | head -1` + case "$gcc_version" in + 2.*|*[[!0-9.]]2.*) + enable_dependency_tracking=no + AC_MSG_WARN([Dependency tracking disabled, gcc-3.x is needed]) + ;; + *) + DEP_FLAGS='-MT $[]@ -MD -MP -MF $[]@.d' + DEP_PROCESS='true' + ;; + esac + else + enable_dependency_tracking=no + AC_MSG_WARN([Dependency tracking disabled]) + fi +fi + +if test x$enable_dependency_tracking = xyes ; then + DEP_INCLUDE='include ' +else + DEP_INCLUDE='# ' +fi + +AC_SUBST([DEP_INCLUDE]) +AC_SUBST([DEP_FLAGS]) +AC_SUBST([DEP_PROCESS])]) + +dnl @synopsis AX_CREATE_STDINT_H [( HEADER-TO-GENERATE [, HEADERS-TO-CHECK])] +dnl +dnl the "ISO C9X: 7.18 Integer types <stdint.h>" section requires the +dnl existence of an include file <stdint.h> that defines a set of +dnl typedefs, especially uint8_t,int32_t,uintptr_t. +dnl Many older installations will not provide this file, but some will +dnl have the very same definitions in <inttypes.h>. In other environments +dnl we can use the inet-types in <sys/types.h> which would define the +dnl typedefs int8_t and u_int8_t respectively. +dnl +dnl This macro will create a local "_stdint.h" or the headerfile given as +dnl an argument. In many cases that file will just "#include <stdint.h>" +dnl or "#include <inttypes.h>", while in other environments it will provide +dnl the set of basic 'stdint's definitions/typedefs: +dnl int8_t,uint8_t,int16_t,uint16_t,int32_t,uint32_t,intptr_t,uintptr_t +dnl int_least32_t.. int_fast32_t.. intmax_t +dnl which may or may not rely on the definitions of other files, +dnl or using the AC_CHECK_SIZEOF macro to determine the actual +dnl sizeof each type. +dnl +dnl if your header files require the stdint-types you will want to create an +dnl installable file mylib-int.h that all your other installable header +dnl may include.
So if you have a library package named "mylib", just use +dnl AX_CREATE_STDINT_H(mylib-int.h) +dnl in configure.ac and go to install that very header file in Makefile.am +dnl along with the other headers (mylib.h) - and the mylib-specific headers +dnl can simply use "#include " to obtain the stdint-types. +dnl +dnl Remember, if the system already had a valid , the generated +dnl file will include it directly. No need for fuzzy HAVE_STDINT_H things... +dnl +dnl @, (status: used on new platforms) (see http://ac-archive.sf.net/gstdint/) +dnl @version $Id: acinclude.m4,v 1.27 2004/11/23 21:27:35 nisse Exp $ +dnl @author Guido Draheim + +AC_DEFUN([AX_CREATE_STDINT_H], +[# ------ AX CREATE STDINT H ------------------------------------- +AC_MSG_CHECKING([for stdint types]) +ac_stdint_h=`echo ifelse($1, , _stdint.h, $1)` +# try to shortcircuit - if the default include path of the compiler +# can find a "stdint.h" header then we assume that all compilers can. +AC_CACHE_VAL([ac_cv_header_stdint_t],[ +old_CXXFLAGS="$CXXFLAGS" ; CXXFLAGS="" +old_CPPFLAGS="$CPPFLAGS" ; CPPFLAGS="" +old_CFLAGS="$CFLAGS" ; CFLAGS="" +AC_TRY_COMPILE([#include ],[int_least32_t v = 0;], +[ac_cv_stdint_result="(assuming C99 compatible system)" + ac_cv_header_stdint_t="stdint.h"; ], +[ac_cv_header_stdint_t=""]) +CXXFLAGS="$old_CXXFLAGS" +CPPFLAGS="$old_CPPFLAGS" +CFLAGS="$old_CFLAGS" ]) + +v="... $ac_cv_header_stdint_h" +if test "$ac_stdint_h" = "stdint.h" ; then + AC_MSG_RESULT([(are you sure you want them in ./stdint.h?)]) +elif test "$ac_stdint_h" = "inttypes.h" ; then + AC_MSG_RESULT([(are you sure you want them in ./inttypes.h?)]) +elif test "_$ac_cv_header_stdint_t" = "_" ; then + AC_MSG_RESULT([(putting them into $ac_stdint_h)$v]) +else + ac_cv_header_stdint="$ac_cv_header_stdint_t" + AC_MSG_RESULT([$ac_cv_header_stdint (shortcircuit)]) +fi + +if test "_$ac_cv_header_stdint_t" = "_" ; then # can not shortcircuit.. + +dnl .....intro message done, now do a few system checks..... 
+dnl btw, all CHECK_TYPE macros do automatically "DEFINE" a type, therefore +dnl we use the autoconf implementation detail _AC CHECK_TYPE_NEW instead + +inttype_headers=`echo $2 | sed -e 's/,/ /g'` + +ac_cv_stdint_result="(no helpful system typedefs seen)" +AC_CACHE_CHECK([for stdint uintptr_t], [ac_cv_header_stdint_x],[ + ac_cv_header_stdint_x="" # the 1997 typedefs (inttypes.h) + AC_MSG_RESULT([(..)]) + for i in stdint.h inttypes.h sys/inttypes.h $inttype_headers ; do + unset ac_cv_type_uintptr_t + unset ac_cv_type_uint64_t + _AC_CHECK_TYPE_NEW(uintptr_t,[ac_cv_header_stdint_x=$i],dnl + continue,[#include <$i>]) + AC_CHECK_TYPE(uint64_t,[and64="/uint64_t"],[and64=""],[#include<$i>]) + ac_cv_stdint_result="(seen uintptr_t$and64 in $i)" + break; + done + AC_MSG_CHECKING([for stdint uintptr_t]) + ]) + +if test "_$ac_cv_header_stdint_x" = "_" ; then +AC_CACHE_CHECK([for stdint uint32_t], [ac_cv_header_stdint_o],[ + ac_cv_header_stdint_o="" # the 1995 typedefs (sys/inttypes.h) + AC_MSG_RESULT([(..)]) + for i in inttypes.h sys/inttypes.h stdint.h $inttype_headers ; do + unset ac_cv_type_uint32_t + unset ac_cv_type_uint64_t + AC_CHECK_TYPE(uint32_t,[ac_cv_header_stdint_o=$i],dnl + continue,[#include <$i>]) + AC_CHECK_TYPE(uint64_t,[and64="/uint64_t"],[and64=""],[#include<$i>]) + ac_cv_stdint_result="(seen uint32_t$and64 in $i)" + break; + done + AC_MSG_CHECKING([for stdint uint32_t]) + ]) +fi + +if test "_$ac_cv_header_stdint_x" = "_" ; then +if test "_$ac_cv_header_stdint_o" = "_" ; then +AC_CACHE_CHECK([for stdint u_int32_t], [ac_cv_header_stdint_u],[ + ac_cv_header_stdint_u="" # the BSD typedefs (sys/types.h) + AC_MSG_RESULT([(..)]) + for i in sys/types.h inttypes.h sys/inttypes.h $inttype_headers ; do + unset ac_cv_type_u_int32_t + unset ac_cv_type_u_int64_t + AC_CHECK_TYPE(u_int32_t,[ac_cv_header_stdint_u=$i],dnl + continue,[#include <$i>]) + AC_CHECK_TYPE(u_int64_t,[and64="/u_int64_t"],[and64=""],[#include<$i>]) + ac_cv_stdint_result="(seen u_int32_t$and64 in $i)" 
+ break; + done + AC_MSG_CHECKING([for stdint u_int32_t]) + ]) +fi fi + +dnl if there was no good C99 header file, do some typedef checks... +if test "_$ac_cv_header_stdint_x" = "_" ; then + AC_MSG_CHECKING([for stdint datatype model]) + AC_MSG_RESULT([(..)]) + AC_CHECK_SIZEOF(char) + AC_CHECK_SIZEOF(short) + AC_CHECK_SIZEOF(int) + AC_CHECK_SIZEOF(long) + AC_CHECK_SIZEOF(void*) + ac_cv_stdint_char_model="" + ac_cv_stdint_char_model="$ac_cv_stdint_char_model$ac_cv_sizeof_char" + ac_cv_stdint_char_model="$ac_cv_stdint_char_model$ac_cv_sizeof_short" + ac_cv_stdint_char_model="$ac_cv_stdint_char_model$ac_cv_sizeof_int" + ac_cv_stdint_long_model="" + ac_cv_stdint_long_model="$ac_cv_stdint_long_model$ac_cv_sizeof_int" + ac_cv_stdint_long_model="$ac_cv_stdint_long_model$ac_cv_sizeof_long" + ac_cv_stdint_long_model="$ac_cv_stdint_long_model$ac_cv_sizeof_voidp" + name="$ac_cv_stdint_long_model" + case "$ac_cv_stdint_char_model/$ac_cv_stdint_long_model" in + 122/242) name="$name, IP16 (standard 16bit machine)" ;; + 122/244) name="$name, LP32 (standard 32bit mac/win)" ;; + 122/*) name="$name (unusual int16 model)" ;; + 124/444) name="$name, ILP32 (standard 32bit unixish)" ;; + 124/488) name="$name, LP64 (standard 64bit unixish)" ;; + 124/448) name="$name, LLP64 (unusual 64bit unixish)" ;; + 124/*) name="$name (unusual int32 model)" ;; + 128/888) name="$name, ILP64 (unusual 64bit numeric)" ;; + 128/*) name="$name (unusual int64 model)" ;; + 222/*|444/*) name="$name (unusual dsptype)" ;; + *) name="$name (very unusal model)" ;; + esac + AC_MSG_RESULT([combined for stdint datatype model... 
$name]) +fi + +if test "_$ac_cv_header_stdint_x" != "_" ; then + ac_cv_header_stdint="$ac_cv_header_stdint_x" +elif test "_$ac_cv_header_stdint_o" != "_" ; then + ac_cv_header_stdint="$ac_cv_header_stdint_o" +elif test "_$ac_cv_header_stdint_u" != "_" ; then + ac_cv_header_stdint="$ac_cv_header_stdint_u" +else + ac_cv_header_stdint="stddef.h" +fi + +AC_MSG_CHECKING([for extra inttypes in chosen header]) +AC_MSG_RESULT([($ac_cv_header_stdint)]) +dnl see if int_least and int_fast types are present in _this_ header. +unset ac_cv_type_int_least32_t +unset ac_cv_type_int_fast32_t +AC_CHECK_TYPE(int_least32_t,,,[#include <$ac_cv_header_stdint>]) +AC_CHECK_TYPE(int_fast32_t,,,[#include<$ac_cv_header_stdint>]) +AC_CHECK_TYPE(intmax_t,,,[#include <$ac_cv_header_stdint>]) + +fi # shortcircut to system "stdint.h" +# ------------------ PREPARE VARIABLES ------------------------------ +if test "$GCC" = "yes" ; then +ac_cv_stdint_message="using gnu compiler "`$CC --version | head -1` +else +ac_cv_stdint_message="using $CC" +fi + +AC_MSG_RESULT([make use of $ac_cv_header_stdint in $ac_stdint_h dnl +$ac_cv_stdint_result]) + +# ----------------- DONE inttypes.h checks START header ------------- +AC_CONFIG_COMMANDS([$ac_stdint_h],[ +AC_MSG_NOTICE(creating $ac_stdint_h : $_ac_stdint_h) +ac_stdint=$tmp/_stdint.h + +echo "#ifndef" $_ac_stdint_h >$ac_stdint +echo "#define" $_ac_stdint_h "1" >>$ac_stdint +echo "#ifndef" _GENERATED_STDINT_H >>$ac_stdint +echo "#define" _GENERATED_STDINT_H '"'$PACKAGE $VERSION'"' >>$ac_stdint +echo "/* generated $ac_cv_stdint_message */" >>$ac_stdint +if test "_$ac_cv_header_stdint_t" != "_" ; then +echo "#define _STDINT_HAVE_STDINT_H" "1" >>$ac_stdint +fi + +cat >>$ac_stdint < +#else +#include + +/* .................... configured part ............................ 
*/ + +STDINT_EOF + +echo "/* whether we have a C99 compatible stdint header file */" >>$ac_stdint +if test "_$ac_cv_header_stdint_x" != "_" ; then + ac_header="$ac_cv_header_stdint_x" + echo "#define _STDINT_HEADER_INTPTR" '"'"$ac_header"'"' >>$ac_stdint +else + echo "/* #undef _STDINT_HEADER_INTPTR */" >>$ac_stdint +fi + +echo "/* whether we have a C96 compatible inttypes header file */" >>$ac_stdint +if test "_$ac_cv_header_stdint_o" != "_" ; then + ac_header="$ac_cv_header_stdint_o" + echo "#define _STDINT_HEADER_UINT32" '"'"$ac_header"'"' >>$ac_stdint +else + echo "/* #undef _STDINT_HEADER_UINT32 */" >>$ac_stdint +fi + +echo "/* whether we have a BSD compatible inet types header */" >>$ac_stdint +if test "_$ac_cv_header_stdint_u" != "_" ; then + ac_header="$ac_cv_header_stdint_u" + echo "#define _STDINT_HEADER_U_INT32" '"'"$ac_header"'"' >>$ac_stdint +else + echo "/* #undef _STDINT_HEADER_U_INT32 */" >>$ac_stdint +fi + +echo "" >>$ac_stdint + +if test "_$ac_header" != "_" ; then if test "$ac_header" != "stddef.h" ; then + echo "#include <$ac_header>" >>$ac_stdint + echo "" >>$ac_stdint +fi fi + +echo "/* which 64bit typedef has been found */" >>$ac_stdint +if test "$ac_cv_type_uint64_t" = "yes" ; then +echo "#define _STDINT_HAVE_UINT64_T" "1" >>$ac_stdint +else +echo "/* #undef _STDINT_HAVE_UINT64_T */" >>$ac_stdint +fi +if test "$ac_cv_type_u_int64_t" = "yes" ; then +echo "#define _STDINT_HAVE_U_INT64_T" "1" >>$ac_stdint +else +echo "/* #undef _STDINT_HAVE_U_INT64_T */" >>$ac_stdint +fi +echo "" >>$ac_stdint + +echo "/* which type model has been detected */" >>$ac_stdint +if test "_$ac_cv_stdint_char_model" != "_" ; then +echo "#define _STDINT_CHAR_MODEL" "$ac_cv_stdint_char_model" >>$ac_stdint +echo "#define _STDINT_LONG_MODEL" "$ac_cv_stdint_long_model" >>$ac_stdint +else +echo "/* #undef _STDINT_CHAR_MODEL // skipped */" >>$ac_stdint +echo "/* #undef _STDINT_LONG_MODEL // skipped */" >>$ac_stdint +fi +echo "" >>$ac_stdint + +echo "/* whether int_least types 
were detected */" >>$ac_stdint +if test "$ac_cv_type_int_least32_t" = "yes"; then +echo "#define _STDINT_HAVE_INT_LEAST32_T" "1" >>$ac_stdint +else +echo "/* #undef _STDINT_HAVE_INT_LEAST32_T */" >>$ac_stdint +fi +echo "/* whether int_fast types were detected */" >>$ac_stdint +if test "$ac_cv_type_int_fast32_t" = "yes"; then +echo "#define _STDINT_HAVE_INT_FAST32_T" "1" >>$ac_stdint +else +echo "/* #undef _STDINT_HAVE_INT_FAST32_T */" >>$ac_stdint +fi +echo "/* whether intmax_t type was detected */" >>$ac_stdint +if test "$ac_cv_type_intmax_t" = "yes"; then +echo "#define _STDINT_HAVE_INTMAX_T" "1" >>$ac_stdint +else +echo "/* #undef _STDINT_HAVE_INTMAX_T */" >>$ac_stdint +fi +echo "" >>$ac_stdint + + cat >>$ac_stdint <= 199901L +#define _HAVE_UINT64_T +typedef long long int64_t; +typedef unsigned long long uint64_t; + +#elif !defined __STRICT_ANSI__ +#if defined _MSC_VER || defined __WATCOMC__ || defined __BORLANDC__ +#define _HAVE_UINT64_T +typedef __int64 int64_t; +typedef unsigned __int64 uint64_t; + +#elif defined __GNUC__ || defined __MWERKS__ || defined __ELF__ +/* note: all ELF-systems seem to have loff-support which needs 64-bit */ +#if !defined _NO_LONGLONG +#define _HAVE_UINT64_T +typedef long long int64_t; +typedef unsigned long long uint64_t; +#endif + +#elif defined __alpha || (defined __mips && defined _ABIN32) +#if !defined _NO_LONGLONG +typedef long int64_t; +typedef unsigned long uint64_t; +#endif + /* compiler/cpu type to define int64_t */ +#endif +#endif +#endif + +#if defined _STDINT_HAVE_U_INT_TYPES +/* int8_t int16_t int32_t defined by inet code, redeclare the u_intXX types */ +typedef u_int8_t uint8_t; +typedef u_int16_t uint16_t; +typedef u_int32_t uint32_t; + +/* glibc compatibility */ +#ifndef __int8_t_defined +#define __int8_t_defined +#endif +#endif + +#ifdef _STDINT_NEED_INT_MODEL_T +/* we must guess all the basic types. 
Apart from byte-adressable system, */ +/* there a few 32-bit-only dsp-systems that we guard with BYTE_MODEL 8-} */ +/* (btw, those nibble-addressable systems are way off, or so we assume) */ + +dnl /* have a look at "64bit and data size neutrality" at */ +dnl /* http://unix.org/version2/whatsnew/login_64bit.html */ +dnl /* (the shorthand "ILP" types always have a "P" part) */ + +#if defined _STDINT_BYTE_MODEL +#if _STDINT_LONG_MODEL+0 == 242 +/* 2:4:2 = IP16 = a normal 16-bit system */ +typedef unsigned char uint8_t; +typedef unsigned short uint16_t; +typedef unsigned long uint32_t; +#ifndef __int8_t_defined +#define __int8_t_defined +typedef char int8_t; +typedef short int16_t; +typedef long int32_t; +#endif +#elif _STDINT_LONG_MODEL+0 == 244 || _STDINT_LONG_MODEL == 444 +/* 2:4:4 = LP32 = a 32-bit system derived from a 16-bit */ +/* 4:4:4 = ILP32 = a normal 32-bit system */ +typedef unsigned char uint8_t; +typedef unsigned short uint16_t; +typedef unsigned int uint32_t; +#ifndef __int8_t_defined +#define __int8_t_defined +typedef char int8_t; +typedef short int16_t; +typedef int int32_t; +#endif +#elif _STDINT_LONG_MODEL+0 == 484 || _STDINT_LONG_MODEL+0 == 488 +/* 4:8:4 = IP32 = a 32-bit system prepared for 64-bit */ +/* 4:8:8 = LP64 = a normal 64-bit system */ +typedef unsigned char uint8_t; +typedef unsigned short uint16_t; +typedef unsigned int uint32_t; +#ifndef __int8_t_defined +#define __int8_t_defined +typedef char int8_t; +typedef short int16_t; +typedef int int32_t; +#endif +/* this system has a "long" of 64bit */ +#ifndef _HAVE_UINT64_T +#define _HAVE_UINT64_T +typedef unsigned long uint64_t; +typedef long int64_t; +#endif +#elif _STDINT_LONG_MODEL+0 == 448 +/* LLP64 a 64-bit system derived from a 32-bit system */ +typedef unsigned char uint8_t; +typedef unsigned short uint16_t; +typedef unsigned int uint32_t; +#ifndef __int8_t_defined +#define __int8_t_defined +typedef char int8_t; +typedef short int16_t; +typedef int int32_t; +#endif +/* assuming the 
system has a "long long" */ +#ifndef _HAVE_UINT64_T +#define _HAVE_UINT64_T +typedef unsigned long long uint64_t; +typedef long long int64_t; +#endif +#else +#define _STDINT_NO_INT32_T +#endif +#else +#define _STDINT_NO_INT8_T +#define _STDINT_NO_INT32_T +#endif +#endif + +/* + * quote from SunOS-5.8 sys/inttypes.h: + * Use at your own risk. As of February 1996, the committee is squarely + * behind the fixed sized types; the "least" and "fast" types are still being + * discussed. The probability that the "fast" types may be removed before + * the standard is finalized is high enough that they are not currently + * implemented. + */ + +#if defined _STDINT_NEED_INT_LEAST_T +typedef int8_t int_least8_t; +typedef int16_t int_least16_t; +typedef int32_t int_least32_t; +#ifdef _HAVE_UINT64_T +typedef int64_t int_least64_t; +#endif + +typedef uint8_t uint_least8_t; +typedef uint16_t uint_least16_t; +typedef uint32_t uint_least32_t; +#ifdef _HAVE_UINT64_T +typedef uint64_t uint_least64_t; +#endif + /* least types */ +#endif + +#if defined _STDINT_NEED_INT_FAST_T +typedef int8_t int_fast8_t; +typedef int int_fast16_t; +typedef int32_t int_fast32_t; +#ifdef _HAVE_UINT64_T +typedef int64_t int_fast64_t; +#endif + +typedef uint8_t uint_fast8_t; +typedef unsigned uint_fast16_t; +typedef uint32_t uint_fast32_t; +#ifdef _HAVE_UINT64_T +typedef uint64_t uint_fast64_t; +#endif + /* fast types */ +#endif + +#ifdef _STDINT_NEED_INTMAX_T +#ifdef _HAVE_UINT64_T +typedef int64_t intmax_t; +typedef uint64_t uintmax_t; +#else +typedef long intmax_t; +typedef unsigned long uintmax_t; +#endif +#endif + +#ifdef _STDINT_NEED_INTPTR_T +#ifndef __intptr_t_defined +#define __intptr_t_defined +/* we encourage using "long" to store pointer values, never use "int" ! 
*/ +#if _STDINT_LONG_MODEL+0 == 242 || _STDINT_LONG_MODEL+0 == 484 +typedef unsigned int uintptr_t; +typedef int intptr_t; +#elif _STDINT_LONG_MODEL+0 == 244 || _STDINT_LONG_MODEL+0 == 444 +typedef unsigned long uintptr_t; +typedef long intptr_t; +#elif _STDINT_LONG_MODEL+0 == 448 && defined _HAVE_UINT64_T +typedef uint64_t uintptr_t; +typedef int64_t intptr_t; +#else /* matches typical system types ILP32 and LP64 - but not IP16 or LLP64 */ +typedef unsigned long uintptr_t; +typedef long intptr_t; +#endif +#endif +#endif + + /* shortcircuit*/ +#endif + /* once */ +#endif +#endif +STDINT_EOF + if cmp -s $ac_stdint_h $ac_stdint 2>/dev/null; then + AC_MSG_NOTICE([$ac_stdint_h is unchanged]) + else + ac_dir=`AS_DIRNAME(["$ac_stdint_h"])` + AS_MKDIR_P(["$ac_dir"]) + rm -f $ac_stdint_h + mv $ac_stdint $ac_stdint_h + fi +],[# variables for create stdint.h replacement +PACKAGE="$PACKAGE" +VERSION="$VERSION" +ac_stdint_h="$ac_stdint_h" +_ac_stdint_h=AS_TR_CPP(_$PACKAGE-$ac_stdint_h) +ac_cv_stdint_message="$ac_cv_stdint_message" +ac_cv_header_stdint_t="$ac_cv_header_stdint_t" +ac_cv_header_stdint_x="$ac_cv_header_stdint_x" +ac_cv_header_stdint_o="$ac_cv_header_stdint_o" +ac_cv_header_stdint_u="$ac_cv_header_stdint_u" +ac_cv_type_uint64_t="$ac_cv_type_uint64_t" +ac_cv_type_u_int64_t="$ac_cv_type_u_int64_t" +ac_cv_stdint_char_model="$ac_cv_stdint_char_model" +ac_cv_stdint_long_model="$ac_cv_stdint_long_model" +ac_cv_type_int_least32_t="$ac_cv_type_int_least32_t" +ac_cv_type_int_fast32_t="$ac_cv_type_int_fast32_t" +ac_cv_type_intmax_t="$ac_cv_type_intmax_t" +]) +]) diff --git a/argp-standalone/argp-ba.c b/argp-standalone/argp-ba.c new file mode 100644 index 000000000..0d3958c11 --- /dev/null +++ b/argp-standalone/argp-ba.c @@ -0,0 +1,26 @@ +/* Default definition for ARGP_PROGRAM_BUG_ADDRESS. + Copyright (C) 1996, 1997, 1999, 2004 Free Software Foundation, Inc. + This file is part of the GNU C Library. + Written by Miles Bader .
+ + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Library General Public License as + published by the Free Software Foundation; either version 2 of the + License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Library General Public License for more details. + + You should have received a copy of the GNU Library General Public + License along with the GNU C Library; see the file COPYING.LIB. If not, + write to the Free Software Foundation, Inc., 59 Temple Place - Suite 330, + Boston, MA 02111-1307, USA. */ + +/* If set by the user program, it should point to string that is the + bug-reporting address for the program. It will be printed by argp_help if + the ARGP_HELP_BUG_ADDR flag is set (as it is by various standard help + messages), embedded in a sentence that says something like `Report bugs to + ADDR.'. */ +const char *argp_program_bug_address = 0; diff --git a/argp-standalone/argp-eexst.c b/argp-standalone/argp-eexst.c new file mode 100644 index 000000000..46b27847a --- /dev/null +++ b/argp-standalone/argp-eexst.c @@ -0,0 +1,36 @@ +/* Default definition for ARGP_ERR_EXIT_STATUS + Copyright (C) 1997 Free Software Foundation, Inc. + This file is part of the GNU C Library. + Written by Miles Bader . + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Library General Public License as + published by the Free Software Foundation; either version 2 of the + License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Library General Public License for more details. 
+ + You should have received a copy of the GNU Library General Public + License along with the GNU C Library; see the file COPYING.LIB. If not, + write to the Free Software Foundation, Inc., 59 Temple Place - Suite 330, + Boston, MA 02111-1307, USA. */ + +#ifdef HAVE_CONFIG_H +#include +#endif + +#if HAVE_SYSEXITS_H +# include +#else +# define EX_USAGE 64 +#endif + +#include "argp.h" + +/* The exit status that argp will use when exiting due to a parsing error. + If not defined or set by the user program, this defaults to EX_USAGE from + . */ +error_t argp_err_exit_status = EX_USAGE; diff --git a/argp-standalone/argp-fmtstream.c b/argp-standalone/argp-fmtstream.c new file mode 100644 index 000000000..7f792854f --- /dev/null +++ b/argp-standalone/argp-fmtstream.c @@ -0,0 +1,475 @@ +/* Word-wrapping and line-truncating streams + Copyright (C) 1997, 1998, 1999, 2001 Free Software Foundation, Inc. + This file is part of the GNU C Library. + Written by Miles Bader . + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Library General Public License as + published by the Free Software Foundation; either version 2 of the + License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Library General Public License for more details. + + You should have received a copy of the GNU Library General Public + License along with the GNU C Library; see the file COPYING.LIB. If not, + write to the Free Software Foundation, Inc., 59 Temple Place - Suite 330, + Boston, MA 02111-1307, USA. */ + +/* This package emulates glibc `line_wrap_stream' semantics for systems that + don't have that. 
*/ + +#ifdef HAVE_CONFIG_H +#include +#endif + +#include +#include +#include +#include +#include + +#include "argp-fmtstream.h" +#include "argp-namefrob.h" + +#ifndef ARGP_FMTSTREAM_USE_LINEWRAP + +#ifndef isblank +#define isblank(ch) ((ch)==' ' || (ch)=='\t') +#endif + +#if defined _LIBC && defined USE_IN_LIBIO +# include +# define __vsnprintf(s, l, f, a) _IO_vsnprintf (s, l, f, a) +#endif + +/* Size in bytes of the buffer allocated by __argp_make_fmtstream. */ +#define INIT_BUF_SIZE 200 +/* Space first reserved by __argp_fmtstream_printf before formatting. */ +#define PRINTF_SIZE_GUESS 150 + +/* Return an argp_fmtstream that outputs to STREAM, and which prefixes lines + written on it with LMARGIN spaces and limits them to RMARGIN columns + total. If WMARGIN >= 0, words that extend past RMARGIN are wrapped by + replacing the whitespace before them with a newline and WMARGIN spaces. + Otherwise, chars beyond RMARGIN are simply dropped until a newline. + Returns NULL if there was an error. */ +argp_fmtstream_t +__argp_make_fmtstream (FILE *stream, + size_t lmargin, size_t rmargin, ssize_t wmargin) +{ + argp_fmtstream_t fs = malloc (sizeof (struct argp_fmtstream)); + if (fs) + { + /* Record the output stream and margins; the point column and + offset both start at 0. */ + fs->stream = stream; + + fs->lmargin = lmargin; + fs->rmargin = rmargin; + fs->wmargin = wmargin; + fs->point_col = 0; + fs->point_offs = 0; + + fs->buf = malloc (INIT_BUF_SIZE); + if (! fs->buf) + { + /* No buffer: release the stream object and return NULL. */ + free (fs); + fs = 0; + } + else + { + /* The buffer starts out empty: P (end of text) is at its start + and END is INIT_BUF_SIZE bytes further on. */ + fs->p = fs->buf; + fs->end = fs->buf + INIT_BUF_SIZE; + } + } + + return fs; +} +#ifdef weak_alias +weak_alias (__argp_make_fmtstream, argp_make_fmtstream) +#endif + +/* Flush FS to its stream, and free it (but don't close the stream). */ +void +__argp_fmtstream_free (argp_fmtstream_t fs) +{ + /* Process the buffered text first, then write out whatever is left in + the buffer. The result of the write is not checked. */ + __argp_fmtstream_update (fs); + if (fs->p > fs->buf) + FWRITE_UNLOCKED (fs->buf, 1, fs->p - fs->buf, fs->stream); + free (fs->buf); + free (fs); +} +#ifdef weak_alias +weak_alias (__argp_fmtstream_free, argp_fmtstream_free) +#endif + +/* Process FS's buffer so that line wrapping is done from POINT_OFFS to the + end of its buffer. This code is mostly from glibc stdio/linewrap.c.
*/ +void +__argp_fmtstream_update (argp_fmtstream_t fs) +{ + char *buf, *nl; + size_t len; + + /* Scan the buffer for newlines. */ + buf = fs->buf + fs->point_offs; + while (buf < fs->p) + { + size_t r; + + if (fs->point_col == 0 && fs->lmargin != 0) + { + /* We are starting a new line. Print spaces to the left margin. */ + const size_t pad = fs->lmargin; + if (fs->p + pad < fs->end) + { + /* We can fit in them in the buffer by moving the + buffer text up and filling in the beginning. */ + memmove (buf + pad, buf, fs->p - buf); + fs->p += pad; /* Compensate for bigger buffer. */ + memset (buf, ' ', pad); /* Fill in the spaces. */ + buf += pad; /* Don't bother searching them. */ + } + else + { + /* No buffer space for spaces. Must flush. */ + size_t i; + for (i = 0; i < pad; i++) + PUTC_UNLOCKED (' ', fs->stream); + } + fs->point_col = pad; + } + + len = fs->p - buf; + nl = memchr (buf, '\n', len); + + if (fs->point_col < 0) + fs->point_col = 0; + + if (!nl) + { + /* The buffer ends in a partial line. */ + + if (fs->point_col + len < fs->rmargin) + { + /* The remaining buffer text is a partial line and fits + within the maximum line width. Advance point for the + characters to be written and stop scanning. */ + fs->point_col += len; + break; + } + else + /* Set the end-of-line pointer for the code below to + the end of the buffer. */ + nl = fs->p; + } + else if (fs->point_col + (nl - buf) < (ssize_t) fs->rmargin) + { + /* The buffer contains a full line that fits within the maximum + line width. Reset point and scan the next line. */ + fs->point_col = 0; + buf = nl + 1; + continue; + } + + /* This line is too long. */ + r = fs->rmargin - 1; + + if (fs->wmargin < 0) + { + /* Truncate the line by overwriting the excess with the + newline and anything after it in the buffer. */ + if (nl < fs->p) + { + memmove (buf + (r - fs->point_col), nl, fs->p - nl); + fs->p -= buf + (r - fs->point_col) - nl; + /* Reset point for the next line and start scanning it. 
*/ + fs->point_col = 0; + buf += r + 1; /* Skip full line plus \n. */ + } + else + { + /* The buffer ends with a partial line that is beyond the + maximum line width. Advance point for the characters + written, and discard those past the max from the buffer. */ + fs->point_col += len; + fs->p -= fs->point_col - r; + break; + } + } + else + { + /* Do word wrap. Go to the column just past the maximum line + width and scan back for the beginning of the word there. + Then insert a line break. */ + + char *p, *nextline; + int i; + + p = buf + (r + 1 - fs->point_col); + while (p >= buf && !isblank (*p)) + --p; + nextline = p + 1; /* This will begin the next line. */ + + if (nextline > buf) + { + /* Swallow separating blanks. */ + if (p >= buf) + do + --p; + while (p >= buf && isblank (*p)); + nl = p + 1; /* The newline will replace the first blank. */ + } + else + { + /* A single word that is greater than the maximum line width. + Oh well. Put it on an overlong line by itself. */ + p = buf + (r + 1 - fs->point_col); + /* Find the end of the long word. */ + do + ++p; + while (p < nl && !isblank (*p)); + if (p == nl) + { + /* It already ends a line. No fussing required. */ + fs->point_col = 0; + buf = nl + 1; + continue; + } + /* We will move the newline to replace the first blank. */ + nl = p; + /* Swallow separating blanks. */ + do + ++p; + while (isblank (*p)); + /* The next line will start here. */ + nextline = p; + } + + /* Note: There are a bunch of tests below for + NEXTLINE == BUF + LEN + 1; this case is where NL happens to fall + at the end of the buffer, and NEXTLINE is in fact empty (and so + we need not be careful to maintain its contents). */ + + if (nextline == buf + len + 1 + ? fs->end - nl < fs->wmargin + 1 + : nextline - (nl + 1) < fs->wmargin) + { + /* The margin needs more blanks than we removed. */ + if (fs->end - fs->p > fs->wmargin + 1) + /* Make some space for them. 
*/ + { + size_t mv = fs->p - nextline; + memmove (nl + 1 + fs->wmargin, nextline, mv); + nextline = nl + 1 + fs->wmargin; + len = nextline + mv - buf; + *nl++ = '\n'; + } + else + /* Output the first line so we can use the space. */ + { + if (nl > fs->buf) + FWRITE_UNLOCKED (fs->buf, 1, nl - fs->buf, fs->stream); + PUTC_UNLOCKED ('\n', fs->stream); + len += buf - fs->buf; + nl = buf = fs->buf; + } + } + else + /* We can fit the newline and blanks in before + the next word. */ + *nl++ = '\n'; + + if (nextline - nl >= fs->wmargin + || (nextline == buf + len + 1 && fs->end - nextline >= fs->wmargin)) + /* Add blanks up to the wrap margin column. */ + for (i = 0; i < fs->wmargin; ++i) + *nl++ = ' '; + else + for (i = 0; i < fs->wmargin; ++i) + PUTC_UNLOCKED (' ', fs->stream); + + /* Copy the tail of the original buffer into the current buffer + position. */ + if (nl < nextline) + memmove (nl, nextline, buf + len - nextline); + len -= nextline - buf; + + /* Continue the scan on the remaining lines in the buffer. */ + buf = nl; + + /* Restore bufp to include all the remaining text. */ + fs->p = nl + len; + + /* Reset the counter of what has been output this line. If wmargin + is 0, we want to avoid the lmargin getting added, so we set + point_col to a magic value of -1 in that case. */ + fs->point_col = fs->wmargin ? fs->wmargin : -1; + } + } + + /* Remember that we've scanned as far as the end of the buffer. */ + fs->point_offs = fs->p - fs->buf; +} + +/* Ensure that FS has space for AMOUNT more bytes in its buffer, either by + growing the buffer, or by flushing it. True is returned iff we succeed. */ +int +__argp_fmtstream_ensure (struct argp_fmtstream *fs, size_t amount) +{ + if ((size_t) (fs->end - fs->p) < amount) + { + ssize_t wrote; + + /* Flush FS's buffer. 
*/ + __argp_fmtstream_update (fs); + + wrote = FWRITE_UNLOCKED (fs->buf, 1, fs->p - fs->buf, fs->stream); + if (wrote == fs->p - fs->buf) + { + fs->p = fs->buf; + fs->point_offs = 0; + } + else + { + fs->p -= wrote; + fs->point_offs -= wrote; + memmove (fs->buf, fs->buf + wrote, fs->p - fs->buf); + return 0; + } + + if ((size_t) (fs->end - fs->buf) < amount) + /* Gotta grow the buffer. */ + { + size_t new_size = fs->end - fs->buf + amount; + char *new_buf = realloc (fs->buf, new_size); + + if (! new_buf) + { + __set_errno (ENOMEM); + return 0; + } + + fs->buf = new_buf; + fs->end = new_buf + new_size; + fs->p = fs->buf; + } + } + + return 1; +} + +ssize_t +__argp_fmtstream_printf (struct argp_fmtstream *fs, const char *fmt, ...) +{ + size_t out; + size_t avail; + size_t size_guess = PRINTF_SIZE_GUESS; /* How much space to reserve. */ + + do + { + va_list args; + + if (! __argp_fmtstream_ensure (fs, size_guess)) + return -1; + + va_start (args, fmt); + avail = fs->end - fs->p; + out = __vsnprintf (fs->p, avail, fmt, args); + va_end (args); + if (out >= avail) + size_guess = out + 1; + } + while (out >= avail); + + fs->p += out; + + return out; +} +#ifdef weak_alias +weak_alias (__argp_fmtstream_printf, argp_fmtstream_printf) +#endif + +/* Duplicate the inline definitions in argp-fmtstream.h, for compilers + * that don't do inlining. */ +size_t +__argp_fmtstream_write (argp_fmtstream_t __fs, + __const char *__str, size_t __len) +{ + if (__fs->p + __len <= __fs->end || __argp_fmtstream_ensure (__fs, __len)) + { + memcpy (__fs->p, __str, __len); + __fs->p += __len; + return __len; + } + else + return 0; +} + +int +__argp_fmtstream_puts (argp_fmtstream_t __fs, __const char *__str) +{ + size_t __len = strlen (__str); + if (__len) + { + size_t __wrote = __argp_fmtstream_write (__fs, __str, __len); + return __wrote == __len ? 
0 : -1; + } + else + return 0; +} + +int +__argp_fmtstream_putc (argp_fmtstream_t __fs, int __ch) +{ + if (__fs->p < __fs->end || __argp_fmtstream_ensure (__fs, 1)) + return *__fs->p++ = __ch; + else + return EOF; +} + +/* Set __FS's left margin to __LMARGIN and return the old value. */ +size_t +__argp_fmtstream_set_lmargin (argp_fmtstream_t __fs, size_t __lmargin) +{ + size_t __old; + if ((size_t) (__fs->p - __fs->buf) > __fs->point_offs) + __argp_fmtstream_update (__fs); + __old = __fs->lmargin; + __fs->lmargin = __lmargin; + return __old; +} + +/* Set __FS's right margin to __RMARGIN and return the old value. */ +size_t +__argp_fmtstream_set_rmargin (argp_fmtstream_t __fs, size_t __rmargin) +{ + size_t __old; + if ((size_t) (__fs->p - __fs->buf) > __fs->point_offs) + __argp_fmtstream_update (__fs); + __old = __fs->rmargin; + __fs->rmargin = __rmargin; + return __old; +} + +/* Set FS's wrap margin to __WMARGIN and return the old value. */ +size_t +__argp_fmtstream_set_wmargin (argp_fmtstream_t __fs, size_t __wmargin) +{ + size_t __old; + if ((size_t) (__fs->p - __fs->buf) > __fs->point_offs) + __argp_fmtstream_update (__fs); + __old = __fs->wmargin; + __fs->wmargin = __wmargin; + return __old; +} + +/* Return the column number of the current output point in __FS. */ +size_t +__argp_fmtstream_point (argp_fmtstream_t __fs) +{ + if ((size_t) (__fs->p - __fs->buf) > __fs->point_offs) + __argp_fmtstream_update (__fs); + return __fs->point_col >= 0 ? __fs->point_col : 0; +} + +#endif /* !ARGP_FMTSTREAM_USE_LINEWRAP */ diff --git a/argp-standalone/argp-fmtstream.h b/argp-standalone/argp-fmtstream.h new file mode 100644 index 000000000..e797b119e --- /dev/null +++ b/argp-standalone/argp-fmtstream.h @@ -0,0 +1,319 @@ +/* Word-wrapping and line-truncating streams. + Copyright (C) 1997, 2003 Free Software Foundation, Inc. + This file is part of the GNU C Library. + Written by Miles Bader . 
+ + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Library General Public License as + published by the Free Software Foundation; either version 2 of the + License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Library General Public License for more details. + + You should have received a copy of the GNU Library General Public + License along with the GNU C Library; see the file COPYING.LIB. If not, + write to the Free Software Foundation, Inc., 59 Temple Place - Suite 330, + Boston, MA 02111-1307, USA. */ + +/* This package emulates glibc `line_wrap_stream' semantics for systems that + don't have that. If the system does have it, it is just a wrapper for + that. This header file is only used internally while compiling argp, and + shouldn't be installed. */ + +#ifndef _ARGP_FMTSTREAM_H +#define _ARGP_FMTSTREAM_H + +#include +#include + +#if HAVE_UNISTD_H +# include +#else +/* This is a kludge to make the code compile on windows. Perhaps it + would be better to just replace ssize_t with int through out the + code. 
*/ +# define ssize_t int +#endif + +#if _LIBC || (defined (HAVE_FLOCKFILE) && defined(HAVE_PUTC_UNLOCKED) \ + && defined (HAVE_FPUTS_UNLOCKED) && defined (HAVE_FWRITE_UNLOCKED) ) +/* Explicit stream locking is available: FLOCKFILE and FUNLOCKFILE map to + flockfile and funlockfile, and output goes through the _unlocked stdio + variants. */ +# define FLOCKFILE(f) flockfile(f) +# define FUNLOCKFILE(f) funlockfile(f) +# define PUTC_UNLOCKED(c, f) putc_unlocked((c), (f)) +# define FPUTS_UNLOCKED(s, f) fputs_unlocked((s), (f)) +# define FWRITE_UNLOCKED(b, s, n, f) fwrite_unlocked((b), (s), (n), (f)) +#else +/* No explicit locking: FLOCKFILE and FUNLOCKFILE expand to nothing and the + ordinary putc, fputs and fwrite calls are used. */ +# define FLOCKFILE(f) +# define FUNLOCKFILE(f) +# define PUTC_UNLOCKED(c, f) putc((c), (f)) +# define FPUTS_UNLOCKED(s, f) fputs((s), (f)) +# define FWRITE_UNLOCKED(b, s, n, f) fwrite((b), (s), (n), (f)) +#endif /* No thread safe i/o */ + +#if (_LIBC - 0 && !defined (USE_IN_LIBIO)) \ + || (defined (__GNU_LIBRARY__) && defined (HAVE_LINEWRAP_H)) +/* line_wrap_stream is available, so use that. */ +#define ARGP_FMTSTREAM_USE_LINEWRAP +#endif + +#ifdef ARGP_FMTSTREAM_USE_LINEWRAP +/* Just be a simple wrapper for line_wrap_stream; the semantics are + *slightly* different, as line_wrap_stream doesn't actually make a new + object, it just modifies the given stream (reversibly) to do + line-wrapping. Since we control who uses this code, it doesn't matter.
*/ + +#include + +typedef FILE *argp_fmtstream_t; + +#define argp_make_fmtstream line_wrap_stream +#define __argp_make_fmtstream line_wrap_stream +#define argp_fmtstream_free line_unwrap_stream +#define __argp_fmtstream_free line_unwrap_stream + +#define __argp_fmtstream_putc(fs,ch) putc(ch,fs) +#define argp_fmtstream_putc(fs,ch) putc(ch,fs) +#define __argp_fmtstream_puts(fs,str) fputs(str,fs) +#define argp_fmtstream_puts(fs,str) fputs(str,fs) +#define __argp_fmtstream_write(fs,str,len) fwrite(str,1,len,fs) +#define argp_fmtstream_write(fs,str,len) fwrite(str,1,len,fs) +#define __argp_fmtstream_printf fprintf +#define argp_fmtstream_printf fprintf + +#define __argp_fmtstream_lmargin line_wrap_lmargin +#define argp_fmtstream_lmargin line_wrap_lmargin +#define __argp_fmtstream_set_lmargin line_wrap_set_lmargin +#define argp_fmtstream_set_lmargin line_wrap_set_lmargin +#define __argp_fmtstream_rmargin line_wrap_rmargin +#define argp_fmtstream_rmargin line_wrap_rmargin +#define __argp_fmtstream_set_rmargin line_wrap_set_rmargin +#define argp_fmtstream_set_rmargin line_wrap_set_rmargin +#define __argp_fmtstream_wmargin line_wrap_wmargin +#define argp_fmtstream_wmargin line_wrap_wmargin +#define __argp_fmtstream_set_wmargin line_wrap_set_wmargin +#define argp_fmtstream_set_wmargin line_wrap_set_wmargin +#define __argp_fmtstream_point line_wrap_point +#define argp_fmtstream_point line_wrap_point + +#else /* !ARGP_FMTSTREAM_USE_LINEWRAP */ +/* Guess we have to define our own version. */ + +#ifndef __const +#define __const const +#endif + + +struct argp_fmtstream +{ + FILE *stream; /* The stream we're outputting to. */ + + size_t lmargin, rmargin; /* Left and right margins. */ + ssize_t wmargin; /* Margin to wrap to, or -1 to truncate. */ + + /* Point in buffer to which we've processed for wrapping, but not output. */ + size_t point_offs; + /* Output column at POINT_OFFS, or -1 meaning 0 but don't add lmargin. */ + ssize_t point_col; + + char *buf; /* Output buffer. 
*/ + char *p; /* Current end of text in BUF. */ + char *end; /* Absolute end of BUF. */ +}; + +typedef struct argp_fmtstream *argp_fmtstream_t; + +/* Return an argp_fmtstream that outputs to STREAM, and which prefixes lines + written on it with LMARGIN spaces and limits them to RMARGIN columns + total. If WMARGIN >= 0, words that extend past RMARGIN are wrapped by + replacing the whitespace before them with a newline and WMARGIN spaces. + Otherwise, chars beyond RMARGIN are simply dropped until a newline. + Returns NULL if there was an error. */ +extern argp_fmtstream_t __argp_make_fmtstream (FILE *__stream, + size_t __lmargin, + size_t __rmargin, + ssize_t __wmargin); +extern argp_fmtstream_t argp_make_fmtstream (FILE *__stream, + size_t __lmargin, + size_t __rmargin, + ssize_t __wmargin); + +/* Flush __FS to its stream, and free it (but don't close the stream). */ +extern void __argp_fmtstream_free (argp_fmtstream_t __fs); +extern void argp_fmtstream_free (argp_fmtstream_t __fs); + +extern ssize_t __argp_fmtstream_printf (argp_fmtstream_t __fs, + __const char *__fmt, ...) + PRINTF_STYLE(2,3); +extern ssize_t argp_fmtstream_printf (argp_fmtstream_t __fs, + __const char *__fmt, ...) + PRINTF_STYLE(2,3); + +extern int __argp_fmtstream_putc (argp_fmtstream_t __fs, int __ch); +extern int argp_fmtstream_putc (argp_fmtstream_t __fs, int __ch); + +extern int __argp_fmtstream_puts (argp_fmtstream_t __fs, __const char *__str); +extern int argp_fmtstream_puts (argp_fmtstream_t __fs, __const char *__str); + +extern size_t __argp_fmtstream_write (argp_fmtstream_t __fs, + __const char *__str, size_t __len); +extern size_t argp_fmtstream_write (argp_fmtstream_t __fs, + __const char *__str, size_t __len); + +/* Access macros for various bits of state. 
*/ +#define argp_fmtstream_lmargin(__fs) ((__fs)->lmargin) +#define argp_fmtstream_rmargin(__fs) ((__fs)->rmargin) +#define argp_fmtstream_wmargin(__fs) ((__fs)->wmargin) +#define __argp_fmtstream_lmargin argp_fmtstream_lmargin +#define __argp_fmtstream_rmargin argp_fmtstream_rmargin +#define __argp_fmtstream_wmargin argp_fmtstream_wmargin + +/* Set __FS's left margin to LMARGIN and return the old value. */ +extern size_t argp_fmtstream_set_lmargin (argp_fmtstream_t __fs, + size_t __lmargin); +extern size_t __argp_fmtstream_set_lmargin (argp_fmtstream_t __fs, + size_t __lmargin); + +/* Set __FS's right margin to __RMARGIN and return the old value. */ +extern size_t argp_fmtstream_set_rmargin (argp_fmtstream_t __fs, + size_t __rmargin); +extern size_t __argp_fmtstream_set_rmargin (argp_fmtstream_t __fs, + size_t __rmargin); + +/* Set __FS's wrap margin to __WMARGIN and return the old value. */ +extern size_t argp_fmtstream_set_wmargin (argp_fmtstream_t __fs, + size_t __wmargin); +extern size_t __argp_fmtstream_set_wmargin (argp_fmtstream_t __fs, + size_t __wmargin); + +/* Return the column number of the current output point in __FS. */ +extern size_t argp_fmtstream_point (argp_fmtstream_t __fs); +extern size_t __argp_fmtstream_point (argp_fmtstream_t __fs); + +/* Internal routines. */ +extern void _argp_fmtstream_update (argp_fmtstream_t __fs); +extern void __argp_fmtstream_update (argp_fmtstream_t __fs); +extern int _argp_fmtstream_ensure (argp_fmtstream_t __fs, size_t __amount); +extern int __argp_fmtstream_ensure (argp_fmtstream_t __fs, size_t __amount); + +#ifdef __OPTIMIZE__ +/* Inline versions of above routines. 
*/ + +#if !_LIBC +#define __argp_fmtstream_putc argp_fmtstream_putc +#define __argp_fmtstream_puts argp_fmtstream_puts +#define __argp_fmtstream_write argp_fmtstream_write +#define __argp_fmtstream_set_lmargin argp_fmtstream_set_lmargin +#define __argp_fmtstream_set_rmargin argp_fmtstream_set_rmargin +#define __argp_fmtstream_set_wmargin argp_fmtstream_set_wmargin +#define __argp_fmtstream_point argp_fmtstream_point +#define __argp_fmtstream_update _argp_fmtstream_update +#define __argp_fmtstream_ensure _argp_fmtstream_ensure +#endif + +#ifndef ARGP_FS_EI +#define ARGP_FS_EI extern inline +#endif + +ARGP_FS_EI size_t +__argp_fmtstream_write (argp_fmtstream_t __fs, + __const char *__str, size_t __len) +{ + if (__fs->p + __len <= __fs->end || __argp_fmtstream_ensure (__fs, __len)) + { + memcpy (__fs->p, __str, __len); + __fs->p += __len; + return __len; + } + else + return 0; +} + +ARGP_FS_EI int +__argp_fmtstream_puts (argp_fmtstream_t __fs, __const char *__str) +{ + size_t __len = strlen (__str); + if (__len) + { + size_t __wrote = __argp_fmtstream_write (__fs, __str, __len); + return __wrote == __len ? 0 : -1; + } + else + return 0; +} + +ARGP_FS_EI int +__argp_fmtstream_putc (argp_fmtstream_t __fs, int __ch) +{ + if (__fs->p < __fs->end || __argp_fmtstream_ensure (__fs, 1)) + return *__fs->p++ = __ch; + else + return EOF; +} + +/* Set __FS's left margin to __LMARGIN and return the old value. */ +ARGP_FS_EI size_t +__argp_fmtstream_set_lmargin (argp_fmtstream_t __fs, size_t __lmargin) +{ + size_t __old; + if ((size_t) (__fs->p - __fs->buf) > __fs->point_offs) + __argp_fmtstream_update (__fs); + __old = __fs->lmargin; + __fs->lmargin = __lmargin; + return __old; +} + +/* Set __FS's right margin to __RMARGIN and return the old value. 
*/ +ARGP_FS_EI size_t +__argp_fmtstream_set_rmargin (argp_fmtstream_t __fs, size_t __rmargin) +{ + size_t __old; + if ((size_t) (__fs->p - __fs->buf) > __fs->point_offs) + __argp_fmtstream_update (__fs); + __old = __fs->rmargin; + __fs->rmargin = __rmargin; + return __old; +} + +/* Set FS's wrap margin to __WMARGIN and return the old value. */ +ARGP_FS_EI size_t +__argp_fmtstream_set_wmargin (argp_fmtstream_t __fs, size_t __wmargin) +{ + size_t __old; + if ((size_t) (__fs->p - __fs->buf) > __fs->point_offs) + __argp_fmtstream_update (__fs); + __old = __fs->wmargin; + __fs->wmargin = __wmargin; + return __old; +} + +/* Return the column number of the current output point in __FS. */ +ARGP_FS_EI size_t +__argp_fmtstream_point (argp_fmtstream_t __fs) +{ + if ((size_t) (__fs->p - __fs->buf) > __fs->point_offs) + __argp_fmtstream_update (__fs); + return __fs->point_col >= 0 ? __fs->point_col : 0; +} + +#if !_LIBC +#undef __argp_fmtstream_putc +#undef __argp_fmtstream_puts +#undef __argp_fmtstream_write +#undef __argp_fmtstream_set_lmargin +#undef __argp_fmtstream_set_rmargin +#undef __argp_fmtstream_set_wmargin +#undef __argp_fmtstream_point +#undef __argp_fmtstream_update +#undef __argp_fmtstream_ensure +#endif + +#endif /* __OPTIMIZE__ */ + +#endif /* ARGP_FMTSTREAM_USE_LINEWRAP */ + +#endif /* argp-fmtstream.h */ diff --git a/argp-standalone/argp-help.c b/argp-standalone/argp-help.c new file mode 100644 index 000000000..ced78c4cb --- /dev/null +++ b/argp-standalone/argp-help.c @@ -0,0 +1,1849 @@ +/* Hierarchial argument parsing help output + Copyright (C) 1995,96,97,98,99,2000, 2003 Free Software Foundation, Inc. + This file is part of the GNU C Library. + Written by Miles Bader . + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Library General Public License as + published by the Free Software Foundation; either version 2 of the + License, or (at your option) any later version. 
+ + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Library General Public License for more details. + + You should have received a copy of the GNU Library General Public + License along with the GNU C Library; see the file COPYING.LIB. If not, + write to the Free Software Foundation, Inc., 59 Temple Place - Suite 330, + Boston, MA 02111-1307, USA. */ + +#ifndef _GNU_SOURCE +# define _GNU_SOURCE 1 +#endif + +#ifdef HAVE_CONFIG_H +#include +#endif + +#if HAVE_ALLOCA_H +#include +#endif + +#include +#include +#include +#include +#include +#include +#if HAVE_MALLOC_H +/* Needed, for alloca on windows */ +# include +#endif + +#ifndef _ +/* This is for other GNU distributions with internationalized messages. */ +# if defined HAVE_LIBINTL_H || defined _LIBC +# include +# ifdef _LIBC +# undef dgettext +# define dgettext(domain, msgid) __dcgettext (domain, msgid, LC_MESSAGES) +# endif +# else +# define dgettext(domain, msgid) (msgid) +# endif +#endif + +#include "argp.h" +#include "argp-fmtstream.h" +#include "argp-namefrob.h" + + +#ifndef _LIBC +# ifndef __strchrnul +# define __strchrnul strchrnul +# endif +# ifndef __mempcpy +# define __mempcpy mempcpy +# endif +/* We need to use a different name, as __strndup is likely a macro. 
*/
#  define STRNDUP strndup
#  if HAVE_STRERROR
#   define STRERROR strerror
#  else
/* No strerror available: index the system error table directly.  */
#   define STRERROR(x) (sys_errlist[x])
#  endif
#else /* _LIBC */
# define FLOCKFILE __flockfile
# define FUNLOCKFILE __funlockfile
# define STRNDUP __strndup
# define STRERROR strerror
#endif

/* Prototypes for the replacement functions that are used when the
   corresponding HAVE_* feature macro is not set.  */
#if !_LIBC
# if !HAVE_STRNDUP
char *strndup (const char *s, size_t size);
# endif /* !HAVE_STRNDUP */

# if !HAVE_MEMPCPY
void *mempcpy (void *to, const void *from, size_t size);
# endif /* !HAVE_MEMPCPY */

# if !HAVE_STRCHRNUL
char *strchrnul(const char *s, int c);
# endif /* !HAVE_STRCHRNUL */

# if !HAVE_STRCASECMP
int strcasecmp(const char *s1, const char *s2);
#endif

#endif /* !_LIBC */


/* User-selectable (using an environment variable) formatting parameters.

   These may be specified in an environment variable called `ARGP_HELP_FMT',
   with a contents like:  VAR1=VAL1,VAR2=VAL2,BOOLVAR2,no-BOOLVAR2
   Where VALn must be a positive integer.  The list of variables is in the
   UPARAM_NAMES vector, below.  */

/* Default parameters.  These initialize the UPARAMS variable below, in the
   same order as the fields of struct uparams.  */
#define DUP_ARGS      0         /* True if option argument can be duplicated. */
#define DUP_ARGS_NOTE 1         /* True to print a note about duplicate args. */
#define SHORT_OPT_COL 2         /* column in which short options start */
#define LONG_OPT_COL  6         /* column in which long options start */
#define DOC_OPT_COL   2         /* column in which doc options start */
#define OPT_DOC_COL  29         /* column in which option text starts */
#define HEADER_COL    1         /* column in which group headers are printed */
#define USAGE_INDENT 12         /* indentation of wrapped usage lines */
#define RMARGIN      79         /* right margin used for wrapping */

/* User-selectable (using an environment variable) formatting parameters.
   They must all be of type `int' for the parsing code to work: values are
   stored through an `int *' computed from the offsets recorded in
   UPARAM_NAMES.  */
struct uparams
{
  /* If true, arguments for an option are shown with both short and long
     options, even when a given option has both, e.g. `-x ARG, --longx=ARG'.
     If false, then if an option has both, the argument is only shown with
     the long one, e.g., `-x, --longx=ARG', and a message indicating that
     this really means both is printed below the options.  */
  int dup_args;

  /* This is true if when DUP_ARGS is false, and some duplicate arguments have
     been suppressed, an explanatory message should be printed.  */
  int dup_args_note;

  /* Various output columns.  */
  int short_opt_col;
  int long_opt_col;
  int doc_opt_col;
  int opt_doc_col;
  int header_col;
  int usage_indent;
  int rmargin;

  int valid;                    /* True when the values in here are valid.  */
};

/* This is a global variable, as user options are only ever read once.  It
   starts out holding the defaults above, with VALID cleared.  */
static struct uparams uparams = {
  DUP_ARGS, DUP_ARGS_NOTE,
  SHORT_OPT_COL, LONG_OPT_COL, DOC_OPT_COL, OPT_DOC_COL, HEADER_COL,
  USAGE_INDENT, RMARGIN,
  0
};

/* A particular uparam, and what the user name is.  */
struct uparam_name
{
  const char *name;             /* User name.  */
  int is_bool;                  /* Whether it's `boolean'.  */
  size_t uparams_offs;          /* Location of the (int) field in UPARAMS.  */
};

/* The name-field mappings we know about.  The table is terminated by an
   entry whose NAME is 0.  */
static const struct uparam_name uparam_names[] =
{
  { "dup-args",       1, offsetof (struct uparams, dup_args) },
  { "dup-args-note",  1, offsetof (struct uparams, dup_args_note) },
  { "short-opt-col",  0, offsetof (struct uparams, short_opt_col) },
  { "long-opt-col",   0, offsetof (struct uparams, long_opt_col) },
  { "doc-opt-col",    0, offsetof (struct uparams, doc_opt_col) },
  { "opt-doc-col",    0, offsetof (struct uparams, opt_doc_col) },
  { "header-col",     0, offsetof (struct uparams, header_col) },
  { "usage-indent",   0, offsetof (struct uparams, usage_indent) },
  { "rmargin",        0, offsetof (struct uparams, rmargin) },
  { 0, 0, 0 }
};

/* Read user options from the environment, and fill in UPARAMS appropriately. 
*/ +static void +fill_in_uparams (const struct argp_state *state) +{ + + const char *var = getenv ("ARGP_HELP_FMT"); + +#define SKIPWS(p) do { while (isspace (*p)) p++; } while (0); + + if (var) + /* Parse var. */ + while (*var) + { + SKIPWS (var); + + if (isalpha (*var)) + { + size_t var_len; + const struct uparam_name *un; + int unspec = 0, val = 0; + const char *arg = var; + + while (isalnum (*arg) || *arg == '-' || *arg == '_') + arg++; + var_len = arg - var; + + SKIPWS (arg); + + if (*arg == '\0' || *arg == ',') + unspec = 1; + else if (*arg == '=') + { + arg++; + SKIPWS (arg); + } + + if (unspec) + { + if (var[0] == 'n' && var[1] == 'o' && var[2] == '-') + { + val = 0; + var += 3; + var_len -= 3; + } + else + val = 1; + } + else if (isdigit (*arg)) + { + val = atoi (arg); + while (isdigit (*arg)) + arg++; + SKIPWS (arg); + } + + for (un = uparam_names; un->name; un++) + if (strlen (un->name) == var_len + && strncmp (var, un->name, var_len) == 0) + { + if (unspec && !un->is_bool) + __argp_failure (state, 0, 0, + dgettext (state->root_argp->argp_domain, "\ +%.*s: ARGP_HELP_FMT parameter requires a value"), + (int) var_len, var); + else + *(int *)((char *)&uparams + un->uparams_offs) = val; + break; + } + if (! un->name) + __argp_failure (state, 0, 0, + dgettext (state->root_argp->argp_domain, "\ +%.*s: Unknown ARGP_HELP_FMT parameter"), + (int) var_len, var); + + var = arg; + if (*var == ',') + var++; + } + else if (*var) + { + __argp_failure (state, 0, 0, + dgettext (state->root_argp->argp_domain, + "Garbage in ARGP_HELP_FMT: %s"), var); + break; + } + } +} + +/* Returns true if OPT hasn't been marked invisible. Visibility only affects + whether OPT is displayed or used in sorting, not option shadowing. */ +#define ovisible(opt) (! ((opt)->flags & OPTION_HIDDEN)) + +/* Returns true if OPT is an alias for an earlier option. */ +#define oalias(opt) ((opt)->flags & OPTION_ALIAS) + +/* Returns true if OPT is an documentation-only entry. 
*/ +#define odoc(opt) ((opt)->flags & OPTION_DOC) + +/* Returns true if OPT is the end-of-list marker for a list of options. */ +#define oend(opt) __option_is_end (opt) + +/* Returns true if OPT has a short option. */ +#define oshort(opt) __option_is_short (opt) + +/* + The help format for a particular option is like: + + -xARG, -yARG, --long1=ARG, --long2=ARG Documentation... + + Where ARG will be omitted if there's no argument, for this option, or + will be surrounded by "[" and "]" appropiately if the argument is + optional. The documentation string is word-wrapped appropiately, and if + the list of options is long enough, it will be started on a separate line. + If there are no short options for a given option, the first long option is + indented slighly in a way that's supposed to make most long options appear + to be in a separate column. + + For example, the following output (from ps): + + -p PID, --pid=PID List the process PID + --pgrp=PGRP List processes in the process group PGRP + -P, -x, --no-parent Include processes without parents + -Q, --all-fields Don't elide unusable fields (normally if there's + some reason ps can't print a field for any + process, it's removed from the output entirely) + -r, --reverse, --gratuitously-long-reverse-option + Reverse the order of any sort + --session[=SID] Add the processes from the session SID (which + defaults to the sid of the current process) + + Here are some more options: + -f ZOT, --foonly=ZOT Glork a foonly + -z, --zaza Snit a zar + + -?, --help Give this help list + --usage Give a short usage message + -V, --version Print program version + + The struct argp_option array for the above could look like: + + { + {"pid", 'p', "PID", 0, "List the process PID"}, + {"pgrp", OPT_PGRP, "PGRP", 0, "List processes in the process group PGRP"}, + {"no-parent", 'P', 0, 0, "Include processes without parents"}, + {0, 'x', 0, OPTION_ALIAS}, + {"all-fields",'Q', 0, 0, "Don't elide unusable fields (normally" + " if there's some 
reason ps can't" + " print a field for any process, it's" + " removed from the output entirely)" }, + {"reverse", 'r', 0, 0, "Reverse the order of any sort"}, + {"gratuitously-long-reverse-option", 0, 0, OPTION_ALIAS}, + {"session", OPT_SESS, "SID", OPTION_ARG_OPTIONAL, + "Add the processes from the session" + " SID (which defaults to the sid of" + " the current process)" }, + + {0,0,0,0, "Here are some more options:"}, + {"foonly", 'f', "ZOT", 0, "Glork a foonly"}, + {"zaza", 'z', 0, 0, "Snit a zar"}, + + {0} + } + + Note that the last three options are automatically supplied by argp_parse, + unless you tell it not to with ARGP_NO_HELP. + +*/ + +/* Returns true if CH occurs between BEG and END. */ +static int +find_char (char ch, char *beg, char *end) +{ + while (beg < end) + if (*beg == ch) + return 1; + else + beg++; + return 0; +} + +struct hol_cluster; /* fwd decl */ + +struct hol_entry +{ + /* First option. */ + const struct argp_option *opt; + /* Number of options (including aliases). */ + unsigned num; + + /* A pointers into the HOL's short_options field, to the first short option + letter for this entry. The order of the characters following this point + corresponds to the order of options pointed to by OPT, and there are at + most NUM. A short option recorded in a option following OPT is only + valid if it occurs in the right place in SHORT_OPTIONS (otherwise it's + probably been shadowed by some other entry). */ + char *short_options; + + /* Entries are sorted by their group first, in the order: + 1, 2, ..., n, 0, -m, ..., -2, -1 + and then alphabetically within each group. The default is 0. */ + int group; + + /* The cluster of options this entry belongs to, or 0 if none. */ + struct hol_cluster *cluster; + + /* The argp from which this option came. */ + const struct argp *argp; +}; + +/* A cluster of entries to reflect the argp tree structure. */ +struct hol_cluster +{ + /* A descriptive header printed before options in this cluster. 
*/ + const char *header; + + /* Used to order clusters within the same group with the same parent, + according to the order in which they occurred in the parent argp's child + list. */ + int index; + + /* How to sort this cluster with respect to options and other clusters at the + same depth (clusters always follow options in the same group). */ + int group; + + /* The cluster to which this cluster belongs, or 0 if it's at the base + level. */ + struct hol_cluster *parent; + + /* The argp from which this cluster is (eventually) derived. */ + const struct argp *argp; + + /* The distance this cluster is from the root. */ + int depth; + + /* Clusters in a given hol are kept in a linked list, to make freeing them + possible. */ + struct hol_cluster *next; +}; + +/* A list of options for help. */ +struct hol +{ + /* An array of hol_entry's. */ + struct hol_entry *entries; + /* The number of entries in this hol. If this field is zero, the others + are undefined. */ + unsigned num_entries; + + /* A string containing all short options in this HOL. Each entry contains + pointers into this string, so the order can't be messed with blindly. */ + char *short_options; + + /* Clusters of entries in this hol. */ + struct hol_cluster *clusters; +}; + +/* Create a struct hol from the options in ARGP. CLUSTER is the + hol_cluster in which these entries occur, or 0, if at the root. */ +static struct hol * +make_hol (const struct argp *argp, struct hol_cluster *cluster) +{ + char *so; + const struct argp_option *o; + const struct argp_option *opts = argp->options; + struct hol_entry *entry; + unsigned num_short_options = 0; + struct hol *hol = malloc (sizeof (struct hol)); + + assert (hol); + + hol->num_entries = 0; + hol->clusters = 0; + + if (opts) + { + int cur_group = 0; + + /* The first option must not be an alias. */ + assert (! oalias (opts)); + + /* Calculate the space needed. */ + for (o = opts; ! oend (o); o++) + { + if (! 
oalias (o)) + hol->num_entries++; + if (oshort (o)) + num_short_options++; /* This is an upper bound. */ + } + + hol->entries = malloc (sizeof (struct hol_entry) * hol->num_entries); + hol->short_options = malloc (num_short_options + 1); + + assert (hol->entries && hol->short_options); + + /* Fill in the entries. */ + so = hol->short_options; + for (o = opts, entry = hol->entries; ! oend (o); entry++) + { + entry->opt = o; + entry->num = 0; + entry->short_options = so; + entry->group = cur_group = + o->group + ? o->group + : ((!o->name && !o->key) + ? cur_group + 1 + : cur_group); + entry->cluster = cluster; + entry->argp = argp; + + do + { + entry->num++; + if (oshort (o) && ! find_char (o->key, hol->short_options, so)) + /* O has a valid short option which hasn't already been used.*/ + *so++ = o->key; + o++; + } + while (! oend (o) && oalias (o)); + } + *so = '\0'; /* null terminated so we can find the length */ + } + + return hol; +} + +/* Add a new cluster to HOL, with the given GROUP and HEADER (taken from the + associated argp child list entry), INDEX, and PARENT, and return a pointer + to it. ARGP is the argp that this cluster results from. */ +static struct hol_cluster * +hol_add_cluster (struct hol *hol, int group, const char *header, int index, + struct hol_cluster *parent, const struct argp *argp) +{ + struct hol_cluster *cl = malloc (sizeof (struct hol_cluster)); + if (cl) + { + cl->group = group; + cl->header = header; + + cl->index = index; + cl->parent = parent; + cl->argp = argp; + cl->depth = parent ? parent->depth + 1 : 0; + + cl->next = hol->clusters; + hol->clusters = cl; + } + return cl; +} + +/* Free HOL and any resources it uses. 
*/ +static void +hol_free (struct hol *hol) +{ + struct hol_cluster *cl = hol->clusters; + + while (cl) + { + struct hol_cluster *next = cl->next; + free (cl); + cl = next; + } + + if (hol->num_entries > 0) + { + free (hol->entries); + free (hol->short_options); + } + + free (hol); +} + +static inline int +hol_entry_short_iterate (const struct hol_entry *entry, + int (*func)(const struct argp_option *opt, + const struct argp_option *real, + const char *domain, void *cookie), + const char *domain, void *cookie) +{ + unsigned nopts; + int val = 0; + const struct argp_option *opt, *real = entry->opt; + char *so = entry->short_options; + + for (opt = real, nopts = entry->num; nopts > 0 && !val; opt++, nopts--) + if (oshort (opt) && *so == opt->key) + { + if (!oalias (opt)) + real = opt; + if (ovisible (opt)) + val = (*func)(opt, real, domain, cookie); + so++; + } + + return val; +} + +static inline int +hol_entry_long_iterate (const struct hol_entry *entry, + int (*func)(const struct argp_option *opt, + const struct argp_option *real, + const char *domain, void *cookie), + const char *domain, void *cookie) +{ + unsigned nopts; + int val = 0; + const struct argp_option *opt, *real = entry->opt; + + for (opt = real, nopts = entry->num; nopts > 0 && !val; opt++, nopts--) + if (opt->name) + { + if (!oalias (opt)) + real = opt; + if (ovisible (opt)) + val = (*func)(opt, real, domain, cookie); + } + + return val; +} + +/* Iterator that returns true for the first short option. */ +static inline int +until_short (const struct argp_option *opt, const struct argp_option *real UNUSED, + const char *domain UNUSED, void *cookie UNUSED) +{ + return oshort (opt) ? opt->key : 0; +} + +/* Returns the first valid short option in ENTRY, or 0 if there is none. 
*/ +static char +hol_entry_first_short (const struct hol_entry *entry) +{ + return hol_entry_short_iterate (entry, until_short, + entry->argp->argp_domain, 0); +} + +/* Returns the first valid long option in ENTRY, or 0 if there is none. */ +static const char * +hol_entry_first_long (const struct hol_entry *entry) +{ + const struct argp_option *opt; + unsigned num; + for (opt = entry->opt, num = entry->num; num > 0; opt++, num--) + if (opt->name && ovisible (opt)) + return opt->name; + return 0; +} + +/* Returns the entry in HOL with the long option name NAME, or 0 if there is + none. */ +static struct hol_entry * +hol_find_entry (struct hol *hol, const char *name) +{ + struct hol_entry *entry = hol->entries; + unsigned num_entries = hol->num_entries; + + while (num_entries-- > 0) + { + const struct argp_option *opt = entry->opt; + unsigned num_opts = entry->num; + + while (num_opts-- > 0) + if (opt->name && ovisible (opt) && strcmp (opt->name, name) == 0) + return entry; + else + opt++; + + entry++; + } + + return 0; +} + +/* If an entry with the long option NAME occurs in HOL, set it's special + sort position to GROUP. */ +static void +hol_set_group (struct hol *hol, const char *name, int group) +{ + struct hol_entry *entry = hol_find_entry (hol, name); + if (entry) + entry->group = group; +} + +/* Order by group: 0, 1, 2, ..., n, -m, ..., -2, -1. + EQ is what to return if GROUP1 and GROUP2 are the same. */ +static int +group_cmp (int group1, int group2, int eq) +{ + if (group1 == group2) + return eq; + else if ((group1 < 0 && group2 < 0) || (group1 >= 0 && group2 >= 0)) + return group1 - group2; + else + return group2 - group1; +} + +/* Compare clusters CL1 & CL2 by the order that they should appear in + output. */ +static int +hol_cluster_cmp (const struct hol_cluster *cl1, const struct hol_cluster *cl2) +{ + /* If one cluster is deeper than the other, use its ancestor at the same + level, so that finding the common ancestor is straightforward. 
*/ + while (cl1->depth < cl2->depth) + cl1 = cl1->parent; + while (cl2->depth < cl1->depth) + cl2 = cl2->parent; + + /* Now reduce both clusters to their ancestors at the point where both have + a common parent; these can be directly compared. */ + while (cl1->parent != cl2->parent) + cl1 = cl1->parent, cl2 = cl2->parent; + + return group_cmp (cl1->group, cl2->group, cl2->index - cl1->index); +} + +/* Return the ancestor of CL that's just below the root (i.e., has a parent + of 0). */ +static struct hol_cluster * +hol_cluster_base (struct hol_cluster *cl) +{ + while (cl->parent) + cl = cl->parent; + return cl; +} + +/* Return true if CL1 is a child of CL2. */ +static int +hol_cluster_is_child (const struct hol_cluster *cl1, + const struct hol_cluster *cl2) +{ + while (cl1 && cl1 != cl2) + cl1 = cl1->parent; + return cl1 == cl2; +} + +/* Given the name of a OPTION_DOC option, modifies NAME to start at the tail + that should be used for comparisons, and returns true iff it should be + treated as a non-option. */ + +/* FIXME: Can we use unsigned char * for the argument? */ +static int +canon_doc_option (const char **name) +{ + int non_opt; + /* Skip initial whitespace. */ + while (isspace ( (unsigned char) **name)) + (*name)++; + /* Decide whether this looks like an option (leading `-') or not. */ + non_opt = (**name != '-'); + /* Skip until part of name used for sorting. */ + while (**name && !isalnum ( (unsigned char) **name)) + (*name)++; + return non_opt; +} + +/* Order ENTRY1 & ENTRY2 by the order which they should appear in a help + listing. */ +static int +hol_entry_cmp (const struct hol_entry *entry1, + const struct hol_entry *entry2) +{ + /* The group numbers by which the entries should be ordered; if either is + in a cluster, then this is just the group within the cluster. 
*/ + int group1 = entry1->group, group2 = entry2->group; + + if (entry1->cluster != entry2->cluster) + { + /* The entries are not within the same cluster, so we can't compare them + directly, we have to use the appropiate clustering level too. */ + if (! entry1->cluster) + /* ENTRY1 is at the `base level', not in a cluster, so we have to + compare it's group number with that of the base cluster in which + ENTRY2 resides. Note that if they're in the same group, the + clustered option always comes laster. */ + return group_cmp (group1, hol_cluster_base (entry2->cluster)->group, -1); + else if (! entry2->cluster) + /* Likewise, but ENTRY2's not in a cluster. */ + return group_cmp (hol_cluster_base (entry1->cluster)->group, group2, 1); + else + /* Both entries are in clusters, we can just compare the clusters. */ + return hol_cluster_cmp (entry1->cluster, entry2->cluster); + } + else if (group1 == group2) + /* The entries are both in the same cluster and group, so compare them + alphabetically. */ + { + int short1 = hol_entry_first_short (entry1); + int short2 = hol_entry_first_short (entry2); + int doc1 = odoc (entry1->opt); + int doc2 = odoc (entry2->opt); + /* FIXME: Can we use unsigned char * instead? */ + const char *long1 = hol_entry_first_long (entry1); + const char *long2 = hol_entry_first_long (entry2); + + if (doc1) + doc1 = canon_doc_option (&long1); + if (doc2) + doc2 = canon_doc_option (&long2); + + if (doc1 != doc2) + /* `documentation' options always follow normal options (or + documentation options that *look* like normal options). */ + return doc1 - doc2; + else if (!short1 && !short2 && long1 && long2) + /* Only long options. */ + return __strcasecmp (long1, long2); + else + /* Compare short/short, long/short, short/long, using the first + character of long options. Entries without *any* valid + options (such as options with OPTION_HIDDEN set) will be put + first, but as they're not displayed, it doesn't matter where + they are. 
*/ + { + unsigned char first1 = short1 ? short1 : long1 ? *long1 : 0; + unsigned char first2 = short2 ? short2 : long2 ? *long2 : 0; +#ifdef _tolower + int lower_cmp = _tolower (first1) - _tolower (first2); +#else + int lower_cmp = tolower (first1) - tolower (first2); +#endif + /* Compare ignoring case, except when the options are both the + same letter, in which case lower-case always comes first. */ + /* NOTE: The subtraction below does the right thing + even with eight-bit chars: first1 and first2 are + converted to int *before* the subtraction. */ + return lower_cmp ? lower_cmp : first2 - first1; + } + } + else + /* Within the same cluster, but not the same group, so just compare + groups. */ + return group_cmp (group1, group2, 0); +} + +/* Version of hol_entry_cmp with correct signature for qsort. */ +static int +hol_entry_qcmp (const void *entry1_v, const void *entry2_v) +{ + return hol_entry_cmp (entry1_v, entry2_v); +} + +/* Sort HOL by group and alphabetically by option name (with short options + taking precedence over long). Since the sorting is for display purposes + only, the shadowing of options isn't effected. */ +static void +hol_sort (struct hol *hol) +{ + if (hol->num_entries > 0) + qsort (hol->entries, hol->num_entries, sizeof (struct hol_entry), + hol_entry_qcmp); +} + +/* Append MORE to HOL, destroying MORE in the process. Options in HOL shadow + any in MORE with the same name. */ +static void +hol_append (struct hol *hol, struct hol *more) +{ + struct hol_cluster **cl_end = &hol->clusters; + + /* Steal MORE's cluster list, and add it to the end of HOL's. */ + while (*cl_end) + cl_end = &(*cl_end)->next; + *cl_end = more->clusters; + more->clusters = 0; + + /* Merge entries. */ + if (more->num_entries > 0) + { + if (hol->num_entries == 0) + { + hol->num_entries = more->num_entries; + hol->entries = more->entries; + hol->short_options = more->short_options; + more->num_entries = 0; /* Mark MORE's fields as invalid. 
*/ + } + else + /* Append the entries in MORE to those in HOL, taking care to only add + non-shadowed SHORT_OPTIONS values. */ + { + unsigned left; + char *so, *more_so; + struct hol_entry *e; + unsigned num_entries = hol->num_entries + more->num_entries; + struct hol_entry *entries = + malloc (num_entries * sizeof (struct hol_entry)); + unsigned hol_so_len = strlen (hol->short_options); + char *short_options = + malloc (hol_so_len + strlen (more->short_options) + 1); + + __mempcpy (__mempcpy (entries, hol->entries, + hol->num_entries * sizeof (struct hol_entry)), + more->entries, + more->num_entries * sizeof (struct hol_entry)); + + __mempcpy (short_options, hol->short_options, hol_so_len); + + /* Fix up the short options pointers from HOL. */ + for (e = entries, left = hol->num_entries; left > 0; e++, left--) + e->short_options += (short_options - hol->short_options); + + /* Now add the short options from MORE, fixing up its entries + too. */ + so = short_options + hol_so_len; + more_so = more->short_options; + for (left = more->num_entries; left > 0; e++, left--) + { + int opts_left; + const struct argp_option *opt; + + e->short_options = so; + + for (opts_left = e->num, opt = e->opt; opts_left; opt++, opts_left--) + { + int ch = *more_so; + if (oshort (opt) && ch == opt->key) + /* The next short option in MORE_SO, CH, is from OPT. */ + { + if (! find_char (ch, short_options, + short_options + hol_so_len)) + /* The short option CH isn't shadowed by HOL's options, + so add it to the sum. */ + *so++ = ch; + more_so++; + } + } + } + + *so = '\0'; + + free (hol->entries); + free (hol->short_options); + + hol->entries = entries; + hol->num_entries = num_entries; + hol->short_options = short_options; + } + } + + hol_free (more); +} + +/* Inserts enough spaces to make sure STREAM is at column COL. 
*/ +static void +indent_to (argp_fmtstream_t stream, unsigned col) +{ + int needed = col - __argp_fmtstream_point (stream); + while (needed-- > 0) + __argp_fmtstream_putc (stream, ' '); +} + +/* Output to STREAM either a space, or a newline if there isn't room for at + least ENSURE characters before the right margin. */ +static void +space (argp_fmtstream_t stream, size_t ensure) +{ + if (__argp_fmtstream_point (stream) + ensure + >= __argp_fmtstream_rmargin (stream)) + __argp_fmtstream_putc (stream, '\n'); + else + __argp_fmtstream_putc (stream, ' '); +} + +/* If the option REAL has an argument, we print it in using the printf + format REQ_FMT or OPT_FMT depending on whether it's a required or + optional argument. */ +static void +arg (const struct argp_option *real, const char *req_fmt, const char *opt_fmt, + const char *domain UNUSED, argp_fmtstream_t stream) +{ + if (real->arg) + { + if (real->flags & OPTION_ARG_OPTIONAL) + __argp_fmtstream_printf (stream, opt_fmt, + dgettext (domain, real->arg)); + else + __argp_fmtstream_printf (stream, req_fmt, + dgettext (domain, real->arg)); + } +} + +/* Helper functions for hol_entry_help. */ + +/* State used during the execution of hol_help. */ +struct hol_help_state +{ + /* PREV_ENTRY should contain the previous entry printed, or 0. */ + struct hol_entry *prev_entry; + + /* If an entry is in a different group from the previous one, and SEP_GROUPS + is true, then a blank line will be printed before any output. */ + int sep_groups; + + /* True if a duplicate option argument was suppressed (only ever set if + UPARAMS.dup_args is false). */ + int suppressed_dup_arg; +}; + +/* Some state used while printing a help entry (used to communicate with + helper functions). See the doc for hol_entry_help for more info, as most + of the fields are copied from its arguments. 
*/ +struct pentry_state +{ + const struct hol_entry *entry; + argp_fmtstream_t stream; + struct hol_help_state *hhstate; + + /* True if nothing's been printed so far. */ + int first; + + /* If non-zero, the state that was used to print this help. */ + const struct argp_state *state; +}; + +/* If a user doc filter should be applied to DOC, do so. */ +static const char * +filter_doc (const char *doc, int key, const struct argp *argp, + const struct argp_state *state) +{ + if (argp->help_filter) + /* We must apply a user filter to this output. */ + { + void *input = __argp_input (argp, state); + return (*argp->help_filter) (key, doc, input); + } + else + /* No filter. */ + return doc; +} + +/* Prints STR as a header line, with the margin lines set appropiately, and + notes the fact that groups should be separated with a blank line. ARGP is + the argp that should dictate any user doc filtering to take place. Note + that the previous wrap margin isn't restored, but the left margin is reset + to 0. */ +static void +print_header (const char *str, const struct argp *argp, + struct pentry_state *pest) +{ + const char *tstr = dgettext (argp->argp_domain, str); + const char *fstr = filter_doc (tstr, ARGP_KEY_HELP_HEADER, argp, pest->state); + + if (fstr) + { + if (*fstr) + { + if (pest->hhstate->prev_entry) + /* Precede with a blank line. */ + __argp_fmtstream_putc (pest->stream, '\n'); + indent_to (pest->stream, uparams.header_col); + __argp_fmtstream_set_lmargin (pest->stream, uparams.header_col); + __argp_fmtstream_set_wmargin (pest->stream, uparams.header_col); + __argp_fmtstream_puts (pest->stream, fstr); + __argp_fmtstream_set_lmargin (pest->stream, 0); + __argp_fmtstream_putc (pest->stream, '\n'); + } + + pest->hhstate->sep_groups = 1; /* Separate subsequent groups. */ + } + + if (fstr != tstr) + free ((char *) fstr); +} + +/* Inserts a comma if this isn't the first item on the line, and then makes + sure we're at least to column COL. 
If this *is* the first item on a line, + prints any pending whitespace/headers that should precede this line. Also + clears FIRST. */ +static void +comma (unsigned col, struct pentry_state *pest) +{ + if (pest->first) + { + const struct hol_entry *pe = pest->hhstate->prev_entry; + const struct hol_cluster *cl = pest->entry->cluster; + + if (pest->hhstate->sep_groups && pe && pest->entry->group != pe->group) + __argp_fmtstream_putc (pest->stream, '\n'); + + if (cl && cl->header && *cl->header + && (!pe + || (pe->cluster != cl + && !hol_cluster_is_child (pe->cluster, cl)))) + /* If we're changing clusters, then this must be the start of the + ENTRY's cluster unless that is an ancestor of the previous one + (in which case we had just popped into a sub-cluster for a bit). + If so, then print the cluster's header line. */ + { + int old_wm = __argp_fmtstream_wmargin (pest->stream); + print_header (cl->header, cl->argp, pest); + __argp_fmtstream_set_wmargin (pest->stream, old_wm); + } + + pest->first = 0; + } + else + __argp_fmtstream_puts (pest->stream, ", "); + + indent_to (pest->stream, col); +} + +/* Print help for ENTRY to STREAM. */ +static void +hol_entry_help (struct hol_entry *entry, const struct argp_state *state, + argp_fmtstream_t stream, struct hol_help_state *hhstate) +{ + unsigned num; + const struct argp_option *real = entry->opt, *opt; + char *so = entry->short_options; + int have_long_opt = 0; /* We have any long options. */ + /* Saved margins. */ + int old_lm = __argp_fmtstream_set_lmargin (stream, 0); + int old_wm = __argp_fmtstream_wmargin (stream); + /* PEST is a state block holding some of our variables that we'd like to + share with helper functions. */ + + /* Decent initializers are a GNU extension, so don't use it here. */ + struct pentry_state pest; + pest.entry = entry; + pest.stream = stream; + pest.hhstate = hhstate; + pest.first = 1; + pest.state = state; + + if (! 
odoc (real)) + for (opt = real, num = entry->num; num > 0; opt++, num--) + if (opt->name && ovisible (opt)) + { + have_long_opt = 1; + break; + } + + /* First emit short options. */ + __argp_fmtstream_set_wmargin (stream, uparams.short_opt_col); /* For truly bizarre cases. */ + for (opt = real, num = entry->num; num > 0; opt++, num--) + if (oshort (opt) && opt->key == *so) + /* OPT has a valid (non shadowed) short option. */ + { + if (ovisible (opt)) + { + comma (uparams.short_opt_col, &pest); + __argp_fmtstream_putc (stream, '-'); + __argp_fmtstream_putc (stream, *so); + if (!have_long_opt || uparams.dup_args) + arg (real, " %s", "[%s]", state->root_argp->argp_domain, stream); + else if (real->arg) + hhstate->suppressed_dup_arg = 1; + } + so++; + } + + /* Now, long options. */ + if (odoc (real)) + /* A `documentation' option. */ + { + __argp_fmtstream_set_wmargin (stream, uparams.doc_opt_col); + for (opt = real, num = entry->num; num > 0; opt++, num--) + if (opt->name && ovisible (opt)) + { + comma (uparams.doc_opt_col, &pest); + /* Calling gettext here isn't quite right, since sorting will + have been done on the original; but documentation options + should be pretty rare anyway... */ + __argp_fmtstream_puts (stream, + dgettext (state->root_argp->argp_domain, + opt->name)); + } + } + else + /* A real long option. */ + { + int first_long_opt = 1; + + __argp_fmtstream_set_wmargin (stream, uparams.long_opt_col); + for (opt = real, num = entry->num; num > 0; opt++, num--) + if (opt->name && ovisible (opt)) + { + comma (uparams.long_opt_col, &pest); + __argp_fmtstream_printf (stream, "--%s", opt->name); + if (first_long_opt || uparams.dup_args) + arg (real, "=%s", "[=%s]", state->root_argp->argp_domain, + stream); + else if (real->arg) + hhstate->suppressed_dup_arg = 1; + } + } + + /* Next, documentation strings. */ + __argp_fmtstream_set_lmargin (stream, 0); + + if (pest.first) + { + /* Didn't print any switches, what's up? 
*/ + if (!oshort (real) && !real->name) + /* This is a group header, print it nicely. */ + print_header (real->doc, entry->argp, &pest); + else + /* Just a totally shadowed option or null header; print nothing. */ + goto cleanup; /* Just return, after cleaning up. */ + } + else + { + const char *tstr = real->doc ? dgettext (state->root_argp->argp_domain, + real->doc) : 0; + const char *fstr = filter_doc (tstr, real->key, entry->argp, state); + if (fstr && *fstr) + { + unsigned int col = __argp_fmtstream_point (stream); + + __argp_fmtstream_set_lmargin (stream, uparams.opt_doc_col); + __argp_fmtstream_set_wmargin (stream, uparams.opt_doc_col); + + if (col > (unsigned int) (uparams.opt_doc_col + 3)) + __argp_fmtstream_putc (stream, '\n'); + else if (col >= (unsigned int) uparams.opt_doc_col) + __argp_fmtstream_puts (stream, " "); + else + indent_to (stream, uparams.opt_doc_col); + + __argp_fmtstream_puts (stream, fstr); + } + if (fstr && fstr != tstr) + free ((char *) fstr); + + /* Reset the left margin. */ + __argp_fmtstream_set_lmargin (stream, 0); + __argp_fmtstream_putc (stream, '\n'); + } + + hhstate->prev_entry = entry; + +cleanup: + __argp_fmtstream_set_lmargin (stream, old_lm); + __argp_fmtstream_set_wmargin (stream, old_wm); +} + +/* Output a long help message about the options in HOL to STREAM. */ +static void +hol_help (struct hol *hol, const struct argp_state *state, + argp_fmtstream_t stream) +{ + unsigned num; + struct hol_entry *entry; + struct hol_help_state hhstate = { 0, 0, 0 }; + + for (entry = hol->entries, num = hol->num_entries; num > 0; entry++, num--) + hol_entry_help (entry, state, stream, &hhstate); + + if (hhstate.suppressed_dup_arg && uparams.dup_args_note) + { + const char *tstr = dgettext (state->root_argp->argp_domain, "\ +Mandatory or optional arguments to long options are also mandatory or \ +optional for any corresponding short options."); + const char *fstr = filter_doc (tstr, ARGP_KEY_HELP_DUP_ARGS_NOTE, + state ? 
state->root_argp : 0, state); + if (fstr && *fstr) + { + __argp_fmtstream_putc (stream, '\n'); + __argp_fmtstream_puts (stream, fstr); + __argp_fmtstream_putc (stream, '\n'); + } + if (fstr && fstr != tstr) + free ((char *) fstr); + } +} + +/* Helper functions for hol_usage. */ + +/* If OPT is a short option without an arg, append its key to the string + pointer pointed to by COOKIE, and advance the pointer. */ +static int +add_argless_short_opt (const struct argp_option *opt, + const struct argp_option *real, + const char *domain UNUSED, void *cookie) +{ + char **snao_end = cookie; + if (!(opt->arg || real->arg) + && !((opt->flags | real->flags) & OPTION_NO_USAGE)) + *(*snao_end)++ = opt->key; + return 0; +} + +/* If OPT is a short option with an arg, output a usage entry for it to the + stream pointed at by COOKIE. */ +static int +usage_argful_short_opt (const struct argp_option *opt, + const struct argp_option *real, + const char *domain UNUSED, void *cookie) +{ + argp_fmtstream_t stream = cookie; + const char *arg = opt->arg; + int flags = opt->flags | real->flags; + + if (! arg) + arg = real->arg; + + if (arg && !(flags & OPTION_NO_USAGE)) + { + arg = dgettext (domain, arg); + + if (flags & OPTION_ARG_OPTIONAL) + __argp_fmtstream_printf (stream, " [-%c[%s]]", opt->key, arg); + else + { + /* Manually do line wrapping so that it (probably) won't + get wrapped at the embedded space. */ + space (stream, 6 + strlen (arg)); + __argp_fmtstream_printf (stream, "[-%c %s]", opt->key, arg); + } + } + + return 0; +} + +/* Output a usage entry for the long option OPT to the stream pointed at by + COOKIE. */ +static int +usage_long_opt (const struct argp_option *opt, + const struct argp_option *real, + const char *domain UNUSED, void *cookie) +{ + argp_fmtstream_t stream = cookie; + const char *arg = opt->arg; + int flags = opt->flags | real->flags; + + if (! arg) + arg = real->arg; + + if (! 
(flags & OPTION_NO_USAGE)) + { + if (arg) + { + arg = dgettext (domain, arg); + if (flags & OPTION_ARG_OPTIONAL) + __argp_fmtstream_printf (stream, " [--%s[=%s]]", opt->name, arg); + else + __argp_fmtstream_printf (stream, " [--%s=%s]", opt->name, arg); + } + else + __argp_fmtstream_printf (stream, " [--%s]", opt->name); + } + + return 0; +} + +/* Print a short usage description for the arguments in HOL to STREAM. */ +static void +hol_usage (struct hol *hol, argp_fmtstream_t stream) +{ + if (hol->num_entries > 0) + { + unsigned nentries; + struct hol_entry *entry; + char *short_no_arg_opts = alloca (strlen (hol->short_options) + 1); + char *snao_end = short_no_arg_opts; + + /* First we put a list of short options without arguments. */ + for (entry = hol->entries, nentries = hol->num_entries + ; nentries > 0 + ; entry++, nentries--) + hol_entry_short_iterate (entry, add_argless_short_opt, + entry->argp->argp_domain, &snao_end); + if (snao_end > short_no_arg_opts) + { + *snao_end++ = 0; + __argp_fmtstream_printf (stream, " [-%s]", short_no_arg_opts); + } + + /* Now a list of short options *with* arguments. */ + for (entry = hol->entries, nentries = hol->num_entries + ; nentries > 0 + ; entry++, nentries--) + hol_entry_short_iterate (entry, usage_argful_short_opt, + entry->argp->argp_domain, stream); + + /* Finally, a list of long options (whew!). */ + for (entry = hol->entries, nentries = hol->num_entries + ; nentries > 0 + ; entry++, nentries--) + hol_entry_long_iterate (entry, usage_long_opt, + entry->argp->argp_domain, stream); + } +} + +/* Make a HOL containing all levels of options in ARGP. CLUSTER is the + cluster in which ARGP's entries should be clustered, or 0. 
*/ +static struct hol * +argp_hol (const struct argp *argp, struct hol_cluster *cluster) +{ + const struct argp_child *child = argp->children; + struct hol *hol = make_hol (argp, cluster); + if (child) + while (child->argp) + { + struct hol_cluster *child_cluster = + ((child->group || child->header) + /* Put CHILD->argp within its own cluster. */ + ? hol_add_cluster (hol, child->group, child->header, + child - argp->children, cluster, argp) + /* Just merge it into the parent's cluster. */ + : cluster); + hol_append (hol, argp_hol (child->argp, child_cluster)) ; + child++; + } + return hol; +} + +/* Calculate how many different levels with alternative args strings exist in + ARGP. */ +static size_t +argp_args_levels (const struct argp *argp) +{ + size_t levels = 0; + const struct argp_child *child = argp->children; + + if (argp->args_doc && strchr (argp->args_doc, '\n')) + levels++; + + if (child) + while (child->argp) + levels += argp_args_levels ((child++)->argp); + + return levels; +} + +/* Print all the non-option args documented in ARGP to STREAM. Any output is + preceded by a space. LEVELS is a pointer to a byte vector the length + returned by argp_args_levels; it should be initialized to zero, and + updated by this routine for the next call if ADVANCE is true. True is + returned as long as there are more patterns to output. */ +static int +argp_args_usage (const struct argp *argp, const struct argp_state *state, + char **levels, int advance, argp_fmtstream_t stream) +{ + char *our_level = *levels; + int multiple = 0; + const struct argp_child *child = argp->children; + const char *tdoc = dgettext (argp->argp_domain, argp->args_doc), *nl = 0; + const char *fdoc = filter_doc (tdoc, ARGP_KEY_HELP_ARGS_DOC, argp, state); + + if (fdoc) + { + const char *cp = fdoc; + nl = __strchrnul (cp, '\n'); + if (*nl != '\0') + /* This is a `multi-level' args doc; advance to the correct position + as determined by our state in LEVELS, and update LEVELS. 
*/ + { + int i; + multiple = 1; + for (i = 0; i < *our_level; i++) + cp = nl + 1, nl = __strchrnul (cp, '\n'); + (*levels)++; + } + + /* Manually do line wrapping so that it (probably) won't get wrapped at + any embedded spaces. */ + space (stream, 1 + nl - cp); + + __argp_fmtstream_write (stream, cp, nl - cp); + } + if (fdoc && fdoc != tdoc) + free ((char *)fdoc); /* Free user's modified doc string. */ + + if (child) + while (child->argp) + advance = !argp_args_usage ((child++)->argp, state, levels, advance, stream); + + if (advance && multiple) + { + /* Need to increment our level. */ + if (*nl) + /* There's more we can do here. */ + { + (*our_level)++; + advance = 0; /* Our parent shouldn't advance also. */ + } + else if (*our_level > 0) + /* We had multiple levels, but used them up; reset to zero. */ + *our_level = 0; + } + + return !advance; +} + +/* Print the documentation for ARGP to STREAM; if POST is false, then + everything preceding a `\v' character in the documentation strings (or + the whole string, for those with none) is printed, otherwise, everything + following the `\v' character (nothing for strings without). Each separate + bit of documentation is separated by a blank line, and if PRE_BLANK is true, + then the first is as well. If FIRST_ONLY is true, only the first + occurrence is output. Returns true if anything was output. */ +static int +argp_doc (const struct argp *argp, const struct argp_state *state, + int post, int pre_blank, int first_only, + argp_fmtstream_t stream) +{ + const char *text; + const char *inp_text; + void *input = 0; + int anything = 0; + size_t inp_text_limit = 0; + const char *doc = dgettext (argp->argp_domain, argp->doc); + const struct argp_child *child = argp->children; + + if (doc) + { + char *vt = strchr (doc, '\v'); + inp_text = post ? (vt ? vt + 1 : 0) : doc; + inp_text_limit = (!post && vt) ? (vt - doc) : 0; + } + else + inp_text = 0; + + if (argp->help_filter) + /* We have to filter the doc strings. 
*/ + { + if (inp_text_limit) + /* Copy INP_TEXT so that it's nul-terminated. */ + inp_text = STRNDUP (inp_text, inp_text_limit); + input = __argp_input (argp, state); + text = + (*argp->help_filter) (post + ? ARGP_KEY_HELP_POST_DOC + : ARGP_KEY_HELP_PRE_DOC, + inp_text, input); + } + else + text = (const char *) inp_text; + + if (text) + { + if (pre_blank) + __argp_fmtstream_putc (stream, '\n'); + + if (text == inp_text && inp_text_limit) + __argp_fmtstream_write (stream, inp_text, inp_text_limit); + else + __argp_fmtstream_puts (stream, text); + + if (__argp_fmtstream_point (stream) > __argp_fmtstream_lmargin (stream)) + __argp_fmtstream_putc (stream, '\n'); + + anything = 1; + } + + if (text && text != inp_text) + free ((char *) text); /* Free TEXT returned from the help filter. */ + if (inp_text && inp_text_limit && argp->help_filter) + free ((char *) inp_text); /* We copied INP_TEXT, so free it now. */ + + if (post && argp->help_filter) + /* Now see if we have to output an ARGP_KEY_HELP_EXTRA text. */ + { + text = (*argp->help_filter) (ARGP_KEY_HELP_EXTRA, 0, input); + if (text) + { + if (anything || pre_blank) + __argp_fmtstream_putc (stream, '\n'); + __argp_fmtstream_puts (stream, text); + free ((char *) text); + if (__argp_fmtstream_point (stream) + > __argp_fmtstream_lmargin (stream)) + __argp_fmtstream_putc (stream, '\n'); + anything = 1; + } + } + + if (child) + while (child->argp && !(first_only && anything)) + anything |= + argp_doc ((child++)->argp, state, + post, anything || pre_blank, first_only, + stream); + + return anything; +} + +/* Output a usage message for ARGP to STREAM. If called from + argp_state_help, STATE is the relevant parsing state. FLAGS are from the + set ARGP_HELP_*. NAME is what to use wherever a `program name' is + needed. */ + +static void +_help (const struct argp *argp, const struct argp_state *state, FILE *stream, + unsigned flags, const char *name) +{ + int anything = 0; /* Whether we've output anything. 
*/ + struct hol *hol = 0; + argp_fmtstream_t fs; + + if (! stream) + return; + + FLOCKFILE (stream); + + if (! uparams.valid) + fill_in_uparams (state); + + fs = __argp_make_fmtstream (stream, 0, uparams.rmargin, 0); + if (! fs) + { + FUNLOCKFILE (stream); + return; + } + + if (flags & (ARGP_HELP_USAGE | ARGP_HELP_SHORT_USAGE | ARGP_HELP_LONG)) + { + hol = argp_hol (argp, 0); + + /* If present, these options always come last. */ + hol_set_group (hol, "help", -1); + hol_set_group (hol, "version", -1); + + hol_sort (hol); + } + + if (flags & (ARGP_HELP_USAGE | ARGP_HELP_SHORT_USAGE)) + /* Print a short `Usage:' message. */ + { + int first_pattern = 1, more_patterns; + size_t num_pattern_levels = argp_args_levels (argp); + char *pattern_levels = alloca (num_pattern_levels); + + memset (pattern_levels, 0, num_pattern_levels); + + do + { + int old_lm; + int old_wm = __argp_fmtstream_set_wmargin (fs, uparams.usage_indent); + char *levels = pattern_levels; + + if (first_pattern) + __argp_fmtstream_printf (fs, "%s %s", + dgettext (argp->argp_domain, "Usage:"), + name); + else + __argp_fmtstream_printf (fs, "%s %s", + dgettext (argp->argp_domain, " or: "), + name); + + /* We set the lmargin as well as the wmargin, because hol_usage + manually wraps options with newline to avoid annoying breaks. */ + old_lm = __argp_fmtstream_set_lmargin (fs, uparams.usage_indent); + + if (flags & ARGP_HELP_SHORT_USAGE) + /* Just show where the options go. */ + { + if (hol->num_entries > 0) + __argp_fmtstream_puts (fs, dgettext (argp->argp_domain, + " [OPTION...]")); + } + else + /* Actually print the options. */ + { + hol_usage (hol, fs); + flags |= ARGP_HELP_SHORT_USAGE; /* But only do so once. 
*/ + } + + more_patterns = argp_args_usage (argp, state, &levels, 1, fs); + + __argp_fmtstream_set_wmargin (fs, old_wm); + __argp_fmtstream_set_lmargin (fs, old_lm); + + __argp_fmtstream_putc (fs, '\n'); + anything = 1; + + first_pattern = 0; + } + while (more_patterns); + } + + if (flags & ARGP_HELP_PRE_DOC) + anything |= argp_doc (argp, state, 0, 0, 1, fs); + + if (flags & ARGP_HELP_SEE) + { + __argp_fmtstream_printf (fs, dgettext (argp->argp_domain, "\ +Try `%s --help' or `%s --usage' for more information.\n"), + name, name); + anything = 1; + } + + if (flags & ARGP_HELP_LONG) + /* Print a long, detailed help message. */ + { + /* Print info about all the options. */ + if (hol->num_entries > 0) + { + if (anything) + __argp_fmtstream_putc (fs, '\n'); + hol_help (hol, state, fs); + anything = 1; + } + } + + if (flags & ARGP_HELP_POST_DOC) + /* Print any documentation strings at the end. */ + anything |= argp_doc (argp, state, 1, anything, 0, fs); + + if ((flags & ARGP_HELP_BUG_ADDR) && argp_program_bug_address) + { + if (anything) + __argp_fmtstream_putc (fs, '\n'); + __argp_fmtstream_printf (fs, dgettext (argp->argp_domain, + "Report bugs to %s.\n"), + argp_program_bug_address); + anything = 1; + } + + FUNLOCKFILE (stream); + + if (hol) + hol_free (hol); + + __argp_fmtstream_free (fs); +} + +/* Output a usage message for ARGP to STREAM. FLAGS are from the set + ARGP_HELP_*. NAME is what to use wherever a `program name' is needed. */ +void __argp_help (const struct argp *argp, FILE *stream, + unsigned flags, char *name) +{ + _help (argp, 0, stream, flags, name); +} +#ifdef weak_alias +weak_alias (__argp_help, argp_help) +#endif + +char *__argp_basename(char *name) +{ + char *short_name = strrchr(name, '/'); + return short_name ? 
short_name + 1 : name; +} + +char * +__argp_short_program_name(const struct argp_state *state) +{ + if (state) + return state->name; +#if HAVE_DECL_PROGRAM_INVOCATION_SHORT_NAME + return program_invocation_short_name; +#elif HAVE_DECL_PROGRAM_INVOCATION_NAME + return __argp_basename(program_invocation_name); +#else /* !HAVE_DECL_PROGRAM_INVOCATION_NAME */ + /* FIXME: What now? Miles suggests that it is better to use NULL, + but currently the value is passed on directly to fputs_unlocked, + so that requires more changes. */ +# if __GNUC__ + return ""; +# endif /* __GNUC__ */ +#endif /* !HAVE_DECL_PROGRAM_INVOCATION_NAME */ +} + +/* Output, if appropriate, a usage message for STATE to STREAM. FLAGS are + from the set ARGP_HELP_*. */ +void +__argp_state_help (const struct argp_state *state, FILE *stream, unsigned flags) +{ + if ((!state || ! (state->flags & ARGP_NO_ERRS)) && stream) + { + if (state && (state->flags & ARGP_LONG_ONLY)) + flags |= ARGP_HELP_LONG_ONLY; + + _help (state ? state->root_argp : 0, state, stream, flags, + __argp_short_program_name(state)); + + if (!state || ! (state->flags & ARGP_NO_EXIT)) + { + if (flags & ARGP_HELP_EXIT_ERR) + exit (argp_err_exit_status); + if (flags & ARGP_HELP_EXIT_OK) + exit (0); + } + } +} +#ifdef weak_alias +weak_alias (__argp_state_help, argp_state_help) +#endif + +/* If appropriate, print the printf string FMT and following args, preceded + by the program name and `:', to stderr, and followed by a `Try ... --help' + message, then exit (1). */ +void +__argp_error (const struct argp_state *state, const char *fmt, ...) +{ + if (!state || !(state->flags & ARGP_NO_ERRS)) + { + FILE *stream = state ? 
state->err_stream : stderr; + + if (stream) + { + va_list ap; + + FLOCKFILE (stream); + + FPUTS_UNLOCKED (__argp_short_program_name(state), + stream); + PUTC_UNLOCKED (':', stream); + PUTC_UNLOCKED (' ', stream); + + va_start (ap, fmt); + vfprintf (stream, fmt, ap); + va_end (ap); + + PUTC_UNLOCKED ('\n', stream); + + __argp_state_help (state, stream, ARGP_HELP_STD_ERR); + + FUNLOCKFILE (stream); + } + } +} +#ifdef weak_alias +weak_alias (__argp_error, argp_error) +#endif + +/* Similar to the standard gnu error-reporting function error(), but will + respect the ARGP_NO_EXIT and ARGP_NO_ERRS flags in STATE, and will print + to STATE->err_stream. This is useful for argument parsing code that is + shared between program startup (when exiting is desired) and runtime + option parsing (when typically an error code is returned instead). The + difference between this function and argp_error is that the latter is for + *parsing errors*, and the former is for other problems that occur during + parsing but don't reflect a (syntactic) problem with the input. */ +void +__argp_failure (const struct argp_state *state, int status, int errnum, + const char *fmt, ...) +{ + if (!state || !(state->flags & ARGP_NO_ERRS)) + { + FILE *stream = state ? 
state->err_stream : stderr; + + if (stream) + { + FLOCKFILE (stream); + + FPUTS_UNLOCKED (__argp_short_program_name(state), + stream); + + if (fmt) + { + va_list ap; + + PUTC_UNLOCKED (':', stream); + PUTC_UNLOCKED (' ', stream); + + va_start (ap, fmt); + vfprintf (stream, fmt, ap); + va_end (ap); + } + + if (errnum) + { + PUTC_UNLOCKED (':', stream); + PUTC_UNLOCKED (' ', stream); + fputs (STRERROR (errnum), stream); + } + + PUTC_UNLOCKED ('\n', stream); + + FUNLOCKFILE (stream); + + if (status && (!state || !(state->flags & ARGP_NO_EXIT))) + exit (status); + } + } +} +#ifdef weak_alias +weak_alias (__argp_failure, argp_failure) +#endif diff --git a/argp-standalone/argp-namefrob.h b/argp-standalone/argp-namefrob.h new file mode 100644 index 000000000..0ce11481a --- /dev/null +++ b/argp-standalone/argp-namefrob.h @@ -0,0 +1,96 @@ +/* Name frobnication for compiling argp outside of glibc + Copyright (C) 1997 Free Software Foundation, Inc. + This file is part of the GNU C Library. + Written by Miles Bader . + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Library General Public License as + published by the Free Software Foundation; either version 2 of the + License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Library General Public License for more details. + + You should have received a copy of the GNU Library General Public + License along with the GNU C Library; see the file COPYING.LIB. If not, + write to the Free Software Foundation, Inc., 59 Temple Place - Suite 330, + Boston, MA 02111-1307, USA. */ + +#if !_LIBC +/* This code is written for inclusion in gnu-libc, and uses names in the + namespace reserved for libc. If we're not compiling in libc, define those + names to be the normal ones instead. 
*/ + +/* argp-parse functions */ +#undef __argp_parse +#define __argp_parse argp_parse +#undef __option_is_end +#define __option_is_end _option_is_end +#undef __option_is_short +#define __option_is_short _option_is_short +#undef __argp_input +#define __argp_input _argp_input + +/* argp-help functions */ +#undef __argp_help +#define __argp_help argp_help +#undef __argp_error +#define __argp_error argp_error +#undef __argp_failure +#define __argp_failure argp_failure +#undef __argp_state_help +#define __argp_state_help argp_state_help +#undef __argp_usage +#define __argp_usage argp_usage +#undef __argp_basename +#define __argp_basename _argp_basename +#undef __argp_short_program_name +#define __argp_short_program_name _argp_short_program_name + +/* argp-fmtstream functions */ +#undef __argp_make_fmtstream +#define __argp_make_fmtstream argp_make_fmtstream +#undef __argp_fmtstream_free +#define __argp_fmtstream_free argp_fmtstream_free +#undef __argp_fmtstream_putc +#define __argp_fmtstream_putc argp_fmtstream_putc +#undef __argp_fmtstream_puts +#define __argp_fmtstream_puts argp_fmtstream_puts +#undef __argp_fmtstream_write +#define __argp_fmtstream_write argp_fmtstream_write +#undef __argp_fmtstream_printf +#define __argp_fmtstream_printf argp_fmtstream_printf +#undef __argp_fmtstream_set_lmargin +#define __argp_fmtstream_set_lmargin argp_fmtstream_set_lmargin +#undef __argp_fmtstream_set_rmargin +#define __argp_fmtstream_set_rmargin argp_fmtstream_set_rmargin +#undef __argp_fmtstream_set_wmargin +#define __argp_fmtstream_set_wmargin argp_fmtstream_set_wmargin +#undef __argp_fmtstream_point +#define __argp_fmtstream_point argp_fmtstream_point +#undef __argp_fmtstream_update +#define __argp_fmtstream_update _argp_fmtstream_update +#undef __argp_fmtstream_ensure +#define __argp_fmtstream_ensure _argp_fmtstream_ensure +#undef __argp_fmtstream_lmargin +#define __argp_fmtstream_lmargin argp_fmtstream_lmargin +#undef __argp_fmtstream_rmargin +#define 
__argp_fmtstream_rmargin argp_fmtstream_rmargin +#undef __argp_fmtstream_wmargin +#define __argp_fmtstream_wmargin argp_fmtstream_wmargin + +/* normal libc functions we call */ +#undef __sleep +#define __sleep sleep +#undef __strcasecmp +#define __strcasecmp strcasecmp +#undef __vsnprintf +#define __vsnprintf vsnprintf + +#endif /* !_LIBC */ + +#ifndef __set_errno +#define __set_errno(e) (errno = (e)) +#endif diff --git a/argp-standalone/argp-parse.c b/argp-standalone/argp-parse.c new file mode 100644 index 000000000..78f7bf139 --- /dev/null +++ b/argp-standalone/argp-parse.c @@ -0,0 +1,1305 @@ +/* Hierarchial argument parsing + Copyright (C) 1995, 96, 97, 98, 99, 2000,2003 Free Software Foundation, Inc. + This file is part of the GNU C Library. + Written by Miles Bader . + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Library General Public License as + published by the Free Software Foundation; either version 2 of the + License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Library General Public License for more details. + + You should have received a copy of the GNU Library General Public + License along with the GNU C Library; see the file COPYING.LIB. If not, + write to the Free Software Foundation, Inc., 59 Temple Place - Suite 330, + Boston, MA 02111-1307, USA. */ + +#ifndef _GNU_SOURCE +# define _GNU_SOURCE 1 +#endif + +#ifdef HAVE_CONFIG_H +#include +#endif + +#if HAVE_ALLOCA_H +#include +#endif + +#include +#include +#if HAVE_UNISTD_H +# include +#endif +#include +#include + +#if HAVE_MALLOC_H +/* Needed, for alloca on windows */ +# include +#endif + +#ifndef _ +/* This is for other GNU distributions with internationalized messages. + When compiling libc, the _ macro is predefined. 
*/ +# if defined HAVE_LIBINTL_H || defined _LIBC +# include +# ifdef _LIBC +# undef dgettext +# define dgettext(domain, msgid) __dcgettext (domain, msgid, LC_MESSAGES) +# endif +# else +# define dgettext(domain, msgid) (msgid) +# define gettext(msgid) (msgid) +# endif +#endif +#ifndef N_ +# define N_(msgid) (msgid) +#endif + +#if _LIBC - 0 +#include +#else +#ifdef HAVE_CTHREADS_H +#include +#endif +#endif /* _LIBC */ + +#include "argp.h" +#include "argp-namefrob.h" + + +/* The meta-argument used to prevent any further arguments being interpreted + as options. */ +#define QUOTE "--" + +/* EZ alias for ARGP_ERR_UNKNOWN. */ +#define EBADKEY ARGP_ERR_UNKNOWN + + +/* Default options. */ + +/* When argp is given the --HANG switch, _ARGP_HANG is set and argp will sleep + for one second intervals, decrementing _ARGP_HANG until it's zero. Thus + you can force the program to continue by attaching a debugger and setting + it to 0 yourself. */ +volatile int _argp_hang; + +#define OPT_PROGNAME -2 +#define OPT_USAGE -3 +#if HAVE_SLEEP && HAVE_GETPID +#define OPT_HANG -4 +#endif + +static const struct argp_option argp_default_options[] = +{ + {"help", '?', 0, 0, N_("Give this help list"), -1}, + {"usage", OPT_USAGE, 0, 0, N_("Give a short usage message"), 0 }, + {"program-name",OPT_PROGNAME,"NAME", OPTION_HIDDEN, + N_("Set the program name"), 0}, +#if OPT_HANG + {"HANG", OPT_HANG, "SECS", OPTION_ARG_OPTIONAL | OPTION_HIDDEN, + N_("Hang for SECS seconds (default 3600)"), 0 }, +#endif + {0, 0, 0, 0, 0, 0} +}; + +static error_t +argp_default_parser (int key, char *arg, struct argp_state *state) +{ + switch (key) + { + case '?': + __argp_state_help (state, state->out_stream, ARGP_HELP_STD_HELP); + break; + case OPT_USAGE: + __argp_state_help (state, state->out_stream, + ARGP_HELP_USAGE | ARGP_HELP_EXIT_OK); + break; + + case OPT_PROGNAME: /* Set the program name. 
*/ +#if HAVE_DECL_PROGRAM_INVOCATION_NAME + program_invocation_name = arg; +#endif + /* [Note that some systems only have PROGRAM_INVOCATION_SHORT_NAME (aka + __PROGNAME), in which case, PROGRAM_INVOCATION_NAME is just defined + to be that, so we have to be a bit careful here.] */ + + /* Update what we use for messages. */ + + state->name = __argp_basename(arg); + +#if HAVE_DECL_PROGRAM_INVOCATION_SHORT_NAME + program_invocation_short_name = state->name; +#endif + + if ((state->flags & (ARGP_PARSE_ARGV0 | ARGP_NO_ERRS)) + == ARGP_PARSE_ARGV0) + /* Update what getopt uses too. */ + state->argv[0] = arg; + + break; + +#if OPT_HANG + case OPT_HANG: + _argp_hang = atoi (arg ? arg : "3600"); + fprintf(state->err_stream, "%s: pid = %ld\n", + state->name, (long) getpid()); + while (_argp_hang-- > 0) + __sleep (1); + break; +#endif + + default: + return EBADKEY; + } + return 0; +} + +static const struct argp argp_default_argp = + {argp_default_options, &argp_default_parser, NULL, NULL, NULL, NULL, "libc"}; + + +static const struct argp_option argp_version_options[] = +{ + {"version", 'V', 0, 0, N_("Print program version"), -1}, + {0, 0, 0, 0, 0, 0 } +}; + +static error_t +argp_version_parser (int key, char *arg UNUSED, struct argp_state *state) +{ + switch (key) + { + case 'V': + if (argp_program_version_hook) + (*argp_program_version_hook) (state->out_stream, state); + else if (argp_program_version) + fprintf (state->out_stream, "%s\n", argp_program_version); + else + __argp_error (state, dgettext (state->root_argp->argp_domain, + "(PROGRAM ERROR) No version known!?")); + if (! (state->flags & ARGP_NO_EXIT)) + exit (0); + break; + default: + return EBADKEY; + } + return 0; +} + +static const struct argp argp_version_argp = + {argp_version_options, &argp_version_parser, NULL, NULL, NULL, NULL, "libc"}; + + + +/* The state of a `group' during parsing. 
Each group corresponds to a + particular argp structure from the tree of such descending from the top + level argp passed to argp_parse. */ +struct group +{ + /* This group's parsing function. */ + argp_parser_t parser; + + /* Which argp this group is from. */ + const struct argp *argp; + + /* The number of non-option args successfully handled by this parser. */ + unsigned args_processed; + + /* This group's parser's parent's group. */ + struct group *parent; + unsigned parent_index; /* And our position in the parent. */ + + /* These fields are swapped into and out of the state structure when + calling this group's parser. */ + void *input, **child_inputs; + void *hook; +}; + +/* Call GROUP's parser with KEY and ARG, copying the group-specific info + into STATE before calling, and saving the hook back into GROUP afterwards. + If GROUP has no parser, EBADKEY is returned. */ +static error_t +group_parse (struct group *group, struct argp_state *state, int key, char *arg) +{ + if (group->parser) + { + error_t err; + state->hook = group->hook; + state->input = group->input; + state->child_inputs = group->child_inputs; + state->arg_num = group->args_processed; + err = (*group->parser)(key, arg, state); + group->hook = state->hook; + return err; + } + else + return EBADKEY; +} + +struct parser +{ + const struct argp *argp; + + const char *posixly_correct; + + /* True if there are only non-option arguments left, which are just + passed verbatim with ARGP_KEY_ARG. This is set if we encounter a + quote, or the end of the proper options, but may be cleared again + if the user moves the next argument pointer backwards. */ + int args_only; + + /* Describe how to deal with options that follow non-option ARGV-elements. + + If the caller did not specify anything, the default is + REQUIRE_ORDER if the environment variable POSIXLY_CORRECT is + defined, PERMUTE otherwise. + + REQUIRE_ORDER means don't recognize them as options; stop option + processing when the first non-option is seen. 
This is what Unix + does. This mode of operation is selected by either setting the + environment variable POSIXLY_CORRECT, or using `+' as the first + character of the list of option characters. + + PERMUTE is the default. We permute the contents of ARGV as we + scan, so that eventually all the non-options are at the end. This + allows options to be given in any order, even with programs that + were not written to expect this. + + RETURN_IN_ORDER is an option available to programs that were + written to expect options and other ARGV-elements in any order + and that care about the ordering of the two. We describe each + non-option ARGV-element as if it were the argument of an option + with character code 1. Using `-' as the first character of the + list of option characters selects this mode of operation. + + */ + enum { REQUIRE_ORDER, PERMUTE, RETURN_IN_ORDER } ordering; + + /* A segment of non-option arguments that have been skipped for + later processing, after all options. `first_nonopt' is the index + in ARGV of the first of them; `last_nonopt' is the index after + the last of them. + + If quoted or args_only is non-zero, this segment should be empty. */ + + /* FIXME: I'd prefer to use unsigned, but it's more consistent to + use the same type as for state.next. */ + int first_nonopt; + int last_nonopt; + + /* String of all recognized short options. Needed for ARGP_LONG_ONLY. */ + /* FIXME: Perhaps change to a pointer to a suitable bitmap instead? */ + char *short_opts; + + /* For parsing combined short options. */ + char *nextchar; + + /* States of the various parsing groups. */ + struct group *groups; + /* The end of the GROUPS array. */ + struct group *egroup; + /* A vector containing storage for the CHILD_INPUTS field in all groups. */ + void **child_inputs; + + /* State block supplied to parsing routines. */ + struct argp_state state; + + /* Memory used by this parser. */ + void *storage; +}; + +/* Search for a group defining a short option. 
*/ +/* On success the owning group is stored in *P and the option is returned; + NULL is returned (and *P left untouched) if no group defines KEY. */ +static const struct argp_option * +find_short_option(struct parser *parser, int key, struct group **p) +{ + struct group *group; + + /* KEY comes straight from the command line (possibly as a negative + `char'), so a byte that cannot be a short option is reported as + "not found" rather than aborting through assert. */ + if (key < 0 || !isascii(key)) + return NULL; + + for (group = parser->groups; group < parser->egroup; group++) + { + const struct argp_option *opts = group->argp->options; + + /* A group may exist only because its argp has a parser function; + such a group has no option table to scan. */ + if (!opts) + continue; + + for ( ; !__option_is_end(opts); opts++) + if (opts->key == key) + { + *p = group; + return opts; + } + } + return NULL; +} + +enum match_result { MATCH_EXACT, MATCH_PARTIAL, MATCH_NO }; + +/* If defined, allow complete.el-like abbreviations of long options. */ +#ifndef ARGP_COMPLETE +#define ARGP_COMPLETE 0 +#endif + +/* Matches an encountered long-option argument ARG against an option NAME. + * ARG is terminated by NUL or '='. Returns MATCH_EXACT if ARG and NAME + * end together, MATCH_PARTIAL if ARG ends while NAME still has characters + * left, and MATCH_NO otherwise. */ +static enum match_result +match_option(const char *arg, const char *name) +{ + unsigned i, j; + for (i = j = 0;; i++, j++) + { + switch(arg[i]) + { + case '\0': + case '=': + return name[j] ? MATCH_PARTIAL : MATCH_EXACT; +#if ARGP_COMPLETE + case '-': + while (name[j] != '-') + if (!name[j++]) + return MATCH_NO; + break; +#endif + default: + if (arg[i] != name[j]) + return MATCH_NO; + } + } +} + +/* Search for the group defining the long option ARG. An exact name match + is returned immediately; failing that, a partial match is accepted only + if it is the sole one. The group is stored in *P; NULL is returned when + nothing matches or the abbreviation is ambiguous. */ +static const struct argp_option * +find_long_option(struct parser *parser, + const char *arg, + struct group **p) +{ + struct group *group; + + /* Partial match found so far. */ + struct group *matched_group = NULL; + const struct argp_option *matched_option = NULL; + + /* Number of partial matches. */ + int num_partial = 0; + + for (group = parser->groups; group < parser->egroup; group++) + { + const struct argp_option *opts = group->argp->options; + + /* Parser-only groups have no option table. */ + if (!opts) + continue; + + for ( ; !__option_is_end(opts); opts++) + { + if (!opts->name) + continue; + switch (match_option(arg, opts->name)) + { + case MATCH_NO: + break; + case MATCH_PARTIAL: + num_partial++; + + matched_group = group; + matched_option = opts; + + break; + case MATCH_EXACT: + /* Exact match.
*/ + *p = group; + return opts; + } + } + } + if (num_partial == 1) + { + *p = matched_group; + return matched_option; + } + + return NULL; +} + + +/* The next usable entries in the various parser tables being filled in by + convert_options. */ +struct parser_convert_state +{ + struct parser *parser; + char *short_end; + void **child_inputs_end; +}; + +/* Initialize GROUP from ARGP. If CVT->SHORT_END is non-NULL, short + options are recorded in the short options string. Returns the next + unused group entry. CVT holds state used during the conversion. */ +static struct group * +convert_options (const struct argp *argp, + struct group *parent, unsigned parent_index, + struct group *group, struct parser_convert_state *cvt) +{ + const struct argp_option *opt = argp->options; + const struct argp_child *children = argp->children; + + if (opt || argp->parser) + { + /* This parser needs a group. */ + if (cvt->short_end && opt) + { + /* Record any short options. OPT is NULL when the group exists + only because ARGP has a parser, so it must be checked before + the table is walked. */ + for ( ; !__option_is_end (opt); opt++) + if (__option_is_short(opt)) + *cvt->short_end++ = opt->key; + } + + group->parser = argp->parser; + group->argp = argp; + group->args_processed = 0; + group->parent = parent; + group->parent_index = parent_index; + group->input = 0; + group->hook = 0; + group->child_inputs = 0; + + if (children) + /* Assign GROUP's CHILD_INPUTS field some space from + CVT->child_inputs_end.*/ + { + unsigned num_children = 0; + while (children[num_children].argp) + num_children++; + group->child_inputs = cvt->child_inputs_end; + cvt->child_inputs_end += num_children; + } + parent = group++; + } + else + parent = 0; + + if (children) + { + unsigned index = 0; + while (children->argp) + group = + convert_options (children++->argp, parent, index++, group, cvt); + } + + return group; +} +/* Allocate and initialize the group structures, so that they are + ordered as if by traversing the corresponding argp parser tree in + pre-order. Also build the list of short options, if that is needed.
*/ +static void +parser_convert (struct parser *parser, const struct argp *argp) +{ + struct parser_convert_state cvt; + + cvt.parser = parser; + cvt.short_end = parser->short_opts; + cvt.child_inputs_end = parser->child_inputs; + + parser->argp = argp; + + if (argp) + parser->egroup = convert_options (argp, 0, 0, parser->groups, &cvt); + else + parser->egroup = parser->groups; /* No parsers at all! */ + + /* Terminate the short option string, if one is being built. */ + if (parser->short_opts) + *cvt.short_end ='\0'; +} + +/* Lengths of various parser fields which we will allocate. */ +struct parser_sizes +{ + /* Needed only for ARGP_LONG_ONLY */ + size_t short_len; /* Number of short options. */ + + size_t num_groups; /* Group structures we allocate. */ + size_t num_child_inputs; /* Child input slots. */ +}; + +/* For ARGP, increments the NUM_GROUPS field in SZS by the total + number of argp structures descended from it, the SHORT_LEN by + the total number of short options, and NUM_CHILD_INPUTS by the + total number of children. */ +static void +calc_sizes (const struct argp *argp, struct parser_sizes *szs) +{ + const struct argp_child *child = argp->children; + const struct argp_option *opt = argp->options; + + if (opt || argp->parser) + { + /* This parser needs a group. */ + szs->num_groups++; + if (opt) + { + /* Walk the whole table. convert_options records every short + option up to the end marker, so stopping at the first + non-short entry would under-count and let the short + option string overrun its storage. */ + for ( ; !__option_is_end (opt); opt++) + if (__option_is_short (opt)) + szs->short_len++; + } + } + + if (child) + while (child->argp) + { + calc_sizes ((child++)->argp, szs); + szs->num_child_inputs++; + } +} + +/* Initializes PARSER to parse ARGP in a manner described by FLAGS.
*/ +static error_t +parser_init (struct parser *parser, const struct argp *argp, + int argc, char **argv, int flags, void *input) +{ + error_t err = 0; + struct group *group; + struct parser_sizes szs; + + parser->posixly_correct = getenv ("POSIXLY_CORRECT"); + + if (flags & ARGP_IN_ORDER) + parser->ordering = RETURN_IN_ORDER; + else if (flags & ARGP_NO_ARGS) + parser->ordering = REQUIRE_ORDER; + else if (parser->posixly_correct) + parser->ordering = REQUIRE_ORDER; + else + parser->ordering = PERMUTE; + + szs.short_len = 0; + szs.num_groups = 0; + szs.num_child_inputs = 0; + + if (argp) + calc_sizes (argp, &szs); + + if (!(flags & ARGP_LONG_ONLY)) + /* We have no use for the short option array. */ + szs.short_len = 0; + + /* Lengths of the various bits of storage used by PARSER. */ +#define GLEN (szs.num_groups + 1) * sizeof (struct group) +#define CLEN (szs.num_child_inputs * sizeof (void *)) +#define SLEN (szs.short_len + 1) +#define STORAGE(offset) ((void *) (((char *) parser->storage) + (offset))) + + parser->storage = malloc (GLEN + CLEN + SLEN); + if (! parser->storage) + return ENOMEM; + + parser->groups = parser->storage; + + parser->child_inputs = STORAGE(GLEN); + memset (parser->child_inputs, 0, szs.num_child_inputs * sizeof (void *)); + + if (flags & ARGP_LONG_ONLY) + parser->short_opts = STORAGE(GLEN + CLEN); + else + parser->short_opts = NULL; + + parser_convert (parser, argp); + + memset (&parser->state, 0, sizeof (struct argp_state)); + + parser->state.root_argp = parser->argp; + parser->state.argc = argc; + parser->state.argv = argv; + parser->state.flags = flags; + parser->state.err_stream = stderr; + parser->state.out_stream = stdout; + parser->state.pstate = parser; + + parser->args_only = 0; + parser->nextchar = NULL; + parser->first_nonopt = parser->last_nonopt = 0; + + /* Call each parser for the first time, giving it a chance to propagate + values to child parsers. 
*/ + if (parser->groups < parser->egroup) + parser->groups->input = input; + for (group = parser->groups; + group < parser->egroup && (!err || err == EBADKEY); + group++) + { + if (group->parent) + /* If a child parser, get the initial input value from the parent. */ + group->input = group->parent->child_inputs[group->parent_index]; + + if (!group->parser + && group->argp->children && group->argp->children->argp) + /* For the special case where no parsing function is supplied for an + argp, propagate its input to its first child, if any (this just + makes very simple wrapper argps more convenient). */ + group->child_inputs[0] = group->input; + + err = group_parse (group, &parser->state, ARGP_KEY_INIT, 0); + } + if (err == EBADKEY) + err = 0; /* Some parser didn't understand. */ + + if (err) + return err; + + if (argv[0] && !(parser->state.flags & ARGP_PARSE_ARGV0)) + /* There's an argv[0]; use it for messages. */ + { + parser->state.name = __argp_basename(argv[0]); + + /* Don't parse it as an argument. */ + parser->state.next = 1; + } + else + parser->state.name = __argp_short_program_name(NULL); + + return 0; +} + +/* Free any storage consumed by PARSER (but not PARSER itself). */ +static error_t +parser_finalize (struct parser *parser, + error_t err, int arg_ebadkey, int *end_index) +{ + struct group *group; + + if (err == EBADKEY && arg_ebadkey) + /* Suppress errors generated by unparsed arguments. */ + err = 0; + + if (! err) + { + if (parser->state.next == parser->state.argc) + /* We successfully parsed all arguments! Call all the parsers again, + just a few more times... 
*/ + { + for (group = parser->groups; + group < parser->egroup && (!err || err==EBADKEY); + group++) + if (group->args_processed == 0) + err = group_parse (group, &parser->state, ARGP_KEY_NO_ARGS, 0); + for (group = parser->egroup - 1; + group >= parser->groups && (!err || err==EBADKEY); + group--) + err = group_parse (group, &parser->state, ARGP_KEY_END, 0); + + if (err == EBADKEY) + err = 0; /* Some parser didn't understand. */ + + /* Tell the user that all arguments are parsed. */ + if (end_index) + *end_index = parser->state.next; + } + else if (end_index) + /* Return any remaining arguments to the user. */ + *end_index = parser->state.next; + else + /* No way to return the remaining arguments, they must be bogus. */ + { + if (!(parser->state.flags & ARGP_NO_ERRS) + && parser->state.err_stream) + fprintf (parser->state.err_stream, + dgettext (parser->argp->argp_domain, + "%s: Too many arguments\n"), + parser->state.name); + err = EBADKEY; + } + } + + /* Okay, we're all done, with either an error or success; call the parsers + to indicate which one. */ + + if (err) + { + /* Maybe print an error message. */ + if (err == EBADKEY) + /* An appropriate message describing what the error was should have + been printed earlier. */ + __argp_state_help (&parser->state, parser->state.err_stream, + ARGP_HELP_STD_ERR); + + /* Since we didn't exit, give each parser an error indication. */ + for (group = parser->groups; group < parser->egroup; group++) + group_parse (group, &parser->state, ARGP_KEY_ERROR, 0); + } + else + /* Notify parsers of success, and propagate back values from parsers. */ + { + /* We pass over the groups in reverse order so that child groups are + given a chance to do there processing before passing back a value to + the parent. 
*/ + for (group = parser->egroup - 1 + ; group >= parser->groups && (!err || err == EBADKEY) + ; group--) + err = group_parse (group, &parser->state, ARGP_KEY_SUCCESS, 0); + if (err == EBADKEY) + err = 0; /* Some parser didn't understand. */ + } + + /* Call parsers once more, to do any final cleanup. Errors are ignored. */ + for (group = parser->egroup - 1; group >= parser->groups; group--) + group_parse (group, &parser->state, ARGP_KEY_FINI, 0); + + if (err == EBADKEY) + err = EINVAL; + + free (parser->storage); + + return err; +} + +/* Call the user parsers to parse the non-option argument VAL, at the + current position, returning any error. The state NEXT pointer + should point to the argument; this function will adjust it + correctly to reflect however many args actually end up being + consumed. */ +static error_t +parser_parse_arg (struct parser *parser, char *val) +{ + /* Save the starting value of NEXT */ + int index = parser->state.next; + error_t err = EBADKEY; + struct group *group; + int key = 0; /* Which of ARGP_KEY_ARG[S] we used. */ + + /* Try to parse the argument in each parser. */ + for (group = parser->groups + ; group < parser->egroup && err == EBADKEY + ; group++) + { + parser->state.next++; /* For ARGP_KEY_ARG, consume the arg. */ + key = ARGP_KEY_ARG; + err = group_parse (group, &parser->state, key, val); + + if (err == EBADKEY) + /* This parser doesn't like ARGP_KEY_ARG; try ARGP_KEY_ARGS instead. */ + { + parser->state.next--; /* For ARGP_KEY_ARGS, put back the arg. */ + key = ARGP_KEY_ARGS; + err = group_parse (group, &parser->state, key, 0); + } + } + + if (! err) + { + if (key == ARGP_KEY_ARGS) + /* The default for ARGP_KEY_ARGS is to assume that if NEXT isn't + changed by the user, *all* arguments should be considered + consumed. 
*/ + parser->state.next = parser->state.argc; + + if (parser->state.next > index) + /* Remember that we successfully processed a non-option + argument -- but only if the user hasn't gotten tricky and set + the clock back. */ + (--group)->args_processed += (parser->state.next - index); + else + /* The user wants to reparse some args, so try looking for options again. */ + parser->args_only = 0; + } + + return err; +} + +/* Exchange two adjacent subsequences of ARGV. + One subsequence is elements [first_nonopt,last_nonopt) + which contains all the non-options that have been skipped so far. + The other is elements [last_nonopt,next), which contains all + the options processed since those non-options were skipped. + + `first_nonopt' and `last_nonopt' are relocated so that they describe + the new indices of the non-options in ARGV after they are moved. */ + +static void +exchange (struct parser *parser) +{ + int bottom = parser->first_nonopt; + int middle = parser->last_nonopt; + int top = parser->state.next; + char **argv = parser->state.argv; + + char *tem; + + /* Exchange the shorter segment with the far end of the longer segment. + That puts the shorter segment into the right place. + It leaves the longer segment in the right place overall, + but it consists of two parts that need to be swapped next. */ + + while (top > middle && middle > bottom) + { + if (top - middle > middle - bottom) + { + /* Bottom segment is the short one. */ + int len = middle - bottom; + register int i; + + /* Swap it with the top part of the top segment. */ + for (i = 0; i < len; i++) + { + tem = argv[bottom + i]; + argv[bottom + i] = argv[top - (middle - bottom) + i]; + argv[top - (middle - bottom) + i] = tem; + } + /* Exclude the moved bottom segment from further swapping. */ + top -= len; + } + else + { + /* Top segment is the short one. */ + int len = top - middle; + register int i; + + /* Swap it with the bottom part of the bottom segment. 
*/ + for (i = 0; i < len; i++) + { + tem = argv[bottom + i]; + argv[bottom + i] = argv[middle + i]; + argv[middle + i] = tem; + } + /* Exclude the moved top segment from further swapping. */ + bottom += len; + } + } + + /* Update records for the slots the non-options now occupy. */ + + parser->first_nonopt += (parser->state.next - parser->last_nonopt); + parser->last_nonopt = parser->state.next; +} + + + +enum arg_type { ARG_ARG, ARG_SHORT_OPTION, + ARG_LONG_OPTION, ARG_LONG_ONLY_OPTION, + ARG_QUOTE }; + +static enum arg_type +classify_arg(struct parser *parser, char *arg, char **opt) +{ + if (arg[0] == '-') + /* Looks like an option... */ + switch (arg[1]) + { + case '\0': + /* "-" is not an option. */ + return ARG_ARG; + case '-': + /* Long option, or quote. */ + if (!arg[2]) + return ARG_QUOTE; + + /* A long option. */ + if (opt) + *opt = arg + 2; + return ARG_LONG_OPTION; + + default: + /* Short option. But if ARGP_LONG_ONLY, it can also be a long option. */ + + if (opt) + *opt = arg + 1; + + if (parser->state.flags & ARGP_LONG_ONLY) + { + /* Rules from getopt.c: + + If long_only and the ARGV-element has the form "-f", + where f is a valid short option, don't consider it an + abbreviated form of a long option that starts with f. + Otherwise there would be no way to give the -f short + option. + + On the other hand, if there's a long option "fubar" and + the ARGV-element is "-fu", do consider that an + abbreviation of the long option, just like "--fu", and + not "-f" with arg "u". + + This distinction seems to be the most useful approach. */ + + assert(parser->short_opts); + + if (arg[2] || !strchr(parser->short_opts, arg[1])) + return ARG_LONG_ONLY_OPTION; + } + + return ARG_SHORT_OPTION; + } + + else + return ARG_ARG; +} + +/* Parse the next argument in PARSER (as indicated by PARSER->state.next). 
+ Any error from the parsers is returned, and *ARGP_EBADKEY indicates + whether a value of EBADKEY is due to an unrecognized argument (which is + generally not fatal). */ +static error_t +parser_parse_next (struct parser *parser, int *arg_ebadkey) +{ + if (parser->state.quoted && parser->state.next < parser->state.quoted) + /* The next argument pointer has been moved to before the quoted + region, so pretend we never saw the quoting `--', and start + looking for options again. If the `--' is still there we'll just + process it one more time. */ + parser->state.quoted = parser->args_only = 0; + + /* Give FIRST_NONOPT & LAST_NONOPT rational values if NEXT has been + moved back by the user (who may also have changed the arguments). */ + if (parser->last_nonopt > parser->state.next) + parser->last_nonopt = parser->state.next; + if (parser->first_nonopt > parser->state.next) + parser->first_nonopt = parser->state.next; + + if (parser->nextchar) + /* Deal with short options. */ + { + struct group *group; + char c; + const struct argp_option *option; + char *value = NULL;; + + assert(!parser->args_only); + + c = *parser->nextchar++; + + option = find_short_option(parser, c, &group); + if (!option) + { + if (parser->posixly_correct) + /* 1003.2 specifies the format of this message. */ + fprintf (parser->state.err_stream, + dgettext(parser->state.root_argp->argp_domain, + "%s: illegal option -- %c\n"), + parser->state.name, c); + else + fprintf (parser->state.err_stream, + dgettext(parser->state.root_argp->argp_domain, + "%s: invalid option -- %c\n"), + parser->state.name, c); + + *arg_ebadkey = 0; + return EBADKEY; + } + + if (!*parser->nextchar) + parser->nextchar = NULL; + + if (option->arg) + { + value = parser->nextchar; + parser->nextchar = NULL; + + if (!value + && !(option->flags & OPTION_ARG_OPTIONAL)) + /* We need an mandatory argument. 
*/ + { + if (parser->state.next == parser->state.argc) + /* Missing argument */ + { + /* 1003.2 specifies the format of this message. */ + fprintf (parser->state.err_stream, + dgettext(parser->state.root_argp->argp_domain, + "%s: option requires an argument -- %c\n"), + parser->state.name, c); + + *arg_ebadkey = 0; + return EBADKEY; + } + value = parser->state.argv[parser->state.next++]; + } + } + return group_parse(group, &parser->state, + option->key, value); + } + else + /* Advance to the next ARGV-element. */ + { + if (parser->args_only) + { + *arg_ebadkey = 1; + if (parser->state.next >= parser->state.argc) + /* We're done. */ + return EBADKEY; + else + return parser_parse_arg(parser, + parser->state.argv[parser->state.next]); + } + + if (parser->state.next >= parser->state.argc) + /* Almost done. If there are non-options that we skipped + previously, we should process them now. */ + { + *arg_ebadkey = 1; + if (parser->first_nonopt != parser->last_nonopt) + { + exchange(parser); + + /* Start processing the arguments we skipped previously. */ + parser->state.next = parser->first_nonopt; + + parser->first_nonopt = parser->last_nonopt = 0; + + parser->args_only = 1; + return 0; + } + else + /* Indicate that we're really done. */ + return EBADKEY; + } + else + /* Look for options. */ + { + char *arg = parser->state.argv[parser->state.next]; + + char *optstart; + enum arg_type token = classify_arg(parser, arg, &optstart); + + switch (token) + { + case ARG_ARG: + switch (parser->ordering) + { + case PERMUTE: + if (parser->first_nonopt == parser->last_nonopt) + /* Skipped sequence is empty; start a new one. */ + parser->first_nonopt = parser->last_nonopt = parser->state.next; + + else if (parser->last_nonopt != parser->state.next) + /* We have a non-empty skipped sequence, and + we're not at the end-point, so move it. */ + exchange(parser); + + assert(parser->last_nonopt == parser->state.next); + + /* Skip this argument for now. 
*/ + parser->state.next++; + parser->last_nonopt = parser->state.next; + + return 0; + + case REQUIRE_ORDER: + /* Implicit quote before the first argument. */ + parser->args_only = 1; + return 0; + + case RETURN_IN_ORDER: + *arg_ebadkey = 1; + return parser_parse_arg(parser, arg); + + default: + abort(); + } + case ARG_QUOTE: + /* Skip it, then exchange with any previous non-options. */ + parser->state.next++; + assert (parser->last_nonopt != parser->state.next); + + if (parser->first_nonopt != parser->last_nonopt) + { + exchange(parser); + + /* Start processing the skipped and the quoted + arguments. */ + + parser->state.quoted = parser->state.next = parser->first_nonopt; + + /* Also empty the skipped-list, to avoid confusion + if the user resets the next pointer. */ + parser->first_nonopt = parser->last_nonopt = 0; + } + else + parser->state.quoted = parser->state.next; + + parser->args_only = 1; + return 0; + + case ARG_LONG_ONLY_OPTION: + case ARG_LONG_OPTION: + { + struct group *group; + const struct argp_option *option; + char *value; + + parser->state.next++; + option = find_long_option(parser, optstart, &group); + + if (!option) + { + /* NOTE: This includes any "=something" in the output. */ + fprintf (parser->state.err_stream, + dgettext(parser->state.root_argp->argp_domain, + "%s: unrecognized option `%s'\n"), + parser->state.name, arg); + *arg_ebadkey = 0; + return EBADKEY; + } + + value = strchr(optstart, '='); + if (value) + value++; + + if (value && !option->arg) + /* Unexpected argument. 
*/ + { + if (token == ARG_LONG_OPTION) + /* --option */ + fprintf (parser->state.err_stream, + dgettext(parser->state.root_argp->argp_domain, + "%s: option `--%s' doesn't allow an argument\n"), + parser->state.name, option->name); + else + /* +option or -option */ + fprintf (parser->state.err_stream, + dgettext(parser->state.root_argp->argp_domain, + "%s: option `%c%s' doesn't allow an argument\n"), + parser->state.name, arg[0], option->name); + + *arg_ebadkey = 0; + return EBADKEY; + } + + if (option->arg && !value + && !(option->flags & OPTION_ARG_OPTIONAL)) + /* We need an mandatory argument. */ + { + if (parser->state.next == parser->state.argc) + /* Missing argument */ + { + if (token == ARG_LONG_OPTION) + /* --option */ + fprintf (parser->state.err_stream, + dgettext(parser->state.root_argp->argp_domain, + "%s: option `--%s' requires an argument\n"), + parser->state.name, option->name); + else + /* +option or -option */ + fprintf (parser->state.err_stream, + dgettext(parser->state.root_argp->argp_domain, + "%s: option `%c%s' requires an argument\n"), + parser->state.name, arg[0], option->name); + + *arg_ebadkey = 0; + return EBADKEY; + } + + value = parser->state.argv[parser->state.next++]; + } + *arg_ebadkey = 0; + return group_parse(group, &parser->state, + option->key, value); + } + case ARG_SHORT_OPTION: + parser->state.next++; + parser->nextchar = optstart; + return 0; + + default: + abort(); + } + } + } +} + +/* Parse the options strings in ARGC & ARGV according to the argp in ARGP. + FLAGS is one of the ARGP_ flags above. If END_INDEX is non-NULL, the + index in ARGV of the first unparsed option is returned in it. If an + unknown option is present, EINVAL is returned; if some parser routine + returned a non-zero value, it is returned; otherwise 0 is returned. 
*/ +error_t +__argp_parse (const struct argp *argp, int argc, char **argv, unsigned flags, + int *end_index, void *input) +{ + error_t err; + struct parser parser; + + /* If true, then err == EBADKEY is a result of a non-option argument failing + to be parsed (which in some cases isn't actually an error). */ + int arg_ebadkey = 0; + + if (! (flags & ARGP_NO_HELP)) + /* Add our own options. */ + { + /* Four child slots: the user's argp, the default argp, the version + argp and the terminating entry. */ + struct argp_child *child = alloca (4 * sizeof (struct argp_child)); + struct argp *top_argp = alloca (sizeof (struct argp)); + + /* TOP_ARGP has no options, it just serves to group the user & default + argps. */ + memset (top_argp, 0, sizeof (*top_argp)); + top_argp->children = child; + + memset (child, 0, 4 * sizeof (struct argp_child)); + + if (argp) + (child++)->argp = argp; + (child++)->argp = &argp_default_argp; + if (argp_program_version || argp_program_version_hook) + (child++)->argp = &argp_version_argp; + child->argp = 0; + + argp = top_argp; + } + + /* Construct a parser for these arguments. */ + err = parser_init (&parser, argp, argc, argv, flags, input); + + if (! err) + /* Parse! The loop ends on the first non-zero result, which is either + a real error or the EBADKEY that parser_parse_next returns once + nothing is left to parse; parser_finalize tells the two apart by + means of ARG_EBADKEY. */ + { + while (! err) + err = parser_parse_next (&parser, &arg_ebadkey); + err = parser_finalize (&parser, err, arg_ebadkey, end_index); + } + + return err; +} +#ifdef weak_alias +weak_alias (__argp_parse, argp_parse) +#endif + +/* Return the input field for ARGP in the parser corresponding to STATE; used + by the help routines.
*/ +void * +__argp_input (const struct argp *argp, const struct argp_state *state) +{ + if (state) + { + struct group *group; + struct parser *parser = state->pstate; + + /* The first group built from ARGP carries its input. */ + for (group = parser->groups; group < parser->egroup; group++) + if (group->argp == argp) + return group->input; + } + + /* No state, or ARGP has no group in this parser. */ + return 0; +} +#ifdef weak_alias +weak_alias (__argp_input, _argp_input) +#endif + +/* Defined here, in case a user is not inlining the definitions in + * argp.h */ +void +__argp_usage (__const struct argp_state *__state) +{ + __argp_state_help (__state, stderr, ARGP_HELP_STD_USAGE); +} + +/* Return non-zero if __OPT's key also serves as a short option: the entry + is not an OPTION_DOC one and the key is a positive, printable + character. */ +int +__option_is_short (__const struct argp_option *__opt) +{ + if (__opt->flags & OPTION_DOC) + return 0; + else + { + int __key = __opt->key; + /* Keys that do not fit in an `unsigned char' (commonly used for + long-only options) must never reach isprint, whose behaviour is + undefined for such values; the cast comparison rejects them + first. */ + /* FIXME: whether or not a particular key implies a short option + * ought not to be locale dependent. */ + return __key > 0 && __key == (unsigned char) __key && isprint (__key); + } +} + +/* Return non-zero for the all-zero entry that terminates an options + array. */ +int +__option_is_end (__const struct argp_option *__opt) +{ + return !__opt->key && !__opt->name && !__opt->doc && !__opt->group; +} diff --git a/argp-standalone/argp-pv.c b/argp-standalone/argp-pv.c new file mode 100644 index 000000000..d7d374a66 --- /dev/null +++ b/argp-standalone/argp-pv.c @@ -0,0 +1,25 @@ +/* Default definition for ARGP_PROGRAM_VERSION. + Copyright (C) 1996, 1997, 1999, 2004 Free Software Foundation, Inc. + This file is part of the GNU C Library. + Written by Miles Bader . + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Library General Public License as + published by the Free Software Foundation; either version 2 of the + License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Library General Public License for more details.
+ + You should have received a copy of the GNU Library General Public + License along with the GNU C Library; see the file COPYING.LIB. If not, + write to the Free Software Foundation, Inc., 59 Temple Place - Suite 330, + Boston, MA 02111-1307, USA. */ + +/* If set by the user program to a non-zero value, then a default option + --version is added (unless the ARGP_NO_HELP flag is used), which will + print this this string followed by a newline and exit (unless the + ARGP_NO_EXIT flag is used). Overridden by ARGP_PROGRAM_VERSION_HOOK. */ +const char *argp_program_version = 0; diff --git a/argp-standalone/argp-pvh.c b/argp-standalone/argp-pvh.c new file mode 100644 index 000000000..829a1cda8 --- /dev/null +++ b/argp-standalone/argp-pvh.c @@ -0,0 +1,32 @@ +/* Default definition for ARGP_PROGRAM_VERSION_HOOK. + Copyright (C) 1996, 1997, 1999, 2004 Free Software Foundation, Inc. + This file is part of the GNU C Library. + Written by Miles Bader . + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Library General Public License as + published by the Free Software Foundation; either version 2 of the + License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Library General Public License for more details. + + You should have received a copy of the GNU Library General Public + License along with the GNU C Library; see the file COPYING.LIB. If not, + write to the Free Software Foundation, Inc., 59 Temple Place - Suite 330, + Boston, MA 02111-1307, USA. 
*/ + +#ifdef HAVE_CONFIG_H +#include +#endif + +#include "argp.h" + +/* If set by the user program to a non-zero value, then a default option + --version is added (unless the ARGP_NO_HELP flag is used), which calls + this function with a stream to print the version to and a pointer to the + current parsing state, and then exits (unless the ARGP_NO_EXIT flag is + used). This variable takes precedent over ARGP_PROGRAM_VERSION. */ +void (*argp_program_version_hook) (FILE *stream, struct argp_state *state) = 0; diff --git a/argp-standalone/argp.h b/argp-standalone/argp.h new file mode 100644 index 000000000..29d3dfe97 --- /dev/null +++ b/argp-standalone/argp.h @@ -0,0 +1,602 @@ +/* Hierarchial argument parsing. + Copyright (C) 1995, 96, 97, 98, 99, 2003 Free Software Foundation, Inc. + This file is part of the GNU C Library. + Written by Miles Bader . + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Library General Public License as + published by the Free Software Foundation; either version 2 of the + License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Library General Public License for more details. + + You should have received a copy of the GNU Library General Public + License along with the GNU C Library; see the file COPYING.LIB. If not, + write to the Free Software Foundation, Inc., 59 Temple Place - Suite 330, + Boston, MA 02111-1307, USA. */ + +#ifndef _ARGP_H +#define _ARGP_H + +#include +#include + +#define __need_error_t +#include + +#ifndef __THROW +# define __THROW +#endif + +#ifndef __const +# define __const const +#endif + +#ifndef __error_t_defined +typedef int error_t; +# define __error_t_defined +#endif + +/* FIXME: What's the right way to check for __restrict? 
Sun's cc seems + not to have it. Perhaps it's easiest to just delete the use of + __restrict from the prototypes. */ +#ifndef __restrict +# ifndef __GNUC___ +# define __restrict +# endif +#endif + +/* NOTE: We can't use the autoconf tests, since this is supposed to be + an installed header file and argp's config.h is of course not + installed. */ +#ifndef PRINTF_STYLE +# if __GNUC__ >= 2 +# define PRINTF_STYLE(f, a) __attribute__ ((__format__ (__printf__, f, a))) +# else +# define PRINTF_STYLE(f, a) +# endif +#endif + + +#ifdef __cplusplus +extern "C" { +#endif + +/* A description of a particular option. A pointer to an array of + these is passed in the OPTIONS field of an argp structure. Each option + entry can correspond to one long option and/or one short option; more + names for the same option can be added by following an entry in an option + array with options having the OPTION_ALIAS flag set. */ +struct argp_option +{ + /* The long option name. For more than one name for the same option, you + can use following options with the OPTION_ALIAS flag set. */ + __const char *name; + + /* What key is returned for this option. If > 0 and printable, then it's + also accepted as a short option. */ + int key; + + /* If non-NULL, this is the name of the argument associated with this + option, which is required unless the OPTION_ARG_OPTIONAL flag is set. */ + __const char *arg; + + /* OPTION_ flags. */ + int flags; + + /* The doc string for this option. If both NAME and KEY are 0, This string + will be printed outdented from the normal option column, making it + useful as a group header (it will be the first thing printed in its + group); in this usage, it's conventional to end the string with a `:'. */ + __const char *doc; + + /* The group this option is in. In a long help message, options are sorted + alphabetically within each group, and the groups presented in the order + 0, 1, 2, ..., n, -m, ..., -2, -1. 
Every entry in an options array with + this field 0 will inherit the group number of the previous entry, or + zero if it's the first one, unless it's a group header (NAME and KEY both + 0), in which case, the previous entry + 1 is the default. Automagic + options such as --help are put into group -1. */ + int group; +}; + +/* The argument associated with this option is optional. */ +#define OPTION_ARG_OPTIONAL 0x1 + +/* This option isn't displayed in any help messages. */ +#define OPTION_HIDDEN 0x2 + +/* This option is an alias for the closest previous non-alias option. This + means that it will be displayed in the same help entry, and will inherit + fields other than NAME and KEY from the aliased option. */ +#define OPTION_ALIAS 0x4 + +/* This option isn't actually an option (and so should be ignored by the + actual option parser), but rather an arbitrary piece of documentation that + should be displayed in much the same manner as the options. If this flag + is set, then the option NAME field is displayed unmodified (e.g., no `--' + prefix is added) at the left-margin (where a *short* option would normally + be displayed), and the documentation string in the normal place. For + purposes of sorting, any leading whitespace and punctuation is ignored, + except that if the first non-whitespace character is not `-', this entry + is displayed after all options (and OPTION_DOC entries with a leading `-') + in the same group. */ +#define OPTION_DOC 0x8 + +/* This option shouldn't be included in `long' usage messages (but is still + included in help messages). This is mainly intended for options that are + completely documented in an argp's ARGS_DOC field, in which case including + the option in the generic usage list would be redundant. For instance, + if ARGS_DOC is "FOO BAR\n-x BLAH", and the `-x' option's purpose is to + distinguish these two cases, -x should probably be marked + OPTION_NO_USAGE. 
*/ +#define OPTION_NO_USAGE 0x10 + +struct argp; /* fwd declare this type */ +struct argp_state; /* " */ +struct argp_child; /* " */ + +/* The type of a pointer to an argp parsing function. */ +typedef error_t (*argp_parser_t) (int key, char *arg, + struct argp_state *state); + +/* What to return for unrecognized keys. For special ARGP_KEY_ keys, such + returns will simply be ignored. For user keys, this error will be turned + into EINVAL (if the call to argp_parse is such that errors are propagated + back to the user instead of exiting); returning EINVAL itself would result + in an immediate stop to parsing in *all* cases. */ +#define ARGP_ERR_UNKNOWN E2BIG /* Hurd should never need E2BIG. XXX */ + +/* Special values for the KEY argument to an argument parsing function. + ARGP_ERR_UNKNOWN should be returned if they aren't understood. + + The sequence of keys to a parsing function is either (where each + uppercased word should be prefixed by `ARGP_KEY_' and opt is a user key): + + INIT opt... NO_ARGS END SUCCESS -- No non-option arguments at all + or INIT (opt | ARG)... END SUCCESS -- All non-option args parsed + or INIT (opt | ARG)... SUCCESS -- Some non-option arg unrecognized + + The third case is where every parser returned ARGP_ERR_UNKNOWN for an + argument, in which case parsing stops at that argument (returning the + unparsed arguments to the caller of argp_parse if requested, or stopping + with an error message if not). + + If an error occurs (either detected by argp, or because the parsing + function returned an error value), then the parser is called with + ARGP_KEY_ERROR, and no further calls are made. */ + +/* This is not an option at all, but rather a command line argument. If a + parser receiving this key returns success, the fact is recorded, and the + ARGP_KEY_NO_ARGS case won't be used. 
HOWEVER, if while processing the + argument, a parser function decrements the NEXT field of the state it's + passed, the option won't be considered processed; this is to allow you to + actually modify the argument (perhaps into an option), and have it + processed again. */ +#define ARGP_KEY_ARG 0 +/* There are remaining arguments not parsed by any parser, which may be found + starting at (STATE->argv + STATE->next). If success is returned, but + STATE->next left untouched, it's assumed that all arguments were consume, + otherwise, the parser should adjust STATE->next to reflect any arguments + consumed. */ +#define ARGP_KEY_ARGS 0x1000006 +/* There are no more command line arguments at all. */ +#define ARGP_KEY_END 0x1000001 +/* Because it's common to want to do some special processing if there aren't + any non-option args, user parsers are called with this key if they didn't + successfully process any non-option arguments. Called just before + ARGP_KEY_END (where more general validity checks on previously parsed + arguments can take place). */ +#define ARGP_KEY_NO_ARGS 0x1000002 +/* Passed in before any parsing is done. Afterwards, the values of each + element of the CHILD_INPUT field, if any, in the state structure is + copied to each child's state to be the initial value of the INPUT field. */ +#define ARGP_KEY_INIT 0x1000003 +/* Use after all other keys, including SUCCESS & END. */ +#define ARGP_KEY_FINI 0x1000007 +/* Passed in when parsing has successfully been completed (even if there are + still arguments remaining). */ +#define ARGP_KEY_SUCCESS 0x1000004 +/* Passed in if an error occurs. */ +#define ARGP_KEY_ERROR 0x1000005 + +/* An argp structure contains a set of options declarations, a function to + deal with parsing one, documentation string, a possible vector of child + argp's, and perhaps a function to filter help output. 
When actually + parsing options, getopt is called with the union of all the argp + structures chained together through their CHILD pointers, with conflicts + being resolved in favor of the first occurrence in the chain. */ +struct argp +{ + /* An array of argp_option structures, terminated by an entry with both + NAME and KEY having a value of 0. */ + __const struct argp_option *options; + + /* What to do with an option from this structure. KEY is the key + associated with the option, and ARG is any associated argument (NULL if + none was supplied). If KEY isn't understood, ARGP_ERR_UNKNOWN should be + returned. If a non-zero, non-ARGP_ERR_UNKNOWN value is returned, then + parsing is stopped immediately, and that value is returned from + argp_parse(). For special (non-user-supplied) values of KEY, see the + ARGP_KEY_ definitions above. */ + argp_parser_t parser; + + /* A string describing what other arguments are wanted by this program. It + is only used by argp_usage to print the `Usage:' message. If it + contains newlines, the strings separated by them are considered + alternative usage patterns, and printed on separate lines (lines after + the first are prefixed by ` or: ' instead of `Usage:'). */ + __const char *args_doc; + + /* If non-NULL, a string containing extra text to be printed before and + after the options in a long help message (separated by a vertical tab + `\v' character). */ + __const char *doc; + + /* A vector of argp_child structures, terminated by a member with a 0 + argp field, pointing to child argps that should be parsed with this one. Any + conflicts are resolved in favor of this argp, or early argps in the + CHILDREN list. This field is useful if you use libraries that supply + their own argp structure, which you want to use in conjunction with your + own. */ + __const struct argp_child *children; + + /* If non-zero, this should be a function to filter the output of help + messages. 
KEY is either a key from an option, in which case TEXT is + that option's help text, or a special key from the ARGP_KEY_HELP_ + defines, below, describing which other help text TEXT is. The function + should return either TEXT, if it should be used as-is, a replacement + string, which should be malloced, and will be freed by argp, or NULL, + meaning `print nothing'. The value for TEXT is *after* any translation + has been done, so if any of the replacement text also needs translation, + that should be done by the filter function. INPUT is either the input + supplied to argp_parse, or NULL, if argp_help was called directly. */ + char *(*help_filter) (int __key, __const char *__text, void *__input); + + /* If non-zero the strings used in the argp library are translated using + the domain described by this string. Otherwise the currently installed + default domain is used. */ + const char *argp_domain; +}; + +/* Possible KEY arguments to a help filter function. */ +#define ARGP_KEY_HELP_PRE_DOC 0x2000001 /* Help text preceeding options. */ +#define ARGP_KEY_HELP_POST_DOC 0x2000002 /* Help text following options. */ +#define ARGP_KEY_HELP_HEADER 0x2000003 /* Option header string. */ +#define ARGP_KEY_HELP_EXTRA 0x2000004 /* After all other documentation; + TEXT is NULL for this key. */ +/* Explanatory note emitted when duplicate option arguments have been + suppressed. */ +#define ARGP_KEY_HELP_DUP_ARGS_NOTE 0x2000005 +#define ARGP_KEY_HELP_ARGS_DOC 0x2000006 /* Argument doc string. */ + +/* When an argp has a non-zero CHILDREN field, it should point to a vector of + argp_child structures, each of which describes a subsidiary argp. */ +struct argp_child +{ + /* The child parser. */ + __const struct argp *argp; + + /* Flags for this child. */ + int flags; + + /* If non-zero, an optional header to be printed in help output before the + child options. 
As a side-effect, a non-zero value forces the child + options to be grouped together; to achieve this effect without actually + printing a header string, use a value of "". */ + __const char *header; + + /* Where to group the child options relative to the other (`consolidated') + options in the parent argp; the values are the same as the GROUP field + in argp_option structs, but all child-groupings follow parent options at + a particular group level. If both this field and HEADER are zero, then + they aren't grouped at all, but rather merged with the parent options + (merging the child's grouping levels with the parents). */ + int group; +}; + +/* Parsing state. This is provided to parsing functions called by argp, + which may examine and, as noted, modify fields. */ +struct argp_state +{ + /* The top level ARGP being parsed. */ + __const struct argp *root_argp; + + /* The argument vector being parsed. May be modified. */ + int argc; + char **argv; + + /* The index in ARGV of the next arg that to be parsed. May be modified. */ + int next; + + /* The flags supplied to argp_parse. May be modified. */ + unsigned flags; + + /* While calling a parsing function with a key of ARGP_KEY_ARG, this is the + number of the current arg, starting at zero, and incremented after each + such call returns. At all other times, this is the number of such + arguments that have been processed. */ + unsigned arg_num; + + /* If non-zero, the index in ARGV of the first argument following a special + `--' argument (which prevents anything following being interpreted as an + option). Only set once argument parsing has proceeded past this point. */ + int quoted; + + /* An arbitrary pointer passed in from the user. */ + void *input; + /* Values to pass to child parsers. This vector will be the same length as + the number of children for the current parser. */ + void **child_inputs; + + /* For the parser's use. Initialized to 0. */ + void *hook; + + /* The name used when printing messages. 
This is initialized to ARGV[0], + or PROGRAM_INVOCATION_NAME if that is unavailable. */ + char *name; + + /* Streams used when argp prints something. */ + FILE *err_stream; /* For errors; initialized to stderr. */ + FILE *out_stream; /* For information; initialized to stdout. */ + + void *pstate; /* Private, for use by argp. */ +}; + +/* Flags for argp_parse (note that the defaults are those that are + convenient for program command line parsing): */ + +/* Don't ignore the first element of ARGV. Normally (and always unless + ARGP_NO_ERRS is set) the first element of the argument vector is + skipped for option parsing purposes, as it corresponds to the program name + in a command line. */ +#define ARGP_PARSE_ARGV0 0x01 + +/* Don't print error messages for unknown options to stderr; unless this flag + is set, ARGP_PARSE_ARGV0 is ignored, as ARGV[0] is used as the program + name in the error messages. This flag implies ARGP_NO_EXIT (on the + assumption that silent exiting upon errors is bad behaviour). */ +#define ARGP_NO_ERRS 0x02 + +/* Don't parse any non-option args. Normally non-option args are parsed by + calling the parse functions with a key of ARGP_KEY_ARG, and the actual arg + as the value. Since it's impossible to know which parse function wants to + handle it, each one is called in turn, until one returns 0 or an error + other than ARGP_ERR_UNKNOWN; if an argument is handled by no one, the + argp_parse returns prematurely (but with a return value of 0). If all + args have been parsed without error, all parsing functions are called one + last time with a key of ARGP_KEY_END. This flag needn't normally be set, + as the normal behavior is to stop parsing as soon as some argument can't + be handled. */ +#define ARGP_NO_ARGS 0x04 + +/* Parse options and arguments in the same order they occur on the command + line -- normally they're rearranged so that all options come first. 
*/ +#define ARGP_IN_ORDER 0x08 + +/* Don't provide the standard long option --help, which causes usage and + option help information to be output to stdout, and exit (0) called. */ +#define ARGP_NO_HELP 0x10 + +/* Don't exit on errors (they may still result in error messages). */ +#define ARGP_NO_EXIT 0x20 + +/* Use the gnu getopt `long-only' rules for parsing arguments. */ +#define ARGP_LONG_ONLY 0x40 + +/* Turns off any message-printing/exiting options. */ +#define ARGP_SILENT (ARGP_NO_EXIT | ARGP_NO_ERRS | ARGP_NO_HELP) + +/* Parse the options strings in ARGC & ARGV according to the options in ARGP. + FLAGS is one of the ARGP_ flags above. If ARG_INDEX is non-NULL, the + index in ARGV of the first unparsed option is returned in it. If an + unknown option is present, ARGP_ERR_UNKNOWN is returned; if some parser + routine returned a non-zero value, it is returned; otherwise 0 is + returned. This function may also call exit unless the ARGP_NO_HELP flag + is set. INPUT is a pointer to a value to be passed in to the parser. */ +extern error_t argp_parse (__const struct argp *__restrict argp, + int argc, char **__restrict argv, + unsigned flags, int *__restrict arg_index, + void *__restrict input) __THROW; +extern error_t __argp_parse (__const struct argp *__restrict argp, + int argc, char **__restrict argv, + unsigned flags, int *__restrict arg_index, + void *__restrict input) __THROW; + +/* Global variables. */ + +/* If defined or set by the user program to a non-zero value, then a default + option --version is added (unless the ARGP_NO_HELP flag is used), which + will print this string followed by a newline and exit (unless the + ARGP_NO_EXIT flag is used). Overridden by ARGP_PROGRAM_VERSION_HOOK. 
*/ +extern __const char *argp_program_version; + +/* If defined or set by the user program to a non-zero value, then a default + option --version is added (unless the ARGP_NO_HELP flag is used), which + calls this function with a stream to print the version to and a pointer to + the current parsing state, and then exits (unless the ARGP_NO_EXIT flag is + used). This variable takes precedent over ARGP_PROGRAM_VERSION. */ +extern void (*argp_program_version_hook) (FILE *__restrict __stream, + struct argp_state *__restrict + __state); + +/* If defined or set by the user program, it should point to string that is + the bug-reporting address for the program. It will be printed by + argp_help if the ARGP_HELP_BUG_ADDR flag is set (as it is by various + standard help messages), embedded in a sentence that says something like + `Report bugs to ADDR.'. */ +extern __const char *argp_program_bug_address; + +/* The exit status that argp will use when exiting due to a parsing error. + If not defined or set by the user program, this defaults to EX_USAGE from + . */ +extern error_t argp_err_exit_status; + +/* Flags for argp_help. */ +#define ARGP_HELP_USAGE 0x01 /* a Usage: message. */ +#define ARGP_HELP_SHORT_USAGE 0x02 /* " but don't actually print options. */ +#define ARGP_HELP_SEE 0x04 /* a `Try ... for more help' message. */ +#define ARGP_HELP_LONG 0x08 /* a long help message. */ +#define ARGP_HELP_PRE_DOC 0x10 /* doc string preceding long help. */ +#define ARGP_HELP_POST_DOC 0x20 /* doc string following long help. */ +#define ARGP_HELP_DOC (ARGP_HELP_PRE_DOC | ARGP_HELP_POST_DOC) +#define ARGP_HELP_BUG_ADDR 0x40 /* bug report address */ +#define ARGP_HELP_LONG_ONLY 0x80 /* modify output appropriately to + reflect ARGP_LONG_ONLY mode. */ + +/* These ARGP_HELP flags are only understood by argp_state_help. */ +#define ARGP_HELP_EXIT_ERR 0x100 /* Call exit(1) instead of returning. */ +#define ARGP_HELP_EXIT_OK 0x200 /* Call exit(0) instead of returning. 
*/ + +/* The standard thing to do after a program command line parsing error, if an + error message has already been printed. */ +#define ARGP_HELP_STD_ERR \ + (ARGP_HELP_SEE | ARGP_HELP_EXIT_ERR) +/* The standard thing to do after a program command line parsing error, if no + more specific error message has been printed. */ +#define ARGP_HELP_STD_USAGE \ + (ARGP_HELP_SHORT_USAGE | ARGP_HELP_SEE | ARGP_HELP_EXIT_ERR) +/* The standard thing to do in response to a --help option. */ +#define ARGP_HELP_STD_HELP \ + (ARGP_HELP_SHORT_USAGE | ARGP_HELP_LONG | ARGP_HELP_EXIT_OK \ + | ARGP_HELP_DOC | ARGP_HELP_BUG_ADDR) + +/* Output a usage message for ARGP to STREAM. FLAGS are from the set + ARGP_HELP_*. */ +extern void argp_help (__const struct argp *__restrict __argp, + FILE *__restrict __stream, + unsigned __flags, char *__restrict __name) __THROW; +extern void __argp_help (__const struct argp *__restrict __argp, + FILE *__restrict __stream, unsigned __flags, + char *__name) __THROW; + +/* The following routines are intended to be called from within an argp + parsing routine (thus taking an argp_state structure as the first + argument). They may or may not print an error message and exit, depending + on the flags in STATE -- in any case, the caller should be prepared for + them *not* to exit, and should return an appropriate error after calling + them. [argp_usage & argp_error should probably be called argp_state_..., + but they're used often enough that they should be short] */ + +/* Output, if appropriate, a usage message for STATE to STREAM. FLAGS are + from the set ARGP_HELP_*. */ +extern void argp_state_help (__const struct argp_state *__restrict __state, + FILE *__restrict __stream, + unsigned int __flags) __THROW; +extern void __argp_state_help (__const struct argp_state *__restrict __state, + FILE *__restrict __stream, + unsigned int __flags) __THROW; + +/* Possibly output the standard usage message for ARGP to stderr and exit. 
*/ +extern void argp_usage (__const struct argp_state *__state) __THROW; +extern void __argp_usage (__const struct argp_state *__state) __THROW; + +/* If appropriate, print the printf string FMT and following args, preceded + by the program name and `:', to stderr, and followed by a `Try ... --help' + message, then exit (1). */ +extern void argp_error (__const struct argp_state *__restrict __state, + __const char *__restrict __fmt, ...) __THROW + PRINTF_STYLE(2,3); +extern void __argp_error (__const struct argp_state *__restrict __state, + __const char *__restrict __fmt, ...) __THROW + PRINTF_STYLE(2,3); + +/* Similar to the standard gnu error-reporting function error(), but will + respect the ARGP_NO_EXIT and ARGP_NO_ERRS flags in STATE, and will print + to STATE->err_stream. This is useful for argument parsing code that is + shared between program startup (when exiting is desired) and runtime + option parsing (when typically an error code is returned instead). The + difference between this function and argp_error is that the latter is for + *parsing errors*, and the former is for other problems that occur during + parsing but don't reflect a (syntactic) problem with the input. */ +extern void argp_failure (__const struct argp_state *__restrict __state, + int __status, int __errnum, + __const char *__restrict __fmt, ...) __THROW + PRINTF_STYLE(4,5); +extern void __argp_failure (__const struct argp_state *__restrict __state, + int __status, int __errnum, + __const char *__restrict __fmt, ...) __THROW + PRINTF_STYLE(4,5); + +/* Returns true if the option OPT is a valid short option. */ +extern int _option_is_short (__const struct argp_option *__opt) __THROW; +extern int __option_is_short (__const struct argp_option *__opt) __THROW; + +/* Returns true if the option OPT is in fact the last (unused) entry in an + options array. 
*/ +extern int _option_is_end (__const struct argp_option *__opt) __THROW; +extern int __option_is_end (__const struct argp_option *__opt) __THROW; + +/* Return the input field for ARGP in the parser corresponding to STATE; used + by the help routines. */ +extern void *_argp_input (__const struct argp *__restrict __argp, + __const struct argp_state *__restrict __state) + __THROW; +extern void *__argp_input (__const struct argp *__restrict __argp, + __const struct argp_state *__restrict __state) + __THROW; + +/* Used for extracting the program name from argv[0] */ +extern char *_argp_basename(char *name) __THROW; +extern char *__argp_basename(char *name) __THROW; + +/* Getting the program name given an argp state */ +extern char * +_argp_short_program_name(const struct argp_state *state) __THROW; +extern char * +__argp_short_program_name(const struct argp_state *state) __THROW; + + +#ifdef __USE_EXTERN_INLINES + +# if !_LIBC +# define __argp_usage argp_usage +# define __argp_state_help argp_state_help +# define __option_is_short _option_is_short +# define __option_is_end _option_is_end +# endif + +# ifndef ARGP_EI +# define ARGP_EI extern __inline__ +# endif + +ARGP_EI void +__argp_usage (__const struct argp_state *__state) +{ + __argp_state_help (__state, stderr, ARGP_HELP_STD_USAGE); +} + +ARGP_EI int +__option_is_short (__const struct argp_option *__opt) +{ + if (__opt->flags & OPTION_DOC) + return 0; + else + { + int __key = __opt->key; + return __key > 0 && isprint (__key); + } +} + +ARGP_EI int +__option_is_end (__const struct argp_option *__opt) +{ + return !__opt->key && !__opt->name && !__opt->doc && !__opt->group; +} + +# if !_LIBC +# undef __argp_usage +# undef __argp_state_help +# undef __option_is_short +# undef __option_is_end +# endif +#endif /* Use extern inlines. 
*/ + +#ifdef __cplusplus +} +#endif + +#endif /* argp.h */ diff --git a/argp-standalone/autogen.sh b/argp-standalone/autogen.sh new file mode 100755 index 000000000..8337353b5 --- /dev/null +++ b/argp-standalone/autogen.sh @@ -0,0 +1,6 @@ +#!/bin/sh + +aclocal -I . +autoheader +autoconf +automake --add-missing --copy --foreign diff --git a/argp-standalone/configure.ac b/argp-standalone/configure.ac new file mode 100644 index 000000000..fe54d5ac9 --- /dev/null +++ b/argp-standalone/configure.ac @@ -0,0 +1,100 @@ +dnl Process this file with autoconf to produce a configure script. + +dnl This configure.ac is only for building a standalone argp library. +AC_INIT([argp], [standalone-1.3]) +AC_PREREQ(2.54) +AC_CONFIG_SRCDIR([argp-ba.c]) +# Needed to stop autoconf from looking for files in parent directories. +AC_CONFIG_AUX_DIR([.]) + +AM_INIT_AUTOMAKE +AM_CONFIG_HEADER(config.h) + +# GNU libc defaults to supplying the ISO C library functions only. The +# _GNU_SOURCE define enables these extensions, in particular we want +# errno.h to declare program_invocation_name. Enable it on all +# systems; no problems have been reported with it so far. +AC_GNU_SOURCE + +# Checks for programs. +AC_PROG_CC +AC_PROG_MAKE_SET +AC_PROG_RANLIB +AM_PROG_CC_STDC + +if test "x$am_cv_prog_cc_stdc" = xno ; then + AC_ERROR([the C compiler doesn't handle ANSI-C]) +fi + +# Checks for libraries. + +# Checks for header files. +AC_HEADER_STDC +AC_CHECK_HEADERS(limits.h malloc.h unistd.h sysexits.h stdarg.h) + +# Checks for typedefs, structures, and compiler characteristics. +AC_C_CONST +AC_C_INLINE +AC_TYPE_SIZE_T + +LSH_GCC_ATTRIBUTES + +# Checks for library functions. 
+AC_FUNC_ALLOCA +AC_FUNC_VPRINTF +AC_CHECK_FUNCS(strerror sleep getpid snprintf) + +AC_REPLACE_FUNCS(mempcpy strndup strchrnul strcasecmp vsnprintf) + +dnl ARGP_CHECK_FUNC(includes, function-call [, if-found [, if-not-found]]) +AC_DEFUN([ARGP_CHECK_FUNC], + [AS_VAR_PUSHDEF([ac_func], m4_substr([$2], 0, m4_index([$2], [(]))) + AS_VAR_PUSHDEF([ac_var], [ac_cv_func_call_]ac_func) + AH_TEMPLATE(AS_TR_CPP(HAVE_[]ac_func), + [Define to 1 if you have the `]ac_func[' function.]) + AC_CACHE_CHECK([for $2], ac_var, + [AC_TRY_LINK([$1], [$2], + [AS_VAR_SET(ac_var, yes)], + [AS_VAR_SET(ac_var, no)])]) + if test AS_VAR_GET(ac_var) = yes ; then + ifelse([$3],, + [AC_DEFINE_UNQUOTED(AS_TR_CPP(HAVE_[]ac_func))], + [$3 +]) + else + ifelse([$4],, true, [$4]) + fi + AS_VAR_POPDEF([ac_var]) + AS_VAR_POPDEF([ac_func]) + ]) + +# At least on freebsd, putc_unlocked is a macro, so the standard +# AC_CHECK_FUNCS doesn't work well. +ARGP_CHECK_FUNC([#include ], [putc_unlocked('x', stdout)]) + +AC_CHECK_FUNCS(flockfile) +AC_CHECK_FUNCS(fputs_unlocked fwrite_unlocked) + +# Used only by argp-test.c, so don't use AC_REPLACE_FUNCS. 
+AC_CHECK_FUNCS(strdup asprintf) + +AC_CHECK_DECLS([program_invocation_name, program_invocation_short_name], + [], [], [[#include ]]) + +# Set these flags *last*, or else the test programs won't compile +if test x$GCC = xyes ; then + # Using -ggdb3 makes (some versions of) Redhat's gcc-2.96 dump core + if "$CC" --version | grep '^2\.96$' 1>/dev/null 2>&1; then + true + else + CFLAGS="$CFLAGS -ggdb3" + fi + CFLAGS="$CFLAGS -Wall -W \ + -Wmissing-prototypes -Wmissing-declarations -Wstrict-prototypes \ + -Waggregate-return \ + -Wpointer-arith -Wbad-function-cast -Wnested-externs" +fi + +CPPFLAGS="$CPPFLAGS -I$srcdir" + +AC_OUTPUT(Makefile) diff --git a/argp-standalone/mempcpy.c b/argp-standalone/mempcpy.c new file mode 100644 index 000000000..21d8bd2ed --- /dev/null +++ b/argp-standalone/mempcpy.c @@ -0,0 +1,21 @@ +/* mempcpy.c + * + */ + +/* Written by Niels Möller + * + * This file is hereby placed in the public domain. + */ + +#include + +void * +mempcpy (void *, const void *, size_t) ; + +void * +mempcpy (void *to, const void *from, size_t size) +{ + memcpy(to, from, size); + return (char *) to + size; +} + diff --git a/argp-standalone/strcasecmp.c b/argp-standalone/strcasecmp.c new file mode 100644 index 000000000..bcad7a226 --- /dev/null +++ b/argp-standalone/strcasecmp.c @@ -0,0 +1,28 @@ +/* strcasecmp.c + * + */ + +/* Written by Niels Möller + * + * This file is hereby placed in the public domain. 
+ */ + +#include <ctype.h> + +int strcasecmp(const char *s1, const char *s2) +{ + unsigned i; + + for (i = 0; s1[i] && s2[i]; i++) + { + unsigned char c1 = tolower( (unsigned char) s1[i]); + unsigned char c2 = tolower( (unsigned char) s2[i]); + + if (c1 < c2) + return -1; + else if (c1 > c2) + return 1; + } + + return !s2[i] - !s1[i]; +} diff --git a/argp-standalone/strchrnul.c b/argp-standalone/strchrnul.c new file mode 100644 index 000000000..ee4145e4e --- /dev/null +++ b/argp-standalone/strchrnul.c @@ -0,0 +1,23 @@ +/* strchrnul.c + * + */ + +/* Written by Niels Möller + * + * This file is hereby placed in the public domain. + */ + +/* Like strchr, but returns a pointer to the terminating NUL character, + * not a NULL pointer, if the character isn't found. C is converted to + * char before comparing, as strchr does. */ + +char *strchrnul(const char *, int ); + +char *strchrnul(const char *s, int c) +{ + const char *p = s; + while (*p && (*p != (char) c)) + p++; + + return (char *) p; +} diff --git a/argp-standalone/strndup.c b/argp-standalone/strndup.c new file mode 100644 index 000000000..4147b7a20 --- /dev/null +++ b/argp-standalone/strndup.c @@ -0,0 +1,34 @@ +/* strndup.c + * + */ + +/* Written by Niels Möller + * + * This file is hereby placed in the public domain. 
+ */ + +#include <stdlib.h> +#include <string.h> + +char * +strndup (const char *, size_t); +/* Return a malloc'd, NUL-terminated copy of at most SIZE bytes of S, or NULL if allocation fails. */ +char * +strndup (const char *s, size_t size) +{ + char *r; + const char *end = memchr(s, 0, size); + + if (end) + /* S is shorter than SIZE: copy only up to its terminator. */ + size = end - s; + + r = malloc(size + 1); + + if (r) + { + memcpy(r, s, size); + r[size] = '\0'; + } + return r; +} diff --git a/argp-standalone/vsnprintf.c b/argp-standalone/vsnprintf.c new file mode 100644 index 000000000..e9b5f192b --- /dev/null +++ b/argp-standalone/vsnprintf.c @@ -0,0 +1,839 @@ +/* Copied from http://www.fiction.net/blong/programs/snprintf.c */ + +/* + * Copyright Patrick Powell 1995 + * This code is based on code written by Patrick Powell (papowell@astart.com) + * It may be used for any purpose as long as this notice remains intact + * on all source code distributions + */ + +/************************************************************** + * Original: + * Patrick Powell Tue Apr 11 09:48:21 PDT 1995 + * A bombproof version of doprnt (dopr) included. + * Sigh. This sort of thing is always nasty to deal with. Note that + * the version here does not include floating point... + * + * snprintf() is used instead of sprintf() as it does limit checks + * for string length. This covers a nasty loophole. + * + * The other functions are there to prevent NULL pointers from + * causing nasty effects. + * + * More Recently: + * Brandon Long 9/15/96 for mutt 0.43 + * This was ugly. It is still ugly. I opted out of floating point + * numbers, but the formatter understands just about everything + * from the normal C string format, at least as far as I can tell from + * the Solaris 2.5 printf(3S) man page. + * + * Brandon Long 10/22/97 for mutt 0.87.1 + * Ok, added some minimal floating point support, which means this + * probably requires libm on most operating systems. Don't yet + * support the exponent (e,E) and sigfig (g,G). Also, fmtint() + * was pretty badly broken, it just wasn't being exercised in ways + * which showed it, so that's been fixed. 
Also, formated the code + * to mutt conventions, and removed dead code left over from the + * original. Also, there is now a builtin-test, just compile with: + * gcc -DTEST_SNPRINTF -o snprintf snprintf.c -lm + * and run snprintf for results. + * + * Thomas Roessler 01/27/98 for mutt 0.89i + * The PGP code was using unsigned hexadecimal formats. + * Unfortunately, unsigned formats simply didn't work. + * + * Michael Elkins 03/05/98 for mutt 0.90.8 + * The original code assumed that both snprintf() and vsnprintf() were + * missing. Some systems only have snprintf() but not vsnprintf(), so + * the code is now broken down under HAVE_SNPRINTF and HAVE_VSNPRINTF. + * + * Andrew Tridgell (tridge@samba.org) Oct 1998 + * fixed handling of %.0f + * added test for HAVE_LONG_DOUBLE + * + * Russ Allbery 2000-08-26 + * fixed return value to comply with C99 + * fixed handling of snprintf(NULL, ...) + * + * Niels Möller 2004-03-05 + * fixed calls to isdigit to use unsigned char. + * fixed calls to va_arg; short arguments are always passed as int. + * + **************************************************************/ + +#if HAVE_CONFIG_H +# include "config.h" +#endif + +#if !defined(HAVE_SNPRINTF) || !defined(HAVE_VSNPRINTF) + +#include +#include +#include + +/* Define this as a fall through, HAVE_STDARG_H is probably already set */ + +#define HAVE_VARARGS_H + + +/* varargs declarations: */ + +#if defined(HAVE_STDARG_H) +# include +# define HAVE_STDARGS /* let's hope that works everywhere (mj) */ +# define VA_LOCAL_DECL va_list ap +# define VA_START(f) va_start(ap, f) +# define VA_SHIFT(v,t) ; /* no-op for ANSI */ +# define VA_END va_end(ap) +#else +# if defined(HAVE_VARARGS_H) +# include +# undef HAVE_STDARGS +# define VA_LOCAL_DECL va_list ap +# define VA_START(f) va_start(ap) /* f is ignored! 
*/ +# define VA_SHIFT(v,t) v = va_arg(ap,t) +# define VA_END va_end(ap) +# else +/*XX ** NO VARARGS ** XX*/ +# endif +#endif + +#ifdef HAVE_LONG_DOUBLE +#define LDOUBLE long double +#else +#define LDOUBLE double +#endif + +int snprintf (char *str, size_t count, const char *fmt, ...); +int vsnprintf (char *str, size_t count, const char *fmt, va_list arg); + +static int dopr (char *buffer, size_t maxlen, const char *format, + va_list args); +static int fmtstr (char *buffer, size_t *currlen, size_t maxlen, + char *value, int flags, int min, int max); +static int fmtint (char *buffer, size_t *currlen, size_t maxlen, + long value, int base, int min, int max, int flags); +static int fmtfp (char *buffer, size_t *currlen, size_t maxlen, + LDOUBLE fvalue, int min, int max, int flags); +static int dopr_outch (char *buffer, size_t *currlen, size_t maxlen, char c ); + +/* + * dopr(): poor man's version of doprintf + */ + +/* format read states */ +#define DP_S_DEFAULT 0 +#define DP_S_FLAGS 1 +#define DP_S_MIN 2 +#define DP_S_DOT 3 +#define DP_S_MAX 4 +#define DP_S_MOD 5 +#define DP_S_CONV 6 +#define DP_S_DONE 7 + +/* format flags - Bits */ +#define DP_F_MINUS (1 << 0) +#define DP_F_PLUS (1 << 1) +#define DP_F_SPACE (1 << 2) +#define DP_F_NUM (1 << 3) +#define DP_F_ZERO (1 << 4) +#define DP_F_UP (1 << 5) +#define DP_F_UNSIGNED (1 << 6) + +/* Conversion Flags */ +#define DP_C_SHORT 1 +#define DP_C_LONG 2 +#define DP_C_LDOUBLE 3 + +#define char_to_int(p) (p - '0') +#define MAX(p,q) ((p >= q) ? p : q) +#define MIN(p,q) ((p <= q) ? 
p : q) + +static int dopr (char *buffer, size_t maxlen, const char *format, va_list args) +{ + unsigned char ch; + long value; + LDOUBLE fvalue; + char *strvalue; + int min; + int max; + int state; + int flags; + int cflags; + int total; + size_t currlen; + + state = DP_S_DEFAULT; + currlen = flags = cflags = min = 0; + max = -1; + ch = *format++; + total = 0; + + while (state != DP_S_DONE) + { + if (ch == '\0') + state = DP_S_DONE; + + switch(state) + { + case DP_S_DEFAULT: + if (ch == '%') + state = DP_S_FLAGS; + else + total += dopr_outch (buffer, &currlen, maxlen, ch); + ch = *format++; + break; + case DP_S_FLAGS: + switch (ch) + { + case '-': + flags |= DP_F_MINUS; + ch = *format++; + break; + case '+': + flags |= DP_F_PLUS; + ch = *format++; + break; + case ' ': + flags |= DP_F_SPACE; + ch = *format++; + break; + case '#': + flags |= DP_F_NUM; + ch = *format++; + break; + case '0': + flags |= DP_F_ZERO; + ch = *format++; + break; + default: + state = DP_S_MIN; + break; + } + break; + case DP_S_MIN: + if (isdigit(ch)) + { + min = 10*min + char_to_int (ch); + ch = *format++; + } + else if (ch == '*') + { + min = va_arg (args, int); + ch = *format++; + state = DP_S_DOT; + } + else + state = DP_S_DOT; + break; + case DP_S_DOT: + if (ch == '.') + { + state = DP_S_MAX; + ch = *format++; + } + else + state = DP_S_MOD; + break; + case DP_S_MAX: + if (isdigit(ch)) + { + if (max < 0) + max = 0; + max = 10*max + char_to_int (ch); + ch = *format++; + } + else if (ch == '*') + { + max = va_arg (args, int); + ch = *format++; + state = DP_S_MOD; + } + else + state = DP_S_MOD; + break; + case DP_S_MOD: + /* Currently, we don't support Long Long, bummer */ + switch (ch) + { + case 'h': + cflags = DP_C_SHORT; + ch = *format++; + break; + case 'l': + cflags = DP_C_LONG; + ch = *format++; + break; + case 'L': + cflags = DP_C_LDOUBLE; + ch = *format++; + break; + default: + break; + } + state = DP_S_CONV; + break; + case DP_S_CONV: + switch (ch) + { + case 'd': + case 'i': + if 
(cflags == DP_C_SHORT) + value = (short) va_arg (args, int); + else if (cflags == DP_C_LONG) + value = va_arg (args, long int); + else + value = va_arg (args, int); + total += fmtint (buffer, &currlen, maxlen, value, 10, min, max, flags); + break; + case 'o': + flags |= DP_F_UNSIGNED; + if (cflags == DP_C_SHORT) + value = (unsigned short) va_arg (args, unsigned); + else if (cflags == DP_C_LONG) + value = va_arg (args, unsigned long int); + else + value = va_arg (args, unsigned int); + total += fmtint (buffer, &currlen, maxlen, value, 8, min, max, flags); + break; + case 'u': + flags |= DP_F_UNSIGNED; + if (cflags == DP_C_SHORT) + value = (unsigned short) va_arg (args, unsigned); + else if (cflags == DP_C_LONG) + value = va_arg (args, unsigned long int); + else + value = va_arg (args, unsigned int); + total += fmtint (buffer, &currlen, maxlen, value, 10, min, max, flags); + break; + case 'X': + flags |= DP_F_UP; + case 'x': + flags |= DP_F_UNSIGNED; + if (cflags == DP_C_SHORT) + value = (unsigned short) va_arg (args, unsigned); + else if (cflags == DP_C_LONG) + value = va_arg (args, unsigned long int); + else + value = va_arg (args, unsigned int); + total += fmtint (buffer, &currlen, maxlen, value, 16, min, max, flags); + break; + case 'f': + if (cflags == DP_C_LDOUBLE) + fvalue = va_arg (args, LDOUBLE); + else + fvalue = va_arg (args, double); + /* um, floating point? 
*/ + total += fmtfp (buffer, &currlen, maxlen, fvalue, min, max, flags); + break; + case 'E': + flags |= DP_F_UP; + case 'e': + if (cflags == DP_C_LDOUBLE) + fvalue = va_arg (args, LDOUBLE); + else + fvalue = va_arg (args, double); + break; + case 'G': + flags |= DP_F_UP; + case 'g': + if (cflags == DP_C_LDOUBLE) + fvalue = va_arg (args, LDOUBLE); + else + fvalue = va_arg (args, double); + break; + case 'c': + total += dopr_outch (buffer, &currlen, maxlen, va_arg (args, int)); + break; + case 's': + strvalue = va_arg (args, char *); + total += fmtstr (buffer, &currlen, maxlen, strvalue, flags, min, max); + break; + case 'p': + strvalue = va_arg (args, void *); + total += fmtint (buffer, &currlen, maxlen, (long) strvalue, 16, min, + max, flags); + break; + case 'n': + if (cflags == DP_C_SHORT) + { + short int *num; + num = va_arg (args, short int *); + *num = currlen; + } + else if (cflags == DP_C_LONG) + { + long int *num; + num = va_arg (args, long int *); + *num = currlen; + } + else + { + int *num; + num = va_arg (args, int *); + *num = currlen; + } + break; + case '%': + total += dopr_outch (buffer, &currlen, maxlen, ch); + break; + case 'w': + /* not supported yet, treat as next char */ + ch = *format++; + break; + default: + /* Unknown, skip */ + break; + } + ch = *format++; + state = DP_S_DEFAULT; + flags = cflags = min = 0; + max = -1; + break; + case DP_S_DONE: + break; + default: + /* hmm? 
*/ + break; /* some picky compilers need this */ + } + } + if (buffer != NULL) + { + if (currlen < maxlen - 1) + buffer[currlen] = '\0'; + else + buffer[maxlen - 1] = '\0'; + } + return total; +} + +static int fmtstr (char *buffer, size_t *currlen, size_t maxlen, + char *value, int flags, int min, int max) +{ + int padlen, strln; /* amount to pad */ + int cnt = 0; + int total = 0; + + if (value == 0) + { + value = ""; + } + + for (strln = 0; value[strln]; ++strln); /* strlen */ + if (max >= 0 && max < strln) + strln = max; + padlen = min - strln; + if (padlen < 0) + padlen = 0; + if (flags & DP_F_MINUS) + padlen = -padlen; /* Left Justify */ + + while (padlen > 0) + { + total += dopr_outch (buffer, currlen, maxlen, ' '); + --padlen; + } + while (*value && ((max < 0) || (cnt < max))) + { + total += dopr_outch (buffer, currlen, maxlen, *value++); + ++cnt; + } + while (padlen < 0) + { + total += dopr_outch (buffer, currlen, maxlen, ' '); + ++padlen; + } + return total; +} + +/* Have to handle DP_F_NUM (ie 0x and 0 alternates) */ + +static int fmtint (char *buffer, size_t *currlen, size_t maxlen, + long value, int base, int min, int max, int flags) +{ + int signvalue = 0; + unsigned long uvalue; + char convert[20]; + int place = 0; + int spadlen = 0; /* amount to space pad */ + int zpadlen = 0; /* amount to zero pad */ + int caps = 0; + int total = 0; + + if (max < 0) + max = 0; + + uvalue = value; + + if(!(flags & DP_F_UNSIGNED)) + { + if( value < 0 ) { + signvalue = '-'; + uvalue = -value; + } + else + if (flags & DP_F_PLUS) /* Do a sign (+/i) */ + signvalue = '+'; + else + if (flags & DP_F_SPACE) + signvalue = ' '; + } + + if (flags & DP_F_UP) caps = 1; /* Should characters be upper case? */ + + do { + convert[place++] = + (caps? 
/*
 * Store C in BUFFER at index *CURRLEN when BUFFER is non-NULL and one more
 * byte still fits in front of the slot reserved for the terminating NUL.
 * Always returns 1 so that callers can add up the untruncated length.
 */
static int dopr_outch (char *buffer, size_t *currlen, size_t maxlen, char c)
{
  if (buffer != NULL && *currlen + 1 < maxlen)
    buffer[(*currlen)++] = c;
  return 1;
}

/* Absolute value of VALUE. */
static LDOUBLE fp_abs (LDOUBLE value)
{
  return (value < 0) ? -value : value;
}

/* 10 raised to EXP; any EXP <= 0 yields 1. */
static LDOUBLE fp_pow10 (int exp)
{
  LDOUBLE result = 1;

  while (exp > 0)
  {
    result *= 10;
    exp--;
  }

  return result;
}

/* Round the non-negative VALUE to the nearest long, halves going up. */
static long fp_round (LDOUBLE value)
{
  long intpart = (long) value;

  if (value - intpart >= 0.5)
    intpart++;

  return intpart;
}

/*
 * fmtfp(): format FVALUE in "%f" style.
 *
 *   min    minimum field width
 *   max    digits after the decimal point; negative selects the default
 *          of 6, values above 9 are clamped to 9
 *   flags  DP_F_MINUS (left justify), DP_F_PLUS / DP_F_SPACE (sign for
 *          non-negative values), DP_F_ZERO (pad with zeros)
 *
 * Output goes through dopr_outch(); the return value is the number of
 * characters the complete field consists of.
 *
 * The integer and fractional parts are converted through a long, so the
 * result is only meaningful while the integer part of |FVALUE| fits in a
 * long.  The private helpers carry an fp_ prefix so that they cannot clash
 * with the C library's round() and pow10().
 */
static int fmtfp (char *buffer, size_t *currlen, size_t maxlen,
                  LDOUBLE fvalue, int min, int max, int flags)
{
  int signvalue = 0;
  LDOUBLE ufvalue;
  LDOUBLE scale;
  char iconvert[20];
  char fconvert[20];
  int iplace = 0;
  int fplace = 0;
  int padlen = 0; /* amount to pad */
  int zpadlen = 0; /* leading zeros of the fractional part */
  int total = 0;
  long intpart;
  long fracpart;

  /*
   * AIX manpage says the default is 0, but Solaris says the default
   * is 6, and sprintf on AIX defaults to 6
   */
  if (max < 0)
    max = 6;

  ufvalue = fp_abs (fvalue);

  if (fvalue < 0)
    signvalue = '-';
  else if (flags & DP_F_PLUS) /* Do a sign (+/i) */
    signvalue = '+';
  else if (flags & DP_F_SPACE)
    signvalue = ' ';

  intpart = (long) ufvalue;

  /*
   * Sorry, we only support 9 digits past the decimal because of our
   * conversion method
   */
  if (max > 9)
    max = 9;

  /* We "cheat" by converting the fractional part to integer by
   * multiplying by a factor of 10
   */
  scale = fp_pow10 (max);
  fracpart = fp_round (scale * (ufvalue - intpart));

  /* rounding may carry into the integer part (e.g. 0.996 with max 2) */
  if (fracpart >= scale)
  {
    intpart++;
    fracpart -= (long) scale;
  }

#ifdef DEBUG_SNPRINTF
  dprint (1, (debugfile, "fmtfp: %f =? %ld.%ld\n", (double) fvalue, intpart,
              fracpart));
#endif

  /* Convert integer part; digits are stored least significant first */
  do {
    iconvert[iplace++] = (char) ('0' + intpart % 10);
    intpart = (intpart / 10);
  } while (intpart && (iplace < 20));
  if (iplace == 20) iplace--;
  iconvert[iplace] = 0;

  /* Convert fractional part, same digit order */
  do {
    fconvert[fplace++] = (char) ('0' + fracpart % 10);
    fracpart = (fracpart / 10);
  } while (fracpart && (fplace < 20));
  if (fplace == 20) fplace--;
  fconvert[fplace] = 0;

  /*
   * The decimal point only takes a column when it is printed (max > 0);
   * one more column goes to the sign, if any.
   */
  padlen = min - iplace - max - ((max > 0) ? 1 : 0) - ((signvalue) ? 1 : 0);
  zpadlen = max - fplace;
  if (zpadlen < 0)
    zpadlen = 0;
  if (padlen < 0)
    padlen = 0;
  if (flags & DP_F_MINUS)
    padlen = -padlen; /* Left Justify */

  if ((flags & DP_F_ZERO) && (padlen > 0))
  {
    /*
     * The sign goes in front of the zeros.  Its column was already
     * subtracted from padlen above, so padlen is not reduced again.
     */
    if (signvalue)
    {
      total += dopr_outch (buffer, currlen, maxlen, signvalue);
      signvalue = 0;
    }
    while (padlen > 0)
    {
      total += dopr_outch (buffer, currlen, maxlen, '0');
      --padlen;
    }
  }
  while (padlen > 0)
  {
    total += dopr_outch (buffer, currlen, maxlen, ' ');
    --padlen;
  }
  if (signvalue)
    total += dopr_outch (buffer, currlen, maxlen, signvalue);

  while (iplace > 0)
    total += dopr_outch (buffer, currlen, maxlen, iconvert[--iplace]);

  /*
   * Decimal point. This should probably use locale to find the correct
   * char to print out.
   */
  if (max > 0)
  {
    total += dopr_outch (buffer, currlen, maxlen, '.');

    /*
     * fracpart lost its leading zeros when it became an integer; put
     * them back in front of the digits (1.05 must not become 1.50).
     */
    while (zpadlen > 0)
    {
      total += dopr_outch (buffer, currlen, maxlen, '0');
      --zpadlen;
    }

    while (fplace > 0)
      total += dopr_outch (buffer, currlen, maxlen, fconvert[--fplace]);
  }

  /* Left Justified spaces */
  while (padlen < 0)
  {
    total += dopr_outch (buffer, currlen, maxlen, ' ');
    ++padlen;
  }

  return total;
}
+#else +int snprintf (va_alist) va_dcl +#endif +{ +#ifndef HAVE_STDARGS + char *str; + size_t count; + char *fmt; +#endif + VA_LOCAL_DECL; + int total; + + VA_START (fmt); + VA_SHIFT (str, char *); + VA_SHIFT (count, size_t ); + VA_SHIFT (fmt, char *); + total = vsnprintf(str, count, fmt, ap); + VA_END; + return total; +} +#endif /* !HAVE_SNPRINTF */ + +#ifdef TEST_SNPRINTF +#ifndef LONG_STRING +#define LONG_STRING 1024 +#endif +int main (void) +{ + char buf1[LONG_STRING]; + char buf2[LONG_STRING]; + char *fp_fmt[] = { + "%-1.5f", + "%1.5f", + "%123.9f", + "%10.5f", + "% 10.5f", + "%+22.9f", + "%+4.9f", + "%01.3f", + "%4f", + "%3.1f", + "%3.2f", + "%.0f", + "%.1f", + NULL + }; + double fp_nums[] = { -1.5, 134.21, 91340.2, 341.1234, 0203.9, 0.96, 0.996, + 0.9996, 1.996, 4.136, 0}; + char *int_fmt[] = { + "%-1.5d", + "%1.5d", + "%123.9d", + "%5.5d", + "%10.5d", + "% 10.5d", + "%+22.33d", + "%01.3d", + "%4d", + NULL + }; + long int_nums[] = { -1, 134, 91340, 341, 0203, 0}; + int x, y; + int fail = 0; + int num = 0; + + printf ("Testing snprintf format codes against system sprintf...\n"); + + for (x = 0; fp_fmt[x] != NULL ; x++) + for (y = 0; fp_nums[y] != 0 ; y++) + { + snprintf (buf1, sizeof (buf1), fp_fmt[x], fp_nums[y]); + sprintf (buf2, fp_fmt[x], fp_nums[y]); + if (strcmp (buf1, buf2)) + { + printf("snprintf doesn't match Format: %s\n\tsnprintf = %s\n\tsprintf = %s\n", + fp_fmt[x], buf1, buf2); + fail++; + } + num++; + } + + for (x = 0; int_fmt[x] != NULL ; x++) + for (y = 0; int_nums[y] != 0 ; y++) + { + snprintf (buf1, sizeof (buf1), int_fmt[x], int_nums[y]); + sprintf (buf2, int_fmt[x], int_nums[y]); + if (strcmp (buf1, buf2)) + { + printf("snprintf doesn't match Format: %s\n\tsnprintf = %s\n\tsprintf = %s\n", + int_fmt[x], buf1, buf2); + fail++; + } + num++; + } + printf ("%d tests failed out of %d.\n", fail, num); +} +#endif /* SNPRINTF_TEST */ + +#endif /* !HAVE_SNPRINTF */ diff --git a/auth/Makefile.am b/auth/Makefile.am new file mode 100644 index 
000000000..6bd54eee3 --- /dev/null +++ b/auth/Makefile.am @@ -0,0 +1,3 @@ +SUBDIRS = addr login + +CLEANFILES = diff --git a/auth/addr/Makefile.am b/auth/addr/Makefile.am new file mode 100644 index 000000000..d471a3f92 --- /dev/null +++ b/auth/addr/Makefile.am @@ -0,0 +1,3 @@ +SUBDIRS = src + +CLEANFILES = diff --git a/auth/addr/src/Makefile.am b/auth/addr/src/Makefile.am new file mode 100644 index 000000000..cca406151 --- /dev/null +++ b/auth/addr/src/Makefile.am @@ -0,0 +1,12 @@ +auth_LTLIBRARIES = addr.la +authdir = $(libdir)/glusterfs/$(PACKAGE_VERSION)/auth + +addr_la_LDFLAGS = -module -avoidversion + +addr_la_SOURCES = addr.c +addr_la_LIBADD = $(top_builddir)/libglusterfs/src/libglusterfs.la + +AM_CFLAGS = -fPIC -D_FILE_OFFSET_BITS=64 -D_GNU_SOURCE -Wall -D$(GF_HOST_OS)\ + -I$(top_srcdir)/libglusterfs/src -shared -nostartfiles $(GF_CFLAGS) + +CLEANFILES = diff --git a/auth/addr/src/addr.c b/auth/addr/src/addr.c new file mode 100644 index 000000000..0b248b4c6 --- /dev/null +++ b/auth/addr/src/addr.c @@ -0,0 +1,208 @@ +/* + Copyright (c) 2007, 2008 Z RESEARCH, Inc. + This file is part of GlusterFS. + + GlusterFS is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published + by the Free Software Foundation; either version 3 of the License, + or (at your option) any later version. + + GlusterFS is distributed in the hope that it will be useful, but + WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program. If not, see + . 
+*/ + + +#ifndef _CONFIG_H +#define _CONFIG_H +#include "config.h" +#endif + +#include +#include +#include +#include "authenticate.h" +#include "dict.h" + +#define ADDR_DELIMITER " ," +#define PRIVILAGED_PORT_CIELING 1024 + +#ifndef AF_INET_SDP +#define AF_INET_SDP 27 +#endif + +auth_result_t +gf_auth (dict_t *input_params, dict_t *config_params) +{ + char *name = NULL; + char *searchstr = NULL; + char peer_addr[UNIX_PATH_MAX]; + data_t *peer_info_data = NULL; + peer_info_t *peer_info = NULL; + data_t *allow_addr = NULL, *reject_addr = NULL; + char is_inet_sdp = 0; + + name = data_to_str (dict_get (input_params, "remote-subvolume")); + if (!name) { + gf_log ("authenticate/addr", + GF_LOG_ERROR, + "remote-subvolume not specified"); + return AUTH_DONT_CARE; + } + + asprintf (&searchstr, "auth.addr.%s.allow", name); + allow_addr = dict_get (config_params, + searchstr); + free (searchstr); + + asprintf (&searchstr, "auth.addr.%s.reject", name); + reject_addr = dict_get (config_params, + searchstr); + free (searchstr); + + if (!allow_addr) { + /* TODO: backword compatibility */ + asprintf (&searchstr, "auth.ip.%s.allow", name); + allow_addr = dict_get (config_params, searchstr); + free (searchstr); + } + + if (!(allow_addr || reject_addr)) { + gf_log ("auth/addr", GF_LOG_DEBUG, + "none of the options auth.addr.%s.allow or " + "auth.addr.%s.reject specified, returning auth_dont_care", + name, name); + return AUTH_DONT_CARE; + } + + peer_info_data = dict_get (input_params, "peer-info"); + if (!peer_info_data) { + gf_log ("authenticate/addr", + GF_LOG_ERROR, + "peer-info not present"); + return AUTH_DONT_CARE; + } + + peer_info = data_to_ptr (peer_info_data); + + switch (((struct sockaddr *) &peer_info->sockaddr)->sa_family) + { + case AF_INET_SDP: + is_inet_sdp = 1; + ((struct sockaddr *) &peer_info->sockaddr)->sa_family = AF_INET; + + case AF_INET: + case AF_INET6: + { + char *service; + uint16_t peer_port; + strcpy (peer_addr, peer_info->identifier); + service = strrchr 
(peer_addr, ':'); + *service = '\0'; + service ++; + + if (is_inet_sdp) { + ((struct sockaddr *) &peer_info->sockaddr)->sa_family = AF_INET_SDP; + } + + peer_port = atoi (service); + if (peer_port >= PRIVILAGED_PORT_CIELING) { + gf_log ("auth/addr", GF_LOG_ERROR, + "client is bound to port %d which is not privilaged", + peer_port); + return AUTH_DONT_CARE; + } + break; + + case AF_UNIX: + strcpy (peer_addr, peer_info->identifier); + break; + + default: + gf_log ("authenticate/addr", GF_LOG_ERROR, + "unknown address family %d", + ((struct sockaddr *) &peer_info->sockaddr)->sa_family); + return AUTH_DONT_CARE; + } + } + + if (reject_addr) { + char *addr_str = NULL; + char *tmp; + char *addr_cpy = strdup (reject_addr->data); + + addr_str = strtok_r (addr_cpy, ADDR_DELIMITER, &tmp); + + while (addr_str) { + char negate = 0, match =0; + gf_log (name, GF_LOG_DEBUG, + "rejected = \"%s\", received addr = \"%s\"", + addr_str, peer_addr); + if (addr_str[0] == '!') { + negate = 1; + addr_str++; + } + + match = fnmatch (addr_str, + peer_addr, + 0); + if (negate ? match : !match) { + free (addr_cpy); + return AUTH_REJECT; + } + addr_str = strtok_r (NULL, ADDR_DELIMITER, &tmp); + } + free (addr_cpy); + } + + if (allow_addr) { + char *addr_str = NULL; + char *tmp; + char *addr_cpy = strdup (allow_addr->data); + + addr_str = strtok_r (addr_cpy, ADDR_DELIMITER, &tmp); + + while (addr_str) { + char negate = 0, match = 0; + gf_log (name, GF_LOG_DEBUG, + "allowed = \"%s\", received addr = \"%s\"", + addr_str, peer_addr); + if (addr_str[0] == '!') { + negate = 1; + addr_str++; + } + + match = fnmatch (addr_str, + peer_addr, + 0); + + if (negate ? 
match : !match) { + free (addr_cpy); + return AUTH_ACCEPT; + } + addr_str = strtok_r (NULL, ADDR_DELIMITER, &tmp); + } + free (addr_cpy); + } + + return AUTH_DONT_CARE; +} + +struct volume_options options[] = { + { .key = {"auth.addr.*.allow"}, + .type = GF_OPTION_TYPE_ANY + }, + { .key = {"auth.addr.*.reject"}, + .type = GF_OPTION_TYPE_ANY + }, + /* Backword compatibility */ + { .key = {"auth.ip.*.allow"}, + .type = GF_OPTION_TYPE_ANY + }, + { .key = {NULL} } +}; diff --git a/auth/login/Makefile.am b/auth/login/Makefile.am new file mode 100644 index 000000000..d471a3f92 --- /dev/null +++ b/auth/login/Makefile.am @@ -0,0 +1,3 @@ +SUBDIRS = src + +CLEANFILES = diff --git a/auth/login/src/Makefile.am b/auth/login/src/Makefile.am new file mode 100644 index 000000000..eb7b990c2 --- /dev/null +++ b/auth/login/src/Makefile.am @@ -0,0 +1,13 @@ +auth_LTLIBRARIES = login.la +authdir = $(libdir)/glusterfs/$(PACKAGE_VERSION)/auth + +login_la_LDFLAGS = -module -avoidversion + +login_la_SOURCES = login.c +login_la_LIBADD = $(top_builddir)/libglusterfs/src/libglusterfs.la + + +AM_CFLAGS = -fPIC -D_FILE_OFFSET_BITS=64 -D_GNU_SOURCE -Wall -D$(GF_HOST_OS)\ + -I$(top_srcdir)/libglusterfs/src -shared -nostartfiles $(GF_CFLAGS) + +CLEANFILES = diff --git a/auth/login/src/login.c b/auth/login/src/login.c new file mode 100644 index 000000000..88c9f8206 --- /dev/null +++ b/auth/login/src/login.c @@ -0,0 +1,100 @@ +/* + Copyright (c) 2007, 2008 Z RESEARCH, Inc. + This file is part of GlusterFS. + + GlusterFS is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published + by the Free Software Foundation; either version 3 of the License, + or (at your option) any later version. + + GlusterFS is distributed in the hope that it will be useful, but + WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + General Public License for more details. 
+ + You should have received a copy of the GNU General Public License + along with this program. If not, see + . +*/ + +#ifndef _CONFIG_H +#define _CONFIG_H +#include "config.h" +#endif + +#include +#include "authenticate.h" + +auth_result_t gf_auth (dict_t *input_params, dict_t *config_params) +{ + char *username = NULL, *password = NULL; + data_t *allow_user = NULL, *username_data = NULL, *password_data = NULL; + int32_t result = AUTH_DONT_CARE; + char *brick_name = NULL, *searchstr = NULL; + + username_data = dict_get (input_params, "username"); + if (!username_data) + return AUTH_DONT_CARE; + + username = data_to_str (username_data); + + password_data = dict_get (input_params, "password"); + if (!password_data) + return AUTH_DONT_CARE; + + password = data_to_str (password_data); + + brick_name = data_to_str (dict_get (input_params, "remote-subvolume")); + if (!brick_name) { + gf_log ("auth/login", + GF_LOG_ERROR, + "remote-subvolume not specified"); + return AUTH_REJECT; + } + + asprintf (&searchstr, "auth.login.%s.allow", brick_name); + allow_user = dict_get (config_params, + searchstr); + free (searchstr); + + if (allow_user) { + char *username_str = NULL; + char *tmp; + char *username_cpy = strdup (allow_user->data); + + username_str = strtok_r (username_cpy, " ,", &tmp); + + while (username_str) { + data_t *passwd_data = NULL; + if (!fnmatch (username_str, + username, + 0)) { + asprintf (&searchstr, "auth.login.%s.password", username); + passwd_data = dict_get (config_params, searchstr); + if (!passwd_data) { + gf_log ("auth/login", + GF_LOG_DEBUG, + "wrong username/password combination"); + result = AUTH_REJECT; + } + else + result = !strcmp (data_to_str (passwd_data), password) ? 
AUTH_ACCEPT : AUTH_REJECT; + break; + } + username_str = strtok_r (NULL, " ,", &tmp); + } + free (username_cpy); + } + + return result; +} + +struct volume_options options[] = { + { .key = {"auth.login.*.allow"}, + .type = GF_OPTION_TYPE_ANY + }, + { .key = {"auth.login.*.password"}, + .type = GF_OPTION_TYPE_ANY + }, + { .key = {NULL} } +}; diff --git a/autogen.sh b/autogen.sh new file mode 100755 index 000000000..e20408bf2 --- /dev/null +++ b/autogen.sh @@ -0,0 +1,8 @@ +#!/bin/sh + +aclocal +autoheader +(libtoolize --automake --copy --force || glibtoolize --automake --copy --force) +autoconf +automake --add-missing --copy --foreign +cd argp-standalone;./autogen.sh diff --git a/booster/Makefile.am b/booster/Makefile.am new file mode 100644 index 000000000..e1c45f305 --- /dev/null +++ b/booster/Makefile.am @@ -0,0 +1 @@ +SUBDIRS=src \ No newline at end of file diff --git a/booster/src/Makefile.am b/booster/src/Makefile.am new file mode 100644 index 000000000..9b6e77f95 --- /dev/null +++ b/booster/src/Makefile.am @@ -0,0 +1,17 @@ +xlatordir = $(libdir)/glusterfs/$(PACKAGE_VERSION)/xlator/performance + +ldpreload_PROGRAMS = glusterfs-booster.so +ldpreloaddir = $(libdir)/glusterfs/ +glusterfs_booster_so_SOURCES = booster.c +glusterfs_booster_so_CFLAGS = -I$(top_srcdir)/libglusterfsclient/src/ -D_GNU_SOURCE -D$(GF_HOST_OS) -fPIC -Wall \ + -pthread $(GF_BOOSTER_CFLAGS) +glusterfs_booster_so_CPPFLAGS = -D_FILE_OFFSET_BITS=64 -D_GNU_SOURCE \ + -I$(top_srcdir)/libglusterfsclient/src \ + -I$(top_srcdir)/libglusterfs/src -DDATADIR=\"$(localstatedir)\" \ + -DCONFDIR=\"$(sysconfdir)/glusterfs\" +glusterfs_booster_so_LDFLAGS = -shared -nostartfiles +glusterfs_booster_so_LDADD = -L$(top_builddir)/libglusterfs/src -lglusterfs \ + -L$(top_builddir)/libglusterfsclient/src -lglusterfsclient + +CLEANFILES = + diff --git a/booster/src/booster.c b/booster/src/booster.c new file mode 100644 index 000000000..cf5f7883c --- /dev/null +++ b/booster/src/booster.c @@ -0,0 +1,920 @@ +/* + 
Copyright (c) 2007, 2008 Z RESEARCH, Inc. + This file is part of GlusterFS. + + GlusterFS is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published + by the Free Software Foundation; either version 3 of the License, + or (at your option) any later version. + + GlusterFS is distributed in the hope that it will be useful, but + WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program. If not, see + . +*/ + +#ifndef _CONFIG_H +#define _CONFIG_H +#include "config.h" +#endif + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#ifndef GF_UNIT_KB +#define GF_UNIT_KB 1024 +#endif + +#ifndef UNIX_PATH_MAX +#define UNIX_PATH_MAX 108 +#endif + +struct _inode; +struct _dict; +struct _fd { + pid_t pid; + struct list_head inode_list; + struct _inode *inode; + struct _dict *ctx; + int32_t refcount; +}; + +typedef struct _fdtable fdtable_t; +typedef struct _fd fd_t; + + +inline void +gf_fd_put (struct _fdtable *fdtable, int64_t fd); + +struct _fd * +gf_fd_fdptr_get (struct _fdtable *fdtable, int64_t fd); + +struct _fdtable * +gf_fd_fdtable_alloc (void); + +void +gf_fd_fdtable_destroy (struct _fdtable *); + +int32_t +gf_fd_unused_get (struct _fdtable *fdtable, struct _fd *fdptr); + +int32_t +gf_fd_unused_get2 (struct _fdtable *fdtable, struct _fd *fdptr, int64_t fd); + +void +fd_unref (struct _fd *fd); + +fd_t * +fd_ref (struct _fd *fd); + +pid_t +getpid (void); + +ssize_t +write (int fd, const void *buf, size_t count); + +/* open, open64, creat */ +static int (*real_open) (const char *pathname, int flags, ...); +static int (*real_open64) (const char *pathname, int flags, ...); +static int (*real_creat) (const 
char *pathname, mode_t mode); + +/* read, readv, pread, pread64 */ +static ssize_t (*real_read) (int fd, void *buf, size_t count); +static ssize_t (*real_readv) (int fd, const struct iovec *vector, int count); +static ssize_t (*real_pread) (int fd, void *buf, size_t count, unsigned long offset); +static ssize_t (*real_pread64) (int fd, void *buf, size_t count, uint64_t offset); + +/* write, writev, pwrite, pwrite64 */ +static ssize_t (*real_write) (int fd, const void *buf, size_t count); +static ssize_t (*real_writev) (int fd, const struct iovec *vector, int count); +static ssize_t (*real_pwrite) (int fd, const void *buf, size_t count, unsigned long offset); +static ssize_t (*real_pwrite64) (int fd, const void *buf, size_t count, uint64_t offset); + +/* lseek, llseek, lseek64 */ +static off_t (*real_lseek) (int fildes, unsigned long offset, int whence); +static off_t (*real_lseek64) (int fildes, uint64_t offset, int whence); + +/* close */ +static int (*real_close) (int fd); + +/* dup dup2 */ +static int (*real_dup) (int fd); +static int (*real_dup2) (int oldfd, int newfd); + +static pid_t (*real_fork) (void); + +#define RESOLVE(sym) do { \ + if (!real_##sym) \ + real_##sym = dlsym (RTLD_NEXT, #sym); \ + } while (0) + +/*TODO: set proper value */ +#define MOUNT_HASH_SIZE 256 + +struct booster_mount { + dev_t st_dev; + libglusterfs_handle_t handle; + struct list_head device_list; +}; +typedef struct booster_mount booster_mount_t; + +struct booster_mount_table { + pthread_mutex_t lock; + struct list_head *mounts; + int32_t hash_size; +}; +typedef struct booster_mount_table booster_mount_table_t; + +static fdtable_t *booster_glfs_fdtable = NULL; +static booster_mount_table_t *booster_mount_table = NULL; + +static int32_t +booster_put_handle (booster_mount_table_t *table, + dev_t st_dev, + libglusterfs_handle_t handle) +{ + int32_t hash = 0; + booster_mount_t *mount = NULL, *tmp = NULL; + int32_t ret = 0; + + mount = calloc (1, sizeof (*mount)); + if (!mount) { + 
return -1; + } + + // ERR_ABORT (mount); + INIT_LIST_HEAD (&mount->device_list); + mount->st_dev = st_dev; + mount->handle = handle; + + hash = st_dev % table->hash_size; + + pthread_mutex_lock (&table->lock); + { + list_for_each_entry (tmp, &table->mounts[hash], device_list) { + if (tmp->st_dev == st_dev) { + ret = -1; + errno = EEXIST; + goto unlock; + } + } + + list_add (&mount->device_list, &table->mounts[hash]); + } +unlock: + pthread_mutex_unlock (&table->lock); + + return ret; +} + + +static inline long +booster_get_glfs_fd (fdtable_t *fdtable, int fd) +{ + fd_t *glfs_fd = NULL; + + glfs_fd = gf_fd_fdptr_get (fdtable, fd); + return (long) glfs_fd; +} + + +static inline void +booster_put_glfs_fd (long glfs_fd) +{ + fd_unref ((fd_t *)glfs_fd); +} + + +static inline int32_t +booster_get_unused_fd (fdtable_t *fdtable, long glfs_fd, int fd) +{ + int32_t ret = -1; + ret = gf_fd_unused_get2 (fdtable, (fd_t *)glfs_fd, fd); + return ret; +} + + +static inline void +booster_put_fd (fdtable_t *fdtable, int fd) +{ + gf_fd_put (fdtable, fd); +} + + +static libglusterfs_handle_t +booster_get_handle (booster_mount_table_t *table, dev_t st_dev) +{ + int32_t hash = 0; + booster_mount_t *mount = NULL; + libglusterfs_handle_t handle = NULL; + + hash = st_dev % table->hash_size; + + pthread_mutex_lock (&table->lock); + { + list_for_each_entry (mount, &table->mounts[hash], device_list) { + if (mount->st_dev == st_dev) { + handle = mount->handle; + break; + } + } + } + pthread_mutex_unlock (&table->lock); + + return handle; +} + + +void +do_open (int fd, int flags, mode_t mode) +{ + char *specfile = NULL; + libglusterfs_handle_t handle; + int32_t file_size; + struct stat st = {0,}; + int32_t ret = -1; + + ret = fstat (fd, &st); + if (ret == -1) { + return; + } + + if (!booster_mount_table) { + return; + } + + handle = booster_get_handle (booster_mount_table, st.st_dev); + if (!handle) { + FILE *specfp = NULL; + + glusterfs_init_ctx_t ctx = { + .loglevel = "critical", + 
.lookup_timeout = 600, + .stat_timeout = 600, + }; + + file_size = fgetxattr (fd, "user.glusterfs-booster-volfile", NULL, 0); + if (file_size == -1) { + return; + } + + specfile = calloc (1, file_size); + if (!specfile) { + fprintf (stderr, "cannot allocate memory: %s\n", strerror (errno)); + return; + } + + ret = fgetxattr (fd, "user.glusterfs-booster-volfile", specfile, file_size); + if (ret == -1) { + free (specfile); + return ; + } + + specfp = tmpfile (); + if (!specfp) { + free (specfile); + return; + } + + ret = fwrite (specfile, file_size, 1, specfp); + if (ret != 1) { + fclose (specfp); + free (specfile); + } + + fseek (specfp, 0L, SEEK_SET); + + ctx.logfile = getenv ("GLFS_BOOSTER_LOGFILE"); + ctx.specfp = specfp; + + handle = glusterfs_init (&ctx); + + free (specfile); + fclose (specfp); + + if (!handle) { + return; + } + + ret = booster_put_handle (booster_mount_table, st.st_dev, handle); + if (ret == -1) { + glusterfs_fini (handle); + if (errno != EEXIST) { + return; + } + } + } + + if (handle) { + long glfs_fd; + char path [UNIX_PATH_MAX]; + ret = fgetxattr (fd, "user.glusterfs-booster-path", path, UNIX_PATH_MAX); + if (ret == -1) { + return; + } + + glfs_fd = glusterfs_open (handle, path, flags, mode); + if (glfs_fd) { + ret = booster_get_unused_fd (booster_glfs_fdtable, glfs_fd, fd); + if (ret == -1) { + glusterfs_close (glfs_fd); + return; + } + } + } + + return; +} + +#ifndef __USE_FILE_OFFSET64 +int +open (const char *pathname, int flags, ...) +{ + int ret; + mode_t mode = 0; + va_list ap; + + if (flags & O_CREAT) { + va_start (ap, flags); + mode = va_arg (ap, mode_t); + va_end (ap); + + ret = real_open (pathname, flags, mode); + } else { + ret = real_open (pathname, flags); + } + + if (ret != -1) { + flags &= ~ O_CREAT; + do_open (ret, flags, mode); + } + + return ret; +} +#endif + +#if defined (__USE_LARGEFILE64) || !defined (__USE_FILE_OFFSET64) +int +open64 (const char *pathname, int flags, ...) 
+{ + int ret; + mode_t mode = 0; + va_list ap; + + if (flags & O_CREAT) { + va_start (ap, flags); + mode = va_arg (ap, mode_t); + va_end (ap); + + ret = real_open64 (pathname, flags, mode); + } else { + ret = real_open64 (pathname, flags); + } + + if (ret != -1) { + flags &= ~O_CREAT; + do_open (ret, flags, mode); + } + + return ret; +} +#endif + +int +creat (const char *pathname, mode_t mode) +{ + int ret; + + ret = real_creat (pathname, mode); + + if (ret != -1) { + do_open (ret, O_WRONLY | O_TRUNC, mode); + } + + return ret; +} + + +/* pread */ + +ssize_t +pread (int fd, void *buf, size_t count, unsigned long offset) +{ + ssize_t ret; + long glfs_fd = 0; + + glfs_fd = booster_get_glfs_fd (booster_glfs_fdtable, fd); + if (!glfs_fd) { + ret = real_pread (fd, buf, count, offset); + } else { + ret = glusterfs_pread (glfs_fd, buf, count, offset); + if (ret == -1) { + ret = real_pread (fd, buf, count, offset); + } + booster_put_glfs_fd (glfs_fd); + } + + return ret; +} + + +ssize_t +pread64 (int fd, void *buf, size_t count, uint64_t offset) +{ + ssize_t ret; + long glfs_fd = 0; + + glfs_fd = booster_get_glfs_fd (booster_glfs_fdtable, fd); + if (!glfs_fd) { + ret = real_pread (fd, buf, count, offset); + } else { + ret = glusterfs_pread (glfs_fd, buf, count, offset); + if (ret == -1) { + ret = real_pread (fd, buf, count, offset); + } + } + + return ret; +} + + +ssize_t +read (int fd, void *buf, size_t count) +{ + int ret; + long glfs_fd; + + glfs_fd = booster_get_glfs_fd (booster_glfs_fdtable, fd); + if (!glfs_fd) { + ret = real_read (fd, buf, count); + } else { + uint64_t offset = 0; + offset = real_lseek64 (fd, 0L, SEEK_CUR); + if ((int64_t)offset != -1) { + ret = glusterfs_lseek (glfs_fd, offset, SEEK_SET); + if (ret != -1) { + ret = glusterfs_read (glfs_fd, buf, count); + } + } else { + ret = -1; + } + + if (ret == -1) { + ret = real_read (fd, buf, count); + } + + if (ret > 0 && ((int64_t) offset) >= 0) { + real_lseek64 (fd, ret + offset, SEEK_SET); + } + + 
booster_put_glfs_fd (glfs_fd); + } + + return ret; +} + + +ssize_t +readv (int fd, const struct iovec *vector, int count) +{ + int ret; + long glfs_fd = 0; + + glfs_fd = booster_get_glfs_fd (booster_glfs_fdtable, fd); + if (!glfs_fd) { + ret = real_readv (fd, vector, count); + } else { + uint64_t offset = 0; + offset = real_lseek64 (fd, 0L, SEEK_CUR); + if ((int64_t)offset != -1) { + ret = glusterfs_lseek (glfs_fd, offset, SEEK_SET); + if (ret != -1) { + ret = glusterfs_readv (glfs_fd, vector, count); + } + } else { + ret = -1; + } + + ret = glusterfs_readv (glfs_fd, vector, count); + if (ret > 0) { + real_lseek64 (fd, offset + ret, SEEK_SET); + } + + booster_put_glfs_fd (glfs_fd); + } + + return ret; +} + + +ssize_t +write (int fd, const void *buf, size_t count) +{ + int ret; + long glfs_fd = 0; + + glfs_fd = booster_get_glfs_fd (booster_glfs_fdtable, fd); + + if (!glfs_fd) { + ret = real_write (fd, buf, count); + } else { + uint64_t offset = 0; + offset = real_lseek64 (fd, 0L, SEEK_CUR); + if (((int64_t) offset) != -1) { + ret = glusterfs_lseek (glfs_fd, offset, SEEK_SET); + if (ret != -1) { + ret = glusterfs_write (glfs_fd, buf, count); + } + } else { + ret = -1; + } + + if (ret == -1) { + ret = real_write (fd, buf, count); + } + + if (ret > 0 && ((int64_t) offset) >= 0) { + real_lseek64 (fd, offset + ret, SEEK_SET); + } + booster_put_glfs_fd (glfs_fd); + } + + return ret; +} + +ssize_t +writev (int fd, const struct iovec *vector, int count) +{ + int ret = 0; + long glfs_fd = 0; + + glfs_fd = booster_get_glfs_fd (booster_glfs_fdtable, fd); + + if (!glfs_fd) { + ret = real_writev (fd, vector, count); + } else { + uint64_t offset = 0; + offset = real_lseek64 (fd, 0L, SEEK_CUR); + + if (((int64_t) offset) != -1) { + ret = glusterfs_lseek (glfs_fd, offset, SEEK_SET); + if (ret != -1) { + ret = glusterfs_writev (glfs_fd, vector, count); + } + } else { + ret = -1; + } + +/* ret = glusterfs_writev (glfs_fd, vector, count); */ + if (ret == -1) { + ret = real_writev (fd, 
vector, count); + } + + if (ret > 0 && ((int64_t)offset) >= 0) { + real_lseek64 (fd, offset + ret, SEEK_SET); + } + + booster_put_glfs_fd (glfs_fd); + } + + return ret; +} + + +ssize_t +pwrite (int fd, const void *buf, size_t count, unsigned long offset) +{ + int ret; + long glfs_fd = 0; + + assert (real_pwrite != NULL); + + glfs_fd = booster_get_glfs_fd (booster_glfs_fdtable, fd); + + if (!glfs_fd) { + ret = real_pwrite (fd, buf, count, offset); + } else { + ret = glusterfs_pwrite (glfs_fd, buf, count, offset); + if (ret == -1) { + ret = real_pwrite (fd, buf, count, offset); + } + booster_put_glfs_fd (glfs_fd); + } + + return ret; +} + + +ssize_t +pwrite64 (int fd, const void *buf, size_t count, uint64_t offset) +{ + int ret; + long glfs_fd = 0; + + glfs_fd = booster_get_glfs_fd (booster_glfs_fdtable, fd); + + if (!glfs_fd) { + ret = real_pwrite64 (fd, buf, count, offset); + } else { + ret = glusterfs_pwrite (glfs_fd, buf, count, offset); + if (ret == -1) { + ret = real_pwrite64 (fd, buf, count, offset); + } + } + + return ret; +} + + +int +close (int fd) +{ + int ret = -1; + long glfs_fd = 0; +/* struct stat st = {0,}; */ + +/* ret = fstat (fd, &st); + if (ret != -1) { + libglusterfs_handle_t handle = 0; + handle = booster_get_handle (booster_mount_table, st.st_dev); + if (handle) { */ + glfs_fd = booster_get_glfs_fd (booster_glfs_fdtable, fd); + + if (glfs_fd) { + booster_put_fd (booster_glfs_fdtable, fd); + ret = glusterfs_close (glfs_fd); + booster_put_glfs_fd (glfs_fd); + } +/*} + }*/ + + ret = real_close (fd); + + return ret; +} + +#ifndef _LSEEK_DECLARED +#define _LSEEK_DECLARED +off_t +lseek (int filedes, unsigned long offset, int whence) +{ + int ret; + long glfs_fd = 0; + + ret = real_lseek (filedes, offset, whence); + + glfs_fd = booster_get_glfs_fd (booster_glfs_fdtable, filedes); + if (glfs_fd) { + ret = glusterfs_lseek (glfs_fd, offset, whence); + booster_put_glfs_fd (glfs_fd); + } + + return ret; +} +#endif + +off_t +lseek64 (int filedes, uint64_t 
offset, int whence) +{ + int ret; + long glfs_fd = 0; + + ret = real_lseek64 (filedes, offset, whence); + + glfs_fd = booster_get_glfs_fd (booster_glfs_fdtable, filedes); + if (glfs_fd) { + ret = glusterfs_lseek (glfs_fd, offset, whence); + booster_put_glfs_fd (glfs_fd); + } + + return ret; +} + +int +dup (int oldfd) +{ + int ret = -1, new_fd = -1; + long glfs_fd = 0; + + glfs_fd = booster_get_glfs_fd (booster_glfs_fdtable, oldfd); + new_fd = real_dup (oldfd); + + if (new_fd >=0 && glfs_fd) { + ret = booster_get_unused_fd (booster_glfs_fdtable, glfs_fd, new_fd); + fd_ref ((fd_t *)glfs_fd); + if (ret == -1) { + real_close (new_fd); + } + } + + if (glfs_fd) { + booster_put_glfs_fd (glfs_fd); + } + + return new_fd; +} + + +int +dup2 (int oldfd, int newfd) +{ + int ret = -1; + long old_glfs_fd = 0, new_glfs_fd = 0; + + if (oldfd == newfd) { + return newfd; + } + + old_glfs_fd = booster_get_glfs_fd (booster_glfs_fdtable, oldfd); + new_glfs_fd = booster_get_glfs_fd (booster_glfs_fdtable, newfd); + + ret = real_dup2 (oldfd, newfd); + if (ret >= 0) { + if (new_glfs_fd) { + glusterfs_close (new_glfs_fd); + booster_put_glfs_fd (new_glfs_fd); + booster_put_fd (booster_glfs_fdtable, newfd); + new_glfs_fd = 0; + } + + if (old_glfs_fd) { + ret = booster_get_unused_fd (booster_glfs_fdtable, old_glfs_fd, newfd); + fd_ref ((fd_t *)old_glfs_fd); + if (ret == -1) { + real_close (newfd); + } + } + } + + if (old_glfs_fd) { + booster_put_glfs_fd (old_glfs_fd); + } + + if (new_glfs_fd) { + booster_put_glfs_fd (new_glfs_fd); + } + + return ret; +} + + +#define MOUNT_TABLE_HASH_SIZE 256 + + +static int +booster_init (void) +{ + int i = 0; + booster_glfs_fdtable = gf_fd_fdtable_alloc (); + if (!booster_glfs_fdtable) { + fprintf (stderr, "cannot allocate fdtable: %s\n", strerror (errno)); + goto err; + } + + booster_mount_table = calloc (1, sizeof (*booster_mount_table)); + if (!booster_mount_table) { + fprintf (stderr, "cannot allocate memory: %s\n", strerror (errno)); + goto err; + } + + 
pthread_mutex_init (&booster_mount_table->lock, NULL); + booster_mount_table->hash_size = MOUNT_TABLE_HASH_SIZE; + booster_mount_table->mounts = calloc (booster_mount_table->hash_size, sizeof (*booster_mount_table->mounts)); + if (!booster_mount_table->mounts) { + fprintf (stderr, "cannot allocate memory: %s\n", strerror (errno)); + goto err; + } + + for (i = 0; i < booster_mount_table->hash_size; i++) + { + INIT_LIST_HEAD (&booster_mount_table->mounts[i]); + } + + return 0; + +err: + if (booster_glfs_fdtable) { + gf_fd_fdtable_destroy (booster_glfs_fdtable); + booster_glfs_fdtable = NULL; + } + + if (booster_mount_table) { + if (booster_mount_table->mounts) { + free (booster_mount_table->mounts); + } + + free (booster_mount_table); + booster_mount_table = NULL; + } + return -1; +} + + +static void +booster_cleanup (void) +{ + int i; + booster_mount_t *mount = NULL, *tmp = NULL; + + /* gf_fd_fdtable_destroy (booster_glfs_fdtable);*/ + /*for (i=0; i < booster_glfs_fdtable->max_fds; i++) { + if (booster_glfs_fdtable->fds[i]) { + fd_t *fd = booster_glfs_fdtable->fds[i]; + free (fd); + } + }*/ + + free (booster_glfs_fdtable); + booster_glfs_fdtable = NULL; + + pthread_mutex_lock (&booster_mount_table->lock); + { + for (i = 0; i < booster_mount_table->hash_size; i++) + { + list_for_each_entry_safe (mount, tmp, + &booster_mount_table->mounts[i], device_list) { + list_del (&mount->device_list); + glusterfs_fini (mount->handle); + free (mount); + } + } + free (booster_mount_table->mounts); + } + pthread_mutex_unlock (&booster_mount_table->lock); + + glusterfs_reset (); + free (booster_mount_table); + booster_mount_table = NULL; +} + + + +pid_t +fork (void) +{ + pid_t pid = 0; + char child = 0; + + glusterfs_log_lock (); + { + pid = real_fork (); + } + glusterfs_log_unlock (); + + child = (pid == 0); + if (child) { + booster_cleanup (); + booster_init (); + } + + return pid; +} + + +void +_init (void) +{ + booster_init (); + + RESOLVE (open); + RESOLVE (open64); + RESOLVE 
(creat); + + RESOLVE (read); + RESOLVE (readv); + RESOLVE (pread); + RESOLVE (pread64); + + RESOLVE (write); + RESOLVE (writev); + RESOLVE (pwrite); + RESOLVE (pwrite64); + + RESOLVE (lseek); + RESOLVE (lseek64); + + RESOLVE (close); + + RESOLVE (dup); + RESOLVE (dup2); + + RESOLVE (fork); +} + diff --git a/commit.sh b/commit.sh new file mode 100755 index 000000000..26318959d --- /dev/null +++ b/commit.sh @@ -0,0 +1,6 @@ +#!/bin/sh + +export EDITOR="emacs" +#TLA_REVISION=$(expr 1 + $(cat ./libglusterfs/src/revision.h | cut -f 8 -d '-' | sed -e 's/"//')) +#sed -i "s/AC_INIT.*/AC_INIT([glusterfs],[2.0.0tla${TLA_REVISION}],[gluster-users@gluster.org])/g" ./configure.ac +tla commit --write-revision ./libglusterfs/src/revision.h:'#define GLUSTERFS_REPOSITORY_REVISION "%s"' "$@" diff --git a/configure.ac b/configure.ac new file mode 100644 index 000000000..6fb7838df --- /dev/null +++ b/configure.ac @@ -0,0 +1,554 @@ +dnl Copyright (c) 2006, 2007, 2008 Z RESEARCH, Inc. +dnl This file is part of GlusterFS. +dnl +dnl GlusterFS is free software; you can redistribute it and/or modify +dnl it under the terms of the GNU General Public License as published by +dnl the Free Software Foundation; either version 3 of the License, or +dnl (at your option) any later version. +dnl +dnl GlusterFS is distributed in the hope that it will be useful, +dnl but WITHOUT ANY WARRANTY; without even the implied warranty of +dnl MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +dnl GNU General Public License for more details. +dnl +dnl You should have received a copy of the GNU General Public License +dnl along with this program. If not, see . 
+ +AC_INIT([glusterfs],[2.0.0tla],[gluster-users@gluster.org]) + +AM_INIT_AUTOMAKE + +AM_CONFIG_HEADER([config.h]) + +AC_CONFIG_FILES([Makefile + libglusterfs/Makefile + libglusterfs/src/Makefile + libglusterfsclient/Makefile + libglusterfsclient/src/Makefile + mod_glusterfs/Makefile + mod_glusterfs/apache/Makefile + mod_glusterfs/apache/1.3/Makefile + mod_glusterfs/apache/1.3/src/Makefile + mod_glusterfs/apache/2.2/Makefile + mod_glusterfs/apache/2.2/src/Makefile + mod_glusterfs/lighttpd/Makefile + mod_glusterfs/lighttpd/1.4/Makefile + mod_glusterfs/lighttpd/1.5/Makefile + glusterfsd/Makefile + glusterfsd/src/Makefile + booster/Makefile + booster/src/Makefile + xlators/Makefile + xlators/mount/Makefile + xlators/mount/fuse/Makefile + xlators/mount/fuse/src/Makefile + xlators/mount/fuse/utils/mount.glusterfs + xlators/mount/fuse/utils/mount_glusterfs + xlators/mount/fuse/utils/Makefile + xlators/storage/Makefile + xlators/storage/posix/Makefile + xlators/storage/posix/src/Makefile + xlators/storage/bdb/Makefile + xlators/storage/bdb/src/Makefile + xlators/cluster/Makefile + xlators/cluster/unify/Makefile + xlators/cluster/unify/src/Makefile + xlators/cluster/afr/Makefile + xlators/cluster/afr/src/Makefile + xlators/cluster/stripe/Makefile + xlators/cluster/stripe/src/Makefile + xlators/cluster/dht/Makefile + xlators/cluster/dht/src/Makefile + xlators/cluster/ha/Makefile + xlators/cluster/ha/src/Makefile + xlators/cluster/map/Makefile + xlators/cluster/map/src/Makefile + xlators/performance/Makefile + xlators/performance/write-behind/Makefile + xlators/performance/write-behind/src/Makefile + xlators/performance/read-ahead/Makefile + xlators/performance/read-ahead/src/Makefile + xlators/performance/io-threads/Makefile + xlators/performance/io-threads/src/Makefile + xlators/performance/io-cache/Makefile + xlators/performance/io-cache/src/Makefile + xlators/performance/symlink-cache/Makefile + xlators/performance/symlink-cache/src/Makefile + xlators/debug/Makefile + 
xlators/debug/trace/Makefile + xlators/debug/trace/src/Makefile + xlators/debug/error-gen/Makefile + xlators/debug/error-gen/src/Makefile + xlators/protocol/Makefile + xlators/protocol/client/Makefile + xlators/protocol/client/src/Makefile + xlators/protocol/server/Makefile + xlators/protocol/server/src/Makefile + xlators/features/Makefile + xlators/features/locks/Makefile + xlators/features/locks/src/Makefile + xlators/features/path-convertor/Makefile + xlators/features/path-convertor/src/Makefile + xlators/features/trash/Makefile + xlators/features/trash/src/Makefile + xlators/features/filter/Makefile + xlators/features/filter/src/Makefile + xlators/features/quota/Makefile + xlators/features/quota/src/Makefile + xlators/encryption/Makefile + xlators/encryption/rot-13/Makefile + xlators/encryption/rot-13/src/Makefile + scheduler/Makefile + scheduler/alu/Makefile + scheduler/alu/src/Makefile + scheduler/random/Makefile + scheduler/random/src/Makefile + scheduler/nufa/Makefile + scheduler/nufa/src/Makefile + scheduler/rr/Makefile + scheduler/rr/src/Makefile + scheduler/switch/Makefile + scheduler/switch/src/Makefile + transport/Makefile + transport/socket/Makefile + transport/socket/src/Makefile + transport/ib-verbs/Makefile + transport/ib-verbs/src/Makefile + auth/Makefile + auth/addr/Makefile + auth/addr/src/Makefile + auth/login/Makefile + auth/login/src/Makefile + doc/Makefile + doc/examples/Makefile + doc/hacker-guide/Makefile + doc/user-guide/Makefile + extras/Makefile + extras/init.d/Makefile + extras/init.d/glusterfs-server.plist + extras/benchmarking/Makefile + extras/test/Makefile + glusterfs.spec]) + +AC_CANONICAL_HOST + +AC_PROG_CC +AC_PROG_LIBTOOL + +# LEX needs a check +AC_PROG_LEX +if test "x${LEX}" != "xflex" -a "x${FLEX}" != "xlex"; then + AC_MSG_ERROR([Flex or lex required to build glusterfs.]) +fi + +# YACC needs a check +AC_PROG_YACC +if test "x${YACC}" = "xbyacc" -o "x${YACC}" = "xyacc" -o "x${YACC}" = "x"; then + AC_MSG_ERROR([GNU Bison 
required to build glusterfs.]) +fi + +AC_CHECK_TOOL([LD],[ld]) + +AC_CHECK_LIB([pthread], [pthread_mutex_init], , AC_MSG_ERROR([Posix threads library is required to build glusterfs])) + +AC_CHECK_FUNC([dlopen], [has_dlopen=yes], AC_CHECK_LIB([dl], [dlopen], , AC_MSG_ERROR([Dynamic linking library required to build glusterfs]))) + + +AC_CHECK_HEADERS([sys/xattr.h]) + +AC_CHECK_HEADERS([sys/extattr.h]) + +dnl Mac OS X does not have spinlocks +AC_CHECK_FUNC([pthread_spin_init], [have_spinlock=yes]) +if test "x${have_spinlock}" = "xyes"; then + AC_DEFINE(HAVE_SPINLOCK, 1, [define if found spinlock]) +fi +AC_SUBST(HAVE_SPINLOCK) + +dnl some os may not have GNU defined strnlen function +AC_CHECK_FUNC([strnlen], [have_strnlen=yes]) +if test "x${have_strnlen}" = "xyes"; then + AC_DEFINE(HAVE_STRNLEN, 1, [define if found strnlen]) +fi +AC_SUBST(HAVE_STRNLEN) + + +AC_CHECK_FUNC([setfsuid], [have_setfsuid=yes]) +AC_CHECK_FUNC([setfsgid], [have_setfsgid=yes]) + +if test "x${have_setfsuid}" = "xyes" -a "x${have_setfsgid}" = "xyes"; then + AC_DEFINE(HAVE_SET_FSID, 1, [define if found setfsuid setfsgid]) +fi + + +# LIBGLUSTERFSCLIENT section +AC_ARG_ENABLE([libglusterfsclient], + AC_HELP_STRING([--disable-libglusterfsclient], + [Do not build libglusterfsclient])) + +BUILD_LIBGLUSTERFSCLIENT="no" + +if test "x$enable_libglusterfsclient" != "xno"; then + LIBGLUSTERFSCLIENT_SUBDIR="libglusterfsclient" + BUILD_LIBGLUSTERFSCLIENT="yes" +fi + +AC_SUBST(LIBGLUSTERFSCLIENT_SUBDIR) +# end LIBGLUSTERFSCLIENT section + + +# MOD_GLUSTERFS section +AC_ARG_ENABLE([mod_glusterfs], + AC_HELP_STRING([--disable-mod_glusterfs], + [Do not build glusterfs module for webserver. 
Currently supported module is for apache/1.3.x])) + +if test "x$enable_mod_glusterfs" != "xno"; then + AC_ARG_WITH([apxs], + AC_HELP_STRING([--with-apxs], + [directory containing apxs binary])) + if test "x$with_apxs" != "x"; then + APXS_BIN=$with_apxs + else + APXS_BIN="$PATH" + fi + AC_CHECK_TOOL([APXS],[apxs], ["no"], [$APXS_BIN]) + if test "X$APXS" = "Xno"; then + HAVE_APXS="no"; + else + if test "x$with_apxs" != "x"; then + APXS="$with_apxs/apxs"; + fi + HAVE_APXS="yes"; + fi + + HAVE_LIBGLUSTERFSCLIENT="no"; + if test "x$BUILD_LIBGLUSTERFSCLIENT" = "xyes"; then + HAVE_LIBGLUSTERFSCLIENT="yes"; + fi + + AC_ARG_WITH([apxspath], + AC_HELP_STRING([--with-apxspath], + [Path to apxs binary])) + + AC_ARG_WITH([apachepath], + AC_HELP_STRING([--with-apachepath], + [Path to apache binary])) +fi + +if test "x$enable_mod_glusterfs" = "xyes" -a "x$HAVE_APXS" = "xno"; then + echo "apxs is required to build mod_glusterfs. Use --with-apxs to specify path to apxs. If mod_glusterfs is not required, do not pass --enable-mod_glusterfs option to configure " + exit 1 +fi + +if test "x$enable_mod_glusterfs" = "xyes" -a "x$HAVE_LIBGLUSTERFSCLIENT" = "xno"; then + echo "libglusterfsclient is required to build mod_glusterfs. Do not specify --disable-libglusterfsclient to configure script. 
If mod_glusterfs is not required, do not pass --enable-mod_glusterfs option to configure " + exit 1 +fi + +BUILD_MOD_GLUSTERFS=no +MOD_GLUSTERFS_HTTPD_VERSION="" + +if test "x$enable_mod_glusterfs" != "xno" -a "x$HAVE_APXS" = "xyes" -a "x$HAVE_LIBGLUSTERFSCLIENT" = "xyes"; then + BUILD_MOD_GLUSTERFS="yes"; + MOD_GLUSTERFS_SUBDIR="mod_glusterfs"; +fi + +if test "x$BUILD_MOD_GLUSTERFS" = "xyes"; then + HTTPD_BIN_DIR=`$APXS -q SBINDIR` + MOD_GLUSTERFS_HTTPD_VERSION=`$HTTPD_BIN_DIR/httpd -V | head -1 | awk "{print $3}" | sed 's/[[^0-9.]]//g' | sed 's/\(.*\..*\)\..*/\1/'` +fi + +if test "x$with_apxspath" != "x"; then + APXS_MANUAL=$with_apxspath +fi + +if test "x$with_apachepath" != "x"; then + HTTPD_MANUAL=$with_apachepath +fi + +if test "x$enable_mod_glusterfs" != "xno" -a "x$with_apxspath" != "x" -a "x$with_apachepath" != "x"; then + BUILD_MOD_GLUSTERFS="yes"; + MOD_GLUSTERFS_SUBDIR="mod_glusterfs"; + APACHE_MANUAL=yes +fi + +if test "x$APACHE_MANUAL" = "xyes"; then + HTTPD_BIN_DIR=`$APXS_MANUAL -q SBINDIR` + MOD_GLUSTERFS_HTTPD_VERSION=`$HTTPD_MANUAL -V | head -1 | awk "{print $3}" | sed 's/[[^0-9.]]//g' | sed 's/\(.*\..*\)\..*/\1/'` + APXS=$APXS_MANUAL +fi + +AC_SUBST(MOD_GLUSTERFS_SUBDIR) +AC_SUBST(APXS) +AC_SUBST(MOD_GLUSTERFS_HTTPD_VERSION) +# end MOD_GLUSTERFS section + + +# FUSE section +# TODO: make a clean version check of libfuse +AC_ARG_ENABLE([fuse-client], + AC_HELP_STRING([--disable-fuse-client], + [Do not build the fuse client. NOTE: you cannot mount glusterfs without the client])) + +if test "x$enable_fuse_client" != "xno"; then + AC_CHECK_LIB([fuse], + [fuse_req_interrupt_func], + [HAVE_LIBFUSE="yes"], + [HAVE_LIBFUSE="no"]) +fi + +if test "x$enable_fuse_client" = "xyes" -a "x$HAVE_LIBFUSE" = "xno"; then + echo "FUSE requested but not found." 
+ exit 1 +fi + +BUILD_FUSE_CLIENT=no +if test "x$enable_fuse_client" != "xno" -a "x$HAVE_LIBFUSE" = "xyes"; then + FUSE_CLIENT_SUBDIR=fuse + BUILD_FUSE_CLIENT="yes" +fi + +AC_SUBST(FUSE_CLIENT_SUBDIR) +# end FUSE section + + +# EPOLL section +AC_ARG_ENABLE([epoll], + AC_HELP_STRING([--disable-epoll], + [Use poll instead of epoll.])) + +BUILD_EPOLL=no +if test "x$enable_epoll" != "xno"; then + AC_CHECK_HEADERS([sys/epoll.h], + [BUILD_EPOLL=yes], + [BUILD_EPOLL=no]) +fi +# end EPOLL section + + +# IBVERBS section +AC_ARG_ENABLE([ibverbs], + AC_HELP_STRING([--disable-ibverbs], + [Do not build the ibverbs transport])) + +if test "x$enable_ibverbs" != "xno"; then + AC_CHECK_LIB([ibverbs], + [ibv_get_device_list], + [HAVE_LIBIBVERBS="yes"], + [HAVE_LIBIBVERBS="no"]) +fi + +if test "x$enable_ibverbs" = "xyes" -a "x$HAVE_LIBIBVERBS" = "xno"; then + echo "ibverbs requested but not found." + exit 1 +fi + + +BUILD_IBVERBS=no +if test "x$enable_ibverbs" != "xno" -a "x$HAVE_LIBIBVERBS" = "xyes"; then + IBVERBS_SUBDIR=ib-verbs + BUILD_IBVERBS=yes +fi + +AC_SUBST(IBVERBS_SUBDIR) +# end IBVERBS section + + +# Berkely-DB section +# storage/bdb requires Berkeley-DB version 4.6.21 or higher +_GLFS_DB_VERSION_MAJOR=4 +_GLFS_DB_VERSION_MINOR=6 +_GLFS_DB_VERSION_PATCH=21 +AC_ARG_ENABLE([db], + AC_HELP_STRING([--disable-bdb], + [Do not build the Berkeley-DB translator])) + +if test "x$enable_bdb" != "xno"; then + AC_CHECK_HEADERS([db.h], + [HAVE_BDB="yes"], + [HAVE_BDB="no"]) + if test "x$HAVE_BDB" = "xyes"; then + AC_CHECK_LIB([db], + [db_create], + [HAVE_BDB="yes"], + [HAVE_BDB="no"]) + fi + + if test "x$HAVE_BDB" = "xyes"; then + AC_TRY_COMPILE([#include ], + #if (DB_VERSION_MAJOR < $_GLFS_DB_VERSION_MAJOR) ||\ + (DB_VERSION_MAJOR == $_GLFS_DB_VERSION_MAJOR && \ + DB_VERSION_MINOR < $_GLFS_DB_VERSION_MINOR) || \ + (DB_VERSION_MAJOR == $_GLFS_DB_VERSION_MAJOR && \ + DB_VERSION_MINOR == $_GLFS_DB_VERSION_MINOR && \ + DB_VERSION_PATCH < $_GLFS_DB_VERSION_PATCH) + #error "bdb older than 
required" + #endif + , + [HAVE_BDB_VERSION="yes"], + [HAVE_BDB_VERSION="no"]) + + dnl check for DB->stat having 4 arguments. + AC_COMPILE_IFELSE([AC_LANG_PROGRAM([[#include ]], + [[DB *bdb; bdb->stat (NULL, NULL, NULL, 0);]])], + [HAVE_BDB_VERSION=yes], [HAVE_BDB_VERSION=no]) + + dnl check for DBC->c_get presence. + AC_COMPILE_IFELSE([AC_LANG_PROGRAM([[#include ]], + [[DBC *cursor; cursor->get (NULL, NULL, NULL, 0);]])], + [HAVE_BDB_CURSOR_GET=yes], [HAVE_BDB_CURSOR_GET=no]) + + fi +fi + +if test "x$HAVE_BDB_CURSOR_GET" = "xyes" -a "x$HAVE_BDB_VERSION" = "xyes"; then + AC_DEFINE(HAVE_BDB_CURSOR_GET, 1, [Berkeley-DB version has cursor->get()]) +fi + +if test "x$enable_bdb" = "xyes" -a "x$HAVE_BDB" = "xno" -a "x$HAVE_BDB_VERSION" = "xno" -a "x$HAVE_BDB_CURSOR_GET" = "xno"; then + echo "Berkeley-DB requested but not found. glusterfs bdb feature requires db version 4.6.21 or higher" + exit 1 +fi + + +BUILD_BDB=no +if test "x$enable_bdb" != "xno" -a "x$HAVE_BDB" = "xyes"; then + BDB_SUBDIR=bdb + BUILD_BDB=yes +fi + + + +AC_SUBST(BDB_SUBDIR) +# end BDB section + +dnl FreeBSD > 5 has execinfo as a Ported library for giving a workaround +dnl solution to GCC backtrace functionality + +AC_CHECK_HEADERS([execinfo.h], [have_backtrace=yes], + AC_CHECK_LIB([execinfo], [backtrace], [have_backtrace=yes])) +dnl AC_MSG_ERROR([libexecinfo not found libexecinfo required.]))) + +if test "x${have_backtrace}" = "xyes"; then + AC_DEFINE(HAVE_BACKTRACE, 1, [define if found backtrace]) +fi +AC_SUBST(HAVE_BACKTRACE) + +dnl glusterfs prints memory usage to stderr by sending it SIGUSR1 +AC_CHECK_FUNC([malloc_stats], [have_malloc_stats=yes]) +if test "x${have_malloc_stats}" = "xyes"; then + AC_DEFINE(HAVE_MALLOC_STATS, 1, [define if found malloc_stats]) +fi +AC_SUBST(HAVE_MALLOC_STATS) + +dnl Linux, Solaris, Cygwin +AC_CHECK_MEMBERS([struct stat.st_atim.tv_nsec]) +dnl FreeBSD, NetBSD +AC_CHECK_MEMBERS([struct stat.st_atimespec.tv_nsec]) + +dnl Check for argp +AC_CHECK_HEADER([argp.h], 
AC_DEFINE(HAVE_ARGP, 1, [have argp])) +AC_CONFIG_SUBDIRS(argp-standalone) +BUILD_ARGP_STANDALONE=no +if test "x${ac_cv_header_argp_h}" = "xno"; then + BUILD_ARGP_STANDALONE=yes + ARGP_STANDALONE_CPPFLAGS='-I${top_srcdir}/argp-standalone' + ARGP_STANDALONE_LDADD='${top_builddir}/argp-standalone/libargp.a' +fi + +AC_SUBST(ARGP_STANDALONE_CPPFLAGS) +AC_SUBST(ARGP_STANDALONE_LDADD) + +AC_CHECK_HEADER([malloc.h], AC_DEFINE(HAVE_MALLOC_H, 1, [have malloc.h])) + +AC_CHECK_FUNC([llistxattr], [have_llistxattr=yes]) +if test "x${have_llistxattr}" = "xyes"; then + AC_DEFINE(HAVE_LLISTXATTR, 1, [define if llistxattr exists]) +fi + +AC_CHECK_FUNC([fdatasync], [have_fdatasync=yes]) +if test "x${have_fdatasync}" = "xyes"; then + AC_DEFINE(HAVE_FDATASYNC, 1, [define if fdatasync exists]) +fi + +GF_HOST_OS="" +GF_LDFLAGS="-rdynamic" + +if test "x$HAVE_LIBGLUSTERFSCLIENT" = "xyes"; then + GF_BOOSTER_SUBDIR="booster" +fi + +GF_FUSE_LDADD="-lfuse" +case $host_os in + linux*) + dnl GF_LINUX_HOST_OS=1 + GF_HOST_OS="GF_LINUX_HOST_OS" + GF_CFLAGS="${ARGP_STANDALONE_CPPFLAGS}" + GF_GLUSTERFS_CFLAGS="${GF_CFLAGS}" + GF_LDADD="${ARGP_STANDALONE_LDADD}" + ;; + solaris*) + GF_HOST_OS="GF_SOLARIS_HOST_OS" + GF_CFLAGS="${ARGP_STANDALONE_CPPFLAGS} -D_REENTRANT" + GF_LDFLAGS="" + GF_GLUSTERFS_CFLAGS="${GF_CFLAGS}" + GF_LDADD="${ARGP_STANDALONE_LDADD}" + GF_GLUSTERFS_LDFLAGS="-lnsl -lresolv -lsocket" + GF_BOOSTER_SUBDIR="" + ;; + *bsd*) + GF_HOST_OS="GF_BSD_HOST_OS" + GF_CFLAGS="${ARGP_STANDALONE_CPPFLAGS}" + GF_GLUSTERFS_CFLAGS="${GF_CFLAGS}" + GF_LDADD="${ARGP_STANDALONE_LDADD}" + if test "x$ac_cv_header_execinfo_h" = "xyes"; then + GF_GLUSTERFS_LDFLAGS="-lexecinfo" + fi + GF_FUSE_LDADD="-liconv -lfuse" + BUILD_MOD_GLUSTERFS=no + MOD_GLUSTERFS_SUBDIR="" + BUILD_LIBGLUSTERFSCLIENT=no + LIBGLUSTERFSCLIENT_SUBDIR="" + GF_BOOSTER_SUBDIR="" + ;; + darwin*) + GF_HOST_OS="GF_DARWIN_HOST_OS" + LIBTOOL=glibtool + GF_CFLAGS="${ARGP_STANDALONE_CPPFLAGS} -D__DARWIN_64_BIT_INO_T -bundle -undefined suppress 
-flat_namespace" + GF_GLUSTERFS_CFLAGS="${ARGP_STANDALONE_CPPFLAGS} -D__DARWIN_64_BIT_INO_T -undefined suppress -flat_namespace" + GF_LDADD="${ARGP_STANDALONE_LDADD}" + GF_FUSE_LDADD="-liconv -lfuse_ino64" + BUILD_MOD_GLUSTERFS=no + MOD_GLUSTERFS_SUBDIR="" + BUILD_LIBGLUSTERFSCLIENT=no + LIBGLUSTERFSCLIENT_SUBDIR="" + GF_BOOSTER_SUBDIR="" + ;; +esac + +AC_SUBST(GF_HOST_OS) +AC_SUBST(GF_GLUSTERFS_LDFLAGS) +AC_SUBST(GF_GLUSTERFS_CFLAGS) +AC_SUBST(GF_CFLAGS) +AC_SUBST(GF_LDFLAGS) +AC_SUBST(GF_LDADD) +AC_SUBST(GF_FUSE_LDADD) +AC_SUBST(GF_BOOSTER_SUBDIR) + +AM_CONDITIONAL([GF_DARWIN_HOST_OS], test "${GF_HOST_OS}" = "GF_DARWIN_HOST_OS") + +AC_OUTPUT + +exec >&2 + +echo +echo "GlusterFS configure summary" +echo "===========================" +echo "FUSE client : $BUILD_FUSE_CLIENT" +echo "Infiniband verbs : $BUILD_IBVERBS" +echo "epoll IO multiplex : $BUILD_EPOLL" +echo "Berkeley-DB : $BUILD_BDB" +echo "libglusterfsclient : $BUILD_LIBGLUSTERFSCLIENT" +echo "mod_glusterfs : $BUILD_MOD_GLUSTERFS ($MOD_GLUSTERFS_HTTPD_VERSION)" +echo "argp-standalone : $BUILD_ARGP_STANDALONE" +echo diff --git a/doc/Makefile.am b/doc/Makefile.am new file mode 100644 index 000000000..83f88320c --- /dev/null +++ b/doc/Makefile.am @@ -0,0 +1,11 @@ +EXTRA_DIST = glusterfs.vol.sample glusterfsd.vol.sample glusterfs.8 \ + porting_guide.txt authentication.txt coding-standard.pdf get_put_api_using_xattr.txt \ + translator-options.txt mac-related-xattrs.txt replicate.pdf +SUBDIRS = examples hacker-guide user-guide + +voldir = $(sysconfdir)/glusterfs +vol_DATA = glusterfs.vol.sample glusterfsd.vol.sample + +man8_MANS = glusterfs.8 + +CLEANFILES = diff --git a/doc/authentication.txt b/doc/authentication.txt new file mode 100644 index 000000000..70aafd933 --- /dev/null +++ b/doc/authentication.txt @@ -0,0 +1,112 @@ + +* Authentication is provided by two modules addr and login. Login based authentication uses username/password from client for authentication. 
Each module returns either ACCEPT, REJECT or DONT_CARE. DONT_CARE is returned if the input authentication information to the module is not relevant to its working. The theory behind authentication is that "none of the auth modules should return REJECT and at least one of them should return ACCEPT" + +* Currently all the authentication related information is passed un-encrypted over the network from client to server. + +---------------------------------------------------------------------------------------------------- +* options provided in protocol/client: + * for username/password based authentication: + option username <username> + option password <password> + * client can have only one set of username/password + * for addr based authentication: + * no options required in protocol/client. Client has to bind to privileged port (port < 1024 ) which means the process in which protocol/client is loaded has to be run as root. + +---------------------------------------------------------------------------------------------------- +* options provided in protocol/server: + * for username/password based authentication: + option auth.login.<brick>.allow [comma separated list of usernames using which clients can connect to volume <brick>] + option auth.login.<username>.password <password> #specify password <password> for username <username> + * for addr based authentication: + option auth.addr.<brick>.allow [comma separated list of ip-addresses/unix-paths from which clients are allowed to connect to volume <brick>] + option auth.addr.<brick>.reject [comma separated list of ip-addresses/unix-paths from which clients are not allowed to connect to volume <brick>] + * negation operator '!' is used to invert the sense of matching.
+ Eg., option auth.addr.brick.allow !a.b.c.d #do not allow client from a.b.c.d to connect to volume brick + option auth.addr.brick.reject !w.x.y.z #allow client from w.x.y.z to connect to volume brick + * wildcard '*' can be used to match any ip-address/unix-path + +---------------------------------------------------------------------------------------------------- + +* Usecases: + +* username/password based authentication only + protocol/client: + option username foo + option password foo-password + option remote-subvolume foo-brick + + protocol/server: + option auth.login.foo-brick.allow foo,who #,other users allowed to connect to foo-brick + option auth.login.foo.password foo-password + option auth.login.who.password who-password + + * in protocol/server, dont specify ip from which client is connecting in auth.addr.foo-brick.reject list + +**************************************************************************************************** + +* ip based authentication only + protocol/client: + option remote-subvolume foo-brick + * Client is connecting from a.b.c.d + + protocol/server: + option auth.addr.foo-brick.allow a.b.c.d,e.f.g.h,i.j.k.l #, other ip addresses from which clients are allowed to connect to foo-brick + +**************************************************************************************************** +* ip and username/password based authentication + * allow only "user foo from a.b.c.d" + protocol/client: + option username foo + option password foo-password + option remote-subvolume foo-brick + + protocol/server: + option auth.login.foo-brick.allow foo + option auth.login.foo.password foo-password + option auth.addr.foo-brick.reject !a.b.c.d + + * allow only "user foo" from a.b.c.d i.e., only user foo is allowed from a.b.c.d, but anyone is allowed from ip addresses other than a.b.c.d + protocol/client: + option username foo + option password foo-password + option remote-subvolume foo-brick + + protocol/server: + option 
auth.login.foo-brick.allow foo + option auth.login.foo.password foo-password + option auth.addr.foo-brick.allow !a.b.c.d + + * reject only "user shoo from a.b.c.d" + protocol/client: + option remote-subvolume shoo-brick + + protocol/server: + # observe that no "option auth.login.shoo-brick.allow shoo" given + # Also other users from a.b.c.d have to be explicitly allowed using auth.login.shoo-brick.allow ... + option auth.addr.shoo-brick.allow !a.b.c.d + + * reject only "user shoo" from a.b.c.d i.e., user shoo from a.b.c.d has to be rejected. + * same as reject only "user shoo from a.b.c.d" above, but rules have to be added whether to allow ip addresses (and users from those ips) other than a.b.c.d + +**************************************************************************************************** + +* ip or username/password based authentication + + * allow user foo or clients from a.b.c.d + protocol/client: + option remote-subvolume foo-brick + + protocol/server: + option auth.login.foo-brick.allow foo + option auth.login.foo.password foo-password + option auth.addr.foo-brick.allow a.b.c.d + + * reject user shoo or clients from a.b.c.d + protocol/client: + option remote-subvolume shoo-brick + + protocol/server: + option auth.login.shoo-brick.allow <usernames other than shoo> + #for each username mentioned in the above list, specify password as below + option auth.login.<username>.password password + option auth.addr.shoo-brick.reject a.b.c.d diff --git a/doc/booster.txt b/doc/booster.txt new file mode 100644 index 000000000..684ac8965 --- /dev/null +++ b/doc/booster.txt @@ -0,0 +1,54 @@ +Introduction +============ +* booster is an LD_PRELOADable library which boosts read/write performance by bypassing fuse for + read() and write() calls. + +Requirements +============ +* fetch volfile from glusterfs. +* identify whether multiple files are from the same mount point. If so, use only one context. + +Design +====== +* for a getxattr, along with other attributes, fuse returns the following attributes.
+ * contents of client volume-file. + * mount point. + +* LD_PRELOADed booster.so maintains a hash table storing mount-points and libglusterfsclient handles + so that handles are reused for files from the same mount point. + +* it also maintains an fdtable. fdtable maps the fd (integer) returned to application to fd (pointer to fd struct) + used by libglusterfsclient. application is returned the same fd as the one returned from libc apis. + +* During fork, these tables are overwritten to enable creation of fresh glusterfs context in child. + +Working +======= +* application willing to use booster LD_PRELOADs booster.so which is a wrapper library implementing + open, read and write. + +* application should specify the path to logfile through the environment variable GLFS_BOOSTER_LOGFILE. If + not specified, logging is done to /dev/stderr. + +* open call does, + * real_open on the file. + * fgetxattr(fd). + * store the volume-file content got in the dictionary to a temporary file. + * look in the hashtable for the mount-point, if already present get the libglusterfsclient handle from the + hashtable. Otherwise get a new handle from libglusterfsclient (be careful about mount point not present in + the hashtable and multiple glusterfs_inits running simultaneously for the same mount-point thereby using + multiple handles for the same mount point). + * real_close (fd). + * delete temporary volume-file. + * glusterfs_open (handle, path, mode). + * store the fd returned by glusterfs_open in the fdtable at the same index as the fd returned by real_open. + * return the index as fd. + +* read/write calls do, + * get the libglusterfsclient fd from fdtable. + * if found use glusterfs_read/glusterfs_write, else use real_read/real_write. + +* close call does, + * remove the fd from the fdtable. + +* other calls use real_calls.
diff --git a/doc/coding-standard.pdf b/doc/coding-standard.pdf new file mode 100644 index 000000000..bc9cb5620 Binary files /dev/null and b/doc/coding-standard.pdf differ diff --git a/doc/coding-standard.tex b/doc/coding-standard.tex new file mode 100644 index 000000000..ed9d920ec --- /dev/null +++ b/doc/coding-standard.tex @@ -0,0 +1,361 @@ +\documentclass[12pt]{article} +\usepackage{color} + +\begin{document} + + +\hrule +\begin{center}\textbf{\Large{GlusterFS Coding Standards}}\end{center} +\begin{center}\textbf{\large{\textcolor{red}{Z} Research}}\end{center} +\begin{center}{July 14, 2008}\end{center} +\hrule + +\vspace{8ex} + +\section*{$\bullet$ Structure definitions should have a comment per member} + +Every member in a structure definition must have a comment about its +purpose. The comment should be descriptive without being overly verbose. + +\vspace{2ex} +\textsl{Bad}: + +\begin{verbatim} + gf_lock_t lock; /* lock */ +\end{verbatim} + +\textsl{Good}: + +\begin{verbatim} + DBTYPE access_mode; /* access mode for accessing + * the databases, can be + * DB_HASH, DB_BTREE + * (option access-mode ) + */ +\end{verbatim} + +\section*{$\bullet$ Declare all variables at the beginning of the function} +All local variables in a function must be declared immediately after the +opening brace. This makes it easy to keep track of memory that needs to be freed +during exit. It also helps debugging, since gdb cannot handle variables +declared inside loops or other such blocks. + +\section*{$\bullet$ Always initialize local variables} +Every local variable should be initialized to a sensible default value +at the point of its declaration. All pointers should be initialized to NULL, +and all integers should be zero or (if it makes sense) an error value.
+ +\vspace{2ex} + +\textsl{Good}: + +\begin{verbatim} + int ret = 0; + char *databuf = NULL; + int _fd = -1; +\end{verbatim} + +\section*{$\bullet$ Initialization should always be done with a constant value} +Never use a non-constant expression as the initialization value for a variable. + +\vspace{2ex} + +\textsl{Bad}: + +\begin{verbatim} + pid_t pid = frame->root->pid; + char *databuf = malloc (1024); +\end{verbatim} + +\section*{$\bullet$ Validate all arguments to a function} +All pointer arguments to a function must be checked for \texttt{NULL}. +A macro named \texttt{VALIDATE} (in \texttt{common-utils.h}) +takes one argument, and if it is \texttt{NULL}, writes a log message and +jumps to a label called \texttt{err} after setting op\_ret and op\_errno +appropriately. It is recommended to use this template. + +\vspace{2ex} + +\textsl{Good}: + +\begin{verbatim} + VALIDATE(frame); + VALIDATE(this); + VALIDATE(inode); +\end{verbatim} + +\section*{$\bullet$ Never rely on precedence of operators} +Never write code that relies on the precedence of operators to execute +correctly. Such code can be hard to read and someone else might not +know the precedence of operators as accurately as you do. +\vspace{2ex} + +\textsl{Bad}: + +\begin{verbatim} + if (op_ret == -1 && errno != ENOENT) +\end{verbatim} + +\textsl{Good}: + +\begin{verbatim} + if ((op_ret == -1) && (errno != ENOENT)) +\end{verbatim} + +\section*{$\bullet$ Use exactly matching types} +Use a variable of the exact type declared in the manual to hold the +return value of a function. Do not use an ``equivalent'' type. + +\vspace{2ex} + +\textsl{Bad}: + +\begin{verbatim} + int len = strlen (path); +\end{verbatim} + +\textsl{Good}: + +\begin{verbatim} + size_t len = strlen (path); +\end{verbatim} + +\section*{$\bullet$ Never write code such as \texttt{foo->bar->baz}; check every pointer} +Do not write code that blindly follows a chain of pointer +references. 
Any pointer in the chain may be \texttt{NULL} and thus +cause a crash. Verify that each pointer is non-null before following +it. + +\section*{$\bullet$ Check return value of all functions and system calls} +The return value of all system calls and API functions must be checked +for success or failure. + +\vspace{2ex} +\textsl{Bad}: + +\begin{verbatim} + close (fd); +\end{verbatim} + +\textsl{Good}: + +\begin{verbatim} + op_ret = close (_fd); + if (op_ret == -1) { + gf_log (this->name, GF_LOG_ERROR, + "close on file %s failed (%s)", real_path, + strerror (errno)); + op_errno = errno; + goto out; + } +\end{verbatim} + + +\section*{$\bullet$ Gracefully handle failure of malloc} +GlusterFS should never crash or exit due to lack of memory. If a +memory allocation fails, the call should be unwound and an error +returned to the user. + +\section*{$\bullet$ Use result args and reserve the return value to indicate success or failure} +The return value of every function must indicate success or failure (unless +it is impossible for the function to fail --- e.g., boolean functions). If +the function needs to return additional data, it must be returned using a +result (pointer) argument. + +\vspace{2ex} +\textsl{Bad}: + +\begin{verbatim} + int32_t dict_get_int32 (dict_t *this, char *key); +\end{verbatim} + +\textsl{Good}: + +\begin{verbatim} + int dict_get_int32 (dict_t *this, char *key, int32_t *val); +\end{verbatim} + +\section*{$\bullet$ Always use the `n' versions of string functions} +Unless impossible, use the length-limited versions of the string functions. + +\vspace{2ex} +\textsl{Bad}: + +\begin{verbatim} + strcpy (entry_path, real_path); +\end{verbatim} + +\textsl{Good}: + +\begin{verbatim} + strncpy (entry_path, real_path, entry_path_len); +\end{verbatim} + +\section*{$\bullet$ No dead or commented code} +There must be no dead code (code to which control can never be passed) or +commented out code in the codebase.
+ +\section*{$\bullet$ Only one unwind and return per function} +There must be only one exit out of a function. \texttt{UNWIND} and return +should happen at only one point in the function. + +\section*{$\bullet$ Keep functions small} +Try to keep functions small. Two to three screenfuls (80 lines per screen) is +considered a reasonable limit. If a function is very long, try splitting it +into many little helper functions. + +\vspace{2ex} +\textsl{Example for a helper function}: +\begin{verbatim} + static int + same_owner (posix_lock_t *l1, posix_lock_t *l2) + { + return ((l1->client_pid == l2->client_pid) && + (l1->transport == l2->transport)); + } +\end{verbatim} + +\section*{Style issues} + +\subsection*{Brace placement} +Use K\&R/Linux style of brace placement for blocks. + +\textsl{Example}: +\begin{verbatim} + int some_function (...) + { + if (...) { + /* ... */ + } else if (...) { + /* ... */ + } else { + /* ... */ + } + + do { + /* ... */ + } while (cond); + } +\end{verbatim} + +\subsection*{Indentation} +Use \textbf{eight} spaces for indenting blocks. Ensure that your +file contains only spaces and not tab characters. You can do this +in Emacs by selecting the entire file (\texttt{C-x h}) and +running \texttt{M-x untabify}. + +To make Emacs indent lines automatically by eight spaces, add this +line to your \texttt{.emacs}: + +\begin{verbatim} + (add-hook 'c-mode-hook (lambda () (c-set-style "linux"))) +\end{verbatim} + +\subsection*{Comments} +Write a comment before every function describing its purpose (one-line), +its arguments, and its return value. Mention whether it is an internal +function or an exported function. + +Write a comment before every structure describing its purpose, and +write comments about each of its members. + +Follow the style shown below for comments, since such comments +can then be automatically extracted by doxygen to generate +documentation.
+ +\textsl{Example}: +\begin{verbatim} +/** + * hash_name - hash function for filenames + * @par: parent inode number + * @name: basename of inode + * @mod: number of buckets in the hashtable + * + * @return: success: bucket number + * failure: -1 + * + * Not for external use. + */ +\end{verbatim} + +\subsection*{Indicating critical sections} +To clearly show regions of code which execute with locks held, use +the following format: + +\begin{verbatim} + pthread_mutex_lock (&mutex); + { + /* code */ + } + pthread_mutex_unlock (&mutex); +\end{verbatim} + +\section*{A skeleton fop function} +This is the recommended template for any fop. In the beginning come +the initializations. After that, the `success' control flow should be +linear. Any error conditions should cause a \texttt{goto} to a single +point, \texttt{out}. At that point, the code should detect the error +that has occurred and do appropriate cleanup. + +\begin{verbatim} +int32_t +sample_fop (call_frame_t *frame, + xlator_t *this, + ...) +{ + char * var1 = NULL; + int32_t op_ret = -1; + int32_t op_errno = 0; + DIR * dir = NULL; + struct posix_fd * pfd = NULL; + + VALIDATE_OR_GOTO (frame, out); + VALIDATE_OR_GOTO (this, out); + + /* other validations */ + + dir = opendir (...); + + if (dir == NULL) { + op_errno = errno; + gf_log (this->name, GF_LOG_ERROR, + "opendir failed on %s (%s)", loc->path, + strerror (op_errno)); + goto out; + } + + /* another system call */ + if (...) { + op_errno = ENOMEM; + gf_log (this->name, GF_LOG_ERROR, + "out of memory :("); + goto out; + } + + /* ...
*/ + + out: + if (op_ret == -1) { + + /* check for all the cleanup that needs to be + done */ + + if (dir) { + closedir (dir); + dir = NULL; + } + + if (pfd) { + if (pfd->path) + FREE (pfd->path); + FREE (pfd); + pfd = NULL; + } + } + + STACK_UNWIND (frame, op_ret, op_errno, fd); + return 0; +} +\end{verbatim} + +\end{document} diff --git a/doc/errno.list.bsd.txt b/doc/errno.list.bsd.txt new file mode 100644 index 000000000..350af25e4 --- /dev/null +++ b/doc/errno.list.bsd.txt @@ -0,0 +1,376 @@ +/*- + * Copyright (c) 1982, 1986, 1989, 1993 + * The Regents of the University of California. All rights reserved. + * (c) UNIX System Laboratories, Inc. + * All or some portions of this file are derived from material licensed + * to the University of California by American Telephone and Telegraph + * Co. or Unix System Laboratories, Inc. and are reproduced herein with + * the permission of UNIX System Laboratories, Inc. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 4. Neither the name of the University nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. 
IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * @(#)errno.h 8.5 (Berkeley) 1/21/94 + * $FreeBSD: src/sys/sys/errno.h,v 1.28 2005/04/02 12:33:28 das Exp $ + */ + +#ifndef _SYS_ERRNO_H_ +#define _SYS_ERRNO_H_ + +#ifndef _KERNEL +#include +__BEGIN_DECLS +int * __error(void); +__END_DECLS +#define errno (* __error()) +#endif + +#define EPERM 1 /* Operation not permitted */ +#define ENOENT 2 /* No such file or directory */ +#define ESRCH 3 /* No such process */ +#define EINTR 4 /* Interrupted system call */ +#define EIO 5 /* Input/output error */ +#define ENXIO 6 /* Device not configured */ +#define E2BIG 7 /* Argument list too long */ +#define ENOEXEC 8 /* Exec format error */ +#define EBADF 9 /* Bad file descriptor */ +#define ECHILD 10 /* No child processes */ +#define EDEADLK 11 /* Resource deadlock avoided */ + /* 11 was EAGAIN */ +#define ENOMEM 12 /* Cannot allocate memory */ +#define EACCES 13 /* Permission denied */ +#define EFAULT 14 /* Bad address */ +#ifndef _POSIX_SOURCE +#define ENOTBLK 15 /* Block device required */ +#endif +#define EBUSY 16 /* Device busy */ +#define EEXIST 17 /* File exists */ +#define EXDEV 18 /* Cross-device link */ +#define ENODEV 19 /* Operation not supported by device */ +#define ENOTDIR 20 /* Not a directory */ +#define EISDIR 21 /* Is a directory */ +#define EINVAL 22 /* Invalid argument */ +#define ENFILE 23 /* Too many open files in system */ +#define EMFILE 24 /* Too many open files */ +#define ENOTTY 25 /* Inappropriate ioctl for device */ 
+#ifndef _POSIX_SOURCE +#define ETXTBSY 26 /* Text file busy */ +#endif +#define EFBIG 27 /* File too large */ +#define ENOSPC 28 /* No space left on device */ +#define ESPIPE 29 /* Illegal seek */ +#define EROFS 30 /* Read-only filesystem */ +#define EMLINK 31 /* Too many links */ +#define EPIPE 32 /* Broken pipe */ + +/* math software */ +#define EDOM 33 /* Numerical argument out of domain */ +#define ERANGE 34 /* Result too large */ + +/* non-blocking and interrupt i/o */ +#define EAGAIN 35 /* Resource temporarily unavailable */ +#ifndef _POSIX_SOURCE +#define EWOULDBLOCK EAGAIN /* Operation would block */ +#define EINPROGRESS 36 /* Operation now in progress */ +#define EALREADY 37 /* Operation already in progress */ + +/* ipc/network software -- argument errors */ +#define ENOTSOCK 38 /* Socket operation on non-socket */ +#define EDESTADDRREQ 39 /* Destination address required */ +#define EMSGSIZE 40 /* Message too long */ +#define EPROTOTYPE 41 /* Protocol wrong type for socket */ +#define ENOPROTOOPT 42 /* Protocol not available */ +#define EPROTONOSUPPORT 43 /* Protocol not supported */ +#define ESOCKTNOSUPPORT 44 /* Socket type not supported */ +#define EOPNOTSUPP 45 /* Operation not supported */ +#define ENOTSUP EOPNOTSUPP /* Operation not supported */ +#define EPFNOSUPPORT 46 /* Protocol family not supported */ +#define EAFNOSUPPORT 47 /* Address family not supported by protocol family */ +#define EADDRINUSE 48 /* Address already in use */ +#define EADDRNOTAVAIL 49 /* Can't assign requested address */ + +/* ipc/network software -- operational errors */ +#define ENETDOWN 50 /* Network is down */ +#define ENETUNREACH 51 /* Network is unreachable */ +#define ENETRESET 52 /* Network dropped connection on reset */ +#define ECONNABORTED 53 /* Software caused connection abort */ +#define ECONNRESET 54 /* Connection reset by peer */ +#define ENOBUFS 55 /* No buffer space available */ +#define EISCONN 56 /* Socket is already connected */ +#define ENOTCONN 57 /* 
Socket is not connected */ +#define ESHUTDOWN 58 /* Can't send after socket shutdown */ +#define ETOOMANYREFS 59 /* Too many references: can't splice */ +#define ETIMEDOUT 60 /* Operation timed out */ +#define ECONNREFUSED 61 /* Connection refused */ + +#define ELOOP 62 /* Too many levels of symbolic links */ +#endif /* _POSIX_SOURCE */ +#define ENAMETOOLONG 63 /* File name too long */ + +/* should be rearranged */ +#ifndef _POSIX_SOURCE +#define EHOSTDOWN 64 /* Host is down */ +#define EHOSTUNREACH 65 /* No route to host */ +#endif /* _POSIX_SOURCE */ +#define ENOTEMPTY 66 /* Directory not empty */ + +/* quotas & mush */ +#ifndef _POSIX_SOURCE +#define EPROCLIM 67 /* Too many processes */ +#define EUSERS 68 /* Too many users */ +#define EDQUOT 69 /* Disc quota exceeded */ + +/* Network File System */ +#define ESTALE 70 /* Stale NFS file handle */ +#define EREMOTE 71 /* Too many levels of remote in path */ +#define EBADRPC 72 /* RPC struct is bad */ +#define ERPCMISMATCH 73 /* RPC version wrong */ +#define EPROGUNAVAIL 74 /* RPC prog. 
not avail */ +#define EPROGMISMATCH 75 /* Program version wrong */ +#define EPROCUNAVAIL 76 /* Bad procedure for program */ +#endif /* _POSIX_SOURCE */ + +#define ENOLCK 77 /* No locks available */ +#define ENOSYS 78 /* Function not implemented */ + +#ifndef _POSIX_SOURCE +#define EFTYPE 79 /* Inappropriate file type or format */ +#define EAUTH 80 /* Authentication error */ +#define ENEEDAUTH 81 /* Need authenticator */ +#define EIDRM 82 /* Identifier removed */ +#define ENOMSG 83 /* No message of desired type */ +#define EOVERFLOW 84 /* Value too large to be stored in data type */ +#define ECANCELED 85 /* Operation canceled */ +#define EILSEQ 86 /* Illegal byte sequence */ +#define ENOATTR 87 /* Attribute not found */ + +#define EDOOFUS 88 /* Programming error */ +#endif /* _POSIX_SOURCE */ + +#define EBADMSG 89 /* Bad message */ +#define EMULTIHOP 90 /* Multihop attempted */ +#define ENOLINK 91 /* Link has been severed */ +#define EPROTO 92 /* Protocol error */ + +#ifndef _POSIX_SOURCE +#define ELAST 92 /* Must be equal largest errno */ +#endif /* _POSIX_SOURCE */ + +#ifdef _KERNEL +/* pseudo-errors returned inside kernel to modify return to process */ +#define ERESTART (-1) /* restart syscall */ +#define EJUSTRETURN (-2) /* don't modify regs, just return */ +#define ENOIOCTL (-3) /* ioctl not handled by this layer */ +#define EDIRIOCTL (-4) /* do direct ioctl in GEOM */ +#endif + +#endif +/*- + * Copyright (c) 1982, 1986, 1989, 1993 + * The Regents of the University of California. All rights reserved. + * (c) UNIX System Laboratories, Inc. + * All or some portions of this file are derived from material licensed + * to the University of California by American Telephone and Telegraph + * Co. or Unix System Laboratories, Inc. and are reproduced herein with + * the permission of UNIX System Laboratories, Inc. 
+ * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 4. Neither the name of the University nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. 
+ * + * @(#)errno.h 8.5 (Berkeley) 1/21/94 + * $FreeBSD: src/sys/sys/errno.h,v 1.28 2005/04/02 12:33:28 das Exp $ + */ + +#ifndef _SYS_ERRNO_H_ +#define _SYS_ERRNO_H_ + +#ifndef _KERNEL +#include +__BEGIN_DECLS +int * __error(void); +__END_DECLS +#define errno (* __error()) +#endif + +#define EPERM 1 /* Operation not permitted */ +#define ENOENT 2 /* No such file or directory */ +#define ESRCH 3 /* No such process */ +#define EINTR 4 /* Interrupted system call */ +#define EIO 5 /* Input/output error */ +#define ENXIO 6 /* Device not configured */ +#define E2BIG 7 /* Argument list too long */ +#define ENOEXEC 8 /* Exec format error */ +#define EBADF 9 /* Bad file descriptor */ +#define ECHILD 10 /* No child processes */ +#define EDEADLK 11 /* Resource deadlock avoided */ + /* 11 was EAGAIN */ +#define ENOMEM 12 /* Cannot allocate memory */ +#define EACCES 13 /* Permission denied */ +#define EFAULT 14 /* Bad address */ +#ifndef _POSIX_SOURCE +#define ENOTBLK 15 /* Block device required */ +#endif +#define EBUSY 16 /* Device busy */ +#define EEXIST 17 /* File exists */ +#define EXDEV 18 /* Cross-device link */ +#define ENODEV 19 /* Operation not supported by device */ +#define ENOTDIR 20 /* Not a directory */ +#define EISDIR 21 /* Is a directory */ +#define EINVAL 22 /* Invalid argument */ +#define ENFILE 23 /* Too many open files in system */ +#define EMFILE 24 /* Too many open files */ +#define ENOTTY 25 /* Inappropriate ioctl for device */ +#ifndef _POSIX_SOURCE +#define ETXTBSY 26 /* Text file busy */ +#endif +#define EFBIG 27 /* File too large */ +#define ENOSPC 28 /* No space left on device */ +#define ESPIPE 29 /* Illegal seek */ +#define EROFS 30 /* Read-only filesystem */ +#define EMLINK 31 /* Too many links */ +#define EPIPE 32 /* Broken pipe */ + +/* math software */ +#define EDOM 33 /* Numerical argument out of domain */ +#define ERANGE 34 /* Result too large */ + +/* non-blocking and interrupt i/o */ +#define EAGAIN 35 /* Resource temporarily unavailable 
*/ +#ifndef _POSIX_SOURCE +#define EWOULDBLOCK EAGAIN /* Operation would block */ +#define EINPROGRESS 36 /* Operation now in progress */ +#define EALREADY 37 /* Operation already in progress */ + +/* ipc/network software -- argument errors */ +#define ENOTSOCK 38 /* Socket operation on non-socket */ +#define EDESTADDRREQ 39 /* Destination address required */ +#define EMSGSIZE 40 /* Message too long */ +#define EPROTOTYPE 41 /* Protocol wrong type for socket */ +#define ENOPROTOOPT 42 /* Protocol not available */ +#define EPROTONOSUPPORT 43 /* Protocol not supported */ +#define ESOCKTNOSUPPORT 44 /* Socket type not supported */ +#define EOPNOTSUPP 45 /* Operation not supported */ +#define ENOTSUP EOPNOTSUPP /* Operation not supported */ +#define EPFNOSUPPORT 46 /* Protocol family not supported */ +#define EAFNOSUPPORT 47 /* Address family not supported by protocol family */ +#define EADDRINUSE 48 /* Address already in use */ +#define EADDRNOTAVAIL 49 /* Can't assign requested address */ + +/* ipc/network software -- operational errors */ +#define ENETDOWN 50 /* Network is down */ +#define ENETUNREACH 51 /* Network is unreachable */ +#define ENETRESET 52 /* Network dropped connection on reset */ +#define ECONNABORTED 53 /* Software caused connection abort */ +#define ECONNRESET 54 /* Connection reset by peer */ +#define ENOBUFS 55 /* No buffer space available */ +#define EISCONN 56 /* Socket is already connected */ +#define ENOTCONN 57 /* Socket is not connected */ +#define ESHUTDOWN 58 /* Can't send after socket shutdown */ +#define ETOOMANYREFS 59 /* Too many references: can't splice */ +#define ETIMEDOUT 60 /* Operation timed out */ +#define ECONNREFUSED 61 /* Connection refused */ + +#define ELOOP 62 /* Too many levels of symbolic links */ +#endif /* _POSIX_SOURCE */ +#define ENAMETOOLONG 63 /* File name too long */ + +/* should be rearranged */ +#ifndef _POSIX_SOURCE +#define EHOSTDOWN 64 /* Host is down */ +#define EHOSTUNREACH 65 /* No route to host */ 
+#endif /* _POSIX_SOURCE */ +#define ENOTEMPTY 66 /* Directory not empty */ + +/* quotas & mush */ +#ifndef _POSIX_SOURCE +#define EPROCLIM 67 /* Too many processes */ +#define EUSERS 68 /* Too many users */ +#define EDQUOT 69 /* Disc quota exceeded */ + +/* Network File System */ +#define ESTALE 70 /* Stale NFS file handle */ +#define EREMOTE 71 /* Too many levels of remote in path */ +#define EBADRPC 72 /* RPC struct is bad */ +#define ERPCMISMATCH 73 /* RPC version wrong */ +#define EPROGUNAVAIL 74 /* RPC prog. not avail */ +#define EPROGMISMATCH 75 /* Program version wrong */ +#define EPROCUNAVAIL 76 /* Bad procedure for program */ +#endif /* _POSIX_SOURCE */ + +#define ENOLCK 77 /* No locks available */ +#define ENOSYS 78 /* Function not implemented */ + +#ifndef _POSIX_SOURCE +#define EFTYPE 79 /* Inappropriate file type or format */ +#define EAUTH 80 /* Authentication error */ +#define ENEEDAUTH 81 /* Need authenticator */ +#define EIDRM 82 /* Identifier removed */ +#define ENOMSG 83 /* No message of desired type */ +#define EOVERFLOW 84 /* Value too large to be stored in data type */ +#define ECANCELED 85 /* Operation canceled */ +#define EILSEQ 86 /* Illegal byte sequence */ +#define ENOATTR 87 /* Attribute not found */ + +#define EDOOFUS 88 /* Programming error */ +#endif /* _POSIX_SOURCE */ + +#define EBADMSG 89 /* Bad message */ +#define EMULTIHOP 90 /* Multihop attempted */ +#define ENOLINK 91 /* Link has been severed */ +#define EPROTO 92 /* Protocol error */ + +#ifndef _POSIX_SOURCE +#define ELAST 92 /* Must be equal largest errno */ +#endif /* _POSIX_SOURCE */ + +#ifdef _KERNEL +/* pseudo-errors returned inside kernel to modify return to process */ +#define ERESTART (-1) /* restart syscall */ +#define EJUSTRETURN (-2) /* don't modify regs, just return */ +#define ENOIOCTL (-3) /* ioctl not handled by this layer */ +#define EDIRIOCTL (-4) /* do direct ioctl in GEOM */ +#endif + +#endif diff --git a/doc/errno.list.linux.txt b/doc/errno.list.linux.txt 
new file mode 100644 index 000000000..baa50792d --- /dev/null +++ b/doc/errno.list.linux.txt @@ -0,0 +1,1586 @@ +#define ICONV_SUPPORTS_ERRNO 1 +#include +/* Error constants. Linux specific version. + Copyright (C) 1996, 1997, 1998, 1999, 2005 Free Software Foundation, Inc. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, write to the Free + Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA + 02111-1307 USA. */ + +#ifdef _ERRNO_H + +# undef EDOM +# undef EILSEQ +# undef ERANGE +# include + +/* Linux has no ENOTSUP error code. */ +# define ENOTSUP EOPNOTSUPP + +/* Older Linux versions also had no ECANCELED error code. */ +# ifndef ECANCELED +# define ECANCELED 125 +# endif + +/* Support for error codes to support robust mutexes was added later, too. */ +# ifndef EOWNERDEAD +# define EOWNERDEAD 130 +# define ENOTRECOVERABLE 131 +# endif + +# ifndef __ASSEMBLER__ +/* Function to get address of global `errno' variable. */ +extern int *__errno_location (void) __THROW __attribute__ ((__const__)); + +# if !defined _LIBC || defined _LIBC_REENTRANT +/* When using threads, errno is a per-thread value. */ +# define errno (*__errno_location ()) +# endif +# endif /* !__ASSEMBLER__ */ +#endif /* _ERRNO_H */ + +#if !defined _ERRNO_H && defined __need_Emath +/* This is ugly but the kernel header is not clean enough. 
We must + define only the values EDOM, EILSEQ and ERANGE in case __need_Emath is + defined. */ +# define EDOM 33 /* Math argument out of domain of function. */ +# define EILSEQ 84 /* Illegal byte sequence. */ +# define ERANGE 34 /* Math result not representable. */ +#endif /* !_ERRNO_H && __need_Emath */ +/* Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef APR_ERRNO_H +#define APR_ERRNO_H + +/** + * @file apr_errno.h + * @brief APR Error Codes + */ + +#include "apr.h" + +#if APR_HAVE_ERRNO_H +#include +#endif + +#ifdef __cplusplus +extern "C" { +#endif /* __cplusplus */ + +/** + * @defgroup apr_errno Error Codes + * @ingroup APR + * @{ + */ + +/** + * Type for specifying an error or status code. + */ +typedef int apr_status_t; + +/** + * Return a human readable string describing the specified error. + * @param statcode The error code to get a string for. + * @param buf A buffer to hold the error string. + * @param bufsize Size of the buffer to hold the string. + */ +APR_DECLARE(char *) apr_strerror(apr_status_t statcode, char *buf, + apr_size_t bufsize); + +#if defined(DOXYGEN) +/** + * @def APR_FROM_OS_ERROR(os_err_type syserr) + * Fold a platform specific error into an apr_status_t code. 
+ * @return apr_status_t + * @param e The platform os error code. + * @warning macro implementation; the syserr argument may be evaluated + * multiple times. + */ +#define APR_FROM_OS_ERROR(e) (e == 0 ? APR_SUCCESS : e + APR_OS_START_SYSERR) + +/** + * @def APR_TO_OS_ERROR(apr_status_t statcode) + * @return os_err_type + * Fold an apr_status_t code back to the native platform defined error. + * @param e The apr_status_t folded platform os error code. + * @warning macro implementation; the statcode argument may be evaluated + * multiple times. If the statcode was not created by apr_get_os_error + * or APR_FROM_OS_ERROR, the results are undefined. + */ +#define APR_TO_OS_ERROR(e) (e == 0 ? APR_SUCCESS : e - APR_OS_START_SYSERR) + +/** @def apr_get_os_error() + * @return apr_status_t the last platform error, folded into apr_status_t, on most platforms + * @remark This retrieves errno, or calls a GetLastError() style function, and + * folds it with APR_FROM_OS_ERROR. Some platforms (such as OS2) have no + * such mechanism, so this call may be unsupported. Do NOT use this + * call for socket errors from socket, send, recv etc! + */ + +/** @def apr_set_os_error(e) + * Reset the last platform error, unfolded from an apr_status_t, on some platforms + * @param e The OS error folded in a prior call to APR_FROM_OS_ERROR() + * @warning This is a macro implementation; the statcode argument may be evaluated + * multiple times. If the statcode was not created by apr_get_os_error + * or APR_FROM_OS_ERROR, the results are undefined. This macro sets + * errno, or calls a SetLastError() style function, unfolding statcode + * with APR_TO_OS_ERROR. Some platforms (such as OS2) have no such + * mechanism, so this call may be unsupported. + */ + +/** @def apr_get_netos_error() + * Return the last socket error, folded into apr_status_t, on all platforms + * @remark This retrieves errno or calls a GetLastSocketError() style function, + * and folds it with APR_FROM_OS_ERROR. 
+ */ + +/** @def apr_set_netos_error(e) + * Reset the last socket error, unfolded from an apr_status_t + * @param e The socket error folded in a prior call to APR_FROM_OS_ERROR() + * @warning This is a macro implementation; the statcode argument may be evaluated + * multiple times. If the statcode was not created by apr_get_os_error + * or APR_FROM_OS_ERROR, the results are undefined. This macro sets + * errno, or calls a WSASetLastError() style function, unfolding + * socketcode with APR_TO_OS_ERROR. + */ + +#endif /* defined(DOXYGEN) */ + +/** + * APR_OS_START_ERROR is where the APR specific error values start. + */ +#define APR_OS_START_ERROR 20000 +/** + * APR_OS_ERRSPACE_SIZE is the maximum number of errors you can fit + * into one of the error/status ranges below -- except for + * APR_OS_START_USERERR, which see. + */ +#define APR_OS_ERRSPACE_SIZE 50000 +/** + * APR_OS_START_STATUS is where the APR specific status codes start. + */ +#define APR_OS_START_STATUS (APR_OS_START_ERROR + APR_OS_ERRSPACE_SIZE) +/** + * APR_OS_START_USERERR are reserved for applications that use APR that + * layer their own error codes along with APR's. Note that the + * error immediately following this one is set ten times farther + * away than usual, so that users of apr have a lot of room in + * which to declare custom error codes. + */ +#define APR_OS_START_USERERR (APR_OS_START_STATUS + APR_OS_ERRSPACE_SIZE) +/** + * APR_OS_START_USEERR is obsolete, defined for compatibility only. + * Use APR_OS_START_USERERR instead. + */ +#define APR_OS_START_USEERR APR_OS_START_USERERR +/** + * APR_OS_START_CANONERR is where APR versions of errno values are defined + * on systems which don't have the corresponding errno. + */ +#define APR_OS_START_CANONERR (APR_OS_START_USERERR \ + + (APR_OS_ERRSPACE_SIZE * 10)) +/** + * APR_OS_START_EAIERR folds EAI_ error codes from getaddrinfo() into + * apr_status_t values. 
+ */ +#define APR_OS_START_EAIERR (APR_OS_START_CANONERR + APR_OS_ERRSPACE_SIZE) +/** + * APR_OS_START_SYSERR folds platform-specific system error values into + * apr_status_t values. + */ +#define APR_OS_START_SYSERR (APR_OS_START_EAIERR + APR_OS_ERRSPACE_SIZE) + +/** no error. */ +#define APR_SUCCESS 0 + +/** + * @defgroup APR_Error APR Error Values + *
+ * APR ERROR VALUES
+ * APR_ENOSTAT      APR was unable to perform a stat on the file 
+ * APR_ENOPOOL      APR was not provided a pool with which to allocate memory
+ * APR_EBADDATE     APR was given an invalid date 
+ * APR_EINVALSOCK   APR was given an invalid socket
+ * APR_ENOPROC      APR was not given a process structure
+ * APR_ENOTIME      APR was not given a time structure
+ * APR_ENODIR       APR was not given a directory structure
+ * APR_ENOLOCK      APR was not given a lock structure
+ * APR_ENOPOLL      APR was not given a poll structure
+ * APR_ENOSOCKET    APR was not given a socket
+ * APR_ENOTHREAD    APR was not given a thread structure
+ * APR_ENOTHDKEY    APR was not given a thread key structure
+ * APR_ENOSHMAVAIL  There is no more shared memory available
+ * APR_EDSOOPEN     APR was unable to open the dso object.  For more 
+ *                  information call apr_dso_error().
+ * APR_EGENERAL     General failure (specific information not available)
+ * APR_EBADIP       The specified IP address is invalid
+ * APR_EBADMASK     The specified netmask is invalid
+ * APR_ESYMNOTFOUND Could not find the requested symbol
+ * 
+ * + *
+ * APR STATUS VALUES
+ * APR_INCHILD        Program is currently executing in the child
+ * APR_INPARENT       Program is currently executing in the parent
+ * APR_DETACH         The thread is detached
+ * APR_NOTDETACH      The thread is not detached
+ * APR_CHILD_DONE     The child has finished executing
+ * APR_CHILD_NOTDONE  The child has not finished executing
+ * APR_TIMEUP         The operation did not finish before the timeout
+ * APR_INCOMPLETE     The operation was incomplete although some processing
+ *                    was performed and the results are partially valid
+ * APR_BADCH          Getopt found an option not in the option string
+ * APR_BADARG         Getopt found an option that is missing its argument, 
+ *                    although the option string says it takes one
+ * APR_EOF            APR has encountered the end of the file
+ * APR_NOTFOUND       APR was unable to find the socket in the poll structure
+ * APR_ANONYMOUS      APR is using anonymous shared memory
+ * APR_FILEBASED      APR is using a file name as the key to the shared memory
+ * APR_KEYBASED       APR is using a shared key as the key to the shared memory
+ * APR_EINIT          Initializer value.  If no option has been found, but 
+ *                    the status variable requires a value, this should be used
+ * APR_ENOTIMPL       The APR function has not been implemented on this 
+ *                    platform, either because nobody has gotten to it yet, 
+ *                    or the function is impossible on this platform.
+ * APR_EMISMATCH      Two passwords do not match.
+ * APR_EABSOLUTE      The given path was absolute.
+ * APR_ERELATIVE      The given path was relative.
+ * APR_EINCOMPLETE    The given path was neither relative nor absolute.
+ * APR_EABOVEROOT     The given path was above the root path.
+ * APR_EBUSY          The given lock was busy.
+ * APR_EPROC_UNKNOWN  The given process wasn't recognized by APR
+ * 
+ * @{ + */ +/** @see APR_STATUS_IS_ENOSTAT */ +#define APR_ENOSTAT (APR_OS_START_ERROR + 1) +/** @see APR_STATUS_IS_ENOPOOL */ +#define APR_ENOPOOL (APR_OS_START_ERROR + 2) +/* empty slot: +3 */ +/** @see APR_STATUS_IS_EBADDATE */ +#define APR_EBADDATE (APR_OS_START_ERROR + 4) +/** @see APR_STATUS_IS_EINVALSOCK */ +#define APR_EINVALSOCK (APR_OS_START_ERROR + 5) +/** @see APR_STATUS_IS_ENOPROC */ +#define APR_ENOPROC (APR_OS_START_ERROR + 6) +/** @see APR_STATUS_IS_ENOTIME */ +#define APR_ENOTIME (APR_OS_START_ERROR + 7) +/** @see APR_STATUS_IS_ENODIR */ +#define APR_ENODIR (APR_OS_START_ERROR + 8) +/** @see APR_STATUS_IS_ENOLOCK */ +#define APR_ENOLOCK (APR_OS_START_ERROR + 9) +/** @see APR_STATUS_IS_ENOPOLL */ +#define APR_ENOPOLL (APR_OS_START_ERROR + 10) +/** @see APR_STATUS_IS_ENOSOCKET */ +#define APR_ENOSOCKET (APR_OS_START_ERROR + 11) +/** @see APR_STATUS_IS_ENOTHREAD */ +#define APR_ENOTHREAD (APR_OS_START_ERROR + 12) +/** @see APR_STATUS_IS_ENOTHDKEY */ +#define APR_ENOTHDKEY (APR_OS_START_ERROR + 13) +/** @see APR_STATUS_IS_EGENERAL */ +#define APR_EGENERAL (APR_OS_START_ERROR + 14) +/** @see APR_STATUS_IS_ENOSHMAVAIL */ +#define APR_ENOSHMAVAIL (APR_OS_START_ERROR + 15) +/** @see APR_STATUS_IS_EBADIP */ +#define APR_EBADIP (APR_OS_START_ERROR + 16) +/** @see APR_STATUS_IS_EBADMASK */ +#define APR_EBADMASK (APR_OS_START_ERROR + 17) +/* empty slot: +18 */ +/** @see APR_STATUS_IS_EDSOOPEN */ +#define APR_EDSOOPEN (APR_OS_START_ERROR + 19) +/** @see APR_STATUS_IS_EABSOLUTE */ +#define APR_EABSOLUTE (APR_OS_START_ERROR + 20) +/** @see APR_STATUS_IS_ERELATIVE */ +#define APR_ERELATIVE (APR_OS_START_ERROR + 21) +/** @see APR_STATUS_IS_EINCOMPLETE */ +#define APR_EINCOMPLETE (APR_OS_START_ERROR + 22) +/** @see APR_STATUS_IS_EABOVEROOT */ +#define APR_EABOVEROOT (APR_OS_START_ERROR + 23) +/** @see APR_STATUS_IS_EBADPATH */ +#define APR_EBADPATH (APR_OS_START_ERROR + 24) +/** @see APR_STATUS_IS_EPATHWILD */ +#define APR_EPATHWILD (APR_OS_START_ERROR + 25) +/** 
@see APR_STATUS_IS_ESYMNOTFOUND */ +#define APR_ESYMNOTFOUND (APR_OS_START_ERROR + 26) +/** @see APR_STATUS_IS_EPROC_UNKNOWN */ +#define APR_EPROC_UNKNOWN (APR_OS_START_ERROR + 27) +/** @see APR_STATUS_IS_ENOTENOUGHENTROPY */ +#define APR_ENOTENOUGHENTROPY (APR_OS_START_ERROR + 28) +/** @} */ + +/** + * @defgroup APR_STATUS_IS Status Value Tests + * @warning For any particular error condition, more than one of these tests + * may match. This is because platform-specific error codes may not + * always match the semantics of the POSIX codes these tests (and the + * corresponding APR error codes) are named after. A notable example + * are the APR_STATUS_IS_ENOENT and APR_STATUS_IS_ENOTDIR tests on + * Win32 platforms. The programmer should always be aware of this and + * adjust the order of the tests accordingly. + * @{ + */ +/** + * APR was unable to perform a stat on the file + * @warning always use this test, as platform-specific variances may meet this + * more than one error code + */ +#define APR_STATUS_IS_ENOSTAT(s) ((s) == APR_ENOSTAT) +/** + * APR was not provided a pool with which to allocate memory + * @warning always use this test, as platform-specific variances may meet this + * more than one error code + */ +#define APR_STATUS_IS_ENOPOOL(s) ((s) == APR_ENOPOOL) +/** APR was given an invalid date */ +#define APR_STATUS_IS_EBADDATE(s) ((s) == APR_EBADDATE) +/** APR was given an invalid socket */ +#define APR_STATUS_IS_EINVALSOCK(s) ((s) == APR_EINVALSOCK) +/** APR was not given a process structure */ +#define APR_STATUS_IS_ENOPROC(s) ((s) == APR_ENOPROC) +/** APR was not given a time structure */ +#define APR_STATUS_IS_ENOTIME(s) ((s) == APR_ENOTIME) +/** APR was not given a directory structure */ +#define APR_STATUS_IS_ENODIR(s) ((s) == APR_ENODIR) +/** APR was not given a lock structure */ +#define APR_STATUS_IS_ENOLOCK(s) ((s) == APR_ENOLOCK) +/** APR was not given a poll structure */ +#define APR_STATUS_IS_ENOPOLL(s) ((s) == APR_ENOPOLL) +/** APR was 
not given a socket */ +#define APR_STATUS_IS_ENOSOCKET(s) ((s) == APR_ENOSOCKET) +/** APR was not given a thread structure */ +#define APR_STATUS_IS_ENOTHREAD(s) ((s) == APR_ENOTHREAD) +/** APR was not given a thread key structure */ +#define APR_STATUS_IS_ENOTHDKEY(s) ((s) == APR_ENOTHDKEY) +/** Generic Error which can not be put into another spot */ +#define APR_STATUS_IS_EGENERAL(s) ((s) == APR_EGENERAL) +/** There is no more shared memory available */ +#define APR_STATUS_IS_ENOSHMAVAIL(s) ((s) == APR_ENOSHMAVAIL) +/** The specified IP address is invalid */ +#define APR_STATUS_IS_EBADIP(s) ((s) == APR_EBADIP) +/** The specified netmask is invalid */ +#define APR_STATUS_IS_EBADMASK(s) ((s) == APR_EBADMASK) +/* empty slot: +18 */ +/** + * APR was unable to open the dso object. + * For more information call apr_dso_error(). + */ +#if defined(WIN32) +#define APR_STATUS_IS_EDSOOPEN(s) ((s) == APR_EDSOOPEN \ + || APR_TO_OS_ERROR(s) == ERROR_MOD_NOT_FOUND) +#else +#define APR_STATUS_IS_EDSOOPEN(s) ((s) == APR_EDSOOPEN) +#endif +/** The given path was absolute. */ +#define APR_STATUS_IS_EABSOLUTE(s) ((s) == APR_EABSOLUTE) +/** The given path was relative. */ +#define APR_STATUS_IS_ERELATIVE(s) ((s) == APR_ERELATIVE) +/** The given path was neither relative nor absolute. */ +#define APR_STATUS_IS_EINCOMPLETE(s) ((s) == APR_EINCOMPLETE) +/** The given path was above the root path. */ +#define APR_STATUS_IS_EABOVEROOT(s) ((s) == APR_EABOVEROOT) +/** The given path was bad. */ +#define APR_STATUS_IS_EBADPATH(s) ((s) == APR_EBADPATH) +/** The given path contained wildcards. */ +#define APR_STATUS_IS_EPATHWILD(s) ((s) == APR_EPATHWILD) +/** Could not find the requested symbol. + * For more information call apr_dso_error(). 
+ */ +#if defined(WIN32) +#define APR_STATUS_IS_ESYMNOTFOUND(s) ((s) == APR_ESYMNOTFOUND \ + || APR_TO_OS_ERROR(s) == ERROR_PROC_NOT_FOUND) +#else +#define APR_STATUS_IS_ESYMNOTFOUND(s) ((s) == APR_ESYMNOTFOUND) +#endif +/** The given process was not recognized by APR. */ +#define APR_STATUS_IS_EPROC_UNKNOWN(s) ((s) == APR_EPROC_UNKNOWN) + +/** APR could not gather enough entropy to continue. */ +#define APR_STATUS_IS_ENOTENOUGHENTROPY(s) ((s) == APR_ENOTENOUGHENTROPY) + +/** @} */ + +/** + * @addtogroup APR_Error + * @{ + */ +/** @see APR_STATUS_IS_INCHILD */ +#define APR_INCHILD (APR_OS_START_STATUS + 1) +/** @see APR_STATUS_IS_INPARENT */ +#define APR_INPARENT (APR_OS_START_STATUS + 2) +/** @see APR_STATUS_IS_DETACH */ +#define APR_DETACH (APR_OS_START_STATUS + 3) +/** @see APR_STATUS_IS_NOTDETACH */ +#define APR_NOTDETACH (APR_OS_START_STATUS + 4) +/** @see APR_STATUS_IS_CHILD_DONE */ +#define APR_CHILD_DONE (APR_OS_START_STATUS + 5) +/** @see APR_STATUS_IS_CHILD_NOTDONE */ +#define APR_CHILD_NOTDONE (APR_OS_START_STATUS + 6) +/** @see APR_STATUS_IS_TIMEUP */ +#define APR_TIMEUP (APR_OS_START_STATUS + 7) +/** @see APR_STATUS_IS_INCOMPLETE */ +#define APR_INCOMPLETE (APR_OS_START_STATUS + 8) +/* empty slot: +9 */ +/* empty slot: +10 */ +/* empty slot: +11 */ +/** @see APR_STATUS_IS_BADCH */ +#define APR_BADCH (APR_OS_START_STATUS + 12) +/** @see APR_STATUS_IS_BADARG */ +#define APR_BADARG (APR_OS_START_STATUS + 13) +/** @see APR_STATUS_IS_EOF */ +#define APR_EOF (APR_OS_START_STATUS + 14) +/** @see APR_STATUS_IS_NOTFOUND */ +#define APR_NOTFOUND (APR_OS_START_STATUS + 15) +/* empty slot: +16 */ +/* empty slot: +17 */ +/* empty slot: +18 */ +/** @see APR_STATUS_IS_ANONYMOUS */ +#define APR_ANONYMOUS (APR_OS_START_STATUS + 19) +/** @see APR_STATUS_IS_FILEBASED */ +#define APR_FILEBASED (APR_OS_START_STATUS + 20) +/** @see APR_STATUS_IS_KEYBASED */ +#define APR_KEYBASED (APR_OS_START_STATUS + 21) +/** @see APR_STATUS_IS_EINIT */ +#define APR_EINIT 
(APR_OS_START_STATUS + 22) +/** @see APR_STATUS_IS_ENOTIMPL */ +#define APR_ENOTIMPL (APR_OS_START_STATUS + 23) +/** @see APR_STATUS_IS_EMISMATCH */ +#define APR_EMISMATCH (APR_OS_START_STATUS + 24) +/** @see APR_STATUS_IS_EBUSY */ +#define APR_EBUSY (APR_OS_START_STATUS + 25) +/** @} */ + +/** + * @addtogroup APR_STATUS_IS + * @{ + */ +/** + * Program is currently executing in the child + * @warning + * always use this test, as platform-specific variances may meet this + * more than one error code */ +#define APR_STATUS_IS_INCHILD(s) ((s) == APR_INCHILD) +/** + * Program is currently executing in the parent + * @warning + * always use this test, as platform-specific variances may meet this + * more than one error code + */ +#define APR_STATUS_IS_INPARENT(s) ((s) == APR_INPARENT) +/** + * The thread is detached + * @warning + * always use this test, as platform-specific variances may meet this + * more than one error code + */ +#define APR_STATUS_IS_DETACH(s) ((s) == APR_DETACH) +/** + * The thread is not detached + * @warning + * always use this test, as platform-specific variances may meet this + * more than one error code + */ +#define APR_STATUS_IS_NOTDETACH(s) ((s) == APR_NOTDETACH) +/** + * The child has finished executing + * @warning + * always use this test, as platform-specific variances may meet this + * more than one error code + */ +#define APR_STATUS_IS_CHILD_DONE(s) ((s) == APR_CHILD_DONE) +/** + * The child has not finished executing + * @warning + * always use this test, as platform-specific variances may meet this + * more than one error code + */ +#define APR_STATUS_IS_CHILD_NOTDONE(s) ((s) == APR_CHILD_NOTDONE) +/** + * The operation did not finish before the timeout + * @warning + * always use this test, as platform-specific variances may meet this + * more than one error code + */ +#define APR_STATUS_IS_TIMEUP(s) ((s) == APR_TIMEUP) +/** + * The operation was incomplete although some processing was performed + * and the results are partially 
valid. + * @warning + * always use this test, as platform-specific variances may meet this + * more than one error code + */ +#define APR_STATUS_IS_INCOMPLETE(s) ((s) == APR_INCOMPLETE) +/* empty slot: +9 */ +/* empty slot: +10 */ +/* empty slot: +11 */ +/** + * Getopt found an option not in the option string + * @warning + * always use this test, as platform-specific variances may meet this + * more than one error code + */ +#define APR_STATUS_IS_BADCH(s) ((s) == APR_BADCH) +/** + * Getopt found an option not in the option string and an argument was + * specified in the option string + * @warning + * always use this test, as platform-specific variances may meet this + * more than one error code + */ +#define APR_STATUS_IS_BADARG(s) ((s) == APR_BADARG) +/** + * APR has encountered the end of the file + * @warning + * always use this test, as platform-specific variances may meet this + * more than one error code + */ +#define APR_STATUS_IS_EOF(s) ((s) == APR_EOF) +/** + * APR was unable to find the socket in the poll structure + * @warning + * always use this test, as platform-specific variances may meet this + * more than one error code + */ +#define APR_STATUS_IS_NOTFOUND(s) ((s) == APR_NOTFOUND) +/* empty slot: +16 */ +/* empty slot: +17 */ +/* empty slot: +18 */ +/** + * APR is using anonymous shared memory + * @warning + * always use this test, as platform-specific variances may meet this + * more than one error code + */ +#define APR_STATUS_IS_ANONYMOUS(s) ((s) == APR_ANONYMOUS) +/** + * APR is using a file name as the key to the shared memory + * @warning + * always use this test, as platform-specific variances may meet this + * more than one error code + */ +#define APR_STATUS_IS_FILEBASED(s) ((s) == APR_FILEBASED) +/** + * APR is using a shared key as the key to the shared memory + * @warning + * always use this test, as platform-specific variances may meet this + * more than one error code + */ +#define APR_STATUS_IS_KEYBASED(s) ((s) == APR_KEYBASED) +/** 
+ * Initializer value. If no option has been found, but + * the status variable requires a value, this should be used + * @warning + * always use this test, as platform-specific variances may meet this + * more than one error code + */ +#define APR_STATUS_IS_EINIT(s) ((s) == APR_EINIT) +/** + * The APR function has not been implemented on this + * platform, either because nobody has gotten to it yet, + * or the function is impossible on this platform. + * @warning + * always use this test, as platform-specific variances may meet this + * more than one error code + */ +#define APR_STATUS_IS_ENOTIMPL(s) ((s) == APR_ENOTIMPL) +/** + * Two passwords do not match. + * @warning + * always use this test, as platform-specific variances may meet this + * more than one error code + */ +#define APR_STATUS_IS_EMISMATCH(s) ((s) == APR_EMISMATCH) +/** + * The given lock was busy + * @warning always use this test, as platform-specific variances may meet this + * more than one error code + */ +#define APR_STATUS_IS_EBUSY(s) ((s) == APR_EBUSY) + +/** @} */ + +/** + * @addtogroup APR_Error APR Error Values + * @{ + */ +/* APR CANONICAL ERROR VALUES */ +/** @see APR_STATUS_IS_EACCES */ +#ifdef EACCES +#define APR_EACCES EACCES +#else +#define APR_EACCES (APR_OS_START_CANONERR + 1) +#endif + +/** @see APR_STATUS_IS_EEXIST */ +#ifdef EEXIST +#define APR_EEXIST EEXIST +#else +#define APR_EEXIST (APR_OS_START_CANONERR + 2) +#endif + +/** @see APR_STATUS_IS_ENAMETOOLONG */ +#ifdef ENAMETOOLONG +#define APR_ENAMETOOLONG ENAMETOOLONG +#else +#define APR_ENAMETOOLONG (APR_OS_START_CANONERR + 3) +#endif + +/** @see APR_STATUS_IS_ENOENT */ +#ifdef ENOENT +#define APR_ENOENT ENOENT +#else +#define APR_ENOENT (APR_OS_START_CANONERR + 4) +#endif + +/** @see APR_STATUS_IS_ENOTDIR */ +#ifdef ENOTDIR +#define APR_ENOTDIR ENOTDIR +#else +#define APR_ENOTDIR (APR_OS_START_CANONERR + 5) +#endif + +/** @see APR_STATUS_IS_ENOSPC */ +#ifdef ENOSPC +#define APR_ENOSPC ENOSPC +#else +#define APR_ENOSPC 
(APR_OS_START_CANONERR + 6) +#endif + +/** @see APR_STATUS_IS_ENOMEM */ +#ifdef ENOMEM +#define APR_ENOMEM ENOMEM +#else +#define APR_ENOMEM (APR_OS_START_CANONERR + 7) +#endif + +/** @see APR_STATUS_IS_EMFILE */ +#ifdef EMFILE +#define APR_EMFILE EMFILE +#else +#define APR_EMFILE (APR_OS_START_CANONERR + 8) +#endif + +/** @see APR_STATUS_IS_ENFILE */ +#ifdef ENFILE +#define APR_ENFILE ENFILE +#else +#define APR_ENFILE (APR_OS_START_CANONERR + 9) +#endif + +/** @see APR_STATUS_IS_EBADF */ +#ifdef EBADF +#define APR_EBADF EBADF +#else +#define APR_EBADF (APR_OS_START_CANONERR + 10) +#endif + +/** @see APR_STATUS_IS_EINVAL */ +#ifdef EINVAL +#define APR_EINVAL EINVAL +#else +#define APR_EINVAL (APR_OS_START_CANONERR + 11) +#endif + +/** @see APR_STATUS_IS_ESPIPE */ +#ifdef ESPIPE +#define APR_ESPIPE ESPIPE +#else +#define APR_ESPIPE (APR_OS_START_CANONERR + 12) +#endif + +/** + * @see APR_STATUS_IS_EAGAIN + * @warning use APR_STATUS_IS_EAGAIN instead of just testing this value + */ +#ifdef EAGAIN +#define APR_EAGAIN EAGAIN +#elif defined(EWOULDBLOCK) +#define APR_EAGAIN EWOULDBLOCK +#else +#define APR_EAGAIN (APR_OS_START_CANONERR + 13) +#endif + +/** @see APR_STATUS_IS_EINTR */ +#ifdef EINTR +#define APR_EINTR EINTR +#else +#define APR_EINTR (APR_OS_START_CANONERR + 14) +#endif + +/** @see APR_STATUS_IS_ENOTSOCK */ +#ifdef ENOTSOCK +#define APR_ENOTSOCK ENOTSOCK +#else +#define APR_ENOTSOCK (APR_OS_START_CANONERR + 15) +#endif + +/** @see APR_STATUS_IS_ECONNREFUSED */ +#ifdef ECONNREFUSED +#define APR_ECONNREFUSED ECONNREFUSED +#else +#define APR_ECONNREFUSED (APR_OS_START_CANONERR + 16) +#endif + +/** @see APR_STATUS_IS_EINPROGRESS */ +#ifdef EINPROGRESS +#define APR_EINPROGRESS EINPROGRESS +#else +#define APR_EINPROGRESS (APR_OS_START_CANONERR + 17) +#endif + +/** + * @see APR_STATUS_IS_ECONNABORTED + * @warning use APR_STATUS_IS_ECONNABORTED instead of just testing this value + */ + +#ifdef ECONNABORTED +#define APR_ECONNABORTED ECONNABORTED +#else +#define 
APR_ECONNABORTED (APR_OS_START_CANONERR + 18) +#endif + +/** @see APR_STATUS_IS_ECONNRESET */ +#ifdef ECONNRESET +#define APR_ECONNRESET ECONNRESET +#else +#define APR_ECONNRESET (APR_OS_START_CANONERR + 19) +#endif + +/** @see APR_STATUS_IS_ETIMEDOUT + * @deprecated */ +#ifdef ETIMEDOUT +#define APR_ETIMEDOUT ETIMEDOUT +#else +#define APR_ETIMEDOUT (APR_OS_START_CANONERR + 20) +#endif + +/** @see APR_STATUS_IS_EHOSTUNREACH */ +#ifdef EHOSTUNREACH +#define APR_EHOSTUNREACH EHOSTUNREACH +#else +#define APR_EHOSTUNREACH (APR_OS_START_CANONERR + 21) +#endif + +/** @see APR_STATUS_IS_ENETUNREACH */ +#ifdef ENETUNREACH +#define APR_ENETUNREACH ENETUNREACH +#else +#define APR_ENETUNREACH (APR_OS_START_CANONERR + 22) +#endif + +/** @see APR_STATUS_IS_EFTYPE */ +#ifdef EFTYPE +#define APR_EFTYPE EFTYPE +#else +#define APR_EFTYPE (APR_OS_START_CANONERR + 23) +#endif + +/** @see APR_STATUS_IS_EPIPE */ +#ifdef EPIPE +#define APR_EPIPE EPIPE +#else +#define APR_EPIPE (APR_OS_START_CANONERR + 24) +#endif + +/** @see APR_STATUS_IS_EXDEV */ +#ifdef EXDEV +#define APR_EXDEV EXDEV +#else +#define APR_EXDEV (APR_OS_START_CANONERR + 25) +#endif + +/** @see APR_STATUS_IS_ENOTEMPTY */ +#ifdef ENOTEMPTY +#define APR_ENOTEMPTY ENOTEMPTY +#else +#define APR_ENOTEMPTY (APR_OS_START_CANONERR + 26) +#endif + +/** @} */ + +#if defined(OS2) && !defined(DOXYGEN) + +#define APR_FROM_OS_ERROR(e) (e == 0 ? APR_SUCCESS : e + APR_OS_START_SYSERR) +#define APR_TO_OS_ERROR(e) (e == 0 ? APR_SUCCESS : e - APR_OS_START_SYSERR) + +#define INCL_DOSERRORS +#define INCL_DOS + +/* Leave these undefined. + * OS2 doesn't rely on the errno concept. + * The API calls always return a result codes which + * should be filtered through APR_FROM_OS_ERROR(). 
+ * + * #define apr_get_os_error() (APR_FROM_OS_ERROR(GetLastError())) + * #define apr_set_os_error(e) (SetLastError(APR_TO_OS_ERROR(e))) + */ + +/* A special case, only socket calls require this; + */ +#define apr_get_netos_error() (APR_FROM_OS_ERROR(errno)) +#define apr_set_netos_error(e) (errno = APR_TO_OS_ERROR(e)) + +/* And this needs to be greped away for good: + */ +#define APR_OS2_STATUS(e) (APR_FROM_OS_ERROR(e)) + +/* These can't sit in a private header, so in spite of the extra size, + * they need to be made available here. + */ +#define SOCBASEERR 10000 +#define SOCEPERM (SOCBASEERR+1) /* Not owner */ +#define SOCESRCH (SOCBASEERR+3) /* No such process */ +#define SOCEINTR (SOCBASEERR+4) /* Interrupted system call */ +#define SOCENXIO (SOCBASEERR+6) /* No such device or address */ +#define SOCEBADF (SOCBASEERR+9) /* Bad file number */ +#define SOCEACCES (SOCBASEERR+13) /* Permission denied */ +#define SOCEFAULT (SOCBASEERR+14) /* Bad address */ +#define SOCEINVAL (SOCBASEERR+22) /* Invalid argument */ +#define SOCEMFILE (SOCBASEERR+24) /* Too many open files */ +#define SOCEPIPE (SOCBASEERR+32) /* Broken pipe */ +#define SOCEOS2ERR (SOCBASEERR+100) /* OS/2 Error */ +#define SOCEWOULDBLOCK (SOCBASEERR+35) /* Operation would block */ +#define SOCEINPROGRESS (SOCBASEERR+36) /* Operation now in progress */ +#define SOCEALREADY (SOCBASEERR+37) /* Operation already in progress */ +#define SOCENOTSOCK (SOCBASEERR+38) /* Socket operation on non-socket */ +#define SOCEDESTADDRREQ (SOCBASEERR+39) /* Destination address required */ +#define SOCEMSGSIZE (SOCBASEERR+40) /* Message too long */ +#define SOCEPROTOTYPE (SOCBASEERR+41) /* Protocol wrong type for socket */ +#define SOCENOPROTOOPT (SOCBASEERR+42) /* Protocol not available */ +#define SOCEPROTONOSUPPORT (SOCBASEERR+43) /* Protocol not supported */ +#define SOCESOCKTNOSUPPORT (SOCBASEERR+44) /* Socket type not supported */ +#define SOCEOPNOTSUPP (SOCBASEERR+45) /* Operation not supported on socket */ +#define 
SOCEPFNOSUPPORT (SOCBASEERR+46) /* Protocol family not supported */ +#define SOCEAFNOSUPPORT (SOCBASEERR+47) /* Address family not supported by protocol family */ +#define SOCEADDRINUSE (SOCBASEERR+48) /* Address already in use */ +#define SOCEADDRNOTAVAIL (SOCBASEERR+49) /* Can't assign requested address */ +#define SOCENETDOWN (SOCBASEERR+50) /* Network is down */ +#define SOCENETUNREACH (SOCBASEERR+51) /* Network is unreachable */ +#define SOCENETRESET (SOCBASEERR+52) /* Network dropped connection on reset */ +#define SOCECONNABORTED (SOCBASEERR+53) /* Software caused connection abort */ +#define SOCECONNRESET (SOCBASEERR+54) /* Connection reset by peer */ +#define SOCENOBUFS (SOCBASEERR+55) /* No buffer space available */ +#define SOCEISCONN (SOCBASEERR+56) /* Socket is already connected */ +#define SOCENOTCONN (SOCBASEERR+57) /* Socket is not connected */ +#define SOCESHUTDOWN (SOCBASEERR+58) /* Can't send after socket shutdown */ +#define SOCETOOMANYREFS (SOCBASEERR+59) /* Too many references: can't splice */ +#define SOCETIMEDOUT (SOCBASEERR+60) /* Connection timed out */ +#define SOCECONNREFUSED (SOCBASEERR+61) /* Connection refused */ +#define SOCELOOP (SOCBASEERR+62) /* Too many levels of symbolic links */ +#define SOCENAMETOOLONG (SOCBASEERR+63) /* File name too long */ +#define SOCEHOSTDOWN (SOCBASEERR+64) /* Host is down */ +#define SOCEHOSTUNREACH (SOCBASEERR+65) /* No route to host */ +#define SOCENOTEMPTY (SOCBASEERR+66) /* Directory not empty */ + +/* APR CANONICAL ERROR TESTS */ +#define APR_STATUS_IS_EACCES(s) ((s) == APR_EACCES \ + || (s) == APR_OS_START_SYSERR + ERROR_ACCESS_DENIED \ + || (s) == APR_OS_START_SYSERR + ERROR_SHARING_VIOLATION) +#define APR_STATUS_IS_EEXIST(s) ((s) == APR_EEXIST \ + || (s) == APR_OS_START_SYSERR + ERROR_OPEN_FAILED \ + || (s) == APR_OS_START_SYSERR + ERROR_FILE_EXISTS \ + || (s) == APR_OS_START_SYSERR + ERROR_ALREADY_EXISTS \ + || (s) == APR_OS_START_SYSERR + ERROR_ACCESS_DENIED) +#define 
APR_STATUS_IS_ENAMETOOLONG(s) ((s) == APR_ENAMETOOLONG \ + || (s) == APR_OS_START_SYSERR + ERROR_FILENAME_EXCED_RANGE \ + || (s) == APR_OS_START_SYSERR + SOCENAMETOOLONG) +#define APR_STATUS_IS_ENOENT(s) ((s) == APR_ENOENT \ + || (s) == APR_OS_START_SYSERR + ERROR_FILE_NOT_FOUND \ + || (s) == APR_OS_START_SYSERR + ERROR_PATH_NOT_FOUND \ + || (s) == APR_OS_START_SYSERR + ERROR_NO_MORE_FILES \ + || (s) == APR_OS_START_SYSERR + ERROR_OPEN_FAILED) +#define APR_STATUS_IS_ENOTDIR(s) ((s) == APR_ENOTDIR) +#define APR_STATUS_IS_ENOSPC(s) ((s) == APR_ENOSPC \ + || (s) == APR_OS_START_SYSERR + ERROR_DISK_FULL) +#define APR_STATUS_IS_ENOMEM(s) ((s) == APR_ENOMEM) +#define APR_STATUS_IS_EMFILE(s) ((s) == APR_EMFILE \ + || (s) == APR_OS_START_SYSERR + ERROR_TOO_MANY_OPEN_FILES) +#define APR_STATUS_IS_ENFILE(s) ((s) == APR_ENFILE) +#define APR_STATUS_IS_EBADF(s) ((s) == APR_EBADF \ + || (s) == APR_OS_START_SYSERR + ERROR_INVALID_HANDLE) +#define APR_STATUS_IS_EINVAL(s) ((s) == APR_EINVAL \ + || (s) == APR_OS_START_SYSERR + ERROR_INVALID_PARAMETER \ + || (s) == APR_OS_START_SYSERR + ERROR_INVALID_FUNCTION) +#define APR_STATUS_IS_ESPIPE(s) ((s) == APR_ESPIPE \ + || (s) == APR_OS_START_SYSERR + ERROR_NEGATIVE_SEEK) +#define APR_STATUS_IS_EAGAIN(s) ((s) == APR_EAGAIN \ + || (s) == APR_OS_START_SYSERR + ERROR_NO_DATA \ + || (s) == APR_OS_START_SYSERR + SOCEWOULDBLOCK \ + || (s) == APR_OS_START_SYSERR + ERROR_LOCK_VIOLATION) +#define APR_STATUS_IS_EINTR(s) ((s) == APR_EINTR \ + || (s) == APR_OS_START_SYSERR + SOCEINTR) +#define APR_STATUS_IS_ENOTSOCK(s) ((s) == APR_ENOTSOCK \ + || (s) == APR_OS_START_SYSERR + SOCENOTSOCK) +#define APR_STATUS_IS_ECONNREFUSED(s) ((s) == APR_ECONNREFUSED \ + || (s) == APR_OS_START_SYSERR + SOCECONNREFUSED) +#define APR_STATUS_IS_EINPROGRESS(s) ((s) == APR_EINPROGRESS \ + || (s) == APR_OS_START_SYSERR + SOCEINPROGRESS) +#define APR_STATUS_IS_ECONNABORTED(s) ((s) == APR_ECONNABORTED \ + || (s) == APR_OS_START_SYSERR + SOCECONNABORTED) +#define 
APR_STATUS_IS_ECONNRESET(s) ((s) == APR_ECONNRESET \ + || (s) == APR_OS_START_SYSERR + SOCECONNRESET) +/* XXX deprecated */ +#define APR_STATUS_IS_ETIMEDOUT(s) ((s) == APR_ETIMEDOUT \ + || (s) == APR_OS_START_SYSERR + SOCETIMEDOUT) +#undef APR_STATUS_IS_TIMEUP +#define APR_STATUS_IS_TIMEUP(s) ((s) == APR_TIMEUP \ + || (s) == APR_OS_START_SYSERR + SOCETIMEDOUT) +#define APR_STATUS_IS_EHOSTUNREACH(s) ((s) == APR_EHOSTUNREACH \ + || (s) == APR_OS_START_SYSERR + SOCEHOSTUNREACH) +#define APR_STATUS_IS_ENETUNREACH(s) ((s) == APR_ENETUNREACH \ + || (s) == APR_OS_START_SYSERR + SOCENETUNREACH) +#define APR_STATUS_IS_EFTYPE(s) ((s) == APR_EFTYPE) +#define APR_STATUS_IS_EPIPE(s) ((s) == APR_EPIPE \ + || (s) == APR_OS_START_SYSERR + ERROR_BROKEN_PIPE \ + || (s) == APR_OS_START_SYSERR + SOCEPIPE) +#define APR_STATUS_IS_EXDEV(s) ((s) == APR_EXDEV \ + || (s) == APR_OS_START_SYSERR + ERROR_NOT_SAME_DEVICE) +#define APR_STATUS_IS_ENOTEMPTY(s) ((s) == APR_ENOTEMPTY \ + || (s) == APR_OS_START_SYSERR + ERROR_DIR_NOT_EMPTY \ + || (s) == APR_OS_START_SYSERR + ERROR_ACCESS_DENIED) + +/* + Sorry, too tired to wrap this up for OS2... feel free to + fit the following into their best matches. 
+ + { ERROR_NO_SIGNAL_SENT, ESRCH }, + { SOCEALREADY, EALREADY }, + { SOCEDESTADDRREQ, EDESTADDRREQ }, + { SOCEMSGSIZE, EMSGSIZE }, + { SOCEPROTOTYPE, EPROTOTYPE }, + { SOCENOPROTOOPT, ENOPROTOOPT }, + { SOCEPROTONOSUPPORT, EPROTONOSUPPORT }, + { SOCESOCKTNOSUPPORT, ESOCKTNOSUPPORT }, + { SOCEOPNOTSUPP, EOPNOTSUPP }, + { SOCEPFNOSUPPORT, EPFNOSUPPORT }, + { SOCEAFNOSUPPORT, EAFNOSUPPORT }, + { SOCEADDRINUSE, EADDRINUSE }, + { SOCEADDRNOTAVAIL, EADDRNOTAVAIL }, + { SOCENETDOWN, ENETDOWN }, + { SOCENETRESET, ENETRESET }, + { SOCENOBUFS, ENOBUFS }, + { SOCEISCONN, EISCONN }, + { SOCENOTCONN, ENOTCONN }, + { SOCESHUTDOWN, ESHUTDOWN }, + { SOCETOOMANYREFS, ETOOMANYREFS }, + { SOCELOOP, ELOOP }, + { SOCEHOSTDOWN, EHOSTDOWN }, + { SOCENOTEMPTY, ENOTEMPTY }, + { SOCEPIPE, EPIPE } +*/ + +#elif defined(WIN32) && !defined(DOXYGEN) /* !defined(OS2) */ + +#define APR_FROM_OS_ERROR(e) (e == 0 ? APR_SUCCESS : e + APR_OS_START_SYSERR) +#define APR_TO_OS_ERROR(e) (e == 0 ? APR_SUCCESS : e - APR_OS_START_SYSERR) + +#define apr_get_os_error() (APR_FROM_OS_ERROR(GetLastError())) +#define apr_set_os_error(e) (SetLastError(APR_TO_OS_ERROR(e))) + +/* A special case, only socket calls require this: + */ +#define apr_get_netos_error() (APR_FROM_OS_ERROR(WSAGetLastError())) +#define apr_set_netos_error(e) (WSASetLastError(APR_TO_OS_ERROR(e))) + +/* APR CANONICAL ERROR TESTS */ +#define APR_STATUS_IS_EACCES(s) ((s) == APR_EACCES \ + || (s) == APR_OS_START_SYSERR + ERROR_ACCESS_DENIED \ + || (s) == APR_OS_START_SYSERR + ERROR_CANNOT_MAKE \ + || (s) == APR_OS_START_SYSERR + ERROR_CURRENT_DIRECTORY \ + || (s) == APR_OS_START_SYSERR + ERROR_DRIVE_LOCKED \ + || (s) == APR_OS_START_SYSERR + ERROR_FAIL_I24 \ + || (s) == APR_OS_START_SYSERR + ERROR_LOCK_VIOLATION \ + || (s) == APR_OS_START_SYSERR + ERROR_LOCK_FAILED \ + || (s) == APR_OS_START_SYSERR + ERROR_NOT_LOCKED \ + || (s) == APR_OS_START_SYSERR + ERROR_NETWORK_ACCESS_DENIED \ + || (s) == APR_OS_START_SYSERR + ERROR_SHARING_VIOLATION) +#define 
APR_STATUS_IS_EEXIST(s) ((s) == APR_EEXIST \ + || (s) == APR_OS_START_SYSERR + ERROR_FILE_EXISTS \ + || (s) == APR_OS_START_SYSERR + ERROR_ALREADY_EXISTS) +#define APR_STATUS_IS_ENAMETOOLONG(s) ((s) == APR_ENAMETOOLONG \ + || (s) == APR_OS_START_SYSERR + ERROR_FILENAME_EXCED_RANGE \ + || (s) == APR_OS_START_SYSERR + WSAENAMETOOLONG) +#define APR_STATUS_IS_ENOENT(s) ((s) == APR_ENOENT \ + || (s) == APR_OS_START_SYSERR + ERROR_FILE_NOT_FOUND \ + || (s) == APR_OS_START_SYSERR + ERROR_PATH_NOT_FOUND \ + || (s) == APR_OS_START_SYSERR + ERROR_OPEN_FAILED \ + || (s) == APR_OS_START_SYSERR + ERROR_NO_MORE_FILES) +#define APR_STATUS_IS_ENOTDIR(s) ((s) == APR_ENOTDIR \ + || (s) == APR_OS_START_SYSERR + ERROR_PATH_NOT_FOUND \ + || (s) == APR_OS_START_SYSERR + ERROR_BAD_NETPATH \ + || (s) == APR_OS_START_SYSERR + ERROR_BAD_NET_NAME \ + || (s) == APR_OS_START_SYSERR + ERROR_BAD_PATHNAME \ + || (s) == APR_OS_START_SYSERR + ERROR_INVALID_DRIVE) +#define APR_STATUS_IS_ENOSPC(s) ((s) == APR_ENOSPC \ + || (s) == APR_OS_START_SYSERR + ERROR_DISK_FULL) +#define APR_STATUS_IS_ENOMEM(s) ((s) == APR_ENOMEM \ + || (s) == APR_OS_START_SYSERR + ERROR_ARENA_TRASHED \ + || (s) == APR_OS_START_SYSERR + ERROR_NOT_ENOUGH_MEMORY \ + || (s) == APR_OS_START_SYSERR + ERROR_INVALID_BLOCK \ + || (s) == APR_OS_START_SYSERR + ERROR_NOT_ENOUGH_QUOTA \ + || (s) == APR_OS_START_SYSERR + ERROR_OUTOFMEMORY) +#define APR_STATUS_IS_EMFILE(s) ((s) == APR_EMFILE \ + || (s) == APR_OS_START_SYSERR + ERROR_TOO_MANY_OPEN_FILES) +#define APR_STATUS_IS_ENFILE(s) ((s) == APR_ENFILE) +#define APR_STATUS_IS_EBADF(s) ((s) == APR_EBADF \ + || (s) == APR_OS_START_SYSERR + ERROR_INVALID_HANDLE \ + || (s) == APR_OS_START_SYSERR + ERROR_INVALID_TARGET_HANDLE) +#define APR_STATUS_IS_EINVAL(s) ((s) == APR_EINVAL \ + || (s) == APR_OS_START_SYSERR + ERROR_INVALID_ACCESS \ + || (s) == APR_OS_START_SYSERR + ERROR_INVALID_DATA \ + || (s) == APR_OS_START_SYSERR + ERROR_INVALID_FUNCTION \ + || (s) == APR_OS_START_SYSERR + 
ERROR_INVALID_HANDLE \ + || (s) == APR_OS_START_SYSERR + ERROR_INVALID_PARAMETER \ + || (s) == APR_OS_START_SYSERR + ERROR_NEGATIVE_SEEK) +#define APR_STATUS_IS_ESPIPE(s) ((s) == APR_ESPIPE \ + || (s) == APR_OS_START_SYSERR + ERROR_SEEK_ON_DEVICE \ + || (s) == APR_OS_START_SYSERR + ERROR_NEGATIVE_SEEK) +#define APR_STATUS_IS_EAGAIN(s) ((s) == APR_EAGAIN \ + || (s) == APR_OS_START_SYSERR + ERROR_NO_DATA \ + || (s) == APR_OS_START_SYSERR + ERROR_NO_PROC_SLOTS \ + || (s) == APR_OS_START_SYSERR + ERROR_NESTING_NOT_ALLOWED \ + || (s) == APR_OS_START_SYSERR + ERROR_MAX_THRDS_REACHED \ + || (s) == APR_OS_START_SYSERR + ERROR_LOCK_VIOLATION \ + || (s) == APR_OS_START_SYSERR + WSAEWOULDBLOCK) +#define APR_STATUS_IS_EINTR(s) ((s) == APR_EINTR \ + || (s) == APR_OS_START_SYSERR + WSAEINTR) +#define APR_STATUS_IS_ENOTSOCK(s) ((s) == APR_ENOTSOCK \ + || (s) == APR_OS_START_SYSERR + WSAENOTSOCK) +#define APR_STATUS_IS_ECONNREFUSED(s) ((s) == APR_ECONNREFUSED \ + || (s) == APR_OS_START_SYSERR + WSAECONNREFUSED) +#define APR_STATUS_IS_EINPROGRESS(s) ((s) == APR_EINPROGRESS \ + || (s) == APR_OS_START_SYSERR + WSAEINPROGRESS) +#define APR_STATUS_IS_ECONNABORTED(s) ((s) == APR_ECONNABORTED \ + || (s) == APR_OS_START_SYSERR + WSAECONNABORTED) +#define APR_STATUS_IS_ECONNRESET(s) ((s) == APR_ECONNRESET \ + || (s) == APR_OS_START_SYSERR + ERROR_NETNAME_DELETED \ + || (s) == APR_OS_START_SYSERR + WSAECONNRESET) +/* XXX deprecated */ +#define APR_STATUS_IS_ETIMEDOUT(s) ((s) == APR_ETIMEDOUT \ + || (s) == APR_OS_START_SYSERR + WSAETIMEDOUT \ + || (s) == APR_OS_START_SYSERR + WAIT_TIMEOUT) +#undef APR_STATUS_IS_TIMEUP +#define APR_STATUS_IS_TIMEUP(s) ((s) == APR_TIMEUP \ + || (s) == APR_OS_START_SYSERR + WSAETIMEDOUT \ + || (s) == APR_OS_START_SYSERR + WAIT_TIMEOUT) +#define APR_STATUS_IS_EHOSTUNREACH(s) ((s) == APR_EHOSTUNREACH \ + || (s) == APR_OS_START_SYSERR + WSAEHOSTUNREACH) +#define APR_STATUS_IS_ENETUNREACH(s) ((s) == APR_ENETUNREACH \ + || (s) == APR_OS_START_SYSERR + 
WSAENETUNREACH) +#define APR_STATUS_IS_EFTYPE(s) ((s) == APR_EFTYPE \ + || (s) == APR_OS_START_SYSERR + ERROR_EXE_MACHINE_TYPE_MISMATCH \ + || (s) == APR_OS_START_SYSERR + ERROR_INVALID_DLL \ + || (s) == APR_OS_START_SYSERR + ERROR_INVALID_MODULETYPE \ + || (s) == APR_OS_START_SYSERR + ERROR_BAD_EXE_FORMAT \ + || (s) == APR_OS_START_SYSERR + ERROR_INVALID_EXE_SIGNATURE \ + || (s) == APR_OS_START_SYSERR + ERROR_FILE_CORRUPT \ + || (s) == APR_OS_START_SYSERR + ERROR_BAD_FORMAT) +#define APR_STATUS_IS_EPIPE(s) ((s) == APR_EPIPE \ + || (s) == APR_OS_START_SYSERR + ERROR_BROKEN_PIPE) +#define APR_STATUS_IS_EXDEV(s) ((s) == APR_EXDEV \ + || (s) == APR_OS_START_SYSERR + ERROR_NOT_SAME_DEVICE) +#define APR_STATUS_IS_ENOTEMPTY(s) ((s) == APR_ENOTEMPTY \ + || (s) == APR_OS_START_SYSERR + ERROR_DIR_NOT_EMPTY) + +#elif defined(NETWARE) && defined(USE_WINSOCK) && !defined(DOXYGEN) /* !defined(OS2) && !defined(WIN32) */ + +#define APR_FROM_OS_ERROR(e) (e == 0 ? APR_SUCCESS : e + APR_OS_START_SYSERR) +#define APR_TO_OS_ERROR(e) (e == 0 ? 
APR_SUCCESS : e - APR_OS_START_SYSERR) + +#define apr_get_os_error() (errno) +#define apr_set_os_error(e) (errno = (e)) + +/* A special case, only socket calls require this: */ +#define apr_get_netos_error() (APR_FROM_OS_ERROR(WSAGetLastError())) +#define apr_set_netos_error(e) (WSASetLastError(APR_TO_OS_ERROR(e))) + +/* APR CANONICAL ERROR TESTS */ +#define APR_STATUS_IS_EACCES(s) ((s) == APR_EACCES) +#define APR_STATUS_IS_EEXIST(s) ((s) == APR_EEXIST) +#define APR_STATUS_IS_ENAMETOOLONG(s) ((s) == APR_ENAMETOOLONG) +#define APR_STATUS_IS_ENOENT(s) ((s) == APR_ENOENT) +#define APR_STATUS_IS_ENOTDIR(s) ((s) == APR_ENOTDIR) +#define APR_STATUS_IS_ENOSPC(s) ((s) == APR_ENOSPC) +#define APR_STATUS_IS_ENOMEM(s) ((s) == APR_ENOMEM) +#define APR_STATUS_IS_EMFILE(s) ((s) == APR_EMFILE) +#define APR_STATUS_IS_ENFILE(s) ((s) == APR_ENFILE) +#define APR_STATUS_IS_EBADF(s) ((s) == APR_EBADF) +#define APR_STATUS_IS_EINVAL(s) ((s) == APR_EINVAL) +#define APR_STATUS_IS_ESPIPE(s) ((s) == APR_ESPIPE) + +#define APR_STATUS_IS_EAGAIN(s) ((s) == APR_EAGAIN \ + || (s) == EWOULDBLOCK \ + || (s) == APR_OS_START_SYSERR + WSAEWOULDBLOCK) +#define APR_STATUS_IS_EINTR(s) ((s) == APR_EINTR \ + || (s) == APR_OS_START_SYSERR + WSAEINTR) +#define APR_STATUS_IS_ENOTSOCK(s) ((s) == APR_ENOTSOCK \ + || (s) == APR_OS_START_SYSERR + WSAENOTSOCK) +#define APR_STATUS_IS_ECONNREFUSED(s) ((s) == APR_ECONNREFUSED \ + || (s) == APR_OS_START_SYSERR + WSAECONNREFUSED) +#define APR_STATUS_IS_EINPROGRESS(s) ((s) == APR_EINPROGRESS \ + || (s) == APR_OS_START_SYSERR + WSAEINPROGRESS) +#define APR_STATUS_IS_ECONNABORTED(s) ((s) == APR_ECONNABORTED \ + || (s) == APR_OS_START_SYSERR + WSAECONNABORTED) +#define APR_STATUS_IS_ECONNRESET(s) ((s) == APR_ECONNRESET \ + || (s) == APR_OS_START_SYSERR + WSAECONNRESET) +/* XXX deprecated */ +#define APR_STATUS_IS_ETIMEDOUT(s) ((s) == APR_ETIMEDOUT \ + || (s) == APR_OS_START_SYSERR + WSAETIMEDOUT \ + || (s) == APR_OS_START_SYSERR + WAIT_TIMEOUT) +#undef APR_STATUS_IS_TIMEUP 
+#define APR_STATUS_IS_TIMEUP(s) ((s) == APR_TIMEUP \ + || (s) == APR_OS_START_SYSERR + WSAETIMEDOUT \ + || (s) == APR_OS_START_SYSERR + WAIT_TIMEOUT) +#define APR_STATUS_IS_EHOSTUNREACH(s) ((s) == APR_EHOSTUNREACH \ + || (s) == APR_OS_START_SYSERR + WSAEHOSTUNREACH) +#define APR_STATUS_IS_ENETUNREACH(s) ((s) == APR_ENETUNREACH \ + || (s) == APR_OS_START_SYSERR + WSAENETUNREACH) +#define APR_STATUS_IS_ENETDOWN(s) ((s) == APR_OS_START_SYSERR + WSAENETDOWN) +#define APR_STATUS_IS_EFTYPE(s) ((s) == APR_EFTYPE) +#define APR_STATUS_IS_EPIPE(s) ((s) == APR_EPIPE) +#define APR_STATUS_IS_EXDEV(s) ((s) == APR_EXDEV) +#define APR_STATUS_IS_ENOTEMPTY(s) ((s) == APR_ENOTEMPTY) + +#else /* !defined(NETWARE) && !defined(OS2) && !defined(WIN32) */ + +/* + * os error codes are clib error codes + */ +#define APR_FROM_OS_ERROR(e) (e) +#define APR_TO_OS_ERROR(e) (e) + +#define apr_get_os_error() (errno) +#define apr_set_os_error(e) (errno = (e)) + +/* A special case, only socket calls require this: + */ +#define apr_get_netos_error() (errno) +#define apr_set_netos_error(e) (errno = (e)) + +/** + * @addtogroup APR_STATUS_IS + * @{ + */ + +/** permission denied */ +#define APR_STATUS_IS_EACCES(s) ((s) == APR_EACCES) +/** file exists */ +#define APR_STATUS_IS_EEXIST(s) ((s) == APR_EEXIST) +/** path name is too long */ +#define APR_STATUS_IS_ENAMETOOLONG(s) ((s) == APR_ENAMETOOLONG) +/** + * no such file or directory + * @remark + * EMVSCATLG can be returned by the automounter on z/OS for + * paths which do not exist. 
+ */ +#ifdef EMVSCATLG +#define APR_STATUS_IS_ENOENT(s) ((s) == APR_ENOENT \ + || (s) == EMVSCATLG) +#else +#define APR_STATUS_IS_ENOENT(s) ((s) == APR_ENOENT) +#endif +/** not a directory */ +#define APR_STATUS_IS_ENOTDIR(s) ((s) == APR_ENOTDIR) +/** no space left on device */ +#ifdef EDQUOT +#define APR_STATUS_IS_ENOSPC(s) ((s) == APR_ENOSPC \ + || (s) == EDQUOT) +#else +#define APR_STATUS_IS_ENOSPC(s) ((s) == APR_ENOSPC) +#endif +/** not enough memory */ +#define APR_STATUS_IS_ENOMEM(s) ((s) == APR_ENOMEM) +/** too many open files */ +#define APR_STATUS_IS_EMFILE(s) ((s) == APR_EMFILE) +/** file table overflow */ +#define APR_STATUS_IS_ENFILE(s) ((s) == APR_ENFILE) +/** bad file # */ +#define APR_STATUS_IS_EBADF(s) ((s) == APR_EBADF) +/** invalid argument */ +#define APR_STATUS_IS_EINVAL(s) ((s) == APR_EINVAL) +/** illegal seek */ +#define APR_STATUS_IS_ESPIPE(s) ((s) == APR_ESPIPE) + +/** operation would block */ +#if !defined(EWOULDBLOCK) || !defined(EAGAIN) +#define APR_STATUS_IS_EAGAIN(s) ((s) == APR_EAGAIN) +#elif (EWOULDBLOCK == EAGAIN) +#define APR_STATUS_IS_EAGAIN(s) ((s) == APR_EAGAIN) +#else +#define APR_STATUS_IS_EAGAIN(s) ((s) == APR_EAGAIN \ + || (s) == EWOULDBLOCK) +#endif + +/** interrupted system call */ +#define APR_STATUS_IS_EINTR(s) ((s) == APR_EINTR) +/** socket operation on a non-socket */ +#define APR_STATUS_IS_ENOTSOCK(s) ((s) == APR_ENOTSOCK) +/** Connection Refused */ +#define APR_STATUS_IS_ECONNREFUSED(s) ((s) == APR_ECONNREFUSED) +/** operation now in progress */ +#define APR_STATUS_IS_EINPROGRESS(s) ((s) == APR_EINPROGRESS) + +/** + * Software caused connection abort + * @remark + * EPROTO on certain older kernels really means ECONNABORTED, so we need to + * ignore it for them. See discussion in new-httpd archives nh.9701 & nh.9603 + * + * There is potentially a bug in Solaris 2.x x<6, and other boxes that + * implement tcp sockets in userland (i.e. on top of STREAMS). On these + * systems, EPROTO can actually result in a fatal loop. 
See PR#981 for + * example. It's hard to handle both uses of EPROTO. + */ +#ifdef EPROTO +#define APR_STATUS_IS_ECONNABORTED(s) ((s) == APR_ECONNABORTED \ + || (s) == EPROTO) +#else +#define APR_STATUS_IS_ECONNABORTED(s) ((s) == APR_ECONNABORTED) +#endif + +/** Connection Reset by peer */ +#define APR_STATUS_IS_ECONNRESET(s) ((s) == APR_ECONNRESET) +/** Operation timed out + * @deprecated */ +#define APR_STATUS_IS_ETIMEDOUT(s) ((s) == APR_ETIMEDOUT) +/** no route to host */ +#define APR_STATUS_IS_EHOSTUNREACH(s) ((s) == APR_EHOSTUNREACH) +/** network is unreachable */ +#define APR_STATUS_IS_ENETUNREACH(s) ((s) == APR_ENETUNREACH) +/** inappropriate file type or format */ +#define APR_STATUS_IS_EFTYPE(s) ((s) == APR_EFTYPE) +/** broken pipe */ +#define APR_STATUS_IS_EPIPE(s) ((s) == APR_EPIPE) +/** cross device link */ +#define APR_STATUS_IS_EXDEV(s) ((s) == APR_EXDEV) +/** Directory Not Empty */ +#define APR_STATUS_IS_ENOTEMPTY(s) ((s) == APR_ENOTEMPTY || \ + (s) == APR_EEXIST) +/** @} */ + +#endif /* !defined(NETWARE) && !defined(OS2) && !defined(WIN32) */ + +/** @} */ + +#ifdef __cplusplus +} +#endif + +#endif /* ! APR_ERRNO_H */ +#ifndef _LINUX_ERRNO_H +#define _LINUX_ERRNO_H + +#include <asm/errno.h> + +#ifdef __KERNEL__ + +/* Should never be seen by user programs */ +#define ERESTARTSYS 512 +#define ERESTARTNOINTR 513 +#define ERESTARTNOHAND 514 /* restart if no handler..
*/ +#define ENOIOCTLCMD 515 /* No ioctl command */ +#define ERESTART_RESTARTBLOCK 516 /* restart by calling sys_restart_syscall */ + +/* Defined for the NFSv3 protocol */ +#define EBADHANDLE 521 /* Illegal NFS file handle */ +#define ENOTSYNC 522 /* Update synchronization mismatch */ +#define EBADCOOKIE 523 /* Cookie is stale */ +#define ENOTSUPP 524 /* Operation is not supported */ +#define ETOOSMALL 525 /* Buffer or request is too small */ +#define ESERVERFAULT 526 /* An untranslatable error occurred */ +#define EBADTYPE 527 /* Type not supported by server */ +#define EJUKEBOX 528 /* Request initiated, but will not complete before timeout */ +#define EIOCBQUEUED 529 /* iocb queued, will get completion event */ +#define EIOCBRETRY 530 /* iocb queued, will trigger a retry */ + +#endif + +#endif +// Copyright (c) 1994 James Clark +// See the file COPYING for copying permission. + +#ifndef ErrnoMessageArg_INCLUDED +#define ErrnoMessageArg_INCLUDED 1 + +#include "MessageArg.h" +#include "rtti.h" + +#ifdef SP_NAMESPACE +namespace SP_NAMESPACE { +#endif + +class SP_API ErrnoMessageArg : public OtherMessageArg { + RTTI_CLASS +public: + ErrnoMessageArg(int errnum) : errno_(errnum) { } + MessageArg *copy() const; + // errno might be a macro so we must use a different name + int errnum() const; +private: + int errno_; +}; + +inline +int ErrnoMessageArg::errnum() const +{ + return errno_; +} + +#ifdef SP_NAMESPACE +} +#endif + +#endif /* not ErrnoMessageArg_INCLUDED */ +/* Copyright (C) 1991,92,93,94,95,96,97,2002 Free Software Foundation, Inc. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. 
+ + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, write to the Free + Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA + 02111-1307 USA. */ + +/* + * ISO C99 Standard: 7.5 Errors + */ + +#ifndef _ERRNO_H + +/* The includer defined __need_Emath if he wants only the definitions + of EDOM and ERANGE, and not everything else. */ +#ifndef __need_Emath +# define _ERRNO_H 1 +# include <features.h> +#endif + +__BEGIN_DECLS + +/* Get the error number constants from the system-specific file. + This file will test __need_Emath and _ERRNO_H. */ +#include <bits/errno.h> +#undef __need_Emath + +#ifdef _ERRNO_H + +/* Declare the `errno' variable, unless it's defined as a macro by + bits/errno.h. This is the case in GNU, where it is a per-thread + variable. This redeclaration using the macro still works, but it + will be a function declaration without a prototype and may trigger + a -Wstrict-prototypes warning. */ +#ifndef errno +extern int errno; +#endif + +#ifdef __USE_GNU + +/* The full and simple forms of the name with which the program was + invoked. These variables are set up automatically at startup based on + the value of ARGV[0] (this works only if you use GNU ld). */ +extern char *program_invocation_name, *program_invocation_short_name; +#endif /* __USE_GNU */ +#endif /* _ERRNO_H */ + +__END_DECLS + +#endif /* _ERRNO_H */ + +/* The Hurd <bits/errno.h> defines `error_t' as an enumerated type so + that printing `error_t' values in the debugger shows the names. We + might need this definition sometimes even if this file was included + before.
*/ +#if defined __USE_GNU || defined __need_error_t +# ifndef __error_t_defined +typedef int error_t; +# define __error_t_defined 1 +# endif +# undef __need_error_t +#endif +#ifndef _I386_ERRNO_H +#define _I386_ERRNO_H + +#include <asm-generic/errno.h> + +#endif +#ifndef _ASM_GENERIC_ERRNO_BASE_H +#define _ASM_GENERIC_ERRNO_BASE_H + +#define EPERM 1 /* Operation not permitted */ +#define ENOENT 2 /* No such file or directory */ +#define ESRCH 3 /* No such process */ +#define EINTR 4 /* Interrupted system call */ +#define EIO 5 /* I/O error */ +#define ENXIO 6 /* No such device or address */ +#define E2BIG 7 /* Argument list too long */ +#define ENOEXEC 8 /* Exec format error */ +#define EBADF 9 /* Bad file number */ +#define ECHILD 10 /* No child processes */ +#define EAGAIN 11 /* Try again */ +#define ENOMEM 12 /* Out of memory */ +#define EACCES 13 /* Permission denied */ +#define EFAULT 14 /* Bad address */ +#define ENOTBLK 15 /* Block device required */ +#define EBUSY 16 /* Device or resource busy */ +#define EEXIST 17 /* File exists */ +#define EXDEV 18 /* Cross-device link */ +#define ENODEV 19 /* No such device */ +#define ENOTDIR 20 /* Not a directory */ +#define EISDIR 21 /* Is a directory */ +#define EINVAL 22 /* Invalid argument */ +#define ENFILE 23 /* File table overflow */ +#define EMFILE 24 /* Too many open files */ +#define ENOTTY 25 /* Not a typewriter */ +#define ETXTBSY 26 /* Text file busy */ +#define EFBIG 27 /* File too large */ +#define ENOSPC 28 /* No space left on device */ +#define ESPIPE 29 /* Illegal seek */ +#define EROFS 30 /* Read-only file system */ +#define EMLINK 31 /* Too many links */ +#define EPIPE 32 /* Broken pipe */ +#define EDOM 33 /* Math argument out of domain of func */ +#define ERANGE 34 /* Math result not representable */ + +#endif +#ifndef _ASM_GENERIC_ERRNO_H +#define _ASM_GENERIC_ERRNO_H + +#include <asm-generic/errno-base.h> + +#define EDEADLK 35 /* Resource deadlock would occur */ +#define ENAMETOOLONG 36 /* File name too long */ +#define ENOLCK 37 /* No record
locks available */ +#define ENOSYS 38 /* Function not implemented */ +#define ENOTEMPTY 39 /* Directory not empty */ +#define ELOOP 40 /* Too many symbolic links encountered */ +#define EWOULDBLOCK EAGAIN /* Operation would block */ +#define ENOMSG 42 /* No message of desired type */ +#define EIDRM 43 /* Identifier removed */ +#define ECHRNG 44 /* Channel number out of range */ +#define EL2NSYNC 45 /* Level 2 not synchronized */ +#define EL3HLT 46 /* Level 3 halted */ +#define EL3RST 47 /* Level 3 reset */ +#define ELNRNG 48 /* Link number out of range */ +#define EUNATCH 49 /* Protocol driver not attached */ +#define ENOCSI 50 /* No CSI structure available */ +#define EL2HLT 51 /* Level 2 halted */ +#define EBADE 52 /* Invalid exchange */ +#define EBADR 53 /* Invalid request descriptor */ +#define EXFULL 54 /* Exchange full */ +#define ENOANO 55 /* No anode */ +#define EBADRQC 56 /* Invalid request code */ +#define EBADSLT 57 /* Invalid slot */ + +#define EDEADLOCK EDEADLK + +#define EBFONT 59 /* Bad font file format */ +#define ENOSTR 60 /* Device not a stream */ +#define ENODATA 61 /* No data available */ +#define ETIME 62 /* Timer expired */ +#define ENOSR 63 /* Out of streams resources */ +#define ENONET 64 /* Machine is not on the network */ +#define ENOPKG 65 /* Package not installed */ +#define EREMOTE 66 /* Object is remote */ +#define ENOLINK 67 /* Link has been severed */ +#define EADV 68 /* Advertise error */ +#define ESRMNT 69 /* Srmount error */ +#define ECOMM 70 /* Communication error on send */ +#define EPROTO 71 /* Protocol error */ +#define EMULTIHOP 72 /* Multihop attempted */ +#define EDOTDOT 73 /* RFS specific error */ +#define EBADMSG 74 /* Not a data message */ +#define EOVERFLOW 75 /* Value too large for defined data type */ +#define ENOTUNIQ 76 /* Name not unique on network */ +#define EBADFD 77 /* File descriptor in bad state */ +#define EREMCHG 78 /* Remote address changed */ +#define ELIBACC 79 /* Can not access a needed shared library 
*/ +#define ELIBBAD 80 /* Accessing a corrupted shared library */ +#define ELIBSCN 81 /* .lib section in a.out corrupted */ +#define ELIBMAX 82 /* Attempting to link in too many shared libraries */ +#define ELIBEXEC 83 /* Cannot exec a shared library directly */ +#define EILSEQ 84 /* Illegal byte sequence */ +#define ERESTART 85 /* Interrupted system call should be restarted */ +#define ESTRPIPE 86 /* Streams pipe error */ +#define EUSERS 87 /* Too many users */ +#define ENOTSOCK 88 /* Socket operation on non-socket */ +#define EDESTADDRREQ 89 /* Destination address required */ +#define EMSGSIZE 90 /* Message too long */ +#define EPROTOTYPE 91 /* Protocol wrong type for socket */ +#define ENOPROTOOPT 92 /* Protocol not available */ +#define EPROTONOSUPPORT 93 /* Protocol not supported */ +#define ESOCKTNOSUPPORT 94 /* Socket type not supported */ +#define EOPNOTSUPP 95 /* Operation not supported on transport endpoint */ +#define EPFNOSUPPORT 96 /* Protocol family not supported */ +#define EAFNOSUPPORT 97 /* Address family not supported by protocol */ +#define EADDRINUSE 98 /* Address already in use */ +#define EADDRNOTAVAIL 99 /* Cannot assign requested address */ +#define ENETDOWN 100 /* Network is down */ +#define ENETUNREACH 101 /* Network is unreachable */ +#define ENETRESET 102 /* Network dropped connection because of reset */ +#define ECONNABORTED 103 /* Software caused connection abort */ +#define ECONNRESET 104 /* Connection reset by peer */ +#define ENOBUFS 105 /* No buffer space available */ +#define EISCONN 106 /* Transport endpoint is already connected */ +#define ENOTCONN 107 /* Transport endpoint is not connected */ +#define ESHUTDOWN 108 /* Cannot send after transport endpoint shutdown */ +#define ETOOMANYREFS 109 /* Too many references: cannot splice */ +#define ETIMEDOUT 110 /* Connection timed out */ +#define ECONNREFUSED 111 /* Connection refused */ +#define EHOSTDOWN 112 /* Host is down */ +#define EHOSTUNREACH 113 /* No route to host */ 
+#define EALREADY 114 /* Operation already in progress */ +#define EINPROGRESS 115 /* Operation now in progress */ +#define ESTALE 116 /* Stale NFS file handle */ +#define EUCLEAN 117 /* Structure needs cleaning */ +#define ENOTNAM 118 /* Not a XENIX named type file */ +#define ENAVAIL 119 /* No XENIX semaphores available */ +#define EISNAM 120 /* Is a named type file */ +#define EREMOTEIO 121 /* Remote I/O error */ +#define EDQUOT 122 /* Quota exceeded */ + +#define ENOMEDIUM 123 /* No medium found */ +#define EMEDIUMTYPE 124 /* Wrong medium type */ +#define ECANCELED 125 /* Operation Canceled */ +#define ENOKEY 126 /* Required key not available */ +#define EKEYEXPIRED 127 /* Key has expired */ +#define EKEYREVOKED 128 /* Key has been revoked */ +#define EKEYREJECTED 129 /* Key was rejected by service */ + +/* for robust mutexes */ +#define EOWNERDEAD 130 /* Owner died */ +#define ENOTRECOVERABLE 131 /* State not recoverable */ + +#endif diff --git a/doc/errno.list.macosx.txt b/doc/errno.list.macosx.txt new file mode 100644 index 000000000..728753ac7 --- /dev/null +++ b/doc/errno.list.macosx.txt @@ -0,0 +1,1513 @@ +/* Copyright 2000-2005 The Apache Software Foundation or its licensors, as + * applicable. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#ifndef APR_ERRNO_H +#define APR_ERRNO_H + +/** + * @file apr_errno.h + * @brief APR Error Codes + */ + +#include "apr.h" + +#if APR_HAVE_ERRNO_H +#include <errno.h> +#endif + +#ifdef __cplusplus +extern "C" { +#endif /* __cplusplus */ + +/** + * @defgroup apr_errno Error Codes + * @ingroup APR + * @{ + */ + +/** + * Type for specifying an error or status code. + */ +typedef int apr_status_t; + +/** + * Return a human readable string describing the specified error. + * @param statcode The error code to get a string for. + * @param buf A buffer to hold the error string. + * @param bufsize Size of the buffer to hold the string. + */ +APR_DECLARE(char *) apr_strerror(apr_status_t statcode, char *buf, + apr_size_t bufsize); + +#if defined(DOXYGEN) +/** + * @def APR_FROM_OS_ERROR(os_err_type syserr) + * Fold a platform specific error into an apr_status_t code. + * @return apr_status_t + * @param e The platform os error code. + * @warning macro implementation; the syserr argument may be evaluated + * multiple times. + */ +#define APR_FROM_OS_ERROR(e) (e == 0 ? APR_SUCCESS : e + APR_OS_START_SYSERR) + +/** + * @def APR_TO_OS_ERROR(apr_status_t statcode) + * @return os_err_type + * Fold an apr_status_t code back to the native platform defined error. + * @param e The apr_status_t folded platform os error code. + * @warning macro implementation; the statcode argument may be evaluated + * multiple times. If the statcode was not created by apr_get_os_error + * or APR_FROM_OS_ERROR, the results are undefined. + */ +#define APR_TO_OS_ERROR(e) (e == 0 ? APR_SUCCESS : e - APR_OS_START_SYSERR) + +/** @def apr_get_os_error() + * @return apr_status_t the last platform error, folded into apr_status_t, on most platforms + * @remark This retrieves errno, or calls a GetLastError() style function, and + * folds it with APR_FROM_OS_ERROR. Some platforms (such as OS2) have no + * such mechanism, so this call may be unsupported.
Do NOT use this + * call for socket errors from socket, send, recv etc! + */ + +/** @def apr_set_os_error(e) + * Reset the last platform error, unfolded from an apr_status_t, on some platforms + * @param e The OS error folded in a prior call to APR_FROM_OS_ERROR() + * @warning This is a macro implementation; the statcode argument may be evaluated + * multiple times. If the statcode was not created by apr_get_os_error + * or APR_FROM_OS_ERROR, the results are undefined. This macro sets + * errno, or calls a SetLastError() style function, unfolding statcode + * with APR_TO_OS_ERROR. Some platforms (such as OS2) have no such + * mechanism, so this call may be unsupported. + */ + +/** @def apr_get_netos_error() + * Return the last socket error, folded into apr_status_t, on all platforms + * @remark This retrieves errno or calls a GetLastSocketError() style function, + * and folds it with APR_FROM_OS_ERROR. + */ + +/** @def apr_set_netos_error(e) + * Reset the last socket error, unfolded from an apr_status_t + * @param e The socket error folded in a prior call to APR_FROM_OS_ERROR() + * @warning This is a macro implementation; the statcode argument may be evaluated + * multiple times. If the statcode was not created by apr_get_os_error + * or APR_FROM_OS_ERROR, the results are undefined. This macro sets + * errno, or calls a WSASetLastError() style function, unfolding + * socketcode with APR_TO_OS_ERROR. + */ + +#endif /* defined(DOXYGEN) */ + +/** + * APR_OS_START_ERROR is where the APR specific error values start. + */ +#define APR_OS_START_ERROR 20000 +/** + * APR_OS_ERRSPACE_SIZE is the maximum number of errors you can fit + * into one of the error/status ranges below -- except for + * APR_OS_START_USERERR, which see. + */ +#define APR_OS_ERRSPACE_SIZE 50000 +/** + * APR_OS_START_STATUS is where the APR specific status codes start. 
+ */ +#define APR_OS_START_STATUS (APR_OS_START_ERROR + APR_OS_ERRSPACE_SIZE) +/** + * APR_OS_START_USERERR are reserved for applications that use APR that + * layer their own error codes along with APR's. Note that the + * error immediately following this one is set ten times farther + * away than usual, so that users of apr have a lot of room in + * which to declare custom error codes. + */ +#define APR_OS_START_USERERR (APR_OS_START_STATUS + APR_OS_ERRSPACE_SIZE) +/** + * APR_OS_START_USEERR is obsolete, defined for compatibility only. + * Use APR_OS_START_USERERR instead. + */ +#define APR_OS_START_USEERR APR_OS_START_USERERR +/** + * APR_OS_START_CANONERR is where APR versions of errno values are defined + * on systems which don't have the corresponding errno. + */ +#define APR_OS_START_CANONERR (APR_OS_START_USERERR \ + + (APR_OS_ERRSPACE_SIZE * 10)) +/** + * APR_OS_START_EAIERR folds EAI_ error codes from getaddrinfo() into + * apr_status_t values. + */ +#define APR_OS_START_EAIERR (APR_OS_START_CANONERR + APR_OS_ERRSPACE_SIZE) +/** + * APR_OS_START_SYSERR folds platform-specific system error values into + * apr_status_t values. + */ +#define APR_OS_START_SYSERR (APR_OS_START_EAIERR + APR_OS_ERRSPACE_SIZE) + +/** no error. */ +#define APR_SUCCESS 0 + +/** + * @defgroup APR_Error APR Error Values + *
+ * APR ERROR VALUES
+ * APR_ENOSTAT      APR was unable to perform a stat on the file 
+ * APR_ENOPOOL      APR was not provided a pool with which to allocate memory
+ * APR_EBADDATE     APR was given an invalid date 
+ * APR_EINVALSOCK   APR was given an invalid socket
+ * APR_ENOPROC      APR was not given a process structure
+ * APR_ENOTIME      APR was not given a time structure
+ * APR_ENODIR       APR was not given a directory structure
+ * APR_ENOLOCK      APR was not given a lock structure
+ * APR_ENOPOLL      APR was not given a poll structure
+ * APR_ENOSOCKET    APR was not given a socket
+ * APR_ENOTHREAD    APR was not given a thread structure
+ * APR_ENOTHDKEY    APR was not given a thread key structure
+ * APR_ENOSHMAVAIL  There is no more shared memory available
+ * APR_EDSOOPEN     APR was unable to open the dso object.  For more 
+ *                  information call apr_dso_error().
+ * APR_EGENERAL     General failure (specific information not available)
+ * APR_EBADIP       The specified IP address is invalid
+ * APR_EBADMASK     The specified netmask is invalid
+ * APR_ESYMNOTFOUND Could not find the requested symbol
+ * 
+ * + *
+ * APR STATUS VALUES
+ * APR_INCHILD        Program is currently executing in the child
+ * APR_INPARENT       Program is currently executing in the parent
+ * APR_DETACH         The thread is detached
+ * APR_NOTDETACH      The thread is not detached
+ * APR_CHILD_DONE     The child has finished executing
+ * APR_CHILD_NOTDONE  The child has not finished executing
+ * APR_TIMEUP         The operation did not finish before the timeout
+ * APR_INCOMPLETE     The operation was incomplete although some processing
+ *                    was performed and the results are partially valid
+ * APR_BADCH          Getopt found an option not in the option string
+ * APR_BADARG         Getopt found an option that is missing an argument 
+ *                    and an argument was specified in the option string
+ * APR_EOF            APR has encountered the end of the file
+ * APR_NOTFOUND       APR was unable to find the socket in the poll structure
+ * APR_ANONYMOUS      APR is using anonymous shared memory
+ * APR_FILEBASED      APR is using a file name as the key to the shared memory
+ * APR_KEYBASED       APR is using a shared key as the key to the shared memory
+ * APR_EINIT          Initializer value.  If no option has been found, but 
+ *                    the status variable requires a value, this should be used
+ * APR_ENOTIMPL       The APR function has not been implemented on this 
+ *                    platform, either because nobody has gotten to it yet, 
+ *                    or the function is impossible on this platform.
+ * APR_EMISMATCH      Two passwords do not match.
+ * APR_EABSOLUTE      The given path was absolute.
+ * APR_ERELATIVE      The given path was relative.
+ * APR_EINCOMPLETE    The given path was neither relative nor absolute.
+ * APR_EABOVEROOT     The given path was above the root path.
+ * APR_EBUSY          The given lock was busy.
+ * APR_EPROC_UNKNOWN  The given process wasn't recognized by APR
+ * 
+ * @{ + */ +/** @see APR_STATUS_IS_ENOSTAT */ +#define APR_ENOSTAT (APR_OS_START_ERROR + 1) +/** @see APR_STATUS_IS_ENOPOOL */ +#define APR_ENOPOOL (APR_OS_START_ERROR + 2) +/* empty slot: +3 */ +/** @see APR_STATUS_IS_EBADDATE */ +#define APR_EBADDATE (APR_OS_START_ERROR + 4) +/** @see APR_STATUS_IS_EINVALSOCK */ +#define APR_EINVALSOCK (APR_OS_START_ERROR + 5) +/** @see APR_STATUS_IS_ENOPROC */ +#define APR_ENOPROC (APR_OS_START_ERROR + 6) +/** @see APR_STATUS_IS_ENOTIME */ +#define APR_ENOTIME (APR_OS_START_ERROR + 7) +/** @see APR_STATUS_IS_ENODIR */ +#define APR_ENODIR (APR_OS_START_ERROR + 8) +/** @see APR_STATUS_IS_ENOLOCK */ +#define APR_ENOLOCK (APR_OS_START_ERROR + 9) +/** @see APR_STATUS_IS_ENOPOLL */ +#define APR_ENOPOLL (APR_OS_START_ERROR + 10) +/** @see APR_STATUS_IS_ENOSOCKET */ +#define APR_ENOSOCKET (APR_OS_START_ERROR + 11) +/** @see APR_STATUS_IS_ENOTHREAD */ +#define APR_ENOTHREAD (APR_OS_START_ERROR + 12) +/** @see APR_STATUS_IS_ENOTHDKEY */ +#define APR_ENOTHDKEY (APR_OS_START_ERROR + 13) +/** @see APR_STATUS_IS_EGENERAL */ +#define APR_EGENERAL (APR_OS_START_ERROR + 14) +/** @see APR_STATUS_IS_ENOSHMAVAIL */ +#define APR_ENOSHMAVAIL (APR_OS_START_ERROR + 15) +/** @see APR_STATUS_IS_EBADIP */ +#define APR_EBADIP (APR_OS_START_ERROR + 16) +/** @see APR_STATUS_IS_EBADMASK */ +#define APR_EBADMASK (APR_OS_START_ERROR + 17) +/* empty slot: +18 */ +/** @see APR_STATUS_IS_EDSOPEN */ +#define APR_EDSOOPEN (APR_OS_START_ERROR + 19) +/** @see APR_STATUS_IS_EABSOLUTE */ +#define APR_EABSOLUTE (APR_OS_START_ERROR + 20) +/** @see APR_STATUS_IS_ERELATIVE */ +#define APR_ERELATIVE (APR_OS_START_ERROR + 21) +/** @see APR_STATUS_IS_EINCOMPLETE */ +#define APR_EINCOMPLETE (APR_OS_START_ERROR + 22) +/** @see APR_STATUS_IS_EABOVEROOT */ +#define APR_EABOVEROOT (APR_OS_START_ERROR + 23) +/** @see APR_STATUS_IS_EBADPATH */ +#define APR_EBADPATH (APR_OS_START_ERROR + 24) +/** @see APR_STATUS_IS_EPATHWILD */ +#define APR_EPATHWILD (APR_OS_START_ERROR + 25) +/** 
@see APR_STATUS_IS_ESYMNOTFOUND */ +#define APR_ESYMNOTFOUND (APR_OS_START_ERROR + 26) +/** @see APR_STATUS_IS_EPROC_UNKNOWN */ +#define APR_EPROC_UNKNOWN (APR_OS_START_ERROR + 27) +/** @see APR_STATUS_IS_ENOTENOUGHENTROPY */ +#define APR_ENOTENOUGHENTROPY (APR_OS_START_ERROR + 28) +/** @} */ + +/** + * @defgroup APR_STATUS_IS Status Value Tests + * @warning For any particular error condition, more than one of these tests + * may match. This is because platform-specific error codes may not + * always match the semantics of the POSIX codes these tests (and the + * corresponding APR error codes) are named after. A notable example + * are the APR_STATUS_IS_ENOENT and APR_STATUS_IS_ENOTDIR tests on + * Win32 platforms. The programmer should always be aware of this and + * adjust the order of the tests accordingly. + * @{ + */ +/** + * APR was unable to perform a stat on the file + * @warning always use this test, as platform-specific variances may meet this + * more than one error code + */ +#define APR_STATUS_IS_ENOSTAT(s) ((s) == APR_ENOSTAT) +/** + * APR was not provided a pool with which to allocate memory + * @warning always use this test, as platform-specific variances may meet this + * more than one error code + */ +#define APR_STATUS_IS_ENOPOOL(s) ((s) == APR_ENOPOOL) +/** APR was given an invalid date */ +#define APR_STATUS_IS_EBADDATE(s) ((s) == APR_EBADDATE) +/** APR was given an invalid socket */ +#define APR_STATUS_IS_EINVALSOCK(s) ((s) == APR_EINVALSOCK) +/** APR was not given a process structure */ +#define APR_STATUS_IS_ENOPROC(s) ((s) == APR_ENOPROC) +/** APR was not given a time structure */ +#define APR_STATUS_IS_ENOTIME(s) ((s) == APR_ENOTIME) +/** APR was not given a directory structure */ +#define APR_STATUS_IS_ENODIR(s) ((s) == APR_ENODIR) +/** APR was not given a lock structure */ +#define APR_STATUS_IS_ENOLOCK(s) ((s) == APR_ENOLOCK) +/** APR was not given a poll structure */ +#define APR_STATUS_IS_ENOPOLL(s) ((s) == APR_ENOPOLL) +/** APR was 
not given a socket */ +#define APR_STATUS_IS_ENOSOCKET(s) ((s) == APR_ENOSOCKET) +/** APR was not given a thread structure */ +#define APR_STATUS_IS_ENOTHREAD(s) ((s) == APR_ENOTHREAD) +/** APR was not given a thread key structure */ +#define APR_STATUS_IS_ENOTHDKEY(s) ((s) == APR_ENOTHDKEY) +/** Generic Error which can not be put into another spot */ +#define APR_STATUS_IS_EGENERAL(s) ((s) == APR_EGENERAL) +/** There is no more shared memory available */ +#define APR_STATUS_IS_ENOSHMAVAIL(s) ((s) == APR_ENOSHMAVAIL) +/** The specified IP address is invalid */ +#define APR_STATUS_IS_EBADIP(s) ((s) == APR_EBADIP) +/** The specified netmask is invalid */ +#define APR_STATUS_IS_EBADMASK(s) ((s) == APR_EBADMASK) +/* empty slot: +18 */ +/** + * APR was unable to open the dso object. + * For more information call apr_dso_error(). + */ +#if defined(WIN32) +#define APR_STATUS_IS_EDSOOPEN(s) ((s) == APR_EDSOOPEN \ + || APR_TO_OS_ERROR(s) == ERROR_MOD_NOT_FOUND) +#else +#define APR_STATUS_IS_EDSOOPEN(s) ((s) == APR_EDSOOPEN) +#endif +/** The given path was absolute. */ +#define APR_STATUS_IS_EABSOLUTE(s) ((s) == APR_EABSOLUTE) +/** The given path was relative. */ +#define APR_STATUS_IS_ERELATIVE(s) ((s) == APR_ERELATIVE) +/** The given path was neither relative nor absolute. */ +#define APR_STATUS_IS_EINCOMPLETE(s) ((s) == APR_EINCOMPLETE) +/** The given path was above the root path. */ +#define APR_STATUS_IS_EABOVEROOT(s) ((s) == APR_EABOVEROOT) +/** The given path was bad. */ +#define APR_STATUS_IS_EBADPATH(s) ((s) == APR_EBADPATH) +/** The given path contained wildcards. */ +#define APR_STATUS_IS_EPATHWILD(s) ((s) == APR_EPATHWILD) +/** Could not find the requested symbol. + * For more information call apr_dso_error(). 
+ */ +#if defined(WIN32) +#define APR_STATUS_IS_ESYMNOTFOUND(s) ((s) == APR_ESYMNOTFOUND \ + || APR_TO_OS_ERROR(s) == ERROR_PROC_NOT_FOUND) +#else +#define APR_STATUS_IS_ESYMNOTFOUND(s) ((s) == APR_ESYMNOTFOUND) +#endif +/** The given process was not recognized by APR. */ +#define APR_STATUS_IS_EPROC_UNKNOWN(s) ((s) == APR_EPROC_UNKNOWN) + +/** APR could not gather enough entropy to continue. */ +#define APR_STATUS_IS_ENOTENOUGHENTROPY(s) ((s) == APR_ENOTENOUGHENTROPY) + +/** @} */ + +/** + * @addtogroup APR_Error + * @{ + */ +/** @see APR_STATUS_IS_INCHILD */ +#define APR_INCHILD (APR_OS_START_STATUS + 1) +/** @see APR_STATUS_IS_INPARENT */ +#define APR_INPARENT (APR_OS_START_STATUS + 2) +/** @see APR_STATUS_IS_DETACH */ +#define APR_DETACH (APR_OS_START_STATUS + 3) +/** @see APR_STATUS_IS_NOTDETACH */ +#define APR_NOTDETACH (APR_OS_START_STATUS + 4) +/** @see APR_STATUS_IS_CHILD_DONE */ +#define APR_CHILD_DONE (APR_OS_START_STATUS + 5) +/** @see APR_STATUS_IS_CHILD_NOTDONE */ +#define APR_CHILD_NOTDONE (APR_OS_START_STATUS + 6) +/** @see APR_STATUS_IS_TIMEUP */ +#define APR_TIMEUP (APR_OS_START_STATUS + 7) +/** @see APR_STATUS_IS_INCOMPLETE */ +#define APR_INCOMPLETE (APR_OS_START_STATUS + 8) +/* empty slot: +9 */ +/* empty slot: +10 */ +/* empty slot: +11 */ +/** @see APR_STATUS_IS_BADCH */ +#define APR_BADCH (APR_OS_START_STATUS + 12) +/** @see APR_STATUS_IS_BADARG */ +#define APR_BADARG (APR_OS_START_STATUS + 13) +/** @see APR_STATUS_IS_EOF */ +#define APR_EOF (APR_OS_START_STATUS + 14) +/** @see APR_STATUS_IS_NOTFOUND */ +#define APR_NOTFOUND (APR_OS_START_STATUS + 15) +/* empty slot: +16 */ +/* empty slot: +17 */ +/* empty slot: +18 */ +/** @see APR_STATUS_IS_ANONYMOUS */ +#define APR_ANONYMOUS (APR_OS_START_STATUS + 19) +/** @see APR_STATUS_IS_FILEBASED */ +#define APR_FILEBASED (APR_OS_START_STATUS + 20) +/** @see APR_STATUS_IS_KEYBASED */ +#define APR_KEYBASED (APR_OS_START_STATUS + 21) +/** @see APR_STATUS_IS_EINIT */ +#define APR_EINIT 
(APR_OS_START_STATUS + 22) +/** @see APR_STATUS_IS_ENOTIMPL */ +#define APR_ENOTIMPL (APR_OS_START_STATUS + 23) +/** @see APR_STATUS_IS_EMISMATCH */ +#define APR_EMISMATCH (APR_OS_START_STATUS + 24) +/** @see APR_STATUS_IS_EBUSY */ +#define APR_EBUSY (APR_OS_START_STATUS + 25) +/** @} */ + +/** + * @addtogroup APR_STATUS_IS + * @{ + */ +/** + * Program is currently executing in the child + * @warning + * always use this test, as platform-specific variances may meet this + * more than one error code */ +#define APR_STATUS_IS_INCHILD(s) ((s) == APR_INCHILD) +/** + * Program is currently executing in the parent + * @warning + * always use this test, as platform-specific variances may meet this + * more than one error code + */ +#define APR_STATUS_IS_INPARENT(s) ((s) == APR_INPARENT) +/** + * The thread is detached + * @warning + * always use this test, as platform-specific variances may meet this + * more than one error code + */ +#define APR_STATUS_IS_DETACH(s) ((s) == APR_DETACH) +/** + * The thread is not detached + * @warning + * always use this test, as platform-specific variances may meet this + * more than one error code + */ +#define APR_STATUS_IS_NOTDETACH(s) ((s) == APR_NOTDETACH) +/** + * The child has finished executing + * @warning + * always use this test, as platform-specific variances may meet this + * more than one error code + */ +#define APR_STATUS_IS_CHILD_DONE(s) ((s) == APR_CHILD_DONE) +/** + * The child has not finished executing + * @warning + * always use this test, as platform-specific variances may meet this + * more than one error code + */ +#define APR_STATUS_IS_CHILD_NOTDONE(s) ((s) == APR_CHILD_NOTDONE) +/** + * The operation did not finish before the timeout + * @warning + * always use this test, as platform-specific variances may meet this + * more than one error code + */ +#define APR_STATUS_IS_TIMEUP(s) ((s) == APR_TIMEUP) +/** + * The operation was incomplete although some processing was performed + * and the results are partially 
valid. + * @warning + * always use this test, as platform-specific variances may meet this + * more than one error code + */ +#define APR_STATUS_IS_INCOMPLETE(s) ((s) == APR_INCOMPLETE) +/* empty slot: +9 */ +/* empty slot: +10 */ +/* empty slot: +11 */ +/** + * Getopt found an option not in the option string + * @warning + * always use this test, as platform-specific variances may meet this + * more than one error code + */ +#define APR_STATUS_IS_BADCH(s) ((s) == APR_BADCH) +/** + * Getopt found an option that is missing an argument and an argument was + * specified in the option string + * @warning + * always use this test, as platform-specific variances may meet this + * more than one error code + */ +#define APR_STATUS_IS_BADARG(s) ((s) == APR_BADARG) +/** + * APR has encountered the end of the file + * @warning + * always use this test, as platform-specific variances may meet this + * more than one error code + */ +#define APR_STATUS_IS_EOF(s) ((s) == APR_EOF) +/** + * APR was unable to find the socket in the poll structure + * @warning + * always use this test, as platform-specific variances may meet this + * more than one error code + */ +#define APR_STATUS_IS_NOTFOUND(s) ((s) == APR_NOTFOUND) +/* empty slot: +16 */ +/* empty slot: +17 */ +/* empty slot: +18 */ +/** + * APR is using anonymous shared memory + * @warning + * always use this test, as platform-specific variances may meet this + * more than one error code + */ +#define APR_STATUS_IS_ANONYMOUS(s) ((s) == APR_ANONYMOUS) +/** + * APR is using a file name as the key to the shared memory + * @warning + * always use this test, as platform-specific variances may meet this + * more than one error code + */ +#define APR_STATUS_IS_FILEBASED(s) ((s) == APR_FILEBASED) +/** + * APR is using a shared key as the key to the shared memory + * @warning + * always use this test, as platform-specific variances may meet this + * more than one error code + */ +#define APR_STATUS_IS_KEYBASED(s) ((s) == APR_KEYBASED) +/** 
+ * Initializer value. If no option has been found, but + * the status variable requires a value, this should be used + * @warning + * always use this test, as platform-specific variances may meet this + * more than one error code + */ +#define APR_STATUS_IS_EINIT(s) ((s) == APR_EINIT) +/** + * The APR function has not been implemented on this + * platform, either because nobody has gotten to it yet, + * or the function is impossible on this platform. + * @warning + * always use this test, as platform-specific variances may meet this + * more than one error code + */ +#define APR_STATUS_IS_ENOTIMPL(s) ((s) == APR_ENOTIMPL) +/** + * Two passwords do not match. + * @warning + * always use this test, as platform-specific variances may meet this + * more than one error code + */ +#define APR_STATUS_IS_EMISMATCH(s) ((s) == APR_EMISMATCH) +/** + * The given lock was busy + * @warning always use this test, as platform-specific variances may meet this + * more than one error code + */ +#define APR_STATUS_IS_EBUSY(s) ((s) == APR_EBUSY) + +/** @} */ + +/** + * @addtogroup APR_Error APR Error Values + * @{ + */ +/* APR CANONICAL ERROR VALUES */ +/** @see APR_STATUS_IS_EACCES */ +#ifdef EACCES +#define APR_EACCES EACCES +#else +#define APR_EACCES (APR_OS_START_CANONERR + 1) +#endif + +/** @see APR_STATUS_IS_EEXIST */ +#ifdef EEXIST +#define APR_EEXIST EEXIST +#else +#define APR_EEXIST (APR_OS_START_CANONERR + 2) +#endif + +/** @see APR_STATUS_IS_ENAMETOOLONG */ +#ifdef ENAMETOOLONG +#define APR_ENAMETOOLONG ENAMETOOLONG +#else +#define APR_ENAMETOOLONG (APR_OS_START_CANONERR + 3) +#endif + +/** @see APR_STATUS_IS_ENOENT */ +#ifdef ENOENT +#define APR_ENOENT ENOENT +#else +#define APR_ENOENT (APR_OS_START_CANONERR + 4) +#endif + +/** @see APR_STATUS_IS_ENOTDIR */ +#ifdef ENOTDIR +#define APR_ENOTDIR ENOTDIR +#else +#define APR_ENOTDIR (APR_OS_START_CANONERR + 5) +#endif + +/** @see APR_STATUS_IS_ENOSPC */ +#ifdef ENOSPC +#define APR_ENOSPC ENOSPC +#else +#define APR_ENOSPC
(APR_OS_START_CANONERR + 6) +#endif + +/** @see APR_STATUS_IS_ENOMEM */ +#ifdef ENOMEM +#define APR_ENOMEM ENOMEM +#else +#define APR_ENOMEM (APR_OS_START_CANONERR + 7) +#endif + +/** @see APR_STATUS_IS_EMFILE */ +#ifdef EMFILE +#define APR_EMFILE EMFILE +#else +#define APR_EMFILE (APR_OS_START_CANONERR + 8) +#endif + +/** @see APR_STATUS_IS_ENFILE */ +#ifdef ENFILE +#define APR_ENFILE ENFILE +#else +#define APR_ENFILE (APR_OS_START_CANONERR + 9) +#endif + +/** @see APR_STATUS_IS_EBADF */ +#ifdef EBADF +#define APR_EBADF EBADF +#else +#define APR_EBADF (APR_OS_START_CANONERR + 10) +#endif + +/** @see APR_STATUS_IS_EINVAL */ +#ifdef EINVAL +#define APR_EINVAL EINVAL +#else +#define APR_EINVAL (APR_OS_START_CANONERR + 11) +#endif + +/** @see APR_STATUS_IS_ESPIPE */ +#ifdef ESPIPE +#define APR_ESPIPE ESPIPE +#else +#define APR_ESPIPE (APR_OS_START_CANONERR + 12) +#endif + +/** + * @see APR_STATUS_IS_EAGAIN + * @warning use APR_STATUS_IS_EAGAIN instead of just testing this value + */ +#ifdef EAGAIN +#define APR_EAGAIN EAGAIN +#elif defined(EWOULDBLOCK) +#define APR_EAGAIN EWOULDBLOCK +#else +#define APR_EAGAIN (APR_OS_START_CANONERR + 13) +#endif + +/** @see APR_STATUS_IS_EINTR */ +#ifdef EINTR +#define APR_EINTR EINTR +#else +#define APR_EINTR (APR_OS_START_CANONERR + 14) +#endif + +/** @see APR_STATUS_IS_ENOTSOCK */ +#ifdef ENOTSOCK +#define APR_ENOTSOCK ENOTSOCK +#else +#define APR_ENOTSOCK (APR_OS_START_CANONERR + 15) +#endif + +/** @see APR_STATUS_IS_ECONNREFUSED */ +#ifdef ECONNREFUSED +#define APR_ECONNREFUSED ECONNREFUSED +#else +#define APR_ECONNREFUSED (APR_OS_START_CANONERR + 16) +#endif + +/** @see APR_STATUS_IS_EINPROGRESS */ +#ifdef EINPROGRESS +#define APR_EINPROGRESS EINPROGRESS +#else +#define APR_EINPROGRESS (APR_OS_START_CANONERR + 17) +#endif + +/** + * @see APR_STATUS_IS_ECONNABORTED + * @warning use APR_STATUS_IS_ECONNABORTED instead of just testing this value + */ + +#ifdef ECONNABORTED +#define APR_ECONNABORTED ECONNABORTED +#else +#define 
APR_ECONNABORTED (APR_OS_START_CANONERR + 18) +#endif + +/** @see APR_STATUS_IS_ECONNRESET */ +#ifdef ECONNRESET +#define APR_ECONNRESET ECONNRESET +#else +#define APR_ECONNRESET (APR_OS_START_CANONERR + 19) +#endif + +/** @see APR_STATUS_IS_ETIMEDOUT + * @deprecated */ +#ifdef ETIMEDOUT +#define APR_ETIMEDOUT ETIMEDOUT +#else +#define APR_ETIMEDOUT (APR_OS_START_CANONERR + 20) +#endif + +/** @see APR_STATUS_IS_EHOSTUNREACH */ +#ifdef EHOSTUNREACH +#define APR_EHOSTUNREACH EHOSTUNREACH +#else +#define APR_EHOSTUNREACH (APR_OS_START_CANONERR + 21) +#endif + +/** @see APR_STATUS_IS_ENETUNREACH */ +#ifdef ENETUNREACH +#define APR_ENETUNREACH ENETUNREACH +#else +#define APR_ENETUNREACH (APR_OS_START_CANONERR + 22) +#endif + +/** @see APR_STATUS_IS_EFTYPE */ +#ifdef EFTYPE +#define APR_EFTYPE EFTYPE +#else +#define APR_EFTYPE (APR_OS_START_CANONERR + 23) +#endif + +/** @see APR_STATUS_IS_EPIPE */ +#ifdef EPIPE +#define APR_EPIPE EPIPE +#else +#define APR_EPIPE (APR_OS_START_CANONERR + 24) +#endif + +/** @see APR_STATUS_IS_EXDEV */ +#ifdef EXDEV +#define APR_EXDEV EXDEV +#else +#define APR_EXDEV (APR_OS_START_CANONERR + 25) +#endif + +/** @see APR_STATUS_IS_ENOTEMPTY */ +#ifdef ENOTEMPTY +#define APR_ENOTEMPTY ENOTEMPTY +#else +#define APR_ENOTEMPTY (APR_OS_START_CANONERR + 26) +#endif + +/** @} */ + +#if defined(OS2) && !defined(DOXYGEN) + +#define APR_FROM_OS_ERROR(e) (e == 0 ? APR_SUCCESS : e + APR_OS_START_SYSERR) +#define APR_TO_OS_ERROR(e) (e == 0 ? APR_SUCCESS : e - APR_OS_START_SYSERR) + +#define INCL_DOSERRORS +#define INCL_DOS + +/* Leave these undefined. + * OS2 doesn't rely on the errno concept. + * The API calls always return a result codes which + * should be filtered through APR_FROM_OS_ERROR(). 
+ * + * #define apr_get_os_error() (APR_FROM_OS_ERROR(GetLastError())) + * #define apr_set_os_error(e) (SetLastError(APR_TO_OS_ERROR(e))) + */ + +/* A special case, only socket calls require this; + */ +#define apr_get_netos_error() (APR_FROM_OS_ERROR(errno)) +#define apr_set_netos_error(e) (errno = APR_TO_OS_ERROR(e)) + +/* And this needs to be greped away for good: + */ +#define APR_OS2_STATUS(e) (APR_FROM_OS_ERROR(e)) + +/* These can't sit in a private header, so in spite of the extra size, + * they need to be made available here. + */ +#define SOCBASEERR 10000 +#define SOCEPERM (SOCBASEERR+1) /* Not owner */ +#define SOCESRCH (SOCBASEERR+3) /* No such process */ +#define SOCEINTR (SOCBASEERR+4) /* Interrupted system call */ +#define SOCENXIO (SOCBASEERR+6) /* No such device or address */ +#define SOCEBADF (SOCBASEERR+9) /* Bad file number */ +#define SOCEACCES (SOCBASEERR+13) /* Permission denied */ +#define SOCEFAULT (SOCBASEERR+14) /* Bad address */ +#define SOCEINVAL (SOCBASEERR+22) /* Invalid argument */ +#define SOCEMFILE (SOCBASEERR+24) /* Too many open files */ +#define SOCEPIPE (SOCBASEERR+32) /* Broken pipe */ +#define SOCEOS2ERR (SOCBASEERR+100) /* OS/2 Error */ +#define SOCEWOULDBLOCK (SOCBASEERR+35) /* Operation would block */ +#define SOCEINPROGRESS (SOCBASEERR+36) /* Operation now in progress */ +#define SOCEALREADY (SOCBASEERR+37) /* Operation already in progress */ +#define SOCENOTSOCK (SOCBASEERR+38) /* Socket operation on non-socket */ +#define SOCEDESTADDRREQ (SOCBASEERR+39) /* Destination address required */ +#define SOCEMSGSIZE (SOCBASEERR+40) /* Message too long */ +#define SOCEPROTOTYPE (SOCBASEERR+41) /* Protocol wrong type for socket */ +#define SOCENOPROTOOPT (SOCBASEERR+42) /* Protocol not available */ +#define SOCEPROTONOSUPPORT (SOCBASEERR+43) /* Protocol not supported */ +#define SOCESOCKTNOSUPPORT (SOCBASEERR+44) /* Socket type not supported */ +#define SOCEOPNOTSUPP (SOCBASEERR+45) /* Operation not supported on socket */ +#define 
SOCEPFNOSUPPORT (SOCBASEERR+46) /* Protocol family not supported */ +#define SOCEAFNOSUPPORT (SOCBASEERR+47) /* Address family not supported by protocol family */ +#define SOCEADDRINUSE (SOCBASEERR+48) /* Address already in use */ +#define SOCEADDRNOTAVAIL (SOCBASEERR+49) /* Can't assign requested address */ +#define SOCENETDOWN (SOCBASEERR+50) /* Network is down */ +#define SOCENETUNREACH (SOCBASEERR+51) /* Network is unreachable */ +#define SOCENETRESET (SOCBASEERR+52) /* Network dropped connection on reset */ +#define SOCECONNABORTED (SOCBASEERR+53) /* Software caused connection abort */ +#define SOCECONNRESET (SOCBASEERR+54) /* Connection reset by peer */ +#define SOCENOBUFS (SOCBASEERR+55) /* No buffer space available */ +#define SOCEISCONN (SOCBASEERR+56) /* Socket is already connected */ +#define SOCENOTCONN (SOCBASEERR+57) /* Socket is not connected */ +#define SOCESHUTDOWN (SOCBASEERR+58) /* Can't send after socket shutdown */ +#define SOCETOOMANYREFS (SOCBASEERR+59) /* Too many references: can't splice */ +#define SOCETIMEDOUT (SOCBASEERR+60) /* Connection timed out */ +#define SOCECONNREFUSED (SOCBASEERR+61) /* Connection refused */ +#define SOCELOOP (SOCBASEERR+62) /* Too many levels of symbolic links */ +#define SOCENAMETOOLONG (SOCBASEERR+63) /* File name too long */ +#define SOCEHOSTDOWN (SOCBASEERR+64) /* Host is down */ +#define SOCEHOSTUNREACH (SOCBASEERR+65) /* No route to host */ +#define SOCENOTEMPTY (SOCBASEERR+66) /* Directory not empty */ + +/* APR CANONICAL ERROR TESTS */ +#define APR_STATUS_IS_EACCES(s) ((s) == APR_EACCES \ + || (s) == APR_OS_START_SYSERR + ERROR_ACCESS_DENIED \ + || (s) == APR_OS_START_SYSERR + ERROR_SHARING_VIOLATION) +#define APR_STATUS_IS_EEXIST(s) ((s) == APR_EEXIST \ + || (s) == APR_OS_START_SYSERR + ERROR_OPEN_FAILED \ + || (s) == APR_OS_START_SYSERR + ERROR_FILE_EXISTS \ + || (s) == APR_OS_START_SYSERR + ERROR_ALREADY_EXISTS \ + || (s) == APR_OS_START_SYSERR + ERROR_ACCESS_DENIED) +#define 
APR_STATUS_IS_ENAMETOOLONG(s) ((s) == APR_ENAMETOOLONG \ + || (s) == APR_OS_START_SYSERR + ERROR_FILENAME_EXCED_RANGE \ + || (s) == APR_OS_START_SYSERR + SOCENAMETOOLONG) +#define APR_STATUS_IS_ENOENT(s) ((s) == APR_ENOENT \ + || (s) == APR_OS_START_SYSERR + ERROR_FILE_NOT_FOUND \ + || (s) == APR_OS_START_SYSERR + ERROR_PATH_NOT_FOUND \ + || (s) == APR_OS_START_SYSERR + ERROR_NO_MORE_FILES \ + || (s) == APR_OS_START_SYSERR + ERROR_OPEN_FAILED) +#define APR_STATUS_IS_ENOTDIR(s) ((s) == APR_ENOTDIR) +#define APR_STATUS_IS_ENOSPC(s) ((s) == APR_ENOSPC \ + || (s) == APR_OS_START_SYSERR + ERROR_DISK_FULL) +#define APR_STATUS_IS_ENOMEM(s) ((s) == APR_ENOMEM) +#define APR_STATUS_IS_EMFILE(s) ((s) == APR_EMFILE \ + || (s) == APR_OS_START_SYSERR + ERROR_TOO_MANY_OPEN_FILES) +#define APR_STATUS_IS_ENFILE(s) ((s) == APR_ENFILE) +#define APR_STATUS_IS_EBADF(s) ((s) == APR_EBADF \ + || (s) == APR_OS_START_SYSERR + ERROR_INVALID_HANDLE) +#define APR_STATUS_IS_EINVAL(s) ((s) == APR_EINVAL \ + || (s) == APR_OS_START_SYSERR + ERROR_INVALID_PARAMETER \ + || (s) == APR_OS_START_SYSERR + ERROR_INVALID_FUNCTION) +#define APR_STATUS_IS_ESPIPE(s) ((s) == APR_ESPIPE \ + || (s) == APR_OS_START_SYSERR + ERROR_NEGATIVE_SEEK) +#define APR_STATUS_IS_EAGAIN(s) ((s) == APR_EAGAIN \ + || (s) == APR_OS_START_SYSERR + ERROR_NO_DATA \ + || (s) == APR_OS_START_SYSERR + SOCEWOULDBLOCK \ + || (s) == APR_OS_START_SYSERR + ERROR_LOCK_VIOLATION) +#define APR_STATUS_IS_EINTR(s) ((s) == APR_EINTR \ + || (s) == APR_OS_START_SYSERR + SOCEINTR) +#define APR_STATUS_IS_ENOTSOCK(s) ((s) == APR_ENOTSOCK \ + || (s) == APR_OS_START_SYSERR + SOCENOTSOCK) +#define APR_STATUS_IS_ECONNREFUSED(s) ((s) == APR_ECONNREFUSED \ + || (s) == APR_OS_START_SYSERR + SOCECONNREFUSED) +#define APR_STATUS_IS_EINPROGRESS(s) ((s) == APR_EINPROGRESS \ + || (s) == APR_OS_START_SYSERR + SOCEINPROGRESS) +#define APR_STATUS_IS_ECONNABORTED(s) ((s) == APR_ECONNABORTED \ + || (s) == APR_OS_START_SYSERR + SOCECONNABORTED) +#define 
APR_STATUS_IS_ECONNRESET(s) ((s) == APR_ECONNRESET \ + || (s) == APR_OS_START_SYSERR + SOCECONNRESET) +/* XXX deprecated */ +#define APR_STATUS_IS_ETIMEDOUT(s) ((s) == APR_ETIMEDOUT \ + || (s) == APR_OS_START_SYSERR + SOCETIMEDOUT) +#undef APR_STATUS_IS_TIMEUP +#define APR_STATUS_IS_TIMEUP(s) ((s) == APR_TIMEUP \ + || (s) == APR_OS_START_SYSERR + SOCETIMEDOUT) +#define APR_STATUS_IS_EHOSTUNREACH(s) ((s) == APR_EHOSTUNREACH \ + || (s) == APR_OS_START_SYSERR + SOCEHOSTUNREACH) +#define APR_STATUS_IS_ENETUNREACH(s) ((s) == APR_ENETUNREACH \ + || (s) == APR_OS_START_SYSERR + SOCENETUNREACH) +#define APR_STATUS_IS_EFTYPE(s) ((s) == APR_EFTYPE) +#define APR_STATUS_IS_EPIPE(s) ((s) == APR_EPIPE \ + || (s) == APR_OS_START_SYSERR + ERROR_BROKEN_PIPE \ + || (s) == APR_OS_START_SYSERR + SOCEPIPE) +#define APR_STATUS_IS_EXDEV(s) ((s) == APR_EXDEV \ + || (s) == APR_OS_START_SYSERR + ERROR_NOT_SAME_DEVICE) +#define APR_STATUS_IS_ENOTEMPTY(s) ((s) == APR_ENOTEMPTY \ + || (s) == APR_OS_START_SYSERR + ERROR_DIR_NOT_EMPTY \ + || (s) == APR_OS_START_SYSERR + ERROR_ACCESS_DENIED) + +/* + Sorry, too tired to wrap this up for OS2... feel free to + fit the following into their best matches. 
+ + { ERROR_NO_SIGNAL_SENT, ESRCH }, + { SOCEALREADY, EALREADY }, + { SOCEDESTADDRREQ, EDESTADDRREQ }, + { SOCEMSGSIZE, EMSGSIZE }, + { SOCEPROTOTYPE, EPROTOTYPE }, + { SOCENOPROTOOPT, ENOPROTOOPT }, + { SOCEPROTONOSUPPORT, EPROTONOSUPPORT }, + { SOCESOCKTNOSUPPORT, ESOCKTNOSUPPORT }, + { SOCEOPNOTSUPP, EOPNOTSUPP }, + { SOCEPFNOSUPPORT, EPFNOSUPPORT }, + { SOCEAFNOSUPPORT, EAFNOSUPPORT }, + { SOCEADDRINUSE, EADDRINUSE }, + { SOCEADDRNOTAVAIL, EADDRNOTAVAIL }, + { SOCENETDOWN, ENETDOWN }, + { SOCENETRESET, ENETRESET }, + { SOCENOBUFS, ENOBUFS }, + { SOCEISCONN, EISCONN }, + { SOCENOTCONN, ENOTCONN }, + { SOCESHUTDOWN, ESHUTDOWN }, + { SOCETOOMANYREFS, ETOOMANYREFS }, + { SOCELOOP, ELOOP }, + { SOCEHOSTDOWN, EHOSTDOWN }, + { SOCENOTEMPTY, ENOTEMPTY }, + { SOCEPIPE, EPIPE } +*/ + +#elif defined(WIN32) && !defined(DOXYGEN) /* !defined(OS2) */ + +#define APR_FROM_OS_ERROR(e) (e == 0 ? APR_SUCCESS : e + APR_OS_START_SYSERR) +#define APR_TO_OS_ERROR(e) (e == 0 ? APR_SUCCESS : e - APR_OS_START_SYSERR) + +#define apr_get_os_error() (APR_FROM_OS_ERROR(GetLastError())) +#define apr_set_os_error(e) (SetLastError(APR_TO_OS_ERROR(e))) + +/* A special case, only socket calls require this: + */ +#define apr_get_netos_error() (APR_FROM_OS_ERROR(WSAGetLastError())) +#define apr_set_netos_error(e) (WSASetLastError(APR_TO_OS_ERROR(e))) + +/* APR CANONICAL ERROR TESTS */ +#define APR_STATUS_IS_EACCES(s) ((s) == APR_EACCES \ + || (s) == APR_OS_START_SYSERR + ERROR_ACCESS_DENIED \ + || (s) == APR_OS_START_SYSERR + ERROR_CANNOT_MAKE \ + || (s) == APR_OS_START_SYSERR + ERROR_CURRENT_DIRECTORY \ + || (s) == APR_OS_START_SYSERR + ERROR_DRIVE_LOCKED \ + || (s) == APR_OS_START_SYSERR + ERROR_FAIL_I24 \ + || (s) == APR_OS_START_SYSERR + ERROR_LOCK_VIOLATION \ + || (s) == APR_OS_START_SYSERR + ERROR_LOCK_FAILED \ + || (s) == APR_OS_START_SYSERR + ERROR_NOT_LOCKED \ + || (s) == APR_OS_START_SYSERR + ERROR_NETWORK_ACCESS_DENIED \ + || (s) == APR_OS_START_SYSERR + ERROR_SHARING_VIOLATION) +#define 
APR_STATUS_IS_EEXIST(s) ((s) == APR_EEXIST \ + || (s) == APR_OS_START_SYSERR + ERROR_FILE_EXISTS \ + || (s) == APR_OS_START_SYSERR + ERROR_ALREADY_EXISTS) +#define APR_STATUS_IS_ENAMETOOLONG(s) ((s) == APR_ENAMETOOLONG \ + || (s) == APR_OS_START_SYSERR + ERROR_FILENAME_EXCED_RANGE \ + || (s) == APR_OS_START_SYSERR + WSAENAMETOOLONG) +#define APR_STATUS_IS_ENOENT(s) ((s) == APR_ENOENT \ + || (s) == APR_OS_START_SYSERR + ERROR_FILE_NOT_FOUND \ + || (s) == APR_OS_START_SYSERR + ERROR_PATH_NOT_FOUND \ + || (s) == APR_OS_START_SYSERR + ERROR_OPEN_FAILED \ + || (s) == APR_OS_START_SYSERR + ERROR_NO_MORE_FILES) +#define APR_STATUS_IS_ENOTDIR(s) ((s) == APR_ENOTDIR \ + || (s) == APR_OS_START_SYSERR + ERROR_PATH_NOT_FOUND \ + || (s) == APR_OS_START_SYSERR + ERROR_BAD_NETPATH \ + || (s) == APR_OS_START_SYSERR + ERROR_BAD_NET_NAME \ + || (s) == APR_OS_START_SYSERR + ERROR_BAD_PATHNAME \ + || (s) == APR_OS_START_SYSERR + ERROR_INVALID_DRIVE) +#define APR_STATUS_IS_ENOSPC(s) ((s) == APR_ENOSPC \ + || (s) == APR_OS_START_SYSERR + ERROR_DISK_FULL) +#define APR_STATUS_IS_ENOMEM(s) ((s) == APR_ENOMEM \ + || (s) == APR_OS_START_SYSERR + ERROR_ARENA_TRASHED \ + || (s) == APR_OS_START_SYSERR + ERROR_NOT_ENOUGH_MEMORY \ + || (s) == APR_OS_START_SYSERR + ERROR_INVALID_BLOCK \ + || (s) == APR_OS_START_SYSERR + ERROR_NOT_ENOUGH_QUOTA \ + || (s) == APR_OS_START_SYSERR + ERROR_OUTOFMEMORY) +#define APR_STATUS_IS_EMFILE(s) ((s) == APR_EMFILE \ + || (s) == APR_OS_START_SYSERR + ERROR_TOO_MANY_OPEN_FILES) +#define APR_STATUS_IS_ENFILE(s) ((s) == APR_ENFILE) +#define APR_STATUS_IS_EBADF(s) ((s) == APR_EBADF \ + || (s) == APR_OS_START_SYSERR + ERROR_INVALID_HANDLE \ + || (s) == APR_OS_START_SYSERR + ERROR_INVALID_TARGET_HANDLE) +#define APR_STATUS_IS_EINVAL(s) ((s) == APR_EINVAL \ + || (s) == APR_OS_START_SYSERR + ERROR_INVALID_ACCESS \ + || (s) == APR_OS_START_SYSERR + ERROR_INVALID_DATA \ + || (s) == APR_OS_START_SYSERR + ERROR_INVALID_FUNCTION \ + || (s) == APR_OS_START_SYSERR + 
ERROR_INVALID_HANDLE \ + || (s) == APR_OS_START_SYSERR + ERROR_INVALID_PARAMETER \ + || (s) == APR_OS_START_SYSERR + ERROR_NEGATIVE_SEEK) +#define APR_STATUS_IS_ESPIPE(s) ((s) == APR_ESPIPE \ + || (s) == APR_OS_START_SYSERR + ERROR_SEEK_ON_DEVICE \ + || (s) == APR_OS_START_SYSERR + ERROR_NEGATIVE_SEEK) +#define APR_STATUS_IS_EAGAIN(s) ((s) == APR_EAGAIN \ + || (s) == APR_OS_START_SYSERR + ERROR_NO_DATA \ + || (s) == APR_OS_START_SYSERR + ERROR_NO_PROC_SLOTS \ + || (s) == APR_OS_START_SYSERR + ERROR_NESTING_NOT_ALLOWED \ + || (s) == APR_OS_START_SYSERR + ERROR_MAX_THRDS_REACHED \ + || (s) == APR_OS_START_SYSERR + ERROR_LOCK_VIOLATION \ + || (s) == APR_OS_START_SYSERR + WSAEWOULDBLOCK) +#define APR_STATUS_IS_EINTR(s) ((s) == APR_EINTR \ + || (s) == APR_OS_START_SYSERR + WSAEINTR) +#define APR_STATUS_IS_ENOTSOCK(s) ((s) == APR_ENOTSOCK \ + || (s) == APR_OS_START_SYSERR + WSAENOTSOCK) +#define APR_STATUS_IS_ECONNREFUSED(s) ((s) == APR_ECONNREFUSED \ + || (s) == APR_OS_START_SYSERR + WSAECONNREFUSED) +#define APR_STATUS_IS_EINPROGRESS(s) ((s) == APR_EINPROGRESS \ + || (s) == APR_OS_START_SYSERR + WSAEINPROGRESS) +#define APR_STATUS_IS_ECONNABORTED(s) ((s) == APR_ECONNABORTED \ + || (s) == APR_OS_START_SYSERR + WSAECONNABORTED) +#define APR_STATUS_IS_ECONNRESET(s) ((s) == APR_ECONNRESET \ + || (s) == APR_OS_START_SYSERR + ERROR_NETNAME_DELETED \ + || (s) == APR_OS_START_SYSERR + WSAECONNRESET) +/* XXX deprecated */ +#define APR_STATUS_IS_ETIMEDOUT(s) ((s) == APR_ETIMEDOUT \ + || (s) == APR_OS_START_SYSERR + WSAETIMEDOUT \ + || (s) == APR_OS_START_SYSERR + WAIT_TIMEOUT) +#undef APR_STATUS_IS_TIMEUP +#define APR_STATUS_IS_TIMEUP(s) ((s) == APR_TIMEUP \ + || (s) == APR_OS_START_SYSERR + WSAETIMEDOUT \ + || (s) == APR_OS_START_SYSERR + WAIT_TIMEOUT) +#define APR_STATUS_IS_EHOSTUNREACH(s) ((s) == APR_EHOSTUNREACH \ + || (s) == APR_OS_START_SYSERR + WSAEHOSTUNREACH) +#define APR_STATUS_IS_ENETUNREACH(s) ((s) == APR_ENETUNREACH \ + || (s) == APR_OS_START_SYSERR + 
WSAENETUNREACH) +#define APR_STATUS_IS_EFTYPE(s) ((s) == APR_EFTYPE \ + || (s) == APR_OS_START_SYSERR + ERROR_EXE_MACHINE_TYPE_MISMATCH \ + || (s) == APR_OS_START_SYSERR + ERROR_INVALID_DLL \ + || (s) == APR_OS_START_SYSERR + ERROR_INVALID_MODULETYPE \ + || (s) == APR_OS_START_SYSERR + ERROR_BAD_EXE_FORMAT \ + || (s) == APR_OS_START_SYSERR + ERROR_INVALID_EXE_SIGNATURE \ + || (s) == APR_OS_START_SYSERR + ERROR_FILE_CORRUPT \ + || (s) == APR_OS_START_SYSERR + ERROR_BAD_FORMAT) +#define APR_STATUS_IS_EPIPE(s) ((s) == APR_EPIPE \ + || (s) == APR_OS_START_SYSERR + ERROR_BROKEN_PIPE) +#define APR_STATUS_IS_EXDEV(s) ((s) == APR_EXDEV \ + || (s) == APR_OS_START_SYSERR + ERROR_NOT_SAME_DEVICE) +#define APR_STATUS_IS_ENOTEMPTY(s) ((s) == APR_ENOTEMPTY \ + || (s) == APR_OS_START_SYSERR + ERROR_DIR_NOT_EMPTY) + +#elif defined(NETWARE) && defined(USE_WINSOCK) && !defined(DOXYGEN) /* !defined(OS2) && !defined(WIN32) */ + +#define APR_FROM_OS_ERROR(e) (e == 0 ? APR_SUCCESS : e + APR_OS_START_SYSERR) +#define APR_TO_OS_ERROR(e) (e == 0 ? 
APR_SUCCESS : e - APR_OS_START_SYSERR) + +#define apr_get_os_error() (errno) +#define apr_set_os_error(e) (errno = (e)) + +/* A special case, only socket calls require this: */ +#define apr_get_netos_error() (APR_FROM_OS_ERROR(WSAGetLastError())) +#define apr_set_netos_error(e) (WSASetLastError(APR_TO_OS_ERROR(e))) + +/* APR CANONICAL ERROR TESTS */ +#define APR_STATUS_IS_EACCES(s) ((s) == APR_EACCES) +#define APR_STATUS_IS_EEXIST(s) ((s) == APR_EEXIST) +#define APR_STATUS_IS_ENAMETOOLONG(s) ((s) == APR_ENAMETOOLONG) +#define APR_STATUS_IS_ENOENT(s) ((s) == APR_ENOENT) +#define APR_STATUS_IS_ENOTDIR(s) ((s) == APR_ENOTDIR) +#define APR_STATUS_IS_ENOSPC(s) ((s) == APR_ENOSPC) +#define APR_STATUS_IS_ENOMEM(s) ((s) == APR_ENOMEM) +#define APR_STATUS_IS_EMFILE(s) ((s) == APR_EMFILE) +#define APR_STATUS_IS_ENFILE(s) ((s) == APR_ENFILE) +#define APR_STATUS_IS_EBADF(s) ((s) == APR_EBADF) +#define APR_STATUS_IS_EINVAL(s) ((s) == APR_EINVAL) +#define APR_STATUS_IS_ESPIPE(s) ((s) == APR_ESPIPE) + +#define APR_STATUS_IS_EAGAIN(s) ((s) == APR_EAGAIN \ + || (s) == EWOULDBLOCK \ + || (s) == APR_OS_START_SYSERR + WSAEWOULDBLOCK) +#define APR_STATUS_IS_EINTR(s) ((s) == APR_EINTR \ + || (s) == APR_OS_START_SYSERR + WSAEINTR) +#define APR_STATUS_IS_ENOTSOCK(s) ((s) == APR_ENOTSOCK \ + || (s) == APR_OS_START_SYSERR + WSAENOTSOCK) +#define APR_STATUS_IS_ECONNREFUSED(s) ((s) == APR_ECONNREFUSED \ + || (s) == APR_OS_START_SYSERR + WSAECONNREFUSED) +#define APR_STATUS_IS_EINPROGRESS(s) ((s) == APR_EINPROGRESS \ + || (s) == APR_OS_START_SYSERR + WSAEINPROGRESS) +#define APR_STATUS_IS_ECONNABORTED(s) ((s) == APR_ECONNABORTED \ + || (s) == APR_OS_START_SYSERR + WSAECONNABORTED) +#define APR_STATUS_IS_ECONNRESET(s) ((s) == APR_ECONNRESET \ + || (s) == APR_OS_START_SYSERR + WSAECONNRESET) +/* XXX deprecated */ +#define APR_STATUS_IS_ETIMEDOUT(s) ((s) == APR_ETIMEDOUT \ + || (s) == APR_OS_START_SYSERR + WSAETIMEDOUT \ + || (s) == APR_OS_START_SYSERR + WAIT_TIMEOUT) +#undef APR_STATUS_IS_TIMEUP 
+#define APR_STATUS_IS_TIMEUP(s) ((s) == APR_TIMEUP \ + || (s) == APR_OS_START_SYSERR + WSAETIMEDOUT \ + || (s) == APR_OS_START_SYSERR + WAIT_TIMEOUT) +#define APR_STATUS_IS_EHOSTUNREACH(s) ((s) == APR_EHOSTUNREACH \ + || (s) == APR_OS_START_SYSERR + WSAEHOSTUNREACH) +#define APR_STATUS_IS_ENETUNREACH(s) ((s) == APR_ENETUNREACH \ + || (s) == APR_OS_START_SYSERR + WSAENETUNREACH) +#define APR_STATUS_IS_ENETDOWN(s) ((s) == APR_OS_START_SYSERR + WSAENETDOWN) +#define APR_STATUS_IS_EFTYPE(s) ((s) == APR_EFTYPE) +#define APR_STATUS_IS_EPIPE(s) ((s) == APR_EPIPE) +#define APR_STATUS_IS_EXDEV(s) ((s) == APR_EXDEV) +#define APR_STATUS_IS_ENOTEMPTY(s) ((s) == APR_ENOTEMPTY) + +#else /* !defined(NETWARE) && !defined(OS2) && !defined(WIN32) */ + +/* + * os error codes are clib error codes + */ +#define APR_FROM_OS_ERROR(e) (e) +#define APR_TO_OS_ERROR(e) (e) + +#define apr_get_os_error() (errno) +#define apr_set_os_error(e) (errno = (e)) + +/* A special case, only socket calls require this: + */ +#define apr_get_netos_error() (errno) +#define apr_set_netos_error(e) (errno = (e)) + +/** + * @addtogroup APR_STATUS_IS + * @{ + */ + +/** permission denied */ +#define APR_STATUS_IS_EACCES(s) ((s) == APR_EACCES) +/** file exists */ +#define APR_STATUS_IS_EEXIST(s) ((s) == APR_EEXIST) +/** path name is too long */ +#define APR_STATUS_IS_ENAMETOOLONG(s) ((s) == APR_ENAMETOOLONG) +/** + * no such file or directory + * @remark + * EMVSCATLG can be returned by the automounter on z/OS for + * paths which do not exist. 
+ */ +#ifdef EMVSCATLG +#define APR_STATUS_IS_ENOENT(s) ((s) == APR_ENOENT \ + || (s) == EMVSCATLG) +#else +#define APR_STATUS_IS_ENOENT(s) ((s) == APR_ENOENT) +#endif +/** not a directory */ +#define APR_STATUS_IS_ENOTDIR(s) ((s) == APR_ENOTDIR) +/** no space left on device */ +#ifdef EDQUOT +#define APR_STATUS_IS_ENOSPC(s) ((s) == APR_ENOSPC \ + || (s) == EDQUOT) +#else +#define APR_STATUS_IS_ENOSPC(s) ((s) == APR_ENOSPC) +#endif +/** not enough memory */ +#define APR_STATUS_IS_ENOMEM(s) ((s) == APR_ENOMEM) +/** too many open files */ +#define APR_STATUS_IS_EMFILE(s) ((s) == APR_EMFILE) +/** file table overflow */ +#define APR_STATUS_IS_ENFILE(s) ((s) == APR_ENFILE) +/** bad file # */ +#define APR_STATUS_IS_EBADF(s) ((s) == APR_EBADF) +/** invalid argument */ +#define APR_STATUS_IS_EINVAL(s) ((s) == APR_EINVAL) +/** illegal seek */ +#define APR_STATUS_IS_ESPIPE(s) ((s) == APR_ESPIPE) + +/** operation would block */ +#if !defined(EWOULDBLOCK) || !defined(EAGAIN) +#define APR_STATUS_IS_EAGAIN(s) ((s) == APR_EAGAIN) +#elif (EWOULDBLOCK == EAGAIN) +#define APR_STATUS_IS_EAGAIN(s) ((s) == APR_EAGAIN) +#else +#define APR_STATUS_IS_EAGAIN(s) ((s) == APR_EAGAIN \ + || (s) == EWOULDBLOCK) +#endif + +/** interrupted system call */ +#define APR_STATUS_IS_EINTR(s) ((s) == APR_EINTR) +/** socket operation on a non-socket */ +#define APR_STATUS_IS_ENOTSOCK(s) ((s) == APR_ENOTSOCK) +/** Connection Refused */ +#define APR_STATUS_IS_ECONNREFUSED(s) ((s) == APR_ECONNREFUSED) +/** operation now in progress */ +#define APR_STATUS_IS_EINPROGRESS(s) ((s) == APR_EINPROGRESS) + +/** + * Software caused connection abort + * @remark + * EPROTO on certain older kernels really means ECONNABORTED, so we need to + * ignore it for them. See discussion in new-httpd archives nh.9701 & nh.9603 + * + * There is potentially a bug in Solaris 2.x x<6, and other boxes that + * implement tcp sockets in userland (i.e. on top of STREAMS). On these + * systems, EPROTO can actually result in a fatal loop. 
See PR#981 for + * example. It's hard to handle both uses of EPROTO. + */ +#ifdef EPROTO +#define APR_STATUS_IS_ECONNABORTED(s) ((s) == APR_ECONNABORTED \ + || (s) == EPROTO) +#else +#define APR_STATUS_IS_ECONNABORTED(s) ((s) == APR_ECONNABORTED) +#endif + +/** Connection Reset by peer */ +#define APR_STATUS_IS_ECONNRESET(s) ((s) == APR_ECONNRESET) +/** Operation timed out + * @deprecated */ +#define APR_STATUS_IS_ETIMEDOUT(s) ((s) == APR_ETIMEDOUT) +/** no route to host */ +#define APR_STATUS_IS_EHOSTUNREACH(s) ((s) == APR_EHOSTUNREACH) +/** network is unreachable */ +#define APR_STATUS_IS_ENETUNREACH(s) ((s) == APR_ENETUNREACH) +/** inappropiate file type or format */ +#define APR_STATUS_IS_EFTYPE(s) ((s) == APR_EFTYPE) +/** broken pipe */ +#define APR_STATUS_IS_EPIPE(s) ((s) == APR_EPIPE) +/** cross device link */ +#define APR_STATUS_IS_EXDEV(s) ((s) == APR_EXDEV) +/** Directory Not Empty */ +#define APR_STATUS_IS_ENOTEMPTY(s) ((s) == APR_ENOTEMPTY || \ + (s) == APR_EEXIST) +/** @} */ + +#endif /* !defined(NETWARE) && !defined(OS2) && !defined(WIN32) */ + +/** @} */ + +#ifdef __cplusplus +} +#endif + +#endif /* ! APR_ERRNO_H */ +/* + * Copyright (c) 2000 Apple Computer, Inc. All rights reserved. + * + * @APPLE_LICENSE_HEADER_START@ + * + * This file contains Original Code and/or Modifications of Original Code + * as defined in and that are subject to the Apple Public Source License + * Version 2.0 (the 'License'). You may not use this file except in + * compliance with the License. Please obtain a copy of the License at + * http://www.opensource.apple.com/apsl/ and read it before using this + * file. + * + * The Original Code and all software distributed under the License are + * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER + * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, + * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT. 
+ * Please see the License for the specific language governing rights and + * limitations under the License. + * + * @APPLE_LICENSE_HEADER_END@ + */ +#include + + +/* + * Copyright (c) 2000-2002 Apple Computer, Inc. All rights reserved. + * + * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ + * + * This file contains Original Code and/or Modifications of Original Code + * as defined in and that are subject to the Apple Public Source License + * Version 2.0 (the 'License'). You may not use this file except in + * compliance with the License. The rights granted to you under the License + * may not be used to create, or enable the creation or redistribution of, + * unlawful or unlicensed copies of an Apple operating system, or to + * circumvent, violate, or enable the circumvention or violation of, any + * terms of an Apple operating system software license agreement. + * + * Please obtain a copy of the License at + * http://www.opensource.apple.com/apsl/ and read it before using this file. + * + * The Original Code and all software distributed under the License are + * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER + * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, + * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT. + * Please see the License for the specific language governing rights and + * limitations under the License. + * + * @APPLE_OSREFERENCE_LICENSE_HEADER_END@ + */ +/* Copyright (c) 1995 NeXT Computer, Inc. All Rights Reserved */ +/* + * Copyright (c) 1982, 1986, 1989, 1993 + * The Regents of the University of California. All rights reserved. + * (c) UNIX System Laboratories, Inc. + * All or some portions of this file are derived from material licensed + * to the University of California by American Telephone and Telegraph + * Co. or Unix System Laboratories, Inc. 
and are reproduced herein with + * the permission of UNIX System Laboratories, Inc. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. All advertising materials mentioning features or use of this software + * must display the following acknowledgement: + * This product includes software developed by the University of + * California, Berkeley and its contributors. + * 4. Neither the name of the University nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. 
+ * + * @(#)errno.h 8.5 (Berkeley) 1/21/94 + */ + +#ifndef _SYS_ERRNO_H_ +#define _SYS_ERRNO_H_ + +#include +__BEGIN_DECLS +extern int * __error(void); +#define errno (*__error()) +__END_DECLS + +/* + * Error codes + */ + +#define EPERM 1 /* Operation not permitted */ +#define ENOENT 2 /* No such file or directory */ +#define ESRCH 3 /* No such process */ +#define EINTR 4 /* Interrupted system call */ +#define EIO 5 /* Input/output error */ +#define ENXIO 6 /* Device not configured */ +#define E2BIG 7 /* Argument list too long */ +#define ENOEXEC 8 /* Exec format error */ +#define EBADF 9 /* Bad file descriptor */ +#define ECHILD 10 /* No child processes */ +#define EDEADLK 11 /* Resource deadlock avoided */ + /* 11 was EAGAIN */ +#define ENOMEM 12 /* Cannot allocate memory */ +#define EACCES 13 /* Permission denied */ +#define EFAULT 14 /* Bad address */ +#if !defined(_POSIX_C_SOURCE) || defined(_DARWIN_C_SOURCE) +#define ENOTBLK 15 /* Block device required */ +#endif +#define EBUSY 16 /* Device / Resource busy */ +#define EEXIST 17 /* File exists */ +#define EXDEV 18 /* Cross-device link */ +#define ENODEV 19 /* Operation not supported by device */ +#define ENOTDIR 20 /* Not a directory */ +#define EISDIR 21 /* Is a directory */ +#define EINVAL 22 /* Invalid argument */ +#define ENFILE 23 /* Too many open files in system */ +#define EMFILE 24 /* Too many open files */ +#define ENOTTY 25 /* Inappropriate ioctl for device */ +#define ETXTBSY 26 /* Text file busy */ +#define EFBIG 27 /* File too large */ +#define ENOSPC 28 /* No space left on device */ +#define ESPIPE 29 /* Illegal seek */ +#define EROFS 30 /* Read-only file system */ +#define EMLINK 31 /* Too many links */ +#define EPIPE 32 /* Broken pipe */ + +/* math software */ +#define EDOM 33 /* Numerical argument out of domain */ +#define ERANGE 34 /* Result too large */ + +/* non-blocking and interrupt i/o */ +#define EAGAIN 35 /* Resource temporarily unavailable */ +#define EWOULDBLOCK EAGAIN /* Operation 
would block */ +#define EINPROGRESS 36 /* Operation now in progress */ +#define EALREADY 37 /* Operation already in progress */ + +/* ipc/network software -- argument errors */ +#define ENOTSOCK 38 /* Socket operation on non-socket */ +#define EDESTADDRREQ 39 /* Destination address required */ +#define EMSGSIZE 40 /* Message too long */ +#define EPROTOTYPE 41 /* Protocol wrong type for socket */ +#define ENOPROTOOPT 42 /* Protocol not available */ +#define EPROTONOSUPPORT 43 /* Protocol not supported */ +#if !defined(_POSIX_C_SOURCE) || defined(_DARWIN_C_SOURCE) +#define ESOCKTNOSUPPORT 44 /* Socket type not supported */ +#endif /* (!_POSIX_C_SOURCE || _DARWIN_C_SOURCE) */ +#define ENOTSUP 45 /* Operation not supported */ +#if !__DARWIN_UNIX03 && !defined(KERNEL) +/* + * This is the same for binary and source copmpatability, unless compiling + * the kernel itself, or compiling __DARWIN_UNIX03; if compiling for the + * kernel, the correct value will be returned. If compiling non-POSIX + * source, the kernel return value will be converted by a stub in libc, and + * if compiling source with __DARWIN_UNIX03, the conversion in libc is not + * done, and the caller gets the expected (discrete) value. 
+ */ +#define EOPNOTSUPP ENOTSUP /* Operation not supported on socket */ +#endif /* !__DARWIN_UNIX03 && !KERNEL */ + +#if !defined(_POSIX_C_SOURCE) || defined(_DARWIN_C_SOURCE) +#define EPFNOSUPPORT 46 /* Protocol family not supported */ +#endif /* (_POSIX_C_SOURCE && !_DARWIN_C_SOURCE) */ +#define EAFNOSUPPORT 47 /* Address family not supported by protocol family */ +#define EADDRINUSE 48 /* Address already in use */ +#define EADDRNOTAVAIL 49 /* Can't assign requested address */ + +/* ipc/network software -- operational errors */ +#define ENETDOWN 50 /* Network is down */ +#define ENETUNREACH 51 /* Network is unreachable */ +#define ENETRESET 52 /* Network dropped connection on reset */ +#define ECONNABORTED 53 /* Software caused connection abort */ +#define ECONNRESET 54 /* Connection reset by peer */ +#define ENOBUFS 55 /* No buffer space available */ +#define EISCONN 56 /* Socket is already connected */ +#define ENOTCONN 57 /* Socket is not connected */ +#if !defined(_POSIX_C_SOURCE) || defined(_DARWIN_C_SOURCE) +#define ESHUTDOWN 58 /* Can't send after socket shutdown */ +#define ETOOMANYREFS 59 /* Too many references: can't splice */ +#endif /* (_POSIX_C_SOURCE && !_DARWIN_C_SOURCE) */ +#define ETIMEDOUT 60 /* Operation timed out */ +#define ECONNREFUSED 61 /* Connection refused */ + +#define ELOOP 62 /* Too many levels of symbolic links */ +#define ENAMETOOLONG 63 /* File name too long */ + +/* should be rearranged */ +#if !defined(_POSIX_C_SOURCE) || defined(_DARWIN_C_SOURCE) +#define EHOSTDOWN 64 /* Host is down */ +#endif /* (_POSIX_C_SOURCE && !_DARWIN_C_SOURCE) */ +#define EHOSTUNREACH 65 /* No route to host */ +#define ENOTEMPTY 66 /* Directory not empty */ + +/* quotas & mush */ +#if !defined(_POSIX_C_SOURCE) || defined(_DARWIN_C_SOURCE) +#define EPROCLIM 67 /* Too many processes */ +#define EUSERS 68 /* Too many users */ +#endif /* (_POSIX_C_SOURCE && !_DARWIN_C_SOURCE) */ +#define EDQUOT 69 /* Disc quota exceeded */ + +/* Network File System */ 
+#define ESTALE 70 /* Stale NFS file handle */ +#if !defined(_POSIX_C_SOURCE) || defined(_DARWIN_C_SOURCE) +#define EREMOTE 71 /* Too many levels of remote in path */ +#define EBADRPC 72 /* RPC struct is bad */ +#define ERPCMISMATCH 73 /* RPC version wrong */ +#define EPROGUNAVAIL 74 /* RPC prog. not avail */ +#define EPROGMISMATCH 75 /* Program version wrong */ +#define EPROCUNAVAIL 76 /* Bad procedure for program */ +#endif /* (_POSIX_C_SOURCE && !_DARWIN_C_SOURCE) */ + +#define ENOLCK 77 /* No locks available */ +#define ENOSYS 78 /* Function not implemented */ + +#if !defined(_POSIX_C_SOURCE) || defined(_DARWIN_C_SOURCE) +#define EFTYPE 79 /* Inappropriate file type or format */ +#define EAUTH 80 /* Authentication error */ +#define ENEEDAUTH 81 /* Need authenticator */ + +/* Intelligent device errors */ +#define EPWROFF 82 /* Device power is off */ +#define EDEVERR 83 /* Device error, e.g. paper out */ +#endif /* (_POSIX_C_SOURCE && !_DARWIN_C_SOURCE) */ + +#define EOVERFLOW 84 /* Value too large to be stored in data type */ + +/* Program loading errors */ +#if !defined(_POSIX_C_SOURCE) || defined(_DARWIN_C_SOURCE) +#define EBADEXEC 85 /* Bad executable */ +#define EBADARCH 86 /* Bad CPU type in executable */ +#define ESHLIBVERS 87 /* Shared library version mismatch */ +#define EBADMACHO 88 /* Malformed Macho file */ +#endif /* (_POSIX_C_SOURCE && !_DARWIN_C_SOURCE) */ + +#define ECANCELED 89 /* Operation canceled */ + +#define EIDRM 90 /* Identifier removed */ +#define ENOMSG 91 /* No message of desired type */ +#define EILSEQ 92 /* Illegal byte sequence */ +#if !defined(_POSIX_C_SOURCE) || defined(_DARWIN_C_SOURCE) +#define ENOATTR 93 /* Attribute not found */ +#endif /* (_POSIX_C_SOURCE && !_DARWIN_C_SOURCE) */ + +#define EBADMSG 94 /* Bad message */ +#define EMULTIHOP 95 /* Reserved */ +#define ENODATA 96 /* No message available on STREAM */ +#define ENOLINK 97 /* Reserved */ +#define ENOSR 98 /* No STREAM resources */ +#define ENOSTR 99 /* Not a STREAM */ 
+#define EPROTO 100 /* Protocol error */ +#define ETIME 101 /* STREAM ioctl timeout */ + +#if __DARWIN_UNIX03 || defined(KERNEL) +/* This value is only discrete when compiling __DARWIN_UNIX03, or KERNEL */ +#define EOPNOTSUPP 102 /* Operation not supported on socket */ +#endif /* __DARWIN_UNIX03 || KERNEL */ + +#define ENOPOLICY 103 /* No such policy registered */ + +#if !defined(_POSIX_C_SOURCE) || defined(_DARWIN_C_SOURCE) +#define ELAST 103 /* Must be equal largest errno */ +#endif /* (_POSIX_C_SOURCE && !_DARWIN_C_SOURCE) */ + +#endif /* _SYS_ERRNO_H_ */ diff --git a/doc/errno.list.solaris.txt b/doc/errno.list.solaris.txt new file mode 100644 index 000000000..23601e9d3 --- /dev/null +++ b/doc/errno.list.solaris.txt @@ -0,0 +1,206 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License, Version 1.0 only + * (the "License"). You may not use this file except in compliance + * with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright 2000 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. 
+ */ + +/* Copyright (c) 1983, 1984, 1985, 1986, 1987, 1988, 1989 AT&T */ +/* All Rights Reserved */ + +/* + * University Copyright- Copyright (c) 1982, 1986, 1988 + * The Regents of the University of California + * All Rights Reserved + * + * University Acknowledgment- Portions of this document are derived from + * software developed by the University of California, Berkeley, and its + * contributors. + */ + +#ifndef _SYS_ERRNO_H +#define _SYS_ERRNO_H + +#pragma ident "@(#)errno.h 1.22 05/06/08 SMI" + +#ifdef __cplusplus +extern "C" { +#endif + +/* + * Error codes + */ + +#define EPERM 1 /* Not super-user */ +#define ENOENT 2 /* No such file or directory */ +#define ESRCH 3 /* No such process */ +#define EINTR 4 /* interrupted system call */ +#define EIO 5 /* I/O error */ +#define ENXIO 6 /* No such device or address */ +#define E2BIG 7 /* Arg list too long */ +#define ENOEXEC 8 /* Exec format error */ +#define EBADF 9 /* Bad file number */ +#define ECHILD 10 /* No children */ +#define EAGAIN 11 /* Resource temporarily unavailable */ +#define ENOMEM 12 /* Not enough core */ +#define EACCES 13 /* Permission denied */ +#define EFAULT 14 /* Bad address */ +#define ENOTBLK 15 /* Block device required */ +#define EBUSY 16 /* Mount device busy */ +#define EEXIST 17 /* File exists */ +#define EXDEV 18 /* Cross-device link */ +#define ENODEV 19 /* No such device */ +#define ENOTDIR 20 /* Not a directory */ +#define EISDIR 21 /* Is a directory */ +#define EINVAL 22 /* Invalid argument */ +#define ENFILE 23 /* File table overflow */ +#define EMFILE 24 /* Too many open files */ +#define ENOTTY 25 /* Inappropriate ioctl for device */ +#define ETXTBSY 26 /* Text file busy */ +#define EFBIG 27 /* File too large */ +#define ENOSPC 28 /* No space left on device */ +#define ESPIPE 29 /* Illegal seek */ +#define EROFS 30 /* Read only file system */ +#define EMLINK 31 /* Too many links */ +#define EPIPE 32 /* Broken pipe */ +#define EDOM 33 /* Math arg out of domain of func */ 
+#define ERANGE 34 /* Math result not representable */ +#define ENOMSG 35 /* No message of desired type */ +#define EIDRM 36 /* Identifier removed */ +#define ECHRNG 37 /* Channel number out of range */ +#define EL2NSYNC 38 /* Level 2 not synchronized */ +#define EL3HLT 39 /* Level 3 halted */ +#define EL3RST 40 /* Level 3 reset */ +#define ELNRNG 41 /* Link number out of range */ +#define EUNATCH 42 /* Protocol driver not attached */ +#define ENOCSI 43 /* No CSI structure available */ +#define EL2HLT 44 /* Level 2 halted */ +#define EDEADLK 45 /* Deadlock condition. */ +#define ENOLCK 46 /* No record locks available. */ +#define ECANCELED 47 /* Operation canceled */ +#define ENOTSUP 48 /* Operation not supported */ + +/* Filesystem Quotas */ +#define EDQUOT 49 /* Disc quota exceeded */ + +/* Convergent Error Returns */ +#define EBADE 50 /* invalid exchange */ +#define EBADR 51 /* invalid request descriptor */ +#define EXFULL 52 /* exchange full */ +#define ENOANO 53 /* no anode */ +#define EBADRQC 54 /* invalid request code */ +#define EBADSLT 55 /* invalid slot */ +#define EDEADLOCK 56 /* file locking deadlock error */ + +#define EBFONT 57 /* bad font file fmt */ + +/* Interprocess Robust Locks */ +#define EOWNERDEAD 58 /* process died with the lock */ +#define ENOTRECOVERABLE 59 /* lock is not recoverable */ + +/* stream problems */ +#define ENOSTR 60 /* Device not a stream */ +#define ENODATA 61 /* no data (for no delay io) */ +#define ETIME 62 /* timer expired */ +#define ENOSR 63 /* out of streams resources */ + +#define ENONET 64 /* Machine is not on the network */ +#define ENOPKG 65 /* Package not installed */ +#define EREMOTE 66 /* The object is remote */ +#define ENOLINK 67 /* the link has been severed */ +#define EADV 68 /* advertise error */ +#define ESRMNT 69 /* srmount error */ + +#define ECOMM 70 /* Communication error on send */ +#define EPROTO 71 /* Protocol error */ + +/* Interprocess Robust Locks */ +#define ELOCKUNMAPPED 72 /* locked lock was 
unmapped */ + +#define ENOTACTIVE 73 /* Facility is not active */ +#define EMULTIHOP 74 /* multihop attempted */ +#define EBADMSG 77 /* trying to read unreadable message */ +#define ENAMETOOLONG 78 /* path name is too long */ +#define EOVERFLOW 79 /* value too large to be stored in data type */ +#define ENOTUNIQ 80 /* given log. name not unique */ +#define EBADFD 81 /* f.d. invalid for this operation */ +#define EREMCHG 82 /* Remote address changed */ + +/* shared library problems */ +#define ELIBACC 83 /* Can't access a needed shared lib. */ +#define ELIBBAD 84 /* Accessing a corrupted shared lib. */ +#define ELIBSCN 85 /* .lib section in a.out corrupted. */ +#define ELIBMAX 86 /* Attempting to link in too many libs. */ +#define ELIBEXEC 87 /* Attempting to exec a shared library. */ +#define EILSEQ 88 /* Illegal byte sequence. */ +#define ENOSYS 89 /* Unsupported file system operation */ +#define ELOOP 90 /* Symbolic link loop */ +#define ERESTART 91 /* Restartable system call */ +#define ESTRPIPE 92 /* if pipe/FIFO, don't sleep in stream head */ +#define ENOTEMPTY 93 /* directory not empty */ +#define EUSERS 94 /* Too many users (for UFS) */ + +/* BSD Networking Software */ + /* argument errors */ +#define ENOTSOCK 95 /* Socket operation on non-socket */ +#define EDESTADDRREQ 96 /* Destination address required */ +#define EMSGSIZE 97 /* Message too long */ +#define EPROTOTYPE 98 /* Protocol wrong type for socket */ +#define ENOPROTOOPT 99 /* Protocol not available */ +#define EPROTONOSUPPORT 120 /* Protocol not supported */ +#define ESOCKTNOSUPPORT 121 /* Socket type not supported */ +#define EOPNOTSUPP 122 /* Operation not supported on socket */ +#define EPFNOSUPPORT 123 /* Protocol family not supported */ +#define EAFNOSUPPORT 124 /* Address family not supported by */ + /* protocol family */ +#define EADDRINUSE 125 /* Address already in use */ +#define EADDRNOTAVAIL 126 /* Can't assign requested address */ + /* operational errors */ +#define ENETDOWN 127 /* 
Network is down */ +#define ENETUNREACH 128 /* Network is unreachable */ +#define ENETRESET 129 /* Network dropped connection because */ + /* of reset */ +#define ECONNABORTED 130 /* Software caused connection abort */ +#define ECONNRESET 131 /* Connection reset by peer */ +#define ENOBUFS 132 /* No buffer space available */ +#define EISCONN 133 /* Socket is already connected */ +#define ENOTCONN 134 /* Socket is not connected */ +/* XENIX has 135 - 142 */ +#define ESHUTDOWN 143 /* Can't send after socket shutdown */ +#define ETOOMANYREFS 144 /* Too many references: can't splice */ +#define ETIMEDOUT 145 /* Connection timed out */ +#define ECONNREFUSED 146 /* Connection refused */ +#define EHOSTDOWN 147 /* Host is down */ +#define EHOSTUNREACH 148 /* No route to host */ +#define EWOULDBLOCK EAGAIN +#define EALREADY 149 /* operation already in progress */ +#define EINPROGRESS 150 /* operation now in progress */ + +/* SUN Network File System */ +#define ESTALE 151 /* Stale NFS file handle */ + + +#ifdef __cplusplus +} +#endif + +#endif /* _SYS_ERRNO_H */ diff --git a/doc/examples/Makefile.am b/doc/examples/Makefile.am new file mode 100644 index 000000000..b4c93f4c9 --- /dev/null +++ b/doc/examples/Makefile.am @@ -0,0 +1,8 @@ +EXTRA = README unify.vol replicate.vol stripe.vol protocol-client.vol protocol-server.vol posix-locks.vol trash.vol write-behind.vol io-threads.vol io-cache.vol read-ahead.vol filter.vol trace.vol +EXTRA_DIST = $(EXTRA) + +docdir = $(datadir)/doc/$(PACKAGE_NAME) +Examplesdir = $(docdir)/examples +Examples_DATA = $(EXTRA) + +CLEANFILES = diff --git a/doc/examples/README b/doc/examples/README new file mode 100644 index 000000000..4d472ac08 --- /dev/null +++ b/doc/examples/README @@ -0,0 +1,13 @@ +GlusterFS's translator feature is very flexible and there are quite a lot of ways one +can configure their filesystem to behave like. + +Volume Specification is a way in which GlusterFS understands how it has to work, based +on what is written there. 
+ +Going through the following URLs may give you more idea about all these. + +* http://www.gluster.org/docs/index.php/GlusterFS +* http://www.gluster.org/docs/index.php/GlusterFS_Volume_Specification +* http://www.gluster.org/docs/index.php/GlusterFS_Translators + +Mail us any doubts, suggestions on 'gluster-devel(at)nongnu.org' diff --git a/doc/examples/filter.vol b/doc/examples/filter.vol new file mode 100644 index 000000000..ca5c59837 --- /dev/null +++ b/doc/examples/filter.vol @@ -0,0 +1,23 @@ +volume client + type protocol/client + option transport-type tcp # for TCP/IP transport + option remote-host 192.168.1.10 # IP address of the remote brick + option remote-subvolume brick # name of the remote volume +end-volume + +## In normal clustered storage type, any of the cluster translators can come here. +# +# Definition of other clients +# +# Definition of cluster translator (may be unify, afr, or unify over afr) +# + +### 'Filter' translator is used on client side (or server side according to needs). This translator makes all the translators below it (that is, its subvolumes) read-only. Hence if one wants a 'read-only' filesystem, using filter as the top most volume will make it really fast as the fops are returned from this level itself. + +volume filter-ro + type features/filter + option root-squashing enable +# option completely-read-only yes +# translate-uid 1-99=0 + subvolumes client +end-volume \ No newline at end of file diff --git a/doc/examples/io-cache.vol b/doc/examples/io-cache.vol new file mode 100644 index 000000000..5f3eca4c5 --- /dev/null +++ b/doc/examples/io-cache.vol @@ -0,0 +1,25 @@ +volume client + type protocol/client + option transport-type tcp # for TCP/IP transport + option remote-host 192.168.1.10 # IP address of the remote brick + option remote-subvolume brick # name of the remote volume +end-volume + +## In normal clustered storage type, any of the cluster translators can come here.
+# +# Definition of other clients +# +# Definition of cluster translator (may be unify, replicate, or unify over replicate) +# + +### 'IO-Cache' translator is best used on client side when a filesystem has files which are not modified frequently but read several times. For example, while compiling a kernel, *.h files are read while compiling every *.c file; in such cases, io-cache translator comes in very handy, as it keeps the whole file content in the cache, and serves from the cache. +# One can provide the priority of the cache too. + +volume ioc + type performance/io-cache + subvolumes client # In this example it is 'client' you may have to change it according to your spec file. + option page-size 1MB # 128KB is default + option cache-size 64MB # 32MB is default + option force-revalidate-timeout 5 # 1second is default + option priority *.html:2,*:1 # default is *:0 +end-volume diff --git a/doc/examples/io-threads.vol b/doc/examples/io-threads.vol new file mode 100644 index 000000000..9954724e1 --- /dev/null +++ b/doc/examples/io-threads.vol @@ -0,0 +1,21 @@ + +volume brick + type storage/posix # POSIX FS translator + option directory /home/export # Export this directory +end-volume + +### 'IO-threads' translator gives a threading behaviour to File I/O calls. All other normal fops are having default behaviour. Loading this on server side helps to reduce the contention of network. (Which is assumed as a GlusterFS hang). +# One can load it in client side to reduce the latency involved in case of a slow network, when loaded below write-behind.
+volume iot + type performance/io-threads + subvolumes brick + option thread-count 4 # default value is 1 +end-volume + +volume server + type protocol/server + subvolumes iot brick + option transport-type tcp # For TCP/IP transport + option auth.addr.brick.allow 192.168.* # Allow access to "brick" volume + option auth.addr.iot.allow 192.168.* # Allow access to "iot" volume +end-volume diff --git a/doc/examples/posix-locks.vol b/doc/examples/posix-locks.vol new file mode 100644 index 000000000..b9c9e7a64 --- /dev/null +++ b/doc/examples/posix-locks.vol @@ -0,0 +1,20 @@ + +volume brick + type storage/posix # POSIX FS translator + option directory /home/export # Export this directory +end-volume + +### 'Posix-locks' feature should be added on the server side (as posix volume as subvolume) because it contains the actual file. +volume p-locks + type features/posix-locks + subvolumes brick + option mandatory on +end-volume + +volume server + type protocol/server + subvolumes p-locks brick + option transport-type tcp + option auth.addr.brick.allow 192.168.* # Allow access to "brick" volume + option auth.addr.p-locks.allow 192.168.* # Allow access to "p-locks" volume +end-volume diff --git a/doc/examples/protocol-client.vol b/doc/examples/protocol-client.vol new file mode 100644 index 000000000..43108f2c2 --- /dev/null +++ b/doc/examples/protocol-client.vol @@ -0,0 +1,17 @@ +volume client + type protocol/client + option transport-type tcp # for TCP/IP transport +# option transport-type ib-sdp # for Infiniband transport + option remote-host 192.168.1.10 # IP address of the remote brick +# option transport.socket.remote-port 6996 # default server port is 6996 + +# option transport-type ib-verbs # for Infiniband verbs transport +# option transport.ib-verbs.work-request-send-size 1048576 +# option transport.ib-verbs.work-request-send-count 16 +# option transport.ib-verbs.work-request-recv-size 1048576 +# option transport.ib-verbs.work-request-recv-count 16 +# option
transport.ib-verbs.remote-port 6996 # default server port is 6996 + + option remote-subvolume brick # name of the remote volume +# option transport-timeout 30 # default value is 120seconds +end-volume diff --git a/doc/examples/protocol-server.vol b/doc/examples/protocol-server.vol new file mode 100644 index 000000000..88477511f --- /dev/null +++ b/doc/examples/protocol-server.vol @@ -0,0 +1,25 @@ + +### Export volume "brick" with the contents of "/home/export" directory. +volume brick + type storage/posix # POSIX FS translator + option directory /home/export # Export this directory +end-volume + +### Add network serving capability to above brick. +volume server + type protocol/server + option transport-type tcp # For TCP/IP transport +# option transport.socket.listen-port 6996 # Default is 6996 + +# option transport-type ib-verbs # For Infiniband Verbs transport +# option transport.ib-verbs.work-request-send-size 131072 +# option transport.ib-verbs.work-request-send-count 64 +# option transport.ib-verbs.work-request-recv-size 131072 +# option transport.ib-verbs.work-request-recv-count 64 +# option transport.ib-verbs.listen-port 6996 # Default is 6996 + +# option bind-address 192.168.1.10 # Default is to listen on all interfaces +# option client-volume-filename /etc/glusterfs/glusterfs-client.vol + subvolumes brick + option auth.addr.brick.allow 192.168.* # Allow access to "brick" volume +end-volume diff --git a/doc/examples/read-ahead.vol b/doc/examples/read-ahead.vol new file mode 100644 index 000000000..3ce0d95ac --- /dev/null +++ b/doc/examples/read-ahead.vol @@ -0,0 +1,22 @@ +volume client + type protocol/client + option transport-type tcp # for TCP/IP transport + option remote-host 192.168.1.10 # IP address of the remote brick + option remote-subvolume brick # name of the remote volume +end-volume + +## In normal clustered storage type, any of the cluster translators can come here. 
+# +# Definition of other clients +# +# Definition of cluster translator (may be unify, replicate, or unify over replicate) +# + +### 'Read-Ahead' translator is best utilized on client side, as it prefetches the file contents when the first read() call is issued. +volume ra + type performance/read-ahead + subvolumes client # In this example it is 'client' you may have to change it according to your spec file. + option page-size 1MB # default is 256KB + option page-count 4 # default is 2 + option force-atime-update no # default is 'no' +end-volume diff --git a/doc/examples/replicate.vol b/doc/examples/replicate.vol new file mode 100644 index 000000000..8c9541444 --- /dev/null +++ b/doc/examples/replicate.vol @@ -0,0 +1,119 @@ +### 'NOTE' +# This file has both server spec and client spec to get an understanding of replicate's spec file. Hence can't be used as it is, as a GlusterFS spec file. +# One needs to separate out server spec and client spec to get it working. + +#========================================================================= + +# **** server1 spec file **** + +### Export volume "brick" with the contents of "/home/export" directory. +volume posix1 + type storage/posix # POSIX FS translator + option directory /home/export1 # Export this directory +end-volume + +### Add POSIX record locking support to the storage brick +volume brick1 + type features/posix-locks + option mandatory on # enables mandatory locking on all files + subvolumes posix1 +end-volume + +### Add network serving capability to above brick.
+volume server + type protocol/server + option transport-type tcp # For TCP/IP transport + option transport.socket.listen-port 6996 # Default is 6996 +# option client-volume-filename /etc/glusterfs/glusterfs-client.vol + subvolumes brick1 + option auth.addr.brick1.allow * # access to "brick" volume +end-volume + + +#========================================================================= + +# **** server2 spec file **** +volume posix2 + type storage/posix # POSIX FS translator + option directory /home/export2 # Export this directory +end-volume + +### Add POSIX record locking support to the storage brick +volume brick2 + type features/posix-locks + option mandatory on # enables mandatory locking on all files + subvolumes posix2 +end-volume + +### Add network serving capability to above brick. +volume server + type protocol/server + option transport-type tcp # For TCP/IP transport + option transport.socket.listen-port 6997 # Default is 6996 + subvolumes brick2 + option auth.addr.brick2.allow * # Allow access to "brick" volume +end-volume + + +#========================================================================= + +# **** server3 spec file **** + +volume posix3 + type storage/posix # POSIX FS translator + option directory /home/export3 # Export this directory +end-volume + +### Add POSIX record locking support to the storage brick +volume brick3 + type features/posix-locks + option mandatory on # enables mandatory locking on all files + subvolumes posix3 +end-volume + +### Add network serving capability to above brick. 
+volume server + type protocol/server + option transport-type tcp # For TCP/IP transport + option transport.socket.listen-port 6998 # Default is 6996 + subvolumes brick3 + option auth.addr.brick3.allow * # access to "brick" volume +end-volume + + +#========================================================================= + +# **** Clustered Client config file **** + +### Add client feature and attach to remote subvolume of server1 +volume client1 + type protocol/client + option transport-type tcp # for TCP/IP transport + option remote-host 127.0.0.1 # IP address of the remote brick + option transport.socket.remote-port 6996 # default server port is 6996 + option remote-subvolume brick1 # name of the remote volume +end-volume + +### Add client feature and attach to remote subvolume of server2 +volume client2 + type protocol/client + option transport-type tcp # for TCP/IP transport + option remote-host 127.0.0.1 # IP address of the remote brick + option transport.socket.remote-port 6997 # default server port is 6996 + option remote-subvolume brick2 # name of the remote volume +end-volume + +volume client3 + type protocol/client + option transport-type tcp # for TCP/IP transport + option remote-host 127.0.0.1 # IP address of the remote brick + option transport.socket.remote-port 6998 # default server port is 6996 + option remote-subvolume brick3 # name of the remote volume +end-volume + +## Add replicate feature. +volume replicate + type cluster/replicate + subvolumes client1 client2 client3 +end-volume + diff --git a/doc/examples/stripe.vol b/doc/examples/stripe.vol new file mode 100644 index 000000000..ea24cf860 --- /dev/null +++ b/doc/examples/stripe.vol @@ -0,0 +1,121 @@ + +### 'NOTE' +# This file has both server spec and client spec to get an understanding of stripe's spec file. Hence can't be used as it is, as a GlusterFS spec file. +# One need to seperate out server spec and client spec to get it working. 
+ +#========================================================================= + +# **** server1 spec file **** + +### Export volume "brick" with the contents of "/home/export" directory. +volume posix1 + type storage/posix # POSIX FS translator + option directory /home/export1 # Export this directory +end-volume + +### Add POSIX record locking support to the storage brick +volume brick1 + type features/posix-locks + option mandatory on # enables mandatory locking on all files + subvolumes posix1 +end-volume + +### Add network serving capability to above brick. +volume server + type protocol/server + option transport-type tcp # For TCP/IP transport + option transport.socket.listen-port 6996 # Default is 6996 +# option client-volume-filename /etc/glusterfs/glusterfs-client.vol + subvolumes brick1 + option auth.addr.brick1.allow * # access to "brick" volume +end-volume + + +#========================================================================= + +# **** server2 spec file **** +volume posix2 + type storage/posix # POSIX FS translator + option directory /home/export2 # Export this directory +end-volume + +### Add POSIX record locking support to the storage brick +volume brick2 + type features/posix-locks + option mandatory on # enables mandatory locking on all files + subvolumes posix2 +end-volume + +### Add network serving capability to above brick. 
+volume server + type protocol/server + option transport-type tcp # For TCP/IP transport + option transport.socket.listen-port 6997 # Default is 6996 + subvolumes brick2 + option auth.addr.brick2.allow * # Allow access to "brick" volume +end-volume + + +#========================================================================= + +# **** server3 spec file **** + +volume posix3 + type storage/posix # POSIX FS translator + option directory /home/export3 # Export this directory +end-volume + +### Add POSIX record locking support to the storage brick +volume brick3 + type features/posix-locks + option mandatory on # enables mandatory locking on all files + subvolumes posix3 +end-volume + +### Add network serving capability to above brick. +volume server + type protocol/server + option transport-type tcp # For TCP/IP transport + option transport.socket.listen-port 6998 # Default is 6996 + subvolumes brick3 + option auth.addr.brick3.allow * # access to "brick" volume +end-volume + + +#========================================================================= + +# **** Clustered Client config file **** + +### Add client feature and attach to remote subvolume of server1 +volume client1 + type protocol/client + option transport-type tcp # for TCP/IP transport + option remote-host 127.0.0.1 # IP address of the remote brick + option transport.socket.remote-port 6996 # default server port is 6996 + option remote-subvolume brick1 # name of the remote volume +end-volume + +### Add client feature and attach to remote subvolume of server2 +volume client2 + type protocol/client + option transport-type tcp # for TCP/IP transport + option remote-host 127.0.0.1 # IP address of the remote brick + option transport.socket.remote-port 6997 # default server port is 6996 + option remote-subvolume brick2 # name of the remote volume +end-volume + +volume client3 + type protocol/client + option transport-type tcp # for TCP/IP transport + option remote-host 127.0.0.1 # IP address of the remote 
brick + option transport.socket.remote-port 6998 # default server port is 6996 + option remote-subvolume brick3 # name of the remote volume +end-volume + +## Add Stripe Feature. +volume stripe + type cluster/stripe + subvolumes client1 client2 client3 + option block-size 1MB +end-volume + diff --git a/doc/examples/trace.vol b/doc/examples/trace.vol new file mode 100644 index 000000000..3f4864db4 --- /dev/null +++ b/doc/examples/trace.vol @@ -0,0 +1,16 @@ +volume client + type protocol/client + option transport-type tcp # for TCP/IP transport + option remote-host 192.168.1.10 # IP address of the remote brick + option remote-subvolume brick # name of the remote volume +end-volume + +### 'Trace' translator is a very handy debug tool for GlusterFS, as it can be loaded between any of the two volumes without changing the behaviour of the filesystem. +# On client side it can be the top most volume in spec (like now) to understand what calls are made on FUSE filesystem, when a mounted filesystem is accessed. + +volume trace + type debug/trace + subvolumes client +end-volume + +# 'NOTE:' By loading 'debug/trace' translator, filesystem will be very slow as it logs each and every calls to the log file. diff --git a/doc/examples/trash.vol b/doc/examples/trash.vol new file mode 100644 index 000000000..16e71be32 --- /dev/null +++ b/doc/examples/trash.vol @@ -0,0 +1,20 @@ + +volume brick + type storage/posix # POSIX FS translator + option directory /home/export # Export this directory +end-volume + +### 'Trash' translator is best used on server side as it just renames the deleted file inside 'trash-dir', and it makes 4 seperate fops for one unlink call. 
+volume trashcan + type features/trash + subvolumes brick + option trash-dir /.trashcan +end-volume + +volume server + type protocol/server + subvolumes trashcan brick + option transport-type tcp # For TCP/IP transport + option auth.addr.brick.allow 192.168.* # Allow access to "brick" volume + option auth.addr.trashcan.allow 192.168.* # Allow access to "trashcan" volume +end-volume diff --git a/doc/examples/unify.vol b/doc/examples/unify.vol new file mode 100644 index 000000000..4f7415a23 --- /dev/null +++ b/doc/examples/unify.vol @@ -0,0 +1,178 @@ +### 'NOTE' +# This file has both server spec and client spec to get an understanding of unify's spec file. Hence can't be used as it is, as a GlusterFS spec file. +# One needs to separate out server spec and client spec to get it working. + + +#========================================================================= + +# **** server1 spec file **** + +### Export volume "brick" with the contents of "/home/export" directory. +volume posix1 + type storage/posix # POSIX FS translator + option directory /home/export1 # Export this directory +end-volume + +### Add POSIX record locking support to the storage brick +volume brick1 + type features/posix-locks + option mandatory on # enables mandatory locking on all files + subvolumes posix1 +end-volume + +### Add network serving capability to above brick.
+volume server + type protocol/server + option transport-type tcp # For TCP/IP transport + option transport.socket.listen-port 6996 # Default is 6996 +# option client-volume-filename /etc/glusterfs/glusterfs-client.vol + subvolumes brick1 + option auth.addr.brick1.allow * # access to "brick" volume +end-volume + + +#========================================================================= + +# **** server2 spec file **** +volume posix2 + type storage/posix # POSIX FS translator + option directory /home/export2 # Export this directory +end-volume + +### Add POSIX record locking support to the storage brick +volume brick2 + type features/posix-locks + option mandatory on # enables mandatory locking on all files + subvolumes posix2 +end-volume + +### Add network serving capability to above brick. +volume server + type protocol/server + option transport-type tcp # For TCP/IP transport + option transport.socket.listen-port 6997 # Default is 6996 + subvolumes brick2 + option auth.addr.brick2.allow * # Allow access to "brick" volume +end-volume + + +#========================================================================= + +# **** server3 spec file **** + +volume posix3 + type storage/posix # POSIX FS translator + option directory /home/export3 # Export this directory +end-volume + +### Add POSIX record locking support to the storage brick +volume brick3 + type features/posix-locks + option mandatory on # enables mandatory locking on all files + subvolumes posix3 +end-volume + +### Add network serving capability to above brick. +volume server + type protocol/server + option transport-type tcp # For TCP/IP transport + option transport.socket.listen-port 6998 # Default is 6996 + subvolumes brick3 + option auth.addr.brick3.allow * # access to "brick" volume +end-volume + +#========================================================================= + +# *** server for namespace *** +### Export volume "brick" with the contents of "/home/export" directory. 
+volume brick-ns + type storage/posix # POSIX FS translator + option directory /home/export-ns # Export this directory +end-volume + +volume server + type protocol/server + option transport-type tcp # For TCP/IP transport + option transport.socket.listen-port 6999 # Default is 6996 + subvolumes brick-ns + option auth.addr.brick-ns.allow * # access to "brick" volume +end-volume + + +#========================================================================= + +# **** Clustered Client config file **** + +### Add client feature and attach to remote subvolume of server1 +volume client1 + type protocol/client + option transport-type tcp # for TCP/IP transport +# option transport-type ib-sdp # for Infiniband transport + option remote-host 127.0.0.1 # IP address of the remote brick + option transport.socket.remote-port 6996 # default server port is 6996 + option remote-subvolume brick1 # name of the remote volume +end-volume + +### Add client feature and attach to remote subvolume of server2 +volume client2 + type protocol/client + option transport-type tcp # for TCP/IP transport +# option transport-type ib-sdp # for Infiniband transport + option remote-host 127.0.0.1 # IP address of the remote brick + option transport.socket.remote-port 6997 # default server port is 6996 + option remote-subvolume brick2 # name of the remote volume +end-volume + +volume client3 + type protocol/client + option transport-type tcp # for TCP/IP transport +# option transport-type ib-sdp # for Infiniband transport + option remote-host 127.0.0.1 # IP address of the remote brick + option transport.socket.remote-port 6998 # default server port is 6996 + option remote-subvolume brick3 # name of the remote volume +end-volume + + +volume client-ns + type protocol/client + option transport-type tcp # for TCP/IP transport +# option transport-type ib-sdp # for Infiniband transport + option remote-host 127.0.0.1 # IP address of the remote brick + option transport.socket.remote-port 6999 # default server 
port is 6996 + option remote-subvolume brick-ns # name of the remote volume +end-volume + +### Add unify feature to cluster the servers. Associate an +### appropriate scheduler that matches your I/O demand. +volume bricks + type cluster/unify + option namespace client-ns # this will not be storage child of unify. + subvolumes client1 client2 client3 +### ** ALU Scheduler Option ** + option self-heal background # foreground off # default is foreground + option scheduler alu + option alu.limits.min-free-disk 5% #% + option alu.limits.max-open-files 10000 + option alu.order disk-usage:read-usage:write-usage:open-files-usage:disk-speed-usage + option alu.disk-usage.entry-threshold 2GB + option alu.disk-usage.exit-threshold 128MB + option alu.open-files-usage.entry-threshold 1024 + option alu.open-files-usage.exit-threshold 32 + option alu.read-usage.entry-threshold 20 #% + option alu.read-usage.exit-threshold 4 #% + option alu.write-usage.entry-threshold 20 #% + option alu.write-usage.exit-threshold 4 #% + option alu.disk-speed-usage.entry-threshold 0 # DO NOT SET IT. SPEED IS CONSTANT!!!. + option alu.disk-speed-usage.exit-threshold 0 # DO NOT SET IT. SPEED IS CONSTANT!!!. 
+ option alu.stat-refresh.interval 10sec + option alu.stat-refresh.num-file-create 10 +### ** Random Scheduler ** +# option scheduler random +### ** NUFA Scheduler ** +# option scheduler nufa +# option nufa.local-volume-name posix1 +### ** Round Robin (RR) Scheduler ** +# option scheduler rr +# option rr.limits.min-free-disk 5% #% +end-volume + diff --git a/doc/examples/write-behind.vol b/doc/examples/write-behind.vol new file mode 100644 index 000000000..9c6bae11c --- /dev/null +++ b/doc/examples/write-behind.vol @@ -0,0 +1,26 @@ +volume client + type protocol/client + option transport-type tcp # for TCP/IP transport + option remote-host 192.168.1.10 # IP address of the remote brick + option remote-subvolume brick # name of the remote volume +end-volume + +## In normal clustered storage type, any of the cluster translators can come here. +# +# Definition of other clients +# +# Definition of cluster translator (may be unify, replicate, or unify over replicate) +# + + +### 'Write-behind' translator is a performance booster for write operations. Best used on client side, as its main intention is to reduce the network latency caused for each write operation. + +volume wb + type performance/write-behind + subvolumes client # In this example it is 'client' you may have to change it according to your spec file. + option flush-behind on # default value is 'off' + option window-size 2MB + option aggregate-size 1MB # default value is 0 + option enable_O_SYNC no # default is no + option disable-for-first-nbytes 128KB #default is 1 +end-volume diff --git a/doc/get_put_api_using_xattr.txt b/doc/get_put_api_using_xattr.txt new file mode 100644 index 000000000..58951f5bf --- /dev/null +++ b/doc/get_put_api_using_xattr.txt @@ -0,0 +1,22 @@ +GlusterFS get/put API interface provided through extended attributes: + +API usage: + int put(dirpath/filename, data): setfattr -n glusterfs.file. -v + void *get(dirpath/filename): getfattr -n glusterfs.file.
+ + +internals: +* unify handling setxattr/getxattr + - setxattr + unify's setxattr forwards setxattr call to all the child nodes with XATTR_REPLACE flag, except namespace. setxattr will succeeds only on the child node on which the file already exists. if the setxattr operation fails on all child nodes, it indicates that the file does not already exist on any of the child nodes. unify follows the same rules as it follows for create, but using setxattr call itself with XATTR_CREATE flag. unify sends a setxattr to namespace first, with zero length data. if namespace setxattr succeeds, unify schedules setxattr to one of the child nodes. + + - getxattr + unify's getxattr forwards getxattr call to all the child nodes. wait for completion of operation on all the child nodes, and returns success if getxattr succeeded one child node. + +* posix handling setxattr/getxattr + - setxattr + posix setxattr does a open with O_CREAT|O_TRUNC on the /, writes value of the setxattr as data into the file and closes the file. when data is null, posix setxattr avoids doing write. file is closed after write. + + - getxattr + posix getxattr does open with O_RDONLY on the /, reads the complete content of the file. file is closed after read. + diff --git a/doc/glusterfs.8 b/doc/glusterfs.8 new file mode 100644 index 000000000..46c596a5b --- /dev/null +++ b/doc/glusterfs.8 @@ -0,0 +1,139 @@ +.\" Copyright (c) 2008 Z RESEARCH, Inc. +.\" This file is part of GlusterFS. +.\" +.\" GlusterFS is free software; you can redistribute it and/or modify +.\" it under the terms of the GNU General Public License as published +.\" by the Free Software Foundation; either version 3 of the License, +.\" or (at your option) any later version. +.\" +.\" GlusterFS is distributed in the hope that it will be useful, but +.\" WITHOUT ANY WARRANTY; without even the implied warranty of +.\" MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +.\" General Public License for more details. 
+.\" +.\" You should have received a copy of the GNU General Public License +.\" along with this program. If not, see +.\" http://www.gnu.org/licenses/. +.\" +.\" :O +.\" +.TH GlusterFS 8 ":O Cluster Filesystem" "07 December 2008" "Z Research Inc." +.SH NAME +GlusterFS \- Clustered Filesystem. +.SH SYNOPSIS +.B glusterfs +.I [options] [mountpoint] +.PP +.SH DESCRIPTION +GlusterFS is a clustered file-system capable of scaling to several peta-bytes. It aggregates various storage bricks over Infiniband RDMA or TCP/IP interconnect into one large parallel network file system. Storage bricks can be made of any commodity hardware such as x86-64 server with SATA-II RAID and Infiniband HBA. +GlusterFS is a fully POSIX compliant filesystem. On client side, it has dependency on FUSE package, on server side, it works seamlessly on different OSes. (Currently supported on GNU/Linux, Mac OSX, FreeBSD, OpenSolaris). +.SH OPTIONS +.PP +Mandatory or optional arguments to long options are also mandatory or optional +for any corresponding short options. +.SS "Basic options" +.PP +.TP + +\fB\-f, \fB\-\-volfile=VOLUME-FILE\fR +File to use as VOLUME-FILE [default:/etc/glusterfs/glusterfs.vol] +.TP +\fB\-l, \fB\-\-log\-file=LOGFILE\fR +File to use for logging [default:/var/log/glusterfs/glusterfs.log] +.TP +\fB\-L, \fB\-\-log\-level=LOGLEVEL\fR +Logging severity. Valid options are DEBUG, WARNING, ERROR, CRITICAL +and NONE [default: WARNING] +.TP +\fB\-s, \fB\-\-volfile\-server=SERVER\fR +Server to get the volume from. This option overrides \fB\-\-volfile\fR option + +.SS "Advanced options" +.PP +.TP + +\fB\-\-debug\fR +Run in debug mode.
This option sets \fB\-\-no\-daemon\fR, \fB\-\-log\-level\fR to DEBUG +and \fB\-\-log\-file\fR to console +.TP +\fB\-N, \fB\-\-no\-daemon\fR +Run in foreground +.TP +\fB\-p, \fB\-\-pid\-file=PIDFILE\fR +File to use as pid file +.TP +\fB\-\-volfile\-id=KEY\fR +KEY of the volume file to be fetched from server +.TP +\fB\-\-volfile\-server\-port=PORT\fR +Port number of volfile server +.TP +\fB\-\-volfile\-server\-transport=TRANSPORT\fR +Transport type to get volume file from server [default: socket] +.TP +\fB\-\-volume\-name=VOLUME\-NAME\fR +Volume name to be used for MOUNT-POINT [default: top most volume in +VOLUME-FILE] +.TP +\fB\-\-xlator\-option=VOLUME\-NAME.OPTION=VALUE\fR +Add/override a translator option for a volume with the specified value + + +.SS "Fuse options" +.PP +.TP + +\fB\-\-attribute\-timeout=SECONDS\fR +Set attribute timeout to SECONDS for inodes in fuse kernel module [default: 1] +.TP +\fB\-\-entry\-timeout=SECONDS\fR +Set entry timeout to SECONDS in fuse kernel module [default: 1] +.TP +\fB\-\-disable\-direct\-io\-mode\fR +Disable direct I/O mode in fuse kernel module +.TP + +.SS "Miscellaneous Options" +.PP +.TP + +\fB\-?, \fB\-\-help\fR +Give this help list +.TP +\fB\-\-usage\fR +Give a short usage message +.TP +\fB\-V, \fB\-\-version\fR +Print program version + +.PP +.SH FILES +/etc/glusterfs/*.vol + +.SH SEE ALSO +.nf +The full documentation for \fBGlusterFS\fR is maintained as a Texinfo manual. +If the \fBinfo\fR and \fBglusterfs\fR are properly installed on your site, the command + \fBinfo glusterfs\fR +should give you access to complete documentation. + +.nf +\fBbison\fR(1) \fBflex\fR(1) \fBfusermount\fR(1) +\fBhttp://www.glusterfs.org/ +\fR +.fi +.SH SUPPORT +.nf +\fBhttp://support.gluster.com/ +\fR +.fi +.SH AUTHORS +.nf +\fBhttp://www.gluster.org/core-team.php +\fR +.fi +.SH COPYRIGHT +.nf +\fBCopyright(c)2006,2007,2008,2009 Z RESEARCH, Inc. 
+\fR +.fi diff --git a/doc/glusterfs.vol.sample b/doc/glusterfs.vol.sample new file mode 100644 index 000000000..e126f66b3 --- /dev/null +++ b/doc/glusterfs.vol.sample @@ -0,0 +1,61 @@ +### file: client-volume.vol.sample + +##################################### +### GlusterFS Client Volume File ## +##################################### + +#### CONFIG FILE RULES: +### "#" is comment character. +### - Config file is case sensitive +### - Options within a volume block can be in any order. +### - Spaces or tabs are used as delimiter within a line. +### - Each option should end within a line. +### - Missing or commented fields will assume default values. +### - Blank/commented lines are allowed. +### - Sub-volumes should already be defined above before referring. + +### Add client feature and attach to remote subvolume +volume client + type protocol/client + option transport-type tcp +# option transport-type unix +# option transport-type ib-sdp + option remote-host 127.0.0.1 # IP address of the remote brick +# option transport.socket.remote-port 6996 # default server port is 6996 + +# option transport-type ib-verbs +# option transport.ib-verbs.remote-port 6996 # default server port is 6996 +# option transport.ib-verbs.work-request-send-size 1048576 +# option transport.ib-verbs.work-request-send-count 16 +# option transport.ib-verbs.work-request-recv-size 1048576 +# option transport.ib-verbs.work-request-recv-count 16 + +# option transport-timeout 30 # seconds to wait for a reply + # from server for each request + option remote-subvolume brick # name of the remote volume +end-volume + +### Add readahead feature +#volume readahead +# type performance/read-ahead +# option page-size 1MB # unit in bytes +# option page-count 2 # cache per file = (page-count x page-size) +# subvolumes client +#end-volume + +### Add IO-Cache feature +#volume iocache +# type performance/io-cache +# option page-size 256KB +# option page-count 2 +# subvolumes readahead +#end-volume + +### Add
writeback feature +#volume writeback +# type performance/write-behind +# option aggregate-size 1MB +# option window-size 2MB +# option flush-behind off +# subvolumes iocache +#end-volume diff --git a/doc/glusterfsd.vol.sample b/doc/glusterfsd.vol.sample new file mode 100644 index 000000000..b6d8a1580 --- /dev/null +++ b/doc/glusterfsd.vol.sample @@ -0,0 +1,47 @@ +### file: server-volume.vol.sample + +##################################### +### GlusterFS Server Volume File ## +##################################### + +#### CONFIG FILE RULES: +### "#" is comment character. +### - Config file is case sensitive +### - Options within a volume block can be in any order. +### - Spaces or tabs are used as delimiters within a line. +### - Multiple values to options will be : delimited. +### - Each option should end within a line. +### - Missing or commented fields will assume default values. +### - Blank/commented lines are allowed. +### - Sub-volumes should already be defined above before referring. + +### Export volume "brick" with the contents of "/home/export" directory. +volume brick + type storage/posix # POSIX FS translator + option directory /home/export # Export this directory +end-volume + +### Add network serving capability to above brick.
+volume server + type protocol/server + option transport-type tcp +# option transport-type unix +# option transport-type ib-sdp +# option transport.socket.bind-address 192.168.1.10 # Default is to listen on all interfaces +# option transport.socket.listen-port 6996 # Default is 6996 + +# option transport-type ib-verbs +# option transport.ib-verbs.bind-address 192.168.1.10 # Default is to listen on all interfaces +# option transport.ib-verbs.listen-port 6996 # Default is 6996 +# option transport.ib-verbs.work-request-send-size 131072 +# option transport.ib-verbs.work-request-send-count 64 +# option transport.ib-verbs.work-request-recv-size 131072 +# option transport.ib-verbs.work-request-recv-count 64 + +# option client-volume-filename /etc/glusterfs/glusterfs-client.vol + subvolumes brick +# NOTE: Access to any volume through protocol/server is denied by +# default. You need to explicitly grant access through # "auth" +# option. + option auth.addr.brick.allow * # Allow access to "brick" volume +end-volume diff --git a/doc/hacker-guide/Makefile.am b/doc/hacker-guide/Makefile.am new file mode 100644 index 000000000..65c92ac23 --- /dev/null +++ b/doc/hacker-guide/Makefile.am @@ -0,0 +1,8 @@ +EXTRA_DIST = replicate.txt bdb.txt posix.txt call-stub.txt write-behind.txt + +#EXTRA_DIST = hacker-guide.tex afr.txt bdb.txt posix.txt call-stub.txt write-behind.txt +#hacker_guidedir = $(docdir) +#hacker_guide_DATA = hacker-guide.pdf + +#hacker-guide.pdf: $(EXTRA_DIST) +# pdflatex $(srcdir)/hacker-guide.tex diff --git a/doc/hacker-guide/adding-fops.txt b/doc/hacker-guide/adding-fops.txt new file mode 100644 index 000000000..293de2637 --- /dev/null +++ b/doc/hacker-guide/adding-fops.txt @@ -0,0 +1,33 @@ + HOW TO ADD A NEW FOP TO GlusterFS + ================================= + +Steps to be followed when adding a new FOP to GlusterFS: + +1. Edit glusterfs.h and add a GF_FOP_* constant. + +2. Edit xlator.[ch] and: + 2a. add the new prototype for fop and callback. + 2b. 
edit xlator_fops structure. + +3. Edit xlator.c and add to fill_defaults. + +4. Edit protocol.h and add struct necessary for the new FOP. + +5. Edit defaults.[ch] and provide default implementation. + +6. Edit call-stub.[ch] and provide stub implementation. + +7. Edit common-utils.c and add to gf_global_variable_init(). + +8. Edit client-protocol and add your FOP. + +9. Edit server-protocol and add your FOP. + +10. Implement your FOP in any translator for which the default implementation + is not sufficient. + +========================================== +Last updated: Mon Oct 27 21:35:49 IST 2008 + +Author: Vikas Gorur +========================================== diff --git a/doc/hacker-guide/bdb.txt b/doc/hacker-guide/bdb.txt new file mode 100644 index 000000000..fd0bd3652 --- /dev/null +++ b/doc/hacker-guide/bdb.txt @@ -0,0 +1,70 @@ + +* How does a file translate to a key/value pair? +------------------------------------------------ + + in bdb a file is identified by key (obtained by taking basename() of the path of +the file) and file contents are stored as value corresponding to the key in database +file (defaults to glusterfs_storage.db under dirname() directory). + +* symlinks, directories +----------------------- + + symlinks and directories are stored as is. + +* db (database) files +--------------------- + + every directory, including root directory, contains a database file called +glusterfs_storage.db. all the regular files contained in the directory are stored +as key/value pair inside the glusterfs_storage.db. + +* internal data cache +--------------------- + + db does not provide a way to find out the size of the value corresponding to a key. +so, bdb makes DB->get() call for key and takes the length of the value returned. +since DB->get() also returns file contents for key, bdb maintains an internal cache and +stores the file contents in the cache. + every directory maintains a separate cache.
+ +* inode number transformation +----------------------------- + + bdb allocates an inode number to each file and directory on its own. bdb maintains a +global counter and increments it after allocating inode number for each file +(regular, symlink or directory). NOTE: bdb does not guarantee persistent inode numbers. + +* checkpoint thread +------------------- + + bdb creates a checkpoint thread at the time of init(). checkpoint thread does a +periodic checkpoint on the DB_ENV. checkpoint is the mechanism, provided by db, to +forcefully commit the logged transactions to the storage. + +NOTES ABOUT FOPS: +----------------- + +lookup() - + 1> do lstat() on the path, if lstat fails, we assume that the file being looked up + is either a regular file or doesn't exist. + 2> lookup in the DB of parent directory for key corresponding to path. if key exists, + return key, with. + NOTE: 'struct stat' stat()ed from DB file is used as a container for 'struct stat' + of the regular file. st_ino, st_size, st_blocks are updated with file's values. + +readv() - + 1> do a lookup in bctx cache. if successful, return the requested data from cache. + 2> if cache missed, do a DB->get() the entire file content and insert to cache. + +writev(): + 1> flush any cached content of this file. + 2> do a DB->put(), with DB_DBT_PARTIAL flag. + NOTE: DB_DBT_PARTIAL is used to do partial update of a value in DB. + +readdir(): + 1> regular readdir() in a loop, and vomit all DB_ENV log files and DB files that + we encounter. + 2> if the readdir() buffer still has space, open a DB cursor and do a sequential + DBC->get() to fill the readdir buffer. + + diff --git a/doc/hacker-guide/call-stub.txt b/doc/hacker-guide/call-stub.txt new file mode 100644 index 000000000..bca1579b2 --- /dev/null +++ b/doc/hacker-guide/call-stub.txt @@ -0,0 +1,1033 @@ +creating a call stub and pausing a call +--------------------------------------- +libglusterfs provides a separate API to pause each fop.
parameters to each API are +@frame - call frame which has to be used to resume the call at call_resume(). +@fn - procedure to call during call_resume(). + NOTE: @fn should exactly take the same type and number of parameters that + the corresponding regular fop takes. +rest will be the regular parameters to corresponding fop. + +NOTE: @frame can never be NULL. fop_*_stub() fails with errno + set to EINVAL, if @frame is NULL. also wherever @loc is applicable, + @loc cannot be NULL. + +refer to individual stub creation API to know about call-stub creation's behaviour with +specific parameters. + +here is the list of stub creation APIs for xlator fops. + +@frame - call frame which has to be used to resume the call at call_resume(). +@fn - procedure to call during call_resume(). +@loc - pointer to location structure. + NOTE: @loc will be copied to a different location, with inode_ref() to + @loc->inode and @loc->parent, if not NULL. also @loc->path will be + copied to a different location. +@need_xattr - flag to specify if xattr should be returned or not. +call_stub_t * +fop_lookup_stub (call_frame_t *frame, + fop_lookup_t fn, + loc_t *loc, + int32_t need_xattr); + +@frame - call frame which has to be used to resume the call at call_resume(). +@fn - procedure to call during call_resume(). +@loc - pointer to location structure. + NOTE: @loc will be copied to a different location, with inode_ref() to + @loc->inode and @loc->parent, if not NULL. also @loc->path will be + copied to a different location. +call_stub_t * +fop_stat_stub (call_frame_t *frame, + fop_stat_t fn, + loc_t *loc); + +@frame - call frame which has to be used to resume the call at call_resume(). +@fn - procedure to call during call_resume(). +@fd - file descriptor parameter to fstat fop. + NOTE: @fd is stored with a fd_ref(). +call_stub_t * +fop_fstat_stub (call_frame_t *frame, + fop_fstat_t fn, + fd_t *fd); + +@frame - call frame which has to be used to resume the call at call_resume().
+@fn - procedure to call during call_resume(). +@loc - pointer to location structure. + NOTE: @loc will be copied to a different location, with inode_ref() to @loc->inode and + @loc->parent, if not NULL. also @loc->path will be copied to a different location. +@mode - mode parameter to chmod. +call_stub_t * +fop_chmod_stub (call_frame_t *frame, + fop_chmod_t fn, + loc_t *loc, + mode_t mode); + +@frame - call frame which has to be used to resume the call at call_resume(). +@fn - procedure to call during call_resume(). +@fd - file descriptor parameter to fchmod fop. + NOTE: @fd is stored with a fd_ref(). +@mode - mode parameter for fchmod fop. +call_stub_t * +fop_fchmod_stub (call_frame_t *frame, + fop_fchmod_t fn, + fd_t *fd, + mode_t mode); + +@frame - call frame which has to be used to resume the call at call_resume(). +@fn - procedure to call during call_resume(). +@loc - pointer to location structure. + NOTE: @loc will be copied to a different location, with inode_ref() to @loc->inode and + @loc->parent, if not NULL. also @loc->path will be copied to a different location. +@uid - uid parameter to chown. +@gid - gid parameter to chown. +call_stub_t * +fop_chown_stub (call_frame_t *frame, + fop_chown_t fn, + loc_t *loc, + uid_t uid, + gid_t gid); + +@frame - call frame which has to be used to resume the call at call_resume(). +@fn - procedure to call during call_resume(). +@fd - file descriptor parameter to fchown fop. + NOTE: @fd is stored with a fd_ref(). +@uid - uid parameter to fchown. +@gid - gid parameter to fchown. +call_stub_t * +fop_fchown_stub (call_frame_t *frame, + fop_fchown_t fn, + fd_t *fd, + uid_t uid, + gid_t gid); + +@frame - call frame which has to be used to resume the call at call_resume(). +@fn - procedure to call during call_resume(). +@loc - pointer to location structure. + NOTE: @loc will be copied to a different location, with inode_ref() to + @loc->inode and @loc->parent, if not NULL.
also @loc->path will be + copied to a different location, if not NULL. +@off - offset parameter to truncate fop. +call_stub_t * +fop_truncate_stub (call_frame_t *frame, + fop_truncate_t fn, + loc_t *loc, + off_t off); + +@frame - call frame which has to be used to resume the call at call_resume(). +@fn - procedure to call during call_resume(). +@fd - file descriptor parameter to ftruncate fop. + NOTE: @fd is stored with a fd_ref(). +@off - offset parameter to ftruncate fop. +call_stub_t * +fop_ftruncate_stub (call_frame_t *frame, + fop_ftruncate_t fn, + fd_t *fd, + off_t off); + +@frame - call frame which has to be used to resume the call at call_resume(). +@fn - procedure to call during call_resume(). +@loc - pointer to location structure. + NOTE: @loc will be copied to a different location, with inode_ref() to + @loc->inode and @loc->parent, if not NULL. also @loc->path will be + copied to a different location. +@tv - tv parameter to utimens fop. +call_stub_t * +fop_utimens_stub (call_frame_t *frame, + fop_utimens_t fn, + loc_t *loc, + struct timespec tv[2]); + +@frame - call frame which has to be used to resume the call at call_resume(). +@fn - procedure to call during call_resume(). +@loc - pointer to location structure. + NOTE: @loc will be copied to a different location, with inode_ref() to + @loc->inode and @loc->parent, if not NULL. also @loc->path will be + copied to a different location. +@mask - mask parameter for access fop. +call_stub_t * +fop_access_stub (call_frame_t *frame, + fop_access_t fn, + loc_t *loc, + int32_t mask); + +@frame - call frame which has to be used to resume the call at call_resume(). +@fn - procedure to call during call_resume(). +@loc - pointer to location structure. + NOTE: @loc will be copied to a different location, with inode_ref() to + @loc->inode and @loc->parent, if not NULL. also @loc->path will be + copied to a different location. +@size - size parameter to readlink fop.
+call_stub_t * +fop_readlink_stub (call_frame_t *frame, + fop_readlink_t fn, + loc_t *loc, + size_t size); + +@frame - call frame which has to be used to resume the call at call_resume(). +@fn - procedure to call during call_resume(). +@loc - pointer to location structure. + NOTE: @loc will be copied to a different location, with inode_ref() to + @loc->inode and @loc->parent, if not NULL. also @loc->path will be + copied to a different location. +@mode - mode parameter to mknod fop. +@rdev - rdev parameter to mknod fop. +call_stub_t * +fop_mknod_stub (call_frame_t *frame, + fop_mknod_t fn, + loc_t *loc, + mode_t mode, + dev_t rdev); + +@frame - call frame which has to be used to resume the call at call_resume(). +@fn - procedure to call during call_resume(). +@loc - pointer to location structure. + NOTE: @loc will be copied to a different location, with inode_ref() to + @loc->inode and @loc->parent, if not NULL. also @loc->path will be + copied to a different location. +@mode - mode parameter to mkdir fop. +call_stub_t * +fop_mkdir_stub (call_frame_t *frame, + fop_mkdir_t fn, + loc_t *loc, + mode_t mode); + +@frame - call frame which has to be used to resume the call at call_resume(). +@fn - procedure to call during call_resume(). +@loc - pointer to location structure. + NOTE: @loc will be copied to a different location, with inode_ref() to + @loc->inode and @loc->parent, if not NULL. also @loc->path will be + copied to a different location. +call_stub_t * +fop_unlink_stub (call_frame_t *frame, + fop_unlink_t fn, + loc_t *loc); + +@frame - call frame which has to be used to resume the call at call_resume(). +@fn - procedure to call during call_resume(). +@loc - pointer to location structure. + NOTE: @loc will be copied to a different location, with inode_ref() to + @loc->inode and @loc->parent, if not NULL. also @loc->path will be + copied to a different location. 
+call_stub_t * +fop_rmdir_stub (call_frame_t *frame, + fop_rmdir_t fn, + loc_t *loc); + +@frame - call frame which has to be used to resume the call at call_resume(). +@fn - procedure to call during call_resume(). +@linkname - linkname parameter to symlink fop. +@loc - pointer to location structure. + NOTE: @loc will be copied to a different location, with inode_ref() to + @loc->inode and @loc->parent, if not NULL. also @loc->path will be + copied to a different location. +call_stub_t * +fop_symlink_stub (call_frame_t *frame, + fop_symlink_t fn, + const char *linkname, + loc_t *loc); + +@frame - call frame which has to be used to resume the call at call_resume(). +@fn - procedure to call during call_resume(). +@oldloc - pointer to location structure. + NOTE: @oldloc will be copied to a different location, with inode_ref() to + @oldloc->inode and @oldloc->parent, if not NULL. also @oldloc->path will + be copied to a different location, if not NULL. +@newloc - pointer to location structure. + NOTE: @newloc will be copied to a different location, with inode_ref() to + @newloc->inode and @newloc->parent, if not NULL. also @newloc->path will + be copied to a different location, if not NULL. +call_stub_t * +fop_rename_stub (call_frame_t *frame, + fop_rename_t fn, + loc_t *oldloc, + loc_t *newloc); + +@frame - call frame which has to be used to resume the call at call_resume(). +@fn - procedure to call during call_resume(). +@loc - pointer to location structure. + NOTE: @loc will be copied to a different location, with inode_ref() to + @loc->inode and @loc->parent, if not NULL. also @loc->path will be + copied to a different location. +@newpath - newpath parameter to link fop. +call_stub_t * +fop_link_stub (call_frame_t *frame, + fop_link_t fn, + loc_t *oldloc, + const char *newpath); + +@frame - call frame which has to be used to resume the call at call_resume(). +@fn - procedure to call during call_resume(). +@loc - pointer to location structure. 
+ NOTE: @loc will be copied to a different location, with inode_ref() to + @loc->inode and @loc->parent, if not NULL. also @loc->path will be + copied to a different location. +@flags - flags parameter to create fop. +@mode - mode parameter to create fop. +@fd - file descriptor parameter to create fop. + NOTE: @fd is stored with a fd_ref(). +call_stub_t * +fop_create_stub (call_frame_t *frame, + fop_create_t fn, + loc_t *loc, + int32_t flags, + mode_t mode, fd_t *fd); + +@frame - call frame which has to be used to resume the call at call_resume(). +@fn - procedure to call during call_resume(). +@flags - flags parameter to open fop. +@loc - pointer to location structure. + NOTE: @loc will be copied to a different location, with inode_ref() to + @loc->inode and @loc->parent, if not NULL. also @loc->path will be + copied to a different location. +call_stub_t * +fop_open_stub (call_frame_t *frame, + fop_open_t fn, + loc_t *loc, + int32_t flags, + fd_t *fd); + +@frame - call frame which has to be used to resume the call at call_resume(). +@fn - procedure to call during call_resume(). +@fd - file descriptor parameter to readv fop. + NOTE: @fd is stored with a fd_ref(). +@size - size parameter to readv fop. +@off - offset parameter to readv fop. +call_stub_t * +fop_readv_stub (call_frame_t *frame, + fop_readv_t fn, + fd_t *fd, + size_t size, + off_t off); + +@frame - call frame which has to be used to resume the call at call_resume(). +@fn - procedure to call during call_resume(). +@fd - file descriptor parameter to writev fop. + NOTE: @fd is stored with a fd_ref(). +@vector - vector parameter to writev fop. + NOTE: @vector is iov_dup()ed while creating stub. and frame->root->req_refs + dictionary is dict_ref()ed. +@count - count parameter to writev fop. +@off - off parameter to writev fop.
+call_stub_t * +fop_writev_stub (call_frame_t *frame, + fop_writev_t fn, + fd_t *fd, + struct iovec *vector, + int32_t count, + off_t off); + +@frame - call frame which has to be used to resume the call at call_resume(). +@fn - procedure to call during call_resume(). +@fd - file descriptor parameter to flush fop. + NOTE: @fd is stored with a fd_ref(). +call_stub_t * +fop_flush_stub (call_frame_t *frame, + fop_flush_t fn, + fd_t *fd); + + +@frame - call frame which has to be used to resume the call at call_resume(). +@fn - procedure to call during call_resume(). +@fd - file descriptor parameter to fsync fop. + NOTE: @fd is stored with a fd_ref(). +@datasync - datasync parameter to fsync fop. +call_stub_t * +fop_fsync_stub (call_frame_t *frame, + fop_fsync_t fn, + fd_t *fd, + int32_t datasync); + +@frame - call frame which has to be used to resume the call at call_resume(). +@fn - procedure to call during call_resume(). +@loc - pointer to location structure. + NOTE: @loc will be copied to a different location, with inode_ref() to @loc->inode and + @loc->parent, if not NULL. also @loc->path will be copied to a different location. +@fd - file descriptor parameter to opendir fop. + NOTE: @fd is stored with a fd_ref(). +call_stub_t * +fop_opendir_stub (call_frame_t *frame, + fop_opendir_t fn, + loc_t *loc, + fd_t *fd); + +@frame - call frame which has to be used to resume the call at call_resume(). +@fn - procedure to call during call_resume(). +@fd - file descriptor parameter to getdents fop. + NOTE: @fd is stored with a fd_ref(). +@size - size parameter to getdents fop. +@off - off parameter to getdents fop. +@flag - flag parameter to getdents fop. +call_stub_t * +fop_getdents_stub (call_frame_t *frame, + fop_getdents_t fn, + fd_t *fd, + size_t size, + off_t off, + int32_t flag); + +@frame - call frame which has to be used to resume the call at call_resume(). +@fn - procedure to call during call_resume(). +@fd - file descriptor parameter to setdents fop.
+ NOTE: @fd is stored with a fd_ref(). +@flags - flags parameter to setdents fop. +@entries - entries parameter to setdents fop. +call_stub_t * +fop_setdents_stub (call_frame_t *frame, + fop_setdents_t fn, + fd_t *fd, + int32_t flags, + dir_entry_t *entries, + int32_t count); + +@frame - call frame which has to be used to resume the call at call_resume(). +@fn - procedure to call during call_resume(). +@fd - file descriptor parameter to fsyncdir fop. + NOTE: @fd is stored with a fd_ref(). +@datasync - datasync parameter to fsyncdir fop. +call_stub_t * +fop_fsyncdir_stub (call_frame_t *frame, + fop_fsyncdir_t fn, + fd_t *fd, + int32_t datasync); + +@frame - call frame which has to be used to resume the call at call_resume(). +@fn - procedure to call during call_resume(). +@loc - pointer to location structure. + NOTE: @loc will be copied to a different location, with inode_ref() to + @loc->inode and @loc->parent, if not NULL. also @loc->path will be + copied to a different location. +call_stub_t * +fop_statfs_stub (call_frame_t *frame, + fop_statfs_t fn, + loc_t *loc); + +@frame - call frame which has to be used to resume the call at call_resume(). +@fn - procedure to call during call_resume(). +@loc - pointer to location structure. + NOTE: @loc will be copied to a different location, with inode_ref() to + @loc->inode and @loc->parent, if not NULL. also @loc->path will be + copied to a different location. +@dict - dict parameter to setxattr fop. + NOTE: stub creation procedure stores @dict pointer with dict_ref() to it. +call_stub_t * +fop_setxattr_stub (call_frame_t *frame, + fop_setxattr_t fn, + loc_t *loc, + dict_t *dict, + int32_t flags); + +@frame - call frame which has to be used to resume the call at call_resume(). +@fn - procedure to call during call_resume(). +@loc - pointer to location structure. + NOTE: @loc will be copied to a different location, with inode_ref() to + @loc->inode and @loc->parent, if not NULL.
also @loc->path will be + copied to a different location. +@name - name parameter to getxattr fop. +call_stub_t * +fop_getxattr_stub (call_frame_t *frame, + fop_getxattr_t fn, + loc_t *loc, + const char *name); + +@frame - call frame which has to be used to resume the call at call_resume(). +@fn - procedure to call during call_resume(). +@loc - pointer to location structure. + NOTE: @loc will be copied to a different location, with inode_ref() to + @loc->inode and @loc->parent, if not NULL. also @loc->path will be + copied to a different location. +@name - name parameter to removexattr fop. + NOTE: name string will be copied to a different location while creating stub. +call_stub_t * +fop_removexattr_stub (call_frame_t *frame, + fop_removexattr_t fn, + loc_t *loc, + const char *name); + +@frame - call frame which has to be used to resume the call at call_resume(). +@fn - procedure to call during call_resume(). +@fd - file descriptor parameter to lk fop. + NOTE: @fd is stored with a fd_ref(). +@cmd - command parameter to lk fop. +@lock - lock parameter to lk fop. + NOTE: lock will be copied to a different location while creating stub. +call_stub_t * +fop_lk_stub (call_frame_t *frame, + fop_lk_t fn, + fd_t *fd, + int32_t cmd, + struct flock *lock); + +@frame - call frame which has to be used to resume the call at call_resume(). +@fn - procedure to call during call_resume(). +@fd - fd parameter to gf_lk fop. + NOTE: @fd is fd_ref()ed while creating stub, if not NULL. +@cmd - cmd parameter to gf_lk fop. +@lock - lock parameter to gf_lk fop. + NOTE: @lock is copied to a different memory location while creating + stub. +call_stub_t * +fop_gf_lk_stub (call_frame_t *frame, + fop_gf_lk_t fn, + fd_t *fd, + int32_t cmd, + struct flock *lock); + +@frame - call frame which has to be used to resume the call at call_resume(). +@fn - procedure to call during call_resume(). +@fd - file descriptor parameter to readdir fop. + NOTE: @fd is stored with a fd_ref().
+@size - size parameter to readdir fop. +@off - offset parameter to readdir fop. +call_stub_t * +fop_readdir_stub (call_frame_t *frame, + fop_readdir_t fn, + fd_t *fd, + size_t size, + off_t off); + +@frame - call frame which has to be used to resume the call at call_resume(). +@fn - procedure to call during call_resume(). +@loc - pointer to location structure. + NOTE: @loc will be copied to a different location, with inode_ref() to + @loc->inode and @loc->parent, if not NULL. also @loc->path will be + copied to a different location. +@flags - flags parameter to checksum fop. +call_stub_t * +fop_checksum_stub (call_frame_t *frame, + fop_checksum_t fn, + loc_t *loc, + int32_t flags); + +@frame - call frame which has to be used to resume the call at call_resume(). +@fn - procedure to call during call_resume(). +@op_ret - op_ret parameter to @fn. +@op_errno - op_errno parameter to @fn. +@inode - inode parameter to @fn. + NOTE: @inode pointer is stored with a inode_ref(). +@buf - buf parameter to @fn. + NOTE: @buf is copied to a different memory location, if not NULL. +@dict - dict parameter to @fn. + NOTE: @dict pointer is stored with dict_ref(). +call_stub_t * +fop_lookup_cbk_stub (call_frame_t *frame, + fop_lookup_cbk_t fn, + int32_t op_ret, + int32_t op_errno, + inode_t *inode, + struct stat *buf, + dict_t *dict); +@frame - call frame which has to be used to resume the call at call_resume(). +@fn - procedure to call during call_resume(). +@op_ret - op_ret parameter to @fn. +@op_errno - op_errno parameter to @fn. +@buf - buf parameter to @fn. + NOTE: @buf is copied to a different memory location, if not NULL. +call_stub_t * +fop_stat_cbk_stub (call_frame_t *frame, + fop_stat_cbk_t fn, + int32_t op_ret, + int32_t op_errno, + struct stat *buf); + +@frame - call frame which has to be used to resume the call at call_resume(). +@fn - procedure to call during call_resume(). +@op_ret - op_ret parameter to @fn. +@op_errno - op_errno parameter to @fn. 
+@buf - buf parameter to @fn. + NOTE: @buf is copied to a different memory location, if not NULL. +call_stub_t * +fop_fstat_cbk_stub (call_frame_t *frame, + fop_fstat_cbk_t fn, + int32_t op_ret, + int32_t op_errno, + struct stat *buf); + +@frame - call frame which has to be used to resume the call at call_resume(). +@fn - procedure to call during call_resume(). +@op_ret - op_ret parameter to @fn. +@op_errno - op_errno parameter to @fn. +@buf - buf parameter to @fn. + NOTE: @buf is copied to a different memory location, if not NULL. +call_stub_t * +fop_chmod_cbk_stub (call_frame_t *frame, + fop_chmod_cbk_t fn, + int32_t op_ret, + int32_t op_errno, + struct stat *buf); + +@frame - call frame which has to be used to resume the call at call_resume(). +@fn - procedure to call during call_resume(). +@op_ret - op_ret parameter to @fn. +@op_errno - op_errno parameter to @fn. +@buf - buf parameter to @fn. + NOTE: @buf is copied to a different memory location, if not NULL. +call_stub_t * +fop_fchmod_cbk_stub (call_frame_t *frame, + fop_fchmod_cbk_t fn, + int32_t op_ret, + int32_t op_errno, + struct stat *buf); + +@frame - call frame which has to be used to resume the call at call_resume(). +@fn - procedure to call during call_resume(). +@op_ret - op_ret parameter to @fn. +@op_errno - op_errno parameter to @fn. +@buf - buf parameter to @fn. + NOTE: @buf is copied to a different memory location, if not NULL. +call_stub_t * +fop_chown_cbk_stub (call_frame_t *frame, + fop_chown_cbk_t fn, + int32_t op_ret, + int32_t op_errno, + struct stat *buf); + + +@frame - call frame which has to be used to resume the call at call_resume(). +@fn - procedure to call during call_resume(). +@op_ret - op_ret parameter to @fn. +@op_errno - op_errno parameter to @fn. +@buf - buf parameter to @fn. + NOTE: @buf is copied to a different memory location, if not NULL. 
+call_stub_t * +fop_fchown_cbk_stub (call_frame_t *frame, + fop_fchown_cbk_t fn, + int32_t op_ret, + int32_t op_errno, + struct stat *buf); + + +@frame - call frame which has to be used to resume the call at call_resume(). +@fn - procedure to call during call_resume(). +@op_ret - op_ret parameter to @fn. +@op_errno - op_errno parameter to @fn. +@buf - buf parameter to @fn. + NOTE: @buf is copied to a different memory location, if not NULL. +call_stub_t * +fop_truncate_cbk_stub (call_frame_t *frame, + fop_truncate_cbk_t fn, + int32_t op_ret, + int32_t op_errno, + struct stat *buf); + + +@frame - call frame which has to be used to resume the call at call_resume(). +@fn - procedure to call during call_resume(). +@op_ret - op_ret parameter to @fn. +@op_errno - op_errno parameter to @fn. +@buf - buf parameter to @fn. + NOTE: @buf is copied to a different memory location, if not NULL. +call_stub_t * +fop_ftruncate_cbk_stub (call_frame_t *frame, + fop_ftruncate_cbk_t fn, + int32_t op_ret, + int32_t op_errno, + struct stat *buf); + + +@frame - call frame which has to be used to resume the call at call_resume(). +@fn - procedure to call during call_resume(). +@op_ret - op_ret parameter to @fn. +@op_errno - op_errno parameter to @fn. +@buf - buf parameter to @fn. + NOTE: @buf is copied to a different memory location, if not NULL. +call_stub_t * +fop_utimens_cbk_stub (call_frame_t *frame, + fop_utimens_cbk_t fn, + int32_t op_ret, + int32_t op_errno, + struct stat *buf); + + +@frame - call frame which has to be used to resume the call at call_resume(). +@fn - procedure to call during call_resume(). +@op_ret - op_ret parameter to @fn. +@op_errno - op_errno parameter to @fn. +call_stub_t * +fop_access_cbk_stub (call_frame_t *frame, + fop_access_cbk_t fn, + int32_t op_ret, + int32_t op_errno); + + +@frame - call frame which has to be used to resume the call at call_resume(). +@fn - procedure to call during call_resume(). +@op_ret - op_ret parameter to @fn. 
+@op_errno - op_errno parameter to @fn. +@path - path parameter to @fn. + NOTE: @path is copied to a different memory location, if not NULL. +call_stub_t * +fop_readlink_cbk_stub (call_frame_t *frame, + fop_readlink_cbk_t fn, + int32_t op_ret, + int32_t op_errno, + const char *path); + + +@frame - call frame which has to be used to resume the call at call_resume(). +@fn - procedure to call during call_resume(). +@op_ret - op_ret parameter to @fn. +@op_errno - op_errno parameter to @fn. +@inode - inode parameter to @fn. + NOTE: @inode pointer is stored with a inode_ref(). +@buf - buf parameter to @fn. + NOTE: @buf is copied to a different memory location, if not NULL. +call_stub_t * +fop_mknod_cbk_stub (call_frame_t *frame, + fop_mknod_cbk_t fn, + int32_t op_ret, + int32_t op_errno, + inode_t *inode, + struct stat *buf); + + +@frame - call frame which has to be used to resume the call at call_resume(). +@fn - procedure to call during call_resume(). +@op_ret - op_ret parameter to @fn. +@op_errno - op_errno parameter to @fn. +@inode - inode parameter to @fn. + NOTE: @inode pointer is stored with a inode_ref(). +@buf - buf parameter to @fn. + NOTE: @buf is copied to a different memory location, if not NULL. +call_stub_t * +fop_mkdir_cbk_stub (call_frame_t *frame, + fop_mkdir_cbk_t fn, + int32_t op_ret, + int32_t op_errno, + inode_t *inode, + struct stat *buf); + + +@frame - call frame which has to be used to resume the call at call_resume(). +@fn - procedure to call during call_resume(). +@op_ret - op_ret parameter to @fn. +@op_errno - op_errno parameter to @fn. +call_stub_t * +fop_unlink_cbk_stub (call_frame_t *frame, + fop_unlink_cbk_t fn, + int32_t op_ret, + int32_t op_errno); + + +@frame - call frame which has to be used to resume the call at call_resume(). +@fn - procedure to call during call_resume(). +@op_ret - op_ret parameter to @fn. +@op_errno - op_errno parameter to @fn. 
+call_stub_t * +fop_rmdir_cbk_stub (call_frame_t *frame, + fop_rmdir_cbk_t fn, + int32_t op_ret, + int32_t op_errno); + + +@frame - call frame which has to be used to resume the call at call_resume(). +@fn - procedure to call during call_resume(). +@op_ret - op_ret parameter to @fn. +@op_errno - op_errno parameter to @fn. +@inode - inode parameter to @fn. + NOTE: @inode pointer is stored with a inode_ref(). +@buf - buf parameter to @fn. + NOTE: @buf is copied to a different memory location, if not NULL. +call_stub_t * +fop_symlink_cbk_stub (call_frame_t *frame, + fop_symlink_cbk_t fn, + int32_t op_ret, + int32_t op_errno, + inode_t *inode, + struct stat *buf); + + +@frame - call frame which has to be used to resume the call at call_resume(). +@fn - procedure to call during call_resume(). +@op_ret - op_ret parameter to @fn. +@op_errno - op_errno parameter to @fn. +@buf - buf parameter to @fn. + NOTE: @buf is copied to a different memory location, if not NULL. +call_stub_t * +fop_rename_cbk_stub (call_frame_t *frame, + fop_rename_cbk_t fn, + int32_t op_ret, + int32_t op_errno, + struct stat *buf); + + +@frame - call frame which has to be used to resume the call at call_resume(). +@fn - procedure to call during call_resume(). +@op_ret - op_ret parameter to @fn. +@op_errno - op_errno parameter to @fn. +@inode - inode parameter to @fn. + NOTE: @inode pointer is stored with a inode_ref(). +@buf - buf parameter to @fn. + NOTE: @buf is copied to a different memory location, if not NULL. +call_stub_t * +fop_link_cbk_stub (call_frame_t *frame, + fop_link_cbk_t fn, + int32_t op_ret, + int32_t op_errno, + inode_t *inode, + struct stat *buf); + +@frame - call frame which has to be used to resume the call at call_resume(). +@fn - procedure to call during call_resume(). +@op_ret - op_ret parameter to @fn. +@op_errno - op_errno parameter to @fn. +@fd - fd parameter to @fn. + NOTE: @fd pointer is stored with a fd_ref(). +@inode - inode parameter to @fn. 
+ NOTE: @inode pointer is stored with a inode_ref(). +@buf - buf parameter to @fn. + NOTE: @buf is copied to a different memory location, if not NULL. +call_stub_t * +fop_create_cbk_stub (call_frame_t *frame, + fop_create_cbk_t fn, + int32_t op_ret, + int32_t op_errno, + fd_t *fd, + inode_t *inode, + struct stat *buf); + + +@frame - call frame which has to be used to resume the call at call_resume(). +@fn - procedure to call during call_resume(). +@op_ret - op_ret parameter to @fn. +@op_errno - op_errno parameter to @fn. +@fd - fd parameter to @fn. + NOTE: @fd pointer is stored with a fd_ref(). +call_stub_t * +fop_open_cbk_stub (call_frame_t *frame, + fop_open_cbk_t fn, + int32_t op_ret, + int32_t op_errno, + fd_t *fd); + + +@frame - call frame which has to be used to resume the call at call_resume(). +@fn - procedure to call during call_resume(). +@op_ret - op_ret parameter to @fn. +@op_errno - op_errno parameter to @fn. +@vector - vector parameter to @fn. + NOTE: @vector is copied to a different memory location, if not NULL. also + frame->root->rsp_refs is dict_ref()ed. +@stbuf - stbuf parameter to @fn. + NOTE: @stbuf is copied to a different memory location, if not NULL. +call_stub_t * +fop_readv_cbk_stub (call_frame_t *frame, + fop_readv_cbk_t fn, + int32_t op_ret, + int32_t op_errno, + struct iovec *vector, + int32_t count, + struct stat *stbuf); + + +@frame - call frame which has to be used to resume the call at call_resume(). +@fn - procedure to call during call_resume(). +@op_ret - op_ret parameter to @fn. +@op_errno - op_errno parameter to @fn. +@stbuf - stbuf parameter to @fn. + NOTE: @stbuf is copied to a different memory location, if not NULL. +call_stub_t * +fop_writev_cbk_stub (call_frame_t *frame, + fop_writev_cbk_t fn, + int32_t op_ret, + int32_t op_errno, + struct stat *stbuf); + + +@frame - call frame which has to be used to resume the call at call_resume(). +@fn - procedure to call during call_resume(). +@op_ret - op_ret parameter to @fn. 
+@op_errno - op_errno parameter to @fn. +call_stub_t * +fop_flush_cbk_stub (call_frame_t *frame, + fop_flush_cbk_t fn, + int32_t op_ret, + int32_t op_errno); + +@frame - call frame which has to be used to resume the call at call_resume(). +@fn - procedure to call during call_resume(). +@op_ret - op_ret parameter to @fn. +@op_errno - op_errno parameter to @fn. +call_stub_t * +fop_fsync_cbk_stub (call_frame_t *frame, + fop_fsync_cbk_t fn, + int32_t op_ret, + int32_t op_errno); + + +@frame - call frame which has to be used to resume the call at call_resume(). +@fn - procedure to call during call_resume(). +@op_ret - op_ret parameter to @fn. +@op_errno - op_errno parameter to @fn. +@fd - fd parameter to @fn. + NOTE: @fd pointer is stored with a fd_ref(). +call_stub_t * +fop_opendir_cbk_stub (call_frame_t *frame, + fop_opendir_cbk_t fn, + int32_t op_ret, + int32_t op_errno, + fd_t *fd); + + +@frame - call frame which has to be used to resume the call at call_resume(). +@fn - procedure to call during call_resume(). +@op_ret - op_ret parameter to @fn. +@op_errno - op_errno parameter to @fn. +@entries - entries parameter to @fn. +@count - count parameter to @fn. +call_stub_t * +fop_getdents_cbk_stub (call_frame_t *frame, + fop_getdents_cbk_t fn, + int32_t op_ret, + int32_t op_errno, + dir_entry_t *entries, + int32_t count); + + +@frame - call frame which has to be used to resume the call at call_resume(). +@fn - procedure to call during call_resume(). +@op_ret - op_ret parameter to @fn. +@op_errno - op_errno parameter to @fn. +call_stub_t * +fop_setdents_cbk_stub (call_frame_t *frame, + fop_setdents_cbk_t fn, + int32_t op_ret, + int32_t op_errno); + +@frame - call frame which has to be used to resume the call at call_resume(). +@fn - procedure to call during call_resume(). +@op_ret - op_ret parameter to @fn. +@op_errno - op_errno parameter to @fn. 
+call_stub_t * +fop_fsyncdir_cbk_stub (call_frame_t *frame, + fop_fsyncdir_cbk_t fn, + int32_t op_ret, + int32_t op_errno); + + +@frame - call frame which has to be used to resume the call at call_resume(). +@fn - procedure to call during call_resume(). +@op_ret - op_ret parameter to @fn. +@op_errno - op_errno parameter to @fn. +@buf - buf parameter to @fn. + NOTE: @buf is copied to a different memory location, if not NULL. +call_stub_t * +fop_statfs_cbk_stub (call_frame_t *frame, + fop_statfs_cbk_t fn, + int32_t op_ret, + int32_t op_errno, + struct statvfs *buf); + + +@frame - call frame which has to be used to resume the call at call_resume(). +@fn - procedure to call during call_resume(). +@op_ret - op_ret parameter to @fn. +@op_errno - op_errno parameter to @fn. +call_stub_t * +fop_setxattr_cbk_stub (call_frame_t *frame, + fop_setxattr_cbk_t fn, + int32_t op_ret, + int32_t op_errno); + + +@frame - call frame which has to be used to resume the call at call_resume(). +@fn - procedure to call during call_resume(). +@op_ret - op_ret parameter to @fn. +@op_errno - op_errno parameter to @fn. +@value - value dictionary parameter to @fn. + NOTE: @value pointer is stored with a dict_ref(). +call_stub_t * +fop_getxattr_cbk_stub (call_frame_t *frame, + fop_getxattr_cbk_t fn, + int32_t op_ret, + int32_t op_errno, + dict_t *value); + + +@frame - call frame which has to be used to resume the call at call_resume(). +@fn - procedure to call during call_resume(). +@op_ret - op_ret parameter to @fn. +@op_errno - op_errno parameter to @fn. +call_stub_t * +fop_removexattr_cbk_stub (call_frame_t *frame, + fop_removexattr_cbk_t fn, + int32_t op_ret, + int32_t op_errno); + + +@frame - call frame which has to be used to resume the call at call_resume(). +@fn - procedure to call during call_resume(). +@op_ret - op_ret parameter to @fn. +@op_errno - op_errno parameter to @fn. +@lock - lock parameter to @fn. + NOTE: @lock is copied to a different memory location while creating + stub. 
+call_stub_t * +fop_lk_cbk_stub (call_frame_t *frame, + fop_lk_cbk_t fn, + int32_t op_ret, + int32_t op_errno, + struct flock *lock); + +@frame - call frame which has to be used to resume the call at call_resume(). +@fn - procedure to call during call_resume(). +@op_ret - op_ret parameter to @fn. +@op_errno - op_errno parameter to @fn. +@lock - lock parameter to @fn. + NOTE: @lock is copied to a different memory location while creating + stub. +call_stub_t * +fop_gf_lk_cbk_stub (call_frame_t *frame, + fop_gf_lk_cbk_t fn, + int32_t op_ret, + int32_t op_errno, + struct flock *lock); + + +@frame - call frame which has to be used to resume the call at call_resume(). +@fn - procedure to call during call_resume(). +@op_ret - op_ret parameter to @fn. +@op_errno - op_errno parameter to @fn. +@entries - entries parameter to @fn. +call_stub_t * +fop_readdir_cbk_stub (call_frame_t *frame, + fop_readdir_cbk_t fn, + int32_t op_ret, + int32_t op_errno, + gf_dirent_t *entries); + + +@frame - call frame which has to be used to resume the call at call_resume(). +@fn - procedure to call during call_resume(). +@op_ret - op_ret parameter to @fn. +@op_errno - op_errno parameter to @fn. +@file_checksum - file_checksum parameter to @fn. + NOTE: file_checksum will be copied to a different memory location + while creating stub. +@dir_checksum - dir_checksum parameter to @fn. + NOTE: dir_checksum will be copied to a different memory location + while creating stub. +call_stub_t * +fop_checksum_cbk_stub (call_frame_t *frame, + fop_checksum_cbk_t fn, + int32_t op_ret, + int32_t op_errno, + uint8_t *file_checksum, + uint8_t *dir_checksum); + +resuming a call: +--------------- + call can be resumed using call stub through call_resume API. + + void call_resume (call_stub_t *stub); + + stub - call stub created during pausing a call. + + NOTE: call_resume() will decrease reference count of any fd_t, dict_t and inode_t that it finds + in stub->args...
so, if any fd_t, dict_t or + inode_t pointers are assigned at stub->args.. after + fop__stub() call, they must be _ref()ed. + + call_resume does not STACK_DESTROY() for any fop. + + if stub->fn is NULL, call_resume does STACK_WIND() or STACK_UNWIND() using the stub->frame. + + return - call resume fails only if stub is NULL. call resume fails with errno set to EINVAL. diff --git a/doc/hacker-guide/hacker-guide.tex b/doc/hacker-guide/hacker-guide.tex new file mode 100644 index 000000000..72c44df1a --- /dev/null +++ b/doc/hacker-guide/hacker-guide.tex @@ -0,0 +1,312 @@ +\documentclass{book}[12pt] +\usepackage{graphicx} +% \usepackage{fancyhdr} + +% \pagestyle{fancy} +\begin{document} + +% \headheight 117pt +% \rhead{\includegraphics{zr-logo.eps}} + +\author{Z Research} +\title{GlusterFS 1.3 Hacker's Guide} +\date{June 1, 2007} + +\maketitle +\frontmatter +\tableofcontents + +\mainmatter +\chapter{Introduction} + +\section{Coding guidelines} +GlusterFS uses GNU Arch for version control. To get the latest source do: +\begin{verbatim} + $ tla register-archive http://arch.sv.gnu.org/archives/gluster + $ tla -A gluster@sv.gnu.org get glusterfs--mainline--2.4 +\end{verbatim} +\noindent +GlusterFS follows the GNU coding +standards\footnote{http://www.gnu.org/prep/standards\_toc.html} for the +most part. + +\chapter{Major components} +\section{libglusterfs} +\texttt{libglusterfs} contains supporting code used by all the other components. +The important files here are: + +\texttt{dict.c}: This is an implementation of a serializable dictionary type. It is +used by the protocol code to send requests and replies. It is also used to pass options +to translators. + +\texttt{logging.c}: This is a thread-safe logging library. The log messages go to a +file (default \texttt{/usr/local/var/log/glusterfs/*}). + +\texttt{protocol.c}: This file implements the GlusterFS on-the-wire +protocol. 
The protocol itself is a simple ASCII protocol, designed to +be easy to parse and be human readable. + +A sample GlusterFS protocol block looks like this: +\begin{verbatim} + Block Start header + 0000000000000023 callid + 00000001 type + 00000016 op + xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx human-readable name + 00000000000000000000000000000ac3 block size + <...> block + Block End +\end{verbatim} + +\texttt{stack.h}: This file defines the \texttt{STACK\_WIND} and +\texttt{STACK\_UNWIND} macros which are used to implement the parallel +stack that is maintained for inter-xlator calls. See the \textsl{Taking control +of the stack} section below for more details. + +\texttt{spec.y}: This contains the Yacc grammar for the GlusterFS +specification file, and the parsing code. + + +Draw diagrams of trees +Two rules: +(1) directory structure is same +(2) file can exist only on one node + +\section{glusterfs-fuse} +\section{glusterfsd} +\section{transport} +\section{scheduler} +\section{xlator} + +\chapter{xlators} +\section{Taking control of the stack} +One can think of STACK\_WIND/UNWIND as a very specific RPC mechanism. + +% \includegraphics{stack.eps} + +\section{Overview of xlators} + +\flushleft{\LARGE\texttt{cluster/}} +\vskip 2ex +\flushleft{\Large\texttt{afr}} +\vskip 2ex +\flushleft{\Large\texttt{stripe}} +\vskip 2ex +\flushleft{\Large\texttt{unify}} + +\vskip 4ex +\flushleft{\LARGE\texttt{debug/}} +\vskip 2ex +\flushleft{\Large\texttt{trace}} +\vskip 2ex +The trace xlator simply logs all fops and mops, and passes them through to its child. + +\vskip 4ex +\flushleft{\LARGE\texttt{features/}} +\flushleft{\Large\texttt{posix-locks}} +\vskip 2ex +This xlator implements \textsc{posix} record locking semantics over +any kind of storage. 
+ +\vskip 4ex +\flushleft{\LARGE\texttt{performance/}} + +\flushleft{\Large\texttt{io-threads}} +\vskip 2ex +\flushleft{\Large\texttt{read-ahead}} +\vskip 2ex +\flushleft{\Large\texttt{stat-prefetch}} +\vskip 2ex +\flushleft{\Large\texttt{write-behind}} +\vskip 2ex + +\vskip 4ex +\flushleft{\LARGE\texttt{protocol/}} +\vskip 2ex + +\flushleft{\Large\texttt{client}} +\vskip 2ex + +\flushleft{\Large\texttt{server}} +\vskip 2ex + +\vskip 4ex +\flushleft{\LARGE\texttt{storage/}} +\flushleft{\Large\texttt{posix}} +\vskip 2ex +The \texttt{posix} xlator is the one which actually makes calls to the +on-disk filesystem. Currently this is the only storage xlator available. However, +plans to develop other storage xlators, such as one for Amazon's S3 service, are +on the roadmap. + +\chapter{Writing a simple xlator} +\noindent +In this section we're going to write a rot13 xlator. ``Rot13'' is a +simple substitution cipher which obscures a text by replacing each +letter with the letter thirteen places down the alphabet. So `a' (0) +would become `n' (13), `b' would become `o', and so on. Rot13 applied to +a piece of ciphertext yields the plaintext again, because rot13 is its +own inverse, since: + +\[ +x_c = x + 13\; (mod\; 26) +\] +\[ +x_c + 13\; (mod\; 26) = x + 13 + 13\; (mod\; 26) = x +\] + +First we include the requisite headers. + +\begin{verbatim} +#include <ctype.h> +#include <sys/uio.h> + +#include "glusterfs.h" +#include "xlator.h" +#include "logging.h" + +/* + * This is a rot13 ``encryption'' xlator. It rot13's data when + * writing to disk and rot13's it back when reading it. + * This xlator is meant as an example, not for production + * use ;) (hence no error-checking) + */ + +\end{verbatim} + +Then we write the rot13 function itself. For simplicity, we only transform lower case +letters. Any other byte is passed through as it is.
+ +\begin{verbatim} +/* We only handle lower case letters for simplicity */ +static void +rot13 (char *buf, int len) +{ + int i; + for (i = 0; i < len; i++) { + /* bytes other than 'a'..'z' pass through unchanged */ + if (buf[i] < 'a' || buf[i] > 'z') + continue; + buf[i] = 'a' + (buf[i] - 'a' + 13) % 26; + } +} +\end{verbatim} + +Next comes a utility function whose purpose will be clear after looking at the code +below. + +\begin{verbatim} +static void +rot13_iovec (struct iovec *vector, int count) +{ + int i; + for (i = 0; i < count; i++) { + rot13 (vector[i].iov_base, vector[i].iov_len); + } +} +\end{verbatim} + +\begin{verbatim} +static int32_t +rot13_readv_cbk (call_frame_t *frame, + call_frame_t *prev_frame, + xlator_t *this, + int32_t op_ret, + int32_t op_errno, + struct iovec *vector, + int32_t count) +{ + rot13_iovec (vector, count); + + STACK_UNWIND (frame, op_ret, op_errno, vector, count); + return 0; +} + +static int32_t +rot13_readv (call_frame_t *frame, + xlator_t *this, + dict_t *ctx, + size_t size, + off_t offset) +{ + STACK_WIND (frame, + rot13_readv_cbk, + FIRST_CHILD (this), + FIRST_CHILD (this)->fops->readv, + ctx, size, offset); + return 0; +} + +static int32_t +rot13_writev_cbk (call_frame_t *frame, + call_frame_t *prev_frame, + xlator_t *this, + int32_t op_ret, + int32_t op_errno) +{ + STACK_UNWIND (frame, op_ret, op_errno); + return 0; +} + +static int32_t +rot13_writev (call_frame_t *frame, + xlator_t *this, + dict_t *ctx, + struct iovec *vector, + int32_t count, + off_t offset) +{ + rot13_iovec (vector, count); + + STACK_WIND (frame, + rot13_writev_cbk, + FIRST_CHILD (this), + FIRST_CHILD (this)->fops->writev, + ctx, vector, count, offset); + return 0; +} + +\end{verbatim} + +Every xlator must define two functions and two external symbols. The functions are +\texttt{init} and \texttt{fini}, and the symbols are \texttt{fops} and \texttt{mops}.
+The \texttt{init} function is called when the xlator is loaded by GlusterFS, and +contains code for the xlator to initialize itself. Note that if an xlator is present +multiple times in the spec tree, the \texttt{init} function will be called each time +the xlator is loaded. + +\begin{verbatim} +int32_t +init (xlator_t *this) +{ + if (!this->children) { + gf_log ("rot13", GF_LOG_ERROR, + "FATAL: rot13 should have exactly one child"); + return -1; + } + + gf_log ("rot13", GF_LOG_DEBUG, "rot13 xlator loaded"); + return 0; +} +\end{verbatim} + +\begin{verbatim} + +void +fini (xlator_t *this) +{ + return; +} + +struct xlator_fops fops = { + .readv = rot13_readv, + .writev = rot13_writev +}; + +struct xlator_mops mops = { +}; + +\end{verbatim} + +\end{document} + diff --git a/doc/hacker-guide/posix.txt b/doc/hacker-guide/posix.txt new file mode 100644 index 000000000..d0132abfe --- /dev/null +++ b/doc/hacker-guide/posix.txt @@ -0,0 +1,59 @@ +--------------- +* storage/posix +--------------- + +- SET_FS_ID + + This is so that all filesystem checks are done with the user's + uid/gid and not GlusterFS's uid/gid. + +- MAKE_REAL_PATH + + This macro concatenates the base directory of the posix volume + ('option directory') with the given path. + +- need_xattr in lookup + + If this flag is passed, lookup returns a xattr dictionary that contains + the file's create time, the file's contents, and the version number + of the file. + + This is a hack to increase small file performance. If an application + wants to read a small file, it can finish its job with just a lookup + call instead of a lookup followed by read. + +- getdents/setdents + + These are used by unify to set and get directory entries. + +- ALIGN_BUF + + Macro to align an address to a page boundary (4K). + +- priv->export_statfs + + In some cases, two exported volumes may reside on the same + partition on the server. 
Sending statvfs info for both + the volumes will lead to erroneous df output at the client, + since free space on the partition will be counted twice. + + In such cases, user can disable exporting statvfs info + on one of the volumes by setting this option. + +- xattrop + + This fop is used by replicate to set version numbers on files. + +- getxattr/setxattr hack to read/write files + + A key, GLUSTERFS_FILE_CONTENT_STRING, is handled in a special way by + getxattr/setxattr. A getxattr with the key will return the entire + content of the file as the value. A setxattr with the key will write + the value as the entire content of the file. + +- posix_checksum + + This calculates a simple XOR checksum on all entry names in a + directory that is used by unify to compare directory contents. + + diff --git a/doc/hacker-guide/replicate.txt b/doc/hacker-guide/replicate.txt new file mode 100644 index 000000000..284f373fb --- /dev/null +++ b/doc/hacker-guide/replicate.txt @@ -0,0 +1,206 @@ +--------------- +* cluster/replicate +--------------- + +Before understanding replicate, one must understand two internal FOPs: + +GF_FILE_LK: + This is exactly like fcntl(2) locking, except the locks are in a + separate domain from locks held by applications. + +GF_DIR_LK (loc_t *loc, char *basename): + This allows one to lock a name under a directory. For example, + to lock /mnt/glusterfs/foo, one would use the call: + + GF_DIR_LK ({loc_t for "/mnt/glusterfs"}, "foo") + + If one wishes to lock *all* the names under a particular directory, + supply the basename argument as NULL. + + The locks can either be read locks or write locks; consult the + function prototype for more details. + +Both these operations are implemented by the features/locks (earlier +known as posix-locks) translator. + +-------------- +* Basic design +-------------- + +All FOPs can be classified into four major groups: + + - inode-read + Operations that read an inode's data (file contents) or metadata (perms, etc.). 
+ + access, getxattr, fstat, readlink, readv, stat. + + - inode-write + Operations that modify an inode's data or metadata. + + chmod, chown, truncate, writev, utimens. + + - dir-read + Operations that read a directory's contents or metadata. + + readdir, getdents, checksum. + + - dir-write + Operations that modify a directory's contents or metadata. + + create, link, mkdir, mknod, rename, rmdir, symlink, unlink. + + Some of these make a subgroup in that they modify *two* different entries: + link, rename, symlink. + + - Others + Other operations. + + flush, lookup, open, opendir, statfs. + +------------ +* Algorithms +------------ + +Each of the four major groups has its own algorithm: + + ---------------------- + - inode-read, dir-read + ---------------------- + + = Send a request to the first child that is up: + - if it fails: + try the next available child + - if we have exhausted all children: + return failure + + ------------- + - inode-write + ------------- + + All operations are done in parallel unless specified otherwise. + + (1) Send a GF_FILE_LK request on all children for a write lock on + the appropriate region + (for metadata operations: entire file (0, 0) + for writev: (offset, offset+size of buffer)) + + - If a lock request fails on a child: + unlock all children + try to acquire a blocking lock (F_SETLKW) on each child, serially. + + If this fails (due to ENOTCONN or EINVAL): + Consider this child as dead for rest of transaction. + + (2) Mark all children as "pending" on all (alive) children + (see below for meaning of "pending"). + + - If it fails on any child: + mark it as dead (in transaction local state). + + (3) Perform operation on all (alive) children. + + - If it fails on any child: + mark it as dead (in transaction local state). + + (4) Unmark all successful children as not "pending" on all nodes. + + (5) Unlock region on all (alive) children. 
+ + ----------- + - dir-write + ----------- + + The algorithm for dir-write is the same as above except instead of holding + GF_FILE_LK locks we hold a GF_DIR_LK lock on the name being operated upon. + In case of link-type calls, we hold locks on both the operand names. + +----------- +* "pending" +----------- + + The "pending" number is like a journal entry. A pending entry is an + array of 32-bit integers stored in network byte-order as the extended + attribute of an inode (which can be a directory as well). + + There are three keys corresponding to three types of pending operations: + + - AFR_METADATA_PENDING + There are some metadata operations pending on this inode (perms, ctime/mtime, + xattr, etc.). + + - AFR_DATA_PENDING + There is some data pending on this inode (writev). + + - AFR_ENTRY_PENDING + There are some directory operations pending on this directory + (create, unlink, etc.). + +----------- +* Self heal +----------- + + - On lookup, gather extended attribute data: + - If entry is a regular file: + - If an entry is present on one child and not on others: + - create entry on others. + - If entries exist but have different metadata (perms, etc.): + - consider the entry with the highest AFR_METADATA_PENDING number as + definitive and replicate its attributes on children. + + - If entry is a directory: + - Consider the entry with the highest AFR_ENTRY_PENDING number as + definitive and replicate its contents on all children. + + - If any two entries have non-matching types (i.e., one is file and + other is directory): + - Announce to the user via log that a split-brain situation has been + detected, and do nothing. + + - On open, gather extended attribute data: + - Consider the file with the highest AFR_DATA_PENDING number as + the definitive one and replicate its contents on all other + children. + + During all self heal operations, appropriate locks must be held on all + regions/entries being affected.
+ +--------------- +* Inode scaling +--------------- + +Inode scaling is necessary because if a situation arises where: + - An inode number is returned for a directory (by lookup) which was + previously the inode number of a file (as per FUSE's table), then + FUSE gets horribly confused (consult a FUSE expert for more details). + +To avoid such a situation, we distribute the 64-bit inode space equally +among all children of replicate. + +To illustrate: + +If c1, c2, c3 are children of replicate, they each get 1/3 of the available +inode space: + +Child: c1 c2 c3 c1 c2 c3 c1 c2 c3 c1 c2 ... +Inode number: 1 2 3 4 5 6 7 8 9 10 11 ... + +Thus, if lookup on c1 returns an inode number "2", it is scaled to "4" +(which is the second inode number in c1's space). + +This way we ensure that there is never a collision of inode numbers from +two different children. + +This reduction of inode space doesn't really reduce the usability of +replicate since even if we assume replicate has 1024 children (which would be a +highly unusual scenario), each child still has a 54-bit inode space. + +2^54 ~ 1.8 * 10^16 + +which is much larger than any real world requirement. + + +============================================== +$ Last updated: Sun Oct 12 23:17:01 IST 2008 $ +$ Author: Vikas Gorur $ +============================================== + diff --git a/doc/hacker-guide/write-behind.txt b/doc/hacker-guide/write-behind.txt new file mode 100644 index 000000000..498e95480 --- /dev/null +++ b/doc/hacker-guide/write-behind.txt @@ -0,0 +1,45 @@ +basic working +-------------- + + write behind is basically a translator to lie to the application that the write-requests are finished, even before it is actually finished. + + on a regular translator tree without write-behind, control flow is like this: + + 1. application makes a write() system call. + 2. VFS ==> FUSE ==> /dev/fuse. + 3. fuse-bridge initiates a glusterfs writev() call. + 4. 
writev() is STACK_WIND()ed up to client-protocol or storage translator. + 5. client-protocol, on receiving reply from server, starts STACK_UNWIND() towards the fuse-bridge. + + on a translator tree with write-behind, control flow is like this: + + 1. application makes a write() system call. + 2. VFS ==> FUSE ==> /dev/fuse. + 3. fuse-bridge initiates a glusterfs writev() call. + 4. writev() is STACK_WIND()ed up to write-behind translator. + 5. write-behind adds the write buffer to its internal queue and does a STACK_UNWIND() towards the fuse-bridge. + + write call is completed in application's perspective. after STACK_UNWIND()ing towards the fuse-bridge, write-behind initiates a fresh writev() call to its child translator, whose replies will be consumed by write-behind itself. write-behind _doesn't_ cache the write buffer, unless 'option flush-behind on' is specified in volume specification file. + +windowing +--------- + + with respect to write-behind, each write-buffer has three flags: 'stack_wound', 'write_behind' and 'got_reply'. + + stack_wound: if set, indicates that write-behind has initiated STACK_WIND() towards child translator. + + write_behind: if set, indicates that write-behind has done STACK_UNWIND() towards fuse-bridge. + + got_reply: if set, indicates that write-behind has received reply from child translator for a writev() STACK_WIND(). a request will be destroyed by write-behind only if this flag is set. + + currently pending write requests = aggregate size of requests with write_behind = 1 and got_reply = 0. + + window size limits the aggregate size of currently pending write requests. once the pending requests' size has reached the window size, write-behind blocks writev() calls from fuse-bridge. + blocking is only from application's perspective. write-behind does STACK_WIND() to child translator straight-away, but holds back the STACK_UNWIND() towards fuse-bridge.
STACK_UNWIND() is done only once write-behind gets enough replies to accommodate the currently blocked request. + +flush behind +------------ + + if 'option flush-behind on' is specified in volume specification file, then write-behind sends aggregate write requests to child translator, instead of regular per request STACK_WIND()s. + + diff --git a/doc/handling-options.txt b/doc/handling-options.txt new file mode 100644 index 000000000..cac1fe939 --- /dev/null +++ b/doc/handling-options.txt @@ -0,0 +1,13 @@ + +How to add a new option to a given volume ? +=========================================== + +* Add an entry in 'struct volume_options options[]' with your key, what is + the type of the 'key', etc. + +* The 'key' and corresponding 'value' given for the same by user are validated + before calling init() of the translator/transport/scheduler/auth-module. + +* Once the complete init() is successful, user will get a warning if he has + given a 'key' which is not defined in these modules. + diff --git a/doc/mac-related-xattrs.txt b/doc/mac-related-xattrs.txt new file mode 100644 index 000000000..805658334 --- /dev/null +++ b/doc/mac-related-xattrs.txt @@ -0,0 +1,21 @@ + +This document is intended to briefly explain how the Extended Attributes on +Darwin 10.5.x releases work +---- + +On Darwin other than all the normal filesystem operations, 'Finder' (like +Explorer in Windows but a little more) keeps its information in two extended +attributes named 'com.apple.FinderInfo' and 'com.apple.ResourceFork'. If these +xattrs are not implemented the filesystem won't be shown on Finder, and if they +are not implemented properly there may be issues when some of the file operations +are done through GUI of Finder. But when a filesystem is used over mountpoint in a +terminal, everything is fine and these xattrs are not required. + +Currently the way these xattrs are implemented is simple.
All the xattr calls +(getxattr, setxattr, listxattr, removexattr) are passed down to underlying filesystem, +most of the cases when exported FS is on MacOS X itself, these keys are supported, hence +the fops succeed. But in the case of using exports of different OS on Darwin the issue is +extended attribute prefix like 'com.apple.' may not be supported, hence the problem with +Finder. To solve this issue, GlusterFS returns virtual default values to these keys, which +works fine on most of the cases. + diff --git a/doc/porting_guide.txt b/doc/porting_guide.txt new file mode 100644 index 000000000..905bb4228 --- /dev/null +++ b/doc/porting_guide.txt @@ -0,0 +1,45 @@ + GlusterFS Porting Guide + ----------------------- + +* General setup + +The configure script will detect the target platform for the build. +All platform-specific CFLAGS, macro definitions should be done +in configure.ac + +Platform-specific code can be written like this: + +#ifdef GF_DARWIN_HOST_OS + /* some code specific to Darwin */ +#endif + +* Coding guidelines + +In general, avoid glibc extensions. For example, nested functions don't work +on Mac OS X. It is best to stick to C99. + +When using library calls and system calls, pay attention to the +portability notes. As far as possible stick to POSIX-specified behavior. +Do not use anything not expressly permitted by the specification. For example, +some fields in structures may be present only on certain platforms. Avoid +use of such things. + +Do not pass values of constants such as F_*, O_*, errno values, etc. across +platforms. + +Please refer to compat-errno.h for more details about errno handling inside +glusterfs for cross platform. + +* Specific issues + +- The argp library is available only on Linux through glibc, but for other + platforms glusterfs has already included argp-standalone library which will + be statically linked during the glusterfs build. + +- Extended attribute calls (setxattr, listxattr, etc.)
have differing prototypes + on different platforms. See compat.h for macro definitions to resolve this, also + read out the specific extended attribute documentation for your platforms. + +------------------------------------------ +Last revised: Thu Feb 28 13:58:07 IST 2008 +------------------------------------------ diff --git a/doc/qa/qa-client.vol b/doc/qa/qa-client.vol new file mode 100644 index 000000000..176dda589 --- /dev/null +++ b/doc/qa/qa-client.vol @@ -0,0 +1,170 @@ +# This spec file should be used for testing before any release +# + +# 1st client +volume client1 + type protocol/client + option transport-type tcp # for TCP/IP transport +# option transport-type ib-sdp # for Infiniband transport +# option transport-type ib-verbs # for ib-verbs transport +# option transport.ib-verbs.work-request-send-size 131072 +# option transport.ib-verbs.work-request-send-count 64 +# option transport.ib-verbs.work-request-recv-size 131072 +# option transport.ib-verbs.work-request-recv-count 64 + option remote-host 127.0.0.1 + option remote-subvolume ra1 +end-volume + +# 2nd client +volume client2 + type protocol/client + option transport-type tcp # for TCP/IP transport +# option transport-type ib-sdp # for Infiniband transport +# option transport-type ib-verbs # for ib-verbs transport + option remote-host 127.0.0.1 + option remote-subvolume ra2 +end-volume + +# 3rd client +volume client3 + type protocol/client + option transport-type tcp # for TCP/IP transport +# option transport-type ib-sdp # for Infiniband transport +# option transport-type ib-verbs # for ib-verbs transport + option remote-host 127.0.0.1 + option remote-subvolume ra3 +end-volume + +# 4th client +volume client4 + type protocol/client + option transport-type tcp # for TCP/IP transport +# option transport-type ib-sdp # for Infiniband transport +# option transport-type ib-verbs # for ib-verbs transport + option remote-host 127.0.0.1 + option remote-subvolume ra4 +end-volume + +# 5th client +volume 
client5 + type protocol/client + option transport-type tcp # for TCP/IP transport +# option transport-type ib-sdp # for Infiniband transport +# option transport-type ib-verbs # for ib-verbs transport + option remote-host 127.0.0.1 + option remote-subvolume ra5 +end-volume + +# 6th client +volume client6 + type protocol/client + option transport-type tcp # for TCP/IP transport +# option transport-type ib-sdp # for Infiniband transport +# option transport-type ib-verbs # for ib-verbs transport + option remote-host 127.0.0.1 + option remote-subvolume ra6 +end-volume + +# 7th client +volume client7 + type protocol/client + option transport-type tcp # for TCP/IP transport +# option transport-type ib-sdp # for Infiniband transport +# option transport-type ib-verbs # for ib-verbs transport + option remote-host 127.0.0.1 + option remote-subvolume ra7 +end-volume + +# 8th client +volume client8 + type protocol/client + option transport-type tcp # for TCP/IP transport +# option transport-type ib-sdp # for Infiniband transport +# option transport-type ib-verbs # for ib-verbs transport + option remote-host 127.0.0.1 + option remote-subvolume ra8 +end-volume + +# 1st Stripe (client1 client2) +volume stripe1 + type cluster/stripe + subvolumes client1 client2 + option block-size 128KB # all striped in 128kB block +end-volume + +# 2nd Stripe (client3 client4) +volume stripe2 + type cluster/stripe + subvolumes client3 client4 + option block-size 128KB # all striped in 128kB block +end-volume + +# 3rd Stripe (client5 client6) +volume stripe3 + type cluster/stripe + subvolumes client5 client6 + option block-size 128KB # all striped in 128kB block +end-volume + +# 4th Stripe (client7 client8) +volume stripe4 + type cluster/stripe + subvolumes client7 client8 + option block-size 128KB # all striped in 128kB block +end-volume + + +# 1st replicate +volume replicate1 + type cluster/replicate + subvolumes stripe1 stripe2 +end-volume + +# 2nd replicate +volume replicate2 + type
cluster/replicate + subvolumes stripe3 stripe4 +end-volume + +volume ns + type protocol/client + option transport-type tcp + option remote-host 127.0.0.1 + option remote-subvolume brick-ns +end-volume + +# Unify +volume unify0 + type cluster/unify + subvolumes replicate1 replicate2 +# subvolumes stripe1 stripe3 + option namespace ns + option scheduler rr # random # alu # nufa + option rr.limits.min-free-disk 1GB +# option alu.order x +# option alu.x.entry-threshold +# option alu.x.exit-threshold +end-volume + + +# ==== Performance Translators ==== +# The default options for performance translators should be the best for 90+% of the cases +volume iot + type performance/io-threads + subvolumes unify0 +end-volume + +volume wb + type performance/write-behind + subvolumes iot +end-volume + +volume ioc + type performance/io-cache + subvolumes wb +end-volume + +volume ra + type performance/read-ahead + subvolumes ioc +end-volume diff --git a/doc/qa/qa-high-avail-client.vol b/doc/qa/qa-high-avail-client.vol new file mode 100644 index 000000000..69cb8dd30 --- /dev/null +++ b/doc/qa/qa-high-avail-client.vol @@ -0,0 +1,17 @@ +volume client + type protocol/client + option transport-type tcp + option remote-host localhost + option transport.socket.remote-port 7001 + option remote-subvolume server1-iot +end-volume + +volume ra + type performance/read-ahead + subvolumes client +end-volume + +volume wb + type performance/write-behind + subvolumes ra +end-volume diff --git a/doc/qa/qa-high-avail-server.vol b/doc/qa/qa-high-avail-server.vol new file mode 100644 index 000000000..09d91c4c4 --- /dev/null +++ b/doc/qa/qa-high-avail-server.vol @@ -0,0 +1,346 @@ + +# -- server 1 -- +volume server1-posix1 + type storage/posix + option directory /tmp/ha-export1/ +end-volume + +volume server1-ns1 + type storage/posix + option directory /tmp/ha-export-ns1/ +end-volume + +volume server1-client2 + type protocol/client + option transport-type tcp + option remote-host 127.0.0.1 + option 
transport.socket.remote-port 7002 + option remote-subvolume server2-posix2 +end-volume + +volume server1-ns2 + type protocol/client + option transport-type tcp + option remote-host 127.0.0.1 + option transport.socket.remote-port 7002 + option remote-subvolume server2-ns2 +end-volume + +volume server1-client3 + type protocol/client + option transport-type tcp + option remote-host 127.0.0.1 + option transport.socket.remote-port 7003 + option remote-subvolume server3-posix3 +end-volume + +volume server1-ns3 + type protocol/client + option transport-type tcp + option remote-host 127.0.0.1 + option transport.socket.remote-port 7003 + option remote-subvolume server3-ns3 +end-volume + +volume server1-io1 + type performance/io-threads + option thread-count 8 + subvolumes server1-posix1 +end-volume + + +volume server1-io2 + type performance/io-threads + option thread-count 8 + subvolumes server1-client2 +end-volume + +volume server1-io3 + type performance/io-threads + option thread-count 8 + subvolumes server1-client3 +end-volume + +volume server1-ns-io1 + type performance/io-threads + option thread-count 8 + subvolumes server1-ns1 +end-volume + +volume server1-ns-io2 + type performance/io-threads + option thread-count 8 + subvolumes server1-ns2 +end-volume + +volume server1-ns-io3 + type performance/io-threads + option thread-count 8 + subvolumes server1-ns3 +end-volume + +volume server1-ns-replicate + type cluster/replicate + subvolumes server1-ns-io1 server1-ns-io2 server1-ns-io3 +end-volume + +volume server1-storage-replicate + type cluster/replicate + subvolumes server1-io1 server1-io2 server1-io3 +end-volume + +volume server1-unify + type cluster/unify + #option self-heal off + subvolumes server1-storage-replicate + option namespace server1-ns-replicate + option scheduler rr +end-volume + +volume server1-iot + type performance/io-threads + option thread-count 8 + subvolumes server1-unify +end-volume + +volume server1 + type protocol/server + option transport-type tcp 
+ subvolumes server1-iot + option transport.socket.listen-port 7001 + option auth.addr.server1-posix1.allow * + option auth.addr.server1-ns1.allow * + option auth.addr.server1-iot.allow * +end-volume + + +# == Server2 == +volume server2-client1 + type protocol/client + option transport-type tcp + option remote-host 127.0.0.1 + option transport.socket.remote-port 7001 + option remote-subvolume server1-posix1 +end-volume + +volume server2-ns1 + type protocol/client + option transport-type tcp + option remote-host 127.0.0.1 + option transport.socket.remote-port 7001 + option remote-subvolume server1-ns1 +end-volume + +volume server2-posix2 + type storage/posix + option directory /tmp/ha-export2/ +end-volume + +volume server2-ns2 + type storage/posix + option directory /tmp/ha-export-ns2/ +end-volume + +volume server2-client3 + type protocol/client + option transport-type tcp + option remote-host 127.0.0.1 + option transport.socket.remote-port 7003 + option remote-subvolume server3-posix3 +end-volume + +volume server2-ns3 + type protocol/client + option transport-type tcp + option remote-host 127.0.0.1 + option transport.socket.remote-port 7003 + option remote-subvolume server3-ns3 +end-volume + +volume server2-io1 + type performance/io-threads + option thread-count 8 + subvolumes server2-client1 +end-volume + + +volume server2-io2 + type performance/io-threads + option thread-count 8 + subvolumes server2-posix2 +end-volume + +volume server2-io3 + type performance/io-threads + option thread-count 8 + subvolumes server2-client3 +end-volume + +volume server2-ns-io1 + type performance/io-threads + option thread-count 8 + subvolumes server2-ns1 +end-volume + +volume server2-ns-io2 + type performance/io-threads + option thread-count 8 + subvolumes server2-ns2 +end-volume + +volume server2-ns-io3 + type performance/io-threads + option thread-count 8 + subvolumes server2-ns3 +end-volume + +volume server2-ns-replicate + type cluster/replicate + subvolumes server2-ns-io1 
server2-ns-io2 server2-ns-io3 +end-volume + +volume server2-storage-replicate + type cluster/replicate + subvolumes server2-io2 server2-io3 server2-io1 +end-volume + +volume server2-unify + type cluster/unify + option self-heal off + subvolumes server2-storage-replicate + option namespace server2-ns-replicate + option scheduler rr +end-volume + +volume server2-iot + type performance/io-threads + option thread-count 8 + option cache-size 64MB + subvolumes server2-unify +end-volume + +volume server2 + type protocol/server + option transport-type tcp + subvolumes server2-iot + option transport.socket.listen-port 7002 + option auth.addr.server2-posix2.allow * + option auth.addr.server2-ns2.allow * + option auth.addr.server2-iot.allow * +end-volume + +# == server 3 == +volume server3-client1 + type protocol/client + option transport-type tcp + option remote-host 127.0.0.1 + option transport.socket.remote-port 7001 + option remote-subvolume server1-posix1 +end-volume + +volume server3-ns1 + type protocol/client + option transport-type tcp + option remote-host 127.0.0.1 + option transport.socket.remote-port 7001 + option remote-subvolume server1-ns1 +end-volume + +volume server3-client2 + type protocol/client + option transport-type tcp + option remote-host 127.0.0.1 + option transport.socket.remote-port 7002 + option remote-subvolume server2-posix2 +end-volume + +volume server3-ns2 + type protocol/client + option transport-type tcp + option remote-host 127.0.0.1 + option transport.socket.remote-port 7002 + option remote-subvolume server2-ns2 +end-volume + +volume server3-posix3 + type storage/posix + option directory /tmp/ha-export3/ +end-volume + +volume server3-ns3 + type storage/posix + option directory /tmp/ha-export-ns3/ +end-volume + +volume server3-io1 + type performance/io-threads + option thread-count 8 + subvolumes server3-client1 +end-volume + + +volume server3-io2 + type performance/io-threads + option thread-count 8 + subvolumes server3-client2 +end-volume + 
+volume server3-io3 + type performance/io-threads + option thread-count 8 + subvolumes server3-posix3 +end-volume + +volume server3-ns-io1 + type performance/io-threads + option thread-count 8 + subvolumes server3-ns1 +end-volume + +volume server3-ns-io2 + type performance/io-threads + option thread-count 8 + subvolumes server3-ns2 +end-volume + +volume server3-ns-io3 + type performance/io-threads + option thread-count 8 + subvolumes server3-ns3 +end-volume + +volume server3-ns-replicate + type cluster/replicate + subvolumes server3-ns-io1 server3-ns-io2 server3-ns-io3 +end-volume + +volume server3-storage-replicate + type cluster/replicate + subvolumes server3-io3 server3-io2 server3-io1 +end-volume + +volume server3-unify + type cluster/unify + option self-heal off + subvolumes server3-storage-replicate + option namespace server3-ns-replicate + option scheduler rr +end-volume + +volume server3-iot + type performance/io-threads + option thread-count 8 + option cache-size 64MB + subvolumes server3-unify +end-volume + +volume server3 + type protocol/server + option transport-type tcp + subvolumes server3-iot + option transport.socket.listen-port 7003 + option auth.addr.server3-posix3.allow * + option auth.addr.server3-ns3.allow * + option auth.addr.server3-iot.allow * +end-volume + diff --git a/doc/qa/qa-server.vol b/doc/qa/qa-server.vol new file mode 100644 index 000000000..1c245c324 --- /dev/null +++ b/doc/qa/qa-server.vol @@ -0,0 +1,284 @@ +# This spec file should be used for testing before any release +# + +# Namespace posix +volume brick-ns + type storage/posix # POSIX FS translator + option directory /tmp/export-ns # Export this directory +end-volume + +# 1st server + +volume brick1 + type storage/posix # POSIX FS translator + option directory /tmp/export1 # Export this directory +end-volume + +# == Posix-Locks == + volume plocks1 + type features/posix-locks +# option mandatory on + subvolumes brick1 + end-volume + +volume iot1 + type performance/io-threads + 
subvolumes plocks1 # change properly if above commented volumes needs to be included +# option +end-volume + +volume wb1 + type performance/write-behind + subvolumes iot1 +# option +end-volume + +volume ra1 + type performance/read-ahead + subvolumes wb1 +# option +end-volume + +volume brick2 + type storage/posix # POSIX FS translator + option directory /tmp/export2 # Export this directory +end-volume + +# == TrashCan Translator == +# volume trash2 +# type features/trash +# option trash-dir /.trashcan +# subvolumes brick2 +# end-volume + +# == Posix-Locks == +volume plocks2 + type features/posix-locks +# option + subvolumes brick2 +end-volume + +volume iot2 + type performance/io-threads + subvolumes plocks2 # change properly if above commented volumes needs to be included +# option +end-volume + +volume wb2 + type performance/write-behind + subvolumes iot2 +# option +end-volume + +volume ra2 + type performance/read-ahead + subvolumes wb2 +# option +end-volume + +volume brick3 + type storage/posix # POSIX FS translator + option directory /tmp/export3 # Export this directory +end-volume + +# == TrashCan Translator == +# volume trash3 +# type features/trash +# option trash-dir /.trashcan +# subvolumes brick3 +# end-volume + +# == Posix-Locks == +volume plocks3 + type features/posix-locks +# option + subvolumes brick3 +end-volume + +volume iot3 + type performance/io-threads + subvolumes plocks3 # change properly if above commented volumes needs to be included +# option +end-volume + +volume wb3 + type performance/write-behind + subvolumes iot3 +# option +end-volume + +volume ra3 + type performance/read-ahead + subvolumes wb3 +# option +end-volume + +volume brick4 + type storage/posix # POSIX FS translator + option directory /tmp/export4 # Export this directory +end-volume + +# == Posix-Locks == +volume plocks4 + type features/posix-locks +# option + subvolumes brick4 +end-volume + +volume iot4 + type performance/io-threads + subvolumes plocks4 # change properly if above 
commented volumes needs to be included +# option +end-volume + +volume wb4 + type performance/write-behind + subvolumes iot4 +# option +end-volume + +volume ra4 + type performance/read-ahead + subvolumes wb4 +# option +end-volume + +volume brick5 + type storage/posix # POSIX FS translator + option directory /tmp/export5 # Export this directory +end-volume + + +# == Posix-Locks == +volume plocks5 + type features/posix-locks +# option + subvolumes brick5 +end-volume + +volume iot5 + type performance/io-threads + subvolumes plocks5 # change properly if above commented volumes needs to be included +# option +end-volume + +volume wb5 + type performance/write-behind + subvolumes iot5 +# option +end-volume + +volume ra5 + type performance/read-ahead + subvolumes wb5 +# option +end-volume + +volume brick6 + type storage/posix # POSIX FS translator + option directory /tmp/export6 # Export this directory +end-volume + +# == Posix-Locks == +volume plocks6 + type features/posix-locks +# option + subvolumes brick6 +end-volume + +volume iot6 + type performance/io-threads + subvolumes plocks6 # change properly if above commented volumes needs to be included +# option +end-volume + +volume wb6 + type performance/write-behind + subvolumes iot6 +# option +end-volume + +volume ra6 + type performance/read-ahead + subvolumes wb6 +# option +end-volume + +volume brick7 + type storage/posix # POSIX FS translator + option directory /tmp/export7 # Export this directory +end-volume + +# == Posix-Locks == +volume plocks7 + type features/posix-locks +# option + subvolumes brick7 +end-volume + +volume iot7 + type performance/io-threads + subvolumes plocks7 # change properly if above commented volumes needs to be included +# option +end-volume + +volume wb7 + type performance/write-behind + subvolumes iot7 +# option +end-volume + +volume ra7 + type performance/read-ahead + subvolumes wb7 +# option +end-volume + +volume brick8 + type storage/posix # POSIX FS translator + option directory 
/tmp/export8 # Export this directory +end-volume + +# == Posix-Locks == +volume plocks8 + type features/posix-locks +# option + subvolumes brick8 +end-volume + +volume iot8 + type performance/io-threads + subvolumes plocks8 # change properly if above commented volumes needs to be included +# option +end-volume + +volume wb8 + type performance/write-behind + subvolumes iot8 +# option +end-volume + +volume ra8 + type performance/read-ahead + subvolumes wb8 +# option +end-volume + +volume server8 + type protocol/server + subvolumes ra8 ra1 ra2 ra3 ra4 ra5 ra6 ra7 brick-ns + option transport-type tcp # For TCP/IP transport +# option transport-type ib-sdp # For Infiniband transport +# option transport-type ib-verbs # For ib-verbs transport + option client-volume-filename /examples/qa-client.vol + option auth.addr.ra1.allow * # Allow access to "ra1" volume + option auth.addr.ra2.allow * # Allow access to "ra2" volume + option auth.addr.ra3.allow * # Allow access to "ra3" volume + option auth.addr.ra4.allow * # Allow access to "ra4" volume + option auth.addr.ra5.allow * # Allow access to "ra5" volume + option auth.addr.ra6.allow * # Allow access to "ra6" volume + option auth.addr.ra7.allow * # Allow access to "ra7" volume + option auth.addr.ra8.allow * # Allow access to "ra8" volume + option auth.addr.brick-ns.allow * # Allow access to "brick-ns" volume +end-volume + diff --git a/doc/replicate.lyx b/doc/replicate.lyx new file mode 100644 index 000000000..2bbcb652a --- /dev/null +++ b/doc/replicate.lyx @@ -0,0 +1,797 @@ +#LyX 1.4.2 created this file.
For more info see http://www.lyx.org/ +\lyxformat 245 +\begin_document +\begin_header +\textclass article +\language english +\inputencoding auto +\fontscheme default +\graphics default +\paperfontsize default +\spacing single +\papersize default +\use_geometry false +\use_amsmath 1 +\cite_engine basic +\use_bibtopic false +\paperorientation portrait +\secnumdepth 3 +\tocdepth 3 +\paragraph_separation skip +\defskip medskip +\quotes_language english +\papercolumns 1 +\papersides 1 +\paperpagestyle default +\tracking_changes false +\output_changes false +\end_header + +\begin_body + +\begin_layout Title + +\size larger +Automatic File Replication (replicate) in GlusterFS +\end_layout + +\begin_layout Author +Vikas Gorur +\family typewriter +\size larger + +\end_layout + +\begin_layout Standard +\begin_inset ERT +status open + +\begin_layout Standard + + +\backslash +hrule +\end_layout + +\end_inset + + +\end_layout + +\begin_layout Section* +Overview +\end_layout + +\begin_layout Standard +This document describes the design and usage of the replicate translator in GlusterFS. + This document is valid for the 1.4.x releases, and not earlier ones. +\end_layout + +\begin_layout Standard +The replicate translator of GlusterFS aims to keep identical copies of a file + on all its subvolumes, as far as possible. + It tries to do this by performing all filesystem mutation operations (writing + data, creating files, changing ownership, etc.) on all its subvolumes in + such a way that if an operation succeeds on at least one subvolume, all + other subvolumes can later be brought up to date. +\end_layout + +\begin_layout Standard +In the rest of the document the terms +\begin_inset Quotes eld +\end_inset + +subvolume +\begin_inset Quotes erd +\end_inset + + and +\begin_inset Quotes eld +\end_inset + +server +\begin_inset Quotes erd +\end_inset + + are used interchangeably, trusting that it will cause no confusion to the + reader.
+\end_layout + +\begin_layout Section* +Usage +\end_layout + +\begin_layout Standard +A sample volume declaration for replicate looks like this: +\end_layout + +\begin_layout Standard +\begin_inset ERT +status open + +\begin_layout Standard + + +\backslash +begin{verbatim} +\end_layout + +\begin_layout Standard + +volume replicate +\end_layout + +\begin_layout Standard + + type cluster/replicate +\end_layout + +\begin_layout Standard + + # options, see below for description +\end_layout + +\begin_layout Standard + + subvolumes brick1 brick2 +\end_layout + +\begin_layout Standard + +end-volume +\end_layout + +\begin_layout Standard + + +\backslash +end{verbatim} +\end_layout + +\begin_layout Standard + +\end_layout + +\begin_layout Standard + +\end_layout + +\begin_layout Standard + +\end_layout + +\end_inset + + +\end_layout + +\begin_layout Standard +This defines a replicate volume with two subvolumes, brick1, and brick2. + For replicate to work properly, it is essential that its subvolumes support +\series bold +extended attributes +\series default +. + This means that you should choose a backend filesystem that supports extended + attributes, like XFS, ReiserFS, or Ext3. +\end_layout + +\begin_layout Standard +The storage volumes used as backend for replicate +\emph on +must +\emph default + have a posix-locks volume loaded above them.
+\end_layout + +\begin_layout Standard +\begin_inset ERT +status open + +\begin_layout Standard + + +\backslash +begin{verbatim} +\end_layout + +\begin_layout Standard + +volume brick1 +\end_layout + +\begin_layout Standard + + type features/posix-locks +\end_layout + +\begin_layout Standard + + subvolumes brick1-ds +\end_layout + +\begin_layout Standard + +end-volume +\end_layout + +\begin_layout Standard + + +\backslash +end{verbatim} +\end_layout + +\end_inset + + +\end_layout + +\begin_layout Section* +Design +\end_layout + +\begin_layout Subsection* +Read algorithm +\end_layout + +\begin_layout Standard +All operations that do not modify the file or directory are sent to all + the subvolumes and the first successful reply is returned to the application. +\end_layout + +\begin_layout Standard +The read() system call (reading data from a file) is an exception. + For read() calls, replicate tries to do load balancing by sending all reads from + a particular file to a particular server. +\end_layout + +\begin_layout Standard +The read algorithm is also affected by the option read-subvolume; see below + for details. +\end_layout + +\begin_layout Subsection* +Classes of file operations +\end_layout + +\begin_layout Standard +replicate divides all filesystem write operations into three classes: +\end_layout + +\begin_layout Itemize + +\series bold +data: +\series default +Operations that modify the contents of a file (write, truncate). +\end_layout + +\begin_layout Itemize + +\series bold +metadata: +\series default +Operations that modify attributes of a file or directory (permissions, ownership +, etc.). +\end_layout + +\begin_layout Itemize + +\series bold +entry: +\series default +Operations that create or delete directory entries (mkdir, create, rename, + rmdir, unlink, etc.). 
+\end_layout + +\begin_layout Subsection* +Locking and Change Log +\end_layout + +\begin_layout Standard +To ensure consistency across subvolumes, replicate holds a lock whenever a modificatio +n is being made to a file or directory. + By default, replicate considers the first subvolume as the sole lock server. + However, the number of lock servers can be increased up to the total number + of subvolumes. +\end_layout + +\begin_layout Standard +The change log is a set of extended attributes associated with files and + directories that replicate maintains. + The change log keeps track of the changes made to files and directories + (data, metadata, entry) so that the self-heal algorithm knows which copy + of a file or directory is the most recent one. +\end_layout + +\begin_layout Subsection* +Write algorithm +\end_layout + +\begin_layout Standard +The algorithm for all write operations (data, metadata, entry) is: +\end_layout + +\begin_layout Enumerate +Lock the file (or directory) on all of the lock servers (see options below). +\end_layout + +\begin_layout Enumerate +Write change log entries on all servers. +\end_layout + +\begin_layout Enumerate +Perform the operation. +\end_layout + +\begin_layout Enumerate +Erase change log entries. +\end_layout + +\begin_layout Enumerate +Unlock the file (or directory) on all of the lock servers. +\end_layout + +\begin_layout Standard +The above algorithm is a simplified version intended for general users. + Please refer to the source code for the full details. +\end_layout + +\begin_layout Subsection* +Self-Heal +\end_layout + +\begin_layout Standard +replicate automatically tries to fix any inconsistencies it detects among different + copies of a file. + It uses information in the change log to determine which copy is the +\begin_inset Quotes eld +\end_inset + +correct +\begin_inset Quotes erd +\end_inset + + version.
+\end_layout + +\begin_layout Standard +Self-heal is triggered when a file or directory is first +\begin_inset Quotes eld +\end_inset + +accessed +\begin_inset Quotes erd +\end_inset + +, that is, the first time any operation is attempted on it. + The self-heal algorithm does the following things: +\end_layout + +\begin_layout Standard +If the entry being accessed is a directory: +\end_layout + +\begin_layout Itemize +The contents of the +\begin_inset Quotes eld +\end_inset + +correct +\begin_inset Quotes erd +\end_inset + + version is replicated on all subvolumes, by deleting entries and creating + entries as necessary. +\end_layout + +\begin_layout Standard +If the entry being accessed is a file: +\end_layout + +\begin_layout Itemize +If the file does not exist on some subvolumes, it is created. +\end_layout + +\begin_layout Itemize +If there is a mismatch in the size of the file, or ownership, or permission, + it is fixed. +\end_layout + +\begin_layout Itemize +If the change log indicates that some copies need updating, they are updated. +\end_layout + +\begin_layout Subsection* +Split-brain +\end_layout + +\begin_layout Standard +It may happen that one replicate client can access only some of the servers in + a cluster and another replicate client can access the remaining servers. + Or it may happen that in a cluster of two servers, one server goes down + and comes back up, but the other goes down immediately. + Both these scenarios result in a +\begin_inset Quotes eld +\end_inset + +split-brain +\begin_inset Quotes erd +\end_inset + +. +\end_layout + +\begin_layout Standard +In a split-brain situation, there will be two or more copies of a file, + all of which are +\begin_inset Quotes eld +\end_inset + +correct +\begin_inset Quotes erd +\end_inset + + in some sense. + replicate without manual intervention has no way of knowing what to do, since + it cannot consider any single copy as definitive, nor does it know of any + meaningful way to merge the copies. 
+\end_layout + +\begin_layout Standard +If replicate detects that a split-brain has happened on a file, it disallows opening + of that file. + You will have to manually resolve the conflict by deleting all but one + copy of the file. + Alternatively you can set an automatic split-brain resolution policy by + using the `favorite-child' option (see below). +\end_layout + +\begin_layout Section* +Translator Options +\end_layout + +\begin_layout Standard +replicate accepts the following options: +\end_layout + +\begin_layout Subsection* +read-subvolume (default: none) +\end_layout + +\begin_layout Standard +The value of this option must be the name of a subvolume. + If given, all read operations are sent to only the specified subvolume, + instead of being balanced across all subvolumes. +\end_layout + +\begin_layout Subsection* +favorite-child (default: none) +\end_layout + +\begin_layout Standard +The value of this option must be the name of a subvolume. + If given, the specified subvolume will be preferentially used in resolving + conflicts ( +\begin_inset Quotes eld +\end_inset + +split-brain +\begin_inset Quotes erd +\end_inset + +). + This means if a discrepancy is noticed in the attributes or content of + a file, the copy on the `favorite-child' will be considered the definitive + version and its contents will +\emph on +overwrite +\emph default +the contents of all other copies. + Use this option with caution! It is possible to +\emph on +lose data +\emph default + with this option. + If you are in doubt, do not specify this option. +\end_layout + +\begin_layout Subsection* +Self-heal options +\end_layout + +\begin_layout Standard +Setting any of these options to +\begin_inset Quotes eld +\end_inset + +off +\begin_inset Quotes erd +\end_inset + + prevents that kind of self-heal from being done on a file or directory. + For example, if metadata self-heal is turned off, permissions and ownership + are no longer fixed automatically. 
+\end_layout + +\begin_layout Subsubsection* +data-self-heal (default: on) +\end_layout + +\begin_layout Standard +Enable/disable self-healing of file contents. +\end_layout + +\begin_layout Subsubsection* +metadata-self-heal (default: off) +\end_layout + +\begin_layout Standard +Enable/disable self-healing of metadata (permissions, ownership, modification + times). +\end_layout + +\begin_layout Subsubsection* +entry-self-heal (default: on) +\end_layout + +\begin_layout Standard +Enable/disable self-healing of directory entries. +\end_layout + +\begin_layout Subsection* +Change Log options +\end_layout + +\begin_layout Standard +If any of these options is turned off, it disables writing of change log + entries for that class of file operations. + That is, steps 2 and 4 of the write algorithm (see above) are not done. + Note that if the change log is not written, the self-heal algorithm cannot + determine the +\begin_inset Quotes eld +\end_inset + +correct +\begin_inset Quotes erd +\end_inset + + version of a file and hence self-heal will only be able to fix +\begin_inset Quotes eld +\end_inset + +obviously +\begin_inset Quotes erd +\end_inset + + wrong things (such as a file existing on only one node). +\end_layout + +\begin_layout Subsubsection* +data-change-log (default: on) +\end_layout + +\begin_layout Standard +Enable/disable writing of change log for data operations. +\end_layout + +\begin_layout Subsubsection* +metadata-change-log (default: on) +\end_layout + +\begin_layout Standard +Enable/disable writing of change log for metadata operations. +\end_layout + +\begin_layout Subsubsection* +entry-change-log (default: on) +\end_layout + +\begin_layout Standard +Enable/disable writing of change log for entry operations. +\end_layout + +\begin_layout Subsection* +Locking options +\end_layout + +\begin_layout Standard +These options let you specify the number of lock servers to use for each + class of file operations. 
+ The default values are satisfactory in most cases. + If you are extra paranoid, you may want to increase the values. + However, be very cautious if you set the data- or entry- lock server counts + to zero, since this can result in +\emph on +lost data. + +\emph default + For example, if you set the data-lock-server-count to zero, and two application +s write to the same region of a file, there is a possibility that none of + your servers will have all the data. + In other words, the copies will be +\emph on +inconsistent +\emph default +, and +\emph on +incomplete +\emph default +. + Do not set data- and entry- lock server counts to zero unless you absolutely + know what you are doing and agree to not hold GlusterFS responsible for + any lost data. +\end_layout + +\begin_layout Subsubsection* +data-lock-server-count (default: 1) +\end_layout + +\begin_layout Standard +Number of lock servers to use for data operations. +\end_layout + +\begin_layout Subsubsection* +metadata-lock-server-count (default: 0) +\end_layout + +\begin_layout Standard +Number of lock servers to use for metadata operations. +\end_layout + +\begin_layout Subsubsection* +entry-lock-server-count (default: 1) +\end_layout + +\begin_layout Standard +Number of lock servers to use for entry operations. +\end_layout + +\begin_layout Section* +Known Issues +\end_layout + +\begin_layout Subsection* +Self-heal of file with more than one link (hard links): +\end_layout + +\begin_layout Standard +Consider two servers, A and B. + Assume A is down, and the user creates a file `new' as a hard link to a + file `old'. + When A comes back up, replicate will see that the file `new' does not exist on + A, and self-heal will create the file and copy the contents from B. + However, now on server A the file `new' is not a link to the file `old' + but an entirely different file. +\end_layout + +\begin_layout Standard +We know of no easy way to fix this problem, but we will try to fix it in + forthcoming releases. 
+\end_layout + +\begin_layout Subsection* +File re-opening after a server comes back up: +\end_layout + +\begin_layout Standard +If a server A goes down and comes back up, any files which were opened while + A was down and are still open will not have their writes replicated on + A. + In other words, data replication only happens on those servers which were + alive when the file was opened. +\end_layout + +\begin_layout Standard +This is a rather tricky issue but we hope to fix it very soon. +\end_layout + +\begin_layout Section* +Frequently Asked Questions +\end_layout + +\begin_layout Subsection* +1. + How can I force self-heal to happen? +\end_layout + +\begin_layout Standard +You can force self-heal to happen on your cluster by running a script or + a command that accesses every file. + A simple way to do it would be: +\end_layout + +\begin_layout Standard +\begin_inset ERT +status open + +\begin_layout Standard + +\end_layout + +\begin_layout Standard + + +\backslash +begin{verbatim} +\end_layout + +\begin_layout Standard + +$ ls -lR +\end_layout + +\begin_layout Standard + + +\backslash +end{verbatim} +\end_layout + +\begin_layout Standard + +\end_layout + +\end_inset + + +\end_layout + +\begin_layout Standard +Run the command in all directories which you want to forcibly self-heal. +\end_layout + +\begin_layout Subsection* +2. + Which backend filesystem should I use for replicate? +\end_layout + +\begin_layout Standard +You can use any backend filesystem that supports extended attributes. + We know of users successfully using XFS, ReiserFS, and Ext3. +\end_layout + +\begin_layout Subsection* +3. + What can I do to improve replicate performance? +\end_layout + +\begin_layout Standard +Try loading performance translators such as io-threads, write-behind, io-cache, + and read-ahead depending on your workload. + If you are willing to sacrifice correctness in corner cases, you can experiment + with the lock-server-count and the change-log options (see above). 
+ As warned earlier, be very careful! +\end_layout + +\begin_layout Subsection* +4. + How can I selectively replicate files? +\end_layout + +\begin_layout Standard +There is no support for selective replication in replicate itself. + You can achieve selective replication by loading the unify translator over + replicate, and using the switch scheduler. + Configure unify with two subvolumes, one of them being replicate. + Using the switch scheduler, schedule all files for which you need replication + to the replicate subvolume. + Consult unify and switch documentation for more details. +\end_layout + +\begin_layout Section* +Contact +\end_layout + +\begin_layout Standard +If you need more assistance on replicate, contact us on the mailing list (visit gluster.org for details on how to subscribe). +\end_layout + +\begin_layout Standard +Send your comments and suggestions about this document to . +\end_layout + +\end_body +\end_document diff --git a/doc/replicate.pdf b/doc/replicate.pdf new file mode 100644 index 000000000..b7212af2b Binary files /dev/null and b/doc/replicate.pdf differ diff --git a/doc/solaris-related-xattrs.txt b/doc/solaris-related-xattrs.txt new file mode 100644 index 000000000..e26efa5d1 --- /dev/null +++ b/doc/solaris-related-xattrs.txt @@ -0,0 +1,44 @@ + Solaris Extended Attributes + +In Solaris, extended attributes are logically supported as files +within the filesystem. The file system is therefore augmented +with an orthogonal namespace of file attributes. Attribute values +are accessed by file descriptors obtained through a special attribute +interface. This type of logical view of "attributes as files" allows +the leveraging of existing file system interface functionality to +support the construction, deletion and manipulation of attributes. + +However, while testing this functionality provided by Solaris +we have come across two major issues, described below. + +1. 
Symlink XATTR_NOFOLLOW is not present for creating extended attributes + directly on symlinks, unlike on other platforms (Linux, Mac OS X, BSD etc.). + O_NOFOLLOW is implemented for the "openat()" call, which sets + errno to ELOOP whenever it encounters a symlink, and there is also + AT_SYMLINK_NOFOLLOW, which is not supported by calls like + "attropen(), openat()" + + A snippet of test code which helped us understand this behaviour: + -------------------------------------- + attrfd = attropen (path, key, + flags|AT_SYMLINK_NOFOLLOW|O_CREAT|O_WRONLY|O_NOFOLLOW, 0777); + if (attrfd >= 0) { + ftruncate (attrfd, 0); + ret = write (attrfd, value, size); + close (attrfd); + } else { + fprintf (stderr, "Couldn't set extended attribute for %s (%d)\n", + path, errno); + } + -------------------------------------- + +2. Extended attributes for special files like device files and fifo files + are not supported under Solaris. + +Apart from these glitches, almost all of the porting functionality +for extended attribute calls has been properly implemented in compat.c +by writing wrapper functions around +"attropen()", "openat()", "unlinkat()" + + + diff --git a/doc/translator-options.txt b/doc/translator-options.txt new file mode 100644 index 000000000..3d8402be5 --- /dev/null +++ b/doc/translator-options.txt @@ -0,0 +1,221 @@ +mount/fuse: + * direct-io-mode GF_OPTION_TYPE_BOOL on|off|yes|no + * macfuse-local GF_OPTION_TYPE_BOOL on|off|yes|no + * mount-point (mountpoint) GF_OPTION_TYPE_PATH + * attribute-timeout GF_OPTION_TYPE_TIME 0-3600 + * entry-timeout GF_OPTION_TYPE_TIME 0-3600 + +protocol/server: + * transport-type GF_OPTION_TYPE_STR tcp|socket|ib-verbs|unix|ib-sdp| + tcp/client|ib-verbs/client + * volume-filename.* GF_OPTION_TYPE_PATH + * inode-lru-limit GF_OPTION_TYPE_INT 0-(1 * GF_UNIT_MB) + * client-volume-filename GF_OPTION_TYPE_PATH + +protocol/client: + * username GF_OPTION_TYPE_ANY + * password GF_OPTION_TYPE_ANY + * transport-type 
GF_OPTION_TYPE_STR tcp|socket|ib-verbs|unix|ib-sdp| + tcp/client|ib-verbs/client + * remote-host GF_OPTION_TYPE_ANY + * remote-subvolume GF_OPTION_TYPE_ANY + * transport-timeout GF_OPTION_TYPE_TIME 5-1013 + +cluster/replicate: + * read-subvolume GF_OPTION_TYPE_XLATOR + * favorite-child GF_OPTION_TYPE_XLATOR + * data-self-heal GF_OPTION_TYPE_BOOL + * metadata-self-heal GF_OPTION_TYPE_BOOL + * entry-self-heal GF_OPTION_TYPE_BOOL + * data-change-log GF_OPTION_TYPE_BOOL + * metadata-change-log GF_OPTION_TYPE_BOOL + * entry-change-log GF_OPTION_TYPE_BOOL + * data-lock-server-count GF_OPTION_TYPE_INT 0 + * metadata-lock-server-count GF_OPTION_TYPE_INT 0 + * entry-lock-server-count GF_OPTION_TYPE_INT 0 + +cluster/distribute: + * lookup-unhashed GF_OPTION_TYPE_BOOL + +cluster/unify: + * namespace GF_OPTION_TYPE_XLATOR + * scheduler GF_OPTION_TYPE_STR alu|rr|random|nufa|switch + * self-heal GF_OPTION_TYPE_STR foreground|background|off + * optimist GF_OPTION_TYPE_BOOL + +cluster/nufa: + * local-volume-name GF_OPTION_TYPE_XLATOR + +cluster/stripe: + * block-size GF_OPTION_TYPE_ANY + * use-xattr GF_OPTION_TYPE_BOOL + +debug/trace: + * include-ops (include) GF_OPTION_TYPE_STR + * exclude-ops (exclude) GF_OPTION_TYPE_STR + +encryption/rot-13: + * encrypt-write GF_OPTION_TYPE_BOOL + * decrypt-read GF_OPTION_TYPE_BOOL + +features/path-convertor: + * start-offset GF_OPTION_TYPE_INT 0-4095 + * end-offset GF_OPTION_TYPE_INT 1-4096 + * replace-with GF_OPTION_TYPE_ANY + +features/trash: + * trash-dir GF_OPTION_TYPE_PATH + +features/locks: + * mandatory-locks (mandatory) GF_OPTION_TYPE_BOOL + +features/filter: + * root-squashing GF_OPTION_TYPE_BOOL + * read-only GF_OPTION_TYPE_BOOL + * fixed-uid GF_OPTION_TYPE_INT + * fixed-gid GF_OPTION_TYPE_INT + * translate-uid GF_OPTION_TYPE_ANY + * translate-gid GF_OPTION_TYPE_ANY + * filter-uid GF_OPTION_TYPE_ANY + * filter-gid GF_OPTION_TYPE_ANY + +features/quota: + * min-free-disk-limit GF_OPTION_TYPE_PERCENT + * refresh-interval 
GF_OPTION_TYPE_TIME + * disk-usage-limit GF_OPTION_TYPE_SIZET + +storage/posix: + * o-direct GF_OPTION_TYPE_BOOL + * directory GF_OPTION_TYPE_PATH + * export-statfs-size GF_OPTION_TYPE_BOOL + * mandate-attribute GF_OPTION_TYPE_BOOL + +storage/bdb: + * directory GF_OPTION_TYPE_PATH + * logdir GF_OPTION_TYPE_PATH + * errfile GF_OPTION_TYPE_PATH + * dir-mode GF_OPTION_TYPE_ANY + * file-mode GF_OPTION_TYPE_ANY + * page-size GF_OPTION_TYPE_SIZET + * lru-limit GF_OPTION_TYPE_INT + * lock-timeout GF_OPTION_TYPE_TIME + * checkpoint-timeout GF_OPTION_TYPE_TIME + * transaction-timeout GF_OPTION_TYPE_TIME + * mode GF_OPTION_TYPE_BOOL + * access-mode GF_OPTION_TYPE_STR + +performance/read-ahead: + * force-atime-update GF_OPTION_TYPE_BOOL + * page-size GF_OPTION_TYPE_SIZET (64 * GF_UNIT_KB)-(2 * GF_UNIT_MB) + * page-count GF_OPTION_TYPE_INT 1-16 + +performance/write-behind: + * flush-behind GF_OPTION_TYPE_BOOL + * aggregate-size GF_OPTION_TYPE_SIZET (128 * GF_UNIT_KB)-(4 * GF_UNIT_MB) + * window-size GF_OPTION_TYPE_SIZET (512 * GF_UNIT_KB)-(1 * GF_UNIT_GB) + * enable-O_SYNC GF_OPTION_TYPE_BOOL + * disable-for-first-nbytes GF_OPTION_TYPE_SIZET 1 - (1 * GF_UNIT_MB) + +performance/symlink-cache: + +performance/io-threads: + * thread-count GF_OPTION_TYPE_INT 1-32 + +performance/io-cache: + * priority GF_OPTION_TYPE_ANY + * cache-timeout (force-revalidate-timeout) GF_OPTION_TYPE_INT 0-60 + * page-size GF_OPTION_TYPE_SIZET (16 * GF_UNIT_KB)-(4 * GF_UNIT_MB) + * cache-size GF_OPTION_TYPE_SIZET (4 * GF_UNIT_MB)-(6 * GF_UNIT_GB) + +auth: +- addr: + * auth.addr.*.allow GF_OPTION_TYPE_ANY + * auth.addr.*.reject GF_OPTION_TYPE_ANY + +- login: + * auth.login.*.allow GF_OPTION_TYPE_ANY + * auth.login.*.password GF_OPTION_TYPE_ANY + +scheduler/alu: + * scheduler.alu.order (alu.order) + GF_OPTION_TYPE_ANY + * scheduler.alu.disk-usage.entry-threshold (alu.disk-usage.entry-threshold) + GF_OPTION_TYPE_SIZET + * scheduler.alu.disk-usage.exit-threshold (alu.disk-usage.exit-threshold) + 
GF_OPTION_TYPE_SIZET + * scheduler.alu.write-usage.entry-threshold (alu.write-usage.entry-threshold) + GF_OPTION_TYPE_SIZET + * scheduler.alu.write-usage.exit-threshold (alu.write-usage.exit-threshold) + GF_OPTION_TYPE_SIZET + * scheduler.alu.read-usage.entry-threshold (alu.read-usage.entry-threshold) + GF_OPTION_TYPE_SIZET + * scheduler.alu.read-usage.exit-threshold (alu.read-usage.exit-threshold) + GF_OPTION_TYPE_SIZET + * scheduler.alu.open-files-usage.entry-threshold (alu.open-files-usage.entry-threshold) + GF_OPTION_TYPE_INT + * scheduler.alu.open-files-usage.exit-threshold (alu.open-files-usage.exit-threshold) + GF_OPTION_TYPE_INT + * scheduler.read-only-subvolumes (alu.read-only-subvolumes) + GF_OPTION_TYPE_ANY + * scheduler.refresh-interval (alu.refresh-interval) + GF_OPTION_TYPE_TIME + * scheduler.limits.min-free-disk (alu.limits.min-free-disk) + GF_OPTION_TYPE_PERCENT + * scheduler.alu.stat-refresh.num-file-create (alu.stat-refresh.num-file-create) + GF_OPTION_TYPE_INT + +scheduler/nufa: + * scheduler.refresh-interval (nufa.refresh-interval) + GF_OPTION_TYPE_TIME + * scheduler.limits.min-free-disk (nufa.limits.min-free-disk) + GF_OPTION_TYPE_PERCENT + * scheduler.local-volume-name (nufa.local-volume-name) + GF_OPTION_TYPE_XLATOR + +scheduler/random: + * scheduler.refresh-interval (random.refresh-interval) GF_OPTION_TYPE_TIME + * scheduler.limits.min-free-disk (random.limits.min-free-disk) GF_OPTION_TYPE_PERCENT + +scheduler/rr: + * scheduler.refresh-interval (rr.refresh-interval) GF_OPTION_TYPE_TIME + * scheduler.limits.min-free-disk (rr.limits.min-free-disk) GF_OPTION_TYPE_PERCENT + * scheduler.read-only-subvolumes (rr.read-only-subvolumes) GF_OPTION_TYPE_ANY + +scheduler/switch: + * scheduler.read-only-subvolumes (switch.read-only-subvolumes) GF_OPTION_TYPE_ANY + * scheduler.local-volume-name (switch.nufa.local-volume-name) GF_OPTION_TYPE_XLATOR + * scheduler.switch.case (switch.case) GF_OPTION_TYPE_ANY + +transport/ib-verbs: + * transport.ib-verbs.port 
(ib-verbs-port) GF_OPTION_TYPE_INT 1-4 + check the option by 'ibv_devinfo' + * transport.ib-verbs.mtu (ib-verbs-mtu) GF_OPTION_TYPE_INT + * transport.ib-verbs.device-name (ib-verbs-device-name) GF_OPTION_TYPE_ANY, + check by 'ibv_devinfo' + * transport.ib-verbs.work-request-send-size (ib-verbs-work-request-send-size) + GF_OPTION_TYPE_INT, + * transport.ib-verbs.work-request-recv-size (ib-verbs-work-request-recv-size) + GF_OPTION_TYPE_INT + * transport.ib-verbs.work-request-send-count (ib-verbs-work-request-send-count) + GF_OPTION_TYPE_INT + * transport.ib-verbs.work-request-recv-count (ib-verbs-work-request-recv-count) + GF_OPTION_TYPE_INT + * remote-port (transport.remote-port,transport.ib-verbs.remote-port) + GF_OPTION_TYPE_INT + * transport.ib-verbs.listen-port GF_OPTION_TYPE_INT + * transport.ib-verbs.connect-path (connect-path) GF_OPTION_TYPE_ANY + * transport.ib-verbs.bind-path (bind-path) GF_OPTION_TYPE_ANY + * transport.ib-verbs.listen-path (listen-path) GF_OPTION_TYPE_ANY + * transport.address-family (address-family) GF_OPTION_TYPE_STR inet|inet6|inet/inet6| + inet6/inet|unix|inet-sdp + +transport/socket: + * transport.remote-port (remote-port,transport.socket.remote-port) GF_OPTION_TYPE_INT + * transport.socket.listen-port (listen-port) GF_OPTION_TYPE_INT + * transport.socket.bind-address (bind-address) GF_OPTION_TYPE_ANY + * transport.socket.connect-path (connect-path) GF_OPTION_TYPE_ANY + * transport.socket.bind-path (bind-path) GF_OPTION_TYPE_ANY + * transport.socket.listen-path (listen-path) GF_OPTION_TYPE_ANY + * transport.address-family (address-family) GF_OPTION_TYPE_STR inet|inet6| + inet/inet6|inet6/inet| + unix|inet-sdp diff --git a/doc/user-guide/Makefile.am b/doc/user-guide/Makefile.am new file mode 100644 index 000000000..8d7068f14 --- /dev/null +++ b/doc/user-guide/Makefile.am @@ -0,0 +1 @@ +info_TEXINFOS = user-guide.texi diff --git a/doc/user-guide/advanced-stripe.odg b/doc/user-guide/advanced-stripe.odg new file mode 100644 index 
000000000..7686d7091 Binary files /dev/null and b/doc/user-guide/advanced-stripe.odg differ diff --git a/doc/user-guide/advanced-stripe.pdf b/doc/user-guide/advanced-stripe.pdf new file mode 100644 index 000000000..ec8b03dcf Binary files /dev/null and b/doc/user-guide/advanced-stripe.pdf differ diff --git a/doc/user-guide/colonO-icon.jpg b/doc/user-guide/colonO-icon.jpg new file mode 100644 index 000000000..3e66f7a27 Binary files /dev/null and b/doc/user-guide/colonO-icon.jpg differ diff --git a/doc/user-guide/fdl.texi b/doc/user-guide/fdl.texi new file mode 100644 index 000000000..e33c687cd --- /dev/null +++ b/doc/user-guide/fdl.texi @@ -0,0 +1,454 @@ + +@c @node GNU Free Documentation License +@c @appendixsec GNU Free Documentation License + +@cindex FDL, GNU Free Documentation License +@center Version 1.2, November 2002 + +@display +Copyright @copyright{} 2000,2001,2002 Free Software Foundation, Inc. +59 Temple Place, Suite 330, Boston, MA 02111-1307, USA + +Everyone is permitted to copy and distribute verbatim copies +of this license document, but changing it is not allowed. +@end display + +@enumerate 0 +@item +PREAMBLE + +The purpose of this License is to make a manual, textbook, or other +functional and useful document @dfn{free} in the sense of freedom: to +assure everyone the effective freedom to copy and redistribute it, +with or without modifying it, either commercially or noncommercially. +Secondarily, this License preserves for the author and publisher a way +to get credit for their work, while not being considered responsible +for modifications made by others. + +This License is a kind of ``copyleft'', which means that derivative +works of the document must themselves be free in the same sense. It +complements the GNU General Public License, which is a copyleft +license designed for free software. 
+ +We have designed this License in order to use it for manuals for free +software, because free software needs free documentation: a free +program should come with manuals providing the same freedoms that the +software does. But this License is not limited to software manuals; +it can be used for any textual work, regardless of subject matter or +whether it is published as a printed book. We recommend this License +principally for works whose purpose is instruction or reference. + +@item +APPLICABILITY AND DEFINITIONS + +This License applies to any manual or other work, in any medium, that +contains a notice placed by the copyright holder saying it can be +distributed under the terms of this License. Such a notice grants a +world-wide, royalty-free license, unlimited in duration, to use that +work under the conditions stated herein. The ``Document'', below, +refers to any such manual or work. Any member of the public is a +licensee, and is addressed as ``you''. You accept the license if you +copy, modify or distribute the work in a way requiring permission +under copyright law. + +A ``Modified Version'' of the Document means any work containing the +Document or a portion of it, either copied verbatim, or with +modifications and/or translated into another language. + +A ``Secondary Section'' is a named appendix or a front-matter section +of the Document that deals exclusively with the relationship of the +publishers or authors of the Document to the Document's overall +subject (or to related matters) and contains nothing that could fall +directly within that overall subject. (Thus, if the Document is in +part a textbook of mathematics, a Secondary Section may not explain +any mathematics.) The relationship could be a matter of historical +connection with the subject or with related matters, or of legal, +commercial, philosophical, ethical or political position regarding +them. 
+ +The ``Invariant Sections'' are certain Secondary Sections whose titles +are designated, as being those of Invariant Sections, in the notice +that says that the Document is released under this License. If a +section does not fit the above definition of Secondary then it is not +allowed to be designated as Invariant. The Document may contain zero +Invariant Sections. If the Document does not identify any Invariant +Sections then there are none. + +The ``Cover Texts'' are certain short passages of text that are listed, +as Front-Cover Texts or Back-Cover Texts, in the notice that says that +the Document is released under this License. A Front-Cover Text may +be at most 5 words, and a Back-Cover Text may be at most 25 words. + +A ``Transparent'' copy of the Document means a machine-readable copy, +represented in a format whose specification is available to the +general public, that is suitable for revising the document +straightforwardly with generic text editors or (for images composed of +pixels) generic paint programs or (for drawings) some widely available +drawing editor, and that is suitable for input to text formatters or +for automatic translation to a variety of formats suitable for input +to text formatters. A copy made in an otherwise Transparent file +format whose markup, or absence of markup, has been arranged to thwart +or discourage subsequent modification by readers is not Transparent. +An image format is not Transparent if used for any substantial amount +of text. A copy that is not ``Transparent'' is called ``Opaque''. + +Examples of suitable formats for Transparent copies include plain +@sc{ascii} without markup, Texinfo input format, La@TeX{} input +format, @acronym{SGML} or @acronym{XML} using a publicly available +@acronym{DTD}, and standard-conforming simple @acronym{HTML}, +PostScript or @acronym{PDF} designed for human modification. Examples +of transparent image formats include @acronym{PNG}, @acronym{XCF} and +@acronym{JPG}. 
Opaque formats include proprietary formats that can be +read and edited only by proprietary word processors, @acronym{SGML} or +@acronym{XML} for which the @acronym{DTD} and/or processing tools are +not generally available, and the machine-generated @acronym{HTML}, +PostScript or @acronym{PDF} produced by some word processors for +output purposes only. + +The ``Title Page'' means, for a printed book, the title page itself, +plus such following pages as are needed to hold, legibly, the material +this License requires to appear in the title page. For works in +formats which do not have any title page as such, ``Title Page'' means +the text near the most prominent appearance of the work's title, +preceding the beginning of the body of the text. + +A section ``Entitled XYZ'' means a named subunit of the Document whose +title either is precisely XYZ or contains XYZ in parentheses following +text that translates XYZ in another language. (Here XYZ stands for a +specific section name mentioned below, such as ``Acknowledgements'', +``Dedications'', ``Endorsements'', or ``History''.) To ``Preserve the Title'' +of such a section when you modify the Document means that it remains a +section ``Entitled XYZ'' according to this definition. + +The Document may include Warranty Disclaimers next to the notice which +states that this License applies to the Document. These Warranty +Disclaimers are considered to be included by reference in this +License, but only as regards disclaiming warranties: any other +implication that these Warranty Disclaimers may have is void and has +no effect on the meaning of this License. + +@item +VERBATIM COPYING + +You may copy and distribute the Document in any medium, either +commercially or noncommercially, provided that this License, the +copyright notices, and the license notice saying this License applies +to the Document are reproduced in all copies, and that you add no other +conditions whatsoever to those of this License. 
You may not use +technical measures to obstruct or control the reading or further +copying of the copies you make or distribute. However, you may accept +compensation in exchange for copies. If you distribute a large enough +number of copies you must also follow the conditions in section 3. + +You may also lend copies, under the same conditions stated above, and +you may publicly display copies. + +@item +COPYING IN QUANTITY + +If you publish printed copies (or copies in media that commonly have +printed covers) of the Document, numbering more than 100, and the +Document's license notice requires Cover Texts, you must enclose the +copies in covers that carry, clearly and legibly, all these Cover +Texts: Front-Cover Texts on the front cover, and Back-Cover Texts on +the back cover. Both covers must also clearly and legibly identify +you as the publisher of these copies. The front cover must present +the full title with all words of the title equally prominent and +visible. You may add other material on the covers in addition. +Copying with changes limited to the covers, as long as they preserve +the title of the Document and satisfy these conditions, can be treated +as verbatim copying in other respects. + +If the required texts for either cover are too voluminous to fit +legibly, you should put the first ones listed (as many as fit +reasonably) on the actual cover, and continue the rest onto adjacent +pages. + +If you publish or distribute Opaque copies of the Document numbering +more than 100, you must either include a machine-readable Transparent +copy along with each Opaque copy, or state in or with each Opaque copy +a computer-network location from which the general network-using +public has access to download using public-standard network protocols +a complete Transparent copy of the Document, free of added material. 
+If you use the latter option, you must take reasonably prudent steps, +when you begin distribution of Opaque copies in quantity, to ensure +that this Transparent copy will remain thus accessible at the stated +location until at least one year after the last time you distribute an +Opaque copy (directly or through your agents or retailers) of that +edition to the public. + +It is requested, but not required, that you contact the authors of the +Document well before redistributing any large number of copies, to give +them a chance to provide you with an updated version of the Document. + +@item +MODIFICATIONS + +You may copy and distribute a Modified Version of the Document under +the conditions of sections 2 and 3 above, provided that you release +the Modified Version under precisely this License, with the Modified +Version filling the role of the Document, thus licensing distribution +and modification of the Modified Version to whoever possesses a copy +of it. In addition, you must do these things in the Modified Version: + +@enumerate A +@item +Use in the Title Page (and on the covers, if any) a title distinct +from that of the Document, and from those of previous versions +(which should, if there were any, be listed in the History section +of the Document). You may use the same title as a previous version +if the original publisher of that version gives permission. + +@item +List on the Title Page, as authors, one or more persons or entities +responsible for authorship of the modifications in the Modified +Version, together with at least five of the principal authors of the +Document (all of its principal authors, if it has fewer than five), +unless they release you from this requirement. + +@item +State on the Title page the name of the publisher of the +Modified Version, as the publisher. + +@item +Preserve all the copyright notices of the Document. + +@item +Add an appropriate copyright notice for your modifications +adjacent to the other copyright notices. 
+ +@item +Include, immediately after the copyright notices, a license notice +giving the public permission to use the Modified Version under the +terms of this License, in the form shown in the Addendum below. + +@item +Preserve in that license notice the full lists of Invariant Sections +and required Cover Texts given in the Document's license notice. + +@item +Include an unaltered copy of this License. + +@item +Preserve the section Entitled ``History'', Preserve its Title, and add +to it an item stating at least the title, year, new authors, and +publisher of the Modified Version as given on the Title Page. If +there is no section Entitled ``History'' in the Document, create one +stating the title, year, authors, and publisher of the Document as +given on its Title Page, then add an item describing the Modified +Version as stated in the previous sentence. + +@item +Preserve the network location, if any, given in the Document for +public access to a Transparent copy of the Document, and likewise +the network locations given in the Document for previous versions +it was based on. These may be placed in the ``History'' section. +You may omit a network location for a work that was published at +least four years before the Document itself, or if the original +publisher of the version it refers to gives permission. + +@item +For any section Entitled ``Acknowledgements'' or ``Dedications'', Preserve +the Title of the section, and preserve in the section all the +substance and tone of each of the contributor acknowledgements and/or +dedications given therein. + +@item +Preserve all the Invariant Sections of the Document, +unaltered in their text and in their titles. Section numbers +or the equivalent are not considered part of the section titles. + +@item +Delete any section Entitled ``Endorsements''. Such a section +may not be included in the Modified Version. 
+ +@item +Do not retitle any existing section to be Entitled ``Endorsements'' or +to conflict in title with any Invariant Section. + +@item +Preserve any Warranty Disclaimers. +@end enumerate + +If the Modified Version includes new front-matter sections or +appendices that qualify as Secondary Sections and contain no material +copied from the Document, you may at your option designate some or all +of these sections as invariant. To do this, add their titles to the +list of Invariant Sections in the Modified Version's license notice. +These titles must be distinct from any other section titles. + +You may add a section Entitled ``Endorsements'', provided it contains +nothing but endorsements of your Modified Version by various +parties---for example, statements of peer review or that the text has +been approved by an organization as the authoritative definition of a +standard. + +You may add a passage of up to five words as a Front-Cover Text, and a +passage of up to 25 words as a Back-Cover Text, to the end of the list +of Cover Texts in the Modified Version. Only one passage of +Front-Cover Text and one of Back-Cover Text may be added by (or +through arrangements made by) any one entity. If the Document already +includes a cover text for the same cover, previously added by you or +by arrangement made by the same entity you are acting on behalf of, +you may not add another; but you may replace the old one, on explicit +permission from the previous publisher that added the old one. + +The author(s) and publisher(s) of the Document do not by this License +give permission to use their names for publicity for or to assert or +imply endorsement of any Modified Version. 
+ +@item +COMBINING DOCUMENTS + +You may combine the Document with other documents released under this +License, under the terms defined in section 4 above for modified +versions, provided that you include in the combination all of the +Invariant Sections of all of the original documents, unmodified, and +list them all as Invariant Sections of your combined work in its +license notice, and that you preserve all their Warranty Disclaimers. + +The combined work need only contain one copy of this License, and +multiple identical Invariant Sections may be replaced with a single +copy. If there are multiple Invariant Sections with the same name but +different contents, make the title of each such section unique by +adding at the end of it, in parentheses, the name of the original +author or publisher of that section if known, or else a unique number. +Make the same adjustment to the section titles in the list of +Invariant Sections in the license notice of the combined work. + +In the combination, you must combine any sections Entitled ``History'' +in the various original documents, forming one section Entitled +``History''; likewise combine any sections Entitled ``Acknowledgements'', +and any sections Entitled ``Dedications''. You must delete all +sections Entitled ``Endorsements.'' + +@item +COLLECTIONS OF DOCUMENTS + +You may make a collection consisting of the Document and other documents +released under this License, and replace the individual copies of this +License in the various documents with a single copy that is included in +the collection, provided that you follow the rules of this License for +verbatim copying of each of the documents in all other respects. + +You may extract a single document from such a collection, and distribute +it individually under this License, provided you insert a copy of this +License into the extracted document, and follow this License in all +other respects regarding verbatim copying of that document. 
+ +@item +AGGREGATION WITH INDEPENDENT WORKS + +A compilation of the Document or its derivatives with other separate +and independent documents or works, in or on a volume of a storage or +distribution medium, is called an ``aggregate'' if the copyright +resulting from the compilation is not used to limit the legal rights +of the compilation's users beyond what the individual works permit. +When the Document is included in an aggregate, this License does not +apply to the other works in the aggregate which are not themselves +derivative works of the Document. + +If the Cover Text requirement of section 3 is applicable to these +copies of the Document, then if the Document is less than one half of +the entire aggregate, the Document's Cover Texts may be placed on +covers that bracket the Document within the aggregate, or the +electronic equivalent of covers if the Document is in electronic form. +Otherwise they must appear on printed covers that bracket the whole +aggregate. + +@item +TRANSLATION + +Translation is considered a kind of modification, so you may +distribute translations of the Document under the terms of section 4. +Replacing Invariant Sections with translations requires special +permission from their copyright holders, but you may include +translations of some or all Invariant Sections in addition to the +original versions of these Invariant Sections. You may include a +translation of this License, and all the license notices in the +Document, and any Warranty Disclaimers, provided that you also include +the original English version of this License and the original versions +of those notices and disclaimers. In case of a disagreement between +the translation and the original version of this License or a notice +or disclaimer, the original version will prevail. 
+ +If a section in the Document is Entitled ``Acknowledgements'', +``Dedications'', or ``History'', the requirement (section 4) to Preserve +its Title (section 1) will typically require changing the actual +title. + +@item +TERMINATION + +You may not copy, modify, sublicense, or distribute the Document except +as expressly provided for under this License. Any other attempt to +copy, modify, sublicense or distribute the Document is void, and will +automatically terminate your rights under this License. However, +parties who have received copies, or rights, from you under this +License will not have their licenses terminated so long as such +parties remain in full compliance. + +@item +FUTURE REVISIONS OF THIS LICENSE + +The Free Software Foundation may publish new, revised versions +of the GNU Free Documentation License from time to time. Such new +versions will be similar in spirit to the present version, but may +differ in detail to address new problems or concerns. See +@uref{http://www.gnu.org/copyleft/}. + +Each version of the License is given a distinguishing version number. +If the Document specifies that a particular numbered version of this +License ``or any later version'' applies to it, you have the option of +following the terms and conditions either of that specified version or +of any later version that has been published (not as a draft) by the +Free Software Foundation. If the Document does not specify a version +number of this License, you may choose any version ever published (not +as a draft) by the Free Software Foundation. +@end enumerate + +@page +@c @appendixsubsec ADDENDUM: How to use this License for your +@c documents +@subsection ADDENDUM: How to use this License for your documents + +To use this License in a document you have written, include a copy of +the License in the document and put the following copyright and +license notices just after the title page: + +@smallexample +@group + Copyright (C) @var{year} @var{your name}. 
+ Permission is granted to copy, distribute and/or modify this document + under the terms of the GNU Free Documentation License, Version 1.2 + or any later version published by the Free Software Foundation; + with no Invariant Sections, no Front-Cover Texts, and no Back-Cover + Texts. A copy of the license is included in the section entitled ``GNU + Free Documentation License''. +@end group +@end smallexample + +If you have Invariant Sections, Front-Cover Texts and Back-Cover Texts, +replace the ``with...Texts.'' line with this: + +@smallexample +@group + with the Invariant Sections being @var{list their titles}, with + the Front-Cover Texts being @var{list}, and with the Back-Cover Texts + being @var{list}. +@end group +@end smallexample + +If you have Invariant Sections without Cover Texts, or some other +combination of the three, merge those two alternatives to suit the +situation. + +If your document contains nontrivial examples of program code, we +recommend releasing these examples in parallel under your choice of +free software license, such as the GNU General Public License, +to permit their use in free software. 
+ +@c Local Variables: +@c ispell-local-pdict: "ispell-dict" +@c End: + diff --git a/doc/user-guide/fuse.odg b/doc/user-guide/fuse.odg new file mode 100644 index 000000000..61bd103c7 Binary files /dev/null and b/doc/user-guide/fuse.odg differ diff --git a/doc/user-guide/fuse.pdf b/doc/user-guide/fuse.pdf new file mode 100644 index 000000000..a7d13faff Binary files /dev/null and b/doc/user-guide/fuse.pdf differ diff --git a/doc/user-guide/ha.odg b/doc/user-guide/ha.odg new file mode 100644 index 000000000..e4b8b72d0 Binary files /dev/null and b/doc/user-guide/ha.odg differ diff --git a/doc/user-guide/ha.pdf b/doc/user-guide/ha.pdf new file mode 100644 index 000000000..e372c0ab0 Binary files /dev/null and b/doc/user-guide/ha.pdf differ diff --git a/doc/user-guide/stripe.odg b/doc/user-guide/stripe.odg new file mode 100644 index 000000000..79441bf14 Binary files /dev/null and b/doc/user-guide/stripe.odg differ diff --git a/doc/user-guide/stripe.pdf b/doc/user-guide/stripe.pdf new file mode 100644 index 000000000..b94446feb Binary files /dev/null and b/doc/user-guide/stripe.pdf differ diff --git a/doc/user-guide/unify.odg b/doc/user-guide/unify.odg new file mode 100644 index 000000000..ccaa9bf16 Binary files /dev/null and b/doc/user-guide/unify.odg differ diff --git a/doc/user-guide/unify.pdf b/doc/user-guide/unify.pdf new file mode 100644 index 000000000..c22027f66 Binary files /dev/null and b/doc/user-guide/unify.pdf differ diff --git a/doc/user-guide/user-guide.info b/doc/user-guide/user-guide.info new file mode 100644 index 000000000..078d62ade --- /dev/null +++ b/doc/user-guide/user-guide.info @@ -0,0 +1,2698 @@ +This is ../../../doc/user-guide/user-guide.info, produced by makeinfo +version 4.9 from ../../../doc/user-guide/user-guide.texi. + +START-INFO-DIR-ENTRY +* GlusterFS: (user-guide). GlusterFS distributed filesystem user guide +END-INFO-DIR-ENTRY + + This is the user manual for GlusterFS 2.0. + + Copyright (C) 2008,2007 Research, Inc. 
Permission is granted to +copy, distribute and/or modify this document under the terms of the GNU +Free Documentation License, Version 1.2 or any later version published +by the Free Software Foundation; with no Invariant Sections, no +Front-Cover Texts, and no Back-Cover Texts. A copy of the license is +included in the chapter entitled "GNU Free Documentation License". + + +File: user-guide.info, Node: Top, Next: Acknowledgements, Up: (dir) + +GlusterFS 2.0 User Guide +************************ + +This is the user manual for GlusterFS 2.0. + + Copyright (C) 2008,2007 Research, Inc. Permission is granted to +copy, distribute and/or modify this document under the terms of the GNU +Free Documentation License, Version 1.2 or any later version published +by the Free Software Foundation; with no Invariant Sections, no +Front-Cover Texts, and no Back-Cover Texts. A copy of the license is +included in the chapter entitled "GNU Free Documentation License". + +* Menu: + +* Acknowledgements:: +* Introduction:: +* Installation and Invocation:: +* Concepts:: +* Translators:: +* Usage Scenarios:: +* Troubleshooting:: +* GNU Free Documentation Licence:: +* Index:: + + --- The Detailed Node Listing --- + +Installation and Invocation + +* Pre requisites:: +* Getting GlusterFS:: +* Building:: +* Running GlusterFS:: +* A Tutorial Introduction:: + +Running GlusterFS + +* Server:: +* Client:: + +Concepts + +* Filesystems in Userspace:: +* Translator:: +* Volume specification file:: + +Translators + +* Storage Translators:: +* Client and Server Translators:: +* Clustering Translators:: +* Performance Translators:: +* Features Translators:: + +Storage Translators + +* POSIX:: + +Client and Server Translators + +* Transport modules:: +* Client protocol:: +* Server protocol:: + +Clustering Translators + +* Unify:: +* Replicate:: +* Stripe:: + +Performance Translators + +* Read Ahead:: +* Write Behind:: +* IO Threads:: +* IO Cache:: + +Features Translators + +* POSIX Locks:: +* Fixed ID:: + 
+Miscellaneous Translators + +* ROT-13:: +* Trace:: + + +File: user-guide.info, Node: Acknowledgements, Next: Introduction, Prev: Top, Up: Top + +Acknowledgements +**************** + +GlusterFS continues to be a wonderful and enriching experience for all +of us involved. + + GlusterFS development would not have been possible at this pace if +not for our enthusiastic users. People from around the world have +helped us with bug reports, performance numbers, and feature +suggestions. A huge thanks to them all. + + Matthew Paine - for RPMs & general enthu + + Leonardo Rodrigues de Mello - for DEBs + + Julian Perez & Adam D'Auria - for multi-server tutorial + + Paul England - for HA spec + + Brent Nelson - for many bug reports + + Jacques Mattheij - for Europe mirror. + + Patrick Negri - for TCP non-blocking connect. + http://gluster.org/core-team.php () + Research + + +File: user-guide.info, Node: Introduction, Next: Installation and Invocation, Prev: Acknowledgements, Up: Top + +1 Introduction +************** + +GlusterFS is a distributed filesystem. It works at the file level, not +block level. + + A network filesystem is one which allows us to access remote files. A +distributed filesystem is one that stores data on multiple machines and +makes them all appear to be a part of the same filesystem. + + Need for distributed filesystems + + * Scalability: A distributed filesystem allows us to store more data + than what can be stored on a single machine. + + * Redundancy: We might want to replicate crucial data on to several + machines. + + * Uniform access: One can mount a remote volume (for example your + home directory) from any machine and access the same data. + +1.1 Contacting us +================= + +You can reach us through the mailing list *gluster-devel* +(). + + You can also find many of the developers on IRC, on the `#gluster' +channel on Freenode (). 
+ + The GlusterFS documentation wiki is also useful: + + + For commercial support, you can contact Research at: + + 3194 Winding Vista Common + Fremont, CA 94539 + USA. + + Phone: +1 (510) 354 6801 + Toll free: +1 (888) 813 6309 + Fax: +1 (510) 372 0604 + + You can also email us at . + + +File: user-guide.info, Node: Installation and Invocation, Next: Concepts, Prev: Introduction, Up: Top + +2 Installation and Invocation +***************************** + +* Menu: + +* Pre requisites:: +* Getting GlusterFS:: +* Building:: +* Running GlusterFS:: +* A Tutorial Introduction:: + + +File: user-guide.info, Node: Pre requisites, Next: Getting GlusterFS, Up: Installation and Invocation + +2.1 Pre requisites +================== + +Before installing GlusterFS make sure you have the following components +installed. + +2.1.1 FUSE +---------- + +You'll need FUSE version 2.6.0 or higher to use GlusterFS. You can omit +installing FUSE if you want to build _only_ the server. Note that you +won't be able to mount a GlusterFS filesystem on a machine that does +not have FUSE installed. + + FUSE can be downloaded from: + + To get the best performance from GlusterFS, however, it is +recommended that you use our patched version of FUSE. See Patched FUSE +for details. + +2.1.2 Patched FUSE +------------------ + +The GlusterFS project maintains a patched version of FUSE meant to be +used with GlusterFS. The patches increase GlusterFS performance. It is +recommended that all users use the patched FUSE. + + The patched FUSE tarball can be downloaded from: + + + + The specific changes made to FUSE are: + + * The communication channel size between FUSE kernel module and + GlusterFS has been increased to 1MB, permitting large reads and + writes to be sent in bigger chunks. + + * The kernel's read-ahead boundary has been extended up to 1MB. + + * Block size returned in the `stat()'/`fstat()' calls tuned to 1MB, + to make cp and similar commands perform I/O using that block size.
+ + * `flock()' locking support has been added (although some rework in + GlusterFS is needed for perfect compliance). + +2.1.3 libibverbs (optional) +--------------------------- + +This is only needed if you want GlusterFS to use InfiniBand as the +interconnect mechanism between server and client. You can get it from: + + . + +2.1.4 Bison and Flex +-------------------- + +These should be already installed on most Linux systems. If not, use +your distribution's normal software installation procedures to install +them. Make sure you install the relevant developer packages also. + + +File: user-guide.info, Node: Getting GlusterFS, Next: Building, Prev: Pre requisites, Up: Installation and Invocation + +2.2 Getting GlusterFS +===================== + +There are many ways to get hold of GlusterFS. For a production +deployment, the recommended method is to download the latest release +tarball. Release tarballs are available at: +. + + If you want the bleeding edge development source, you can get them +from the GNU Arch(1) repository. First you must install GNU Arch +itself. Then register the GlusterFS archive by doing: + + $ tla register-archive http://arch.sv.gnu.org/archives/gluster + + Now you can check out the source itself: + + $ tla get -A gluster@sv.gnu.org glusterfs--mainline--3.0 + + ---------- Footnotes ---------- + + (1) + + +File: user-guide.info, Node: Building, Next: Running GlusterFS, Prev: Getting GlusterFS, Up: Installation and Invocation + +2.3 Building +============ + +You can skip this section if you're installing from RPMs or DEBs. + + GlusterFS uses the Autotools mechanism to build. As such, the +procedure is straight-forward. First, change into the GlusterFS source +directory. + + $ cd glusterfs- + + If you checked out the source from the Arch repository, you'll need +to run `./autogen.sh' first. Note that you'll need to have Autoconf and +Automake installed for this. + + Run `configure'. 
+ + $ ./configure + + The configure script accepts the following options: + +`--disable-ibverbs' + Disable the InfiniBand transport mechanism. + +`--disable-fuse-client' + Disable the FUSE client. + +`--disable-server' + Disable building of the GlusterFS server. + +`--disable-bdb' + Disable building of Berkeley DB based storage translator. + +`--disable-mod_glusterfs' + Disable building of Apache/lighttpd glusterfs plugins. + +`--disable-epoll' + Use poll instead of epoll. + +`--disable-libglusterfsclient' + Disable building of libglusterfsclient + + + Build and install GlusterFS. + + # make install + + The binaries (`glusterfsd' and `glusterfs') will be by default +installed in `/usr/local/sbin/'. Translator, scheduler, and transport +shared libraries will be installed in +`/usr/local/lib/glusterfs//'. Sample volume specification +files will be in `/usr/local/etc/glusterfs/'. This document itself can +be found in `/usr/local/share/doc/glusterfs/'. If you passed the +`--prefix' argument to the configure script, then replace `/usr/local' +in the preceding paths with the prefix. + + +File: user-guide.info, Node: Running GlusterFS, Next: A Tutorial Introduction, Prev: Building, Up: Installation and Invocation + +2.4 Running GlusterFS +===================== + +* Menu: + +* Server:: +* Client:: + + +File: user-guide.info, Node: Server, Next: Client, Up: Running GlusterFS + +2.4.1 Server +------------ + +The GlusterFS server is necessary to export storage volumes to remote +clients (See *Note Server protocol:: for more info). This section +documents the invocation of the GlusterFS server program and all the +command-line options accepted by it. + + Basic Options + +`-f, --volfile=' + Use the volume file as the volume specification. + +`-s, --volfile-server=' + Server to get volume file from. This option overrides -volfile + option. + +`-l, --log-file=' + Specify the path for the log file. + +`-L, --log-level=' + Set the log level for the server. 
Log level should be one of DEBUG, + WARNING, ERROR, CRITICAL, or NONE. + + Advanced Options + +`--debug' + Run in debug mode. This option sets -no-daemon, -log-level to + DEBUG and -log-file to console. + +`-N, --no-daemon' + Run glusterfsd as a foreground process. + +`-p, --pid-file=' + Path for the PID file. + +`--volfile-id=' + 'key' of the volfile to be fetched from server. + +`--volfile-server-port=' + Listening port number of volfile server. + +`--volfile-server-transport=[socket|ib-verbs]' + Transport type to get volfile from server. [default: `socket'] + +`--xlator-options=' + Add/override a translator option for a volume with specified value. + + Miscellaneous Options + +`-?, --help' + Show this help text. + +`--usage' + Display a short usage message. + +`-V, --version' + Show version information. + + +File: user-guide.info, Node: Client, Prev: Server, Up: Running GlusterFS + +2.4.2 Client +------------ + +The GlusterFS client process is necessary to access remote storage +volumes and mount them locally using FUSE. This section documents the +invocation of the client process and all its command-line arguments. + + # glusterfs [options] + + The `mountpoint' is the directory where you want the GlusterFS +filesystem to appear. Example: + + # glusterfs -f /usr/local/etc/glusterfs-client.vol /mnt + + The command-line options are detailed below. + + Basic Options + +`-f, --volfile=' + Use the volume file as the volume specification. + +`-s, --volfile-server=' + Server to get volume file from. This option overrides -volfile + option. + +`-l, --log-file=' + Specify the path for the log file. + +`-L, --log-level=' + Set the log level for the server. Log level should be one of DEBUG, + WARNING, ERROR, CRITICAL, or NONE. + + Advanced Options + +`--debug' + Run in debug mode. This option sets -no-daemon, -log-level to + DEBUG and -log-file to console. + +`-N, --no-daemon' + Run `glusterfs' as a foreground process. + +`-p, --pid-file=' + Path for the PID file. 
+ +`--volfile-id=' + 'key' of the volfile to be fetched from server. + +`--volfile-server-port=' + Listening port number of volfile server. + +`--volfile-server-transport=[socket|ib-verbs]' + Transport type to get volfile from server. [default: `socket'] + +`--xlator-options=' + Add/override a translator option for a volume with specified value. + +`--volume-name=' + Volume name in client spec to use. Defaults to the root volume. + + FUSE Options + +`--attribute-timeout=' + Attribute timeout for inodes in the kernel, in seconds. Defaults + to 1 second. + +`--disable-direct-io-mode' + Disable direct I/O mode in FUSE kernel module. + +`-e, --entry-timeout=' + Entry timeout for directory entries in the kernel, in seconds. + Defaults to 1 second. + + Miscellaneous Options + +`-?, --help' + Show this help information. + +`-V, --version' + Show version information. + + +File: user-guide.info, Node: A Tutorial Introduction, Prev: Running GlusterFS, Up: Installation and Invocation + +2.5 A Tutorial Introduction +=========================== + +This section will show you how to quickly get GlusterFS up and running. +We'll configure GlusterFS as a simple network filesystem, with one +server and one client. In this mode of usage, GlusterFS can serve as a +replacement for NFS. + + We'll make use of two machines; call them _server_ and _client_ (If +you don't want to set up two machines, just run everything that follows +on the same machine). In the examples that follow, the shell prompts +will use these names to clarify the machine on which the command is +being run. For example, a command that should be run on the server will +be shown with the prompt: + + [root@server]# + + Our goal is to make a directory on the _server_ (say, `/export') +accessible to the _client_. + + First of all, get GlusterFS installed on both the machines, as +described in the previous sections. Make sure you have the FUSE kernel +module loaded.
You can ensure this by running: + + [root@server]# modprobe fuse + + Before we can run the GlusterFS client or server programs, we need +to write two files called _volume specifications_ (equivalently referred +to as _volfiles_). The volfile describes the _translator tree_ on a +node. The next chapter will explain the concepts of `translator' and +`volume specification' in detail. For now, just assume that the volfile +is like an NFS `/etc/exports' file. + + On the server, create a text file somewhere (we'll assume the path +`/tmp/glusterfs-server.vol') with the following contents. + + volume colon-o + type storage/posix + option directory /export + end-volume + + volume server + type protocol/server + subvolumes colon-o + option transport-type tcp + option auth.addr.colon-o.allow * + end-volume + + A brief explanation of the file's contents. The first section +defines a storage volume, named "colon-o" (the volume names are +arbitrary), which exports the `/export' directory. The second section +defines options for the translator which will make the storage volume +accessible remotely. It specifies `colon-o' as a subvolume. This +defines the _translator tree_, about which more will be said in the +next chapter. The two options specify that the TCP protocol is to be +used (as opposed to InfiniBand, for example), and that access to the +storage volume is to be provided to clients with any IP address at all. +If you wanted to restrict access to this server to only your subnet for +example, you'd specify something like `192.168.1.*' in the second +option line. + + On the client machine, create the following text file (again, we'll +assume the path to be `/tmp/glusterfs-client.vol'). Replace +_server-ip-address_ with the IP address of your server machine. If you +are doing all this on a single machine, use `127.0.0.1'.
+ + volume client + type protocol/client + option transport-type tcp + option remote-host _server-ip-address_ + option remote-subvolume colon-o + end-volume + + Now we need to start both the server and client programs. To start +the server: + + [root@server]# glusterfsd -f /tmp/glusterfs-server.vol + + To start the client: + + [root@client]# glusterfs -f /tmp/glusterfs-client.vol /mnt/glusterfs + + You should now be able to see the files under the server's `/export' +directory in the `/mnt/glusterfs' directory on the client. That's it; +GlusterFS is now working as a network file system. + + +File: user-guide.info, Node: Concepts, Next: Translators, Prev: Installation and Invocation, Up: Top + +3 Concepts +********** + +* Menu: + +* Filesystems in Userspace:: +* Translator:: +* Volume specification file:: + + +File: user-guide.info, Node: Filesystems in Userspace, Next: Translator, Up: Concepts + +3.1 Filesystems in Userspace +============================ + +A filesystem is usually implemented in kernel space. Kernel space +development is much harder than userspace development. FUSE is a kernel +module/library that allows us to write a filesystem completely in +userspace. + + FUSE consists of a kernel module which interacts with the userspace +implementation using a device file `/dev/fuse'. When a process makes a +syscall on a FUSE filesystem, VFS hands the request to the FUSE module, +which writes the request to `/dev/fuse'. The userspace implementation +polls `/dev/fuse', and when a request arrives, processes it and writes +the result back to `/dev/fuse'. The kernel then reads from the device +file and returns the result to the user process. + + In case of GlusterFS, the userspace program is the GlusterFS client. +The control flow is shown in the diagram below. The GlusterFS client +services the request by sending it to the server, which in turn hands +it to the local POSIX filesystem. + + + Fig 1. 
Control flow in GlusterFS + + +File: user-guide.info, Node: Translator, Next: Volume specification file, Prev: Filesystems in Userspace, Up: Concepts + +3.2 Translator +============== + +The _translator_ is the most important concept in GlusterFS. In fact, +GlusterFS is nothing but a collection of translators working together, +forming a translator _tree_. + + The idea of a translator is perhaps best understood using an +analogy. Consider the VFS in the Linux kernel. The VFS abstracts the +various filesystem implementations (such as EXT3, ReiserFS, XFS, etc.) +supported by the kernel. When an application calls the kernel to +perform an operation on a file, the kernel passes the request on to the +appropriate filesystem implementation. + + For example, let's say there are two partitions on a Linux machine: +`/', which is an EXT3 partition, and `/usr', which is a ReiserFS +partition. Now if an application wants to open a file called, say, +`/etc/fstab', then the kernel will internally pass the request to the +EXT3 implementation. If, on the other hand, an application wants to +read a file called `/usr/src/linux/CREDITS', then the kernel will call +upon the ReiserFS implementation to do the job. + + The "filesystem implementation" objects are analogous to GlusterFS +translators. A GlusterFS translator implements all the filesystem +operations. Whereas in VFS there is a two-level tree (with the kernel +at the root and all the filesystem implementations as its children), in +GlusterFS there exists a more elaborate tree structure. + + We can now define translators more precisely. A GlusterFS translator +is a shared object (`.so') that implements every filesystem call. +GlusterFS translators can be arranged in an arbitrary tree structure +(subject to constraints imposed by the translators). When GlusterFS +receives a filesystem call, it passes it on to the translator at the +root of the translator tree.
The root translator may in turn pass it on +to any or all of its children, and so on, until the leaf nodes are +reached. The result of a filesystem call is communicated in the reverse +fashion, from the leaf nodes up to the root node, and then on to the +application. + + So what might a translator tree look like? + + + Fig 2. A sample translator tree + + The diagram depicts three servers and one GlusterFS client. It is +important to note that conceptually, the translator tree spans machine +boundaries. Thus, the client machine in the diagram, `10.0.0.1', can +access the aggregated storage of the filesystems on the server machines +`10.0.0.2', `10.0.0.3', and `10.0.0.4'. The translator diagram will +make more sense once you've read the next chapter and understood the +functions of the various translators. + + +File: user-guide.info, Node: Volume specification file, Prev: Translator, Up: Concepts + +3.3 Volume specification file +============================= + +The volume specification file describes the translator tree for both the +server and client programs. + + A volume specification file is a sequence of volume definitions. +The syntax of a volume definition is explained below: + + *volume* _volume-name_ + *type* _translator-name_ + *option* _option-name_ _option-value_ + ... + *subvolumes* _subvolume1_ _subvolume2_ ... + *end-volume* + + ... + +_volume-name_ + An identifier for the volume. This is just a human-readable name, + and can contain any alphanumeric character. For instance, + "storage-1", "colon-o", or "forty-two". + +_translator-name_ + Name of one of the available translators. Example: + `protocol/client', `cluster/unify'. + +_option-name_ + Name of a valid option for the translator. + +_option-value_ + Value for the option. Everything following the "option" keyword to + the end of the line is considered the value; it is up to the + translator to parse it. + +_subvolume1_, _subvolume2_, ... + Volume names of sub-volumes. 
The sub-volumes must already have + been defined earlier in the file. + + There are a few rules you must follow when writing a volume +specification file: + + * Everything following a ``#'' is considered a comment and is + ignored. Blank lines are also ignored. + + * All names and keywords are case-sensitive. + + * The order of options inside a volume definition does not matter. + + * An option value may not span multiple lines. + + * If an option is not specified, it will assume its default value. + + * A sub-volume must have already been defined before it can be + referenced. This means you have to write the specification file + "bottom-up", starting from the leaf nodes of the translator tree + and moving up to the root. + + A simple example volume specification file is shown below: + + # This is a comment line + volume client + type protocol/client + option transport-type tcp + option remote-host localhost # Also a comment + option remote-subvolume brick + # The subvolumes line may be absent + end-volume + + volume iot + type performance/io-threads + option thread-count 4 + subvolumes client + end-volume + + volume wb + type performance/write-behind + subvolumes iot + end-volume + + +File: user-guide.info, Node: Translators, Next: Usage Scenarios, Prev: Concepts, Up: Top + +4 Translators +************* + +* Menu: + +* Storage Translators:: +* Client and Server Translators:: +* Clustering Translators:: +* Performance Translators:: +* Features Translators:: +* Miscellaneous Translators:: + + This chapter documents all the available GlusterFS translators in +detail. Each translator section will show its name (for example, +`cluster/unify'), briefly describe its purpose and workings, and list +every option accepted by that translator and their meaning. 
+ + +File: user-guide.info, Node: Storage Translators, Next: Client and Server Translators, Up: Translators + +4.1 Storage Translators +======================= + +The storage translators form the "backend" for GlusterFS. Currently, +the only available storage translator is the POSIX translator, which +stores files on a normal POSIX filesystem. A pleasant consequence of +this is that your data will still be accessible if GlusterFS crashes or +cannot be started. + + Other storage backends are planned for the future. One of the +possibilities is an Amazon S3 translator. Amazon S3 is an unlimited +online storage service accessible through a web services API. The S3 +translator will allow you to access the storage as a normal POSIX +filesystem. (1) + +* Menu: + +* POSIX:: +* BDB:: + + ---------- Footnotes ---------- + + (1) Some more discussion about this can be found at: + +http://developer.amazonwebservices.com/connect/message.jspa?messageID=52873 + + +File: user-guide.info, Node: POSIX, Next: BDB, Up: Storage Translators + +4.1.1 POSIX +----------- + + type storage/posix + + The `posix' translator uses a normal POSIX filesystem as its +"backend" to actually store files and directories. This can be any +filesystem that supports extended attributes (EXT3, ReiserFS, XFS, +...). Extended attributes are used by some translators to store +metadata, for example, by the replicate and stripe translators. See +*Note Replicate:: and *Note Stripe::, respectively, for details. + +`directory ' + The directory on the local filesystem which is to be used for + storage. + + +File: user-guide.info, Node: BDB, Prev: POSIX, Up: Storage Translators + +4.1.2 BDB +--------- + + type storage/bdb + + The `BDB' translator uses a Berkeley DB database as its "backend" to +actually store files as key-value pairs in the database and directories +as regular POSIX directories. Note that BDB does not provide extended +attribute support for regular files.
Do not use BDB as storage +translator while using any translator that demands extended attributes +on "backend". + +`directory ' + The directory on the local filesystem which is to be used for + storage. + +`mode [cache|persistent] (cache)' + When BDB is run in `cache' mode, recovery of back-end is not + completely guaranteed. `persistent' guarantees that BDB can + recover back-end from Berkeley DB even if GlusterFS crashes. + +`errfile ' + The path of the file to be used as `errfile' for Berkeley DB to + report detailed error messages, if any. Note that all the contents + of this file will be written by Berkeley DB, not GlusterFS. + +`logdir ' + + +File: user-guide.info, Node: Client and Server Translators, Next: Clustering Translators, Prev: Storage Translators, Up: Translators + +4.2 Client and Server Translators +================================= + +The client and server translator enable GlusterFS to export a +translator tree over the network or access a remote GlusterFS server. +These two translators implement GlusterFS's network protocol. + +* Menu: + +* Transport modules:: +* Client protocol:: +* Server protocol:: + + +File: user-guide.info, Node: Transport modules, Next: Client protocol, Up: Client and Server Translators + +4.2.1 Transport modules +----------------------- + +The client and server translators are capable of using any of the +pluggable transport modules. Currently available transport modules are +`tcp', which uses a TCP connection between client and server to +communicate; `ib-sdp', which uses a TCP connection over InfiniBand, and +`ibverbs', which uses high-speed InfiniBand connections. + + Each transport module comes in two different versions, one to be +used on the server side and the other on the client side. + +4.2.1.1 TCP +........... + +The TCP transport module uses a TCP/IP connection between the server +and the client. 
+ + option transport-type tcp + + The TCP client module accepts the following options: + +`non-blocking-connect [no|off|on|yes] (on)' + Whether to make the connection attempt asynchronous. + +`remote-port (6996)' + Server port to connect to. + +`remote-host *' + Hostname or IP address of the server. If the host name resolves to + multiple IP addresses, all of them will be tried in a round-robin + fashion. This feature can be used to implement fail-over. + + The TCP server module accepts the following options: + +`bind-address
(0.0.0.0)' + The local interface on which the server should listen to requests. + Default is to listen on all interfaces. + +`listen-port (6996)' + The local port to listen on. + +4.2.1.2 IB-SDP +.............. + + option transport-type ib-sdp + + The kernel implements a socket interface for InfiniBand hardware. SDP runs over +ib-verbs. This module accepts the same options as `tcp'. + +4.2.1.3 ibverbs +............... + + option transport-type ib-verbs + + InfiniBand is a scalable switched fabric interconnect mechanism +primarily used in high-performance computing. InfiniBand can deliver +data throughput of the order of 10 Gbit/s, with latencies of 4-5 ms. + + The `ib-verbs' transport accesses the InfiniBand hardware through +the "verbs" API, which is the lowest level of software access possible +and which gives the highest performance. On InfiniBand hardware, it is +always best to use `ib-verbs'. Use `ib-sdp' only if you cannot get +`ib-verbs' working for some reason. + + The `ib-verbs' client module accepts the following options: + +`non-blocking-connect [no|off|on|yes] (on)' + Whether to make the connection attempt asynchronous. + +`remote-port (6996)' + Server port to connect to. + +`remote-host *' + Hostname or IP address of the server. If the host name resolves to + multiple IP addresses, all of them will be tried in a round-robin + fashion. This feature can be used to implement fail-over. + + The `ib-verbs' server module accepts the following options: + +`bind-address
(0.0.0.0)' + The local interface on which the server should listen to requests. + Default is to listen on all interfaces. + +`listen-port (6996)' + The local port to listen on. + + The following options are common to both the client and server +modules: + + If you are familiar with InfiniBand jargon, the mode used by +GlusterFS is "reliable connection-oriented channel transfer". + +`ib-verbs-work-request-send-count (64)' + Length of the send queue in datagrams. [Reason to + increase/decrease?] + +`ib-verbs-work-request-recv-count (64)' + Length of the receive queue in datagrams. [Reason to + increase/decrease?] + +`ib-verbs-work-request-send-size (128KB)' + Size of each datagram that is sent. [Reason to increase/decrease?] + +`ib-verbs-work-request-recv-size (128KB)' + Size of each datagram that is received. [Reason to + increase/decrease?] + +`ib-verbs-port (1)' + Port number for ib-verbs. + +`ib-verbs-mtu [256|512|1024|2048|4096] (2048)' + The Maximum Transmission Unit [Reason to increase/decrease?] + +`ib-verbs-device-name (first device in the list)' + InfiniBand device to be used. + + For maximum performance, you should ensure that the send/receive +counts on both the client and server are the same. + + ib-verbs is preferred over ib-sdp. + + +File: user-guide.info, Node: Client protocol, Next: Server protocol, Prev: Transport modules, Up: Client and Server Translators + +4.2.2 Client +------------ + + type protocol/client + + The client translator enables the GlusterFS client to access a +remote server's translator tree. + +`transport-type [tcp,ib-sdp,ib-verbs] (tcp)' + The transport type to use. You should use the client versions of + all the transport modules (`tcp', `ib-sdp', `ib-verbs'). + +`remote-subvolume *' + The name of the volume on the remote host to attach to. Note that + this is _not_ the name of the `protocol/server' volume on the + server. It should be any volume under the server. + +`transport-timeout (120- seconds)' + Inactivity timeout.
If a reply is expected and no activity takes + place on the connection within this time, the transport connection + will be broken, and a new connection will be attempted. + + +File: user-guide.info, Node: Server protocol, Prev: Client protocol, Up: Client and Server Translators + +4.2.3 Server +------------ + + type protocol/server + + The server translator exports a translator tree and makes it +accessible to remote GlusterFS clients. + +`client-volume-filename (/glusterfs-client.vol)' + The volume specification file to use for the client. This is the + file the client will receive when it is invoked with the + `--server' option (*Note Client::). + +`transport-type [tcp,ib-verbs,ib-sdp] (tcp)' + The transport to use. You should use the server versions of all + the transport modules (`tcp', `ib-sdp', `ib-verbs'). + +`auth.addr..allow ' + IP addresses of the clients that are allowed to attach to the + specified volume. This can be a wildcard. For example, a wildcard + of the form `192.168.*.*' allows any host in the `192.168.x.x' + subnet to connect to the server. + + + +File: user-guide.info, Node: Clustering Translators, Next: Performance Translators, Prev: Client and Server Translators, Up: Translators + +4.3 Clustering Translators +========================== + +The clustering translators are the most important GlusterFS +translators, since it is these that make GlusterFS a cluster +filesystem. These translators together enable GlusterFS to access an +arbitrarily large amount of storage, and provide RAID-like redundancy +and distribution over the entire cluster. + + There are three clustering translators: *unify*, *replicate*, and +*stripe*. The unify translator aggregates storage from many server +nodes. The replicate translator provides file replication. The stripe +translator allows a file to be spread across many server nodes. The +following sections look at each of these translators in detail. 
+ +* Menu: + +* Unify:: +* Replicate:: +* Stripe:: + + +File: user-guide.info, Node: Unify, Next: Replicate, Up: Clustering Translators + +4.3.1 Unify +----------- + + type cluster/unify + + The unify translator presents a `unified' view of all its +sub-volumes. That is, it makes the union of all its sub-volumes appear +as a single volume. It is the unify translator that gives GlusterFS the +ability to access an arbitrarily large amount of storage. + + For unify to work correctly, certain invariants need to be +maintained across the entire network. These are: + + * The directory structure of all the sub-volumes must be identical. + + * A particular file can exist on only one of the sub-volumes. + Phrasing it in another way, a pathname such as + `/home/calvin/homework.txt') is unique across the entire cluster. + + + +Looking at the second requirement, you might wonder how one can +accomplish storing redundant copies of a file, if no file can exist +multiple times. To answer, we must remember that these invariants are +from _unify's perspective_. A translator such as replicate at a lower +level in the translator tree than unify may subvert this picture. + + The first invariant might seem quite tedious to ensure. We shall see +later that this is not so, since unify's _self-heal_ mechanism takes +care of maintaining it. + + The second invariant implies that unify needs some way to decide +which file goes where. Unify makes use of _scheduler_ modules for this +purpose. + + When a file needs to be created, unify's scheduler decides upon the +sub-volume to be used to store the file. There are many schedulers +available, each using a different algorithm and suitable for different +purposes. + + The various schedulers are described in detail in the sections that +follow. + +4.3.1.1 ALU +........... + + option scheduler alu + + ALU stands for "Adaptive Least Usage". It is the most advanced +scheduler available in GlusterFS. 
It balances the load across volumes +taking several factors into account. It adapts itself to changing I/O +patterns according to its configuration. When properly configured, it +can eliminate the need for regular tuning of the filesystem to keep +volume load nicely balanced. + + The ALU scheduler is composed of multiple least-usage +sub-schedulers. Each sub-scheduler keeps track of a certain type of +load, for each of the sub-volumes, getting statistics from the +sub-volumes themselves. The sub-schedulers are these: + + * disk-usage: The used and free disk space on the volume. + + * read-usage: The amount of reading done from this volume. + + * write-usage: The amount of writing done to this volume. + + * open-files-usage: The number of files currently open from this + volume. + + * disk-speed-usage: The speed at which the disks are spinning. This + is a constant value and therefore not very useful. + + The ALU scheduler needs to know which of these sub-schedulers to use, +and in which order to evaluate them. This is done through the `option +alu.order' configuration directive. + + Each sub-scheduler needs to know two things: when to kick in (the +entry-threshold), and how long to stay in control (the exit-threshold). +For example: when unifying three disks of 100GB, keeping an exact +balance of disk-usage is not necessary. Instead, there could be a 1GB +margin, which can be used to nicely balance other factors, such as +read-usage. The disk-usage scheduler can be told to kick in only when a +certain threshold of discrepancy is passed, such as 1GB. When it +assumes control under this condition, it will write all subsequent data +to the least-used volume. If it is doing so, it is unwise to stop right +after the values are below the entry-threshold again, since that would +make it very likely that the situation will occur again very soon. Such +a situation would cause the ALU to spend most of its time disk-usage +scheduling, which is unfair to the other sub-schedulers.
The +exit-threshold therefore defines the amount of data that needs to be +written to the least-used disk, before control is relinquished again. + + In addition to the sub-schedulers, the ALU scheduler also has +"limits" options. These can stop the creation of new files on a volume +once values drop below a certain threshold. For example, setting +`option alu.limits.min-free-disk 5GB' will stop the scheduling of files +to volumes that have less than 5GB of free disk space, leaving the +files on that disk some room to grow. + + The actual values you assign to the thresholds for sub-schedulers and +limits depend on your situation. If you have fast-growing files, you'll +want to stop file-creation on a disk much earlier than when hardly any +of your files are growing. If you care less about disk-usage balance +than about read-usage balance, you'll want a bigger disk-usage +scheduler entry-threshold and a smaller read-usage scheduler +entry-threshold. + + For thresholds defining a size, values specifying "KB", "MB" and "GB" +are allowed. For example: `option alu.limits.min-free-disk 5GB'. + +`alu.order * ("disk-usage:write-usage:read-usage:open-files-usage:disk-speed")' + +`alu.disk-usage.entry-threshold (1GB)' + +`alu.disk-usage.exit-threshold (512MB)' + +`alu.write-usage.entry-threshold <%> (25)' + +`alu.write-usage.exit-threshold <%> (5)' + +`alu.read-usage.entry-threshold <%> (25)' + +`alu.read-usage.exit-threshold <%> (5)' + +`alu.open-files-usage.entry-threshold (1000)' + +`alu.open-files-usage.exit-threshold (100)' + +`alu.limits.min-free-disk <%>' + +`alu.limits.max-open-files ' + +4.3.1.2 Round Robin (RR) +........................ + + option scheduler rr + + Round-Robin (RR) scheduler creates files in a round-robin fashion. +Each client will have its own round-robin loop. When your files are +mostly similar in size and I/O access pattern, this scheduler is a good +choice. 
RR scheduler checks for free disk space on the server before +scheduling, so you can know when to add another server node. The +default value of min-free-disk is 5% and is checked on file creation +calls, with at least 10 seconds (by default) elapsing between two checks. + + Options: +`rr.limits.min-free-disk <%> (5)' + Minimum free disk space a node must have for RR to schedule a file + to it. + +`rr.refresh-interval (10 seconds)' + Time between two successive free disk space checks. + +4.3.1.3 Random +.............. + + option scheduler random + + The random scheduler schedules file creation randomly among its +child nodes. Like the round-robin scheduler, it also checks for a +minimum amount of free disk space before scheduling a file to a node. + +`random.limits.min-free-disk <%> (5)' + Minimum free disk space a node must have for random to schedule a + file to it. + +`random.refresh-interval (10 seconds)' + Time between two successive free disk space checks. + +4.3.1.4 NUFA +............ + + option scheduler nufa + + It is common in many GlusterFS computing environments for all +deployed machines to act as both servers and clients. For example, a +research lab may have 40 workstations each with its own storage. All of +these workstations might act as servers exporting a volume as well as +clients accessing the entire cluster's storage. In such a situation, +it makes sense to store locally created files on the local workstation +itself (assuming files are accessed most by the workstation that +created them). The Non-Uniform File Allocation (NUFA) scheduler +accomplishes that. + + NUFA gives the local system first priority for file creation over +other nodes. If the local volume does not have more free disk space +than a specified amount (5% by default) then NUFA schedules files among +the other child volumes in a round-robin fashion. + + NUFA is named after the similar strategy used for memory access, +NUMA(1).
+ +`nufa.limits.min-free-disk <%> (5)' + Minimum disk space that must be free (local or remote) for NUFA to + schedule a file to it. + +`nufa.refresh-interval (10 seconds)' + Time between two successive free disk space checks. + +`nufa.local-volume-name ' + The name of the volume corresponding to the local system. This + volume must be one of the children of the unify volume. This + option is mandatory. + +4.3.1.5 Namespace +................. + +A namespace volume is needed because: - it provides persistent inode numbers. - a file +exists even when the node is down. + + Namespace files are simply touched. The namespace is checked on every lookup. + +`namespace *' + Name of the namespace volume (which should be one of the unify + volume's children). + +`self-heal [on|off] (on)' + Enable/disable self-heal. Unless you know what you are doing, do + not disable self-heal. + +4.3.1.6 Self Heal +................. + +* When a 'lookup()/stat()' call is made on a directory for the first +time, a self-heal call is made, which checks for the consistency of its +child nodes. If an entry is present in a storage node, but not in the +namespace, that entry is created in the namespace, and vice versa. There is +a writedir() API introduced which is used for the same. It also checks +for permissions, and uid/gid consistencies. + + * This check is also done when a server goes down and comes up. + + * If one starts with an empty namespace export, but has data in +storage nodes, a 'find .>/dev/null' or 'ls -lR >/dev/null' should help +to build namespace in one shot. Even otherwise, namespace is built on +demand when a file is looked up for the first time. + + NOTE: There are some issues (Kernel 'Oops' msgs) seen with +fuse-2.6.3, when someone deletes namespace in backend, when glusterfs is +running. But with fuse-2.6.5, this issue is not there.
+ + ---------- Footnotes ---------- + + (1) Non-Uniform Memory Access: + + + +File: user-guide.info, Node: Replicate, Next: Stripe, Prev: Unify, Up: Clustering Translators + +4.3.2 Replicate (formerly AFR) +------------------------------ + + type cluster/replicate + + Replicate provides RAID-1 like functionality for GlusterFS. +Replicate replicates files and directories across the subvolumes. Hence +if Replicate has four subvolumes, there will be four copies of all +files and directories. Replicate provides high-availability, i.e., in +case one of the subvolumes go down (e. g. server crash, network +disconnection) Replicate will still service the requests using the +redundant copies. + + Replicate also provides self-heal functionality, i.e., in case the +crashed servers come up, the outdated files and directories will be +updated with the latest versions. Replicate uses extended attributes of +the backend file system to track the versioning of files and +directories and provide the self-heal feature. + + volume replicate-example + type cluster/replicate + subvolumes brick1 brick2 brick3 + end-volume + + This sample configuration will replicate all directories and files on +brick1, brick2 and brick3. + + All the read operations happen from the first alive child. If all the +three sub-volumes are up, reads will be done from brick1; if brick1 is +down read will be done from brick2. In case read() was being done on +brick1 and it goes down, replicate transparently falls back to brick2. + + The next release of GlusterFS will add the following features: + * Ability to specify the sub-volume from which read operations are + to be done (this will help users who have one of the sub-volumes + as a local storage volume). + + * Allow scheduling of read operations amongst the sub-volumes in a + round-robin fashion. + + The order of the subvolumes list should be same across all the +'replicate's as they will be used for locking purposes. + +4.3.2.1 Self Heal +................. 
+ +Replicate has a self-heal feature, which replaces outdated file and +directory copies with the most recent versions. For example, consider the +following config: + + volume replicate-example + type cluster/replicate + subvolumes brick1 brick2 + end-volume + +4.3.2.2 File self-heal +...................... + +Now if we create a file foo.txt on replicate-example, the file will be +created on brick1 and brick2. The file will have two extended +attributes associated with it in the backend filesystem. One is +trusted.afr.createtime and the other is trusted.afr.version. The +trusted.afr.createtime xattr has the create time (in terms of seconds +since epoch) and trusted.afr.version is a number that is incremented +each time a file is modified. This increment happens during close +(in case any write was done before close). + + If brick1 goes down and we edit foo.txt, the version gets incremented. +When brick1 comes back up and we open() foo.txt, replicate will +check whether the versions of the copies are the same. If they are not, the outdated +copy is replaced by the latest copy and its version is updated. After +the sync the open() proceeds in the usual manner and the application +calling open() can continue on its access to the file. + + If brick1 goes down, and we delete foo.txt and create a file with the +same name again (i.e., foo.txt), then when brick1 comes back up there is clearly +a chance that the version on brick1 is greater than the version on +brick2. This is where the createtime extended attribute helps in deciding +which copy is outdated. Hence we need to consider both createtime +and version to decide on the latest copy. + + The version attribute is incremented during the close() call. Version +will not be incremented in case there was no write() done. In case the +fd that close() receives was obtained from a create() call, we also create the +createtime extended attribute. + +4.3.2.3 Directory self-heal +...........................
+ +Suppose brick1 goes down, we delete foo.txt, brick1 comes back up, now +we should not create foo.txt on brick2 but we should delete foo.txt on +brick1. We handle this situation by having the createtime and version +attribute on the directory similar to the file. when lookup() is done +on the directory, we compare the createtime/version attributes of the +copies and see which files needs to be deleted and delete those files +and update the extended attributes of the outdated directory copy. +Each time a directory is modified (a file or a subdirectory is created +or deleted inside the directory) and one of the subvols is down, we +increment the directory's version. + + lookup() is a call initiated by the kernel on a file or directory +just before any access to that file or directory. In glusterfs, by +default, lookup() will not be called in case it was called in the past +one second on that particular file or directory. + + The extended attributes can be seen in the backend filesystem using +the `getfattr' command. (`getfattr -n trusted.afr.version ') + +`debug [on|off] (off)' + +`self-heal [on|off] (on)' + +`replicate (*:1)' + +`lock-node (first child is used by default)' + + +File: user-guide.info, Node: Stripe, Prev: Replicate, Up: Clustering Translators + +4.3.3 Stripe +------------ + + type cluster/stripe + + The stripe translator distributes the contents of a file over its +sub-volumes. It does this by creating a file equal in size to the +total size of the file on each of its sub-volumes. It then writes only +a part of the file to each sub-volume, leaving the rest of it empty. +These empty regions are called `holes' in Unix terminology. The holes +do not consume any disk space. + + The diagram below makes this clear. + + + +You can configure stripe so that only filenames matching a pattern are +striped. You can also configure the size of the data to be stored on +each sub-volume. 
+ +`block-size : (*:0 no striping)' + Distribute files matching `' over the sub-volumes, + storing at least `' on each sub-volume. For example, + + option block-size *.mpg:1M + + distributes all files ending in `.mpg', storing at least 1 MB on + each sub-volume. + + Any number of `block-size' option lines may be present, specifying + different sizes for different file name patterns. + + +File: user-guide.info, Node: Performance Translators, Next: Features Translators, Prev: Clustering Translators, Up: Translators + +4.4 Performance Translators +=========================== + +* Menu: + +* Read Ahead:: +* Write Behind:: +* IO Threads:: +* IO Cache:: +* Booster:: + + +File: user-guide.info, Node: Read Ahead, Next: Write Behind, Up: Performance Translators + +4.4.1 Read Ahead +---------------- + + type performance/read-ahead + + The read-ahead translator pre-fetches data in advance on every read. +This benefits applications that mostly process files in sequential +order, since the next block of data will already be available by the +time the application is done with the current one. + + Additionally, the read-ahead translator also behaves as a +read-aggregator. Many small read operations are combined and issued as +fewer, larger read requests to the server. + + Read-ahead deals in "pages" as the unit of data fetched. The page +size is configurable, as is the "page count", which is the number of +pages that are pre-fetched. + + Read-ahead is best used with InfiniBand (using the ib-verbs +transport). On FastEthernet and Gigabit Ethernet networks, GlusterFS +can achieve the link-maximum throughput even without read-ahead, making +it quite superfluous. + + Note that read-ahead only happens if the reads are perfectly +sequential. If your application accesses data in a random fashion, +using read-ahead might actually lead to a performance loss, since +read-ahead will pointlessly fetch pages which won't be used by the +application.
+ + Options: +`page-size (256KB)' + The unit of data that is pre-fetched. + +`page-count (2)' + The number of pages that are pre-fetched. + +`force-atime-update [on|off|yes|no] (off|no)' + Whether to force an access time (atime) update on the file on + every read. Without this, the atime will be slightly imprecise, as + it will reflect the time when the read-ahead translator read the + data, not when the application actually read it. + + +File: user-guide.info, Node: Write Behind, Next: IO Threads, Prev: Read Ahead, Up: Performance Translators + +4.4.2 Write Behind +------------------ + + type performance/write-behind + + The write-behind translator improves the latency of a write +operation. It does this by relegating the write operation to the +background and returning to the application even as the write is in +progress. Using the write-behind translator, successive write requests +can be pipelined. This mode of write-behind operation is best used on +the client side, to enable decreased write latency for the application. + + The write-behind translator can also aggregate write requests. If the +`aggregate-size' option is specified, then successive writes up to that +size are accumulated and written in a single operation. This mode of +operation is best used on the server side, as this will decrease the +disk's head movement when multiple files are being written to in +parallel. + + The `aggregate-size' option has a default value of 128KB. Although +this works well for most users, you should always experiment with +different values to determine the one that will deliver maximum +performance. This is because the performance of write-behind depends on +your interconnect, size of RAM, and the workload.
+ +`aggregate-size (128KB)' + Amount of data to accumulate before doing a write + +`flush-behind [on|yes|off|no] (off|no)' + + +File: user-guide.info, Node: IO Threads, Next: IO Cache, Prev: Write Behind, Up: Performance Translators + +4.4.3 IO Threads +---------------- + + type performance/io-threads + + The IO threads translator is intended to increase the responsiveness +of the server to metadata operations by doing file I/O (read, write) in +a background thread. Since the GlusterFS server is single-threaded, +using the IO threads translator can significantly improve performance. +This translator is best used on the server side, loaded just below the +server protocol translator. + + IO threads operates by handing out read and write requests to a +separate thread. The total number of threads in existence at a time is +constant, and configurable. + +`thread-count (1)' + Number of threads to use. + + +File: user-guide.info, Node: IO Cache, Next: Booster, Prev: IO Threads, Up: Performance Translators + +4.4.4 IO Cache +-------------- + + type performance/io-cache + + The IO cache translator caches data that has been read. This is +useful if many applications read the same data multiple times, and if +reads are much more frequent than writes (for example, IO caching may be +useful in a web hosting environment, where most clients will simply +read some files and only a few will write to them). + + The IO cache translator reads data from its child in `page-size' +chunks. It caches data upto `cache-size' bytes. The cache is +maintained as a prioritized least-recently-used (LRU) list, with +priorities determined by user-specified patterns to match filenames. + + When the IO cache translator detects a write operation, the cache +for that file is flushed. + + The IO cache translator periodically verifies the consistency of +cached data, using the modification times on the files. The +verification timeout is configurable. + +`page-size (128KB)' + Size of a page. 
+ +`cache-size (n) (32MB)' + Total amount of data to be cached. + +`force-revalidate-timeout (1)' + Timeout to force a cache consistency verification, in seconds. + +`priority (*:0)' + Filename patterns listed in order of priority. + + +File: user-guide.info, Node: Booster, Prev: IO Cache, Up: Performance Translators + +4.4.5 Booster +------------- + + type performance/booster + + The booster translator gives applications a faster path to +communicate read and write requests to GlusterFS. Normally, all +requests to GlusterFS from applications go through FUSE, as indicated +in *Note Filesystems in Userspace::. Using the booster translator in +conjunction with the GlusterFS booster shared library, an application +can bypass the FUSE path and send read/write requests directly to the +GlusterFS client process. + + The booster mechanism consists of two parts: the booster translator, +and the booster shared library. The booster translator is meant to be +loaded on the client side, usually at the root of the translator tree. +The booster shared library should be `LD_PRELOAD'ed with the +application. + + The booster translator when loaded opens a Unix domain socket and +listens for read/write requests on it. The booster shared library +intercepts read and write system calls and sends the requests to the +GlusterFS process directly using the Unix domain socket, bypassing FUSE. +This leads to superior performance. + + Once you've loaded the booster translator in your volume +specification file, you can start your application as: + + $ LD_PRELOAD=/usr/local/bin/glusterfs-booster.so your_app + + The booster translator accepts no options. 
+ + +File: user-guide.info, Node: Features Translators, Next: Miscellaneous Translators, Prev: Performance Translators, Up: Translators + +4.5 Features Translators +======================== + +* Menu: + +* POSIX Locks:: +* Fixed ID:: + + +File: user-guide.info, Node: POSIX Locks, Next: Fixed ID, Up: Features Translators + +4.5.1 POSIX Locks +----------------- + + type features/posix-locks + + This translator provides storage independent POSIX record locking +support (`fcntl' locking). Typically you'll want to load this on the +server side, just above the POSIX storage translator. Using this +translator you can get both advisory locking and mandatory locking +support. It also handles `flock()' locks properly. + + Caveat: Consider a file that does not have its mandatory locking bits +(+setgid, -group execution) turned on. Assume that this file is now +opened by a process on a client that has the write-behind xlator +loaded. The write-behind xlator does not cache anything for files which +have mandatory locking enabled, to avoid incoherence. Let's say that +mandatory locking is now enabled on this file through another client. +The former client will not know about this change, and write-behind may +erroneously report a write as being successful when in fact it would +fail due to the region it is writing to being locked. + + There seems to be no easy way to fix this. To work around this +problem, it is recommended that you never enable the mandatory bits on +a file while it is open. + +`mandatory [on|off] (on)' + Turns mandatory locking on. + + +File: user-guide.info, Node: Fixed ID, Prev: POSIX Locks, Up: Features Translators + +4.5.2 Fixed ID +-------------- + + type features/fixed-id + + The fixed ID translator makes all filesystem requests from the client +to appear to be coming from a fixed, specified UID/GID, regardless of +which user actually initiated the request. 
+ +`fixed-uid [if not set, not used]' + The UID to send to the server + +`fixed-gid [if not set, not used]' + The GID to send to the server + + +File: user-guide.info, Node: Miscellaneous Translators, Prev: Features Translators, Up: Translators + +4.6 Miscellaneous Translators +============================= + +* Menu: + +* ROT-13:: +* Trace:: + + +File: user-guide.info, Node: ROT-13, Next: Trace, Up: Miscellaneous Translators + +4.6.1 ROT-13 +------------ + + type encryption/rot-13 + + ROT-13 is a toy translator that can "encrypt" and "decrypt" file +contents using the ROT-13 algorithm. ROT-13 is a trivial algorithm that +rotates each letter of the alphabet by thirteen places. Thus, 'A' becomes 'N', 'B' +becomes 'O', and 'Z' becomes 'M'. + + It goes without saying that you shouldn't use this translator if you +need _real_ encryption (a future release of GlusterFS will have real +encryption translators). + +`encrypt-write [on|off] (on)' + Whether to encrypt on write + +`decrypt-read [on|off] (on)' + Whether to decrypt on read + + +File: user-guide.info, Node: Trace, Prev: ROT-13, Up: Miscellaneous Translators + +4.6.2 Trace +----------- + + type debug/trace + + The trace translator is intended for debugging purposes. When +loaded, it logs all the system calls received by the server or client +(wherever trace is loaded), their arguments, and the results. You must +use a GlusterFS log level of DEBUG (See *Note Running GlusterFS::) for +trace to work.
+ + Sample trace output (lines have been wrapped for readability): + 2007-10-30 00:08:58 D [trace.c:1579:trace_opendir] trace: callid: 68 + (*this=0x8059e40, loc=0x8091984 {path=/iozone3_283, inode=0x8091f00}, + fd=0x8091d50) + + 2007-10-30 00:08:58 D [trace.c:630:trace_opendir_cbk] trace: + (*this=0x8059e40, op_ret=4, op_errno=1, fd=0x8091d50) + + 2007-10-30 00:08:58 D [trace.c:1602:trace_readdir] trace: callid: 69 + (*this=0x8059e40, size=4096, offset=0 fd=0x8091d50) + + 2007-10-30 00:08:58 D [trace.c:215:trace_readdir_cbk] trace: + (*this=0x8059e40, op_ret=0, op_errno=0, count=4) + + 2007-10-30 00:08:58 D [trace.c:1624:trace_closedir] trace: callid: 71 + (*this=0x8059e40, *fd=0x8091d50) + + 2007-10-30 00:08:58 D [trace.c:809:trace_closedir_cbk] trace: + (*this=0x8059e40, op_ret=0, op_errno=1) + + +File: user-guide.info, Node: Usage Scenarios, Next: Troubleshooting, Prev: Translators, Up: Top + +5 Usage Scenarios +***************** + +5.1 Advanced Striping +===================== + +This section is based on the Advanced Striping tutorial written by +Anand Avati on the GlusterFS wiki (1). + +5.1.1 Mixed Storage Requirements +-------------------------------- + +There are two ways of scheduling the I/O. One at file level (using +unify translator) and other at block level (using stripe translator). +Striped I/O is good for files that are potentially large and require +high parallel throughput (for example, a single file of 400GB being +accessed by 100s and 1000s of systems simultaneously and randomly). For +most of the cases, file level scheduling works best. + + In the real world, it is desirable to mix file level and block level +scheduling on a single storage volume. Alternatively users can choose +to have two separate volumes and hence two mount points, but the +applications may demand a single storage system to host both. + + This document explains how to mix file level scheduling with stripe. 
+ +5.1.2 Configuration Brief +------------------------- + +This setup demonstrates how users can configure unify translator with +appropriate I/O scheduler for file level scheduling and stripe for only +matching patterns. This way, GlusterFS chooses appropriate I/O profile +and knows how to efficiently handle both the types of data. + + A simple technique to achieve this effect is to create a stripe set +of unify and stripe blocks, where unify is the first sub-volume. Files +that do not match the stripe policy are passed on to the first unify sub-volume +and in turn scheduled across the cluster using its file level I/O +scheduler. + + 5.1.3 Preparing GlusterFS Environment +------------------------------------- + +Create the directories /export/for-namespace, /export/for-unify and +/export/for-stripe on all the storage bricks. + + Place the following server and client volume spec file under +/etc/glusterfs (or appropriate installed path) and replace the IP +addresses / access control fields to match your environment.
+ + ## file: /etc/glusterfs/glusterfsd.vol + volume posix-unify + type storage/posix + option directory /export/for-unify + end-volume + + volume posix-stripe + type storage/posix + option directory /export/for-stripe + end-volume + + volume posix-namespace + type storage/posix + option directory /export/for-namespace + end-volume + + volume server + type protocol/server + option transport-type tcp + option auth.addr.posix-unify.allow 192.168.1.* + option auth.addr.posix-stripe.allow 192.168.1.* + option auth.addr.posix-namespace.allow 192.168.1.* + subvolumes posix-unify posix-stripe posix-namespace + end-volume + + ## file: /etc/glusterfs/glusterfs.vol + volume client-namespace + type protocol/client + option transport-type tcp + option remote-host 192.168.1.1 + option remote-subvolume posix-namespace + end-volume + + volume client-unify-1 + type protocol/client + option transport-type tcp + option remote-host 192.168.1.1 + option remote-subvolume posix-unify + end-volume + + volume client-unify-2 + type protocol/client + option transport-type tcp + option remote-host 192.168.1.2 + option remote-subvolume posix-unify + end-volume + + volume client-unify-3 + type protocol/client + option transport-type tcp + option remote-host 192.168.1.3 + option remote-subvolume posix-unify + end-volume + + volume client-unify-4 + type protocol/client + option transport-type tcp + option remote-host 192.168.1.4 + option remote-subvolume posix-unify + end-volume + + volume client-stripe-1 + type protocol/client + option transport-type tcp + option remote-host 192.168.1.1 + option remote-subvolume posix-stripe + end-volume + + volume client-stripe-2 + type protocol/client + option transport-type tcp + option remote-host 192.168.1.2 + option remote-subvolume posix-stripe + end-volume + + volume client-stripe-3 + type protocol/client + option transport-type tcp + option remote-host 192.168.1.3 + option remote-subvolume posix-stripe + end-volume + + volume client-stripe-4 + type 
protocol/client + option transport-type tcp + option remote-host 192.168.1.4 + option remote-subvolume posix-stripe + end-volume + + volume unify + type cluster/unify + option scheduler rr + option namespace client-namespace + subvolumes client-unify-1 client-unify-2 client-unify-3 client-unify-4 + end-volume + + volume stripe + type cluster/stripe + option block-size *.img:2MB # All files ending with .img are striped with 2MB stripe block size. + subvolumes unify client-stripe-1 client-stripe-2 client-stripe-3 client-stripe-4 + end-volume + + Bring up the Storage + + Starting GlusterFS Server: If you have installed through binary +package, you can start the service through init.d startup script. If +not: + + [root@server]# glusterfsd + + Mounting GlusterFS Volumes: + + [root@client]# glusterfs -s [BRICK-IP-ADDRESS] /mnt/cluster + + Improving upon this Setup + + Infiniband Verbs RDMA transport is much faster than TCP/IP GigE +transport. + + Use of performance translators such as read-ahead, write-behind, +io-cache, io-threads, booster is recommended. + + Replace round-robin (rr) scheduler with ALU to handle more dynamic +storage environments. + + ---------- Footnotes ---------- + + (1) +http://gluster.org/docs/index.php/Mixing_Striped_and_Regular_Files + + +File: user-guide.info, Node: Troubleshooting, Next: GNU Free Documentation Licence, Prev: Usage Scenarios, Up: Top + +6 Troubleshooting +***************** + +This chapter is a general troubleshooting guide to GlusterFS. It lists +common GlusterFS server and client error messages, debugging hints, and +concludes with the suggested procedure to report bugs in GlusterFS. + +6.1 GlusterFS error messages +============================ + +6.1.1 Server errors +------------------- + + glusterfsd: FATAL: could not open specfile: + '/etc/glusterfs/glusterfsd.vol' + + The GlusterFS server expects the volume specification file to be at +`/etc/glusterfs/glusterfsd.vol'.
The example specification file will be +installed as `/etc/glusterfs/glusterfsd.vol.sample'. You need to edit +it and rename it, or provide a different specification file using the +`--spec-file' command line option (See *Note Server::). + + gf_log_init: failed to open logfile "/usr/var/log/glusterfs/glusterfsd.log" + (Permission denied) + + You don't have permission to create files in the +`/usr/var/log/glusterfs' directory. Make sure you are running GlusterFS +as root. Alternatively, specify a different path for the log file using +the `--log-file' option (See *Note Server::). + +6.1.2 Client errors +------------------- + + fusermount: failed to access mountpoint /mnt: + Transport endpoint is not connected + + A previous failed (or hung) mount of GlusterFS is preventing it from +being mounted again in the same location. The fix is to do: + + # umount /mnt + + and try mounting again. + + *"Transport endpoint is not connected".* + + If you get this error when you try a command such as `ls' or `cat', +it means the GlusterFS mount did not succeed. Try running GlusterFS in +`DEBUG' logging level and study the log messages to discover the cause. + + *"Connect to server failed", "SERVER-ADDRESS: Connection refused".* + + GlusterFS Server is not running or dead. Check your network +connections and firewall settings. To check if the server is reachable, +try: + + telnet IP-ADDRESS 6996 + + If the server is accessible, your `telnet' command should connect and +block. If not you will see an error message such as `telnet: Unable to +connect to remote host: Connection refused'. 6996 is the default +GlusterFS port. If you have changed it, then use the corresponding port +instead. + + gf_log_init: failed to open logfile "/usr/var/log/glusterfs/glusterfs.log" + (Permission denied) + + You don't have permission to create files in the +`/usr/var/log/glusterfs' directory. Make sure you are running GlusterFS +as root.
Alternatively, specify a different path for the log file using +the `--log-file' option (See *Note Client::). + +6.2 FUSE error messages +======================= + +`modprobe fuse' fails with: "Unknown symbol in module, or unknown +parameter". + + If you are using fuse-2.6.x on Redhat Enterprise Linux Work Station 4 +and Advanced Server 4 with 2.6.9-42.ELlargesmp, 2.6.9-42.ELsmp, +2.6.9-42.EL kernels and get this error while loading FUSE kernel +module, you need to apply the following patch. + + For fuse-2.6.2: + + + + For fuse-2.6.3: + + + +6.3 AppArmour and GlusterFS +=========================== + +Under OpenSuSE GNU/Linux, the AppArmour security feature does not allow +GlusterFS to create temporary files or network socket connections even +while running as root. You will see error messages like `Unable to open +log file: Operation not permitted' or `Connection refused'. Disabling +AppArmour using YaST or properly configuring AppArmour to recognize +`glusterfsd' or `glusterfs'/`fusermount' should solve the problem. + +6.4 Reporting a bug +=================== + +If you encounter a bug in GlusterFS, please follow the guidelines below +when you report it to the mailing list. Be sure to report it! User +feedback is crucial to the health of the project and we value it highly. + +6.4.1 General instructions +-------------------------- + +When running GlusterFS in a non-production environment, be sure to +build it with the following command: + + $ make CFLAGS='-g -O0 -DDEBUG' + + This includes debugging information which will be helpful in getting +backtraces (see below) and also disables optimization. Enabling +optimization can result in incorrect line numbers being reported to gdb. + +6.4.2 Volume specification files +-------------------------------- + +Attach all relevant server and client spec files you were using when +you encountered the bug. Also tell us details of your setup, i.e., how +many clients and how many servers.
+ +6.4.3 Log files +--------------- + +Set the loglevel of your client and server programs to DEBUG (by +passing the -L DEBUG option) and attach the log files with your bug +report. Obviously, if only the client is failing (for example), you +only need to send us the client log file. + +6.4.4 Backtrace +--------------- + +If GlusterFS has encountered a segmentation fault or has crashed for +some other reason, include the backtrace with the bug report. You can +get the backtrace using the following procedure. + + Run the GlusterFS client or server inside gdb. + + $ gdb ./glusterfs + (gdb) set args -f client.spec -N -l/path/to/log/file -LDEBUG /mnt/point + (gdb) run + + Now when the process segfaults, you can get the backtrace by typing: + + (gdb) bt + + If the GlusterFS process has crashed and dumped a core file (you can +find this in / if running as a daemon and in the current directory +otherwise), you can do: + + $ gdb /path/to/glusterfs /path/to/core.PID + + and then get the backtrace. + + If the GlusterFS server or client seems to be hung, then you can get +the backtrace by attaching gdb to the process. First get the `PID' of +the process (using ps), and then do: + + $ gdb ./glusterfs PID + + Press Ctrl-C to interrupt the process and then generate the +backtrace. + +6.4.5 Reproducing the bug +------------------------- + +If the bug is reproducible, please include the steps necessary to do +so. If the bug is not reproducible, send us the bug report anyway. + +6.4.6 Other information +----------------------- + +If you think it is relevant, also send us the version of FUSE you're +using, the kernel version, and the platform. + + +File: user-guide.info, Node: GNU Free Documentation Licence, Next: Index, Prev: Troubleshooting, Up: Top + +Appendix A GNU Free Documentation Licence +***************************************** + + Version 1.2, November 2002 + + Copyright (C) 2000,2001,2002 Free Software Foundation, Inc.
+ 59 Temple Place, Suite 330, Boston, MA 02111-1307, USA + + Everyone is permitted to copy and distribute verbatim copies + of this license document, but changing it is not allowed. + + 0. PREAMBLE + + The purpose of this License is to make a manual, textbook, or other + functional and useful document "free" in the sense of freedom: to + assure everyone the effective freedom to copy and redistribute it, + with or without modifying it, either commercially or + noncommercially. Secondarily, this License preserves for the + author and publisher a way to get credit for their work, while not + being considered responsible for modifications made by others. + + This License is a kind of "copyleft", which means that derivative + works of the document must themselves be free in the same sense. + It complements the GNU General Public License, which is a copyleft + license designed for free software. + + We have designed this License in order to use it for manuals for + free software, because free software needs free documentation: a + free program should come with manuals providing the same freedoms + that the software does. But this License is not limited to + software manuals; it can be used for any textual work, regardless + of subject matter or whether it is published as a printed book. + We recommend this License principally for works whose purpose is + instruction or reference. + + 1. APPLICABILITY AND DEFINITIONS + + This License applies to any manual or other work, in any medium, + that contains a notice placed by the copyright holder saying it + can be distributed under the terms of this License. Such a notice + grants a world-wide, royalty-free license, unlimited in duration, + to use that work under the conditions stated herein. The + "Document", below, refers to any such manual or work. Any member + of the public is a licensee, and is addressed as "you". 
You + accept the license if you copy, modify or distribute the work in a + way requiring permission under copyright law. + + A "Modified Version" of the Document means any work containing the + Document or a portion of it, either copied verbatim, or with + modifications and/or translated into another language. + + A "Secondary Section" is a named appendix or a front-matter section + of the Document that deals exclusively with the relationship of the + publishers or authors of the Document to the Document's overall + subject (or to related matters) and contains nothing that could + fall directly within that overall subject. (Thus, if the Document + is in part a textbook of mathematics, a Secondary Section may not + explain any mathematics.) The relationship could be a matter of + historical connection with the subject or with related matters, or + of legal, commercial, philosophical, ethical or political position + regarding them. + + The "Invariant Sections" are certain Secondary Sections whose + titles are designated, as being those of Invariant Sections, in + the notice that says that the Document is released under this + License. If a section does not fit the above definition of + Secondary then it is not allowed to be designated as Invariant. + The Document may contain zero Invariant Sections. If the Document + does not identify any Invariant Sections then there are none. + + The "Cover Texts" are certain short passages of text that are + listed, as Front-Cover Texts or Back-Cover Texts, in the notice + that says that the Document is released under this License. A + Front-Cover Text may be at most 5 words, and a Back-Cover Text may + be at most 25 words. 
+ + A "Transparent" copy of the Document means a machine-readable copy, + represented in a format whose specification is available to the + general public, that is suitable for revising the document + straightforwardly with generic text editors or (for images + composed of pixels) generic paint programs or (for drawings) some + widely available drawing editor, and that is suitable for input to + text formatters or for automatic translation to a variety of + formats suitable for input to text formatters. A copy made in an + otherwise Transparent file format whose markup, or absence of + markup, has been arranged to thwart or discourage subsequent + modification by readers is not Transparent. An image format is + not Transparent if used for any substantial amount of text. A + copy that is not "Transparent" is called "Opaque". + + Examples of suitable formats for Transparent copies include plain + ASCII without markup, Texinfo input format, LaTeX input format, + SGML or XML using a publicly available DTD, and + standard-conforming simple HTML, PostScript or PDF designed for + human modification. Examples of transparent image formats include + PNG, XCF and JPG. Opaque formats include proprietary formats that + can be read and edited only by proprietary word processors, SGML or + XML for which the DTD and/or processing tools are not generally + available, and the machine-generated HTML, PostScript or PDF + produced by some word processors for output purposes only. + + The "Title Page" means, for a printed book, the title page itself, + plus such following pages as are needed to hold, legibly, the + material this License requires to appear in the title page. For + works in formats which do not have any title page as such, "Title + Page" means the text near the most prominent appearance of the + work's title, preceding the beginning of the body of the text. 
+ + A section "Entitled XYZ" means a named subunit of the Document + whose title either is precisely XYZ or contains XYZ in parentheses + following text that translates XYZ in another language. (Here XYZ + stands for a specific section name mentioned below, such as + "Acknowledgements", "Dedications", "Endorsements", or "History".) + To "Preserve the Title" of such a section when you modify the + Document means that it remains a section "Entitled XYZ" according + to this definition. + + The Document may include Warranty Disclaimers next to the notice + which states that this License applies to the Document. These + Warranty Disclaimers are considered to be included by reference in + this License, but only as regards disclaiming warranties: any other + implication that these Warranty Disclaimers may have is void and + has no effect on the meaning of this License. + + 2. VERBATIM COPYING + + You may copy and distribute the Document in any medium, either + commercially or noncommercially, provided that this License, the + copyright notices, and the license notice saying this License + applies to the Document are reproduced in all copies, and that you + add no other conditions whatsoever to those of this License. You + may not use technical measures to obstruct or control the reading + or further copying of the copies you make or distribute. However, + you may accept compensation in exchange for copies. If you + distribute a large enough number of copies you must also follow + the conditions in section 3. + + You may also lend copies, under the same conditions stated above, + and you may publicly display copies. + + 3. 
COPYING IN QUANTITY + + If you publish printed copies (or copies in media that commonly + have printed covers) of the Document, numbering more than 100, and + the Document's license notice requires Cover Texts, you must + enclose the copies in covers that carry, clearly and legibly, all + these Cover Texts: Front-Cover Texts on the front cover, and + Back-Cover Texts on the back cover. Both covers must also clearly + and legibly identify you as the publisher of these copies. The + front cover must present the full title with all words of the + title equally prominent and visible. You may add other material + on the covers in addition. Copying with changes limited to the + covers, as long as they preserve the title of the Document and + satisfy these conditions, can be treated as verbatim copying in + other respects. + + If the required texts for either cover are too voluminous to fit + legibly, you should put the first ones listed (as many as fit + reasonably) on the actual cover, and continue the rest onto + adjacent pages. + + If you publish or distribute Opaque copies of the Document + numbering more than 100, you must either include a + machine-readable Transparent copy along with each Opaque copy, or + state in or with each Opaque copy a computer-network location from + which the general network-using public has access to download + using public-standard network protocols a complete Transparent + copy of the Document, free of added material. If you use the + latter option, you must take reasonably prudent steps, when you + begin distribution of Opaque copies in quantity, to ensure that + this Transparent copy will remain thus accessible at the stated + location until at least one year after the last time you + distribute an Opaque copy (directly or through your agents or + retailers) of that edition to the public. 
+ + It is requested, but not required, that you contact the authors of + the Document well before redistributing any large number of + copies, to give them a chance to provide you with an updated + version of the Document. + + 4. MODIFICATIONS + + You may copy and distribute a Modified Version of the Document + under the conditions of sections 2 and 3 above, provided that you + release the Modified Version under precisely this License, with + the Modified Version filling the role of the Document, thus + licensing distribution and modification of the Modified Version to + whoever possesses a copy of it. In addition, you must do these + things in the Modified Version: + + A. Use in the Title Page (and on the covers, if any) a title + distinct from that of the Document, and from those of + previous versions (which should, if there were any, be listed + in the History section of the Document). You may use the + same title as a previous version if the original publisher of + that version gives permission. + + B. List on the Title Page, as authors, one or more persons or + entities responsible for authorship of the modifications in + the Modified Version, together with at least five of the + principal authors of the Document (all of its principal + authors, if it has fewer than five), unless they release you + from this requirement. + + C. State on the Title page the name of the publisher of the + Modified Version, as the publisher. + + D. Preserve all the copyright notices of the Document. + + E. Add an appropriate copyright notice for your modifications + adjacent to the other copyright notices. + + F. Include, immediately after the copyright notices, a license + notice giving the public permission to use the Modified + Version under the terms of this License, in the form shown in + the Addendum below. + + G. Preserve in that license notice the full lists of Invariant + Sections and required Cover Texts given in the Document's + license notice. + + H. 
Include an unaltered copy of this License. + + I. Preserve the section Entitled "History", Preserve its Title, + and add to it an item stating at least the title, year, new + authors, and publisher of the Modified Version as given on + the Title Page. If there is no section Entitled "History" in + the Document, create one stating the title, year, authors, + and publisher of the Document as given on its Title Page, + then add an item describing the Modified Version as stated in + the previous sentence. + + J. Preserve the network location, if any, given in the Document + for public access to a Transparent copy of the Document, and + likewise the network locations given in the Document for + previous versions it was based on. These may be placed in + the "History" section. You may omit a network location for a + work that was published at least four years before the + Document itself, or if the original publisher of the version + it refers to gives permission. + + K. For any section Entitled "Acknowledgements" or "Dedications", + Preserve the Title of the section, and preserve in the + section all the substance and tone of each of the contributor + acknowledgements and/or dedications given therein. + + L. Preserve all the Invariant Sections of the Document, + unaltered in their text and in their titles. Section numbers + or the equivalent are not considered part of the section + titles. + + M. Delete any section Entitled "Endorsements". Such a section + may not be included in the Modified Version. + + N. Do not retitle any existing section to be Entitled + "Endorsements" or to conflict in title with any Invariant + Section. + + O. Preserve any Warranty Disclaimers. + + If the Modified Version includes new front-matter sections or + appendices that qualify as Secondary Sections and contain no + material copied from the Document, you may at your option + designate some or all of these sections as invariant. 
To do this, + add their titles to the list of Invariant Sections in the Modified + Version's license notice. These titles must be distinct from any + other section titles. + + You may add a section Entitled "Endorsements", provided it contains + nothing but endorsements of your Modified Version by various + parties--for example, statements of peer review or that the text + has been approved by an organization as the authoritative + definition of a standard. + + You may add a passage of up to five words as a Front-Cover Text, + and a passage of up to 25 words as a Back-Cover Text, to the end + of the list of Cover Texts in the Modified Version. Only one + passage of Front-Cover Text and one of Back-Cover Text may be + added by (or through arrangements made by) any one entity. If the + Document already includes a cover text for the same cover, + previously added by you or by arrangement made by the same entity + you are acting on behalf of, you may not add another; but you may + replace the old one, on explicit permission from the previous + publisher that added the old one. + + The author(s) and publisher(s) of the Document do not by this + License give permission to use their names for publicity for or to + assert or imply endorsement of any Modified Version. + + 5. COMBINING DOCUMENTS + + You may combine the Document with other documents released under + this License, under the terms defined in section 4 above for + modified versions, provided that you include in the combination + all of the Invariant Sections of all of the original documents, + unmodified, and list them all as Invariant Sections of your + combined work in its license notice, and that you preserve all + their Warranty Disclaimers. + + The combined work need only contain one copy of this License, and + multiple identical Invariant Sections may be replaced with a single + copy. 
If there are multiple Invariant Sections with the same name + but different contents, make the title of each such section unique + by adding at the end of it, in parentheses, the name of the + original author or publisher of that section if known, or else a + unique number. Make the same adjustment to the section titles in + the list of Invariant Sections in the license notice of the + combined work. + + In the combination, you must combine any sections Entitled + "History" in the various original documents, forming one section + Entitled "History"; likewise combine any sections Entitled + "Acknowledgements", and any sections Entitled "Dedications". You + must delete all sections Entitled "Endorsements." + + 6. COLLECTIONS OF DOCUMENTS + + You may make a collection consisting of the Document and other + documents released under this License, and replace the individual + copies of this License in the various documents with a single copy + that is included in the collection, provided that you follow the + rules of this License for verbatim copying of each of the + documents in all other respects. + + You may extract a single document from such a collection, and + distribute it individually under this License, provided you insert + a copy of this License into the extracted document, and follow + this License in all other respects regarding verbatim copying of + that document. + + 7. AGGREGATION WITH INDEPENDENT WORKS + + A compilation of the Document or its derivatives with other + separate and independent documents or works, in or on a volume of + a storage or distribution medium, is called an "aggregate" if the + copyright resulting from the compilation is not used to limit the + legal rights of the compilation's users beyond what the individual + works permit. When the Document is included in an aggregate, this + License does not apply to the other works in the aggregate which + are not themselves derivative works of the Document. 
+ + If the Cover Text requirement of section 3 is applicable to these + copies of the Document, then if the Document is less than one half + of the entire aggregate, the Document's Cover Texts may be placed + on covers that bracket the Document within the aggregate, or the + electronic equivalent of covers if the Document is in electronic + form. Otherwise they must appear on printed covers that bracket + the whole aggregate. + + 8. TRANSLATION + + Translation is considered a kind of modification, so you may + distribute translations of the Document under the terms of section + 4. Replacing Invariant Sections with translations requires special + permission from their copyright holders, but you may include + translations of some or all Invariant Sections in addition to the + original versions of these Invariant Sections. You may include a + translation of this License, and all the license notices in the + Document, and any Warranty Disclaimers, provided that you also + include the original English version of this License and the + original versions of those notices and disclaimers. In case of a + disagreement between the translation and the original version of + this License or a notice or disclaimer, the original version will + prevail. + + If a section in the Document is Entitled "Acknowledgements", + "Dedications", or "History", the requirement (section 4) to + Preserve its Title (section 1) will typically require changing the + actual title. + + 9. TERMINATION + + You may not copy, modify, sublicense, or distribute the Document + except as expressly provided for under this License. Any other + attempt to copy, modify, sublicense or distribute the Document is + void, and will automatically terminate your rights under this + License. However, parties who have received copies, or rights, + from you under this License will not have their licenses + terminated so long as such parties remain in full compliance. + + 10. 
FUTURE REVISIONS OF THIS LICENSE + + The Free Software Foundation may publish new, revised versions of + the GNU Free Documentation License from time to time. Such new + versions will be similar in spirit to the present version, but may + differ in detail to address new problems or concerns. See + `http://www.gnu.org/copyleft/'. + + Each version of the License is given a distinguishing version + number. If the Document specifies that a particular numbered + version of this License "or any later version" applies to it, you + have the option of following the terms and conditions either of + that specified version or of any later version that has been + published (not as a draft) by the Free Software Foundation. If + the Document does not specify a version number of this License, + you may choose any version ever published (not as a draft) by the + Free Software Foundation. + +A.0.1 ADDENDUM: How to use this License for your documents +---------------------------------------------------------- + +To use this License in a document you have written, include a copy of +the License in the document and put the following copyright and license +notices just after the title page: + + Copyright (C) YEAR YOUR NAME. + Permission is granted to copy, distribute and/or modify this document + under the terms of the GNU Free Documentation License, Version 1.2 + or any later version published by the Free Software Foundation; + with no Invariant Sections, no Front-Cover Texts, and no Back-Cover + Texts. A copy of the license is included in the section entitled ``GNU + Free Documentation License''. + + If you have Invariant Sections, Front-Cover Texts and Back-Cover +Texts, replace the "with...Texts." line with this: + + with the Invariant Sections being LIST THEIR TITLES, with + the Front-Cover Texts being LIST, and with the Back-Cover Texts + being LIST. 
+ + If you have Invariant Sections without Cover Texts, or some other +combination of the three, merge those two alternatives to suit the +situation. + + If your document contains nontrivial examples of program code, we +recommend releasing these examples in parallel under your choice of +free software license, such as the GNU General Public License, to +permit their use in free software. + + +File: user-guide.info, Node: Index, Prev: GNU Free Documentation Licence, Up: Top + +Index +***** + +[index] +* Menu: + +* alu (scheduler): Unify. (line 49) +* AppArmour: Troubleshooting. (line 96) +* arch: Getting GlusterFS. (line 6) +* booster: Booster. (line 6) +* commercial support: Introduction. (line 36) +* DNS round robin: Transport modules. (line 29) +* fcntl: POSIX Locks. (line 6) +* FDL, GNU Free Documentation License: GNU Free Documentation Licence. + (line 6) +* fixed-id (translator): Fixed ID. (line 6) +* GlusterFS client: Client. (line 6) +* GlusterFS mailing list: Introduction. (line 28) +* GlusterFS server: Server. (line 6) +* infiniband transport: Transport modules. (line 58) +* InfiniBand, installation: Pre requisites. (line 51) +* io-cache (translator): IO Cache. (line 6) +* io-threads (translator): IO Threads. (line 6) +* IRC channel, #gluster: Introduction. (line 31) +* libibverbs: Pre requisites. (line 51) +* namespace: Unify. (line 207) +* nufa (scheduler): Unify. (line 175) +* OpenSuSE: Troubleshooting. (line 96) +* posix-locks (translator): POSIX Locks. (line 6) +* random (scheduler): Unify. (line 159) +* read-ahead (translator): Read Ahead. (line 6) +* record locking: POSIX Locks. (line 6) +* Redhat Enterprise Linux: Troubleshooting. (line 78) +* Replicate: Replicate. (line 6) +* rot-13 (translator): ROT-13. (line 6) +* rr (scheduler): Unify. (line 138) +* scheduler (unify): Unify. (line 6) +* self heal (replicate): Replicate. (line 46) +* self heal (unify): Unify. (line 223) +* stripe (translator): Stripe. (line 6) +* trace (translator): Trace. 
(line 6) +* unify (translator): Unify. (line 6) +* unify invariants: Unify. (line 16) +* write-behind (translator): Write Behind. (line 6) +* Z Research, Inc.: Introduction. (line 36) + + + +Tag Table: +Node: Top703 +Node: Acknowledgements2303 +Node: Introduction3213 +Node: Installation and Invocation4648 +Node: Pre requisites4932 +Node: Getting GlusterFS7022 +Ref: Getting GlusterFS-Footnote-17808 +Node: Building7856 +Node: Running GlusterFS9558 +Node: Server9769 +Node: Client11357 +Node: A Tutorial Introduction13563 +Node: Concepts17100 +Node: Filesystems in Userspace17315 +Node: Translator18456 +Node: Volume specification file21159 +Node: Translators23631 +Node: Storage Translators24200 +Ref: Storage Translators-Footnote-125007 +Node: POSIX25141 +Node: BDB25764 +Node: Client and Server Translators26821 +Node: Transport modules27297 +Node: Client protocol31444 +Node: Server protocol32383 +Node: Clustering Translators33372 +Node: Unify34259 +Ref: Unify-Footnote-143858 +Node: Replicate43950 +Node: Stripe49005 +Node: Performance Translators50163 +Node: Read Ahead50437 +Node: Write Behind52169 +Node: IO Threads53578 +Node: IO Cache54366 +Node: Booster55690 +Node: Features Translators57104 +Node: POSIX Locks57332 +Node: Fixed ID58649 +Node: Miscellaneous Translators59135 +Node: ROT-1359333 +Node: Trace60012 +Node: Usage Scenarios61281 +Ref: Usage Scenarios-Footnote-167214 +Node: Troubleshooting67289 +Node: GNU Free Documentation Licence73637 +Node: Index96086 + +End Tag Table diff --git a/doc/user-guide/user-guide.pdf b/doc/user-guide/user-guide.pdf new file mode 100644 index 000000000..ed7bd2a99 Binary files /dev/null and b/doc/user-guide/user-guide.pdf differ diff --git a/doc/user-guide/user-guide.texi b/doc/user-guide/user-guide.texi new file mode 100644 index 000000000..8365419a6 --- /dev/null +++ b/doc/user-guide/user-guide.texi @@ -0,0 +1,2226 @@ +\input texinfo +@setfilename user-guide.info +@settitle GlusterFS 2.0 User Guide +@afourpaper + +@direntry +* 
GlusterFS: (user-guide). GlusterFS distributed filesystem user guide +@end direntry + +@copying +This is the user manual for GlusterFS 2.0. + +Copyright @copyright{} 2008,2007 @email{@b{Z}} Research, Inc. Permission is granted to +copy, distribute and/or modify this document under the terms of the +@acronym{GNU} Free Documentation License, Version 1.2 or any later +version published by the Free Software Foundation; with no Invariant +Sections, no Front-Cover Texts, and no Back-Cover Texts. A copy of the +license is included in the chapter entitled ``@acronym{GNU} Free +Documentation License''. +@end copying + +@titlepage +@title GlusterFS 2.0 User Guide [DRAFT] +@subtitle January 15, 2008 +@author http://gluster.org/core-team.php +@author @email{@b{Z}} @b{Research} + +@page +@vskip 0pt plus 1filll +@insertcopying +@end titlepage + +@c Info stuff +@ifnottex +@node Top +@top GlusterFS 2.0 User Guide + +@insertcopying +@menu +* Acknowledgements:: +* Introduction:: +* Installation and Invocation:: +* Concepts:: +* Translators:: +* Usage Scenarios:: +* Troubleshooting:: +* GNU Free Documentation Licence:: +* Index:: + +@detailmenu + --- The Detailed Node Listing --- + +Installation and Invocation + +* Pre requisites:: +* Getting GlusterFS:: +* Building:: +* Running GlusterFS:: +* A Tutorial Introduction:: + +Running GlusterFS + +* Server:: +* Client:: + +Concepts + +* Filesystems in Userspace:: +* Translator:: +* Volume specification file:: + +Translators + +* Storage Translators:: +* Client and Server Translators:: +* Clustering Translators:: +* Performance Translators:: +* Features Translators:: + +Storage Translators + +* POSIX:: + +Client and Server Translators + +* Transport modules:: +* Client protocol:: +* Server protocol:: + +Clustering Translators + +* Unify:: +* Replicate:: +* Stripe:: + +Performance Translators + +* Read Ahead:: +* Write Behind:: +* IO Threads:: +* IO Cache:: + +Features Translators + +* POSIX Locks:: +* Fixed ID:: + +Miscellaneous 
Translators + +* ROT-13:: +* Trace:: + +@end detailmenu +@end menu + +@end ifnottex +@c Info stuff end + +@contents + +@node Acknowledgements +@unnumbered Acknowledgements +GlusterFS continues to be a wonderful and enriching experience for all +of us involved. + +GlusterFS development would not have been possible at this pace if +not for our enthusiastic users. People from around the world have +helped us with bug reports, performance numbers, and feature suggestions. +A huge thanks to them all. + +Matthew Paine - for RPMs & general enthu + +Leonardo Rodrigues de Mello - for DEBs + +Julian Perez & Adam D'Auria - for multi-server tutorial + +Paul England - for HA spec + +Brent Nelson - for many bug reports + +Jacques Mattheij - for Europe mirror. + +Patrick Negri - for TCP non-blocking connect. +@flushright +http://gluster.org/core-team.php (@email{list-hacking@@zresearch.com}) +@email{@b{Z}} Research +@end flushright + +@node Introduction +@chapter Introduction + +GlusterFS is a distributed filesystem. It works at the file level, +not block level. + +A network filesystem is one which allows us to access remote files. A +distributed filesystem is one that stores data on multiple machines +and makes them all appear to be a part of the same filesystem. + +Need for distributed filesystems + +@itemize @bullet +@item Scalability: A distributed filesystem allows us to store more data than what can be stored on a single machine. + +@item Redundancy: We might want to replicate crucial data on to several machines. + +@item Uniform access: One can mount a remote volume (for example your home directory) from any machine and access the same data. +@end itemize + +@section Contacting us +You can reach us through the mailing list @strong{gluster-devel} +(@email{gluster-devel@@nongnu.org}). +@cindex GlusterFS mailing list + +You can also find many of the developers on @acronym{IRC}, on the @code{#gluster} +channel on Freenode (@indicateurl{irc.freenode.net}). 
+@cindex IRC channel, #gluster + +The GlusterFS documentation wiki is also useful: @* +@indicateurl{http://gluster.org/docs/index.php/GlusterFS} + +For commercial support, you can contact @email{@b{Z}} Research at: +@cindex commercial support +@cindex Z Research, Inc. + +@display +3194 Winding Vista Common +Fremont, CA 94539 +USA. + +Phone: +1 (510) 354 6801 +Toll free: +1 (888) 813 6309 +Fax: +1 (510) 372 0604 +@end display + +You can also email us at @email{support@@zresearch.com}. + +@node Installation and Invocation +@chapter Installation and Invocation + +@menu +* Pre requisites:: +* Getting GlusterFS:: +* Building:: +* Running GlusterFS:: +* A Tutorial Introduction:: +@end menu + +@node Pre requisites +@section Pre requisites + +Before installing GlusterFS make sure you have the +following components installed. + +@subsection @acronym{FUSE} +You'll need @acronym{FUSE} version 2.6.0 or higher to +use GlusterFS. You can omit installing @acronym{FUSE} if you want to +build @emph{only} the server. Note that you won't be able to mount +a GlusterFS filesystem on a machine that does not have @acronym{FUSE} +installed. + +@acronym{FUSE} can be downloaded from: @indicateurl{http://fuse.sourceforge.net/} + +To get the best performance from GlusterFS, however, it is recommended that you use +our patched version of @acronym{FUSE}. See Patched FUSE for details. + +@subsection Patched FUSE + +The GlusterFS project maintains a patched version of @acronym{FUSE} meant to be used +with GlusterFS. The patches increase GlusterFS performance. It is recommended that +all users use the patched @acronym{FUSE}. + +The patched @acronym{FUSE} tarball can be downloaded from: + +@indicateurl{ftp://ftp.zresearch.com/pub/gluster/glusterfs/fuse/} + +The specific changes made to @acronym{FUSE} are: + +@itemize +@item The communication channel size between @acronym{FUSE} kernel module and GlusterFS has been increased to 1MB, permitting large reads and writes to be sent in bigger chunks. 
+ +@item The kernel's read-ahead boundary has been extended up to 1MB. + +@item Block size returned in the @command{stat()}/@command{fstat()} calls tuned to 1MB, to make cp and similar commands perform I/O using that block size. + +@item @command{flock()} locking support has been added (although some rework in GlusterFS is needed for perfect compliance). +@end itemize + +@subsection libibverbs (optional) +@cindex InfiniBand, installation +@cindex libibverbs +This is only needed if you want GlusterFS to use InfiniBand as the +interconnect mechanism between server and client. You can get it from: + +@indicateurl{http://www.openfabrics.org/downloads.htm}. + +@subsection Bison and Flex +These should be already installed on most Linux systems. If not, use your distribution's +normal software installation procedures to install them. Make sure you install the +relevant developer packages also. + +@node Getting GlusterFS +@section Getting GlusterFS +@cindex arch +There are many ways to get hold of GlusterFS. For a production deployment, +the recommended method is to download the latest release tarball. +Release tarballs are available at: @indicateurl{http://gluster.org/download.php}. + +If you want the bleeding edge development source, you can get it +from the @acronym{GNU} +Arch@footnote{@indicateurl{http://www.gnu.org/software/gnu-arch/}} +repository. First you must install @acronym{GNU} Arch itself. Then +register the GlusterFS archive by doing: + +@example +$ tla register-archive http://arch.sv.gnu.org/archives/gluster +@end example + +Now you can check out the source itself: + +@example +$ tla get -A gluster@@sv.gnu.org glusterfs--mainline--3.0 +@end example + +@node Building +@section Building +You can skip this section if you're installing from @acronym{RPM}s +or @acronym{DEB}s. + +GlusterFS uses the Autotools mechanism to build. As such, the procedure +is straight-forward. First, change into the GlusterFS source directory.
+ +@example +$ cd glusterfs- +@end example + +If you checked out the source from the Arch repository, you'll need +to run @command{./autogen.sh} first. Note that you'll need to have +Autoconf and Automake installed for this. + +Run @command{configure}. + +@example +$ ./configure +@end example + +The configure script accepts the following options: + +@cartouche +@table @code + +@item --disable-ibverbs +Disable the InfiniBand transport mechanism. + +@item --disable-fuse-client +Disable the @acronym{FUSE} client. + +@item --disable-server +Disable building of the GlusterFS server. + +@item --disable-bdb +Disable building of Berkeley DB based storage translator. + +@item --disable-mod_glusterfs +Disable building of Apache/lighttpd glusterfs plugins. + +@item --disable-epoll +Use poll instead of epoll. + +@item --disable-libglusterfsclient +Disable building of libglusterfsclient + +@end table +@end cartouche + +Build and install GlusterFS. + +@example +# make install +@end example + +The binaries (@command{glusterfsd} and @command{glusterfs}) will be by +default installed in @command{/usr/local/sbin/}. Translator, +scheduler, and transport shared libraries will be installed in +@command{/usr/local/lib/glusterfs//}. Sample volume +specification files will be in @command{/usr/local/etc/glusterfs/}. +This document itself can be found in +@command{/usr/local/share/doc/glusterfs/}. If you passed the @command{--prefix} +argument to the configure script, then replace @command{/usr/local} in the preceding +paths with the prefix. + +@node Running GlusterFS +@section Running GlusterFS + +@menu +* Server:: +* Client:: +@end menu + +@node Server +@subsection Server +@cindex GlusterFS server + +The GlusterFS server is necessary to export storage volumes to remote clients +(See @ref{Server protocol} for more info). This section documents the invocation +of the GlusterFS server program and all the command-line options accepted by it. 
+ +@cartouche +@table @code +Basic Options +@item -f, --volfile= + Use the volume file as the volume specification. + +@item -s, --volfile-server= + Server to get volume file from. This option overrides --volfile option. + +@item -l, --log-file= + Specify the path for the log file. + +@item -L, --log-level= + Set the log level for the server. Log level should be one of @acronym{DEBUG}, +@acronym{WARNING}, @acronym{ERROR}, @acronym{CRITICAL}, or @acronym{NONE}. + +Advanced Options +@item --debug + Run in debug mode. This option sets --no-daemon, --log-level to DEBUG and + --log-file to console. + +@item -N, --no-daemon + Run glusterfsd as a foreground process. + +@item -p, --pid-file= + Path for the @acronym{PID} file. + +@item --volfile-id= + 'key' of the volfile to be fetched from server. + +@item --volfile-server-port= + Listening port number of volfile server. + +@item --volfile-server-transport=[socket|ib-verbs] + Transport type to get volfile from server. [default: @command{socket}] + +@item --xlator-options= + Add/override a translator option for a volume with specified value. + +Miscellaneous Options +@item -?, --help + Show this help text. + +@item --usage + Display a short usage message. + +@item -V, --version + Show version information. +@end table +@end cartouche + +@node Client +@subsection Client +@cindex GlusterFS client + +The GlusterFS client process is necessary to access remote storage volumes and +mount them locally using @acronym{FUSE}. This section documents the invocation of the +client process and all its command-line arguments. + +@example + # glusterfs [options] +@end example + +The @command{mountpoint} is the directory where you want the GlusterFS +filesystem to appear. Example: + +@example + # glusterfs -f /usr/local/etc/glusterfs-client.vol /mnt +@end example + +The command-line options are detailed below. 
+ +@tex +\vfill +@end tex +@page + +@cartouche +@table @code + +Basic Options +@item -f, --volfile= + Use the volume file as the volume specification. + +@item -s, --volfile-server= + Server to get volume file from. This option overrides --volfile option. + +@item -l, --log-file= + Specify the path for the log file. + +@item -L, --log-level= + Set the log level for the client. Log level should be one of @acronym{DEBUG}, +@acronym{WARNING}, @acronym{ERROR}, @acronym{CRITICAL}, or @acronym{NONE}. + +Advanced Options +@item --debug + Run in debug mode. This option sets --no-daemon, --log-level to DEBUG and + --log-file to console. + +@item -N, --no-daemon + Run @command{glusterfs} as a foreground process. + +@item -p, --pid-file= + Path for the @acronym{PID} file. + +@item --volfile-id= + 'key' of the volfile to be fetched from server. + +@item --volfile-server-port= + Listening port number of volfile server. + +@item --volfile-server-transport=[socket|ib-verbs] + Transport type to get volfile from server. [default: @command{socket}] + +@item --xlator-options= + Add/override a translator option for a volume with specified value. + +@item --volume-name= + Volume name in client spec to use. Defaults to the root volume. + +@acronym{FUSE} Options +@item --attribute-timeout= + Attribute timeout for inodes in the kernel, in seconds. Defaults to 1 second. + +@item --disable-direct-io-mode + Disable direct @acronym{I/O} mode in @acronym{FUSE} kernel module. + +@item -e, --entry-timeout= + Entry timeout for directory entries in the kernel, in seconds. + Defaults to 1 second. + +Miscellaneous Options +@item -?, --help + Show this help information. + +@item -V, --version + Show version information. +@end table +@end cartouche + +@node A Tutorial Introduction +@section A Tutorial Introduction + +This section will show you how to quickly get GlusterFS up and running. We'll +configure GlusterFS as a simple network filesystem, with one server and one client.
+In this mode of usage, GlusterFS can serve as a replacement for NFS. + +We'll make use of two machines; call them @emph{server} and +@emph{client} (If you don't want to set up two machines, just run +everything that follows on the same machine). In the examples that +follow, the shell prompts will use these names to clarify the machine +on which the command is being run. For example, a command that should +be run on the server will be shown with the prompt: + +@example +[root@@server]# +@end example + +Our goal is to make a directory on the @emph{server} (say, @command{/export}) +accessible to the @emph{client}. + +First of all, get GlusterFS installed on both the machines, as described in the +previous sections. Make sure you have the @acronym{FUSE} kernel module loaded. You +can ensure this by running: + +@example +[root@@server]# modprobe fuse +@end example + +Before we can run the GlusterFS client or server programs, we need to write +two files called @emph{volume specifications} (equivalently referred to as @emph{volfiles}). +The volfile describes the @emph{translator tree} on a node. The next chapter will +explain the concepts of `translator' and `volume specification' in detail. For now, +just assume that the volfile is like an NFS @command{/etc/exports} file. + +On the server, create a text file somewhere (we'll assume the path +@command{/tmp/glusterfsd.vol}) with the following contents. + +@cartouche +@example +volume colon-o + type storage/posix + option directory /export +end-volume + +volume server + type protocol/server + subvolumes colon-o + option transport-type tcp + option auth.addr.colon-o.allow * +end-volume +@end example +@end cartouche + +A brief explanation of the file's contents. The first section defines a storage +volume, named ``colon-o'' (the volume names are arbitrary), which exports the +@command{/export} directory. The second section defines options for the translator +which will make the storage volume accessible remotely.
It specifies @command{colon-o} as +a subvolume. This defines the @emph{translator tree}, about which more will be said +in the next chapter. The two options specify that the @acronym{TCP} protocol is to be +used (as opposed to InfiniBand, for example), and that access to the storage volume +is to be provided to clients with any @acronym{IP} address at all. If you wanted to +restrict access to this server to only your subnet for example, you'd specify +something like @command{192.168.1.*} in the second option line. + +On the client machine, create the following text file (again, we'll assume +the path to be @command{/tmp/glusterfs-client.vol}). Replace +@emph{server-ip-address} with the @acronym{IP} address of your server machine. If you +are doing all this on a single machine, use @command{127.0.0.1}. + +@cartouche +@example +volume client + type protocol/client + option transport-type tcp + option remote-host @emph{server-ip-address} + option remote-subvolume colon-o +end-volume +@end example +@end cartouche + +Now we need to start both the server and client programs. To start the server: + +@example +[root@@server]# glusterfsd -f /tmp/glusterfsd.vol +@end example + +To start the client: + +@example +[root@@client]# glusterfs -f /tmp/glusterfs-client.vol /mnt/glusterfs +@end example + +You should now be able to see the files under the server's @command{/export} directory +in the @command{/mnt/glusterfs} directory on the client. That's it; GlusterFS is now +working as a network file system. + +@node Concepts +@chapter Concepts + +@menu +* Filesystems in Userspace:: +* Translator:: +* Volume specification file:: +@end menu + +@node Filesystems in Userspace +@section Filesystems in Userspace + +A filesystem is usually implemented in kernel space. Kernel space +development is much harder than userspace development. @acronym{FUSE} +is a kernel module/library that allows us to write a filesystem +completely in userspace.
+ +@acronym{FUSE} consists of a kernel module which interacts with the userspace +implementation using a device file @code{/dev/fuse}. When a process +makes a syscall on a @acronym{FUSE} filesystem, @acronym{VFS} hands the request to the +@acronym{FUSE} module, which writes the request to @code{/dev/fuse}. The +userspace implementation polls @code{/dev/fuse}, and when a request arrives, +processes it and writes the result back to @code{/dev/fuse}. The kernel then +reads from the device file and returns the result to the user process. + +In case of GlusterFS, the userspace program is the GlusterFS client. +The control flow is shown in the diagram below. The GlusterFS client +services the request by sending it to the server, which in turn +hands it to the local @acronym{POSIX} filesystem. + +@center @image{fuse,44pc,,,.pdf} +@center Fig 1. Control flow in GlusterFS + +@node Translator +@section Translator + +The @emph{translator} is the most important concept in GlusterFS. In +fact, GlusterFS is nothing but a collection of translators working +together, forming a translator @emph{tree}. + +The idea of a translator is perhaps best understood using an +analogy. Consider the @acronym{VFS} in the Linux kernel. The +@acronym{VFS} abstracts the various filesystem implementations (such +as @acronym{EXT3}, ReiserFS, @acronym{XFS}, etc.) supported by the +kernel. When an application calls the kernel to perform an operation +on a file, the kernel passes the request on to the appropriate +filesystem implementation. + +For example, let's say there are two partitions on a Linux machine: +@command{/}, which is an @acronym{EXT3} partition, and @command{/usr}, +which is a ReiserFS partition. Now if an application wants to open a +file called, say, @command{/etc/fstab}, then the kernel will +internally pass the request to the @acronym{EXT3} implementation. 
If +on the other hand, an application wants to read a file called +@command{/usr/src/linux/CREDITS}, then the kernel will call upon the +ReiserFS implementation to do the job. + +The ``filesystem implementation'' objects are analogous to GlusterFS +translators. A GlusterFS translator implements all the filesystem +operations. Whereas in @acronym{VFS} there is a two-level tree (with +the kernel at the root and all the filesystem implementation as its +children), in GlusterFS there exists a more elaborate tree structure. + +We can now define translators more precisely. A GlusterFS translator +is a shared object (@command{.so}) that implements every filesystem +call. GlusterFS translators can be arranged in an arbitrary tree +structure (subject to constraints imposed by the translators). When +GlusterFS receives a filesystem call, it passes it on to the +translator at the root of the translator tree. The root translator may +in turn pass it on to any or all of its children, and so on, until the +leaf nodes are reached. The result of a filesystem call is +communicated in the reverse fashion, from the leaf nodes up to the +root node, and then on to the application. + +So what might a translator tree look like? + +@tex +\vfill +@end tex +@page + +@center @image{xlator,44pc,,,.pdf} +@center Fig 2. A sample translator tree + +The diagram depicts three servers and one GlusterFS client. It is important +to note that conceptually, the translator tree spans machine boundaries. +Thus, the client machine in the diagram, @command{10.0.0.1}, can access +the aggregated storage of the filesystems on the server machines @command{10.0.0.2}, +@command{10.0.0.3}, and @command{10.0.0.4}. The translator diagram will make more +sense once you've read the next chapter and understood the functions of the +various translators. 
+ +@node Volume specification file +@section Volume specification file +The volume specification file describes the translator tree for both the +server and client programs. + +A volume specification file is a sequence of volume definitions. +The syntax of a volume definition is explained below: + +@cartouche +@example +@strong{volume} @emph{volume-name} + @strong{type} @emph{translator-name} + @strong{option} @emph{option-name} @emph{option-value} + @dots{} + @strong{subvolumes} @emph{subvolume1} @emph{subvolume2} @dots{} +@strong{end-volume} +@end example + +@dots{} +@end cartouche + +@table @asis +@item @emph{volume-name} + An identifier for the volume. This is just a human-readable name, +and can contain any alphanumeric character. For instance, ``storage-1'', ``colon-o'', +or ``forty-two''. + +@item @emph{translator-name} + Name of one of the available translators. Example: @command{protocol/client}, +@command{cluster/unify}. + +@item @emph{option-name} + Name of a valid option for the translator. + +@item @emph{option-value} + Value for the option. Everything following the ``option'' keyword to the end of the +line is considered the value; it is up to the translator to parse it. + +@item @emph{subvolume1}, @emph{subvolume2}, @dots{} + Volume names of sub-volumes. The sub-volumes must already have been defined earlier +in the file. +@end table + +There are a few rules you must follow when writing a volume specification file: + +@itemize +@item Everything following a `@command{#}' is considered a comment and is ignored. Blank lines are also ignored. +@item All names and keywords are case-sensitive. +@item The order of options inside a volume definition does not matter. +@item An option value may not span multiple lines. +@item If an option is not specified, it will assume its default value. +@item A sub-volume must have already been defined before it can be referenced. 
This means you have to write the specification file ``bottom-up'', starting from the leaf nodes of the translator tree and moving up to the root. +@end itemize + +A simple example volume specification file is shown below: + +@cartouche +@example +# This is a comment line +volume client + type protocol/client + option transport-type tcp + option remote-host localhost # Also a comment + option remote-subvolume brick +# The subvolumes line may be absent +end-volume + +volume iot + type performance/io-threads + option thread-count 4 + subvolumes client +end-volume + +volume wb + type performance/write-behind + subvolumes iot +end-volume +@end example +@end cartouche + +@node Translators +@chapter Translators + +@menu +* Storage Translators:: +* Client and Server Translators:: +* Clustering Translators:: +* Performance Translators:: +* Features Translators:: +* Miscellaneous Translators:: +@end menu + +This chapter documents all the available GlusterFS translators in detail. +Each translator section will show its name (for example, @command{cluster/unify}), +briefly describe its purpose and workings, and list every option accepted by +that translator and their meaning. + +@node Storage Translators +@section Storage Translators + +The storage translators form the ``backend'' for GlusterFS. Currently, +the only available storage translator is the @acronym{POSIX} +translator, which stores files on a normal @acronym{POSIX} +filesystem. A pleasant consequence of this is that your data will +still be accessible if GlusterFS crashes or cannot be started. + +Other storage backends are planned for the future. One of the possibilities is an +Amazon S3 translator. Amazon S3 is an unlimited online storage service accessible +through a web services @acronym{API}. The S3 translator will allow you to access +the storage as a normal @acronym{POSIX} filesystem. 
+@footnote{Some more discussion about this can be found at: + +http://developer.amazonwebservices.com/connect/message.jspa?messageID=52873} + +@menu +* POSIX:: +* BDB:: +@end menu + +@node POSIX +@subsection POSIX +@example +type storage/posix +@end example + +The @command{posix} translator uses a normal @acronym{POSIX} +filesystem as its ``backend'' to actually store files and +directories. This can be any filesystem that supports extended +attributes (@acronym{EXT3}, ReiserFS, @acronym{XFS}, ...). Extended +attributes are used by some translators to store metadata, for +example, by the replicate and stripe translators. See +@ref{Replicate} and @ref{Stripe}, respectively for details. + +@cartouche +@table @code +@item directory +The directory on the local filesystem which is to be used for storage. +@end table +@end cartouche + +@node BDB +@subsection BDB +@example +type storage/bdb +@end example + +The @command{BDB} translator uses a @acronym{Berkeley DB} database as its +``backend'' to actually store files as key-value pair in the database and +directories as regular @acronym{POSIX} directories. Note that @acronym{BDB} +does not provide extended attribute support for regular files. Do not use +@acronym{BDB} as storage translator while using any translator that demands +extended attributes on ``backend''. + +@cartouche +@table @code +@item directory +The directory on the local filesystem which is to be used for storage. +@item mode [cache|persistent] (cache) +When @acronym{BDB} is run in @command{cache} mode, recovery of back-end is not completely +guaranteed. @command{persistent} guarantees that @acronym{BDB} can recover back-end from +@acronym{Berkeley DB} even if GlusterFS crashes. +@item errfile +The path of the file to be used as @command{errfile} for @acronym{Berkeley DB} to report +detailed error messages, if any. Note that all the contents of this file will be written +by @acronym{Berkeley DB}, not GlusterFS. 
+@item logdir + + +@end table +@end cartouche + +@node Client and Server Translators, Clustering Translators, Storage Translators, Translators +@section Client and Server Translators + +The client and server translator enable GlusterFS to export a +translator tree over the network or access a remote GlusterFS +server. These two translators implement GlusterFS's network protocol. + +@menu +* Transport modules:: +* Client protocol:: +* Server protocol:: +@end menu + +@node Transport modules +@subsection Transport modules +The client and server translators are capable of using any of the +pluggable transport modules. Currently available transport modules are +@command{tcp}, which uses a @acronym{TCP} connection between client +and server to communicate; @command{ib-sdp}, which uses a +@acronym{TCP} connection over InfiniBand, and @command{ibverbs}, which +uses high-speed InfiniBand connections. + +Each transport module comes in two different versions, one to be used on +the server side and the other on the client side. + +@subsubsection TCP + +The @acronym{TCP} transport module uses a @acronym{TCP/IP} connection between +the server and the client. + +@example + option transport-type tcp +@end example + +The @acronym{TCP} client module accepts the following options: + +@cartouche +@table @code +@item non-blocking-connect [no|off|on|yes] (on) +Whether to make the connection attempt asynchronous. +@item remote-port (6996) +Server port to connect to. +@cindex DNS round robin +@item remote-host * +Hostname or @acronym{IP} address of the server. If the host name resolves to +multiple IP addresses, all of them will be tried in a round-robin fashion. This +feature can be used to implement fail-over. +@end table +@end cartouche + +The @acronym{TCP} server module accepts the following options: + +@cartouche +@table @code +@item bind-address
(0.0.0.0) +The local interface on which the server should listen to requests. Default is to +listen on all interfaces. +@item listen-port (6996) +The local port to listen on. +@end table +@end cartouche + +@subsubsection IB-SDP +@example + option transport-type ib-sdp +@end example + +The kernel implements a socket interface for InfiniBand hardware. SDP runs on top of ib-verbs. +This module accepts the same options as @command{tcp}. + +@subsubsection ibverbs + +@example + option transport-type ib-verbs +@end example + +@cindex infiniband transport + +InfiniBand is a scalable switched fabric interconnect mechanism +primarily used in high-performance computing. InfiniBand can deliver +data throughput of the order of 10 Gbit/s, with latencies of 4-5 ms. + +The @command{ib-verbs} transport accesses the InfiniBand hardware through +the ``verbs'' @acronym{API}, which is the lowest level of software access possible +and which gives the highest performance. On InfiniBand hardware, it is always +best to use @command{ib-verbs}. Use @command{ib-sdp} only if you cannot get +@command{ib-verbs} working for some reason. + +The @command{ib-verbs} client module accepts the following options: + +@cartouche +@table @code +@item non-blocking-connect [no|off|on|yes] (on) +Whether to make the connection attempt asynchronous. +@item remote-port (6996) +Server port to connect to. +@cindex DNS round robin +@item remote-host * +Hostname or @acronym{IP} address of the server. If the host name resolves to +multiple IP addresses, all of them will be tried in a round-robin fashion. This +feature can be used to implement fail-over. +@end table +@end cartouche + +The @command{ib-verbs} server module accepts the following options: + +@cartouche +@table @code +@item bind-address
(0.0.0.0) +The local interface on which the server should listen to requests. Default is to +listen on all interfaces. +@item listen-port (6996) +The local port to listen on. +@end table +@end cartouche + +The following options are common to both the client and server modules: + +If you are familiar with InfiniBand jargon, +the mode used by GlusterFS is ``reliable connection-oriented channel transfer''. + +@cartouche +@table @code +@item ib-verbs-work-request-send-count (64) +Length of the send queue in datagrams. [Reason to increase/decrease?] + +@item ib-verbs-work-request-recv-count (64) +Length of the receive queue in datagrams. [Reason to increase/decrease?] + +@item ib-verbs-work-request-send-size (128KB) +Size of each datagram that is sent. [Reason to increase/decrease?] + +@item ib-verbs-work-request-recv-size (128KB) +Size of each datagram that is received. [Reason to increase/decrease?] + +@item ib-verbs-port (1) +Port number for ib-verbs. + +@item ib-verbs-mtu [256|512|1024|2048|4096] (2048) +The Maximum Transmission Unit [Reason to increase/decrease?] + +@item ib-verbs-device-name (first device in the list) +InfiniBand device to be used. +@end table +@end cartouche + +For maximum performance, you should ensure that the send/receive counts on both +the client and server are the same. + +ib-verbs is preferred over ib-sdp. + +@node Client protocol +@subsection Client +@example +type protocol/client +@end example + +The client translator enables the GlusterFS client to access a remote server's +translator tree. + +@cartouche +@table @code + +@item transport-type [tcp,ib-sdp,ib-verbs] (tcp) +The transport type to use. You should use the client versions of all the +transport modules (@command{tcp}, @command{ib-sdp}, +@command{ib-verbs}). +@item remote-subvolume * +The name of the volume on the remote host to attach to. Note that +this is @emph{not} the name of the @command{protocol/server} volume on the +server. It should be any volume under the server. 
+@item transport-timeout (120 seconds) +Inactivity timeout. If a reply is expected and no activity takes place +on the connection within this time, the transport connection will be +broken, and a new connection will be attempted. +@end table +@end cartouche + +@node Server protocol +@subsection Server +@example +type protocol/server +@end example + +The server translator exports a translator tree and makes it accessible to +remote GlusterFS clients. + +@cartouche +@table @code +@item client-volume-filename (/glusterfs-client.vol) +The volume specification file to use for the client. This is the file the +client will receive when it is invoked with the @command{--server} option +(@ref{Client}). + +@item transport-type [tcp,ib-verbs,ib-sdp] (tcp) +The transport to use. You should use the server versions of all the transport +modules (@command{tcp}, @command{ib-sdp}, @command{ib-verbs}). + +@item auth.addr.<volume name>.allow <IP address wildcard pattern> +IP addresses of the clients that are allowed to attach to the specified volume. +This can be a wildcard. For example, a wildcard of the form @command{192.168.*.*} +allows any host in the @command{192.168.x.x} subnet to connect to the server. + +@end table +@end cartouche + +@node Clustering Translators +@section Clustering Translators + +The clustering translators are the most important GlusterFS +translators, since it is these that make GlusterFS a cluster +filesystem. These translators together enable GlusterFS to access an +arbitrarily large amount of storage, and provide @acronym{RAID}-like +redundancy and distribution over the entire cluster. + +There are three clustering translators: @strong{unify}, @strong{replicate}, +and @strong{stripe}. The unify translator aggregates storage from +many server nodes. The replicate translator provides file replication. The stripe +translator allows a file to be spread across many server nodes. The following sections +look at each of these translators in detail. 
+ +@menu +* Unify:: +* Replicate:: +* Stripe:: +@end menu + +@node Unify +@subsection Unify +@cindex unify (translator) +@cindex scheduler (unify) +@example +type cluster/unify +@end example + +The unify translator presents a `unified' view of all its sub-volumes. That is, +it makes the union of all its sub-volumes appear as a single volume. It is the +unify translator that gives GlusterFS the ability to access an arbitrarily +large amount of storage. + +For unify to work correctly, certain invariants need to be maintained across +the entire network. These are: + +@cindex unify invariants +@itemize +@item The directory structure of all the sub-volumes must be identical. +@item A particular file can exist on only one of the sub-volumes. Phrasing it in another way, a pathname such as @command{/home/calvin/homework.txt} is unique across the entire cluster. +@end itemize + +@tex +\vfill +@end tex +@page + +@center @image{unify,44pc,,,.pdf} + +Looking at the second requirement, you might wonder how one can +accomplish storing redundant copies of a file, if no file can exist +multiple times. To answer, we must remember that these invariants are +from @emph{unify's perspective}. A translator such as replicate at a lower +level in the translator tree than unify may subvert this picture. + +The first invariant might seem quite tedious to ensure. We shall see +later that this is not so, since unify's @emph{self-heal} mechanism +takes care of maintaining it. + +The second invariant implies that unify needs some way to decide which file goes where. +Unify makes use of @emph{scheduler} modules for this purpose. + +When a file needs to be created, unify's scheduler decides upon the +sub-volume to be used to store the file. There are many schedulers +available, each using a different algorithm and suitable for different +purposes. + +The various schedulers are described in detail in the sections that follow. 
+ +@subsubsection ALU +@cindex alu (scheduler) + +@example + option scheduler alu +@end example + +ALU stands for "Adaptive Least Usage". It is the most advanced +scheduler available in GlusterFS. It balances the load across volumes +taking several factors into account. It adapts itself to changing I/O +patterns according to its configuration. When properly configured, it +can eliminate the need for regular tuning of the filesystem to keep +volume load nicely balanced. + +The ALU scheduler is composed of multiple least-usage +sub-schedulers. Each sub-scheduler keeps track of a certain type of +load, for each of the sub-volumes, getting statistics from +the sub-volumes themselves. The sub-schedulers are these: + +@itemize +@item disk-usage: The used and free disk space on the volume. + +@item read-usage: The amount of reading done from this volume. + +@item write-usage: The amount of writing done to this volume. + +@item open-files-usage: The number of files currently open from this volume. + +@item disk-speed-usage: The speed at which the disks are spinning. This is a constant value and therefore not very useful. +@end itemize + +The ALU scheduler needs to know which of these sub-schedulers to use, +and in which order to evaluate them. This is done through the +@command{option alu.order} configuration directive. + +Each sub-scheduler needs to know two things: when to kick in (the +entry-threshold), and how long to stay in control (the +exit-threshold). For example: when unifying three disks of 100GB, +keeping an exact balance of disk-usage is not necessary. Instead, there +could be a 1GB margin, which can be used to nicely balance other +factors, such as read-usage. The disk-usage scheduler can be told to +kick in only when a certain threshold of discrepancy is passed, such +as 1GB. When it assumes control under this condition, it will write +all subsequent data to the least-used volume. 
If it is doing so, it is +unwise to stop right after the values are below the entry-threshold +again, since that would make it very likely that the situation will +occur again very soon. Such a situation would cause the ALU to spend +most of its time disk-usage scheduling, which is unfair to the other +sub-schedulers. The exit-threshold therefore defines the amount of +data that needs to be written to the least-used disk, before control +is relinquished again. + +In addition to the sub-schedulers, the ALU scheduler also has "limits" +options. These can stop the creation of new files on a volume once +values drop below a certain threshold. For example, setting +@command{option alu.limits.min-free-disk 5GB} will stop the scheduling +of files to volumes that have less than 5GB of free disk space, +leaving the files on that disk some room to grow. + +The actual values you assign to the thresholds for sub-schedulers and +limits depend on your situation. If you have fast-growing files, +you'll want to stop file-creation on a disk much earlier than when +hardly any of your files are growing. If you care less about +disk-usage balance than about read-usage balance, you'll want a bigger +disk-usage scheduler entry-threshold and a smaller read-usage +scheduler entry-threshold. + +For thresholds defining a size, values specifying "KB", "MB" and "GB" +are allowed. For example: @command{option alu.limits.min-free-disk 5GB}. 
+ +@cartouche +@table @code +@item alu.order * ("disk-usage:write-usage:read-usage:open-files-usage:disk-speed") +@item alu.disk-usage.entry-threshold (1GB) +@item alu.disk-usage.exit-threshold (512MB) +@item alu.write-usage.entry-threshold <%> (25) +@item alu.write-usage.exit-threshold <%> (5) +@item alu.read-usage.entry-threshold <%> (25) +@item alu.read-usage.exit-threshold <%> (5) +@item alu.open-files-usage.entry-threshold (1000) +@item alu.open-files-usage.exit-threshold (100) +@item alu.limits.min-free-disk <%> +@item alu.limits.max-open-files +@end table +@end cartouche + +@subsubsection Round Robin (RR) +@cindex rr (scheduler) + +@example + option scheduler rr +@end example + +Round-Robin (RR) scheduler creates files in a round-robin +fashion. Each client will have its own round-robin loop. When your +files are mostly similar in size and I/O access pattern, this +scheduler is a good choice. RR scheduler checks for free disk space +on the server before scheduling, so you can know when to add +another server node. The default value of min-free-disk is 5% and is +checked on file creation calls, with at least 10 seconds (by default) +elapsing between two checks. + +Options: +@cartouche +@table @code +@item rr.limits.min-free-disk <%> (5) +Minimum free disk space a node must have for RR to schedule a file to it. +@item rr.refresh-interval (10 seconds) +Time between two successive free disk space checks. +@end table +@end cartouche + +@subsubsection Random +@cindex random (scheduler) + +@example + option scheduler random +@end example + +The random scheduler schedules file creation randomly among its child nodes. +Like the round-robin scheduler, it also checks for a minimum amount of free disk +space before scheduling a file to a node. + +@cartouche +@table @code +@item random.limits.min-free-disk <%> (5) +Minimum free disk space a node must have for random to schedule a file to it. 
+@item random.refresh-interval (10 seconds) +Time between two successive free disk space checks. +@end table +@end cartouche + +@subsubsection NUFA +@cindex nufa (scheduler) + +@example + option scheduler nufa +@end example + +It is common in many GlusterFS computing environments for all deployed +machines to act as both servers and clients. For example, a +research lab may have 40 workstations each with its own storage. All +of these workstations might act as servers exporting a volume as well +as clients accessing the entire cluster's storage. In such a +situation, it makes sense to store locally created files on the local +workstation itself (assuming files are accessed most by the +workstation that created them). The Non-Uniform File Allocation (@acronym{NUFA}) +scheduler accomplishes that. + +@acronym{NUFA} gives the local system first priority for file creation +over other nodes. If the local volume does not have more free disk space +than a specified amount (5% by default) then @acronym{NUFA} schedules files +among the other child volumes in a round-robin fashion. + +@acronym{NUFA} is named after the similar strategy used for memory access, +@acronym{NUMA}@footnote{Non-Uniform Memory Access: +@indicateurl{http://en.wikipedia.org/wiki/Non-Uniform_Memory_Access}}. + +@cartouche +@table @code +@item nufa.limits.min-free-disk <%> (5) +Minimum disk space that must be free (local or remote) for @acronym{NUFA} to schedule a +file to it. +@item nufa.refresh-interval (10 seconds) +Time between two successive free disk space checks. +@item nufa.local-volume-name +The name of the volume corresponding to the local system. This volume must be +one of the children of the unify volume. This option is mandatory. +@end table +@end cartouche + +@cindex namespace +@subsubsection Namespace +Namespace volume needed because: + - persistent inode numbers. + - file exists even when node is down. + +namespace files are simply touched. on every lookup it is checked. 
+ +@cartouche +@table @code +@item namespace * +Name of the namespace volume (which should be one of the unify volume's children). +@item self-heal [on|off] (on) +Enable/disable self-heal. Unless you know what you are doing, do not disable self-heal. +@end table +@end cartouche + +@cindex self heal (unify) +@subsubsection Self Heal + * When a 'lookup()/stat()' call is made on a directory for the first +time, a self-heal call is made, which checks for the consistency of +its child nodes. If an entry is present in a storage node, but not in +the namespace, that entry is created in the namespace, and vice versa. A +writedir() API has been introduced for this purpose. It also +checks for permissions, and uid/gid consistencies. + + * This check is also done when a server goes down and comes up. + + * If one starts with an empty namespace export, but has data in +storage nodes, a 'find .>/dev/null' or 'ls -lR >/dev/null' should help +to build namespace in one shot. Even otherwise, namespace is built on +demand when a file is looked up for the first time. + +NOTE: There are some issues (Kernel 'Oops' msgs) seen with fuse-2.6.3, +when someone deletes namespace in backend, when glusterfs is +running. But with fuse-2.6.5, this issue is not there. + +@node Replicate +@subsection Replicate (formerly AFR) +@cindex Replicate +@example +type cluster/replicate +@end example + +Replicate provides @acronym{RAID}-1 like functionality for +GlusterFS. Replicate replicates files and directories across the +subvolumes. Hence if Replicate has four subvolumes, there will be +four copies of all files and directories. Replicate provides +high-availability, i.e., in case one of the subvolumes goes down +(e.g., server crash, network disconnection) Replicate will still +service the requests using the redundant copies. + +Replicate also provides self-heal functionality, i.e., in case the +crashed servers come up, the outdated files and directories will be +updated with the latest versions. 
Replicate uses extended +attributes of the backend file system to track the versioning of files +and directories and provide the self-heal feature. + +@example +volume replicate-example + type cluster/replicate + subvolumes brick1 brick2 brick3 +end-volume +@end example + +This sample configuration will replicate all directories and files on +brick1, brick2 and brick3. + +All the read operations happen from the first alive child. If all the +three sub-volumes are up, reads will be done from brick1; if brick1 is +down read will be done from brick2. In case read() was being done on +brick1 and it goes down, replicate transparently falls back to +brick2. + +The next release of GlusterFS will add the following features: +@itemize +@item Ability to specify the sub-volume from which read operations are to be done (this will help users who have one of the sub-volumes as a local storage volume). +@item Allow scheduling of read operations amongst the sub-volumes in a round-robin fashion. +@end itemize + +The order of the subvolumes list should be same across all the 'replicate's as +they will be used for locking purposes. + +@cindex self heal (replicate) +@subsubsection Self Heal +Replicate has self-heal feature, which updates the outdated file and +directory copies by the most recent versions. For example consider the +following config: + +@example +volume replicate-example + type cluster/replicate + subvolumes brick1 brick2 +end-volume +@end example + +@subsubsection File self-heal + +Now if we create a file foo.txt on replicate-example, the file will be created +on brick1 and brick2. The file will have two extended attributes associated +with it in the backend filesystem. One is trusted.afr.createtime and the +other is trusted.afr.version. The trusted.afr.createtime xattr has the +create time (in terms of seconds since epoch) and trusted.afr.version +is a number that is incremented each time a file is modified. 
This increment +happens during close (in case any write was done before close). + +If brick1 goes down and we edit foo.txt, the version gets incremented. When +brick1 comes back up and we open() foo.txt, replicate will check whether +the versions are the same. If they are not the same, the outdated copy is +replaced by the latest copy and its version is updated. After the sync +the open() proceeds in the usual manner and the application calling open() +can continue on its access to the file. + +If brick1 goes down, we delete foo.txt and create a file with the same +name again, i.e., foo.txt. When brick1 comes back up, clearly there is a +chance that the version on brick1 is greater than the version on brick2; +this is where the createtime extended attribute helps in deciding which +copy is the outdated one. Hence we need to consider both createtime and +version to decide on the latest copy. + +The version attribute is incremented during the close() call. Version +will not be incremented in case there was no write() done. In case the +fd that the close() gets was got by create() call, we also create +the createtime extended attribute. + +@subsubsection Directory self-heal + +Suppose brick1 goes down, we delete foo.txt, and brick1 comes back up. Now +we should not create foo.txt on brick2 but we should delete foo.txt +on brick1. We handle this situation by having the createtime and version +attribute on the directory similar to the file. When lookup() is done +on the directory, we compare the createtime/version attributes of the +copies and see which files need to be deleted and delete those files +and update the extended attributes of the outdated directory copy. +Each time a directory is modified (a file or a subdirectory is created +or deleted inside the directory) and one of the subvols is down, we +increment the directory's version. + +lookup() is a call initiated by the kernel on a file or directory +just before any access to that file or directory. 
In glusterfs, by +default, lookup() will not be called in case it was called in the +past one second on that particular file or directory. + +The extended attributes can be seen in the backend filesystem using +the @command{getfattr} command. (@command{getfattr -n trusted.afr.version <file>}) + +@cartouche +@table @code +@item debug [on|off] (off) +@item self-heal [on|off] (on) +@item replicate (*:1) +@item lock-node (first child is used by default) +@end table +@end cartouche + +@node Stripe +@subsection Stripe +@cindex stripe (translator) +@example +type cluster/stripe +@end example + +The stripe translator distributes the contents of a file over its +sub-volumes. It does this by creating a file equal in size to the +total size of the file on each of its sub-volumes. It then writes only +a part of the file to each sub-volume, leaving the rest of it empty. +These empty regions are called `holes' in Unix terminology. The holes +do not consume any disk space. + +The diagram below makes this clear. + +@center @image{stripe,44pc,,,.pdf} + +You can configure stripe so that only filenames matching a pattern +are striped. You can also configure the size of the data to be stored +on each sub-volume. + +@cartouche +@table @code +@item block-size <pattern>:<size> (*:0 no striping) +Distribute files matching @command{<pattern>} over the sub-volumes, +storing at least @command{<size>} on each sub-volume. For example, + +@example + option block-size *.mpg:1M +@end example + +distributes all files ending in @command{.mpg}, storing at least 1 MB on +each sub-volume. + +Any number of @command{block-size} option lines may be present, specifying +different sizes for different file name patterns. 
+@end table +@end cartouche + +@node Performance Translators +@section Performance Translators + +@menu +* Read Ahead:: +* Write Behind:: +* IO Threads:: +* IO Cache:: +* Booster:: +@end menu + +@node Read Ahead +@subsection Read Ahead +@cindex read-ahead (translator) +@example +type performance/read-ahead +@end example + +The read-ahead translator pre-fetches data in advance on every read. +This benefits applications that mostly process files in sequential order, +since the next block of data will already be available by the time the +application is done with the current one. + +Additionally, the read-ahead translator also behaves as a read-aggregator. +Many small read operations are combined and issued as fewer, larger read +requests to the server. + +Read-ahead deals in ``pages'' as the unit of data fetched. The page size +is configurable, as is the ``page count'', which is the number of pages +that are pre-fetched. + +Read-ahead is best used with InfiniBand (using the ib-verbs transport). +On FastEthernet and Gigabit Ethernet networks, +GlusterFS can achieve the link-maximum throughput even without +read-ahead, making it quite superfluous. + +Note that read-ahead only happens if the reads are perfectly +sequential. If your application accesses data in a random fashion, +using read-ahead might actually lead to a performance loss, since +read-ahead will pointlessly fetch pages which won't be used by the +application. + +@cartouche +Options: +@table @code +@item page-size (256KB) +The unit of data that is pre-fetched. +@item page-count (2) +The number of pages that are pre-fetched. +@item force-atime-update [on|off|yes|no] (off|no) +Whether to force an access time (atime) update on the file on every read. Without +this, the atime will be slightly imprecise, as it will reflect the time when +the read-ahead translator read the data, not when the application actually read it. 
+@end table +@end cartouche + +@node Write Behind +@subsection Write Behind +@cindex write-behind (translator) +@example +type performance/write-behind +@end example + +The write-behind translator improves the latency of a write operation. +It does this by relegating the write operation to the background and +returning to the application even as the write is in progress. Using the +write-behind translator, successive write requests can be pipelined. +This mode of write-behind operation is best used on the client side, to +enable decreased write latency for the application. + +The write-behind translator can also aggregate write requests. If the +@command{aggregate-size} option is specified, then successive writes up to that +size are accumulated and written in a single operation. This mode of operation +is best used on the server side, as this will decrease the disk's head movement +when multiple files are being written to in parallel. + +The @command{aggregate-size} option has a default value of 128KB. Although +this works well for most users, you should always experiment with different values +to determine the one that will deliver maximum performance. This is because the +performance of write-behind depends on your interconnect, size of RAM, and the +work load. + +@cartouche +@table @code +@item aggregate-size (128KB) +Amount of data to accumulate before doing a write. +@item flush-behind [on|yes|off|no] (off|no) + +@end table +@end cartouche + +@node IO Threads +@subsection IO Threads +@cindex io-threads (translator) +@example +type performance/io-threads +@end example + +The IO threads translator is intended to increase the responsiveness +of the server to metadata operations by doing file I/O (read, write) +in a background thread. Since the GlusterFS server is +single-threaded, using the IO threads translator can significantly +improve performance. This translator is best used on the server side, +loaded just below the server protocol translator. 
+ +IO threads operates by handing out read and write requests to a separate thread. +The total number of threads in existence at a time is constant, and configurable. + +@cartouche +@table @code +@item thread-count (1) +Number of threads to use. +@end table +@end cartouche + +@node IO Cache +@subsection IO Cache +@cindex io-cache (translator) +@example +type performance/io-cache +@end example + +The IO cache translator caches data that has been read. This is useful +if many applications read the same data multiple times, and if reads +are much more frequent than writes (for example, IO caching may be +useful in a web hosting environment, where most clients will simply +read some files and only a few will write to them). + +The IO cache translator reads data from its child in @command{page-size} chunks. +It caches data up to @command{cache-size} bytes. The cache is maintained as +a prioritized least-recently-used (@acronym{LRU}) list, with priorities determined +by user-specified patterns to match filenames. + +When the IO cache translator detects a write operation, the +cache for that file is flushed. + +The IO cache translator periodically verifies the consistency of +cached data, using the modification times on the files. The verification timeout +is configurable. + +@cartouche +@table @code +@item page-size (128KB) +Size of a page. +@item cache-size (n) (32MB) +Total amount of data to be cached. +@item force-revalidate-timeout (1) +Timeout to force a cache consistency verification, in seconds. +@item priority (*:0) +Filename patterns listed in order of priority. +@end table +@end cartouche + +@node Booster +@subsection Booster +@cindex booster +@example + type performance/booster +@end example + +The booster translator gives applications a faster path to communicate +read and write requests to GlusterFS. Normally, all requests to GlusterFS from +applications go through FUSE, as indicated in @ref{Filesystems in Userspace}. 
+Using the booster translator in conjunction with the GlusterFS booster shared +library, an application can bypass the FUSE path and send read/write requests +directly to the GlusterFS client process. + +The booster mechanism consists of two parts: the booster translator, +and the booster shared library. The booster translator is meant to be +loaded on the client side, usually at the root of the translator tree. +The booster shared library should be @command{LD_PRELOAD}ed with the +application. + +The booster translator when loaded opens a Unix domain socket and +listens for read/write requests on it. The booster shared library +intercepts read and write system calls and sends the requests to the +GlusterFS process directly using the Unix domain socket, bypassing FUSE. +This leads to superior performance. + +Once you've loaded the booster translator in your volume specification file, you +can start your application as: + +@example + $ LD_PRELOAD=/usr/local/bin/glusterfs-booster.so your_app +@end example + +The booster translator accepts no options. + +@node Features Translators +@section Features Translators + +@menu +* POSIX Locks:: +* Fixed ID:: +@end menu + +@node POSIX Locks +@subsection POSIX Locks +@cindex record locking +@cindex fcntl +@cindex posix-locks (translator) +@example +type features/posix-locks +@end example + +This translator provides storage independent POSIX record locking +support (@command{fcntl} locking). Typically you'll want to load this on the +server side, just above the @acronym{POSIX} storage translator. Using this +translator you can get both advisory locking and mandatory locking +support. It also handles @command{flock()} locks properly. + +Caveat: Consider a file that does not have its mandatory locking bits +(+setgid, -group execution) turned on. Assume that this file is now +opened by a process on a client that has the write-behind xlator +loaded. 
The write-behind xlator does not cache anything for files +which have mandatory locking enabled, to avoid incoherence. Let's say +that mandatory locking is now enabled on this file through another +client. The former client will not know about this change, and +write-behind may erroneously report a write as being successful when +in fact it would fail due to the region it is writing to being locked. + +There seems to be no easy way to fix this. To work around this +problem, it is recommended that you never enable the mandatory bits on +a file while it is open. + +@cartouche +@table @code +@item mandatory [on|off] (on) +Turns mandatory locking on. +@end table +@end cartouche + +@node Fixed ID +@subsection Fixed ID +@cindex fixed-id (translator) +@example +type features/fixed-id +@end example + +The fixed ID translator makes all filesystem requests from the client +to appear to be coming from a fixed, specified +@acronym{UID}/@acronym{GID}, regardless of which user actually +initiated the request. + +@cartouche +@table @code +@item fixed-uid [if not set, not used] +The @acronym{UID} to send to the server +@item fixed-gid [if not set, not used] +The @acronym{GID} to send to the server +@end table +@end cartouche + +@node Miscellaneous Translators +@section Miscellaneous Translators + +@menu +* ROT-13:: +* Trace:: +@end menu + +@node ROT-13 +@subsection ROT-13 +@cindex rot-13 (translator) +@example +type encryption/rot-13 +@end example + +@acronym{ROT-13} is a toy translator that can ``encrypt'' and ``decrypt'' file +contents using the @acronym{ROT-13} algorithm. @acronym{ROT-13} is a trivial +algorithm that rotates each alphabet by thirteen places. Thus, 'A' becomes 'N', +'B' becomes 'O', and 'Z' becomes 'M'. + +It goes without saying that you shouldn't use this translator if you need +@emph{real} encryption (a future release of GlusterFS will have real encryption +translators). 
+ +@cartouche +@table @code +@item encrypt-write [on|off] (on) +Whether to encrypt on write +@item decrypt-read [on|off] (on) +Whether to decrypt on read +@end table +@end cartouche + +@node Trace +@subsection Trace +@cindex trace (translator) +@example +type debug/trace +@end example + +The trace translator is intended for debugging purposes. When loaded, it +logs all the system calls received by the server or client (wherever +trace is loaded), their arguments, and the results. You must use a GlusterFS log +level of DEBUG (See @ref{Running GlusterFS}) for trace to work. + +Sample trace output (lines have been wrapped for readability): +@cartouche +@example +2007-10-30 00:08:58 D [trace.c:1579:trace_opendir] trace: callid: 68 +(*this=0x8059e40, loc=0x8091984 @{path=/iozone3_283, inode=0x8091f00@}, + fd=0x8091d50) + +2007-10-30 00:08:58 D [trace.c:630:trace_opendir_cbk] trace: +(*this=0x8059e40, op_ret=4, op_errno=1, fd=0x8091d50) + +2007-10-30 00:08:58 D [trace.c:1602:trace_readdir] trace: callid: 69 +(*this=0x8059e40, size=4096, offset=0 fd=0x8091d50) + +2007-10-30 00:08:58 D [trace.c:215:trace_readdir_cbk] trace: +(*this=0x8059e40, op_ret=0, op_errno=0, count=4) + +2007-10-30 00:08:58 D [trace.c:1624:trace_closedir] trace: callid: 71 +(*this=0x8059e40, *fd=0x8091d50) + +2007-10-30 00:08:58 D [trace.c:809:trace_closedir_cbk] trace: +(*this=0x8059e40, op_ret=0, op_errno=1) +@end example +@end cartouche + +@node Usage Scenarios +@chapter Usage Scenarios + +@section Advanced Striping + +This section is based on the Advanced Striping tutorial written by +Anand Avati on the GlusterFS wiki +@footnote{http://gluster.org/docs/index.php/Mixing_Striped_and_Regular_Files}. + +@subsection Mixed Storage Requirements + +There are two ways of scheduling the I/O. One at file level (using +unify translator) and other at block level (using stripe +translator). 
Striped I/O is good for files that are potentially large +and require high parallel throughput (for example, a single file of +400GB being accessed by 100s and 1000s of systems simultaneously and +randomly). For most of the cases, file level scheduling works best. + +In the real world, it is desirable to mix file level and block level +scheduling on a single storage volume. Alternatively users can choose +to have two separate volumes and hence two mount points, but the +applications may demand a single storage system to host both. + +This document explains how to mix file level scheduling with stripe. + +@subsection Configuration Brief + +This setup demonstrates how users can configure unify translator with +appropriate I/O scheduler for file level scheduling and stripe only for +matching patterns. This way, GlusterFS chooses appropriate I/O profile +and knows how to efficiently handle both the types of data. + +A simple technique to achieve this effect is to create a stripe set of +unify and stripe blocks, where unify is the first sub-volume. Files +that do not match the stripe policy are passed on to the first unify +sub-volume and in turn scheduled across the cluster using its file +level I/O scheduler. + +@image{advanced-stripe,44pc,,,.pdf} + +@subsection Preparing GlusterFS Environment + +Create the directories /export/for-namespace, /export/for-unify and +/export/for-stripe on all the storage bricks. + + Place the following server and client volume spec file under +/etc/glusterfs (or appropriate installed path) and replace the IP +addresses / access control fields to match your environment.
+ +@cartouche +@example + ## file: /etc/glusterfs/glusterfsd.vol + volume posix-unify + type storage/posix + option directory /export/for-unify + end-volume + + volume posix-stripe + type storage/posix + option directory /export/for-stripe + end-volume + + volume posix-namespace + type storage/posix + option directory /export/for-namespace + end-volume + + volume server + type protocol/server + option transport-type tcp + option auth.addr.posix-unify.allow 192.168.1.* + option auth.addr.posix-stripe.allow 192.168.1.* + option auth.addr.posix-namespace.allow 192.168.1.* + subvolumes posix-unify posix-stripe posix-namespace + end-volume +@end example +@end cartouche + +@cartouche +@example + ## file: /etc/glusterfs/glusterfs.vol + volume client-namespace + type protocol/client + option transport-type tcp + option remote-host 192.168.1.1 + option remote-subvolume posix-namespace + end-volume + + volume client-unify-1 + type protocol/client + option transport-type tcp + option remote-host 192.168.1.1 + option remote-subvolume posix-unify + end-volume + + volume client-unify-2 + type protocol/client + option transport-type tcp + option remote-host 192.168.1.2 + option remote-subvolume posix-unify + end-volume + + volume client-unify-3 + type protocol/client + option transport-type tcp + option remote-host 192.168.1.3 + option remote-subvolume posix-unify + end-volume + + volume client-unify-4 + type protocol/client + option transport-type tcp + option remote-host 192.168.1.4 + option remote-subvolume posix-unify + end-volume + + volume client-stripe-1 + type protocol/client + option transport-type tcp + option remote-host 192.168.1.1 + option remote-subvolume posix-stripe + end-volume + + volume client-stripe-2 + type protocol/client + option transport-type tcp + option remote-host 192.168.1.2 + option remote-subvolume posix-stripe + end-volume + + volume client-stripe-3 + type protocol/client + option transport-type tcp + option remote-host 192.168.1.3 + option 
remote-subvolume posix-stripe + end-volume + + volume client-stripe-4 + type protocol/client + option transport-type tcp + option remote-host 192.168.1.4 + option remote-subvolume posix-stripe + end-volume + + volume unify + type cluster/unify + option scheduler rr + subvolumes client-unify-1 client-unify-2 client-unify-3 client-unify-4 + end-volume + + volume stripe + type cluster/stripe + option block-size *.img:2MB # All files ending with .img are striped with 2MB stripe block size. + subvolumes unify client-stripe-1 client-stripe-2 client-stripe-3 client-stripe-4 + end-volume +@end example +@end cartouche + + +Bring up the Storage + +Starting GlusterFS Server: If you have installed through binary +package, you can start the service through init.d startup script. If +not: + +@example +[root@@server]# glusterfsd +@end example + +Mounting GlusterFS Volumes: + +@example +[root@@client]# glusterfs -s [BRICK-IP-ADDRESS] /mnt/cluster +@end example + +Improving upon this Setup + +Infiniband Verbs RDMA transport is much faster than TCP/IP GigE +transport. + +Use of performance translators such as read-ahead, write-behind, +io-cache, io-threads, booster is recommended. + +Replace round-robin (rr) scheduler with ALU to handle more dynamic +storage environments. + +@node Troubleshooting +@chapter Troubleshooting + +This chapter is a general troubleshooting guide to GlusterFS. It lists +common GlusterFS server and client error messages, debugging hints, and +concludes with the suggested procedure to report bugs in GlusterFS. + +@section GlusterFS error messages + +@subsection Server errors + +@example +glusterfsd: FATAL: could not open specfile: +'/etc/glusterfs/glusterfsd.vol' +@end example + +The GlusterFS server expects the volume specification file to be +at @command{/etc/glusterfs/glusterfsd.vol}. The example +specification file will be installed as +@command{/etc/glusterfs/glusterfsd.vol.sample}.
You need to edit +it and rename it, or provide a different specification file using +the @command{--spec-file} command line option (See @ref{Server}). + +@vskip 4ex + +@example +gf_log_init: failed to open logfile "/usr/var/log/glusterfs/glusterfsd.log" + (Permission denied) +@end example + +You don't have permission to create files in the +@command{/usr/var/log/glusterfs} directory. Make sure you are running +GlusterFS as root. Alternatively, specify a different path for the log +file using the @command{--log-file} option (See @ref{Server}). + +@subsection Client errors + +@example +fusermount: failed to access mountpoint /mnt: + Transport endpoint is not connected +@end example + +A previous failed (or hung) mount of GlusterFS is preventing it from being +mounted again in the same location. The fix is to do: + +@example +# umount /mnt +@end example + +and try mounting again. + +@vskip 4ex + +@strong{``Transport endpoint is not connected''.} + +If you get this error when you try a command such as @command{ls} or @command{cat}, +it means the GlusterFS mount did not succeed. Try running GlusterFS in @command{DEBUG} +logging level and study the log messages to discover the cause. + +@vskip 4ex + +@strong{``Connect to server failed'', ``SERVER-ADDRESS: Connection refused''.} + +The GlusterFS server is not running or has died. Check your network +connections and firewall settings. To check if the server is reachable, +try: + +@example +telnet IP-ADDRESS 6996 +@end example + +If the server is accessible, your `telnet' command should connect and +block. If not you will see an error message such as @command{telnet: Unable to +connect to remote host: Connection refused}. 6996 is the default +GlusterFS port. If you have changed it, then use the corresponding +port instead.
+ +@vskip 4ex + +@example +gf_log_init: failed to open logfile "/usr/var/log/glusterfs/glusterfs.log" + (Permission denied) +@end example + +You don't have permission to create files in the +@command{/usr/var/log/glusterfs} directory. Make sure you are running +GlusterFS as root. Alternatively, specify a different path for the log +file using the @command{--log-file} option (See @ref{Client}). + +@section FUSE error messages +@command{modprobe fuse} fails with: ``Unknown symbol in module, or unknown parameter''. +@cindex Redhat Enterprise Linux + +If you are using fuse-2.6.x on Redhat Enterprise Linux Work Station 4 +and Advanced Server 4 with 2.6.9-42.ELlargesmp, 2.6.9-42.ELsmp, +2.6.9-42.EL kernels and get this error while loading @acronym{FUSE} kernel +module, you need to apply the following patch. + +For fuse-2.6.2: + +@indicateurl{http://ftp.zresearch.com/pub/gluster/glusterfs/fuse/fuse-2.6.2-rhel-build.patch} + +For fuse-2.6.3: + +@indicateurl{http://ftp.zresearch.com/pub/gluster/glusterfs/fuse/fuse-2.6.3-rhel-build.patch} + +@section AppArmour and GlusterFS +@cindex AppArmour +@cindex OpenSuSE +Under OpenSuSE GNU/Linux, the AppArmour security feature does not +allow GlusterFS to create temporary files or network socket +connections even while running as root. You will see error messages +like `Unable to open log file: Operation not permitted' or `Connection +refused'. Disabling AppArmour using YaST or properly configuring +AppArmour to recognize @command{glusterfsd} or @command{glusterfs}/@command{fusermount} +should solve the problem. + +@section Reporting a bug + +If you encounter a bug in GlusterFS, please follow the below +guidelines when you report it to the mailing list. Be sure to report +it! User feedback is crucial to the health of the project and we value +it highly. 
+ +@subsection General instructions + +When running GlusterFS in a non-production environment, be sure to +build it with the following command: + +@example + $ make CFLAGS='-g -O0 -DDEBUG' +@end example + +This includes debugging information which will be helpful in getting +backtraces (see below) and also disables optimization. Enabling +optimization can result in incorrect line numbers being reported to +gdb. + +@subsection Volume specification files + +Attach all relevant server and client spec files you were using when +you encountered the bug. Also tell us details of your setup, i.e., how +many clients and how many servers. + +@subsection Log files + +Set the loglevel of your client and server programs to @acronym{DEBUG} (by +passing the -L @acronym{DEBUG} option) and attach the log files with your bug +report. Obviously, if only the client is failing (for example), you +only need to send us the client log file. + +@subsection Backtrace + +If GlusterFS has encountered a segmentation fault or has crashed for +some other reason, include the backtrace with the bug report. You can +get the backtrace using the following procedure. + +Run the GlusterFS client or server inside gdb. + +@example + $ gdb ./glusterfs + (gdb) set args -f client.spec -N -l/path/to/log/file -LDEBUG /mnt/point + (gdb) run +@end example + +Now when the process segfaults, you can get the backtrace by typing: + +@example + (gdb) bt +@end example + +If the GlusterFS process has crashed and dumped a core file (you can +find this in / if running as a daemon and in the current directory +otherwise), you can do: + +@example + $ gdb /path/to/glusterfs /path/to/core.<pid> +@end example + +and then get the backtrace. + +If the GlusterFS server or client seems to be hung, then you can get +the backtrace by attaching gdb to the process.
First get the @command{PID} of +the process (using ps), and then do: + +@example + $ gdb ./glusterfs <pid> +@end example + +Press Ctrl-C to interrupt the process and then generate the backtrace. + +@subsection Reproducing the bug + +If the bug is reproducible, please include the steps necessary to do +so. If the bug is not reproducible, send us the bug report anyway. + +@subsection Other information + +If you think it is relevant, send us also the version of @acronym{FUSE} you're +using, the kernel version, platform. + +@node GNU Free Documentation Licence +@appendix GNU Free Documentation Licence +@include fdl.texi + +@node Index +@unnumbered Index +@printindex cp + +@bye diff --git a/doc/user-guide/xlator.odg b/doc/user-guide/xlator.odg new file mode 100644 index 000000000..179a65f6e Binary files /dev/null and b/doc/user-guide/xlator.odg differ diff --git a/doc/user-guide/xlator.pdf b/doc/user-guide/xlator.pdf new file mode 100644 index 000000000..a07e14d67 Binary files /dev/null and b/doc/user-guide/xlator.pdf differ diff --git a/extras/Makefile.am b/extras/Makefile.am new file mode 100644 index 000000000..f243a0da5 --- /dev/null +++ b/extras/Makefile.am @@ -0,0 +1,13 @@ + +docdir = $(datadir)/doc/glusterfs/ +EmacsModedir = $(docdir)/ +EmacsMode_DATA = glusterfs-mode.el + +SUBDIRS = init.d benchmarking + +EXTRA_DIST = specgen.scm glusterfs.vim glusterfs-mode.el Portfile \ + test/Makefile.am test/Makefile.in test/rdd.c \ + benchmarking/Makefile.am benchmarking/Makefile.in + +CLEANFILES = + diff --git a/extras/Portfile b/extras/Portfile new file mode 100644 index 000000000..4732c38ee --- /dev/null +++ b/extras/Portfile @@ -0,0 +1,26 @@ +# $Id$ + +PortSystem 1.0 + +name glusterfs +version 2.0.0rc1 +categories fuse +maintainers amar@zresearch.com +description GlusterFS +long_description GlusterFS is a cluster file system, flexible to tune it for your needs.
+homepage http://www.gluster.org/ +platforms darwin +master_sites http://ftp.zresearch.com/pub/gluster/glusterfs/2.0/2.0.0 + +configure.args --disable-bdb +checksums md5 33c2d02344d4fab422e80cfb637e0b48 + +post-destroot { + file mkdir ${destroot}/Library/LaunchDaemons/ + file copy ${worksrcpath}/extras/glusterfs-server.plist \ + ${destroot}/Library/LaunchDaemons/com.zresearch.glusterfs.plist + + file mkdir ${destroot}/sbin/ + file copy ${worksrcpath}/xlators/mount/fuse/utils/mount_glusterfs \ + ${destroot}/sbin/ +} \ No newline at end of file diff --git a/extras/benchmarking/Makefile.am b/extras/benchmarking/Makefile.am new file mode 100644 index 000000000..16f73cbaa --- /dev/null +++ b/extras/benchmarking/Makefile.am @@ -0,0 +1,7 @@ + +docdir = $(datadir)/doc/$(PACKAGE_NAME)/benchmarking + +EXTRA_DIST = glfs-bm.c README launch-script.sh local-script.sh + +CLEANFILES = + diff --git a/extras/benchmarking/README b/extras/benchmarking/README new file mode 100644 index 000000000..e83dd8822 --- /dev/null +++ b/extras/benchmarking/README @@ -0,0 +1,18 @@ + +-------------- +Parallel DD performance: + +* Copy the local-script.sh in ${mountpoint}/benchmark/ directory +* Edit it so the blocksize and count are as per the requirements + +* Edit the launch-script.sh script to make sure paths, mountpoints etc are alright. + +* run 'launch-script.sh' + +* after the run, you can get the aggregated result by adding all the 3rd entry in output.$(hostname) entries in 'output/' directory. + +-------------- + +iozone: + +bash# iozone -+m iozone_cluster.config -t 62 -r ${block_size} -s ${file_size} -+n -i 0 -i 1 \ No newline at end of file diff --git a/extras/benchmarking/glfs-bm.c b/extras/benchmarking/glfs-bm.c new file mode 100644 index 000000000..f5e63ae80 --- /dev/null +++ b/extras/benchmarking/glfs-bm.c @@ -0,0 +1,619 @@ +/* + Copyright (c) 2008 Z RESEARCH, Inc. + This file is part of GlusterFS.
+ GlusterFS is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published + by the Free Software Foundation; either version 3 of the License, + or (at your option) any later version. + + GlusterFS is distributed in the hope that it will be useful, but + WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program. If not, see + . +*/ + +#define _GNU_SOURCE +#define __USE_FILE_OFFSET64 +#define _FILE_OFFSET_BITS 64 + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +struct state { + char need_op_write:1; + char need_op_read:1; + + char need_iface_fileio:1; + char need_iface_xattr:1; + + char need_mode_posix:1; + char need_mode_libglusterfsclient:1; + + char prefix[512]; + long int count; + + size_t block_size; + + char *specfile; + void *libglusterfsclient_context; + + long int io_size; +}; + + +#define MEASURE(func, arg) measure (func, #func, arg) + + +void +tv_difference (struct timeval *tv_stop, + struct timeval *tv_start, + struct timeval *tv_diff) +{ + if (tv_stop->tv_usec < tv_start->tv_usec) { + tv_diff->tv_usec = (tv_stop->tv_usec + 1000000) - tv_start->tv_usec; + tv_diff->tv_sec = (tv_stop->tv_sec - 1 - tv_start->tv_sec); + } else { + tv_diff->tv_usec = tv_stop->tv_usec - tv_start->tv_usec; + tv_diff->tv_sec = tv_stop->tv_sec - tv_start->tv_sec; + } +} + + +void +measure (int (*func)(struct state *state), + char *func_name, struct state *state) +{ + struct timeval tv_start, tv_stop, tv_diff; + state->io_size = 0; + long int count; + + gettimeofday (&tv_start, NULL); + count = func (state); + gettimeofday (&tv_stop, NULL); + + tv_difference (&tv_stop, &tv_start, &tv_diff); + + fprintf (stdout, "%s: count=%ld, 
size=%ld, time=%ld:%ld\n", + func_name, count, state->io_size, + tv_diff.tv_sec, tv_diff.tv_usec); +} + + +static error_t +parse_opts (int key, char *arg, + struct argp_state *_state) +{ + struct state *state = _state->input; + + switch (key) + { + case 'o': + if (strcasecmp (arg, "read") == 0) { + state->need_op_write = 0; + state->need_op_read = 1; + } else if (strcasecmp (arg, "write") == 0) { + state->need_op_write = 1; + state->need_op_read = 0; + } else if (strcasecmp (arg, "both") == 0) { + state->need_op_write = 1; + state->need_op_read = 1; + } else { + fprintf (stderr, "unknown op: %s\n", arg); + return -1; + } + break; + case 'i': + if (strcasecmp (arg, "fileio") == 0) { + state->need_iface_fileio = 1; + state->need_iface_xattr = 0; + } else if (strcasecmp (arg, "xattr") == 0) { + state->need_iface_fileio = 0; + state->need_iface_xattr = 1; + } else if (strcasecmp (arg, "both") == 0) { + state->need_iface_fileio = 1; + state->need_iface_xattr = 1; + } else { + fprintf (stderr, "unknown interface: %s\n", arg); + return -1; + } + break; + case 'm': + if (strcasecmp (arg, "posix") == 0) { + state->need_mode_posix = 1; + state->need_mode_libglusterfsclient = 0; + } else if (strcasecmp (arg, "libglusterfsclient") == 0) { + state->need_mode_posix = 0; + state->need_mode_libglusterfsclient = 1; + } else if (strcasecmp (arg, "both") == 0) { + state->need_mode_posix = 1; + state->need_mode_libglusterfsclient = 1; + } else { + fprintf (stderr, "unknown mode: %s\n", arg); + return -1; + } + break; + case 'b': + { + size_t block_size = atoi (arg); + if (!block_size) { + fprintf (stderr, "incorrect size: %s\n", arg); + return -1; + } + state->block_size = block_size; + } + break; + case 's': + state->specfile = strdup (arg); + break; + case 'p': + fprintf (stderr, "using prefix: %s\n", arg); + strncpy (state->prefix, arg, 512); + break; + case 'c': + { + long count = atol (arg); + if (!count) { + fprintf (stderr, "incorrect count: %s\n", arg); + return -1; + } + 
state->count = count; + } + break; + case ARGP_KEY_NO_ARGS: + break; + case ARGP_KEY_ARG: + break; + } + + return 0; +} + +int +do_mode_posix_iface_fileio_write (struct state *state) +{ + long int i; + int ret = -1; + char block[state->block_size]; + + for (i=0; i<state->count; i++) { + int fd = -1; + char filename[512]; + + sprintf (filename, "%s.%06ld", state->prefix, i); + + fd = open (filename, O_CREAT|O_WRONLY, 00600); + if (fd == -1) { + fprintf (stderr, "open(%s) => %s\n", filename, strerror (errno)); + break; + } + ret = write (fd, block, state->block_size); + if (ret != state->block_size) { + fprintf (stderr, "write (%s) => %d/%s\n", filename, ret, + strerror (errno)); + close (fd); + break; + } + close (fd); + state->io_size += ret; + } + + return i; +} + + +int +do_mode_posix_iface_fileio_read (struct state *state) +{ + long int i; + int ret = -1; + char block[state->block_size]; + + for (i=0; i<state->count; i++) { + int fd = -1; + char filename[512]; + + sprintf (filename, "%s.%06ld", state->prefix, i); + + fd = open (filename, O_RDONLY); + if (fd == -1) { + fprintf (stderr, "open(%s) => %s\n", filename, strerror (errno)); + break; + } + ret = read (fd, block, state->block_size); + if (ret == -1) { + fprintf (stderr, "read(%s) => %d/%s\n", filename, ret, strerror (errno)); + close (fd); + break; + } + close (fd); + state->io_size += ret; + } + + return i; +} + + +int +do_mode_posix_iface_fileio (struct state *state) +{ + if (state->need_op_write) + MEASURE (do_mode_posix_iface_fileio_write, state); + + if (state->need_op_read) + MEASURE (do_mode_posix_iface_fileio_read, state); + + return 0; +} + + +int +do_mode_posix_iface_xattr_write (struct state *state) +{ + long int i; + int ret = -1; + char block[state->block_size]; + char *dname = NULL, *dirc = NULL; + char *bname = NULL, *basec = NULL; + + dirc = strdup (state->prefix); + basec = strdup (state->prefix); + dname = dirname (dirc); + bname = basename (basec); + + for (i=0; i<state->count; i++) { + char key[512]; + +
sprintf (key, "glusterfs.file.%s.%06ld", bname, i); + + ret = lsetxattr (dname, key, block, state->block_size, 0); + + if (ret != 0) { + fprintf (stderr, "lsetxattr (%s, %s, %p) => %s\n", + dname, key, block, strerror (errno)); + break; + } + state->io_size += state->block_size; + } + + free (dirc); + free (basec); + + return i; +} + + +int +do_mode_posix_iface_xattr_read (struct state *state) +{ + long int i; + int ret = -1; + char block[state->block_size]; + char *dname = NULL, *dirc = NULL; + char *bname = NULL, *basec = NULL; + + dirc = strdup (state->prefix); + basec = strdup (state->prefix); + dname = dirname (dirc); + bname = basename (basec); + + for (i=0; i<state->count; i++) { + char key[512]; + + sprintf (key, "glusterfs.file.%s.%06ld", bname, i); + + ret = lgetxattr (dname, key, block, state->block_size); + + if (ret < 0) { + fprintf (stderr, "lgetxattr (%s, %s, %p) => %s\n", + dname, key, block, strerror (errno)); + break; + } + state->io_size += ret; + } + + return i; +} + + +int +do_mode_posix_iface_xattr (struct state *state) +{ + if (state->need_op_write) + MEASURE (do_mode_posix_iface_xattr_write, state); + + if (state->need_op_read) + MEASURE (do_mode_posix_iface_xattr_read, state); + + return 0; +} + + +int +do_mode_libglusterfsclient_iface_fileio_write (struct state *state) +{ + long int i; + int ret = -1; + char block[state->block_size]; + + for (i=0; i<state->count; i++) { + long fd = 0; + char filename[512]; + + sprintf (filename, "/%s.%06ld", state->prefix, i); + + fd = glusterfs_open (state->libglusterfsclient_context, + filename, O_CREAT|O_WRONLY, 0); + + if (fd == 0) { + fprintf (stderr, "open(%s) => %s\n", filename, strerror (errno)); + break; + } + ret = glusterfs_write (fd, block, state->block_size); + if (ret == -1) { + fprintf (stderr, "glusterfs_write(%s) => %s\n", filename, strerror (errno)); + glusterfs_close (fd); + break; + } + glusterfs_close (fd); + state->io_size += ret; + } + + return i; +} + + +int
+do_mode_libglusterfsclient_iface_fileio_read (struct state *state) +{ + long int i; + int ret = -1; + char block[state->block_size]; + + for (i=0; i<state->count; i++) { + long fd = 0; + char filename[512]; + + sprintf (filename, "/%s.%06ld", state->prefix, i); + + fd = glusterfs_open (state->libglusterfsclient_context, + filename, O_RDONLY, 0); + + if (fd == 0) { + fprintf (stderr, "glusterfs_open(%s) => %s\n", filename, strerror (errno)); + break; + } + ret = glusterfs_read (fd, block, state->block_size); + if (ret == -1) { + fprintf (stderr, "glusterfs_read(%s) => %s\n", filename, strerror (errno)); + glusterfs_close (fd); + break; + } + glusterfs_close (fd); + state->io_size += ret; + } + + return i; +} + + +int +do_mode_libglusterfsclient_iface_fileio (struct state *state) +{ + if (state->need_op_write) + MEASURE (do_mode_libglusterfsclient_iface_fileio_write, state); + + if (state->need_op_read) + MEASURE (do_mode_libglusterfsclient_iface_fileio_read, state); + + return 0; +} + + +int +do_mode_libglusterfsclient_iface_xattr_write (struct state *state) +{ + long int i; + int ret = -1; + char block[state->block_size]; + char *dname = NULL, *dirc = NULL; + char *bname = NULL, *basec = NULL; + + asprintf (&dirc, "/%s", state->prefix); + asprintf (&basec, "/%s", state->prefix); + dname = dirname (dirc); + bname = basename (basec); + + for (i=0; i<state->count; i++) { + char key[512]; + + sprintf (key, "glusterfs.file.%s.%06ld", bname, i); + + ret = glusterfs_setxattr (state->libglusterfsclient_context, + dname, key, block, state->block_size, 0); + + if (ret < 0) { + fprintf (stderr, "glusterfs_setxattr (%s, %s, %p) => %s\n", + dname, key, block, strerror (errno)); + break; + } + state->io_size += state->block_size; + } + + return i; + +} + + +int +do_mode_libglusterfsclient_iface_xattr_read (struct state *state) +{ + long int i; + int ret = -1; + char block[state->block_size]; + char *dname = NULL, *dirc = NULL; + char *bname = NULL, *basec = NULL; + + dirc = strdup
(state->prefix); + basec = strdup (state->prefix); + dname = dirname (dirc); + bname = basename (basec); + + for (i=0; i<state->count; i++) { + char key[512]; + + sprintf (key, "glusterfs.file.%s.%06ld", bname, i); + + ret = glusterfs_getxattr (state->libglusterfsclient_context, + dname, key, block, state->block_size); + + if (ret < 0) { + fprintf (stderr, "glusterfs_getxattr (%s, %s, %p) => %s\n", + dname, key, block, strerror (errno)); + break; + } + state->io_size += ret; + } + + return i; +} + + +int +do_mode_libglusterfsclient_iface_xattr (struct state *state) +{ + if (state->need_op_write) + MEASURE (do_mode_libglusterfsclient_iface_xattr_write, state); + + if (state->need_op_read) + MEASURE (do_mode_libglusterfsclient_iface_xattr_read, state); + + return 0; +} + + +int +do_mode_posix (struct state *state) +{ + if (state->need_iface_fileio) + do_mode_posix_iface_fileio (state); + + if (state->need_iface_xattr) + do_mode_posix_iface_xattr (state); + + return 0; +} + + +int +do_mode_libglusterfsclient (struct state *state) +{ + glusterfs_init_ctx_t ctx = { + .logfile = "/dev/stderr", + .loglevel = "error", + .lookup_timeout = 60, + .stat_timeout = 60, + }; + + ctx.specfile = state->specfile; + if (state->specfile) { + state->libglusterfsclient_context = glusterfs_init (&ctx); + + if (!state->libglusterfsclient_context) { + fprintf (stdout, "Unable to initialize glusterfs context, skipping libglusterfsclient mode\n"); + return -1; + } + } else { + fprintf (stdout, "glusterfs volume specification file not provided, skipping libglusterfsclient mode\n"); + return -1; + } + + if (state->need_iface_fileio) + do_mode_libglusterfsclient_iface_fileio (state); + + if (state->need_iface_xattr) + do_mode_libglusterfsclient_iface_xattr (state); + + return 0; +} + + +int +do_actions (struct state *state) +{ + if (state->need_mode_libglusterfsclient) + do_mode_libglusterfsclient (state); + + if (state->need_mode_posix) + do_mode_posix (state); + + return 0; +} + +static struct
argp_option options[] = { + {"op", 'o', "OPERATIONS", 0, + "WRITE|READ|BOTH - defaults to BOTH"}, + {"iface", 'i', "INTERFACE", 0, + "FILEIO|XATTR|BOTH - defaults to FILEIO"}, + {"mode", 'm', "MODE", 0, + "POSIX|LIBGLUSTERFSCLIENT|BOTH - defaults to POSIX"}, + {"block", 'b', "BLOCKSIZE", 0, + " - defaults to 4096"}, + {"specfile", 's', "SPECFILE", 0, + "absolute path to specfile"}, + {"prefix", 'p', "PREFIX", 0, + "filename prefix"}, + {"count", 'c', "COUNT", 0, + "number of files"}, + {0, 0, 0, 0, 0} +}; + +static struct argp argp = { + options, + parse_opts, + "tool", + "tool to benchmark small file performance" +}; + +int +main (int argc, char *argv[]) +{ + struct state state = {0, }; + + state.need_op_write = 1; + state.need_op_read = 1; + + state.need_iface_fileio = 1; + state.need_iface_xattr = 0; + + state.need_mode_posix = 1; + state.need_mode_libglusterfsclient = 0; + + state.block_size = 4096; + + strcpy (state.prefix, "tmpfile"); + state.count = 1048576; + + if (argp_parse (&argp, argc, argv, 0, 0, &state) != 0) { + fprintf (stderr, "argp_parse() failed\n"); + return 1; + } + + do_actions (&state); + + return 0; +} diff --git a/extras/benchmarking/launch-script.sh b/extras/benchmarking/launch-script.sh new file mode 100755 index 000000000..5d5050d41 --- /dev/null +++ b/extras/benchmarking/launch-script.sh @@ -0,0 +1,18 @@ +#!/bin/sh + +# This script is to launch the script in parallel across all the nodes. 
+ +mount_point="/mnt/glusterfs" +path_to_script="${mount_point}/benchmark/local-script.sh" + +num_hosts=8 + +for i in $(seq 1 $num_hosts); do + ssh node$i "$path_to_script" & +done + +sleep 3; + +touch ${mount_point}/benchmark/start-test + + diff --git a/extras/benchmarking/local-script.sh b/extras/benchmarking/local-script.sh new file mode 100755 index 000000000..80a7fafe8 --- /dev/null +++ b/extras/benchmarking/local-script.sh @@ -0,0 +1,26 @@ +#!/bin/sh + +# This script needs to be present on glusterfs mount, (ie, on every node which wants to run benchmark) + +ifilename="/dev/zero" +ofilename="testdir/testfile.$(hostname)" +result="output/output.$(hostname)" +blocksize=128k +count=8 + +mkdir -p testdir; +mkdir -p output; +echo > ${result} +while [ ! -e start-test ]; do + sleep 1; +done; + + +for i in $(seq 1 5); do + # write + dd if=${ifilename} of=${ofilename} bs=${blocksize} count=${count} 2>&1 | tail -n 1 | cut -f 8,9 -d ' ' >> ${result} ; + # read + #dd if=${ofilename} of=/dev/null bs=${blocksize} count=${count} 2>&1 | tail -n 1 | cut -f 8,9 -d ' ' >> ${result} ; +done + +rm -f start-test diff --git a/extras/glusterfs-mode.el b/extras/glusterfs-mode.el new file mode 100644 index 000000000..e65fbf460 --- /dev/null +++ b/extras/glusterfs-mode.el @@ -0,0 +1,112 @@ +;;; Copyright (C) 2007, 2008 Z RESEARCH Inc. +;;; +;;; This program is free software; you can redistribute it and/or modify +;;; it under the terms of the GNU General Public License as published by +;;; the Free Software Foundation; either version 2 of the License, or +;;; (at your option) any later version. +;;; +;;; This program is distributed in the hope that it will be useful, +;;; but WITHOUT ANY WARRANTY; without even the implied warranty of +;;; MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +;;; GNU General Public License for more details.
+;;; +;;; You should have received a copy of the GNU General Public License +;;; along with this program; if not, write to the Free Software +;;; Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. +;;; + +(defvar glusterfs-mode-hook nil) + +;; (defvar glusterfs-mode-map +;; (let ((glusterfs-mode-map (make-keymap))) +;; (define-key glusterfs-mode-map "\C-j" 'newline-and-indent) +;; glusterfs-mode-map) +;; "Keymap for WPDL major mode") + +(add-to-list 'auto-mode-alist '("\\.vol\\'" . glusterfs-mode)) + +(defconst glusterfs-font-lock-keywords-1 + (list + ; "cluster/{unify,afr,stripe}" + ; "performance/{io-cache,io-threads,write-behind,read-ahead,stat-prefetch}" + ; "protocol/{client,server}" + ; "features/{trash,posix-locks,fixed-id,filter}" + ; "storage/posix" + ; "encryption/rot-13" + ; "debug/trace" + '("\\<\\(cluster/\\(unify\\|afr\\|replicate\\|stripe\\|ha\\|dht\\|distribute\\)\\|\\performance/\\(io-\\(cache\\|threads\\)\\|write-behind\\|read-ahead\\|symlink-cache\\)\\|protocol/\\(server\\|client\\)\\|features/\\(trash\\|posix-locks\\|locks\\|path-converter\\|filter\\)\\|storage/\\(posix\\|bdb\\)\\|encryption/rot-13\\|debug/trace\\)\\>" . font-lock-keyword-face)) +"Additional Keywords to highlight in GlusterFS mode.") + +(defconst glusterfs-font-lock-keywords-2 + (append glusterfs-font-lock-keywords-1 + (list + ; "replicate" "namespace" "scheduler" "remote-subvolume" "remote-host" + ; "auth.addr" "block-size" "remote-port" "listen-port" "transport-type" + ; "limits.min-free-disk" "directory" + ; TODO: add all the keys here. 
+ '("\\<\\(inode-lru-limit\\|replicate\\|namespace\\|scheduler\\|username\\|password\\|allow\\|reject\\|block-size\\|listen-port\\|transport-type\\|transport-timeout\\|directory\\|page-size\\|page-count\\|aggregate-size\\|non-blocking-io\\|client-volume-filename\\|bind-address\\|self-heal\\|read-only-subvolumes\\|read-subvolume\\|thread-count\\|cache-size\\|window-size\\|force-revalidate-timeout\\|priority\\|include\\|exclude\\|remote-\\(host\\|subvolume\\|port\\)\\|auth.\\(addr\\|login\\)\\|limits.\\(min-disk-free\\|transaction-size\\|ib-verbs-\\(work-request-\\(send-\\|recv-\\(count\\|size\\)\\)\\|port\\|mtu\\|device-name\\)\\)\\)\ \\>" . font-lock-constant-face))) + "option keys in GlusterFS mode.") + +(defconst glusterfs-font-lock-keywords-3 + (append glusterfs-font-lock-keywords-2 + (list + ; "option" "volume" "end-volume" "subvolumes" "type" + '("\\<\\(option\ \\|volume\ \\|subvolumes\ \\|type\ \\|end-volume\\)\\>" . font-lock-builtin-face))) + ;'((regexp-opt (" option " "^volume " "^end-volume" "subvolumes " " type ") t) . font-lock-builtin-face)) + "Minimal highlighting expressions for GlusterFS mode.") + + +(defvar glusterfs-font-lock-keywords glusterfs-font-lock-keywords-3 + "Default highlighting expressions for GlusterFS mode.") + +(defvar glusterfs-mode-syntax-table + (let ((glusterfs-mode-syntax-table (make-syntax-table))) + (modify-syntax-entry ?\# "<" glusterfs-mode-syntax-table) + (modify-syntax-entry ?* ". 
23" glusterfs-mode-syntax-table) + (modify-syntax-entry ?\n ">#" glusterfs-mode-syntax-table) + glusterfs-mode-syntax-table) + "Syntax table for glusterfs-mode") + +;; TODO: add an indentation table + +(defun glusterfs-indent-line () + "Indent current line as GlusterFS code" + (interactive) + (beginning-of-line) + (if (bobp) + (indent-line-to 0) ; First line is always non-indented + (let ((not-indented t) cur-indent) + (if (looking-at "^[ \t]*volume\ ") + (progn + (save-excursion + (forward-line -1) + (setq not-indented nil) + (setq cur-indent 0)))) + (if (looking-at "^[ \t]*end-volume") + (progn + (save-excursion + (forward-line -1) + (setq cur-indent 0)) + (if (< cur-indent 0) ; We can't indent past the left margin + (setq cur-indent 0))) + (save-excursion + (while not-indented ; Iterate backwards until we find an indentation hint + (progn + (setq cur-indent 2) ; Do the actual indenting + (setq not-indented nil))))) + (if cur-indent + (indent-line-to cur-indent) + (indent-line-to 0))))) + +(defun glusterfs-mode () + (interactive) + (kill-all-local-variables) + ;; (use-local-map glusterfs-mode-map) + (set-syntax-table glusterfs-mode-syntax-table) + (set (make-local-variable 'indent-line-function) 'glusterfs-indent-line) + (set (make-local-variable 'font-lock-defaults) '(glusterfs-font-lock-keywords)) + (setq major-mode 'glusterfs-mode) + (setq mode-name "GlusterFS") + (run-hooks 'glusterfs-mode-hook)) + +(provide 'glusterfs-mode) diff --git a/extras/glusterfs.vim b/extras/glusterfs.vim new file mode 100644 index 000000000..0de6b5b2f --- /dev/null +++ b/extras/glusterfs.vim @@ -0,0 +1,211 @@ +" glusterfs.vim: GNU Vim Syntax file for GlusterFS .vol specification +" Copyright (C) 2007 Z RESEARCH, Inc. +" This file is part of GlusterFS. 
+" +" GlusterFS is free software; you can redistribute it and/or modify +" it under the terms of the GNU General Public License as published +" by the Free Software Foundation; either version 3 of the License, +" or (at your option) any later version. +" +" GlusterFS is distributed in the hope that it will be useful, but +" WITHOUT ANY WARRANTY; without even the implied warranty of +" MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +" General Public License for more details. +" +" You should have received a copy of the GNU General Public License +" along with this program. If not, see +" <http://www.gnu.org/licenses/>. +" +" Last Modified: Wed Aug 1 00:47:10 IST 2007 +" Version: 0.8 + +syntax clear +syntax case match + +setlocal iskeyword+=- +setlocal iskeyword+=% +setlocal iskeyword+=. +setlocal iskeyword+=* +setlocal iskeyword+=: +setlocal iskeyword+=, + + +"************************************************************************ +" Initially, consider everything an error. Then start eliminating one +" field after the other. Whatever is not eliminated (due to defined +" properties) is an error - Multiple values for a key +"************************************************************************ +syn match glusterfsError /[^ ]\+/ skipwhite +syn match glusterfsComment "#.*" contains=glusterfsTodo + +syn keyword glusterfsTodo contained TODO FIXME NOTE + +"------------------------------------------------------------------------ +" 'Type' Begin +"------------------------------------------------------------------------ +" Handle all the 'Type' keys and values. 
Here, a '/' is used to separate +" the key-value pair, they are clubbed together for convenience +syn match glusterfsType "^\s*type\s\+" skipwhite nextgroup=glusterfsTypeKeyVal + +syn match glusterfsTypeKeyVal contained "\" +syn match glusterfsTypeKeyVal contained "\" +syn match glusterfsTypeKeyVal contained "\" +syn match glusterfsTypeKeyVal contained "\" +syn match glusterfsTypeKeyVal contained "\" +"syn match glusterfsTypeKeyVal contained "\" +syn match glusterfsTypeKeyVal contained "\" +syn match glusterfsTypeKeyVal contained "\" +"------------------------------------------------------------------------ +" 'Type' End +"------------------------------------------------------------------------ + + +"************************************************************************ + +"------------------------------------------------------------------------ +" 'Volume' Begin +"------------------------------------------------------------------------ +" NOTE 1: Only one volume name allowed after 'volume' keyword +" NOTE 2: Multiple volumes allowed after 'subvolumes' +" NOTE 3: Some other options (like remote-subvolume, namespace etc) use +" volume name (single) +syn match glusterfsVol "^\s*volume\s\+" nextgroup=glusterfsVolName +syn match glusterfsVolName "\<\k\+" contained + +syn match glusterfsVol "^\s*subvolumes\s\+" skipwhite nextgroup=glusterfsSubVolName +syn match glusterfsSubVolName "\<\k\+\>" skipwhite contained nextgroup=glusterfsSubVolName + +syn match glusterfsVol "^\s*end-volume\>" +"------------------------------------------------------------------------ +" 'Volume' End +"------------------------------------------------------------------------ + + + + + +"------------------------------------------------------------------------ +" 'Options' Begin +"------------------------------------------------------------------------ +syn match glusterfsOpt "^\s*option\s\+" nextgroup=glusterfsOptKey + + +syn keyword glusterfsOptKey contained transport-type skipwhite 
nextgroup=glusterfsOptValTransportType +syn match glusterfsOptValTransportType contained "\<\(tcp\|ib\-verbs\|ib-sdp\)/\(client\|server\)\>" + +syn keyword glusterfsOptKey contained remote-subvolume skipwhite nextgroup=glusterfsVolName + +syn keyword glusterfsOptKey contained auth.addr.ra8.allow auth.addr.ra7.allow auth.addr.ra6.allow auth.addr.ra5.allow auth.addr.ra4.allow auth.addr.ra3.allow auth.addr.ra2.allow auth.addr.ra1.allow auth.addr.brick-ns.allow skipwhite nextgroup=glusterfsOptVal + +syn keyword glusterfsOptKey contained client-volume-filename directory trash-dir skipwhite nextgroup=glusterfsOpt_Path +syn match glusterfsOpt_Path contained "\s\+\f\+\>" + +syn keyword glusterfsOptKey contained debug self-heal encrypt-write decrypt-read mandatory nextgroup=glusterfsOpt_OnOff +syn match glusterfsOpt_OnOff contained "\s\+\(on\|off\)\>" + +syn keyword glusterfsOptKey contained flush-behind non-blocking-connect nextgroup=glusterfsOpt_OnOffNoYes +syn keyword glusterfsOpt_OnOffNoYes contained on off no yes + +syn keyword glusterfsOptKey contained page-size cache-size nextgroup=glusterfsOpt_Size + +syn keyword glusterfsOptKey contained fixed-gid fixed-uid cache-seconds page-count thread-count aggregate-size listen-port remote-port transport-timeout inode-lru-limit nextgroup=glusterfsOpt_Number + +syn keyword glusterfsOptKey contained alu.disk-usage.entry-threshold alu.disk-usage.exit-threshold nextgroup=glusterfsOpt_Size + +syn keyword glusterfsOptKey contained alu.order skipwhite nextgroup=glusterfsOptValAluOrder +syn match glusterfsOptValAluOrder contained "\s\+\(\(disk-usage\|write-usage\|read-usage\|open-files-usage\|disk-speed\):\)*\(disk-usage\|write-usage\|read-usage\|open-files-usage\|disk-speed\)\>" + +syn keyword glusterfsOptKey contained alu.open-files-usage.entry-threshold alu.open-files-usage.exit-threshold alu.limits.max-open-files rr.refresh-interval random.refresh-interval nufa.refresh-interval nextgroup=glusterfsOpt_Number + +syn keyword 
glusterfsOptKey contained nufa.local-volume-name skipwhite nextgroup=glusterfsVolName + +syn keyword glusterfsOptKey contained ib-verbs-work-request-send-size ib-verbs-work-request-recv-size nextgroup=glusterfsOpt_Size +syn match glusterfsOpt_Size contained "\s\+\d\+\([gGmMkK][bB]\)\=\>" + +syn keyword glusterfsOptKey contained ib-verbs-work-request-send-count ib-verbs-work-request-recv-count ib-verbs-port nextgroup=glusterfsOpt_Number + +syn keyword glusterfsOptKey contained ib-verbs-mtu nextgroup=glusterfsOptValIBVerbsMtu +syn match glusterfsOptValIBVerbsMtu "\s\+\(256\|512\|1024\|2048\|4096\)\>" contained + +syn keyword glusterfsOptKey contained ib-verbs-device-name nextgroup=glusterfsOptVal + +syn match glusterfsOpt_Number contained "\s\+\d\+\>" + +syn keyword glusterfsOptKey contained scheduler skipwhite nextgroup=glusterfsOptValScheduler +syn keyword glusterfsOptValScheduler contained rr alu random nufa + +syn keyword glusterfsOptKey contained namespace skipwhite nextgroup=glusterfsVolName + +syn keyword glusterfsOptKey contained lock-node skipwhite nextgroup=glusterfsVolName + + + +syn keyword glusterfsOptKey contained alu.write-usage.entry-threshold alu.write-usage.exit-threshold alu.read-usage.entry-threshold alu.read-usage.exit-threshold alu.limits.min-free-disk nextgroup=glusterfsOpt_Percentage + +syn keyword glusterfsOptKey contained random.limits.min-free-disk nextgroup=glusterfsOpt_Percentage +syn keyword glusterfsOptKey contained rr.limits.min-disk-free nextgroup=glusterfsOpt_Size + +syn keyword glusterfsOptKey contained nufa.limits.min-free-disk nextgroup=glusterfsOpt_Percentage + +syn match glusterfsOpt_Percentage contained "\s\+\d\+%\=\>" + + + + + + + + + +syn keyword glusterfsOptKey contained remote-host bind-address nextgroup=glusterfsOpt_IP,glusterfsOpt_Domain +syn match glusterfsOpt_IP contained "\s\+\d\d\=\d\=\.\d\d\=\d\=\.\d\d\=\d\=\.\d\d\=\d\=\>" +syn match glusterfsOpt_Domain contained "\s\+\a[a-zA-Z0-9_-]*\(\.\a\+\)*\>" + +syn match 
glusterfsVolNames "\s*\<\S\+\>" contained skipwhite nextgroup=glusterfsVolNames + +syn keyword glusterfsOptKey contained block-size replicate skipwhite nextgroup=glusterfsOpt_Pattern + +syn match glusterfsOpt_Pattern contained "\s\+\k\+\>" +syn match glusterfsOptVal contained "\s\+\S\+\>" + + + + + +hi link glusterfsError Error +hi link glusterfsComment Comment + +hi link glusterfsVol keyword + +hi link glusterfsVolName function +hi link glusterfsSubVolName function + +hi link glusterfsType Keyword +hi link glusterfsTypeKeyVal String + +hi link glusterfsOpt Keyword + +hi link glusterfsOptKey Special +hi link glusterfsOptVal Normal + +hi link glusterfsOptValTransportType String +hi link glusterfsOptValScheduler String +hi link glusterfsOptValAluOrder String +hi link glusterfsOptValIBVerbsMtu String + +hi link glusterfsOpt_OnOff String +hi link glusterfsOpt_OnOffNoYes String + + +" Options that require +hi link glusterfsOpt_Size PreProc +hi link glusterfsOpt_Domain PreProc +hi link glusterfsOpt_Percentage PreProc +hi link glusterfsOpt_IP PreProc +hi link glusterfsOpt_Pattern PreProc +hi link glusterfsOpt_Number Preproc +hi link glusterfsOpt_Path Preproc + + + +let b:current_syntax = "glusterfs" diff --git a/extras/init.d/Makefile.am b/extras/init.d/Makefile.am new file mode 100644 index 000000000..608b5bb2d --- /dev/null +++ b/extras/init.d/Makefile.am @@ -0,0 +1,9 @@ + +EXTRA_DIST = glusterfsd glusterfs-server glusterfs-server.plist + +CLEANFILES = + +install-data-am: +if GF_DARWIN_HOST_OS + cp glusterfs-server.plist /Library/LaunchDaemons/com.zresearch.glusterfs.plist +endif diff --git a/extras/init.d/glusterfs-server b/extras/init.d/glusterfs-server new file mode 100755 index 000000000..975283982 --- /dev/null +++ b/extras/init.d/glusterfs-server @@ -0,0 +1,100 @@ +#!/bin/sh +### BEGIN INIT INFO +# Provides: glusterfsd +# Required-Start: $local_fs $network +# Required-Stop: $local_fs $network +# Default-Start: 2 3 4 5 +# Default-Stop: 0 1 6 +# Short-Description: 
gluster server +# Description: This file starts / stops the gluster server +### END INIT INFO + +# Author: Chris AtLee +# Patched by: Matthias Albert < matthias@linux4experts.de> + +PATH=/sbin:/usr/sbin:/bin:/usr/bin +NAME=glusterfsd +SCRIPTNAME=/etc/init.d/$NAME +DAEMON=/usr/sbin/$NAME +PIDFILE=/var/run/$NAME.pid +CONFIGFILE=/etc/glusterfs/server.vol +GLUSTERFS_OPTS="-f $CONFIGFILE" +PID=`test -f $PIDFILE && cat $PIDFILE` + + +# Gracefully exit if the package has been removed. +test -x $DAEMON || exit 0 + +# Load the VERBOSE setting and other rcS variables +. /lib/init/vars.sh + +# Define LSB log_* functions. +. /lib/lsb/init-functions + +check_config() +{ + if [ ! -f "$CONFIGFILE" ]; then + echo "Config file $CONFIGFILE is missing...exiting!" + exit 0 + fi +} + +do_start() +{ + check_config; + pidofproc -p $PIDFILE $DAEMON >/dev/null + status=$? + if [ $status -eq 0 ]; then + log_success_msg "glusterfs server is already running with pid $PID" + else + log_daemon_msg "Starting glusterfs server" "glusterfsd" + start-stop-daemon --start --quiet --oknodo --pidfile $PIDFILE --startas $DAEMON -- -p $PIDFILE $GLUSTERFS_OPTS + log_end_msg $? + start_daemon -p $PIDFILE $DAEMON -f $CONFIGFILE + return $? + fi +} + +do_stop() +{ + log_daemon_msg "Stopping glusterfs server" "glusterfsd" + start-stop-daemon --stop --quiet --oknodo --pidfile $PIDFILE + log_end_msg $? + rm -f $PIDFILE + killproc -p $PIDFILE $DAEMON + return $? +} + +do_status() +{ + pidofproc -p $PIDFILE $DAEMON >/dev/null + status=$? + if [ $status -eq 0 ]; then + log_success_msg "glusterfs server is running with pid $PID" + else + log_failure_msg "glusterfs server is not running." 
+ fi + exit $status +} + +case "$1" in + start) + do_start + ;; + stop) + do_stop + ;; + status) + do_status; + ;; + restart|force-reload) + do_stop + sleep 2 + do_start + ;; + *) + echo "Usage: $SCRIPTNAME {start|stop|status|restart|force-reload}" >&2 + exit 3 + ;; +esac + diff --git a/extras/init.d/glusterfs-server.plist.in b/extras/init.d/glusterfs-server.plist.in new file mode 100644 index 000000000..4d2287c57 --- /dev/null +++ b/extras/init.d/glusterfs-server.plist.in @@ -0,0 +1,15 @@ + + + + + Label + com.zresearch.glusterfs + ProgramArguments + + @prefix@/sbin/glusterfsd + -N + -f + @prefix@/etc/glusterfs/server.vol + + + diff --git a/extras/init.d/glusterfsd b/extras/init.d/glusterfsd new file mode 100755 index 000000000..866a0010e --- /dev/null +++ b/extras/init.d/glusterfsd @@ -0,0 +1,110 @@ +#!/bin/bash +# +# chkconfig: 35 90 12 +# description: Glusterfsd server +# + +# Get function from functions library +# . /etc/rc.d/init.d/functions + +BASE=glusterfsd +GSERVER="/sbin/$BASE -f /etc/glusterfs/glusterfs-server.vol" + +# A function to stop gluster +killgluster() +{ + killlevel="-9" + # Find pid. + pid= + if [ -f /var/run/$BASE.pid ]; then + local line p + read line < /var/run/$BASE.pid + for p in $line ; do + [ -z "${p//[0-9]/}" -a -d "/proc/$p" ] && pid="$pid +$p" + done + fi + if [ -z "$pid" ]; then + pid=`pidof -o $$ -o $PPID -o %PPID -x $1 || \ + pidof -o $$ -o $PPID -o %PPID -x $BASE` + fi + # Kill it. + kill $killlevel $pid + if [ "$?" = 0 ] + then + echo "Gluster process $pid has been killed" + initlog -n "Kill gluster" -e 1 + else + echo "Failed: Gluster process $pid has not been killed" + initlog -n "Kill gluster" -e 2 + fi + + # Remove pid and lock file if any. 
+ if [ -f /var/run/$BASE.pid ] + then + rm -f /var/run/$BASE.pid && initlog -n "Remove $BASE.pid:" -e +1 + else echo "$BASE.pid not found" && initlog -n "Remove +$BASE.pid:" -e 2 + fi + + if [ -f /var/lock/subsys/$BASE ] + then + rm -f /var/lock/subsys/$BASE && initlog -n "Remove $BASE lock +file:" -e 1 + else echo "$BASE lock file not found" && initlog -n "Remove +$BASE lock file:" -e 2 + fi +} + +# Start the service $BASE +start() +{ + initlog -c "echo -n Starting $BASE:" + $GSERVER + if [ $? = 0 ] + then + touch /var/lock/subsys/$BASE + initlog -n "Starting $BASE" -e 1 + echo " [OK]" + else + echo "$BASE start failed." + initlog -n "$BASE start" -e 2 + fi +} + +# Stop the service $BASE +stop() +{ + echo "Stopping $BASE:" + killgluster +} +status() +{ + if test "`lsof |grep -c /sbin/$BASE`" = "0" + then echo "$BASE is stopped." + else echo "$BASE is running..." + fi +} + +### service arguments ### +case $1 in + start) + start + ;; + stop) + stop + ;; + status) + status + ;; + restart|reload|condrestart) + stop + start + ;; + *) + echo $.Usage: $0 {start|stop|restart|reload|status}. + exit 1 +esac + +exit 0 diff --git a/extras/specgen.scm b/extras/specgen.scm new file mode 100755 index 000000000..279afe896 --- /dev/null +++ b/extras/specgen.scm @@ -0,0 +1,98 @@ +#!/usr/bin/guile -s +!# + +;;; Copyright (C) 2007 Z RESEARCH Inc. +;;; +;;; This program is free software; you can redistribute it and/or modify +;;; it under the terms of the GNU General Public License as published by +;;; the Free Software Foundation; either version 2 of the License, or +;;; (at your option) any later version. +;;; +;;; This program is distributed in the hope that it will be useful, +;;; but WITHOUT ANY WARRANTY; without even the implied warranty of +;;; MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +;;; GNU General Public License for more details. 
+;;; +;;; You should have received a copy of the GNU General Public License +;;; along with this program; if not, write to the Free Software +;;; Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. +;;; + +;;; This script lets you specify the xlator graph as a Scheme list +;;; and provides a function to generate the spec file for the graph. + + +(define (volume args) + (apply + (lambda (name type options) + (lambda args + (display "volume ") (display name) (newline) + (display " type ") (display type) (newline) + (map (lambda (key-value-cons) + (let ((key (car key-value-cons)) + (value (cdr key-value-cons))) + (display " option ") (display key) (display " ") + (display value) (newline))) + options) + (if (> (length args) 0) + (begin + (display " subvolumes ") + (map (lambda (subvol) + (display subvol) (display " ")) + args) + (newline))) + (display "end-volume") (newline) (newline) + name)) + args)) + +;; define volumes with names/type/options and bind to a symbol +;; relate them separately (see below) +;; more convenient to separate volume definition and relation + +(define wb (volume '(wb0 + performance/write-behind + ((aggregate-size . 0) + (flush-behind . off) + )))) + +(define ra (volume '(ra0 + performance/read-ahead + ((page-size . 128KB) + (page-count . 1) + )))) + +(define ioc (volume '(ioc0 + performance/io-cache + ((page-size . 128KB) + (cache-size . 64MB) + )))) + +(define iot (volume '(iot0 + performance/io-threads + () + ))) + +(define client1 (volume '(client1 + protocol/client + ((transport-type . tcp/client) + (remote-host . localhost) + (remote-subvolume . brick1) + )))) + +(define client2 (volume '(client2 + protocol/client + ((transport-type . tcp/client) + (remote-host . localhost) + (remote-subvolume . brick2) + )))) + +(define unify (volume '(unify0 + cluster/unify + ((scheduler . 
rr) + )))) + +;; relate the symbols to output a spec file +;; note: relating with symbols lets you change volume name in one place + +(wb (ra (ioc (iot (unify (client1) + (client2)))))) diff --git a/extras/stripe-merge.c b/extras/stripe-merge.c new file mode 100644 index 000000000..3f8e4b124 --- /dev/null +++ b/extras/stripe-merge.c @@ -0,0 +1,48 @@ +#include +#include +#include +#include +#include + +int +main (int argc, char *argv[]) +{ + int fds[argc-1]; + char buf[argc-1][4096]; + int i; + int max_ret, ret; + + if (argc < 2) { + printf ("Usage: %s file1 file2 ... >file\n", argv[0]); + return 1; + } + + for (i=0; i max_ret) + max_ret = ret; + } + for (i=0; i + This file is part of GlusterFS. + + GlusterFS is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published + by the Free Software Foundation; either version 3 of the License, + or (at your option) any later version. + + GlusterFS is distributed in the hope that it will be useful, but + WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program. If not, see + . 
+*/ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#define TWO_POWER(power) (2UL << (power)) + +#define RDD_INTEGER_VALUE ((TWO_POWER ((sizeof (int) * 8))) - 1) + +#ifndef UNIX_PATH_MAX +#define UNIX_PATH_MAX 108 +#endif + +struct rdd_file { + char path[UNIX_PATH_MAX]; + struct stat st; + int fd; +}; + +struct rdd_config { + long iters; + long max_ops_per_seq; + size_t max_bs; + size_t min_bs; + int thread_count; + pthread_t *threads; + pthread_barrier_t barrier; + pthread_mutex_t lock; + struct rdd_file in_file; + struct rdd_file out_file; +}; +static struct rdd_config rdd_config; + +enum rdd_keys { + RDD_MIN_BS_KEY = 1, + RDD_MAX_BS_KEY, +}; + +static error_t +rdd_parse_opts (int key, char *arg, + struct argp_state *_state) +{ + switch (key) { + case 'o': + { + int len = 0; + len = strlen (arg); + if (len > UNIX_PATH_MAX) { + fprintf (stderr, "output file name too long (%s)\n", arg); + return -1; + } + + strncpy (rdd_config.out_file.path, arg, len); + } + break; + + case 'i': + { + int len = 0; + len = strlen (arg); + if (len > UNIX_PATH_MAX) { + fprintf (stderr, "input file name too long (%s)\n", arg); + return -1; + } + + strncpy (rdd_config.in_file.path, arg, len); + } + break; + + case RDD_MIN_BS_KEY: + { + char *tmp = NULL; + long bs = 0; + bs = strtol (arg, &tmp, 10); + if ((bs == LONG_MAX) || (bs == LONG_MIN) || (tmp && *tmp)) { + fprintf (stderr, "invalid argument for minimum block size (%s)\n", arg); + return -1; + } + + rdd_config.min_bs = bs; + } + break; + + case RDD_MAX_BS_KEY: + { + char *tmp = NULL; + long bs = 0; + bs = strtol (arg, &tmp, 10); + if ((bs == LONG_MAX) || (bs == LONG_MIN) || (tmp && *tmp)) { + fprintf (stderr, "invalid argument for maximum block size (%s)\n", arg); + return -1; + } + + rdd_config.max_bs = bs; + } + break; + + case 'r': + { + char *tmp = NULL; + long iters = 0; + iters = strtol (arg, &tmp, 10); + if ((iters == LONG_MAX) || (iters == LONG_MIN) || (tmp && *tmp)) 
{ + fprintf (stderr, "invalid argument for iterations (%s)\n", arg); + return -1; + } + + rdd_config.iters = iters; + } + break; + + case 'm': + { + char *tmp = NULL; + long max_ops = 0; + max_ops = strtol (arg, &tmp, 10); + if ((max_ops == LONG_MAX) || (max_ops == LONG_MIN) || (tmp && *tmp)) { + fprintf (stderr, "invalid argument for max-ops (%s)\n", arg); + return -1; + } + + rdd_config.max_ops_per_seq = max_ops; + } + break; + + case 't': + { + char *tmp = NULL; + long threads = 0; + threads = strtol (arg, &tmp, 10); + if ((threads == LONG_MAX) || (threads == LONG_MIN) || (tmp && *tmp)) { + fprintf (stderr, "invalid argument for thread count (%s)\n", arg); + return -1; + } + + rdd_config.thread_count = threads; + } + break; + + case ARGP_KEY_NO_ARGS: + break; + case ARGP_KEY_ARG: + break; + case ARGP_KEY_END: + if (_state->argc == 1) { + argp_usage (_state); + } + + } + + return 0; +} + +static struct argp_option rdd_options[] = { + {"if", 'i', "INPUT_FILE", 0, "input-file"}, + {"of", 'o', "OUTPUT_FILE", 0, "output-file"}, + {"threads", 't', "COUNT", 0, "number of threads to spawn (defaults to 2)"}, + {"min-bs", RDD_MIN_BS_KEY, "MIN_BLOCK_SIZE", 0, + "Minimum block size in bytes (defaults to 1024)"}, + {"max-bs", RDD_MAX_BS_KEY, "MAX_BLOCK_SIZE", 0, + "Maximum block size in bytes (defaults to 4096)"}, + {"iters", 'r', "ITERS", 0, + "Number of read-write sequences (defaults to 1000000)"}, + {"max-ops", 'm', "MAXOPS", 0, + "maximum number of read-writes to be performed in a sequence (defaults to 1)"}, + {0, 0, 0, 0, 0} +}; + +static struct argp argp = { + rdd_options, + rdd_parse_opts, + "", + "random dd - tool to do a sequence of random block-sized continuous read writes starting at a random offset" +}; + + +static void +rdd_default_config (void) +{ + rdd_config.thread_count = 2; + rdd_config.iters = 1000000; + rdd_config.max_bs = 4096; + rdd_config.min_bs = 1024; + rdd_config.in_file.fd = rdd_config.out_file.fd = -1; + rdd_config.max_ops_per_seq = 1; + + return; 
+} + + +static char +rdd_valid_config (void) +{ + char ret = 1; + int fd = -1; + + fd = open (rdd_config.in_file.path, O_RDONLY); + if (fd == -1) { + ret = 0; + goto out; + } + close (fd); + + if (rdd_config.min_bs > rdd_config.max_bs) { + ret = 0; + goto out; + } + + if (strlen (rdd_config.out_file.path) == 0) { + sprintf (rdd_config.out_file.path, "%s.rddout", rdd_config.in_file.path); + } + +out: + return ret; +} + + +static void * +rdd_read_write (void *arg) +{ + int i = 0, ret = 0; + size_t bs = 0; + off_t offset = 0; + long rand = 0; + long max_ops = 0; + char *buf = NULL; + + buf = CALLOC (1, rdd_config.max_bs); + if (!buf) { + fprintf (stderr, "calloc failed (%s)\n", strerror (errno)); + ret = -1; + goto out; + } + + for (i = 0; i < rdd_config.iters; i++) + { + pthread_mutex_lock (&rdd_config.lock); + { + int bytes = 0; + rand = random (); + + if (rdd_config.min_bs == rdd_config.max_bs) { + bs = rdd_config.max_bs; + } else { + bs = rdd_config.min_bs + (rand % (rdd_config.max_bs - rdd_config.min_bs)); + } + + offset = rand % rdd_config.in_file.st.st_size; + max_ops = rand % rdd_config.max_ops_per_seq; + if (!max_ops) { + max_ops ++; + } + + ret = lseek (rdd_config.in_file.fd, offset, SEEK_SET); + if (ret != offset) { + fprintf (stderr, "lseek failed (%s)\n", strerror (errno)); + ret = -1; + goto unlock; + } + + ret = lseek (rdd_config.out_file.fd, offset, SEEK_SET); + if (ret != offset) { + fprintf (stderr, "lseek failed (%s)\n", strerror (errno)); + ret = -1; + goto unlock; + } + + while (max_ops--) + { + bytes = read (rdd_config.in_file.fd, buf, bs); + if (!bytes) { + break; + } + + if (bytes == -1) { + fprintf (stderr, "read failed (%s)\n", strerror (errno)); + ret = -1; + goto unlock; + } + + if (write (rdd_config.out_file.fd, buf, bytes) != bytes) { + fprintf (stderr, "write failed (%s)\n", strerror (errno)); + ret = -1; + goto unlock; + } + } + } + unlock: + pthread_mutex_unlock (&rdd_config.lock); + if (ret == -1) { + goto out; + } + ret = 0; + } 
+out: + free (buf); + pthread_barrier_wait (&rdd_config.barrier); + + return NULL; +} + + +static int +rdd_spawn_threads (void) +{ + int i = 0, ret = -1, fd = -1; + char buf[4096]; + + fd = open (rdd_config.in_file.path, O_RDONLY); + if (fd < 0) { + fprintf (stderr, "cannot open %s (%s)\n", rdd_config.in_file.path, strerror (errno)); + ret = -1; + goto out; + } + ret = fstat (fd, &rdd_config.in_file.st); + if (ret != 0) { + close (fd); + fprintf (stderr, "cannot stat %s (%s)\n", rdd_config.in_file.path, strerror (errno)); + ret = -1; + goto out; + } + rdd_config.in_file.fd = fd; + + fd = open (rdd_config.out_file.path, O_WRONLY | O_CREAT, S_IRWXU | S_IROTH); + if (fd < 0) { + close (rdd_config.in_file.fd); + rdd_config.in_file.fd = -1; + fprintf (stderr, "cannot open %s (%s)\n", rdd_config.out_file.path, strerror (errno)); + ret = -1; + goto out; + } + rdd_config.out_file.fd = fd; + + while ((ret = read (rdd_config.in_file.fd, buf, 4096)) > 0) { + if (write (rdd_config.out_file.fd, buf, ret) != ret) { + fprintf (stderr, "write failed (%s)\n", strerror (errno)); + close (rdd_config.in_file.fd); + close (rdd_config.out_file.fd); + rdd_config.in_file.fd = rdd_config.out_file.fd = -1; + ret = -1; + goto out; + } + } + + rdd_config.threads = CALLOC (rdd_config.thread_count, sizeof (pthread_t)); + if (rdd_config.threads == NULL) { + fprintf (stderr, "calloc() failed (%s)\n", strerror (errno)); + + ret = -1; + close (rdd_config.in_file.fd); + close (rdd_config.out_file.fd); + rdd_config.in_file.fd = rdd_config.out_file.fd = -1; + goto out; + } + + ret = pthread_barrier_init (&rdd_config.barrier, NULL, rdd_config.thread_count + 1); + if (ret != 0) { + fprintf (stderr, "pthread_barrier_init() failed (%s)\n", strerror (ret)); + + free (rdd_config.threads); + close (rdd_config.in_file.fd); + close (rdd_config.out_file.fd); + rdd_config.in_file.fd = rdd_config.out_file.fd = -1; + ret = -1; + goto out; + } + + ret = pthread_mutex_init (&rdd_config.lock, NULL); + if (ret != 0) { 
+ fprintf (stderr, "pthread_mutex_init() failed (%s)\n", strerror (ret)); + + free (rdd_config.threads); + pthread_barrier_destroy (&rdd_config.barrier); + close (rdd_config.in_file.fd); + close (rdd_config.out_file.fd); + rdd_config.in_file.fd = rdd_config.out_file.fd = -1; + ret = -1; + goto out; + } + + for (i = 0; i < rdd_config.thread_count; i++) + { + ret = pthread_create (&rdd_config.threads[i], NULL, rdd_read_write, NULL); + if (ret != 0) { + fprintf (stderr, "pthread_create failed (%s)\n", strerror (errno)); + exit (1); + } + } + +out: + return ret; +} + + +static void +rdd_wait_for_completion (void) +{ + pthread_barrier_wait (&rdd_config.barrier); +} + + +int +main (int argc, char *argv[]) +{ + int ret = -1; + + rdd_default_config (); + + ret = argp_parse (&argp, argc, argv, 0, 0, NULL); + if (ret != 0) { + ret = -1; + fprintf (stderr, "%s: argp_parse() failed\n", argv[0]); + goto err; + } + + if (!rdd_valid_config ()) { + ret = -1; + fprintf (stderr, "%s: configuration validation failed\n", argv[0]); + goto err; + } + + ret = rdd_spawn_threads (); + if (ret != 0) { + fprintf (stderr, "%s: spawning threads failed\n", argv[0]); + goto err; + } + + rdd_wait_for_completion (); + +err: + return ret; +} diff --git a/glusterfs-guts/Makefile.am b/glusterfs-guts/Makefile.am new file mode 100644 index 000000000..f963effea --- /dev/null +++ b/glusterfs-guts/Makefile.am @@ -0,0 +1 @@ +SUBDIRS = src \ No newline at end of file diff --git a/glusterfs-guts/src/Makefile.am b/glusterfs-guts/src/Makefile.am new file mode 100644 index 000000000..bb8c7b176 --- /dev/null +++ b/glusterfs-guts/src/Makefile.am @@ -0,0 +1,17 @@ +sbin_PROGRAMS = glusterfs-guts + +glusterfs_guts_SOURCES = glusterfs-guts.c fuse-bridge.c guts-replay.c guts-trace.c \ + fuse-extra.c guts-extra.c guts-parse.c guts-tables.c + +noinst_HEADERS = fuse_kernel.h fuse-extra.h glusterfs-guts.h glusterfs-fuse.h guts-lowlevel.h \ + guts-parse.h guts-replay.h guts-tables.h guts-trace.h + +glusterfs_guts_LDADD = 
$(top_builddir)/libglusterfs/src/libglusterfs.la -lfuse + +AM_CFLAGS = -fPIC -Wall -pthread + +AM_CPPFLAGS = -D_FILE_OFFSET_BITS=64 -D_GNU_SOURCE -DFUSE_USE_VERSION=26 \ + -I$(top_srcdir)/libglusterfs/src -DDATADIR=\"$(localstatedir)\" \ + -DCONFDIR=\"$(sysconfdir)/glusterfs\" + +CLEANFILES = diff --git a/glusterfs-guts/src/fuse-bridge.c b/glusterfs-guts/src/fuse-bridge.c new file mode 100644 index 000000000..0972563c6 --- /dev/null +++ b/glusterfs-guts/src/fuse-bridge.c @@ -0,0 +1,2724 @@ +/* + Copyright (c) 2006, 2007, 2008 Z RESEARCH, Inc. + This file is part of GlusterFS. + + GlusterFS is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published + by the Free Software Foundation; either version 3 of the License, + or (at your option) any later version. + + GlusterFS is distributed in the hope that it will be useful, but + WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program. If not, see + . +*/ + + +#include +#include +#include + +#ifndef _CONFIG_H +#define _CONFIG_H +#include "config.h" +#endif /* _CONFIG_H */ + +#include "glusterfs.h" +#include "logging.h" +#include "xlator.h" +#include "glusterfs.h" +#include "transport.h" +#include "defaults.h" +#include "common-utils.h" + +#include + +#include "fuse-extra.h" +#include "list.h" + +#include "guts-lowlevel.h" + +#define BIG_FUSE_CHANNEL_SIZE 1048576 + +struct fuse_private { + int fd; + struct fuse *fuse; + struct fuse_session *se; + struct fuse_chan *ch; + char *mountpoint; +}; + +char glusterfs_fuse_direct_io_mode = 1; +float glusterfs_fuse_entry_timeout = 1.0; +float glusterfs_fuse_attr_timeout = 1.0; + +#define FI_TO_FD(fi) ((fd_t *)((long)fi->fh)) + +#define FUSE_FOP(state, ret, op, args ...) 
\ +do { \ + call_frame_t *frame = get_call_frame_for_req (state, 1); \ + xlator_t *xl = frame->this->children ? \ + frame->this->children->xlator : NULL; \ + dict_t *refs = frame->root->req_refs; \ + frame->root->state = state; \ + STACK_WIND (frame, ret, xl, xl->fops->op, args); \ + dict_unref (refs); \ +} while (0) + +#define FUSE_FOP_NOREPLY(state, op, args ...) \ +do { \ + call_frame_t *_frame = get_call_frame_for_req (state, 0); \ + xlator_t *xl = _frame->this->children->xlator; \ + _frame->root->req_refs = NULL; \ + STACK_WIND (_frame, fuse_nop_cbk, xl, xl->fops->op, args); \ +} while (0) + +typedef struct { + loc_t loc; + inode_t *parent; + inode_t *inode; + char *name; +} fuse_loc_t; + +typedef struct { + void *pool; + xlator_t *this; + inode_table_t *itable; + fuse_loc_t fuse_loc; + fuse_loc_t fuse_loc2; + fuse_req_t req; + + int32_t flags; + off_t off; + size_t size; + unsigned long nlookup; + fd_t *fd; + dict_t *dict; + char *name; + char is_revalidate; +} fuse_state_t; + + +static void +loc_wipe (loc_t *loc) +{ + if (loc->inode) { + inode_unref (loc->inode); + loc->inode = NULL; + } + if (loc->path) { + FREE (loc->path); + loc->path = NULL; + } +} + + +static inode_t * +dummy_inode (inode_table_t *table) +{ + inode_t *dummy; + + dummy = CALLOC (1, sizeof (*dummy)); + ERR_ABORT (dummy); + + dummy->table = table; + + INIT_LIST_HEAD (&dummy->list); + INIT_LIST_HEAD (&dummy->inode_hash); + INIT_LIST_HEAD (&dummy->fds); + INIT_LIST_HEAD (&dummy->dentry.name_hash); + INIT_LIST_HEAD (&dummy->dentry.inode_list); + + dummy->ref = 1; + dummy->ctx = get_new_dict (); + + LOCK_INIT (&dummy->lock); + return dummy; +} + +static void +fuse_loc_wipe (fuse_loc_t *fuse_loc) +{ + loc_wipe (&fuse_loc->loc); + if (fuse_loc->name) { + FREE (fuse_loc->name); + fuse_loc->name = NULL; + } + if (fuse_loc->inode) { + inode_unref (fuse_loc->inode); + fuse_loc->inode = NULL; + } + if (fuse_loc->parent) { + inode_unref (fuse_loc->parent); + fuse_loc->parent = NULL; + } +} + + +static 
void +free_state (fuse_state_t *state) +{ + fuse_loc_wipe (&state->fuse_loc); + + fuse_loc_wipe (&state->fuse_loc2); + + if (state->dict) { + dict_unref (state->dict); + state->dict = (void *)0xaaaaeeee; + } + if (state->name) { + FREE (state->name); + state->name = NULL; + } +#ifdef DEBUG + memset (state, 0x90, sizeof (*state)); +#endif + FREE (state); + state = NULL; +} + + +static int32_t +fuse_nop_cbk (call_frame_t *frame, + void *cookie, + xlator_t *this, + int32_t op_ret, + int32_t op_errno) +{ + if (frame->root->state) + free_state (frame->root->state); + + frame->root->state = EEEEKS; + STACK_DESTROY (frame->root); + return 0; +} + +fuse_state_t * +state_from_req (fuse_req_t req) +{ + fuse_state_t *state; + transport_t *trans = fuse_req_userdata (req); + + state = (void *)calloc (1, sizeof (*state)); + ERR_ABORT (state); + state->pool = trans->xl->ctx->pool; + state->itable = trans->xl->itable; + state->req = req; + state->this = trans->xl; + + return state; +} + + +static call_frame_t * +get_call_frame_for_req (fuse_state_t *state, char d) +{ + call_pool_t *pool = state->pool; + fuse_req_t req = state->req; + const struct fuse_ctx *ctx = NULL; + call_ctx_t *cctx = NULL; + transport_t *trans = NULL; + + cctx = CALLOC (1, sizeof (*cctx)); + ERR_ABORT (cctx); + cctx->frames.root = cctx; + + if (req) { + ctx = fuse_req_ctx(req); + + cctx->uid = ctx->uid; + cctx->gid = ctx->gid; + cctx->pid = ctx->pid; + cctx->unique = req_callid (req); + } + + if (req) { + trans = fuse_req_userdata (req); + cctx->frames.this = trans->xl; + cctx->trans = trans; + } else { + cctx->frames.this = state->this; + } + + if (d) { + cctx->req_refs = dict_ref (get_new_dict ()); + dict_set (cctx->req_refs, NULL, trans->buf); + cctx->req_refs->is_locked = 1; + } + + cctx->pool = pool; + LOCK (&pool->lock); + list_add (&cctx->all_frames, &pool->all_frames); + UNLOCK (&pool->lock); + + return &cctx->frames; +} + + +static void +fuse_loc_fill (fuse_loc_t *fuse_loc, + fuse_state_t *state, + 
ino_t ino, + const char *name) +{ + size_t n; + inode_t *inode, *parent = NULL; + + /* resistance against multiple invocation of loc_fill not to get + reference leaks via inode_search() */ + inode = fuse_loc->inode; + if (!inode) { + inode = inode_search (state->itable, ino, name); + } + fuse_loc->inode = inode; + + if (name) { + if (!fuse_loc->name) + fuse_loc->name = strdup (name); + + parent = fuse_loc->parent; + if (!parent) { + if (inode) + parent = inode_parent (inode, ino); + else + parent = inode_search (state->itable, ino, NULL); + } + } + fuse_loc->parent = parent; + + if (inode) { + fuse_loc->loc.inode = inode_ref (inode); + fuse_loc->loc.ino = inode->ino; + } + + if (parent) { + n = inode_path (parent, name, NULL, 0) + 1; + fuse_loc->loc.path = CALLOC (1, n); + ERR_ABORT (fuse_loc->loc.path); + inode_path (parent, name, (char *)fuse_loc->loc.path, n); + } else if (inode) { + n = inode_path (inode, NULL, NULL, 0) + 1; + fuse_loc->loc.path = CALLOC (1, n); + ERR_ABORT (fuse_loc->loc.path); + inode_path (inode, NULL, (char *)fuse_loc->loc.path, n); + } +} + +static int32_t +fuse_lookup_cbk (call_frame_t *frame, + void *cookie, + xlator_t *this, + int32_t op_ret, + int32_t op_errno, + inode_t *inode, + struct stat *stat, + dict_t *dict); + +static int32_t +fuse_entry_cbk (call_frame_t *frame, + void *cookie, + xlator_t *this, + int32_t op_ret, + int32_t op_errno, + inode_t *inode, + struct stat *buf) +{ + fuse_state_t *state; + fuse_req_t req; + struct fuse_entry_param e = {0, }; + + state = frame->root->state; + req = state->req; + + if (!op_ret) { + if (inode->ino == 1) + buf->st_ino = 1; + } + + if (!op_ret && inode && inode->ino && buf && inode->ino != buf->st_ino) { + /* temporary workaround to handle AFR returning differnt inode number */ + gf_log ("glusterfs-fuse", GF_LOG_WARNING, + "%"PRId64": %s => inode number changed %"PRId64" -> %"PRId64, + frame->root->unique, state->fuse_loc.loc.path, + inode->ino, buf->st_ino); + inode_unref 
(state->fuse_loc.loc.inode); + state->fuse_loc.loc.inode = dummy_inode (state->itable); + state->is_revalidate = 2; + + STACK_WIND (frame, fuse_lookup_cbk, + FIRST_CHILD (this), FIRST_CHILD (this)->fops->lookup, + &state->fuse_loc.loc, + 0); + + return 0; + } + + if (op_ret == 0) { + ino_t ino = buf->st_ino; + inode_t *fuse_inode; + + gf_log ("glusterfs-fuse", GF_LOG_DEBUG, + "%"PRId64": %s => %"PRId64, frame->root->unique, + state->fuse_loc.loc.path, ino); + + try_again: + fuse_inode = inode_update (state->itable, state->fuse_loc.parent, + state->fuse_loc.name, buf); + + if (fuse_inode->ctx) { + /* if the inode was already in the hash, checks to flush out + old name hashes */ + if ((fuse_inode->st_mode ^ buf->st_mode) & S_IFMT) { + gf_log ("glusterfs-fuse", GF_LOG_WARNING, + "%"PRId64": %s => %"PRId64" Rehashing %x/%x", + frame->root->unique, + state->fuse_loc.loc.path, ino, (S_IFMT & buf->st_ino), + (S_IFMT & fuse_inode->st_mode)); + + fuse_inode->st_mode = buf->st_mode; + inode_unhash_name (state->itable, fuse_inode); + inode_unref (fuse_inode); + goto try_again; + } + if (buf->st_nlink == 1) { + /* no other name hashes should exist */ + if (!list_empty (&fuse_inode->dentry.inode_list)) { + gf_log ("glusterfs-fuse", GF_LOG_WARNING, + "%"PRId64": %s => %"PRId64" Rehashing because st_nlink less than dentry maps", + frame->root->unique, + state->fuse_loc.loc.path, ino); + inode_unhash_name (state->itable, fuse_inode); + inode_unref (fuse_inode); + goto try_again; + } + if ((state->fuse_loc.parent != fuse_inode->dentry.parent) || + strcmp (state->fuse_loc.name, fuse_inode->dentry.name)) { + gf_log ("glusterfs-fuse", GF_LOG_WARNING, + "%"PRId64": %s => %"PRId64" Rehashing because single st_nlink does not match dentry map", + frame->root->unique, + state->fuse_loc.loc.path, ino); + inode_unhash_name (state->itable, fuse_inode); + inode_unref (fuse_inode); + goto try_again; + } + } + } + + if ((fuse_inode->ctx != inode->ctx) && + list_empty (&fuse_inode->fds)) { + 
dict_t *swap = inode->ctx; + inode->ctx = fuse_inode->ctx; + fuse_inode->ctx = swap; + fuse_inode->generation = inode->generation; + fuse_inode->st_mode = buf->st_mode; + } + + inode_lookup (fuse_inode); + + inode_unref (fuse_inode); + + /* TODO: make these timeouts configurable (via meta?) */ + e.ino = fuse_inode->ino; + e.generation = buf->st_ctime; + e.entry_timeout = glusterfs_fuse_entry_timeout; + e.attr_timeout = glusterfs_fuse_attr_timeout; + e.attr = *buf; + e.attr.st_blksize = BIG_FUSE_CHANNEL_SIZE; + if (state->fuse_loc.parent) + fuse_reply_entry (req, &e); + else + fuse_reply_attr (req, buf, glusterfs_fuse_attr_timeout); + } else { + if (state->is_revalidate == -1 && op_errno == ENOENT) { + gf_log ("glusterfs-fuse", GF_LOG_DEBUG, + "%"PRId64": %s => -1 (%d)", frame->root->unique, + state->fuse_loc.loc.path, op_errno); + } else { + gf_log ("glusterfs-fuse", GF_LOG_ERROR, + "%"PRId64": %s => -1 (%d)", frame->root->unique, + state->fuse_loc.loc.path, op_errno); + } + + if (state->is_revalidate == 1) { + gf_log ("glusterfs-fuse", GF_LOG_DEBUG, + "unlinking stale dentry for `%s'", + state->fuse_loc.loc.path); + + if (state->fuse_loc.parent) + inode_unlink (state->itable, state->fuse_loc.parent, + state->fuse_loc.name); + + inode_unref (state->fuse_loc.loc.inode); + state->fuse_loc.loc.inode = dummy_inode (state->itable); + state->is_revalidate = 2; + + STACK_WIND (frame, fuse_lookup_cbk, + FIRST_CHILD (this), FIRST_CHILD (this)->fops->lookup, + &state->fuse_loc.loc, 0); + + return 0; + } + + fuse_reply_err (req, op_errno); + } + + free_state (state); + STACK_DESTROY (frame->root); + return 0; +} + + +static int32_t +fuse_lookup_cbk (call_frame_t *frame, + void *cookie, + xlator_t *this, + int32_t op_ret, + int32_t op_errno, + inode_t *inode, + struct stat *stat, + dict_t *dict) +{ + fuse_entry_cbk (frame, cookie, this, op_ret, op_errno, inode, stat); + return 0; +} + + +static void +fuse_lookup (fuse_req_t req, + fuse_ino_t par, + const char *name) +{ + 
fuse_state_t *state; + + state = state_from_req (req); + + fuse_loc_fill (&state->fuse_loc, state, par, name); + + if (!state->fuse_loc.loc.inode) { + gf_log ("glusterfs-fuse", GF_LOG_DEBUG, + "%"PRId64": LOOKUP %s", req_callid (req), + state->fuse_loc.loc.path); + + state->fuse_loc.loc.inode = dummy_inode (state->itable); + /* to differntiate in entry_cbk what kind of call it is */ + state->is_revalidate = -1; + } else { + gf_log ("glusterfs-fuse", GF_LOG_DEBUG, + "%"PRId64": LOOKUP %s(%"PRId64")", req_callid (req), + state->fuse_loc.loc.path, state->fuse_loc.loc.inode->ino); + state->is_revalidate = 1; + } + + FUSE_FOP (state, fuse_lookup_cbk, lookup, + &state->fuse_loc.loc, 0); +} + + +static void +fuse_forget (fuse_req_t req, + fuse_ino_t ino, + unsigned long nlookup) +{ + inode_t *fuse_inode; + fuse_state_t *state; + + if (ino == 1) { + fuse_reply_none (req); + return; + } + + state = state_from_req (req); + fuse_inode = inode_search (state->itable, ino, NULL); + inode_forget (fuse_inode, nlookup); + inode_unref (fuse_inode); + + free_state (state); + fuse_reply_none (req); +} + + +static int32_t +fuse_attr_cbk (call_frame_t *frame, + void *cookie, + xlator_t *this, + int32_t op_ret, + int32_t op_errno, + struct stat *buf) +{ + fuse_state_t *state; + fuse_req_t req; + + state = frame->root->state; + req = state->req; + + if (op_ret == 0) { + gf_log ("glusterfs-fuse", GF_LOG_DEBUG, + "%"PRId64": %s => %"PRId64, frame->root->unique, + state->fuse_loc.loc.path ? state->fuse_loc.loc.path : "ERR", + buf->st_ino); + /* TODO: make these timeouts configurable via meta */ + /* TODO: what if the inode number has changed by now */ + buf->st_blksize = BIG_FUSE_CHANNEL_SIZE; + fuse_reply_attr (req, buf, glusterfs_fuse_attr_timeout); + } else { + gf_log ("glusterfs-fuse", GF_LOG_ERROR, + "%"PRId64"; %s => -1 (%d)", frame->root->unique, + state->fuse_loc.loc.path ? 
state->fuse_loc.loc.path : "ERR", + op_errno); + fuse_reply_err (req, op_errno); + } + + free_state (state); + STACK_DESTROY (frame->root); + return 0; +} + + +static void +fuse_getattr (fuse_req_t req, + fuse_ino_t ino, + struct fuse_file_info *fi) +{ + fuse_state_t *state; + + state = state_from_req (req); + + if (ino == 1) { + fuse_loc_fill (&state->fuse_loc, state, ino, NULL); + if (state->fuse_loc.loc.inode) + state->is_revalidate = 1; + else + state->is_revalidate = -1; + FUSE_FOP (state, + fuse_lookup_cbk, lookup, &state->fuse_loc.loc, 0); + return; + } + + fuse_loc_fill (&state->fuse_loc, state, ino, NULL); + if (!state->fuse_loc.loc.inode) { + gf_log ("glusterfs-fuse", GF_LOG_ERROR, + "%"PRId64": GETATTR %"PRId64" (%s) (fuse_loc_fill() returned NULL inode)", + req_callid (req), (int64_t)ino, state->fuse_loc.loc.path); + fuse_reply_err (req, EINVAL); + return; + } + + if (list_empty (&state->fuse_loc.loc.inode->fds) || + S_ISDIR (state->fuse_loc.loc.inode->st_mode)) { + + gf_log ("glusterfs-fuse", GF_LOG_DEBUG, + "%"PRId64": GETATTR %"PRId64" (%s)", + req_callid (req), (int64_t)ino, state->fuse_loc.loc.path); + + FUSE_FOP (state, + fuse_attr_cbk, + stat, + &state->fuse_loc.loc); + } else { + fd_t *fd = list_entry (state->fuse_loc.loc.inode->fds.next, + fd_t, inode_list); + + gf_log ("glusterfs-fuse", GF_LOG_DEBUG, + "%"PRId64": FGETATTR %"PRId64" (%s/%p)", + req_callid (req), (int64_t)ino, state->fuse_loc.loc.path, fd); + + FUSE_FOP (state, + fuse_attr_cbk, + fstat, fd); + } +} + + +static int32_t +fuse_fd_cbk (call_frame_t *frame, + void *cookie, + xlator_t *this, + int32_t op_ret, + int32_t op_errno, + fd_t *fd) +{ + fuse_state_t *state; + fuse_req_t req; + + state = frame->root->state; + req = state->req; + fd = state->fd; + + if (op_ret >= 0) { + struct fuse_file_info fi = {0, }; + + LOCK (&fd->inode->lock); + list_add (&fd->inode_list, &fd->inode->fds); + UNLOCK (&fd->inode->lock); + + fi.fh = (unsigned long) fd; + fi.flags = state->flags; + + if 
(!S_ISDIR (fd->inode->st_mode)) { + if ((fi.flags & 3) && glusterfs_fuse_direct_io_mode) + fi.direct_io = 1; + } + + gf_log ("glusterfs-fuse", GF_LOG_DEBUG, + "%"PRId64": %s => %p", frame->root->unique, + state->fuse_loc.loc.path, fd); + + if (fuse_reply_open (req, &fi) == -ENOENT) { + gf_log ("glusterfs-fuse", GF_LOG_WARNING, "open() got EINTR"); + state->req = 0; + + if (S_ISDIR (fd->inode->st_mode)) + FUSE_FOP_NOREPLY (state, closedir, fd); + else + FUSE_FOP_NOREPLY (state, close, fd); + } + } else { + gf_log ("glusterfs-fuse", GF_LOG_ERROR, + "%"PRId64": %s => -1 (%d)", frame->root->unique, + state->fuse_loc.loc.path, op_errno); + fuse_reply_err (req, op_errno); + fd_destroy (fd); + } + + free_state (state); + STACK_DESTROY (frame->root); + return 0; +} + + + +static void +do_chmod (fuse_req_t req, + fuse_ino_t ino, + struct stat *attr, + struct fuse_file_info *fi) +{ + fuse_state_t *state = state_from_req (req); + + if (fi) { + gf_log ("glusterfs-fuse", GF_LOG_DEBUG, + "%"PRId64": FCHMOD %p", req_callid (req), FI_TO_FD (fi)); + + FUSE_FOP (state, + fuse_attr_cbk, + fchmod, + FI_TO_FD (fi), + attr->st_mode); + } else { + fuse_loc_fill (&state->fuse_loc, state, ino, NULL); + if (!state->fuse_loc.loc.inode) { + gf_log ("glusterfs-fuse", GF_LOG_ERROR, + "%"PRId64": CHMOD %"PRId64" (%s) (fuse_loc_fill() returned NULL inode)", + req_callid (req), (int64_t)ino, state->fuse_loc.loc.path); + fuse_reply_err (req, EINVAL); + return; + } + + + gf_log ("glusterfs-fuse", GF_LOG_DEBUG, + "%"PRId64": CHMOD %s", req_callid (req), + state->fuse_loc.loc.path); + + FUSE_FOP (state, + fuse_attr_cbk, + chmod, + &state->fuse_loc.loc, + attr->st_mode); + } +} + +static void +do_chown (fuse_req_t req, + fuse_ino_t ino, + struct stat *attr, + int valid, + struct fuse_file_info *fi) +{ + fuse_state_t *state; + + uid_t uid = (valid & FUSE_SET_ATTR_UID) ? attr->st_uid : (uid_t) -1; + gid_t gid = (valid & FUSE_SET_ATTR_GID) ? 
attr->st_gid : (gid_t) -1; + + state = state_from_req (req); + + if (fi) { + gf_log ("glusterfs-fuse", GF_LOG_DEBUG, + "%"PRId64": FCHOWN %p", req_callid (req), FI_TO_FD (fi)); + + FUSE_FOP (state, + fuse_attr_cbk, + fchown, + FI_TO_FD (fi), + uid, + gid); + } else { + fuse_loc_fill (&state->fuse_loc, state, ino, NULL); + if (!state->fuse_loc.loc.inode) { + gf_log ("glusterfs-fuse", GF_LOG_ERROR, + "%"PRId64": CHOWN %"PRId64" (%s) (fuse_loc_fill() returned NULL inode)", + req_callid (req), (int64_t)ino, state->fuse_loc.loc.path); + fuse_reply_err (req, EINVAL); + return; + } + + gf_log ("glusterfs-fuse", GF_LOG_DEBUG, + "%"PRId64": CHOWN %s", req_callid (req), + state->fuse_loc.loc.path); + + FUSE_FOP (state, + fuse_attr_cbk, + chown, + &state->fuse_loc.loc, + uid, + gid); + } +} + +static void +do_truncate (fuse_req_t req, + fuse_ino_t ino, + struct stat *attr, + struct fuse_file_info *fi) +{ + fuse_state_t *state; + + state = state_from_req (req); + + if (fi) { + gf_log ("glusterfs-fuse", GF_LOG_DEBUG, + "%"PRId64": FTRUNCATE %p/%"PRId64, req_callid (req), + FI_TO_FD (fi), attr->st_size); + + FUSE_FOP (state, + fuse_attr_cbk, + ftruncate, + FI_TO_FD (fi), + attr->st_size); + } else { + fuse_loc_fill (&state->fuse_loc, state, ino, NULL); + if (!state->fuse_loc.loc.inode) { + gf_log ("glusterfs-fuse", GF_LOG_ERROR, + "%"PRId64": TRUNCATE %s/%"PRId64" (fuse_loc_fill() returned NULL inode)", + req_callid (req), state->fuse_loc.loc.path, attr->st_size); + fuse_reply_err (req, EINVAL); + return; + } + + gf_log ("glusterfs-fuse", GF_LOG_DEBUG, + "%"PRId64": TRUNCATE %s/%"PRId64, req_callid (req), + state->fuse_loc.loc.path, attr->st_size); + + FUSE_FOP (state, + fuse_attr_cbk, + truncate, + &state->fuse_loc.loc, + attr->st_size); + } + + return; +} + +static void +do_utimes (fuse_req_t req, + fuse_ino_t ino, + struct stat *attr) +{ + fuse_state_t *state; + + struct timespec tv[2]; +#ifdef FUSE_STAT_HAS_NANOSEC + tv[0] = ST_ATIM(attr); + tv[1] = ST_MTIM(attr); +#else + 
tv[0].tv_sec = attr->st_atime; + tv[0].tv_nsec = 0; + tv[1].tv_sec = attr->st_mtime; + tv[1].tv_nsec = 0; +#endif + + state = state_from_req (req); + fuse_loc_fill (&state->fuse_loc, state, ino, NULL); + if (!state->fuse_loc.loc.inode) { + gf_log ("glusterfs-fuse", GF_LOG_ERROR, + "%"PRId64": UTIMENS %s (fuse_loc_fill() returned NULL inode)", + req_callid (req), state->fuse_loc.loc.path); + fuse_reply_err (req, EINVAL); + return; + } + + gf_log ("glusterfs-fuse", GF_LOG_DEBUG, + "%"PRId64": UTIMENS %s", req_callid (req), + state->fuse_loc.loc.path); + + FUSE_FOP (state, + fuse_attr_cbk, + utimens, + &state->fuse_loc.loc, + tv); +} + +static void +fuse_setattr (fuse_req_t req, + fuse_ino_t ino, + struct stat *attr, + int valid, + struct fuse_file_info *fi) +{ + + if (valid & FUSE_SET_ATTR_MODE) + do_chmod (req, ino, attr, fi); + else if (valid & (FUSE_SET_ATTR_UID | FUSE_SET_ATTR_GID)) + do_chown (req, ino, attr, valid, fi); + else if (valid & FUSE_SET_ATTR_SIZE) + do_truncate (req, ino, attr, fi); + else if ((valid & (FUSE_SET_ATTR_ATIME | FUSE_SET_ATTR_MTIME)) == (FUSE_SET_ATTR_ATIME | FUSE_SET_ATTR_MTIME)) + do_utimes (req, ino, attr); + + if (!valid) + fuse_getattr (req, ino, fi); +} + + +static int32_t +fuse_err_cbk (call_frame_t *frame, + void *cookie, + xlator_t *this, + int32_t op_ret, + int32_t op_errno) +{ + fuse_state_t *state = frame->root->state; + fuse_req_t req = state->req; + + if (op_ret == 0) { + gf_log ("glusterfs-fuse", GF_LOG_DEBUG, + "%"PRId64": %s => 0", frame->root->unique, + state->fuse_loc.loc.path ? state->fuse_loc.loc.path : "ERR"); + fuse_reply_err (req, 0); + } else { + gf_log ("glusterfs-fuse", GF_LOG_ERROR, + "%"PRId64": %s => -1 (%d)", frame->root->unique, + state->fuse_loc.loc.path ? 
state->fuse_loc.loc.path : "ERR", + op_errno); + fuse_reply_err (req, op_errno); + } + + if (state->fd) + fd_destroy (state->fd); + + free_state (state); + STACK_DESTROY (frame->root); + + return 0; +} + + + +static int32_t +fuse_unlink_cbk (call_frame_t *frame, + void *cookie, + xlator_t *this, + int32_t op_ret, + int32_t op_errno) +{ + fuse_state_t *state = frame->root->state; + fuse_req_t req = state->req; + + if (op_ret == 0) + inode_unlink (state->itable, state->fuse_loc.parent, state->fuse_loc.name); + + if (op_ret == 0) { + gf_log ("glusterfs-fuse", GF_LOG_DEBUG, + "%"PRId64": %s => 0", frame->root->unique, + state->fuse_loc.loc.path); + + fuse_reply_err (req, 0); + } else { + gf_log ("glusterfs-fuse", (op_errno == ENOTEMPTY) ? GF_LOG_DEBUG : GF_LOG_ERROR, + "%"PRId64": %s => -1 (%d)", frame->root->unique, + state->fuse_loc.loc.path, op_errno); + + fuse_reply_err (req, op_errno); + } + + free_state (state); + STACK_DESTROY (frame->root); + + return 0; +} + + +static void +fuse_access (fuse_req_t req, + fuse_ino_t ino, + int mask) +{ + fuse_state_t *state; + + state = state_from_req (req); + + fuse_loc_fill (&state->fuse_loc, state, ino, NULL); + if (!state->fuse_loc.loc.inode) { + gf_log ("glusterfs-fuse", GF_LOG_ERROR, + "%"PRId64": ACCESS %"PRId64" (%s) (fuse_loc_fill() returned NULL inode)", + req_callid (req), (int64_t)ino, state->fuse_loc.loc.path); + fuse_reply_err (req, EINVAL); + return; + } + + FUSE_FOP (state, + fuse_err_cbk, + access, + &state->fuse_loc.loc, + mask); + + return; +} + + + +static int32_t +fuse_readlink_cbk (call_frame_t *frame, + void *cookie, + xlator_t *this, + int32_t op_ret, + int32_t op_errno, + const char *linkname) +{ + fuse_state_t *state = frame->root->state; + fuse_req_t req = state->req; + + if (op_ret > 0) { + ((char *)linkname)[op_ret] = '\0'; + + gf_log ("glusterfs-fuse", GF_LOG_DEBUG, + "%"PRId64": %s => %s", frame->root->unique, + state->fuse_loc.loc.path, linkname); + + fuse_reply_readlink(req, linkname); + } else 
{ + gf_log ("glusterfs-fuse", GF_LOG_ERROR, + "%"PRId64": %s => -1 (%d)", frame->root->unique, + state->fuse_loc.loc.path, op_errno); + fuse_reply_err(req, op_errno); + } + + free_state (state); + STACK_DESTROY (frame->root); + + return 0; +} + +static void +fuse_readlink (fuse_req_t req, + fuse_ino_t ino) +{ + fuse_state_t *state; + + state = state_from_req (req); + fuse_loc_fill (&state->fuse_loc, state, ino, NULL); + if (!state->fuse_loc.loc.inode) { + gf_log ("glusterfs-fuse", GF_LOG_ERROR, + "%"PRId64" READLINK %s/%"PRId64" (fuse_loc_fill() returned NULL inode)", + req_callid (req), state->fuse_loc.loc.path, state->fuse_loc.loc.inode->ino); + fuse_reply_err (req, EINVAL); + return; + } + + gf_log ("glusterfs-fuse", GF_LOG_DEBUG, + "%"PRId64" READLINK %s/%"PRId64, req_callid (req), + state->fuse_loc.loc.path, state->fuse_loc.loc.inode->ino); + + FUSE_FOP (state, + fuse_readlink_cbk, + readlink, + &state->fuse_loc.loc, + 4096); + + return; +} + + +static void +fuse_mknod (fuse_req_t req, + fuse_ino_t par, + const char *name, + mode_t mode, + dev_t rdev) +{ + fuse_state_t *state; + + state = state_from_req (req); + fuse_loc_fill (&state->fuse_loc, state, par, name); + + state->fuse_loc.loc.inode = dummy_inode (state->itable); + + gf_log ("glusterfs-fuse", GF_LOG_DEBUG, + "%"PRId64": MKNOD %s", req_callid (req), + state->fuse_loc.loc.path); + + FUSE_FOP (state, + fuse_entry_cbk, + mknod, + &state->fuse_loc.loc, + mode, + rdev); + + return; +} + + +static void +fuse_mkdir (fuse_req_t req, + fuse_ino_t par, + const char *name, + mode_t mode) +{ + fuse_state_t *state; + + state = state_from_req (req); + fuse_loc_fill (&state->fuse_loc, state, par, name); + + state->fuse_loc.loc.inode = dummy_inode (state->itable); + + gf_log ("glusterfs-fuse", GF_LOG_DEBUG, + "%"PRId64": MKDIR %s", req_callid (req), + state->fuse_loc.loc.path); + + FUSE_FOP (state, + fuse_entry_cbk, + mkdir, + &state->fuse_loc.loc, + mode); + + return; +} + + +static void +fuse_unlink (fuse_req_t 
req, + fuse_ino_t par, + const char *name) +{ + fuse_state_t *state; + + state = state_from_req (req); + + gf_log ("glusterfs-fuse", GF_LOG_DEBUG, + "%"PRId64": UNLINK %s", req_callid (req), + state->fuse_loc.loc.path); + + fuse_loc_fill (&state->fuse_loc, state, par, name); + if (!state->fuse_loc.loc.inode) { + gf_log ("glusterfs-fuse", GF_LOG_ERROR, + "%"PRId64": UNLINK %s (fuse_loc_fill() returned NULL inode)", req_callid (req), + state->fuse_loc.loc.path); + fuse_reply_err (req, EINVAL); + return; + } + + FUSE_FOP (state, + fuse_unlink_cbk, + unlink, + &state->fuse_loc.loc); + + return; +} + + +static void +fuse_rmdir (fuse_req_t req, + fuse_ino_t par, + const char *name) +{ + fuse_state_t *state; + + state = state_from_req (req); + fuse_loc_fill (&state->fuse_loc, state, par, name); + if (!state->fuse_loc.loc.inode) { + gf_log ("glusterfs-fuse", GF_LOG_DEBUG, + "%"PRId64": RMDIR %s (fuse_loc_fill() returned NULL inode)", req_callid (req), + state->fuse_loc.loc.path); + fuse_reply_err (req, EINVAL); + return; + } + + gf_log ("glusterfs-fuse", GF_LOG_DEBUG, + "%"PRId64": RMDIR %s", req_callid (req), + state->fuse_loc.loc.path); + + FUSE_FOP (state, + fuse_unlink_cbk, + rmdir, + &state->fuse_loc.loc); + + return; +} + + +static void +fuse_symlink (fuse_req_t req, + const char *linkname, + fuse_ino_t par, + const char *name) +{ + fuse_state_t *state; + + state = state_from_req (req); + fuse_loc_fill (&state->fuse_loc, state, par, name); + + state->fuse_loc.loc.inode = dummy_inode (state->itable); + + gf_log ("glusterfs-fuse", GF_LOG_DEBUG, + "%"PRId64": SYMLINK %s -> %s", req_callid (req), + state->fuse_loc.loc.path, linkname); + + FUSE_FOP (state, + fuse_entry_cbk, + symlink, + linkname, + &state->fuse_loc.loc); + return; +} + + +int32_t +fuse_rename_cbk (call_frame_t *frame, + void *cookie, + xlator_t *this, + int32_t op_ret, + int32_t op_errno, + struct stat *buf) +{ + fuse_state_t *state = frame->root->state; + fuse_req_t req = state->req; + + if (op_ret == 0) 
{ + gf_log ("glusterfs-fuse", GF_LOG_DEBUG, + "%"PRId64": %s -> %s => 0", frame->root->unique, + state->fuse_loc.loc.path, + state->fuse_loc2.loc.path); + + inode_t *inode; + { + /* ugly ugly - to stay blind to situation where + rename happens on a new inode + */ + buf->st_ino = state->fuse_loc.loc.ino; + } + inode = inode_rename (state->itable, + state->fuse_loc.parent, + state->fuse_loc.name, + state->fuse_loc2.parent, + state->fuse_loc2.name, + buf); + + inode_unref (inode); + fuse_reply_err (req, 0); + } else { + gf_log ("glusterfs-fuse", GF_LOG_ERROR, + "%"PRId64": %s -> %s => -1 (%d)", frame->root->unique, + state->fuse_loc.loc.path, + state->fuse_loc2.loc.path, op_errno); + fuse_reply_err (req, op_errno); + } + + free_state (state); + STACK_DESTROY (frame->root); + return 0; +} + + +static void +fuse_rename (fuse_req_t req, + fuse_ino_t oldpar, + const char *oldname, + fuse_ino_t newpar, + const char *newname) +{ + fuse_state_t *state; + + state = state_from_req (req); + + fuse_loc_fill (&state->fuse_loc, state, oldpar, oldname); + if (!state->fuse_loc.loc.inode) { + gf_log ("glusterfs-fuse", GF_LOG_ERROR, + "for %s %"PRId64": RENAME `%s' -> `%s' (fuse_loc_fill() returned NULL inode)", + state->fuse_loc.loc.path, req_callid (req), state->fuse_loc.loc.path, + state->fuse_loc2.loc.path); + + fuse_reply_err (req, EINVAL); + return; + } + + fuse_loc_fill (&state->fuse_loc2, state, newpar, newname); + + gf_log ("glusterfs-fuse", GF_LOG_DEBUG, + "%"PRId64": RENAME `%s' -> `%s'", + req_callid (req), state->fuse_loc.loc.path, + state->fuse_loc2.loc.path); + + FUSE_FOP (state, + fuse_rename_cbk, + rename, + &state->fuse_loc.loc, + &state->fuse_loc2.loc); + + return; +} + + +static void +fuse_link (fuse_req_t req, + fuse_ino_t ino, + fuse_ino_t par, + const char *name) +{ + fuse_state_t *state; + + state = state_from_req (req); + + fuse_loc_fill (&state->fuse_loc, state, par, name); + fuse_loc_fill (&state->fuse_loc2, state, ino, NULL); + if 
(!state->fuse_loc2.loc.inode) { + gf_log ("glusterfs-fuse", GF_LOG_ERROR, + "fuse_loc_fill() returned NULL inode for %s %"PRId64": LINK %s %s", + state->fuse_loc2.loc.path, req_callid (req), + state->fuse_loc2.loc.path, state->fuse_loc.loc.path); + fuse_reply_err (req, EINVAL); + return; + } + + state->fuse_loc.loc.inode = inode_ref (state->fuse_loc2.loc.inode); + gf_log ("glusterfs-fuse", GF_LOG_DEBUG, + "%"PRId64": LINK %s %s", req_callid (req), + state->fuse_loc2.loc.path, state->fuse_loc.loc.path); + + FUSE_FOP (state, + fuse_entry_cbk, + link, + &state->fuse_loc2.loc, + state->fuse_loc.loc.path); + + return; +} + + +static int32_t +fuse_create_cbk (call_frame_t *frame, + void *cookie, + xlator_t *this, + int32_t op_ret, + int32_t op_errno, + fd_t *fd, + inode_t *inode, + struct stat *buf) +{ + fuse_state_t *state = frame->root->state; + fuse_req_t req = state->req; + + struct fuse_file_info fi = {0, }; + struct fuse_entry_param e = {0, }; + + fd = state->fd; + + fi.flags = state->flags; + if (op_ret >= 0) { + inode_t *fuse_inode; + fi.fh = (unsigned long) fd; + + if ((fi.flags & 3) && glusterfs_fuse_direct_io_mode) + fi.direct_io = 1; + + gf_log ("glusterfs-fuse", GF_LOG_DEBUG, + "%"PRId64": %s => %p", frame->root->unique, + state->fuse_loc.loc.path, fd); + + fuse_inode = inode_update (state->itable, + state->fuse_loc.parent, + state->fuse_loc.name, + buf); + if (fuse_inode->ctx) { + inode_unhash_name (state->itable, fuse_inode); + inode_unref (fuse_inode); + + fuse_inode = inode_update (state->itable, + state->fuse_loc.parent, + state->fuse_loc.name, + buf); + } + + + { + if (fuse_inode->ctx != inode->ctx) { + dict_t *swap = inode->ctx; + inode->ctx = fuse_inode->ctx; + fuse_inode->ctx = swap; + fuse_inode->generation = inode->generation; + fuse_inode->st_mode = buf->st_mode; + } + + inode_lookup (fuse_inode); + + /* list_del (&fd->inode_list); */ + + LOCK (&fuse_inode->lock); + list_add (&fd->inode_list, &fuse_inode->fds); + inode_unref (fd->inode); + 
fd->inode = inode_ref (fuse_inode); + UNLOCK (&fuse_inode->lock); + + // inode_destroy (inode); + } + + inode_unref (fuse_inode); + + e.ino = fuse_inode->ino; + e.generation = buf->st_ctime; + e.entry_timeout = glusterfs_fuse_entry_timeout; + e.attr_timeout = glusterfs_fuse_attr_timeout; + e.attr = *buf; + e.attr.st_blksize = BIG_FUSE_CHANNEL_SIZE; + + fi.keep_cache = 0; + + // if (fi.flags & 1) + // fi.direct_io = 1; + + if (fuse_reply_create (req, &e, &fi) == -ENOENT) { + gf_log ("glusterfs-fuse", GF_LOG_WARNING, "create() got EINTR"); + /* TODO: forget this node too */ + state->req = 0; + FUSE_FOP_NOREPLY (state, close, fd); + } + } else { + gf_log ("glusterfs-fuse", GF_LOG_ERROR, + "%"PRId64": %s => -1 (%d)", req_callid (req), + state->fuse_loc.loc.path, op_errno); + fuse_reply_err (req, op_errno); + fd_destroy (fd); + } + + free_state (state); + STACK_DESTROY (frame->root); + + return 0; +} + + +static void +fuse_create (fuse_req_t req, + fuse_ino_t par, + const char *name, + mode_t mode, + struct fuse_file_info *fi) +{ + fuse_state_t *state; + fd_t *fd; + + state = state_from_req (req); + state->flags = fi->flags; + + fuse_loc_fill (&state->fuse_loc, state, par, name); + state->fuse_loc.loc.inode = dummy_inode (state->itable); + + fd = fd_create (state->fuse_loc.loc.inode); + state->fd = fd; + + + LOCK (&fd->inode->lock); + list_del_init (&fd->inode_list); + UNLOCK (&fd->inode->lock); + + gf_log ("glusterfs-fuse", GF_LOG_DEBUG, + "%"PRId64": CREATE %s", req_callid (req), + state->fuse_loc.loc.path); + + FUSE_FOP (state, + fuse_create_cbk, + create, + &state->fuse_loc.loc, + state->flags, + mode, fd); + + return; +} + + +static void +fuse_open (fuse_req_t req, + fuse_ino_t ino, + struct fuse_file_info *fi) +{ + fuse_state_t *state; + fd_t *fd; + + state = state_from_req (req); + state->flags = fi->flags; + + fuse_loc_fill (&state->fuse_loc, state, ino, NULL); + if (!state->fuse_loc.loc.inode) { + gf_log ("glusterfs-fuse", GF_LOG_ERROR, + "%"PRId64": OPEN %s 
(fuse_loc_fill() returned NULL inode)", req_callid (req), + state->fuse_loc.loc.path); + + fuse_reply_err (req, EINVAL); + return; + } + + + fd = fd_create (state->fuse_loc.loc.inode); + state->fd = fd; + + LOCK (&fd->inode->lock); + list_del_init (&fd->inode_list); + UNLOCK (&fd->inode->lock); + + gf_log ("glusterfs-fuse", GF_LOG_DEBUG, + "%"PRId64": OPEN %s", req_callid (req), + state->fuse_loc.loc.path); + + FUSE_FOP (state, + fuse_fd_cbk, + open, + &state->fuse_loc.loc, + fi->flags, fd); + + return; +} + + +static int32_t +fuse_readv_cbk (call_frame_t *frame, + void *cookie, + xlator_t *this, + int32_t op_ret, + int32_t op_errno, + struct iovec *vector, + int32_t count, + struct stat *stbuf) +{ + fuse_state_t *state = frame->root->state; + fuse_req_t req = state->req; + + if (op_ret >= 0) { + gf_log ("glusterfs-fuse", GF_LOG_DEBUG, + "%"PRId64": READ => %d/%d,%"PRId64"/%"PRId64, frame->root->unique, + op_ret, state->size, state->off, stbuf->st_size); + + fuse_reply_vec (req, vector, count); + } else { + gf_log ("glusterfs-fuse", GF_LOG_ERROR, + "%"PRId64": READ => -1 (%d)", frame->root->unique, op_errno); + + fuse_reply_err (req, op_errno); + } + + free_state (state); + STACK_DESTROY (frame->root); + + return 0; +} + +static void +fuse_readv (fuse_req_t req, + fuse_ino_t ino, + size_t size, + off_t off, + struct fuse_file_info *fi) +{ + fuse_state_t *state; + + state = state_from_req (req); + state->size = size; + state->off = off; + + gf_log ("glusterfs-fuse", GF_LOG_DEBUG, + "%"PRId64": READ (%p, size=%d, offset=%"PRId64")", + req_callid (req), FI_TO_FD (fi), size, off); + + FUSE_FOP (state, + fuse_readv_cbk, + readv, + FI_TO_FD (fi), + size, + off); + +} + + +static int32_t +fuse_writev_cbk (call_frame_t *frame, + void *cookie, + xlator_t *this, + int32_t op_ret, + int32_t op_errno, + struct stat *stbuf) +{ + fuse_state_t *state = frame->root->state; + fuse_req_t req = state->req; + + if (op_ret >= 0) { + gf_log ("glusterfs-fuse", GF_LOG_DEBUG, + "%"PRId64": 
WRITE => %d/%d,%"PRId64"/%"PRId64, frame->root->unique, + op_ret, state->size, state->off, stbuf->st_size); + + fuse_reply_write (req, op_ret); + } else { + gf_log ("glusterfs-fuse", GF_LOG_ERROR, + "%"PRId64": WRITE => -1 (%d)", frame->root->unique, op_errno); + + fuse_reply_err (req, op_errno); + } + + free_state (state); + STACK_DESTROY (frame->root); + + return 0; +} + + +static void +fuse_write (fuse_req_t req, + fuse_ino_t ino, + const char *buf, + size_t size, + off_t off, + struct fuse_file_info *fi) +{ + fuse_state_t *state; + struct iovec vector; + + state = state_from_req (req); + state->size = size; + state->off = off; + + vector.iov_base = (void *)buf; + vector.iov_len = size; + + gf_log ("glusterfs-fuse", GF_LOG_DEBUG, + "%"PRId64": WRITE (%p, size=%d, offset=%"PRId64")", + req_callid (req), FI_TO_FD (fi), size, off); + + FUSE_FOP (state, + fuse_writev_cbk, + writev, + FI_TO_FD (fi), + &vector, + 1, + off); + return; +} + + +static void +fuse_flush (fuse_req_t req, + fuse_ino_t ino, + struct fuse_file_info *fi) +{ + fuse_state_t *state; + + state = state_from_req (req); + + gf_log ("glusterfs-fuse", GF_LOG_DEBUG, + "%"PRId64": FLUSH %p", req_callid (req), FI_TO_FD (fi)); + + FUSE_FOP (state, + fuse_err_cbk, + flush, + FI_TO_FD (fi)); + + return; +} + + +static void +fuse_release (fuse_req_t req, + fuse_ino_t ino, + struct fuse_file_info *fi) +{ + fuse_state_t *state; + + state = state_from_req (req); + state->fd = FI_TO_FD (fi); + + LOCK (&state->fd->inode->lock); + list_del_init (&state->fd->inode_list); + UNLOCK (&state->fd->inode->lock); + + gf_log ("glusterfs-fuse", GF_LOG_DEBUG, + "%"PRId64": CLOSE %p", req_callid (req), FI_TO_FD (fi)); + + FUSE_FOP (state, fuse_err_cbk, close, state->fd); + return; +} + + +static void +fuse_fsync (fuse_req_t req, + fuse_ino_t ino, + int datasync, + struct fuse_file_info *fi) +{ + fuse_state_t *state; + + state = state_from_req (req); + + gf_log ("glusterfs-fuse", GF_LOG_DEBUG, + "%"PRId64": FSYNC %p", req_callid 
(req), FI_TO_FD (fi)); + + FUSE_FOP (state, + fuse_err_cbk, + fsync, + FI_TO_FD (fi), + datasync); + + return; +} + +static void +fuse_opendir (fuse_req_t req, + fuse_ino_t ino, + struct fuse_file_info *fi) +{ + fuse_state_t *state; + fd_t *fd; + + state = state_from_req (req); + fuse_loc_fill (&state->fuse_loc, state, ino, NULL); + if (!state->fuse_loc.loc.inode) { + gf_log ("glusterfs-fuse", GF_LOG_ERROR, + "%"PRId64": OPEN %s (fuse_loc_fill() returned NULL inode)", req_callid (req), + state->fuse_loc.loc.path); + + fuse_reply_err (req, EINVAL); + return; + } + + + fd = fd_create (state->fuse_loc.loc.inode); + state->fd = fd; + + LOCK (&fd->inode->lock); + list_del_init (&fd->inode_list); + UNLOCK (&fd->inode->lock); + + gf_log ("glusterfs-fuse", GF_LOG_DEBUG, + "%"PRId64": OPEN %s", req_callid (req), + state->fuse_loc.loc.path); + + FUSE_FOP (state, + fuse_fd_cbk, + opendir, + &state->fuse_loc.loc, fd); +} + +#if 0 + +void +fuse_dir_reply (fuse_req_t req, + size_t size, + off_t off, + fd_t *fd) +{ + char *buf; + size_t size_limited; + data_t *buf_data; + + buf_data = dict_get (fd->ctx, "__fuse__getdents__internal__@@!!"); + buf = buf_data->data; + size_limited = size; + + if (size_limited > (buf_data->len - off)) + size_limited = (buf_data->len - off); + + if (off > buf_data->len) { + size_limited = 0; + off = 0; + } + + fuse_reply_buf (req, buf + off, size_limited); +} + + +static int32_t +fuse_getdents_cbk (call_frame_t *frame, + void *cookie, + xlator_t *this, + int32_t op_ret, + int32_t op_errno, + dir_entry_t *entries, + int32_t count) +{ + fuse_state_t *state = frame->root->state; + fuse_req_t req = state->req; + + if (op_ret < 0) { + gf_log ("glusterfs-fuse", GF_LOG_ERROR, + "%"PRId64": READDIR => -1 (%d)", + frame->root->unique, op_errno); + + fuse_reply_err (state->req, op_errno); + } else { + dir_entry_t *trav; + size_t size = 0; + char *buf; + data_t *buf_data; + + gf_log ("glusterfs-fuse", GF_LOG_DEBUG, + "%"PRId64": READDIR => %d entries", + 
frame->root->unique, count); + + for (trav = entries->next; trav; trav = trav->next) { + size += fuse_add_direntry (req, NULL, 0, trav->name, NULL, 0); + } + + buf = CALLOC (1, size); + ERR_ABORT (buf); + buf_data = data_from_dynptr (buf, size); + size = 0; + + for (trav = entries->next; trav; trav = trav->next) { + size_t entry_size; + entry_size = fuse_add_direntry (req, NULL, 0, trav->name, NULL, 0); + fuse_add_direntry (req, buf + size, entry_size, trav->name, + &trav->buf, entry_size + size); + size += entry_size; + } + + dict_set (state->fd->ctx, + "__fuse__getdents__internal__@@!!", + buf_data); + + fuse_dir_reply (state->req, state->size, state->off, state->fd); + } + + free_state (state); + STACK_DESTROY (frame->root); + + return 0; +} + +static void +fuse_getdents (fuse_req_t req, + fuse_ino_t ino, + struct fuse_file_info *fi, + size_t size, + off_t off, + int32_t flag) +{ + fuse_state_t *state; + fd_t *fd = FI_TO_FD (fi); + + gf_log ("glusterfs-fuse", GF_LOG_DEBUG, + "%"PRId64": GETDENTS %p", req_callid (req), FI_TO_FD (fi)); + + if (!off) + dict_del (fd->ctx, "__fuse__getdents__internal__@@!!"); + + if (dict_get (fd->ctx, "__fuse__getdents__internal__@@!!")) { + fuse_dir_reply (req, size, off, fd); + return; + } + + state = state_from_req (req); + + state->size = size; + state->off = off; + state->fd = fd; + + FUSE_FOP (state, + fuse_getdents_cbk, + getdents, + fd, + size, + off, + 0); +} + +#endif + +static int32_t +fuse_readdir_cbk (call_frame_t *frame, + void *cookie, + xlator_t *this, + int32_t op_ret, + int32_t op_errno, + gf_dirent_t *buf) +{ + fuse_state_t *state = frame->root->state; + fuse_req_t req = state->req; + + if (op_ret >= 0) { + gf_log ("glusterfs-fuse", GF_LOG_DEBUG, + "%"PRId64": READDIR => %d/%d,%"PRId64, frame->root->unique, + op_ret, state->size, state->off); + + fuse_reply_buf (req, (void *)buf, op_ret); + } else { + gf_log ("glusterfs-fuse", GF_LOG_ERROR, + "%"PRId64": READDIR => -1 (%d)", frame->root->unique, op_errno); + + 
fuse_reply_err (req, op_errno); + } + + free_state (state); + STACK_DESTROY (frame->root); + + return 0; + +} + +static void +fuse_readdir (fuse_req_t req, + fuse_ino_t ino, + size_t size, + off_t off, + struct fuse_file_info *fi) +{ + fuse_state_t *state; + + state = state_from_req (req); + state->size = size; + state->off = off; + + gf_log ("glusterfs-fuse", GF_LOG_DEBUG, + "%"PRId64": READDIR (%p, size=%d, offset=%"PRId64")", + req_callid (req), FI_TO_FD (fi), size, off); + + FUSE_FOP (state, + fuse_readdir_cbk, + readdir, + FI_TO_FD (fi), + size, + off); +} + + +static void +fuse_releasedir (fuse_req_t req, + fuse_ino_t ino, + struct fuse_file_info *fi) +{ + fuse_state_t *state; + + state = state_from_req (req); + state->fd = FI_TO_FD (fi); + + LOCK (&state->fd->inode->lock); + list_del_init (&state->fd->inode_list); + UNLOCK (&state->fd->inode->lock); + + gf_log ("glusterfs-fuse", GF_LOG_DEBUG, + "%"PRId64": CLOSEDIR %p", req_callid (req), FI_TO_FD (fi)); + + FUSE_FOP (state, fuse_err_cbk, closedir, state->fd); +} + + +static void +fuse_fsyncdir (fuse_req_t req, + fuse_ino_t ino, + int datasync, + struct fuse_file_info *fi) +{ + fuse_state_t *state; + + state = state_from_req (req); + + FUSE_FOP (state, + fuse_err_cbk, + fsyncdir, + FI_TO_FD (fi), + datasync); + + return; +} + + +static int32_t +fuse_statfs_cbk (call_frame_t *frame, + void *cookie, + xlator_t *this, + int32_t op_ret, + int32_t op_errno, + struct statvfs *buf) +{ + fuse_state_t *state = frame->root->state; + fuse_req_t req = state->req; + + /* + Filesystems (like ZFS on solaris) reports + different ->f_frsize and ->f_bsize. Old coreutils + df tools use statfs() and do not see ->f_frsize. + the ->f_blocks, ->f_bavail and ->f_bfree are + w.r.t ->f_frsize and not ->f_bsize which makes the + df tools report wrong values. + + Scale the block counts to match ->f_bsize. 
+ */ + /* TODO: with old coreutils, f_bsize is taken from stat()'s st_blksize + * so the df with old coreutils this wont work :( + */ + + if (op_ret == 0) { + + buf->f_blocks *= buf->f_frsize; + buf->f_blocks /= BIG_FUSE_CHANNEL_SIZE; + + buf->f_bavail *= buf->f_frsize; + buf->f_bavail /= BIG_FUSE_CHANNEL_SIZE; + + buf->f_bfree *= buf->f_frsize; + buf->f_bfree /= BIG_FUSE_CHANNEL_SIZE; + + buf->f_frsize = buf->f_bsize = BIG_FUSE_CHANNEL_SIZE; + + fuse_reply_statfs (req, buf); + + } else { + gf_log ("glusterfs-fuse", GF_LOG_ERROR, + "%"PRId64": ERR => -1 (%d)", frame->root->unique, op_errno); + fuse_reply_err (req, op_errno); + } + + free_state (state); + STACK_DESTROY (frame->root); + + return 0; +} + + +static void +fuse_statfs (fuse_req_t req, + fuse_ino_t ino) +{ + fuse_state_t *state; + + state = state_from_req (req); + fuse_loc_fill (&state->fuse_loc, state, 1, NULL); + if (!state->fuse_loc.loc.inode) { + gf_log ("glusterfs-fuse", GF_LOG_ERROR, + "%"PRId64": STATFS (fuse_loc_fill() returned NULL inode)", req_callid (req)); + + fuse_reply_err (req, EINVAL); + return; + } + + gf_log ("glusterfs-fuse", GF_LOG_DEBUG, + "%"PRId64": STATFS", req_callid (req)); + + FUSE_FOP (state, + fuse_statfs_cbk, + statfs, + &state->fuse_loc.loc); +} + +static void +fuse_setxattr (fuse_req_t req, + fuse_ino_t ino, + const char *name, + const char *value, + size_t size, + int flags) +{ + fuse_state_t *state; + + state = state_from_req (req); + state->size = size; + fuse_loc_fill (&state->fuse_loc, state, ino, NULL); + if (!state->fuse_loc.loc.inode) { + gf_log ("glusterfs-fuse", GF_LOG_ERROR, + "%"PRId64": SETXATTR %s/%"PRId64" (%s) (fuse_loc_fill() returned NULL inode)", + req_callid (req), + state->fuse_loc.loc.path, (int64_t)ino, name); + + fuse_reply_err (req, EINVAL); + return; + } + + state->dict = get_new_dict (); + + dict_set (state->dict, (char *)name, + bin_to_data ((void *)value, size)); + dict_ref (state->dict); + + gf_log ("glusterfs-fuse", GF_LOG_DEBUG, + "%"PRId64": 
SETXATTR %s/%"PRId64" (%s)", req_callid (req), + state->fuse_loc.loc.path, (int64_t)ino, name); + + FUSE_FOP (state, + fuse_err_cbk, + setxattr, + &state->fuse_loc.loc, + state->dict, + flags); + + return; +} + + +static int32_t +fuse_xattr_cbk (call_frame_t *frame, + void *cookie, + xlator_t *this, + int32_t op_ret, + int32_t op_errno, + dict_t *dict) +{ + int32_t ret = op_ret; + char *value = ""; + fuse_state_t *state = frame->root->state; + fuse_req_t req = state->req; + + if (ret >= 0) { + gf_log ("glusterfs-fuse", GF_LOG_DEBUG, + "%"PRId64": %s => %d", frame->root->unique, + state->fuse_loc.loc.path, op_ret); + + /* if successful */ + if (state->name) { + /* if callback for getxattr */ + data_t *value_data = dict_get (dict, state->name); + if (value_data) { + ret = value_data->len; /* Don't return the value for '\0' */ + value = value_data->data; + + if (state->size) { + /* if callback for getxattr and asks for value */ + fuse_reply_buf (req, value, ret); + } else { + /* if callback for getxattr and asks for value length only */ + fuse_reply_xattr (req, ret); + } + } else { + fuse_reply_err (req, ENODATA); + } + } else { + /* if callback for listxattr */ + int32_t len = 0; + data_pair_t *trav = dict->members_list; + while (trav) { + len += strlen (trav->key) + 1; + trav = trav->next; + } + value = alloca (len + 1); + ERR_ABORT (value); + len = 0; + trav = dict->members_list; + while (trav) { + strcpy (value + len, trav->key); + value[len + strlen(trav->key)] = '\0'; + len += strlen (trav->key) + 1; + trav = trav->next; + } + if (state->size) { + /* if callback for listxattr and asks for list of keys */ + fuse_reply_buf (req, value, len); + } else { + /* if callback for listxattr and asks for length of keys only */ + fuse_reply_xattr (req, len); + } + } + } else { + /* if failure - no need to check if listxattr or getxattr */ + if (op_errno != ENODATA) { + gf_log ("glusterfs-fuse", GF_LOG_ERROR, + "%"PRId64": %s => -1 (%d)", frame->root->unique, + 
state->fuse_loc.loc.path, op_errno); + } else { + gf_log ("glusterfs-fuse", GF_LOG_DEBUG, + "%"PRId64": %s => -1 (%d)", frame->root->unique, + state->fuse_loc.loc.path, op_errno); + } + + fuse_reply_err (req, op_errno); + } + + free_state (state); + STACK_DESTROY (frame->root); + + return 0; +} + + +static void +fuse_getxattr (fuse_req_t req, + fuse_ino_t ino, + const char *name, + size_t size) +{ + fuse_state_t *state; + + state = state_from_req (req); + state->size = size; + state->name = strdup (name); + fuse_loc_fill (&state->fuse_loc, state, ino, NULL); + if (!state->fuse_loc.loc.inode) { + gf_log ("glusterfs-fuse", GF_LOG_ERROR, + "%"PRId64": GETXATTR %s/%"PRId64" (%s) (fuse_loc_fill() returned NULL inode)", + req_callid (req), state->fuse_loc.loc.path, (int64_t)ino, name); + + fuse_reply_err (req, EINVAL); + return; + } + + gf_log ("glusterfs-fuse", GF_LOG_DEBUG, + "%"PRId64": GETXATTR %s/%"PRId64" (%s)", req_callid (req), + state->fuse_loc.loc.path, (int64_t)ino, name); + + FUSE_FOP (state, + fuse_xattr_cbk, + getxattr, + &state->fuse_loc.loc); + + return; +} + + +static void +fuse_listxattr (fuse_req_t req, + fuse_ino_t ino, + size_t size) +{ + fuse_state_t *state; + + state = state_from_req (req); + state->size = size; + fuse_loc_fill (&state->fuse_loc, state, ino, NULL); + if (!state->fuse_loc.loc.inode) { + gf_log ("glusterfs-fuse", GF_LOG_ERROR, + "%"PRId64": LISTXATTR %s/%"PRId64" (fuse_loc_fill() returned NULL inode)", + req_callid (req), state->fuse_loc.loc.path, (int64_t)ino); + + fuse_reply_err (req, EINVAL); + return; + } + + gf_log ("glusterfs-fuse", GF_LOG_DEBUG, + "%"PRId64": LISTXATTR %s/%"PRId64, req_callid (req), + state->fuse_loc.loc.path, (int64_t)ino); + + FUSE_FOP (state, + fuse_xattr_cbk, + getxattr, + &state->fuse_loc.loc); + + return; +} + + +static void +fuse_removexattr (fuse_req_t req, + fuse_ino_t ino, + const char *name) + +{ + fuse_state_t *state; + + state = state_from_req (req); + fuse_loc_fill (&state->fuse_loc, state, ino, 
NULL); + if (!state->fuse_loc.loc.inode) { + gf_log ("glusterfs-fuse", GF_LOG_ERROR, + "%"PRId64": REMOVEXATTR %s/%"PRId64" (%s) (fuse_loc_fill() returned NULL inode)", + req_callid (req), state->fuse_loc.loc.path, (int64_t)ino, name); + + fuse_reply_err (req, EINVAL); + return; + } + + gf_log ("glusterfs-fuse", GF_LOG_DEBUG, + "%"PRId64": REMOVEXATTR %s/%"PRId64" (%s)", req_callid (req), + state->fuse_loc.loc.path, (int64_t)ino, name); + + FUSE_FOP (state, + fuse_err_cbk, + removexattr, + &state->fuse_loc.loc, + name); + + return; +} + +static int32_t +fuse_getlk_cbk (call_frame_t *frame, + void *cookie, + xlator_t *this, + int32_t op_ret, + int32_t op_errno, + struct flock *lock) +{ + fuse_state_t *state = frame->root->state; + + if (op_ret == 0) { + gf_log ("glusterfs-fuse", GF_LOG_DEBUG, + "%"PRId64": ERR => 0", frame->root->unique); + fuse_reply_lock (state->req, lock); + } else { + gf_log ("glusterfs-fuse", GF_LOG_ERROR, + "%"PRId64": ERR => -1 (%d)", frame->root->unique, op_errno); + fuse_reply_err (state->req, op_errno); + } + + free_state (state); + STACK_DESTROY (frame->root); + + return 0; +} + +static void +fuse_getlk (fuse_req_t req, + fuse_ino_t ino, + struct fuse_file_info *fi, + struct flock *lock) +{ + fuse_state_t *state; + + state = state_from_req (req); + state->req = req; + + gf_log ("glusterfs-fuse", GF_LOG_DEBUG, + "%"PRId64": GETLK %p", req_callid (req), FI_TO_FD (fi)); + + FUSE_FOP (state, + fuse_getlk_cbk, + lk, + FI_TO_FD (fi), + F_GETLK, + lock); + + return; +} + +static int32_t +fuse_setlk_cbk (call_frame_t *frame, + void *cookie, + xlator_t *this, + int32_t op_ret, + int32_t op_errno, + struct flock *lock) +{ + fuse_state_t *state = frame->root->state; + + if (op_ret == 0) { + gf_log ("glusterfs-fuse", GF_LOG_DEBUG, + "%"PRId64": ERR => 0", frame->root->unique); + fuse_reply_err (state->req, 0); + } else { + gf_log ("glusterfs-fuse", GF_LOG_ERROR, + "%"PRId64": ERR => -1 (%d)", frame->root->unique, op_errno); + fuse_reply_err 
(state->req, op_errno); + } + + free_state (state); + STACK_DESTROY (frame->root); + + return 0; +} + +static void +fuse_setlk (fuse_req_t req, + fuse_ino_t ino, + struct fuse_file_info *fi, + struct flock *lock, + int sleep) +{ + fuse_state_t *state; + + state = state_from_req (req); + state->req = req; + + gf_log ("glusterfs-fuse", GF_LOG_DEBUG, + "%"PRId64": SETLK %p (sleep=%d)", req_callid (req), FI_TO_FD (fi), + sleep); + + FUSE_FOP (state, + fuse_setlk_cbk, + lk, + FI_TO_FD(fi), + (sleep ? F_SETLKW : F_SETLK), + lock); + + return; +} + + +int32_t +fuse_forget_notify (call_frame_t *frame, xlator_t *this, + inode_t *inode) +{ + return 0; +} + +struct xlator_fops fuse_xl_fops = { + .forget = fuse_forget_notify +}; + +static void +fuse_init (void *data, struct fuse_conn_info *conn) +{ + transport_t *trans = data; + struct fuse_private *priv = trans->private; + xlator_t *xl = trans->xl; + int32_t ret; + + xl->name = "fuse"; + xl->fops = &fuse_xl_fops; + xl->itable = inode_table_new (0, xl); + xl->notify = default_notify; + ret = xlator_tree_init (xl); + if (ret == 0) { + + } else { + fuse_unmount (priv->mountpoint, priv->ch); + exit (1); + } +} + + +static void +fuse_destroy (void *data) +{ + +} + +struct fuse_lowlevel_ops fuse_ops = { + .init = fuse_init, + .destroy = fuse_destroy, + .lookup = fuse_lookup, + .forget = fuse_forget, + .getattr = fuse_getattr, + .setattr = fuse_setattr, + .opendir = fuse_opendir, + .readdir = fuse_readdir, + .releasedir = fuse_releasedir, + .access = fuse_access, + .readlink = fuse_readlink, + .mknod = fuse_mknod, + .mkdir = fuse_mkdir, + .unlink = fuse_unlink, + .rmdir = fuse_rmdir, + .symlink = fuse_symlink, + .rename = fuse_rename, + .link = fuse_link, + .create = fuse_create, + .open = fuse_open, + .read = fuse_readv, + .write = fuse_write, + .flush = fuse_flush, + .release = fuse_release, + .fsync = fuse_fsync, + .fsyncdir = fuse_fsyncdir, + .statfs = fuse_statfs, + .setxattr = fuse_setxattr, + .getxattr = fuse_getxattr, + 
.listxattr = fuse_listxattr, + .removexattr = fuse_removexattr, + .getlk = fuse_getlk, + .setlk = fuse_setlk +}; + + +static int32_t +fuse_transport_disconnect (transport_t *this) +{ + struct fuse_private *priv = this->private; + + gf_log ("glusterfs-fuse", + GF_LOG_DEBUG, + "cleaning up fuse transport in disconnect handler"); + + fuse_session_remove_chan (priv->ch); + fuse_session_destroy (priv->se); + fuse_unmount (priv->mountpoint, priv->ch); + + FREE (priv); + priv = NULL; + this->private = NULL; + + /* TODO: need graceful exit. every xlator should be ->fini()'ed + and come out of main poll loop cleanly + */ + exit (0); + + return -1; +} + + +static int32_t +fuse_transport_init (transport_t *this, + dict_t *options, + event_notify_fn_t notify) +{ + char *mountpoint = strdup (data_to_str (dict_get (options, + "mountpoint"))); + char *source; + asprintf (&source, "fsname=glusterfs"); + char *argv[] = { "glusterfs", + +#ifndef GF_DARWIN_HOST_OS + "-o", "nonempty", +#endif + "-o", "allow_other", + "-o", "default_permissions", + "-o", source, + "-o", "max_readahead=1048576", + "-o", "max_read=1048576", + "-o", "max_write=1048576", + NULL }; +#ifdef GF_DARWIN_HOST_OS + int argc = 13; +#else + int argc = 15; +#endif + + struct fuse_args args = FUSE_ARGS_INIT(argc, + argv); + struct fuse_private *priv = NULL; + int32_t res; + + priv = CALLOC (1, sizeof (*priv)); + ERR_ABORT (priv); + + + this->notify = notify; + this->private = (void *)priv; + + priv->ch = fuse_mount (mountpoint, &args); + if (!priv->ch) { + gf_log ("glusterfs-fuse", + GF_LOG_ERROR, "fuse_mount failed (%s)\n", strerror (errno)); + fuse_opt_free_args(&args); + goto err_free; + } + + priv->se = fuse_lowlevel_new (&args, &fuse_ops, sizeof (fuse_ops), this); + fuse_opt_free_args(&args); + + res = fuse_set_signal_handlers (priv->se); + if (res == -1) { + gf_log ("glusterfs-fuse", GF_LOG_ERROR, "fuse_set_signal_handlers failed"); + goto err; + } + + fuse_session_add_chan (priv->se, priv->ch); + + priv->fd = 
fuse_chan_fd (priv->ch); + this->buf = data_ref (data_from_dynptr (NULL, 0)); + this->buf->is_locked = 1; + + priv->mountpoint = mountpoint; + + transport_ref (this); + //poll_register (this->xl_private, priv->fd, this); + + return 0; + + err: + fuse_unmount (mountpoint, priv->ch); + err_free: + FREE (mountpoint); + mountpoint = NULL; + return -1; +} + +void +guts_log_req (void *, int32_t); + +static void * +fuse_thread_proc (void *data) +{ + transport_t *trans = data; + struct fuse_private *priv = trans->private; + int32_t res = 0; + data_t *buf = trans->buf; + int32_t ref = 0; + size_t chan_size = fuse_chan_bufsize (priv->ch); + char *recvbuf = CALLOC (1, chan_size); + ERR_ABORT (recvbuf); + + while (!fuse_session_exited (priv->se)) { + int32_t fuse_chan_receive (struct fuse_chan * ch, + char *buf, + int32_t size); + + + res = fuse_chan_receive (priv->ch, + recvbuf, + chan_size); + + if (res == -1) { + transport_disconnect (trans); + } + + buf = trans->buf; + + if (res && res != -1) { + if (buf->len < (res)) { + if (buf->data) { + FREE (buf->data); + buf->data = NULL; + } + buf->data = CALLOC (1, res); + ERR_ABORT (buf->data); + buf->len = res; + } + memcpy (buf->data, recvbuf, res); // evil evil + guts_log_req (buf->data, res); + fuse_session_process (priv->se, + buf->data, + res, + priv->ch); + } + + LOCK (&buf->lock); + ref = buf->refcount; + UNLOCK (&buf->lock); + if (1) { + data_unref (buf); + + trans->buf = data_ref (data_from_dynptr (NULL, 0)); + trans->buf->is_locked = 1; + } + } + + exit (0); + + return NULL; +} + + +static int32_t +fuse_transport_notify (xlator_t *xl, + int32_t event, + void *data, + ...) 
+{ + transport_t *trans = data; + struct fuse_private *priv = trans->private; + int32_t res = 0; + data_t *buf; + int32_t ref = 0; + + if (event == GF_EVENT_POLLERR) { + gf_log ("glusterfs-fuse", GF_LOG_ERROR, "got GF_EVENT_POLLERR"); + transport_disconnect (trans); + return -1; + } + + if (event != GF_EVENT_POLLIN) { + gf_log ("glusterfs-fuse", GF_LOG_WARNING, "Ignoring notify event %d", + event); + return 0; + } + + if (!fuse_session_exited(priv->se)) { + static size_t chan_size = 0; + + int32_t fuse_chan_receive (struct fuse_chan * ch, + char *buf, + int32_t size); + if (!chan_size) + chan_size = fuse_chan_bufsize (priv->ch) ; + + buf = trans->buf; + + if (!buf->data) { + buf->data = MALLOC (chan_size); + ERR_ABORT (buf->data); + buf->len = chan_size; + } + + res = fuse_chan_receive (priv->ch, + buf->data, + chan_size); + /* if (res == -1) { + transport_destroy (trans); + */ + if (res && res != -1) { + /* trace the request and log it to tio file */ + guts_log_req (buf->data, res); + fuse_session_process (priv->se, + buf->data, + res, + priv->ch); + } + + LOCK (&buf->lock); + ref = buf->refcount; + UNLOCK (&buf->lock); + /* TODO do the check with a lock */ + if (ref > 1) { + data_unref (buf); + + // trans->buf = data_ref (data_from_dynptr (malloc (fuse_chan_bufsize (priv->ch)), + trans->buf = data_ref (data_from_dynptr (NULL, 0)); + trans->buf->data = MALLOC (chan_size); + ERR_ABORT (trans->buf->data); + trans->buf->len = chan_size; + trans->buf->is_locked = 1; + } + } else { + transport_disconnect (trans); + } + + /* + if (fuse_session_exited (priv->se)) { + transport_destroy (trans); + res = -1; + }*/ + + return res >= 0 ? 
0 : res; +} + +static void +fuse_transport_fini (transport_t *this) +{ + +} + +static struct transport_ops fuse_transport_ops = { + .disconnect = fuse_transport_disconnect, +}; + +static transport_t fuse_transport = { + .ops = &fuse_transport_ops, + .private = NULL, + .xl = NULL, + .init = fuse_transport_init, + .fini = fuse_transport_fini, + .notify = fuse_transport_notify +}; + + +transport_t * +glusterfs_mount (glusterfs_ctx_t *ctx, + const char *mount_point) +{ + dict_t *options = get_new_dict (); + transport_t *new_fuse = CALLOC (1, sizeof (*new_fuse)); + ERR_ABORT (new_fuse); + + memcpy (new_fuse, &fuse_transport, sizeof (*new_fuse)); + new_fuse->ops = &fuse_transport_ops; + new_fuse->xl_private = ctx; + + dict_set (options, + "mountpoint", + str_to_data ((char *)mount_point)); + + return (new_fuse->init (new_fuse, + options, + fuse_transport_notify) == 0 ? new_fuse : NULL); +} + +int32_t +fuse_thread (pthread_t *thread, void *data) +{ + return pthread_create (thread, NULL, fuse_thread_proc, data); +} + + diff --git a/glusterfs-guts/src/fuse-extra.c b/glusterfs-guts/src/fuse-extra.c new file mode 100644 index 000000000..93574d174 --- /dev/null +++ b/glusterfs-guts/src/fuse-extra.c @@ -0,0 +1,137 @@ +/* + Copyright (c) 2006, 2007, 2008 Z RESEARCH, Inc. + This file is part of GlusterFS. + + GlusterFS is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published + by the Free Software Foundation; either version 3 of the License, + or (at your option) any later version. + + GlusterFS is distributed in the hope that it will be useful, but + WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program. If not, see + . 
+*/ + +#ifndef _CONFIG_H +#define _CONFIG_H +#include "config.h" +#endif /* _CONFIG_H */ + +#include "fuse-extra.h" +#include "common-utils.h" +#include +#include +#include +#include +#include "common-utils.h" + +struct fuse_req; +struct fuse_ll; + +struct fuse_req { + struct fuse_ll *f; + uint64_t unique; + int ctr; + pthread_mutex_t lock; + struct fuse_ctx ctx; + struct fuse_chan *ch; + int interrupted; + union { + struct { + uint64_t unique; + } i; + struct { + fuse_interrupt_func_t func; + void *data; + } ni; + } u; + struct fuse_req *next; + struct fuse_req *prev; +}; + +struct fuse_ll { + int debug; + int allow_root; + struct fuse_lowlevel_ops op; + int got_init; + void *userdata; + uid_t owner; + struct fuse_conn_info conn; + struct fuse_req list; + struct fuse_req interrupts; + pthread_mutex_t lock; + int got_destroy; +}; + +struct fuse_out_header { + uint32_t len; + int32_t error; + uint64_t unique; +}; + +uint64_t req_callid (fuse_req_t req) +{ + return req->unique; +} + +static void destroy_req(fuse_req_t req) +{ + pthread_mutex_destroy (&req->lock); + FREE (req); +} + +static void list_del_req(struct fuse_req *req) +{ + struct fuse_req *prev = req->prev; + struct fuse_req *next = req->next; + prev->next = next; + next->prev = prev; +} + +static void +free_req (fuse_req_t req) +{ + int ctr; + struct fuse_ll *f = req->f; + + pthread_mutex_lock(&req->lock); + req->u.ni.func = NULL; + req->u.ni.data = NULL; + pthread_mutex_unlock(&req->lock); + + pthread_mutex_lock(&f->lock); + list_del_req(req); + ctr = --req->ctr; + pthread_mutex_unlock(&f->lock); + if (!ctr) + destroy_req(req); +} + +int32_t +fuse_reply_vec (fuse_req_t req, + struct iovec *vector, + int32_t count) +{ + int32_t error = 0; + struct fuse_out_header out; + struct iovec *iov; + int res; + + iov = alloca ((count + 1) * sizeof (*vector)); + out.unique = req->unique; + out.error = error; + iov[0].iov_base = &out; + iov[0].iov_len = sizeof(struct fuse_out_header); + memcpy (&iov[1], vector, count 
* sizeof (*vector)); + count++; + out.len = iov_length(iov, count); + res = fuse_chan_send(req->ch, iov, count); + free_req(req); + + return res; +} diff --git a/glusterfs-guts/src/fuse-extra.h b/glusterfs-guts/src/fuse-extra.h new file mode 100644 index 000000000..c7d2877c0 --- /dev/null +++ b/glusterfs-guts/src/fuse-extra.h @@ -0,0 +1,38 @@ +/* + Copyright (c) 2007, 2008 Z RESEARCH, Inc. + This file is part of GlusterFS. + + GlusterFS is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published + by the Free Software Foundation; either version 3 of the License, + or (at your option) any later version. + + GlusterFS is distributed in the hope that it will be useful, but + WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program. If not, see + . +*/ + +#ifndef _FUSE_EXTRA_H +#define _FUSE_EXTRA_H + +#ifndef _CONFIG_H +#define _CONFIG_H +#include "config.h" +#endif /* _CONFIG_H */ + +#include +#include + +uint64_t req_callid (fuse_req_t req); + +int32_t +fuse_reply_vec (fuse_req_t req, + struct iovec *vector, + int32_t count); + +#endif /* _FUSE_EXTRA_H */ diff --git a/glusterfs-guts/src/fuse_kernel.h b/glusterfs-guts/src/fuse_kernel.h new file mode 100644 index 000000000..7ebff8b22 --- /dev/null +++ b/glusterfs-guts/src/fuse_kernel.h @@ -0,0 +1,380 @@ +/* + FUSE: Filesystem in Userspace + Copyright (C) 2001-2007 Miklos Szeredi + + This program can be distributed under the terms of the GNU GPL. + See the file COPYING. +*/ + +/* This file defines the kernel interface of FUSE */ + +#ifdef __FreeBSD__ +/* + This -- and only this -- header file may also be distributed under + the terms of the BSD Licence as follows: + + Copyright (C) 2001-2006 Miklos Szeredi. All rights reserved. 
+ + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions + are met: + 1. Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + 2. Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + + THIS SOFTWARE IS PROVIDED BY AUTHOR AND CONTRIBUTORS ``AS IS'' AND + ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + ARE DISCLAIMED. IN NO EVENT SHALL AUTHOR OR CONTRIBUTORS BE LIABLE + FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + SUCH DAMAGE. 
+*/ + +#include +#define __u64 uint64_t +#define __u32 uint32_t +#define __s32 int32_t +#else +#include +#include +#endif + +/** Version number of this interface */ +#define FUSE_KERNEL_VERSION 7 + +/** Minor version number of this interface */ +#define FUSE_KERNEL_MINOR_VERSION 8 + +/** The node ID of the root inode */ +#define FUSE_ROOT_ID 1 + +/** The major number of the fuse character device */ +#define FUSE_MAJOR MISC_MAJOR + +/** The minor number of the fuse character device */ +#define FUSE_MINOR 229 + +/* Make sure all structures are padded to 64bit boundary, so 32bit + userspace works under 64bit kernels */ + +struct fuse_attr { + __u64 ino; + __u64 size; + __u64 blocks; + __u64 atime; + __u64 mtime; + __u64 ctime; + __u32 atimensec; + __u32 mtimensec; + __u32 ctimensec; + __u32 mode; + __u32 nlink; + __u32 uid; + __u32 gid; + __u32 rdev; +}; + +struct fuse_kstatfs { + __u64 blocks; + __u64 bfree; + __u64 bavail; + __u64 files; + __u64 ffree; + __u32 bsize; + __u32 namelen; + __u32 frsize; + __u32 padding; + __u32 spare[6]; +}; + +struct fuse_file_lock { + __u64 start; + __u64 end; + __u32 type; + __u32 pid; /* tgid */ +}; + +/** + * Bitmasks for fuse_setattr_in.valid + */ +#define FATTR_MODE (1 << 0) +#define FATTR_UID (1 << 1) +#define FATTR_GID (1 << 2) +#define FATTR_SIZE (1 << 3) +#define FATTR_ATIME (1 << 4) +#define FATTR_MTIME (1 << 5) +#define FATTR_FH (1 << 6) + +/** + * Flags returned by the OPEN request + * + * FOPEN_DIRECT_IO: bypass page cache for this open file + * FOPEN_KEEP_CACHE: don't invalidate the data cache on open + */ +#define FOPEN_DIRECT_IO (1 << 0) +#define FOPEN_KEEP_CACHE (1 << 1) + +/** + * INIT request/reply flags + */ +#define FUSE_ASYNC_READ (1 << 0) +#define FUSE_POSIX_LOCKS (1 << 1) + +/** + * Release flags + */ +#define FUSE_RELEASE_FLUSH (1 << 0) + +enum fuse_opcode { + FUSE_LOOKUP = 1, + FUSE_FORGET = 2, /* no reply */ + FUSE_GETATTR = 3, + FUSE_SETATTR = 4, + FUSE_READLINK = 5, + FUSE_SYMLINK = 6, + FUSE_MKNOD = 8, + 
FUSE_MKDIR = 9, + FUSE_UNLINK = 10, + FUSE_RMDIR = 11, + FUSE_RENAME = 12, + FUSE_LINK = 13, + FUSE_OPEN = 14, + FUSE_READ = 15, + FUSE_WRITE = 16, + FUSE_STATFS = 17, + FUSE_RELEASE = 18, + FUSE_FSYNC = 20, + FUSE_SETXATTR = 21, + FUSE_GETXATTR = 22, + FUSE_LISTXATTR = 23, + FUSE_REMOVEXATTR = 24, + FUSE_FLUSH = 25, + FUSE_INIT = 26, + FUSE_OPENDIR = 27, + FUSE_READDIR = 28, + FUSE_RELEASEDIR = 29, + FUSE_FSYNCDIR = 30, + FUSE_GETLK = 31, + FUSE_SETLK = 32, + FUSE_SETLKW = 33, + FUSE_ACCESS = 34, + FUSE_CREATE = 35, + FUSE_INTERRUPT = 36, + FUSE_BMAP = 37, + FUSE_DESTROY = 38, +}; + +/* The read buffer is required to be at least 8k, but may be much larger */ +#define FUSE_MIN_READ_BUFFER 8192 + +struct fuse_entry_out { + __u64 nodeid; /* Inode ID */ + __u64 generation; /* Inode generation: nodeid:gen must + be unique for the fs's lifetime */ + __u64 entry_valid; /* Cache timeout for the name */ + __u64 attr_valid; /* Cache timeout for the attributes */ + __u32 entry_valid_nsec; + __u32 attr_valid_nsec; + struct fuse_attr attr; +}; + +struct fuse_forget_in { + __u64 nlookup; +}; + +struct fuse_attr_out { + __u64 attr_valid; /* Cache timeout for the attributes */ + __u32 attr_valid_nsec; + __u32 dummy; + struct fuse_attr attr; +}; + +struct fuse_mknod_in { + __u32 mode; + __u32 rdev; +}; + +struct fuse_mkdir_in { + __u32 mode; + __u32 padding; +}; + +struct fuse_rename_in { + __u64 newdir; +}; + +struct fuse_link_in { + __u64 oldnodeid; +}; + +struct fuse_setattr_in { + __u32 valid; + __u32 padding; + __u64 fh; + __u64 size; + __u64 unused1; + __u64 atime; + __u64 mtime; + __u64 unused2; + __u32 atimensec; + __u32 mtimensec; + __u32 unused3; + __u32 mode; + __u32 unused4; + __u32 uid; + __u32 gid; + __u32 unused5; +}; + +struct fuse_open_in { + __u32 flags; + __u32 mode; +}; + +struct fuse_open_out { + __u64 fh; + __u32 open_flags; + __u32 padding; +}; + +struct fuse_release_in { + __u64 fh; + __u32 flags; + __u32 release_flags; + __u64 lock_owner; +}; + +struct 
fuse_flush_in { + __u64 fh; + __u32 unused; + __u32 padding; + __u64 lock_owner; +}; + +struct fuse_read_in { + __u64 fh; + __u64 offset; + __u32 size; + __u32 padding; +}; + +struct fuse_write_in { + __u64 fh; + __u64 offset; + __u32 size; + __u32 write_flags; +}; + +struct fuse_write_out { + __u32 size; + __u32 padding; +}; + +#define FUSE_COMPAT_STATFS_SIZE 48 + +struct fuse_statfs_out { + struct fuse_kstatfs st; +}; + +struct fuse_fsync_in { + __u64 fh; + __u32 fsync_flags; + __u32 padding; +}; + +struct fuse_setxattr_in { + __u32 size; + __u32 flags; +}; + +struct fuse_getxattr_in { + __u32 size; + __u32 padding; +}; + +struct fuse_getxattr_out { + __u32 size; + __u32 padding; +}; + +struct fuse_lk_in { + __u64 fh; + __u64 owner; + struct fuse_file_lock lk; +}; + +struct fuse_lk_out { + struct fuse_file_lock lk; +}; + +struct fuse_access_in { + __u32 mask; + __u32 padding; +}; + +struct fuse_init_in { + __u32 major; + __u32 minor; + __u32 max_readahead; + __u32 flags; +}; + +struct fuse_init_out { + __u32 major; + __u32 minor; + __u32 max_readahead; + __u32 flags; + __u32 unused; + __u32 max_write; +}; + +struct fuse_interrupt_in { + __u64 unique; +}; + +struct fuse_bmap_in { + __u64 block; + __u32 blocksize; + __u32 padding; +}; + +struct fuse_bmap_out { + __u64 block; +}; + +struct fuse_in_header { + __u32 len; + __u32 opcode; + __u64 unique; + __u64 nodeid; + __u32 uid; + __u32 gid; + __u32 pid; + __u32 padding; +}; + +struct fuse_out_header { + __u32 len; + __s32 error; + __u64 unique; +}; + +struct fuse_dirent { + __u64 ino; + __u64 off; + __u32 namelen; + __u32 type; + char name[0]; +}; + +#define FUSE_NAME_OFFSET offsetof(struct fuse_dirent, name) +#define FUSE_DIRENT_ALIGN(x) (((x) + sizeof(__u64) - 1) & ~(sizeof(__u64) - 1)) +#define FUSE_DIRENT_SIZE(d) \ + FUSE_DIRENT_ALIGN(FUSE_NAME_OFFSET + (d)->namelen) diff --git a/glusterfs-guts/src/glusterfs-fuse.h b/glusterfs-guts/src/glusterfs-fuse.h new file mode 100644 index 000000000..f446202fb --- 
/dev/null +++ b/glusterfs-guts/src/glusterfs-fuse.h @@ -0,0 +1,58 @@ +/* + Copyright (c) 2006, 2007, 2008 Z RESEARCH, Inc. + This file is part of GlusterFS. + + GlusterFS is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published + by the Free Software Foundation; either version 3 of the License, + or (at your option) any later version. + + GlusterFS is distributed in the hope that it will be useful, but + WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program. If not, see + . +*/ + +#ifndef __GLUSTERFS_FUSE_H__ +#define __GLUSTERFS_FUSE_H__ + +#ifndef _CONFIG_H +#define _CONFIG_H +#include "config.h" +#endif /* _CONFIG_H */ + +#define DEFAULT_LOG_FILE DATADIR"/log/glusterfs/glusterfs.log" +#define DEFAULT_GLUSTERFS_CLIENT_VOL CONFDIR "/glusterfs-client.vol" + +#define SPEC_LOCAL_FILE 1 +#define SPEC_REMOTE_FILE 2 + +#if 0 +#define GF_YES 1 +#define GF_NO 0 +#endif + +#ifdef GF_LOG_FUSE_ARGS +#undef GF_LOG_FUSE_ARGS +#endif + +struct gf_spec_location { + int32_t where; + union { + char *file; + struct { + char *ip; + char *port; + char *transport; + }server; + }spec; +}; + +transport_t * glusterfs_mount (glusterfs_ctx_t *ctx, + const char *mount_point); + +#endif /* __GLUSTERFS_FUSE_H__ */ diff --git a/glusterfs-guts/src/glusterfs-guts.c b/glusterfs-guts/src/glusterfs-guts.c new file mode 100644 index 000000000..3efac3a35 --- /dev/null +++ b/glusterfs-guts/src/glusterfs-guts.c @@ -0,0 +1,400 @@ +/* + Copyright (c) 2008 Z RESEARCH, Inc. + This file is part of GlusterFS. 
+ + GlusterFS is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published + by the Free Software Foundation; either version 3 of the License, + or (at your option) any later version. + + GlusterFS is distributed in the hope that it will be useful, but + WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program. If not, see . +*/ + +#include +#include +#include +#include + +#include "glusterfs.h" +#include "xlator.h" +#include "glusterfs-guts.h" + +/* argp initializations */ +static char doc[] = "glusterfs-guts is unit testing suite for glusterfs"; +static char argp_doc[] = ""; +const char *argp_program_version = PACKAGE_NAME " " PACKAGE_VERSION " built on " __DATE__; +const char *argp_program_bug_address = PACKAGE_BUGREPORT; + +guts_ctx_t guts_ctx; +error_t parse_opts (int32_t key, char *arg, struct argp_state *_state); + +static struct argp_option options[] = { + {"spec-file", 'f', "VOLUMESPEC-FILE", 0,\ + "Load VOLUMESPEC-FILE."}, + {"threads", 't', "NUMBER", 0,\ + "Load NUMBER of threads."}, + {"tio-file", 'i', "FILE", 0,\ + "Replay fops from FILE."}, + {"tio-directory", 'I', "DIRECTORY", 0,\ + "Replay fops from files in DIRECTORY. Valid option only when using more than one thread."}, + {"log-level", 'L', "LOGLEVEL", 0, + "LOGLEVEL should be one of DEBUG, WARNING, [ERROR], CRITICAL, NONE"}, + {"log-file", 'l', "LOGFILE", 0, \ + "Specify the file to redirect logs"}, + {"trace", 'T', "MOUNTPOINT", 0, \ + "Run guts in trace mode. Guts mounts glusterfs on MOUNTPOINT specified"}, + {"output", 'o', "OUTPUT-TIOFILE", 0, \ + "Write trace io output to OUTPUT-TIOFILE. 
Valid only when run in trace(-T) mode."}, + {"version", 'V', 0, 0,\ + "print version information"}, + { 0, } +}; + +static struct argp argp = { options, parse_opts, argp_doc, doc }; + +/* guts_print_version - used by argument parser routine to print version information for guts */ +static int32_t +guts_print_version (void) +{ + printf ("%s\n", argp_program_version); + printf ("Copyright (c) 2006, 2007 Z RESEARCH Inc. \n"); + printf ("GlusterFS comes with ABSOLUTELY NO WARRANTY.\nYou may redistribute copies of GlusterFS under the terms of the GNU General Public License.\n"); + exit (0); +} + +/* parse_opts - argument parsing helper routine for argp library */ +error_t +parse_opts (int32_t key, char *arg, struct argp_state *_state) +{ + guts_ctx_t *state = _state->input; + + switch (key) { + case 'f': + if (!state->specfile) { + state->specfile = strdup (arg); + } + break; + + case 't': + if (!state->threads) { + state->threads = strtol (arg, NULL, 0); + } + break; + + case 'i': + if (state->threads == 1) { + state->file = strdup (arg); + } else { + fprintf (stderr, "glusterfs-guts: -i option is valid only when guts is running single thread\n"); + exit (1); + } + break; + + case 'I': + if (state->threads > 1) { + state->directory = strdup (arg); + } else { + fprintf (stderr, "glusterfs-guts: -I option is valid only when guts is running multiple threads\n"); + exit (1); + } + break; + + case 'L': + /* set log level */ + if (!strncasecmp (arg, "DEBUG", strlen ("DEBUG"))) { + state->loglevel = GF_LOG_DEBUG; + } else if (!strncasecmp (arg, "WARNING", strlen ("WARNING"))) { + state->loglevel = GF_LOG_WARNING; + } else if (!strncasecmp (arg, "CRITICAL", strlen ("CRITICAL"))) { + state->loglevel = GF_LOG_CRITICAL; + } else if (!strncasecmp (arg, "NONE", strlen ("NONE"))) { + state->loglevel = GF_LOG_NONE; + } else if (!strncasecmp (arg, "ERROR", strlen ("ERROR"))) { + state->loglevel = GF_LOG_ERROR; + } else { + fprintf (stderr, "glusterfs-guts: Unrecognized log-level 
\"%s\", possible values are \"DEBUG|WARNING|[ERROR]|CRITICAL|NONE\"\n", arg); + exit (EXIT_FAILURE); + } + break; + case 'l': + /* set log file */ + state->logfile = strdup (arg); + break; + + case 'T': + state->trace = 1; + state->mountpoint = strdup (arg); + break; + + case 'o': + state->file = strdup (arg); + break; + + case 'V': + guts_print_version (); + break; + + } + return 0; +} + +/* get_xlator_graph - creates a translator graph and returns the pointer to the root of the xlator tree + * + * @ctx: guts context structure + * @conf: file handle to volume specfile + * + * returns pointer to the root of the translator tree + */ +static xlator_t * +get_xlator_graph (glusterfs_ctx_t *ctx, + FILE *conf) +{ + xlator_t *tree, *trav = NULL; + + tree = file_to_xlator_tree (ctx, conf); + trav = tree; + + if (tree == NULL) { + gf_log ("glusterfs-guts", + GF_LOG_ERROR, + "specification file parsing failed, exiting"); + return NULL; + } + + tree = trav; + + return tree; +} + +/* get_spec_fp - get file handle to volume spec file specified. + * + * @ctx: guts context structure + * + * returns FILE pointer to the volume spec file. + */ +static FILE * +get_spec_fp (guts_ctx_t *ctx) +{ + char *specfile = ctx->specfile; + FILE *conf = NULL; + + specfile = ctx->specfile; + + conf = fopen (specfile, "r"); + + if (!conf) { + perror (specfile); + return NULL; + } + gf_log ("glusterfs-guts", + GF_LOG_DEBUG, + "loading spec from %s", + specfile); + + return conf; +} + +static void * +guts_thread_main (void *ctx) +{ + guts_thread_ctx_t *tctx = (guts_thread_ctx_t *) ctx; + + printf ("starting thread main with %s:\n", tctx->file); + guts_replay (tctx); + printf ("ending thread main.\n"); + + return NULL; +} + +/* guts_create_threads - creates different threads based on thread number specified in ctx and assigns a + * tio file to each thread and attaches each thread to the graph created by main(). 
+ * @ctx: guts_ctx_t which contains the context corresponding to the current run of guts + * + * returns the guts_threads_t structure which contains handles to the different threads created, + * or NULL if the directory has fewer tio files than the number of threads requested. + * + */ +static guts_threads_t * +guts_create_threads (guts_ctx_t *ctx) +{ + guts_threads_t *threads = NULL; + int32_t thread_count = ctx->threads; + + threads = CALLOC (1, sizeof (*threads)); + ERR_ABORT (threads); + + + INIT_LIST_HEAD (&(threads->threads)); + + if (thread_count == 1) { + /* special case: we have only one thread and we are given a tio-file as argument instead of a directory. + * handling differently */ + guts_thread_ctx_t *thread = NULL; + thread = CALLOC (1, sizeof (*thread)); + ERR_ABORT (thread); + list_add (&thread->threads, &threads->threads); + thread->file = strdup (ctx->file); + thread->ctx = ctx; + } else { + /* look for .tio files in the directory given and assign to each of the threads */ + DIR *dir = opendir (ctx->directory); + + if (!dir) { + gf_log ("guts", + GF_LOG_ERROR, + "failed to open directory %s", ctx->directory); + } else { + guts_thread_ctx_t *thread = NULL; + struct dirent *dirp = NULL; + /* to pass through "." and ".." */ + readdir (dir); + readdir (dir); + + while (thread_count > 0) { + char pathname[256] = {0,}; + + thread = CALLOC (1, sizeof (*thread)); + ERR_ABORT (thread); + dirp = NULL; + + list_add (&thread->threads, &threads->threads); + dirp = readdir (dir); + if (dirp) { + /* bounded write: a long directory or file name must not overflow pathname */ + snprintf (pathname, sizeof (pathname), "%s/%s", ctx->directory, dirp->d_name); + printf ("file name for thread(%d) is %s\n", thread_count, pathname); + thread->file = strdup (pathname); + thread->ctx = ctx; + } else if (thread_count > 0) { + gf_log ("guts", + GF_LOG_ERROR, + "number of tio files less than %d, number of threads specified", ctx->threads); + /* TODO: cleanup */ + closedir (dir); + return NULL; + } + --thread_count; + } + /* every thread has its file name; the directory stream is no longer needed */ + closedir (dir); + } + } + return threads; +} + +/* guts_start_threads - starts all the threads in @threads. 
+ * + * @threads: guts_threads_t structure containing the handles to threads created by guts_create_threads. + * + * returns <0 on error. + * + */ +static void +guts_start_threads (guts_threads_t *gthreads) +{ + guts_thread_ctx_t *thread = NULL; + list_for_each_entry (thread, >hreads->threads, threads) { + if (pthread_create (&thread->pthread, NULL, guts_thread_main, (void *)thread) < 0) { + gf_log ("guts", + GF_LOG_ERROR, + "failed to start thread"); + } else { + gf_log ("guts", + GF_LOG_DEBUG, + "started thread with file %s", thread->file); + } + } +} + +static int32_t +guts_join_threads (guts_threads_t *gthreads) +{ + guts_thread_ctx_t *thread = NULL; + list_for_each_entry (thread, >hreads->threads, threads) { + if (pthread_join (thread->pthread, NULL) < 0) { + gf_log ("guts", + GF_LOG_ERROR, + "failed to join thread"); + } else { + gf_log ("guts", + GF_LOG_DEBUG, + "joined thread with file %s", thread->file); + } + } + return 0; +} + + +int32_t +main (int32_t argc, char *argv[]) +{ + /* glusterfs_ctx_t is required to be passed to + * 1. get_xlator_graph + * 2. 
glusterfs_mount + */ + glusterfs_ctx_t gfs_ctx = { + .logfile = DATADIR "/log/glusterfs/glusterfs-guts.log", + .loglevel = GF_LOG_DEBUG, + .poll_type = SYS_POLL_TYPE_EPOLL, + }; + + guts_ctx_t guts_ctx = {0,}; + FILE *specfp = NULL; + xlator_t *graph = NULL; + guts_threads_t *threads = NULL; + + argp_parse (&argp, argc, argv, 0, 0, &guts_ctx); + + if (gf_log_init (gfs_ctx.logfile) == -1 ) { + fprintf (stderr, + "glusterfs-guts: failed to open logfile \"%s\"\n", + gfs_ctx.logfile); + return -1; + } + gf_log_set_loglevel (gfs_ctx.loglevel); + + specfp = get_spec_fp (&guts_ctx); + if (!specfp) { + fprintf (stderr, + "glusterfs-guts: could not open specfile\n"); + return -1; + } + + graph = get_xlator_graph (&gfs_ctx, specfp); + if (!graph) { + gf_log ("guts", GF_LOG_ERROR, + "Unable to get xlator graph"); + return -1; + } + fclose (specfp); + + guts_ctx.graph = graph; + + if (guts_ctx.trace) { + return guts_trace (&guts_ctx); + } else { + /* now that we have the xlator graph, we need to create as many threads as requested and assign a tio file + * to each of the threads and tell each thread to attach to the graph we just created. */ + + if (!guts_ctx.file && !guts_ctx.directory) { + fprintf (stderr, + "glusterfs-guts: no tio file specified"); + return -1; + } + + threads = guts_create_threads (&guts_ctx); + + if (threads) { + guts_start_threads (threads); + guts_join_threads (threads); + } else { + gf_log ("guts", GF_LOG_ERROR, + "unable to create threads"); + return 0; + } + } + + return 0; +} + diff --git a/glusterfs-guts/src/glusterfs-guts.h b/glusterfs-guts/src/glusterfs-guts.h new file mode 100644 index 000000000..eda1788a9 --- /dev/null +++ b/glusterfs-guts/src/glusterfs-guts.h @@ -0,0 +1,62 @@ +/* + Copyright (c) 2008 Z RESEARCH, Inc. + This file is part of GlusterFS. 
+ + GlusterFS is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published + by the Free Software Foundation; either version 3 of the License, + or (at your option) any later version. + + GlusterFS is distributed in the hope that it will be useful, but + WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program. If not, see . +*/ + +#ifndef __GLUSTERFS_GUTS_H +#define __GLUSTERFS_GUTS_H + +#include "xlator.h" +#include "transport.h" +#include "glusterfs.h" +#include "glusterfs-fuse.h" +#include "timer.h" + +#ifdef DEFAULT_LOG_FILE +#undef DEFAULT_LOG_FILE +#endif + +#define DEFAULT_LOG_FILE DATADIR"/log/glusterfs/glusterfs-guts.log" + + +typedef struct { + int32_t threads; /* number of threads to start in replay mode */ + char *logfile; /* logfile path */ + int32_t loglevel; /* logging level */ + char *directory; /* path to directory containing tio files, when threads > 1 */ + char *file; /* path to tio file, when threads == 1 during replay. 
in trace mode, path to tio output */ + char *specfile; /* path to specfile to load translator tree */ + xlator_t *graph; /* translator tree after the specfile is loaded */ + int32_t trace; /* if trace == 1, glusterfs-guts runs in trace mode, otherwise in replay mode */ + char *mountpoint; /* valid only when trace == 1, mounpoint to mount glusterfs */ +} guts_ctx_t; + + +typedef struct { + struct list_head threads; + pthread_t pthread; + xlator_t *tree; + char *file; + guts_ctx_t *ctx; +} guts_thread_ctx_t; + +typedef struct { + struct list_head threads; +} guts_threads_t; + +int32_t guts_replay (guts_thread_ctx_t *); +int32_t guts_trace (guts_ctx_t *); +#endif diff --git a/glusterfs-guts/src/guts-extra.c b/glusterfs-guts/src/guts-extra.c new file mode 100644 index 000000000..dd4ad466f --- /dev/null +++ b/glusterfs-guts/src/guts-extra.c @@ -0,0 +1,18 @@ +/* + Copyright (c) 2007, 2008 Z RESEARCH, Inc. + This file is part of GlusterFS. + + GlusterFS is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published + by the Free Software Foundation; either version 3 of the License, + or (at your option) any later version. + + GlusterFS is distributed in the hope that it will be useful, but + WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program. If not, see + . +*/ diff --git a/glusterfs-guts/src/guts-lowlevel.h b/glusterfs-guts/src/guts-lowlevel.h new file mode 100644 index 000000000..498b5d01e --- /dev/null +++ b/glusterfs-guts/src/guts-lowlevel.h @@ -0,0 +1,86 @@ +/* + Copyright (c) 2008 Z RESEARCH, Inc. + This file is part of GlusterFS. 
+ + GlusterFS is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published + by the Free Software Foundation; either version 3 of the License, + or (at your option) any later version. + + GlusterFS is distributed in the hope that it will be useful, but + WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program. If not, see . +*/ + +#ifndef _GUTS_LOWLEVEL_H_ +#define _GUTS_LOWLEVEL_H_ + +int +guts_reply_err (fuse_req_t req, + int err); + +int +guts_reply_none (fuse_req_t req); + +int +guts_reply_entry (fuse_req_t req, + const struct fuse_entry_param *e); + +int +guts_reply_create (fuse_req_t req, + const struct fuse_entry_param *e, + const struct fuse_file_info *f); + +int +guts_reply_attr (fuse_req_t req, + const struct stat *attr, + double attr_timeout); + +int +guts_reply_readlink (fuse_req_t req, + const char *linkname); + +int +guts_reply_open (fuse_req_t req, + const struct fuse_file_info *f); + +int +guts_reply_write (fuse_req_t req, + size_t count); + +int +guts_reply_buf (fuse_req_t req, + const char *buf, + size_t size); + +int +guts_reply_statfs (fuse_req_t req, + const struct statvfs *stbuf); + +int +guts_reply_xattr (fuse_req_t req, + size_t count); + +int +guts_reply_lock (fuse_req_t req, + struct flock *lock); + +/* exploiting the macros to reduce coding work ;) */ +#define fuse_reply_entry guts_reply_entry +#define fuse_reply_err guts_reply_err +#define fuse_reply_none guts_reply_none +#define fuse_reply_attr guts_reply_attr +#define fuse_reply_open guts_reply_open +#define fuse_reply_readlink guts_reply_readlink +#define fuse_reply_create guts_reply_create +#define fuse_reply_write guts_reply_write +#define fuse_reply_buf guts_reply_buf +#define fuse_reply_statfs guts_reply_statfs 
+#define fuse_reply_xattr guts_reply_xattr +#define fuse_reply_lock guts_reply_lock + +#endif diff --git a/glusterfs-guts/src/guts-parse.c b/glusterfs-guts/src/guts-parse.c new file mode 100644 index 000000000..dd17a737e --- /dev/null +++ b/glusterfs-guts/src/guts-parse.c @@ -0,0 +1,217 @@ +/* + Copyright (c) 2008 Z RESEARCH, Inc. + This file is part of GlusterFS. + + GlusterFS is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published + by the Free Software Foundation; either version 3 of the License, + or (at your option) any later version. + + GlusterFS is distributed in the hope that it will be useful, but + WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program. If not, see . +*/ + +#include "guts-parse.h" +#include "guts-tables.h" + +/* unavoidable usage of global data.. :'( */ +static int32_t tio_fd = 0; + +/* guts_tio_init - opens (creating it if needed) the tio file that the dump routines write to. + * + * @filename: path of the tio file + * + * returns the file descriptor, <0 if the file could not be opened. + */ +int32_t +guts_tio_init (const char *filename) +{ + /* O_CREAT requires the mode argument; without it the permissions of a new file are garbage */ + tio_fd = open (filename, O_WRONLY | O_CREAT, 0644); + + if (tio_fd < 0) { + gf_log ("guts", + GF_LOG_ERROR, + "failed to open tio file %s", filename); + } + + return tio_fd; +} + +void +guts_reply_dump (fuse_req_t req, + const void *arg, + int32_t len) +{ + uint8_t *buf = NULL; + uint8_t *ibuf = NULL; + uint32_t buf_size = REP_HEADER_FULL_LEN + len; + + ibuf = buf = CALLOC (1, buf_size); + + /* being paranoid, checking for both ibuf and buf.. 
;) */ + if (ibuf && buf) { + memcpy (ibuf, REP_BEGIN, strlen (REP_BEGIN)); + ibuf += strlen (REP_BEGIN); + memcpy (ibuf, req, sizeof (struct fuse_req)); + ibuf += sizeof (struct fuse_req); + memcpy (ibuf, &len, sizeof (len)); + ibuf += sizeof (len); + memcpy (ibuf, arg, len); + + gf_full_write (tio_fd, buf, buf_size); + + free (buf); + } else { + gf_log ("glusterfs-guts", GF_LOG_DEBUG, + "failed to allocate memory while dumping reply"); + } +} + +void +guts_req_dump (struct fuse_in_header *in, + const void *arg, + int32_t len) +{ + /* GUTS_REQUEST_BEGIN::::GUTS_REQUEST_END */ + uint8_t *buf = NULL; + uint8_t *ibuf = NULL; + uint32_t buf_size = REQ_HEADER_FULL_LEN + len; + + ibuf = buf = CALLOC (1, buf_size); + + if (ibuf && buf) { + memcpy (ibuf, REQ_BEGIN, strlen (REQ_BEGIN)); + ibuf += strlen (REQ_BEGIN); + memcpy (ibuf, in, sizeof (*in)); + ibuf += sizeof (*in); + memcpy (ibuf, &len, sizeof (len)); + ibuf += sizeof (len); + memcpy (ibuf, arg, len); + + gf_full_write (tio_fd, buf, buf_size); + + free (buf); + } else { + gf_log ("glusterfs-guts", GF_LOG_DEBUG, + "failed to allocate memory while dumping reply"); + } +} + + + +guts_req_t * +guts_read_entry (guts_replay_ctx_t *ctx) +{ + guts_req_t *req = NULL; + guts_reply_t *reply = NULL; + uint8_t begin[256] = {0,}; + int32_t ret = 0; + int32_t fd = ctx->tio_fd; + + while (!req) { + req = guts_get_request (ctx); + + if (!req) { + ret = read (fd, begin, strlen (REQ_BEGIN)); + + if (ret == 0) { + gf_log ("glusterfs-guts", GF_LOG_DEBUG, + "guts replay finished"); + req = NULL; + } + + if (is_request (begin)) { + req = CALLOC (1, sizeof (*req)); + ERR_ABORT (req); + gf_full_read (fd, (char *)req, REQ_HEADER_LEN); + + req->arg = CALLOC (1, req->arg_len + 1); + ERR_ABORT (req->arg); + gf_full_read (fd, req->arg, req->arg_len); + gf_log ("guts", + GF_LOG_DEBUG, + "%s: fop %s (%d)\n", + begin, guts_log[req->header.opcode].name, req->header.opcode); + guts_add_request (ctx, req); + req = guts_get_request (ctx); + } else { + 
/* whenever a reply is read, we put it to a hash table and we would like to retrieve it whenever + * we get a reply for any call + */ + reply = CALLOC (1, sizeof (*reply)); + ERR_ABORT (reply); + gf_full_read (fd, (char *)reply, REP_HEADER_LEN); + + reply->arg = CALLOC (1, reply->arg_len + 1); + ERR_ABORT (reply->arg); + gf_full_read (fd, reply->arg, reply->arg_len); + + /* add a new reply to */ + ret = guts_add_reply (ctx, reply); + gf_log ("guts", + GF_LOG_DEBUG, + "got a reply with unique: %ld", reply->req.unique); + } + } + } + return req; +} + +guts_reply_t * +guts_read_reply (guts_replay_ctx_t *ctx, + uint64_t unique) +{ + guts_req_t *req = NULL; + guts_reply_t *reply = NULL, *rep = NULL; + uint8_t begin[256] = {0,}; + int32_t ret = 0; + int32_t fd = ctx->tio_fd; + + while (!rep) { + + ret = read (fd, begin, strlen (REQ_BEGIN)); + + if (ret == 0) { + printf ("\ndone\n"); + return NULL; + } + + if (is_request (begin)) { + req = CALLOC (1, sizeof (*req)); + ERR_ABORT (req); + gf_full_read (fd, (char *)req, REQ_HEADER_LEN); + + req->arg = CALLOC (1, req->arg_len + 1); + ERR_ABORT (req->arg); + gf_full_read (fd, req->arg, req->arg_len); + gf_log ("guts", + GF_LOG_DEBUG, + "%s: fop %s (%d)\n", + begin, guts_log[req->header.opcode].name, req->header.opcode); + + ret = guts_add_request (ctx, req); + + } else { + /* whenever a reply is read, we put it to a hash table and we would like to retrieve it whenever + * we get a reply for any call + */ + reply = CALLOC (1, sizeof (*reply)); + ERR_ABORT (reply); + gf_full_read (fd, (char *)reply, REP_HEADER_LEN); + + reply->arg = CALLOC (1, reply->arg_len + 1); + ERR_ABORT (reply->arg); + gf_full_read (fd, reply->arg, reply->arg_len); + + /* add a new reply to */ + if (reply->req.unique == unique) { + return reply; + } else { + ret = guts_add_reply (ctx, reply); + gf_log ("guts", + GF_LOG_DEBUG, + "got a reply with unique: %ld", reply->req.unique); + } + } + } + return NULL; +} diff --git a/glusterfs-guts/src/guts-parse.h 
b/glusterfs-guts/src/guts-parse.h new file mode 100644 index 000000000..7791b1215 --- /dev/null +++ b/glusterfs-guts/src/guts-parse.h @@ -0,0 +1,140 @@ +/* + Copyright (c) 2008 Z RESEARCH, Inc. + This file is part of GlusterFS. + + GlusterFS is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published + by the Free Software Foundation; either version 3 of the License, + or (at your option) any later version. + + GlusterFS is distributed in the hope that it will be useful, but + WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program. If not, see . +*/ + +#ifndef _GUTS_PARSE_H_ +#define _GUTS_PARSE_H_ + +#include "glusterfs.h" +#include "glusterfs-guts.h" +#include "fuse_kernel.h" +#include +#include "list.h" + +#ifndef _FUSE_OPAQUE_ +#define _FUSE_OPAQUE_ + +struct fuse_private { + int fd; + struct fuse *fuse; + struct fuse_session *se; + struct fuse_chan *ch; + char *mountpoint; +}; + +struct fuse_req { + struct fuse_ll *f; + uint64_t unique; + int ctr; + pthread_mutex_t lock; + struct fuse_ctx ctx; + struct fuse_chan *ch; + int interrupted; + union { + struct { + uint64_t unique; + } i; + struct { + fuse_interrupt_func_t func; + void *data; + } ni; + } u; + struct fuse_req *next; + struct fuse_req *prev; +}; + +struct fuse_ll { + int debug; + int allow_root; + struct fuse_lowlevel_ops op; + int got_init; + void *userdata; + uid_t owner; + struct fuse_conn_info conn; + struct fuse_req list; + struct fuse_req interrupts; + pthread_mutex_t lock; + int got_destroy; +}; +#endif + +#define REQ_BEGIN "GUTS_REQ_BEGIN:" +#define REQ_HEADER_FULL_LEN (strlen(REQ_BEGIN) + sizeof (struct fuse_in_header) + sizeof (int32_t)) + +#define REP_BEGIN "GUTS_REP_BEGIN:" +#define REP_HEADER_FULL_LEN (strlen(REP_BEGIN) 
+ sizeof (struct fuse_req) + sizeof (int32_t)) + +#define REQ_HEADER_LEN (sizeof (struct fuse_in_header) + sizeof (int32_t)) +#define REP_HEADER_LEN (sizeof (struct fuse_req) + sizeof (int32_t)) + +#define is_request(begin) (0==strcmp(begin, REQ_BEGIN)?1:0) + +typedef void (*func_t)(struct fuse_in_header *, const void *); + +typedef struct { + func_t func; + const char *name; +} guts_log_t; + +typedef struct { + struct fuse_in_header header; + int32_t arg_len; + struct list_head list; + void *arg; +} guts_req_t; + +typedef struct { + struct fuse_req req; + int32_t arg_len; + void *arg; +} guts_reply_t; + +struct guts_replay_ctx { + int32_t tio_fd; + struct fuse_ll *guts_ll; + dict_t *replies; + dict_t *inodes; + dict_t *fds; + struct list_head requests; + dict_t *requests_dict; +}; + +typedef struct guts_replay_ctx guts_replay_ctx_t; + +extern guts_log_t guts_log[]; + +int32_t +guts_tio_init (const char *); + +void +guts_req_dump (struct fuse_in_header *, + const void *, + int32_t); + +guts_req_t * +guts_read_entry (guts_replay_ctx_t *ctx); + +void +guts_reply_dump (fuse_req_t, + const void *, + int32_t); + +guts_reply_t * +guts_read_reply (guts_replay_ctx_t *ctx, + uint64_t unique); + +#endif /* _GUTS_PARSE_H_ */ diff --git a/glusterfs-guts/src/guts-replay.c b/glusterfs-guts/src/guts-replay.c new file mode 100644 index 000000000..a5447464d --- /dev/null +++ b/glusterfs-guts/src/guts-replay.c @@ -0,0 +1,834 @@ +/* + Copyright (c) 2008 Z RESEARCH, Inc. + This file is part of GlusterFS. + + GlusterFS is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published + by the Free Software Foundation; either version 3 of the License, + or (at your option) any later version. + + GlusterFS is distributed in the hope that it will be useful, but + WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + General Public License for more details. 
+ + You should have received a copy of the GNU General Public License + along with this program. If not, see . +*/ + +#include "glusterfs-guts.h" +#include "guts-parse.h" +#include +#include "guts-tables.h" +#include "guts-replay.h" +#include "guts-trace.h" + +static void +convert_attr (const struct fuse_setattr_in *attr, + struct stat *stbuf) +{ + stbuf->st_mode = attr->mode; + stbuf->st_uid = attr->uid; + stbuf->st_gid = attr->gid; + stbuf->st_size = attr->size; + stbuf->st_atime = attr->atime; + /* + ST_ATIM_NSEC_SET (stbuf, attr->atimensec); + ST_MTIM_NSEC_SET (stbuf, attr->mtimensec);*/ +} + +static void +guts_replay_lookup (fuse_req_t req, + fuse_ino_t ino, + const void *inargs) +{ + char *name = (char *) inargs; + + if (req->f->op.lookup) + req->f->op.lookup(req, ino, name); + else + guts_reply_err (req, ENOSYS); + +} + +static void +guts_replay_forget (fuse_req_t req, + fuse_ino_t ino, + const void *inargs) +{ + struct fuse_forget_in *arg = (struct fuse_forget_in *) inargs; + + if (req->f->op.forget) + req->f->op.forget (req, ino, arg->nlookup); + +} + +static void +guts_replay_getattr (fuse_req_t req, + fuse_ino_t ino, + const void *inargs) +{ + (void) inargs; + + if (req->f->op.getattr) + req->f->op.getattr (req, ino, NULL); + else + guts_reply_err (req, ENOSYS); +} + +static void +guts_replay_setattr (fuse_req_t req, + fuse_ino_t ino, + const void *inargs) +{ + struct fuse_setattr_in *arg = (struct fuse_setattr_in *)inargs; + + if (req->f->op.setattr) { + struct fuse_file_info *fi = NULL; + struct fuse_file_info fi_store; + struct stat stbuf; + memset (&stbuf, 0, sizeof (stbuf)); + convert_attr (arg, &stbuf); + if (arg->valid & FATTR_FH) { + arg->valid &= ~FATTR_FH; + memset (&fi_store, 0, sizeof (fi_store)); + fi = &fi_store; + fi->fh = arg->fh; + fi->fh_old = fi->fh; + } + req->f->op.setattr (req, ino, &stbuf, arg->valid, fi); + } else + guts_reply_err (req, ENOSYS); +} + +static void +guts_replay_access (fuse_req_t req, + fuse_ino_t ino, + const void 
*inargs) +{ + struct fuse_access_in *arg = (struct fuse_access_in *)inargs; + + if (req->f->op.access) + req->f->op.access (req, ino, arg->mask); + else + guts_reply_err (req, ENOSYS); +} + +static void +guts_replay_readlink (fuse_req_t req, + fuse_ino_t ino, + const void *inargs) +{ + (void) inargs; + + if (req->f->op.readlink) + req->f->op.readlink (req, ino); + else + guts_reply_err (req, ENOSYS); +} + + +static void +guts_replay_mknod (fuse_req_t req, + fuse_ino_t ino, + const void *inargs) +{ + struct fuse_mknod_in *arg = (struct fuse_mknod_in *) inargs; + + if (req->f->op.mknod) + req->f->op.mknod (req, ino, PARAM(arg), arg->mode, arg->rdev); + else + guts_reply_err (req, ENOSYS); +} + +static void +guts_replay_mkdir (fuse_req_t req, + fuse_ino_t ino, + const void *inargs) +{ + struct fuse_mkdir_in *arg = (struct fuse_mkdir_in *) inargs; + + if (req->f->op.mkdir) + req->f->op.mkdir (req, ino, PARAM(arg), arg->mode); + else + guts_reply_err (req, ENOSYS); +} + +static void +guts_replay_unlink (fuse_req_t req, + fuse_ino_t ino, + const void *inargs) +{ + char *name = (char *)inargs; + + if (req->f->op.unlink) { + + req->f->op.unlink (req, ino, name); + } else + guts_reply_err (req, ENOSYS); +} + +static void +guts_replay_rmdir (fuse_req_t req, + fuse_ino_t ino, + const void *inargs) +{ + char *name = (char *)inargs; + + if (req->f->op.rmdir) { + req->f->op.rmdir (req, ino, name); + } else + guts_reply_err (req, ENOSYS); +} + +static void +guts_replay_symlink (fuse_req_t req, + fuse_ino_t ino, + const void *inargs) +{ + char *name = (char *) inargs; + char *linkname = ((char *) inargs) + strlen ((char *) inargs) + 1; + + if (req->f->op.symlink) { + req->f->op.symlink (req, linkname, ino, name); + } else + guts_reply_err (req, ENOSYS); +} + + + +static void +guts_replay_rename (fuse_req_t req, + fuse_ino_t ino, + const void *inargs) +{ + struct fuse_rename_in *arg = (struct fuse_rename_in *) inargs; + char *oldname = PARAM(arg); + char *newname = oldname + strlen 
(oldname) + 1; + + if (req->f->op.rename) { + req->f->op.rename (req, ino, oldname, arg->newdir, newname); + } else + guts_reply_err (req, ENOSYS); + +} + +static void +guts_replay_link (fuse_req_t req, + fuse_ino_t ino, + const void *inargs) +{ + struct fuse_link_in *arg = (struct fuse_link_in *) inargs; + + if (req->f->op.link) { + guts_replay_ctx_t *ctx = (guts_replay_ctx_t *) req->u.ni.data; + fuse_ino_t old_ino = guts_inode_search (ctx, arg->oldnodeid); + + req->f->op.link (req, old_ino, ino, PARAM(arg)); + } else + guts_reply_err (req, ENOSYS); +} + + +static void +guts_replay_create (fuse_req_t req, + fuse_ino_t ino, + const void *inargs) +{ + struct guts_create_in *arg = (struct guts_create_in *) inargs; + + if (req->f->op.create) { + struct fuse_file_info fi; + memset (&fi, 0, sizeof (fi)); + fi.flags = arg->open_in.flags; + + req->f->op.create (req, ino, arg->name, arg->open_in.mode, &fi); + } else + guts_reply_err (req, ENOSYS); + +} + +static void +guts_replay_open (fuse_req_t req, + fuse_ino_t ino, + const void *inargs) +{ + struct fuse_open_in *arg = (struct fuse_open_in *) inargs; + struct fuse_file_info fi; + + memset (&fi, 0, sizeof (fi)); + fi.flags = arg->flags; + + if (req->f->op.open) { + /* TODO: how efficient is using dict_get here?? */ + req->f->op.open (req, ino, &fi); + } else + guts_reply_open (req, &fi); +} + + +static void +guts_replay_read(fuse_req_t req, + fuse_ino_t ino, + const void *inarg) +{ + struct fuse_read_in *arg = (struct fuse_read_in *) inarg; + + if (req->f->op.read){ + struct fuse_file_info fi; + guts_replay_ctx_t *ctx = req->u.ni.data; + + memset (&fi, 0, sizeof (fi)); + /* TODO: how efficient is using dict_get here?? 
*/ + fi.fh = (unsigned long) guts_fd_search (ctx, arg->fh); + if (!fi.fh) { + /* TODO: make it more meaningful and organized */ + printf ("readv called without opening the file\n"); + guts_reply_err (req, EBADFD); + } else { + fi.fh_old = fi.fh; + req->f->op.read (req, ino, arg->size, arg->offset, &fi); + } + } else + guts_reply_err (req, ENOSYS); +} + +static void +guts_replay_write(fuse_req_t req, + fuse_ino_t ino, + const void *inarg) +{ + struct fuse_write_in *arg = (struct fuse_write_in *) inarg; + struct fuse_file_info fi; + guts_replay_ctx_t *ctx = (guts_replay_ctx_t *) req->u.ni.data; + + memset (&fi, 0, sizeof (fi)); + fi.fh = (unsigned long) guts_fd_search (ctx, arg->fh); + + if (!fi.fh) { + /* TODO: make it more meaningful and organized */ + printf ("writev called without opening the file\n"); + guts_reply_err (req, EBADFD); + } else { + fi.fh_old = fi.fh; + fi.writepage = arg->write_flags & 1; + if (req->f->op.write) + req->f->op.write (req, ino, PARAM(arg), arg->size, arg->offset, &fi); + else + guts_reply_err (req, ENOSYS); + } +} + +static void +guts_replay_flush(fuse_req_t req, + fuse_ino_t ino, + const void *inarg) +{ + struct fuse_flush_in *arg = (struct fuse_flush_in *) inarg; + struct fuse_file_info fi; + guts_replay_ctx_t *ctx = (guts_replay_ctx_t *) req->u.ni.data; + + memset (&fi, 0, sizeof (fi)); + fi.fh = (unsigned long) guts_fd_search (ctx, arg->fh); + if (!fi.fh) { + printf ("flush called without calling open\n"); + guts_reply_err (req, EBADFD); + } else { + fi.fh_old = fi.fh; + fi.flush = 1; + + if (req->f->conn.proto_minor >= 7) + fi.lock_owner = arg->lock_owner; + + if (req->f->op.flush) + req->f->op.flush (req, ino, &fi); + else + guts_reply_err (req, ENOSYS); + } +} + +static void +guts_replay_release(fuse_req_t req, + fuse_ino_t ino, + const void *inarg) +{ + struct fuse_release_in *arg = (struct fuse_release_in *) inarg; + struct fuse_file_info fi; + guts_replay_ctx_t *ctx = (guts_replay_ctx_t *) req->u.ni.data; + + memset (&fi, 0, 
sizeof (fi)); + fi.flags = arg->flags; + fi.fh = (unsigned long) guts_fd_search (ctx, arg->fh); + + if (!fi.fh) { + printf ("release called without calling open\n"); + guts_reply_err (req, EBADFD); + } else { + fi.fh_old = fi.fh; + if (req->f->conn.proto_minor >= 8) { + fi.flush = (arg->release_flags & FUSE_RELEASE_FLUSH) ? 1 : 0; + fi.lock_owner = arg->lock_owner; + } + if (req->f->op.release) + req->f->op.release (req, ino, &fi); + else + guts_reply_err (req, ENOSYS); + } +} + +static void +guts_replay_fsync(fuse_req_t req, + fuse_ino_t ino, + const void *inarg) +{ + struct fuse_fsync_in *arg = (struct fuse_fsync_in *) inarg; + struct fuse_file_info fi; + guts_replay_ctx_t *ctx = (guts_replay_ctx_t *) req->u.ni.data; + + memset (&fi, 0, sizeof (fi)); + fi.fh = (unsigned long) guts_fd_search (ctx, arg->fh); + fi.fh_old = fi.fh; + + if (req->f->op.fsync) + req->f->op.fsync (req, ino, arg->fsync_flags & 1, &fi); + else + guts_reply_err (req, ENOSYS); +} + +static void +guts_replay_opendir (fuse_req_t req, + fuse_ino_t ino, + const void *inarg) +{ + struct fuse_open_in *arg = (struct fuse_open_in *) inarg; + struct fuse_file_info fi; + + memset (&fi, 0, sizeof (fi)); + fi.flags = arg->flags; + + if (req->f->op.opendir) { + req->f->op.opendir (req, ino, &fi); + } else + guts_reply_open (req, &fi); +} + +static void +guts_replay_readdir(fuse_req_t req, + fuse_ino_t ino, + const void *inarg) +{ + struct fuse_read_in *arg = (struct fuse_read_in *) inarg; + struct fuse_file_info fi; + guts_replay_ctx_t *ctx = (guts_replay_ctx_t *) req->u.ni.data; + + memset (&fi, 0, sizeof (fi)); + fi.fh = (unsigned long) guts_fd_search (ctx, arg->fh); + + if (!fi.fh) { + /* TODO: make it more meaningful and organized */ + printf ("readdir called without opening the file\n"); + guts_reply_err (req, EBADFD); + } else { + fi.fh_old = fi.fh; + + if (req->f->op.readdir) + req->f->op.readdir (req, ino, arg->size, arg->offset, &fi); + else + guts_reply_err (req, ENOSYS); + } + +} + +static void 
+guts_replay_releasedir(fuse_req_t req, + fuse_ino_t ino, + const void *inarg) +{ + struct fuse_release_in *arg = (struct fuse_release_in *) inarg; + struct fuse_file_info fi; + guts_replay_ctx_t *ctx = (guts_replay_ctx_t *) req->u.ni.data; + + memset (&fi, 0, sizeof (fi)); + fi.flags = arg->flags; + fi.fh = (unsigned long) guts_fd_search (ctx, arg->fh); + if (!fi.fh) { + printf ("releasedir called without calling opendir\n"); + guts_reply_err (req, EBADFD); + } else { + + fi.fh_old = fi.fh; + if (req->f->op.releasedir) + req->f->op.releasedir (req, ino, &fi); + else + guts_reply_err (req, ENOSYS); + } +} + +static void +guts_replay_fsyncdir(fuse_req_t req, + fuse_ino_t ino, + const void *inarg) +{ + struct fuse_fsync_in *arg = (struct fuse_fsync_in *) inarg; + struct fuse_file_info fi; + guts_replay_ctx_t *ctx = (guts_replay_ctx_t *) req->u.ni.data; + + memset (&fi, 0, sizeof (fi)); + fi.fh = (unsigned long) guts_fd_search (ctx, arg->fh); + fi.fh_old = fi.fh; + + if (req->f->op.fsyncdir) + req->f->op.fsyncdir (req, ino, arg->fsync_flags & 1, &fi); + else + guts_reply_err (req, ENOSYS); +} + +static void +guts_replay_statfs (fuse_req_t req, + fuse_ino_t ino, + const void *inargs) +{ + (void) ino; + (void) inargs; + + if (req->f->op.statfs) { + req->f->op.statfs (req, ino); + } else { + struct statvfs buf = { + .f_namemax = 255, + .f_bsize = 512, + }; + guts_reply_statfs (req, &buf); + } +} + +static void +guts_replay_setxattr(fuse_req_t req, + fuse_ino_t ino, + const void *inarg) +{ + struct fuse_setxattr_in *arg = (struct fuse_setxattr_in *) inarg; + char *name = PARAM(arg); + char *value = name + strlen(name) + 1; + + if (req->f->op.setxattr) + req->f->op.setxattr (req, ino, name, value, arg->size, arg->flags); + else + guts_reply_err (req, ENOSYS); +} + +static void +guts_replay_getxattr(fuse_req_t req, + fuse_ino_t ino, + const void *inarg) +{ + struct fuse_getxattr_in *arg = (struct fuse_getxattr_in *) inarg; + + if (req->f->op.getxattr) + req->f->op.getxattr 
(req, ino, PARAM(arg), arg->size); + else + guts_reply_err (req, ENOSYS); +} + +static void +guts_replay_listxattr (fuse_req_t req, + fuse_ino_t ino, + const void *inargs) +{ + struct fuse_getxattr_in *arg = (struct fuse_getxattr_in *) inargs; + + if (req->f->op.listxattr) + req->f->op.listxattr (req, ino, arg->size); + else + guts_reply_err (req, ENOSYS); +} + +static void +guts_replay_removexattr(fuse_req_t req, + fuse_ino_t ino, + const void *inargs) +{ + char *name = (char *)inargs; + + if (req->f->op.removexattr) + req->f->op.removexattr (req, ino, name); + else + guts_reply_err (req, ENOSYS); +} + +guts_replay_t guts_replay_fop[] = { + [FUSE_LOOKUP] = { guts_replay_lookup, "lookup" }, + [FUSE_FORGET] = { guts_replay_forget, "forget" }, + [FUSE_GETATTR] = { guts_replay_getattr, "getattr" }, + [FUSE_SETATTR] = { guts_replay_setattr, "setattr" }, + [FUSE_ACCESS] = { guts_replay_access, "access" }, + [FUSE_READLINK] = { guts_replay_readlink, "readlink" }, + [FUSE_MKNOD] = { guts_replay_mknod, "mknod" }, + [FUSE_MKDIR] = { guts_replay_mkdir, "mkdir" }, + [FUSE_UNLINK] = { guts_replay_unlink, "unlink" }, + [FUSE_RMDIR] = { guts_replay_rmdir, "rmdir" }, + [FUSE_SYMLINK] = { guts_replay_symlink, "symlink" }, + [FUSE_RENAME] = { guts_replay_rename, "rename" }, + [FUSE_LINK] = { guts_replay_link, "link" }, + [FUSE_CREATE] = { guts_replay_create, "create" }, + [FUSE_OPEN] = { guts_replay_open, "open" }, + [FUSE_READ] = { guts_replay_read, "read" }, + [FUSE_WRITE] = { guts_replay_write, "write" }, + [FUSE_FLUSH] = { guts_replay_flush, "flush" }, + [FUSE_RELEASE] = { guts_replay_release, "release" }, + [FUSE_FSYNC] = { guts_replay_fsync, "fsync" }, + [FUSE_OPENDIR] = { guts_replay_opendir, "opendir" }, + [FUSE_READDIR] = { guts_replay_readdir, "readdir" }, + [FUSE_RELEASEDIR] = { guts_replay_releasedir, "releasedir" }, + [FUSE_FSYNCDIR] = { guts_replay_fsyncdir, "fsyncdir" }, + [FUSE_STATFS] = { guts_replay_statfs, "statfs" }, + [FUSE_SETXATTR] = { guts_replay_setxattr, 
"setxattr" }, + [FUSE_GETXATTR] = { guts_replay_getxattr, "getxattr" }, + [FUSE_LISTXATTR] = { guts_replay_listxattr, "listxattr" }, + [FUSE_REMOVEXATTR] = { guts_replay_removexattr, "removexattr" }, +}; + +static inline void +list_init_req (struct fuse_req *req) +{ + req->next = req; + req->prev = req; +} + + +static int32_t +guts_transport_notify (xlator_t *xl, + int32_t event, + void *data, + ...) +{ + /* dummy, nobody has got anything to notify me.. ;) */ + return 0; +} + +static int32_t +guts_transport_init (transport_t *this, + dict_t *options, + event_notify_fn_t notify) +{ + struct fuse_private *priv = CALLOC (1, sizeof (*priv)); + ERR_ABORT (priv); + + this->notify = NULL; + this->private = (void *)priv; + + /* fuse channel */ + priv->ch = NULL; + + /* fuse session */ + priv->se = NULL; + + /* fuse channel fd */ + priv->fd = -1; + + this->buf = data_ref (data_from_dynptr (NULL, 0)); + this->buf->is_locked = 1; + + priv->mountpoint = NULL; + + transport_ref (this); + + return 0; +} + +static void +guts_transport_fini (transport_t *this) +{ + +} + +static int32_t +guts_transport_disconnect (transport_t *this) +{ + struct fuse_private *priv = this->private; + + gf_log ("glusterfs-guts", + GF_LOG_DEBUG, + "cleaning up fuse transport in disconnect handler"); + + FREE (priv); + priv = NULL; + this->private = NULL; + + /* TODO: need graceful exit. 
every xlator should be ->fini()'ed + and come out of main poll loop cleanly + */ + return -1; +} + +static struct transport_ops guts_transport_ops = { + .disconnect = guts_transport_disconnect, +}; + +static transport_t guts_transport = { + .ops = &guts_transport_ops, + .private = NULL, + .xl = NULL, + .init = guts_transport_init, + .fini = guts_transport_fini, + .notify = guts_transport_notify +}; + +static inline xlator_t * +fuse_graph (xlator_t *graph) +{ + xlator_t *top = NULL; + xlator_list_t *xlchild; + + top = CALLOC (1, sizeof (*top)); + ERR_ABORT (top); + + xlchild = CALLOC (1, sizeof(*xlchild)); + ERR_ABORT (xlchild); + xlchild->xlator = graph; + top->children = xlchild; + top->ctx = graph->ctx; + top->next = graph; + graph->parent = top; + + return top; +} + +static guts_replay_ctx_t * +guts_replay_init (guts_thread_ctx_t *thread) +{ + guts_replay_ctx_t *ctx = NULL; + int32_t fd = open (thread->file, O_RDONLY); + + if (fd < 0) { + gf_log ("glusterfs-guts", GF_LOG_DEBUG, + "failed to open tio_file %s", thread->file); + return ctx; + } else { + struct fuse_ll *guts_ll = CALLOC (1, sizeof (*guts_ll)); + ERR_ABORT (guts_ll); + + ctx = CALLOC (1, sizeof (*ctx)); + ERR_ABORT (ctx); + + if (ctx) { + /* equivalent to fuse_new_session () */ + guts_ll->conn.async_read = 1; + guts_ll->conn.max_write = UINT_MAX; + guts_ll->conn.max_readahead = UINT_MAX; + memcpy (&guts_ll->op, &fuse_ops, sizeof (struct fuse_lowlevel_ops)); + list_init_req (&guts_ll->list); + list_init_req (&guts_ll->interrupts); + guts_ll->owner = getuid (); + guts_ll->userdata = thread; + + /* TODO: need to create transport_t object which whole of the glusterfs + * so desperately depends on */ + transport_t *guts_trans = CALLOC (1, sizeof (*guts_trans)); + + if (guts_trans) { + memcpy (guts_trans, &guts_transport, sizeof (*guts_trans)); + guts_trans->ops = &guts_transport_ops; + } else { + gf_log ("glusterfs-guts", GF_LOG_ERROR, + "failed to allocate memory for guts transport object"); + return 
NULL; + } + + glusterfs_ctx_t *glfs_ctx = CALLOC (1, sizeof (*glfs_ctx));; + if (glfs_ctx) { + guts_trans->xl_private = glfs_ctx; + guts_trans->xl = fuse_graph (thread->ctx->graph); + }else { + gf_log ("glusterfs-guts", GF_LOG_ERROR, + "failed to allocate memory for glusterfs_ctx_t object"); + return NULL; + } + + call_pool_t *pool = CALLOC (1, sizeof (call_pool_t)); + if (pool) { + glfs_ctx->pool = pool; + LOCK_INIT (&pool->lock); + INIT_LIST_HEAD (&pool->all_frames); + } else { + gf_log ("glusterfs-guts", GF_LOG_ERROR, + "failed to allocate memory for guts call pool"); + return NULL; + } + + guts_trans->xl->ctx = glfs_ctx; + guts_trans->init (guts_trans, NULL, guts_transport_notify); + guts_ll->userdata = guts_trans; + + /* call fuse_init */ + guts_ll->op.init (guts_trans, NULL); + + { + ctx->guts_ll = guts_ll; + ctx->tio_fd = fd; + ctx->inodes = get_new_dict (); + ctx->fds = get_new_dict (); + ctx->replies = get_new_dict (); + INIT_LIST_HEAD(&ctx->requests); + ctx->requests_dict = get_new_dict (); + } + } else { + gf_log ("glusterfs-guts", GF_LOG_ERROR, + "failed to allocate memory for guts_ctx_t object"); + return NULL; + } + } + + return ctx; +} + +int32_t +guts_replay (guts_thread_ctx_t *thread) +{ + guts_req_t *entry = NULL; + guts_replay_ctx_t *ctx = guts_replay_init (thread); + + if (!ctx) { + gf_log ("glusterfs-guts", GF_LOG_ERROR, + "failed to initialize guts_replay"); + return -1; + } else { + while ((entry = guts_read_entry (ctx))) { + /* here we go ... execute the request */ + fuse_req_t req = CALLOC (1, sizeof (struct fuse_req)); + ino_t ino = entry->header.nodeid; + void *arg = entry->arg; + + if (req) { + req->f = ctx->guts_ll; + req->unique = entry->header.unique; + req->ctx.uid = entry->header.uid; + req->ctx.pid = entry->header.pid; + + /* req->u.ni.data is unused void *, while running in replay mode. 
Making use of available real-estate + * to store useful information of thread specific guts_replay_ctx */ + req->u.ni.data = (void *) ctx; + /* req->ch is of type 'struct fuse_chan', which fuse uses only at the + * time of the response it gets and is useful in sending the reply data to correct channel + * in /dev/fuse. This is not useful for us, so we ignore it by keeping it NULL */ + list_init_req (req); + + fuse_ino_t new_ino = guts_inode_search (ctx, ino); + + if (guts_replay_fop[entry->header.opcode].func) { + printf ("operation: %s && inode: %ld\n", guts_replay_fop[entry->header.opcode].name, new_ino); + guts_replay_fop[entry->header.opcode].func (req, new_ino, arg); + } + + if (entry->arg) + free (entry->arg); + free (entry); + } else { + gf_log ("glusterfs-guts", GF_LOG_ERROR, + "failed to allocate memory for fuse_req_t object"); + return -1; + } + } + } + return 0; +} diff --git a/glusterfs-guts/src/guts-replay.h b/glusterfs-guts/src/guts-replay.h new file mode 100644 index 000000000..532060d2b --- /dev/null +++ b/glusterfs-guts/src/guts-replay.h @@ -0,0 +1,33 @@ +/* + Copyright (c) 2008 Z RESEARCH, Inc. + This file is part of GlusterFS. + + GlusterFS is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published + by the Free Software Foundation; either version 3 of the License, + or (at your option) any later version. + + GlusterFS is distributed in the hope that it will be useful, but + WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program. If not, see . 
+*/ + +#define PARAM(inarg) (((char *)(inarg)) + sizeof(*(inarg))) + +void guts_reply_err (fuse_req_t, error_t); +void guts_reply_open (fuse_req_t, struct fuse_file_info *); +void guts_reply_statfs (fuse_req_t, struct statvfs *); + +typedef void (*guts_replay_fop_t)(fuse_req_t, fuse_ino_t, const void *); + +typedef struct { + guts_replay_fop_t func; + const char *name; +} guts_replay_t; + +extern struct fuse_lowlevel_ops fuse_ops; + diff --git a/glusterfs-guts/src/guts-tables.c b/glusterfs-guts/src/guts-tables.c new file mode 100644 index 000000000..2992b3e2c --- /dev/null +++ b/glusterfs-guts/src/guts-tables.c @@ -0,0 +1,248 @@ +/* + Copyright (c) 2006, 2007, 2008 Z RESEARCH, Inc. + This file is part of GlusterFS. + + GlusterFS is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published + by the Free Software Foundation; either version 3 of the License, + or (at your option) any later version. + + GlusterFS is distributed in the hope that it will be useful, but + WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program. If not, see + . 
+*/ + +#include "guts-parse.h" +#include "dict.h" +#include "guts-tables.h" + + +int32_t +guts_attr_cmp (const struct stat *attr, + const struct stat *old_attr) +{ + return 0; +} + +int32_t +guts_statvfs_cmp (const struct statvfs *stbuf, + const struct statvfs *old_stbuf) +{ + return 0; +} + +int32_t +guts_flock_cmp (struct flock *lock, + struct flock *old_lock) +{ + return 0; +} + + +guts_req_t * +guts_lookup_request (guts_replay_ctx_t *ctx, uint64_t unique) +{ + guts_req_t *req = NULL; + + if (unique == 0) { + if (list_empty (&ctx->requests)) + req = NULL; + else { + /* pick an entry from list, move it out of the list and return it to the caller */ + char *key = NULL; + + req = list_entry (ctx->requests.next, guts_req_t, list); + list_del (&req->list); + + asprintf (&key, "%llu", req->header.unique); + + dict_set (ctx->requests_dict, key, data_from_static_ptr (req)); + + if (key) + free (key); + } + } else { + char *key = NULL; + data_t *req_data = NULL; + + asprintf (&key, "%llu", unique); + + req_data = dict_get (ctx->requests_dict, key); + + if (req_data) + req = data_to_ptr (req_data); + + if (key) + free (key); + } + return req; +} + +guts_req_t * +guts_get_request (guts_replay_ctx_t *ctx) +{ + return guts_lookup_request (ctx, 0); +} + +int32_t +guts_add_request (guts_replay_ctx_t *ctx, + guts_req_t *req) +{ + list_add_tail (&req->list, &ctx->requests); + return 0; +} + +int32_t +guts_add_reply (guts_replay_ctx_t *ctx, + guts_reply_t *reply) +{ + char *key = NULL; + asprintf (&key, "%llu", reply->req.unique); + + dict_set (ctx->replies, key, data_from_static_ptr(reply)); + + if (key) + free(key); + + return 0; +} + + +guts_reply_t * +guts_lookup_reply (guts_replay_ctx_t *ctx, + uint64_t unique) +{ + char *key = NULL; + data_t *reply_data = NULL; + guts_reply_t *new_reply = NULL; + + asprintf (&key, "%llu", unique); + reply_data = dict_get (ctx->replies, key); + + if (reply_data) { + new_reply = data_to_ptr (reply_data); + dict_del (ctx->replies, key); + } 
else { + /* reply has not yet been read from tio file */ + new_reply = guts_read_reply (ctx, unique); + + if (!new_reply) { + /* failed to fetch reply for 'unique' from tio file */ + new_reply; + } + } + + if (key) + free(key); + + return new_reply; + +} + +int32_t +guts_inode_update (guts_replay_ctx_t *ctx, + fuse_ino_t old_ino, + fuse_ino_t new_ino) +{ + char *key = NULL; + asprintf (&key, "%ld", old_ino); + dict_set (ctx->inodes, key, data_from_uint64 (new_ino)); + + if (key) + free(key); + + return 0; +} + +fuse_ino_t +guts_inode_search (guts_replay_ctx_t *ctx, + fuse_ino_t old_ino) +{ + char *key = NULL; + data_t *ino_data = NULL; + fuse_ino_t new_ino = 0; + + asprintf (&key, "%ld", old_ino); + ino_data = dict_get (ctx->inodes, key); + + if (ino_data) + new_ino = data_to_uint64 (ino_data); + else if (old_ino != /* TODO: FIXME */1 ) { + new_ino = 0; + } else + new_ino = old_ino; + + if (key) + free(key); + + return new_ino; +} + +int32_t +guts_fd_add (guts_replay_ctx_t *ctx, + unsigned long old_fd, + fd_t *new_fd) +{ + char *key = NULL; + asprintf (&key, "%ld", old_fd); + dict_set (ctx->fds, key, data_from_static_ptr (new_fd)); + + if (key) + free(key); + + return 0; +} + +fd_t * +guts_fd_search (guts_replay_ctx_t *ctx, + unsigned long old_fd) +{ + char *key = NULL; + data_t *fd_data = NULL; + fd_t *new_fd = NULL; + + asprintf (&key, "%ld", old_fd); + fd_data = dict_get (ctx->fds, key); + + if (fd_data) + new_fd = data_to_ptr (fd_data); + + if (key) + free(key); + + return new_fd; +} + +int32_t +guts_delete_fd (guts_replay_ctx_t *ctx, + unsigned long old_fd) +{ + char *key = NULL; + data_t *fd_data = NULL; + + asprintf (&key, "%ld", old_fd); + fd_data = dict_get (ctx->fds, key); + + if (fd_data) + dict_del (ctx->fds, key); + + if (key) + free(key); + + return 0; +} + +inline int32_t +guts_get_opcode (guts_replay_ctx_t *ctx, + uint64_t unique) +{ + guts_req_t *req = guts_lookup_request (ctx, unique); + + return ((req == NULL) ? 
-1 : req->header.opcode); + +} diff --git a/glusterfs-guts/src/guts-tables.h b/glusterfs-guts/src/guts-tables.h new file mode 100644 index 000000000..ff27300fa --- /dev/null +++ b/glusterfs-guts/src/guts-tables.h @@ -0,0 +1,80 @@ +/* + Copyright (c) 2006, 2007, 2008 Z RESEARCH, Inc. + This file is part of GlusterFS. + + GlusterFS is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published + by the Free Software Foundation; either version 3 of the License, + or (at your option) any later version. + + GlusterFS is distributed in the hope that it will be useful, but + WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program. If not, see + . +*/ + +#ifndef _GUTS_TABLES_H_ +#define _GUTS_TABLES_H_ + + +int32_t +guts_attr_cmp (const struct stat *attr, + const struct stat *old_attr); + +int32_t +guts_statvfs_cmp (const struct statvfs *stbuf, + const struct statvfs *old_stbuf); + +int32_t +guts_inode_update (guts_replay_ctx_t *ctx, + fuse_ino_t old_ino, + fuse_ino_t new_ino); + +fuse_ino_t +guts_inode_search (guts_replay_ctx_t *ctx, + fuse_ino_t old_ino); + +int32_t +guts_add_request (guts_replay_ctx_t *, + guts_req_t *); + +guts_req_t * +guts_get_request (guts_replay_ctx_t *ctx); + +guts_req_t * +guts_lookup_request (guts_replay_ctx_t *ctx, + uint64_t unique); + +guts_reply_t * +guts_lookup_reply (guts_replay_ctx_t *ctx, + uint64_t unique); + +int32_t +guts_add_reply (guts_replay_ctx_t *ctx, + guts_reply_t *reply); + +int32_t +guts_flock_cmp (struct flock *lock, + struct flock *old_lock); + +fd_t * +guts_fd_search (guts_replay_ctx_t *ctx, + unsigned long old_fd); + +int32_t +guts_delete_fd (guts_replay_ctx_t *, + unsigned long); + +int32_t +guts_get_opcode (guts_replay_ctx_t *ctx, + uint64_t unique); 
+int32_t +guts_fd_add (guts_replay_ctx_t *ctx, + unsigned long old_fd, + fd_t *new_fd); + +#endif diff --git a/glusterfs-guts/src/guts-trace.c b/glusterfs-guts/src/guts-trace.c new file mode 100644 index 000000000..51d8a68d6 --- /dev/null +++ b/glusterfs-guts/src/guts-trace.c @@ -0,0 +1,650 @@ +/* + Copyright (c) 2006, 2007, 2008 Z RESEARCH, Inc. + This file is part of GlusterFS. + + GlusterFS is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published + by the Free Software Foundation; either version 3 of the License, + or (at your option) any later version. + + GlusterFS is distributed in the hope that it will be useful, but + WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program. If not, see + . +*/ + +#include "glusterfs-guts.h" +#include + +#include "guts-parse.h" +#include "guts-tables.h" +#include "guts-trace.h" + +static xlator_t * +fuse_graph (xlator_t *graph) +{ + xlator_t *top = NULL; + xlator_list_t *xlchild; + + top = CALLOC (1, sizeof (*top)); + ERR_ABORT (top); + xlchild = CALLOC (1, sizeof(*xlchild)); + ERR_ABORT (xlchild); + xlchild->xlator = graph; + top->children = xlchild; + top->ctx = graph->ctx; + top->next = graph; + graph->parent = top; + + return top; +} + +int32_t +fuse_thread (pthread_t *thread, void *data); + +int32_t +guts_trace (guts_ctx_t *guts_ctx) +{ + transport_t *mp = NULL; + glusterfs_ctx_t ctx = { + .poll_type = SYS_POLL_TYPE_EPOLL, + }; + xlator_t *graph = NULL; + call_pool_t *pool = NULL; + int32_t ret = -1; + pthread_t thread; + /* Ignore SIGPIPE */ + signal (SIGPIPE, SIG_IGN); + +#if HAVE_BACKTRACE + /* Handle SIGABORT and SIGSEGV */ + signal (SIGSEGV, gf_print_trace); + signal (SIGABRT, gf_print_trace); +#endif /* HAVE_BACKTRACE */ + + ret = 
guts_tio_init (guts_ctx->file); + + if (ret < 0) { + gf_log ("glusterfs-guts", GF_LOG_ERROR, + "running in trace mode: failed to open tio file %s", guts_ctx->file); + return -1; + } + + pool = ctx.pool = CALLOC (1, sizeof (call_pool_t)); + ERR_ABORT (ctx.pool); + LOCK_INIT (&pool->lock); + INIT_LIST_HEAD (&pool->all_frames); + + /* glusterfs_mount has to be ideally placed after all the initialisation stuff */ + if (!(mp = glusterfs_mount (&ctx, guts_ctx->mountpoint))) { + gf_log ("glusterfs-guts", GF_LOG_ERROR, "Unable to mount glusterfs"); + return -1; + } + + gf_timer_registry_init (&ctx); + graph = guts_ctx->graph; + + if (!graph) { + gf_log ("glusterfs-guts", GF_LOG_ERROR, + "Unable to get xlator graph for mount_point %s", guts_ctx->mountpoint); + transport_disconnect (mp); + return -1; + } + + ctx.graph = graph; + + mp->xl = fuse_graph (graph); + mp->xl->ctx = &ctx; + + fuse_thread (&thread, mp); + + while (!poll_iteration (&ctx)); + + return 0; +} + + + +static void +guts_name (struct fuse_in_header *in, + const void *inargs) +{ + char *name = (char *) inargs; + + guts_req_dump (in, name, strlen (name)); +} + +static void +guts_noarg (struct fuse_in_header *in, + const void *inargs) +{ + guts_req_dump (in, NULL, 0); +} + + +static void +guts_setattr (struct fuse_in_header *in, + const void *inargs) +{ + struct fuse_setattr_in *arg = (struct fuse_setattr_in *)inargs; + guts_req_dump (in, arg, sizeof (*arg)); +} + +static void +guts_access (struct fuse_in_header *in, + const void *inargs) +{ + struct fuse_access_in *arg = (struct fuse_access_in *)inargs; + guts_req_dump (in, arg, sizeof (*arg)); +} + + +static void +guts_mknod (struct fuse_in_header *in, + const void *inargs) +{ + struct fuse_mknod_in *arg = (struct fuse_mknod_in *) inargs; + guts_req_dump (in, arg, sizeof (*arg)); +} + +static void +guts_mkdir (struct fuse_in_header *in, + const void *inargs) +{ + struct fuse_mkdir_in *arg = (struct fuse_mkdir_in *) inargs; + guts_req_dump (in, arg, sizeof 
(*arg));
+}
+
+
+/*
+ * Copy the NUL-terminated string src into dst, a buffer of dst_len bytes.
+ * The copy is always NUL-terminated and never runs past dst; a string that
+ * does not fit is truncated and the truncation is logged.
+ */
+static void
+guts_copy_name (char *dst, size_t dst_len, const char *src)
+{
+        size_t src_len = strlen (src);
+
+        if (src_len >= dst_len) {
+                gf_log ("glusterfs-guts", GF_LOG_ERROR,
+                        "string of %zu bytes does not fit a %zu byte trace "
+                        "field, truncating", src_len, dst_len);
+                src_len = dst_len - 1;
+        }
+
+        memcpy (dst, src, src_len);
+        dst[src_len] = '\0';
+}
+
+/*
+ * Dump a symlink request: inargs holds two consecutive NUL-terminated
+ * strings, the first one copied to .name and the second one to .linkname.
+ */
+static void
+guts_symlink (struct fuse_in_header *in,
+              const void *inargs)
+{
+        char *name = (char *) inargs;
+        char *linkname = ((char *) inargs) + strlen ((char *) inargs) + 1;
+        struct guts_symlink_in symlink_in;
+
+        /* the whole struct is handed to guts_req_dump(), so do not leave
+         * uninitialised stack bytes behind the strings */
+        memset (&symlink_in, 0, sizeof (symlink_in));
+        guts_copy_name (symlink_in.name, sizeof (symlink_in.name), name);
+        guts_copy_name (symlink_in.linkname, sizeof (symlink_in.linkname),
+                        linkname);
+        guts_req_dump (in, &symlink_in, sizeof (symlink_in));
+}
+
+/*
+ * Dump a rename request: struct fuse_rename_in followed by the old and the
+ * new name as two consecutive NUL-terminated strings.
+ */
+static void
+guts_rename (struct fuse_in_header *in,
+             const void *inargs)
+{
+        struct fuse_rename_in *arg = (struct fuse_rename_in *) inargs;
+        char *oldname = PARAM(arg);
+        char *newname = oldname + strlen (oldname) + 1;
+        struct guts_rename_in rename_in;
+
+        memset (&rename_in, 0, sizeof (rename_in));
+        memcpy (&rename_in, arg, sizeof (*arg));
+        /* bounded: the names come straight from the request buffer */
+        guts_copy_name (rename_in.oldname, sizeof (rename_in.oldname),
+                        oldname);
+        guts_copy_name (rename_in.newname, sizeof (rename_in.newname),
+                        newname);
+
+        guts_req_dump (in, &rename_in, sizeof (rename_in));
+
+}
+
+/* Dump the fixed-size struct fuse_link_in of a link request. */
+static void
+guts_link (struct fuse_in_header *in,
+           const void *inargs)
+{
+        struct fuse_link_in *arg = (struct fuse_link_in *) inargs;
+
+        guts_req_dump (in, arg, sizeof (*arg));
+}
+
+
+/* Dump the fixed-size struct fuse_open_in (guts_log[] uses it for open and opendir). */
+static void
+guts_open (struct fuse_in_header *in,
+           const void *inargs)
+{
+        struct fuse_open_in *arg = (struct fuse_open_in *) inargs;
+
+        guts_req_dump (in, arg, sizeof (*arg));
+}
+
+/*
+ * Dump a create request: struct fuse_open_in followed by the
+ * NUL-terminated name of the file to create.
+ */
+static void
+guts_create (struct fuse_in_header *in,
+             const void *inargs)
+{
+        struct guts_create_in create_in;
+        struct fuse_open_in *arg = (struct fuse_open_in *) inargs;
+        char *name = PARAM (arg);
+
+        memset (&create_in, 0, sizeof (create_in));
+        memcpy (&create_in.open_in, arg, sizeof (*arg));
+        /* was an unbounded memcpy of strlen (name) bytes */
+        guts_copy_name (create_in.name, sizeof (create_in.name), name);
+
+        guts_req_dump (in, &create_in, sizeof (create_in));
+}
+
+
+/* Dump the fixed-size struct fuse_read_in of a read request. */
+static void
+guts_read(struct fuse_in_header *in,
+          const void *inarg)
+{
+        struct fuse_read_in *arg = (struct fuse_read_in *) inarg;
+        guts_req_dump (in, arg, sizeof (*arg));
+}
+
+/* Dump struct fuse_write_in only; the payload itself is not recorded. */
+static void
+guts_write(struct fuse_in_header *in,
+           const void *inarg)
+{
+        /* TODO: where the hell is the data to be written??? */
+        struct fuse_write_in *arg = (struct fuse_write_in *) inarg;
+        guts_req_dump (in, arg, sizeof (*arg));
+}
+
+/* Dump the fixed-size struct fuse_flush_in of a flush request. */
+static void
+guts_flush(struct fuse_in_header *in,
+           const void *inarg)
+{
+        struct fuse_flush_in *arg = (struct fuse_flush_in *) inarg;
+        guts_req_dump (in, arg, sizeof (*arg));
+}
+
+/* Dump the fixed-size struct fuse_release_in of a release request. */
+static void
+guts_release(struct fuse_in_header *in,
+             const void *inarg)
+{
+        struct fuse_release_in *arg = (struct fuse_release_in *) inarg;
+        guts_req_dump (in, arg, sizeof (*arg));
+}
+
+/* Dump the fixed-size struct fuse_fsync_in of an fsync request. */
+static void
+guts_fsync(struct fuse_in_header *in,
+           const void *inarg)
+{
+        struct fuse_fsync_in *arg = (struct fuse_fsync_in *) inarg;
+        guts_req_dump (in, arg, sizeof (*arg));
+}
+
+
+/* Dump a readdir request; it carries a struct fuse_read_in. */
+static void
+guts_readdir(struct fuse_in_header *in,
+             const void *inarg)
+{
+        struct fuse_read_in *arg = (struct fuse_read_in *) inarg;
+        guts_req_dump (in, arg, sizeof (*arg));
+}
+
+/* Dump a releasedir request; it carries a struct fuse_release_in. */
+static void
+guts_releasedir(struct fuse_in_header *in,
+                const void *inarg)
+{
+        struct fuse_release_in *arg = (struct fuse_release_in *) inarg;
+        guts_req_dump (in, arg, sizeof (*arg));
+}
+
+/* Dump an fsyncdir request; it carries a struct fuse_fsync_in. */
+static void
+guts_fsyncdir(struct fuse_in_header *in,
+              const void *inarg)
+{
+        struct fuse_fsync_in *arg = (struct fuse_fsync_in *) inarg;
+        guts_req_dump (in, arg, sizeof (*arg));
+}
+
+
+/*
+ * Dump a setxattr request: struct fuse_setxattr_in, the NUL-terminated
+ * attribute name and then the attribute value.
+ */
+static void
+guts_setxattr(struct fuse_in_header *in,
+              const void *inarg)
+{
+        struct fuse_setxattr_in *arg = (struct fuse_setxattr_in *) inarg;
+        char *name = PARAM(arg);
+        char *value = name + strlen(name) + 1;
+        struct guts_xattr_in setxattr_in;
+        size_t value_len = arg->size;
+
+        memset (&setxattr_in, 0, sizeof (setxattr_in));
+        memcpy (&setxattr_in, arg, sizeof (*arg));
+        guts_copy_name (setxattr_in.name, sizeof (setxattr_in.name), name);
+
+        /* per the FUSE protocol the value is arg->size raw bytes and need
+         * not be NUL-terminated, so strcpy() can neither bound the read nor
+         * the write; the last byte of the zeroed field stays a terminator */
+        if (value_len >= sizeof (setxattr_in.value)) {
+                gf_log ("glusterfs-guts", GF_LOG_ERROR,
+                        "xattr value of %zu bytes does not fit a %zu byte "
+                        "trace field, truncating",
+                        value_len, sizeof (setxattr_in.value));
+                value_len = sizeof (setxattr_in.value) - 1;
+        }
+        memcpy (setxattr_in.value, value, value_len);
+
+        guts_req_dump (in, &setxattr_in, sizeof (setxattr_in));
+
+}
+
+/* Dump struct fuse_getxattr_in (guts_log[] uses it for getxattr and listxattr). */
+static void
+guts_getxattr(struct fuse_in_header *in,
+              const void *inarg)
+{
+        struct fuse_getxattr_in *arg = (struct fuse_getxattr_in *) inarg;
+        guts_req_dump (in, arg, sizeof (*arg));
+}
+
+guts_log_t guts_log[] = {
+        [FUSE_LOOKUP] = { guts_name, "lookup" },
+ 
[FUSE_GETATTR] = { guts_noarg, "getattr" },
+        [FUSE_SETATTR] = { guts_setattr, "setattr" },
+        [FUSE_ACCESS] = { guts_access, "access" },
+        [FUSE_READLINK] = { guts_noarg, "readlink" },
+        [FUSE_MKNOD] = { guts_mknod, "mknod" },
+        [FUSE_MKDIR] = { guts_mkdir, "mkdir" },
+        [FUSE_UNLINK] = { guts_name, "unlink" },
+        [FUSE_RMDIR] = { guts_name, "rmdir" },
+        [FUSE_SYMLINK] = { guts_symlink, "symlink" },
+        [FUSE_RENAME] = { guts_rename, "rename" },
+        [FUSE_LINK] = { guts_link, "link" },
+        [FUSE_CREATE] = { guts_create, "create" },
+        [FUSE_OPEN] = { guts_open, "open" },
+        [FUSE_READ] = { guts_read, "read" },
+        [FUSE_WRITE] = { guts_write, "write" },
+        [FUSE_FLUSH] = { guts_flush, "flush" },
+        [FUSE_RELEASE] = { guts_release, "release" },
+        [FUSE_FSYNC] = { guts_fsync, "fsync" },
+        [FUSE_OPENDIR] = { guts_open, "opendir" },
+        [FUSE_READDIR] = { guts_readdir, "readdir" },
+        [FUSE_RELEASEDIR] = { guts_releasedir, "releasedir" },
+        [FUSE_FSYNCDIR] = { guts_fsyncdir, "fsyncdir" },
+        [FUSE_STATFS] = { guts_noarg, "statfs" },
+        [FUSE_SETXATTR] = { guts_setxattr, "setxattr" },
+        [FUSE_GETXATTR] = { guts_getxattr, "getxattr" },
+        [FUSE_LISTXATTR] = { guts_getxattr, "listxattr" },
+        [FUSE_REMOVEXATTR] = { guts_name, "removexattr" },
+};
+
+/* used for actual tracing task */
+
+/*
+ * Log one raw FUSE request and pass it to the dump handler registered for
+ * its opcode in guts_log[].
+ *
+ * buf: struct fuse_in_header followed by the opcode specific arguments.
+ * len: number of valid bytes in buf.
+ *
+ * Always returns 0; requests without a handler are only logged.
+ */
+int32_t
+guts_log_req (void *buf,
+              int32_t len)
+{
+        struct fuse_in_header *in = buf;
+        const void *inargs = NULL;
+        int32_t header_len = sizeof (struct fuse_in_header);
+        size_t nr_handlers = sizeof (guts_log) / sizeof (guts_log[0]);
+
+        /* NOTE(review): a request consisting of the header alone
+         * (len == header_len) also takes the error branch -- confirm that
+         * argument-less requests are not meant to be traced */
+        if (header_len < len ) {
+                /* arithmetic on void * is a GNU extension; use char * */
+                inargs = (char *) buf + header_len;
+                gf_log ("guts-gimmik", GF_LOG_ERROR,
+                        "unique: %llu, opcode: %s (%i), nodeid: %lu, insize: %zu\n",
+                        (unsigned long long) in->unique, "",
+                        /*opname((enum fuse_opcode) in->opcode),*/ in->opcode,
+                        (unsigned long) in->nodeid, (size_t) len);
+                /* guts_log[] ends at its highest designated opcode; anything
+                 * above that would be read from beyond the table */
+                if (in->opcode < nr_handlers && guts_log[in->opcode].func)
+                        guts_log[in->opcode].func (in, inargs);
+
+        } else {
+                gf_log ("guts", GF_LOG_ERROR,
+                        "header is longer than the buffer passed");
+        }
+
+        return 0;
+}
+
+
+/*
+ * guts_reply_*() wrap the fuse_reply_*() calls. In trace mode (IS_TRACE)
+ * the reply is dumped with guts_reply_dump() and then sent through fuse. In
+ * replay mode nothing is sent: the reply is compared with, or mapped onto,
+ * the one recorded for the same req->unique.
+ */
+
+/*
+ * Error reply. In replay mode a release/releasedir drops the fd mapping of
+ * the recorded file handle, and err == -1 for any other opcode terminates
+ * the replay with a non-zero exit status.
+ */
+int
+guts_reply_err (fuse_req_t req, int err)
+{
+        if (IS_TRACE(req)) {
+                /* we are tracing calls, just dump the reply to file and continue with fuse_reply_err() */
+                guts_reply_dump (req, &err, sizeof (err));
+                return fuse_reply_err (req, err);
+        } else {
+                /* we are replaying. ;) */
+                guts_replay_ctx_t *ctx = (guts_replay_ctx_t *) req->u.ni.data;
+                int32_t opcode = guts_get_opcode (ctx, req->unique);
+
+                /* see if we are called by close/closedir, if yes remove do a guts_fd_delete () */
+                if (opcode == FUSE_RELEASEDIR || opcode == FUSE_RELEASE) {
+                        guts_req_t *request = guts_lookup_request (ctx, req->unique);
+
+                        /* nothing to unmap without a recorded request */
+                        if (request && request->arg) {
+                                struct fuse_release_in *arg = request->arg;
+
+                                guts_delete_fd (ctx, arg->fh);
+                        }
+                } else if (err == -1) {
+                        /* error while replaying?? just quit as of now
+                         * TODO: this is not the right way */
+                        printf (":O - glusterfs-guts: replay failed\n");
+                        /* was exit (0), which reported the failure as success */
+                        exit (1);
+                }
+
+                return 0;
+        }
+}
+
+/* Empty reply: dumped and forwarded in trace mode, ignored in replay mode. */
+void
+guts_reply_none (fuse_req_t req)
+{
+        if (IS_TRACE(req)) {
+                guts_reply_dump (req, NULL, 0);
+                fuse_reply_none (req);
+        } else {
+                return;
+        }
+}
+
+/*
+ * Replay mode only: look up the reply recorded for req. Logs and returns
+ * NULL when there is no record or the record has no payload; "what" names
+ * the caller in that log message.
+ */
+static guts_reply_t *
+guts_recorded_reply (fuse_req_t req, const char *what)
+{
+        guts_replay_ctx_t *ctx = (guts_replay_ctx_t *) req->u.ni.data;
+        guts_reply_t *reply = guts_lookup_reply (ctx, req->unique);
+
+        if (!reply || !reply->arg) {
+                gf_log ("glusterfs-guts", GF_LOG_ERROR,
+                        "%s: no recorded reply for request %llu",
+                        what, (unsigned long long) req->unique);
+                return NULL;
+        }
+
+        return reply;
+}
+
+/* Entry reply; replay maps the recorded inode number onto e->ino. */
+int
+guts_reply_entry (fuse_req_t req,
+                  const struct fuse_entry_param *e)
+{
+        if (IS_TRACE(req)) {
+                guts_reply_dump (req, e, sizeof (*e));
+                return fuse_reply_entry (req, e);
+        } else {
+                /* TODO: is dict_set() the best solution for this case?? */
+                guts_replay_ctx_t *ctx = (guts_replay_ctx_t *) req->u.ni.data;
+                guts_reply_t *reply = guts_recorded_reply (req, "entry");
+                struct fuse_entry_param *old_entry = NULL;
+
+                /* no record, nothing to map (same result as guts_reply_open) */
+                if (!reply)
+                        return 0;
+
+                old_entry = (struct fuse_entry_param *) reply->arg;
+                guts_inode_update (ctx, old_entry->ino, e->ino);
+                return 0;
+        }
+}
+
+/* Create reply; replay maps the recorded file handle onto f->fh. */
+int
+guts_reply_create (fuse_req_t req,
+                   const struct fuse_entry_param *e,
+                   const struct fuse_file_info *f)
+{
+        if (IS_TRACE(req)) {
+                struct guts_create_out create_out;
+
+                memset (&create_out, 0, sizeof (create_out));
+                memcpy (&create_out.e, e, sizeof (*e));
+                memcpy (&create_out.f, f, sizeof (*f));
+
+                guts_reply_dump (req, &create_out, sizeof (create_out));
+                return fuse_reply_create (req, e, f);
+        } else {
+                guts_replay_ctx_t *ctx = (guts_replay_ctx_t *) req->u.ni.data;
+                guts_reply_t *reply = guts_recorded_reply (req, "create");
+                struct guts_create_out *old_createout = NULL;
+                struct fuse_file_info *old_f = NULL;
+
+                if (!reply)
+                        return 0;
+
+                old_createout = (struct guts_create_out *) reply->arg;
+                old_f = &old_createout->f;
+
+                /* add a new fd and map it to the file handle, as stored in tio file */
+                guts_fd_add (ctx, old_f->fh, (fd_t *)(long)f->fh);
+
+                return 0;
+        }
+}
+
+
+/* Attr reply; replay returns 0 when guts_attr_cmp() reports a match, else -1. */
+int
+guts_reply_attr (fuse_req_t req,
+                 const struct stat *attr,
+                 double attr_timeout)
+{
+        if (IS_TRACE(req)) {
+                struct guts_attr_out attr_out;
+
+                /* the whole struct is dumped; zero it first like
+                 * guts_reply_create() does */
+                memset (&attr_out, 0, sizeof (attr_out));
+                memcpy (&attr_out.attr, attr, sizeof (*attr));
+                attr_out.attr_timeout = attr_timeout;
+
+                guts_reply_dump (req, &attr_out, sizeof (attr_out));
+                return fuse_reply_attr (req, attr, attr_timeout);
+        } else {
+                guts_reply_t *reply = guts_recorded_reply (req, "attr");
+                struct guts_attr_out *old_attrout = NULL;
+
+                if (!reply)
+                        return -1;
+
+                old_attrout = (struct guts_attr_out *) reply->arg;
+                if (!guts_attr_cmp (attr, &old_attrout->attr))
+                        return 0;
+                else {
+                        gf_log ("glusterfs-guts", GF_LOG_ERROR,
+                                "attr failed.");
+                        return -1;
+                }
+        }
+}
+
+/* Readlink reply; replay returns 0 when the link target equals the recorded one, else -1. */
+int
+guts_reply_readlink (fuse_req_t req,
+                     const char *linkname)
+{
+        if (IS_TRACE(req)) {
+                guts_reply_dump (req, linkname, strlen (linkname));
+                return fuse_reply_readlink (req, linkname);
+        } else {
+                guts_reply_t *reply = guts_recorded_reply (req, "readlink");
+                char *old_linkname = NULL;
+                size_t linkname_len = strlen (linkname);
+
+                if (!reply)
+                        return -1;
+
+                old_linkname = (char *) reply->arg;
+                /* trace mode records strlen (linkname) bytes, i.e. without
+                 * the terminating NUL, so compare by recorded length rather
+                 * than with strcmp() */
+                if ((reply->arg_len == linkname_len) &&
+                    !memcmp (linkname, old_linkname, linkname_len))
+                        return 0;
+                else {
+                        gf_log ("glusterfs-guts", GF_LOG_ERROR,
+                                "readlink failed. linkname in tio file: %.*s \n linkname recieved on replay: %s",
+                                (int) reply->arg_len, old_linkname, linkname);
+                        return -1;
+                }
+        }
+}
+
+/* Open reply; replay maps the recorded file handle onto f->fh. */
+int
+guts_reply_open (fuse_req_t req,
+                 const struct fuse_file_info *f)
+{
+        if (IS_TRACE(req)) {
+                guts_reply_dump (req, f, sizeof (*f));
+                return fuse_reply_open (req, f);
+        } else {
+                /* the fd we receive here is the valid fd for our current session, map the indicative number we have
+                 * in mapping */
+                guts_replay_ctx_t *ctx = (guts_replay_ctx_t *) req->u.ni.data;
+                guts_reply_t *reply = guts_lookup_reply (ctx, req->unique);
+
+                if (reply) {
+                        struct fuse_file_info *old_f = reply->arg;
+
+                        /* add a new fd and map it to the file handle, as stored in tio file */
+                        guts_fd_add (ctx, old_f->fh, (fd_t *)(long)f->fh);
+                }
+
+                return 0;
+        }
+}
+
+/* Write reply; replay returns 0 when count equals the recorded count, else -1. */
+int
+guts_reply_write (fuse_req_t req,
+                  size_t count)
+{
+        if (IS_TRACE(req)) {
+                guts_reply_dump (req, &count, sizeof (count));
+                return fuse_reply_write (req, count);
+        } else {
+                guts_reply_t *reply = guts_recorded_reply (req, "writev");
+                size_t *old_count = NULL;
+
+                if (!reply)
+                        return -1;
+
+                old_count = reply->arg;
+                if (count == *old_count)
+                        return 0;
+                else {
+                        /* log the recorded value, not the pointer to it */
+                        gf_log ("glusterfs-guts", GF_LOG_ERROR,
+                                "writev failed. old writev count: %zu \n writev count on replay: %zu",
+                                *old_count, count);
+                        return -1;
+                }
+        }
+}
+
+/* Buffer reply; replay returns 0 when size and contents equal the recorded buffer, else -1. */
+int
+guts_reply_buf (fuse_req_t req,
+                const char *buf,
+                size_t size)
+{
+        if (IS_TRACE(req)) {
+                guts_reply_dump (req, buf, size);
+                return fuse_reply_buf (req, buf, size);
+        } else {
+                guts_reply_t *reply = guts_recorded_reply (req, "readv");
+                char *old_buf = NULL;
+                size_t old_size = 0;
+
+                if (!reply)
+                        return -1;
+
+                old_buf = reply->arg;
+                old_size = reply->arg_len;
+                if ((size == old_size) && (!memcmp (buf, old_buf, size)))
+                        return 0;
+                else {
+                        gf_log ("glusterfs-guts", GF_LOG_ERROR,
+                                "readv failed. old readv size: %zu \n readv size on replay: %zu",
+                                old_size, size);
+                        return -1;
+                }
+        }
+}
+
+/* Statfs reply; replay returns 0 when guts_statvfs_cmp() reports a match, else -1. */
+int
+guts_reply_statfs (fuse_req_t req,
+                   const struct statvfs *stbuf)
+{
+        if (IS_TRACE(req)) {
+                guts_reply_dump (req, stbuf, sizeof (*stbuf));
+                return fuse_reply_statfs (req, stbuf);
+        } else {
+                guts_reply_t *reply = guts_recorded_reply (req, "statfs");
+                struct statvfs *old_stbuf = NULL;
+
+                if (!reply)
+                        return -1;
+
+                old_stbuf = reply->arg;
+                if (!guts_statvfs_cmp (old_stbuf, stbuf))
+                        return 0;
+                else {
+                        gf_log ("glusterfs-guts", GF_LOG_ERROR,
+                                "statfs failed.");
+                        return -1;
+                }
+        }
+}
+
+/* Xattr size reply; replay returns 0 when count equals the recorded count, else -1. */
+int
+guts_reply_xattr (fuse_req_t req,
+                  size_t count)
+{
+        if (IS_TRACE(req)) {
+                guts_reply_dump (req, &count, sizeof (count));
+                return fuse_reply_xattr (req, count);
+        } else {
+                guts_reply_t *reply = guts_recorded_reply (req, "xattr");
+                size_t *old_count = NULL;
+
+                if (!reply)
+                        return -1;
+
+                old_count = reply->arg;
+                if (count == *old_count)
+                        return 0;
+                else {
+                        /* log the recorded value, not the pointer to it */
+                        gf_log ("glusterfs-guts", GF_LOG_ERROR,
+                                "xattr failed. old xattr count: %zu \n xattr count on replay: %zu",
+                                *old_count, count);
+                        return -1;
+                }
+        }
+}
+
+/* Lock reply; replay returns 0 when guts_flock_cmp() reports a match, else -1. */
+int
+guts_reply_lock (fuse_req_t req,
+                 struct flock *lock)
+{
+        if (IS_TRACE(req)) {
+                guts_reply_dump (req, lock , sizeof (*lock));
+                return fuse_reply_lock (req, lock);
+        } else {
+                guts_reply_t *reply = guts_recorded_reply (req, "lock");
+                struct flock *old_lock = NULL;
+
+                if (!reply)
+                        return -1;
+
+                old_lock = (struct flock *)reply->arg;
+                if (!guts_flock_cmp (lock, old_lock))
+                        return 0;
+                else {
+                        gf_log ("glusterfs-guts", GF_LOG_ERROR,
+                                "lock failed.");
+                        return -1;
+                }
+        }
+}
diff --git a/glusterfs-guts/src/guts-trace.h b/glusterfs-guts/src/guts-trace.h
new file mode 100644
index 000000000..c877b2bcf
--- /dev/null
+++ b/glusterfs-guts/src/guts-trace.h
@@ -0,0 +1,54 @@
+/*
+  Copyright (c) 2006, 2007, 2008 Z RESEARCH, Inc.
+  This file is part of GlusterFS.
+
+  GlusterFS is free software; you can redistribute it and/or modify
+  it under the terms of the GNU General Public License as published
+  by the Free Software Foundation; either version 3 of the License,
+  or (at your option) any later version.
+
+  GlusterFS is distributed in the hope that it will be useful, but
+  WITHOUT ANY WARRANTY; without even the implied warranty of
+  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+  General Public License for more details.
+
+  You should have received a copy of the GNU General Public License
+  along with this program. If not, see
+  <http://www.gnu.org/licenses/>.
+*/
+
+#define IS_TRACE(req) ((req)->ch != NULL)
+
+#define PARAM(inarg) (((char *)(inarg)) + sizeof(*(inarg)))
+
+struct guts_symlink_in {
+        char name[NAME_MAX];
+        char linkname[NAME_MAX];
+};
+
+struct guts_create_in {
+        struct fuse_open_in open_in;
+        char name[NAME_MAX];
+};
+
+struct guts_xattr_in {
+        struct fuse_setxattr_in xattr;
+        char name[NAME_MAX];
+        char value[NAME_MAX];
+};
+
+struct guts_rename_in {
+        struct fuse_rename_in rename;
+        char oldname[NAME_MAX];
+        char newname[NAME_MAX];
+};
+
+struct guts_create_out {
+        struct fuse_entry_param e;
+        struct fuse_file_info f;
+};
+
+struct guts_attr_out {
+        struct stat attr;
+        double attr_timeout;
+};
diff --git a/glusterfs.spec.in b/glusterfs.spec.in
new file mode 100644
index 000000000..1740bbc36
--- /dev/null
+++ b/glusterfs.spec.in
@@ -0,0 +1,256 @@
+# if you make changes, then it is advised to increment this number, and provide
+# a descriptive suffix to identify who owns or what the change represents
+# e.g. release_version 2.MSW
+%define release_version 1
+
+# if you wish to compile an rpm without ibverbs support, compile like this...
+# rpmbuild -ta @PACKAGE_NAME@-@PACKAGE_VERSION@.tar.gz --without ibverbs
+%define with_ibverbs %{?_without_ibverbs:0}%{?!_without_ibverbs:1}
+
+# if you wish to compile an rpm without building the client RPMs...
+# rpmbuild -ta @PACKAGE_NAME@-@PACKAGE_VERSION@.tar.gz --without client
+%define with_client %{?_without_client:0}%{?!_without_client:1}
+
+# if you wish to compile an rpm without BDB translator...
+# rpmbuild -ta @PACKAGE_NAME@-@PACKAGE_VERSION@.tar.gz --without bdb
+%define with_bdb %{?_without_bdb:0}%{?!_without_bdb:1}
+
+# if you wish to compile an rpm without libglusterfsclient...
+# rpmbuild -ta @PACKAGE_NAME@-@PACKAGE_VERSION@.tar.gz --without libglfsclient
+%define with_libglfsclient %{?_without_libglfsclient:0}%{?!_without_libglfsclient:1}
+
+# if you wish to compile an rpm without mod_glusterfs support...
+# rpmbuild -ta @PACKAGE_NAME@-@PACKAGE_VERSION@.tar.gz --without modglfs +%define with_modglfs %{?_without_modglfs:0}%{?!_without_modglfs:1} + +# if you wish to compile an rpm with apache at nonstandard location +# rpmbuild -ta @PACKAGE_NAME@-@PACKAGE_VERSION@.tar.gz --without apxs_default --define 'apxs_path /usr/local/apache/bin' +%define with_apxs_default %{?_without_apxs_default:0}%{?!_without_apxs_default:1} +%{!?apxs_path: %define apxs_path %{nil}} + +# if you wish to compile an rpm with apache binaries at nonstandard path +# rpmbuild -ta @PACKAGE_NAME@-@PACKAGE_VERSION@.tar.gz --without apache_auto -define 'apxs_bin_path /usr/local/apache/bin/apxs' +# rpmbuild -ta @PACKAGE_NAME@-@PACKAGE_VERSION@.tar.gz --without apache_auto -define 'apache_bin_path /usr/local/apache/bin/apache2' +%define with_apache_auto %{?_without_apache_auto:0}%{?!_without_apache_auto:1} +%{!?apxs_bin_path: %define apxs_bin_path %{nil}} +%{!?apache_bin_path: %define apache_bin_path %{nil}} + +Summary: GNU Cluster File System +Name: @PACKAGE_NAME@ +Version: @PACKAGE_VERSION@ +Release: %release_version +License: GPLv3 or later +Group: System Environment/Base +Vendor: Z RESEARCH Inc +Packager: @PACKAGE_BUGREPORT@ +BuildRoot: %_tmppath/%name-%version-%release-root +%if %with_ibverbs +BuildRequires: libibverbs-devel +%endif +%if %with_bdb +BuildRequires: db4-devel +%endif +%if %with_client +BuildRequires: fuse-devel +%endif +# Module needs to be fixed. +%if %with_modglfs +%if %with_apxs_default +BuildRequires: apache-devel >= 1.3 +Requires: apache >= 1.3 +%endif +%endif +BuildRequires: libtool +BuildRequires: byacc bison flex +BuildRequires: gcc +BuildRequires: make +URL: ftp://ftp.zresearch.com/pub/gluster/glusterfs/1.4-qa/@PACKAGE_NAME@-@PACKAGE_VERSION@.tar.gz +Source: %name-%version.tar.gz + +%description +GlusterFS is a clustered file-system capable of scaling to several +peta-bytes. 
It aggregates various storage bricks over Infiniband RDMA +or TCP/IP interconnect into one large parallel network file +system. GlusterFS is one of the most sophisticated file system in +terms of features and extensibility. It borrows a powerful concept +called Translators from GNU Hurd kernel. Much of the code in GlusterFS +is in userspace and easily manageable. + +%package devel +Summary: GlusterFS Development Libraries +Group: Development/Libraries +Requires: %name = %version + +%description devel +GlusterFS is a clustered file-system capable of scaling to several +peta-bytes. It aggregates various storage bricks over Infiniband RDMA +or TCP/IP interconnect into one large parallel network file +system. GlusterFS is one of the most sophisticated file system in +terms of features and extensibility. It borrows a powerful concept +called Translators from GNU Hurd kernel. Much of the code in GlusterFS +is in userspace and easily manageable. + +This package provides the development libraries. + + +%prep +# then -n argument says that the unzipped version is NOT %name-%version +#%setup -n %name-%version +%setup + + +%build +%if "%{with_client}" == "0" +%define client_options --disable-fuse-client +%endif +%if "%{with_ibverbs}" == "0" +%define ibverbs_options --disable-ibverbs +%endif +%if "%{with_bdb}" == "0" +%define bdb_options --disable-bdb +%endif +%if "%{with_libglfsclient}" == "0" +%define libglfs_options --disable-libglusterfsclient +%endif +# Module needs to be fixed. 
+%if "%{with_modglfs}" == "0" +%define modglfs_options --disable-mod_glusterfs +%endif +%if "%{with_modglfs}" == "1" +%if "%{with_apxs_default}" == "0" +%define apxs_options --with-apxs=%{?apxs_path:%apxs_path} +%endif +%endif +%if "%{with_modglfs}" == "1" +%if "%{with_apache_auto}" == "0" +%define apxs_bin_options --with-apxspath=%{?apxs_bin_path:%apxs_bin_path} +%define apache_bin_options --with-apachepath=%{?apache_bin_path:%apache_bin_path} +%endif +%endif + +%configure --prefix=/usr --sysconfdir=/etc --localstatedir=/var --libdir=%_libdir %{?client_options:%client_options} %{?ibverbs_options:%ibverbs_options} %{?bdb_options:%bdb_options} %{?libglfs_options:%libglfs_options} %{?modglfs_options:%modglfs_options} %{?apxs_options:%apxs_options} %{?apxs_bin_options:%apxs_bin_options} %{?apache_bin_options:%apache_bin_options} +%{__make} + + +%install +%{__rm} -rf $RPM_BUILD_ROOT +%{__make} install DESTDIR=$RPM_BUILD_ROOT +%{__rm} -rf $RPM_BUILD_ROOT/share/ +%{__mkdir_p} $RPM_BUILD_ROOT/usr/include/glusterfs +%{__mkdir_p} $RPM_BUILD_ROOT/var/log/glusterfs +%{__cp} %_builddir/%name-%version/libglusterfs/src/*.h $RPM_BUILD_ROOT/usr/include/glusterfs/ + + +%files +%doc AUTHORS ChangeLog COPYING INSTALL NEWS README +%_libdir +%dir /var/log/glusterfs +%exclude %_libdir/*.a +%exclude %_libdir/*.la +%exclude /usr/include/libglusterfsclient.h +%doc /usr/share/doc/glusterfs +%config /etc/glusterfs +%_prefix/sbin/glusterfs +%_prefix/sbin/glusterfsd +%_mandir/man8/glusterfs.8.gz +%_infodir/user-guide.info.gz +%exclude %_infodir/dir + +%if %with_client +/sbin/mount.glusterfs +%endif + +%files devel +%doc AUTHORS ChangeLog COPYING INSTALL NEWS README THANKS +%_libdir/*.a +%exclude %_libdir/*.la +%_prefix/include +%exclude /usr/include/glusterfs/y.tab.h + +%post +ldconfig -n %_libdir +%if %with_modglfs +%if %with_apxs_default +%{_sbindir}/apxs -i -a -n glusterfs %{_libdir}/glusterfs/%version/apache-1.3/mod_glusterfs.so +%else +%{apxs_path}/apxs -i -a -n glusterfs 
%{_libdir}/glusterfs/%version/apache-1.3/mod_glusterfs.so +%endif +%endif + +%postun +ldconfig + +%clean +%{__rm} -rf $RPM_BUILD_ROOT + + +%changelog +* Fri Dec 12 2008 Harshavardhana - 1.4 +- Added new options with --with-apxspath --with-apachepath + new configure options. + %post install command ldconfig moved up by one line. + +* Thu May 08 2008 Harshavardhana - 1.4 +- Added proper checks for apache-1.3 dependency, and enhanced + post install scripts + +* Wed Apr 23 2008 Harshavardhana - 1.4 +- Removed two new packages due to Excerpts From Amar's reviews. + +* Mon Apr 21 2008 Harshavardhana - 1.4 +- Fixed some build problems. And changed BuildRequires with httpd + and lighttpd(1.4) version. +- created libglusterfsclient and modglusterfs new packages. + +* Sat Apr 19 2008 Amar Tumballi - 1.3.8pre6 +- Merged common, client and server packages into one package. +- Added options to disable bdb, mod_glusterfs, libglusterfsclient + +* Fri Apr 11 2008 Harshavardhana - 1.3.8pre5 +- Changed many hardcoded variables to standard rpm variables. Removed + *.la unnecessary for the release. Python option removed as it + is not present with the coming releases. + +* Tue Feb 12 2008 Harshavardhana - 1.3.8 +- Replaced configure_options with different names for each configure + options as it is observed that configure_options never get appended + with extra options provided. + +* Wed Jan 16 2008 Matt Paine - 1.3.8 +- Change all /usr/libx directory references to %_libdir +- Added new switch to enable build without building client RPMS + +* Sun Jan 6 2008 Anand V. 
Avati - 1.3.8 +- glusterfs-booster.so back in libdir + +* Fri Nov 09 2007 Harshavardhana Ranganath - 1.3.8 +- Bumped to new version fixed problem with build for new glusterfs-booster.so + inside /usr/bin + +* Sun Oct 18 2007 Harshavardhana Ranganath - 1.3.7 +- Bumped to new version + +* Sun Oct 18 2007 Harshavardhana Ranganath - 1.3.6 +- Bumped to new version + +* Sun Oct 14 2007 Harshavardhana Ranganath - 1.3.5 +- Bumped to new version + +* Tue Oct 09 2007 Harshavardhana Ranganath - 1.3.4 +- Bumped to new version + +* Tue Oct 02 2007 Harshavardhana Ranganath - 1.3.3 +- Bumped to new version + +* Tue Oct 02 2007 Harshavardhana Ranganath - 1.3.2 +- Bumped to new version + +* Thu Sep 20 2007 Harshavardhana Ranganath - 1.3.1 +- built new rpms with ibverbs seperate + +* Sat Aug 4 2007 Matt Paine - 1.3.pre7 +- Added support to build rpm without ibverbs support (use --without ibverbs switch) + +* Sun Jul 15 2007 Matt Paine - 1.3.pre6 +- Initial spec file + diff --git a/glusterfsd/Makefile.am b/glusterfsd/Makefile.am new file mode 100644 index 000000000..d471a3f92 --- /dev/null +++ b/glusterfsd/Makefile.am @@ -0,0 +1,3 @@ +SUBDIRS = src + +CLEANFILES = diff --git a/glusterfsd/src/Makefile.am b/glusterfsd/src/Makefile.am new file mode 100644 index 000000000..060917930 --- /dev/null +++ b/glusterfsd/src/Makefile.am @@ -0,0 +1,24 @@ +sbin_PROGRAMS = glusterfsd + +glusterfsd_SOURCES = glusterfsd.c fetch-spec.c +glusterfsd_LDADD = $(top_builddir)/libglusterfs/src/libglusterfs.la $(GF_LDADD) +glusterfsd_LDFLAGS = $(GF_LDFLAGS) $(GF_GLUSTERFS_LDFLAGS) +noinst_HEADERS = glusterfsd.h + +AM_CFLAGS = -fPIC -Wall -D_FILE_OFFSET_BITS=64 -D_GNU_SOURCE -D$(GF_HOST_OS)\ + -I$(top_srcdir)/libglusterfs/src -DDATADIR=\"$(localstatedir)\" \ + -DCONFDIR=\"$(sysconfdir)/glusterfs\" $(GF_GLUSTERFS_CFLAGS) + +CLEANFILES = + +$(top_builddir)/libglusterfs/src/libglusterfs.la: + $(MAKE) -C $(top_builddir)/libglusterfs/src/ all + +uninstall-local: + rm -f $(DESTDIR)$(sbindir)/glusterfs + 
+install-data-local:
+	$(INSTALL) -d -m 755 $(DESTDIR)$(localstatedir)/run
+	$(INSTALL) -d -m 755 $(DESTDIR)$(localstatedir)/log/glusterfs
+	rm -f $(DESTDIR)$(sbindir)/glusterfs
+	ln -s glusterfsd $(DESTDIR)$(sbindir)/glusterfs
diff --git a/glusterfsd/src/fetch-spec.c b/glusterfsd/src/fetch-spec.c
new file mode 100644
index 000000000..4009a55b3
--- /dev/null
+++ b/glusterfsd/src/fetch-spec.c
@@ -0,0 +1,336 @@
+/*
+  Copyright (c) 2007, 2008 Z RESEARCH, Inc.
+  This file is part of GlusterFS.
+
+  GlusterFS is free software; you can redistribute it and/or modify
+  it under the terms of the GNU General Public License as published
+  by the Free Software Foundation; either version 3 of the License,
+  or (at your option) any later version.
+
+  GlusterFS is distributed in the hope that it will be useful, but
+  WITHOUT ANY WARRANTY; without even the implied warranty of
+  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+  General Public License for more details.
+
+  You should have received a copy of the GNU General Public License
+  along with this program. If not, see
+  <http://www.gnu.org/licenses/>.
+*/
+
+#include
+#include
+#include
+#include
+
+#ifndef _CONFIG_H
+#define _CONFIG_H
+#include "config.h"
+#endif /* _CONFIG_H */
+
+#include "glusterfs.h"
+#include "stack.h"
+#include "dict.h"
+#include "transport.h"
+#include "event.h"
+#include "defaults.h"
+
+
+/*
+ * mop getspec callback, run in the forked fetch child.
+ *
+ * Writes the volume specification received from the server to the FILE *
+ * stashed in frame->local and terminates the child: exit status 0 when the
+ * spec was received and written completely, non-zero otherwise.
+ */
+static int
+fetch_cbk (call_frame_t *frame,
+           void *cookie,
+           xlator_t *this,
+           int32_t op_ret,
+           int32_t op_errno,
+           char *spec_data)
+{
+        FILE   *spec_fp = NULL;
+        size_t  spec_len = 0;
+        int     status = 1;
+
+        spec_fp = frame->local;
+
+        if (op_ret < 0) {
+                gf_log (frame->this->name, GF_LOG_ERROR,
+                        "GETSPEC from server returned -1 (%s)",
+                        strerror (op_errno));
+        }
+        else if (spec_data == NULL) {
+                gf_log (frame->this->name, GF_LOG_ERROR,
+                        "GETSPEC from server returned no volume specification");
+        }
+        else {
+                spec_len = strlen (spec_data);
+
+                /* a short write or a failed flush would hand the parent a
+                 * truncated volfile, so report either as failure */
+                if ((fwrite (spec_data, 1, spec_len, spec_fp) == spec_len) &&
+                    (fflush (spec_fp) == 0))
+                        status = 0;
+                else
+                        gf_log (frame->this->name, GF_LOG_ERROR,
+                                "failed to write volume specification");
+
+                fclose (spec_fp);
+        }
+
+        frame->local = NULL;
+        STACK_DESTROY (frame->root);
+
+        /* exit the child process; the parent only tests for a zero status */
+        exit (status);
+}
+
+
+/*
+ * notify() of the "top" xlator: on GF_EVENT_CHILD_UP wind a getspec call
+ * for the configured volfile-id to the child; GF_EVENT_CHILD_DOWN is
+ * ignored and every other event goes to default_notify().
+ */
+static int
+fetch_notify (xlator_t *this_xl, int event, void *data, ...)
+{
+        int ret = 0;
+        call_frame_t *frame = NULL;
+
+        switch (event)
+        {
+        case GF_EVENT_CHILD_UP:
+                frame = create_frame (this_xl, this_xl->ctx->pool);
+                frame->local = this_xl->private;
+
+                STACK_WIND (frame, fetch_cbk,
+                            this_xl->children->xlator,
+                            this_xl->children->xlator->mops->getspec,
+                            this_xl->ctx->cmd_args.volfile_id,
+                            0);
+                break;
+        case GF_EVENT_CHILD_DOWN:
+                break;
+        default:
+                ret = default_notify (this_xl, event, data);
+                break;
+        }
+
+        return ret;
+}
+
+
+/* init() stub shared by the two hand-built xlators; nothing to set up. */
+static int
+fetch_init (xlator_t *xl)
+{
+        return 0;
+}
+
+/*
+ * Build and initialise the two-node graph used for fetching: "top" (whose
+ * notify is fetch_notify) over "trans", a protocol/client xlator.
+ *
+ * Returns "top", or NULL when xlator_tree_init() fails.
+ */
+static xlator_t *
+get_shrub (glusterfs_ctx_t *ctx,
+           const char *remote_host,
+           const char *transport,
+           uint32_t remote_port)
+{
+        int ret = 0;
+        xlator_t *top = NULL;
+        xlator_t *trans = NULL;
+        xlator_list_t *parent = NULL, *tmp = NULL;
+
+        top = CALLOC (1, sizeof (*top));
+        ERR_ABORT (top);
+        trans = CALLOC (1, sizeof (*trans));
+        ERR_ABORT (trans);
+
+        top->name = "top";
+        top->ctx = ctx;
+        top->next = trans;
+        top->init = fetch_init;
+        top->notify = fetch_notify;
+        top->children = (void *) CALLOC (1, sizeof (*top->children));
+        ERR_ABORT (top->children);
+        top->children->xlator = trans;
+
+        trans->name = "trans";
+        trans->ctx = ctx;
+        trans->prev = top;
+        trans->init = fetch_init;
+        trans->notify = default_notify;
+        trans->options = get_new_dict ();
+
+        parent = CALLOC (1, sizeof(*parent));
+        /* checked like the other allocations; it is dereferenced next */
+        ERR_ABORT (parent);
+        parent->xlator = top;
+        if (trans->parents == NULL)
+                trans->parents = parent;
+        else {
+                tmp = trans->parents;
+                while (tmp->next)
+                        tmp = tmp->next;
+                tmp->next = parent;
+        }
+
+        /* failures are OR-ed together and reported once below */
+        if (remote_host)
+                ret |= dict_set_static_ptr (trans->options, "remote-host",
+                                            (char *)remote_host);
+
+        if (remote_port)
+                ret |= dict_set_uint32 (trans->options, "remote-port",
+                                        remote_port);
+
+        /* 'option remote-subvolume ' is needed here even though
+         * its not used
+         */
+        ret |= dict_set_static_ptr (trans->options, "remote-subvolume",
+                                    "brick");
+        ret |= dict_set_static_ptr (trans->options, "disable-handshake", "on");
+        ret |= dict_set_static_ptr (trans->options, "non-blocking-io", "off");
+
+        if (transport) {
+                char *transport_type = CALLOC (1, strlen (transport) + 10);
+                ERR_ABORT (transport_type);
+                strcpy(transport_type, transport);
+
+                if (strchr (transport_type, ':'))
+                        *(strchr (transport_type, ':')) = '\0';
+
+                ret |= dict_set_dynstr (trans->options, "transport-type",
+                                        transport_type);
+        }
+
+        if (ret != 0)
+                gf_log ("glusterfs", GF_LOG_ERROR,
+                        "failed to set one or more options for the "
+                        "volfile-server connection");
+
+        xlator_set_type (trans, "protocol/client");
+
+        if (xlator_tree_init (top) != 0)
+                return NULL;
+
+        return top;
+}
+
+
+/*
+ * Child-side driver: build the fetch graph, store spec_fp in its private
+ * field and run the event loop. Returns -1 if the graph could not be
+ * built, 0 if event_dispatch() returns.
+ */
+static int
+_fetch (glusterfs_ctx_t *ctx,
+        FILE *spec_fp,
+        const char *remote_host,
+        const char *transport,
+        uint32_t remote_port)
+{
+        xlator_t *this = NULL;
+
+        this = get_shrub (ctx, remote_host, transport, remote_port);
+        if (this == NULL)
+                return -1;
+
+        this->private = spec_fp;
+
+        event_dispatch (ctx->event_pool);
+
+        return 0;
+}
+
+
+/*
+ * Run _fetch() in a forked child and wait for it.
+ *
+ * Returns 0 when the child exited with status 0, -1 when fork() or wait()
+ * failed or the child did not exit normally, non-zero otherwise.
+ */
+static int
+_fork_and_fetch (glusterfs_ctx_t *ctx,
+                 FILE *spec_fp,
+                 const char *remote_host,
+                 const char *transport,
+                 uint32_t remote_port)
+{
+        int pid = -1;
+        int status = 0;
+
+        pid = fork ();
+        switch (pid) {
+        case -1:
+                perror ("fork()");
+                return -1;
+        case 0:
+                /* child: fetch_cbk() exits on completion, so getting back
+                 * here means the spec was not fetched. Never fall through
+                 * into the parent's code path. */
+                _fetch (ctx, spec_fp, remote_host,
+                        transport, remote_port);
+                exit (1);
+        default:
+                /* parent */
+                break;
+        }
+
+        if (wait (&status) == -1) {
+                perror ("wait()");
+                return -1;
+        }
+
+        /* a child killed by a signal has no exit status to look at */
+        if (!WIFEXITED (status))
+                return -1;
+
+        return WEXITSTATUS (status);
+}
+
+/*
+ * Fetch the volume specification from ctx->cmd_args.volfile_server into a
+ * temporary file, using a forked child for the network round trip.
+ *
+ * Returns the still-open temporary file rewound to its start, or NULL on
+ * failure.
+ */
+FILE *
+fetch_spec (glusterfs_ctx_t *ctx)
+{
+        char *remote_host = NULL;
+        char *transport = NULL;
+        FILE *spec_fp;
+        int32_t ret;
+
+        spec_fp = tmpfile ();
+
+        if (!spec_fp) {
+                perror ("tmpfile ()");
+                return NULL;
+        }
+
+        remote_host = ctx->cmd_args.volfile_server;
+        transport = ctx->cmd_args.volfile_server_transport;
+        if (!transport)
+                transport = "socket";
+
+        ret = _fork_and_fetch (ctx, spec_fp, remote_host, transport,
+                               ctx->cmd_args.volfile_server_port);
+
+        if (!ret) {
+                fseek (spec_fp, 0, SEEK_SET);
+        }
+        else {
+                fclose (spec_fp);
+                spec_fp = NULL;
+        }
+
+        return spec_fp;
+}
diff --git a/glusterfsd/src/glusterfsd.c b/glusterfsd/src/glusterfsd.c
new file mode 100644
index 000000000..545f40e80 --- /dev/null +++ b/glusterfsd/src/glusterfsd.c @@ -0,0 +1,1123 @@ +/* + Copyright (c) 2006, 2007, 2008 Z RESEARCH, Inc. + This file is part of GlusterFS. + + GlusterFS is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published + by the Free Software Foundation; either version 3 of the License, + or (at your option) any later version. + + GlusterFS is distributed in the hope that it will be useful, but + WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program. If not, see + . +*/ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include + +#include +#include + +#ifndef _CONFIG_H +#define _CONFIG_H +#include "config.h" +#endif + +#ifdef HAVE_MALLOC_H +#include +#endif + +#ifdef HAVE_MALLOC_STATS +#ifdef DEBUG +#include +#endif +#endif + +#include "xlator.h" +#include "glusterfs.h" +#include "compat.h" +#include "logging.h" +#include "dict.h" +#include "protocol.h" +#include "list.h" +#include "timer.h" +#include "glusterfsd.h" +#include "stack.h" +#include "revision.h" +#include "common-utils.h" +#include "event.h" + +#include + +/* using argp for command line parsing */ +static char gf_doc[] = ""; +static char argp_doc[] = "--volfile-server=SERVER [MOUNT-POINT]\n" \ + "--volfile=VOLFILE [MOUNT-POINT]"; +const char *argp_program_version = "" \ + PACKAGE_NAME" "PACKAGE_VERSION" built on "__DATE__" "__TIME__ \ + "\nRepository revision: " GLUSTERFS_REPOSITORY_REVISION "\n" \ + "Copyright (c) 2006, 2007, 2008 Z RESEARCH Inc. 
" \ + "\n" \ + "GlusterFS comes with ABSOLUTELY NO WARRANTY.\n" \ + "You may redistribute copies of GlusterFS under the terms of "\ + "the GNU General Public License."; +const char *argp_program_bug_address = "<" PACKAGE_BUGREPORT ">"; + +error_t parse_opts (int32_t key, char *arg, struct argp_state *_state); + +static struct argp_option gf_options[] = { + {0, 0, 0, 0, "Basic options:"}, + {"volfile-server", ARGP_VOLFILE_SERVER_KEY, "SERVER", 0, + "Server to get the volume file from. This option overrides " + "--volfile option"}, + {"volfile", ARGP_VOLUME_FILE_KEY, "VOLFILE", 0, + "File to use as VOLUME_FILE [default: "DEFAULT_CLIENT_VOLUME_FILE" or " + DEFAULT_SERVER_VOLUME_FILE"]"}, + {"spec-file", ARGP_VOLUME_FILE_KEY, "VOLFILE", OPTION_HIDDEN, + "File to use as VOLFILE [default : "DEFAULT_CLIENT_VOLUME_FILE" or " + DEFAULT_SERVER_VOLUME_FILE"]"}, + {"log-level", ARGP_LOG_LEVEL_KEY, "LOGLEVEL", 0, + "Logging severity. Valid options are DEBUG, NORMAL, WARNING, ERROR, " + "CRITICAL and NONE [default: NORMAL]"}, + {"log-file", ARGP_LOG_FILE_KEY, "LOGFILE", 0, + "File to use for logging [default: " + DEFAULT_LOG_FILE_DIRECTORY "/" PACKAGE_NAME ".log" "]"}, + + {0, 0, 0, 0, "Advanced Options:"}, + {"volfile-server-port", ARGP_VOLFILE_SERVER_PORT_KEY, "PORT", 0, + "Listening port number of volfile server"}, + {"volfile-server-transport", ARGP_VOLFILE_SERVER_TRANSPORT_KEY, + "TRANSPORT", 0, + "Transport type to get volfile from server [default: socket]"}, + {"volfile-id", ARGP_VOLFILE_ID_KEY, "KEY", 0, + "'key' of the volfile to be fetched from server"}, + {"pid-file", ARGP_PID_FILE_KEY, "PIDFILE", 0, + "File to use as pid file"}, + {"no-daemon", ARGP_NO_DAEMON_KEY, 0, 0, + "Run in foreground"}, + {"run-id", ARGP_RUN_ID_KEY, "RUN-ID", OPTION_HIDDEN, + "Run ID for the process, used by scripts to keep track of process " + "they started, defaults to none"}, + {"debug", ARGP_DEBUG_KEY, 0, 0, + "Run in debug mode. 
This option sets --no-daemon, --log-level " + "to DEBUG and --log-file to console"}, + {"volume-name", ARGP_VOLUME_NAME_KEY, "VOLUME-NAME", 0, + "Volume name to be used for MOUNT-POINT [default: top most volume " + "in VOLFILE]"}, + {"xlator-option", ARGP_XLATOR_OPTION_KEY,"VOLUME-NAME.OPTION=VALUE", 0, + "Add/override a translator option for a volume with specified value"}, + + {0, 0, 0, 0, "Fuse options:"}, + {"disable-direct-io-mode", ARGP_DISABLE_DIRECT_IO_MODE_KEY, 0, 0, + "Disable direct I/O mode in fuse kernel module"}, + {"entry-timeout", ARGP_ENTRY_TIMEOUT_KEY, "SECONDS", 0, + "Set entry timeout to SECONDS in fuse kernel module [default: 1]"}, + {"attribute-timeout", ARGP_ATTRIBUTE_TIMEOUT_KEY, "SECONDS", 0, + "Set attribute timeout to SECONDS for inodes in fuse kernel module " + "[default: 1]"}, +#ifdef GF_DARWIN_HOST_OS + {"non-local", ARGP_NON_LOCAL_KEY, 0, 0, + "Mount the macfuse volume without '-o local' option"}, +#endif + {0, 0, 0, 0, "Miscellaneous Options:"}, + {0, } +}; + + +static struct argp argp = { gf_options, parse_opts, argp_doc, gf_doc }; + + +static void +_gf_dump_details (int argc, char **argv) +{ + extern FILE *gf_log_logfile; + int i = 0; + char timestr[256]; + time_t utime = 0; + struct tm *tm = NULL; + pid_t mypid = 0; + struct utsname uname_buf = {{0, }, }; + int uname_ret = -1; + + utime = time (NULL); + tm = localtime (&utime); + mypid = getpid (); + uname_ret = uname (&uname_buf); + + /* Which TLA? What time? 
*/ + strftime (timestr, 256, "%Y-%m-%d %H:%M:%S", tm); + fprintf (gf_log_logfile, + "========================================" + "========================================\n"); + fprintf (gf_log_logfile, "Version : %s %s built on %s %s\n", + PACKAGE_NAME, PACKAGE_VERSION, __DATE__, __TIME__); + fprintf (gf_log_logfile, "TLA Revision : %s\n", + GLUSTERFS_REPOSITORY_REVISION); + fprintf (gf_log_logfile, "Starting Time: %s\n", timestr); + fprintf (gf_log_logfile, "Command line : "); + for (i = 0; i < argc; i++) { + fprintf (gf_log_logfile, "%s ", argv[i]); + } + + fprintf (gf_log_logfile, "\nPID : %d\n", mypid); + + if (uname_ret == 0) { + fprintf (gf_log_logfile, "System name : %s\n", uname_buf.sysname); + fprintf (gf_log_logfile, "Nodename : %s\n", uname_buf.nodename); + fprintf (gf_log_logfile, "Kernel Release : %s\n", uname_buf.release); + fprintf (gf_log_logfile, "Hardware Identifier: %s\n", uname_buf.machine); + } + + + fprintf (gf_log_logfile, "\n"); + fflush (gf_log_logfile); +} + + + +static xlator_t * +_add_fuse_mount (xlator_t *graph) +{ + int ret = 0; + cmd_args_t *cmd_args = NULL; + xlator_t *top = NULL; + glusterfs_ctx_t *ctx = NULL; + xlator_list_t *xlchild = NULL; + + ctx = graph->ctx; + cmd_args = &ctx->cmd_args; + + xlchild = CALLOC (sizeof (*xlchild), 1); + ERR_ABORT (xlchild); + xlchild->xlator = graph; + + top = CALLOC (1, sizeof (*top)); + top->name = strdup ("fuse"); + if (xlator_set_type (top, ZR_XLATOR_FUSE) == -1) { + fprintf (stderr, + "MOUNT-POINT %s initialization failed", + cmd_args->mount_point); + gf_log ("glusterfs", GF_LOG_ERROR, + "MOUNT-POINT %s initialization failed", + cmd_args->mount_point); + return NULL; + } + top->children = xlchild; + top->ctx = graph->ctx; + top->next = graph; + top->options = get_new_dict (); + + ret = dict_set_static_ptr (top->options, ZR_MOUNTPOINT_OPT, + cmd_args->mount_point); + if (ret < 0) { + gf_log ("glusterfs", GF_LOG_DEBUG, + "failed to set mount-point to options dictionary"); + } + + if 
(cmd_args->fuse_attribute_timeout) + ret = dict_set_uint32 (top->options, ZR_ATTR_TIMEOUT_OPT, + cmd_args->fuse_attribute_timeout); + if (cmd_args->fuse_entry_timeout) + ret = dict_set_uint32 (top->options, ZR_ENTRY_TIMEOUT_OPT, + cmd_args->fuse_entry_timeout); + +#ifdef GF_DARWIN_HOST_OS + /* On Darwin machines, O_APPEND is not handled, + * which may corrupt the data + */ + if (cmd_args->fuse_direct_io_mode_flag == _gf_true) { + gf_log ("glusterfs", GF_LOG_DEBUG, + "'direct-io-mode' in fuse causes data corruption " + "if O_APPEND is used. disabling 'direct-io-mode'"); + } + ret = dict_set_static_ptr (top->options, ZR_DIRECT_IO_OPT, "disable"); + + if (cmd_args->non_local) + ret = dict_set_uint32 (top->options, "macfuse-local", + cmd_args->non_local); + +#else /* ! DARWIN HOST OS */ + if (cmd_args->fuse_direct_io_mode_flag == _gf_true) { + ret = dict_set_static_ptr (top->options, ZR_DIRECT_IO_OPT, + "enable"); + } else { + ret = dict_set_static_ptr (top->options, ZR_DIRECT_IO_OPT, + "disable"); + } + +#endif /* GF_DARWIN_HOST_OS */ + + graph->parents = CALLOC (1, sizeof (xlator_list_t)); + graph->parents->xlator = top; + + return top; +} + + +static FILE * +_get_specfp (glusterfs_ctx_t *ctx) +{ + int ret = 0; + cmd_args_t *cmd_args = NULL; + FILE *specfp = NULL; + struct stat statbuf; + + cmd_args = &ctx->cmd_args; + + if (cmd_args->volfile_server) { + specfp = fetch_spec (ctx); + + if (specfp == NULL) { + fprintf (stderr, + "error while getting volume file from " + "server %s\n", cmd_args->volfile_server); + gf_log ("glusterfs", GF_LOG_ERROR, + "error while getting volume file from " + "server %s", cmd_args->volfile_server); + } + else { + gf_log ("glusterfs", GF_LOG_DEBUG, + "loading volume file from server %s", + cmd_args->volfile_server); + } + + return specfp; + } + + ret = stat (cmd_args->volume_file, &statbuf); + if (ret == -1) { + fprintf (stderr, "%s: %s\n", + cmd_args->volume_file, strerror (errno)); + gf_log ("glusterfs", GF_LOG_ERROR, + "%s: %s", 
cmd_args->volume_file, strerror (errno)); + return NULL; + } + if (!(S_ISREG (statbuf.st_mode) || S_ISLNK (statbuf.st_mode))) { + fprintf (stderr, + "provide a valid volume file\n"); + gf_log ("glusterfs", GF_LOG_ERROR, + "provide a valid volume file"); + return NULL; + } + if ((specfp = fopen (cmd_args->volume_file, "r")) == NULL) { + fprintf (stderr, "volume file %s: %s\n", + cmd_args->volume_file, + strerror (errno)); + gf_log ("glusterfs", GF_LOG_ERROR, + "volume file %s: %s", + cmd_args->volume_file, + strerror (errno)); + return NULL; + } + + gf_log ("glusterfs", GF_LOG_DEBUG, + "loading volume file %s", cmd_args->volume_file); + + return specfp; +} + +static xlator_t * +_parse_specfp (glusterfs_ctx_t *ctx, + FILE *specfp) +{ + cmd_args_t *cmd_args = NULL; + xlator_t *tree = NULL, *trav = NULL, *new_tree = NULL; + + cmd_args = &ctx->cmd_args; + + fseek (specfp, 0L, SEEK_SET); + + tree = file_to_xlator_tree (ctx, specfp); + trav = tree; + + if (tree == NULL) { + if (cmd_args->volfile_server) { + fprintf (stderr, + "error in parsing volume file given by " + "server %s\n", cmd_args->volfile_server); + gf_log ("glusterfs", GF_LOG_ERROR, + "error in parsing volume file given by " + "server %s", cmd_args->volfile_server); + } + else { + fprintf (stderr, + "error in parsing volume file %s\n", + cmd_args->volume_file); + gf_log ("glusterfs", GF_LOG_ERROR, + "error in parsing volume file %s", + cmd_args->volume_file); + } + return NULL; + } + + /* if volume_name is given, then we attach to it */ + if (cmd_args->volume_name) { + while (trav) { + if (strcmp (trav->name, cmd_args->volume_name) == 0) { + new_tree = trav; + break; + } + trav = trav->next; + } + + if (!trav) { + if (cmd_args->volfile_server) { + fprintf (stderr, + "volume %s not found in volume " + "file given by server %s\n", + cmd_args->volume_name, + cmd_args->volfile_server); + gf_log ("glusterfs", GF_LOG_ERROR, + "volume %s not found in volume " + "file given by server %s", + cmd_args->volume_name, + 
cmd_args->volfile_server); + } else { + fprintf (stderr, + "volume %s not found in volume " + "file %s\n", + cmd_args->volume_name, + cmd_args->volume_file); + gf_log ("glusterfs", GF_LOG_ERROR, + "volume %s not found in volume " + "file %s", cmd_args->volume_name, + cmd_args->volume_file); + } + return NULL; + } + tree = trav; + } + return tree; +} + +static int +_log_if_option_is_invalid (xlator_t *xl, data_pair_t *pair) +{ + volume_opt_list_t *vol_opt = NULL; + volume_option_t *opt = NULL; + int i = 0; + int index = 0; + int found = 0; + + /* Get the first volume_option */ + list_for_each_entry (vol_opt, &xl->volume_options, list) { + /* Warn for extra option */ + if (!vol_opt->given_opt) + break; + + opt = vol_opt->given_opt; + for (index = 0; + ((index < ZR_OPTION_MAX_ARRAY_SIZE) && + (opt[index].key && opt[index].key[0])); index++) + for (i = 0; (i < ZR_VOLUME_MAX_NUM_KEY) && + opt[index].key[i]; i++) { + if (fnmatch (opt[index].key[i], + pair->key, + FNM_NOESCAPE) == 0) { + found = 1; + break; + } + } + } + + if (!found) { + gf_log (xl->name, GF_LOG_WARNING, + "option '%s' is not recognized", + pair->key); + } + return 0; +} + +static int +_xlator_graph_init (xlator_t *xl) +{ + volume_opt_list_t *vol_opt = NULL; + data_pair_t *pair = NULL; + xlator_t *trav = NULL; + int ret = -1; + + trav = xl; + + while (trav->prev) + trav = trav->prev; + + /* Validate phase */ + while (trav) { + /* Get the first volume_option */ + list_for_each_entry (vol_opt, + &trav->volume_options, list) + break; + if ((ret = + validate_xlator_volume_options (trav, + vol_opt->given_opt)) < 0) { + gf_log (trav->name, GF_LOG_ERROR, + "validating translator failed"); + return ret; + } + trav = trav->next; + } + + + trav = xl; + while (trav->prev) + trav = trav->prev; + /* Initialization phase */ + while (trav) { + if (!trav->ready) { + if ((ret = xlator_tree_init (trav)) < 0) { + gf_log ("glusterfs", GF_LOG_ERROR, + "initializing translator failed"); + return ret; + } + } + trav = 
trav->next; + } + + /* No error in this phase, just bunch of warning if at all */ + trav = xl; + + while (trav->prev) + trav = trav->prev; + + /* Validate again phase */ + while (trav) { + pair = trav->options->members_list; + while (pair) { + _log_if_option_is_invalid (trav, pair); + pair = pair->next; + } + trav = trav->next; + } + + return ret; +} + +int +glusterfs_graph_init (xlator_t *graph, int fuse) +{ + volume_opt_list_t *vol_opt = NULL; + + if (fuse) { + /* FUSE needs to be initialized earlier than the + other translators */ + list_for_each_entry (vol_opt, + &graph->volume_options, list) + break; + if (validate_xlator_volume_options (graph, + vol_opt->given_opt) == -1) { + gf_log (graph->name, GF_LOG_ERROR, + "validating translator failed"); + return -1; + } + if (graph->init (graph) != 0) + return -1; + + graph->ready = 1; + } + if (_xlator_graph_init (graph) == -1) + return -1; + + /* check server or fuse is given */ + if (graph->ctx->top == NULL) { + fprintf (stderr, "no valid translator loaded at the top, or" + "no mount point given. exiting\n"); + gf_log ("glusterfs", GF_LOG_ERROR, + "no valid translator loaded at the top or " + "no mount point given. 
exiting"); + return -1; + } + + return 0; +} + +static int +gf_remember_xlator_option (struct list_head *options, char *arg) +{ + glusterfs_ctx_t *ctx = NULL; + cmd_args_t *cmd_args = NULL; + xlator_cmdline_option_t *option = NULL; + int ret = -1; + char *dot = NULL; + char *equals = NULL; + + ctx = get_global_ctx_ptr (); + cmd_args = &ctx->cmd_args; + + option = CALLOC (1, sizeof (xlator_cmdline_option_t)); + INIT_LIST_HEAD (&option->cmd_args); + + dot = strchr (arg, '.'); + if (!dot) + goto out; + + option->volume = CALLOC ((dot - arg) + 1, sizeof (char)); /* one extra zeroed byte keeps the copy NUL-terminated */ + strncpy (option->volume, arg, (dot - arg)); + + equals = strchr (dot, '='); /* only an '=' after the '.' separates key from value */ + if (!equals) + goto out; + + option->key = CALLOC ((equals - dot), sizeof (char)); + strncpy (option->key, dot + 1, (equals - dot - 1)); + + if (!*(equals + 1)) + goto out; + + option->value = strdup (equals + 1); + + list_add (&option->cmd_args, &cmd_args->xlator_options); + + ret = 0; +out: + if (ret == -1) { + if (option) { + if (option->volume) + FREE (option->volume); + if (option->key) + FREE (option->key); + if (option->value) + FREE (option->value); + + FREE (option); + } + } + + return ret; +} + + +static void +gf_add_cmdline_options (xlator_t *graph, cmd_args_t *cmd_args) +{ + int ret = 0; + xlator_t *trav = graph; + xlator_cmdline_option_t *cmd_option = NULL; + + while (trav) { + list_for_each_entry (cmd_option, + &cmd_args->xlator_options, cmd_args) { + if (!fnmatch (cmd_option->volume, + trav->name, FNM_NOESCAPE)) { + ret = dict_set_str (trav->options, + cmd_option->key, + cmd_option->value); + if (ret == 0) { + gf_log ("glusterfs", GF_LOG_WARNING, + "adding option '%s' for " + "volume '%s' with value '%s'", + cmd_option->key, trav->name, + cmd_option->value); + } else { + gf_log ("glusterfs", GF_LOG_WARNING, + "adding option '%s' for " + "volume '%s' failed: %s", + cmd_option->key, trav->name, + strerror (-ret)); + } + } + } + trav = trav->next; + } +} + + +error_t +parse_opts (int key, char *arg, struct argp_state *state) 
+{ + cmd_args_t *cmd_args = NULL; + uint32_t n = 0; + + cmd_args = state->input; + + switch (key) { + case ARGP_VOLFILE_SERVER_KEY: + cmd_args->volfile_server = strdup (arg); + break; + + case ARGP_VOLUME_FILE_KEY: + cmd_args->volume_file = strdup (arg); + break; + + case ARGP_LOG_LEVEL_KEY: + if (strcasecmp (arg, ARGP_LOG_LEVEL_NONE_OPTION) == 0) { + cmd_args->log_level = GF_LOG_NONE; + break; + } + if (strcasecmp (arg, ARGP_LOG_LEVEL_TRACE_OPTION) == 0) { + cmd_args->log_level = GF_LOG_TRACE; + break; + } + if (strcasecmp (arg, ARGP_LOG_LEVEL_CRITICAL_OPTION) == 0) { + cmd_args->log_level = GF_LOG_CRITICAL; + break; + } + if (strcasecmp (arg, ARGP_LOG_LEVEL_ERROR_OPTION) == 0) { + cmd_args->log_level = GF_LOG_ERROR; + break; + } + if (strcasecmp (arg, ARGP_LOG_LEVEL_WARNING_OPTION) == 0) { + cmd_args->log_level = GF_LOG_WARNING; + break; + } + if (strcasecmp (arg, ARGP_LOG_LEVEL_NORMAL_OPTION) == 0) { + cmd_args->log_level = GF_LOG_NORMAL; + break; + } + if (strcasecmp (arg, ARGP_LOG_LEVEL_DEBUG_OPTION) == 0) { + cmd_args->log_level = GF_LOG_DEBUG; + break; + } + + argp_failure (state, -1, 0, "unknown log level %s", arg); + break; + + case ARGP_LOG_FILE_KEY: + cmd_args->log_file = strdup (arg); + break; + + case ARGP_VOLFILE_SERVER_PORT_KEY: + n = 0; + + if (gf_string2uint_base10 (arg, &n) == 0) { + cmd_args->volfile_server_port = n; + break; + } + + argp_failure (state, -1, 0, + "unknown volfile server port %s", arg); + break; + + case ARGP_VOLFILE_SERVER_TRANSPORT_KEY: + cmd_args->volfile_server_transport = strdup (arg); + break; + + case ARGP_VOLFILE_ID_KEY: + cmd_args->volfile_id = strdup (arg); + break; + + case ARGP_PID_FILE_KEY: + cmd_args->pid_file = strdup (arg); + break; + + case ARGP_NO_DAEMON_KEY: + cmd_args->no_daemon_mode = ENABLE_NO_DAEMON_MODE; + break; + + case ARGP_RUN_ID_KEY: + cmd_args->run_id = strdup (arg); + break; + + case ARGP_DEBUG_KEY: + cmd_args->debug_mode = ENABLE_DEBUG_MODE; + break; + + case ARGP_DISABLE_DIRECT_IO_MODE_KEY: + 
cmd_args->fuse_direct_io_mode_flag = _gf_false; + break; + + case ARGP_ENTRY_TIMEOUT_KEY: + n = 0; + + if (gf_string2uint_base10 (arg, &n) == 0) { + cmd_args->fuse_entry_timeout = n; + break; + } + + argp_failure (state, -1, 0, "unknown entry timeout %s", arg); + break; + + case ARGP_ATTRIBUTE_TIMEOUT_KEY: + n = 0; + + if (gf_string2uint_base10 (arg, &n) == 0) { + cmd_args->fuse_attribute_timeout = n; + break; + } + + argp_failure (state, -1, 0, + "unknown attribute timeout %s", arg); + break; + + case ARGP_VOLUME_NAME_KEY: + cmd_args->volume_name = strdup (arg); + break; + + case ARGP_XLATOR_OPTION_KEY: + gf_remember_xlator_option (&cmd_args->xlator_options, arg); + break; + +#ifdef GF_DARWIN_HOST_OS + case ARGP_NON_LOCAL_KEY: + cmd_args->non_local = _gf_true; + break; + +#endif /* DARWIN */ + + case ARGP_KEY_NO_ARGS: + break; + + case ARGP_KEY_ARG: + if (state->arg_num >= 1) + argp_usage (state); + + cmd_args->mount_point = strdup (arg); + break; + } + + return 0; +} + + +void +cleanup_and_exit (int signum) +{ + glusterfs_ctx_t *ctx = NULL; + xlator_t *trav = NULL; + + ctx = get_global_ctx_ptr (); + + gf_log ("glusterfs", GF_LOG_WARNING, "shutting down"); + + if (ctx->pidfp) { + gf_unlockfd (fileno (ctx->pidfp)); + fclose (ctx->pidfp); + ctx->pidfp = NULL; + } + + if (ctx->specfp) { + fclose (ctx->specfp); + ctx->specfp = NULL; + } + + if (ctx->cmd_args.pid_file) { + unlink (ctx->cmd_args.pid_file); + ctx->cmd_args.pid_file = NULL; + } + + if (ctx->graph) { + trav = ctx->graph; + ctx->graph = NULL; + while (trav) { + trav->fini (trav); + trav = trav->next; + } + exit (0); + } else { + gf_log ("glusterfs", GF_LOG_DEBUG, "no graph present"); + } +} + + +static char * +zr_build_process_uuid () +{ + char tmp_str[1024] = {0,}; + char hostname[256] = {0,}; + struct timeval tv = {0,}; + struct tm now = {0, }; + char now_str[32]; + + if (-1 == gettimeofday(&tv, NULL)) { + gf_log ("", GF_LOG_ERROR, + "gettimeofday: failed %s", + strerror (errno)); + } + + if (-1 == 
gethostname (hostname, 256)) { + gf_log ("", GF_LOG_ERROR, + "gethostname: failed %s", + strerror (errno)); + } + + localtime_r (&tv.tv_sec, &now); + strftime (now_str, 32, "%Y/%m/%d-%H:%M:%S", &now); + snprintf (tmp_str, 1024, "%s-%d-%s:%ld", + hostname, getpid(), now_str, tv.tv_usec); + + return strdup (tmp_str); +} + +#define GF_SERVER_PROCESS 0 +#define GF_CLIENT_PROCESS 1 + +static uint8_t +gf_get_process_mode (char *exec_name) +{ + char *dup_execname = NULL, *base = NULL; + uint8_t ret = 0; + + dup_execname = strdup (exec_name); + base = basename (dup_execname); + + if (!strncmp (base, "glusterfsd", 10)) { + ret = GF_SERVER_PROCESS; + } else { + ret = GF_CLIENT_PROCESS; + } + + free (dup_execname); + + return ret; +} + +int +main (int argc, char *argv[]) +{ + glusterfs_ctx_t *ctx = NULL; + cmd_args_t *cmd_args = NULL; + call_pool_t *pool = NULL; + struct stat stbuf; + char tmp_logfile[1024] = { 0 }; + char timestr[256] = { 0 }; + char *base_exec_name = NULL; + time_t utime; + struct tm *tm = NULL; + int ret = 0; + struct rlimit lim; + FILE *specfp = NULL; + xlator_t *graph = NULL; + xlator_t *trav = NULL; + int fuse_volume_found = 0; + int xl_count = 0; + uint8_t process_mode = 0; + + utime = time (NULL); + ctx = CALLOC (1, sizeof (glusterfs_ctx_t)); + ERR_ABORT (ctx); + base_exec_name = strdup (argv[0]); + process_mode = gf_get_process_mode (base_exec_name); + set_global_ctx_ptr (ctx); + ctx->process_uuid = zr_build_process_uuid (); + cmd_args = &ctx->cmd_args; + + /* parsing command line arguments */ + cmd_args->log_level = DEFAULT_LOG_LEVEL; + cmd_args->fuse_direct_io_mode_flag = _gf_true; + + INIT_LIST_HEAD (&cmd_args->xlator_options); + + argp_parse (&argp, argc, argv, ARGP_IN_ORDER, NULL, cmd_args); + + if (ENABLE_DEBUG_MODE == cmd_args->debug_mode) { + cmd_args->log_level = GF_LOG_DEBUG; + cmd_args->log_file = "/dev/stdout"; + cmd_args->no_daemon_mode = ENABLE_NO_DAEMON_MODE; + } + + if ((cmd_args->volfile_server == NULL) + && (cmd_args->volume_file == 
NULL)) { + if (process_mode == GF_SERVER_PROCESS) + cmd_args->volume_file = strdup (DEFAULT_SERVER_VOLUME_FILE); + else + cmd_args->volume_file = strdup (DEFAULT_CLIENT_VOLUME_FILE); + } + + if (cmd_args->log_file == NULL) + asprintf (&cmd_args->log_file, + DEFAULT_LOG_FILE_DIRECTORY "/%s.log", + basename (base_exec_name)); + + free (base_exec_name); + + ctx->event_pool = event_pool_new (DEFAULT_EVENT_POOL_SIZE); + pthread_mutex_init (&(ctx->lock), NULL); + pool = ctx->pool = CALLOC (1, sizeof (call_pool_t)); + ERR_ABORT (ctx->pool); + LOCK_INIT (&pool->lock); + INIT_LIST_HEAD (&pool->all_frames); + + if (cmd_args->pid_file != NULL) { + ctx->pidfp = fopen (cmd_args->pid_file, "a+"); + if (ctx->pidfp == NULL) { + fprintf (stderr, + "unable to open pid file %s. %s. exiting\n", + cmd_args->pid_file, strerror (errno)); + /* do cleanup and exit ?! */ + return -1; + } + ret = gf_lockfd (fileno (ctx->pidfp)); + if (ret == -1) { + fprintf (stderr, "unable to lock pid file %s. %s. " + "Is another instance of %s running?!\n" + "exiting\n", cmd_args->pid_file, + strerror (errno), argv[0]); + fclose (ctx->pidfp); + return -1; + } + ret = ftruncate (fileno (ctx->pidfp), 0); + if (ret == -1) { + fprintf (stderr, + "unable to truncate file %s. %s. 
exiting\n", + cmd_args->pid_file, strerror (errno)); + gf_unlockfd (fileno (ctx->pidfp)); + fclose (ctx->pidfp); + return -1; + } + } + + /* initializing logs */ + if (cmd_args->run_id) { + ret = stat (cmd_args->log_file, &stbuf); + /* If its /dev/null, or /dev/stdout, /dev/stderr, + * let it use the same, no need to alter + */ + if (((ret == 0) && + (S_ISREG (stbuf.st_mode) || S_ISLNK (stbuf.st_mode))) || + (ret == -1)) { + /* Have seperate logfile per run */ + tm = localtime (&utime); + strftime (timestr, 256, "%Y%m%d.%H%M%S", tm); + sprintf (tmp_logfile, "%s.%s.%d", + cmd_args->log_file, timestr, getpid ()); + + /* Create symlink to actual log file */ + unlink (cmd_args->log_file); + symlink (tmp_logfile, cmd_args->log_file); + + FREE (cmd_args->log_file); + cmd_args->log_file = strdup (tmp_logfile); + } + } + + gf_global_variable_init (); + + if (gf_log_init (cmd_args->log_file) == -1) { + fprintf (stderr, + "failed to open logfile %s. exiting\n", + cmd_args->log_file); + return -1; + } + gf_log_set_loglevel (cmd_args->log_level); + + /* setting up environment */ + lim.rlim_cur = RLIM_INFINITY; + lim.rlim_max = RLIM_INFINITY; + if (setrlimit (RLIMIT_CORE, &lim) == -1) { + fprintf (stderr, "ignoring %s\n", + strerror (errno)); + } +#ifdef HAVE_MALLOC_STATS +#ifdef DEBUG + mtrace (); +#endif + signal (SIGUSR1, (sighandler_t) malloc_stats); +#endif + signal (SIGSEGV, gf_print_trace); + signal (SIGABRT, gf_print_trace); + signal (SIGPIPE, SIG_IGN); + signal (SIGHUP, gf_log_logrotate); + signal (SIGTERM, cleanup_and_exit); + /* This is used to dump details */ + /* signal (SIGUSR2, (sighandler_t) glusterfs_stats); */ + + /* getting and parsing volume file */ + if ((specfp = _get_specfp (ctx)) == NULL) { + /* _get_specfp() prints necessary error message */ + gf_log ("glusterfs", GF_LOG_ERROR, "exiting\n"); + argp_help (&argp, stderr, ARGP_HELP_SEE, (char *) argv[0]); + return -1; + } + _gf_dump_details (argc, argv); + gf_log_volume_file (specfp); + if ((graph = 
_parse_specfp (ctx, specfp)) == NULL) { + /* _parse_specfp() prints necessary error message */ + fprintf (stderr, "exiting\n"); + gf_log ("glusterfs", GF_LOG_ERROR, "exiting"); + return -1; + } + ctx->specfp = specfp; + + /* check whether MOUNT-POINT argument and fuse volume are given + * at same time or not. If not, add argument MOUNT-POINT to graph + * as top volume if given + */ + trav = graph; + fuse_volume_found = 0; + + while (trav) { + if (strcmp (trav->type, ZR_XLATOR_FUSE) == 0) { + if (dict_get (trav->options, + ZR_MOUNTPOINT_OPT) != NULL) { + trav->ctx = graph->ctx; + fuse_volume_found = 1; + } + } + + xl_count++; /* Getting this value right is very important */ + trav = trav->next; + } + + ctx->xl_count = xl_count + 1; + + if (!fuse_volume_found && (cmd_args->mount_point != NULL)) { + if ((graph = _add_fuse_mount (graph)) == NULL) { + /* _add_fuse_mount() prints necessary + * error message + */ + fprintf (stderr, "exiting\n"); + gf_log ("glusterfs", GF_LOG_ERROR, "exiting"); + return -1; + } + } + + /* daemonize now */ + if (!cmd_args->no_daemon_mode) { + if (daemon (0, 0) == -1) { + fprintf (stderr, "unable to run in daemon mode: %s", + strerror (errno)); + gf_log ("glusterfs", GF_LOG_ERROR, + "unable to run in daemon mode: %s", + strerror (errno)); + return -1; + } + + /* we are daemon now */ + /* update pid file, if given */ + if (cmd_args->pid_file != NULL) { + fprintf (ctx->pidfp, "%d\n", getpid ()); + fflush (ctx->pidfp); + /* we close pid file on exit */ + } + } + + gf_log ("glusterfs", GF_LOG_DEBUG, + "running in pid %d", getpid ()); + + gf_timer_registry_init (ctx); + + /* override xlator options with command line options + * where applicable + */ + gf_add_cmdline_options (graph, cmd_args); + + ctx->graph = graph; + if (glusterfs_graph_init (graph, fuse_volume_found) != 0) { + gf_log ("glusterfs", GF_LOG_ERROR, + "translator initialization failed. 
exiting"); + return -1; + } + + /* Send PARENT_UP notify to all the translators now */ + graph->notify (graph, GF_EVENT_PARENT_UP, ctx->graph); + + gf_log ("glusterfs", GF_LOG_NORMAL, "Successfully started"); + + event_dispatch (ctx->event_pool); + + return 0; +} diff --git a/glusterfsd/src/glusterfsd.h b/glusterfsd/src/glusterfsd.h new file mode 100644 index 000000000..69ad6b07a --- /dev/null +++ b/glusterfsd/src/glusterfsd.h @@ -0,0 +1,78 @@ +/* + Copyright (c) 2006, 2007, 2008 Z RESEARCH, Inc. + This file is part of GlusterFS. + + GlusterFS is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published + by the Free Software Foundation; either version 3 of the License, + or (at your option) any later version. + + GlusterFS is distributed in the hope that it will be useful, but + WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program. If not, see + . 
+*/ + +#ifndef __GLUSTERFSD_H__ +#define __GLUSTERFSD_H__ + +#ifndef _CONFIG_H +#define _CONFIG_H +#include "config.h" +#endif + +#define DEFAULT_CLIENT_VOLUME_FILE CONFDIR "/glusterfs.vol" +#define DEFAULT_SERVER_VOLUME_FILE CONFDIR "/glusterfsd.vol" +#define DEFAULT_LOG_FILE_DIRECTORY DATADIR "/log/glusterfs" +#define DEFAULT_LOG_LEVEL GF_LOG_NORMAL + +#define DEFAULT_EVENT_POOL_SIZE 16384 + +#define ARGP_LOG_LEVEL_NONE_OPTION "NONE" +#define ARGP_LOG_LEVEL_TRACE_OPTION "TRACE" +#define ARGP_LOG_LEVEL_CRITICAL_OPTION "CRITICAL" +#define ARGP_LOG_LEVEL_ERROR_OPTION "ERROR" +#define ARGP_LOG_LEVEL_WARNING_OPTION "WARNING" +#define ARGP_LOG_LEVEL_NORMAL_OPTION "NORMAL" +#define ARGP_LOG_LEVEL_DEBUG_OPTION "DEBUG" + +#define ENABLE_NO_DAEMON_MODE 1 +#define ENABLE_DEBUG_MODE 1 + +#define ZR_XLATOR_FUSE "mount/fuse" +#define ZR_MOUNTPOINT_OPT "mountpoint" +#define ZR_ATTR_TIMEOUT_OPT "attribute-timeout" +#define ZR_ENTRY_TIMEOUT_OPT "entry-timeout" +#define ZR_DIRECT_IO_OPT "direct-io-mode" + +enum argp_option_keys { + ARGP_VOLFILE_SERVER_KEY = 's', + ARGP_VOLUME_FILE_KEY = 'f', + ARGP_LOG_LEVEL_KEY = 'L', + ARGP_LOG_FILE_KEY = 'l', + ARGP_VOLFILE_SERVER_PORT_KEY = 131, + ARGP_VOLFILE_SERVER_TRANSPORT_KEY = 132, + ARGP_PID_FILE_KEY = 'p', + ARGP_NO_DAEMON_KEY = 'N', + ARGP_RUN_ID_KEY = 'r', + ARGP_DEBUG_KEY = 133, + ARGP_DISABLE_DIRECT_IO_MODE_KEY = 134, + ARGP_ENTRY_TIMEOUT_KEY = 135, + ARGP_ATTRIBUTE_TIMEOUT_KEY = 136, + ARGP_VOLUME_NAME_KEY = 137, + ARGP_XLATOR_OPTION_KEY = 138, +#ifdef GF_DARWIN_HOST_OS + ARGP_NON_LOCAL_KEY = 139, +#endif /* DARWIN */ + ARGP_VOLFILE_ID_KEY = 143, +}; + +/* Moved here from fetch-spec.h */ +FILE *fetch_spec (glusterfs_ctx_t *ctx); + + +#endif /* __GLUSTERFSD_H__ */ diff --git a/libglusterfs/Makefile.am b/libglusterfs/Makefile.am new file mode 100644 index 000000000..d471a3f92 --- /dev/null +++ b/libglusterfs/Makefile.am @@ -0,0 +1,3 @@ +SUBDIRS = src + +CLEANFILES = diff --git a/libglusterfs/src/Makefile.am 
b/libglusterfs/src/Makefile.am new file mode 100644 index 000000000..16e6717de --- /dev/null +++ b/libglusterfs/src/Makefile.am @@ -0,0 +1,21 @@ +libglusterfs_la_CFLAGS = -fPIC -Wall -g -shared -nostartfiles $(GF_CFLAGS) $(GF_DARWIN_LIBGLUSTERFS_CFLAGS) + +libglusterfs_la_CPPFLAGS = -D_FILE_OFFSET_BITS=64 -D__USE_FILE_OFFSET64 -D_GNU_SOURCE -DXLATORDIR=\"$(libdir)/glusterfs/$(PACKAGE_VERSION)/xlator\" -DSCHEDULERDIR=\"$(libdir)/glusterfs/$(PACKAGE_VERSION)/scheduler\" -DTRANSPORTDIR=\"$(libdir)/glusterfs/$(PACKAGE_VERSION)/transport\" -D$(GF_HOST_OS) -DLIBDIR=\"$(libdir)/glusterfs/$(PACKAGE_VERSION)/auth\" + +libglusterfs_la_LIBADD = @LEXLIB@ + +lib_LTLIBRARIES = libglusterfs.la + +libglusterfs_la_SOURCES = dict.c spec.lex.c y.tab.c xlator.c logging.c hashfn.c defaults.c scheduler.c common-utils.c transport.c timer.c inode.c call-stub.c compat.c authenticate.c fd.c compat-errno.c event.c mem-pool.c gf-dirent.c + +noinst_HEADERS = common-utils.h defaults.h dict.h glusterfs.h hashfn.h logging.h protocol.h scheduler.h xlator.h transport.h stack.h timer.h list.h inode.h call-stub.h compat.h authenticate.h fd.h revision.h compat-errno.h event.h mem-pool.h byte-order.h gf-dirent.h locking.h + +EXTRA_DIST = spec.l spec.y + +spec.lex.c: spec.l y.tab.h + $(LEX) -t $(srcdir)/spec.l > $@ + +y.tab.c y.tab.h: spec.y + $(YACC) -d $(srcdir)/spec.y + +CLEANFILES = spec.lex.c y.tab.c y.tab.h diff --git a/libglusterfs/src/authenticate.c b/libglusterfs/src/authenticate.c new file mode 100644 index 000000000..69cc8d99c --- /dev/null +++ b/libglusterfs/src/authenticate.c @@ -0,0 +1,240 @@ +/* + Copyright (c) 2007, 2008 Z RESEARCH, Inc. + This file is part of GlusterFS. + + GlusterFS is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published + by the Free Software Foundation; either version 3 of the License, + or (at your option) any later version. 
+ + GlusterFS is distributed in the hope that it will be useful, but + WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program. If not, see + . +*/ + +#ifndef _CONFIG_H +#define _CONFIG_H +#include "config.h" +#endif + +#ifndef _GNU_SOURCE +#define _GNU_SOURCE +#endif + +#include +#include +#include +#include "authenticate.h" + +static void +init (dict_t *this, + char *key, + data_t *value, + void *data) +{ + void *handle = NULL; + char *auth_file = NULL; + auth_handle_t *auth_handle = NULL; + auth_fn_t authenticate = NULL; + int *error = NULL; + + /* It gets over written */ + error = data; + + if (!strncasecmp (key, "ip", strlen ("ip"))) { + gf_log ("authenticate", GF_LOG_ERROR, + "AUTHENTICATION MODULE \"IP\" HAS BEEN REPLACED " + "BY \"ADDR\""); + dict_set (this, key, data_from_dynptr (NULL, 0)); + /* TODO: 1.3.x backword compatibility */ + // *error = -1; + // return; + key = "addr"; + } + + asprintf (&auth_file, "%s/%s.so", LIBDIR, key); + handle = dlopen (auth_file, RTLD_LAZY); + if (!handle) { + gf_log ("authenticate", GF_LOG_ERROR, "dlopen(%s): %s\n", + auth_file, dlerror ()); + dict_set (this, key, data_from_dynptr (NULL, 0)); + FREE (auth_file); + *error = -1; + return; + } + FREE (auth_file); + + authenticate = dlsym (handle, "gf_auth"); + if (!authenticate) { + gf_log ("authenticate", GF_LOG_ERROR, + "dlsym(gf_auth) on %s\n", dlerror ()); + dict_set (this, key, data_from_dynptr (NULL, 0)); + *error = -1; + return; + } + + auth_handle = CALLOC (1, sizeof (*auth_handle)); + if (!auth_handle) { + gf_log ("authenticate", GF_LOG_ERROR, "Out of memory"); + dict_set (this, key, data_from_dynptr (NULL, 0)); + *error = -1; + return; + } + auth_handle->vol_opt = CALLOC (1, sizeof (volume_opt_list_t)); + auth_handle->vol_opt->given_opt = dlsym (handle, 
"options"); + if (auth_handle->vol_opt->given_opt == NULL) { + gf_log ("authenticate", GF_LOG_DEBUG, + "volume option validation not specified"); + } + + auth_handle->authenticate = authenticate; + auth_handle->handle = handle; + + dict_set (this, key, + data_from_dynptr (auth_handle, sizeof (*auth_handle))); +} + +static void +fini (dict_t *this, + char *key, + data_t *value, + void *data) +{ + auth_handle_t *handle = data_to_ptr (value); + if (handle) { + dlclose (handle->handle); + } +} + +int32_t +gf_auth_init (xlator_t *xl, dict_t *auth_modules) +{ + int ret = 0; + auth_handle_t *handle = NULL; + data_pair_t *pair = NULL; + dict_foreach (auth_modules, init, &ret); + if (!ret) { + pair = auth_modules->members_list; + while (pair) { + handle = data_to_ptr (pair->value); + if (handle) { + list_add_tail (&(handle->vol_opt->list), + &(xl->volume_options)); + if (-1 == + validate_xlator_volume_options (xl, + handle->vol_opt->given_opt)) { + gf_log ("authenticate", GF_LOG_ERROR, + "volume option validation " + "failed"); + ret = -1; + } + } + pair = pair->next; + } + } + if (ret) { + gf_log (xl->name, GF_LOG_ERROR, "authentication init failed"); + dict_foreach (auth_modules, fini, &ret); + ret = -1; + } + return ret; +} + +static dict_t *__input_params; +static dict_t *__config_params; + +void +map (dict_t *this, + char *key, + data_t *value, + void *data) +{ + dict_t *res = data; + auth_fn_t authenticate; + auth_handle_t *handle = NULL; + + if (value && (handle = data_to_ptr (value)) && + (authenticate = handle->authenticate)) { + dict_set (res, key, + int_to_data (authenticate (__input_params, + __config_params))); + } else { + dict_set (res, key, int_to_data (AUTH_DONT_CARE)); + } +} + +void +reduce (dict_t *this, + char *key, + data_t *value, + void *data) +{ + int64_t val = 0; + int64_t *res = data; + if (!data) + return; + + val = data_to_int64 (value); + switch (val) + { + case AUTH_ACCEPT: + if (AUTH_DONT_CARE == *res) + *res = AUTH_ACCEPT; + break; + + case 
AUTH_REJECT: + *res = AUTH_REJECT; + break; + + case AUTH_DONT_CARE: + break; + } +} + + +auth_result_t +gf_authenticate (dict_t *input_params, + dict_t *config_params, + dict_t *auth_modules) +{ + dict_t *results = NULL; + int64_t result = AUTH_DONT_CARE; + + results = get_new_dict (); + __input_params = input_params; + __config_params = config_params; + + dict_foreach (auth_modules, map, results); + + dict_foreach (results, reduce, &result); + if (AUTH_DONT_CARE == result) { + data_t *peerinfo_data = dict_get (input_params, "peer-info"); + char *name = NULL; + + if (peerinfo_data) { + peer_info_t *peerinfo = data_to_ptr (peerinfo_data); + name = peerinfo->identifier; + } + + gf_log ("auth", GF_LOG_ERROR, + "no authentication module is interested in " + "accepting remote-client %s", name); + result = AUTH_REJECT; + } + + dict_destroy (results); + return result; +} + +void +gf_auth_fini (dict_t *auth_modules) +{ + int32_t dummy; + + dict_foreach (auth_modules, fini, &dummy); +} diff --git a/libglusterfs/src/authenticate.h b/libglusterfs/src/authenticate.h new file mode 100644 index 000000000..3d9b78527 --- /dev/null +++ b/libglusterfs/src/authenticate.h @@ -0,0 +1,61 @@ +/* + Copyright (c) 2007, 2008 Z RESEARCH, Inc. + This file is part of GlusterFS. + + GlusterFS is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published + by the Free Software Foundation; either version 3 of the License, + or (at your option) any later version. + + GlusterFS is distributed in the hope that it will be useful, but + WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program. If not, see + . 
+*/
+/* Public interface of the pluggable authentication layer. */
+#ifndef _AUTHENTICATE_H
+#define _AUTHENTICATE_H
+
+#ifndef _CONFIG_H
+#define _CONFIG_H
+#include "config.h"
+#endif
+
+#ifndef _GNU_SOURCE
+#define _GNU_SOURCE
+#endif
+/* NOTE(review): the two bare #include lines below lost their <header>
+ * names in this copy of the patch -- restore them from upstream. */
+#include
+#include
+#include "dict.h"
+#include "compat.h"
+#include "list.h"
+#include "transport.h"
+#include "xlator.h"
+/* Verdict returned by one module, and by gf_authenticate() overall. */
+typedef enum {
+        AUTH_ACCEPT,
+        AUTH_REJECT,
+        AUTH_DONT_CARE
+} auth_result_t;
+/* Signature of a module's authenticate entry point. */
+typedef auth_result_t (*auth_fn_t) (dict_t *input_params,
+                                    dict_t *config_params);
+/* Per-module record: `handle` is what gets passed to dlclose(). */
+typedef struct {
+        void              *handle;
+        auth_fn_t          authenticate;
+        volume_opt_list_t *vol_opt;
+} auth_handle_t;
+
+auth_result_t gf_authenticate (dict_t *input_params,
+                               dict_t *config_params,
+                               dict_t *auth_modules);
+int32_t gf_auth_init (xlator_t *xl, dict_t *auth_modules);
+void gf_auth_fini (dict_t *auth_modules);
+
+#endif /* _AUTHENTICATE_H */
diff --git a/libglusterfs/src/byte-order.h b/libglusterfs/src/byte-order.h
new file mode 100644
index 000000000..b0cf90b09
--- /dev/null
+++ b/libglusterfs/src/byte-order.h
@@ -0,0 +1,150 @@
+/*
+   Copyright (c) 2008 Z RESEARCH, Inc.
+   This file is part of GlusterFS.
+
+   GlusterFS is free software; you can redistribute it and/or modify
+   it under the terms of the GNU General Public License as published
+   by the Free Software Foundation; either version 3 of the License,
+   or (at your option) any later version.
+
+   GlusterFS is distributed in the hope that it will be useful, but
+   WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   General Public License for more details.
+
+   You should have received a copy of the GNU General Public License
+   along with this program.  If not, see
+   <http://www.gnu.org/licenses/>.
+*/ + +#ifndef _BYTE_ORDER_H +#define _BYTE_ORDER_H + +#include + +#define LS1 0x00ffU +#define MS1 0xff00U +#define LS2 0x0000ffffU +#define MS2 0xffff0000U +#define LS4 0x00000000ffffffffULL +#define MS4 0xffffffff00000000ULL + + +static uint16_t (*hton16) (uint16_t); +static uint32_t (*hton32) (uint32_t); +static uint64_t (*hton64) (uint64_t); + +#define ntoh16 hton16 +#define ntoh32 hton32 +#define ntoh64 hton64 + +#define do_swap2(x) (((x&LS1) << 8)|(((x&MS1) >> 8))) +#define do_swap4(x) ((do_swap2(x&LS2) << 16)|(do_swap2((x&MS2) >> 16))) +#define do_swap8(x) ((do_swap4(x&LS4) << 32)|(do_swap4((x&MS4) >> 32))) + + +static inline uint16_t +__swap16 (uint16_t x) +{ + return do_swap2(x); +} + + +static inline uint32_t +__swap32 (uint32_t x) +{ + return do_swap4(x); +} + + +static inline uint64_t +__swap64 (uint64_t x) +{ + return do_swap8(x); +} + + +static inline uint16_t +__noswap16 (uint16_t x) +{ + return do_swap2(x); +} + + +static inline uint32_t +__noswap32 (uint32_t x) +{ + return do_swap4(x); +} + + +static inline uint64_t +__noswap64 (uint64_t x) +{ + return do_swap8(x); +} + + +static inline uint16_t +__byte_order_init16 (uint16_t i) +{ + uint32_t num = 1; + + if (((char *)(&num))[0] == 1) { + hton16 = __swap16; + hton32 = __swap32; + hton64 = __swap64; + } else { + hton16 = __noswap16; + hton32 = __noswap32; + hton64 = __noswap64; + } + + return hton16 (i); +} + + +static inline uint32_t +__byte_order_init32 (uint32_t i) +{ + uint32_t num = 1; + + if (((char *)(&num))[0] == 1) { + hton16 = __swap16; + hton32 = __swap32; + hton64 = __swap64; + } else { + hton16 = __noswap16; + hton32 = __noswap32; + hton64 = __noswap64; + } + + return hton32 (i); +} + + +static inline uint64_t +__byte_order_init64 (uint64_t i) +{ + uint32_t num = 1; + + if (((char *)(&num))[0] == 1) { + hton16 = __swap16; + hton32 = __swap32; + hton64 = __swap64; + } else { + hton16 = __noswap16; + hton32 = __noswap32; + hton64 = __noswap64; + } + + return hton64 (i); +} + + +static 
uint16_t (*hton16) (uint16_t) = __byte_order_init16; +static uint32_t (*hton32) (uint32_t) = __byte_order_init32; +static uint64_t (*hton64) (uint64_t) = __byte_order_init64; + + +#endif /* _BYTE_ORDER_H */ diff --git a/libglusterfs/src/call-stub.c b/libglusterfs/src/call-stub.c new file mode 100644 index 000000000..cd7357259 --- /dev/null +++ b/libglusterfs/src/call-stub.c @@ -0,0 +1,3822 @@ +/* + Copyright (c) 2007, 2008 Z RESEARCH, Inc. + This file is part of GlusterFS. + + GlusterFS is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published + by the Free Software Foundation; either version 3 of the License, + or (at your option) any later version. + + GlusterFS is distributed in the hope that it will be useful, but + WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program. If not, see + . 
+*/
+
+#ifndef _CONFIG_H
+#define _CONFIG_H
+#include "config.h"
+#endif
+
+#include "call-stub.h"
+
+/* Constructors in this file come in pairs per file operation:
+ * fop_X_stub records the call arguments (stub_new wind = 1) and
+ * fop_X_cbk_stub records the reply values (wind = 0).  Each returns the
+ * new stub, or NULL when a GF_VALIDATE_OR_GOTO check sends control to
+ * `out` first (presumably on a NULL argument -- macro not shown here).
+ *
+ * stub_new: CALLOC a call_stub_t, store frame/wind/fop, init its list. */
+static call_stub_t *
+stub_new (call_frame_t *frame,
+          char wind,
+          glusterfs_fop_t fop)
+{
+        call_stub_t *new = NULL;
+
+        GF_VALIDATE_OR_GOTO ("call-stub", frame, out);
+
+        new = CALLOC (1, sizeof (*new));
+        GF_VALIDATE_OR_GOTO ("call-stub", new, out);
+
+        new->frame = frame;
+        new->wind = wind;
+        new->fop = fop;
+
+        INIT_LIST_HEAD (&new->list);
+out:
+        return new;
+}
+
+/* lookup call: copies loc, takes a dict ref on xattr_req when given. */
+call_stub_t *
+fop_lookup_stub (call_frame_t *frame,
+                 fop_lookup_t fn,
+                 loc_t *loc,
+                 dict_t *xattr_req)
+{
+        call_stub_t *stub = NULL;
+
+        GF_VALIDATE_OR_GOTO ("call-stub", frame, out);
+        GF_VALIDATE_OR_GOTO ("call-stub", loc, out);
+
+        stub = stub_new (frame, 1, GF_FOP_LOOKUP);
+        GF_VALIDATE_OR_GOTO ("call-stub", stub, out);
+
+        stub->args.lookup.fn = fn;
+
+        if (xattr_req)
+                stub->args.lookup.xattr_req = dict_ref (xattr_req);
+
+        loc_copy (&stub->args.lookup.loc, loc);
+out:
+        return stub;
+}
+
+/* lookup reply: refs inode and dict, copies *buf, each only if non-NULL. */
+call_stub_t *
+fop_lookup_cbk_stub (call_frame_t *frame,
+                     fop_lookup_cbk_t fn,
+                     int32_t op_ret,
+                     int32_t op_errno,
+                     inode_t *inode,
+                     struct stat *buf,
+                     dict_t *dict)
+{
+        call_stub_t *stub = NULL;
+
+        GF_VALIDATE_OR_GOTO ("call-stub", frame, out);
+
+        stub = stub_new (frame, 0, GF_FOP_LOOKUP);
+        GF_VALIDATE_OR_GOTO ("call-stub", stub, out);
+
+        stub->args.lookup_cbk.fn = fn;
+        stub->args.lookup_cbk.op_ret = op_ret;
+        stub->args.lookup_cbk.op_errno = op_errno;
+        if (inode)
+                stub->args.lookup_cbk.inode = inode_ref (inode);
+        if (buf)
+                stub->args.lookup_cbk.buf = *buf;
+        if (dict)
+                stub->args.lookup_cbk.dict = dict_ref (dict);
+out:
+        return stub;
+}
+
+
+/* stat call: copies loc. */
+call_stub_t *
+fop_stat_stub (call_frame_t *frame,
+               fop_stat_t fn,
+               loc_t *loc)
+{
+        call_stub_t *stub = NULL;
+
+        GF_VALIDATE_OR_GOTO ("call-stub", frame, out);
+        GF_VALIDATE_OR_GOTO ("call-stub", loc, out);
+
+        stub = stub_new (frame, 1, GF_FOP_STAT);
+        GF_VALIDATE_OR_GOTO ("call-stub", stub, out);
+
+        stub->args.stat.fn = fn;
+        loc_copy (&stub->args.stat.loc, loc);
+out:
+        return stub;
+}
+
+/* stat reply: *buf is copied only when op_ret == 0. */
+call_stub_t *
+fop_stat_cbk_stub (call_frame_t *frame,
+                   fop_stat_cbk_t fn,
+                   int32_t op_ret,
+                   int32_t op_errno,
+                   struct stat *buf)
+{
+        call_stub_t *stub = NULL;
+
+        GF_VALIDATE_OR_GOTO ("call-stub", frame, out);
+
+        stub = stub_new (frame, 0, GF_FOP_STAT);
+        GF_VALIDATE_OR_GOTO ("call-stub", stub, out);
+
+        stub->args.stat_cbk.fn = fn;
+        stub->args.stat_cbk.op_ret = op_ret;
+        stub->args.stat_cbk.op_errno = op_errno;
+        if (op_ret == 0)
+                stub->args.stat_cbk.buf = *buf;
+out:
+        return stub;
+}
+
+/* fstat call: takes an fd ref when fd is non-NULL. */
+call_stub_t *
+fop_fstat_stub (call_frame_t *frame,
+                fop_fstat_t fn,
+                fd_t *fd)
+{
+        call_stub_t *stub = NULL;
+
+        GF_VALIDATE_OR_GOTO ("call-stub", frame, out);
+
+        stub = stub_new (frame, 1, GF_FOP_FSTAT);
+        GF_VALIDATE_OR_GOTO ("call-stub", stub, out);
+
+        stub->args.fstat.fn = fn;
+
+        if (fd)
+                stub->args.fstat.fd = fd_ref (fd);
+out:
+        return stub;
+}
+
+/* fstat reply: copies *buf when buf is non-NULL. */
+call_stub_t *
+fop_fstat_cbk_stub (call_frame_t *frame,
+                    fop_fstat_cbk_t fn,
+                    int32_t op_ret,
+                    int32_t op_errno,
+                    struct stat *buf)
+{
+        call_stub_t *stub = NULL;
+
+        GF_VALIDATE_OR_GOTO ("call-stub", frame, out);
+
+        stub = stub_new (frame, 0, GF_FOP_FSTAT);
+        GF_VALIDATE_OR_GOTO ("call-stub", stub, out);
+
+        stub->args.fstat_cbk.fn = fn;
+        stub->args.fstat_cbk.op_ret = op_ret;
+        stub->args.fstat_cbk.op_errno = op_errno;
+        if (buf)
+                stub->args.fstat_cbk.buf = *buf;
+out:
+        return stub;
+}
+
+/* chmod call: copies loc, stores mode. */
+call_stub_t *
+fop_chmod_stub (call_frame_t *frame,
+                fop_chmod_t fn,
+                loc_t *loc,
+                mode_t mode)
+{
+        call_stub_t *stub = NULL;
+
+        GF_VALIDATE_OR_GOTO ("call-stub", frame, out);
+        GF_VALIDATE_OR_GOTO ("call-stub", loc, out);
+
+        stub = stub_new (frame, 1, GF_FOP_CHMOD);
+        GF_VALIDATE_OR_GOTO ("call-stub", stub, out);
+
+        stub->args.chmod.fn = fn;
+        loc_copy (&stub->args.chmod.loc, loc);
+        stub->args.chmod.mode = mode;
+out:
+        return stub;
+}
+
+/* chmod reply: copies *buf when buf is non-NULL. */
+call_stub_t *
+fop_chmod_cbk_stub (call_frame_t *frame,
+                    fop_chmod_cbk_t fn,
+                    int32_t op_ret,
+                    int32_t op_errno,
+                    struct stat *buf)
+{
+        call_stub_t *stub = NULL;
+
+        GF_VALIDATE_OR_GOTO ("call-stub", frame, out);
+
+        stub = stub_new (frame, 0, GF_FOP_CHMOD);
+        GF_VALIDATE_OR_GOTO ("call-stub", stub, out);
+
+        stub->args.chmod_cbk.fn = fn;
+        stub->args.chmod_cbk.op_ret = op_ret;
+        stub->args.chmod_cbk.op_errno = op_errno;
+        if (buf)
+                stub->args.chmod_cbk.buf = *buf;
+out:
+        return stub;
+}
+
+/* fchmod call: refs fd when non-NULL, stores mode. */
+call_stub_t *
+fop_fchmod_stub (call_frame_t *frame,
+                 fop_fchmod_t fn,
+                 fd_t *fd,
+                 mode_t mode)
+{
+        call_stub_t *stub = NULL;
+
+        GF_VALIDATE_OR_GOTO ("call-stub", frame, out);
+
+        stub = stub_new (frame, 1, GF_FOP_FCHMOD);
+        GF_VALIDATE_OR_GOTO ("call-stub", stub, out);
+
+        stub->args.fchmod.fn = fn;
+        if (fd)
+                stub->args.fchmod.fd = fd_ref (fd);
+        stub->args.fchmod.mode = mode;
+out:
+        return stub;
+}
+
+/* fchmod reply: copies *buf when buf is non-NULL. */
+call_stub_t *
+fop_fchmod_cbk_stub (call_frame_t *frame,
+                     fop_fchmod_cbk_t fn,
+                     int32_t op_ret,
+                     int32_t op_errno,
+                     struct stat *buf)
+{
+        call_stub_t *stub = NULL;
+
+        GF_VALIDATE_OR_GOTO ("call-stub", frame, out);
+
+        stub = stub_new (frame, 0, GF_FOP_FCHMOD);
+        GF_VALIDATE_OR_GOTO ("call-stub", stub, out);
+
+        stub->args.fchmod_cbk.fn = fn;
+        stub->args.fchmod_cbk.op_ret = op_ret;
+        stub->args.fchmod_cbk.op_errno = op_errno;
+        if (buf)
+                stub->args.fchmod_cbk.buf = *buf;
+out:
+        return stub;
+}
+
+/* chown call: copies loc, stores uid and gid. */
+call_stub_t *
+fop_chown_stub (call_frame_t *frame,
+                fop_chown_t fn,
+                loc_t *loc,
+                uid_t uid,
+                gid_t gid)
+{
+        call_stub_t *stub = NULL;
+
+        GF_VALIDATE_OR_GOTO ("call-stub", frame, out);
+        GF_VALIDATE_OR_GOTO ("call-stub", loc, out);
+
+        stub = stub_new (frame, 1, GF_FOP_CHOWN);
+        GF_VALIDATE_OR_GOTO ("call-stub", stub, out);
+
+        stub->args.chown.fn = fn;
+        loc_copy (&stub->args.chown.loc, loc);
+        stub->args.chown.uid = uid;
+        stub->args.chown.gid = gid;
+out:
+        return stub;
+}
+
+/* chown reply: copies *buf when buf is non-NULL. */
+call_stub_t *
+fop_chown_cbk_stub (call_frame_t *frame,
+                    fop_chown_cbk_t fn,
+                    int32_t op_ret,
+                    int32_t op_errno,
+                    struct stat *buf)
+{
+        call_stub_t *stub = NULL;
+
+        GF_VALIDATE_OR_GOTO ("call-stub", frame, out);
+
+        stub = stub_new (frame, 0, GF_FOP_CHOWN);
+        GF_VALIDATE_OR_GOTO ("call-stub", stub, out);
+
+        stub->args.chown_cbk.fn = fn;
+        stub->args.chown_cbk.op_ret = op_ret;
+        stub->args.chown_cbk.op_errno = op_errno;
+        if (buf)
+                stub->args.chown_cbk.buf = *buf;
+out:
+        return stub;
+}
+
+/* fchown call: refs fd when non-NULL, stores uid and gid. */
+call_stub_t *
+fop_fchown_stub (call_frame_t *frame,
+                 fop_fchown_t fn,
+                 fd_t *fd,
+                 uid_t uid,
+                 gid_t gid)
+{
+        call_stub_t *stub = NULL;
+
+        GF_VALIDATE_OR_GOTO ("call-stub", frame, out);
+
+        stub = stub_new (frame, 1, GF_FOP_FCHOWN);
+        GF_VALIDATE_OR_GOTO ("call-stub", stub, out);
+
+        stub->args.fchown.fn = fn;
+        if (fd)
+                stub->args.fchown.fd = fd_ref (fd);
+        stub->args.fchown.uid = uid;
+        stub->args.fchown.gid = gid;
+out:
+        return stub;
+}
+
+/* fchown reply: copies *buf when buf is non-NULL. */
+call_stub_t *
+fop_fchown_cbk_stub (call_frame_t *frame,
+                     fop_fchown_cbk_t fn,
+                     int32_t op_ret,
+                     int32_t op_errno,
+                     struct stat *buf)
+{
+        call_stub_t *stub = NULL;
+
+        GF_VALIDATE_OR_GOTO ("call-stub", frame, out);
+
+        stub = stub_new (frame, 0, GF_FOP_FCHOWN);
+        GF_VALIDATE_OR_GOTO ("call-stub", stub, out);
+
+        stub->args.fchown_cbk.fn = fn;
+        stub->args.fchown_cbk.op_ret = op_ret;
+        stub->args.fchown_cbk.op_errno = op_errno;
+        if (buf)
+                stub->args.fchown_cbk.buf = *buf;
+out:
+        return stub;
+}
+
+
+/* truncate */
+/* truncate call: copies loc, stores the offset. */
+call_stub_t *
+fop_truncate_stub (call_frame_t *frame,
+                   fop_truncate_t fn,
+                   loc_t *loc,
+                   off_t off)
+{
+        call_stub_t *stub = NULL;
+
+        GF_VALIDATE_OR_GOTO ("call-stub", frame, out);
+        GF_VALIDATE_OR_GOTO ("call-stub", loc, out);
+
+        stub = stub_new (frame, 1, GF_FOP_TRUNCATE);
+        GF_VALIDATE_OR_GOTO ("call-stub", stub, out);
+
+        stub->args.truncate.fn = fn;
+        loc_copy (&stub->args.truncate.loc, loc);
+        stub->args.truncate.off = off;
+out:
+        return stub;
+}
+
+/* truncate reply: copies *buf when buf is non-NULL. */
+call_stub_t *
+fop_truncate_cbk_stub (call_frame_t *frame,
+                       fop_truncate_cbk_t fn,
+                       int32_t op_ret,
+                       int32_t op_errno,
+                       struct stat *buf)
+{
+        call_stub_t *stub = NULL;
+
+        GF_VALIDATE_OR_GOTO ("call-stub", frame, out);
+
+        stub = stub_new (frame, 0, GF_FOP_TRUNCATE);
+        GF_VALIDATE_OR_GOTO ("call-stub", stub, out);
+
+        stub->args.truncate_cbk.fn = fn;
+        stub->args.truncate_cbk.op_ret = op_ret;
+        stub->args.truncate_cbk.op_errno = op_errno;
+        if (buf)
+                stub->args.truncate_cbk.buf = *buf;
+out:
+        return stub;
+}
+
+/* ftruncate call: refs fd when non-NULL, stores the offset. */
+call_stub_t *
+fop_ftruncate_stub (call_frame_t *frame,
+                    fop_ftruncate_t fn,
+                    fd_t *fd,
+                    off_t off)
+{
+        call_stub_t *stub = NULL;
+
+        GF_VALIDATE_OR_GOTO ("call-stub", frame, out);
+
+        stub = stub_new (frame, 1, GF_FOP_FTRUNCATE);
+        GF_VALIDATE_OR_GOTO ("call-stub", stub, out);
+
+        stub->args.ftruncate.fn = fn;
+        if (fd)
+                stub->args.ftruncate.fd = fd_ref (fd);
+
+        stub->args.ftruncate.off = off;
+out:
+        return stub;
+}
+
+/* ftruncate reply: copies *buf when buf is non-NULL. */
+call_stub_t *
+fop_ftruncate_cbk_stub (call_frame_t *frame,
+                        fop_ftruncate_cbk_t fn,
+                        int32_t op_ret,
+                        int32_t op_errno,
+                        struct stat *buf)
+{
+        call_stub_t *stub = NULL;
+
+        GF_VALIDATE_OR_GOTO ("call-stub", frame, out);
+
+        stub = stub_new (frame, 0, GF_FOP_FTRUNCATE);
+        GF_VALIDATE_OR_GOTO ("call-stub", stub, out);
+
+        stub->args.ftruncate_cbk.fn = fn;
+        stub->args.ftruncate_cbk.op_ret = op_ret;
+        stub->args.ftruncate_cbk.op_errno = op_errno;
+        if (buf)
+                stub->args.ftruncate_cbk.buf = *buf;
+out:
+        return stub;
+}
+
+/* utimens call: copies loc and both timespec entries of tv. */
+call_stub_t *
+fop_utimens_stub (call_frame_t *frame,
+                  fop_utimens_t fn,
+                  loc_t *loc,
+                  struct timespec tv[2])
+{
+        call_stub_t *stub = NULL;
+
+        GF_VALIDATE_OR_GOTO ("call-stub", frame, out);
+        GF_VALIDATE_OR_GOTO ("call-stub", loc, out);
+
+        stub = stub_new (frame, 1, GF_FOP_UTIMENS);
+        GF_VALIDATE_OR_GOTO ("call-stub", stub, out);
+
+        stub->args.utimens.fn = fn;
+        loc_copy (&stub->args.utimens.loc, loc);
+        stub->args.utimens.tv[0] = tv[0];
+        stub->args.utimens.tv[1] = tv[1];
+out:
+        return stub;
+}
+
+/* utimens reply: copies *buf when buf is non-NULL. */
+call_stub_t *
+fop_utimens_cbk_stub (call_frame_t *frame,
+                      fop_utimens_cbk_t fn,
+                      int32_t op_ret,
+                      int32_t op_errno,
+                      struct stat *buf)
+{
+        call_stub_t *stub = NULL;
+
+        GF_VALIDATE_OR_GOTO ("call-stub", frame, out);
+
+        stub = stub_new (frame, 0, GF_FOP_UTIMENS);
+        GF_VALIDATE_OR_GOTO ("call-stub", stub, out);
+
+        stub->args.utimens_cbk.fn = fn;
+        stub->args.utimens_cbk.op_ret = op_ret;
+        stub->args.utimens_cbk.op_errno = op_errno;
+        if (buf)
+                stub->args.utimens_cbk.buf = *buf;
+out:
+        return stub;
+}
+
+/* access call: copies loc, stores mask. */
+call_stub_t *
+fop_access_stub (call_frame_t *frame,
+                 fop_access_t fn,
+                 loc_t *loc,
+                 int32_t mask)
+{
+        call_stub_t *stub = NULL;
+
+        GF_VALIDATE_OR_GOTO ("call-stub", frame, out);
+        GF_VALIDATE_OR_GOTO ("call-stub", loc, out);
+
+        stub = stub_new (frame, 1, GF_FOP_ACCESS);
+        GF_VALIDATE_OR_GOTO ("call-stub", stub, out);
+
+        stub->args.access.fn = fn;
+        loc_copy (&stub->args.access.loc, loc);
+        stub->args.access.mask = mask;
+out:
+        return stub;
+}
+
+/* access reply: status only. */
+call_stub_t *
+fop_access_cbk_stub (call_frame_t *frame,
+                     fop_access_cbk_t fn,
+                     int32_t op_ret,
+                     int32_t op_errno)
+{
+        call_stub_t *stub = NULL;
+
+        GF_VALIDATE_OR_GOTO ("call-stub", frame, out);
+
+        stub = stub_new (frame, 0, GF_FOP_ACCESS);
+        GF_VALIDATE_OR_GOTO ("call-stub", stub, out);
+
+        stub->args.access_cbk.fn = fn;
+        stub->args.access_cbk.op_ret = op_ret;
+        stub->args.access_cbk.op_errno = op_errno;
+out:
+        return stub;
+}
+
+/* readlink call: copies loc, stores size. */
+call_stub_t *
+fop_readlink_stub (call_frame_t *frame,
+                   fop_readlink_t fn,
+                   loc_t *loc,
+                   size_t size)
+{
+        call_stub_t *stub = NULL;
+
+        GF_VALIDATE_OR_GOTO ("call-stub", frame, out);
+        GF_VALIDATE_OR_GOTO ("call-stub", loc, out);
+
+        stub = stub_new (frame, 1, GF_FOP_READLINK);
+        GF_VALIDATE_OR_GOTO ("call-stub", stub, out);
+
+        stub->args.readlink.fn = fn;
+        loc_copy (&stub->args.readlink.loc, loc);
+        stub->args.readlink.size = size;
+out:
+        return stub;
+}
+
+/* readlink reply: strdup()s path when non-NULL (result is not checked). */
+call_stub_t *
+fop_readlink_cbk_stub (call_frame_t *frame,
+                       fop_readlink_cbk_t fn,
+                       int32_t op_ret,
+                       int32_t op_errno,
+                       const char *path)
+{
+        call_stub_t *stub = NULL;
+
+        GF_VALIDATE_OR_GOTO ("call-stub", frame, out);
+
+        stub = stub_new (frame, 0, GF_FOP_READLINK);
+        GF_VALIDATE_OR_GOTO ("call-stub", stub, out);
+
+        stub->args.readlink_cbk.fn = fn;
+        stub->args.readlink_cbk.op_ret = op_ret;
+        stub->args.readlink_cbk.op_errno = op_errno;
+        if (path)
+                stub->args.readlink_cbk.buf = strdup (path);
+out:
+        return stub;
+}
+
+/* mknod call: copies loc, stores mode and rdev. */
+call_stub_t *
+fop_mknod_stub (call_frame_t *frame,
+                fop_mknod_t fn,
+                loc_t *loc,
+                mode_t mode,
+                dev_t rdev)
+{
+        call_stub_t *stub = NULL;
+
+        GF_VALIDATE_OR_GOTO ("call-stub", frame, out);
+        GF_VALIDATE_OR_GOTO ("call-stub", loc, out);
+
+        stub = stub_new (frame, 1, GF_FOP_MKNOD);
+        GF_VALIDATE_OR_GOTO ("call-stub", stub, out);
+
+        stub->args.mknod.fn = fn;
+        loc_copy (&stub->args.mknod.loc, loc);
+        stub->args.mknod.mode = mode;
+        stub->args.mknod.rdev = rdev;
+out:
+        return stub;
+}
+
+/* mknod reply: refs inode and copies *buf, each only if non-NULL. */
+call_stub_t *
+fop_mknod_cbk_stub (call_frame_t *frame,
+                    fop_mknod_cbk_t fn,
+                    int32_t op_ret,
+                    int32_t op_errno,
+                    inode_t *inode,
+                    struct stat *buf)
+{
+        call_stub_t *stub = NULL;
+
+        GF_VALIDATE_OR_GOTO ("call-stub", frame, out);
+
+        stub = stub_new (frame, 0, GF_FOP_MKNOD);
+        GF_VALIDATE_OR_GOTO ("call-stub", stub, out);
+
+        stub->args.mknod_cbk.fn = fn;
+        stub->args.mknod_cbk.op_ret = op_ret;
+        stub->args.mknod_cbk.op_errno = op_errno;
+        if (inode)
+                stub->args.mknod_cbk.inode = inode_ref (inode);
+        if (buf)
+                stub->args.mknod_cbk.buf = *buf;
+out:
+        return stub;
+}
+
+/* mkdir call: copies loc, stores mode. */
+call_stub_t *
+fop_mkdir_stub (call_frame_t *frame,
+                fop_mkdir_t fn,
+                loc_t *loc,
+                mode_t mode)
+{
+        call_stub_t *stub = NULL;
+
+        GF_VALIDATE_OR_GOTO ("call-stub", frame, out);
+        GF_VALIDATE_OR_GOTO ("call-stub", loc, out);
+
+        stub = stub_new (frame, 1, GF_FOP_MKDIR);
+        GF_VALIDATE_OR_GOTO ("call-stub", stub, out);
+
+        stub->args.mkdir.fn = fn;
+        loc_copy (&stub->args.mkdir.loc, loc);
+        stub->args.mkdir.mode = mode;
+out:
+        return stub;
+}
+
+/* mkdir reply: refs inode and copies *buf, each only if non-NULL. */
+call_stub_t *
+fop_mkdir_cbk_stub (call_frame_t *frame,
+                    fop_mkdir_cbk_t fn,
+                    int32_t op_ret,
+                    int32_t op_errno,
+                    inode_t *inode,
+                    struct stat *buf)
+{
+        call_stub_t *stub = NULL;
+
+        GF_VALIDATE_OR_GOTO ("call-stub", frame, out);
+
+        stub = stub_new (frame, 0, GF_FOP_MKDIR);
+        GF_VALIDATE_OR_GOTO ("call-stub", stub, out);
+
+        stub->args.mkdir_cbk.fn = fn;
+        stub->args.mkdir_cbk.op_ret = op_ret;
+        stub->args.mkdir_cbk.op_errno = op_errno;
+        if (inode)
+                stub->args.mkdir_cbk.inode = inode_ref (inode);
+        if (buf)
+                stub->args.mkdir_cbk.buf = *buf;
+out:
+        return stub;
+}
+
+/* unlink call: copies loc. */
+call_stub_t *
+fop_unlink_stub (call_frame_t *frame,
+                 fop_unlink_t fn,
+                 loc_t *loc)
+{
+        call_stub_t *stub = NULL;
+
+        GF_VALIDATE_OR_GOTO ("call-stub", frame, out);
+        GF_VALIDATE_OR_GOTO ("call-stub", loc, out);
+
+        stub = stub_new (frame, 1, GF_FOP_UNLINK);
+        GF_VALIDATE_OR_GOTO ("call-stub", stub, out);
+
+        stub->args.unlink.fn = fn;
+        loc_copy (&stub->args.unlink.loc, loc);
+out:
+        return stub;
+}
+
+/* unlink reply: status only. */
+call_stub_t *
+fop_unlink_cbk_stub (call_frame_t *frame,
+                     fop_unlink_cbk_t fn,
+                     int32_t op_ret,
+                     int32_t op_errno)
+{
+        call_stub_t *stub = NULL;
+
+        GF_VALIDATE_OR_GOTO ("call-stub", frame, out);
+
+        stub = stub_new (frame, 0, GF_FOP_UNLINK);
+        GF_VALIDATE_OR_GOTO ("call-stub", stub, out);
+
+        stub->args.unlink_cbk.fn = fn;
+        stub->args.unlink_cbk.op_ret = op_ret;
+        stub->args.unlink_cbk.op_errno = op_errno;
+out:
+        return stub;
+}
+
+
+/* rmdir call: copies loc. */
+call_stub_t *
+fop_rmdir_stub (call_frame_t *frame,
+                fop_rmdir_t fn,
+                loc_t *loc)
+{
+        call_stub_t *stub = NULL;
+
+        GF_VALIDATE_OR_GOTO ("call-stub", frame, out);
+        GF_VALIDATE_OR_GOTO ("call-stub", loc, out);
+
+        stub = stub_new (frame, 1, GF_FOP_RMDIR);
+        GF_VALIDATE_OR_GOTO ("call-stub", stub, out);
+
+        stub->args.rmdir.fn = fn;
+        loc_copy (&stub->args.rmdir.loc, loc);
+out:
+        return stub;
+}
+
+/* rmdir reply: status only. */
+call_stub_t *
+fop_rmdir_cbk_stub (call_frame_t *frame,
+                    fop_rmdir_cbk_t fn,
+                    int32_t op_ret,
+                    int32_t op_errno)
+{
+        call_stub_t *stub = NULL;
+
+        GF_VALIDATE_OR_GOTO ("call-stub", frame, out);
+
+        stub = stub_new (frame, 0, GF_FOP_RMDIR);
+        GF_VALIDATE_OR_GOTO ("call-stub", stub, out);
+
+        stub->args.rmdir_cbk.fn = fn;
+        stub->args.rmdir_cbk.op_ret = op_ret;
+        stub->args.rmdir_cbk.op_errno = op_errno;
+out:
+        return stub;
+}
+
+/* symlink call: strdup()s linkname (unchecked) and copies loc. */
+call_stub_t *
+fop_symlink_stub (call_frame_t *frame,
+                  fop_symlink_t fn,
+                  const char *linkname,
+                  loc_t *loc)
+{
+        call_stub_t *stub = NULL;
+
+        GF_VALIDATE_OR_GOTO ("call-stub", frame, out);
+        GF_VALIDATE_OR_GOTO ("call-stub", loc, out);
+        GF_VALIDATE_OR_GOTO ("call-stub", linkname, out);
+
+        stub = stub_new (frame, 1, GF_FOP_SYMLINK);
+        GF_VALIDATE_OR_GOTO ("call-stub", stub, out);
+
+        stub->args.symlink.fn = fn;
+        stub->args.symlink.linkname = strdup (linkname);
+        loc_copy (&stub->args.symlink.loc, loc);
+out:
+        return stub;
+}
+
+/* symlink reply: refs inode and copies *buf, each only if non-NULL. */
+call_stub_t *
+fop_symlink_cbk_stub (call_frame_t *frame,
+                      fop_symlink_cbk_t fn,
+                      int32_t op_ret,
+                      int32_t op_errno,
+                      inode_t *inode,
+                      struct stat *buf)
+{
+        call_stub_t *stub = NULL;
+
+        GF_VALIDATE_OR_GOTO ("call-stub", frame, out);
+
+        stub = stub_new (frame, 0, GF_FOP_SYMLINK);
+        GF_VALIDATE_OR_GOTO ("call-stub", stub, out);
+
+        stub->args.symlink_cbk.fn = fn;
+        stub->args.symlink_cbk.op_ret = op_ret;
+        stub->args.symlink_cbk.op_errno = op_errno;
+        if (inode)
+                stub->args.symlink_cbk.inode = inode_ref (inode);
+        if (buf)
+                stub->args.symlink_cbk.buf = *buf;
+out:
+        return stub;
+}
+
+/* rename call: copies both locations. */
+call_stub_t *
+fop_rename_stub (call_frame_t *frame,
+                 fop_rename_t fn,
+                 loc_t *oldloc,
+                 loc_t *newloc)
+{
+        call_stub_t *stub = NULL;
+
+        GF_VALIDATE_OR_GOTO ("call-stub", frame, out);
+        GF_VALIDATE_OR_GOTO ("call-stub", oldloc, out);
+        GF_VALIDATE_OR_GOTO ("call-stub", newloc, out);
+
+        stub = stub_new (frame, 1, GF_FOP_RENAME);
+        GF_VALIDATE_OR_GOTO ("call-stub", stub, out);
+
+        stub->args.rename.fn = fn;
+        loc_copy (&stub->args.rename.old, oldloc);
+        loc_copy (&stub->args.rename.new, newloc);
+out:
+        return stub;
+}
+
+/* rename reply: copies *buf when buf is non-NULL. */
+call_stub_t *
+fop_rename_cbk_stub (call_frame_t *frame,
+                     fop_rename_cbk_t fn,
+                     int32_t op_ret,
+                     int32_t op_errno,
+                     struct stat *buf)
+{
+        call_stub_t *stub = NULL;
+
+        GF_VALIDATE_OR_GOTO ("call-stub", frame, out);
+
+        stub = stub_new (frame, 0, GF_FOP_RENAME);
+        GF_VALIDATE_OR_GOTO ("call-stub", stub, out);
+
+        stub->args.rename_cbk.fn = fn;
+        stub->args.rename_cbk.op_ret = op_ret;
+        stub->args.rename_cbk.op_errno = op_errno;
+        if (buf)
+                stub->args.rename_cbk.buf = *buf;
+out:
+        return stub;
+}
+
+/* link call: copies both locations. */
+call_stub_t *
+fop_link_stub (call_frame_t *frame,
+               fop_link_t fn,
+               loc_t *oldloc,
+               loc_t *newloc)
+{
+        call_stub_t *stub = NULL;
+
+        GF_VALIDATE_OR_GOTO ("call-stub", frame, out);
+        GF_VALIDATE_OR_GOTO ("call-stub", oldloc, out);
+        GF_VALIDATE_OR_GOTO ("call-stub", newloc, out);
+
+        stub = stub_new (frame, 1, GF_FOP_LINK);
+        GF_VALIDATE_OR_GOTO ("call-stub", stub, out);
+
+        stub->args.link.fn = fn;
+        loc_copy (&stub->args.link.oldloc, oldloc);
+        loc_copy (&stub->args.link.newloc, newloc);
+
+out:
+        return stub;
+}
+
+/* link reply: refs inode and copies *buf, each only if non-NULL. */
+call_stub_t *
+fop_link_cbk_stub (call_frame_t *frame,
+                   fop_link_cbk_t fn,
+                   int32_t op_ret,
+                   int32_t op_errno,
+                   inode_t *inode,
+                   struct stat *buf)
+{
+        call_stub_t *stub = NULL;
+
+        GF_VALIDATE_OR_GOTO ("call-stub", frame, out);
+
+        stub = stub_new (frame, 0, GF_FOP_LINK);
+        GF_VALIDATE_OR_GOTO ("call-stub", stub, out);
+
+        stub->args.link_cbk.fn = fn;
+        stub->args.link_cbk.op_ret = op_ret;
+        stub->args.link_cbk.op_errno = op_errno;
+        if (inode)
+                stub->args.link_cbk.inode = inode_ref (inode);
+        if (buf)
+                stub->args.link_cbk.buf = *buf;
+out:
+        return stub;
+}
+
+/* create call: copies loc, stores flags and mode, refs fd when non-NULL. */
+call_stub_t *
+fop_create_stub (call_frame_t *frame,
+                 fop_create_t fn,
+                 loc_t *loc,
+                 int32_t flags,
+                 mode_t mode, fd_t *fd)
+{
+        call_stub_t *stub = NULL;
+
+        GF_VALIDATE_OR_GOTO ("call-stub", frame, out);
+        GF_VALIDATE_OR_GOTO ("call-stub", loc, out);
+
+        stub = stub_new (frame, 1, GF_FOP_CREATE);
+        GF_VALIDATE_OR_GOTO ("call-stub", stub, out);
+
+        stub->args.create.fn = fn;
+        loc_copy (&stub->args.create.loc, loc);
+        stub->args.create.flags = flags;
+        stub->args.create.mode = mode;
+        if (fd)
+                stub->args.create.fd = fd_ref (fd);
+out:
+        return stub;
+}
+
+/* create reply: refs fd and inode, copies *buf, each only if non-NULL. */
+call_stub_t *
+fop_create_cbk_stub (call_frame_t *frame,
+                     fop_create_cbk_t fn,
+                     int32_t op_ret,
+                     int32_t op_errno,
+                     fd_t *fd,
+                     inode_t *inode,
+                     struct stat *buf)
+{
+        call_stub_t *stub = NULL;
+
+        GF_VALIDATE_OR_GOTO ("call-stub", frame, out);
+
+        stub = stub_new (frame, 0, GF_FOP_CREATE);
+        GF_VALIDATE_OR_GOTO ("call-stub", stub, out);
+
+        stub->args.create_cbk.fn = fn;
+        stub->args.create_cbk.op_ret = op_ret;
+        stub->args.create_cbk.op_errno = op_errno;
+        if (fd)
+                stub->args.create_cbk.fd = fd_ref (fd);
+        if (inode)
+                stub->args.create_cbk.inode = inode_ref (inode);
+        if (buf)
+                stub->args.create_cbk.buf = *buf;
+out:
+        return stub;
+}
+
+/* open call: copies loc, stores flags, refs fd when non-NULL. */
+call_stub_t *
+fop_open_stub (call_frame_t *frame,
+               fop_open_t fn,
+               loc_t *loc,
+               int32_t flags, fd_t *fd)
+{
+        call_stub_t *stub = NULL;
+
+        GF_VALIDATE_OR_GOTO ("call-stub", frame, out);
+        GF_VALIDATE_OR_GOTO ("call-stub", loc, out);
+
+        stub = stub_new (frame, 1, GF_FOP_OPEN);
+        GF_VALIDATE_OR_GOTO ("call-stub", stub, out);
+
+        stub->args.open.fn = fn;
+        loc_copy (&stub->args.open.loc, loc);
+        stub->args.open.flags = flags;
+        if (fd)
+                stub->args.open.fd = fd_ref (fd);
+out:
+        return stub;
+}
+
+/* open reply: refs fd when non-NULL. */
+call_stub_t *
+fop_open_cbk_stub (call_frame_t *frame,
+                   fop_open_cbk_t fn,
+                   int32_t op_ret,
+                   int32_t op_errno,
+                   fd_t *fd)
+
+{
+        call_stub_t *stub = NULL;
+
+        GF_VALIDATE_OR_GOTO ("call-stub", frame, out);
+
+        stub = stub_new (frame, 0, GF_FOP_OPEN);
+        GF_VALIDATE_OR_GOTO ("call-stub", stub, out);
+
+        stub->args.open_cbk.fn = fn;
+        stub->args.open_cbk.op_ret = op_ret;
+        stub->args.open_cbk.op_errno = op_errno;
+        if (fd)
+                stub->args.open_cbk.fd = fd_ref (fd);
+out:
+        return stub;
+}
+
+/* readv call (GF_FOP_READ): refs fd when non-NULL, stores size and off. */
+call_stub_t *
+fop_readv_stub (call_frame_t *frame,
+                fop_readv_t fn,
+                fd_t *fd,
+                size_t size,
+                off_t off)
+{
+        call_stub_t *stub = NULL;
+
+        GF_VALIDATE_OR_GOTO ("call-stub", frame, out);
+
+        stub = stub_new (frame, 1, GF_FOP_READ);
+        GF_VALIDATE_OR_GOTO ("call-stub", stub, out);
+
+        stub->args.readv.fn = fn;
+        if (fd)
+                stub->args.readv.fd = fd_ref (fd);
+        stub->args.readv.size = size;
+        stub->args.readv.off = off;
+out:
+        return stub;
+}
+
+/* readv reply: when op_ret >= 0, duplicates the iovec, copies *stbuf and
+ * refs frame->root->rsp_refs (none of these is NULL-checked here). */
+call_stub_t *
+fop_readv_cbk_stub (call_frame_t *frame,
+                    fop_readv_cbk_t fn,
+                    int32_t op_ret,
+                    int32_t op_errno,
+                    struct iovec *vector,
+                    int32_t count,
+                    struct stat *stbuf)
+
+{
+        call_stub_t *stub = NULL;
+
+        GF_VALIDATE_OR_GOTO ("call-stub", frame, out);
+
+        stub = stub_new (frame, 0, GF_FOP_READ);
+        GF_VALIDATE_OR_GOTO ("call-stub", stub, out);
+
+        stub->args.readv_cbk.fn = fn;
+        stub->args.readv_cbk.op_ret = op_ret;
+        stub->args.readv_cbk.op_errno = op_errno;
+        if (op_ret >= 0) {
+                stub->args.readv_cbk.vector = iov_dup (vector, count);
+                stub->args.readv_cbk.count = count;
+                stub->args.readv_cbk.stbuf = *stbuf;
+                stub->args.readv_cbk.rsp_refs =
+                        dict_ref (frame->root->rsp_refs);
+        }
+out:
+        return stub;
+}
+
+/* writev call (GF_FOP_WRITE): refs fd, duplicates the iovec, stores count
+ * and off, and refs frame->root->req_refs when it is set. */
+call_stub_t *
+fop_writev_stub (call_frame_t *frame,
+                 fop_writev_t fn,
+                 fd_t *fd,
+                 struct iovec *vector,
+                 int32_t count,
+                 off_t off)
+{
+        call_stub_t *stub = NULL;
+
+        GF_VALIDATE_OR_GOTO ("call-stub", frame, out);
+        GF_VALIDATE_OR_GOTO ("call-stub", vector, out);
+
+        stub = stub_new (frame, 1, GF_FOP_WRITE);
+        GF_VALIDATE_OR_GOTO ("call-stub", stub, out);
+
+        stub->args.writev.fn = fn;
+        if (fd)
+                stub->args.writev.fd = fd_ref (fd);
+        stub->args.writev.vector = iov_dup (vector, count);
+        stub->args.writev.count = count;
+        stub->args.writev.off = off;
+
+        if (frame->root->req_refs)
+                stub->args.writev.req_refs = dict_ref (frame->root->req_refs);
+out:
+        return stub;
+}
+
+/* writev reply: *stbuf is copied only when op_ret >= 0. */
+call_stub_t *
+fop_writev_cbk_stub (call_frame_t *frame,
+                     fop_writev_cbk_t fn,
+                     int32_t op_ret,
+                     int32_t op_errno,
+                     struct stat *stbuf)
+
+{
+        call_stub_t *stub = NULL;
+
+        GF_VALIDATE_OR_GOTO ("call-stub", frame, out);
+
+        stub = stub_new (frame, 0, GF_FOP_WRITE);
+        GF_VALIDATE_OR_GOTO ("call-stub", stub, out);
+
+        stub->args.writev_cbk.fn = fn;
+        stub->args.writev_cbk.op_ret = op_ret;
+        stub->args.writev_cbk.op_errno = op_errno;
+        if (op_ret >= 0)
+                stub->args.writev_cbk.stbuf = 
*stbuf; +out: + return stub; +} + + + +call_stub_t * +fop_flush_stub (call_frame_t *frame, + fop_flush_t fn, + fd_t *fd) +{ + call_stub_t *stub = NULL; + + GF_VALIDATE_OR_GOTO ("call-stub", frame, out); + + stub = stub_new (frame, 1, GF_FOP_FLUSH); + GF_VALIDATE_OR_GOTO ("call-stub", stub, out); + + stub->args.flush.fn = fn; + if (fd) + stub->args.flush.fd = fd_ref (fd); +out: + return stub; +} + + +call_stub_t * +fop_flush_cbk_stub (call_frame_t *frame, + fop_flush_cbk_t fn, + int32_t op_ret, + int32_t op_errno) + +{ + call_stub_t *stub = NULL; + + GF_VALIDATE_OR_GOTO ("call-stub", frame, out); + + stub = stub_new (frame, 0, GF_FOP_FLUSH); + GF_VALIDATE_OR_GOTO ("call-stub", stub, out); + + stub->args.flush_cbk.fn = fn; + stub->args.flush_cbk.op_ret = op_ret; + stub->args.flush_cbk.op_errno = op_errno; +out: + return stub; +} + + + + +call_stub_t * +fop_fsync_stub (call_frame_t *frame, + fop_fsync_t fn, + fd_t *fd, + int32_t datasync) +{ + call_stub_t *stub = NULL; + + GF_VALIDATE_OR_GOTO ("call-stub", frame, out); + + stub = stub_new (frame, 1, GF_FOP_FSYNC); + GF_VALIDATE_OR_GOTO ("call-stub", stub, out); + + stub->args.fsync.fn = fn; + if (fd) + stub->args.fsync.fd = fd_ref (fd); + stub->args.fsync.datasync = datasync; +out: + return stub; +} + + +call_stub_t * +fop_fsync_cbk_stub (call_frame_t *frame, + fop_fsync_cbk_t fn, + int32_t op_ret, + int32_t op_errno) + +{ + call_stub_t *stub = NULL; + + GF_VALIDATE_OR_GOTO ("call-stub", frame, out); + + stub = stub_new (frame, 0, GF_FOP_FSYNC); + GF_VALIDATE_OR_GOTO ("call-stub", stub, out); + + stub->args.fsync_cbk.fn = fn; + stub->args.fsync_cbk.op_ret = op_ret; + stub->args.fsync_cbk.op_errno = op_errno; +out: + return stub; +} + + +call_stub_t * +fop_opendir_stub (call_frame_t *frame, + fop_opendir_t fn, + loc_t *loc, fd_t *fd) +{ + call_stub_t *stub = NULL; + + GF_VALIDATE_OR_GOTO ("call-stub", frame, out); + GF_VALIDATE_OR_GOTO ("call-stub", loc, out); + + stub = stub_new (frame, 1, GF_FOP_OPENDIR); + 
GF_VALIDATE_OR_GOTO ("call-stub", stub, out); + + stub->args.opendir.fn = fn; + loc_copy (&stub->args.opendir.loc, loc); + if (stub->args.opendir.fd) + stub->args.opendir.fd = fd_ref (fd); +out: + return stub; +} + + +call_stub_t * +fop_opendir_cbk_stub (call_frame_t *frame, + fop_opendir_cbk_t fn, + int32_t op_ret, + int32_t op_errno, + fd_t *fd) + +{ + call_stub_t *stub = NULL; + + GF_VALIDATE_OR_GOTO ("call-stub", frame, out); + + stub = stub_new (frame, 0, GF_FOP_OPENDIR); + GF_VALIDATE_OR_GOTO ("call-stub", stub, out); + + stub->args.opendir_cbk.fn = fn; + stub->args.opendir_cbk.op_ret = op_ret; + stub->args.opendir_cbk.op_errno = op_errno; + + if (fd) + stub->args.opendir_cbk.fd = fd_ref (fd); +out: + return stub; +} + + +call_stub_t * +fop_getdents_stub (call_frame_t *frame, + fop_getdents_t fn, + fd_t *fd, + size_t size, + off_t off, + int32_t flag) +{ + call_stub_t *stub = NULL; + + GF_VALIDATE_OR_GOTO ("call-stub", frame, out); + + stub = stub_new (frame, 1, GF_FOP_GETDENTS); + GF_VALIDATE_OR_GOTO ("call-stub", stub, out); + + stub->args.getdents.fn = fn; + stub->args.getdents.size = size; + stub->args.getdents.off = off; + if (fd) + stub->args.getdents.fd = fd_ref (fd); + stub->args.getdents.flag = flag; +out: + return stub; +} + + +call_stub_t * +fop_getdents_cbk_stub (call_frame_t *frame, + fop_getdents_cbk_t fn, + int32_t op_ret, + int32_t op_errno, + dir_entry_t *entries, + int32_t count) + +{ + call_stub_t *stub = NULL; + + GF_VALIDATE_OR_GOTO ("call-stub", frame, out); + + stub = stub_new (frame, 0, GF_FOP_GETDENTS); + GF_VALIDATE_OR_GOTO ("call-stub", stub, out); + + stub->args.getdents_cbk.fn = fn; + stub->args.getdents_cbk.op_ret = op_ret; + stub->args.getdents_cbk.op_errno = op_errno; + if (op_ret >= 0) { + stub->args.getdents_cbk.entries.next = entries->next; + /* FIXME: are entries not needed in the caller after + * creating stub? 
*/ + entries->next = NULL; + } + + stub->args.getdents_cbk.count = count; +out: + return stub; +} + + + +call_stub_t * +fop_fsyncdir_stub (call_frame_t *frame, + fop_fsyncdir_t fn, + fd_t *fd, + int32_t datasync) +{ + call_stub_t *stub = NULL; + + GF_VALIDATE_OR_GOTO ("call-stub", frame, out); + + stub = stub_new (frame, 1, GF_FOP_FSYNCDIR); + GF_VALIDATE_OR_GOTO ("call-stub", stub, out); + + stub->args.fsyncdir.fn = fn; + if (fd) + stub->args.fsyncdir.fd = fd_ref (fd); + stub->args.fsyncdir.datasync = datasync; +out: + return stub; +} + + +call_stub_t * +fop_fsyncdir_cbk_stub (call_frame_t *frame, + fop_fsyncdir_cbk_t fn, + int32_t op_ret, + int32_t op_errno) + +{ + call_stub_t *stub = NULL; + + GF_VALIDATE_OR_GOTO ("call-stub", frame, out); + + stub = stub_new (frame, 0, GF_FOP_FSYNCDIR); + GF_VALIDATE_OR_GOTO ("call-stub", stub, out); + + stub->args.fsyncdir_cbk.fn = fn; + stub->args.fsyncdir_cbk.op_ret = op_ret; + stub->args.fsyncdir_cbk.op_errno = op_errno; +out: + return stub; +} + + +call_stub_t * +fop_statfs_stub (call_frame_t *frame, + fop_statfs_t fn, + loc_t *loc) +{ + call_stub_t *stub = NULL; + + GF_VALIDATE_OR_GOTO ("call-stub", frame, out); + GF_VALIDATE_OR_GOTO ("call-stub", loc, out); + + stub = stub_new (frame, 1, GF_FOP_STATFS); + GF_VALIDATE_OR_GOTO ("call-stub", stub, out); + + stub->args.statfs.fn = fn; + loc_copy (&stub->args.statfs.loc, loc); +out: + return stub; +} + + +call_stub_t * +fop_statfs_cbk_stub (call_frame_t *frame, + fop_statfs_cbk_t fn, + int32_t op_ret, + int32_t op_errno, + struct statvfs *buf) + +{ + call_stub_t *stub = NULL; + + GF_VALIDATE_OR_GOTO ("call-stub", frame, out); + + stub = stub_new (frame, 0, GF_FOP_STATFS); + GF_VALIDATE_OR_GOTO ("call-stub", stub, out); + + stub->args.statfs_cbk.fn = fn; + stub->args.statfs_cbk.op_ret = op_ret; + stub->args.statfs_cbk.op_errno = op_errno; + if (op_ret == 0) + stub->args.statfs_cbk.buf = *buf; +out: + return stub; +} + + +call_stub_t * +fop_setxattr_stub (call_frame_t *frame, + 
fop_setxattr_t fn, + loc_t *loc, + dict_t *dict, + int32_t flags) +{ + call_stub_t *stub = NULL; + + GF_VALIDATE_OR_GOTO ("call-stub", frame, out); + GF_VALIDATE_OR_GOTO ("call-stub", loc, out); + + stub = stub_new (frame, 1, GF_FOP_SETXATTR); + GF_VALIDATE_OR_GOTO ("call-stub", stub, out); + + stub->args.setxattr.fn = fn; + loc_copy (&stub->args.setxattr.loc, loc); + /* TODO */ + if (dict) + stub->args.setxattr.dict = dict_ref (dict); + stub->args.setxattr.flags = flags; +out: + return stub; +} + + +call_stub_t * +fop_setxattr_cbk_stub (call_frame_t *frame, + fop_setxattr_cbk_t fn, + int32_t op_ret, + int32_t op_errno) +{ + call_stub_t *stub = NULL; + + GF_VALIDATE_OR_GOTO ("call-stub", frame, out); + + stub = stub_new (frame, 0, GF_FOP_SETXATTR); + GF_VALIDATE_OR_GOTO ("call-stub", stub, out); + + stub->args.setxattr_cbk.fn = fn; + stub->args.setxattr_cbk.op_ret = op_ret; + stub->args.setxattr_cbk.op_errno = op_errno; +out: + return stub; +} + +call_stub_t * +fop_getxattr_stub (call_frame_t *frame, + fop_getxattr_t fn, + loc_t *loc, + const char *name) +{ + call_stub_t *stub = NULL; + + GF_VALIDATE_OR_GOTO ("call-stub", frame, out); + GF_VALIDATE_OR_GOTO ("call-stub", loc, out); + + stub = stub_new (frame, 1, GF_FOP_GETXATTR); + GF_VALIDATE_OR_GOTO ("call-stub", stub, out); + + stub->args.getxattr.fn = fn; + loc_copy (&stub->args.getxattr.loc, loc); + + if (name) + stub->args.getxattr.name = strdup (name); +out: + return stub; +} + + +call_stub_t * +fop_getxattr_cbk_stub (call_frame_t *frame, + fop_getxattr_cbk_t fn, + int32_t op_ret, + int32_t op_errno, + dict_t *dict) +{ + call_stub_t *stub = NULL; + + GF_VALIDATE_OR_GOTO ("call-stub", frame, out); + + stub = stub_new (frame, 0, GF_FOP_GETXATTR); + GF_VALIDATE_OR_GOTO ("call-stub", stub, out); + + stub->args.getxattr_cbk.fn = fn; + stub->args.getxattr_cbk.op_ret = op_ret; + stub->args.getxattr_cbk.op_errno = op_errno; + /* TODO */ + if (dict) + stub->args.getxattr_cbk.dict = dict_ref (dict); +out: + return 
stub; +} + +call_stub_t * +fop_removexattr_stub (call_frame_t *frame, + fop_removexattr_t fn, + loc_t *loc, + const char *name) +{ + call_stub_t *stub = NULL; + + GF_VALIDATE_OR_GOTO ("call-stub", frame, out); + GF_VALIDATE_OR_GOTO ("call-stub", loc, out); + GF_VALIDATE_OR_GOTO ("call-stub", name, out); + + stub = stub_new (frame, 1, GF_FOP_REMOVEXATTR); + GF_VALIDATE_OR_GOTO ("call-stub", stub, out); + + stub->args.removexattr.fn = fn; + loc_copy (&stub->args.removexattr.loc, loc); + stub->args.removexattr.name = strdup (name); +out: + return stub; +} + + +call_stub_t * +fop_removexattr_cbk_stub (call_frame_t *frame, + fop_removexattr_cbk_t fn, + int32_t op_ret, + int32_t op_errno) +{ + call_stub_t *stub = NULL; + + GF_VALIDATE_OR_GOTO ("call-stub", frame, out); + + stub = stub_new (frame, 0, GF_FOP_REMOVEXATTR); + GF_VALIDATE_OR_GOTO ("call-stub", stub, out); + + stub->args.removexattr_cbk.fn = fn; + stub->args.removexattr_cbk.op_ret = op_ret; + stub->args.removexattr_cbk.op_errno = op_errno; +out: + return stub; +} + + +call_stub_t * +fop_lk_stub (call_frame_t *frame, + fop_lk_t fn, + fd_t *fd, + int32_t cmd, + struct flock *lock) +{ + call_stub_t *stub = NULL; + + GF_VALIDATE_OR_GOTO ("call-stub", frame, out); + GF_VALIDATE_OR_GOTO ("call-stub", lock, out); + + stub = stub_new (frame, 1, GF_FOP_LK); + GF_VALIDATE_OR_GOTO ("call-stub", stub, out); + + stub->args.lk.fn = fn; + if (fd) + stub->args.lk.fd = fd_ref (fd); + stub->args.lk.cmd = cmd; + stub->args.lk.lock = *lock; +out: + return stub; +} + + +call_stub_t * +fop_lk_cbk_stub (call_frame_t *frame, + fop_lk_cbk_t fn, + int32_t op_ret, + int32_t op_errno, + struct flock *lock) + +{ + call_stub_t *stub = NULL; + + GF_VALIDATE_OR_GOTO ("call-stub", frame, out); + + stub = stub_new (frame, 0, GF_FOP_LK); + GF_VALIDATE_OR_GOTO ("call-stub", stub, out); + + stub->args.lk_cbk.fn = fn; + stub->args.lk_cbk.op_ret = op_ret; + stub->args.lk_cbk.op_errno = op_errno; + if (op_ret == 0) + stub->args.lk_cbk.lock = *lock; 
+out: + return stub; +} + +call_stub_t * +fop_inodelk_stub (call_frame_t *frame, fop_inodelk_t fn, + loc_t *loc, int32_t cmd, struct flock *lock) +{ + call_stub_t *stub = NULL; + + if (!frame || !lock) + return NULL; + + stub = stub_new (frame, 1, GF_FOP_INODELK); + if (!stub) + return NULL; + + stub->args.inodelk.fn = fn; + + loc_copy (&stub->args.inodelk.loc, loc); + stub->args.inodelk.cmd = cmd; + stub->args.inodelk.lock = *lock; + + return stub; +} + +call_stub_t * +fop_inodelk_cbk_stub (call_frame_t *frame, fop_inodelk_cbk_t fn, + int32_t op_ret, int32_t op_errno) +{ + call_stub_t *stub = NULL; + + if (!frame) + return NULL; + + stub = stub_new (frame, 0, GF_FOP_INODELK); + if (!stub) + return NULL; + + stub->args.inodelk_cbk.fn = fn; + stub->args.inodelk_cbk.op_ret = op_ret; + stub->args.inodelk_cbk.op_errno = op_errno; + + return stub; +} + + +call_stub_t * +fop_finodelk_stub (call_frame_t *frame, fop_finodelk_t fn, + fd_t *fd, int32_t cmd, struct flock *lock) +{ + call_stub_t *stub = NULL; + + if (!frame || !lock) + return NULL; + + stub = stub_new (frame, 1, GF_FOP_FINODELK); + if (!stub) + return NULL; + + stub->args.finodelk.fn = fn; + + if (fd) + stub->args.finodelk.fd = fd_ref (fd); + stub->args.finodelk.cmd = cmd; + stub->args.finodelk.lock = *lock; + + return stub; +} + + +call_stub_t * +fop_finodelk_cbk_stub (call_frame_t *frame, fop_inodelk_cbk_t fn, + int32_t op_ret, int32_t op_errno) +{ + call_stub_t *stub = NULL; + + if (!frame) + return NULL; + + stub = stub_new (frame, 0, GF_FOP_FINODELK); + if (!stub) + return NULL; + + stub->args.finodelk_cbk.fn = fn; + stub->args.finodelk_cbk.op_ret = op_ret; + stub->args.finodelk_cbk.op_errno = op_errno; + + return stub; +} + + +call_stub_t * +fop_entrylk_stub (call_frame_t *frame, fop_entrylk_t fn, + loc_t *loc, const char *name, + entrylk_cmd cmd, entrylk_type type) +{ + call_stub_t *stub = NULL; + + if (!frame) + return NULL; + + stub = stub_new (frame, 1, GF_FOP_ENTRYLK); + if (!stub) + return NULL; + 
+ stub->args.entrylk.fn = fn; + loc_copy (&stub->args.entrylk.loc, loc); + + stub->args.entrylk.cmd = cmd; + stub->args.entrylk.type = type; + if (name) + stub->args.entrylk.name = strdup (name); + + return stub; +} + +call_stub_t * +fop_entrylk_cbk_stub (call_frame_t *frame, fop_entrylk_cbk_t fn, + int32_t op_ret, int32_t op_errno) +{ + call_stub_t *stub = NULL; + + if (!frame) + return NULL; + + stub = stub_new (frame, 0, GF_FOP_ENTRYLK); + if (!stub) + return NULL; + + stub->args.entrylk_cbk.fn = fn; + stub->args.entrylk_cbk.op_ret = op_ret; + stub->args.entrylk_cbk.op_errno = op_errno; + + return stub; +} + + +call_stub_t * +fop_fentrylk_stub (call_frame_t *frame, fop_fentrylk_t fn, + fd_t *fd, const char *name, + entrylk_cmd cmd, entrylk_type type) +{ + call_stub_t *stub = NULL; + + if (!frame) + return NULL; + + stub = stub_new (frame, 1, GF_FOP_FENTRYLK); + if (!stub) + return NULL; + + stub->args.fentrylk.fn = fn; + + if (fd) + stub->args.fentrylk.fd = fd_ref (fd); + stub->args.fentrylk.cmd = cmd; + stub->args.fentrylk.type = type; + if (name) + stub->args.fentrylk.name = strdup (name); + + return stub; +} + +call_stub_t * +fop_fentrylk_cbk_stub (call_frame_t *frame, fop_fentrylk_cbk_t fn, + int32_t op_ret, int32_t op_errno) +{ + call_stub_t *stub = NULL; + + if (!frame) + return NULL; + + stub = stub_new (frame, 0, GF_FOP_FENTRYLK); + if (!stub) + return NULL; + + stub->args.fentrylk_cbk.fn = fn; + stub->args.fentrylk_cbk.op_ret = op_ret; + stub->args.fentrylk_cbk.op_errno = op_errno; + + return stub; +} + + +call_stub_t * +fop_setdents_stub (call_frame_t *frame, + fop_setdents_t fn, + fd_t *fd, + int32_t flags, + dir_entry_t *entries, + int32_t count) +{ + call_stub_t *stub = NULL; + + GF_VALIDATE_OR_GOTO ("call-stub", frame, out); + + stub = stub_new (frame, 1, GF_FOP_SETDENTS); + GF_VALIDATE_OR_GOTO ("call-stub", stub, out); + + if (fd) + stub->args.setdents.fd = fd_ref (fd); + stub->args.setdents.fn = fn; + stub->args.setdents.flags = flags; + 
stub->args.setdents.count = count; + if (entries) { + stub->args.setdents.entries.next = entries->next; + entries->next = NULL; + } +out: + return stub; +} + +call_stub_t * +fop_setdents_cbk_stub (call_frame_t *frame, + fop_setdents_cbk_t fn, + int32_t op_ret, + int32_t op_errno) +{ + call_stub_t *stub = NULL; + + GF_VALIDATE_OR_GOTO ("call-stub", frame, out); + + stub = stub_new (frame, 0, GF_FOP_SETDENTS); + GF_VALIDATE_OR_GOTO ("call-stub", stub, out); + + stub->args.setdents_cbk.fn = fn; + stub->args.setdents_cbk.op_ret = op_ret; + stub->args.setdents_cbk.op_errno = op_errno; +out: + return stub; + +} + +call_stub_t * +fop_readdir_cbk_stub (call_frame_t *frame, + fop_readdir_cbk_t fn, + int32_t op_ret, + int32_t op_errno, + gf_dirent_t *entries) +{ + call_stub_t *stub = NULL; + gf_dirent_t *stub_entry = NULL, *entry = NULL; + + GF_VALIDATE_OR_GOTO ("call-stub", frame, out); + + stub = stub_new (frame, 0, GF_FOP_READDIR); + GF_VALIDATE_OR_GOTO ("call-stub", stub, out); + + stub->args.readdir_cbk.fn = fn; + stub->args.readdir_cbk.op_ret = op_ret; + stub->args.readdir_cbk.op_errno = op_errno; + INIT_LIST_HEAD (&stub->args.readdir_cbk.entries.list); + + if (op_ret > 0) { + list_for_each_entry (entry, &entries->list, list) { + stub_entry = gf_dirent_for_name (entry->d_name); + ERR_ABORT (stub_entry); + stub_entry->d_off = entry->d_off; + stub_entry->d_ino = entry->d_ino; + + list_add_tail (&stub_entry->list, + &stub->args.readdir_cbk.entries.list); + } + } +out: + return stub; +} + +call_stub_t * +fop_readdir_stub (call_frame_t *frame, + fop_readdir_t fn, + fd_t *fd, + size_t size, + off_t off) +{ + call_stub_t *stub = NULL; + + stub = stub_new (frame, 1, GF_FOP_READDIR); + stub->args.readdir.fn = fn; + stub->args.readdir.fd = fd_ref (fd); + stub->args.readdir.size = size; + stub->args.readdir.off = off; + + return stub; +} +call_stub_t * +fop_checksum_stub (call_frame_t *frame, + fop_checksum_t fn, + loc_t *loc, + int32_t flags) +{ + call_stub_t *stub = NULL; + + 
GF_VALIDATE_OR_GOTO ("call-stub", frame, out); + GF_VALIDATE_OR_GOTO ("call-stub", loc, out); + + stub = stub_new (frame, 1, GF_FOP_CHECKSUM); + GF_VALIDATE_OR_GOTO ("call-stub", stub, out); + + stub->args.checksum.fn = fn; + loc_copy (&stub->args.checksum.loc, loc); + stub->args.checksum.flags = flags; +out: + return stub; +} + + +call_stub_t * +fop_checksum_cbk_stub (call_frame_t *frame, + fop_checksum_cbk_t fn, + int32_t op_ret, + int32_t op_errno, + uint8_t *file_checksum, + uint8_t *dir_checksum) +{ + call_stub_t *stub = NULL; + + GF_VALIDATE_OR_GOTO ("call-stub", frame, out); + + stub = stub_new (frame, 0, GF_FOP_CHECKSUM); + GF_VALIDATE_OR_GOTO ("call-stub", stub, out); + + stub->args.checksum_cbk.fn = fn; + stub->args.checksum_cbk.op_ret = op_ret; + stub->args.checksum_cbk.op_errno = op_errno; + if (op_ret >= 0) + { + stub->args.checksum_cbk.file_checksum = + memdup (file_checksum, ZR_FILENAME_MAX); + + stub->args.checksum_cbk.dir_checksum = + memdup (dir_checksum, ZR_FILENAME_MAX); + } +out: + return stub; +} + + +call_stub_t * +fop_xattrop_cbk_stub (call_frame_t *frame, + fop_xattrop_cbk_t fn, + int32_t op_ret, + int32_t op_errno) +{ + call_stub_t *stub = NULL; + + GF_VALIDATE_OR_GOTO ("call-stub", frame, out); + + stub = stub_new (frame, 0, GF_FOP_XATTROP); + GF_VALIDATE_OR_GOTO ("call-stub", stub, out); + + stub->args.xattrop_cbk.fn = fn; + stub->args.xattrop_cbk.op_ret = op_ret; + stub->args.xattrop_cbk.op_errno = op_errno; + +out: + return stub; +} + + +call_stub_t * +fop_fxattrop_cbk_stub (call_frame_t *frame, + fop_fxattrop_cbk_t fn, + int32_t op_ret, + int32_t op_errno, + dict_t *xattr) +{ + call_stub_t *stub = NULL; + GF_VALIDATE_OR_GOTO ("call-stub", frame, out); + + stub = stub_new (frame, 1, GF_FOP_FXATTROP); + stub->args.fxattrop_cbk.fn = fn; + stub->args.fxattrop_cbk.op_ret = op_ret; + stub->args.fxattrop_cbk.op_errno = op_errno; + if (xattr) + stub->args.fxattrop_cbk.xattr = dict_ref (xattr); + +out: + return stub; +} + + +call_stub_t * 
+fop_xattrop_stub (call_frame_t *frame, + fop_xattrop_t fn, + loc_t *loc, + gf_xattrop_flags_t optype, + dict_t *xattr) +{ + call_stub_t *stub = NULL; + + if (!frame || !xattr) + return NULL; + + stub = stub_new (frame, 1, GF_FOP_XATTROP); + if (!stub) + return NULL; + + stub->args.xattrop.fn = fn; + + loc_copy (&stub->args.xattrop.loc, loc); + + stub->args.xattrop.optype = optype; + stub->args.xattrop.xattr = dict_ref (xattr); + + return stub; +} + +call_stub_t * +fop_fxattrop_stub (call_frame_t *frame, + fop_fxattrop_t fn, + fd_t *fd, + gf_xattrop_flags_t optype, + dict_t *xattr) +{ + call_stub_t *stub = NULL; + + if (!frame || !xattr) + return NULL; + + stub = stub_new (frame, 1, GF_FOP_FXATTROP); + if (!stub) + return NULL; + + stub->args.fxattrop.fn = fn; + + stub->args.fxattrop.fd = fd_ref (fd); + + stub->args.fxattrop.optype = optype; + stub->args.fxattrop.xattr = dict_ref (xattr); + + return stub; +} + + +static void +call_resume_wind (call_stub_t *stub) +{ + GF_VALIDATE_OR_GOTO ("call-stub", stub, out); + + switch (stub->fop) { + case GF_FOP_OPEN: + { + stub->args.open.fn (stub->frame, + stub->frame->this, + &stub->args.open.loc, + stub->args.open.flags, stub->args.open.fd); + break; + } + case GF_FOP_CREATE: + { + stub->args.create.fn (stub->frame, + stub->frame->this, + &stub->args.create.loc, + stub->args.create.flags, + stub->args.create.mode, + stub->args.create.fd); + break; + } + case GF_FOP_STAT: + { + stub->args.stat.fn (stub->frame, + stub->frame->this, + &stub->args.stat.loc); + break; + } + case GF_FOP_READLINK: + { + stub->args.readlink.fn (stub->frame, + stub->frame->this, + &stub->args.readlink.loc, + stub->args.readlink.size); + break; + } + + case GF_FOP_MKNOD: + { + stub->args.mknod.fn (stub->frame, + stub->frame->this, + &stub->args.mknod.loc, + stub->args.mknod.mode, + stub->args.mknod.rdev); + } + break; + + case GF_FOP_MKDIR: + { + stub->args.mkdir.fn (stub->frame, + stub->frame->this, + &stub->args.mkdir.loc, + 
stub->args.mkdir.mode); + } + break; + + case GF_FOP_UNLINK: + { + stub->args.unlink.fn (stub->frame, + stub->frame->this, + &stub->args.unlink.loc); + } + break; + + case GF_FOP_RMDIR: + { + stub->args.rmdir.fn (stub->frame, + stub->frame->this, + &stub->args.rmdir.loc); + } + break; + + case GF_FOP_SYMLINK: + { + stub->args.symlink.fn (stub->frame, + stub->frame->this, + stub->args.symlink.linkname, + &stub->args.symlink.loc); + } + break; + + case GF_FOP_RENAME: + { + stub->args.rename.fn (stub->frame, + stub->frame->this, + &stub->args.rename.old, + &stub->args.rename.new); + } + break; + + case GF_FOP_LINK: + { + stub->args.link.fn (stub->frame, + stub->frame->this, + &stub->args.link.oldloc, + &stub->args.link.newloc); + } + break; + + case GF_FOP_CHMOD: + { + stub->args.chmod.fn (stub->frame, + stub->frame->this, + &stub->args.chmod.loc, + stub->args.chmod.mode); + } + break; + + case GF_FOP_CHOWN: + { + stub->args.chown.fn (stub->frame, + stub->frame->this, + &stub->args.chown.loc, + stub->args.chown.uid, + stub->args.chown.gid); + break; + } + case GF_FOP_TRUNCATE: + { + stub->args.truncate.fn (stub->frame, + stub->frame->this, + &stub->args.truncate.loc, + stub->args.truncate.off); + break; + } + + case GF_FOP_READ: + { + stub->args.readv.fn (stub->frame, + stub->frame->this, + stub->args.readv.fd, + stub->args.readv.size, + stub->args.readv.off); + break; + } + + case GF_FOP_WRITE: + { + stub->args.writev.fn (stub->frame, + stub->frame->this, + stub->args.writev.fd, + stub->args.writev.vector, + stub->args.writev.count, + stub->args.writev.off); + break; + } + + case GF_FOP_STATFS: + { + stub->args.statfs.fn (stub->frame, + stub->frame->this, + &stub->args.statfs.loc); + break; + } + case GF_FOP_FLUSH: + { + stub->args.flush.fn (stub->frame, + stub->frame->this, + stub->args.flush.fd); + break; + } + + case GF_FOP_FSYNC: + { + stub->args.fsync.fn (stub->frame, + stub->frame->this, + stub->args.fsync.fd, + stub->args.fsync.datasync); + break; + } + + case 
GF_FOP_SETXATTR: + { + stub->args.setxattr.fn (stub->frame, + stub->frame->this, + &stub->args.setxattr.loc, + stub->args.setxattr.dict, + stub->args.setxattr.flags); + break; + } + + case GF_FOP_GETXATTR: + { + stub->args.getxattr.fn (stub->frame, + stub->frame->this, + &stub->args.getxattr.loc, + stub->args.getxattr.name); + break; + } + + case GF_FOP_REMOVEXATTR: + { + stub->args.removexattr.fn (stub->frame, + stub->frame->this, + &stub->args.removexattr.loc, + stub->args.removexattr.name); + break; + } + + case GF_FOP_OPENDIR: + { + stub->args.opendir.fn (stub->frame, + stub->frame->this, + &stub->args.opendir.loc, + stub->args.opendir.fd); + break; + } + + case GF_FOP_GETDENTS: + { + stub->args.getdents.fn (stub->frame, + stub->frame->this, + stub->args.getdents.fd, + stub->args.getdents.size, + stub->args.getdents.off, + stub->args.getdents.flag); + break; + } + + case GF_FOP_FSYNCDIR: + { + stub->args.fsyncdir.fn (stub->frame, + stub->frame->this, + stub->args.fsyncdir.fd, + stub->args.fsyncdir.datasync); + break; + } + + case GF_FOP_ACCESS: + { + stub->args.access.fn (stub->frame, + stub->frame->this, + &stub->args.access.loc, + stub->args.access.mask); + break; + } + + case GF_FOP_FTRUNCATE: + { + stub->args.ftruncate.fn (stub->frame, + stub->frame->this, + stub->args.ftruncate.fd, + stub->args.ftruncate.off); + break; + } + + case GF_FOP_FSTAT: + { + stub->args.fstat.fn (stub->frame, + stub->frame->this, + stub->args.fstat.fd); + break; + } + + case GF_FOP_LK: + { + stub->args.lk.fn (stub->frame, + stub->frame->this, + stub->args.lk.fd, + stub->args.lk.cmd, + &stub->args.lk.lock); + break; + } + + case GF_FOP_INODELK: + { + stub->args.inodelk.fn (stub->frame, + stub->frame->this, + &stub->args.inodelk.loc, + stub->args.inodelk.cmd, + &stub->args.inodelk.lock); + break; + } + + case GF_FOP_FINODELK: + { + stub->args.finodelk.fn (stub->frame, + stub->frame->this, + stub->args.finodelk.fd, + stub->args.finodelk.cmd, + &stub->args.finodelk.lock); + break; + } 
+ + case GF_FOP_ENTRYLK: + { + stub->args.entrylk.fn (stub->frame, + stub->frame->this, + &stub->args.entrylk.loc, + stub->args.entrylk.name, + stub->args.entrylk.cmd, + stub->args.entrylk.type); + break; + } + + case GF_FOP_FENTRYLK: + { + stub->args.fentrylk.fn (stub->frame, + stub->frame->this, + stub->args.fentrylk.fd, + stub->args.fentrylk.name, + stub->args.fentrylk.cmd, + stub->args.fentrylk.type); + break; + } + + case GF_FOP_UTIMENS: + { + stub->args.utimens.fn (stub->frame, + stub->frame->this, + &stub->args.utimens.loc, + stub->args.utimens.tv); + break; + } + + + break; + case GF_FOP_FCHMOD: + { + stub->args.fchmod.fn (stub->frame, + stub->frame->this, + stub->args.fchmod.fd, + stub->args.fchmod.mode); + break; + } + + case GF_FOP_FCHOWN: + { + stub->args.fchown.fn (stub->frame, + stub->frame->this, + stub->args.fchown.fd, + stub->args.fchown.uid, + stub->args.fchown.gid); + break; + } + + case GF_FOP_LOOKUP: + { + stub->args.lookup.fn (stub->frame, + stub->frame->this, + &stub->args.lookup.loc, + stub->args.lookup.xattr_req); + break; + } + + case GF_FOP_SETDENTS: + { + stub->args.setdents.fn (stub->frame, + stub->frame->this, + stub->args.setdents.fd, + stub->args.setdents.flags, + &stub->args.setdents.entries, + stub->args.setdents.count); + break; + } + + case GF_FOP_CHECKSUM: + { + stub->args.checksum.fn (stub->frame, + stub->frame->this, + &stub->args.checksum.loc, + stub->args.checksum.flags); + break; + } + case GF_FOP_READDIR: + { + stub->args.readdir.fn (stub->frame, + stub->frame->this, + stub->args.readdir.fd, + stub->args.readdir.size, + stub->args.readdir.off); + break; + } + case GF_FOP_XATTROP: + { + stub->args.xattrop.fn (stub->frame, + stub->frame->this, + &stub->args.xattrop.loc, + stub->args.xattrop.optype, + stub->args.xattrop.xattr); + + break; + } + case GF_FOP_FXATTROP: + { + stub->args.fxattrop.fn (stub->frame, + stub->frame->this, + stub->args.fxattrop.fd, + stub->args.fxattrop.optype, + stub->args.fxattrop.xattr); + + break; + 
} + default: + { + gf_log ("call-stub", + GF_LOG_DEBUG, + "Invalid value of FOP"); + } + break; + } +out: + return; +} + + + +static void +call_resume_unwind (call_stub_t *stub) +{ + GF_VALIDATE_OR_GOTO ("call-stub", stub, out); + + switch (stub->fop) { + case GF_FOP_OPEN: + { + if (!stub->args.open_cbk.fn) + STACK_UNWIND (stub->frame, + stub->args.open_cbk.op_ret, + stub->args.open_cbk.op_errno, + stub->args.open_cbk.fd); + else + stub->args.open_cbk.fn (stub->frame, + stub->frame->cookie, + stub->frame->this, + stub->args.open_cbk.op_ret, + stub->args.open_cbk.op_errno, + stub->args.open_cbk.fd); + break; + } + + case GF_FOP_CREATE: + { + if (!stub->args.create_cbk.fn) + STACK_UNWIND (stub->frame, + stub->args.create_cbk.op_ret, + stub->args.create_cbk.op_errno, + stub->args.create_cbk.fd, + stub->args.create_cbk.inode, + &stub->args.create_cbk.buf); + else + stub->args.create_cbk.fn (stub->frame, + stub->frame->cookie, + stub->frame->this, + stub->args.create_cbk.op_ret, + stub->args.create_cbk.op_errno, + stub->args.create_cbk.fd, + stub->args.create_cbk.inode, + &stub->args.create_cbk.buf); + + break; + } + + case GF_FOP_STAT: + { + if (!stub->args.stat_cbk.fn) + STACK_UNWIND (stub->frame, + stub->args.stat_cbk.op_ret, + stub->args.stat_cbk.op_errno, + &stub->args.stat_cbk.buf); + else + stub->args.stat_cbk.fn (stub->frame, + stub->frame->cookie, + stub->frame->this, + stub->args.stat_cbk.op_ret, + stub->args.stat_cbk.op_errno, + &stub->args.stat_cbk.buf); + + break; + } + + case GF_FOP_READLINK: + { + if (!stub->args.readlink_cbk.fn) + STACK_UNWIND (stub->frame, + stub->args.readlink_cbk.op_ret, + stub->args.readlink_cbk.op_errno, + stub->args.readlink_cbk.buf); + else + stub->args.readlink_cbk.fn (stub->frame, + stub->frame->cookie, + stub->frame->this, + stub->args.readlink_cbk.op_ret, + stub->args.readlink_cbk.op_errno, + stub->args.readlink_cbk.buf); + + break; + } + + case GF_FOP_MKNOD: + { + if (!stub->args.mknod_cbk.fn) + STACK_UNWIND (stub->frame, + 
stub->args.mknod_cbk.op_ret, + stub->args.mknod_cbk.op_errno, + stub->args.mknod_cbk.inode, + &stub->args.mknod_cbk.buf); + else + stub->args.mknod_cbk.fn (stub->frame, + stub->frame->cookie, + stub->frame->this, + stub->args.mknod_cbk.op_ret, + stub->args.mknod_cbk.op_errno, + stub->args.mknod_cbk.inode, + &stub->args.mknod_cbk.buf); + break; + } + + case GF_FOP_MKDIR: + { + if (!stub->args.mkdir_cbk.fn) + STACK_UNWIND (stub->frame, + stub->args.mkdir_cbk.op_ret, + stub->args.mkdir_cbk.op_errno, + stub->args.mkdir_cbk.inode, + &stub->args.mkdir_cbk.buf); + else + stub->args.mkdir_cbk.fn (stub->frame, + stub->frame->cookie, + stub->frame->this, + stub->args.mkdir_cbk.op_ret, + stub->args.mkdir_cbk.op_errno, + stub->args.mkdir_cbk.inode, + &stub->args.mkdir_cbk.buf); + + if (stub->args.mkdir_cbk.inode) + inode_unref (stub->args.mkdir_cbk.inode); + + break; + } + + case GF_FOP_UNLINK: + { + if (!stub->args.unlink_cbk.fn) + STACK_UNWIND (stub->frame, + stub->args.unlink_cbk.op_ret, + stub->args.unlink_cbk.op_errno); + else + stub->args.unlink_cbk.fn (stub->frame, + stub->frame->cookie, + stub->frame->this, + stub->args.unlink_cbk.op_ret, + stub->args.unlink_cbk.op_errno); + break; + } + + case GF_FOP_RMDIR: + { + if (!stub->args.rmdir_cbk.fn) + STACK_UNWIND (stub->frame, + stub->args.rmdir_cbk.op_ret, + stub->args.rmdir_cbk.op_errno); + else + stub->args.unlink_cbk.fn (stub->frame, + stub->frame->cookie, + stub->frame->this, + stub->args.rmdir_cbk.op_ret, + stub->args.rmdir_cbk.op_errno); + break; + } + + case GF_FOP_SYMLINK: + { + if (!stub->args.symlink_cbk.fn) + STACK_UNWIND (stub->frame, + stub->args.symlink_cbk.op_ret, + stub->args.symlink_cbk.op_errno, + stub->args.symlink_cbk.inode, + &stub->args.symlink_cbk.buf); + else + stub->args.symlink_cbk.fn (stub->frame, + stub->frame->cookie, + stub->frame->this, + stub->args.symlink_cbk.op_ret, + stub->args.symlink_cbk.op_errno, + stub->args.symlink_cbk.inode, + &stub->args.symlink_cbk.buf); + } + break; + + case 
GF_FOP_RENAME: + { +#if 0 + if (!stub->args.rename_cbk.fn) + STACK_UNWIND (stub->frame, + stub->args.rename_cbk.op_ret, + stub->args.rename_cbk.op_errno, + &stub->args.rename_cbk.buf); + else + stub->args.rename_cbk.fn (stub->frame, + stub->frame->cookie, + stub->frame->this, + stub->args.rename_cbk.op_ret, + stub->args.rename_cbk.op_errno, + &stub->args.rename_cbk.buf); +#endif + break; + } + + case GF_FOP_LINK: + { + if (!stub->args.link_cbk.fn) + STACK_UNWIND (stub->frame, + stub->args.link_cbk.op_ret, + stub->args.link_cbk.op_errno, + stub->args.link_cbk.inode, + &stub->args.link_cbk.buf); + else + stub->args.link_cbk.fn (stub->frame, + stub->frame->cookie, + stub->frame->this, + stub->args.link_cbk.op_ret, + stub->args.link_cbk.op_errno, + stub->args.link_cbk.inode, + &stub->args.link_cbk.buf); + break; + } + + case GF_FOP_CHMOD: + { + if (!stub->args.chmod_cbk.fn) + STACK_UNWIND (stub->frame, + stub->args.chmod_cbk.op_ret, + stub->args.chmod_cbk.op_errno, + &stub->args.chmod_cbk.buf); + else + stub->args.chmod_cbk.fn (stub->frame, + stub->frame->cookie, + stub->frame->this, + stub->args.chmod_cbk.op_ret, + stub->args.chmod_cbk.op_errno, + &stub->args.chmod_cbk.buf); + break; + } + + case GF_FOP_CHOWN: + { + if (!stub->args.chown_cbk.fn) + STACK_UNWIND (stub->frame, + stub->args.chown_cbk.op_ret, + stub->args.chown_cbk.op_errno, + &stub->args.chown_cbk.buf); + else + stub->args.chown_cbk.fn (stub->frame, + stub->frame->cookie, + stub->frame->this, + stub->args.chown_cbk.op_ret, + stub->args.chown_cbk.op_errno, + &stub->args.chown_cbk.buf); + break; + } + + case GF_FOP_TRUNCATE: + { + if (!stub->args.truncate_cbk.fn) + STACK_UNWIND (stub->frame, + stub->args.truncate_cbk.op_ret, + stub->args.truncate_cbk.op_errno, + &stub->args.truncate_cbk.buf); + else + stub->args.truncate_cbk.fn (stub->frame, + stub->frame->cookie, + stub->frame->this, + stub->args.truncate_cbk.op_ret, + stub->args.truncate_cbk.op_errno, + &stub->args.truncate_cbk.buf); + break; + } + + case 
GF_FOP_READ: + { + if (!stub->args.readv_cbk.fn) + STACK_UNWIND (stub->frame, + stub->args.readv_cbk.op_ret, + stub->args.readv_cbk.op_errno, + stub->args.readv_cbk.vector, + stub->args.readv_cbk.count, + &stub->args.readv_cbk.stbuf); + else + stub->args.readv_cbk.fn (stub->frame, + stub->frame->cookie, + stub->frame->this, + stub->args.readv_cbk.op_ret, + stub->args.readv_cbk.op_errno, + stub->args.readv_cbk.vector, + stub->args.readv_cbk.count, + &stub->args.readv_cbk.stbuf); + } + break; + + case GF_FOP_WRITE: + { + if (!stub->args.writev_cbk.fn) + STACK_UNWIND (stub->frame, + stub->args.writev_cbk.op_ret, + stub->args.writev_cbk.op_errno, + &stub->args.writev_cbk.stbuf); + else + stub->args.writev_cbk.fn (stub->frame, + stub->frame->cookie, + stub->frame->this, + stub->args.writev_cbk.op_ret, + stub->args.writev_cbk.op_errno, + &stub->args.writev_cbk.stbuf); + break; + } + + case GF_FOP_STATFS: + { + if (!stub->args.statfs_cbk.fn) + STACK_UNWIND (stub->frame, + stub->args.statfs_cbk.op_ret, + stub->args.statfs_cbk.op_errno, + &(stub->args.statfs_cbk.buf)); + else + stub->args.statfs_cbk.fn (stub->frame, + stub->frame->cookie, + stub->frame->this, + stub->args.statfs_cbk.op_ret, + stub->args.statfs_cbk.op_errno, + &(stub->args.statfs_cbk.buf)); + } + break; + + case GF_FOP_FLUSH: + { + if (!stub->args.flush_cbk.fn) + STACK_UNWIND (stub->frame, + stub->args.flush_cbk.op_ret, + stub->args.flush_cbk.op_errno); + else + stub->args.flush_cbk.fn (stub->frame, + stub->frame->cookie, + stub->frame->this, + stub->args.flush_cbk.op_ret, + stub->args.flush_cbk.op_errno); + + break; + } + + case GF_FOP_FSYNC: + { + if (!stub->args.fsync_cbk.fn) + STACK_UNWIND (stub->frame, + stub->args.fsync_cbk.op_ret, + stub->args.fsync_cbk.op_errno); + else + stub->args.fsync_cbk.fn (stub->frame, + stub->frame->cookie, + stub->frame->this, + stub->args.fsync_cbk.op_ret, + stub->args.fsync_cbk.op_errno); + break; + } + + case GF_FOP_SETXATTR: + { + if (!stub->args.setxattr_cbk.fn) + 
STACK_UNWIND (stub->frame, + stub->args.setxattr_cbk.op_ret, + stub->args.setxattr_cbk.op_errno); + + else + stub->args.setxattr_cbk.fn (stub->frame, + stub->frame->cookie, + stub->frame->this, + stub->args.setxattr_cbk.op_ret, + stub->args.setxattr_cbk.op_errno); + + break; + } + + case GF_FOP_GETXATTR: + { + if (!stub->args.getxattr_cbk.fn) + STACK_UNWIND (stub->frame, + stub->args.getxattr_cbk.op_ret, + stub->args.getxattr_cbk.op_errno, + stub->args.getxattr_cbk.dict); + else + stub->args.getxattr_cbk.fn (stub->frame, + stub->frame->cookie, + stub->frame->this, + stub->args.getxattr_cbk.op_ret, + stub->args.getxattr_cbk.op_errno, + stub->args.getxattr_cbk.dict); + break; + } + + case GF_FOP_REMOVEXATTR: + { + if (!stub->args.removexattr_cbk.fn) + STACK_UNWIND (stub->frame, + stub->args.removexattr_cbk.op_ret, + stub->args.removexattr_cbk.op_errno); + else + stub->args.removexattr_cbk.fn (stub->frame, + stub->frame->cookie, + stub->frame->this, + stub->args.removexattr_cbk.op_ret, + stub->args.removexattr_cbk.op_errno); + + break; + } + + case GF_FOP_OPENDIR: + { + if (!stub->args.opendir_cbk.fn) + STACK_UNWIND (stub->frame, + stub->args.opendir_cbk.op_ret, + stub->args.opendir_cbk.op_errno, + stub->args.opendir_cbk.fd); + else + stub->args.opendir_cbk.fn (stub->frame, + stub->frame->cookie, + stub->frame->this, + stub->args.opendir_cbk.op_ret, + stub->args.opendir_cbk.op_errno, + stub->args.opendir_cbk.fd); + break; + } + + case GF_FOP_GETDENTS: + { + if (!stub->args.getdents_cbk.fn) + STACK_UNWIND (stub->frame, + stub->args.getdents_cbk.op_ret, + stub->args.getdents_cbk.op_errno, + &stub->args.getdents_cbk.entries, + stub->args.getdents_cbk.count); + else + stub->args.getdents_cbk.fn (stub->frame, + stub->frame->cookie, + stub->frame->this, + stub->args.getdents_cbk.op_ret, + stub->args.getdents_cbk.op_errno, + &stub->args.getdents_cbk.entries, + stub->args.getdents_cbk.count); + break; + } + + case GF_FOP_FSYNCDIR: + { + if (!stub->args.fsyncdir_cbk.fn) + 
STACK_UNWIND (stub->frame, + stub->args.fsyncdir_cbk.op_ret, + stub->args.fsyncdir_cbk.op_errno); + else + stub->args.fsyncdir_cbk.fn (stub->frame, + stub->frame->cookie, + stub->frame->this, + stub->args.fsyncdir_cbk.op_ret, + stub->args.fsyncdir_cbk.op_errno); + break; + } + + case GF_FOP_ACCESS: + { + if (!stub->args.access_cbk.fn) + STACK_UNWIND (stub->frame, + stub->args.access_cbk.op_ret, + stub->args.access_cbk.op_errno); + else + stub->args.access_cbk.fn (stub->frame, + stub->frame->cookie, + stub->frame->this, + stub->args.access_cbk.op_ret, + stub->args.access_cbk.op_errno); + + break; + } + + case GF_FOP_FTRUNCATE: + { + if (!stub->args.ftruncate_cbk.fn) + STACK_UNWIND (stub->frame, + stub->args.ftruncate_cbk.op_ret, + stub->args.ftruncate_cbk.op_errno, + &stub->args.ftruncate_cbk.buf); + else + stub->args.ftruncate_cbk.fn (stub->frame, + stub->frame->cookie, + stub->frame->this, + stub->args.ftruncate_cbk.op_ret, + stub->args.ftruncate_cbk.op_errno, + &stub->args.ftruncate_cbk.buf); + break; + } + + case GF_FOP_FSTAT: + { + if (!stub->args.fstat_cbk.fn) + STACK_UNWIND (stub->frame, + stub->args.fstat_cbk.op_ret, + stub->args.fstat_cbk.op_errno, + &stub->args.fstat_cbk.buf); + else + stub->args.fstat_cbk.fn (stub->frame, + stub->frame->cookie, + stub->frame->this, + stub->args.fstat_cbk.op_ret, + stub->args.fstat_cbk.op_errno, + &stub->args.fstat_cbk.buf); + + break; + } + + case GF_FOP_LK: + { + if (!stub->args.lk_cbk.fn) + STACK_UNWIND (stub->frame, + stub->args.lk_cbk.op_ret, + stub->args.lk_cbk.op_errno, + &stub->args.lk_cbk.lock); + else + stub->args.lk_cbk.fn (stub->frame, + stub->frame->cookie, + stub->frame->this, + stub->args.lk_cbk.op_ret, + stub->args.lk_cbk.op_errno, + &stub->args.lk_cbk.lock); + break; + } + + case GF_FOP_INODELK: + { + if (!stub->args.inodelk_cbk.fn) + STACK_UNWIND (stub->frame, + stub->args.inodelk_cbk.op_ret, + stub->args.inodelk_cbk.op_errno); + + else + stub->args.inodelk_cbk.fn (stub->frame, + stub->frame->cookie, + 
stub->frame->this, + stub->args.inodelk_cbk.op_ret, + stub->args.inodelk_cbk.op_errno); + break; + } + + case GF_FOP_FINODELK: + { + if (!stub->args.finodelk_cbk.fn) + STACK_UNWIND (stub->frame, + stub->args.finodelk_cbk.op_ret, + stub->args.finodelk_cbk.op_errno); + + else + stub->args.finodelk_cbk.fn (stub->frame, + stub->frame->cookie, + stub->frame->this, + stub->args.finodelk_cbk.op_ret, + stub->args.finodelk_cbk.op_errno); + break; + } + + case GF_FOP_ENTRYLK: + { + if (!stub->args.entrylk_cbk.fn) + STACK_UNWIND (stub->frame, + stub->args.entrylk_cbk.op_ret, + stub->args.entrylk_cbk.op_errno); + + else + stub->args.entrylk_cbk.fn (stub->frame, + stub->frame->cookie, + stub->frame->this, + stub->args.entrylk_cbk.op_ret, + stub->args.entrylk_cbk.op_errno); + break; + } + + case GF_FOP_FENTRYLK: + { + if (!stub->args.fentrylk_cbk.fn) + STACK_UNWIND (stub->frame, + stub->args.fentrylk_cbk.op_ret, + stub->args.fentrylk_cbk.op_errno); + + else + stub->args.fentrylk_cbk.fn (stub->frame, + stub->frame->cookie, + stub->frame->this, + stub->args.fentrylk_cbk.op_ret, + stub->args.fentrylk_cbk.op_errno); + break; + } + + case GF_FOP_UTIMENS: + { + if (!stub->args.utimens_cbk.fn) + STACK_UNWIND (stub->frame, + stub->args.utimens_cbk.op_ret, + stub->args.utimens_cbk.op_errno, + &stub->args.utimens_cbk.buf); + else + stub->args.utimens_cbk.fn (stub->frame, + stub->frame->cookie, + stub->frame->this, + stub->args.utimens_cbk.op_ret, + stub->args.utimens_cbk.op_errno, + &stub->args.utimens_cbk.buf); + + break; + } + + + break; + case GF_FOP_FCHMOD: + { + if (!stub->args.fchmod_cbk.fn) + STACK_UNWIND (stub->frame, + stub->args.fchmod_cbk.op_ret, + stub->args.fchmod_cbk.op_errno, + &stub->args.fchmod_cbk.buf); + else + stub->args.fchmod_cbk.fn (stub->frame, + stub->frame->cookie, + stub->frame->this, + stub->args.fchmod_cbk.op_ret, + stub->args.fchmod_cbk.op_errno, + &stub->args.fchmod_cbk.buf); + break; + } + + case GF_FOP_FCHOWN: + { + if (!stub->args.fchown_cbk.fn) + 
STACK_UNWIND (stub->frame, + stub->args.fchown_cbk.op_ret, + stub->args.fchown_cbk.op_errno, + &stub->args.fchown_cbk.buf); + else + stub->args.fchown_cbk.fn (stub->frame, + stub->frame->cookie, + stub->frame->this, + stub->args.fchown_cbk.op_ret, + stub->args.fchown_cbk.op_errno, + &stub->args.fchown_cbk.buf); + break; + } + + case GF_FOP_LOOKUP: + { + if (!stub->args.lookup_cbk.fn) + STACK_UNWIND (stub->frame, + stub->args.lookup_cbk.op_ret, + stub->args.lookup_cbk.op_errno, + stub->args.lookup_cbk.inode, + &stub->args.lookup_cbk.buf, + stub->args.lookup_cbk.dict); + else + stub->args.lookup_cbk.fn (stub->frame, + stub->frame->cookie, + stub->frame->this, + stub->args.lookup_cbk.op_ret, + stub->args.lookup_cbk.op_errno, + stub->args.lookup_cbk.inode, + &stub->args.lookup_cbk.buf, + stub->args.lookup_cbk.dict); + /* FIXME NULL should not be passed */ + + if (stub->args.lookup_cbk.dict) + dict_unref (stub->args.lookup_cbk.dict); + if (stub->args.lookup_cbk.inode) + inode_unref (stub->args.lookup_cbk.inode); + + break; + } + case GF_FOP_SETDENTS: + { + if (!stub->args.setdents_cbk.fn) + STACK_UNWIND (stub->frame, + stub->args.setdents_cbk.op_ret, + stub->args.setdents_cbk.op_errno); + else + stub->args.setdents_cbk.fn (stub->frame, + stub->frame->cookie, + stub->frame->this, + stub->args.setdents_cbk.op_ret, + stub->args.setdents_cbk.op_errno); + break; + } + + case GF_FOP_CHECKSUM: + { + if (!stub->args.checksum_cbk.fn) + STACK_UNWIND (stub->frame, + stub->args.checksum_cbk.op_ret, + stub->args.checksum_cbk.op_errno, + stub->args.checksum_cbk.file_checksum, + stub->args.checksum_cbk.dir_checksum); + else + stub->args.checksum_cbk.fn (stub->frame, + stub->frame->cookie, + stub->frame->this, + stub->args.checksum_cbk.op_ret, + stub->args.checksum_cbk.op_errno, + stub->args.checksum_cbk.file_checksum, + stub->args.checksum_cbk.dir_checksum); + if (stub->args.checksum_cbk.op_ret >= 0) + { + FREE (stub->args.checksum_cbk.file_checksum); + FREE 
(stub->args.checksum_cbk.dir_checksum); + } + + break; + } + + case GF_FOP_READDIR: + { + if (!stub->args.readdir_cbk.fn) + STACK_UNWIND (stub->frame, + stub->args.readdir_cbk.op_ret, + stub->args.readdir_cbk.op_errno, + &stub->args.readdir_cbk.entries); + else + stub->args.readdir_cbk.fn (stub->frame, + stub->frame->cookie, + stub->frame->this, + stub->args.readdir_cbk.op_ret, + stub->args.readdir_cbk.op_errno, + &stub->args.readdir_cbk.entries); + + if (stub->args.readdir_cbk.op_ret > 0) + gf_dirent_free (&stub->args.readdir_cbk.entries); + + break; + } + + case GF_FOP_XATTROP: + { + if (!stub->args.xattrop_cbk.fn) + STACK_UNWIND (stub->frame, + stub->args.xattrop_cbk.op_ret, + stub->args.xattrop_cbk.op_errno); + else + stub->args.xattrop_cbk.fn (stub->frame, + stub->frame->cookie, + stub->frame->this, + stub->args.xattrop_cbk.op_ret, + stub->args.xattrop_cbk.op_errno, + stub->args.xattrop_cbk.xattr); + + if (stub->args.xattrop_cbk.xattr) + dict_unref (stub->args.xattrop_cbk.xattr); + + break; + } + case GF_FOP_FXATTROP: + { + if (!stub->args.fxattrop_cbk.fn) + STACK_UNWIND (stub->frame, + stub->args.fxattrop_cbk.op_ret, + stub->args.fxattrop_cbk.op_errno); + else + stub->args.fxattrop_cbk.fn (stub->frame, + stub->frame->cookie, + stub->frame->this, + stub->args.fxattrop_cbk.op_ret, + stub->args.fxattrop_cbk.op_errno, + stub->args.fxattrop_cbk.xattr); + + if (stub->args.fxattrop_cbk.xattr) + dict_unref (stub->args.fxattrop_cbk.xattr); + + break; + } + case GF_FOP_MAXVALUE: + { + gf_log ("call-stub", + GF_LOG_DEBUG, + "Invalid value of FOP"); + } + break; + } +out: + return; +} + + +static void +call_stub_destroy_wind (call_stub_t *stub) +{ + switch (stub->fop) { + case GF_FOP_OPEN: + { + loc_wipe (&stub->args.open.loc); + if (stub->args.open.fd) + fd_unref (stub->args.open.fd); + break; + } + case GF_FOP_CREATE: + { + loc_wipe (&stub->args.create.loc); + if (stub->args.create.fd) + fd_unref (stub->args.create.fd); + break; + } + case GF_FOP_STAT: + { + loc_wipe 
(&stub->args.stat.loc); + break; + } + case GF_FOP_READLINK: + { + loc_wipe (&stub->args.readlink.loc); + break; + } + + case GF_FOP_MKNOD: + { + loc_wipe (&stub->args.mknod.loc); + } + break; + + case GF_FOP_MKDIR: + { + loc_wipe (&stub->args.mkdir.loc); + } + break; + + case GF_FOP_UNLINK: + { + loc_wipe (&stub->args.unlink.loc); + } + break; + + case GF_FOP_RMDIR: + { + loc_wipe (&stub->args.rmdir.loc); + } + break; + + case GF_FOP_SYMLINK: + { + FREE (stub->args.symlink.linkname); + loc_wipe (&stub->args.symlink.loc); + } + break; + + case GF_FOP_RENAME: + { + loc_wipe (&stub->args.rename.old); + loc_wipe (&stub->args.rename.new); + } + break; + + case GF_FOP_LINK: + { + loc_wipe (&stub->args.link.oldloc); + loc_wipe (&stub->args.link.newloc); + } + break; + + case GF_FOP_CHMOD: + { + loc_wipe (&stub->args.chmod.loc); + } + break; + + case GF_FOP_CHOWN: + { + loc_wipe (&stub->args.chown.loc); + break; + } + case GF_FOP_TRUNCATE: + { + loc_wipe (&stub->args.truncate.loc); + break; + } + + case GF_FOP_READ: + { + if (stub->args.readv.fd) + fd_unref (stub->args.readv.fd); + break; + } + + case GF_FOP_WRITE: + { + dict_t *refs = stub->args.writev.req_refs; + if (stub->args.writev.fd) + fd_unref (stub->args.writev.fd); + FREE (stub->args.writev.vector); + if (refs) + dict_unref (refs); + break; + } + + case GF_FOP_STATFS: + { + loc_wipe (&stub->args.statfs.loc); + break; + } + case GF_FOP_FLUSH: + { + if (stub->args.flush.fd) + fd_unref (stub->args.flush.fd); + break; + } + + case GF_FOP_FSYNC: + { + if (stub->args.fsync.fd) + fd_unref (stub->args.fsync.fd); + break; + } + + case GF_FOP_SETXATTR: + { + loc_wipe (&stub->args.setxattr.loc); + if (stub->args.setxattr.dict) + dict_unref (stub->args.setxattr.dict); + break; + } + + case GF_FOP_GETXATTR: + { + if (stub->args.getxattr.name) + FREE (stub->args.getxattr.name); + loc_wipe (&stub->args.getxattr.loc); + break; + } + + case GF_FOP_REMOVEXATTR: + { + loc_wipe (&stub->args.removexattr.loc); + FREE 
(stub->args.removexattr.name); + break; + } + + case GF_FOP_OPENDIR: + { + loc_wipe (&stub->args.opendir.loc); + if (stub->args.opendir.fd) + fd_unref (stub->args.opendir.fd); + break; + } + + case GF_FOP_GETDENTS: + { + if (stub->args.getdents.fd) + fd_unref (stub->args.getdents.fd); + break; + } + + case GF_FOP_FSYNCDIR: + { + if (stub->args.fsyncdir.fd) + fd_unref (stub->args.fsyncdir.fd); + break; + } + + case GF_FOP_ACCESS: + { + loc_wipe (&stub->args.access.loc); + break; + } + + case GF_FOP_FTRUNCATE: + { + if (stub->args.ftruncate.fd) + fd_unref (stub->args.ftruncate.fd); + break; + } + + case GF_FOP_FSTAT: + { + if (stub->args.fstat.fd) + fd_unref (stub->args.fstat.fd); + break; + } + + case GF_FOP_LK: + { + if (stub->args.lk.fd) + fd_unref (stub->args.lk.fd); + break; + } + + case GF_FOP_INODELK: + { + loc_wipe (&stub->args.inodelk.loc); + break; + } + case GF_FOP_FINODELK: + { + if (stub->args.finodelk.fd) + fd_unref (stub->args.finodelk.fd); + break; + } + case GF_FOP_ENTRYLK: + { + if (stub->args.entrylk.name) + FREE (stub->args.entrylk.name); + loc_wipe (&stub->args.entrylk.loc); + break; + } + case GF_FOP_FENTRYLK: + { + if (stub->args.fentrylk.name) + FREE (stub->args.fentrylk.name); + + if (stub->args.fentrylk.fd) + fd_unref (stub->args.fentrylk.fd); + break; + } + case GF_FOP_UTIMENS: + { + loc_wipe (&stub->args.utimens.loc); + break; + } + break; + case GF_FOP_FCHMOD: + { + if (stub->args.fchmod.fd) + fd_unref (stub->args.fchmod.fd); + break; + } + + case GF_FOP_FCHOWN: + { + if (stub->args.fchown.fd) + fd_unref (stub->args.fchown.fd); + break; + } + + case GF_FOP_LOOKUP: + { + loc_wipe (&stub->args.lookup.loc); + if (stub->args.lookup.xattr_req) + dict_unref (stub->args.lookup.xattr_req); + break; + } + + case GF_FOP_SETDENTS: + { + dir_entry_t *entry, *next; + if (stub->args.setdents.fd) + fd_unref (stub->args.setdents.fd); + entry = stub->args.setdents.entries.next; + while (entry) { + next = entry->next; + FREE (entry->name); + FREE (entry); 
+ entry = next; + } + break; + } + + case GF_FOP_CHECKSUM: + { + loc_wipe (&stub->args.checksum.loc); + break; + } + break; + case GF_FOP_READDIR: + { + if (stub->args.readdir.fd) + fd_unref (stub->args.readdir.fd); + break; + } + case GF_FOP_XATTROP: + { + loc_wipe (&stub->args.xattrop.loc); + dict_unref (stub->args.xattrop.xattr); + break; + } + case GF_FOP_FXATTROP: + { + if (stub->args.fxattrop.fd) + fd_unref (stub->args.fxattrop.fd); + dict_unref (stub->args.xattrop.xattr); + break; + } + case GF_FOP_MAXVALUE: + { + gf_log ("call-stub", + GF_LOG_DEBUG, + "Invalid value of FOP"); + } + break; + default: + break; + } +} + + +static void +call_stub_destroy_unwind (call_stub_t *stub) +{ + switch (stub->fop) { + case GF_FOP_OPEN: + { + if (stub->args.open_cbk.fd) + fd_unref (stub->args.open_cbk.fd); + } + break; + + case GF_FOP_CREATE: + { + if (stub->args.create_cbk.fd) + fd_unref (stub->args.create_cbk.fd); + + if (stub->args.create_cbk.inode) + inode_unref (stub->args.create_cbk.inode); + } + break; + + case GF_FOP_STAT: + break; + + case GF_FOP_READLINK: + { + if (stub->args.readlink_cbk.buf) + FREE (stub->args.readlink_cbk.buf); + } + break; + + case GF_FOP_MKNOD: + { + if (stub->args.mknod_cbk.inode) + inode_unref (stub->args.mknod_cbk.inode); + } + break; + + case GF_FOP_MKDIR: + { + if (stub->args.mkdir_cbk.inode) + inode_unref (stub->args.mkdir_cbk.inode); + } + break; + + case GF_FOP_UNLINK: + break; + + case GF_FOP_RMDIR: + break; + + case GF_FOP_SYMLINK: + { + if (stub->args.symlink_cbk.inode) + inode_unref (stub->args.symlink_cbk.inode); + } + break; + + case GF_FOP_RENAME: + break; + + case GF_FOP_LINK: + { + if (stub->args.link_cbk.inode) + inode_unref (stub->args.link_cbk.inode); + } + break; + + case GF_FOP_CHMOD: + break; + + case GF_FOP_CHOWN: + break; + + case GF_FOP_TRUNCATE: + break; + + case GF_FOP_READ: + { + if (stub->args.readv_cbk.op_ret >= 0) { + dict_t *refs = stub->args.readv_cbk.rsp_refs; + FREE (stub->args.readv_cbk.vector); + + if 
(refs) { + dict_unref (refs); + } + } + } + break; + + case GF_FOP_WRITE: + break; + + case GF_FOP_STATFS: + break; + + case GF_FOP_FLUSH: + break; + + case GF_FOP_FSYNC: + break; + + case GF_FOP_SETXATTR: + break; + + case GF_FOP_GETXATTR: + { + if (stub->args.getxattr_cbk.dict) + dict_unref (stub->args.getxattr_cbk.dict); + } + break; + + case GF_FOP_REMOVEXATTR: + break; + + case GF_FOP_OPENDIR: + { + if (stub->args.opendir_cbk.fd) + fd_unref (stub->args.opendir_cbk.fd); + } + break; + + case GF_FOP_GETDENTS: + { + dir_entry_t *tmp = NULL, *entries = NULL; + + entries = &stub->args.getdents_cbk.entries; + if (stub->args.getdents_cbk.op_ret >= 0) { + while (entries->next) { + tmp = entries->next; + entries->next = entries->next->next; + FREE (tmp->name); + FREE (tmp); + } + } + } + break; + + case GF_FOP_FSYNCDIR: + break; + + case GF_FOP_ACCESS: + break; + + case GF_FOP_FTRUNCATE: + break; + + case GF_FOP_FSTAT: + break; + + case GF_FOP_LK: + break; + + case GF_FOP_INODELK: + break; + + case GF_FOP_FINODELK: + break; + + case GF_FOP_ENTRYLK: + break; + + case GF_FOP_FENTRYLK: + break; + + case GF_FOP_UTIMENS: + break; + + case GF_FOP_FCHMOD: + break; + + case GF_FOP_FCHOWN: + break; + + case GF_FOP_LOOKUP: + { + if (stub->args.lookup_cbk.inode) + inode_unref (stub->args.lookup_cbk.inode); + + if (stub->args.lookup_cbk.dict) + dict_unref (stub->args.lookup_cbk.dict); + } + break; + + case GF_FOP_SETDENTS: + break; + + case GF_FOP_CHECKSUM: + { + if (stub->args.checksum_cbk.op_ret >= 0) { + FREE (stub->args.checksum_cbk.file_checksum); + FREE (stub->args.checksum_cbk.dir_checksum); + } + } + break; + + case GF_FOP_READDIR: + { + if (stub->args.readdir_cbk.op_ret > 0) { + gf_dirent_free (&stub->args.readdir_cbk.entries); + } + } + break; + + case GF_FOP_XATTROP: + { + if (stub->args.xattrop_cbk.xattr) + dict_unref (stub->args.xattrop_cbk.xattr); + } + break; + + case GF_FOP_FXATTROP: + { + if (stub->args.fxattrop_cbk.xattr) + dict_unref 
(stub->args.fxattrop_cbk.xattr); + } + break; + + case GF_FOP_MAXVALUE: + { + gf_log ("call-stub", + GF_LOG_DEBUG, + "Invalid value of FOP"); + } + break; + + default: + break; + } +} + + +void +call_stub_destroy (call_stub_t *stub) +{ + GF_VALIDATE_OR_GOTO ("call-stub", stub, out); + + if (stub->wind) { + call_stub_destroy_wind (stub); + } else { + call_stub_destroy_unwind (stub); + } + + FREE (stub); +out: + return; +} + +void +call_resume (call_stub_t *stub) +{ + errno = EINVAL; + GF_VALIDATE_OR_GOTO ("call-stub", stub, out); + + list_del_init (&stub->list); + + if (stub->wind) + call_resume_wind (stub); + else + call_resume_unwind (stub); + + call_stub_destroy (stub); +out: + return; +} + + diff --git a/libglusterfs/src/call-stub.h b/libglusterfs/src/call-stub.h new file mode 100644 index 000000000..12e94a158 --- /dev/null +++ b/libglusterfs/src/call-stub.h @@ -0,0 +1,1104 @@ +/* + Copyright (c) 2007, 2008 Z RESEARCH, Inc. + This file is part of GlusterFS. + + GlusterFS is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published + by the Free Software Foundation; either version 3 of the License, + or (at your option) any later version. + + GlusterFS is distributed in the hope that it will be useful, but + WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program. If not, see + . 
/*
 * call_stub_t -- a stored file operation (fop) that can be replayed later
 * with call_resume() or discarded with call_stub_destroy().
 *
 * A stub records either a request ("wind") or a reply ("unwind"); 'wind'
 * says which, and 'fop' says which member of the 'args' union is in use.
 * For every fop there is a pair of union members: <fop> holds the request
 * arguments and <fop>_cbk holds the reply arguments.  In each member 'fn'
 * is the function the stub was created with.
 *
 * Only the member matching (fop, wind) may be read: members for different
 * fops place same-named fields at different offsets (e.g. 'xattr' in
 * xattrop vs. fxattrop).
 */
typedef struct {
        struct list_head list;          /* linkage; call_resume() unlinks it with list_del_init() */
        char wind;                      /* non-zero: request stub, zero: reply stub */
        call_frame_t *frame;            /* frame handed to STACK_UNWIND / the reply callback */
        glusterfs_fop_t fop;            /* selects the live member of 'args' */

        union {
                /* lookup */
                struct {
                        fop_lookup_t fn;
                        loc_t loc;
                        dict_t *xattr_req;
                } lookup;
                struct {
                        fop_lookup_cbk_t fn;
                        int32_t op_ret, op_errno;
                        inode_t *inode;
                        struct stat buf;
                        dict_t *dict;
                } lookup_cbk;

                /* stat */
                struct {
                        fop_stat_t fn;
                        loc_t loc;
                } stat;
                struct {
                        fop_stat_cbk_t fn;
                        int32_t op_ret, op_errno;
                        struct stat buf;
                } stat_cbk;

                /* fstat */
                struct {
                        fop_fstat_t fn;
                        fd_t *fd;
                } fstat;
                struct {
                        fop_fstat_cbk_t fn;
                        int32_t op_ret, op_errno;
                        struct stat buf;
                } fstat_cbk;

                /* chmod */
                struct {
                        fop_chmod_t fn;
                        loc_t loc;
                        mode_t mode;
                } chmod;
                struct {
                        fop_chmod_cbk_t fn;
                        int32_t op_ret, op_errno;
                        struct stat buf;
                } chmod_cbk;

                /* fchmod */
                struct {
                        fop_fchmod_t fn;
                        fd_t *fd;
                        mode_t mode;
                } fchmod;
                struct {
                        fop_fchmod_cbk_t fn;
                        int32_t op_ret, op_errno;
                        struct stat buf;
                } fchmod_cbk;

                /* chown */
                struct {
                        fop_chown_t fn;
                        loc_t loc;
                        uid_t uid;
                        gid_t gid;
                } chown;
                struct {
                        fop_chown_cbk_t fn;
                        int32_t op_ret, op_errno;
                        struct stat buf;
                } chown_cbk;

                /* fchown */
                struct {
                        fop_fchown_t fn;
                        fd_t *fd;
                        uid_t uid;
                        gid_t gid;
                } fchown;
                struct {
                        fop_fchown_cbk_t fn;
                        int32_t op_ret, op_errno;
                        struct stat buf;
                } fchown_cbk;

                /* truncate */
                struct {
                        fop_truncate_t fn;
                        loc_t loc;
                        off_t off;
                } truncate;
                struct {
                        fop_truncate_cbk_t fn;
                        int32_t op_ret, op_errno;
                        struct stat buf;
                } truncate_cbk;

                /* ftruncate */
                struct {
                        fop_ftruncate_t fn;
                        fd_t *fd;
                        off_t off;
                } ftruncate;
                struct {
                        fop_ftruncate_cbk_t fn;
                        int32_t op_ret, op_errno;
                        struct stat buf;
                } ftruncate_cbk;

                /* utimens */
                struct {
                        fop_utimens_t fn;
                        loc_t loc;
                        struct timespec tv[2];
                } utimens;
                struct {
                        fop_utimens_cbk_t fn;
                        int32_t op_ret, op_errno;
                        struct stat buf;
                } utimens_cbk;

                /* access */
                struct {
                        fop_access_t fn;
                        loc_t loc;
                        int32_t mask;
                } access;
                struct {
                        fop_access_cbk_t fn;
                        int32_t op_ret, op_errno;
                } access_cbk;

                /* readlink */
                struct {
                        fop_readlink_t fn;
                        loc_t loc;
                        size_t size;
                } readlink;
                struct {
                        fop_readlink_cbk_t fn;
                        int32_t op_ret, op_errno;
                        const char *buf;        /* freed by call_stub_destroy_unwind() when set */
                } readlink_cbk;

                /* mknod */
                struct {
                        fop_mknod_t fn;
                        loc_t loc;
                        mode_t mode;
                        dev_t rdev;
                } mknod;
                struct {
                        fop_mknod_cbk_t fn;
                        int32_t op_ret, op_errno;
                        inode_t *inode;
                        struct stat buf;
                } mknod_cbk;

                /* mkdir */
                struct {
                        fop_mkdir_t fn;
                        loc_t loc;
                        mode_t mode;
                } mkdir;
                struct {
                        fop_mkdir_cbk_t fn;
                        int32_t op_ret, op_errno;
                        inode_t *inode;
                        struct stat buf;
                } mkdir_cbk;

                /* unlink */
                struct {
                        fop_unlink_t fn;
                        loc_t loc;
                } unlink;
                struct {
                        fop_unlink_cbk_t fn;
                        int32_t op_ret, op_errno;
                } unlink_cbk;

                /* rmdir */
                struct {
                        fop_rmdir_t fn;
                        loc_t loc;
                } rmdir;
                struct {
                        fop_rmdir_cbk_t fn;
                        int32_t op_ret, op_errno;
                } rmdir_cbk;

                /* symlink */
                struct {
                        fop_symlink_t fn;
                        const char *linkname;   /* freed by call_stub_destroy_wind() */
                        loc_t loc;
                } symlink;
                struct {
                        fop_symlink_cbk_t fn;
                        int32_t op_ret, op_errno;
                        inode_t *inode;
                        struct stat buf;
                } symlink_cbk;

                /* rename */
                struct {
                        fop_rename_t fn;
                        loc_t old;
                        loc_t new;
                } rename;
                struct {
                        fop_rename_cbk_t fn;
                        int32_t op_ret, op_errno;
                        struct stat buf;
                } rename_cbk;

                /* link */
                struct {
                        fop_link_t fn;
                        loc_t oldloc;
                        loc_t newloc;
                } link;
                struct {
                        fop_link_cbk_t fn;
                        int32_t op_ret, op_errno;
                        inode_t *inode;
                        struct stat buf;
                } link_cbk;

                /* create */
                struct {
                        fop_create_t fn;
                        loc_t loc;
                        int32_t flags;
                        mode_t mode;
                        fd_t *fd;
                } create;
                struct {
                        fop_create_cbk_t fn;
                        int32_t op_ret, op_errno;
                        fd_t *fd;
                        inode_t *inode;
                        struct stat buf;
                } create_cbk;

                /* open */
                struct {
                        fop_open_t fn;
                        loc_t loc;
                        int32_t flags;
                        fd_t *fd;
                } open;
                struct {
                        fop_open_cbk_t fn;
                        int32_t op_ret, op_errno;
                        fd_t *fd;
                } open_cbk;

                /* readv */
                struct {
                        fop_readv_t fn;
                        fd_t *fd;
                        size_t size;
                        off_t off;
                } readv;
                struct {
                        fop_readv_cbk_t fn;
                        int32_t op_ret;
                        int32_t op_errno;
                        struct iovec *vector;   /* freed on destroy when op_ret >= 0 */
                        int32_t count;
                        struct stat stbuf;
                        dict_t *rsp_refs;       /* unref'd on destroy when op_ret >= 0 */
                } readv_cbk;

                /* writev */
                struct {
                        fop_writev_t fn;
                        fd_t *fd;
                        struct iovec *vector;   /* freed by call_stub_destroy_wind() */
                        int32_t count;
                        off_t off;
                        dict_t *req_refs;
                } writev;
                struct {
                        fop_writev_cbk_t fn;
                        int32_t op_ret, op_errno;
                        struct stat stbuf;
                } writev_cbk;

                /* flush */
                struct {
                        fop_flush_t fn;
                        fd_t *fd;
                } flush;
                struct {
                        fop_flush_cbk_t fn;
                        int32_t op_ret, op_errno;
                } flush_cbk;

                /* fsync */
                struct {
                        fop_fsync_t fn;
                        fd_t *fd;
                        int32_t datasync;
                } fsync;
                struct {
                        fop_fsync_cbk_t fn;
                        int32_t op_ret, op_errno;
                } fsync_cbk;

                /* opendir */
                struct {
                        fop_opendir_t fn;
                        loc_t loc;
                        fd_t *fd;
                } opendir;
                struct {
                        fop_opendir_cbk_t fn;
                        int32_t op_ret, op_errno;
                        fd_t *fd;
                } opendir_cbk;

                /* getdents */
                struct {
                        fop_getdents_t fn;
                        fd_t *fd;
                        size_t size;
                        off_t off;
                        int32_t flag;
                } getdents;
                struct {
                        fop_getdents_cbk_t fn;
                        int32_t op_ret;
                        int32_t op_errno;
                        dir_entry_t entries;    /* embedded list head; chained nodes are freed on destroy */
                        int32_t count;
                } getdents_cbk;

                /* setdents */
                struct {
                        fop_setdents_t fn;
                        fd_t *fd;
                        int32_t flags;
                        dir_entry_t entries;    /* embedded list head; chained nodes are freed on destroy */
                        int32_t count;
                } setdents;
                struct {
                        fop_setdents_cbk_t fn;
                        int32_t op_ret;
                        int32_t op_errno;
                } setdents_cbk;

                /* fsyncdir */
                struct {
                        fop_fsyncdir_t fn;
                        fd_t *fd;
                        int32_t datasync;
                } fsyncdir;
                struct {
                        fop_fsyncdir_cbk_t fn;
                        int32_t op_ret, op_errno;
                } fsyncdir_cbk;

                /* statfs */
                struct {
                        fop_statfs_t fn;
                        loc_t loc;
                } statfs;
                struct {
                        fop_statfs_cbk_t fn;
                        int32_t op_ret, op_errno;
                        struct statvfs buf;
                } statfs_cbk;

                /* setxattr */
                struct {
                        fop_setxattr_t fn;
                        loc_t loc;
                        dict_t *dict;
                        int32_t flags;
                } setxattr;
                struct {
                        fop_setxattr_cbk_t fn;
                        int32_t op_ret, op_errno;
                } setxattr_cbk;

                /* getxattr */
                struct {
                        fop_getxattr_t fn;
                        loc_t loc;
                        const char *name;       /* freed by call_stub_destroy_wind() when set */
                } getxattr;
                struct {
                        fop_getxattr_cbk_t fn;
                        int32_t op_ret, op_errno;
                        dict_t *dict;
                } getxattr_cbk;

                /* removexattr */
                struct {
                        fop_removexattr_t fn;
                        loc_t loc;
                        const char *name;       /* freed by call_stub_destroy_wind() */
                } removexattr;
                struct {
                        fop_removexattr_cbk_t fn;
                        int32_t op_ret, op_errno;
                } removexattr_cbk;

                /* lk */
                struct {
                        fop_lk_t fn;
                        fd_t *fd;
                        int32_t cmd;
                        struct flock lock;
                } lk;
                struct {
                        fop_lk_cbk_t fn;
                        int32_t op_ret, op_errno;
                        struct flock lock;
                } lk_cbk;

                /* inodelk */
                struct {
                        fop_inodelk_t fn;
                        loc_t loc;
                        int32_t cmd;
                        struct flock lock;
                } inodelk;

                struct {
                        fop_inodelk_cbk_t fn;
                        int32_t op_ret, op_errno;
                } inodelk_cbk;

                /* finodelk */
                struct {
                        fop_finodelk_t fn;
                        fd_t *fd;
                        int32_t cmd;
                        struct flock lock;
                } finodelk;

                struct {
                        fop_finodelk_cbk_t fn;
                        int32_t op_ret, op_errno;
                } finodelk_cbk;

                /* entrylk */
                struct {
                        fop_entrylk_t fn;
                        loc_t loc;
                        const char *name;       /* freed by call_stub_destroy_wind() when set */
                        entrylk_cmd cmd;
                        entrylk_type type;
                } entrylk;

                struct {
                        fop_entrylk_cbk_t fn;
                        int32_t op_ret, op_errno;
                } entrylk_cbk;

                /* fentrylk */
                struct {
                        fop_fentrylk_t fn;
                        fd_t *fd;
                        const char *name;       /* freed by call_stub_destroy_wind() when set */
                        entrylk_cmd cmd;
                        entrylk_type type;
                } fentrylk;

                struct {
                        fop_fentrylk_cbk_t fn;
                        int32_t op_ret, op_errno;
                } fentrylk_cbk;

                /* readdir */
                struct {
                        fop_readdir_t fn;
                        fd_t *fd;
                        size_t size;
                        off_t off;
                } readdir;
                struct {
                        fop_readdir_cbk_t fn;
                        int32_t op_ret, op_errno;
                        gf_dirent_t entries;    /* released with gf_dirent_free() when op_ret > 0 */
                } readdir_cbk;

                /* checksum */
                struct {
                        fop_checksum_t fn;
                        loc_t loc;
                        int32_t flags;
                } checksum;
                struct {
                        fop_checksum_cbk_t fn;
                        int32_t op_ret, op_errno;
                        uint8_t *file_checksum; /* freed on destroy when op_ret >= 0 */
                        uint8_t *dir_checksum;  /* freed on destroy when op_ret >= 0 */
                } checksum_cbk;

                /* xattrop */
                struct {
                        fop_xattrop_t fn;
                        loc_t loc;
                        gf_xattrop_flags_t optype;
                        dict_t *xattr;
                } xattrop;
                struct {
                        fop_xattrop_cbk_t fn;
                        int32_t op_ret;
                        int32_t op_errno;
                        dict_t *xattr;
                } xattrop_cbk;

                /* fxattrop -- same field names as xattrop, but 'fd' replaces
                   'loc', so 'optype' and 'xattr' sit at different offsets */
                struct {
                        fop_fxattrop_t fn;
                        fd_t *fd;
                        gf_xattrop_flags_t optype;
                        dict_t *xattr;
                } fxattrop;
                struct {
                        fop_fxattrop_cbk_t fn;
                        int32_t op_ret;
                        int32_t op_errno;
                        dict_t *xattr;
                } fxattrop_cbk;
        } args;
} call_stub_t;
fop_chown_cbk_t fn, + int32_t op_ret, + int32_t op_errno, + struct stat *buf); + +call_stub_t * +fop_fchown_stub (call_frame_t *frame, + fop_fchown_t fn, + fd_t *fd, + uid_t uid, + gid_t gid); + +call_stub_t * +fop_fchown_cbk_stub (call_frame_t *frame, + fop_fchown_cbk_t fn, + int32_t op_ret, + int32_t op_errno, + struct stat *buf); + +call_stub_t * +fop_truncate_stub (call_frame_t *frame, + fop_truncate_t fn, + loc_t *loc, + off_t off); + +call_stub_t * +fop_truncate_cbk_stub (call_frame_t *frame, + fop_truncate_cbk_t fn, + int32_t op_ret, + int32_t op_errno, + struct stat *buf); + +call_stub_t * +fop_ftruncate_stub (call_frame_t *frame, + fop_ftruncate_t fn, + fd_t *fd, + off_t off); + +call_stub_t * +fop_ftruncate_cbk_stub (call_frame_t *frame, + fop_ftruncate_cbk_t fn, + int32_t op_ret, + int32_t op_errno, + struct stat *buf); + +call_stub_t * +fop_utimens_stub (call_frame_t *frame, + fop_utimens_t fn, + loc_t *loc, + struct timespec tv[2]); + +call_stub_t * +fop_utimens_cbk_stub (call_frame_t *frame, + fop_utimens_cbk_t fn, + int32_t op_ret, + int32_t op_errno, + struct stat *buf); + +call_stub_t * +fop_access_stub (call_frame_t *frame, + fop_access_t fn, + loc_t *loc, + int32_t mask); + +call_stub_t * +fop_access_cbk_stub (call_frame_t *frame, + fop_access_cbk_t fn, + int32_t op_ret, + int32_t op_errno); + +call_stub_t * +fop_readlink_stub (call_frame_t *frame, + fop_readlink_t fn, + loc_t *loc, + size_t size); + +call_stub_t * +fop_readlink_cbk_stub (call_frame_t *frame, + fop_readlink_cbk_t fn, + int32_t op_ret, + int32_t op_errno, + const char *path); + +call_stub_t * +fop_mknod_stub (call_frame_t *frame, + fop_mknod_t fn, + loc_t *loc, + mode_t mode, + dev_t rdev); + +call_stub_t * +fop_mknod_cbk_stub (call_frame_t *frame, + fop_mknod_cbk_t fn, + int32_t op_ret, + int32_t op_errno, + inode_t *inode, + struct stat *buf); + +call_stub_t * +fop_mkdir_stub (call_frame_t *frame, + fop_mkdir_t fn, + loc_t *loc, + mode_t mode); + +call_stub_t * 
+fop_mkdir_cbk_stub (call_frame_t *frame, + fop_mkdir_cbk_t fn, + int32_t op_ret, + int32_t op_errno, + inode_t *inode, + struct stat *buf); + +call_stub_t * +fop_unlink_stub (call_frame_t *frame, + fop_unlink_t fn, + loc_t *loc); + +call_stub_t * +fop_unlink_cbk_stub (call_frame_t *frame, + fop_unlink_cbk_t fn, + int32_t op_ret, + int32_t op_errno); + +call_stub_t * +fop_rmdir_stub (call_frame_t *frame, + fop_rmdir_t fn, + loc_t *loc); + +call_stub_t * +fop_rmdir_cbk_stub (call_frame_t *frame, + fop_rmdir_cbk_t fn, + int32_t op_ret, + int32_t op_errno); + +call_stub_t * +fop_symlink_stub (call_frame_t *frame, + fop_symlink_t fn, + const char *linkname, + loc_t *loc); + +call_stub_t * +fop_symlink_cbk_stub (call_frame_t *frame, + fop_symlink_cbk_t fn, + int32_t op_ret, + int32_t op_errno, + inode_t *inode, + struct stat *buf); + +call_stub_t * +fop_rename_stub (call_frame_t *frame, + fop_rename_t fn, + loc_t *oldloc, + loc_t *newloc); + +call_stub_t * +fop_rename_cbk_stub (call_frame_t *frame, + fop_rename_cbk_t fn, + int32_t op_ret, + int32_t op_errno, + struct stat *buf); + +call_stub_t * +fop_link_stub (call_frame_t *frame, + fop_link_t fn, + loc_t *oldloc, + loc_t *newloc); + +call_stub_t * +fop_link_cbk_stub (call_frame_t *frame, + fop_link_cbk_t fn, + int32_t op_ret, + int32_t op_errno, + inode_t *inode, + struct stat *buf); + +call_stub_t * +fop_create_stub (call_frame_t *frame, + fop_create_t fn, + loc_t *loc, + int32_t flags, + mode_t mode, fd_t *fd); + +call_stub_t * +fop_create_cbk_stub (call_frame_t *frame, + fop_create_cbk_t fn, + int32_t op_ret, + int32_t op_errno, + fd_t *fd, + inode_t *inode, + struct stat *buf); + +call_stub_t * +fop_open_stub (call_frame_t *frame, + fop_open_t fn, + loc_t *loc, + int32_t flags, + fd_t *fd); + +call_stub_t * +fop_open_cbk_stub (call_frame_t *frame, + fop_open_cbk_t fn, + int32_t op_ret, + int32_t op_errno, + fd_t *fd); + +call_stub_t * +fop_readv_stub (call_frame_t *frame, + fop_readv_t fn, + fd_t *fd, + size_t 
size, + off_t off); + +call_stub_t * +fop_readv_cbk_stub (call_frame_t *frame, + fop_readv_cbk_t fn, + int32_t op_ret, + int32_t op_errno, + struct iovec *vector, + int32_t count, + struct stat *stbuf); + +call_stub_t * +fop_writev_stub (call_frame_t *frame, + fop_writev_t fn, + fd_t *fd, + struct iovec *vector, + int32_t count, + off_t off); + +call_stub_t * +fop_writev_cbk_stub (call_frame_t *frame, + fop_writev_cbk_t fn, + int32_t op_ret, + int32_t op_errno, + struct stat *stbuf); + +call_stub_t * +fop_flush_stub (call_frame_t *frame, + fop_flush_t fn, + fd_t *fd); + +call_stub_t * +fop_flush_cbk_stub (call_frame_t *frame, + fop_flush_cbk_t fn, + int32_t op_ret, + int32_t op_errno); + +call_stub_t * +fop_fsync_stub (call_frame_t *frame, + fop_fsync_t fn, + fd_t *fd, + int32_t datasync); + +call_stub_t * +fop_fsync_cbk_stub (call_frame_t *frame, + fop_fsync_cbk_t fn, + int32_t op_ret, + int32_t op_errno); + +call_stub_t * +fop_opendir_stub (call_frame_t *frame, + fop_opendir_t fn, + loc_t *loc, fd_t *fd); + +call_stub_t * +fop_opendir_cbk_stub (call_frame_t *frame, + fop_opendir_cbk_t fn, + int32_t op_ret, + int32_t op_errno, + fd_t *fd); + +call_stub_t * +fop_getdents_stub (call_frame_t *frame, + fop_getdents_t fn, + fd_t *fd, + size_t size, + off_t off, + int32_t flag); + +call_stub_t * +fop_getdents_cbk_stub (call_frame_t *frame, + fop_getdents_cbk_t fn, + int32_t op_ret, + int32_t op_errno, + dir_entry_t *entries, + int32_t count); + +call_stub_t * +fop_setdents_stub (call_frame_t *frame, + fop_setdents_t fn, + fd_t *fd, + int32_t flags, + dir_entry_t *entries, + int32_t count); + +call_stub_t * +fop_setdents_cbk_stub (call_frame_t *frame, + fop_setdents_cbk_t fn, + int32_t op_ret, + int32_t op_errno); + +call_stub_t * +fop_fsyncdir_stub (call_frame_t *frame, + fop_fsyncdir_t fn, + fd_t *fd, + int32_t datasync); + +call_stub_t * +fop_fsyncdir_cbk_stub (call_frame_t *frame, + fop_fsyncdir_cbk_t fn, + int32_t op_ret, + int32_t op_errno); + +call_stub_t * 
+fop_statfs_stub (call_frame_t *frame, + fop_statfs_t fn, + loc_t *loc); + +call_stub_t * +fop_statfs_cbk_stub (call_frame_t *frame, + fop_statfs_cbk_t fn, + int32_t op_ret, + int32_t op_errno, + struct statvfs *buf); + +call_stub_t * +fop_setxattr_stub (call_frame_t *frame, + fop_setxattr_t fn, + loc_t *loc, + dict_t *dict, + int32_t flags); + +call_stub_t * +fop_setxattr_cbk_stub (call_frame_t *frame, + fop_setxattr_cbk_t fn, + int32_t op_ret, + int32_t op_errno); + +call_stub_t * +fop_getxattr_stub (call_frame_t *frame, + fop_getxattr_t fn, + loc_t *loc, + const char *name); + +call_stub_t * +fop_getxattr_cbk_stub (call_frame_t *frame, + fop_getxattr_cbk_t fn, + int32_t op_ret, + int32_t op_errno, + dict_t *value); + +call_stub_t * +fop_removexattr_stub (call_frame_t *frame, + fop_removexattr_t fn, + loc_t *loc, + const char *name); + +call_stub_t * +fop_removexattr_cbk_stub (call_frame_t *frame, + fop_removexattr_cbk_t fn, + int32_t op_ret, + int32_t op_errno); +call_stub_t * +fop_lk_stub (call_frame_t *frame, + fop_lk_t fn, + fd_t *fd, + int32_t cmd, + struct flock *lock); + +call_stub_t * +fop_lk_cbk_stub (call_frame_t *frame, + fop_lk_cbk_t fn, + int32_t op_ret, + int32_t op_errno, + struct flock *lock); + +call_stub_t * +fop_inodelk_stub (call_frame_t *frame, fop_inodelk_t fn, + loc_t *loc, int32_t cmd, struct flock *lock); + +call_stub_t * +fop_finodelk_stub (call_frame_t *frame, fop_finodelk_t fn, + fd_t *fd, int32_t cmd, struct flock *lock); + +call_stub_t * +fop_entrylk_stub (call_frame_t *frame, fop_entrylk_t fn, + loc_t *loc, const char *basename, + entrylk_cmd cmd, entrylk_type type); + +call_stub_t * +fop_fentrylk_stub (call_frame_t *frame, fop_fentrylk_t fn, + fd_t *fd, const char *basename, + entrylk_cmd cmd, entrylk_type type); + +call_stub_t * +fop_inodelk_cbk_stub (call_frame_t *frame, fop_inodelk_cbk_t fn, + int32_t op_ret, int32_t op_errno); + +call_stub_t * +fop_finodelk_cbk_stub (call_frame_t *frame, fop_inodelk_cbk_t fn, + int32_t op_ret, 
int32_t op_errno); + +call_stub_t * +fop_entrylk_cbk_stub (call_frame_t *frame, fop_entrylk_cbk_t fn, + int32_t op_ret, int32_t op_errno); + +call_stub_t * +fop_fentrylk_cbk_stub (call_frame_t *frame, fop_entrylk_cbk_t fn, + int32_t op_ret, int32_t op_errno); + +call_stub_t * +fop_readdir_stub (call_frame_t *frame, + fop_readdir_t fn, + fd_t *fd, + size_t size, + off_t off); + +call_stub_t * +fop_readdir_cbk_stub (call_frame_t *frame, + fop_readdir_cbk_t fn, + int32_t op_ret, + int32_t op_errno, + gf_dirent_t *entries); + +call_stub_t * +fop_checksum_stub (call_frame_t *frame, + fop_checksum_t fn, + loc_t *loc, + int32_t flags); + +call_stub_t * +fop_checksum_cbk_stub (call_frame_t *frame, + fop_checksum_cbk_t fn, + int32_t op_ret, + int32_t op_errno, + uint8_t *file_checksum, + uint8_t *dir_checksum); + +call_stub_t * +fop_xattrop_stub (call_frame_t *frame, + fop_xattrop_t fn, + loc_t *loc, + gf_xattrop_flags_t optype, + dict_t *xattr); + +call_stub_t * +fop_xattrop_stub_cbk_stub (call_frame_t *frame, + fop_xattrop_cbk_t fn, + int32_t op_ret, + int32_t op_errno); + +call_stub_t * +fop_fxattrop_stub (call_frame_t *frame, + fop_fxattrop_t fn, + fd_t *fd, + gf_xattrop_flags_t optype, + dict_t *xattr); + +call_stub_t * +fop_fxattrop_stub_cbk_stub (call_frame_t *frame, + fop_xattrop_cbk_t fn, + int32_t op_ret, + int32_t op_errno); + +void call_resume (call_stub_t *stub); +void call_stub_destroy (call_stub_t *stub); +#endif diff --git a/libglusterfs/src/common-utils.c b/libglusterfs/src/common-utils.c new file mode 100644 index 000000000..616c7a8ce --- /dev/null +++ b/libglusterfs/src/common-utils.c @@ -0,0 +1,1349 @@ +/* + Copyright (c) 2006, 2007, 2008 Z RESEARCH, Inc. + This file is part of GlusterFS. + + GlusterFS is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published + by the Free Software Foundation; either version 3 of the License, + or (at your option) any later version. 
+
+  GlusterFS is distributed in the hope that it will be useful, but
+  WITHOUT ANY WARRANTY; without even the implied warranty of
+  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+  General Public License for more details.
+
+  You should have received a copy of the GNU General Public License
+  along with this program.  If not, see
+  <http://www.gnu.org/licenses/>.
+*/
+
+#ifndef _CONFIG_H
+#define _CONFIG_H
+#include "config.h"
+#endif
+
+/* NOTE(review): the header names of the bare #include lines below were
+   lost when this patch text was extracted; restore them from the
+   repository before applying. */
+#ifdef HAVE_BACKTRACE
+#include
+#endif
+
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+
+#include "logging.h"
+#include "common-utils.h"
+#include "revision.h"
+#include "glusterfs.h"
+#include "stack.h"
+
+typedef int32_t (*rw_op_t)(int32_t fd, char *buf, int32_t size);
+typedef int32_t (*rwv_op_t)(int32_t fd, const struct iovec *buf, int32_t size);
+
+/* Process-wide context, installed by set_global_ctx_ptr() */
+static glusterfs_ctx_t *gf_global_ctx;
+
+
+/* Resolver cache: 'first' owns the list returned by getaddrinfo(),
+   'next' is the entry the following gf_resolve_ip6() call hands out. */
+struct dnscache6 {
+        struct addrinfo *first;
+        struct addrinfo *next;
+};
+
+/*
+ * gf_resolve_ip6 - resolve a host name and hand out one address per call.
+ *
+ * @hostname:  name to resolve; NULL is rejected.
+ * @port:      port number, passed to getaddrinfo() as the service string.
+ * @family:    value stored in hints.ai_family.
+ * @dnscache:  in/out cache handle.  A struct dnscache6 is allocated on
+ *             first use; on failure it is released and *dnscache is reset
+ *             to NULL.
+ * @addr_info: receives the entry selected for this call.  It points into
+ *             the cached list, it is not a copy.
+ *
+ * Every call returns the entry at cache->next and advances that cursor.
+ * Once the list is exhausted the following call flushes the cache and
+ * queries getaddrinfo() again.
+ *
+ * Returns 0 on success and -1 on failure.
+ */
+int32_t
+gf_resolve_ip6 (const char *hostname,
+                uint16_t port,
+                int family,
+                void **dnscache,
+                struct addrinfo **addr_info)
+{
+        int32_t           ret = 0;
+        struct addrinfo   hints;
+        struct dnscache6 *cache = NULL;
+        char              service[NI_MAXSERV], host[NI_MAXHOST];
+
+        if (!hostname) {
+                gf_log ("resolver", GF_LOG_WARNING, "hostname is NULL");
+                return -1;
+        }
+
+        if (!*dnscache) {
+                *dnscache = CALLOC (1, sizeof (struct dnscache6));
+                if (!*dnscache) {
+                        /* the cache is dereferenced right below */
+                        gf_log ("resolver", GF_LOG_ERROR, "out of memory");
+                        return -1;
+                }
+        }
+
+        cache = *dnscache;
+        if (cache->first && !cache->next) {
+                /* every cached entry has been handed out already */
+                freeaddrinfo(cache->first);
+                cache->first = cache->next = NULL;
+                gf_log ("resolver", GF_LOG_DEBUG,
+                        "flushing DNS cache");
+        }
+
+        if (!cache->first) {
+                char *port_str = NULL;
+                gf_log ("resolver", GF_LOG_DEBUG,
+                        "DNS cache not present, freshly probing hostname: %s",
+                        hostname);
+
+                memset(&hints, 0, sizeof(hints));
+                hints.ai_family   = family;
+                hints.ai_socktype = SOCK_STREAM;
+                hints.ai_flags    = AI_ADDRCONFIG;
+
+                /* asprintf() leaves port_str undefined on failure, so it
+                   must neither be used nor freed in that case */
+                if (asprintf (&port_str, "%d", port) == -1) {
+                        gf_log ("resolver", GF_LOG_ERROR,
+                                "asprintf failed for port %d", port);
+                        free (*dnscache);
+                        *dnscache = NULL;
+                        return -1;
+                }
+
+                if ((ret = getaddrinfo(hostname, port_str, &hints, &cache->first)) != 0) {
+                        gf_log ("resolver", GF_LOG_ERROR,
+                                "getaddrinfo failed (%s)", gai_strerror (ret));
+
+                        free (*dnscache);
+                        *dnscache = NULL;
+                        free (port_str);
+                        return -1;
+                }
+                free (port_str);
+
+                cache->next = cache->first;
+        }
+
+        if (cache->next) {
+                ret = getnameinfo((struct sockaddr *)cache->next->ai_addr,
+                                  cache->next->ai_addrlen,
+                                  host, sizeof (host),
+                                  service, sizeof (service),
+                                  NI_NUMERICHOST);
+                if (ret != 0) {
+                        gf_log ("resolver",
+                                GF_LOG_ERROR,
+                                "getnameinfo failed (%s)", gai_strerror (ret));
+                        goto err;
+                }
+
+                gf_log ("resolver", GF_LOG_DEBUG,
+                        "returning ip-%s (port-%s) for hostname: %s and port: %d",
+                        host, service, hostname, port);
+
+                *addr_info = cache->next;
+        }
+
+        /* cache->next is non-NULL here: an exhausted cache was flushed and
+           re-populated above */
+        cache->next = cache->next->ai_next;
+        if (cache->next) {
+                /* only logged, to show what the next call will return */
+                ret = getnameinfo((struct sockaddr *)cache->next->ai_addr,
+                                  cache->next->ai_addrlen,
+                                  host, sizeof (host),
+                                  service, sizeof (service),
+                                  NI_NUMERICHOST);
+                if (ret != 0) {
+                        gf_log ("resolver",
+                                GF_LOG_ERROR,
+                                "getnameinfo failed (%s)", gai_strerror (ret));
+                        goto err;
+                }
+
+                gf_log ("resolver", GF_LOG_DEBUG,
+                        "next DNS query will return: ip-%s port-%s", host, service);
+        }
+
+        return 0;
+
+err:
+        freeaddrinfo (cache->first);
+        cache->first = cache->next = NULL;
+        free (cache);
+        *dnscache = NULL;
+        return -1;
+}
+
+/* Printable operation names, indexed by the GF_FOP_*, GF_MOP_* and
+   GF_CBK_* values; filled in by gf_global_variable_init() */
+char *gf_fop_list[GF_FOP_MAXVALUE];
+char *gf_mop_list[GF_MOP_MAXVALUE];
+char *gf_cbk_list[GF_CBK_MAXVALUE];
+
+/* Populate the operation-name tables above.  The trailing numbers are the
+   original author's index annotations. */
+void
+gf_global_variable_init (void)
+{
+        gf_fop_list[GF_FOP_STAT]        = "STAT";       /* 0 */
+        gf_fop_list[GF_FOP_READLINK]    = "READLINK";   /* 1 */
+        gf_fop_list[GF_FOP_MKNOD]       = "MKNOD";      /* 2 */
+        gf_fop_list[GF_FOP_MKDIR]       = "MKDIR";
+        gf_fop_list[GF_FOP_UNLINK]      = "UNLINK";
+        gf_fop_list[GF_FOP_RMDIR]       = "RMDIR";      /* 5 */
+        gf_fop_list[GF_FOP_SYMLINK]     = "SYMLINK";
+        gf_fop_list[GF_FOP_RENAME]      = "RENAME";
+        gf_fop_list[GF_FOP_LINK]        = "LINK";
+        gf_fop_list[GF_FOP_CHMOD]       = "CHMOD";
+        gf_fop_list[GF_FOP_CHOWN]       = "CHOWN";      /* 10 */
+        gf_fop_list[GF_FOP_TRUNCATE]    = "TRUNCATE";
+        
gf_fop_list[GF_FOP_OPEN]        = "OPEN";
+        gf_fop_list[GF_FOP_READ]        = "READ";
+        gf_fop_list[GF_FOP_WRITE]       = "WRITE";
+        gf_fop_list[GF_FOP_STATFS]      = "STATFS";     /* 15 */
+        gf_fop_list[GF_FOP_FLUSH]       = "FLUSH";
+        gf_fop_list[GF_FOP_FSYNC]       = "FSYNC";
+        gf_fop_list[GF_FOP_SETXATTR]    = "SETXATTR";
+        gf_fop_list[GF_FOP_GETXATTR]    = "GETXATTR";   /* 20 */
+        gf_fop_list[GF_FOP_REMOVEXATTR] = "REMOVEXATTR";
+        gf_fop_list[GF_FOP_OPENDIR]     = "OPENDIR";
+        gf_fop_list[GF_FOP_GETDENTS]    = "GETDENTS";
+        gf_fop_list[GF_FOP_FSYNCDIR]    = "FSYNCDIR";   /* 25 */
+        gf_fop_list[GF_FOP_ACCESS]      = "ACCESS";
+        gf_fop_list[GF_FOP_CREATE]      = "CREATE";
+        gf_fop_list[GF_FOP_FTRUNCATE]   = "FTRUNCATE";
+        gf_fop_list[GF_FOP_FSTAT]       = "FSTAT";
+        gf_fop_list[GF_FOP_LK]          = "LK";         /* 30 */
+        gf_fop_list[GF_FOP_UTIMENS]     = "UTIMENS";
+        gf_fop_list[GF_FOP_FCHMOD]      = "FCHMOD";
+        gf_fop_list[GF_FOP_FCHOWN]      = "FCHOWN";
+        gf_fop_list[GF_FOP_LOOKUP]      = "LOOKUP";
+        gf_fop_list[GF_FOP_SETDENTS]    = "SETDENTS";   /* 35 */
+        gf_fop_list[GF_FOP_READDIR]     = "READDIR";
+        gf_fop_list[GF_FOP_INODELK]     = "INODELK";
+        gf_fop_list[GF_FOP_FINODELK]    = "FINODELK";
+        gf_fop_list[GF_FOP_ENTRYLK]     = "ENTRYLK";
+        gf_fop_list[GF_FOP_FENTRYLK]    = "FENTRYLK";   /* 40 */
+        gf_fop_list[GF_FOP_CHECKSUM]    = "CHECKSUM";   /* 41 */
+        gf_fop_list[GF_FOP_XATTROP]     = "XATTROP";
+        /* NOTE(review): if the fop enum also has an FXATTROP member (TODO
+           confirm in glusterfs.h) its slot stays NULL here. */
+
+        gf_mop_list[GF_MOP_SETVOLUME]   = "SETVOLUME";  /* 0 */
+        gf_mop_list[GF_MOP_GETVOLUME]   = "GETVOLUME";  /* 1 */
+        gf_mop_list[GF_MOP_STATS]       = "STATS";
+        gf_mop_list[GF_MOP_SETSPEC]     = "SETSPEC";
+        gf_mop_list[GF_MOP_GETSPEC]     = "GETSPEC";
+
+        gf_cbk_list[GF_CBK_FORGET]      = "FORGET";
+        gf_cbk_list[GF_CBK_RELEASE]     = "RELEASE";
+        gf_cbk_list[GF_CBK_RELEASEDIR]  = "RELEASEDIR";
+        /* Are there any more variables to be included? All global
+           variables initialization should go here */
+
+        return;
+}
+
+/* Install the process-wide context returned by get_global_ctx_ptr() */
+void
+set_global_ctx_ptr (glusterfs_ctx_t *ctx)
+{
+        gf_global_ctx = ctx;
+}
+
+/* Return the context last stored with set_global_ctx_ptr(), or NULL */
+glusterfs_ctx_t *
+get_global_ctx_ptr (void)
+{
+        return gf_global_ctx;
+}
+
+/*
+ * gf_log_volume_file - copy a volume specification file into the log.
+ *
+ * @specfp: open stream of the volfile.  It is rewound before reading and
+ *          rewound again afterwards.
+ *
+ * Each chunk returned by fgets() is written to gf_log_logfile with a
+ * running number, framed by two ruler lines.
+ */
+void
+gf_log_volume_file (FILE *specfp)
+{
+        extern FILE *gf_log_logfile;
+        int          lcount = 0;
+        char         data[GF_UNIT_KB];
+
+        fseek (specfp, 0L, SEEK_SET);
+
+        fprintf (gf_log_logfile, "Given volfile:\n");
+        fprintf (gf_log_logfile,
+                 "+---------------------------------------"
+                 "---------------------------------------+\n");
+        while (!feof (specfp)) {
+                if (fgets (data, GF_UNIT_KB, specfp) == NULL)
+                        break;
+                lcount++;
+                fprintf (gf_log_logfile, "%3d: %s", lcount, data);
+        }
+        fprintf (gf_log_logfile,
+                 "\n+---------------------------------------"
+                 "---------------------------------------+\n");
+        fflush (gf_log_logfile);
+        fseek (specfp, 0L, SEEK_SET);
+}
+
+/*
+ * gf_dump_config_flags - write the compile-time configuration to @fd.
+ *
+ * One "name 1" line is emitted with write() for every HAVE_* macro that
+ * is defined, followed by the package string.  The byte counts are
+ * maintained by hand and must equal the length of the literal.
+ */
+static void
+gf_dump_config_flags (int fd)
+{
+
+        /* 23 bytes including the newline; the former count of 22 dropped
+           it and glued the first flag onto this heading */
+        write (fd, "configuration details:\n", 23);
+
+/* have argp */
+#ifdef HAVE_ARGP
+        write (fd, "argp 1\n", 7);
+#endif
+
+/* ifdef if found backtrace */
+#ifdef HAVE_BACKTRACE
+        write (fd, "backtrace 1\n", 12);
+#endif
+
+/* Berkeley-DB version has cursor->get() */
+#ifdef HAVE_BDB_CURSOR_GET
+        write (fd, "bdb->cursor->get 1\n", 19);
+#endif
+
+/* Define to 1 if you have the <db.h> header file. */
+#ifdef HAVE_DB_H
+        write (fd, "db.h 1\n", 7);
+#endif
+
+/* Define to 1 if you have the <dlfcn.h> header file. */
+#ifdef HAVE_DLFCN_H
+        write (fd, "dlfcn 1\n", 8);
+#endif
+
+/* define if fdatasync exists */
+#ifdef HAVE_FDATASYNC
+        write (fd, "fdatasync 1\n", 12);
+#endif
+
+/* Define to 1 if you have the `pthread' library (-lpthread). */
+#ifdef HAVE_LIBPTHREAD
+        write (fd, "libpthread 1\n", 13);
+#endif
+
+/* define if llistxattr exists */
+#ifdef HAVE_LLISTXATTR
+        write (fd, "llistxattr 1\n", 13);
+#endif
+
+/* define if found setfsuid setfsgid */
+#ifdef HAVE_SET_FSID
+        write (fd, "setfsid 1\n", 10);
+#endif
+
+/* define if found spinlock */
+#ifdef HAVE_SPINLOCK
+        write (fd, "spinlock 1\n", 11);
+#endif
+
+/* Define to 1 if you have the <sys/epoll.h> header file. */
+#ifdef HAVE_SYS_EPOLL_H
+        write (fd, "epoll.h 1\n", 10);
+#endif
+
+/* Define to 1 if you have the <sys/extattr.h> header file. */
+#ifdef HAVE_SYS_EXTATTR_H
+        write (fd, "extattr.h 1\n", 12);
+#endif
+
+/* Define to 1 if you have the <sys/xattr.h> header file. */
+#ifdef HAVE_SYS_XATTR_H
+        write (fd, "xattr.h 1\n", 10);
+#endif
+
+/* define if found st_atim.tv_nsec */
+#ifdef HAVE_STRUCT_STAT_ST_ATIM_TV_NSEC
+        write (fd, "st_atim.tv_nsec 1\n", 18);
+#endif
+
+/* define if found st_atimespec.tv_nsec */
+#ifdef HAVE_STRUCT_STAT_ST_ATIMESPEC_TV_NSEC
+        write (fd, "st_atimespec.tv_nsec 1\n",23);
+#endif
+
+/* Define to the full name and version of this package. */
+#ifdef PACKAGE_STRING
+        {
+                char msg[128];
+                /* bounded: PACKAGE_STRING comes from the build system */
+                snprintf (msg, sizeof (msg), "package-string: %s\n",
+                          PACKAGE_STRING);
+                write (fd, msg, strlen (msg));
+        }
+#endif
+
+        return;
+}
+
+/* Obtain a backtrace and print it to stdout. 
*/
+/* TODO: It looks like backtrace_symbols allocates memory,
+   it may be problem because mostly memory allocation/free causes 'sigsegv' */
+/*
+ * gf_print_trace - dump diagnostic state to the log file and re-raise.
+ *
+ * @signum: signal number; it is logged, then its default disposition is
+ *          restored and the signal is raised again so the process ends.
+ *
+ * Written to the descriptor of gf_log_logfile, in order: one line per
+ * pending call frame, the repository revision, the signal number, the
+ * configuration flags and, when available, a backtrace.
+ */
+void
+gf_print_trace (int32_t signum)
+{
+        extern FILE *gf_log_logfile;
+        int fd = fileno (gf_log_logfile);
+        char msg[1024];
+
+
+        /* Pending frames, (if any), list them in order */
+        write (fd, "pending frames:\n", 16);
+        {
+                glusterfs_ctx_t *ctx = gf_global_ctx;
+
+                /* the context or its pool may not exist yet; do not fault
+                   again while reporting a fault */
+                if (ctx && ctx->pool) {
+                        call_pool_t      *pool = (call_pool_t *)ctx->pool;
+                        struct list_head *trav = pool->all_frames.next;
+
+                        while (trav != (&pool->all_frames)) {
+                                call_frame_t *tmp = (call_frame_t *)(&((call_stack_t *)trav)->frames);
+                                char        **names = NULL;
+                                int32_t       nr_names = 0;
+                                const char   *opname = "(unknown)";
+
+                                if ((tmp->root->type == GF_OP_TYPE_FOP_REQUEST) ||
+                                    (tmp->root->type == GF_OP_TYPE_FOP_REPLY)) {
+                                        names    = gf_fop_list;
+                                        nr_names = GF_FOP_MAXVALUE;
+                                } else if ((tmp->root->type == GF_OP_TYPE_MOP_REQUEST) ||
+                                           (tmp->root->type == GF_OP_TYPE_MOP_REPLY)) {
+                                        names    = gf_mop_list;
+                                        nr_names = GF_MOP_MAXVALUE;
+                                } else if ((tmp->root->type == GF_OP_TYPE_CBK_REQUEST) ||
+                                           (tmp->root->type == GF_OP_TYPE_CBK_REPLY)) {
+                                        names    = gf_cbk_list;
+                                        nr_names = GF_CBK_MAXVALUE;
+                                }
+
+                                /* an unknown type, an out-of-range op or a
+                                   slot that was never filled keeps the
+                                   placeholder instead of printing garbage */
+                                if (names && (tmp->root->op >= 0) &&
+                                    (tmp->root->op < nr_names) &&
+                                    names[tmp->root->op])
+                                        opname = names[tmp->root->op];
+
+                                snprintf (msg, sizeof (msg),
+                                          "frame : type(%d) op(%s)\n",
+                                          tmp->root->type, opname);
+
+                                write (fd, msg, strlen (msg));
+                                trav = trav->next;
+                        }
+                }
+                write (fd, "\n", 1);
+        }
+
+        snprintf (msg, sizeof (msg), "patchset: %s\n",
+                  GLUSTERFS_REPOSITORY_REVISION);
+        write (fd, msg, strlen (msg));
+
+        snprintf (msg, sizeof (msg), "signal received: %d\n", signum);
+        write (fd, msg, strlen (msg));
+
+        gf_dump_config_flags (fd);
+#ifdef HAVE_BACKTRACE
+        /* Print 'backtrace' */
+        {
+                void *array[200];
+                int   size = 0;
+
+                size = backtrace (array, 200);
+                /* entry 0 is this function; skip it, and never pass a
+                   negative count when nothing was captured */
+                if (size > 1)
+                        backtrace_symbols_fd (&array[1], size - 1, fd);
+                snprintf (msg, sizeof (msg), "---------\n");
+                write (fd, msg, strlen (msg));
+        }
+#endif /* HAVE_BACKTRACE */
+
+        /* Send a signal to terminate the process */
+        signal (signum, SIG_DFL);
+        raise (signum);
+}
+
+/* Empty function; the body does nothing. */
+void
+trap (void)
+{
+
+}
+
+/*
+ * gf_trim - strip leading and trailing white space in place.
+ *
+ * @string: NUL terminated buffer to trim; it is modified.
+ *
+ * Returns a pointer into @string at the first non-space character (the
+ * terminator when the string is all white space), or NULL for NULL input.
+ */
+char *
+gf_trim (char *string)
+{
+        register char *s, *t;
+
+        if (string == NULL)
+        {
+                return NULL;
+        }
+
+        /* <ctype.h> needs an unsigned char value; plain char may be
+           negative */
+        for (s = string; isspace ((unsigned char) *s); s++)
+                ;
+
+        if (*s == 0)
+                return s;
+
+        t = s + strlen (s) - 1;
+        while (t > s && isspace ((unsigned char) *t))
+                t--;
+        *++t = '\0';
+
+        return s;
+}
+
+/*
+ * gf_strsplit - split @str at any character of @delim.
+ *
+ * @str, @delim: input string and delimiter set; neither is modified.
+ * @tokens:      receives a newly allocated array of newly allocated
+ *               strings.  Empty tokens are skipped.
+ * @token_count: receives the number of entries in *tokens.
+ *
+ * Returns 0 on success; -1 on a NULL argument or allocation failure, in
+ * which case the outputs are left untouched.
+ */
+int
+gf_strsplit (const char *str, const char *delim,
+             char ***tokens, int *token_count)
+{
+        char *_running = NULL;
+        char *running = NULL;
+        char *token = NULL;
+        char **token_list = NULL;
+        int count = 0;
+        int i = 0;
+        int j = 0;
+
+        if (str == NULL || delim == NULL || tokens == NULL || token_count == NULL)
+        {
+                return -1;
+        }
+
+        /* first pass: count the non-empty tokens on a scratch copy */
+        if ((_running = strdup (str)) == NULL)
+        {
+                return -1;
+        }
+        running = _running;
+
+        while ((token = strsep (&running, delim)) != NULL)
+        {
+                if (token[0] != '\0')
+                        count++;
+        }
+        free (_running);
+
+        /* second pass: strsep() consumed the first copy */
+        if ((_running = strdup (str)) == NULL)
+        {
+                return -1;
+        }
+        running = _running;
+
+        /* a zero-sized allocation may legitimately return NULL; ask for at
+           least one slot so "no tokens" is reported as success everywhere */
+        if ((token_list = CALLOC (count ? count : 1, sizeof (char *))) == NULL)
+        {
+                free (_running);
+                return -1;
+        }
+
+        while ((token = strsep (&running, delim)) != NULL)
+        {
+                if (token[0] == '\0')
+                        continue;
+
+                if ((token_list[i++] = strdup (token)) == NULL)
+                        goto free_exit;
+        }
+
+        free (_running);
+
+        *tokens = token_list;
+        *token_count = count;
+        return 0;
+
+free_exit:
+        free (_running);
+        for (j = 0; j < i; j++)
+        {
+                free (token_list[j]);
+        }
+        free (token_list);
+        return -1;
+}
+
+/*
+ * gf_volume_name_validate - check the syntax of a volume name.
+ *
+ * Returns 0 when the name starts with a letter and continues with
+ * letters, digits or '_' only; 1 when it does not; -1 for NULL.
+ */
+int
+gf_volume_name_validate (const char *volume_name)
+{
+        const char *vname = NULL;
+
+        if (volume_name == NULL)
+        {
+                return -1;
+        }
+
+        if (!isalpha ((unsigned char) volume_name[0]))
+        {
+                return 1;
+        }
+
+        for (vname = &volume_name[1]; *vname != '\0'; vname++)
+        {
+                if (!(isalnum ((unsigned char) *vname) || *vname == '_'))
+                        return 1;
+        }
+
+        return 0;
+}
+
+
+/*
+ * gf_string2time - parse a non-negative number with an optional "s" or
+ * "sec" suffix into *n.
+ *
+ * Returns 0 on success and -1 on failure; errno is EINVAL for NULL
+ * arguments.
+ */
+int
+gf_string2time (const char *str, uint32_t *n)
+{
+        unsigned long value = 0;
+        char *tail = NULL;
+        int old_errno = 0;
+        const char *s = NULL;
+
+        if (str == NULL || n == NULL)
+        {
+                errno = EINVAL;
+                return -1;
+        }
+
+        /* reject a leading '-': negative values are not accepted */
+        for (s = str; *s != '\0'; s++)
+        {
+                if (isspace (*s))
+                {
+                        continue;
+                }
+                if (*s == '-')
+                {
+                        return -1;
+                }
+                break;
+        }
+
+        old_errno = errno;
+        errno = 0;
+        value = strtol (str, &tail, 0);
+
+        if (errno == ERANGE || errno == EINVAL)
+        {
+                return -1;
+        }
+
+        if (errno == 0)
+        {
+                errno = old_errno;
+        }
+
+        /* nothing was converted: "", "s" and "sec" are not a duration */
+        if (tail == str)
+        {
+                errno = EINVAL;
+                return -1;
+        }
+
+        if (!((tail[0] == '\0') ||
+              ((tail[0] == 's') && (tail[1] == '\0')) ||
+              ((tail[0] == 's') && (tail[1] == 'e') && (tail[2] == 'c') && (tail[3] == '\0'))))
+        {
+                return -1;
+        }
+
+        /* the result is stored in 32 bits; refuse to truncate */
+        if (value > UINT32_MAX)
+        {
+                errno = ERANGE;
+                return -1;
+        }
+
+        *n = value;
+
+        return 0;
+}
+
+
+/*
+ * gf_string2percent - parse a non-negative number with an optional
+ * trailing '%' into *n.  No upper bound of 100 is enforced here.
+ *
+ * Returns 0 on success and -1 on failure.
+ */
+int
+gf_string2percent (const char *str, uint32_t *n)
+{
+        unsigned long value = 0;
+        char *tail = NULL;
+        int old_errno = 0;
+        const char *s = NULL;
+
+        if (str == NULL || n == NULL)
+        {
+                errno = EINVAL;
+                return -1;
+        }
+
+        for (s = str; *s != '\0'; s++)
+        {
+                if (isspace ((unsigned char) *s))
+                {
+                        continue;
+                }
+                if (*s == '-')
+                {
+                        return -1;
+                }
+                break;
+        }
+
+        old_errno = errno;
+        errno = 0;
+        value = strtol (str, &tail, 0);
+
+        if (errno == ERANGE || errno == EINVAL)
+        {
+                return -1;
+        }
+
+        if (errno == 0)
+        {
+                errno = old_errno;
+        }
+
+        /* nothing was converted: "" and "%" are not a percentage */
+        if (tail == str)
+        {
+                errno = EINVAL;
+                return -1;
+        }
+
+        if (!((tail[0] == '\0') ||
+              ((tail[0] == '%') && (tail[1] == '\0'))))
+        {
+                return -1;
+        }
+
+        /* the result is stored in 32 bits; refuse to truncate */
+        if (value > UINT32_MAX)
+        {
+                errno = ERANGE;
+                return -1;
+        }
+
+        *n = value;
+
+        return 0;
+}
+
+
+/*
+ * _gf_string2long - strict strtol() wrapper.
+ *
+ * The whole string has to be a number in @base (0 selects the strtol()
+ * prefix rules).  Returns 0 and stores the value in *n, or returns -1
+ * with errno describing the failure.  On success errno is restored to
+ * its value on entry.
+ */
+static int
+_gf_string2long (const char *str, long *n, int base)
+{
+        long value = 0;
+        char *tail = NULL;
+        int old_errno = 0;
+
+        if (str == NULL || n == NULL)
+        {
+                errno = EINVAL;
+                return -1;
+        }
+
+        old_errno = errno;
+        errno = 0;
+        value = strtol (str, &tail, base);
+
+        if (errno == ERANGE || errno == EINVAL)
+        {
+                return -1;
+        }
+
+        if (errno == 0)
+        {
+                errno = old_errno;
+        }
+
+        if (tail == str)
+        {
+                /* no digits at all, e.g. the empty string */
+                errno = EINVAL;
+                return -1;
+        }
+
+        if (tail[0] != '\0')
+        {
+                /* bala: invalid integer format */
+                return -1;
+        }
+
+        *n = value;
+
+        return 0;
+}
+
+/*
+ * _gf_string2ulong - strict strtoul() wrapper; same contract as
+ * _gf_string2long(), and a leading '-' is rejected as well.
+ */
+static int
+_gf_string2ulong (const char *str, unsigned long *n, int base)
+{
+        unsigned long value = 0;
+        char *tail = NULL;
+        int old_errno = 0;
+        const char *s = NULL;
+
+        if (str == NULL || n == NULL)
+        {
+                errno = EINVAL;
+                return -1;
+        }
+
+        for (s = str; *s != '\0'; s++)
+        {
+                if (isspace ((unsigned char) *s))
+                {
+                        continue;
+                }
+                if (*s == '-')
+                {
+                        /* bala: we do not support suffixed (-) sign and
+                           invalid integer format */
+                        return -1;
+                }
+                break;
+        }
+
+        old_errno = errno;
+        errno = 0;
+        value = strtoul (str, &tail, base);
+
+        if (errno == ERANGE || errno == EINVAL)
+        {
+                return -1;
+        }
+
+        if (errno == 0)
+        {
+                errno = old_errno;
+        }
+
+        if (tail == str)
+        {
+                /* no digits at all, e.g. the empty string */
+                errno = EINVAL;
+                return -1;
+        }
+
+        if (tail[0] != '\0')
+        {
+                /* bala: invalid integer format */
+                return -1;
+        }
+
+        *n = value;
+
+        return 0;
+}
+
+/*
+ * _gf_string2uint - like _gf_string2ulong(), for unsigned int results.
+ * Values that do not fit in an unsigned int fail with ERANGE instead of
+ * being truncated.
+ */
+static int
+_gf_string2uint (const char *str, unsigned int *n, int base)
+{
+        unsigned long value = 0;
+        char *tail = NULL;
+        int old_errno = 0;
+        const char *s = NULL;
+
+        if (str == NULL || n == NULL)
+        {
+                errno = EINVAL;
+                return -1;
+        }
+
+        for (s = str; *s != '\0'; s++)
+        {
+                if (isspace ((unsigned char) *s))
+                {
+                        continue;
+                }
+                if (*s == '-')
+                {
+                        /* bala: we do not support suffixed (-) sign and
+                           invalid integer format */
+                        return -1;
+                }
+                break;
+        }
+
+        old_errno = errno;
+        errno = 0;
+        value = strtoul (str, &tail, base);
+
+        if (errno == ERANGE || errno == EINVAL)
+        {
+                return -1;
+        }
+
+        if (errno == 0)
+        {
+                errno = old_errno;
+        }
+
+        if (tail == str)
+        {
+                /* no digits at all, e.g. the empty string */
+                errno = EINVAL;
+                return -1;
+        }
+
+        if (tail[0] != '\0')
+        {
+                /* bala: invalid integer format */
+                return -1;
+        }
+
+        /* round trip through the narrower type to detect overflow */
+        if ((unsigned long) (unsigned int) value != value)
+        {
+                errno = ERANGE;
+                return -1;
+        }
+
+        *n = (unsigned int)value;
+
+        return 0;
+}
+
+/*
+ * _gf_string2longlong - strict strtoll() wrapper; same contract as
+ * _gf_string2long().
+ */
+static int
+_gf_string2longlong (const char *str, long long *n, int base)
+{
+        long long value = 0;
+        char *tail = NULL;
+        int old_errno = 0;
+
+        if (str == NULL || n == NULL)
+        {
+                errno = EINVAL;
+                return -1;
+        }
+
+        old_errno = errno;
+        errno = 0;
+        value = strtoll (str, &tail, base);
+
+        if (errno == ERANGE || errno == EINVAL)
+        {
+                return -1;
+        }
+
+        if (errno == 0)
+        {
+                errno = old_errno;
+        }
+
+        if (tail == str)
+        {
+                /* no digits at all, e.g. the empty string */
+                errno = EINVAL;
+                return -1;
+        }
+
+        if (tail[0] != '\0')
+        {
+                /* bala: invalid integer format */
+                return -1;
+        }
+
+        *n = value;
+
+        return 0;
+}
+
+/*
+ * _gf_string2ulonglong - strict strtoull() wrapper; same contract as
+ * _gf_string2ulong().
+ */
+static int
+_gf_string2ulonglong (const char *str, unsigned long long *n, int base)
+{
+        unsigned long long value = 0;
+        char *tail = NULL;
+        int old_errno = 0;
+        const char *s = NULL;
+
+        if (str == NULL || n == NULL)
+        {
+                errno = EINVAL;
+                return -1;
+        }
+
+        for (s = str; *s != '\0'; s++)
+        {
+                if (isspace (*s))
+                {
+                        continue;
+                }
+                if (*s == 
'-')
+                {
+                        /* bala: we do not support suffixed (-) sign and
+                           invalid integer format */
+                        return -1;
+                }
+                break;
+        }
+
+        old_errno = errno;
+        errno = 0;
+        value = strtoull (str, &tail, base);
+
+        if (errno == ERANGE || errno == EINVAL)
+        {
+                return -1;
+        }
+
+        if (errno == 0)
+        {
+                errno = old_errno;
+        }
+
+        if (tail == str)
+        {
+                /* no digits at all, e.g. the empty string */
+                errno = EINVAL;
+                return -1;
+        }
+
+        if (tail[0] != '\0')
+        {
+                /* bala: invalid integer format */
+                return -1;
+        }
+
+        *n = value;
+
+        return 0;
+}
+
+/*
+ * Public conversion helpers.
+ *
+ * All of them return 0 and store the value in *n on success, or return -1
+ * on failure.  The plain variants pass base 0 to the strto*() family, the
+ * *_base10 variants pass base 10.  The fixed-width variants set errno to
+ * ERANGE when the parsed value does not fit the destination type.
+ */
+
+int
+gf_string2long (const char *str, long *n)
+{
+        return _gf_string2long (str, n, 0);
+}
+
+int
+gf_string2ulong (const char *str, unsigned long *n)
+{
+        return _gf_string2ulong (str, n, 0);
+}
+
+/*
+ * gf_string2int - parse @str into an int.
+ *
+ * The value is parsed into a long and narrowed only after a range check.
+ * Casting @n to (long *) instead would store sizeof (long) bytes into an
+ * int object, which overruns it wherever long is wider than int.
+ */
+int
+gf_string2int (const char *str, int *n)
+{
+        long l = 0L;
+        int rv = 0;
+
+        if (n == NULL)
+        {
+                errno = EINVAL;
+                return -1;
+        }
+
+        rv = _gf_string2long (str, &l, 0);
+        if (rv != 0)
+                return rv;
+
+        /* round trip through int to detect values that do not fit */
+        if ((long) (int) l != l)
+        {
+                errno = ERANGE;
+                return -1;
+        }
+
+        *n = (int) l;
+        return 0;
+}
+
+int
+gf_string2uint (const char *str, unsigned int *n)
+{
+        return _gf_string2uint (str, n, 0);
+}
+
+int
+gf_string2longlong (const char *str, long long *n)
+{
+        return _gf_string2longlong (str, n, 0);
+}
+
+int
+gf_string2ulonglong (const char *str, unsigned long long *n)
+{
+        return _gf_string2ulonglong (str, n, 0);
+}
+
+int
+gf_string2int8 (const char *str, int8_t *n)
+{
+        long l = 0L;
+        int rv = 0;
+
+        rv = _gf_string2long (str, &l, 0);
+        if (rv != 0)
+                return rv;
+
+        if (l >= INT8_MIN && l <= INT8_MAX)
+        {
+                *n = (int8_t) l;
+                return 0;
+        }
+
+        errno = ERANGE;
+        return -1;
+}
+
+int
+gf_string2int16 (const char *str, int16_t *n)
+{
+        long l = 0L;
+        int rv = 0;
+
+        rv = _gf_string2long (str, &l, 0);
+        if (rv != 0)
+                return rv;
+
+        if (l >= INT16_MIN && l <= INT16_MAX)
+        {
+                *n = (int16_t) l;
+                return 0;
+        }
+
+        errno = ERANGE;
+        return -1;
+}
+
+int
+gf_string2int32 (const char *str, int32_t *n)
+{
+        long l = 0L;
+        int rv = 0;
+
+        rv = _gf_string2long (str, &l, 0);
+        if (rv != 0)
+                return rv;
+
+        if (l >= INT32_MIN && l <= INT32_MAX)
+        {
+                *n = (int32_t) l;
+                return 0;
+        }
+
+        errno = ERANGE;
+        return -1;
+}
+
+int
+gf_string2int64 (const char *str, int64_t *n)
+{
+        long long l = 0LL;
+        int rv = 0;
+
+        rv = _gf_string2longlong (str, &l, 0);
+        if (rv != 0)
+                return rv;
+
+        if (l >= INT64_MIN && l <= INT64_MAX)
+        {
+                *n = (int64_t) l;
+                return 0;
+        }
+
+        errno = ERANGE;
+        return -1;
+}
+
+int
+gf_string2uint8 (const char *str, uint8_t *n)
+{
+        unsigned long l = 0L;
+        int rv = 0;
+
+        rv = _gf_string2ulong (str, &l, 0);
+        if (rv != 0)
+                return rv;
+
+        if (l >= 0 && l <= UINT8_MAX)
+        {
+                *n = (uint8_t) l;
+                return 0;
+        }
+
+        errno = ERANGE;
+        return -1;
+}
+
+int
+gf_string2uint16 (const char *str, uint16_t *n)
+{
+        unsigned long l = 0L;
+        int rv = 0;
+
+        rv = _gf_string2ulong (str, &l, 0);
+        if (rv != 0)
+                return rv;
+
+        if (l >= 0 && l <= UINT16_MAX)
+        {
+                *n = (uint16_t) l;
+                return 0;
+        }
+
+        errno = ERANGE;
+        return -1;
+}
+
+int
+gf_string2uint32 (const char *str, uint32_t *n)
+{
+        unsigned long l = 0L;
+        int rv = 0;
+
+        rv = _gf_string2ulong (str, &l, 0);
+        if (rv != 0)
+                return rv;
+
+        if (l >= 0 && l <= UINT32_MAX)
+        {
+                *n = (uint32_t) l;
+                return 0;
+        }
+
+        errno = ERANGE;
+        return -1;
+}
+
+int
+gf_string2uint64 (const char *str, uint64_t *n)
+{
+        unsigned long long l = 0ULL;
+        int rv = 0;
+
+        rv = _gf_string2ulonglong (str, &l, 0);
+        if (rv != 0)
+                return rv;
+
+        if (l >= 0 && l <= UINT64_MAX)
+        {
+                *n = (uint64_t) l;
+                return 0;
+        }
+
+        errno = ERANGE;
+        return -1;
+}
+
+int
+gf_string2ulong_base10 (const char *str, unsigned long *n)
+{
+        return _gf_string2ulong (str, n, 10);
+}
+
+int
+gf_string2uint_base10 (const char *str, unsigned int *n)
+{
+        return _gf_string2uint (str, n, 10);
+}
+
+int
+gf_string2uint8_base10 (const char *str, uint8_t *n)
+{
+        unsigned long l = 0L;
+        int rv = 0;
+
+        rv = _gf_string2ulong (str, &l, 10);
+        if (rv != 0)
+                return rv;
+
+        if (l >= 0 && l <= UINT8_MAX)
+        {
+                *n = (uint8_t) l;
+                return 0;
+        }
+
+        errno = ERANGE;
+        return -1;
+}
+
+int
+gf_string2uint16_base10 (const char *str, uint16_t *n)
+{
+        unsigned long l = 0L;
+        int rv = 0;
+
+        rv = _gf_string2ulong (str, &l, 10);
+        if (rv != 0)
+                return rv;
+
+        if (l >= 0 && l <= UINT16_MAX)
+        {
+                *n = (uint16_t) l;
+                
return 0;
+        }
+
+        errno = ERANGE;
+        return -1;
+}
+
+int
+gf_string2uint32_base10 (const char *str, uint32_t *n)
+{
+        unsigned long l = 0L;
+        int rv = 0;
+
+        rv = _gf_string2ulong (str, &l, 10);
+        if (rv != 0)
+                return rv;
+
+        if (l >= 0 && l <= UINT32_MAX)
+        {
+                *n = (uint32_t) l;
+                return 0;
+        }
+
+        errno = ERANGE;
+        return -1;
+}
+
+int
+gf_string2uint64_base10 (const char *str, uint64_t *n)
+{
+        unsigned long long l = 0ULL;
+        int rv = 0;
+
+        rv = _gf_string2ulonglong (str, &l, 10);
+        if (rv != 0)
+                return rv;
+
+        if (l >= 0 && l <= UINT64_MAX)
+        {
+                *n = (uint64_t) l;
+                return 0;
+        }
+
+        errno = ERANGE;
+        return -1;
+}
+
+/*
+ * gf_string2bytesize - parse a decimal size with an optional unit.
+ *
+ * @str: digits optionally followed directly by one of the GF_UNIT_*_STRING
+ *       suffixes, compared case-insensitively.
+ * @n:   receives the size in bytes.
+ *
+ * Returns 0 on success and -1 on failure.  A string without digits, an
+ * unknown suffix, a leading '-' or a product that does not fit in 64 bits
+ * is rejected.
+ */
+int
+gf_string2bytesize (const char *str, uint64_t *n)
+{
+        uint64_t value = 0ULL;
+        uint64_t unit = 1ULL;
+        char *tail = NULL;
+        int old_errno = 0;
+        const char *s = NULL;
+
+        if (str == NULL || n == NULL)
+        {
+                errno = EINVAL;
+                return -1;
+        }
+
+        for (s = str; *s != '\0'; s++)
+        {
+                if (isspace ((unsigned char) *s))
+                {
+                        continue;
+                }
+                if (*s == '-')
+                {
+                        /* bala: we do not support suffixed (-) sign and
+                           invalid integer format */
+                        return -1;
+                }
+                break;
+        }
+
+        old_errno = errno;
+        errno = 0;
+        value = strtoull (str, &tail, 10);
+
+        if (errno == ERANGE || errno == EINVAL)
+        {
+                return -1;
+        }
+
+        if (errno == 0)
+        {
+                errno = old_errno;
+        }
+
+        if (tail == str)
+        {
+                /* a bare unit such as "KB" is not a size */
+                errno = EINVAL;
+                return -1;
+        }
+
+        if (tail[0] != '\0')
+        {
+                if (strcasecmp (tail, GF_UNIT_KB_STRING) == 0)
+                {
+                        unit = GF_UNIT_KB;
+                }
+                else if (strcasecmp (tail, GF_UNIT_MB_STRING) == 0)
+                {
+                        unit = GF_UNIT_MB;
+                }
+                else if (strcasecmp (tail, GF_UNIT_GB_STRING) == 0)
+                {
+                        unit = GF_UNIT_GB;
+                }
+                else if (strcasecmp (tail, GF_UNIT_TB_STRING) == 0)
+                {
+                        unit = GF_UNIT_TB;
+                }
+                else if (strcasecmp (tail, GF_UNIT_PB_STRING) == 0)
+                {
+                        unit = GF_UNIT_PB;
+                }
+                else
+                {
+                        /* bala: invalid integer format */
+                        return -1;
+                }
+        }
+
+        /* unsigned multiplication wraps silently; check before scaling */
+        if (value > UINT64_MAX / unit)
+        {
+                errno = ERANGE;
+                return -1;
+        }
+
+        *n = value * unit;
+
+        return 0;
+}
+
+/*
+ * gf_str_to_long_long - lenient size parser.
+ *
+ * Parses a leading integer with strtoll() (base 0) and multiplies it by
+ * 1024, 1024^2 or 1024^3 when the number is followed by "KB", "MB" or
+ * "GB" in any letter case.  Any other trailing text, including a lone
+ * 'K', 'M' or 'G', leaves the value unscaled.  Returns 0 for NULL input.
+ * There is no error reporting and no overflow check.
+ */
+int64_t
+gf_str_to_long_long (const char *number)
+{
+        int64_t unit = 1;
+        int64_t ret = 0;
+        char *endptr = NULL ;
+        if (!number)
+                return 0;
+
+        ret = strtoll (number, &endptr, 0);
+
+        if (endptr) {
+                switch (*endptr) {
+                case 'G':
+                case 'g':
+                        if ((* (endptr + 1) == 'B') ||(* (endptr + 1) == 'b'))
+                                unit = 1024 * 1024 * 1024;
+                        break;
+                case 'M':
+                case 'm':
+                        if ((* (endptr + 1) == 'B') ||(* (endptr + 1) == 'b'))
+                                unit = 1024 * 1024;
+                        break;
+                case 'K':
+                case 'k':
+                        if ((* (endptr + 1) == 'B') ||(* (endptr + 1) == 'b'))
+                                unit = 1024;
+                        break;
+                case '%':
+                        unit = 1;
+                        break;
+                default:
+                        unit = 1;
+                        break;
+                }
+        }
+        return ret * unit;
+}
+
+/*
+ * gf_string2boolean - map an option string to a gf_boolean_t.
+ *
+ * "1", "on", "yes", "true" and "enable" give _gf_true; "0", "off", "no",
+ * "false" and "disable" give _gf_false; matching ignores case.
+ *
+ * Returns 0 on success, -1 for NULL arguments or any other string.
+ */
+int
+gf_string2boolean (const char *str, gf_boolean_t *b)
+{
+        if (str == NULL || b == NULL) {
+                return -1;
+        }
+
+        if ((strcasecmp (str, "1") == 0) ||
+            (strcasecmp (str, "on") == 0) ||
+            (strcasecmp (str, "yes") == 0) ||
+            (strcasecmp (str, "true") == 0) ||
+            (strcasecmp (str, "enable") == 0)) {
+                *b = _gf_true;
+                return 0;
+        }
+
+        if ((strcasecmp (str, "0") == 0) ||
+            (strcasecmp (str, "off") == 0) ||
+            (strcasecmp (str, "no") == 0) ||
+            (strcasecmp (str, "false") == 0) ||
+            (strcasecmp (str, "disable") == 0)) {
+                *b = _gf_false;
+                return 0;
+        }
+
+        return -1;
+}
+
+
+/*
+ * gf_lockfd - request a write lock on the whole file behind @fd without
+ * blocking (F_SETLK).  Returns the result of fcntl().
+ */
+int
+gf_lockfd (int fd)
+{
+        struct flock fl;
+
+        /* zero the members that are not set explicitly below */
+        memset (&fl, 0, sizeof (fl));
+        fl.l_type = F_WRLCK;
+        fl.l_whence = SEEK_SET;
+        fl.l_start = 0;
+        fl.l_len = 0;
+
+        return fcntl (fd, F_SETLK, &fl);
+}
+
+
+/*
+ * gf_unlockfd - release the lock covering the whole file behind @fd.
+ * Returns the result of fcntl().
+ */
+int
+gf_unlockfd (int fd)
+{
+        struct flock fl;
+
+        /* zero the members that are not set explicitly below */
+        memset (&fl, 0, sizeof (fl));
+        fl.l_type = F_UNLCK;
+        fl.l_whence = SEEK_SET;
+        fl.l_start = 0;
+        fl.l_len = 0;
+
+        return fcntl (fd, F_SETLK, &fl);
+}
+
diff --git a/libglusterfs/src/common-utils.h b/libglusterfs/src/common-utils.h
new file mode 100644
index 000000000..2016b51eb
--- /dev/null
+++ b/libglusterfs/src/common-utils.h
@@ -0,0 +1,313 @@
+/*
+   Copyright (c) 2006, 2007, 2008 Z RESEARCH, Inc.
+   This file is part of GlusterFS.
+
+   GlusterFS is free software; you can redistribute it and/or modify
+   it under the terms of the GNU General Public License as published
+   by the Free Software Foundation; either version 3 of the License,
+   or (at your option) any later version.
+
+   GlusterFS is distributed in the hope that it will be useful, but
+   WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   General Public License for more details.
+
+   You should have received a copy of the GNU General Public License
+   along with this program.  If not, see
+   <http://www.gnu.org/licenses/>.
+*/
+
+#ifndef _COMMON_UTILS_H
+#define _COMMON_UTILS_H
+
+#ifndef _CONFIG_H
+#define _CONFIG_H
+#include "config.h"
+#endif
+
+/* NOTE(review): the header names of the bare #include lines below were
+   lost when this patch text was extracted; restore them from the
+   repository before applying. */
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#ifndef GF_BSD_HOST_OS
+#include
+#endif
+
+void trap (void);
+
+#define GF_UNIVERSAL_ANSWER 42    /* :O */
+
+/* To solve type punned error */
+#define VOID(ptr) ((void **) ((void *) (ptr)))
+
+#include "logging.h"
+#include "glusterfs.h"
+#include "locking.h"
+#include "mem-pool.h"
+
+
+/* These macros evaluate their arguments more than once: do not pass
+   expressions with side effects.
+   roof(a,b)  rounds a up to a multiple of b.
+   floor(a,b) rounds a down to a multiple of b. */
+#define min(a,b) ((a)<(b)?(a):(b))
+#define max(a,b) ((a)>(b)?(a):(b))
+#define roof(a,b) ((((a)+(b)-1)/((b)?(b):1))*(b))
+#define floor(a,b) (((a)/((b)?(b):1))*(b))
+
+
+/* Binary size units and the suffixes gf_string2bytesize() accepts */
+#define GF_UNIT_KB    1024ULL
+#define GF_UNIT_MB    1048576ULL
+#define GF_UNIT_GB    1073741824ULL
+#define GF_UNIT_TB    1099511627776ULL
+#define GF_UNIT_PB    1125899906842624ULL
+
+#define GF_UNIT_KB_STRING    "KB"
+#define GF_UNIT_MB_STRING    "MB"
+#define GF_UNIT_GB_STRING    "GB"
+#define GF_UNIT_TB_STRING    "TB"
+#define GF_UNIT_PB_STRING    "PB"
+
+
+/* Abort the process when ptr is NULL.  Expands to a bare `if', so do not
+   use it as the unbraced body of an if/else. */
+#define ERR_ABORT(ptr)                          \
+        if ((ptr) == NULL)  {                   \
+                abort ();                       \
+        }
+
+enum _gf_boolean
+{
+        _gf_false = 0,
+        _gf_true = 1
+};
+
+typedef enum _gf_boolean gf_boolean_t;
+
+void gf_global_variable_init(void);
+void set_global_ctx_ptr (glusterfs_ctx_t *ctx);
+glusterfs_ctx_t *get_global_ctx_ptr (void);
+
+in_addr_t gf_resolve_ip (const char *hostname, void **dnscache);
+
+void gf_log_volume_file (FILE *specfp);
+void gf_print_trace (int32_t signal);
+
+extern char *gf_fop_list[GF_FOP_MAXVALUE];
+extern char *gf_mop_list[GF_MOP_MAXVALUE];
+extern char *gf_cbk_list[GF_CBK_MAXVALUE];
+
+/* Size in bytes of an array of `count' struct iovec */
+#define VECTORSIZE(count) ((count) * (sizeof (struct iovec)))
+
+/* Length of str including its terminating NUL */
+#define STRLEN_0(str) (strlen(str) + 1)
+
+/* Jump to `label' with errno = EINVAL when `arg' is false.  Expects a
+   variable named `this' in scope.
+   NOTE(review): the expansion already ends in ';', so the macro cannot be
+   the unbraced body of an if that has an else. */
+#define VALIDATE_OR_GOTO(arg,label)   do {                              \
+                if (!(arg)) {                                           \
+                        errno = EINVAL;                                 \
+                        gf_log ((this ? this->name : "(Govinda! Govinda!)"), \
+                                GF_LOG_ERROR,                           \
+                                "invalid argument: " #arg);             \
+                        goto label;                                     \
+                }                                                       \
+        } while (0); 
+
+/* Same as VALIDATE_OR_GOTO, logging under an explicit `name' */
+#define GF_VALIDATE_OR_GOTO(name,arg,label)   do {                      \
+                if (!(arg)) {                                           \
+                        errno = EINVAL;                                 \
+                        gf_log (name, GF_LOG_ERROR,                     \
+                                "invalid argument: " #arg);             \
+                        goto label;                                     \
+                }                                                       \
+        } while (0); 
+
+
+#define GF_FILE_CONTENT_REQUESTED(_xattr_req,_content_limit) \
+        (dict_get_uint64 (_xattr_req, "glusterfs.content", _content_limit) == 0)
+
+/* Free every buffer referenced by vector[0..count) and the vector itself */
+static inline void
+iov_free (struct iovec *vector, int count)
+{
+        int i;
+
+        for (i = 0; i < count; i++)
+                FREE (vector[i].iov_base);
+
+        FREE (vector);
+}
+
+
+/* Sum of the iov_len members of vector[0..count).  The sum is computed as
+   a size_t and converted to the int return type. */
+static inline int
+iov_length (const struct iovec *vector, int count)
+{
+        int     i = 0;
+        size_t  size = 0;
+
+        for (i = 0; i < count; i++)
+                size += vector[i].iov_len;
+
+        return size;
+}
+
+
+/* Shallow copy of vector[0..count): the entries are duplicated, the
+   buffers they point at are shared.  Returns NULL when allocation fails. */
+static inline struct iovec *
+iov_dup (struct iovec *vector, int count)
+{
+        size_t        bytecount = 0;
+        int           i;
+        struct iovec *newvec = NULL;
+
+        /* size_t: an int could not hold the byte count of a large vector */
+        bytecount = (count * sizeof (struct iovec));
+        newvec = MALLOC (bytecount);
+        if (!newvec)
+                return NULL;
+
+        for (i = 0; i < count; i++) {
+                newvec[i].iov_len  = vector[i].iov_len;
+                newvec[i].iov_base = vector[i].iov_base;
+        }
+
+        return newvec;
+}
+
+
+/*
+ * iov_subset - select the part of orig[0..orig_count) that lies between
+ * the byte offsets src_offset and dst_offset.
+ *
+ * With new == NULL only the number of entries that would be produced is
+ * returned.  Otherwise those entries are written to new[], pointing into
+ * the original buffers, and their count is returned.
+ */
+static inline int
+iov_subset (struct iovec *orig, int orig_count,
+            off_t src_offset, off_t dst_offset,
+            struct iovec *new)
+{
+        int    new_count = 0;
+        int    i;
+        off_t  offset = 0;
+        size_t start_offset = 0;
+        size_t end_offset = 0;
+
+
+        for (i = 0; i < orig_count; i++) {
+                /* entry lies before or after the requested window */
+                if ((offset + orig[i].iov_len < src_offset)
+                    || (offset > dst_offset)) {
+                        goto not_subset;
+                }
+
+                if (!new) {
+                        goto count_only;
+                }
+
+                start_offset = 0;
+                end_offset = orig[i].iov_len;
+
+                if (src_offset >= offset) {
+                        start_offset = (src_offset - offset);
+                }
+
+                if (dst_offset <= (offset + orig[i].iov_len)) {
+                        end_offset = (dst_offset - offset);
+                }
+
+                new[new_count].iov_base = orig[i].iov_base + start_offset;
+                new[new_count].iov_len = end_offset - start_offset;
+
+        count_only:
+                new_count++;
+
+        not_subset:
+                offset += orig[i].iov_len;
+        }
+
+        return new_count;
+}
+
+
+/* Copy the contents of vector[0..count) back to back into buf.  The
+   caller provides a buffer large enough for all entries. */
+static inline void
+iov_unload (char *buf, const struct iovec *vector, int count)
+{
+        int    i;
+        size_t copied = 0;
+
+        for (i = 0; i < count; i++) {
+                memcpy (buf + copied, vector[i].iov_base, vector[i].iov_len);
+                copied += vector[i].iov_len;
+        }
+}
+
+
+/* Returns 0 when all `size' bytes of buf are zero, otherwise the value of
+   the first non-zero byte. */
+static inline int
+mem_0filled (const char *buf, size_t size)
+{
+        size_t i = 0;
+        int    ret = 0;
+
+        /* size_t index: an int would overflow before reaching a size
+           above INT_MAX */
+        for (i = 0; i < size; i++) {
+                ret = buf[i];
+                if (ret)
+                        break;
+        }
+
+        return ret;
+}
+
+
+/* Returns 0 when every buffer in vector[0..count) is all zero, otherwise
+   the non-zero result of mem_0filled() for the first buffer that is not. */
+static inline int
+iov_0filled (struct iovec *vector, int count)
+{
+        int i = 0;
+        int ret = 0;
+
+        for (i = 0; i < count; i++) {
+                ret = mem_0filled (vector[i].iov_base, vector[i].iov_len);
+                if (ret)
+                        break;
+        }
+
+        return ret;
+}
+
+
+/* Return a newly allocated copy of `size' bytes at ptr, or NULL when the
+   allocation fails. */
+static inline void *
+memdup (const void *ptr, size_t size)
+{
+        void *newptr = NULL;
+
+        newptr = MALLOC (size);
+        if (!newptr)
+                return NULL;
+
+        memcpy (newptr, ptr, size);
+        return newptr;
+}
+
+
+char *gf_trim (char *string);
+int gf_strsplit (const char *str, const char *delim,
+                 char ***tokens, int *token_count);
+int gf_volume_name_validate (const char *volume_name);
+
+int gf_string2long (const char *str, long *n);
+int gf_string2ulong (const char *str, unsigned long *n);
+int gf_string2int (const char *str, int *n);
+int gf_string2uint (const char *str, unsigned int *n);
+int gf_string2longlong (const char *str, long long *n);
+int gf_string2ulonglong (const char *str, unsigned long long *n);
+
+int gf_string2int8 (const char *str, int8_t *n);
+int gf_string2int16 (const char *str, int16_t *n);
+int gf_string2int32 (const char *str, int32_t *n);
+int gf_string2int64 (const char *str, int64_t *n);
+int gf_string2uint8 (const char *str, uint8_t *n);
+int gf_string2uint16 (const char *str, uint16_t *n);
+int gf_string2uint32 (const char *str, uint32_t *n);
+int 
gf_string2uint64 (const char *str, uint64_t *n); + +int gf_string2ulong_base10 (const char *str, unsigned long *n); +int gf_string2uint_base10 (const char *str, unsigned int *n); +int gf_string2uint8_base10 (const char *str, uint8_t *n); +int gf_string2uint16_base10 (const char *str, uint16_t *n); +int gf_string2uint32_base10 (const char *str, uint32_t *n); +int gf_string2uint64_base10 (const char *str, uint64_t *n); + +int gf_string2bytesize (const char *str, uint64_t *n); + +int gf_string2boolean (const char *str, gf_boolean_t *b); +int gf_string2percent (const char *str, uint32_t *n); +int gf_string2time (const char *str, uint32_t *n); + +int gf_lockfd (int fd); +int gf_unlockfd (int fd); + +#endif /* _COMMON_UTILS_H */ + diff --git a/libglusterfs/src/compat-errno.c b/libglusterfs/src/compat-errno.c new file mode 100644 index 000000000..a4a6c7106 --- /dev/null +++ b/libglusterfs/src/compat-errno.c @@ -0,0 +1,938 @@ +/* + Copyright (c) 2008 Z RESEARCH, Inc. + This file is part of GlusterFS. + + GlusterFS is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published + by the Free Software Foundation; either version 3 of the License, + or (at your option) any later version. + + GlusterFS is distributed in the hope that it will be useful, but + WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program. If not, see + . 
+*/ + +#ifndef _CONFIG_H +#define _CONFIG_H +#include "config.h" +#endif + +#include + +#include "compat-errno.h" + + +static int32_t gf_error_to_errno_array[1024]; +static int32_t gf_errno_to_error_array[1024]; + +static int32_t gf_compat_errno_init_done; + +#ifdef GF_SOLARIS_HOST_OS +static void +init_compat_errno_arrays () +{ +/* ENOMSG 35 / * No message of desired type */ + gf_error_to_errno_array[GF_ERROR_CODE_NOMSG] = ENOMSG; + gf_errno_to_error_array[ENOMSG] = GF_ERROR_CODE_NOMSG; + +/* EIDRM 36 / * Identifier removed */ + gf_error_to_errno_array[GF_ERROR_CODE_IDRM] = EIDRM; + gf_errno_to_error_array[EIDRM] = GF_ERROR_CODE_IDRM; + +/* ECHRNG 37 / * Channel number out of range */ + gf_error_to_errno_array[GF_ERROR_CODE_CHRNG] = ECHRNG; + gf_errno_to_error_array[ECHRNG] = GF_ERROR_CODE_CHRNG; + +/* EL2NSYNC 38 / * Level 2 not synchronized */ + gf_error_to_errno_array[GF_ERROR_CODE_L2NSYNC] = EL2NSYNC; + gf_errno_to_error_array[EL2NSYNC] = GF_ERROR_CODE_L2NSYNC; + +/* EL3HLT 39 / * Level 3 halted */ + gf_error_to_errno_array[GF_ERROR_CODE_L3HLT] = EL3HLT; + gf_errno_to_error_array[EL3HLT] = GF_ERROR_CODE_L3HLT; + +/* EL3RST 40 / * Level 3 reset */ + gf_error_to_errno_array[GF_ERROR_CODE_L3RST] = EL3RST; + gf_errno_to_error_array[EL3RST] = GF_ERROR_CODE_L3RST; + +/* ELNRNG 41 / * Link number out of range */ + gf_error_to_errno_array[GF_ERROR_CODE_LNRNG] = ELNRNG; + gf_errno_to_error_array[ELNRNG] = GF_ERROR_CODE_LNRNG; + +/* EUNATCH 42 / * Protocol driver not attached */ + gf_error_to_errno_array[GF_ERROR_CODE_UNATCH] = EUNATCH; + gf_errno_to_error_array[EUNATCH] = GF_ERROR_CODE_UNATCH; + +/* ENOCSI 43 / * No CSI structure available */ + gf_error_to_errno_array[GF_ERROR_CODE_NOCSI] = ENOCSI; + gf_errno_to_error_array[ENOCSI] = GF_ERROR_CODE_NOCSI; + +/* EL2HLT 44 / * Level 2 halted */ + gf_error_to_errno_array[GF_ERROR_CODE_L2HLT] = EL2HLT; + gf_errno_to_error_array[EL2HLT] = GF_ERROR_CODE_L2HLT; + +/* EDEADLK 45 / * Deadlock condition. 
*/ + gf_error_to_errno_array[GF_ERROR_CODE_DEADLK] = EDEADLK; + gf_errno_to_error_array[EDEADLK] = GF_ERROR_CODE_DEADLK; + +/* ENOLCK 46 / * No record locks available. */ + gf_error_to_errno_array[GF_ERROR_CODE_NOLCK] = ENOLCK; + gf_errno_to_error_array[ENOLCK] = GF_ERROR_CODE_NOLCK; + +/* ECANCELED 47 / * Operation canceled */ + gf_error_to_errno_array[GF_ERROR_CODE_CANCELED] = ECANCELED; + gf_errno_to_error_array[ECANCELED] = GF_ERROR_CODE_CANCELED; + +/* ENOTSUP 48 / * Operation not supported */ + gf_error_to_errno_array[GF_ERROR_CODE_NOTSUPP] = ENOTSUP; + gf_errno_to_error_array[ENOTSUP] = GF_ERROR_CODE_NOTSUPP; + +/* Filesystem Quotas */ +/* EDQUOT 49 / * Disc quota exceeded */ + gf_error_to_errno_array[GF_ERROR_CODE_DQUOT] = EDQUOT; + gf_errno_to_error_array[EDQUOT] = GF_ERROR_CODE_DQUOT; + +/* Convergent Error Returns */ +/* EBADE 50 / * invalid exchange */ + gf_error_to_errno_array[GF_ERROR_CODE_BADE] = EBADE; + gf_errno_to_error_array[EBADE] = GF_ERROR_CODE_BADE; +/* EBADR 51 / * invalid request descriptor */ + gf_error_to_errno_array[GF_ERROR_CODE_BADR] = EBADR; + gf_errno_to_error_array[EBADR] = GF_ERROR_CODE_BADR; +/* EXFULL 52 / * exchange full */ + gf_error_to_errno_array[GF_ERROR_CODE_XFULL] = EXFULL; + gf_errno_to_error_array[EXFULL] = GF_ERROR_CODE_XFULL; +/* ENOANO 53 / * no anode */ + gf_error_to_errno_array[GF_ERROR_CODE_NOANO] = ENOANO; + gf_errno_to_error_array[ENOANO] = GF_ERROR_CODE_NOANO; +/* EBADRQC 54 / * invalid request code */ + gf_error_to_errno_array[GF_ERROR_CODE_BADRQC] = EBADRQC; + gf_errno_to_error_array[EBADRQC] = GF_ERROR_CODE_BADRQC; +/* EBADSLT 55 / * invalid slot */ + gf_error_to_errno_array[GF_ERROR_CODE_BADSLT] = EBADSLT; + gf_errno_to_error_array[EBADSLT] = GF_ERROR_CODE_BADSLT; +/* EDEADLOCK 56 / * file locking deadlock error */ +/* This is same as EDEADLK on linux */ + gf_error_to_errno_array[GF_ERROR_CODE_DEADLK] = EDEADLOCK; + gf_errno_to_error_array[EDEADLOCK] = GF_ERROR_CODE_DEADLK; + +/* EBFONT 57 / * bad font file 
fmt */ + gf_error_to_errno_array[GF_ERROR_CODE_BFONT] = EBFONT; + gf_errno_to_error_array[EBFONT] = GF_ERROR_CODE_BFONT; + +/* Interprocess Robust Locks */ +/* EOWNERDEAD 58 / * process died with the lock */ + gf_error_to_errno_array[GF_ERROR_CODE_OWNERDEAD] = EOWNERDEAD; + gf_errno_to_error_array[EOWNERDEAD] = GF_ERROR_CODE_OWNERDEAD; +/* ENOTRECOVERABLE 59 / * lock is not recoverable */ + gf_error_to_errno_array[GF_ERROR_CODE_NOTRECOVERABLE] = ENOTRECOVERABLE; + gf_errno_to_error_array[ENOTRECOVERABLE] = GF_ERROR_CODE_NOTRECOVERABLE; + +/* stream problems */ +/* ENOSTR 60 / * Device not a stream */ + gf_error_to_errno_array[GF_ERROR_CODE_NOSTR] = ENOSTR; + gf_errno_to_error_array[ENOSTR] = GF_ERROR_CODE_NOSTR; +/* ENODATA 61 / * no data (for no delay io) */ + gf_error_to_errno_array[GF_ERROR_CODE_NODATA] = ENODATA; + gf_errno_to_error_array[ENODATA] = GF_ERROR_CODE_NODATA; +/* ETIME 62 / * timer expired */ + gf_error_to_errno_array[GF_ERROR_CODE_TIME] = ETIME; + gf_errno_to_error_array[ETIME] = GF_ERROR_CODE_TIME; +/* ENOSR 63 / * out of streams resources */ + gf_error_to_errno_array[GF_ERROR_CODE_NOSR] = ENOSR; + gf_errno_to_error_array[ENOSR] = GF_ERROR_CODE_NOSR; + +/* ENONET 64 / * Machine is not on the network */ + gf_error_to_errno_array[GF_ERROR_CODE_NONET] = ENONET; + gf_errno_to_error_array[ENONET] = GF_ERROR_CODE_NONET; +/* ENOPKG 65 / * Package not installed */ + gf_error_to_errno_array[GF_ERROR_CODE_NOPKG] = ENOPKG; + gf_errno_to_error_array[ENOPKG] = GF_ERROR_CODE_NOPKG; +/* EREMOTE 66 / * The object is remote */ + gf_error_to_errno_array[GF_ERROR_CODE_REMOTE] = EREMOTE; + gf_errno_to_error_array[EREMOTE] = GF_ERROR_CODE_REMOTE; +/* ENOLINK 67 / * the link has been severed */ + gf_error_to_errno_array[GF_ERROR_CODE_NOLINK] = ENOLINK; + gf_errno_to_error_array[ENOLINK] = GF_ERROR_CODE_NOLINK; +/* EADV 68 / * advertise error */ + gf_error_to_errno_array[GF_ERROR_CODE_ADV] = EADV; + gf_errno_to_error_array[EADV] = GF_ERROR_CODE_ADV; +/* ESRMNT 69 / * 
srmount error */ + gf_error_to_errno_array[GF_ERROR_CODE_SRMNT] = ESRMNT; + gf_errno_to_error_array[ESRMNT] = GF_ERROR_CODE_SRMNT; + +/* ECOMM 70 / * Communication error on send */ + gf_error_to_errno_array[GF_ERROR_CODE_COMM] = ECOMM; + gf_errno_to_error_array[ECOMM] = GF_ERROR_CODE_COMM; +/* EPROTO 71 / * Protocol error */ + gf_error_to_errno_array[GF_ERROR_CODE_PROTO] = EPROTO; + gf_errno_to_error_array[EPROTO] = GF_ERROR_CODE_PROTO; + +/* Interprocess Robust Locks */ +/* ELOCKUNMAPPED 72 / * locked lock was unmapped */ + gf_error_to_errno_array[GF_ERROR_CODE_LOCKUNMAPPED] = ELOCKUNMAPPED; + gf_errno_to_error_array[ELOCKUNMAPPED] = GF_ERROR_CODE_LOCKUNMAPPED; + +/* ENOTACTIVE 73 / * Facility is not active */ + gf_error_to_errno_array[GF_ERROR_CODE_NOTACTIVE] = ENOTACTIVE; + gf_errno_to_error_array[ENOTACTIVE] = GF_ERROR_CODE_NOTACTIVE; +/* EMULTIHOP 74 / * multihop attempted */ + gf_error_to_errno_array[GF_ERROR_CODE_MULTIHOP] = EMULTIHOP; + gf_errno_to_error_array[EMULTIHOP] = GF_ERROR_CODE_MULTIHOP; +/* EBADMSG 77 / * trying to read unreadable message */ + gf_error_to_errno_array[GF_ERROR_CODE_BADMSG] = EBADMSG; + gf_errno_to_error_array[EBADMSG] = GF_ERROR_CODE_BADMSG; +/* ENAMETOOLONG 78 / * path name is too long */ + gf_error_to_errno_array[GF_ERROR_CODE_NAMETOOLONG] = ENAMETOOLONG; + gf_errno_to_error_array[ENAMETOOLONG] = GF_ERROR_CODE_NAMETOOLONG; +/* EOVERFLOW 79 / * value too large to be stored in data type */ + gf_error_to_errno_array[GF_ERROR_CODE_OVERFLOW] = EOVERFLOW; + gf_errno_to_error_array[EOVERFLOW] = GF_ERROR_CODE_OVERFLOW; +/* ENOTUNIQ 80 / * given log. name not unique */ + gf_error_to_errno_array[GF_ERROR_CODE_NOTUNIQ] = ENOTUNIQ; + gf_errno_to_error_array[ENOTUNIQ] = GF_ERROR_CODE_NOTUNIQ; +/* EBADFD 81 / * f.d. 
invalid for this operation */ + gf_error_to_errno_array[GF_ERROR_CODE_BADFD] = EBADFD; + gf_errno_to_error_array[EBADFD] = GF_ERROR_CODE_BADFD; +/* EREMCHG 82 / * Remote address changed */ + gf_error_to_errno_array[GF_ERROR_CODE_REMCHG] = EREMCHG; + gf_errno_to_error_array[EREMCHG] = GF_ERROR_CODE_REMCHG; + +/* shared library problems */ +/* ELIBACC 83 / * Can't access a needed shared lib. */ + gf_error_to_errno_array[GF_ERROR_CODE_LIBACC] = ELIBACC; + gf_errno_to_error_array[ELIBACC] = GF_ERROR_CODE_LIBACC; +/* ELIBBAD 84 / * Accessing a corrupted shared lib. */ + gf_error_to_errno_array[GF_ERROR_CODE_LIBBAD] = ELIBBAD; + gf_errno_to_error_array[ELIBBAD] = GF_ERROR_CODE_LIBBAD; +/* ELIBSCN 85 / * .lib section in a.out corrupted. */ + gf_error_to_errno_array[GF_ERROR_CODE_LIBSCN] = ELIBSCN; + gf_errno_to_error_array[ELIBSCN] = GF_ERROR_CODE_LIBSCN; +/* ELIBMAX 86 / * Attempting to link in too many libs. */ + gf_error_to_errno_array[GF_ERROR_CODE_LIBMAX] = ELIBMAX; + gf_errno_to_error_array[ELIBMAX] = GF_ERROR_CODE_LIBMAX; +/* ELIBEXEC 87 / * Attempting to exec a shared library. */ + gf_error_to_errno_array[GF_ERROR_CODE_LIBEXEC] = ELIBEXEC; + gf_errno_to_error_array[ELIBEXEC] = GF_ERROR_CODE_LIBEXEC; +/* EILSEQ 88 / * Illegal byte sequence. 
*/ + gf_error_to_errno_array[GF_ERROR_CODE_ILSEQ] = EILSEQ; + gf_errno_to_error_array[EILSEQ] = GF_ERROR_CODE_ILSEQ; +/* ENOSYS 89 / * Unsupported file system operation */ + gf_error_to_errno_array[GF_ERROR_CODE_NOSYS] = ENOSYS; + gf_errno_to_error_array[ENOSYS] = GF_ERROR_CODE_NOSYS; +/* ELOOP 90 / * Symbolic link loop */ + gf_error_to_errno_array[GF_ERROR_CODE_LOOP] = ELOOP; + gf_errno_to_error_array[ELOOP] = GF_ERROR_CODE_LOOP; +/* ERESTART 91 / * Restartable system call */ + gf_error_to_errno_array[GF_ERROR_CODE_RESTART] = ERESTART; + gf_errno_to_error_array[ERESTART] = GF_ERROR_CODE_RESTART; +/* ESTRPIPE 92 / * if pipe/FIFO, don't sleep in stream head */ + gf_error_to_errno_array[GF_ERROR_CODE_STRPIPE] = ESTRPIPE; + gf_errno_to_error_array[ESTRPIPE] = GF_ERROR_CODE_STRPIPE; +/* ENOTEMPTY 93 / * directory not empty */ + gf_error_to_errno_array[GF_ERROR_CODE_NOTEMPTY] = ENOTEMPTY; + gf_errno_to_error_array[ENOTEMPTY] = GF_ERROR_CODE_NOTEMPTY; +/* EUSERS 94 / * Too many users (for UFS) */ + gf_error_to_errno_array[GF_ERROR_CODE_USERS] = EUSERS; + gf_errno_to_error_array[EUSERS] = GF_ERROR_CODE_USERS; + +/* BSD Networking Software */ + /* argument errors */ +/* ENOTSOCK 95 / * Socket operation on non-socket */ + gf_error_to_errno_array[GF_ERROR_CODE_NOTSOCK] = ENOTSOCK; + gf_errno_to_error_array[ENOTSOCK] = GF_ERROR_CODE_NOTSOCK; +/* EDESTADDRREQ 96 / * Destination address required */ + gf_error_to_errno_array[GF_ERROR_CODE_DESTADDRREQ] = EDESTADDRREQ; + gf_errno_to_error_array[EDESTADDRREQ] = GF_ERROR_CODE_DESTADDRREQ; +/* EMSGSIZE 97 / * Message too long */ + gf_error_to_errno_array[GF_ERROR_CODE_MSGSIZE] = EMSGSIZE; + gf_errno_to_error_array[EMSGSIZE] = GF_ERROR_CODE_MSGSIZE; +/* EPROTOTYPE 98 / * Protocol wrong type for socket */ + gf_error_to_errno_array[GF_ERROR_CODE_PROTOTYPE] = EPROTOTYPE; + gf_errno_to_error_array[EPROTOTYPE] = GF_ERROR_CODE_PROTOTYPE; +/* ENOPROTOOPT 99 / * Protocol not available */ + gf_error_to_errno_array[GF_ERROR_CODE_NOPROTOOPT] = 
ENOPROTOOPT; + gf_errno_to_error_array[ENOPROTOOPT] = GF_ERROR_CODE_NOPROTOOPT; +/* EPROTONOSUPPORT 120 / * Protocol not supported */ + gf_error_to_errno_array[GF_ERROR_CODE_PROTONOSUPPORT] = EPROTONOSUPPORT; + gf_errno_to_error_array[EPROTONOSUPPORT] = GF_ERROR_CODE_PROTONOSUPPORT; +/* ESOCKTNOSUPPORT 121 / * Socket type not supported */ + gf_error_to_errno_array[GF_ERROR_CODE_SOCKTNOSUPPORT] = ESOCKTNOSUPPORT; + gf_errno_to_error_array[ESOCKTNOSUPPORT] = GF_ERROR_CODE_SOCKTNOSUPPORT; + +/* EOPNOTSUPP 122 / * Operation not supported on socket */ + gf_error_to_errno_array[GF_ERROR_CODE_OPNOTSUPP] = EOPNOTSUPP; + gf_errno_to_error_array[EOPNOTSUPP] = GF_ERROR_CODE_OPNOTSUPP; +/* EPFNOSUPPORT 123 / * Protocol family not supported */ + gf_error_to_errno_array[GF_ERROR_CODE_PFNOSUPPORT] = EPFNOSUPPORT; + gf_errno_to_error_array[EPFNOSUPPORT] = GF_ERROR_CODE_PFNOSUPPORT; +/* EAFNOSUPPORT 124 / * Address family not supported by */ + /* protocol family */ + gf_error_to_errno_array[GF_ERROR_CODE_AFNOSUPPORT] = EAFNOSUPPORT; + gf_errno_to_error_array[EAFNOSUPPORT] = GF_ERROR_CODE_AFNOSUPPORT; +/* EADDRINUSE 125 / * Address already in use */ + gf_error_to_errno_array[GF_ERROR_CODE_ADDRINUSE] = EADDRINUSE; + gf_errno_to_error_array[EADDRINUSE] = GF_ERROR_CODE_ADDRINUSE; +/* EADDRNOTAVAIL 126 / * Can't assign requested address */ + /* operational errors */ + gf_error_to_errno_array[GF_ERROR_CODE_ADDRNOTAVAIL] = EADDRNOTAVAIL; + gf_errno_to_error_array[EADDRNOTAVAIL] = GF_ERROR_CODE_ADDRNOTAVAIL; +/* ENETDOWN 127 / * Network is down */ + gf_error_to_errno_array[GF_ERROR_CODE_NETDOWN] = ENETDOWN; + gf_errno_to_error_array[ENETDOWN] = GF_ERROR_CODE_NETDOWN; +/* ENETUNREACH 128 / * Network is unreachable */ + gf_error_to_errno_array[GF_ERROR_CODE_NETUNREACH] = ENETUNREACH; + gf_errno_to_error_array[ENETUNREACH] = GF_ERROR_CODE_NETUNREACH; +/* ENETRESET 129 / * Network dropped connection because */ + /* of reset */ + gf_error_to_errno_array[GF_ERROR_CODE_NETRESET] = ENETRESET; + 
gf_errno_to_error_array[ENETRESET] = GF_ERROR_CODE_NETRESET; +/* ECONNABORTED 130 / * Software caused connection abort */ + gf_error_to_errno_array[GF_ERROR_CODE_CONNABORTED] = ECONNABORTED; + gf_errno_to_error_array[ECONNABORTED] = GF_ERROR_CODE_CONNABORTED; +/* ECONNRESET 131 / * Connection reset by peer */ + gf_error_to_errno_array[GF_ERROR_CODE_CONNRESET] = ECONNRESET; + gf_errno_to_error_array[ECONNRESET] = GF_ERROR_CODE_CONNRESET; +/* ENOBUFS 132 / * No buffer space available */ + gf_error_to_errno_array[GF_ERROR_CODE_NOBUFS] = ENOBUFS; + gf_errno_to_error_array[ENOBUFS] = GF_ERROR_CODE_NOBUFS; +/* EISCONN 133 / * Socket is already connected */ + gf_error_to_errno_array[GF_ERROR_CODE_ISCONN] = EISCONN; + gf_errno_to_error_array[EISCONN] = GF_ERROR_CODE_ISCONN; +/* ENOTCONN 134 / * Socket is not connected */ + gf_error_to_errno_array[GF_ERROR_CODE_NOTCONN] = ENOTCONN; + gf_errno_to_error_array[ENOTCONN] = GF_ERROR_CODE_NOTCONN; +/* XENIX has 135 - 142 */ +/* ESHUTDOWN 143 / * Can't send after socket shutdown */ + gf_error_to_errno_array[GF_ERROR_CODE_SHUTDOWN] = ESHUTDOWN; + gf_errno_to_error_array[ESHUTDOWN] = GF_ERROR_CODE_SHUTDOWN; +/* ETOOMANYREFS 144 / * Too many references: can't splice */ + gf_error_to_errno_array[GF_ERROR_CODE_TOOMANYREFS] = ETOOMANYREFS; + gf_errno_to_error_array[ETOOMANYREFS] = GF_ERROR_CODE_TOOMANYREFS; +/* ETIMEDOUT 145 / * Connection timed out */ + gf_error_to_errno_array[GF_ERROR_CODE_TIMEDOUT] = ETIMEDOUT; + gf_errno_to_error_array[ETIMEDOUT] = GF_ERROR_CODE_TIMEDOUT; + +/* ECONNREFUSED 146 / * Connection refused */ + gf_error_to_errno_array[GF_ERROR_CODE_CONNREFUSED] = ECONNREFUSED; + gf_errno_to_error_array[ECONNREFUSED] = GF_ERROR_CODE_CONNREFUSED; +/* EHOSTDOWN 147 / * Host is down */ + gf_error_to_errno_array[GF_ERROR_CODE_HOSTDOWN] = EHOSTDOWN; + gf_errno_to_error_array[EHOSTDOWN] = GF_ERROR_CODE_HOSTDOWN; +/* EHOSTUNREACH 148 / * No route to host */ + gf_error_to_errno_array[GF_ERROR_CODE_HOSTUNREACH] = EHOSTUNREACH; + 
gf_errno_to_error_array[EHOSTUNREACH] = GF_ERROR_CODE_HOSTUNREACH; +/* EALREADY 149 / * operation already in progress */ + gf_error_to_errno_array[GF_ERROR_CODE_ALREADY] = EALREADY; + gf_errno_to_error_array[EALREADY] = GF_ERROR_CODE_ALREADY; +/* EINPROGRESS 150 / * operation now in progress */ + gf_error_to_errno_array[GF_ERROR_CODE_INPROGRESS] = EINPROGRESS; + gf_errno_to_error_array[EINPROGRESS] = GF_ERROR_CODE_INPROGRESS; + +/* SUN Network File System */ +/* ESTALE 151 / * Stale NFS file handle */ + gf_error_to_errno_array[GF_ERROR_CODE_STALE] = ESTALE; + gf_errno_to_error_array[ESTALE] = GF_ERROR_CODE_STALE; + + return ; +} +#endif /* GF_SOLARIS_HOST_OS */ + +#ifdef GF_DARWIN_HOST_OS +static void +init_compat_errno_arrays () +{ + /* EDEADLK 11 / * Resource deadlock would occur */ + gf_error_to_errno_array[GF_ERROR_CODE_DEADLK] = EDEADLK; + gf_errno_to_error_array[EDEADLK] = GF_ERROR_CODE_DEADLK; + + /* EAGAIN 35 / * Try Again */ + gf_error_to_errno_array[GF_ERROR_CODE_AGAIN] = EAGAIN; + gf_errno_to_error_array[EAGAIN] = GF_ERROR_CODE_AGAIN; + + /* EINPROGRESS 36 / * Operation now in progress */ + gf_error_to_errno_array[GF_ERROR_CODE_INPROGRESS] = EINPROGRESS; + gf_errno_to_error_array[EINPROGRESS] = GF_ERROR_CODE_INPROGRESS; + + /* EALREADY 37 / * Operation already in progress */ + gf_error_to_errno_array[GF_ERROR_CODE_ALREADY] = EALREADY; + gf_errno_to_error_array[EALREADY] = GF_ERROR_CODE_ALREADY; + + /* ENOTSOCK 38 / * Socket operation on non-socket */ + gf_error_to_errno_array[GF_ERROR_CODE_NOTSOCK] = ENOTSOCK; + gf_errno_to_error_array[ENOTSOCK] = GF_ERROR_CODE_NOTSOCK; + + /* EDESTADDRREQ 39 / * Destination address required */ + gf_error_to_errno_array[GF_ERROR_CODE_DESTADDRREQ] = EDESTADDRREQ; + gf_errno_to_error_array[EDESTADDRREQ] = GF_ERROR_CODE_DESTADDRREQ; + + /* EMSGSIZE 40 / * Message too long */ + gf_error_to_errno_array[GF_ERROR_CODE_MSGSIZE] = EMSGSIZE; + gf_errno_to_error_array[EMSGSIZE] = GF_ERROR_CODE_MSGSIZE; + + /* EPROTOTYPE 41 / * 
Protocol wrong type for socket */ + gf_error_to_errno_array[GF_ERROR_CODE_PROTOTYPE] = EPROTOTYPE; + gf_errno_to_error_array[EPROTOTYPE] = GF_ERROR_CODE_PROTOTYPE; + + /* ENOPROTOOPT 42 / * Protocol not available */ + gf_error_to_errno_array[GF_ERROR_CODE_NOPROTOOPT] = ENOPROTOOPT; + gf_errno_to_error_array[ENOPROTOOPT] = GF_ERROR_CODE_NOPROTOOPT; + + /* EPROTONOSUPPORT 43 / * Protocol not supported */ + gf_error_to_errno_array[GF_ERROR_CODE_PROTONOSUPPORT] = EPROTONOSUPPORT; + gf_errno_to_error_array[EPROTONOSUPPORT] = GF_ERROR_CODE_PROTONOSUPPORT; + + /* ESOCKTNOSUPPORT 44 / * Socket type not supported */ + gf_error_to_errno_array[GF_ERROR_CODE_SOCKTNOSUPPORT] = ESOCKTNOSUPPORT; + gf_errno_to_error_array[ESOCKTNOSUPPORT] = GF_ERROR_CODE_SOCKTNOSUPPORT; + + /* EOPNOTSUPP 45 / * Operation not supported */ + gf_error_to_errno_array[GF_ERROR_CODE_OPNOTSUPP] = EOPNOTSUPP; + gf_errno_to_error_array[EOPNOTSUPP] = GF_ERROR_CODE_OPNOTSUPP; + + /* EPFNOSUPPORT 46 / * Protocol family not supported */ + gf_error_to_errno_array[GF_ERROR_CODE_PFNOSUPPORT] = EPFNOSUPPORT; + gf_errno_to_error_array[EPFNOSUPPORT] = GF_ERROR_CODE_PFNOSUPPORT; + + /* EAFNOSUPPORT 47 / * Address family not supported by protocol family */ + gf_error_to_errno_array[GF_ERROR_CODE_AFNOSUPPORT] = EAFNOSUPPORT; + gf_errno_to_error_array[EAFNOSUPPORT] = GF_ERROR_CODE_AFNOSUPPORT; + + /* EADDRINUSE 48 / * Address already in use */ + gf_error_to_errno_array[GF_ERROR_CODE_ADDRINUSE] = EADDRINUSE; + gf_errno_to_error_array[EADDRINUSE] = GF_ERROR_CODE_ADDRINUSE; + + /* EADDRNOTAVAIL 49 / * Can't assign requested address */ + gf_error_to_errno_array[GF_ERROR_CODE_ADDRNOTAVAIL] = EADDRNOTAVAIL; + gf_errno_to_error_array[EADDRNOTAVAIL] = GF_ERROR_CODE_ADDRNOTAVAIL; + + /* ENETDOWN 50 / * Network is down */ + gf_error_to_errno_array[GF_ERROR_CODE_NETDOWN] = ENETDOWN; + gf_errno_to_error_array[ENETDOWN] = GF_ERROR_CODE_NETDOWN; + + /* ENETUNREACH 51 / * Network is unreachable */ + 
gf_error_to_errno_array[GF_ERROR_CODE_NETUNREACH] = ENETUNREACH; + gf_errno_to_error_array[ENETUNREACH] = GF_ERROR_CODE_NETUNREACH; + + /* ENETRESET 52 / * Network dropped connection on reset */ + gf_error_to_errno_array[GF_ERROR_CODE_NETRESET] = ENETRESET; + gf_errno_to_error_array[ENETRESET] = GF_ERROR_CODE_NETRESET; + + /* ECONNABORTED 53 / * Software caused connection abort */ + gf_error_to_errno_array[GF_ERROR_CODE_CONNABORTED] = ECONNABORTED; + gf_errno_to_error_array[ECONNABORTED] = GF_ERROR_CODE_CONNABORTED; + + /* ECONNRESET 54 / * Connection reset by peer */ + gf_error_to_errno_array[GF_ERROR_CODE_CONNRESET] = ECONNRESET; + gf_errno_to_error_array[ECONNRESET] = GF_ERROR_CODE_CONNRESET; + + /* ENOBUFS 55 / * No buffer space available */ + gf_error_to_errno_array[GF_ERROR_CODE_NOBUFS] = ENOBUFS; + gf_errno_to_error_array[ENOBUFS] = GF_ERROR_CODE_NOBUFS; + + /* EISCONN 56 / * Socket is already connected */ + gf_error_to_errno_array[GF_ERROR_CODE_ISCONN] = EISCONN; + gf_errno_to_error_array[EISCONN] = GF_ERROR_CODE_ISCONN; + + /* ENOTCONN 57 / * Socket is not connected */ + gf_error_to_errno_array[GF_ERROR_CODE_NOTCONN] = ENOTCONN; + gf_errno_to_error_array[ENOTCONN] = GF_ERROR_CODE_NOTCONN; + + /* ESHUTDOWN 58 / * Can't send after socket shutdown */ + gf_error_to_errno_array[GF_ERROR_CODE_SHUTDOWN] = ESHUTDOWN; + gf_errno_to_error_array[ESHUTDOWN] = GF_ERROR_CODE_SHUTDOWN; + + /* ETOOMANYREFS 59 / * Too many references: can't splice */ + gf_error_to_errno_array[GF_ERROR_CODE_TOOMANYREFS] = ETOOMANYREFS; + gf_errno_to_error_array[ETOOMANYREFS] = GF_ERROR_CODE_TOOMANYREFS; + + /* ETIMEDOUT 60 / * Operation timed out */ + gf_error_to_errno_array[GF_ERROR_CODE_TIMEDOUT] = ETIMEDOUT; + gf_errno_to_error_array[ETIMEDOUT] = GF_ERROR_CODE_TIMEDOUT; + + /* ECONNREFUSED 61 / * Connection refused */ + gf_error_to_errno_array[GF_ERROR_CODE_CONNREFUSED] = ECONNREFUSED; + gf_errno_to_error_array[ECONNREFUSED] = GF_ERROR_CODE_CONNREFUSED; + + /* ELOOP 62 / * Too many 
levels of symbolic links */ + gf_error_to_errno_array[GF_ERROR_CODE_LOOP] = ELOOP; + gf_errno_to_error_array[ELOOP] = GF_ERROR_CODE_LOOP; + + /* ENAMETOOLONG 63 / * File name too long */ + gf_error_to_errno_array[GF_ERROR_CODE_NAMETOOLONG] = ENAMETOOLONG; + gf_errno_to_error_array[ENAMETOOLONG] = GF_ERROR_CODE_NAMETOOLONG; + + /* EHOSTDOWN 64 / * Host is down */ + gf_error_to_errno_array[GF_ERROR_CODE_HOSTDOWN] = EHOSTDOWN; + gf_errno_to_error_array[EHOSTDOWN] = GF_ERROR_CODE_HOSTDOWN; + + /* EHOSTUNREACH 65 / * No route to host */ + gf_error_to_errno_array[GF_ERROR_CODE_HOSTUNREACH] = EHOSTUNREACH; + gf_errno_to_error_array[EHOSTUNREACH] = GF_ERROR_CODE_HOSTUNREACH; + + /* ENOTEMPTY 66 / * Directory not empty */ + gf_error_to_errno_array[GF_ERROR_CODE_NOTEMPTY] = ENOTEMPTY; + gf_errno_to_error_array[ENOTEMPTY] = GF_ERROR_CODE_NOTEMPTY; + + /* EPROCLIM 67 / * Too many processes */ + gf_error_to_errno_array[GF_ERROR_CODE_PROCLIM] = EPROCLIM; + gf_errno_to_error_array[EPROCLIM] = GF_ERROR_CODE_PROCLIM; + + /* EUSERS 68 / * Too many users */ + gf_error_to_errno_array[GF_ERROR_CODE_USERS] = EUSERS; + gf_errno_to_error_array[EUSERS] = GF_ERROR_CODE_USERS; + + /* EDQUOT 69 / * Disc quota exceeded */ + gf_error_to_errno_array[GF_ERROR_CODE_DQUOT] = EDQUOT; + gf_errno_to_error_array[EDQUOT] = GF_ERROR_CODE_DQUOT; + + /* ESTALE 70 / * Stale NFS file handle */ + gf_error_to_errno_array[GF_ERROR_CODE_STALE] = ESTALE; + gf_errno_to_error_array[ESTALE] = GF_ERROR_CODE_STALE; + + /* EREMOTE 71 / * Too many levels of remote in path */ + gf_error_to_errno_array[GF_ERROR_CODE_REMOTE] = EREMOTE; + gf_errno_to_error_array[EREMOTE] = GF_ERROR_CODE_REMOTE; + + /* EBADRPC 72 / * RPC struct is bad */ + gf_error_to_errno_array[GF_ERROR_CODE_BADRPC] = EBADRPC; + gf_errno_to_error_array[EBADRPC] = GF_ERROR_CODE_BADRPC; + + /* ERPCMISMATCH 73 / * RPC version wrong */ + gf_error_to_errno_array[GF_ERROR_CODE_RPCMISMATCH] = ERPCMISMATCH; + gf_errno_to_error_array[ERPCMISMATCH] = 
GF_ERROR_CODE_RPCMISMATCH; + + /* EPROGUNAVAIL 74 / * RPC prog. not avail */ + gf_error_to_errno_array[GF_ERROR_CODE_PROGUNAVAIL] = EPROGUNAVAIL; + gf_errno_to_error_array[EPROGUNAVAIL] = GF_ERROR_CODE_PROGUNAVAIL; + + /* EPROGMISMATCH 75 / * Program version wrong */ + gf_error_to_errno_array[GF_ERROR_CODE_PROGMISMATCH] = EPROGMISMATCH; + gf_errno_to_error_array[EPROGMISMATCH] = GF_ERROR_CODE_PROGMISMATCH; + + /* EPROCUNAVAIL 76 / * Bad procedure for program */ + gf_error_to_errno_array[GF_ERROR_CODE_PROCUNAVAIL] = EPROCUNAVAIL; + gf_errno_to_error_array[EPROCUNAVAIL] = GF_ERROR_CODE_PROCUNAVAIL; + + /* ENOLCK 77 / * No locks available */ + gf_error_to_errno_array[GF_ERROR_CODE_NOLCK] = ENOLCK; + gf_errno_to_error_array[ENOLCK] = GF_ERROR_CODE_NOLCK; + + /* ENOSYS 78 / * Function not implemented */ + gf_error_to_errno_array[GF_ERROR_CODE_NOSYS] = ENOSYS; + gf_errno_to_error_array[ENOSYS] = GF_ERROR_CODE_NOSYS; + + /* EFTYPE 79 / * Inappropriate file type or format */ + gf_error_to_errno_array[GF_ERROR_CODE_FTYPE] = EFTYPE; + gf_errno_to_error_array[EFTYPE] = GF_ERROR_CODE_FTYPE; + + /* EAUTH 80 / * Authentication error */ + gf_error_to_errno_array[GF_ERROR_CODE_AUTH] = EAUTH; + gf_errno_to_error_array[EAUTH] = GF_ERROR_CODE_AUTH; + + /* ENEEDAUTH 81 / * Need authenticator */ + gf_error_to_errno_array[GF_ERROR_CODE_NEEDAUTH] = ENEEDAUTH; + gf_errno_to_error_array[ENEEDAUTH] = GF_ERROR_CODE_NEEDAUTH; +/* Intelligent device errors */ +/* EPWROFF 82 / * Device power is off */ + gf_error_to_errno_array[GF_ERROR_CODE_PWROFF] = EPWROFF; + gf_errno_to_error_array[EPWROFF] = GF_ERROR_CODE_PWROFF; +/* EDEVERR 83 / * Device error, e.g. 
paper out */ + gf_error_to_errno_array[GF_ERROR_CODE_DEVERR] = EDEVERR; + gf_errno_to_error_array[EDEVERR] = GF_ERROR_CODE_DEVERR; + + /* EOVERFLOW 84 / * Value too large to be stored in data type */ + gf_error_to_errno_array[GF_ERROR_CODE_OVERFLOW] = EOVERFLOW; + gf_errno_to_error_array[EOVERFLOW] = GF_ERROR_CODE_OVERFLOW; + +/* Program loading errors */ +/* EBADEXEC 85 / * Bad executable */ + gf_error_to_errno_array[GF_ERROR_CODE_BADEXEC] = EBADEXEC; + gf_errno_to_error_array[EBADEXEC] = GF_ERROR_CODE_BADEXEC; + +/* EBADARCH 86 / * Bad CPU type in executable */ + gf_error_to_errno_array[GF_ERROR_CODE_BADARCH] = EBADARCH; + gf_errno_to_error_array[EBADARCH] = GF_ERROR_CODE_BADARCH; + +/* ESHLIBVERS 87 / * Shared library version mismatch */ + gf_error_to_errno_array[GF_ERROR_CODE_SHLIBVERS] = ESHLIBVERS; + gf_errno_to_error_array[ESHLIBVERS] = GF_ERROR_CODE_SHLIBVERS; + +/* EBADMACHO 88 / * Malformed Macho file */ + gf_error_to_errno_array[GF_ERROR_CODE_BADMACHO] = EBADMACHO; + gf_errno_to_error_array[EBADMACHO] = GF_ERROR_CODE_BADMACHO; + +#if 0 + /* EDOOFUS 88 / * Programming error */ + gf_error_to_errno_array[GF_ERROR_CODE_DOOFUS] = EDOOFUS; + gf_errno_to_error_array[EDOOFUS] = GF_ERROR_CODE_DOOFUS; +#endif + + /* ECANCELED 89 / * Operation canceled */ + gf_error_to_errno_array[GF_ERROR_CODE_CANCELED] = ECANCELED; + gf_errno_to_error_array[ECANCELED] = GF_ERROR_CODE_CANCELED; + + /* EIDRM 90 / * Identifier removed */ + gf_error_to_errno_array[GF_ERROR_CODE_IDRM] = EIDRM; + gf_errno_to_error_array[EIDRM] = GF_ERROR_CODE_IDRM; + /* ENOMSG 91 / * No message of desired type */ + gf_error_to_errno_array[GF_ERROR_CODE_NOMSG] = ENOMSG; + gf_errno_to_error_array[ENOMSG] = GF_ERROR_CODE_NOMSG; + + /* EILSEQ 92 / * Illegal byte sequence */ + gf_error_to_errno_array[GF_ERROR_CODE_ILSEQ] = EILSEQ; + gf_errno_to_error_array[EILSEQ] = GF_ERROR_CODE_ILSEQ; + + /* ENOATTR 93 / * Attribute not found */ + gf_error_to_errno_array[GF_ERROR_CODE_NOATTR] = ENOATTR; + 
gf_errno_to_error_array[ENOATTR] = GF_ERROR_CODE_NOATTR; + + /* EBADMSG 94 / * Bad message */ + gf_error_to_errno_array[GF_ERROR_CODE_BADMSG] = EBADMSG; + gf_errno_to_error_array[EBADMSG] = GF_ERROR_CODE_BADMSG; + + /* EMULTIHOP 95 / * Reserved */ + gf_error_to_errno_array[GF_ERROR_CODE_MULTIHOP] = EMULTIHOP; + gf_errno_to_error_array[EMULTIHOP] = GF_ERROR_CODE_MULTIHOP; + + /* ENODATA 96 / * No message available on STREAM */ + gf_error_to_errno_array[GF_ERROR_CODE_NEEDAUTH] = ENEEDAUTH; + gf_errno_to_error_array[ENEEDAUTH] = GF_ERROR_CODE_NEEDAUTH; + + /* ENOLINK 97 / * Reserved */ + gf_error_to_errno_array[GF_ERROR_CODE_NOLINK] = ENOLINK; + gf_errno_to_error_array[ENOLINK] = GF_ERROR_CODE_NOLINK; + + /* ENOSR 98 / * No STREAM resources */ + gf_error_to_errno_array[GF_ERROR_CODE_NOSR] = ENOSR; + gf_errno_to_error_array[ENOSR] = GF_ERROR_CODE_NOSR; + + /* ENOSTR 99 / * Not a STREAM */ + gf_error_to_errno_array[GF_ERROR_CODE_NOSTR] = ENOSTR; + gf_errno_to_error_array[ENOSTR] = GF_ERROR_CODE_NOSTR; + +/* EPROTO 100 / * Protocol error */ + gf_error_to_errno_array[GF_ERROR_CODE_PROTO] = EPROTO; + gf_errno_to_error_array[EPROTO] = GF_ERROR_CODE_PROTO; +/* ETIME 101 / * STREAM ioctl timeout */ + gf_error_to_errno_array[GF_ERROR_CODE_TIME] = ETIME; + gf_errno_to_error_array[ETIME] = GF_ERROR_CODE_TIME; + +/* This value is only discrete when compiling __DARWIN_UNIX03, or KERNEL */ +/* EOPNOTSUPP 102 / * Operation not supported on socket */ + gf_error_to_errno_array[GF_ERROR_CODE_OPNOTSUPP] = EOPNOTSUPP; + gf_errno_to_error_array[EOPNOTSUPP] = GF_ERROR_CODE_OPNOTSUPP; + +/* ENOPOLICY 103 / * No such policy registered */ + gf_error_to_errno_array[GF_ERROR_CODE_NOPOLICY] = ENOPOLICY; + gf_errno_to_error_array[ENOPOLICY] = GF_ERROR_CODE_NOPOLICY; + + return ; +} +#endif /* GF_DARWIN_HOST_OS */ + +#ifdef GF_BSD_HOST_OS +static void +init_compat_errno_arrays () +{ + /* Quite a bit of things changed in FreeBSD - current */ + + /* EAGAIN 35 / * Try Again */ + 
gf_error_to_errno_array[GF_ERROR_CODE_AGAIN] = EAGAIN; + gf_errno_to_error_array[EAGAIN] = GF_ERROR_CODE_AGAIN; + + /* EDEADLK 11 / * Resource deadlock would occur */ + gf_error_to_errno_array[GF_ERROR_CODE_DEADLK] = EDEADLK; + gf_errno_to_error_array[EDEADLK] = GF_ERROR_CODE_DEADLK; + + /* EINPROGRESS 36 / * Operation now in progress */ + gf_error_to_errno_array[GF_ERROR_CODE_INPROGRESS] = EINPROGRESS; + gf_errno_to_error_array[EINPROGRESS] = GF_ERROR_CODE_INPROGRESS; + + /* EALREADY 37 / * Operation already in progress */ + gf_error_to_errno_array[GF_ERROR_CODE_ALREADY] = EALREADY; + gf_errno_to_error_array[EALREADY] = GF_ERROR_CODE_ALREADY; + + /* ENOTSOCK 38 / * Socket operation on non-socket */ + gf_error_to_errno_array[GF_ERROR_CODE_NOTSOCK] = ENOTSOCK; + gf_errno_to_error_array[ENOTSOCK] = GF_ERROR_CODE_NOTSOCK; + + /* EDESTADDRREQ 39 / * Destination address required */ + gf_error_to_errno_array[GF_ERROR_CODE_DESTADDRREQ] = EDESTADDRREQ; + gf_errno_to_error_array[EDESTADDRREQ] = GF_ERROR_CODE_DESTADDRREQ; + + /* EMSGSIZE 40 / * Message too long */ + gf_error_to_errno_array[GF_ERROR_CODE_MSGSIZE] = EMSGSIZE; + gf_errno_to_error_array[EMSGSIZE] = GF_ERROR_CODE_MSGSIZE; + + /* EPROTOTYPE 41 / * Protocol wrong type for socket */ + gf_error_to_errno_array[GF_ERROR_CODE_PROTOTYPE] = EPROTOTYPE; + gf_errno_to_error_array[EPROTOTYPE] = GF_ERROR_CODE_PROTOTYPE; + + /* ENOPROTOOPT 42 / * Protocol not available */ + gf_error_to_errno_array[GF_ERROR_CODE_NOPROTOOPT] = ENOPROTOOPT; + gf_errno_to_error_array[ENOPROTOOPT] = GF_ERROR_CODE_NOPROTOOPT; + + /* EPROTONOSUPPORT 43 / * Protocol not supported */ + gf_error_to_errno_array[GF_ERROR_CODE_PROTONOSUPPORT] = EPROTONOSUPPORT; + gf_errno_to_error_array[EPROTONOSUPPORT] = GF_ERROR_CODE_PROTONOSUPPORT; + + /* ESOCKTNOSUPPORT 44 / * Socket type not supported */ + gf_error_to_errno_array[GF_ERROR_CODE_SOCKTNOSUPPORT] = ESOCKTNOSUPPORT; + gf_errno_to_error_array[ESOCKTNOSUPPORT] = GF_ERROR_CODE_SOCKTNOSUPPORT; + + /* 
EOPNOTSUPP 45 / * Operation not supported */ + gf_error_to_errno_array[GF_ERROR_CODE_OPNOTSUPP] = EOPNOTSUPP; + gf_errno_to_error_array[EOPNOTSUPP] = GF_ERROR_CODE_OPNOTSUPP; + + /* EPFNOSUPPORT 46 / * Protocol family not supported */ + gf_error_to_errno_array[GF_ERROR_CODE_PFNOSUPPORT] = EPFNOSUPPORT; + gf_errno_to_error_array[EPFNOSUPPORT] = GF_ERROR_CODE_PFNOSUPPORT; + + /* EAFNOSUPPORT 47 / * Address family not supported by protocol family */ + gf_error_to_errno_array[GF_ERROR_CODE_AFNOSUPPORT] = EAFNOSUPPORT; + gf_errno_to_error_array[EAFNOSUPPORT] = GF_ERROR_CODE_AFNOSUPPORT; + + /* EADDRINUSE 48 / * Address already in use */ + gf_error_to_errno_array[GF_ERROR_CODE_ADDRINUSE] = EADDRINUSE; + gf_errno_to_error_array[EADDRINUSE] = GF_ERROR_CODE_ADDRINUSE; + + /* EADDRNOTAVAIL 49 / * Can't assign requested address */ + gf_error_to_errno_array[GF_ERROR_CODE_ADDRNOTAVAIL] = EADDRNOTAVAIL; + gf_errno_to_error_array[EADDRNOTAVAIL] = GF_ERROR_CODE_ADDRNOTAVAIL; + + /* ENETDOWN 50 / * Network is down */ + gf_error_to_errno_array[GF_ERROR_CODE_NETDOWN] = ENETDOWN; + gf_errno_to_error_array[ENETDOWN] = GF_ERROR_CODE_NETDOWN; + + /* ENETUNREACH 51 / * Network is unreachable */ + gf_error_to_errno_array[GF_ERROR_CODE_NETUNREACH] = ENETUNREACH; + gf_errno_to_error_array[ENETUNREACH] = GF_ERROR_CODE_NETUNREACH; + + /* ENETRESET 52 / * Network dropped connection on reset */ + gf_error_to_errno_array[GF_ERROR_CODE_NETRESET] = ENETRESET; + gf_errno_to_error_array[ENETRESET] = GF_ERROR_CODE_NETRESET; + + /* ECONNABORTED 53 / * Software caused connection abort */ + gf_error_to_errno_array[GF_ERROR_CODE_CONNABORTED] = ECONNABORTED; + gf_errno_to_error_array[ECONNABORTED] = GF_ERROR_CODE_CONNABORTED; + + /* ECONNRESET 54 / * Connection reset by peer */ + gf_error_to_errno_array[GF_ERROR_CODE_CONNRESET] = ECONNRESET; + gf_errno_to_error_array[ECONNRESET] = GF_ERROR_CODE_CONNRESET; + + /* ENOBUFS 55 / * No buffer space available */ + gf_error_to_errno_array[GF_ERROR_CODE_NOBUFS] = 
ENOBUFS; + gf_errno_to_error_array[ENOBUFS] = GF_ERROR_CODE_NOBUFS; + + /* EISCONN 56 / * Socket is already connected */ + gf_error_to_errno_array[GF_ERROR_CODE_ISCONN] = EISCONN; + gf_errno_to_error_array[EISCONN] = GF_ERROR_CODE_ISCONN; + + /* ENOTCONN 57 / * Socket is not connected */ + gf_error_to_errno_array[GF_ERROR_CODE_NOTCONN] = ENOTCONN; + gf_errno_to_error_array[ENOTCONN] = GF_ERROR_CODE_NOTCONN; + + /* ESHUTDOWN 58 / * Can't send after socket shutdown */ + gf_error_to_errno_array[GF_ERROR_CODE_SHUTDOWN] = ESHUTDOWN; + gf_errno_to_error_array[ESHUTDOWN] = GF_ERROR_CODE_SHUTDOWN; + + /* ETOOMANYREFS 59 / * Too many references: can't splice */ + gf_error_to_errno_array[GF_ERROR_CODE_TOOMANYREFS] = ETOOMANYREFS; + gf_errno_to_error_array[ETOOMANYREFS] = GF_ERROR_CODE_TOOMANYREFS; + + /* ETIMEDOUT 60 / * Operation timed out */ + gf_error_to_errno_array[GF_ERROR_CODE_TIMEDOUT] = ETIMEDOUT; + gf_errno_to_error_array[ETIMEDOUT] = GF_ERROR_CODE_TIMEDOUT; + + /* ECONNREFUSED 61 / * Connection refused */ + gf_error_to_errno_array[GF_ERROR_CODE_CONNREFUSED] = ECONNREFUSED; + gf_errno_to_error_array[ECONNREFUSED] = GF_ERROR_CODE_CONNREFUSED; + + /* ELOOP 62 / * Too many levels of symbolic links */ + gf_error_to_errno_array[GF_ERROR_CODE_LOOP] = ELOOP; + gf_errno_to_error_array[ELOOP] = GF_ERROR_CODE_LOOP; + + /* ENAMETOOLONG 63 / * File name too long */ + gf_error_to_errno_array[GF_ERROR_CODE_NAMETOOLONG] = ENAMETOOLONG; + gf_errno_to_error_array[ENAMETOOLONG] = GF_ERROR_CODE_NAMETOOLONG; + + /* EHOSTDOWN 64 / * Host is down */ + gf_error_to_errno_array[GF_ERROR_CODE_HOSTDOWN] = EHOSTDOWN; + gf_errno_to_error_array[EHOSTDOWN] = GF_ERROR_CODE_HOSTDOWN; + + /* EHOSTUNREACH 65 / * No route to host */ + gf_error_to_errno_array[GF_ERROR_CODE_HOSTUNREACH] = EHOSTUNREACH; + gf_errno_to_error_array[EHOSTUNREACH] = GF_ERROR_CODE_HOSTUNREACH; + + /* ENOTEMPTY 66 / * Directory not empty */ + gf_error_to_errno_array[GF_ERROR_CODE_NOTEMPTY] = ENOTEMPTY; + 
gf_errno_to_error_array[ENOTEMPTY] = GF_ERROR_CODE_NOTEMPTY; + + /* EPROCLIM 67 / * Too many processes */ + gf_error_to_errno_array[GF_ERROR_CODE_PROCLIM] = EPROCLIM; + gf_errno_to_error_array[EPROCLIM] = GF_ERROR_CODE_PROCLIM; + + /* EUSERS 68 / * Too many users */ + gf_error_to_errno_array[GF_ERROR_CODE_USERS] = EUSERS; + gf_errno_to_error_array[EUSERS] = GF_ERROR_CODE_USERS; + + /* EDQUOT 69 / * Disc quota exceeded */ + gf_error_to_errno_array[GF_ERROR_CODE_DQUOT] = EDQUOT; + gf_errno_to_error_array[EDQUOT] = GF_ERROR_CODE_DQUOT; + + /* ESTALE 70 / * Stale NFS file handle */ + gf_error_to_errno_array[GF_ERROR_CODE_STALE] = ESTALE; + gf_errno_to_error_array[ESTALE] = GF_ERROR_CODE_STALE; + + /* EREMOTE 71 / * Too many levels of remote in path */ + gf_error_to_errno_array[GF_ERROR_CODE_REMOTE] = EREMOTE; + gf_errno_to_error_array[EREMOTE] = GF_ERROR_CODE_REMOTE; + + /* EBADRPC 72 / * RPC struct is bad */ + gf_error_to_errno_array[GF_ERROR_CODE_BADRPC] = EBADRPC; + gf_errno_to_error_array[EBADRPC] = GF_ERROR_CODE_BADRPC; + + /* ERPCMISMATCH 73 / * RPC version wrong */ + gf_error_to_errno_array[GF_ERROR_CODE_RPCMISMATCH] = ERPCMISMATCH; + gf_errno_to_error_array[ERPCMISMATCH] = GF_ERROR_CODE_RPCMISMATCH; + + /* EPROGUNAVAIL 74 / * RPC prog. 
not avail */ + gf_error_to_errno_array[GF_ERROR_CODE_PROGUNAVAIL] = EPROGUNAVAIL; + gf_errno_to_error_array[EPROGUNAVAIL] = GF_ERROR_CODE_PROGUNAVAIL; + + /* EPROGMISMATCH 75 / * Program version wrong */ + gf_error_to_errno_array[GF_ERROR_CODE_PROGMISMATCH] = EPROGMISMATCH; + gf_errno_to_error_array[EPROGMISMATCH] = GF_ERROR_CODE_PROGMISMATCH; + + /* EPROCUNAVAIL 76 / * Bad procedure for program */ + gf_error_to_errno_array[GF_ERROR_CODE_PROCUNAVAIL] = EPROCUNAVAIL; + gf_errno_to_error_array[EPROCUNAVAIL] = GF_ERROR_CODE_PROCUNAVAIL; + + /* ENOLCK 77 / * No locks available */ + gf_error_to_errno_array[GF_ERROR_CODE_NOLCK] = ENOLCK; + gf_errno_to_error_array[ENOLCK] = GF_ERROR_CODE_NOLCK; + + /* ENOSYS 78 / * Function not implemented */ + gf_error_to_errno_array[GF_ERROR_CODE_NOSYS] = ENOSYS; + gf_errno_to_error_array[ENOSYS] = GF_ERROR_CODE_NOSYS; + + /* EFTYPE 79 / * Inappropriate file type or format */ + gf_error_to_errno_array[GF_ERROR_CODE_FTYPE] = EFTYPE; + gf_errno_to_error_array[EFTYPE] = GF_ERROR_CODE_FTYPE; + + /* EAUTH 80 / * Authentication error */ + gf_error_to_errno_array[GF_ERROR_CODE_AUTH] = EAUTH; + gf_errno_to_error_array[EAUTH] = GF_ERROR_CODE_AUTH; + + /* ENEEDAUTH 81 / * Need authenticator */ + gf_error_to_errno_array[GF_ERROR_CODE_NEEDAUTH] = ENEEDAUTH; + gf_errno_to_error_array[ENEEDAUTH] = GF_ERROR_CODE_NEEDAUTH; + + /* EIDRM 82 / * Identifier removed */ + gf_error_to_errno_array[GF_ERROR_CODE_IDRM] = EIDRM; + gf_errno_to_error_array[EIDRM] = GF_ERROR_CODE_IDRM; + + /* ENOMSG 83 / * No message of desired type */ + gf_error_to_errno_array[GF_ERROR_CODE_NOMSG] = ENOMSG; + gf_errno_to_error_array[ENOMSG] = GF_ERROR_CODE_NOMSG; + + /* EOVERFLOW 84 / * Value too large to be stored in data type */ + gf_error_to_errno_array[GF_ERROR_CODE_OVERFLOW] = EOVERFLOW; + gf_errno_to_error_array[EOVERFLOW] = GF_ERROR_CODE_OVERFLOW; + + /* ECANCELED 85 / * Operation canceled */ + gf_error_to_errno_array[GF_ERROR_CODE_CANCELED] = ECANCELED; + 
gf_errno_to_error_array[ECANCELED] = GF_ERROR_CODE_CANCELED; + + /* EILSEQ 86 / * Illegal byte sequence */ + gf_error_to_errno_array[GF_ERROR_CODE_ILSEQ] = EILSEQ; + gf_errno_to_error_array[EILSEQ] = GF_ERROR_CODE_ILSEQ; + + /* ENOATTR 87 / * Attribute not found */ + gf_error_to_errno_array[GF_ERROR_CODE_NOATTR] = ENOATTR; + gf_errno_to_error_array[ENOATTR] = GF_ERROR_CODE_NOATTR; + + /* EDOOFUS 88 / * Programming error */ + gf_error_to_errno_array[GF_ERROR_CODE_DOOFUS] = EDOOFUS; + gf_errno_to_error_array[EDOOFUS] = GF_ERROR_CODE_DOOFUS; + + /* EBADMSG 89 / * Bad message */ + gf_error_to_errno_array[GF_ERROR_CODE_BADMSG] = EBADMSG; + gf_errno_to_error_array[EBADMSG] = GF_ERROR_CODE_BADMSG; + + /* EMULTIHOP 90 / * Multihop attempted */ + gf_error_to_errno_array[GF_ERROR_CODE_MULTIHOP] = EMULTIHOP; + gf_errno_to_error_array[EMULTIHOP] = GF_ERROR_CODE_MULTIHOP; + + /* ENOLINK 91 / * Link has been severed */ + gf_error_to_errno_array[GF_ERROR_CODE_NOLINK] = ENOLINK; + gf_errno_to_error_array[ENOLINK] = GF_ERROR_CODE_NOLINK; + + /* EPROTO 92 / * Protocol error */ + gf_error_to_errno_array[GF_ERROR_CODE_PROTO] = EPROTO; + gf_errno_to_error_array[EPROTO] = GF_ERROR_CODE_PROTO; + + + return ; +} +#endif /* GF_BSD_HOST_OS */ + +#ifdef GF_LINUX_HOST_OS +static void +init_compat_errno_arrays () +{ + /* Things are fine. Everything should work seemlessly on GNU/Linux machines */ + return ; +} +#endif /* GF_LINUX_HOST_OS */ + + +static void +init_errno_arrays () +{ + int i; + for (i=0; i < GF_ERROR_CODE_UNKNOWN; i++) { + gf_errno_to_error_array[i] = i; + gf_error_to_errno_array[i] = i; + } + /* Now change the order if it needs to be. 
*/
+        init_compat_errno_arrays();
+
+        return;
+}
+
+/*
+ * gf_errno_to_error - map an errno value onto the GF_ERROR_CODE_* numbering.
+ *
+ * The translation tables are filled in by init_errno_arrays() the first
+ * time either converter is called.  Only values strictly between
+ * GF_ERROR_CODE_SUCCESS and GF_ERROR_CODE_UNKNOWN are looked up in
+ * gf_errno_to_error_array; every other value is handed back unchanged.
+ */
+int32_t
+gf_errno_to_error (int32_t op_errno)
+{
+        int32_t code = op_errno;
+
+        /* Build the tables on first use. */
+        if (gf_compat_errno_init_done == 0) {
+                init_errno_arrays ();
+                gf_compat_errno_init_done = 1;
+        }
+
+        if ((GF_ERROR_CODE_SUCCESS < op_errno) &&
+            (GF_ERROR_CODE_UNKNOWN > op_errno)) {
+                code = gf_errno_to_error_array[op_errno];
+        }
+
+        return code;
+}
+
+
+/*
+ * gf_error_to_errno - inverse of gf_errno_to_error().
+ *
+ * Looks ERROR up in gf_error_to_errno_array when it lies strictly between
+ * GF_ERROR_CODE_SUCCESS and GF_ERROR_CODE_UNKNOWN; any other value is
+ * returned as-is.  The tables are built on first use here as well.
+ */
+int32_t
+gf_error_to_errno (int32_t error)
+{
+        int32_t host_errno = error;
+
+        /* Build the tables on first use. */
+        if (gf_compat_errno_init_done == 0) {
+                init_errno_arrays ();
+                gf_compat_errno_init_done = 1;
+        }
+
+        if ((GF_ERROR_CODE_SUCCESS < error) &&
+            (GF_ERROR_CODE_UNKNOWN > error)) {
+                host_errno = gf_error_to_errno_array[error];
+        }
+
+        return host_errno;
+}
+
diff --git a/libglusterfs/src/compat-errno.h b/libglusterfs/src/compat-errno.h
new file mode 100644
index 000000000..918df45eb
--- /dev/null
+++ b/libglusterfs/src/compat-errno.h
@@ -0,0 +1,240 @@
+/*
+   Copyright (c) 2008 Z RESEARCH, Inc.
+   This file is part of GlusterFS.
+
+   GlusterFS is free software; you can redistribute it and/or modify
+   it under the terms of the GNU General Public License as published
+   by the Free Software Foundation; either version 3 of the License,
+   or (at your option) any later version.
+
+   GlusterFS is distributed in the hope that it will be useful, but
+   WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   General Public License for more details.
+
+   You should have received a copy of the GNU General Public License
+   along with this program.  If not, see
+   <http://www.gnu.org/licenses/>.
+*/ + +#ifndef __COMPAT_ERRNO_H__ +#define __COMPAT_ERRNO_H__ + +#ifndef _CONFIG_H +#define _CONFIG_H +#include "config.h" +#endif + +#include + +#define GF_ERROR_CODE_SUCCESS 0 +#define GF_ERROR_CODE_UNKNOWN 1024 +#define GF_ERRNO_UNKNOWN 1024 + +#define GF_ERROR_CODE_PERM 1 /* Operation not permitted */ +#define GF_ERROR_CODE_NOENT 2 /* No such file or directory */ +#define GF_ERROR_CODE_SRCH 3 /* No such process */ +#define GF_ERROR_CODE_INTR 4 /* Interrupted system call */ +#define GF_ERROR_CODE_IO 5 /* I/O error */ +#define GF_ERROR_CODE_NXIO 6 /* No such device or address */ +#define GF_ERROR_CODE_2BIG 7 /* Argument list too long */ +#define GF_ERROR_CODE_NOEXEC 8 /* Exec format error */ +#define GF_ERROR_CODE_BADF 9 /* Bad file number */ +#define GF_ERROR_CODE_CHILD 10 /* No child processes */ +#define GF_ERROR_CODE_AGAIN 11 /* Try again */ +#define GF_ERROR_CODE_NOMEM 12 /* Out of memory */ +#define GF_ERROR_CODE_ACCES 13 /* Permission denied */ +#define GF_ERROR_CODE_FAULT 14 /* Bad address */ +#define GF_ERROR_CODE_NOTBLK 15 /* Block device required */ +#define GF_ERROR_CODE_BUSY 16 /* Device or resource busy */ +#define GF_ERROR_CODE_EXIST 17 /* File exists */ +#define GF_ERROR_CODE_XDEV 18 /* Cross-device link */ +#define GF_ERROR_CODE_NODEV 19 /* No such device */ +#define GF_ERROR_CODE_NOTDIR 20 /* Not a directory */ +#define GF_ERROR_CODE_ISDIR 21 /* Is a directory */ +#define GF_ERROR_CODE_INVAL 22 /* Invalid argument */ +#define GF_ERROR_CODE_NFILE 23 /* File table overflow */ +#define GF_ERROR_CODE_MFILE 24 /* Too many open files */ +#define GF_ERROR_CODE_NOTTY 25 /* Not a typewriter */ +#define GF_ERROR_CODE_TXTBSY 26 /* Text file busy */ +#define GF_ERROR_CODE_FBIG 27 /* File too large */ +#define GF_ERROR_CODE_NOSPC 28 /* No space left on device */ +#define GF_ERROR_CODE_SPIPE 29 /* Illegal seek */ +#define GF_ERROR_CODE_ROFS 30 /* Read-only file system */ +#define GF_ERROR_CODE_MLINK 31 /* Too many links */ +#define GF_ERROR_CODE_PIPE 32 /* 
Broken pipe */ +#define GF_ERROR_CODE_DOM 33 /* Math argument out of domain of func */ +#define GF_ERROR_CODE_RANGE 34 /* Math result not representable */ +#define GF_ERROR_CODE_DEADLK 35 /* Resource deadlock would occur */ +#define GF_ERROR_CODE_NAMETOOLONG 36 /* File name too long */ +#define GF_ERROR_CODE_NOLCK 37 /* No record locks available */ +#define GF_ERROR_CODE_NOSYS 38 /* Function not implemented */ +#define GF_ERROR_CODE_NOTEMPTY 39 /* Directory not empty */ +#define GF_ERROR_CODE_LOOP 40 /* Too many symbolic links encountered */ + +#define GF_ERROR_CODE_NOMSG 42 /* No message of desired type */ +#define GF_ERROR_CODE_IDRM 43 /* Identifier removed */ +#define GF_ERROR_CODE_CHRNG 44 /* Channel number out of range */ +#define GF_ERROR_CODE_L2NSYNC 45 /* Level 2 not synchronized */ +#define GF_ERROR_CODE_L3HLT 46 /* Level 3 halted */ +#define GF_ERROR_CODE_L3RST 47 /* Level 3 reset */ +#define GF_ERROR_CODE_LNRNG 48 /* Link number out of range */ +#define GF_ERROR_CODE_UNATCH 49 /* Protocol driver not attached */ +#define GF_ERROR_CODE_NOCSI 50 /* No CSI structure available */ +#define GF_ERROR_CODE_L2HLT 51 /* Level 2 halted */ +#define GF_ERROR_CODE_BADE 52 /* Invalid exchange */ +#define GF_ERROR_CODE_BADR 53 /* Invalid request descriptor */ +#define GF_ERROR_CODE_XFULL 54 /* Exchange full */ +#define GF_ERROR_CODE_NOANO 55 /* No anode */ +#define GF_ERROR_CODE_BADRQC 56 /* Invalid request code */ +#define GF_ERROR_CODE_BADSLT 57 /* Invalid slot */ +#define GF_ERROR_CODE_BFONT 59 /* Bad font file format */ +#define GF_ERROR_CODE_NOSTR 60 /* Device not a stream */ +#define GF_ERROR_CODE_NODATA 61 /* No data available */ +#define GF_ERROR_CODE_TIME 62 /* Timer expired */ +#define GF_ERROR_CODE_NOSR 63 /* Out of streams resources */ +#define GF_ERROR_CODE_NONET 64 /* Machine is not on the network */ +#define GF_ERROR_CODE_NOPKG 65 /* Package not installed */ +#define GF_ERROR_CODE_REMOTE 66 /* Object is remote */ +#define GF_ERROR_CODE_NOLINK 67 /* Link 
has been severed */ +#define GF_ERROR_CODE_ADV 68 /* Advertise error */ +#define GF_ERROR_CODE_SRMNT 69 /* Srmount error */ +#define GF_ERROR_CODE_COMM 70 /* Communication error on send */ +#define GF_ERROR_CODE_PROTO 71 /* Protocol error */ +#define GF_ERROR_CODE_MULTIHOP 72 /* Multihop attempted */ +#define GF_ERROR_CODE_DOTDOT 73 /* RFS specific error */ +#define GF_ERROR_CODE_BADMSG 74 /* Not a data message */ +#define GF_ERROR_CODE_OVERFLOW 75 /* Value too large for defined data type */ +#define GF_ERROR_CODE_NOTUNIQ 76 /* Name not unique on network */ +#define GF_ERROR_CODE_BADFD 77 /* File descriptor in bad state */ +#define GF_ERROR_CODE_REMCHG 78 /* Remote address changed */ +#define GF_ERROR_CODE_LIBACC 79 /* Can not access a needed shared library */ +#define GF_ERROR_CODE_LIBBAD 80 /* Accessing a corrupted shared library */ +#define GF_ERROR_CODE_LIBSCN 81 /* .lib section in a.out corrupted */ +#define GF_ERROR_CODE_LIBMAX 82 /* Attempting to link in too many shared libraries */ +#define GF_ERROR_CODE_LIBEXEC 83 /* Cannot exec a shared library directly */ +#define GF_ERROR_CODE_ILSEQ 84 /* Illegal byte sequence */ +#define GF_ERROR_CODE_RESTART 85 /* Interrupted system call should be restarted */ +#define GF_ERROR_CODE_STRPIPE 86 /* Streams pipe error */ +#define GF_ERROR_CODE_USERS 87 /* Too many users */ +#define GF_ERROR_CODE_NOTSOCK 88 /* Socket operation on non-socket */ +#define GF_ERROR_CODE_DESTADDRREQ 89 /* Destination address required */ +#define GF_ERROR_CODE_MSGSIZE 90 /* Message too long */ +#define GF_ERROR_CODE_PROTOTYPE 91 /* Protocol wrong type for socket */ +#define GF_ERROR_CODE_NOPROTOOPT 92 /* Protocol not available */ +#define GF_ERROR_CODE_PROTONOSUPPORT 93 /* Protocol not supported */ +#define GF_ERROR_CODE_SOCKTNOSUPPORT 94 /* Socket type not supported */ +#define GF_ERROR_CODE_OPNOTSUPP 95 /* Operation not supported on transport endpoint */ +#define GF_ERROR_CODE_PFNOSUPPORT 96 /* Protocol family not supported */ +#define 
GF_ERROR_CODE_AFNOSUPPORT 97 /* Address family not supported by protocol */ +#define GF_ERROR_CODE_ADDRINUSE 98 /* Address already in use */ +#define GF_ERROR_CODE_ADDRNOTAVAIL 99 /* Cannot assign requested address */ +#define GF_ERROR_CODE_NETDOWN 100 /* Network is down */ +#define GF_ERROR_CODE_NETUNREACH 101 /* Network is unreachable */ +#define GF_ERROR_CODE_NETRESET 102 /* Network dropped connection because of reset */ +#define GF_ERROR_CODE_CONNABORTED 103 /* Software caused connection abort */ +#define GF_ERROR_CODE_CONNRESET 104 /* Connection reset by peer */ +#define GF_ERROR_CODE_NOBUFS 105 /* No buffer space available */ +#define GF_ERROR_CODE_ISCONN 106 /* Transport endpoint is already connected */ +#define GF_ERROR_CODE_NOTCONN 107 /* Transport endpoint is not connected */ +#define GF_ERROR_CODE_SHUTDOWN 108 /* Cannot send after transport endpoint shutdown */ +#define GF_ERROR_CODE_TOOMANYREFS 109 /* Too many references: cannot splice */ +#define GF_ERROR_CODE_TIMEDOUT 110 /* Connection timed out */ +#define GF_ERROR_CODE_CONNREFUSED 111 /* Connection refused */ +#define GF_ERROR_CODE_HOSTDOWN 112 /* Host is down */ +#define GF_ERROR_CODE_HOSTUNREACH 113 /* No route to host */ +#define GF_ERROR_CODE_ALREADY 114 /* Operation already in progress */ +#define GF_ERROR_CODE_INPROGRESS 115 /* Operation now in progress */ +#define GF_ERROR_CODE_ALREADY 114 /* Operation already in progress */ +#define GF_ERROR_CODE_INPROGRESS 115 /* Operation now in progress */ +#define GF_ERROR_CODE_STALE 116 /* Stale NFS file handle */ +#define GF_ERROR_CODE_UCLEAN 117 /* Structure needs cleaning */ +#define GF_ERROR_CODE_NOTNAM 118 /* Not a XENIX named type file */ +#define GF_ERROR_CODE_NAVAIL 119 /* No XENIX semaphores available */ +#define GF_ERROR_CODE_ISNAM 120 /* Is a named type file */ +#define GF_ERROR_CODE_REMOTEIO 121 /* Remote I/O error */ +#define GF_ERROR_CODE_DQUOT 122 /* Quota exceeded */ +#define GF_ERROR_CODE_NOMEDIUM 123 /* No medium found */ +#define 
GF_ERROR_CODE_MEDIUMTYPE 124 /* Wrong medium type */ +#define GF_ERROR_CODE_CANCELED 125 /* Operation Canceled */ +#define GF_ERROR_CODE_NOKEY 126 /* Required key not available */ +#define GF_ERROR_CODE_KEYEXPIRED 127 /* Key has expired */ +#define GF_ERROR_CODE_KEYREVOKED 128 /* Key has been revoked */ +#define GF_ERROR_CODE_KEYREJECTED 129 /* Key was rejected by service */ + +/* for robust mutexes */ +#define GF_ERROR_CODE_OWNERDEAD 130 /* Owner died */ +#define GF_ERROR_CODE_NOTRECOVERABLE 131 /* State not recoverable */ + + + +/* Should never be seen by user programs */ +#define GF_ERROR_CODE_RESTARTSYS 512 +#define GF_ERROR_CODE_RESTARTNOINTR 513 +#define GF_ERROR_CODE_RESTARTNOHAND 514 /* restart if no handler.. */ +#define GF_ERROR_CODE_NOIOCTLCMD 515 /* No ioctl command */ +#define GF_ERROR_CODE_RESTART_RESTARTBLOCK 516 /* restart by calling sys_restart_syscall */ + +/* Defined for the NFSv3 protocol */ +#define GF_ERROR_CODE_BADHANDLE 521 /* Illegal NFS file handle */ +#define GF_ERROR_CODE_NOTSYNC 522 /* Update synchronization mismatch */ +#define GF_ERROR_CODE_BADCOOKIE 523 /* Cookie is stale */ +#define GF_ERROR_CODE_NOTSUPP 524 /* Operation is not supported */ +#define GF_ERROR_CODE_TOOSMALL 525 /* Buffer or request is too small */ +#define GF_ERROR_CODE_SERVERFAULT 526 /* An untranslatable error occurred */ +#define GF_ERROR_CODE_BADTYPE 527 /* Type not supported by server */ +#define GF_ERROR_CODE_JUKEBOX 528 /* Request initiated, but will not complete before timeout */ +#define GF_ERROR_CODE_IOCBQUEUED 529 /* iocb queued, will get completion event */ +#define GF_ERROR_CODE_IOCBRETRY 530 /* iocb queued, will trigger a retry */ + +/* Darwin OS X */ +#define GF_ERROR_CODE_NOPOLICY 701 +#define GF_ERROR_CODE_BADMACHO 702 +#define GF_ERROR_CODE_PWROFF 703 +#define GF_ERROR_CODE_DEVERR 704 +#define GF_ERROR_CODE_BADARCH 705 +#define GF_ERROR_CODE_BADEXEC 706 +#define GF_ERROR_CODE_SHLIBVERS 707 + + + +/* Solaris */ +/* ENOTACTIVE 73 / * Facility is not 
active */
+#define GF_ERROR_CODE_NOTACTIVE      801
+/* ELOCKUNMAPPED 72 / * locked lock was unmapped */
+#define GF_ERROR_CODE_LOCKUNMAPPED   802
+
+/* BSD system */
+#define GF_ERROR_CODE_PROCLIM        901 /* Too many processes */
+#define GF_ERROR_CODE_BADRPC         902 /* RPC struct is bad */
+#define GF_ERROR_CODE_RPCMISMATCH    903 /* RPC version wrong */
+#define GF_ERROR_CODE_PROGUNAVAIL    904 /* RPC prog. not avail */
+#define GF_ERROR_CODE_PROGMISMATCH   905 /* Program version wrong */
+/*
+ * PROCUNAVAIL used to be defined as 905 as well.  Because the translation
+ * tables are indexed by these codes, the two errors overwrote each other's
+ * slot and could not be told apart.  It now has its own, previously unused
+ * value; the neighbouring codes keep their numbers.
+ */
+#define GF_ERROR_CODE_PROCUNAVAIL    910 /* Bad procedure for program */
+#define GF_ERROR_CODE_FTYPE          906 /* Inappropriate file type or format */
+#define GF_ERROR_CODE_AUTH           907 /* Authentication error */
+#define GF_ERROR_CODE_NEEDAUTH       908 /* Need authenticator */
+#define GF_ERROR_CODE_DOOFUS         909 /* Programming error */
+
+/* NOATTR is an alias: it shares the code of NODATA. */
+#define GF_ERROR_CODE_NOATTR         GF_ERROR_CODE_NODATA /* Attribute not found */
+
+/* Either one of enodata or enoattr will be there in system */
+#ifndef ENOATTR
+#define ENOATTR ENODATA
+#endif /* ENOATTR */
+
+#ifndef ENODATA
+#define ENODATA ENOATTR
+#endif /* ENODATA */
+
+/* Hosts without EBADFD fall back to EBADRPC. */
+#ifndef EBADFD
+#define EBADFD EBADRPC
+#endif /* EBADFD */
+
+/* These functions are defined for all the OS flags, but content will
+ * be different for each OS flag.
+ *
+ * gf_errno_to_error() converts an errno value to a GF_ERROR_CODE_* value,
+ * gf_error_to_errno() converts it back.
+ */
+int32_t gf_errno_to_error (int32_t op_errno);
+int32_t gf_error_to_errno (int32_t error);
+
+#endif /* __COMPAT_ERRNO_H__ */
diff --git a/libglusterfs/src/compat.c b/libglusterfs/src/compat.c
new file mode 100644
index 000000000..71aeb32c7
--- /dev/null
+++ b/libglusterfs/src/compat.c
@@ -0,0 +1,383 @@
+/*
+   Copyright (c) 2006, 2007, 2008 Z RESEARCH, Inc.
+   This file is part of GlusterFS.
+
+   GlusterFS is free software; you can redistribute it and/or modify
+   it under the terms of the GNU General Public License as published
+   by the Free Software Foundation; either version 3 of the License,
+   or (at your option) any later version.
+ + GlusterFS is distributed in the hope that it will be useful, but + WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program. If not, see + . +*/ + +#ifndef _CONFIG_H +#define _CONFIG_H +#include "config.h" +#endif + +#include +#include +#include +#include +#include +#include + +#ifdef GF_SOLARIS_HOST_OS +#include "logging.h" +#endif /* GF_SOLARIS_HOST_OS */ + +#include "compat.h" +#include "common-utils.h" + +#ifdef GF_DARWIN_HOST_OS + +#define GF_FINDER_INFO_XATTR "com.apple.FinderInfo" +#define GF_RESOURCE_FORK_XATTR "com.apple.ResourceFork" +#define GF_FINDER_INFO_SIZE 32 + +static const char gf_finder_info_content[GF_FINDER_INFO_SIZE] = { + 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, + 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, + 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, + 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, +}; + + +int32_t +gf_darwin_compat_listxattr (int len, dict_t *dict, int size) +{ + data_t *data = NULL; + if (len == -1) + len = 0; + + data = dict_get (dict, GF_FINDER_INFO_XATTR); + if (!data) { + dict_set (dict, GF_FINDER_INFO_XATTR, + bin_to_data ((void *)gf_finder_info_content, + GF_FINDER_INFO_SIZE)); + len += strlen (GF_FINDER_INFO_XATTR); + } + + data = dict_get (dict, GF_RESOURCE_FORK_XATTR); + if (!data) { + dict_set (dict, GF_RESOURCE_FORK_XATTR, str_to_data ("")); + len += strlen (GF_RESOURCE_FORK_XATTR); + } + + return len; +} + +int32_t +gf_darwin_compat_getxattr (const char *key, dict_t *dict) +{ + data_t *data = NULL; + + if (strcmp(key, GF_FINDER_INFO_XATTR) == 0) { + data = dict_get (dict, GF_FINDER_INFO_XATTR); + if (!data) { + dict_set (dict, GF_FINDER_INFO_XATTR, + bin_to_data ((void *)gf_finder_info_content, + GF_FINDER_INFO_SIZE)); + return GF_FINDER_INFO_SIZE; + } + return 0; + } + + if (strcmp(key, GF_RESOURCE_FORK_XATTR) == 0) 
{
+                data = dict_get (dict, GF_RESOURCE_FORK_XATTR);
+                if (!data) {
+                        /* Always null */
+                        dict_set (dict, GF_RESOURCE_FORK_XATTR,
+                                  str_to_data (""));
+                        return 0;
+                }
+                return 0;
+        }
+        return -1;
+}
+
+
+/*
+ * Returns 0 when DICT contains GF_FINDER_INFO_XATTR or
+ * GF_RESOURCE_FORK_XATTR, -1 when it contains neither.
+ */
+int32_t
+gf_darwin_compat_setxattr (dict_t *dict)
+{
+        data_t *data = NULL;
+
+        data = dict_get (dict, GF_FINDER_INFO_XATTR);
+        if (data)
+                return 0;
+        data = dict_get (dict, GF_RESOURCE_FORK_XATTR);
+        if (data)
+                return 0;
+
+        return -1;
+}
+
+#endif /* DARWIN */
+
+
+#ifdef GF_SOLARIS_HOST_OS
+
+/*
+ * Replace the contents of the already-open attribute file ATTRFD with the
+ * SIZE bytes at VALUE and close ATTRFD.
+ *
+ * Returns 0 on success.  On failure returns -1 with errno describing the
+ * truncate/write problem (EIO for a short write); errno is preserved
+ * across the close().
+ */
+static int
+solaris_xattr_store (int attrfd, const char *value, size_t size)
+{
+        ssize_t written     = 0;
+        int     ret         = 0;
+        int     saved_errno = 0;
+
+        if (ftruncate (attrfd, 0) == -1) {
+                ret = -1;
+        } else {
+                written = write (attrfd, value, size);
+                if (written == -1) {
+                        ret = -1;
+                } else if ((size_t) written != size) {
+                        /* a short write would leave a truncated value */
+                        errno = EIO;
+                        ret = -1;
+                }
+        }
+
+        saved_errno = errno;
+        close (attrfd);
+        if (ret == -1)
+                errno = saved_errno;
+
+        return ret;
+}
+
+
+/*
+ * Read from the already-open attribute file ATTRFD and close it.
+ *
+ * With SIZE == 0 nothing is read and the attribute's size (st_size) is
+ * returned; otherwise up to SIZE bytes are read into VALUE and the read()
+ * result is returned.  On failure returns -1 with errno preserved across
+ * the close().
+ */
+static int
+solaris_xattr_fetch (int attrfd, char *value, size_t size)
+{
+        struct stat buf;
+        int         ret         = -1;
+        int         saved_errno = 0;
+
+        if (size == 0) {
+                /* only trust st_size when fstat() actually succeeded */
+                if (fstat (attrfd, &buf) == 0)
+                        ret = buf.st_size;
+        } else {
+                ret = read (attrfd, value, size);
+        }
+
+        saved_errno = errno;
+        close (attrfd);
+        if (ret == -1)
+                errno = saved_errno;
+
+        return ret;
+}
+
+
+/*
+ * Set extended attribute KEY of the open file FD to the SIZE bytes at VALUE.
+ * FLAGS is OR-ed into the open flags of the attribute file.
+ *
+ * Returns 0 on success, -1 with errno set on failure.  Failures to open the
+ * attribute are logged unless errno is ENOENT.
+ */
+int
+solaris_fsetxattr(int fd,
+                  const char* key,
+                  const char *value,
+                  size_t size,
+                  int flags)
+{
+        int attrfd      = -1;
+        int saved_errno = 0;
+
+        attrfd = openat (fd, key, flags|O_CREAT|O_WRONLY|O_XATTR, 0777);
+        if (attrfd < 0) {
+                /* keep errno intact for the caller, whatever logging does */
+                saved_errno = errno;
+                if (saved_errno != ENOENT)
+                        gf_log ("libglusterfs", GF_LOG_ERROR,
+                                "Couldn't set extended attribute for %d (%d)",
+                                fd, saved_errno);
+                errno = saved_errno;
+                return -1;
+        }
+
+        return solaris_xattr_store (attrfd, value, size);
+}
+
+
+/*
+ * Read extended attribute KEY of the open file FD.
+ *
+ * With SIZE == 0 the attribute's size is returned; otherwise up to SIZE
+ * bytes are read into VALUE and the byte count is returned.  Returns -1
+ * with errno set on failure; a missing attribute (ENOENT) is reported as
+ * ENODATA and is not logged.
+ */
+int
+solaris_fgetxattr(int fd,
+                  const char* key,
+                  char *value,
+                  size_t size)
+{
+        int attrfd      = -1;
+        int saved_errno = 0;
+
+        attrfd = openat (fd, key, O_RDONLY|O_XATTR);
+        if (attrfd < 0) {
+                saved_errno = errno;
+                if (saved_errno == ENOENT) {
+                        saved_errno = ENODATA;
+                } else {
+                        gf_log ("libglusterfs", GF_LOG_DEBUG,
+                                "Couldn't read extended attribute for the file %d (%d)",
+                                fd, saved_errno);
+                }
+                errno = saved_errno;
+                return -1;
+        }
+
+        return solaris_xattr_fetch (attrfd, value, size);
+}
+
+
+/*
+ * Set extended attribute KEY of the file at PATH to the SIZE bytes at VALUE.
+ * FLAGS is OR-ed into the open flags of the attribute file.
+ *
+ * Returns 0 on success, -1 with errno set on failure.  Failures to open the
+ * attribute are logged unless errno is ENOENT.
+ */
+int
+solaris_setxattr(const char *path,
+                 const char* key,
+                 const char *value,
+                 size_t size,
+                 int flags)
+{
+        int attrfd      = -1;
+        int saved_errno = 0;
+
+        attrfd = attropen (path, key, flags|O_CREAT|O_WRONLY, 0777);
+        if (attrfd < 0) {
+                saved_errno = errno;
+                if (saved_errno != ENOENT)
+                        gf_log ("libglusterfs", GF_LOG_ERROR,
+                                "Couldn't set extended attribute for %s (%d)",
+                                path, saved_errno);
+                errno = saved_errno;
+                return -1;
+        }
+
+        return solaris_xattr_store (attrfd, value, size);
+}
+
+
+/*
+ * List the extended attribute names of PATH.
+ *
+ * With SIZE == 0 nothing is written and the number of bytes needed for all
+ * names (each followed by a NUL) is returned.  Otherwise the names are
+ * copied into LIST, NUL-separated, and the number of bytes used is
+ * returned; if LIST is too small, -1 is returned with errno == ERANGE.
+ * The "." and ".." entries of the attribute directory are skipped.
+ */
+int
+solaris_listxattr(const char *path,
+                  char *list,
+                  size_t size)
+{
+        int            attrdirfd   = -1;
+        ssize_t        len         = 0;
+        DIR           *dirptr      = NULL;
+        struct dirent *dent        = NULL;
+        int            newfd       = -1;
+        int            saved_errno = 0;
+
+        attrdirfd = attropen (path, ".", O_RDONLY, 0);
+        if (attrdirfd < 0) {
+                /* NOTE(review): a failed attropen() is reported as an empty
+                 * list (0), not as an error - confirm callers rely on it. */
+                return len;
+        }
+
+        /* closedir() closes the descriptor it was given, so iterate over a
+         * duplicate and close attrdirfd separately. */
+        newfd = dup(attrdirfd);
+        dirptr = fdopendir(newfd);
+        if (!dirptr) {
+                saved_errno = errno;
+                if (newfd >= 0)
+                        close (newfd);  /* not owned by a DIR: don't leak it */
+                close (attrdirfd);
+                errno = saved_errno;
+                return -1;
+        }
+
+        while ((dent = readdir(dirptr))) {
+                size_t listlen = strlen(dent->d_name);
+                if (!strcmp(dent->d_name, ".") || !strcmp(dent->d_name, "..")) {
+                        /* we don't want "." and ".." here */
+                        continue;
+                }
+                if (size == 0) {
+                        /* return the current size of the list of extended attribute names*/
+                        len += listlen + 1;
+                } else {
+                        /* check size and copy entry + nul into list. */
+                        if ((len + listlen + 1) > size) {
+                                len = -1;
+                                break;
+                        } else {
+                                strncpy(list + len, dent->d_name, listlen);
+                                len += listlen;
+                                list[len] = '\0';
+                                ++len;
+                        }
+                }
+        }
+
+        if (closedir(dirptr) == -1) {
+                close (attrdirfd);
+                return -1;
+        }
+        close (attrdirfd);
+
+        /* set ERANGE last so the cleanup calls above cannot clobber it */
+        if (len == -1)
+                errno = ERANGE;
+
+        return len;
+}
+
+/*
+ * Remove extended attribute KEY from the file at PATH.
+ *
+ * Returns the unlinkat() result, or -1 when the attribute directory cannot
+ * be opened (ENOENT is then reported as ENODATA).
+ */
+int
+solaris_removexattr(const char *path,
+                    const char* key)
+{
+        int ret    = -1;
+        int attrfd = attropen (path, ".", O_RDONLY, 0);
+
+        if (attrfd < 0) {
+                if (errno == ENOENT)
+                        errno = ENODATA;
+                return -1;
+        }
+
+        ret = unlinkat (attrfd, key, 0);
+        close (attrfd);
+
+        return ret;
+}
+
+/*
+ * Read extended attribute KEY of the file at PATH.
+ *
+ * With SIZE == 0 the attribute's size is returned; otherwise up to SIZE
+ * bytes are read into VALUE and the byte count is returned.  Returns -1
+ * with errno set on failure; a missing attribute (ENOENT) is reported as
+ * ENODATA and is not logged.
+ */
+int
+solaris_getxattr(const char *path,
+                 const char* key,
+                 char *value,
+                 size_t size)
+{
+        int attrfd      = -1;
+        int saved_errno = 0;
+
+        attrfd = attropen (path, key, O_RDONLY, 0);
+        if (attrfd < 0) {
+                saved_errno = errno;
+                if (saved_errno == ENOENT) {
+                        saved_errno = ENODATA;
+                } else {
+                        gf_log ("libglusterfs", GF_LOG_DEBUG,
+                                "Couldn't read extended attribute for the file %s (%d)",
+                                path, saved_errno);
+                }
+                errno = saved_errno;
+                return -1;
+        }
+
+        return solaris_xattr_fetch (attrfd, value, size);
+}
+
+
+int
+asprintf(char **string_ptr, const char *format, ...)
+{
+        va_list arg;
+        char   *str  = NULL;
+        int     size = 0;
+        int     rv   = 0;
+
+        /*
+         * Replacement for GNU asprintf(): format into a buffer obtained
+         * with MALLOC and store it in *string_ptr.  Returns the
+         * vsnprintf() result on success; on failure returns -1 and sets
+         * *string_ptr to NULL.
+         */
+        if (!string_ptr || !format)
+                return -1;
+
+        /* First pass only measures the formatted length. */
+        va_start(arg, format);
+        size = vsnprintf(NULL, 0, format, arg);
+        va_end(arg);    /* a va_list must be ended before it is restarted */
+
+        if (size < 0) {
+                *string_ptr = NULL;
+                return -1;
+        }
+        size++;         /* room for the terminating NUL */
+
+        str = MALLOC(size);
+        if (str == NULL) {
+                /*
+                 * Strictly speaking, GNU asprintf doesn't do this,
+                 * but the caller isn't checking the return value.
+                 */
+                *string_ptr = NULL;
+                gf_log ("libglusterfs", GF_LOG_CRITICAL, "failed to allocate memory");
+                return -1;
+        }
+
+        /* Second pass formats into the buffer with a fresh va_list. */
+        va_start(arg, format);
+        rv = vsnprintf(str, size, format, arg);
+        va_end(arg);
+
+        *string_ptr = str;
+        return (rv);
+}
+
+/*
+ * Replacement for strsep(): return the token starting at *str and advance
+ * *str past the first character found in DELIMS (which is overwritten with
+ * NUL).  When no delimiter remains, *str is set to NULL and the rest of the
+ * string is returned.  Returns NULL once *str is NULL.
+ */
+char* strsep(char** str, const char* delims)
+{
+        char* token;
+
+        if (*str==NULL) {
+                /* No more tokens */
+                return NULL;
+        }
+
+        token=*str;
+        while (**str!='\0') {
+                if (strchr(delims,**str)!=NULL) {
+                        **str='\0';
+                        (*str)++;
+                        return token;
+                }
+                (*str)++;
+        }
+        /* There is no other token */
+        *str=NULL;
+        return token;
+}
+
+#endif /* GF_SOLARIS_HOST_OS */
+
+#ifndef HAVE_STRNLEN
+/*
+ * Length of STRING, looking at no more than MAXLEN bytes.
+ */
+size_t
+strnlen(const char *string, size_t maxlen)
+{
+        /* count in size_t: an int counter mixes signedness with maxlen
+         * and overflows for very large bounds */
+        size_t len = 0;
+
+        while ((len < maxlen) && string[len])
+                len++;
+        return len;
+}
+#endif /* STRNLEN */
diff --git a/libglusterfs/src/compat.h b/libglusterfs/src/compat.h
new file mode 100644
index 000000000..1e8ccaab1
--- /dev/null
+++ b/libglusterfs/src/compat.h
@@ -0,0 +1,356 @@
+/*
+   Copyright (c) 2006, 2007, 2008 Z RESEARCH, Inc.
+   This file is part of GlusterFS.
+
+   GlusterFS is free software; you can redistribute it and/or modify
+   it under the terms of the GNU General Public License as published
+   by the Free Software Foundation; either version 3 of the License,
+   or (at your option) any later version.
+
+   GlusterFS is distributed in the hope that it will be useful, but
+   WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   General Public License for more details.
+ + You should have received a copy of the GNU General Public License + along with this program. If not, see + . +*/ + +#ifndef __COMPAT_H__ +#define __COMPAT_H__ + +#ifndef _CONFIG_H +#define _CONFIG_H +#include "config.h" +#endif + +#include +#include "dict.h" + +#ifndef LLONG_MAX +#define LLONG_MAX LONG_LONG_MAX /* compat with old gcc */ +#endif /* LLONG_MAX */ + + +#ifdef GF_LINUX_HOST_OS + +#define UNIX_PATH_MAX 108 + +#include +#include +#include +#include + + +#ifndef HAVE_LLISTXATTR + +/* This part is valid only incase of old glibc which doesn't support + * 'llistxattr()' system calls. + */ + +#define lremovexattr(path,key) removexattr(path,key) +#define llistxattr(path,key,size) listxattr(path,key,size) +#define lgetxattr(path, key, value, size) getxattr(path,key,value,size) +#define lsetxattr(path,key,value,size,flags) setxattr(path,key,value,size,flags) + +#endif /* HAVE_LLISTXATTR */ +#endif /* GF_LINUX_HOST_OS */ + +#ifdef GF_BSD_HOST_OS +/* In case of FreeBSD */ + +#define UNIX_PATH_MAX 104 +#include + +#include +#include +#include +#include + +#include + +enum { + ATTR_CREATE = 1, +#define XATTR_CREATE ATTR_CREATE + ATTR_REPLACE = 2 +#define XATTR_REPLACE ATTR_REPLACE +}; + + +#ifndef sighandler_t +#define sighandler_t sig_t +#endif + +#ifndef ino64_t +#define ino64_t ino_t +#endif + +#ifndef EUCLEAN +#define EUCLEAN 0 +#endif + +#include +#ifndef s6_addr16 +#define s6_addr16 __u6_addr.__u6_addr16 +#endif +#ifndef s6_addr32 +#define s6_addr32 __u6_addr.__u6_addr32 +#endif + +/* Posix dictates NAME_MAX to be used */ +# ifndef NAME_MAX +# ifdef MAXNAMLEN +# define NAME_MAX MAXNAMLEN +# else +# define NAME_MAX 255 +# endif +# endif + +#define lremovexattr(path,key) extattr_delete_link(path, EXTATTR_NAMESPACE_USER, key) +#define llistxattr(path,key,size) extattr_list_link(path, EXTATTR_NAMESPACE_USER, key, size) +#define lgetxattr(path, key, value, size) extattr_get_link(path, EXTATTR_NAMESPACE_USER, key, value, size) +#define 
lsetxattr(path,key,value,size,flags) extattr_set_link(path, EXTATTR_NAMESPACE_USER, key, value, size) +#define fgetxattr(fd,key,value,size) extattr_get_fd(fd, EXTATTR_NAMESPACE_USER, key, value, size) +#define fsetxattr(fd,key,value,size,flag) extattr_set_fd(fd, EXTATTR_NAMESPACE_USER, key, value, size) + + +#define F_GETLK64 F_GETLK +#define F_SETLK64 F_SETLK +#define F_SETLKW64 F_SETLKW + +#endif /* GF_BSD_HOST_OS */ + +#ifdef GF_DARWIN_HOST_OS + +#define UNIX_PATH_MAX 104 +#include + +#include +#include +#include +#include + +#include + + +#if __DARWIN_64_BIT_INO_T == 0 +# error '64 bit ino_t is must for GlusterFS to work, Compile with "CFLAGS=-D__DARWIN_64_BIT_INO_T"' +#endif /* __DARWIN_64_BIT_INO_T */ + + +#if __DARWIN_64_BIT_INO_T == 0 +# error '64 bit ino_t is must for GlusterFS to work, Compile with "CFLAGS=-D__DARWIN_64_BIT_INO_T"' +#endif /* __DARWIN_64_BIT_INO_T */ + +#ifndef sighandler_t +#define sighandler_t sig_t +#endif + +#ifndef EUCLEAN +#define EUCLEAN 0 +#endif + +#include +#ifndef s6_addr16 +#define s6_addr16 __u6_addr.__u6_addr16 +#endif +#ifndef s6_addr32 +#define s6_addr32 __u6_addr.__u6_addr32 +#endif + +/* Posix dictates NAME_MAX to be used */ +# ifndef NAME_MAX +# ifdef MAXNAMLEN +# define NAME_MAX MAXNAMLEN +# else +# define NAME_MAX 255 +# endif +# endif + +#define llistxattr(path,key,size) listxattr(path,key,size,XATTR_NOFOLLOW) +#define lgetxattr(path,key,value,size) getxattr(path,key,value,size,0,XATTR_NOFOLLOW) +#define lsetxattr(path,key,value,size,flags) setxattr(path,key,value,size,0,flags|XATTR_NOFOLLOW) +#define lremovexattr(path,key) removexattr(path,key,XATTR_NOFOLLOW) +#define fgetxattr(path,key,value,size) fgetxattr(path,key,value,size,0,0) +#define fsetxattr(path,key,value,size,flag) fsetxattr(path,key,value,size,0,flag) + +#define F_GETLK64 F_GETLK +#define F_SETLK64 F_SETLK +#define F_SETLKW64 F_SETLKW + +int32_t gf_darwin_compat_listxattr (int len, dict_t *dict, int size); +int32_t gf_darwin_compat_getxattr (const char 
*key, dict_t *dict); +int32_t gf_darwin_compat_setxattr (dict_t *dict); + +#endif /* GF_DARWIN_HOST_OS */ + +#ifdef GF_SOLARIS_HOST_OS + +#define UNIX_PATH_MAX 108 +#define EUCLEAN 117 + +#include +#include +#include +#include +#include +#include +#include + +#ifndef lchmod +#define lchmod chmod +#endif + +enum { + ATTR_CREATE = 1, +#define XATTR_CREATE ATTR_CREATE + ATTR_REPLACE = 2 +#define XATTR_REPLACE ATTR_REPLACE +}; + +/* This patch is not present in Solaris 10 and before */ +#ifndef dirfd +#define dirfd(dirp) ((dirp)->dd_fd) +#endif + +/* Posix dictates NAME_MAX to be used */ +# ifndef NAME_MAX +# ifdef MAXNAMLEN +# define NAME_MAX MAXNAMLEN +# else +# define NAME_MAX 255 +# endif +# endif + +#include +#ifndef s6_addr16 +#define S6_ADDR16(x) ((uint16_t*) ((char*)&(x).s6_addr)) +#endif +#ifndef s6_addr32 +#define s6_addr32 _S6_un._S6_u32 +#endif + +#define lremovexattr(path,key) solaris_removexattr(path,key) +#define llistxattr(path,key,size) solaris_listxattr(path,key,size) +#define lgetxattr(path,key,value,size) solaris_getxattr(path,key,value,size) +#define lsetxattr(path,key,value,size,flags) solaris_setxattr(path,key,value,size,flags) +#define fgetxattr(fd,key,value,size) solaris_fgetxattr(fd,key,value,size) +#define fsetxattr(fd,key,value,size,flags) solaris_fsetxattr(fd,key,value,size,flags) +#define lutimes(filename,times) utimes(filename,times) + +int asprintf(char **string_ptr, const char *format, ...); +char* strsep(char** str, const char* delims); +int solaris_listxattr(const char *path, char *list, size_t size); +int solaris_removexattr(const char *path, const char* key); +int solaris_getxattr(const char *path, const char* key, + char *value, size_t size); +int solaris_setxattr(const char *path, const char* key, const char *value, + size_t size, int flags); +int solaris_fgetxattr(int fd, const char* key, + char *value, size_t size); +int solaris_fsetxattr(int fd, const char* key, const char *value, + size_t size, int flags); + +#endif /* 
GF_SOLARIS_HOST_OS */ + +#ifndef HAVE_ARGP +#include "argp.h" +#else +#include +#endif /* HAVE_ARGP */ + +#ifndef HAVE_STRNLEN +size_t strnlen(const char *string, size_t maxlen); +#endif /* STRNLEN */ + +#ifndef strdupa +#define strdupa(s) \ + (__extension__ \ + ({ \ + __const char *__old = (s); \ + size_t __len = strlen (__old) + 1; \ + char *__new = (char *) __builtin_alloca (__len); \ + (char *) memcpy (__new, __old, __len); \ + })) +#endif + +#define ALIGN(x) (((x) + sizeof (uint64_t) - 1) & ~(sizeof (uint64_t) - 1)) + +#include +#include + +static inline int32_t +dirent_size (struct dirent *entry) +{ +#ifdef GF_BSD_HOST_OS + return ALIGN (24 /* FIX MEEEE!!! */ + entry->d_namlen); +#endif +#ifdef GF_DARWIN_HOST_OS + return ALIGN (24 /* FIX MEEEE!!! */ + entry->d_namlen); +#endif +#ifdef GF_LINUX_HOST_OS + return ALIGN (24 /* FIX MEEEE!!! */ + entry->d_reclen); +#endif +#ifdef GF_SOLARIS_HOST_OS + return ALIGN (24 /* FIX MEEEE!!! */ + entry->d_reclen); +#endif +} + + +static inline int32_t +gf_compat_getxattr (const char *key, dict_t *dict) +{ +#ifdef GF_DARWIN_HOST_OS + return gf_darwin_compat_getxattr (key, dict); +#endif + return -1; +} + + +static inline int32_t +gf_compat_setxattr (dict_t *dict) +{ +#ifdef GF_DARWIN_HOST_OS + return gf_darwin_compat_setxattr (dict); +#endif + return -1; +} + + +static inline int32_t +gf_compat_listxattr (int len, dict_t *dict, int size) +{ +#ifdef GF_DARWIN_HOST_OS + return gf_darwin_compat_listxattr (len, dict, size); +#endif + return len; +} + + +#ifdef HAVE_STRUCT_STAT_ST_ATIM_TV_NSEC +/* Linux, Solaris, Cygwin */ +#define ST_ATIM_NSEC(stbuf) ((stbuf)->st_atim.tv_nsec) +#define ST_CTIM_NSEC(stbuf) ((stbuf)->st_ctim.tv_nsec) +#define ST_MTIM_NSEC(stbuf) ((stbuf)->st_mtim.tv_nsec) +#define ST_ATIM_NSEC_SET(stbuf, val) ((stbuf)->st_atim.tv_nsec = (val)) +#define ST_MTIM_NSEC_SET(stbuf, val) ((stbuf)->st_mtim.tv_nsec = (val)) +#define ST_CTIM_NSEC_SET(stbuf, val) ((stbuf)->st_ctim.tv_nsec = (val)) +#elif 
defined(HAVE_STRUCT_STAT_ST_ATIMESPEC_TV_NSEC) +/* FreeBSD, NetBSD */ +#define ST_ATIM_NSEC(stbuf) ((stbuf)->st_atimespec.tv_nsec) +#define ST_CTIM_NSEC(stbuf) ((stbuf)->st_ctimespec.tv_nsec) +#define ST_MTIM_NSEC(stbuf) ((stbuf)->st_mtimespec.tv_nsec) +#define ST_ATIM_NSEC_SET(stbuf, val) ((stbuf)->st_atimespec.tv_nsec = (val)) +#define ST_MTIM_NSEC_SET(stbuf, val) ((stbuf)->st_mtimespec.tv_nsec = (val)) +#define ST_CTIM_NSEC_SET(stbuf, val) ((stbuf)->st_ctimespec.tv_nsec = (val)) +#else +#define ST_ATIM_NSEC(stbuf) (0) +#define ST_CTIM_NSEC(stbuf) (0) +#define ST_MTIM_NSEC(stbuf) (0) +#define ST_ATIM_NSEC_SET(stbuf, val) do { } while (0); +#define ST_MTIM_NSEC_SET(stbuf, val) do { } while (0); +#define ST_CTIM_NSEC_SET(stbuf, val) do { } while (0); +#endif + +#endif /* __COMPAT_H__ */ diff --git a/libglusterfs/src/defaults.c b/libglusterfs/src/defaults.c new file mode 100644 index 000000000..575e3d86e --- /dev/null +++ b/libglusterfs/src/defaults.c @@ -0,0 +1,1388 @@ +/* + Copyright (c) 2006, 2007, 2008 Z RESEARCH, Inc. + This file is part of GlusterFS. + + GlusterFS is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published + by the Free Software Foundation; either version 3 of the License, + or (at your option) any later version. + + GlusterFS is distributed in the hope that it will be useful, but + WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program. If not, see + . +*/ + +/* libglusterfs/src/defaults.c: + This file contains functions, which are used to fill the 'fops' and 'mops' + structures in the xlator structures, if they are not written. 
Here, all the + function calls are plainly forwared to the first child of the xlator, and + all the *_cbk function does plain STACK_UNWIND of the frame, and returns. + + All the functions are plain enough to understand. +*/ + +#ifndef _CONFIG_H +#define _CONFIG_H +#include "config.h" +#endif + +#include "xlator.h" + +static int32_t +default_lookup_cbk (call_frame_t *frame, + void *cookie, + xlator_t *this, + int32_t op_ret, + int32_t op_errno, + inode_t *inode, + struct stat *buf, + dict_t *dict) +{ + STACK_UNWIND (frame, + op_ret, + op_errno, + inode, + buf, + dict); + return 0; +} + +int32_t +default_lookup (call_frame_t *frame, + xlator_t *this, + loc_t *loc, + dict_t *xattr_req) +{ + STACK_WIND (frame, + default_lookup_cbk, + FIRST_CHILD(this), + FIRST_CHILD(this)->fops->lookup, + loc, + xattr_req); + return 0; +} + + +int32_t +default_forget (xlator_t *this, + inode_t *inode) +{ + return 0; +} + +static int32_t +default_stat_cbk (call_frame_t *frame, + void *cookie, + xlator_t *this, + int32_t op_ret, + int32_t op_errno, + struct stat *buf) +{ + STACK_UNWIND (frame, + op_ret, + op_errno, + buf); + return 0; +} + +int32_t +default_stat (call_frame_t *frame, + xlator_t *this, + loc_t *loc) +{ + STACK_WIND (frame, + default_stat_cbk, + FIRST_CHILD(this), + FIRST_CHILD(this)->fops->stat, + loc); + return 0; +} + +static int32_t +default_chmod_cbk (call_frame_t *frame, + void *cookie, + xlator_t *this, + int32_t op_ret, + int32_t op_errno, + struct stat *buf) +{ + STACK_UNWIND (frame, + op_ret, + op_errno, + buf); + return 0; +} + +int32_t +default_chmod (call_frame_t *frame, + xlator_t *this, + loc_t *loc, + mode_t mode) +{ + STACK_WIND (frame, + default_chmod_cbk, + FIRST_CHILD(this), + FIRST_CHILD(this)->fops->chmod, + loc, + mode); + return 0; +} + + +static int32_t +default_fchmod_cbk (call_frame_t *frame, + void *cookie, + xlator_t *this, + int32_t op_ret, + int32_t op_errno, + struct stat *buf) +{ + STACK_UNWIND (frame, + op_ret, + op_errno, + buf); + return 
0; +} + +int32_t +default_fchmod (call_frame_t *frame, + xlator_t *this, + fd_t *fd, + mode_t mode) +{ + STACK_WIND (frame, + default_fchmod_cbk, + FIRST_CHILD(this), + FIRST_CHILD(this)->fops->fchmod, + fd, + mode); + return 0; +} + +static int32_t +default_chown_cbk (call_frame_t *frame, + void *cookie, + xlator_t *this, + int32_t op_ret, + int32_t op_errno, + struct stat *buf) +{ + STACK_UNWIND (frame, + op_ret, + op_errno, + buf); + return 0; +} + +int32_t +default_chown (call_frame_t *frame, + xlator_t *this, + loc_t *loc, + uid_t uid, + gid_t gid) +{ + STACK_WIND (frame, + default_chown_cbk, + FIRST_CHILD(this), + FIRST_CHILD(this)->fops->chown, + loc, + uid, + gid); + return 0; +} + +static int32_t +default_fchown_cbk (call_frame_t *frame, + void *cookie, + xlator_t *this, + int32_t op_ret, + int32_t op_errno, + struct stat *buf) +{ + STACK_UNWIND (frame, + op_ret, + op_errno, + buf); + return 0; +} + +int32_t +default_fchown (call_frame_t *frame, + xlator_t *this, + fd_t *fd, + uid_t uid, + gid_t gid) +{ + STACK_WIND (frame, + default_fchown_cbk, + FIRST_CHILD(this), + FIRST_CHILD(this)->fops->fchown, + fd, + uid, + gid); + return 0; +} + +static int32_t +default_truncate_cbk (call_frame_t *frame, + void *cookie, + xlator_t *this, + int32_t op_ret, + int32_t op_errno, + struct stat *buf) +{ + STACK_UNWIND (frame, + op_ret, + op_errno, + buf); + return 0; +} + +int32_t +default_truncate (call_frame_t *frame, + xlator_t *this, + loc_t *loc, + off_t offset) +{ + STACK_WIND (frame, + default_truncate_cbk, + FIRST_CHILD(this), + FIRST_CHILD(this)->fops->truncate, + loc, + offset); + return 0; +} + +static int32_t +default_ftruncate_cbk (call_frame_t *frame, + void *cookie, + xlator_t *this, + int32_t op_ret, + int32_t op_errno, + struct stat *buf) +{ + STACK_UNWIND (frame, + op_ret, + op_errno, + buf); + return 0; +} + +int32_t +default_ftruncate (call_frame_t *frame, + xlator_t *this, + fd_t *fd, + off_t offset) +{ + STACK_WIND (frame, + default_ftruncate_cbk, 
+ FIRST_CHILD(this), + FIRST_CHILD(this)->fops->ftruncate, + fd, + offset); + return 0; +} + +int32_t +default_utimens_cbk (call_frame_t *frame, + void *cookie, + xlator_t *this, + int32_t op_ret, + int32_t op_errno, + struct stat *buf) +{ + STACK_UNWIND (frame, + op_ret, + op_errno, + buf); + return 0; +} + + +int32_t +default_utimens (call_frame_t *frame, + xlator_t *this, + loc_t *loc, + struct timespec tv[2]) +{ + STACK_WIND (frame, + default_utimens_cbk, + FIRST_CHILD(this), + FIRST_CHILD(this)->fops->utimens, + loc, + tv); + return 0; +} + +static int32_t +default_access_cbk (call_frame_t *frame, + void *cookie, + xlator_t *this, + int32_t op_ret, + int32_t op_errno) +{ + STACK_UNWIND (frame, + op_ret, + op_errno); + return 0; +} + +int32_t +default_access (call_frame_t *frame, + xlator_t *this, + loc_t *loc, + int32_t mask) +{ + STACK_WIND (frame, + default_access_cbk, + FIRST_CHILD(this), + FIRST_CHILD(this)->fops->access, + loc, + mask); + return 0; +} + + +static int32_t +default_readlink_cbk (call_frame_t *frame, + void *cookie, + xlator_t *this, + int32_t op_ret, + int32_t op_errno, + const char *path) +{ + STACK_UNWIND (frame, + op_ret, + op_errno, + path); + return 0; +} + +int32_t +default_readlink (call_frame_t *frame, + xlator_t *this, + loc_t *loc, + size_t size) +{ + STACK_WIND (frame, + default_readlink_cbk, + FIRST_CHILD(this), + FIRST_CHILD(this)->fops->readlink, + loc, + size); + return 0; +} + + +static int32_t +default_mknod_cbk (call_frame_t *frame, + void *cookie, + xlator_t *this, + int32_t op_ret, + int32_t op_errno, + inode_t *inode, + struct stat *buf) +{ + STACK_UNWIND (frame, + op_ret, + op_errno, + inode, + buf); + return 0; +} + +int32_t +default_mknod (call_frame_t *frame, + xlator_t *this, + loc_t *loc, + mode_t mode, + dev_t rdev) +{ + STACK_WIND (frame, + default_mknod_cbk, + FIRST_CHILD(this), + FIRST_CHILD(this)->fops->mknod, + loc, mode, rdev); + return 0; +} + +static int32_t +default_mkdir_cbk (call_frame_t *frame, + void 
*cookie, + xlator_t *this, + int32_t op_ret, + int32_t op_errno, + inode_t *inode, + struct stat *buf) +{ + STACK_UNWIND (frame, + op_ret, + op_errno, + inode, + buf); + return 0; +} + +int32_t +default_mkdir (call_frame_t *frame, + xlator_t *this, + loc_t *loc, + mode_t mode) +{ + STACK_WIND (frame, + default_mkdir_cbk, + FIRST_CHILD(this), + FIRST_CHILD(this)->fops->mkdir, + loc, mode); + return 0; +} + +static int32_t +default_unlink_cbk (call_frame_t *frame, + void *cookie, + xlator_t *this, + int32_t op_ret, + int32_t op_errno) +{ + STACK_UNWIND (frame, op_ret, op_errno); + return 0; +} + +int32_t +default_unlink (call_frame_t *frame, + xlator_t *this, + loc_t *loc) +{ + STACK_WIND (frame, + default_unlink_cbk, + FIRST_CHILD(this), + FIRST_CHILD(this)->fops->unlink, + loc); + return 0; +} + +static int32_t +default_rmdir_cbk (call_frame_t *frame, + void *cookie, + xlator_t *this, + int32_t op_ret, + int32_t op_errno) +{ + STACK_UNWIND (frame, + op_ret, + op_errno); + return 0; +} + +int32_t +default_rmdir (call_frame_t *frame, + xlator_t *this, + loc_t *loc) +{ + STACK_WIND (frame, + default_rmdir_cbk, + FIRST_CHILD(this), + FIRST_CHILD(this)->fops->rmdir, + loc); + return 0; +} + + +static int32_t +default_symlink_cbk (call_frame_t *frame, + void *cookie, + xlator_t *this, + int32_t op_ret, + int32_t op_errno, + inode_t *inode, + struct stat *buf) +{ + STACK_UNWIND (frame, op_ret, op_errno, inode, buf); + return 0; +} + +int32_t +default_symlink (call_frame_t *frame, + xlator_t *this, + const char *linkpath, + loc_t *loc) +{ + STACK_WIND (frame, + default_symlink_cbk, + FIRST_CHILD(this), + FIRST_CHILD(this)->fops->symlink, + linkpath, loc); + return 0; +} + + +static int32_t +default_rename_cbk (call_frame_t *frame, + void *cookie, + xlator_t *this, + int32_t op_ret, + int32_t op_errno, + struct stat *buf) +{ + STACK_UNWIND (frame, op_ret, op_errno, buf); + return 0; +} + +int32_t +default_rename (call_frame_t *frame, + xlator_t *this, + loc_t *oldloc, + 
loc_t *newloc) +{ + STACK_WIND (frame, + default_rename_cbk, + FIRST_CHILD(this), + FIRST_CHILD(this)->fops->rename, + oldloc, newloc); + return 0; +} + + +static int32_t +default_link_cbk (call_frame_t *frame, + void *cookie, + xlator_t *this, + int32_t op_ret, + int32_t op_errno, + inode_t *inode, + struct stat *buf) +{ + STACK_UNWIND (frame, op_ret, op_errno, inode, buf); + return 0; +} + +int32_t +default_link (call_frame_t *frame, + xlator_t *this, + loc_t *oldloc, + loc_t *newloc) +{ + STACK_WIND (frame, + default_link_cbk, + FIRST_CHILD(this), + FIRST_CHILD(this)->fops->link, + oldloc, newloc); + return 0; +} + + +static int32_t +default_create_cbk (call_frame_t *frame, + void *cookie, + xlator_t *this, + int32_t op_ret, + int32_t op_errno, + fd_t *fd, + inode_t *inode, + struct stat *buf) +{ + STACK_UNWIND (frame, op_ret, op_errno, fd, inode, buf); + return 0; +} + +int32_t +default_create (call_frame_t *frame, + xlator_t *this, + loc_t *loc, + int32_t flags, + mode_t mode, fd_t *fd) +{ + STACK_WIND (frame, default_create_cbk, + FIRST_CHILD(this), + FIRST_CHILD(this)->fops->create, + loc, flags, mode, fd); + return 0; +} + +static int32_t +default_open_cbk (call_frame_t *frame, + void *cookie, + xlator_t *this, + int32_t op_ret, + int32_t op_errno, + fd_t *fd) +{ + STACK_UNWIND (frame, + op_ret, + op_errno, + fd); + return 0; +} + +int32_t +default_open (call_frame_t *frame, + xlator_t *this, + loc_t *loc, + int32_t flags, fd_t *fd) +{ + STACK_WIND (frame, + default_open_cbk, + FIRST_CHILD(this), + FIRST_CHILD(this)->fops->open, + loc, flags, fd); + return 0; +} + +static int32_t +default_readv_cbk (call_frame_t *frame, + void *cookie, + xlator_t *this, + int32_t op_ret, + int32_t op_errno, + struct iovec *vector, + int32_t count, + struct stat *stbuf) +{ + STACK_UNWIND (frame, + op_ret, + op_errno, + vector, + count, + stbuf); + return 0; +} + +int32_t +default_readv (call_frame_t *frame, + xlator_t *this, + fd_t *fd, + size_t size, + off_t offset) +{ + 
STACK_WIND (frame, + default_readv_cbk, + FIRST_CHILD(this), + FIRST_CHILD(this)->fops->readv, + fd, + size, + offset); + return 0; +} + + +static int32_t +default_writev_cbk (call_frame_t *frame, + void *cookie, + xlator_t *this, + int32_t op_ret, + int32_t op_errno, + struct stat *stbuf) +{ + STACK_UNWIND (frame, + op_ret, + op_errno, + stbuf); + return 0; +} + +int32_t +default_writev (call_frame_t *frame, + xlator_t *this, + fd_t *fd, + struct iovec *vector, + int32_t count, + off_t off) +{ + STACK_WIND (frame, + default_writev_cbk, + FIRST_CHILD(this), + FIRST_CHILD(this)->fops->writev, + fd, + vector, + count, + off); + return 0; +} + +static int32_t +default_flush_cbk (call_frame_t *frame, + void *cookie, + xlator_t *this, + int32_t op_ret, + int32_t op_errno) +{ + STACK_UNWIND (frame, + op_ret, + op_errno); + return 0; +} + +int32_t +default_flush (call_frame_t *frame, + xlator_t *this, + fd_t *fd) +{ + STACK_WIND (frame, + default_flush_cbk, + FIRST_CHILD(this), + FIRST_CHILD(this)->fops->flush, + fd); + return 0; +} + + +static int32_t +default_fsync_cbk (call_frame_t *frame, + void *cookie, + xlator_t *this, + int32_t op_ret, + int32_t op_errno) +{ + STACK_UNWIND (frame, + op_ret, + op_errno); + return 0; +} + +int32_t +default_fsync (call_frame_t *frame, + xlator_t *this, + fd_t *fd, + int32_t flags) +{ + STACK_WIND (frame, + default_fsync_cbk, + FIRST_CHILD(this), + FIRST_CHILD(this)->fops->fsync, + fd, + flags); + return 0; +} + +static int32_t +default_fstat_cbk (call_frame_t *frame, + void *cookie, + xlator_t *this, + int32_t op_ret, + int32_t op_errno, + struct stat *buf) +{ + STACK_UNWIND (frame, + op_ret, + op_errno, + buf); + return 0; +} + +int32_t +default_fstat (call_frame_t *frame, + xlator_t *this, + fd_t *fd) +{ + STACK_WIND (frame, + default_fstat_cbk, + FIRST_CHILD(this), + FIRST_CHILD(this)->fops->fstat, + fd); + return 0; +} + +static int32_t +default_opendir_cbk (call_frame_t *frame, + void *cookie, + xlator_t *this, + int32_t op_ret, 
+ int32_t op_errno, + fd_t *fd) +{ + STACK_UNWIND (frame, + op_ret, + op_errno, + fd); + return 0; +} + +int32_t +default_opendir (call_frame_t *frame, + xlator_t *this, + loc_t *loc, fd_t *fd) +{ + STACK_WIND (frame, + default_opendir_cbk, + FIRST_CHILD(this), + FIRST_CHILD(this)->fops->opendir, + loc, fd); + return 0; +} + + +static int32_t +default_getdents_cbk (call_frame_t *frame, + void *cookie, + xlator_t *this, + int32_t op_ret, + int32_t op_errno, + dir_entry_t *entries, + int32_t count) +{ + STACK_UNWIND (frame, + op_ret, + op_errno, + entries, + count); + return 0; +} + +int32_t +default_getdents (call_frame_t *frame, + xlator_t *this, + fd_t *fd, + size_t size, + off_t offset, + int32_t flag) +{ + STACK_WIND (frame, + default_getdents_cbk, + FIRST_CHILD(this), + FIRST_CHILD(this)->fops->getdents, + fd, + size, + offset, + flag); + return 0; +} + + +static int32_t +default_setdents_cbk (call_frame_t *frame, + void *cookie, + xlator_t *this, + int32_t op_ret, + int32_t op_errno) +{ + STACK_UNWIND (frame, + op_ret, + op_errno); + return 0; +} + +int32_t +default_setdents (call_frame_t *frame, + xlator_t *this, + fd_t *fd, + int32_t flags, + dir_entry_t *entries, + int32_t count) +{ + STACK_WIND (frame, + default_setdents_cbk, + FIRST_CHILD(this), + FIRST_CHILD(this)->fops->setdents, + fd, + flags, + entries, + count); + return 0; +} + + +static int32_t +default_fsyncdir_cbk (call_frame_t *frame, + void *cookie, + xlator_t *this, + int32_t op_ret, + int32_t op_errno) +{ + STACK_UNWIND (frame, + op_ret, + op_errno); + return 0; +} + +int32_t +default_fsyncdir (call_frame_t *frame, + xlator_t *this, + fd_t *fd, + int32_t flags) +{ + STACK_WIND (frame, + default_fsyncdir_cbk, + FIRST_CHILD(this), + FIRST_CHILD(this)->fops->fsyncdir, + fd, + flags); + return 0; +} + + +static int32_t +default_statfs_cbk (call_frame_t *frame, + void *cookie, + xlator_t *this, + int32_t op_ret, + int32_t op_errno, + struct statvfs *buf) +{ + STACK_UNWIND (frame, + op_ret, + 
op_errno, + buf); + return 0; +} + +int32_t +default_statfs (call_frame_t *frame, + xlator_t *this, + loc_t *loc) +{ + STACK_WIND (frame, + default_statfs_cbk, + FIRST_CHILD(this), + FIRST_CHILD(this)->fops->statfs, + loc); + return 0; +} + + +static int32_t +default_setxattr_cbk (call_frame_t *frame, + void *cookie, + xlator_t *this, + int32_t op_ret, + int32_t op_errno) +{ + STACK_UNWIND (frame, + op_ret, + op_errno); + return 0; +} + +int32_t +default_setxattr (call_frame_t *frame, + xlator_t *this, + loc_t *loc, + dict_t *dict, + int32_t flags) +{ + STACK_WIND (frame, + default_setxattr_cbk, + FIRST_CHILD(this), + FIRST_CHILD(this)->fops->setxattr, + loc, + dict, + flags); + return 0; +} + +static int32_t +default_getxattr_cbk (call_frame_t *frame, + void *cookie, + xlator_t *this, + int32_t op_ret, + int32_t op_errno, + dict_t *dict) +{ + STACK_UNWIND (frame, + op_ret, + op_errno, + dict); + return 0; +} + +int32_t +default_getxattr (call_frame_t *frame, + xlator_t *this, + loc_t *loc, + const char *name) +{ + STACK_WIND (frame, + default_getxattr_cbk, + FIRST_CHILD(this), + FIRST_CHILD(this)->fops->getxattr, + loc, + name); + return 0; +} + +int32_t +default_xattrop_cbk (call_frame_t *frame, + void *cookie, + xlator_t *this, + int32_t op_ret, + int32_t op_errno, + dict_t *dict) +{ + STACK_UNWIND (frame, op_ret, op_errno, dict); + return 0; +} + +int32_t +default_xattrop (call_frame_t *frame, + xlator_t *this, + loc_t *loc, + gf_xattrop_flags_t flags, + dict_t *dict) +{ + STACK_WIND (frame, + default_xattrop_cbk, + FIRST_CHILD(this), + FIRST_CHILD(this)->fops->xattrop, + loc, + flags, + dict); + return 0; +} + +int32_t +default_fxattrop_cbk (call_frame_t *frame, + void *cookie, + xlator_t *this, + int32_t op_ret, + int32_t op_errno, + dict_t *dict) +{ + STACK_UNWIND (frame, op_ret, op_errno, dict); + return 0; +} + +int32_t +default_fxattrop (call_frame_t *frame, + xlator_t *this, + fd_t *fd, + gf_xattrop_flags_t flags, + dict_t *dict) +{ + STACK_WIND (frame, 
+ default_fxattrop_cbk, + FIRST_CHILD(this), + FIRST_CHILD(this)->fops->fxattrop, + fd, + flags, + dict); + return 0; +} + + +static int32_t +default_removexattr_cbk (call_frame_t *frame, + void *cookie, + xlator_t *this, + int32_t op_ret, + int32_t op_errno) +{ + STACK_UNWIND (frame, + op_ret, + op_errno); + return 0; +} + +int32_t +default_removexattr (call_frame_t *frame, + xlator_t *this, + loc_t *loc, + const char *name) +{ + STACK_WIND (frame, + default_removexattr_cbk, + FIRST_CHILD(this), + FIRST_CHILD(this)->fops->removexattr, + loc, + name); + return 0; +} + +static int32_t +default_lk_cbk (call_frame_t *frame, + void *cookie, + xlator_t *this, + int32_t op_ret, + int32_t op_errno, + struct flock *lock) +{ + STACK_UNWIND (frame, + op_ret, + op_errno, + lock); + return 0; +} + +int32_t +default_lk (call_frame_t *frame, + xlator_t *this, + fd_t *fd, + int32_t cmd, + struct flock *lock) +{ + STACK_WIND (frame, + default_lk_cbk, + FIRST_CHILD(this), + FIRST_CHILD(this)->fops->lk, + fd, + cmd, + lock); + return 0; +} + + +static int32_t +default_inodelk_cbk (call_frame_t *frame, void *cookie, + xlator_t *this, int32_t op_ret, int32_t op_errno) + +{ + STACK_UNWIND (frame, op_ret, op_errno); + return 0; +} + + +int32_t +default_inodelk (call_frame_t *frame, xlator_t *this, + loc_t *loc, int32_t cmd, struct flock *lock) +{ + STACK_WIND (frame, + default_inodelk_cbk, + FIRST_CHILD(this), + FIRST_CHILD(this)->fops->inodelk, + loc, cmd, lock); + return 0; +} + + +static int32_t +default_finodelk_cbk (call_frame_t *frame, void *cookie, + xlator_t *this, int32_t op_ret, int32_t op_errno) + +{ + STACK_UNWIND (frame, op_ret, op_errno); + return 0; +} + + +int32_t +default_finodelk (call_frame_t *frame, xlator_t *this, + fd_t *fd, int32_t cmd, struct flock *lock) +{ + STACK_WIND (frame, + default_finodelk_cbk, + FIRST_CHILD(this), + FIRST_CHILD(this)->fops->finodelk, + fd, cmd, lock); + return 0; +} + + +static int32_t +default_entrylk_cbk (call_frame_t *frame, void 
*cookie, + xlator_t *this, int32_t op_ret, int32_t op_errno) + +{ + STACK_UNWIND (frame, op_ret, op_errno); + return 0; +} + +int32_t +default_entrylk (call_frame_t *frame, xlator_t *this, + loc_t *loc, const char *basename, + entrylk_cmd cmd, entrylk_type type) +{ + STACK_WIND (frame, default_entrylk_cbk, + FIRST_CHILD(this), + FIRST_CHILD(this)->fops->entrylk, + loc, basename, cmd, type); + return 0; +} + +static int32_t +default_fentrylk_cbk (call_frame_t *frame, void *cookie, + xlator_t *this, int32_t op_ret, int32_t op_errno) + +{ + STACK_UNWIND (frame, op_ret, op_errno); + return 0; +} + +int32_t +default_fentrylk (call_frame_t *frame, xlator_t *this, + fd_t *fd, const char *basename, + entrylk_cmd cmd, entrylk_type type) +{ + STACK_WIND (frame, default_fentrylk_cbk, + FIRST_CHILD(this), + FIRST_CHILD(this)->fops->fentrylk, + fd, basename, cmd, type); + return 0; +} + + +/* Management operations */ + +static int32_t +default_stats_cbk (call_frame_t *frame, + void *cookie, + xlator_t *this, + int32_t op_ret, + int32_t op_errno, + struct xlator_stats *stats) +{ + STACK_UNWIND (frame, + op_ret, + op_errno, + stats); + return 0; +} + + +int32_t +default_stats (call_frame_t *frame, + xlator_t *this, + int32_t flags) +{ + STACK_WIND (frame, + default_stats_cbk, + FIRST_CHILD(this), + FIRST_CHILD(this)->mops->stats, + flags); + return 0; +} + +static int32_t +default_getspec_cbk (call_frame_t *frame, + void *cookie, + xlator_t *this, + int32_t op_ret, + int32_t op_errno, + char *spec_data) +{ + STACK_UNWIND (frame, + op_ret, + op_errno, + spec_data); + return 0; +} + + +int32_t +default_getspec (call_frame_t *frame, + xlator_t *this, + const char *key, + int32_t flags) +{ + STACK_WIND (frame, + default_getspec_cbk, + FIRST_CHILD(this), + FIRST_CHILD(this)->mops->getspec, + key, flags); + return 0; +} + + +static int32_t +default_checksum_cbk (call_frame_t *frame, + void *cookie, + xlator_t *this, + int32_t op_ret, + int32_t op_errno, + uint8_t *file_checksum, + 
uint8_t *dir_checksum) +{ + STACK_UNWIND (frame, + op_ret, + op_errno, + file_checksum, + dir_checksum); + return 0; +} + + +int32_t +default_checksum (call_frame_t *frame, + xlator_t *this, + loc_t *loc, + int32_t flag) +{ + STACK_WIND (frame, + default_checksum_cbk, + FIRST_CHILD(this), + FIRST_CHILD(this)->fops->checksum, + loc, + flag); + return 0; +} + + +int32_t +default_readdir_cbk (call_frame_t *frame, + void *cookie, + xlator_t *this, + int32_t op_ret, + int32_t op_errno, + gf_dirent_t *entries) +{ + STACK_UNWIND (frame, op_ret, op_errno, entries); + return 0; +} + + +int32_t +default_readdir (call_frame_t *frame, + xlator_t *this, + fd_t *fd, + size_t size, + off_t off) +{ + STACK_WIND (frame, + default_readdir_cbk, + FIRST_CHILD(this), + FIRST_CHILD(this)->fops->readdir, + fd, size, off); + return 0; +} + +/* notify */ +int32_t +default_notify (xlator_t *this, + int32_t event, + void *data, + ...) +{ + switch (event) + { + case GF_EVENT_PARENT_UP: + { + xlator_list_t *list = this->children; + + while (list) + { + list->xlator->notify (list->xlator, event, this); + list = list->next; + } + } + break; + case GF_EVENT_CHILD_DOWN: + case GF_EVENT_CHILD_UP: + default: + { + xlator_list_t *parent = this->parents; + while (parent) { + parent->xlator->notify (parent->xlator, event, this, NULL); + parent = parent->next; + } + } + } + + return 0; +} + +int32_t +default_releasedir (xlator_t *this, + fd_t *fd) +{ + return 0; +} + +int32_t +default_release (xlator_t *this, + fd_t *fd) +{ + return 0; +} + diff --git a/libglusterfs/src/defaults.h b/libglusterfs/src/defaults.h new file mode 100644 index 000000000..aa15df599 --- /dev/null +++ b/libglusterfs/src/defaults.h @@ -0,0 +1,273 @@ +/* + Copyright (c) 2006, 2007, 2008 Z RESEARCH, Inc. + This file is part of GlusterFS. 
+ + GlusterFS is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published + by the Free Software Foundation; either version 3 of the License, + or (at your option) any later version. + + GlusterFS is distributed in the hope that it will be useful, but + WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program. If not, see + . +*/ + +/* libglusterfs/src/defaults.h: + This file contains definition of default fops and mops functions. +*/ + +#ifndef _DEFAULTS_H +#define _DEFAULTS_H + +#ifndef _CONFIG_H +#define _CONFIG_H +#include "config.h" +#endif + +#include "xlator.h" + +/* Management Operations */ + +int32_t default_stats (call_frame_t *frame, + xlator_t *this, + int32_t flags); + +int32_t default_getspec (call_frame_t *frame, + xlator_t *this, + const char *key, + int32_t flag); + +int32_t default_checksum (call_frame_t *frame, + xlator_t *this, + loc_t *loc, + int32_t flag); + + +/* FileSystem operations */ +int32_t default_lookup (call_frame_t *frame, + xlator_t *this, + loc_t *loc, + dict_t *xattr_req); + +int32_t default_stat (call_frame_t *frame, + xlator_t *this, + loc_t *loc); + +int32_t default_fstat (call_frame_t *frame, + xlator_t *this, + fd_t *fd); + +int32_t default_chmod (call_frame_t *frame, + xlator_t *this, + loc_t *loc, + mode_t mode); + +int32_t default_fchmod (call_frame_t *frame, + xlator_t *this, + fd_t *fd, + mode_t mode); + +int32_t default_chown (call_frame_t *frame, + xlator_t *this, + loc_t *loc, + uid_t uid, + gid_t gid); + +int32_t default_fchown (call_frame_t *frame, + xlator_t *this, + fd_t *fd, + uid_t uid, + gid_t gid); + +int32_t default_truncate (call_frame_t *frame, + xlator_t *this, + loc_t *loc, + off_t offset); + +int32_t default_ftruncate (call_frame_t 
*frame, + xlator_t *this, + fd_t *fd, + off_t offset); + +int32_t default_utimens (call_frame_t *frame, + xlator_t *this, + loc_t *loc, + struct timespec tv[2]); + +int32_t default_access (call_frame_t *frame, + xlator_t *this, + loc_t *loc, + int32_t mask); + +int32_t default_readlink (call_frame_t *frame, + xlator_t *this, + loc_t *loc, + size_t size); + +int32_t default_mknod (call_frame_t *frame, + xlator_t *this, + loc_t *loc, + mode_t mode, + dev_t rdev); + +int32_t default_mkdir (call_frame_t *frame, + xlator_t *this, + loc_t *loc, + mode_t mode); + +int32_t default_unlink (call_frame_t *frame, + xlator_t *this, + loc_t *loc); + +int32_t default_rmdir (call_frame_t *frame, + xlator_t *this, + loc_t *loc); + +int32_t default_symlink (call_frame_t *frame, + xlator_t *this, + const char *linkpath, + loc_t *loc); + +int32_t default_rename (call_frame_t *frame, + xlator_t *this, + loc_t *oldloc, + loc_t *newloc); + +int32_t default_link (call_frame_t *frame, + xlator_t *this, + loc_t *oldloc, + loc_t *newloc); + +int32_t default_create (call_frame_t *frame, + xlator_t *this, + loc_t *loc, + int32_t flags, + mode_t mode, fd_t *fd); + +int32_t default_open (call_frame_t *frame, + xlator_t *this, + loc_t *loc, + int32_t flags, fd_t *fd); + +int32_t default_readv (call_frame_t *frame, + xlator_t *this, + fd_t *fd, + size_t size, + off_t offset); + +int32_t default_writev (call_frame_t *frame, + xlator_t *this, + fd_t *fd, + struct iovec *vector, + int32_t count, + off_t offset); + +int32_t default_flush (call_frame_t *frame, + xlator_t *this, + fd_t *fd); + +int32_t default_fsync (call_frame_t *frame, + xlator_t *this, + fd_t *fd, + int32_t datasync); + +int32_t default_opendir (call_frame_t *frame, + xlator_t *this, + loc_t *loc, fd_t *fd); + +int32_t default_getdents (call_frame_t *frame, + xlator_t *this, + fd_t *fd, + size_t size, + off_t offset, + int32_t flag); + +int32_t default_fsyncdir (call_frame_t *frame, + xlator_t *this, + fd_t *fd, + int32_t datasync); 
+ +int32_t default_statfs (call_frame_t *frame, + xlator_t *this, + loc_t *loc); + +int32_t default_setxattr (call_frame_t *frame, + xlator_t *this, + loc_t *loc, + dict_t *dict, + int32_t flags); + +int32_t default_getxattr (call_frame_t *frame, + xlator_t *this, + loc_t *loc, + const char *name); + +int32_t default_removexattr (call_frame_t *frame, + xlator_t *this, + loc_t *loc, + const char *name); + +int32_t default_lk (call_frame_t *frame, + xlator_t *this, + fd_t *fd, + int32_t cmd, + struct flock *flock); + +int32_t default_inodelk (call_frame_t *frame, xlator_t *this, + loc_t *loc, int32_t cmd, struct flock *flock); + +int32_t default_finodelk (call_frame_t *frame, xlator_t *this, + fd_t *fd, int32_t cmd, struct flock *flock); + +int32_t default_entrylk (call_frame_t *frame, xlator_t *this, + loc_t *loc, const char *basename, + entrylk_cmd cmd, entrylk_type type); + +int32_t default_fentrylk (call_frame_t *frame, xlator_t *this, + fd_t *fd, const char *basename, + entrylk_cmd cmd, entrylk_type type); + +int32_t default_readdir (call_frame_t *frame, + xlator_t *this, + fd_t *fd, + size_t size, off_t off); + +int32_t default_setdents (call_frame_t *frame, + xlator_t *this, + fd_t *fd, + int32_t flags, + dir_entry_t *entries, + int32_t count); + +int32_t default_xattrop (call_frame_t *frame, + xlator_t *this, + loc_t *loc, + gf_xattrop_flags_t flags, + dict_t *dict); + +int32_t default_fxattrop (call_frame_t *frame, + xlator_t *this, + fd_t *fd, + gf_xattrop_flags_t flags, + dict_t *dict); + +int32_t default_notify (xlator_t *this, + int32_t event, + void *data, + ...); + +int32_t default_forget (xlator_t *this, + inode_t *inode); + +int32_t default_release (xlator_t *this, + fd_t *fd); + +int32_t default_releasedir (xlator_t *this, + fd_t *fd); + +#endif /* _DEFAULTS_H */ diff --git a/libglusterfs/src/dict.c b/libglusterfs/src/dict.c new file mode 100644 index 000000000..eb181f191 --- /dev/null +++ b/libglusterfs/src/dict.c @@ -0,0 +1,2243 @@ +/* + Copyright 
(c) 2006, 2007, 2008 Z RESEARCH, Inc. + This file is part of GlusterFS. + + GlusterFS is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published + by the Free Software Foundation; either version 3 of the License, + or (at your option) any later version. + + GlusterFS is distributed in the hope that it will be useful, but + WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program. If not, see + . +*/ + +#include +#include +#include +#include +#include + +#ifndef _CONFIG_H +#define _CONFIG_H +#include "config.h" +#endif + +#include "common-utils.h" +#include "dict.h" +#include "hashfn.h" +#include "logging.h" +#include "compat.h" +#include "byte-order.h" + +data_pair_t * +get_new_data_pair () +{ + data_pair_t *data_pair_ptr = NULL; + + data_pair_ptr = (data_pair_t *) CALLOC (1, sizeof (data_pair_t)); + ERR_ABORT (data_pair_ptr); + + return data_pair_ptr; +} + +data_t * +get_new_data () +{ + data_t *data = NULL; + + data = (data_t *) CALLOC (1, sizeof (data_t)); + if (!data) { + gf_log ("dict", GF_LOG_CRITICAL, + "calloc () returned NULL"); + return NULL; + } + + LOCK_INIT (&data->lock); + return data; +} + +dict_t * +get_new_dict_full (int size_hint) +{ + dict_t *dict = CALLOC (1, sizeof (dict_t)); + + if (!dict) { + gf_log ("dict", GF_LOG_CRITICAL, + "calloc () returned NULL"); + return NULL; + } + + dict->hash_size = size_hint; + dict->members = CALLOC (size_hint, sizeof (data_pair_t *)); + + if (!dict->members) { + gf_log ("dict", GF_LOG_CRITICAL, + "calloc () returned NULL"); + return NULL; + } + + LOCK_INIT (&dict->lock); + + return dict; +} + +dict_t * +get_new_dict (void) +{ + return get_new_dict_full (1); +} + +dict_t * +dict_new (void) +{ + dict_t *dict = NULL; + + dict = 
get_new_dict_full(1); + + if (dict) + dict_ref (dict); + + return dict; +} + + +int32_t +is_data_equal (data_t *one, + data_t *two) +{ + if (!one || !two || !one->data || !two->data) + return 1; + + if (one == two) + return 1; + + if (one->len != two->len) + return 0; + + if (one->data == two->data) + return 1; + + if (memcmp (one->data, two->data, one->len) == 0) + return 1; + + return 0; +} + +void +data_destroy (data_t *data) +{ + if (data) { + LOCK_DESTROY (&data->lock); + + if (!data->is_static) { + if (data->data) + FREE (data->data); + if (data->vec) + FREE (data->vec); + } + + data->len = 0xbabababa; + if (!data->is_const) + FREE (data); + } +} + +data_t * +data_copy (data_t *old) +{ + if (!old) { + gf_log ("dict", GF_LOG_CRITICAL, + "@old is NULL"); + return NULL; + } + + data_t *newdata = (data_t *) CALLOC (1, sizeof (*newdata)); + + if (!newdata) { + gf_log ("dict", GF_LOG_CRITICAL, + "@old is NULL"); + return NULL; + } + + if (old) { + newdata->len = old->len; + if (old->data) + newdata->data = memdup (old->data, old->len); + if (old->vec) + newdata->vec = memdup (old->vec, old->len * (sizeof (void *) + + sizeof (size_t))); + if (!old->data && !old->vec) { + gf_log ("dict", GF_LOG_CRITICAL, + "@newdata->data || @newdata->vec got NULL from CALLOC()"); + return NULL; + } + } + + return newdata; +} + +static data_pair_t * +_dict_lookup (dict_t *this, char *key) +{ + if (!this || !key) { + gf_log ("dict", GF_LOG_CRITICAL, + "@this=%p @key=%p", this, key); + return NULL; + } + + int hashval = SuperFastHash (key, strlen (key)) % this->hash_size; + data_pair_t *pair; + + for (pair = this->members[hashval]; pair != NULL; pair = pair->hash_next) { + if (pair->key && !strcmp (pair->key, key)) + return pair; + } + + return NULL; +} + + +static int32_t +_dict_set (dict_t *this, + char *key, + data_t *value) +{ + int hashval; + data_pair_t *pair; + char key_free = 0; + int tmp = 0; + + if (!key) { + asprintf (&key, "ref:%p", value); + key_free = 1; + } + + tmp = 
SuperFastHash (key, strlen (key)); + hashval = (tmp % this->hash_size); + pair = _dict_lookup (this, key); + + if (pair) { + data_t *unref_data = pair->value; + pair->value = data_ref (value); + data_unref (unref_data); + if (key_free) + FREE (key); + /* Indicates duplicate key */ + return 0; + } + pair = (data_pair_t *) CALLOC (1, sizeof (*pair)); + if (!pair) { + gf_log ("dict", GF_LOG_CRITICAL, + "@pair - NULL returned by CALLOC"); + return -1; + } + + pair->key = (char *) CALLOC (1, strlen (key) + 1); + if (!pair->key) { + gf_log ("dict", GF_LOG_CRITICAL, + "@pair->key - NULL returned by CALLOC"); + return -1; + } + + strcpy (pair->key, key); + pair->value = data_ref (value); + + pair->hash_next = this->members[hashval]; + this->members[hashval] = pair; + + pair->next = this->members_list; + pair->prev = NULL; + if (this->members_list) + this->members_list->prev = pair; + this->members_list = pair; + this->count++; + + if (key_free) + FREE (key); + return 0; +} + +int32_t +dict_set (dict_t *this, + char *key, + data_t *value) +{ + int32_t ret; + + if (!this || !value) { + gf_log ("dict", GF_LOG_CRITICAL, + "@this=%p @value=%p", this, value); + return -1; + } + + LOCK (&this->lock); + + ret = _dict_set (this, key, value); + + UNLOCK (&this->lock); + + return ret; +} + + +data_t * +dict_get (dict_t *this, + char *key) +{ + data_pair_t *pair; + + if (!this || !key) { + gf_log ("dict", GF_LOG_DEBUG, + "@this=%p @key=%p", this, key); + return NULL; + } + + LOCK (&this->lock); + + pair = _dict_lookup (this, key); + + UNLOCK (&this->lock); + + if (pair) + return pair->value; + + return NULL; +} + +void +dict_del (dict_t *this, + char *key) +{ + if (!this || !key) { + gf_log ("dict", GF_LOG_DEBUG, + "@this=%p @key=%p", this, key); + return; + } + + LOCK (&this->lock); + + int hashval = SuperFastHash (key, strlen (key)) % this->hash_size; + data_pair_t *pair = this->members[hashval]; + data_pair_t *prev = NULL; + + while (pair) { + if (strcmp (pair->key, key) == 0) { + 
if (prev) + prev->hash_next = pair->hash_next; + else + this->members[hashval] = pair->hash_next; + + data_unref (pair->value); + + if (pair->prev) + pair->prev->next = pair->next; + else + this->members_list = pair->next; + + if (pair->next) + pair->next->prev = pair->prev; + + FREE (pair->key); + FREE (pair); + this->count--; + break; + } + + prev = pair; + pair = pair->hash_next; + } + + UNLOCK (&this->lock); + + return; +} + +void +dict_destroy (dict_t *this) +{ + if (!this) { + gf_log ("dict", GF_LOG_DEBUG, + "@this=%p", this); + return; + } + + data_pair_t *pair = this->members_list; + data_pair_t *prev = this->members_list; + + LOCK_DESTROY (&this->lock); + + while (prev) { + pair = pair->next; + data_unref (prev->value); + FREE (prev->key); + FREE (prev); + prev = pair; + } + + FREE (this->members); + + if (this->extra_free) + FREE (this->extra_free); + + if (!this->is_static) + FREE (this); + + return; +} + +void +dict_unref (dict_t *this) +{ + int32_t ref; + + if (!this) { + gf_log ("dict", GF_LOG_DEBUG, + "@this=%p", this); + return; + } + + LOCK (&this->lock); + + this->refcount--; + ref = this->refcount; + + UNLOCK (&this->lock); + + if (!ref) + dict_destroy (this); +} + +dict_t * +dict_ref (dict_t *this) +{ + if (!this) { + gf_log ("dict", GF_LOG_DEBUG, + "@this=%p", this); + return NULL; + } + + LOCK (&this->lock); + + this->refcount++; + + UNLOCK (&this->lock); + + return this; +} + +void +data_unref (data_t *this) +{ + int32_t ref; + + if (!this) { + gf_log ("dict", GF_LOG_DEBUG, + "@this=%p", this); + return; + } + + LOCK (&this->lock); + + this->refcount--; + ref = this->refcount; + + UNLOCK (&this->lock); + + if (!ref) + data_destroy (this); +} + +data_t * +data_ref (data_t *this) +{ + if (!this) { + gf_log ("dict", GF_LOG_DEBUG, + "@this=%p", this); + return NULL; + } + + LOCK (&this->lock); + + this->refcount++; + + UNLOCK (&this->lock); + + return this; +} + +/* + Serialization format: + ---- + Count:8 + Key_len:8:Value_len:8 + Key + Value + 
. + . + . +*/ + +int32_t +dict_serialized_length_old (dict_t *this) +{ + + if (!this) { + gf_log ("dict", GF_LOG_DEBUG, + "@this=%p", this); + return -1; + } + + int32_t len = 9; /* count + \n */ + int32_t count = this->count; + data_pair_t *pair = this->members_list; + + while (count) { + len += 18; + len += strlen (pair->key) + 1; + if (pair->value->vec) { + int i; + for (i=0; ivalue->len; i++) { + len += pair->value->vec[i].iov_len; + } + } else { + len += pair->value->len; + } + pair = pair->next; + count--; + } + + return len; +} + +int32_t +dict_serialize_old (dict_t *this, char *buf) +{ + if (!this || !buf) { + gf_log ("dict", GF_LOG_DEBUG, + "@this=%p @buf=%p", this, buf); + return -1; + } + + data_pair_t *pair = this->members_list; + int32_t count = this->count; + uint64_t dcount = this->count; + + // FIXME: magic numbers + + sprintf (buf, "%08"PRIx64"\n", dcount); + buf += 9; + while (count) { + uint64_t keylen = strlen (pair->key) + 1; + uint64_t vallen = pair->value->len; + + sprintf (buf, "%08"PRIx64":%08"PRIx64"\n", keylen, vallen); + buf += 18; + memcpy (buf, pair->key, keylen); + buf += keylen; + memcpy (buf, pair->value->data, pair->value->len); + buf += pair->value->len; + pair = pair->next; + count--; + } + return (0); +} + + +dict_t * +dict_unserialize_old (char *buf, int32_t size, dict_t **fill) +{ + int32_t ret = 0; + int32_t cnt = 0; + + if (!buf || fill == NULL || !*fill) { + gf_log ("dict", GF_LOG_ERROR, + "@buf=%p @fill=%p @*fill=%p", buf, fill, *fill); + return NULL; + } + + uint64_t count; + ret = sscanf (buf, "%"SCNx64"\n", &count); + (*fill)->count = 0; + + if (!ret){ + gf_log ("dict", + GF_LOG_ERROR, + "sscanf on buf failed"); + goto err; + } + buf += 9; + + if (count == 0) { + gf_log ("dict", + GF_LOG_ERROR, + "count == 0"); + goto err; + } + + for (cnt = 0; cnt < count; cnt++) { + data_t *value = NULL; + char *key = NULL; + uint64_t key_len, value_len; + + ret = sscanf (buf, "%"SCNx64":%"SCNx64"\n", &key_len, &value_len); + if (ret 
!= 2) { + gf_log ("dict", + GF_LOG_ERROR, + "sscanf for key_len and value_len failed"); + goto err; + } + buf += 18; + + key = buf; + buf += key_len; + + value = get_new_data (); + value->len = value_len; + value->data = buf; + value->is_static = 1; + buf += value_len; + + dict_set (*fill, key, value); + } + + goto ret; + +err: + FREE (*fill); + *fill = NULL; + +ret: + return *fill; +} + + +int32_t +dict_iovec_len (dict_t *this) +{ + if (!this) { + gf_log ("dict", GF_LOG_CRITICAL, + "@this=%p", this); + return -1; + } + + int32_t len = 0; + data_pair_t *pair = this->members_list; + + len++; /* initial header */ + while (pair) { + len++; /* pair header */ + len++; /* key */ + + if (pair->value->vec) + len += pair->value->len; + else + len++; + pair = pair->next; + } + + return len; +} + +int32_t +dict_to_iovec (dict_t *this, + struct iovec *vec, + int32_t count) +{ + if (!this || !vec) { + gf_log ("dict", GF_LOG_CRITICAL, + "@this=%p @vec=%p", this, vec); + return -1; + } + + int32_t i = 0; + data_pair_t *pair = this->members_list; + + vec[0].iov_len = 9; + if (vec[0].iov_base) + sprintf (vec[0].iov_base, + "%08"PRIx64"\n", + (int64_t)this->count); + i++; + + while (pair) { + int64_t keylen = strlen (pair->key) + 1; + int64_t vallen = 0; + + if (pair->value->vec) { + int i; + + for (i=0; ivalue->len; i++) { + vallen += pair->value->vec[i].iov_len; + } + } else { + vallen = pair->value->len; + } + + vec[i].iov_len = 18; + if (vec[i].iov_base) + sprintf (vec[i].iov_base, + "%08"PRIx64":%08"PRIx64"\n", + keylen, + vallen); + i++; + + vec[i].iov_len = keylen; + vec[i].iov_base = pair->key; + i++; + + if (pair->value->vec) { + int k; + + for (k=0; kvalue->len; k++) { + vec[i].iov_len = pair->value->vec[k].iov_len; + vec[i].iov_base = pair->value->vec[k].iov_base; + i++; + } + } else { + vec[i].iov_len = pair->value->len; + vec[i].iov_base = pair->value->data; + i++; + } + + pair = pair->next; + } + + return 0; +} + +data_t * +int_to_data (int64_t value) +{ + data_t *data 
= get_new_data (); + + if (!data) { + gf_log ("dict", GF_LOG_CRITICAL, + "@data - NULL returned by CALLOC"); + return NULL; + } + + asprintf (&data->data, "%"PRId64, value); + data->len = strlen (data->data) + 1; + + return data; +} + +data_t * +data_from_int64 (int64_t value) +{ + data_t *data = get_new_data (); + + if (!data) { + gf_log ("dict", GF_LOG_CRITICAL, + "@data - NULL returned by CALLOC"); + return NULL; + } + asprintf (&data->data, "%"PRId64, value); + data->len = strlen (data->data) + 1; + + return data; +} + +data_t * +data_from_int32 (int32_t value) +{ + data_t *data = get_new_data (); + + if (!data) { + gf_log ("dict", GF_LOG_CRITICAL, + "@data - NULL returned by CALLOC"); + return NULL; + } + asprintf (&data->data, "%"PRId32, value); + data->len = strlen (data->data) + 1; + + return data; +} + +data_t * +data_from_int16 (int16_t value) +{ + + data_t *data = get_new_data (); + + if (!data) { + gf_log ("dict", GF_LOG_CRITICAL, + "@data - NULL returned by CALLOC"); + return NULL; + } + asprintf (&data->data, "%"PRId16, value); + data->len = strlen (data->data) + 1; + + return data; +} + +data_t * +data_from_int8 (int8_t value) +{ + + data_t *data = get_new_data (); + + if (!data) { + gf_log ("dict", GF_LOG_CRITICAL, + "@data - NULL returned by CALLOC"); + return NULL; + } + asprintf (&data->data, "%d", value); + data->len = strlen (data->data) + 1; + + return data; +} + +data_t * +data_from_uint64 (uint64_t value) +{ + data_t *data = get_new_data (); + + if (!data) { + gf_log ("dict", GF_LOG_CRITICAL, + "@data - NULL returned by CALLOC"); + return NULL; + } + asprintf (&data->data, "%"PRIu64, value); + data->len = strlen (data->data) + 1; + + return data; +} + + +data_t * +data_from_uint32 (uint32_t value) +{ + data_t *data = get_new_data (); + + if (!data) { + gf_log ("dict", GF_LOG_CRITICAL, + "@data - NULL returned by CALLOC"); + return NULL; + } + asprintf (&data->data, "%"PRIu32, value); + data->len = strlen (data->data) + 1; + + return data; +} 
+ + +data_t * +data_from_uint16 (uint16_t value) +{ + data_t *data = get_new_data (); + + if (!data) { + gf_log ("dict", GF_LOG_CRITICAL, + "@data - NULL returned by CALLOC"); + return NULL; + } + asprintf (&data->data, "%"PRIu16, value); + data->len = strlen (data->data) + 1; + + return data; +} + + +data_t * +data_from_ptr (void *value) +{ + if (!value) { + gf_log ("dict", GF_LOG_CRITICAL, + "@value=%p", value); + return NULL; + } + + data_t *data = get_new_data (); + + if (!data) { + gf_log ("dict", GF_LOG_CRITICAL, + "@data - NULL returned by CALLOC"); + return NULL; + } + + data->data = value; + return data; +} + +data_t * +data_from_static_ptr (void *value) +{ +/* + this is valid to set 0 as value.. + + if (!value) { + gf_log ("dict", GF_LOG_CRITICAL, + "@value=%p", value); + return NULL; + } +*/ + data_t *data = get_new_data (); + + if (!data) { + gf_log ("dict", GF_LOG_CRITICAL, + "@data - NULL returned by CALLOC"); + return NULL; + } + + data->is_static = 1; + data->data = value; + + return data; +} + +data_t * +str_to_data (char *value) +{ + if (!value) { + gf_log ("dict", GF_LOG_CRITICAL, + "@value=%p", value); + return NULL; + } + data_t *data = get_new_data (); + + if (!data) { + gf_log ("dict", GF_LOG_CRITICAL, + "@data - NULL returned by CALLOC"); + return NULL; + } + data->len = strlen (value) + 1; + + data->data = value; + data->is_static = 1; + + return data; +} + +data_t * +data_from_dynstr (char *value) +{ + if (!value) { + gf_log ("dict", GF_LOG_CRITICAL, + "@value=%p", value); + return NULL; + } + + data_t *data = get_new_data (); + + data->len = strlen (value) + 1; + data->data = value; + + return data; +} + +data_t * +data_from_dynptr (void *value, int32_t len) +{ + data_t *data = get_new_data (); + + data->len = len; + data->data = value; + + return data; +} + +data_t * +bin_to_data (void *value, int32_t len) +{ + if (!value) { + gf_log ("dict", GF_LOG_CRITICAL, + "@value=%p", value); + return NULL; + } + + data_t *data = get_new_data (); + 
+ data->is_static = 1; + data->len = len; + data->data = value; + + return data; +} + +int64_t +data_to_int64 (data_t *data) +{ + if (!data) + return -1; + + char *str = alloca (data->len + 1); + ERR_ABORT (str); + memcpy (str, data->data, data->len); + str[data->len] = '\0'; + return (int64_t) strtoull (str, NULL, 0); +} + +int32_t +data_to_int32 (data_t *data) +{ + if (!data) + return -1; + + char *str = alloca (data->len + 1); + ERR_ABORT (str); + memcpy (str, data->data, data->len); + str[data->len] = '\0'; + + return strtoul (str, NULL, 0); +} + +int16_t +data_to_int16 (data_t *data) +{ + if (!data) + return -1; + + char *str = alloca (data->len + 1); + ERR_ABORT (str); + memcpy (str, data->data, data->len); + str[data->len] = '\0'; + + return strtol (str, NULL, 0); +} + + +int8_t +data_to_int8 (data_t *data) +{ + if (!data) + return -1; + + char *str = alloca (data->len + 1); + ERR_ABORT (str); + memcpy (str, data->data, data->len); + str[data->len] = '\0'; + + return (int8_t)strtol (str, NULL, 0); +} + + +uint64_t +data_to_uint64 (data_t *data) +{ + if (!data) + return -1; + char *str = alloca (data->len + 1); + ERR_ABORT (str); + memcpy (str, data->data, data->len); + str[data->len] = '\0'; + + return strtoll (str, NULL, 0); +} + +uint32_t +data_to_uint32 (data_t *data) +{ + if (!data) + return -1; + + char *str = alloca (data->len + 1); + ERR_ABORT (str); + memcpy (str, data->data, data->len); + str[data->len] = '\0'; + + return strtol (str, NULL, 0); +} + +uint16_t +data_to_uint16 (data_t *data) +{ + if (!data) + return -1; + + char *str = alloca (data->len + 1); + ERR_ABORT (str); + memcpy (str, data->data, data->len); + str[data->len] = '\0'; + + return strtol (str, NULL, 0); +} + +char * +data_to_str (data_t *data) +{ + if (!data) { + gf_log ("dict", GF_LOG_CRITICAL, + "@data=%p", data); + return NULL; + } + return data->data; +} + +void * +data_to_ptr (data_t *data) +{ + if (!data) { + return NULL; + } + return data->data; +} + +void * +data_to_bin 
(data_t *data) +{ + if (!data) { + gf_log ("dict", GF_LOG_CRITICAL, + "@data=%p", data); + return NULL; + } + return data->data; +} + +void +dict_foreach (dict_t *dict, + void (*fn)(dict_t *this, + char *key, + data_t *value, + void *data), + void *data) +{ + if (!data) { + gf_log ("dict", GF_LOG_CRITICAL, + "@data=%p", data); + return; + } + + data_pair_t *pairs = dict->members_list; + + while (pairs) { + fn (dict, pairs->key, pairs->value, data); + pairs = pairs->next; + } +} + + +static void +_copy (dict_t *unused, + char *key, + data_t *value, + void *newdict) +{ + dict_set ((dict_t *)newdict, key, (value)); +} + + +dict_t * +dict_copy (dict_t *dict, + dict_t *new) +{ + if (!dict) { + gf_log ("dict", GF_LOG_CRITICAL, + "@data=%p", dict); + return NULL; + } + + if (!new) + new = get_new_dict_full (dict->hash_size); + + dict_foreach (dict, _copy, new); + + return new; +} + +dict_t * +dict_copy_with_ref (dict_t *dict, + dict_t *new) +{ + dict_t *local_new = NULL; + + GF_VALIDATE_OR_GOTO("dict", dict, fail); + + if (new == NULL) { + local_new = dict_new (); + GF_VALIDATE_OR_GOTO("dict", local_new, fail); + new = local_new; + } + + dict_foreach (dict, _copy, new); +fail: + return new; +} + +/* + * !!!!!!! CLEANED UP CODE !!!!!!! 
+ */ + +/** + * Common cleaned up interface: + * + * Return value: 0 success + * -val error, val = errno + */ + + +static int +dict_get_with_ref (dict_t *this, char *key, data_t **data) +{ + data_pair_t * pair = NULL; + int ret = -ENOENT; + + if (!this || !key || !data) { + ret = -EINVAL; + goto err; + } + + LOCK (&this->lock); + { + pair = _dict_lookup (this, key); + } + UNLOCK (&this->lock); + + if (pair) { + ret = 0; + *data = data_ref (pair->value); + } + +err: + return ret; +} + +static int +_data_to_ptr (data_t *data, void **val) +{ + int ret = 0; + + if (!data) { + ret = -EINVAL; + goto err; + } + + *val = data->data; +err: + return ret; +} + + +static int +_data_to_int8 (data_t *data, int8_t *val) +{ + int ret = 0; + char * str = NULL; + + if (!data || !val) { + ret = -EINVAL; + goto err; + } + + str = alloca (data->len + 1); + if (!str) { + ret = -ENOMEM; + goto err; + } + memcpy (str, data->data, data->len); + str[data->len] = '\0'; + + errno = 0; + *val = strtol (str, NULL, 0); + if (errno != 0) + ret = -errno; + +err: + return ret; +} + +static int +_data_to_int16 (data_t *data, int16_t *val) +{ + int ret = 0; + char * str = NULL; + + if (!data || !val) { + ret = -EINVAL; + goto err; + } + + str = alloca (data->len + 1); + if (!str) { + ret = -ENOMEM; + goto err; + } + memcpy (str, data->data, data->len); + str[data->len] = '\0'; + + errno = 0; + *val = strtol (str, NULL, 0); + if (errno != 0) + ret = -errno; + +err: + return ret; +} + +static int +_data_to_int32 (data_t *data, int32_t *val) +{ + int ret = 0; + char * str = NULL; + + if (!data || !val) { + ret = -EINVAL; + goto err; + } + + str = alloca (data->len + 1); + if (!str) { + ret = -ENOMEM; + goto err; + } + memcpy (str, data->data, data->len); + str[data->len] = '\0'; + + errno = 0; + *val = strtol (str, NULL, 0); + if (errno != 0) + ret = -errno; + +err: + return ret; +} + +static int +_data_to_int64 (data_t *data, int64_t *val) +{ + int ret = 0; + char * str = NULL; + + if (!data || !val) { 
+ ret = -EINVAL; + goto err; + } + + str = alloca (data->len + 1); + if (!str) { + ret = -ENOMEM; + goto err; + } + memcpy (str, data->data, data->len); + str[data->len] = '\0'; + + errno = 0; + *val = strtoll (str, NULL, 0); + if (errno != 0) + ret = -errno; + +err: + return ret; +} + +static int +_data_to_uint16 (data_t *data, uint16_t *val) +{ + int ret = 0; + char * str = NULL; + + if (!data || !val) { + ret = -EINVAL; + goto err; + } + + str = alloca (data->len + 1); + if (!str) { + ret = -ENOMEM; + goto err; + } + memcpy (str, data->data, data->len); + str[data->len] = '\0'; + + errno = 0; + *val = strtoul (str, NULL, 0); + if (errno != 0) + ret = -errno; + +err: + return ret; +} + +static int +_data_to_uint32 (data_t *data, uint32_t *val) +{ + int ret = 0; + char * str = NULL; + + if (!data || !val) { + ret = -EINVAL; + goto err; + } + + str = alloca (data->len + 1); + if (!str) { + ret = -ENOMEM; + goto err; + } + memcpy (str, data->data, data->len); + str[data->len] = '\0'; + + errno = 0; + *val = strtoul (str, NULL, 0); + if (errno != 0) + ret = -errno; + +err: + return ret; +} + +static int +_data_to_uint64 (data_t *data, uint64_t *val) +{ + int ret = 0; + char * str = NULL; + + if (!data || !val) { + ret = -EINVAL; + goto err; + } + + str = alloca (data->len + 1); + if (!str) { + ret = -ENOMEM; + goto err; + } + memcpy (str, data->data, data->len); + str[data->len] = '\0'; + + errno = 0; + *val = strtoull (str, NULL, 0); + if (errno != 0) + ret = -errno; + +err: + return ret; +} + +int +dict_get_int8 (dict_t *this, char *key, int8_t *val) +{ + data_t * data = NULL; + int ret = 0; + + if (!this || !key || !val) { + ret = -EINVAL; + goto err; + } + + ret = dict_get_with_ref (this, key, &data); + if (ret != 0) { + goto err; + } + + ret = _data_to_int8 (data, val); + +err: + if (data) + data_unref (data); + return ret; +} + + +int +dict_set_int8 (dict_t *this, char *key, int8_t val) +{ + data_t * data = NULL; + int ret = 0; + + data = data_from_int8 (val); 
+ if (!data) { + ret = -EINVAL; + goto err; + } + + ret = dict_set (this, key, data); + +err: + return ret; +} + +int +dict_get_int16 (dict_t *this, char *key, int16_t *val) +{ + data_t * data = NULL; + int ret = 0; + + if (!this || !key || !val) { + ret = -EINVAL; + goto err; + } + + ret = dict_get_with_ref (this, key, &data); + if (ret != 0) { + goto err; + } + + ret = _data_to_int16 (data, val); + +err: + if (data) + data_unref (data); + return ret; +} + + +int +dict_set_int16 (dict_t *this, char *key, int16_t val) +{ + data_t * data = NULL; + int ret = 0; + + data = data_from_int16 (val); + if (!data) { + ret = -EINVAL; + goto err; + } + + ret = dict_set (this, key, data); + +err: + return ret; +} + +int +dict_get_int32 (dict_t *this, char *key, int32_t *val) +{ + data_t * data = NULL; + int ret = 0; + + if (!this || !key || !val) { + ret = -EINVAL; + goto err; + } + + ret = dict_get_with_ref (this, key, &data); + if (ret != 0) { + goto err; + } + + ret = _data_to_int32 (data, val); + +err: + if (data) + data_unref (data); + return ret; +} + + +int +dict_set_int32 (dict_t *this, char *key, int32_t val) +{ + data_t * data = NULL; + int ret = 0; + + data = data_from_int32 (val); + if (!data) { + ret = -EINVAL; + goto err; + } + + ret = dict_set (this, key, data); + +err: + return ret; +} + +int +dict_get_int64 (dict_t *this, char *key, int64_t *val) +{ + data_t * data = NULL; + int ret = 0; + + if (!this || !key || !val) { + ret = -EINVAL; + goto err; + } + + ret = dict_get_with_ref (this, key, &data); + if (ret != 0) { + goto err; + } + + ret = _data_to_int64 (data, val); + +err: + if (data) + data_unref (data); + return ret; +} + + +int +dict_set_int64 (dict_t *this, char *key, int64_t val) +{ + data_t * data = NULL; + int ret = 0; + + data = data_from_int64 (val); + if (!data) { + ret = -EINVAL; + goto err; + } + + ret = dict_set (this, key, data); + +err: + return ret; +} + +int +dict_get_uint16 (dict_t *this, char *key, uint16_t *val) +{ + data_t * data = 
NULL; + int ret = 0; + + if (!this || !key || !val) { + ret = -EINVAL; + goto err; + } + + ret = dict_get_with_ref (this, key, &data); + if (ret != 0) { + goto err; + } + + ret = _data_to_uint16 (data, val); + +err: + if (data) + data_unref (data); + return ret; +} + + +int +dict_set_uint16 (dict_t *this, char *key, uint16_t val) +{ + data_t * data = NULL; + int ret = 0; + + data = data_from_uint16 (val); + if (!data) { + ret = -EINVAL; + goto err; + } + + ret = dict_set (this, key, data); + +err: + return ret; +} + +int +dict_get_uint32 (dict_t *this, char *key, uint32_t *val) +{ + data_t * data = NULL; + int ret = 0; + + if (!this || !key || !val) { + ret = -EINVAL; + goto err; + } + + ret = dict_get_with_ref (this, key, &data); + if (ret != 0) { + goto err; + } + + ret = _data_to_uint32 (data, val); + +err: + if (data) + data_unref (data); + return ret; +} + + +int +dict_set_uint32 (dict_t *this, char *key, uint32_t val) +{ + data_t * data = NULL; + int ret = 0; + + data = data_from_uint32 (val); + if (!data) { + ret = -EINVAL; + goto err; + } + + ret = dict_set (this, key, data); + +err: + return ret; +} + +int +dict_get_uint64 (dict_t *this, char *key, uint64_t *val) +{ + data_t * data = NULL; + int ret = 0; + + if (!this || !key || !val) { + ret = -EINVAL; + goto err; + } + + ret = dict_get_with_ref (this, key, &data); + if (ret != 0) { + goto err; + } + + ret = _data_to_uint64 (data, val); + +err: + if (data) + data_unref (data); + return ret; +} + + +int +dict_set_uint64 (dict_t *this, char *key, uint64_t val) +{ + data_t * data = NULL; + int ret = 0; + + data = data_from_uint64 (val); + if (!data) { + ret = -EINVAL; + goto err; + } + + ret = dict_set (this, key, data); + +err: + return ret; +} + +int +dict_set_static_ptr (dict_t *this, char *key, void *ptr) +{ + data_t * data = NULL; + int ret = 0; + + data = data_from_static_ptr (ptr); + if (!data) { + ret = -EINVAL; + goto err; + } + + ret = dict_set (this, key, data); + +err: + return ret; +} + +int 
+dict_set_dynptr (dict_t *this, char *key, void *ptr, size_t len) +{ + data_t * data = NULL; + int ret = 0; + + data = data_from_dynptr (ptr, len); + if (!data) { + ret = -EINVAL; + goto err; + } + + ret = dict_set (this, key, data); + +err: + return ret; +} + +int +dict_get_ptr (dict_t *this, char *key, void **ptr) +{ + data_t * data = NULL; + int ret = 0; + + if (!this || !key || !ptr) { + ret = -EINVAL; + goto err; + } + + ret = dict_get_with_ref (this, key, &data); + if (ret != 0) { + goto err; + } + + ret = _data_to_ptr (data, ptr); + if (ret != 0) { + goto err; + } + +err: + if (data) + data_unref (data); + + return ret; +} + +int +dict_set_ptr (dict_t *this, char *key, void *ptr) +{ + data_t * data = NULL; + int ret = 0; + + data = data_from_ptr (ptr); + if (!data) { + ret = -EINVAL; + goto err; + } + + ret = dict_set (this, key, data); + +err: + return ret; +} + + +int +dict_get_str (dict_t *this, char *key, char **str) +{ + data_t * data = NULL; + int ret = -EINVAL; + + if (!this || !key || !str) { + goto err; + } + + ret = dict_get_with_ref (this, key, &data); + if (ret < 0) { + goto err; + } + + if (!data || !data->data) { + goto err; + } + *str = data->data; + +err: + if (data) + data_unref (data); + + return ret; +} + +int +dict_set_str (dict_t *this, char *key, char *str) +{ + data_t * data = NULL; + int ret = 0; + + data = str_to_data (str); + if (!data) { + ret = -EINVAL; + goto err; + } + + ret = dict_set (this, key, data); + +err: + return ret; +} + +int +dict_set_dynstr (dict_t *this, char *key, char *str) +{ + data_t * data = NULL; + int ret = 0; + + data = data_from_dynstr (str); + if (!data) { + ret = -EINVAL; + goto err; + } + + ret = dict_set (this, key, data); + +err: + return ret; +} + + +int +dict_get_bin (dict_t *this, char *key, void **bin) +{ + data_t * data = NULL; + int ret = -EINVAL; + + if (!this || !key || !bin) { + goto err; + } + + ret = dict_get_with_ref (this, key, &data); + if (ret < 0) { + goto err; + } + + if (!data || 
!data->data) { + /* ret is non-negative here; report the missing payload + * instead of returning success with *bin unset */ + ret = -EINVAL; + goto err; + } + *bin = data->data; + +err: + if (data) + data_unref (data); + + return ret; +} + + +int +dict_set_bin (dict_t *this, char *key, void *ptr, size_t size) +{ + data_t * data = NULL; + int ret = 0; + + if (!ptr || (size < 0)) { + ret = -EINVAL; + goto err; + } + + data = bin_to_data (ptr, size); + if (!data) { + ret = -EINVAL; + goto err; + } + + data->data = ptr; + data->len = size; + data->is_static = 0; + + ret = dict_set (this, key, data); + +err: + return ret; +} + + +int +dict_set_static_bin (dict_t *this, char *key, void *ptr, size_t size) +{ + data_t * data = NULL; + int ret = 0; + + if (!ptr || (size < 0)) { + ret = -EINVAL; + goto err; + } + + data = bin_to_data (ptr, size); + if (!data) { + ret = -EINVAL; + goto err; + } + + data->data = ptr; + data->len = size; + data->is_static = 1; + + ret = dict_set (this, key, data); + +err: + return ret; +} + +/** + * Serialization format: + * -------- -------- -------- ----------- ------------- + * | count | key len | val len | key \0| value + * ---------------------------------------- ------------- + * 4 4 4 + */ + +#define DICT_HDR_LEN 4 +#define DICT_DATA_HDR_KEY_LEN 4 +#define DICT_DATA_HDR_VAL_LEN 4 + +/** + * dict_serialized_length - return the length of serialized dict + * + * @this: dict to be serialized + * @return: success: len + * : failure: -errno + */ + +int +dict_serialized_length (dict_t *this) +{ + int ret = -EINVAL; + int count = 0; + int len = 0; + int i = 0; + data_pair_t * pair = NULL; + + if (!this) { + gf_log ("dict", GF_LOG_ERROR, "this is null!"); + goto out; + } + + len = DICT_HDR_LEN; + count = this->count; + + if (count < 0) { + gf_log ("dict", GF_LOG_ERROR, "count (%d) < 0!", count); + goto out; + } + + pair = this->members_list; + + while (count) { + if (!pair) { + gf_log ("dict", GF_LOG_ERROR, + "less than count data pairs found!"); + goto out; + } + + len += DICT_DATA_HDR_KEY_LEN + DICT_DATA_HDR_VAL_LEN; + + if (!pair->key) { + gf_log ("dict", 
GF_LOG_ERROR, "pair->key is null!"); + goto out; + } + + len += strlen (pair->key) + 1 /* for '\0' */; + + if (!pair->value) { + gf_log ("dict", GF_LOG_ERROR, + "pair->value is null!"); + goto out; + } + + if (pair->value->vec) { + for (i = 0; i < pair->value->len; i++) { + if (pair->value->vec[i].iov_len < 0) { + gf_log ("dict", GF_LOG_ERROR, + "iov_len (%"GF_PRI_SIZET") < 0!", + pair->value->vec[i].iov_len); + goto out; + } + + len += pair->value->vec[i].iov_len; + } + } else { + if (pair->value->len < 0) { + gf_log ("dict", GF_LOG_ERROR, + "value->len (%d) < 0", + pair->value->len); + goto out; + } + + len += pair->value->len; + } + + pair = pair->next; + count--; + } + + ret = len; +out: + return ret; +} + +/** + * dict_serialize - serialize a dictionary into a buffer + * + * @this: dict to serialize + * @buf: buffer to serialize into. This must be + * atleast dict_serialized_length (this) large + * + * @return: success: 0 + * failure: -errno + */ + +int +dict_serialize (dict_t *this, char *buf) +{ + int ret = -1; + data_pair_t * pair = NULL; + int32_t count = 0; + int32_t keylen = 0; + int32_t vallen = 0; + + if (!this) { + gf_log ("dict", GF_LOG_ERROR, + "this is null!"); + goto out; + } + if (!buf) { + gf_log ("dict", GF_LOG_ERROR, + "buf is null!"); + goto out; + } + + count = this->count; + if (count < 0) { + gf_log ("dict", GF_LOG_ERROR, "count (%d) < 0!", count); + goto out; + } + + *(int32_t *) buf = hton32 (count); + buf += DICT_HDR_LEN; + pair = this->members_list; + + while (count) { + if (!pair) { + gf_log ("dict", GF_LOG_ERROR, + "less than count data pairs found!"); + goto out; + } + + if (!pair->key) { + gf_log ("dict", GF_LOG_ERROR, + "pair->key is null!"); + goto out; + } + + keylen = strlen (pair->key); + *(int32_t *) buf = hton32 (keylen); + buf += DICT_DATA_HDR_KEY_LEN; + + if (!pair->value) { + gf_log ("dict", GF_LOG_ERROR, + "pair->value is null!"); + goto out; + } + + vallen = pair->value->len; + *(int32_t *) buf = hton32 (vallen); + buf 
+= DICT_DATA_HDR_VAL_LEN; + + memcpy (buf, pair->key, keylen); + buf += keylen; + *buf++ = '\0'; + + if (!pair->value->data) { + gf_log ("dict", GF_LOG_ERROR, + "pair->value->data is null!"); + goto out; + } + memcpy (buf, pair->value->data, vallen); + buf += vallen; + + pair = pair->next; + count--; + } + + ret = 0; +out: + return ret; +} + + +/** + * dict_unserialize - unserialize a buffer into a dict + * + * @buf: buf containing serialized dict + * @size: size of the @buf + * @fill: dict to fill in + * + * @return: success: 0 + * failure: -errno + */ + +int32_t +dict_unserialize (char *orig_buf, int32_t size, dict_t **fill) +{ + char *buf = NULL; + int ret = -1; + int32_t count = 0; + int i = 0; + + data_t * value = NULL; + char * key = NULL; + int32_t keylen = 0; + int32_t vallen = 0; + + + buf = orig_buf; + + if (!buf) { + gf_log ("dict", GF_LOG_ERROR, + "buf is null!"); + goto out; + } + + if (size == 0) { + gf_log ("dict", GF_LOG_ERROR, + "size is 0!"); + goto out; + } + + if (!fill) { + gf_log ("dict", GF_LOG_ERROR, + "fill is null!"); + goto out; + } + + if (!*fill) { + gf_log ("dict", GF_LOG_ERROR, + "*fill is null!"); + goto out; + } + + if ((buf + DICT_HDR_LEN) > (orig_buf + size)) { + gf_log ("dict", GF_LOG_ERROR, + "undersized buffer passsed"); + goto out; + } + + count = ntoh32 (*(int32_t *) buf); + buf += DICT_HDR_LEN; + + if (count < 0) { + gf_log ("dict", GF_LOG_ERROR, + "count (%d) <= 0", count); + goto out; + } + + /* count will be set by the dict_set's below */ + (*fill)->count = 0; + + for (i = 0; i < count; i++) { + if ((buf + DICT_DATA_HDR_KEY_LEN) > (orig_buf + size)) { + gf_log ("dict", GF_LOG_ERROR, + "undersized buffer passsed"); + goto out; + } + keylen = ntoh32 (*(int32_t *) buf); + buf += DICT_DATA_HDR_KEY_LEN; + + if ((buf + DICT_DATA_HDR_VAL_LEN) > (orig_buf + size)) { + gf_log ("dict", GF_LOG_ERROR, + "undersized buffer passsed"); + goto out; + } + vallen = ntoh32 (*(int32_t *) buf); + buf += DICT_DATA_HDR_VAL_LEN; + + if ((buf + 
keylen) > (orig_buf + size)) { + gf_log ("dict", GF_LOG_ERROR, + "undersized buffer passsed"); + goto out; + } + key = buf; + buf += keylen + 1; /* for '\0' */ + + if ((buf + vallen) > (orig_buf + size)) { + gf_log ("dict", GF_LOG_ERROR, + "undersized buffer passsed"); + goto out; + } + value = get_new_data (); + value->len = vallen; + value->data = buf; + value->is_static = 1; + buf += vallen; + + dict_set (*fill, key, value); + } + + ret = 0; +out: + return ret; +} + diff --git a/libglusterfs/src/dict.h b/libglusterfs/src/dict.h new file mode 100644 index 000000000..5c299d039 --- /dev/null +++ b/libglusterfs/src/dict.h @@ -0,0 +1,179 @@ +/* + Copyright (c) 2006, 2007, 2008 Z RESEARCH, Inc. + This file is part of GlusterFS. + + GlusterFS is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published + by the Free Software Foundation; either version 3 of the License, + or (at your option) any later version. + + GlusterFS is distributed in the hope that it will be useful, but + WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program. If not, see + . 
+*/ + +#ifndef _DICT_H +#define _DICT_H + +#ifndef _CONFIG_H +#define _CONFIG_H +#include "config.h" +#endif + +#include +#include +#include + +#include "common-utils.h" + +typedef struct _data data_t; +typedef struct _dict dict_t; +typedef struct _data_pair data_pair_t; + +struct _data { + unsigned char is_static:1; + unsigned char is_const:1; + int32_t len; + struct iovec *vec; + char *data; + int32_t refcount; + gf_lock_t lock; +}; + +struct _data_pair { + struct _data_pair *hash_next; + struct _data_pair *prev; + struct _data_pair *next; + data_t *value; + char *key; +}; + +struct _dict { + unsigned char is_static:1; + int32_t hash_size; + int32_t count; + int32_t refcount; + data_pair_t **members; + data_pair_t *members_list; + char *extra_free; + gf_lock_t lock; +}; + + +int32_t is_data_equal (data_t *one, data_t *two); +void data_destroy (data_t *data); + +int32_t dict_set (dict_t *this, char *key, data_t *value); +data_t *dict_get (dict_t *this, char *key); +void dict_del (dict_t *this, char *key); + +int32_t dict_serialized_length (dict_t *dict); +int32_t dict_serialize (dict_t *dict, char *buf); +int32_t dict_unserialize (char *buf, int32_t size, dict_t **fill); + +int32_t dict_iovec_len (dict_t *dict); +int32_t dict_to_iovec (dict_t *dict, struct iovec *vec, int32_t count); + +void dict_destroy (dict_t *dict); +void dict_unref (dict_t *dict); +dict_t *dict_ref (dict_t *dict); +data_t *data_ref (data_t *data); +void data_unref (data_t *data); + +/* + TODO: provide converts for differnt byte sizes, signedness, and void * + */ +data_t *int_to_data (int64_t value); +data_t *str_to_data (char *value); +data_t *data_from_dynstr (char *value); +data_t *data_from_dynptr (void *value, int32_t len); +data_t *bin_to_data (void *value, int32_t len); +data_t *static_str_to_data (char *value); +data_t *static_bin_to_data (void *value); + +int64_t data_to_int64 (data_t *data); +int32_t data_to_int32 (data_t *data); +int16_t data_to_int16 (data_t *data); +int8_t 
data_to_int8 (data_t *data); + +uint64_t data_to_uint64 (data_t *data); +uint32_t data_to_uint32 (data_t *data); +uint16_t data_to_uint16 (data_t *data); + +data_t *data_from_ptr (void *value); +data_t *data_from_static_ptr (void *value); + +data_t *data_from_int64 (int64_t value); +data_t *data_from_int32 (int32_t value); +data_t *data_from_int16 (int16_t value); +data_t *data_from_int8 (int8_t value); + +data_t *data_from_uint64 (uint64_t value); +data_t *data_from_uint32 (uint32_t value); +data_t *data_from_uint16 (uint16_t value); + +char *data_to_str (data_t *data); +void *data_to_bin (data_t *data); +void *data_to_ptr (data_t *data); + +data_t *get_new_data (); +dict_t *get_new_dict_full (int size_hint); +dict_t *get_new_dict (); + +data_pair_t *get_new_data_pair (); + +void dict_foreach (dict_t *this, + void (*fn)(dict_t *this, + char *key, + data_t *value, + void *data), + void *data); + +dict_t *dict_copy (dict_t *this, + dict_t *new); + +/* CLEANED UP FUNCTIONS DECLARATIONS */ +GF_MUST_CHECK dict_t *dict_new (void); +dict_t *dict_copy_with_ref (dict_t *this, + dict_t *new); + +GF_MUST_CHECK int dict_get_int8 (dict_t *this, char *key, int8_t *val); +GF_MUST_CHECK int dict_set_int8 (dict_t *this, char *key, int8_t val); + +GF_MUST_CHECK int dict_get_int16 (dict_t *this, char *key, int16_t *val); +GF_MUST_CHECK int dict_set_int16 (dict_t *this, char *key, int16_t val); + +GF_MUST_CHECK int dict_get_int32 (dict_t *this, char *key, int32_t *val); +GF_MUST_CHECK int dict_set_int32 (dict_t *this, char *key, int32_t val); + +GF_MUST_CHECK int dict_get_int64 (dict_t *this, char *key, int64_t *val); +GF_MUST_CHECK int dict_set_int64 (dict_t *this, char *key, int64_t val); + +GF_MUST_CHECK int dict_get_uint16 (dict_t *this, char *key, uint16_t *val); +GF_MUST_CHECK int dict_set_uint16 (dict_t *this, char *key, uint16_t val); + +GF_MUST_CHECK int dict_get_uint32 (dict_t *this, char *key, uint32_t *val); +GF_MUST_CHECK int dict_set_uint32 (dict_t *this, char *key, 
uint32_t val); + +GF_MUST_CHECK int dict_get_uint64 (dict_t *this, char *key, uint64_t *val); +GF_MUST_CHECK int dict_set_uint64 (dict_t *this, char *key, uint64_t val); + +GF_MUST_CHECK int dict_set_static_ptr (dict_t *this, char *key, void *ptr); +GF_MUST_CHECK int dict_get_ptr (dict_t *this, char *key, void **ptr); +GF_MUST_CHECK int dict_set_ptr (dict_t *this, char *key, void *ptr); +GF_MUST_CHECK int dict_set_dynptr (dict_t *this, char *key, void *ptr, size_t size); + +GF_MUST_CHECK int dict_get_bin (dict_t *this, char *key, void **ptr); +GF_MUST_CHECK int dict_set_bin (dict_t *this, char *key, void *ptr, size_t size); +GF_MUST_CHECK int dict_set_static_bin (dict_t *this, char *key, void *ptr, size_t size); + +GF_MUST_CHECK int dict_set_str (dict_t *this, char *key, char *str); +GF_MUST_CHECK int dict_set_dynstr (dict_t *this, char *key, char *str); +GF_MUST_CHECK int dict_get_str (dict_t *this, char *key, char **str); + +#endif diff --git a/libglusterfs/src/event.c b/libglusterfs/src/event.c new file mode 100644 index 000000000..f2bbddd20 --- /dev/null +++ b/libglusterfs/src/event.c @@ -0,0 +1,978 @@ +/* + Copyright (c) 2008 Z RESEARCH, Inc. + This file is part of GlusterFS. + + GlusterFS is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published + by the Free Software Foundation; either version 3 of the License, + or (at your option) any later version. + + GlusterFS is distributed in the hope that it will be useful, but + WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program. If not, see + . 
+*/ + +#include +#include +#include +#include +#include +#include +#include + +#include "logging.h" +#include "event.h" +#include "mem-pool.h" + + +#ifndef _CONFIG_H +#define _CONFIG_H +#include "config.h" +#endif + +static int +event_register_poll (struct event_pool *event_pool, int fd, + event_handler_t handler, + void *data, int poll_in, int poll_out); + + +static int +__flush_fd (int fd, int idx, void *data, + int poll_in, int poll_out, int poll_err) +{ + char buf[64]; + int ret = -1; + + if (!poll_in) + return ret; + + do { + ret = read (fd, buf, 64); + if (ret == -1 && errno != EAGAIN) { + gf_log ("poll", GF_LOG_ERROR, + "read on %d returned error (%s)", + fd, strerror (errno)); + } + } while (ret == 64); + + return ret; +} + + +static int +__event_getindex (struct event_pool *event_pool, int fd, int idx) +{ + int ret = -1; + int i = 0; + + if (event_pool == NULL) { + gf_log ("event", GF_LOG_ERROR, "invalid argument"); + return -1; + } + + if (idx > -1 && idx < event_pool->used) { + if (event_pool->reg[idx].fd == fd) + ret = idx; + } + + for (i=0; ret == -1 && iused; i++) { + if (event_pool->reg[i].fd == fd) { + ret = i; + break; + } + } + + return ret; +} + + +static struct event_pool * +event_pool_new_poll (int count) +{ + struct event_pool *event_pool = NULL; + int ret = -1; + + event_pool = CALLOC (1, sizeof (*event_pool)); + + if (!event_pool) + return NULL; + + event_pool->count = count; + event_pool->reg = CALLOC (event_pool->count, + sizeof (*event_pool->reg)); + + if (!event_pool->reg) { + gf_log ("poll", GF_LOG_CRITICAL, + "failed to allocate event registry"); + free (event_pool); + return NULL; + } + + pthread_mutex_init (&event_pool->mutex, NULL); + + ret = pipe (event_pool->breaker); + + if (ret == -1) { + gf_log ("poll", GF_LOG_ERROR, + "pipe creation failed (%s)", strerror (errno)); + free (event_pool->reg); + free (event_pool); + return NULL; + } + + ret = fcntl (event_pool->breaker[0], F_SETFL, O_NONBLOCK); + if (ret == -1) { + gf_log 
("poll", GF_LOG_ERROR, + "could not set pipe to non blocking mode (%s)", + strerror (errno)); + close (event_pool->breaker[0]); + close (event_pool->breaker[1]); + event_pool->breaker[0] = event_pool->breaker[1] = -1; + + free (event_pool->reg); + free (event_pool); + return NULL; + } + + ret = fcntl (event_pool->breaker[1], F_SETFL, O_NONBLOCK); + if (ret == -1) { + gf_log ("poll", GF_LOG_ERROR, + "could not set pipe to non blocking mode (%s)", + strerror (errno)); + + close (event_pool->breaker[0]); + close (event_pool->breaker[1]); + event_pool->breaker[0] = event_pool->breaker[1] = -1; + + free (event_pool->reg); + free (event_pool); + return NULL; + } + + ret = event_register_poll (event_pool, event_pool->breaker[0], + __flush_fd, NULL, 1, 0); + if (ret == -1) { + gf_log ("poll", GF_LOG_ERROR, + "could not register pipe fd with poll event loop"); + close (event_pool->breaker[0]); + close (event_pool->breaker[1]); + event_pool->breaker[0] = event_pool->breaker[1] = -1; + + free (event_pool->reg); + free (event_pool); + return NULL; + } + + return event_pool; +} + + +static int +event_register_poll (struct event_pool *event_pool, int fd, + event_handler_t handler, + void *data, int poll_in, int poll_out) +{ + int idx = -1; + + if (event_pool == NULL) { + gf_log ("event", GF_LOG_ERROR, "invalid argument"); + return -1; + } + + pthread_mutex_lock (&event_pool->mutex); + { + if (event_pool->count == event_pool->used) + { + event_pool->count += 256; + + event_pool->reg = realloc (event_pool->reg, + event_pool->count * + sizeof (*event_pool->reg)); + } + + idx = event_pool->used++; + + event_pool->reg[idx].fd = fd; + event_pool->reg[idx].events = POLLPRI; + event_pool->reg[idx].handler = handler; + event_pool->reg[idx].data = data; + + switch (poll_in) { + case 1: + event_pool->reg[idx].events |= POLLIN; + break; + case 0: + event_pool->reg[idx].events &= ~POLLIN; + break; + case -1: + /* do nothing */ + break; + default: + gf_log ("poll", GF_LOG_ERROR, + "invalid 
poll_in value %d", poll_in); + break; + } + + switch (poll_out) { + case 1: + event_pool->reg[idx].events |= POLLOUT; + break; + case 0: + event_pool->reg[idx].events &= ~POLLOUT; + break; + case -1: + /* do nothing */ + break; + default: + gf_log ("poll", GF_LOG_ERROR, + "invalid poll_out value %d", poll_in); + break; + } + + event_pool->changed = 1; + + } + pthread_mutex_unlock (&event_pool->mutex); + + return idx; +} + + +static int +event_unregister_poll (struct event_pool *event_pool, int fd, int idx_hint) +{ + int idx = -1; + + if (event_pool == NULL) { + gf_log ("event", GF_LOG_ERROR, "invalid argument"); + return -1; + } + + pthread_mutex_lock (&event_pool->mutex); + { + idx = __event_getindex (event_pool, fd, idx_hint); + + if (idx == -1) { + gf_log ("poll", GF_LOG_ERROR, + "index not found for fd=%d (idx_hint=%d)", + fd, idx_hint); + errno = ENOENT; + goto unlock; + } + + event_pool->reg[idx] = event_pool->reg[--event_pool->used]; + event_pool->changed = 1; + } +unlock: + pthread_mutex_unlock (&event_pool->mutex); + + return idx; +} + + +static int +event_select_on_poll (struct event_pool *event_pool, int fd, int idx_hint, + int poll_in, int poll_out) +{ + int idx = -1; + + if (event_pool == NULL) { + gf_log ("event", GF_LOG_ERROR, "invalid argument"); + return -1; + } + + pthread_mutex_lock (&event_pool->mutex); + { + idx = __event_getindex (event_pool, fd, idx_hint); + + if (idx == -1) { + gf_log ("poll", GF_LOG_ERROR, + "index not found for fd=%d (idx_hint=%d)", + fd, idx_hint); + errno = ENOENT; + goto unlock; + } + + switch (poll_in) { + case 1: + event_pool->reg[idx].events |= POLLIN; + break; + case 0: + event_pool->reg[idx].events &= ~POLLIN; + break; + case -1: + /* do nothing */ + break; + default: + /* TODO: log error */ + break; + } + + switch (poll_out) { + case 1: + event_pool->reg[idx].events |= POLLOUT; + break; + case 0: + event_pool->reg[idx].events &= ~POLLOUT; + break; + case -1: + /* do nothing */ + break; + default: + /* TODO: log 
error */ + break; + } + + if (poll_in + poll_out > -2) + event_pool->changed = 1; + } +unlock: + pthread_mutex_unlock (&event_pool->mutex); + + return idx; +} + + +static int +event_dispatch_poll_handler (struct event_pool *event_pool, + struct pollfd *ufds, int i) +{ + event_handler_t handler = NULL; + void *data = NULL; + int idx = -1; + int ret = 0; + + handler = NULL; + data = NULL; + idx = -1; + + pthread_mutex_lock (&event_pool->mutex); + { + idx = __event_getindex (event_pool, ufds[i].fd, i); + + if (idx == -1) { + gf_log ("poll", GF_LOG_ERROR, + "index not found for fd=%d (idx_hint=%d)", + ufds[i].fd, i); + goto unlock; + } + + handler = event_pool->reg[idx].handler; + data = event_pool->reg[idx].data; + } +unlock: + pthread_mutex_unlock (&event_pool->mutex); + + if (handler) + ret = handler (ufds[i].fd, idx, data, + (ufds[i].revents & (POLLIN|POLLPRI)), + (ufds[i].revents & (POLLOUT)), + (ufds[i].revents & (POLLERR|POLLHUP|POLLNVAL))); + + return ret; +} + + +static int +event_dispatch_poll_resize (struct event_pool *event_pool, + struct pollfd *ufds, int size) +{ + int i = 0; + + pthread_mutex_lock (&event_pool->mutex); + { + if (event_pool->changed == 0) { + goto unlock; + } + + if (event_pool->used > event_pool->evcache_size) { + if (event_pool->evcache) + free (event_pool->evcache); + + event_pool->evcache = ufds = NULL; + + event_pool->evcache_size = event_pool->used; + + ufds = CALLOC (sizeof (struct pollfd), + event_pool->evcache_size); + event_pool->evcache = ufds; + } + + for (i = 0; i < event_pool->used; i++) { + ufds[i].fd = event_pool->reg[i].fd; + ufds[i].events = event_pool->reg[i].events; + ufds[i].revents = 0; + } + + size = i; + } +unlock: + pthread_mutex_unlock (&event_pool->mutex); + + return size; +} + + +static int +event_dispatch_poll (struct event_pool *event_pool) +{ + struct pollfd *ufds = NULL; + int size = 0; + int i = 0; + int ret = -1; + + + if (event_pool == NULL) { + gf_log ("event", GF_LOG_ERROR, "invalid argument"); + 
return -1; + } + + while (1) { + size = event_dispatch_poll_resize (event_pool, ufds, size); + ufds = event_pool->evcache; + + ret = poll (ufds, size, 1); + + if (ret == 0) + /* timeout */ + continue; + + if (ret == -1 && errno == EINTR) + /* sys call */ + continue; + + for (i = 0; i < size; i++) { + if (!ufds[i].revents) + continue; + + event_dispatch_poll_handler (event_pool, ufds, i); + } + } + + return -1; +} + + +static struct event_ops event_ops_poll = { + .new = event_pool_new_poll, + .event_register = event_register_poll, + .event_select_on = event_select_on_poll, + .event_unregister = event_unregister_poll, + .event_dispatch = event_dispatch_poll +}; + + + +#ifdef HAVE_SYS_EPOLL_H +#include + + +static struct event_pool * +event_pool_new_epoll (int count) +{ + struct event_pool *event_pool = NULL; + int epfd = -1; + + event_pool = CALLOC (1, sizeof (*event_pool)); + + if (!event_pool) + return NULL; + + event_pool->count = count; + event_pool->reg = CALLOC (event_pool->count, + sizeof (*event_pool->reg)); + + if (!event_pool->reg) { + gf_log ("epoll", GF_LOG_CRITICAL, + "event registry allocation failed"); + free (event_pool); + return NULL; + } + + epfd = epoll_create (count); + + if (epfd == -1) { + gf_log ("epoll", GF_LOG_ERROR, "epoll fd creation failed (%s)", + strerror (errno)); + free (event_pool->reg); + free (event_pool); + return NULL; + } + + event_pool->fd = epfd; + + event_pool->count = count; + + pthread_mutex_init (&event_pool->mutex, NULL); + pthread_cond_init (&event_pool->cond, NULL); + + return event_pool; +} + + +int +event_register_epoll (struct event_pool *event_pool, int fd, + event_handler_t handler, + void *data, int poll_in, int poll_out) +{ + int idx = -1; + int ret = -1; + struct epoll_event epoll_event = {0, }; + struct event_data *ev_data = (void *)&epoll_event.data; + + + if (event_pool == NULL) { + gf_log ("event", GF_LOG_ERROR, "invalid argument"); + return -1; + } + + pthread_mutex_lock (&event_pool->mutex); + { + if 
(event_pool->count == event_pool->used) { + /* grow through temporaries so a failed realloc leaves + * the old registry and count intact; a zero count + * cannot be doubled, so start from a fixed size */ + int new_count = (event_pool->count > 0) ? + event_pool->count * 2 : 256; + void *new_reg = realloc (event_pool->reg, + new_count * + sizeof (*event_pool->reg)); + + if (!new_reg) { + gf_log ("epoll", GF_LOG_ERROR, + "event registry re-allocation failed"); + goto unlock; + } + + event_pool->reg = new_reg; + event_pool->count = new_count; + } + + idx = event_pool->used; + event_pool->used++; + + event_pool->reg[idx].fd = fd; + event_pool->reg[idx].events = EPOLLPRI; + event_pool->reg[idx].handler = handler; + event_pool->reg[idx].data = data; + + switch (poll_in) { + case 1: + event_pool->reg[idx].events |= EPOLLIN; + break; + case 0: + event_pool->reg[idx].events &= ~EPOLLIN; + break; + case -1: + /* do nothing */ + break; + default: + gf_log ("epoll", GF_LOG_ERROR, + "invalid poll_in value %d", poll_in); + break; + } + + switch (poll_out) { + case 1: + event_pool->reg[idx].events |= EPOLLOUT; + break; + case 0: + event_pool->reg[idx].events &= ~EPOLLOUT; + break; + case -1: + /* do nothing */ + break; + default: + gf_log ("epoll", GF_LOG_ERROR, + "invalid poll_out value %d", poll_out); + break; + } + + event_pool->changed = 1; + + epoll_event.events = event_pool->reg[idx].events; + ev_data->fd = fd; + ev_data->idx = idx; + + ret = epoll_ctl (event_pool->fd, EPOLL_CTL_ADD, fd, + &epoll_event); + + if (ret == -1) { + gf_log ("epoll", GF_LOG_ERROR, + "failed to add fd(=%d) to epoll fd(=%d) (%s)", + fd, event_pool->fd, strerror (errno)); + /* idx is the last slot and the mutex is held: give the + * slot back so no phantom entry stays in the registry */ + event_pool->used--; + goto unlock; + } + + pthread_cond_broadcast (&event_pool->cond); + } +unlock: + pthread_mutex_unlock (&event_pool->mutex); + + return ret; +} + + +static int +event_unregister_epoll (struct event_pool *event_pool, int fd, int idx_hint) +{ + int idx = -1; + int ret = -1; + + struct epoll_event epoll_event = {0, }; + struct event_data *ev_data = (void *)&epoll_event.data; + int lastidx = -1; + + if (event_pool == NULL) { + gf_log ("event", GF_LOG_ERROR, "invalid argument"); + return -1; + } + + pthread_mutex_lock (&event_pool->mutex); + { + idx = __event_getindex (event_pool, fd, 
idx_hint); + + if (idx == -1) { + gf_log ("epoll", GF_LOG_ERROR, + "index not found for fd=%d (idx_hint=%d)", + fd, idx_hint); + errno = ENOENT; + goto unlock; + } + + ret = epoll_ctl (event_pool->fd, EPOLL_CTL_DEL, fd, NULL); + + /* if ret is -1, this array member should never be accessed */ + /* if it is 0, the array member might be used by idx_cache + * in which case the member should not be accessed till + * it is reallocated + */ + + event_pool->reg[idx].fd = -1; + + if (ret == -1) { + gf_log ("epoll", GF_LOG_ERROR, + "fail to del fd(=%d) from epoll fd(=%d) (%s)", + fd, event_pool->fd, strerror (errno)); + goto unlock; + } + + lastidx = event_pool->used - 1; + if (lastidx == idx) { + event_pool->used--; + goto unlock; + } + + epoll_event.events = event_pool->reg[lastidx].events; + ev_data->fd = event_pool->reg[lastidx].fd; + ev_data->idx = idx; + + ret = epoll_ctl (event_pool->fd, EPOLL_CTL_MOD, ev_data->fd, + &epoll_event); + if (ret == -1) { + gf_log ("epoll", GF_LOG_ERROR, + "fail to modify fd(=%d) index %d to %d (%s)", + ev_data->fd, event_pool->used, idx, + strerror (errno)); + goto unlock; + } + + /* just replace the unregistered idx by last one */ + event_pool->reg[idx] = event_pool->reg[lastidx]; + event_pool->used--; + } +unlock: + pthread_mutex_unlock (&event_pool->mutex); + + return ret; +} + + +static int +event_select_on_epoll (struct event_pool *event_pool, int fd, int idx_hint, + int poll_in, int poll_out) +{ + int idx = -1; + int ret = -1; + + struct epoll_event epoll_event = {0, }; + struct event_data *ev_data = (void *)&epoll_event.data; + + + if (event_pool == NULL) { + gf_log ("event", GF_LOG_ERROR, "invalid argument"); + return -1; + } + + pthread_mutex_lock (&event_pool->mutex); + { + idx = __event_getindex (event_pool, fd, idx_hint); + + if (idx == -1) { + gf_log ("epoll", GF_LOG_ERROR, + "index not found for fd=%d (idx_hint=%d)", + fd, idx_hint); + errno = ENOENT; + goto unlock; + } + + switch (poll_in) { + case 1: + 
event_pool->reg[idx].events |= EPOLLIN; + break; + case 0: + event_pool->reg[idx].events &= ~EPOLLIN; + break; + case -1: + /* do nothing */ + break; + default: + gf_log ("epoll", GF_LOG_ERROR, + "invalid poll_in value %d", poll_in); + break; + } + + switch (poll_out) { + case 1: + event_pool->reg[idx].events |= EPOLLOUT; + break; + case 0: + event_pool->reg[idx].events &= ~EPOLLOUT; + break; + case -1: + /* do nothing */ + break; + default: + gf_log ("epoll", GF_LOG_ERROR, + "invalid poll_out value %d", poll_out); + break; + } + + epoll_event.events = event_pool->reg[idx].events; + ev_data->fd = fd; + ev_data->idx = idx; + + ret = epoll_ctl (event_pool->fd, EPOLL_CTL_MOD, fd, + &epoll_event); + if (ret == -1) { + gf_log ("epoll", GF_LOG_ERROR, + "failed to modify fd(=%d) events to %d", + fd, epoll_event.events); + } + } +unlock: + pthread_mutex_unlock (&event_pool->mutex); + + return ret; +} + + +/* look up the handler registered for events[i] under the pool mutex, then call it with the mutex released; returns the handler's result, or -1 if none was found */ +static int +event_dispatch_epoll_handler (struct event_pool *event_pool, + struct epoll_event *events, int i) +{ + struct event_data *event_data = NULL; + event_handler_t handler = NULL; + void *data = NULL; + int idx = -1; + int ret = -1; + + + event_data = (void *)&events[i].data; + handler = NULL; + data = NULL; + idx = -1; + + pthread_mutex_lock (&event_pool->mutex); + { + idx = __event_getindex (event_pool, event_data->fd, + event_data->idx); + + if (idx == -1) { + gf_log ("epoll", GF_LOG_ERROR, + "index not found for fd(=%d) (idx_hint=%d)", + event_data->fd, event_data->idx); + goto unlock; + } + + handler = event_pool->reg[idx].handler; + data = event_pool->reg[idx].data; + } +unlock: + pthread_mutex_unlock (&event_pool->mutex); + + if (handler) + ret = handler (event_data->fd, event_data->idx, data, + (events[i].events & (EPOLLIN|EPOLLPRI)), + (events[i].events & (EPOLLOUT)), + (events[i].events & (EPOLLERR|EPOLLHUP))); + return ret; +} + + +static int +event_dispatch_epoll (struct event_pool *event_pool) +{ + struct epoll_event *events = NULL; + int size = 0; + 
int i = 0; + int ret = -1; + + + if (event_pool == NULL) { + gf_log ("event", GF_LOG_ERROR, "invalid argument"); + return -1; + } + + while (1) { + pthread_mutex_lock (&event_pool->mutex); + { + while (event_pool->used == 0) + pthread_cond_wait (&event_pool->cond, + &event_pool->mutex); + + if (event_pool->used > event_pool->evcache_size) { + if (event_pool->evcache) + free (event_pool->evcache); + + event_pool->evcache = events = NULL; + + event_pool->evcache_size = + event_pool->used + 256; + + events = CALLOC (event_pool->evcache_size, + sizeof (struct epoll_event)); + + event_pool->evcache = events; + } + } + pthread_mutex_unlock (&event_pool->mutex); + + ret = epoll_wait (event_pool->fd, event_pool->evcache, + event_pool->evcache_size, -1); + + if (ret == 0) + /* timeout */ + continue; + + if (ret == -1 && errno == EINTR) + /* sys call */ + continue; + + size = ret; + + for (i = 0; i < size; i++) { + if (!events[i].events) + continue; + + ret = event_dispatch_epoll_handler (event_pool, + events, i); + } + } + + return -1; +} + + +static struct event_ops event_ops_epoll = { + .new = event_pool_new_epoll, + .event_register = event_register_epoll, + .event_select_on = event_select_on_epoll, + .event_unregister = event_unregister_epoll, + .event_dispatch = event_dispatch_epoll +}; + +#endif + + +struct event_pool * +event_pool_new (int count) +{ + struct event_pool *event_pool = NULL; + +#ifdef HAVE_SYS_EPOLL_H + event_pool = event_ops_epoll.new (count); + + if (event_pool) { + event_pool->ops = &event_ops_epoll; + } else { + gf_log ("event", GF_LOG_WARNING, + "failing back to poll based event handling"); + } +#endif + + if (!event_pool) { + event_pool = event_ops_poll.new (count); + + if (event_pool) + event_pool->ops = &event_ops_poll; + } + + return event_pool; +} + + +int +event_register (struct event_pool *event_pool, int fd, + event_handler_t handler, + void *data, int poll_in, int poll_out) +{ + int ret = -1; + + if (event_pool == NULL) { + gf_log ("event", 
GF_LOG_ERROR, "invalid argument"); + return -1; + } + + ret = event_pool->ops->event_register (event_pool, fd, handler, data, + poll_in, poll_out); + return ret; +} + + +int +event_unregister (struct event_pool *event_pool, int fd, int idx) +{ + int ret = -1; + + if (event_pool == NULL) { + gf_log ("event", GF_LOG_ERROR, "invalid argument"); + return -1; + } + + ret = event_pool->ops->event_unregister (event_pool, fd, idx); + + return ret; +} + + +int +event_select_on (struct event_pool *event_pool, int fd, int idx_hint, + int poll_in, int poll_out) +{ + int ret = -1; + + if (event_pool == NULL) { + gf_log ("event", GF_LOG_ERROR, "invalid argument"); + return -1; + } + + ret = event_pool->ops->event_select_on (event_pool, fd, idx_hint, + poll_in, poll_out); + return ret; +} + + +int +event_dispatch (struct event_pool *event_pool) +{ + int ret = -1; + + if (event_pool == NULL) { + gf_log ("event", GF_LOG_ERROR, "invalid argument"); + return -1; + } + + ret = event_pool->ops->event_dispatch (event_pool); + + return ret; +} diff --git a/libglusterfs/src/event.h b/libglusterfs/src/event.h new file mode 100644 index 000000000..ec80e2a58 --- /dev/null +++ b/libglusterfs/src/event.h @@ -0,0 +1,90 @@ +/* + Copyright (c) 2008 Z RESEARCH, Inc. + This file is part of GlusterFS. + + GlusterFS is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published + by the Free Software Foundation; either version 3 of the License, + or (at your option) any later version. + + GlusterFS is distributed in the hope that it will be useful, but + WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program. If not, see + . 
+*/ + +#ifndef _EVENT_H_ +#define _EVENT_H_ + +#ifndef _CONFIG_H +#define _CONFIG_H +#include "config.h" +#endif + +#include + +struct event_pool; +struct event_ops; +struct event_data { + int fd; + int idx; +} __attribute__ ((__packed__)); + + +typedef int (*event_handler_t) (int fd, int idx, void *data, + int poll_in, int poll_out, int poll_err); + +struct event_pool { + struct event_ops *ops; + + int fd; + int breaker[2]; + + int count; + struct { + int fd; + int events; + void *data; + event_handler_t handler; + } *reg; + + int used; + int idx_cache; + int changed; + + pthread_mutex_t mutex; + pthread_cond_t cond; + + void *evcache; + int evcache_size; +}; + +struct event_ops { + struct event_pool * (*new) (int count); + + int (*event_register) (struct event_pool *event_pool, int fd, + event_handler_t handler, + void *data, int poll_in, int poll_out); + + int (*event_select_on) (struct event_pool *event_pool, int fd, int idx, + int poll_in, int poll_out); + + int (*event_unregister) (struct event_pool *event_pool, int fd, int idx); + + int (*event_dispatch) (struct event_pool *event_pool); +}; + +struct event_pool * event_pool_new (int count); +int event_select_on (struct event_pool *event_pool, int fd, int idx, + int poll_in, int poll_out); +int event_register (struct event_pool *event_pool, int fd, + event_handler_t handler, + void *data, int poll_in, int poll_out); +int event_unregister (struct event_pool *event_pool, int fd, int idx); +int event_dispatch (struct event_pool *event_pool); + +#endif /* _EVENT_H_ */ diff --git a/libglusterfs/src/fd.c b/libglusterfs/src/fd.c new file mode 100644 index 000000000..78c578842 --- /dev/null +++ b/libglusterfs/src/fd.c @@ -0,0 +1,611 @@ +/* + Copyright (c) 2007, 2008, 2009 Z RESEARCH, Inc. + This file is part of GlusterFS. 
+
+  GlusterFS is free software; you can redistribute it and/or modify
+  it under the terms of the GNU General Public License as published
+  by the Free Software Foundation; either version 3 of the License,
+  or (at your option) any later version.
+
+  GlusterFS is distributed in the hope that it will be useful, but
+  WITHOUT ANY WARRANTY; without even the implied warranty of
+  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+  General Public License for more details.
+
+  You should have received a copy of the GNU General Public License
+  along with this program.  If not, see
+  <http://www.gnu.org/licenses/>.
+*/
+
+#include "fd.h"
+#include "glusterfs.h"
+#include "inode.h"
+#include "dict.h"
+
+
+#ifndef _CONFIG_H
+#define _CONFIG_H
+#include "config.h"
+#endif
+
+
+static uint32_t
+gf_fd_fdtable_expand (fdtable_t *fdtable, uint32_t nr);
+
+static fd_t *
+_fd_ref (fd_t *fd);
+
+/*
+ * Round nr up to the nearest power of two.
+ *
+ * 0 and 1 both yield 1; a value that already is a power of two is
+ * returned unchanged.  When no power of two >= nr fits in 32 bits
+ * (nr > 2^31) an error is logged and (uint32_t) -1 is returned.
+ */
+static inline uint32_t
+gf_roundup_power_of_two (uint32_t nr)
+{
+        uint32_t result = 1;
+
+        if (nr > ((uint32_t) 1 << 31)) {
+                gf_log ("server-protocol/fd",
+                        GF_LOG_ERROR,
+                        "no 32-bit power of two >= %u", nr);
+                return (uint32_t) -1;
+        }
+
+        /* unsigned doubling: cannot overflow because nr <= 2^31 */
+        while (result < nr)
+                result <<= 1;
+
+        return result;
+}
+
+/*
+  Allocate in memory chunks of power of 2 starting from 1024B
+  Assumes fdtable->lock is held
+*/
+static uint32_t
+gf_fd_fdtable_expand (fdtable_t *fdtable, uint32_t nr)
+{
+        fd_t     **oldfds = NULL;
+        uint32_t   oldmax_fds = -1;
+
+        if (fdtable == NULL)
+        {
+                gf_log ("fd", GF_LOG_ERROR, "invalid argument");
+                return EINVAL;
+        }
+
+        /* convert the slot count to 1024-byte chunks, round the chunk
+         * count up to a power of two, then convert back to slots */
+        nr /= (1024 / sizeof (fd_t *));
+        nr = gf_roundup_power_of_two (nr + 1);
+        nr *= (1024 / sizeof (fd_t *));
+
+        oldfds = fdtable->fds;
+        oldmax_fds = fdtable->max_fds;
+
+        fdtable->fds = CALLOC (nr, sizeof (fd_t *));
+        ERR_ABORT (fdtable->fds);
+        fdtable->max_fds = nr;
+
+        if 
(oldfds) {
+                /* carry the existing entries over to the new array */
+                uint32_t cpy = oldmax_fds * sizeof (fd_t *);
+                memcpy (fdtable->fds, oldfds, cpy);
+        }
+
+        FREE (oldfds);
+        return 0;
+}
+
+/*
+ * Allocate an fd table, initialise its lock and give it its initial
+ * slot array.  Returns NULL if the table itself cannot be allocated.
+ */
+fdtable_t *
+gf_fd_fdtable_alloc (void)
+{
+        fdtable_t *fdtable = NULL;
+
+        fdtable = CALLOC (1, sizeof (*fdtable));
+        if (!fdtable)
+                return NULL;
+
+        pthread_mutex_init (&fdtable->lock, NULL);
+
+        pthread_mutex_lock (&fdtable->lock);
+        {
+                gf_fd_fdtable_expand (fdtable, 0);
+        }
+        pthread_mutex_unlock (&fdtable->lock);
+
+        return fdtable;
+}
+
+/*
+ * Drop the table's reference on every fd still stored in it, then free
+ * the slot array, the lock and the table.  A NULL table is ignored.
+ */
+void
+gf_fd_fdtable_destroy (fdtable_t *fdtable)
+{
+
+        uint32_t i = 0;
+
+        if (fdtable) {
+                pthread_mutex_lock (&fdtable->lock);
+                {
+                        for (i = 0; i < fdtable->max_fds; i++) {
+                                if (fdtable->fds[i]) {
+                                        fd_t *fd = fdtable->fds[i];
+
+                                        fd_unref (fd);
+                                }
+                        }
+
+                        FREE (fdtable->fds);
+                }
+                pthread_mutex_unlock (&fdtable->lock);
+                pthread_mutex_destroy (&fdtable->lock);
+                FREE (fdtable);
+        }
+}
+
+/*
+ * Store fdptr in exactly slot `fd`, growing the table when needed.
+ *
+ * On success a reference is taken on fdptr and `fd` is returned.
+ * Returns -1 when the arguments are invalid (errno is set to EINVAL),
+ * when the table cannot be expanded, or when the slot is already taken.
+ */
+int32_t
+gf_fd_unused_get2 (fdtable_t *fdtable, fd_t *fdptr, int32_t fd)
+{
+        int32_t ret = -1;
+
+        if (fdtable == NULL || fdptr == NULL || fd < 0)
+        {
+                gf_log ("fd", GF_LOG_ERROR, "invalid argument");
+                errno = EINVAL;
+                return -1;
+        }
+
+        pthread_mutex_lock (&fdtable->lock);
+        {
+                /* valid slots are 0 .. max_fds - 1, so the table must
+                 * also grow when fd == max_fds */
+                while (fdtable->max_fds <= (uint32_t) fd) {
+                        int error = 0;
+                        error = gf_fd_fdtable_expand (fdtable, fdtable->max_fds + 1);
+                        if (error)
+                        {
+                                gf_log ("fd.c",
+                                        GF_LOG_ERROR,
+                                        "Cannot expand fdtable:%s", strerror (error));
+                                goto err;
+                        }
+                }
+
+                if (!fdtable->fds[fd])
+                {
+                        fdtable->fds[fd] = fdptr;
+                        fd_ref (fdptr);
+                        ret = fd;
+                }
+                else
+                {
+                        gf_log ("fd.c",
+                                GF_LOG_ERROR,
+                                "Cannot allocate fd %d (slot not empty in fdtable)", fd);
+                }
+        }
+err:
+        pthread_mutex_unlock (&fdtable->lock);
+
+        return ret;
+}
+
+
+int32_t
+gf_fd_unused_get (fdtable_t *fdtable, fd_t *fdptr)
+{
+        int32_t fd = -1, i = 0;
+
+        if (fdtable == NULL || fdptr == NULL)
+        {
+                gf_log ("fd", GF_LOG_ERROR, "invalid argument");
+                /* report failure as -1 (with errno), never as a
+                 * positive number that looks like a slot index */
+                errno = EINVAL;
+                return -1;
+        }
+
+        pthread_mutex_lock (&fdtable->lock);
+        {
+                /* find the first empty slot */
+                for (i = 0; i < fdtable->max_fds; i++)
+                {
+                        if (!fdtable->fds[i])
+                                break;
+                }
+
+ if (i < fdtable->max_fds) { + fdtable->fds[i] = fdptr; + fd = i; + } else { + int32_t error; + error = gf_fd_fdtable_expand (fdtable, fdtable->max_fds + 1); + if (error) { + gf_log ("server-protocol.c", + GF_LOG_ERROR, + "Cannot expand fdtable:%s", strerror (error)); + } else { + fdtable->fds[i] = fdptr; + fd = i; + } + } + } + pthread_mutex_unlock (&fdtable->lock); + + return fd; +} + + +inline void +gf_fd_put (fdtable_t *fdtable, int32_t fd) +{ + fd_t *fdptr = NULL; + if (fdtable == NULL || fd < 0) + { + gf_log ("fd", GF_LOG_ERROR, "invalid argument"); + return; + } + + if (!(fd < fdtable->max_fds)) + { + gf_log ("fd", GF_LOG_ERROR, "invalid argument"); + return; + } + + pthread_mutex_lock (&fdtable->lock); + { + fdptr = fdtable->fds[fd]; + fdtable->fds[fd] = NULL; + } + pthread_mutex_unlock (&fdtable->lock); + + if (fdptr) { + fd_unref (fdptr); + } +} + + +fd_t * +gf_fd_fdptr_get (fdtable_t *fdtable, int64_t fd) +{ + fd_t *fdptr = NULL; + + if (fdtable == NULL || fd < 0) + { + gf_log ("fd", GF_LOG_ERROR, "invalid argument"); + errno = EINVAL; + return NULL; + } + + if (!(fd < fdtable->max_fds)) + { + gf_log ("fd", GF_LOG_ERROR, "invalid argument"); + errno = EINVAL; + return NULL; + } + + pthread_mutex_lock (&fdtable->lock); + { + fdptr = fdtable->fds[fd]; + if (fdptr) { + fd_ref (fdptr); + } + } + pthread_mutex_unlock (&fdtable->lock); + + return fdptr; +} + +fd_t * +_fd_ref (fd_t *fd) +{ + ++fd->refcount; + + return fd; +} + +fd_t * +fd_ref (fd_t *fd) +{ + fd_t *refed_fd = NULL; + + if (!fd) { + gf_log ("fd", GF_LOG_ERROR, "@fd=%p", fd); + return NULL; + } + + LOCK (&fd->inode->lock); + refed_fd = _fd_ref (fd); + UNLOCK (&fd->inode->lock); + + return refed_fd; +} + +fd_t * +_fd_unref (fd_t *fd) +{ + assert (fd->refcount); + + --fd->refcount; + + if (fd->refcount == 0){ + list_del_init (&fd->inode_list); + } + + return fd; +} + +static void +fd_destroy (fd_t *fd) +{ + data_pair_t *pair = NULL; + xlator_t *xl = NULL; + int i = 0; + + if (fd == NULL){ + gf_log 
("xlator", GF_LOG_ERROR, "invalid arugument"); + goto out; + } + + if (fd->inode == NULL){ + gf_log ("xlator", GF_LOG_ERROR, "fd->inode is NULL"); + goto out; + } + if (!fd->_ctx) + goto out; + + if (S_ISDIR (fd->inode->st_mode)) { + for (pair = fd->ctx->members_list; pair; pair = pair->next) { + /* notify all xlators which have a context */ + xl = xlator_search_by_name (fd->inode->table->xl, + pair->key); + + if (!xl) { + gf_log ("fd", GF_LOG_CRITICAL, + "fd(%p)->ctx has invalid key(%s)", + fd, pair->key); + continue; + } + if (xl->cbks->releasedir) { + xl->cbks->releasedir (xl, fd); + } else { + gf_log ("fd", GF_LOG_CRITICAL, + "xlator(%s) in fd(%p) no RELEASE cbk", + xl->name, fd); + } + + } + for (i = 0; i < fd->inode->table->xl->ctx->xl_count; i++) { + if (fd->_ctx[i].key) { + xl = (xlator_t *)(long)fd->_ctx[i].key; + if (xl->cbks->releasedir) + xl->cbks->releasedir (xl, fd); + } + } + } else { + for (pair = fd->ctx->members_list; pair; pair = pair->next) { + /* notify all xlators which have a context */ + xl = xlator_search_by_name (fd->inode->table->xl, + pair->key); + + if (!xl) { + gf_log ("fd", GF_LOG_CRITICAL, + "fd(%p)->ctx has invalid key(%s)", + fd, pair->key); + continue; + } + if (xl->cbks->release) { + xl->cbks->release (xl, fd); + } else { + gf_log ("fd", GF_LOG_CRITICAL, + "xlator(%s) in fd(%p) no RELEASE cbk", + xl->name, fd); + } + } + for (i = 0; i < fd->inode->table->xl->ctx->xl_count; i++) { + if (fd->_ctx[i].key) { + xl = (xlator_t *)(long)fd->_ctx[i].key; + if (xl->cbks->release) + xl->cbks->release (xl, fd); + } + } + } + + FREE (fd->_ctx); + inode_unref (fd->inode); + fd->inode = (inode_t *)0xaaaaaaaa; + dict_destroy (fd->ctx); + FREE (fd); + +out: + return; +} + +void +fd_unref (fd_t *fd) +{ + int32_t refcount = 0; + + if (!fd) { + gf_log ("fd.c", GF_LOG_ERROR, "fd is NULL"); + return; + } + + LOCK (&fd->inode->lock); + { + _fd_unref (fd); + refcount = fd->refcount; + } + UNLOCK (&fd->inode->lock); + + if (refcount == 0) { + fd_destroy 
(fd);
+        }
+
+        return ;
+}
+
+/*
+ * Add fd to its inode's fd_list, under the inode lock.
+ * Returns fd, or NULL when fd is NULL.
+ */
+fd_t *
+fd_bind (fd_t *fd)
+{
+        inode_t *inode = NULL;
+
+        if (!fd) {
+                gf_log ("fd.c", GF_LOG_ERROR, "fd is NULL");
+                return NULL;
+        }
+
+        /* read fd->inode only after fd is known to be non-NULL */
+        inode = fd->inode;
+
+        LOCK (&inode->lock);
+        {
+                list_add (&fd->inode_list, &inode->fd_list);
+        }
+        UNLOCK (&inode->lock);
+
+        return fd;
+}
+
+/*
+ * Create a new fd for inode on behalf of process pid.
+ *
+ * The fd gets a context array with one entry per xlator
+ * (ctx->xl_count), a fresh dict, and a reference on inode.  It is
+ * returned with one reference taken and is not yet on the inode's
+ * fd_list.  Returns NULL when inode is NULL.
+ */
+fd_t *
+fd_create (inode_t *inode, pid_t pid)
+{
+        fd_t *fd = NULL;
+
+        if (inode == NULL) {
+                gf_log ("fd", GF_LOG_ERROR, "invalid argument");
+                return NULL;
+        }
+
+        fd = CALLOC (1, sizeof (fd_t));
+        ERR_ABORT (fd);
+
+        fd->_ctx = CALLOC (1, (sizeof (struct _fd_ctx) *
+                               inode->table->xl->ctx->xl_count));
+        fd->ctx = get_new_dict ();
+        fd->inode = inode_ref (inode);
+        fd->pid = pid;
+        INIT_LIST_HEAD (&fd->inode_list);
+
+        LOCK (&inode->lock);
+        fd = _fd_ref (fd);
+        UNLOCK (&inode->lock);
+
+        return fd;
+}
+
+/*
+ * Find an fd on inode's fd_list and return it with a new reference.
+ *
+ * With a non-zero pid the first fd opened by that pid is returned;
+ * with pid == 0 the first fd on the list is returned.  Returns NULL
+ * when nothing matches.
+ */
+fd_t *
+fd_lookup (inode_t *inode, pid_t pid)
+{
+        fd_t *fd = NULL;
+        fd_t *iter_fd = NULL;
+
+        LOCK (&inode->lock);
+        {
+                if (list_empty (&inode->fd_list)) {
+                        fd = NULL;
+                } else {
+                        list_for_each_entry (iter_fd, &inode->fd_list, inode_list) {
+                                if (pid) {
+                                        if (iter_fd->pid == pid) {
+                                                fd = _fd_ref (iter_fd);
+                                                break;
+                                        }
+                                } else {
+                                        fd = _fd_ref (iter_fd);
+                                        break;
+                                }
+                        }
+                }
+        }
+        UNLOCK (&inode->lock);
+
+        return fd;
+}
+
+/* Return non-zero when inode's fd_list is empty (checked under the inode lock). */
+uint8_t
+fd_list_empty (inode_t *inode)
+{
+        uint8_t empty = 0;
+
+        LOCK (&inode->lock);
+        {
+                empty = list_empty (&inode->fd_list);
+        }
+        UNLOCK (&inode->lock);
+
+        return empty;
+}
+
+/*
+ * Store value for xlator in fd's context array.
+ *
+ * The slot already keyed by xlator is reused if one exists; otherwise
+ * the first free slot is taken.  Returns 0 on success, -1 when fd or
+ * xlator is NULL or when all xl_count slots belong to other xlators.
+ */
+int
+fd_ctx_set (fd_t *fd, xlator_t *xlator, uint64_t value)
+{
+        int index = 0;
+
+        if (!fd || !xlator)
+                return -1;
+
+        for (index = 0; index < xlator->ctx->xl_count; index++) {
+                if (!fd->_ctx[index].key ||
+                    (fd->_ctx[index].key == (uint64_t)(long)xlator))
+                        break;
+        }
+
+        if (index == xlator->ctx->xl_count)
+                return -1;
+
+        /* the xlator's address serves as the slot key */
+        fd->_ctx[index].key   = (uint64_t)(long) xlator;
+        fd->_ctx[index].value = value;
+
+        return 0;
+}
+
+int
+fd_ctx_get (fd_t *fd, xlator_t *xlator, uint64_t *value)
+{
+        int index = 0;
+
+        if (!fd || !xlator)
+                return -1;
+
+        for (index = 
0; index < xlator->ctx->xl_count; index++) { + if (fd->_ctx[index].key == (uint64_t)(long)xlator) + break; + } + + if (index == xlator->ctx->xl_count) + return -1; + + if (value) + *value = fd->_ctx[index].value; + + return 0; +} + + +int +fd_ctx_del (fd_t *fd, xlator_t *xlator, uint64_t *value) +{ + int index = 0; + + if (!fd || !xlator) + return -1; + + for (index = 0; index < xlator->ctx->xl_count; index++) { + if (fd->_ctx[index].key == (uint64_t)(long)xlator) + break; + } + + if (index == xlator->ctx->xl_count) + return -1; + + if (value) + *value = fd->_ctx[index].value; + + fd->_ctx[index].key = 0; + fd->_ctx[index].value = 0; + + return 0; +} diff --git a/libglusterfs/src/fd.h b/libglusterfs/src/fd.h new file mode 100644 index 000000000..8b8effdc3 --- /dev/null +++ b/libglusterfs/src/fd.h @@ -0,0 +1,107 @@ +/* + Copyright (c) 2007, 2008, 2009 Z RESEARCH, Inc. + This file is part of GlusterFS. + + GlusterFS is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published + by the Free Software Foundation; either version 3 of the License, + or (at your option) any later version. + + GlusterFS is distributed in the hope that it will be useful, but + WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program. If not, see + . 
+*/ + +#ifndef _FD_H +#define _FD_H + +#ifndef _CONFIG_H +#define _CONFIG_H +#include "config.h" +#endif + +#include "list.h" +#include +#include +#include "glusterfs.h" + +struct _inode; +struct _dict; +struct _fd_ctx { + uint64_t key; + uint64_t value; +}; + +struct _fd { + pid_t pid; + int32_t flags; + int32_t refcount; + struct list_head inode_list; + struct _inode *inode; + struct _dict *ctx; + struct _fd_ctx *_ctx; +}; +typedef struct _fd fd_t; + +struct _fdtable { + int refcount; + uint32_t max_fds; + pthread_mutex_t lock; + fd_t **fds; +}; +typedef struct _fdtable fdtable_t; + +#include "logging.h" +#include "xlator.h" + +inline void +gf_fd_put (fdtable_t *fdtable, int32_t fd); + +fd_t * +gf_fd_fdptr_get (fdtable_t *fdtable, int64_t fd); + +fdtable_t * +gf_fd_fdtable_alloc (void); + +int32_t +gf_fd_unused_get (fdtable_t *fdtable, fd_t *fdptr); + +int32_t +gf_fd_unused_get2 (fdtable_t *fdtable, fd_t *fdptr, int32_t fd); + +void +gf_fd_fdtable_destroy (fdtable_t *fdtable); + +fd_t * +fd_ref (fd_t *fd); + +void +fd_unref (fd_t *fd); + +fd_t * +fd_create (struct _inode *inode, pid_t pid); + +fd_t * +fd_lookup (struct _inode *inode, pid_t pid); + +uint8_t +fd_list_empty (struct _inode *inode); + +fd_t * +fd_bind (fd_t *fd); + +int +fd_ctx_set (fd_t *fd, xlator_t *xlator, uint64_t value); + +int +fd_ctx_get (fd_t *fd, xlator_t *xlator, uint64_t *value); + +int +fd_ctx_del (fd_t *fd, xlator_t *xlator, uint64_t *value); + +#endif /* _FD_H */ diff --git a/libglusterfs/src/gf-dirent.c b/libglusterfs/src/gf-dirent.c new file mode 100644 index 000000000..28d1ab425 --- /dev/null +++ b/libglusterfs/src/gf-dirent.c @@ -0,0 +1,157 @@ +/* + Copyright (c) 2008 Z RESEARCH, Inc. + This file is part of GlusterFS. + + GlusterFS is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published + by the Free Software Foundation; either version 3 of the License, + or (at your option) any later version. 
+ + GlusterFS is distributed in the hope that it will be useful, but + WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program. If not, see + . +*/ + + +#ifndef _CONFIG_H +#define _CONFIG_H +#include "config.h" +#endif + + +#include +#include +#include +#include "compat.h" +#include "xlator.h" +#include "byte-order.h" + + +struct gf_dirent_nb { + uint64_t d_ino; + uint64_t d_off; + uint32_t d_len; + uint32_t d_type; + char d_name[0]; +} __attribute__((packed)); + + +int +gf_dirent_nb_size (gf_dirent_t *entries) +{ + return (sizeof (struct gf_dirent_nb) + strlen (entries->d_name) + 1); +} + + +gf_dirent_t * +gf_dirent_for_name (const char *name) +{ + gf_dirent_t *gf_dirent = NULL; + + /* TODO: use mem-pool */ + gf_dirent = CALLOC (gf_dirent_size (name), 1); + if (!gf_dirent) + return NULL; + + INIT_LIST_HEAD (&gf_dirent->list); + strcpy (gf_dirent->d_name, name); + + gf_dirent->d_off = 0; + gf_dirent->d_ino = -1; + gf_dirent->d_type = 0; + gf_dirent->d_len = strlen (name); + + return gf_dirent; +} + + +void +gf_dirent_free (gf_dirent_t *entries) +{ + gf_dirent_t *entry = NULL; + gf_dirent_t *tmp = NULL; + + list_for_each_entry_safe (entry, tmp, &entries->list, list) { + list_del (&entry->list); + FREE (entry); + } +} + + +int +gf_dirent_serialize (gf_dirent_t *entries, char *buf, size_t buf_size) +{ + struct gf_dirent_nb *entry_nb = NULL; + gf_dirent_t *entry = NULL; + int size = 0; + int entry_size = 0; + + + list_for_each_entry (entry, &entries->list, list) { + entry_size = gf_dirent_nb_size (entry); + + if (buf && (size + entry_size <= buf_size)) { + entry_nb = (void *) (buf + size); + + entry_nb->d_ino = hton64 (entry->d_ino); + entry_nb->d_off = hton64 (entry->d_off); + entry_nb->d_len = hton32 (entry->d_len); + entry_nb->d_type = hton32 (entry->d_type); + 
+                        strcpy (entry_nb->d_name, entry->d_name);
+                }
+                size += entry_size;
+        }
+
+        return size;
+}
+
+
+/*
+ * Rebuild a list of directory entries from a serialized buffer.
+ *
+ * Each record in buf is a packed struct gf_dirent_nb header in network
+ * byte order followed by a NUL-terminated name.  Every decoded record
+ * is appended to entries->list.  Decoding stops at the first record
+ * whose name is not terminated inside the buffer, when fewer bytes
+ * remain than a header plus a one-character name, or when allocation
+ * fails.
+ *
+ * Returns the number of entries appended.
+ */
+int
+gf_dirent_unserialize (gf_dirent_t *entries, const char *buf, size_t buf_size)
+{
+        struct gf_dirent_nb *entry_nb = NULL;
+        int                  remaining_size = 0;
+        int                  least_dirent_size = 0;
+        int                  count = 0;
+        gf_dirent_t         *entry = NULL;
+        int                  entry_strlen = 0;
+        int                  entry_len = 0;
+        int                  name_max = 0;
+
+
+        remaining_size = buf_size;
+        least_dirent_size = (sizeof (struct gf_dirent_nb) + 2);
+
+        while (remaining_size >= least_dirent_size) {
+                entry_nb = (void *)(buf + (buf_size - remaining_size));
+
+                /* d_name starts after the fixed header, so only the
+                 * bytes left beyond the header may be scanned for the
+                 * terminator */
+                name_max = remaining_size - (int) sizeof (*entry_nb);
+
+                entry_strlen = strnlen (entry_nb->d_name, name_max);
+                if (entry_strlen == name_max) {
+                        /* name is not terminated within the buffer */
+                        break;
+                }
+
+                entry_len = sizeof (gf_dirent_t) + entry_strlen + 1;
+                entry = CALLOC (1, entry_len);
+                if (!entry) {
+                        break;
+                }
+
+                entry->d_ino  = ntoh64 (entry_nb->d_ino);
+                entry->d_off  = ntoh64 (entry_nb->d_off);
+                entry->d_len  = ntoh32 (entry_nb->d_len);
+                entry->d_type = ntoh32 (entry_nb->d_type);
+                strcpy (entry->d_name, entry_nb->d_name);
+
+                list_add_tail (&entry->list, &entries->list);
+
+                /* advance past the header, the name and its NUL */
+                remaining_size -= (sizeof (*entry_nb) + entry_strlen + 1);
+                count++;
+        }
+
+        return count;
+}
diff --git a/libglusterfs/src/gf-dirent.h b/libglusterfs/src/gf-dirent.h
new file mode 100644
index 000000000..fa0a8a625
--- /dev/null
+++ b/libglusterfs/src/gf-dirent.h
@@ -0,0 +1,60 @@
+/*
+  Copyright (c) 2008 Z RESEARCH, Inc.
+  This file is part of GlusterFS.
+
+  GlusterFS is free software; you can redistribute it and/or modify
+  it under the terms of the GNU General Public License as published
+  by the Free Software Foundation; either version 3 of the License,
+  or (at your option) any later version.
+
+  GlusterFS is distributed in the hope that it will be useful, but
+  WITHOUT ANY WARRANTY; without even the implied warranty of
+  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+  General Public License for more details.
+ + You should have received a copy of the GNU General Public License + along with this program. If not, see + . +*/ + + +#ifndef _GF_DIRENT_H +#define _GF_DIRENT_H + +#ifndef _CONFIG_H +#define _CONFIG_H +#include "config.h" +#endif + +#define gf_dirent_size(name) (sizeof (gf_dirent_t) + strlen (name) + 1) + +struct _dir_entry_t { + struct _dir_entry_t *next; + char *name; + char *link; + struct stat buf; +}; + + +struct _gf_dirent_t { + union { + struct list_head list; + struct { + struct _gf_dirent_t *next; + struct _gf_dirent_t *prev; + }; + }; + uint64_t d_ino; + uint64_t d_off; + uint32_t d_len; + uint32_t d_type; + char d_name[0]; +}; + + +gf_dirent_t *gf_dirent_for_name (const char *name); +void gf_dirent_free (gf_dirent_t *entries); +int gf_dirent_serialize (gf_dirent_t *entries, char *buf, size_t size); +int gf_dirent_unserialize (gf_dirent_t *entries, const char *buf, size_t size); + +#endif /* _GF_DIRENT_H */ diff --git a/libglusterfs/src/glusterfs.h b/libglusterfs/src/glusterfs.h new file mode 100644 index 000000000..76891f5b0 --- /dev/null +++ b/libglusterfs/src/glusterfs.h @@ -0,0 +1,277 @@ +/* + Copyright (c) 2006, 2007, 2008 Z RESEARCH, Inc. + This file is part of GlusterFS. + + GlusterFS is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published + by the Free Software Foundation; either version 3 of the License, + or (at your option) any later version. + + GlusterFS is distributed in the hope that it will be useful, but + WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program. If not, see + . 
+*/ + +#ifndef _GLUSTERFS_H +#define _GLUSTERFS_H + +#ifndef _CONFIG_H +#define _CONFIG_H +#include "config.h" +#endif + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "list.h" +#include "logging.h" + +#define GF_YES 1 +#define GF_NO 0 + +#ifndef O_LARGEFILE +/* savannah bug #20053, patch for compiling on darwin */ +#define O_LARGEFILE 0 +#endif + +#ifndef O_DIRECT +/* savannah bug #20050, #20052 */ +#define O_DIRECT 0 /* From asm/fcntl.h */ +#endif + +#ifndef O_DIRECTORY +/* FreeBSD does not need O_DIRECTORY */ +#define O_DIRECTORY 0 +#endif + +#define ZR_FILE_CONTENT_STR "glusterfs.file." +#define ZR_FILE_CONTENT_STRLEN 15 + +#define GLUSTERFS_OPEN_FD_COUNT "glusterfs.open-fd-count" + +#define ZR_FILE_CONTENT_REQUEST(key) (!strncmp(key, ZR_FILE_CONTENT_STR, \ + ZR_FILE_CONTENT_STRLEN)) + +/* TODO: Should we use PATH-MAX? On some systems it may save space */ +#define ZR_PATH_MAX 4096 + +/* This is used as the maximum permitted filename length over FS. + * If the backend FS supports higher than this, it should be changed. 
+ */ +#define ZR_FILENAME_MAX 256 + + +/* NOTE: add members ONLY at the end (just before _MAXVALUE) */ +typedef enum { + GF_FOP_STAT, /* 0 */ + GF_FOP_READLINK, /* 1 */ + GF_FOP_MKNOD, /* 2 */ + GF_FOP_MKDIR, + GF_FOP_UNLINK, + GF_FOP_RMDIR, /* 5 */ + GF_FOP_SYMLINK, + GF_FOP_RENAME, + GF_FOP_LINK, + GF_FOP_CHMOD, + GF_FOP_CHOWN, /* 10 */ + GF_FOP_TRUNCATE, + GF_FOP_OPEN, + GF_FOP_READ, + GF_FOP_WRITE, + GF_FOP_STATFS, /* 15 */ + GF_FOP_FLUSH, + GF_FOP_FSYNC, + GF_FOP_SETXATTR, + GF_FOP_GETXATTR, + GF_FOP_REMOVEXATTR,/* 20 */ + GF_FOP_OPENDIR, + GF_FOP_GETDENTS, + GF_FOP_FSYNCDIR, + GF_FOP_ACCESS, + GF_FOP_CREATE, /* 25 */ + GF_FOP_FTRUNCATE, + GF_FOP_FSTAT, + GF_FOP_LK, + GF_FOP_UTIMENS, + GF_FOP_FCHMOD, /* 30 */ + GF_FOP_FCHOWN, + GF_FOP_LOOKUP, + GF_FOP_SETDENTS, + GF_FOP_READDIR, + GF_FOP_INODELK, /* 35 */ + GF_FOP_FINODELK, + GF_FOP_ENTRYLK, + GF_FOP_FENTRYLK, + GF_FOP_CHECKSUM, + GF_FOP_XATTROP, /* 40 */ + GF_FOP_FXATTROP, + GF_FOP_MAXVALUE, +} glusterfs_fop_t; + +/* NOTE: add members ONLY at the end (just before _MAXVALUE) */ +typedef enum { + GF_MOP_SETVOLUME, /* 0 */ + GF_MOP_GETVOLUME, /* 1 */ + GF_MOP_STATS, + GF_MOP_SETSPEC, + GF_MOP_GETSPEC, + GF_MOP_PING, + GF_MOP_MAXVALUE /* 5 */ +} glusterfs_mop_t; + +typedef enum { + GF_CBK_FORGET, /* 0 */ + GF_CBK_RELEASE, /* 1 */ + GF_CBK_RELEASEDIR, /* 2 */ + GF_CBK_MAXVALUE /* 3 */ +} glusterfs_cbk_t; + +typedef enum { + GF_OP_TYPE_FOP_REQUEST = 1, + GF_OP_TYPE_MOP_REQUEST, + GF_OP_TYPE_CBK_REQUEST, + GF_OP_TYPE_FOP_REPLY, + GF_OP_TYPE_MOP_REPLY, + GF_OP_TYPE_CBK_REPLY +} glusterfs_op_type_t; + +/* NOTE: all the miscellaneous flags used by GlusterFS should be listed here */ +typedef enum { + GF_LK_GETLK = 0, + GF_LK_SETLK, + GF_LK_SETLKW, +} glusterfs_lk_cmds_t; + +typedef enum { + GF_LK_F_RDLCK = 0, + GF_LK_F_WRLCK, + GF_LK_F_UNLCK +} glusterfs_lk_types_t; + +typedef enum { + GF_LOCK_POSIX, + GF_LOCK_INTERNAL +} gf_lk_domain_t; + +typedef enum { + ENTRYLK_LOCK, + ENTRYLK_UNLOCK, + ENTRYLK_LOCK_NB +} 
entrylk_cmd; + +typedef enum { + ENTRYLK_RDLCK, + ENTRYLK_WRLCK +} entrylk_type; + +typedef enum { + GF_GET_ALL = 1, + GF_GET_DIR_ONLY, + GF_GET_SYMLINK_ONLY, + GF_GET_REGULAR_FILES_ONLY, +} glusterfs_getdents_flags_t; + +typedef enum { + GF_XATTROP_ADD_ARRAY, +} gf_xattrop_flags_t; + +#define GF_SET_IF_NOT_PRESENT 0x1 /* default behaviour */ +#define GF_SET_OVERWRITE 0x2 /* Overwrite with the buf given */ +#define GF_SET_DIR_ONLY 0x4 +#define GF_SET_EPOCH_TIME 0x8 /* used by afr dir lookup selfheal */ + + +struct _xlator_cmdline_option { + struct list_head cmd_args; + char *volume; + char *key; + char *value; +}; +typedef struct _xlator_cmdline_option xlator_cmdline_option_t; + +struct _cmd_args { + /* basic options */ + char *volfile_server; + char *volume_file; + gf_loglevel_t log_level; + char *log_file; + /* advanced options */ + uint32_t volfile_server_port; + char *volfile_server_transport; + char *pid_file; + int no_daemon_mode; + char *run_id; + int debug_mode; + struct list_head xlator_options; /* list of xlator_option_t */ + + /* fuse options */ + int fuse_direct_io_mode_flag; + unsigned int fuse_entry_timeout; + unsigned int fuse_attribute_timeout; + char *volume_name; + int non_local; /* Used only by darwin os, + used for '-o local' option */ + char *icon_name; /* This string will appear as + Desktop icon name when mounted + on darwin */ + int fuse_nodev; + int fuse_nosuid; + + /* key args */ + char *mount_point; + char *volfile_id; +}; +typedef struct _cmd_args cmd_args_t; + +struct _glusterfs_ctx { + cmd_args_t cmd_args; + char *process_uuid; + FILE *specfp; + FILE *pidfp; + char fin; + void *timer; + void *ib; + void *pool; + void *graph; + void *top; /* either fuse or server protocol */ + void *event_pool; + pthread_mutex_t lock; + int xl_count; +}; + +typedef struct _glusterfs_ctx glusterfs_ctx_t; + +typedef enum { + GF_EVENT_PARENT_UP = 1, + GF_EVENT_POLLIN, + GF_EVENT_POLLOUT, + GF_EVENT_POLLERR, + GF_EVENT_CHILD_UP, + GF_EVENT_CHILD_DOWN, + 
GF_EVENT_CHILD_CONNECTING, + GF_EVENT_TRANSPORT_CLEANUP, + GF_EVENT_TRANSPORT_CONNECTED, +} glusterfs_event_t; + +#define GF_MUST_CHECK __attribute__((warn_unused_result)) + +#endif /* _GLUSTERFS_H */ diff --git a/libglusterfs/src/hashfn.c b/libglusterfs/src/hashfn.c new file mode 100644 index 000000000..edc49678f --- /dev/null +++ b/libglusterfs/src/hashfn.c @@ -0,0 +1,89 @@ +/* + Copyright (c) 2006, 2007, 2008 Z RESEARCH, Inc. + This file is part of GlusterFS. + + GlusterFS is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published + by the Free Software Foundation; either version 3 of the License, + or (at your option) any later version. + + GlusterFS is distributed in the hope that it will be useful, but + WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program. If not, see + . +*/ + +#include +#include + +#ifndef _CONFIG_H +#define _CONFIG_H +#include "config.h" +#endif + +#include "hashfn.h" + +#define get16bits(d) (*((const uint16_t *) (d))) + +/* + This is apparently the "fastest hash function for strings". 
+ Written by Paul Hsieh +*/ + +/* In any case make sure, you return 1 */ + +uint32_t SuperFastHash (const char * data, int32_t len) { + uint32_t hash = len, tmp; + int32_t rem; + + if (len <= 1 || data == NULL) return 1; + + + for (;len > 0; len--) { + hash ^= data[len]; + + return hash; + } + + rem = len & 3; + len >>= 2; + + /* Main loop */ + for (;len > 0; len--) { + hash += get16bits (data); + tmp = (get16bits (data+2) << 11) ^ hash; + hash = (hash << 16) ^ tmp; + data += 2*sizeof (uint16_t); + hash += hash >> 11; + } + + /* Handle end cases */ + switch (rem) { + case 3: hash += get16bits (data); + hash ^= hash << 16; + hash ^= data[sizeof (uint16_t)] << 18; + hash += hash >> 11; + break; + case 2: hash += get16bits (data); + hash ^= hash << 11; + hash += hash >> 17; + break; + case 1: hash += *data; + hash ^= hash << 10; + hash += hash >> 1; + } + + /* Force "avalanching" of final 127 bits */ + hash ^= hash << 3; + hash += hash >> 5; + hash ^= hash << 4; + hash += hash >> 17; + hash ^= hash << 25; + hash += hash >> 6; + + return hash; +} diff --git a/libglusterfs/src/hashfn.h b/libglusterfs/src/hashfn.h new file mode 100644 index 000000000..13673f6e6 --- /dev/null +++ b/libglusterfs/src/hashfn.h @@ -0,0 +1,33 @@ +/* + Copyright (c) 2006, 2007, 2008 Z RESEARCH, Inc. + This file is part of GlusterFS. + + GlusterFS is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published + by the Free Software Foundation; either version 3 of the License, + or (at your option) any later version. + + GlusterFS is distributed in the hope that it will be useful, but + WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program. If not, see + . 
+*/ + +#ifndef __HASHFN_H__ +#define __HASHFN_H__ + +#ifndef _CONFIG_H +#define _CONFIG_H +#include "config.h" +#endif + +#include +#include + +uint32_t SuperFastHash (const char * data, int32_t len); + +#endif /* __HASHFN_H__ */ diff --git a/libglusterfs/src/inode.c b/libglusterfs/src/inode.c new file mode 100644 index 000000000..6c527fc75 --- /dev/null +++ b/libglusterfs/src/inode.c @@ -0,0 +1,1174 @@ +/* + Copyright (c) 2007, 2008 Z RESEARCH, Inc. + This file is part of GlusterFS. + + GlusterFS is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published + by the Free Software Foundation; either version 3 of the License, + or (at your option) any later version. + + GlusterFS is distributed in the hope that it will be useful, but + WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program. If not, see + . 
+*/ + +#ifndef _CONFIG_H +#define _CONFIG_H +#include "config.h" +#endif + +#include "inode.h" +#include "common-utils.h" +#include +#include +#include +#include "list.h" +#include +#include + +/* TODO: + move latest accessed dentry to list_head of inode +*/ + +static inode_t * +__inode_unref (inode_t *inode); + +static int +inode_table_prune (inode_table_t *table); + +static int +hash_name (ino_t par, + const char *name, + int mod) +{ + int hash = 0; + int ret = 0; + + hash = *name; + if (hash) { + for (name += 1; *name != '\0'; name++) { + hash = (hash << 5) - hash + *name; + } + } + ret = (hash + par) % mod; + + return ret; +} + + +static int +hash_inode (ino_t ino, + int mod) +{ + int hash = 0; + + hash = ino % mod; + + return hash; +} + + +static void +__dentry_hash (dentry_t *dentry) +{ + inode_table_t *table = NULL; + int hash = 0; + + table = dentry->inode->table; + hash = hash_name (dentry->parent->ino, dentry->name, + table->hashsize); + + list_del_init (&dentry->hash); + list_add (&dentry->hash, &table->name_hash[hash]); + + list_del_init (&dentry->parent_list); + list_add (&dentry->parent_list, &dentry->parent->child_list); + + gf_log (table->name, GF_LOG_DEBUG, + "dentry hashed %s (%"PRId64")", + dentry->name, dentry->inode->ino); +} + + +static int +__is_dentry_hashed (dentry_t *dentry) +{ + return !list_empty (&dentry->hash); +} + + +static void +__dentry_unhash (dentry_t *dentry) +{ + list_del_init (&dentry->hash); + + gf_log (dentry->inode->table->name, GF_LOG_DEBUG, + "dentry unhashed %s (%"PRId64")", + dentry->name, dentry->inode->ino); +} + + +static void +__dentry_unset (dentry_t *dentry) +{ + __dentry_unhash (dentry); + + list_del_init (&dentry->inode_list); + + gf_log (dentry->inode->table->name, GF_LOG_DEBUG, + "unset dentry %s (%"PRId64")", + dentry->name, dentry->inode->ino); + + if (dentry->name) + FREE (dentry->name); + + if (dentry->parent) { + list_del_init (&dentry->parent_list); + __inode_unref (dentry->parent); + dentry->parent = 
NULL; + } + + FREE (dentry); +} + + +static void +__inode_unhash (inode_t *inode) +{ + list_del_init (&inode->hash); +} + + +static int +__is_inode_hashed (inode_t *inode) +{ + return !list_empty (&inode->hash); +} + + +static void +__inode_hash (inode_t *inode) +{ + inode_table_t *table = NULL; + int hash = 0; + + table = inode->table; + hash = hash_inode (inode->ino, table->hashsize); + + list_del_init (&inode->hash); + list_add (&inode->hash, &table->inode_hash[hash]); +} + + +static inode_t * +__inode_search (inode_table_t *table, + ino_t ino) +{ + int hash = 0; + inode_t *inode = NULL; + inode_t *tmp = NULL; + + hash = hash_inode (ino, table->hashsize); + + list_for_each_entry (tmp, &table->inode_hash[hash], hash) { + if (tmp->ino == ino) { + inode = tmp; + break; + } + } + + return inode; +} + + +static dentry_t * +__dentry_search_for_inode (inode_t *inode, + ino_t par, + const char *name) +{ + dentry_t *dentry = NULL; + dentry_t *tmp = NULL; + + list_for_each_entry (tmp, &inode->dentry_list, inode_list) { + if (tmp->parent->ino == par && !strcmp (tmp->name, name)) { + dentry = tmp; + break; + } + } + + return dentry; +} + + +static dentry_t * +__dentry_search (inode_table_t *table, + ino_t par, + const char *name) +{ + int hash = 0; + dentry_t *dentry = NULL; + dentry_t *tmp = NULL; + + hash = hash_name (par, name, table->hashsize); + + list_for_each_entry (tmp, &table->name_hash[hash], hash) { + if (tmp->parent->ino == par && !strcmp (tmp->name, name)) { + dentry = tmp; + break; + } + } + + return dentry; +} + + +static void +__inode_destroy (inode_t *inode) +{ + int index = 0; + data_pair_t *pair = NULL; + xlator_t *xl = NULL; + + if (!inode->ctx) { + goto noctx; + } + for (pair = inode->ctx->members_list; pair; pair = pair->next) { + /* notify all xlators which have a context */ + xl = xlator_search_by_name (inode->table->xl, pair->key); + + if (!xl) { + gf_log (inode->table->name, GF_LOG_CRITICAL, + "inode(%"PRId64")->ctx has invalid key(%s)", + 
inode->ino, pair->key); + continue; + } + + if (xl->cbks->forget) + xl->cbks->forget (xl, inode); + else + gf_log (inode->table->name, GF_LOG_CRITICAL, + "xlator(%s) in inode(%"PRId64") no FORGET fop", + xl->name, inode->ino); + } + dict_destroy (inode->ctx); + + if (!inode->_ctx) + goto noctx; + + for (index = 0; index < inode->table->xl->ctx->xl_count; index++) { + if (inode->_ctx[index].key) { + xl = (xlator_t *)(long)inode->_ctx[index].key; + if (xl->cbks->forget) + xl->cbks->forget (xl, inode); + } + } + + FREE (inode->_ctx); +noctx: + + if (inode->ino) + gf_log (inode->table->name, GF_LOG_DEBUG, + "destroy inode(%"PRId64") [@%p]", inode->ino, inode); + + LOCK_DESTROY (&inode->lock); + // memset (inode, 0xb, sizeof (*inode)); + FREE (inode); +} + + +static void +__inode_activate (inode_t *inode) +{ + list_move (&inode->list, &inode->table->active); + inode->table->active_size++; + + gf_log (inode->table->name, GF_LOG_DEBUG, + "activating inode(%"PRId64"), lru=%d/%d active=%d purge=%d", + inode->ino, inode->table->lru_size, inode->table->lru_limit, + inode->table->active_size, inode->table->purge_size); +} + + +static void +__inode_passivate (inode_t *inode) +{ + dentry_t *dentry = NULL; + dentry_t *t = NULL; + inode_table_t *table = NULL; + + table = inode->table; + + list_move_tail (&inode->list, &inode->table->lru); + inode->table->lru_size++; + + gf_log (table->name, GF_LOG_DEBUG, + "passivating inode(%"PRId64") lru=%d/%d active=%d purge=%d", + inode->ino, table->lru_size, table->lru_limit, + table->active_size, table->purge_size); + + list_for_each_entry_safe (dentry, t, &inode->dentry_list, inode_list) { + if (!__is_dentry_hashed (dentry)) + __dentry_unset (dentry); + } +} + + +static void +__inode_retire (inode_t *inode) +{ + dentry_t *dentry = NULL; + dentry_t *t = NULL; + inode_table_t *table = NULL; + + table = inode->table; + + list_move_tail (&inode->list, &inode->table->purge); + inode->table->purge_size++; + + gf_log (table->name, GF_LOG_DEBUG, + 
"retiring inode(%"PRId64") lru=%d/%d active=%d purge=%d", + inode->ino, table->lru_size, table->lru_limit, + table->active_size, table->purge_size); + + __inode_unhash (inode); + assert (list_empty (&inode->child_list)); + + list_for_each_entry_safe (dentry, t, &inode->dentry_list, inode_list) { + __dentry_unset (dentry); + } +} + + +static inode_t * +__inode_unref (inode_t *inode) +{ + if (inode->ino == 1) + return inode; + + assert (inode->ref); + + --inode->ref; + + if (!inode->ref) { + inode->table->active_size--; + + if (inode->nlookup && __is_inode_hashed (inode)) + __inode_passivate (inode); + else + __inode_retire (inode); + } + + return inode; +} + + +static inode_t * +__inode_ref (inode_t *inode) +{ + if (!inode->ref) { + inode->table->lru_size--; + __inode_activate (inode); + } + inode->ref++; + + return inode; +} + + +inode_t * +inode_unref (inode_t *inode) +{ + inode_table_t *table = NULL; + + table = inode->table; + + pthread_mutex_lock (&table->lock); + { + inode = __inode_unref (inode); + } + pthread_mutex_unlock (&table->lock); + + inode_table_prune (table); + + return inode; +} + + +inode_t * +inode_ref (inode_t *inode) +{ + inode_table_t *table = NULL; + + table = inode->table; + + pthread_mutex_lock (&table->lock); + { + inode = __inode_ref (inode); + } + pthread_mutex_unlock (&table->lock); + + return inode; +} + + +static dentry_t * +__dentry_create (inode_t *inode, + inode_t *parent, + const char *name) +{ + dentry_t *newd = NULL; + + newd = (void *) CALLOC (1, sizeof (*newd)); + + INIT_LIST_HEAD (&newd->inode_list); + INIT_LIST_HEAD (&newd->parent_list); + INIT_LIST_HEAD (&newd->hash); + + list_add (&newd->parent_list, &parent->child_list); + newd->parent = __inode_ref (parent); + newd->name = strdup (name); + + list_add (&newd->inode_list, &inode->dentry_list); + newd->inode = inode; + + return newd; +} + + +static inode_t * +__inode_create (inode_table_t *table) +{ + inode_t *newi = NULL; + + newi = (void *) CALLOC (1, sizeof (*newi)); + 
if (!newi) + return NULL; + + newi->table = table; + + LOCK_INIT (&newi->lock); + + INIT_LIST_HEAD (&newi->fd_list); + INIT_LIST_HEAD (&newi->list); + INIT_LIST_HEAD (&newi->hash); + INIT_LIST_HEAD (&newi->dentry_list); + INIT_LIST_HEAD (&newi->child_list); + + + list_add (&newi->list, &table->lru); + table->lru_size++; + + newi->_ctx = CALLOC (1, (sizeof (struct _inode_ctx) * + table->xl->ctx->xl_count)); + + newi->ctx = get_new_dict (); + gf_log (table->name, GF_LOG_DEBUG, + "create inode(%"PRId64")", newi->ino); + + return newi; +} + + +inode_t * +inode_new (inode_table_t *table) +{ + inode_t *inode = NULL; + + pthread_mutex_lock (&table->lock); + { + inode = __inode_create (table); + __inode_ref (inode); + } + pthread_mutex_unlock (&table->lock); + + return inode; +} + + +static inode_t * +__inode_lookup (inode_t *inode) +{ + inode->nlookup++; + + return inode; +} + + +static inode_t * +__inode_forget (inode_t *inode, uint64_t nlookup) +{ + assert (inode->nlookup >= nlookup); + + inode->nlookup -= nlookup; + + if (!nlookup) + inode->nlookup = 0; + + return inode; +} + + +inode_t * +inode_search (inode_table_t *table, + ino_t ino, + const char *name) +{ + inode_t *inode = NULL; + dentry_t *dentry = NULL; + + pthread_mutex_lock (&table->lock); + { + if (!name) { + inode = __inode_search (table, ino); + } else { + dentry = __dentry_search (table, ino, name); + + if (dentry) + inode = dentry->inode; + } + + if (inode) + __inode_ref (inode); + } + pthread_mutex_unlock (&table->lock); + + return inode; +} + + +static void +__copy_dentries (inode_t *oldi, inode_t *newi) +{ + dentry_t *dentry = NULL; + dentry_t *newd = NULL; + dentry_t *tmp = NULL; + + list_for_each_entry (dentry, &oldi->dentry_list, inode_list) { + tmp = __dentry_search_for_inode (newi, dentry->parent->ino, + dentry->name); + + if (!tmp) { + newd = __dentry_create (newi, dentry->parent, + dentry->name); + } else { + newd = tmp; + } + + if (__is_dentry_hashed (dentry)) { + __dentry_unhash (dentry); + 
__dentry_hash (newd); + } + } +} + + +static void +__adopt_children (inode_t *oldi, inode_t *newi) +{ + dentry_t *dentry = NULL; + + list_for_each_entry (dentry, &oldi->child_list, parent_list) { + assert (dentry->parent == oldi); + __inode_unref (dentry->parent); + dentry->parent = __inode_ref (newi); + } + + list_splice_init (&oldi->child_list, &newi->child_list); +} + + +static void +__inode_replace (inode_t *oldi, inode_t *newi) +{ + gf_log (oldi->table->name, GF_LOG_DEBUG, + "inode(%"PRId64") replaced (%"PRId64"", + oldi->ino, newi->ino); + + __copy_dentries (oldi, newi); + __adopt_children (oldi, newi); + + newi->nlookup = oldi->nlookup; + newi->generation = oldi->generation; + + oldi->nlookup = 0; + oldi->generation = 0; + + __inode_unhash (oldi); + + if (newi->ino == 1) + newi->table->root = newi; +} + + +static inode_t * +__inode_link (inode_t *inode, + inode_t *parent, + const char *name, + struct stat *stbuf) +{ + dentry_t *dentry = NULL; + dentry_t *old_dentry = NULL; + inode_t *old_inode = NULL; + inode_table_t *table = NULL; + + table = inode->table; + + if (inode->ino) + assert (inode->ino == stbuf->st_ino); + + inode->ino = stbuf->st_ino; + inode->st_mode = stbuf->st_mode; + + old_inode = __inode_search (table, stbuf->st_ino); + + if (old_inode && old_inode != inode) { + __inode_ref (old_inode); + __inode_replace (old_inode, inode); + __inode_unref (old_inode); + } + __inode_hash (inode); + + if (parent) { + dentry = __dentry_search_for_inode (inode, parent->ino, name); + if (!dentry) { + dentry = __dentry_create (inode, parent, name); + } + + old_dentry = __dentry_search (table, parent->ino, name); + if (old_dentry) { + __dentry_unhash (old_dentry); + } + + __dentry_hash (dentry); + } else if (inode->ino != 1) { + gf_log (table->name, GF_LOG_ERROR, + "child (%"PRId64") without a parent :O", inode->ino); + } + + return inode; +} + + +int +inode_link (inode_t *inode, + inode_t *parent, + const char *name, + struct stat *stbuf) +{ + inode_table_t 
*table = NULL; + + table = inode->table; + + pthread_mutex_lock (&table->lock); + { + inode = __inode_link (inode, parent, name, stbuf); + } + pthread_mutex_unlock (&table->lock); + + inode_table_prune (table); + + return 0; +} + + +int +inode_lookup (inode_t *inode) +{ + inode_table_t *table = NULL; + inode_t *lookup_inode = NULL; + + table = inode->table; + lookup_inode = inode; + + pthread_mutex_lock (&table->lock); + { + if (!__is_inode_hashed (inode)) { + lookup_inode = __inode_search (table, inode->ino); + } + + __inode_lookup (lookup_inode); + } + pthread_mutex_unlock (&table->lock); + + return 0; +} + + +int +inode_forget (inode_t *inode, uint64_t nlookup) +{ + inode_table_t *table = NULL; + inode_t *forget_inode = NULL; + + table = inode->table; + forget_inode = inode; + + pthread_mutex_lock (&table->lock); + { + if (!__is_inode_hashed (inode)) { + forget_inode = __inode_search (table, inode->ino); + } + + __inode_forget (forget_inode, nlookup); + } + pthread_mutex_unlock (&table->lock); + + inode_table_prune (table); + + return 0; +} + + +static void +__inode_unlink (inode_t *inode, + inode_t *parent, + const char *name) +{ + dentry_t *dentry = NULL; + + dentry = __dentry_search_for_inode (inode, parent->ino, name); + + /* dentry NULL for corrupted backend */ + if (dentry) + __dentry_unset (dentry); +} + + +void +inode_unlink (inode_t *inode, + inode_t *parent, + const char *name) +{ + inode_table_t *table = NULL; + inode_t *unlink_inode = NULL; + + table = inode->table; + unlink_inode = inode; + + pthread_mutex_lock (&table->lock); + { + if (!__is_inode_hashed (inode)) { + unlink_inode = __inode_search (table, inode->ino); + } + + __inode_unlink (unlink_inode, parent, name); + } + pthread_mutex_unlock (&table->lock); + + inode_table_prune (table); +} + + +int +inode_rename (inode_table_t *table, + inode_t *srcdir, + const char *srcname, + inode_t *dstdir, + const char *dstname, + inode_t *inode, + struct stat *stbuf) +{ + dentry_t *old_dst = NULL; + 
inode_t *rename_inode = NULL; + + rename_inode = inode; + + pthread_mutex_lock (&table->lock); + { + if (!__is_inode_hashed (inode)) { + rename_inode = __inode_search (table, inode->ino); + } + + old_dst = __dentry_search (table, dstdir->ino, dstname); + if (old_dst) + __dentry_unset (old_dst); + + __inode_unlink (rename_inode, srcdir, srcname); + __inode_link (rename_inode, dstdir, dstname, stbuf); + } + pthread_mutex_unlock (&table->lock); + + inode_table_prune (table); + + return 0; +} + + +static dentry_t * +__dentry_search_arbit (inode_t *inode) +{ + dentry_t *dentry = NULL; + dentry_t *trav = NULL; + + list_for_each_entry (trav, &inode->dentry_list, inode_list) { + if (__is_dentry_hashed (trav)) { + dentry = trav; + break; + } + } + + if (!dentry) { + list_for_each_entry (trav, &inode->dentry_list, inode_list) { + dentry = trav; + break; + } + } + + return dentry; +} + + +inode_t * +inode_parent (inode_t *inode, ino_t par, const char *name) +{ + inode_t *parent = NULL; + inode_table_t *table = NULL; + dentry_t *dentry = NULL; + + table = inode->table; + + pthread_mutex_lock (&table->lock); + { + if (par && name) { + dentry = __dentry_search_for_inode (inode, par, name); + } else { + dentry = __dentry_search_arbit (inode); + } + + if (dentry) + parent = __inode_ref (dentry->parent); + } + pthread_mutex_unlock (&table->lock); + + return parent; +} + + +int32_t +inode_path (inode_t *inode, + const char *name, + char **bufp) +{ + inode_table_t *table = NULL; + dentry_t *trav = NULL; + size_t i = 0, size = 0; + int64_t ret = 0; + int len = 0; + char *buf = NULL; + + table = inode->table; + + pthread_mutex_lock (&table->lock); + { + for (trav = __dentry_search_arbit (inode); trav; + trav = __dentry_search_arbit (trav->parent)) { + i ++; /* "/" */ + i += strlen (trav->name); + } + + if ((inode->ino != 1) && + (i == 0)) { + gf_log (table->name, GF_LOG_DEBUG, + "no dentry for non-root inode %"PRId64, + inode->ino); + ret = -ENOENT; + goto unlock; + } + + if (name) { + 
i++; + i += strlen (name); + } + + ret = i; + size = i + 1; + buf = CALLOC (size, sizeof (char)); + if (buf) { + + buf[size - 1] = 0; + + if (name) { + len = strlen (name); + strncpy (buf + (i - len), name, len); + buf[i-len-1] = '/'; + i -= (len + 1); + } + + for (trav = __dentry_search_arbit (inode); trav; + trav = __dentry_search_arbit (trav->parent)) { + len = strlen (trav->name); + strncpy (buf + (i - len), trav->name, len); + buf[i-len-1] = '/'; + i -= (len + 1); + } + *bufp = buf; + } else { + gf_log (table->name, GF_LOG_ERROR, + "out of memory"); + ret = -ENOMEM; + } + } +unlock: + pthread_mutex_unlock (&table->lock); + + if (inode->ino == 1 && !name) { + ret = 1; + if (buf) { + FREE (buf); + } + buf = CALLOC (ret + 1, sizeof (char)); + if (buf) { + strcpy (buf, "/"); + *bufp = buf; + } else { + gf_log (table->name, GF_LOG_ERROR, + "out of memory"); + ret = -ENOMEM; + } + } + + return ret; +} + +static int +inode_table_prune (inode_table_t *table) +{ + int ret = 0; + struct list_head purge = {0, }; + inode_t *del = NULL; + inode_t *tmp = NULL; + inode_t *entry = NULL; + + + INIT_LIST_HEAD (&purge); + + pthread_mutex_lock (&table->lock); + { + while (table->lru_limit + && table->lru_size > (table->lru_limit)) { + + entry = list_entry (table->lru.next, inode_t, list); + + table->lru_size--; + __inode_retire (entry); + + ret++; + } + + list_splice_init (&table->purge, &purge); + table->purge_size = 0; + } + pthread_mutex_unlock (&table->lock); + + { + list_for_each_entry_safe (del, tmp, &purge, list) { + list_del_init (&del->list); + __inode_forget (del, 0); + __inode_destroy (del); + } + } + + return ret; +} + + +static void +__inode_table_init_root (inode_table_t *table) +{ + inode_t *root = NULL; + struct stat stbuf = {0, }; + + root = __inode_create (table); + + stbuf.st_ino = 1; + stbuf.st_mode = S_IFDIR|0755; + + __inode_link (root, NULL, NULL, &stbuf); + table->root = root; +} + + +inode_table_t * +inode_table_new (size_t lru_limit, xlator_t *xl) +{ + 
inode_table_t *new = NULL; + int i = 0; + + + new = (void *)calloc (1, sizeof (*new)); + if (!new) + return NULL; + + gf_log (xl->name, GF_LOG_DEBUG, + "creating new inode table with lru_limit=%"GF_PRI_SIZET"", lru_limit); + + new->xl = xl; + + new->lru_limit = lru_limit; + + new->hashsize = 14057; /* TODO: Random Number?? */ + + new->inode_hash = (void *)calloc (new->hashsize, + sizeof (struct list_head)); + if (!new->inode_hash) { + FREE (new); + return NULL; + } + + new->name_hash = (void *)calloc (new->hashsize, + sizeof (struct list_head)); + if (!new->name_hash) { + FREE (new->inode_hash); + FREE (new); + return NULL; + } + + /* every bucket of both hash tables starts out as an empty list */ + for (i=0; i<new->hashsize; i++) { + INIT_LIST_HEAD (&new->inode_hash[i]); + } + + + for (i=0; i<new->hashsize; i++) { + INIT_LIST_HEAD (&new->name_hash[i]); + } + + INIT_LIST_HEAD (&new->active); + INIT_LIST_HEAD (&new->lru); + INIT_LIST_HEAD (&new->purge); + + asprintf (&new->name, "%s/inode", xl->name); + + __inode_table_init_root (new); + + pthread_mutex_init (&new->lock, NULL); + + return new; +} + + +inode_t * +inode_from_path (inode_table_t *itable, const char *path) +{ + inode_t *inode = NULL; + inode_t *parent = NULL; + inode_t *root = NULL; + inode_t *curr = NULL; + char *pathname = NULL; + char *component = NULL, *next_component = NULL; + char *strtokptr = NULL; + + /* top-down approach */ + root = itable->root; + parent = inode_ref (root); + pathname = strdup (path); + component = strtok_r (pathname, "/", &strtokptr); + + if (component == NULL) + /* root inode */ + inode = inode_ref (parent); + + while (component) { + curr = inode_search (itable, parent->ino, component); + + if (curr == NULL) { + component = strtok_r (NULL, "/", &strtokptr); + break; + } + + next_component = strtok_r (NULL, "/", &strtokptr); + + if (next_component) { + inode_unref (parent); + parent = curr; + curr = NULL; + } else { + inode = curr; + } + + component = next_component; + } + + if (parent) + inode_unref (parent); + + if (pathname) + free (pathname); + +
return inode; +} + +int +inode_ctx_put (inode_t *inode, xlator_t *xlator, uint64_t value) +{ + int index = 0; + + if (!inode || !xlator) + return -1; + + for (index = 0; index < xlator->ctx->xl_count; index++) { + if (!inode->_ctx[index].key || + (inode->_ctx[index].key == (uint64_t)(long)xlator)) + break; + } + + if (index == xlator->ctx->xl_count) + return -1; + + inode->_ctx[index].key = (uint64_t)(long) xlator; + inode->_ctx[index].value = value; + + return 0; +} + +int +inode_ctx_get (inode_t *inode, xlator_t *xlator, uint64_t *value) +{ + int index = 0; + + if (!inode || !xlator) + return -1; + + for (index = 0; index < xlator->ctx->xl_count; index++) { + if (inode->_ctx[index].key == (uint64_t)(long)xlator) + break; + } + + if (index == xlator->ctx->xl_count) + return -1; + + if (value) + *value = inode->_ctx[index].value; + + return 0; +} + + +int +inode_ctx_del (inode_t *inode, xlator_t *xlator, uint64_t *value) +{ + int index = 0; + + if (!inode || !xlator) + return -1; + + for (index = 0; index < xlator->ctx->xl_count; index++) { + if (inode->_ctx[index].key == (uint64_t)(long)xlator) + break; + } + + if (index == xlator->ctx->xl_count) + return -1; + + if (value) + *value = inode->_ctx[index].value; + + inode->_ctx[index].key = 0; + inode->_ctx[index].value = 0; + + return 0; +} diff --git a/libglusterfs/src/inode.h b/libglusterfs/src/inode.h new file mode 100644 index 000000000..67490f0ee --- /dev/null +++ b/libglusterfs/src/inode.h @@ -0,0 +1,160 @@ +/* + Copyright (c) 2007, 2008 Z RESEARCH, Inc. + This file is part of GlusterFS. + + GlusterFS is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published + by the Free Software Foundation; either version 3 of the License, + or (at your option) any later version. 
+ + GlusterFS is distributed in the hope that it will be useful, but + WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program. If not, see + . +*/ + +#ifndef _INODE_H +#define _INODE_H + +#ifndef _CONFIG_H +#define _CONFIG_H +#include "config.h" +#endif + +#include +#include + +struct _inode_table; +typedef struct _inode_table inode_table_t; + +struct _inode; +typedef struct _inode inode_t; + +struct _dentry; +typedef struct _dentry dentry_t; + +#include "list.h" +#include "xlator.h" + + +struct _inode_table { + pthread_mutex_t lock; + size_t hashsize; /* bucket size of inode hash and dentry hash */ + char *name; /* name of the inode table, just for gf_log() */ + inode_t *root; /* root directory inode, with number 1 */ + xlator_t *xl; /* xlator to be called to do purge */ + uint32_t lru_limit; /* maximum LRU cache size */ + struct list_head *inode_hash; /* buckets for inode hash table */ + struct list_head *name_hash; /* buckets for dentry hash table */ + struct list_head active; /* list of inodes currently active (in an fop) */ + uint32_t active_size; /* count of inodes in active list */ + struct list_head lru; /* list of inodes recently used. 
+ lru.next most recent */ + uint32_t lru_size; /* count of inodes in lru list */ + struct list_head purge; /* list of inodes to be purged soon */ + uint32_t purge_size; /* count of inodes in purge list */ +}; + + +struct _dentry { + struct list_head inode_list; /* list of dentries of inode */ + struct list_head hash; /* hash table pointers */ + struct list_head parent_list; /* list of dentries under the parent */ + inode_t *inode; /* inode of this directory entry */ + char *name; /* name of the directory entry */ + inode_t *parent; /* directory of the entry */ +}; + +//#define ZR_INODE_CTX_VALUE_LEN 2 +struct _inode_ctx { + uint64_t key; + uint64_t value; + //uint64_t value[ZR_INODE_CTX_VALUE_LEN]; +}; + +struct _inode { + inode_table_t *table; /* the table this inode belongs to */ + gf_lock_t lock; + uint64_t nlookup; + uint64_t generation; + uint32_t ref; /* reference count on this inode */ + ino_t ino; /* inode number in the storage (persistent) */ + dict_t *ctx; /* per xlator private */ + mode_t st_mode; /* what kind of file */ + struct list_head fd_list; /* list of open files on this inode */ + struct list_head dentry_list; /* list of directory entries for this inode */ + struct list_head child_list; /* list of directory entries under this inode */ + struct list_head hash; /* hash table pointers */ + struct list_head list; /* active/lru/purge */ + + struct _inode_ctx *_ctx; /* replacement for dict_t *(inode->ctx) */ +}; + + +inode_table_t * +inode_table_new (size_t lru_limit, xlator_t *xl); + +inode_t * +inode_new (inode_table_t *table); + +inode_t * +inode_search (inode_table_t *table, ino_t ino, const char *name); + +int +inode_link (inode_t *inode, inode_t *parent, + const char *name, struct stat *stbuf); + +void +inode_unlink (inode_t *inode, + inode_t *parent, + const char *name); + +inode_t * +inode_parent (inode_t *inode, ino_t par, const char *name); + +inode_t * +inode_ref (inode_t *inode); + +inode_t * +inode_unref (inode_t *inode); + +int 
+inode_lookup (inode_t *inode); + +int +inode_forget (inode_t *inode, + uint64_t nlookup); + +int +inode_rename (inode_table_t *table, + inode_t *olddir, + const char *oldname, + inode_t *newdir, + const char *newname, + inode_t *inode, + struct stat *stbuf); + + +int32_t +inode_path (inode_t *inode, + const char *name, + char **bufp); + +inode_t * +inode_from_path (inode_table_t *table, + const char *path); + +int +inode_ctx_put (inode_t *inode, xlator_t *xlator, uint64_t value); + +int +inode_ctx_get (inode_t *inode, xlator_t *xlator, uint64_t *value); + +int +inode_ctx_del (inode_t *inode, xlator_t *xlator, uint64_t *value); + +#endif /* _INODE_H */ diff --git a/libglusterfs/src/list.h b/libglusterfs/src/list.h new file mode 100644 index 000000000..0d862f09c --- /dev/null +++ b/libglusterfs/src/list.h @@ -0,0 +1,154 @@ +/* + Copyright (c) 2008 Z RESEARCH, Inc. + This file is part of GlusterFS. + + GlusterFS is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published + by the Free Software Foundation; either version 3 of the License, + or (at your option) any later version. + + GlusterFS is distributed in the hope that it will be useful, but + WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program. If not, see + . 
+*/ + +#ifndef _LLIST_H +#define _LLIST_H + + +struct list_head { + struct list_head *next; + struct list_head *prev; +}; + + +#define INIT_LIST_HEAD(head) do { \ + (head)->next = (head)->prev = head; \ + } while (0) + + +static inline void +list_add (struct list_head *new, struct list_head *head) +{ + new->prev = head; + new->next = head->next; + + new->prev->next = new; + new->next->prev = new; +} + + +static inline void +list_add_tail (struct list_head *new, struct list_head *head) +{ + new->next = head; + new->prev = head->prev; + + new->prev->next = new; + new->next->prev = new; +} + + +static inline void +list_del (struct list_head *old) +{ + old->prev->next = old->next; + old->next->prev = old->prev; + + old->next = (void *)0xbabebabe; + old->prev = (void *)0xcafecafe; +} + + +static inline void +list_del_init (struct list_head *old) +{ + old->prev->next = old->next; + old->next->prev = old->prev; + + old->next = old; + old->prev = old; +} + + +static inline void +list_move (struct list_head *list, struct list_head *head) +{ + list_del (list); + list_add (list, head); +} + + +static inline void +list_move_tail (struct list_head *list, struct list_head *head) +{ + list_del (list); + list_add_tail (list, head); +} + + +static inline int +list_empty (struct list_head *head) +{ + return (head->next == head); +} + + +static inline void +__list_splice (struct list_head *list, struct list_head *head) +{ + (list->prev)->next = (head->next); + (head->next)->prev = (list->prev); + + (head)->next = (list->next); + (list->next)->prev = (head); +} + + +static inline void +list_splice (struct list_head *list, struct list_head *head) +{ + if (list_empty (list)) + return; + + __list_splice (list, head); +} + + +static inline void +list_splice_init (struct list_head *list, struct list_head *head) +{ + if (list_empty (list)) + return; + + __list_splice (list, head); + INIT_LIST_HEAD (list); +} + + +#define list_entry(ptr, type, member) \ + ((type *)((char *)(ptr)-(unsigned 
long)(&((type *)0)->member))) + + +#define list_for_each(pos, head) \ + for (pos = (head)->next; pos != (head); pos = pos->next) + + +#define list_for_each_entry(pos, head, member) \ + for (pos = list_entry((head)->next, typeof(*pos), member); \ + &pos->member != (head); \ + pos = list_entry(pos->member.next, typeof(*pos), member)) + + +#define list_for_each_entry_safe(pos, n, head, member) \ + for (pos = list_entry((head)->next, typeof(*pos), member), \ + n = list_entry(pos->member.next, typeof(*pos), member); \ + &pos->member != (head); \ + pos = n, n = list_entry(n->member.next, typeof(*n), member)) + +#endif /* _LLIST_H */ diff --git a/libglusterfs/src/locking.h b/libglusterfs/src/locking.h new file mode 100644 index 000000000..c5c5163d1 --- /dev/null +++ b/libglusterfs/src/locking.h @@ -0,0 +1,49 @@ +/* + Copyright (c) 2008 Z RESEARCH, Inc. + This file is part of GlusterFS. + + GlusterFS is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published + by the Free Software Foundation; either version 3 of the License, + or (at your option) any later version. + + GlusterFS is distributed in the hope that it will be useful, but + WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program. If not, see + . 
+*/ + +#ifndef _LOCKING_H +#define _LOCKING_H + +#ifndef _CONFIG_H +#define _CONFIG_H +#include "config.h" +#endif + +#include + +#if HAVE_SPINLOCK +#define LOCK_INIT(x) pthread_spin_init (x, 0) +#define LOCK(x) pthread_spin_lock (x) +#define TRY_LOCK(x) pthread_spin_trylock (x) +#define UNLOCK(x) pthread_spin_unlock (x) +#define LOCK_DESTROY(x) pthread_spin_destroy (x) + +typedef pthread_spinlock_t gf_lock_t; +#else +#define LOCK_INIT(x) pthread_mutex_init (x, 0) +#define LOCK(x) pthread_mutex_lock (x) +#define TRY_LOCK(x) pthread_mutex_trylock (x) +#define UNLOCK(x) pthread_mutex_unlock (x) +#define LOCK_DESTROY(x) pthread_mutex_destroy (x) + +typedef pthread_mutex_t gf_lock_t; +#endif /* HAVE_SPINLOCK */ + + +#endif /* _LOCKING_H */ diff --git a/libglusterfs/src/logging.c b/libglusterfs/src/logging.c new file mode 100644 index 000000000..c0b10edca --- /dev/null +++ b/libglusterfs/src/logging.c @@ -0,0 +1,207 @@ +/* + Copyright (c) 2006, 2007, 2008 Z RESEARCH, Inc. + This file is part of GlusterFS. + + GlusterFS is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published + by the Free Software Foundation; either version 3 of the License, + or (at your option) any later version. + + GlusterFS is distributed in the hope that it will be useful, but + WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program. If not, see + . 
+*/
+
+#ifndef _CONFIG_H
+#define _CONFIG_H
+#include "config.h"
+#endif
+
+#include <errno.h>
+#include <pthread.h>
+#include <signal.h>
+#include <stdarg.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <time.h>
+#include "logging.h"
+
+
+static pthread_mutex_t  logfile_mutex;
+static char            *filename = NULL;
+
+/* Set by gf_log_logrotate(), polled and cleared by _gf_log().  The
+ * (int signum) signature of the setter suggests it is installed as a
+ * signal handler, hence volatile sig_atomic_t. */
+static volatile sig_atomic_t logrotate = 0;
+
+static FILE            *logfile = NULL;
+static gf_loglevel_t    loglevel = GF_LOG_MAX;
+
+gf_loglevel_t           gf_log_loglevel; /* extern'd */
+FILE                   *gf_log_logfile;
+
+
+/* Request that the next _gf_log() call reopens the log file by name. */
+void
+gf_log_logrotate (int signum)
+{
+        (void) signum;
+
+        logrotate = 1;
+}
+
+
+/* Return the current maximum level that _gf_log() will write. */
+gf_loglevel_t
+gf_log_get_loglevel (void)
+{
+        return loglevel;
+}
+
+
+/* Set the maximum level, both for _gf_log() and for the gf_log() macro
+ * (which tests the exported gf_log_loglevel). */
+void
+gf_log_set_loglevel (gf_loglevel_t level)
+{
+        gf_log_loglevel = loglevel = level;
+}
+
+
+/* Destroy the log mutex.  Does not close the log file or free the stored
+ * file name.  Same body as gf_log_cleanup(); call only one of them. */
+void
+gf_log_fini (void)
+{
+        pthread_mutex_destroy (&logfile_mutex);
+}
+
+
+/*
+ * Initialise logging to 'file' (opened in append mode).
+ *
+ * Returns 0 on success, -1 on a NULL name, allocation failure or
+ * fopen() failure (a diagnostic is written to stderr).  The mutex is
+ * initialised even when -1 is returned for the last two cases.
+ */
+int
+gf_log_init (const char *file)
+{
+        if (!file){
+                fprintf (stderr, "gf_log_init: no filename specified\n");
+                return -1;
+        }
+
+        pthread_mutex_init (&logfile_mutex, NULL);
+
+        filename = strdup (file);
+        if (!filename) {
+                fprintf (stderr, "gf_log_init: strdup error\n");
+                return -1;
+        }
+
+        logfile = fopen (file, "a");
+        if (!logfile){
+                fprintf (stderr,
+                         "gf_log_init: failed to open logfile \"%s\" (%s)\n",
+                         file,
+                         strerror (errno));
+                /* do not leak the copied name on failure */
+                free (filename);
+                filename = NULL;
+                return -1;
+        }
+
+        gf_log_logfile = logfile;
+
+        return 0;
+}
+
+
+/* Take the mutex that serialises writes to the log file. */
+void
+gf_log_lock (void)
+{
+        pthread_mutex_lock (&logfile_mutex);
+}
+
+
+/* Release the mutex taken by gf_log_lock(). */
+void
+gf_log_unlock (void)
+{
+        pthread_mutex_unlock (&logfile_mutex);
+}
+
+
+/* Destroy the log mutex.  Same body as gf_log_fini(); call only one of
+ * them. */
+void
+gf_log_cleanup (void)
+{
+        pthread_mutex_destroy (&logfile_mutex);
+}
+
+
+/*
+ * Write one log record:
+ *   "<timestamp> <level letter> [<file basename>:<line>:<function>] <domain>: <message>\n"
+ *
+ * Handles a pending rotation request first, then drops the message if
+ * 'level' is above the configured level.
+ *
+ * Returns 0 when the message was written or filtered out, -1 on a NULL
+ * argument or when no log file has been opened.
+ */
+int
+_gf_log (const char *domain, const char *file, const char *function, int line,
+         gf_loglevel_t level, const char *fmt, ...)
+{
+        const char  *basename = NULL;
+        const char  *level_str = "";
+        FILE        *new_logfile = NULL;
+        va_list      ap;
+        time_t       utime = 0;
+        struct tm    tm;
+        char         timestr[256];
+        static const char *const level_strings[] = {"N", /* NONE */
+                                                    "T", /* TRACE */
+                                                    "C", /* CRITICAL */
+                                                    "E", /* ERROR */
+                                                    "W", /* WARNING */
+                                                    "N", /* INFO (GF_LOG_NORMAL) */
+                                                    "D", /* DEBUG */
+                                                    ""};
+
+        if (!domain || !file || !function || !fmt) {
+                fprintf (stderr,
+                         "logging: %s:%s():%d: invalid argument\n",
+                         __FILE__, __PRETTY_FUNCTION__, __LINE__);
+                return -1;
+        }
+
+        if (!logfile) {
+                fprintf (stderr, "no logfile set\n");
+                return (-1);
+        }
+
+        if (logrotate) {
+                logrotate = 0;
+
+                new_logfile = fopen (filename, "a");
+                if (new_logfile) {
+                        /* swap under the mutex so no other thread is in
+                         * the middle of writing to the stream we close */
+                        pthread_mutex_lock (&logfile_mutex);
+                        {
+                                fclose (logfile);
+                                gf_log_logfile = logfile = new_logfile;
+                        }
+                        pthread_mutex_unlock (&logfile_mutex);
+                } else {
+                        /* logged outside the mutex: gf_log() re-enters
+                         * _gf_log(), which takes it again */
+                        gf_log ("logrotate", GF_LOG_CRITICAL,
+                                "failed to open logfile %s (%s)",
+                                filename, strerror (errno));
+                }
+        }
+
+        if (level > loglevel) {
+                goto out;
+        }
+
+        /* localtime_r(): localtime() returns shared static storage and
+         * this function is called concurrently */
+        utime = time (NULL);
+        if (localtime_r (&utime, &tm) != NULL)
+                strftime (timestr, sizeof (timestr), "%Y-%m-%d %H:%M:%S", &tm);
+        else
+                timestr[0] = '\0';
+
+        /* never index past the table, whatever value the caller passed */
+        if ((size_t) level < sizeof (level_strings) / sizeof (level_strings[0]))
+                level_str = level_strings[level];
+
+        basename = strrchr (file, '/');
+        if (basename)
+                basename++;
+        else
+                basename = file;
+
+        pthread_mutex_lock (&logfile_mutex);
+        {
+                va_start (ap, fmt);
+
+                fprintf (logfile, "%s %s [%s:%d:%s] %s: ",
+                         timestr, level_str,
+                         basename, line, function,
+                         domain);
+
+                vfprintf (logfile, fmt, ap);
+                va_end (ap);
+                fprintf (logfile, "\n");
+                fflush (logfile);
+        }
+        pthread_mutex_unlock (&logfile_mutex);
+
+out:
+        return (0);
+}
diff --git a/libglusterfs/src/logging.h b/libglusterfs/src/logging.h
new file mode 100644
index 000000000..7a0deb6e8
--- /dev/null
+++ b/libglusterfs/src/logging.h
@@ -0,0 +1,132 @@
+/*
+  Copyright (c) 2006, 2007, 2008 Z RESEARCH, Inc.
+  This file is part of GlusterFS.
+
+  GlusterFS is free software; you can redistribute it and/or modify
+  it under the terms of the GNU General Public License as published
+  by the Free Software Foundation; either version 3 of the License,
+  or (at your option) any later version.
+
+  GlusterFS is distributed in the hope that it will be useful, but
+  WITHOUT ANY WARRANTY; without even the implied warranty of
+  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+  General Public License for more details.
+
+  You should have received a copy of the GNU General Public License
+  along with this program.  If not, see
+  <http://www.gnu.org/licenses/>.
+*/
+
+
+#ifndef __LOGGING_H__
+#define __LOGGING_H__
+
+#ifndef _CONFIG_H
+#define _CONFIG_H
+#include "config.h"
+#endif
+
+#include <inttypes.h>   /* PRId64, int32_t */
+#include <stdio.h>      /* FILE, printf (used by the gf_log macro) */
+
+/* printf conversion strings for platform-dependent integer types */
+#define GF_PRI_FSBLK       PRId64
+#define GF_PRI_BLKSIZE     "ld"
+#if GF_LINUX_HOST_OS
+
+#  if __WORDSIZE == 64
+#    define GF_PRI_SIZET   "lu"
+#    define GF_PRI_NLINK   "lu"
+#  else
+#    define GF_PRI_SIZET   "u"
+#    define GF_PRI_NLINK   "u"
+#  endif /* __WORDSIZE */
+
+#elif GF_DARWIN_HOST_OS
+
+/* Noticed that size_t and ino_t are different on OSX, need to fix the warnings */
+#  define GF_PRI_SIZET     "lu"
+#  define GF_PRI_NLINK     "u"
+
+#  undef GF_PRI_FSBLK
+#  define GF_PRI_FSBLK     "u"
+
+#  undef GF_PRI_BLKSIZE
+#  define GF_PRI_BLKSIZE   "u"
+
+#  if __DARWIN_64_BIT_INO_T == 0
+#    error '64 bit ino_t is must for GlusterFS to work, Compile with "CFLAGS=-D__DARWIN_64_BIT_INO_T"'
+#  endif /* __DARWIN_64_BIT_INO_T */
+
+#else /* !LINUX && !DARWIN */
+
+/* BSD and Solaris : Change as per testing there.. */
+#  define GF_PRI_SIZET     "lu"
+#  define GF_PRI_NLINK     "u"
+
+#endif /* LINUX_OS */
+
+#define GF_PRI_DEV         GF_PRI_FSBLK
+
+/* Log levels in increasing verbosity; a message is written when its
+ * level is <= the configured level. */
+typedef enum {
+        GF_LOG_NONE,
+        GF_LOG_TRACE,
+        GF_LOG_CRITICAL,   /* fatal errors */
+        GF_LOG_ERROR,      /* major failures (not necessarily fatal) */
+        GF_LOG_WARNING,    /* info about normal operation */
+        GF_LOG_INFO,       /* Normal information */
+#define GF_LOG_NORMAL GF_LOG_INFO
+        GF_LOG_DEBUG,      /* all other junk */
+} gf_loglevel_t;
+
+#define GF_LOG_MAX GF_LOG_DEBUG
+
+/* defined in logging.c */
+extern gf_loglevel_t gf_log_loglevel;
+extern FILE *gf_log_logfile;
+
+/*
+ * Log 'fmt...' under domain 'dom' if 'levl' passes the current level.
+ * 'levl' is evaluated twice.  The never-executed printf() presumably
+ * exists so the compiler checks the format string against its arguments.
+ */
+#define gf_log(dom, levl, fmt...) do {                                  \
+                if (levl <= gf_log_loglevel)                            \
+                        _gf_log (dom, __FILE__, __FUNCTION__, __LINE__, \
+                                 levl, ##fmt);                          \
+                if (0) {                                                \
+                        printf (fmt);                                   \
+                }                                                       \
+} while (0)
+
+/* Log once in GF_UNIVERSAL_ANSWER times */
+#define GF_LOG_OCCASIONALLY(var, args...) if (!(var++%GF_UNIVERSAL_ANSWER)) { \
+                gf_log (args);                                          \
+        }
+
+
+void
+gf_log_logrotate (int signum);
+
+int gf_log_init (const char *filename);
+void gf_log_fini (void);
+void gf_log_cleanup (void);
+
+int
+_gf_log (const char *domain, const char *file, const char *function,
+         int32_t line, gf_loglevel_t level, const char *fmt, ...);
+
+void gf_log_lock (void);
+void gf_log_unlock (void);
+
+gf_loglevel_t
+gf_log_get_loglevel (void);
+void
+gf_log_set_loglevel (gf_loglevel_t level);
+
+/* Convenience wrappers: log under the name of translator 'xl'. */
+#define GF_DEBUG(xl, format, args...) \
+        gf_log ((xl)->name, GF_LOG_DEBUG, format, ##args)
+#define GF_INFO(xl, format, args...) \
+        gf_log ((xl)->name, GF_LOG_INFO, format, ##args)
+#define GF_WARNING(xl, format, args...) \
+        gf_log ((xl)->name, GF_LOG_WARNING, format, ##args)
+#define GF_ERROR(xl, format, args...) \
+        gf_log ((xl)->name, GF_LOG_ERROR, format, ##args)
+
+/*
+ * Log at TRACE level when (xl)->trace is set, bypassing the global level
+ * test done by gf_log().
+ * NOTE(review): the ';' after while(0) is kept because call sites may
+ * depend on it; it makes "if (x) GF_TRACE (...); else ..." a syntax error.
+ */
+#define GF_TRACE(xl, args...) do {                                      \
+                if ((xl)->trace)                                        \
+                        _gf_log ((xl)->name, __FILE__, __FUNCTION__,    \
+                                 __LINE__, GF_LOG_TRACE, ##args);       \
+        } while(0);
+
+#endif /* __LOGGING_H__ */
diff --git a/libglusterfs/src/mem-pool.c b/libglusterfs/src/mem-pool.c
new file mode 100644
index 000000000..c3646f350
--- /dev/null
+++ b/libglusterfs/src/mem-pool.c
@@ -0,0 +1,174 @@
+/*
+  Copyright (c) 2008 Z RESEARCH, Inc.
+  This file is part of GlusterFS.
+
+  GlusterFS is free software; you can redistribute it and/or modify
+  it under the terms of the GNU General Public License as published
+  by the Free Software Foundation; either version 3 of the License,
+  or (at your option) any later version.
+
+  GlusterFS is distributed in the hope that it will be useful, but
+  WITHOUT ANY WARRANTY; without even the implied warranty of
+  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
See the GNU
+  General Public License for more details.
+
+  You should have received a copy of the GNU General Public License
+  along with this program.  If not, see
+  <http://www.gnu.org/licenses/>.
+*/
+
+#include "mem-pool.h"
+#include "logging.h"
+#include <limits.h>
+#include <stdlib.h>
+
+
+#define GF_MEM_POOL_PAD_BOUNDRY 16
+
+
+/*
+ * Create a pool of 'count' preallocated, zero-filled elements.
+ *
+ * Each element is sizeof_type bytes rounded up past the next multiple of
+ * GF_MEM_POOL_PAD_BOUNDRY (an already aligned size still gains one full
+ * boundary).  While an element is unused its first bytes hold the
+ * struct list_head that links it into the pool's free list.
+ *
+ * Returns the new pool, or NULL on invalid arguments (zero size/count,
+ * or a count that does not fit the pool's int counters) or allocation
+ * failure.
+ */
+struct mem_pool *
+mem_pool_new_fn (unsigned long sizeof_type,
+                 unsigned long count)
+{
+        struct mem_pool  *mem_pool = NULL;
+        unsigned long     pad = 0;
+        unsigned long     padded_sizeof_type = 0;
+        char             *pool = NULL;
+        unsigned long     i = 0;
+        struct list_head *list = NULL;
+
+        /* cold_count/hot_count are int: reject counts they cannot hold */
+        if (!sizeof_type || !count || count > INT_MAX) {
+                gf_log ("mem-pool", GF_LOG_ERROR, "invalid argument");
+                return NULL;
+        }
+
+        pad = GF_MEM_POOL_PAD_BOUNDRY -
+                (sizeof_type % GF_MEM_POOL_PAD_BOUNDRY);
+        padded_sizeof_type = sizeof_type + pad;
+
+        mem_pool = CALLOC (sizeof (*mem_pool), 1);
+        if (!mem_pool)
+                return NULL;
+
+        pool = CALLOC (count, padded_sizeof_type);
+        if (!pool) {
+                /* do not leak the descriptor when the pool itself fails */
+                free (mem_pool);
+                return NULL;
+        }
+
+        LOCK_INIT (&mem_pool->lock);
+        INIT_LIST_HEAD (&mem_pool->list);
+
+        mem_pool->padded_sizeof_type = padded_sizeof_type;
+        mem_pool->cold_count = count;
+
+        for (i = 0; i < count; i++) {
+                list = (struct list_head *) (pool + (i * padded_sizeof_type));
+                INIT_LIST_HEAD (list);
+                list_add_tail (list, &mem_pool->list);
+        }
+
+        mem_pool->pool = pool;
+        mem_pool->pool_end = pool + (count * padded_sizeof_type);
+
+        return mem_pool;
+}
+
+
+/*
+ * Hand out one element.
+ *
+ * Takes the first element of the free list when one is available;
+ * otherwise falls back to MALLOC (padded_sizeof_type).  The returned
+ * memory is not cleared.  Returns NULL on a NULL pool or when the
+ * fallback allocation fails.
+ */
+void *
+mem_get (struct mem_pool *mem_pool)
+{
+        struct list_head *list = NULL;
+        void             *ptr = NULL;
+
+        if (!mem_pool) {
+                gf_log ("mem-pool", GF_LOG_ERROR, "invalid argument");
+                return NULL;
+        }
+
+        LOCK (&mem_pool->lock);
+        {
+                if (mem_pool->cold_count) {
+                        list = mem_pool->list.next;
+                        list_del (list);
+
+                        mem_pool->hot_count++;
+                        mem_pool->cold_count--;
+
+                        ptr = list;
+                }
+        }
+        UNLOCK (&mem_pool->lock);
+
+        if (ptr == NULL) {
+                /* pool exhausted: allocate outside the lock */
+                ptr = MALLOC (mem_pool->padded_sizeof_type);
+
+                if (!ptr) {
+                        return NULL;
+                }
+
+                LOCK (&mem_pool->lock);
+                {
+                        mem_pool->hot_count ++;
+                }
+                UNLOCK (&mem_pool->lock);
+        }
+
+        return ptr;
+}
+
+
+/*
+ * Classify 'ptr' relative to the pool's preallocated region:
+ *    1  inside the region and on an element boundary
+ *    0  outside the region
+ *   -1  inside the region but not on an element boundary, or NULL argument
+ */
+static int
+__is_member (struct mem_pool *pool, void *ptr)
+{
+        if (!pool || !ptr) {
+                gf_log ("mem-pool", GF_LOG_ERROR, "invalid argument");
+                return -1;
+        }
+
+        if (ptr < pool->pool || ptr >= pool->pool_end)
+                return 0;
+
+        if (((char *) ptr - (char *) pool->pool) % pool->padded_sizeof_type)
+                return -1;
+
+        return 1;
+}
+
+
+/*
+ * Give back an element obtained from mem_get().
+ *
+ * Pool members go back on the free list; blocks that came from the
+ * MALLOC fallback are freed.  A pointer into the pool that is not on an
+ * element boundary aborts the process.
+ */
+void
+mem_put (struct mem_pool *pool, void *ptr)
+{
+        struct list_head *list = NULL;
+        int               from_heap = 0;
+
+        if (!pool || !ptr) {
+                gf_log ("mem-pool", GF_LOG_ERROR, "invalid argument");
+                return;
+        }
+
+        list = ptr;
+
+        LOCK (&pool->lock);
+        {
+                pool->hot_count--;
+
+                switch (__is_member (pool, ptr))
+                {
+                case 1:
+                        pool->cold_count++;
+                        list_add (list, &pool->list);
+                        break;
+                case -1:
+                        gf_log ("mem-pool", GF_LOG_CRITICAL,
+                                "pointer %p is inside the pool but not on "
+                                "an element boundary", ptr);
+                        abort ();
+                        break;
+                case 0:
+                        /* fallback allocation: free it once, after the
+                         * lock is dropped */
+                        from_heap = 1;
+                        break;
+                default:
+                        /* log error */
+                        break;
+                }
+        }
+        UNLOCK (&pool->lock);
+
+        /* only heap blocks may be passed to free(); pool members are
+         * interior pointers of the single calloc'd region */
+        if (from_heap)
+                free (ptr);
+}
diff --git a/libglusterfs/src/mem-pool.h b/libglusterfs/src/mem-pool.h
new file mode 100644
index 000000000..b36c24477
--- /dev/null
+++ b/libglusterfs/src/mem-pool.h
@@ -0,0 +1,54 @@
+/*
+  Copyright (c) 2008 Z RESEARCH, Inc.
+  This file is part of GlusterFS.
+
+  GlusterFS is free software; you can redistribute it and/or modify
+  it under the terms of the GNU General Public License as published
+  by the Free Software Foundation; either version 3 of the License,
+  or (at your option) any later version.
+
+  GlusterFS is distributed in the hope that it will be useful, but
+  WITHOUT ANY WARRANTY; without even the implied warranty of
+  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+  General Public License for more details.
+
+  You should have received a copy of the GNU General Public License
+  along with this program.  If not, see
+  <http://www.gnu.org/licenses/>.
+*/
+
+#ifndef _MEM_POOL_H_
+#define _MEM_POOL_H_
+
+#include <stdlib.h>     /* malloc, calloc, free used by the macros below */
+
+#include "list.h"
+#include "locking.h"
+
+
+/* Thin wrappers over the C allocator. */
+#define MALLOC(size)       malloc(size)
+#define CALLOC(nmemb,size) calloc(nmemb,size)
+
+/*
+ * Free 'ptr' unless it is NULL, then overwrite it with a recognisable
+ * poison value so a later use is easier to spot.  'ptr' must be an
+ * lvalue and is evaluated more than once.  Wrapped in do/while(0) so the
+ * macro behaves as a single statement inside if/else.
+ */
+#define FREE(ptr)                                       \
+        do {                                            \
+                if ((ptr) != NULL) {                    \
+                        free ((void *)(ptr));           \
+                        (ptr) = (void *)0xeeeeeeee;     \
+                }                                       \
+        } while (0)
+
+/* Fixed-size object pool; see mem-pool.c. */
+struct mem_pool {
+        struct list_head  list;               /* free elements */
+        int               hot_count;          /* elements currently handed out */
+        int               cold_count;         /* elements on the free list */
+        gf_lock_t         lock;               /* guards list and both counters */
+        unsigned long     padded_sizeof_type; /* size of one element, padding included */
+        void             *pool;               /* start of the preallocated region */
+        void             *pool_end;           /* one past the end of the region */
+};
+
+struct mem_pool *
+mem_pool_new_fn (unsigned long sizeof_type, unsigned long count);
+
+/* Create a pool of 'count' elements sized for 'type'. */
+#define mem_pool_new(type,count) mem_pool_new_fn (sizeof(type), count)
+
+void mem_put (struct mem_pool *pool, void *ptr);
+void *mem_get (struct mem_pool *pool);
+
+#endif /* _MEM_POOL_H_ */
diff --git a/libglusterfs/src/protocol.h b/libglusterfs/src/protocol.h
new file mode 100644
index 000000000..4ba869dee
--- /dev/null
+++ b/libglusterfs/src/protocol.h
@@ -0,0 +1,777 @@
+/*
+  Copyright (c) 2006, 2007, 2008 Z RESEARCH, Inc.
+  This file is part of GlusterFS.
+
+  GlusterFS is free software; you can redistribute it and/or modify
+  it under the terms of the GNU General Public License as published
+  by the Free Software Foundation; either version 3 of the License,
+  or (at your option) any later version.
+
+  GlusterFS is distributed in the hope that it will be useful, but
+  WITHOUT ANY WARRANTY; without even the implied warranty of
+  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+  General Public License for more details.
+
+  You should have received a copy of the GNU General Public License
+  along with this program.  If not, see
+  <http://www.gnu.org/licenses/>.
+*/ + +#ifndef _PROTOCOL_H +#define _PROTOCOL_H + +#ifndef _CONFIG_H +#define _CONFIG_H +#include "config.h" +#endif + +#include +#include +#include +#include +#include +#include +#include + +#include "byte-order.h" + + +struct gf_stat { + uint64_t ino; + uint64_t size; + uint64_t blocks; + uint32_t dev; + uint32_t rdev; + uint32_t mode; + uint32_t nlink; + uint32_t uid; + uint32_t gid; + uint32_t blksize; + uint32_t atime; + uint32_t atime_nsec; + uint32_t mtime; + uint32_t mtime_nsec; + uint32_t ctime; + uint32_t ctime_nsec; +} __attribute__((packed)); + + +static inline void +gf_stat_to_stat (struct gf_stat *gf_stat, struct stat *stat) +{ + stat->st_dev = ntoh32 (gf_stat->dev); + stat->st_ino = ntoh64 (gf_stat->ino); + stat->st_mode = ntoh32 (gf_stat->mode); + stat->st_nlink = ntoh32 (gf_stat->nlink); + stat->st_uid = ntoh32 (gf_stat->uid); + stat->st_gid = ntoh32 (gf_stat->gid); + stat->st_rdev = ntoh32 (gf_stat->rdev); + stat->st_size = ntoh64 (gf_stat->size); + stat->st_blksize = ntoh32 (gf_stat->blksize); + stat->st_blocks = ntoh64 (gf_stat->blocks); + stat->st_atime = ntoh32 (gf_stat->atime); + stat->st_mtime = ntoh32 (gf_stat->mtime); + stat->st_ctime = ntoh32 (gf_stat->ctime); + /* TODO: handle nsec */ +} + + +static inline void +gf_stat_from_stat (struct gf_stat *gf_stat, struct stat *stat) +{ + gf_stat->dev = hton32 (stat->st_dev); + gf_stat->ino = hton64 (stat->st_ino); + gf_stat->mode = hton32 (stat->st_mode); + gf_stat->nlink = hton32 (stat->st_nlink); + gf_stat->uid = hton32 (stat->st_uid); + gf_stat->gid = hton32 (stat->st_gid); + gf_stat->rdev = hton32 (stat->st_rdev); + gf_stat->size = hton64 (stat->st_size); + gf_stat->blksize = hton32 (stat->st_blksize); + gf_stat->blocks = hton64 (stat->st_blocks); + gf_stat->atime = hton32 (stat->st_atime); + gf_stat->mtime = hton32 (stat->st_mtime); + gf_stat->ctime = hton32 (stat->st_ctime); + /* TODO: handle nsec */ +} + + +struct gf_statfs { + uint64_t bsize; + uint64_t frsize; + uint64_t blocks; + 
uint64_t bfree; + uint64_t bavail; + uint64_t files; + uint64_t ffree; + uint64_t favail; + uint64_t fsid; + uint64_t flag; + uint64_t namemax; +} __attribute__((packed)); + + +static inline void +gf_statfs_to_statfs (struct gf_statfs *gf_stat, struct statvfs *stat) +{ + stat->f_bsize = ntoh64 (gf_stat->bsize); + stat->f_frsize = ntoh64 (gf_stat->frsize); + stat->f_blocks = ntoh64 (gf_stat->blocks); + stat->f_bfree = ntoh64 (gf_stat->bfree); + stat->f_bavail = ntoh64 (gf_stat->bavail); + stat->f_files = ntoh64 (gf_stat->files); + stat->f_ffree = ntoh64 (gf_stat->ffree); + stat->f_favail = ntoh64 (gf_stat->favail); + stat->f_fsid = ntoh64 (gf_stat->fsid); + stat->f_flag = ntoh64 (gf_stat->flag); + stat->f_namemax = ntoh64 (gf_stat->namemax); +} + + +static inline void +gf_statfs_from_statfs (struct gf_statfs *gf_stat, struct statvfs *stat) +{ + gf_stat->bsize = hton64 (stat->f_bsize); + gf_stat->frsize = hton64 (stat->f_frsize); + gf_stat->blocks = hton64 (stat->f_blocks); + gf_stat->bfree = hton64 (stat->f_bfree); + gf_stat->bavail = hton64 (stat->f_bavail); + gf_stat->files = hton64 (stat->f_files); + gf_stat->ffree = hton64 (stat->f_ffree); + gf_stat->favail = hton64 (stat->f_favail); + gf_stat->fsid = hton64 (stat->f_fsid); + gf_stat->flag = hton64 (stat->f_flag); + gf_stat->namemax = hton64 (stat->f_namemax); +} + + +struct gf_flock { + uint16_t type; + uint16_t whence; + uint64_t start; + uint64_t len; + uint32_t pid; +} __attribute__((packed)); + + +static inline void +gf_flock_to_flock (struct gf_flock *gf_flock, struct flock *flock) +{ + flock->l_type = ntoh16 (gf_flock->type); + flock->l_whence = ntoh16 (gf_flock->whence); + flock->l_start = ntoh64 (gf_flock->start); + flock->l_len = ntoh64 (gf_flock->len); + flock->l_pid = ntoh32 (gf_flock->pid); +} + + +static inline void +gf_flock_from_flock (struct gf_flock *gf_flock, struct flock *flock) +{ + gf_flock->type = hton16 (flock->l_type); + gf_flock->whence = hton16 (flock->l_whence); + gf_flock->start = 
hton64 (flock->l_start); + gf_flock->len = hton64 (flock->l_len); + gf_flock->pid = hton32 (flock->l_pid); +} + + +struct gf_timespec { + uint32_t tv_sec; + uint32_t tv_nsec; +} __attribute__((packed)); + + +static inline void +gf_timespec_to_timespec (struct gf_timespec *gf_ts, struct timespec *ts) +{ + + ts[0].tv_sec = ntoh32 (gf_ts[0].tv_sec); + ts[0].tv_nsec = ntoh32 (gf_ts[0].tv_nsec); + ts[1].tv_sec = ntoh32 (gf_ts[1].tv_sec); + ts[1].tv_nsec = ntoh32 (gf_ts[1].tv_nsec); +} + + +static inline void +gf_timespec_from_timespec (struct gf_timespec *gf_ts, struct timespec *ts) +{ + gf_ts[0].tv_sec = hton32 (ts[0].tv_sec); + gf_ts[0].tv_nsec = hton32 (ts[0].tv_nsec); + gf_ts[1].tv_sec = hton32 (ts[1].tv_sec); + gf_ts[1].tv_nsec = hton32 (ts[1].tv_nsec); +} + + +typedef struct { + uint64_t ino; + char path[0]; /* NULL terminated */ +} __attribute__((packed)) gf_fop_stat_req_t;; +typedef struct { + struct gf_stat stat; +} __attribute__((packed)) gf_fop_stat_rsp_t; + + +typedef struct { + uint64_t ino; + uint32_t size; + char path[0]; /* NULL terminated */ +} __attribute__((packed)) gf_fop_readlink_req_t; +typedef struct { + char path[0]; /* NULL terminated */ +} __attribute__((packed)) gf_fop_readlink_rsp_t; + + +typedef struct { + uint64_t par; + uint64_t dev; + uint32_t mode; + char path[0]; /* NULL terminated */ + char bname[0]; /* NULL terminated */ +} __attribute__((packed)) gf_fop_mknod_req_t; +typedef struct { + struct gf_stat stat; +} __attribute__((packed)) gf_fop_mknod_rsp_t; + + +typedef struct { + uint64_t par; + uint32_t mode; + char path[0]; /* NULL terminated */ + char bname[0]; /* NULL terminated */ +} __attribute__((packed)) gf_fop_mkdir_req_t; +typedef struct { + struct gf_stat stat; +} __attribute__((packed)) gf_fop_mkdir_rsp_t; + + +typedef struct { + uint64_t par; + char path[0]; /* NULL terminated */ + char bname[0]; /* NULL terminated */ +} __attribute__((packed)) gf_fop_unlink_req_t; +typedef struct { +} __attribute__((packed)) 
gf_fop_unlink_rsp_t; + + +typedef struct { + uint64_t par; + char path[0]; + char bname[0]; /* NULL terminated */ +} __attribute__((packed)) gf_fop_rmdir_req_t; +typedef struct { +} __attribute__((packed)) gf_fop_rmdir_rsp_t; + + +typedef struct { + uint64_t par; + char path[0]; + char bname[0]; + char linkname[0]; +} __attribute__((packed)) gf_fop_symlink_req_t; +typedef struct { + struct gf_stat stat; +}__attribute__((packed)) gf_fop_symlink_rsp_t; + + +typedef struct { + uint64_t oldpar; + uint64_t newpar; + char oldpath[0]; + char oldbname[0]; /* NULL terminated */ + char newpath[0]; + char newbname[0]; /* NULL terminated */ +} __attribute__((packed)) gf_fop_rename_req_t; +typedef struct { + struct gf_stat stat; +} __attribute__((packed)) gf_fop_rename_rsp_t; + + +typedef struct { + uint64_t oldino; + uint64_t newpar; + char oldpath[0]; + char newpath[0]; + char newbname[0]; +}__attribute__((packed)) gf_fop_link_req_t; +typedef struct { + struct gf_stat stat; +} __attribute__((packed)) gf_fop_link_rsp_t; + + +typedef struct { + uint64_t ino; + uint32_t mode; + char path[0]; +} __attribute__((packed)) gf_fop_chmod_req_t; +typedef struct { + struct gf_stat stat; +} __attribute__((packed)) gf_fop_chmod_rsp_t; + + +typedef struct { + uint64_t ino; + uint32_t uid; + uint32_t gid; + char path[0]; +} __attribute__((packed)) gf_fop_chown_req_t; +typedef struct { + struct gf_stat stat; +} __attribute__((packed)) gf_fop_chown_rsp_t; + + +typedef struct { + uint64_t ino; + uint64_t offset; + char path[0]; +} __attribute__((packed)) gf_fop_truncate_req_t; +typedef struct { + struct gf_stat stat; +} __attribute__((packed)) gf_fop_truncate_rsp_t; + + +typedef struct { + uint64_t ino; + uint32_t flags; + char path[0]; +} __attribute__((packed)) gf_fop_open_req_t; +typedef struct { + int64_t fd; +} __attribute__((packed)) gf_fop_open_rsp_t; + + +typedef struct { + uint64_t ino; + int64_t fd; + uint64_t offset; + uint32_t size; +} __attribute__((packed)) gf_fop_read_req_t; 
+typedef struct { + struct gf_stat stat; + char buf[0]; +} __attribute__((packed)) gf_fop_read_rsp_t; + + +typedef struct { + uint64_t ino; + int64_t fd; + uint64_t offset; + uint32_t size; +} __attribute__((packed)) gf_fop_write_req_t; +typedef struct { + struct gf_stat stat; +} __attribute__((packed)) gf_fop_write_rsp_t; + + +typedef struct { + uint64_t ino; + char path[0]; +} __attribute__((packed)) gf_fop_statfs_req_t; +typedef struct { + struct gf_statfs statfs; +} __attribute__((packed)) gf_fop_statfs_rsp_t; + + +typedef struct { + uint64_t ino; + int64_t fd; +} __attribute__((packed)) gf_fop_flush_req_t; +typedef struct { } __attribute__((packed)) gf_fop_flush_rsp_t; + + +typedef struct fsync_req { + uint64_t ino; + int64_t fd; + uint32_t data; +} __attribute__((packed)) gf_fop_fsync_req_t; +typedef struct { +} __attribute__((packed)) gf_fop_fsync_rsp_t; + + +typedef struct { + uint64_t ino; + uint32_t flags; + uint32_t dict_len; + char dict[0]; + char path[0]; +} __attribute__((packed)) gf_fop_setxattr_req_t; +typedef struct { } __attribute__((packed)) gf_fop_setxattr_rsp_t; + +typedef struct { + uint64_t ino; + uint32_t flags; + uint32_t dict_len; + char dict[0]; + char path[0]; +} __attribute__((packed)) gf_fop_xattrop_req_t; + +typedef struct { + uint32_t dict_len; + char dict[0]; +} __attribute__((packed)) gf_fop_xattrop_rsp_t; + + +typedef struct { + uint64_t ino; + int64_t fd; + uint32_t flags; + uint32_t dict_len; + char dict[0]; +} __attribute__((packed)) gf_fop_fxattrop_req_t; + +typedef struct { + uint32_t dict_len; + char dict[0]; +} __attribute__((packed)) gf_fop_fxattrop_rsp_t; + + +typedef struct { + uint64_t ino; + uint32_t namelen; + char path[0]; + char name[0]; +} __attribute__((packed)) gf_fop_getxattr_req_t; +typedef struct { + uint32_t dict_len; + char dict[0]; +} __attribute__((packed)) gf_fop_getxattr_rsp_t; + + +typedef struct { + uint64_t ino; + char path[0]; + char name[0]; +} __attribute__((packed)) gf_fop_removexattr_req_t; 
+typedef struct { } __attribute__((packed)) gf_fop_removexattr_rsp_t; + + +typedef struct { + uint64_t ino; + char path[0]; +} __attribute__((packed)) gf_fop_opendir_req_t; +typedef struct { + int64_t fd; +} __attribute__((packed)) gf_fop_opendir_rsp_t; + + +typedef struct fsyncdir_req { + uint64_t ino; + int64_t fd; + int32_t data; +} __attribute__((packed)) gf_fop_fsyncdir_req_t; +typedef struct { +} __attribute__((packed)) gf_fop_fsyncdir_rsp_t; + + +typedef struct { + uint64_t ino; + int64_t fd; + uint64_t offset; + uint32_t size; +} __attribute__((packed)) gf_fop_readdir_req_t; +typedef struct { + uint32_t size; + char buf[0]; +} __attribute__((packed)) gf_fop_readdir_rsp_t; + + +typedef struct { + uint64_t ino; + uint32_t mask; + char path[0]; +} __attribute__((packed)) gf_fop_access_req_t; +typedef struct { +} __attribute__((packed)) gf_fop_access_rsp_t; + + +typedef struct { + uint64_t par; + uint32_t flags; + uint32_t mode; + char path[0]; + char bname[0]; +} __attribute__((packed)) gf_fop_create_req_t; +typedef struct { + struct gf_stat stat; + uint64_t fd; +} __attribute__((packed)) gf_fop_create_rsp_t; + + + +typedef struct { + uint64_t ino; + int64_t fd; + uint64_t offset; +} __attribute__((packed)) gf_fop_ftruncate_req_t; +typedef struct { + struct gf_stat stat; +} __attribute__((packed)) gf_fop_ftruncate_rsp_t; + + +typedef struct { + uint64_t ino; + int64_t fd; +} __attribute__((packed)) gf_fop_fstat_req_t; +typedef struct { + struct gf_stat stat; +} __attribute__((packed)) gf_fop_fstat_rsp_t; + + +typedef struct { + uint64_t ino; + int64_t fd; + uint32_t cmd; + uint32_t type; + struct gf_flock flock; +} __attribute__((packed)) gf_fop_lk_req_t; +typedef struct { + struct gf_flock flock; +} __attribute__((packed)) gf_fop_lk_rsp_t; + +typedef struct { + uint64_t ino; + uint32_t cmd; + uint32_t type; + struct gf_flock flock; + char path[0]; +} __attribute__((packed)) gf_fop_inodelk_req_t; +typedef struct { +} __attribute__((packed)) 
gf_fop_inodelk_rsp_t; + +typedef struct { + uint64_t ino; + int64_t fd; + uint32_t cmd; + uint32_t type; + struct gf_flock flock; +} __attribute__((packed)) gf_fop_finodelk_req_t; +typedef struct { +} __attribute__((packed)) gf_fop_finodelk_rsp_t; + +typedef struct { + uint64_t ino; + uint32_t cmd; + uint32_t type; + uint64_t namelen; + char path[0]; + char name[0]; +} __attribute__((packed)) gf_fop_entrylk_req_t; +typedef struct { +} __attribute__((packed)) gf_fop_entrylk_rsp_t; + +typedef struct { + uint64_t ino; + int64_t fd; + uint32_t cmd; + uint32_t type; + uint64_t namelen; + char name[0]; +} __attribute__((packed)) gf_fop_fentrylk_req_t; +typedef struct { +} __attribute__((packed)) gf_fop_fentrylk_rsp_t; + +typedef struct { + uint64_t ino; + struct gf_timespec tv[2]; + char path[0]; +} __attribute__((packed)) gf_fop_utimens_req_t; +typedef struct { + struct gf_stat stat; +} __attribute__((packed)) gf_fop_utimens_rsp_t; + +typedef struct { + uint64_t ino; + uint64_t fd; + uint32_t mode; +} __attribute__((packed)) gf_fop_fchmod_req_t; +typedef struct { + struct gf_stat stat; +} __attribute__((packed)) gf_fop_fchmod_rsp_t; + + +typedef struct { + uint64_t ino; + int64_t fd; + uint32_t uid; + uint32_t gid; +} __attribute__((packed)) gf_fop_fchown_req_t; +typedef struct { + struct gf_stat stat; +} __attribute__((packed)) gf_fop_fchown_rsp_t; + + +typedef struct { + uint64_t ino; /* NOTE: used only in case of 'root' lookup */ + uint64_t par; + uint32_t flags; + uint32_t dictlen; + char path[0]; + char bname[0]; + char dict[0]; +} __attribute__((packed)) gf_fop_lookup_req_t; +typedef struct { + struct gf_stat stat; + uint32_t dict_len; + char dict[0]; +} __attribute__((packed)) gf_fop_lookup_rsp_t; + + +typedef struct { + uint64_t ino; + int64_t fd; + uint32_t flags; + uint32_t count; + char buf[0]; +} __attribute__((packed)) gf_fop_setdents_req_t; +typedef struct { } __attribute__((packed)) gf_fop_setdents_rsp_t; + + +typedef struct { + uint64_t ino; + int64_t 
fd; + uint64_t offset; + uint32_t size; + uint32_t flags; +} __attribute__((packed)) gf_fop_getdents_req_t; +typedef struct { + uint32_t count; + char buf[0]; +} __attribute__((packed)) gf_fop_getdents_rsp_t; + + +typedef struct { + uint64_t ino; + uint32_t flag; + char path[0]; +} __attribute__((packed)) gf_fop_checksum_req_t; +typedef struct { + unsigned char fchecksum[0]; + unsigned char dchecksum[0]; +} __attribute__((packed)) gf_fop_checksum_rsp_t; + + +typedef struct { + char name[0]; +} __attribute__((packed)) gf_mop_lock_req_t; +typedef struct {} __attribute__((packed)) gf_mop_lock_rsp_t; + +typedef struct { + char name[0]; +} __attribute__((packed)) gf_mop_unlock_req_t; +typedef struct {} __attribute__((packed)) gf_mop_unlock_rsp_t; + +typedef struct { + char pattern[0]; +} __attribute__((packed)) gf_mop_listlocks_req_t; +typedef struct {} __attribute__((packed)) gf_mop_listlocks_rsp_t; + +typedef struct { + uint32_t flags; +} __attribute__((packed)) gf_mop_stats_req_t; +typedef struct { + char buf[0]; +} __attribute__((packed)) gf_mop_stats_rsp_t; + +typedef struct { + uint32_t flags; + uint32_t keylen; + char key[0]; +} __attribute__((packed)) gf_mop_getspec_req_t; +typedef struct { + char spec[0]; +} __attribute__((packed)) gf_mop_getspec_rsp_t; + + +typedef struct { + uint32_t dict_len; + char buf[0]; +} __attribute__((packed)) gf_mop_setvolume_req_t; +typedef struct { + uint32_t dict_len; + char buf[0]; +} __attribute__((packed)) gf_mop_setvolume_rsp_t; + + +typedef struct { +} __attribute__((packed)) gf_mop_ping_req_t; +typedef struct { +} __attribute__((packed)) gf_mop_ping_rsp_t; + + +typedef struct { + uint64_t ino; + int64_t fd; +} __attribute__((packed)) gf_cbk_releasedir_req_t; +typedef struct { +} __attribute__((packed)) gf_cbk_releasedir_rsp_t; + + +typedef struct { + uint64_t ino; + int64_t fd; +} __attribute__((packed)) gf_cbk_release_req_t; +typedef struct { +} __attribute__((packed)) gf_cbk_release_rsp_t; + + +typedef struct { + uint32_t 
count; + uint64_t ino_array[0]; +} __attribute__((packed)) gf_cbk_forget_req_t; +typedef struct { } __attribute__((packed)) gf_cbk_forget_rsp_t; + + +typedef struct { + uint32_t pid; + uint32_t uid; + uint32_t gid; +} __attribute__ ((packed)) gf_hdr_req_t; + + +typedef struct { + uint32_t op_ret; + uint32_t op_errno; +} __attribute__ ((packed)) gf_hdr_rsp_t; + + +typedef struct { + uint64_t callid; + uint32_t type; + uint32_t op; + uint32_t size; + union { + gf_hdr_req_t req; + gf_hdr_rsp_t rsp; + } __attribute__ ((packed)); +} __attribute__ ((packed)) gf_hdr_common_t; + + +static inline gf_hdr_common_t * +__gf_hdr_new (int size) +{ + gf_hdr_common_t *hdr = NULL; + + /* TODO: use mem-pool */ + hdr = CALLOC (sizeof (gf_hdr_common_t) + size, 1); + + if (!hdr) { + return NULL; + } + + hdr->size = hton32 (size); + + return hdr; +} + + +#define gf_hdr_len(type, x) (sizeof (gf_hdr_common_t) + sizeof (*type) + x) +#define gf_hdr_new(type, x) __gf_hdr_new (sizeof (*type) + x) + + +static inline void * +gf_param (gf_hdr_common_t *hdr) +{ + return ((void *)hdr) + sizeof (*hdr); +} + +#endif diff --git a/libglusterfs/src/revision.h b/libglusterfs/src/revision.h new file mode 100644 index 000000000..30742cd5e --- /dev/null +++ b/libglusterfs/src/revision.h @@ -0,0 +1 @@ +#define GLUSTERFS_REPOSITORY_REVISION "glusterfs--mainline--3.0--patch-928" diff --git a/libglusterfs/src/scheduler.c b/libglusterfs/src/scheduler.c new file mode 100644 index 000000000..3478a9385 --- /dev/null +++ b/libglusterfs/src/scheduler.c @@ -0,0 +1,80 @@ +/* + Copyright (c) 2006, 2007, 2008 Z RESEARCH, Inc. + This file is part of GlusterFS. + + GlusterFS is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published + by the Free Software Foundation; either version 3 of the License, + or (at your option) any later version. 
+
+  GlusterFS is distributed in the hope that it will be useful, but
+  WITHOUT ANY WARRANTY; without even the implied warranty of
+  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+  General Public License for more details.
+
+  You should have received a copy of the GNU General Public License
+  along with this program.  If not, see
+  <http://www.gnu.org/licenses/>.
+*/
+
+#ifndef _CONFIG_H
+#define _CONFIG_H
+#include "config.h"
+#endif
+
+#include <dlfcn.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include "xlator.h"
+#include "scheduler.h"
+#include "list.h"
+
+/*
+ * Load the scheduler plugin SCHEDULERDIR/<name>.so and return the
+ * struct sched_ops it exports as the symbol "sched".
+ *
+ * If the plugin also exports "options", that table is appended to
+ * xl->volume_options and validated with validate_xlator_volume_options().
+ *
+ * Returns NULL when name is NULL, on allocation failure, when the
+ * library or its "sched" symbol cannot be loaded, or when option
+ * validation fails.  On success the library stays loaded, because the
+ * returned table lives inside it.
+ */
+struct sched_ops *
+get_scheduler (xlator_t *xl, const char *name)
+{
+        struct sched_ops  *tmp_sched = NULL;
+        volume_opt_list_t *vol_opt = NULL;
+        char              *sched_file = NULL;
+        void              *handle = NULL;
+
+        if (name == NULL) {
+                gf_log ("scheduler", GF_LOG_ERROR,
+                        "'name' not specified, EINVAL");
+                return NULL;
+        }
+
+        /* on failure asprintf() leaves sched_file undefined */
+        if (asprintf (&sched_file, "%s/%s.so", SCHEDULERDIR, name) == -1) {
+                gf_log ("scheduler", GF_LOG_ERROR,
+                        "asprintf failed for scheduler %s", name);
+                return NULL;
+        }
+
+        gf_log ("scheduler", GF_LOG_DEBUG,
+                "attempt to load file %s.so", name);
+
+        handle = dlopen (sched_file, RTLD_LAZY);
+        if (!handle) {
+                gf_log ("scheduler", GF_LOG_ERROR,
+                        "dlopen(%s): %s", sched_file, dlerror ());
+                goto out;
+        }
+
+        tmp_sched = dlsym (handle, "sched");
+        if (!tmp_sched) {
+                gf_log ("scheduler", GF_LOG_ERROR,
+                        "dlsym(sched) on %s", dlerror ());
+                goto unload;
+        }
+
+        vol_opt = CALLOC (1, sizeof (*vol_opt));
+        if (!vol_opt) {
+                gf_log ("scheduler", GF_LOG_ERROR, "out of memory");
+                tmp_sched = NULL;
+                goto unload;
+        }
+
+        vol_opt->given_opt = dlsym (handle, "options");
+        if (vol_opt->given_opt == NULL) {
+                gf_log ("scheduler", GF_LOG_DEBUG,
+                        "volume option validation not specified");
+                /* nothing references the node: release it */
+                free (vol_opt);
+        } else {
+                list_add_tail (&vol_opt->list, &xl->volume_options);
+                if (validate_xlator_volume_options (xl, vol_opt->given_opt)
+                    == -1) {
+                        gf_log ("scheduler", GF_LOG_ERROR,
+                                "volume option validation failed");
+                        /* vol_opt is now linked into xl->volume_options
+                         * and given_opt points into the library, so the
+                         * handle is left open */
+                        tmp_sched = NULL;
+                }
+        }
+        goto out;
+
+unload:
+        dlclose (handle);
+out:
+        free (sched_file);
+        return tmp_sched;
+}
diff --git a/libglusterfs/src/scheduler.h b/libglusterfs/src/scheduler.h
new file mode 100644
index 000000000..5ff1a624f
--- /dev/null
+++ b/libglusterfs/src/scheduler.h
@@ -0,0 +1,40 @@
+/*
+  Copyright (c) 2006, 2007, 2008 Z RESEARCH,
Inc. + This file is part of GlusterFS. + + GlusterFS is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published + by the Free Software Foundation; either version 3 of the License, + or (at your option) any later version. + + GlusterFS is distributed in the hope that it will be useful, but + WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program. If not, see + . +*/ + +#ifndef _SCHEDULER_H +#define _SCHEDULER_H + +#ifndef _CONFIG_H +#define _CONFIG_H +#include "config.h" +#endif + +#include "xlator.h" + +struct sched_ops { + int32_t (*init) (xlator_t *this); + void (*fini) (xlator_t *this); + void (*update) (xlator_t *this); + xlator_t *(*schedule) (xlator_t *this, const void *path); + void (*notify) (xlator_t *xl, int32_t event, void *data); +}; + +extern struct sched_ops *get_scheduler (xlator_t *xl, const char *name); + +#endif /* _SCHEDULER_H */ diff --git a/libglusterfs/src/spec.l b/libglusterfs/src/spec.l new file mode 100644 index 000000000..0345730b2 --- /dev/null +++ b/libglusterfs/src/spec.l @@ -0,0 +1,94 @@ +/* + Copyright (c) 2006, 2007, 2008 Z RESEARCH, Inc. + This file is part of GlusterFS. + + GlusterFS is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published + by the Free Software Foundation; either version 3 of the License, + or (at your option) any later version. + + GlusterFS is distributed in the hope that it will be useful, but + WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program. If not, see + . 
%x STRING
%option prefix="yy"
%option yylineno
%{

#define YYSTYPE char *
#include "xlator.h"
#include "y.tab.h"
/* NOTE(review): one system header name was stripped from this copy;
 * <string.h> is what memcpy/strdup below require. <stdio.h>/<stdlib.h> are
 * listed explicitly for fprintf/malloc/realloc/abort. */
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#define START_STRSIZE	32

/*
 * Accumulator for the quoted string currently being scanned.
 *   text       - heap buffer, always NUL terminated once allocated
 *   text_asize - bytes allocated for text
 *   text_size  - bytes used, not counting the terminator
 *
 * Ownership of `text` passes to the parser when the closing quote returns
 * STRING_TOK (yylval = text), so a fresh buffer must be started for every
 * quoted string; the opening-quote rule below calls new_string() for that.
 * Without it, consecutive quoted tokens appended to, and aliased, the same
 * buffer.
 */
static char *text;
static int text_asize;
static int text_size;

/* Start a new, empty accumulator. The previous buffer (if any) is owned by
 * whoever received it through yylval and is deliberately not freed here. */
void new_string(void)
{
	text = malloc(START_STRSIZE);
	text_size = 0;
	if (text == NULL) {
		/* append_string() will retry the allocation via realloc */
		text_asize = 0;
		return;
	}
	text_asize = START_STRSIZE;
	*text = 0;
}

/* Append `size` bytes from `str` to the accumulator, growing it in multiples
 * of START_STRSIZE. Aborts on allocation failure: the scanner has no way to
 * report the error and continuing would write through a NULL pointer. */
void append_string(const char *str, int size)
{
	int new_size = text_size + size + 1;

	if (new_size > text_asize) {
		char *grown = NULL;

		/* round up to the next multiple of START_STRSIZE */
		new_size += START_STRSIZE - 1;
		new_size &= -START_STRSIZE;

		/* keep `text` valid until realloc is known to have succeeded */
		grown = realloc(text, new_size);
		if (grown == NULL) {
			fprintf(stderr,
				"volume file scanner: out of memory\n");
			abort();
		}
		text = grown;
		text_asize = new_size;
	}
	memcpy(text + text_size, str, size);
	text_size += size;
	text[text_size] = 0;
}

/* Replace the accumulator with a NUL-terminated copy of str[0..size). On
 * allocation failure `text` is left NULL. */
void alloc_string(const char *str, int size)
{
	text = malloc(size + 1);
	if (text == NULL) {
		return;
	}
	memcpy(text, str, size);
	text[size] = 0;
}

%}

VOLUME [v][o][l][u][m][e]
END    [e][n][d]
SUB    [s][u][b]
OPTION [o][p][t][i][o][n]
TYPE   [t][y][p][e]
%%
\#.*                   ;
{VOLUME}               return SECTION_BEGIN;
{TYPE}                 return TYPE;
{END}[-]{VOLUME}       return SECTION_END;
{SUB}{VOLUME}[Ss]      return SUBSECTION;
{OPTION}               return OPTION;
\"                     { new_string (); BEGIN(STRING); }
<STRING>{
       [^\n\"\\]* {    append_string (yytext, yyleng); }
       \\.        {    append_string (yytext + 1, yyleng - 1); }
       \"         {
                       if (0) {
                              yyunput (0, NULL);
                       }
                       BEGIN (INITIAL);
                       yylval = text;
                       return STRING_TOK;
       }
}
[^ \t\r\n\"\\]+ {    yylval = strdup (yytext) ; return ID; }
[ \t\r\n]+             ;
%%
%token SECTION_BEGIN SECTION_END OPTION NEWLINE SUBSECTION ID WHITESPACE COMMENT TYPE STRING_TOK
%name-prefix="yy"

%{
/* NOTE(review): the five system header names were stripped from this copy of
 * the file. The three below cover every libc call left in this file now that
 * the dead fstat/mmap code has been removed -- confirm against the repo. */
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

#include "xlator.h"
#include "logging.h"

static int new_section (char *name);
static int section_type (char *type);
static int section_option (char *key, char *value);
static int section_sub (char *sub);
static int section_end (void);
static void sub_error (void);
static void type_error (void);
static void option_error (void);

#define YYSTYPE char *
#define GF_CMD_BUFFER_LEN (32 * GF_UNIT_KB)

int yyerror (const char *);
int yylex ();
%}


%%
SECTIONS: SECTION | SECTIONS SECTION;

SECTION: SECTION_HEADER SECTION_DATA SECTION_FOOTER;
SECTION_HEADER: SECTION_BEGIN WORD {if( -1 == new_section ($2)) { YYABORT; } };
SECTION_FOOTER: SECTION_END {if( -1 == section_end ()) { YYABORT; } };

SECTION_DATA: TYPE_LINE OPTIONS_LINE SUBSECTION_LINE OPTIONS_LINE |
              TYPE_LINE SUBSECTION_LINE OPTIONS_LINE |
              TYPE_LINE OPTIONS_LINE SUBSECTION_LINE |
              TYPE_LINE SUBSECTION_LINE |
              TYPE_LINE OPTIONS_LINE |
              OPTIONS_LINE SUBSECTION_LINE OPTIONS_LINE | /* error case */
              OPTIONS_LINE;  /* error case */

TYPE_LINE: TYPE WORD {if ( -1 == section_type ($2)) { YYABORT; }} | TYPE { type_error(); YYABORT; };

SUBSECTION_LINE: SUBSECTION WORDS | SUBSECTION { sub_error (); YYABORT; };

OPTIONS_LINE: OPTION_LINE | OPTIONS_LINE OPTION_LINE;

OPTION_LINE: OPTION WORD WORD {if(-1 == section_option($2,$3)){YYABORT;} } |
	     OPTION WORD { option_error (); YYABORT; } |
	     OPTION { option_error (); YYABORT; };

WORDS: WORD {if (-1 == section_sub ($1)) {YYABORT; } } | WORDS WORD { if (-1 == section_sub ($2)) { YYABORT; } };
WORD: ID | STRING_TOK ;
%%

/* Head of the list of every volume parsed so far (most recent first). */
xlator_t *complete_tree = NULL;
/* Volume whose section is currently open; NULL between sections. */
xlator_t *tree = NULL;
/* Context stamped onto every node created by new_section(). */
glusterfs_ctx_t *gctx;

/* Report a 'type' keyword that has no value. */
static void
type_error (void)
{
	extern int yylineno;

	fprintf (stderr, "volume %s, before line %d: specify which 'type' "
		 "you need\n",
		 complete_tree->name, yylineno);
	gf_log ("parser", GF_LOG_ERROR,
		"volume %s, before line %d: specify which 'type' you need",
		complete_tree->name, yylineno);
	return;
}

/* Report a 'subvolumes' keyword that has no value. */
static void
sub_error (void)
{
	extern int yylineno;

	fprintf (stderr, "volume %s, before line %d: specify what all "
		 "'subvolumes' you need for volume\n",
		 complete_tree->name, yylineno);
	gf_log ("parser", GF_LOG_ERROR,
		"volume %s, before line %d: specify what all 'subvolumes' "
		"you need for volume",
		complete_tree->name, yylineno);
	return;
}

/* Report an 'option' keyword that is missing its key and/or value. */
static void
option_error (void)
{
	extern int yylineno;

	fprintf (stderr, "volume %s, before line %d: you need to specify "
		 "<key> <value> pair for 'option' token\n",
		 complete_tree->name, yylineno);
	gf_log ("parser", GF_LOG_ERROR,
		"volume %s, before line %d: you need to specify "
		"<key> <value> pair for 'option' token",
		complete_tree->name, yylineno);
	return;
}

/* Free every node reachable from `tree` through ->next, together with each
 * node's option dictionary and name. Returns -1 when `tree` is NULL. */
static int
cut_tree (xlator_t *tree)
{
	xlator_t *trav = tree, *prev = tree;

	if (!tree) {
		gf_log ("parser", GF_LOG_DEBUG, "Translator tree not found");
		return -1;
	}

	gf_log ("parser", GF_LOG_DEBUG, "Failed to build translator graph");

	while (prev) {
		trav = prev->next;
		dict_destroy (prev->options);
		FREE (prev->name);
		FREE (prev);
		prev = trav;
	}

	return 0;
}


/* Open a new volume section called `name`: reject duplicates, create the
 * node, push it on the front of complete_tree and make it current.
 * The node takes ownership of `name`. Returns 0, or -1 on error. */
static int
new_section (char *name)
{
	extern int yylineno;
	xlator_t *trav = complete_tree;
	xlator_t *node = NULL;

	if (!name) {
		gf_log ("parser", GF_LOG_DEBUG,
			"invalid argument name");
		return -1;
	}

	while (trav) {
		if (!strcmp (name, trav->name)) {
			fprintf (stderr,
				 "line %d: volume '%s' defined again\n",
				 yylineno, name);
			gf_log ("parser", GF_LOG_ERROR,
				"line %d: volume '%s' defined again",
				yylineno, name);
			return -1;
		}
		trav = trav->next;
	}

	/* allocate only after validation so error returns cannot leak it */
	node = (void *) calloc (1, sizeof (*node));
	if (!node) {
		gf_log ("parser", GF_LOG_ERROR,
			"out of memory while creating volume '%s'", name);
		return -1;
	}

	node->ctx = gctx;
	node->name = name;
	node->next = complete_tree;
	if (complete_tree)
		complete_tree->prev = node;
	node->options = get_new_dict ();
	complete_tree = node;

	tree = node;
	gf_log ("parser", GF_LOG_DEBUG, "New node for '%s'", name);

	return 0;
}

/* Bind the current volume to translator type `type`. Returns 0 or -1. */
static int
section_type (char *type)
{
	extern int yylineno;
	int32_t ret = -1;

	if (!type) {
		gf_log ("parser", GF_LOG_DEBUG, "invalid argument type");
		return -1;
	}

	ret = xlator_set_type (tree, type);
	if (ret) {
		fprintf (stderr, "volume '%s', line %d: type '%s' is not "
			 "valid or not found on this machine\n",
			 complete_tree->name, yylineno, type);
		gf_log ("parser", GF_LOG_ERROR,
			"volume '%s', line %d: type '%s' is not valid or "
			"not found on this machine",
			complete_tree->name, yylineno, type);
		return -1;
	}
	gf_log ("parser", GF_LOG_DEBUG, "Type:%s:%s", tree->name, type);

	return 0;
}


/* Store `key` = `value` in the current volume's options. A dict_set result
 * of 1 is reported as a duplicate option. Returns 0 or -1. */
static int
section_option (char *key, char *value)
{
	extern int yylineno;

	int ret = 0;

	if (!key || !value){
		fprintf (stderr, "invalid argument\n");
		gf_log ("parser", GF_LOG_ERROR, "invalid argument");
		return -1;
	}

	ret = dict_set (tree->options, key, str_to_data (value));

	if (ret == 1) {
		gf_log ("parser", GF_LOG_ERROR,
			"volume '%s', line %d: duplicate entry "
			"('option %s') present",
			tree->name, yylineno, key);
		return -1;
	}
	gf_log ("parser", GF_LOG_DEBUG, "Option:%s:%s:%s",
		tree->name, key, value);

	return 0;
}

/* Make the already-defined volume `sub` a child of the current volume and
 * record the current volume among `sub`'s parents. Returns 0 or -1. */
static int
section_sub (char *sub)
{
	extern int yylineno;
	xlator_t *trav = complete_tree;
	xlator_list_t *xlchild = NULL, *tmp = NULL, *xlparent = NULL;

	if (!sub) {
		fprintf (stderr, "invalid subvolumes argument\n");
		gf_log ("parser", GF_LOG_ERROR, "invalid subvolumes argument");
		return -1;
	}

	while (trav) {
		if (!strcmp (sub,  trav->name))
			break;
		trav = trav->next;
	}
	if (!trav) {
		fprintf (stderr,
			 "volume '%s', line %d: subvolume '%s' is not "
			 "defined prior to usage\n",
			 complete_tree->name, yylineno, sub);
		gf_log ("parser", GF_LOG_ERROR,
			"volume '%s', line %d: subvolume '%s' is not defined "
			"prior to usage",
			complete_tree->name, yylineno, sub);
		return -1;
	}

	if (trav == tree) {
		fprintf (stderr, "volume '%s', line %d: has '%s' itself as "
			 "subvolume\n",
			 complete_tree->name, yylineno, sub);
		gf_log ("parser", GF_LOG_ERROR,
			"volume '%s', line %d: has '%s' itself as subvolume",
			complete_tree->name, yylineno, sub);
		return -1;
	}

	/* allocate both links first so a failure cannot leave the graph
	 * linked in one direction only */
	xlparent = (void *) calloc (1, sizeof (*xlparent));
	xlchild = (void *) calloc (1, sizeof (*xlchild));
	if (!xlparent || !xlchild) {
		gf_log ("parser", GF_LOG_ERROR,
			"out of memory while linking subvolume '%s'", sub);
		FREE (xlparent);
		FREE (xlchild);
		return -1;
	}

	xlparent->xlator = tree;

	tmp = trav->parents;
	if (tmp == NULL) {
		trav->parents = xlparent;
	} else {
		while (tmp->next)
			tmp = tmp->next;
		tmp->next = xlparent;
	}

	xlchild->xlator = trav;

	tmp = tree->children;
	if (tmp == NULL) {
		tree->children = xlchild;
	} else {
		while (tmp->next)
			tmp = tmp->next;
		tmp->next = xlchild;
	}

	gf_log ("parser", GF_LOG_DEBUG, "child:%s->%s", tree->name, sub);

	return 0;
}

/* Close the current section; it must have fops and mops set by then. */
static int
section_end (void)
{
	if (!tree->fops || !tree->mops) {
		fprintf (stderr,
			 "\"type\" not specified for volume %s\n", tree->name);
		gf_log ("parser", GF_LOG_ERROR,
			"\"type\" not specified for volume %s", tree->name);
		return -1;
	}
	gf_log ("parser", GF_LOG_DEBUG, "end:%s", tree->name);

	tree = NULL;
	return 0;
}

int
yywrap ()
{
	return 1;
}

/* Parser error hook: print a diagnostic chosen from the offending token and
 * the parser state, then free the nodes reachable from the current section
 * and reset complete_tree. */
int
yyerror (const char *str)
{
	extern char *yytext;
	extern int yylineno;

	if (complete_tree && complete_tree->name)
	{
		if (!strcmp (yytext, "volume"))
		{
			fprintf (stderr,
				 "'end-volume' not defined for volume '%s'\n",
				 complete_tree->name);
			gf_log ("parser", GF_LOG_ERROR,
				"'end-volume' not defined for volume '%s'",
				complete_tree->name);
		}
		else if (!strcmp (yytext, "type"))
		{
			fprintf (stderr, "line %d: duplicate 'type' defined "
				 "for volume '%s'\n",
				 yylineno, complete_tree->name);
			gf_log ("parser", GF_LOG_ERROR,
				"line %d: duplicate 'type' defined for "
				"volume '%s'",
				yylineno, complete_tree->name);
		}
		else if (!strcmp (yytext, "subvolumes"))
		{
			fprintf (stderr, "line %d: duplicate 'subvolumes' "
				 "defined for volume '%s'\n",
				 yylineno, complete_tree->name);
			gf_log ("parser", GF_LOG_ERROR,
				"line %d: duplicate 'subvolumes' defined for "
				"volume '%s'",
				yylineno, complete_tree->name);
		}
		else if (tree)
		{
			fprintf (stderr,
				 "syntax error: line %d (volume '%s'): \"%s\""
				 "\nallowed tokens are 'volume', 'type', "
				 "'subvolumes', 'option', 'end-volume'\n",
				 yylineno, complete_tree->name,
				 yytext);

			gf_log ("parser", GF_LOG_ERROR,
				"syntax error: line %d (volume '%s'): \"%s\""
				"\nallowed tokens are 'volume', 'type', "
				"'subvolumes', 'option', 'end-volume'",
				yylineno, complete_tree->name,
				yytext);
		}
		else
		{
			fprintf (stderr,
				 "syntax error: line %d (just after volume "
				 "'%s'): \"%s\"\n(%s)\n",
				 yylineno, complete_tree->name,
				 yytext,
				 "allowed tokens are 'volume', 'type', "
				 "'subvolumes', 'option', 'end-volume'");
			gf_log ("parser", GF_LOG_ERROR,
				"syntax error: line %d (just after volume "
				"'%s'): \"%s\"\n(%s)",
				yylineno, complete_tree->name,
				yytext,
				"allowed tokens are 'volume', 'type', "
				"'subvolumes', 'option', 'end-volume'");
		}
	}
	else
	{
		fprintf (stderr,
			 "syntax error in line %d: \"%s\" \n"
			 "(allowed tokens are 'volume', 'type', "
			 "'subvolumes', 'option', 'end-volume')\n",
			 yylineno, yytext);
		gf_log ("parser", GF_LOG_ERROR,
			"syntax error in line %d: \"%s\" \n"
			"(allowed tokens are 'volume', 'type', "
			"'subvolumes', 'option', 'end-volume')\n",
			yylineno, yytext);
	}

	cut_tree (tree);
	complete_tree = NULL;
	return 0;
}

/*
 * Run `cmd` through popen and copy the first line of its output into
 * `result` (at most size - 1 bytes plus a terminator). A trailing newline,
 * if present, is removed. Returns the resulting length, or -1 on failure.
 *
 * NOTE(review): the command text comes verbatim from the volume file and is
 * executed by the shell; this is only acceptable while volume files are
 * trusted input.
 */
static int
execute_cmd (char *cmd, char *result, int size)
{
	FILE *fpp = NULL;
	int   ret = 0;

	if (!cmd || !result || size <= 0) {
		return -1;
	}

	fpp = popen (cmd, "r");
	if (!fpp)
	{
		gf_log ("parser", GF_LOG_ERROR, "%s: failed to popen", cmd);
		return -1;
	}

	/* honour the caller's buffer size rather than assuming GF_UNIT_KB */
	if (!fgets (result, size, fpp))
	{
		gf_log ("parser", GF_LOG_ERROR, "failed to read output of cmd (%s)", cmd);
		pclose (fpp);
		return -1;
	}

	ret = strlen (result);
	/* strip only a real newline; output without one keeps every byte */
	if (ret > 0 && result[ret - 1] == '\n') {
		ret--;
		result[ret] = '\0';
	}
	pclose (fpp);

	return ret;
}

/*
 * Copy `src` to `dst`, replacing every unescaped `...` span with the first
 * output line of the command it contains. `src` is modified in place (the
 * closing backtick is overwritten with a terminator). `dst_size` is the
 * capacity of `dst`; each command contributes at most GF_UNIT_KB bytes.
 * Returns the number of bytes written to `dst`, or -1 on error or when
 * `dst` would overflow. `dst` is not NUL terminated.
 */
static int
find_and_execute_cmds (char *src, char *dst, int dst_size)
{
	char  escaped = 0;
	char *cmd = NULL;
	char  in_backtick = 0;
	int   size = 0, ret = 0, room = 0;

	if (!src || !dst || dst_size <= 0) {
		return -1;
	}

	while (*src) {
		if (*src == '`' && !escaped) {
			if (in_backtick) {
				*src = '\0';

				room = dst_size - size;
				if (room > GF_UNIT_KB)
					room = GF_UNIT_KB;
				if (room < 2) {
					gf_log ("parser", GF_LOG_ERROR,
						"expanded line does not fit "
						"in %d bytes", dst_size);
					return -1;
				}

				ret = execute_cmd (cmd, dst, room);
				if (ret < 0) {
					return -1;
				}

				dst += ret;
				size += ret;
			} else {
				cmd = src + 1;
			}

			in_backtick = !in_backtick;
		} else if (!in_backtick) {
			if (size >= dst_size) {
				gf_log ("parser", GF_LOG_ERROR,
					"expanded line does not fit "
					"in %d bytes", dst_size);
				return -1;
			}
			*dst++ = *src;
			size++;
		}

		/* a backslash escapes the next character only */
		if (*src == '\\') {
			escaped = !escaped;
		} else {
			escaped = 0;
		}

		src++;
	}

	return size;
}


/* Copy `srcfp` to `dstfp` line by line with backtick commands expanded.
 * Both streams are rewound before and after. Returns 0 or -1. */
static int
parse_backtick (FILE *srcfp, FILE *dstfp)
{
	char  srcbuf[8 * GF_UNIT_KB] = {0, };
	char *dstbuf = NULL;
	int   ret = 0;
	int   size = 0;

	dstbuf = calloc (GF_CMD_BUFFER_LEN, 1);
	if (!dstbuf) {
		gf_log ("parser", GF_LOG_ERROR,
			"out of memory while expanding backticks");
		return -1;
	}

	fseek (srcfp, 0L, SEEK_SET);
	fseek (dstfp, 0L, SEEK_SET);

	while (fgets (srcbuf, sizeof (srcbuf), srcfp) != NULL) {
		size = find_and_execute_cmds (srcbuf, dstbuf,
					      GF_CMD_BUFFER_LEN);
		if (size < 0) {
			ret = -1;
			break;
		}
		if (size > 0 && fwrite (dstbuf, size, 1, dstfp) != 1) {
			gf_log ("parser", GF_LOG_ERROR,
				"writing the expanded volume file failed");
			ret = -1;
			break;
		}
	}

	fseek (srcfp, 0L, SEEK_SET);
	fseek (dstfp, 0L, SEEK_SET);
	FREE (dstbuf);
	return ret;
}

extern FILE *yyin;

/*
 * Parse the volume specification read from `fp` and return the head of the
 * resulting translator list, or NULL on any failure. Backtick commands are
 * expanded into a temporary file first, and the parser reads that file.
 * Every created node has its ctx set to `ctx`.
 */
xlator_t *
file_to_xlator_tree (glusterfs_ctx_t *ctx,
                     FILE *fp)
{
	int32_t   ret = 0;
	xlator_t *tmp_tree = NULL;
	FILE     *tmp_file = NULL;

	if (NULL == fp) {
		gf_log ("parser", GF_LOG_ERROR,
			"volume specification file stream is NULL");
		return NULL;
	}

	tmp_file = tmpfile ();
	if (NULL == tmp_file) {
		gf_log ("parser", GF_LOG_ERROR,
			"cannot create temporary file");
		return NULL;
	}

	if (fileno (fp) == -1) {
		gf_log ("parser", GF_LOG_ERROR,
			"cannot get file descriptor from volume specification file stream pointer");
		fclose (tmp_file);
		return NULL;
	}

	/* The former calloc + mmap(PROT_NONE, flags 0) pair was removed:
	 * the mapping was never read or unmapped, and its failure test
	 * (!mmap) could never see MAP_FAILED. */

	ret = parse_backtick (fp, tmp_file);
	if (ret < 0) {
		gf_log ("parser", GF_LOG_ERROR,
			"parsing of backticks failed");
		fclose (tmp_file);
		return NULL;
	}

	gctx = ctx;
	yyin = tmp_file;
	ret = yyparse ();

	fclose (tmp_file);

	/* yyparse: 0 success, 1 abort/syntax error, 2 memory exhaustion */
	if (ret != 0) {
		gf_log ("parser", GF_LOG_DEBUG,
			"parsing of volfile failed, please review it "
			"once more");
		tree = complete_tree = NULL;
		return NULL;
	}

	tmp_tree = complete_tree;
	tree = complete_tree = NULL;

	return tmp_tree;
}
+ + You should have received a copy of the GNU General Public License + along with this program. If not, see + . +*/ + +/* + This file defines MACROS and static inlines used to emulate a function + call over asynchronous communication with remote server +*/ + +#ifndef _STACK_H +#define _STACK_H + +#ifndef _CONFIG_H +#define _CONFIG_H +#include "config.h" +#endif + +struct _call_stack_t; +typedef struct _call_stack_t call_stack_t; +struct _call_frame_t; +typedef struct _call_frame_t call_frame_t; +struct _call_pool_t; +typedef struct _call_pool_t call_pool_t; + +#include "xlator.h" +#include "dict.h" +#include "list.h" +#include "common-utils.h" + + +typedef int32_t (*ret_fn_t) (call_frame_t *frame, + call_frame_t *prev_frame, + xlator_t *this, + int32_t op_ret, + int32_t op_errno, + ...); + +struct _call_pool_t { + union { + struct list_head all_frames; + struct { + call_stack_t *next_call; + call_stack_t *prev_call; + } all_stacks; + }; + int64_t cnt; + gf_lock_t lock; +}; + +struct _call_frame_t { + call_stack_t *root; /* stack root */ + call_frame_t *parent; /* previous BP */ + call_frame_t *next; + call_frame_t *prev; /* maintainence list */ + void *local; /* local variables */ + xlator_t *this; /* implicit object */ + ret_fn_t ret; /* op_return address */ + int32_t ref_count; + gf_lock_t lock; + void *cookie; /* unique cookie */ +}; + +struct _call_stack_t { + union { + struct list_head all_frames; + struct { + call_stack_t *next_call; + call_stack_t *prev_call; + }; + }; + call_pool_t *pool; + void *trans; + uint64_t unique; + void *state; /* pointer to request state */ + uid_t uid; + gid_t gid; + pid_t pid; + call_frame_t frames; + dict_t *req_refs; + dict_t *rsp_refs; + + int32_t op; + int8_t type; +}; + + +static inline void +FRAME_DESTROY (call_frame_t *frame) +{ + if (frame->next) + frame->next->prev = frame->prev; + if (frame->prev) + frame->prev->next = frame->next; + if (frame->local) + FREE (frame->local); + LOCK_DESTROY (&frame->lock); + FREE 
(frame); +} + + +static inline void +STACK_DESTROY (call_stack_t *stack) +{ + LOCK (&stack->pool->lock); + { + list_del_init (&stack->all_frames); + stack->pool->cnt--; + } + UNLOCK (&stack->pool->lock); + + if (stack->frames.local) + FREE (stack->frames.local); + + LOCK_DESTROY (&stack->frames.lock); + + while (stack->frames.next) { + FRAME_DESTROY (stack->frames.next); + } + FREE (stack); +} + + +#define cbk(x) cbk_##x + + +/* make a call */ +#define STACK_WIND(frame, rfn, obj, fn, params ...) \ + do { \ + call_frame_t *_new = NULL; \ + \ + _new = CALLOC (1, sizeof (call_frame_t)); \ + ERR_ABORT (_new); \ + typeof(fn##_cbk) tmp_cbk = rfn; \ + _new->root = frame->root; \ + _new->next = frame->root->frames.next; \ + _new->prev = &frame->root->frames; \ + if (frame->root->frames.next) \ + frame->root->frames.next->prev = _new; \ + frame->root->frames.next = _new; \ + _new->this = obj; \ + _new->ret = (ret_fn_t) tmp_cbk; \ + _new->parent = frame; \ + _new->cookie = _new; \ + LOCK_INIT (&_new->lock); \ + frame->ref_count++; \ + \ + fn (_new, obj, params); \ + } while (0) + + +/* make a call with a cookie */ +#define STACK_WIND_COOKIE(frame, rfn, cky, obj, fn, params ...) \ + do { \ + call_frame_t *_new = CALLOC (1, \ + sizeof (call_frame_t)); \ + ERR_ABORT (_new); \ + typeof(fn##_cbk) tmp_cbk = rfn; \ + _new->root = frame->root; \ + _new->next = frame->root->frames.next; \ + _new->prev = &frame->root->frames; \ + if (frame->root->frames.next) \ + frame->root->frames.next->prev = _new; \ + frame->root->frames.next = _new; \ + _new->this = obj; \ + _new->ret = (ret_fn_t) tmp_cbk; \ + _new->parent = frame; \ + _new->cookie = cky; \ + LOCK_INIT (&_new->lock); \ + frame->ref_count++; \ + fn##_cbk = rfn; \ + \ + fn (_new, obj, params); \ + } while (0) + + +/* return from function */ +#define STACK_UNWIND(frame, params ...) 
\ + do { \ + ret_fn_t fn = frame->ret; \ + call_frame_t *_parent = frame->parent; \ + _parent->ref_count--; \ + fn (_parent, frame->cookie, _parent->this, params); \ + } while (0) + + +static inline call_frame_t * +copy_frame (call_frame_t *frame) +{ + call_stack_t *newstack = NULL; + call_stack_t *oldstack = NULL; + + if (!frame) { + return NULL; + } + + newstack = (void *) CALLOC (1, sizeof (*newstack)); + oldstack = frame->root; + + newstack->uid = oldstack->uid; + newstack->gid = oldstack->gid; + newstack->pid = oldstack->pid; + newstack->unique = oldstack->unique; + + newstack->frames.this = frame->this; + newstack->frames.root = newstack; + newstack->pool = oldstack->pool; + + LOCK_INIT (&newstack->frames.lock); + + LOCK (&oldstack->pool->lock); + { + list_add (&newstack->all_frames, &oldstack->all_frames); + newstack->pool->cnt++; + + } + UNLOCK (&oldstack->pool->lock); + + return &newstack->frames; +} + +static inline call_frame_t * +create_frame (xlator_t *xl, call_pool_t *pool) +{ + call_stack_t *stack = NULL; + + if (!xl || !pool) { + return NULL; + } + + stack = CALLOC (1, sizeof (*stack)); + if (!stack) + return NULL; + + stack->pool = pool; + stack->frames.root = stack; + stack->frames.this = xl; + + LOCK (&pool->lock); + { + list_add (&stack->all_frames, &pool->all_frames); + pool->cnt++; + } + UNLOCK (&pool->lock); + + LOCK_INIT (&stack->frames.lock); + + return &stack->frames; +} + + +#endif /* _STACK_H */ diff --git a/libglusterfs/src/timer.c b/libglusterfs/src/timer.c new file mode 100644 index 000000000..a6dbaaa83 --- /dev/null +++ b/libglusterfs/src/timer.c @@ -0,0 +1,220 @@ +/* + Copyright (c) 2007, 2008 Z RESEARCH, Inc. + This file is part of GlusterFS. + + GlusterFS is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published + by the Free Software Foundation; either version 3 of the License, + or (at your option) any later version. 
+ + GlusterFS is distributed in the hope that it will be useful, but + WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program. If not, see + . +*/ + +#ifndef _CONFIG_H +#define _CONFIG_H +#include "config.h" +#endif + +#include "timer.h" +#include "logging.h" +#include "common-utils.h" + +#define TS(tv) ((((unsigned long long) tv.tv_sec) * 1000000) + (tv.tv_usec)) + +gf_timer_t * +gf_timer_call_after (glusterfs_ctx_t *ctx, + struct timeval delta, + gf_timer_cbk_t cbk, + void *data) +{ + gf_timer_registry_t *reg = NULL; + gf_timer_t *event = NULL; + gf_timer_t *trav = NULL; + unsigned long long at = 0L; + + if (ctx == NULL) + { + gf_log ("timer", GF_LOG_ERROR, "invalid argument"); + return NULL; + } + + reg = gf_timer_registry_init (ctx); + + if (!reg) { + gf_log ("timer", GF_LOG_ERROR, "!reg"); + return NULL; + } + + event = CALLOC (1, sizeof (*event)); + if (!event) { + gf_log ("timer", GF_LOG_CRITICAL, "Not enough memory"); + return NULL; + } + gettimeofday (&event->at, NULL); + event->at.tv_usec = ((event->at.tv_usec + delta.tv_usec) % 1000000); + event->at.tv_sec += ((event->at.tv_usec + delta.tv_usec) / 1000000); + event->at.tv_sec += delta.tv_sec; + at = TS (event->at); + event->cbk = cbk; + event->data = data; + pthread_mutex_lock (®->lock); + { + trav = reg->active.prev; + while (trav != ®->active) { + if (TS (trav->at) < at) + break; + trav = trav->prev; + } + event->prev = trav; + event->next = event->prev->next; + event->prev->next = event; + event->next->prev = event; + } + pthread_mutex_unlock (®->lock); + return event; +} + +int32_t +gf_timer_call_stale (gf_timer_registry_t *reg, + gf_timer_t *event) +{ + if (reg == NULL || event == NULL) + { + gf_log ("timer", GF_LOG_ERROR, "invalid argument"); + return 0; + } + + event->next->prev = 
event->prev; + event->prev->next = event->next; + event->next = ®->stale; + event->prev = event->next->prev; + event->next->prev = event; + event->prev->next = event; + + return 0; +} + +int32_t +gf_timer_call_cancel (glusterfs_ctx_t *ctx, + gf_timer_t *event) +{ + gf_timer_registry_t *reg = NULL; + + if (ctx == NULL || event == NULL) + { + gf_log ("timer", GF_LOG_ERROR, "invalid argument"); + return 0; + } + + reg = gf_timer_registry_init (ctx); + if (!reg) { + gf_log ("timer", GF_LOG_ERROR, "!reg"); + return 0; + } + + pthread_mutex_lock (®->lock); + { + event->next->prev = event->prev; + event->prev->next = event->next; + } + pthread_mutex_unlock (®->lock); + + FREE (event); + return 0; +} + +void * +gf_timer_proc (void *ctx) +{ + gf_timer_registry_t *reg = NULL; + + if (ctx == NULL) + { + gf_log ("timer", GF_LOG_ERROR, "invalid argument"); + return NULL; + } + + reg = gf_timer_registry_init (ctx); + if (!reg) { + gf_log ("timer", GF_LOG_ERROR, "!reg"); + return NULL; + } + + while (!reg->fin) { + unsigned long long now; + struct timeval now_tv; + gf_timer_t *event = NULL; + + gettimeofday (&now_tv, NULL); + now = TS (now_tv); + while (1) { + unsigned long long at; + char need_cbk = 0; + + pthread_mutex_lock (®->lock); + { + event = reg->active.next; + at = TS (event->at); + if (event != ®->active && now >= at) { + need_cbk = 1; + gf_timer_call_stale (reg, event); + } + } + pthread_mutex_unlock (®->lock); + if (need_cbk) + event->cbk (event->data); + + else + break; + } + usleep (1000000); + } + + pthread_mutex_lock (®->lock); + { + while (reg->active.next != ®->active) { + gf_timer_call_cancel (ctx, reg->active.next); + } + + while (reg->stale.next != ®->stale) { + gf_timer_call_cancel (ctx, reg->stale.next); + } + } + pthread_mutex_unlock (®->lock); + pthread_mutex_destroy (®->lock); + FREE (((glusterfs_ctx_t *)ctx)->timer); + + return NULL; +} + +gf_timer_registry_t * +gf_timer_registry_init (glusterfs_ctx_t *ctx) +{ + if (ctx == NULL) + { + gf_log ("timer", 
GF_LOG_ERROR, "invalid argument"); + return NULL; + } + + if (!ctx->timer) { + gf_timer_registry_t *reg = NULL; + + ctx->timer = reg = CALLOC (1, sizeof (*reg)); + ERR_ABORT (reg); + pthread_mutex_init (®->lock, NULL); + reg->active.next = ®->active; + reg->active.prev = ®->active; + reg->stale.next = ®->stale; + reg->stale.prev = ®->stale; + + pthread_create (®->th, NULL, gf_timer_proc, ctx); + } + return ctx->timer; +} diff --git a/libglusterfs/src/timer.h b/libglusterfs/src/timer.h new file mode 100644 index 000000000..5152900f3 --- /dev/null +++ b/libglusterfs/src/timer.h @@ -0,0 +1,68 @@ +/* + Copyright (c) 2007, 2008 Z RESEARCH, Inc. + This file is part of GlusterFS. + + GlusterFS is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published + by the Free Software Foundation; either version 3 of the License, + or (at your option) any later version. + + GlusterFS is distributed in the hope that it will be useful, but + WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program. If not, see + . 
+*/ + +#ifndef _TIMER_H +#define _TIMER_H + +#ifndef _CONFIG_H +#define _CONFIG_H +#include "config.h" +#endif + +#include "glusterfs.h" +#include +#include + +typedef void (*gf_timer_cbk_t) (void *); + +struct _gf_timer { + struct _gf_timer *next, *prev; + struct timeval at; + gf_timer_cbk_t cbk; + void *data; +}; + +struct _gf_timer_registry { + pthread_t th; + char fin; + struct _gf_timer stale; + struct _gf_timer active; + pthread_mutex_t lock; +}; + +typedef struct _gf_timer gf_timer_t; +typedef struct _gf_timer_registry gf_timer_registry_t; + +gf_timer_t * +gf_timer_call_after (glusterfs_ctx_t *ctx, + struct timeval delta, + gf_timer_cbk_t cbk, + void *data); + +int32_t +gf_timer_call_cancel (glusterfs_ctx_t *ctx, + gf_timer_t *event); + +void * +gf_timer_proc (void *data); + +gf_timer_registry_t * +gf_timer_registry_init (glusterfs_ctx_t *ctx); + +#endif /* _TIMER_H */ diff --git a/libglusterfs/src/transport.c b/libglusterfs/src/transport.c new file mode 100644 index 000000000..8bd4ff010 --- /dev/null +++ b/libglusterfs/src/transport.c @@ -0,0 +1,339 @@ +/* + Copyright (c) 2006, 2007, 2008 Z RESEARCH, Inc. + This file is part of GlusterFS. + + GlusterFS is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published + by the Free Software Foundation; either version 3 of the License, + or (at your option) any later version. + + GlusterFS is distributed in the hope that it will be useful, but + WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program. If not, see + . 
+*/ + +#include +#include +#include +#include +#include +#include + +#ifndef _CONFIG_H +#define _CONFIG_H +#include "config.h" +#endif + +#include "logging.h" +#include "transport.h" +#include "glusterfs.h" +#include "xlator.h" +#include "list.h" + + +transport_t * +transport_load (dict_t *options, + xlator_t *xl) +{ + struct transport *trans = NULL, *return_trans = NULL; + char *addr_family = NULL; + char *name = NULL; + void *handle = NULL; + char *type = NULL; + char str[] = "ERROR"; + int32_t ret = -1; + int8_t is_tcp = 0, is_unix = 0, is_ibsdp = 0; + volume_opt_list_t *vol_opt = NULL; + + GF_VALIDATE_OR_GOTO("transport", options, fail); + GF_VALIDATE_OR_GOTO("transport", xl, fail); + + trans = CALLOC (1, sizeof (struct transport)); + GF_VALIDATE_OR_GOTO("transport", trans, fail); + + trans->xl = xl; + type = str; + + /* Backward compatibility */ + ret = dict_get_str (options, "transport-type", &type); + if (ret < 0) { + ret = dict_set_str (options, "transport-type", "socket"); + if (ret < 0) + gf_log ("dict", GF_LOG_DEBUG, + "setting transport-type failed"); + ret = dict_get_str (options, "transport.address-family", + &addr_family); + if (ret < 0) { + ret = dict_get_str (options, "address-family", + &addr_family); + } + + if (ret < 0) { + ret = dict_set_str (options, + "transport.address-family", + "inet"); + if (ret < 0) { + gf_log ("dict", GF_LOG_ERROR, + "setting address-family failed"); + } + } + + gf_log ("transport", GF_LOG_WARNING, + "missing 'option transport-type'. defaulting to " + "\"socket\" (%s)", addr_family?addr_family:"inet"); + } else { + { + /* Backword compatibility to handle * /client, + * * /server. 
+ */ + char *tmp = strchr (type, '/'); + if (tmp) + *tmp = '\0'; + } + + is_tcp = strcmp (type, "tcp"); + is_unix = strcmp (type, "unix"); + is_ibsdp = strcmp (type, "ib-sdp"); + if ((is_tcp == 0) || + (is_unix == 0) || + (is_ibsdp == 0)) { + if (is_tcp == 0) + ret = dict_set_str (options, + "transport.address-family", + "inet"); + if (is_unix == 0) + ret = dict_set_str (options, + "transport.address-family", + "unix"); + if (is_ibsdp == 0) + ret = dict_set_str (options, + "transport.address-family", + "inet-sdp"); + + if (ret < 0) + gf_log ("dict", GF_LOG_DEBUG, + "setting address-family failed"); + + ret = dict_set_str (options, + "transport-type", "socket"); + if (ret < 0) + gf_log ("dict", GF_LOG_DEBUG, + "setting transport-type failed"); + } + } + + ret = dict_get_str (options, "transport-type", &type); + if (ret < 0) { + FREE (trans); + gf_log ("transport", GF_LOG_ERROR, + "'option transport-type ' missing in volume '%s'", + xl->name); + goto fail; + } + + asprintf (&name, "%s/%s.so", TRANSPORTDIR, type); + gf_log ("transport", GF_LOG_DEBUG, + "attempt to load file %s", name); + + handle = dlopen (name, RTLD_NOW|RTLD_GLOBAL); + if (handle == NULL) { + gf_log ("transport", GF_LOG_ERROR, "%s", dlerror ()); + gf_log ("transport", GF_LOG_ERROR, + "volume '%s': transport-type '%s' is not valid or " + "not found on this machine", + xl->name, type); + FREE (name); + FREE (trans); + goto fail; + } + FREE (name); + + trans->ops = dlsym (handle, "tops"); + if (trans->ops == NULL) { + gf_log ("transport", GF_LOG_ERROR, + "dlsym (transport_ops) on %s", dlerror ()); + FREE (trans); + goto fail; + } + + trans->init = dlsym (handle, "init"); + if (trans->init == NULL) { + gf_log ("transport", GF_LOG_ERROR, + "dlsym (gf_transport_init) on %s", dlerror ()); + FREE (trans); + goto fail; + } + + trans->fini = dlsym (handle, "fini"); + if (trans->fini == NULL) { + gf_log ("transport", GF_LOG_ERROR, + "dlsym (gf_transport_fini) on %s", dlerror ()); + FREE (trans); + goto fail; + 
} + + vol_opt = CALLOC (1, sizeof (volume_opt_list_t)); + vol_opt->given_opt = dlsym (handle, "options"); + if (vol_opt->given_opt == NULL) { + gf_log ("transport", GF_LOG_DEBUG, + "volume option validation not specified"); + } else { + list_add_tail (&vol_opt->list, &xl->volume_options); + if (-1 == + validate_xlator_volume_options (xl, + vol_opt->given_opt)) { + gf_log ("transport", GF_LOG_ERROR, + "volume option validation failed"); + FREE (trans); + goto fail; + } + } + + ret = trans->init (trans); + if (ret != 0) { + gf_log ("transport", GF_LOG_ERROR, + "'%s' initialization failed", type); + FREE (trans); + goto fail; + } + + pthread_mutex_init (&trans->lock, NULL); + return_trans = trans; +fail: + return return_trans; +} + + +int32_t +transport_submit (transport_t *this, char *buf, int32_t len, + struct iovec *vector, int count, dict_t *refs) +{ + int32_t ret = -1; + + GF_VALIDATE_OR_GOTO("transport", this, fail); + GF_VALIDATE_OR_GOTO("transport", this->ops, fail); + + ret = this->ops->submit (this, buf, len, vector, count, refs); +fail: + return ret; +} + + +int32_t +transport_connect (transport_t *this) +{ + int ret = -1; + + GF_VALIDATE_OR_GOTO("transport", this, fail); + + ret = this->ops->connect (this); +fail: + return ret; +} + + +int32_t +transport_listen (transport_t *this) +{ + int ret = -1; + + GF_VALIDATE_OR_GOTO("transport", this, fail); + + ret = this->ops->listen (this); +fail: + return ret; +} + + +int32_t +transport_disconnect (transport_t *this) +{ + int32_t ret = -1; + + GF_VALIDATE_OR_GOTO("transport", this, fail); + + ret = this->ops->disconnect (this); +fail: + return ret; +} + + +int32_t +transport_destroy (transport_t *this) +{ + int32_t ret = -1; + + GF_VALIDATE_OR_GOTO("transport", this, fail); + + if (this->fini) + this->fini (this); + + pthread_mutex_destroy (&this->lock); + FREE (this); +fail: + return ret; +} + + +transport_t * +transport_ref (transport_t *this) +{ + transport_t *return_this = NULL; + + 
GF_VALIDATE_OR_GOTO("transport", this, fail); + + pthread_mutex_lock (&this->lock); + { + this->refcount ++; + } + pthread_mutex_unlock (&this->lock); + + return_this = this; +fail: + return return_this; +} + + +int32_t +transport_receive (transport_t *this, char **hdr_p, size_t *hdrlen_p, + char **buf_p, size_t *buflen_p) +{ + int32_t ret = -1; + + GF_VALIDATE_OR_GOTO("transport", this, fail); + + ret = this->ops->receive (this, hdr_p, hdrlen_p, buf_p, buflen_p); +fail: + return ret; +} + + +int32_t +transport_unref (transport_t *this) +{ + int32_t refcount = 0; + int32_t ret = -1; + + GF_VALIDATE_OR_GOTO("transport", this, fail); + + pthread_mutex_lock (&this->lock); + { + refcount = --this->refcount; + } + pthread_mutex_unlock (&this->lock); + + if (refcount == 0) { + this->xl->notify (this->xl, GF_EVENT_TRANSPORT_CLEANUP, this); + transport_destroy (this); + } + + ret = 0; +fail: + return ret; +} + diff --git a/libglusterfs/src/transport.h b/libglusterfs/src/transport.h new file mode 100644 index 000000000..be5c8b5df --- /dev/null +++ b/libglusterfs/src/transport.h @@ -0,0 +1,85 @@ +/* + Copyright (c) 2006, 2007, 2008 Z RESEARCH, Inc. + This file is part of GlusterFS. + + GlusterFS is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published + by the Free Software Foundation; either version 3 of the License, + or (at your option) any later version. + + GlusterFS is distributed in the hope that it will be useful, but + WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program. If not, see + . 
/*
 * A loaded transport instance.
 *
 * transport_load() allocates it and fills 'ops', 'init' and 'fini' from the
 * "tops", "init" and "fini" symbols of the transport shared object;
 * transport_ref()/transport_unref() manage its lifetime.
 */
struct transport {
        struct transport_ops  *ops;        /* backend operation table */
        void                  *private;    /* presumably backend-private
                                              state -- confirm in the
                                              transport modules */
        void                  *xl_private; /* presumably state owned by the
                                              translator using this
                                              transport -- confirm */
        pthread_mutex_t        lock;       /* held while 'refcount' is
                                              changed */
        int32_t                refcount;   /* when transport_unref() drops
                                              it to 0 the transport is
                                              destroyed */

        xlator_t              *xl;         /* translator this transport was
                                              loaded for; notified with
                                              GF_EVENT_TRANSPORT_CLEANUP
                                              before destruction */
        void                  *dnscache;   /* NOTE(review): use not visible
                                              in this file */
        data_t                *buf;        /* NOTE(review): use not visible
                                              in this file */
        int32_t              (*init) (transport_t *this); /* module init,
                                              called by transport_load() */
        void                 (*fini) (transport_t *this); /* module teardown,
                                              called by transport_destroy() */
        /*  int                  (*notify) (transport_t *this, int event, void *data); */
        peer_info_t     peerinfo;          /* presumably the remote endpoint
                                              -- confirm */
        peer_info_t     myinfo;            /* presumably the local endpoint
                                              -- confirm */
};
/*
 * SET_DEFAULT_FOP / SET_DEFAULT_MOP / SET_DEFAULT_CBK
 *
 * Fill one unset entry of a translator's operation table with the matching
 * default_<fn> implementation (declared in defaults.h).
 *
 * All three macros expand to code that reads a variable named 'xl' from the
 * scope they are used in; they do not take the translator as an argument.
 * They also dereference xl->fops / xl->mops / xl->cbks without checking
 * them, so those tables must already be set.
 */

/* File operation: xl->fops->fn = default_fn, unless already set */
#define SET_DEFAULT_FOP(fn) do {                        \
                if (!xl->fops->fn)                      \
                        xl->fops->fn = default_##fn;    \
        } while (0)

/* Management operation: xl->mops->fn = default_fn, unless already set */
#define SET_DEFAULT_MOP(fn) do {                        \
                if (!xl->mops->fn)                      \
                        xl->mops->fn = default_##fn;    \
        } while (0)

/* Callback: xl->cbks->fn = default_fn, unless already set */
#define SET_DEFAULT_CBK(fn) do {                        \
                if (!xl->cbks->fn)                      \
                        xl->cbks->fn = default_##fn;    \
        } while (0)
SET_DEFAULT_FOP (readdir); + SET_DEFAULT_FOP (fsyncdir); + SET_DEFAULT_FOP (access); + SET_DEFAULT_FOP (ftruncate); + SET_DEFAULT_FOP (fstat); + SET_DEFAULT_FOP (lk); + SET_DEFAULT_FOP (inodelk); + SET_DEFAULT_FOP (finodelk); + SET_DEFAULT_FOP (entrylk); + SET_DEFAULT_FOP (fentrylk); + SET_DEFAULT_FOP (lookup); + SET_DEFAULT_FOP (fchown); + SET_DEFAULT_FOP (fchmod); + SET_DEFAULT_FOP (setdents); + SET_DEFAULT_FOP (getdents); + SET_DEFAULT_FOP (checksum); + SET_DEFAULT_FOP (xattrop); + SET_DEFAULT_FOP (fxattrop); + + SET_DEFAULT_MOP (stats); + + SET_DEFAULT_CBK (release); + SET_DEFAULT_CBK (releasedir); + SET_DEFAULT_CBK (forget); + + if (!xl->notify) + xl->notify = default_notify; + + return; +} + +int +_volume_option_value_validate (xlator_t *xl, + data_pair_t *pair, + volume_option_t *opt) +{ + int i = 0; + int ret = -1; + uint64_t input_size = 0; + long long inputll = 0; + + /* Key is valid, validate the option */ + switch (opt->type) { + case GF_OPTION_TYPE_PATH: + { + /* Make sure the given path is valid */ + if (pair->value->data[0] != '/') { + gf_log (xl->name, GF_LOG_WARNING, + "option %s %s: '%s' is not an " + "absolute path name", + pair->key, pair->value->data, + pair->value->data); + } + ret = 0; + } + break; + case GF_OPTION_TYPE_INT: + { + /* Check the range */ + if (gf_string2longlong (pair->value->data, + &inputll) != 0) { + gf_log (xl->name, GF_LOG_ERROR, + "invalid number format \"%s\" in " + "\"option %s\"", + pair->value->data, pair->key); + goto out; + } + + if ((opt->min == 0) && (opt->max == 0)) { + gf_log (xl->name, GF_LOG_DEBUG, + "no range check required for " + "'option %s %s'", + pair->key, pair->value->data); + ret = 0; + break; + } + if ((inputll < opt->min) || + (inputll > opt->max)) { + gf_log (xl->name, GF_LOG_WARNING, + "'%lld' in 'option %s %s' is out of " + "range [%"PRId64" - %"PRId64"]", + inputll, pair->key, + pair->value->data, + opt->min, opt->max); + } + ret = 0; + } + break; + case GF_OPTION_TYPE_SIZET: + { + /* Check the 
range */ + if (gf_string2bytesize (pair->value->data, + &input_size) != 0) { + gf_log (xl->name, GF_LOG_ERROR, + "invalid size format \"%s\" in " + "\"option %s\"", + pair->value->data, pair->key); + goto out; + } + + if ((opt->min == 0) && (opt->max == 0)) { + gf_log (xl->name, GF_LOG_DEBUG, + "no range check required for " + "'option %s %s'", + pair->key, pair->value->data); + ret = 0; + break; + } + if ((input_size < opt->min) || + (input_size > opt->max)) { + gf_log (xl->name, GF_LOG_ERROR, + "'%"PRId64"' in 'option %s %s' is " + "out of range [%"PRId64" - %"PRId64"]", + input_size, pair->key, + pair->value->data, + opt->min, opt->max); + } + ret = 0; + } + break; + case GF_OPTION_TYPE_BOOL: + { + /* Check if the value is one of + '0|1|on|off|no|yes|true|false|enable|disable' */ + gf_boolean_t bool_value; + if (gf_string2boolean (pair->value->data, + &bool_value) != 0) { + gf_log (xl->name, GF_LOG_ERROR, + "option %s %s: '%s' is not a valid " + "boolean value", + pair->key, pair->value->data, + pair->value->data); + goto out; + } + ret = 0; + } + break; + case GF_OPTION_TYPE_XLATOR: + { + /* Check if the value is one of the xlators */ + xlator_t *xlopt = xl; + while (xlopt->prev) + xlopt = xlopt->prev; + + while (xlopt) { + if (strcmp (pair->value->data, + xlopt->name) == 0) { + ret = 0; + break; + } + xlopt = xlopt->next; + } + if (!xlopt) { + gf_log (xl->name, GF_LOG_ERROR, + "option %s %s: '%s' is not a " + "valid volume name", + pair->key, pair->value->data, + pair->value->data); + } + ret = 0; + } + break; + case GF_OPTION_TYPE_STR: + { + /* Check if the '*str' is valid */ + if (!opt->value) { + ret = 0; + goto out; + } + + for (i = 0; (i < ZR_OPTION_MAX_ARRAY_SIZE) && + opt->value[i]; i++) { + if (strcasecmp (opt->value[i], + pair->value->data) == 0) { + ret = 0; + break; + } + } + + if ((i == ZR_OPTION_MAX_ARRAY_SIZE) + || ((i < ZR_OPTION_MAX_ARRAY_SIZE) + && (!opt->value[i]))) { + /* enter here only if + * 1. 
reached end of opt->value array and haven't validated input + * OR + * 2. valid input list is less than ZR_OPTION_MAX_ARRAY_SIZE and + * input has not matched all possible input values. + */ + char given_array[4096] = {0,}; + for (i = 0; (i < ZR_OPTION_MAX_ARRAY_SIZE) && + opt->value[i]; i++) { + strcat (given_array, opt->value[i]); + strcat (given_array, ", "); + } + + gf_log (xl->name, GF_LOG_ERROR, + "option %s %s: '%s' is not valid " + "(possible options are %s)", + pair->key, pair->value->data, + pair->value->data, given_array); + + goto out; + } + } + break; + case GF_OPTION_TYPE_PERCENT: + { + uint32_t percent = 0; + + /* Check if the value is valid percentage */ + if (gf_string2percent (pair->value->data, + &percent) != 0) { + gf_log (xl->name, GF_LOG_ERROR, + "invalid percent format \"%s\" " + "in \"option %s\"", + pair->value->data, pair->key); + goto out; + } + + if ((percent < 0) || (percent > 100)) { + gf_log (xl->name, GF_LOG_ERROR, + "'%d' in 'option %s %s' is out of " + "range [0 - 100]", + percent, pair->key, + pair->value->data); + } + ret = 0; + } + break; + case GF_OPTION_TYPE_TIME: + { + uint32_t input_time = 0; + + /* Check if the value is valid percentage */ + if (gf_string2time (pair->value->data, + &input_time) != 0) { + gf_log (xl->name, + GF_LOG_ERROR, + "invalid time format \"%s\" in " + "\"option %s\"", + pair->value->data, pair->key); + goto out; + } + + if ((opt->min == 0) && (opt->max == 0)) { + gf_log (xl->name, GF_LOG_DEBUG, + "no range check required for " + "'option %s %s'", + pair->key, pair->value->data); + ret = 0; + goto out; + } + if ((input_time < opt->min) || + (input_time > opt->max)) { + gf_log (xl->name, GF_LOG_ERROR, + "'%"PRIu32"' in 'option %s %s' is " + "out of range [%"PRId64" - %"PRId64"]", + input_time, pair->key, + pair->value->data, + opt->min, opt->max); + } + ret = 0; + } + break; + case GF_OPTION_TYPE_ANY: + /* NO CHECK */ + ret = 0; + break; + } + + out: + return ret; +} + +int 
+validate_xlator_volume_options (xlator_t *xl, volume_option_t *opt) +{ + int i = 0; + int ret = -1; + int index = 0; + volume_option_t *trav = NULL; + data_pair_t *pairs = NULL; + + if (!opt) { + ret = 0; + goto out; + } + + /* First search for not supported options, if any report error */ + pairs = xl->options->members_list; + while (pairs) { + ret = -1; + for (index = 0; + opt[index].key && opt[index].key[0] ; index++) { + trav = &(opt[index]); + for (i = 0 ; + (i < ZR_VOLUME_MAX_NUM_KEY) && + trav->key[i]; i++) { + /* Check if the key is valid */ + if (fnmatch (trav->key[i], + pairs->key, FNM_NOESCAPE) == 0) { + ret = 0; + break; + } + } + if (!ret) { + if (i) { + gf_log (xl->name, GF_LOG_WARNING, + "option '%s' is deprecated, " + "preferred is '%s', continuing" + " with correction", + trav->key[i], trav->key[0]); + /* TODO: some bytes lost */ + pairs->key = strdup (trav->key[0]); + } + break; + } + } + if (!ret) { + ret = _volume_option_value_validate (xl, pairs, trav); + if (-1 == ret) { + goto out; + } + } + + pairs = pairs->next; + } + + ret = 0; + out: + return ret; +} + +int32_t +xlator_set_type (xlator_t *xl, + const char *type) +{ + char *name = NULL; + void *handle = NULL; + volume_opt_list_t *vol_opt = NULL; + + if (xl == NULL || type == NULL) { + gf_log ("xlator", GF_LOG_ERROR, "invalid argument"); + return -1; + } + + xl->type = strdup (type); + + asprintf (&name, "%s/%s.so", XLATORDIR, type); + + gf_log ("xlator", GF_LOG_DEBUG, "attempt to load file %s", name); + + handle = dlopen (name, RTLD_NOW|RTLD_GLOBAL); + if (!handle) { + gf_log ("xlator", GF_LOG_ERROR, "%s", dlerror ()); + return -1; + } + + if (!(xl->fops = dlsym (handle, "fops"))) { + gf_log ("xlator", GF_LOG_ERROR, "dlsym(fops) on %s", + dlerror ()); + return -1; + } + + if (!(xl->mops = dlsym (handle, "mops"))) { + gf_log ("xlator", GF_LOG_ERROR, "dlsym(mops) on %s", + dlerror ()); + return -1; + } + + if (!(xl->cbks = dlsym (handle, "cbks"))) { + gf_log ("xlator", GF_LOG_ERROR, 
"dlsym(cbks) on %s", + dlerror ()); + return -1; + } + + if (!(xl->init = dlsym (handle, "init"))) { + gf_log ("xlator", GF_LOG_ERROR, "dlsym(init) on %s", + dlerror ()); + return -1; + } + + if (!(xl->fini = dlsym (handle, "fini"))) { + gf_log ("xlator", GF_LOG_ERROR, "dlsym(fini) on %s", + dlerror ()); + return -1; + } + + if (!(xl->notify = dlsym (handle, "notify"))) { + gf_log ("xlator", GF_LOG_DEBUG, + "dlsym(notify) on %s -- neglecting", dlerror ()); + } + + INIT_LIST_HEAD (&xl->volume_options); + + vol_opt = CALLOC (1, sizeof (volume_opt_list_t)); + + if (!(vol_opt->given_opt = dlsym (handle, "options"))) { + dlerror (); + gf_log (xl->name, GF_LOG_DEBUG, + "strict option validation not enforced -- neglecting"); + } + list_add_tail (&vol_opt->list, &xl->volume_options); + + fill_defaults (xl); + + FREE (name); + return 0; +} + + +void +xlator_foreach (xlator_t *this, + void (*fn)(xlator_t *each, + void *data), + void *data) +{ + xlator_t *first = NULL; + + if (this == NULL || fn == NULL || data == NULL) { + gf_log ("xlator", GF_LOG_ERROR, "invalid argument"); + return; + } + + first = this; + + while (first->prev) + first = first->prev; + + while (first) { + fn (first, data); + first = first->next; + } +} + + +xlator_t * +xlator_search_by_name (xlator_t *any, const char *name) +{ + xlator_t *search = NULL; + + if (any == NULL || name == NULL) { + gf_log ("xlator", GF_LOG_ERROR, "invalid argument"); + return NULL; + } + + search = any; + + while (search->prev) + search = search->prev; + + while (search) { + if (!strcmp (search->name, name)) + break; + search = search->next; + } + + return search; +} + + +static int32_t +xlator_init_rec (xlator_t *xl) +{ + xlator_list_t *trav = NULL; + int32_t ret = 0; + + if (xl == NULL) { + gf_log ("xlator", GF_LOG_ERROR, "invalid argument"); + return 0; + } + + trav = xl->children; + + while (trav) { + ret = 0; + ret = xlator_init_rec (trav->xlator); + if (ret != 0) + break; + gf_log (trav->xlator->name, GF_LOG_DEBUG, + 
"Initialization done"); + trav = trav->next; + } + + if (!ret && !xl->ready) { + ret = -1; + if (xl->init) { + ret = xl->init (xl); + if (ret) { + gf_log ("xlator", GF_LOG_ERROR, + "initialization of volume '%s' failed," + " review your volfile again", + xl->name); + } else { + xl->init_succeeded = 1; + } + } else { + gf_log (xl->name, GF_LOG_ERROR, "No init() found"); + } + /* This 'xl' is checked */ + xl->ready = 1; + } + + return ret; +} + + +int32_t +xlator_tree_init (xlator_t *xl) +{ + xlator_t *top = NULL; + int32_t ret = 0; + + if (xl == NULL) { + gf_log ("xlator", GF_LOG_ERROR, "invalid argument"); + return 0; + } + + top = xl; +/* + while (top->parents) + top = top->parents->xlator; +*/ + ret = xlator_init_rec (top); + + if (ret == 0 && top->notify) { + top->notify (top, GF_EVENT_PARENT_UP, NULL); + } + + return ret; +} + + +static void +xlator_fini_rec (xlator_t *xl) +{ + xlator_list_t *trav = NULL; + + if (xl == NULL) { + gf_log ("xlator", GF_LOG_ERROR, "invalid argument"); + return; + } + + trav = xl->children; + + while (trav) { + if (!trav->xlator->init_succeeded) { + break; + } + + xlator_fini_rec (trav->xlator); + gf_log (trav->xlator->name, GF_LOG_DEBUG, "fini done"); + trav = trav->next; + } + + if (xl->init_succeeded) { + if (xl->fini) { + xl->fini (xl); + } else { + gf_log (xl->name, GF_LOG_ERROR, "No fini() found"); + } + xl->init_succeeded = 0; + } +} + + +void +xlator_tree_fini (xlator_t *xl) +{ + xlator_t *top = NULL; + + if (xl == NULL) { + gf_log ("xlator", GF_LOG_ERROR, "invalid argument"); + return; + } + + top = xl; + xlator_fini_rec (top); +} + + +int +xlator_tree_free (xlator_t *tree) +{ + xlator_t *trav = tree, *prev = tree; + + if (!tree) { + gf_log ("parser", GF_LOG_ERROR, "Translator tree not found"); + return -1; + } + + while (prev) { + trav = prev->next; + dict_destroy (prev->options); + FREE (prev->name); + FREE (prev->type); + FREE (prev); + prev = trav; + } + + return 0; +} + + +void +loc_wipe (loc_t *loc) +{ + if 
(loc->inode) { + inode_unref (loc->inode); + loc->inode = NULL; + } + if (loc->path) { + FREE (loc->path); + loc->path = NULL; + } + + if (loc->parent) { + inode_unref (loc->parent); + loc->parent = NULL; + } +} + + +int +loc_copy (loc_t *dst, loc_t *src) +{ + int ret = -1; + + dst->ino = src->ino; + + if (src->inode) + dst->inode = inode_ref (src->inode); + + if (src->parent) + dst->parent = inode_ref (src->parent); + + dst->path = strdup (src->path); + + if (!dst->path) + goto out; + + dst->name = strrchr (dst->path, '/'); + if (dst->name) + dst->name++; + + ret = 0; +out: + return ret; +} diff --git a/libglusterfs/src/xlator.h b/libglusterfs/src/xlator.h new file mode 100644 index 000000000..eadc9fd1a --- /dev/null +++ b/libglusterfs/src/xlator.h @@ -0,0 +1,842 @@ +/* + Copyright (c) 2006, 2007, 2008 Z RESEARCH, Inc. + This file is part of GlusterFS. + + GlusterFS is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published + by the Free Software Foundation; either version 3 of the License, + or (at your option) any later version. + + GlusterFS is distributed in the hope that it will be useful, but + WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program. If not, see + . 
/*
 * Location of a file system object as passed to the path-based file
 * operations (fop_*_t taking a loc_t *).
 *
 * loc_copy() and loc_wipe() in xlator.c show the ownership rules they
 * apply: 'path' is a heap string that loc_wipe() frees, 'name' points into
 * 'path' (just past its last '/'), and 'inode' / 'parent' are references
 * that loc_wipe() drops with inode_unref().
 */
struct _loc {
        const char *path;    /* path string; freed by loc_wipe() */
        const char *name;    /* last component, points inside 'path' */
        ino_t       ino;     /* inode number */
        inode_t    *inode;   /* referenced inode of the object, may be NULL */
        inode_t    *parent;  /* referenced inode, presumably of the parent
                                directory -- confirm; may be NULL */
};
/*
 * Management operation table of a translator.
 *
 * xlator_set_type() loads it from the translator module's "mops" symbol;
 * fill_defaults() installs default_stats when 'stats' is left NULL (it
 * does not touch the other three members).
 */
struct xlator_mops {
        mop_stats_t            stats;        /* request struct xlator_stats */
        mop_getspec_t          getspec;      /* request spec data for a key;
                                                see mop_getspec_cbk_t for
                                                the reply shape */

        mop_stats_cbk_t        stats_cbk;    /* reply handler for 'stats' */
        mop_getspec_cbk_t      getspec_cbk;  /* reply handler for 'getspec' */
};
int32_t op_ret, + int32_t op_errno); + +typedef int32_t (*fop_readlink_cbk_t) (call_frame_t *frame, + void *cookie, + xlator_t *this, + int32_t op_ret, + int32_t op_errno, + const char *path); + +typedef int32_t (*fop_mknod_cbk_t) (call_frame_t *frame, + void *cookie, + xlator_t *this, + int32_t op_ret, + int32_t op_errno, + inode_t *inode, + struct stat *buf); + +typedef int32_t (*fop_mkdir_cbk_t) (call_frame_t *frame, + void *cookie, + xlator_t *this, + int32_t op_ret, + int32_t op_errno, + inode_t *inode, + struct stat *buf); + +typedef int32_t (*fop_unlink_cbk_t) (call_frame_t *frame, + void *cookie, + xlator_t *this, + int32_t op_ret, + int32_t op_errno); + +typedef int32_t (*fop_rmdir_cbk_t) (call_frame_t *frame, + void *cookie, + xlator_t *this, + int32_t op_ret, + int32_t op_errno); + +typedef int32_t (*fop_symlink_cbk_t) (call_frame_t *frame, + void *cookie, + xlator_t *this, + int32_t op_ret, + int32_t op_errno, + inode_t *inode, + struct stat *buf); + +typedef int32_t (*fop_rename_cbk_t) (call_frame_t *frame, + void *cookie, + xlator_t *this, + int32_t op_ret, + int32_t op_errno, + struct stat *buf); + +typedef int32_t (*fop_link_cbk_t) (call_frame_t *frame, + void *cookie, + xlator_t *this, + int32_t op_ret, + int32_t op_errno, + inode_t *inode, + struct stat *buf); + +typedef int32_t (*fop_create_cbk_t) (call_frame_t *frame, + void *cookie, + xlator_t *this, + int32_t op_ret, + int32_t op_errno, + fd_t *fd, + inode_t *inode, + struct stat *buf); + +typedef int32_t (*fop_open_cbk_t) (call_frame_t *frame, + void *cookie, + xlator_t *this, + int32_t op_ret, + int32_t op_errno, + fd_t *fd); + +typedef int32_t (*fop_readv_cbk_t) (call_frame_t *frame, + void *cookie, + xlator_t *this, + int32_t op_ret, + int32_t op_errno, + struct iovec *vector, + int32_t count, + struct stat *stbuf); + +typedef int32_t (*fop_writev_cbk_t) (call_frame_t *frame, + void *cookie, + xlator_t *this, + int32_t op_ret, + int32_t op_errno, + struct stat *stbuf); + +typedef int32_t 
(*fop_flush_cbk_t) (call_frame_t *frame, + void *cookie, + xlator_t *this, + int32_t op_ret, + int32_t op_errno); + +typedef int32_t (*fop_fsync_cbk_t) (call_frame_t *frame, + void *cookie, + xlator_t *this, + int32_t op_ret, + int32_t op_errno); + +typedef int32_t (*fop_opendir_cbk_t) (call_frame_t *frame, + void *cookie, + xlator_t *this, + int32_t op_ret, + int32_t op_errno, + fd_t *fd); + +typedef int32_t (*fop_getdents_cbk_t) (call_frame_t *frame, + void *cookie, + xlator_t *this, + int32_t op_ret, + int32_t op_errno, + dir_entry_t *entries, + int32_t count); + +typedef int32_t (*fop_fsyncdir_cbk_t) (call_frame_t *frame, + void *cookie, + xlator_t *this, + int32_t op_ret, + int32_t op_errno); + +typedef int32_t (*fop_statfs_cbk_t) (call_frame_t *frame, + void *cookie, + xlator_t *this, + int32_t op_ret, + int32_t op_errno, + struct statvfs *buf); + +typedef int32_t (*fop_setxattr_cbk_t) (call_frame_t *frame, + void *cookie, + xlator_t *this, + int32_t op_ret, + int32_t op_errno); + +typedef int32_t (*fop_getxattr_cbk_t) (call_frame_t *frame, + void *cookie, + xlator_t *this, + int32_t op_ret, + int32_t op_errno, + dict_t *dict); + +typedef int32_t (*fop_removexattr_cbk_t) (call_frame_t *frame, + void *cookie, + xlator_t *this, + int32_t op_ret, + int32_t op_errno); + +typedef int32_t (*fop_lk_cbk_t) (call_frame_t *frame, + void *cookie, + xlator_t *this, + int32_t op_ret, + int32_t op_errno, + struct flock *flock); + +typedef int32_t (*fop_inodelk_cbk_t) (call_frame_t *frame, + void *cookie, + xlator_t *this, + int32_t op_ret, + int32_t op_errno); + +typedef int32_t (*fop_finodelk_cbk_t) (call_frame_t *frame, + void *cookie, + xlator_t *this, + int32_t op_ret, + int32_t op_errno); + +typedef int32_t (*fop_entrylk_cbk_t) (call_frame_t *frame, + void *cookie, + xlator_t *this, + int32_t op_ret, + int32_t op_errno); + +typedef int32_t (*fop_fentrylk_cbk_t) (call_frame_t *frame, + void *cookie, + xlator_t *this, + int32_t op_ret, + int32_t op_errno); + +typedef 
int32_t (*fop_setdents_cbk_t) (call_frame_t *frame, + void *cookie, + xlator_t *this, + int32_t op_ret, + int32_t op_errno); + +typedef int32_t (*fop_readdir_cbk_t) (call_frame_t *frame, + void *cookie, + xlator_t *this, + int32_t op_ret, + int32_t op_errno, + gf_dirent_t *entries); + +typedef int32_t (*fop_xattrop_cbk_t) (call_frame_t *frame, + void *cookie, + xlator_t *this, + int32_t op_ret, + int32_t op_errno, + dict_t *xattr); + +typedef int32_t (*fop_fxattrop_cbk_t) (call_frame_t *frame, + void *cookie, + xlator_t *this, + int32_t op_ret, + int32_t op_errno, + dict_t *xattr); + +typedef int32_t (*fop_lookup_t) (call_frame_t *frame, + xlator_t *this, + loc_t *loc, + dict_t *xattr_req); + +typedef int32_t (*fop_stat_t) (call_frame_t *frame, + xlator_t *this, + loc_t *loc); + +typedef int32_t (*fop_fstat_t) (call_frame_t *frame, + xlator_t *this, + fd_t *fd); + +typedef int32_t (*fop_chmod_t) (call_frame_t *frame, + xlator_t *this, + loc_t *loc, + mode_t mode); + +typedef int32_t (*fop_fchmod_t) (call_frame_t *frame, + xlator_t *this, + fd_t *fd, + mode_t mode); + +typedef int32_t (*fop_chown_t) (call_frame_t *frame, + xlator_t *this, + loc_t *loc, + uid_t uid, + gid_t gid); + +typedef int32_t (*fop_fchown_t) (call_frame_t *frame, + xlator_t *this, + fd_t *fd, + uid_t uid, + gid_t gid); + +typedef int32_t (*fop_truncate_t) (call_frame_t *frame, + xlator_t *this, + loc_t *loc, + off_t offset); + +typedef int32_t (*fop_ftruncate_t) (call_frame_t *frame, + xlator_t *this, + fd_t *fd, + off_t offset); + +typedef int32_t (*fop_utimens_t) (call_frame_t *frame, + xlator_t *this, + loc_t *loc, + struct timespec tv[2]); + +typedef int32_t (*fop_access_t) (call_frame_t *frame, + xlator_t *this, + loc_t *loc, + int32_t mask); + +typedef int32_t (*fop_readlink_t) (call_frame_t *frame, + xlator_t *this, + loc_t *loc, + size_t size); + +typedef int32_t (*fop_mknod_t) (call_frame_t *frame, + xlator_t *this, + loc_t *loc, + mode_t mode, + dev_t rdev); + +typedef int32_t 
(*fop_mkdir_t) (call_frame_t *frame, + xlator_t *this, + loc_t *loc, + mode_t mode); + +typedef int32_t (*fop_unlink_t) (call_frame_t *frame, + xlator_t *this, + loc_t *loc); + +typedef int32_t (*fop_rmdir_t) (call_frame_t *frame, + xlator_t *this, + loc_t *loc); + +typedef int32_t (*fop_symlink_t) (call_frame_t *frame, + xlator_t *this, + const char *linkname, + loc_t *loc); + +typedef int32_t (*fop_rename_t) (call_frame_t *frame, + xlator_t *this, + loc_t *oldloc, + loc_t *newloc); + +typedef int32_t (*fop_link_t) (call_frame_t *frame, + xlator_t *this, + loc_t *oldloc, + loc_t *newloc); + +typedef int32_t (*fop_create_t) (call_frame_t *frame, + xlator_t *this, + loc_t *loc, + int32_t flags, + mode_t mode, + fd_t *fd); + +typedef int32_t (*fop_open_t) (call_frame_t *frame, + xlator_t *this, + loc_t *loc, + int32_t flags, + fd_t *fd); + +typedef int32_t (*fop_readv_t) (call_frame_t *frame, + xlator_t *this, + fd_t *fd, + size_t size, + off_t offset); + +typedef int32_t (*fop_writev_t) (call_frame_t *frame, + xlator_t *this, + fd_t *fd, + struct iovec *vector, + int32_t count, + off_t offset); + +typedef int32_t (*fop_flush_t) (call_frame_t *frame, + xlator_t *this, + fd_t *fd); + +typedef int32_t (*fop_fsync_t) (call_frame_t *frame, + xlator_t *this, + fd_t *fd, + int32_t datasync); + +typedef int32_t (*fop_opendir_t) (call_frame_t *frame, + xlator_t *this, + loc_t *loc, + fd_t *fd); + +typedef int32_t (*fop_getdents_t) (call_frame_t *frame, + xlator_t *this, + fd_t *fd, + size_t size, + off_t offset, + int32_t flag); + +typedef int32_t (*fop_fsyncdir_t) (call_frame_t *frame, + xlator_t *this, + fd_t *fd, + int32_t datasync); + +typedef int32_t (*fop_statfs_t) (call_frame_t *frame, + xlator_t *this, + loc_t *loc); + +typedef int32_t (*fop_setxattr_t) (call_frame_t *frame, + xlator_t *this, + loc_t *loc, + dict_t *dict, + int32_t flags); + +typedef int32_t (*fop_getxattr_t) (call_frame_t *frame, + xlator_t *this, + loc_t *loc, + const char *name); + +typedef 
int32_t (*fop_removexattr_t) (call_frame_t *frame, + xlator_t *this, + loc_t *loc, + const char *name); + +typedef int32_t (*fop_lk_t) (call_frame_t *frame, + xlator_t *this, + fd_t *fd, + int32_t cmd, + struct flock *flock); + +typedef int32_t (*fop_inodelk_t) (call_frame_t *frame, + xlator_t *this, + loc_t *loc, + int32_t cmd, + struct flock *flock); + +typedef int32_t (*fop_finodelk_t) (call_frame_t *frame, + xlator_t *this, + fd_t *fd, + int32_t cmd, + struct flock *flock); + +typedef int32_t (*fop_entrylk_t) (call_frame_t *frame, + xlator_t *this, loc_t *loc, + const char *basename, entrylk_cmd cmd, + entrylk_type type); + +typedef int32_t (*fop_fentrylk_t) (call_frame_t *frame, + xlator_t *this, fd_t *fd, + const char *basename, entrylk_cmd cmd, + entrylk_type type); + +typedef int32_t (*fop_setdents_t) (call_frame_t *frame, + xlator_t *this, + fd_t *fd, + int32_t flags, + dir_entry_t *entries, + int32_t count); + +typedef int32_t (*fop_readdir_t) (call_frame_t *frame, + xlator_t *this, + fd_t *fd, + size_t size, + off_t offset); + +typedef int32_t (*fop_xattrop_t) (call_frame_t *frame, + xlator_t *this, + loc_t *loc, + gf_xattrop_flags_t optype, + dict_t *xattr); + +typedef int32_t (*fop_fxattrop_t) (call_frame_t *frame, + xlator_t *this, + fd_t *fd, + gf_xattrop_flags_t optype, + dict_t *xattr); + +struct xlator_fops { + fop_lookup_t lookup; + fop_stat_t stat; + fop_fstat_t fstat; + fop_chmod_t chmod; + fop_fchmod_t fchmod; + fop_chown_t chown; + fop_fchown_t fchown; + fop_truncate_t truncate; + fop_ftruncate_t ftruncate; + fop_utimens_t utimens; + fop_access_t access; + fop_readlink_t readlink; + fop_mknod_t mknod; + fop_mkdir_t mkdir; + fop_unlink_t unlink; + fop_rmdir_t rmdir; + fop_symlink_t symlink; + fop_rename_t rename; + fop_link_t link; + fop_create_t create; + fop_open_t open; + fop_readv_t readv; + fop_writev_t writev; + fop_flush_t flush; + fop_fsync_t fsync; + fop_opendir_t opendir; + fop_readdir_t readdir; + fop_fsyncdir_t fsyncdir; + 
fop_statfs_t statfs; + fop_setxattr_t setxattr; + fop_getxattr_t getxattr; + fop_removexattr_t removexattr; + fop_lk_t lk; + fop_inodelk_t inodelk; + fop_finodelk_t finodelk; + fop_entrylk_t entrylk; + fop_fentrylk_t fentrylk; + fop_setdents_t setdents; + fop_getdents_t getdents; + fop_checksum_t checksum; + fop_xattrop_t xattrop; + fop_fxattrop_t fxattrop; + + /* these entries are used for a typechecking hack in STACK_WIND _only_ */ + fop_lookup_cbk_t lookup_cbk; + fop_stat_cbk_t stat_cbk; + fop_fstat_cbk_t fstat_cbk; + fop_chmod_cbk_t chmod_cbk; + fop_fchmod_cbk_t fchmod_cbk; + fop_chown_cbk_t chown_cbk; + fop_fchown_cbk_t fchown_cbk; + fop_truncate_cbk_t truncate_cbk; + fop_ftruncate_cbk_t ftruncate_cbk; + fop_utimens_cbk_t utimens_cbk; + fop_access_cbk_t access_cbk; + fop_readlink_cbk_t readlink_cbk; + fop_mknod_cbk_t mknod_cbk; + fop_mkdir_cbk_t mkdir_cbk; + fop_unlink_cbk_t unlink_cbk; + fop_rmdir_cbk_t rmdir_cbk; + fop_symlink_cbk_t symlink_cbk; + fop_rename_cbk_t rename_cbk; + fop_link_cbk_t link_cbk; + fop_create_cbk_t create_cbk; + fop_open_cbk_t open_cbk; + fop_readv_cbk_t readv_cbk; + fop_writev_cbk_t writev_cbk; + fop_flush_cbk_t flush_cbk; + fop_fsync_cbk_t fsync_cbk; + fop_opendir_cbk_t opendir_cbk; + fop_readdir_cbk_t readdir_cbk; + fop_fsyncdir_cbk_t fsyncdir_cbk; + fop_statfs_cbk_t statfs_cbk; + fop_setxattr_cbk_t setxattr_cbk; + fop_getxattr_cbk_t getxattr_cbk; + fop_removexattr_cbk_t removexattr_cbk; + fop_lk_cbk_t lk_cbk; + fop_inodelk_cbk_t inodelk_cbk; + fop_finodelk_cbk_t finodelk_cbk; + fop_entrylk_cbk_t entrylk_cbk; + fop_fentrylk_cbk_t fentrylk_cbk; + fop_setdents_cbk_t setdents_cbk; + fop_getdents_cbk_t getdents_cbk; + fop_checksum_cbk_t checksum_cbk; + fop_xattrop_cbk_t xattrop_cbk; + fop_fxattrop_cbk_t fxattrop_cbk; +}; + +typedef int32_t (*cbk_forget_t) (xlator_t *this, + inode_t *inode); + +typedef int32_t (*cbk_release_t) (xlator_t *this, + fd_t *fd); + +struct xlator_cbks { + cbk_forget_t forget; + cbk_release_t release; + 
cbk_release_t releasedir; +}; + +typedef struct xlator_list { + xlator_t *xlator; + struct xlator_list *next; +} xlator_list_t; + +/* Add possible new type of option you may need */ +typedef enum { + GF_OPTION_TYPE_ANY = 0, + GF_OPTION_TYPE_STR, + GF_OPTION_TYPE_INT, + GF_OPTION_TYPE_SIZET, + GF_OPTION_TYPE_PERCENT, + GF_OPTION_TYPE_BOOL, + GF_OPTION_TYPE_XLATOR, + GF_OPTION_TYPE_PATH, + GF_OPTION_TYPE_TIME, +} volume_option_type_t; + +#define ZR_VOLUME_MAX_NUM_KEY 4 +#define ZR_OPTION_MAX_ARRAY_SIZE 64 + +/* Each translator should define this structure */ +typedef struct volume_options { + char *key[ZR_VOLUME_MAX_NUM_KEY]; + /* different key, same meaning */ + volume_option_type_t type; + int64_t min; /* -1 means no range */ + int64_t max; /* -1 means no range */ + char *value[ZR_OPTION_MAX_ARRAY_SIZE]; + /* If specified, will check for one of + the value from this array */ + char *description; /* about the key */ +} volume_option_t; + +typedef struct vol_opt_list { + struct list_head list; + volume_option_t *given_opt; +} volume_opt_list_t; + +struct _xlator { + /* Built during parsing */ + char *name; + char *type; + xlator_t *next; + xlator_t *prev; + xlator_list_t *parents; + xlator_list_t *children; + dict_t *options; + + /* Set after doing dlopen() */ + struct xlator_fops *fops; + struct xlator_mops *mops; + struct xlator_cbks *cbks; + struct list_head volume_options; /* list of volume_option_t */ + + void (*fini) (xlator_t *this); + int32_t (*init) (xlator_t *this); + event_notify_fn_t notify; + + /* Misc */ + glusterfs_ctx_t *ctx; + inode_table_t *itable; + char ready; + char trace; + char init_succeeded; + void *private; +}; + +int validate_xlator_volume_options (xlator_t *xl, volume_option_t *opt); + +int32_t xlator_set_type (xlator_t *xl, const char *type); + +xlator_t *file_to_xlator_tree (glusterfs_ctx_t *ctx, + FILE *fp); + + +int32_t xlator_tree_init (xlator_t *xl); +int32_t xlator_tree_free (xlator_t *xl); + +void xlator_tree_fini (xlator_t *xl); + 
+void xlator_foreach (xlator_t *this, + void (*fn) (xlator_t *each, + void *data), + void *data); + +xlator_t *xlator_search_by_name (xlator_t *any, const char *name); + +void inode_destroy_notify (inode_t *inode, const char *xlname); + +int loc_copy (loc_t *dst, loc_t *src); +#define loc_dup(src, dst) loc_copy(dst, src) +void loc_wipe (loc_t *loc); + +#define GF_STAT_PRINT_FMT_STR "%"PRIx64",%"PRIx64",%"PRIx32",%"PRIx32",%"PRIx32",%"PRIx32",%"PRIx64",%"PRIx64",%"PRIx32",%"PRIx64",%"PRIx32",%"PRIx32",%"PRIx32",%"PRIx32",%"PRIx32",%"PRIx32"\n" + +#define GF_STAT_SCAN_FMT_STR "%"SCNx64",%"SCNx64",%"SCNx32",%"SCNx32",%"SCNx32",%"SCNx32",%"SCNx64",%"SCNx64",%"SCNx32",%"SCNx64",%"SCNx32",%"SCNx32",%"SCNx32",%"SCNx32",%"SCNx32",%"SCNx32"\n" + +#define GF_STATFS_PRINT_FMT_STR "%"PRIx32",%"PRIx32",%"PRIx64",%"PRIx64",%"PRIx64",%"PRIx64",%"PRIx64",%"PRIx64",%"PRIx32",%"PRIx32",%"PRIx32"\n" + +#define GF_STATFS_SCAN_FMT_STR "%"SCNx32",%"SCNx32",%"SCNx64",%"SCNx64",%"SCNx64",%"SCNx64",%"SCNx64",%"SCNx64",%"SCNx32",%"SCNx32",%"SCNx32"\n" + +#endif /* _XLATOR_H */ + diff --git a/libglusterfsclient/Makefile.am b/libglusterfsclient/Makefile.am new file mode 100644 index 000000000..d471a3f92 --- /dev/null +++ b/libglusterfsclient/Makefile.am @@ -0,0 +1,3 @@ +SUBDIRS = src + +CLEANFILES = diff --git a/libglusterfsclient/src/Makefile.am b/libglusterfsclient/src/Makefile.am new file mode 100644 index 000000000..8382b2561 --- /dev/null +++ b/libglusterfsclient/src/Makefile.am @@ -0,0 +1,16 @@ +lib_LTLIBRARIES = libglusterfsclient.la +noinst_HEADERS = libglusterfsclient-internals.h +libglusterfsclient_HEADERS = libglusterfsclient.h +libglusterfsclientdir = $(includedir) + +libglusterfsclient_la_SOURCES = libglusterfsclient.c +libglusterfsclient_la_CFLAGS = -fPIC -Wall -pthread +libglusterfsclient_la_LIBADD = $(top_builddir)/libglusterfs/src/libglusterfs.la +libglusterfsclient_la_CPPFLAGS = -D_FILE_OFFSET_BITS=64 -D$(GF_HOST_OS) -D__USE_FILE_OFFSET64 -D_GNU_SOURCE 
-I$(top_srcdir)/libglusterfs/src -DDATADIR=\"$(localstatedir)\" -DCONFDIR=\"$(sysconfdir)/glusterfs\" $(GF_CFLAGS) +libglusterfsclient_la_LDFLAGS = -shared -nostartfiles + +CLEANFILES = + +$(top_builddir)/libglusterfs/src/libglusterfs.la: + $(MAKE) -C $(top_builddir)/libglusterfs/src/ all + diff --git a/libglusterfsclient/src/libglusterfsclient-internals.h b/libglusterfsclient/src/libglusterfsclient-internals.h new file mode 100755 index 000000000..c07da8180 --- /dev/null +++ b/libglusterfsclient/src/libglusterfsclient-internals.h @@ -0,0 +1,144 @@ +/* + Copyright (c) 2008 Z RESEARCH, Inc. + This file is part of GlusterFS. + + GlusterFS is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published + by the Free Software Foundation; either version 3 of the License, + or (at your option) any later version. + + GlusterFS is distributed in the hope that it will be useful, but + WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program. If not, see + . 
+*/ + +#ifndef __LIBGLUSTERFSCLIENT_INTERNALS_H +#define __LIBGLUSTERFSCLIENT_INTERNALS_H + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +typedef void (*sighandler_t) (int); +typedef struct list_head list_head_t; + +typedef struct libglusterfs_client_ctx { + glusterfs_ctx_t gf_ctx; + inode_table_t *itable; + pthread_t reply_thread; + call_pool_t pool; + uint32_t counter; + time_t lookup_timeout; + time_t stat_timeout; +}libglusterfs_client_ctx_t; + +typedef struct signal_handler { + int signo; + sighandler_t handler; + list_head_t next; +}libgf_client_signal_handler_t ; + +typedef struct { + pthread_mutex_t lock; + pthread_cond_t reply_cond; + call_stub_t *reply_stub; + char complete; + union { + struct { + char is_revalidate; + loc_t *loc; + int32_t size; + } lookup; + }fop; +}libgf_client_local_t; + +#define LIBGF_STACK_WIND_AND_WAIT(frame, rfn, obj, fn, params ...) \ + do { \ + STACK_WIND (frame, rfn, obj, fn, params); \ + pthread_mutex_lock (&local->lock); \ + { \ + while (!local->complete) { \ + pthread_cond_wait (&local->reply_cond, &local->lock); \ + } \ + } \ + pthread_mutex_unlock (&local->lock); \ + } while (0) + + + +#define LIBGF_CLIENT_SIGNAL(signal_handler_list, signo, handler) \ + do { \ + libgf_client_signal_handler_t *libgf_handler = CALLOC (1, sizeof (*libgf_handler)); \ + ERR_ABORT (libgf_handler); \ + libgf_handler->signo = signo; \ + libgf_handler->handler = signal (signo, handler); \ + list_add (&libgf_handler->next, signal_handler_list); \ + } while (0) + +#define LIBGF_INSTALL_SIGNAL_HANDLERS(signal_handlers) \ + do { \ + INIT_LIST_HEAD (&signal_handlers); \ + /* Handle SIGABORT and SIGSEGV */ \ + LIBGF_CLIENT_SIGNAL (&signal_handlers, SIGSEGV, gf_print_trace); \ + LIBGF_CLIENT_SIGNAL (&signal_handlers, SIGABRT, gf_print_trace); \ + LIBGF_CLIENT_SIGNAL (&signal_handlers, SIGHUP, gf_log_logrotate); \ + /* LIBGF_CLIENT_SIGNAL (SIGTERM, glusterfs_cleanup_and_exit); */ \ + } 
/*
 * LIBGF_CLIENT_FOP - issue fop `op` synchronously and collect its reply stub.
 *
 *   ctx    client context; stored in frame->root->state for the callback
 *   stub   lvalue that receives local->reply_stub once the call completes
 *   op     fop name; selects xl->fops->op and libgf_client_<op>_cbk
 *   local  lvalue holding the per-call state; allocated here when NULL.
 *          It is freed (through frame->local) before the macro returns, so
 *          the caller's pointer must not be used afterwards.
 *   args   arguments forwarded to the fop
 *
 * The condition variable and mutex initialised here are destroyed again
 * before `local` is freed; the wait in LIBGF_STACK_WIND_AND_WAIT has already
 * re-acquired and released the mutex by then.
 *
 * NOTE(review): `xl` is NULL when the top translator has no children and is
 * then dereferenced by xl->fops->op - confirm callers guarantee a child.
 */
#define LIBGF_CLIENT_FOP(ctx, stub, op, local, args ...)                \
        do {                                                            \
                call_frame_t *frame = get_call_frame_for_req (ctx, 1);  \
                xlator_t *xl = frame->this->children ?                  \
                        frame->this->children->xlator : NULL;           \
                dict_t *refs = frame->root->req_refs;                   \
                if (!local) {                                           \
                        local = CALLOC (1, sizeof (*local));            \
                }                                                       \
                ERR_ABORT (local);                                      \
                frame->local = local;                                   \
                frame->root->state = ctx;                               \
                pthread_cond_init (&local->reply_cond, NULL);           \
                pthread_mutex_init (&local->lock, NULL);                \
                LIBGF_STACK_WIND_AND_WAIT (frame, libgf_client_##op##_cbk, xl, xl->fops->op, args); \
                dict_unref (refs);                                      \
                stub = local->reply_stub;                               \
                /* release the synchronisation objects before the       \
                   memory that holds them goes away */                  \
                pthread_cond_destroy (&local->reply_cond);              \
                pthread_mutex_destroy (&local->lock);                   \
                FREE (frame->local);                                    \
                frame->local = NULL;                                    \
                STACK_DESTROY (frame->root);                            \
        } while (0)
+ + GlusterFS is distributed in the hope that it will be useful, but + WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program. If not, see + . +*/ + +#ifndef _GNU_SOURCE +#define _GNU_SOURCE +#endif + +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include "defaults.h" +#include +#include +#include "transport.h" +#include "event.h" +#include "libglusterfsclient.h" +#include "libglusterfsclient-internals.h" +#include "compat.h" +#include "compat-errno.h" + +#define XLATOR_NAME "libglusterfsclient" +#define LIBGLUSTERFS_INODE_TABLE_LRU_LIMIT 14057 + +typedef struct { + pthread_cond_t init_con_established; + pthread_mutex_t lock; + char complete; +}libglusterfs_client_private_t; + +typedef struct { + pthread_mutex_t lock; + uint32_t previous_lookup_time; + uint32_t previous_stat_time; + struct stat stbuf; +} libglusterfs_client_inode_ctx_t; + +typedef struct { + pthread_mutex_t lock; + off_t offset; + libglusterfs_client_ctx_t *ctx; +} libglusterfs_client_fd_ctx_t; + +typedef struct libglusterfs_client_async_local { + void *cbk_data; + union { + struct { + fd_t *fd; + glusterfs_readv_cbk_t cbk; + }readv_cbk; + + struct { + fd_t *fd; + glusterfs_writev_cbk_t cbk; + }writev_cbk; + + struct { + fd_t *fd; + }close_cbk; + + struct { + void *buf; + size_t size; + loc_t *loc; + char is_revalidate; + glusterfs_lookup_cbk_t cbk; + }lookup_cbk; + }fop; +}libglusterfs_client_async_local_t; + +static inline xlator_t * +libglusterfs_graph (xlator_t *graph); + +static int first_init = 1; +static int first_fini = 1; + + +char * +zr_build_process_uuid () +{ + char tmp_str[1024] = {0,}; + char hostname[256] = {0,}; + struct timeval tv = {0,}; + struct tm now = {0, }; + char now_str[32]; + + if (-1 == gettimeofday(&tv, NULL)) { + 
gf_log ("", GF_LOG_ERROR, + "gettimeofday: failed %s", + strerror (errno)); + } + + if (-1 == gethostname (hostname, 256)) { + gf_log ("", GF_LOG_ERROR, + "gethostname: failed %s", + strerror (errno)); + } + + localtime_r (&tv.tv_sec, &now); + strftime (now_str, 32, "%Y/%m/%d-%H:%M:%S", &now); + snprintf (tmp_str, 1024, "%s-%d-%s:%ld", + hostname, getpid(), now_str, tv.tv_usec); + + return strdup (tmp_str); +} + + +int32_t +libgf_client_forget (xlator_t *this, + inode_t *inode) +{ + libglusterfs_client_inode_ctx_t *ctx = NULL; + + inode_ctx_del (inode, this, (uint64_t *)ctx); + FREE (ctx); + + return 0; +} + + +int32_t +libgf_client_release (xlator_t *this, + fd_t *fd) +{ + libglusterfs_client_fd_ctx_t *fd_ctx = NULL; + data_t *fd_ctx_data = NULL; + + fd_ctx_data = dict_get (fd->ctx, XLATOR_NAME); + + fd_ctx = data_to_ptr (fd_ctx_data); + pthread_mutex_destroy (&fd_ctx->lock); + + return 0; +} + + +int32_t +libgf_client_releasedir (xlator_t *this, + fd_t *fd) +{ + libglusterfs_client_fd_ctx_t *fd_ctx = NULL; + data_t *fd_ctx_data = NULL; + + fd_ctx_data = dict_get (fd->ctx, XLATOR_NAME); + + fd_ctx = data_to_ptr (fd_ctx_data); + pthread_mutex_destroy (&fd_ctx->lock); + + return 0; +} + + +void *poll_proc (void *ptr) +{ + glusterfs_ctx_t *ctx = ptr; + + event_dispatch (ctx->event_pool); + + return NULL; +} + + +int32_t +xlator_graph_init (xlator_t *xl) +{ + xlator_t *trav = xl; + int32_t ret = -1; + + while (trav->prev) + trav = trav->prev; + + while (trav) { + if (!trav->ready) { + ret = xlator_tree_init (trav); + if (ret < 0) + break; + } + trav = trav->next; + } + + return ret; +} + + +void +xlator_graph_fini (xlator_t *xl) +{ + xlator_t *trav = xl; + while (trav->prev) + trav = trav->prev; + + while (trav) { + if (!trav->init_succeeded) { + break; + } + + xlator_tree_fini (trav); + trav = trav->next; + } +} + + +static void +libgf_client_loc_wipe (loc_t *loc) +{ + if (loc->path) { + FREE (loc->path); + } + + if (loc->parent) { + inode_unref (loc->parent); + 
loc->parent = NULL; + } + + if (loc->inode) { + inode_unref (loc->inode); + loc->inode = NULL; + } +} + + +static int32_t +libgf_client_loc_fill (loc_t *loc, const char *path, + ino_t ino, libglusterfs_client_ctx_t *ctx) +{ + int32_t op_ret = -1; + int32_t ret = 0; + char *dentry_path = NULL; + + loc->inode = NULL; + /* directory structure is flat. All files are immediate children of root */ + if (path) { + /* libglusterfsclient accepts only absolute paths */ + if (path[0] != '/') { + asprintf ((char **) &loc->path, "/%s", path); + } else { + loc->path = strdup (path); + } + + loc->inode = inode_search (ctx->itable, 1, path); + } else { + loc->inode = inode_search (ctx->itable, ino, NULL); + if (loc->inode == NULL) { + gf_log ("libglusterfsclient", GF_LOG_ERROR, + "cannot find inode for ino %"PRId64, + ino); + goto out; + } + + ret = inode_path (loc->inode, NULL, &dentry_path); + if (ret <= 0) { + gf_log ("libglusterfsclient", GF_LOG_ERROR, + "inode_path failed for %"PRId64, + loc->inode->ino); + inode_unref (loc->inode); + op_ret = ret; + goto out; + } else { + loc->path = dentry_path; + } + } + + loc->name = strrchr (loc->path, '/'); + if (loc->name) { + loc->name++; + } + + loc->parent = inode_ref (ctx->itable->root); + + if (loc->inode) { + loc->ino = loc->inode->ino; + } + + op_ret = 0; +out: + return op_ret; +} + + +static call_frame_t * +get_call_frame_for_req (libglusterfs_client_ctx_t *ctx, char d) +{ + call_pool_t *pool = ctx->gf_ctx.pool; + xlator_t *this = ctx->gf_ctx.graph; + call_frame_t *frame = NULL; + + + frame = create_frame (this, pool); + + frame->root->uid = geteuid (); + frame->root->gid = getegid (); + frame->root->pid = getpid (); + frame->root->unique = ctx->counter++; + + if (d) { + frame->root->req_refs = dict_ref (get_new_dict ()); + /* + TODO + dict_set (frame->root->req_refs, NULL, priv->buf); + */ + } + + return frame; +} + +void +libgf_client_fini (xlator_t *this) +{ + FREE (this->private); + return; +} + + +int32_t 
+libgf_client_notify (xlator_t *this, + int32_t event, + void *data, + ...) +{ + libglusterfs_client_private_t *priv = this->private; + + switch (event) + { + case GF_EVENT_CHILD_UP: + pthread_mutex_lock (&priv->lock); + { + priv->complete = 1; + pthread_cond_broadcast (&priv->init_con_established); + } + pthread_mutex_unlock (&priv->lock); + break; + + default: + default_notify (this, event, data); + } + + return 0; +} + +int32_t +libgf_client_init (xlator_t *this) +{ + return 0; +} + + +libglusterfs_handle_t +glusterfs_init (glusterfs_init_ctx_t *init_ctx) +{ + libglusterfs_client_ctx_t *ctx = NULL; + libglusterfs_client_private_t *priv = NULL; + FILE *specfp = NULL; + xlator_t *graph = NULL, *trav = NULL; + call_pool_t *pool = NULL; + int32_t ret = 0; + struct rlimit lim; + uint32_t xl_count = 0; + + if (!init_ctx || (!init_ctx->specfile && !init_ctx->specfp)) { + errno = EINVAL; + return NULL; + } + + ctx = CALLOC (1, sizeof (*ctx)); + if (!ctx) { + fprintf (stderr, + "libglusterfsclient: %s:%s():%d: out of memory\n", + __FILE__, __PRETTY_FUNCTION__, __LINE__); + + errno = ENOMEM; + return NULL; + } + + ctx->lookup_timeout = init_ctx->lookup_timeout; + ctx->stat_timeout = init_ctx->stat_timeout; + + pthread_mutex_init (&ctx->gf_ctx.lock, NULL); + + pool = ctx->gf_ctx.pool = CALLOC (1, sizeof (call_pool_t)); + if (!pool) { + errno = ENOMEM; + FREE (ctx); + return NULL; + } + + LOCK_INIT (&pool->lock); + INIT_LIST_HEAD (&pool->all_frames); + + ctx->gf_ctx.event_pool = event_pool_new (16384); + + lim.rlim_cur = RLIM_INFINITY; + lim.rlim_max = RLIM_INFINITY; + setrlimit (RLIMIT_CORE, &lim); + setrlimit (RLIMIT_NOFILE, &lim); + + ctx->gf_ctx.cmd_args.log_level = GF_LOG_WARNING; + + if (init_ctx->logfile) + ctx->gf_ctx.cmd_args.log_file = strdup (init_ctx->logfile); + else + ctx->gf_ctx.cmd_args.log_file = strdup ("/dev/stderr"); + + if (init_ctx->loglevel) { + if (!strncasecmp (init_ctx->loglevel, "DEBUG", strlen ("DEBUG"))) { + ctx->gf_ctx.cmd_args.log_level = 
GF_LOG_DEBUG; + } else if (!strncasecmp (init_ctx->loglevel, "WARNING", strlen ("WARNING"))) { + ctx->gf_ctx.cmd_args.log_level = GF_LOG_WARNING; + } else if (!strncasecmp (init_ctx->loglevel, "CRITICAL", strlen ("CRITICAL"))) { + ctx->gf_ctx.cmd_args.log_level = GF_LOG_CRITICAL; + } else if (!strncasecmp (init_ctx->loglevel, "NONE", strlen ("NONE"))) { + ctx->gf_ctx.cmd_args.log_level = GF_LOG_NONE; + } else if (!strncasecmp (init_ctx->loglevel, "ERROR", strlen ("ERROR"))) { + ctx->gf_ctx.cmd_args.log_level = GF_LOG_ERROR; + } else { + fprintf (stderr, + "libglusterfsclient: %s:%s():%d: Unrecognized log-level \"%s\", possible values are \"DEBUG|WARNING|[ERROR]|CRITICAL|NONE\"\n", __FILE__, __PRETTY_FUNCTION__, + __LINE__, init_ctx->loglevel); + FREE (ctx->gf_ctx.cmd_args.log_file); + FREE (ctx->gf_ctx.pool); + FREE (ctx->gf_ctx.event_pool); + FREE (ctx); + errno = EINVAL; + return NULL; + } + } + + if (first_init) + { + ret = gf_log_init (ctx->gf_ctx.cmd_args.log_file); + if (ret == -1) { + fprintf (stderr, + "libglusterfsclient: %s:%s():%d: failed to open logfile \"%s\"\n", + __FILE__, __PRETTY_FUNCTION__, __LINE__, + ctx->gf_ctx.cmd_args.log_file); + FREE (ctx->gf_ctx.cmd_args.log_file); + FREE (ctx->gf_ctx.pool); + FREE (ctx->gf_ctx.event_pool); + FREE (ctx); + return NULL; + } + + gf_log_set_loglevel (ctx->gf_ctx.cmd_args.log_level); + } + + if (init_ctx->specfp) { + specfp = init_ctx->specfp; + if (fseek (specfp, 0L, SEEK_SET)) { + fprintf (stderr, + "libglusterfsclient: %s:%s():%d: fseek on volume file stream failed (%s)\n", __FILE__, __PRETTY_FUNCTION__, __LINE__, strerror (errno)); + FREE (ctx->gf_ctx.cmd_args.log_file); + FREE (ctx->gf_ctx.pool); + FREE (ctx->gf_ctx.event_pool); + FREE (ctx); + return NULL; + } + } else if (init_ctx->specfile) { + specfp = fopen (init_ctx->specfile, "r"); + ctx->gf_ctx.cmd_args.volume_file = strdup (init_ctx->specfile); + } + + if (!specfp) { + fprintf (stderr, + "libglusterfsclient: %s:%s():%d: could not open volfile: 
%s\n", + __FILE__, __PRETTY_FUNCTION__, __LINE__, strerror (errno)); + FREE (ctx->gf_ctx.cmd_args.log_file); + FREE (ctx->gf_ctx.cmd_args.volume_file); + FREE (ctx->gf_ctx.pool); + FREE (ctx->gf_ctx.event_pool); + FREE (ctx); + return NULL; + } + + if (init_ctx->volume_name) { + ctx->gf_ctx.cmd_args.volume_name = strdup (init_ctx->volume_name); + } + + graph = file_to_xlator_tree (&ctx->gf_ctx, specfp); + if (!graph) { + fprintf (stderr, + "libglusterfsclient: %s:%s():%d: cannot create configuration graph (%s)\n", + __FILE__, __PRETTY_FUNCTION__, __LINE__, strerror (errno)); + + FREE (ctx->gf_ctx.cmd_args.log_file); + FREE (ctx->gf_ctx.cmd_args.volume_file); + FREE (ctx->gf_ctx.cmd_args.volume_name); + FREE (ctx->gf_ctx.pool); + FREE (ctx->gf_ctx.event_pool); + FREE (ctx); + return NULL; + } + + if (init_ctx->volume_name) { + trav = graph; + while (trav) { + if (strcmp (trav->name, init_ctx->volume_name) == 0) { + graph = trav; + break; + } + trav = trav->next; + } + } + + ctx->gf_ctx.graph = libglusterfs_graph (graph); + if (!ctx->gf_ctx.graph) { + fprintf (stderr, + "libglusterfsclient: %s:%s():%d: graph creation failed (%s)\n", + __FILE__, __PRETTY_FUNCTION__, __LINE__, strerror (errno)); + + xlator_tree_free (graph); + FREE (ctx->gf_ctx.cmd_args.log_file); + FREE (ctx->gf_ctx.cmd_args.volume_file); + FREE (ctx->gf_ctx.cmd_args.volume_name); + FREE (ctx->gf_ctx.pool); + FREE (ctx->gf_ctx.event_pool); + FREE (ctx); + return NULL; + } + graph = ctx->gf_ctx.graph; + + trav = graph; + while (trav) { + xl_count++; /* Getting this value right is very important */ + trav = trav->next; + } + + ctx->gf_ctx.xl_count = xl_count + 1; + + priv = CALLOC (1, sizeof (*priv)); + if (!priv) { + fprintf (stderr, + "libglusterfsclient: %s:%s():%d: cannot allocate memory (%s)\n", + __FILE__, __PRETTY_FUNCTION__, __LINE__, strerror (errno)); + + xlator_tree_free (graph); + FREE (ctx->gf_ctx.cmd_args.log_file); + FREE (ctx->gf_ctx.cmd_args.volume_file); + FREE 
(ctx->gf_ctx.cmd_args.volume_name); + FREE (ctx->gf_ctx.pool); + FREE (ctx->gf_ctx.event_pool); + /* inode_table_destroy (ctx->itable); */ + FREE (ctx); + + return NULL; + } + + pthread_cond_init (&priv->init_con_established, NULL); + pthread_mutex_init (&priv->lock, NULL); + + graph->private = priv; + ctx->itable = inode_table_new (LIBGLUSTERFS_INODE_TABLE_LRU_LIMIT, graph); + if (!ctx->itable) { + fprintf (stderr, + "libglusterfsclient: %s:%s():%d: cannot create inode table\n", + __FILE__, __PRETTY_FUNCTION__, __LINE__); + xlator_tree_free (graph); + FREE (ctx->gf_ctx.cmd_args.log_file); + FREE (ctx->gf_ctx.cmd_args.volume_file); + FREE (ctx->gf_ctx.cmd_args.volume_name); + + FREE (ctx->gf_ctx.pool); + FREE (ctx->gf_ctx.event_pool); + xlator_tree_free (graph); + /* TODO: destroy graph */ + /* inode_table_destroy (ctx->itable); */ + FREE (ctx); + + return NULL; + } + + if (xlator_graph_init (graph) == -1) { + fprintf (stderr, + "libglusterfsclient: %s:%s():%d: graph initialization failed\n", + __FILE__, __PRETTY_FUNCTION__, __LINE__); + xlator_tree_free (graph); + FREE (ctx->gf_ctx.cmd_args.log_file); + FREE (ctx->gf_ctx.cmd_args.volume_file); + FREE (ctx->gf_ctx.cmd_args.volume_name); + FREE (ctx->gf_ctx.pool); + FREE (ctx->gf_ctx.event_pool); + /* TODO: destroy graph */ + /* inode_table_destroy (ctx->itable); */ + FREE (ctx); + return NULL; + } + + /* Send notify to all translator saying things are ready */ + graph->notify (graph, GF_EVENT_PARENT_UP, graph); + + if (gf_timer_registry_init (&ctx->gf_ctx) == NULL) { + fprintf (stderr, + "libglusterfsclient: %s:%s():%d: timer init failed (%s)\n", + __FILE__, __PRETTY_FUNCTION__, __LINE__, strerror (errno)); + + xlator_graph_fini (graph); + xlator_tree_free (graph); + FREE (ctx->gf_ctx.cmd_args.log_file); + FREE (ctx->gf_ctx.cmd_args.volume_file); + FREE (ctx->gf_ctx.cmd_args.volume_name); + + FREE (ctx->gf_ctx.pool); + FREE (ctx->gf_ctx.event_pool); + /* TODO: destroy graph */ + /* inode_table_destroy 
(ctx->itable); */ + FREE (ctx); + return NULL; + } + + if ((ret = pthread_create (&ctx->reply_thread, NULL, poll_proc, (void *)&ctx->gf_ctx))) { + fprintf (stderr, + "libglusterfsclient: %s:%s():%d: reply thread creation failed\n", + __FILE__, __PRETTY_FUNCTION__, __LINE__); + xlator_graph_fini (graph); + xlator_tree_free (graph); + FREE (ctx->gf_ctx.cmd_args.log_file); + FREE (ctx->gf_ctx.cmd_args.volume_file); + FREE (ctx->gf_ctx.cmd_args.volume_name); + + FREE (ctx->gf_ctx.pool); + FREE (ctx->gf_ctx.event_pool); + /* TODO: destroy graph */ + /* inode_table_destroy (ctx->itable); */ + FREE (ctx); + return NULL; + } + + set_global_ctx_ptr (&ctx->gf_ctx); + ctx->gf_ctx.process_uuid = zr_build_process_uuid (); + + pthread_mutex_lock (&priv->lock); + { + while (!priv->complete) { + pthread_cond_wait (&priv->init_con_established, &priv->lock); + } + } + pthread_mutex_unlock (&priv->lock); + + first_init = 0; + + return ctx; +} + + +void +glusterfs_reset (void) +{ + first_fini = first_init = 1; +} + + +void +glusterfs_log_lock (void) +{ + gf_log_lock (); +} + + +void glusterfs_log_unlock (void) +{ + gf_log_unlock (); +} + + +int +glusterfs_fini (libglusterfs_client_ctx_t *ctx) +{ + FREE (ctx->gf_ctx.cmd_args.log_file); + FREE (ctx->gf_ctx.cmd_args.volume_file); + FREE (ctx->gf_ctx.cmd_args.volume_name); + FREE (ctx->gf_ctx.pool); + FREE (ctx->gf_ctx.event_pool); + ((gf_timer_registry_t *)ctx->gf_ctx.timer)->fin = 1; + /* inode_table_destroy (ctx->itable); */ + + xlator_graph_fini (ctx->gf_ctx.graph); + xlator_tree_free (ctx->gf_ctx.graph); + ctx->gf_ctx.graph = NULL; + + /* FREE (ctx->gf_ctx.specfile); */ + + /* TODO complete cleanup of timer */ + /*TODO + * destroy the reply thread + * destroy inode table + * FREE (ctx) + */ + + FREE (ctx); + + if (first_fini) { + ; + //gf_log_cleanup (); + } + + return 0; +} + + +int32_t +libgf_client_lookup_cbk (call_frame_t *frame, + void *cookie, + xlator_t *this, + int32_t op_ret, + int32_t op_errno, + inode_t *inode, + struct 
stat *buf, + dict_t *dict) +{ + libgf_client_local_t *local = frame->local; + libglusterfs_client_ctx_t *ctx = frame->root->state; + dict_t *xattr_req = NULL; + + if (op_ret == 0) { + /* flat directory structure */ + inode_t *parent = inode_search (ctx->itable, 1, NULL); + + inode_link (inode, parent, local->fop.lookup.loc->path, buf); + inode_lookup (inode); + inode_unref (parent); + } else { + if (local->fop.lookup.is_revalidate == 0 && op_errno == ENOENT) { + gf_log ("libglusterfsclient", GF_LOG_DEBUG, + "%"PRId64": (op_num=%d) %s => -1 (%s)", + frame->root->unique, frame->root->op, + local->fop.lookup.loc->path, + strerror (op_errno)); + } else { + gf_log ("libglusterfsclient", GF_LOG_ERROR, + "%"PRId64": (op_num=%d) %s => -1 (%s)", + frame->root->unique, frame->root->op, + local->fop.lookup.loc->path, + strerror (op_errno)); + } + + if (local->fop.lookup.is_revalidate == 1) { + int32_t ret = 0; + inode_unref (local->fop.lookup.loc->inode); + local->fop.lookup.loc->inode = inode_new (ctx->itable); + local->fop.lookup.is_revalidate = 2; + + if (local->fop.lookup.size > 0) { + xattr_req = dict_new (); + ret = dict_set (xattr_req, "glusterfs.content", + data_from_uint64 (local->fop.lookup.size)); + if (ret == -1) { + op_ret = -1; + /* TODO: set proper error code */ + op_errno = errno; + inode = NULL; + buf = NULL; + dict = NULL; + dict_unref (xattr_req); + goto out; + } + } + + STACK_WIND (frame, libgf_client_lookup_cbk, + FIRST_CHILD (this), FIRST_CHILD (this)->fops->lookup, + local->fop.lookup.loc, xattr_req); + + if (xattr_req) { + dict_unref (xattr_req); + xattr_req = NULL; + } + + return 0; + } + } + +out: + local->reply_stub = fop_lookup_cbk_stub (frame, NULL, op_ret, op_errno, inode, buf, dict); + + pthread_mutex_lock (&local->lock); + { + local->complete = 1; + pthread_cond_broadcast (&local->reply_cond); + } + pthread_mutex_unlock (&local->lock); + + return 0; +} + +int32_t +libgf_client_lookup (libglusterfs_client_ctx_t *ctx, + loc_t *loc, + struct stat 
*stbuf, + dict_t **dict, + dict_t *xattr_req) +{ + call_stub_t *stub = NULL; + int32_t op_ret; + libgf_client_local_t *local = NULL; + xlator_t *this = NULL; + int32_t ret = -1; + + local = CALLOC (1, sizeof (*local)); + if (loc->inode) { + local->fop.lookup.is_revalidate = 1; + loc->ino = loc->inode->ino; + } + else + loc->inode = inode_new (ctx->itable); + + local->fop.lookup.loc = loc; + + LIBGF_CLIENT_FOP(ctx, stub, lookup, local, loc, xattr_req); + + op_ret = stub->args.lookup_cbk.op_ret; + errno = stub->args.lookup_cbk.op_errno; + + if (!op_ret) { + time_t current = 0; + libglusterfs_client_inode_ctx_t *inode_ctx = NULL; + inode_t *inode = stub->args.lookup_cbk.inode; + uint64_t ptr = 0; + + this = ctx->gf_ctx.graph; + ret = inode_ctx_get (inode, this, &ptr); + if (ret == -1) { + inode_ctx = CALLOC (1, sizeof (*inode_ctx)); + ERR_ABORT (inode_ctx); + pthread_mutex_init (&inode_ctx->lock, NULL); + } else { + inode_ctx = (libglusterfs_client_inode_ctx_t *)(long)ptr; + } + + current = time (NULL); + + pthread_mutex_lock (&inode_ctx->lock); + { + inode_ctx->previous_lookup_time = current; + inode_ctx->previous_stat_time = current; + memcpy (&inode_ctx->stbuf, &stub->args.lookup_cbk.buf, + sizeof (inode_ctx->stbuf)); + } + pthread_mutex_unlock (&inode_ctx->lock); + + ret = inode_ctx_get (inode, this, NULL); + if (ret == -1) { + inode_ctx_put (inode, this, (uint64_t)(long)inode_ctx); + } + + if (stbuf) + *stbuf = stub->args.lookup_cbk.buf; + + if (dict) + *dict = dict_ref (stub->args.lookup_cbk.dict); + } + + call_stub_destroy (stub); + return op_ret; +} + +int +glusterfs_lookup (libglusterfs_handle_t handle, + const char *path, + void *buf, + size_t size, + struct stat *stbuf) +{ + int32_t op_ret = 0; + loc_t loc = {0, }; + libglusterfs_client_ctx_t *ctx = handle; + dict_t *dict = NULL; + dict_t *xattr_req = NULL; + + op_ret = libgf_client_loc_fill (&loc, path, 0, ctx); + if (op_ret < 0) { + gf_log ("libglusterfsclient", + GF_LOG_ERROR, + "libgf_client_loc_fill 
returned -1, returning EINVAL"); + errno = EINVAL; + goto out; + } + + if (size < 0) + size = 0; + + if (size > 0) { + xattr_req = dict_new (); + op_ret = dict_set (xattr_req, "glusterfs.content", data_from_uint64 (size)); + if (op_ret < 0) { + gf_log ("libglusterfsclient", + GF_LOG_ERROR, + "setting requested content size dictionary failed"); + goto out; + } + } + + op_ret = libgf_client_lookup (ctx, &loc, stbuf, &dict, xattr_req); + + if (!op_ret && size && stbuf && stbuf->st_size && dict && buf) { + data_t *mem_data = NULL; + void *mem = NULL; + + mem_data = dict_get (dict, "glusterfs.content"); + if (mem_data) { + mem = data_to_ptr (mem_data); + } + + if (mem && stbuf->st_size <= size) { + memcpy (buf, mem, stbuf->st_size); + } + } + + if (dict) { + dict_unref (dict); + } + + libgf_client_loc_wipe (&loc); +out: + if (xattr_req) { + dict_unref (xattr_req); + } + + return op_ret; +} + +int +libgf_client_lookup_async_cbk (call_frame_t *frame, + void *cookie, + xlator_t *this, + int32_t op_ret, + int32_t op_errno, + inode_t *inode, + struct stat *buf, + dict_t *dict) +{ + libglusterfs_client_async_local_t *local = frame->local; + glusterfs_lookup_cbk_t lookup_cbk = local->fop.lookup_cbk.cbk; + libglusterfs_client_ctx_t *ctx = frame->root->state; + dict_t *xattr_req = NULL; + int32_t ret = 0; + + if (op_ret == 0) { + time_t current = 0; + data_t *inode_ctx_data = NULL; + libglusterfs_client_inode_ctx_t *inode_ctx = NULL; + + /* flat directory structure */ + inode_t *parent = inode_search (ctx->itable, 1, NULL); + + inode_link (inode, parent, local->fop.lookup_cbk.loc->path, buf); + + inode_ctx_data = dict_get (inode->ctx, XLATOR_NAME); + if (inode_ctx_data) { + inode_ctx = data_to_ptr (inode_ctx_data); + } + + if (!inode_ctx) { + inode_ctx = CALLOC (1, sizeof (*inode_ctx)); + pthread_mutex_init (&inode_ctx->lock, NULL); + } + + current = time (NULL); + + pthread_mutex_lock (&inode_ctx->lock); + { + inode_ctx->previous_lookup_time = current; + 
inode_ctx->previous_stat_time = current; + memcpy (&inode_ctx->stbuf, buf, sizeof (inode_ctx->stbuf)); + } + pthread_mutex_unlock (&inode_ctx->lock); + + ret = inode_ctx_get (inode, this, NULL); + if (ret == -1) { + inode_ctx_put (inode, this, (uint64_t)(long)inode_ctx); + } + + inode_lookup (inode); + inode_unref (parent); + } else { + if (local->fop.lookup_cbk.is_revalidate == 0 && op_errno == ENOENT) { + gf_log ("libglusterfsclient", GF_LOG_DEBUG, + "%"PRId64": (op_num=%d) %s => -1 (%s)", + frame->root->unique, frame->root->op, + local->fop.lookup_cbk.loc->path, + strerror (op_errno)); + } else { + gf_log ("libglusterfsclient", GF_LOG_ERROR, + "%"PRId64": (op_num=%d) %s => -1 (%s)", + frame->root->unique, frame->root->op, + local->fop.lookup_cbk.loc->path, + strerror (op_errno)); + } + + if (local->fop.lookup_cbk.is_revalidate == 1) { + int32_t ret = 0; + inode_unref (local->fop.lookup_cbk.loc->inode); + local->fop.lookup_cbk.loc->inode = inode_new (ctx->itable); + local->fop.lookup_cbk.is_revalidate = 2; + + if (local->fop.lookup_cbk.size > 0) { + xattr_req = dict_new (); + ret = dict_set (xattr_req, "glusterfs.content", + data_from_uint64 (local->fop.lookup_cbk.size)); + if (ret == -1) { + op_ret = -1; + /* TODO: set proper error code */ + op_errno = errno; + inode = NULL; + buf = NULL; + dict = NULL; + dict_unref (xattr_req); + goto out; + } + } + + + STACK_WIND (frame, libgf_client_lookup_async_cbk, + FIRST_CHILD (this), FIRST_CHILD (this)->fops->lookup, + local->fop.lookup_cbk.loc, xattr_req); + + if (xattr_req) { + dict_unref (xattr_req); + xattr_req = NULL; + } + + return 0; + } + + } + +out: + if (!op_ret && local->fop.lookup_cbk.size && dict && local->fop.lookup_cbk.buf) { + data_t *mem_data = NULL; + void *mem = NULL; + + mem_data = dict_get (dict, "glusterfs.content"); + if (mem_data) { + mem = data_to_ptr (mem_data); + } + + if (mem && buf->st_size <= local->fop.lookup_cbk.size) { + memcpy (local->fop.lookup_cbk.buf, mem, buf->st_size); + } + } + + 
lookup_cbk(op_ret, op_errno, local->fop.lookup_cbk.buf, buf, local->cbk_data); + + libgf_client_loc_wipe (local->fop.lookup_cbk.loc); + free (local->fop.lookup_cbk.loc); + + free (local); + frame->local = NULL; + STACK_DESTROY (frame->root); + + return 0; +} + +int +glusterfs_lookup_async (libglusterfs_handle_t handle, + const char *path, + void *buf, + size_t size, + glusterfs_lookup_cbk_t cbk, + void *cbk_data) +{ + loc_t *loc = NULL; + libglusterfs_client_ctx_t *ctx = handle; + libglusterfs_client_async_local_t *local = NULL; + int32_t op_ret = 0; + dict_t *xattr_req = NULL; + + local = CALLOC (1, sizeof (*local)); + local->fop.lookup_cbk.is_revalidate = 1; + + loc = CALLOC (1, sizeof (*loc)); + op_ret = libgf_client_loc_fill (loc, path, 0, ctx); + if (op_ret < 0) { + gf_log ("libglusterfsclient", + GF_LOG_ERROR, + "libgf_client_loc_fill returned -1, returning EINVAL"); + errno = EINVAL; + goto out; + } + + if (!loc->inode) { + loc->inode = inode_new (ctx->itable); + local->fop.lookup_cbk.is_revalidate = 0; + } + + local->fop.lookup_cbk.cbk = cbk; + local->fop.lookup_cbk.buf = buf; + local->fop.lookup_cbk.size = size; + local->fop.lookup_cbk.loc = loc; + local->cbk_data = cbk_data; + + if (size < 0) + size = 0; + + if (size > 0) { + xattr_req = dict_new (); + op_ret = dict_set (xattr_req, "glusterfs.content", data_from_uint64 (size)); + if (op_ret < 0) { + dict_unref (xattr_req); + xattr_req = NULL; + goto out; + } + } + + LIBGF_CLIENT_FOP_ASYNC (ctx, + local, + libgf_client_lookup_async_cbk, + lookup, + loc, + xattr_req); + if (xattr_req) { + dict_unref (xattr_req); + xattr_req = NULL; + } + +out: + return op_ret; +} + +int32_t +libgf_client_getxattr_cbk (call_frame_t *frame, + void *cookie, + xlator_t *this, + int32_t op_ret, + int32_t op_errno, + dict_t *dict) +{ + + libgf_client_local_t *local = frame->local; + + local->reply_stub = fop_getxattr_cbk_stub (frame, NULL, op_ret, op_errno, dict); + + pthread_mutex_lock (&local->lock); + { + local->complete = 1; 
+ pthread_cond_broadcast (&local->reply_cond); + } + pthread_mutex_unlock (&local->lock); + + return 0; +} + +size_t +libgf_client_getxattr (libglusterfs_client_ctx_t *ctx, + loc_t *loc, + const char *name, + void *value, + size_t size) +{ + call_stub_t *stub = NULL; + int32_t op_ret = 0; + libgf_client_local_t *local = NULL; + + LIBGF_CLIENT_FOP (ctx, stub, getxattr, local, loc, name); + + op_ret = stub->args.getxattr_cbk.op_ret; + errno = stub->args.getxattr_cbk.op_errno; + + if (op_ret >= 0) { + /* + gf_log ("LIBGF_CLIENT", GF_LOG_DEBUG, + "%"PRId64": %s => %d", frame->root->unique, + state->fuse_loc.loc.path, op_ret); + */ + + data_t *value_data = dict_get (stub->args.getxattr_cbk.dict, (char *)name); + + if (value_data) { + int32_t copy_len = 0; + op_ret = value_data->len; /* Don't return the value for '\0' */ + + copy_len = size < value_data->len ? size : value_data->len; + memcpy (value, value_data->data, copy_len); + } else { + errno = ENODATA; + op_ret = -1; + } + } + + call_stub_destroy (stub); + return op_ret; +} + +ssize_t +glusterfs_getxattr (libglusterfs_client_ctx_t *ctx, + const char *path, + const char *name, + void *value, + size_t size) +{ + int32_t op_ret = 0; + loc_t loc = {0, }; + dict_t *dict = NULL; + + op_ret = libgf_client_loc_fill (&loc, path, 0, ctx); + if (op_ret < 0) { + gf_log ("libglusterfsclient", + GF_LOG_ERROR, + "libgf_client_loc_fill returned -1, returning EINVAL"); + errno = EINVAL; + goto out; + } + + op_ret = libgf_client_lookup (ctx, &loc, NULL, &dict, NULL); + if (op_ret == 0) { + data_t *value_data = dict_get (dict, (char *)name); + + if (value_data) { + int32_t copy_len = 0; + op_ret = value_data->len; /* Don't return the value for '\0' */ + + copy_len = size < value_data->len ? 
size : value_data->len; + memcpy (value, value_data->data, copy_len); + } else { + errno = ENODATA; + op_ret = -1; + } + } + + if (dict) { + dict_unref (dict); + } + + libgf_client_loc_wipe (&loc); + +out: + return op_ret; +} + +static int32_t +libgf_client_open_cbk (call_frame_t *frame, + void *cookie, + xlator_t *this, + int32_t op_ret, + int32_t op_errno, + fd_t *fd) +{ + libgf_client_local_t *local = frame->local; + + local->reply_stub = fop_open_cbk_stub (frame, NULL, op_ret, op_errno, fd); + + pthread_mutex_lock (&local->lock); + { + local->complete = 1; + pthread_cond_broadcast (&local->reply_cond); + } + pthread_mutex_unlock (&local->lock); + + return 0; +} + + +int +libgf_client_open (libglusterfs_client_ctx_t *ctx, + loc_t *loc, + fd_t *fd, + int flags) +{ + call_stub_t *stub = NULL; + int32_t op_ret = 0; + libgf_client_local_t *local = NULL; + + LIBGF_CLIENT_FOP (ctx, stub, open, local, loc, flags, fd); + + op_ret = stub->args.open_cbk.op_ret; + errno = stub->args.open_cbk.op_errno; + + call_stub_destroy (stub); + return op_ret; +} + +static int32_t +libgf_client_create_cbk (call_frame_t *frame, + void *cookie, + xlator_t *this, + int32_t op_ret, + int32_t op_errno, + fd_t *fd, + inode_t *inode, + struct stat *buf) +{ + libgf_client_local_t *local = frame->local; + + local->reply_stub = fop_create_cbk_stub (frame, NULL, op_ret, op_errno, fd, inode, buf); + + pthread_mutex_lock (&local->lock); + { + local->complete = 1; + pthread_cond_broadcast (&local->reply_cond); + } + pthread_mutex_unlock (&local->lock); + + return 0; +} + +int +libgf_client_creat (libglusterfs_client_ctx_t *ctx, + loc_t *loc, + fd_t *fd, + int flags, + mode_t mode) +{ + call_stub_t *stub = NULL; + int32_t op_ret = 0; + libgf_client_local_t *local = NULL; + xlator_t *this = NULL; + + LIBGF_CLIENT_FOP (ctx, stub, create, local, loc, flags, mode, fd); + + if (stub->args.create_cbk.op_ret == 0) { + inode_t *libgf_inode = NULL; + time_t current = 0; + libglusterfs_client_inode_ctx_t 
*inode_ctx = NULL; + + /* flat directory structure */ + inode_t *parent = inode_search (ctx->itable, 1, NULL); + libgf_inode = stub->args.create_cbk.inode; + inode_link (libgf_inode, parent, + loc->path, &stub->args.create_cbk.buf); + + inode_lookup (libgf_inode); + inode_unref (parent); + + inode_ctx = CALLOC (1, sizeof (*inode_ctx)); + ERR_ABORT (inode_ctx); + pthread_mutex_init (&inode_ctx->lock, NULL); + + current = time (NULL); + + inode_ctx->previous_lookup_time = current; + inode_ctx->previous_stat_time = current; + memcpy (&inode_ctx->stbuf, &stub->args.lookup_cbk.buf, + sizeof (inode_ctx->stbuf)); + + this = ctx->gf_ctx.graph; + inode_ctx_put (libgf_inode, this, (uint64_t)(long)inode_ctx); + } + + op_ret = stub->args.create_cbk.op_ret; + errno = stub->args.create_cbk.op_errno; + + call_stub_destroy (stub); + return op_ret; +} + +int32_t +libgf_client_opendir_cbk (call_frame_t *frame, + void *cookie, + xlator_t *this, + int32_t op_ret, + int32_t op_errno, + fd_t *fd) +{ + libgf_client_local_t *local = frame->local; + + local->reply_stub = fop_opendir_cbk_stub (frame, NULL, op_ret, op_errno, fd); + + pthread_mutex_lock (&local->lock); + { + local->complete = 1; + pthread_cond_broadcast (&local->reply_cond); + } + pthread_mutex_unlock (&local->lock); + + return 0; +} + +int +libgf_client_opendir (libglusterfs_client_ctx_t *ctx, + loc_t *loc, + fd_t *fd) +{ + call_stub_t *stub = NULL; + int32_t op_ret = 0; + libgf_client_local_t *local = NULL; + + LIBGF_CLIENT_FOP (ctx, stub, opendir, local, loc, fd); + + op_ret = stub->args.opendir_cbk.op_ret; + errno = stub->args.opendir_cbk.op_errno; + + call_stub_destroy (stub); + return 0; +} + +unsigned long +glusterfs_open (libglusterfs_client_ctx_t *ctx, + const char *path, + int flags, + mode_t mode) +{ + loc_t loc = {0, }; + long op_ret = 0; + fd_t *fd = NULL; + struct stat stbuf; + char lookup_required = 1; + int32_t ret = -1; + xlator_t *this = NULL; + + if (!ctx || !path || path[0] != '/') { + errno = EINVAL; + 
return 0; + } + + this = ctx->gf_ctx.graph; + + op_ret = libgf_client_loc_fill (&loc, path, 0, ctx); + if (op_ret < 0) { + gf_log ("libglusterfsclient", + GF_LOG_ERROR, + "libgf_client_loc_fill returned -1, returning EINVAL"); + errno = EINVAL; + fd = NULL; + goto out; + } + + if (loc.inode) { + libglusterfs_client_inode_ctx_t *inode_ctx = NULL; + time_t current, prev; + uint64_t ptr = 0; + + ret = inode_ctx_get (loc.inode, this, &ptr); + if (ret == 0) { + inode_ctx = (libglusterfs_client_inode_ctx_t *)(long)ptr; + memset (¤t, 0, sizeof (current)); + + pthread_mutex_lock (&inode_ctx->lock); + { + prev = inode_ctx->previous_lookup_time; + } + pthread_mutex_unlock (&inode_ctx->lock); + + current = time (NULL); + if (prev >= 0 && ctx->lookup_timeout >= (current - prev)) { + lookup_required = 0; + } + } + } + + if (lookup_required) { + op_ret = libgf_client_lookup (ctx, &loc, &stbuf, NULL, NULL); + if (!op_ret && ((flags & O_CREAT) == O_CREAT) && ((flags & O_EXCL) == O_EXCL)) { + errno = EEXIST; + op_ret = -1; + } + } + + if (!op_ret || (op_ret == -1 && errno == ENOENT && ((flags & O_CREAT) == O_CREAT))) { + fd = fd_create (loc.inode, 0); + fd->flags = flags; + + if (!op_ret) { + if (S_ISDIR (loc.inode->st_mode)) { + if (((flags & O_RDONLY) == O_RDONLY) && + ((flags & O_WRONLY) == 0) && + ((flags & O_RDWR) == 0)) { + op_ret = libgf_client_opendir (ctx, &loc, fd); + } else { + op_ret = -1; + errno = EISDIR; + } + } else { + op_ret = libgf_client_open (ctx, &loc, fd, flags); + } + } else { + op_ret = libgf_client_creat (ctx, &loc, fd, flags, mode); + } + + if (op_ret == -1) { + fd_unref (fd); + fd = NULL; + } else { + libglusterfs_client_fd_ctx_t *fd_ctx = NULL; + libglusterfs_client_inode_ctx_t *inode_ctx = NULL; + data_t *ctx_data = NULL; + + ctx_data = dict_get (fd->ctx, XLATOR_NAME); + if (!ctx_data) { + fd_ctx = CALLOC (1, sizeof (*fd_ctx)); + ERR_ABORT (fd_ctx); + pthread_mutex_init (&fd_ctx->lock, NULL); + } + + pthread_mutex_lock (&fd_ctx->lock); + { + 
fd_ctx->ctx = ctx; + } + pthread_mutex_unlock (&fd_ctx->lock); + + if (!ctx_data) { + dict_set (fd->ctx, XLATOR_NAME, data_from_dynptr (fd_ctx, sizeof (*fd_ctx))); + } + + if ((flags & O_TRUNC) && ((flags & O_RDWR) || (flags & O_WRONLY))) { + uint64_t ptr = 0; + ret = inode_ctx_get (fd->inode, this, &ptr); + if (ret == 0) { + inode_ctx = (libglusterfs_client_inode_ctx_t *)(long)ptr; + if (S_ISREG (inode_ctx->stbuf.st_mode)) { + inode_ctx->stbuf.st_size = inode_ctx->stbuf.st_blocks = 0; + } + } else { + gf_log ("libglusterfsclient", GF_LOG_WARNING, + "inode_ctx is NULL for inode (%p) belonging to fd (%p)", + fd->inode, fd); + } + } + } + } + + libgf_client_loc_wipe (&loc); + +out: + return (long)fd; +} + + +unsigned long +glusterfs_creat (libglusterfs_client_ctx_t *ctx, + const char *path, + mode_t mode) +{ + return glusterfs_open (ctx, path, + (O_CREAT | O_WRONLY | O_TRUNC), mode); +} + + +int32_t +libgf_client_flush_cbk (call_frame_t *frame, + void *cookie, + xlator_t *this, + int32_t op_ret, + int32_t op_errno) +{ + libgf_client_local_t *local = frame->local; + + local->reply_stub = fop_flush_cbk_stub (frame, NULL, op_ret, op_errno); + + pthread_mutex_lock (&local->lock); + { + local->complete = 1; + pthread_cond_broadcast (&local->reply_cond); + } + pthread_mutex_unlock (&local->lock); + + return 0; +} + + +int +libgf_client_flush (libglusterfs_client_ctx_t *ctx, fd_t *fd) +{ + call_stub_t *stub; + int32_t op_ret; + libgf_client_local_t *local = NULL; + + LIBGF_CLIENT_FOP (ctx, stub, flush, local, fd); + + op_ret = stub->args.flush_cbk.op_ret; + errno = stub->args.flush_cbk.op_errno; + + call_stub_destroy (stub); + return op_ret; +} + + +int +glusterfs_close (unsigned long fd) +{ + int32_t op_ret = -1; + data_t *fd_ctx_data = NULL; + libglusterfs_client_ctx_t *ctx = NULL; + libglusterfs_client_fd_ctx_t *fd_ctx = NULL; + + if (!fd) { + errno = EINVAL; + goto out; + } + + fd_ctx_data = dict_get (((fd_t *) fd)->ctx, XLATOR_NAME); + if (!fd_ctx_data) { + errno = 
EBADF; + goto out; + } + + fd_ctx = data_to_ptr (fd_ctx_data); + ctx = fd_ctx->ctx; + + op_ret = libgf_client_flush (ctx, (fd_t *)fd); + + fd_unref ((fd_t *)fd); + +out: + return op_ret; +} + +int32_t +libgf_client_setxattr_cbk (call_frame_t *frame, + void *cookie, + xlator_t *this, + int32_t op_ret, + int32_t op_errno) +{ + libgf_client_local_t *local = frame->local; + + local->reply_stub = fop_setxattr_cbk_stub (frame, NULL, op_ret, op_errno); + + pthread_mutex_lock (&local->lock); + { + local->complete = 1; + pthread_cond_broadcast (&local->reply_cond); + } + pthread_mutex_unlock (&local->lock); + + return 0; +} + +int +libgf_client_setxattr (libglusterfs_client_ctx_t *ctx, + loc_t *loc, + const char *name, + const void *value, + size_t size, + int flags) +{ + call_stub_t *stub = NULL; + int32_t op_ret = 0; + dict_t *dict; + libgf_client_local_t *local = NULL; + + dict = get_new_dict (); + + dict_set (dict, (char *)name, + bin_to_data ((void *)value, size)); + dict_ref (dict); + + + LIBGF_CLIENT_FOP (ctx, stub, setxattr, local, loc, dict, flags); + + op_ret = stub->args.setxattr_cbk.op_ret; + errno = stub->args.setxattr_cbk.op_errno; + + dict_unref (dict); + call_stub_destroy (stub); + return op_ret; +} + +int +glusterfs_setxattr (libglusterfs_client_ctx_t *ctx, + const char *path, + const char *name, + const void *value, + size_t size, + int flags) +{ + int32_t op_ret = 0; + loc_t loc = {0, }; + char lookup_required = 1; + xlator_t *this = NULL; + + op_ret = libgf_client_loc_fill (&loc, path, 0, ctx); + if (op_ret < 0) { + gf_log ("libglusterfsclient", + GF_LOG_ERROR, + "libgf_client_loc_fill returned -1, returning EINVAL"); + errno = EINVAL; + goto out; + } + + this = ctx->gf_ctx.graph; + if (loc.inode) { + time_t current, prev; + libglusterfs_client_inode_ctx_t *inode_ctx = NULL; + uint64_t ptr = 0; + + op_ret = inode_ctx_get (loc.inode, this, &ptr); + if (op_ret == -1) { + errno = EINVAL; + goto out; + } + + inode_ctx = (libglusterfs_client_inode_ctx_t 
*)(long)ptr; + memset (¤t, 0, sizeof (current)); + current = time (NULL); + + pthread_mutex_lock (&inode_ctx->lock); + { + prev = inode_ctx->previous_lookup_time; + } + pthread_mutex_unlock (&inode_ctx->lock); + + if ((prev >= 0) && ctx->lookup_timeout >= (current - prev)) { + lookup_required = 0; + } + } + + if (lookup_required) { + op_ret = libgf_client_lookup (ctx, &loc, NULL, NULL, NULL); + } + + if (!op_ret) + op_ret = libgf_client_setxattr (ctx, &loc, name, value, size, flags); + + libgf_client_loc_wipe (&loc); + +out: + return op_ret; +} + +int +glusterfs_lsetxattr (libglusterfs_client_ctx_t *ctx, + const char *path, + const char *name, + const void *value, + size_t size, int flags) +{ + return ENOSYS; +} + +int +glusterfs_fsetxattr (unsigned long fd, + const char *name, + const void *value, + size_t size, + int flags) +{ + int32_t op_ret = 0; + fd_t *__fd ; + char lookup_required = 1; + libglusterfs_client_ctx_t *ctx = NULL; + libglusterfs_client_fd_ctx_t *fd_ctx = NULL; + data_t *fd_ctx_data = NULL; + loc_t loc = {0, }; + xlator_t *this = NULL; + + __fd = (fd_t *)fd; + fd_ctx_data = dict_get (__fd->ctx, XLATOR_NAME); + if (!fd_ctx_data) { + errno = EBADF; + op_ret = -1; + goto out; + } + + fd_ctx = data_to_ptr (fd_ctx_data); + ctx = fd_ctx->ctx; + + op_ret = libgf_client_loc_fill (&loc, NULL, __fd->inode->ino, ctx); + if (op_ret < 0) { + gf_log ("libglusterfsclient", + GF_LOG_ERROR, + "libgf_client_loc_fill returned -1, returning EINVAL"); + errno = EINVAL; + goto out; + } + + this = ctx->gf_ctx.graph; + if (loc.inode) { + time_t current, prev; + libglusterfs_client_inode_ctx_t *inode_ctx = NULL; + uint64_t ptr = 0; + + op_ret = inode_ctx_get (loc.inode, this, &ptr); + if (op_ret == -1) { + errno = EINVAL; + goto out; + } + + inode_ctx = (libglusterfs_client_inode_ctx_t *)(long)ptr; + memset (¤t, 0, sizeof (current)); + current = time (NULL); + + pthread_mutex_lock (&inode_ctx->lock); + { + prev = inode_ctx->previous_lookup_time; + } + pthread_mutex_unlock 
(&inode_ctx->lock); + + if ( (prev >= 0) && ctx->lookup_timeout >= (current - prev)) { + lookup_required = 0; + } + } + + if (lookup_required) { + op_ret = libgf_client_lookup (ctx, &loc, NULL, NULL, NULL); + } + + if (!op_ret) + op_ret = libgf_client_setxattr (ctx, &loc, name, value, size, flags); + + libgf_client_loc_wipe (&loc); +out: + return op_ret; +} + +ssize_t +glusterfs_lgetxattr (libglusterfs_client_ctx_t *ctx, + const char *path, + const char *name, + void *value, + size_t size) +{ + return ENOSYS; +} + +ssize_t +glusterfs_fgetxattr (unsigned long fd, + const char *name, + void *value, + size_t size) +{ + int32_t op_ret = 0; + libglusterfs_client_ctx_t *ctx; + fd_t *__fd = (fd_t *)fd; + libglusterfs_client_fd_ctx_t *fd_ctx = NULL; + data_t *fd_ctx_data = NULL; + loc_t loc = {0, }; + dict_t *dict = NULL; + + fd_ctx_data = dict_get (__fd->ctx, XLATOR_NAME); + if (!fd_ctx_data) { + errno = EBADF; + op_ret = -1; + goto out; + } + + fd_ctx = data_to_ptr (fd_ctx_data); + ctx = fd_ctx->ctx; + + op_ret = libgf_client_loc_fill (&loc, NULL, __fd->inode->ino, ctx); + if (op_ret < 0) { + gf_log ("libglusterfsclient", + GF_LOG_ERROR, + "libgf_client_loc_fill returned -1, returning EINVAL"); + errno = EINVAL; + goto out; + } + + op_ret = libgf_client_lookup (ctx, &loc, NULL, &dict, NULL); + if (op_ret == 0) { + data_t *value_data = dict_get (dict, (char *)name); + + if (value_data) { + int32_t copy_len = 0; + op_ret = value_data->len; /* Don't return the value for '\0' */ + + copy_len = size < value_data->len ? 
/*
 * glusterfs_flistxattr - not implemented.
 *
 * Always fails: returns -1 with errno set to ENOSYS.  A positive return
 * value would be read by callers as the length of the attribute list.
 */
ssize_t
glusterfs_flistxattr (unsigned long fd,
                      char *list,
                      size_t size)
{
        errno = ENOSYS;
        return -1;
}
count)) { + int len = (size < vector[i].iov_len) ? size : vector[i].iov_len; + memcpy (buf, vector[i++].iov_base, len); + buf += len; + size -= len; + op_ret += len; + } + } + + call_stub_destroy (stub); + return op_ret; +} + +ssize_t +glusterfs_read (unsigned long fd, + void *buf, + size_t nbytes) +{ + int32_t op_ret = -1; + off_t offset = 0; + libglusterfs_client_ctx_t *ctx = NULL; + libglusterfs_client_fd_ctx_t *fd_ctx = NULL; + data_t *fd_ctx_data = NULL; + + if (fd == 0) { + errno = EINVAL; + goto out; + } + + fd_ctx_data = dict_get (((fd_t *) fd)->ctx, XLATOR_NAME); + if (!fd_ctx_data) { + errno = EBADF; + goto out; + } + + fd_ctx = data_to_ptr (fd_ctx_data); + + pthread_mutex_lock (&fd_ctx->lock); + { + ctx = fd_ctx->ctx; + offset = fd_ctx->offset; + } + pthread_mutex_unlock (&fd_ctx->lock); + + op_ret = libgf_client_read (ctx, (fd_t *)fd, buf, nbytes, offset); + + if (op_ret > 0) { + offset += op_ret; + pthread_mutex_lock (&fd_ctx->lock); + { + fd_ctx->offset = offset; + } + pthread_mutex_unlock (&fd_ctx->lock); + } + +out: + return op_ret; +} + + +ssize_t +libgf_client_readv (libglusterfs_client_ctx_t *ctx, + fd_t *fd, + const struct iovec *dst_vector, + int dst_count, + off_t offset) +{ + call_stub_t *stub = NULL; + struct iovec *src_vector; + int src_count = 0; + int32_t op_ret = -1; + libgf_client_local_t *local = NULL; + size_t size = 0; + int32_t i = 0; + + for (i = 0; i < dst_count; i++) + { + size += dst_vector[i].iov_len; + } + + LIBGF_CLIENT_FOP (ctx, stub, readv, local, fd, size, offset); + + op_ret = stub->args.readv_cbk.op_ret; + errno = stub->args.readv_cbk.op_errno; + src_count = stub->args.readv_cbk.count; + src_vector = stub->args.readv_cbk.vector; + if (op_ret > 0) { + int src = 0, dst = 0; + off_t src_offset = 0, dst_offset = 0; + op_ret = 0; + + while ((size != 0) && (dst < dst_count) && (src < src_count)) { + int len = 0, src_len, dst_len; + + src_len = src_vector[src].iov_len - src_offset; + dst_len = dst_vector[dst].iov_len - 
dst_offset; + + len = (src_len < dst_len) ? src_len : dst_len; + if (len > size) { + len = size; + } + + memcpy (dst_vector[dst].iov_base + dst_offset, + src_vector[src].iov_base + src_offset, len); + + size -= len; + src_offset += len; + dst_offset += len; + + if (src_offset == src_vector[src].iov_len) { + src_offset = 0; + src++; + } + + if (dst_offset == dst_vector[dst].iov_len) { + dst_offset = 0; + dst++; + } + } + } + + call_stub_destroy (stub); + return op_ret; +} + + +ssize_t +glusterfs_readv (unsigned long fd, const struct iovec *vec, int count) +{ + int32_t op_ret = -1; + off_t offset = 0; + libglusterfs_client_ctx_t *ctx = NULL; + libglusterfs_client_fd_ctx_t *fd_ctx = NULL; + data_t *fd_ctx_data = NULL; + + if (!fd) { + errno = EINVAL; + goto out; + } + + fd_ctx_data = dict_get (((fd_t *) fd)->ctx, XLATOR_NAME); + if (!fd_ctx_data) { + errno = EBADF; + goto out; + } + + fd_ctx = data_to_ptr (fd_ctx_data); + + pthread_mutex_lock (&fd_ctx->lock); + { + ctx = fd_ctx->ctx; + offset = fd_ctx->offset; + } + pthread_mutex_unlock (&fd_ctx->lock); + + op_ret = libgf_client_readv (ctx, (fd_t *)fd, vec, count, offset); + + if (op_ret > 0) { + offset += op_ret; + pthread_mutex_lock (&fd_ctx->lock); + { + fd_ctx->offset = offset; + } + pthread_mutex_unlock (&fd_ctx->lock); + } + +out: + return op_ret; +} + + +ssize_t +glusterfs_pread (unsigned long fd, + void *buf, + size_t count, + off_t offset) +{ + int32_t op_ret = -1; + libglusterfs_client_ctx_t *ctx = NULL; + libglusterfs_client_fd_ctx_t *fd_ctx = NULL; + data_t *fd_ctx_data = NULL; + + if (!fd) { + errno = EINVAL; + goto out; + } + + fd_ctx_data = dict_get (((fd_t *) fd)->ctx, XLATOR_NAME); + if (!fd_ctx_data) { + errno = EBADF; + goto out; + } + + fd_ctx = data_to_ptr (fd_ctx_data); + + ctx = fd_ctx->ctx; + + op_ret = libgf_client_read (ctx, (fd_t *)fd, buf, count, offset); + +out: + return op_ret; +} + + +int +libgf_client_writev_cbk (call_frame_t *frame, + void *cookie, + xlator_t *this, + int32_t op_ret, + 
int32_t op_errno, + struct stat *stbuf) +{ + libgf_client_local_t *local = frame->local; + + local->reply_stub = fop_writev_cbk_stub (frame, NULL, op_ret, op_errno, stbuf); + + pthread_mutex_lock (&local->lock); + { + local->complete = 1; + pthread_cond_broadcast (&local->reply_cond); + } + pthread_mutex_unlock (&local->lock); + return 0; +} + +int +libgf_client_writev (libglusterfs_client_ctx_t *ctx, + fd_t *fd, + struct iovec *vector, + int count, + off_t offset) +{ + call_stub_t *stub = NULL; + int op_ret = -1; + libgf_client_local_t *local = NULL; + + LIBGF_CLIENT_FOP (ctx, stub, writev, local, fd, vector, count, offset); + + op_ret = stub->args.writev_cbk.op_ret; + errno = stub->args.writev_cbk.op_errno; + + call_stub_destroy (stub); + return op_ret; +} + + +ssize_t +glusterfs_write (unsigned long fd, + const void *buf, + size_t n) +{ + int32_t op_ret = -1; + off_t offset = 0; + struct iovec vector; + libglusterfs_client_ctx_t *ctx = NULL; + libglusterfs_client_fd_ctx_t *fd_ctx = NULL; + data_t *fd_ctx_data = NULL; + + if (!fd) { + errno = EINVAL; + goto out; + } + + fd_ctx_data = dict_get (((fd_t *) fd)->ctx, XLATOR_NAME); + if (!fd_ctx_data) { + errno = EBADF; + goto out; + } + + fd_ctx = data_to_ptr (fd_ctx_data); + + ctx = fd_ctx->ctx; + + pthread_mutex_lock (&fd_ctx->lock); + { + offset = fd_ctx->offset; + } + pthread_mutex_unlock (&fd_ctx->lock); + + vector.iov_base = (void *)buf; + vector.iov_len = n; + + op_ret = libgf_client_writev (ctx, + (fd_t *)fd, + &vector, + 1, + offset); + + if (op_ret >= 0) { + offset += op_ret; + pthread_mutex_lock (&fd_ctx->lock); + { + fd_ctx->offset = offset; + } + pthread_mutex_unlock (&fd_ctx->lock); + } + +out: + return op_ret; +} + +ssize_t +glusterfs_writev (unsigned long fd, + const struct iovec *vector, + size_t count) +{ + int32_t op_ret = -1; + off_t offset = 0; + libglusterfs_client_ctx_t *ctx = NULL; + libglusterfs_client_fd_ctx_t *fd_ctx = NULL; + data_t *fd_ctx_data = NULL; + + if (!fd) { + errno = EINVAL; + 
goto out; + } + + + fd_ctx_data = dict_get (((fd_t *) fd)->ctx, XLATOR_NAME); + if (!fd_ctx_data) { + errno = EBADF; + goto out; + } + + fd_ctx = data_to_ptr (fd_ctx_data); + + ctx = fd_ctx->ctx; + + pthread_mutex_lock (&fd_ctx->lock); + { + offset = fd_ctx->offset; + } + pthread_mutex_unlock (&fd_ctx->lock); + + + op_ret = libgf_client_writev (ctx, + (fd_t *)fd, + (struct iovec *)vector, + count, + offset); + + if (op_ret >= 0) { + offset += op_ret; + pthread_mutex_lock (&fd_ctx->lock); + { + fd_ctx->offset = offset; + } + pthread_mutex_unlock (&fd_ctx->lock); + } + +out: + return op_ret; +} + + +ssize_t +glusterfs_pwrite (unsigned long fd, + const void *buf, + size_t count, + off_t offset) +{ + int32_t op_ret = -1; + struct iovec vector; + libglusterfs_client_ctx_t *ctx = NULL; + libglusterfs_client_fd_ctx_t *fd_ctx = NULL; + data_t *fd_ctx_data = NULL; + + if (!fd) { + errno = EINVAL; + goto out; + } + + fd_ctx_data = dict_get (((fd_t *) fd)->ctx, XLATOR_NAME); + if (!fd_ctx_data) { + errno = EBADF; + goto out; + } + + fd_ctx = data_to_ptr (fd_ctx_data); + + ctx = fd_ctx->ctx; + + vector.iov_base = (void *)buf; + vector.iov_len = count; + + op_ret = libgf_client_writev (ctx, + (fd_t *)fd, + &vector, + 1, + offset); + +out: + return op_ret; +} + + +int32_t +libgf_client_readdir_cbk (call_frame_t *frame, + void *cookie, + xlator_t *this, + int32_t op_ret, + int32_t op_errno, + gf_dirent_t *entries) +{ + libgf_client_local_t *local = frame->local; + + local->reply_stub = fop_readdir_cbk_stub (frame, NULL, op_ret, op_errno, entries); + + pthread_mutex_lock (&local->lock); + { + local->complete = 1; + pthread_cond_broadcast (&local->reply_cond); + } + pthread_mutex_unlock (&local->lock); + return 0; +} + +int +libgf_client_readdir (libglusterfs_client_ctx_t *ctx, + fd_t *fd, + struct dirent *dirp, + size_t size, + off_t *offset, + int32_t num_entries) +{ + call_stub_t *stub = NULL; + int op_ret = -1; + libgf_client_local_t *local = NULL; + gf_dirent_t *entry = NULL; 
+ int32_t count = 0; + size_t entry_size = 0; + + LIBGF_CLIENT_FOP (ctx, stub, readdir, local, fd, size, *offset); + + op_ret = stub->args.readdir_cbk.op_ret; + errno = stub->args.readdir_cbk.op_errno; + + if (op_ret > 0) { + list_for_each_entry (entry, &stub->args.readdir_cbk.entries.list, list) { + entry_size = offsetof (struct dirent, d_name) + strlen (entry->d_name) + 1; + + if ((size < entry_size) || (count == num_entries)) { + break; + } + + size -= entry_size; + + dirp->d_ino = entry->d_ino; + /* + #ifdef GF_DARWIN_HOST_OS + dirp->d_off = entry->d_seekoff; + #endif + #ifdef GF_LINUX_HOST_OS + dirp->d_off = entry->d_off; + #endif + */ + + *offset = dirp->d_off = entry->d_off; + /* dirp->d_type = entry->d_type; */ + dirp->d_reclen = entry->d_len; + strncpy (dirp->d_name, entry->d_name, dirp->d_reclen); + dirp->d_name[dirp->d_reclen] = '\0'; + + dirp = (struct dirent *) (((char *) dirp) + entry_size); + count++; + } + } + + call_stub_destroy (stub); + return op_ret; +} + +int +glusterfs_readdir (unsigned long fd, + struct dirent *dirp, + unsigned int count) +{ + int op_ret = -1; + libglusterfs_client_ctx_t *ctx = NULL; + off_t offset = 0; + libglusterfs_client_fd_ctx_t *fd_ctx = NULL; + data_t *fd_ctx_data = NULL; + + fd_ctx_data = dict_get (((fd_t *) fd)->ctx, XLATOR_NAME); + if (!fd_ctx_data) { + errno = EBADF; + goto out; + } + + fd_ctx = data_to_ptr (fd_ctx_data); + + pthread_mutex_lock (&fd_ctx->lock); + { + ctx = fd_ctx->ctx; + offset = fd_ctx->offset; + } + pthread_mutex_unlock (&fd_ctx->lock); + + op_ret = libgf_client_readdir (ctx, (fd_t *)fd, dirp, sizeof (*dirp), &offset, 1); + + if (op_ret > 0) { + pthread_mutex_lock (&fd_ctx->lock); + { + fd_ctx->offset = offset; + } + pthread_mutex_unlock (&fd_ctx->lock); + op_ret = 1; + } + +out: + return op_ret; +} + + +int +glusterfs_getdents (unsigned long fd, struct dirent *dirp, unsigned int count) +{ + int op_ret = -1; + libglusterfs_client_ctx_t *ctx = NULL; + off_t offset = 0; + 
libglusterfs_client_fd_ctx_t *fd_ctx = NULL; + data_t *fd_ctx_data = NULL; + + fd_ctx_data = dict_get (((fd_t *) fd)->ctx, XLATOR_NAME); + if (!fd_ctx_data) { + errno = EBADF; + goto out; + } + + fd_ctx = data_to_ptr (fd_ctx_data); + + pthread_mutex_lock (&fd_ctx->lock); + { + ctx = fd_ctx->ctx; + offset = fd_ctx->offset; + } + pthread_mutex_unlock (&fd_ctx->lock); + + op_ret = libgf_client_readdir (ctx, (fd_t *)fd, dirp, count, &offset, -1); + + if (op_ret > 0) { + pthread_mutex_lock (&fd_ctx->lock); + { + fd_ctx->offset = offset; + } + pthread_mutex_unlock (&fd_ctx->lock); + } + +out: + return op_ret; +} + + +static int32_t +libglusterfs_readv_async_cbk (call_frame_t *frame, + void *cookie, + xlator_t *this, + int32_t op_ret, + int32_t op_errno, + struct iovec *vector, + int32_t count, + struct stat *stbuf) +{ + glusterfs_read_buf_t *buf; + libglusterfs_client_async_local_t *local = frame->local; + fd_t *__fd = local->fop.readv_cbk.fd; + glusterfs_readv_cbk_t readv_cbk = local->fop.readv_cbk.cbk; + + buf = CALLOC (1, sizeof (*buf)); + ERR_ABORT (buf); + + if (vector) { + buf->vector = iov_dup (vector, count); + } + + buf->count = count; + buf->op_ret = op_ret; + buf->op_errno = op_errno; + + if (frame->root->rsp_refs) { + buf->ref = dict_ref (frame->root->rsp_refs); + } + + if (op_ret > 0) { + libglusterfs_client_fd_ctx_t *fd_ctx = NULL; + data_t *fd_ctx_data = NULL; + + fd_ctx_data = dict_get (__fd->ctx, XLATOR_NAME); + + fd_ctx = data_to_ptr (fd_ctx_data); + pthread_mutex_lock (&fd_ctx->lock); + { + fd_ctx->offset += op_ret; + } + pthread_mutex_unlock (&fd_ctx->lock); + } + + readv_cbk (buf, local->cbk_data); + + FREE (local); + frame->local = NULL; + STACK_DESTROY (frame->root); + + return 0; +} + +void +glusterfs_free (glusterfs_read_buf_t *buf) +{ + //iov_free (buf->vector, buf->count); + FREE (buf->vector); + dict_unref ((dict_t *) buf->ref); + FREE (buf); +} + +int +glusterfs_read_async (unsigned long fd, + size_t nbytes, + off_t offset, + 
glusterfs_readv_cbk_t readv_cbk, + void *cbk_data) +{ + libglusterfs_client_ctx_t *ctx; + fd_t *__fd = (fd_t *)fd; + libglusterfs_client_async_local_t *local = NULL; + libglusterfs_client_fd_ctx_t *fd_ctx = NULL; + data_t *fd_ctx_data = NULL; + int32_t op_ret = 0; + + local = CALLOC (1, sizeof (*local)); + ERR_ABORT (local); + local->fop.readv_cbk.fd = __fd; + local->fop.readv_cbk.cbk = readv_cbk; + local->cbk_data = cbk_data; + + fd_ctx_data = dict_get (((fd_t *) fd)->ctx, XLATOR_NAME); + if (!fd_ctx_data) { + errno = EBADF; + op_ret = -1; + goto out; + } + + fd_ctx = data_to_ptr (fd_ctx_data); + + ctx = fd_ctx->ctx; + + if (offset < 0) { + pthread_mutex_lock (&fd_ctx->lock); + { + offset = fd_ctx->offset; + } + pthread_mutex_unlock (&fd_ctx->lock); + } + + LIBGF_CLIENT_FOP_ASYNC (ctx, + local, + libglusterfs_readv_async_cbk, + readv, + __fd, + nbytes, + offset); + +out: + return op_ret; +} + +static int32_t +libglusterfs_writev_async_cbk (call_frame_t *frame, + void *cookie, + xlator_t *this, + int32_t op_ret, + int32_t op_errno, + struct stat *stbuf) +{ + libglusterfs_client_async_local_t *local = frame->local; + fd_t *fd = NULL; + glusterfs_writev_cbk_t writev_cbk; + + writev_cbk = local->fop.writev_cbk.cbk; + fd = local->fop.writev_cbk.fd; + + if (op_ret > 0) { + libglusterfs_client_fd_ctx_t *fd_ctx = NULL; + data_t *fd_ctx_data = NULL; + + fd_ctx_data = dict_get (fd->ctx, XLATOR_NAME); + + fd_ctx = data_to_ptr (fd_ctx_data); + + pthread_mutex_lock (&fd_ctx->lock); + { + fd_ctx->offset += op_ret; + } + pthread_mutex_unlock (&fd_ctx->lock); + } + + writev_cbk (op_ret, op_errno, local->cbk_data); + + STACK_DESTROY (frame->root); + return 0; +} + +int32_t +glusterfs_write_async (unsigned long fd, + const void *buf, + size_t nbytes, + off_t offset, + glusterfs_writev_cbk_t writev_cbk, + void *cbk_data) +{ + fd_t *__fd = (fd_t *)fd; + struct iovec vector; + off_t __offset = offset; + libglusterfs_client_ctx_t *ctx = NULL; + libglusterfs_client_async_local_t *local 
= NULL; + libglusterfs_client_fd_ctx_t *fd_ctx = NULL; + data_t *fd_ctx_data = NULL; + int32_t op_ret = 0; + + local = CALLOC (1, sizeof (*local)); + ERR_ABORT (local); + local->fop.writev_cbk.fd = __fd; + local->fop.writev_cbk.cbk = writev_cbk; + local->cbk_data = cbk_data; + + vector.iov_base = (void *)buf; + vector.iov_len = nbytes; + + fd_ctx_data = dict_get (__fd->ctx, XLATOR_NAME); + if (!fd_ctx_data) { + errno = EBADF; + op_ret = -1; + goto out; + } + + fd_ctx = data_to_ptr (fd_ctx_data); + ctx = fd_ctx->ctx; + + if (offset < 0) { + pthread_mutex_lock (&fd_ctx->lock); + { + __offset = fd_ctx->offset; + } + pthread_mutex_unlock (&fd_ctx->lock); + } + + LIBGF_CLIENT_FOP_ASYNC (ctx, + local, + libglusterfs_writev_async_cbk, + writev, + __fd, + &vector, + 1, + __offset); + +out: + return op_ret; +} + +off_t +glusterfs_lseek (unsigned long fd, off_t offset, int whence) +{ + off_t __offset = 0; + int32_t op_ret = -1; + fd_t *__fd = (fd_t *)fd; + data_t *fd_ctx_data = NULL; + libglusterfs_client_fd_ctx_t *fd_ctx = NULL; + libglusterfs_client_inode_ctx_t *inode_ctx = NULL; + libglusterfs_client_ctx_t *ctx = NULL; + xlator_t *this = NULL; + + fd_ctx_data = dict_get (__fd->ctx, XLATOR_NAME); + if (!fd_ctx_data) { + errno = EBADFD; + __offset = -1; + goto out; + } + + fd_ctx = data_to_ptr (fd_ctx_data); + ctx = fd_ctx->ctx; + + switch (whence) + { + case SEEK_SET: + __offset = offset; + break; + + case SEEK_CUR: + pthread_mutex_lock (&fd_ctx->lock); + { + __offset = fd_ctx->offset; + } + pthread_mutex_unlock (&fd_ctx->lock); + + __offset += offset; + break; + + case SEEK_END: + { + char cache_valid = 0; + off_t end = 0; + time_t prev, current; + loc_t loc = {0, }; + struct stat stbuf = {0, }; + int32_t ret = -1; + uint64_t ptr = 0; + + ret = inode_ctx_get (__fd->inode, this, &ptr); + if (ret == 0) { + inode_ctx = (libglusterfs_client_inode_ctx_t *)(long)ptr; + memset (&current, 0, sizeof (current)); + current = time (NULL); + + pthread_mutex_lock (&inode_ctx->lock); + { + 
prev = inode_ctx->previous_lookup_time; + } + pthread_mutex_unlock (&inode_ctx->lock); + + if (prev >= 0 && ctx->lookup_timeout >= (current - prev)) { + cache_valid = 1; + } + } + + if (cache_valid) { + end = inode_ctx->stbuf.st_size; + } else { + op_ret = libgf_client_loc_fill (&loc, NULL, __fd->inode->ino, ctx); + if (op_ret < 0) { + gf_log ("libglusterfsclient", + GF_LOG_ERROR, + "libgf_client_loc_fill returned -1, returning EINVAL"); + errno = EINVAL; + __offset = -1; + goto out; + } + + op_ret = libgf_client_lookup (ctx, &loc, &stbuf, NULL, NULL); + if (op_ret < 0) { + __offset = -1; + goto out; + } + + end = stbuf.st_size; + } + + __offset = end + offset; + } + break; + + default: + gf_log ("libglusterfsclient", + GF_LOG_ERROR, + "invalid value for whence"); + __offset = -1; + errno = EINVAL; + goto out; + } + + pthread_mutex_lock (&fd_ctx->lock); + { + fd_ctx->offset = __offset; + } + pthread_mutex_unlock (&fd_ctx->lock); + +out: + return __offset; +} + + +int32_t +libgf_client_stat_cbk (call_frame_t *frame, + void *cookie, + xlator_t *this, + int32_t op_ret, + int32_t op_errno, + struct stat *buf) +{ + libgf_client_local_t *local = frame->local; + + local->reply_stub = fop_stat_cbk_stub (frame, + NULL, + op_ret, + op_errno, + buf); + + pthread_mutex_lock (&local->lock); + { + local->complete = 1; + pthread_cond_broadcast (&local->reply_cond); + } + pthread_mutex_unlock (&local->lock); + + return 0; +} + +int32_t +libgf_client_stat (libglusterfs_client_ctx_t *ctx, + loc_t *loc, + struct stat *stbuf) +{ + call_stub_t *stub = NULL; + int32_t op_ret = 0; + time_t prev, current; + libglusterfs_client_inode_ctx_t *inode_ctx = NULL; + libgf_client_local_t *local = NULL; + xlator_t *this = NULL; + uint64_t ptr = 0; + + this = ctx->gf_ctx.graph; + op_ret = inode_ctx_get (loc->inode, this, &ptr); + if (op_ret == -1) { + errno = EINVAL; + goto out; + } + + inode_ctx = (libglusterfs_client_inode_ctx_t *)(long)ptr; + current = time (NULL); + pthread_mutex_lock 
(&inode_ctx->lock); + { + prev = inode_ctx->previous_lookup_time; + } + pthread_mutex_unlock (&inode_ctx->lock); + + if ((current - prev) <= ctx->stat_timeout) { + pthread_mutex_lock (&inode_ctx->lock); + { + memcpy (stbuf, &inode_ctx->stbuf, sizeof (*stbuf)); + } + pthread_mutex_unlock (&inode_ctx->lock); + op_ret = 0; + goto out; + } + + LIBGF_CLIENT_FOP (ctx, stub, stat, local, loc); + + op_ret = stub->args.stat_cbk.op_ret; + errno = stub->args.stat_cbk.op_errno; + *stbuf = stub->args.stat_cbk.buf; + + pthread_mutex_lock (&inode_ctx->lock); + { + memcpy (&inode_ctx->stbuf, stbuf, sizeof (*stbuf)); + current = time (NULL); + inode_ctx->previous_stat_time = current; + } + pthread_mutex_unlock (&inode_ctx->lock); + + call_stub_destroy (stub); + +out: + return op_ret; +} + +int32_t +glusterfs_stat (libglusterfs_handle_t handle, + const char *path, + struct stat *buf) +{ + int32_t op_ret = 0; + loc_t loc = {0, }; + char lookup_required = 1; + libglusterfs_client_ctx_t *ctx = handle; + xlator_t *this = NULL; + + op_ret = libgf_client_loc_fill (&loc, path, 0, ctx); + if (op_ret < 0) { + gf_log ("libglusterfsclient", + GF_LOG_ERROR, + "libgf_client_loc_fill returned -1, returning EINVAL"); + errno = EINVAL; + goto out; + } + + this = ctx->gf_ctx.graph; + if (loc.inode) { + time_t current, prev; + libglusterfs_client_inode_ctx_t *inode_ctx = NULL; + uint64_t ptr = 0; + + op_ret = inode_ctx_get (loc.inode, this, &ptr); + if (op_ret == -1) { + inode_unref (loc.inode); + errno = EINVAL; + goto out; + } + + inode_ctx = (libglusterfs_client_inode_ctx_t *)(long)ptr; + memset (&current, 0, sizeof (current)); + current = time (NULL); + + pthread_mutex_lock (&inode_ctx->lock); + { + prev = inode_ctx->previous_lookup_time; + } + pthread_mutex_unlock (&inode_ctx->lock); + + if (prev >= 0 && ctx->lookup_timeout >= (current - prev)) { + lookup_required = 0; + } + } + + if (lookup_required) { + op_ret = libgf_client_lookup (ctx, &loc, buf, NULL, NULL); + } + + if (!op_ret) { + op_ret = 
libgf_client_stat (ctx, &loc, buf); + } + + libgf_client_loc_wipe (&loc); + +out: + return op_ret; +} + +static int32_t +libgf_client_fstat_cbk (call_frame_t *frame, + void *cookie, + xlator_t *this, + int32_t op_ret, + int32_t op_errno, + struct stat *buf) +{ + libgf_client_local_t *local = frame->local; + + local->reply_stub = fop_fstat_cbk_stub (frame, + NULL, + op_ret, + op_errno, + buf); + + pthread_mutex_lock (&local->lock); + { + local->complete = 1; + pthread_cond_broadcast (&local->reply_cond); + } + pthread_mutex_unlock (&local->lock); + + return 0; + +} + +int32_t +libgf_client_fstat (libglusterfs_client_ctx_t *ctx, + fd_t *fd, + struct stat *buf) +{ + call_stub_t *stub = NULL; + int32_t op_ret = 0; + fd_t *__fd = fd; + time_t current, prev; + libglusterfs_client_inode_ctx_t *inode_ctx = NULL; + libgf_client_local_t *local = NULL; + xlator_t *this = NULL; + uint64_t ptr = 0; + + current = time (NULL); + op_ret = inode_ctx_get (fd->inode, this, &ptr); + if (op_ret == -1) { + errno = EINVAL; + goto out; + } + + inode_ctx = (libglusterfs_client_inode_ctx_t *)(long)ptr; + pthread_mutex_lock (&inode_ctx->lock); + { + prev = inode_ctx->previous_stat_time; + } + pthread_mutex_unlock (&inode_ctx->lock); + + if ((current - prev) <= ctx->stat_timeout) { + pthread_mutex_lock (&inode_ctx->lock); + { + memcpy (buf, &inode_ctx->stbuf, sizeof (*buf)); + } + pthread_mutex_unlock (&inode_ctx->lock); + op_ret = 0; + goto out; + } + + LIBGF_CLIENT_FOP (ctx, stub, fstat, local, __fd); + + op_ret = stub->args.fstat_cbk.op_ret; + errno = stub->args.fstat_cbk.op_errno; + *buf = stub->args.fstat_cbk.buf; + + pthread_mutex_lock (&inode_ctx->lock); + { + memcpy (&inode_ctx->stbuf, buf, sizeof (*buf)); + current = time (NULL); + inode_ctx->previous_stat_time = current; + } + pthread_mutex_unlock (&inode_ctx->lock); + + call_stub_destroy (stub); + +out: + return op_ret; +} + +int32_t +glusterfs_fstat (unsigned long fd, struct stat *buf) +{ + libglusterfs_client_ctx_t *ctx; + fd_t 
*__fd = (fd_t *)fd; + libglusterfs_client_fd_ctx_t *fd_ctx = NULL; + data_t *fd_ctx_data = NULL; + int32_t op_ret = -1; + + fd_ctx_data = dict_get (((fd_t *) fd)->ctx, XLATOR_NAME); + if (!fd_ctx_data) { + errno = EBADF; + op_ret = -1; + goto out; + } + + fd_ctx = data_to_ptr (fd_ctx_data); + ctx = fd_ctx->ctx; + + op_ret = libgf_client_fstat (ctx, __fd, buf); + +out: + return op_ret; +} + +static struct xlator_fops libgf_client_fops = { +}; + +static struct xlator_mops libgf_client_mops = { +}; + +static struct xlator_cbks libgf_client_cbks = { + .forget = libgf_client_forget, + .release = libgf_client_release, + .releasedir = libgf_client_releasedir, +}; + +static inline xlator_t * +libglusterfs_graph (xlator_t *graph) +{ + xlator_t *top = NULL; + xlator_list_t *xlchild, *xlparent; + + top = CALLOC (1, sizeof (*top)); + ERR_ABORT (top); + + xlchild = CALLOC (1, sizeof(*xlchild)); + ERR_ABORT (xlchild); + xlchild->xlator = graph; + top->children = xlchild; + top->ctx = graph->ctx; + top->next = graph; + top->name = strdup (XLATOR_NAME); + + xlparent = CALLOC (1, sizeof(*xlparent)); + xlparent->xlator = top; + graph->parents = xlparent; + asprintf (&top->type, XLATOR_NAME); + + top->init = libgf_client_init; + top->fops = &libgf_client_fops; + top->mops = &libgf_client_mops; + top->cbks = &libgf_client_cbks; + top->notify = libgf_client_notify; + top->fini = libgf_client_fini; + // fill_defaults (top); + + return top; +} diff --git a/libglusterfsclient/src/libglusterfsclient.h b/libglusterfsclient/src/libglusterfsclient.h new file mode 100755 index 000000000..19b7ea036 --- /dev/null +++ b/libglusterfsclient/src/libglusterfsclient.h @@ -0,0 +1,279 @@ +/* + Copyright (c) 2008 Z RESEARCH, Inc. + This file is part of GlusterFS. 
+ + GlusterFS is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published + by the Free Software Foundation; either version 3 of the License, + or (at your option) any later version. + + GlusterFS is distributed in the hope that it will be useful, but + WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program. If not, see + . +*/ + +#ifndef __LIBGLUSTERFSCLIENT_H +#define __LIBGLUSTERFSCLIENT_H + +#ifdef __cplusplus +extern "C" { +#endif + +#include +#include + //#include +#include +#include +/* #include */ + +typedef struct { + int op_ret; + int op_errno; + struct iovec *vector; + int count; + void *ref; +}glusterfs_read_buf_t; + + +typedef struct { + char *logfile; + char *loglevel; + struct { + char *specfile; + FILE *specfp; + }; + char *volume_name; + unsigned long lookup_timeout; + unsigned long stat_timeout; +}glusterfs_init_ctx_t; + +typedef struct libglusterfs_client_ctx *libglusterfs_handle_t; + +typedef int (*glusterfs_readv_cbk_t) (glusterfs_read_buf_t *buf, + void *cbk_data); + +typedef int (*glusterfs_writev_cbk_t) (int op_ret, + int op_errno, + void *cbk_data); + +typedef int (*glusterfs_lookup_cbk_t) (int op_ret, + int op_errno, + void *buf, + struct stat *st, + void *cbk_data); + +/* Used to free the glusterfs_read_buf passed to the application from glusterfs_read_async_cbk */ +void +glusterfs_free (glusterfs_read_buf_t *buf); + +/* libglusterfsclient initialization function */ +libglusterfs_handle_t +glusterfs_init (glusterfs_init_ctx_t *ctx); + +int +glusterfs_fini (libglusterfs_handle_t handle); + +/* added for log related initialization for fork implementation in booster */ +void +glusterfs_reset (void); + +void +glusterfs_log_lock (void); + +void +glusterfs_log_unlock (void); 
+ +/* For smaller files, application can use just glusterfs_lookup/glusterfs_lookup_async to read + * the whole content. Limit of the file-sizes to be read in + * glusterfs_lookup/glusterfs_lookup_async is passed in the size argument */ + +/* glusterfs_lookup: + * @handle: glusterfs handle + * @path: path to be looked upon + * @buf: pointer to pre-allocated buf, in which the file content is returned for files with sizes * less than the size argument. + * @size: upper limit of file-sizes to be read in lookup + * @stbuf: stat buffer + */ + +int +glusterfs_lookup (libglusterfs_handle_t handle, + const char *path, + void *buf, + size_t size, + struct stat *stbuf); + +int +glusterfs_lookup_async (libglusterfs_handle_t handle, + const char *path, + void *buf, + size_t size, + glusterfs_lookup_cbk_t cbk, + void *cbk_data); + +unsigned long +glusterfs_open (libglusterfs_handle_t handle, + const char *path, + int flags, + mode_t mode); + +unsigned long +glusterfs_creat (libglusterfs_handle_t handle, + const char *path, + mode_t mode); + +int +glusterfs_close (unsigned long fd); + +int +glusterfs_stat (libglusterfs_handle_t handle, + const char *path, + struct stat *buf); + +int +glusterfs_fstat (unsigned long fd, + struct stat *buf) ; + +int +glusterfs_setxattr (libglusterfs_handle_t handle, + const char *path, + const char *name, + const void *value, + size_t size, + int flags); + +int +glusterfs_lsetxattr (libglusterfs_handle_t handle, + const char *path, + const char *name, + const void *value, + size_t size, + int flags); + +int +glusterfs_fsetxattr (unsigned long fd, + const char *name, + const void *value, + size_t size, + int flags); + +ssize_t +glusterfs_getxattr (libglusterfs_handle_t handle, + const char *path, + const char *name, + void *value, + size_t size); + +ssize_t +glusterfs_lgetxattr (libglusterfs_handle_t handle, + const char *path, + const char *name, + void *value, + size_t size); + +ssize_t +glusterfs_fgetxattr (unsigned long fd, + const char *name, + 
void *value, + size_t size); + +ssize_t +glusterfs_listxattr (libglusterfs_handle_t handle, + const char *path, + char *list, + size_t size); + +ssize_t +glusterfs_llistxattr (libglusterfs_handle_t handle, + const char *path, + char *list, + size_t size); + +ssize_t +glusterfs_flistxattr (unsigned long fd, + char *list, + size_t size); + +int +glusterfs_removexattr (libglusterfs_handle_t handle, + const char *path, + const char *name); + +int +glusterfs_lremovexattr (libglusterfs_handle_t handle, + const char *path, + const char *name); + +int +glusterfs_fremovexattr (unsigned long fd, + const char *name); + +ssize_t +glusterfs_read (unsigned long fd, + void *buf, + size_t nbytes); + +ssize_t +glusterfs_readv (unsigned long fd, + const struct iovec *vec, + int count); + +int +glusterfs_read_async (unsigned long fd, + size_t nbytes, + off_t offset, + glusterfs_readv_cbk_t readv_cbk, + void *cbk_data); + +ssize_t +glusterfs_write (unsigned long fd, + const void *buf, + size_t n); + +ssize_t +glusterfs_writev (unsigned long fd, + const struct iovec *vector, + size_t count); + +int +glusterfs_write_async (unsigned long fd, + const void *buf, + size_t nbytes, + off_t offset, + glusterfs_writev_cbk_t writev_cbk, + void *cbk_data); + +int +glusterfs_readdir (unsigned long fd, + struct dirent *dirp, + unsigned int count); + +int +glusterfs_getdents (unsigned long fd, + struct dirent *dirp, + unsigned int count); + +ssize_t +glusterfs_pread (unsigned long fd, + void *buf, + size_t count, + off_t offset); + +ssize_t +glusterfs_pwrite (unsigned long fd, + const void *buf, + size_t count, + off_t offset); + +off_t +glusterfs_lseek (unsigned long fd, off_t offset, int whence); + +#ifdef __cplusplus +} +#endif + +#endif diff --git a/mod_glusterfs/Makefile.am b/mod_glusterfs/Makefile.am new file mode 100644 index 000000000..0abe8dcfc --- /dev/null +++ b/mod_glusterfs/Makefile.am @@ -0,0 +1,3 @@ +SUBDIRS = apache lighttpd + +CLEANFILES = diff --git 
a/mod_glusterfs/apache/1.3/Makefile.am b/mod_glusterfs/apache/1.3/Makefile.am new file mode 100644 index 000000000..d471a3f92 --- /dev/null +++ b/mod_glusterfs/apache/1.3/Makefile.am @@ -0,0 +1,3 @@ +SUBDIRS = src + +CLEANFILES = diff --git a/mod_glusterfs/apache/1.3/src/Makefile.am b/mod_glusterfs/apache/1.3/src/Makefile.am new file mode 100644 index 000000000..6bb3075f5 --- /dev/null +++ b/mod_glusterfs/apache/1.3/src/Makefile.am @@ -0,0 +1,30 @@ +mod_glusterfs_PROGRAMS = mod_glusterfs.so +mod_glusterfsdir = $(libdir)/glusterfs/$(PACKAGE_VERSION)/apache/1.3 + +mod_glusterfs_so_SOURCES = mod_glusterfs.c + +all: mod_glusterfs.so + +mod_glusterfs.so: $(top_srcdir)/mod_glusterfs/apache/1.3/src/mod_glusterfs.c $(top_builddir)/libglusterfsclient/src/libglusterfsclient.la + ln -sf $(top_srcdir)/mod_glusterfs/apache/1.3/src/mod_glusterfs.c $(top_builddir)/mod_glusterfs/apache/1.3/src/mod_glusterfs-build.c + $(APXS) -c -Wc,-g3 -Wc,-O0 -D_FILE_OFFSET_BITS=64 -D__USE_FILE_OFFSET64 -D_GNU_SOURCE -I$(top_srcdir)/libglusterfsclient/src -Wl,-rpath,$(libdir) -Wl,-rpath,$(top_builddir)/libglusterfsclient/src/.libs/ $(top_builddir)/libglusterfsclient/src/.libs/libglusterfsclient.so mod_glusterfs-build.c -o $(top_builddir)/mod_glusterfs/apache/1.3/src/mod_glusterfs.so + +$(top_builddir)/libglusterfsclient/src/libglusterfsclient.la: + $(MAKE) -C $(top_builddir)/libglusterfsclient/src/ all + +install-data-local: + @echo "" + @echo "" + @echo "**********************************************************************************" + @echo "* TO INSTALL MODGLUSTERFS, PLEASE USE, " + @echo "* $(APXS) -n glusterfs -ia $(mod_glusterfsdir)/mod_glusterfs.so " + @echo "**********************************************************************************" + @echo "" + @echo "" + +#install: +# cp -fv mod_glusterfs.so $(HTTPD_LIBEXECDIR) +# cp -fv httpd.conf $(HTTPD_CONF_DIR) + +clean: + -rm -fv *.so *.o mod_glusterfs-build.c diff --git a/mod_glusterfs/apache/1.3/src/README.txt 
b/mod_glusterfs/apache/1.3/src/README.txt new file mode 100644 index 000000000..378a51d79 --- /dev/null +++ b/mod_glusterfs/apache/1.3/src/README.txt @@ -0,0 +1,107 @@ +What is mod_glusterfs? +====================== +* mod_glusterfs is a module for apache written for efficient serving of files from glusterfs. + mod_glusterfs interfaces with glusterfs using apis provided by libglusterfsclient. + +* this README speaks about installation of apache-1.3.x, where x is any minor version. + +Prerequisites for mod_glusterfs +=============================== +Though mod_glusterfs has been written as a module, with an intent of making no changes to the way apache has +been built, currently following points have to be taken care of: + +* module "so" has to be enabled, for apache to support modules. +* since glusterfs is compiled with _FILE_OFFSET_BITS=64 and __USE_FILE_OFFSET64 flags, mod_glusterfs and apache + in turn have to be compiled with the above two flags. + + $ tar xzvf apache-1.3.9.tar.gz + $ cd apache-1.3.9/ + $ # add -D_FILE_OFFSET_BITS=64 -D__USE_FILE_OFFSET64 to EXTRA_CFLAGS in src/Configuration. + $ ./configure --prefix=/usr --enable-module=so + $ cd src + $ ./Configure + $ cd ../ + $ make install + $ httpd -l | grep -i mod_so + mod_so.c + +* if multiple apache installations are present, make sure to pass --with-apxs=/path/to/apxs/of/proper/version to configure script while building glusterfs. + +Build/Install mod_glusterfs +=========================== +* mod_glusterfs is provided with glusterfs--mainline--3.0 and all releases from the same branch. + +* building glusterfs also builds mod_glusterfs. But 'make install' of glusterfs installs mod_glusterfs.so to + glusterfs install directory instead of the apache modules directory. + +* 'make install' of glusterfs will print a message similar to the one given below, which is self explanatory. + Make sure to use apxs of proper apache version in case of multiple apache installations. 
This will copy + mod_glusterfs.so to modules directory of proper apache version and modify the appropriate httpd.conf to enable + mod_glusterfs. + +********************************************************************************************** +* TO INSTALL MODGLUSTERFS, PLEASE USE, +* apxs -n mod_glusterfs -ia /usr/lib/glusterfs/1.4.0pre2/apache-1.3/mod_glusterfs.so +********************************************************************************************** + +Configuration +============= +* The following configuration has to be added to httpd.conf. + + <Location "/glusterfs"> + GlusterfsLogfile "/var/log/glusterfs/glusterfs.log" + GlusterfsLoglevel "warning" + GlusterfsVolumeSpecfile "/etc/glusterfs/glusterfs-client.spec" + GlusterfsCacheTimeout "600" + GlusterfsXattrFileSize "65536" + SetHandler "glusterfs-handler" + </Location> + +* GlusterfsVolumeSpecfile (COMPULSORY) + Path to the glusterfs volume specification file. + +* GlusterfsLogfile (COMPULSORY) + Path to the glusterfs logfile. + +* GlusterfsLoglevel (OPTIONAL, default = warning) + Severity of messages that are to be logged. Allowed values are critical, error, warning, debug, none + in the decreasing order of severity. + +* GlusterfsCacheTimeout (OPTIONAL, default = 0) + Timeout value for the glusterfs stat and lookup caches. + +* GlusterfsXattrFileSize (OPTIONAL, default = 0) + Files with sizes up to and including this value are fetched through the extended attribute interface of + glusterfs rather than the usual open-read-close set of operations. For files of small sizes, it is recommended + to use the extended attribute interface. + +* With the above configuration all the requests to httpd of the form www.example.org/glusterfs/path/to/file are + served from glusterfs. + +Miscellaneous points +==================== +* httpd by default runs with username "nobody" and group "nogroup". Permissions of logfile and specfile have to + be set suitably. 
+ +* Since mod_glusterfs runs with permissions of nobody.nogroup, glusterfs has to use only login based + authentication. See doc/authentication.txt for more details. + +* To copy the data served by httpd into glusterfs mountpoint, glusterfs can be started with the + volume-specification file provided to mod_glusterfs. Any tool like cp can then be used. + +* To run in gdb, apache has to be compiled with -lpthread, since libglusterfsclient is multithreaded. + If not, on Linux gdb runs into errors like: + "Error while reading shared library symbols: + Cannot find new threads: generic error" + +* When used with ib-verbs transport, ib_verbs initialization fails. + The reason for this is that apache runs as a non-privileged user and the amount of memory that can be + locked by default is not sufficient for ib-verbs. To fix this, run as root, + + # ulimit -l unlimited + + and then start apache. + +TODO +==== +* directory listing for the directories accessed through mod_glusterfs. diff --git a/mod_glusterfs/apache/1.3/src/mod_glusterfs.c b/mod_glusterfs/apache/1.3/src/mod_glusterfs.c new file mode 100644 index 000000000..e13d77626 --- /dev/null +++ b/mod_glusterfs/apache/1.3/src/mod_glusterfs.c @@ -0,0 +1,514 @@ +/* + Copyright (c) 2008 Z RESEARCH, Inc. + This file is part of GlusterFS. + + GlusterFS is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published + by the Free Software Foundation; either version 3 of the License, + or (at your option) any later version. + + GlusterFS is distributed in the hope that it will be useful, but + WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program. If not, see + <http://www.gnu.org/licenses/>. 
+*/ + +#ifndef CORE_PRIVATE +#define CORE_PRIVATE +#endif + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#define GLUSTERFS_INVALID_LOGLEVEL "mod_glusterfs: Unrecognized log-level \"%s\", possible values are \"DEBUG|WARNING|ERROR|CRITICAL|NONE\"\n" + +#define GLUSTERFS_HANDLER "glusterfs-handler" +#define GLUSTERFS_CHUNK_SIZE 131072 + +module MODULE_VAR_EXPORT glusterfs_module; + +/*TODO: verify error returns to server core */ + +typedef struct glusterfs_dir_config { + char *logfile; + char *loglevel; + char *specfile; + char *mount_dir; + char *buf; + size_t xattr_file_size; + uint32_t cache_timeout; + libglusterfs_handle_t handle; +} glusterfs_dir_config_t; + +typedef struct glusterfs_async_local { + int op_ret; + int op_errno; + char async_read_complete; + off_t length; + off_t read_bytes; + glusterfs_read_buf_t *buf; + request_rec *request; + pthread_mutex_t lock; + pthread_cond_t cond; +}glusterfs_async_local_t; + +#define GLUSTERFS_CMD_PERMS ACCESS_CONF + +static glusterfs_dir_config_t * +mod_glusterfs_dconfig(request_rec *r) +{ + glusterfs_dir_config_t *dir_config = NULL; + if (r->per_dir_config != NULL) { + dir_config = ap_get_module_config (r->per_dir_config, &glusterfs_module); + } + + return dir_config; +} + +static +const char *add_xattr_file_size(cmd_parms *cmd, void *dummy, char *arg) +{ + glusterfs_dir_config_t *dir_config = dummy; + dir_config->xattr_file_size = atoi (arg); + return NULL; +} + +static +const char *set_cache_timeout(cmd_parms *cmd, void *dummy, char *arg) +{ + glusterfs_dir_config_t *dir_config = dummy; + dir_config->cache_timeout = atoi (arg); + return NULL; +} + +static +const char *set_loglevel(cmd_parms *cmd, void *dummy, char *arg) +{ + glusterfs_dir_config_t *dir_config = dummy; + char *error = NULL; + if (strncasecmp (arg, "DEBUG", strlen ("DEBUG")) + && strncasecmp (arg, "WARNING", strlen ("WARNING")) + && strncasecmp (arg, "CRITICAL", strlen ("CRITICAL")) + 
&& strncasecmp (arg, "NONE", strlen ("NONE")) + && strncasecmp (arg, "ERROR", strlen ("ERROR"))) + error = GLUSTERFS_INVALID_LOGLEVEL; + else + dir_config->loglevel = arg; + + return error; +} + +static +const char *add_logfile(cmd_parms *cmd, void *dummy, char *arg) +{ + glusterfs_dir_config_t *dir_config = dummy; + dir_config->logfile = arg; + + return NULL; +} + +static +const char *add_specfile(cmd_parms *cmd, void *dummy, char *arg) +{ + glusterfs_dir_config_t *dir_config = dummy; + + dir_config->specfile = arg; + + return NULL; +} + +static void * +mod_glusterfs_create_dir_config(pool *p, char *dirspec) +{ + glusterfs_dir_config_t *dir_config = NULL; + + dir_config = (glusterfs_dir_config_t *) ap_pcalloc(p, sizeof(*dir_config)); + + dir_config->mount_dir = dirspec; + dir_config->logfile = dir_config->specfile = (char *)0; + dir_config->loglevel = "warning"; + dir_config->handle = (libglusterfs_handle_t) 0; + dir_config->cache_timeout = 0; + dir_config->buf = NULL; + + return (void *) dir_config; +} + +static void +mod_glusterfs_child_init(server_rec *s, pool *p) +{ + void **urls = NULL; + int n, i; + core_server_config *mod_core_config = ap_get_module_config (s->module_config, + &core_module); + glusterfs_dir_config_t *dir_config = NULL; + glusterfs_init_ctx_t ctx; + + n = mod_core_config->sec_url->nelts; + urls = (void **)mod_core_config->sec_url->elts; + for (i = 0; i < n; i++) { + dir_config = ap_get_module_config (urls[i], &glusterfs_module); + + if (dir_config) { + memset (&ctx, 0, sizeof (ctx)); + + ctx.logfile = dir_config->logfile; + ctx.loglevel = dir_config->loglevel; + ctx.lookup_timeout = ctx.stat_timeout = dir_config->cache_timeout; + ctx.specfile = dir_config->specfile; + + dir_config->handle = glusterfs_init (&ctx); + } + dir_config = NULL; + } +} + +static void +mod_glusterfs_child_exit(server_rec *s, pool *p) +{ + void **urls = NULL; + int n, i; + core_server_config *mod_core_config = ap_get_module_config (s->module_config, + &core_module); + 
glusterfs_dir_config_t *dir_config = NULL; + + n = mod_core_config->sec_url->nelts; + urls = (void **)mod_core_config->sec_url->elts; + for (i = 0; i < n; i++) { + dir_config = ap_get_module_config (urls[i], &glusterfs_module); + if (dir_config && dir_config->handle) { + glusterfs_fini (dir_config->handle); + dir_config->handle = 0; + } + dir_config = NULL; + } +} + +static int mod_glusterfs_fixup(request_rec *r) +{ + glusterfs_dir_config_t *dir_config = NULL; + int access_status; + int ret; + char *path = NULL; + + dir_config = mod_glusterfs_dconfig(r); + + if (dir_config && dir_config->mount_dir && !(strncmp (ap_pstrcat (r->pool, dir_config->mount_dir, "/", NULL), r->uri, strlen (dir_config->mount_dir) + 1) && !r->handler)) + r->handler = ap_pstrdup (r->pool, GLUSTERFS_HANDLER); + + if (!r->handler || (r->handler && strcmp (r->handler, GLUSTERFS_HANDLER))) + return DECLINED; + + if (dir_config->mount_dir) + path = r->uri + strlen (dir_config->mount_dir); + + memset (&r->finfo, 0, sizeof (r->finfo)); + + dir_config->buf = calloc (1, dir_config->xattr_file_size); + if (!dir_config->buf) { + return HTTP_INTERNAL_SERVER_ERROR; + } + + ret = glusterfs_lookup (dir_config->handle, path, dir_config->buf, + dir_config->xattr_file_size, &r->finfo); + + if (ret == -1 || r->finfo.st_size > dir_config->xattr_file_size || S_ISDIR (r->finfo.st_mode)) { + free (dir_config->buf); + dir_config->buf = NULL; + + if (ret == -1) { + int error = HTTP_NOT_FOUND; + char *emsg = NULL; + if (r->path_info == NULL) { + emsg = ap_pstrcat(r->pool, strerror (errno), r->filename, NULL); + } + else { + emsg = ap_pstrcat(r->pool, strerror (errno), r->filename, r->path_info, NULL); + } + ap_log_rerror(APLOG_MARK, APLOG_ERR|APLOG_NOERRNO, r, "%s", emsg); + if (errno != ENOENT) { + error = HTTP_INTERNAL_SERVER_ERROR; + } + return error; + } + } + + if (r->uri && strlen (r->uri) && r->uri[strlen(r->uri) - 1] == '/') + r->handler = NULL; + + r->filename = ap_pstrcat (r->pool, r->filename, r->path_info, 
NULL); + + if ((access_status = ap_find_types(r)) != 0) { + return DECLINED; + } + + return OK; +} + + +int +mod_glusterfs_readv_async_cbk (glusterfs_read_buf_t *buf, + void *cbk_data) +{ + glusterfs_async_local_t *local = cbk_data; + + pthread_mutex_lock (&local->lock); + { + local->async_read_complete = 1; + local->buf = buf; + pthread_cond_signal (&local->cond); + } + pthread_mutex_unlock (&local->lock); + + return 0; +} + +/* use read_async just to avoid memcpy of read buffer in libglusterfsclient */ +static int +mod_glusterfs_read_async (request_rec *r, int fd, off_t offset, off_t length) +{ + glusterfs_async_local_t local; + off_t end; + int nbytes; + int complete; + pthread_cond_init (&local.cond, NULL); + pthread_mutex_init (&local.lock, NULL); + + memset (&local, 0, sizeof (local)); + local.request = r; + + if (length > 0) + end = offset + length; + + do { + glusterfs_read_buf_t *buf; + int i; + if (length > 0) { + nbytes = end - offset; + if (nbytes > GLUSTERFS_CHUNK_SIZE) + nbytes = GLUSTERFS_CHUNK_SIZE; + } else + nbytes = GLUSTERFS_CHUNK_SIZE; + + glusterfs_read_async(fd, + nbytes, + offset, + mod_glusterfs_readv_async_cbk, + (void *)&local); + + pthread_mutex_lock (&local.lock); + { + while (!local.async_read_complete) { + pthread_cond_wait (&local.cond, &local.lock); + } + + local.op_ret = local.buf->op_ret; + local.op_errno = local.buf->op_errno; + + local.async_read_complete = 0; + buf = local.buf; + + if (length < 0) + complete = (local.buf->op_ret <= 0); + else { + local.read_bytes += local.buf->op_ret; + complete = ((local.read_bytes == length) || (local.buf->op_ret < 0)); + } + } + pthread_mutex_unlock (&local.lock); + + for (i = 0; i < buf->count; i++) { + if (ap_rwrite (buf->vector[i].iov_base, buf->vector[i].iov_len, r) < 0) { + local.op_ret = -1; + complete = 1; + break; + } + } + + glusterfs_free (buf); + + offset += nbytes; + } while (!complete); + + return (local.op_ret < 0 ? 
SERVER_ERROR : OK); +} + +/* TODO: to read blocks of size "length" from offset "offset" */ +/* + static int + mod_glusterfs_read_sync (request_rec *r, int fd, off_t offset, off_t length) + { + int error = OK; + off_t read_bytes; + char buf [GLUSTERFS_CHUNK_SIZE]; + + while ((read_bytes = glusterfs_read (fd, buf, GLUSTERFS_CHUNK_SIZE)) && read_bytes != -1) { + ap_rwrite (buf, read_bytes, r); + } + if (read_bytes) { + error = SERVER_ERROR; + } + return error; + } +*/ + +static int +mod_glusterfs_handler(request_rec *r) +{ + glusterfs_dir_config_t *dir_config; + char *path = NULL; + int error = OK; + int rangestatus = 0; + int errstatus = OK; + int fd; + + if (!r->handler || (r->handler && strcmp (r->handler, GLUSTERFS_HANDLER))) + return DECLINED; + + if (r->uri[0] == '\0' || r->uri[strlen(r->uri) - 1] == '/') { + return DECLINED; + } + + dir_config = mod_glusterfs_dconfig (r); + + if (r->method_number != M_GET) { + return METHOD_NOT_ALLOWED; + } + + if (!dir_config->handle) { + ap_log_rerror (APLOG_MARK, APLOG_ERR, r, + "glusterfs initialization failed\n"); + return FORBIDDEN; + } + + ap_update_mtime(r, r->finfo.st_mtime); + ap_set_last_modified(r); + ap_set_etag(r); + ap_table_setn(r->headers_out, "Accept-Ranges", "bytes"); + if (((errstatus = ap_meets_conditions(r)) != OK) + || (errstatus = ap_set_content_length(r, r->finfo.st_size))) { + return errstatus; + } + rangestatus = ap_set_byterange(r); + ap_send_http_header(r); + + if (r->finfo.st_size <= dir_config->xattr_file_size && dir_config->buf) { + if (!r->header_only) { + error = OK; + ap_log_rerror (APLOG_MARK, APLOG_NOTICE, r, + "fetching data from glusterfs through xattr interface\n"); + + if (!rangestatus) { + if (ap_rwrite (dir_config->buf, r->finfo.st_size, r) < 0) { + error = HTTP_INTERNAL_SERVER_ERROR; + } + } else { + long offset, length; + while (ap_each_byterange (r, &offset, &length)) { + if (ap_rwrite (dir_config->buf + offset, length, r) < 0) { + error = HTTP_INTERNAL_SERVER_ERROR; + break; + } + 
} + } + } + + free (dir_config->buf); + dir_config->buf = NULL; + + return error; + } + + path = r->uri + strlen (dir_config->mount_dir); + fd = glusterfs_open (dir_config->handle, path , O_RDONLY, 0); + + if (fd == 0) { + ap_log_rerror(APLOG_MARK, APLOG_ERR, r, + "file permissions deny server access: %s", r->filename); + return FORBIDDEN; + } + + if (!r->header_only) { + if (!rangestatus) { + mod_glusterfs_read_async (r, fd, 0, -1); + } else { + long offset, length; + while (ap_each_byterange(r, &offset, &length)) { + mod_glusterfs_read_async (r, fd, offset, length); + } + } + } + + glusterfs_close (fd); + return error; +} + +static const command_rec mod_glusterfs_cmds[] = +{ + {"GlusterfsLogfile", add_logfile, NULL, + GLUSTERFS_CMD_PERMS, TAKE1, + "Glusterfs Logfile"}, + {"GlusterfsLoglevel", set_loglevel, NULL, + GLUSTERFS_CMD_PERMS, TAKE1, + "Glusterfs Loglevel:anyone of none, critical, error, warning, debug"}, + {"GlusterfsCacheTimeout", set_cache_timeout, NULL, + GLUSTERFS_CMD_PERMS, TAKE1, + "Timeout value in seconds for caching lookups and stats"}, + {"GlusterfsVolumeSpecfile", add_specfile, NULL, + GLUSTERFS_CMD_PERMS, TAKE1, + "Glusterfs Specfile required to access contents of this directory"}, + {"GlusterfsXattrFileSize", add_xattr_file_size, NULL, + GLUSTERFS_CMD_PERMS, TAKE1, + "Maximum size of the file to be fetched using xattr interface of glusterfs"}, + {NULL} +}; + +static const handler_rec mod_glusterfs_handlers[] = +{ + {GLUSTERFS_HANDLER, mod_glusterfs_handler}, + {NULL} +}; + +module glusterfs_module = +{ + STANDARD_MODULE_STUFF, + NULL, + mod_glusterfs_create_dir_config, /* per-directory config creator */ + NULL, + NULL, /* server config creator */ + NULL, /* server config merger */ + mod_glusterfs_cmds, /* command table */ + mod_glusterfs_handlers, /* [7] list of handlers */ + NULL, /* [2] filename-to-URI translation */ + NULL, /* [5] check/validate user_id */ + NULL, /* [6] check user_id is valid *here* */ + NULL, /* [4] check access by host 
address */ + NULL, /* [7] MIME type checker/setter */ + mod_glusterfs_fixup, /* [8] fixups */ + NULL, /* [10] logger */ +#if MODULE_MAGIC_NUMBER >= 19970103 + NULL, /* [3] header parser */ +#endif +#if MODULE_MAGIC_NUMBER >= 19970719 + mod_glusterfs_child_init, /* process initializer */ +#endif +#if MODULE_MAGIC_NUMBER >= 19970728 + mod_glusterfs_child_exit, /* process exit/cleanup */ +#endif +#if MODULE_MAGIC_NUMBER >= 19970902 + NULL /* [1] post read_request handling */ +#endif +}; diff --git a/mod_glusterfs/apache/2.2/Makefile.am b/mod_glusterfs/apache/2.2/Makefile.am new file mode 100644 index 000000000..d471a3f92 --- /dev/null +++ b/mod_glusterfs/apache/2.2/Makefile.am @@ -0,0 +1,3 @@ +SUBDIRS = src + +CLEANFILES = diff --git a/mod_glusterfs/apache/2.2/src/Makefile.am b/mod_glusterfs/apache/2.2/src/Makefile.am new file mode 100644 index 000000000..1e8f3a31e --- /dev/null +++ b/mod_glusterfs/apache/2.2/src/Makefile.am @@ -0,0 +1,31 @@ +mod_glusterfs_PROGRAMS = mod_glusterfs.so +mod_glusterfsdir = $(libdir)/glusterfs/$(PACKAGE_VERSION)/apache/2.2 + +mod_glusterfs_so_SOURCES = mod_glusterfs.c + +all: mod_glusterfs.so + +mod_glusterfs.so: $(top_srcdir)/mod_glusterfs/apache/2.2/src/mod_glusterfs.c $(top_builddir)/libglusterfsclient/src/libglusterfsclient.la + ln -sf $(top_srcdir)/mod_glusterfs/apache/2.2/src/mod_glusterfs.c $(top_builddir)/mod_glusterfs/apache/2.2/src/mod_glusterfs-build.c + $(APXS) -c -o mod_glusterfs.la -Wc,-g3 -Wc,-O0 -DLINUX=2 -D_REENTRANT -D_GNU_SOURCE -D_LARGEFILE64_SOURCE -D_FILE_OFFSET_BITS=64 -D__USE_FILE_OFFSET64 -I$(top_srcdir)/libglusterfsclient/src -L$(top_builddir)/libglusterfsclient/src/.libs/ -lglusterfsclient mod_glusterfs-build.c + -ln -sf .libs/mod_glusterfs.so mod_glusterfs.so + +$(top_builddir)/libglusterfsclient/src/libglusterfsclient.la: + $(MAKE) -C $(top_builddir)/libglusterfsclient/src/ all + +install-data-local: + @echo "" + @echo "" + @echo 
"**********************************************************************************" + @echo "* TO INSTALL MODGLUSTERFS, PLEASE USE, " + @echo "* $(APXS) -n glusterfs -ia $(mod_glusterfsdir)/mod_glusterfs.so " + @echo "**********************************************************************************" + @echo "" + @echo "" + +#install: +# cp -fv mod_glusterfs.so $(HTTPD_LIBEXECDIR) +# cp -fv httpd.conf $(HTTPD_CONF_DIR) + +clean: + rm -fv *.so *.o diff --git a/mod_glusterfs/apache/2.2/src/README.txt b/mod_glusterfs/apache/2.2/src/README.txt new file mode 100644 index 000000000..002984542 --- /dev/null +++ b/mod_glusterfs/apache/2.2/src/README.txt @@ -0,0 +1,105 @@ +What is mod_glusterfs? +====================== +* mod_glusterfs is a module for apache written for efficient serving of files from glusterfs. + mod_glusterfs interfaces with glusterfs using apis provided by libglusterfsclient. + +* this README speaks about installing mod_glusterfs for httpd-2.2 and higher. + +Prerequisites for mod_glusterfs +=============================== +Though mod_glusterfs has been written as a module, with an intent of making no changes to +the way apache has been built, currently following points have to be taken care of: + +* since glusterfs is compiled with _FILE_OFFSET_BITS=64 and __USE_FILE_OFFSET64 flags, mod_glusterfs and apache + in turn have to be compiled with the above two flags. + + $ tar xzf httpd-2.2.10.tar.gz + $ cd httpd-2.2.10/ + $ export CFLAGS='-D_FILE_OFFSET_BITS=64 -D__USE_FILE_OFFSET64' + $ ./configure --prefix=/usr + $ make + $ make install + $ httpd -l | grep -i mod_so + mod_so.c + +* if multiple apache installations are present, make sure to pass --with-apxs=/path/to/apxs/of/proper/version + to configure script while building glusterfs. + +Build/Install mod_glusterfs +=========================== +* mod_glusterfs is provided with glusterfs--mainline--3.0 and all releases from the same branch. + +* building glusterfs also builds mod_glusterfs. 
But 'make install' of glusterfs installs mod_glusterfs.so to + glusterfs install directory instead of the apache modules directory. + +* 'make install' of glusterfs will print a message similar to the one given below, which is self explanatory. + Make sure to use apxs of proper apache version in case of multiple apache installations. This will copy + mod_glusterfs.so to modules directory of proper apache version and modify the appropriate httpd.conf to enable + mod_glusterfs. + +********************************************************************************** +* TO INSTALL MODGLUSTERFS, PLEASE USE, +* apxs -n glusterfs -ia /usr/lib/glusterfs/1.4.0tla872/apache/2.2/mod_glusterfs.so +********************************************************************************** + +Configuration +============= +* The following configuration has to be added to httpd.conf. + + <Location "/glusterfs"> + GlusterfsLogfile "/var/log/glusterfs/glusterfs.log" + GlusterfsLoglevel "warning" + GlusterfsVolumeSpecfile "/etc/glusterfs/glusterfs-client.spec" + GlusterfsCacheTimeout "600" + GlusterfsXattrFileSize "65536" + SetHandler "glusterfs-handler" + </Location> + +* GlusterfsVolumeSpecfile (COMPULSORY) + Path to the glusterfs volume specification file. + +* GlusterfsLogfile (COMPULSORY) + Path to the glusterfs logfile. + +* GlusterfsLoglevel (OPTIONAL, default = warning) + Severity of messages that are to be logged. Allowed values are critical, error, warning, debug, none + in the decreasing order of severity. + +* GlusterfsCacheTimeout (OPTIONAL, default = 0) + Timeout value, in seconds, for the glusterfs stat and lookup caches. + +* GlusterfsXattrFileSize (OPTIONAL, default = 0) + Files with sizes up to and including this value are fetched through the extended attribute interface of + glusterfs rather than the usual open-read-close set of operations. For files of small sizes, it is recommended + to use extended attribute interface. 
+ +* With the above configuration all the requests to httpd of the form www.example.org/glusterfs/path/to/file are + served from glusterfs. + +* mod_glusterfs also implements mod_dir and mod_autoindex behaviour for files under glusterfs mount. + Hence it also takes the directives related to both of these modules. For more details, refer to the + documentation for both of these modules. + +Miscellaneous points +==================== +* httpd by default runs with username "nobody" and group "nogroup". Permissions of logfile and specfile have to + be set suitably. + +* Since mod_glusterfs runs with permissions of nobody.nogroup, glusterfs has to use only login based + authentication. See doc/authentication.txt for more details. + +* To copy the data served by httpd into glusterfs mountpoint, glusterfs can be started with the + volume-specification file provided to mod_glusterfs. Any tool like cp can then be used. + +* To run in gdb, apache has to be compiled with -lpthread, since libglusterfsclient is + multithreaded. If it is not, gdb on Linux runs into errors like: + "Error while reading shared library symbols: + Cannot find new threads: generic error" + +* When used with the ib-verbs transport, ib-verbs initialization fails. + The reason for this is that apache runs as a non-privileged user and the amount of memory that can be + locked by default is not sufficient for ib-verbs. To fix this, run the following as root, + + # ulimit -l unlimited + + and then start apache. diff --git a/mod_glusterfs/apache/2.2/src/mod_glusterfs.c b/mod_glusterfs/apache/2.2/src/mod_glusterfs.c new file mode 100644 index 000000000..dff058178 --- /dev/null +++ b/mod_glusterfs/apache/2.2/src/mod_glusterfs.c @@ -0,0 +1,3536 @@ +/* + Copyright (c) 2008 Z RESEARCH, Inc. + This file is part of GlusterFS. 
+ + GlusterFS is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published + by the Free Software Foundation; either version 3 of the License, + or (at your option) any later version. + + GlusterFS is distributed in the hope that it will be useful, but + WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program. If not, see + . +*/ + +#ifndef CORE_PRIVATE +#define CORE_PRIVATE +#endif + +#ifndef NO_CONTENT_TYPE +#define NO_CONTENT_TYPE "none" +#endif + +#define BYTERANGE_FMT "%" APR_OFF_T_FMT "-%" APR_OFF_T_FMT "/%" APR_OFF_T_FMT + +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#define GLUSTERFS_INVALID_LOGLEVEL "mod_glfs: Unrecognized log-level \"%s\", possible values are \"DEBUG|WARNING|ERROR|CRITICAL|NONE\"\n" + +#define GLUSTERFS_HANDLER "glusterfs-handler" +#define GLUSTERFS_CHUNK_SIZE 131072 + +static char c_by_encoding, c_by_type, c_by_path; + +#define BY_ENCODING &c_by_encoding +#define BY_TYPE &c_by_type +#define BY_PATH &c_by_path + +module AP_MODULE_DECLARE_DATA glusterfs_module; +extern module core_module; + +#define NO_OPTIONS (1 << 0) /* Indexing options */ +#define ICONS_ARE_LINKS (1 << 1) +#define SCAN_HTML_TITLES (1 << 2) +#define SUPPRESS_ICON (1 << 3) +#define SUPPRESS_LAST_MOD (1 << 4) +#define SUPPRESS_SIZE (1 << 5) +#define SUPPRESS_DESC (1 << 6) +#define SUPPRESS_PREAMBLE (1 << 7) +#define SUPPRESS_COLSORT (1 << 8) +#define SUPPRESS_RULES (1 << 9) +#define FOLDERS_FIRST (1 << 10) +#define VERSION_SORT (1 << 11) +#define TRACK_MODIFIED (1 << 12) +#define FANCY_INDEXING (1 << 13) +#define TABLE_INDEXING (1 << 14) +#define 
IGNORE_CLIENT (1 << 15) +#define IGNORE_CASE (1 << 16) +#define EMIT_XHTML (1 << 17) +#define SHOW_FORBIDDEN (1 << 18) + +#define K_NOADJUST 0 +#define K_ADJUST 1 +#define K_UNSET 2 + +/* + * Define keys for sorting. + */ +#define K_NAME 'N' /* Sort by file name (default) */ +#define K_LAST_MOD 'M' /* Last modification date */ +#define K_SIZE 'S' /* Size (absolute, not as displayed) */ +#define K_DESC 'D' /* Description */ +#define K_VALID "NMSD" /* String containing _all_ valid K_ opts */ + +#define D_ASCENDING 'A' +#define D_DESCENDING 'D' +#define D_VALID "AD" /* String containing _all_ valid D_ opts */ + +/* + * These are the dimensions of the default icons supplied with Apache. + */ +#define DEFAULT_ICON_WIDTH 20 +#define DEFAULT_ICON_HEIGHT 22 + +/* + * Other default dimensions. + */ +#define DEFAULT_NAME_WIDTH 23 +#define DEFAULT_DESC_WIDTH 23 + +struct mod_glfs_ai_item { + char *type; + char *apply_to; + char *apply_path; + char *data; +}; + +typedef struct mod_glfs_ai_desc_t { + char *pattern; + char *description; + int full_path; + int wildcards; +} mod_glfs_ai_desc_t; + +typedef enum { + SLASH_OFF = 0, + SLASH_ON, + SLASH_UNSET +} mod_glfs_dir_slash_cfg; + +/* static ap_filter_rec_t *mod_glfs_output_filter_handle; */ + +/*TODO: verify error returns to server core */ + +typedef struct glusterfs_dir_config { + char *logfile; + char *loglevel; + char *specfile; + char *mount_dir; + char *buf; + size_t xattr_file_size; + uint32_t cache_timeout; + libglusterfs_handle_t handle; + + /* mod_dir options */ + apr_array_header_t *index_names; + mod_glfs_dir_slash_cfg do_slash; + + /* autoindex options */ + char *default_icon; + char *style_sheet; + apr_int32_t opts; + apr_int32_t incremented_opts; + apr_int32_t decremented_opts; + int name_width; + int name_adjust; + int desc_width; + int desc_adjust; + int icon_width; + int icon_height; + char default_keyid; + char default_direction; + + apr_array_header_t *icon_list; + apr_array_header_t *alt_list; + 
apr_array_header_t *desc_list; + apr_array_header_t *ign_list; + apr_array_header_t *hdr_list; + apr_array_header_t *rdme_list; + + char *ctype; + char *charset; +} glusterfs_dir_config_t; + +typedef struct glusterfs_async_local { + int op_ret; + int op_errno; + char async_read_complete; + off_t length; + off_t read_bytes; + glusterfs_read_buf_t *buf; + request_rec *request; + pthread_mutex_t lock; + pthread_cond_t cond; +}glusterfs_async_local_t; + +#define GLUSTERFS_CMD_PERMS ACCESS_CONF + + +static glusterfs_dir_config_t * +mod_glfs_dconfig (request_rec *r) +{ + glusterfs_dir_config_t *dir_config = NULL; + if (r->per_dir_config != NULL) { + dir_config = ap_get_module_config (r->per_dir_config, &glusterfs_module); + } + + return dir_config; +} + + +static const char * +cmd_add_xattr_file_size (cmd_parms *cmd, void *dummy, const char *arg) +{ + glusterfs_dir_config_t *dir_config = dummy; + dir_config->xattr_file_size = atoi (arg); + return NULL; +} + + +static const char * +cmd_set_cache_timeout (cmd_parms *cmd, void *dummy, const char *arg) +{ + glusterfs_dir_config_t *dir_config = dummy; + dir_config->cache_timeout = atoi (arg); + return NULL; +} + + +static const char * +cmd_set_loglevel (cmd_parms *cmd, void *dummy, const char *arg) +{ + glusterfs_dir_config_t *dir_config = dummy; + char *error = NULL; + if (strncasecmp (arg, "DEBUG", strlen ("DEBUG")) + && strncasecmp (arg, "WARNING", strlen ("WARNING")) + && strncasecmp (arg, "CRITICAL", strlen ("CRITICAL")) + && strncasecmp (arg, "NONE", strlen ("NONE")) + && strncasecmp (arg, "ERROR", strlen ("ERROR"))) + error = GLUSTERFS_INVALID_LOGLEVEL; + else + dir_config->loglevel = apr_pstrdup (cmd->pool, arg); + + return error; +} + +static const char * +cmd_add_logfile (cmd_parms *cmd, void *dummy, const char *arg) +{ + glusterfs_dir_config_t *dir_config = dummy; + dir_config->logfile = apr_pstrdup (cmd->pool, arg); + + return NULL; +} + + +static const char * +cmd_add_volume_specfile (cmd_parms *cmd, void *dummy, 
const char *arg) +{ + glusterfs_dir_config_t *dir_config = dummy; + + dir_config->specfile = apr_pstrdup (cmd->pool, arg); + + return NULL; +} + +#define WILDCARDS_REQUIRED 0 + +static const char * +cmd_add_desc (cmd_parms *cmd, void *d, const char *desc, + const char *to) +{ + glusterfs_dir_config_t *dcfg = (glusterfs_dir_config_t *) d; + mod_glfs_ai_desc_t *desc_entry; + char *prefix = ""; + + desc_entry = (mod_glfs_ai_desc_t *) apr_array_push(dcfg->desc_list); + desc_entry->full_path = (ap_strchr_c(to, '/') == NULL) ? 0 : 1; + desc_entry->wildcards = (WILDCARDS_REQUIRED + || desc_entry->full_path + || apr_fnmatch_test(to)); + if (desc_entry->wildcards) { + prefix = desc_entry->full_path ? "*/" : "*"; + desc_entry->pattern = apr_pstrcat(dcfg->desc_list->pool, + prefix, to, "*", NULL); + } + else { + desc_entry->pattern = apr_pstrdup(dcfg->desc_list->pool, to); + } + desc_entry->description = apr_pstrdup(dcfg->desc_list->pool, desc); + return NULL; +} + + +static void push_item(apr_array_header_t *arr, char *type, const char *to, + const char *path, const char *data) +{ + struct mod_glfs_ai_item *p = (struct mod_glfs_ai_item *) apr_array_push(arr); + + if (!to) { + to = ""; + } + if (!path) { + path = ""; + } + + p->type = type; + p->data = data ? 
apr_pstrdup(arr->pool, data) : NULL; + p->apply_path = apr_pstrcat(arr->pool, path, "*", NULL); + + if ((type == BY_PATH) && (!ap_is_matchexp(to))) { + p->apply_to = apr_pstrcat(arr->pool, "*", to, NULL); + } + else if (to) { + p->apply_to = apr_pstrdup(arr->pool, to); + } + else { + p->apply_to = NULL; + } +} + + +static const char * +cmd_add_ignore (cmd_parms *cmd, void *d, const char *ext) +{ + push_item(((glusterfs_dir_config_t *) d)->ign_list, 0, ext, cmd->path, NULL); + return NULL; +} + + +static const char * +cmd_add_header (cmd_parms *cmd, void *d, const char *name) +{ + push_item(((glusterfs_dir_config_t *) d)->hdr_list, 0, NULL, cmd->path, + name); + return NULL; +} + + +static const char * +cmd_add_readme (cmd_parms *cmd, void *d, const char *name) +{ + push_item(((glusterfs_dir_config_t *) d)->rdme_list, 0, NULL, cmd->path, + name); + return NULL; +} + + +static const char * +cmd_add_opts (cmd_parms *cmd, void *d, int argc, char *const argv[]) +{ + int i; + char *w; + apr_int32_t opts; + apr_int32_t opts_add; + apr_int32_t opts_remove; + char action; + glusterfs_dir_config_t *d_cfg = (glusterfs_dir_config_t *) d; + + opts = d_cfg->opts; + opts_add = d_cfg->incremented_opts; + opts_remove = d_cfg->decremented_opts; + + for (i = 0; i < argc; i++) { + int option = 0; + w = argv[i]; + + if ((*w == '+') || (*w == '-')) { + action = *(w++); + } + else { + action = '\0'; + } + if (!strcasecmp(w, "FancyIndexing")) { + option = FANCY_INDEXING; + } + else if (!strcasecmp(w, "FoldersFirst")) { + option = FOLDERS_FIRST; + } + else if (!strcasecmp(w, "HTMLTable")) { + option = TABLE_INDEXING; + } + else if (!strcasecmp(w, "IconsAreLinks")) { + option = ICONS_ARE_LINKS; + } + else if (!strcasecmp(w, "IgnoreCase")) { + option = IGNORE_CASE; + } + else if (!strcasecmp(w, "IgnoreClient")) { + option = IGNORE_CLIENT; + } + else if (!strcasecmp(w, "ScanHTMLTitles")) { + option = SCAN_HTML_TITLES; + } + else if (!strcasecmp(w, "SuppressColumnSorting")) { + option = 
SUPPRESS_COLSORT; + } + else if (!strcasecmp(w, "SuppressDescription")) { + option = SUPPRESS_DESC; + } + else if (!strcasecmp(w, "SuppressHTMLPreamble")) { + option = SUPPRESS_PREAMBLE; + } + else if (!strcasecmp(w, "SuppressIcon")) { + option = SUPPRESS_ICON; + } + else if (!strcasecmp(w, "SuppressLastModified")) { + option = SUPPRESS_LAST_MOD; + } + else if (!strcasecmp(w, "SuppressSize")) { + option = SUPPRESS_SIZE; + } + else if (!strcasecmp(w, "SuppressRules")) { + option = SUPPRESS_RULES; + } + else if (!strcasecmp(w, "TrackModified")) { + option = TRACK_MODIFIED; + } + else if (!strcasecmp(w, "VersionSort")) { + option = VERSION_SORT; + } + else if (!strcasecmp(w, "XHTML")) { + option = EMIT_XHTML; + } + else if (!strcasecmp(w, "ShowForbidden")) { + option = SHOW_FORBIDDEN; + } + else if (!strcasecmp(w, "None")) { + if (action != '\0') { + return "Cannot combine '+' or '-' with 'None' keyword"; + } + opts = NO_OPTIONS; + opts_add = 0; + opts_remove = 0; + } + else if (!strcasecmp(w, "IconWidth")) { + if (action != '-') { + d_cfg->icon_width = DEFAULT_ICON_WIDTH; + } + else { + d_cfg->icon_width = 0; + } + } + else if (!strncasecmp(w, "IconWidth=", 10)) { + if (action == '-') { + return "Cannot combine '-' with IconWidth=n"; + } + d_cfg->icon_width = atoi(&w[10]); + } + else if (!strcasecmp(w, "IconHeight")) { + if (action != '-') { + d_cfg->icon_height = DEFAULT_ICON_HEIGHT; + } + else { + d_cfg->icon_height = 0; + } + } + else if (!strncasecmp(w, "IconHeight=", 11)) { + if (action == '-') { + return "Cannot combine '-' with IconHeight=n"; + } + d_cfg->icon_height = atoi(&w[11]); + } + else if (!strcasecmp(w, "NameWidth")) { + if (action != '-') { + return "NameWidth with no value may only appear as " + "'-NameWidth'"; + } + d_cfg->name_width = DEFAULT_NAME_WIDTH; + d_cfg->name_adjust = K_NOADJUST; + } + else if (!strncasecmp(w, "NameWidth=", 10)) { + if (action == '-') { + return "Cannot combine '-' with NameWidth=n"; + } + if (w[10] == '*') { + 
d_cfg->name_adjust = K_ADJUST; + } + else { + int width = atoi(&w[10]); + + if (width && (width < 5)) { + return "NameWidth value must be greater than 5"; + } + d_cfg->name_width = width; + d_cfg->name_adjust = K_NOADJUST; + } + } + else if (!strcasecmp(w, "DescriptionWidth")) { + if (action != '-') { + return "DescriptionWidth with no value may only appear as " + "'-DescriptionWidth'"; + } + d_cfg->desc_width = DEFAULT_DESC_WIDTH; + d_cfg->desc_adjust = K_NOADJUST; + } + else if (!strncasecmp(w, "DescriptionWidth=", 17)) { + if (action == '-') { + return "Cannot combine '-' with DescriptionWidth=n"; + } + if (w[17] == '*') { + d_cfg->desc_adjust = K_ADJUST; + } + else { + int width = atoi(&w[17]); + + if (width && (width < 12)) { + return "DescriptionWidth value must be greater than 12"; + } + d_cfg->desc_width = width; + d_cfg->desc_adjust = K_NOADJUST; + } + } + else if (!strncasecmp(w, "Type=", 5)) { + d_cfg->ctype = apr_pstrdup(cmd->pool, &w[5]); + } + else if (!strncasecmp(w, "Charset=", 8)) { + d_cfg->charset = apr_pstrdup(cmd->pool, &w[8]); + } + else { + return "Invalid directory indexing option"; + } + if (action == '\0') { + opts |= option; + opts_add = 0; + opts_remove = 0; + } + else if (action == '+') { + opts_add |= option; + opts_remove &= ~option; + } + else { + opts_remove |= option; + opts_add &= ~option; + } + } + if ((opts & NO_OPTIONS) && (opts & ~NO_OPTIONS)) { + return "Cannot combine other IndexOptions keywords with 'None'"; + } + d_cfg->incremented_opts = opts_add; + d_cfg->decremented_opts = opts_remove; + d_cfg->opts = opts; + return NULL; +} + + +static const char * +cmd_set_default_order(cmd_parms *cmd, void *m, + const char *direction, const char *key) +{ + glusterfs_dir_config_t *d_cfg = (glusterfs_dir_config_t *) m; + + if (!strcasecmp(direction, "Ascending")) { + d_cfg->default_direction = D_ASCENDING; + } + else if (!strcasecmp(direction, "Descending")) { + d_cfg->default_direction = D_DESCENDING; + } + else { + return "First 
keyword must be 'Ascending' or 'Descending'"; + } + + if (!strcasecmp(key, "Name")) { + d_cfg->default_keyid = K_NAME; + } + else if (!strcasecmp(key, "Date")) { + d_cfg->default_keyid = K_LAST_MOD; + } + else if (!strcasecmp(key, "Size")) { + d_cfg->default_keyid = K_SIZE; + } + else if (!strcasecmp(key, "Description")) { + d_cfg->default_keyid = K_DESC; + } + else { + return "Second keyword must be 'Name', 'Date', 'Size', or " + "'Description'"; + } + + return NULL; +} + + +static char c_by_encoding, c_by_type, c_by_path; + +#define BY_ENCODING &c_by_encoding +#define BY_TYPE &c_by_type +#define BY_PATH &c_by_path + +/* + * This routine puts the standard HTML header at the top of the index page. + * We include the DOCTYPE because we may be using features therefrom (i.e., + * HEIGHT and WIDTH attributes on the icons if we're FancyIndexing). + */ +static void emit_preamble(request_rec *r, int xhtml, const char *title) +{ + glusterfs_dir_config_t *d; + + d = (glusterfs_dir_config_t *) ap_get_module_config(r->per_dir_config, + &glusterfs_module); + + if (xhtml) { + ap_rvputs(r, DOCTYPE_XHTML_1_0T, + "\n" + " \n Index of ", title, + "\n", NULL); + } else { + ap_rvputs(r, DOCTYPE_HTML_3_2, + "\n \n" + " Index of ", title, + "\n", NULL); + } + + if (d->style_sheet != NULL) { + ap_rvputs(r, " style_sheet, + "\" type=\"text/css\"", xhtml ? 
" />\n" : ">\n", NULL); + } + ap_rvputs(r, " \n \n", NULL); +} + + +static const char *cmd_add_alt(cmd_parms *cmd, void *d, const char *alt, + const char *to) +{ + if (cmd->info == BY_PATH) { + if (!strcmp(to, "**DIRECTORY**")) { + to = "^^DIRECTORY^^"; + } + } + if (cmd->info == BY_ENCODING) { + char *tmp = apr_pstrdup(cmd->pool, to); + ap_str_tolower(tmp); + to = tmp; + } + + push_item(((glusterfs_dir_config_t *) d)->alt_list, cmd->info, to, + cmd->path, alt); + return NULL; +} + +static const char *cmd_add_icon(cmd_parms *cmd, void *d, const char *icon, + const char *to) +{ + char *iconbak = apr_pstrdup(cmd->pool, icon); + + if (icon[0] == '(') { + char *alt; + char *cl = strchr(iconbak, ')'); + + if (cl == NULL) { + return "missing closing paren"; + } + alt = ap_getword_nc(cmd->pool, &iconbak, ','); + *cl = '\0'; /* Lose closing paren */ + cmd_add_alt(cmd, d, &alt[1], to); + } + if (cmd->info == BY_PATH) { + if (!strcmp(to, "**DIRECTORY**")) { + to = "^^DIRECTORY^^"; + } + } + if (cmd->info == BY_ENCODING) { + char *tmp = apr_pstrdup(cmd->pool, to); + ap_str_tolower(tmp); + to = tmp; + } + + push_item(((glusterfs_dir_config_t *) d)->icon_list, cmd->info, to, + cmd->path, iconbak); + return NULL; +} + + +static void * +mod_glfs_create_dir_config(apr_pool_t *p, char *dirspec) +{ + glusterfs_dir_config_t *dir_config = NULL; + + dir_config = (glusterfs_dir_config_t *) apr_pcalloc(p, sizeof(*dir_config)); + + dir_config->mount_dir = dirspec; + dir_config->logfile = dir_config->specfile = (char *)0; + dir_config->loglevel = "warning"; + dir_config->handle = (libglusterfs_handle_t) 0; + dir_config->cache_timeout = 0; + dir_config->buf = NULL; + + /* mod_dir options init */ + dir_config->index_names = NULL; + dir_config->do_slash = SLASH_UNSET; + + /* autoindex options init */ + dir_config->icon_width = 0; + dir_config->icon_height = 0; + dir_config->name_width = DEFAULT_NAME_WIDTH; + dir_config->name_adjust = K_UNSET; + dir_config->desc_width = DEFAULT_DESC_WIDTH; + 
dir_config->desc_adjust = K_UNSET; + dir_config->icon_list = apr_array_make(p, 4, sizeof(struct mod_glfs_ai_item)); + dir_config->alt_list = apr_array_make(p, 4, sizeof(struct mod_glfs_ai_item)); + dir_config->desc_list = apr_array_make(p, 4, sizeof(mod_glfs_ai_desc_t)); + dir_config->ign_list = apr_array_make(p, 4, sizeof(struct mod_glfs_ai_item)); + dir_config->hdr_list = apr_array_make(p, 4, sizeof(struct mod_glfs_ai_item)); + dir_config->rdme_list = apr_array_make(p, 4, sizeof(struct mod_glfs_ai_item)); + dir_config->opts = 0; + dir_config->incremented_opts = 0; + dir_config->decremented_opts = 0; + dir_config->default_keyid = '\0'; + dir_config->default_direction = '\0'; + + return (void *) dir_config; +} + + +static void * +mod_glfs_merge_dir_config(apr_pool_t *p, void *parent_conf, + void *newloc_conf) +{ + glusterfs_dir_config_t *new = (glusterfs_dir_config_t *) + apr_pcalloc(p, sizeof(glusterfs_dir_config_t)); + glusterfs_dir_config_t *add = newloc_conf; + glusterfs_dir_config_t *base = parent_conf; + + if (add->logfile) + new->logfile = apr_pstrdup (p, add->logfile); + + if (add->loglevel) + new->loglevel = apr_pstrdup (p, add->loglevel); + + if (add->specfile) + new->specfile = apr_pstrdup (p, add->specfile); + + if (add->mount_dir) + new->mount_dir = apr_pstrdup (p, add->mount_dir); + + new->xattr_file_size = add->xattr_file_size; + new->cache_timeout = add->cache_timeout; + new->handle = add->handle; + new->buf = add->buf; + + /* mod_dir */ + new->index_names = add->index_names ? add->index_names : base->index_names; + new->do_slash = + (add->do_slash == SLASH_UNSET) ? base->do_slash : add->do_slash; + + /* auto index */ + new->default_icon = add->default_icon ? add->default_icon + : base->default_icon; + new->style_sheet = add->style_sheet ? add->style_sheet + : base->style_sheet; + new->icon_height = add->icon_height ? add->icon_height : base->icon_height; + new->icon_width = add->icon_width ? 
add->icon_width : base->icon_width; + + new->ctype = add->ctype ? add->ctype : base->ctype; + new->charset = add->charset ? add->charset : base->charset; + + new->alt_list = apr_array_append(p, add->alt_list, base->alt_list); + new->ign_list = apr_array_append(p, add->ign_list, base->ign_list); + new->hdr_list = apr_array_append(p, add->hdr_list, base->hdr_list); + new->desc_list = apr_array_append(p, add->desc_list, base->desc_list); + new->icon_list = apr_array_append(p, add->icon_list, base->icon_list); + new->rdme_list = apr_array_append(p, add->rdme_list, base->rdme_list); + if (add->opts & NO_OPTIONS) { + /* + * If the current directory says 'no options' then we also + * clear any incremental mods from being inheritable further down. + */ + new->opts = NO_OPTIONS; + new->incremented_opts = 0; + new->decremented_opts = 0; + } + else { + /* + * If there were any nonincremental options selected for + * this directory, they dominate and we don't inherit *anything.* + * Contrariwise, we *do* inherit if the only settings here are + * incremental ones. + */ + if (add->opts == 0) { + new->incremented_opts = (base->incremented_opts + | add->incremented_opts) + & ~add->decremented_opts; + new->decremented_opts = (base->decremented_opts + | add->decremented_opts); + /* + * We may have incremental settings, so make sure we don't + * inadvertently inherit an IndexOptions None from above. + */ + new->opts = (base->opts & ~NO_OPTIONS); + } + else { + /* + * There are local nonincremental settings, which clear + * all inheritance from above. They *are* the new base settings. + */ + new->opts = add->opts;; + } + /* + * We're guaranteed that there'll be no overlap between + * the add-options and the remove-options. + */ + new->opts |= new->incremented_opts; + new->opts &= ~new->decremented_opts; + } + /* + * Inherit the NameWidth settings if there aren't any specific to + * the new location; otherwise we'll end up using the defaults set in the + * config-rec creation routine. 
+ */ + if (add->name_adjust == K_UNSET) { + new->name_width = base->name_width; + new->name_adjust = base->name_adjust; + } + else { + new->name_width = add->name_width; + new->name_adjust = add->name_adjust; + } + + /* + * Likewise for DescriptionWidth. + */ + if (add->desc_adjust == K_UNSET) { + new->desc_width = base->desc_width; + new->desc_adjust = base->desc_adjust; + } + else { + new->desc_width = add->desc_width; + new->desc_adjust = add->desc_adjust; + } + + new->default_keyid = add->default_keyid ? add->default_keyid + : base->default_keyid; + new->default_direction = add->default_direction ? add->default_direction + : base->default_direction; + + return (void *) new; +} + + +static void +mod_glfs_child_init(apr_pool_t *p, server_rec *s) +{ + int i; + core_server_config *sconf = NULL; + ap_conf_vector_t **sec_ent = NULL; + glusterfs_dir_config_t *dir_config = NULL; + glusterfs_init_ctx_t ctx; + int num_sec = 0; + + sconf = (core_server_config *) ap_get_module_config (s->module_config, &core_module); + sec_ent = (ap_conf_vector_t **) sconf->sec_url->elts; + num_sec = sconf->sec_url->nelts; + + for (i = 0; i < num_sec; i++) { + dir_config = ap_get_module_config (sec_ent[i], &glusterfs_module); + + if (dir_config) { + memset (&ctx, 0, sizeof (ctx)); + + ctx.logfile = dir_config->logfile; + ctx.loglevel = dir_config->loglevel; + ctx.lookup_timeout = ctx.stat_timeout = dir_config->cache_timeout; + ctx.specfile = dir_config->specfile; + + dir_config->handle = glusterfs_init (&ctx); + if (!dir_config->handle) { + ap_log_error(APLOG_MARK, APLOG_ERR, APR_EGENERAL, s, + "mod_glfs_child_init: glusterfs_init failed, check glusterfs logfile %s for more details", + dir_config->logfile); + } + } + dir_config = NULL; + } +} + + +static void +mod_glfs_child_exit(server_rec *s, apr_pool_t *p) +{ + int i; + core_server_config *sconf = ap_get_module_config(s->module_config, + &core_module); + ap_conf_vector_t **sec_ent = (ap_conf_vector_t **) sconf->sec_url->elts; + 
glusterfs_dir_config_t *dir_config = NULL; + glusterfs_init_ctx_t ctx; + int num_sec = sconf->sec_url->nelts; + + for (i = 0; i < num_sec; i++) { + dir_config = ap_get_module_config (sec_ent[i], &glusterfs_module); + if (dir_config && dir_config->handle) { + glusterfs_fini (dir_config->handle); + dir_config->handle = 0; + } + dir_config = NULL; + } +} + + +static apr_filetype_e filetype_from_mode(mode_t mode) +{ + apr_filetype_e type = APR_NOFILE; + + if (S_ISREG(mode)) + type = APR_REG; + else if (S_ISDIR(mode)) + type = APR_DIR; + else if (S_ISCHR(mode)) + type = APR_CHR; + else if (S_ISBLK(mode)) + type = APR_BLK; + else if (S_ISFIFO(mode)) + type = APR_PIPE; + else if (S_ISLNK(mode)) + type = APR_LNK; + else if (S_ISSOCK(mode)) + type = APR_SOCK; + else + type = APR_UNKFILE; + return type; +} + + +static void fill_out_finfo(apr_finfo_t *finfo, struct stat *info, + apr_int32_t wanted) +{ + finfo->valid = APR_FINFO_MIN | APR_FINFO_IDENT | APR_FINFO_NLINK + | APR_FINFO_OWNER | APR_FINFO_PROT; + finfo->protection = apr_unix_mode2perms(info->st_mode); + finfo->filetype = filetype_from_mode(info->st_mode); + finfo->user = info->st_uid; + finfo->group = info->st_gid; + finfo->size = info->st_size; + finfo->device = info->st_dev; + finfo->nlink = info->st_nlink; + + /* Check for overflow if storing a 64-bit st_ino in a 32-bit + * apr_ino_t for LFS builds: */ + if (sizeof(apr_ino_t) >= sizeof(info->st_ino) + || (apr_ino_t)info->st_ino == info->st_ino) { + finfo->inode = info->st_ino; + } else { + finfo->valid &= ~APR_FINFO_INODE; + } + + apr_time_ansi_put(&finfo->atime, info->st_atime); +#ifdef HAVE_STRUCT_STAT_ST_ATIM_TV_NSEC + finfo->atime += info->st_atim.tv_nsec / APR_TIME_C(1000); +#elif defined(HAVE_STRUCT_STAT_ST_ATIMENSEC) + finfo->atime += info->st_atimensec / APR_TIME_C(1000); +#elif defined(HAVE_STRUCT_STAT_ST_ATIME_N) + finfo->ctime += info->st_atime_n / APR_TIME_C(1000); +#endif + + apr_time_ansi_put(&finfo->mtime, info->st_mtime); +#ifdef 
HAVE_STRUCT_STAT_ST_MTIM_TV_NSEC + finfo->mtime += info->st_mtim.tv_nsec / APR_TIME_C(1000); +#elif defined(HAVE_STRUCT_STAT_ST_MTIMENSEC) + finfo->mtime += info->st_mtimensec / APR_TIME_C(1000); +#elif defined(HAVE_STRUCT_STAT_ST_MTIME_N) + finfo->ctime += info->st_mtime_n / APR_TIME_C(1000); +#endif + + apr_time_ansi_put(&finfo->ctime, info->st_ctime); +#ifdef HAVE_STRUCT_STAT_ST_CTIM_TV_NSEC + finfo->ctime += info->st_ctim.tv_nsec / APR_TIME_C(1000); +#elif defined(HAVE_STRUCT_STAT_ST_CTIMENSEC) + finfo->ctime += info->st_ctimensec / APR_TIME_C(1000); +#elif defined(HAVE_STRUCT_STAT_ST_CTIME_N) + finfo->ctime += info->st_ctime_n / APR_TIME_C(1000); +#endif + +#ifdef HAVE_STRUCT_STAT_ST_BLOCKS +#ifdef DEV_BSIZE + finfo->csize = (apr_off_t)info->st_blocks * (apr_off_t)DEV_BSIZE; +#else + finfo->csize = (apr_off_t)info->st_blocks * (apr_off_t)512; +#endif + finfo->valid |= APR_FINFO_CSIZE; +#endif +} + + +static int +mod_glfs_map_to_storage(request_rec *r) +{ + glusterfs_dir_config_t *dir_config = NULL, *tmp = NULL; + int access_status; + int ret; + char *path = NULL; + struct stat st = {0, }; + core_server_config *sconf = NULL; + ap_conf_vector_t **sec_ent = NULL; + int num_sec = 0, i = 0; + + sconf = (core_server_config *) ap_get_module_config (r->server->module_config, &core_module); + sec_ent = (ap_conf_vector_t **) sconf->sec_url->elts; + num_sec = sconf->sec_url->nelts; + + for (i = 0; i < num_sec; i++) { + tmp = ap_get_module_config (sec_ent[i], &glusterfs_module); + + if (tmp && !strncmp (tmp->mount_dir, r->uri, strlen (tmp->mount_dir))) { + if (!dir_config || + strlen (tmp->mount_dir) > strlen (dir_config->mount_dir)) { + dir_config = tmp; + } + } + + } + + if (dir_config && dir_config->mount_dir && !(strncmp (apr_pstrcat (r->pool, dir_config->mount_dir, "/", NULL), r->uri, strlen (dir_config->mount_dir) + 1) && !r->handler)) + r->handler = GLUSTERFS_HANDLER; //apr_pstrdup (r->pool, GLUSTERFS_HANDLER); + + if (!r->handler || (r->handler && strcmp 
(r->handler, GLUSTERFS_HANDLER))) + return DECLINED; + + if (dir_config->mount_dir) + path = r->uri + strlen (dir_config->mount_dir); + + memset (&r->finfo, 0, sizeof (r->finfo)); + + dir_config->buf = calloc (1, dir_config->xattr_file_size); + if (!dir_config->buf) { + return HTTP_INTERNAL_SERVER_ERROR; + } + + if (!dir_config->handle) { + ap_log_rerror(APLOG_MARK, APLOG_ERR, APR_EGENERAL, r, + "mod_glfs_map_to_storage: glusterfs handle is NULL, check glusterfs logfile %s", + dir_config->logfile); + return HTTP_INTERNAL_SERVER_ERROR; + } + + ret = glusterfs_lookup (dir_config->handle, path, dir_config->buf, + dir_config->xattr_file_size, &st); + + if (ret == -1 || st.st_size > dir_config->xattr_file_size || S_ISDIR (st.st_mode)) { + free (dir_config->buf); + dir_config->buf = NULL; + + if (ret == -1) { + int error = HTTP_NOT_FOUND; + char *emsg = NULL; + if (r->path_info == NULL) { + emsg = apr_pstrcat(r->pool, strerror (errno), r->filename, NULL); + } + else { + emsg = apr_pstrcat(r->pool, strerror (errno), r->filename, r->path_info, NULL); + } + ap_log_rerror(APLOG_MARK, APLOG_ERR|APLOG_NOERRNO, 0, r, "%s", emsg); + if (errno != ENOENT) { + error = HTTP_INTERNAL_SERVER_ERROR; + } + return error; + } + } + + r->finfo.pool = r->pool; + r->finfo.fname = r->filename; + fill_out_finfo (&r->finfo, &st, + APR_FINFO_MIN | APR_FINFO_IDENT | APR_FINFO_NLINK | APR_FINFO_OWNER | APR_FINFO_PROT); + + /* r->filename = apr_pstrcat (r->pool, r->filename, r->path_info, NULL); */ + + /* allow core module to run directory_walk() and location_walk() */ + return DECLINED; +} + + +static int +mod_glfs_readv_async_cbk (glusterfs_read_buf_t *buf, + void *cbk_data) +{ + glusterfs_async_local_t *local = cbk_data; + + pthread_mutex_lock (&local->lock); + { + local->async_read_complete = 1; + local->buf = buf; + pthread_cond_signal (&local->cond); + } + pthread_mutex_unlock (&local->lock); + + return 0; +} + +/* use read_async just to avoid memcpy of read buffer in libglusterfsclient */ 
+static int +mod_glfs_read_async (request_rec *r, apr_bucket_brigade *bb, long fd, apr_off_t offset, apr_off_t length) +{ + glusterfs_async_local_t local; + off_t end; + int nbytes; + int complete; + conn_rec *c = r->connection; + apr_bucket *e = NULL; + apr_status_t status; + + if (length == 0) { + return 0; + } + + pthread_cond_init (&local.cond, NULL); + pthread_mutex_init (&local.lock, NULL); + + memset (&local, 0, sizeof (local)); + local.request = r; + + if (length > 0) + end = offset + length; + + do { + glusterfs_read_buf_t *buf; + if (length > 0) { + nbytes = end - offset; + if (nbytes > GLUSTERFS_CHUNK_SIZE) + nbytes = GLUSTERFS_CHUNK_SIZE; + } else + nbytes = GLUSTERFS_CHUNK_SIZE; + + glusterfs_read_async(fd, + nbytes, + offset, + mod_glfs_readv_async_cbk, + (void *)&local); + + pthread_mutex_lock (&local.lock); + { + while (!local.async_read_complete) { + pthread_cond_wait (&local.cond, &local.lock); + } + + local.op_ret = local.buf->op_ret; + local.op_errno = local.buf->op_errno; + + local.async_read_complete = 0; + buf = local.buf; + + if (length < 0) + complete = (local.buf->op_ret <= 0); + else { + local.read_bytes += local.buf->op_ret; + complete = ((local.read_bytes == length) || (local.buf->op_ret < 0)); + } + } + pthread_mutex_unlock (&local.lock); + + if (!bb) { + bb = apr_brigade_create (r->pool, c->bucket_alloc); + } + apr_brigade_writev (bb, NULL, NULL, buf->vector, buf->count); + + /* make sure all the data is written out, since we call glusterfs_free on buf once + ap_pass_brigade returns */ + e = apr_bucket_flush_create (c->bucket_alloc); + APR_BRIGADE_INSERT_TAIL (bb, e); + + status = ap_pass_brigade (r->output_filters, bb); + if (status != APR_SUCCESS) { + /* no way to know what type of error occurred */ + ap_log_rerror(APLOG_MARK, APLOG_DEBUG, status, r, + "mod_glfs_handler: ap_pass_brigade returned %i", + status); + complete = 1; + local.op_ret = -1; + } + + glusterfs_free (buf); + + /* bb has already been cleaned up by 
core_output_filter, just being paranoid */ + apr_brigade_cleanup (bb); + + offset += nbytes; + } while (!complete); + + return (local.op_ret < 0 ? HTTP_INTERNAL_SERVER_ERROR : OK); +} + +/* TODO: to read blocks of size "length" from offset "offset" */ +/* + static int + mod_glfs_read_sync (request_rec *r, int fd, off_t offset, off_t length) + { + int error = OK; + off_t read_bytes; + char buf [GLUSTERFS_CHUNK_SIZE]; + + while ((read_bytes = glusterfs_read (fd, buf, GLUSTERFS_CHUNK_SIZE)) && read_bytes != -1) { + ap_rwrite (buf, read_bytes, r); + } + if (read_bytes) { + error = SERVER_ERROR; + } + return error; + } +*/ + + +static int +parse_byterange(char *range, apr_off_t clength, + apr_off_t *start, apr_off_t *end) +{ + char *dash = strchr(range, '-'); + char *errp; + apr_off_t number; + + if (!dash) { + return 0; + } + + if ((dash == range)) { + /* In the form "-5" */ + if (apr_strtoff(&number, dash+1, &errp, 10) || *errp) { + return 0; + } + *start = clength - number; + *end = clength - 1; + } + else { + *dash++ = '\0'; + if (apr_strtoff(&number, range, &errp, 10) || *errp) { + return 0; + } + *start = number; + if (*dash) { + if (apr_strtoff(&number, dash, &errp, 10) || *errp) { + return 0; + } + *end = number; + } + else { /* "5-" */ + *end = clength - 1; + } + } + + if (*start < 0) { + *start = 0; + } + + if (*end >= clength) { + *end = clength - 1; + } + + if (*start > *end) { + return -1; + } + + return (*start > 0 || *end < clength); +} + + +static int use_range_x(request_rec *r) +{ + const char *ua; + return (apr_table_get(r->headers_in, "Request-Range") + || ((ua = apr_table_get(r->headers_in, "User-Agent")) + && ap_strstr_c(ua, "MSIE 3"))); +} + + +static int ap_set_byterange(request_rec *r) +{ + const char *range; + const char *if_range; + const char *match; + const char *ct; + int num_ranges; + + if (r->assbackwards) { + return 0; + } + + /* Check for Range request-header (HTTP/1.1) or Request-Range for + * backwards-compatibility with second-draft 
Luotonen/Franks + * byte-ranges (e.g. Netscape Navigator 2-3). + * + * We support this form, with Request-Range, and (farther down) we + * send multipart/x-byteranges instead of multipart/byteranges for + * Request-Range based requests to work around a bug in Netscape + * Navigator 2-3 and MSIE 3. + */ + + if (!(range = apr_table_get(r->headers_in, "Range"))) { + range = apr_table_get(r->headers_in, "Request-Range"); + } + + if (!range || strncasecmp(range, "bytes=", 6) || r->status != HTTP_OK) { + return 0; + } + + /* is content already a single range? */ + if (apr_table_get(r->headers_out, "Content-Range")) { + return 0; + } + + /* is content already a multiple range? */ + if ((ct = apr_table_get(r->headers_out, "Content-Type")) + && (!strncasecmp(ct, "multipart/byteranges", 20) + || !strncasecmp(ct, "multipart/x-byteranges", 22))) { + return 0; + } + + /* Check the If-Range header for Etag or Date. + * Note that this check will return false (as required) if either + * of the two etags are weak. 
+ */ + if ((if_range = apr_table_get(r->headers_in, "If-Range"))) { + if (if_range[0] == '"') { + if (!(match = apr_table_get(r->headers_out, "Etag")) + || (strcmp(if_range, match) != 0)) { + return 0; + } + } + else if (!(match = apr_table_get(r->headers_out, "Last-Modified")) + || (strcmp(if_range, match) != 0)) { + return 0; + } + } + + if (!ap_strchr_c(range, ',')) { + /* a single range */ + num_ranges = 1; + } + else { + /* a multiple range */ + num_ranges = 2; + } + + r->status = HTTP_PARTIAL_CONTENT; + r->range = range + 6; + + return num_ranges; +} + + +static void +mod_glfs_handle_byte_ranges (request_rec *r, long fd, int num_ranges) +{ + conn_rec *c = r->connection; + char *boundary = NULL, *bound_head = NULL; + const char *orig_ct = NULL; + apr_bucket_brigade *bsend = NULL; + apr_bucket *e = NULL; + apr_off_t range_start; + apr_off_t range_end; + char *current = NULL; + apr_status_t rv; + char found = 0; + + orig_ct = ap_make_content_type (r, r->content_type); + + if (num_ranges > 1) { + boundary = apr_psprintf(r->pool, "%" APR_UINT64_T_HEX_FMT "%lx", + (apr_uint64_t)r->request_time, (long) getpid()); + + ap_set_content_type(r, apr_pstrcat(r->pool, "multipart", + use_range_x(r) ? 
"/x-" : "/", + "byteranges; boundary=", + boundary, NULL)); + + if (strcasecmp(orig_ct, NO_CONTENT_TYPE)) { + bound_head = apr_pstrcat(r->pool, + CRLF "--", boundary, + CRLF "Content-type: ", + orig_ct, + CRLF "Content-range: bytes ", + NULL); + } + else { + /* if we have no type for the content, do our best */ + bound_head = apr_pstrcat(r->pool, + CRLF "--", boundary, + CRLF "Content-range: bytes ", + NULL); + } +// ap_xlate_proto_to_ascii(bound_head, strlen(bound_head)); + } + + while ((current = ap_getword(r->pool, &r->range, ',')) + && (rv = parse_byterange(current, r->finfo.size, &range_start, + &range_end))) { + apr_bucket *e2; + apr_bucket *ec; + + bsend = NULL; + if (rv == -1) { + continue; + } + + found = 1; + + /* For single range requests, we must produce Content-Range header. + * Otherwise, we need to produce the multipart boundaries. + */ + if (num_ranges == 1) { + apr_table_setn(r->headers_out, "Content-Range", + apr_psprintf(r->pool, "bytes " BYTERANGE_FMT, + range_start, range_end, r->finfo.size)); + } + else { + char *ts; + /* this brigade holds what we will be sending */ + bsend = apr_brigade_create(r->pool, c->bucket_alloc); + + e = apr_bucket_pool_create(bound_head, strlen(bound_head), + r->pool, c->bucket_alloc); + APR_BRIGADE_INSERT_TAIL(bsend, e); + + ts = apr_psprintf(r->pool, BYTERANGE_FMT CRLF CRLF, + range_start, range_end, r->finfo.size); +// ap_xlate_proto_to_ascii(ts, strlen(ts)); + e = apr_bucket_pool_create(ts, strlen(ts), r->pool, + c->bucket_alloc); + APR_BRIGADE_INSERT_TAIL(bsend, e); + } + mod_glfs_read_async (r, bsend, fd, range_start, (range_end + 1 - range_start)); + } + + bsend = apr_brigade_create (r->pool, c->bucket_alloc); + + if (found == 0) { + r->status = HTTP_OK; + /* bsend is assumed to be empty if we get here. 
*/ + e = ap_bucket_error_create(HTTP_RANGE_NOT_SATISFIABLE, NULL, + r->pool, c->bucket_alloc); + APR_BRIGADE_INSERT_TAIL(bsend, e); + e = apr_bucket_eos_create(c->bucket_alloc); + APR_BRIGADE_INSERT_TAIL(bsend, e); + ap_pass_brigade (r->output_filters, bsend); + return; + } + + if (num_ranges > 1) { + char *end; + + /* add the final boundary */ + end = apr_pstrcat(r->pool, CRLF "--", boundary, "--" CRLF, NULL); +// ap_xlate_proto_to_ascii(end, strlen(end)); + e = apr_bucket_pool_create(end, strlen(end), r->pool, c->bucket_alloc); + APR_BRIGADE_INSERT_TAIL(bsend, e); + } + + ap_pass_brigade (r->output_filters, bsend); +} + + + +/**************************************************************** + * + * Looking things up in config entries... + */ + +/* Structure used to hold entries when we're actually building an index */ + +struct ent { + char *name; + char *icon; + char *alt; + char *desc; + apr_off_t size; + apr_time_t lm; + struct ent *next; + int ascending, ignore_case, version_sort; + char key; + int isdir; +}; + +static char *find_item(request_rec *r, apr_array_header_t *list, int path_only) +{ + const char *content_type = ap_field_noparam(r->pool, r->content_type); + const char *content_encoding = r->content_encoding; + char *path = r->filename; + + struct mod_glfs_ai_item *items = (struct mod_glfs_ai_item *) list->elts; + int i; + + for (i = 0; i < list->nelts; ++i) { + struct mod_glfs_ai_item *p = &items[i]; + + /* Special cased for ^^DIRECTORY^^ and ^^BLANKICON^^ */ + if ((path[0] == '^') || (!ap_strcmp_match(path, p->apply_path))) { + if (!*(p->apply_to)) { + return p->data; + } + else if (p->type == BY_PATH || path[0] == '^') { + if (!ap_strcmp_match(path, p->apply_to)) { + return p->data; + } + } + else if (!path_only) { + if (!content_encoding) { + if (p->type == BY_TYPE) { + if (content_type + && !ap_strcasecmp_match(content_type, + p->apply_to)) { + return p->data; + } + } + } + else { + if (p->type == BY_ENCODING) { + if 
(!ap_strcasecmp_match(content_encoding, + p->apply_to)) { + return p->data; + } + } + } + } + } + } + return NULL; +} + +#define find_icon(d,p,t) find_item(p,d->icon_list,t) +#define find_alt(d,p,t) find_item(p,d->alt_list,t) +#define find_header(d,p) find_item(p,d->hdr_list,0) +#define find_readme(d,p) find_item(p,d->rdme_list,0) + +static char *find_default_item(char *bogus_name, apr_array_header_t *list) +{ + request_rec r; + /* Bleah. I tried to clean up find_item, and it lead to this bit + * of ugliness. Note that the fields initialized are precisely + * those that find_item looks at... + */ + r.filename = bogus_name; + r.content_type = r.content_encoding = NULL; + return find_item(&r, list, 1); +} + +#define find_default_icon(d,n) find_default_item(n, d->icon_list) +#define find_default_alt(d,n) find_default_item(n, d->alt_list) + +/* + * Look through the list of pattern/description pairs and return the first one + * if any) that matches the filename in the request. If multiple patterns + * match, only the first one is used; since the order in the array is the + * same as the order in which directives were processed, earlier matching + * directives will dominate. + */ + +#ifdef CASE_BLIND_FILESYSTEM +#define MATCH_FLAGS APR_FNM_CASE_BLIND +#else +#define MATCH_FLAGS 0 +#endif + +static char *find_desc(glusterfs_dir_config_t *dcfg, const char *filename_full) +{ + int i; + mod_glfs_ai_desc_t *list = (mod_glfs_ai_desc_t *) dcfg->desc_list->elts; + const char *filename_only; + const char *filename; + + /* + * If the filename includes a path, extract just the name itself + * for the simple matches. + */ + if ((filename_only = ap_strrchr_c(filename_full, '/')) == NULL) { + filename_only = filename_full; + } + else { + filename_only++; + } + for (i = 0; i < dcfg->desc_list->nelts; ++i) { + mod_glfs_ai_desc_t *tuple = &list[i]; + int found; + + /* + * Only use the full-path filename if the pattern contains '/'s. + */ + filename = (tuple->full_path) ? 
filename_full : filename_only; + /* + * Make the comparison using the cheapest method; only do + * wildcard checking if we must. + */ + if (tuple->wildcards) { + found = (apr_fnmatch(tuple->pattern, filename, MATCH_FLAGS) == 0); + } + else { + found = (ap_strstr_c(filename, tuple->pattern) != NULL); + } + if (found) { + return tuple->description; + } + } + return NULL; +} + +static int ignore_entry(glusterfs_dir_config_t *d, char *path) +{ + apr_array_header_t *list = d->ign_list; + struct mod_glfs_ai_item *items = (struct mod_glfs_ai_item *) list->elts; + char *tt; + int i; + + if ((tt = strrchr(path, '/')) == NULL) { + tt = path; + } + else { + tt++; + } + + for (i = 0; i < list->nelts; ++i) { + struct mod_glfs_ai_item *p = &items[i]; + char *ap; + + if ((ap = strrchr(p->apply_to, '/')) == NULL) { + ap = p->apply_to; + } + else { + ap++; + } + +#ifndef CASE_BLIND_FILESYSTEM + if (!ap_strcmp_match(path, p->apply_path) + && !ap_strcmp_match(tt, ap)) { + return 1; + } +#else /* !CASE_BLIND_FILESYSTEM */ + /* + * On some platforms, the match must be case-blind. This is really + * a factor of the filesystem involved, but we can't detect that + * reliably - so we have to granularise at the OS level. + */ + if (!ap_strcasecmp_match(path, p->apply_path) + && !ap_strcasecmp_match(tt, ap)) { + return 1; + } +#endif /* !CASE_BLIND_FILESYSTEM */ + } + return 0; +} + +/***************************************************************** + * + * Actually generating output + */ + +/* + * Elements of the emitted document: + * Preamble + * Emitted unless SUPPRESS_PREAMBLE is set AND ap_run_sub_req + * succeeds for the (content_type == text/html) header file. + * Header file + * Emitted if found (and able). + * H1 tag line + * Emitted if a header file is NOT emitted. + * Directory stuff + * Always emitted. + * HR + * Emitted if FANCY_INDEXING is set. + * Readme file + * Emitted if found (and able). 
+ * ServerSig + * Emitted if ServerSignature is not Off AND a readme file + * is NOT emitted. + * Postamble + * Emitted unless SUPPRESS_PREAMBLE is set AND ap_run_sub_req + * succeeds for the (content_type == text/html) readme file. + */ + + +/* + * emit a plain text file + */ +static void do_emit_plain(request_rec *r, apr_file_t *f) +{ + char buf[AP_IOBUFSIZE + 1]; + int ch; + apr_size_t i, c, n; + apr_status_t rv; + + ap_rputs("
\n", r);
+        while (!apr_file_eof(f)) {
+                do {
+                        n = sizeof(char) * AP_IOBUFSIZE;
+                        rv = apr_file_read(f, buf, &n);
+                } while (APR_STATUS_IS_EINTR(rv));
+                if (n == 0 || rv != APR_SUCCESS) {
+                        /* ###: better error here? */
+                        break;
+                }
+                buf[n] = '\0';
+                c = 0;
+                while (c < n) {
+                        for (i = c; i < n; i++) {
+                                if (buf[i] == '<' || buf[i] == '>' || buf[i] == '&') {
+                                        break;
+                                }
+                        }
+                        ch = buf[i];
+                        buf[i] = '\0';
+                        ap_rputs(&buf[c], r);
+                        if (ch == '<') {
+                                ap_rputs("<", r);
+                        }
+                        else if (ch == '>') {
+                                ap_rputs(">", r);
+                        }
+                        else if (ch == '&') {
+                                ap_rputs("&", r);
+                        }
+                        c = i + 1;
+                }
+        }
+        ap_rputs("
\n", r); +} + +/* + * Handle the preamble through the H1 tag line, inclusive. Locate + * the file with a subrequests. Process text/html documents by actually + * running the subrequest; text/xxx documents get copied verbatim, + * and any other content type is ignored. This means that a non-text + * document (such as HEADER.gif) might get multiviewed as the result + * instead of a text document, meaning nothing will be displayed, but + * oh well. + */ +static void emit_head(request_rec *r, char *header_fname, int suppress_amble, + int emit_xhtml, char *title) +{ + apr_table_t *hdrs = r->headers_in; + apr_file_t *f = NULL; + request_rec *rr = NULL; + int emit_amble = 1; + int emit_H1 = 1; + const char *r_accept; + const char *r_accept_enc; + + /* + * If there's a header file, send a subrequest to look for it. If it's + * found and html do the subrequest, otherwise handle it + */ + r_accept = apr_table_get(hdrs, "Accept"); + r_accept_enc = apr_table_get(hdrs, "Accept-Encoding"); + apr_table_setn(hdrs, "Accept", "text/html, text/plain"); + apr_table_unset(hdrs, "Accept-Encoding"); + + + if ((header_fname != NULL) && r->args) { + header_fname = apr_pstrcat(r->pool, header_fname, "?", r->args, NULL); + } + + if ((header_fname != NULL) + && (rr = ap_sub_req_lookup_uri(header_fname, r, r->output_filters)) + && (rr->status == HTTP_OK) + && (rr->filename != NULL) + && (rr->finfo.filetype == APR_REG)) { + /* + * Check for the two specific cases we allow: text/html and + * text/anything-else. The former is allowed to be processed for + * SSIs. + */ + if (rr->content_type != NULL) { + if (!strcasecmp(ap_field_noparam(r->pool, rr->content_type), + "text/html")) { + ap_filter_t *f; + /* Hope everything will work... */ + emit_amble = 0; + emit_H1 = 0; + + if (! suppress_amble) { + emit_preamble(r, emit_xhtml, title); + } + /* This is a hack, but I can't find any better way to do this. 
+ * The problem is that we have already created the sub-request, + * but we just inserted the OLD_WRITE filter, and the + * sub-request needs to pass its data through the OLD_WRITE + * filter, or things go horribly wrong (missing data, data in + * the wrong order, etc). To fix it, if you create a + * sub-request and then insert the OLD_WRITE filter before you + * run the request, you need to make sure that the sub-request + * data goes through the OLD_WRITE filter. Just steal this + * code. The long-term solution is to remove the ap_r* + * functions. + */ + for (f=rr->output_filters; + f->frec != ap_subreq_core_filter_handle; f = f->next); + f->next = r->output_filters; + + /* + * If there's a problem running the subrequest, display the + * preamble if we didn't do it before -- the header file + * didn't get displayed. + */ + if (ap_run_sub_req(rr) != OK) { + /* It didn't work */ + emit_amble = suppress_amble; + emit_H1 = 1; + } + } + else if (!strncasecmp("text/", rr->content_type, 5)) { + /* + * If we can open the file, prefix it with the preamble + * regardless; since we'll be sending a
 block around
+                                 * the file's contents, any HTML header it had won't end up
+                                 * where it belongs.
+                                 */
+                                if (apr_file_open(&f, rr->filename, APR_READ,
+                                                  APR_OS_DEFAULT, r->pool) == APR_SUCCESS) {
+                                        emit_preamble(r, emit_xhtml, title);
+                                        emit_amble = 0;
+                                        do_emit_plain(r, f);
+                                        apr_file_close(f);
+                                        emit_H1 = 0;
+                                }
+                        }
+                }
+        }
+
+        if (r_accept) {
+                apr_table_setn(hdrs, "Accept", r_accept);
+        }
+        else {
+                apr_table_unset(hdrs, "Accept");
+        }
+
+        if (r_accept_enc) {
+                apr_table_setn(hdrs, "Accept-Encoding", r_accept_enc);
+        }
+
+        if (emit_amble) {
+                emit_preamble(r, emit_xhtml, title);
+        }
+        if (emit_H1) {
+                ap_rvputs(r, "

Index of ", title, "

\n", NULL); + } + if (rr != NULL) { + ap_destroy_sub_req(rr); + } +} + + +/* + * Handle the Readme file through the postamble, inclusive. Locate + * the file with a subrequests. Process text/html documents by actually + * running the subrequest; text/xxx documents get copied verbatim, + * and any other content type is ignored. This means that a non-text + * document (such as FOOTER.gif) might get multiviewed as the result + * instead of a text document, meaning nothing will be displayed, but + * oh well. + */ +static void emit_tail(request_rec *r, char *readme_fname, int suppress_amble) +{ + apr_file_t *f = NULL; + request_rec *rr = NULL; + int suppress_post = 0; + int suppress_sig = 0; + + /* + * If there's a readme file, send a subrequest to look for it. If it's + * found and a text file, handle it -- otherwise fall through and + * pretend there's nothing there. + */ + if ((readme_fname != NULL) + && (rr = ap_sub_req_lookup_uri(readme_fname, r, r->output_filters)) + && (rr->status == HTTP_OK) + && (rr->filename != NULL) + && rr->finfo.filetype == APR_REG) { + /* + * Check for the two specific cases we allow: text/html and + * text/anything-else. The former is allowed to be processed for + * SSIs. + */ + if (rr->content_type != NULL) { + if (!strcasecmp(ap_field_noparam(r->pool, rr->content_type), + "text/html")) { + ap_filter_t *f; + for (f=rr->output_filters; + f->frec != ap_subreq_core_filter_handle; f = f->next); + f->next = r->output_filters; + + + if (ap_run_sub_req(rr) == OK) { + /* worked... */ + suppress_sig = 1; + suppress_post = suppress_amble; + } + } + else if (!strncasecmp("text/", rr->content_type, 5)) { + /* + * If we can open the file, suppress the signature. 
+ */ + if (apr_file_open(&f, rr->filename, APR_READ, + APR_OS_DEFAULT, r->pool) == APR_SUCCESS) { + do_emit_plain(r, f); + apr_file_close(f); + suppress_sig = 1; + } + } + } + } + + if (!suppress_sig) { + ap_rputs(ap_psignature("", r), r); + } + if (!suppress_post) { + ap_rputs("\n", r); + } + if (rr != NULL) { + ap_destroy_sub_req(rr); + } +} + + +static char *find_title(request_rec *r) +{ + char titlebuf[MAX_STRING_LEN], *find = ""; + apr_file_t *thefile = NULL; + int x, y, p; + apr_size_t n; + + if (r->status != HTTP_OK) { + return NULL; + } + if ((r->content_type != NULL) + && (!strcasecmp(ap_field_noparam(r->pool, r->content_type), + "text/html") + || !strcmp(r->content_type, INCLUDES_MAGIC_TYPE)) + && !r->content_encoding) { + if (apr_file_open(&thefile, r->filename, APR_READ, + APR_OS_DEFAULT, r->pool) != APR_SUCCESS) { + return NULL; + } + n = sizeof(char) * (MAX_STRING_LEN - 1); + apr_file_read(thefile, titlebuf, &n); + if (n <= 0) { + apr_file_close(thefile); + return NULL; + } + titlebuf[n] = '\0'; + for (x = 0, p = 0; titlebuf[x]; x++) { + if (apr_tolower(titlebuf[x]) == find[p]) { + if (!find[++p]) { + if ((p = ap_ind(&titlebuf[++x], '<')) != -1) { + titlebuf[x + p] = '\0'; + } + /* Scan for line breaks for Tanmoy's secretary */ + for (y = x; titlebuf[y]; y++) { + if ((titlebuf[y] == CR) || (titlebuf[y] == LF)) { + if (y == x) { + x++; + } + else { + titlebuf[y] = ' '; + } + } + } + apr_file_close(thefile); + return apr_pstrdup(r->pool, &titlebuf[x]); + } + } + else { + p = 0; + } + } + apr_file_close(thefile); + } + return NULL; +} + +static struct ent *make_parent_entry(apr_int32_t autoindex_opts, + glusterfs_dir_config_t *d, + request_rec *r, char keyid, + char direction) +{ + struct ent *p = (struct ent *) apr_pcalloc(r->pool, sizeof(struct ent)); + char *testpath; + /* + * p->name is now the true parent URI. + * testpath is a crafted lie, so that the syntax '/some/..' 
+ * (or simply '..')be used to describe 'up' from '/some/' + * when processeing IndexIgnore, and Icon|Alt|Desc configs. + */ + + /* The output has always been to the parent. Don't make ourself + * our own parent (worthless cyclical reference). + */ + if (!(p->name = ap_make_full_path(r->pool, r->uri, "../"))) { + return (NULL); + } + ap_getparents(p->name); + if (!*p->name) { + return (NULL); + } + + /* IndexIgnore has always compared "/thispath/.." */ + testpath = ap_make_full_path(r->pool, r->filename, ".."); + if (ignore_entry(d, testpath)) { + return (NULL); + } + + p->size = -1; + p->lm = -1; + p->key = apr_toupper(keyid); + p->ascending = (apr_toupper(direction) == D_ASCENDING); + p->version_sort = autoindex_opts & VERSION_SORT; + if (autoindex_opts & FANCY_INDEXING) { + if (!(p->icon = find_default_icon(d, testpath))) { + p->icon = find_default_icon(d, "^^DIRECTORY^^"); + } + if (!(p->alt = find_default_alt(d, testpath))) { + if (!(p->alt = find_default_alt(d, "^^DIRECTORY^^"))) { + p->alt = "DIR"; + } + } + p->desc = find_desc(d, testpath); + } + return p; +} + +static struct ent *make_autoindex_entry(const apr_finfo_t *dirent, + int autoindex_opts, + glusterfs_dir_config_t *d, + request_rec *r, char keyid, + char direction, + const char *pattern) +{ + request_rec *rr; + struct ent *p; + int show_forbidden = 0; + + /* Dot is ignored, Parent is handled by make_parent_entry() */ + if ((dirent->name[0] == '.') && (!dirent->name[1] + || ((dirent->name[1] == '.') && !dirent->name[2]))) + return (NULL); + + /* + * On some platforms, the match must be case-blind. This is really + * a factor of the filesystem involved, but we can't detect that + * reliably - so we have to granularise at the OS level. 
+ */ + if (pattern && (apr_fnmatch(pattern, dirent->name, + APR_FNM_NOESCAPE | APR_FNM_PERIOD +#ifdef CASE_BLIND_FILESYSTEM + | APR_FNM_CASE_BLIND +#endif + ) + != APR_SUCCESS)) { + return (NULL); + } + + if (ignore_entry(d, ap_make_full_path(r->pool, + r->filename, dirent->name))) { + return (NULL); + } + + if (!(rr = ap_sub_req_lookup_dirent(dirent, r, AP_SUBREQ_NO_ARGS, NULL))) { + return (NULL); + } + + if((autoindex_opts & SHOW_FORBIDDEN) + && (rr->status == HTTP_UNAUTHORIZED || rr->status == HTTP_FORBIDDEN)) { + show_forbidden = 1; + } + + if ((rr->finfo.filetype != APR_DIR && rr->finfo.filetype != APR_REG) + || !(rr->status == OK || ap_is_HTTP_SUCCESS(rr->status) + || ap_is_HTTP_REDIRECT(rr->status) + || show_forbidden == 1)) { + ap_destroy_sub_req(rr); + return (NULL); + } + + p = (struct ent *) apr_pcalloc(r->pool, sizeof(struct ent)); + if (dirent->filetype == APR_DIR) { + p->name = apr_pstrcat(r->pool, dirent->name, "/", NULL); + } + else { + p->name = apr_pstrdup(r->pool, dirent->name); + } + p->size = -1; + p->icon = NULL; + p->alt = NULL; + p->desc = NULL; + p->lm = -1; + p->isdir = 0; + p->key = apr_toupper(keyid); + p->ascending = (apr_toupper(direction) == D_ASCENDING); + p->version_sort = !!(autoindex_opts & VERSION_SORT); + p->ignore_case = !!(autoindex_opts & IGNORE_CASE); + + if (autoindex_opts & (FANCY_INDEXING | TABLE_INDEXING)) { + p->lm = rr->finfo.mtime; + if (dirent->filetype == APR_DIR) { + if (autoindex_opts & FOLDERS_FIRST) { + p->isdir = 1; + } + rr->filename = ap_make_dirstr_parent (rr->pool, rr->filename); + + /* omit the trailing slash (1.3 compat) */ + rr->filename[strlen(rr->filename) - 1] = '\0'; + + if (!(p->icon = find_icon(d, rr, 1))) { + p->icon = find_default_icon(d, "^^DIRECTORY^^"); + } + if (!(p->alt = find_alt(d, rr, 1))) { + if (!(p->alt = find_default_alt(d, "^^DIRECTORY^^"))) { + p->alt = "DIR"; + } + } + } + else { + p->icon = find_icon(d, rr, 0); + p->alt = find_alt(d, rr, 0); + p->size = rr->finfo.size; + } + + 
p->desc = find_desc(d, rr->filename); + + if ((!p->desc) && (autoindex_opts & SCAN_HTML_TITLES)) { + p->desc = apr_pstrdup(r->pool, find_title(rr)); + } + } + ap_destroy_sub_req(rr); + /* + * We don't need to take any special action for the file size key. + * If we did, it would go here. + */ + if (keyid == K_LAST_MOD) { + if (p->lm < 0) { + p->lm = 0; + } + } + return (p); +} + +static char *terminate_description(glusterfs_dir_config_t *d, char *desc, + apr_int32_t autoindex_opts, int desc_width) +{ + int maxsize = desc_width; + register int x; + + /* + * If there's no DescriptionWidth in effect, default to the old + * behaviour of adjusting the description size depending upon + * what else is being displayed. Otherwise, stick with the + * setting. + */ + if (d->desc_adjust == K_UNSET) { + if (autoindex_opts & SUPPRESS_ICON) { + maxsize += 6; + } + if (autoindex_opts & SUPPRESS_LAST_MOD) { + maxsize += 19; + } + if (autoindex_opts & SUPPRESS_SIZE) { + maxsize += 7; + } + } + for (x = 0; desc[x] && ((maxsize > 0) || (desc[x] == '<')); x++) { + if (desc[x] == '<') { + while (desc[x] != '>') { + if (!desc[x]) { + maxsize = 0; + break; + } + ++x; + } + } + else if (desc[x] == '&') { + /* entities like ä count as one character */ + --maxsize; + for ( ; desc[x] != ';'; ++x) { + if (desc[x] == '\0') { + maxsize = 0; + break; + } + } + } + else { + --maxsize; + } + } + if (!maxsize && desc[x] != '\0') { + desc[x - 1] = '>'; /* Grump. */ + desc[x] = '\0'; /* Double Grump! */ + } + return desc; +} + +/* + * Emit the anchor for the specified field. If a field is the key for the + * current request, the link changes its meaning to reverse the order when + * selected again. Non-active fields always start in ascending order. 
+ */ +static void emit_link(request_rec *r, const char *anchor, char column, + char curkey, char curdirection, + const char *colargs, int nosort) +{ + if (!nosort) { + char qvalue[9]; + + qvalue[0] = '?'; + qvalue[1] = 'C'; + qvalue[2] = '='; + qvalue[3] = column; + qvalue[4] = ';'; + qvalue[5] = 'O'; + qvalue[6] = '='; + /* reverse? */ + qvalue[7] = ((curkey == column) && (curdirection == D_ASCENDING)) + ? D_DESCENDING : D_ASCENDING; + qvalue[8] = '\0'; + ap_rvputs(r, "<a href=\"", qvalue, colargs ? colargs : "", + "\">", anchor, "</a>", NULL); + } + else { + ap_rputs(anchor, r); + } +} + +static void output_directories(struct ent **ar, int n, + glusterfs_dir_config_t *d, request_rec *r, + apr_int32_t autoindex_opts, char keyid, + char direction, const char *colargs) +{ + int x; + apr_size_t rv; + char *name = r->uri; + char *tp; + int static_columns = !!(autoindex_opts & SUPPRESS_COLSORT); + apr_pool_t *scratch; + int name_width; + int desc_width; + char *name_scratch; + char *pad_scratch; + char *breakrow = ""; + + apr_pool_create(&scratch, r->pool); + if (name[0] == '\0') { + name = "/"; + } + + name_width = d->name_width; + desc_width = d->desc_width; + + if ((autoindex_opts & (FANCY_INDEXING | TABLE_INDEXING)) + == FANCY_INDEXING) { + if (d->name_adjust == K_ADJUST) { + for (x = 0; x < n; x++) { + int t = strlen(ar[x]->name); + if (t > name_width) { + name_width = t; + } + } + } + + if (d->desc_adjust == K_ADJUST) { + for (x = 0; x < n; x++) { + if (ar[x]->desc != NULL) { + int t = strlen(ar[x]->desc); + if (t > desc_width) { + desc_width = t; + } + } + } + } + } + name_scratch = apr_palloc(r->pool, name_width + 1); + pad_scratch = apr_palloc(r->pool, name_width + 1); + memset(pad_scratch, ' ', name_width); + pad_scratch[name_width] = '\0'; + + if (autoindex_opts & TABLE_INDEXING) { + int cols = 1; + ap_rputs("<table><tr>", r); + if (!(autoindex_opts & SUPPRESS_ICON)) { + ap_rputs("<th>", r); + if ((tp = find_default_icon(d, "^^BLANKICON^^"))) { + 
ap_rvputs(r, "<img src=\"", ap_escape_html(scratch, tp), + "\" alt=\"[ICO]\"", NULL); + if (d->icon_width) { + ap_rprintf(r, " width=\"%d\"", d->icon_width); + } + if (d->icon_height) { + ap_rprintf(r, " height=\"%d\"", d->icon_height); + } + + if (autoindex_opts & EMIT_XHTML) { + ap_rputs(" /", r); + } + ap_rputs("></th>", r); + } + else { + ap_rputs(" </th>", r); + } + + ++cols; + } + ap_rputs("<th>", r); + emit_link(r, "Name", K_NAME, keyid, direction, + colargs, static_columns); + if (!(autoindex_opts & SUPPRESS_LAST_MOD)) { + ap_rputs("</th><th>", r); + emit_link(r, "Last modified", K_LAST_MOD, keyid, direction, + colargs, static_columns); + ++cols; + } + if (!(autoindex_opts & SUPPRESS_SIZE)) { + ap_rputs("</th><th>", r); + emit_link(r, "Size", K_SIZE, keyid, direction, + colargs, static_columns); + ++cols; + } + if (!(autoindex_opts & SUPPRESS_DESC)) { + ap_rputs("</th><th>", r); + emit_link(r, "Description", K_DESC, keyid, direction, + colargs, static_columns); + ++cols; + } + if (!(autoindex_opts & SUPPRESS_RULES)) { + breakrow = apr_psprintf(r->pool, + "<tr><th colspan=\"%d\">" + "<hr%s></th></tr>\n", cols, + (autoindex_opts & EMIT_XHTML) ? " /" : ""); + } + ap_rvputs(r, "</th></tr>", breakrow, NULL); + } + else if (autoindex_opts & FANCY_INDEXING) { + ap_rputs("<pre>", r); + if (!(autoindex_opts & SUPPRESS_ICON)) { + if ((tp = find_default_icon(d, "^^BLANKICON^^"))) { + ap_rvputs(r, "<img src=\"", ap_escape_html(scratch, tp), + "\" alt=\"Icon \"", NULL); + if (d->icon_width) { + ap_rprintf(r, " width=\"%d\"", d->icon_width); + } + if (d->icon_height) { + ap_rprintf(r, " height=\"%d\"", d->icon_height); + } + + if (autoindex_opts & EMIT_XHTML) { + ap_rputs(" /", r); + } + ap_rputs("> ", r); + } + else { + ap_rputs(" ", r); + } + } + emit_link(r, "Name", K_NAME, keyid, direction, + colargs, static_columns); + ap_rputs(pad_scratch + 4, r); + /* + * Emit the guaranteed-at-least-one-space-between-columns byte. 
+ */ + ap_rputs(" ", r); + if (!(autoindex_opts & SUPPRESS_LAST_MOD)) { + emit_link(r, "Last modified", K_LAST_MOD, keyid, direction, + colargs, static_columns); + ap_rputs(" ", r); + } + if (!(autoindex_opts & SUPPRESS_SIZE)) { + emit_link(r, "Size", K_SIZE, keyid, direction, + colargs, static_columns); + ap_rputs(" ", r); + } + if (!(autoindex_opts & SUPPRESS_DESC)) { + emit_link(r, "Description", K_DESC, keyid, direction, + colargs, static_columns); + } + if (!(autoindex_opts & SUPPRESS_RULES)) { + ap_rputs("<hr", r); + if (autoindex_opts & EMIT_XHTML) { + ap_rputs(" /", r); + } + ap_rputs(">", r); + } + else { + ap_rputc('\n', r); + } + } + else { + ap_rputs("<ul>", r); + } + + for (x = 0; x < n; x++) { + char *anchor, *t, *t2; + int nwidth; + + apr_pool_clear(scratch); + + t = ar[x]->name; + anchor = ap_escape_html(scratch, ap_os_escape_path(scratch, t, 0)); + + if (!x && t[0] == '/') { + t2 = "Parent Directory"; + } + else { + t2 = t; + } + + if (autoindex_opts & TABLE_INDEXING) { + ap_rputs("<tr>", r); + if (!(autoindex_opts & SUPPRESS_ICON)) { + ap_rputs("<td valign=\"top\">", r); + if (autoindex_opts & ICONS_ARE_LINKS) { + ap_rvputs(r, "<a href=\"", anchor, "\">", NULL); + } + if ((ar[x]->icon) || d->default_icon) { + ap_rvputs(r, "<img src=\"", + ap_escape_html(scratch, + ar[x]->icon ? ar[x]->icon + : d->default_icon), + "\" alt=\"[", (ar[x]->alt ? 
ar[x]->alt : " "), + "]\"", NULL); + if (d->icon_width) { + ap_rprintf(r, " width=\"%d\"", d->icon_width); + } + if (d->icon_height) { + ap_rprintf(r, " height=\"%d\"", d->icon_height); + } + + if (autoindex_opts & EMIT_XHTML) { + ap_rputs(" /", r); + } + ap_rputs(">", r); + } + else { + ap_rputs(" ", r); + } + if (autoindex_opts & ICONS_ARE_LINKS) { + ap_rputs("</a></td>", r); + } + else { + ap_rputs("</td>", r); + } + } + if (d->name_adjust == K_ADJUST) { + ap_rvputs(r, "<td><a href=\"", anchor, "\">", + ap_escape_html(scratch, t2), "</a>", NULL); + } + else { + nwidth = strlen(t2); + if (nwidth > name_width) { + memcpy(name_scratch, t2, name_width - 3); + name_scratch[name_width - 3] = '.'; + name_scratch[name_width - 2] = '.'; + name_scratch[name_width - 1] = '>'; + name_scratch[name_width] = 0; + t2 = name_scratch; + nwidth = name_width; + } + ap_rvputs(r, "<td><a href=\"", anchor, "\">", + ap_escape_html(scratch, t2), + "</a>", pad_scratch + nwidth, NULL); + } + if (!(autoindex_opts & SUPPRESS_LAST_MOD)) { + if (ar[x]->lm != -1) { + char time_str[MAX_STRING_LEN]; + apr_time_exp_t ts; + apr_time_exp_lt(&ts, ar[x]->lm); + apr_strftime(time_str, &rv, MAX_STRING_LEN, + "</td><td align=\"right\">%d-%b-%Y %H:%M ", + &ts); + ap_rputs(time_str, r); + } + else { + ap_rputs("</td><td> ", r); + } + } + if (!(autoindex_opts & SUPPRESS_SIZE)) { + char buf[5]; + ap_rvputs(r, "</td><td align=\"right\">", + apr_strfsize(ar[x]->size, buf), NULL); + } + if (!(autoindex_opts & SUPPRESS_DESC)) { + if (ar[x]->desc) { + if (d->desc_adjust == K_ADJUST) { + ap_rvputs(r, "</td><td>", ar[x]->desc, NULL); + } + else { + ap_rvputs(r, "</td><td>", + terminate_description(d, ar[x]->desc, + autoindex_opts, + desc_width), NULL); + } + } + } + else { + ap_rputs("</td><td> ", r); + } + ap_rputs("</td></tr>\n", r); + } + else if (autoindex_opts & FANCY_INDEXING) { + if (!(autoindex_opts & SUPPRESS_ICON)) { + if (autoindex_opts & ICONS_ARE_LINKS) { + ap_rvputs(r, "<a href=\"", anchor, "\">", 
NULL); + } + if ((ar[x]->icon) || d->default_icon) { + ap_rvputs(r, "<img src=\"", + ap_escape_html(scratch, + ar[x]->icon ? ar[x]->icon + : d->default_icon), + "\" alt=\"[", (ar[x]->alt ? ar[x]->alt : " "), + "]\"", NULL); + if (d->icon_width) { + ap_rprintf(r, " width=\"%d\"", d->icon_width); + } + if (d->icon_height) { + ap_rprintf(r, " height=\"%d\"", d->icon_height); + } + + if (autoindex_opts & EMIT_XHTML) { + ap_rputs(" /", r); + } + ap_rputs(">", r); + } + else { + ap_rputs(" ", r); + } + if (autoindex_opts & ICONS_ARE_LINKS) { + ap_rputs("</a> ", r); + } + else { + ap_rputc(' ', r); + } + } + nwidth = strlen(t2); + if (nwidth > name_width) { + memcpy(name_scratch, t2, name_width - 3); + name_scratch[name_width - 3] = '.'; + name_scratch[name_width - 2] = '.'; + name_scratch[name_width - 1] = '>'; + name_scratch[name_width] = 0; + t2 = name_scratch; + nwidth = name_width; + } + ap_rvputs(r, "<a href=\"", anchor, "\">", + ap_escape_html(scratch, t2), + "</a>", pad_scratch + nwidth, NULL); + /* + * The blank before the storm.. er, before the next field. 
+ */ + ap_rputs(" ", r); + if (!(autoindex_opts & SUPPRESS_LAST_MOD)) { + if (ar[x]->lm != -1) { + char time_str[MAX_STRING_LEN]; + apr_time_exp_t ts; + apr_time_exp_lt(&ts, ar[x]->lm); + apr_strftime(time_str, &rv, MAX_STRING_LEN, + "%d-%b-%Y %H:%M ", &ts); + ap_rputs(time_str, r); + } + else { + /*Length="22-Feb-1998 23:42 " (see 4 lines above) */ + ap_rputs(" ", r); + } + } + if (!(autoindex_opts & SUPPRESS_SIZE)) { + char buf[5]; + ap_rputs(apr_strfsize(ar[x]->size, buf), r); + ap_rputs(" ", r); + } + if (!(autoindex_opts & SUPPRESS_DESC)) { + if (ar[x]->desc) { + ap_rputs(terminate_description(d, ar[x]->desc, + autoindex_opts, + desc_width), r); + } + } + ap_rputc('\n', r); + } + else { + ap_rvputs(r, "<li><a href=\"", anchor, "\"> ", + ap_escape_html(scratch, t2), + "</a></li>\n", NULL); + } + } + if (autoindex_opts & TABLE_INDEXING) { + ap_rvputs(r, breakrow, "</table>\n", NULL); + } + else if (autoindex_opts & FANCY_INDEXING) { + if (!(autoindex_opts & SUPPRESS_RULES)) { + ap_rputs("<hr", r); + if (autoindex_opts & EMIT_XHTML) { + ap_rputs(" /", r); + } + ap_rputs("></pre>\n", r); + } + else { + ap_rputs("</pre>\n", r); + } + } + else { + ap_rputs("</ul>\n", r); + } +} + +/* + * Compare two file entries according to the sort criteria. The return + * is essentially a signum function value. + */ + +static int dsortf(struct ent **e1, struct ent **e2) +{ + struct ent *c1; + struct ent *c2; + int result = 0; + + /* + * First, see if either of the entries is for the parent directory. + * If so, that *always* sorts lower than anything else. + */ + if ((*e1)->name[0] == '/') { + return -1; + } + if ((*e2)->name[0] == '/') { + return 1; + } + /* + * Now see if one's a directory and one isn't, if we're set + * isdir for FOLDERS_FIRST. + */ + if ((*e1)->isdir != (*e2)->isdir) { + return (*e1)->isdir ? -1 : 1; + } + /* + * All of our comparisons will be of the c1 entry against the c2 one, + * so assign them appropriately to take care of the ordering. 
+ */ + if ((*e1)->ascending) { + c1 = *e1; + c2 = *e2; + } + else { + c1 = *e2; + c2 = *e1; + } + + switch (c1->key) { + case K_LAST_MOD: + if (c1->lm > c2->lm) { + return 1; + } + else if (c1->lm < c2->lm) { + return -1; + } + break; + case K_SIZE: + if (c1->size > c2->size) { + return 1; + } + else if (c1->size < c2->size) { + return -1; + } + break; + case K_DESC: + if (c1->version_sort) { + result = apr_strnatcmp(c1->desc ? c1->desc : "", + c2->desc ? c2->desc : ""); + } + else { + result = strcmp(c1->desc ? c1->desc : "", + c2->desc ? c2->desc : ""); + } + if (result) { + return result; + } + break; + } + + /* names may identical when treated case-insensitively, + * so always fall back on strcmp() flavors to put entries + * in deterministic order. This means that 'ABC' and 'abc' + * will always appear in the same order, rather than + * variably between 'ABC abc' and 'abc ABC' order. + */ + + if (c1->version_sort) { + if (c1->ignore_case) { + result = apr_strnatcasecmp (c1->name, c2->name); + } + if (!result) { + result = apr_strnatcmp(c1->name, c2->name); + } + } + + /* The names may be identical in respects other other than + * filename case when strnatcmp is used above, so fall back + * to strcmp on conflicts so that fn1.01.zzz and fn1.1.zzz + * are also sorted in a deterministic order. 
+ */ + + if (!result && c1->ignore_case) { + result = strcasecmp (c1->name, c2->name); + } + + if (!result) { + result = strcmp (c1->name, c2->name); + } + + return result; +} + + +static int +mod_glfs_index_directory (request_rec *r, + glusterfs_dir_config_t *autoindex_conf) +{ + char *title_name = ap_escape_html(r->pool, r->uri); + char *title_endp; + char *name = r->filename; + char *pstring = NULL; + apr_finfo_t dirent; + long fd = -1; + apr_status_t status; + int num_ent = 0, x; + struct ent *head, *p; + struct ent **ar = NULL; + const char *qstring; + apr_int32_t autoindex_opts = autoindex_conf->opts; + char keyid; + char direction; + char *colargs; + char *fullpath; + apr_size_t dirpathlen; + char *ctype = "text/html"; + char *charset; + glusterfs_dir_config_t *dir_config = NULL; + int ret = -1; + struct dirent entry = {0, }; + struct stat st = {0, }; + char *path = NULL; + char *fname = NULL; + + dir_config = mod_glfs_dconfig (r); + if (!dir_config || !dir_config->handle) { + return HTTP_INTERNAL_SERVER_ERROR; + } + + path = r->uri + strlen (dir_config->mount_dir); + if (!dir_config->handle) { + ap_log_rerror(APLOG_MARK, APLOG_ERR, status, r, + "mod_glfs_index_directory: glusterfs handler is NULL, check glusterfs logfile %s for more details", + dir_config->logfile); + return HTTP_INTERNAL_SERVER_ERROR; + } + + fd = glusterfs_open (dir_config->handle, path, O_RDONLY, 0); + if (fd == 0) { + ap_log_rerror(APLOG_MARK, APLOG_ERR, 0, r, + "file permissions deny server access: %s", r->filename); + return HTTP_FORBIDDEN; + } + + if (autoindex_conf->ctype) { + ctype = autoindex_conf->ctype; + } + if (autoindex_conf->charset) { + charset = autoindex_conf->charset; + } + else { +#if APR_HAS_UNICODE_FS + charset = "UTF-8"; +#else + charset = "ISO-8859-1"; +#endif + } + if (*charset) { + ap_set_content_type(r, apr_pstrcat(r->pool, ctype, ";charset=", + charset, NULL)); + } + else { + ap_set_content_type(r, ctype); + } + + if (autoindex_opts & TRACK_MODIFIED) { + 
ap_update_mtime(r, r->finfo.mtime); + ap_set_last_modified(r); + ap_set_etag(r); + } + if (r->header_only) { + glusterfs_close (fd); + return 0; + } + + /* + * If there is no specific ordering defined for this directory, + * default to ascending by filename. + */ + keyid = autoindex_conf->default_keyid + ? autoindex_conf->default_keyid : K_NAME; + direction = autoindex_conf->default_direction + ? autoindex_conf->default_direction : D_ASCENDING; + + /* + * Figure out what sort of indexing (if any) we're supposed to use. + * + * If no QUERY_STRING was specified or client query strings have been + * explicitly disabled. + * If we are ignoring the client, suppress column sorting as well. + */ + if (autoindex_opts & IGNORE_CLIENT) { + qstring = NULL; + autoindex_opts |= SUPPRESS_COLSORT; + colargs = ""; + } + else { + char fval[5], vval[5], *ppre = "", *epattern = ""; + fval[0] = '\0'; vval[0] = '\0'; + qstring = r->args; + + while (qstring && *qstring) { + + /* C= First Sort key Column (N, M, S, D) */ + if ( qstring[0] == 'C' && qstring[1] == '=' + && qstring[2] && strchr(K_VALID, qstring[2]) + && ( qstring[3] == '&' || qstring[3] == ';' + || !qstring[3])) { + keyid = qstring[2]; + qstring += qstring[3] ? 4 : 3; + } + + /* O= Sort order (A, D) */ + else if ( qstring[0] == 'O' && qstring[1] == '=' + && ( (qstring[2] == D_ASCENDING) + || (qstring[2] == D_DESCENDING)) + && ( qstring[3] == '&' || qstring[3] == ';' + || !qstring[3])) { + direction = qstring[2]; + qstring += qstring[3] ? 
4 : 3; + } + + /* F= Output Format (0 plain, 1 fancy (pre), 2 table) */ + else if ( qstring[0] == 'F' && qstring[1] == '=' + && qstring[2] && strchr("012", qstring[2]) + && ( qstring[3] == '&' || qstring[3] == ';' + || !qstring[3])) { + if (qstring[2] == '0') { + autoindex_opts &= ~(FANCY_INDEXING | TABLE_INDEXING); + } + else if (qstring[2] == '1') { + autoindex_opts = (autoindex_opts | FANCY_INDEXING) + & ~TABLE_INDEXING; + } + else if (qstring[2] == '2') { + autoindex_opts |= FANCY_INDEXING | TABLE_INDEXING; + } + strcpy(fval, ";F= "); + fval[3] = qstring[2]; + qstring += qstring[3] ? 4 : 3; + } + + /* V= Version sort (0, 1) */ + else if ( qstring[0] == 'V' && qstring[1] == '=' + && (qstring[2] == '0' || qstring[2] == '1') + && ( qstring[3] == '&' || qstring[3] == ';' + || !qstring[3])) { + if (qstring[2] == '0') { + autoindex_opts &= ~VERSION_SORT; + } + else if (qstring[2] == '1') { + autoindex_opts |= VERSION_SORT; + } + strcpy(vval, ";V= "); + vval[3] = qstring[2]; + qstring += qstring[3] ? 4 : 3; + } + + /* P= wildcard pattern (*.foo) */ + else if (qstring[0] == 'P' && qstring[1] == '=') { + const char *eos = qstring += 2; /* for efficiency */ + + while (*eos && *eos != '&' && *eos != ';') { + ++eos; + } + + if (eos == qstring) { + pstring = NULL; + } + else { + pstring = apr_pstrndup(r->pool, qstring, eos - qstring); + if (ap_unescape_url(pstring) != OK) { + /* ignore the pattern, if it's bad. */ + pstring = NULL; + } + else { + ppre = ";P="; + /* be correct */ + epattern = ap_escape_uri(r->pool, pstring); + } + } + + if (*eos && *++eos) { + qstring = eos; + } + else { + qstring = NULL; + } + } + + /* Syntax error? Ignore the remainder! 
*/ + else { + qstring = NULL; + } + } + colargs = apr_pstrcat(r->pool, fval, vval, ppre, epattern, NULL); + } + + /* Spew HTML preamble */ + title_endp = title_name + strlen(title_name) - 1; + + while (title_endp > title_name && *title_endp == '/') { + *title_endp-- = '\0'; + } + + emit_head(r, find_header(autoindex_conf, r), + autoindex_opts & SUPPRESS_PREAMBLE, + autoindex_opts & EMIT_XHTML, title_name); + + /* + * Since we don't know how many dir. entries there are, put them into a + * linked list and then arrayificate them so qsort can use them. + */ + head = NULL; + p = make_parent_entry(autoindex_opts, autoindex_conf, r, keyid, direction); + if (p != NULL) { + p->next = head; + head = p; + num_ent++; + } + fullpath = apr_palloc(r->pool, APR_PATH_MAX); + dirpathlen = strlen(name); + memcpy(fullpath, name, dirpathlen); + + do { + ret = glusterfs_readdir (fd, &entry, sizeof (entry)); + if (ret <= 0) { + break; + } + + fname = apr_pstrcat (r->pool, path, entry.d_name); + + ret = glusterfs_stat (dir_config->handle, fname, &st); + if (ret != 0) { + break; + } + + dirent.fname = fname; + dirent.name = apr_pstrdup (r->pool, entry.d_name); + fill_out_finfo (&dirent, &st, + APR_FINFO_MIN | APR_FINFO_IDENT | APR_FINFO_NLINK | APR_FINFO_OWNER | + APR_FINFO_PROT); + + p = make_autoindex_entry(&dirent, autoindex_opts, autoindex_conf, r, + keyid, direction, pstring); + if (p != NULL) { + p->next = head; + head = p; + num_ent++; + } + } while (1); + + if (num_ent > 0) { + ar = (struct ent **) apr_palloc(r->pool, + num_ent * sizeof(struct ent *)); + p = head; + x = 0; + while (p) { + ar[x++] = p; + p = p->next; + } + + qsort((void *) ar, num_ent, sizeof(struct ent *), + (int (*)(const void *, const void *)) dsortf); + } + output_directories(ar, num_ent, autoindex_conf, r, autoindex_opts, + keyid, direction, colargs); + glusterfs_close (fd); + + emit_tail(r, find_readme(autoindex_conf, r), + autoindex_opts & SUPPRESS_PREAMBLE); + + return 0; +} + + +static int 
+handle_autoindex(request_rec *r) +{ + glusterfs_dir_config_t *dir_config = NULL; + int allow_opts; + + allow_opts = ap_allow_options(r); + + r->allowed |= (AP_METHOD_BIT << M_GET); + if (r->method_number != M_GET) { + return DECLINED; + } + + dir_config = mod_glfs_dconfig (r); + + /* OK, nothing easy. Trot out the heavy artillery... */ + + if (allow_opts & OPT_INDEXES) { + int errstatus; + + if ((errstatus = ap_discard_request_body(r)) != OK) { + return errstatus; + } + + /* KLUDGE --- make the sub_req lookups happen in the right directory. + * Fixing this in the sub_req_lookup functions themselves is difficult, + * and would probably break virtual includes... + */ + + if (r->filename[strlen(r->filename) - 1] != '/') { + r->filename = apr_pstrcat(r->pool, r->filename, "/", NULL); + } + return mod_glfs_index_directory(r, dir_config); + } else { + ap_log_rerror(APLOG_MARK, APLOG_ERR, 0, r, + "Directory index forbidden by " + "Options directive: %s", r->filename); + return HTTP_FORBIDDEN; + } +} + + +static int +mod_glfs_handler (request_rec *r) +{ + conn_rec *c = r->connection; + apr_bucket_brigade *bb; + apr_bucket *e; + core_dir_config *d; + int errstatus; + long fd = -1; + apr_status_t status; + glusterfs_dir_config_t *dir_config = NULL; + char *path = NULL; + int num_ranges = 0; + apr_size_t size = 0; + apr_off_t range_start = 0, range_end = 0; + char *current = NULL; + apr_status_t rv = 0; + + /* XXX if/when somebody writes a content-md5 filter we either need to + * remove this support or coordinate when to use the filter vs. + * when to use this code + * The current choice of when to compute the md5 here matches the 1.3 + * support fairly closely (unlike 1.3, we don't handle computing md5 + * when the charset is translated). 
+ */ + + int bld_content_md5; + if (!r->handler || (r->handler && strcmp (r->handler, GLUSTERFS_HANDLER))) + return DECLINED; + + if (r->uri[0] == '\0') { + return DECLINED; + } + + if (r->finfo.filetype == APR_DIR) { + return handle_autoindex (r); + } + + dir_config = mod_glfs_dconfig (r); + + ap_allow_standard_methods(r, MERGE_ALLOW, M_GET, -1); + + /* We understood the (non-GET) method, but it might not be legal for + this particular resource. Check to see if the 'deliver_script' + flag is set. If so, then we go ahead and deliver the file since + it isn't really content (only GET normally returns content). + + Note: based on logic further above, the only possible non-GET + method at this point is POST. In the future, we should enable + script delivery for all methods. */ + if (r->method_number != M_GET) { + core_request_config *req_cfg; + + req_cfg = ap_get_module_config(r->request_config, &core_module); + if (!req_cfg->deliver_script) { + /* The flag hasn't been set for this request. Punt. 
*/ + ap_log_rerror(APLOG_MARK, APLOG_ERR, 0, r, + "This resource does not accept the %s method.", + r->method); + return HTTP_METHOD_NOT_ALLOWED; + } + } + + if (!dir_config->handle) { + ap_log_rerror (APLOG_MARK, APLOG_ERR, 0, r, + "glusterfs initialization failed\n"); + return HTTP_FORBIDDEN; + } + + d = (core_dir_config *)ap_get_module_config(r->per_dir_config, + &core_module); + bld_content_md5 = (d->content_md5 & 1) + && r->output_filters->frec->ftype != AP_FTYPE_RESOURCE; + + if ((errstatus = ap_discard_request_body(r)) != OK) { + return errstatus; + } + + if (r->finfo.filetype == 0) { + ap_log_rerror(APLOG_MARK, APLOG_ERR, 0, r, + "File does not exist: %s", r->filename); + return HTTP_NOT_FOUND; + } + + if ((r->used_path_info != AP_REQ_ACCEPT_PATH_INFO) && + r->path_info && *r->path_info) + { + /* default to reject */ + ap_log_rerror(APLOG_MARK, APLOG_ERR, 0, r, + "File does not exist: %s", + apr_pstrcat(r->pool, r->filename, r->path_info, NULL)); + return HTTP_NOT_FOUND; + } + + ap_update_mtime (r, r->finfo.mtime); + ap_set_last_modified (r); + ap_set_etag (r); + apr_table_setn (r->headers_out, "Accept-Ranges", "bytes"); + + num_ranges = ap_set_byterange(r); + if (num_ranges == 0) { + size = r->finfo.size; + } else { + char *tmp = apr_pstrdup (r->pool, r->range); + while ((current = ap_getword(r->pool, (const char **)&tmp, ',')) + && (rv = parse_byterange(current, r->finfo.size, &range_start, + &range_end))) { + size += (range_end - range_start); + } + } + + ap_set_content_length (r, size); + + if ((errstatus = ap_meets_conditions(r)) != OK) { + r->status = errstatus; + } + + /* file is small enough to have already got the content in glusterfs_lookup */ + if (r->finfo.size <= dir_config->xattr_file_size && dir_config->buf) { + if (bld_content_md5) { + apr_table_setn (r->headers_out, "Content-MD5", + (const char *)ap_md5_binary(r->pool, dir_config->buf, r->finfo.size)); + } + + ap_log_rerror (APLOG_MARK, APLOG_NOTICE, 0, r, + "fetching data from glusterfs 
through xattr interface\n"); + + bb = apr_brigade_create(r->pool, c->bucket_alloc); + + e = apr_bucket_heap_create (dir_config->buf, r->finfo.size, free, c->bucket_alloc); + APR_BRIGADE_INSERT_TAIL (bb, e); + + e = apr_bucket_eos_create(c->bucket_alloc); + APR_BRIGADE_INSERT_TAIL(bb, e); + + dir_config->buf = NULL; + + /* let the byterange_filter handle multipart requests */ + status = ap_pass_brigade(r->output_filters, bb); + if (status == APR_SUCCESS + || r->status != HTTP_OK + || c->aborted) { + return OK; + } + else { + /* no way to know what type of error occurred */ + ap_log_rerror(APLOG_MARK, APLOG_DEBUG, status, r, + "mod_glfs_handler: ap_pass_brigade returned %i", + status); + return HTTP_INTERNAL_SERVER_ERROR; + } + } + + if (!dir_config->handle) { + ap_log_rerror(APLOG_MARK, APLOG_ERR, status, r, + "mod_glfs_handler: glusterfs handler is NULL, check glusterfs logfile %s for more details", + dir_config->logfile); + return HTTP_INTERNAL_SERVER_ERROR; + } + + /* do standard open/read/close to fetch content */ + path = r->uri + strlen (dir_config->mount_dir); + + fd = glusterfs_open (dir_config->handle, path , O_RDONLY, 0); + if (fd == 0) { + ap_log_rerror(APLOG_MARK, APLOG_ERR, 0, r, + "file permissions deny server access: %s", r->filename); + return HTTP_FORBIDDEN; + } + + /* byterange_filter cannot handle range requests, since we are not sending the + whole data in a single brigade */ + + + if (num_ranges == 0) { + mod_glfs_read_async (r, NULL, fd, 0, -1); + } else { + mod_glfs_handle_byte_ranges (r, fd, num_ranges); + } + + glusterfs_close (fd); +} + + +#if 0 +static apr_status_t +mod_glfs_output_filter (ap_filter_t *f, + apr_bucket_brigade *b) +{ + size_t size = 0; + apr_bucket_t *e = NULL; + size = atol (apr_table_get (r->notes, MOD_GLFS_SIZE)); + + for (e = APR_BRIGADE_FIRST(b); + e != APR_BRIGADE_SENTINEL(b); + e = APR_BUCKET_NEXT(e)) + { + /* FIXME: can there be more than one heap buckets? 
*/ + if (e->type == &apr_bucket_type_heap) { + break; + } + } + + if (e != APR_BRIGADE_SENTINEL(b)) { + e->length = size; + } + + return ap_pass_brigade (f->next, b); +} +#endif + +static int +mod_glfs_fixup_dir(request_rec *r) +{ + glusterfs_dir_config_t *d; + char *dummy_ptr[1]; + char **names_ptr; + int num_names; + int error_notfound = 0; + + /* only handle requests against directories */ + if (r->finfo.filetype != APR_DIR) { + return DECLINED; + } + + if (!r->handler || strcmp (r->handler, GLUSTERFS_HANDLER)) { + return DECLINED; + } + + /* Never tolerate path_info on dir requests */ + if (r->path_info && *r->path_info) { + return DECLINED; + } + + d = (glusterfs_dir_config_t *)ap_get_module_config(r->per_dir_config, + &glusterfs_module); + + /* Redirect requests that are not '/' terminated */ + if (r->uri[0] == '\0' || r->uri[strlen(r->uri) - 1] != '/') + { + char *ifile; + + if (!d->do_slash) { + return DECLINED; + } + + /* Only redirect non-get requests if we have no note to warn + * that this browser cannot handle redirs on non-GET requests + * (such as Microsoft's WebFolders). + */ + if ((r->method_number != M_GET) + && apr_table_get(r->subprocess_env, "redirect-carefully")) { + return DECLINED; + } + + if (r->args != NULL) { + ifile = apr_pstrcat(r->pool, ap_escape_uri(r->pool, r->uri), + "/", "?", r->args, NULL); + } + else { + ifile = apr_pstrcat(r->pool, ap_escape_uri(r->pool, r->uri), + "/", NULL); + } + + apr_table_setn(r->headers_out, "Location", + ap_construct_url(r->pool, ifile, r)); + return HTTP_MOVED_PERMANENTLY; + } + + if (d->index_names) { + names_ptr = (char **)d->index_names->elts; + num_names = d->index_names->nelts; + } + else { + dummy_ptr[0] = AP_DEFAULT_INDEX; + names_ptr = dummy_ptr; + num_names = 1; + } + + for (; num_names; ++names_ptr, --num_names) { + /* XXX: Is this name_ptr considered escaped yet, or not??? 
*/ + char *name_ptr = *names_ptr; + request_rec *rr; + + /* Once upon a time args were handled _after_ the successful redirect. + * But that redirect might then _refuse_ the given r->args, creating + * a nasty tangle. It seems safer to consider the r->args while we + * determine if name_ptr is our viable index, and therefore set them + * up correctly on redirect. + */ + if (r->args != NULL) { + name_ptr = apr_pstrcat(r->pool, name_ptr, "?", r->args, NULL); + } + + rr = ap_sub_req_lookup_uri(name_ptr, r, NULL); + + /* The sub request lookup is very liberal, and the core map_to_storage + * handler will almost always result in HTTP_OK as /foo/index.html + * may be /foo with PATH_INFO="/index.html", or even / with + * PATH_INFO="/foo/index.html". To get around this we insist that the + * the index be a regular filetype. + * + * Another reason is that the core handler also makes the assumption + * that if r->finfo is still NULL by the time it gets called, the + * file does not exist. + */ + if (rr->status == HTTP_OK + && ( (rr->handler && !strcmp(rr->handler, "proxy-server")) + || rr->finfo.filetype == APR_REG)) { + ap_internal_fast_redirect(rr, r); + return OK; + } + + /* If the request returned a redirect, propagate it to the client */ + + if (ap_is_HTTP_REDIRECT(rr->status) + || (rr->status == HTTP_NOT_ACCEPTABLE && num_names == 1) + || (rr->status == HTTP_UNAUTHORIZED && num_names == 1)) { + + apr_pool_join(r->pool, rr->pool); + error_notfound = rr->status; + r->notes = apr_table_overlay(r->pool, r->notes, rr->notes); + r->headers_out = apr_table_overlay(r->pool, r->headers_out, + rr->headers_out); + r->err_headers_out = apr_table_overlay(r->pool, r->err_headers_out, + rr->err_headers_out); + return error_notfound; + } + + /* If the request returned something other than 404 (or 200), + * it means the module encountered some sort of problem. To be + * secure, we should return the error, rather than allow autoindex + * to create a (possibly unsafe) directory index. 
+ * + * So we store the error, and if none of the listed files + * exist, we return the last error response we got, instead + * of a directory listing. + */ + if (rr->status && rr->status != HTTP_NOT_FOUND + && rr->status != HTTP_OK) { + error_notfound = rr->status; + } + + ap_destroy_sub_req(rr); + } + + if (error_notfound) { + return error_notfound; + } + + /* nothing for us to do, pass on through */ + return DECLINED; +} + + +static void +mod_glfs_register_hooks(apr_pool_t *p) +{ + ap_hook_child_init (mod_glfs_child_init, NULL, NULL, APR_HOOK_MIDDLE); + ap_hook_handler (mod_glfs_handler, NULL, NULL, APR_HOOK_REALLY_FIRST); + ap_hook_map_to_storage (mod_glfs_map_to_storage, NULL, NULL, APR_HOOK_REALLY_FIRST); + ap_hook_fixups(mod_glfs_fixup_dir,NULL,NULL,APR_HOOK_LAST); + +/* mod_glfs_output_filter_handle = + ap_register_output_filter ("MODGLFS", mod_glfs_output_filter, + NULL, AP_FTYPE_PROTOCOL); */ +} + +static const char * +cmd_add_index (cmd_parms *cmd, void *dummy, const char *arg) +{ + glusterfs_dir_config_t *d = dummy; + + if (!d->index_names) { + d->index_names = apr_array_make(cmd->pool, 2, sizeof(char *)); + } + *(const char **)apr_array_push(d->index_names) = arg; + return NULL; +} + +static const char * +cmd_configure_slash (cmd_parms *cmd, void *d_, int arg) +{ + glusterfs_dir_config_t *d = d_; + + d->do_slash = arg ? 
SLASH_ON : SLASH_OFF; + return NULL; +} + +#define DIR_CMD_PERMS OR_INDEXES + +static const +command_rec mod_glfs_cmds[] = +{ + AP_INIT_TAKE1( + "GlusterfsLogfile", + cmd_add_logfile, + NULL, + ACCESS_CONF, /*FIXME: allow overriding in .htaccess files */ + "Glusterfs logfile" + ), + + AP_INIT_TAKE1( + "GlusterfsLoglevel", + cmd_set_loglevel, + NULL, + ACCESS_CONF, + "Glusterfs loglevel:anyone of none, critical, error, warning, debug" + ), + + AP_INIT_TAKE1( + "GlusterfsCacheTimeout", + cmd_set_cache_timeout, + NULL, + ACCESS_CONF, + "Timeout value in seconds for lookup and stat cache of libglusterfsclient" + ), + + AP_INIT_TAKE1( + "GlusterfsVolumeSpecfile", + cmd_add_volume_specfile, + NULL, + ACCESS_CONF, + "Glusterfs Volume specfication file specifying filesystem under this directory" + ), + + AP_INIT_TAKE1( + "GlusterfsXattrFileSize", + cmd_add_xattr_file_size, + NULL, + ACCESS_CONF, + "Maximum size of the file that can be fetched through extended attribute interface of libglusterfsclient" + ), + + /* mod_dir cmds */ + AP_INIT_ITERATE("DirectoryIndex", cmd_add_index, + NULL, DIR_CMD_PERMS, + "a list of file names"), + + AP_INIT_FLAG("DirectorySlash", cmd_configure_slash, + NULL, DIR_CMD_PERMS, + "On or Off"), + + /* autoindex cmds */ + AP_INIT_ITERATE2("AddIcon", cmd_add_icon, + BY_PATH, DIR_CMD_PERMS, + "an icon URL followed by one or more filenames"), + + AP_INIT_ITERATE2("AddIconByType", cmd_add_icon, + BY_TYPE, DIR_CMD_PERMS, + "an icon URL followed by one or more MIME types"), + + AP_INIT_ITERATE2("AddIconByEncoding", cmd_add_icon, + BY_ENCODING, DIR_CMD_PERMS, + "an icon URL followed by one or more content encodings"), + + AP_INIT_ITERATE2("AddAlt", cmd_add_alt, BY_PATH, + DIR_CMD_PERMS, + "alternate descriptive text followed by one or more " + "filenames"), + + AP_INIT_ITERATE2("AddAltByType", cmd_add_alt, + BY_TYPE, DIR_CMD_PERMS, + "alternate descriptive text followed by one or more MIME " + "types"), + + AP_INIT_ITERATE2("AddAltByEncoding", 
cmd_add_alt, + BY_ENCODING, DIR_CMD_PERMS, + "alternate descriptive text followed by one or more " + "content encodings"), + + AP_INIT_TAKE_ARGV("IndexOptions", cmd_add_opts, + NULL, DIR_CMD_PERMS, + "one or more index options [+|-][]"), + + AP_INIT_TAKE2("IndexOrderDefault", cmd_set_default_order, + NULL, DIR_CMD_PERMS, + "{Ascending,Descending} {Name,Size,Description,Date}"), + + AP_INIT_ITERATE("IndexIgnore", cmd_add_ignore, + NULL, DIR_CMD_PERMS, + "one or more file extensions"), + + AP_INIT_ITERATE2("AddDescription", cmd_add_desc, + BY_PATH, DIR_CMD_PERMS, + "Descriptive text followed by one or more filenames"), + + AP_INIT_TAKE1("HeaderName", cmd_add_header, + NULL, DIR_CMD_PERMS, + "a filename"), + + AP_INIT_TAKE1("ReadmeName", cmd_add_readme, + NULL, DIR_CMD_PERMS, + "a filename"), + + AP_INIT_RAW_ARGS("FancyIndexing", ap_set_deprecated, + NULL, OR_ALL, + "The FancyIndexing directive is no longer supported. " + "Use IndexOptions FancyIndexing."), + + AP_INIT_TAKE1("DefaultIcon", ap_set_string_slot, + (void *)APR_OFFSETOF(glusterfs_dir_config_t, default_icon), + DIR_CMD_PERMS, "an icon URL"), + + AP_INIT_TAKE1("IndexStyleSheet", ap_set_string_slot, + (void *)APR_OFFSETOF(glusterfs_dir_config_t, style_sheet), + DIR_CMD_PERMS, "URL to style sheet"), + + {NULL} +}; + +module AP_MODULE_DECLARE_DATA glusterfs_module = +{ + STANDARD20_MODULE_STUFF, + mod_glfs_create_dir_config, + mod_glfs_merge_dir_config, + NULL, //mod_glfs_create_server_config, + NULL, //mod_glfs_merge_server_config, + mod_glfs_cmds, + mod_glfs_register_hooks, +}; diff --git a/mod_glusterfs/apache/Makefile.am b/mod_glusterfs/apache/Makefile.am new file mode 100644 index 000000000..bda039310 --- /dev/null +++ b/mod_glusterfs/apache/Makefile.am @@ -0,0 +1,10 @@ +SUBDIRS = $(MOD_GLUSTERFS_HTTPD_VERSION) + +EXTRA_DIST = 1.3/Makefile.am 1.3/Makefile.in \ + 1.3/src/Makefile.am 1.3/src/Makefile.in \ + 1.3/src/mod_glusterfs.c \ + 1.3/src/README.txt \ + 2.2/Makefile.am 2.2/Makefile.in \ + 
2.2/src/Makefile.am 2.2/src/Makefile.in \ + 2.2/src/mod_glusterfs.c +CLEANFILES = diff --git a/mod_glusterfs/lighttpd/1.4/Makefile.am b/mod_glusterfs/lighttpd/1.4/Makefile.am new file mode 100644 index 000000000..eda329111 --- /dev/null +++ b/mod_glusterfs/lighttpd/1.4/Makefile.am @@ -0,0 +1,3 @@ +EXTRA_DIST = Makefile.am.diff mod_glusterfs.c mod_glusterfs.h README.txt + +CLEANFILES = diff --git a/mod_glusterfs/lighttpd/1.4/Makefile.am.diff b/mod_glusterfs/lighttpd/1.4/Makefile.am.diff new file mode 100644 index 000000000..375696b5d --- /dev/null +++ b/mod_glusterfs/lighttpd/1.4/Makefile.am.diff @@ -0,0 +1,29 @@ +--- lighttpd-1.4.19/src/Makefile.am 2008-04-16 18:42:18.000000000 +0400 ++++ lighttpd-1.4.19.mod/src/Makefile.am 2008-04-16 18:41:11.000000000 +0400 +@@ -1,4 +1,4 @@ +-AM_CFLAGS = $(FAM_CFLAGS) ++AM_CFLAGS = $(FAM_CFLAGS) -D_FILE_OFFSET_BITS=64 -D__USE_FILE_OFFSET64 + + noinst_PROGRAMS=proc_open lemon # simple-fcgi #graphic evalo bench ajp ssl error_test adserver gen-license + sbin_PROGRAMS=lighttpd lighttpd-angel +@@ -241,6 +241,11 @@ + mod_accesslog_la_LDFLAGS = -module -export-dynamic -avoid-version -no-undefined + mod_accesslog_la_LIBADD = $(common_libadd) + ++lib_LTLIBRARIES += mod_glusterfs.la ++mod_glusterfs_la_SOURCES = mod_glusterfs.c ++mod_glusterfs_la_CFLAGS = $(AM_CFLAGS) ++mod_glusterfs_la_LDFLAGS = -module -export-dynamic -avoid-version -no-undefined -lglusterfsclient -lpthread ++mod_glusterfs_la_LIBADD = $(common_libadd) + + hdr = server.h buffer.h network.h log.h keyvalue.h \ + response.h request.h fastcgi.h chunk.h \ +@@ -254,7 +259,7 @@ + configparser.h mod_ssi_exprparser.h \ + sys-mmap.h sys-socket.h mod_cml.h mod_cml_funcs.h \ + splaytree.h proc_open.h status_counter.h \ +- mod_magnet_cache.h ++ mod_magnet_cache.h mod_glusterfs.h + + DEFS= @DEFS@ -DLIBRARY_DIR="\"$(libdir)\"" -DSBIN_DIR="\"$(sbindir)\"" + diff --git a/mod_glusterfs/lighttpd/1.4/README.txt b/mod_glusterfs/lighttpd/1.4/README.txt new file mode 100644 index 
000000000..786a146e4 --- /dev/null +++ b/mod_glusterfs/lighttpd/1.4/README.txt @@ -0,0 +1,57 @@ +Introduction +============ +mod_glusterfs is a module written for lighttpd to speed up access to files present on glusterfs. mod_glusterfs uses the libglusterfsclient library provided with glusterfs and hence can be used without fuse (File System in User Space). + +Usage +===== +To use mod_glusterfs with lighttpd-1.4, copy mod_glusterfs.c and mod_glusterfs.h into src/ of the lighttpd-1.4 source tree, and apply Makefile.am.diff to src/Makefile.am. Re-run ./autogen.sh at the top level of the lighttpd-1.4 build tree and recompile. + +# cp mod_glusterfs.[ch] /home/glusterfs/lighttpd-1.4/src/ +# cp Makefile.am.diff /home/glusterfs/lighttpd-1.4/ +# cd /home/glusterfs/lighttpd-1.4 +# patch -p1 < Makefile.am.diff +# ./autogen.sh +# ./configure +# make +# make install + +Configuration +============= +* mod_glusterfs should be listed at the beginning of the list server.modules in lighttpd.conf. + +Below is a snippet from lighttpd.conf concerning mod_glusterfs. + +$HTTP["url"] =~ "^/glusterfs" { + glusterfs.prefix = "/glusterfs" + glusterfs.document-root = "/home/glusterfs/document-root" + glusterfs.logfile = "/var/log/glusterfs-logfile" + glusterfs.volume-specfile = "/etc/glusterfs/glusterfs.vol" + glusterfs.loglevel = "error" + glusterfs.cache-timeout = 300 + glusterfs.xattr-interface-size-limit = "65536" +} + +* $HTTP["url"] =~ "^/glusterfs" + A Perl-style regular expression used to match against the url. If the regular expression matches the url, the url is handled by mod_glusterfs. Note that the pattern given here should match glusterfs.prefix. + +* glusterfs.prefix (COMPULSORY) + A string that must be present at the start of the file path in the url so that the file is handled by glusterfs.
+ E.g., a GET request on the url http://www.example.com/glusterfs-prefix/some-dir/example-file will result in fetching the file "/some-dir/example-file" from the glusterfs mount if glusterfs.prefix is set to "/glusterfs-prefix". + +* glusterfs.volume-specfile (COMPULSORY) + Path to the glusterfs volume specification file. + +* glusterfs.logfile (COMPULSORY) + Path to the glusterfs logfile. + +* glusterfs.loglevel (OPTIONAL, default = warning) + Allowed values are critical, error, warning, debug, none, in decreasing order of severity of error conditions. + +* glusterfs.cache-timeout (OPTIONAL, default = 0) + Timeout value for the glusterfs stat and lookup caches. + +* glusterfs.document-root (COMPULSORY) + An absolute path, relative to which all the files are fetched from glusterfs. + +* glusterfs.xattr-interface-size-limit (OPTIONAL, default = 0) + Files with sizes up to and including this value are fetched through the extended attribute interface of glusterfs rather than the usual open-read-close set of operations. For small files, it is recommended to use the extended attribute interface. diff --git a/mod_glusterfs/lighttpd/1.4/mod_glusterfs.c b/mod_glusterfs/lighttpd/1.4/mod_glusterfs.c new file mode 100644 index 000000000..9de33b7c4 --- /dev/null +++ b/mod_glusterfs/lighttpd/1.4/mod_glusterfs.c @@ -0,0 +1,1709 @@ +/* + Copyright (c) 2008 Z RESEARCH, Inc. <http://www.zresearch.com> + This file is part of GlusterFS. + + GlusterFS is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published + by the Free Software Foundation; either version 3 of the License, + or (at your option) any later version. + + GlusterFS is distributed in the hope that it will be useful, but + WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + General Public License for more details.
+ + You should have received a copy of the GNU General Public License + along with this program. If not, see + <http://www.gnu.org/licenses/>. +*/ + +#include <ctype.h> +#include <stdlib.h> +#include <stdio.h> +#include <string.h> +#include <pthread.h> +#include <sys/types.h> +#include <fcntl.h> + +#include <sys/types.h> +#include <sys/stat.h> + +#include <errno.h> +#include <unistd.h> +#include <assert.h> + +#include "base.h" +#include "log.h" +#include "buffer.h" + +#include "plugin.h" + +#include "stat_cache.h" +#include "mod_glusterfs.h" +#include "etag.h" +#include "http_chunk.h" +#include "response.h" + +#include "fdevent.h" +#include <libglusterfsclient.h> + +#ifdef HAVE_ATTR_ATTRIBUTES_H +#include <attr/attributes.h> +#endif + +#ifdef HAVE_FAM_H +# include <fam.h> +#endif + +#include "sys-mmap.h" + +/* NetBSD 1.3.x needs it */ +#ifndef MAP_FAILED +# define MAP_FAILED -1 +#endif + +#ifndef O_LARGEFILE +# define O_LARGEFILE 0 +#endif + +#ifndef HAVE_LSTAT +#define lstat stat +#endif + +#if 0 +/* enables debug code for testing if all nodes in the stat-cache as accessable */ +#define DEBUG_STAT_CACHE +#endif + +#ifdef HAVE_LSTAT +#undef HAVE_LSTAT +#endif + +#define GLUSTERFS_FILE_CHUNK (FILE_CHUNK + 1) + +/* Keep this value large. 
Each glusterfs_async_read of GLUSTERFS_CHUNK_SIZE results in a network_backend_write of the read data*/ + +#define GLUSTERFS_CHUNK_SIZE 8192 + +/** + * this is a staticfile for a lighttpd plugin + * + */ + +typedef struct glusterfs_async_local { + int op_ret; + int op_errno; + char async_read_complete; + off_t length; + size_t read_bytes; + glusterfs_read_buf_t *buf; + pthread_mutex_t lock; + pthread_cond_t cond; +} glusterfs_async_local_t; + + +typedef struct { + unsigned long fd; + void *buf; + buffer *glusterfs_path; + /* off_t response_content_length; */ + int prefix; +}mod_glusterfs_ctx_t; + +/* plugin config for all request/connections */ +typedef struct { + buffer *logfile; + buffer *loglevel; + buffer *specfile; + buffer *prefix; + buffer *xattr_file_size; + buffer *document_root; + array *exclude_exts; + unsigned short cache_timeout; + + /* FIXME: its a pointer, hence cant be short */ + unsigned long handle; +} plugin_config; + +static int (*network_backend_write)(struct server *srv, connection *con, int fd, chunkqueue *cq); + +typedef struct { + PLUGIN_DATA; + buffer *range_buf; + plugin_config **config_storage; + + plugin_config conf; +} plugin_data; + +typedef struct { + chunkqueue *cq; + glusterfs_read_buf_t *buf; + size_t length; +}mod_glusterfs_chunkqueue; + +#ifdef HAVE_FAM_H +typedef struct { + FAMRequest *req; + FAMConnection *fc; + + buffer *name; + + int version; +} fam_dir_entry; +#endif + +/* the directory name is too long to always compare on it + * - we need a hash + * - the hash-key is used as sorting criteria for a tree + * - a splay-tree is used as we can use the caching effect of it + */ + +/* we want to cleanup the stat-cache every few seconds, let's say 10 + * + * - remove entries which are outdated since 30s + * - remove entries which are fresh but havn't been used since 60s + * - if we don't have a stat-cache entry for a directory, release it from the monitor + */ + +#ifdef DEBUG_STAT_CACHE +typedef struct { + int *ptr; + + size_t 
used; + size_t size; +} fake_keys; + +static fake_keys ctrl; +#endif + +int +mod_glusterfs_readv_async_cbk (glusterfs_read_buf_t *buf, + void *cbk_data) +{ + glusterfs_async_local_t *local = cbk_data; + pthread_mutex_lock (&local->lock); + { + local->async_read_complete = 1; + local->buf = buf; + + pthread_cond_signal (&local->cond); + } + pthread_mutex_unlock (&local->lock); + + return 0; +} + +static int +mod_glusterfs_read_async (server *srv, connection *con, chunk *glusterfs_chunk) +{ + glusterfs_async_local_t local; + off_t end = 0; + int nbytes; + int complete; + chunkqueue *cq = NULL; + chunk *c = NULL; + off_t offset = glusterfs_chunk->file.start; + size_t length = glusterfs_chunk->file.length; + long fd = (long)glusterfs_chunk->file.name; + + pthread_cond_init (&local.cond, NULL); + pthread_mutex_init (&local.lock, NULL); + + //local.fd = fd; + memset (&local, 0, sizeof (local)); + + if (length > 0) + end = offset + length; + + cq = chunkqueue_init (); + if (!cq) { + con->http_status = 500; + return HANDLER_FINISHED; + } + + do { + glusterfs_read_buf_t *buf; + int i; + if (length > 0) { + nbytes = end - offset; + if (nbytes > GLUSTERFS_CHUNK_SIZE) + nbytes = GLUSTERFS_CHUNK_SIZE; + } else + nbytes = GLUSTERFS_CHUNK_SIZE; + + glusterfs_read_async(fd, + nbytes, + offset, + mod_glusterfs_readv_async_cbk, + (void *)&local); + + pthread_mutex_lock (&local.lock); + { + while (!local.async_read_complete) { + pthread_cond_wait (&local.cond, &local.lock); + } + + local.op_ret = local.buf->op_ret; + local.op_errno = local.buf->op_errno; + + local.async_read_complete = 0; + buf = local.buf; + + if ((int)length < 0) + complete = (local.buf->op_ret <= 0); + else { + local.read_bytes += local.buf->op_ret; + complete = ((local.read_bytes == length) || (local.buf->op_ret <= 0)); + } + } + pthread_mutex_unlock (&local.lock); + + if (local.op_ret > 0) { + unsigned long check = 0; + for (i = 0; i < buf->count; i++) { + buffer *nw_write_buf = buffer_init (); + + check += 
buf->vector[i].iov_len; + + nw_write_buf->used = nw_write_buf->size = buf->vector[i].iov_len + 1; + nw_write_buf->ptr = buf->vector[i].iov_base; + + // buffer_copy_memory (nw_write_buf, buf->vector[i].iov_base, buf->vector[i].iov_len + 1); + offset += local.op_ret; + chunkqueue_append_buffer_weak (cq, nw_write_buf); + } + + network_backend_write (srv, con, con->fd, cq); + + if (chunkqueue_written (cq) != local.op_ret) { + mod_glusterfs_chunkqueue *gf_cq; + glusterfs_chunk->file.start = offset; + if ((int)glusterfs_chunk->file.length > 0) + glusterfs_chunk->file.length -= local.read_bytes; + + gf_cq = calloc (1, sizeof (*gf_cq)); + /* ERR_ABORT (gf_cq); */ + gf_cq->cq = cq; + gf_cq->buf = buf; + gf_cq->length = local.op_ret; + glusterfs_chunk->file.mmap.start = (char *)gf_cq; + return local.read_bytes; + } + + for (c = cq->first ; c; c = c->next) + c->mem->ptr = NULL; + + chunkqueue_reset (cq); + } + + glusterfs_free (buf); + } while (!complete); + + chunkqueue_free (cq); + glusterfs_close (fd); + + if (local.op_ret < 0) + con->http_status = 500; + + return (local.op_ret < 0 ? 
HANDLER_FINISHED : HANDLER_GO_ON); +} + +int mod_glusterfs_network_backend_write(struct server *srv, connection *con, int fd, chunkqueue *cq) +{ + chunk *c, *prev, *first; + int chunks_written = 0; + int error = 0; + + for (first = prev = c = cq->first; c; c = c->next, chunks_written++) { + + if (c->type == MEM_CHUNK && c->mem->used && !c->mem->ptr) { + if (cq->first != c) { + prev->next = NULL; + + /* call stored network_backend_write */ + network_backend_write (srv, con, fd, cq); + + prev->next = c; + } + cq->first = c->next; + + if (c->file.fd < 0) { + error = HANDLER_ERROR; + break; + } + + if (c->file.mmap.start) { + chunk *tmp; + mod_glusterfs_chunkqueue *gf_cq = (mod_glusterfs_chunkqueue *)c->file.mmap.start; + + network_backend_write (srv, con, fd, gf_cq->cq); + + if ((size_t)chunkqueue_written (gf_cq->cq) != gf_cq->length) { + cq->first = first; + return chunks_written; + } + for (tmp = gf_cq->cq->first ; tmp; tmp = tmp->next) + tmp->mem->ptr = NULL; + + chunkqueue_free (gf_cq->cq); + glusterfs_free (gf_cq->buf); + free (gf_cq); + c->file.mmap.start = NULL; + } + + mod_glusterfs_read_async (srv, con, c); //c->file.fd, c->file.start, -1);//c->file.length); + if (c->file.mmap.start) { + /* pending chunkqueue from mod_glusterfs_read_async to be written to network */ + cq->first = first; + return chunks_written; + } + + buffer_free (c->mem); + c->mem = NULL; + + c->type = FILE_CHUNK; + c->offset = c->file.length = 0; + c->file.name = NULL; + + if (first == c) + first = c->next; + + if (cq->last == c) + cq->last = NULL; + + prev->next = c->next; + + free(c); + } + prev = c; + } + + network_backend_write (srv, con, fd, cq); + + cq->first = first; + + return chunks_written; +} + +int chunkqueue_append_glusterfs_file (connection *con, long fd, off_t offset, off_t len) +{ + chunk *c = NULL; + c = chunkqueue_get_append_tempfile (con->write_queue); + + if (c->file.is_temp) { + close (c->file.fd); + unlink (c->file.name->ptr); + } + + c->type = MEM_CHUNK; + + c->mem = 
buffer_init (); + c->mem->used = len + 1; + c->mem->ptr = NULL; + c->offset = 0; + + /* buffer_copy_string_buffer (c->file.name, fn); */ + buffer_free (c->file.name); + + /* fd returned by libglusterfsclient is a pointer */ + c->file.name = (buffer *)fd; + c->file.start = offset; + c->file.length = len; + + //c->file.fd = fd; + c->file.mmap.start = NULL; + return 0; +} + +/* init the plugin data */ +INIT_FUNC(mod_glusterfs_init) { + plugin_data *p; + + p = calloc(1, sizeof(*p)); + /* ERR_ABORT (p); */ + network_backend_write = NULL; + + return p; +} + +/* detroy the plugin data */ +FREE_FUNC(mod_glusterfs_free) { + plugin_data *p = p_d; + + UNUSED (srv); + + if (!p) return HANDLER_GO_ON; + + if (p->config_storage) { + size_t i; + for (i = 0; i < srv->config_context->used; i++) { + plugin_config *s = p->config_storage[i]; + + buffer_free (s->logfile); + buffer_free (s->loglevel); + buffer_free (s->specfile); + buffer_free (s->prefix); + buffer_free (s->xattr_file_size); + buffer_free (s->document_root); + array_free (s->exclude_exts); + + free (s); + } + free (p->config_storage); + } + buffer_free (p->range_buf); + + free (p); + + return HANDLER_GO_ON; +} + +SETDEFAULTS_FUNC(mod_glusterfs_set_defaults) { + plugin_data *p = p_d; + size_t i = 0; + + config_values_t cv[] = { + { "glusterfs.logfile", NULL, T_CONFIG_STRING, T_CONFIG_SCOPE_CONNECTION }, + + { "glusterfs.loglevel", NULL, T_CONFIG_STRING, T_CONFIG_SCOPE_CONNECTION }, + + { "glusterfs.volume-specfile", NULL, T_CONFIG_STRING, T_CONFIG_SCOPE_CONNECTION }, + + { "glusterfs.cache-timeout", NULL, T_CONFIG_SHORT, T_CONFIG_SCOPE_CONNECTION }, + + { "glusterfs.exclude-extensions", NULL, T_CONFIG_ARRAY, T_CONFIG_SCOPE_CONNECTION }, + + /*TODO: get the prefix from config_conext and remove glusterfs.prefix from conf file */ + { "glusterfs.prefix", NULL, T_CONFIG_STRING, T_CONFIG_SCOPE_CONNECTION }, + + { "glusterfs.xattr-interface-size-limit", NULL, T_CONFIG_STRING, T_CONFIG_SCOPE_CONNECTION }, + + { 
"glusterfs.document-root", NULL, T_CONFIG_STRING, T_CONFIG_SCOPE_CONNECTION }, + + { NULL, NULL, T_CONFIG_UNSET, T_CONFIG_SCOPE_UNSET } + }; + + p->config_storage = calloc(1, srv->config_context->used * sizeof(specific_config *)); + /* ERR_ABORT (p->config_storage);*/ + p->range_buf = buffer_init (); + + for (i = 0; i < srv->config_context->used; i++) { + plugin_config *s; + + s = calloc(1, sizeof(plugin_config)); + /* ERR_ABORT (s); */ + s->logfile = buffer_init (); + s->loglevel = buffer_init (); + s->specfile = buffer_init (); + s->document_root = buffer_init (); + s->exclude_exts = array_init (); + s->prefix = buffer_init (); + s->xattr_file_size = buffer_init (); + + cv[0].destination = s->logfile; + cv[1].destination = s->loglevel; + cv[2].destination = s->specfile; + cv[3].destination = &s->cache_timeout; + cv[4].destination = s->exclude_exts; + cv[5].destination = s->prefix; + cv[6].destination = s->xattr_file_size; + cv[7].destination = s->document_root; + p->config_storage[i] = s; + + if (0 != config_insert_values_global(srv, ((data_config *)srv->config_context->data[i])->value, cv)) { + return HANDLER_FINISHED; + } + } + + return HANDLER_GO_ON; +} + +#define PATCH(x) \ + p->conf.x = s->x; + +static int mod_glusterfs_patch_connection(server *srv, connection *con, plugin_data *p) { + size_t i, j; + plugin_config *s; + + /* skip the first, the global context */ + /* glusterfs related config can only occur inside $HTTP["url"] == "<glusterfs-prefix>" */ + p->conf.logfile = NULL; + p->conf.loglevel = NULL; + p->conf.specfile = NULL; + p->conf.cache_timeout = 0; + p->conf.exclude_exts = NULL; + p->conf.prefix = NULL; + p->conf.xattr_file_size = NULL; + p->conf.document_root = NULL; + + for (i = 1; i < srv->config_context->used; i++) { + data_config *dc = (data_config *)srv->config_context->data[i]; + s = p->config_storage[i]; + + /* condition didn't match */ + if (!config_check_cond(srv, con, dc)) continue; + + /* merge config */ + for (j = 0; j < 
dc->value->used; j++) { + data_unset *du = dc->value->data[j]; + + if (buffer_is_equal_string (du->key, CONST_STR_LEN("glusterfs.logfile"))) { + PATCH (logfile); + } else if (buffer_is_equal_string (du->key, CONST_STR_LEN("glusterfs.loglevel"))) { + PATCH (loglevel); + } else if (buffer_is_equal_string (du->key, CONST_STR_LEN ("glusterfs.volume-specfile"))) { + PATCH (specfile); + } else if (buffer_is_equal_string (du->key, CONST_STR_LEN("glusterfs.cache-timeout"))) { + PATCH (cache_timeout); + } else if (buffer_is_equal_string (du->key, CONST_STR_LEN ("glusterfs.exclude-extensions"))) { + PATCH (exclude_exts); + } else if (buffer_is_equal_string (du->key, CONST_STR_LEN ("glusterfs.prefix"))) { + PATCH (prefix); + } else if (buffer_is_equal_string (du->key, CONST_STR_LEN ("glusterfs.xattr-interface-size-limit"))) { + PATCH (xattr_file_size); + } else if (buffer_is_equal_string (du->key, CONST_STR_LEN ("glusterfs.document-root"))) { + PATCH (document_root); + } + } + } + return 0; +} + +#undef PATCH + +static int http_response_parse_range(server *srv, connection *con, plugin_data *p) { + int multipart = 0; + int error; + off_t start, end; + const char *s, *minus; + char *boundary = "fkj49sn38dcn3"; + data_string *ds; + stat_cache_entry *sce = NULL; + buffer *content_type = NULL; + size_t size = 0; + mod_glusterfs_ctx_t *ctx = con->plugin_ctx[p->id]; + + if (p->conf.xattr_file_size && p->conf.xattr_file_size->ptr) { + size = atoi (p->conf.xattr_file_size->ptr); + } + + if (HANDLER_ERROR == stat_cache_get_entry(srv, con, con->physical.path, &sce)) { + SEGFAULT(); + } + + start = 0; + end = sce->st.st_size - 1; + + con->response.content_length = 0; + + if (NULL != (ds = (data_string *)array_get_element(con->response.headers, "Content-Type"))) { + content_type = ds->value; + } + + for (s = con->request.http_range, error = 0; + !error && *s && NULL != (minus = strchr(s, '-')); ) { + char *err; + off_t la, le; + + if (s == minus) { + /* -<stop> */ + + le = strtoll(s, 
&err, 10); + + if (le == 0) { + /* RFC 2616 - 14.35.1 */ + + con->http_status = 416; + error = 1; + } else if (*err == '\0') { + /* end */ + s = err; + + end = sce->st.st_size - 1; + start = sce->st.st_size + le; + } else if (*err == ',') { + multipart = 1; + s = err + 1; + + end = sce->st.st_size - 1; + start = sce->st.st_size + le; + } else { + error = 1; + } + + } else if (*(minus+1) == '\0' || *(minus+1) == ',') { + /* <start>- */ + + la = strtoll(s, &err, 10); + + if (err == minus) { + /* ok */ + + if (*(err + 1) == '\0') { + s = err + 1; + + end = sce->st.st_size - 1; + start = la; + + } else if (*(err + 1) == ',') { + multipart = 1; + s = err + 2; + + end = sce->st.st_size - 1; + start = la; + } else { + error = 1; + } + } else { + /* error */ + error = 1; + } + } else { + /* <start>-<stop> */ + + la = strtoll(s, &err, 10); + + if (err == minus) { + le = strtoll(minus+1, &err, 10); + + /* RFC 2616 - 14.35.1 */ + if (la > le) { + error = 1; + } + + if (*err == '\0') { + /* ok, end*/ + s = err; + + end = le; + start = la; + } else if (*err == ',') { + multipart = 1; + s = err + 1; + + end = le; + start = la; + } else { + /* error */ + + error = 1; + } + } else { + /* error */ + + error = 1; + } + } + + if (!error) { + if (start < 0) start = 0; + + /* RFC 2616 - 14.35.1 */ + if (end > sce->st.st_size - 1) end = sce->st.st_size - 1; + + if (start > sce->st.st_size - 1) { + error = 1; + + con->http_status = 416; + } + } + + if (!error) { + if (multipart) { + /* write boundary-header */ + buffer *b; + + b = chunkqueue_get_append_buffer(con->write_queue); + + buffer_copy_string(b, "\r\n--"); + buffer_append_string(b, boundary); + + /* write Content-Range */ + buffer_append_string(b, "\r\nContent-Range: bytes "); + buffer_append_off_t(b, start); + buffer_append_string(b, "-"); + buffer_append_off_t(b, end); + buffer_append_string(b, "/"); + buffer_append_off_t(b, sce->st.st_size); + + buffer_append_string(b, "\r\nContent-Type: "); + buffer_append_string_buffer(b, 
content_type); + + /* write END-OF-HEADER */ + buffer_append_string(b, "\r\n\r\n"); + + con->response.content_length += b->used - 1; + + } + + /* path = con->physical.path->ptr + p->conf.prefix->used - 1 + con->physical.doc_root->used - 1; */ + /* + fd = glusterfs_open (p->conf.handle, path, O_RDONLY); + if (fd < 0) + return HANDLER_ERROR; + */ + /* fn = buffer_init_string (path); */ + if ((size_t)sce->st.st_size > size) { + chunkqueue_append_glusterfs_file(con, ctx->fd, start, end - start + 1); + } else { + if (!start) { + buffer *mem = buffer_init (); + mem->ptr = ctx->buf; + mem->used = mem->size = sce->st.st_size + 1; + http_chunk_append_buffer (srv, con, mem); + ctx->buf = NULL; + } else { + chunkqueue_append_mem (con->write_queue, ((char *)ctx->buf) + start, end - start + 1); + } + } + + con->response.content_length += end - start + 1; + } + } + + if (ctx->buf) { + free (ctx->buf); + ctx->buf = NULL; + } + + /* something went wrong */ + if (error) return -1; + + if (multipart) { + /* add boundary end */ + buffer *b; + + b = chunkqueue_get_append_buffer(con->write_queue); + + buffer_copy_string_len(b, "\r\n--", 4); + buffer_append_string(b, boundary); + buffer_append_string_len(b, "--\r\n", 4); + + con->response.content_length += b->used - 1; + + /* set header-fields */ + + buffer_copy_string(p->range_buf, "multipart/byteranges; boundary="); + buffer_append_string(p->range_buf, boundary); + + /* overwrite content-type */ + response_header_overwrite(srv, con, CONST_STR_LEN("Content-Type"), CONST_BUF_LEN(p->range_buf)); + } else { + /* add Content-Range-header */ + + buffer_copy_string(p->range_buf, "bytes "); + buffer_append_off_t(p->range_buf, start); + buffer_append_string(p->range_buf, "-"); + buffer_append_off_t(p->range_buf, end); + buffer_append_string(p->range_buf, "/"); + buffer_append_off_t(p->range_buf, sce->st.st_size); + + response_header_insert(srv, con, CONST_STR_LEN("Content-Range"), CONST_BUF_LEN(p->range_buf)); + } + + /* ok, the file is set-up 
*/ + return 0; +} + +PHYSICALPATH_FUNC(mod_glusterfs_handle_physical) { + plugin_data *p = p_d; + stat_cache_entry *sce; + mod_glusterfs_ctx_t *plugin_ctx = NULL; + size_t size = 0; + + if (con->http_status != 0) return HANDLER_GO_ON; + if (con->uri.path->used == 0) return HANDLER_GO_ON; + if (con->physical.path->used == 0) return HANDLER_GO_ON; + + if (con->mode != DIRECT) return HANDLER_GO_ON; + + /* + network_backend_write = srv->network_backend_write; + srv->network_backend_write = mod_glusterfs_network_backend_write; + */ + + switch (con->request.http_method) { + case HTTP_METHOD_GET: + case HTTP_METHOD_POST: + case HTTP_METHOD_HEAD: + break; + + default: + return HANDLER_GO_ON; + } + + mod_glusterfs_patch_connection(srv, con, p); + if (!p->conf.prefix || p->conf.prefix->used == 0) { + return HANDLER_GO_ON; + } + + if (!p->conf.document_root || p->conf.document_root->used == 0) { + log_error_write(srv, __FILE__, __LINE__, "s", "glusterfs.document-root is not specified"); + con->http_status = 500; + return HANDLER_FINISHED; + } + + if (p->conf.handle <= 0) { + glusterfs_init_ctx_t ctx; + + if (!p->conf.specfile || p->conf.specfile->used == 0) { + return HANDLER_GO_ON; + } + memset (&ctx, 0, sizeof (ctx)); + + ctx.specfile = p->conf.specfile->ptr; + ctx.logfile = p->conf.logfile->ptr; + ctx.loglevel = p->conf.loglevel->ptr; + ctx.lookup_timeout = ctx.stat_timeout = p->conf.cache_timeout; + + p->conf.handle = (long)glusterfs_init (&ctx); + + if (p->conf.handle <= 0) { + con->http_status = 500; + log_error_write(srv, __FILE__, __LINE__, "sbs", "glusterfs initialization failed, please check your configuration. Glusterfs logfile ", p->conf.logfile, "might contain details"); + return HANDLER_FINISHED; + } + } + + size = 0; + if (p->conf.xattr_file_size && p->conf.xattr_file_size->ptr) + size = atoi (p->conf.xattr_file_size->ptr); + + if (!con->plugin_ctx[p->id]) { +/* FIXME: what if multiple files are requested from a single connection? 
*/ +/* TODO: check whether this works fine for HTTP protocol 1.1 */ + + buffer *tmp_buf = buffer_init_buffer (con->physical.basedir); + + plugin_ctx = calloc (1, sizeof (*plugin_ctx)); + /* ERR_ABORT (plugin_ctx); */ + con->plugin_ctx[p->id] = plugin_ctx; + + buffer_append_string_buffer (tmp_buf, p->conf.prefix); + buffer_path_simplify (tmp_buf, tmp_buf); + + plugin_ctx->prefix = tmp_buf->used - 1; + if (tmp_buf->ptr[plugin_ctx->prefix - 1] == '/') + plugin_ctx->prefix--; + + buffer_free (tmp_buf); + } else + /*FIXME: error!! error!! */ + plugin_ctx = con->plugin_ctx[p->id]; + + + if (size) + { + plugin_ctx->buf = MALLOC (size); + /* ERR_ABORT (plugin_ctx->buf); */ + } + + plugin_ctx->glusterfs_path = buffer_init (); + buffer_copy_string_buffer (plugin_ctx->glusterfs_path, p->conf.document_root); + buffer_append_string (plugin_ctx->glusterfs_path, "/"); + buffer_append_string (plugin_ctx->glusterfs_path, con->physical.path->ptr + plugin_ctx->prefix); + buffer_path_simplify (plugin_ctx->glusterfs_path, plugin_ctx->glusterfs_path); + + if (glusterfs_stat_cache_get_entry (srv, con, (libglusterfs_handle_t )p->conf.handle, plugin_ctx->glusterfs_path, con->physical.path, plugin_ctx->buf, size, &sce) == HANDLER_ERROR) { + if (errno == ENOENT) + con->http_status = 404; + else + con->http_status = 403; + + free (plugin_ctx->buf); + buffer_free (plugin_ctx->glusterfs_path); + plugin_ctx->glusterfs_path = NULL; + plugin_ctx->buf = NULL; + + free (plugin_ctx); + con->plugin_ctx[p->id] = NULL; + + return HANDLER_FINISHED; + } + + if (!(S_ISREG (sce->st.st_mode) && (size_t)sce->st.st_size <= size)) { + free (plugin_ctx->buf); + plugin_ctx->buf = NULL; + } + + return HANDLER_GO_ON; +} + +static int http_chunk_append_len(server *srv, connection *con, size_t len) { + size_t i, olen = len, j; + buffer *b; + + b = srv->tmp_chunk_len; + + if (len == 0) { + buffer_copy_string(b, "0"); + } else { + for (i = 0; i < 8 && len; i++) { + len >>= 4; + } + + /* i is the number of hex digits we 
have */ + buffer_prepare_copy(b, i + 1); + + for (j = i-1, len = olen; j+1 > 0; j--) { + b->ptr[j] = (len & 0xf) + (((len & 0xf) <= 9) ? '0' : 'a' - 10); + len >>= 4; + } + b->used = i; + b->ptr[b->used++] = '\0'; + } + + buffer_append_string(b, "\r\n"); + chunkqueue_append_buffer(con->write_queue, b); + + return 0; +} + +int http_chunk_append_glusterfs_file_chunk(server *srv, connection *con, long fd, off_t offset, off_t len) { + chunkqueue *cq; + + if (!con) return -1; + + cq = con->write_queue; + + if (con->response.transfer_encoding & HTTP_TRANSFER_ENCODING_CHUNKED) { + http_chunk_append_len(srv, con, len); + } + + chunkqueue_append_glusterfs_file (con, fd, offset, len); + + if (con->response.transfer_encoding & HTTP_TRANSFER_ENCODING_CHUNKED && len > 0) { + chunkqueue_append_mem(cq, "\r\n", 2 + 1); + } + + return 0; +} + +int http_chunk_append_glusterfs_mem(server *srv, connection *con, const char * mem, size_t len) { + chunkqueue *cq = NULL; + buffer *buf = NULL; + + if (!con) return -1; + + cq = con->write_queue; + + if (len == 0) { + if (con->response.transfer_encoding & HTTP_TRANSFER_ENCODING_CHUNKED) { + chunkqueue_append_mem(cq, "0\r\n\r\n", 5 + 1); + } else { + chunkqueue_append_mem(cq, "", 1); + } + return 0; + } + + if (con->response.transfer_encoding & HTTP_TRANSFER_ENCODING_CHUNKED) { + http_chunk_append_len(srv, con, len - 1); + } + + buf = buffer_init (); + + buf->used = len + 1; + buf->ptr = (char *)mem; + chunkqueue_append_buffer_weak (cq, buf); + + if (con->response.transfer_encoding & HTTP_TRANSFER_ENCODING_CHUNKED) { + chunkqueue_append_mem(cq, "\r\n", 2 + 1); + } + + return 0; +} + + + +URIHANDLER_FUNC(mod_glusterfs_subrequest) { + plugin_data *p = p_d; + stat_cache_entry *sce = NULL; + int s_len; + char allow_caching = 1; + size_t size = 0; + mod_glusterfs_ctx_t *ctx = con->plugin_ctx[p->id]; + + /* someone else has done a decision for us */ + if (con->http_status != 0) return HANDLER_GO_ON; + if (con->uri.path->used == 0) return 
HANDLER_GO_ON; + if (con->physical.path->used == 0) return HANDLER_GO_ON; + + /* someone else has handled this request */ + if (con->mode != DIRECT) return HANDLER_GO_ON; + + /* we only handle GET, POST and HEAD */ + switch(con->request.http_method) { + case HTTP_METHOD_GET: + case HTTP_METHOD_POST: + case HTTP_METHOD_HEAD: + break; + default: + return HANDLER_GO_ON; + } + + mod_glusterfs_patch_connection(srv, con, p); + + if (!p->conf.prefix || !p->conf.prefix->used) + return HANDLER_GO_ON; + + s_len = con->uri.path->used - 1; + /* ignore certain extensions */ + /* + for (k = 0; k < p->conf.exclude_exts->used; k++) { + data_string *ds; + ds = (data_string *)p->conf.exclude_exts->data[k]; + + if (ds->value->used == 0) continue; + + if (!strncmp (ds->value->ptr, con->uri.path->ptr, strlen (ds->value->ptr))) + break; + } + + if (k == p->conf.exclude_exts->used) { + return HANDLER_GO_ON; + } + */ + + if (con->conf.log_request_handling) { + log_error_write(srv, __FILE__, __LINE__, "s", "-- serving file from glusterfs"); + } + + if (HANDLER_ERROR == stat_cache_get_entry(srv, con, con->physical.path, &sce)) { + con->http_status = 403; + + log_error_write(srv, __FILE__, __LINE__, "sbsb", + "not a regular file:", con->uri.path, + "->", con->physical.path); + + free (ctx); + con->plugin_ctx[p->id] = NULL; + + return HANDLER_FINISHED; + } + + if (con->uri.path->ptr[s_len] == '/' || !S_ISREG(sce->st.st_mode)) { + free (ctx); + con->plugin_ctx[p->id] = NULL; + return HANDLER_FINISHED; + } + + if (p->conf.xattr_file_size && p->conf.xattr_file_size->ptr) + size = atoi (p->conf.xattr_file_size->ptr); + + if ((size_t)sce->st.st_size > size) { + ctx->fd = glusterfs_open ((libglusterfs_handle_t ) ((unsigned long)p->conf.handle), ctx->glusterfs_path->ptr, O_RDONLY, 0); + + if (((long)ctx->fd) == 0) { + con->http_status = 403; + free (ctx); + con->plugin_ctx[p->id] = NULL; + return HANDLER_FINISHED; + } + } + + buffer_free (ctx->glusterfs_path); + ctx->glusterfs_path = NULL; + + /* we 
only handline regular files */ +#ifdef HAVE_LSTAT + if ((sce->is_symlink == 1) && !con->conf.follow_symlink) { + con->http_status = 403; + + if (con->conf.log_request_handling) { + log_error_write(srv, __FILE__, __LINE__, "s", "-- access denied due symlink restriction"); + log_error_write(srv, __FILE__, __LINE__, "sb", "Path :", con->physical.path); + } + + buffer_reset(con->physical.path); + free (ctx); + con->plugin_ctx[p->id] = NULL; + return HANDLER_FINISHED; + } +#endif + if (!S_ISREG(sce->st.st_mode)) { + con->http_status = 404; + + if (con->conf.log_file_not_found) { + log_error_write(srv, __FILE__, __LINE__, "sbsb", + "not a regular file:", con->uri.path, + "->", sce->name); + } + + free (ctx); + con->plugin_ctx[p->id] = NULL; + + return HANDLER_FINISHED; + } + + /* mod_compress might set several data directly, don't overwrite them */ + + /* set response content-type, if not set already */ + + if (NULL == array_get_element(con->response.headers, "Content-Type")) { + if (buffer_is_empty(sce->content_type)) { + /* we are setting application/octet-stream, but also announce that + * this header field might change in the seconds few requests + * + * This should fix the aggressive caching of FF and the script download + * seen by the first installations + */ + response_header_overwrite(srv, con, CONST_STR_LEN("Content-Type"), CONST_STR_LEN("application/octet-stream")); + + allow_caching = 0; + } else { + response_header_overwrite(srv, con, CONST_STR_LEN("Content-Type"), CONST_BUF_LEN(sce->content_type)); + } + } + + if (con->conf.range_requests) { + response_header_overwrite(srv, con, CONST_STR_LEN("Accept-Ranges"), CONST_STR_LEN("bytes")); + } + + /* TODO: Allow Cachable requests */ +#if 0 + if (allow_caching) { + if (p->conf.etags_used && con->etag_flags != 0 && !buffer_is_empty(sce->etag)) { + if (NULL == array_get_element(con->response.headers, "ETag")) { + /* generate e-tag */ + etag_mutate(con->physical.etag, sce->etag); + + response_header_overwrite(srv, 
con, CONST_STR_LEN("ETag"), CONST_BUF_LEN(con->physical.etag)); + } + } + + /* prepare header */ + if (NULL == (ds = (data_string *)array_get_element(con->response.headers, "Last-Modified"))) { + mtime = strftime_cache_get(srv, sce->st.st_mtime); + response_header_overwrite(srv, con, CONST_STR_LEN("Last-Modified"), CONST_BUF_LEN(mtime)); + } else { + mtime = ds->value; + } + + if (HANDLER_FINISHED == http_response_handle_cachable(srv, con, mtime)) { + free (ctx); + con->plugin_ctx[p->id] = NULL; + return HANDLER_FINISHED; + } + } +#endif + + /*TODO: Read about etags */ + if (con->request.http_range && con->conf.range_requests) { + int do_range_request = 1; + data_string *ds = NULL; + buffer *mtime = NULL; + /* check if we have a conditional GET */ + + /* prepare header */ + if (NULL == (ds = (data_string *)array_get_element(con->response.headers, "Last-Modified"))) { + mtime = strftime_cache_get(srv, sce->st.st_mtime); + response_header_overwrite(srv, con, CONST_STR_LEN("Last-Modified"), CONST_BUF_LEN(mtime)); + } else { + mtime = ds->value; + } + + if (NULL != (ds = (data_string *)array_get_element(con->request.headers, "If-Range"))) { + /* if the value is the same as our ETag, we do a Range-request, + * otherwise a full 200 */ + + if (ds->value->ptr[0] == '"') { + /** + * client wants a ETag + */ + if (!con->physical.etag) { + do_range_request = 0; + } else if (!buffer_is_equal(ds->value, con->physical.etag)) { + do_range_request = 0; + } + } else if (!mtime) { + /** + * we don't have a Last-Modified and can match the If-Range: + * + * sending all + */ + do_range_request = 0; + } else if (!buffer_is_equal(ds->value, mtime)) { + do_range_request = 0; + } + } + + if (do_range_request) { + /* content prepared, I'm done */ + con->file_finished = 1; + + if (0 == http_response_parse_range(srv, con, p)) { + con->http_status = 206; + } + + free (ctx); + con->plugin_ctx[p->id] = NULL; + return HANDLER_FINISHED; + } + } + + /* if we are still here, prepare body */ + + /* 
we add it here for all requests + * the HEAD request will drop it afterwards again + */ + /*TODO check whether 1 should be subtracted */ + + if (p->conf.xattr_file_size && p->conf.xattr_file_size->ptr) + size = atoi (p->conf.xattr_file_size->ptr); + + if (size < (size_t)sce->st.st_size) { + http_chunk_append_glusterfs_file_chunk (srv, con, ctx->fd, 0, sce->st.st_size); + } else { + http_chunk_append_glusterfs_mem (srv, con, ctx->buf, sce->st.st_size); + } + + con->http_status = 200; + con->file_finished = 1; + + free (ctx); + con->plugin_ctx[p->id] = NULL; + + return HANDLER_FINISHED; +} + +#if 0 +URIHANDLER_FUNC(mod_glusterfs_request_done) +{ + mod_glusterfs_read_buf_t *cur = first, *prev; + while (cur) { + prev = cur; + glusterfs_free (cur->buf); + cur = cur->next; + free (prev); + } + first = NULL + } +#endif + +/* this function is called at dlopen() time and inits the callbacks */ +CONNECTION_FUNC(mod_glusterfs_connection_reset) +{ + (void) p_d; + (void) con; + if (!network_backend_write) + network_backend_write = srv->network_backend_write; + + srv->network_backend_write = mod_glusterfs_network_backend_write; + + return HANDLER_GO_ON; +} + +int mod_glusterfs_plugin_init(plugin *p) { + p->version = LIGHTTPD_VERSION_ID; + p->name = buffer_init_string("glusterfs"); + p->init = mod_glusterfs_init; + p->handle_physical = mod_glusterfs_handle_physical; + p->handle_subrequest_start = mod_glusterfs_subrequest; + // p->handle_request_done = mod_glusterfs_request_done; + p->set_defaults = mod_glusterfs_set_defaults; + p->connection_reset = mod_glusterfs_connection_reset; + p->cleanup = mod_glusterfs_free; + + p->data = NULL; + + return 0; +} + + +/* mod_glusterfs_stat_cache */ +static stat_cache_entry * stat_cache_entry_init(void) { + stat_cache_entry *sce = NULL; + + sce = calloc(1, sizeof(*sce)); + /* ERR_ABORT (sce); */ + + sce->name = buffer_init(); + sce->etag = buffer_init(); + sce->content_type = buffer_init(); + + return sce; +} + +#ifdef HAVE_FAM_H +static 
fam_dir_entry * fam_dir_entry_init(void) { + fam_dir_entry *fam_dir = NULL; + + fam_dir = calloc(1, sizeof(*fam_dir)); + /* ERR_ABORT (fam_dir); */ + + fam_dir->name = buffer_init(); + + return fam_dir; +} + +static void fam_dir_entry_free(void *data) { + fam_dir_entry *fam_dir = data; + + if (!fam_dir) return; + + FAMCancelMonitor(fam_dir->fc, fam_dir->req); + + buffer_free(fam_dir->name); + free(fam_dir->req); + + free(fam_dir); +} +#endif + +#ifdef HAVE_XATTR +static int stat_cache_attr_get(buffer *buf, char *name) { + int attrlen; + int ret; + + attrlen = 1024; + buffer_prepare_copy(buf, attrlen); + attrlen--; + if(0 == (ret = attr_get(name, "Content-Type", buf->ptr, &attrlen, 0))) { + buf->used = attrlen + 1; + buf->ptr[attrlen] = '\0'; + } + return ret; +} +#endif + +/* the famous DJB hash function for strings */ +static uint32_t hashme(buffer *str) { + uint32_t hash = 5381; + const char *s; + for (s = str->ptr; *s; s++) { + hash = ((hash << 5) + hash) + *s; + } + + hash &= ~(1 << 31); /* strip the highest bit */ + + return hash; +} + + +#ifdef HAVE_LSTAT +static int stat_cache_lstat(server *srv, buffer *dname, struct stat *lst) { + if (lstat(dname->ptr, lst) == 0) { + return S_ISLNK(lst->st_mode) ? 
0 : 1; + } + else { + log_error_write(srv, __FILE__, __LINE__, "sbs", + "lstat failed for:", + dname, strerror(errno)); + }; + return -1; +} +#endif + +/*** + * + * + * + * returns: + * - HANDLER_FINISHED on cache-miss (don't forget to reopen the file) + * - HANDLER_ERROR on stat() failed -> see errno for problem + */ + +handler_t glusterfs_stat_cache_get_entry(server *srv, + connection *con, + libglusterfs_handle_t handle, + buffer *glusterfs_path, + buffer *name, + void *buf, + size_t size, + stat_cache_entry **ret_sce) +{ +#ifdef HAVE_FAM_H + fam_dir_entry *fam_dir = NULL; + int dir_ndx = -1; + splay_tree *dir_node = NULL; +#endif + stat_cache_entry *sce = NULL; + stat_cache *sc; + struct stat st; + size_t k; +#ifdef DEBUG_STAT_CACHE + size_t i; +#endif + int file_ndx; + splay_tree *file_node = NULL; + + *ret_sce = NULL; + memset (&st, 0, sizeof (st)); + + /* + * check if the directory for this file has changed + */ + + sc = srv->stat_cache; + + buffer_copy_string_buffer(sc->hash_key, name); + buffer_append_long(sc->hash_key, con->conf.follow_symlink); + + file_ndx = hashme(sc->hash_key); + sc->files = splaytree_splay(sc->files, file_ndx); + +#ifdef DEBUG_STAT_CACHE + for (i = 0; i < ctrl.used; i++) { + if (ctrl.ptr[i] == file_ndx) break; + } +#endif + + if (sc->files && (sc->files->key == file_ndx)) { +#ifdef DEBUG_STAT_CACHE + /* it was in the cache */ + assert(i < ctrl.used); +#endif + + /* we have seen this file already and + * don't stat() it again in the same second */ + + file_node = sc->files; + + sce = file_node->data; + + /* check if the name is the same, we might have a collision */ + + if (buffer_is_equal(name, sce->name)) { + if (srv->srvconf.stat_cache_engine == STAT_CACHE_ENGINE_SIMPLE) { + if (sce->stat_ts == srv->cur_ts && !buf) { + *ret_sce = sce; + return HANDLER_GO_ON; + } + } + } else { + /* oops, a collision, + * + * file_node is used by the FAM check below to see if we know this file + * and if we can save a stat(). 
+ * + * BUT, the sce is not reset here as the entry into the cache is ok, we + * it is just not pointing to our requested file. + * + * */ + + file_node = NULL; + } + } else { +#ifdef DEBUG_STAT_CACHE + if (i != ctrl.used) { + fprintf(stderr, "%s.%d: %08x was already inserted but not found in cache, %s\n", __FILE__, __LINE__, file_ndx, name->ptr); + } + assert(i == ctrl.used); +#endif + } + /* + * *lol* + * - open() + fstat() on a named-pipe results in a (intended) hang. + * - stat() if regular file + open() to see if we can read from it is better + * + * */ + if (-1 == glusterfs_lookup(handle, glusterfs_path->ptr, buf, size, &st)) { + return HANDLER_ERROR; + } + + if (NULL == sce) { + int osize = 0; + + if (sc->files) { + osize = sc->files->size; + } + + sce = stat_cache_entry_init(); + buffer_copy_string_buffer(sce->name, name); + + sc->files = splaytree_insert(sc->files, file_ndx, sce); +#ifdef DEBUG_STAT_CACHE + if (ctrl.size == 0) { + ctrl.size = 16; + ctrl.used = 0; + ctrl.ptr = MALLOC(ctrl.size * sizeof(*ctrl.ptr)); + /* ERR_ABORT (ctrl.ptr); */ + } else if (ctrl.size == ctrl.used) { + ctrl.size += 16; + ctrl.ptr = realloc(ctrl.ptr, ctrl.size * sizeof(*ctrl.ptr)); + /* ERR_ABORT (ctrl.ptr); */ + } + + ctrl.ptr[ctrl.used++] = file_ndx; + + assert(sc->files); + assert(sc->files->data == sce); + assert(osize + 1 == splaytree_size(sc->files)); +#endif + } + + sce->st = st; + sce->stat_ts = srv->cur_ts; + + /* catch the obvious symlinks + * + * this is not a secure check as we still have a race-condition between + * the stat() and the open. We can only solve this by + * 1. open() the file + * 2. fstat() the fd + * + * and keeping the file open for the rest of the time. But this can + * only be done at network level. + * + * per default it is not a symlink + * */ +#ifdef HAVE_LSTAT + sce->is_symlink = 0; + + /* we want to only check for symlinks if we should block symlinks. 
+ */ + if (!con->conf.follow_symlink) { + if (stat_cache_lstat(srv, name, &lst) == 0) { +#ifdef DEBUG_STAT_CACHE + log_error_write(srv, __FILE__, __LINE__, "sb", + "found symlink", name); +#endif + sce->is_symlink = 1; + } + + /* + * we assume "/" can not be symlink, so + * skip the symlink stuff if our path is / + **/ + else if ((name->used > 2)) { + buffer *dname; + char *s_cur; + + dname = buffer_init(); + buffer_copy_string_buffer(dname, name); + + while ((s_cur = strrchr(dname->ptr,'/'))) { + *s_cur = '\0'; + dname->used = s_cur - dname->ptr + 1; + if (dname->ptr == s_cur) { +#ifdef DEBUG_STAT_CACHE + log_error_write(srv, __FILE__, __LINE__, "s", "reached /"); +#endif + break; + } +#ifdef DEBUG_STAT_CACHE + log_error_write(srv, __FILE__, __LINE__, "sbs", + "checking if", dname, "is a symlink"); +#endif + if (stat_cache_lstat(srv, dname, &lst) == 0) { + sce->is_symlink = 1; +#ifdef DEBUG_STAT_CACHE + log_error_write(srv, __FILE__, __LINE__, "sb", + "found symlink", dname); +#endif + break; + }; + }; + buffer_free(dname); + }; + }; +#endif + + if (S_ISREG(st.st_mode)) { + /* determine mimetype */ + buffer_reset(sce->content_type); + + for (k = 0; k < con->conf.mimetypes->used; k++) { + data_string *ds = (data_string *)con->conf.mimetypes->data[k]; + buffer *type = ds->key; + + if (type->used == 0) continue; + + /* check if the right side is the same */ + if (type->used > name->used) continue; + + if (0 == strncasecmp(name->ptr + name->used - type->used, type->ptr, type->used - 1)) { + buffer_copy_string_buffer(sce->content_type, ds->value); + break; + } + } + etag_create(sce->etag, &(sce->st), con->etag_flags); +#ifdef HAVE_XATTR + if (con->conf.use_xattr && buffer_is_empty(sce->content_type)) { + stat_cache_attr_get(sce->content_type, name->ptr); + } +#endif + } else if (S_ISDIR(st.st_mode)) { + etag_create(sce->etag, &(sce->st), con->etag_flags); + } + +#ifdef HAVE_FAM_H + if (sc->fam && + (srv->srvconf.stat_cache_engine == STAT_CACHE_ENGINE_FAM)) { + /* is 
this directory already registered ? */ + if (!dir_node) { + fam_dir = fam_dir_entry_init(); + fam_dir->fc = sc->fam; + + buffer_copy_string_buffer(fam_dir->name, sc->dir_name); + + fam_dir->version = 1; + + fam_dir->req = calloc(1, sizeof(FAMRequest)); + /* ERR_ABORT (fam_dir->req); */ + + if (0 != FAMMonitorDirectory(sc->fam, fam_dir->name->ptr, + fam_dir->req, fam_dir)) { + + log_error_write(srv, __FILE__, __LINE__, "sbsbs", + "monitoring dir failed:", + fam_dir->name, + "file:", name, + FamErrlist[FAMErrno]); + + fam_dir_entry_free(fam_dir); + } else { + int osize = 0; + + if (sc->dirs) { + osize = sc->dirs->size; + } + + sc->dirs = splaytree_insert(sc->dirs, dir_ndx, fam_dir); + assert(sc->dirs); + assert(sc->dirs->data == fam_dir); + assert(osize == (sc->dirs->size - 1)); + } + } else { + fam_dir = dir_node->data; + } + + /* bind the fam_fc to the stat() cache entry */ + + if (fam_dir) { + sce->dir_version = fam_dir->version; + sce->dir_ndx = dir_ndx; + } + } +#endif + + *ret_sce = sce; + + return HANDLER_GO_ON; +} + +/** + * remove stat() from cache which havn't been stat()ed for + * more than 10 seconds + * + * + * walk though the stat-cache, collect the ids which are too old + * and remove them in a second loop + */ + +static int stat_cache_tag_old_entries(server *srv, splay_tree *t, int *keys, size_t *ndx) { + stat_cache_entry *sce; + + if (!t) return 0; + + stat_cache_tag_old_entries(srv, t->left, keys, ndx); + stat_cache_tag_old_entries(srv, t->right, keys, ndx); + + sce = t->data; + + if (srv->cur_ts - sce->stat_ts > 2) { + keys[(*ndx)++] = t->key; + } + + return 0; +} diff --git a/mod_glusterfs/lighttpd/1.4/mod_glusterfs.h b/mod_glusterfs/lighttpd/1.4/mod_glusterfs.h new file mode 100644 index 000000000..5dc65cbcf --- /dev/null +++ b/mod_glusterfs/lighttpd/1.4/mod_glusterfs.h @@ -0,0 +1,29 @@ +/* + Copyright (c) 2008 Z RESEARCH, Inc. <http://www.zresearch.com> + This file is part of GlusterFS. 
+ + GlusterFS is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published + by the Free Software Foundation; either version 3 of the License, + or (at your option) any later version. + + GlusterFS is distributed in the hope that it will be useful, but + WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program. If not, see + <http://www.gnu.org/licenses/>. +*/ + +#ifndef _MOD_GLUSTERFS_FILE_CACHE_H_ +#define _MOD_GLUSTERFS_FILE_CACHE_H_ + +#include "stat_cache.h" +#include <libglusterfsclient.h> +#include "base.h" + +handler_t glusterfs_stat_cache_get_entry(server *srv, connection *con, libglusterfs_handle_t handle, buffer *glusterfs_path, buffer *name, void *buf, size_t size, stat_cache_entry **fce); + +#endif diff --git a/mod_glusterfs/lighttpd/1.5/Makefile.am b/mod_glusterfs/lighttpd/1.5/Makefile.am new file mode 100644 index 000000000..eda329111 --- /dev/null +++ b/mod_glusterfs/lighttpd/1.5/Makefile.am @@ -0,0 +1,3 @@ +EXTRA_DIST = Makefile.am.diff mod_glusterfs.c mod_glusterfs.h README.txt + +CLEANFILES = diff --git a/mod_glusterfs/lighttpd/1.5/Makefile.am.diff b/mod_glusterfs/lighttpd/1.5/Makefile.am.diff new file mode 100644 index 000000000..375696b5d --- /dev/null +++ b/mod_glusterfs/lighttpd/1.5/Makefile.am.diff @@ -0,0 +1,29 @@ +--- lighttpd-1.4.19/src/Makefile.am 2008-04-16 18:42:18.000000000 +0400 ++++ lighttpd-1.4.19.mod/src/Makefile.am 2008-04-16 18:41:11.000000000 +0400 +@@ -1,4 +1,4 @@ +-AM_CFLAGS = $(FAM_CFLAGS) ++AM_CFLAGS = $(FAM_CFLAGS) -D_FILE_OFFSET_BITS=64 -D__USE_FILE_OFFSET64 + + noinst_PROGRAMS=proc_open lemon # simple-fcgi #graphic evalo bench ajp ssl error_test adserver gen-license + sbin_PROGRAMS=lighttpd lighttpd-angel +@@ -241,6 +241,11 @@ + mod_accesslog_la_LDFLAGS = 
-module -export-dynamic -avoid-version -no-undefined + mod_accesslog_la_LIBADD = $(common_libadd) + ++lib_LTLIBRARIES += mod_glusterfs.la ++mod_glusterfs_la_SOURCES = mod_glusterfs.c ++mod_glusterfs_la_CFLAGS = $(AM_CFLAGS) ++mod_glusterfs_la_LDFLAGS = -module -export-dynamic -avoid-version -no-undefined -lglusterfsclient -lpthread ++mod_glusterfs_la_LIBADD = $(common_libadd) + + hdr = server.h buffer.h network.h log.h keyvalue.h \ + response.h request.h fastcgi.h chunk.h \ +@@ -254,7 +259,7 @@ + configparser.h mod_ssi_exprparser.h \ + sys-mmap.h sys-socket.h mod_cml.h mod_cml_funcs.h \ + splaytree.h proc_open.h status_counter.h \ +- mod_magnet_cache.h ++ mod_magnet_cache.h mod_glusterfs.h + + DEFS= @DEFS@ -DLIBRARY_DIR="\"$(libdir)\"" -DSBIN_DIR="\"$(sbindir)\"" + diff --git a/mod_glusterfs/lighttpd/1.5/README.txt b/mod_glusterfs/lighttpd/1.5/README.txt new file mode 100644 index 000000000..bdbdfffbc --- /dev/null +++ b/mod_glusterfs/lighttpd/1.5/README.txt @@ -0,0 +1,57 @@ +Introduction +============ +mod_glusterfs is a module written for lighttpd to speed up the access of files present on glusterfs. mod_glusterfs uses the libglusterfsclient library provided for glusterfs and hence can be used without FUSE (Filesystem in Userspace). + +Usage +===== +To use mod_glusterfs with lighttpd-1.5, copy mod_glusterfs.c and mod_glusterfs.h into src/ of the lighttpd-1.5 source tree, and apply Makefile.am.diff to src/Makefile.am. Re-run ./autogen.sh at the top level of the lighttpd-1.5 build tree and recompile. + +# cp mod_glusterfs.[ch] /home/glusterfs/lighttpd-1.5/src/ +# cp Makefile.am.diff /home/glusterfs/lighttpd-1.5/ +# cd /home/glusterfs/lighttpd-1.5 +# patch -p1 < Makefile.am.diff +# ./autogen.sh +# ./configure +# make +# make install + +Configuration +============= +* mod_glusterfs should be listed at the beginning of the list server.modules in lighttpd.conf. + +Below is a snippet from lighttpd.conf concerning mod_glusterfs.
+ +$HTTP["url"] =~ "^/glusterfs" { + glusterfs.prefix = "/glusterfs" + glusterfs.logfile = "/var/log/glusterfs-logfile" + glusterfs.document-root = "/home/glusterfs/document-root" + glusterfs.volume-specfile = "/etc/glusterfs/glusterfs.vol" + glusterfs.loglevel = "error" + glusterfs.cache-timeout = 300 + glusterfs.xattr-interface-size-limit = "65536" +} + +* $HTTP["url"] =~ "^/glusterfs" + A Perl-style regular expression used to match against the url. If the regular expression matches the url, the url is handled by mod_glusterfs. Note that the pattern given here should match glusterfs.prefix. + +* glusterfs.prefix (COMPULSORY) + A string to be present at the start of the file path in the url so that the file would be handled by glusterfs. + E.g., a GET request on the url http://www.example.com/glusterfs-prefix/some-dir/example-file will result in fetching of the file "/some-dir/example-file" from glusterfs mount if glusterfs.prefix is set to "/glusterfs-prefix". + +* glusterfs.volume-specfile (COMPULSORY) + Path to the glusterfs volume specification file. + +* glusterfs.logfile (COMPULSORY) + Path to the glusterfs logfile. + +* glusterfs.loglevel (OPTIONAL, default = warning) + Allowed values are critical, error, warning, debug, none in the decreasing order of severity of error conditions. + +* glusterfs.cache-timeout (OPTIONAL, default = 0) + Timeout value for the glusterfs stat and lookup caches. + +* glusterfs.document-root (COMPULSORY) + An absolute path, relative to which all the files are fetched from glusterfs. + +* glusterfs.xattr-interface-size-limit (OPTIONAL, default = 0) + Files with sizes up to and including this value are fetched through the extended attribute interface of glusterfs rather than the usual open-read-close set of operations. For files of small sizes, it is recommended to use the extended attribute interface.
diff --git a/mod_glusterfs/lighttpd/1.5/mod_glusterfs.c b/mod_glusterfs/lighttpd/1.5/mod_glusterfs.c new file mode 100644 index 000000000..4329640e2 --- /dev/null +++ b/mod_glusterfs/lighttpd/1.5/mod_glusterfs.c @@ -0,0 +1,1476 @@ +/* + Copyright (c) 2008 Z RESEARCH, Inc. <http://www.zresearch.com> + This file is part of GlusterFS. + + GlusterFS is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published + by the Free Software Foundation; either version 3 of the License, + or (at your option) any later version. + + GlusterFS is distributed in the hope that it will be useful, but + WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program. If not, see + <http://www.gnu.org/licenses/>. +*/ + +#include <ctype.h> +#include <stdlib.h> +#include <stdio.h> +#include <string.h> +#include <pthread.h> +#include <sys/types.h> +#include <fcntl.h> + +#include <sys/types.h> +#include <sys/stat.h> + +#include <errno.h> +#include <unistd.h> +#include <assert.h> + +#include "base.h" +#include "log.h" +#include "buffer.h" + +#include "plugin.h" + +#include "stat_cache.h" +#include "mod_glusterfs.h" +#include "etag.h" +#include "response.h" + +#include "fdevent.h" +#include "joblist.h" +#include "http_req_range.h" +#include "connections.h" +#include "configfile.h" + +#include <libglusterfsclient.h> + +#ifdef HAVE_ATTR_ATTRIBUTES_H +#include <attr/attributes.h> +#endif + +#ifdef HAVE_FAM_H +# include <fam.h> +#endif + +#include "sys-mmap.h" + +/* NetBSD 1.3.x needs it */ +#ifndef MAP_FAILED +# define MAP_FAILED -1 +#endif + +#ifndef O_LARGEFILE +# define O_LARGEFILE 0 +#endif + +#ifndef HAVE_LSTAT +#define lstat stat +#endif + +#if 0 +/* enables debug code for testing if all nodes in the stat-cache as accessable */ 
+#define DEBUG_STAT_CACHE +#endif + +#ifdef HAVE_LSTAT +#undef HAVE_LSTAT +#endif + +#define GLUSTERFS_FILE_CHUNK (FILE_CHUNK + 1) + +/* Keep this value large. Each glusterfs_async_read of GLUSTERFS_CHUNK_SIZE results in a network_backend_write of the read data*/ + +#define GLUSTERFS_CHUNK_SIZE 8192 + +/** + * this is a staticfile for a lighttpd plugin + * + */ + + +/* plugin config for all request/connections */ + +typedef struct { + buffer *logfile; + buffer *loglevel; + buffer *specfile; + buffer *prefix; + buffer *xattr_file_size; + buffer *document_root; + array *exclude_exts; + unsigned short cache_timeout; + + /* FIXME: its a pointer, hence cant be short */ + unsigned long handle; +} plugin_config; + +static network_status_t (*network_backend_write)(struct server *srv, connection *con, iosocket *sock, chunkqueue *cq); + +typedef struct { + PLUGIN_DATA; + buffer *range_buf; + plugin_config **config_storage; + http_req_range *ranges; + plugin_config conf; +} plugin_data; + +typedef struct glusterfs_async_local { + int op_ret; + int op_errno; + pthread_mutex_t lock; + pthread_cond_t cond; + connection *con; + server *srv; + plugin_data *p; + + union { + struct { + char async_read_complete; + off_t length; + size_t read_bytes; + glusterfs_read_buf_t *buf; + }readv; + + struct { + buffer *name; + buffer *hash_key; + size_t size; + }lookup; + }fop; +} glusterfs_async_local_t; + +typedef struct { + unsigned long fd; + buffer *glusterfs_path; + void *buf; + off_t response_content_length; + int prefix; +}mod_glusterfs_ctx_t; + +typedef struct { + chunkqueue *cq; + glusterfs_read_buf_t *buf; + size_t length; +}mod_glusterfs_chunkqueue; + +#ifdef HAVE_FAM_H +typedef struct { + FAMRequest *req; + FAMConnection *fc; + + buffer *name; + + int version; +} fam_dir_entry; +#endif + +/* the directory name is too long to always compare on it + * - we need a hash + * - the hash-key is used as sorting criteria for a tree + * - a splay-tree is used as we can use the caching 
effect of it + */ + +/* we want to cleanup the stat-cache every few seconds, let's say 10 + * + * - remove entries which are outdated since 30s + * - remove entries which are fresh but havn't been used since 60s + * - if we don't have a stat-cache entry for a directory, release it from the monitor + */ + +#ifdef DEBUG_STAT_CACHE +typedef struct { + int *ptr; + + size_t used; + size_t size; +} fake_keys; + +static fake_keys ctrl; +#endif + +static stat_cache_entry * +stat_cache_entry_init(void) +{ + stat_cache_entry *sce = NULL; + + sce = calloc(1, sizeof(*sce)); + /* ERR_ABORT (sce); */ + + sce->name = buffer_init(); + sce->etag = buffer_init(); + sce->content_type = buffer_init(); + + return sce; +} + +int chunkqueue_append_glusterfs_mem (chunkqueue *cq, const char * mem, size_t len) { + buffer *buf = NULL; + + buf = chunkqueue_get_append_buffer (cq); + + if (buf->ptr) + free (buf->ptr); + + buf->used = len + 1; + buf->ptr = (char *)mem; + buf->size = len; + + return 0; +} + +static int +glusterfs_lookup_async_cbk (int op_ret, + int op_errno, + void *buf, + struct stat *st, + void *cbk_data) +{ + glusterfs_async_local_t *local = cbk_data; + + mod_glusterfs_ctx_t *ctx = NULL; + ctx = local->con->plugin_ctx[local->p->id]; + + assert (ctx->buf== buf); + + if (op_ret || !(S_ISREG (st->st_mode) && (size_t)st->st_size <= local->fop.lookup.size)) { + + free (ctx->buf); + ctx->buf = NULL; + + if (op_ret) { + buffer_free (ctx->glusterfs_path); + ctx->glusterfs_path = NULL; + free (ctx); + local->con->plugin_ctx[local->p->id] = NULL; + + if (op_errno == ENOENT) + local->con->http_status = 404; + else + local->con->http_status = 403; + } + } + + if (!op_ret) { + stat_cache_entry *sce = NULL; + stat_cache *sc = local->srv->stat_cache; + + sce = (stat_cache_entry *)g_hash_table_lookup(sc->files, local->fop.lookup.hash_key); + + if (!sce) { + sce = stat_cache_entry_init(); + + buffer_copy_string_buffer(sce->name, local->fop.lookup.name); + g_hash_table_insert(sc->files, 
buffer_init_string(BUF_STR(local->fop.lookup.hash_key)), sce); + } + + sce->state = STAT_CACHE_ENTRY_STAT_FINISHED; + sce->stat_ts = time (NULL); + memcpy (&sce->st, st, sizeof (*st)); + } + + g_async_queue_push (local->srv->joblist_queue, local->con); + /* + joblist_append (local->srv, local->con); + kill (getpid(), SIGUSR1); + */ + free (local); + return 0; +} + +static handler_t +glusterfs_stat_cache_get_entry_async (server *srv, + connection *con, + plugin_data *p, + buffer *glusterfs_path, + buffer *name, + void *buf, + size_t size, + stat_cache_entry **ret_sce) +{ + stat_cache_entry *sce = NULL; + stat_cache *sc; + glusterfs_async_local_t *local = NULL; + + *ret_sce = NULL; + + /* + * check if the directory for this file has changed + */ + + sc = srv->stat_cache; + + buffer_copy_string_buffer(sc->hash_key, name); + buffer_append_long(sc->hash_key, con->conf.follow_symlink); + + if ((sce = (stat_cache_entry *)g_hash_table_lookup(sc->files, sc->hash_key))) { + /* know this entry already */ + + if (sce->state == STAT_CACHE_ENTRY_STAT_FINISHED && + !buf) { + /* verify that this entry is still fresh */ + + *ret_sce = sce; + + return HANDLER_GO_ON; + } + } + + + /* + * *lol* + * - open() + fstat() on a named-pipe results in a (intended) hang. 
+ * - stat() if regular file + open() to see if we can read from it is better + * + * */ + + /* pass a job to the stat-queue */ + + local = calloc (1, sizeof (*local)); + /* ERR_ABORT (local); */ + local->con = con; + local->srv = srv; + local->p = p; + local->fop.lookup.name = buffer_init_buffer (name); + local->fop.lookup.hash_key = buffer_init_buffer (sc->hash_key); + local->fop.lookup.size = size; + + if (glusterfs_lookup_async ((libglusterfs_handle_t )p->conf.handle, glusterfs_path->ptr, buf, size, glusterfs_lookup_async_cbk, (void *) local)) { + free (local); + return HANDLER_ERROR; + } + + return HANDLER_WAIT_FOR_EVENT; +} + +int +mod_glusterfs_readv_async_cbk (glusterfs_read_buf_t *buf, + void *cbk_data) +{ + glusterfs_async_local_t *local = cbk_data; + pthread_mutex_lock (&local->lock); + { + local->fop.readv.async_read_complete = 1; + local->fop.readv.buf = buf; + + pthread_cond_signal (&local->cond); + } + pthread_mutex_unlock (&local->lock); + + return 0; +} + +network_status_t +mod_glusterfs_read_async (server *srv, connection *con, chunk *glusterfs_chunk) +{ + glusterfs_async_local_t local; + off_t end = 0; + int nbytes; + int complete; + chunkqueue *cq = NULL; + chunk *c = NULL; + off_t offset = glusterfs_chunk->file.start; + size_t length = glusterfs_chunk->file.length; + unsigned long fd = (unsigned long)glusterfs_chunk->file.name; + network_status_t ret; + + pthread_cond_init (&local.cond, NULL); + pthread_mutex_init (&local.lock, NULL); + + //local.fd = fd; + memset (&local, 0, sizeof (local)); + + if (length > 0) + end = offset + length; + + cq = chunkqueue_init (); + if (!cq) { + con->http_status = 500; + return NETWORK_STATUS_FATAL_ERROR; + } + + do { + glusterfs_read_buf_t *buf; + int i; + if (length > 0) { + nbytes = end - offset; + if (nbytes > GLUSTERFS_CHUNK_SIZE) + nbytes = GLUSTERFS_CHUNK_SIZE; + } else + nbytes = GLUSTERFS_CHUNK_SIZE; + + glusterfs_read_async(fd, + nbytes, + offset, + mod_glusterfs_readv_async_cbk, + (void *)&local); + 
+ pthread_mutex_lock (&local.lock); + { + while (!local.fop.readv.async_read_complete) { + pthread_cond_wait (&local.cond, &local.lock); + } + + local.op_ret = local.fop.readv.buf->op_ret; + local.op_errno = local.fop.readv.buf->op_errno; + + local.fop.readv.async_read_complete = 0; + buf = local.fop.readv.buf; + + if ((int)length < 0) + complete = (local.fop.readv.buf->op_ret <= 0); + else { + local.fop.readv.read_bytes += local.fop.readv.buf->op_ret; + complete = ((local.fop.readv.read_bytes == length) || (local.fop.readv.buf->op_ret <= 0)); + } + } + pthread_mutex_unlock (&local.lock); + + if (local.op_ret > 0) { + for (i = 0; i < buf->count; i++) { + buffer *nw_write_buf = chunkqueue_get_append_buffer (cq); + + nw_write_buf->used = nw_write_buf->size = buf->vector[i].iov_len + 1; + nw_write_buf->ptr = buf->vector[i].iov_base; + + // buffer_copy_memory (nw_write_buf, buf->vector[i].iov_base, buf->vector[i].iov_len + 1); + offset += local.op_ret; + } + + ret = network_backend_write (srv, con, con->sock, cq); + + if (chunkqueue_written (cq) != local.op_ret) { + mod_glusterfs_chunkqueue *gf_cq; + glusterfs_chunk->file.start = offset; + if ((int)glusterfs_chunk->file.length > 0) + glusterfs_chunk->file.length -= local.fop.readv.read_bytes; + + gf_cq = calloc (1, sizeof (*gf_cq)); + /* ERR_ABORT (qf_cq); */ + gf_cq->cq = cq; + gf_cq->buf = buf; + gf_cq->length = local.op_ret; + glusterfs_chunk->file.mmap.start = (char *)gf_cq; + return ret; + } + + for (c = cq->first ; c; c = c->next) + c->mem->ptr = NULL; + + chunkqueue_reset (cq); + } + + glusterfs_free (buf); + } while (!complete); + + chunkqueue_free (cq); + glusterfs_close (fd); + + if (local.op_ret < 0) + con->http_status = 500; + + return (local.op_ret < 0 ? 
NETWORK_STATUS_FATAL_ERROR : NETWORK_STATUS_SUCCESS); +} + +network_status_t mod_glusterfs_network_backend_write(struct server *srv, connection *con, iosocket *sock, chunkqueue *cq) +{ + chunk *c, *prev, *first; + int chunks_written = 0; + int error = 0; + network_status_t ret; + + for (first = prev = c = cq->first; c; c = c->next, chunks_written++) { + + if (c->type == MEM_CHUNK && c->mem->used && !c->mem->ptr) { + if (cq->first != c) { + prev->next = NULL; + + /* call stored network_backend_write */ + ret = network_backend_write (srv, con, sock, cq); + + prev->next = c; + if (ret != NETWORK_STATUS_SUCCESS) { + cq->first = first; + return ret; + } + } + cq->first = c->next; + + if (c->file.fd < 0) { + error = HANDLER_ERROR; + break; + } + + if (c->file.mmap.start) { + chunk *tmp; + size_t len; + mod_glusterfs_chunkqueue *gf_cq = (mod_glusterfs_chunkqueue *)c->file.mmap.start; + + ret = network_backend_write (srv, con, sock, gf_cq->cq); + + if ((len = (size_t)chunkqueue_written (gf_cq->cq)) != gf_cq->length) { + gf_cq->length -= len; + cq->first = first; + chunkqueue_remove_finished_chunks (gf_cq->cq); + return ret; + } + + for (tmp = gf_cq->cq->first ; tmp; tmp = tmp->next) + tmp->mem->ptr = NULL; + + chunkqueue_free (gf_cq->cq); + glusterfs_free (gf_cq->buf); + free (gf_cq); + c->file.mmap.start = NULL; + } + + ret = mod_glusterfs_read_async (srv, con, c); //c->file.fd, c->file.start, -1);//c->file.length); + if (c->file.mmap.start) { + /* pending chunkqueue from mod_glusterfs_read_async to be written to network */ + cq->first = first; + return ret; + } + + buffer_free (c->mem); + c->mem = NULL; + + c->type = FILE_CHUNK; + c->offset = c->file.length = 0; + c->file.name = NULL; + + if (first == c) + first = c->next; + + if (cq->last == c) + cq->last = NULL; + + prev->next = c->next; + + free(c); + } + prev = c; + } + + ret = network_backend_write (srv, con, sock, cq); + + cq->first = first; + + return ret; +} + +#if 0 +int chunkqueue_append_glusterfs_file 
(chunkqueue *cq, unsigned long fd, off_t offset, off_t len) +{ + chunk *c = NULL; + c = chunkqueue_get_append_tempfile (cq); + + if (c->file.is_temp) { + close (c->file.fd); + unlink (c->file.name->ptr); + } + + c->type = MEM_CHUNK; + + c->mem = buffer_init (); + c->mem->used = len + 1; + c->mem->ptr = NULL; + c->offset = 0; + + /* buffer_copy_string_buffer (c->file.name, fn); */ + c->file.start = offset; + c->file.length = len; + /* buffer_free (c->file.name); */ + + /* identify chunk as glusterfs related */ + c->file.mmap.start = MAP_FAILED; + /* c->file.mmap.length = c->file.mmap.offset = len;*/ + + return 0; +} +#endif + +int chunkqueue_append_dummy_mem_chunk (chunkqueue *cq, off_t len) +{ + chunk *c = NULL; + c = chunkqueue_get_append_tempfile (cq); + + if (c->file.is_temp) { + close (c->file.fd); + unlink (c->file.name->ptr); + c->file.is_temp = 0; + } + + c->type = MEM_CHUNK; + + c->mem->used = len + 1; + c->offset = len; + c->mem->ptr = NULL; + + return 0; +} + +int chunkqueue_append_glusterfs_file (chunkqueue *cq, unsigned long fd, off_t offset, off_t len) +{ + chunk *c = NULL; + c = chunkqueue_get_append_tempfile (cq); + + if (c->file.is_temp) { + close (c->file.fd); + unlink (c->file.name->ptr); + c->file.is_temp = 0; + } + + c->type = MEM_CHUNK; + + c->mem = buffer_init (); + c->mem->used = len + 1; + c->mem->ptr = NULL; + c->offset = 0; + + /* buffer_copy_string_buffer (c->file.name, fn); */ + buffer_free (c->file.name); + + /* fd returned by libglusterfsclient is a pointer */ + c->file.name = (buffer *)fd; + c->file.start = offset; + c->file.length = len; + + //c->file.fd = fd; + c->file.mmap.start = NULL; + return 0; +} + +/* init the plugin data */ +INIT_FUNC(mod_glusterfs_init) { + plugin_data *p; + + UNUSED (srv); + p = calloc(1, sizeof(*p)); + /* ERR_ABORT (p); */ + network_backend_write = NULL; + p->ranges = http_request_range_init(); + return p; +} + +/* detroy the plugin data */ +FREE_FUNC(mod_glusterfs_free) { + plugin_data *p = p_d; + + 
UNUSED (srv); + + if (!p) return HANDLER_GO_ON; + + if (p->config_storage) { + size_t i; + for (i = 0; i < srv->config_context->used; i++) { + plugin_config *s = p->config_storage[i]; + + buffer_free (s->logfile); + buffer_free (s->loglevel); + buffer_free (s->specfile); + buffer_free (s->prefix); + buffer_free (s->xattr_file_size); + buffer_free (s->document_root); + array_free (s->exclude_exts); + + free (s); + } + free (p->config_storage); + } + buffer_free (p->range_buf); + http_request_range_free (p->ranges); + + free (p); + + return HANDLER_GO_ON; +} + +SETDEFAULTS_FUNC(mod_glusterfs_set_defaults) { + plugin_data *p = p_d; + size_t i = 0; + + config_values_t cv[] = { + { "glusterfs.logfile", NULL, T_CONFIG_STRING, T_CONFIG_SCOPE_CONNECTION }, + + { "glusterfs.loglevel", NULL, T_CONFIG_STRING, T_CONFIG_SCOPE_CONNECTION }, + { "glusterfs.volume-specfile", NULL, T_CONFIG_STRING, T_CONFIG_SCOPE_CONNECTION }, + { "glusterfs.cache-timeout", NULL, T_CONFIG_SHORT, T_CONFIG_SCOPE_CONNECTION }, + + { "glusterfs.exclude-extensions", NULL, T_CONFIG_ARRAY, T_CONFIG_SCOPE_CONNECTION }, + + /*TODO: get the prefix from config_conext and remove glusterfs.prefix from conf file */ + { "glusterfs.prefix", NULL, T_CONFIG_STRING, T_CONFIG_SCOPE_CONNECTION }, + + { "glusterfs.xattr-interface-size-limit", NULL, T_CONFIG_STRING, T_CONFIG_SCOPE_CONNECTION }, + + { "glusterfs.document-root", NULL, T_CONFIG_STRING, T_CONFIG_SCOPE_CONNECTION }, + + { NULL, NULL, T_CONFIG_UNSET, T_CONFIG_SCOPE_UNSET } + }; + + p->config_storage = calloc(1, srv->config_context->used * sizeof(specific_config *)); + /* ERR_ABORT (p->config_storage); */ + p->range_buf = buffer_init (); + + for (i = 0; i < srv->config_context->used; i++) { + plugin_config *s; + + s = calloc(1, sizeof(plugin_config)); + /* ERR_ABORT (s); */ + s->logfile = buffer_init (); + s->loglevel = buffer_init (); + s->specfile = buffer_init (); + s->exclude_exts = array_init (); + s->prefix = buffer_init (); + s->xattr_file_size = 
buffer_init (); + s->document_root = buffer_init (); + + cv[0].destination = s->logfile; + cv[1].destination = s->loglevel; + cv[2].destination = s->specfile; + cv[3].destination = &s->cache_timeout; + cv[4].destination = s->exclude_exts; + cv[5].destination = s->prefix; + cv[6].destination = s->xattr_file_size; + cv[7].destination = s->document_root; + + p->config_storage[i] = s; + + if (0 != config_insert_values_global(srv, ((data_config *)srv->config_context->data[i])->value, cv)) { + return HANDLER_FINISHED; + } + } + + return HANDLER_GO_ON; +} + +#define PATCH(x) \ + p->conf.x = s->x; + +static int mod_glusterfs_patch_connection(server *srv, connection *con, plugin_data *p) { + size_t i, j; + plugin_config *s; + + p->conf.logfile = NULL; + p->conf.loglevel = NULL; + p->conf.specfile = NULL; + p->conf.cache_timeout = 0; + p->conf.exclude_exts = NULL; + p->conf.prefix = NULL; + p->conf.xattr_file_size = NULL; + p->conf.exclude_exts = NULL; + + /* skip the first, the global context */ + /* glusterfs related config can only occur inside $HTTP["url"] == "<glusterfs-prefix>" */ + for (i = 1; i < srv->config_context->used; i++) { + data_config *dc = (data_config *)srv->config_context->data[i]; + s = p->config_storage[i]; + + /* condition didn't match */ + if (!config_check_cond(srv, con, dc)) continue; + + /* merge config */ + for (j = 0; j < dc->value->used; j++) { + data_unset *du = dc->value->data[j]; + + if (buffer_is_equal_string (du->key, CONST_STR_LEN("glusterfs.logfile"))) { + PATCH (logfile); + } else if (buffer_is_equal_string (du->key, CONST_STR_LEN("glusterfs.loglevel"))) { + PATCH (loglevel); + } else if (buffer_is_equal_string (du->key, CONST_STR_LEN ("glusterfs.volume-specfile"))) { + PATCH (specfile); + } else if (buffer_is_equal_string (du->key, CONST_STR_LEN("glusterfs.cache-timeout"))) { + PATCH (cache_timeout); + } else if (buffer_is_equal_string (du->key, CONST_STR_LEN ("glusterfs.exclude-extensions"))) { + PATCH (exclude_exts); + } else if 
(buffer_is_equal_string (du->key, CONST_STR_LEN ("glusterfs.prefix"))) { + PATCH (prefix); + } else if (buffer_is_equal_string (du->key, CONST_STR_LEN ("glusterfs.xattr-interface-size-limit"))) { + PATCH (xattr_file_size); + } else if (buffer_is_equal_string (du->key, CONST_STR_LEN ("glusterfs.document-root"))) { + PATCH (document_root); + } + } + } + return 0; +} + +#undef PATCH + +static int http_response_parse_range(server *srv, connection *con, plugin_data *p) { + int multipart = 0; + char *boundary = "fkj49sn38dcn3"; + data_string *ds; + stat_cache_entry *sce = NULL; + buffer *content_type = NULL; + buffer *range = NULL; + http_req_range *ranges, *r; + mod_glusterfs_ctx_t *ctx = con->plugin_ctx[p->id]; + size_t size = 0; + + if (!ctx) { + return -1; + } + + if (NULL != (ds = (data_string *)array_get_element(con->request.headers, CONST_STR_LEN("Range")))) { + range = ds->value; + } else { + /* we don't have a Range header */ + + return -1; + } + + if (HANDLER_ERROR == stat_cache_get_entry(srv, con, con->physical.path, &sce)) { + SEGFAULT(); + } + + ctx->response_content_length = con->response.content_length = 0; + + if (NULL != (ds = (data_string *)array_get_element(con->response.headers, CONST_STR_LEN("Content-Type")))) { + content_type = ds->value; + } + + /* start the range-header parser + * bytes=<num> */ + + ranges = p->ranges; + http_request_range_reset(ranges); + switch (http_request_range_parse(range, ranges)) { + case PARSE_ERROR: + return -1; /* no range valid Range Header */ + case PARSE_SUCCESS: + break; + default: + TRACE("%s", "foobar"); + return -1; + } + + if (ranges->next) { + multipart = 1; + } + + if (p->conf.xattr_file_size && p->conf.xattr_file_size->ptr) { + size = atoi (p->conf.xattr_file_size->ptr); + } + + /* patch the '-1' */ + for (r = ranges; r; r = r->next) { + if (r->start == -1) { + /* -<end> + * + * the last <end> bytes */ + r->start = sce->st.st_size - r->end; + r->end = sce->st.st_size - 1; + } + if (r->end == -1) { + /* 
<start>- + * all but the first <start> bytes */ + + r->end = sce->st.st_size - 1; + } + + if (r->end > sce->st.st_size - 1) { + /* RFC 2616 - 14.35.1 + * + * if last-byte-pos not present or > size-of-file + * take the size-of-file + * + * */ + r->end = sce->st.st_size - 1; + } + + if (r->start > sce->st.st_size - 1) { + /* RFC 2616 - 14.35.1 + * + * if first-byte-pos > file-size, 416 + */ + + con->http_status = 416; + return -1; + } + + if (r->start > r->end) { + /* RFC 2616 - 14.35.1 + * + * if last-byte-pos is present, it has to be >= first-byte-pos + * + * invalid ranges have to be handle as no Range specified + * */ + + return -1; + } + } + + if (r) { + /* we ran into an range violation */ + return -1; + } + + if (multipart) { + buffer *b; + for (r = ranges; r; r = r->next) { + /* write boundary-header */ + + b = chunkqueue_get_append_buffer(con->send); + + buffer_copy_string(b, "\r\n--"); + buffer_append_string(b, boundary); + + /* write Content-Range */ + buffer_append_string(b, "\r\nContent-Range: bytes "); + buffer_append_off_t(b, r->start); + buffer_append_string(b, "-"); + buffer_append_off_t(b, r->end); + buffer_append_string(b, "/"); + buffer_append_off_t(b, sce->st.st_size); + + buffer_append_string(b, "\r\nContent-Type: "); + buffer_append_string_buffer(b, content_type); + + /* write END-OF-HEADER */ + buffer_append_string(b, "\r\n\r\n"); + + con->response.content_length += b->used - 1; + ctx->response_content_length += b->used - 1; + con->send->bytes_in += b->used - 1; + + if ((size_t)sce->st.st_size > size) { + chunkqueue_append_glusterfs_file(con->send_raw, ctx->fd, r->start, r->end - r->start + 1); + con->send_raw->bytes_in += (r->end - r->start + 1); + chunkqueue_append_dummy_mem_chunk (con->send, r->end - r->start + 1); + } else { + chunkqueue_append_mem (con->send, ((char *)ctx->buf) + r->start, r->end - r->start + 1); + free (ctx->buf); + ctx->buf = NULL; + } + + con->response.content_length += r->end - r->start + 1; + 
ctx->response_content_length += r->end - r->start + 1; + con->send->bytes_in += r->end - r->start + 1; + } + + /* add boundary end */ + b = chunkqueue_get_append_buffer(con->send); + + buffer_copy_string_len(b, "\r\n--", 4); + buffer_append_string(b, boundary); + buffer_append_string_len(b, "--\r\n", 4); + + con->response.content_length += b->used - 1; + ctx->response_content_length += b->used - 1; + con->send->bytes_in += b->used - 1; + + /* set header-fields */ + + buffer_copy_string(p->range_buf, "multipart/byteranges; boundary="); + buffer_append_string(p->range_buf, boundary); + + /* overwrite content-type */ + response_header_overwrite(srv, con, CONST_STR_LEN("Content-Type"), CONST_BUF_LEN(p->range_buf)); + + } else { + r = ranges; + + chunkqueue_append_glusterfs_file(con->send_raw, ctx->fd, r->start, r->end - r->start + 1); + con->send_raw->bytes_in += (r->end - r->start + 1); + chunkqueue_append_dummy_mem_chunk (con->send, r->end - r->start + 1); + con->response.content_length += r->end - r->start + 1; + ctx->response_content_length += r->end - r->start + 1; + con->send->bytes_in += r->end - r->start + 1; + + buffer_copy_string(p->range_buf, "bytes "); + buffer_append_off_t(p->range_buf, r->start); + buffer_append_string(p->range_buf, "-"); + buffer_append_off_t(p->range_buf, r->end); + buffer_append_string(p->range_buf, "/"); + buffer_append_off_t(p->range_buf, sce->st.st_size); + + response_header_insert(srv, con, CONST_STR_LEN("Content-Range"), CONST_BUF_LEN(p->range_buf)); + } + + /* ok, the file is set-up */ + return 0; +} + +PHYSICALPATH_FUNC(mod_glusterfs_handle_physical) { + plugin_data *p = p_d; + stat_cache_entry *sce; + size_t size = 0; + handler_t ret = 0; + mod_glusterfs_ctx_t *plugin_ctx = NULL; + + if (con->http_status != 0) return HANDLER_GO_ON; + if (con->uri.path->used == 0) return HANDLER_GO_ON; + if (con->physical.path->used == 0) return HANDLER_GO_ON; + + if (con->mode != DIRECT) return HANDLER_GO_ON; + + /* + network_backend_write = 
srv->network_backend_write; + srv->network_backend_write = mod_glusterfs_network_backend_write; + */ + + switch (con->request.http_method) { + case HTTP_METHOD_GET: + case HTTP_METHOD_POST: + case HTTP_METHOD_HEAD: + break; + + default: + return HANDLER_GO_ON; + } + + mod_glusterfs_patch_connection(srv, con, p); + + if (!p->conf.prefix || !p->conf.prefix->ptr) { + return HANDLER_GO_ON; + } + + if (!p->conf.document_root || p->conf.document_root->used == 0) { + log_error_write(srv, __FILE__, __LINE__, "s", "glusterfs.document-root is not specified"); + con->http_status = 500; + return HANDLER_FINISHED; + } + + if (p->conf.handle <= 0) { + glusterfs_init_ctx_t ctx; + + if (!p->conf.specfile || p->conf.specfile->used == 0) { + return HANDLER_GO_ON; + } + memset (&ctx, 0, sizeof (ctx)); + + ctx.specfile = p->conf.specfile->ptr; + ctx.logfile = p->conf.logfile->ptr; + ctx.loglevel = p->conf.loglevel->ptr; + ctx.lookup_timeout = ctx.stat_timeout = p->conf.cache_timeout; + + p->conf.handle = (unsigned long)glusterfs_init (&ctx); + + if (p->conf.handle <= 0) { + con->http_status = 500; + log_error_write(srv, __FILE__, __LINE__, "sbs", "glusterfs initialization failed, please check your configuration. Glusterfs logfile ", p->conf.logfile, "might contain details"); + return HANDLER_FINISHED; + } + } + + size = 0; + if (p->conf.xattr_file_size && p->conf.xattr_file_size->ptr) + size = atoi (p->conf.xattr_file_size->ptr); + + if (!con->plugin_ctx[p->id]) { + buffer *tmp_buf = buffer_init_buffer (con->physical.basedir); + + plugin_ctx = calloc (1, sizeof (*plugin_ctx)); + /* ERR_ABORT (plugin_ctx); */ + con->plugin_ctx[p->id] = plugin_ctx; + + buffer_append_string_buffer (tmp_buf, p->conf.prefix); + buffer_path_simplify (tmp_buf, tmp_buf); + + plugin_ctx->prefix = tmp_buf->used - 1; + if (tmp_buf->ptr[plugin_ctx->prefix - 1] == '/') + plugin_ctx->prefix--; + + buffer_free (tmp_buf); + } else + /*FIXME: error!! error!! 
*/ + plugin_ctx = con->plugin_ctx[p->id]; + + + if (size) + { + plugin_ctx->buf = MALLOC (size); + /* ERR_ABORT (plugin_ctx->buf); */ + } + + plugin_ctx->glusterfs_path = buffer_init (); + buffer_copy_string_buffer (plugin_ctx->glusterfs_path, p->conf.document_root); + buffer_append_string (plugin_ctx->glusterfs_path, "/"); + buffer_append_string (plugin_ctx->glusterfs_path, con->physical.path->ptr + plugin_ctx->prefix); + buffer_path_simplify (plugin_ctx->glusterfs_path, plugin_ctx->glusterfs_path); + + ret = glusterfs_stat_cache_get_entry_async (srv, con, p, plugin_ctx->glusterfs_path, con->physical.path, plugin_ctx->buf, size, &sce); + + if (ret == HANDLER_ERROR) { + free (plugin_ctx->buf); + plugin_ctx->buf = NULL; + + buffer_free (plugin_ctx->glusterfs_path); + plugin_ctx->glusterfs_path = NULL; + + free (plugin_ctx); + con->plugin_ctx[p->id] = NULL; + + con->http_status = 500; + ret = HANDLER_FINISHED; + } + + return ret; +} + +URIHANDLER_FUNC(mod_glusterfs_subrequest) { + plugin_data *p = p_d; + stat_cache_entry *sce = NULL; + int s_len; + unsigned long fd; + char allow_caching = 1; + size_t size = 0; + mod_glusterfs_ctx_t *ctx = con->plugin_ctx[p->id]; + + /* someone else has done a decision for us */ + if (con->http_status != 0) return HANDLER_GO_ON; + if (con->uri.path->used == 0) return HANDLER_GO_ON; + if (con->physical.path->used == 0) return HANDLER_GO_ON; + + /* someone else has handled this request */ + if (con->mode != DIRECT) return HANDLER_GO_ON; + + /* we only handle GET, POST and HEAD */ + switch(con->request.http_method) { + case HTTP_METHOD_GET: + case HTTP_METHOD_POST: + case HTTP_METHOD_HEAD: + break; + default: + return HANDLER_GO_ON; + } + + mod_glusterfs_patch_connection(srv, con, p); + + if (!p->conf.prefix || !p->conf.prefix->ptr) + return HANDLER_GO_ON; + + if (!ctx) { + con->http_status = 500; + return HANDLER_FINISHED; + } + + s_len = con->uri.path->used - 1; + /* ignore certain extensions */ + /* + for (k = 0; k < 
p->conf.exclude_exts->used; k++) { + data_string *ds; + ds = (data_string *)p->conf.exclude_exts->data[k]; + + if (ds->value->used == 0) continue; + + if (!strncmp (ds->value->ptr, con->uri.path->ptr, strlen (ds->value->ptr))) + break; + } + + if (k == p->conf.exclude_exts->used) { + return HANDLER_GO_ON; + } + */ + + if (con->conf.log_request_handling) { + log_error_write(srv, __FILE__, __LINE__, "s", "-- serving file from glusterfs"); + } + + if (HANDLER_ERROR == stat_cache_get_entry(srv, con, con->physical.path, &sce)) { + con->http_status = 403; + + /* this might happen if the sce is removed from stat-cache after a successful glusterfs_lookup */ + if (ctx) { + if (ctx->buf) { + free (ctx->buf); + ctx->buf = NULL; + } + + if (ctx->glusterfs_path) { + buffer_free (ctx->glusterfs_path); + ctx->glusterfs_path = NULL; + } + free (ctx); + con->plugin_ctx[p->id] = NULL; + } + + log_error_write(srv, __FILE__, __LINE__, "sbsb", + "not a regular file:", con->uri.path, + "->", con->physical.path); + + return HANDLER_FINISHED; + } + + if (con->uri.path->ptr[s_len] == '/' || !S_ISREG(sce->st.st_mode)) { + if (ctx) { + if (ctx->glusterfs_path) { + buffer_free (ctx->glusterfs_path); + ctx->glusterfs_path = NULL; + } + + free (ctx); + con->plugin_ctx[p->id] = NULL; + } + + return HANDLER_FINISHED; + } + + if (p->conf.xattr_file_size && p->conf.xattr_file_size->ptr) + size = atoi (p->conf.xattr_file_size->ptr); + + if ((size_t)sce->st.st_size > size) { + + fd = glusterfs_open ((libglusterfs_handle_t ) ((unsigned long)p->conf.handle), ctx->glusterfs_path->ptr, O_RDONLY, 0); + + if (!fd) { + if (ctx) { + if (ctx->glusterfs_path) { + buffer_free (ctx->glusterfs_path); + ctx->glusterfs_path = NULL; + } + + free (ctx); + con->plugin_ctx[p->id] = NULL; + } + + con->http_status = 403; + return HANDLER_FINISHED; + } + ctx->fd = fd; + } + + /* we only handline regular files */ +#ifdef HAVE_LSTAT + if ((sce->is_symlink == 1) && !con->conf.follow_symlink) { + con->http_status = 403; + + 
if (con->conf.log_request_handling) { + log_error_write(srv, __FILE__, __LINE__, "s", "-- access denied due symlink restriction"); + log_error_write(srv, __FILE__, __LINE__, "sb", "Path :", con->physical.path); + } + + buffer_reset(con->physical.path); + if (ctx) { + if (ctx->glusterfs_path) { + buffer_free (ctx->glusterfs_path); + ctx->glusterfs_path = NULL; + } + + free (ctx); + con->plugin_ctx[p->id] = NULL; + } + + return HANDLER_FINISHED; + } +#endif + if (!S_ISREG(sce->st.st_mode)) { + con->http_status = 404; + + if (con->conf.log_file_not_found) { + log_error_write(srv, __FILE__, __LINE__, "sbsb", + "not a regular file:", con->uri.path, + "->", sce->name); + } + + if (ctx) { + if (ctx->glusterfs_path) { + buffer_free (ctx->glusterfs_path); + ctx->glusterfs_path = NULL; + } + + free (ctx); + con->plugin_ctx[p->id] = NULL; + } + + return HANDLER_FINISHED; + } + + /* mod_compress might set several data directly, don't overwrite them */ + + /* set response content-type, if not set already */ + + if (NULL == array_get_element(con->response.headers, CONST_STR_LEN("Content-Type"))) { + if (buffer_is_empty(sce->content_type)) { + /* we are setting application/octet-stream, but also announce that + * this header field might change in the seconds few requests + * + * This should fix the aggressive caching of FF and the script download + * seen by the first installations + */ + response_header_overwrite(srv, con, CONST_STR_LEN("Content-Type"), CONST_STR_LEN("application/octet-stream")); + + allow_caching = 0; + } else { + response_header_overwrite(srv, con, CONST_STR_LEN("Content-Type"), CONST_BUF_LEN(sce->content_type)); + } + } + + if (con->conf.range_requests) { + response_header_overwrite(srv, con, CONST_STR_LEN("Accept-Ranges"), CONST_STR_LEN("bytes")); + } + + /* TODO: Allow Cachable requests */ +#if 0 + if (allow_caching) { + if (p->conf.etags_used && con->etag_flags != 0 && !buffer_is_empty(sce->etag)) { + if (NULL == array_get_element(con->response.headers, 
"ETag")) { + /* generate e-tag */ + etag_mutate(con->physical.etag, sce->etag); + + response_header_overwrite(srv, con, CONST_STR_LEN("ETag"), CONST_BUF_LEN(con->physical.etag)); + } + } + + /* prepare header */ + if (NULL == (ds = (data_string *)array_get_element(con->response.headers, "Last-Modified"))) { + mtime = strftime_cache_get(srv, sce->st.st_mtime); + response_header_overwrite(srv, con, CONST_STR_LEN("Last-Modified"), CONST_BUF_LEN(mtime)); + } else { + mtime = ds->value; + } + + if (HANDLER_FINISHED == http_response_handle_cachable(srv, con, mtime)) { + if (ctx) { + if (ctx->glusterfs_path) { + buffer_free (ctx->glusterfs_path); + ctx->glusterfs_path = NULL; + } + + free (ctx); + con->plugin_ctx[p->id] = NULL; + } + + return HANDLER_FINISHED; + } + } +#endif + + /*TODO: Read about etags */ + if (NULL != array_get_element(con->request.headers, CONST_STR_LEN("Range")) && con->conf.range_requests) { + int do_range_request = 1; + data_string *ds = NULL; + buffer *mtime = NULL; + /* check if we have a conditional GET */ + + /* prepare header */ + if (NULL == (ds = (data_string *)array_get_element(con->response.headers, CONST_STR_LEN("Last-Modified")))) { + mtime = strftime_cache_get(srv, sce->st.st_mtime); + response_header_overwrite(srv, con, CONST_STR_LEN("Last-Modified"), CONST_BUF_LEN(mtime)); + } else { + mtime = ds->value; + } + + if (NULL != (ds = (data_string *)array_get_element(con->request.headers, CONST_STR_LEN("If-Range")))) { + /* if the value is the same as our ETag, we do a Range-request, + * otherwise a full 200 */ + + if (ds->value->ptr[0] == '"') { + /** + * client wants a ETag + */ + if (!con->physical.etag) { + do_range_request = 0; + } else if (!buffer_is_equal(ds->value, con->physical.etag)) { + do_range_request = 0; + } + } else if (!mtime) { + /** + * we don't have a Last-Modified and can match the If-Range: + * + * sending all + */ + do_range_request = 0; + } else if (!buffer_is_equal(ds->value, mtime)) { + do_range_request = 0; + } + 
} + + if (do_range_request) { + /* content prepared, I'm done */ + con->send->is_closed = 1; + + if (0 == http_response_parse_range(srv, con, p)) { + con->http_status = 206; + } + if (ctx) { + if (ctx->glusterfs_path) { + buffer_free (ctx->glusterfs_path); + ctx->glusterfs_path = NULL; + } + free (ctx); + con->plugin_ctx[p->id] = NULL; + } + + return HANDLER_FINISHED; + } + } + + /* if we are still here, prepare body */ + + /* we add it here for all requests + * the HEAD request will drop it afterwards again + */ + + if (p->conf.xattr_file_size && p->conf.xattr_file_size->ptr) + size = atoi (p->conf.xattr_file_size->ptr); + + if (size < (size_t)sce->st.st_size) { + chunkqueue_append_glusterfs_file (con->send_raw, fd, 0, sce->st.st_size); + con->send_raw->bytes_in += sce->st.st_size; + chunkqueue_append_dummy_mem_chunk (con->send, sce->st.st_size); + } else { + if (!ctx->buf) { + con->http_status = 404; + return HANDLER_ERROR; + } + chunkqueue_append_glusterfs_mem (con->send, ctx->buf, sce->st.st_size); + ctx->buf = NULL; + } + ctx->response_content_length = con->response.content_length = sce->st.st_size; + + con->send->is_closed = 1; + con->send->bytes_in = sce->st.st_size; + + return HANDLER_FINISHED; +} + +/* this function is called at dlopen() time and inits the callbacks */ +CONNECTION_FUNC(mod_glusterfs_connection_reset) +{ + (void) p_d; + (void) con; + if (!network_backend_write) + network_backend_write = srv->network_backend_write; + + srv->network_backend_write = mod_glusterfs_network_backend_write; + + return HANDLER_GO_ON; +} + +URIHANDLER_FUNC(mod_glusterfs_response_done) { + plugin_data *p = p_d; + UNUSED (srv); + mod_glusterfs_ctx_t *ctx = con->plugin_ctx[p->id]; + + con->plugin_ctx[p->id] = NULL; + if (ctx->glusterfs_path) { + free (ctx->glusterfs_path); + } + + free (ctx); + return HANDLER_GO_ON; +} + +int mod_glusterfs_plugin_init(plugin *p) { + p->version = LIGHTTPD_VERSION_ID; + p->name = buffer_init_string("glusterfs"); + p->init = 
mod_glusterfs_init; + p->handle_physical = mod_glusterfs_handle_physical; + p->handle_start_backend = mod_glusterfs_subrequest; + p->handle_response_done = mod_glusterfs_response_done; + p->set_defaults = mod_glusterfs_set_defaults; + p->connection_reset = mod_glusterfs_connection_reset; + p->cleanup = mod_glusterfs_free; + + p->data = NULL; + + return 0; +} diff --git a/mod_glusterfs/lighttpd/1.5/mod_glusterfs.h b/mod_glusterfs/lighttpd/1.5/mod_glusterfs.h new file mode 100644 index 000000000..b765936d6 --- /dev/null +++ b/mod_glusterfs/lighttpd/1.5/mod_glusterfs.h @@ -0,0 +1,29 @@ +/* + Copyright (c) 2008 Z RESEARCH, Inc. <http://www.zresearch.com> + This file is part of GlusterFS. + + GlusterFS is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published + by the Free Software Foundation; either version 3 of the License, + or (at your option) any later version. + + GlusterFS is distributed in the hope that it will be useful, but + WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program. If not, see + <http://www.gnu.org/licenses/>. 
+*/ + +#ifndef _MOD_GLUSTERFS_FILE_CACHE_H_ +#define _MOD_GLUSTERFS_FILE_CACHE_H_ + +#include "stat_cache.h" +#include <libglusterfsclient.h> +#include "base.h" + +handler_t glusterfs_stat_cache_get_entry(server *srv, connection *con, libglusterfs_handle_t handle, buffer *glusterfs_path, buffer *name, void *buf, size_t size, stat_cache_entry **fce); + +#endif diff --git a/mod_glusterfs/lighttpd/Makefile.am b/mod_glusterfs/lighttpd/Makefile.am new file mode 100644 index 000000000..c934412b3 --- /dev/null +++ b/mod_glusterfs/lighttpd/Makefile.am @@ -0,0 +1,3 @@ +SUBDIRS = 1.4 1.5 + +CLEANFILES = diff --git a/scheduler/Makefile.am b/scheduler/Makefile.am new file mode 100644 index 000000000..618fa7dd8 --- /dev/null +++ b/scheduler/Makefile.am @@ -0,0 +1,3 @@ +SUBDIRS = alu random nufa rr switch + +CLEANFILES = diff --git a/scheduler/alu/Makefile.am b/scheduler/alu/Makefile.am new file mode 100644 index 000000000..d471a3f92 --- /dev/null +++ b/scheduler/alu/Makefile.am @@ -0,0 +1,3 @@ +SUBDIRS = src + +CLEANFILES = diff --git a/scheduler/alu/src/Makefile.am b/scheduler/alu/src/Makefile.am new file mode 100644 index 000000000..eb7d0db07 --- /dev/null +++ b/scheduler/alu/src/Makefile.am @@ -0,0 +1,14 @@ +sched_LTLIBRARIES = alu.la +scheddir = $(libdir)/glusterfs/$(PACKAGE_VERSION)/scheduler + +alu_la_LDFLAGS = -module -avoidversion + +alu_la_SOURCES = alu.c +alu_la_LIBADD = $(top_builddir)/libglusterfs/src/libglusterfs.la + +noinst_HEADERS = alu.h + +AM_CFLAGS = -fPIC -D_FILE_OFFSET_BITS=64 -D_GNU_SOURCE -Wall -D$(GF_HOST_OS)\ + -I$(top_srcdir)/libglusterfs/src -shared -nostartfiles $(GF_CFLAGS) + +CLEANFILES = diff --git a/scheduler/alu/src/alu.c b/scheduler/alu/src/alu.c new file mode 100644 index 000000000..754c5e353 --- /dev/null +++ b/scheduler/alu/src/alu.c @@ -0,0 +1,993 @@ +/* + Copyright (c) 2006, 2007, 2008 Z RESEARCH, Inc. <http://www.zresearch.com> + This file is part of GlusterFS. 
+ + GlusterFS is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published + by the Free Software Foundation; either version 3 of the License, + or (at your option) any later version. + + GlusterFS is distributed in the hope that it will be useful, but + WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program. If not, see + <http://www.gnu.org/licenses/>. +*/ + + + +/* ALU code needs a complete re-write. This is one of the most important + * part of GlusterFS and so needs more and more reviews and testing + */ + +#ifndef _CONFIG_H +#define _CONFIG_H +#include "config.h" +#endif + +#include <sys/time.h> +#include <stdint.h> +#include "stack.h" +#include "alu.h" + +#define ALU_DISK_USAGE_ENTRY_THRESHOLD_DEFAULT (1 * GF_UNIT_GB) +#define ALU_DISK_USAGE_EXIT_THRESHOLD_DEFAULT (512 * GF_UNIT_MB) + +#define ALU_WRITE_USAGE_ENTRY_THRESHOLD_DEFAULT 25 +#define ALU_WRITE_USAGE_EXIT_THRESHOLD_DEFAULT 5 + +#define ALU_READ_USAGE_ENTRY_THRESHOLD_DEFAULT 25 +#define ALU_READ_USAGE_EXIT_THRESHOLD_DEFAULT 5 + +#define ALU_OPEN_FILES_USAGE_ENTRY_THRESHOLD_DEFAULT 1000 +#define ALU_OPEN_FILES_USAGE_EXIT_THRESHOLD_DEFAULT 100 + +#define ALU_LIMITS_TOTAL_DISK_SIZE_DEFAULT 100 + +#define ALU_REFRESH_INTERVAL_DEFAULT 5 +#define ALU_REFRESH_CREATE_COUNT_DEFAULT 5 + + +static int64_t +get_stats_disk_usage (struct xlator_stats *this) +{ + return this->disk_usage; +} + +static int64_t +get_stats_write_usage (struct xlator_stats *this) +{ + return this->write_usage; +} + +static int64_t +get_stats_read_usage (struct xlator_stats *this) +{ + return this->read_usage; +} + +static int64_t +get_stats_disk_speed (struct xlator_stats *this) +{ + return this->disk_speed; +} + +static int64_t +get_stats_file_usage (struct 
xlator_stats *this) +{ + /* Avoid warning "defined but not used" */ + (void) &get_stats_file_usage; + + return this->nr_files; +} + +static int64_t +get_stats_free_disk (struct xlator_stats *this) +{ + if (this->total_disk_size > 0) + return (this->free_disk * 100) / this->total_disk_size; + return 0; +} + +static int64_t +get_max_diff_write_usage (struct xlator_stats *max, struct xlator_stats *min) +{ + return (max->write_usage - min->write_usage); +} + +static int64_t +get_max_diff_read_usage (struct xlator_stats *max, struct xlator_stats *min) +{ + return (max->read_usage - min->read_usage); +} + +static int64_t +get_max_diff_disk_usage (struct xlator_stats *max, struct xlator_stats *min) +{ + return (max->disk_usage - min->disk_usage); +} + +static int64_t +get_max_diff_disk_speed (struct xlator_stats *max, struct xlator_stats *min) +{ + return (max->disk_speed - min->disk_speed); +} + +static int64_t +get_max_diff_file_usage (struct xlator_stats *max, struct xlator_stats *min) +{ + return (max->nr_files - min->nr_files); +} + + +int +alu_parse_options (xlator_t *xl, struct alu_sched *alu_sched) +{ + data_t *order = dict_get (xl->options, "scheduler.alu.order"); + if (!order) { + gf_log (xl->name, GF_LOG_ERROR, + "option 'scheduler.alu.order' not specified"); + return -1; + } + struct alu_threshold *_threshold_fn; + struct alu_threshold *tmp_threshold; + data_t *entry_fn = NULL; + data_t *exit_fn = NULL; + char *tmp_str = NULL; + char *order_str = strtok_r (order->data, ":", &tmp_str); + /* Get the scheduling priority order, specified by the user. 
*/ + while (order_str) { + gf_log ("alu", GF_LOG_DEBUG, + "alu_init: order string: %s", + order_str); + if (strcmp (order_str, "disk-usage") == 0) { + /* Disk usage */ + _threshold_fn = + CALLOC (1, sizeof (struct alu_threshold)); + ERR_ABORT (_threshold_fn); + _threshold_fn->diff_value = get_max_diff_disk_usage; + _threshold_fn->sched_value = get_stats_disk_usage; + entry_fn = + dict_get (xl->options, + "scheduler.alu.disk-usage.entry-threshold"); + if (entry_fn) { + if (gf_string2bytesize (entry_fn->data, + &alu_sched->entry_limit.disk_usage) != 0) { + gf_log (xl->name, GF_LOG_ERROR, + "invalid number format \"%s\" " + "of \"option scheduler.alu." + "disk-usage.entry-threshold\"", + entry_fn->data); + return -1; + } + } else { + alu_sched->entry_limit.disk_usage = ALU_DISK_USAGE_ENTRY_THRESHOLD_DEFAULT; + } + _threshold_fn->entry_value = get_stats_disk_usage; + exit_fn = dict_get (xl->options, + "scheduler.alu.disk-usage.exit-threshold"); + if (exit_fn) { + if (gf_string2bytesize (exit_fn->data, &alu_sched->exit_limit.disk_usage) != 0) { + gf_log (xl->name, GF_LOG_ERROR, + "invalid number format \"%s\" " + "of \"option scheduler.alu." 
+ "disk-usage.exit-threshold\"", + exit_fn->data); + return -1; + } + } else { + alu_sched->exit_limit.disk_usage = ALU_DISK_USAGE_EXIT_THRESHOLD_DEFAULT; + } + _threshold_fn->exit_value = get_stats_disk_usage; + tmp_threshold = alu_sched->threshold_fn; + if (!tmp_threshold) { + alu_sched->threshold_fn = _threshold_fn; + } else { + while (tmp_threshold->next) { + tmp_threshold = tmp_threshold->next; + } + tmp_threshold->next = _threshold_fn; + } + gf_log ("alu", + GF_LOG_DEBUG, "alu_init: = %"PRId64",%"PRId64"", + alu_sched->entry_limit.disk_usage, + alu_sched->exit_limit.disk_usage); + + } else if (strcmp (order_str, "write-usage") == 0) { + /* Handle "write-usage" */ + + _threshold_fn = CALLOC (1, sizeof (struct alu_threshold)); + ERR_ABORT (_threshold_fn); + _threshold_fn->diff_value = get_max_diff_write_usage; + _threshold_fn->sched_value = get_stats_write_usage; + entry_fn = dict_get (xl->options, + "scheduler.alu.write-usage.entry-threshold"); + if (entry_fn) { + if (gf_string2bytesize (entry_fn->data, + &alu_sched->entry_limit.write_usage) != 0) { + gf_log (xl->name, GF_LOG_ERROR, + "invalid number format \"%s\" " + "of option scheduler.alu." + "write-usage.entry-threshold", + entry_fn->data); + return -1; + } + } else { + alu_sched->entry_limit.write_usage = ALU_WRITE_USAGE_ENTRY_THRESHOLD_DEFAULT; + } + _threshold_fn->entry_value = get_stats_write_usage; + exit_fn = dict_get (xl->options, + "scheduler.alu.write-usage.exit-threshold"); + if (exit_fn) { + if (gf_string2bytesize (exit_fn->data, + &alu_sched->exit_limit.write_usage) != 0) { + gf_log (xl->name, GF_LOG_ERROR, + "invalid number format \"%s\"" + " of \"option scheduler.alu." 
+ "write-usage.exit-threshold\"", + exit_fn->data); + return -1; + } + } else { + alu_sched->exit_limit.write_usage = ALU_WRITE_USAGE_EXIT_THRESHOLD_DEFAULT; + } + _threshold_fn->exit_value = get_stats_write_usage; + tmp_threshold = alu_sched->threshold_fn; + if (!tmp_threshold) { + alu_sched->threshold_fn = _threshold_fn; + } else { + while (tmp_threshold->next) { + tmp_threshold = tmp_threshold->next; + } + tmp_threshold->next = _threshold_fn; + } + gf_log (xl->name, GF_LOG_DEBUG, + "alu_init: = %"PRId64",%"PRId64"", + alu_sched->entry_limit.write_usage, + alu_sched->exit_limit.write_usage); + + } else if (strcmp (order_str, "read-usage") == 0) { + /* Read usage */ + + _threshold_fn = CALLOC (1, sizeof (struct alu_threshold)); + ERR_ABORT (_threshold_fn); + _threshold_fn->diff_value = get_max_diff_read_usage; + _threshold_fn->sched_value = get_stats_read_usage; + entry_fn = dict_get (xl->options, + "scheduler.alu.read-usage.entry-threshold"); + if (entry_fn) { + if (gf_string2bytesize (entry_fn->data, + &alu_sched->entry_limit.read_usage) != 0) { + gf_log (xl->name, + GF_LOG_ERROR, + "invalid number format \"%s\" " + "of \"option scheduler.alu." + "read-usage.entry-threshold\"", + entry_fn->data); + return -1; + } + } else { + alu_sched->entry_limit.read_usage = ALU_READ_USAGE_ENTRY_THRESHOLD_DEFAULT; + } + _threshold_fn->entry_value = get_stats_read_usage; + exit_fn = dict_get (xl->options, + "scheduler.alu.read-usage.exit-threshold"); + if (exit_fn) + { + if (gf_string2bytesize (exit_fn->data, + &alu_sched->exit_limit.read_usage) != 0) + { + gf_log ("alu", GF_LOG_ERROR, + "invalid number format \"%s\" " + "of \"option scheduler.alu." 
+ "read-usage.exit-threshold\"", + exit_fn->data); + return -1; + } + } + else + { + alu_sched->exit_limit.read_usage = ALU_READ_USAGE_EXIT_THRESHOLD_DEFAULT; + } + _threshold_fn->exit_value = get_stats_read_usage; + tmp_threshold = alu_sched->threshold_fn; + if (!tmp_threshold) { + alu_sched->threshold_fn = _threshold_fn; + } + else { + while (tmp_threshold->next) { + tmp_threshold = tmp_threshold->next; + } + tmp_threshold->next = _threshold_fn; + } + gf_log ("alu", GF_LOG_DEBUG, + "alu_init: = %"PRId64",%"PRId64"", + alu_sched->entry_limit.read_usage, + alu_sched->exit_limit.read_usage); + + } else if (strcmp (order_str, "open-files-usage") == 0) { + /* Open files counter */ + + _threshold_fn = CALLOC (1, sizeof (struct alu_threshold)); + ERR_ABORT (_threshold_fn); + _threshold_fn->diff_value = get_max_diff_file_usage; + _threshold_fn->sched_value = get_stats_file_usage; + entry_fn = dict_get (xl->options, + "scheduler.alu.open-files-usage.entry-threshold"); + if (entry_fn) { + if (gf_string2uint64 (entry_fn->data, + &alu_sched->entry_limit.nr_files) != 0) + { + gf_log ("alu", GF_LOG_ERROR, + "invalid number format \"%s\" " + "of \"option scheduler.alu." + "open-files-usage.entry-" + "threshold\"", entry_fn->data); + return -1; + } + } + else + { + alu_sched->entry_limit.nr_files = ALU_OPEN_FILES_USAGE_ENTRY_THRESHOLD_DEFAULT; + } + _threshold_fn->entry_value = get_stats_file_usage; + exit_fn = dict_get (xl->options, + "scheduler.alu.open-files-usage.exit-threshold"); + if (exit_fn) + { + if (gf_string2uint64 (exit_fn->data, + &alu_sched->exit_limit.nr_files) != 0) + { + gf_log ("alu", GF_LOG_ERROR, + "invalid number format \"%s\" " + "of \"option scheduler.alu." 
+ "open-files-usage.exit-" + "threshold\"", exit_fn->data); + return -1; + } + } + else + { + alu_sched->exit_limit.nr_files = ALU_OPEN_FILES_USAGE_EXIT_THRESHOLD_DEFAULT; + } + _threshold_fn->exit_value = get_stats_file_usage; + tmp_threshold = alu_sched->threshold_fn; + if (!tmp_threshold) { + alu_sched->threshold_fn = _threshold_fn; + } + else { + while (tmp_threshold->next) { + tmp_threshold = tmp_threshold->next; + } + tmp_threshold->next = _threshold_fn; + } + gf_log ("alu", GF_LOG_DEBUG, + "alu.c->alu_init: = %"PRIu64",%"PRIu64"", + alu_sched->entry_limit.nr_files, + alu_sched->exit_limit.nr_files); + + } else if (strcmp (order_str, "disk-speed-usage") == 0) { + /* Disk speed */ + + _threshold_fn = CALLOC (1, sizeof (struct alu_threshold)); + ERR_ABORT (_threshold_fn); + _threshold_fn->diff_value = get_max_diff_disk_speed; + _threshold_fn->sched_value = get_stats_disk_speed; + entry_fn = dict_get (xl->options, + "scheduler.alu.disk-speed-usage.entry-threshold"); + if (entry_fn) { + gf_log ("alu", GF_LOG_DEBUG, + "entry-threshold is given, " + "value is constant"); + } + _threshold_fn->entry_value = NULL; + exit_fn = dict_get (xl->options, + "scheduler.alu.disk-speed-usage.exit-threshold"); + if (exit_fn) { + gf_log ("alu", GF_LOG_DEBUG, + "exit-threshold is given, " + "value is constant"); + } + _threshold_fn->exit_value = NULL; + tmp_threshold = alu_sched->threshold_fn; + if (!tmp_threshold) { + alu_sched->threshold_fn = _threshold_fn; + } + else { + while (tmp_threshold->next) { + tmp_threshold = tmp_threshold->next; + } + tmp_threshold->next = _threshold_fn; + } + + } else { + gf_log ("alu", GF_LOG_DEBUG, + "%s, unknown option provided to scheduler", + order_str); + } + order_str = strtok_r (NULL, ":", &tmp_str); + } + + return 0; +} + +static int32_t +alu_init (xlator_t *xl) +{ + struct alu_sched *alu_sched = NULL; + struct alu_limits *_limit_fn = NULL; + struct alu_limits *tmp_limits = NULL; + uint32_t min_free_disk = 0; + data_t *limits = NULL; + + 
alu_sched = CALLOC (1, sizeof (struct alu_sched)); + ERR_ABORT (alu_sched); + + { + alu_parse_options (xl, alu_sched); + } + + /* Get the limits */ + + limits = dict_get (xl->options, + "scheduler.limits.min-free-disk"); + if (limits) { + _limit_fn = CALLOC (1, sizeof (struct alu_limits)); + ERR_ABORT (_limit_fn); + _limit_fn->min_value = get_stats_free_disk; + _limit_fn->cur_value = get_stats_free_disk; + tmp_limits = alu_sched->limits_fn ; + _limit_fn->next = tmp_limits; + alu_sched->limits_fn = _limit_fn; + + if (gf_string2percent (limits->data, + &min_free_disk) != 0) { + gf_log ("alu", GF_LOG_ERROR, + "invalid number format \"%s\" " + "of \"option scheduler.limits." + "min-free-disk\"", limits->data); + return -1; + } + alu_sched->spec_limit.free_disk = min_free_disk; + + if (alu_sched->spec_limit.free_disk >= 100) { + gf_log ("alu", GF_LOG_ERROR, + "check the \"option scheduler." + "limits.min-free-disk\", it should " + "be percentage value"); + return -1; + } + alu_sched->spec_limit.total_disk_size = ALU_LIMITS_TOTAL_DISK_SIZE_DEFAULT; /* Its in % */ + gf_log ("alu", GF_LOG_DEBUG, + "alu.limit.min-disk-free = %"PRId64"", + _limit_fn->cur_value (&(alu_sched->spec_limit))); + } + + limits = dict_get (xl->options, + "scheduler.limits.max-open-files"); + if (limits) { + // Update alu_sched->priority properly + _limit_fn = CALLOC (1, sizeof (struct alu_limits)); + ERR_ABORT (_limit_fn); + _limit_fn->max_value = get_stats_file_usage; + _limit_fn->cur_value = get_stats_file_usage; + tmp_limits = alu_sched->limits_fn ; + _limit_fn->next = tmp_limits; + alu_sched->limits_fn = _limit_fn; + if (gf_string2uint64_base10 (limits->data, + &alu_sched->spec_limit.nr_files) != 0) + { + gf_log ("alu", GF_LOG_ERROR, + "invalid number format '%s' of option " + "scheduler.limits.max-open-files", + limits->data); + return -1; + } + + gf_log ("alu", GF_LOG_DEBUG, + "alu_init: limit.max-open-files = %"PRId64"", + _limit_fn->cur_value (&(alu_sched->spec_limit))); + } + + + /* Stats 
refresh options */
+        limits = dict_get (xl->options,
+                           "scheduler.refresh-interval");
+        if (limits) {
+                if (gf_string2time (limits->data,
+                                    &alu_sched->refresh_interval) != 0) {
+                        gf_log ("alu", GF_LOG_ERROR,
+                                "invalid number format \"%s\" of "
+                                "option scheduler.refresh-interval",
+                                limits->data);
+                        return -1;
+                }
+        } else {
+                alu_sched->refresh_interval = ALU_REFRESH_INTERVAL_DEFAULT;
+        }
+        gettimeofday (&(alu_sched->last_stat_fetch), NULL);
+
+
+        limits = dict_get (xl->options,
+                           "scheduler.alu.stat-refresh.num-file-create");
+        if (limits) {
+                if (gf_string2uint32 (limits->data,
+                                      &alu_sched->refresh_create_count) != 0)
+                {
+                        gf_log ("alu", GF_LOG_ERROR,
+                                "invalid number format \"%s\" of \"option "
+                                "alu.stat-refresh.num-file-create\"",
+                                limits->data);
+                        return -1;
+                }
+        } else {
+                alu_sched->refresh_create_count = ALU_REFRESH_CREATE_COUNT_DEFAULT;
+        }
+
+        {
+                /* Build an array of child_nodes */
+                struct alu_sched_struct *sched_array = NULL;
+                xlator_list_t *trav_xl = xl->children;
+                data_t *data = NULL;
+                int32_t index = 0;
+
+                /* First pass: count the children so the array can be sized */
+                while (trav_xl) {
+                        index++;
+                        trav_xl = trav_xl->next;
+                }
+                alu_sched->child_count = index;
+                sched_array = CALLOC (index, sizeof (struct alu_sched_struct));
+                ERR_ABORT (sched_array);
+                /* Second pass: one slot per child, eligible by default */
+                trav_xl = xl->children;
+                index = 0;
+                while (trav_xl) {
+                        sched_array[index].xl = trav_xl->xlator;
+                        sched_array[index].eligible = 1;
+                        index++;
+                        trav_xl = trav_xl->next;
+                }
+                alu_sched->array = sched_array;
+
+                data = dict_get (xl->options,
+                                 "scheduler.read-only-subvolumes");
+                if (data) {
+                        char *child = NULL;
+                        char *tmp = NULL;
+                        /* strtok_r() writes into its input, so tokenize a copy */
+                        char *childs_data = strdup (data->data);
+
+                        /* Same allocation check this file applies after CALLOC */
+                        ERR_ABORT (childs_data);
+
+                        child = strtok_r (childs_data, ",", &tmp);
+                        while (child) {
+                                /* Examine every slot, the last one included */
+                                for (index = 0; index < alu_sched->child_count; index++) {
+                                        if (strcmp (alu_sched->array[index].xl->name, child) == 0) {
+                                                /* Remove the entry by moving the last
+                                                 * one into its slot; nothing to copy
+                                                 * when it already is the last one. */
+                                                if (index != alu_sched->child_count - 1) {
+                                                        memcpy (&(alu_sched->array[index]),
+                                                                &(alu_sched->array[alu_sched->child_count - 1]),
+                                                                sizeof (struct alu_sched_struct));
+                                                }
+                                                alu_sched->child_count--;
+                                                break;
+                                        }
+                                }
+                                child =
strtok_r (NULL, ",", &tmp);
+                        }
+                        /* The copy was only needed for tokenizing; the array
+                         * entries keep no pointer into it. */
+                        free (childs_data);
+                }
+        }
+
+        *((long *)xl->private) = (long)alu_sched;
+
+        /* Initialize all the alu_sched structure's elements */
+        {
+                alu_sched->sched_nodes_pending = 0;
+
+                alu_sched->min_limit.free_disk = 0x00FFFFFF;
+                alu_sched->min_limit.disk_usage = 0xFFFFFFFF;
+                alu_sched->min_limit.total_disk_size = 0xFFFFFFFF;
+                alu_sched->min_limit.disk_speed = 0xFFFFFFFF;
+                alu_sched->min_limit.write_usage = 0xFFFFFFFF;
+                alu_sched->min_limit.read_usage = 0xFFFFFFFF;
+                alu_sched->min_limit.nr_files = 0xFFFFFFFF;
+                alu_sched->min_limit.nr_clients = 0xFFFFFFFF;
+        }
+
+        pthread_mutex_init (&alu_sched->alu_mutex, NULL);
+        return 0;
+}
+
+/*
+ * Scheduler teardown: destroys the mutex and frees the child array, the
+ * limits list, the threshold list and the scheduler state itself.
+ */
+static void
+alu_fini (xlator_t *xl)
+{
+        if (!xl)
+                return;
+        struct alu_sched *alu_sched = (struct alu_sched *)*((long *)xl->private);
+        struct alu_limits *limit = alu_sched->limits_fn;
+        struct alu_threshold *threshold = alu_sched->threshold_fn;
+        void *tmp = NULL;
+        pthread_mutex_destroy (&alu_sched->alu_mutex);
+        free (alu_sched->array);
+        while (limit) {
+                tmp = limit;
+                limit = limit->next;
+                free (tmp);
+        }
+        while (threshold) {
+                tmp = threshold;
+                threshold = threshold->next;
+                free (tmp);
+        }
+        free (alu_sched);
+}
+
+/*
+ * Callback for the per-child stats request issued by update_stat_array().
+ * 'cookie' is the child xlator the request was sent to; on success its
+ * stats are stored and its eligibility re-evaluated, on failure the child
+ * is marked ineligible.  The frame is destroyed before returning.
+ */
+static int32_t
+update_stat_array_cbk (call_frame_t *frame,
+                       void *cookie,
+                       xlator_t *xl,
+                       int32_t op_ret,
+                       int32_t op_errno,
+                       struct xlator_stats *trav_stats)
+{
+        struct alu_sched *alu_sched = (struct alu_sched *)*((long *)xl->private);
+        struct alu_limits *limits_fn = alu_sched->limits_fn;
+        int32_t idx = 0;
+
+        pthread_mutex_lock (&alu_sched->alu_mutex);
+        for (idx = 0; idx < alu_sched->child_count; idx++) {
+                if (alu_sched->array[idx].xl == (xlator_t *)cookie)
+                        break;
+        }
+        pthread_mutex_unlock (&alu_sched->alu_mutex);
+
+        /* The cookie matched no scheduled child: idx is one past the last
+         * valid slot, so there is nothing to update.  Just release the frame. */
+        if (idx == alu_sched->child_count) {
+                STACK_DESTROY (frame->root);
+                return 0;
+        }
+
+        if (op_ret == -1) {
+                alu_sched->array[idx].eligible = 0;
+        } else {
+                memcpy (&(alu_sched->array[idx].stats), trav_stats, sizeof (struct xlator_stats));
+
+                /* Get stats from all the child node */
+                /* Here check the limits specified by the user to
+                   consider the nodes
to be used by scheduler */ + alu_sched->array[idx].eligible = 1; + limits_fn = alu_sched->limits_fn; + while (limits_fn){ + if (limits_fn->max_value && + (limits_fn->cur_value (trav_stats) > + limits_fn->max_value (&(alu_sched->spec_limit)))) { + alu_sched->array[idx].eligible = 0; + } + if (limits_fn->min_value && + (limits_fn->cur_value (trav_stats) < + limits_fn->min_value (&(alu_sched->spec_limit)))) { + alu_sched->array[idx].eligible = 0; + } + limits_fn = limits_fn->next; + } + + /* Select minimum and maximum disk_usage */ + if (trav_stats->disk_usage > alu_sched->max_limit.disk_usage) { + alu_sched->max_limit.disk_usage = trav_stats->disk_usage; + } + if (trav_stats->disk_usage < alu_sched->min_limit.disk_usage) { + alu_sched->min_limit.disk_usage = trav_stats->disk_usage; + } + + /* Select minimum and maximum disk_speed */ + if (trav_stats->disk_speed > alu_sched->max_limit.disk_speed) { + alu_sched->max_limit.disk_speed = trav_stats->disk_speed; + } + if (trav_stats->disk_speed < alu_sched->min_limit.disk_speed) { + alu_sched->min_limit.disk_speed = trav_stats->disk_speed; + } + + /* Select minimum and maximum number of open files */ + if (trav_stats->nr_files > alu_sched->max_limit.nr_files) { + alu_sched->max_limit.nr_files = trav_stats->nr_files; + } + if (trav_stats->nr_files < alu_sched->min_limit.nr_files) { + alu_sched->min_limit.nr_files = trav_stats->nr_files; + } + + /* Select minimum and maximum write-usage */ + if (trav_stats->write_usage > alu_sched->max_limit.write_usage) { + alu_sched->max_limit.write_usage = trav_stats->write_usage; + } + if (trav_stats->write_usage < alu_sched->min_limit.write_usage) { + alu_sched->min_limit.write_usage = trav_stats->write_usage; + } + + /* Select minimum and maximum read-usage */ + if (trav_stats->read_usage > alu_sched->max_limit.read_usage) { + alu_sched->max_limit.read_usage = trav_stats->read_usage; + } + if (trav_stats->read_usage < alu_sched->min_limit.read_usage) { + alu_sched->min_limit.read_usage 
= trav_stats->read_usage; + } + + /* Select minimum and maximum free-disk */ + if (trav_stats->free_disk > alu_sched->max_limit.free_disk) { + alu_sched->max_limit.free_disk = trav_stats->free_disk; + } + if (trav_stats->free_disk < alu_sched->min_limit.free_disk) { + alu_sched->min_limit.free_disk = trav_stats->free_disk; + } + } + + STACK_DESTROY (frame->root); + + return 0; +} + +static void +update_stat_array (xlator_t *xl) +{ + /* This function schedules the file in one of the child nodes */ + struct alu_sched *alu_sched = (struct alu_sched *)*((long *)xl->private); + int32_t idx = 0; + call_frame_t *frame = NULL; + call_pool_t *pool = xl->ctx->pool; + + for (idx = 0 ; idx < alu_sched->child_count; idx++) { + frame = create_frame (xl, pool); + + STACK_WIND_COOKIE (frame, + update_stat_array_cbk, + alu_sched->array[idx].xl, //cookie + alu_sched->array[idx].xl, + (alu_sched->array[idx].xl)->mops->stats, + 0); //flag + } + return; +} + +static void +alu_update (xlator_t *xl) +{ + struct timeval tv; + struct alu_sched *alu_sched = (struct alu_sched *)*((long *)xl->private); + + gettimeofday (&tv, NULL); + if (tv.tv_sec > (alu_sched->refresh_interval + alu_sched->last_stat_fetch.tv_sec)) { + /* Update the stats from all the server */ + update_stat_array (xl); + alu_sched->last_stat_fetch.tv_sec = tv.tv_sec; + } +} + +static xlator_t * +alu_scheduler (xlator_t *xl, const void *path) +{ + /* This function schedules the file in one of the child nodes */ + struct alu_sched *alu_sched = (struct alu_sched *)*((long *)xl->private); + int32_t sched_index = 0; + int32_t sched_index_orig = 0; + int32_t idx = 0; + + alu_update (xl); + + /* Now check each threshold one by one if some nodes are classified */ + { + struct alu_threshold *trav_threshold = alu_sched->threshold_fn; + struct alu_threshold *tmp_threshold = alu_sched->sched_method; + struct alu_sched_node *tmp_sched_node; + + /* This pointer 'trav_threshold' contains function pointers according to spec file + give by 
user, */ + while (trav_threshold) { + /* This check is needed for seeing if already there are nodes in this criteria + to be scheduled */ + if (!alu_sched->sched_nodes_pending) { + for (idx = 0; idx < alu_sched->child_count; idx++) { + if (!alu_sched->array[idx].eligible) { + continue; + } + if (trav_threshold->entry_value) { + if (trav_threshold->diff_value (&(alu_sched->max_limit), + &(alu_sched->array[idx].stats)) < + trav_threshold->entry_value (&(alu_sched->entry_limit))) { + continue; + } + } + tmp_sched_node = CALLOC (1, sizeof (struct alu_sched_node)); + ERR_ABORT (tmp_sched_node); + tmp_sched_node->index = idx; + if (!alu_sched->sched_node) { + alu_sched->sched_node = tmp_sched_node; + } else { + pthread_mutex_lock (&alu_sched->alu_mutex); + tmp_sched_node->next = alu_sched->sched_node; + alu_sched->sched_node = tmp_sched_node; + pthread_mutex_unlock (&alu_sched->alu_mutex); + } + alu_sched->sched_nodes_pending++; + } + } /* end of if (sched_nodes_pending) */ + + /* This loop is required to check the eligible nodes */ + struct alu_sched_node *trav_sched_node; + while (alu_sched->sched_nodes_pending) { + trav_sched_node = alu_sched->sched_node; + sched_index = trav_sched_node->index; + if (alu_sched->array[sched_index].eligible) + break; + alu_sched->sched_node = trav_sched_node->next; + free (trav_sched_node); + alu_sched->sched_nodes_pending--; + } + if (alu_sched->sched_nodes_pending) { + /* There are some node in this criteria to be scheduled, no need + * to sort and check other methods + */ + if (tmp_threshold && tmp_threshold->exit_value) { + /* verify the exit value && whether node is eligible or not */ + if (tmp_threshold->diff_value (&(alu_sched->max_limit), + &(alu_sched->array[sched_index].stats)) > + tmp_threshold->exit_value (&(alu_sched->exit_limit))) { + /* Free the allocated info for the node :) */ + pthread_mutex_lock (&alu_sched->alu_mutex); + alu_sched->sched_node = trav_sched_node->next; + free (trav_sched_node); + trav_sched_node = 
alu_sched->sched_node; + alu_sched->sched_nodes_pending--; + pthread_mutex_unlock (&alu_sched->alu_mutex); + } + } else { + /* if there is no exit value, then exit after scheduling once */ + pthread_mutex_lock (&alu_sched->alu_mutex); + alu_sched->sched_node = trav_sched_node->next; + free (trav_sched_node); + trav_sched_node = alu_sched->sched_node; + alu_sched->sched_nodes_pending--; + pthread_mutex_unlock (&alu_sched->alu_mutex); + } + + alu_sched->sched_method = tmp_threshold; /* this is the method used for selecting */ + + /* */ + if (trav_sched_node) { + tmp_sched_node = trav_sched_node; + while (trav_sched_node->next) { + trav_sched_node = trav_sched_node->next; + } + if (tmp_sched_node->next) { + pthread_mutex_lock (&alu_sched->alu_mutex); + alu_sched->sched_node = tmp_sched_node->next; + tmp_sched_node->next = NULL; + trav_sched_node->next = tmp_sched_node; + pthread_mutex_unlock (&alu_sched->alu_mutex); + } + } + /* return the scheduled node */ + return alu_sched->array[sched_index].xl; + } /* end of if (pending_nodes) */ + + tmp_threshold = trav_threshold; + trav_threshold = trav_threshold->next; + } + } + + /* This is used only when there is everything seems ok, or no eligible nodes */ + sched_index_orig = alu_sched->sched_index; + alu_sched->sched_method = NULL; + while (1) { + //lock + pthread_mutex_lock (&alu_sched->alu_mutex); + sched_index = alu_sched->sched_index++; + alu_sched->sched_index = alu_sched->sched_index % alu_sched->child_count; + pthread_mutex_unlock (&alu_sched->alu_mutex); + //unlock + if (alu_sched->array[sched_index].eligible) + break; + if (sched_index_orig == (sched_index + 1) % alu_sched->child_count) { + gf_log ("alu", GF_LOG_WARNING, "No node is eligible to schedule"); + //lock + pthread_mutex_lock (&alu_sched->alu_mutex); + alu_sched->sched_index++; + alu_sched->sched_index = alu_sched->sched_index % alu_sched->child_count; + pthread_mutex_unlock (&alu_sched->alu_mutex); + //unlock + break; + } + } + return 
alu_sched->array[sched_index].xl;
+}
+
+/**
+ * notify - react to a child up/down event
+ *
+ * @xl:    the xlator owning this scheduler
+ * @event: GF_EVENT_CHILD_UP, GF_EVENT_CHILD_DOWN or anything else (ignored)
+ * @data:  the child xlator the event refers to
+ *
+ * Only GF_EVENT_CHILD_DOWN changes state: the matching child is marked
+ * ineligible.  Events for an xlator that is not in the scheduling array
+ * are ignored.
+ */
+void
+alu_notify (xlator_t *xl, int32_t event, void *data)
+{
+        struct alu_sched *alu_sched = NULL;
+        int32_t idx = 0;
+
+        alu_sched = (struct alu_sched *)*((long *)xl->private);
+        if (!alu_sched)
+                return;
+
+        for (idx = 0; idx < alu_sched->child_count; idx++) {
+                if (alu_sched->array[idx].xl == (xlator_t *)data)
+                        break;
+        }
+
+        /* No match: idx is one past the last valid slot, do not index with it */
+        if (idx == alu_sched->child_count)
+                return;
+
+        switch (event)
+        {
+        case GF_EVENT_CHILD_UP:
+                {
+                        //alu_sched->array[idx].eligible = 1;
+                }
+                break;
+        case GF_EVENT_CHILD_DOWN:
+                {
+                        alu_sched->array[idx].eligible = 0;
+                }
+                break;
+        default:
+                {
+                        ;
+                }
+                break;
+        }
+
+}
+
+struct sched_ops sched = {
+        .init     = alu_init,
+        .fini     = alu_fini,
+        .update   = alu_update,
+        .schedule = alu_scheduler,
+        .notify   = alu_notify
+};
+
+struct volume_options options[] = {
+        { .key   = { "scheduler.alu.order", "alu.order" },
+          .type  = GF_OPTION_TYPE_ANY
+        },
+        { .key   = { "scheduler.alu.disk-usage.entry-threshold",
+                     "alu.disk-usage.entry-threshold" },
+          .type  = GF_OPTION_TYPE_SIZET
+        },
+        { .key   = { "scheduler.alu.disk-usage.exit-threshold",
+                     "alu.disk-usage.exit-threshold" },
+          .type  = GF_OPTION_TYPE_SIZET
+        },
+        { .key   = { "scheduler.alu.write-usage.entry-threshold",
+                     "alu.write-usage.entry-threshold" },
+          .type  = GF_OPTION_TYPE_SIZET
+        },
+        { .key   = { "scheduler.alu.write-usage.exit-threshold",
+                     "alu.write-usage.exit-threshold" },
+          .type  = GF_OPTION_TYPE_SIZET
+        },
+        { .key   = { "scheduler.alu.read-usage.entry-threshold",
+                     "alu.read-usage.entry-threshold" },
+          .type  = GF_OPTION_TYPE_SIZET
+        },
+        { .key   = { "scheduler.alu.read-usage.exit-threshold",
+                     "alu.read-usage.exit-threshold" },
+          .type  = GF_OPTION_TYPE_SIZET
+        },
+        { .key   = { "scheduler.alu.open-files-usage.entry-threshold",
+                     "alu.open-files-usage.entry-threshold" },
+          .type  = GF_OPTION_TYPE_INT
+        },
+        { .key   = { "scheduler.alu.open-files-usage.exit-threshold",
+                     "alu.open-files-usage.exit-threshold" },
+          .type  = GF_OPTION_TYPE_INT
+        },
+        { .key   = {
"scheduler.read-only-subvolumes",
+                     "alu.read-only-subvolumes" },
+          .type  = GF_OPTION_TYPE_ANY
+        },
+        { .key   = { "scheduler.refresh-interval",
+                     "alu.refresh-interval",
+                     "alu.stat-refresh.interval" },
+          .type  = GF_OPTION_TYPE_TIME
+        },
+        { .key   = { "scheduler.limits.min-free-disk",
+                     "alu.limits.min-free-disk" },
+          .type  = GF_OPTION_TYPE_PERCENT
+        },
+        /* The comma between the two names matters: without it the adjacent
+         * string literals are concatenated into a single key. */
+        { .key   = { "scheduler.alu.stat-refresh.num-file-create",
+                     "alu.stat-refresh.num-file-create"},
+          .type  = GF_OPTION_TYPE_INT
+        },
+        { .key   = {NULL}, }
+};
diff --git a/scheduler/alu/src/alu.h b/scheduler/alu/src/alu.h
new file mode 100644
index 000000000..a958bb4d2
--- /dev/null
+++ b/scheduler/alu/src/alu.h
@@ -0,0 +1,89 @@
+/*
+   Copyright (c) 2006, 2007, 2008 Z RESEARCH, Inc. <http://www.zresearch.com>
+   This file is part of GlusterFS.
+
+   GlusterFS is free software; you can redistribute it and/or modify
+   it under the terms of the GNU General Public License as published
+   by the Free Software Foundation; either version 3 of the License,
+   or (at your option) any later version.
+
+   GlusterFS is distributed in the hope that it will be useful, but
+   WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   General Public License for more details.
+
+   You should have received a copy of the GNU General Public License
+   along with this program.  If not, see
+   <http://www.gnu.org/licenses/>.
+*/ + +#ifndef _ALU_H +#define _ALU_H + +#ifndef _CONFIG_H +#define _CONFIG_H +#include "config.h" +#endif + +#include "scheduler.h" + +struct alu_sched; + +struct alu_sched_struct { + xlator_t *xl; + struct xlator_stats stats; + unsigned char eligible; +}; + +// Write better name for these functions +struct alu_limits { + struct alu_limits *next; + int64_t (*max_value) (struct xlator_stats *); /* Max limit, specified by the user */ + int64_t (*min_value) (struct xlator_stats *); /* Min limit, specified by the user */ + int64_t (*cur_value) (struct xlator_stats *); /* Current values of variables got from stats call */ +}; + +struct alu_threshold { + struct alu_threshold *next; + int64_t (*diff_value) (struct xlator_stats *max, struct xlator_stats *min); /* Diff b/w max and min */ + int64_t (*entry_value) (struct xlator_stats *); /* Limit specified user */ + int64_t (*exit_value) (struct xlator_stats *); /* Exit point for the limit */ + int64_t (*sched_value) (struct xlator_stats *); /* This will return the index of the child area */ +}; + +struct alu_sched_node { + struct alu_sched_node *next; + int32_t index; +}; + +struct alu_sched { + struct alu_limits *limits_fn; + struct alu_threshold *threshold_fn; + struct alu_sched_struct *array; + struct alu_sched_node *sched_node; + struct alu_threshold *sched_method; + struct xlator_stats max_limit; + struct xlator_stats min_limit; + struct xlator_stats entry_limit; + struct xlator_stats exit_limit; + struct xlator_stats spec_limit; /* User given limit */ + + pthread_mutex_t alu_mutex; + struct timeval last_stat_fetch; + uint32_t refresh_interval; /* in seconds */ + uint32_t refresh_create_count; /* num-file-create */ + + int32_t sched_nodes_pending; + + int32_t sched_index; /* used for round robin scheduling in case of many nodes getting the criteria match. 
*/ + int32_t child_count; + +}; + +struct _alu_local_t { + int32_t call_count; +}; + +typedef struct _alu_local_t alu_local_t; + +#endif /* _ALU_H */ diff --git a/scheduler/nufa/Makefile.am b/scheduler/nufa/Makefile.am new file mode 100644 index 000000000..d471a3f92 --- /dev/null +++ b/scheduler/nufa/Makefile.am @@ -0,0 +1,3 @@ +SUBDIRS = src + +CLEANFILES = diff --git a/scheduler/nufa/src/Makefile.am b/scheduler/nufa/src/Makefile.am new file mode 100644 index 000000000..6eb3d39f1 --- /dev/null +++ b/scheduler/nufa/src/Makefile.am @@ -0,0 +1,12 @@ +sched_LTLIBRARIES = nufa.la +scheddir = $(libdir)/glusterfs/$(PACKAGE_VERSION)/scheduler + +nufa_la_LDFLAGS = -module -avoidversion + +nufa_la_SOURCES = nufa.c +nufa_la_LIBADD = $(top_builddir)/libglusterfs/src/libglusterfs.la + +AM_CFLAGS = -fPIC -D_FILE_OFFSET_BITS=64 -D_GNU_SOURCE -Wall -D$(GF_HOST_OS)\ + -I$(top_srcdir)/libglusterfs/src -shared -nostartfiles $(GF_CFLAGS) + +CLEANFILES = diff --git a/scheduler/nufa/src/nufa.c b/scheduler/nufa/src/nufa.c new file mode 100644 index 000000000..bbc61e2ad --- /dev/null +++ b/scheduler/nufa/src/nufa.c @@ -0,0 +1,403 @@ +/* + Copyright (c) 2006, 2007, 2008 Z RESEARCH, Inc. <http://www.zresearch.com> + This file is part of GlusterFS. + + GlusterFS is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published + by the Free Software Foundation; either version 3 of the License, + or (at your option) any later version. + + GlusterFS is distributed in the hope that it will be useful, but + WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program. If not, see + <http://www.gnu.org/licenses/>. 
+*/ + +#ifndef _CONFIG_H +#define _CONFIG_H +#include "config.h" +#endif + +#include <sys/time.h> + +#include "scheduler.h" +#include "common-utils.h" + +struct nufa_sched_struct { + xlator_t *xl; + struct timeval last_stat_fetch; + int64_t free_disk; + int32_t refresh_interval; + unsigned char eligible; +}; + +struct nufa_struct { + struct nufa_sched_struct *array; + struct timeval last_stat_fetch; + + int32_t *local_array; /* Used to keep the index of the local xlators */ + int32_t local_xl_index; /* index in the above array */ + int32_t local_xl_count; /* Count of the local subvolumes */ + + uint32_t refresh_interval; + uint32_t min_free_disk; + + gf_lock_t nufa_lock; + int32_t child_count; + int32_t sched_index; +}; + +#define NUFA_LIMITS_MIN_FREE_DISK_DEFAULT 15 +#define NUFA_REFRESH_INTERVAL_DEFAULT 30 + +static int32_t +nufa_init (xlator_t *xl) +{ + int32_t index = 0; + data_t *local_name = NULL; + data_t *data = NULL; + xlator_list_t *trav_xl = xl->children; + struct nufa_struct *nufa_buf = NULL; + + nufa_buf = CALLOC (1, sizeof (struct nufa_struct)); + ERR_ABORT (nufa_buf); + + data = dict_get (xl->options, "scheduler.limits.min-free-disk"); + if (data) { + if (gf_string2percent (data->data, + &nufa_buf->min_free_disk) != 0) { + gf_log ("nufa", GF_LOG_ERROR, + "invalid number format \"%s\" of " + "\"option scheduler.limits.min-free-disk\"", + data->data); + return -1; + } + if (nufa_buf->min_free_disk >= 100) { + gf_log ("nufa", GF_LOG_ERROR, + "check \"option scheduler.limits.min-free-disk" + "\", it should be percentage value"); + return -1; + } + } else { + gf_log ("nufa", GF_LOG_WARNING, + "No option for limit min-free-disk given, " + "defaulting it to 15%%"); + nufa_buf->min_free_disk = NUFA_LIMITS_MIN_FREE_DISK_DEFAULT; + } + data = dict_get (xl->options, "scheduler.refresh-interval"); + if (data && (gf_string2time (data->data, + &nufa_buf->refresh_interval) != 0)) { + gf_log ("nufa", GF_LOG_ERROR, + "invalid number format \"%s\" of " + "\"option 
scheduler.refresh-interval\"", + data->data); + return -1; + } else if (!data) { /* fall back to the default only when the option is absent; a successfully parsed value must be kept */ + gf_log ("nufa", GF_LOG_WARNING, + "No option for scheduler.refresh-interval given, " + "defaulting it to 30"); + nufa_buf->refresh_interval = NUFA_REFRESH_INTERVAL_DEFAULT; + } + + /* Get the array built */ + while (trav_xl) { + index++; + trav_xl = trav_xl->next; + } + nufa_buf->child_count = index; + nufa_buf->sched_index = 0; + nufa_buf->array = CALLOC (index, sizeof (struct nufa_sched_struct)); + ERR_ABORT (nufa_buf->array); + nufa_buf->local_array = CALLOC (index, sizeof (int32_t)); + ERR_ABORT (nufa_buf->local_array); /* check the allocation made on the previous line, not 'array' again */ + trav_xl = xl->children; + + local_name = dict_get (xl->options, "scheduler.local-volume-name"); + if (!local_name) { + /* Error */ + gf_log ("nufa", GF_LOG_ERROR, + "No 'local-volume-name' option given in volume file"); + FREE (nufa_buf->array); + FREE (nufa_buf->local_array); + FREE (nufa_buf); + return -1; + } + + /* Get the array properly */ + index = 0; + trav_xl = xl->children; + while (trav_xl) { + nufa_buf->array[index].xl = trav_xl->xlator; + nufa_buf->array[index].eligible = 1; + nufa_buf->array[index].free_disk = nufa_buf->min_free_disk; + nufa_buf->array[index].refresh_interval = + nufa_buf->refresh_interval; + + trav_xl = trav_xl->next; + index++; + } + + { + int32_t array_index = 0; + char *child = NULL; + char *tmp = NULL; + char *childs_data = strdup (local_name->data); + + child = strtok_r (childs_data, ",", &tmp); + while (child) { + /* Check if the local_volume specified is proper + subvolume of unify */ + trav_xl = xl->children; + index=0; + while (trav_xl) { + if (strcmp (child, trav_xl->xlator->name) == 0) + break; + trav_xl = trav_xl->next; + index++; + } + + if (!trav_xl) { + /* entry for 'local-volume-name' is wrong, + not present in subvolumes */ + gf_log ("nufa", GF_LOG_ERROR, + "option 'scheduler.local-volume-name' " + "%s is wrong", child); + FREE (nufa_buf->array); + FREE (nufa_buf->local_array); + FREE (nufa_buf); + return -1; + } else { + 
nufa_buf->local_array[array_index++] = index; + nufa_buf->local_xl_count++; + } + child = strtok_r (NULL, ",", &tmp); + } + free (childs_data); + } + + LOCK_INIT (&nufa_buf->nufa_lock); + *((long *)xl->private) = (long)nufa_buf; // put it at the proper place + return 0; +} + +static void +nufa_fini (xlator_t *xl) +{ + struct nufa_struct *nufa_buf = + (struct nufa_struct *)*((long *)xl->private); + + LOCK_DESTROY (&nufa_buf->nufa_lock); + FREE (nufa_buf->local_array); + FREE (nufa_buf->array); + FREE (nufa_buf); +} + +static int32_t +update_stat_array_cbk (call_frame_t *frame, + void *cookie, + xlator_t *xl, + int32_t op_ret, + int32_t op_errno, + struct xlator_stats *trav_stats) +{ + struct nufa_struct *nufa_struct = NULL; + int32_t idx = 0; + int32_t percent = 0; + + nufa_struct = (struct nufa_struct *)*((long *)xl->private); + LOCK (&nufa_struct->nufa_lock); + for (idx = 0; idx < nufa_struct->child_count; idx++) { + if (nufa_struct->array[idx].xl->name == (char *)cookie) + break; + } + UNLOCK (&nufa_struct->nufa_lock); + + if (op_ret == 0) { + percent = ((trav_stats->free_disk * 100) / + trav_stats->total_disk_size); + if (nufa_struct->array[idx].free_disk > percent) { + if (nufa_struct->array[idx].eligible) + gf_log ("nufa", GF_LOG_CRITICAL, + "node \"%s\" is _almost_ (%d %%) full", + nufa_struct->array[idx].xl->name, + 100 - percent); + nufa_struct->array[idx].eligible = 0; + } else { + nufa_struct->array[idx].eligible = 1; + } + } else { + nufa_struct->array[idx].eligible = 0; + } + + STACK_DESTROY (frame->root); + return 0; +} + +static void +update_stat_array (xlator_t *xl) +{ + /* This function schedules the file in one of the child nodes */ + int32_t idx; + struct nufa_struct *nufa_buf = + (struct nufa_struct *)*((long *)xl->private); + call_frame_t *frame = NULL; + call_pool_t *pool = xl->ctx->pool; + + for (idx = 0; idx < nufa_buf->child_count; idx++) { + frame = create_frame (xl, pool); + + STACK_WIND_COOKIE (frame, + update_stat_array_cbk, + 
nufa_buf->array[idx].xl->name, + nufa_buf->array[idx].xl, + (nufa_buf->array[idx].xl)->mops->stats, + 0); //flag + } + + return; +} + +static void +nufa_update (xlator_t *xl) +{ + struct nufa_struct *nufa_buf = + (struct nufa_struct *)*((long *)xl->private); + struct timeval tv; + gettimeofday (&tv, NULL); + if (tv.tv_sec > (nufa_buf->refresh_interval + + nufa_buf->last_stat_fetch.tv_sec)) { + /* Update the stats from all the server */ + update_stat_array (xl); + nufa_buf->last_stat_fetch.tv_sec = tv.tv_sec; + } +} + +static xlator_t * +nufa_schedule (xlator_t *xl, const void *path) +{ + struct nufa_struct *nufa_buf = + (struct nufa_struct *)*((long *)xl->private); + int32_t nufa_orig = nufa_buf->local_xl_index; + int32_t rr; + + nufa_update (xl); + + while (1) { + LOCK (&nufa_buf->nufa_lock); + rr = nufa_buf->local_xl_index++; + nufa_buf->local_xl_index %= nufa_buf->local_xl_count; + UNLOCK (&nufa_buf->nufa_lock); + + /* if 'eligible' or there are _no_ eligible nodes */ + if (nufa_buf->array[nufa_buf->local_array[rr]].eligible) { + /* Return the local node */ + return nufa_buf->array[nufa_buf->local_array[rr]].xl; + } + if ((rr + 1) % nufa_buf->local_xl_count == nufa_orig) { + gf_log ("nufa", GF_LOG_CRITICAL, + "No free space available on any local " + "volumes, using RR scheduler"); + LOCK (&nufa_buf->nufa_lock); + nufa_buf->local_xl_index++; + nufa_buf->local_xl_index %= nufa_buf->local_xl_count; + UNLOCK (&nufa_buf->nufa_lock); + break; + } + } + + nufa_orig = nufa_buf->sched_index; + while (1) { + LOCK (&nufa_buf->nufa_lock); + rr = nufa_buf->sched_index++; + nufa_buf->sched_index = (nufa_buf->sched_index % + nufa_buf->child_count); + UNLOCK (&nufa_buf->nufa_lock); + + /* if 'eligible' or there are _no_ eligible nodes */ + if (nufa_buf->array[rr].eligible) { + break; + } + if ((rr + 1) % nufa_buf->child_count == nufa_orig) { + gf_log ("nufa", GF_LOG_CRITICAL, + "No free space available on any server, " + "using RR scheduler."); + LOCK (&nufa_buf->nufa_lock); + 
nufa_buf->sched_index++; + nufa_buf->sched_index = (nufa_buf->sched_index % + nufa_buf->child_count); + UNLOCK (&nufa_buf->nufa_lock); + break; + } + } + return nufa_buf->array[rr].xl; +} + + +/** + * notify + */ +void +nufa_notify (xlator_t *xl, int32_t event, void *data) +{ + struct nufa_struct *nufa_buf = + (struct nufa_struct *)*((long *)xl->private); + int32_t idx = 0; + + if (!nufa_buf) + return; + + for (idx = 0; idx < nufa_buf->child_count; idx++) { + if (strcmp (nufa_buf->array[idx].xl->name, + ((xlator_t *)data)->name) == 0) + break; + } + + switch (event) + { + case GF_EVENT_CHILD_UP: + { + //nufa_buf->array[idx].eligible = 1; + } + break; + case GF_EVENT_CHILD_DOWN: + { + nufa_buf->array[idx].eligible = 0; + } + break; + default: + { + ; + } + break; + } + +} + +struct sched_ops sched = { + .init = nufa_init, + .fini = nufa_fini, + .update = nufa_update, + .schedule = nufa_schedule, + .notify = nufa_notify +}; + +struct volume_options options[] = { + { .key = { "scheduler.refresh-interval", + "nufa.refresh-interval" }, + .type = GF_OPTION_TYPE_TIME + }, + { .key = { "scheduler.limits.min-free-disk", + "nufa.limits.min-free-disk" }, + .type = GF_OPTION_TYPE_PERCENT + }, + { .key = { "scheduler.local-volume-name", + "nufa.local-volume-name" }, + .type = GF_OPTION_TYPE_XLATOR + }, + { .key = {NULL} } +}; + diff --git a/scheduler/random/Makefile.am b/scheduler/random/Makefile.am new file mode 100644 index 000000000..d471a3f92 --- /dev/null +++ b/scheduler/random/Makefile.am @@ -0,0 +1,3 @@ +SUBDIRS = src + +CLEANFILES = diff --git a/scheduler/random/src/Makefile.am b/scheduler/random/src/Makefile.am new file mode 100644 index 000000000..572181336 --- /dev/null +++ b/scheduler/random/src/Makefile.am @@ -0,0 +1,14 @@ +sched_LTLIBRARIES = random.la +scheddir = $(libdir)/glusterfs/$(PACKAGE_VERSION)/scheduler + +random_la_LDFLAGS = -module -avoidversion + +random_la_SOURCES = random.c +random_la_LIBADD = $(top_builddir)/libglusterfs/src/libglusterfs.la + 
+noinst_HEADERS = random.h + +AM_CFLAGS = -fPIC -D_FILE_OFFSET_BITS=64 -D_GNU_SOURCE -Wall -D$(GF_HOST_OS)\ + -I$(top_srcdir)/libglusterfs/src -shared -nostartfiles $(GF_CFLAGS) + +CLEANFILES = diff --git a/scheduler/random/src/random.c b/scheduler/random/src/random.c new file mode 100644 index 000000000..9e761d08a --- /dev/null +++ b/scheduler/random/src/random.c @@ -0,0 +1,283 @@ +/* + Copyright (c) 2006, 2007, 2008 Z RESEARCH, Inc. <http://www.zresearch.com> + This file is part of GlusterFS. + + GlusterFS is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published + by the Free Software Foundation; either version 3 of the License, + or (at your option) any later version. + + GlusterFS is distributed in the hope that it will be useful, but + WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program. If not, see + <http://www.gnu.org/licenses/>. 
+*/ + +#include <stdlib.h> +#include <sys/time.h> + +#ifndef _CONFIG_H +#define _CONFIG_H +#include "config.h" +#endif + +#include "random.h" + +#define RANDOM_LIMITS_MIN_FREE_DISK_DEFAULT 15 +#define RANDOM_REFRESH_INTERVAL_DEFAULT 10 + + +static int32_t +random_init (xlator_t *xl) +{ + struct random_struct *random_buf = NULL; + xlator_list_t *trav_xl = xl->children; + data_t *limit = NULL; + int32_t index = 0; + + random_buf = CALLOC (1, sizeof (struct random_struct)); + ERR_ABORT (random_buf); + + /* Set the seed for the 'random' function */ + srandom ((uint32_t) time (NULL)); + + limit = dict_get (xl->options, "scheduler.limits.min-free-disk"); + if (limit) { + if (gf_string2percent (data_to_str (limit), + &random_buf->min_free_disk) != 0) { + gf_log ("random", GF_LOG_ERROR, + "invalid number format \"%s\" of \"option " + "scheduler.limits.min-free-disk\"", + limit->data); + return -1; + } + if (random_buf->min_free_disk >= 100) { + gf_log ("random", GF_LOG_ERROR, + "check the \"option random.limits.min-free" + "-disk\", it should be percentage value"); + return -1; + } + + } else { + gf_log ("random", GF_LOG_WARNING, + "No option for limit min-free-disk given, " + "defaulting it to 5%%"); + random_buf->min_free_disk = + RANDOM_LIMITS_MIN_FREE_DISK_DEFAULT; + } + + limit = dict_get (xl->options, "scheduler.refresh-interval"); + if (limit) { + if (gf_string2time (data_to_str (limit), + &random_buf->refresh_interval) != 0) { + gf_log ("random", GF_LOG_ERROR, + "invalid number format \"%s\" of " + "\"option random.refresh-interval\"", + limit->data); + return -1; + } + } else { + random_buf->refresh_interval = RANDOM_REFRESH_INTERVAL_DEFAULT; + } + + while (trav_xl) { + index++; + trav_xl = trav_xl->next; + } + random_buf->child_count = index; + random_buf->array = CALLOC (index, + sizeof (struct random_sched_struct)); + ERR_ABORT (random_buf->array); + trav_xl = xl->children; + index = 0; + + while (trav_xl) { + random_buf->array[index].xl = trav_xl->xlator; + 
random_buf->array[index].eligible = 1; + trav_xl = trav_xl->next; + index++; + } + pthread_mutex_init (&random_buf->random_mutex, NULL); + + // put it at the proper place + *((long *)xl->private) = (long)random_buf; + return 0; +} + +static void +random_fini (xlator_t *xl) +{ + struct random_struct *random_buf = NULL; + + random_buf = (struct random_struct *)*((long *)xl->private); + pthread_mutex_destroy (&random_buf->random_mutex); + free (random_buf->array); + free (random_buf); +} + + +static int32_t +update_stat_array_cbk (call_frame_t *frame, + void *cookie, + xlator_t *xl, + int32_t op_ret, + int32_t op_errno, + struct xlator_stats *trav_stats) +{ + int32_t idx = 0; + int32_t percent = 0; + struct random_struct *random_buf = NULL; + + random_buf = (struct random_struct *)*((long *)xl->private); + + pthread_mutex_lock (&random_buf->random_mutex); + for (idx = 0; idx < random_buf->child_count; idx++) { + if (strcmp (random_buf->array[idx].xl->name, + (char *)cookie) == 0) + break; + } + pthread_mutex_unlock (&random_buf->random_mutex); + + if (op_ret == 0) { + percent = ((trav_stats->free_disk *100) / + trav_stats->total_disk_size); + if (random_buf->min_free_disk > percent) { + random_buf->array[idx].eligible = 0; + } else { + random_buf->array[idx].eligible = 1; + } + } else { + random_buf->array[idx].eligible = 0; + } + + STACK_DESTROY (frame->root); + return 0; +} + +static void +update_stat_array (xlator_t *xl) +{ + int32_t idx; + struct random_struct *random_buf = NULL; + call_frame_t *frame = NULL; + call_pool_t *pool = xl->ctx->pool; + + random_buf = (struct random_struct *)*((long *)xl->private); + for (idx = 0; idx < random_buf->child_count; idx++) { + frame = create_frame (xl, pool); + + STACK_WIND_COOKIE (frame, + update_stat_array_cbk, + random_buf->array[idx].xl->name, + random_buf->array[idx].xl, + (random_buf->array[idx].xl)->mops->stats, + 0); + } + return ; +} + +static void +random_update (xlator_t *xl) +{ + struct timeval tv; + struct 
random_struct *random_buf = NULL; + + random_buf = (struct random_struct *)*((long *)xl->private); + + gettimeofday(&tv, NULL); + if (tv.tv_sec > (random_buf->refresh_interval + + random_buf->last_stat_entry.tv_sec)) { + update_stat_array (xl); + random_buf->last_stat_entry.tv_sec = tv.tv_sec; + } +} + +static xlator_t * +random_schedule (xlator_t *xl, const void *path) +{ + struct random_struct *random_buf = NULL; + int32_t rand = 0; + int32_t try = 0; + + random_buf = (struct random_struct *)*((long *)xl->private); + + rand = (int32_t) (1.0*random_buf->child_count * + (random() / (RAND_MAX + 1.0))); + + random_update (xl); + + while (!random_buf->array[rand].eligible) { + if (try++ > 100) { + /* there is a chance of this becoming a + infinite loop otherwise. */ + break; + } + rand = (int32_t) (1.0*random_buf->child_count * + (random() / (RAND_MAX + 1.0))); + } + return random_buf->array[rand].xl; +} + + +/** + * notify + */ +void +random_notify (xlator_t *xl, int32_t event, void *data) +{ + struct random_struct *random_buf = NULL; + int32_t idx = 0; + + random_buf = (struct random_struct *)*((long *)xl->private); + if (!random_buf) + return; + + for (idx = 0; idx < random_buf->child_count; idx++) { + if (random_buf->array[idx].xl == (xlator_t *)data) + break; + } + + switch (event) + { + case GF_EVENT_CHILD_UP: + { + //random_buf->array[idx].eligible = 1; + } + break; + case GF_EVENT_CHILD_DOWN: + { + random_buf->array[idx].eligible = 0; + } + break; + default: + { + ; + } + break; + } + +} + +struct sched_ops sched = { + .init = random_init, + .fini = random_fini, + .update = random_update, + .schedule = random_schedule, + .notify = random_notify +}; + +struct volume_options options[] = { + { .key = { "scheduler.refresh-interval", + "random.refresh-interval" }, + .type = GF_OPTION_TYPE_TIME + }, + { .key = { "scheduler.limits.min-free-disk", + "random.limits.min-free-disk" }, + .type = GF_OPTION_TYPE_PERCENT + }, + { .key = {NULL} } +}; diff --git 
a/scheduler/random/src/random.h b/scheduler/random/src/random.h new file mode 100644 index 000000000..35c9e02ee --- /dev/null +++ b/scheduler/random/src/random.h @@ -0,0 +1,46 @@ +/* + Copyright (c) 2006, 2007, 2008 Z RESEARCH, Inc. <http://www.zresearch.com> + This file is part of GlusterFS. + + GlusterFS is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published + by the Free Software Foundation; either version 3 of the License, + or (at your option) any later version. + + GlusterFS is distributed in the hope that it will be useful, but + WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program. If not, see + <http://www.gnu.org/licenses/>. +*/ + +#ifndef _RANDOM_H +#define _RANDOM_H + +#ifndef _CONFIG_H +#define _CONFIG_H +#include "config.h" +#endif + + +#include <sys/time.h> +#include "scheduler.h" + +struct random_sched_struct { + xlator_t *xl; + unsigned char eligible; +}; + +struct random_struct { + int32_t child_count; + uint32_t refresh_interval; + uint32_t min_free_disk; + struct timeval last_stat_entry; + struct random_sched_struct *array; + pthread_mutex_t random_mutex; +}; + +#endif /* _RANDOM_H */ diff --git a/scheduler/rr/Makefile.am b/scheduler/rr/Makefile.am new file mode 100644 index 000000000..d471a3f92 --- /dev/null +++ b/scheduler/rr/Makefile.am @@ -0,0 +1,3 @@ +SUBDIRS = src + +CLEANFILES = diff --git a/scheduler/rr/src/Makefile.am b/scheduler/rr/src/Makefile.am new file mode 100644 index 000000000..7e911c0ed --- /dev/null +++ b/scheduler/rr/src/Makefile.am @@ -0,0 +1,13 @@ +sched_LTLIBRARIES = rr.la +scheddir = $(libdir)/glusterfs/$(PACKAGE_VERSION)/scheduler + +rr_la_LDFLAGS = -module -avoidversion + +rr_la_LIBADD = $(top_builddir)/libglusterfs/src/libglusterfs.la 
+rr_la_SOURCES = rr.c rr-options.c +noinst_HEADERS = rr.h rr-options.h + +AM_CFLAGS = -fPIC -D_FILE_OFFSET_BITS=64 -D_GNU_SOURCE -Wall -D$(GF_HOST_OS) \ + -I$(top_srcdir)/libglusterfs/src -shared -nostartfiles $(GF_CFLAGS) + +CLEANFILES = diff --git a/scheduler/rr/src/rr-options.c b/scheduler/rr/src/rr-options.c new file mode 100644 index 000000000..3f0ffcaf2 --- /dev/null +++ b/scheduler/rr/src/rr-options.c @@ -0,0 +1,256 @@ +/* + Copyright (c) 2008 Z RESEARCH, Inc. <http://www.zresearch.com> + This file is part of GlusterFS. + + GlusterFS is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published + by the Free Software Foundation; either version 3 of the License, + or (at your option) any later version. + + GlusterFS is distributed in the hope that it will be useful, but + WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program. If not, see + <http://www.gnu.org/licenses/>. +*/ + +#include "scheduler.h" +#include "rr-options.h" + +#define RR_LIMITS_MIN_FREE_DISK_OPTION_STRING "scheduler.limits.min-free-disk" +#define RR_LIMITS_MIN_FREE_DISK_VALUE_DEFAULT 15 +#define RR_LIMITS_MIN_FREE_DISK_VALUE_MIN 0 +#define RR_LIMITS_MIN_FREE_DISK_VALUE_MAX 100 + +#define RR_REFRESH_INTERVAL_OPTION_STRING "scheduler.refresh-interval" +#define RR_REFRESH_INTERVAL_VALUE_DEFAULT 10 + +#define RR_READ_ONLY_SUBVOLUMES_OPTION_STRING "scheduler.read-only-subvolumes" + +#define LOG_ERROR(args...) gf_log ("rr-options", GF_LOG_ERROR, ##args) +#define LOG_WARNING(args...) 
gf_log ("rr-options", GF_LOG_WARNING, ##args) + +static int +_rr_options_min_free_disk_validate (const char *value_string, uint32_t *n) +{ + uint32_t value = 0; + + if (value_string == NULL) + { + return -1; + } + + if (gf_string2percent (value_string, &value) != 0) + { + gf_log ("rr", + GF_LOG_ERROR, + "invalid number format [%s] of option [%s]", + value_string, + RR_LIMITS_MIN_FREE_DISK_OPTION_STRING); + return -1; + } + + if ((value <= RR_LIMITS_MIN_FREE_DISK_VALUE_MIN) || + (value >= RR_LIMITS_MIN_FREE_DISK_VALUE_MAX)) + { + gf_log ("rr", + GF_LOG_ERROR, + "out of range [%d] of option [%s]. Allowed range is 0 to 100.", + value, + RR_LIMITS_MIN_FREE_DISK_OPTION_STRING); + return -1; + } + + *n = value; + + return 0; +} + +static int +_rr_options_refresh_interval_validate (const char *value_string, uint32_t *n) +{ + uint32_t value = 0; + + if (value_string == NULL) + { + return -1; + } + + if (gf_string2time (value_string, &value) != 0) + { + gf_log ("rr", + GF_LOG_ERROR, + "invalid number format [%s] of option [%s]", + value_string, + RR_REFRESH_INTERVAL_OPTION_STRING); + return -1; + } + + *n = value; + + return 0; +} + +static int +_rr_options_read_only_subvolumes_validate (const char *value_string, + char ***volume_list, + uint64_t *volume_count) +{ + char **vlist = NULL; + int vcount = 0; + int i = 0; + + if (value_string == NULL || volume_list == NULL || volume_count == NULL) /* reject missing arguments; volume_count is an out-parameter and must be non-NULL */ + { + return -1; + } + + if (gf_strsplit (value_string, + ", ", + &vlist, + &vcount) != 0) + { + gf_log ("rr", + GF_LOG_ERROR, + "invalid subvolume list [%s] of option [%s]", + value_string, + RR_READ_ONLY_SUBVOLUMES_OPTION_STRING); + return -1; + } + + for (i = 0; i < vcount; i++) + { + if (gf_volume_name_validate (vlist[i]) != 0) + { + gf_log ("rr", + GF_LOG_ERROR, + "invalid subvolume name [%s] in [%s] of option [%s]", + vlist[i], + value_string, + RR_READ_ONLY_SUBVOLUMES_OPTION_STRING); + goto free_exit; + } + } + + *volume_list = vlist; + *volume_count = vcount; + + return 0; + + 
free_exit: + for (i = 0; i < vcount; i++) + { + free (vlist[i]); + } + free (vlist); + + return -1; +} + +int +rr_options_validate (dict_t *options, rr_options_t *rr_options) +{ + char *value_string = NULL; + + if (options == NULL || rr_options == NULL) + { + return -1; + } + + if (dict_get (options, RR_LIMITS_MIN_FREE_DISK_OPTION_STRING)) + if (data_to_str (dict_get (options, RR_LIMITS_MIN_FREE_DISK_OPTION_STRING))) + value_string = data_to_str (dict_get (options, + RR_LIMITS_MIN_FREE_DISK_OPTION_STRING)); + if (value_string != NULL) + { + if (_rr_options_min_free_disk_validate (value_string, + &rr_options->min_free_disk) != 0) + { + return -1; + } + + gf_log ("rr", + GF_LOG_WARNING, + "using %s = %d", + RR_LIMITS_MIN_FREE_DISK_OPTION_STRING, + rr_options->min_free_disk); + } + else + { + rr_options->min_free_disk = RR_LIMITS_MIN_FREE_DISK_VALUE_DEFAULT; + + gf_log ("rr", GF_LOG_DEBUG, + "using %s = %d [default]", + RR_LIMITS_MIN_FREE_DISK_OPTION_STRING, + rr_options->min_free_disk); + } + + value_string = NULL; + if (dict_get (options, RR_REFRESH_INTERVAL_OPTION_STRING)) + value_string = data_to_str (dict_get (options, + RR_REFRESH_INTERVAL_OPTION_STRING)); + if (value_string != NULL) + { + if (_rr_options_refresh_interval_validate (value_string, + &rr_options->refresh_interval) != 0) + { + return -1; + } + + gf_log ("rr", + GF_LOG_WARNING, + "using %s = %d", + RR_REFRESH_INTERVAL_OPTION_STRING, + rr_options->refresh_interval); + } + else + { + rr_options->refresh_interval = RR_REFRESH_INTERVAL_VALUE_DEFAULT; + + gf_log ("rr", GF_LOG_DEBUG, + "using %s = %d [default]", + RR_REFRESH_INTERVAL_OPTION_STRING, + rr_options->refresh_interval); + } + + value_string = NULL; + if (dict_get (options, RR_READ_ONLY_SUBVOLUMES_OPTION_STRING)) + value_string = data_to_str (dict_get (options, + RR_READ_ONLY_SUBVOLUMES_OPTION_STRING)); + if (value_string != NULL) + { + if (_rr_options_read_only_subvolumes_validate (value_string, + &rr_options->read_only_subvolume_list, + 
&rr_options->read_only_subvolume_count) != 0) + { + return -1; + } + + gf_log ("rr", + GF_LOG_WARNING, + "using %s = [%s]", + RR_READ_ONLY_SUBVOLUMES_OPTION_STRING, + value_string); + } + + return 0; +} + +struct volume_options options[] = { + { .key = { "scheduler.refresh-interval", + "rr.refresh-interval" }, + .type = GF_OPTION_TYPE_TIME + }, + { .key = { "scheduler.limits.min-free-disk", + "rr.limits.min-free-disk" }, + .type = GF_OPTION_TYPE_PERCENT + }, + { .key = { "scheduler.read-only-subvolumes", + "rr.read-only-subvolumes" }, + .type = GF_OPTION_TYPE_ANY + }, + { .key = {NULL} } +}; diff --git a/scheduler/rr/src/rr-options.h b/scheduler/rr/src/rr-options.h new file mode 100644 index 000000000..4818c7d49 --- /dev/null +++ b/scheduler/rr/src/rr-options.h @@ -0,0 +1,34 @@ +/* + Copyright (c) 2008 Z RESEARCH, Inc. <http://www.zresearch.com> + This file is part of GlusterFS. + + GlusterFS is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published + by the Free Software Foundation; either version 3 of the License, + or (at your option) any later version. + + GlusterFS is distributed in the hope that it will be useful, but + WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program. If not, see + <http://www.gnu.org/licenses/>. 
+*/ + +#ifndef _RR_OPTIONS_H +#define _RR_OPTIONS_H + +struct rr_options +{ + uint32_t min_free_disk; + uint32_t refresh_interval; + char **read_only_subvolume_list; + uint64_t read_only_subvolume_count; +}; +typedef struct rr_options rr_options_t; + +int rr_options_validate (dict_t *options, rr_options_t *rr_options); + +#endif diff --git a/scheduler/rr/src/rr.c b/scheduler/rr/src/rr.c new file mode 100644 index 000000000..3e54ff5e1 --- /dev/null +++ b/scheduler/rr/src/rr.c @@ -0,0 +1,565 @@ +/* + Copyright (c) 2006, 2007, 2008 Z RESEARCH, Inc. <http://www.zresearch.com> + This file is part of GlusterFS. + + GlusterFS is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published + by the Free Software Foundation; either version 3 of the License, + or (at your option) any later version. + + GlusterFS is distributed in the hope that it will be useful, but + WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program. If not, see + <http://www.gnu.org/licenses/>. +*/ + +#ifndef _CONFIG_H +#define _CONFIG_H +#include "config.h" +#endif + +#include <sys/time.h> +#include <stdlib.h> + +#include <stdint.h> + +#include "scheduler.h" + +#include "rr-options.h" +#include "rr.h" + +#define RR_MIN_FREE_DISK_NOT_REACHED 0 +#define RR_MIN_FREE_DISK_REACHED 1 + +#define RR_SUBVOLUME_OFFLINE 0 +#define RR_SUBVOLUME_ONLINE 1 + +#define LOG_ERROR(args...) gf_log ("rr", GF_LOG_ERROR, ##args) +#define LOG_WARNING(args...) gf_log ("rr", GF_LOG_WARNING, ##args) +#define LOG_CRITICAL(args...) 
gf_log ("rr", GF_LOG_CRITICAL, ##args) + +#define ROUND_ROBIN(index, count) ((index + 1) % count) + +static int +_cleanup_rr (rr_t *rr) +{ + int i; + + if (rr == NULL) + { + return -1; + } + + if (rr->options.read_only_subvolume_list != NULL) + { + for (i = 0; i < rr->options.read_only_subvolume_count; i++) + { + free (rr->options.read_only_subvolume_list[i]); + } + free (rr->options.read_only_subvolume_list); + } + + free (rr->subvolume_list); + + free (rr); + + return 0; +} + +int +rr_init (xlator_t *this_xl) +{ + rr_t *rr = NULL; + dict_t *options = NULL; + xlator_list_t *children = NULL; + uint64_t children_count = 0; + int i = 0; + int j = 0; + + if (this_xl == NULL) + { + return -1; + } + + if ((options = this_xl->options) == NULL) + { + return -1; + } + + if ((children = this_xl->children) == NULL) + { + return -1; + } + + if ((rr = CALLOC (1, sizeof (rr_t))) == NULL) + { + return -1; + } + + if (rr_options_validate (options, &rr->options) != 0) + { + free (rr); + return -1; + } + + for (i = 0; i < rr->options.read_only_subvolume_count; i++) + { + char found = 0; + + for (children = this_xl->children; + children != NULL; + children = children->next) + { + if (strcmp (rr->options.read_only_subvolume_list[i], + children->xlator->name) == 0) + { + found = 1; + break; + } + } + + if (!found) + { + LOG_ERROR ("read-only subvolume [%s] not found in volume list", + rr->options.read_only_subvolume_list[i]); + _cleanup_rr (rr); + return -1; + } + } + + for (children = this_xl->children; + children != NULL; + children = children->next) + { + children_count++; + } + + /* bala: excluding read_only_subvolumes */ + if ((rr->subvolume_count = children_count - + rr->options.read_only_subvolume_count) == 0) + { + LOG_ERROR ("no writable volumes found for scheduling"); + _cleanup_rr (rr); + return -1; + } + + if ((rr->subvolume_list = CALLOC (rr->subvolume_count, + sizeof (rr_subvolume_t))) == NULL) + { + _cleanup_rr (rr); + return -1; + } + + i = 0; + j = 0; + for (children 
= this_xl->children; + children != NULL; + children = children->next) + { + char found = 0; + + for (j = 0; j < rr->options.read_only_subvolume_count; j++) + { + if (strcmp (rr->options.read_only_subvolume_list[i], + children->xlator->name) == 0) + { + found = 1; + break; + } + } + + if (!found) + { + rr_subvolume_t *subvolume = NULL; + + subvolume = &rr->subvolume_list[i]; + + subvolume->xl = children->xlator; + subvolume->free_disk_status = RR_MIN_FREE_DISK_NOT_REACHED; + subvolume->status = RR_SUBVOLUME_ONLINE; + + i++; + } + } + + rr->schedule_index = UINT64_MAX; + rr->last_stat_fetched_time.tv_sec = 0; + rr->last_stat_fetched_time.tv_usec = 0; + pthread_mutex_init (&rr->mutex, NULL); + + *((long *)this_xl->private) = (long)rr; + + return 0; +} + +void +rr_fini (xlator_t *this_xl) +{ + rr_t *rr = NULL; + + if (this_xl == NULL) + { + return; + } + + if ((rr = (rr_t *) *((long *)this_xl->private)) != NULL) + { + pthread_mutex_destroy (&rr->mutex); + _cleanup_rr (rr); + this_xl->private = NULL; + } + + return; +} + +xlator_t * +rr_schedule (xlator_t *this_xl, const void *path) +{ + rr_t *rr = NULL; + uint64_t next_schedule_index = 0; + int i = 0; + + if (this_xl == NULL || path == NULL) + { + return NULL; + } + + rr = (rr_t *) *((long *)this_xl->private); + next_schedule_index = ROUND_ROBIN (rr->schedule_index, + rr->subvolume_count); + + rr_update (this_xl); + + for (i = next_schedule_index; i < rr->subvolume_count; i++) + { + if (rr->subvolume_list[i].status == RR_SUBVOLUME_ONLINE && + rr->subvolume_list[i].status == RR_MIN_FREE_DISK_NOT_REACHED) + { + pthread_mutex_lock (&rr->mutex); + rr->schedule_index = i; + pthread_mutex_unlock (&rr->mutex); + return rr->subvolume_list[i].xl; + } + } + + for (i = 0; i < next_schedule_index; i++) + { + if (rr->subvolume_list[i].status == RR_SUBVOLUME_ONLINE && + rr->subvolume_list[i].status == RR_MIN_FREE_DISK_NOT_REACHED) + { + pthread_mutex_lock (&rr->mutex); + rr->schedule_index = i; + pthread_mutex_unlock (&rr->mutex); + 
return rr->subvolume_list[i].xl; + } + } + + for (i = next_schedule_index; i < rr->subvolume_count; i++) + { + if (rr->subvolume_list[i].status == RR_SUBVOLUME_ONLINE) + { + pthread_mutex_lock (&rr->mutex); + rr->schedule_index = i; + pthread_mutex_unlock (&rr->mutex); + return rr->subvolume_list[i].xl; + } + } + + for (i = 0; i < next_schedule_index; i++) + { + if (rr->subvolume_list[i].status == RR_SUBVOLUME_ONLINE) + { + pthread_mutex_lock (&rr->mutex); + rr->schedule_index = i; + pthread_mutex_unlock (&rr->mutex); + return rr->subvolume_list[i].xl; + } + } + + return NULL; +} + +void +rr_update (xlator_t *this_xl) +{ + rr_t *rr = NULL; + struct timeval ctime = {0, 0}; + int i = 0; + + if (this_xl == NULL) + { + return ; + } + + if ((rr = (rr_t *) *((long *)this_xl->private)) == NULL) + { + return ; + } + + if (gettimeofday (&ctime, NULL) != 0) + { + return ; + } + + if (ctime.tv_sec > (rr->options.refresh_interval + + rr->last_stat_fetched_time.tv_sec)) + { + pthread_mutex_lock (&rr->mutex); + rr->last_stat_fetched_time = ctime; + pthread_mutex_unlock (&rr->mutex); + + for (i = 0; i < rr->subvolume_count; i++) + { + xlator_t *subvolume_xl = NULL; + call_frame_t *frame = NULL; + call_pool_t *pool = NULL; + + subvolume_xl = rr->subvolume_list[i].xl; + + pool = this_xl->ctx->pool; + + frame = create_frame (this_xl, pool); + + STACK_WIND_COOKIE (frame, + rr_update_cbk, + subvolume_xl->name, + subvolume_xl, + subvolume_xl->mops->stats, + 0); + } + } + + return ; +} + +int +rr_update_cbk (call_frame_t *frame, + void *cookie, + xlator_t *this_xl, + int32_t op_ret, + int32_t op_errno, + struct xlator_stats *stats) +{ + rr_t *rr = NULL; + rr_subvolume_t *subvolume = NULL; + uint8_t free_disk_percent = 0; + int i = 0; + + if (frame == NULL) + { + return -1; + } + + if (cookie == NULL || this_xl == NULL) + { + STACK_DESTROY (frame->root); + return -1; + } + + if (op_ret == 0 && stats == NULL) + { + LOG_CRITICAL ("fatal! op_ret is 0 and stats is NULL. 
" + "Please report this to <gluster-devel@nongnu.org>"); + STACK_DESTROY (frame->root); + return -1; + } + + if ((rr = (rr_t *) *((long *)this_xl->private)) == NULL) + { + STACK_DESTROY (frame->root); + return -1; + } + + for (i = 0; i < rr->subvolume_count; i++) + { + if (rr->subvolume_list[i].xl->name == (char *) cookie) + { + subvolume = &rr->subvolume_list[i]; + break; + } + } + + if (subvolume == NULL) + { + LOG_ERROR ("unknown cookie [%s]", (char *) cookie); + STACK_DESTROY (frame->root); + return -1; + } + + if (op_ret == 0) + { + free_disk_percent = (stats->free_disk * 100) / stats->total_disk_size; + if (free_disk_percent > rr->options.min_free_disk) + { + if (subvolume->free_disk_status != RR_MIN_FREE_DISK_NOT_REACHED) + { + pthread_mutex_lock (&rr->mutex); + subvolume->free_disk_status = RR_MIN_FREE_DISK_NOT_REACHED; + pthread_mutex_unlock (&rr->mutex); + LOG_WARNING ("subvolume [%s] is available with free space for scheduling", + subvolume->xl->name); + } + } + else + { + if (subvolume->free_disk_status != RR_MIN_FREE_DISK_REACHED) + { + pthread_mutex_lock (&rr->mutex); + subvolume->free_disk_status = RR_MIN_FREE_DISK_REACHED; + pthread_mutex_unlock (&rr->mutex); + LOG_WARNING ("subvolume [%s] reached minimum disk space requirement", + subvolume->xl->name); + } + } + } + else + { + pthread_mutex_lock (&rr->mutex); + subvolume->status = RR_SUBVOLUME_OFFLINE; + pthread_mutex_unlock (&rr->mutex); + LOG_ERROR ("unable to get subvolume [%s] status information and " + "scheduling is disabled", + subvolume->xl->name); + } + + STACK_DESTROY (frame->root); + return 0; +} + +void +rr_notify (xlator_t *this_xl, int32_t event, void *data) +{ + rr_t *rr = NULL; + rr_subvolume_t *subvolume = NULL; + xlator_t *subvolume_xl = NULL; + int i = 0, ret = 0; + call_frame_t *frame = NULL; + call_pool_t *pool = NULL; + dict_t *xattr = get_new_dict (); + int32_t version[1] = {1}; + + if (this_xl == NULL || data == NULL) { + return ; + } + + if ((rr = (rr_t *) *((long 
*)this_xl->private)) == NULL) { + return ; + } + + subvolume_xl = (xlator_t *) data; + + for (i = 0; i < rr->subvolume_count; i++) { + if (rr->subvolume_list[i].xl == subvolume_xl) { + subvolume = &rr->subvolume_list[i]; + break; + } + } + + switch (event) { + case GF_EVENT_CHILD_UP: + /* Seeding, to be done only once */ + if (rr->first_time && (i == rr->subvolume_count)) { + loc_t loc = {0,}; + xlator_t *trav = NULL; + + pool = this_xl->ctx->pool; + frame = create_frame (this_xl, pool); + ret = dict_set_bin (xattr, "trusted.glusterfs.scheduler.rr", + version, sizeof (int32_t)); + if (-1 == ret) { + gf_log (this_xl->name, GF_LOG_ERROR, "rr seed setting failed"); + } + if (xattr) + dict_ref (xattr); + + loc.path = strdup ("/"); + for (trav = this_xl->parents->xlator; trav; trav = trav->parents->xlator) { + if (trav->itable) { + loc.inode = trav->itable->root; + break; + } + } + STACK_WIND (frame, + rr_notify_cbk, + (xlator_t *)data, + ((xlator_t *)data)->fops->xattrop, + &loc, + GF_XATTROP_ADD_ARRAY, + xattr); + + if (xattr) + dict_unref (xattr); + + rr->first_time = 0; + } + if (subvolume) { + pthread_mutex_lock (&rr->mutex); + subvolume->status = RR_SUBVOLUME_ONLINE; + pthread_mutex_unlock (&rr->mutex); + } + break; + case GF_EVENT_CHILD_DOWN: + if (subvolume) { + pthread_mutex_lock (&rr->mutex); + subvolume->status = RR_SUBVOLUME_OFFLINE; + pthread_mutex_unlock (&rr->mutex); + } + break; + } + + return ; +} + +int +rr_notify_cbk (call_frame_t *frame, + void *cookie, + xlator_t *this_xl, + int32_t op_ret, + int32_t op_errno, + dict_t *xattr) +{ + rr_t *rr = NULL; + int32_t *index = NULL; + int32_t ret = -1; + void *tmp_index_ptr = NULL; + + if (frame == NULL) + { + return -1; + } + + if ((this_xl == NULL) || (op_ret == -1)) + { + STACK_DESTROY (frame->root); + return -1; + } + + if ((rr = (rr_t *) *((long *)this_xl->private)) == NULL) + { + STACK_DESTROY (frame->root); + return -1; + } + + ret = dict_get_bin (xattr, "trusted.glusterfs.scheduler.rr", 
&tmp_index_ptr); + index = tmp_index_ptr; + if (ret == 0) + rr->schedule_index = (index[0] % rr->subvolume_count); + else + rr->schedule_index = 0; + + STACK_DESTROY (frame->root); + return 0; +} + +struct sched_ops sched = { + .init = rr_init, + .fini = rr_fini, + .update = rr_update, + .schedule = rr_schedule, + .notify = rr_notify +}; + diff --git a/scheduler/rr/src/rr.h b/scheduler/rr/src/rr.h new file mode 100644 index 000000000..baa471209 --- /dev/null +++ b/scheduler/rr/src/rr.h @@ -0,0 +1,70 @@ +/* + Copyright (c) 2006, 2007, 2008 Z RESEARCH, Inc. <http://www.zresearch.com> + This file is part of GlusterFS. + + GlusterFS is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published + by the Free Software Foundation; either version 3 of the License, + or (at your option) any later version. + + GlusterFS is distributed in the hope that it will be useful, but + WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program. If not, see + <http://www.gnu.org/licenses/>. 
+*/ + +#ifndef _RR_H +#define _RR_H + +#ifndef _CONFIG_H +#define _CONFIG_H +#include "config.h" +#endif + +#include "scheduler.h" +#include <stdint.h> +#include <sys/time.h> + +struct rr_subvolume +{ + xlator_t *xl; + uint8_t free_disk_status; + uint8_t status; +}; +typedef struct rr_subvolume rr_subvolume_t; + +struct rr +{ + rr_options_t options; + rr_subvolume_t *subvolume_list; + uint64_t subvolume_count; + uint64_t schedule_index; + struct timeval last_stat_fetched_time; + pthread_mutex_t mutex; + char first_time; +}; +typedef struct rr rr_t; + +int rr_init (xlator_t *this_xl); +void rr_fini (xlator_t *this_xl); +xlator_t *rr_schedule (xlator_t *this_xl, const void *path); +void rr_update (xlator_t *this_xl); +int rr_update_cbk (call_frame_t *frame, + void *cookie, + xlator_t *this_xl, + int32_t op_ret, + int32_t op_errno, + struct xlator_stats *stats); +void rr_notify (xlator_t *this_xl, int32_t event, void *data); +int rr_notify_cbk (call_frame_t *frame, + void *cookie, + xlator_t *this_xl, + int32_t op_ret, + int32_t op_errno, + dict_t *xattr); + +#endif /* _RR_H */ diff --git a/scheduler/switch/Makefile.am b/scheduler/switch/Makefile.am new file mode 100644 index 000000000..d471a3f92 --- /dev/null +++ b/scheduler/switch/Makefile.am @@ -0,0 +1,3 @@ +SUBDIRS = src + +CLEANFILES = diff --git a/scheduler/switch/src/Makefile.am b/scheduler/switch/src/Makefile.am new file mode 100644 index 000000000..dc7d16d40 --- /dev/null +++ b/scheduler/switch/src/Makefile.am @@ -0,0 +1,12 @@ +sched_LTLIBRARIES = switch.la +scheddir = $(libdir)/glusterfs/$(PACKAGE_VERSION)/scheduler + +switch_la_LDFLAGS = -module -avoidversion + +switch_la_SOURCES = switch.c +switch_la_LIBADD = $(top_builddir)/libglusterfs/src/libglusterfs.la + +AM_CFLAGS = -fPIC -D_FILE_OFFSET_BITS=64 -D_GNU_SOURCE -Wall -D$(GF_HOST_OS) \ + -I$(top_srcdir)/libglusterfs/src -shared -nostartfiles $(GF_CFLAGS) + +CLEANFILES = diff --git a/scheduler/switch/src/switch.c b/scheduler/switch/src/switch.c new file 
mode 100644 index 000000000..70b307187 --- /dev/null +++ b/scheduler/switch/src/switch.c @@ -0,0 +1,398 @@ +/* + Copyright (c) 2006, 2007, 2008 Z RESEARCH, Inc. <http://www.zresearch.com> + This file is part of GlusterFS. + + GlusterFS is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published + by the Free Software Foundation; either version 3 of the License, + or (at your option) any later version. + + GlusterFS is distributed in the hope that it will be useful, but + WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program. If not, see + <http://www.gnu.org/licenses/>. +*/ + +#include <sys/time.h> +#include <stdlib.h> +#include <fnmatch.h> + +#ifndef _CONFIG_H +#define _CONFIG_H +#include "config.h" +#endif + +#include "xlator.h" +#include "scheduler.h" + +struct switch_sched_array { + xlator_t *xl; + int32_t eligible; + int32_t considered; +}; + +/* Select one of this struct based on the path's pattern match */ +struct switch_sched_struct { + struct switch_sched_struct *next; + struct switch_sched_array *array; + char path_pattern[256]; + int32_t node_index; /* Index of the node in + this pattern. */ + int32_t num_child; /* Total num of child nodes + with this pattern. 
*/ +}; + +struct switch_struct { + struct switch_sched_struct *cond; + struct switch_sched_array *array; + pthread_mutex_t switch_mutex; + int32_t child_count; +}; + +/* This function should return child node as '*:subvolumes' is inserterd */ +static xlator_t * +switch_get_matching_xl (const char *path, struct switch_sched_struct *cond) +{ + struct switch_sched_struct *trav = cond; + char *pathname = strdup (path); + int index = 0; + + while (trav) { + if (fnmatch (trav->path_pattern, + pathname, FNM_NOESCAPE) == 0) { + free (pathname); + trav->node_index %= trav->num_child; + index = (trav->node_index++) % trav->num_child; + return trav->array[index].xl; + } + trav = trav->next; + } + free (pathname); + return NULL; +} + + +static int32_t +switch_init (xlator_t *xl) +{ + int32_t index = 0; + data_t *data = NULL; + char *child = NULL; + char *tmp = NULL; + char *childs_data = NULL; + xlator_list_t *trav_xl = xl->children; + struct switch_struct *switch_buf = NULL; + + switch_buf = CALLOC (1, sizeof (struct switch_struct)); + ERR_ABORT (switch_buf); + + while (trav_xl) { + index++; + trav_xl = trav_xl->next; + } + switch_buf->child_count = index; + switch_buf->array = CALLOC (index + 1, + sizeof (struct switch_sched_struct)); + ERR_ABORT (switch_buf->array); + trav_xl = xl->children; + index = 0; + + while (trav_xl) { + switch_buf->array[index].xl = trav_xl->xlator; + switch_buf->array[index].eligible = 1; + trav_xl = trav_xl->next; + index++; + } + + data = dict_get (xl->options, "scheduler.read-only-subvolumes"); + if (data) { + childs_data = strdup (data->data); + child = strtok_r (childs_data, ",", &tmp); + while (child) { + for (index = 1; + index < switch_buf->child_count; index++) { + if (strcmp (switch_buf->array[index - 1].xl->name, child) == 0) { + gf_log ("switch", GF_LOG_DEBUG, + "Child '%s' is read-only", + child); + memcpy (&(switch_buf->array[index-1]), + &(switch_buf->array[switch_buf->child_count - 1]), + sizeof (struct switch_sched_array)); + 
switch_buf->child_count--; + break; + } + } + child = strtok_r (NULL, ",", &tmp); + } + free (childs_data); + } + + data = dict_get (xl->options, "scheduler.local-volume-name"); + if (data) { + /* Means, give preference to that node first */ + gf_log ("switch", GF_LOG_DEBUG, + "local volume defined as %s", data->data); + + /* TODO: parse it properly, have an extra index to + specify that first */ + } + + /* *jpg:child1,child2;*mpg:child3;*:child4,child5,child6 */ + data = dict_get (xl->options, "scheduler.switch.case"); + if (data) { + char *tmp_str = NULL; + char *tmp_str1 = NULL; + char *dup_str = NULL; + char *switch_str = NULL; + char *pattern = NULL; + char *childs = NULL; + struct switch_sched_struct *switch_opt = NULL; + struct switch_sched_struct *trav = NULL; + /* Get the pattern for considering switch case. + "option block-size *avi:10MB" etc */ + switch_str = strtok_r (data->data, ";", &tmp_str); + while (switch_str) { + dup_str = strdup (switch_str); + switch_opt = + CALLOC (1, + sizeof (struct switch_sched_struct)); + ERR_ABORT (switch_opt); + + /* Link it to the main structure */ + if (switch_buf->cond) { + /* there are already few entries */ + trav = switch_buf->cond; + while (trav->next) + trav = trav->next; + trav->next = switch_opt; + } else { + /* First entry */ + switch_buf->cond = switch_opt; + } + pattern = strtok_r (dup_str, ":", &tmp_str1); + childs = strtok_r (NULL, ":", &tmp_str1); + if (strncmp (pattern, "*", 2) == 0) { + gf_log ("switch", GF_LOG_WARNING, + "'*' pattern will be taken by default " + "for all the unconfigured child nodes," + " hence neglecting current option"); + switch_str = strtok_r (NULL, ";", &tmp_str); + free (dup_str); + continue; + } + memcpy (switch_opt->path_pattern, + pattern, strlen (pattern)); + if (childs) { + int32_t idx = 0; + char *tmp1 = NULL; + char *dup_childs = NULL; + /* TODO: get the list of child nodes for + the given pattern */ + dup_childs = strdup (childs); + child = strtok_r (dup_childs, ",", 
&tmp); + while (child) { + idx++; + child = strtok_r (NULL, ",", &tmp); + } + free (dup_childs); + child = strtok_r (childs, ",", &tmp1); + switch_opt->num_child = idx; + switch_opt->array = + CALLOC (1, idx * sizeof (struct switch_sched_array)); + ERR_ABORT (switch_opt->array); + idx = 0; + child = strtok_r (childs, ",", &tmp); + while (child) { + for (index = 1; + index < switch_buf->child_count; + index++) { + if (strcmp (switch_buf->array[index - 1].xl->name, + child) == 0) { + gf_log ("switch", + GF_LOG_DEBUG, + "'%s' pattern will be scheduled to \"%s\"", + switch_opt->path_pattern, child); + /* + if (switch_buf->array[index-1].considered) { + gf_log ("switch", GF_LOG_DEBUG, + "ambiguity found, exiting"); + return -1; + } + */ + switch_opt->array[idx].xl = switch_buf->array[index-1].xl; + switch_buf->array[index-1].considered = 1; + idx++; + break; + } + } + child = strtok_r (NULL, ",", &tmp1); + } + } else { + /* error */ + gf_log ("switch", GF_LOG_ERROR, + "Check \"scheduler.switch.case\" " + "option in unify volume. Exiting"); + free (switch_buf->array); + free (switch_buf); + return -1; + } + free (dup_str); + switch_str = strtok_r (NULL, ";", &tmp_str); + } + } + /* Now, all the pattern based considerations done, so for all the + * remaining pattern, '*' to all the remaining child nodes + */ + { + struct switch_sched_struct *switch_opt = NULL; + int32_t flag = 0; + int32_t index = 0; + for (index=0; index < switch_buf->child_count; index++) { + /* check for considered flag */ + if (switch_buf->array[index].considered) + continue; + flag++; + } + if (!flag) { + gf_log ("switch", GF_LOG_ERROR, + "No nodes left for pattern '*'. 
Exiting."); + return -1; + } + switch_opt = CALLOC (1, sizeof (struct switch_sched_struct)); + ERR_ABORT (switch_opt); + if (switch_buf->cond) { + /* there are already few entries */ + struct switch_sched_struct *trav = switch_buf->cond; + while (trav->next) + trav = trav->next; + trav->next = switch_opt; + } else { + /* First entry */ + switch_buf->cond = switch_opt; + } + /* Add the '*' pattern to the array */ + memcpy (switch_opt->path_pattern, "*", 2); + switch_opt->num_child = flag; + switch_opt->array = + CALLOC (1, flag * sizeof (struct switch_sched_array)); + ERR_ABORT (switch_opt->array); + flag = 0; + for (index=0; index < switch_buf->child_count; index++) { + /* check for considered flag */ + if (switch_buf->array[index].considered) + continue; + gf_log ("switch", GF_LOG_DEBUG, + "'%s' pattern will be scheduled to \"%s\"", + switch_opt->path_pattern, + switch_buf->array[index].xl->name); + switch_opt->array[flag].xl = + switch_buf->array[index].xl; + switch_buf->array[index].considered = 1; + flag++; + } + } + + pthread_mutex_init (&switch_buf->switch_mutex, NULL); + + // put it at the proper place + *((long *)xl->private) = (long)switch_buf; + + return 0; +} + +static void +switch_fini (xlator_t *xl) +{ + /* TODO: free all the allocated entries */ + struct switch_struct *switch_buf = NULL; + switch_buf = (struct switch_struct *)*((long *)xl->private); + + pthread_mutex_destroy (&switch_buf->switch_mutex); + free (switch_buf->array); + free (switch_buf); +} + +static xlator_t * +switch_schedule (xlator_t *xl, const void *path) +{ + struct switch_struct *switch_buf = NULL; + switch_buf = (struct switch_struct *)*((long *)xl->private); + + return switch_get_matching_xl (path, switch_buf->cond); +} + + +/** + * notify + */ +void +switch_notify (xlator_t *xl, int32_t event, void *data) +{ + /* TODO: This should be checking in switch_sched_struct */ +#if 0 + struct switch_struct *switch_buf = NULL; + int32_t idx = 0; + + switch_buf = (struct switch_struct 
*)*((long *)xl->private); + if (!switch_buf) + return; + + for (idx = 0; idx < switch_buf->child_count; idx++) { + if (switch_buf->array[idx].xl == (xlator_t *)data) + break; + } + + switch (event) + { + case GF_EVENT_CHILD_UP: + { + switch_buf->array[idx].eligible = 1; + } + break; + case GF_EVENT_CHILD_DOWN: + { + switch_buf->array[idx].eligible = 0; + } + break; + default: + { + ; + } + break; + } +#endif +} + +static void +switch_update (xlator_t *xl) +{ + return; +} + +struct sched_ops sched = { + .init = switch_init, + .fini = switch_fini, + .update = switch_update, + .schedule = switch_schedule, + .notify = switch_notify +}; + +struct volume_options options[] = { + { .key = { "scheduler.read-only-subvolumes" , + "switch.read-only-subvolumes"}, + .type = GF_OPTION_TYPE_ANY + }, + { .key = { "scheduler.local-volume-name", + "switch.nufa.local-volume-name" }, + .type = GF_OPTION_TYPE_XLATOR + }, + { .key = { "scheduler.switch.case", + "switch.case" }, + .type = GF_OPTION_TYPE_ANY + }, + { .key = {NULL} } +}; diff --git a/transport/Makefile.am b/transport/Makefile.am new file mode 100644 index 000000000..e2f97437c --- /dev/null +++ b/transport/Makefile.am @@ -0,0 +1,3 @@ +SUBDIRS = socket $(IBVERBS_SUBDIR) + +CLEANFILES = diff --git a/transport/ib-verbs/Makefile.am b/transport/ib-verbs/Makefile.am new file mode 100644 index 000000000..f963effea --- /dev/null +++ b/transport/ib-verbs/Makefile.am @@ -0,0 +1 @@ +SUBDIRS = src \ No newline at end of file diff --git a/transport/ib-verbs/src/Makefile.am b/transport/ib-verbs/src/Makefile.am new file mode 100644 index 000000000..e6240090e --- /dev/null +++ b/transport/ib-verbs/src/Makefile.am @@ -0,0 +1,15 @@ +noinst_HEADERS = ib-verbs.h name.h + +transport_LTLIBRARIES = ib-verbs.la +transportdir = $(libdir)/glusterfs/$(PACKAGE_VERSION)/transport/ + +ib_verbs_la_LDFLAGS = -module -avoidversion + +ib_verbs_la_SOURCES = ib-verbs.c name.c +ib_verbs_la_LIBADD = -libverbs $(top_builddir)/libglusterfs/src/libglusterfs.la + 
+AM_CFLAGS = -fPIC -D_FILE_OFFSET_BITS=64 -D_GNU_SOURCE -Wall -D$(GF_HOST_OS)\ + -I$(top_srcdir)/libglusterfs/src -I$(top_srcdir)/transport/ib-verbs \ + -shared -nostartfiles + +CLEANFILES = *~ diff --git a/transport/ib-verbs/src/ib-verbs.c b/transport/ib-verbs/src/ib-verbs.c new file mode 100644 index 000000000..b9329588e --- /dev/null +++ b/transport/ib-verbs/src/ib-verbs.c @@ -0,0 +1,2392 @@ +/* + Copyright (c) 2006, 2007, 2008 Z RESEARCH, Inc. <http://www.zresearch.com> + This file is part of GlusterFS. + + GlusterFS is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published + by the Free Software Foundation; either version 3 of the License, + or (at your option) any later version. + + GlusterFS is distributed in the hope that it will be useful, but + WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program. If not, see + <http://www.gnu.org/licenses/>. 
+*/ + + +#ifndef _CONFIG_H +#define _CONFIG_H +#include "config.h" +#endif + +#include "dict.h" +#include "glusterfs.h" +#include "transport.h" +#include "protocol.h" +#include "logging.h" +#include "xlator.h" +#include "name.h" +#include "ib-verbs.h" +#include <signal.h> + +int32_t +gf_resolve_ip6 (const char *hostname, + uint16_t port, + int family, + void **dnscache, + struct addrinfo **addr_info); + +static uint16_t +ib_verbs_get_local_lid (struct ibv_context *context, + int32_t port) +{ + struct ibv_port_attr attr; + + if (ibv_query_port (context, port, &attr)) + return 0; + + return attr.lid; +} + + +static void +ib_verbs_put_post (ib_verbs_queue_t *queue, + ib_verbs_post_t *post) +{ + pthread_mutex_lock (&queue->lock); + if (post->prev) { + queue->active_count--; + post->prev->next = post->next; + } + if (post->next) + post->next->prev = post->prev; + post->prev = &queue->passive_posts; + post->next = post->prev->next; + post->prev->next = post; + post->next->prev = post; + queue->passive_count++; + pthread_mutex_unlock (&queue->lock); +} + + +static ib_verbs_post_t * +ib_verbs_new_post (ib_verbs_device_t *device, int32_t len) +{ + ib_verbs_post_t *post; + + post = (ib_verbs_post_t *) CALLOC (1, sizeof (*post)); + if (!post) + return NULL; + + post->buf_size = len; + + post->buf = valloc (len); + if (!post->buf) { + free (post); + return NULL; + } + + post->mr = ibv_reg_mr (device->pd, + post->buf, + post->buf_size, + IBV_ACCESS_LOCAL_WRITE); + if (!post->mr) { + free (post->buf); + free (post); + return NULL; + } + + return post; +} + + +static ib_verbs_post_t * +ib_verbs_get_post (ib_verbs_queue_t *queue) +{ + ib_verbs_post_t *post; + + pthread_mutex_lock (&queue->lock); + { + post = queue->passive_posts.next; + if (post == &queue->passive_posts) + post = NULL; + + if (post) { + if (post->prev) + post->prev->next = post->next; + if (post->next) + post->next->prev = post->prev; + post->prev = &queue->active_posts; + post->next = post->prev->next; + 
post->prev->next = post; + post->next->prev = post; + post->reused++; + queue->active_count++; + } + } + pthread_mutex_unlock (&queue->lock); + + return post; +} + +void +ib_verbs_destroy_post (ib_verbs_post_t *post) +{ + ibv_dereg_mr (post->mr); + free (post->buf); + free (post); +} + + +static int32_t +__ib_verbs_quota_get (ib_verbs_peer_t *peer) +{ + int32_t ret = -1; + ib_verbs_private_t *priv = peer->trans->private; + + if (priv->connected && peer->quota > 0) { + ret = peer->quota--; + } + + return ret; +} + +/* + static int32_t + ib_verbs_quota_get (ib_verbs_peer_t *peer) + { + int32_t ret = -1; + ib_verbs_private_t *priv = peer->trans->private; + + pthread_mutex_lock (&priv->write_mutex); + { + ret = __ib_verbs_quota_get (peer); + } + pthread_mutex_unlock (&priv->write_mutex); + + return ret; + } +*/ + +static void +__ib_verbs_ioq_entry_free (ib_verbs_ioq_t *entry) +{ + list_del_init (&entry->list); + if (entry->refs) + dict_unref (entry->refs); + + /* TODO: use mem-pool */ + free (entry->buf); + + /* TODO: use mem-pool */ + free (entry); +} + + +static void +__ib_verbs_ioq_flush (ib_verbs_peer_t *peer) +{ + ib_verbs_ioq_t *entry = NULL, *dummy = NULL; + + list_for_each_entry_safe (entry, dummy, &peer->ioq, list) { + __ib_verbs_ioq_entry_free (entry); + } +} + + +static int32_t +__ib_verbs_disconnect (transport_t *this) +{ + ib_verbs_private_t *priv = this->private; + int32_t ret = 0; + + if (priv->connected || priv->tcp_connected) { + fcntl (priv->sock, F_SETFL, O_NONBLOCK); + if (shutdown (priv->sock, SHUT_RDWR) != 0) { + gf_log ("transport/ib-verbs", + GF_LOG_ERROR, + "shutdown () - error: %s", + strerror (errno)); + ret = -errno; + priv->tcp_connected = 0; + } + } + + return ret; +} + + +static int32_t +ib_verbs_post_send (struct ibv_qp *qp, + ib_verbs_post_t *post, + int32_t len) +{ + struct ibv_sge list = { + .addr = (unsigned long) post->buf, + .length = len, + .lkey = post->mr->lkey + }; + + struct ibv_send_wr wr = { + .wr_id = (unsigned long) post, 
+ .sg_list = &list, + .num_sge = 1, + .opcode = IBV_WR_SEND, + .send_flags = IBV_SEND_SIGNALED, + }, *bad_wr; + + if (!qp) + return -1; + + return ibv_post_send (qp, &wr, &bad_wr); +} + + +static int32_t +__ib_verbs_ioq_churn_entry (ib_verbs_peer_t *peer, ib_verbs_ioq_t *entry) +{ + int32_t ret = 0, quota = 0; + ib_verbs_private_t *priv = peer->trans->private; + ib_verbs_device_t *device = priv->device; + ib_verbs_options_t *options = &priv->options; + ib_verbs_post_t *post = NULL; + int32_t len = 0; + + quota = __ib_verbs_quota_get (peer); + if (quota > 0) { + post = ib_verbs_get_post (&device->sendq); + if (!post) + post = ib_verbs_new_post (device, + (options->send_size + 2048)); + + len = iov_length ((const struct iovec *)&entry->vector, + entry->count); + if (len >= (options->send_size + 2048)) { + gf_log ("transport/ib-verbs", GF_LOG_CRITICAL, + "increase value of option 'transport.ib-verbs." + "work-request-send-size' (given=> %d) to send " + "bigger (%d) messages", + (options->send_size + 2048), len); + return -1; + } + + iov_unload (post->buf, + (const struct iovec *)&entry->vector, + entry->count); + + ret = ib_verbs_post_send (peer->qp, post, len); + if (!ret) { + __ib_verbs_ioq_entry_free (entry); + ret = len; + } else { + gf_log ("transport/ib-verbs", GF_LOG_ERROR, + "ibv_post_send failed with ret = %d", ret); + ib_verbs_put_post (&device->sendq, post); + __ib_verbs_disconnect (peer->trans); + ret = -1; + } + } + + return ret; +} + + +static int32_t +__ib_verbs_ioq_churn (ib_verbs_peer_t *peer) +{ + ib_verbs_ioq_t *entry = NULL; + int32_t ret = 0; + + while (!list_empty (&peer->ioq)) + { + /* pick next entry */ + entry = peer->ioq_next; + + ret = __ib_verbs_ioq_churn_entry (peer, entry); + + if (ret <= 0) + break; + } + + /* + list_for_each_entry_safe (entry, dummy, &peer->ioq, list) { + ret = __ib_verbs_ioq_churn_entry (peer, entry); + if (ret <= 0) { + break; + } + } + */ + + return ret; +} + +static int32_t +__ib_verbs_quota_put (ib_verbs_peer_t 
*peer) +{ + int32_t ret; + + peer->quota++; + ret = peer->quota; + + if (!list_empty (&peer->ioq)) { + ret = __ib_verbs_ioq_churn (peer); + } + + return ret; +} + + +static int32_t +ib_verbs_quota_put (ib_verbs_peer_t *peer) +{ + int32_t ret; + ib_verbs_private_t *priv = peer->trans->private; + + pthread_mutex_lock (&priv->write_mutex); + { + ret = __ib_verbs_quota_put (peer); + } + pthread_mutex_unlock (&priv->write_mutex); + + return ret; +} + + +static int32_t +ib_verbs_post_recv (struct ibv_srq *srq, + ib_verbs_post_t *post) +{ + struct ibv_sge list = { + .addr = (unsigned long) post->buf, + .length = post->buf_size, + .lkey = post->mr->lkey + }; + + struct ibv_recv_wr wr = { + .wr_id = (unsigned long) post, + .sg_list = &list, + .num_sge = 1, + }, *bad_wr; + + return ibv_post_srq_recv (srq, &wr, &bad_wr); +} + + +static int32_t +ib_verbs_writev (transport_t *this, + ib_verbs_ioq_t *entry) +{ + int32_t ret = 0, need_append = 1; + ib_verbs_private_t *priv = this->private; + ib_verbs_peer_t *peer = NULL; + + pthread_mutex_lock (&priv->write_mutex); + { + if (!priv->connected) { + gf_log (this->xl->name, GF_LOG_ERROR, + "ib-verbs is not connected to post a " + "send request"); + ret = -1; + goto unlock; + } + + peer = &priv->peer; + if (list_empty (&peer->ioq)) { + ret = __ib_verbs_ioq_churn_entry (peer, entry); + if (ret > 0) { + need_append = 0; + } + } + + if (need_append) { + list_add_tail (&entry->list, &peer->ioq); + } + } +unlock: + pthread_mutex_unlock (&priv->write_mutex); + return ret; +} + + +static ib_verbs_ioq_t * +ib_verbs_ioq_new (char *buf, int len, struct iovec *vector, + int count, dict_t *refs) +{ + ib_verbs_ioq_t *entry = NULL; + + /* TODO: use mem-pool */ + entry = CALLOC (1, sizeof (*entry)); + + assert (count <= (MAX_IOVEC-2)); + + entry->header.colonO[0] = ':'; + entry->header.colonO[1] = 'O'; + entry->header.colonO[2] = '\0'; + entry->header.version = 42; + entry->header.size1 = hton32 (len); + entry->header.size2 = hton32 (iov_length 
(vector, count)); + + entry->vector[0].iov_base = &entry->header; + entry->vector[0].iov_len = sizeof (entry->header); + entry->count++; + + entry->vector[1].iov_base = buf; + entry->vector[1].iov_len = len; + entry->count++; + + if (vector && count) + { + memcpy (&entry->vector[2], vector, sizeof (*vector) * count); + entry->count += count; + } + + if (refs) + entry->refs = dict_ref (refs); + + entry->buf = buf; + + INIT_LIST_HEAD (&entry->list); + + return entry; +} + + +static int32_t +ib_verbs_submit (transport_t *this, char *buf, int32_t len, + struct iovec *vector, int count, dict_t *refs) +{ + int32_t ret = 0; + ib_verbs_ioq_t *entry = NULL; + + entry = ib_verbs_ioq_new (buf, len, vector, count, refs); + ret = ib_verbs_writev (this, entry); + + if (ret > 0) { + ret = 0; + } + + return ret; +} + +static int +ib_verbs_receive (transport_t *this, char **hdr_p, size_t *hdrlen_p, + char **buf_p, size_t *buflen_p) +{ + ib_verbs_private_t *priv = this->private; + /* TODO: return error if !priv->connected, check with locks */ + /* TODO: boundry checks for data_ptr/offset */ + char *copy_from = NULL; + ib_verbs_header_t *header = NULL; + uint32_t size1, size2, data_len = 0; + char *hdr = NULL, *buf = NULL; + int32_t ret = 0; + + pthread_mutex_lock (&priv->recv_mutex); + { +/* + while (!priv->data_ptr) + pthread_cond_wait (&priv->recv_cond, &priv->recv_mutex); +*/ + + copy_from = priv->data_ptr + priv->data_offset; + + priv->data_ptr = NULL; + data_len = priv->data_len; + /* pthread_cond_broadcast (&priv->recv_cond); */ + } + pthread_mutex_unlock (&priv->recv_mutex); + + header = (ib_verbs_header_t *)copy_from; + if (strcmp (header->colonO, ":O")) { + gf_log ("transport/ib-verbs", GF_LOG_ERROR, + "%s: corrupt header received", this->xl->name); + ret = -1; + goto err; + } + + size1 = ntoh32 (header->size1); + size2 = ntoh32 (header->size2); + + if (data_len != (size1 + size2 + sizeof (*header))) { + gf_log ("transport/ib-verbs", GF_LOG_ERROR, + "%s: sizeof data read 
from transport is not equal " + "to the size specified in the header", + this->xl->name); + ret = -1; + goto err; + } + + copy_from += sizeof (*header); + + if (size1) { + hdr = CALLOC (1, size1); + memcpy (hdr, copy_from, size1); + copy_from += size1; + *hdr_p = hdr; + } + *hdrlen_p = size1; + + if (size2) { + buf = CALLOC (1, size2); + memcpy (buf, copy_from, size2); + *buf_p = buf; + } + *buflen_p = size2; + +err: + return ret; +} + + +static void +ib_verbs_destroy_cq (transport_t *this) +{ + ib_verbs_private_t *priv = this->private; + ib_verbs_device_t *device = priv->device; + + if (device->recv_cq) + ibv_destroy_cq (device->recv_cq); + device->recv_cq = NULL; + + if (device->send_cq) + ibv_destroy_cq (device->send_cq); + device->send_cq = NULL; + + return; +} + + +static int32_t +ib_verbs_create_cq (transport_t *this) +{ + ib_verbs_private_t *priv = this->private; + ib_verbs_options_t *options = &priv->options; + ib_verbs_device_t *device = priv->device; + int32_t ret = 0; + + device->recv_cq = ibv_create_cq (priv->device->context, + options->recv_count * 2, + device, + device->recv_chan, + 0); + if (!device->recv_cq) { + gf_log ("transport/ib-verbs", + GF_LOG_ERROR, + "%s: creation of CQ failed", + this->xl->name); + ret = -1; + } else if (ibv_req_notify_cq (device->recv_cq, 0)) { + gf_log ("transport/ib-verbs", + GF_LOG_ERROR, + "%s: ibv_req_notify_cq on CQ failed", + this->xl->name); + ret = -1; + } + + do { + /* TODO: make send_cq size dynamically adaptive */ + device->send_cq = ibv_create_cq (priv->device->context, + options->send_count * 1024, + device, + device->send_chan, + 0); + if (!device->send_cq) { + gf_log ("transport/ib-verbs", + GF_LOG_ERROR, + "%s: creation of send_cq failed", + this->xl->name); + ret = -1; + break; + } + + if (ibv_req_notify_cq (device->send_cq, 0)) { + gf_log ("transport/ib-verbs", + GF_LOG_ERROR, + "%s: ibv_req_notify_cq on send_cq failed", + this->xl->name); + ret = -1; + break; + } + } while (0); + + if (ret != 0) + 
ib_verbs_destroy_cq (this); + + return ret; +} + + +static void +ib_verbs_register_peer (ib_verbs_device_t *device, + int32_t qp_num, + ib_verbs_peer_t *peer) +{ + struct _qpent *ent; + ib_verbs_qpreg_t *qpreg = &device->qpreg; + int32_t hash = qp_num % 42; + + pthread_mutex_lock (&qpreg->lock); + ent = qpreg->ents[hash].next; + while ((ent != &qpreg->ents[hash]) && (ent->qp_num != qp_num)) + ent = ent->next; + if (ent->qp_num == qp_num) { + pthread_mutex_unlock (&qpreg->lock); + return; + } + ent = (struct _qpent *) CALLOC (1, sizeof (*ent)); + ERR_ABORT (ent); + /* TODO: ref reg->peer */ + ent->peer = peer; + ent->next = &qpreg->ents[hash]; + ent->prev = ent->next->prev; + ent->next->prev = ent; + ent->prev->next = ent; + ent->qp_num = qp_num; + qpreg->count++; + pthread_mutex_unlock (&qpreg->lock); +} + + +static void +ib_verbs_unregister_peer (ib_verbs_device_t *device, + int32_t qp_num) +{ + struct _qpent *ent; + ib_verbs_qpreg_t *qpreg = &device->qpreg; + int32_t hash = qp_num % 42; + + pthread_mutex_lock (&qpreg->lock); + ent = qpreg->ents[hash].next; + while ((ent != &qpreg->ents[hash]) && (ent->qp_num != qp_num)) + ent = ent->next; + if (ent->qp_num != qp_num) { + pthread_mutex_unlock (&qpreg->lock); + return; + } + ent->prev->next = ent->next; + ent->next->prev = ent->prev; + /* TODO: unref reg->peer */ + free (ent); + qpreg->count--; + pthread_mutex_unlock (&qpreg->lock); +} + + +static ib_verbs_peer_t * +ib_verbs_lookup_peer (ib_verbs_device_t *device, + int32_t qp_num) +{ + struct _qpent *ent; + ib_verbs_qpreg_t *qpreg = &device->qpreg; + ib_verbs_peer_t *peer; + int32_t hash = qp_num % 42; + + pthread_mutex_lock (&qpreg->lock); + ent = qpreg->ents[hash].next; + while ((ent != &qpreg->ents[hash]) && (ent->qp_num != qp_num)) + ent = ent->next; + peer = ent->peer; + pthread_mutex_unlock (&qpreg->lock); + return peer; +} + + +static void +__ib_verbs_destroy_qp (transport_t *this) +{ + ib_verbs_private_t *priv = this->private; + + if (priv->peer.qp) { + 
ib_verbs_unregister_peer (priv->device, priv->peer.qp->qp_num); + ibv_destroy_qp (priv->peer.qp); + } + priv->peer.qp = NULL; + + return; +} + + +static int32_t +ib_verbs_create_qp (transport_t *this) +{ + ib_verbs_private_t *priv = this->private; + ib_verbs_options_t *options = &priv->options; + ib_verbs_device_t *device = priv->device; + int32_t ret = 0; + ib_verbs_peer_t *peer; + + peer = &priv->peer; + struct ibv_qp_init_attr init_attr = { + .send_cq = device->send_cq, + .recv_cq = device->recv_cq, + .srq = device->srq, + .cap = { + .max_send_wr = peer->send_count, + .max_recv_wr = peer->recv_count, + .max_send_sge = 1, + .max_recv_sge = 1 + }, + .qp_type = IBV_QPT_RC + }; + + struct ibv_qp_attr attr = { + .qp_state = IBV_QPS_INIT, + .pkey_index = 0, + .port_num = options->port, + .qp_access_flags = 0 + }; + + peer->qp = ibv_create_qp (device->pd, &init_attr); + if (!peer->qp) { + gf_log ("transport/ib-verbs", + GF_LOG_CRITICAL, + "%s: could not create QP", + this->xl->name); + ret = -1; + } else if (ibv_modify_qp (peer->qp, &attr, + IBV_QP_STATE | + IBV_QP_PKEY_INDEX | + IBV_QP_PORT | + IBV_QP_ACCESS_FLAGS)) { + gf_log ("transport/ib-verbs", + GF_LOG_ERROR, + "%s: failed to modify QP to INIT state", + this->xl->name); + ret = -1; + } + + peer->local_lid = ib_verbs_get_local_lid (device->context, + options->port); + peer->local_qpn = peer->qp->qp_num; + peer->local_psn = lrand48 () & 0xffffff; + + ib_verbs_register_peer (device, peer->qp->qp_num, peer); + + if (ret == -1) + __ib_verbs_destroy_qp (this); + + return ret; +} + + +static void +ib_verbs_destroy_posts (transport_t *this) +{ + +} + + +static int32_t +__ib_verbs_create_posts (transport_t *this, + int32_t count, + int32_t size, + ib_verbs_queue_t *q) +{ + int32_t i; + int32_t ret = 0; + ib_verbs_private_t *priv = this->private; + ib_verbs_device_t *device = priv->device; + + for (i=0 ; i<count ; i++) { + ib_verbs_post_t *post; + + post = ib_verbs_new_post (device, size + 2048); + if (!post) { + gf_log 
("transport/ib-verbs", + GF_LOG_ERROR, + "%s: post creation failed", + this->xl->name); + ret = -1; + break; + } + + ib_verbs_put_post (q, post); + } + return ret; +} + + +static int32_t +ib_verbs_create_posts (transport_t *this) +{ + int32_t i, ret; + ib_verbs_post_t *post = NULL; + ib_verbs_private_t *priv = this->private; + ib_verbs_options_t *options = &priv->options; + ib_verbs_device_t *device = priv->device; + + ret = __ib_verbs_create_posts (this, options->send_count, + options->send_size, + &device->sendq); + if (!ret) + ret = __ib_verbs_create_posts (this, options->recv_count, + options->recv_size, + &device->recvq); + + if (!ret) { + for (i=0 ; i<options->recv_count ; i++) { + post = ib_verbs_get_post (&device->recvq); + if (ib_verbs_post_recv (device->srq, post) != 0) { + ret = -1; + break; + } + } + } + + if (ret) + ib_verbs_destroy_posts (this); + + return ret; +} + + +static int32_t +ib_verbs_connect_qp (transport_t *this) +{ + ib_verbs_private_t *priv = this->private; + ib_verbs_options_t *options = &priv->options; + struct ibv_qp_attr attr = { + .qp_state = IBV_QPS_RTR, + .path_mtu = options->mtu, + .dest_qp_num = priv->peer.remote_qpn, + .rq_psn = priv->peer.remote_psn, + .max_dest_rd_atomic = 1, + .min_rnr_timer = 12, + .ah_attr = { + .is_global = 0, + .dlid = priv->peer.remote_lid, + .sl = 0, + .src_path_bits = 0, + .port_num = options->port + } + }; + if (ibv_modify_qp (priv->peer.qp, &attr, + IBV_QP_STATE | + IBV_QP_AV | + IBV_QP_PATH_MTU | + IBV_QP_DEST_QPN | + IBV_QP_RQ_PSN | + IBV_QP_MAX_DEST_RD_ATOMIC | + IBV_QP_MIN_RNR_TIMER)) { + gf_log ("transport/ib-verbs", + GF_LOG_CRITICAL, + "Failed to modify QP to RTR\n"); + return -1; + } + + /* TODO: make timeout and retry_cnt configurable from options */ + attr.qp_state = IBV_QPS_RTS; + attr.timeout = 14; + attr.retry_cnt = 7; + attr.rnr_retry = 7; + attr.sq_psn = priv->peer.local_psn; + attr.max_rd_atomic = 1; + if (ibv_modify_qp (priv->peer.qp, &attr, + IBV_QP_STATE | + IBV_QP_TIMEOUT | + 
IBV_QP_RETRY_CNT | + IBV_QP_RNR_RETRY | + IBV_QP_SQ_PSN | + IBV_QP_MAX_QP_RD_ATOMIC)) { + gf_log ("transport/ib-verbs", + GF_LOG_CRITICAL, + "Failed to modify QP to RTS\n"); + return -1; + } + + return 0; +} + +static int32_t +__ib_verbs_teardown (transport_t *this) +{ + ib_verbs_private_t *priv = this->private; + + __ib_verbs_destroy_qp (this); + + if (!list_empty (&priv->peer.ioq)) { + __ib_verbs_ioq_flush (&priv->peer); + } + + /* TODO: decrement cq size */ + return 0; +} + +/* + * return value: + * 0 = success (completed) + * -1 = error + * > 0 = incomplete + */ + +static int +__tcp_rwv (transport_t *this, struct iovec *vector, int count, + struct iovec **pending_vector, int *pending_count, + int write) +{ + ib_verbs_private_t *priv = NULL; + int sock = -1; + int ret = -1; + struct iovec *opvector = vector; + int opcount = count; + int moved = 0; + + priv = this->private; + sock = priv->sock; + + while (opcount) + { + if (write) + { + ret = writev (sock, opvector, opcount); + + if (ret == 0 || (ret == -1 && errno == EAGAIN)) + { + /* done for now */ + break; + } + } + else + { + ret = readv (sock, opvector, opcount); + + if (ret == -1 && errno == EAGAIN) + { + /* done for now */ + break; + } + } + + if (ret == 0) + { + gf_log (this->xl->name, GF_LOG_ERROR, "EOF from peer"); + opcount = -1; + errno = ENOTCONN; + break; + } + + if (ret == -1) + { + if (errno == EINTR) + continue; + + gf_log (this->xl->name, GF_LOG_ERROR, + "%s failed (%s)", write ? "writev" : "readv", + strerror (errno)); + if (write && !priv->connected && + (errno == ECONNREFUSED)) + gf_log (this->xl->name, GF_LOG_ERROR, + "possible mismatch of 'transport-type'" + " in protocol server and client. 
" + "check volume file"); + opcount = -1; + break; + } + + moved = 0; + + while (moved < ret) + { + if ((ret - moved) >= opvector[0].iov_len) + { + moved += opvector[0].iov_len; + opvector++; + opcount--; + } + else + { + opvector[0].iov_len -= (ret - moved); + opvector[0].iov_base += (ret - moved); + moved += (ret - moved); + } + while (opcount && !opvector[0].iov_len) + { + opvector++; + opcount--; + } + } + } + + if (pending_vector) + *pending_vector = opvector; + + if (pending_count) + *pending_count = opcount; + + return opcount; +} + + +static int +__tcp_readv (transport_t *this, struct iovec *vector, int count, + struct iovec **pending_vector, int *pending_count) +{ + int ret = -1; + + ret = __tcp_rwv (this, vector, count, + pending_vector, pending_count, 0); + + return ret; +} + + +static int +__tcp_writev (transport_t *this, struct iovec *vector, int count, + struct iovec **pending_vector, int *pending_count) +{ + int ret = -1; + ib_verbs_private_t *priv = this->private; + + ret = __tcp_rwv (this, vector, count, pending_vector, + pending_count, 1); + + if (ret > 0) { + /* TODO: Avoid multiple calls when socket is already + registered for POLLOUT */ + priv->idx = event_select_on (this->xl->ctx->event_pool, + priv->sock, priv->idx, -1, 1); + } else if (ret == 0) { + priv->idx = event_select_on (this->xl->ctx->event_pool, + priv->sock, + priv->idx, -1, 0); + } + + return ret; +} + + +static void * +ib_verbs_recv_completion_proc (void *data) +{ + struct ibv_comp_channel *chan = data; + ib_verbs_private_t *priv = NULL; + ib_verbs_device_t *device; + ib_verbs_post_t *post; + ib_verbs_peer_t *peer; + struct ibv_cq *event_cq; + struct ibv_wc wc; + void *event_ctx; + int32_t ret = 0; + + + while (1) { + ret = ibv_get_cq_event (chan, &event_cq, &event_ctx); + if (ret) { + gf_log ("transport/ib-verbs", GF_LOG_ERROR, + "ibv_get_cq_event failed, terminating recv " + "thread %d (%d)", ret, errno); + continue; + } + + device = event_ctx; + + ret = ibv_req_notify_cq 
(event_cq, 0); + if (ret) { + gf_log ("transport/ib-verbs", GF_LOG_ERROR, + "ibv_req_notify_cq on %s failed, terminating " + "recv thread: %d (%d)", + device->device_name, ret, errno); + continue; + } + + device = (ib_verbs_device_t *) event_ctx; + + while ((ret = ibv_poll_cq (event_cq, 1, &wc)) > 0) { + post = (ib_verbs_post_t *) (long) wc.wr_id; + peer = ib_verbs_lookup_peer (device, wc.qp_num); + + if (wc.status != IBV_WC_SUCCESS) { + gf_log ("transport/ib-verbs", GF_LOG_ERROR, + "recv work request on `%s' returned " + "error (%d)", + device->device_name, + wc.status); + if (peer) + transport_disconnect (peer->trans); + + if (post) { + ib_verbs_post_recv (device->srq, post); + } + continue; + } + + if (peer) { + priv = peer->trans->private; + + pthread_mutex_lock (&priv->recv_mutex); + { +/* while (priv->data_ptr) + pthread_cond_wait (&priv->recv_cond, &priv->recv_mutex); +*/ + + priv->data_ptr = post->buf; + priv->data_offset = 0; + priv->data_len = wc.byte_len; + + /*pthread_cond_broadcast (&priv->recv_cond);*/ + } + pthread_mutex_unlock (&priv->recv_mutex); + + if ((ret = peer->trans->xl->notify (peer->trans->xl, GF_EVENT_POLLIN, + peer->trans, NULL)) == -1) { + gf_log ("transport/ib-verbs", + GF_LOG_ERROR, + "pollin notification to %s " + "failed, disconnecting " + "transport", + peer->trans->xl->name); + transport_disconnect (peer->trans); + } + } else { + gf_log ("transport/ib-verbs", + GF_LOG_DEBUG, + "could not lookup peer for qp_num: %d", + wc.qp_num); + } + ib_verbs_post_recv (device->srq, post); + } + + if (ret < 0) { + gf_log ("transport/ib-verbs", + GF_LOG_ERROR, + "ibv_poll_cq on `%s' returned error " + "(ret = %d, errno = %d)", + device->device_name, ret, errno); + continue; + } + ibv_ack_cq_events (event_cq, 1); + } + return NULL; +} + + +static void * +ib_verbs_send_completion_proc (void *data) +{ + struct ibv_comp_channel *chan = data; + ib_verbs_post_t *post; + ib_verbs_peer_t *peer; + struct ibv_cq *event_cq; + void *event_ctx; + 
ib_verbs_device_t *device; + struct ibv_wc wc; + int32_t ret; + + while (1) { + ret = ibv_get_cq_event (chan, &event_cq, &event_ctx); + if (ret) { + gf_log ("transport/ib-verbs", GF_LOG_ERROR, + "ibv_get_cq_event on failed, terminating " + "send thread: %d (%d)", ret, errno); + continue; + } + + device = event_ctx; + + ret = ibv_req_notify_cq (event_cq, 0); + if (ret) { + gf_log ("transport/ib-verbs", GF_LOG_ERROR, + "ibv_req_notify_cq on %s failed, terminating " + "send thread: %d (%d)", + device->device_name, ret, errno); + continue; + } + + while ((ret = ibv_poll_cq (event_cq, 1, &wc)) > 0) { + post = (ib_verbs_post_t *) (long) wc.wr_id; + peer = ib_verbs_lookup_peer (device, wc.qp_num); + + if (wc.status != IBV_WC_SUCCESS) { + gf_log ("transport/ib-verbs", GF_LOG_ERROR, + "send work request on `%s' returned " + "error wc.status = %d, wc.vendor_err " + "= %d, post->buf = %p, wc.byte_len = " + "%d, post->reused = %d", + device->device_name, wc.status, + wc.vendor_err, + post->buf, wc.byte_len, post->reused); + if (peer) + transport_disconnect (peer->trans); + } + + if (post) { + ib_verbs_put_post (&device->sendq, post); + } + + if (peer) { + int quota_ret = ib_verbs_quota_put (peer); + if (quota_ret < 0) { + gf_log ("ib-verbs", GF_LOG_WARNING, + "failed to send message"); + + } + } else { + gf_log ("transport/ib-verbs", GF_LOG_DEBUG, + "could not lookup peer for qp_num: %d", + wc.qp_num); + } + } + + if (ret < 0) { + gf_log ("transport/ib-verbs", GF_LOG_ERROR, + "ibv_poll_cq on `%s' returned error (ret = %d," + " errno = %d)", + device->device_name, ret, errno); + continue; + } + ibv_ack_cq_events (event_cq, 1); + } + + return NULL; +} + +static void +ib_verbs_options_init (transport_t *this) +{ + ib_verbs_private_t *priv = this->private; + ib_verbs_options_t *options = &priv->options; + int32_t mtu; + data_t *temp; + + /* TODO: validate arguments from options below */ + + options->send_size = 1048576; + options->recv_size = 1048576; + options->send_count = 16; + 
options->recv_count = 16; + + temp = dict_get (this->xl->options, + "transport.ib-verbs.work-request-send-count"); + if (temp) + options->send_count = data_to_int32 (temp); + + temp = dict_get (this->xl->options, + "transport.ib-verbs.work-request-recv-count"); + if (temp) + options->recv_count = data_to_int32 (temp); + + temp = dict_get (this->xl->options, + "transport.ib-verbs.work-request-send-size"); + if (temp) + options->send_size = data_to_int32 (temp); + + temp = dict_get (this->xl->options, + "transport.ib-verbs.work-request-recv-size"); + if (temp) + options->recv_size = data_to_int32 (temp); + + options->port = 1; + temp = dict_get (this->xl->options, + "transport.ib-verbs.port"); + if (temp) + options->port = data_to_uint64 (temp); + + options->mtu = mtu = IBV_MTU_2048; + temp = dict_get (this->xl->options, + "transport.ib-verbs.mtu"); + if (temp) + mtu = data_to_int32 (temp); + switch (mtu) { + case 256: options->mtu = IBV_MTU_256; + break; + case 512: options->mtu = IBV_MTU_512; + break; + case 1024: options->mtu = IBV_MTU_1024; + break; + case 2048: options->mtu = IBV_MTU_2048; + break; + case 4096: options->mtu = IBV_MTU_4096; + break; + default: + if (temp) + gf_log ("transport/ib-verbs", GF_LOG_ERROR, + "%s: unrecognized MTU value '%s', defaulting " + "to '2048'", this->xl->name, + data_to_str (temp)); + else + gf_log ("transport/ib-verbs", GF_LOG_DEBUG, + "%s: defaulting MTU to '2048'", + this->xl->name); + options->mtu = IBV_MTU_2048; + break; + } + + temp = dict_get (this->xl->options, + "transport.ib-verbs.device-name"); + if (temp) + options->device_name = strdup (temp->data); + + return; +} + +static void +ib_verbs_queue_init (ib_verbs_queue_t *queue) +{ + pthread_mutex_init (&queue->lock, NULL); + + queue->active_posts.next = &queue->active_posts; + queue->active_posts.prev = &queue->active_posts; + queue->passive_posts.next = &queue->passive_posts; + queue->passive_posts.prev = &queue->passive_posts; +} + + +static ib_verbs_device_t * 
+ib_verbs_get_device (transport_t *this, + struct ibv_device *ib_dev, + int32_t port) +{ + glusterfs_ctx_t *ctx = this->xl->ctx; + ib_verbs_private_t *priv = this->private; + ib_verbs_options_t *options = &priv->options; + char *device_name = priv->options.device_name; + int32_t ret = 0, i = 0; + + ib_verbs_device_t *trav; + + trav = ctx->ib; + while (trav) { + if ((!strcmp (trav->device_name, device_name)) && + (trav->port == port)) + break; + trav = trav->next; + } + + if (!trav) { + struct ibv_context *ibctx = ibv_open_device (ib_dev); + + if (!ibctx) { + gf_log ("transport/ib-verbs", GF_LOG_ERROR, + "cannot open device `%s'", + device_name); + return NULL; + } + + trav = CALLOC (1, sizeof (*trav)); + ERR_ABORT (trav); + priv->device = trav; + + trav->context = ibctx; + trav->device_name = strdup (device_name); + trav->port = port; + + trav->next = ctx->ib; + ctx->ib = trav; + + trav->send_chan = ibv_create_comp_channel (trav->context); + if (!trav->send_chan) { + gf_log ("transport/ib-verbs", GF_LOG_CRITICAL, + "%s: could not create send completion channel", + device_name); + /* TODO: cleanup current mess */ + return NULL; + } + + trav->recv_chan = ibv_create_comp_channel (trav->context); + if (!trav->recv_chan) { + gf_log ("transport/ib-verbs", GF_LOG_CRITICAL, + "could not create recv completion channel"); + /* TODO: cleanup current mess */ + return NULL; + } + + if (ib_verbs_create_cq (this) < 0) { + gf_log ("transport/ib-verbs", GF_LOG_ERROR, + "%s: could not create CQ", + this->xl->name); + return NULL; + } + + /* protection domain */ + trav->pd = ibv_alloc_pd (trav->context); + + if (!trav->pd) { + gf_log ("transport/ib-verbs", GF_LOG_CRITICAL, + "%s: could not allocate protection domain", + this->xl->name); + return NULL; + } + + struct ibv_srq_init_attr attr = { + .attr = { + .max_wr = options->recv_count, + .max_sge = 1 + } + }; + trav->srq = ibv_create_srq (trav->pd, &attr); + + if (!trav->srq) { + gf_log ("transport/ib-verbs", GF_LOG_CRITICAL, + "%s: 
could not create SRQ", + this->xl->name); + return NULL; + } + + /* queue init */ + ib_verbs_queue_init (&trav->sendq); + ib_verbs_queue_init (&trav->recvq); + + if (ib_verbs_create_posts (this) < 0) { + gf_log ("transport/ib-verbs", GF_LOG_ERROR, + "%s: could not allocate posts", + this->xl->name); + return NULL; + } + + /* completion threads */ + ret = pthread_create (&trav->send_thread, + NULL, + ib_verbs_send_completion_proc, + trav->send_chan); + if (ret) { + gf_log ("transport/ib-verbs", GF_LOG_ERROR, + "could not create send completion thread"); + return NULL; + } + ret = pthread_create (&trav->recv_thread, + NULL, + ib_verbs_recv_completion_proc, + trav->recv_chan); + if (ret) { + gf_log ("transport/ib-verbs", GF_LOG_ERROR, + "could not create recv completion thread"); + return NULL; + } + + /* qpreg */ + pthread_mutex_init (&trav->qpreg.lock, NULL); + for (i=0; i<42; i++) { + trav->qpreg.ents[i].next = &trav->qpreg.ents[i]; + trav->qpreg.ents[i].prev = &trav->qpreg.ents[i]; + } + } + return trav; +} + +static int32_t +ib_verbs_init (transport_t *this) +{ + ib_verbs_private_t *priv = this->private; + ib_verbs_options_t *options = &priv->options; + struct ibv_device **dev_list; + struct ibv_device *ib_dev = NULL; + int32_t i; + + ib_verbs_options_init (this); + + { + dev_list = ibv_get_device_list (NULL); + + if (!dev_list) { + gf_log ("transport/ib-verbs", + GF_LOG_CRITICAL, + "No IB devices found"); + return -1; + } + + if (!options->device_name) { + if (*dev_list) { + options->device_name = + strdup (ibv_get_device_name (*dev_list)); + } else { + gf_log ("transport/ib-verbs", GF_LOG_CRITICAL, + "IB device list is empty. 
Check for " + "'ib_uverbs' module"); + return -1; + } + } + + for (i = 0; dev_list[i]; i++) { + if (!strcmp (ibv_get_device_name (dev_list[i]), + options->device_name)) { + ib_dev = dev_list[i]; + break; + } + } + + if (!ib_dev) { + gf_log ("transport/ib-verbs", GF_LOG_ERROR, + "could not open device `%s' (does not exist)", + options->device_name); + ibv_free_device_list (dev_list); + return -1; + } + + priv->device = ib_verbs_get_device (this, ib_dev, + options->port); + + if (!priv->device) { + gf_log ("transport/ib-verbs", GF_LOG_ERROR, + "could not create ib_verbs device for %s", + options->device_name); + ibv_free_device_list (dev_list); + return -1; + } + ibv_free_device_list (dev_list); + } + + priv->peer.trans = this; + INIT_LIST_HEAD (&priv->peer.ioq); + + pthread_mutex_init (&priv->read_mutex, NULL); + pthread_mutex_init (&priv->write_mutex, NULL); + pthread_mutex_init (&priv->recv_mutex, NULL); + /* pthread_cond_init (&priv->recv_cond, NULL); */ + + return 0; +} + + +static int32_t +ib_verbs_disconnect (transport_t *this) +{ + ib_verbs_private_t *priv = this->private; + int32_t ret = 0; + + pthread_mutex_lock (&priv->write_mutex); + { + ret = __ib_verbs_disconnect (this); + } + pthread_mutex_unlock (&priv->write_mutex); + + return ret; +} + + +static int32_t +__tcp_connect_finish (int fd) +{ + int ret = -1; + int optval = 0; + socklen_t optlen = sizeof (int); + + ret = getsockopt (fd, SOL_SOCKET, SO_ERROR, + (void *)&optval, &optlen); + + if (ret == 0 && optval) + { + errno = optval; + ret = -1; + } + + return ret; +} + +static inline void +ib_verbs_fill_handshake_data (char *buf, struct ib_verbs_nbio *nbio, + ib_verbs_private_t *priv) +{ + sprintf (buf, + "QP1:RECV_BLKSIZE=%08x:SEND_BLKSIZE=%08x\n" + "QP1:LID=%04x:QPN=%06x:PSN=%06x\n", + priv->peer.recv_size, + priv->peer.send_size, + priv->peer.local_lid, + priv->peer.local_qpn, + priv->peer.local_psn); + + nbio->vector.iov_base = buf; + nbio->vector.iov_len = strlen (buf) + 1; + nbio->count = 1; + 
return; +} + +static inline void +ib_verbs_fill_handshake_ack (char *buf, struct ib_verbs_nbio *nbio) +{ + sprintf (buf, "DONE\n"); + nbio->vector.iov_base = buf; + nbio->vector.iov_len = strlen (buf) + 1; + nbio->count = 1; + return; +} + +static int +ib_verbs_handshake_pollin (transport_t *this) +{ + int ret = 0; + ib_verbs_private_t *priv = this->private; + char *buf = priv->handshake.incoming.buf; + int32_t recv_buf_size, send_buf_size; + socklen_t sock_len; + + if (priv->handshake.incoming.state == IB_VERBS_HANDSHAKE_COMPLETE) { + return -1; + } + + pthread_mutex_lock (&priv->write_mutex); + { + while (priv->handshake.incoming.state != IB_VERBS_HANDSHAKE_COMPLETE) + { + switch (priv->handshake.incoming.state) + { + case IB_VERBS_HANDSHAKE_START: + buf = priv->handshake.incoming.buf = CALLOC (1, 256); + ib_verbs_fill_handshake_data (buf, &priv->handshake.incoming, priv); + buf[0] = 0; + priv->handshake.incoming.state = IB_VERBS_HANDSHAKE_RECEIVING_DATA; + break; + + case IB_VERBS_HANDSHAKE_RECEIVING_DATA: + ret = __tcp_readv (this, + &priv->handshake.incoming.vector, + priv->handshake.incoming.count, + &priv->handshake.incoming.pending_vector, + &priv->handshake.incoming.pending_count); + if (ret == -1) { + goto unlock; + } + + if (ret > 0) { + gf_log (this->xl->name, GF_LOG_DEBUG, + "partial header read on NB socket. 
continue later"); + goto unlock; + } + + if (!ret) { + priv->handshake.incoming.state = IB_VERBS_HANDSHAKE_RECEIVED_DATA; + } + break; + + case IB_VERBS_HANDSHAKE_RECEIVED_DATA: + ret = sscanf (buf, + "QP1:RECV_BLKSIZE=%08x:SEND_BLKSIZE=%08x\n" + "QP1:LID=%04x:QPN=%06x:PSN=%06x\n", + &recv_buf_size, + &send_buf_size, + &priv->peer.remote_lid, + &priv->peer.remote_qpn, + &priv->peer.remote_psn); + + if ((ret != 5) && (strncmp (buf, "QP1:", 4))) { + gf_log ("transport/ib-verbs", + GF_LOG_CRITICAL, + "%s: remote-host(%s)'s " + "transport type is different", + this->xl->name, + this->peerinfo.identifier); + ret = -1; + goto unlock; + } + + if (recv_buf_size < priv->peer.recv_size) + priv->peer.recv_size = recv_buf_size; + if (send_buf_size < priv->peer.send_size) + priv->peer.send_size = send_buf_size; + + gf_log ("transport/ib-verbs", GF_LOG_DEBUG, + "%s: transacted recv_size=%d " + "send_size=%d", + this->xl->name, priv->peer.recv_size, + priv->peer.send_size); + + priv->peer.quota = priv->peer.send_count; + + if (ib_verbs_connect_qp (this)) { + gf_log ("transport/ib-verbs", + GF_LOG_ERROR, + "%s: failed to connect with " + "remote QP", this->xl->name); + ret = -1; + goto unlock; + } + ib_verbs_fill_handshake_ack (buf, &priv->handshake.incoming); + buf[0] = 0; + priv->handshake.incoming.state = IB_VERBS_HANDSHAKE_RECEIVING_ACK; + break; + + case IB_VERBS_HANDSHAKE_RECEIVING_ACK: + ret = __tcp_readv (this, + &priv->handshake.incoming.vector, + priv->handshake.incoming.count, + &priv->handshake.incoming.pending_vector, + &priv->handshake.incoming.pending_count); + if (ret == -1) { + goto unlock; + } + + if (ret > 0) { + gf_log (this->xl->name, GF_LOG_DEBUG, + "partial header read on NB " + "socket. 
continue later"); + goto unlock; + } + + if (!ret) { + priv->handshake.incoming.state = IB_VERBS_HANDSHAKE_RECEIVED_ACK; + } + break; + + case IB_VERBS_HANDSHAKE_RECEIVED_ACK: + if (strncmp (buf, "DONE", 4)) { + gf_log ("transport/ib-verbs", + GF_LOG_ERROR, + "%s: handshake-3 did not " + "return 'DONE' (%s)", + this->xl->name, buf); + ret = -1; + goto unlock; + } + ret = 0; + priv->connected = 1; + sock_len = sizeof (struct sockaddr_storage); + getpeername (priv->sock, + (struct sockaddr *) &this->peerinfo.sockaddr, + &sock_len); + + FREE (priv->handshake.incoming.buf); + priv->handshake.incoming.buf = NULL; + priv->handshake.incoming.state = IB_VERBS_HANDSHAKE_COMPLETE; + } + } + } +unlock: + pthread_mutex_unlock (&priv->write_mutex); + + if (ret == -1) { + transport_disconnect (this); + } else { + ret = 0; + } + + if (!ret && priv->connected) { + ret = this->xl->notify (this->xl, GF_EVENT_CHILD_UP, this); + } + + return ret; +} + +static int +ib_verbs_handshake_pollout (transport_t *this) +{ + ib_verbs_private_t *priv = this->private; + char *buf = priv->handshake.outgoing.buf; + int32_t ret = 0; + + if (priv->handshake.outgoing.state == IB_VERBS_HANDSHAKE_COMPLETE) { + return 0; + } + + pthread_mutex_unlock (&priv->write_mutex); + { + while (priv->handshake.outgoing.state != IB_VERBS_HANDSHAKE_COMPLETE) + { + switch (priv->handshake.outgoing.state) + { + case IB_VERBS_HANDSHAKE_START: + buf = priv->handshake.outgoing.buf = CALLOC (1, 256); + ib_verbs_fill_handshake_data (buf, &priv->handshake.outgoing, priv); + priv->handshake.outgoing.state = IB_VERBS_HANDSHAKE_SENDING_DATA; + break; + + case IB_VERBS_HANDSHAKE_SENDING_DATA: + ret = __tcp_writev (this, + &priv->handshake.outgoing.vector, + priv->handshake.outgoing.count, + &priv->handshake.outgoing.pending_vector, + &priv->handshake.outgoing.pending_count); + if (ret == -1) { + goto unlock; + } + + if (ret > 0) { + gf_log (this->xl->name, GF_LOG_DEBUG, + "partial header read on NB socket. 
continue later"); + goto unlock; + } + + if (!ret) { + priv->handshake.outgoing.state = IB_VERBS_HANDSHAKE_SENT_DATA; + } + break; + + case IB_VERBS_HANDSHAKE_SENT_DATA: + ib_verbs_fill_handshake_ack (buf, &priv->handshake.outgoing); + priv->handshake.outgoing.state = IB_VERBS_HANDSHAKE_SENDING_ACK; + break; + + case IB_VERBS_HANDSHAKE_SENDING_ACK: + ret = __tcp_writev (this, + &priv->handshake.outgoing.vector, + priv->handshake.outgoing.count, + &priv->handshake.outgoing.pending_vector, + &priv->handshake.outgoing.pending_count); + + if (ret == -1) { + goto unlock; + } + + if (ret > 0) { + gf_log (this->xl->name, GF_LOG_DEBUG, + "partial header read on NB " + "socket. continue later"); + goto unlock; + } + + if (!ret) { + FREE (priv->handshake.outgoing.buf); + priv->handshake.outgoing.buf = NULL; + priv->handshake.outgoing.state = IB_VERBS_HANDSHAKE_COMPLETE; + } + break; + } + } + } +unlock: + pthread_mutex_unlock (&priv->write_mutex); + + if (ret == -1) { + transport_disconnect (this); + } else { + ret = 0; + } + + return ret; +} + +static int +ib_verbs_handshake_pollerr (transport_t *this) +{ + ib_verbs_private_t *priv = this->private; + int32_t ret = 0; + char need_unref = 0; + + gf_log ("transport/ib-verbs", GF_LOG_DEBUG, + "%s: peer disconnected, cleaning up", + this->xl->name); + + pthread_mutex_lock (&priv->write_mutex); + { + __ib_verbs_teardown (this); + + if (priv->sock != -1) { + event_unregister (this->xl->ctx->event_pool, + priv->sock, priv->idx); + need_unref = 1; + + if (close (priv->sock) != 0) { + gf_log ("transport/ib-verbs", GF_LOG_ERROR, + "close () - error: %s", + strerror (errno)); + ret = -errno; + } + priv->tcp_connected = priv->connected = 0; + priv->sock = -1; + } + + if (priv->handshake.incoming.buf) { + FREE (priv->handshake.incoming.buf); + priv->handshake.incoming.buf = NULL; + } + + priv->handshake.incoming.state = IB_VERBS_HANDSHAKE_START; + + if (priv->handshake.outgoing.buf) { + FREE (priv->handshake.outgoing.buf); + 
priv->handshake.outgoing.buf = NULL; + } + + priv->handshake.outgoing.state = IB_VERBS_HANDSHAKE_START; + } + pthread_mutex_unlock (&priv->write_mutex); + + this->xl->notify (this->xl, GF_EVENT_POLLERR, this, NULL); + + if (need_unref) + transport_unref (this); + + return 0; +} + + +static int +tcp_connect_finish (transport_t *this) +{ + ib_verbs_private_t *priv = this->private; + int error = 0, ret = 0; + + pthread_mutex_lock (&priv->write_mutex); + { + ret = __tcp_connect_finish (priv->sock); + + if (!ret) { + this->myinfo.sockaddr_len = + sizeof (this->myinfo.sockaddr); + ret = getsockname (priv->sock, + (struct sockaddr *)&this->myinfo.sockaddr, + &this->myinfo.sockaddr_len); + if (ret == -1) + { + gf_log (this->xl->name, GF_LOG_ERROR, + "getsockname on new client-socket %d " + "failed (%s)", + priv->sock, strerror (errno)); + close (priv->sock); + error = 1; + goto unlock; + } + + get_transport_identifiers (this); + priv->tcp_connected = 1; + } + + if (ret == -1 && errno != EINPROGRESS) { + gf_log (this->xl->name, GF_LOG_ERROR, + "tcp connect to %s failed (%s)", + this->peerinfo.identifier, strerror (errno)); + error = 1; + } + } +unlock: + pthread_mutex_unlock (&priv->write_mutex); + + if (error) { + transport_disconnect (this); + } + + return ret; +} + +static int +ib_verbs_event_handler (int fd, int idx, void *data, + int poll_in, int poll_out, int poll_err) +{ + transport_t *this = data; + ib_verbs_private_t *priv = this->private; + ib_verbs_options_t *options = NULL; + int ret = 0; + + if (!priv->tcp_connected) { + ret = tcp_connect_finish (this); + if (priv->tcp_connected) { + options = &priv->options; + + priv->peer.send_count = options->send_count; + priv->peer.recv_count = options->recv_count; + priv->peer.send_size = options->send_size; + priv->peer.recv_size = options->recv_size; + + if ((ret = ib_verbs_create_qp (this)) < 0) { + gf_log ("transport/ib-verbs", GF_LOG_ERROR, + "%s: could not create QP", + this->xl->name); + transport_disconnect 
(this); + } + } + } + + if (!ret && poll_out && priv->tcp_connected) { + ret = ib_verbs_handshake_pollout (this); + } + + if (!ret && poll_in && priv->tcp_connected) { + if (priv->handshake.incoming.state == IB_VERBS_HANDSHAKE_COMPLETE) { + gf_log ("transport/ib-verbs", GF_LOG_ERROR, + "%s: pollin received on tcp socket (peer: %s) " + "after handshake is complete", + this->xl->name, this->peerinfo.identifier); + ib_verbs_handshake_pollerr (this); + return 0; + } + ret = ib_verbs_handshake_pollin (this); + } + + if (poll_err) { + ret = ib_verbs_handshake_pollerr (this); + } + + return 0; +} + +static int +__tcp_nonblock (int fd) +{ + int flags = 0; + int ret = -1; + + flags = fcntl (fd, F_GETFL); + + if (flags != -1) + ret = fcntl (fd, F_SETFL, flags | O_NONBLOCK); + + return ret; +} + +static int32_t +ib_verbs_connect (struct transport *this) +{ + dict_t *options = this->xl->options; + + ib_verbs_private_t *priv = this->private; + + int32_t ret = 0; + gf_boolean_t non_blocking = 1; + struct sockaddr_storage sockaddr; + socklen_t sockaddr_len = 0; + + if (priv->connected) { + return 0; + } + + if (dict_get (options, "non-blocking-io")) { + char *nb_connect = data_to_str (dict_get (this->xl->options, + "non-blocking-io")); + + if (gf_string2boolean (nb_connect, &non_blocking) == -1) { + gf_log (this->xl->name, GF_LOG_ERROR, + "'non-blocking-io' takes only boolean " + "options, not taking any action"); + non_blocking = 1; + } + } + + ret = ibverbs_client_get_remote_sockaddr (this, (struct sockaddr *)&sockaddr, + &sockaddr_len); + if (ret != 0) { + gf_log (this->xl->name, GF_LOG_ERROR, + "cannot get remote address to connect"); + return ret; + } + + pthread_mutex_lock (&priv->write_mutex); + { + if (priv->sock != -1) { + ret = 0; + goto unlock; + } + + priv->sock = socket (((struct sockaddr *)&sockaddr)->sa_family, + SOCK_STREAM, 0); + + if (priv->sock == -1) { + gf_log (this->xl->name, GF_LOG_ERROR, + "socket () - error: %s", strerror (errno)); + ret = -errno; + goto 
unlock; + } + + gf_log (this->xl->name, GF_LOG_DEBUG, + "socket fd = %d", priv->sock); + + memcpy (&this->peerinfo.sockaddr, &sockaddr, sockaddr_len); + this->peerinfo.sockaddr_len = sockaddr_len; + + ((struct sockaddr *) &this->myinfo.sockaddr)->sa_family = + ((struct sockaddr *)&this->peerinfo.sockaddr)->sa_family; + + if (non_blocking) + { + ret = __tcp_nonblock (priv->sock); + + if (ret == -1) + { + gf_log (this->xl->name, GF_LOG_ERROR, + "could not set socket %d to non " + "blocking mode (%s)", + priv->sock, strerror (errno)); + close (priv->sock); + priv->sock = -1; + goto unlock; + } + } + + ret = client_bind (this, + (struct sockaddr *)&this->myinfo.sockaddr, + &this->myinfo.sockaddr_len, priv->sock); + if (ret == -1) + { + gf_log (this->xl->name, GF_LOG_WARNING, + "client bind failed: %s", strerror (errno)); + close (priv->sock); + priv->sock = -1; + goto unlock; + } + + ret = connect (priv->sock, + (struct sockaddr *)&this->peerinfo.sockaddr, + this->peerinfo.sockaddr_len); + if (ret == -1 && errno != EINPROGRESS) + { + gf_log (this->xl->name, GF_LOG_ERROR, + "connection attempt failed (%s)", + strerror (errno)); + close (priv->sock); + priv->sock = -1; + goto unlock; + } + + priv->tcp_connected = priv->connected = 0; + + transport_ref (this); + + priv->handshake.incoming.state = IB_VERBS_HANDSHAKE_START; + priv->handshake.outgoing.state = IB_VERBS_HANDSHAKE_START; + + priv->idx = event_register (this->xl->ctx->event_pool, + priv->sock, ib_verbs_event_handler, + this, 1, 1); + } +unlock: + pthread_mutex_unlock (&priv->write_mutex); + + return ret; +} + +static int +ib_verbs_server_event_handler (int fd, int idx, void *data, + int poll_in, int poll_out, int poll_err) +{ + int32_t main_sock = -1; + transport_t *this, *trans = data; + ib_verbs_private_t *priv = NULL; + ib_verbs_private_t *trans_priv = (ib_verbs_private_t *) trans->private; + ib_verbs_options_t *options = NULL; + + if (!poll_in) + return 0; + + this = CALLOC (1, sizeof (transport_t)); + 
ERR_ABORT (this); + priv = CALLOC (1, sizeof (ib_verbs_private_t)); + ERR_ABORT (priv); + this->private = priv; + /* Copy all the ib_verbs related values in priv, from trans_priv + as other than QP, all the values remain same */ + priv->device = trans_priv->device; + priv->options = trans_priv->options; + options = &priv->options; + + this->ops = trans->ops; + this->xl = trans->xl; + + memcpy (&this->myinfo.sockaddr, &trans->myinfo.sockaddr, + trans->myinfo.sockaddr_len); + this->myinfo.sockaddr_len = trans->myinfo.sockaddr_len; + + main_sock = (trans_priv)->sock; + this->peerinfo.sockaddr_len = sizeof (this->peerinfo.sockaddr); + priv->sock = accept (main_sock, + (struct sockaddr *)&this->peerinfo.sockaddr, + &this->peerinfo.sockaddr_len); + if (priv->sock == -1) { + gf_log ("ib-verbs/server", GF_LOG_ERROR, + "accept() failed: %s", + strerror (errno)); + free (this->private); + free (this); + return -1; + } + + priv->peer.trans = this; + transport_ref (this); + + get_transport_identifiers (this); + + priv->tcp_connected = 1; + priv->handshake.incoming.state = IB_VERBS_HANDSHAKE_START; + priv->handshake.outgoing.state = IB_VERBS_HANDSHAKE_START; + + priv->peer.send_count = options->send_count; + priv->peer.recv_count = options->recv_count; + priv->peer.send_size = options->send_size; + priv->peer.recv_size = options->recv_size; + INIT_LIST_HEAD (&priv->peer.ioq); + + if (ib_verbs_create_qp (this) < 0) { + gf_log ("transport/ib-verbs", GF_LOG_ERROR, + "%s: could not create QP", + this->xl->name); + transport_disconnect (this); + return -1; + } + + priv->idx = event_register (this->xl->ctx->event_pool, priv->sock, + ib_verbs_event_handler, this, 1, 1); + + pthread_mutex_init (&priv->read_mutex, NULL); + pthread_mutex_init (&priv->write_mutex, NULL); + pthread_mutex_init (&priv->recv_mutex, NULL); + /* pthread_cond_init (&priv->recv_cond, NULL); */ + + return 0; +} + +static int32_t +ib_verbs_listen (transport_t *this) +{ + struct sockaddr_storage sockaddr; + 
socklen_t sockaddr_len; + ib_verbs_private_t *priv = this->private; + int opt = 1, ret = 0; + char service[NI_MAXSERV], host[NI_MAXHOST]; + + memset (&sockaddr, 0, sizeof (sockaddr)); + ret = ibverbs_server_get_local_sockaddr (this, + (struct sockaddr *)&sockaddr, + &sockaddr_len); + if (ret != 0) { + gf_log (this->xl->name, GF_LOG_ERROR, + "cannot find network address of server to bind to"); + goto err; + } + + priv->sock = socket (((struct sockaddr *)&sockaddr)->sa_family, + SOCK_STREAM, 0); + if (priv->sock == -1) { + gf_log ("ib-verbs/server", GF_LOG_CRITICAL, + "init: failed to create socket, error: %s", + strerror (errno)); + free (this->private); + ret = -1; + goto err; + } + + memcpy (&this->myinfo.sockaddr, &sockaddr, sockaddr_len); + this->myinfo.sockaddr_len = sockaddr_len; + + ret = getnameinfo ((struct sockaddr *)&this->myinfo.sockaddr, + this->myinfo.sockaddr_len, + host, sizeof (host), + service, sizeof (service), + NI_NUMERICHOST); + if (ret != 0) { + gf_log (this->xl->name, GF_LOG_ERROR, + "getnameinfo failed (%s)", gai_strerror (ret)); + goto err; + } + sprintf (this->myinfo.identifier, "%s:%s", host, service); + + setsockopt (priv->sock, SOL_SOCKET, SO_REUSEADDR, &opt, sizeof (opt)); + if (bind (priv->sock, + (struct sockaddr *)&sockaddr, + sockaddr_len) != 0) { + ret = -1; + gf_log ("ib-verbs/server", GF_LOG_CRITICAL, + "init: failed to bind to socket for %s (%s)", + this->myinfo.identifier, strerror (errno)); + goto err; + } + + if (listen (priv->sock, 10) != 0) { + gf_log ("ib-verbs/server", GF_LOG_CRITICAL, + "init: listen () failed on socket for %s (%s)", + this->myinfo.identifier, strerror (errno)); + ret = -1; + goto err; + } + + /* Register the main socket */ + priv->idx = event_register (this->xl->ctx->event_pool, priv->sock, + ib_verbs_server_event_handler, + transport_ref (this), 1, 0); + +err: + return ret; +} + +struct transport_ops tops = { + .receive = ib_verbs_receive, + .submit = ib_verbs_submit, + .connect = ib_verbs_connect, + 
.disconnect = ib_verbs_disconnect, + .listen = ib_verbs_listen, +}; + +int32_t +init (transport_t *this) +{ + ib_verbs_private_t *priv = CALLOC (1, sizeof (*priv)); + this->private = priv; + priv->sock = -1; + + if (ib_verbs_init (this)) { + gf_log (this->xl->name, GF_LOG_ERROR, + "Failed to initialize IB Device"); + return -1; + } + + return 0; +} + +void +fini (struct transport *this) +{ + /* TODO: verify this function does graceful finish */ + ib_verbs_private_t *priv = this->private; + this->private = NULL; + + pthread_mutex_destroy (&priv->recv_mutex); + pthread_mutex_destroy (&priv->write_mutex); + pthread_mutex_destroy (&priv->read_mutex); + /* pthread_cond_destroy (&priv->recv_cond); */ + + gf_log (this->xl->name, GF_LOG_CRITICAL, + "called fini on transport: %p", + this); + free (priv); + return; +} + +/* TODO: expand each option */ +struct volume_options options[] = { + { .key = {"transport.ib-verbs.port", + "ib-verbs-port"}, + .type = GF_OPTION_TYPE_INT, + .min = 1, + .max = 4, + .description = "check the option by 'ibv_devinfo'" + }, + { .key = {"transport.ib-verbs.mtu", + "ib-verbs-mtu"}, + .type = GF_OPTION_TYPE_INT, + }, + { .key = {"transport.ib-verbs.device-name", + "ib-verbs-device-name"}, + .type = GF_OPTION_TYPE_ANY, + .description = "check by 'ibv_devinfo'" + }, + { .key = {"transport.ib-verbs.work-request-send-size", + "ib-verbs-work-request-send-size"}, + .type = GF_OPTION_TYPE_INT, + }, + { .key = {"transport.ib-verbs.work-request-recv-size", + "ib-verbs-work-request-recv-size"}, + .type = GF_OPTION_TYPE_INT, + }, + { .key = {"transport.ib-verbs.work-request-send-count", + "ib-verbs-work-request-send-count"}, + .type = GF_OPTION_TYPE_INT, + }, + { .key = {"transport.ib-verbs.work-request-recv-count", + "ib-verbs-work-request-recv-count"}, + .type = GF_OPTION_TYPE_INT, + }, + { .key = {"remote-port", + "transport.remote-port", + "transport.ib-verbs.remote-port"}, + .type = GF_OPTION_TYPE_INT + }, + { .key = {"transport.ib-verbs.listen-port", 
"listen-port"}, + .type = GF_OPTION_TYPE_INT + }, + { .key = {"transport.ib-verbs.connect-path", "connect-path"}, + .type = GF_OPTION_TYPE_ANY + }, + { .key = {"transport.ib-verbs.bind-path", "bind-path"}, + .type = GF_OPTION_TYPE_ANY + }, + { .key = {"transport.ib-verbs.listen-path", "listen-path"}, + .type = GF_OPTION_TYPE_ANY + }, + { .key = {"transport.address-family", + "address-family"}, + .value = {"inet", "inet6", "inet/inet6", "inet6/inet", + "unix", "inet-sdp" }, + .type = GF_OPTION_TYPE_STR + }, + { .key = {NULL} } +}; diff --git a/transport/ib-verbs/src/ib-verbs.h b/transport/ib-verbs/src/ib-verbs.h new file mode 100644 index 000000000..56b717865 --- /dev/null +++ b/transport/ib-verbs/src/ib-verbs.h @@ -0,0 +1,215 @@ +/* + Copyright (c) 2006, 2007, 2008 Z RESEARCH, Inc. <http://www.zresearch.com> + This file is part of GlusterFS. + + GlusterFS is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published + by the Free Software Foundation; either version 3 of the License, + or (at your option) any later version. + + GlusterFS is distributed in the hope that it will be useful, but + WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program. If not, see + <http://www.gnu.org/licenses/>. 
+*/ + +#ifndef _XPORT_IB_VERBS_H +#define _XPORT_IB_VERBS_H + + +#ifndef _CONFIG_H +#define _CONFIG_H +#include "config.h" +#endif + +#ifndef MAX_IOVEC +#define MAX_IOVEC 16 +#endif /* MAX_IOVEC */ + +#include "xlator.h" +#include "event.h" + +#include <stdio.h> +#include <list.h> +#include <arpa/inet.h> +#include <infiniband/verbs.h> + +#define GF_DEFAULT_IBVERBS_LISTEN_PORT 6997 + +/* options per transport end point */ +struct _ib_verbs_options { + int32_t port; + char *device_name; + enum ibv_mtu mtu; + int32_t send_count, send_size, recv_count, recv_size; +}; +typedef struct _ib_verbs_options ib_verbs_options_t; + + +struct _ib_verbs_header { + char colonO[3]; + uint32_t size1; + uint32_t size2; + char version; +} __attribute__((packed)); +typedef struct _ib_verbs_header ib_verbs_header_t; + +struct _ib_verbs_ioq { + union { + struct list_head list; + struct { + struct _ib_verbs_ioq *next; + struct _ib_verbs_ioq *prev; + }; + }; + ib_verbs_header_t header; + struct iovec vector[MAX_IOVEC]; + int count; + char *buf; + dict_t *refs; +}; +typedef struct _ib_verbs_ioq ib_verbs_ioq_t; + +/* represents one communication peer, two per transport_t */ +struct _ib_verbs_peer { + transport_t *trans; + struct ibv_qp *qp; + + int32_t recv_count; + int32_t send_count; + int32_t recv_size; + int32_t send_size; + + int32_t quota; + union { + struct list_head ioq; + struct { + ib_verbs_ioq_t *ioq_next; + ib_verbs_ioq_t *ioq_prev; + }; + }; + + /* QP attributes, needed to connect with remote QP */ + int32_t local_lid; + int32_t local_psn; + int32_t local_qpn; + int32_t remote_lid; + int32_t remote_psn; + int32_t remote_qpn; +}; +typedef struct _ib_verbs_peer ib_verbs_peer_t; + + +struct _ib_verbs_post { + struct _ib_verbs_post *next, *prev; + struct ibv_mr *mr; + char *buf; + int32_t buf_size; + char aux; + int32_t reused; + pthread_barrier_t wait; +}; +typedef struct _ib_verbs_post ib_verbs_post_t; + + +struct _ib_verbs_queue { + ib_verbs_post_t active_posts, passive_posts; + 
int32_t active_count, passive_count; + pthread_mutex_t lock; +}; +typedef struct _ib_verbs_queue ib_verbs_queue_t; + + +struct _ib_verbs_qpreg { + pthread_mutex_t lock; + int32_t count; + struct _qpent { + struct _qpent *next, *prev; + int32_t qp_num; + ib_verbs_peer_t *peer; + } ents[42]; +}; +typedef struct _ib_verbs_qpreg ib_verbs_qpreg_t; + +/* context per device, stored in global glusterfs_ctx_t->ib */ +struct _ib_verbs_device { + struct _ib_verbs_device *next; + const char *device_name; + struct ibv_context *context; + int32_t port; + struct ibv_pd *pd; + struct ibv_srq *srq; + ib_verbs_qpreg_t qpreg; + struct ibv_comp_channel *send_chan, *recv_chan; + struct ibv_cq *send_cq, *recv_cq; + ib_verbs_queue_t sendq, recvq; + pthread_t send_thread, recv_thread; +}; +typedef struct _ib_verbs_device ib_verbs_device_t; + +typedef enum { + IB_VERBS_HANDSHAKE_START = 0, + IB_VERBS_HANDSHAKE_SENDING_DATA, + IB_VERBS_HANDSHAKE_RECEIVING_DATA, + IB_VERBS_HANDSHAKE_SENT_DATA, + IB_VERBS_HANDSHAKE_RECEIVED_DATA, + IB_VERBS_HANDSHAKE_SENDING_ACK, + IB_VERBS_HANDSHAKE_RECEIVING_ACK, + IB_VERBS_HANDSHAKE_RECEIVED_ACK, + IB_VERBS_HANDSHAKE_COMPLETE, +} ib_verbs_handshake_state_t; + +struct ib_verbs_nbio { + int state; + char *buf; + int count; + struct iovec vector; + struct iovec *pending_vector; + int pending_count; +}; + + +struct _ib_verbs_private { + int32_t sock; + int32_t idx; + unsigned char connected; + unsigned char tcp_connected; + unsigned char ib_connected; + in_addr_t addr; + unsigned short port; + + /* IB Verbs Driver specific variables, pointers */ + ib_verbs_peer_t peer; + ib_verbs_device_t *device; + ib_verbs_options_t options; + + /* Used by trans->op->receive */ + char *data_ptr; + int32_t data_offset; + int32_t data_len; + + /* Mutex */ + pthread_mutex_t read_mutex; + pthread_mutex_t write_mutex; + pthread_barrier_t handshake_barrier; + char handshake_ret; + + pthread_mutex_t recv_mutex; + + /* used during ib_verbs_handshake */ + struct { + struct 
ib_verbs_nbio incoming; + struct ib_verbs_nbio outgoing; + int state; + ib_verbs_header_t header; + char *buf; + size_t size; + } handshake; +}; +typedef struct _ib_verbs_private ib_verbs_private_t; + +#endif /* _XPORT_IB_VERBS_H */ diff --git a/transport/ib-verbs/src/name.c b/transport/ib-verbs/src/name.c new file mode 100644 index 000000000..697344987 --- /dev/null +++ b/transport/ib-verbs/src/name.c @@ -0,0 +1,682 @@ +/* + Copyright (c) 2008 Z RESEARCH, Inc. <http://www.zresearch.com> + This file is part of GlusterFS. + + GlusterFS is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published + by the Free Software Foundation; either version 3 of the License, + or (at your option) any later version. + + GlusterFS is distributed in the hope that it will be useful, but + WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program. If not, see + <http://www.gnu.org/licenses/>. 
+*/ + +#include <sys/types.h> +#include <sys/socket.h> +#include <errno.h> +#include <netdb.h> +#include <string.h> + +#ifdef CLIENT_PORT_CEILING +#undef CLIENT_PORT_CEILING +#endif + +#define CLIENT_PORT_CEILING 1024 + +#ifndef AF_INET_SDP +#define AF_INET_SDP 27 +#endif + +#include "transport.h" +#include "ib-verbs.h" + +int32_t +gf_resolve_ip6 (const char *hostname, + uint16_t port, + int family, + void **dnscache, + struct addrinfo **addr_info); + +static int32_t +af_inet_bind_to_port_lt_ceiling (int fd, struct sockaddr *sockaddr, + socklen_t sockaddr_len, int ceiling) +{ + int32_t ret = -1; + /* struct sockaddr_in sin = {0, }; */ + uint16_t port = ceiling - 1; + + while (port) + { + switch (sockaddr->sa_family) + { + case AF_INET6: + ((struct sockaddr_in6 *)sockaddr)->sin6_port = htons (port); + break; + + case AF_INET_SDP: + case AF_INET: + ((struct sockaddr_in *)sockaddr)->sin_port = htons (port); + break; + } + + ret = bind (fd, sockaddr, sockaddr_len); + + if (ret == 0) + break; + + if (ret == -1 && errno == EACCES) + break; + + port--; + } + + return ret; +} + +static int32_t +af_unix_client_bind (transport_t *this, + struct sockaddr *sockaddr, + socklen_t sockaddr_len, + int sock) +{ + data_t *path_data = NULL; + struct sockaddr_un *addr = NULL; + int32_t ret = -1; + + path_data = dict_get (this->xl->options, + "transport.ib-verbs.bind-path"); + if (path_data) { + char *path = data_to_str (path_data); + if (!path || strlen (path) > UNIX_PATH_MAX) { + gf_log (this->xl->name, GF_LOG_DEBUG, + "transport.ib-verbs.bind-path not specfied " + "for unix socket, letting connect to assign " + "default value"); + goto err; + } + + addr = (struct sockaddr_un *) sockaddr; + strcpy (addr->sun_path, path); + ret = bind (sock, (struct sockaddr *)addr, sockaddr_len); + if (ret == -1) { + gf_log (this->xl->name, GF_LOG_ERROR, + "cannot bind to unix-domain socket %d (%s)", + sock, strerror (errno)); + goto err; + } + } + +err: + return ret; +} + +static int32_t 
+client_fill_address_family (transport_t *this, struct sockaddr *sockaddr) +{ + data_t *address_family_data = NULL; + + address_family_data = dict_get (this->xl->options, + "transport.address-family"); + if (!address_family_data) { + data_t *remote_host_data = NULL, *connect_path_data = NULL; + remote_host_data = dict_get (this->xl->options, "remote-host"); + connect_path_data = dict_get (this->xl->options, + "transport.ib-verbs.connect-path"); + + if (!(remote_host_data || connect_path_data) || + (remote_host_data && connect_path_data)) { + gf_log (this->xl->name, GF_LOG_ERROR, + "address-family not specified and not able to " + "determine the same from other options " + "(remote-host:%s and connect-path:%s)", + data_to_str (remote_host_data), + data_to_str (connect_path_data)); + return -1; + } + + if (remote_host_data) { + gf_log (this->xl->name, GF_LOG_DEBUG, + "address-family not specified, guessing it " + "to be inet/inet6"); + sockaddr->sa_family = AF_UNSPEC; + } else { + gf_log (this->xl->name, GF_LOG_DEBUG, + "address-family not specified, guessing it " + "to be unix"); + sockaddr->sa_family = AF_UNIX; + } + + } else { + char *address_family = data_to_str (address_family_data); + if (!strcasecmp (address_family, "unix")) { + sockaddr->sa_family = AF_UNIX; + } else if (!strcasecmp (address_family, "inet")) { + sockaddr->sa_family = AF_INET; + } else if (!strcasecmp (address_family, "inet6")) { + sockaddr->sa_family = AF_INET6; + } else if (!strcasecmp (address_family, "inet-sdp")) { + sockaddr->sa_family = AF_INET_SDP; + } else if (!strcasecmp (address_family, "inet/inet6") + || !strcasecmp (address_family, "inet6/inet")) { + sockaddr->sa_family = AF_UNSPEC; + } else { + gf_log (this->xl->name, GF_LOG_ERROR, + "unknown address-family (%s) specified", + address_family); + return -1; + } + } + + return 0; +} + +static int32_t +af_inet_client_get_remote_sockaddr (transport_t *this, + struct sockaddr *sockaddr, + socklen_t *sockaddr_len) +{ + dict_t *options = 
this->xl->options; + data_t *remote_host_data = NULL; + data_t *remote_port_data = NULL; + char *remote_host = NULL; + uint16_t remote_port = 0; + struct addrinfo *addr_info = NULL; + int32_t ret = 0; + + remote_host_data = dict_get (options, "remote-host"); + if (remote_host_data == NULL) + { + gf_log (this->xl->name, GF_LOG_ERROR, + "option remote-host missing in volume %s", + this->xl->name); + ret = -1; + goto err; + } + + remote_host = data_to_str (remote_host_data); + if (remote_host == NULL) + { + gf_log (this->xl->name, GF_LOG_ERROR, + "option remote-host has data NULL in volume %s", + this->xl->name); + ret = -1; + goto err; + } + + remote_port_data = dict_get (options, "remote-port"); + if (remote_port_data == NULL) + { + gf_log (this->xl->name, GF_LOG_DEBUG, + "option remote-port missing in volume %s. " + "Defaulting to %d", + this->xl->name, GF_DEFAULT_IBVERBS_LISTEN_PORT); + + remote_port = GF_DEFAULT_IBVERBS_LISTEN_PORT; + } + else + { + remote_port = data_to_uint16 (remote_port_data); + } + + if (remote_port == (uint16_t)-1) + { + gf_log (this->xl->name, GF_LOG_ERROR, + "option remote-port has invalid port in volume %s", + this->xl->name); + ret = -1; + goto err; + } + + /* TODO: gf_resolve is a blocking call. 
kick in some + non blocking dns techniques */ + ret = gf_resolve_ip6 (remote_host, remote_port, + sockaddr->sa_family, + &this->dnscache, &addr_info); + if (ret == -1) { + gf_log (this->xl->name, GF_LOG_ERROR, + "DNS resolution failed on host %s", remote_host); + goto err; + } + + memcpy (sockaddr, addr_info->ai_addr, addr_info->ai_addrlen); + *sockaddr_len = addr_info->ai_addrlen; + +err: + return ret; +} + +static int32_t +af_unix_client_get_remote_sockaddr (transport_t *this, + struct sockaddr *sockaddr, + socklen_t *sockaddr_len) +{ + struct sockaddr_un *sockaddr_un = NULL; + char *connect_path = NULL; + data_t *connect_path_data = NULL; + int32_t ret = 0; + + connect_path_data = dict_get (this->xl->options, + "transport.ib-verbs.connect-path"); + if (!connect_path_data) { + gf_log (this->xl->name, GF_LOG_ERROR, + "option transport.ib-verbs.connect-path not " + "specified for address-family unix"); + ret = -1; + goto err; + } + + connect_path = data_to_str (connect_path_data); + if (!connect_path) { + gf_log (this->xl->name, GF_LOG_ERROR, + "connect-path is null-string"); + ret = -1; + goto err; + } + + if (strlen (connect_path) > UNIX_PATH_MAX) { + gf_log (this->xl->name, GF_LOG_ERROR, + "connect-path value length %"GF_PRI_SIZET" > " + "%d octets", strlen (connect_path), UNIX_PATH_MAX); + ret = -1; + goto err; + } + + gf_log (this->xl->name, + GF_LOG_DEBUG, + "using connect-path %s", connect_path); + sockaddr_un = (struct sockaddr_un *)sockaddr; + strcpy (sockaddr_un->sun_path, connect_path); + *sockaddr_len = sizeof (struct sockaddr_un); + +err: + return ret; +} + +static int32_t +af_unix_server_get_local_sockaddr (transport_t *this, + struct sockaddr *addr, + socklen_t *addr_len) +{ + data_t *listen_path_data = NULL; + char *listen_path = NULL; + int32_t ret = 0; + struct sockaddr_un *sunaddr = (struct sockaddr_un *)addr; + + + listen_path_data = dict_get (this->xl->options, + "transport.ib-verbs.listen-path"); + if (!listen_path_data) { + gf_log 
(this->xl->name, GF_LOG_ERROR, + "missing option listen-path"); + ret = -1; + goto err; + } + + listen_path = data_to_str (listen_path_data); + +#ifndef UNIX_PATH_MAX +#define UNIX_PATH_MAX 108 +#endif + + if (strlen (listen_path) > UNIX_PATH_MAX) { + gf_log (this->xl->name, GF_LOG_ERROR, + "option listen-path has value length %"GF_PRI_SIZET" > %d", + strlen (listen_path), UNIX_PATH_MAX); + ret = -1; + goto err; + } + + sunaddr->sun_family = AF_UNIX; + strcpy (sunaddr->sun_path, listen_path); + *addr_len = sizeof (struct sockaddr_un); + +err: + return ret; +} + +static int32_t +af_inet_server_get_local_sockaddr (transport_t *this, + struct sockaddr *addr, + socklen_t *addr_len) +{ + struct addrinfo hints, *res = 0; + data_t *listen_port_data = NULL, *listen_host_data = NULL; + uint16_t listen_port = -1; + char service[NI_MAXSERV], *listen_host = NULL; + dict_t *options = NULL; + int32_t ret = 0; + + options = this->xl->options; + + listen_port_data = dict_get (options, "transport.ib-verbs.listen-port"); + listen_host_data = dict_get (options, "transport.ib-verbs.bind-address"); + + if (listen_port_data) + { + listen_port = data_to_uint16 (listen_port_data); + } + + if (listen_port == (uint16_t) -1) + listen_port = GF_DEFAULT_IBVERBS_LISTEN_PORT; + + + if (listen_host_data) + { + listen_host = data_to_str (listen_host_data); + } + + memset (service, 0, sizeof (service)); + sprintf (service, "%d", listen_port); + + memset (&hints, 0, sizeof (hints)); + hints.ai_family = addr->sa_family; + hints.ai_socktype = SOCK_STREAM; + hints.ai_flags = AI_ADDRCONFIG | AI_PASSIVE; + + ret = getaddrinfo(listen_host, service, &hints, &res); + if (ret != 0) { + gf_log (this->xl->name, + GF_LOG_ERROR, + "getaddrinfo failed for host %s, service %s (%s)", + listen_host, service, gai_strerror (ret)); + ret = -1; + goto err; + } + + memcpy (addr, res->ai_addr, res->ai_addrlen); + *addr_len = res->ai_addrlen; + + freeaddrinfo (res); + +err: + return ret; +} + +int32_t +client_bind 
(transport_t *this, + struct sockaddr *sockaddr, + socklen_t *sockaddr_len, + int sock) +{ + int ret = 0; + + *sockaddr_len = sizeof (struct sockaddr_in6); + switch (sockaddr->sa_family) + { + case AF_INET_SDP: + case AF_INET: + *sockaddr_len = sizeof (struct sockaddr_in); + + case AF_INET6: + ret = af_inet_bind_to_port_lt_ceiling (sock, sockaddr, + *sockaddr_len, + CLIENT_PORT_CEILING); + if (ret == -1) { + gf_log (this->xl->name, GF_LOG_ERROR, + "cannot bind inet socket (%d) to port " + "less than %d (%s)", + sock, CLIENT_PORT_CEILING, strerror (errno)); + ret = 0; + } + break; + + case AF_UNIX: + *sockaddr_len = sizeof (struct sockaddr_un); + ret = af_unix_client_bind (this, (struct sockaddr *)sockaddr, + *sockaddr_len, sock); + break; + + default: + gf_log (this->xl->name, GF_LOG_ERROR, + "unknown address family %d", sockaddr->sa_family); + ret = -1; + break; + } + + return ret; +} + +int32_t +ibverbs_client_get_remote_sockaddr (transport_t *this, + struct sockaddr *sockaddr, + socklen_t *sockaddr_len) +{ + int32_t ret = 0; + char is_inet_sdp = 0; + + ret = client_fill_address_family (this, sockaddr); + if (ret) { + ret = -1; + goto err; + } + + switch (sockaddr->sa_family) + { + case AF_INET_SDP: + sockaddr->sa_family = AF_INET; + is_inet_sdp = 1; + + case AF_INET: + case AF_INET6: + case AF_UNSPEC: + ret = af_inet_client_get_remote_sockaddr (this, + sockaddr, + sockaddr_len); + + if (is_inet_sdp) { + sockaddr->sa_family = AF_INET_SDP; + } + + break; + + case AF_UNIX: + ret = af_unix_client_get_remote_sockaddr (this, + sockaddr, + sockaddr_len); + break; + + default: + gf_log (this->xl->name, GF_LOG_ERROR, + "unknown address-family %d", sockaddr->sa_family); + ret = -1; + } + +err: + return ret; +} + +int32_t +ibverbs_server_get_local_sockaddr (transport_t *this, + struct sockaddr *addr, + socklen_t *addr_len) +{ + data_t *address_family_data = NULL; + int32_t ret = 0; + char is_inet_sdp = 0; + + address_family_data = dict_get (this->xl->options, + 
"transport.address-family"); + if (address_family_data) { + char *address_family = NULL; + address_family = data_to_str (address_family_data); + + if (!strcasecmp (address_family, "inet")) { + addr->sa_family = AF_INET; + } else if (!strcasecmp (address_family, "inet6")) { + addr->sa_family = AF_INET6; + } else if (!strcasecmp (address_family, "inet-sdp")) { + addr->sa_family = AF_INET_SDP; + } else if (!strcasecmp (address_family, "unix")) { + addr->sa_family = AF_UNIX; + } else if (!strcasecmp (address_family, "inet/inet6") + || !strcasecmp (address_family, "inet6/inet")) { + addr->sa_family = AF_UNSPEC; + } else { + gf_log (this->xl->name, GF_LOG_ERROR, + "unknown address family (%s) specified", + address_family); + ret = -1; + goto err; + } + } else { + gf_log (this->xl->name, GF_LOG_DEBUG, + "option address-family not specified, defaulting " + "to inet/inet6"); + addr->sa_family = AF_UNSPEC; + } + + switch (addr->sa_family) + { + case AF_INET_SDP: + is_inet_sdp = 1; + addr->sa_family = AF_INET; + + case AF_INET: + case AF_INET6: + case AF_UNSPEC: + ret = af_inet_server_get_local_sockaddr (this, addr, addr_len); + if (is_inet_sdp && !ret) { + addr->sa_family = AF_INET_SDP; + } + break; + + case AF_UNIX: + ret = af_unix_server_get_local_sockaddr (this, addr, addr_len); + break; + } + +err: + return ret; +} + +int32_t +fill_inet6_inet_identifiers (transport_t *this, struct sockaddr_storage *addr, + int32_t addr_len, char *identifier) +{ + int32_t ret = 0, tmpaddr_len = 0; + char service[NI_MAXSERV], host[NI_MAXHOST]; + struct sockaddr_storage tmpaddr; + + memset (&tmpaddr, 0, sizeof (tmpaddr)); + tmpaddr = *addr; + tmpaddr_len = addr_len; + + if (((struct sockaddr *) &tmpaddr)->sa_family == AF_INET6) { + int32_t one_to_four, four_to_eight, twelve_to_sixteen; + int16_t eight_to_ten, ten_to_twelve; + + one_to_four = four_to_eight = twelve_to_sixteen = 0; + eight_to_ten = ten_to_twelve = 0; + + one_to_four = ((struct sockaddr_in6 *) 
&tmpaddr)->sin6_addr.s6_addr32[0]; + four_to_eight = ((struct sockaddr_in6 *) &tmpaddr)->sin6_addr.s6_addr32[1]; + eight_to_ten = ((struct sockaddr_in6 *) &tmpaddr)->sin6_addr.s6_addr16[4]; + ten_to_twelve = ((struct sockaddr_in6 *) &tmpaddr)->sin6_addr.s6_addr16[5]; + twelve_to_sixteen = ((struct sockaddr_in6 *) &tmpaddr)->sin6_addr.s6_addr32[3]; + + /* ipv4 mapped ipv6 address has + bits 0-80: 0 + bits 80-96: 0xffff + bits 96-128: ipv4 address + */ + + if (one_to_four == 0 && + four_to_eight == 0 && + eight_to_ten == 0 && + ten_to_twelve == -1) { + struct sockaddr_in *in_ptr = (struct sockaddr_in *)&tmpaddr; + memset (&tmpaddr, 0, sizeof (tmpaddr)); + + in_ptr->sin_family = AF_INET; + in_ptr->sin_port = ((struct sockaddr_in6 *)addr)->sin6_port; + in_ptr->sin_addr.s_addr = twelve_to_sixteen; + tmpaddr_len = sizeof (*in_ptr); + } + } + + ret = getnameinfo ((struct sockaddr *) &tmpaddr, + tmpaddr_len, + host, sizeof (host), + service, sizeof (service), + NI_NUMERICHOST | NI_NUMERICSERV); + if (ret != 0) { + gf_log (this->xl->name, + GF_LOG_ERROR, + "getnameinfo failed (%s)", gai_strerror (ret)); + } + + sprintf (identifier, "%s:%s", host, service); + + return ret; +} + +int32_t +get_transport_identifiers (transport_t *this) +{ + int32_t ret = 0; + char is_inet_sdp = 0; + + switch (((struct sockaddr *) &this->myinfo.sockaddr)->sa_family) + { + case AF_INET_SDP: + is_inet_sdp = 1; + ((struct sockaddr *) &this->peerinfo.sockaddr)->sa_family = ((struct sockaddr *) &this->myinfo.sockaddr)->sa_family = AF_INET; + + case AF_INET: + case AF_INET6: + { + ret = fill_inet6_inet_identifiers (this, + &this->myinfo.sockaddr, + this->myinfo.sockaddr_len, + this->myinfo.identifier); + if (ret == -1) { + gf_log (this->xl->name, GF_LOG_ERROR, + "can't fill inet/inet6 identifier for server"); + goto err; + } + + ret = fill_inet6_inet_identifiers (this, + &this->peerinfo.sockaddr, + this->peerinfo.sockaddr_len, + this->peerinfo.identifier); + if (ret == -1) { + gf_log (this->xl->name, 
GF_LOG_ERROR, + "can't fill inet/inet6 identifier for client"); + goto err; + } + + if (is_inet_sdp) { + ((struct sockaddr *) &this->peerinfo.sockaddr)->sa_family = ((struct sockaddr *) &this->myinfo.sockaddr)->sa_family = AF_INET_SDP; + } + } + break; + + case AF_UNIX: + { + struct sockaddr_un *sunaddr = NULL; + + sunaddr = (struct sockaddr_un *) &this->myinfo.sockaddr; + strcpy (this->myinfo.identifier, sunaddr->sun_path); + + sunaddr = (struct sockaddr_un *) &this->peerinfo.sockaddr; + strcpy (this->peerinfo.identifier, sunaddr->sun_path); + } + break; + + default: + gf_log (this->xl->name, GF_LOG_ERROR, + "unknown address family (%d)", + ((struct sockaddr *) &this->myinfo.sockaddr)->sa_family); + ret = -1; + break; + } + +err: + return ret; +} diff --git a/transport/ib-verbs/src/name.h b/transport/ib-verbs/src/name.h new file mode 100644 index 000000000..1b0f378b9 --- /dev/null +++ b/transport/ib-verbs/src/name.h @@ -0,0 +1,47 @@ +/* + Copyright (c) 2008 Z RESEARCH, Inc. <http://www.zresearch.com> + This file is part of GlusterFS. + + GlusterFS is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published + by the Free Software Foundation; either version 3 of the License, + or (at your option) any later version. + + GlusterFS is distributed in the hope that it will be useful, but + WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program. If not, see + <http://www.gnu.org/licenses/>. 
+*/ + +#ifndef _IB_VERBS_NAME_H +#define _IB_VERBS_NAME_H + +#include <sys/socket.h> +#include <sys/un.h> + +#include "compat.h" + +int32_t +client_bind (transport_t *this, + struct sockaddr *sockaddr, + socklen_t *sockaddr_len, + int sock); + +int32_t +ibverbs_client_get_remote_sockaddr (transport_t *this, + struct sockaddr *sockaddr, + socklen_t *sockaddr_len); + +int32_t +ibverbs_server_get_local_sockaddr (transport_t *this, + struct sockaddr *addr, + socklen_t *addr_len); + +int32_t +get_transport_identifiers (transport_t *this); + +#endif /* _IB_VERBS_NAME_H */ diff --git a/transport/socket/Makefile.am b/transport/socket/Makefile.am new file mode 100644 index 000000000..f963effea --- /dev/null +++ b/transport/socket/Makefile.am @@ -0,0 +1 @@ +SUBDIRS = src \ No newline at end of file diff --git a/transport/socket/src/Makefile.am b/transport/socket/src/Makefile.am new file mode 100644 index 000000000..e11292123 --- /dev/null +++ b/transport/socket/src/Makefile.am @@ -0,0 +1,14 @@ +noinst_HEADERS = socket.h name.h + +transport_LTLIBRARIES = socket.la +transportdir = $(libdir)/glusterfs/$(PACKAGE_VERSION)/transport/ + +socket_la_LDFLAGS = -module -avoidversion + +socket_la_SOURCES = socket.c name.c +socket_la_LIBADD = $(top_builddir)/libglusterfs/src/libglusterfs.la + +AM_CFLAGS = -fPIC -D_FILE_OFFSET_BITS=64 -D_GNU_SOURCE -Wall -D$(GF_HOST_OS)\ + -I$(top_srcdir)/libglusterfs/src -shared -nostartfiles $(GF_CFLAGS) + +CLEANFILES = *~ diff --git a/transport/socket/src/name.c b/transport/socket/src/name.c new file mode 100644 index 000000000..a599b00cc --- /dev/null +++ b/transport/socket/src/name.c @@ -0,0 +1,677 @@ +/* + Copyright (c) 2008 Z RESEARCH, Inc. <http://www.zresearch.com> + This file is part of GlusterFS. + + GlusterFS is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published + by the Free Software Foundation; either version 3 of the License, + or (at your option) any later version. 
+ + GlusterFS is distributed in the hope that it will be useful, but + WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program. If not, see + <http://www.gnu.org/licenses/>. +*/ + +#include <sys/types.h> +#include <sys/socket.h> +#include <errno.h> +#include <netdb.h> +#include <string.h> + +#ifdef CLIENT_PORT_CEILING +#undef CLIENT_PORT_CEILING +#endif + +#define CLIENT_PORT_CEILING 1024 + +#ifndef AF_INET_SDP +#define AF_INET_SDP 27 +#endif + +#include "transport.h" +#include "socket.h" + +int32_t +gf_resolve_ip6 (const char *hostname, + uint16_t port, + int family, + void **dnscache, + struct addrinfo **addr_info); + +static int32_t +af_inet_bind_to_port_lt_ceiling (int fd, struct sockaddr *sockaddr, + socklen_t sockaddr_len, int ceiling) +{ + int32_t ret = -1; + /* struct sockaddr_in sin = {0, }; */ + uint16_t port = ceiling - 1; + + while (port) + { + switch (sockaddr->sa_family) + { + case AF_INET6: + ((struct sockaddr_in6 *)sockaddr)->sin6_port = htons (port); + break; + + case AF_INET_SDP: + case AF_INET: + ((struct sockaddr_in *)sockaddr)->sin_port = htons (port); + break; + } + + ret = bind (fd, sockaddr, sockaddr_len); + + if (ret == 0) + break; + + if (ret == -1 && errno == EACCES) + break; + + port--; + } + + return ret; +} + +static int32_t +af_unix_client_bind (transport_t *this, + struct sockaddr *sockaddr, + socklen_t sockaddr_len, + int sock) +{ + data_t *path_data = NULL; + struct sockaddr_un *addr = NULL; + int32_t ret = -1; + + path_data = dict_get (this->xl->options, "transport.socket.bind-path"); + if (path_data) { + char *path = data_to_str (path_data); + if (!path || strlen (path) > UNIX_PATH_MAX) { + gf_log (this->xl->name, GF_LOG_DEBUG, + "bind-path not specfied for unix socket, " + "letting connect to assign default value"); + goto 
err; + } + + addr = (struct sockaddr_un *) sockaddr; + strcpy (addr->sun_path, path); + ret = bind (sock, (struct sockaddr *)addr, sockaddr_len); + if (ret == -1) { + gf_log (this->xl->name, GF_LOG_ERROR, + "cannot bind to unix-domain socket %d (%s)", + sock, strerror (errno)); + goto err; + } + } + +err: + return ret; +} + +static int32_t +client_fill_address_family (transport_t *this, struct sockaddr *sockaddr) +{ + data_t *address_family_data = NULL; + + address_family_data = dict_get (this->xl->options, + "transport.address-family"); + if (!address_family_data) { + data_t *remote_host_data = NULL, *connect_path_data = NULL; + remote_host_data = dict_get (this->xl->options, "remote-host"); + connect_path_data = dict_get (this->xl->options, + "transport.socket.connect-path"); + + if (!(remote_host_data || connect_path_data) || + (remote_host_data && connect_path_data)) { + gf_log (this->xl->name, GF_LOG_ERROR, + "transport.address-family not specified and " + "not able to determine the " + "same from other options (remote-host:%s and " + "transport.unix.connect-path:%s)", + data_to_str (remote_host_data), + data_to_str (connect_path_data)); + return -1; + } + + if (remote_host_data) { + gf_log (this->xl->name, GF_LOG_DEBUG, + "address-family not specified, guessing it " + "to be inet/inet6"); + sockaddr->sa_family = AF_UNSPEC; + } else { + gf_log (this->xl->name, GF_LOG_DEBUG, + "address-family not specified, guessing it " + "to be unix"); + sockaddr->sa_family = AF_UNIX; + } + + } else { + char *address_family = data_to_str (address_family_data); + if (!strcasecmp (address_family, "unix")) { + sockaddr->sa_family = AF_UNIX; + } else if (!strcasecmp (address_family, "inet")) { + sockaddr->sa_family = AF_INET; + } else if (!strcasecmp (address_family, "inet6")) { + sockaddr->sa_family = AF_INET6; + } else if (!strcasecmp (address_family, "inet-sdp")) { + sockaddr->sa_family = AF_INET_SDP; + } else if (!strcasecmp (address_family, "inet/inet6") + || !strcasecmp 
(address_family, "inet6/inet")) { + sockaddr->sa_family = AF_UNSPEC; + } else { + gf_log (this->xl->name, GF_LOG_ERROR, + "unknown address-family (%s) specified", + address_family); + return -1; + } + } + + return 0; +} + +static int32_t +af_inet_client_get_remote_sockaddr (transport_t *this, + struct sockaddr *sockaddr, + socklen_t *sockaddr_len) +{ + dict_t *options = this->xl->options; + data_t *remote_host_data = NULL; + data_t *remote_port_data = NULL; + char *remote_host = NULL; + uint16_t remote_port = 0; + struct addrinfo *addr_info = NULL; + int32_t ret = 0; + + remote_host_data = dict_get (options, "remote-host"); + if (remote_host_data == NULL) + { + gf_log (this->xl->name, GF_LOG_ERROR, + "option remote-host missing in volume %s", this->xl->name); + ret = -1; + goto err; + } + + remote_host = data_to_str (remote_host_data); + if (remote_host == NULL) + { + gf_log (this->xl->name, GF_LOG_ERROR, + "option remote-host has data NULL in volume %s", this->xl->name); + ret = -1; + goto err; + } + + remote_port_data = dict_get (options, "remote-port"); + if (remote_port_data == NULL) + { + gf_log (this->xl->name, GF_LOG_DEBUG, + "option remote-port missing in volume %s. Defaulting to %d", + this->xl->name, GF_DEFAULT_SOCKET_LISTEN_PORT); + + remote_port = GF_DEFAULT_SOCKET_LISTEN_PORT; + } + else + { + remote_port = data_to_uint16 (remote_port_data); + } + + if (remote_port == (uint16_t)-1) + { + gf_log (this->xl->name, GF_LOG_ERROR, + "option remote-port has invalid port in volume %s", + this->xl->name); + ret = -1; + goto err; + } + + /* TODO: gf_resolve is a blocking call. 
kick in some + non blocking dns techniques */ + ret = gf_resolve_ip6 (remote_host, remote_port, + sockaddr->sa_family, &this->dnscache, &addr_info); + if (ret == -1) { + gf_log (this->xl->name, GF_LOG_ERROR, + "DNS resolution failed on host %s", remote_host); + goto err; + } + + memcpy (sockaddr, addr_info->ai_addr, addr_info->ai_addrlen); + *sockaddr_len = addr_info->ai_addrlen; + +err: + return ret; +} + +static int32_t +af_unix_client_get_remote_sockaddr (transport_t *this, + struct sockaddr *sockaddr, + socklen_t *sockaddr_len) +{ + struct sockaddr_un *sockaddr_un = NULL; + char *connect_path = NULL; + data_t *connect_path_data = NULL; + int32_t ret = 0; + + connect_path_data = dict_get (this->xl->options, + "transport.socket.connect-path"); + if (!connect_path_data) { + gf_log (this->xl->name, GF_LOG_ERROR, + "option transport.unix.connect-path not specified for " + "address-family unix"); + ret = -1; + goto err; + } + + connect_path = data_to_str (connect_path_data); + if (!connect_path) { + gf_log (this->xl->name, GF_LOG_ERROR, + "transport.unix.connect-path is null-string"); + ret = -1; + goto err; + } + + if (strlen (connect_path) > UNIX_PATH_MAX) { + gf_log (this->xl->name, GF_LOG_ERROR, + "connect-path value length %"GF_PRI_SIZET" > %d octets", + strlen (connect_path), UNIX_PATH_MAX); + ret = -1; + goto err; + } + + gf_log (this->xl->name, GF_LOG_DEBUG, + "using connect-path %s", connect_path); + sockaddr_un = (struct sockaddr_un *)sockaddr; + strcpy (sockaddr_un->sun_path, connect_path); + *sockaddr_len = sizeof (struct sockaddr_un); + +err: + return ret; +} + +static int32_t +af_unix_server_get_local_sockaddr (transport_t *this, + struct sockaddr *addr, + socklen_t *addr_len) +{ + data_t *listen_path_data = NULL; + char *listen_path = NULL; + int32_t ret = 0; + struct sockaddr_un *sunaddr = (struct sockaddr_un *)addr; + + + listen_path_data = dict_get (this->xl->options, + "transport.socket.listen-path"); + if (!listen_path_data) { + gf_log 
(this->xl->name, GF_LOG_ERROR, + "missing option transport.socket.listen-path"); + ret = -1; + goto err; + } + + listen_path = data_to_str (listen_path_data); + +#ifndef UNIX_PATH_MAX +#define UNIX_PATH_MAX 108 +#endif + + if (strlen (listen_path) > UNIX_PATH_MAX) { + gf_log (this->xl->name, GF_LOG_ERROR, + "option transport.unix.listen-path has value length " + "%"GF_PRI_SIZET" > %d", + strlen (listen_path), UNIX_PATH_MAX); + ret = -1; + goto err; + } + + sunaddr->sun_family = AF_UNIX; + strcpy (sunaddr->sun_path, listen_path); + *addr_len = sizeof (struct sockaddr_un); + +err: + return ret; +} + +static int32_t +af_inet_server_get_local_sockaddr (transport_t *this, + struct sockaddr *addr, + socklen_t *addr_len) +{ + struct addrinfo hints, *res = 0; + data_t *listen_port_data = NULL, *listen_host_data = NULL; + uint16_t listen_port = -1; + char service[NI_MAXSERV], *listen_host = NULL; + dict_t *options = NULL; + int32_t ret = 0; + + options = this->xl->options; + + listen_port_data = dict_get (options, "transport.socket.listen-port"); + listen_host_data = dict_get (options, "transport.socket.bind-address"); + + if (listen_port_data) + { + listen_port = data_to_uint16 (listen_port_data); + } + + if (listen_port == (uint16_t) -1) + listen_port = GF_DEFAULT_SOCKET_LISTEN_PORT; + + + if (listen_host_data) + { + listen_host = data_to_str (listen_host_data); + } + + memset (service, 0, sizeof (service)); + sprintf (service, "%d", listen_port); + + memset (&hints, 0, sizeof (hints)); + hints.ai_family = addr->sa_family; + hints.ai_socktype = SOCK_STREAM; + hints.ai_flags = AI_ADDRCONFIG | AI_PASSIVE; + + ret = getaddrinfo(listen_host, service, &hints, &res); + if (ret != 0) { + gf_log (this->xl->name, GF_LOG_ERROR, + "getaddrinfo failed for host %s, service %s (%s)", + listen_host, service, gai_strerror (ret)); + ret = -1; + goto err; + } + + memcpy (addr, res->ai_addr, res->ai_addrlen); + *addr_len = res->ai_addrlen; + + freeaddrinfo (res); + +err: + return ret; +} + 
+int32_t +client_bind (transport_t *this, + struct sockaddr *sockaddr, + socklen_t *sockaddr_len, + int sock) +{ + int ret = 0; + + *sockaddr_len = sizeof (struct sockaddr_in6); + switch (sockaddr->sa_family) + { + case AF_INET_SDP: + case AF_INET: + *sockaddr_len = sizeof (struct sockaddr_in); + + case AF_INET6: + ret = af_inet_bind_to_port_lt_ceiling (sock, sockaddr, + *sockaddr_len, CLIENT_PORT_CEILING); + if (ret == -1) { + gf_log (this->xl->name, GF_LOG_ERROR, + "cannot bind inet socket (%d) to port less than %d (%s)", + sock, CLIENT_PORT_CEILING, strerror (errno)); + ret = 0; + } + break; + + case AF_UNIX: + *sockaddr_len = sizeof (struct sockaddr_un); + ret = af_unix_client_bind (this, (struct sockaddr *)sockaddr, + *sockaddr_len, sock); + break; + + default: + gf_log (this->xl->name, GF_LOG_ERROR, + "unknown address family %d", sockaddr->sa_family); + ret = -1; + break; + } + + return ret; +} + +int32_t +socket_client_get_remote_sockaddr (transport_t *this, + struct sockaddr *sockaddr, + socklen_t *sockaddr_len) +{ + int32_t ret = 0; + char is_inet_sdp = 0; + + ret = client_fill_address_family (this, sockaddr); + if (ret) { + ret = -1; + goto err; + } + + switch (sockaddr->sa_family) + { + case AF_INET_SDP: + sockaddr->sa_family = AF_INET; + is_inet_sdp = 1; + + case AF_INET: + case AF_INET6: + case AF_UNSPEC: + ret = af_inet_client_get_remote_sockaddr (this, sockaddr, sockaddr_len); + + if (is_inet_sdp) { + sockaddr->sa_family = AF_INET_SDP; + } + + break; + + case AF_UNIX: + ret = af_unix_client_get_remote_sockaddr (this, sockaddr, sockaddr_len); + break; + + default: + gf_log (this->xl->name, GF_LOG_ERROR, + "unknown address-family %d", sockaddr->sa_family); + ret = -1; + } + +err: + return ret; +} + +int32_t +socket_server_get_local_sockaddr (transport_t *this, + struct sockaddr *addr, + socklen_t *addr_len) +{ + data_t *address_family_data = NULL; + int32_t ret = 0; + char is_inet_sdp = 0; + + address_family_data = dict_get (this->xl->options, + 
"transport.address-family"); + if (address_family_data) { + char *address_family = NULL; + address_family = data_to_str (address_family_data); + + if (!strcasecmp (address_family, "inet")) { + addr->sa_family = AF_INET; + } else if (!strcasecmp (address_family, "inet6")) { + addr->sa_family = AF_INET6; + } else if (!strcasecmp (address_family, "inet-sdp")) { + addr->sa_family = AF_INET_SDP; + } else if (!strcasecmp (address_family, "unix")) { + addr->sa_family = AF_UNIX; + } else if (!strcasecmp (address_family, "inet/inet6") + || !strcasecmp (address_family, "inet6/inet")) { + addr->sa_family = AF_UNSPEC; + } else { + gf_log (this->xl->name, GF_LOG_ERROR, + "unknown address family (%s) specified", address_family); + ret = -1; + goto err; + } + } else { + gf_log (this->xl->name, GF_LOG_DEBUG, + "option address-family not specified, defaulting to inet/inet6"); + addr->sa_family = AF_UNSPEC; + } + + switch (addr->sa_family) + { + case AF_INET_SDP: + is_inet_sdp = 1; + addr->sa_family = AF_INET; + + case AF_INET: + case AF_INET6: + case AF_UNSPEC: + ret = af_inet_server_get_local_sockaddr (this, addr, addr_len); + if (is_inet_sdp && !ret) { + addr->sa_family = AF_INET_SDP; + } + break; + + case AF_UNIX: + ret = af_unix_server_get_local_sockaddr (this, addr, addr_len); + break; + } + +err: + return ret; +} + +int32_t +fill_inet6_inet_identifiers (transport_t *this, struct sockaddr_storage *addr, + int32_t addr_len, char *identifier) +{ + int32_t ret = 0, tmpaddr_len = 0; + char service[NI_MAXSERV], host[NI_MAXHOST]; + struct sockaddr_storage tmpaddr; + + memset (&tmpaddr, 0, sizeof (tmpaddr)); + tmpaddr = *addr; + tmpaddr_len = addr_len; + + if (((struct sockaddr *) &tmpaddr)->sa_family == AF_INET6) { + int32_t one_to_four, four_to_eight, twelve_to_sixteen; + int16_t eight_to_ten, ten_to_twelve; + + one_to_four = four_to_eight = twelve_to_sixteen = 0; + eight_to_ten = ten_to_twelve = 0; + + one_to_four = ((struct sockaddr_in6 *) &tmpaddr)->sin6_addr.s6_addr32[0]; + 
four_to_eight = ((struct sockaddr_in6 *) &tmpaddr)->sin6_addr.s6_addr32[1]; +#ifdef GF_SOLARIS_HOST_OS + eight_to_ten = S6_ADDR16(((struct sockaddr_in6 *) &tmpaddr)->sin6_addr)[4]; +#else + eight_to_ten = ((struct sockaddr_in6 *) &tmpaddr)->sin6_addr.s6_addr16[4]; +#endif + +#ifdef GF_SOLARIS_HOST_OS + ten_to_twelve = S6_ADDR16(((struct sockaddr_in6 *) &tmpaddr)->sin6_addr)[5]; +#else + ten_to_twelve = ((struct sockaddr_in6 *) &tmpaddr)->sin6_addr.s6_addr16[5]; +#endif + + twelve_to_sixteen = ((struct sockaddr_in6 *) &tmpaddr)->sin6_addr.s6_addr32[3]; + + /* ipv4 mapped ipv6 address has + bits 0-80: 0 + bits 80-96: 0xffff + bits 96-128: ipv4 address + */ + + if (one_to_four == 0 && + four_to_eight == 0 && + eight_to_ten == 0 && + ten_to_twelve == -1) { + struct sockaddr_in *in_ptr = (struct sockaddr_in *)&tmpaddr; + memset (&tmpaddr, 0, sizeof (tmpaddr)); + + in_ptr->sin_family = AF_INET; + in_ptr->sin_port = ((struct sockaddr_in6 *)addr)->sin6_port; + in_ptr->sin_addr.s_addr = twelve_to_sixteen; + tmpaddr_len = sizeof (*in_ptr); + } + } + + ret = getnameinfo ((struct sockaddr *) &tmpaddr, + tmpaddr_len, + host, sizeof (host), + service, sizeof (service), + NI_NUMERICHOST | NI_NUMERICSERV); + if (ret != 0) { + gf_log (this->xl->name, GF_LOG_ERROR, + "getnameinfo failed (%s)", gai_strerror (ret)); + } + + sprintf (identifier, "%s:%s", host, service); + + return ret; +} + +int32_t +get_transport_identifiers (transport_t *this) +{ + int32_t ret = 0; + char is_inet_sdp = 0; + + switch (((struct sockaddr *) &this->myinfo.sockaddr)->sa_family) + { + case AF_INET_SDP: + is_inet_sdp = 1; + ((struct sockaddr *) &this->peerinfo.sockaddr)->sa_family = ((struct sockaddr *) &this->myinfo.sockaddr)->sa_family = AF_INET; + + case AF_INET: + case AF_INET6: + { + ret = fill_inet6_inet_identifiers (this, + &this->myinfo.sockaddr, + this->myinfo.sockaddr_len, + this->myinfo.identifier); + if (ret == -1) { + gf_log (this->xl->name, GF_LOG_ERROR, + "cannot fill inet/inet6 identifier 
for server"); + goto err; + } + + ret = fill_inet6_inet_identifiers (this, + &this->peerinfo.sockaddr, + this->peerinfo.sockaddr_len, + this->peerinfo.identifier); + if (ret == -1) { + gf_log (this->xl->name, GF_LOG_ERROR, + "cannot fill inet/inet6 identifier for client"); + goto err; + } + + if (is_inet_sdp) { + ((struct sockaddr *) &this->peerinfo.sockaddr)->sa_family = ((struct sockaddr *) &this->myinfo.sockaddr)->sa_family = AF_INET_SDP; + } + } + break; + + case AF_UNIX: + { + struct sockaddr_un *sunaddr = NULL; + + sunaddr = (struct sockaddr_un *) &this->myinfo.sockaddr; + strcpy (this->myinfo.identifier, sunaddr->sun_path); + + sunaddr = (struct sockaddr_un *) &this->peerinfo.sockaddr; + strcpy (this->peerinfo.identifier, sunaddr->sun_path); + } + break; + + default: + gf_log (this->xl->name, GF_LOG_ERROR, + "unknown address family (%d)", + ((struct sockaddr *) &this->myinfo.sockaddr)->sa_family); + ret = -1; + break; + } + +err: + return ret; +} diff --git a/transport/socket/src/name.h b/transport/socket/src/name.h new file mode 100644 index 000000000..552037bcc --- /dev/null +++ b/transport/socket/src/name.h @@ -0,0 +1,44 @@ +/* + Copyright (c) 2008 Z RESEARCH, Inc. <http://www.zresearch.com> + This file is part of GlusterFS. + + GlusterFS is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published + by the Free Software Foundation; either version 3 of the License, + or (at your option) any later version. + + GlusterFS is distributed in the hope that it will be useful, but + WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program. If not, see + <http://www.gnu.org/licenses/>. 
+*/ + +#ifndef _SOCKET_NAME_H +#define _SOCKET_NAME_H + +#include "compat.h" + +int32_t +client_bind (transport_t *this, + struct sockaddr *sockaddr, + socklen_t *sockaddr_len, + int sock); + +int32_t +socket_client_get_remote_sockaddr (transport_t *this, + struct sockaddr *sockaddr, + socklen_t *sockaddr_len); + +int32_t +socket_server_get_local_sockaddr (transport_t *this, + struct sockaddr *addr, + socklen_t *addr_len); + +int32_t +get_transport_identifiers (transport_t *this); + +#endif /* _SOCKET_NAME_H */ diff --git a/transport/socket/src/socket.c b/transport/socket/src/socket.c new file mode 100644 index 000000000..066da7822 --- /dev/null +++ b/transport/socket/src/socket.c @@ -0,0 +1,1370 @@ +/* + Copyright (c) 2008 Z RESEARCH, Inc. <http://www.zresearch.com> + This file is part of GlusterFS. + + GlusterFS is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published + by the Free Software Foundation; either version 3 of the License, + or (at your option) any later version. + + GlusterFS is distributed in the hope that it will be useful, but + WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program. If not, see + <http://www.gnu.org/licenses/>. +*/ + + +#ifndef _CONFIG_H +#define _CONFIG_H +#include "config.h" +#endif + +#include "socket.h" +#include "name.h" +#include "dict.h" +#include "transport.h" +#include "logging.h" +#include "xlator.h" +#include "byte-order.h" +#include "common-utils.h" +#include "compat-errno.h" + +#include <fcntl.h> +#include <errno.h> + + +#define GF_LOG_ERRNO(errno) ((errno == ENOTCONN) ? 
GF_LOG_DEBUG : GF_LOG_ERROR) +#define SA(ptr) ((struct sockaddr *)ptr) + +int socket_init (transport_t *this); + +/* + * return value: + * 0 = success (completed) + * -1 = error + * > 0 = incomplete + */ + +int +__socket_rwv (transport_t *this, struct iovec *vector, int count, + struct iovec **pending_vector, int *pending_count, + int write) +{ + socket_private_t *priv = NULL; + int sock = -1; + int ret = -1; + struct iovec *opvector = NULL; + int opcount = 0; + int moved = 0; + + priv = this->private; + sock = priv->sock; + + opvector = vector; + opcount = count; + + while (opcount) { + if (write) { + ret = writev (sock, opvector, opcount); + + if (ret == 0 || (ret == -1 && errno == EAGAIN)) { + /* done for now */ + break; + } + } else { + ret = readv (sock, opvector, opcount); + + if (ret == -1 && errno == EAGAIN) { + /* done for now */ + break; + } + } + + if (ret == 0) { + /* Mostly due to 'umount' in client */ + gf_log (this->xl->name, GF_LOG_DEBUG, + "EOF from peer %s", this->peerinfo.identifier); + opcount = -1; + errno = ENOTCONN; + break; + } + + if (ret == -1) { + if (errno == EINTR) + continue; + + gf_log (this->xl->name, GF_LOG_ERROR, + "%s failed (%s)", write ? 
"writev" : "readv", + strerror (errno)); + opcount = -1; + break; + } + + moved = 0; + + while (moved < ret) { + if ((ret - moved) >= opvector[0].iov_len) { + moved += opvector[0].iov_len; + opvector++; + opcount--; + } else { + opvector[0].iov_len -= (ret - moved); + opvector[0].iov_base += (ret - moved); + moved += (ret - moved); + } + while (opcount && !opvector[0].iov_len) { + opvector++; + opcount--; + } + } + } + + if (pending_vector) + *pending_vector = opvector; + + if (pending_count) + *pending_count = opcount; + + return opcount; +} + + +int +__socket_readv (transport_t *this, struct iovec *vector, int count, + struct iovec **pending_vector, int *pending_count) +{ + int ret = -1; + + ret = __socket_rwv (this, vector, count, + pending_vector, pending_count, 0); + + return ret; +} + + +int +__socket_writev (transport_t *this, struct iovec *vector, int count, + struct iovec **pending_vector, int *pending_count) +{ + int ret = -1; + + ret = __socket_rwv (this, vector, count, + pending_vector, pending_count, 1); + + return ret; +} + + +int +__socket_disconnect (transport_t *this) +{ + socket_private_t *priv = NULL; + int ret = -1; + + priv = this->private; + + if (priv->sock != -1) { + ret = shutdown (priv->sock, SHUT_RDWR); + priv->connected = -1; + gf_log (this->xl->name, GF_LOG_DEBUG, + "shutdown() returned %d. 
set connection state to -1", + ret); + } + + return ret; +} + + +int +__socket_server_bind (transport_t *this) +{ + socket_private_t *priv = NULL; + int ret = -1; + int opt = 1; + + priv = this->private; + + ret = setsockopt (priv->sock, SOL_SOCKET, SO_REUSEADDR, + &opt, sizeof (opt)); + + if (ret == -1) { + gf_log (this->xl->name, GF_LOG_ERROR, + "setsockopt() for SO_REUSEADDR failed (%s)", + strerror (errno)); + } + + ret = bind (priv->sock, (struct sockaddr *)&this->myinfo.sockaddr, + this->myinfo.sockaddr_len); + + if (ret == -1) { + gf_log (this->xl->name, GF_LOG_ERROR, + "binding to %s failed: %s", + this->myinfo.identifier, strerror (errno)); + if (errno == EADDRINUSE) { + gf_log (this->xl->name, GF_LOG_ERROR, + "Port is already in use"); + } + } + + return ret; +} + + +int +__socket_nonblock (int fd) +{ + int flags = 0; + int ret = -1; + + flags = fcntl (fd, F_GETFL); + + if (flags != -1) + ret = fcntl (fd, F_SETFL, flags | O_NONBLOCK); + + return ret; +} + + +int +__socket_connect_finish (int fd) +{ + int ret = -1; + int optval = 0; + socklen_t optlen = sizeof (int); + + ret = getsockopt (fd, SOL_SOCKET, SO_ERROR, (void *)&optval, &optlen); + + if (ret == 0 && optval) { + errno = optval; + ret = -1; + } + + return ret; +} + + +void +__socket_reset (transport_t *this) +{ + socket_private_t *priv = NULL; + + priv = this->private; + + /* TODO: use mem-pool on incoming data */ + + if (priv->incoming.hdr_p) + free (priv->incoming.hdr_p); + + if (priv->incoming.buf_p) + free (priv->incoming.buf_p); + + memset (&priv->incoming, 0, sizeof (priv->incoming)); + + event_unregister (this->xl->ctx->event_pool, priv->sock, priv->idx); + close (priv->sock); + priv->sock = -1; + priv->idx = -1; + priv->connected = -1; +} + + +struct ioq * +__socket_ioq_new (transport_t *this, char *buf, int len, + struct iovec *vector, int count, dict_t *refs) +{ + socket_private_t *priv = NULL; + struct ioq *entry = NULL; + + priv = this->private; + + /* TODO: use mem-pool */ + entry = 
CALLOC (1, sizeof (*entry)); + + assert (count <= (MAX_IOVEC-2)); + + entry->header.colonO[0] = ':'; + entry->header.colonO[1] = 'O'; + entry->header.colonO[2] = '\0'; + entry->header.version = 42; + entry->header.size1 = hton32 (len); + entry->header.size2 = hton32 (iov_length (vector, count)); + + entry->vector[0].iov_base = &entry->header; + entry->vector[0].iov_len = sizeof (entry->header); + entry->count++; + + entry->vector[1].iov_base = buf; + entry->vector[1].iov_len = len; + entry->count++; + + if (vector && count) { + memcpy (&entry->vector[2], vector, sizeof (*vector) * count); + entry->count += count; + } + + entry->pending_vector = entry->vector; + entry->pending_count = entry->count; + + if (refs) + entry->refs = dict_ref (refs); + + entry->buf = buf; + + INIT_LIST_HEAD (&entry->list); + + return entry; +} + + +void +__socket_ioq_entry_free (struct ioq *entry) +{ + list_del_init (&entry->list); + if (entry->refs) + dict_unref (entry->refs); + + /* TODO: use mem-pool */ + free (entry->buf); + + /* TODO: use mem-pool */ + free (entry); +} + + +void +__socket_ioq_flush (transport_t *this) +{ + socket_private_t *priv = NULL; + struct ioq *entry = NULL; + + priv = this->private; + + while (!list_empty (&priv->ioq)) { + entry = priv->ioq_next; + __socket_ioq_entry_free (entry); + } + + return; +} + + +int +__socket_ioq_churn_entry (transport_t *this, struct ioq *entry) +{ + int ret = -1; + + ret = __socket_writev (this, entry->pending_vector, + entry->pending_count, + &entry->pending_vector, + &entry->pending_count); + + if (ret == 0) { + /* current entry was completely written */ + assert (entry->pending_count == 0); + __socket_ioq_entry_free (entry); + } + + return ret; +} + + +int +__socket_ioq_churn (transport_t *this) +{ + socket_private_t *priv = NULL; + int ret = 0; + struct ioq *entry = NULL; + + priv = this->private; + + while (!list_empty (&priv->ioq)) { + /* pick next entry */ + entry = priv->ioq_next; + + ret = __socket_ioq_churn_entry (this, 
entry); + + if (ret != 0) + break; + } + + if (list_empty (&priv->ioq)) { + /* all pending writes done, not interested in POLLOUT */ + priv->idx = event_select_on (this->xl->ctx->event_pool, + priv->sock, priv->idx, -1, 0); + } + + return ret; +} + + +int +socket_event_poll_err (transport_t *this) +{ + socket_private_t *priv = NULL; + int ret = -1; + + priv = this->private; + + pthread_mutex_lock (&priv->lock); + { + __socket_ioq_flush (this); + __socket_reset (this); + } + pthread_mutex_unlock (&priv->lock); + + this->xl->notify (this->xl, GF_EVENT_POLLERR, this); + + return ret; +} + + +int +socket_event_poll_out (transport_t *this) +{ + socket_private_t *priv = NULL; + int ret = -1; + + priv = this->private; + + pthread_mutex_lock (&priv->lock); + { + if (priv->connected == 1) { + ret = __socket_ioq_churn (this); + + if (ret == -1) { + __socket_disconnect (this); + } + } + } + pthread_mutex_unlock (&priv->lock); + + this->xl->notify (this->xl, GF_EVENT_POLLOUT, this); + + return ret; +} + + +int +__socket_proto_validate_header (transport_t *this, + struct socket_header *header, + size_t *size1_p, size_t *size2_p) +{ + size_t size1 = 0; + size_t size2 = 0; + + if (strcmp (header->colonO, ":O")) { + gf_log (this->xl->name, GF_LOG_ERROR, + "socket header signature does not match :O (%x.%x.%x)", + header->colonO[0], header->colonO[1], + header->colonO[2]); + return -1; + } + + if (header->version != 42) { + gf_log (this->xl->name, GF_LOG_ERROR, + "socket header version does not match 42 != %d", + header->version); + return -1; + } + + size1 = ntoh32 (header->size1); + size2 = ntoh32 (header->size2); + + if (size1 <= 0 || size1 > 1048576) { + gf_log (this->xl->name, GF_LOG_ERROR, + "socket header has incorrect size1=%"GF_PRI_SIZET, + size1); + return -1; + } + + if (size2 > (1048576 * 4)) { + gf_log (this->xl->name, GF_LOG_ERROR, + "socket header has incorrect size2=%"GF_PRI_SIZET, + size2); + return -1; + } + + if (size1_p) + *size1_p = size1; + + if (size2_p) + 
*size2_p = size2; + + return 0; +} + + + +/* socket protocol state machine */ + +int +__socket_proto_state_machine (transport_t *this) +{ + int ret = -1; + socket_private_t *priv = NULL; + size_t size1 = 0; + size_t size2 = 0; + int previous_state = -1; + struct socket_header *hdr = NULL; + + + priv = this->private; + + while (priv->incoming.state != SOCKET_PROTO_STATE_COMPLETE) { + /* debug check against infinite loops */ + if (previous_state == priv->incoming.state) { + gf_log (this->xl->name, GF_LOG_ERROR, + "state did not change! (%d) breaking", + previous_state); + ret = -1; + goto unlock; + } + previous_state = priv->incoming.state; + + switch (priv->incoming.state) { + + case SOCKET_PROTO_STATE_NADA: + priv->incoming.pending_vector = + priv->incoming.vector; + + priv->incoming.pending_vector->iov_base = + &priv->incoming.header; + + priv->incoming.pending_vector->iov_len = + sizeof (struct socket_header); + + priv->incoming.state = + SOCKET_PROTO_STATE_HEADER_COMING; + break; + + case SOCKET_PROTO_STATE_HEADER_COMING: + + ret = __socket_readv (this, + priv->incoming.pending_vector, 1, + &priv->incoming.pending_vector, + NULL); + if (ret == 0) { + priv->incoming.state = + SOCKET_PROTO_STATE_HEADER_CAME; + break; + } + + if (ret == -1) { + gf_log (this->xl->name, GF_LOG_ERRNO (errno), + "read (%s) in state %d (%s)", + strerror (errno), + SOCKET_PROTO_STATE_HEADER_COMING, + this->peerinfo.identifier); + goto unlock; + } + + if (ret > 0) { + gf_log (this->xl->name, GF_LOG_DEBUG, + "partial header read on NB socket."); + goto unlock; + } + break; + + case SOCKET_PROTO_STATE_HEADER_CAME: + hdr = &priv->incoming.header; + ret = __socket_proto_validate_header (this, hdr, + &size1, &size2); + + if (ret == -1) { + gf_log (this->xl->name, GF_LOG_ERROR, + "socket header validate failed (%s). 
" + "possible mismatch of transport-type " + "between server and client volumes, " + "or version mismatch", + this->peerinfo.identifier); + goto unlock; + } + + priv->incoming.hdrlen = size1; + priv->incoming.buflen = size2; + + /* TODO: use mem-pool */ + priv->incoming.hdr_p = MALLOC (size1); + if (size2) + priv->incoming.buf_p = MALLOC (size2); + + priv->incoming.vector[0].iov_base = + priv->incoming.hdr_p; + + priv->incoming.vector[0].iov_len = size1; + + priv->incoming.vector[1].iov_base = + priv->incoming.buf_p; + + priv->incoming.vector[1].iov_len = size2; + priv->incoming.count = size2 ? 2 : 1; + + priv->incoming.pending_vector = + priv->incoming.vector; + + priv->incoming.pending_count = + priv->incoming.count; + + priv->incoming.state = + SOCKET_PROTO_STATE_DATA_COMING; + break; + + case SOCKET_PROTO_STATE_DATA_COMING: + + ret = __socket_readv (this, + priv->incoming.pending_vector, + priv->incoming.pending_count, + &priv->incoming.pending_vector, + &priv->incoming.pending_count); + if (ret == 0) { + priv->incoming.state = + SOCKET_PROTO_STATE_DATA_CAME; + break; + } + + if (ret == -1) { + gf_log (this->xl->name, GF_LOG_ERROR, + "read (%s) in state %d (%s)", + strerror (errno), + SOCKET_PROTO_STATE_DATA_COMING, + this->peerinfo.identifier); + goto unlock; + } + + if (ret > 0) { + gf_log (this->xl->name, GF_LOG_DEBUG, + "partial data read on NB socket"); + goto unlock; + } + break; + + case SOCKET_PROTO_STATE_DATA_CAME: + memset (&priv->incoming.vector, 0, + sizeof (priv->incoming.vector)); + priv->incoming.pending_vector = NULL; + priv->incoming.pending_count = 0; + priv->incoming.state = SOCKET_PROTO_STATE_COMPLETE; + break; + + case SOCKET_PROTO_STATE_COMPLETE: + /* not reached */ + break; + + default: + gf_log (this->xl->name, GF_LOG_ERROR, + "undefined state reached: %d", + priv->incoming.state); + goto unlock; + } + } +unlock: + + return ret; +} + + +int +socket_proto_state_machine (transport_t *this) +{ + socket_private_t *priv = NULL; + int ret = 0; 
+ + priv = this->private; + + pthread_mutex_lock (&priv->lock); + { + ret = __socket_proto_state_machine (this); + } + pthread_mutex_unlock (&priv->lock); + + return ret; +} + + +int +socket_event_poll_in (transport_t *this) +{ + int ret = -1; + + ret = socket_proto_state_machine (this); + + /* call POLLIN on xlator even if complete block is not received, + just to keep the last_received timestamp ticking */ + + if (ret == 0) + ret = this->xl->notify (this->xl, GF_EVENT_POLLIN, this); + + return ret; +} + + +int +socket_connect_finish (transport_t *this) +{ + int ret = -1; + socket_private_t *priv = NULL; + int event = -1; + char notify_xlator = 0; + + priv = this->private; + + pthread_mutex_lock (&priv->lock); + { + if (priv->connected) + goto unlock; + + ret = __socket_connect_finish (priv->sock); + + if (ret == -1 && errno == EINPROGRESS) + ret = 1; + + if (ret == -1 && errno != EINPROGRESS) { + if (!priv->connect_finish_log) { + gf_log (this->xl->name, GF_LOG_ERROR, + "connection failed (%s)", + strerror (errno)); + priv->connect_finish_log = 1; + } + __socket_disconnect (this); + notify_xlator = 1; + event = GF_EVENT_POLLERR; + goto unlock; + } + + if (ret == 0) { + notify_xlator = 1; + + this->myinfo.sockaddr_len = + sizeof (this->myinfo.sockaddr); + + ret = getsockname (priv->sock, + SA (&this->myinfo.sockaddr), + &this->myinfo.sockaddr_len); + if (ret == -1) { + gf_log (this->xl->name, GF_LOG_ERROR, + "getsockname on (%d) failed (%s)", + priv->sock, strerror (errno)); + __socket_disconnect (this); + event = GF_EVENT_POLLERR; + goto unlock; + } + + priv->connected = 1; + priv->connect_finish_log = 0; + event = GF_EVENT_CHILD_UP; + get_transport_identifiers (this); + } + } +unlock: + pthread_mutex_unlock (&priv->lock); + + if (notify_xlator) + this->xl->notify (this->xl, event, this); + + return 0; +} + + +int +socket_event_handler (int fd, int idx, void *data, + int poll_in, int poll_out, int poll_err) +{ + transport_t *this = NULL; + socket_private_t *priv 
= NULL; + int ret = 0; + + this = data; + priv = this->private; + + pthread_mutex_lock (&priv->lock); + { + priv->idx = idx; + } + pthread_mutex_unlock (&priv->lock); + + if (!priv->connected) { + ret = socket_connect_finish (this); + } + + if (!ret && poll_out) { + ret = socket_event_poll_out (this); + } + + if (!ret && poll_in) { + ret = socket_event_poll_in (this); + } + + if (ret < 0 || poll_err) { + socket_event_poll_err (this); + transport_unref (this); + } + + return 0; +} + + +int +socket_server_event_handler (int fd, int idx, void *data, + int poll_in, int poll_out, int poll_err) +{ + transport_t *this = NULL; + socket_private_t *priv = NULL; + int ret = 0; + int new_sock = -1; + transport_t *new_trans = NULL; + struct sockaddr_storage new_sockaddr = {0, }; + socklen_t addrlen = sizeof (new_sockaddr); + socket_private_t *new_priv = NULL; + glusterfs_ctx_t *ctx = NULL; + + this = data; + priv = this->private; + ctx = this->xl->ctx; + + pthread_mutex_lock (&priv->lock); + { + priv->idx = idx; + + if (poll_in) { + new_sock = accept (priv->sock, SA (&new_sockaddr), + &addrlen); + + if (new_sock == -1) + goto unlock; + + if (!priv->bio) { + ret = __socket_nonblock (new_sock); + + if (ret == -1) { + gf_log (this->xl->name, GF_LOG_ERROR, + "NBIO on %d failed (%s)", + new_sock, strerror (errno)); + close (new_sock); + goto unlock; + } + } + + new_trans = CALLOC (1, sizeof (*new_trans)); + new_trans->xl = this->xl; + new_trans->fini = this->fini; + + memcpy (&new_trans->peerinfo.sockaddr, &new_sockaddr, + addrlen); + new_trans->peerinfo.sockaddr_len = addrlen; + + new_trans->myinfo.sockaddr_len = + sizeof (new_trans->myinfo.sockaddr); + + ret = getsockname (new_sock, + SA (&new_trans->myinfo.sockaddr), + &new_trans->myinfo.sockaddr_len); + if (ret == -1) { + gf_log (this->xl->name, GF_LOG_ERROR, + "getsockname on %d failed (%s)", + new_sock, strerror (errno)); + close (new_sock); + goto unlock; + } + + get_transport_identifiers (new_trans); + socket_init 
(new_trans); + new_trans->ops = this->ops; + new_trans->init = this->init; + new_trans->fini = this->fini; + + new_priv = new_trans->private; + + pthread_mutex_lock (&new_priv->lock); + { + new_priv->sock = new_sock; + new_priv->connected = 1; + + transport_ref (new_trans); + new_priv->idx = + event_register (ctx->event_pool, + new_sock, + socket_event_handler, + new_trans, 1, 0); + + if (new_priv->idx == -1) + ret = -1; + } + pthread_mutex_unlock (&new_priv->lock); + } + } +unlock: + pthread_mutex_unlock (&priv->lock); + + return ret; +} + + +int +socket_disconnect (transport_t *this) +{ + socket_private_t *priv = NULL; + int ret = -1; + + priv = this->private; + + pthread_mutex_lock (&priv->lock); + { + ret = __socket_disconnect (this); + } + pthread_mutex_unlock (&priv->lock); + + return ret; +} + + +int +socket_connect (transport_t *this) +{ + int ret = -1; + int sock = -1; + socket_private_t *priv = NULL; + struct sockaddr_storage sockaddr = {0, }; + socklen_t sockaddr_len = 0; + glusterfs_ctx_t *ctx = NULL; + + priv = this->private; + ctx = this->xl->ctx; + + if (!priv) { + gf_log (this->xl->name, GF_LOG_ERROR, + "connect() called on uninitialized transport"); + goto err; + } + + pthread_mutex_lock (&priv->lock); + { + sock = priv->sock; + } + pthread_mutex_unlock (&priv->lock); + + if (sock != -1) { + gf_log (this->xl->name, GF_LOG_DEBUG, + "connect () called on transport already connected"); + goto err; + } + + ret = socket_client_get_remote_sockaddr (this, SA (&sockaddr), + &sockaddr_len); + if (ret == -1) { + /* logged inside client_get_remote_sockaddr */ + goto err; + } + + pthread_mutex_lock (&priv->lock); + { + if (priv->sock != -1) { + gf_log (this->xl->name, GF_LOG_DEBUG, + "connect() -- already connected"); + goto unlock; + } + + memcpy (&this->peerinfo.sockaddr, &sockaddr, sockaddr_len); + this->peerinfo.sockaddr_len = sockaddr_len; + + priv->sock = socket (SA (&sockaddr)->sa_family, + SOCK_STREAM, 0); + + if (priv->sock == -1) { + gf_log 
(this->xl->name, GF_LOG_ERROR, + "socket creation failed (%s)", + strerror (errno)); + goto unlock; + } + + if (!priv->bio) { + ret = __socket_nonblock (priv->sock); + + if (ret == -1) { + gf_log (this->xl->name, GF_LOG_ERROR, + "NBIO on %d failed (%s)", + priv->sock, strerror (errno)); + close (priv->sock); + priv->sock = -1; + goto unlock; + } + } + + SA (&this->myinfo.sockaddr)->sa_family = + SA (&this->peerinfo.sockaddr)->sa_family; + + ret = client_bind (this, SA (&this->myinfo.sockaddr), + &this->myinfo.sockaddr_len, priv->sock); + if (ret == -1) { + gf_log (this->xl->name, GF_LOG_WARNING, + "client bind failed: %s", strerror (errno)); + close (priv->sock); + priv->sock = -1; + goto unlock; + } + + ret = connect (priv->sock, SA (&this->peerinfo.sockaddr), + this->peerinfo.sockaddr_len); + + if (ret == -1 && errno != EINPROGRESS) { + gf_log (this->xl->name, GF_LOG_ERROR, + "connection attempt failed (%s)", + strerror (errno)); + close (priv->sock); + priv->sock = -1; + goto unlock; + } + + priv->connected = 0; + + transport_ref (this); + + priv->idx = event_register (ctx->event_pool, priv->sock, + socket_event_handler, this, 1, 1); + if (priv->idx == -1) + ret = -1; + } +unlock: + pthread_mutex_unlock (&priv->lock); + +err: + return ret; +} + + +int +socket_listen (transport_t *this) +{ + socket_private_t * priv = NULL; + int ret = -1; + int sock = -1; + struct sockaddr_storage sockaddr; + socklen_t sockaddr_len; + peer_info_t *myinfo = NULL; + glusterfs_ctx_t *ctx = NULL; + + priv = this->private; + myinfo = &this->myinfo; + ctx = this->xl->ctx; + + pthread_mutex_lock (&priv->lock); + { + sock = priv->sock; + } + pthread_mutex_unlock (&priv->lock); + + if (sock != -1) { + gf_log (this->xl->name, GF_LOG_ERROR, + "alreading listening"); + return ret; + } + + ret = socket_server_get_local_sockaddr (this, SA (&sockaddr), + &sockaddr_len); + + if (ret == -1) { + return ret; + } + + pthread_mutex_lock (&priv->lock); + { + if (priv->sock != -1) { + gf_log 
(this->xl->name, GF_LOG_ERROR, + "already listening"); + goto unlock; + } + + memcpy (&myinfo->sockaddr, &sockaddr, sockaddr_len); + myinfo->sockaddr_len = sockaddr_len; + + priv->sock = socket (SA (&sockaddr)->sa_family, + SOCK_STREAM, 0); + + if (priv->sock == -1) { + gf_log (this->xl->name, GF_LOG_ERROR, + "socket creation failed (%s)", + strerror (errno)); + goto unlock; + } + + if (!priv->bio) { + ret = __socket_nonblock (priv->sock); + + if (ret == -1) { + gf_log (this->xl->name, GF_LOG_ERROR, + "NBIO on %d failed (%s)", + priv->sock, strerror (errno)); + close (priv->sock); + priv->sock = -1; + goto unlock; + } + } + + ret = __socket_server_bind (this); + + if (ret == -1) { + /* logged inside __socket_server_bind() */ + close (priv->sock); + priv->sock = -1; + goto unlock; + } + + ret = listen (priv->sock, 10); + + if (ret == -1) { + gf_log (this->xl->name, GF_LOG_ERROR, + "could not set socket %d to listen mode (%s)", + priv->sock, strerror (errno)); + close (priv->sock); + priv->sock = -1; + goto unlock; + } + + transport_ref (this); + + priv->idx = event_register (ctx->event_pool, priv->sock, + socket_server_event_handler, + this, 1, 0); + + if (priv->idx == -1) { + gf_log (this->xl->name, GF_LOG_ERROR, + "could not register socket %d with events", + priv->sock); + ret = -1; + close (priv->sock); + priv->sock = -1; + goto unlock; + } + } +unlock: + pthread_mutex_unlock (&priv->lock); + + return ret; +} + + +int +socket_receive (transport_t *this, char **hdr_p, size_t *hdrlen_p, + char **buf_p, size_t *buflen_p) +{ + socket_private_t *priv = NULL; + int ret = -1; + + priv = this->private; + + pthread_mutex_lock (&priv->lock); + { + if (priv->connected != 1) { + gf_log (this->xl->name, GF_LOG_ERROR, + "socket not connected to receive"); + goto unlock; + } + + if (!hdr_p || !hdrlen_p || !buf_p || !buflen_p) { + gf_log (this->xl->name, GF_LOG_ERROR, + "bad parameters %p %p %p %p", + hdr_p, hdrlen_p, buf_p, buflen_p); + goto unlock; + } + + if 
(priv->incoming.state == SOCKET_PROTO_STATE_COMPLETE) { + *hdr_p = priv->incoming.hdr_p; + *hdrlen_p = priv->incoming.hdrlen; + *buf_p = priv->incoming.buf_p; + *buflen_p = priv->incoming.buflen; + + memset (&priv->incoming, 0, sizeof (priv->incoming)); + priv->incoming.state = SOCKET_PROTO_STATE_NADA; + + ret = 0; + } + } +unlock: + pthread_mutex_unlock (&priv->lock); + + return ret; +} + + +/* TODO: implement per transfer limit */ +int +socket_submit (transport_t *this, char *buf, int len, + struct iovec *vector, int count, + dict_t *refs) +{ + socket_private_t *priv = NULL; + int ret = -1; + char need_poll_out = 0; + char need_append = 1; + struct ioq *entry = NULL; + glusterfs_ctx_t *ctx = NULL; + + priv = this->private; + ctx = this->xl->ctx; + + pthread_mutex_lock (&priv->lock); + { + if (priv->connected != 1) { + if (!priv->submit_log && !priv->connect_finish_log) { + gf_log (this->xl->name, GF_LOG_ERROR, + "not connected (priv->connected = %d)", + priv->connected); + priv->submit_log = 1; + } + goto unlock; + } + + priv->submit_log = 0; + entry = __socket_ioq_new (this, buf, len, vector, count, refs); + + if (list_empty (&priv->ioq)) { + ret = __socket_ioq_churn_entry (this, entry); + + if (ret == 0) + need_append = 0; + + if (ret > 0) + need_poll_out = 1; + } + + if (need_append) { + list_add_tail (&entry->list, &priv->ioq); + ret = 0; + } + + if (need_poll_out) { + /* first entry to wait. 
continue writing on POLLOUT */ + priv->idx = event_select_on (ctx->event_pool, + priv->sock, + priv->idx, -1, 1); + } + } +unlock: + pthread_mutex_unlock (&priv->lock); + + return ret; +} + + +struct transport_ops tops = { + .listen = socket_listen, + .connect = socket_connect, + .disconnect = socket_disconnect, + .submit = socket_submit, + .receive = socket_receive +}; + + +int +socket_init (transport_t *this) +{ + socket_private_t *priv = NULL; + + if (this->private) { + gf_log (this->xl->name, GF_LOG_ERROR, + "double init attempted"); + return -1; + } + + priv = CALLOC (1, sizeof (*priv)); + if (!priv) { + gf_log (this->xl->name, GF_LOG_ERROR, + "calloc (1, %"GF_PRI_SIZET") returned NULL", + sizeof (*priv)); + return -1; + } + + pthread_mutex_init (&priv->lock, NULL); + + priv->sock = -1; + priv->idx = -1; + priv->connected = -1; + + INIT_LIST_HEAD (&priv->ioq); + + if (dict_get (this->xl->options, "non-blocking-io")) { + gf_boolean_t tmp_bool = 0; + char *nb_connect = data_to_str (dict_get (this->xl->options, + "non-blocking-io")); + + if (gf_string2boolean (nb_connect, &tmp_bool) == -1) { + gf_log (this->xl->name, GF_LOG_ERROR, + "'non-blocking-io' takes only boolean options," + " not taking any action"); + tmp_bool = 1; + } + priv->bio = 0; + if (!tmp_bool) { + priv->bio = 1; + gf_log (this->xl->name, GF_LOG_WARNING, + "disabling non-blocking IO"); + } + } + + this->private = priv; + + return 0; +} + + +void +fini (transport_t *this) +{ + socket_private_t *priv = this->private; + gf_log (this->xl->name, GF_LOG_DEBUG, + "transport %p destroyed", this); + + pthread_mutex_destroy (&priv->lock); + FREE (priv); +} + + +int32_t +init (transport_t *this) +{ + int ret = -1; + + ret = socket_init (this); + + if (ret == -1) + { + gf_log (this->xl->name, GF_LOG_ERROR, "socket_init() failed"); + } + + return ret; +} + +struct volume_options options[] = { + { .key = {"remote-port", + "transport.remote-port", + "transport.socket.remote-port"}, + .type = GF_OPTION_TYPE_INT 
+ }, + { .key = {"transport.socket.listen-port", "listen-port"}, + .type = GF_OPTION_TYPE_INT + }, + { .key = {"transport.socket.bind-address", "bind-address" }, + .type = GF_OPTION_TYPE_ANY + }, + { .key = {"transport.socket.connect-path", "connect-path"}, + .type = GF_OPTION_TYPE_ANY + }, + { .key = {"transport.socket.bind-path", "bind-path"}, + .type = GF_OPTION_TYPE_ANY + }, + { .key = {"transport.socket.listen-path", "listen-path"}, + .type = GF_OPTION_TYPE_ANY + }, + { .key = { "transport.address-family", + "address-family" }, + .value = {"inet", "inet6", "inet/inet6", "inet6/inet", + "unix", "inet-sdp" }, + .type = GF_OPTION_TYPE_STR + }, + + { .key = {NULL} } +}; + diff --git a/transport/socket/src/socket.h b/transport/socket/src/socket.h new file mode 100644 index 000000000..070e69d08 --- /dev/null +++ b/transport/socket/src/socket.h @@ -0,0 +1,106 @@ +/* + Copyright (c) 2006, 2007, 2008 Z RESEARCH, Inc. <http://www.zresearch.com> + This file is part of GlusterFS. + + GlusterFS is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published + by the Free Software Foundation; either version 3 of the License, + or (at your option) any later version. + + GlusterFS is distributed in the hope that it will be useful, but + WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program. If not, see + <http://www.gnu.org/licenses/>. 
+*/ + +#ifndef _SOCKET_H +#define _SOCKET_H + + +#ifndef _CONFIG_H +#define _CONFIG_H +#include "config.h" +#endif + +#include "event.h" +#include "transport.h" +#include "logging.h" +#include "dict.h" +#include "mem-pool.h" + +#ifndef MAX_IOVEC +#define MAX_IOVEC 16 +#endif /* MAX_IOVEC */ + +#define GF_DEFAULT_SOCKET_LISTEN_PORT 6996 + +typedef enum { + SOCKET_PROTO_STATE_NADA = 0, + SOCKET_PROTO_STATE_HEADER_COMING, + SOCKET_PROTO_STATE_HEADER_CAME, + SOCKET_PROTO_STATE_DATA_COMING, + SOCKET_PROTO_STATE_DATA_CAME, + SOCKET_PROTO_STATE_COMPLETE, +} socket_proto_state_t; + +struct socket_header { + char colonO[3]; + uint32_t size1; + uint32_t size2; + char version; +} __attribute__((packed)); + + +struct ioq { + union { + struct list_head list; + struct { + struct ioq *next; + struct ioq *prev; + }; + }; + struct socket_header header; + struct iovec vector[MAX_IOVEC]; + int count; + struct iovec *pending_vector; + int pending_count; + char *buf; + dict_t *refs; +}; + + +typedef struct { + int32_t sock; + int32_t idx; + unsigned char connected; // -1 = not connected. 0 = in progress. 
1 = connected + char bio; + char connect_finish_log; + char submit_log; + union { + struct list_head ioq; + struct { + struct ioq *ioq_next; + struct ioq *ioq_prev; + }; + }; + struct { + int state; + struct socket_header header; + char *hdr_p; + size_t hdrlen; + char *buf_p; + size_t buflen; + struct iovec vector[2]; + int count; + struct iovec *pending_vector; + int pending_count; + } incoming; + pthread_mutex_t lock; +} socket_private_t; + + +#endif diff --git a/xlators/Makefile.am b/xlators/Makefile.am new file mode 100644 index 000000000..2abb52194 --- /dev/null +++ b/xlators/Makefile.am @@ -0,0 +1,3 @@ +SUBDIRS = cluster storage protocol performance debug features encryption mount + +CLEANFILES = diff --git a/xlators/bindings/Makefile.am b/xlators/bindings/Makefile.am new file mode 100644 index 000000000..f77665802 --- /dev/null +++ b/xlators/bindings/Makefile.am @@ -0,0 +1 @@ +SUBDIRS = $(BINDINGS_SUBDIRS) diff --git a/xlators/bindings/python/Makefile.am b/xlators/bindings/python/Makefile.am new file mode 100644 index 000000000..af437a64d --- /dev/null +++ b/xlators/bindings/python/Makefile.am @@ -0,0 +1 @@ +SUBDIRS = src diff --git a/xlators/bindings/python/src/Makefile.am b/xlators/bindings/python/src/Makefile.am new file mode 100644 index 000000000..c0b9141c6 --- /dev/null +++ b/xlators/bindings/python/src/Makefile.am @@ -0,0 +1,19 @@ + +xlator_PROGRAMS = python.so + +xlatordir = $(libdir)/glusterfs/$(PACKAGE_VERSION)/xlator/bindings + +python_PYTHON = gluster.py glustertypes.py glusterstack.py + +pythondir = $(xlatordir)/python + +python_so_SOURCES = python.c + +AM_CFLAGS = -fPIC -D_FILE_OFFSET_BITS=64 -D_GNU_SOURCE -Wall \ + -I$(top_srcdir)/libglusterfs/src -shared -nostartfiles \ + $(PYTHON_CPPLAGS) -DGLUSTER_PYTHON_PATH=\"$(pythondir)\" + +AM_LDFLAGS = $(PYTHON_LDFLAGS) + +CLEANFILES = + diff --git a/xlators/bindings/python/src/gluster.py b/xlators/bindings/python/src/gluster.py new file mode 100644 index 000000000..ee0eb1310 --- /dev/null +++ 
b/xlators/bindings/python/src/gluster.py @@ -0,0 +1,47 @@ +# Copyright (c) 2007 Chris AtLee <chris@atlee.ca> +# This file is part of GlusterFS. +# +# GlusterFS is free software; you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published +# by the Free Software Foundation; either version 3 of the License, +# or (at your option) any later version. +# +# GlusterFS is distributed in the hope that it will be useful, but +# WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +# General Public License for more details. +# +# You should have received a copy of the GNU General Public License +# along with this program. If not, see +# <http://www.gnu.org/licenses/>. +from ctypes import * +from glustertypes import * +from glusterstack import * +import sys +import inspect + +libglusterfs = CDLL("libglusterfs.so") +_gf_log = libglusterfs._gf_log +_gf_log.restype = c_int32 +_gf_log.argtypes = [c_char_p, c_char_p, c_char_p, c_int32, c_int, c_char_p] + +gf_log_loglevel = c_int.in_dll(libglusterfs, "gf_log_loglevel") + +GF_LOG_NONE = 0 +GF_LOG_CRITICAL = 1 +GF_LOG_ERROR = 2 +GF_LOG_WARNING = 3 +GF_LOG_DEBUG = 4 + +def gf_log(module, level, fmt, *params): + if level <= gf_log_loglevel: + frame = sys._getframe(1) + _gf_log(module, frame.f_code.co_filename, frame.f_code.co_name, + frame.f_lineno, level, fmt, *params) + +class ComplexTranslator(object): + def __init__(self, xlator): + self.xlator = xlator_t.from_address(xlator) + + def __getattr__(self, item): + return getattr(self.xlator, item) diff --git a/xlators/bindings/python/src/glusterstack.py b/xlators/bindings/python/src/glusterstack.py new file mode 100644 index 000000000..ba24c8165 --- /dev/null +++ b/xlators/bindings/python/src/glusterstack.py @@ -0,0 +1,55 @@ +# Copyright (c) 2007 Chris AtLee <chris@atlee.ca> +# This file is part of GlusterFS. 
+# +# GlusterFS is free software; you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published +# by the Free Software Foundation; either version 3 of the License, +# or (at your option) any later version. +# +# GlusterFS is distributed in the hope that it will be useful, but +# WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +# General Public License for more details. +# +# You should have received a copy of the GNU General Public License +# along with this program. If not, see +# <http://www.gnu.org/licenses/>. +from ctypes import * +from glustertypes import * + +libc = CDLL("libc.so.6") +calloc = libc.calloc +calloc.argtypes = [c_int, c_int] +calloc.restype = c_void_p + +# TODO: Can these be done in C somehow? +def stack_wind(frame, rfn, obj, fn, *params): + """Frame is a frame object""" + _new = cast(calloc(1, sizeof(call_frame_t)), POINTER(call_frame_t)) + _new[0].root = frame.root + _new[0].next = frame.root[0].frames.next + _new[0].prev = pointer(frame.root[0].frames) + if frame.root[0].frames.next: + frame.root[0].frames.next[0].prev = _new + frame.root[0].frames.next = _new + _new[0].this = obj + # TODO: Type checking like tmp_cbk? 
+ _new[0].ret = rfn + _new[0].parent = pointer(frame) + _new[0].cookie = cast(_new, c_void_p) + # TODO: Initialize lock + #_new.lock.init() + frame.ref_count += 1 + fn(_new, obj, *params) + +def stack_unwind(frame, *params): + """Frame is a frame object""" + fn = frame[0].ret + parent = frame[0].parent[0] + parent.ref_count -= 1 + + op_ret = params[0] + op_err = params[1] + params = params[2:] + fn(parent, call_frame_t.from_address(frame[0].cookie), parent.this, + op_ret, op_err, *params) diff --git a/xlators/bindings/python/src/glustertypes.py b/xlators/bindings/python/src/glustertypes.py new file mode 100644 index 000000000..e9069d07c --- /dev/null +++ b/xlators/bindings/python/src/glustertypes.py @@ -0,0 +1,167 @@ +# Copyright (c) 2007 Chris AtLee <chris@atlee.ca> +# This file is part of GlusterFS. +# +# GlusterFS is free software; you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published +# by the Free Software Foundation; either version 3 of the License, +# or (at your option) any later version. +# +# GlusterFS is distributed in the hope that it will be useful, but +# WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +# General Public License for more details. +# +# You should have received a copy of the GNU General Public License +# along with this program. If not, see +# <http://www.gnu.org/licenses/>. 
+from ctypes import * +import collections + +# +# Forward declaration of some gluster types +# +class call_frame_t(Structure): + pass + +class call_ctx_t(Structure): + pass + +class call_pool_t(Structure): + pass + +class xlator_t(Structure): + def _getFirstChild(self): + return self.children[0].xlator + firstChild = property(_getFirstChild) + +class xlator_list_t(Structure): + pass + +class xlator_fops(Structure): + pass + +class xlator_mops(Structure): + pass + +class glusterfs_ctx_t(Structure): + pass + +class list_head(Structure): + pass + +class dict_t(Structure): + pass + +class inode_table_t(Structure): + pass + +class fd_t(Structure): + pass + +class iovec(Structure): + _fields_ = [ + ("iov_base", c_void_p), + ("iov_len", c_size_t), + ] + + def __init__(self, s): + self.iov_base = cast(c_char_p(s), c_void_p) + self.iov_len = len(s) + + def getBytes(self): + return string_at(self.iov_base, self.iov_len) + +# This is a pthread_spinlock_t +# TODO: what happens to volatile-ness? +gf_lock_t = c_int + +uid_t = c_uint32 +gid_t = c_uint32 +pid_t = c_int32 + +off_t = c_int64 + +# +# Function pointer types +# +ret_fn_t = CFUNCTYPE(c_int32, POINTER(call_frame_t), POINTER(call_frame_t), + POINTER(xlator_t), c_int32, c_int32) + +fini_fn_t = CFUNCTYPE(None, POINTER(xlator_t)) +init_fn_t = CFUNCTYPE(c_int32, POINTER(xlator_t)) +event_notify_fn_t = CFUNCTYPE(c_int32, POINTER(xlator_t), c_int32, c_void_p) + +list_head._fields_ = [ + ("next", POINTER(list_head)), + ("prev", POINTER(list_head)), + ] + +call_frame_t._fields_ = [ + ("root", POINTER(call_ctx_t)), + ("parent", POINTER(call_frame_t)), + ("next", POINTER(call_frame_t)), + ("prev", POINTER(call_frame_t)), + ("local", c_void_p), + ("this", POINTER(xlator_t)), + ("ret", ret_fn_t), + ("ref_count", c_int32), + ("lock", gf_lock_t), + ("cookie", c_void_p), + ("op", c_int32), + ("type", c_int8), + ] + +call_ctx_t._fields_ = [ + ("all_frames", list_head), + ("trans", c_void_p), + ("pool", call_pool_t), + ("unique", 
c_uint64), + ("state", c_void_p), + ("uid", uid_t), + ("gid", gid_t), + ("pid", pid_t), + ("frames", call_frame_t), + ("req_refs", POINTER(dict_t)), + ("rsp_refs", POINTER(dict_t)), + ] + +xlator_t._fields_ = [ + ("name", c_char_p), + ("type", c_char_p), + ("next", POINTER(xlator_t)), + ("prev", POINTER(xlator_t)), + ("parent", POINTER(xlator_t)), + ("children", POINTER(xlator_list_t)), + ("fops", POINTER(xlator_fops)), + ("mops", POINTER(xlator_mops)), + ("fini", fini_fn_t), + ("init", init_fn_t), + ("notify", event_notify_fn_t), + ("options", POINTER(dict_t)), + ("ctx", POINTER(glusterfs_ctx_t)), + ("itable", POINTER(inode_table_t)), + ("ready", c_char), + ("private", c_void_p), + ] + +xlator_list_t._fields_ = [ + ("xlator", POINTER(xlator_t)), + ("next", POINTER(xlator_list_t)), + ] + +fop_functions = collections.defaultdict(lambda: c_void_p) +fop_function_names = ['lookup', 'forget', 'stat', 'fstat', 'chmod', 'fchmod', + 'chown', 'fchown', 'truncate', 'ftruncate', 'utimens', 'access', + 'readlink', 'mknod', 'mkdir', 'unlink', 'rmdir', 'symlink', + 'rename', 'link', 'create', 'open', 'readv', 'writev', 'flush', + 'close', 'fsync', 'opendir', 'readdir', 'closedir', 'fsyncdir', + 'statfs', 'setxattr', 'getxattr', 'removexattr', 'lk', 'writedir', + # TODO: Call backs? + ] + +fop_writev_t = CFUNCTYPE(c_int32, POINTER(call_frame_t), POINTER(xlator_t), + POINTER(fd_t), POINTER(iovec), c_int32, + off_t) + +fop_functions['writev'] = fop_writev_t +xlator_fops._fields_ = [(f, fop_functions[f]) for f in fop_function_names] diff --git a/xlators/bindings/python/src/python.c b/xlators/bindings/python/src/python.c new file mode 100644 index 000000000..739ef7329 --- /dev/null +++ b/xlators/bindings/python/src/python.c @@ -0,0 +1,235 @@ +/* + Copyright (c) 2007 Chris AtLee <chris@atlee.ca> + This file is part of GlusterFS. 
+ + GlusterFS is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published + by the Free Software Foundation; either version 3 of the License, + or (at your option) any later version. + + GlusterFS is distributed in the hope that it will be useful, but + WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program. If not, see + <http://www.gnu.org/licenses/>. +*/ + +#include <Python.h> + +#ifndef _CONFIG_H +#define _CONFIG_H +#include "config.h" +#endif + +#include "glusterfs.h" +#include "xlator.h" +#include "logging.h" +#include "defaults.h" + +typedef struct +{ + char *scriptname; + PyObject *pXlator; + PyObject *pScriptModule; + PyObject *pGlusterModule; + PyThreadState *pInterp; + + PyObject *pFrameType, *pVectorType, *pFdType; +} python_private_t; + +int32_t +python_writev (call_frame_t *frame, + xlator_t *this, + fd_t *fd, + struct iovec *vector, + int32_t count, + off_t offset) +{ + python_private_t *priv = (python_private_t *)this->private; + gf_log("python", GF_LOG_DEBUG, "In writev"); + if (PyObject_HasAttrString(priv->pXlator, "writev")) + { + + PyObject *retval = PyObject_CallMethod(priv->pXlator, "writev", + "O O O i l", + PyObject_CallMethod(priv->pFrameType, "from_address", "O&", PyLong_FromVoidPtr, frame), + PyObject_CallMethod(priv->pFdType, "from_address", "O&", PyLong_FromVoidPtr, fd), + PyObject_CallMethod(priv->pVectorType, "from_address", "O&", PyLong_FromVoidPtr, vector), + count, + offset); + if (PyErr_Occurred()) + { + PyErr_Print(); + } + Py_XDECREF(retval); + } + else + { + return default_writev(frame, this, fd, vector, count, offset); + } + return 0; +} + +struct xlator_fops fops = { + .writev = python_writev +}; + +struct xlator_mops mops = { +}; + +static PyObject * 
+AnonModule_FromFile (const char* fname) +{ + // Get the builtins + PyThreadState* pThread = PyThreadState_Get(); + PyObject *pBuiltins = pThread->interp->builtins; + + if (PyErr_Occurred()) + { + PyErr_Print(); + return NULL; + } + + // Create a new dictionary for running code in + PyObject *pModuleDict = PyDict_New(); + PyDict_SetItemString(pModuleDict, "__builtins__", pBuiltins); + Py_INCREF(pBuiltins); + + // Run the file in the new context + FILE* fp = fopen(fname, "r"); + PyRun_File(fp, fname, Py_file_input, pModuleDict, pModuleDict); + fclose(fp); + if (PyErr_Occurred()) + { + PyErr_Print(); + Py_DECREF(pModuleDict); + Py_DECREF(pBuiltins); + return NULL; + } + + // Create an object to hold the new context + PyRun_String("class ModuleWrapper(object):\n\tpass\n", Py_single_input, pModuleDict, pModuleDict); + if (PyErr_Occurred()) + { + PyErr_Print(); + Py_DECREF(pModuleDict); + Py_DECREF(pBuiltins); + return NULL; + } + PyObject *pModule = PyRun_String("ModuleWrapper()", Py_eval_input, pModuleDict, pModuleDict); + if (PyErr_Occurred()) + { + PyErr_Print(); + Py_DECREF(pModuleDict); + Py_DECREF(pBuiltins); + Py_XDECREF(pModule); + return NULL; + } + + // Set the new context's dictionary to the one we used to run the code + // inside + PyObject_SetAttrString(pModule, "__dict__", pModuleDict); + if (PyErr_Occurred()) + { + PyErr_Print(); + Py_DECREF(pModuleDict); + Py_DECREF(pBuiltins); + Py_DECREF(pModule); + return NULL; + } + + return pModule; +} + +int32_t +init (xlator_t *this) +{ + // This is ok to call more than once per process + Py_InitializeEx(0); + + if (!this->children) { + gf_log ("python", GF_LOG_ERROR, + "FATAL: python should have exactly one child"); + return -1; + } + + python_private_t *priv = CALLOC (sizeof (python_private_t), 1); + ERR_ABORT (priv); + + data_t *scriptname = dict_get (this->options, "scriptname"); + if (scriptname) { + priv->scriptname = data_to_str(scriptname); + } else { + gf_log("python", GF_LOG_ERROR, + "FATAL: python 
requires the scriptname parameter"); + return -1; + } + + priv->pInterp = Py_NewInterpreter(); + + // Adjust python's path + PyObject *syspath = PySys_GetObject("path"); + PyObject *path = PyString_FromString(GLUSTER_PYTHON_PATH); + PyList_Append(syspath, path); + Py_DECREF(path); + + gf_log("python", GF_LOG_DEBUG, + "Loading gluster module"); + + priv->pGlusterModule = PyImport_ImportModule("gluster"); + if (PyErr_Occurred()) + { + PyErr_Print(); + return -1; + } + + priv->pFrameType = PyObject_GetAttrString(priv->pGlusterModule, "call_frame_t"); + priv->pFdType = PyObject_GetAttrString(priv->pGlusterModule, "fd_t"); + priv->pVectorType = PyObject_GetAttrString(priv->pGlusterModule, "iovec"); + + gf_log("python", GF_LOG_DEBUG, "Loading script...%s", priv->scriptname); + + priv->pScriptModule = AnonModule_FromFile(priv->scriptname); + if (!priv->pScriptModule || PyErr_Occurred()) + { + gf_log("python", GF_LOG_ERROR, "Error loading %s", priv->scriptname); + PyErr_Print(); + return -1; + } + + if (!PyObject_HasAttrString(priv->pScriptModule, "xlator")) + { + gf_log("python", GF_LOG_ERROR, "%s does not have a xlator attribute", priv->scriptname); + return -1; + } + gf_log("python", GF_LOG_DEBUG, "Instantiating translator"); + priv->pXlator = PyObject_CallMethod(priv->pScriptModule, "xlator", "O&", + PyLong_FromVoidPtr, this); + if (PyErr_Occurred() || !priv->pXlator) + { + PyErr_Print(); + return -1; + } + + this->private = priv; + + gf_log ("python", GF_LOG_DEBUG, "python xlator loaded"); + return 0; +} + +void +fini (xlator_t *this) +{ + python_private_t *priv = (python_private_t*)(this->private); + Py_DECREF(priv->pXlator); + Py_DECREF(priv->pScriptModule); + Py_DECREF(priv->pGlusterModule); + Py_DECREF(priv->pFrameType); + Py_DECREF(priv->pFdType); + Py_DECREF(priv->pVectorType); + Py_EndInterpreter(priv->pInterp); + return; +} diff --git a/xlators/bindings/python/src/testxlator.py b/xlators/bindings/python/src/testxlator.py new file mode 100644 index 
000000000..507455c85 --- /dev/null +++ b/xlators/bindings/python/src/testxlator.py @@ -0,0 +1,56 @@ +# Copyright (c) 2007 Chris AtLee <chris@atlee.ca> +# This file is part of GlusterFS. +# +# GlusterFS is free software; you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published +# by the Free Software Foundation; either version 3 of the License, +# or (at your option) any later version. +# +# GlusterFS is distributed in the hope that it will be useful, but +# WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +# General Public License for more details. +# +# You should have received a copy of the GNU General Public License +# along with this program. If not, see +# <http://www.gnu.org/licenses/>. + +""" +This is a test translator written in python. + +Important things to note: + This file must be import-able from glusterfsd. This probably means + setting PYTHONPATH to where this file is located. + + This file must have a top-level xlator class object that will be + used to instantiate individual translators. 
+""" +from gluster import * + +class MyXlator(ComplexTranslator): + name = "MyXlator" + def writev_cbk(self, frame, cookie, op_ret, op_errno, buf): + stack_unwind(frame, op_ret, op_errno, buf) + return 0 + + def writev(self, frame, fd, vector, count, offset): + gf_log(self.name, GF_LOG_WARNING, "writev %i bytes", vector.iov_len) + # TODO: Use cookie to pass this to writev_cbk + old_count = vector.iov_len + + data = vector.getBytes().encode("zlib") + + vector = iovec(data) + gf_log(self.name, GF_LOG_WARNING, "writev %i bytes", vector.iov_len) + + @ret_fn_t + def rfn(frame, prev, this, op_ret, op_errno, *params): + if len(params) == 0: + params = [0] + return self.writev_cbk(frame, prev, old_count, op_errno, *params) + + stack_wind(frame, rfn, self.firstChild, + self.firstChild[0].fops[0].writev, fd, vector, count, offset) + return 0 + +xlator = MyXlator diff --git a/xlators/cluster/Makefile.am b/xlators/cluster/Makefile.am new file mode 100644 index 000000000..a6ddb3564 --- /dev/null +++ b/xlators/cluster/Makefile.am @@ -0,0 +1,3 @@ +SUBDIRS = unify stripe afr dht ha map + +CLEANFILES = diff --git a/xlators/cluster/afr/Makefile.am b/xlators/cluster/afr/Makefile.am new file mode 100644 index 000000000..d471a3f92 --- /dev/null +++ b/xlators/cluster/afr/Makefile.am @@ -0,0 +1,3 @@ +SUBDIRS = src + +CLEANFILES = diff --git a/xlators/cluster/afr/src/Makefile.am b/xlators/cluster/afr/src/Makefile.am new file mode 100644 index 000000000..1bde9e5ba --- /dev/null +++ b/xlators/cluster/afr/src/Makefile.am @@ -0,0 +1,20 @@ +xlator_LTLIBRARIES = afr.la +xlatordir = $(libdir)/glusterfs/$(PACKAGE_VERSION)/xlator/cluster + +afr_la_LDFLAGS = -module -avoidversion + +afr_la_SOURCES = afr.c afr-dir-read.c afr-dir-write.c afr-inode-read.c afr-inode-write.c afr-transaction.c afr-self-heal-data.c afr-self-heal-common.c afr-self-heal-metadata.c afr-self-heal-entry.c +afr_la_LIBADD = $(top_builddir)/libglusterfs/src/libglusterfs.la + +noinst_HEADERS = afr.h afr-transaction.h 
afr-inode-write.h afr-inode-read.h afr-dir-read.h afr-dir-write.h afr-self-heal.h afr-self-heal-common.h + +AM_CFLAGS = -fPIC -D_FILE_OFFSET_BITS=64 -D_GNU_SOURCE -Wall -D$(GF_HOST_OS) \ + -I$(top_srcdir)/libglusterfs/src -shared -nostartfiles $(GF_CFLAGS) + +CLEANFILES = + +uninstall-local: + rm -f $(DESTDIR)$(xlatordir)/replicate.so + +install-data-hook: + ln -sf afr.so $(DESTDIR)$(xlatordir)/replicate.so \ No newline at end of file diff --git a/xlators/cluster/afr/src/afr-dir-read.c b/xlators/cluster/afr/src/afr-dir-read.c new file mode 100644 index 000000000..0c65ca852 --- /dev/null +++ b/xlators/cluster/afr/src/afr-dir-read.c @@ -0,0 +1,345 @@ +/* + Copyright (c) 2007, 2008 Z RESEARCH, Inc. <http://www.zresearch.com> + This file is part of GlusterFS. + + GlusterFS is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published + by the Free Software Foundation; either version 3 of the License, + or (at your option) any later version. + + GlusterFS is distributed in the hope that it will be useful, but + WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program. If not, see + <http://www.gnu.org/licenses/>. 
+*/ + + +#include <libgen.h> +#include <unistd.h> +#include <fnmatch.h> +#include <sys/time.h> +#include <stdlib.h> +#include <signal.h> + +#ifndef _CONFIG_H +#define _CONFIG_H +#include "config.h" +#endif + +#include "glusterfs.h" +#include "dict.h" +#include "xlator.h" +#include "hashfn.h" +#include "logging.h" +#include "stack.h" +#include "list.h" +#include "call-stub.h" +#include "defaults.h" +#include "common-utils.h" +#include "compat-errno.h" +#include "compat.h" + +#include "afr.h" + + +int32_t +afr_opendir_cbk (call_frame_t *frame, void *cookie, + xlator_t *this, int32_t op_ret, int32_t op_errno, + fd_t *fd) +{ + afr_local_t * local = NULL; + + int call_count = -1; + + LOCK (&frame->lock); + { + local = frame->local; + + if (op_ret == 0) + local->op_ret = 0; + + local->op_errno = op_errno; + } + UNLOCK (&frame->lock); + + call_count = afr_frame_return (frame); + + if (call_count == 0) { + AFR_STACK_UNWIND (frame, local->op_ret, + local->op_errno, local->fd); + } + + return 0; +} + + +int32_t +afr_opendir (call_frame_t *frame, xlator_t *this, + loc_t *loc, fd_t *fd) +{ + afr_private_t * priv = NULL; + afr_local_t * local = NULL; + + int child_count = 0; + int i = 0; + + int ret = -1; + int call_count = -1; + + int32_t op_ret = -1; + int32_t op_errno = 0; + + VALIDATE_OR_GOTO (frame, out); + VALIDATE_OR_GOTO (this, out); + VALIDATE_OR_GOTO (this->private, out); + + priv = this->private; + + child_count = priv->child_count; + + ALLOC_OR_GOTO (local, afr_local_t, out); + ret = AFR_LOCAL_INIT (local, priv); + if (ret < 0) { + op_errno = -ret; + goto out; + } + + frame->local = local; + local->fd = fd_ref (fd); + + call_count = local->call_count; + + for (i = 0; i < child_count; i++) { + if (local->child_up[i]) { + STACK_WIND (frame, afr_opendir_cbk, + priv->children[i], + priv->children[i]->fops->opendir, + loc, fd); + + if (!--call_count) + break; + } + } + + op_ret = 0; +out: + if (op_ret == -1) { + AFR_STACK_UNWIND (frame, op_ret, op_errno, fd); + } + + 
return 0; +} + + +/** + * Common algorithm for directory read calls: + * + * - Try the fop on the first child that is up + * - if we have failed due to ENOTCONN: + * try the next child + * + * Applicable to: readdir + */ + +int32_t +afr_readdir_cbk (call_frame_t *frame, void *cookie, + xlator_t *this, int32_t op_ret, int32_t op_errno, + gf_dirent_t *buf) +{ + afr_private_t * priv = NULL; + afr_local_t * local = NULL; + xlator_t ** children = NULL; + + int unwind = 1; + int last_tried = -1; + int this_try = -1; + + priv = this->private; + children = priv->children; + + local = frame->local; + + if (op_ret == -1) { + last_tried = local->cont.readdir.last_tried; + + if (all_tried (last_tried, priv->child_count)) { + goto out; + } + + this_try = ++local->cont.readdir.last_tried; + unwind = 0; + + STACK_WIND (frame, afr_readdir_cbk, + children[this_try], + children[this_try]->fops->readdir, + local->fd, local->cont.readdir.size, + local->cont.readdir.offset); + } + +out: + if (unwind) { + AFR_STACK_UNWIND (frame, op_ret, op_errno, buf); + } + + return 0; +} + + +int32_t +afr_readdir (call_frame_t *frame, xlator_t *this, + fd_t *fd, size_t size, off_t offset) +{ + afr_private_t * priv = NULL; + xlator_t ** children = NULL; + int call_child = 0; + afr_local_t *local = NULL; + + int ret = -1; + + int32_t op_ret = -1; + int32_t op_errno = 0; + + VALIDATE_OR_GOTO (frame, out); + VALIDATE_OR_GOTO (this, out); + VALIDATE_OR_GOTO (this->private, out); + + priv = this->private; + children = priv->children; + + ALLOC_OR_GOTO (local, afr_local_t, out); + ret = AFR_LOCAL_INIT (local, priv); + if (ret < 0) { + op_errno = -ret; + goto out; + } + + frame->local = local; + + call_child = afr_first_up_child (priv); + if (call_child == -1) { + op_errno = ENOTCONN; + gf_log (this->name, GF_LOG_ERROR, + "no child is up :("); + goto out; + } + + local->cont.readdir.last_tried = call_child; + + local->fd = fd_ref (fd); + local->cont.readdir.size = size; + local->cont.readdir.offset = offset; 
+ + STACK_WIND (frame, afr_readdir_cbk, + children[call_child], children[call_child]->fops->readdir, + fd, size, offset); + + op_ret = 0; +out: + if (op_ret == -1) { + AFR_STACK_UNWIND (frame, op_ret, op_errno, NULL); + } + return 0; +} + + +int32_t +afr_getdents_cbk (call_frame_t *frame, void *cookie, + xlator_t *this, int32_t op_ret, int32_t op_errno, + dir_entry_t *entry, int32_t count) +{ + afr_private_t * priv = NULL; + afr_local_t * local = NULL; + xlator_t ** children = NULL; + + int unwind = 1; + int last_tried = -1; + int this_try = -1; + + priv = this->private; + children = priv->children; + + local = frame->local; + + if (op_ret == -1) { + last_tried = local->cont.getdents.last_tried; + + if (all_tried (last_tried, priv->child_count)) { + goto out; + } + + this_try = ++local->cont.getdents.last_tried; + unwind = 0; + + STACK_WIND (frame, afr_getdents_cbk, + children[this_try], + children[this_try]->fops->getdents, + local->fd, local->cont.getdents.size, + local->cont.getdents.offset, local->cont.getdents.flag); + } + +out: + if (unwind) { + AFR_STACK_UNWIND (frame, op_ret, op_errno, entry, count); + } + + return 0; +} + + +int32_t +afr_getdents (call_frame_t *frame, xlator_t *this, + fd_t *fd, size_t size, off_t offset, int32_t flag) +{ + afr_private_t * priv = NULL; + xlator_t ** children = NULL; + int call_child = 0; + afr_local_t *local = NULL; + + int32_t op_ret = -1; + int32_t op_errno = 0; + + VALIDATE_OR_GOTO (frame, out); + VALIDATE_OR_GOTO (this, out); + VALIDATE_OR_GOTO (this->private, out); + + priv = this->private; + children = priv->children; + + ALLOC_OR_GOTO (local, afr_local_t, out); + + call_child = afr_first_up_child (priv); + if (call_child == -1) { + op_errno = ENOTCONN; + gf_log (this->name, GF_LOG_ERROR, + "no child is up :("); + goto out; + } + + local->cont.getdents.last_tried = call_child; + + local->fd = fd_ref (fd); + + local->cont.getdents.size = size; + local->cont.getdents.offset = offset; + local->cont.getdents.flag = flag; 
+ + frame->local = local; + + STACK_WIND (frame, afr_getdents_cbk, + children[call_child], children[call_child]->fops->getdents, + fd, size, offset, flag); + + op_ret = 0; +out: + if (op_ret == -1) { + AFR_STACK_UNWIND (frame, op_ret, op_errno, NULL); + } + + return 0; +} + + diff --git a/xlators/cluster/afr/src/afr-dir-read.h b/xlators/cluster/afr/src/afr-dir-read.h new file mode 100644 index 000000000..172ec3c90 --- /dev/null +++ b/xlators/cluster/afr/src/afr-dir-read.h @@ -0,0 +1,47 @@ +/* + Copyright (c) 2007, 2008 Z RESEARCH, Inc. <http://www.zresearch.com> + This file is part of GlusterFS. + + GlusterFS is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published + by the Free Software Foundation; either version 3 of the License, + or (at your option) any later version. + + GlusterFS is distributed in the hope that it will be useful, but + WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program. If not, see + <http://www.gnu.org/licenses/>. 
+*/ + +#ifndef __DIR_READ_H__ +#define __DIR_READ_H__ + + +int32_t +afr_opendir (call_frame_t *frame, xlator_t *this, + loc_t *loc, fd_t *fd); + +int32_t +afr_closedir (call_frame_t *frame, xlator_t *this, + fd_t *fd); + +int32_t +afr_readdir (call_frame_t *frame, xlator_t *this, + fd_t *fd, size_t size, off_t offset); + + +int32_t +afr_getdents (call_frame_t *frame, xlator_t *this, + fd_t *fd, size_t size, off_t offset, int32_t flag); + + +int32_t +afr_checksum (call_frame_t *frame, xlator_t *this, + loc_t *loc, int32_t flags); + + +#endif /* __DIR_READ_H__ */ diff --git a/xlators/cluster/afr/src/afr-dir-write.c b/xlators/cluster/afr/src/afr-dir-write.c new file mode 100644 index 000000000..87a6e09b5 --- /dev/null +++ b/xlators/cluster/afr/src/afr-dir-write.c @@ -0,0 +1,1786 @@ +/* + Copyright (c) 2007, 2008 Z RESEARCH, Inc. <http://www.zresearch.com> + This file is part of GlusterFS. + + GlusterFS is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published + by the Free Software Foundation; either version 3 of the License, + or (at your option) any later version. + + GlusterFS is distributed in the hope that it will be useful, but + WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program. If not, see + <http://www.gnu.org/licenses/>. 
+*/ + + +#include <libgen.h> +#include <unistd.h> +#include <fnmatch.h> +#include <sys/time.h> +#include <stdlib.h> +#include <signal.h> + +#ifndef _CONFIG_H +#define _CONFIG_H +#include "config.h" +#endif + +#include "glusterfs.h" +#include "afr.h" +#include "dict.h" +#include "xlator.h" +#include "hashfn.h" +#include "logging.h" +#include "stack.h" +#include "list.h" +#include "call-stub.h" +#include "defaults.h" +#include "common-utils.h" +#include "compat-errno.h" +#include "compat.h" + +#include "afr.h" +#include "afr-transaction.h" + + +void +afr_build_parent_loc (loc_t *parent, loc_t *child) +{ + char *tmp = NULL; + + if (!child->parent) { + loc_copy (parent, child); + return; + } + + tmp = strdup (child->path); + parent->path = strdup (dirname (tmp)); + FREE (tmp); + + parent->name = strrchr (parent->path, '/'); + if (parent->name) + parent->name++; + + parent->inode = inode_ref (child->parent); + parent->parent = inode_parent (parent->inode, 0, NULL); + parent->ino = parent->inode->ino; +} + + +/* {{{ create */ + +int +afr_create_unwind (call_frame_t *frame, xlator_t *this) +{ + call_frame_t *main_frame = NULL; + afr_local_t *local = NULL; + + local = frame->local; + + LOCK (&frame->lock); + { + if (local->transaction.main_frame) { + main_frame = local->transaction.main_frame; + } + local->transaction.main_frame = NULL; + } + UNLOCK (&frame->lock); + + if (main_frame) + AFR_STACK_UNWIND (main_frame, local->op_ret, local->op_errno, + local->cont.create.fd, + local->cont.create.inode, + &local->cont.create.buf); + return 0; +} + + +int +afr_create_wind_cbk (call_frame_t *frame, void *cookie, xlator_t *this, + int32_t op_ret, int32_t op_errno, + fd_t *fd, inode_t *inode, struct stat *buf) +{ + afr_local_t * local = NULL; + afr_private_t * priv = NULL; + + int call_count = -1; + int child_index = -1; + + local = frame->local; + priv = this->private; + + child_index = (long) cookie; + + LOCK (&frame->lock); + { + if (child_went_down (op_ret, op_errno)) + 
afr_transaction_child_died (frame, this, child_index); + + if (op_ret != -1) { + local->op_ret = op_ret; + + if ((local->success_count == 0) + || (child_index == priv->read_child)) { + local->cont.create.buf = *buf; + local->cont.create.buf.st_ino = + afr_itransform (buf->st_ino, + priv->child_count, + child_index); + } + local->cont.create.inode = inode; + + local->success_count++; + } + + local->op_errno = op_errno; + } + UNLOCK (&frame->lock); + + call_count = afr_frame_return (frame); + + if (call_count == 0) { + local->transaction.unwind (frame, this); + + local->transaction.resume (frame, this); + } + + return 0; +} + + +int +afr_create_wind (call_frame_t *frame, xlator_t *this) +{ + afr_local_t *local = NULL; + afr_private_t *priv = NULL; + + int call_count = -1; + int i = 0; + + local = frame->local; + priv = this->private; + + call_count = afr_up_children_count (priv->child_count, local->child_up); + + if (call_count == 0) { + local->transaction.resume (frame, this); + return 0; + } + + local->call_count = call_count; + + for (i = 0; i < priv->child_count; i++) { + if (local->child_up[i]) { + STACK_WIND_COOKIE (frame, afr_create_wind_cbk, + (void *) (long) i, + priv->children[i], + priv->children[i]->fops->create, + &local->loc, + local->cont.create.flags, + local->cont.create.mode, + local->cont.create.fd); + if (!--call_count) + break; + } + } + + return 0; +} + + +int +afr_create_done (call_frame_t *frame, xlator_t *this) +{ + afr_local_t * local = NULL; + + local = frame->local; + + local->transaction.unwind (frame, this); + + AFR_STACK_DESTROY (frame); + + return 0; +} + + +int +afr_create (call_frame_t *frame, xlator_t *this, + loc_t *loc, int32_t flags, mode_t mode, fd_t *fd) +{ + afr_private_t * priv = NULL; + afr_local_t * local = NULL; + call_frame_t * transaction_frame = NULL; + + int ret = -1; + + int op_ret = -1; + int op_errno = 0; + + VALIDATE_OR_GOTO (frame, out); + VALIDATE_OR_GOTO (this, out); + VALIDATE_OR_GOTO (this->private, out); + + 
priv = this->private; + + transaction_frame = copy_frame (frame); + if (!transaction_frame) { + gf_log (this->name, GF_LOG_ERROR, + "out of memory :("); + goto out; + } + + ALLOC_OR_GOTO (local, afr_local_t, out); + + ret = AFR_LOCAL_INIT (local, priv); + if (ret < 0) { + op_errno = -ret; + goto out; + } + + transaction_frame->local = local; + + loc_copy (&local->loc, loc); + + local->cont.create.flags = flags; + local->cont.create.mode = mode; + local->cont.create.fd = fd_ref (fd); + + local->transaction.fop = afr_create_wind; + local->transaction.done = afr_create_done; + local->transaction.unwind = afr_create_unwind; + + afr_build_parent_loc (&local->transaction.parent_loc, loc); + + local->transaction.main_frame = frame; + local->transaction.basename = AFR_BASENAME (loc->path); + local->transaction.pending = AFR_ENTRY_PENDING; + + afr_transaction (transaction_frame, this, AFR_ENTRY_TRANSACTION); + + op_ret = 0; +out: + if (op_ret == -1) { + if (transaction_frame) + AFR_STACK_DESTROY (transaction_frame); + AFR_STACK_UNWIND (frame, op_ret, op_errno, NULL); + } + + return 0; +} + +/* }}} */ + +/* {{{ mknod */ + +int +afr_mknod_unwind (call_frame_t *frame, xlator_t *this) +{ + call_frame_t *main_frame = NULL; + afr_local_t *local = NULL; + + local = frame->local; + + LOCK (&frame->lock); + { + if (local->transaction.main_frame) { + main_frame = local->transaction.main_frame; + } + local->transaction.main_frame = NULL; + } + UNLOCK (&frame->lock); + + if (main_frame) + AFR_STACK_UNWIND (main_frame, local->op_ret, local->op_errno, + local->cont.mknod.inode, + &local->cont.mknod.buf); + return 0; +} + + +int +afr_mknod_wind_cbk (call_frame_t *frame, void *cookie, xlator_t *this, + int32_t op_ret, int32_t op_errno, + inode_t *inode, struct stat *buf) +{ + afr_local_t * local = NULL; + afr_private_t * priv = NULL; + + int call_count = -1; + int child_index = -1; + + local = frame->local; + priv = this->private; + + child_index = (long) cookie; + + LOCK (&frame->lock); + 
{ + if (child_went_down (op_ret, op_errno)) + afr_transaction_child_died (frame, this, child_index); + + if (op_ret != -1) { + local->op_ret = op_ret; + + if ((local->success_count == 0) + || (child_index == priv->read_child)) { + local->cont.mknod.buf = *buf; + local->cont.mknod.buf.st_ino = + afr_itransform (buf->st_ino, + priv->child_count, + child_index); + } + local->cont.mknod.inode = inode; + + local->success_count++; + } + + local->op_errno = op_errno; + } + UNLOCK (&frame->lock); + + call_count = afr_frame_return (frame); + + if (call_count == 0) { + local->transaction.unwind (frame, this); + + local->transaction.resume (frame, this); + } + + return 0; +} + + +int32_t +afr_mknod_wind (call_frame_t *frame, xlator_t *this) +{ + afr_local_t *local = NULL; + afr_private_t *priv = NULL; + + int call_count = -1; + int i = 0; + + local = frame->local; + priv = this->private; + + call_count = afr_up_children_count (priv->child_count, local->child_up); + + if (call_count == 0) { + local->transaction.resume (frame, this); + return 0; + } + + local->call_count = call_count; + + for (i = 0; i < priv->child_count; i++) { + if (local->child_up[i]) { + STACK_WIND_COOKIE (frame, afr_mknod_wind_cbk, (void *) (long) i, + priv->children[i], + priv->children[i]->fops->mknod, + &local->loc, local->cont.mknod.mode, + local->cont.mknod.dev); + if (!--call_count) + break; + } + } + + return 0; +} + + +int +afr_mknod_done (call_frame_t *frame, xlator_t *this) +{ + afr_local_t * local = NULL; + + local = frame->local; + + local->transaction.unwind (frame, this); + AFR_STACK_DESTROY (frame); + + return 0; +} + + +int +afr_mknod (call_frame_t *frame, xlator_t *this, + loc_t *loc, mode_t mode, dev_t dev) +{ + afr_private_t * priv = NULL; + afr_local_t * local = NULL; + call_frame_t * transaction_frame = NULL; + + int ret = -1; + + int op_ret = -1; + int op_errno = 0; + + VALIDATE_OR_GOTO (frame, out); + VALIDATE_OR_GOTO (this, out); + VALIDATE_OR_GOTO (this->private, out); + + priv = 
this->private; + + transaction_frame = copy_frame (frame); + if (!transaction_frame) { + gf_log (this->name, GF_LOG_ERROR, + "out of memory :("); + goto out; + } + + ALLOC_OR_GOTO (local, afr_local_t, out); + + ret = AFR_LOCAL_INIT (local, priv); + if (ret < 0) { + op_errno = -ret; + goto out; + } + + transaction_frame->local = local; + + loc_copy (&local->loc, loc); + + local->cont.mknod.mode = mode; + local->cont.mknod.dev = dev; + + local->transaction.fop = afr_mknod_wind; + local->transaction.done = afr_mknod_done; + local->transaction.unwind = afr_mknod_unwind; + + afr_build_parent_loc (&local->transaction.parent_loc, loc); + + local->transaction.main_frame = frame; + local->transaction.basename = AFR_BASENAME (loc->path); + local->transaction.pending = AFR_ENTRY_PENDING; + + afr_transaction (transaction_frame, this, AFR_ENTRY_TRANSACTION); + + op_ret = 0; +out: + if (op_ret == -1) { + if (transaction_frame) + AFR_STACK_DESTROY (transaction_frame); + AFR_STACK_UNWIND (frame, op_ret, op_errno, NULL); + } + + return 0; +} + +/* }}} */ + +/* {{{ mkdir */ + + +int +afr_mkdir_unwind (call_frame_t *frame, xlator_t *this) +{ + call_frame_t *main_frame = NULL; + afr_local_t *local = NULL; + + local = frame->local; + + LOCK (&frame->lock); + { + if (local->transaction.main_frame) { + main_frame = local->transaction.main_frame; + } + local->transaction.main_frame = NULL; + } + UNLOCK (&frame->lock); + + if (main_frame) + AFR_STACK_UNWIND (main_frame, local->op_ret, local->op_errno, + local->cont.mkdir.inode, + &local->cont.mkdir.buf); + return 0; +} + + +int +afr_mkdir_wind_cbk (call_frame_t *frame, void *cookie, xlator_t *this, + int32_t op_ret, int32_t op_errno, + inode_t *inode, struct stat *buf) +{ + afr_local_t * local = NULL; + afr_private_t * priv = NULL; + + int call_count = -1; + int child_index = -1; + + local = frame->local; + priv = this->private; + + child_index = (long) cookie; + + LOCK (&frame->lock); + { + if (child_went_down (op_ret, op_errno)) + 
afr_transaction_child_died (frame, this, child_index); + + if (op_ret != -1) { + local->op_ret = op_ret; + + if ((local->success_count == 0) + || (child_index == priv->read_child)) { + local->cont.mkdir.buf = *buf; + local->cont.mkdir.buf.st_ino = + afr_itransform (buf->st_ino, priv->child_count, + child_index); + } + local->cont.mkdir.inode = inode; + + local->success_count++; + } + + local->op_errno = op_errno; + } + UNLOCK (&frame->lock); + + call_count = afr_frame_return (frame); + + if (call_count == 0) { + local->transaction.unwind (frame, this); + + local->transaction.resume (frame, this); + } + + return 0; +} + + +int +afr_mkdir_wind (call_frame_t *frame, xlator_t *this) +{ + afr_local_t *local = NULL; + afr_private_t *priv = NULL; + + int call_count = -1; + int i = 0; + + local = frame->local; + priv = this->private; + + call_count = afr_up_children_count (priv->child_count, local->child_up); + + if (call_count == 0) { + local->transaction.resume (frame, this); + return 0; + } + + local->call_count = call_count; + + for (i = 0; i < priv->child_count; i++) { + if (local->child_up[i]) { + STACK_WIND_COOKIE (frame, afr_mkdir_wind_cbk, + (void *) (long) i, + priv->children[i], + priv->children[i]->fops->mkdir, + &local->loc, local->cont.mkdir.mode); + if (!--call_count) + break; + } + } + + return 0; +} + + +int +afr_mkdir_done (call_frame_t *frame, xlator_t *this) +{ + afr_local_t * local = NULL; + + local = frame->local; + + local->transaction.unwind (frame, this); + + AFR_STACK_DESTROY (frame); + + return 0; +} + + +int +afr_mkdir (call_frame_t *frame, xlator_t *this, + loc_t *loc, mode_t mode) +{ + afr_private_t * priv = NULL; + afr_local_t * local = NULL; + call_frame_t * transaction_frame = NULL; + + int ret = -1; + + int op_ret = -1; + int op_errno = 0; + + VALIDATE_OR_GOTO (frame, out); + VALIDATE_OR_GOTO (this, out); + VALIDATE_OR_GOTO (this->private, out); + + priv = this->private; + + transaction_frame = copy_frame (frame); + if (!transaction_frame) 
{ + gf_log (this->name, GF_LOG_ERROR, + "out of memory :("); + goto out; + } + + ALLOC_OR_GOTO (local, afr_local_t, out); + + ret = AFR_LOCAL_INIT (local, priv); + if (ret < 0) { + op_errno = -ret; + goto out; + } + + transaction_frame->local = local; + + loc_copy (&local->loc, loc); + + local->cont.mkdir.mode = mode; + + local->transaction.fop = afr_mkdir_wind; + local->transaction.done = afr_mkdir_done; + local->transaction.unwind = afr_mkdir_unwind; + + afr_build_parent_loc (&local->transaction.parent_loc, loc); + + local->transaction.main_frame = frame; + local->transaction.basename = AFR_BASENAME (loc->path); + local->transaction.pending = AFR_ENTRY_PENDING; + + afr_transaction (transaction_frame, this, AFR_ENTRY_TRANSACTION); + + op_ret = 0; +out: + if (op_ret == -1) { + if (transaction_frame) + AFR_STACK_DESTROY (transaction_frame); + + AFR_STACK_UNWIND (frame, op_ret, op_errno, NULL); + } + + return 0; +} + +/* }}} */ + +/* {{{ link */ + + +int +afr_link_unwind (call_frame_t *frame, xlator_t *this) +{ + call_frame_t *main_frame = NULL; + afr_local_t *local = NULL; + + local = frame->local; + + LOCK (&frame->lock); + { + if (local->transaction.main_frame) { + main_frame = local->transaction.main_frame; + } + local->transaction.main_frame = NULL; + } + UNLOCK (&frame->lock); + + if (main_frame) { + local->cont.link.buf.st_ino = local->cont.link.ino; + + AFR_STACK_UNWIND (main_frame, local->op_ret, local->op_errno, + local->cont.link.inode, + &local->cont.link.buf); + } + + return 0; +} + + +int +afr_link_wind_cbk (call_frame_t *frame, void *cookie, xlator_t *this, + int32_t op_ret, int32_t op_errno, inode_t *inode, + struct stat *buf) +{ + afr_local_t * local = NULL; + afr_private_t * priv = NULL; + + int call_count = -1; + int child_index = -1; + + local = frame->local; + priv = this->private; + + child_index = (long) cookie; + + LOCK (&frame->lock); + { + if (child_went_down (op_ret, op_errno)) + afr_transaction_child_died (frame, this, child_index); + + if 
(op_ret != -1) { + local->op_ret = op_ret; + + if ((local->success_count == 0) + || (child_index == priv->read_child)) { + local->cont.link.buf = *buf; + local->cont.link.buf.st_ino = + afr_itransform (buf->st_ino, priv->child_count, + child_index); + } + local->cont.link.inode = inode; + + local->success_count++; + } + + local->op_errno = op_errno; + } + UNLOCK (&frame->lock); + + call_count = afr_frame_return (frame); + + if (call_count == 0) { + local->transaction.unwind (frame, this); + + local->transaction.resume (frame, this); + } + + return 0; +} + + +int +afr_link_wind (call_frame_t *frame, xlator_t *this) +{ + afr_local_t *local = NULL; + afr_private_t *priv = NULL; + + int call_count = -1; + int i = 0; + + local = frame->local; + priv = this->private; + + call_count = afr_up_children_count (priv->child_count, local->child_up); + + if (call_count == 0) { + local->transaction.resume (frame, this); + return 0; + } + + local->call_count = call_count; + + for (i = 0; i < priv->child_count; i++) { + if (local->child_up[i]) { + STACK_WIND_COOKIE (frame, afr_link_wind_cbk, (void *) (long) i, + priv->children[i], + priv->children[i]->fops->link, + &local->loc, + &local->newloc); + + if (!--call_count) + break; + } + } + + return 0; +} + + +int +afr_link_done (call_frame_t *frame, xlator_t *this) +{ + afr_local_t * local = frame->local; + + local->transaction.unwind (frame, this); + + AFR_STACK_DESTROY (frame); + + return 0; +} + + +int +afr_link (call_frame_t *frame, xlator_t *this, + loc_t *oldloc, loc_t *newloc) +{ + afr_private_t * priv = NULL; + afr_local_t * local = NULL; + call_frame_t * transaction_frame = NULL; + + int ret = -1; + + int op_ret = -1; + int op_errno = 0; + + VALIDATE_OR_GOTO (frame, out); + VALIDATE_OR_GOTO (this, out); + VALIDATE_OR_GOTO (this->private, out); + + priv = this->private; + + transaction_frame = copy_frame (frame); + if (!transaction_frame) { + gf_log (this->name, GF_LOG_ERROR, + "out of memory :("); + goto out; + } + + 
ALLOC_OR_GOTO (local, afr_local_t, out); + + ret = AFR_LOCAL_INIT (local, priv); + if (ret < 0) { + op_errno = -ret; + goto out; + } + + transaction_frame->local = local; + + loc_copy (&local->loc, oldloc); + loc_copy (&local->newloc, newloc); + + local->cont.link.ino = oldloc->inode->ino; + + local->transaction.fop = afr_link_wind; + local->transaction.done = afr_link_done; + local->transaction.unwind = afr_link_unwind; + + afr_build_parent_loc (&local->transaction.parent_loc, oldloc); + + local->transaction.main_frame = frame; + local->transaction.basename = AFR_BASENAME (oldloc->path); + local->transaction.new_basename = AFR_BASENAME (newloc->path); + local->transaction.pending = AFR_ENTRY_PENDING; + + afr_transaction (transaction_frame, this, AFR_ENTRY_TRANSACTION); + + op_ret = 0; +out: + if (op_ret == -1) { + if (transaction_frame) + AFR_STACK_DESTROY (transaction_frame); + AFR_STACK_UNWIND (frame, op_ret, op_errno); + } + + return 0; +} + +/* }}} */ + +/* {{{ symlink */ + + +int +afr_symlink_unwind (call_frame_t *frame, xlator_t *this) +{ + call_frame_t *main_frame = NULL; + afr_local_t *local = NULL; + + local = frame->local; + + LOCK (&frame->lock); + { + if (local->transaction.main_frame) { + main_frame = local->transaction.main_frame; + } + local->transaction.main_frame = NULL; + } + UNLOCK (&frame->lock); + + if (main_frame) + AFR_STACK_UNWIND (main_frame, local->op_ret, local->op_errno, + local->cont.symlink.inode, + &local->cont.symlink.buf); + return 0; +} + + +int +afr_symlink_wind_cbk (call_frame_t *frame, void *cookie, xlator_t *this, + int32_t op_ret, int32_t op_errno, inode_t *inode, + struct stat *buf) +{ + afr_local_t * local = NULL; + afr_private_t * priv = NULL; + + int call_count = -1; + int child_index = -1; + + local = frame->local; + priv = this->private; + + child_index = (long) cookie; + + LOCK (&frame->lock); + { + if (child_went_down (op_ret, op_errno)) + afr_transaction_child_died (frame, this, child_index); + + if (op_ret != -1) { 
+ local->op_ret = op_ret; + + if ((local->success_count == 0) + || (child_index == priv->read_child)) { + local->cont.symlink.buf = *buf; + local->cont.symlink.buf.st_ino = + afr_itransform (buf->st_ino, priv->child_count, + child_index); + } + local->cont.symlink.inode = inode; + + local->success_count++; + } + + local->op_errno = op_errno; + } + UNLOCK (&frame->lock); + + call_count = afr_frame_return (frame); + + if (call_count == 0) { + local->transaction.unwind (frame, this); + + local->transaction.resume (frame, this); + } + + return 0; +} + + +int +afr_symlink_wind (call_frame_t *frame, xlator_t *this) +{ + afr_local_t *local = NULL; + afr_private_t *priv = NULL; + + int call_count = -1; + int i = 0; + + local = frame->local; + priv = this->private; + + call_count = afr_up_children_count (priv->child_count, local->child_up); + + if (call_count == 0) { + local->transaction.resume (frame, this); + return 0; + } + + local->call_count = call_count; + + for (i = 0; i < priv->child_count; i++) { + if (local->child_up[i]) { + STACK_WIND_COOKIE (frame, afr_symlink_wind_cbk, + (void *) (long) i, + priv->children[i], + priv->children[i]->fops->symlink, + local->cont.symlink.linkpath, + &local->loc); + + if (!--call_count) + break; + + } + } + + return 0; +} + + +int +afr_symlink_done (call_frame_t *frame, xlator_t *this) +{ + afr_local_t * local = frame->local; + + local->transaction.unwind (frame, this); + + AFR_STACK_DESTROY (frame); + + return 0; +} + + +int +afr_symlink (call_frame_t *frame, xlator_t *this, + const char *linkpath, loc_t *loc) +{ + afr_private_t * priv = NULL; + afr_local_t * local = NULL; + call_frame_t * transaction_frame = NULL; + + int ret = -1; + + int op_ret = -1; + int op_errno = 0; + + VALIDATE_OR_GOTO (frame, out); + VALIDATE_OR_GOTO (this, out); + VALIDATE_OR_GOTO (this->private, out); + + priv = this->private; + + transaction_frame = copy_frame (frame); + if (!transaction_frame) { + gf_log (this->name, GF_LOG_ERROR, + "out of memory 
:("); + goto out; + } + + ALLOC_OR_GOTO (local, afr_local_t, out); + + ret = AFR_LOCAL_INIT (local, priv); + if (ret < 0) { + op_errno = -ret; + goto out; + } + + transaction_frame->local = local; + + loc_copy (&local->loc, loc); + + local->cont.symlink.ino = loc->inode->ino; + local->cont.symlink.linkpath = strdup (linkpath); + + local->transaction.fop = afr_symlink_wind; + local->transaction.done = afr_symlink_done; + local->transaction.unwind = afr_symlink_unwind; + + afr_build_parent_loc (&local->transaction.parent_loc, loc); + + local->transaction.main_frame = frame; + local->transaction.basename = AFR_BASENAME (loc->path); + local->transaction.pending = AFR_ENTRY_PENDING; + + afr_transaction (transaction_frame, this, AFR_ENTRY_TRANSACTION); + + op_ret = 0; +out: + if (op_ret == -1) { + if (transaction_frame) + AFR_STACK_DESTROY (transaction_frame); + AFR_STACK_UNWIND (frame, op_ret, op_errno, NULL, NULL); + } + + return 0; +} + +/* }}} */ + +/* {{{ rename */ + +int +afr_rename_unwind (call_frame_t *frame, xlator_t *this) +{ + call_frame_t *main_frame = NULL; + afr_local_t *local = NULL; + + local = frame->local; + + LOCK (&frame->lock); + { + if (local->transaction.main_frame) { + main_frame = local->transaction.main_frame; + } + local->transaction.main_frame = NULL; + } + UNLOCK (&frame->lock); + + if (main_frame) { + local->cont.rename.buf.st_ino = local->cont.rename.ino; + + AFR_STACK_UNWIND (main_frame, local->op_ret, local->op_errno, + &local->cont.rename.buf); + } + + return 0; +} + + +int +afr_rename_wind_cbk (call_frame_t *frame, void *cookie, xlator_t *this, + int32_t op_ret, int32_t op_errno, struct stat *buf) +{ + afr_local_t * local = NULL; + afr_private_t * priv = NULL; + + int call_count = -1; + int child_index = -1; + + local = frame->local; + priv = this->private; + + child_index = (long) cookie; + + LOCK (&frame->lock); + { + if (child_went_down (op_ret, op_errno)) + afr_transaction_child_died (frame, this, child_index); + + if ((op_ret != 
-1) && (local->success_count == 0)) { + local->op_ret = op_ret; + + if (buf) { + local->cont.rename.buf = *buf; + local->cont.rename.buf.st_ino = + afr_itransform (buf->st_ino, priv->child_count, + child_index); + } + local->success_count++; + } + + local->op_errno = op_errno; + } + UNLOCK (&frame->lock); + + call_count = afr_frame_return (frame); + + if (call_count == 0) { + local->transaction.unwind (frame, this); + + local->transaction.resume (frame, this); + } + + return 0; +} + + +int32_t +afr_rename_wind (call_frame_t *frame, xlator_t *this) +{ + afr_local_t *local = NULL; + afr_private_t *priv = NULL; + + int call_count = -1; + int i = 0; + + local = frame->local; + priv = this->private; + + call_count = afr_up_children_count (priv->child_count, local->child_up); + + if (call_count == 0) { + local->transaction.resume (frame, this); + return 0; + } + + local->call_count = call_count; + + for (i = 0; i < priv->child_count; i++) { + if (local->child_up[i]) { + STACK_WIND_COOKIE (frame, afr_rename_wind_cbk, + (void *) (long) i, + priv->children[i], + priv->children[i]->fops->rename, + &local->loc, + &local->newloc); + if (!--call_count) + break; + } + } + + return 0; +} + + +int +afr_rename_done (call_frame_t *frame, xlator_t *this) +{ + afr_local_t * local = frame->local; + + local->transaction.unwind (frame, this); + + AFR_STACK_DESTROY (frame); + + return 0; +} + + +int +afr_rename (call_frame_t *frame, xlator_t *this, + loc_t *oldloc, loc_t *newloc) +{ + afr_private_t * priv = NULL; + afr_local_t * local = NULL; + call_frame_t * transaction_frame = NULL; + + int ret = -1; + + int op_ret = -1; + int op_errno = 0; + + VALIDATE_OR_GOTO (frame, out); + VALIDATE_OR_GOTO (this, out); + VALIDATE_OR_GOTO (this->private, out); + + priv = this->private; + + transaction_frame = copy_frame (frame); + if (!transaction_frame) { + gf_log (this->name, GF_LOG_ERROR, + "out of memory :("); + goto out; + } + + ALLOC_OR_GOTO (local, afr_local_t, out); + + ret = AFR_LOCAL_INIT 
(local, priv); + if (ret < 0) { + op_errno = -ret; + goto out; + } + + transaction_frame->local = local; + + loc_copy (&local->loc, oldloc); + loc_copy (&local->newloc, newloc); + + local->cont.rename.ino = oldloc->inode->ino; + + local->transaction.fop = afr_rename_wind; + local->transaction.done = afr_rename_done; + local->transaction.unwind = afr_rename_unwind; + + afr_build_parent_loc (&local->transaction.parent_loc, oldloc); + afr_build_parent_loc (&local->transaction.new_parent_loc, newloc); + + local->transaction.main_frame = frame; + local->transaction.basename = AFR_BASENAME (oldloc->path); + local->transaction.new_basename = AFR_BASENAME (newloc->path); + local->transaction.pending = AFR_ENTRY_PENDING; + + afr_transaction (transaction_frame, this, AFR_ENTRY_RENAME_TRANSACTION); + + op_ret = 0; +out: + if (op_ret == -1) { + if (transaction_frame) + AFR_STACK_DESTROY (transaction_frame); + + AFR_STACK_UNWIND (frame, op_ret, op_errno, NULL); + } + + return 0; +} + +/* }}} */ + +/* {{{ unlink */ + +int +afr_unlink_unwind (call_frame_t *frame, xlator_t *this) +{ + call_frame_t *main_frame = NULL; + afr_local_t *local = NULL; + + local = frame->local; + + LOCK (&frame->lock); + { + if (local->transaction.main_frame) { + main_frame = local->transaction.main_frame; + } + local->transaction.main_frame = NULL; + } + UNLOCK (&frame->lock); + + if (main_frame) + AFR_STACK_UNWIND (main_frame, local->op_ret, local->op_errno); + + return 0; +} + + +int +afr_unlink_wind_cbk (call_frame_t *frame, void *cookie, xlator_t *this, + int32_t op_ret, int32_t op_errno) +{ + afr_local_t * local = NULL; + afr_private_t * priv = NULL; + + int call_count = -1; + int child_index = (long) cookie; + int need_unwind = 0; + + local = frame->local; + priv = this->private; + + LOCK (&frame->lock); + { + if (child_went_down (op_ret, op_errno)) + afr_transaction_child_died (frame, this, child_index); + + if (op_ret != -1) { + if (local->success_count == 0) { + local->op_ret = op_ret; + } + 
local->success_count++; + + if (local->success_count == priv->wait_count) { + need_unwind = 1; + } + } + + local->op_errno = op_errno; + } + UNLOCK (&frame->lock); + + call_count = afr_frame_return (frame); + + if (call_count == 0) { + local->transaction.unwind (frame, this); + + local->transaction.resume (frame, this); + } + + return 0; +} + + +int32_t +afr_unlink_wind (call_frame_t *frame, xlator_t *this) +{ + afr_local_t *local = NULL; + afr_private_t *priv = NULL; + + int call_count = -1; + int i = 0; + + local = frame->local; + priv = this->private; + + call_count = afr_up_children_count (priv->child_count, local->child_up); + + if (call_count == 0) { + local->transaction.resume (frame, this); + return 0; + } + + local->call_count = call_count; + + for (i = 0; i < priv->child_count; i++) { + if (local->child_up[i]) { + STACK_WIND_COOKIE (frame, afr_unlink_wind_cbk, + (void *) (long) i, + priv->children[i], + priv->children[i]->fops->unlink, + &local->loc); + + if (!--call_count) + break; + } + } + + return 0; +} + + +int32_t +afr_unlink_done (call_frame_t *frame, xlator_t *this) +{ + afr_local_t * local = frame->local; + + local->transaction.unwind (frame, this); + + AFR_STACK_DESTROY (frame); + + return 0; +} + + +int32_t +afr_unlink (call_frame_t *frame, xlator_t *this, + loc_t *loc) +{ + afr_private_t * priv = NULL; + afr_local_t * local = NULL; + call_frame_t * transaction_frame = NULL; + + int ret = -1; + + int op_ret = -1; + int op_errno = 0; + + VALIDATE_OR_GOTO (frame, out); + VALIDATE_OR_GOTO (this, out); + VALIDATE_OR_GOTO (this->private, out); + + priv = this->private; + + transaction_frame = copy_frame (frame); + if (!transaction_frame) { + gf_log (this->name, GF_LOG_ERROR, + "out of memory :("); + goto out; + } + + ALLOC_OR_GOTO (local, afr_local_t, out); + + ret = AFR_LOCAL_INIT (local, priv); + if (ret < 0) { + op_errno = -ret; + goto out; + } + + transaction_frame->local = local; + + loc_copy (&local->loc, loc); + + local->transaction.fop = 
afr_unlink_wind; + local->transaction.done = afr_unlink_done; + local->transaction.unwind = afr_unlink_unwind; + + afr_build_parent_loc (&local->transaction.parent_loc, loc); + + local->transaction.main_frame = frame; + local->transaction.basename = AFR_BASENAME (loc->path); + local->transaction.pending = AFR_ENTRY_PENDING; + + afr_transaction (transaction_frame, this, AFR_ENTRY_TRANSACTION); + + op_ret = 0; +out: + if (op_ret == -1) { + if (transaction_frame) + AFR_STACK_DESTROY (transaction_frame); + AFR_STACK_UNWIND (frame, op_ret, op_errno); + } + + return 0; +} + +/* }}} */ + +/* {{{ rmdir */ + + + +int +afr_rmdir_unwind (call_frame_t *frame, xlator_t *this) +{ + call_frame_t *main_frame = NULL; + afr_local_t *local = NULL; + + local = frame->local; + + LOCK (&frame->lock); + { + if (local->transaction.main_frame) { + main_frame = local->transaction.main_frame; + } + local->transaction.main_frame = NULL; + } + UNLOCK (&frame->lock); + + if (main_frame) + AFR_STACK_UNWIND (main_frame, local->op_ret, local->op_errno); + + return 0; +} + + +int +afr_rmdir_wind_cbk (call_frame_t *frame, void *cookie, xlator_t *this, + int32_t op_ret, int32_t op_errno) +{ + afr_local_t * local = NULL; + afr_private_t * priv = NULL; + + int call_count = -1; + int child_index = (long) cookie; + int need_unwind = 0; + + local = frame->local; + priv = this->private; + + LOCK (&frame->lock); + { + if (child_went_down (op_ret, op_errno)) + afr_transaction_child_died (frame, this, child_index); + + if (op_ret != -1) { + if (local->success_count == 0) { + local->op_ret = op_ret; + } + local->success_count++; + + if (local->success_count == priv->wait_count) + need_unwind = 1; + } + + local->op_errno = op_errno; + } + UNLOCK (&frame->lock); + + call_count = afr_frame_return (frame); + + if (call_count == 0) { + local->transaction.unwind (frame, this); + + local->transaction.resume (frame, this); + } + + return 0; +} + + +int +afr_rmdir_wind (call_frame_t *frame, xlator_t *this) +{ + 
afr_local_t *local = NULL; + afr_private_t *priv = NULL; + + int call_count = -1; + int i = 0; + + local = frame->local; + priv = this->private; + + call_count = afr_up_children_count (priv->child_count, local->child_up); + + if (call_count == 0) { + local->transaction.resume (frame, this); + return 0; + } + + local->call_count = call_count; + + for (i = 0; i < priv->child_count; i++) { + if (local->child_up[i]) { + STACK_WIND_COOKIE (frame, afr_rmdir_wind_cbk, + (void *) (long) i, + priv->children[i], + priv->children[i]->fops->rmdir, + &local->loc); + + if (!--call_count) + break; + } + } + + return 0; +} + + +int +afr_rmdir_done (call_frame_t *frame, xlator_t *this) +{ + afr_local_t * local = frame->local; + + local->transaction.unwind (frame, this); + + AFR_STACK_DESTROY (frame); + + return 0; +} + + +int +afr_rmdir (call_frame_t *frame, xlator_t *this, + loc_t *loc) +{ + afr_private_t * priv = NULL; + afr_local_t * local = NULL; + call_frame_t * transaction_frame = NULL; + + int ret = -1; + + int op_ret = -1; + int op_errno = 0; + + VALIDATE_OR_GOTO (frame, out); + VALIDATE_OR_GOTO (this, out); + VALIDATE_OR_GOTO (this->private, out); + + priv = this->private; + + transaction_frame = copy_frame (frame); + if (!transaction_frame) { + gf_log (this->name, GF_LOG_ERROR, + "out of memory :("); + goto out; + } + + ALLOC_OR_GOTO (local, afr_local_t, out); + + ret = AFR_LOCAL_INIT (local, priv); + if (ret < 0) { + op_errno = -ret; + goto out; + } + + transaction_frame->local = local; + + loc_copy (&local->loc, loc); + + local->transaction.fop = afr_rmdir_wind; + local->transaction.done = afr_rmdir_done; + local->transaction.unwind = afr_rmdir_unwind; + + afr_build_parent_loc (&local->transaction.parent_loc, loc); + + local->transaction.main_frame = frame; + local->transaction.basename = AFR_BASENAME (loc->path); + local->transaction.pending = AFR_ENTRY_PENDING; + + afr_transaction (transaction_frame, this, AFR_ENTRY_TRANSACTION); + + op_ret = 0; +out: + if (op_ret == 
-1) { + if (transaction_frame) + AFR_STACK_DESTROY (transaction_frame); + AFR_STACK_UNWIND (frame, op_ret, op_errno); + } + + return 0; +} + +/* }}} */ + +/* {{{ setdents */ + +int32_t +afr_setdents_wind_cbk (call_frame_t *frame, void *cookie, xlator_t *this, + int32_t op_ret, int32_t op_errno) +{ + afr_local_t * local = NULL; + afr_private_t * priv = NULL; + + int call_count = -1; + int child_index = (long) cookie; + + local = frame->local; + priv = this->private; + + LOCK (&frame->lock); + { + if (child_went_down (op_ret, op_errno)) + afr_transaction_child_died (frame, this, child_index); + + if ((op_ret != -1) && (local->success_count == 0)) { + local->op_ret = op_ret; + local->success_count++; + } + + local->op_errno = op_errno; + } + UNLOCK (&frame->lock); + + call_count = afr_frame_return (frame); + + if (call_count == 0) { + local->transaction.resume (frame, this); + } + + return 0; +} + + +int32_t +afr_setdents_wind (call_frame_t *frame, xlator_t *this) +{ + afr_local_t *local = NULL; + afr_private_t *priv = NULL; + + int call_count = -1; + int i = 0; + + local = frame->local; + priv = this->private; + + call_count = afr_up_children_count (priv->child_count, local->child_up); + + if (call_count == 0) { + local->transaction.resume (frame, this); + return 0; + } + + local->call_count = call_count; + + for (i = 0; i < priv->child_count; i++) { + if (local->child_up[i]) { + STACK_WIND_COOKIE (frame, afr_setdents_wind_cbk, + (void *) (long) i, + priv->children[i], + priv->children[i]->fops->setdents, + local->fd, local->cont.setdents.flags, + local->cont.setdents.entries, + local->cont.setdents.count); + + if (!--call_count) + break; + } + } + + return 0; +} + + +int32_t +afr_setdents_done (call_frame_t *frame, xlator_t *this) +{ + afr_local_t * local = frame->local; + + AFR_STACK_UNWIND (frame, local->op_ret, local->op_errno); + + return 0; +} + + +int32_t +afr_setdents (call_frame_t *frame, xlator_t *this, + fd_t *fd, int32_t flags, dir_entry_t *entries, 
int32_t count) +{ + afr_private_t * priv = NULL; + afr_local_t * local = NULL; + + int ret = -1; + + int op_ret = -1; + int op_errno = 0; + + VALIDATE_OR_GOTO (frame, out); + VALIDATE_OR_GOTO (this, out); + VALIDATE_OR_GOTO (this->private, out); + + priv = this->private; + + ALLOC_OR_GOTO (local, afr_local_t, out); + + ret = AFR_LOCAL_INIT (local, priv); + if (ret < 0) { + op_errno = -ret; + goto out; + } + + frame->local = local; + + local->fd = fd_ref (fd); + + local->cont.setdents.flags = flags; + local->cont.setdents.entries = entries; + local->cont.setdents.count = count; + + local->transaction.fop = afr_setdents_wind; + local->transaction.done = afr_setdents_done; + + local->transaction.basename = NULL; + local->transaction.pending = AFR_ENTRY_PENDING; + + afr_transaction (frame, this, AFR_ENTRY_TRANSACTION); + + op_ret = 0; +out: + if (op_ret == -1) { + AFR_STACK_UNWIND (frame, op_ret, op_errno); + } + + return 0; +} + +/* }}} */ diff --git a/xlators/cluster/afr/src/afr-dir-write.h b/xlators/cluster/afr/src/afr-dir-write.h new file mode 100644 index 000000000..e6e8a5e79 --- /dev/null +++ b/xlators/cluster/afr/src/afr-dir-write.h @@ -0,0 +1,59 @@ +/* + Copyright (c) 2007, 2008 Z RESEARCH, Inc. <http://www.zresearch.com> + This file is part of GlusterFS. + + GlusterFS is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published + by the Free Software Foundation; either version 3 of the License, + or (at your option) any later version. + + GlusterFS is distributed in the hope that it will be useful, but + WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program. If not, see + <http://www.gnu.org/licenses/>. 
+*/
+
+#ifndef __DIR_WRITE_H__
+#define __DIR_WRITE_H__
+/*
+ * Prototypes of AFR fop entry points.  The definitions of unlink, rmdir,
+ * rename, symlink and setdents that accompany this header each hand the
+ * request to afr_transaction () as an entry transaction; create, mknod
+ * and mkdir presumably do the same -- TODO confirm against their
+ * definitions.
+ *
+ * NOTE(review): afr_rmdir, afr_rename and afr_symlink are defined with
+ * return type `int` rather than the int32_t declared here, and the
+ * definition of afr_symlink names its last parameter `loc`, not `oldloc`.
+ */
+int32_t
+afr_create (call_frame_t *frame, xlator_t *this,
+            loc_t *loc, int32_t flags, mode_t mode, fd_t *fd);
+
+int32_t
+afr_mknod (call_frame_t *frame, xlator_t *this,
+           loc_t *loc, mode_t mode, dev_t dev);
+
+int32_t
+afr_mkdir (call_frame_t *frame, xlator_t *this,
+           loc_t *loc, mode_t mode);
+
+int32_t
+afr_unlink (call_frame_t *frame, xlator_t *this,
+            loc_t *loc);
+
+int32_t
+afr_rmdir (call_frame_t *frame, xlator_t *this,
+           loc_t *loc);
+
+int32_t
+afr_link (call_frame_t *frame, xlator_t *this,
+          loc_t *oldloc, loc_t *newloc);
+
+int32_t
+afr_rename (call_frame_t *frame, xlator_t *this,
+            loc_t *oldloc, loc_t *newloc);
+
+int32_t
+afr_symlink (call_frame_t *frame, xlator_t *this,
+             const char *linkpath, loc_t *oldloc);
+
+int32_t
+afr_setdents (call_frame_t *frame, xlator_t *this,
+              fd_t *fd, int32_t flags, dir_entry_t *entries, int32_t count);
+
+#endif /* __DIR_WRITE_H__ */
diff --git a/xlators/cluster/afr/src/afr-inode-read.c b/xlators/cluster/afr/src/afr-inode-read.c
new file mode 100644
index 000000000..a6c99ec05
--- /dev/null
+++ b/xlators/cluster/afr/src/afr-inode-read.c
@@ -0,0 +1,721 @@
+/*
+   Copyright (c) 2007, 2008 Z RESEARCH, Inc. <http://www.zresearch.com>
+   This file is part of GlusterFS.
+
+   GlusterFS is free software; you can redistribute it and/or modify
+   it under the terms of the GNU General Public License as published
+   by the Free Software Foundation; either version 3 of the License,
+   or (at your option) any later version.
+
+   GlusterFS is distributed in the hope that it will be useful, but
+   WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+   General Public License for more details.
+
+   You should have received a copy of the GNU General Public License
+   along with this program. If not, see
+   <http://www.gnu.org/licenses/>.
+*/
+
+
+#include <libgen.h>
+#include <unistd.h>
+#include <fnmatch.h>
+#include <sys/time.h>
+#include <stdlib.h>
+#include <signal.h>
+
+#ifndef _CONFIG_H
+#define _CONFIG_H
+#include "config.h"
+#endif
+
+#include "glusterfs.h"
+#include "afr.h"
+#include "dict.h"
+#include "xlator.h"
+#include "hashfn.h"
+#include "logging.h"
+#include "stack.h"
+#include "list.h"
+#include "call-stub.h"
+#include "defaults.h"
+#include "common-utils.h"
+#include "compat-errno.h"
+#include "compat.h"
+
+#include "afr.h"
+
+
+/**
+ * Common algorithm for inode read calls:
+ *
+ * - Wind the fop to a single child first:
+ *     access, readlink and getxattr pick the first child that is up;
+ *     stat and fstat pick the child that afr_deitransform () derives
+ *     from the inode number
+ * - if that child fails (the callbacks retry on any op_ret == -1,
+ *   not only on ENOTCONN):
+ *     try the next child, until all_tried () reports none are left
+ *
+ * Applicable to: access, stat, fstat, readlink, getxattr
+ */
+
+/* {{{ access */
+
+/**
+ * afr_access_cbk - reply handler for an access fop wound by afr_access ()
+ *
+ * @frame:    frame whose ->local holds the saved loc, mask and last_tried
+ * @cookie:   index of the child that was tried (not read here)
+ * @this:     this translator; ->private is the afr_private_t
+ * @op_ret:   -1 on failure
+ * @op_errno: error reported by the child
+ *
+ * On failure the fop is re-wound to child last_tried + 1, unless
+ * all_tried () says every child has been attempted.  Otherwise
+ * op_ret/op_errno are unwound to the caller unchanged.
+ *
+ * Always returns 0.
+ */
+int32_t
+afr_access_cbk (call_frame_t *frame, void *cookie,
+                xlator_t *this, int32_t op_ret, int32_t op_errno)
+{
+        afr_private_t * priv     = NULL;
+        afr_local_t *   local    = NULL;
+        xlator_t **     children = NULL;
+
+        int unwind     = 1;
+        int last_tried = -1;
+        int this_try   = -1;
+
+        priv     = this->private;
+        children = priv->children;
+
+        local = frame->local;
+
+        if (op_ret == -1) {
+                last_tried = local->cont.access.last_tried;
+
+                if (all_tried (last_tried, priv->child_count)) {
+                        goto out;
+                }
+                this_try = ++local->cont.access.last_tried;
+
+                unwind = 0;
+
+                STACK_WIND_COOKIE (frame, afr_access_cbk,
+                                   (void *) (long) this_try,
+                                   children[this_try],
+                                   children[this_try]->fops->access,
+                                   &local->loc, local->cont.access.mask);
+        }
+
+out:
+        if (unwind) {
+                AFR_STACK_UNWIND (frame, op_ret, op_errno);
+        }
+
+        return 0;
+}
+
+
+/**
+ * afr_access - access fop entry point
+ *
+ * @frame: caller's frame; receives the afr_local_t in ->local
+ * @this:  this translator
+ * @loc:   location to check; copied into local->loc for retries
+ * @mask:  access mask; saved in local->cont.access.mask for retries
+ *
+ * Winds access to the first child that afr_first_up_child () returns.
+ * If no child is up, or validation/allocation fails, unwinds -1 (with
+ * ENOTCONN in the no-child case).
+ *
+ * Always returns 0; the result is delivered through the unwind.
+ */
+int32_t
+afr_access (call_frame_t *frame, xlator_t *this,
+            loc_t *loc, int32_t mask)
+{
+        afr_private_t * priv       = NULL;
+        xlator_t **     children   = NULL;
+        int             call_child = 0;
+        afr_local_t *   local      = NULL;
+
+        int32_t op_ret   = -1;
+        int32_t op_errno = 0;
+
+        VALIDATE_OR_GOTO (frame, out);
+        VALIDATE_OR_GOTO (this, out);
+        VALIDATE_OR_GOTO (this->private, out);
+
+        priv = this->private;
+        VALIDATE_OR_GOTO (priv->children, out);
+
+        children = priv->children;
+
+        ALLOC_OR_GOTO (local, afr_local_t, out);
+
+        /*
+          afr_access_cbk reads frame->local to find the retry state.
+          Without this assignment it dereferenced NULL on the first
+          failed reply, and nothing else held a pointer to `local`.
+          All the other read fops in this file attach it here too.
+        */
+        frame->local = local;
+
+        call_child = afr_first_up_child (priv);
+        if (call_child == -1) {
+                op_errno = ENOTCONN;
+                gf_log (this->name, GF_LOG_ERROR,
+                        "no child is up :(");
+                goto out;
+        }
+
+        local->cont.access.last_tried = call_child;
+        loc_copy (&local->loc, loc);
+        local->cont.access.mask       = mask;
+
+        STACK_WIND_COOKIE (frame, afr_access_cbk,
+                           (void *) (long) call_child,
+                           children[call_child], children[call_child]->fops->access,
+                           loc, mask);
+
+        op_ret = 0;
+out:
+        if (op_ret == -1) {
+                AFR_STACK_UNWIND (frame, op_ret, op_errno);
+        }
+        return 0;
+}
+
+
+/* }}} */
+
+/* {{{ stat */
+
+/**
+ * afr_stat_cbk - reply handler for a stat fop wound by afr_stat ()
+ *
+ * @cookie: index of the child afr_stat () tried first; it is passed
+ *          along unchanged on every retry so that child is not tried
+ *          a second time
+ * @buf:    stat result from the child (only touched on success)
+ *
+ * On failure, walks the children in index order starting at
+ * last_tried + 1, skipping the cookie child, until all_tried () says
+ * none are left.  On success the inode number in @buf is replaced by
+ * the one saved in local->cont.stat.ino before unwinding.
+ *
+ * Always returns 0.
+ */
+int32_t
+afr_stat_cbk (call_frame_t *frame, void *cookie,
+              xlator_t *this, int32_t op_ret, int32_t op_errno,
+              struct stat *buf)
+{
+        afr_private_t * priv     = NULL;
+        afr_local_t *   local    = NULL;
+        xlator_t **     children = NULL;
+
+        int deitransform_child = -1;
+
+        int unwind     = 1;
+        int last_tried = -1;
+        int this_try   = -1;
+
+        priv     = this->private;
+        children = priv->children;
+
+        deitransform_child = (long) cookie;
+
+        local = frame->local;
+
+        if (op_ret == -1) {
+        retry:
+                last_tried = local->cont.stat.last_tried;
+
+                if (all_tried (last_tried, priv->child_count)) {
+                        goto out;
+                }
+                this_try = ++local->cont.stat.last_tried;
+
+                if (this_try == deitransform_child) {
+                        /*
+                          skip the deitransform'd child since if we are here
+                          we must have already tried that child
+                        */
+                        goto retry;
+                }
+
+                unwind = 0;
+
+                STACK_WIND_COOKIE (frame, afr_stat_cbk,
+                                   (void *) (long) deitransform_child,
+                                   children[this_try],
+                                   children[this_try]->fops->stat,
+                                   &local->loc);
+        }
+
+out:
+        if (unwind) {
+                if (op_ret != -1)
+                        buf->st_ino = local->cont.stat.ino;
+
+                AFR_STACK_UNWIND (frame, op_ret, op_errno, buf);
+        }
+
+        return 0;
+}
+
+
+/**
+ * afr_stat - stat fop entry point
+ *
+ * @frame: caller's frame; receives the afr_local_t in ->local
+ * @this:  this translator
+ * @loc:   location to stat; copied into local->loc for retries
+ *
+ * Winds stat to the child that afr_deitransform () selects from
+ * loc->inode->ino and remembers that inode number so the callback can
+ * put it back into the reply.
+ *
+ * NOTE(review): loc and loc->inode are dereferenced without a
+ * VALIDATE_OR_GOTO check, unlike fd and fd->inode in afr_fstat ().
+ *
+ * Always returns 0; the result is delivered through the unwind.
+ */
+int32_t
+afr_stat (call_frame_t *frame, xlator_t *this,
+          loc_t *loc)
+{
+        afr_private_t * priv     = NULL;
+        afr_local_t *   local    = NULL;
+        xlator_t **     children = NULL;
+
+        int call_child = 0;
+
+        int32_t op_ret   = -1;
+        int32_t op_errno = 0;
+
+        VALIDATE_OR_GOTO (frame, out);
+        VALIDATE_OR_GOTO (this, out);
+        VALIDATE_OR_GOTO (this->private, out);
+
+        priv = this->private;
+        VALIDATE_OR_GOTO (priv->children, out);
+
+        children = priv->children;
+
+        ALLOC_OR_GOTO (local, afr_local_t, out);
+
+        frame->local = local;
+
+        call_child = afr_deitransform (loc->inode->ino, priv->child_count);
+        loc_copy (&local->loc, loc);
+
+        /*
+          if stat fails from the deitransform'd child, we try
+          all children starting with the first one
+        */
+        local->cont.stat.last_tried = -1;
+        local->cont.stat.ino = loc->inode->ino;
+
+        STACK_WIND_COOKIE (frame, afr_stat_cbk, (void *) (long) call_child,
+                           children[call_child],
+                           children[call_child]->fops->stat,
+                           loc);
+
+        op_ret = 0;
+out:
+        if (op_ret == -1) {
+                AFR_STACK_UNWIND (frame, op_ret, op_errno, NULL);
+        }
+
+        return 0;
+}
+
+
+/* }}} */
+
+/* {{{ fstat */
+
+/**
+ * afr_fstat_cbk - reply handler for an fstat fop wound by afr_fstat ()
+ *
+ * Same retry scheme as afr_stat_cbk (), operating on local->fd and the
+ * local->cont.fstat state: @cookie is the index of the child tried
+ * first and is skipped while walking the remaining children.  On
+ * success the inode number in @buf is replaced by
+ * local->cont.fstat.ino.
+ *
+ * Always returns 0.
+ */
+int32_t
+afr_fstat_cbk (call_frame_t *frame, void *cookie,
+               xlator_t *this, int32_t op_ret, int32_t op_errno,
+               struct stat *buf)
+{
+        afr_private_t * priv     = NULL;
+        afr_local_t *   local    = NULL;
+        xlator_t **     children = NULL;
+
+        int deitransform_child = -1;
+
+        int unwind     = 1;
+        int last_tried = -1;
+        int this_try   = -1;
+
+        priv     = this->private;
+        children = priv->children;
+
+        deitransform_child = (long) cookie;
+
+        local = frame->local;
+
+        if (op_ret == -1) {
+        retry:
+                last_tried = local->cont.fstat.last_tried;
+
+                if (all_tried (last_tried, priv->child_count)) {
+                        goto out;
+                }
+                this_try = ++local->cont.fstat.last_tried;
+
+                if (this_try == deitransform_child) {
+                        /*
+                          skip the deitransform'd child since if we are here
+                          we must have already tried that child
+                        */
+                        goto retry;
+                }
+
+
+                unwind = 0;
+
+                STACK_WIND_COOKIE (frame, afr_fstat_cbk,
+                                   (void *) (long) deitransform_child,
+                                   children[this_try],
+                                   children[this_try]->fops->fstat,
+                                   local->fd);
+        }
+
+out:
+        if (unwind) {
+                if (op_ret != -1)
+                        buf->st_ino = local->cont.fstat.ino;
+
+                AFR_STACK_UNWIND (frame, op_ret, op_errno, buf);
+        }
+
+        return 0;
+}
+
+
+/**
+ * afr_fstat - fstat fop entry point
+ *
+ * @frame: caller's frame; receives the afr_local_t in ->local
+ * @this:  this translator
+ * @fd:    open file; a reference taken with fd_ref () is kept in
+ *         local->fd for retries
+ *
+ * Winds fstat to the child that afr_deitransform () selects from
+ * fd->inode->ino and remembers that inode number for the callback.
+ *
+ * Always returns 0; the result is delivered through the unwind.
+ */
+int32_t
+afr_fstat (call_frame_t *frame, xlator_t *this,
+           fd_t *fd)
+{
+        afr_private_t * priv     = NULL;
+        afr_local_t *   local    = NULL;
+        xlator_t **     children = NULL;
+
+        int call_child = 0;
+
+        int32_t op_ret   = -1;
+        int32_t op_errno = 0;
+
+        VALIDATE_OR_GOTO (frame, out);
+        VALIDATE_OR_GOTO (this, out);
+        VALIDATE_OR_GOTO (fd, out);
+        VALIDATE_OR_GOTO (this->private, out);
+
+        priv = this->private;
+        VALIDATE_OR_GOTO (priv->children, out);
+
+        children = priv->children;
+
+        ALLOC_OR_GOTO (local, afr_local_t, out);
+
+        frame->local = local;
+
+        VALIDATE_OR_GOTO (fd->inode, out);
+
+        call_child = afr_deitransform (fd->inode->ino, priv->child_count);
+
+        /*
+          if fstat fails from the deitransform'd child, we try
+          all children starting with the first one
+        */
+        local->cont.fstat.last_tried = -1;
+        local->cont.fstat.ino = fd->inode->ino;
+        local->fd = fd_ref (fd);
+
+        STACK_WIND_COOKIE (frame, afr_fstat_cbk, (void *) (long) call_child,
+                           children[call_child],
+                           children[call_child]->fops->fstat,
+                           fd);
+
+        op_ret = 0;
+out:
+        if (op_ret == -1) {
+                AFR_STACK_UNWIND (frame, op_ret, op_errno, NULL);
+        }
+
+        return 0;
+}
+
+/* }}} */
+
+/* {{{ readlink */
+
+/**
+ * afr_readlink_cbk - reply handler for a readlink fop
+ *
+ * @cookie: index of the child that was tried (not read here)
+ * @buf:    link target returned by the child; forwarded as-is
+ *
+ * On failure the fop is re-wound to child last_tried + 1 with the loc
+ * and size saved in frame->local, unless all_tried () says every child
+ * has been attempted.  Otherwise the reply is unwound unchanged.
+ *
+ * Always returns 0.
+ */
+int32_t
+afr_readlink_cbk (call_frame_t *frame, void *cookie,
+                  xlator_t *this, int32_t op_ret, int32_t op_errno,
+                  const char *buf)
+{
+        afr_private_t * priv     = NULL;
+        afr_local_t *   local    = NULL;
+        xlator_t **     children = NULL;
+
+        int unwind     = 1;
+        int last_tried = -1;
+        int this_try   = -1;
+
+        priv     = this->private;
+        children = priv->children;
+
+        local = frame->local;
+
+        if (op_ret == -1) {
+                last_tried = local->cont.readlink.last_tried;
+
+                if (all_tried (last_tried, priv->child_count)) {
+                        goto out;
+                }
+                this_try = ++local->cont.readlink.last_tried;
+
+                unwind = 0;
+                STACK_WIND_COOKIE (frame, afr_readlink_cbk,
+                                   (void *) (long) this_try,
+                                   children[this_try],
+                                   children[this_try]->fops->readlink,
+                                   &local->loc,
+                                   local->cont.readlink.size);
+        }
+
+out:
+        if (unwind) {
+                AFR_STACK_UNWIND (frame, op_ret, op_errno, buf);
+        }
+
+        return 0;
+}
+
+
+int32_t +afr_readlink (call_frame_t *frame, xlator_t *this, + loc_t *loc, size_t size) +{ + afr_private_t * priv = NULL; + xlator_t ** children = NULL; + int call_child = 0; + afr_local_t *local = NULL; + + int32_t op_ret = -1; + int32_t op_errno = 0; + + VALIDATE_OR_GOTO (frame, out); + VALIDATE_OR_GOTO (this, out); + VALIDATE_OR_GOTO (this->private, out); + + priv = this->private; + VALIDATE_OR_GOTO (priv->children, out); + + children = priv->children; + + ALLOC_OR_GOTO (local, afr_local_t, out); + + frame->local = local; + + call_child = afr_first_up_child (priv); + if (call_child == -1) { + op_errno = ENOTCONN; + gf_log (this->name, GF_LOG_ERROR, + "no child is up :("); + goto out; + } + + local->cont.readlink.last_tried = call_child; + loc_copy (&local->loc, loc); + local->cont.readlink.size = size; + + STACK_WIND_COOKIE (frame, afr_readlink_cbk, + (void *) (long) call_child, + children[call_child], children[call_child]->fops->readlink, + loc, size); + + op_ret = 0; +out: + if (op_ret == -1) { + AFR_STACK_UNWIND (frame, op_ret, op_errno, NULL); + } + return 0; +} + + +/* }}} */ + +/* {{{ getxattr */ + +int32_t +afr_getxattr_cbk (call_frame_t *frame, void *cookie, + xlator_t *this, int32_t op_ret, int32_t op_errno, + dict_t *dict) +{ + afr_private_t * priv = NULL; + afr_local_t * local = NULL; + xlator_t ** children = NULL; + + int unwind = 1; + int last_tried = -1; + int this_try = -1; + + priv = this->private; + children = priv->children; + + local = frame->local; + + if (op_ret == -1) { + last_tried = local->cont.getxattr.last_tried; + + if (all_tried (last_tried, priv->child_count)) { + goto out; + } + this_try = ++local->cont.getxattr.last_tried; + + unwind = 0; + STACK_WIND_COOKIE (frame, afr_getxattr_cbk, + (void *) (long) this_try, + children[this_try], + children[this_try]->fops->getxattr, + &local->loc, + local->cont.getxattr.name); + } + +out: + if (unwind) { + AFR_STACK_UNWIND (frame, op_ret, op_errno, dict); + } + + return 0; +} + + +int32_t 
+afr_getxattr (call_frame_t *frame, xlator_t *this, + loc_t *loc, const char *name) +{ + afr_private_t * priv = NULL; + xlator_t ** children = NULL; + int call_child = 0; + afr_local_t * local = NULL; + + int32_t op_ret = -1; + int32_t op_errno = 0; + + VALIDATE_OR_GOTO (frame, out); + VALIDATE_OR_GOTO (this, out); + VALIDATE_OR_GOTO (this->private, out); + + priv = this->private; + VALIDATE_OR_GOTO (priv->children, out); + + children = priv->children; + + ALLOC_OR_GOTO (local, afr_local_t, out); + frame->local = local; + + call_child = afr_first_up_child (priv); + if (call_child == -1) { + op_errno = ENOTCONN; + gf_log (this->name, GF_LOG_ERROR, + "no child is up :("); + goto out; + } + + local->cont.getxattr.last_tried = call_child; + loc_copy (&local->loc, loc); + if (name) + local->cont.getxattr.name = strdup (name); + + STACK_WIND_COOKIE (frame, afr_getxattr_cbk, + (void *) (long) call_child, + children[call_child], children[call_child]->fops->getxattr, + loc, name); + + op_ret = 0; +out: + if (op_ret == -1) { + AFR_STACK_UNWIND (frame, op_ret, op_errno, NULL); + } + return 0; +} + + +/* }}} */ + +/* {{{ readv */ + +/** + * read algorithm: + * + * if the user has specified a read subvolume, use it + * otherwise - + * use the inode number to hash it to one of the subvolumes, and + * read from there (to balance read load) + * + * if any of the above read's fail, try the children in sequence + * beginning at the beginning + */ + +int32_t +afr_readv_cbk (call_frame_t *frame, void *cookie, + xlator_t *this, int32_t op_ret, int32_t op_errno, + struct iovec *vector, int32_t count, struct stat *buf) +{ + afr_private_t * priv = NULL; + afr_local_t * local = NULL; + xlator_t ** children = NULL; + + int unwind = 1; + int last_tried = -1; + int this_try = -1; + + VALIDATE_OR_GOTO (frame, out); + VALIDATE_OR_GOTO (this, out); + VALIDATE_OR_GOTO (this->private, out); + + priv = this->private; + VALIDATE_OR_GOTO (priv->children, out); + + children = priv->children; + + local 
= frame->local; + + if (op_ret == -1) { + retry: + last_tried = local->cont.readv.last_tried; + + if (all_tried (last_tried, priv->child_count)) { + goto out; + } + this_try = ++local->cont.readv.last_tried; + + if (this_try == priv->read_child) { + /* + skip the read child since if we are here + we must have already tried that child + */ + goto retry; + } + + unwind = 0; + + STACK_WIND_COOKIE (frame, afr_readv_cbk, + (void *) (long) this_try, + children[this_try], + children[this_try]->fops->readv, + local->fd, local->cont.readv.size, + local->cont.readv.offset); + } + +out: + if (unwind) { + AFR_STACK_UNWIND (frame, op_ret, op_errno, vector, count, buf); + } + + return 0; +} + + +int32_t +afr_readv (call_frame_t *frame, xlator_t *this, + fd_t *fd, size_t size, off_t offset) +{ + afr_private_t * priv = NULL; + afr_local_t * local = NULL; + xlator_t ** children = NULL; + + int call_child = 0; + + int32_t op_ret = -1; + int32_t op_errno = 0; + + VALIDATE_OR_GOTO (frame, out); + VALIDATE_OR_GOTO (this, out); + VALIDATE_OR_GOTO (this->private, out); + VALIDATE_OR_GOTO (fd, out); + + priv = this->private; + children = priv->children; + + ALLOC_OR_GOTO (local, afr_local_t, out); + + frame->local = local; + + if (priv->read_child != -1) { + call_child = priv->read_child; + + /* + if read fails from the read child, we try + all children starting with the first one + */ + local->cont.readv.last_tried = -1; + } else { + call_child = afr_first_up_child (priv); + if (call_child == -1) { + op_errno = ENOTCONN; + gf_log (this->name, GF_LOG_ERROR, + "no child is up :("); + goto out; + } + + local->cont.readv.last_tried = call_child; + } + + local->fd = fd_ref (fd); + + local->cont.readv.size = size; + local->cont.readv.offset = offset; + + STACK_WIND_COOKIE (frame, afr_readv_cbk, + (void *) (long) call_child, + children[call_child], + children[call_child]->fops->readv, + fd, size, offset); + + op_ret = 0; +out: + if (op_ret == -1) { + AFR_STACK_UNWIND (frame, op_ret, op_errno, 
NULL, 0, NULL); + } + return 0; +} + +/* }}} */ diff --git a/xlators/cluster/afr/src/afr-inode-read.h b/xlators/cluster/afr/src/afr-inode-read.h new file mode 100644 index 000000000..6b3bd2da8 --- /dev/null +++ b/xlators/cluster/afr/src/afr-inode-read.h @@ -0,0 +1,47 @@ +/* + Copyright (c) 2007, 2008 Z RESEARCH, Inc. <http://www.zresearch.com> + This file is part of GlusterFS. + + GlusterFS is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published + by the Free Software Foundation; either version 3 of the License, + or (at your option) any later version. + + GlusterFS is distributed in the hope that it will be useful, but + WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program. If not, see + <http://www.gnu.org/licenses/>. +*/ + +#ifndef __INODE_READ_H__ +#define __INODE_READ_H__ + +int32_t +afr_access (call_frame_t *frame, xlator_t *this, + loc_t *loc, int32_t mask); + +int32_t +afr_stat (call_frame_t *frame, xlator_t *this, + loc_t *loc); + +int32_t +afr_fstat (call_frame_t *frame, xlator_t *this, + fd_t *fd); + +int32_t +afr_readlink (call_frame_t *frame, xlator_t *this, + loc_t *loc, size_t size); + +int32_t +afr_readv (call_frame_t *frame, xlator_t *this, + fd_t *fd, size_t size, off_t offset); + +int32_t +afr_getxattr (call_frame_t *frame, xlator_t *this, + loc_t *loc, const char *name); + +#endif /* __INODE_READ_H__ */ diff --git a/xlators/cluster/afr/src/afr-inode-write.c b/xlators/cluster/afr/src/afr-inode-write.c new file mode 100644 index 000000000..267350b2c --- /dev/null +++ b/xlators/cluster/afr/src/afr-inode-write.c @@ -0,0 +1,2024 @@ +/* + Copyright (c) 2007, 2008 Z RESEARCH, Inc. <http://www.zresearch.com> + This file is part of GlusterFS. 
+ + GlusterFS is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published + by the Free Software Foundation; either version 3 of the License, + or (at your option) any later version. + + GlusterFS is distributed in the hope that it will be useful, but + WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program. If not, see + <http://www.gnu.org/licenses/>. +*/ + + +#include <libgen.h> +#include <unistd.h> +#include <fnmatch.h> +#include <sys/time.h> +#include <stdlib.h> +#include <signal.h> + +#ifndef _CONFIG_H +#define _CONFIG_H +#include "config.h" +#endif + +#include "glusterfs.h" +#include "afr.h" +#include "dict.h" +#include "xlator.h" +#include "hashfn.h" +#include "logging.h" +#include "stack.h" +#include "list.h" +#include "call-stub.h" +#include "defaults.h" +#include "common-utils.h" +#include "compat-errno.h" +#include "compat.h" + +#include "afr.h" +#include "afr-transaction.h" + + +/* {{{ chmod */ + + +int +afr_chmod_unwind (call_frame_t *frame, xlator_t *this) +{ + afr_local_t * local = NULL; + afr_private_t * priv = NULL; + call_frame_t *main_frame = NULL; + + local = frame->local; + priv = this->private; + + LOCK (&frame->lock); + { + if (local->transaction.main_frame) + main_frame = local->transaction.main_frame; + local->transaction.main_frame = NULL; + } + UNLOCK (&frame->lock); + + if (main_frame) { + local->cont.chmod.buf.st_ino = local->cont.chmod.ino; + AFR_STACK_UNWIND (main_frame, local->op_ret, local->op_errno, + &local->cont.chmod.buf); + } + return 0; +} + + +int +afr_chmod_wind_cbk (call_frame_t *frame, void *cookie, xlator_t *this, + int32_t op_ret, int32_t op_errno, struct stat *buf) +{ + afr_local_t * local = NULL; + afr_private_t * priv = NULL; + + int call_count = 
-1; + int child_index = (long) cookie; + int need_unwind = 0; + + local = frame->local; + priv = this->private; + + LOCK (&frame->lock); + { + if (child_went_down (op_ret, op_errno)) + afr_transaction_child_died (frame, this, child_index); + + if (op_ret != -1) { + if (local->success_count == 0) { + local->op_ret = op_ret; + local->cont.chmod.buf = *buf; + } + local->success_count++; + + if (local->success_count == priv->wait_count) { + need_unwind = 1; + } + } + + local->op_errno = op_errno; + } + UNLOCK (&frame->lock); + + if (need_unwind) + afr_chmod_unwind (frame, this); + + call_count = afr_frame_return (frame); + + if (call_count == 0) { + local->transaction.resume (frame, this); + } + + return 0; +} + + +int +afr_chmod_wind (call_frame_t *frame, xlator_t *this) +{ + afr_local_t * local = NULL; + afr_private_t * priv = NULL; + + int i = 0; + int call_count = -1; + + local = frame->local; + priv = this->private; + + call_count = afr_up_children_count (priv->child_count, local->child_up); + + if (call_count == 0) { + local->transaction.resume (frame, this); + return 0; + } + + local->call_count = call_count; + + for (i = 0; i < priv->child_count; i++) { + if (local->child_up[i]) { + STACK_WIND_COOKIE (frame, afr_chmod_wind_cbk, (void *) (long) i, + priv->children[i], + priv->children[i]->fops->chmod, + &local->loc, + local->cont.chmod.mode); + + if (!--call_count) + break; + } + } + + return 0; +} + + +int +afr_chmod_done (call_frame_t *frame, xlator_t *this) +{ + afr_local_t * local = NULL; + + local = frame->local; + + local->transaction.unwind (frame, this); + + AFR_STACK_DESTROY (frame); + + return 0; +} + + +int32_t +afr_chmod (call_frame_t *frame, xlator_t *this, + loc_t *loc, mode_t mode) +{ + afr_private_t * priv = NULL; + afr_local_t * local = NULL; + call_frame_t * transaction_frame = NULL; + + int ret = -1; + + int op_ret = -1; + int op_errno = 0; + + VALIDATE_OR_GOTO (frame, out); + VALIDATE_OR_GOTO (this, out); + VALIDATE_OR_GOTO (this->private, 
out); + + priv = this->private; + + transaction_frame = copy_frame (frame); + if (!transaction_frame) { + gf_log (this->name, GF_LOG_ERROR, + "out of memory :("); + goto out; + } + + ALLOC_OR_GOTO (local, afr_local_t, out); + ret = AFR_LOCAL_INIT (local, priv); + if (ret < 0) { + op_errno = -ret; + goto out; + } + + transaction_frame->local = local; + + local->cont.chmod.mode = mode; + local->cont.chmod.ino = loc->inode->ino; + + local->transaction.fop = afr_chmod_wind; + local->transaction.done = afr_chmod_done; + local->transaction.unwind = afr_chmod_unwind; + + loc_copy (&local->loc, loc); + + local->transaction.main_frame = frame; + local->transaction.start = 0; + local->transaction.len = 0; + local->transaction.pending = AFR_METADATA_PENDING; + + afr_transaction (transaction_frame, this, AFR_METADATA_TRANSACTION); + + op_ret = 0; +out: + if (op_ret == -1) { + if (transaction_frame) + AFR_STACK_DESTROY (transaction_frame); + AFR_STACK_UNWIND (frame, op_ret, op_errno, NULL); + } + + return 0; +} + +/* }}} */ + + +/* {{{ fchmod */ + +int +afr_fchmod_unwind (call_frame_t *frame, xlator_t *this) +{ + afr_local_t * local = NULL; + afr_private_t * priv = NULL; + call_frame_t *main_frame = NULL; + + local = frame->local; + priv = this->private; + + LOCK (&frame->lock); + { + if (local->transaction.main_frame) + main_frame = local->transaction.main_frame; + local->transaction.main_frame = NULL; + } + UNLOCK (&frame->lock); + + if (main_frame) { + local->cont.fchmod.buf.st_ino = local->cont.fchmod.ino; + AFR_STACK_UNWIND (main_frame, local->op_ret, local->op_errno, + &local->cont.fchmod.buf); + } + return 0; +} + + +int +afr_fchmod_wind_cbk (call_frame_t *frame, void *cookie, xlator_t *this, + int32_t op_ret, int32_t op_errno, struct stat *buf) +{ + afr_local_t * local = NULL; + afr_private_t * priv = NULL; + + int call_count = -1; + int child_index = (long) cookie; + int need_unwind = 0; + + local = frame->local; + priv = this->private; + + LOCK (&frame->lock); + { + 
if (child_went_down (op_ret, op_errno)) + afr_transaction_child_died (frame, this, child_index); + + if (op_ret != -1) { + if (local->success_count == 0) { + local->op_ret = op_ret; + local->cont.fchmod.buf = *buf; + } + local->success_count++; + + if (local->success_count == priv->wait_count) { + need_unwind = 1; + } + } + + local->op_errno = op_errno; + } + UNLOCK (&frame->lock); + + if (need_unwind) + afr_fchmod_unwind (frame, this); + + call_count = afr_frame_return (frame); + + if (call_count == 0) { + local->transaction.resume (frame, this); + } + + return 0; +} + + +int +afr_fchmod_wind (call_frame_t *frame, xlator_t *this) +{ + afr_local_t * local = NULL; + afr_private_t * priv = NULL; + + int i = 0; + int call_count = -1; + + local = frame->local; + priv = this->private; + + call_count = afr_up_children_count (priv->child_count, local->child_up); + + if (call_count == 0) { + local->transaction.resume (frame, this); + return 0; + } + + local->call_count = call_count; + + for (i = 0; i < priv->child_count; i++) { + if (local->child_up[i]) { + STACK_WIND_COOKIE (frame, afr_fchmod_wind_cbk, (void *) (long) i, + priv->children[i], + priv->children[i]->fops->fchmod, + local->fd, + local->cont.fchmod.mode); + + if (!--call_count) + break; + } + } + + return 0; +} + + +int +afr_fchmod_done (call_frame_t *frame, xlator_t *this) +{ + afr_local_t * local = NULL; + + local = frame->local; + + local->transaction.unwind (frame, this); + + AFR_STACK_DESTROY (frame); + + return 0; +} + + +int32_t +afr_fchmod (call_frame_t *frame, xlator_t *this, + fd_t *fd, mode_t mode) +{ + afr_private_t * priv = NULL; + afr_local_t * local = NULL; + call_frame_t * transaction_frame = NULL; + + int ret = -1; + + int op_ret = -1; + int op_errno = 0; + + VALIDATE_OR_GOTO (frame, out); + VALIDATE_OR_GOTO (this, out); + VALIDATE_OR_GOTO (this->private, out); + + priv = this->private; + + transaction_frame = copy_frame (frame); + if (!transaction_frame) { + gf_log (this->name, GF_LOG_ERROR, + 
"out of memory :("); + goto out; + } + + ALLOC_OR_GOTO (local, afr_local_t, out); + ret = AFR_LOCAL_INIT (local, priv); + if (ret < 0) { + op_errno = -ret; + goto out; + } + + transaction_frame->local = local; + + local->cont.fchmod.mode = mode; + local->cont.fchmod.ino = fd->inode->ino; + + local->transaction.fop = afr_fchmod_wind; + local->transaction.done = afr_fchmod_done; + local->transaction.unwind = afr_fchmod_unwind; + + local->fd = fd_ref (fd); + + local->transaction.main_frame = frame; + local->transaction.start = 0; + local->transaction.len = 0; + local->transaction.pending = AFR_METADATA_PENDING; + + afr_transaction (transaction_frame, this, AFR_METADATA_TRANSACTION); + + op_ret = 0; +out: + if (op_ret == -1) { + if (transaction_frame) + AFR_STACK_DESTROY (transaction_frame); + AFR_STACK_UNWIND (frame, op_ret, op_errno, NULL); + } + + return 0; +} + +/* }}} */ + +/* {{{ chown */ + +int +afr_chown_unwind (call_frame_t *frame, xlator_t *this) +{ + afr_local_t * local = NULL; + afr_private_t * priv = NULL; + call_frame_t *main_frame = NULL; + + local = frame->local; + priv = this->private; + + LOCK (&frame->lock); + { + if (local->transaction.main_frame) + main_frame = local->transaction.main_frame; + local->transaction.main_frame = NULL; + } + UNLOCK (&frame->lock); + + if (main_frame) { + local->cont.chown.buf.st_ino = local->cont.chown.ino; + AFR_STACK_UNWIND (main_frame, local->op_ret, local->op_errno, + &local->cont.chown.buf); + } + return 0; +} + + +int +afr_chown_wind_cbk (call_frame_t *frame, void *cookie, xlator_t *this, + int32_t op_ret, int32_t op_errno, struct stat *buf) +{ + afr_local_t * local = NULL; + afr_private_t * priv = NULL; + + int call_count = -1; + int child_index = (long) cookie; + int need_unwind = 0; + + local = frame->local; + priv = this->private; + + LOCK (&frame->lock); + { + if (child_went_down (op_ret, op_errno)) + afr_transaction_child_died (frame, this, child_index); + + if (op_ret != -1) { + if (local->success_count == 
0) { + local->op_ret = op_ret; + local->cont.chown.buf = *buf; + } + local->success_count++; + + if (local->success_count == priv->wait_count) { + need_unwind = 1; + } + } + + local->op_errno = op_errno; + } + UNLOCK (&frame->lock); + + if (need_unwind) { + local->transaction.unwind (frame, this); + } + + call_count = afr_frame_return (frame); + + if (call_count == 0) { + local->transaction.resume (frame, this); + } + + return 0; +} + + +int +afr_chown_wind (call_frame_t *frame, xlator_t *this) +{ + afr_local_t * local = NULL; + afr_private_t * priv = NULL; + + int call_count = -1; + int i = 0; + + local = frame->local; + priv = this->private; + + call_count = afr_up_children_count (priv->child_count, local->child_up); + + if (call_count == 0) { + local->transaction.resume (frame, this); + return 0; + } + + local->call_count = call_count; + + for (i = 0; i < priv->child_count; i++) { + if (local->child_up[i]) { + STACK_WIND_COOKIE (frame, afr_chown_wind_cbk, (void *) (long) i, + priv->children[i], + priv->children[i]->fops->chown, + &local->loc, local->cont.chown.uid, + local->cont.chown.gid); + + if (!--call_count) + break; + } + } + + return 0; +} + + +int +afr_chown_done (call_frame_t *frame, xlator_t *this) +{ + afr_local_t *local = NULL; + + local = frame->local; + + local->transaction.unwind (frame, this); + + AFR_STACK_DESTROY (frame); + + return 0; +} + + +int +afr_chown (call_frame_t *frame, xlator_t *this, + loc_t *loc, uid_t uid, gid_t gid) +{ + afr_private_t * priv = NULL; + afr_local_t * local = NULL; + call_frame_t *transaction_frame = NULL; + + int ret = -1; + + int op_ret = -1; + int op_errno = 0; + + VALIDATE_OR_GOTO (frame, out); + VALIDATE_OR_GOTO (this, out); + VALIDATE_OR_GOTO (this->private, out); + + priv = this->private; + + transaction_frame = copy_frame (frame); + if (!transaction_frame) { + gf_log (this->name, GF_LOG_ERROR, + "out of memory :("); + goto out; + } + + ALLOC_OR_GOTO (local, afr_local_t, out); + + ret = AFR_LOCAL_INIT (local, 
priv); + if (ret < 0) { + op_errno = -ret; + goto out; + } + + transaction_frame->local = local; + + local->cont.chown.uid = uid; + local->cont.chown.gid = gid; + local->cont.chown.ino = loc->inode->ino; + + local->transaction.fop = afr_chown_wind; + local->transaction.done = afr_chown_done; + local->transaction.unwind = afr_chown_unwind; + + loc_copy (&local->loc, loc); + + local->transaction.main_frame = frame; + local->transaction.start = 0; + local->transaction.len = 0; + local->transaction.pending = AFR_METADATA_PENDING; + + afr_transaction (transaction_frame, this, AFR_METADATA_TRANSACTION); + + op_ret = 0; +out: + if (op_ret == -1) { + if (transaction_frame) + AFR_STACK_DESTROY (transaction_frame); + AFR_STACK_UNWIND (frame, op_ret, op_errno, NULL); + } + + return 0; +} + + +/* }}} */ + +/* {{{ fchown */ +/* Unwind the caller's frame at most once: main_frame is claimed and cleared under frame->lock, then answered with the saved op_ret/op_errno and the stat buf carrying the inode number recorded in cont.fchown.ino. */ +int +afr_fchown_unwind (call_frame_t *frame, xlator_t *this) +{ + afr_local_t * local = NULL; + afr_private_t * priv = NULL; + call_frame_t *main_frame = NULL; + + local = frame->local; + priv = this->private; + + LOCK (&frame->lock); + { + if (local->transaction.main_frame) + main_frame = local->transaction.main_frame; + local->transaction.main_frame = NULL; + } + UNLOCK (&frame->lock); + + if (main_frame) { + local->cont.fchown.buf.st_ino = local->cont.fchown.ino; + AFR_STACK_UNWIND (main_frame, local->op_ret, local->op_errno, + &local->cont.fchown.buf); + } + return 0; +} + + +int +afr_fchown_wind_cbk (call_frame_t *frame, void *cookie, xlator_t *this, + int32_t op_ret, int32_t op_errno, struct stat *buf) +{ + afr_local_t * local = NULL; + afr_private_t * priv = NULL; + + int call_count = -1; + int child_index = (long) cookie; + int need_unwind = 0; + + local = frame->local; + priv = this->private; + + LOCK (&frame->lock); + { + if (child_went_down (op_ret, op_errno)) + afr_transaction_child_died (frame, this, child_index); + + if (op_ret != -1) { + if (local->success_count == 0) { + local->op_ret = op_ret; + local->cont.fchown.buf = *buf; + } +
local->success_count++; + + if (local->success_count == priv->wait_count) { + need_unwind = 1; + } + } + + local->op_errno = op_errno; + } + UNLOCK (&frame->lock); + + if (need_unwind) { + local->transaction.unwind (frame, this); + } + + call_count = afr_frame_return (frame); + + if (call_count == 0) { + local->transaction.resume (frame, this); + } + + return 0; +} + + +int +afr_fchown_wind (call_frame_t *frame, xlator_t *this) +{ + afr_local_t * local = NULL; + afr_private_t * priv = NULL; + + int call_count = -1; + int i = 0; + + local = frame->local; + priv = this->private; + + call_count = afr_up_children_count (priv->child_count, local->child_up); + + if (call_count == 0) { + local->transaction.resume (frame, this); + return 0; + } + + local->call_count = call_count; + + for (i = 0; i < priv->child_count; i++) { + if (local->child_up[i]) { + STACK_WIND_COOKIE (frame, afr_fchown_wind_cbk, (void *) (long) i, + priv->children[i], + priv->children[i]->fops->fchown, + local->fd, local->cont.fchown.uid, + local->cont.fchown.gid); + + if (!--call_count) + break; + } + } + + return 0; +} + + +int +afr_fchown_done (call_frame_t *frame, xlator_t *this) +{ + afr_local_t *local = NULL; + + local = frame->local; + + local->transaction.unwind (frame, this); + + AFR_STACK_DESTROY (frame); + + return 0; +} + + +int +afr_fchown (call_frame_t *frame, xlator_t *this, + fd_t *fd, uid_t uid, gid_t gid) +{ + afr_private_t * priv = NULL; + afr_local_t * local = NULL; + call_frame_t *transaction_frame = NULL; + + int ret = -1; + + int op_ret = -1; + int op_errno = 0; + + VALIDATE_OR_GOTO (frame, out); + VALIDATE_OR_GOTO (this, out); + VALIDATE_OR_GOTO (this->private, out); + + priv = this->private; + + transaction_frame = copy_frame (frame); + if (!transaction_frame) { + gf_log (this->name, GF_LOG_ERROR, + "out of memory :("); + goto out; + } + + ALLOC_OR_GOTO (local, afr_local_t, out); + + ret = AFR_LOCAL_INIT (local, priv); + if (ret < 0) { + op_errno = -ret; + goto out; + } + + 
transaction_frame->local = local; + + local->cont.fchown.uid = uid; + local->cont.fchown.gid = gid; + local->cont.fchown.ino = fd->inode->ino; + + local->transaction.fop = afr_fchown_wind; + local->transaction.done = afr_fchown_done; + local->transaction.unwind = afr_fchown_unwind; + + local->fd = fd_ref (fd); + + local->transaction.main_frame = frame; + local->transaction.start = 0; + local->transaction.len = 0; + local->transaction.pending = AFR_METADATA_PENDING; + + afr_transaction (transaction_frame, this, AFR_METADATA_TRANSACTION); + + op_ret = 0; +out: + if (op_ret == -1) { + if (transaction_frame) + AFR_STACK_DESTROY (transaction_frame); + AFR_STACK_UNWIND (frame, op_ret, op_errno, NULL); + } + + return 0; +} + +/* }}} */ + +/* {{{ writev */ + +int +afr_writev_unwind (call_frame_t *frame, xlator_t *this) +{ + afr_local_t * local = NULL; + afr_private_t * priv = NULL; + call_frame_t *main_frame = NULL; + + local = frame->local; + priv = this->private; + + LOCK (&frame->lock); + { + if (local->transaction.main_frame) + main_frame = local->transaction.main_frame; + local->transaction.main_frame = NULL; + } + UNLOCK (&frame->lock); + + if (main_frame) { + local->cont.writev.buf.st_ino = local->cont.writev.ino; + AFR_STACK_UNWIND (main_frame, local->op_ret, local->op_errno, + &local->cont.writev.buf); + } + return 0; +} + + +int +afr_writev_wind_cbk (call_frame_t *frame, void *cookie, xlator_t *this, + int32_t op_ret, int32_t op_errno, struct stat *buf) +{ + afr_local_t * local = NULL; + afr_private_t * priv = NULL; + + int child_index = (long) cookie; + int call_count = -1; + int need_unwind = 0; + + local = frame->local; + priv = this->private; + + LOCK (&frame->lock); + { + if (child_went_down (op_ret, op_errno)) + afr_transaction_child_died (frame, this, child_index); + + if (op_ret != -1) { + if (local->success_count == 0) { + local->op_ret = op_ret; + local->cont.writev.buf = *buf; + } + local->success_count++; + + if (local->success_count == 
priv->wait_count) { + need_unwind = 1; + } + } + + local->op_errno = op_errno; + } + UNLOCK (&frame->lock); + + call_count = afr_frame_return (frame); + + if (call_count == 0) { + local->transaction.unwind (frame, this); + + local->transaction.resume (frame, this); + } + + return 0; +} + + +int +afr_writev_wind (call_frame_t *frame, xlator_t *this) +{ + afr_local_t *local = NULL; + afr_private_t *priv = NULL; + + int i = 0; + int call_count = -1; + + local = frame->local; + priv = this->private; + + call_count = afr_up_children_count (priv->child_count, local->child_up); + + if (call_count == 0) { + local->transaction.resume (frame, this); + return 0; + } + + local->call_count = call_count; + + for (i = 0; i < priv->child_count; i++) { + if (local->child_up[i]) { + STACK_WIND_COOKIE (frame, afr_writev_wind_cbk, + (void *) (long) i, + priv->children[i], + priv->children[i]->fops->writev, + local->fd, + local->cont.writev.vector, + local->cont.writev.count, + local->cont.writev.offset); + + if (!--call_count) + break; + } + } + + return 0; +} + + +int +afr_writev_done (call_frame_t *frame, xlator_t *this) +{ + afr_local_t *local = NULL; + + local = frame->local; + + if (local->cont.writev.refs) + dict_unref (local->cont.writev.refs); + local->cont.writev.refs = NULL; + + local->transaction.unwind (frame, this); + + AFR_STACK_DESTROY (frame); + + return 0; +} + + +int +afr_writev (call_frame_t *frame, xlator_t *this, fd_t *fd, + struct iovec *vector, int32_t count, off_t offset) +{ + afr_private_t * priv = NULL; + afr_local_t * local = NULL; + call_frame_t *transaction_frame = NULL; + + int ret = -1; + + int op_ret = -1; + int op_errno = 0; + + VALIDATE_OR_GOTO (frame, out); + VALIDATE_OR_GOTO (this, out); + VALIDATE_OR_GOTO (this->private, out); + + priv = this->private; + + transaction_frame = copy_frame (frame); + if (!transaction_frame) { + gf_log (this->name, GF_LOG_ERROR, + "out of memory :("); + goto out; + } + + ALLOC_OR_GOTO (local, afr_local_t, out); + + ret 
= AFR_LOCAL_INIT (local, priv); + if (ret < 0) { + op_errno = -ret; + goto out; + } + + transaction_frame->local = local; + + local->op = GF_FOP_WRITE; + local->cont.writev.vector = iov_dup (vector, count); + local->cont.writev.count = count; + local->cont.writev.offset = offset; + local->cont.writev.ino = fd->inode->ino; + + if (frame->root->req_refs) + local->cont.writev.refs = dict_ref (frame->root->req_refs); + + local->transaction.fop = afr_writev_wind; + local->transaction.done = afr_writev_done; + local->transaction.unwind = afr_writev_unwind; + + local->fd = fd_ref (fd); + + local->transaction.main_frame = frame; + if (fd->flags & O_APPEND) { + local->transaction.start = 0; + local->transaction.len = 0; + } else { + local->transaction.start = offset; + local->transaction.len = iov_length (vector, count); + } + + local->transaction.pending = AFR_DATA_PENDING; + + afr_transaction (transaction_frame, this, AFR_DATA_TRANSACTION); + + op_ret = 0; +out: + if (op_ret == -1) { + if (transaction_frame) + AFR_STACK_DESTROY (transaction_frame); + AFR_STACK_UNWIND (frame, op_ret, op_errno, NULL); + } + + return 0; +} + + +/* }}} */ + +/* {{{ truncate */ + +int +afr_truncate_unwind (call_frame_t *frame, xlator_t *this) +{ + afr_local_t * local = NULL; + afr_private_t * priv = NULL; + call_frame_t *main_frame = NULL; + + local = frame->local; + priv = this->private; + + LOCK (&frame->lock); + { + if (local->transaction.main_frame) + main_frame = local->transaction.main_frame; + local->transaction.main_frame = NULL; + } + UNLOCK (&frame->lock); + + if (main_frame) { + local->cont.truncate.buf.st_ino = local->cont.truncate.ino; + AFR_STACK_UNWIND (main_frame, local->op_ret, local->op_errno, + &local->cont.truncate.buf); + } + return 0; +} + + +int +afr_truncate_wind_cbk (call_frame_t *frame, void *cookie, xlator_t *this, + int32_t op_ret, int32_t op_errno, struct stat *buf) +{ + afr_local_t * local = NULL; + afr_private_t * priv = NULL; + + int child_index = (long) cookie; 
+ int call_count = -1; + int need_unwind = 0; + + local = frame->local; + priv = this->private; + + LOCK (&frame->lock); + { + if (child_went_down (op_ret, op_errno)) + afr_transaction_child_died (frame, this, child_index); + + if (op_ret != -1) { + if (local->success_count == 0) { + local->op_ret = op_ret; + local->cont.truncate.buf = *buf; + } + local->success_count++; + + if (local->success_count == priv->wait_count) { + need_unwind = 1; + } + } + local->op_errno = op_errno; + } + UNLOCK (&frame->lock); + + if (need_unwind) + local->transaction.unwind (frame, this); + + call_count = afr_frame_return (frame); + + if (call_count == 0) { + local->transaction.resume (frame, this); + } + + return 0; +} + + +int32_t +afr_truncate_wind (call_frame_t *frame, xlator_t *this) +{ + afr_local_t *local = NULL; + afr_private_t *priv = NULL; + + int call_count = -1; + int i = 0; + + local = frame->local; + priv = this->private; + + call_count = afr_up_children_count (priv->child_count, local->child_up); + + if (call_count == 0) { + local->transaction.resume (frame, this); + return 0; + } + + local->call_count = call_count; + + for (i = 0; i < priv->child_count; i++) { + if (local->child_up[i]) { + STACK_WIND_COOKIE (frame, afr_truncate_wind_cbk, + (void *) (long) i, + priv->children[i], + priv->children[i]->fops->truncate, + &local->loc, + local->cont.truncate.offset); + + if (!--call_count) + break; + } + } + + return 0; +} + + +int +afr_truncate_done (call_frame_t *frame, xlator_t *this) +{ + afr_local_t *local = NULL; + + local = frame->local; + + local->transaction.unwind (frame, this); + + AFR_STACK_DESTROY (frame); + + return 0; +} + + +int +afr_truncate (call_frame_t *frame, xlator_t *this, + loc_t *loc, off_t offset) +{ + afr_private_t * priv = NULL; + afr_local_t * local = NULL; + call_frame_t *transaction_frame = NULL; + + int ret = -1; + + int op_ret = -1; + int op_errno = 0; + + VALIDATE_OR_GOTO (frame, out); + VALIDATE_OR_GOTO (this, out); + VALIDATE_OR_GOTO 
(this->private, out); + + priv = this->private; + + transaction_frame = copy_frame (frame); + if (!transaction_frame) { + gf_log (this->name, GF_LOG_ERROR, + "out of memory :("); + goto out; + } + + ALLOC_OR_GOTO (local, afr_local_t, out); + + ret = AFR_LOCAL_INIT (local, priv); + if (ret < 0) { + op_errno = -ret; + goto out; + } + + transaction_frame->local = local; + + local->op_ret = -1; + + local->cont.truncate.offset = offset; + local->cont.truncate.ino = loc->inode->ino; + + local->transaction.fop = afr_truncate_wind; + local->transaction.done = afr_truncate_done; + local->transaction.unwind = afr_truncate_unwind; + + loc_copy (&local->loc, loc); + + local->transaction.main_frame = frame; + local->transaction.start = 0; + local->transaction.len = offset; + local->transaction.pending = AFR_DATA_PENDING; + + afr_transaction (transaction_frame, this, AFR_DATA_TRANSACTION); + + op_ret = 0; +out: + if (op_ret == -1) { + if (transaction_frame) + AFR_STACK_DESTROY (transaction_frame); + AFR_STACK_UNWIND (frame, op_ret, op_errno, NULL); + } + + return 0; +} + + +/* }}} */ + +/* {{{ ftruncate */ + + +int +afr_ftruncate_unwind (call_frame_t *frame, xlator_t *this) +{ + afr_local_t * local = NULL; + afr_private_t * priv = NULL; + call_frame_t *main_frame = NULL; + + local = frame->local; + priv = this->private; + + LOCK (&frame->lock); + { + if (local->transaction.main_frame) + main_frame = local->transaction.main_frame; + local->transaction.main_frame = NULL; + } + UNLOCK (&frame->lock); + + if (main_frame) { + local->cont.ftruncate.buf.st_ino = local->cont.ftruncate.ino; + AFR_STACK_UNWIND (main_frame, local->op_ret, local->op_errno, + &local->cont.ftruncate.buf); + } + return 0; +} + + +int +afr_ftruncate_wind_cbk (call_frame_t *frame, void *cookie, xlator_t *this, + int32_t op_ret, int32_t op_errno, struct stat *buf) +{ + afr_local_t * local = NULL; + afr_private_t * priv = NULL; + + int child_index = (long) cookie; + int call_count = -1; + int need_unwind = 0; + + 
local = frame->local; + priv = this->private; + + LOCK (&frame->lock); + { + if (child_went_down (op_ret, op_errno)) + afr_transaction_child_died (frame, this, child_index); + + if (op_ret != -1) { + if (local->success_count == 0) { + local->op_ret = op_ret; + local->cont.ftruncate.buf = *buf; + } + local->success_count++; + + if (local->success_count == priv->wait_count) { + need_unwind = 1; + } + } + local->op_errno = op_errno; + } + UNLOCK (&frame->lock); + + if (need_unwind) + local->transaction.unwind (frame, this); + + call_count = afr_frame_return (frame); + + if (call_count == 0) { + local->transaction.resume (frame, this); + } + + return 0; +} + + +int +afr_ftruncate_wind (call_frame_t *frame, xlator_t *this) +{ + afr_local_t *local = NULL; + afr_private_t *priv = NULL; + + int call_count = -1; + int i = 0; + + local = frame->local; + priv = this->private; + + call_count = afr_up_children_count (priv->child_count, local->child_up); + + if (call_count == 0) { + local->transaction.resume (frame, this); + return 0; + } + + local->call_count = call_count; + + for (i = 0; i < priv->child_count; i++) { + if (local->child_up[i]) { + STACK_WIND_COOKIE (frame, afr_ftruncate_wind_cbk, + (void *) (long) i, + priv->children[i], + priv->children[i]->fops->ftruncate, + local->fd, local->cont.ftruncate.offset); + + if (!--call_count) + break; + } + } + + return 0; +} + + +int +afr_ftruncate_done (call_frame_t *frame, xlator_t *this) +{ + afr_local_t *local = NULL; + + local = frame->local; + + local->transaction.unwind (frame, this); + + AFR_STACK_DESTROY (frame); + + return 0; +} + + +int +afr_ftruncate (call_frame_t *frame, xlator_t *this, + fd_t *fd, off_t offset) +{ + afr_private_t * priv = NULL; + afr_local_t * local = NULL; + call_frame_t *transaction_frame = NULL; + + int ret = -1; + + int op_ret = -1; + int op_errno = 0; + + VALIDATE_OR_GOTO (frame, out); + VALIDATE_OR_GOTO (this, out); + VALIDATE_OR_GOTO (this->private, out); + + priv = this->private; + + 
transaction_frame = copy_frame (frame); + if (!transaction_frame) { + gf_log (this->name, GF_LOG_ERROR, + "out of memory :("); + goto out; + } + + ALLOC_OR_GOTO (local, afr_local_t, out); + + ret = AFR_LOCAL_INIT (local, priv); + if (ret < 0) { + op_errno = -ret; + goto out; + } + + transaction_frame->local = local; + + local->op = GF_FOP_FTRUNCATE; + local->op_ret = -1; + + local->cont.ftruncate.offset = offset; + local->cont.ftruncate.ino = fd->inode->ino; + + local->transaction.fop = afr_ftruncate_wind; + local->transaction.done = afr_ftruncate_done; + local->transaction.unwind = afr_ftruncate_unwind; + + local->fd = fd_ref (fd); + + local->transaction.main_frame = frame; + local->transaction.start = 0; + local->transaction.len = offset; + local->transaction.pending = AFR_DATA_PENDING; + + afr_transaction (transaction_frame, this, AFR_DATA_TRANSACTION); + + op_ret = 0; +out: + if (op_ret == -1) { + if (transaction_frame) + AFR_STACK_DESTROY (transaction_frame); + AFR_STACK_UNWIND (frame, op_ret, op_errno, NULL); + } + + return 0; +} + +/* }}} */ + +/* {{{ utimens */ + +/* Unwind the caller's frame at most once: main_frame is claimed and cleared under frame->lock, then answered with the saved op_ret/op_errno and the stat buf carrying the inode number recorded in cont.utimens.ino. */ +int +afr_utimens_unwind (call_frame_t *frame, xlator_t *this) +{ + afr_local_t * local = NULL; + afr_private_t * priv = NULL; + call_frame_t *main_frame = NULL; + + local = frame->local; + priv = this->private; + + LOCK (&frame->lock); + { + if (local->transaction.main_frame) + main_frame = local->transaction.main_frame; + local->transaction.main_frame = NULL; + } + UNLOCK (&frame->lock); + + if (main_frame) { + local->cont.utimens.buf.st_ino = local->cont.utimens.ino; + AFR_STACK_UNWIND (main_frame, local->op_ret, local->op_errno, + &local->cont.utimens.buf); + } + return 0; +} + +/* Per-child utimens reply handler: keeps the first successful op_ret and stat buf, asks for an early unwind once success_count reaches priv->wait_count, and resumes the transaction when the last reply has arrived. */ +int +afr_utimens_wind_cbk (call_frame_t *frame, void *cookie, xlator_t *this, + int32_t op_ret, int32_t op_errno, struct stat *buf) +{ + afr_local_t * local = NULL; + afr_private_t * priv = NULL; + + int child_index = (long) cookie; + int call_count = -1; + int need_unwind = 0; /* must start clear: starting at 1 unwound the caller on the first reply, even a failed one */ + + local = frame->local; + priv =
this->private; + + LOCK (&frame->lock); + { + if (child_went_down (op_ret, op_errno)) + afr_transaction_child_died (frame, this, child_index); + + if (op_ret != -1) { + if (local->success_count == 0) { + local->op_ret = op_ret; + local->cont.utimens.buf = *buf; + } + local->success_count++; + + if (local->success_count == priv->wait_count) { + need_unwind = 1; + } + } + + local->op_errno = op_errno; + } + UNLOCK (&frame->lock); + + if (need_unwind) + local->transaction.unwind (frame, this); + + call_count = afr_frame_return (frame); + + if (call_count == 0) { + local->transaction.resume (frame, this); + } + + return 0; +} + + +int +afr_utimens_wind (call_frame_t *frame, xlator_t *this) +{ + afr_local_t *local = NULL; + afr_private_t *priv = NULL; + + int call_count = -1; + int i = 0; + + local = frame->local; + priv = this->private; + + call_count = afr_up_children_count (priv->child_count, local->child_up); + + if (call_count == 0) { + local->transaction.resume (frame, this); + return 0; + } + + local->call_count = call_count; + + for (i = 0; i < priv->child_count; i++) { + if (local->child_up[i]) { + STACK_WIND_COOKIE (frame, afr_utimens_wind_cbk, + (void *) (long) i, + priv->children[i], + priv->children[i]->fops->utimens, + &local->loc, + local->cont.utimens.tv); + + if (!--call_count) + break; + } + } + + return 0; +} + + +int +afr_utimens_done (call_frame_t *frame, xlator_t *this) +{ + afr_local_t * local = NULL; + + local = frame->local; + + local->transaction.unwind (frame, this); + + AFR_STACK_DESTROY (frame); + + return 0; +} + + +int +afr_utimens (call_frame_t *frame, xlator_t *this, + loc_t *loc, struct timespec tv[2]) +{ + afr_private_t * priv = NULL; + afr_local_t * local = NULL; + call_frame_t *transaction_frame = NULL; + + int ret = -1; + + int op_ret = -1; + int op_errno = 0; + + VALIDATE_OR_GOTO (frame, out); + VALIDATE_OR_GOTO (this, out); + VALIDATE_OR_GOTO (this->private, out); + + priv = this->private; + + transaction_frame = copy_frame 
(frame); + if (!transaction_frame) { + gf_log (this->name, GF_LOG_ERROR, + "out of memory :("); + goto out; + } + + ALLOC_OR_GOTO (local, afr_local_t, out); + + ret = AFR_LOCAL_INIT (local, priv); + if (ret < 0) { + op_errno = -ret; + goto out; + } + + transaction_frame->local = local; + + local->op_ret = -1; + + local->cont.utimens.tv[0] = tv[0]; + local->cont.utimens.tv[1] = tv[1]; + + local->cont.utimens.ino = loc->inode->ino; + + local->transaction.fop = afr_utimens_wind; + local->transaction.done = afr_utimens_done; + local->transaction.unwind = afr_utimens_unwind; + + loc_copy (&local->loc, loc); + + local->transaction.main_frame = frame; + local->transaction.start = 0; + local->transaction.len = 0; + local->transaction.pending = AFR_METADATA_PENDING; + + afr_transaction (transaction_frame, this, AFR_METADATA_TRANSACTION); + + op_ret = 0; +out: + if (op_ret == -1) { + if (transaction_frame) + AFR_STACK_DESTROY (transaction_frame); + AFR_STACK_UNWIND (frame, op_ret, op_errno, NULL); + } + + return 0; +} + +/* }}} */ + +/* {{{ setxattr */ + +/* Unwind the caller's frame at most once: main_frame is claimed and cleared under frame->lock, then answered with the saved op_ret/op_errno (setxattr returns no payload). */ +int +afr_setxattr_unwind (call_frame_t *frame, xlator_t *this) +{ + afr_local_t * local = NULL; + afr_private_t * priv = NULL; + call_frame_t *main_frame = NULL; + + local = frame->local; + priv = this->private; + + LOCK (&frame->lock); + { + if (local->transaction.main_frame) + main_frame = local->transaction.main_frame; + local->transaction.main_frame = NULL; + } + UNLOCK (&frame->lock); + + if (main_frame) { + AFR_STACK_UNWIND (main_frame, local->op_ret, local->op_errno); + } + return 0; +} + + +int +afr_setxattr_wind_cbk (call_frame_t *frame, void *cookie, xlator_t *this, + int32_t op_ret, int32_t op_errno) +{ + afr_local_t * local = NULL; + afr_private_t * priv = NULL; + + int call_count = -1; + int need_unwind = 0; + + local = frame->local; + priv = this->private; + + LOCK (&frame->lock); + { + if (op_ret != -1) { + if (local->success_count == 0) { + local->op_ret = op_ret; + } + local->success_count++; + + if
(local->success_count == priv->wait_count) { + need_unwind = 1; + } + } + + local->op_errno = op_errno; + } + UNLOCK (&frame->lock); + + if (need_unwind) + local->transaction.unwind (frame, this); + + call_count = afr_frame_return (frame); + + if (call_count == 0) { + local->transaction.resume (frame, this); + } + + return 0; +} + + +int +afr_setxattr_wind (call_frame_t *frame, xlator_t *this) +{ + afr_local_t *local = NULL; + afr_private_t *priv = NULL; + + int call_count = -1; + int i = 0; + + local = frame->local; + priv = this->private; + + call_count = afr_up_children_count (priv->child_count, local->child_up); + + if (call_count == 0) { + local->transaction.resume (frame, this); + return 0; + } + + local->call_count = call_count; + + for (i = 0; i < priv->child_count; i++) { + if (local->child_up[i]) { + STACK_WIND_COOKIE (frame, afr_setxattr_wind_cbk, + (void *) (long) i, + priv->children[i], + priv->children[i]->fops->setxattr, + &local->loc, + local->cont.setxattr.dict, + local->cont.setxattr.flags); + + if (!--call_count) + break; + } + } + + return 0; +} + + +int +afr_setxattr_done (call_frame_t *frame, xlator_t *this) +{ + afr_local_t * local = frame->local; + + local->transaction.unwind (frame, this); + + AFR_STACK_DESTROY (frame); + + return 0; +} + + +int +afr_setxattr (call_frame_t *frame, xlator_t *this, + loc_t *loc, dict_t *dict, int32_t flags) +{ + afr_private_t * priv = NULL; + afr_local_t * local = NULL; + call_frame_t *transaction_frame = NULL; + + int ret = -1; + + int op_ret = -1; + int op_errno = 0; + + VALIDATE_OR_GOTO (frame, out); + VALIDATE_OR_GOTO (this, out); + VALIDATE_OR_GOTO (this->private, out); + + priv = this->private; + + transaction_frame = copy_frame (frame); + if (!transaction_frame) { + gf_log (this->name, GF_LOG_ERROR, + "out of memory :("); + goto out; + } + + ALLOC_OR_GOTO (local, afr_local_t, out); + + ret = AFR_LOCAL_INIT (local, priv); + if (ret < 0) { + op_errno = -ret; + goto out; + } + + transaction_frame->local = 
local; + + local->op_ret = -1; + + local->cont.setxattr.dict = dict_ref (dict); + local->cont.setxattr.flags = flags; + + local->transaction.fop = afr_setxattr_wind; + local->transaction.done = afr_setxattr_done; + local->transaction.unwind = afr_setxattr_unwind; + + loc_copy (&local->loc, loc); + + local->transaction.main_frame = frame; + local->transaction.start = 0; + local->transaction.len = 0; + local->transaction.pending = AFR_METADATA_PENDING; + + afr_transaction (transaction_frame, this, AFR_METADATA_TRANSACTION); + + op_ret = 0; +out: + if (op_ret == -1) { + if (transaction_frame) + AFR_STACK_DESTROY (transaction_frame); + AFR_STACK_UNWIND (frame, op_ret, op_errno); + } + + return 0; +} + +/* }}} */ + +/* {{{ removexattr */ + + +int +afr_removexattr_unwind (call_frame_t *frame, xlator_t *this) +{ + afr_local_t * local = NULL; + afr_private_t * priv = NULL; + call_frame_t *main_frame = NULL; + + local = frame->local; + priv = this->private; + + LOCK (&frame->lock); + { + if (local->transaction.main_frame) + main_frame = local->transaction.main_frame; + local->transaction.main_frame = NULL; + } + UNLOCK (&frame->lock); + + if (main_frame) { + AFR_STACK_UNWIND (main_frame, local->op_ret, local->op_errno) + } + return 0; +} + + +int +afr_removexattr_wind_cbk (call_frame_t *frame, void *cookie, xlator_t *this, + int32_t op_ret, int32_t op_errno) +{ + afr_local_t * local = NULL; + afr_private_t * priv = NULL; + + int call_count = -1; + int need_unwind = 0; + + local = frame->local; + priv = this->private; + + LOCK (&frame->lock); + { + if (op_ret != -1) { + if (local->success_count == 0) { + local->op_ret = op_ret; + } + local->success_count++; + + if (local->success_count == priv->wait_count) { + need_unwind = 1; + } + } + + local->op_errno = op_errno; + } + UNLOCK (&frame->lock); + + if (need_unwind) + local->transaction.unwind (frame, this); + + call_count = afr_frame_return (frame); + + if (call_count == 0) { + local->transaction.resume (frame, this); + } + 
+ return 0; +} + + +int32_t +afr_removexattr_wind (call_frame_t *frame, xlator_t *this) +{ + afr_local_t *local = NULL; + afr_private_t *priv = NULL; + + int call_count = -1; + int i = 0; + + local = frame->local; + priv = this->private; + + call_count = afr_up_children_count (priv->child_count, local->child_up); + + if (call_count == 0) { + local->transaction.resume (frame, this); + return 0; + } + + local->call_count = call_count; + + for (i = 0; i < priv->child_count; i++) { + if (local->child_up[i]) { + STACK_WIND_COOKIE (frame, afr_removexattr_wind_cbk, + (void *) (long) i, + priv->children[i], + priv->children[i]->fops->removexattr, + &local->loc, + local->cont.removexattr.name); + + if (!--call_count) + break; + } + } + + return 0; +} + + +int +afr_removexattr_done (call_frame_t *frame, xlator_t *this) +{ + afr_local_t * local = frame->local; + + local->transaction.unwind (frame, this); + + AFR_STACK_DESTROY (frame); + + return 0; +} + + +int +afr_removexattr (call_frame_t *frame, xlator_t *this, + loc_t *loc, const char *name) +{ + afr_private_t * priv = NULL; + afr_local_t * local = NULL; + call_frame_t *transaction_frame = NULL; + + int ret = -1; + + int op_ret = -1; + int op_errno = 0; + + VALIDATE_OR_GOTO (frame, out); + VALIDATE_OR_GOTO (this, out); + VALIDATE_OR_GOTO (this->private, out); + VALIDATE_OR_GOTO (loc, out); + + priv = this->private; + + transaction_frame = copy_frame (frame); + if (!transaction_frame) { + gf_log (this->name, GF_LOG_ERROR, + "out of memory :("); + goto out; + } + + ALLOC_OR_GOTO (local, afr_local_t, out); + + ret = AFR_LOCAL_INIT (local, priv); + if (ret < 0) { + op_errno = -ret; + goto out; + } + + transaction_frame->local = local; + + local->op_ret = -1; + + local->cont.removexattr.name = strdup (name); + + local->transaction.fop = afr_removexattr_wind; + local->transaction.done = afr_removexattr_done; + local->transaction.unwind = afr_removexattr_unwind; + + loc_copy (&local->loc, loc); + + local->transaction.main_frame 
= frame; + local->transaction.start = 0; + local->transaction.len = 0; + local->transaction.pending = AFR_METADATA_PENDING; + + afr_transaction (transaction_frame, this, AFR_METADATA_TRANSACTION); + + op_ret = 0; +out: + if (op_ret == -1) { + if (transaction_frame) + AFR_STACK_DESTROY (transaction_frame); + AFR_STACK_UNWIND (frame, op_ret, op_errno); + } + + return 0; +} diff --git a/xlators/cluster/afr/src/afr-inode-write.h b/xlators/cluster/afr/src/afr-inode-write.h new file mode 100644 index 000000000..9c0b5cad3 --- /dev/null +++ b/xlators/cluster/afr/src/afr-inode-write.h @@ -0,0 +1,63 @@ +/* + Copyright (c) 2007, 2008 Z RESEARCH, Inc. <http://www.zresearch.com> + This file is part of GlusterFS. + + GlusterFS is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published + by the Free Software Foundation; either version 3 of the License, + or (at your option) any later version. + + GlusterFS is distributed in the hope that it will be useful, but + WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program. If not, see + <http://www.gnu.org/licenses/>. 
+*/ + +#ifndef __INODE_WRITE_H__ +#define __INODE_WRITE_H__ + +int32_t +afr_chmod (call_frame_t *frame, xlator_t *this, + loc_t *loc, mode_t mode); + +int32_t +afr_chown (call_frame_t *frame, xlator_t *this, + loc_t *loc, uid_t uid, gid_t gid); + +int +afr_fchown (call_frame_t *frame, xlator_t *this, + fd_t *fd, uid_t uid, gid_t gid); + +int32_t +afr_fchmod (call_frame_t *frame, xlator_t *this, + fd_t *fd, mode_t mode); + +int32_t +afr_writev (call_frame_t *frame, xlator_t *this, fd_t *fd, + struct iovec *vector, int32_t count, off_t offset); + +int32_t +afr_truncate (call_frame_t *frame, xlator_t *this, + loc_t *loc, off_t offset); + +int32_t +afr_ftruncate (call_frame_t *frame, xlator_t *this, + fd_t *fd, off_t offset); + +int32_t +afr_utimens (call_frame_t *frame, xlator_t *this, + loc_t *loc, struct timespec tv[2]); + +int32_t +afr_setxattr (call_frame_t *frame, xlator_t *this, + loc_t *loc, dict_t *dict, int32_t flags); + +int32_t +afr_removexattr (call_frame_t *frame, xlator_t *this, + loc_t *loc, const char *name); + +#endif /* __INODE_WRITE_H__ */ diff --git a/xlators/cluster/afr/src/afr-self-heal-common.c b/xlators/cluster/afr/src/afr-self-heal-common.c new file mode 100644 index 000000000..45d065169 --- /dev/null +++ b/xlators/cluster/afr/src/afr-self-heal-common.c @@ -0,0 +1,1073 @@ +/* + Copyright (c) 2008 Z RESEARCH, Inc. <http://www.zresearch.com> + This file is part of GlusterFS. + + GlusterFS is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published + by the Free Software Foundation; either version 3 of the License, + or (at your option) any later version. + + GlusterFS is distributed in the hope that it will be useful, but + WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + General Public License for more details. 
+ + You should have received a copy of the GNU General Public License + along with this program. If not, see + <http://www.gnu.org/licenses/>. +*/ + +#include "glusterfs.h" +#include "xlator.h" +#include "byte-order.h" + +#include "afr.h" +#include "afr-transaction.h" +#include "afr-self-heal-common.h" +#include "afr-self-heal.h" + + +/** + * select_source - select a source and return it + * TODO: take into account option 'favorite-child' + */ + +int +afr_sh_select_source (int sources[], int child_count) +{ + int i; + for (i = 0; i < child_count; i++) + if (sources[i]) + return i; + + return -1; +} + + +/** + * sink_count - return number of sinks in sources array + */ + +int +afr_sh_sink_count (int sources[], int child_count) +{ + int i; + int sinks = 0; + for (i = 0; i < child_count; i++) + if (!sources[i]) + sinks++; + return sinks; +} + +int +afr_sh_source_count (int sources[], int child_count) +{ + int i; + int nsource = 0; + + for (i = 0; i < child_count; i++) + if (sources[i]) + nsource++; + return nsource; +} + + +int +afr_sh_supress_errenous_children (int sources[], int child_errno[], + int child_count) +{ + int i = 0; + + for (i = 0; i < child_count; i++) { + if (child_errno[i] && sources[i]) { + sources[i] = 0; + } + } + + return 0; +} + + +int +afr_sh_supress_empty_children (int sources[], dict_t *xattr[], + struct stat *buf, + int child_count, const char *key) +{ + int i = 0; + int32_t *pending = NULL; + int ret = 0; + int all_xattr_missing = 1; + + /* if the file was created by afr with xattrs */ + for (i = 0; i < child_count; i++) { + if (!xattr[i]) + continue; + + ret = dict_get_ptr (xattr[i], (char *)key, VOID(&pending)); + if (ret != 0) { + continue; + } + + all_xattr_missing = 0; + break; + } + + if (all_xattr_missing) { + /* supress 0byte files.. 
this avoids empty file created + by dir selfheal to overwrite the 'good' file */ + for (i = 0; i < child_count; i++) { + if (!buf[i].st_size) + sources[i] = 0; + } + goto out; + } + + + for (i = 0; i < child_count; i++) { + if (!xattr[i]) { + sources[i] = 0; + continue; + } + + ret = dict_get_ptr (xattr[i], (char *)key, VOID(&pending)); + if (ret != 0) { + sources[i] = 0; + continue; + } + + if (!pending) { + sources[i] = 0; + continue; + } + } + +out: + return 0; +} + + +void +afr_sh_print_pending_matrix (int32_t *pending_matrix[], xlator_t *this) +{ + afr_private_t * priv = this->private; + + char *buf = NULL; + char *ptr = NULL; + + int i, j; + + /* 10 digits per entry + 1 space + '[' and ']' */ + buf = MALLOC (priv->child_count * 11 + 8); + + for (i = 0; i < priv->child_count; i++) { + ptr = buf; + ptr += sprintf (ptr, "[ "); + for (j = 0; j < priv->child_count; j++) { + ptr += sprintf (ptr, "%d ", pending_matrix[i][j]); + } + ptr += sprintf (ptr, "]"); + gf_log (this->name, GF_LOG_DEBUG, + "pending_matrix: %s", buf); + } + + FREE (buf); +} + + +void +afr_sh_build_pending_matrix (int32_t *pending_matrix[], dict_t *xattr[], + int child_count, const char *key) +{ + int i = 0; + int j = 0; + int32_t *pending = NULL; + int ret = -1; + + /* start clean */ + for (i = 0; i < child_count; i++) { + for (j = 0; j < child_count; j++) { + pending_matrix[i][j] = 0; + } + } + + for (i = 0; i < child_count; i++) { + if (!xattr[i]) + continue; + + pending = NULL; + + ret = dict_get_ptr (xattr[i], (char *) key, + VOID(&pending)); + if (ret != 0) + continue; + + for (j = 0; j < child_count; j++) { + pending_matrix[i][j] = ntoh32 (pending[j]); + } + } +} + + +/** + * mark_sources: Mark all 'source' nodes and return number of source + * nodes found + */ + +int +afr_sh_mark_sources (int32_t *pending_matrix[], int sources[], int child_count) +{ + int i = 0; + int j = 0; + + int nsources = 0; + + + /* start clean */ + for (i = 0; i < child_count; i++) { + sources[i] = 0; + } + + /* + 
Let's 'normalize' the pending matrix first, + by disregarding all pending entries that refer + to themselves + */ + for (i = 0; i < child_count; i++) { + pending_matrix[i][i] = 0; + } + + for (i = 0; i < child_count; i++) { + for (j = 0; j < child_count; j++) { + if (pending_matrix[j][i]) + break; + } + + if (j == child_count) { + nsources++; + sources[i] = 1; + } + } + + return nsources; +} + + +void +afr_sh_pending_to_delta (int32_t *pending_matrix[], int32_t *delta_matrix[], + int success[], int child_count) +{ + int i = 0; + int j = 0; + + /* start clean */ + for (i = 0; i < child_count; i++) { + for (j = 0; j < child_count; j++) { + delta_matrix[i][j] = 0; + } + } + + for (i = 0; i < child_count; i++) { + for (j = 0; j < child_count; j++) { + if (!success[j]) + continue; + delta_matrix[i][j] = -pending_matrix[i][j]; + } + } +} + + +int +afr_sh_delta_to_xattr (int32_t *delta_matrix[], dict_t *xattr[], + int child_count, const char *key) +{ + int i = 0; + int j = 0; + + int ret = 0; + + int32_t *pending = 0; + + for (i = 0; i < child_count; i++) { + if (!xattr[i]) + continue; + + pending = CALLOC (sizeof (int32_t), child_count); + for (j = 0; j < child_count; j++) { + pending[j] = hton32 (delta_matrix[i][j]); + } + + ret = dict_set_bin (xattr[i], (char *) key, pending, + child_count * sizeof (int32_t)); + } + + return 0; +} + + +int +afr_sh_has_metadata_pending (dict_t *xattr, int child_count, xlator_t *this) +{ + afr_private_t *priv = NULL; + int32_t *pending = NULL; + void *tmp_pending = NULL; /* This is required to remove 'type-punned' warnings from gcc */ + + int ret = -1; + int i = 0; + + priv = this->private; + + ret = dict_get_ptr (xattr, AFR_METADATA_PENDING, &tmp_pending); + + if (ret != 0) + return 0; + + pending = tmp_pending; + for (i = 0; i < priv->child_count; i++) { + if (i == child_count) + continue; + if (pending[i]) + return 1; + } + + return 0; +} + + +int +afr_sh_has_data_pending (dict_t *xattr, int child_count, xlator_t *this) +{ + 
afr_private_t *priv = NULL; + int32_t *pending = NULL; + void *tmp_pending = NULL; /* This is required to remove 'type-punned' warnings from gcc */ + + int ret = -1; + int i = 0; + + priv = this->private; + + ret = dict_get_ptr (xattr, AFR_DATA_PENDING, &tmp_pending); + + if (ret != 0) + return 0; + + pending = tmp_pending; + for (i = 0; i < priv->child_count; i++) { + if (i == child_count) + continue; + if (pending[i]) + return 1; + } + + return 0; +} + + +int +afr_sh_has_entry_pending (dict_t *xattr, int child_count, xlator_t *this) +{ + afr_private_t *priv = NULL; + int32_t *pending = NULL; + void *tmp_pending = NULL; /* This is required to remove 'type-punned' warnings from gcc */ + + int ret = -1; + int i = 0; + + priv = this->private; + + ret = dict_get_ptr (xattr, AFR_ENTRY_PENDING, &tmp_pending); + + if (ret != 0) + return 0; + + pending = tmp_pending; + for (i = 0; i < priv->child_count; i++) { + if (i == child_count) + continue; + if (pending[i]) + return 1; + } + + return 0; +} + + + +/** + * is_matrix_zero - return true if pending matrix is all zeroes + */ + +int +afr_sh_is_matrix_zero (int32_t *pending_matrix[], int child_count) +{ + int i, j; + + for (i = 0; i < child_count; i++) + for (j = 0; j < child_count; j++) + if (pending_matrix[i][j]) + return 0; + return 1; +} + + +int +afr_sh_missing_entries_done (call_frame_t *frame, xlator_t *this) +{ + afr_local_t *local = NULL; + afr_self_heal_t *sh = NULL; + afr_private_t *priv = NULL; + int i = 0; + + local = frame->local; + sh = &local->self_heal; + priv = this->private; + +// memset (sh->child_errno, 0, sizeof (int) * priv->child_count); + memset (sh->buf, 0, sizeof (struct stat) * priv->child_count); + + for (i = 0; i < priv->child_count; i++) { + if (sh->xattr[i]) + dict_unref (sh->xattr[i]); + sh->xattr[i] = NULL; + } + + if (local->govinda_gOvinda) { + gf_log (this->name, GF_LOG_WARNING, + "aborting selfheal of %s", + local->loc.path); + sh->completion_cbk (frame, this); + } else { + gf_log 
(this->name, GF_LOG_DEBUG, + "proceeding to metadata check on %s", + local->loc.path); + afr_self_heal_metadata (frame, this); + } + + return 0; +} + + +int +sh_missing_entries_unlck_cbk (call_frame_t *frame, void *cookie, + xlator_t *this, + int32_t op_ret, int32_t op_errno) +{ + afr_local_t *local = NULL; + afr_self_heal_t *sh = NULL; + afr_private_t *priv = NULL; + int call_count = 0; + + + local = frame->local; + sh = &local->self_heal; + priv = this->private; + + LOCK (&frame->lock); + { + } + UNLOCK (&frame->lock); + + call_count = afr_frame_return (frame); + + if (call_count == 0) { + afr_sh_missing_entries_done (frame, this); + } + + return 0; +} + + +static int +sh_missing_entries_finish (call_frame_t *frame, xlator_t *this) +{ + afr_private_t *priv = NULL; + afr_local_t *local = NULL; + int i = 0; + int call_count = 0; + afr_self_heal_t *sh = NULL; + + + local = frame->local; + sh = &local->self_heal; + priv = this->private; + + call_count = local->child_count; + + local->call_count = call_count; + + for (i = 0; i < priv->child_count; i++) { + if (local->child_up[i]) { + gf_log (this->name, GF_LOG_DEBUG, + "unlocking %"PRId64"/%s on subvolume %s", + sh->parent_loc.inode->ino, local->loc.name, + priv->children[i]->name); + + STACK_WIND (frame, sh_missing_entries_unlck_cbk, + priv->children[i], + priv->children[i]->fops->entrylk, + &sh->parent_loc, local->loc.name, + ENTRYLK_UNLOCK, ENTRYLK_WRLCK); + + if (!--call_count) + break; + } + } + return 0; +} + + +static int +sh_destroy_cbk (call_frame_t *frame, void *cookie, xlator_t *this, + int32_t op_ret, int op_errno, struct stat *stbuf) +{ + STACK_DESTROY (frame->root); + return 0; +} + + +static int +sh_missing_entries_newentry_cbk (call_frame_t *frame, void *cookie, + xlator_t *this, + int32_t op_ret, int32_t op_errno, + inode_t *inode, struct stat *stbuf) +{ + afr_local_t *local = NULL; + afr_self_heal_t *sh = NULL; + afr_private_t *priv = NULL; + call_frame_t *chown_frame = NULL; + int call_count = 0; + 
int child_index = 0; + struct stat *buf = NULL; + + + local = frame->local; + sh = &local->self_heal; + priv = this->private; + + buf = &sh->buf[sh->source]; + child_index = (long) cookie; + + if (op_ret == 0) { + chown_frame = copy_frame (frame); + + gf_log (this->name, GF_LOG_DEBUG, + "chown %s to %d %d on subvolume %s", + local->loc.path, buf->st_uid, buf->st_gid, + priv->children[child_index]->name); + + STACK_WIND (chown_frame, sh_destroy_cbk, + priv->children[child_index], + priv->children[child_index]->fops->chown, + &local->loc, + buf->st_uid, buf->st_gid); + } + + LOCK (&frame->lock); + { + } + UNLOCK (&frame->lock); + + call_count = afr_frame_return (frame); + + if (call_count == 0) { + sh_missing_entries_finish (frame, this); + } + + return 0; +} + + +static int +sh_missing_entries_mknod (call_frame_t *frame, xlator_t *this) +{ + afr_local_t *local = NULL; + afr_self_heal_t *sh = NULL; + afr_private_t *priv = NULL; + int i = 0; + int enoent_count = 0; + int call_count = 0; + mode_t st_mode = 0; + dev_t st_dev = 0; + + + local = frame->local; + sh = &local->self_heal; + priv = this->private; + + for (i = 0; i < priv->child_count; i++) + if (sh->child_errno[i] == ENOENT) + enoent_count++; + + call_count = enoent_count; + local->call_count = call_count; + + st_mode = sh->buf[sh->source].st_mode; + st_dev = sh->buf[sh->source].st_dev; + + gf_log (this->name, GF_LOG_DEBUG, + "mknod %s mode 0%o on %d subvolumes", + local->loc.path, st_mode, enoent_count); + + for (i = 0; i < priv->child_count; i++) { + if (sh->child_errno[i] == ENOENT) { + STACK_WIND_COOKIE (frame, + sh_missing_entries_newentry_cbk, + (void *) (long) i, + priv->children[i], + priv->children[i]->fops->mknod, + &local->loc, st_mode, st_dev); + if (!--call_count) + break; + } + } + + return 0; +} + + +static int +sh_missing_entries_mkdir (call_frame_t *frame, xlator_t *this) +{ + afr_local_t *local = NULL; + afr_self_heal_t *sh = NULL; + afr_private_t *priv = NULL; + int i = 0; + int enoent_count 
= 0; + int call_count = 0; + mode_t st_mode = 0; + + + local = frame->local; + sh = &local->self_heal; + priv = this->private; + + for (i = 0; i < priv->child_count; i++) + if (sh->child_errno[i] == ENOENT) + enoent_count++; + + call_count = enoent_count; + local->call_count = call_count; + + st_mode = sh->buf[sh->source].st_mode; + + gf_log (this->name, GF_LOG_DEBUG, + "mkdir %s mode 0%o on %d subvolumes", + local->loc.path, st_mode, enoent_count); + + for (i = 0; i < priv->child_count; i++) { + if (sh->child_errno[i] == ENOENT) { + STACK_WIND_COOKIE (frame, + sh_missing_entries_newentry_cbk, + (void *) (long) i, + priv->children[i], + priv->children[i]->fops->mkdir, + &local->loc, st_mode); + if (!--call_count) + break; + } + } + + return 0; +} + + +static int +sh_missing_entries_symlink (call_frame_t *frame, xlator_t *this, + const char *link) +{ + afr_local_t *local = NULL; + afr_self_heal_t *sh = NULL; + afr_private_t *priv = NULL; + int i = 0; + int enoent_count = 0; + int call_count = 0; + + + local = frame->local; + sh = &local->self_heal; + priv = this->private; + + for (i = 0; i < priv->child_count; i++) + if (sh->child_errno[i] == ENOENT) + enoent_count++; + + call_count = enoent_count; + local->call_count = call_count; + + gf_log (this->name, GF_LOG_DEBUG, + "symlink %s -> %s on %d subvolumes", + local->loc.path, link, enoent_count); + + for (i = 0; i < priv->child_count; i++) { + if (sh->child_errno[i] == ENOENT) { + STACK_WIND_COOKIE (frame, + sh_missing_entries_newentry_cbk, + (void *) (long) i, + priv->children[i], + priv->children[i]->fops->symlink, + link, &local->loc); + if (!--call_count) + break; + } + } + + return 0; +} + + +static int +sh_missing_entries_readlink_cbk (call_frame_t *frame, void *cookie, + xlator_t *this, + int32_t op_ret, int32_t op_errno, + const char *link) +{ + if (op_ret > 0) + sh_missing_entries_symlink (frame, this, link); + else + sh_missing_entries_finish (frame, this); + + return 0; +} + + +static int 
+sh_missing_entries_readlink (call_frame_t *frame, xlator_t *this) +{ + afr_local_t *local = NULL; + afr_self_heal_t *sh = NULL; + afr_private_t *priv = NULL; + + + local = frame->local; + sh = &local->self_heal; + priv = this->private; + + STACK_WIND (frame, sh_missing_entries_readlink_cbk, + priv->children[sh->source], + priv->children[sh->source]->fops->readlink, + &local->loc, 4096); + + return 0; +} + + +static int +sh_missing_entries_create (call_frame_t *frame, xlator_t *this) +{ + afr_local_t *local = NULL; + afr_self_heal_t *sh = NULL; + int type = 0; + int i = 0; + afr_private_t *priv = NULL; + int enoent_count = 0; + int govinda_gOvinda = 0; + + + local = frame->local; + sh = &local->self_heal; + priv = this->private; + + for (i = 0; i < priv->child_count; i++) { + if (sh->child_errno[i]) { + if (sh->child_errno[i] == ENOENT) + enoent_count++; + } else { + if (type) { + if (type != (sh->buf[i].st_mode & S_IFMT)) + govinda_gOvinda = 1; + } else { + sh->source = i; + type = sh->buf[i].st_mode & S_IFMT; + } + } + } + + if (govinda_gOvinda) { + gf_log (this->name, GF_LOG_ERROR, + "conflicing filetypes exist for path %s. returning.", + local->loc.path); + + local->govinda_gOvinda = 1; + sh_missing_entries_finish (frame, this); + return 0; + } + + if (!type) { + gf_log (this->name, GF_LOG_ERROR, + "no source found for %s. all nodes down?. returning.", + local->loc.path); + /* subvolumes down and/or file does not exist */ + sh_missing_entries_finish (frame, this); + return 0; + } + + if (enoent_count == 0) { + gf_log (this->name, GF_LOG_ERROR, + "no missing files - %s. 
proceeding to metadata check", + local->loc.path); + /* proceed to next step - metadata self-heal */ + sh_missing_entries_finish (frame, this); + return 0; + } + + switch (type) { + case S_IFSOCK: + case S_IFREG: + case S_IFBLK: + case S_IFCHR: + case S_IFIFO: + sh_missing_entries_mknod (frame, this); + break; + case S_IFLNK: + sh_missing_entries_readlink (frame, this); + break; + case S_IFDIR: + sh_missing_entries_mkdir (frame, this); + break; + default: + gf_log (this->name, GF_LOG_ERROR, + "unknown file type: 0%o", type); + local->govinda_gOvinda = 1; + sh_missing_entries_finish (frame, this); + } + + return 0; +} + + +static int +sh_missing_entries_lookup_cbk (call_frame_t *frame, void *cookie, + xlator_t *this, + int32_t op_ret, int32_t op_errno, + inode_t *inode, struct stat *buf, dict_t *xattr) +{ + int child_index = 0; + afr_local_t *local = NULL; + int call_count = 0; + afr_self_heal_t *sh = NULL; + afr_private_t *priv = NULL; + + + local = frame->local; + sh = &local->self_heal; + priv = this->private; + + child_index = (long) cookie; + + LOCK (&frame->lock); + { + if (op_ret == 0) { + gf_log (this->name, GF_LOG_DEBUG, + "path %s on subvolume %s is of mode 0%o", + local->loc.path, + priv->children[child_index]->name, + buf->st_mode); + + local->self_heal.buf[child_index] = *buf; + } else { + gf_log (this->name, GF_LOG_WARNING, + "path %s on subvolume %s => -1 (%s)", + local->loc.path, + priv->children[child_index]->name, + strerror (op_errno)); + + local->self_heal.child_errno[child_index] = op_errno; + } + + } + UNLOCK (&frame->lock); + + call_count = afr_frame_return (frame); + + if (call_count == 0) { + sh_missing_entries_create (frame, this); + } + + return 0; +} + + +static int +sh_missing_entries_lookup (call_frame_t *frame, xlator_t *this) +{ + afr_local_t *local = NULL; + int i = 0; + int call_count = 0; + afr_private_t *priv = NULL; + dict_t *xattr_req = NULL; + int ret = -1; + + local = frame->local; + call_count = local->child_count; + priv = 
this->private; + + local->call_count = call_count; + + xattr_req = dict_new(); + + if (xattr_req) + ret = dict_set_uint64 (xattr_req, AFR_ENTRY_PENDING, + priv->child_count * sizeof(int32_t)); + + for (i = 0; i < priv->child_count; i++) { + if (local->child_up[i]) { + gf_log (this->name, GF_LOG_DEBUG, + "looking up %s on subvolume %s", + local->loc.path, priv->children[i]->name); + + STACK_WIND_COOKIE (frame, + sh_missing_entries_lookup_cbk, + (void *) (long) i, + priv->children[i], + priv->children[i]->fops->lookup, + &local->loc, xattr_req); + + if (!--call_count) + break; + } + } + + if (xattr_req) + dict_unref (xattr_req); + + return 0; +} + + +static int +sh_missing_entries_lk_cbk (call_frame_t *frame, void *cookie, xlator_t *this, + int32_t op_ret, int32_t op_errno) +{ + afr_local_t *local = NULL; + afr_self_heal_t *sh = NULL; + int call_count = 0; + int child_index = (long) cookie; + + + local = frame->local; + sh = &local->self_heal; + + LOCK (&frame->lock); + { + if (op_ret == -1) { + sh->op_failed = 1; + + gf_log (this->name, + (op_errno == EAGAIN ? 
GF_LOG_DEBUG : GF_LOG_ERROR), + "locking inode of %s on child %d failed: %s", + local->loc.path, child_index, + strerror (op_errno)); + } else { + gf_log (this->name, GF_LOG_DEBUG, + "inode of %s on child %d locked", + local->loc.path, child_index); + } + } + UNLOCK (&frame->lock); + + call_count = afr_frame_return (frame); + + if (call_count == 0) { + if (sh->op_failed == 1) { + sh_missing_entries_finish (frame, this); + return 0; + } + + sh_missing_entries_lookup (frame, this); + } + + return 0; +} + + +static int +afr_self_heal_missing_entries (call_frame_t *frame, xlator_t *this) +{ + afr_local_t *local = NULL; + afr_self_heal_t *sh = NULL; + afr_private_t *priv = NULL; + int i = 0; + int call_count = 0; + + + local = frame->local; + sh = &local->self_heal; + priv = this->private; + + gf_log (this->name, GF_LOG_DEBUG, + "attempting to recreate missing entries for path=%s", + local->loc.path); + + afr_build_parent_loc (&sh->parent_loc, &local->loc); + + call_count = local->child_count; + + local->call_count = call_count; + + for (i = 0; i < priv->child_count; i++) { + if (local->child_up[i]) { + STACK_WIND (frame, sh_missing_entries_lk_cbk, + priv->children[i], + priv->children[i]->fops->entrylk, + &sh->parent_loc, local->loc.name, + ENTRYLK_LOCK_NB, ENTRYLK_WRLCK); + if (!--call_count) + break; + } + } + + return 0; +} + + +int +afr_self_heal (call_frame_t *frame, xlator_t *this, + int (*completion_cbk) (call_frame_t *, xlator_t *)) +{ + afr_local_t *local = NULL; + afr_self_heal_t *sh = NULL; + afr_private_t *priv = NULL; + int i = 0; + + + local = frame->local; + sh = &local->self_heal; + priv = this->private; + + gf_log (this->name, GF_LOG_DEBUG, + "performing self heal on %s (metadata=%d data=%d entry=%d)", + local->loc.path, + local->need_metadata_self_heal, + local->need_data_self_heal, + local->need_entry_self_heal); + + sh->completion_cbk = completion_cbk; + + sh->buf = CALLOC (priv->child_count, sizeof (struct stat)); + sh->child_errno = CALLOC 
(priv->child_count, sizeof (int)); + sh->success = CALLOC (priv->child_count, sizeof (int)); + sh->xattr = CALLOC (priv->child_count, sizeof (dict_t *)); + sh->sources = CALLOC (sizeof (*sh->sources), priv->child_count); + + sh->pending_matrix = CALLOC (sizeof (int32_t *), priv->child_count); + for (i = 0; i < priv->child_count; i++) { + sh->pending_matrix[i] = CALLOC (sizeof (int32_t), + priv->child_count); + } + + sh->delta_matrix = CALLOC (sizeof (int32_t *), priv->child_count); + for (i = 0; i < priv->child_count; i++) { + sh->delta_matrix[i] = CALLOC (sizeof (int32_t), + priv->child_count); + } + + if (local->success_count && local->enoent_count) { + afr_self_heal_missing_entries (frame, this); + } else { + gf_log (this->name, GF_LOG_DEBUG, + "proceeding to metadata check on %s", + local->loc.path); + afr_sh_missing_entries_done (frame, this); + } + + return 0; +} diff --git a/xlators/cluster/afr/src/afr-self-heal-common.h b/xlators/cluster/afr/src/afr-self-heal-common.h new file mode 100644 index 000000000..9dd597f07 --- /dev/null +++ b/xlators/cluster/afr/src/afr-self-heal-common.h @@ -0,0 +1,66 @@ +/* + Copyright (c) 2008 Z RESEARCH, Inc. <http://www.zresearch.com> + This file is part of GlusterFS. + + GlusterFS is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published + by the Free Software Foundation; either version 3 of the License, + or (at your option) any later version. + + GlusterFS is distributed in the hope that it will be useful, but + WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program. If not, see + <http://www.gnu.org/licenses/>. 
+*/ + +#ifndef __AFR_SELF_HEAL_COMMON_H__ +#define __AFR_SELF_HEAL_COMMON_H__ + +#define FILE_HAS_HOLES(buf) (((buf)->st_size) > ((buf)->st_blocks * 512)) + +int +afr_sh_select_source (int sources[], int child_count); + +int +afr_sh_sink_count (int sources[], int child_count); + +int +afr_sh_source_count (int sources[], int child_count); + +int +afr_sh_supress_errenous_children (int sources[], int child_errno[], + int child_count); + +int +afr_sh_supress_empty_children (int sources[], dict_t *xattr[], + struct stat *buf, + int child_count, const char *key); + +void +afr_sh_print_pending_matrix (int32_t *pending_matrix[], xlator_t *this); + +void +afr_sh_build_pending_matrix (int32_t *pending_matrix[], dict_t *xattr[], + int child_count, const char *key); + +void +afr_sh_pending_to_delta (int32_t *pending_matrix[], int32_t *delta_matrix[], + int32_t success[], int child_count); + +int +afr_sh_mark_sources (int32_t *pending_matrix[], int sources[], + int child_count); + +int +afr_sh_delta_to_xattr (int32_t *delta_matrix[], dict_t *xattr[], + int child_count, const char *key); + +int +afr_sh_is_matrix_zero (int32_t *pending_matrix[], int child_count); + + +#endif /* __AFR_SELF_HEAL_COMMON_H__ */ diff --git a/xlators/cluster/afr/src/afr-self-heal-data.c b/xlators/cluster/afr/src/afr-self-heal-data.c new file mode 100644 index 000000000..3a48da485 --- /dev/null +++ b/xlators/cluster/afr/src/afr-self-heal-data.c @@ -0,0 +1,1030 @@ +/* + Copyright (c) 2008 Z RESEARCH, Inc. <http://www.zresearch.com> + This file is part of GlusterFS. + + GlusterFS is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published + by the Free Software Foundation; either version 3 of the License, + or (at your option) any later version. + + GlusterFS is distributed in the hope that it will be useful, but + WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. 
See the GNU + General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program. If not, see + <http://www.gnu.org/licenses/>. +*/ + +#include <libgen.h> +#include <unistd.h> +#include <fnmatch.h> +#include <sys/time.h> +#include <stdlib.h> +#include <signal.h> + +#ifndef _CONFIG_H +#define _CONFIG_H +#include "config.h" +#endif + +#include "glusterfs.h" +#include "afr.h" +#include "dict.h" +#include "xlator.h" +#include "hashfn.h" +#include "logging.h" +#include "stack.h" +#include "list.h" +#include "call-stub.h" +#include "defaults.h" +#include "common-utils.h" +#include "compat-errno.h" +#include "compat.h" +#include "byte-order.h" + +#include "afr-transaction.h" +#include "afr-self-heal.h" +#include "afr-self-heal-common.h" + + + +int +afr_sh_data_done (call_frame_t *frame, xlator_t *this) +{ + afr_local_t *local = NULL; + afr_self_heal_t *sh = NULL; + afr_private_t *priv = NULL; + + local = frame->local; + sh = &local->self_heal; + priv = this->private; + + /* + TODO: cleanup sh->* + */ + + gf_log (this->name, GF_LOG_DEBUG, + "self heal of %s completed", + local->loc.path); + + sh->completion_cbk (frame, this); + + return 0; +} + + +int +afr_sh_data_flush_cbk (call_frame_t *frame, void *cookie, xlator_t *this, + int32_t op_ret, int32_t op_errno) +{ + afr_local_t *local = NULL; + afr_private_t *priv = NULL; + afr_self_heal_t *sh = NULL; + int call_count = 0; + + local = frame->local; + sh = &local->self_heal; + priv = this->private; + + LOCK (&frame->lock); + { + } + UNLOCK (&frame->lock); + + call_count = afr_frame_return (frame); + + if (call_count == 0) { + fd_unref (sh->healing_fd); + sh->healing_fd = NULL; + afr_sh_data_done (frame, this); + } + + return 0; +} + + +int +afr_sh_data_close (call_frame_t *frame, xlator_t *this) +{ + afr_local_t *local = NULL; + afr_private_t *priv = NULL; + afr_self_heal_t *sh = NULL; + int i = 0; + int call_count = 0; + + + local = frame->local; + sh = 
&local->self_heal; + priv = this->private; + + if (!sh->healing_fd) { + afr_sh_data_done (frame, this); + return 0; + } + + call_count = sh->active_sinks + 1; + local->call_count = call_count; + + + /* closed source */ + gf_log (this->name, GF_LOG_DEBUG, + "closing fd of %s on %s", + local->loc.path, priv->children[sh->source]->name); + + STACK_WIND_COOKIE (frame, afr_sh_data_flush_cbk, + (void *) (long) sh->source, + priv->children[sh->source], + priv->children[sh->source]->fops->flush, + sh->healing_fd); + call_count--; + + for (i = 0; i < priv->child_count; i++) { + if (sh->sources[i] || !local->child_up[i]) + continue; + + gf_log (this->name, GF_LOG_DEBUG, + "closing fd of %s on %s", + local->loc.path, priv->children[i]->name); + + STACK_WIND_COOKIE (frame, afr_sh_data_flush_cbk, + (void *) (long) i, + priv->children[i], + priv->children[i]->fops->flush, + sh->healing_fd); + if (!--call_count) + break; + } + + return 0; +} + + +int +afr_sh_data_unlck_cbk (call_frame_t *frame, void *cookie, xlator_t *this, + int32_t op_ret, int32_t op_errno) +{ + afr_local_t * local = NULL; + int call_count = 0; + int child_index = (long) cookie; + + + local = frame->local; + + LOCK (&frame->lock); + { + if (op_ret == -1) { + gf_log (this->name, GF_LOG_ERROR, + "locking inode of %s on child %d failed: %s", + local->loc.path, child_index, + strerror (op_errno)); + } else { + gf_log (this->name, GF_LOG_DEBUG, + "inode of %s on child %d locked", + local->loc.path, child_index); + } + } + UNLOCK (&frame->lock); + + call_count = afr_frame_return (frame); + + if (call_count == 0) { + afr_sh_data_close (frame, this); + } + + return 0; +} + + +int +afr_sh_data_unlock (call_frame_t *frame, xlator_t *this) +{ + struct flock flock; + int i = 0; + int call_count = 0; + + afr_local_t * local = NULL; + afr_private_t * priv = NULL; + afr_self_heal_t * sh = NULL; + + + local = frame->local; + sh = &local->self_heal; + priv = this->private; + + call_count = local->child_count; + + 
local->call_count = call_count; + + flock.l_start = 0; + flock.l_len = 0; + flock.l_type = F_UNLCK; + + for (i = 0; i < priv->child_count; i++) { + if (local->child_up[i]) { + gf_log (this->name, GF_LOG_DEBUG, + "unlocking %s on subvolume %s", + local->loc.path, priv->children[i]->name); + + STACK_WIND_COOKIE (frame, afr_sh_data_unlck_cbk, + (void *) (long) i, + priv->children[i], + priv->children[i]->fops->inodelk, + &local->loc, F_SETLK, &flock); + if (!--call_count) + break; + } + } + + return 0; +} + + +int +afr_sh_data_finish (call_frame_t *frame, xlator_t *this) +{ + afr_local_t *local = NULL; + + local = frame->local; + + gf_log (this->name, GF_LOG_DEBUG, + "finishing data selfheal of %s", local->loc.path); + + afr_sh_data_unlock (frame, this); + + return 0; +} + + +int +afr_sh_data_erase_pending_cbk (call_frame_t *frame, void *cookie, + xlator_t *this, int32_t op_ret, + int32_t op_errno, dict_t *xattr) +{ + afr_local_t *local = NULL; + afr_self_heal_t *sh = NULL; + afr_private_t *priv = NULL; + int call_count = 0; + + local = frame->local; + sh = &local->self_heal; + priv = this->private; + + LOCK (&frame->lock); + { + } + UNLOCK (&frame->lock); + + call_count = afr_frame_return (frame); + + if (call_count == 0) + afr_sh_data_finish (frame, this); + + return 0; +} + + +int +afr_sh_data_erase_pending (call_frame_t *frame, xlator_t *this) +{ + afr_local_t *local = NULL; + afr_self_heal_t *sh = NULL; + afr_private_t *priv = NULL; + int call_count = 0; + int i = 0; + dict_t **erase_xattr = NULL; + + + local = frame->local; + sh = &local->self_heal; + priv = this->private; + + + afr_sh_pending_to_delta (sh->pending_matrix, sh->delta_matrix, + sh->success, priv->child_count); + + erase_xattr = CALLOC (sizeof (*erase_xattr), priv->child_count); + + for (i = 0; i < priv->child_count; i++) { + if (sh->xattr[i]) { + call_count++; + + erase_xattr[i] = get_new_dict(); + dict_ref (erase_xattr[i]); + } + } + + afr_sh_delta_to_xattr (sh->delta_matrix, erase_xattr, + 
priv->child_count, AFR_DATA_PENDING); + + local->call_count = call_count; + for (i = 0; i < priv->child_count; i++) { + if (!erase_xattr[i]) + continue; + + gf_log (this->name, GF_LOG_DEBUG, + "erasing pending flags from %s on %s", + local->loc.path, priv->children[i]->name); + + STACK_WIND_COOKIE (frame, afr_sh_data_erase_pending_cbk, + (void *) (long) i, + priv->children[i], + priv->children[i]->fops->xattrop, + &local->loc, + GF_XATTROP_ADD_ARRAY, erase_xattr[i]); + if (!--call_count) + break; + } + + for (i = 0; i < priv->child_count; i++) { + if (erase_xattr[i]) { + dict_unref (erase_xattr[i]); + } + } + FREE (erase_xattr); + + return 0; +} + + +int +afr_sh_data_trim_cbk (call_frame_t *frame, void *cookie, xlator_t *this, + int32_t op_ret, int32_t op_errno, struct stat *buf) +{ + afr_private_t * priv = NULL; + afr_local_t * local = NULL; + afr_self_heal_t *sh = NULL; + int call_count = 0; + int child_index = 0; + + priv = this->private; + local = frame->local; + sh = &local->self_heal; + + child_index = (long) cookie; + + LOCK (&frame->lock); + { + if (op_ret == -1) + gf_log (this->name, GF_LOG_ERROR, + "ftruncate of %s on subvolume %s failed (%s)", + local->loc.path, + priv->children[child_index]->name, + strerror (op_errno)); + else + gf_log (this->name, GF_LOG_DEBUG, + "ftruncate of %s on subvolume %s completed", + local->loc.path, + priv->children[child_index]->name); + } + UNLOCK (&frame->lock); + + call_count = afr_frame_return (frame); + + if (call_count == 0) { + afr_sh_data_erase_pending (frame, this); + } + + return 0; +} + + +int +afr_sh_data_trim_sinks (call_frame_t *frame, xlator_t *this) +{ + afr_private_t * priv = NULL; + afr_local_t * local = NULL; + afr_self_heal_t *sh = NULL; + int *sources = NULL; + int call_count = 0; + int i = 0; + + + priv = this->private; + local = frame->local; + sh = &local->self_heal; + + sources = sh->sources; + call_count = sh->active_sinks; + + local->call_count = call_count; + + for (i = 0; i < priv->child_count; 
i++) { + if (sources[i] || !local->child_up[i]) + continue; + + STACK_WIND_COOKIE (frame, afr_sh_data_trim_cbk, + (void *) (long) i, + priv->children[i], + priv->children[i]->fops->ftruncate, + sh->healing_fd, sh->file_size); + + if (!--call_count) + break; + } + + return 0; +} + + +int +afr_sh_data_read_write_iter (call_frame_t *frame, xlator_t *this); + +int +afr_sh_data_write_cbk (call_frame_t *frame, void *cookie, xlator_t *this, + int32_t op_ret, int32_t op_errno, struct stat *buf) +{ + afr_private_t * priv = NULL; + afr_local_t * local = NULL; + afr_self_heal_t *sh = NULL; + + int child_index = (long) cookie; + int call_count = 0; + + priv = this->private; + local = frame->local; + sh = &local->self_heal; + + gf_log (this->name, GF_LOG_DEBUG, + "wrote %d bytes of data from %s to child %d, offset %"PRId64"", + op_ret, local->loc.path, child_index, sh->offset - op_ret); + + LOCK (&frame->lock); + { + if (op_ret == -1) { + gf_log (this->name, GF_LOG_ERROR, + "write to %s failed on subvolume %s (%s)", + local->loc.path, + priv->children[child_index]->name, + strerror (op_errno)); + sh->op_failed = 1; + } + } + UNLOCK (&frame->lock); + + call_count = afr_frame_return (frame); + + if (call_count == 0) { + afr_sh_data_read_write_iter (frame, this); + } + + return 0; +} + + +int +afr_sh_data_read_cbk (call_frame_t *frame, void *cookie, + xlator_t *this, int32_t op_ret, int32_t op_errno, + struct iovec *vector, int32_t count, struct stat *buf) +{ + afr_private_t * priv = NULL; + afr_local_t * local = NULL; + afr_self_heal_t *sh = NULL; + + int child_index = (long) cookie; + int i = 0; + int call_count = 0; + + off_t offset; + + priv = this->private; + local = frame->local; + sh = &local->self_heal; + + call_count = sh->active_sinks; + + local->call_count = call_count; + + gf_log (this->name, GF_LOG_DEBUG, + "read %d bytes of data from %s on child %d, offset %"PRId64"", + op_ret, local->loc.path, child_index, sh->offset); + + if (op_ret <= 0) { + afr_sh_data_trim_sinks 
(frame, this); + return 0; + } + + /* what if we read less than block size? */ + offset = sh->offset; + sh->offset += op_ret; + + frame->root->req_refs = frame->root->rsp_refs; + + if (sh->file_has_holes) { + if (iov_0filled (vector, count) == 0) { + /* the iter function depends on the + sh->offset already being updated + above + */ + afr_sh_data_read_write_iter (frame, this); + goto out; + } + } + + for (i = 0; i < priv->child_count; i++) { + if (sh->sources[i] || !local->child_up[i]) + continue; + + /* this is a sink, so write to it */ + STACK_WIND_COOKIE (frame, afr_sh_data_write_cbk, + (void *) (long) i, + priv->children[i], + priv->children[i]->fops->writev, + sh->healing_fd, vector, count, offset); + + if (!--call_count) + break; + } + +out: + return 0; +} + + +int +afr_sh_data_read_write (call_frame_t *frame, xlator_t *this) +{ + afr_private_t * priv = NULL; + afr_local_t * local = NULL; + afr_self_heal_t *sh = NULL; + + priv = this->private; + local = frame->local; + sh = &local->self_heal; + + STACK_WIND_COOKIE (frame, afr_sh_data_read_cbk, + (void *) (long) sh->source, + priv->children[sh->source], + priv->children[sh->source]->fops->readv, + sh->healing_fd, sh->block_size, + sh->offset); + + return 0; +} + + +int +afr_sh_data_read_write_iter (call_frame_t *frame, xlator_t *this) +{ + afr_private_t * priv = NULL; + afr_local_t * local = NULL; + afr_self_heal_t *sh = NULL; + + priv = this->private; + local = frame->local; + sh = &local->self_heal; + + if (sh->op_failed) { + afr_sh_data_finish (frame, this); + goto out; + } + + if (sh->offset >= sh->file_size) { + gf_log (this->name, GF_LOG_DEBUG, + "closing fd's of %s", + local->loc.path); + afr_sh_data_trim_sinks (frame, this); + + goto out; + } + + afr_sh_data_read_write (frame, this); + +out: + return 0; +} + + +int +afr_sh_data_open_cbk (call_frame_t *frame, void *cookie, xlator_t *this, + int32_t op_ret, int32_t op_errno, fd_t *fd) +{ + afr_local_t *local = NULL; + afr_self_heal_t *sh = NULL; + 
afr_private_t *priv = NULL; + int call_count = 0; + int child_index = 0; + + local = frame->local; + sh = &local->self_heal; + priv = this->private; + + child_index = (long) cookie; + + /* TODO: some of the open's might fail. + In that case, modify cleanup fn to send flush on those + fd's which are already open */ + + LOCK (&frame->lock); + { + if (op_ret == -1) { + gf_log (this->name, GF_LOG_ERROR, + "open of %s failed on child %s (%s)", + local->loc.path, + priv->children[child_index]->name, + strerror (op_errno)); + sh->op_failed = 1; + } + + } + UNLOCK (&frame->lock); + + call_count = afr_frame_return (frame); + + if (call_count == 0) { + if (sh->op_failed) { + afr_sh_data_finish (frame, this); + return 0; + } + gf_log (this->name, GF_LOG_DEBUG, + "fd for %s opened, commencing sync", + local->loc.path); + + gf_log (this->name, GF_LOG_WARNING, + "sourcing file %s from %s to other sinks", + local->loc.path, priv->children[sh->source]->name); + + afr_sh_data_read_write (frame, this); + } + + return 0; +} + + +int +afr_sh_data_open (call_frame_t *frame, xlator_t *this) +{ + int i = 0; + int call_count = 0; + + int source = -1; + int *sources = NULL; + + fd_t *fd = NULL; + + afr_local_t * local = NULL; + afr_private_t * priv = NULL; + afr_self_heal_t *sh = NULL; + + local = frame->local; + sh = &local->self_heal; + priv = this->private; + + call_count = sh->active_sinks + 1; + local->call_count = call_count; + + fd = fd_create (local->loc.inode, frame->root->pid); + sh->healing_fd = fd; + + source = local->self_heal.source; + sources = local->self_heal.sources; + + sh->block_size = 65536; + sh->file_size = sh->buf[source].st_size; + + if (FILE_HAS_HOLES (&sh->buf[source])) + sh->file_has_holes = 1; + + /* open source */ + STACK_WIND_COOKIE (frame, afr_sh_data_open_cbk, + (void *) (long) source, + priv->children[source], + priv->children[source]->fops->open, + &local->loc, O_RDONLY|O_LARGEFILE, fd); + call_count--; + + /* open sinks */ + for (i = 0; i < 
priv->child_count; i++) { + if(sources[i] || !local->child_up[i]) + continue; + + STACK_WIND_COOKIE (frame, afr_sh_data_open_cbk, + (void *) (long) i, + priv->children[i], + priv->children[i]->fops->open, + &local->loc, + O_WRONLY|O_LARGEFILE, fd); + + if (!--call_count) + break; + } + + return 0; +} + + +int +afr_sh_data_sync_prepare (call_frame_t *frame, xlator_t *this) +{ + afr_local_t *local = NULL; + afr_self_heal_t *sh = NULL; + afr_private_t *priv = NULL; + int active_sinks = 0; + int source = 0; + int i = 0; + + local = frame->local; + sh = &local->self_heal; + priv = this->private; + + source = sh->source; + + for (i = 0; i < priv->child_count; i++) { + if (sh->sources[i] == 0 && local->child_up[i] == 1) { + active_sinks++; + sh->success[i] = 1; + } + } + sh->success[source] = 1; + + if (active_sinks == 0) { + gf_log (this->name, GF_LOG_DEBUG, + "no active sinks for performing self-heal on file %s", + local->loc.path); + afr_sh_data_finish (frame, this); + return 0; + } + sh->active_sinks = active_sinks; + + gf_log (this->name, GF_LOG_DEBUG, + "syncing data of %s from subvolume %s to %d active sinks", + local->loc.path, priv->children[source]->name, active_sinks); + + afr_sh_data_open (frame, this); + + return 0; +} + + +int +afr_sh_data_fix (call_frame_t *frame, xlator_t *this) +{ + afr_local_t *local = NULL; + afr_self_heal_t *sh = NULL; + afr_private_t *priv = NULL; + int nsources = 0; + int source = 0; + int i = 0; + + local = frame->local; + sh = &local->self_heal; + priv = this->private; + + afr_sh_build_pending_matrix (sh->pending_matrix, sh->xattr, + priv->child_count, AFR_DATA_PENDING); + + afr_sh_print_pending_matrix (sh->pending_matrix, this); + + + afr_sh_mark_sources (sh->pending_matrix, sh->sources, + priv->child_count); + + afr_sh_supress_empty_children (sh->sources, sh->xattr, sh->buf, + priv->child_count, AFR_DATA_PENDING); + + afr_sh_supress_errenous_children (sh->sources, sh->child_errno, + priv->child_count); + + nsources = 
afr_sh_source_count (sh->sources, priv->child_count); + + if ((nsources == 0) + && (priv->favorite_child != -1) + && (sh->child_errno[priv->favorite_child] == 0)) { + + gf_log (this->name, GF_LOG_WARNING, + "Picking favorite child %s as authentic source to resolve conflicting data of %s", + priv->children[priv->favorite_child]->name, + local->loc.path); + + sh->sources[priv->favorite_child] = 1; + + nsources = afr_sh_source_count (sh->sources, + priv->child_count); + } + + if (nsources == 0) { + gf_log (this->name, GF_LOG_ERROR, + "Unable to resolve conflicting data of %s. " + "Please resolve manually by deleting the file %s " + "from all but the preferred subvolume. " + "Please consider 'option favorite-child <>'", + local->loc.path, local->loc.path); + + local->govinda_gOvinda = 1; + + afr_sh_data_finish (frame, this); + return 0; + } + + source = afr_sh_select_source (sh->sources, priv->child_count); + sh->source = source; + + /* detect changes not visible through pending flags -- JIC */ + for (i = 0; i < priv->child_count; i++) { + if (i == source || sh->child_errno[i]) + continue; + + if (SIZE_DIFFERS (&sh->buf[i], &sh->buf[source])) + sh->sources[i] = 0; + } + + afr_sh_data_sync_prepare (frame, this); + + return 0; +} + + +int +afr_sh_data_lookup_cbk (call_frame_t *frame, void *cookie, + xlator_t *this, int32_t op_ret, int32_t op_errno, + inode_t *inode, struct stat *buf, dict_t *xattr) +{ + afr_private_t *priv = NULL; + afr_local_t *local = NULL; + afr_self_heal_t *sh = NULL; + + int call_count = -1; + int child_index = (long) cookie; + + local = frame->local; + sh = &local->self_heal; + priv = this->private; + + LOCK (&frame->lock); + { + if (op_ret != -1) { + sh->xattr[child_index] = dict_ref (xattr); + sh->buf[child_index] = *buf; + } + } + UNLOCK (&frame->lock); + + call_count = afr_frame_return (frame); + + if (call_count == 0) { + afr_sh_data_fix (frame, this); + } + + return 0; +} + + +int +afr_sh_data_lookup (call_frame_t *frame, xlator_t *this) +{ + 
afr_self_heal_t *sh = NULL; + afr_local_t *local = NULL; + afr_private_t *priv = NULL; + dict_t *xattr_req = NULL; + + int call_count = 0; + int i = 0; + int ret = 0; + + priv = this->private; + local = frame->local; + sh = &local->self_heal; + + call_count = local->child_count; + + local->call_count = call_count; + + xattr_req = dict_new(); + if (xattr_req) + ret = dict_set_uint64 (xattr_req, AFR_DATA_PENDING, + priv->child_count * sizeof(int32_t)); + + for (i = 0; i < priv->child_count; i++) { + if (local->child_up[i]) { + STACK_WIND_COOKIE (frame, afr_sh_data_lookup_cbk, + (void *) (long) i, + priv->children[i], + priv->children[i]->fops->lookup, + &local->loc, xattr_req); + if (!--call_count) + break; + } + } + + if (xattr_req) + dict_unref (xattr_req); + + return 0; +} + + +int +afr_sh_data_lock_cbk (call_frame_t *frame, void *cookie, xlator_t *this, + int32_t op_ret, int32_t op_errno) +{ + afr_local_t *local = NULL; + afr_self_heal_t *sh = NULL; + int call_count = 0; + int child_index = (long) cookie; + + /* TODO: what if lock fails? */ + + local = frame->local; + sh = &local->self_heal; + + LOCK (&frame->lock); + { + if (op_ret == -1) { + sh->op_failed = 1; + + gf_log (this->name, + (op_errno == EAGAIN ? 
GF_LOG_DEBUG : GF_LOG_ERROR), + "locking of %s on child %d failed: %s", + local->loc.path, child_index, + strerror (op_errno)); + } else { + gf_log (this->name, GF_LOG_DEBUG, + "inode of %s on child %d locked", + local->loc.path, child_index); + } + } + UNLOCK (&frame->lock); + + call_count = afr_frame_return (frame); + + if (call_count == 0) { + if (sh->op_failed) { + afr_sh_data_finish (frame, this); + return 0; + } + + afr_sh_data_lookup (frame, this); + } + + return 0; +} + + +int +afr_sh_data_lock (call_frame_t *frame, xlator_t *this) +{ + struct flock flock; + int i = 0; + int call_count = 0; + + afr_local_t * local = NULL; + afr_private_t * priv = NULL; + afr_self_heal_t * sh = NULL; + + + local = frame->local; + sh = &local->self_heal; + priv = this->private; + + call_count = local->child_count; + + local->call_count = call_count; + + flock.l_start = 0; + flock.l_len = 0; + flock.l_type = F_WRLCK; + + for (i = 0; i < priv->child_count; i++) { + if (local->child_up[i]) { + gf_log (this->name, GF_LOG_DEBUG, + "locking %s on subvolume %s", + local->loc.path, priv->children[i]->name); + + STACK_WIND_COOKIE (frame, afr_sh_data_lock_cbk, + (void *) (long) i, + priv->children[i], + priv->children[i]->fops->inodelk, + &local->loc, F_SETLK, &flock); + if (!--call_count) + break; + } + } + + return 0; +} + + +int +afr_self_heal_data (call_frame_t *frame, xlator_t *this) +{ + afr_local_t *local = NULL; + afr_self_heal_t *sh = NULL; + afr_private_t *priv = this->private; + + + local = frame->local; + sh = &local->self_heal; + + if (local->need_data_self_heal && priv->data_self_heal) { + afr_sh_data_lock (frame, this); + } else { + gf_log (this->name, GF_LOG_DEBUG, + "not doing data self heal on %s", + local->loc.path); + afr_sh_data_done (frame, this); + } + + return 0; +} + diff --git a/xlators/cluster/afr/src/afr-self-heal-entry.c b/xlators/cluster/afr/src/afr-self-heal-entry.c new file mode 100644 index 000000000..ec341922e --- /dev/null +++ 
b/xlators/cluster/afr/src/afr-self-heal-entry.c @@ -0,0 +1,2038 @@ +/* + Copyright (c) 2008 Z RESEARCH, Inc. <http://www.zresearch.com> + This file is part of GlusterFS. + + GlusterFS is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published + by the Free Software Foundation; either version 3 of the License, + or (at your option) any later version. + + GlusterFS is distributed in the hope that it will be useful, but + WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program. If not, see + <http://www.gnu.org/licenses/>. +*/ + +#include <libgen.h> +#include <unistd.h> +#include <fnmatch.h> +#include <sys/time.h> +#include <stdlib.h> +#include <signal.h> + +#ifndef _CONFIG_H +#define _CONFIG_H +#include "config.h" +#endif + +#include "glusterfs.h" +#include "afr.h" +#include "dict.h" +#include "xlator.h" +#include "hashfn.h" +#include "logging.h" +#include "stack.h" +#include "list.h" +#include "call-stub.h" +#include "defaults.h" +#include "common-utils.h" +#include "compat-errno.h" +#include "compat.h" +#include "byte-order.h" + +#include "afr-transaction.h" +#include "afr-self-heal.h" +#include "afr-self-heal-common.h" + + + +int +afr_sh_entry_done (call_frame_t *frame, xlator_t *this) +{ + afr_local_t *local = NULL; + afr_self_heal_t *sh = NULL; + afr_private_t *priv = NULL; + + local = frame->local; + sh = &local->self_heal; + priv = this->private; + + /* + TODO: cleanup sh->* + */ + + gf_log (this->name, GF_LOG_DEBUG, + "self heal of %s completed", + local->loc.path); + + sh->completion_cbk (frame, this); + + return 0; +} + + +int +afr_sh_entry_unlck_cbk (call_frame_t *frame, void *cookie, xlator_t *this, + int32_t op_ret, int32_t op_errno) +{ + afr_local_t *local = NULL; + afr_self_heal_t 
*sh = NULL; + int call_count = 0; + int child_index = (long) cookie; + + /* TODO: what if lock fails? */ + + local = frame->local; + sh = &local->self_heal; + + LOCK (&frame->lock); + { + if (op_ret == -1) { + gf_log (this->name, GF_LOG_ERROR, + "unlocking inode of %s on child %d failed: %s", + local->loc.path, child_index, + strerror (op_errno)); + } else { + gf_log (this->name, GF_LOG_DEBUG, + "unlocked inode of %s on child %d", + local->loc.path, child_index); + } + } + UNLOCK (&frame->lock); + + call_count = afr_frame_return (frame); + + if (call_count == 0) { + if (sh->healing_fd) + fd_unref (sh->healing_fd); + sh->healing_fd = NULL; + afr_sh_entry_done (frame, this); + } + + return 0; +} + + +int +afr_sh_entry_unlock (call_frame_t *frame, xlator_t *this) +{ + int i = 0; + int call_count = 0; + + afr_local_t * local = NULL; + afr_private_t * priv = NULL; + afr_self_heal_t * sh = NULL; + + + local = frame->local; + sh = &local->self_heal; + priv = this->private; + + call_count = local->child_count; + + local->call_count = call_count; + + for (i = 0; i < priv->child_count; i++) { + if (local->child_up[i]) { + gf_log (this->name, GF_LOG_DEBUG, + "unlocking %s on subvolume %s", + local->loc.path, priv->children[i]->name); + + STACK_WIND_COOKIE (frame, afr_sh_entry_unlck_cbk, + (void *) (long) i, + priv->children[i], + priv->children[i]->fops->entrylk, + &local->loc, NULL, + ENTRYLK_UNLOCK, ENTRYLK_WRLCK); + if (!--call_count) + break; + } + } + + return 0; +} + + +int +afr_sh_entry_finish (call_frame_t *frame, xlator_t *this) +{ + afr_local_t *local = NULL; + + local = frame->local; + + gf_log (this->name, GF_LOG_DEBUG, + "finishing entry selfheal of %s", local->loc.path); + + afr_sh_entry_unlock (frame, this); + + return 0; +} + + +int +afr_sh_entry_erase_pending_cbk (call_frame_t *frame, void *cookie, + xlator_t *this, int32_t op_ret, + int32_t op_errno, dict_t *xattr) +{ + afr_local_t *local = NULL; + afr_self_heal_t *sh = NULL; + afr_private_t *priv = NULL; + 
int call_count = 0; + + local = frame->local; + sh = &local->self_heal; + priv = this->private; + + LOCK (&frame->lock); + { + } + UNLOCK (&frame->lock); + + call_count = afr_frame_return (frame); + + if (call_count == 0) + afr_sh_entry_finish (frame, this); + + return 0; +} + + +int +afr_sh_entry_erase_pending (call_frame_t *frame, xlator_t *this) +{ + afr_local_t *local = NULL; + afr_self_heal_t *sh = NULL; + afr_private_t *priv = NULL; + int call_count = 0; + int i = 0; + dict_t **erase_xattr = NULL; + + + local = frame->local; + sh = &local->self_heal; + priv = this->private; + + + afr_sh_pending_to_delta (sh->pending_matrix, sh->delta_matrix, + sh->success, priv->child_count); + + erase_xattr = CALLOC (sizeof (*erase_xattr), priv->child_count); + + for (i = 0; i < priv->child_count; i++) { + if (sh->xattr[i]) { + call_count++; + + erase_xattr[i] = get_new_dict(); + dict_ref (erase_xattr[i]); + } + } + + afr_sh_delta_to_xattr (sh->delta_matrix, erase_xattr, + priv->child_count, AFR_ENTRY_PENDING); + + local->call_count = call_count; + for (i = 0; i < priv->child_count; i++) { + if (!erase_xattr[i]) + continue; + + gf_log (this->name, GF_LOG_DEBUG, + "erasing pending flags from %s on %s", + local->loc.path, priv->children[i]->name); + + STACK_WIND_COOKIE (frame, afr_sh_entry_erase_pending_cbk, + (void *) (long) i, + priv->children[i], + priv->children[i]->fops->xattrop, + &local->loc, + GF_XATTROP_ADD_ARRAY, erase_xattr[i]); + if (!--call_count) + break; + } + + for (i = 0; i < priv->child_count; i++) { + if (erase_xattr[i]) { + dict_unref (erase_xattr[i]); + } + } + FREE (erase_xattr); + + return 0; +} + + + +static int +next_active_source (call_frame_t *frame, xlator_t *this, + int current_active_source) +{ + afr_private_t *priv = NULL; + afr_local_t *local = NULL; + afr_self_heal_t *sh = NULL; + int source = -1; + int next_active_source = -1; + int i = 0; + + priv = this->private; + local = frame->local; + sh = &local->self_heal; + + source = sh->source; + + 
if (source != -1) { + if (current_active_source != source) + next_active_source = source; + goto out; + } + + /* + the next active sink becomes the source for the + 'conservative decision' of merging all entries + */ + + for (i = 0; i < priv->child_count; i++) { + if ((sh->sources[i] == 0) + && (local->child_up[i] == 1) + && (i > current_active_source)) { + + next_active_source = i; + break; + } + } +out: + return next_active_source; +} + + + +static int +next_active_sink (call_frame_t *frame, xlator_t *this, + int current_active_sink) +{ + afr_private_t *priv = NULL; + afr_local_t *local = NULL; + afr_self_heal_t *sh = NULL; + int next_active_sink = -1; + int i = 0; + + priv = this->private; + local = frame->local; + sh = &local->self_heal; + + /* + the next active sink becomes the source for the + 'conservative decision' of merging all entries + */ + + for (i = 0; i < priv->child_count; i++) { + if ((sh->sources[i] == 0) + && (local->child_up[i] == 1) + && (i > current_active_sink)) { + + next_active_sink = i; + break; + } + } + + return next_active_sink; +} + + +int +build_child_loc (xlator_t *this, loc_t *child, loc_t *parent, char *name) +{ + int ret = -1; + + if (!child) { + goto out; + } + + if (strcmp (parent->path, "/") == 0) + asprintf ((char **)&child->path, "/%s", name); + else + asprintf ((char **)&child->path, "%s/%s", parent->path, name); + + if (!child->path) { + gf_log (this->name, GF_LOG_ERROR, + "out of memory :("); + goto out; + } + + child->name = strrchr (child->path, '/'); + if (child->name) + child->name++; + + child->parent = inode_ref (parent->inode); + child->inode = inode_new (parent->inode->table); + + if (!child->inode) { + gf_log (this->name, GF_LOG_ERROR, + "out of memory :("); + goto out; + } + + ret = 0; +out: + if (ret == -1) + loc_wipe (child); + + return ret; +} + + +int +afr_sh_entry_expunge_all (call_frame_t *frame, xlator_t *this); + +int +afr_sh_entry_expunge_subvol (call_frame_t *frame, xlator_t *this, + int active_src); + 
+int +afr_sh_entry_expunge_entry_done (call_frame_t *frame, xlator_t *this, + int active_src) +{ + afr_private_t *priv = NULL; + afr_local_t *local = NULL; + afr_self_heal_t *sh = NULL; + int call_count = 0; + + priv = this->private; + local = frame->local; + sh = &local->self_heal; + + LOCK (&frame->lock); + { + } + UNLOCK (&frame->lock); + + call_count = afr_frame_return (frame); + + if (call_count == 0) + afr_sh_entry_expunge_subvol (frame, this, active_src); + + return 0; +} + + +int +afr_sh_entry_expunge_remove_cbk (call_frame_t *expunge_frame, void *cookie, + xlator_t *this, + int32_t op_ret, int32_t op_errno) +{ + afr_private_t *priv = NULL; + afr_local_t *expunge_local = NULL; + afr_self_heal_t *expunge_sh = NULL; + int active_src = 0; + call_frame_t *frame = NULL; + + + priv = this->private; + expunge_local = expunge_frame->local; + expunge_sh = &expunge_local->self_heal; + frame = expunge_sh->sh_frame; + + active_src = (long) cookie; + + if (op_ret == 0) { + gf_log (this->name, GF_LOG_DEBUG, + "removed %s on %s", + expunge_local->loc.path, + priv->children[active_src]->name); + } else { + gf_log (this->name, GF_LOG_ERROR, + "removing %s on %s failed (%s)", + expunge_local->loc.path, + priv->children[active_src]->name, + strerror (op_errno)); + } + + AFR_STACK_DESTROY (expunge_frame); + afr_sh_entry_expunge_entry_done (frame, this, active_src); + + return 0; +} + + +int +afr_sh_entry_expunge_rmdir (call_frame_t *expunge_frame, xlator_t *this, + int active_src) +{ + afr_private_t *priv = NULL; + afr_local_t *expunge_local = NULL; + + priv = this->private; + expunge_local = expunge_frame->local; + + gf_log (this->name, GF_LOG_WARNING, + "removing directory %s on %s", + expunge_local->loc.path, priv->children[active_src]->name); + + STACK_WIND_COOKIE (expunge_frame, afr_sh_entry_expunge_remove_cbk, + (void *) (long) active_src, + priv->children[active_src], + priv->children[active_src]->fops->rmdir, + &expunge_local->loc); + + return 0; +} + + +int 
+afr_sh_entry_expunge_unlink (call_frame_t *expunge_frame, xlator_t *this, + int active_src) +{ + afr_private_t *priv = NULL; + afr_local_t *expunge_local = NULL; + + priv = this->private; + expunge_local = expunge_frame->local; + + gf_log (this->name, GF_LOG_WARNING, + "unlinking file %s on %s", + expunge_local->loc.path, priv->children[active_src]->name); + + STACK_WIND_COOKIE (expunge_frame, afr_sh_entry_expunge_remove_cbk, + (void *) (long) active_src, + priv->children[active_src], + priv->children[active_src]->fops->unlink, + &expunge_local->loc); + + return 0; +} + + +int +afr_sh_entry_expunge_remove (call_frame_t *expunge_frame, xlator_t *this, + int active_src, struct stat *buf) +{ + afr_private_t *priv = NULL; + afr_local_t *expunge_local = NULL; + afr_self_heal_t *expunge_sh = NULL; + int source = 0; + call_frame_t *frame = NULL; + int type = 0; + + priv = this->private; + expunge_local = expunge_frame->local; + expunge_sh = &expunge_local->self_heal; + frame = expunge_sh->sh_frame; + source = expunge_sh->source; + + type = (buf->st_mode & S_IFMT); + + switch (type) { + case S_IFSOCK: + case S_IFREG: + case S_IFBLK: + case S_IFCHR: + case S_IFIFO: + case S_IFLNK: + afr_sh_entry_expunge_unlink (expunge_frame, this, active_src); + + break; + case S_IFDIR: + afr_sh_entry_expunge_rmdir (expunge_frame, this, active_src); + break; + default: + gf_log (this->name, GF_LOG_ERROR, + "%s has unknown file type on %s: 0%o", + expunge_local->loc.path, + priv->children[source]->name, type); + goto out; + break; + } + + return 0; +out: + AFR_STACK_DESTROY (expunge_frame); + afr_sh_entry_expunge_entry_done (frame, this, active_src); + + return 0; +} + + +int +afr_sh_entry_expunge_lookup_cbk (call_frame_t *expunge_frame, void *cookie, + xlator_t *this, + int32_t op_ret, int32_t op_errno, + inode_t *inode, struct stat *buf, dict_t *x) +{ + afr_private_t *priv = NULL; + afr_local_t *expunge_local = NULL; + afr_self_heal_t *expunge_sh = NULL; + call_frame_t *frame = NULL; + 
int active_src = 0; + + priv = this->private; + expunge_local = expunge_frame->local; + expunge_sh = &expunge_local->self_heal; + frame = expunge_sh->sh_frame; + active_src = (long) cookie; + + if (op_ret == -1) { + gf_log (this->name, GF_LOG_ERROR, + "lookup of %s on %s failed (%s)", + expunge_local->loc.path, + priv->children[active_src]->name, + strerror (op_errno)); + goto out; + } + + afr_sh_entry_expunge_remove (expunge_frame, this, active_src, buf); + + return 0; +out: + AFR_STACK_DESTROY (expunge_frame); + afr_sh_entry_expunge_entry_done (frame, this, active_src); + + return 0; +} + + +int +afr_sh_entry_expunge_purge (call_frame_t *expunge_frame, xlator_t *this, + int active_src) +{ + afr_private_t *priv = NULL; + afr_local_t *expunge_local = NULL; + + priv = this->private; + expunge_local = expunge_frame->local; + + gf_log (this->name, GF_LOG_DEBUG, + "looking up %s on %s", + expunge_local->loc.path, priv->children[active_src]->name); + + STACK_WIND_COOKIE (expunge_frame, afr_sh_entry_expunge_lookup_cbk, + (void *) (long) active_src, + priv->children[active_src], + priv->children[active_src]->fops->lookup, + &expunge_local->loc, 0); + + return 0; +} + + +int +afr_sh_entry_expunge_entry_cbk (call_frame_t *expunge_frame, void *cookie, + xlator_t *this, + int32_t op_ret, int32_t op_errno, + inode_t *inode, struct stat *buf, dict_t *x) +{ + afr_private_t *priv = NULL; + afr_local_t *expunge_local = NULL; + afr_self_heal_t *expunge_sh = NULL; + int source = 0; + call_frame_t *frame = NULL; + int active_src = 0; + + + priv = this->private; + expunge_local = expunge_frame->local; + expunge_sh = &expunge_local->self_heal; + frame = expunge_sh->sh_frame; + active_src = expunge_sh->active_source; + source = (long) cookie; + + if (op_ret == -1 && op_errno == ENOENT) { + + gf_log (this->name, GF_LOG_DEBUG, + "missing entry %s on %s", + expunge_local->loc.path, + priv->children[source]->name); + + afr_sh_entry_expunge_purge (expunge_frame, this, active_src); + + return 
0; + } + + if (op_ret == 0) { + gf_log (this->name, GF_LOG_DEBUG, + "%s exists under %s", + expunge_local->loc.path, + priv->children[source]->name); + } else { + gf_log (this->name, GF_LOG_ERROR, + "looking up %s under %s failed (%s)", + expunge_local->loc.path, + priv->children[source]->name, + strerror (op_errno)); + } + + AFR_STACK_DESTROY (expunge_frame); + afr_sh_entry_expunge_entry_done (frame, this, active_src); + + return 0; +} + + +int +afr_sh_entry_expunge_entry (call_frame_t *frame, xlator_t *this, + char *name) +{ + afr_private_t *priv = NULL; + afr_local_t *local = NULL; + afr_self_heal_t *sh = NULL; + int ret = -1; + call_frame_t *expunge_frame = NULL; + afr_local_t *expunge_local = NULL; + afr_self_heal_t *expunge_sh = NULL; + int active_src = 0; + int source = 0; + int op_errno = 0; + + priv = this->private; + local = frame->local; + sh = &local->self_heal; + + active_src = sh->active_source; + source = sh->source; + + if ((strcmp (name, ".") == 0) + || (strcmp (name, "..") == 0)) { + gf_log (this->name, GF_LOG_DEBUG, + "skipping inspection of %s under %s", + name, local->loc.path); + goto out; + } + + gf_log (this->name, GF_LOG_DEBUG, + "inspecting existance of %s under %s", + name, local->loc.path); + + expunge_frame = copy_frame (frame); + if (!expunge_frame) { + gf_log (this->name, GF_LOG_ERROR, + "out of memory :("); + goto out; + } + + ALLOC_OR_GOTO (expunge_local, afr_local_t, out); + + expunge_frame->local = expunge_local; + expunge_sh = &expunge_local->self_heal; + expunge_sh->sh_frame = frame; + expunge_sh->active_source = active_src; + + ret = build_child_loc (this, &expunge_local->loc, &local->loc, name); + if (ret != 0) { + goto out; + } + + gf_log (this->name, GF_LOG_DEBUG, + "looking up %s on %s", expunge_local->loc.path, + priv->children[source]->name); + + STACK_WIND_COOKIE (expunge_frame, + afr_sh_entry_expunge_entry_cbk, + (void *) (long) source, + priv->children[source], + priv->children[source]->fops->lookup, + 
&expunge_local->loc, 0); + + ret = 0; +out: + if (ret == -1) + afr_sh_entry_expunge_entry_done (frame, this, active_src); + + return 0; +} + + +int +afr_sh_entry_expunge_readdir_cbk (call_frame_t *frame, void *cookie, + xlator_t *this, + int32_t op_ret, int32_t op_errno, + gf_dirent_t *entries) +{ + afr_private_t *priv = NULL; + afr_local_t *local = NULL; + afr_self_heal_t *sh = NULL; + gf_dirent_t *entry = NULL; + off_t last_offset = 0; + int active_src = 0; + int entry_count = 0; + + priv = this->private; + local = frame->local; + sh = &local->self_heal; + + active_src = sh->active_source; + + if (op_ret <= 0) { + if (op_ret < 0) { + gf_log (this->name, GF_LOG_ERROR, + "readdir of %s on subvolume %s failed (%s)", + local->loc.path, + priv->children[active_src]->name, + strerror (op_errno)); + } else { + gf_log (this->name, GF_LOG_DEBUG, + "readdir of %s on subvolume %s complete", + local->loc.path, + priv->children[active_src]->name); + } + + afr_sh_entry_expunge_all (frame, this); + return 0; + } + + list_for_each_entry (entry, &entries->list, list) { + last_offset = entry->d_off; + entry_count++; + } + + gf_log (this->name, GF_LOG_DEBUG, + "readdir'ed %d entries from %s", + entry_count, priv->children[active_src]->name); + + sh->offset = last_offset; + local->call_count = entry_count; + + list_for_each_entry (entry, &entries->list, list) { + afr_sh_entry_expunge_entry (frame, this, entry->d_name); + } + + return 0; +} + +int +afr_sh_entry_expunge_subvol (call_frame_t *frame, xlator_t *this, + int active_src) +{ + afr_private_t *priv = NULL; + afr_local_t *local = NULL; + afr_self_heal_t *sh = NULL; + + priv = this->private; + local = frame->local; + sh = &local->self_heal; + + STACK_WIND (frame, afr_sh_entry_expunge_readdir_cbk, + priv->children[active_src], + priv->children[active_src]->fops->readdir, + sh->healing_fd, sh->block_size, sh->offset); + + return 0; +} + + +int +afr_sh_entry_expunge_all (call_frame_t *frame, xlator_t *this) +{ + afr_private_t *priv 
= NULL; + afr_local_t *local = NULL; + afr_self_heal_t *sh = NULL; + int active_src = -1; + + priv = this->private; + local = frame->local; + sh = &local->self_heal; + + sh->offset = 0; + + if (sh->source == -1) { + gf_log (this->name, GF_LOG_DEBUG, + "no active sources for %s to expunge entries", + local->loc.path); + goto out; + } + + active_src = next_active_sink (frame, this, sh->active_source); + sh->active_source = active_src; + + if (sh->op_failed) { + goto out; + } + + if (active_src == -1) { + /* completed creating missing files on all subvolumes */ + goto out; + } + + gf_log (this->name, GF_LOG_DEBUG, + "expunging entries of %s on %s to other sinks", + local->loc.path, priv->children[active_src]->name); + + afr_sh_entry_expunge_subvol (frame, this, active_src); + + return 0; +out: + afr_sh_entry_erase_pending (frame, this); + return 0; + +} + + +int +afr_sh_entry_impunge_all (call_frame_t *frame, xlator_t *this); + +int +afr_sh_entry_impunge_subvol (call_frame_t *frame, xlator_t *this, + int active_src); + +int +afr_sh_entry_impunge_entry_done (call_frame_t *frame, xlator_t *this, + int active_src) +{ + afr_private_t *priv = NULL; + afr_local_t *local = NULL; + afr_self_heal_t *sh = NULL; + int call_count = 0; + + priv = this->private; + local = frame->local; + sh = &local->self_heal; + + LOCK (&frame->lock); + { + } + UNLOCK (&frame->lock); + + call_count = afr_frame_return (frame); + + if (call_count == 0) + afr_sh_entry_impunge_subvol (frame, this, active_src); + + return 0; +} + + +int +afr_sh_entry_impunge_utimens_cbk (call_frame_t *impunge_frame, void *cookie, + xlator_t *this, int32_t op_ret, + int32_t op_errno, struct stat *stbuf) +{ + int call_count = 0; + afr_private_t *priv = NULL; + afr_local_t *impunge_local = NULL; + afr_self_heal_t *impunge_sh = NULL; + call_frame_t *frame = NULL; + int active_src = 0; + int child_index = 0; + + priv = this->private; + impunge_local = impunge_frame->local; + impunge_sh = &impunge_local->self_heal; + frame = 
impunge_sh->sh_frame; + child_index = (long) cookie; + + if (op_ret == 0) { + gf_log (this->name, GF_LOG_DEBUG, + "utimes set for %s on %s", + impunge_local->loc.path, + priv->children[child_index]->name); + } else { + gf_log (this->name, GF_LOG_ERROR, + "setting utimes of %s on %s failed (%s)", + impunge_local->loc.path, + priv->children[child_index]->name, + strerror (op_errno)); + } + + LOCK (&impunge_frame->lock); + { + call_count = --impunge_local->call_count; + } + UNLOCK (&impunge_frame->lock); + + if (call_count == 0) { + AFR_STACK_DESTROY (impunge_frame); + afr_sh_entry_impunge_entry_done (frame, this, active_src); + } + + return 0; +} + + +int +afr_sh_entry_impunge_chown_cbk (call_frame_t *impunge_frame, void *cookie, + xlator_t *this, int32_t op_ret, + int32_t op_errno, struct stat *stbuf) +{ + int call_count = 0; + afr_private_t *priv = NULL; + afr_local_t *impunge_local = NULL; + afr_self_heal_t *impunge_sh = NULL; + call_frame_t *frame = NULL; + int active_src = 0; + int child_index = 0; + struct timespec ts[2]; + + + priv = this->private; + impunge_local = impunge_frame->local; + impunge_sh = &impunge_local->self_heal; + frame = impunge_sh->sh_frame; + child_index = (long) cookie; + + if (op_ret == 0) { + gf_log (this->name, GF_LOG_DEBUG, + "ownership of %s on %s changed", + impunge_local->loc.path, + priv->children[child_index]->name); + } else { + gf_log (this->name, GF_LOG_ERROR, + "setting ownership of %s on %s failed (%s)", + impunge_local->loc.path, + priv->children[child_index]->name, + strerror (op_errno)); + goto out; + } + +#ifdef HAVE_STRUCT_STAT_ST_ATIM_TV_NSEC + ts[0] = impunge_local->cont.lookup.buf.st_atim; + ts[1] = impunge_local->cont.lookup.buf.st_mtim; +#elif HAVE_STRUCT_STAT_ST_ATIMESPEC_TV_NSEC + ts[0] = impunge_local->cont.lookup.buf.st_atimespec; + ts[1] = impunge_local->cont.lookup.buf.st_mtimespec; +#else + ts[0].tv_sec = impunge_local->cont.lookup.buf.st_atime; + ts[1].tv_sec = impunge_local->cont.lookup.buf.st_mtime; 
+#endif + STACK_WIND_COOKIE (impunge_frame, + afr_sh_entry_impunge_utimens_cbk, + (void *) (long) child_index, + priv->children[child_index], + priv->children[child_index]->fops->utimens, + &impunge_local->loc, ts); + + return 0; + +out: + LOCK (&impunge_frame->lock); + { + call_count = --impunge_local->call_count; + } + UNLOCK (&impunge_frame->lock); + + if (call_count == 0) { + AFR_STACK_DESTROY (impunge_frame); + afr_sh_entry_impunge_entry_done (frame, this, active_src); + } + + return 0; +} + + +int +afr_sh_entry_impunge_newfile_cbk (call_frame_t *impunge_frame, void *cookie, + xlator_t *this, + int32_t op_ret, int32_t op_errno, + inode_t *inode, struct stat *stbuf) +{ + int call_count = 0; + afr_private_t *priv = NULL; + afr_local_t *impunge_local = NULL; + afr_self_heal_t *impunge_sh = NULL; + call_frame_t *frame = NULL; + int active_src = 0; + int child_index = 0; + + priv = this->private; + impunge_local = impunge_frame->local; + impunge_sh = &impunge_local->self_heal; + frame = impunge_sh->sh_frame; + + child_index = (long) cookie; + + if (op_ret == -1) { + gf_log (this->name, GF_LOG_ERROR, + "creation of %s on %s failed (%s)", + impunge_local->loc.path, + priv->children[child_index]->name, + strerror (op_errno)); + goto out; + } + + gf_log (this->name, GF_LOG_DEBUG, + "setting ownership of %s on %s to %d/%d", + impunge_local->loc.path, + priv->children[child_index]->name, + impunge_local->cont.lookup.buf.st_uid, + impunge_local->cont.lookup.buf.st_gid); + + inode->st_mode = stbuf->st_mode; + + STACK_WIND_COOKIE (impunge_frame, afr_sh_entry_impunge_chown_cbk, + (void *) (long) child_index, + priv->children[child_index], + priv->children[child_index]->fops->chown, + &impunge_local->loc, + impunge_local->cont.lookup.buf.st_uid, + impunge_local->cont.lookup.buf.st_gid); + return 0; + +out: + LOCK (&impunge_frame->lock); + { + call_count = --impunge_local->call_count; + } + UNLOCK (&impunge_frame->lock); + + if (call_count == 0) { + AFR_STACK_DESTROY 
(impunge_frame); + afr_sh_entry_impunge_entry_done (frame, this, active_src); + } + + return 0; +} + + +int +afr_sh_entry_impunge_mknod (call_frame_t *impunge_frame, xlator_t *this, + int child_index, struct stat *stbuf) +{ + afr_private_t *priv = NULL; + afr_local_t *impunge_local = NULL; + afr_self_heal_t *impunge_sh = NULL; + + + priv = this->private; + impunge_local = impunge_frame->local; + impunge_sh = &impunge_local->self_heal; + + gf_log (this->name, GF_LOG_WARNING, + "creating file %s mode=0%o dev=0x%"GF_PRI_DEV" on %s", + impunge_local->loc.path, + stbuf->st_mode, stbuf->st_rdev, + priv->children[child_index]->name); + + STACK_WIND_COOKIE (impunge_frame, afr_sh_entry_impunge_newfile_cbk, + (void *) (long) child_index, + priv->children[child_index], + priv->children[child_index]->fops->mknod, + &impunge_local->loc, + stbuf->st_mode, stbuf->st_rdev); + + return 0; +} + + + +int +afr_sh_entry_impunge_mkdir (call_frame_t *impunge_frame, xlator_t *this, + int child_index, struct stat *stbuf) +{ + afr_private_t *priv = NULL; + afr_local_t *impunge_local = NULL; + afr_self_heal_t *impunge_sh = NULL; + + + priv = this->private; + impunge_local = impunge_frame->local; + impunge_sh = &impunge_local->self_heal; + + gf_log (this->name, GF_LOG_WARNING, + "creating directory %s mode=0%o on %s", + impunge_local->loc.path, + stbuf->st_mode, + priv->children[child_index]->name); + + STACK_WIND_COOKIE (impunge_frame, afr_sh_entry_impunge_newfile_cbk, + (void *) (long) child_index, + priv->children[child_index], + priv->children[child_index]->fops->mkdir, + &impunge_local->loc, stbuf->st_mode); + + return 0; +} + + +int +afr_sh_entry_impunge_symlink (call_frame_t *impunge_frame, xlator_t *this, + int child_index, const char *linkname) +{ + afr_private_t *priv = NULL; + afr_local_t *impunge_local = NULL; + afr_self_heal_t *impunge_sh = NULL; + + + priv = this->private; + impunge_local = impunge_frame->local; + impunge_sh = &impunge_local->self_heal; + + gf_log (this->name, 
GF_LOG_WARNING, + "creating symlink %s -> %s on %s", + impunge_local->loc.path, linkname, + priv->children[child_index]->name); + + STACK_WIND_COOKIE (impunge_frame, afr_sh_entry_impunge_newfile_cbk, + (void *) (long) child_index, + priv->children[child_index], + priv->children[child_index]->fops->symlink, + linkname, &impunge_local->loc); + + return 0; +} + + +int +afr_sh_entry_impunge_readlink_cbk (call_frame_t *impunge_frame, void *cookie, + xlator_t *this, + int32_t op_ret, int32_t op_errno, + const char *linkname) +{ + afr_private_t *priv = NULL; + afr_local_t *impunge_local = NULL; + afr_self_heal_t *impunge_sh = NULL; + int child_index = -1; + call_frame_t *frame = NULL; + int call_count = -1; + int active_src = -1; + + priv = this->private; + impunge_local = impunge_frame->local; + impunge_sh = &impunge_local->self_heal; + frame = impunge_sh->sh_frame; + active_src = impunge_sh->active_source; + + child_index = (long) cookie; + + if (op_ret == -1) { + gf_log (this->name, GF_LOG_ERROR, + "readlink of %s on %s failed (%s)", + impunge_local->loc.path, + priv->children[active_src]->name, + strerror (op_errno)); + goto out; + } + + afr_sh_entry_impunge_symlink (impunge_frame, this, child_index, + linkname); + return 0; + +out: + LOCK (&impunge_frame->lock); + { + call_count = --impunge_local->call_count; + } + UNLOCK (&impunge_frame->lock); + + if (call_count == 0) { + AFR_STACK_DESTROY (impunge_frame); + afr_sh_entry_impunge_entry_done (frame, this, active_src); + } + + return 0; +} + + +int +afr_sh_entry_impunge_readlink (call_frame_t *impunge_frame, xlator_t *this, + int child_index, struct stat *stbuf) +{ + afr_private_t *priv = NULL; + afr_local_t *impunge_local = NULL; + afr_self_heal_t *impunge_sh = NULL; + int active_src = -1; + + priv = this->private; + impunge_local = impunge_frame->local; + impunge_sh = &impunge_local->self_heal; + active_src = impunge_sh->active_source; + + STACK_WIND_COOKIE (impunge_frame, afr_sh_entry_impunge_readlink_cbk, + (void 
*) (long) child_index, + priv->children[active_src], + priv->children[active_src]->fops->readlink, + &impunge_local->loc, 4096); + + return 0; +} + + +int +afr_sh_entry_impunge_recreate_lookup_cbk (call_frame_t *impunge_frame, + void *cookie, xlator_t *this, + int32_t op_ret, int32_t op_errno, + inode_t *inode, struct stat *buf, + dict_t *xattr) +{ + afr_private_t *priv = NULL; + afr_local_t *impunge_local = NULL; + afr_self_heal_t *impunge_sh = NULL; + int active_src = 0; + int type = 0; + int child_index = 0; + call_frame_t *frame = NULL; + int call_count = 0; + + priv = this->private; + impunge_local = impunge_frame->local; + impunge_sh = &impunge_local->self_heal; + frame = impunge_sh->sh_frame; + + child_index = (long) cookie; + + active_src = impunge_sh->active_source; + + if (op_ret != 0) { + gf_log (this->name, GF_LOG_ERROR, + "looking up %s on %s (for %s) failed (%s)", + impunge_local->loc.path, + priv->children[active_src]->name, + priv->children[child_index]->name, + strerror (op_errno)); + goto out; + } + + impunge_local->cont.lookup.buf = *buf; + type = (buf->st_mode & S_IFMT); + + switch (type) { + case S_IFSOCK: + case S_IFREG: + case S_IFBLK: + case S_IFCHR: + case S_IFIFO: + afr_sh_entry_impunge_mknod (impunge_frame, this, + child_index, buf); + break; + case S_IFLNK: + afr_sh_entry_impunge_readlink (impunge_frame, this, + child_index, buf); + break; + case S_IFDIR: + afr_sh_entry_impunge_mkdir (impunge_frame, this, + child_index, buf); + break; + default: + gf_log (this->name, GF_LOG_ERROR, + "%s has unknown file type on %s: 0%o", + impunge_local->loc.path, + priv->children[active_src]->name, type); + goto out; + break; + } + + return 0; + +out: + LOCK (&impunge_frame->lock); + { + call_count = --impunge_local->call_count; + } + UNLOCK (&impunge_frame->lock); + + if (call_count == 0) { + AFR_STACK_DESTROY (impunge_frame); + afr_sh_entry_impunge_entry_done (frame, this, active_src); + } + + return 0; +} + + +int +afr_sh_entry_impunge_recreate 
(call_frame_t *impunge_frame, xlator_t *this, + int child_index) +{ + afr_private_t *priv = NULL; + afr_local_t *impunge_local = NULL; + afr_self_heal_t *impunge_sh = NULL; + int active_src = 0; + + + priv = this->private; + impunge_local = impunge_frame->local; + impunge_sh = &impunge_local->self_heal; + + active_src = impunge_sh->active_source; + + STACK_WIND_COOKIE (impunge_frame, + afr_sh_entry_impunge_recreate_lookup_cbk, + (void *) (long) child_index, + priv->children[active_src], + priv->children[active_src]->fops->lookup, + &impunge_local->loc, 0); + + return 0; +} + + +int +afr_sh_entry_impunge_entry_cbk (call_frame_t *impunge_frame, void *cookie, + xlator_t *this, + int32_t op_ret, int32_t op_errno, + inode_t *inode, struct stat *buf, dict_t *x) +{ + afr_private_t *priv = NULL; + afr_local_t *impunge_local = NULL; + afr_self_heal_t *impunge_sh = NULL; + int call_count = 0; + int child_index = 0; + call_frame_t *frame = NULL; + int active_src = 0; + + priv = this->private; + impunge_local = impunge_frame->local; + impunge_sh = &impunge_local->self_heal; + frame = impunge_sh->sh_frame; + child_index = (long) cookie; + active_src = impunge_sh->active_source; + + if (op_ret == -1 && op_errno == ENOENT) { + /* decrease call_count in recreate-callback */ + gf_log (this->name, GF_LOG_DEBUG, + "missing entry %s on %s", + impunge_local->loc.path, + priv->children[child_index]->name); + + afr_sh_entry_impunge_recreate (impunge_frame, this, + child_index); + return 0; + } + + if (op_ret == 0) { + gf_log (this->name, GF_LOG_DEBUG, + "%s exists under %s", + impunge_local->loc.path, + priv->children[child_index]->name); + } else { + gf_log (this->name, GF_LOG_ERROR, + "looking up %s under %s failed (%s)", + impunge_local->loc.path, + priv->children[child_index]->name, + strerror (op_errno)); + } + + LOCK (&impunge_frame->lock); + { + call_count = --impunge_local->call_count; + } + UNLOCK (&impunge_frame->lock); + + if (call_count == 0) { + AFR_STACK_DESTROY 
(impunge_frame); + afr_sh_entry_impunge_entry_done (frame, this, active_src); + } + + return 0; +} + + +int +afr_sh_entry_impunge_entry (call_frame_t *frame, xlator_t *this, + char *name) +{ + afr_private_t *priv = NULL; + afr_local_t *local = NULL; + afr_self_heal_t *sh = NULL; + int ret = -1; + call_frame_t *impunge_frame = NULL; + afr_local_t *impunge_local = NULL; + afr_self_heal_t *impunge_sh = NULL; + int active_src = 0; + int i = 0; + int call_count = 0; + int op_errno = 0; + + priv = this->private; + local = frame->local; + sh = &local->self_heal; + + active_src = sh->active_source; + + if ((strcmp (name, ".") == 0) + || (strcmp (name, "..") == 0)) { + gf_log (this->name, GF_LOG_DEBUG, + "skipping inspection of %s under %s", + name, local->loc.path); + goto out; + } + + gf_log (this->name, GF_LOG_DEBUG, + "inspecting existance of %s under %s", + name, local->loc.path); + + impunge_frame = copy_frame (frame); + if (!impunge_frame) { + gf_log (this->name, GF_LOG_ERROR, + "out of memory :("); + goto out; + } + + ALLOC_OR_GOTO (impunge_local, afr_local_t, out); + + impunge_frame->local = impunge_local; + impunge_sh = &impunge_local->self_heal; + impunge_sh->sh_frame = frame; + impunge_sh->active_source = active_src; + + ret = build_child_loc (this, &impunge_local->loc, &local->loc, name); + if (ret != 0) { + goto out; + } + + for (i = 0; i < priv->child_count; i++) { + if (i == active_src) + continue; + if (local->child_up[i] == 0) + continue; + if (sh->sources[i] == 1) + continue; + call_count++; + } + + impunge_local->call_count = call_count; + + for (i = 0; i < priv->child_count; i++) { + if (i == active_src) + continue; + if (local->child_up[i] == 0) + continue; + if (sh->sources[i] == 1) + continue; + + gf_log (this->name, GF_LOG_DEBUG, + "looking up %s on %s", impunge_local->loc.path, + priv->children[i]->name); + + STACK_WIND_COOKIE (impunge_frame, + afr_sh_entry_impunge_entry_cbk, + (void *) (long) i, + priv->children[i], + 
priv->children[i]->fops->lookup, + &impunge_local->loc, 0); + + if (!--call_count) + break; + } + + ret = 0; +out: + if (ret == -1) + afr_sh_entry_impunge_entry_done (frame, this, active_src); + + return 0; +} + + +int +afr_sh_entry_impunge_readdir_cbk (call_frame_t *frame, void *cookie, + xlator_t *this, + int32_t op_ret, int32_t op_errno, + gf_dirent_t *entries) +{ + afr_private_t *priv = NULL; + afr_local_t *local = NULL; + afr_self_heal_t *sh = NULL; + gf_dirent_t *entry = NULL; + off_t last_offset = 0; + int active_src = 0; + int entry_count = 0; + + priv = this->private; + local = frame->local; + sh = &local->self_heal; + + active_src = sh->active_source; + + if (op_ret <= 0) { + if (op_ret < 0) { + gf_log (this->name, GF_LOG_ERROR, + "readdir of %s on subvolume %s failed (%s)", + local->loc.path, + priv->children[active_src]->name, + strerror (op_errno)); + } else { + gf_log (this->name, GF_LOG_DEBUG, + "readdir of %s on subvolume %s complete", + local->loc.path, + priv->children[active_src]->name); + } + + afr_sh_entry_impunge_all (frame, this); + return 0; + } + + list_for_each_entry (entry, &entries->list, list) { + last_offset = entry->d_off; + entry_count++; + } + + gf_log (this->name, GF_LOG_DEBUG, + "readdir'ed %d entries from %s", + entry_count, priv->children[active_src]->name); + + sh->offset = last_offset; + local->call_count = entry_count; + + list_for_each_entry (entry, &entries->list, list) { + afr_sh_entry_impunge_entry (frame, this, entry->d_name); + } + + return 0; +} + + +int +afr_sh_entry_impunge_subvol (call_frame_t *frame, xlator_t *this, + int active_src) +{ + afr_private_t *priv = NULL; + afr_local_t *local = NULL; + afr_self_heal_t *sh = NULL; + + priv = this->private; + local = frame->local; + sh = &local->self_heal; + + STACK_WIND (frame, afr_sh_entry_impunge_readdir_cbk, + priv->children[active_src], + priv->children[active_src]->fops->readdir, + sh->healing_fd, sh->block_size, sh->offset); + + return 0; +} + + +int 
+afr_sh_entry_impunge_all (call_frame_t *frame, xlator_t *this) +{ + afr_private_t *priv = NULL; + afr_local_t *local = NULL; + afr_self_heal_t *sh = NULL; + int active_src = -1; + + priv = this->private; + local = frame->local; + sh = &local->self_heal; + + sh->offset = 0; + + active_src = next_active_source (frame, this, sh->active_source); + sh->active_source = active_src; + + if (sh->op_failed) { + afr_sh_entry_finish (frame, this); + return 0; + } + + if (active_src == -1) { + /* completed creating missing files on all subvolumes */ + afr_sh_entry_expunge_all (frame, this); + return 0; + } + + gf_log (this->name, GF_LOG_DEBUG, + "impunging entries of %s on %s to other sinks", + local->loc.path, priv->children[active_src]->name); + + afr_sh_entry_impunge_subvol (frame, this, active_src); + + return 0; +} + + +int +afr_sh_entry_opendir_cbk (call_frame_t *frame, void *cookie, xlator_t *this, + int32_t op_ret, int32_t op_errno, fd_t *fd) +{ + afr_local_t *local = NULL; + afr_self_heal_t *sh = NULL; + afr_private_t *priv = NULL; + int call_count = 0; + int child_index = 0; + + local = frame->local; + sh = &local->self_heal; + priv = this->private; + + child_index = (long) cookie; + + /* TODO: some of the open's might fail. 
+ In that case, modify cleanup fn to send flush on those + fd's which are already open */ + + LOCK (&frame->lock); + { + if (op_ret == -1) { + gf_log (this->name, GF_LOG_ERROR, + "opendir of %s failed on child %s (%s)", + local->loc.path, + priv->children[child_index]->name, + strerror (op_errno)); + sh->op_failed = 1; + } + } + UNLOCK (&frame->lock); + + call_count = afr_frame_return (frame); + + if (call_count == 0) { + if (sh->op_failed) { + afr_sh_entry_finish (frame, this); + return 0; + } + gf_log (this->name, GF_LOG_DEBUG, + "fd for %s opened, commencing sync", + local->loc.path); + + sh->active_source = -1; + afr_sh_entry_impunge_all (frame, this); + } + + return 0; +} + + +int +afr_sh_entry_open (call_frame_t *frame, xlator_t *this) +{ + int i = 0; + int call_count = 0; + + int source = -1; + int *sources = NULL; + + fd_t *fd = NULL; + + afr_local_t * local = NULL; + afr_private_t * priv = NULL; + afr_self_heal_t *sh = NULL; + + local = frame->local; + sh = &local->self_heal; + priv = this->private; + + source = local->self_heal.source; + sources = local->self_heal.sources; + + sh->block_size = 131072; + sh->offset = 0; + + call_count = sh->active_sinks; + if (source != -1) + call_count++; + + local->call_count = call_count; + + fd = fd_create (local->loc.inode, frame->root->pid); + sh->healing_fd = fd; + + if (source != -1) { + gf_log (this->name, GF_LOG_DEBUG, + "opening directory %s on subvolume %s (source)", + local->loc.path, priv->children[source]->name); + + /* open source */ + STACK_WIND_COOKIE (frame, afr_sh_entry_opendir_cbk, + (void *) (long) source, + priv->children[source], + priv->children[source]->fops->opendir, + &local->loc, fd); + call_count--; + } + + /* open sinks */ + for (i = 0; i < priv->child_count; i++) { + if (sources[i] || !local->child_up[i]) + continue; + + gf_log (this->name, GF_LOG_DEBUG, + "opening directory %s on subvolume %s (sink)", + local->loc.path, priv->children[i]->name); + + STACK_WIND_COOKIE (frame, 
afr_sh_entry_opendir_cbk, + (void *) (long) i, + priv->children[i], + priv->children[i]->fops->opendir, + &local->loc, fd); + + if (!--call_count) + break; + } + + return 0; +} + + +int +afr_sh_entry_sync_prepare (call_frame_t *frame, xlator_t *this) +{ + afr_local_t *local = NULL; + afr_self_heal_t *sh = NULL; + afr_private_t *priv = NULL; + int active_sinks = 0; + int source = 0; + int i = 0; + + local = frame->local; + sh = &local->self_heal; + priv = this->private; + + source = sh->source; + + for (i = 0; i < priv->child_count; i++) { + if (sh->sources[i] == 0 && local->child_up[i] == 1) { + active_sinks++; + sh->success[i] = 1; + } + } + if (source != -1) + sh->success[source] = 1; + + if (active_sinks == 0) { + gf_log (this->name, GF_LOG_DEBUG, + "no active sinks for self-heal on dir %s", + local->loc.path); + afr_sh_entry_finish (frame, this); + return 0; + } + if (source == -1 && active_sinks < 2) { + gf_log (this->name, GF_LOG_WARNING, + "cannot sync with 0 sources and 1 sink on dir %s", + local->loc.path); + afr_sh_entry_finish (frame, this); + return 0; + } + sh->active_sinks = active_sinks; + + if (source != -1) + gf_log (this->name, GF_LOG_DEBUG, + "syncing %s from subvolume %s to %d active sinks", + local->loc.path, priv->children[source]->name, + active_sinks); + else + gf_log (this->name, GF_LOG_DEBUG, + "no active sources for %s found. 
" + "merging all entries as a conservative decision", + local->loc.path); + + afr_sh_entry_open (frame, this); + + return 0; +} + + +int +afr_sh_entry_fix (call_frame_t *frame, xlator_t *this) +{ + afr_local_t *local = NULL; + afr_self_heal_t *sh = NULL; + afr_private_t *priv = NULL; + int source = 0; + + local = frame->local; + sh = &local->self_heal; + priv = this->private; + + afr_sh_build_pending_matrix (sh->pending_matrix, sh->xattr, + priv->child_count, AFR_ENTRY_PENDING); + + afr_sh_print_pending_matrix (sh->pending_matrix, this); + + + afr_sh_mark_sources (sh->pending_matrix, sh->sources, + priv->child_count); + + afr_sh_supress_errenous_children (sh->sources, sh->child_errno, + priv->child_count); + + source = afr_sh_select_source (sh->sources, priv->child_count); + sh->source = source; + + afr_sh_entry_sync_prepare (frame, this); + + return 0; +} + + + +int +afr_sh_entry_lookup_cbk (call_frame_t *frame, void *cookie, + xlator_t *this, int32_t op_ret, int32_t op_errno, + inode_t *inode, struct stat *buf, dict_t *xattr) +{ + afr_private_t *priv = NULL; + afr_local_t *local = NULL; + afr_self_heal_t *sh = NULL; + + int call_count = -1; + int child_index = (long) cookie; + + local = frame->local; + sh = &local->self_heal; + priv = this->private; + + LOCK (&frame->lock); + { + if (op_ret != -1) { + sh->xattr[child_index] = dict_ref (xattr); + sh->buf[child_index] = *buf; + } + } + UNLOCK (&frame->lock); + + call_count = afr_frame_return (frame); + + if (call_count == 0) { + afr_sh_entry_fix (frame, this); + } + + return 0; +} + + + +int +afr_sh_entry_lookup (call_frame_t *frame, xlator_t *this) +{ + afr_self_heal_t * sh = NULL; + afr_local_t * local = NULL; + afr_private_t * priv = NULL; + dict_t *xattr_req = NULL; + int ret = 0; + int call_count = 0; + int i = 0; + + priv = this->private; + local = frame->local; + sh = &local->self_heal; + + call_count = local->child_count; + + local->call_count = call_count; + + xattr_req = dict_new(); + if (xattr_req) + ret 
= dict_set_uint64 (xattr_req, AFR_ENTRY_PENDING, + priv->child_count * sizeof(int32_t)); + + for (i = 0; i < priv->child_count; i++) { + if (local->child_up[i]) { + STACK_WIND_COOKIE (frame, + afr_sh_entry_lookup_cbk, + (void *) (long) i, + priv->children[i], + priv->children[i]->fops->lookup, + &local->loc, xattr_req); + if (!--call_count) + break; + } + } + + if (xattr_req) + dict_unref (xattr_req); + + return 0; +} + + + +int +afr_sh_entry_lock_cbk (call_frame_t *frame, void *cookie, xlator_t *this, + int32_t op_ret, int32_t op_errno) +{ + afr_local_t *local = NULL; + afr_self_heal_t *sh = NULL; + int call_count = 0; + int child_index = (long) cookie; + + /* TODO: what if lock fails? */ + + local = frame->local; + sh = &local->self_heal; + + LOCK (&frame->lock); + { + if (op_ret == -1) { + sh->op_failed = 1; + + gf_log (this->name, + (op_errno == EAGAIN ? GF_LOG_DEBUG : GF_LOG_ERROR), + "locking inode of %s on child %d failed: %s", + local->loc.path, child_index, + strerror (op_errno)); + } else { + gf_log (this->name, GF_LOG_DEBUG, + "inode of %s on child %d locked", + local->loc.path, child_index); + } + } + UNLOCK (&frame->lock); + + call_count = afr_frame_return (frame); + + if (call_count == 0) { + if (sh->op_failed == 1) { + afr_sh_entry_finish (frame, this); + return 0; + } + + afr_sh_entry_lookup (frame, this); + } + + return 0; +} + + +int +afr_sh_entry_lock (call_frame_t *frame, xlator_t *this) +{ + int i = 0; + int call_count = 0; + + afr_local_t * local = NULL; + afr_private_t * priv = NULL; + afr_self_heal_t * sh = NULL; + + + local = frame->local; + sh = &local->self_heal; + priv = this->private; + + call_count = local->child_count; + + local->call_count = call_count; + + for (i = 0; i < priv->child_count; i++) { + if (local->child_up[i]) { + gf_log (this->name, GF_LOG_DEBUG, + "locking %s on subvolume %s", + local->loc.path, priv->children[i]->name); + + STACK_WIND_COOKIE (frame, afr_sh_entry_lock_cbk, + (void *) (long) i, + priv->children[i], + 
priv->children[i]->fops->entrylk, + &local->loc, NULL, + ENTRYLK_LOCK_NB, ENTRYLK_WRLCK); + if (!--call_count) + break; + } + } + + return 0; +} + + +int +afr_self_heal_entry (call_frame_t *frame, xlator_t *this) +{ + afr_local_t *local = NULL; + afr_self_heal_t *sh = NULL; + afr_private_t *priv = NULL; + + + priv = this->private; + local = frame->local; + sh = &local->self_heal; + + if (local->need_entry_self_heal && priv->entry_self_heal) { + afr_sh_entry_lock (frame, this); + } else { + gf_log (this->name, GF_LOG_DEBUG, + "proceeding to completion on %s", + local->loc.path); + afr_sh_entry_done (frame, this); + } + + return 0; +} + diff --git a/xlators/cluster/afr/src/afr-self-heal-metadata.c b/xlators/cluster/afr/src/afr-self-heal-metadata.c new file mode 100644 index 000000000..e65a426db --- /dev/null +++ b/xlators/cluster/afr/src/afr-self-heal-metadata.c @@ -0,0 +1,791 @@ +/* + Copyright (c) 2008 Z RESEARCH, Inc. <http://www.zresearch.com> + This file is part of GlusterFS. + + GlusterFS is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published + by the Free Software Foundation; either version 3 of the License, + or (at your option) any later version. + + GlusterFS is distributed in the hope that it will be useful, but + WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program. If not, see + <http://www.gnu.org/licenses/>. 
+*/ + +#include <libgen.h> +#include <unistd.h> +#include <fnmatch.h> +#include <sys/time.h> +#include <stdlib.h> +#include <signal.h> + +#ifndef _CONFIG_H +#define _CONFIG_H +#include "config.h" +#endif + +#include "glusterfs.h" +#include "afr.h" +#include "dict.h" +#include "xlator.h" +#include "hashfn.h" +#include "logging.h" +#include "stack.h" +#include "list.h" +#include "call-stub.h" +#include "defaults.h" +#include "common-utils.h" +#include "compat-errno.h" +#include "compat.h" +#include "byte-order.h" + +#include "afr-transaction.h" +#include "afr-self-heal.h" +#include "afr-self-heal-common.h" + + +int +afr_sh_metadata_done (call_frame_t *frame, xlator_t *this) +{ + afr_local_t *local = NULL; + afr_self_heal_t *sh = NULL; + afr_private_t *priv = NULL; + int i = 0; + + local = frame->local; + sh = &local->self_heal; + priv = this->private; + +// memset (sh->child_errno, 0, sizeof (int) * priv->child_count); + memset (sh->buf, 0, sizeof (struct stat) * priv->child_count); + memset (sh->success, 0, sizeof (int) * priv->child_count); + + for (i = 0; i < priv->child_count; i++) { + if (sh->xattr[i]) + dict_unref (sh->xattr[i]); + sh->xattr[i] = NULL; + } + + if (local->govinda_gOvinda) { + gf_log (this->name, GF_LOG_WARNING, + "aborting selfheal of %s", + local->loc.path); + sh->completion_cbk (frame, this); + } else { + if (S_ISREG (local->cont.lookup.buf.st_mode)) { + gf_log (this->name, GF_LOG_DEBUG, + "proceeding to data check on %s", + local->loc.path); + afr_self_heal_data (frame, this); + return 0; + } + + if (S_ISDIR (local->cont.lookup.buf.st_mode)) { + gf_log (this->name, GF_LOG_DEBUG, + "proceeding to entry check on %s", + local->loc.path); + afr_self_heal_entry (frame, this); + return 0; + } + gf_log (this->name, GF_LOG_DEBUG, + "completed self heal of %s", + local->loc.path); + + sh->completion_cbk (frame, this); + } + + return 0; +} + + +int +afr_sh_metadata_unlck_cbk (call_frame_t *frame, void *cookie, xlator_t *this, + int32_t op_ret, int32_t 
op_errno) +{ + afr_local_t *local = NULL; + int call_count = 0; + + + local = frame->local; + + LOCK (&frame->lock); + { + } + UNLOCK (&frame->lock); + + call_count = afr_frame_return (frame); + + if (call_count == 0) + afr_sh_metadata_done (frame, this); + + return 0; +} + + +int +afr_sh_metadata_finish (call_frame_t *frame, xlator_t *this) +{ + afr_local_t *local = NULL; + afr_self_heal_t *sh = NULL; + afr_private_t *priv = NULL; + int i = 0; + int call_count = 0; + struct flock flock = {0, }; + + + local = frame->local; + sh = &local->self_heal; + priv = this->private; + + call_count = local->child_count; + local->call_count = call_count; + + for (i = 0; i < priv->child_count; i++) { + flock.l_start = 0; + flock.l_len = 0; + flock.l_type = F_UNLCK; + + if (local->child_up[i]) { + gf_log (this->name, GF_LOG_DEBUG, + "unlocking %s on subvolume %s", + local->loc.path, priv->children[i]->name); + + STACK_WIND (frame, afr_sh_metadata_unlck_cbk, + priv->children[i], + priv->children[i]->fops->inodelk, + &local->loc, F_SETLK, &flock); + + if (!--call_count) + break; + } + } + + return 0; +} + + +int +afr_sh_metadata_erase_pending_cbk (call_frame_t *frame, void *cookie, + xlator_t *this, int32_t op_ret, + int32_t op_errno, dict_t *xattr) +{ + afr_local_t *local = NULL; + afr_self_heal_t *sh = NULL; + afr_private_t *priv = NULL; + int call_count = 0; + + local = frame->local; + sh = &local->self_heal; + priv = this->private; + + LOCK (&frame->lock); + { + } + UNLOCK (&frame->lock); + + call_count = afr_frame_return (frame); + + if (call_count == 0) + afr_sh_metadata_finish (frame, this); + + return 0; +} + + +int +afr_sh_metadata_erase_pending (call_frame_t *frame, xlator_t *this) +{ + afr_local_t *local = NULL; + afr_self_heal_t *sh = NULL; + afr_private_t *priv = NULL; + int call_count = 0; + int i = 0; + dict_t **erase_xattr = NULL; + + + local = frame->local; + sh = &local->self_heal; + priv = this->private; + + + afr_sh_pending_to_delta (sh->pending_matrix, 
sh->delta_matrix, + sh->success, priv->child_count); + + erase_xattr = CALLOC (sizeof (*erase_xattr), priv->child_count); + + for (i = 0; i < priv->child_count; i++) { + if (sh->xattr[i]) { + call_count++; + + erase_xattr[i] = get_new_dict(); + dict_ref (erase_xattr[i]); + } + } + + afr_sh_delta_to_xattr (sh->delta_matrix, erase_xattr, + priv->child_count, AFR_METADATA_PENDING); + + local->call_count = call_count; + + if (call_count == 0) { + gf_log (this->name, GF_LOG_WARNING, + "metadata of %s not healed on any subvolume", + local->loc.path); + + afr_sh_metadata_finish (frame, this); + } + + for (i = 0; i < priv->child_count; i++) { + if (!erase_xattr[i]) + continue; + + gf_log (this->name, GF_LOG_DEBUG, + "erasing pending flags from %s on %s", + local->loc.path, priv->children[i]->name); + + STACK_WIND_COOKIE (frame, afr_sh_metadata_erase_pending_cbk, + (void *) (long) i, + priv->children[i], + priv->children[i]->fops->xattrop, + &local->loc, + GF_XATTROP_ADD_ARRAY, erase_xattr[i]); + if (!--call_count) + break; + } + + for (i = 0; i < priv->child_count; i++) { + if (erase_xattr[i]) { + dict_unref (erase_xattr[i]); + } + } + FREE (erase_xattr); + + return 0; +} + + +int +afr_sh_metadata_sync_cbk (call_frame_t *frame, void *cookie, xlator_t *this, + int32_t op_ret, int32_t op_errno) +{ + afr_local_t *local = NULL; + afr_self_heal_t *sh = NULL; + afr_private_t *priv = NULL; + int call_count = 0; + int child_index = 0; + + + local = frame->local; + sh = &local->self_heal; + priv = this->private; + + child_index = (long) cookie; + + LOCK (&frame->lock); + { + if (op_ret == -1) { + gf_log (this->name, GF_LOG_ERROR, + "setting attributes failed for %s on %s (%s)", + local->loc.path, + priv->children[child_index]->name, + strerror (op_errno)); + + sh->success[child_index] = 0; + } + } + UNLOCK (&frame->lock); + + call_count = afr_frame_return (frame); + + if (call_count == 0) + afr_sh_metadata_erase_pending (frame, this); + + return 0; +} + + +int 
+afr_sh_metadata_attr_cbk (call_frame_t *frame, void *cookie, xlator_t *this, + int32_t op_ret, int32_t op_errno, struct stat *buf) +{ + afr_sh_metadata_sync_cbk (frame, cookie, this, op_ret, op_errno); + + return 0; +} + + +int +afr_sh_metadata_xattr_cbk (call_frame_t *frame, void *cookie, xlator_t *this, + int32_t op_ret, int32_t op_errno) +{ + afr_sh_metadata_sync_cbk (frame, cookie, this, op_ret, op_errno); + + return 0; +} + + +int +afr_sh_metadata_sync (call_frame_t *frame, xlator_t *this, dict_t *xattr) +{ + afr_local_t *local = NULL; + afr_self_heal_t *sh = NULL; + afr_private_t *priv = NULL; + int source = 0; + int active_sinks = 0; + int call_count = 0; + int i = 0; + struct timespec ts[2]; + + + local = frame->local; + sh = &local->self_heal; + priv = this->private; + + source = sh->source; + active_sinks = sh->active_sinks; + + /* + * 4 calls per sink - chown, chmod, utimes, setxattr + */ + if (xattr) + call_count = active_sinks * 4; + else + call_count = active_sinks * 3; + + local->call_count = call_count; + +#ifdef HAVE_STRUCT_STAT_ST_ATIM_TV_NSEC + ts[0] = sh->buf[source].st_atim; + ts[1] = sh->buf[source].st_mtim; +#elif HAVE_STRUCT_STAT_ST_ATIMESPEC_TV_NSEC + ts[0] = sh->buf[source].st_atimespec; + ts[1] = sh->buf[source].st_mtimespec; +#else + ts[0].tv_sec = sh->buf[source].st_atime; + ts[1].tv_sec = sh->buf[source].st_mtime; +#endif + + for (i = 0; i < priv->child_count; i++) { + if (call_count == 0) { + break; + } + if (sh->sources[i] || !local->child_up[i]) + continue; + + gf_log (this->name, GF_LOG_DEBUG, + "syncing metadata of %s from %s to %s", + local->loc.path, priv->children[source]->name, + priv->children[i]->name); + + STACK_WIND_COOKIE (frame, afr_sh_metadata_attr_cbk, + (void *) (long) i, + priv->children[i], + priv->children[i]->fops->chown, + &local->loc, + sh->buf[source].st_uid, + sh->buf[source].st_gid); + + STACK_WIND_COOKIE (frame, afr_sh_metadata_attr_cbk, + (void *) (long) i, + priv->children[i], + 
priv->children[i]->fops->chmod, + &local->loc, sh->buf[source].st_mode); + + STACK_WIND_COOKIE (frame, afr_sh_metadata_attr_cbk, + (void *) (long) i, + priv->children[i], + priv->children[i]->fops->utimens, + &local->loc, ts); + + call_count = call_count - 3; + + if (!xattr) + continue; + + STACK_WIND_COOKIE (frame, afr_sh_metadata_xattr_cbk, + (void *) (long) i, + priv->children[i], + priv->children[i]->fops->setxattr, + &local->loc, xattr, 0); + call_count--; + } + + return 0; +} + + +int +afr_sh_metadata_getxattr_cbk (call_frame_t *frame, void *cookie, + xlator_t *this, + int32_t op_ret, int32_t op_errno, dict_t *xattr) +{ + afr_local_t *local = NULL; + afr_self_heal_t *sh = NULL; + afr_private_t *priv = NULL; + int source = 0; + + local = frame->local; + sh = &local->self_heal; + priv = this->private; + + source = sh->source; + + if (op_ret == -1) { + gf_log (this->name, GF_LOG_ERROR, + "getxattr of %s failed on subvolume %s (%s). proceeding without xattr", + local->loc.path, priv->children[source]->name, + strerror (op_errno)); + + afr_sh_metadata_sync (frame, this, NULL); + } else { + dict_del (xattr, AFR_DATA_PENDING); + dict_del (xattr, AFR_METADATA_PENDING); + dict_del (xattr, AFR_ENTRY_PENDING); + afr_sh_metadata_sync (frame, this, xattr); + } + + return 0; +} + + +int +afr_sh_metadata_sync_prepare (call_frame_t *frame, xlator_t *this) +{ + afr_local_t *local = NULL; + afr_self_heal_t *sh = NULL; + afr_private_t *priv = NULL; + int active_sinks = 0; + int source = 0; + int i = 0; + + local = frame->local; + sh = &local->self_heal; + priv = this->private; + + source = sh->source; + + for (i = 0; i < priv->child_count; i++) { + if (sh->sources[i] == 0 && local->child_up[i] == 1) { + active_sinks++; + sh->success[i] = 1; + } + } + sh->success[source] = 1; + + if (active_sinks == 0) { + gf_log (this->name, GF_LOG_DEBUG, + "no active sinks for performing self-heal on file %s", + local->loc.path); + afr_sh_metadata_finish (frame, this); + return 0; + } + 
sh->active_sinks = active_sinks; + + gf_log (this->name, GF_LOG_DEBUG, + "syncing metadata of %s from subvolume %s to %d active sinks", + local->loc.path, priv->children[source]->name, active_sinks); + + STACK_WIND (frame, afr_sh_metadata_getxattr_cbk, + priv->children[source], + priv->children[source]->fops->getxattr, + &local->loc, NULL); + + return 0; +} + + +int +afr_sh_metadata_fix (call_frame_t *frame, xlator_t *this) +{ + afr_local_t *local = NULL; + afr_self_heal_t *sh = NULL; + afr_private_t *priv = NULL; + int nsources = 0; + int source = 0; + int i = 0; + + local = frame->local; + sh = &local->self_heal; + priv = this->private; + + afr_sh_build_pending_matrix (sh->pending_matrix, sh->xattr, + priv->child_count, AFR_METADATA_PENDING); + + afr_sh_print_pending_matrix (sh->pending_matrix, this); + + afr_sh_mark_sources (sh->pending_matrix, sh->sources, + priv->child_count); + + afr_sh_supress_errenous_children (sh->sources, sh->child_errno, + priv->child_count); + + nsources = afr_sh_source_count (sh->sources, priv->child_count); + + if ((nsources == 0) + && (priv->favorite_child != -1) + && (sh->child_errno[priv->favorite_child] == 0)) { + + gf_log (this->name, GF_LOG_WARNING, + "Picking favorite child %s as authentic source to resolve conflicting metadata of %s", + priv->children[priv->favorite_child]->name, + local->loc.path); + + sh->sources[priv->favorite_child] = 1; + + nsources = afr_sh_source_count (sh->sources, + priv->child_count); + } + + if (nsources == 0) { + gf_log (this->name, GF_LOG_ERROR, + "Unable to resolve conflicting metadata of %s. " + "Please resolve manually by fixing the " + "permissions/ownership of %s on your subvolumes. 
" + "You can also consider 'option favorite-child <>'", + local->loc.path, local->loc.path); + + local->govinda_gOvinda = 1; + + afr_sh_metadata_finish (frame, this); + return 0; + } + + source = afr_sh_select_source (sh->sources, priv->child_count); + sh->source = source; + + /* detect changes not visible through pending flags -- JIC */ + for (i = 0; i < priv->child_count; i++) { + if (i == source || sh->child_errno[i]) + continue; + + if (PERMISSION_DIFFERS (&sh->buf[i], &sh->buf[source])) + sh->sources[i] = 0; + + if (OWNERSHIP_DIFFERS (&sh->buf[i], &sh->buf[source])) + sh->sources[i] = 0; + } + + afr_sh_metadata_sync_prepare (frame, this); + + return 0; +} + + +int +afr_sh_metadata_lookup_cbk (call_frame_t *frame, void *cookie, xlator_t *this, + int32_t op_ret, int32_t op_errno, + inode_t *inode, struct stat *buf, dict_t *xattr) +{ + afr_local_t *local = NULL; + afr_self_heal_t *sh = NULL; + afr_private_t *priv = NULL; + int call_count = 0; + int child_index = 0; + + + local = frame->local; + sh = &local->self_heal; + priv = this->private; + + child_index = (long) cookie; + + LOCK (&frame->lock); + { + if (op_ret == 0) { + gf_log (this->name, GF_LOG_DEBUG, + "path %s on subvolume %s is of mode 0%o", + local->loc.path, + priv->children[child_index]->name, + buf->st_mode); + + sh->buf[child_index] = *buf; + if (xattr) + sh->xattr[child_index] = dict_ref (xattr); + } else { + gf_log (this->name, GF_LOG_DEBUG, + "path %s on subvolume %s => -1 (%s)", + local->loc.path, + priv->children[child_index]->name, + strerror (op_errno)); + + sh->child_errno[child_index] = op_errno; + } + } + UNLOCK (&frame->lock); + + call_count = afr_frame_return (frame); + + if (call_count == 0) + afr_sh_metadata_fix (frame, this); + + return 0; +} + + +int +afr_sh_metadata_lookup (call_frame_t *frame, xlator_t *this) +{ + afr_local_t *local = NULL; + afr_self_heal_t *sh = NULL; + afr_private_t *priv = NULL; + int i = 0; + int call_count = 0; + dict_t *xattr_req = NULL; + int ret = 0; + + 
local = frame->local; + sh = &local->self_heal; + priv = this->private; + + call_count = local->child_count; + local->call_count = call_count; + + xattr_req = dict_new(); + + if (xattr_req) + ret = dict_set_uint64 (xattr_req, AFR_METADATA_PENDING, + priv->child_count * sizeof(int32_t)); + + for (i = 0; i < priv->child_count; i++) { + if (local->child_up[i]) { + gf_log (this->name, GF_LOG_DEBUG, + "looking up %s on %s", + local->loc.path, priv->children[i]->name); + + STACK_WIND_COOKIE (frame, afr_sh_metadata_lookup_cbk, + (void *) (long) i, + priv->children[i], + priv->children[i]->fops->lookup, + &local->loc, xattr_req); + if (!--call_count) + break; + } + } + + if (xattr_req) + dict_unref (xattr_req); + + return 0; +} + + +int +afr_sh_metadata_lk_cbk (call_frame_t *frame, void *cookie, xlator_t *this, + int32_t op_ret, int32_t op_errno) +{ + afr_local_t *local = NULL; + afr_self_heal_t *sh = NULL; + afr_private_t *priv = NULL; + int call_count = 0; + int child_index = (long) cookie; + + /* TODO: what if lock fails? */ + + local = frame->local; + sh = &local->self_heal; + priv = this->private; + + LOCK (&frame->lock); + { + if (op_ret == -1) { + sh->op_failed = 1; + + gf_log (this->name, + (op_errno == EAGAIN ? 
GF_LOG_DEBUG : GF_LOG_ERROR), + "locking of %s on child %d failed: %s", + local->loc.path, child_index, + strerror (op_errno)); + } else { + gf_log (this->name, GF_LOG_DEBUG, + "inode of %s on child %d locked", + local->loc.path, child_index); + } + } + UNLOCK (&frame->lock); + + call_count = afr_frame_return (frame); + + if (call_count == 0) { + if (sh->op_failed) { + afr_sh_metadata_finish (frame, this); + return 0; + } + + afr_sh_metadata_lookup (frame, this); + } + + return 0; +} + + +int +afr_sh_metadata_lock (call_frame_t *frame, xlator_t *this) +{ + afr_local_t *local = NULL; + afr_self_heal_t *sh = NULL; + afr_private_t *priv = NULL; + int i = 0; + int call_count = 0; + struct flock flock = {0, }; + + + local = frame->local; + sh = &local->self_heal; + priv = this->private; + + call_count = local->child_count; + local->call_count = call_count; + + for (i = 0; i < priv->child_count; i++) { + flock.l_start = 0; + flock.l_len = 0; + flock.l_type = F_WRLCK; + + if (local->child_up[i]) { + gf_log (this->name, GF_LOG_DEBUG, + "locking %s on subvolume %s", + local->loc.path, priv->children[i]->name); + + STACK_WIND_COOKIE (frame, afr_sh_metadata_lk_cbk, + (void *) (long) i, + priv->children[i], + priv->children[i]->fops->inodelk, + &local->loc, F_SETLK, &flock); + + if (!--call_count) + break; + } + } + + return 0; +} + + +int +afr_self_heal_metadata (call_frame_t *frame, xlator_t *this) +{ + afr_local_t *local = NULL; + afr_self_heal_t *sh = NULL; + afr_private_t *priv = this->private; + + + local = frame->local; + sh = &local->self_heal; + + if (local->need_metadata_self_heal && priv->metadata_self_heal) { + afr_sh_metadata_lock (frame, this); + } else { + gf_log (this->name, GF_LOG_DEBUG, + "proceeding to data check on %s", + local->loc.path); + afr_sh_metadata_done (frame, this); + } + + return 0; +} + diff --git a/xlators/cluster/afr/src/afr-self-heal.h b/xlators/cluster/afr/src/afr-self-heal.h new file mode 100644 index 000000000..1c97a9bc1 --- /dev/null +++ 
b/xlators/cluster/afr/src/afr-self-heal.h @@ -0,0 +1,52 @@ +/* + Copyright (c) 2008 Z RESEARCH, Inc. <http://www.zresearch.com> + This file is part of GlusterFS. + + GlusterFS is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published + by the Free Software Foundation; either version 3 of the License, + or (at your option) any later version. + + GlusterFS is distributed in the hope that it will be useful, but + WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program. If not, see + <http://www.gnu.org/licenses/>. +*/ + +#ifndef __AFR_SELF_HEAL_H__ +#define __AFR_SELF_HEAL_H__ + +#include <sys/stat.h> + +#define FILETYPE_DIFFERS(buf1,buf2) ((S_IFMT & ((struct stat *)buf1)->st_mode) != (S_IFMT & ((struct stat *)buf2)->st_mode)) +#define PERMISSION_DIFFERS(buf1,buf2) ((((struct stat *)buf1)->st_mode) != (((struct stat *)buf2)->st_mode)) +#define OWNERSHIP_DIFFERS(buf1,buf2) ((((struct stat *)buf1)->st_uid) != (((struct stat *)buf2)->st_uid) || (((struct stat *)buf1)->st_gid != (((struct stat *)buf2)->st_gid))) +#define SIZE_DIFFERS(buf1,buf2) ((((struct stat *)buf1)->st_size) != (((struct stat *)buf2)->st_size)) + + + +int +afr_sh_has_metadata_pending (dict_t *xattr, int child_count, xlator_t *this); +int +afr_sh_has_entry_pending (dict_t *xattr, int child_count, xlator_t *this); +int +afr_sh_has_data_pending (dict_t *xattr, int child_count, xlator_t *this); + +int +afr_self_heal_entry (call_frame_t *frame, xlator_t *this); + +int +afr_self_heal_data (call_frame_t *frame, xlator_t *this); + +int +afr_self_heal_metadata (call_frame_t *frame, xlator_t *this); + +int +afr_self_heal (call_frame_t *frame, xlator_t *this, + int (*completion_cbk) (call_frame_t *, xlator_t *)); + +#endif /* 
__AFR_SELF_HEAL_H__ */ diff --git a/xlators/cluster/afr/src/afr-transaction.c b/xlators/cluster/afr/src/afr-transaction.c new file mode 100644 index 000000000..3df9f07e5 --- /dev/null +++ b/xlators/cluster/afr/src/afr-transaction.c @@ -0,0 +1,957 @@ +/* + Copyright (c) 2007, 2008, 2009 Z RESEARCH, Inc. <http://www.zresearch.com> + This file is part of GlusterFS. + + GlusterFS is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published + by the Free Software Foundation; either version 3 of the License, + or (at your option) any later version. + + GlusterFS is distributed in the hope that it will be useful, but + WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program. If not, see + <http://www.gnu.org/licenses/>. +*/ + +#include "dict.h" +#include "byte-order.h" + +#include "afr.h" +#include "afr-transaction.h" + +#include <signal.h> + + +static void +__mark_all_pending (int32_t *pending, int child_count) +{ + int i; + + for (i = 0; i < child_count; i++) + pending[i] = hton32 (1); +} + + +static void +__mark_child_dead (int32_t *pending, int child_count, int child) +{ + pending[child] = 0; +} + + +static void +__mark_down_children (int32_t *pending, int child_count, unsigned char *child_up) +{ + int i; + + for (i = 0; i < child_count; i++) + if (!child_up[i]) + pending[i] = 0; +} + + +static void +__mark_all_success (int32_t *pending, int child_count) +{ + int i; + + for (i = 0; i < child_count; i++) + pending[i] = hton32 (-1); +} + + +static int +__is_first_write_on_fd (xlator_t *this, fd_t *fd) +{ + int op_ret = 0; + int _ret = -1; + + _ret = fd_ctx_get (fd, this, NULL); + if (_ret < 0) { + gf_log (this->name, GF_LOG_DEBUG, + "first writev() on fd=%p, writing changelog", + fd); + + _ret = 
fd_ctx_set (fd, this, 0xaf1); + op_ret = 1; + } + + return op_ret; +} + + +static int +__changelog_enabled (afr_private_t *priv, afr_transaction_type type) +{ + int ret = 0; + + switch (type) { + case AFR_DATA_TRANSACTION: + if (priv->data_change_log) + ret = 1; + + break; + + case AFR_METADATA_TRANSACTION: + if (priv->metadata_change_log) + ret = 1; + + break; + + case AFR_ENTRY_TRANSACTION: + case AFR_ENTRY_RENAME_TRANSACTION: + if (priv->entry_change_log) + ret = 1; + + break; + + case AFR_FLUSH_TRANSACTION: + ret = 1; + } + + return ret; +} + + +static int +__changelog_needed_pre_op (call_frame_t *frame, xlator_t *this) +{ + afr_private_t * priv = NULL; + afr_local_t * local = NULL; + fd_t * fd = NULL; + + int op_ret = 0; + + priv = this->private; + local = frame->local; + + if (__changelog_enabled (priv, local->transaction.type)) { + switch (local->op) { + + case GF_FOP_WRITE: + case GF_FOP_FTRUNCATE: + /* + if it's a data transaction, we write the changelog + only on the first write on an fd + */ + + fd = local->fd; + if (!fd || __is_first_write_on_fd (this, fd)) + op_ret = 1; + + break; + + case GF_FOP_FLUSH: + /* only do post-op on flush() */ + + op_ret = 0; + break; + + default: + op_ret = 1; + } + } + + return op_ret; +} + + +static int +__changelog_needed_post_op (call_frame_t *frame, xlator_t *this) +{ + afr_private_t * priv = NULL; + afr_local_t * local = NULL; + + int ret = 0; + afr_transaction_type type = -1; + + priv = this->private; + local = frame->local; + type = local->transaction.type; + + if (__changelog_enabled (priv, type) + && (local->op != GF_FOP_WRITE) + && (local->op != GF_FOP_FTRUNCATE)) + ret = 1; + + return ret; +} + + +static int +afr_lock_server_count (afr_private_t *priv, afr_transaction_type type) +{ + int ret = 0; + + switch (type) { + case AFR_FLUSH_TRANSACTION: + case AFR_DATA_TRANSACTION: + ret = priv->data_lock_server_count; + break; + + case AFR_METADATA_TRANSACTION: + ret = priv->metadata_lock_server_count; + break; + + 
case AFR_ENTRY_TRANSACTION: + case AFR_ENTRY_RENAME_TRANSACTION: + ret = priv->entry_lock_server_count; + break; + } + + return ret; +} + + +/* {{{ unlock */ + +int32_t +afr_unlock_common_cbk (call_frame_t *frame, void *cookie, xlator_t *this, + int32_t op_ret, int32_t op_errno) +{ + afr_local_t *local; + int call_count = 0; + + local = frame->local; + + LOCK (&frame->lock); + { + call_count = --local->call_count; + } + UNLOCK (&frame->lock); + + if (call_count == 0) { + local->transaction.done (frame, this); + } + + return 0; +} + + +int +afr_unlock (call_frame_t *frame, xlator_t *this) +{ + struct flock flock; + + int i = 0; + int call_count = 0; + + afr_local_t *local = NULL; + afr_private_t * priv = this->private; + + local = frame->local; + + call_count = afr_locked_nodes_count (local->transaction.locked_nodes, + priv->child_count); + + if (call_count == 0) { + local->transaction.done (frame, this); + return 0; + } + + if (local->transaction.type == AFR_ENTRY_RENAME_TRANSACTION) + call_count *= 2; + + local->call_count = call_count; + + for (i = 0; i < priv->child_count; i++) { + flock.l_start = local->transaction.start; + flock.l_len = local->transaction.len; + flock.l_type = F_UNLCK; + + if (local->transaction.locked_nodes[i]) { + switch (local->transaction.type) { + case AFR_DATA_TRANSACTION: + case AFR_METADATA_TRANSACTION: + case AFR_FLUSH_TRANSACTION: + + if (local->fd) { + STACK_WIND (frame, afr_unlock_common_cbk, + priv->children[i], + priv->children[i]->fops->finodelk, + local->fd, F_SETLK, &flock); + } else { + STACK_WIND (frame, afr_unlock_common_cbk, + priv->children[i], + priv->children[i]->fops->inodelk, + &local->loc, F_SETLK, &flock); + } + + break; + + case AFR_ENTRY_RENAME_TRANSACTION: + + STACK_WIND (frame, afr_unlock_common_cbk, + priv->children[i], + priv->children[i]->fops->entrylk, + &local->transaction.new_parent_loc, + local->transaction.new_basename, + ENTRYLK_UNLOCK, ENTRYLK_WRLCK); + + call_count--; + + /* fall through */ + + case 
AFR_ENTRY_TRANSACTION: + if (local->fd) { + STACK_WIND (frame, afr_unlock_common_cbk, + priv->children[i], + priv->children[i]->fops->fentrylk, + local->fd, + local->transaction.basename, + ENTRYLK_UNLOCK, ENTRYLK_WRLCK); + } else { + STACK_WIND (frame, afr_unlock_common_cbk, + priv->children[i], + priv->children[i]->fops->entrylk, + &local->transaction.parent_loc, + local->transaction.basename, + ENTRYLK_UNLOCK, ENTRYLK_WRLCK); + + } + break; + } + + if (!--call_count) + break; + } + } + + return 0; +} + +/* }}} */ + + +/* {{{ pending */ + +int32_t +afr_changelog_post_op_cbk (call_frame_t *frame, void *cookie, xlator_t *this, + int32_t op_ret, int32_t op_errno, dict_t *xattr) +{ + afr_private_t * priv = NULL; + afr_local_t * local = NULL; + + int call_count = -1; + + priv = this->private; + local = frame->local; + + LOCK (&frame->lock); + { + call_count = --local->call_count; + } + UNLOCK (&frame->lock); + + if (call_count == 0) { + if (afr_lock_server_count (priv, local->transaction.type) == 0) { + local->transaction.done (frame, this); + } else { + afr_unlock (frame, this); + } + } + + return 0; +} + + +int +afr_changelog_post_op (call_frame_t *frame, xlator_t *this) +{ + afr_private_t * priv = this->private; + + int ret = 0; + int i = 0; + int call_count = 0; + + afr_local_t * local = NULL; + dict_t * xattr = dict_ref (get_new_dict ()); + + local = frame->local; + + __mark_all_success (local->pending_array, priv->child_count); + __mark_down_children (local->pending_array, priv->child_count, local->child_up); + + call_count = afr_up_children_count (priv->child_count, local->child_up); + + if (local->transaction.type == AFR_ENTRY_RENAME_TRANSACTION) { + call_count *= 2; + } + + local->call_count = call_count; + + if (call_count == 0) { + /* no child is up */ + dict_unref (xattr); + afr_unlock (frame, this); + return 0; + } + + for (i = 0; i < priv->child_count; i++) { + if (local->child_up[i]) { + ret = dict_set_static_bin (xattr, local->transaction.pending, + 
local->pending_array, + priv->child_count * sizeof (int32_t)); + if (ret < 0) + gf_log (this->name, GF_LOG_ERROR, + "failed to set pending entry"); + + + switch (local->transaction.type) { + case AFR_DATA_TRANSACTION: + case AFR_METADATA_TRANSACTION: + case AFR_FLUSH_TRANSACTION: + { + if (local->fd) + STACK_WIND (frame, afr_changelog_post_op_cbk, + priv->children[i], + priv->children[i]->fops->fxattrop, + local->fd, + GF_XATTROP_ADD_ARRAY, xattr); + else + STACK_WIND (frame, afr_changelog_post_op_cbk, + priv->children[i], + priv->children[i]->fops->xattrop, + &local->loc, + GF_XATTROP_ADD_ARRAY, xattr); + } + break; + + case AFR_ENTRY_RENAME_TRANSACTION: + { + STACK_WIND_COOKIE (frame, afr_changelog_post_op_cbk, + (void *) (long) i, + priv->children[i], + priv->children[i]->fops->xattrop, + &local->transaction.new_parent_loc, + GF_XATTROP_ADD_ARRAY, xattr); + + call_count--; + } + + /* + set it again because previous stack_wind + might have already returned (think of case + where subvolume is posix) and would have + used the dict as placeholder for return + value + */ + ret = dict_set_static_bin (xattr, local->transaction.pending, + local->pending_array, + priv->child_count * sizeof (int32_t)); + if (ret < 0) + gf_log (this->name, GF_LOG_ERROR, + "failed to set pending entry"); + + /* fall through */ + + case AFR_ENTRY_TRANSACTION: + { + if (local->fd) + STACK_WIND (frame, afr_changelog_post_op_cbk, + priv->children[i], + priv->children[i]->fops->fxattrop, + local->fd, + GF_XATTROP_ADD_ARRAY, xattr); + else + STACK_WIND (frame, afr_changelog_post_op_cbk, + priv->children[i], + priv->children[i]->fops->xattrop, + &local->transaction.parent_loc, + GF_XATTROP_ADD_ARRAY, xattr); + } + break; + } + + if (!--call_count) + break; + } + } + + dict_unref (xattr); + return 0; +} + + +int32_t +afr_changelog_pre_op_cbk (call_frame_t *frame, void *cookie, xlator_t *this, + int32_t op_ret, int32_t op_errno, dict_t *xattr) +{ + afr_local_t * local = NULL; + afr_private_t * priv 
= this->private; + loc_t * loc = NULL; + + int call_count = -1; + int child_index = (long) cookie; + + local = frame->local; + loc = &local->loc; + + LOCK (&frame->lock); + { + if (op_ret == -1) { + local->child_up[child_index] = 0; + + if (op_errno == ENOTSUP) { + gf_log (this->name, GF_LOG_ERROR, + "xattrop not supported by %s", + priv->children[child_index]->name); + local->op_ret = -1; + } else if (!child_went_down (op_ret, op_errno)) { + gf_log (this->name, GF_LOG_ERROR, + "xattrop failed on child %s: %s", + priv->children[child_index]->name, + strerror (op_errno)); + } + local->op_errno = op_errno; + } + + call_count = --local->call_count; + } + UNLOCK (&frame->lock); + + if (call_count == 0) { + if ((local->op_ret == -1) && + (local->op_errno == ENOTSUP)) { + local->transaction.resume (frame, this); + } else { + local->transaction.fop (frame, this); + } + } + + return 0; +} + + +int +afr_changelog_pre_op (call_frame_t *frame, xlator_t *this) +{ + afr_private_t * priv = this->private; + + int i = 0; + int ret = 0; + int call_count = 0; + dict_t *xattr = NULL; + + afr_local_t *local = NULL; + + local = frame->local; + xattr = get_new_dict (); + dict_ref (xattr); + + call_count = afr_up_children_count (priv->child_count, + local->child_up); + + if (local->transaction.type == AFR_ENTRY_RENAME_TRANSACTION) { + call_count *= 2; + } + + if (call_count == 0) { + /* no child is up */ + dict_unref (xattr); + afr_unlock (frame, this); + return 0; + } + + local->call_count = call_count; + + __mark_all_pending (local->pending_array, priv->child_count); + + for (i = 0; i < priv->child_count; i++) { + if (local->child_up[i]) { + ret = dict_set_static_bin (xattr, + local->transaction.pending, + local->pending_array, + (priv->child_count * + sizeof (int32_t))); + if (ret < 0) + gf_log (this->name, GF_LOG_ERROR, + "failed to set pending entry"); + + + switch (local->transaction.type) { + case AFR_DATA_TRANSACTION: + case AFR_METADATA_TRANSACTION: + case AFR_FLUSH_TRANSACTION: 
+ { + if (local->fd) + STACK_WIND_COOKIE (frame, + afr_changelog_pre_op_cbk, + (void *) (long) i, + priv->children[i], + priv->children[i]->fops->fxattrop, + local->fd, + GF_XATTROP_ADD_ARRAY, xattr); + else + STACK_WIND_COOKIE (frame, + afr_changelog_pre_op_cbk, + (void *) (long) i, + priv->children[i], + priv->children[i]->fops->xattrop, + &(local->loc), + GF_XATTROP_ADD_ARRAY, xattr); + } + break; + + case AFR_ENTRY_RENAME_TRANSACTION: + { + STACK_WIND_COOKIE (frame, + afr_changelog_pre_op_cbk, + (void *) (long) i, + priv->children[i], + priv->children[i]->fops->xattrop, + &local->transaction.new_parent_loc, + GF_XATTROP_ADD_ARRAY, xattr); + + call_count--; + } + + + /* + set it again because previous stack_wind + might have already returned (think of case + where subvolume is posix) and would have + used the dict as placeholder for return + value + */ + + ret = dict_set_static_bin (xattr, local->transaction.pending, + local->pending_array, + priv->child_count * sizeof (int32_t)); + if (ret < 0) + gf_log (this->name, GF_LOG_ERROR, + "failed to set pending entry"); + + /* fall through */ + + case AFR_ENTRY_TRANSACTION: + { + if (local->fd) + STACK_WIND_COOKIE (frame, + afr_changelog_pre_op_cbk, + (void *) (long) i, + priv->children[i], + priv->children[i]->fops->fxattrop, + local->fd, + GF_XATTROP_ADD_ARRAY, xattr); + else + STACK_WIND_COOKIE (frame, + afr_changelog_pre_op_cbk, + (void *) (long) i, + priv->children[i], + priv->children[i]->fops->xattrop, + &local->transaction.parent_loc, + GF_XATTROP_ADD_ARRAY, xattr); + } + + break; + } + + if (!--call_count) + break; + } + } + + dict_unref (xattr); + return 0; +} + +/* }}} */ + +/* {{{ lock */ + +static +int afr_lock_rec (call_frame_t *frame, xlator_t *this, int child_index); + +int32_t +afr_lock_cbk (call_frame_t *frame, void *cookie, xlator_t *this, + int32_t op_ret, int32_t op_errno) +{ + afr_local_t * local = NULL; + afr_private_t * priv = NULL; + int done = 0; + int child_index = (long) cookie; + + int 
call_count = 0; + + local = frame->local; + priv = this->private; + + LOCK (&frame->lock); + { + if (local->transaction.type == AFR_ENTRY_RENAME_TRANSACTION) { + /* wait for the other lock to return */ + call_count = --local->call_count; + } + + if (op_ret == -1) { + if (op_errno == ENOSYS) { + /* return ENOTSUP */ + gf_log (this->name, GF_LOG_ERROR, + "subvolume does not support locking. " + "please load features/posix-locks xlator on server"); + local->op_ret = op_ret; + done = 1; + } + + local->child_up[child_index] = 0; + local->op_errno = op_errno; + } + } + UNLOCK (&frame->lock); + + if (call_count == 0) { + if ((local->op_ret == -1) && + (local->op_errno == ENOSYS)) { + afr_unlock (frame, this); + } else { + /* NOTE(review): the child is counted as locked even when + * op_ret == -1 -- confirm that this is intended */ + local->transaction.locked_nodes[child_index] = 1; + local->transaction.lock_count++; + afr_lock_rec (frame, this, child_index + 1); + } + } + + return 0; +} + +/** + * lower_path - pick the location that sorts first + * + * Compares l1->path with l2->path and, when the paths are equal, b1 + * with b2. Returns l1 if it sorts lower or equal, l2 otherwise. + */ +static loc_t * +lower_path (loc_t *l1, const char *b1, loc_t *l2, const char *b2) +{ + int ret = 0; + + ret = strcmp (l1->path, l2->path); + + if (ret == 0) + ret = strcmp (b1, b2); + + if (ret <= 0) + return l1; + else + return l2; +} + +/** + * afr_lock_rec - take the transaction lock on the next child + * + * Children that are down are skipped, starting at child_index. + */ +static +int afr_lock_rec (call_frame_t *frame, xlator_t *this, int child_index) +{ + afr_local_t * local = NULL; + afr_private_t * priv = NULL; + + /* only l_start, l_len and l_type are assigned below; zero the + * rest so that no indeterminate field reaches the subvolume */ + struct flock flock = {0, }; + + loc_t * lower = NULL; + loc_t * higher = NULL; + + const char *lower_name = NULL; + const char *higher_name = NULL; + + local = frame->local; + priv = this->private; + + flock.l_start = local->transaction.start; + flock.l_len = local->transaction.len; + flock.l_type = F_WRLCK; + + /* skip over children that are down */ + while ((child_index < priv->child_count) + && !local->child_up[child_index]) + child_index++; + + if ((child_index == priv->child_count) && + local->transaction.lock_count == 0) { + + gf_log (this->name, GF_LOG_DEBUG, + "unable to lock on even one child"); + + local->op_ret = -1; + local->op_errno = EAGAIN; + + local->transaction.done (frame, this); + + return 0; + + } + +
if ((child_index == priv->child_count) + || (local->transaction.lock_count == + afr_lock_server_count (priv, local->transaction.type))) { + + /* we're done locking */ + + if (__changelog_needed_pre_op (frame, this)) { + afr_changelog_pre_op (frame, this); + } else { + local->transaction.fop (frame, this); + } + + return 0; + } + + switch (local->transaction.type) { + case AFR_DATA_TRANSACTION: + case AFR_METADATA_TRANSACTION: + case AFR_FLUSH_TRANSACTION: + + if (local->fd) { + STACK_WIND_COOKIE (frame, afr_lock_cbk, + (void *) (long) child_index, + priv->children[child_index], + priv->children[child_index]->fops->finodelk, + local->fd, F_SETLKW, &flock); + + } else { + STACK_WIND_COOKIE (frame, afr_lock_cbk, + (void *) (long) child_index, + priv->children[child_index], + priv->children[child_index]->fops->inodelk, + &local->loc, F_SETLKW, &flock); + } + + break; + + case AFR_ENTRY_RENAME_TRANSACTION: + { + local->call_count = 2; + + lower = lower_path (&local->transaction.parent_loc, + local->transaction.basename, + &local->transaction.new_parent_loc, + local->transaction.new_basename); + + lower_name = (lower == &local->transaction.parent_loc ? + local->transaction.basename : + local->transaction.new_basename); + + higher = (lower == &local->transaction.parent_loc ? + &local->transaction.new_parent_loc : + &local->transaction.parent_loc); + + higher_name = (higher == &local->transaction.parent_loc ? 
+ local->transaction.basename : + local->transaction.new_basename); + + + /* TODO: these locks should be blocking */ + + STACK_WIND_COOKIE (frame, afr_lock_cbk, + (void *) (long) child_index, + priv->children[child_index], + priv->children[child_index]->fops->entrylk, + lower, lower_name, + ENTRYLK_LOCK, ENTRYLK_WRLCK); + + STACK_WIND_COOKIE (frame, afr_lock_cbk, + (void *) (long) child_index, + priv->children[child_index], + priv->children[child_index]->fops->entrylk, + higher, higher_name, + ENTRYLK_LOCK, ENTRYLK_WRLCK); + + break; + } + + case AFR_ENTRY_TRANSACTION: + if (local->fd) { + STACK_WIND_COOKIE (frame, afr_lock_cbk, + (void *) (long) child_index, + priv->children[child_index], + priv->children[child_index]->fops->fentrylk, + local->fd, + local->transaction.basename, + ENTRYLK_LOCK, ENTRYLK_WRLCK); + } else { + STACK_WIND_COOKIE (frame, afr_lock_cbk, + (void *) (long) child_index, + priv->children[child_index], + priv->children[child_index]->fops->entrylk, + &local->transaction.parent_loc, + local->transaction.basename, + ENTRYLK_LOCK, ENTRYLK_WRLCK); + } + + break; + } + + return 0; +} + + +int32_t afr_lock (call_frame_t *frame, xlator_t *this) +{ + return afr_lock_rec (frame, this, 0); +} + + +/* }}} */ + +int32_t +afr_transaction_resume (call_frame_t *frame, xlator_t *this) +{ + afr_local_t * local = NULL; + afr_private_t * priv = NULL; + + local = frame->local; + priv = this->private; + + if (__changelog_needed_post_op (frame, this)) { + afr_changelog_post_op (frame, this); + } else { + if (afr_lock_server_count (priv, local->transaction.type) == 0) { + local->transaction.done (frame, this); + } else { + afr_unlock (frame, this); + } + } + + return 0; +} + + +/** + * afr_transaction_child_died - inform that a child died during an fop + */ + +void +afr_transaction_child_died (call_frame_t *frame, xlator_t *this, int child_index) +{ + afr_local_t * local = NULL; + afr_private_t * priv = NULL; + + local = frame->local; + priv = this->private; + + 
__mark_child_dead (local->pending_array, priv->child_count, child_index); +} + + +int32_t +afr_transaction (call_frame_t *frame, xlator_t *this, afr_transaction_type type) +{ + afr_local_t * local = NULL; + afr_private_t * priv = NULL; + + local = frame->local; + priv = this->private; + + afr_transaction_local_init (local, priv); + + local->transaction.resume = afr_transaction_resume; + local->transaction.type = type; + + if (afr_lock_server_count (priv, local->transaction.type) == 0) { + if (__changelog_needed_pre_op (frame, this)) { + afr_changelog_pre_op (frame, this); + } else { + local->transaction.fop (frame, this); + } + } else { + afr_lock (frame, this); + } + + return 0; +} diff --git a/xlators/cluster/afr/src/afr-transaction.h b/xlators/cluster/afr/src/afr-transaction.h new file mode 100644 index 000000000..49cdd219f --- /dev/null +++ b/xlators/cluster/afr/src/afr-transaction.h @@ -0,0 +1,36 @@ +/* + Copyright (c) 2007, 2008 Z RESEARCH, Inc. <http://www.zresearch.com> + This file is part of GlusterFS. + + GlusterFS is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published + by the Free Software Foundation; either version 3 of the License, + or (at your option) any later version. + + GlusterFS is distributed in the hope that it will be useful, but + WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program. If not, see + <http://www.gnu.org/licenses/>. 
+*/ + +#ifndef __TRANSACTION_H__ +#define __TRANSACTION_H__ + +#define AFR_METADATA_PENDING "trusted.glusterfs.afr.metadata-pending" + +#define AFR_DATA_PENDING "trusted.glusterfs.afr.data-pending" + +#define AFR_ENTRY_PENDING "trusted.glusterfs.afr.entry-pending" + +void +afr_transaction_child_died (call_frame_t *frame, xlator_t *this, + int child_index); + +int32_t +afr_transaction (call_frame_t *frame, xlator_t *this, afr_transaction_type type); + +#endif /* __TRANSACTION_H__ */ diff --git a/xlators/cluster/afr/src/afr.c b/xlators/cluster/afr/src/afr.c new file mode 100644 index 000000000..e4c1a8479 --- /dev/null +++ b/xlators/cluster/afr/src/afr.c @@ -0,0 +1,2338 @@ +/* + Copyright (c) 2007, 2008, 2009 Z RESEARCH, Inc. <http://www.zresearch.com> + This file is part of GlusterFS. + + GlusterFS is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published + by the Free Software Foundation; either version 3 of the License, + or (at your option) any later version. + + GlusterFS is distributed in the hope that it will be useful, but + WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program. If not, see + <http://www.gnu.org/licenses/>. 
+*/ + +#include <libgen.h> +#include <unistd.h> +#include <fnmatch.h> +#include <sys/time.h> +#include <stdlib.h> +#include <signal.h> + +#ifndef _CONFIG_H +#define _CONFIG_H +#include "config.h" +#endif + +#include "glusterfs.h" +#include "afr.h" +#include "dict.h" +#include "xlator.h" +#include "hashfn.h" +#include "logging.h" +#include "stack.h" +#include "list.h" +#include "call-stub.h" +#include "defaults.h" +#include "common-utils.h" +#include "compat-errno.h" +#include "compat.h" +#include "byte-order.h" + +#include "afr-inode-read.h" +#include "afr-inode-write.h" +#include "afr-dir-read.h" +#include "afr-dir-write.h" +#include "afr-transaction.h" + +#include "afr-self-heal.h" + + +/** + * afr_local_cleanup - cleanup everything in frame->local + */ + +void +afr_local_sh_cleanup (afr_local_t *local, xlator_t *this) +{ + afr_self_heal_t *sh = NULL; + afr_private_t *priv = NULL; + int i = 0; + + + sh = &local->self_heal; + priv = this->private; + + if (sh->buf) + FREE (sh->buf); + + if (sh->xattr) { + for (i = 0; i < priv->child_count; i++) { + if (sh->xattr[i]) { + dict_unref (sh->xattr[i]); + sh->xattr[i] = NULL; + } + } + FREE (sh->xattr); + } + + if (sh->child_errno) + FREE (sh->child_errno); + + if (sh->pending_matrix) { + for (i = 0; i < priv->child_count; i++) { + FREE (sh->pending_matrix[i]); + } + FREE (sh->pending_matrix); + } + + if (sh->delta_matrix) { + for (i = 0; i < priv->child_count; i++) { + FREE (sh->delta_matrix[i]); + } + FREE (sh->delta_matrix); + } + + if (sh->sources) + FREE (sh->sources); + + if (sh->success) + FREE (sh->success); + + if (sh->healing_fd) { + fd_unref (sh->healing_fd); + sh->healing_fd = NULL; + } + + loc_wipe (&sh->parent_loc); +} + + +void +afr_local_cleanup (afr_local_t *local, xlator_t *this) +{ + if (!local) + return; + + afr_local_sh_cleanup (local, this); + + FREE (local->child_errno); + FREE (local->pending_array); + + loc_wipe (&local->loc); + loc_wipe (&local->newloc); + + FREE 
(local->transaction.locked_nodes); + FREE (local->transaction.child_errno); + + FREE (local->transaction.basename); + FREE (local->transaction.new_basename); + + loc_wipe (&local->transaction.parent_loc); + loc_wipe (&local->transaction.new_parent_loc); + + if (local->fd) + fd_unref (local->fd); + + if (local->xattr_req) + dict_unref (local->xattr_req); + + FREE (local->child_up); + + { /* lookup */ + if (local->cont.lookup.xattr) + dict_unref (local->cont.lookup.xattr); + } + + { /* getxattr */ + if (local->cont.getxattr.name) + FREE (local->cont.getxattr.name); + } + + { /* lk */ + if (local->cont.lk.locked_nodes) + FREE (local->cont.lk.locked_nodes); + } + + { /* checksum */ + if (local->cont.checksum.file_checksum) + FREE (local->cont.checksum.file_checksum); + if (local->cont.checksum.dir_checksum) + FREE (local->cont.checksum.dir_checksum); + } + + { /* create */ + if (local->cont.create.fd) + fd_unref (local->cont.create.fd); + } + + { /* writev */ + FREE (local->cont.writev.vector); + } + + { /* setxattr */ + if (local->cont.setxattr.dict) + dict_unref (local->cont.setxattr.dict); + } + + { /* removexattr */ + FREE (local->cont.removexattr.name); + } + + { /* symlink */ + FREE (local->cont.symlink.linkpath); + } +} + + +int +afr_frame_return (call_frame_t *frame) +{ + afr_local_t *local = NULL; + int call_count = 0; + + local = frame->local; + + LOCK (&frame->lock); + { + call_count = --local->call_count; + } + UNLOCK (&frame->lock); + + return call_count; +} + +/** + * first_up_child - return the index of the first child that is up + */ + +int +afr_first_up_child (afr_private_t *priv) +{ + xlator_t ** children = NULL; + int ret = -1; + int i = 0; + + LOCK (&priv->lock); + { + children = priv->children; + for (i = 0; i < priv->child_count; i++) { + if (priv->child_up[i]) { + ret = i; + break; + } + } + } + UNLOCK (&priv->lock); + + return ret; +} + + +/** + * up_children_count - return the number of children that are up + */ + +int +afr_up_children_count 
(int child_count, unsigned char *child_up) +{ + int i = 0; + int ret = 0; + + for (i = 0; i < child_count; i++) + if (child_up[i]) + ret++; + return ret; +} + +/** + * afr_locked_nodes_count - return how many of the first child_count + * entries of locked_nodes are non-zero + */ +int +afr_locked_nodes_count (unsigned char *locked_nodes, int child_count) +{ + int ret = 0; + int i; + + for (i = 0; i < child_count; i++) + if (locked_nodes[i]) + ret++; + + return ret; +} + +/** + * afr_itransform - fold a child index into an inode number + * + * Returns (ino * child_count) + child_index, except that an ino of + * (uint64_t) -1 is returned unchanged. + */ +ino64_t +afr_itransform (ino64_t ino, int child_count, int child_index) +{ + ino64_t scaled_ino = -1; + + if (ino == ((uint64_t) -1)) { + scaled_ino = ((uint64_t) -1); + goto out; + } + + scaled_ino = (ino * child_count) + child_index; + +out: + return scaled_ino; +} + +/** + * afr_deitransform_orig - return ino % child_count, i.e. the child_index + * term added by afr_itransform (assuming child_index < child_count) + */ +int +afr_deitransform_orig (ino64_t ino, int child_count) +{ + int index = -1; + + index = ino % child_count; + + return index; +} + +/** + * afr_deitransform - currently ignores both arguments and returns 0 + */ +int +afr_deitransform (ino64_t ino, int child_count) +{ + return 0; +} + +/** + * afr_self_heal_cbk - unwind the lookup after self-heal + * + * If local->govinda_gOvinda is set, the value 1 is stored in the inode + * context (a failure to store it turns the reply into an error); + * otherwise the inode context entry is deleted. The saved lookup + * results are then unwound. + */ +int +afr_self_heal_cbk (call_frame_t *frame, xlator_t *this) +{ + afr_local_t *local = NULL; + int ret = -1; + + local = frame->local; + + if (local->govinda_gOvinda) { + ret = inode_ctx_put (local->cont.lookup.inode, this, 1); + + if (ret < 0) { + local->op_ret = -1; + local->op_errno = -ret; + } + } else { + inode_ctx_del (local->cont.lookup.inode, this, NULL); + } + + AFR_STACK_UNWIND (frame, local->op_ret, local->op_errno, + local->cont.lookup.inode, + &local->cont.lookup.buf, + local->cont.lookup.xattr); + + return 0; +} + +/** + * afr_lookup_cbk - collect the lookup reply of one child + * + * cookie carries the index of the replying child. + */ +int +afr_lookup_cbk (call_frame_t *frame, void *cookie, + xlator_t *this, int32_t op_ret, int32_t op_errno, + inode_t *inode, struct stat *buf, dict_t *xattr) +{ + afr_local_t * local = NULL; + afr_private_t * priv = NULL; + struct stat * lookup_buf = NULL; + int call_count = -1; + int child_index = -1; + int prev_child_index = -1; + uint32_t open_fd_count = 0; + int ret = 0; + + child_index = (long) cookie; + priv = this->private; + + LOCK (&frame->lock); + { + local = frame->local; + + lookup_buf = &local->cont.lookup.buf; + + if (op_ret == -1) { + if (op_errno == ENOENT) + local->enoent_count++; + + if 
(op_errno != ENOTCONN) + local->op_errno = op_errno; + + goto unlock; + } + /* flag a self-heal for every changelog the helpers report as pending */ + if (afr_sh_has_metadata_pending (xattr, child_index, this)) + local->need_metadata_self_heal = 1; + + if (afr_sh_has_entry_pending (xattr, child_index, this)) + local->need_entry_self_heal = 1; + + if (afr_sh_has_data_pending (xattr, child_index, this)) + local->need_data_self_heal = 1; + /* add the open-fd count found in this reply to the running total */ + ret = dict_get_uint32 (xattr, GLUSTERFS_OPEN_FD_COUNT, + &open_fd_count); + local->open_fd_count += open_fd_count; + + /* in case of revalidate, we need to send stat of the + * child whose stat was sent during the first lookup + * (so that time stamp does not vary with revalidate). + * in case it is down, stat of the first success will + * be replied */ + + /* inode number should be preserved across revalidates */ + + if (local->success_count == 0) { + local->op_ret = op_ret; + /* first successful reply: keep its inode, xattr and stat */ + local->cont.lookup.inode = inode; + local->cont.lookup.xattr = dict_ref (xattr); + + *lookup_buf = *buf; + lookup_buf->st_ino = afr_itransform (buf->st_ino, + priv->child_count, + child_index); + } else { + if (FILETYPE_DIFFERS (buf, lookup_buf)) { + /* mismatching filetypes with same name + -- Govinda !! GOvinda !!! 
+ */ + local->govinda_gOvinda = 1; + } + + if (PERMISSION_DIFFERS (buf, lookup_buf)) { + /* mismatching permissions */ + local->need_metadata_self_heal = 1; + } + + if (OWNERSHIP_DIFFERS (buf, lookup_buf)) { + /* mismatching ownership */ + local->need_metadata_self_heal = 1; + } + + if (SIZE_DIFFERS (buf, lookup_buf) + && S_ISREG (buf->st_mode)) { + local->need_data_self_heal = 1; + } + /* prefer the stat of the lowest-indexed child that has + * replied successfully so far */ + prev_child_index = afr_deitransform_orig (lookup_buf->st_ino, + priv->child_count); + if (child_index < prev_child_index) { + *lookup_buf = *buf; + lookup_buf->st_ino = afr_itransform (buf->st_ino, + priv->child_count, + child_index); + } + } + + local->success_count++; + } +unlock: + UNLOCK (&frame->lock); + + call_count = afr_frame_return (frame); + /* no replies are outstanding any more */ + if (call_count == 0) { + if (local->op_ret == 0) { + /* KLUDGE: assuming DHT will not itransform in + revalidate */ + if (local->cont.lookup.inode->ino) + lookup_buf->st_ino = + local->cont.lookup.inode->ino; + } + + if (local->success_count && local->enoent_count) { + local->need_metadata_self_heal = 1; + local->need_data_self_heal = 1; + local->need_entry_self_heal = 1; + } + + if (local->success_count) { + /* check for govinda_gOvinda case in previous lookup */ + if (!inode_ctx_get (local->cont.lookup.inode, + this, NULL)) + local->need_data_self_heal = 1; + } + /* self-heal only if some heal is needed and no open fd + * was reported */ + if ((local->need_metadata_self_heal + || local->need_data_self_heal + || local->need_entry_self_heal) + && (!local->open_fd_count)) { + + if (!local->cont.lookup.inode->st_mode) { + /* fix for RT #602 */ + local->cont.lookup.inode->st_mode = + lookup_buf->st_mode; + } + + afr_self_heal (frame, this, afr_self_heal_cbk); + } else { + AFR_STACK_UNWIND (frame, local->op_ret, + local->op_errno, + local->cont.lookup.inode, + &local->cont.lookup.buf, + local->cont.lookup.xattr); + } + } + + return 0; +} + +/** + * afr_lookup - wind a lookup for loc to the children + */ +int +afr_lookup (call_frame_t *frame, xlator_t *this, + loc_t *loc, dict_t *xattr_req) +{ + afr_private_t *priv = NULL; + afr_local_t *local = NULL; + int ret = -1; + 
int i = 0; + int32_t op_errno = 0; + + + priv = this->private; + + ALLOC_OR_GOTO (local, afr_local_t, out); + + local->op_ret = -1; + + frame->local = local; + + loc_copy (&local->loc, loc); + + local->reval_child_index = 0; + + local->call_count = priv->child_count; + + local->child_up = memdup (priv->child_up, priv->child_count); + local->child_count = afr_up_children_count (priv->child_count, + local->child_up); + + /* By default assume ENOTCONN. On success it will be set to 0. */ + local->op_errno = ENOTCONN; + + if ((xattr_req == NULL) + && (priv->metadata_self_heal + || priv->data_self_heal + || priv->entry_self_heal)) + local->xattr_req = dict_new (); + else + local->xattr_req = dict_ref (xattr_req); + + if (priv->metadata_self_heal) { + ret = dict_set_uint64 (local->xattr_req, AFR_METADATA_PENDING, + priv->child_count * sizeof(int32_t)); + } + + if (priv->data_self_heal) { + ret = dict_set_uint64 (local->xattr_req, AFR_DATA_PENDING, + priv->child_count * sizeof(int32_t)); + } + + if (priv->entry_self_heal) { + ret = dict_set_uint64 (local->xattr_req, AFR_ENTRY_PENDING, + priv->child_count * sizeof(int32_t)); + } + + ret = dict_set_uint64 (local->xattr_req, GLUSTERFS_OPEN_FD_COUNT, 0); + + for (i = 0; i < priv->child_count; i++) { + STACK_WIND_COOKIE (frame, afr_lookup_cbk, (void *) (long) i, + priv->children[i], + priv->children[i]->fops->lookup, + loc, local->xattr_req); + } + + ret = 0; +out: + if (ret == -1) + AFR_STACK_UNWIND (frame, -1, ENOMEM, NULL, NULL, NULL); + + return 0; +} + + +/* {{{ open */ + +int +afr_open_ftruncate_cbk (call_frame_t *frame, void *cookie, xlator_t *this, + int32_t op_ret, int32_t op_errno, struct stat *buf) +{ + afr_local_t * local = frame->local; + + AFR_STACK_UNWIND (frame, local->op_ret, local->op_errno, + local->fd); + return 0; +} + + +int +afr_open_cbk (call_frame_t *frame, void *cookie, + xlator_t *this, int32_t op_ret, int32_t op_errno, + fd_t *fd) +{ + afr_local_t * local = NULL; + afr_private_t * priv = NULL; + + 
int call_count = -1; + + priv = this->private; + local = frame->local; + + LOCK (&frame->lock); + { + if (op_ret == -1) { + local->op_errno = op_errno; + } + + if (op_ret >= 0) { + local->op_ret = op_ret; + } + } + UNLOCK (&frame->lock); + + call_count = afr_frame_return (frame); + + if (call_count == 0) { + if ((local->cont.open.flags & O_TRUNC) + && (local->op_ret >= 0)) { + /* use the fd referenced in afr_open: the fd argument + * belongs to whichever reply arrived last, which may + * have been a failure */ + STACK_WIND (frame, afr_open_ftruncate_cbk, + this, this->fops->ftruncate, + local->fd, 0); + } else { + AFR_STACK_UNWIND (frame, local->op_ret, + local->op_errno, local->fd); + } + } + + return 0; +} + +/** + * afr_open - open the file on every child that is up + * + * O_TRUNC is stripped from the flags sent to the children; afr_open_cbk + * winds a single ftruncate afterwards instead. Unwinds with EIO when + * the inode context is set for loc->inode. + */ +int +afr_open (call_frame_t *frame, xlator_t *this, + loc_t *loc, int32_t flags, fd_t *fd) +{ + afr_private_t * priv = NULL; + afr_local_t * local = NULL; + + int i = 0; + int ret = -1; + + int32_t call_count = 0; + int32_t op_ret = -1; + int32_t op_errno = 0; + int32_t wind_flags = flags & (~O_TRUNC); + + VALIDATE_OR_GOTO (frame, out); + VALIDATE_OR_GOTO (this, out); + VALIDATE_OR_GOTO (this->private, out); + VALIDATE_OR_GOTO (loc, out); + + priv = this->private; + + ret = inode_ctx_get (loc->inode, this, NULL); + if (ret == 0) { + /* if ctx is set it means self-heal failed */ + + gf_log (this->name, GF_LOG_WARNING, + "returning EIO, file has to be manually corrected " + "in backend"); + op_errno = EIO; + goto out; + } + + ALLOC_OR_GOTO (local, afr_local_t, out); + + ret = AFR_LOCAL_INIT (local, priv); + if (ret < 0) { + op_errno = -ret; + goto out; + } + + frame->local = local; + call_count = local->call_count; + + local->cont.open.flags = flags; + local->fd = fd_ref (fd); + + for (i = 0; i < priv->child_count; i++) { + if (local->child_up[i]) { + STACK_WIND_COOKIE (frame, afr_open_cbk, (void *) (long) i, + priv->children[i], + priv->children[i]->fops->open, + loc, wind_flags, fd); + + if (!--call_count) + break; + } + } + + op_ret = 0; +out: + if (op_ret == -1) { + AFR_STACK_UNWIND (frame, op_ret, op_errno, fd); + } + + return 0; +} + +/* }}} */ + +/* {{{ flush */ + +int +afr_flush_wind_cbk 
(call_frame_t *frame, void *cookie, xlator_t *this, + int32_t op_ret, int32_t op_errno) +{ + afr_local_t * local = NULL; + + int call_count = -1; + + local = frame->local; + + LOCK (&frame->lock); + { + if (op_ret == 0) + local->op_ret = 0; + + local->op_errno = op_errno; + } + UNLOCK (&frame->lock); + + call_count = afr_frame_return (frame); + + if (call_count == 0) { + local->transaction.resume (frame, this); + } + + return 0; +} + + +int +afr_flush_wind (call_frame_t *frame, xlator_t *this) +{ + afr_local_t *local = NULL; + afr_private_t *priv = NULL; + + int i = 0; + int call_count = -1; + + local = frame->local; + priv = this->private; + + call_count = afr_up_children_count (priv->child_count, local->child_up); + + if (call_count == 0) { + local->transaction.resume (frame, this); + return 0; + } + + local->call_count = call_count; + + for (i = 0; i < priv->child_count; i++) { + if (local->child_up[i]) { + STACK_WIND_COOKIE (frame, afr_flush_wind_cbk, + (void *) (long) i, + priv->children[i], + priv->children[i]->fops->flush, + local->fd); + + if (!--call_count) + break; + } + } + + return 0; +} + + +int +afr_flush_done (call_frame_t *frame, xlator_t *this) +{ + afr_local_t *local = NULL; + + local = frame->local; + + AFR_STACK_UNWIND (frame, local->op_ret, local->op_errno); + + return 0; +} + + +int +afr_simple_flush_cbk (call_frame_t *frame, void *cookie, + xlator_t *this, int32_t op_ret, int32_t op_errno) +{ + afr_local_t *local = NULL; + + int call_count = -1; + + local = frame->local; + + LOCK (&frame->lock); + { + if (op_ret == 0) + local->op_ret = 0; + + local->op_errno = op_errno; + } + UNLOCK (&frame->lock); + + call_count = afr_frame_return (frame); + + if (call_count == 0) + AFR_STACK_UNWIND (frame, local->op_ret, local->op_errno); + + return 0; +} + + +static int +__is_fd_ctx_set (xlator_t *this, fd_t *fd) +{ + int _ret = 0; + int op_ret = 0; + + _ret = fd_ctx_get (fd, this, NULL); + if (_ret == 0) + op_ret = 1; + + return op_ret; +} + + +int 
+afr_flush (call_frame_t *frame, xlator_t *this, fd_t *fd) +{ + afr_private_t * priv = NULL; + afr_local_t * local = NULL; + + int ret = -1; + int i = 0; + int call_count = 0; + + int op_ret = -1; + int op_errno = 0; + + VALIDATE_OR_GOTO (frame, out); + VALIDATE_OR_GOTO (this, out); + VALIDATE_OR_GOTO (this->private, out); + + priv = this->private; + + ALLOC_OR_GOTO (local, afr_local_t, out); + + ret = AFR_LOCAL_INIT (local, priv); + if (ret < 0) { + op_errno = -ret; + goto out; + } + + frame->local = local; + + if (__is_fd_ctx_set (this, fd)) { + local->op = GF_FOP_FLUSH; + local->transaction.fop = afr_flush_wind; + local->transaction.done = afr_flush_done; + + local->fd = fd_ref (fd); + + local->transaction.start = 0; + local->transaction.len = 0; + + local->transaction.pending = AFR_DATA_PENDING; + + afr_transaction (frame, this, AFR_FLUSH_TRANSACTION); + } else { + /* + * if fd's ctx is not set, then there is no need + * to erase changelog. So just send the flush + */ + + call_count = local->call_count; + + for (i = 0; i < priv->child_count; i++) { + if (local->child_up[i]) { + STACK_WIND (frame, afr_simple_flush_cbk, + priv->children[i], + priv->children[i]->fops->flush, + fd); + + if (!--call_count) + break; + } + } + } + + op_ret = 0; +out: + if (op_ret == -1) { + AFR_STACK_UNWIND (frame, op_ret, op_errno, NULL); + } + + return 0; +} + +/* }}} */ + +/* {{{ fsync */ + +int +afr_fsync_cbk (call_frame_t *frame, void *cookie, + xlator_t *this, int32_t op_ret, int32_t op_errno) +{ + afr_local_t *local = NULL; + + int call_count = -1; + + local = frame->local; + + LOCK (&frame->lock); + { + if (op_ret == 0) + local->op_ret = 0; + + local->op_errno = op_errno; + } + UNLOCK (&frame->lock); + + call_count = afr_frame_return (frame); + + if (call_count == 0) + AFR_STACK_UNWIND (frame, local->op_ret, local->op_errno); + + return 0; +} + + +int +afr_fsync (call_frame_t *frame, xlator_t *this, fd_t *fd, + int32_t datasync) +{ + afr_private_t *priv = NULL; + afr_local_t 
*local = NULL; + + int ret = -1; + + int i = 0; + int32_t call_count = 0; + int32_t op_ret = -1; + int32_t op_errno = 0; + + VALIDATE_OR_GOTO (frame, out); + VALIDATE_OR_GOTO (this, out); + VALIDATE_OR_GOTO (this->private, out); + + priv = this->private; + + ALLOC_OR_GOTO (local, afr_local_t, out); + + ret = AFR_LOCAL_INIT (local, priv); + if (ret < 0) { + op_errno = -ret; + goto out; + } + + call_count = local->call_count; + frame->local = local; + + for (i = 0; i < priv->child_count; i++) { + if (local->child_up[i]) { + STACK_WIND (frame, afr_fsync_cbk, + priv->children[i], + priv->children[i]->fops->fsync, + fd, datasync); + if (!--call_count) + break; + } + } + + op_ret = 0; +out: + if (op_ret == -1) { + AFR_STACK_UNWIND (frame, op_ret, op_errno); + } + return 0; +} + +/* }}} */ + +/* {{{ fsyncdir */ + +/** + * afr_fsyncdir_cbk - collect one child's fsyncdir reply + * + * op_ret becomes 0 as soon as one child succeeds; the frame is unwound + * once no replies are outstanding. + */ +int32_t +afr_fsyncdir_cbk (call_frame_t *frame, void *cookie, + xlator_t *this, int32_t op_ret, int32_t op_errno) +{ + afr_local_t *local = NULL; + + int call_count = -1; + + local = frame->local; + + LOCK (&frame->lock); + { + if (op_ret == 0) + local->op_ret = 0; + + local->op_errno = op_errno; + } + UNLOCK (&frame->lock); + + call_count = afr_frame_return (frame); + + if (call_count == 0) + AFR_STACK_UNWIND (frame, local->op_ret, local->op_errno); + + return 0; +} + +/** + * afr_fsyncdir - wind fsyncdir to every child that is up + */ +int32_t +afr_fsyncdir (call_frame_t *frame, xlator_t *this, fd_t *fd, + int32_t datasync) +{ + afr_private_t *priv = NULL; + afr_local_t *local = NULL; + + int ret = -1; + + int i = 0; + int32_t call_count = 0; + int32_t op_ret = -1; + int32_t op_errno = 0; + + VALIDATE_OR_GOTO (frame, out); + VALIDATE_OR_GOTO (this, out); + VALIDATE_OR_GOTO (this->private, out); + + priv = this->private; + + ALLOC_OR_GOTO (local, afr_local_t, out); + + ret = AFR_LOCAL_INIT (local, priv); + if (ret < 0) { + op_errno = -ret; + goto out; + } + + call_count = local->call_count; + frame->local = local; + + for (i = 0; i < priv->child_count; i++) { + if (local->child_up[i]) { + STACK_WIND (frame, afr_fsyncdir_cbk, + 
priv->children[i], + priv->children[i]->fops->fsyncdir, + fd, datasync); + if (!--call_count) + break; + } + } + + op_ret = 0; +out: + if (op_ret == -1) { + AFR_STACK_UNWIND (frame, op_ret, op_errno); + } + return 0; +} + +/* }}} */ + +/* {{{ xattrop */ + +/** + * afr_xattrop_cbk - collect one child's xattrop reply + * + * op_ret becomes 0 as soon as one child succeeds; once no replies are + * outstanding the frame is unwound with the xattr of the last reply. + */ +int32_t +afr_xattrop_cbk (call_frame_t *frame, void *cookie, + xlator_t *this, int32_t op_ret, int32_t op_errno, + dict_t *xattr) +{ + afr_local_t *local = NULL; + + int call_count = -1; + + local = frame->local; + + LOCK (&frame->lock); + { + if (op_ret == 0) + local->op_ret = 0; + + local->op_errno = op_errno; + } + UNLOCK (&frame->lock); + + call_count = afr_frame_return (frame); + + if (call_count == 0) + AFR_STACK_UNWIND (frame, local->op_ret, local->op_errno, xattr); + + return 0; +} + +/** + * afr_xattrop - wind xattrop to every child that is up + */ +int32_t +afr_xattrop (call_frame_t *frame, xlator_t *this, loc_t *loc, + gf_xattrop_flags_t optype, dict_t *xattr) +{ + afr_private_t *priv = NULL; + afr_local_t *local = NULL; + + int ret = -1; + + int i = 0; + int32_t call_count = 0; + int32_t op_ret = -1; + int32_t op_errno = 0; + + VALIDATE_OR_GOTO (frame, out); + VALIDATE_OR_GOTO (this, out); + VALIDATE_OR_GOTO (this->private, out); + + priv = this->private; + + ALLOC_OR_GOTO (local, afr_local_t, out); + + ret = AFR_LOCAL_INIT (local, priv); + if (ret < 0) { + op_errno = -ret; + goto out; + } + + call_count = local->call_count; + frame->local = local; + + for (i = 0; i < priv->child_count; i++) { + if (local->child_up[i]) { + STACK_WIND (frame, afr_xattrop_cbk, + priv->children[i], + priv->children[i]->fops->xattrop, + loc, optype, xattr); + if (!--call_count) + break; + } + } + + op_ret = 0; +out: + if (op_ret == -1) { + /* the xattrop callback takes a dict_t *; pass NULL for it */ + AFR_STACK_UNWIND (frame, op_ret, op_errno, NULL); + } + return 0; +} + +/* }}} */ + +/* {{{ fxattrop */ + +int32_t +afr_fxattrop_cbk (call_frame_t *frame, void *cookie, + xlator_t *this, int32_t op_ret, int32_t op_errno, + dict_t *xattr) +{ + afr_local_t *local = NULL; + + int call_count = -1; + + local = frame->local; + + LOCK (&frame->lock); + { + if 
(op_ret == 0) + local->op_ret = 0; + + local->op_errno = op_errno; + } + UNLOCK (&frame->lock); + + call_count = afr_frame_return (frame); + + if (call_count == 0) + AFR_STACK_UNWIND (frame, local->op_ret, local->op_errno, xattr); + + return 0; +} + +/** + * afr_fxattrop - wind fxattrop to every child that is up + */ +int32_t +afr_fxattrop (call_frame_t *frame, xlator_t *this, fd_t *fd, + gf_xattrop_flags_t optype, dict_t *xattr) +{ + afr_private_t *priv = NULL; + afr_local_t *local = NULL; + + int ret = -1; + + int i = 0; + int32_t call_count = 0; + int32_t op_ret = -1; + int32_t op_errno = 0; + + VALIDATE_OR_GOTO (frame, out); + VALIDATE_OR_GOTO (this, out); + VALIDATE_OR_GOTO (this->private, out); + + priv = this->private; + + ALLOC_OR_GOTO (local, afr_local_t, out); + + ret = AFR_LOCAL_INIT (local, priv); + if (ret < 0) { + op_errno = -ret; + goto out; + } + + call_count = local->call_count; + frame->local = local; + + for (i = 0; i < priv->child_count; i++) { + if (local->child_up[i]) { + STACK_WIND (frame, afr_fxattrop_cbk, + priv->children[i], + priv->children[i]->fops->fxattrop, + fd, optype, xattr); + if (!--call_count) + break; + } + } + + op_ret = 0; +out: + if (op_ret == -1) { + /* the fxattrop callback takes a dict_t *; pass NULL for it */ + AFR_STACK_UNWIND (frame, op_ret, op_errno, NULL); + } + return 0; +} + +/* }}} */ + +/** + * afr_inodelk_cbk - collect one child's inodelk reply + * + * op_ret becomes 0 as soon as one child succeeds; the frame is unwound + * once no replies are outstanding. + */ +int32_t +afr_inodelk_cbk (call_frame_t *frame, void *cookie, + xlator_t *this, int32_t op_ret, int32_t op_errno) + +{ + afr_local_t *local = NULL; + + int call_count = -1; + + local = frame->local; + + LOCK (&frame->lock); + { + if (op_ret == 0) + local->op_ret = 0; + + local->op_errno = op_errno; + } + UNLOCK (&frame->lock); + + call_count = afr_frame_return (frame); + + if (call_count == 0) + AFR_STACK_UNWIND (frame, local->op_ret, local->op_errno); + + return 0; +} + + +int32_t +afr_inodelk (call_frame_t *frame, xlator_t *this, loc_t *loc, + int32_t cmd, struct flock *flock) +{ + afr_private_t *priv = NULL; + afr_local_t *local = NULL; + + int ret = -1; + + int i = 0; + int32_t call_count = 0; + int32_t op_ret = -1; + int32_t op_errno = 0; + + VALIDATE_OR_GOTO 
(frame, out); + VALIDATE_OR_GOTO (this, out); + VALIDATE_OR_GOTO (this->private, out); + + priv = this->private; + + ALLOC_OR_GOTO (local, afr_local_t, out); + + ret = AFR_LOCAL_INIT (local, priv); + if (ret < 0) { + op_errno = -ret; + goto out; + } + + call_count = local->call_count; + frame->local = local; + + for (i = 0; i < priv->child_count; i++) { + if (local->child_up[i]) { + STACK_WIND (frame, afr_inodelk_cbk, + priv->children[i], + priv->children[i]->fops->inodelk, + loc, cmd, flock); + + if (!--call_count) + break; + } + } + + op_ret = 0; +out: + if (op_ret == -1) { + AFR_STACK_UNWIND (frame, op_ret, op_errno); + } + return 0; +} + + +int32_t +afr_finodelk_cbk (call_frame_t *frame, void *cookie, + xlator_t *this, int32_t op_ret, int32_t op_errno) + +{ + afr_local_t *local = NULL; + + int call_count = -1; + + local = frame->local; + + LOCK (&frame->lock); + { + if (op_ret == 0) + local->op_ret = 0; + + local->op_errno = op_errno; + } + UNLOCK (&frame->lock); + + call_count = afr_frame_return (frame); + + if (call_count == 0) + AFR_STACK_UNWIND (frame, local->op_ret, local->op_errno); + + return 0; +} + + +int32_t +afr_finodelk (call_frame_t *frame, xlator_t *this, fd_t *fd, + int32_t cmd, struct flock *flock) +{ + afr_private_t *priv = NULL; + afr_local_t *local = NULL; + + int ret = -1; + + int i = 0; + int32_t call_count = 0; + int32_t op_ret = -1; + int32_t op_errno = 0; + + VALIDATE_OR_GOTO (frame, out); + VALIDATE_OR_GOTO (this, out); + VALIDATE_OR_GOTO (this->private, out); + + priv = this->private; + + ALLOC_OR_GOTO (local, afr_local_t, out); + + ret = AFR_LOCAL_INIT (local, priv); + if (ret < 0) { + op_errno = -ret; + goto out; + } + + call_count = local->call_count; + frame->local = local; + + for (i = 0; i < priv->child_count; i++) { + if (local->child_up[i]) { + STACK_WIND (frame, afr_finodelk_cbk, + priv->children[i], + priv->children[i]->fops->finodelk, + fd, cmd, flock); + + if (!--call_count) + break; + } + } + + op_ret = 0; +out: + if 
(op_ret == -1) { + AFR_STACK_UNWIND (frame, op_ret, op_errno); + } + return 0; +} + + +int32_t +afr_entrylk_cbk (call_frame_t *frame, void *cookie, + xlator_t *this, int32_t op_ret, int32_t op_errno) + +{ + afr_local_t *local = NULL; + + int call_count = -1; + + local = frame->local; + + LOCK (&frame->lock); + { + if (op_ret == 0) + local->op_ret = 0; + + local->op_errno = op_errno; + } + UNLOCK (&frame->lock); + + call_count = afr_frame_return (frame); + + if (call_count == 0) + AFR_STACK_UNWIND (frame, local->op_ret, local->op_errno); + + return 0; +} + + +int32_t +afr_entrylk (call_frame_t *frame, xlator_t *this, loc_t *loc, + const char *basename, entrylk_cmd cmd, entrylk_type type) +{ + afr_private_t *priv = NULL; + afr_local_t *local = NULL; + + int ret = -1; + + int i = 0; + int32_t call_count = 0; + int32_t op_ret = -1; + int32_t op_errno = 0; + + VALIDATE_OR_GOTO (frame, out); + VALIDATE_OR_GOTO (this, out); + VALIDATE_OR_GOTO (this->private, out); + + priv = this->private; + + ALLOC_OR_GOTO (local, afr_local_t, out); + + ret = AFR_LOCAL_INIT (local, priv); + if (ret < 0) { + op_errno = -ret; + goto out; + } + + call_count = local->call_count; + frame->local = local; + + for (i = 0; i < priv->child_count; i++) { + if (local->child_up[i]) { + STACK_WIND (frame, afr_entrylk_cbk, + priv->children[i], + priv->children[i]->fops->entrylk, + loc, basename, cmd, type); + + if (!--call_count) + break; + } + } + + op_ret = 0; +out: + if (op_ret == -1) { + AFR_STACK_UNWIND (frame, op_ret, op_errno); + } + return 0; +} + + + +int32_t +afr_fentrylk_cbk (call_frame_t *frame, void *cookie, + xlator_t *this, int32_t op_ret, int32_t op_errno) + +{ + afr_local_t *local = NULL; + + int call_count = -1; + + local = frame->local; + + LOCK (&frame->lock); + { + if (op_ret == 0) + local->op_ret = 0; + + local->op_errno = op_errno; + } + UNLOCK (&frame->lock); + + call_count = afr_frame_return (frame); + + if (call_count == 0) + AFR_STACK_UNWIND (frame, local->op_ret, 
local->op_errno); + + return 0; +} + + +int32_t +afr_fentrylk (call_frame_t *frame, xlator_t *this, fd_t *fd, + const char *basename, entrylk_cmd cmd, entrylk_type type) +{ + afr_private_t *priv = NULL; + afr_local_t *local = NULL; + + int ret = -1; + + int i = 0; + int32_t call_count = 0; + int32_t op_ret = -1; + int32_t op_errno = 0; + + VALIDATE_OR_GOTO (frame, out); + VALIDATE_OR_GOTO (this, out); + VALIDATE_OR_GOTO (this->private, out); + + priv = this->private; + + ALLOC_OR_GOTO (local, afr_local_t, out); + + ret = AFR_LOCAL_INIT (local, priv); + if (ret < 0) { + op_errno = -ret; + goto out; + } + + call_count = local->call_count; + frame->local = local; + + for (i = 0; i < priv->child_count; i++) { + if (local->child_up[i]) { + STACK_WIND (frame, afr_fentrylk_cbk, + priv->children[i], + priv->children[i]->fops->fentrylk, + fd, basename, cmd, type); + + if (!--call_count) + break; + } + } + + op_ret = 0; +out: + if (op_ret == -1) { + AFR_STACK_UNWIND (frame, op_ret, op_errno); + } + return 0; +} + + +int32_t +afr_checksum_cbk (call_frame_t *frame, void *cookie, + xlator_t *this, int32_t op_ret, int32_t op_errno, + uint8_t *file_checksum, uint8_t *dir_checksum) + +{ + afr_local_t *local = NULL; + + int call_count = -1; + + local = frame->local; + + LOCK (&frame->lock); + { + if (op_ret == 0 && (local->op_ret != 0)) { + local->op_ret = 0; + + local->cont.checksum.file_checksum = MALLOC (ZR_FILENAME_MAX); + memcpy (local->cont.checksum.file_checksum, file_checksum, + ZR_FILENAME_MAX); + + local->cont.checksum.dir_checksum = MALLOC (ZR_FILENAME_MAX); + memcpy (local->cont.checksum.dir_checksum, dir_checksum, + ZR_FILENAME_MAX); + + } + + local->op_errno = op_errno; + } + UNLOCK (&frame->lock); + + call_count = afr_frame_return (frame); + + if (call_count == 0) + AFR_STACK_UNWIND (frame, local->op_ret, local->op_errno, + local->cont.checksum.file_checksum, + local->cont.checksum.dir_checksum); + + return 0; +} + + +int32_t +afr_checksum (call_frame_t *frame, 
xlator_t *this, loc_t *loc, + int32_t flag) +{ + afr_private_t *priv = NULL; + afr_local_t *local = NULL; + + int ret = -1; + + int i = 0; + int32_t call_count = 0; + int32_t op_ret = -1; + int32_t op_errno = 0; + + VALIDATE_OR_GOTO (frame, out); + VALIDATE_OR_GOTO (this, out); + VALIDATE_OR_GOTO (this->private, out); + + priv = this->private; + + ALLOC_OR_GOTO (local, afr_local_t, out); + + ret = AFR_LOCAL_INIT (local, priv); + if (ret < 0) { + op_errno = -ret; + goto out; + } + + call_count = local->call_count; + frame->local = local; + + for (i = 0; i < priv->child_count; i++) { + if (local->child_up[i]) { + STACK_WIND (frame, afr_checksum_cbk, + priv->children[i], + priv->children[i]->fops->checksum, + loc, flag); + + if (!--call_count) + break; + } + } + + op_ret = 0; +out: + if (op_ret == -1) { + AFR_STACK_UNWIND (frame, op_ret, op_errno); + } + return 0; +} + + +int32_t +afr_statfs_cbk (call_frame_t *frame, void *cookie, + xlator_t *this, int32_t op_ret, int32_t op_errno, + struct statvfs *statvfs) +{ + afr_local_t *local = NULL; + + int call_count = 0; + + LOCK (&frame->lock); + { + local = frame->local; + + if (op_ret == 0) { + local->op_ret = op_ret; + + if (local->cont.statfs.buf_set) { + if (statvfs->f_bavail < local->cont.statfs.buf.f_bavail) + local->cont.statfs.buf = *statvfs; + } else { + local->cont.statfs.buf = *statvfs; + local->cont.statfs.buf_set = 1; + } + } + + if (op_ret == -1) + local->op_errno = op_errno; + + } + UNLOCK (&frame->lock); + + call_count = afr_frame_return (frame); + + if (call_count == 0) + AFR_STACK_UNWIND (frame, local->op_ret, local->op_errno, + &local->cont.statfs.buf); + + return 0; +} + + +int32_t +afr_statfs (call_frame_t *frame, xlator_t *this, + loc_t *loc) +{ + afr_private_t * priv = NULL; + int child_count = 0; + afr_local_t * local = NULL; + int i = 0; + + int ret = -1; + int call_count = 0; + int32_t op_ret = -1; + int32_t op_errno = 0; + + VALIDATE_OR_GOTO (this, out); + VALIDATE_OR_GOTO (this->private, out); + 
VALIDATE_OR_GOTO (loc, out); + + priv = this->private; + child_count = priv->child_count; + + ALLOC_OR_GOTO (local, afr_local_t, out); + + ret = AFR_LOCAL_INIT (local, priv); + if (ret < 0) { + op_errno = -ret; + goto out; + } + + frame->local = local; + call_count = local->call_count; + + for (i = 0; i < child_count; i++) { + if (local->child_up[i]) { + STACK_WIND (frame, afr_statfs_cbk, + priv->children[i], + priv->children[i]->fops->statfs, + loc); + if (!--call_count) + break; + } + } + + op_ret = 0; +out: + if (op_ret == -1) { + AFR_STACK_UNWIND (frame, op_ret, op_errno, NULL); + } + return 0; +} + + +int32_t +afr_lk_unlock_cbk (call_frame_t *frame, void *cookie, xlator_t *this, + int32_t op_ret, int32_t op_errno, struct flock *lock) +{ + afr_local_t * local = NULL; + + int call_count = -1; + + local = frame->local; + call_count = afr_frame_return (frame); + + if (call_count == 0) + AFR_STACK_UNWIND (frame, local->op_ret, local->op_errno, + lock); + + return 0; +} + + +int32_t +afr_lk_unlock (call_frame_t *frame, xlator_t *this) +{ + afr_local_t * local = NULL; + afr_private_t * priv = NULL; + + int i; + int call_count = 0; + + local = frame->local; + priv = this->private; + + call_count = afr_locked_nodes_count (local->cont.lk.locked_nodes, + priv->child_count); + + if (call_count == 0) { + AFR_STACK_UNWIND (frame, local->op_ret, local->op_errno, + &local->cont.lk.flock); + return 0; + } + + local->call_count = call_count; + + local->cont.lk.flock.l_type = F_UNLCK; + + for (i = 0; i < priv->child_count; i++) { + if (local->cont.lk.locked_nodes[i]) { + STACK_WIND (frame, afr_lk_unlock_cbk, + priv->children[i], + priv->children[i]->fops->lk, + local->fd, F_SETLK, + &local->cont.lk.flock); + + if (!--call_count) + break; + } + } + + return 0; +} + + +int32_t +afr_lk_cbk (call_frame_t *frame, void *cookie, xlator_t *this, + int32_t op_ret, int32_t op_errno, struct flock *lock) +{ + afr_local_t *local = NULL; + afr_private_t *priv = NULL; + + int call_count = -1; 
+ int child_index = -1; + + local = frame->local; + priv = this->private; + + child_index = (long) cookie; + + call_count = --local->call_count; + + if (!child_went_down (op_ret, op_errno) && (op_ret == -1)) { + local->op_ret = -1; + local->op_errno = op_errno; + + afr_lk_unlock (frame, this); + return 0; + } + + if (op_ret == 0) { + local->op_ret = 0; + local->op_errno = 0; + local->cont.lk.flock = *lock; + local->cont.lk.locked_nodes[child_index] = 1; + } + + child_index++; + + if (child_index < priv->child_count) { + STACK_WIND_COOKIE (frame, afr_lk_cbk, (void *) (long) child_index, + priv->children[child_index], + priv->children[child_index]->fops->lk, + local->fd, local->cont.lk.cmd, + &local->cont.lk.flock); + } else if (local->op_ret == -1) { + /* all nodes have gone down */ + + AFR_STACK_UNWIND (frame, -1, ENOTCONN, &local->cont.lk.flock); + } else { + /* locking has succeeded on all nodes that are up */ + + AFR_STACK_UNWIND (frame, local->op_ret, local->op_errno, + &local->cont.lk.flock); + } + + return 0; +} + + +int +afr_lk (call_frame_t *frame, xlator_t *this, + fd_t *fd, int32_t cmd, + struct flock *flock) +{ + afr_private_t *priv = NULL; + afr_local_t *local = NULL; + + int i = 0; + + int32_t op_ret = -1; + int32_t op_errno = 0; + + VALIDATE_OR_GOTO (frame, out); + VALIDATE_OR_GOTO (this, out); + VALIDATE_OR_GOTO (this->private, out); + + priv = this->private; + + ALLOC_OR_GOTO (local, afr_local_t, out); + AFR_LOCAL_INIT (local, priv); + + frame->local = local; + + local->cont.lk.locked_nodes = CALLOC (priv->child_count, + sizeof (*local->cont.lk.locked_nodes)); + + if (!local->cont.lk.locked_nodes) { + gf_log (this->name, GF_LOG_ERROR, "out of memory :("); + op_errno = ENOMEM; + goto out; + } + + local->fd = fd_ref (fd); + local->cont.lk.cmd = cmd; + local->cont.lk.flock = *flock; + + STACK_WIND_COOKIE (frame, afr_lk_cbk, (void *) (long) 0, + priv->children[i], + priv->children[i]->fops->lk, + fd, cmd, flock); + + op_ret = 0; +out: + if (op_ret == 
-1) { + AFR_STACK_UNWIND (frame, op_ret, op_errno, NULL); + } + return 0; +} + + +/** + * find_child_index - find the child's index in the array of subvolumes + * @this: AFR + * @child: child + */ + +static int +find_child_index (xlator_t *this, xlator_t *child) +{ + afr_private_t *priv = NULL; + + int i = -1; + + priv = this->private; + + for (i = 0; i < priv->child_count; i++) { + if ((xlator_t *) child == priv->children[i]) + break; + } + + return i; +} + + +int32_t +notify (xlator_t *this, int32_t event, + void *data, ...) +{ + afr_private_t * priv = NULL; + unsigned char * child_up = NULL; + + int i = -1; + int up_children = 0; + + priv = this->private; + + if (!priv) + return 0; + + child_up = priv->child_up; + + switch (event) { + case GF_EVENT_CHILD_UP: + i = find_child_index (this, data); + + child_up[i] = 1; + + /* + if all the children were down, and one child came up, + send notify to parent + */ + + for (i = 0; i < priv->child_count; i++) + if (child_up[i]) + up_children++; + + if (up_children == 1) + default_notify (this, event, data); + + break; + + case GF_EVENT_CHILD_DOWN: + i = find_child_index (this, data); + + child_up[i] = 0; + + /* + if all children are down, and this was the last to go down, + send notify to parent + */ + + for (i = 0; i < priv->child_count; i++) + if (child_up[i]) + up_children++; + + if (up_children == 0) + default_notify (this, event, data); + + break; + + default: + default_notify (this, event, data); + } + + return 0; +} + + +static const char *favorite_child_warning_str = "You have specified subvolume '%s' " + "as the 'favorite child'. This means that if a discrepancy in the content " + "or attributes (ownership, permission, etc.) of a file is detected among " + "the subvolumes, the file on '%s' will be considered the definitive " + "version and its contents will OVERWRITE the contents of the file on other " + "subvolumes. 
All versions of the file except that on '%s' " + "WILL BE LOST."; + +static const char *no_lock_servers_warning_str = "You have set lock-server-count = 0. " + "This means correctness is NO LONGER GUARANTEED in all cases. If two or more " + "applications write to the same region of a file, there is a possibility that " + "its copies will be INCONSISTENT. Set it to a value greater than 0 unless you " + "are ABSOLUTELY SURE of what you are doing and WILL NOT HOLD GlusterFS " + "RESPOSIBLE for inconsistent data. If you are in doubt, set it to a value " + "greater than 0."; + +int32_t +init (xlator_t *this) +{ + afr_private_t * priv = NULL; + int child_count = 0; + xlator_list_t * trav = NULL; + int i = 0; + int ret = -1; + int op_errno = 0; + + char * read_subvol = NULL; + char * fav_child = NULL; + char * self_heal = NULL; + char * change_log = NULL; + + int32_t lock_server_count = 1; + + int fav_ret = -1; + int read_ret = -1; + int dict_ret = -1; + + if (!this->children) { + gf_log (this->name, GF_LOG_ERROR, + "AFR needs more than one child defined"); + return -1; + } + + if (!this->parents) { + gf_log (this->name, GF_LOG_WARNING, + "dangling volume. 
check volfile "); + } + + ALLOC_OR_GOTO (this->private, afr_private_t, out); + + priv = this->private; + + read_ret = dict_get_str (this->options, "read-subvolume", &read_subvol); + priv->read_child = -1; + + fav_ret = dict_get_str (this->options, "favorite-child", &fav_child); + priv->favorite_child = -1; + + /* Default values */ + + priv->data_self_heal = 1; + priv->metadata_self_heal = 1; + priv->entry_self_heal = 1; + + dict_ret = dict_get_str (this->options, "data-self-heal", &self_heal); + if (dict_ret == 0) { + ret = gf_string2boolean (self_heal, &priv->data_self_heal); + if (ret < 0) { + gf_log (this->name, GF_LOG_WARNING, + "invalid 'option data-self-heal %s' " + "defaulting to data-self-heal as 'on'", + self_heal); + priv->data_self_heal = 1; + } + } + + dict_ret = dict_get_str (this->options, "metadata-self-heal", + &self_heal); + if (dict_ret == 0) { + ret = gf_string2boolean (self_heal, &priv->metadata_self_heal); + if (ret < 0) { + gf_log (this->name, GF_LOG_WARNING, + "invalid 'option metadata-self-heal %s' " + "defaulting to metadata-self-heal as 'on'", + self_heal); + priv->metadata_self_heal = 1; + } + } + + dict_ret = dict_get_str (this->options, "entry-self-heal", &self_heal); + if (dict_ret == 0) { + ret = gf_string2boolean (self_heal, &priv->entry_self_heal); + if (ret < 0) { + gf_log (this->name, GF_LOG_WARNING, + "invalid 'option entry-self-heal %s' " + "defaulting to entry-self-heal as 'on'", + self_heal); + priv->entry_self_heal = 1; + } + } + + /* Change log options */ + + priv->data_change_log = 1; + priv->metadata_change_log = 0; + priv->entry_change_log = 1; + + dict_ret = dict_get_str (this->options, "data-change-log", + &change_log); + if (dict_ret == 0) { + ret = gf_string2boolean (change_log, &priv->data_change_log); + if (ret < 0) { + gf_log (this->name, GF_LOG_WARNING, + "invalid 'option data-change-log %s'. 
" + "defaulting to data-change-log as 'on'", + change_log); + priv->data_change_log = 1; + } + } + + dict_ret = dict_get_str (this->options, "metadata-change-log", + &change_log); + if (dict_ret == 0) { + ret = gf_string2boolean (change_log, + &priv->metadata_change_log); + if (ret < 0) { + gf_log (this->name, GF_LOG_WARNING, + "invalid 'option metadata-change-log %s'. " + "defaulting to metadata-change-log as 'off'", + change_log); + priv->metadata_change_log = 0; + } + } + + dict_ret = dict_get_str (this->options, "entry-change-log", + &change_log); + if (dict_ret == 0) { + ret = gf_string2boolean (change_log, &priv->entry_change_log); + if (ret < 0) { + gf_log (this->name, GF_LOG_WARNING, + "invalid 'option entry-change-log %s'. " + "defaulting to entry-change-log as 'on'", + change_log); + priv->entry_change_log = 1; + } + } + + /* Locking options */ + + priv->data_lock_server_count = 1; + priv->metadata_lock_server_count = 0; + priv->entry_lock_server_count = 1; + + dict_ret = dict_get_int32 (this->options, "data-lock-server-count", + &lock_server_count); + if (dict_ret == 0) { + gf_log (this->name, GF_LOG_DEBUG, + "setting data lock server count to %d", + lock_server_count); + + if (lock_server_count == 0) + gf_log (this->name, GF_LOG_WARNING, + no_lock_servers_warning_str); + + priv->data_lock_server_count = lock_server_count; + } + + + dict_ret = dict_get_int32 (this->options, + "metadata-lock-server-count", + &lock_server_count); + if (dict_ret == 0) { + gf_log (this->name, GF_LOG_DEBUG, + "setting metadata lock server count to %d", + lock_server_count); + priv->metadata_lock_server_count = lock_server_count; + } + + + dict_ret = dict_get_int32 (this->options, "entry-lock-server-count", + &lock_server_count); + if (dict_ret == 0) { + gf_log (this->name, GF_LOG_DEBUG, + "setting entry lock server count to %d", + lock_server_count); + + priv->entry_lock_server_count = lock_server_count; + } + + + trav = this->children; + while (trav) { + if (!read_ret && 
!strcmp (read_subvol, trav->xlator->name)) { + gf_log (this->name, GF_LOG_DEBUG, + "subvolume '%s' specified as read child", + trav->xlator->name); + + priv->read_child = child_count; + } + + if (fav_ret == 0 && !strcmp (fav_child, trav->xlator->name)) { + gf_log (this->name, GF_LOG_WARNING, + favorite_child_warning_str, trav->xlator->name, + trav->xlator->name, trav->xlator->name); + priv->favorite_child = child_count; + } + + child_count++; + trav = trav->next; + } + + /* XXX: return inode numbers from 1st subvolume till + afr supports read-subvolume based on inode's ctx + (and not itransform) for this reason afr_deitransform() + returns 0 always + */ + priv->read_child = 0; + + priv->wait_count = 1; + + priv->child_count = child_count; + LOCK_INIT (&priv->lock); + + priv->child_up = CALLOC (sizeof (unsigned char), child_count); + if (!priv->child_up) { + gf_log (this->name, GF_LOG_ERROR, + "out of memory :("); + op_errno = ENOMEM; + goto out; + } + + priv->children = CALLOC (sizeof (xlator_t *), child_count); + if (!priv->children) { + gf_log (this->name, GF_LOG_ERROR, + "out of memory :("); + op_errno = ENOMEM; + goto out; + } + + trav = this->children; + i = 0; + while (i < child_count) { + priv->children[i] = trav->xlator; + + trav = trav->next; + i++; + } + + ret = 0; +out: + return ret; +} + + +int +fini (xlator_t *this) +{ + return 0; +} + + +struct xlator_fops fops = { + .lookup = afr_lookup, + .open = afr_open, + .lk = afr_lk, + .flush = afr_flush, + .statfs = afr_statfs, + .fsync = afr_fsync, + .fsyncdir = afr_fsyncdir, + .xattrop = afr_xattrop, + .fxattrop = afr_fxattrop, + .inodelk = afr_inodelk, + .finodelk = afr_finodelk, + .entrylk = afr_entrylk, + .fentrylk = afr_fentrylk, + .checksum = afr_checksum, + + /* inode read */ + .access = afr_access, + .stat = afr_stat, + .fstat = afr_fstat, + .readlink = afr_readlink, + .getxattr = afr_getxattr, + .readv = afr_readv, + + /* inode write */ + .chmod = afr_chmod, + .chown = afr_chown, + .fchmod = 
afr_fchmod, + .fchown = afr_fchown, + .writev = afr_writev, + .truncate = afr_truncate, + .ftruncate = afr_ftruncate, + .utimens = afr_utimens, + .setxattr = afr_setxattr, + .removexattr = afr_removexattr, + + /* dir read */ + .opendir = afr_opendir, + .readdir = afr_readdir, + .getdents = afr_getdents, + + /* dir write */ + .create = afr_create, + .mknod = afr_mknod, + .mkdir = afr_mkdir, + .unlink = afr_unlink, + .rmdir = afr_rmdir, + .link = afr_link, + .symlink = afr_symlink, + .rename = afr_rename, + .setdents = afr_setdents, +}; + + +struct xlator_mops mops = { +}; + + +struct xlator_cbks cbks = { +}; + +struct volume_options options[] = { + { .key = {"read-subvolume" }, + .type = GF_OPTION_TYPE_XLATOR + }, + { .key = {"favorite-child"}, + .type = GF_OPTION_TYPE_XLATOR + }, + { .key = {"data-self-heal"}, + .type = GF_OPTION_TYPE_BOOL + }, + { .key = {"metadata-self-heal"}, + .type = GF_OPTION_TYPE_BOOL + }, + { .key = {"entry-self-heal"}, + .type = GF_OPTION_TYPE_BOOL + }, + { .key = {"data-change-log"}, + .type = GF_OPTION_TYPE_BOOL + }, + { .key = {"metadata-change-log"}, + .type = GF_OPTION_TYPE_BOOL + }, + { .key = {"entry-change-log"}, + .type = GF_OPTION_TYPE_BOOL + }, + { .key = {"data-lock-server-count"}, + .type = GF_OPTION_TYPE_INT, + .min = 0 + }, + { .key = {"metadata-lock-server-count"}, + .type = GF_OPTION_TYPE_INT, + .min = 0 + }, + { .key = {"entry-lock-server-count"}, + .type = GF_OPTION_TYPE_INT, + .min = 0 + }, + { .key = {NULL} }, +}; diff --git a/xlators/cluster/afr/src/afr.h b/xlators/cluster/afr/src/afr.h new file mode 100644 index 000000000..4cf6cdf9d --- /dev/null +++ b/xlators/cluster/afr/src/afr.h @@ -0,0 +1,523 @@ +/* + Copyright (c) 2008 Z RESEARCH, Inc. <http://www.zresearch.com> + This file is part of GlusterFS. 
+ + GlusterFS is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published + by the Free Software Foundation; either version 3 of the License, + or (at your option) any later version. + + GlusterFS is distributed in the hope that it will be useful, but + WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program. If not, see + <http://www.gnu.org/licenses/>. +*/ + + +#ifndef __AFR_H__ +#define __AFR_H__ + +#ifndef _CONFIG_H +#define _CONFIG_H +#include "config.h" +#endif + +#include "scheduler.h" +#include "call-stub.h" +#include "compat-errno.h" + + +typedef struct _afr_private { + gf_lock_t lock; /* to guard access to child_count, etc */ + unsigned int child_count; /* total number of children */ + + xlator_t **children; + + unsigned char *child_up; + + gf_boolean_t data_self_heal; /* on/off */ + gf_boolean_t metadata_self_heal; /* on/off */ + gf_boolean_t entry_self_heal; /* on/off */ + + + gf_boolean_t data_change_log; /* on/off */ + gf_boolean_t metadata_change_log; /* on/off */ + gf_boolean_t entry_change_log; /* on/off */ + + unsigned int read_child; /* read-subvolume */ + unsigned int favorite_child; /* subvolume to be preferred in resolving + split-brain cases */ + + unsigned int data_lock_server_count; + unsigned int metadata_lock_server_count; + unsigned int entry_lock_server_count; + + unsigned int wait_count; /* # of servers to wait for success */ +} afr_private_t; + +typedef struct { + /* array of stat's, one for each child */ + struct stat *buf; + + /* array of xattr's, one for each child */ + dict_t **xattr; + + /* array of errno's, one for each child */ + int *child_errno; + + int32_t **pending_matrix; + int32_t **delta_matrix; + + int *sources; + int source; + int active_source; + int 
active_sinks; + int *success; + + fd_t *healing_fd; + int op_failed; + + int file_has_holes; + blksize_t block_size; + off_t file_size; + off_t offset; + + loc_t parent_loc; + int (*completion_cbk) (call_frame_t *frame, xlator_t *this); + call_frame_t *sh_frame; +} afr_self_heal_t; + + +typedef enum { + AFR_DATA_TRANSACTION, /* truncate, write, ... */ + AFR_METADATA_TRANSACTION, /* chmod, chown, ... */ + AFR_ENTRY_TRANSACTION, /* create, rmdir, ... */ + AFR_ENTRY_RENAME_TRANSACTION, /* rename */ + AFR_FLUSH_TRANSACTION, /* flush */ +} afr_transaction_type; + +typedef struct _afr_local { + unsigned int call_count; + unsigned int success_count; + unsigned int enoent_count; + + unsigned int need_metadata_self_heal; + unsigned int need_entry_self_heal; + unsigned int need_data_self_heal; + unsigned int govinda_gOvinda; + + unsigned int reval_child_index; + int32_t op_ret; + int32_t op_errno; + + int32_t *pending_array; + + loc_t loc; + loc_t newloc; + + fd_t *fd; + + glusterfs_fop_t fop; + + unsigned char *child_up; + int child_count; + + int32_t *child_errno; + + dict_t *xattr_req; + int open_fd_count; + /* + This struct contains the arguments for the "continuation" + (scheme-like) of fops + */ + + int op; + struct { + struct { + unsigned char buf_set; + struct statvfs buf; + } statfs; + + struct { + inode_t *inode; + struct stat buf; + dict_t *xattr; + } lookup; + + struct { + int32_t flags; + } open; + + struct { + int32_t cmd; + struct flock flock; + unsigned char *locked_nodes; + } lk; + + struct { + uint8_t *file_checksum; + uint8_t *dir_checksum; + } checksum; + + /* inode read */ + + struct { + int32_t mask; + int last_tried; /* index of the child we tried previously */ + } access; + + struct { + int last_tried; + ino_t ino; + } stat; + + struct { + int last_tried; + ino_t ino; + } fstat; + + struct { + size_t size; + int last_tried; + } readlink; + + struct { + const char *name; + int last_tried; + } getxattr; + + struct { + size_t size; + off_t offset; + int 
last_tried; + } readv; + + /* dir read */ + + struct { + int success_count; + int32_t op_ret; + int32_t op_errno; + } opendir; + + struct { + int32_t op_ret; + int32_t op_errno; + size_t size; + off_t offset; + + int last_tried; + } readdir; + + struct { + int32_t op_ret; + int32_t op_errno; + + size_t size; + off_t offset; + int32_t flag; + + int last_tried; + } getdents; + + /* inode write */ + + struct { + ino_t ino; + mode_t mode; + struct stat buf; + } chmod; + + struct { + ino_t ino; + mode_t mode; + struct stat buf; + } fchmod; + + struct { + ino_t ino; + uid_t uid; + gid_t gid; + struct stat buf; + } chown; + + struct { + ino_t ino; + uid_t uid; + gid_t gid; + struct stat buf; + } fchown; + + struct { + ino_t ino; + struct stat buf; + + int32_t op_ret; + + struct iovec *vector; + dict_t *refs; + int32_t count; + off_t offset; + } writev; + + struct { + ino_t ino; + off_t offset; + struct stat buf; + } truncate; + + struct { + ino_t ino; + off_t offset; + struct stat buf; + } ftruncate; + + struct { + ino_t ino; + struct timespec tv[2]; + struct stat buf; + } utimens; + + struct { + dict_t *dict; + int32_t flags; + } setxattr; + + struct { + const char *name; + } removexattr; + + /* dir write */ + + struct { + ino_t ino; + fd_t *fd; + int32_t flags; + mode_t mode; + inode_t *inode; + struct stat buf; + } create; + + struct { + ino_t ino; + dev_t dev; + mode_t mode; + inode_t *inode; + struct stat buf; + } mknod; + + struct { + ino_t ino; + int32_t mode; + inode_t *inode; + struct stat buf; + } mkdir; + + struct { + int32_t op_ret; + int32_t op_errno; + } unlink; + + struct { + int32_t op_ret; + int32_t op_errno; + } rmdir; + + struct { + ino_t ino; + struct stat buf; + } rename; + + struct { + ino_t ino; + inode_t *inode; + struct stat buf; + } link; + + struct { + ino_t ino; + inode_t *inode; + struct stat buf; + char *linkpath; + } symlink; + + struct { + int32_t flags; + dir_entry_t *entries; + int32_t count; + } setdents; + } cont; + + struct { + off_t 
start, len; + + unsigned char *locked_nodes; + int lock_count; + + const char *basename; + const char *new_basename; + + char *pending; + + loc_t parent_loc; + loc_t new_parent_loc; + + afr_transaction_type type; + + int success_count; + int erase_pending; + int failure_count; + + int last_tried; + int32_t *child_errno; + + call_frame_t *main_frame; + + int (*fop) (call_frame_t *frame, xlator_t *this); + + int (*done) (call_frame_t *frame, xlator_t *this); + + int (*resume) (call_frame_t *frame, xlator_t *this); + + int (*unwind) (call_frame_t *frame, xlator_t *this); + } transaction; + + afr_self_heal_t self_heal; +} afr_local_t; + +/* try alloc and if it fails, goto label */ +#define ALLOC_OR_GOTO(var, type, label) do { \ + var = CALLOC (sizeof (type), 1); \ + if (!var) { \ + gf_log (this->name, GF_LOG_ERROR, \ + "out of memory :("); \ + op_errno = ENOMEM; \ + goto label; \ + } \ + } while (0); + + +/* did a call fail due to a child failing? */ +#define child_went_down(op_ret, op_errno) (((op_ret) < 0) && \ + ((op_errno == ENOTCONN) || \ + (op_errno == EBADFD))) + +/* have we tried all children? */ +#define all_tried(i, count) ((i) == (count) - 1) + +void +afr_build_parent_loc (loc_t *parent, loc_t *child); + +int +afr_up_children_count (int child_count, unsigned char *child_up); + +int +afr_locked_nodes_count (unsigned char *locked_nodes, int child_count); + +int +afr_first_up_child (afr_private_t *priv); + +ino64_t +afr_itransform (ino64_t ino, int child_count, int child_index); + +int +afr_deitransform (ino64_t ino, int child_count); + +void +afr_local_cleanup (afr_local_t *local, xlator_t *this); + +int +afr_frame_return (call_frame_t *frame); + +#define AFR_STACK_UNWIND(frame, params ...) 
\ + do { \ + afr_local_t *__local = NULL; \ + xlator_t *__this = NULL; \ + __local = frame->local; \ + __this = frame->this; \ + frame->local = NULL; \ + STACK_UNWIND (frame, params); \ + afr_local_cleanup (__local, __this); \ + free (__local); \ +} while (0); + +#define AFR_STACK_DESTROY(frame) \ + do { \ + afr_local_t *__local = NULL; \ + xlator_t *__this = NULL; \ + __local = frame->local; \ + __this = frame->this; \ + frame->local = NULL; \ + STACK_DESTROY (frame->root); \ + afr_local_cleanup (__local, __this); \ + free (__local); \ +} while (0); + +/* allocate and return a string that is the basename of argument */ +static inline char * +AFR_BASENAME (const char *str) +{ + char *__tmp_str = NULL; + char *__basename_str = NULL; + __tmp_str = strdup (str); + __basename_str = strdup (basename (__tmp_str)); + FREE (__tmp_str); + return __basename_str; +} + +/* initialize local_t */ +static inline int +AFR_LOCAL_INIT (afr_local_t *local, afr_private_t *priv) +{ + local->child_up = CALLOC (sizeof (*local->child_up), + priv->child_count); + if (!local->child_up) { + return -ENOMEM; + } + + memcpy (local->child_up, priv->child_up, + sizeof (*local->child_up) * priv->child_count); + + + local->call_count = afr_up_children_count (priv->child_count, local->child_up); + if (local->call_count == 0) + return -ENOTCONN; + + local->transaction.erase_pending = 1; + + local->op_ret = -1; + local->op_errno = EUCLEAN; + + return 0; +} + + +static inline int +afr_transaction_local_init (afr_local_t *local, afr_private_t *priv) +{ + local->child_errno = CALLOC (sizeof (*local->child_errno), + priv->child_count); + if (!local->child_errno) { + return -ENOMEM; + } + + local->pending_array = CALLOC (sizeof (*local->pending_array), + priv->child_count); + if (!local->pending_array) { + return -ENOMEM; + } + + local->transaction.locked_nodes = CALLOC (sizeof (*local->transaction.locked_nodes), + priv->child_count); + + local->transaction.child_errno = CALLOC (sizeof 
(*local->transaction.child_errno), + priv->child_count); + + return 0; +} + +#endif /* __AFR_H__ */ diff --git a/xlators/cluster/dht/Makefile.am b/xlators/cluster/dht/Makefile.am new file mode 100644 index 000000000..f963effea --- /dev/null +++ b/xlators/cluster/dht/Makefile.am @@ -0,0 +1 @@ +SUBDIRS = src \ No newline at end of file diff --git a/xlators/cluster/dht/src/Makefile.am b/xlators/cluster/dht/src/Makefile.am new file mode 100644 index 000000000..b7d07d137 --- /dev/null +++ b/xlators/cluster/dht/src/Makefile.am @@ -0,0 +1,30 @@ + +xlator_LTLIBRARIES = dht.la nufa.la +xlatordir = $(libdir)/glusterfs/$(PACKAGE_VERSION)/xlator/cluster + + +dht_common_source = dht-layout.c dht-helper.c dht-linkfile.c \ + dht-selfheal.c dht-rename.c dht-hashfn.c dht-hashfn-tea.c + +dht_la_SOURCES = $(dht_common_source) dht.c + +nufa_la_SOURCES = $(dht_common_source) nufa.c + +dht_la_LDFLAGS = -module -avoidversion +dht_la_LIBADD = $(top_builddir)/libglusterfs/src/libglusterfs.la + +nufa_la_LDFLAGS = -module -avoidversion +nufa_la_LIBADD = $(top_builddir)/libglusterfs/src/libglusterfs.la + +noinst_HEADERS = dht-common.h dht-common.c + +AM_CFLAGS = -fPIC -D_FILE_OFFSET_BITS=64 -D_GNU_SOURCE -Wall -D$(GF_HOST_OS) \ + -I$(top_srcdir)/libglusterfs/src -shared -nostartfiles $(GF_CFLAGS) + +CLEANFILES = + +uninstall-local: + rm -f $(DESTDIR)$(xlatordir)/distribute.so + +install-data-hook: + ln -sf dht.so $(DESTDIR)$(xlatordir)/distribute.so \ No newline at end of file diff --git a/xlators/cluster/dht/src/dht-common.c b/xlators/cluster/dht/src/dht-common.c new file mode 100644 index 000000000..5e4979e31 --- /dev/null +++ b/xlators/cluster/dht/src/dht-common.c @@ -0,0 +1,3470 @@ +/* + Copyright (c) 2009 Z RESEARCH, Inc. <http://www.zresearch.com> + This file is part of GlusterFS. 
+ + GlusterFS is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published + by the Free Software Foundation; either version 3 of the License, + or (at your option) any later version. + + GlusterFS is distributed in the hope that it will be useful, but + WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program. If not, see + <http://www.gnu.org/licenses/>. +*/ + + +#ifndef _CONFIG_H +#define _CONFIG_H +#include "config.h" +#endif + +/* TODO: add NS locking */ + +#include "glusterfs.h" +#include "xlator.h" +#include "dht-common.h" +#include "defaults.h" + + +/* TODO: + - use volumename in xattr instead of "dht" + - use NS locks + - handle all cases in self heal layout reconstruction + - complete linkfile selfheal +*/ + +int +dht_lookup_selfheal_cbk (call_frame_t *frame, void *cookie, + xlator_t *this, + int op_ret, int op_errno) +{ + dht_local_t *local = NULL; + dht_layout_t *layout = NULL; + int ret = 0; + + local = frame->local; + ret = op_ret; + + if (ret == 0) { + layout = local->selfheal.layout; + ret = inode_ctx_put (local->inode, this, (uint64_t)(long)layout); + + if (ret == 0) + local->selfheal.layout = NULL; + + if (local->st_ino) { + local->stbuf.st_ino = local->st_ino; + } else { + gf_log (this->name, GF_LOG_WARNING, + "could not find hashed subvolume for %s", + local->loc.path); + } + } + + DHT_STACK_UNWIND (frame, ret, local->op_errno, local->inode, + &local->stbuf, local->xattr); + + return 0; +} + + +int +dht_lookup_dir_cbk (call_frame_t *frame, void *cookie, xlator_t *this, + int op_ret, int op_errno, + inode_t *inode, struct stat *stbuf, dict_t *xattr) +{ + dht_conf_t *conf = NULL; + dht_local_t *local = NULL; + int this_call_cnt = 0; + call_frame_t *prev = NULL; + dht_layout_t 
*layout = NULL; + int ret = 0; + int is_dir = 0; + + conf = this->private; + local = frame->local; + prev = cookie; + + layout = local->layout; + + LOCK (&frame->lock); + { + /* TODO: assert equal mode on stbuf->st_mode and + local->stbuf->st_mode + + else mkdir/chmod/chown and fix + */ + /* TODO: assert equal hash type in xattr, local->xattr */ + + /* TODO: always ensure same subvolume is in layout->list[0] */ + + ret = dht_layout_merge (this, layout, prev->this, + op_ret, op_errno, xattr); + + if (op_ret == -1) { + local->op_errno = ENOENT; + gf_log (this->name, GF_LOG_WARNING, + "lookup of %s on %s returned error (%s)", + local->loc.path, prev->this->name, + strerror (op_errno)); + + goto unlock; + } + + is_dir = check_is_dir (inode, stbuf, xattr); + if (!is_dir) + goto unlock; + + local->op_ret = 0; + if (local->xattr == NULL) + local->xattr = dict_ref (xattr); + if (local->inode == NULL) + local->inode = inode_ref (inode); + + dht_stat_merge (this, &local->stbuf, stbuf, prev->this); + + if (prev->this == local->hashed_subvol) + local->st_ino = local->stbuf.st_ino; + + } +unlock: + UNLOCK (&frame->lock); + + + this_call_cnt = dht_frame_return (frame); + + if (is_last_call (this_call_cnt)) { + if (local->op_ret == 0) { + ret = dht_layout_normalize (this, &local->loc, layout); + + local->layout = NULL; + + if (ret != 0) { + layout->gen = conf->gen; + + gf_log (this->name, GF_LOG_WARNING, + "fixing assignment on %s", + local->loc.path); + goto selfheal; + } + + inode_ctx_put (local->inode, this, (uint64_t)(long)layout); + + if (local->st_ino) { + local->stbuf.st_ino = local->st_ino; + } else { + gf_log (this->name, GF_LOG_WARNING, + "could not find hashed subvolume for %s", + local->loc.path); + } + } + + DHT_STACK_UNWIND (frame, local->op_ret, local->op_errno, + local->inode, &local->stbuf, local->xattr); + } + + return 0; + +selfheal: + ret = dht_selfheal_directory (frame, dht_lookup_selfheal_cbk, + &local->loc, layout); + + return 0; +} + +int 
+dht_revalidate_cbk (call_frame_t *frame, void *cookie, xlator_t *this, + int op_ret, int op_errno, + inode_t *inode, struct stat *stbuf, dict_t *xattr) +{ + dht_local_t *local = NULL; + int this_call_cnt = 0; + call_frame_t *prev = NULL; + dht_layout_t *layout = NULL; + int ret = -1; + int is_dir = 0; + int is_linkfile = 0; + + local = frame->local; + prev = cookie; + + LOCK (&frame->lock); + { + if (op_ret == -1) { + local->op_errno = op_errno; + + if (op_errno != ENOTCONN && op_errno != ENOENT) { + gf_log (this->name, GF_LOG_WARNING, + "subvolume %s returned -1 (%s)", + prev->this->name, strerror (op_errno)); + } + + goto unlock; + } + + if (S_IFMT & (stbuf->st_mode ^ local->inode->st_mode)) { + gf_log (this->name, GF_LOG_WARNING, + "mismatching filetypes 0%o v/s 0%o for %s", + (stbuf->st_mode & S_IFMT), + (local->inode->st_mode & S_IFMT), + local->loc.path); + + local->op_ret = -1; + local->op_errno = EINVAL; + + goto unlock; + } + + layout = dht_layout_get (this, inode); + + is_dir = check_is_dir (inode, stbuf, xattr); + is_linkfile = check_is_linkfile (inode, stbuf, xattr); + + if (is_linkfile) { + gf_log (this->name, GF_LOG_WARNING, + "linkfile found in revalidate for %s", + local->loc.path); + local->layout_mismatch = 1; + + goto unlock; + } + + if (is_dir) { + ret = dht_layout_dir_mismatch (this, layout, + prev->this, &local->loc, + xattr); + if (ret != 0) { + gf_log (this->name, GF_LOG_WARNING, + "mismatching layouts for %s", + local->loc.path); + + local->layout_mismatch = 1; + + goto unlock; + } + } + + dht_stat_merge (this, &local->stbuf, stbuf, prev->this); + + local->op_ret = 0; + local->stbuf.st_ino = local->st_ino; + + if (!local->xattr) + local->xattr = dict_ref (xattr); + } +unlock: + UNLOCK (&frame->lock); + + this_call_cnt = dht_frame_return (frame); + + if (is_last_call (this_call_cnt)) { + if (!S_ISDIR (local->stbuf.st_mode) + && (local->hashed_subvol != local->cached_subvol) + && (local->stbuf.st_nlink == 1)) + local->stbuf.st_mode |= 
S_ISVTX; + + if (local->layout_mismatch) { + local->op_ret = -1; + local->op_errno = ESTALE; + } + + DHT_STACK_UNWIND (frame, local->op_ret, local->op_errno, + local->inode, &local->stbuf, local->xattr); + } + + return 0; +} + + +int +dht_lookup_linkfile_create_cbk (call_frame_t *frame, void *cookie, + xlator_t *this, + int32_t op_ret, int32_t op_errno, + inode_t *inode, struct stat *stbuf) +{ + dht_local_t *local = NULL; + dht_layout_t *layout = NULL; + xlator_t *cached_subvol = NULL; + + local = frame->local; + cached_subvol = local->cached_subvol; + + layout = dht_layout_for_subvol (this, local->cached_subvol); + if (!layout) { + gf_log (this->name, GF_LOG_ERROR, + "no pre-set layout for subvolume %s", + cached_subvol ? cached_subvol->name : "<nil>"); + local->op_ret = -1; + local->op_errno = EINVAL; + goto unwind; + } + + inode_ctx_put (local->inode, this, (uint64_t)(long)layout); + local->op_ret = 0; + if (local->stbuf.st_nlink == 1) + local->stbuf.st_mode |= S_ISVTX; + +unwind: + DHT_STACK_UNWIND (frame, local->op_ret, local->op_errno, + local->inode, &local->stbuf, local->xattr); + return 0; +} + + +int +dht_lookup_everywhere_cbk (call_frame_t *frame, void *cookie, xlator_t *this, + int32_t op_ret, int32_t op_errno, + inode_t *inode, struct stat *buf, dict_t *xattr) +{ + dht_conf_t *conf = NULL; + dht_local_t *local = NULL; + int this_call_cnt = 0; + call_frame_t *prev = NULL; + int is_linkfile = 0; + int is_dir = 0; + xlator_t *subvol = NULL; + loc_t *loc = NULL; + xlator_t *link_subvol = NULL; + xlator_t *hashed_subvol = NULL; + xlator_t *cached_subvol = NULL; + + conf = this->private; + + local = frame->local; + loc = &local->loc; + + prev = cookie; + subvol = prev->this; + + LOCK (&frame->lock); + { + if (op_ret == -1) { + if (op_errno != ENOENT) + local->op_errno = op_errno; + goto unlock; + } + + is_linkfile = check_is_linkfile (inode, buf, xattr); + is_dir = check_is_dir (inode, buf, xattr); + + if (is_linkfile) { + link_subvol = dht_linkfile_subvol 
(this, inode, buf, + xattr); + gf_log (this->name, GF_LOG_DEBUG, + "found on %s linkfile %s (-> %s)", + subvol->name, loc->path, + link_subvol ? link_subvol->name : "''"); + goto unlock; + } else { + gf_log (this->name, GF_LOG_DEBUG, + "found on %s file %s", + subvol->name, loc->path); + } + + if (!local->cached_subvol) { + /* found one file */ + dht_stat_merge (this, &local->stbuf, buf, subvol); + local->xattr = dict_ref (xattr); + local->cached_subvol = subvol; + } else { + gf_log (this->name, GF_LOG_WARNING, + "multiple subvolumes (%s and %s atleast) have " + "file %s", local->cached_subvol->name, + subvol->name, local->loc.path); + } + } +unlock: + UNLOCK (&frame->lock); + + if (is_linkfile) { + gf_log (this->name, GF_LOG_WARNING, + "deleting stale linkfile %s on %s", + loc->path, subvol->name); + dht_linkfile_unlink (frame, this, subvol, loc); + } + + this_call_cnt = dht_frame_return (frame); + if (is_last_call (this_call_cnt)) { + hashed_subvol = local->hashed_subvol; + cached_subvol = local->cached_subvol; + + if (!cached_subvol) { + DHT_STACK_UNWIND (frame, -1, ENOENT, NULL, NULL, NULL); + return 0; + } + + gf_log (this->name, GF_LOG_WARNING, + "linking file %s existing on %s to %s (hash)", + loc->path, cached_subvol->name, hashed_subvol->name); + + dht_linkfile_create (frame, dht_lookup_linkfile_create_cbk, + cached_subvol, hashed_subvol, loc); + } + + return 0; +} + + +int +dht_lookup_everywhere (call_frame_t *frame, xlator_t *this, loc_t *loc) +{ + dht_conf_t *conf = NULL; + dht_local_t *local = NULL; + int i = 0; + int call_cnt = 0; + + conf = this->private; + local = frame->local; + + call_cnt = conf->subvolume_cnt; + local->call_cnt = call_cnt; + + if (!local->inode) + local->inode = inode_ref (loc->inode); + + for (i = 0; i < call_cnt; i++) { + STACK_WIND (frame, dht_lookup_everywhere_cbk, + conf->subvolumes[i], + conf->subvolumes[i]->fops->lookup, + loc, local->xattr_req); + } + + return 0; +} + + +int +dht_lookup_linkfile_cbk (call_frame_t *frame, 
void *cookie, + xlator_t *this, int op_ret, int op_errno, + inode_t *inode, struct stat *stbuf, dict_t *xattr) +{ + call_frame_t *prev = NULL; + dht_local_t *local = NULL; + dht_layout_t *layout = NULL; + xlator_t *subvol = NULL; + loc_t *loc = NULL; + + prev = cookie; + subvol = prev->this; + + local = frame->local; + loc = &local->loc; + + if (op_ret == -1) { + gf_log (this->name, GF_LOG_WARNING, + "lookup of %s on %s (following linkfile) failed (%s)", + local->loc.path, subvol->name, strerror (op_errno)); + + dht_lookup_everywhere (frame, this, loc); + return 0; + } + + /* TODO: assert type is non-dir and non-linkfile */ + + if (stbuf->st_nlink == 1) + stbuf->st_mode |= S_ISVTX; + dht_itransform (this, prev->this, stbuf->st_ino, &stbuf->st_ino); + + layout = dht_layout_for_subvol (this, prev->this); + if (!layout) { + gf_log (this->name, GF_LOG_ERROR, + "no pre-set layout for subvolume %s", + prev->this->name); + op_ret = -1; + op_errno = EINVAL; + goto out; + } + + inode_ctx_put (inode, this, (uint64_t)(long)layout); + +out: + DHT_STACK_UNWIND (frame, op_ret, op_errno, inode, stbuf, xattr); + + return 0; +} + + +int +dht_lookup_cbk (call_frame_t *frame, void *cookie, xlator_t *this, + int op_ret, int op_errno, + inode_t *inode, struct stat *stbuf, dict_t *xattr) +{ + dht_layout_t *layout = NULL; + char is_linkfile = 0; + char is_dir = 0; + xlator_t *subvol = NULL; + dht_conf_t *conf = NULL; + dht_local_t *local = NULL; + loc_t *loc = NULL; + int i = 0; + call_frame_t *prev = NULL; + int call_cnt = 0; + + + conf = this->private; + + prev = cookie; + local = frame->local; + loc = &local->loc; + + if (ENTRY_MISSING (op_ret, op_errno)) { + if (conf->search_unhashed) { + local->op_errno = ENOENT; + dht_lookup_everywhere (frame, this, loc); + return 0; + } + } + + if (op_ret == 0) { + is_dir = check_is_dir (inode, stbuf, xattr); + if (is_dir) { + local->inode = inode_ref (inode); + local->xattr = dict_ref (xattr); + } + } + + if (is_dir || (op_ret == -1 && op_errno 
== ENOTCONN)) { + call_cnt = conf->subvolume_cnt; + local->call_cnt = call_cnt; + + local->layout = dht_layout_new (this, conf->subvolume_cnt); + if (!local->layout) { + op_ret = -1; + op_errno = ENOMEM; + gf_log (this->name, GF_LOG_ERROR, + "memory allocation failed :("); + goto out; + } + + for (i = 0; i < call_cnt; i++) { + STACK_WIND (frame, dht_lookup_dir_cbk, + conf->subvolumes[i], + conf->subvolumes[i]->fops->lookup, + &local->loc, local->xattr_req); + } + return 0; + } + + if (op_ret == -1) + goto out; + + is_linkfile = check_is_linkfile (inode, stbuf, xattr); + is_dir = check_is_dir (inode, stbuf, xattr); + + if (!is_dir && !is_linkfile) { + /* non-directory and not a linkfile */ + + dht_itransform (this, prev->this, stbuf->st_ino, + &stbuf->st_ino); + + layout = dht_layout_for_subvol (this, prev->this); + if (!layout) { + gf_log (this->name, GF_LOG_ERROR, + "no pre-set layout for subvolume %s", + prev->this->name); + op_ret = -1; + op_errno = EINVAL; + goto out; + } + + inode_ctx_put (inode, this, (uint64_t)(long)layout); + goto out; + } + + if (is_linkfile) { + subvol = dht_linkfile_subvol (this, inode, stbuf, xattr); + + if (!subvol) { + gf_log (this->name, GF_LOG_WARNING, + "linkfile not having link subvolume. 
path=%s", + loc->path); + dht_lookup_everywhere (frame, this, loc); + return 0; + } + + STACK_WIND (frame, dht_lookup_linkfile_cbk, + subvol, subvol->fops->lookup, + &local->loc, local->xattr_req); + } + + return 0; + +out: + DHT_STACK_UNWIND (frame, op_ret, op_errno, inode, stbuf, xattr); + return 0; +} + + +int +dht_lookup (call_frame_t *frame, xlator_t *this, + loc_t *loc, dict_t *xattr_req) +{ + xlator_t *subvol = NULL; + xlator_t *hashed_subvol = NULL; + xlator_t *cached_subvol = NULL; + dht_local_t *local = NULL; + dht_conf_t *conf = NULL; + int ret = -1; + int op_errno = -1; + dht_layout_t *layout = NULL; + int i = 0; + int call_cnt = 0; + + + VALIDATE_OR_GOTO (frame, err); + VALIDATE_OR_GOTO (this, err); + VALIDATE_OR_GOTO (loc, err); + VALIDATE_OR_GOTO (loc->inode, err); + VALIDATE_OR_GOTO (loc->path, err); + + conf = this->private; + + local = dht_local_init (frame); + if (!local) { + op_errno = ENOMEM; + gf_log (this->name, GF_LOG_ERROR, + "memory allocation failed :("); + goto err; + } + + ret = loc_dup (loc, &local->loc); + if (ret == -1) { + op_errno = errno; + gf_log (this->name, GF_LOG_ERROR, + "copying location failed for path=%s", + loc->path); + goto err; + } + + if (xattr_req) { + local->xattr_req = dict_ref (xattr_req); + } else { + local->xattr_req = dict_new (); + } + + hashed_subvol = dht_subvol_get_hashed (this, loc); + cached_subvol = dht_subvol_get_cached (this, loc->inode); + + local->cached_subvol = cached_subvol; + local->hashed_subvol = hashed_subvol; + + if (is_revalidate (loc)) { + layout = dht_layout_get (this, loc->inode); + + if (!layout) { + gf_log (this->name, GF_LOG_ERROR, + "revalidate without cache. 
path=%s", + loc->path); + op_errno = EINVAL; + goto err; + } + + if (layout->gen && (layout->gen < conf->gen)) { + gf_log (this->name, GF_LOG_WARNING, + "incomplete layout failure for path=%s", + loc->path); + op_errno = EAGAIN; + goto err; + } + + local->inode = inode_ref (loc->inode); + local->st_ino = loc->inode->ino; + + local->call_cnt = layout->cnt; + call_cnt = local->call_cnt; + + /* NOTE: we don't require 'trusted.glusterfs.dht.linkto' attribute, + * revalidates directly go to the cached-subvolume. + */ + ret = dict_set_uint32 (local->xattr_req, + "trusted.glusterfs.dht", 4 * 4); + + for (i = 0; i < layout->cnt; i++) { + subvol = layout->list[i].xlator; + + STACK_WIND (frame, dht_revalidate_cbk, + subvol, subvol->fops->lookup, + loc, local->xattr_req); + + if (!--call_cnt) + break; + } + } else { + /* TODO: remove the hard-coding */ + ret = dict_set_uint32 (local->xattr_req, + "trusted.glusterfs.dht", 4 * 4); + + ret = dict_set_uint32 (local->xattr_req, + "trusted.glusterfs.dht.linkto", 256); + + if (!hashed_subvol) { + gf_log (this->name, GF_LOG_ERROR, + "no subvolume in layout for path=%s, " + "checking on all the subvols to see if " + "it is a directory", loc->path); + call_cnt = conf->subvolume_cnt; + local->call_cnt = call_cnt; + + local->layout = dht_layout_new (this, conf->subvolume_cnt); + if (!local->layout) { + op_errno = ENOMEM; + gf_log (this->name, GF_LOG_ERROR, + "memory allocation failed :("); + goto err; + } + + for (i = 0; i < call_cnt; i++) { + STACK_WIND (frame, dht_lookup_dir_cbk, + conf->subvolumes[i], + conf->subvolumes[i]->fops->lookup, + &local->loc, local->xattr_req); + } + return 0; + } + + STACK_WIND (frame, dht_lookup_cbk, + hashed_subvol, hashed_subvol->fops->lookup, + loc, local->xattr_req); + } + + return 0; + +err: + op_errno = (op_errno == -1) ? 
errno : op_errno; + DHT_STACK_UNWIND (frame, -1, op_errno, NULL, NULL, NULL); + return 0; +} + + +int +dht_attr_cbk (call_frame_t *frame, void *cookie, xlator_t *this, + int op_ret, int op_errno, struct stat *stbuf) +{ + dht_local_t *local = NULL; + int this_call_cnt = 0; + call_frame_t *prev = NULL; + + + local = frame->local; + prev = cookie; + + LOCK (&frame->lock); + { + if (op_ret == -1) { + local->op_errno = op_errno; + gf_log (this->name, GF_LOG_ERROR, + "subvolume %s returned -1 (%s)", + prev->this->name, strerror (op_errno)); + goto unlock; + } + + dht_stat_merge (this, &local->stbuf, stbuf, prev->this); + + if (local->inode) + local->stbuf.st_ino = local->inode->ino; + local->op_ret = 0; + } +unlock: + UNLOCK (&frame->lock); + + this_call_cnt = dht_frame_return (frame); + if (is_last_call (this_call_cnt)) + DHT_STACK_UNWIND (frame, local->op_ret, local->op_errno, + &local->stbuf); + + return 0; +} + + +int +dht_stat (call_frame_t *frame, xlator_t *this, + loc_t *loc) +{ + xlator_t *subvol = NULL; + int op_errno = -1; + dht_local_t *local = NULL; + dht_layout_t *layout = NULL; + int i = 0; + + + VALIDATE_OR_GOTO (frame, err); + VALIDATE_OR_GOTO (this, err); + VALIDATE_OR_GOTO (loc, err); + VALIDATE_OR_GOTO (loc->inode, err); + VALIDATE_OR_GOTO (loc->path, err); + + layout = dht_layout_get (this, loc->inode); + if (!layout) { + gf_log (this->name, GF_LOG_ERROR, + "no layout for path=%s", loc->path); + op_errno = EINVAL; + goto err; + } + + local = dht_local_init (frame); + if (!local) { + op_errno = ENOMEM; + gf_log (this->name, GF_LOG_ERROR, + "memory allocation failed :("); + goto err; + } + + local->inode = inode_ref (loc->inode); + local->call_cnt = layout->cnt; + + for (i = 0; i < layout->cnt; i++) { + subvol = layout->list[i].xlator; + + STACK_WIND (frame, dht_attr_cbk, + subvol, subvol->fops->stat, + loc); + } + + return 0; + +err: + op_errno = (op_errno == -1) ? 
errno : op_errno; + DHT_STACK_UNWIND (frame, -1, op_errno, NULL); + + return 0; +} + + +int +dht_fstat (call_frame_t *frame, xlator_t *this, + fd_t *fd) +{ + xlator_t *subvol = NULL; + int op_errno = -1; + dht_local_t *local = NULL; + dht_layout_t *layout = NULL; + int i = 0; + + + VALIDATE_OR_GOTO (frame, err); + VALIDATE_OR_GOTO (this, err); + VALIDATE_OR_GOTO (fd, err); + + layout = dht_layout_get (this, fd->inode); + if (!layout) { + gf_log (this->name, GF_LOG_ERROR, + "no layout for fd=%p", fd); + op_errno = EINVAL; + goto err; + } + + local = dht_local_init (frame); + if (!local) { + op_errno = ENOMEM; + gf_log (this->name, GF_LOG_ERROR, + "local allocation failed :("); + goto err; + } + + local->inode = inode_ref (fd->inode); + local->call_cnt = layout->cnt;; + + for (i = 0; i < layout->cnt; i++) { + subvol = layout->list[i].xlator; + STACK_WIND (frame, dht_attr_cbk, + subvol, subvol->fops->fstat, + fd); + } + + return 0; + +err: + op_errno = (op_errno == -1) ? errno : op_errno; + DHT_STACK_UNWIND (frame, -1, op_errno, NULL); + + return 0; +} + + +int +dht_chmod (call_frame_t *frame, xlator_t *this, + loc_t *loc, mode_t mode) +{ + dht_layout_t *layout = NULL; + dht_local_t *local = NULL; + int op_errno = -1; + int i = -1; + + + VALIDATE_OR_GOTO (frame, err); + VALIDATE_OR_GOTO (this, err); + VALIDATE_OR_GOTO (loc, err); + VALIDATE_OR_GOTO (loc->inode, err); + VALIDATE_OR_GOTO (loc->path, err); + + layout = dht_layout_get (this, loc->inode); + + if (!layout) { + gf_log (this->name, GF_LOG_ERROR, + "no layout for path=%s", loc->path); + op_errno = EINVAL; + goto err; + } + + if (!layout_is_sane (layout)) { + gf_log (this->name, GF_LOG_ERROR, + "layout is not sane for path=%s", loc->path); + op_errno = EINVAL; + goto err; + } + + local = dht_local_init (frame); + if (!local) { + op_errno = ENOMEM; + gf_log (this->name, GF_LOG_ERROR, + "memory allocation failed :("); + goto err; + } + + local->inode = inode_ref (loc->inode); + local->call_cnt = layout->cnt; + + 
for (i = 0; i < layout->cnt; i++) { + STACK_WIND (frame, dht_attr_cbk, + layout->list[i].xlator, + layout->list[i].xlator->fops->chmod, + loc, mode); + } + + return 0; + +err: + op_errno = (op_errno == -1) ? errno : op_errno; + DHT_STACK_UNWIND (frame, -1, op_errno, NULL); + + return 0; +} + + +int +dht_chown (call_frame_t *frame, xlator_t *this, + loc_t *loc, uid_t uid, gid_t gid) +{ + dht_layout_t *layout = NULL; + dht_local_t *local = NULL; + int op_errno = -1; + int i = -1; + + + VALIDATE_OR_GOTO (frame, err); + VALIDATE_OR_GOTO (this, err); + VALIDATE_OR_GOTO (loc, err); + VALIDATE_OR_GOTO (loc->inode, err); + VALIDATE_OR_GOTO (loc->path, err); + + layout = dht_layout_get (this, loc->inode); + if (!layout) { + gf_log (this->name, GF_LOG_ERROR, + "no layout for path=%s", loc->path); + op_errno = EINVAL; + goto err; + } + + if (!layout_is_sane (layout)) { + gf_log (this->name, GF_LOG_ERROR, + "layout is not sane for path=%s", loc->path); + op_errno = EINVAL; + goto err; + } + + local = dht_local_init (frame); + if (!local) { + op_errno = ENOMEM; + gf_log (this->name, GF_LOG_ERROR, + "memory allocation failed :("); + goto err; + } + + local->inode = inode_ref (loc->inode); + local->call_cnt = layout->cnt; + + for (i = 0; i < layout->cnt; i++) { + STACK_WIND (frame, dht_attr_cbk, + layout->list[i].xlator, + layout->list[i].xlator->fops->chown, + loc, uid, gid); + } + + return 0; + +err: + op_errno = (op_errno == -1) ? 
errno : op_errno; + DHT_STACK_UNWIND (frame, -1, op_errno, NULL); + + return 0; +} + + +int +dht_fchmod (call_frame_t *frame, xlator_t *this, + fd_t *fd, mode_t mode) +{ + dht_layout_t *layout = NULL; + dht_local_t *local = NULL; + int op_errno = -1; + int i = -1; + + + VALIDATE_OR_GOTO (frame, err); + VALIDATE_OR_GOTO (this, err); + VALIDATE_OR_GOTO (fd, err); + + + layout = dht_layout_get (this, fd->inode); + if (!layout) { + gf_log (this->name, GF_LOG_ERROR, + "no layout for fd=%p", fd); + op_errno = EINVAL; + goto err; + } + + if (!layout_is_sane (layout)) { + gf_log (this->name, GF_LOG_ERROR, + "layout is not sane for fd=%p", fd); + op_errno = EINVAL; + goto err; + } + + local = dht_local_init (frame); + if (!local) { + op_errno = ENOMEM; + gf_log (this->name, GF_LOG_ERROR, + "memory allocation failed :("); + goto err; + } + + local->inode = inode_ref (fd->inode); + local->call_cnt = layout->cnt; + + for (i = 0; i < layout->cnt; i++) { + STACK_WIND (frame, dht_attr_cbk, + layout->list[i].xlator, + layout->list[i].xlator->fops->fchmod, + fd, mode); + } + + return 0; + +err: + op_errno = (op_errno == -1) ? 
errno : op_errno; + DHT_STACK_UNWIND (frame, -1, op_errno, NULL); + + return 0; +} + + +int +dht_fchown (call_frame_t *frame, xlator_t *this, + fd_t *fd, uid_t uid, gid_t gid) +{ + dht_layout_t *layout = NULL; + dht_local_t *local = NULL; + int op_errno = -1; + int i = -1; + + + VALIDATE_OR_GOTO (frame, err); + VALIDATE_OR_GOTO (this, err); + VALIDATE_OR_GOTO (fd, err); + + layout = dht_layout_get (this, fd->inode); + if (!layout) { + gf_log (this->name, GF_LOG_ERROR, + "no layout for fd=%p", fd); + op_errno = EINVAL; + goto err; + } + + if (!layout_is_sane (layout)) { + gf_log (this->name, GF_LOG_ERROR, + "layout is not sane for fd=%p", fd); + op_errno = EINVAL; + goto err; + } + + local = dht_local_init (frame); + if (!local) { + op_errno = ENOMEM; + gf_log (this->name, GF_LOG_ERROR, + "memory allocation failed :("); + goto err; + } + + local->inode = inode_ref (fd->inode); + local->call_cnt = layout->cnt; + + for (i = 0; i < layout->cnt; i++) { + STACK_WIND (frame, dht_attr_cbk, + layout->list[i].xlator, + layout->list[i].xlator->fops->fchown, + fd, uid, gid); + } + + return 0; + +err: + op_errno = (op_errno == -1) ? 
errno : op_errno; + DHT_STACK_UNWIND (frame, -1, op_errno, NULL); + + return 0; +} + + +int +dht_utimens (call_frame_t *frame, xlator_t *this, + loc_t *loc, struct timespec tv[2]) +{ + dht_layout_t *layout = NULL; + dht_local_t *local = NULL; + int op_errno = -1; + int i = -1; + + + VALIDATE_OR_GOTO (frame, err); + VALIDATE_OR_GOTO (this, err); + VALIDATE_OR_GOTO (loc, err); + VALIDATE_OR_GOTO (loc->inode, err); + VALIDATE_OR_GOTO (loc->path, err); + + layout = dht_layout_get (this, loc->inode); + if (!layout) { + gf_log (this->name, GF_LOG_ERROR, + "no layout for path=%s", loc->path); + op_errno = EINVAL; + goto err; + } + + if (!layout_is_sane (layout)) { + gf_log (this->name, GF_LOG_ERROR, + "layout is not sane for path=%s", loc->path); + op_errno = EINVAL; + goto err; + } + + local = dht_local_init (frame); + if (!local) { + op_errno = ENOMEM; + gf_log (this->name, GF_LOG_ERROR, + "memory allocation failed :("); + goto err; + } + + local->inode = inode_ref (loc->inode); + local->call_cnt = layout->cnt; + + for (i = 0; i < layout->cnt; i++) { + STACK_WIND (frame, dht_attr_cbk, + layout->list[i].xlator, + layout->list[i].xlator->fops->utimens, + loc, tv); + } + + return 0; + +err: + op_errno = (op_errno == -1) ? 
errno : op_errno; + DHT_STACK_UNWIND (frame, -1, op_errno, NULL); + + return 0; +} + + +int +dht_truncate (call_frame_t *frame, xlator_t *this, + loc_t *loc, off_t offset) +{ + xlator_t *subvol = NULL; + int op_errno = -1; + dht_local_t *local = NULL; + + + VALIDATE_OR_GOTO (frame, err); + VALIDATE_OR_GOTO (this, err); + VALIDATE_OR_GOTO (loc, err); + VALIDATE_OR_GOTO (loc->inode, err); + VALIDATE_OR_GOTO (loc->path, err); + + subvol = dht_subvol_get_cached (this, loc->inode); + if (!subvol) { + gf_log (this->name, GF_LOG_ERROR, + "no cached subvolume for path=%s", loc->path); + op_errno = EINVAL; + goto err; + } + + local = dht_local_init (frame); + if (!local) { + op_errno = ENOMEM; + gf_log (this->name, GF_LOG_ERROR, + "memory allocation failed :("); + goto err; + } + + local->inode = inode_ref (loc->inode); + local->call_cnt = 1; + + STACK_WIND (frame, dht_attr_cbk, + subvol, subvol->fops->truncate, + loc, offset); + + return 0; + +err: + op_errno = (op_errno == -1) ? errno : op_errno; + DHT_STACK_UNWIND (frame, -1, op_errno, NULL); + + return 0; +} + + +int +dht_ftruncate (call_frame_t *frame, xlator_t *this, + fd_t *fd, off_t offset) +{ + xlator_t *subvol = NULL; + int op_errno = -1; + dht_local_t *local = NULL; + + + VALIDATE_OR_GOTO (frame, err); + VALIDATE_OR_GOTO (this, err); + VALIDATE_OR_GOTO (fd, err); + + subvol = dht_subvol_get_cached (this, fd->inode); + if (!subvol) { + gf_log (this->name, GF_LOG_ERROR, + "no cached subvolume for fd=%p", fd); + op_errno = EINVAL; + goto err; + } + + local = dht_local_init (frame); + if (!local) { + op_errno = ENOMEM; + gf_log (this->name, GF_LOG_ERROR, + "memory allocation failed :("); + goto err; + } + + local->inode = inode_ref (fd->inode); + local->call_cnt = 1; + + STACK_WIND (frame, dht_attr_cbk, + subvol, subvol->fops->ftruncate, + fd, offset); + + return 0; + +err: + op_errno = (op_errno == -1) ? 
errno : op_errno; + DHT_STACK_UNWIND (frame, -1, op_errno, NULL); + + return 0; +} + + +int +dht_err_cbk (call_frame_t *frame, void *cookie, xlator_t *this, + int op_ret, int op_errno) +{ + dht_local_t *local = NULL; + int this_call_cnt = 0; + call_frame_t *prev = NULL; + + + local = frame->local; + prev = cookie; + + LOCK (&frame->lock); + { + if (op_ret == -1) { + local->op_errno = op_errno; + gf_log (this->name, GF_LOG_ERROR, + "subvolume %s returned -1 (%s)", + prev->this->name, strerror (op_errno)); + goto unlock; + } + + local->op_ret = 0; + } +unlock: + UNLOCK (&frame->lock); + + this_call_cnt = dht_frame_return (frame); + if (is_last_call (this_call_cnt)) + DHT_STACK_UNWIND (frame, local->op_ret, local->op_errno); + + return 0; +} + + +int +dht_access (call_frame_t *frame, xlator_t *this, + loc_t *loc, int32_t mask) +{ + xlator_t *subvol = NULL; + int op_errno = -1; + dht_local_t *local = NULL; + + + VALIDATE_OR_GOTO (frame, err); + VALIDATE_OR_GOTO (this, err); + VALIDATE_OR_GOTO (loc, err); + VALIDATE_OR_GOTO (loc->inode, err); + VALIDATE_OR_GOTO (loc->path, err); + + subvol = dht_subvol_get_cached (this, loc->inode); + if (!subvol) { + gf_log (this->name, GF_LOG_ERROR, + "no cached subvolume for path=%s", loc->path); + op_errno = EINVAL; + goto err; + } + + local = dht_local_init (frame); + if (!local) { + op_errno = ENOMEM; + gf_log (this->name, GF_LOG_ERROR, + "memory allocation failed :("); + goto err; + } + + local->call_cnt = 1; + + STACK_WIND (frame, dht_err_cbk, + subvol, subvol->fops->access, + loc, mask); + + return 0; + +err: + op_errno = (op_errno == -1) ? 
errno : op_errno; + DHT_STACK_UNWIND (frame, -1, op_errno); + + return 0; +} + + +int +dht_readlink_cbk (call_frame_t *frame, void *cookie, xlator_t *this, + int op_ret, int op_errno, const char *path) +{ + DHT_STACK_UNWIND (frame, op_ret, op_errno, path); + + return 0; +} + + +int +dht_readlink (call_frame_t *frame, xlator_t *this, + loc_t *loc, size_t size) +{ + xlator_t *subvol = NULL; + int op_errno = -1; + + + VALIDATE_OR_GOTO (frame, err); + VALIDATE_OR_GOTO (this, err); + VALIDATE_OR_GOTO (loc, err); + VALIDATE_OR_GOTO (loc->inode, err); + VALIDATE_OR_GOTO (loc->path, err); + + subvol = dht_subvol_get_cached (this, loc->inode); + if (!subvol) { + gf_log (this->name, GF_LOG_ERROR, + "no cached subvolume for path=%s", loc->path); + op_errno = EINVAL; + goto err; + } + + STACK_WIND (frame, dht_readlink_cbk, + subvol, subvol->fops->readlink, + loc, size); + + return 0; + +err: + op_errno = (op_errno == -1) ? errno : op_errno; + DHT_STACK_UNWIND (frame, -1, op_errno, NULL); + + return 0; +} + + +int +dht_getxattr_cbk (call_frame_t *frame, void *cookie, xlator_t *this, + int op_ret, int op_errno, dict_t *xattr) +{ + DHT_STACK_UNWIND (frame, op_ret, op_errno, xattr); + + return 0; +} + + +int +dht_getxattr (call_frame_t *frame, xlator_t *this, + loc_t *loc, const char *key) +{ + xlator_t *subvol = NULL; + int op_errno = -1; + + + VALIDATE_OR_GOTO (frame, err); + VALIDATE_OR_GOTO (this, err); + VALIDATE_OR_GOTO (loc, err); + VALIDATE_OR_GOTO (loc->inode, err); + VALIDATE_OR_GOTO (loc->path, err); + + subvol = dht_subvol_get_cached (this, loc->inode); + if (!subvol) { + gf_log (this->name, GF_LOG_ERROR, + "no cached subvolume for path=%s", loc->path); + op_errno = EINVAL; + goto err; + } + + STACK_WIND (frame, dht_getxattr_cbk, + subvol, subvol->fops->getxattr, + loc, key); + + return 0; + +err: + op_errno = (op_errno == -1) ? 
errno : op_errno; + DHT_STACK_UNWIND (frame, -1, op_errno, NULL); + + return 0; +} + + +int +dht_setxattr (call_frame_t *frame, xlator_t *this, + loc_t *loc, dict_t *xattr, int flags) +{ + xlator_t *subvol = NULL; + int op_errno = -1; + dht_local_t *local = NULL; + + + VALIDATE_OR_GOTO (frame, err); + VALIDATE_OR_GOTO (this, err); + VALIDATE_OR_GOTO (loc, err); + VALIDATE_OR_GOTO (loc->inode, err); + VALIDATE_OR_GOTO (loc->path, err); + + subvol = dht_subvol_get_cached (this, loc->inode); + if (!subvol) { + gf_log (this->name, GF_LOG_ERROR, + "no cached subvolume for path=%s", loc->path); + op_errno = EINVAL; + goto err; + } + + local = dht_local_init (frame); + if (!local) { + op_errno = ENOMEM; + gf_log (this->name, GF_LOG_ERROR, + "memory allocation failed :("); + goto err; + } + + local->call_cnt = 1; + + STACK_WIND (frame, dht_err_cbk, + subvol, subvol->fops->setxattr, + loc, xattr, flags); + + return 0; + +err: + op_errno = (op_errno == -1) ? errno : op_errno; + DHT_STACK_UNWIND (frame, -1, op_errno, NULL); + + return 0; +} + + +int +dht_removexattr (call_frame_t *frame, xlator_t *this, + loc_t *loc, const char *key) +{ + xlator_t *subvol = NULL; + int op_errno = -1; + dht_local_t *local = NULL; + + + VALIDATE_OR_GOTO (frame, err); + VALIDATE_OR_GOTO (this, err); + VALIDATE_OR_GOTO (loc, err); + VALIDATE_OR_GOTO (loc->inode, err); + VALIDATE_OR_GOTO (loc->path, err); + + subvol = dht_subvol_get_cached (this, loc->inode); + if (!subvol) { + gf_log (this->name, GF_LOG_ERROR, + "no cached subvolume for path=%s", loc->path); + op_errno = EINVAL; + goto err; + } + + local = dht_local_init (frame); + if (!local) { + op_errno = ENOMEM; + gf_log (this->name, GF_LOG_ERROR, + "memory allocation failed :("); + goto err; + } + + local->call_cnt = 1; + + STACK_WIND (frame, dht_err_cbk, + subvol, subvol->fops->removexattr, + loc, key); + + return 0; + +err: + op_errno = (op_errno == -1) ? 
errno : op_errno; + DHT_STACK_UNWIND (frame, -1, op_errno, NULL); + + return 0; +} + + +int +dht_fd_cbk (call_frame_t *frame, void *cookie, xlator_t *this, + int op_ret, int op_errno, fd_t *fd) +{ + dht_local_t *local = NULL; + int this_call_cnt = 0; + call_frame_t *prev = NULL; + + + local = frame->local; + prev = cookie; + + LOCK (&frame->lock); + { + if (op_ret == -1) { + local->op_errno = op_errno; + gf_log (this->name, GF_LOG_ERROR, + "subvolume %s returned -1 (%s)", + prev->this->name, strerror (op_errno)); + goto unlock; + } + + local->op_ret = 0; + } +unlock: + UNLOCK (&frame->lock); + + this_call_cnt = dht_frame_return (frame); + if (is_last_call (this_call_cnt)) + DHT_STACK_UNWIND (frame, local->op_ret, local->op_errno, + local->fd); + + return 0; +} + + +int +dht_open (call_frame_t *frame, xlator_t *this, + loc_t *loc, int flags, fd_t *fd) +{ + xlator_t *subvol = NULL; + int ret = -1; + int op_errno = -1; + dht_local_t *local = NULL; + + + VALIDATE_OR_GOTO (frame, err); + VALIDATE_OR_GOTO (this, err); + VALIDATE_OR_GOTO (fd, err); + + subvol = dht_subvol_get_cached (this, fd->inode); + if (!subvol) { + gf_log (this->name, GF_LOG_ERROR, + "no cached subvolume for fd=%p", fd); + op_errno = EINVAL; + goto err; + } + + local = dht_local_init (frame); + if (!local) { + op_errno = ENOMEM; + gf_log (this->name, GF_LOG_ERROR, + "memory allocation failed :("); + goto err; + } + + local->fd = fd_ref (fd); + ret = loc_dup (loc, &local->loc); + if (ret == -1) { + op_errno = ENOMEM; + gf_log (this->name, GF_LOG_ERROR, + "memory allocation failed :("); + goto err; + } + + local->call_cnt = 1; + + STACK_WIND (frame, dht_fd_cbk, + subvol, subvol->fops->open, + loc, flags, fd); + + return 0; + +err: + op_errno = (op_errno == -1) ? 
errno : op_errno; + DHT_STACK_UNWIND (frame, -1, op_errno, NULL); + + return 0; +} + + +int +dht_readv_cbk (call_frame_t *frame, void *cookie, xlator_t *this, + int op_ret, int op_errno, + struct iovec *vector, int count, struct stat *stbuf) +{ + DHT_STACK_UNWIND (frame, op_ret, op_errno, vector, count, stbuf); + + return 0; +} + + +int +dht_readv (call_frame_t *frame, xlator_t *this, + fd_t *fd, size_t size, off_t off) +{ + xlator_t *subvol = NULL; + int op_errno = -1; + + + VALIDATE_OR_GOTO (frame, err); + VALIDATE_OR_GOTO (this, err); + VALIDATE_OR_GOTO (fd, err); + + subvol = dht_subvol_get_cached (this, fd->inode); + if (!subvol) { + gf_log (this->name, GF_LOG_ERROR, + "no cached subvolume for fd=%p", fd); + op_errno = EINVAL; + goto err; + } + + STACK_WIND (frame, dht_readv_cbk, + subvol, subvol->fops->readv, + fd, size, off); + + return 0; + +err: + op_errno = (op_errno == -1) ? errno : op_errno; + DHT_STACK_UNWIND (frame, -1, op_errno, NULL, 0, NULL); + + return 0; +} + + +int +dht_writev_cbk (call_frame_t *frame, void *cookie, xlator_t *this, + int op_ret, int op_errno, struct stat *stbuf) +{ + DHT_STACK_UNWIND (frame, op_ret, op_errno, stbuf); + + return 0; +} + + +int +dht_writev (call_frame_t *frame, xlator_t *this, + fd_t *fd, struct iovec *vector, int count, off_t off) +{ + xlator_t *subvol = NULL; + int op_errno = -1; + + + VALIDATE_OR_GOTO (frame, err); + VALIDATE_OR_GOTO (this, err); + VALIDATE_OR_GOTO (fd, err); + + subvol = dht_subvol_get_cached (this, fd->inode); + if (!subvol) { + gf_log (this->name, GF_LOG_ERROR, + "no cached subvolume for fd=%p", fd); + op_errno = EINVAL; + goto err; + } + + STACK_WIND (frame, dht_writev_cbk, + subvol, subvol->fops->writev, + fd, vector, count, off); + + return 0; + +err: + op_errno = (op_errno == -1) ? 
errno : op_errno; + DHT_STACK_UNWIND (frame, -1, op_errno, NULL, 0); + + return 0; +} + + +int +dht_flush (call_frame_t *frame, xlator_t *this, fd_t *fd) +{ + xlator_t *subvol = NULL; + int op_errno = -1; + dht_local_t *local = NULL; + + + VALIDATE_OR_GOTO (frame, err); + VALIDATE_OR_GOTO (this, err); + VALIDATE_OR_GOTO (fd, err); + + subvol = dht_subvol_get_cached (this, fd->inode); + if (!subvol) { + gf_log (this->name, GF_LOG_ERROR, + "no cached subvolume for fd=%p", fd); + op_errno = EINVAL; + goto err; + } + + local = dht_local_init (frame); + if (!local) { + op_errno = ENOMEM; + gf_log (this->name, GF_LOG_ERROR, + "memory allocation failed :("); + goto err; + } + + local->fd = fd_ref (fd); + local->call_cnt = 1; + + STACK_WIND (frame, dht_err_cbk, + subvol, subvol->fops->flush, fd); + + return 0; + +err: + op_errno = (op_errno == -1) ? errno : op_errno; + DHT_STACK_UNWIND (frame, -1, op_errno); + + return 0; +} + + +int +dht_fsync (call_frame_t *frame, xlator_t *this, + fd_t *fd, int datasync) +{ + xlator_t *subvol = NULL; + int op_errno = -1; + dht_local_t *local = NULL; + + + VALIDATE_OR_GOTO (frame, err); + VALIDATE_OR_GOTO (this, err); + VALIDATE_OR_GOTO (fd, err); + + subvol = dht_subvol_get_cached (this, fd->inode); + if (!subvol) { + gf_log (this->name, GF_LOG_ERROR, + "no cached subvolume for fd=%p", fd); + op_errno = EINVAL; + goto err; + } + + local = dht_local_init (frame); + if (!local) { + op_errno = ENOMEM; + gf_log (this->name, GF_LOG_ERROR, + "memory allocatoin failed :("); + goto err; + } + local->call_cnt = 1; + + STACK_WIND (frame, dht_err_cbk, + subvol, subvol->fops->fsync, + fd, datasync); + + return 0; + +err: + op_errno = (op_errno == -1) ? 
errno : op_errno; + DHT_STACK_UNWIND (frame, -1, op_errno); + + return 0; +} + + +int +dht_lk_cbk (call_frame_t *frame, void *cookie, xlator_t *this, + int op_ret, int op_errno, struct flock *flock) +{ + DHT_STACK_UNWIND (frame, op_ret, op_errno, flock); + + return 0; +} + + +int +dht_lk (call_frame_t *frame, xlator_t *this, + fd_t *fd, int cmd, struct flock *flock) +{ + xlator_t *subvol = NULL; + int op_errno = -1; + + + VALIDATE_OR_GOTO (frame, err); + VALIDATE_OR_GOTO (this, err); + VALIDATE_OR_GOTO (fd, err); + + subvol = dht_subvol_get_cached (this, fd->inode); + if (!subvol) { + gf_log (this->name, GF_LOG_ERROR, + "no cached subvolume for fd=%p", fd); + op_errno = EINVAL; + goto err; + } + + STACK_WIND (frame, dht_lk_cbk, + subvol, subvol->fops->lk, + fd, cmd, flock); + + return 0; + +err: + op_errno = (op_errno == -1) ? errno : op_errno; + DHT_STACK_UNWIND (frame, -1, op_errno, NULL); + + return 0; +} + +/* gf_lk no longer exists +int +dht_gf_lk_cbk (call_frame_t *frame, void *cookie, xlator_t *this, + int op_ret, int op_errno, struct flock *flock) +{ + DHT_STACK_UNWIND (frame, op_ret, op_errno, flock); + + return 0; +} + + +int +dht_gf_lk (call_frame_t *frame, xlator_t *this, + loc_t *loc, int cmd, struct flock *flock) +{ + xlator_t *subvol = NULL; + int op_errno = -1; + + + VALIDATE_OR_GOTO (frame, err); + VALIDATE_OR_GOTO (this, err); + VALIDATE_OR_GOTO (fd, err); + + subvol = dht_subvol_get_cached (this, fd->inode); + if (!subvol) { + gf_log (this->name, GF_LOG_ERROR, + "no cached subvolume for fd=%p", fd); + op_errno = EINVAL; + goto err; + } + + STACK_WIND (frame, dht_gf_lk_cbk, + subvol, subvol->fops->gf_lk, + fd, cmd, flock); + + return 0; + +err: + op_errno = (op_errno == -1) ? 
errno : op_errno; + DHT_STACK_UNWIND (frame, -1, op_errno, NULL); + + return 0; +} +*/ + +int +dht_statfs_cbk (call_frame_t *frame, void *cookie, xlator_t *this, + int op_ret, int op_errno, struct statvfs *statvfs) +{ + dht_local_t *local = NULL; + int this_call_cnt = 0; + + + local = frame->local; + + LOCK (&frame->lock); + { + if (op_ret == -1) { + local->op_errno = op_errno; + goto unlock; + } + local->op_ret = 0; + + /* TODO: normalize sizes */ + local->statvfs.f_bsize = statvfs->f_bsize; + local->statvfs.f_frsize = statvfs->f_frsize; + + local->statvfs.f_blocks += statvfs->f_blocks; + local->statvfs.f_bfree += statvfs->f_bfree; + local->statvfs.f_bavail += statvfs->f_bavail; + local->statvfs.f_files += statvfs->f_files; + local->statvfs.f_ffree += statvfs->f_ffree; + local->statvfs.f_favail += statvfs->f_favail; + local->statvfs.f_fsid = statvfs->f_fsid; + local->statvfs.f_flag = statvfs->f_flag; + local->statvfs.f_namemax = statvfs->f_namemax; + + } +unlock: + UNLOCK (&frame->lock); + + + this_call_cnt = dht_frame_return (frame); + if (is_last_call (this_call_cnt)) + DHT_STACK_UNWIND (frame, local->op_ret, local->op_errno, + &local->statvfs); + + return 0; +} + + +int +dht_statfs (call_frame_t *frame, xlator_t *this, loc_t *loc) +{ + dht_local_t *local = NULL; + dht_conf_t *conf = NULL; + int op_errno = -1; + int i = -1; + + + VALIDATE_OR_GOTO (frame, err); + VALIDATE_OR_GOTO (this, err); + VALIDATE_OR_GOTO (loc, err); + VALIDATE_OR_GOTO (loc->inode, err); + VALIDATE_OR_GOTO (loc->path, err); + + conf = this->private; + + local = dht_local_init (frame); + local->call_cnt = conf->subvolume_cnt; + + for (i = 0; i < conf->subvolume_cnt; i++) { + STACK_WIND (frame, dht_statfs_cbk, + conf->subvolumes[i], + conf->subvolumes[i]->fops->statfs, loc); + } + + return 0; + +err: + op_errno = (op_errno == -1) ? 
errno : op_errno; + DHT_STACK_UNWIND (frame, -1, op_errno, NULL); + + return 0; +} + + +int +dht_opendir (call_frame_t *frame, xlator_t *this, loc_t *loc, fd_t *fd) +{ + dht_local_t *local = NULL; + dht_conf_t *conf = NULL; + int ret = -1; + int op_errno = -1; + int i = -1; + + + VALIDATE_OR_GOTO (frame, err); + VALIDATE_OR_GOTO (this, err); + VALIDATE_OR_GOTO (fd, err); + + conf = this->private; + + local = dht_local_init (frame); + if (!local) { + op_errno = ENOMEM; + gf_log (this->name, GF_LOG_ERROR, + "memory allocation failed :("); + goto err; + } + + local->fd = fd_ref (fd); + ret = loc_dup (loc, &local->loc); + if (ret == -1) { + op_errno = ENOMEM; + gf_log (this->name, GF_LOG_ERROR, + "memory allocation failed :("); + goto err; + } + + local->call_cnt = conf->subvolume_cnt; + + for (i = 0; i < conf->subvolume_cnt; i++) { + STACK_WIND (frame, dht_fd_cbk, + conf->subvolumes[i], + conf->subvolumes[i]->fops->opendir, + loc, fd); + } + + return 0; + +err: + op_errno = (op_errno == -1) ? 
errno : op_errno; + DHT_STACK_UNWIND (frame, -1, op_errno, NULL); + + return 0; +} + + +int +dht_readdir_cbk (call_frame_t *frame, void *cookie, xlator_t *this, + int op_ret, int op_errno, gf_dirent_t *orig_entries) +{ + dht_local_t *local = NULL; + gf_dirent_t entries; + gf_dirent_t *orig_entry = NULL; + gf_dirent_t *entry = NULL; + call_frame_t *prev = NULL; + xlator_t *subvol = NULL; + xlator_t *next = NULL; + dht_layout_t *layout = NULL; + int count = 0; + + + INIT_LIST_HEAD (&entries.list); + prev = cookie; + local = frame->local; + + if (op_ret < 0) + goto done; + + layout = dht_layout_get (this, local->fd->inode); + + list_for_each_entry (orig_entry, &orig_entries->list, list) { + subvol = dht_layout_search (this, layout, orig_entry->d_name); + + if (!subvol || subvol == prev->this) { + entry = gf_dirent_for_name (orig_entry->d_name); + if (!entry) { + gf_log (this->name, GF_LOG_ERROR, + "memory allocation failed :("); + goto unwind; + } + + dht_itransform (this, subvol, orig_entry->d_ino, + &entry->d_ino); + dht_itransform (this, subvol, orig_entry->d_off, + &entry->d_off); + + entry->d_type = orig_entry->d_type; + entry->d_len = orig_entry->d_len; + + list_add_tail (&entry->list, &entries.list); + count++; + } + } + op_ret = count; + +done: + if (count == 0) { + next = dht_subvol_next (this, prev->this); + if (!next) { + goto unwind; + } + + STACK_WIND (frame, dht_readdir_cbk, + next, next->fops->readdir, + local->fd, local->size, 0); + return 0; + } + +unwind: + if (op_ret < 0) + op_ret = 0; + + DHT_STACK_UNWIND (frame, op_ret, op_errno, &entries); + + gf_dirent_free (&entries); + + return 0; +} + + +int +dht_readdir (call_frame_t *frame, xlator_t *this, + fd_t *fd, size_t size, off_t yoff) +{ + dht_local_t *local = NULL; + dht_conf_t *conf = NULL; + int op_errno = -1; + xlator_t *xvol = NULL; + off_t xoff = 0; + + + VALIDATE_OR_GOTO (frame, err); + VALIDATE_OR_GOTO (this, err); + VALIDATE_OR_GOTO (fd, err); + + conf = this->private; + + local = 
dht_local_init (frame); + if (!local) { + gf_log (this->name, GF_LOG_ERROR, + "memory allocation failed :("); + op_errno = ENOMEM; + goto err; + } + + local->fd = fd_ref (fd); + local->size = size; + + dht_deitransform (this, yoff, &xvol, (uint64_t *)&xoff); + + /* TODO: do proper readdir */ + STACK_WIND (frame, dht_readdir_cbk, + xvol, xvol->fops->readdir, + fd, size, xoff); + + return 0; + +err: + op_errno = (op_errno == -1) ? errno : op_errno; + DHT_STACK_UNWIND (frame, -1, op_errno, NULL); + + return 0; +} + + +int +dht_fsyncdir_cbk (call_frame_t *frame, void *cookie, xlator_t *this, + int op_ret, int op_errno) +{ + dht_local_t *local = NULL; + int this_call_cnt = 0; + + + local = frame->local; + + LOCK (&frame->lock); + { + if (op_ret == -1) + local->op_errno = op_errno; + + if (op_ret == 0) + local->op_ret = 0; + } + UNLOCK (&frame->lock); + + this_call_cnt = dht_frame_return (frame); + if (is_last_call (this_call_cnt)) + DHT_STACK_UNWIND (frame, local->op_ret, local->op_errno); + + return 0; +} + + +int +dht_fsyncdir (call_frame_t *frame, xlator_t *this, fd_t *fd, int datasync) +{ + dht_local_t *local = NULL; + dht_conf_t *conf = NULL; + int op_errno = -1; + int i = -1; + + + VALIDATE_OR_GOTO (frame, err); + VALIDATE_OR_GOTO (this, err); + VALIDATE_OR_GOTO (fd, err); + + conf = this->private; + + local = dht_local_init (frame); + if (!local) { + op_errno = ENOMEM; + gf_log (this->name, GF_LOG_ERROR, + "memory allocation failed :("); + goto err; + } + + local->fd = fd_ref (fd); + local->call_cnt = conf->subvolume_cnt; + + for (i = 0; i < conf->subvolume_cnt; i++) { + STACK_WIND (frame, dht_fsyncdir_cbk, + conf->subvolumes[i], + conf->subvolumes[i]->fops->fsyncdir, + fd, datasync); + } + + return 0; + +err: + op_errno = (op_errno == -1) ? 
errno : op_errno; + DHT_STACK_UNWIND (frame, -1, op_errno); + + return 0; +} + + +int +dht_newfile_cbk (call_frame_t *frame, void *cookie, xlator_t *this, + int op_ret, int op_errno, + inode_t *inode, struct stat *stbuf) +{ + call_frame_t *prev = NULL; + dht_layout_t *layout = NULL; + int ret = -1; + + + if (op_ret == -1) + goto out; + + prev = cookie; + + dht_itransform (this, prev->this, stbuf->st_ino, &stbuf->st_ino); + layout = dht_layout_for_subvol (this, prev->this); + + if (!layout) { + gf_log (this->name, GF_LOG_ERROR, + "no pre-set layout for subvolume %s", + prev->this->name); + op_ret = -1; + op_errno = EINVAL; + goto out; + } + + ret = inode_ctx_put (inode, this, (uint64_t)(long)layout); + if (ret != 0) { + gf_log (this->name, GF_LOG_ERROR, + "could not set inode context"); + op_ret = -1; + op_errno = EINVAL; + goto out; + } + +out: + DHT_STACK_UNWIND (frame, op_ret, op_errno, inode, stbuf); + return 0; +} + + +int +dht_mknod (call_frame_t *frame, xlator_t *this, + loc_t *loc, mode_t mode, dev_t rdev) +{ + xlator_t *subvol = NULL; + int op_errno = -1; + + + VALIDATE_OR_GOTO (frame, err); + VALIDATE_OR_GOTO (this, err); + VALIDATE_OR_GOTO (loc, err); + + subvol = dht_subvol_get_hashed (this, loc); + if (!subvol) { + gf_log (this->name, GF_LOG_ERROR, + "no subvolume in layout for path=%s", + loc->path); + op_errno = ENOENT; + goto err; + } + + gf_log (this->name, GF_LOG_DEBUG, + "creating %s on %s", loc->path, subvol->name); + + STACK_WIND (frame, dht_newfile_cbk, + subvol, subvol->fops->mknod, + loc, mode, rdev); + + return 0; + +err: + op_errno = (op_errno == -1) ? 
errno : op_errno; + DHT_STACK_UNWIND (frame, -1, op_errno, NULL, NULL); + + return 0; +} + + +int +dht_symlink (call_frame_t *frame, xlator_t *this, + const char *linkname, loc_t *loc) +{ + xlator_t *subvol = NULL; + int op_errno = -1; + + + VALIDATE_OR_GOTO (frame, err); + VALIDATE_OR_GOTO (this, err); + VALIDATE_OR_GOTO (loc, err); + + subvol = dht_subvol_get_hashed (this, loc); + if (!subvol) { + gf_log (this->name, GF_LOG_ERROR, + "no subvolume in layout for path=%s", + loc->path); + op_errno = ENOENT; + goto err; + } + + gf_log (this->name, GF_LOG_DEBUG, + "creating %s on %s", loc->path, subvol->name); + + STACK_WIND (frame, dht_newfile_cbk, + subvol, subvol->fops->symlink, + linkname, loc); + + return 0; + +err: + op_errno = (op_errno == -1) ? errno : op_errno; + DHT_STACK_UNWIND (frame, -1, op_errno, NULL, NULL); + + return 0; +} + + +int +dht_unlink (call_frame_t *frame, xlator_t *this, loc_t *loc) +{ + xlator_t *cached_subvol = NULL; + xlator_t *hashed_subvol = NULL; + int op_errno = -1; + dht_local_t *local = NULL; + + + VALIDATE_OR_GOTO (frame, err); + VALIDATE_OR_GOTO (this, err); + VALIDATE_OR_GOTO (loc, err); + + cached_subvol = dht_subvol_get_cached (this, loc->inode); + if (!cached_subvol) { + gf_log (this->name, GF_LOG_ERROR, + "no cached subvolume for path=%s", loc->path); + op_errno = EINVAL; + goto err; + } + + hashed_subvol = dht_subvol_get_hashed (this, loc); + if (!hashed_subvol) { + gf_log (this->name, GF_LOG_ERROR, + "no subvolume in layout for path=%s", + loc->path); + op_errno = EINVAL; + goto err; + } + + local = dht_local_init (frame); + if (!local) { + op_errno = ENOMEM; + gf_log (this->name, GF_LOG_ERROR, + "memory allocation failed :("); + goto err; + } + + local->call_cnt = 1; + if (hashed_subvol != cached_subvol) + local->call_cnt++; + + STACK_WIND (frame, dht_err_cbk, + cached_subvol, cached_subvol->fops->unlink, loc); + + if (hashed_subvol != cached_subvol) + STACK_WIND (frame, dht_err_cbk, + hashed_subvol, 
hashed_subvol->fops->unlink, loc); + + return 0; + +err: + op_errno = (op_errno == -1) ? errno : op_errno; + DHT_STACK_UNWIND (frame, -1, op_errno); + + return 0; +} + + +int +dht_link_cbk (call_frame_t *frame, void *cookie, xlator_t *this, + int op_ret, int op_errno, + inode_t *inode, struct stat *stbuf) +{ + call_frame_t *prev = NULL; + dht_layout_t *layout = NULL; + dht_local_t *local = NULL; + + prev = cookie; + local = frame->local; + + if (op_ret == -1) + goto out; + + layout = dht_layout_for_subvol (this, prev->this); + if (!layout) { + gf_log (this->name, GF_LOG_ERROR, + "no pre-set layout for subvolume %s", + prev->this->name); + op_ret = -1; + op_errno = EINVAL; + goto out; + } + + stbuf->st_ino = local->loc.inode->ino; + +out: + DHT_STACK_UNWIND (frame, op_ret, op_errno, inode, stbuf); + + return 0; +} + + +int +dht_link_linkfile_cbk (call_frame_t *frame, void *cookie, xlator_t *this, + int op_ret, int op_errno, + inode_t *inode, struct stat *stbuf) +{ + dht_local_t *local = NULL; + xlator_t *srcvol = NULL; + + + if (op_ret == -1) + goto err; + + local = frame->local; + srcvol = local->linkfile.srcvol; + + STACK_WIND (frame, dht_link_cbk, + srcvol, srcvol->fops->link, + &local->loc, &local->loc2); + + return 0; + +err: + DHT_STACK_UNWIND (frame, op_ret, op_errno, inode, stbuf); + + return 0; +} + + +int +dht_link (call_frame_t *frame, xlator_t *this, + loc_t *oldloc, loc_t *newloc) +{ + xlator_t *cached_subvol = NULL; + xlator_t *hashed_subvol = NULL; + int op_errno = -1; + int ret = -1; + dht_local_t *local = NULL; + + + VALIDATE_OR_GOTO (frame, err); + VALIDATE_OR_GOTO (this, err); + VALIDATE_OR_GOTO (oldloc, err); + VALIDATE_OR_GOTO (newloc, err); + + cached_subvol = dht_subvol_get_cached (this, oldloc->inode); + if (!cached_subvol) { + gf_log (this->name, GF_LOG_ERROR, + "no cached subvolume for path=%s", oldloc->path); + op_errno = EINVAL; + goto err; + } + + hashed_subvol = dht_subvol_get_hashed (this, newloc); + if (!hashed_subvol) { + gf_log 
(this->name, GF_LOG_ERROR, + "no subvolume in layout for path=%s", + newloc->path); + op_errno = EINVAL; + goto err; + } + + local = dht_local_init (frame); + if (!local) { + op_errno = ENOMEM; + gf_log (this->name, GF_LOG_ERROR, + "memory allocation failed :("); + goto err; + } + + ret = loc_copy (&local->loc, oldloc); + if (ret == -1) { + op_errno = ENOMEM; + gf_log (this->name, GF_LOG_ERROR, + "memory allocation failed :("); + goto err; + } + + ret = loc_copy (&local->loc2, newloc); + if (ret == -1) { + op_errno = ENOMEM; + gf_log (this->name, GF_LOG_ERROR, + "memory allocation failed :("); + goto err; + } + + if (hashed_subvol != cached_subvol) { + dht_linkfile_create (frame, dht_link_linkfile_cbk, + cached_subvol, hashed_subvol, newloc); + } else { + STACK_WIND (frame, dht_link_cbk, + cached_subvol, cached_subvol->fops->link, + oldloc, newloc); + } + + return 0; + +err: + op_errno = (op_errno == -1) ? errno : op_errno; + DHT_STACK_UNWIND (frame, -1, op_errno, NULL, NULL); + + return 0; +} + + +int +dht_create_cbk (call_frame_t *frame, void *cookie, xlator_t *this, + int op_ret, int op_errno, + fd_t *fd, inode_t *inode, struct stat *stbuf) +{ + call_frame_t *prev = NULL; + dht_layout_t *layout = NULL; + int ret = -1; + + + if (op_ret == -1) + goto out; + + prev = cookie; + + dht_itransform (this, prev->this, stbuf->st_ino, &stbuf->st_ino); + layout = dht_layout_for_subvol (this, prev->this); + + if (!layout) { + gf_log (this->name, GF_LOG_ERROR, + "no pre-set layout for subvolume %s", + prev->this->name); + op_ret = -1; + op_errno = EINVAL; + goto out; + } + + ret = inode_ctx_put (inode, this, (uint64_t)(long)layout); + if (ret != 0) { + gf_log (this->name, GF_LOG_ERROR, + "could not set inode context"); + op_ret = -1; + op_errno = EINVAL; + goto out; + } + +out: + DHT_STACK_UNWIND (frame, op_ret, op_errno, fd, inode, stbuf); + return 0; +} + + +int +dht_create (call_frame_t *frame, xlator_t *this, + loc_t *loc, int32_t flags, mode_t mode, fd_t *fd) +{ + 
xlator_t *subvol = NULL; + int op_errno = -1; + + + VALIDATE_OR_GOTO (frame, err); + VALIDATE_OR_GOTO (this, err); + VALIDATE_OR_GOTO (loc, err); + + subvol = dht_subvol_get_hashed (this, loc); + if (!subvol) { + gf_log (this->name, GF_LOG_ERROR, + "no subvolume in layout for path=%s", + loc->path); + op_errno = ENOENT; + goto err; + } + + gf_log (this->name, GF_LOG_DEBUG, + "creating %s on %s", loc->path, subvol->name); + + STACK_WIND (frame, dht_create_cbk, + subvol, subvol->fops->create, + loc, flags, mode, fd); + + return 0; + +err: + op_errno = (op_errno == -1) ? errno : op_errno; + DHT_STACK_UNWIND (frame, -1, op_errno, NULL, NULL, NULL); + + return 0; +} + + +int +dht_mkdir_selfheal_cbk (call_frame_t *frame, void *cookie, + xlator_t *this, + int32_t op_ret, int32_t op_errno) +{ + dht_local_t *local = NULL; + dht_layout_t *layout = NULL; + + + local = frame->local; + layout = local->selfheal.layout; + + if (op_ret == 0) { + inode_ctx_put (local->inode, this, (uint64_t)(long)layout); + local->selfheal.layout = NULL; + local->stbuf.st_ino = local->st_ino; + } + + DHT_STACK_UNWIND (frame, op_ret, op_errno, + local->inode, &local->stbuf); + + return 0; +} + + +int +dht_mkdir_cbk (call_frame_t *frame, void *cookie, xlator_t *this, + int op_ret, int op_errno, inode_t *inode, struct stat *stbuf) +{ + dht_local_t *local = NULL; + int this_call_cnt = 0; + int ret = -1; + call_frame_t *prev = NULL; + dht_layout_t *layout = NULL; + + local = frame->local; + prev = cookie; + layout = local->layout; + + LOCK (&frame->lock); + { + ret = dht_layout_merge (this, layout, prev->this, + op_ret, op_errno, NULL); + + if (op_ret == -1) { + local->op_errno = op_errno; + goto unlock; + } + dht_stat_merge (this, &local->stbuf, stbuf, prev->this); + } +unlock: + UNLOCK (&frame->lock); + + this_call_cnt = dht_frame_return (frame); + if (is_last_call (this_call_cnt)) { + local->layout = NULL; + dht_selfheal_directory (frame, dht_mkdir_selfheal_cbk, + &local->loc, layout); + } + + return 
0; +} + +int +dht_mkdir_hashed_cbk (call_frame_t *frame, void *cookie, + xlator_t *this, int op_ret, int op_errno, + inode_t *inode, struct stat *stbuf) +{ + dht_local_t *local = NULL; + int ret = -1; + call_frame_t *prev = NULL; + dht_layout_t *layout = NULL; + dht_conf_t *conf = NULL; + int i = 0; + xlator_t *hashed_subvol = NULL; + + local = frame->local; + prev = cookie; + layout = local->layout; + conf = this->private; + hashed_subvol = local->hashed_subvol; + + ret = dht_layout_merge (this, layout, prev->this, + op_ret, op_errno, NULL); + + if (op_ret == -1) { + local->op_errno = op_errno; + goto err; + } + local->op_ret = 0; + + dht_stat_merge (this, &local->stbuf, stbuf, prev->this); + + local->st_ino = local->stbuf.st_ino; + + local->call_cnt = conf->subvolume_cnt - 1; + + if (local->call_cnt == 0) { + local->layout = NULL; + dht_selfheal_directory (frame, dht_mkdir_selfheal_cbk, + &local->loc, layout); + } + for (i = 0; i < conf->subvolume_cnt; i++) { + if (conf->subvolumes[i] == hashed_subvol) + continue; + STACK_WIND (frame, dht_mkdir_cbk, + conf->subvolumes[i], + conf->subvolumes[i]->fops->mkdir, + &local->loc, local->mode); + } + return 0; +err: + DHT_STACK_UNWIND (frame, -1, op_errno, NULL, NULL); + return 0; +} + +int +dht_mkdir (call_frame_t *frame, xlator_t *this, + loc_t *loc, mode_t mode) +{ + dht_local_t *local = NULL; + dht_conf_t *conf = NULL; + int op_errno = -1; + int ret = -1; + xlator_t *hashed_subvol = NULL; + + + VALIDATE_OR_GOTO (frame, err); + VALIDATE_OR_GOTO (this, err); + VALIDATE_OR_GOTO (loc, err); + VALIDATE_OR_GOTO (loc->inode, err); + VALIDATE_OR_GOTO (loc->path, err); + + conf = this->private; + + local = dht_local_init (frame); + if (!local) { + gf_log (this->name, GF_LOG_ERROR, + "memory allocation failed :("); + op_errno = ENOMEM; + goto err; + } + + hashed_subvol = dht_subvol_get_hashed (this, loc); + + if (hashed_subvol == NULL) { + gf_log (this->name, GF_LOG_ERROR, + "hashed subvol not found"); + op_errno = EINVAL; + 
goto err; + } + + local->hashed_subvol = hashed_subvol; + local->inode = inode_ref (loc->inode); + ret = loc_copy (&local->loc, loc); + local->mode = mode; + + if (ret == -1) { + gf_log (this->name, GF_LOG_ERROR, + "memory allocation failed :("); + op_errno = ENOMEM; + goto err; + } + + local->layout = dht_layout_new (this, conf->subvolume_cnt); + if (!local->layout) { + gf_log (this->name, GF_LOG_ERROR, + "memory allocation failed :("); + op_errno = ENOMEM; + goto err; + } + + STACK_WIND (frame, dht_mkdir_hashed_cbk, + hashed_subvol, + hashed_subvol->fops->mkdir, + loc, mode); + + return 0; + +err: + op_errno = (op_errno == -1) ? errno : op_errno; + DHT_STACK_UNWIND (frame, -1, op_errno, NULL, NULL); + + return 0; +} + + +int +dht_rmdir_selfheal_cbk (call_frame_t *frame, void *cookie, xlator_t *this, + int op_ret, int op_errno) +{ + dht_local_t *local = NULL; + + local = frame->local; + local->layout = NULL; + + DHT_STACK_UNWIND (frame, local->op_ret, local->op_errno); + + return 0; +} + + +int +dht_rmdir_cbk (call_frame_t *frame, void *cookie, xlator_t *this, + int op_ret, int op_errno) +{ + uint64_t tmp_layout = 0; + dht_local_t *local = NULL; + int this_call_cnt = 0; + call_frame_t *prev = NULL; + dht_layout_t *layout = NULL; + + local = frame->local; + prev = cookie; + + LOCK (&frame->lock); + { + if (op_ret == -1) { + local->op_errno = op_errno; + local->op_ret = -1; + + if (op_errno != ENOENT) + local->need_selfheal = 1; + + gf_log (this->name, GF_LOG_ERROR, + "rmdir on %s for %s failed (%s)", + prev->this->name, local->loc.path, + strerror (op_errno)); + goto unlock; + } + } +unlock: + UNLOCK (&frame->lock); + + + this_call_cnt = dht_frame_return (frame); + if (is_last_call (this_call_cnt)) { + if (local->need_selfheal) { + inode_ctx_get (local->loc.inode, this, + &tmp_layout); + layout = (dht_layout_t *)(long)tmp_layout; + + /* TODO: neater interface needed below */ + local->stbuf.st_mode = local->loc.inode->st_mode; + + dht_selfheal_restore (frame, 
dht_rmdir_selfheal_cbk, + &local->loc, layout); + } else { + DHT_STACK_UNWIND (frame, local->op_ret, + local->op_errno); + } + } + + return 0; +} + + +int +dht_rmdir_do (call_frame_t *frame, xlator_t *this) +{ + dht_local_t *local = NULL; + dht_conf_t *conf = NULL; + int i = 0; + + conf = this->private; + local = frame->local; + + if (local->op_ret == -1) + goto err; + + local->call_cnt = conf->subvolume_cnt; + + for (i = 0; i < conf->subvolume_cnt; i++) { + STACK_WIND (frame, dht_rmdir_cbk, + conf->subvolumes[i], + conf->subvolumes[i]->fops->rmdir, + &local->loc); + } + + return 0; + +err: + DHT_STACK_UNWIND (frame, local->op_ret, local->op_errno); + return 0; +} + + +int +dht_rmdir_readdir_cbk (call_frame_t *frame, void *cookie, xlator_t *this, + int op_ret, int op_errno, gf_dirent_t *entries) +{ + dht_local_t *local = NULL; + int this_call_cnt = -1; + call_frame_t *prev = NULL; + + local = frame->local; + prev = cookie; + + if (op_ret > 2) { + gf_log (this->name, GF_LOG_DEBUG, + "readdir on %s for %s returned %d entries", + prev->this->name, local->loc.path, op_ret); + local->op_ret = -1; + local->op_errno = ENOTEMPTY; + } + + this_call_cnt = dht_frame_return (frame); + + if (is_last_call (this_call_cnt)) { + dht_rmdir_do (frame, this); + } + + return 0; +} + + +int +dht_rmdir_opendir_cbk (call_frame_t *frame, void *cookie, xlator_t *this, + int op_ret, int op_errno, fd_t *fd) +{ + dht_local_t *local = NULL; + int this_call_cnt = -1; + call_frame_t *prev = NULL; + + + local = frame->local; + prev = cookie; + + if (op_ret == -1) { + gf_log (this->name, GF_LOG_ERROR, + "opendir on %s for %s failed (%s)", + prev->this->name, local->loc.path, + strerror (op_errno)); + goto err; + } + + STACK_WIND (frame, dht_rmdir_readdir_cbk, + prev->this, prev->this->fops->readdir, + local->fd, 4096, 0); + + return 0; + +err: + this_call_cnt = dht_frame_return (frame); + + if (is_last_call (this_call_cnt)) { + dht_rmdir_do (frame, this); + } + + return 0; +} + + +int +dht_rmdir 
(call_frame_t *frame, xlator_t *this, loc_t *loc)
{
        /*
         * rmdir fop, probe phase: a copy of loc and a new fd for its inode
         * are stored in the frame-local state, then opendir is wound to
         * every subvolume in conf->subvolumes.  dht_rmdir_opendir_cbk
         * continues from the replies.  local->op_ret starts at 0 and is
         * changed only if a probe records a failure.
         */
        int          op_errno = -1;
        int          ret      = -1;
        int          i        = -1;
        dht_conf_t  *conf     = NULL;
        dht_local_t *local    = NULL;

        VALIDATE_OR_GOTO (frame, err);
        VALIDATE_OR_GOTO (this, err);
        VALIDATE_OR_GOTO (loc, err);
        VALIDATE_OR_GOTO (loc->inode, err);
        VALIDATE_OR_GOTO (loc->path, err);

        conf = this->private;

        local = dht_local_init (frame);
        if (local == NULL) {
                gf_log (this->name, GF_LOG_ERROR,
                        "memory allocation failed :(");
                op_errno = ENOMEM;
                goto err;
        }

        local->call_cnt = conf->subvolume_cnt;
        local->op_ret   = 0;

        ret = loc_copy (&local->loc, loc);
        if (ret == -1) {
                gf_log (this->name, GF_LOG_ERROR,
                        "memory allocation failed :(");
                op_errno = ENOMEM;
                goto err;
        }

        local->fd = fd_create (local->loc.inode, frame->root->pid);
        if (local->fd == NULL) {
                gf_log (this->name, GF_LOG_ERROR,
                        "memory allocation failed :(");
                op_errno = ENOMEM;
                goto err;
        }

        for (i = 0; i < conf->subvolume_cnt; i++) {
                STACK_WIND (frame, dht_rmdir_opendir_cbk,
                            conf->subvolumes[i],
                            conf->subvolumes[i]->fops->opendir,
                            loc, local->fd);
        }

        return 0;

err:
        op_errno = (op_errno == -1) ?
errno : op_errno;
+        DHT_STACK_UNWIND (frame, -1, op_errno);
+
+        return 0;
+}
+
+/* xattrop callback: passes the subvolume's result straight up. */
+static int32_t
+dht_xattrop_cbk (call_frame_t *frame,
+                 void *cookie,
+                 xlator_t *this,
+                 int32_t op_ret,
+                 int32_t op_errno,
+                 dict_t *dict)
+{
+        DHT_STACK_UNWIND (frame, op_ret, op_errno, dict);
+        return 0;
+}
+/* xattrop by path: forwarded only to the subvolume cached for loc->inode.
+ * Unwinds with EINVAL when no subvolume is cached, ENOMEM when the local
+ * state cannot be allocated. */
+int32_t
+dht_xattrop (call_frame_t *frame,
+             xlator_t *this,
+             loc_t *loc,
+             gf_xattrop_flags_t flags,
+             dict_t *dict)
+{
+        xlator_t *subvol = NULL;
+        int op_errno = -1;
+        dht_local_t *local = NULL;
+
+        VALIDATE_OR_GOTO (frame, err);
+        VALIDATE_OR_GOTO (this, err);
+        VALIDATE_OR_GOTO (loc, err);
+        VALIDATE_OR_GOTO (loc->inode, err);
+        VALIDATE_OR_GOTO (loc->path, err);
+
+        subvol = dht_subvol_get_cached (this, loc->inode);
+        if (!subvol) {
+                gf_log (this->name, GF_LOG_ERROR,
+                        "no cached subvolume for path=%s", loc->path);
+                op_errno = EINVAL;
+                goto err;
+        }
+
+        local = dht_local_init (frame);
+        if (!local) {
+                op_errno = ENOMEM;
+                gf_log (this->name, GF_LOG_ERROR,
+                        "memory allocation failed :(");
+                goto err;
+        }
+
+        local->inode = inode_ref (loc->inode); /* reference dropped by dht_local_wipe() */
+        local->call_cnt = 1;
+
+        STACK_WIND (frame,
+                    dht_xattrop_cbk,
+                    subvol, subvol->fops->xattrop,
+                    loc, flags, dict);
+
+        return 0;
+
+err:
+        op_errno = (op_errno == -1) ?
errno : op_errno;
+        DHT_STACK_UNWIND (frame, -1, op_errno, NULL);
+
+        return 0;
+}
+/* fxattrop callback: passes the subvolume's result straight up. */
+static int32_t
+dht_fxattrop_cbk (call_frame_t *frame,
+                  void *cookie,
+                  xlator_t *this,
+                  int32_t op_ret,
+                  int32_t op_errno,
+                  dict_t *dict)
+{
+        DHT_STACK_UNWIND (frame, op_ret, op_errno, dict);
+        return 0;
+}
+/* xattrop by fd: forwarded only to the subvolume cached for fd->inode;
+ * unwinds with EINVAL when none is cached.  No dht_local_t is allocated. */
+int32_t
+dht_fxattrop (call_frame_t *frame,
+              xlator_t *this,
+              fd_t *fd,
+              gf_xattrop_flags_t flags,
+              dict_t *dict)
+{
+        xlator_t *subvol = NULL;
+        int op_errno = -1;
+
+        VALIDATE_OR_GOTO (frame, err);
+        VALIDATE_OR_GOTO (this, err);
+        VALIDATE_OR_GOTO (fd, err);
+
+        subvol = dht_subvol_get_cached (this, fd->inode);
+        if (!subvol) {
+                gf_log (this->name, GF_LOG_ERROR,
+                        "no cached subvolume for fd=%p", fd);
+                op_errno = EINVAL;
+                goto err;
+        }
+
+        STACK_WIND (frame,
+                    dht_fxattrop_cbk,
+                    subvol, subvol->fops->fxattrop,
+                    fd, flags, dict);
+
+        return 0;
+
+err:
+        op_errno = (op_errno == -1) ? errno : op_errno;
+        DHT_STACK_UNWIND (frame, -1, op_errno, NULL);
+
+        return 0;
+}
+
+/* inodelk callback: passes the subvolume's result straight up. */
+static int32_t
+dht_inodelk_cbk (call_frame_t *frame, void *cookie,
+                 xlator_t *this, int32_t op_ret, int32_t op_errno)
+
+{
+        DHT_STACK_UNWIND (frame, op_ret, op_errno);
+        return 0;
+}
+
+/* inodelk by path: forwarded only to the subvolume cached for loc->inode.
+ * Unwinds with EINVAL when no subvolume is cached, ENOMEM on allocation
+ * failure. */
+int32_t
+dht_inodelk (call_frame_t *frame, xlator_t *this,
+             loc_t *loc, int32_t cmd, struct flock *lock)
+{
+        xlator_t *subvol = NULL;
+        int op_errno = -1;
+        dht_local_t *local = NULL;
+
+
+        VALIDATE_OR_GOTO (frame, err);
+        VALIDATE_OR_GOTO (this, err);
+        VALIDATE_OR_GOTO (loc, err);
+        VALIDATE_OR_GOTO (loc->inode, err);
+        VALIDATE_OR_GOTO (loc->path, err);
+
+        subvol = dht_subvol_get_cached (this, loc->inode);
+        if (!subvol) {
+                gf_log (this->name, GF_LOG_ERROR,
+                        "no cached subvolume for path=%s", loc->path);
+                op_errno = EINVAL;
+                goto err;
+        }
+
+        local = dht_local_init (frame);
+        if (!local) {
+                op_errno = ENOMEM;
+                gf_log (this->name, GF_LOG_ERROR,
+                        "memory allocation failed :(");
+                goto err;
+        }
+
+        local->inode = inode_ref (loc->inode); /* reference dropped by dht_local_wipe() */
+        local->call_cnt = 1;
+
+        STACK_WIND (frame,
+
dht_inodelk_cbk,
+                    subvol, subvol->fops->inodelk,
+                    loc, cmd, lock);
+
+        return 0;
+
+err:
+        op_errno = (op_errno == -1) ? errno : op_errno;
+        DHT_STACK_UNWIND (frame, -1, op_errno);
+
+        return 0;
+}
+
+/* finodelk callback: passes the subvolume's result straight up. */
+static int32_t
+dht_finodelk_cbk (call_frame_t *frame, void *cookie,
+                  xlator_t *this, int32_t op_ret, int32_t op_errno)
+
+{
+        DHT_STACK_UNWIND (frame, op_ret, op_errno);
+        return 0;
+}
+
+/* inodelk by fd: forwarded only to the subvolume cached for fd->inode;
+ * unwinds with EINVAL when none is cached. */
+int32_t
+dht_finodelk (call_frame_t *frame, xlator_t *this,
+              fd_t *fd, int32_t cmd, struct flock *lock)
+{
+        xlator_t *subvol = NULL;
+        int op_errno = -1;
+
+        VALIDATE_OR_GOTO (frame, err);
+        VALIDATE_OR_GOTO (this, err);
+        VALIDATE_OR_GOTO (fd, err);
+
+        subvol = dht_subvol_get_cached (this, fd->inode);
+        if (!subvol) {
+                gf_log (this->name, GF_LOG_ERROR,
+                        "no cached subvolume for fd=%p", fd);
+                op_errno = EINVAL;
+                goto err;
+        }
+
+
+        STACK_WIND (frame,
+                    dht_finodelk_cbk,
+                    subvol, subvol->fops->finodelk,
+                    fd, cmd, lock);
+
+        return 0;
+
+err:
+        op_errno = (op_errno == -1) ? errno : op_errno;
+        DHT_STACK_UNWIND (frame, -1, op_errno);
+
+        return 0;
+}
+
+/* entrylk callback: passes the subvolume's result straight up. */
+static int32_t
+dht_entrylk_cbk (call_frame_t *frame, void *cookie,
+                 xlator_t *this, int32_t op_ret, int32_t op_errno)
+
+{
+        DHT_STACK_UNWIND (frame, op_ret, op_errno);
+        return 0;
+}
+/* entrylk by path: forwarded only to the subvolume cached for loc->inode.
+ * Unwinds with EINVAL when no subvolume is cached, ENOMEM on allocation
+ * failure. */
+int32_t
+dht_entrylk (call_frame_t *frame, xlator_t *this,
+             loc_t *loc, const char *basename,
+             entrylk_cmd cmd, entrylk_type type)
+{
+        xlator_t *subvol = NULL;
+        int op_errno = -1;
+        dht_local_t *local = NULL;
+
+        VALIDATE_OR_GOTO (frame, err);
+        VALIDATE_OR_GOTO (this, err);
+        VALIDATE_OR_GOTO (loc, err);
+        VALIDATE_OR_GOTO (loc->inode, err);
+        VALIDATE_OR_GOTO (loc->path, err);
+
+        subvol = dht_subvol_get_cached (this, loc->inode);
+        if (!subvol) {
+                gf_log (this->name, GF_LOG_ERROR,
+                        "no cached subvolume for path=%s", loc->path);
+                op_errno = EINVAL;
+                goto err;
+        }
+
+        local = dht_local_init (frame);
+        if (!local) {
+                op_errno = ENOMEM;
+                gf_log (this->name, GF_LOG_ERROR,
+                        "memory allocation failed :(");
+                goto err;
+        }
+
+
local->inode = inode_ref (loc->inode); /* reference dropped by dht_local_wipe() */
+        local->call_cnt = 1;
+
+        STACK_WIND (frame, dht_entrylk_cbk,
+                    subvol, subvol->fops->entrylk,
+                    loc, basename, cmd, type);
+
+        return 0;
+
+err:
+        op_errno = (op_errno == -1) ? errno : op_errno;
+        DHT_STACK_UNWIND (frame, -1, op_errno);
+
+        return 0;
+}
+/* fentrylk callback: passes the subvolume's result straight up. */
+static int32_t
+dht_fentrylk_cbk (call_frame_t *frame, void *cookie,
+                  xlator_t *this, int32_t op_ret, int32_t op_errno)
+
+{
+        DHT_STACK_UNWIND (frame, op_ret, op_errno);
+        return 0;
+}
+/* entrylk by fd: forwarded only to the subvolume cached for fd->inode;
+ * unwinds with EINVAL when none is cached. */
+int32_t
+dht_fentrylk (call_frame_t *frame, xlator_t *this,
+              fd_t *fd, const char *basename,
+              entrylk_cmd cmd, entrylk_type type)
+{
+        xlator_t *subvol = NULL;
+        int op_errno = -1;
+
+        VALIDATE_OR_GOTO (frame, err);
+        VALIDATE_OR_GOTO (this, err);
+        VALIDATE_OR_GOTO (fd, err);
+
+        subvol = dht_subvol_get_cached (this, fd->inode);
+        if (!subvol) {
+                gf_log (this->name, GF_LOG_ERROR,
+                        "no cached subvolume for fd=%p", fd);
+                op_errno = EINVAL;
+                goto err;
+        }
+
+        STACK_WIND (frame, dht_fentrylk_cbk,
+                    subvol, subvol->fops->fentrylk,
+                    fd, basename, cmd, type);
+
+        return 0;
+
+err:
+        op_errno = (op_errno == -1) ?
errno : op_errno;
+        DHT_STACK_UNWIND (frame, -1, op_errno);
+
+        return 0;
+}
+
+
+/*
+ * Release the layout stored in the inode context when the inode is
+ * forgotten.  Layouts flagged `preset` are left alone (dht_layouts_init()
+ * creates those and keeps them in conf->file_layouts).
+ * Always returns 0.
+ */
+int
+dht_forget (xlator_t *this, inode_t *inode)
+{
+        uint64_t tmp_layout = 0;
+        dht_layout_t *layout = NULL;
+
+        inode_ctx_get (inode, this, &tmp_layout);
+
+        /* Convert first, then test: the previous code tested `layout`
+           while it still held its NULL initializer, so it always returned
+           early and the layout was never freed. */
+        layout = (dht_layout_t *)(long)tmp_layout;
+        if (!layout)
+                return 0;
+
+        if (!layout->preset)
+                FREE (layout);
+
+        return 0;
+}
+
+
+
+/*
+ * Fill conf->subvolumes / conf->subvolume_cnt from this->children and
+ * allocate the zeroed per-subvolume status array.
+ * Returns 0 on success, -1 when an allocation fails.
+ */
+static int
+dht_init_subvolumes (xlator_t *this, dht_conf_t *conf)
+{
+        xlator_list_t *subvols = NULL;
+        int cnt = 0;
+
+
+        /* first pass: count the children */
+        for (subvols = this->children; subvols; subvols = subvols->next)
+                cnt++;
+
+        conf->subvolumes = CALLOC (cnt, sizeof (xlator_t *));
+        if (!conf->subvolumes) {
+                gf_log (this->name, GF_LOG_ERROR,
+                        "memory allocation failed :(");
+                return -1;
+        }
+        conf->subvolume_cnt = cnt;
+
+        /* second pass: record them in list order */
+        cnt = 0;
+        for (subvols = this->children; subvols; subvols = subvols->next)
+                conf->subvolumes[cnt++] = subvols->xlator;
+
+        conf->subvolume_status = CALLOC (cnt, sizeof (char));
+        if (!conf->subvolume_status) {
+                gf_log (this->name, GF_LOG_ERROR,
+                        "memory allocation failed :(");
+                return -1;
+        }
+
+        return 0;
+}
+
+
+int
+dht_notify (xlator_t *this, int event, void *data, ...)
+{
+        xlator_t *subvol = NULL;
+        int cnt = -1; /* index of the reporting subvolume; -1 = not found */
+        int i = -1;
+        dht_conf_t *conf = NULL;
+        int ret = -1;
+
+
+        conf = this->private;
+
+        switch (event) {
+        case GF_EVENT_CHILD_UP:
+                subvol = data;
+
+                conf->gen++; /* bumped on every CHILD_UP, before the subvolume is looked up */
+
+                for (i = 0; i < conf->subvolume_cnt; i++) {
+                        if (subvol == conf->subvolumes[i]) {
+                                cnt = i;
+                                break;
+                        }
+                }
+
+                if (cnt == -1) {
+                        gf_log (this->name, GF_LOG_ERROR,
+                                "got GF_EVENT_CHILD_UP bad subvolume %s",
+                                subvol->name);
+                        break;
+                }
+
+                LOCK (&conf->subvolume_lock);
+                {
+                        conf->subvolume_status[cnt] = 1;
+                }
+                UNLOCK (&conf->subvolume_lock);
+
+                break;
+
+        case GF_EVENT_CHILD_DOWN:
+                subvol = data;
+
+                for (i = 0; i < conf->subvolume_cnt; i++) {
+                        if (subvol == conf->subvolumes[i]) {
+                                cnt = i;
+                                break;
+                        }
+                }
+
+                if (cnt == -1) {
+                        gf_log (this->name, GF_LOG_ERROR,
+                                "got GF_EVENT_CHILD_DOWN bad subvolume %s",
+                                subvol->name);
+                        break;
+                }
+
+                LOCK (&conf->subvolume_lock);
+                {
+                        conf->subvolume_status[cnt] = 0;
+                }
+                UNLOCK (&conf->subvolume_lock);
+
+                break;
+        }
+        /* every event, handled above or not, is then passed to default_notify() */
+        ret = default_notify (this, event, data);
+
+        return ret;
+}
+
diff --git a/xlators/cluster/dht/src/dht-common.h b/xlators/cluster/dht/src/dht-common.h
new file mode 100644
index 000000000..17017381b
--- /dev/null
+++ b/xlators/cluster/dht/src/dht-common.h
@@ -0,0 +1,212 @@
+/*
+   Copyright (c) 2008, 2009 Z RESEARCH, Inc. <http://www.zresearch.com>
+   This file is part of GlusterFS.
+
+   GlusterFS is free software; you can redistribute it and/or modify
+   it under the terms of the GNU General Public License as published
+   by the Free Software Foundation; either version 3 of the License,
+   or (at your option) any later version.
+
+   GlusterFS is distributed in the hope that it will be useful, but
+   WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+   General Public License for more details.
+
+   You should have received a copy of the GNU General Public License
+   along with this program.
If not, see
+   <http://www.gnu.org/licenses/>.
+*/
+
+#ifndef _CONFIG_H
+#define _CONFIG_H
+#include "config.h"
+#endif
+
+#ifndef _DHT_H
+#define _DHT_H
+
+/* Completion callback type taken by dht_selfheal_directory() and
+ * dht_selfheal_restore(). */
+typedef int (*dht_selfheal_dir_cbk_t) (call_frame_t *frame, void *cookie,
+                                       xlator_t *this,
+                                       int32_t op_ret, int32_t op_errno);
+
+/* In-memory layout: `cnt` hash ranges, each owned by one subvolume. */
+struct dht_layout {
+        int cnt; /* number of entries in list[] */
+        int preset; /* 1 for layouts built by dht_layouts_init(); dht_forget() frees only when 0 */
+        int gen;
+        int type; /* hash type handed to dht_hash_compute() */
+        struct {
+                int err; /* 0 = normal
+                            -1 = dir exists and no xattr
+                            >0 = dir lookup failed with errno
+                         */
+                uint32_t start; /* first hash value of the range (inclusive in dht_layout_search) */
+                uint32_t stop; /* last hash value of the range (inclusive in dht_layout_search) */
+                xlator_t *xlator;
+        } list[0];
+};
+typedef struct dht_layout dht_layout_t;
+
+/* Per-call state hung off frame->local; created by dht_local_init() and
+ * released by dht_local_wipe(). */
+struct dht_local {
+        int call_cnt; /* outstanding replies; decremented under frame->lock by dht_frame_return() */
+        loc_t loc;
+        loc_t loc2;
+        int op_ret;
+        int op_errno;
+        int layout_mismatch;
+        struct stat stbuf;
+        struct statvfs statvfs;
+        fd_t *fd;
+        inode_t *inode;
+        dict_t *xattr;
+        dict_t *xattr_req;
+        dht_layout_t *layout;
+        size_t size;
+        ino_t st_ino;
+        xlator_t *src_hashed, *src_cached;
+        xlator_t *dst_hashed, *dst_cached;
+        xlator_t *cached_subvol;
+        xlator_t *hashed_subvol;
+        char need_selfheal;
+        struct {
+                fop_mknod_cbk_t linkfile_cbk;
+                struct stat stbuf;
+                loc_t loc;
+                inode_t *inode;
+                dict_t *xattr;
+                xlator_t *srcvol;
+        } linkfile;
+        struct {
+                uint32_t hole_cnt;
+                uint32_t overlaps_cnt;
+                uint32_t missing;
+                uint32_t down;
+                uint32_t misc;
+                dht_selfheal_dir_cbk_t dir_cbk;
+                dht_layout_t *layout;
+        } selfheal;
+
+        /* needed by nufa */
+        int32_t flags;
+        mode_t mode;
+        dev_t rdev;
+};
+typedef struct dht_local dht_local_t;
+
+/* Translator-wide configuration, reached through this->private. */
+struct dht_conf {
+        gf_lock_t subvolume_lock; /* taken around accesses to subvolume_status[] */
+        int subvolume_cnt;
+        xlator_t **subvolumes;
+        xlator_t *local_volume; /* Needed by NUFA */
+        char *subvolume_status; /* per subvolume: 1 after CHILD_UP, 0 after CHILD_DOWN (dht_notify) */
+        dht_layout_t **file_layouts; /* one preset single-entry layout per subvolume */
+        dht_layout_t **dir_layouts;
+        dht_layout_t *default_dir_layout;
+        gf_boolean_t search_unhashed;
+        int gen; /* incremented on every GF_EVENT_CHILD_UP */
+};
+typedef struct dht_conf dht_conf_t;
+
+/* NOTE(review): presumably mirrors the layout record stored in the
+ * "trusted.glusterfs.dht" xattr -- confirm against the callers. */
+struct dht_disk_layout {
+        uint32_t cnt;
+        uint32_t type;
+        struct {
+                uint32_t start;
+                uint32_t stop;
+        } list[1];
+};
+typedef struct dht_disk_layout dht_disk_layout_t;
+
+#define ENTRY_MISSING(op_ret, op_errno) (op_ret == -1 && op_errno == ENOENT)
+/* True when loc->path is exactly "/". */
+#define is_fs_root(loc) (strcmp (loc->path, "/") == 0)
+/* Relies on a variable named `this` being in scope at the call site. */
+#define is_revalidate(loc) (inode_ctx_get (loc->inode, this, NULL) == 0)
+/* `cnt` is the value returned by dht_frame_return(). */
+#define is_last_call(cnt) (cnt == 0)
+/* A link file is recognised by mode bits (file type masked off) equal to S_ISVTX. */
+#define DHT_LINKFILE_MODE (S_ISVTX)
+#define check_is_linkfile(i,s,x) ((s->st_mode & ~S_IFMT) == DHT_LINKFILE_MODE)
+
+#define check_is_dir(i,s,x) (S_ISDIR(s->st_mode))
+
+#define layout_is_sane(layout) ((layout) && (layout->cnt > 0))
+/* Detach frame->local, unwind, then free the detached state. */
+#define DHT_STACK_UNWIND(frame, params ...) do {              \
+                dht_local_t *__local = NULL;                  \
+                __local = frame->local;                       \
+                frame->local = NULL;                          \
+                STACK_UNWIND (frame, params);                 \
+                dht_local_wipe (__local);                     \
+        } while (0)
+/* Same pattern for a frame that is destroyed rather than unwound. */
+#define DHT_STACK_DESTROY(frame) do {                         \
+                dht_local_t *__local = NULL;                  \
+                __local = frame->local;                       \
+                frame->local = NULL;                          \
+                STACK_DESTROY (frame->root);                  \
+                dht_local_wipe (__local);                     \
+        } while (0)
+/* layout helpers */
+dht_layout_t *dht_layout_new (xlator_t *this, int cnt);
+dht_layout_t *dht_layout_get (xlator_t *this, inode_t *inode);
+dht_layout_t *dht_layout_for_subvol (xlator_t *this, xlator_t *subvol);
+xlator_t *dht_layout_search (xlator_t *this, dht_layout_t *layout,
+                             const char *name);
+int dht_layout_normalize (xlator_t *this, loc_t *loc, dht_layout_t *layout);
+int dht_layout_anomalies (xlator_t *this, loc_t *loc, dht_layout_t *layout,
+                          uint32_t *holes_p, uint32_t *overlaps_p,
+                          uint32_t *missing_p, uint32_t *down_p,
+                          uint32_t *misc_p);
+int dht_layout_dir_mismatch (xlator_t *this, dht_layout_t *layout,
+                             xlator_t *subvol, loc_t *loc, dict_t *xattr);
+/* link file helpers */
+xlator_t *dht_linkfile_subvol (xlator_t *this, inode_t *inode,
+                               struct stat *buf, dict_t *xattr);
+int dht_linkfile_unlink (call_frame_t *frame, xlator_t *this,
+                         xlator_t *subvol, loc_t *loc);
+
+int dht_layouts_init (xlator_t *this, dht_conf_t *conf);
+int dht_layout_merge (xlator_t *this, dht_layout_t *layout, xlator_t *subvol,
+                      int op_ret, int op_errno, dict_t *xattr);
+/* conversion between in-memory layout entries and the network-byte-order disk record */
+int dht_disk_layout_extract (xlator_t
*this, dht_layout_t *layout,
+                             int pos, int32_t **disk_layout_p);
+int dht_disk_layout_merge (xlator_t *this, dht_layout_t *layout,
+                           int pos, int32_t *disk_layout);
+
+/* decrements local->call_cnt under frame->lock and returns the new value */
+int dht_frame_return (call_frame_t *frame);
+/* y = x * subvolume_cnt + index of subvol; dht_deitransform() is the inverse */
+int dht_itransform (xlator_t *this, xlator_t *subvol, uint64_t x, uint64_t *y);
+int dht_deitransform (xlator_t *this, uint64_t y, xlator_t **subvol,
+                      uint64_t *x);
+/* per-call state attached to frame->local */
+void dht_local_wipe (dht_local_t *local);
+dht_local_t *dht_local_init (call_frame_t *frame);
+int dht_stat_merge (xlator_t *this, struct stat *to, struct stat *from,
+                    xlator_t *subvol);
+/* subvolume lookup helpers */
+xlator_t *dht_subvol_get_hashed (xlator_t *this, loc_t *loc);
+xlator_t *dht_subvol_get_cached (xlator_t *this, inode_t *inode);
+xlator_t *dht_subvol_next (xlator_t *this, xlator_t *prev);
+int dht_subvol_cnt (xlator_t *this, xlator_t *subvol);
+/* returns 0 and stores the hash in *hash_p, or -1 for an unknown hash type */
+int dht_hash_compute (int type, const char *name, uint32_t *hash_p);
+
+int dht_linkfile_create (call_frame_t *frame, fop_mknod_cbk_t linkfile_cbk,
+                         xlator_t *tovol, xlator_t *fromvol, loc_t *loc);
+int
+dht_selfheal_directory (call_frame_t *frame, dht_selfheal_dir_cbk_t cbk,
+                        loc_t *loc, dht_layout_t *layout);
+int
+dht_selfheal_restore (call_frame_t *frame, dht_selfheal_dir_cbk_t cbk,
+                      loc_t *loc, dht_layout_t *layout);
+
+int dht_rename (call_frame_t *frame, xlator_t *this,
+                loc_t *oldloc, loc_t *newloc);
+#endif /* _DHT_H */
diff --git a/xlators/cluster/dht/src/dht-hashfn-tea.c b/xlators/cluster/dht/src/dht-hashfn-tea.c
new file mode 100644
index 000000000..8437b4955
--- /dev/null
+++ b/xlators/cluster/dht/src/dht-hashfn-tea.c
@@ -0,0 +1,146 @@
+/*
+   Copyright (c) 2008 Z RESEARCH, Inc. <http://www.zresearch.com>
+   This file is part of GlusterFS.
+
+   GlusterFS is free software; you can redistribute it and/or modify
+   it under the terms of the GNU General Public License as published
+   by the Free Software Foundation; either version 3 of the License,
+   or (at your option) any later version.
+
+   GlusterFS is distributed in the hope that it will be useful, but
+   WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+   General Public License for more details.
+
+   You should have received a copy of the GNU General Public License
+   along with this program. If not, see
+   <http://www.gnu.org/licenses/>.
+*/
+
+
+#include <stdint.h>
+#include <stdio.h>
+#include <string.h>
+
+
+#define DELTA 0x9E3779B9
+#define FULLROUNDS 10 /* 32 is overkill, 16 is strong crypto */
+#define PARTROUNDS 6 /* 6 gets complete mixing */
+
+/* Mix the four words of `array` into the running state (*h0, *h1) with
+ * `rounds` iterations.  The loop is do/while, so `rounds` must be >= 1.
+ * Always returns 0. */
+static int
+tearound (int rounds, uint32_t *array, uint32_t *h0, uint32_t *h1)
+{
+        uint32_t sum = 0;
+        int n = 0;
+        uint32_t b0 = 0;
+        uint32_t b1 = 0;
+
+        b0 = *h0;
+        b1 = *h1;
+
+        n = rounds;
+
+        do {
+                sum += DELTA;
+                b0 += ((b1 << 4) + array[0])
+                        ^ (b1 + sum)
+                        ^ ((b1 >> 5) + array[1]);
+                b1 += ((b0 << 4) + array[2])
+                        ^ (b0 + sum)
+                        ^ ((b0 >> 5) + array[3]);
+        } while (--n);
+
+        *h0 += b0;
+        *h1 += b1;
+
+        return 0;
+}
+
+/* Build the padding word: `len` is OR-ed with itself shifted by 8, and
+ * that result is OR-ed with itself shifted by 16. */
+uint32_t
+__pad (int len)
+{
+        uint32_t pad = 0;
+
+        pad = (uint32_t) len | ((uint32_t) len << 8);
+        pad |= pad << 16;
+
+        return pad;
+}
+
+/* Hash `len` bytes of `msg`: full 16-byte groups are mixed with PARTROUNDS,
+ * a final partial group (if any) is padded and mixed with FULLROUNDS.
+ * Returns h0 ^ h1.
+ * NOTE(review): msg is read through a uint32_t pointer, so the result
+ * presumably depends on host byte order and alignment -- confirm. */
+uint32_t
+dht_hashfn_tea (const char *msg, int len)
+{
+        uint32_t h0 = 0x9464a485;
+        uint32_t h1 = 0x542e1a94;
+        uint32_t array[4];
+        uint32_t pad = 0;
+        int i = 0;
+        int j = 0;
+        int full_quads = 0;
+        int full_words = 0;
+        int full_bytes = 0;
+        uint32_t *intmsg = NULL;
+        int word = 0;
+
+
+        intmsg = (uint32_t *) msg;
+        pad = __pad (len);
+
+        full_bytes = len; /* bytes not yet consumed */
+        full_words = len / 4; /* whole 4-byte words not yet consumed */
+        full_quads = len / 16; /* whole 16-byte groups */
+
+        for (i = 0; i < full_quads; i++) {
+                for (j = 0; j < 4; j++) {
+                        word = *intmsg;
+                        array[j] = word;
+                        intmsg++;
+                        full_words--;
+                        full_bytes -= 4;
+                }
+                tearound (PARTROUNDS, &array[0], &h0, &h1);
+        }
+
+        if ((len % 16) == 0) {
+                goto done;
+        }
+
+        for (j = 0; j < 4; j++) {
+                if (full_words) {
+                        word = *intmsg;
+                        array[j] = word;
+                        intmsg++;
+                        full_words--;
+                        full_bytes -= 4;
+                } else {
+                        array[j]
= pad; /* start from the pad word, then shift in any leftover bytes */
+                        while (full_bytes) {
+                                array[j] <<= 8;
+                                array[j] |= msg[len - full_bytes]; /* NOTE(review): msg is plain char; bytes >= 0x80 may sign-extend here depending on the platform -- confirm before touching, any change alters hash values */
+                                full_bytes--;
+                        }
+                }
+        }
+        tearound (FULLROUNDS, &array[0], &h0, &h1);
+
+done:
+        return h0 ^ h1;
+}
+
+/* Disabled test driver.  NOTE(review): it calls tea(), which is not
+ * defined in the visible source. */
+#if 0
+int
+main (int argc, char *argv[])
+{
+        int i = 0;
+        int hashval = 0;
+
+        for (i = 1; i < argc; i++) {
+                hashval = tea (argv[i], strlen (argv[i]));
+                printf ("%s: %x\n", argv[i], hashval);
+        }
+}
+#endif
diff --git a/xlators/cluster/dht/src/dht-hashfn.c b/xlators/cluster/dht/src/dht-hashfn.c
new file mode 100644
index 000000000..9e321a43c
--- /dev/null
+++ b/xlators/cluster/dht/src/dht-hashfn.c
@@ -0,0 +1,88 @@
+/*
+   Copyright (c) 2008 Z RESEARCH, Inc. <http://www.zresearch.com>
+   This file is part of GlusterFS.
+
+   GlusterFS is free software; you can redistribute it and/or modify
+   it under the terms of the GNU General Public License as published
+   by the Free Software Foundation; either version 3 of the License,
+   or (at your option) any later version.
+
+   GlusterFS is distributed in the hope that it will be useful, but
+   WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+   General Public License for more details.
+
+   You should have received a copy of the GNU General Public License
+   along with this program. If not, see
+   <http://www.gnu.org/licenses/>.
+*/ + +#ifndef _CONFIG_H +#define _CONFIG_H +#include "config.h" +#endif + + +#include "glusterfs.h" +#include "xlator.h" +#include "dht-common.h" + + +uint32_t dht_hashfn_tea (const char *name, int len); + + +typedef enum { + DHT_HASH_TYPE_TEA, +} dht_hashfn_type_t; + + +int +dht_hash_compute_internal (int type, const char *name, uint32_t *hash_p) +{ + int ret = 0; + uint32_t hash = 0; + + switch (type) { + case DHT_HASH_TYPE_TEA: + hash = dht_hashfn_tea (name, strlen (name)); + break; + default: + ret = -1; + break; + } + + if (ret == 0) { + *hash_p = hash; + } + + return ret; +} + + +#define MAKE_RSYNC_FRIENDLY_NAME(rsync_frndly_name, name) do { \ + rsync_frndly_name = (char *) name; \ + if (name[0] == '.') { \ + char *dot = 0; \ + int namelen = 0; \ + \ + dot = strrchr (name, '.'); \ + if (dot && dot > (name + 1) && *(dot + 1)) { \ + namelen = (dot - name); \ + rsync_frndly_name = alloca (namelen); \ + strncpy (rsync_frndly_name, name + 1, \ + namelen); \ + rsync_frndly_name[namelen - 1] = 0; \ + } \ + } \ + } while (0); + + +int +dht_hash_compute (int type, const char *name, uint32_t *hash_p) +{ + char *rsync_friendly_name = NULL; + + MAKE_RSYNC_FRIENDLY_NAME (rsync_friendly_name, name); + + return dht_hash_compute_internal (type, rsync_friendly_name, hash_p); +} diff --git a/xlators/cluster/dht/src/dht-helper.c b/xlators/cluster/dht/src/dht-helper.c new file mode 100644 index 000000000..52d072002 --- /dev/null +++ b/xlators/cluster/dht/src/dht-helper.c @@ -0,0 +1,326 @@ +/* + Copyright (c) 2008 Z RESEARCH, Inc. <http://www.zresearch.com> + This file is part of GlusterFS. + + GlusterFS is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published + by the Free Software Foundation; either version 3 of the License, + or (at your option) any later version. 
+
+   GlusterFS is distributed in the hope that it will be useful, but
+   WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+   General Public License for more details.
+
+   You should have received a copy of the GNU General Public License
+   along with this program. If not, see
+   <http://www.gnu.org/licenses/>.
+*/
+
+#ifndef _CONFIG_H
+#define _CONFIG_H
+#include "config.h"
+#endif
+
+
+#include "glusterfs.h"
+#include "xlator.h"
+#include "dht-common.h"
+
+/* Decrement local->call_cnt under frame->lock and return the new value
+ * (0 means this was the last outstanding reply).  Returns -1 for a NULL
+ * frame. */
+int
+dht_frame_return (call_frame_t *frame)
+{
+        dht_local_t *local = NULL;
+        int this_call_cnt = -1;
+
+        if (!frame)
+                return -1;
+
+        local = frame->local;
+
+        LOCK (&frame->lock);
+        {
+                this_call_cnt = --local->call_cnt;
+        }
+        UNLOCK (&frame->lock);
+
+        return this_call_cnt;
+}
+
+/* Fold the index of `subvol` into `x`: *y_p = x * subvolume_cnt + index.
+ * (uint64_t) -1 is passed through unchanged.  Always returns 0. */
+int
+dht_itransform (xlator_t *this, xlator_t *subvol, uint64_t x, uint64_t *y_p)
+{
+        dht_conf_t *conf = NULL;
+        int cnt = 0;
+        int max = 0;
+        uint64_t y = 0;
+
+
+        if (x == ((uint64_t) -1)) {
+                y = (uint64_t) -1;
+                goto out;
+        }
+
+        conf = this->private;
+
+        max = conf->subvolume_cnt;
+        cnt = dht_subvol_cnt (this, subvol); /* -1 when subvol is not one of conf->subvolumes */
+
+        y = ((x * max) + cnt);
+
+out:
+        if (y_p)
+                *y_p = y;
+
+        return 0;
+}
+
+/* Inverse of dht_itransform(): split `y` into the subvolume at index
+ * y % subvolume_cnt and the original value y / subvolume_cnt.
+ * Always returns 0. */
+int
+dht_deitransform (xlator_t *this, uint64_t y, xlator_t **subvol_p,
+                  uint64_t *x_p)
+{
+        dht_conf_t *conf = NULL;
+        int cnt = 0;
+        int max = 0;
+        uint64_t x = 0;
+        xlator_t *subvol = 0;
+
+
+        conf = this->private;
+        max = conf->subvolume_cnt;
+
+        cnt = y % max;
+        x = y / max;
+
+        subvol = conf->subvolumes[cnt];
+
+        if (subvol_p)
+                *subvol_p = subvol;
+
+        if (x_p)
+                *x_p = x;
+
+        return 0;
+}
+
+/* Release everything a dht_local_t holds, then the struct itself.
+ * A NULL `local` is a no-op. */
+void
+dht_local_wipe (dht_local_t *local)
+{
+        if (!local)
+                return;
+
+        loc_wipe (&local->loc);
+        loc_wipe (&local->loc2);
+
+        if (local->xattr)
+                dict_unref (local->xattr);
+
+        if (local->inode)
+                inode_unref (local->inode);
+
+        if (local->layout)
+                FREE (local->layout);
+
+        loc_wipe (&local->linkfile.loc);
+
+        if (local->linkfile.xattr)
+                dict_unref
(local->linkfile.xattr);
+
+        if (local->linkfile.inode)
+                inode_unref (local->linkfile.inode);
+
+        if (local->fd) {
+                fd_unref (local->fd);
+                local->fd = NULL;
+        }
+
+        if (local->xattr_req)
+                dict_unref (local->xattr_req);
+
+        FREE (local);
+}
+
+/* Allocate a zeroed dht_local_t, preset op_ret/op_errno to -1/EUCLEAN and
+ * attach it to frame->local.  Returns NULL on allocation failure. */
+dht_local_t *
+dht_local_init (call_frame_t *frame)
+{
+        dht_local_t *local = NULL;
+
+        /* TODO: use mem-pool */
+        local = CALLOC (1, sizeof (*local));
+
+        if (!local)
+                return NULL;
+
+        local->op_ret = -1;
+        local->op_errno = EUCLEAN;
+
+        frame->local = local;
+
+        return local;
+}
+
+/* Return a pointer just past the last '/' in `str`, or NULL when `str`
+ * contains no '/'. */
+char *
+basestr (const char *str)
+{
+        char *basestr = NULL;
+
+        basestr = strrchr (str, '/');
+        if (basestr)
+                basestr ++;
+
+        return basestr;
+}
+/* Return the first subvolume whose status byte is non-zero, scanned under
+ * conf->subvolume_lock, or NULL when none is up. */
+xlator_t *
+dht_first_up_child (xlator_t *this)
+{
+        dht_conf_t *conf = NULL;
+        xlator_t *child = NULL;
+        int i = 0;
+
+        conf = this->private;
+
+        LOCK (&conf->subvolume_lock);
+        {
+                for (i = 0; i < conf->subvolume_cnt; i++) {
+                        if (conf->subvolume_status[i]) {
+                                child = conf->subvolumes[i];
+                                break;
+                        }
+                }
+        }
+        UNLOCK (&conf->subvolume_lock);
+
+        return child;
+}
+/* Return the subvolume that loc->name hashes to in the parent's layout.
+ * For "/" the first up child is used instead.  Returns NULL when the
+ * parent has no layout or no range matches. */
+xlator_t *
+dht_subvol_get_hashed (xlator_t *this, loc_t *loc)
+{
+        dht_layout_t *layout = NULL;
+        xlator_t *subvol = NULL;
+
+        if (is_fs_root (loc)) {
+                subvol = dht_first_up_child (this);
+                goto out;
+        }
+
+        layout = dht_layout_get (this, loc->parent);
+
+        if (!layout) {
+                gf_log (this->name, GF_LOG_ERROR,
+                        "layout missing path=%s parent=%"PRId64,
+                        loc->path, loc->parent->ino);
+                goto out;
+        }
+
+        subvol = dht_layout_search (this, layout, loc->name);
+
+        if (!subvol) {
+                gf_log (this->name, GF_LOG_ERROR,
+                        "could not find subvolume for path=%s",
+                        loc->path);
+                goto out;
+        }
+
+out:
+        return subvol;
+}
+
+/* Return the subvolume in the first entry of the inode's layout, or NULL
+ * when the inode has no layout. */
+xlator_t *
+dht_subvol_get_cached (xlator_t *this, inode_t *inode)
+{
+        dht_layout_t *layout = NULL;
+        xlator_t *subvol = NULL;
+
+
+        layout = dht_layout_get (this, inode);
+
+        if (!layout) {
+                goto out;
+        }
+
+        subvol = layout->list[0].xlator;
+
+out:
+        return subvol;
+}
+
+/* Return the subvolume listed after `prev` in conf->subvolumes, or NULL
+ * when `prev` is the last one or is not found. */
+xlator_t *
+dht_subvol_next (xlator_t
*this, xlator_t *prev)
+{
+        dht_conf_t *conf = NULL;
+        int i = 0;
+        xlator_t *next = NULL;
+
+        conf = this->private;
+
+        for (i = 0; i < conf->subvolume_cnt; i++) {
+                if (conf->subvolumes[i] == prev) {
+                        if ((i + 1) < conf->subvolume_cnt)
+                                next = conf->subvolumes[i + 1];
+                        break;
+                }
+        }
+
+        return next;
+}
+
+/* Despite the name, returns the index of `subvol` in conf->subvolumes,
+ * or -1 when it is not listed. */
+int
+dht_subvol_cnt (xlator_t *this, xlator_t *subvol)
+{
+        int i = 0;
+        int ret = -1;
+        dht_conf_t *conf = NULL;
+
+
+        conf = this->private;
+
+        for (i = 0; i < conf->subvolume_cnt; i++) {
+                if (subvol == conf->subvolumes[i]) {
+                        ret = i;
+                        break;
+                }
+        }
+
+        return ret;
+}
+
+/* Raise `a` to `b` when `b` is larger; both arguments are evaluated more
+ * than once. */
+#define set_if_greater(a, b) do {               \
+                if ((a) < (b))                  \
+                        (a) = (b);              \
+        } while (0)
+/* Merge one subvolume's stat into `to`: st_size and st_blocks accumulate,
+ * the three timestamps keep the latest value, st_ino is passed through
+ * dht_itransform(), every other field is overwritten.  Always returns 0.
+ * NOTE(review): &to->st_ino is handed to a uint64_t * parameter -- confirm
+ * ino_t is 64 bits wide on every supported build. */
+int
+dht_stat_merge (xlator_t *this, struct stat *to,
+                struct stat *from, xlator_t *subvol)
+{
+        to->st_dev = from->st_dev;
+
+        dht_itransform (this, subvol, from->st_ino, &to->st_ino);
+
+        to->st_mode = from->st_mode;
+        to->st_nlink = from->st_nlink;
+        to->st_uid = from->st_uid;
+        to->st_gid = from->st_gid;
+        to->st_rdev = from->st_rdev;
+        to->st_size += from->st_size;
+        to->st_blksize = from->st_blksize;
+        to->st_blocks += from->st_blocks;
+
+        set_if_greater (to->st_atime, from->st_atime);
+        set_if_greater (to->st_mtime, from->st_mtime);
+        set_if_greater (to->st_ctime, from->st_ctime);
+
+        return 0;
+}
diff --git a/xlators/cluster/dht/src/dht-layout.c b/xlators/cluster/dht/src/dht-layout.c
new file mode 100644
index 000000000..08b4a2746
--- /dev/null
+++ b/xlators/cluster/dht/src/dht-layout.c
@@ -0,0 +1,543 @@
+/*
+   Copyright (c) 2008, 2009 Z RESEARCH, Inc. <http://www.zresearch.com>
+   This file is part of GlusterFS.
+
+   GlusterFS is free software; you can redistribute it and/or modify
+   it under the terms of the GNU General Public License as published
+   by the Free Software Foundation; either version 3 of the License,
+   or (at your option) any later version.
+
+   GlusterFS is distributed in the hope that it will be useful, but
+   WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+   General Public License for more details.
+
+   You should have received a copy of the GNU General Public License
+   along with this program. If not, see
+   <http://www.gnu.org/licenses/>.
+*/
+
+#ifndef _CONFIG_H
+#define _CONFIG_H
+#include "config.h"
+#endif
+
+
+#include "glusterfs.h"
+#include "xlator.h"
+#include "dht-common.h"
+#include "byte-order.h"
+/* size of the fixed part of dht_layout_t */
+#define layout_base_size (sizeof (dht_layout_t))
+/* size of one list[] entry */
+#define layout_entry_size (sizeof ((dht_layout_t *)NULL)->list[0])
+/* bytes needed for a layout with `cnt` entries */
+#define layout_size(cnt) (layout_base_size + (cnt * layout_entry_size))
+
+/* Allocate a zeroed layout with room for `cnt` entries and set
+ * layout->cnt.  Returns NULL on allocation failure. */
+dht_layout_t *
+dht_layout_new (xlator_t *this, int cnt)
+{
+        dht_layout_t *layout = NULL;
+
+
+        layout = CALLOC (1, layout_size (cnt));
+        if (!layout) {
+                gf_log (this->name, GF_LOG_ERROR,
+                        "memory allocation failed :(");
+                goto out;
+        }
+
+        layout->cnt = cnt;
+
+out:
+        return layout;
+}
+
+/* Return the layout stored in the inode context for this translator.
+ * The inode_ctx_get() result is not checked; the value stays 0 (NULL)
+ * unless the call fills it in. */
+dht_layout_t *
+dht_layout_get (xlator_t *this, inode_t *inode)
+{
+        uint64_t layout = 0;
+        int ret = -1;
+
+        ret = inode_ctx_get (inode, this, &layout);
+
+        return (dht_layout_t *)(long)layout;
+}
+
+/* Hash `name` with the layout's hash type and return the subvolume of the
+ * first entry whose [start, stop] range contains the hash.  Returns NULL
+ * when hashing fails or no range matches. */
+xlator_t *
+dht_layout_search (xlator_t *this, dht_layout_t *layout, const char *name)
+{
+        uint32_t hash = 0;
+        xlator_t *subvol = NULL;
+        int i = 0;
+        int ret = 0;
+
+
+        ret = dht_hash_compute (layout->type, name, &hash);
+        if (ret != 0) {
+                gf_log (this->name, GF_LOG_ERROR,
+                        "hash computation failed for type=%d name=%s",
+                        layout->type, name);
+                goto out;
+        }
+
+        for (i = 0; i < layout->cnt; i++) {
+                if (layout->list[i].start <= hash
+                    && layout->list[i].stop >= hash) {
+                        subvol = layout->list[i].xlator;
+                        break;
+                }
+        }
+
+        if (!subvol) {
+                gf_log (this->name, GF_LOG_DEBUG,
+                        "no subvolume for hash (value) = %u", hash);
+        }
+
+out:
+        return subvol;
+}
+
+/* Return the preset file layout belonging to `subvol`, or NULL when
+ * `subvol` is not one of conf->subvolumes. */
+dht_layout_t *
+dht_layout_for_subvol (xlator_t *this, xlator_t *subvol)
+{
+        dht_conf_t *conf = NULL;
+        dht_layout_t *layout = NULL;
+        int i = 0;
+
+
+        conf = this->private;
+
+        for (i = 0; i < conf->subvolume_cnt; i++) {
+                if (conf->subvolumes[i] == subvol) {
+                        layout = conf->file_layouts[i];
+                        break;
+                }
+        }
+
+        return layout;
+}
+
+
+/*
+ * Build conf->file_layouts: one single-entry layout per subvolume, flagged
+ * `preset` so that dht_forget() never frees it.
+ * Returns 0 on success, -1 on allocation failure (layouts created before
+ * the failure stay in conf->file_layouts).
+ */
+int
+dht_layouts_init (xlator_t *this, dht_conf_t *conf)
+{
+        dht_layout_t *layout = NULL;
+        int i = 0;
+        int ret = -1;
+
+
+        conf->file_layouts = CALLOC (conf->subvolume_cnt,
+                                     sizeof (dht_layout_t *));
+        if (!conf->file_layouts) {
+                gf_log (this->name, GF_LOG_ERROR,
+                        "memory allocation failed :(");
+                goto out;
+        }
+
+        for (i = 0; i < conf->subvolume_cnt; i++) {
+                layout = dht_layout_new (this, 1);
+
+                if (!layout) {
+                        goto out;
+                }
+
+                layout->preset = 1;
+
+                layout->list[0].xlator = conf->subvolumes[i];
+
+                conf->file_layouts[i] = layout;
+        }
+
+        ret = 0;
+out:
+        return ret;
+}
+
+
+/*
+ * Serialise entry `pos` of `layout` into a freshly allocated buffer of
+ * network-byte-order words: { count = 1, type, start, stop }.
+ * On success returns 0 and, when disk_layout_p is non-NULL, hands the
+ * buffer to the caller through *disk_layout_p (the caller then owns it).
+ * Returns -1 on allocation failure.
+ */
+int
+dht_disk_layout_extract (xlator_t *this, dht_layout_t *layout,
+                         int pos, int32_t **disk_layout_p)
+{
+        int ret = -1;
+        int32_t *disk_layout = NULL;
+
+        disk_layout = CALLOC (5, sizeof (int));
+        if (!disk_layout) {
+                gf_log (this->name, GF_LOG_ERROR,
+                        "memory allocation failed :(");
+                goto out;
+        }
+
+        disk_layout[0] = hton32 (1);
+        disk_layout[1] = hton32 (layout->type);
+        disk_layout[2] = hton32 (layout->list[pos].start);
+        disk_layout[3] = hton32 (layout->list[pos].stop);
+
+        if (disk_layout_p)
+                *disk_layout_p = disk_layout;
+        else
+                FREE (disk_layout); /* nobody takes ownership: do not leak it */
+        ret = 0;
+
+out:
+        return ret;
+}
+
+
+int
+dht_disk_layout_merge (xlator_t *this, dht_layout_t *layout,
+                       int pos, int32_t *disk_layout)
+{
+        int cnt = 0;
+        int type = 0;
+        int start_off = 0;
+        int stop_off = 0;
+
+
+        /* TODO: assert disk_layout_ptr is of required length */
+
+        cnt = ntoh32 (disk_layout[0]);
+        if (cnt != 1) {
+                gf_log (this->name, GF_LOG_ERROR,
+                        "disk layout has invalid count %d", cnt);
+                return -1;
+        }
+
+        /* TODO: assert type is compatible */
+        type = ntoh32 (disk_layout[1]);
+        start_off = ntoh32 (disk_layout[2]);
+        stop_off = ntoh32 (disk_layout[3]);
+
+
layout->list[pos].start = start_off;
+        layout->list[pos].stop = stop_off;
+
+        gf_log (this->name, GF_LOG_DEBUG,
+                "merged to layout: %u - %u (type %d) from %s",
+                start_off, stop_off, type,
+                layout->list[pos].xlator->name);
+
+        return 0;
+}
+
+
+/*
+ * Record one subvolume's lookup/mkdir reply in the first unused entry of
+ * `layout` (an entry is unused while its xlator pointer is NULL).
+ *
+ *   op_ret != 0        : entry.err = op_errno, returns 0
+ *   no/unreadable xattr: entry.err = -1, returns 0
+ *   xattr present      : range merged via dht_disk_layout_merge(),
+ *                        entry.err = 0 on success
+ *
+ * Returns -1 when the on-disk layout cannot be merged, or when a
+ * successful reply arrives and the layout has no unused entry left.
+ */
+int
+dht_layout_merge (xlator_t *this, dht_layout_t *layout, xlator_t *subvol,
+                  int op_ret, int op_errno, dict_t *xattr)
+{
+        int i = 0;
+        int ret = -1;
+        int err = -1;
+        int32_t *disk_layout = NULL;
+
+
+        if (op_ret != 0) {
+                err = op_errno;
+        }
+
+        for (i = 0; i < layout->cnt; i++) {
+                if (layout->list[i].xlator == NULL) {
+                        layout->list[i].err = err;
+                        layout->list[i].xlator = subvol;
+                        break;
+                }
+        }
+
+        if (op_ret != 0) {
+                ret = 0;
+                goto out;
+        }
+
+        if (i == layout->cnt) {
+                /* No unused entry was found, so `i` is one past the end of
+                   list[]; bail out before it is used as an index below. */
+                gf_log (this->name, GF_LOG_ERROR,
+                        "no free layout entry for subvolume %s",
+                        subvol->name);
+                ret = -1;
+                goto out;
+        }
+
+        if (xattr) {
+                /* during lookup and not mkdir */
+                ret = dict_get_ptr (xattr, "trusted.glusterfs.dht",
+                                    VOID(&disk_layout));
+        }
+
+        if (ret != 0) {
+                layout->list[i].err = -1;
+                gf_log (this->name, GF_LOG_DEBUG,
+                        "missing disk layout on %s. err = %d",
+                        subvol->name, err);
+                ret = 0;
+                goto out;
+        }
+
+        ret = dht_disk_layout_merge (this, layout, i, disk_layout);
+        if (ret != 0) {
+                gf_log (this->name, GF_LOG_ERROR,
+                        "layout merge from subvolume %s failed",
+                        subvol->name);
+                goto out;
+        }
+        layout->list[i].err = 0;
+
+out:
+        return ret;
+}
+
+
+/* Exchange entries `i` and `j` of layout->list field by field. */
+void
+dht_layout_entry_swap (dht_layout_t *layout, int i, int j)
+{
+        uint32_t start_swap = 0;
+        uint32_t stop_swap = 0;
+        xlator_t *xlator_swap = 0;
+        int err_swap = 0;
+
+
+        start_swap = layout->list[i].start;
+        stop_swap = layout->list[i].stop;
+        xlator_swap = layout->list[i].xlator;
+        err_swap = layout->list[i].err;
+
+        layout->list[i].start = layout->list[j].start;
+        layout->list[i].stop = layout->list[j].stop;
+        layout->list[i].xlator = layout->list[j].xlator;
+        layout->list[i].err = layout->list[j].err;
+
+        layout->list[j].start = start_swap;
+        layout->list[j].stop = stop_swap;
+        layout->list[j].xlator = xlator_swap;
+        layout->list[j].err = err_swap;
+}
+
+
+int64_t
+dht_layout_entry_cmp (dht_layout_t *layout, int i,
int j) +{ + int64_t diff = 0; + + if (layout->list[i].err || layout->list[j].err) + diff = layout->list[i].err - layout->list[j].err; + else + diff = (int64_t) layout->list[i].start + - (int64_t) layout->list[j].start; + + return diff; +} + + +int +dht_layout_sort (dht_layout_t *layout) +{ + int i = 0; + int j = 0; + int64_t ret = 0; + + /* TODO: O(n^2) -- bad bad */ + + for (i = 0; i < layout->cnt - 1; i++) { + for (j = i + 1; j < layout->cnt; j++) { + ret = dht_layout_entry_cmp (layout, i, j); + if (ret > 0) + dht_layout_entry_swap (layout, i, j); + } + } + + return 0; +} + + +int +dht_layout_anomalies (xlator_t *this, loc_t *loc, dht_layout_t *layout, + uint32_t *holes_p, uint32_t *overlaps_p, + uint32_t *missing_p, uint32_t *down_p, uint32_t *misc_p) +{ + dht_conf_t *conf = NULL; + uint32_t holes = 0; + uint32_t overlaps = 0; + uint32_t missing = 0; + uint32_t down = 0; + uint32_t misc = 0; + uint32_t hole_cnt = 0; + uint32_t overlap_cnt = 0; + int i = 0; + int ret = 0; + uint32_t prev_stop = 0; + uint32_t last_stop = 0; + char is_virgin = 1; + + + conf = this->private; + + /* TODO: explain WTF is happening */ + + last_stop = layout->list[0].start - 1; + prev_stop = last_stop; + + for (i = 0; i < layout->cnt; i++) { + if (layout->list[i].err) { + switch (layout->list[i].err) { + case -1: + case ENOENT: + missing++; + break; + case ENOTCONN: + down++; + break; + default: + misc++; + } + continue; + } + + is_virgin = 0; + + if ((prev_stop + 1) < layout->list[i].start) { + hole_cnt++; + holes += (layout->list[i].start - (prev_stop + 1)); + } + + if ((prev_stop + 1) > layout->list[i].start) { + overlap_cnt++; + overlaps += ((prev_stop + 1) - layout->list[i].start); + } + prev_stop = layout->list[i].stop; + } + + if ((last_stop - prev_stop) || is_virgin) + hole_cnt++; + holes += (last_stop - prev_stop); + + if (holes_p) + *holes_p = hole_cnt; + + if (overlaps_p) + *overlaps_p = overlap_cnt; + + if (missing_p) + *missing_p = missing; + + if (down_p) + *down_p = down; 
+ + if (misc_p) + *misc_p = misc; + + return ret; +} + + +int +dht_layout_normalize (xlator_t *this, loc_t *loc, dht_layout_t *layout) +{ + int ret = 0; + uint32_t holes = 0; + uint32_t overlaps = 0; + uint32_t missing = 0; + uint32_t down = 0; + uint32_t misc = 0; + + + ret = dht_layout_sort (layout); + if (ret == -1) { + gf_log (this->name, GF_LOG_ERROR, + "sort failed?! how the ...."); + goto out; + } + + ret = dht_layout_anomalies (this, loc, layout, + &holes, &overlaps, + &missing, &down, &misc); + if (ret == -1) { + gf_log (this->name, GF_LOG_ERROR, + "error while finding anomalies in %s -- not good news", + loc->path); + goto out; + } + + if (holes || overlaps) { + if (missing == layout->cnt) { + gf_log (this->name, GF_LOG_WARNING, + "directory %s looked up first time", + loc->path); + } else { + gf_log (this->name, GF_LOG_ERROR, + "found anomalies in %s. holes=%d overlaps=%d", + loc->path, holes, overlaps); + } + ret = 1; + } + +out: + return ret; +} + + +int +dht_layout_dir_mismatch (xlator_t *this, dht_layout_t *layout, xlator_t *subvol, + loc_t *loc, dict_t *xattr) +{ + int idx = 0; + int pos = -1; + int ret = -1; + int32_t *disk_layout = NULL; + int32_t count = -1; + uint32_t start_off = -1; + uint32_t stop_off = -1; + + + for (idx = 0; idx < layout->cnt; idx++) { + if (layout->list[idx].xlator == subvol) { + pos = idx; + break; + } + } + + if (pos == -1) { + gf_log (this->name, GF_LOG_DEBUG, + "%s - no layout info for subvolume %s", + loc->path, subvol->name); + ret = 1; + goto out; + } + + if (xattr == NULL) { + gf_log (this->name, GF_LOG_ERROR, + "%s - xattr dictionary is NULL", + loc->path); + ret = -1; + goto out; + } + + ret = dict_get_ptr (xattr, "trusted.glusterfs.dht", + VOID(&disk_layout)); + + if (ret < 0) { + gf_log (this->name, GF_LOG_ERROR, + "%s - disk layout missing", loc->path); + ret = -1; + goto out; + } + + count = ntoh32 (disk_layout[0]); + if (count != 1) { + gf_log (this->name, GF_LOG_ERROR, + "%s - disk layout has invalid count 
%d", + loc->path, count); + ret = -1; + goto out; + } + + start_off = ntoh32 (disk_layout[2]); + stop_off = ntoh32 (disk_layout[3]); + + if ((layout->list[pos].start != start_off) + || (layout->list[pos].stop != stop_off)) { + gf_log (this->name, GF_LOG_DEBUG, + "subvol: %s; inode layout - %"PRId32" - %"PRId32"; " + "disk layout - %"PRId32" - %"PRId32, + layout->list[pos].xlator->name, + layout->list[pos].start, layout->list[pos].stop, + start_off, stop_off); + ret = 1; + } else { + ret = 0; + } +out: + return ret; +} + diff --git a/xlators/cluster/dht/src/dht-linkfile.c b/xlators/cluster/dht/src/dht-linkfile.c new file mode 100644 index 000000000..9cc24ccf6 --- /dev/null +++ b/xlators/cluster/dht/src/dht-linkfile.c @@ -0,0 +1,224 @@ +/* + Copyright (c) 2008 Z RESEARCH, Inc. <http://www.zresearch.com> + This file is part of GlusterFS. + + GlusterFS is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published + by the Free Software Foundation; either version 3 of the License, + or (at your option) any later version. + + GlusterFS is distributed in the hope that it will be useful, but + WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program. If not, see + <http://www.gnu.org/licenses/>. 
+*/ + +#ifndef _CONFIG_H +#define _CONFIG_H +#include "config.h" +#endif + + +#include "glusterfs.h" +#include "xlator.h" +#include "compat.h" +#include "dht-common.h" + + + +int +dht_linkfile_xattr_cbk (call_frame_t *frame, void *cookie, xlator_t *this, + int op_ret, int op_errno) +{ + dht_local_t *local = NULL; + + + local = frame->local; + local->linkfile.linkfile_cbk (frame, cookie, this, op_ret, op_errno, + local->linkfile.inode, + &local->linkfile.stbuf); + + return 0; +} + + +int +dht_linkfile_create_cbk (call_frame_t *frame, void *cookie, xlator_t *this, + int op_ret, int op_errno, + inode_t *inode, struct stat *stbuf) +{ + dht_local_t *local = NULL; + call_frame_t *prev = NULL; + dict_t *xattr = NULL; + data_t *str_data = NULL; + int ret = -1; + + local = frame->local; + prev = cookie; + + if (op_ret == -1) + goto err; + + xattr = get_new_dict (); + if (!xattr) { + gf_log (this->name, GF_LOG_ERROR, + "memory allocation failed :("); + op_errno = ENOMEM; + goto err; + } + + local->linkfile.xattr = dict_ref (xattr); + local->linkfile.inode = inode_ref (inode); + + str_data = str_to_data (local->linkfile.srcvol->name); + if (!str_data) { + gf_log (this->name, GF_LOG_ERROR, + "memory allocation failed :("); + op_errno = ENOMEM; + goto err; + } + + ret = dict_set (xattr, "trusted.glusterfs.dht.linkto", str_data); + if (ret < 0) { + gf_log (this->name, GF_LOG_ERROR, + "failed to initialize linkfile data"); + op_errno = EINVAL; + } + str_data = NULL; + + local->linkfile.stbuf = *stbuf; + + STACK_WIND (frame, dht_linkfile_xattr_cbk, + prev->this, prev->this->fops->setxattr, + &local->linkfile.loc, local->linkfile.xattr, 0); + + return 0; + +err: + if (str_data) { + data_destroy (str_data); + str_data = NULL; + } + + local->linkfile.linkfile_cbk (frame, cookie, this, + op_ret, op_errno, inode, stbuf); + return 0; +} + + +int +dht_linkfile_create (call_frame_t *frame, fop_mknod_cbk_t linkfile_cbk, + xlator_t *tovol, xlator_t *fromvol, loc_t *loc) +{ + dht_local_t 
*local = NULL; + + + local = frame->local; + local->linkfile.linkfile_cbk = linkfile_cbk; + local->linkfile.srcvol = tovol; + loc_copy (&local->linkfile.loc, loc); + + STACK_WIND (frame, dht_linkfile_create_cbk, + fromvol, fromvol->fops->mknod, loc, + S_IFREG | DHT_LINKFILE_MODE, 0); + + return 0; +} + + +int +dht_linkfile_unlink_cbk (call_frame_t *frame, void *cookie, xlator_t *this, + int32_t op_ret, int32_t op_errno) +{ + dht_local_t *local = NULL; + call_frame_t *prev = NULL; + xlator_t *subvol = NULL; + + local = frame->local; + prev = cookie; + subvol = prev->this; + + if (op_ret == -1) { + gf_log (this->name, GF_LOG_WARNING, + "unlinking linkfile %s on %s failed (%s)", + local->loc.path, subvol->name, strerror (op_errno)); + } + + DHT_STACK_DESTROY (frame); + + return 0; +} + + +int +dht_linkfile_unlink (call_frame_t *frame, xlator_t *this, + xlator_t *subvol, loc_t *loc) +{ + call_frame_t *unlink_frame = NULL; + dht_local_t *unlink_local = NULL; + + unlink_frame = copy_frame (frame); + if (!unlink_frame) { + gf_log (this->name, GF_LOG_ERROR, + "memory allocation failed :("); + goto err; + } + + unlink_local = dht_local_init (unlink_frame); + if (!unlink_local) { + gf_log (this->name, GF_LOG_ERROR, + "memory allocation failed :("); + goto err; + } + + loc_copy (&unlink_local->loc, loc); + + STACK_WIND (unlink_frame, dht_linkfile_unlink_cbk, + subvol, subvol->fops->unlink, + &unlink_local->loc); + + return 0; +err: + if (unlink_frame) + DHT_STACK_DESTROY (unlink_frame); + + return -1; +} + + +xlator_t * +dht_linkfile_subvol (xlator_t *this, inode_t *inode, struct stat *stbuf, + dict_t *xattr) +{ + dht_conf_t *conf = NULL; + xlator_t *subvol = NULL; + void *volname = NULL; + int i = 0, ret = 0; + + + conf = this->private; + + if (!xattr) + goto out; + + ret = dict_get_ptr (xattr, "trusted.glusterfs.dht.linkto", &volname); + + if ((-1 == ret) || !volname) + goto out; + + for (i = 0; i < conf->subvolume_cnt; i++) { + if (strcmp (conf->subvolumes[i]->name, (char 
*)volname) == 0) { + subvol = conf->subvolumes[i]; + break; + } + } + +out: + return subvol; +} + + diff --git a/xlators/cluster/dht/src/dht-rename.c b/xlators/cluster/dht/src/dht-rename.c new file mode 100644 index 000000000..e5532f1bc --- /dev/null +++ b/xlators/cluster/dht/src/dht-rename.c @@ -0,0 +1,562 @@ +/* + Copyright (c) 2008 Z RESEARCH, Inc. <http://www.zresearch.com> + This file is part of GlusterFS. + + GlusterFS is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published + by the Free Software Foundation; either version 3 of the License, + or (at your option) any later version. + + GlusterFS is distributed in the hope that it will be useful, but + WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program. If not, see + <http://www.gnu.org/licenses/>. +*/ + +/* TODO: link(oldpath, newpath) fails if newpath already exists. DHT should + * delete the newpath if it gets EEXISTS from link() call. 
+ */ +#ifndef _CONFIG_H +#define _CONFIG_H +#include "config.h" +#endif + +#include "glusterfs.h" +#include "xlator.h" +#include "dht-common.h" +#include "defaults.h" + + +int +dht_rename_dir_cbk (call_frame_t *frame, void *cookie, xlator_t *this, + int32_t op_ret, int32_t op_errno, struct stat *stbuf) +{ + dht_local_t *local = NULL; + int this_call_cnt = 0; + call_frame_t *prev = NULL; + + + local = frame->local; + prev = cookie; + + if (op_ret == -1) { + /* TODO: undo the damage */ + + gf_log (this->name, GF_LOG_ERROR, + "rename %s -> %s on %s failed (%s)", + local->loc.path, local->loc2.path, + prev->this->name, strerror (op_errno)); + + local->op_ret = op_ret; + local->op_errno = op_errno; + } else { + /* TODO: construct proper stbuf for dir */ + local->stbuf = *stbuf; + } + + this_call_cnt = dht_frame_return (frame); + if (is_last_call (this_call_cnt)) { + DHT_STACK_UNWIND (frame, local->op_ret, local->op_errno, + &local->stbuf); + } + + return 0; +} + + + +int +dht_rename_dir_do (call_frame_t *frame, xlator_t *this) +{ + dht_local_t *local = NULL; + dht_conf_t *conf = NULL; + int i = 0; + + conf = this->private; + local = frame->local; + + if (local->op_ret == -1) + goto err; + + local->call_cnt = conf->subvolume_cnt; + local->op_ret = 0; + + for (i = 0; i < conf->subvolume_cnt; i++) { + STACK_WIND (frame, dht_rename_dir_cbk, + conf->subvolumes[i], + conf->subvolumes[i]->fops->rename, + &local->loc, &local->loc2); + } + + return 0; + +err: + DHT_STACK_UNWIND (frame, local->op_ret, local->op_errno); + return 0; +} + + +int +dht_rename_readdir_cbk (call_frame_t *frame, void *cookie, xlator_t *this, + int op_ret, int op_errno, gf_dirent_t *entries) +{ + dht_local_t *local = NULL; + int this_call_cnt = -1; + call_frame_t *prev = NULL; + + local = frame->local; + prev = cookie; + + if (op_ret > 2) { + gf_log (this->name, GF_LOG_DEBUG, + "readdir on %s for %s returned %d entries", + prev->this->name, local->loc.path, op_ret); + local->op_ret = -1; + local->op_errno 
= ENOTEMPTY; + } + + this_call_cnt = dht_frame_return (frame); + + if (is_last_call (this_call_cnt)) { + dht_rename_dir_do (frame, this); + } + + return 0; +} + + +int +dht_rename_opendir_cbk (call_frame_t *frame, void *cookie, xlator_t *this, + int op_ret, int op_errno, fd_t *fd) +{ + dht_local_t *local = NULL; + int this_call_cnt = -1; + call_frame_t *prev = NULL; + + + local = frame->local; + prev = cookie; + + if (op_ret == -1) { + gf_log (this->name, GF_LOG_ERROR, + "opendir on %s for %s failed (%s)", + prev->this->name, local->loc.path, + strerror (op_errno)); + goto err; + } + + STACK_WIND (frame, dht_rename_readdir_cbk, + prev->this, prev->this->fops->readdir, + local->fd, 4096, 0); + + return 0; + +err: + this_call_cnt = dht_frame_return (frame); + + if (is_last_call (this_call_cnt)) { + dht_rename_dir_do (frame, this); + } + + return 0; +} + + +int +dht_rename_dir (call_frame_t *frame, xlator_t *this) +{ + dht_conf_t *conf = NULL; + dht_local_t *local = NULL; + int i = 0; + int op_errno = -1; + + + conf = frame->this->private; + local = frame->local; + + local->call_cnt = conf->subvolume_cnt; + + local->fd = fd_create (local->loc.inode, frame->root->pid); + if (!local->fd) { + gf_log (this->name, GF_LOG_ERROR, + "memory allocation failed :("); + op_errno = ENOMEM; + goto err; + } + + local->op_ret = 0; + + if (!local->dst_cached) { + dht_rename_dir_do (frame, this); + return 0; + } + + for (i = 0; i < conf->subvolume_cnt; i++) { + STACK_WIND (frame, dht_rename_opendir_cbk, + conf->subvolumes[i], + conf->subvolumes[i]->fops->opendir, + &local->loc2, local->fd); + } + + return 0; + +err: + op_errno = (op_errno == -1) ? 
errno : op_errno; + DHT_STACK_UNWIND (frame, -1, op_errno, NULL); + return 0; +} + + +int +dht_rename_unlink_cbk (call_frame_t *frame, void *cookie, xlator_t *this, + int32_t op_ret, int32_t op_errno) +{ + dht_local_t *local = NULL; + call_frame_t *prev = NULL; + int this_call_cnt = 0; + + local = frame->local; + prev = cookie; + + this_call_cnt = dht_frame_return (frame); + + if (op_ret == -1) { + gf_log (this->name, GF_LOG_WARNING, + "unlink on %s failed (%s)", + prev->this->name, strerror (op_errno)); + } + + if (is_last_call (this_call_cnt)) + DHT_STACK_UNWIND (frame, local->op_ret, local->op_errno, + &local->stbuf); + + return 0; +} + + +int +dht_rename_cbk (call_frame_t *frame, void *cookie, xlator_t *this, + int32_t op_ret, int32_t op_errno, struct stat *stbuf) +{ + dht_local_t *local = NULL; + call_frame_t *prev = NULL; + xlator_t *src_hashed = NULL; + xlator_t *src_cached = NULL; + xlator_t *dst_hashed = NULL; + xlator_t *dst_cached = NULL; + xlator_t *rename_subvol = NULL; + + local = frame->local; + prev = cookie; + + src_hashed = local->src_hashed; + src_cached = local->src_cached; + dst_hashed = local->dst_hashed; + dst_cached = local->dst_cached; + + if (op_ret == -1) { + gf_log (this->name, GF_LOG_DEBUG, + "rename on %s failed (%s)", prev->this->name, + strerror (op_errno)); + local->op_ret = op_ret; + local->op_errno = op_errno; + goto unwind; + } + + /* NOTE: rename_subvol is the same subvolume from which dht_rename_cbk + * is called. since rename has already happened on rename_subvol, + * unlink should not be sent for oldpath (either linkfile or cached-file) + * on rename_subvol. 
*/ + if (src_cached == dst_cached) + rename_subvol = src_cached; + else + rename_subvol = dst_hashed; + + /* TODO: delete files in background */ + + if (src_cached != dst_hashed && src_cached != dst_cached) + local->call_cnt++; + + if (src_hashed != rename_subvol && src_hashed != src_cached) + local->call_cnt++; + + if (dst_cached && dst_cached != dst_hashed && dst_cached != src_cached) + local->call_cnt++; + + if (local->call_cnt == 0) + goto unwind; + + if (src_cached != dst_hashed && src_cached != dst_cached) { + gf_log (this->name, GF_LOG_DEBUG, + "deleting old src datafile %s @ %s", + local->loc.path, src_cached->name); + + STACK_WIND (frame, dht_rename_unlink_cbk, + src_cached, src_cached->fops->unlink, + &local->loc); + } + + if (src_hashed != rename_subvol && src_hashed != src_cached) { + gf_log (this->name, GF_LOG_DEBUG, + "deleting old src linkfile %s @ %s", + local->loc.path, src_hashed->name); + + STACK_WIND (frame, dht_rename_unlink_cbk, + src_hashed, src_hashed->fops->unlink, + &local->loc); + } + + if (dst_cached + && (dst_cached != dst_hashed) + && (dst_cached != src_cached)) { + gf_log (this->name, GF_LOG_DEBUG, + "deleting old dst datafile %s @ %s", + local->loc2.path, dst_cached->name); + + STACK_WIND (frame, dht_rename_unlink_cbk, + dst_cached, dst_cached->fops->unlink, + &local->loc2); + } + return 0; + +unwind: + DHT_STACK_UNWIND (frame, local->op_ret, local->op_errno, + &local->stbuf); + + return 0; +} + + +int +dht_do_rename (call_frame_t *frame) +{ + dht_local_t *local = NULL; + xlator_t *dst_hashed = NULL; + xlator_t *src_cached = NULL; + xlator_t *dst_cached = NULL; + xlator_t *this = NULL; + xlator_t *rename_subvol = NULL; + + + local = frame->local; + this = frame->this; + + dst_hashed = local->dst_hashed; + dst_cached = local->dst_cached; + src_cached = local->src_cached; + + if (src_cached == dst_cached) + rename_subvol = src_cached; + else + rename_subvol = dst_hashed; + + gf_log (this->name, GF_LOG_DEBUG, + "renaming %s => %s (%s)", 
+ local->loc.path, local->loc2.path, rename_subvol->name); + + STACK_WIND (frame, dht_rename_cbk, + rename_subvol, rename_subvol->fops->rename, + &local->loc, &local->loc2); + + return 0; +} + + +int +dht_rename_links_cbk (call_frame_t *frame, void *cookie, xlator_t *this, + int32_t op_ret, int32_t op_errno, + inode_t *inode, struct stat *stbuf) +{ + dht_local_t *local = NULL; + call_frame_t *prev = NULL; + int this_call_cnt = 0; + + + local = frame->local; + prev = cookie; + + if (op_ret == -1) { + gf_log (this->name, GF_LOG_DEBUG, + "link/file on %s failed (%s)", + prev->this->name, strerror (op_errno)); + local->op_ret = -1; + local->op_errno = op_errno; + } + + this_call_cnt = dht_frame_return (frame); + if (is_last_call (this_call_cnt)) { + if (local->op_ret == -1) + goto unwind; + + dht_do_rename (frame); + } + + return 0; + +unwind: + DHT_STACK_UNWIND (frame, local->op_ret, local->op_errno, + &local->stbuf); + + return 0; +} + + +int +dht_rename_create_links (call_frame_t *frame) +{ + dht_local_t *local = NULL; + xlator_t *this = NULL; + xlator_t *src_hashed = NULL; + xlator_t *src_cached = NULL; + xlator_t *dst_hashed = NULL; + xlator_t *dst_cached = NULL; + int call_cnt = 0; + + + local = frame->local; + this = frame->this; + + src_hashed = local->src_hashed; + src_cached = local->src_cached; + dst_hashed = local->dst_hashed; + dst_cached = local->dst_cached; + + if (src_cached == dst_cached) + goto nolinks; + + if (dst_hashed != src_hashed && dst_hashed != src_cached) + call_cnt++; + + if (src_cached != dst_hashed) + call_cnt++; + + local->call_cnt = call_cnt; + + if (dst_hashed != src_hashed && dst_hashed != src_cached) { + gf_log (this->name, GF_LOG_DEBUG, + "linkfile %s @ %s => %s", + local->loc.path, dst_hashed->name, src_cached->name); + dht_linkfile_create (frame, dht_rename_links_cbk, + src_cached, dst_hashed, &local->loc); + } + + if (src_cached != dst_hashed) { + gf_log (this->name, GF_LOG_DEBUG, + "link %s => %s (%s)", local->loc.path, + 
local->loc2.path, src_cached->name); + STACK_WIND (frame, dht_rename_links_cbk, + src_cached, src_cached->fops->link, + &local->loc, &local->loc2); + } + +nolinks: + if (!call_cnt) { + /* skip to next step */ + dht_do_rename (frame); + } + + return 0; +} + + +int +dht_rename (call_frame_t *frame, xlator_t *this, + loc_t *oldloc, loc_t *newloc) +{ + xlator_t *src_cached = NULL; + xlator_t *src_hashed = NULL; + xlator_t *dst_cached = NULL; + xlator_t *dst_hashed = NULL; + int op_errno = -1; + int ret = -1; + dht_local_t *local = NULL; + + + VALIDATE_OR_GOTO (frame, err); + VALIDATE_OR_GOTO (this, err); + VALIDATE_OR_GOTO (oldloc, err); + VALIDATE_OR_GOTO (newloc, err); + + src_hashed = dht_subvol_get_hashed (this, oldloc); + if (!src_hashed) { + gf_log (this->name, GF_LOG_ERROR, + "no subvolume in layout for path=%s", + oldloc->path); + op_errno = EINVAL; + goto err; + } + + src_cached = dht_subvol_get_cached (this, oldloc->inode); + if (!src_cached) { + gf_log (this->name, GF_LOG_ERROR, + "no cached subvolume for path=%s", oldloc->path); + op_errno = EINVAL; + goto err; + } + + dst_hashed = dht_subvol_get_hashed (this, newloc); + if (!dst_hashed) { + gf_log (this->name, GF_LOG_ERROR, + "no subvolume in layout for path=%s", + newloc->path); + op_errno = EINVAL; + goto err; + } + + if (newloc->inode) + dst_cached = dht_subvol_get_cached (this, newloc->inode); + + local = dht_local_init (frame); + if (!local) { + op_errno = ENOMEM; + gf_log (this->name, GF_LOG_ERROR, + "memory allocation failed :("); + goto err; + } + + ret = loc_copy (&local->loc, oldloc); + if (ret == -1) { + op_errno = ENOMEM; + gf_log (this->name, GF_LOG_ERROR, + "memory allocation failed :("); + goto err; + } + + ret = loc_copy (&local->loc2, newloc); + if (ret == -1) { + op_errno = ENOMEM; + gf_log (this->name, GF_LOG_ERROR, + "memory allocation failed :("); + goto err; + } + + local->src_hashed = src_hashed; + local->src_cached = src_cached; + local->dst_hashed = dst_hashed; + local->dst_cached 
= dst_cached; + + gf_log (this->name, GF_LOG_DEBUG, + "renaming %s (hash=%s/cache=%s) => %s (hash=%s/cache=%s)", + oldloc->path, src_hashed->name, src_cached->name, + newloc->path, dst_hashed->name, + dst_cached ? dst_cached->name : "<nul>"); + + if (S_ISDIR (oldloc->inode->st_mode)) { + dht_rename_dir (frame, this); + } else { + local->op_ret = 0; + dht_rename_create_links (frame); + } + + return 0; + +err: + op_errno = (op_errno == -1) ? errno : op_errno; + DHT_STACK_UNWIND (frame, -1, op_errno, NULL, NULL); + + return 0; +} diff --git a/xlators/cluster/dht/src/dht-selfheal.c b/xlators/cluster/dht/src/dht-selfheal.c new file mode 100644 index 000000000..ee32b2253 --- /dev/null +++ b/xlators/cluster/dht/src/dht-selfheal.c @@ -0,0 +1,460 @@ +/* + Copyright (c) 2008 Z RESEARCH, Inc. <http://www.zresearch.com> + This file is part of GlusterFS. + + GlusterFS is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published + by the Free Software Foundation; either version 3 of the License, + or (at your option) any later version. + + GlusterFS is distributed in the hope that it will be useful, but + WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program. If not, see + <http://www.gnu.org/licenses/>. 
+*/ + +#ifndef _CONFIG_H +#define _CONFIG_H +#include "config.h" +#endif + + +#include "glusterfs.h" +#include "xlator.h" +#include "dht-common.h" + + +int +dht_selfheal_dir_finish (call_frame_t *frame, xlator_t *this, int ret) +{ + dht_local_t *local = NULL; + + + local = frame->local; + local->selfheal.dir_cbk (frame, NULL, frame->this, ret, + local->op_errno); + + return 0; +} + + +int +dht_selfheal_dir_xattr_cbk (call_frame_t *frame, void *cookie, xlator_t *this, + int op_ret, int op_errno) +{ + dht_local_t *local = NULL; + call_frame_t *prev = NULL; + xlator_t *subvol = NULL; + int i = 0; + dht_layout_t *layout = NULL; + int err = 0; + int this_call_cnt = 0; + + local = frame->local; + layout = local->selfheal.layout; + prev = cookie; + subvol = prev->this; + + if (op_ret == 0) + err = 0; + else + err = op_errno; + + for (i = 0; i < layout->cnt; i++) { + if (layout->list[i].xlator == subvol) { + layout->list[i].err = err; + break; + } + } + + this_call_cnt = dht_frame_return (frame); + + if (is_last_call (this_call_cnt)) { + dht_selfheal_dir_finish (frame, this, 0); + } + + return 0; +} + + +int +dht_selfheal_dir_xattr_persubvol (call_frame_t *frame, loc_t *loc, + dht_layout_t *layout, int i) +{ + xlator_t *subvol = NULL; + dict_t *xattr = NULL; + int ret = 0; + xlator_t *this = NULL; + int32_t *disk_layout = NULL; + + + subvol = layout->list[i].xlator; + this = frame->this; + + xattr = get_new_dict (); + if (!xattr) { + gf_log (this->name, GF_LOG_ERROR, + "memory allocation failed :("); + goto err; + } + + ret = dht_disk_layout_extract (this, layout, i, &disk_layout); + if (ret == -1) { + gf_log (this->name, GF_LOG_ERROR, + "failed to extract disk layout"); + goto err; + } + + ret = dict_set_bin (xattr, "trusted.glusterfs.dht", + disk_layout, 4 * 4); + if (ret == -1) { + gf_log (this->name, GF_LOG_ERROR, + "failed to set xattr dictionary"); + goto err; + } + disk_layout = NULL; + + gf_log (this->name, GF_LOG_DEBUG, + "setting hash range %u - %u (type %d) on 
subvolume %s for %s", + layout->list[i].start, layout->list[i].stop, + layout->type, subvol->name, loc->path); + + dict_ref (xattr); + + STACK_WIND (frame, dht_selfheal_dir_xattr_cbk, + subvol, subvol->fops->setxattr, + loc, xattr, 0); + + dict_unref (xattr); + + return 0; + +err: + if (xattr) + dict_destroy (xattr); + + if (disk_layout) + FREE (disk_layout); + + dht_selfheal_dir_xattr_cbk (frame, subvol, frame->this, + -1, ENOMEM); + return 0; +} + + +int +dht_selfheal_dir_xattr (call_frame_t *frame, loc_t *loc, dht_layout_t *layout) +{ + dht_local_t *local = NULL; + int missing_xattr = 0; + int i = 0; + int ret = 0; + xlator_t *this = NULL; + + local = frame->local; + this = frame->this; + + for (i = 0; i < layout->cnt; i++) { + if (layout->list[i].err != -1 || !layout->list[i].stop) + continue; + /* attr missing and layout present */ + missing_xattr++; + } + + gf_log (this->name, GF_LOG_DEBUG, + "%d subvolumes missing xattr for %s", + missing_xattr, loc->path); + + if (missing_xattr == 0) { + dht_selfheal_dir_finish (frame, this, 0); + return 0; + } + + local->call_cnt = missing_xattr; + + for (i = 0; i < layout->cnt; i++) { + if (layout->list[i].err != -1 || !layout->list[i].stop) + continue; + + ret = dht_selfheal_dir_xattr_persubvol (frame, loc, layout, i); + + if (--missing_xattr == 0) + break; + } + return 0; +} + + +int +dht_selfheal_dir_mkdir_cbk (call_frame_t *frame, void *cookie, xlator_t *this, + int op_ret, int op_errno, + inode_t *inode, struct stat *stbuf) +{ + dht_local_t *local = NULL; + dht_layout_t *layout = NULL; + call_frame_t *prev = NULL; + xlator_t *subvol = NULL; + int i = 0; + int this_call_cnt = 0; + + + local = frame->local; + layout = local->selfheal.layout; + prev = cookie; + subvol = prev->this; + + if ((op_ret == 0) || (op_errno == EEXIST)) { + for (i = 0; i < layout->cnt; i++) { + if (layout->list[i].xlator == subvol) { + layout->list[i].err = -1; + break; + } + } + } + + this_call_cnt = dht_frame_return (frame); + + if 
(is_last_call (this_call_cnt)) { + dht_selfheal_dir_xattr (frame, &local->loc, layout); + } + + return 0; +} + + +int +dht_selfheal_dir_mkdir (call_frame_t *frame, loc_t *loc, + dht_layout_t *layout, int force) +{ + int missing_dirs = 0; + int i = 0; + dht_local_t *local = NULL; + xlator_t *this = NULL; + + + local = frame->local; + this = frame->this; + + for (i = 0; i < layout->cnt; i++) { + if (layout->list[i].err == ENOENT || force) + missing_dirs++; + } + + if (missing_dirs == 0) { + dht_selfheal_dir_xattr (frame, loc, layout); + return 0; + } + + local->call_cnt = missing_dirs; + for (i = 0; i < layout->cnt; i++) { + if (layout->list[i].err == ENOENT || force) { + gf_log (this->name, GF_LOG_DEBUG, + "creating directory %s on subvol %s", + loc->path, layout->list[i].xlator->name); + + STACK_WIND (frame, dht_selfheal_dir_mkdir_cbk, + layout->list[i].xlator, + layout->list[i].xlator->fops->mkdir, + loc, local->stbuf.st_mode); + } + } + + return 0; +} + +void +dht_selfheal_fix_this_virgin (call_frame_t *frame, loc_t *loc, + dht_layout_t *layout) +{ + dht_conf_t *conf = NULL; + xlator_t *this = NULL; + uint32_t chunk = 0; + int i = 0; + uint32_t start = 0; + int cnt = 0; + int err = 0; + + this = frame->this; + conf = this->private; + + for (i = 0; i < layout->cnt; i++) { + err = layout->list[i].err; + if (err == -1) { + cnt++; + } + } + + chunk = ((unsigned long) 0xffffffff) / cnt; + + start = 0; + for (i = 0; i < layout->cnt; i++) { + err = layout->list[i].err; + if (err == -1) { + layout->list[i].start = start; + layout->list[i].stop = start + chunk - 1; + + start = start + chunk; + + gf_log (this->name, GF_LOG_DEBUG, + "gave fix: %u - %u on %s for %s", + layout->list[i].start, layout->list[i].stop, + layout->list[i].xlator->name, loc->path); + if (--cnt == 0) { + layout->list[i].stop = 0xffffffff; + break; + } + } + } +} + + +int +dht_selfheal_dir_getafix (call_frame_t *frame, loc_t *loc, + dht_layout_t *layout) +{ + dht_conf_t *conf = NULL; + xlator_t *this = 
NULL; + dht_local_t *local = NULL; + int missing = -1; + int down = -1; + int holes = -1; + int ret = -1; + int i = -1; + + this = frame->this; + conf = this->private; + local = frame->local; + + missing = local->selfheal.missing; + down = local->selfheal.down; + holes = local->selfheal.hole_cnt; + + if ((missing + down) == conf->subvolume_cnt) { + dht_selfheal_fix_this_virgin (frame, loc, layout); + ret = 0; + } + + if (holes <= down) { + /* the down subvol might fill up the holes */ + ret = 0; + } + + for (i = 0; i < layout->cnt; i++) { + /* directory not present */ + if (layout->list[i].err == ENOENT) { + ret = 0; + break; + } + } + + /* TODO: give a fix to these non-virgins */ + + return ret; +} + + +int +dht_selfheal_directory (call_frame_t *frame, dht_selfheal_dir_cbk_t dir_cbk, + loc_t *loc, dht_layout_t *layout) +{ + dht_local_t *local = NULL; + uint32_t holes = 0; + uint32_t overlaps = 0; + uint32_t missing = 0; + uint32_t down = 0; + uint32_t misc = 0; + int ret = 0; + xlator_t *this = NULL; + + + local = frame->local; + this = frame->this; + + ret = dht_layout_anomalies (this, loc, layout, + &local->selfheal.hole_cnt, + &local->selfheal.overlaps_cnt, + &local->selfheal.missing, + &local->selfheal.down, + &local->selfheal.misc); + + holes = local->selfheal.hole_cnt; + overlaps = local->selfheal.overlaps_cnt; + missing = local->selfheal.missing; + down = local->selfheal.down; + misc = local->selfheal.misc; + + local->selfheal.dir_cbk = dir_cbk; + local->selfheal.layout = layout; + +/* + if (down) { + gf_log (this->name, GF_LOG_ERROR, + "%d subvolumes down -- not fixing", down); + ret = 0; + goto sorry_no_fix; + } + + if (overlaps) { + gf_log (this->name, GF_LOG_ERROR, + "not fixing overlaps in %s", loc->path); + local->op_errno = EINVAL; + ret = -1; + goto sorry_no_fix; + } + + if (misc) { + gf_log (this->name, GF_LOG_ERROR, + "%d subvolumes have unrecoverable errors", misc); + ret = 0; + goto sorry_no_fix; + } + + if (holes > missing) { + gf_log 
(this->name, GF_LOG_ERROR, + "%d holes and %d pigeons -- not fixing", + holes, missing); + ret = 0; + goto sorry_no_fix; + } +*/ + ret = dht_selfheal_dir_getafix (frame, loc, layout); + + if (ret == -1) { + gf_log (this->name, GF_LOG_ERROR, + "the directory is not a virgin"); + goto sorry_no_fix; + } + + dht_selfheal_dir_mkdir (frame, loc, layout, 0); + + return 0; + +sorry_no_fix: + /* TODO: need to put appropriate local->op_errno */ + dht_selfheal_dir_finish (frame, this, ret); + + return 0; +} + + +int +dht_selfheal_restore (call_frame_t *frame, dht_selfheal_dir_cbk_t dir_cbk, + loc_t *loc, dht_layout_t *layout) +{ + int ret = 0; + dht_local_t *local = NULL; + + + local = frame->local; + + local->selfheal.dir_cbk = dir_cbk; + local->selfheal.layout = layout; + + ret = dht_selfheal_dir_mkdir (frame, loc, layout, 1); + + return 0; +} diff --git a/xlators/cluster/dht/src/dht.c b/xlators/cluster/dht/src/dht.c new file mode 100644 index 000000000..836e7a4e8 --- /dev/null +++ b/xlators/cluster/dht/src/dht.c @@ -0,0 +1,222 @@ +/* + Copyright (c) 2008, 2009 Z RESEARCH, Inc. <http://www.zresearch.com> + This file is part of GlusterFS. + + GlusterFS is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published + by the Free Software Foundation; either version 3 of the License, + or (at your option) any later version. + + GlusterFS is distributed in the hope that it will be useful, but + WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program. If not, see + <http://www.gnu.org/licenses/>. 
+*/ + + +#ifndef _CONFIG_H +#define _CONFIG_H +#include "config.h" +#endif + +/* TODO: add NS locking */ + +#include "dht-common.c" + +/* TODO: + - use volumename in xattr instead of "dht" + - use NS locks + - handle all cases in self heal layout reconstruction + - complete linkfile selfheal +*/ + + + +int +notify (xlator_t *this, int event, void *data, ...) +{ + int ret = -1; + + ret = dht_notify (this, event, data); + + return ret; +} + +void +fini (xlator_t *this) +{ + int i = 0; + dht_conf_t *conf = NULL; + + conf = this->private; + + if (conf) { + if (conf->file_layouts) { + for (i = 0; i < conf->subvolume_cnt; i++) { + FREE (conf->file_layouts[i]); + } + FREE (conf->file_layouts); + } + + if (conf->default_dir_layout) + FREE (conf->default_dir_layout); + + if (conf->subvolumes) + FREE (conf->subvolumes); + + if (conf->subvolume_status) + FREE (conf->subvolume_status); + + FREE (conf); + } + + return; +} + +int +init (xlator_t *this) +{ + dht_conf_t *conf = NULL; + char *lookup_unhashed_str = NULL; + int ret = -1; + int i = 0; + + if (!this->children) { + gf_log (this->name, GF_LOG_ERROR, + "DHT needs more than one child defined"); + return -1; + } + + if (!this->parents) { + gf_log (this->name, GF_LOG_WARNING, + "dangling volume. 
check volfile "); + } + + conf = CALLOC (1, sizeof (*conf)); + if (!conf) { + gf_log (this->name, GF_LOG_ERROR, + "memory allocation failed :("); + goto err; + } + + conf->search_unhashed = 0; + + if (dict_get_str (this->options, "lookup-unhashed", + &lookup_unhashed_str) == 0) { + gf_string2boolean (lookup_unhashed_str, + &conf->search_unhashed); + } + + ret = dht_init_subvolumes (this, conf); + if (ret == -1) { + goto err; + } + + ret = dht_layouts_init (this, conf); + if (ret == -1) { + goto err; + } + + LOCK_INIT (&conf->subvolume_lock); + + conf->gen = 1; + + this->private = conf; + + return 0; + +err: + if (conf) { + if (conf->file_layouts) { + for (i = 0; i < conf->subvolume_cnt; i++) { + FREE (conf->file_layouts[i]); + } + FREE (conf->file_layouts); + } + + if (conf->default_dir_layout) + FREE (conf->default_dir_layout); + + if (conf->subvolumes) + FREE (conf->subvolumes); + + if (conf->subvolume_status) + FREE (conf->subvolume_status); + + FREE (conf); + } + + return -1; +} + + +struct xlator_fops fops = { + .lookup = dht_lookup, + .mknod = dht_mknod, + .create = dht_create, + + .stat = dht_stat, + .chmod = dht_chmod, + .chown = dht_chown, + .fchown = dht_fchown, + .fchmod = dht_fchmod, + .fstat = dht_fstat, + .utimens = dht_utimens, + .truncate = dht_truncate, + .ftruncate = dht_ftruncate, + .access = dht_access, + .readlink = dht_readlink, + .setxattr = dht_setxattr, + .getxattr = dht_getxattr, + .removexattr = dht_removexattr, + .open = dht_open, + .readv = dht_readv, + .writev = dht_writev, + .flush = dht_flush, + .fsync = dht_fsync, + .statfs = dht_statfs, + .lk = dht_lk, + .opendir = dht_opendir, + .readdir = dht_readdir, + .fsyncdir = dht_fsyncdir, + .symlink = dht_symlink, + .unlink = dht_unlink, + .link = dht_link, + .mkdir = dht_mkdir, + .rmdir = dht_rmdir, + .rename = dht_rename, + .inodelk = dht_inodelk, + .finodelk = dht_finodelk, + .entrylk = dht_entrylk, + .fentrylk = dht_fentrylk, + .xattrop = dht_xattrop, + .fxattrop = dht_fxattrop, +#if 0 
+ .setdents = dht_setdents, + .getdents = dht_getdents, + .checksum = dht_checksum, +#endif +}; + + +struct xlator_mops mops = { +}; + + +struct xlator_cbks cbks = { +// .release = dht_release, +// .releasedir = dht_releasedir, + .forget = dht_forget +}; + + +struct volume_options options[] = { + { .key = {"lookup-unhashed"}, + .type = GF_OPTION_TYPE_BOOL + }, + { .key = {NULL} }, +}; diff --git a/xlators/cluster/dht/src/nufa.c b/xlators/cluster/dht/src/nufa.c new file mode 100644 index 000000000..6333e002f --- /dev/null +++ b/xlators/cluster/dht/src/nufa.c @@ -0,0 +1,684 @@ +/* + Copyright (c) 2008 Z RESEARCH, Inc. <http://www.zresearch.com> + This file is part of GlusterFS. + + GlusterFS is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published + by the Free Software Foundation; either version 3 of the License, + or (at your option) any later version. + + GlusterFS is distributed in the hope that it will be useful, but + WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program. If not, see + <http://www.gnu.org/licenses/>. 
+*/ + + +#ifndef _CONFIG_H +#define _CONFIG_H +#include "config.h" +#endif + +#include "dht-common.c" + +/* TODO: all 'TODO's in dht.c holds good */ + +int +nufa_local_lookup_cbk (call_frame_t *frame, void *cookie, xlator_t *this, + int op_ret, int op_errno, + inode_t *inode, struct stat *stbuf, dict_t *xattr) +{ + dht_layout_t *layout = NULL; + xlator_t *subvol = NULL; + char is_linkfile = 0; + char is_dir = 0; + dht_conf_t *conf = NULL; + dht_local_t *local = NULL; + loc_t *loc = NULL; + int i = 0; + call_frame_t *prev = NULL; + int call_cnt = 0; + + + conf = this->private; + + prev = cookie; + local = frame->local; + loc = &local->loc; + + if (ENTRY_MISSING (op_ret, op_errno)) { + if (conf->search_unhashed) { + local->op_errno = ENOENT; + dht_lookup_everywhere (frame, this, loc); + return 0; + } + } + + if (op_ret == -1) + goto out; + + is_linkfile = check_is_linkfile (inode, stbuf, xattr); + is_dir = check_is_dir (inode, stbuf, xattr); + + if (!is_dir && !is_linkfile) { + /* non-directory and not a linkfile */ + + dht_itransform (this, prev->this, stbuf->st_ino, + &stbuf->st_ino); + + layout = dht_layout_for_subvol (this, prev->this); + if (!layout) { + gf_log (this->name, GF_LOG_ERROR, + "no pre-set layout for subvolume %s", + prev->this->name); + op_ret = -1; + op_errno = EINVAL; + goto err; + } + + inode_ctx_put (inode, this, (uint64_t)(long)layout); + goto out; + } + + if (is_dir) { + call_cnt = conf->subvolume_cnt; + local->call_cnt = call_cnt; + + local->inode = inode_ref (inode); + local->xattr = dict_ref (xattr); + + local->op_ret = 0; + local->op_errno = 0; + + local->layout = dht_layout_new (this, conf->subvolume_cnt); + if (!local->layout) { + op_ret = -1; + op_errno = ENOMEM; + gf_log (this->name, GF_LOG_ERROR, + "memory allocation failed :("); + goto err; + } + + for (i = 0; i < call_cnt; i++) { + STACK_WIND (frame, dht_lookup_dir_cbk, + conf->subvolumes[i], + conf->subvolumes[i]->fops->lookup, + &local->loc, local->xattr_req); + } + } + + if 
(is_linkfile) { + subvol = dht_linkfile_subvol (this, inode, stbuf, xattr); + + if (!subvol) { + gf_log (this->name, GF_LOG_WARNING, + "linkfile not having link subvolume. path=%s", + loc->path); + dht_lookup_everywhere (frame, this, loc); + return 0; + } + + STACK_WIND (frame, dht_lookup_linkfile_cbk, + subvol, subvol->fops->lookup, + &local->loc, local->xattr_req); + } + + return 0; + +out: + if (!local->hashed_subvol) { + gf_log (this->name, GF_LOG_ERROR, + "no subvolume in layout for path=%s", + local->loc.path); + op_errno = EINVAL; + goto err; + } + + STACK_WIND (frame, dht_lookup_cbk, + local->hashed_subvol, local->hashed_subvol->fops->lookup, + &local->loc, local->xattr_req); + + return 0; + + err: + DHT_STACK_UNWIND (frame, op_ret, op_errno, inode, stbuf, xattr); + return 0; +} + +int +nufa_lookup (call_frame_t *frame, xlator_t *this, + loc_t *loc, dict_t *xattr_req) +{ + xlator_t *hashed_subvol = NULL; + xlator_t *cached_subvol = NULL; + xlator_t *subvol = NULL; + dht_local_t *local = NULL; + dht_conf_t *conf = NULL; + int ret = -1; + int op_errno = -1; + dht_layout_t *layout = NULL; + int i = 0; + int call_cnt = 0; + + + VALIDATE_OR_GOTO (frame, err); + VALIDATE_OR_GOTO (this, err); + VALIDATE_OR_GOTO (loc, err); + VALIDATE_OR_GOTO (loc->inode, err); + VALIDATE_OR_GOTO (loc->path, err); + + conf = this->private; + + local = dht_local_init (frame); + if (!local) { + op_errno = ENOMEM; + gf_log (this->name, GF_LOG_ERROR, + "memory allocation failed :("); + goto err; + } + + ret = loc_dup (loc, &local->loc); + if (ret == -1) { + op_errno = errno; + gf_log (this->name, GF_LOG_ERROR, + "copying location failed for path=%s", + loc->path); + goto err; + } + + if (xattr_req) { + local->xattr_req = dict_ref (xattr_req); + } else { + local->xattr_req = dict_new (); + } + + hashed_subvol = dht_subvol_get_hashed (this, &local->loc); + cached_subvol = dht_subvol_get_cached (this, local->loc.inode); + + local->cached_subvol = cached_subvol; + local->hashed_subvol = 
hashed_subvol; + + if (is_revalidate (loc)) { + layout = dht_layout_get (this, loc->inode); + + if (!layout) { + gf_log (this->name, GF_LOG_ERROR, + "revalidate without cache. path=%s", + loc->path); + op_errno = EINVAL; + goto err; + } + + if (layout->gen && (layout->gen < conf->gen)) { + gf_log (this->name, GF_LOG_WARNING, + "incomplete layout failure for path=%s", + loc->path); + op_errno = EAGAIN; + goto err; + } + + local->inode = inode_ref (loc->inode); + local->st_ino = loc->inode->ino; + + local->call_cnt = layout->cnt; + call_cnt = local->call_cnt; + + /* NOTE: we don't require 'trusted.glusterfs.dht.linkto' attribute, + * revalidates directly go to the cached-subvolume. + */ + ret = dict_set_uint32 (local->xattr_req, + "trusted.glusterfs.dht", 4 * 4); + + for (i = 0; i < layout->cnt; i++) { + subvol = layout->list[i].xlator; + + STACK_WIND (frame, dht_revalidate_cbk, + subvol, subvol->fops->lookup, + loc, local->xattr_req); + + if (!--call_cnt) + break; + } + } else { + ret = dict_set_uint32 (local->xattr_req, + "trusted.glusterfs.dht", 4 * 4); + + ret = dict_set_uint32 (local->xattr_req, + "trusted.glusterfs.dht.linkto", 256); + + /* Send it to only local volume */ + STACK_WIND (frame, nufa_local_lookup_cbk, + conf->local_volume, + conf->local_volume->fops->lookup, + loc, local->xattr_req); + } + + return 0; + +err: + op_errno = (op_errno == -1) ? 
errno : op_errno; + DHT_STACK_UNWIND (frame, -1, op_errno, NULL, NULL, NULL); + return 0; +} + +int +nufa_create_linkfile_create_cbk (call_frame_t *frame, void *cookie, + xlator_t *this, int op_ret, int op_errno, + inode_t *inode, struct stat *stbuf) +{ + dht_local_t *local = NULL; + call_frame_t *prev = NULL; + dht_conf_t *conf = NULL; + + local = frame->local; + prev = cookie; + conf = this->private; + + if (op_ret == -1) + goto err; + + STACK_WIND (frame, dht_create_cbk, + conf->local_volume, conf->local_volume->fops->create, + &local->loc, local->flags, local->mode, local->fd); + + return 0; + + err: + DHT_STACK_UNWIND (frame, -1, op_errno, NULL, NULL, NULL); + return 0; +} + +int +nufa_create (call_frame_t *frame, xlator_t *this, + loc_t *loc, int32_t flags, mode_t mode, fd_t *fd) +{ + dht_local_t *local = NULL; + dht_conf_t *conf = NULL; + xlator_t *subvol = NULL; + int op_errno = -1; + int ret = -1; + + VALIDATE_OR_GOTO (frame, err); + VALIDATE_OR_GOTO (this, err); + VALIDATE_OR_GOTO (loc, err); + + conf = this->private; + + local = dht_local_init (frame); + if (!local) { + op_errno = ENOMEM; + gf_log (this->name, GF_LOG_ERROR, + "memory allocation failed :("); + goto err; + } + + subvol = dht_subvol_get_hashed (this, loc); + if (!subvol) { + gf_log (this->name, GF_LOG_ERROR, + "no subvolume in layout for path=%s", + loc->path); + op_errno = ENOENT; + goto err; + } + + if (subvol != conf->local_volume) { + /* create a link file instead of actual file */ + ret = loc_copy (&local->loc, loc); + if (ret == -1) { + gf_log (this->name, GF_LOG_ERROR, + "memory allocation failed :("); + op_errno = ENOMEM; + goto err; + } + + local->fd = fd_ref (fd); + local->mode = mode; + local->flags = flags; + + dht_linkfile_create (frame, nufa_create_linkfile_create_cbk, + conf->local_volume, subvol, loc); + return 0; + } + + gf_log (this->name, GF_LOG_DEBUG, + "creating %s on %s", loc->path, subvol->name); + + STACK_WIND (frame, dht_create_cbk, + subvol, subvol->fops->create, + 
loc, flags, mode, fd); + + return 0; + +err: + op_errno = (op_errno == -1) ? errno : op_errno; + DHT_STACK_UNWIND (frame, -1, op_errno, NULL, NULL, NULL); + + return 0; +} + +int +nufa_mknod_linkfile_cbk (call_frame_t *frame, void *cookie, xlator_t *this, + int op_ret, int op_errno, + inode_t *inode, struct stat *stbuf) +{ + dht_local_t *local = NULL; + call_frame_t *prev = NULL; + dht_conf_t *conf = NULL; + + local = frame->local; + prev = cookie; + conf = this->private; + + if (op_ret >= 0) { + STACK_WIND (frame, dht_newfile_cbk, + conf->local_volume, + conf->local_volume->fops->mknod, + &local->loc, local->mode, local->rdev); + + return 0; + } + + DHT_STACK_UNWIND (frame, op_ret, op_errno, inode, stbuf); + return 0; +} + + +int +nufa_mknod (call_frame_t *frame, xlator_t *this, + loc_t *loc, mode_t mode, dev_t rdev) +{ + dht_local_t *local = NULL; + dht_conf_t *conf = NULL; + xlator_t *subvol = NULL; + int op_errno = -1; + int ret = -1; + + VALIDATE_OR_GOTO (frame, err); + VALIDATE_OR_GOTO (this, err); + VALIDATE_OR_GOTO (loc, err); + + conf = this->private; + + + local = dht_local_init (frame); + if (!local) { + op_errno = ENOMEM; + gf_log (this->name, GF_LOG_ERROR, + "memory allocation failed :("); + goto err; + } + + subvol = dht_subvol_get_hashed (this, loc); + if (!subvol) { + gf_log (this->name, GF_LOG_ERROR, + "no subvolume in layout for path=%s", + loc->path); + op_errno = ENOENT; + goto err; + } + + + if (conf->local_volume != subvol) { + /* Create linkfile first */ + ret = loc_copy (&local->loc, loc); + if (ret == -1) { + gf_log (this->name, GF_LOG_ERROR, + "memory allocation failed :("); + op_errno = ENOMEM; + goto err; + } + + local->mode = mode; + local->rdev = rdev; + + dht_linkfile_create (frame, nufa_mknod_linkfile_cbk, + conf->local_volume, subvol, loc); + return 0; + } + + gf_log (this->name, GF_LOG_DEBUG, + "creating %s on %s", loc->path, subvol->name); + + STACK_WIND (frame, dht_newfile_cbk, + subvol, subvol->fops->mknod, + loc, mode, rdev); + 
+ return 0; + +err: + op_errno = (op_errno == -1) ? errno : op_errno; + DHT_STACK_UNWIND (frame, -1, op_errno, NULL, NULL); + + return 0; +} + + +int +notify (xlator_t *this, int event, void *data, ...) +{ + int ret = -1; + + ret = dht_notify (this, event, data); + + return ret; +} + +void +fini (xlator_t *this) +{ + int i = 0; + dht_conf_t *conf = NULL; + + conf = this->private; + + if (conf) { + if (conf->file_layouts) { + for (i = 0; i < conf->subvolume_cnt; i++) { + FREE (conf->file_layouts[i]); + } + FREE (conf->file_layouts); + } + + if (conf->default_dir_layout) + FREE (conf->default_dir_layout); + + if (conf->subvolumes) + FREE (conf->subvolumes); + + if (conf->subvolume_status) + FREE (conf->subvolume_status); + + FREE (conf); + } + + return; +} + +int +init (xlator_t *this) +{ + dht_conf_t *conf = NULL; + xlator_list_t *trav = NULL; + data_t *data = NULL; + char *local_volname = NULL; + char *lookup_unhashed_str = NULL; + int ret = -1; + int i = 0; + char my_hostname[256]; + + if (!this->children) { + gf_log (this->name, GF_LOG_ERROR, + "DHT needs more than one child defined"); + return -1; + } + + if (!this->parents) { + gf_log (this->name, GF_LOG_WARNING, + "dangling volume. 
check volfile "); + } + + conf = CALLOC (1, sizeof (*conf)); + if (!conf) { + gf_log (this->name, GF_LOG_ERROR, + "memory allocation failed :("); + goto err; + } + + conf->search_unhashed = 0; + + if (dict_get_str (this->options, "lookup-unhashed", + &lookup_unhashed_str) == 0) { + gf_string2boolean (lookup_unhashed_str, + &conf->search_unhashed); + } + + ret = dht_init_subvolumes (this, conf); + if (ret == -1) { + goto err; + } + + ret = dht_layouts_init (this, conf); + if (ret == -1) { + goto err; + } + + LOCK_INIT (&conf->subvolume_lock); + + conf->gen = 1; + + local_volname = "localhost"; + ret = gethostname (my_hostname, 256); + if (ret < 0) { + gf_log (this->name, GF_LOG_WARNING, + "could not find hostname (%s)", + strerror (errno)); + } + + if (ret == 0) + local_volname = my_hostname; + + data = dict_get (this->options, "local-volume-name"); + if (data) { + local_volname = data->data; + } + + trav = this->children; + while (trav) { + if (strcmp (trav->xlator->name, local_volname) == 0) + break; + trav = trav->next; + } + + if (!trav) { + gf_log (this->name, GF_LOG_ERROR, + "Could not find subvolume named '%s'. 
" + "Please define volume with the name as the hostname " + "or override it with 'option local-volume-name'", + local_volname); + goto err; + } + /* The volume specified exists */ + conf->local_volume = trav->xlator; + + this->private = conf; + + return 0; + +err: + if (conf) { + if (conf->file_layouts) { + for (i = 0; i < conf->subvolume_cnt; i++) { + FREE (conf->file_layouts[i]); + } + FREE (conf->file_layouts); + } + + if (conf->default_dir_layout) + FREE (conf->default_dir_layout); + + if (conf->subvolumes) + FREE (conf->subvolumes); + + if (conf->subvolume_status) + FREE (conf->subvolume_status); + + FREE (conf); + } + + return -1; +} + + +struct xlator_fops fops = { + .lookup = nufa_lookup, + .create = nufa_create, + .mknod = nufa_mknod, + + .stat = dht_stat, + .chmod = dht_chmod, + .chown = dht_chown, + .fchown = dht_fchown, + .fchmod = dht_fchmod, + .fstat = dht_fstat, + .utimens = dht_utimens, + .truncate = dht_truncate, + .ftruncate = dht_ftruncate, + .access = dht_access, + .readlink = dht_readlink, + .setxattr = dht_setxattr, + .getxattr = dht_getxattr, + .removexattr = dht_removexattr, + .open = dht_open, + .readv = dht_readv, + .writev = dht_writev, + .flush = dht_flush, + .fsync = dht_fsync, + .statfs = dht_statfs, + .lk = dht_lk, + .opendir = dht_opendir, + .readdir = dht_readdir, + .fsyncdir = dht_fsyncdir, + .symlink = dht_symlink, + .unlink = dht_unlink, + .link = dht_link, + .mkdir = dht_mkdir, + .rmdir = dht_rmdir, + .rename = dht_rename, + .inodelk = dht_inodelk, + .finodelk = dht_finodelk, + .entrylk = dht_entrylk, + .fentrylk = dht_fentrylk, + .xattrop = dht_xattrop, + .fxattrop = dht_fxattrop, +#if 0 + .setdents = dht_setdents, + .getdents = dht_getdents, + .checksum = dht_checksum, +#endif +}; + + +struct xlator_mops mops = { +}; + + +struct xlator_cbks cbks = { +// .release = dht_release, +// .releasedir = dht_releasedir, + .forget = dht_forget +}; + + +struct volume_options options[] = { + { .key = {"local-volume-name"}, + .type = 
GF_OPTION_TYPE_XLATOR + }, + { .key = {"lookup-unhashed"}, + .type = GF_OPTION_TYPE_BOOL + }, + { .key = {NULL} }, +}; diff --git a/xlators/cluster/ha/Makefile.am b/xlators/cluster/ha/Makefile.am new file mode 100644 index 000000000..d471a3f92 --- /dev/null +++ b/xlators/cluster/ha/Makefile.am @@ -0,0 +1,3 @@ +SUBDIRS = src + +CLEANFILES = diff --git a/xlators/cluster/ha/src/Makefile.am b/xlators/cluster/ha/src/Makefile.am new file mode 100644 index 000000000..069a0dcde --- /dev/null +++ b/xlators/cluster/ha/src/Makefile.am @@ -0,0 +1,15 @@ +xlator_LTLIBRARIES = ha.la +xlatordir = $(libdir)/glusterfs/$(PACKAGE_VERSION)/xlator/cluster + +ha_la_LDFLAGS = -module -avoidversion + +ha_la_SOURCES = ha-helpers.c ha.c +ha_la_LIBADD = $(top_builddir)/libglusterfs/src/libglusterfs.la + +noinst_HEADERS = ha.h + +AM_CFLAGS = -fPIC -D_FILE_OFFSET_BITS=64 -D_GNU_SOURCE -Wall -D$(GF_HOST_OS) \ + -I$(top_srcdir)/libglusterfs/src -shared -nostartfiles $(GF_CFLAGS) + +CLEANFILES = + diff --git a/xlators/cluster/ha/src/ha-helpers.c b/xlators/cluster/ha/src/ha-helpers.c new file mode 100644 index 000000000..8193caf27 --- /dev/null +++ b/xlators/cluster/ha/src/ha-helpers.c @@ -0,0 +1,191 @@ +/* + Copyright (c) 2008, 2009 Z RESEARCH, Inc. <http://www.zresearch.com> + This file is part of GlusterFS. + + GlusterFS is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published + by the Free Software Foundation; either version 3 of the License, + or (at your option) any later version. + + GlusterFS is distributed in the hope that it will be useful, but + WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program. If not, see + <http://www.gnu.org/licenses/>. 
+*/ + +#include "xlator.h" +#include "call-stub.h" +#include "defaults.h" +#include "dict.h" +#include "compat-errno.h" +#include "ha.h" + +int ha_alloc_init_fd (call_frame_t *frame, fd_t *fd) +{ + ha_local_t *local = NULL; + int i = -1; + ha_private_t *pvt = NULL; + int child_count = 0; + int ret = -1; + hafd_t *hafdp = NULL; + xlator_t *this = NULL; + uint64_t tmp_hafdp = 0; + + this = frame->this; + local = frame->local; + pvt = this->private; + child_count = pvt->child_count; + + if (local == NULL) { + ret = fd_ctx_get (fd, this, &tmp_hafdp); + if (ret < 0) { + goto out; + } + hafdp = (hafd_t *)(long)tmp_hafdp; + local = frame->local = CALLOC (1, sizeof (*local)); + if (local == NULL) { + ret = -ENOMEM; + goto out; + } + local->state = CALLOC (1, child_count); + if (local->state == NULL) { + ret = -ENOMEM; + goto out; + } + + /* take care of the preferred subvolume */ + if (pvt->pref_subvol == -1) + local->active = hafdp->active; + else + local->active = pvt->pref_subvol; + + LOCK (&hafdp->lock); + memcpy (local->state, hafdp->fdstate, child_count); + UNLOCK (&hafdp->lock); + + /* in case the preferred subvolume is down */ + if ((local->active != -1) && (local->state[local->active] == 0)) + local->active = -1; + + for (i = 0; i < child_count; i++) { + if (local->state[i]) { + if (local->active == -1) + local->active = i; + local->tries++; + } + } + if (local->active == -1) { + ret = -ENOTCONN; + goto out; + } + local->fd = fd_ref (fd); + } + ret = 0; +out: + return ret; +} + +int ha_handle_cbk (call_frame_t *frame, void *cookie, int op_ret, int op_errno) +{ + xlator_t *xl = NULL; + ha_private_t *pvt = NULL; + xlator_t **children = NULL; + int prev_child = -1; + hafd_t *hafdp = NULL; + int ret = -1; + call_stub_t *stub = NULL; + ha_local_t *local = NULL; + uint64_t tmp_hafdp = 0; + + xl = frame->this; + pvt = xl->private; + children = pvt->children; + prev_child = (long) cookie; + local = frame->local; + + if (op_ret == -1) { + gf_log (xl->name, GF_LOG_ERROR 
,"(child=%s) (op_ret=%d op_errno=%s)", + children[prev_child]->name, op_ret, strerror (op_errno)); + } + if (op_ret == -1 && (op_errno == ENOTCONN)) { + ret = 0; + if (local->fd) { + ret = fd_ctx_get (local->fd, xl, &tmp_hafdp); + } + hafdp = (hafd_t *)(long)tmp_hafdp; + if (ret == 0) { + if (local->fd) { + LOCK(&hafdp->lock); + hafdp->fdstate[prev_child] = 0; + UNLOCK(&hafdp->lock); + } + local->tries--; + if (local->tries != 0) { + while (1) { + local->active = (local->active + 1) % pvt->child_count; + if (local->state[local->active]) + break; + } + stub = local->stub; + local->stub = NULL; + call_resume (stub); + return -1; + } + } + } + if (local->stub) + call_stub_destroy (local->stub); + if (local->fd) { + FREE (local->state); + fd_unref (local->fd); + } + return 0; +} + +int ha_alloc_init_inode (call_frame_t *frame, inode_t *inode) +{ + int i = -1; + ha_private_t *pvt = NULL; + xlator_t *xl = NULL; + int ret = -1; + ha_local_t *local = NULL; + uint64_t tmp_state = 0; + + xl = frame->this; + pvt = xl->private; + local = frame->local; + + if (local == NULL) { + local = frame->local = CALLOC (1, sizeof (*local)); + if (local == NULL) { + ret = -ENOMEM; + goto out; + } + local->active = pvt->pref_subvol; + ret = inode_ctx_get (inode, xl, &tmp_state); + if (ret < 0) { + goto out; + } + local->state = (char *)(long)tmp_state; + if (local->active != -1 && local->state[local->active] == 0) + local->active = -1; + for (i = 0; i < pvt->child_count; i++) { + if (local->state[i]) { + if (local->active == -1) + local->active = i; + local->tries++; + } + } + if (local->active == -1) { + ret = -ENOTCONN; + goto out; + } + } + ret = 0; +out: + return ret; +} diff --git a/xlators/cluster/ha/src/ha.c b/xlators/cluster/ha/src/ha.c new file mode 100644 index 000000000..4542bdc7e --- /dev/null +++ b/xlators/cluster/ha/src/ha.c @@ -0,0 +1,3479 @@ +/* + Copyright (c) 2008, 2009 Z RESEARCH, Inc. <http://www.zresearch.com> + This file is part of GlusterFS. 
+ + GlusterFS is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published + by the Free Software Foundation; either version 3 of the License, + or (at your option) any later version. + + GlusterFS is distributed in the hope that it will be useful, but + WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program. If not, see + <http://www.gnu.org/licenses/>. +*/ + +/* generate errors randomly, code is simple now, better algorithm + * can be written to decide what error to be returned and when + */ + +#ifndef _CONFIG_H +#define _CONFIG_H +#include "config.h" +#endif + +#include "xlator.h" +#include "call-stub.h" +#include "defaults.h" +#include "dict.h" +#include "compat-errno.h" +#include "ha.h" + +/* + * TODO: + * - dbench fails if ha over server side afr + * - lock calls - lock on all subvols. + * - support preferred-subvolume option. code already there. + * - do not alloc the call-stub in case only one subvol is up. 
+ */ + +int +ha_forget (xlator_t *this, + inode_t *inode) +{ + uint64_t stateino = 0; + char *state = NULL; + if (!inode_ctx_del (inode, this, &stateino)) { + state = ((char *)(long)stateino); + FREE (state); + } + + return 0; + +} + +int32_t +ha_lookup_cbk (call_frame_t *frame, + void *cookie, + xlator_t *this, + int32_t op_ret, + int32_t op_errno, + inode_t *inode, + struct stat *buf, + dict_t *dict) +{ + ha_local_t *local = NULL; + ha_private_t *pvt = NULL; + int child_count = 0, i = 0, callcnt = 0; + char *state = NULL; + call_frame_t *prev_frame = NULL; + xlator_t **children = NULL; + uint64_t tmp_state = 0; + + local = frame->local; + pvt = this->private; + child_count = pvt->child_count; + prev_frame = cookie; + children = pvt->children; + + for (i = 0; i < child_count; i++) { + if (pvt->children[i] == prev_frame->this) + break; + } + if ((op_ret == -1) && (op_errno != ENOENT)) { + gf_log (this->name, GF_LOG_ERROR, "(child=%s) (op_ret=%d op_errno=%s)", + children[i]->name, op_ret, strerror (op_errno)); + } + inode_ctx_get (local->inode, this, &tmp_state); + state = (char *)(long)tmp_state; + + LOCK (&frame->lock); + if (local->revalidate == 1) { + if ((!op_ret) != state[i]) { + local->revalidate_error = 1; + gf_log (this->name, GF_LOG_DEBUG, "revalidate error on %s", + pvt->children[i]->name); + } + } else { + if (op_ret == 0) { + state[i] = 1; + } + } + if (local->op_ret == -1 && op_ret == 0) { + local->op_ret = 0; + local->buf = *buf; + if (dict) + local->dict = dict_ref (dict); + } + if (op_ret == -1 && op_ret != ENOTCONN) + local->op_errno = op_errno; + callcnt = --local->call_count; + UNLOCK (&frame->lock); + + if (callcnt == 0) { + dict_t *ctx = local->dict; + inode_t *inode = local->inode; + if (local->revalidate_error == 1) { + local->op_ret = -1; + local->op_errno = EIO; + gf_log (this->name, GF_LOG_DEBUG, "revalidate error, returning EIO"); + } + STACK_UNWIND (frame, + local->op_ret, + local->op_errno, + inode, + &local->buf, + ctx); + if (inode) + 
inode_unref (inode); + if (ctx) + dict_unref (ctx); + } + return 0; +} + +int32_t +ha_lookup (call_frame_t *frame, + xlator_t *this, + loc_t *loc, + dict_t *xattr_req) +{ + ha_local_t *local = NULL; + ha_private_t *pvt = NULL; + int child_count = 0, i = 0; + char *state = NULL; + xlator_t **children = NULL; + int ret = -1; + + local = frame->local; + pvt = this->private; + child_count = pvt->child_count; + children = pvt->children; + + frame->local = local = CALLOC (1, sizeof (*local)); + child_count = pvt->child_count; + local->inode = inode_ref (loc->inode); + + ret = inode_ctx_get (loc->inode, this, NULL); + if (ret) { + state = CALLOC (1, child_count); + inode_ctx_put (loc->inode, this, (uint64_t)(long)state); + } else + local->revalidate = 1; + + local->op_ret = -1; + local->op_errno = ENOTCONN; + local->call_count = child_count; + + for (i = 0; i < child_count; i++) { + STACK_WIND (frame, + ha_lookup_cbk, + children[i], + children[i]->fops->lookup, + loc, + xattr_req); + } + return 0; +} + + int32_t +ha_stat_cbk (call_frame_t *frame, + void *cookie, + xlator_t *this, + int32_t op_ret, + int32_t op_errno, + struct stat *buf) +{ + int ret = -1; + + ret = ha_handle_cbk (frame, cookie, op_ret, op_errno); + + if (ret == 0) { + STACK_UNWIND (frame, + op_ret, + op_errno, + buf); + } + return 0; +} + +int32_t +ha_stat (call_frame_t *frame, + xlator_t *this, + loc_t *loc) +{ + ha_local_t *local = NULL; + int op_errno = ENOTCONN; + + op_errno = ha_alloc_init_inode (frame, loc->inode); + if (op_errno < 0) { + op_errno = -op_errno; + goto err; + } + local = frame->local; + local->stub = fop_stat_stub (frame, ha_stat, loc); + + STACK_WIND_COOKIE (frame, + ha_stat_cbk, + (void *)(long)local->active, + HA_ACTIVE_CHILD(this, local), + HA_ACTIVE_CHILD(this, local)->fops->stat, + loc); + return 0; +err: + STACK_UNWIND (frame, -1, op_errno, NULL); + return 0; +} + + int32_t +ha_chmod_cbk (call_frame_t *frame, + void *cookie, + xlator_t *this, + int32_t op_ret, + int32_t 
op_errno, + struct stat *buf) +{ + int ret = -1; + + ret = ha_handle_cbk (frame, cookie, op_ret, op_errno); + + if (ret == 0) { + STACK_UNWIND (frame, + op_ret, + op_errno, + buf); + } + return 0; +} + +int32_t +ha_chmod (call_frame_t *frame, + xlator_t *this, + loc_t *loc, + mode_t mode) +{ + ha_local_t *local = NULL; + int op_errno = 0; + + op_errno = ha_alloc_init_inode (frame, loc->inode); + if (op_errno < 0) { + op_errno = -op_errno; + goto err; + } + local = frame->local; + local->stub = fop_chmod_stub (frame, ha_chmod, loc, mode); + + STACK_WIND_COOKIE (frame, + ha_chmod_cbk, + (void *)(long)local->active, + HA_ACTIVE_CHILD(this, local), + HA_ACTIVE_CHILD(this, local)->fops->chmod, + loc, + mode); + return 0; +err: + STACK_UNWIND (frame, -1, op_errno, NULL); + return 0; +} + + int32_t +ha_fchmod_cbk (call_frame_t *frame, + void *cookie, + xlator_t *this, + int32_t op_ret, + int32_t op_errno, + struct stat *buf) +{ + int ret = -1; + + ret = ha_handle_cbk (frame, cookie, op_ret, op_errno); + + if (ret == 0) { + STACK_UNWIND (frame, + op_ret, + op_errno, + buf); + } + return 0; +} + +int32_t +ha_fchmod (call_frame_t *frame, + xlator_t *this, + fd_t *fd, + mode_t mode) +{ + ha_local_t *local = NULL; + int op_errno = 0; + + op_errno = ha_alloc_init_fd (frame, fd); + if (op_errno < 0) { + op_errno = -op_errno; + goto err; + } + local = frame->local; + local->stub = fop_fchmod_stub (frame, ha_fchmod, fd, mode); + + STACK_WIND_COOKIE (frame, + ha_fchmod_cbk, + (void *)(long)local->active, + HA_ACTIVE_CHILD(this, local), + HA_ACTIVE_CHILD(this, local)->fops->fchmod, + fd, + mode); + return 0; +err: + STACK_UNWIND (frame, -1, op_errno, NULL); + return 0; +} + + int32_t +ha_chown_cbk (call_frame_t *frame, + void *cookie, + xlator_t *this, + int32_t op_ret, + int32_t op_errno, + struct stat *buf) +{ + int ret = -1; + + ret = ha_handle_cbk (frame, cookie, op_ret, op_errno); + + if (ret == 0) { + STACK_UNWIND (frame, + op_ret, + op_errno, + buf); + } + return 0; +} + 
+int32_t +ha_chown (call_frame_t *frame, + xlator_t *this, + loc_t *loc, + uid_t uid, + gid_t gid) +{ + ha_local_t *local = NULL; + int op_errno = 0; + + op_errno = ha_alloc_init_inode (frame, loc->inode); + if (op_errno < 0) { + op_errno = -op_errno; + goto err; + } + local = frame->local; + local->stub = fop_chown_stub (frame, ha_chown, loc, uid, gid); + + STACK_WIND_COOKIE (frame, + ha_chown_cbk, + (void *)(long)local->active, + HA_ACTIVE_CHILD(this, local), + HA_ACTIVE_CHILD(this, local)->fops->chown, + loc, + uid, + gid); + return 0; +err: + STACK_UNWIND (frame, -1, op_errno, NULL); /* report the actual setup failure (e.g. ENOMEM), as the other ha fops do */ + return 0; +} + + int32_t +ha_fchown_cbk (call_frame_t *frame, + void *cookie, + xlator_t *this, + int32_t op_ret, + int32_t op_errno, + struct stat *buf) +{ + int ret = -1; + + ret = ha_handle_cbk (frame, cookie, op_ret, op_errno); + + if (ret == 0) { + STACK_UNWIND (frame, + op_ret, + op_errno, + buf); + } + return 0; +} + +int32_t +ha_fchown (call_frame_t *frame, + xlator_t *this, + fd_t *fd, + uid_t uid, + gid_t gid) +{ + ha_local_t *local = NULL; + int op_errno = 0; + + op_errno = ha_alloc_init_fd (frame, fd); + if (op_errno < 0) { + op_errno = -op_errno; + goto err; + } + local = frame->local; + local->stub = fop_fchown_stub (frame, ha_fchown, fd, uid, gid); + + STACK_WIND_COOKIE (frame, + ha_fchown_cbk, + (void *)(long)local->active, + HA_ACTIVE_CHILD(this, local), + HA_ACTIVE_CHILD(this, local)->fops->fchown, + fd, + uid, + gid); + return 0; +err: + STACK_UNWIND (frame, -1, op_errno, NULL); + return 0; +} + + int32_t +ha_truncate_cbk (call_frame_t *frame, + void *cookie, + xlator_t *this, + int32_t op_ret, + int32_t op_errno, + struct stat *buf) +{ + int ret = -1; + + ret = ha_handle_cbk (frame, cookie, op_ret, op_errno); + if (ret == 0) { + STACK_UNWIND (frame, + op_ret, + op_errno, + buf); + } + return 0; +} + +int32_t +ha_truncate (call_frame_t *frame, + xlator_t *this, + loc_t *loc, + off_t offset) +{ + ha_local_t *local = NULL; + int op_errno = 0; + + op_errno = 
ha_alloc_init_inode (frame, loc->inode); + if (op_errno < 0) { + op_errno = -op_errno; + goto err; + } + local = frame->local; + local->stub = fop_truncate_stub (frame, ha_truncate, loc, offset); + + STACK_WIND_COOKIE (frame, + ha_truncate_cbk, + (void *)(long)local->active, + HA_ACTIVE_CHILD(this, local), + HA_ACTIVE_CHILD(this, local)->fops->truncate, + loc, + offset); + return 0; +err: + STACK_UNWIND (frame, -1, op_errno, NULL); + return 0; +} + + int32_t +ha_ftruncate_cbk (call_frame_t *frame, + void *cookie, + xlator_t *this, + int32_t op_ret, + int32_t op_errno, + struct stat *buf) +{ + int ret = -1; + + ret = ha_handle_cbk (frame, cookie, op_ret, op_errno); + + if (ret == 0) { + STACK_UNWIND (frame, + op_ret, + op_errno, + buf); + } + return 0; +} + +int32_t +ha_ftruncate (call_frame_t *frame, + xlator_t *this, + fd_t *fd, + off_t offset) +{ + ha_local_t *local = NULL; + int op_errno = 0; + + op_errno = ha_alloc_init_fd (frame, fd); + if (op_errno < 0) { + op_errno = -op_errno; + goto err; + } + local = frame->local; + local->stub = fop_ftruncate_stub (frame, ha_ftruncate, fd, offset); + + STACK_WIND_COOKIE (frame, + ha_ftruncate_cbk, + (void *)(long)local->active, + HA_ACTIVE_CHILD(this, local), + HA_ACTIVE_CHILD(this, local)->fops->ftruncate, + fd, + offset); + return 0; +err: + STACK_UNWIND (frame, -1, op_errno, NULL); + return 0; +} + +int32_t +ha_utimens_cbk (call_frame_t *frame, + void *cookie, + xlator_t *this, + int32_t op_ret, + int32_t op_errno, + struct stat *buf) +{ + int ret = -1; + + ret = ha_handle_cbk (frame, cookie, op_ret, op_errno); + + if (ret == 0) { + STACK_UNWIND (frame, + op_ret, + op_errno, + buf); + } + return 0; +} + +int32_t +ha_utimens (call_frame_t *frame, + xlator_t *this, + loc_t *loc, + struct timespec tv[2]) +{ + ha_local_t *local = NULL; + int op_errno = 0; + + op_errno = ha_alloc_init_inode (frame, loc->inode); + if (op_errno < 0) { + op_errno = -op_errno; + goto err; + } + local = frame->local; + local->stub = 
fop_utimens_stub (frame, ha_utimens, loc, tv); + + STACK_WIND_COOKIE (frame, + ha_utimens_cbk, + (void *)(long)local->active, + HA_ACTIVE_CHILD(this, local), + HA_ACTIVE_CHILD(this, local)->fops->utimens, + loc, + tv); + return 0; +err: + STACK_UNWIND (frame, -1, op_errno, NULL); + return 0; +} + +int32_t +ha_access_cbk (call_frame_t *frame, + void *cookie, + xlator_t *this, + int32_t op_ret, + int32_t op_errno) +{ + int ret = -1; + + ret = ha_handle_cbk (frame, cookie, op_ret, op_errno); + + if (ret == 0) { + STACK_UNWIND (frame, + op_ret, + op_errno); + } + return 0; +} + +int32_t +ha_access (call_frame_t *frame, + xlator_t *this, + loc_t *loc, + int32_t mask) +{ + ha_local_t *local = NULL; + int op_errno = 0; + + op_errno = ha_alloc_init_inode (frame, loc->inode); + if (op_errno < 0) { + op_errno = -op_errno; + goto err; + } + local = frame->local; + local->stub = fop_access_stub (frame, ha_access, loc, mask); + + STACK_WIND_COOKIE (frame, + ha_access_cbk, + (void *)(long)local->active, + HA_ACTIVE_CHILD(this, local), + HA_ACTIVE_CHILD(this, local)->fops->access, + loc, + mask); + return 0; +err: + STACK_UNWIND (frame, -1, op_errno); + return 0; +} + + + int32_t +ha_readlink_cbk (call_frame_t *frame, + void *cookie, + xlator_t *this, + int32_t op_ret, + int32_t op_errno, + const char *path) +{ + int ret = -1; + + ret = ha_handle_cbk (frame, cookie, op_ret, op_errno); + + if (ret == 0) { + STACK_UNWIND (frame, + op_ret, + op_errno, + path); + } + return 0; +} + +int32_t +ha_readlink (call_frame_t *frame, + xlator_t *this, + loc_t *loc, + size_t size) +{ + ha_local_t *local = frame->local; + int op_errno = 0; + + op_errno = ha_alloc_init_inode (frame, loc->inode); + if (op_errno < 0) { + op_errno = -op_errno; + goto err; + } + local = frame->local; + local->stub = fop_readlink_stub (frame, ha_readlink, loc, size); + + STACK_WIND_COOKIE (frame, + ha_readlink_cbk, + (void *)(long)local->active, + HA_ACTIVE_CHILD(this, local), + HA_ACTIVE_CHILD(this, 
local)->fops->readlink, + loc, + size); + return 0; +err: + STACK_UNWIND (frame, -1, op_errno, NULL); + return 0; +} + +int +ha_mknod_lookup_cbk (call_frame_t *frame, + void *cookie, + xlator_t *this, + int32_t op_ret, + int32_t op_errno, + inode_t *inode, + struct stat *buf, + dict_t *dict) +{ + ha_local_t *local = NULL; + ha_private_t *pvt = NULL; + char *stateino = NULL; + int child_count = 0, i = 0, cnt = 0, ret = 0; + call_frame_t *prev_frame = NULL; + xlator_t **children = NULL; + uint64_t tmp_stateino = 0; + + local = frame->local; + pvt = this->private; + child_count = pvt->child_count; + prev_frame = cookie; + children = pvt->children; + + for (i = 0; i < child_count; i++) + if (prev_frame->this == children[i]) + break; + + if (op_ret == -1) { + gf_log (this->name, GF_LOG_ERROR, + "(path=%s) (op_ret=%d op_errno=%d)", + local->stub->args.mknod.loc.path, op_ret, op_errno); + } + ret = inode_ctx_get (local->stub->args.mknod.loc.inode, + this, &tmp_stateino); + stateino = (char *)(long)tmp_stateino; + if (ret != 0) { + gf_log (this->name, GF_LOG_ERROR, + "unwind(-1), inode_ctx_get() error"); + /* It is difficult to handle this error at this stage + * as we still expect more cbks, we can't return as + * of now + */ + } else if (op_ret == 0) { + stateino[i] = 1; + } + LOCK (&frame->lock); + cnt = --local->call_count; + UNLOCK (&frame->lock); + + if (cnt == 0) { + call_stub_t *stub = local->stub; + FREE (local->state); + STACK_UNWIND (frame, + local->op_ret, + local->op_errno, + local->stub->args.mknod.loc.inode, + &local->buf); + call_stub_destroy (stub); + } + return 0; +} + +int32_t +ha_mknod_cbk (call_frame_t *frame, + void *cookie, + xlator_t *this, + int32_t op_ret, + int32_t op_errno, + inode_t *inode, + struct stat *buf) +{ + ha_local_t *local = NULL; + ha_private_t *pvt = NULL; + char *stateino = NULL; + int child_count = 0, i = 0, cnt = 0, ret = 0; + call_frame_t *prev_frame = NULL; + xlator_t **children = NULL; + uint64_t tmp_stateino = 0; + + local = 
frame->local; + pvt = this->private; + child_count = pvt->child_count; + prev_frame = cookie; + children = pvt->children; + + for (i = 0; i < child_count; i++) + if (prev_frame->this == children[i]) + break; + + if (op_ret == -1) { + local->op_errno = op_errno; + gf_log (this->name, GF_LOG_ERROR, "(path=%s) (op_ret=%d op_errno=%d)", local->stub->args.mknod.loc.path, op_ret, op_errno); + } + + ret = inode_ctx_get (local->stub->args.mknod.loc.inode, + this, &tmp_stateino); + stateino = (char *)(long)tmp_stateino; + + if (ret != 0) { + gf_log (this->name, GF_LOG_ERROR, "inode_ctx_get() error"); + /* FIXME: handle the case */ + } + if (op_ret == 0) { + stateino[i] = 1; + local->op_ret = 0; + local->first_success = 1; + local->buf = *buf; + } + cnt = --local->call_count; + for (i = local->active + 1; i < child_count; i++) { + if (local->state[i]) + break; + } + + if (cnt == 0 || i == child_count) { + call_stub_t *stub = local->stub; + FREE (local->state); + stub = local->stub; + STACK_UNWIND (frame, local->op_ret, local->op_errno, local->stub->args.mknod.loc.inode, &local->buf); + call_stub_destroy (stub); + return 0; + } + + local->active = i; + + if (local->first_success == 0) { + STACK_WIND (frame, + ha_mknod_cbk, + children[i], + children[i]->fops->mknod, + &local->stub->args.mknod.loc, + local->stub->args.mknod.mode, + local->stub->args.mknod.rdev); + return 0; + } + cnt = local->call_count; + + for (; i < child_count; i++) { + if (local->state[i]) { + STACK_WIND (frame, + ha_mknod_lookup_cbk, + children[i], + children[i]->fops->lookup, + &local->stub->args.mknod.loc, + 0); + if (--cnt == 0) + break; + } + } + return 0; +} + +int32_t +ha_mknod (call_frame_t *frame, + xlator_t *this, + loc_t *loc, + mode_t mode, + dev_t rdev) +{ + ha_local_t *local = NULL; + ha_private_t *pvt = NULL; + int child_count = 0, i = 0; + char *stateino = NULL; + + local = frame->local; + pvt = this->private; + child_count = pvt->child_count; + + frame->local = local = CALLOC (1, sizeof 
(*local)); + local->stub = fop_mknod_stub (frame, ha_mknod, loc, mode, rdev); + local->op_ret = -1; + local->op_errno = ENOTCONN; + local->state = CALLOC (1, child_count); + memcpy (local->state, pvt->state, child_count); + local->active = -1; + + stateino = CALLOC (1, child_count); + inode_ctx_put (loc->inode, this, (uint64_t)(long)stateino); + + for (i = 0; i < child_count; i++) { + if (local->state[i]) { + local->call_count++; + if (local->active == -1) + local->active = i; + } + } + + STACK_WIND (frame, + ha_mknod_cbk, + HA_ACTIVE_CHILD(this, local), + HA_ACTIVE_CHILD(this, local)->fops->mknod, + loc, mode, rdev); + return 0; +} + + +int +ha_mkdir_lookup_cbk (call_frame_t *frame, + void *cookie, + xlator_t *this, + int32_t op_ret, + int32_t op_errno, + inode_t *inode, + struct stat *buf, + dict_t *dict) +{ + ha_local_t *local = NULL; + ha_private_t *pvt = NULL; + char *stateino = NULL; + int child_count = 0, i = 0, cnt = 0; + call_frame_t *prev_frame = NULL; + xlator_t **children = NULL; + uint64_t tmp_stateino = 0; + + local = frame->local; + pvt = this->private; + child_count = pvt->child_count; + prev_frame = cookie; + children = pvt->children; + + for (i = 0; i < child_count; i++) + if (prev_frame->this == children[i]) + break; + + if (op_ret == -1) { + gf_log (this->name, GF_LOG_ERROR, "(path=%s) (op_ret=%d op_errno=%d)", local->stub->args.mkdir.loc.path, op_ret, op_errno); + } + inode_ctx_get (local->stub->args.mkdir.loc.inode, + this, &tmp_stateino); + stateino = (char *)(long)tmp_stateino; + + if (op_ret == 0) + stateino[i] = 1; + + LOCK (&frame->lock); + cnt = --local->call_count; + UNLOCK (&frame->lock); + + if (cnt == 0) { + call_stub_t *stub = local->stub; + FREE (local->state); + STACK_UNWIND (frame, + local->op_ret, + local->op_errno, + local->stub->args.mkdir.loc.inode, + &local->buf); + call_stub_destroy (stub); + } + return 0; +} + +int32_t +ha_mkdir_cbk (call_frame_t *frame, + void *cookie, + xlator_t *this, + int32_t op_ret, + int32_t 
op_errno, + inode_t *inode, + struct stat *buf) +{ + ha_local_t *local = NULL; + ha_private_t *pvt = NULL; + char *stateino = NULL; + int child_count = 0, i = 0, cnt = 0; + call_frame_t *prev_frame = NULL; + xlator_t **children = NULL; + uint64_t tmp_stateino = 0; + + local = frame->local; + pvt = this->private; + child_count = pvt->child_count; + prev_frame = cookie; + children = pvt->children; + + for (i = 0; i < child_count; i++) + if (prev_frame->this == children[i]) + break; + + if (op_ret == -1) { + local->op_errno = op_errno; + gf_log (this->name, GF_LOG_ERROR, "(path=%s) (op_ret=%d op_errno=%d)", local->stub->args.mkdir.loc.path, op_ret, op_errno); + } + + inode_ctx_get (local->stub->args.mkdir.loc.inode, + this, &tmp_stateino); + stateino = (char *)(long)tmp_stateino; + + if (op_ret == 0) { + stateino[i] = 1; + local->op_ret = 0; + local->first_success = 1; + local->buf = *buf; + } + cnt = --local->call_count; + for (i = local->active + 1; i < child_count; i++) { + if (local->state[i]) + break; + } + + if (cnt == 0 || i == child_count) { + call_stub_t *stub = local->stub; + FREE (local->state); + stub = local->stub; + STACK_UNWIND (frame, local->op_ret, local->op_errno, local->stub->args.mkdir.loc.inode, &local->buf); + call_stub_destroy (stub); + return 0; + } + + local->active = i; + + if (local->first_success == 0) { + STACK_WIND (frame, + ha_mkdir_cbk, + children[i], + children[i]->fops->mkdir, + &local->stub->args.mkdir.loc, + local->stub->args.mkdir.mode); + return 0; + } + cnt = local->call_count; + + for (; i < child_count; i++) { + if (local->state[i]) { + STACK_WIND (frame, + ha_mkdir_lookup_cbk, + children[i], + children[i]->fops->lookup, + &local->stub->args.mkdir.loc, + 0); + if (--cnt == 0) + break; + } + } + return 0; +} + +int32_t +ha_mkdir (call_frame_t *frame, + xlator_t *this, + loc_t *loc, + mode_t mode) +{ + ha_local_t *local = NULL; + ha_private_t *pvt = NULL; + int child_count = 0, i = 0; + char *stateino = NULL; + + local = 
frame->local; + pvt = this->private; + child_count = pvt->child_count; + + frame->local = local = CALLOC (1, sizeof (*local)); + local->stub = fop_mkdir_stub (frame, ha_mkdir, loc, mode); + local->op_ret = -1; + local->op_errno = ENOTCONN; + local->state = CALLOC (1, child_count); + memcpy (local->state, pvt->state, child_count); + local->active = -1; + + stateino = CALLOC (1, child_count); + inode_ctx_put (loc->inode, this, (uint64_t)(long)stateino); + for (i = 0; i < child_count; i++) { + if (local->state[i]) { + local->call_count++; + if (local->active == -1) + local->active = i; + } + } + + STACK_WIND (frame, + ha_mkdir_cbk, + HA_ACTIVE_CHILD(this, local), + HA_ACTIVE_CHILD(this, local)->fops->mkdir, + loc, mode); + return 0; +} + + int32_t +ha_unlink_cbk (call_frame_t *frame, + void *cookie, + xlator_t *this, + int32_t op_ret, + int32_t op_errno) +{ + int ret = -1; + + ret = ha_handle_cbk (frame, cookie, op_ret, op_errno); + if (ret == 0) { + STACK_UNWIND (frame, op_ret, op_errno); + } + return 0; +} + +int32_t +ha_unlink (call_frame_t *frame, + xlator_t *this, + loc_t *loc) +{ + ha_local_t *local = NULL; + int op_errno = 0; + + op_errno = ha_alloc_init_inode (frame, loc->inode); + + if (op_errno < 0) { + op_errno = -op_errno; + goto err; + } + local = frame->local; + local->stub = fop_unlink_stub (frame, ha_unlink, loc); + + STACK_WIND_COOKIE (frame, + ha_unlink_cbk, + (void *)(long)local->active, + HA_ACTIVE_CHILD(this, local), + HA_ACTIVE_CHILD(this, local)->fops->unlink, + loc); + return 0; +err: + STACK_UNWIND (frame, -1, op_errno); + return 0; +} + + int32_t +ha_rmdir_cbk (call_frame_t *frame, + void *cookie, + xlator_t *this, + int32_t op_ret, + int32_t op_errno) +{ + int ret = -1; + + ret = ha_handle_cbk (frame, cookie, op_ret, op_errno); + + if (ret == 0) { + STACK_UNWIND (frame, + op_ret, + op_errno); + } + return 0; +} + +int32_t +ha_rmdir (call_frame_t *frame, + xlator_t *this, + loc_t *loc) +{ + ha_local_t *local = frame->local; + int op_errno = 
0; + + op_errno = ha_alloc_init_inode (frame, loc->inode); + if (op_errno < 0) { + op_errno = -op_errno; + goto err; + } + local = frame->local; + local->stub = fop_rmdir_stub (frame, ha_rmdir, loc); + + STACK_WIND_COOKIE (frame, + ha_rmdir_cbk, + (void *)(long)local->active, + HA_ACTIVE_CHILD(this, local), + HA_ACTIVE_CHILD(this, local)->fops->rmdir, + loc); + return 0; +err: + STACK_UNWIND (frame, -1, op_errno); + return 0; +} + + +int +ha_symlink_lookup_cbk (call_frame_t *frame, + void *cookie, + xlator_t *this, + int32_t op_ret, + int32_t op_errno, + inode_t *inode, + struct stat *buf, + dict_t *dict) +{ + ha_local_t *local = NULL; + ha_private_t *pvt = NULL; + char *stateino = NULL; + int child_count = 0, i = 0, cnt = 0; + call_frame_t *prev_frame = NULL; + xlator_t **children = NULL; + uint64_t tmp_stateino = 0; + + local = frame->local; + pvt = this->private; + child_count = pvt->child_count; + prev_frame = cookie; + children = pvt->children; + + for (i = 0; i < child_count; i++) + if (prev_frame->this == children[i]) + break; + + if (op_ret == -1) { + gf_log (this->name, GF_LOG_ERROR, "(path=%s) (op_ret=%d op_errno=%d)", local->stub->args.symlink.loc.path, op_ret, op_errno); + } + inode_ctx_get (local->stub->args.symlink.loc.inode, + this, &tmp_stateino); + stateino = (char *)(long)tmp_stateino; + + if (op_ret == 0) + stateino[i] = 1; + + LOCK (&frame->lock); + cnt = --local->call_count; + UNLOCK (&frame->lock); + + if (cnt == 0) { + call_stub_t *stub = local->stub; + FREE (local->state); + STACK_UNWIND (frame, + local->op_ret, + local->op_errno, + local->stub->args.symlink.loc.inode, + &local->buf); + call_stub_destroy (stub); + } + return 0; +} + +int32_t +ha_symlink_cbk (call_frame_t *frame, + void *cookie, + xlator_t *this, + int32_t op_ret, + int32_t op_errno, + inode_t *inode, + struct stat *buf) +{ + ha_local_t *local = NULL; + ha_private_t *pvt = NULL; + char *stateino = NULL; + int child_count = 0, i = 0, cnt = 0; + call_frame_t *prev_frame = NULL; 
+ xlator_t **children = NULL; + uint64_t tmp_stateino = 0; + + local = frame->local; + pvt = this->private; + child_count = pvt->child_count; + prev_frame = cookie; + children = pvt->children; + + for (i = 0; i < child_count; i++) + if (prev_frame->this == children[i]) + break; + + if (op_ret == -1) { + local->op_errno = op_errno; + gf_log (this->name, GF_LOG_ERROR, "(path=%s) (op_ret=%d op_errno=%d)", local->stub->args.symlink.loc.path, op_ret, op_errno); + } + inode_ctx_get (local->stub->args.symlink.loc.inode, + this, &tmp_stateino); + stateino = (char *)(long)tmp_stateino; + + if (op_ret == 0) { + stateino[i] = 1; + local->op_ret = 0; + local->first_success = 1; + local->buf = *buf; + } + cnt = --local->call_count; + for (i = local->active + 1; i < child_count; i++) { + if (local->state[i]) + break; + } + + if (cnt == 0 || i == child_count) { + call_stub_t *stub = local->stub; + FREE (local->state); + stub = local->stub; + STACK_UNWIND (frame, local->op_ret, local->op_errno, + local->stub->args.symlink.loc.inode, &local->buf); + call_stub_destroy (stub); + return 0; + } + + local->active = i; + + if (local->first_success == 0) { + STACK_WIND (frame, + ha_symlink_cbk, + children[i], + children[i]->fops->symlink, + local->stub->args.symlink.linkname, + &local->stub->args.symlink.loc); + return 0; + } + cnt = local->call_count; + + for (; i < child_count; i++) { + if (local->state[i]) { + STACK_WIND (frame, + ha_symlink_lookup_cbk, + children[i], + children[i]->fops->lookup, + &local->stub->args.symlink.loc, + 0); + if (--cnt == 0) + break; + } + } + return 0; +} + +int32_t +ha_symlink (call_frame_t *frame, + xlator_t *this, + const char *linkname, + loc_t *loc) +{ + ha_local_t *local = NULL; + ha_private_t *pvt = NULL; + int child_count = 0, i = 0; + char *stateino = NULL; + + local = frame->local; + pvt = this->private; + child_count = pvt->child_count; + + frame->local = local = CALLOC (1, sizeof (*local)); + local->stub = fop_symlink_stub (frame, ha_symlink, 
linkname, loc); + local->op_ret = -1; + local->op_errno = ENOTCONN; + local->state = CALLOC (1, child_count); + memcpy (local->state, pvt->state, child_count); + local->active = -1; + + stateino = CALLOC (1, child_count); + inode_ctx_put (loc->inode, this, (uint64_t)(long)stateino); + + for (i = 0; i < child_count; i++) { + if (local->state[i]) { + local->call_count++; + if (local->active == -1) { + local->active = i; + } + } + } + + STACK_WIND (frame, + ha_symlink_cbk, + HA_ACTIVE_CHILD(this, local), + HA_ACTIVE_CHILD(this, local)->fops->symlink, + linkname, loc); + return 0; +} + + int32_t +ha_rename_cbk (call_frame_t *frame, + void *cookie, + xlator_t *this, + int32_t op_ret, + int32_t op_errno, + struct stat *buf) +{ + int ret = -1; + + ret = ha_handle_cbk (frame, cookie, op_ret, op_errno); + + if (ret == 0) { + STACK_UNWIND (frame, op_ret, op_errno, buf); + } + return 0; +} + +int32_t +ha_rename (call_frame_t *frame, + xlator_t *this, + loc_t *oldloc, + loc_t *newloc) +{ + ha_local_t *local = NULL; + int op_errno = 0; + + op_errno = ha_alloc_init_inode (frame, oldloc->inode); + if (op_errno < 0) { + op_errno = -op_errno; + goto err; + } + local = frame->local; + local->stub = fop_rename_stub (frame, ha_rename, oldloc, newloc); + STACK_WIND_COOKIE (frame, + ha_rename_cbk, + (void *)(long)local->active, + HA_ACTIVE_CHILD(this, local), + HA_ACTIVE_CHILD(this, local)->fops->rename, + oldloc, newloc); + return 0; +err: + STACK_UNWIND (frame, -1, op_errno, NULL); + return 0; +} + +int +ha_link_lookup_cbk (call_frame_t *frame, + void *cookie, + xlator_t *this, + int32_t op_ret, + int32_t op_errno, + inode_t *inode, + struct stat *buf, + dict_t *dict) +{ + ha_local_t *local = NULL; + ha_private_t *pvt = NULL; + char *stateino = NULL; + int child_count = 0, i = 0, cnt = 0; + call_frame_t *prev_frame = NULL; + xlator_t **children = NULL; + uint64_t tmp_stateino = 0; + + local = frame->local; + pvt = this->private; + child_count = pvt->child_count; + prev_frame = cookie; 
+ children = pvt->children; + + for (i = 0; i < child_count; i++) + if (prev_frame->this == children[i]) + break; + + if (op_ret == -1) { + gf_log (this->name, GF_LOG_ERROR, "(path=%s) (op_ret=%d op_errno=%d)", local->stub->args.link.newloc.path, op_ret, op_errno); + } + inode_ctx_get (local->stub->args.link.newloc.inode, + this, &tmp_stateino); + stateino = (char *)(long)tmp_stateino; + + if (op_ret == 0) + stateino[i] = 1; + + LOCK (&frame->lock); + cnt = --local->call_count; + UNLOCK (&frame->lock); + + if (cnt == 0) { + call_stub_t *stub = local->stub; + FREE (local->state); + STACK_UNWIND (frame, + local->op_ret, + local->op_errno, + local->stub->args.link.oldloc.inode, + &local->buf); + call_stub_destroy (stub); + } + return 0; +} + +int32_t +ha_link_cbk (call_frame_t *frame, + void *cookie, + xlator_t *this, + int32_t op_ret, + int32_t op_errno, + inode_t *inode, + struct stat *buf) +{ + ha_local_t *local = NULL; + ha_private_t *pvt = NULL; + char *stateino = NULL; + int child_count = 0, i = 0, cnt = 0; + call_frame_t *prev_frame = NULL; + xlator_t **children = NULL; + uint64_t tmp_stateino = 0; + + local = frame->local; + pvt = this->private; + child_count = pvt->child_count; + prev_frame = cookie; + children = pvt->children; + + for (i = 0; i < child_count; i++) + if (prev_frame->this == children[i]) + break; + + if (op_ret == -1) { + local->op_errno = op_errno; + gf_log (this->name, GF_LOG_ERROR, "(path=%s) (op_ret=%d op_errno=%d)", local->stub->args.link.newloc.path, op_ret, op_errno); + } + inode_ctx_get (local->stub->args.link.newloc.inode, + this, &tmp_stateino); + stateino = (char *)(long)tmp_stateino; + + if (op_ret == 0) { + stateino[i] = 1; + local->op_ret = 0; + local->first_success = 1; + local->buf = *buf; + } + cnt = --local->call_count; + for (i = local->active + 1; i < child_count; i++) { + if (local->state[i]) + break; + } + + if (cnt == 0 || i == child_count) { + call_stub_t *stub = local->stub; + FREE (local->state); + stub = local->stub; 
+ STACK_UNWIND (frame, local->op_ret, local->op_errno, local->stub->args.link.oldloc.inode, &local->buf); + call_stub_destroy (stub); + return 0; + } + + local->active = i; + + if (local->first_success == 0) { + STACK_WIND (frame, + ha_link_cbk, + children[i], + children[i]->fops->link, + &local->stub->args.link.oldloc, + &local->stub->args.link.newloc); + return 0; + } + cnt = local->call_count; + + for (; i < child_count; i++) { + if (local->state[i]) { + STACK_WIND (frame, + ha_link_lookup_cbk, + children[i], + children[i]->fops->lookup, + &local->stub->args.link.newloc, + 0); + if (--cnt == 0) + break; + } + } + return 0; +} + +int32_t +ha_link (call_frame_t *frame, + xlator_t *this, + loc_t *oldloc, + loc_t *newloc) +{ + ha_local_t *local = NULL; + ha_private_t *pvt = NULL; + int child_count = 0, i = 0; + char *stateino = NULL; + int32_t ret = 0; + uint64_t tmp_stateino = 0; + + ret = inode_ctx_get (newloc->inode, this, &tmp_stateino); + if (ret != 0) { + gf_log (this->name, GF_LOG_ERROR, "dict_ptr_error()"); + } + stateino = (char *)(long)tmp_stateino; + + if (stateino == NULL) { + gf_log (this->name, GF_LOG_ERROR, + "newloc->inode's ctx is NULL, returning EINVAL"); + STACK_UNWIND (frame, -1, EINVAL, oldloc->inode, NULL); + return 0; + } + + local = frame->local; + pvt = this->private; + child_count = pvt->child_count; + + frame->local = local = CALLOC (1, sizeof (*local)); + local->stub = fop_link_stub (frame, ha_link, oldloc, newloc); + local->op_ret = -1; + local->op_errno = ENOTCONN; + local->state = CALLOC (1, child_count); + memcpy (local->state, pvt->state, child_count); + local->active = -1; + + for (i = 0; i < child_count; i++) { + if (local->state[i]) { + local->call_count++; + if (local->active == -1) + local->active = i; + } + } + + STACK_WIND (frame, + ha_link_cbk, + HA_ACTIVE_CHILD(this, local), + HA_ACTIVE_CHILD(this, local)->fops->link, + oldloc, + newloc); + return 0; +} + +int32_t +ha_create_cbk (call_frame_t *frame, + void *cookie, + 
xlator_t *this, + int32_t op_ret, + int32_t op_errno, + fd_t *fd, + inode_t *inode, + struct stat *buf) +{ + ha_local_t *local = NULL; + ha_private_t *pvt = NULL; + int i, child_count = 0, cnt = 0, ret = 0; + char *stateino = NULL; + hafd_t *hafdp = NULL; + call_frame_t *prev_frame = NULL; + xlator_t **children = NULL; + uint64_t tmp_stateino = 0; + uint64_t tmp_hafdp = 0; + + local = frame->local; + pvt = this->private; + child_count = pvt->child_count; + prev_frame = cookie; + children = pvt->children; + + ret = inode_ctx_get (local->stub->args.create.loc.inode, + this, &tmp_stateino); + stateino = (char *)(long)tmp_stateino; + + if (ret != 0) { + gf_log (this->name, GF_LOG_ERROR, "dict_to_ptr() error"); + /* FIXME: handle */ + } + ret = fd_ctx_get (local->stub->args.create.fd, this, &tmp_hafdp); + if (ret != 0) { + gf_log (this->name, GF_LOG_ERROR, "dict_to_ptr() error"); + /* FIXME: handle */ + } + hafdp = (hafd_t *)(long)tmp_hafdp; + + for (i = 0; i < child_count; i++) { + if (prev_frame->this == children[i]) + break; + } + + if (op_ret == -1) { + local->op_errno = op_errno; + gf_log (this->name, GF_LOG_ERROR, "(path=%s) (op_ret=%d op_errno=%d)", local->stub->args.create.loc.path, op_ret, op_errno); + } + if (op_ret != -1) { + stateino[i] = 1; + hafdp->fdstate[i] = 1; + if (local->op_ret == -1) { + local->op_ret = 0; + local->buf = *buf; + local->first_success = 1; + } + local->stub->args.create.flags &= (~O_EXCL); + } + LOCK (&frame->lock); + cnt = --local->call_count; + UNLOCK (&frame->lock); + + for (i = local->active + 1; i < child_count; i++) { + if (local->state[i]) + break; + } + + if (cnt == 0 || i == child_count) { + char *state = local->state; + call_stub_t *stub = local->stub; + STACK_UNWIND (frame, local->op_ret, local->op_errno, + stub->args.create.fd, + stub->args.create.loc.inode, &local->buf); + FREE (state); + call_stub_destroy (stub); + return 0; + } + local->active = i; + cnt = local->call_count; + for (; i < child_count; i++) { + if 
(local->state[i]) { + STACK_WIND (frame, + ha_create_cbk, + children[i], + children[i]->fops->create, + &local->stub->args.create.loc, + local->stub->args.create.flags, + local->stub->args.create.mode, + local->stub->args.create.fd); + if ((local->first_success == 0) || (cnt == 0)) + break; + } + } + return 0; +} + +int32_t +ha_create (call_frame_t *frame, + xlator_t *this, + loc_t *loc, + int32_t flags, + mode_t mode, fd_t *fd) +{ + ha_local_t *local = NULL; + ha_private_t *pvt = NULL; + int i, child_count = 0; + char *stateino = NULL; + xlator_t **children = NULL; + hafd_t *hafdp = NULL; + + local = frame->local; + pvt = this->private; + child_count = pvt->child_count; + children = pvt->children; + + if (local == NULL) { + local = frame->local = CALLOC (1, sizeof (*local)); + local->stub = fop_create_stub (frame, ha_create, loc, flags, mode, fd); + local->state = CALLOC (1, child_count); + local->active = -1; + local->op_ret = -1; + local->op_errno = ENOTCONN; + memcpy (local->state, pvt->state, child_count); + + for (i = 0; i < pvt->child_count; i++) { + if (local->state[i]) { + local->call_count++; + if (local->active == -1) + local->active = i; + } + } + /* FIXME handle active -1 */ + stateino = CALLOC (1, child_count); + hafdp = CALLOC (1, sizeof (*hafdp)); + hafdp->fdstate = CALLOC (1, child_count); + hafdp->path = strdup(loc->path); + LOCK_INIT (&hafdp->lock); + fd_ctx_set (fd, this, (uint64_t)(long)hafdp); + inode_ctx_put (loc->inode, this, (uint64_t)(long)stateino); + } + + STACK_WIND (frame, + ha_create_cbk, + children[local->active], + children[local->active]->fops->create, + loc, flags, mode, fd); + return 0; +} + + int32_t +ha_open_cbk (call_frame_t *frame, + void *cookie, + xlator_t *this, + int32_t op_ret, + int32_t op_errno, + fd_t *fd) +{ + ha_local_t *local = NULL; + ha_private_t *pvt = NULL; + xlator_t **children = NULL; + int i = 0, child_count = 0, callcnt = 0, ret = 0; + call_frame_t *prev_frame = NULL; + hafd_t *hafdp = NULL; + uint64_t 
tmp_hafdp = 0; + + local = frame->local; + pvt = this->private; + children = pvt->children; + child_count = pvt->child_count; + prev_frame = cookie; + + ret = fd_ctx_get (local->fd, this, &tmp_hafdp); + if (ret != 0) { + gf_log (this->name, GF_LOG_ERROR, "dict_ptr_error()"); + } + hafdp = (hafd_t *)(long)tmp_hafdp; + + for (i = 0; i < child_count; i++) + if (children[i] == prev_frame->this) + break; + LOCK (&frame->lock); + if (op_ret != -1) { + hafdp->fdstate[i] = 1; + local->op_ret = 0; + } + if (op_ret == -1 && op_errno != ENOTCONN) + local->op_errno = op_errno; + callcnt = --local->call_count; + UNLOCK (&frame->lock); + + if (callcnt == 0) { + STACK_UNWIND (frame, + local->op_ret, + local->op_errno, + local->fd); + } + return 0; +} + +int32_t +ha_open (call_frame_t *frame, + xlator_t *this, + loc_t *loc, + int32_t flags, fd_t *fd) +{ + ha_local_t *local = NULL; + ha_private_t *pvt = NULL; + char *stateino = NULL; + xlator_t **children = NULL; + int cnt = 0, i, child_count = 0, ret = 0; + hafd_t *hafdp = NULL; + uint64_t tmp_stateino = 0; + + local = frame->local; + pvt = this->private; + children = pvt->children; + child_count = pvt->child_count; + + + local = frame->local = CALLOC (1, sizeof (*local)); + local->op_ret = -1; + local->op_errno = ENOTCONN; + local->fd = fd; + + hafdp = CALLOC (1, sizeof (*hafdp)); + hafdp->fdstate = CALLOC (1, child_count); + hafdp->path = strdup (loc->path); + hafdp->active = -1; + if (pvt->pref_subvol == -1) { + hafdp->active = fd->inode->ino % child_count; + } + + LOCK_INIT (&hafdp->lock); + fd_ctx_set (fd, this, (uint64_t)(long)hafdp); + ret = inode_ctx_get (loc->inode, this, &tmp_stateino); + stateino = (char *)(long)tmp_stateino; + + for (i = 0; i < child_count; i++) + if (stateino[i]) + cnt++; + local->call_count = cnt; + for (i = 0; i < child_count; i++) { + if (stateino[i]) { + STACK_WIND (frame, + ha_open_cbk, + children[i], + children[i]->fops->open, + loc, flags, fd); + if (--cnt == 0) + break; + } + } + return 0; +} 
+ + int32_t +ha_readv_cbk (call_frame_t *frame, + void *cookie, + xlator_t *this, + int32_t op_ret, + int32_t op_errno, + struct iovec *vector, + int32_t count, + struct stat *stbuf) +{ + int ret = 0; + + ret = ha_handle_cbk (frame, cookie, op_ret, op_errno); + + if (ret == 0) { + STACK_UNWIND (frame, + op_ret, + op_errno, + vector, + count, + stbuf); + } + return 0; +} + +int32_t +ha_readv (call_frame_t *frame, + xlator_t *this, + fd_t *fd, + size_t size, + off_t offset) +{ + ha_local_t *local = NULL; + int op_errno = 0; + + op_errno = ha_alloc_init_fd (frame, fd); + if (op_errno < 0) { + op_errno = -op_errno; + goto err; + } + local = frame->local; + local->stub = fop_readv_stub (frame, ha_readv, fd, size, offset); + + STACK_WIND_COOKIE (frame, + ha_readv_cbk, + (void *)(long)local->active, + HA_ACTIVE_CHILD(this, local), + HA_ACTIVE_CHILD(this, local)->fops->readv, + fd, + size, + offset); + return 0; +err: + STACK_UNWIND (frame, -1, op_errno, NULL); + return 0; +} + + int32_t +ha_writev_cbk (call_frame_t *frame, + void *cookie, + xlator_t *this, + int32_t op_ret, + int32_t op_errno, + struct stat *stbuf) +{ + int ret = 0; + ret = ha_handle_cbk (frame, cookie, op_ret, op_errno); + + if (ret == 0) { + STACK_UNWIND (frame, + op_ret, + op_errno, + stbuf); + } + return 0; +} + +int32_t +ha_writev (call_frame_t *frame, + xlator_t *this, + fd_t *fd, + struct iovec *vector, + int32_t count, + off_t off) +{ + ha_local_t *local = NULL; + int op_errno = 0; + + op_errno = ha_alloc_init_fd (frame, fd); + if (op_errno < 0) { + op_errno = -op_errno; + goto err; + } + local = frame->local; + local->stub = fop_writev_stub (frame, ha_writev, fd, vector, count, off); + + STACK_WIND_COOKIE (frame, + ha_writev_cbk, + (void *)(long)local->active, + HA_ACTIVE_CHILD(this, local), + HA_ACTIVE_CHILD(this, local)->fops->writev, + fd, + vector, + count, + off); + return 0; +err: + STACK_UNWIND (frame, -1, op_errno, NULL); + return 0; +} + + int32_t +ha_flush_cbk (call_frame_t *frame, + 
void *cookie, + xlator_t *this, + int32_t op_ret, + int32_t op_errno) +{ + int ret = 0; + /* the reply is passed up only when ha_handle_cbk() returns 0 */ + ret = ha_handle_cbk (frame, cookie, op_ret, op_errno); + + if (ret == 0) { + STACK_UNWIND (frame, + op_ret, + op_errno); + } + return 0; +} + +/* ha_flush: flush fop. ha_alloc_init_fd() returns a negative errno on failure (unwound as -1/errno); otherwise a resume stub is kept in local->stub and the call is wound to the child at index local->active, with that index as the cookie. */ +int32_t +ha_flush (call_frame_t *frame, + xlator_t *this, + fd_t *fd) +{ + ha_local_t *local = NULL; + int op_errno = 0; + + op_errno = ha_alloc_init_fd (frame, fd); + if (op_errno < 0) { + op_errno = -op_errno; + goto err; + } + local = frame->local; + local->stub = fop_flush_stub (frame, ha_flush, fd); + STACK_WIND_COOKIE (frame, + ha_flush_cbk, + (void *)(long)local->active, + HA_ACTIVE_CHILD(this, local), + HA_ACTIVE_CHILD(this, local)->fops->flush, + fd); + return 0; +err: + STACK_UNWIND (frame, -1, op_errno); + return 0; +} + + +/* ha_fsync_cbk: reply handler for fsync; unwinds only when ha_handle_cbk() returns 0. */ + int32_t +ha_fsync_cbk (call_frame_t *frame, + void *cookie, + xlator_t *this, + int32_t op_ret, + int32_t op_errno) +{ + int ret = 0; + ret = ha_handle_cbk (frame, cookie, op_ret, op_errno); + + if (ret == 0) { + STACK_UNWIND (frame, + op_ret, + op_errno); + } + return 0; +} + +/* ha_fsync: fsync fop; same set-up as ha_flush, wound to the child at index local->active. */ +int32_t +ha_fsync (call_frame_t *frame, + xlator_t *this, + fd_t *fd, + int32_t flags) +{ + ha_local_t *local = NULL; + int op_errno = 0; + + op_errno = ha_alloc_init_fd (frame, fd); + if (op_errno < 0) { + op_errno = -op_errno; + goto err; + } + local = frame->local; + local->stub = fop_fsync_stub (frame, ha_fsync, fd, flags); + STACK_WIND_COOKIE (frame, + ha_fsync_cbk, + (void *)(long)local->active, + HA_ACTIVE_CHILD(this, local), + HA_ACTIVE_CHILD(this, local)->fops->fsync, + fd, + flags); + return 0; +err: + STACK_UNWIND (frame, -1, op_errno); + return 0; +} + +/* ha_fstat_cbk: reply handler for fstat; unwinds (op_ret, op_errno, buf) only when ha_handle_cbk() returns 0. */ + int32_t +ha_fstat_cbk (call_frame_t *frame, + void *cookie, + xlator_t *this, + int32_t op_ret, + int32_t op_errno, + struct stat *buf) +{ + int ret = 0; + + ret = ha_handle_cbk (frame, cookie, op_ret, op_errno); + + if (ret == 0) { + STACK_UNWIND (frame, + op_ret, + op_errno, + buf); + } + return 0; +} + +int32_t +ha_fstat (call_frame_t *frame, + xlator_t *this, + fd_t 
*fd) +{ + ha_local_t *local = NULL; + int op_errno = 0; + + op_errno = ha_alloc_init_fd (frame, fd); + + if (op_errno < 0) { + op_errno = -op_errno; + goto err; + } + local = frame->local; + local->stub = fop_fstat_stub (frame, ha_fstat, fd); + STACK_WIND_COOKIE (frame, + ha_fstat_cbk, + (void *)(long)local->active, + HA_ACTIVE_CHILD(this, local), + HA_ACTIVE_CHILD(this, local)->fops->fstat, + fd); + return 0; +err: + STACK_UNWIND (frame, -1, op_errno, NULL); + return 0; +} + +/* ha_opendir_cbk: collects one child's opendir reply. A success marks that child in the fd's fdstate array and makes the overall op_ret 0; a failure other than ENOTCONN is remembered as the overall op_errno. The frame is unwound with local->fd when the last outstanding reply arrives. */ +int32_t +ha_opendir_cbk (call_frame_t *frame, + void *cookie, + xlator_t *this, + int32_t op_ret, + int32_t op_errno, + fd_t *fd) +{ + ha_local_t *local = NULL; + ha_private_t *pvt = NULL; + xlator_t **children = NULL; + int i = 0, child_count = 0, callcnt = 0, ret = 0; + call_frame_t *prev_frame = NULL; + hafd_t *hafdp = NULL; + uint64_t tmp_hafdp = 0; + + local = frame->local; + pvt = this->private; + children = pvt->children; + child_count = pvt->child_count; + prev_frame = cookie; + + ret = fd_ctx_get (local->fd, this, &tmp_hafdp); + if (ret != 0) { + gf_log (this->name, GF_LOG_ERROR, "dict_ptr_error()"); + } + hafdp = (hafd_t *)(long)tmp_hafdp; + + /* find the index of the child that sent this reply */ + for (i = 0; i < child_count; i++) + if (children[i] == prev_frame->this) + break; + LOCK (&frame->lock); + if (op_ret != -1) { + /* skip the per-fd bookkeeping when the fd context is missing or the sender is not a known child, instead of writing through NULL or past the array */ + if (hafdp != NULL && i < child_count) + hafdp->fdstate[i] = 1; + local->op_ret = 0; + } + if (op_ret == -1 && op_errno != ENOTCONN) + local->op_errno = op_errno; + callcnt = --local->call_count; + UNLOCK (&frame->lock); + + if (callcnt == 0) { + STACK_UNWIND (frame, + local->op_ret, + local->op_errno, + local->fd); + } + return 0; +} + +int32_t +ha_opendir (call_frame_t *frame, + xlator_t *this, + loc_t *loc, fd_t *fd) +{ + ha_local_t *local = NULL; + ha_private_t *pvt = NULL; + char *stateino = NULL; + xlator_t **children = NULL; + int cnt = 0, i, child_count = 0, ret = 0; + hafd_t *hafdp = NULL; + uint64_t tmp_stateino = 0; + + local = frame->local; + pvt = this->private; + children = pvt->children; + child_count = pvt->child_count; + + local = 
frame->local = CALLOC (1, sizeof (*local)); + local->op_ret = -1; + local->op_errno = ENOTCONN; + local->fd = fd; + + hafdp = CALLOC (1, sizeof (*hafdp)); + hafdp->fdstate = CALLOC (1, child_count); + hafdp->path = strdup (loc->path); + LOCK_INIT (&hafdp->lock); + fd_ctx_set (fd, this, (uint64_t)(long)hafdp); + ret = inode_ctx_get (loc->inode, this, &tmp_stateino); + stateino = (char *)(long)tmp_stateino; + + if (ret != 0) { + gf_log (this->name, GF_LOG_ERROR, "inode_ctx_get() error"); + } + for (i = 0; i < child_count; i++) + if (stateino[i]) + cnt++; + local->call_count = cnt; + for (i = 0; i < child_count; i++) { + if (stateino[i]) { + STACK_WIND (frame, + ha_opendir_cbk, + children[i], + children[i]->fops->opendir, + loc, fd); + if (--cnt == 0) + break; + } + } + return 0; +} + + int32_t +ha_getdents_cbk (call_frame_t *frame, + void *cookie, + xlator_t *this, + int32_t op_ret, + int32_t op_errno, + dir_entry_t *entries, + int32_t count) +{ + int ret = 0; + + ret = ha_handle_cbk (frame, cookie, op_ret, op_errno); + if (ret == 0) { + STACK_UNWIND (frame, + op_ret, + op_errno, + entries, + count); + } + return 0; +} + +int32_t +ha_getdents (call_frame_t *frame, + xlator_t *this, + fd_t *fd, + size_t size, + off_t offset, + int32_t flag) +{ + ha_local_t *local = NULL; + int op_errno = 0; + + op_errno = ha_alloc_init_fd (frame, fd); + if (op_errno < 0) { + op_errno = -op_errno; + goto err; + } + local = frame->local; + local->stub = fop_getdents_stub (frame, ha_getdents, fd, size, offset, flag); + STACK_WIND_COOKIE (frame, + ha_getdents_cbk, + (void *)(long)local->active, + HA_ACTIVE_CHILD(this, local), + HA_ACTIVE_CHILD(this, local)->fops->getdents, + fd, + size, + offset, + flag); + return 0; +err: + STACK_UNWIND (frame, -1, op_errno, NULL, 0); + return 0; +} + + int32_t +ha_setdents_cbk (call_frame_t *frame, + void *cookie, + xlator_t *this, + int32_t op_ret, + int32_t op_errno) +{ + int ret = 0; + + ret = ha_handle_cbk (frame, cookie, op_ret, op_errno); + + if 
(ret == 0) { + STACK_UNWIND (frame, + op_ret, + op_errno); + } + return 0; +} + +/* ha_setdents: setdents fop. ha_alloc_init_fd() returns a negative errno on failure (unwound as -1/errno); otherwise a resume stub is kept in local->stub and the call is wound to the child at index local->active, with that index as the cookie. */ +int32_t +ha_setdents (call_frame_t *frame, + xlator_t *this, + fd_t *fd, + int32_t flags, + dir_entry_t *entries, + int32_t count) +{ + ha_local_t *local = NULL; + int op_errno = 0; + + op_errno = ha_alloc_init_fd (frame, fd); + if (op_errno < 0) { + op_errno = -op_errno; + goto err; + } + local = frame->local; + + local->stub = fop_setdents_stub (frame, ha_setdents, fd, flags, entries, count); + + STACK_WIND_COOKIE (frame, + ha_setdents_cbk, + (void *)(long)local->active, + HA_ACTIVE_CHILD(this, local), + HA_ACTIVE_CHILD(this, local)->fops->setdents, + fd, + flags, + entries, + count); + return 0; +err: + STACK_UNWIND (frame, -1, op_errno); + return 0; +} + +/* ha_fsyncdir_cbk: reply handler for fsyncdir; unwinds only when ha_handle_cbk() returns 0. */ + int32_t +ha_fsyncdir_cbk (call_frame_t *frame, + void *cookie, + xlator_t *this, + int32_t op_ret, + int32_t op_errno) +{ + int ret = 0; + + ret = ha_handle_cbk (frame, cookie, op_ret, op_errno); + if (ret == 0) { + STACK_UNWIND (frame, + op_ret, + op_errno); + } + return 0; +} + +/* ha_fsyncdir: fsyncdir fop; same set-up as ha_setdents, wound to the child at index local->active. */ +int32_t +ha_fsyncdir (call_frame_t *frame, + xlator_t *this, + fd_t *fd, + int32_t flags) +{ + ha_local_t *local = NULL; + int op_errno = 0; + + op_errno = ha_alloc_init_fd (frame, fd); + if (op_errno < 0) { + op_errno = -op_errno; + goto err; + } + local = frame->local; + local->stub = fop_fsyncdir_stub (frame, ha_fsyncdir, fd, flags); + STACK_WIND_COOKIE (frame, + ha_fsyncdir_cbk, + (void *)(long)local->active, + HA_ACTIVE_CHILD(this, local), + HA_ACTIVE_CHILD(this, local)->fops->fsyncdir, + fd, + flags); + return 0; +err: + STACK_UNWIND (frame, -1, op_errno); + return 0; +} + + +/* ha_statfs_cbk: reply handler for statfs; unwinds (op_ret, op_errno, buf) only when ha_handle_cbk() returns 0. */ + int32_t +ha_statfs_cbk (call_frame_t *frame, + void *cookie, + xlator_t *this, + int32_t op_ret, + int32_t op_errno, + struct statvfs *buf) +{ + int ret = -1; + + ret = ha_handle_cbk (frame, cookie, op_ret, op_errno); + if (ret == 0) { + STACK_UNWIND (frame, + op_ret, + op_errno, + buf); + } + return 0; +} + +int32_t +ha_statfs (call_frame_t *frame, + xlator_t *this, + loc_t 
*loc) +{ + ha_local_t *local = NULL; + int op_errno = 0; + + /* negative return is -errno; on success frame->local is read back below */ + op_errno = ha_alloc_init_inode (frame, loc->inode); + if (op_errno < 0) { + op_errno = -op_errno; + goto err; + } + local = frame->local; + + local->stub = fop_statfs_stub (frame, ha_statfs, loc); + STACK_WIND_COOKIE (frame, + ha_statfs_cbk, + (void *)(long)local->active, + HA_ACTIVE_CHILD(this, local), + HA_ACTIVE_CHILD(this, local)->fops->statfs, + loc); + return 0; +err: + STACK_UNWIND (frame, -1, op_errno, NULL); + return 0; +} + +/* ha_setxattr_cbk: reply handler for setxattr; unwinds only when ha_handle_cbk() returns 0. */ + int32_t +ha_setxattr_cbk (call_frame_t *frame, + void *cookie, + xlator_t *this, + int32_t op_ret, + int32_t op_errno) +{ + int ret = -1; + + ret = ha_handle_cbk (frame, cookie, op_ret, op_errno); + + if (ret == 0) { + STACK_UNWIND (frame, + op_ret, + op_errno); + } + return 0; +} + +/* ha_setxattr: setxattr fop. ha_alloc_init_inode() returns a negative errno on failure (unwound as -1/errno); otherwise a resume stub is kept in local->stub and the call is wound to the child at index local->active, with that index as the cookie. */ +int32_t +ha_setxattr (call_frame_t *frame, + xlator_t *this, + loc_t *loc, + dict_t *dict, + int32_t flags) +{ + ha_local_t *local = NULL; + int op_errno = 0; + + op_errno = ha_alloc_init_inode (frame, loc->inode); + if (op_errno < 0) { + op_errno = -op_errno; + goto err; + } + local = frame->local; + local->stub = fop_setxattr_stub (frame, ha_setxattr, loc, dict, flags); + STACK_WIND_COOKIE (frame, + ha_setxattr_cbk, + (void *)(long)local->active, + HA_ACTIVE_CHILD(this, local), + HA_ACTIVE_CHILD(this, local)->fops->setxattr, + loc, + dict, + flags); + return 0; +err: + STACK_UNWIND (frame, -1, op_errno); + return 0; +} + +/* ha_getxattr_cbk: reply handler for getxattr; unwinds (op_ret, op_errno, dict) only when ha_handle_cbk() returns 0. */ + int32_t +ha_getxattr_cbk (call_frame_t *frame, + void *cookie, + xlator_t *this, + int32_t op_ret, + int32_t op_errno, + dict_t *dict) +{ + int ret = -1; + + ret = ha_handle_cbk (frame, cookie, op_ret, op_errno); + + if (ret == 0) { + STACK_UNWIND (frame, + op_ret, + op_errno, + dict); + } + return 0; +} + +int32_t +ha_getxattr (call_frame_t *frame, + xlator_t *this, + loc_t *loc, + const char *name) +{ + ha_local_t *local = NULL; + int op_errno = 0; + + op_errno = ha_alloc_init_inode (frame, loc->inode); + if (op_errno < 0) { + op_errno = -op_errno; + goto err; + } + local 
= frame->local; + local->stub = fop_getxattr_stub (frame, ha_getxattr, loc, name); + STACK_WIND_COOKIE (frame, + ha_getxattr_cbk, + (void *)(long)local->active, + HA_ACTIVE_CHILD(this, local), + HA_ACTIVE_CHILD(this, local)->fops->getxattr, + loc, + name); + return 0; +err: + STACK_UNWIND (frame, -1, op_errno, NULL); + return 0; +} + +/* ha_xattrop_cbk: reply handler for xattrop; unwinds (op_ret, op_errno, dict) only when ha_handle_cbk() returns 0. */ +int32_t +ha_xattrop_cbk (call_frame_t *frame, + void *cookie, + xlator_t *this, + int32_t op_ret, + int32_t op_errno, + dict_t *dict) +{ + int ret = -1; + ret = ha_handle_cbk (frame, cookie, op_ret, op_errno); + if (ret == 0) { + STACK_UNWIND (frame, op_ret, op_errno, dict); + } + return 0; +} + + +/* ha_xattrop: xattrop fop. ha_alloc_init_inode() returns a negative errno on failure; otherwise a resume stub is kept in local->stub and the call is wound to the child at index local->active. Unlike most siblings, the error path hands the caller's own dict back in the unwind. */ +int32_t +ha_xattrop (call_frame_t *frame, + xlator_t *this, + loc_t *loc, + gf_xattrop_flags_t flags, + dict_t *dict) +{ + ha_local_t *local = NULL; + int op_errno = 0; + + op_errno = ha_alloc_init_inode (frame, loc->inode); + if (op_errno < 0) { + op_errno = -op_errno; + goto err; + } + local = frame->local; + + local->stub = fop_xattrop_stub (frame, ha_xattrop, loc, flags, dict); + + STACK_WIND_COOKIE (frame, + ha_xattrop_cbk, + (void *)(long)local->active, + HA_ACTIVE_CHILD(this, local), + HA_ACTIVE_CHILD(this, local)->fops->xattrop, + loc, + flags, + dict); + return 0; +err: + STACK_UNWIND (frame, -1, op_errno, dict); + return 0; +} + +/* ha_fxattrop_cbk: reply handler for fxattrop; unwinds (op_ret, op_errno, dict) only when ha_handle_cbk() returns 0. */ +int32_t +ha_fxattrop_cbk (call_frame_t *frame, + void *cookie, + xlator_t *this, + int32_t op_ret, + int32_t op_errno, + dict_t *dict) +{ + int ret = -1; + ret = ha_handle_cbk (frame, cookie, op_ret, op_errno); + if (ret == 0) + STACK_UNWIND (frame, op_ret, op_errno, dict); + return 0; +} + +int32_t +ha_fxattrop (call_frame_t *frame, + xlator_t *this, + fd_t *fd, + gf_xattrop_flags_t flags, + dict_t *dict) +{ + ha_local_t *local = NULL; + int op_errno = 0; + + op_errno = ha_alloc_init_fd (frame, fd); + if (op_errno < 0) { + op_errno = -op_errno; + goto err; + } + local = frame->local; + local->stub = fop_fxattrop_stub (frame, ha_fxattrop, fd, flags, dict); + + STACK_WIND_COOKIE (frame, + ha_fxattrop_cbk, 
+ (void *)(long)local->active, + HA_ACTIVE_CHILD(this, local), + HA_ACTIVE_CHILD(this, local)->fops->fxattrop, + fd, + flags, + dict); + return 0; +err: + STACK_UNWIND (frame, -1, op_errno, dict); + return 0; +} + +/* ha_removexattr_cbk: reply handler for removexattr; unwinds only when ha_handle_cbk() returns 0. */ + int32_t +ha_removexattr_cbk (call_frame_t *frame, + void *cookie, + xlator_t *this, + int32_t op_ret, + int32_t op_errno) +{ + int ret = -1; + ret = ha_handle_cbk (frame, cookie, op_ret, op_errno); + if (ret == 0) { + STACK_UNWIND (frame, + op_ret, + op_errno); + } + return 0; +} + +/* ha_removexattr: removexattr fop. ha_alloc_init_inode() returns a negative errno on failure (unwound as -1/errno); otherwise a resume stub is kept in local->stub and the call is wound to the child at index local->active, with that index as the cookie. */ +int32_t +ha_removexattr (call_frame_t *frame, + xlator_t *this, + loc_t *loc, + const char *name) +{ + ha_local_t *local = NULL; + int op_errno = 0; + + op_errno = ha_alloc_init_inode (frame, loc->inode); + if (op_errno < 0) { + op_errno = -op_errno; + goto err; + } + local = frame->local; + + local->stub = fop_removexattr_stub (frame, ha_removexattr, loc, name); + + STACK_WIND_COOKIE (frame, + ha_removexattr_cbk, + (void *)(long)local->active, + HA_ACTIVE_CHILD(this, local), + HA_ACTIVE_CHILD(this, local)->fops->removexattr, + loc, + name); + return 0; +err: + STACK_UNWIND (frame, -1, op_errno); + return 0; +} + +/* ha_lk_setlk_unlck_cbk: collects replies to lk calls that were wound to several children at once. Decrements local->call_count under the frame lock and records any success in local->op_ret. On the last reply it frees local->state and unwinds: with the collected result if the stubbed request was itself an F_UNLCK, otherwise with (-1, EIO). The stub is destroyed after the unwind. */ +int32_t +ha_lk_setlk_unlck_cbk (call_frame_t *frame, + void *cookie, + xlator_t *this, + int32_t op_ret, + int32_t op_errno, + struct flock *lock) +{ + ha_local_t *local = NULL; + int cnt = 0; + call_stub_t *stub = NULL; + + local = frame->local; + + LOCK (&frame->lock); + cnt = --local->call_count; + if (op_ret == 0) + local->op_ret = 0; + UNLOCK (&frame->lock); + + if (cnt == 0) { + stub = local->stub; + FREE (local->state); + if (stub->args.lk.lock.l_type == F_UNLCK) { + STACK_UNWIND (frame, local->op_ret, local->op_errno, &stub->args.lk.lock); + } else { + STACK_UNWIND (frame, -1, EIO, NULL); + } + call_stub_destroy (stub); + } + return 0; +} + +int32_t +ha_lk_setlk_cbk (call_frame_t *frame, + void *cookie, + xlator_t *this, + int32_t op_ret, + int32_t op_errno, + struct flock *lock) +{ + ha_local_t *local = NULL; + ha_private_t *pvt = NULL; + xlator_t 
**children = NULL; + int i = 0, cnt = 0, j = 0; + int child_count = 0; + call_frame_t *prev_frame = NULL; + char *state = NULL; + + local = frame->local; + pvt = this->private; + children = pvt->children; + child_count = pvt->child_count; + prev_frame = cookie; + state = local->state; + + if (op_ret == 0) + local->op_ret = 0; + + /* The lock is taken on one child at a time. After a success (or a child that is merely disconnected) move on to the next child marked in local->state; after a real failure release the lock on the children that already granted it. */ + if ((op_ret == 0) || (op_ret == -1 && op_errno == ENOTCONN)) { + /* index of the child that just replied */ + for (i = 0; i < child_count; i++) { + if (prev_frame->this == children[i]) + break; + } + i++; + for (; i < child_count; i++) { + if (local->state[i]) + break; + } + if (i >= child_count) { + /* no further child to lock: report the outcome to the caller */ + call_stub_t *stub = local->stub; + FREE (local->state); + STACK_UNWIND (frame, 0, op_errno, &stub->args.lk.lock); + call_stub_destroy (stub); + return 0; + } + STACK_WIND (frame, + ha_lk_setlk_cbk, + children[i], + children[i]->fops->lk, + local->stub->args.lk.fd, + local->stub->args.lk.cmd, + &local->stub->args.lk.lock); + return 0; + } else { + for (i = 0; i < child_count; i++) { + if (prev_frame->this == children[i]) + break; + } + /* count the children before the failing one that were asked for the lock */ + cnt = 0; + for (j = 0; j < i; j++) { + if (state[j]) + cnt++; + } + if (cnt) { + struct flock unlock; + + /* roll back: send an unlock for the same range to each of those children; ha_lk_setlk_unlck_cbk counts the replies down from local->call_count and then fails the call with EIO */ + unlock = local->stub->args.lk.lock; + unlock.l_type = F_UNLCK; + local->call_count = cnt; + for (i = 0; i < child_count; i++) { + if (state[i]) { + STACK_WIND (frame, + ha_lk_setlk_unlck_cbk, + children[i], + children[i]->fops->lk, + local->stub->args.lk.fd, + local->stub->args.lk.cmd, + &unlock); + if (--cnt == 0) + break; + } + } + return 0; + } else { + /* nothing was locked before the failure: just report it */ + FREE (local->state); + call_stub_destroy (local->stub); + STACK_UNWIND (frame, + op_ret, + op_errno, + lock); + return 0; + } + } +} + +int32_t +ha_lk_getlk_cbk (call_frame_t *frame, + void *cookie, + xlator_t *this, + int32_t op_ret, + int32_t op_errno, + struct flock *lock) +{ + ha_local_t *local = NULL; + ha_private_t *pvt = NULL; + fd_t *fd = NULL; + int child_count = 0, i = 0; + xlator_t **children = NULL; + call_frame_t *prev_frame = NULL; + + local = frame->local; + pvt = this->private; + fd = local->stub->args.lk.fd; + child_count = pvt->child_count; + children = 
pvt->children; + prev_frame = cookie; + + if (op_ret == 0) { + FREE (local->state); + call_stub_destroy (local->stub); + STACK_UNWIND (frame, 0, 0, lock); + return 0; + } + + /* index of the child that just failed */ + for (i = 0; i < child_count; i++) { + if (prev_frame->this == children[i]) + break; + } + + /* resume the search after that child; starting at it would re-wind to the same child on every failure */ + i++; + for (; i < child_count; i++) { + if (local->state[i]) + break; + } + + if (i >= child_count) { + FREE (local->state); + call_stub_destroy (local->stub); + STACK_UNWIND (frame, op_ret, op_errno, lock); + return 0; + } + + STACK_WIND (frame, + ha_lk_getlk_cbk, + children[i], + children[i]->fops->lk, + fd, + local->stub->args.lk.cmd, + &local->stub->args.lk.lock); + return 0; +} + +/* ha_lk: lk fop. Copies the fd's per-child fdstate into local->state and then dispatches on the request: F_GETLK is tried on one marked child at a time (ha_lk_getlk_cbk), an F_SETLK unlock is wound to every marked child at once (ha_lk_setlk_unlck_cbk), and any other request is taken one child at a time (ha_lk_setlk_cbk). */ +int32_t +ha_lk (call_frame_t *frame, + xlator_t *this, + fd_t *fd, + int32_t cmd, + struct flock *lock) +{ + ha_local_t *local = NULL; + ha_private_t *pvt = NULL; + hafd_t *hafdp = NULL; + char *state = NULL; + int child_count = 0, i = 0, cnt = 0, ret = 0; + xlator_t **children = NULL; + uint64_t tmp_hafdp = 0; + + local = frame->local; + pvt = this->private; + child_count = pvt->child_count; + children = pvt->children; + ret = fd_ctx_get (fd, this, &tmp_hafdp); + if (ret < 0) + gf_log (this->name, GF_LOG_ERROR, "fd_ctx_get failed"); + + if (local == NULL) { + local = frame->local = CALLOC (1, sizeof (*local)); + local->active = -1; + local->op_ret = -1; + local->op_errno = ENOTCONN; + } + hafdp = (hafd_t *)(long)tmp_hafdp; + + /* NOTE(review): a local allocated just above always has active == -1, so a call that arrives with frame->local == NULL is failed here with ENOTCONN. Confirm whether hafdp->active was meant. hafdp is also used below without a NULL check even when fd_ctx_get failed. */ + if (local->active == -1) { + STACK_UNWIND (frame, -1, ENOTCONN, NULL); + return 0; + } + + local->stub = fop_lk_stub (frame, ha_lk, fd, cmd, lock); + local->state = CALLOC (1, child_count); + state = hafdp->fdstate; + LOCK (&hafdp->lock); + memcpy (local->state, state, child_count); + UNLOCK (&hafdp->lock); + if (cmd == F_GETLK) { + for (i = 0; i < child_count; i++) { + if (local->state[i]) + break; + } + if (i == child_count) { + /* the fd is open on no child: fail instead of indexing past children[] */ + FREE (local->state); + call_stub_destroy (local->stub); + STACK_UNWIND (frame, -1, ENOTCONN, NULL); + return 0; + } + STACK_WIND (frame, + ha_lk_getlk_cbk, + children[i], + children[i]->fops->lk, + fd, + cmd, + lock); + } else if (cmd == F_SETLK && lock->l_type == F_UNLCK) { + for (i = 0; i < child_count; i++) { + if 
(local->state[i]) + local->call_count++; + } + cnt = local->call_count; + if (cnt == 0) { + /* the fd is open on no child: nothing would be wound, so no callback would ever unwind this frame */ + FREE (local->state); + call_stub_destroy (local->stub); + STACK_UNWIND (frame, -1, ENOTCONN, NULL); + return 0; + } + for (i = 0; i < child_count; i++) { + if (local->state[i]) { + STACK_WIND (frame, + ha_lk_setlk_unlck_cbk, + children[i], + children[i]->fops->lk, + fd, cmd, lock); + if (--cnt == 0) + break; + } + } + } else { + for (i = 0; i < child_count; i++) { + if (local->state[i]) + break; + } + if (i == child_count) { + /* the fd is open on no child: fail instead of indexing past children[] */ + FREE (local->state); + call_stub_destroy (local->stub); + STACK_UNWIND (frame, -1, ENOTCONN, NULL); + return 0; + } + STACK_WIND (frame, + ha_lk_setlk_cbk, + children[i], + children[i]->fops->lk, + fd, + cmd, + lock); + } + return 0; +} + +/* ha_inode_entry_lk_cbk: reply handler shared by ha_inodelk and ha_entrylk; unwinds only when ha_handle_cbk() returns 0. */ + int32_t +ha_inode_entry_lk_cbk (call_frame_t *frame, + void *cookie, + xlator_t *this, + int32_t op_ret, + int32_t op_errno) +{ + int ret = -1; + + ret = ha_handle_cbk (frame, cookie, op_ret, op_errno); + if (ret == 0) { + STACK_UNWIND (frame, + op_ret, + op_errno); + } + return 0; +} + +/* ha_inodelk: inodelk fop. ha_alloc_init_inode() returns a negative errno on failure (unwound as -1/errno); otherwise a resume stub is kept in local->stub and the call is wound to the child at index local->active, with that index as the cookie. */ +int32_t +ha_inodelk (call_frame_t *frame, + xlator_t *this, + loc_t *loc, + int32_t cmd, + struct flock *lock) +{ + ha_local_t *local = NULL; + int op_errno = 0; + + op_errno = ha_alloc_init_inode (frame, loc->inode); + if (op_errno < 0) { + op_errno = -op_errno; + goto err; + } + local = frame->local; + local->stub = fop_inodelk_stub (frame, ha_inodelk, loc, cmd, lock); + STACK_WIND_COOKIE (frame, + ha_inode_entry_lk_cbk, + (void *)(long)local->active, + HA_ACTIVE_CHILD(this, local), + HA_ACTIVE_CHILD(this, local)->fops->inodelk, + loc, + cmd, + lock); + return 0; +err: + STACK_UNWIND (frame, -1, op_errno); + return 0; +} + +int32_t +ha_entrylk (call_frame_t *frame, + xlator_t *this, + loc_t *loc, + const char *basename, + entrylk_cmd cmd, + entrylk_type type) +{ + ha_local_t *local = NULL; + int op_errno = 0; + + op_errno = ha_alloc_init_inode (frame, loc->inode); + if (op_errno < 0) { + op_errno = -op_errno; + goto err; + } + local = frame->local; + local->stub = fop_entrylk_stub (frame, ha_entrylk, loc, basename, cmd, type); + STACK_WIND_COOKIE (frame, + ha_inode_entry_lk_cbk, + (void *)(long)local->active, + HA_ACTIVE_CHILD(this, local), + HA_ACTIVE_CHILD(this, local)->fops->entrylk, + loc, basename, 
cmd, type); + return 0; +err: + STACK_UNWIND (frame, -1, op_errno); + return 0; +} + +/* ha_checksum_cbk: reply handler for checksum; unwinds (file_checksum, dir_checksum) only when ha_handle_cbk() returns 0. */ + int32_t +ha_checksum_cbk (call_frame_t *frame, + void *cookie, + xlator_t *this, + int32_t op_ret, + int32_t op_errno, + uint8_t *file_checksum, + uint8_t *dir_checksum) +{ + int ret = -1; + + ret = ha_handle_cbk (frame, cookie, op_ret, op_errno); + if (ret == 0) { + STACK_UNWIND (frame, + op_ret, + op_errno, + file_checksum, + dir_checksum); + } + return 0; +} + +/* ha_checksum: checksum fop. ha_alloc_init_inode() returns a negative errno on failure; otherwise a resume stub is kept in local->stub and the call is wound to the child at index local->active, with that index as the cookie. */ +int32_t +ha_checksum (call_frame_t *frame, + xlator_t *this, + loc_t *loc, + int32_t flag) +{ + int op_errno = 0; + ha_local_t *local = NULL; + + op_errno = ha_alloc_init_inode (frame, loc->inode); + if (op_errno < 0) { + op_errno = -op_errno; + goto err; + } + local = frame->local; + local->stub = fop_checksum_stub (frame, ha_checksum, loc, flag); + + STACK_WIND_COOKIE (frame, + ha_checksum_cbk, + (void *)(long)local->active, + HA_ACTIVE_CHILD(this, local), + HA_ACTIVE_CHILD(this, local)->fops->checksum, + loc, + flag); + return 0; +err: + STACK_UNWIND (frame, -1, op_errno, NULL, NULL); + return 0; +} + +/* ha_readdir_cbk: reply handler for readdir; unwinds (op_ret, op_errno, entries) only when ha_handle_cbk() returns 0. */ +int32_t +ha_readdir_cbk (call_frame_t *frame, + void *cookie, + xlator_t *this, + int32_t op_ret, + int32_t op_errno, + gf_dirent_t *entries) +{ + int ret = 0; + + ret = ha_handle_cbk (frame, cookie, op_ret, op_errno); + if (ret == 0) + STACK_UNWIND (frame, op_ret, op_errno, entries); + return 0; +} + +/* ha_readdir: readdir fop. ha_alloc_init_fd() returns a negative errno on failure; otherwise a resume stub is kept in local->stub and the call is wound to the child at index local->active, with that index as the cookie. */ +int32_t +ha_readdir (call_frame_t *frame, + xlator_t *this, + fd_t *fd, + size_t size, + off_t off) +{ + ha_local_t *local = NULL; + int op_errno = 0; + + op_errno = ha_alloc_init_fd (frame, fd); + if (op_errno < 0) { + op_errno = -op_errno; + goto err; + } + local = frame->local; + local->stub = fop_readdir_stub (frame, ha_readdir, fd, size, off); + STACK_WIND_COOKIE (frame, + ha_readdir_cbk, + (void *)(long)local->active, + HA_ACTIVE_CHILD(this, local), + HA_ACTIVE_CHILD(this, local)->fops->readdir, + fd, size, off); + return 0; +err: + /* report the errno computed above, as every sibling fop does, rather than a fixed ENOTCONN */ + STACK_UNWIND (frame, -1, op_errno, NULL); + return 0; +} + +/* Management 
operations */ + +/* ha_stats_cbk: reply handler for the stats management op. If the child answered (-1, ENOTCONN), the request is re-sent to the next child after it whose pvt->state byte is set; when none is left the caller gets (-1, ENOTCONN, NULL). Any other reply is passed straight up. NOTE(review): if the replying child is not found in children[], i becomes child_count + 1 and the i == child_count test is skipped. */ + int32_t +ha_stats_cbk (call_frame_t *frame, + void *cookie, + xlator_t *this, + int32_t op_ret, + int32_t op_errno, + struct xlator_stats *stats) +{ + ha_local_t *local = NULL; + ha_private_t *pvt = NULL; + call_frame_t *prev_frame = NULL; + xlator_t **children = NULL; + int i = 0; + + local = frame->local; + pvt = this->private; + prev_frame = cookie; + children = pvt->children; + + if (op_ret == -1 && op_errno == ENOTCONN) { + for (i = 0; i < pvt->child_count; i++) { + if (prev_frame->this == children[i]) + break; + } + i++; + for (; i < pvt->child_count; i++) { + if (pvt->state[i]) + break; + } + + if (i == pvt->child_count) { + STACK_UNWIND (frame, -1, ENOTCONN, NULL); + return 0; + } + STACK_WIND (frame, + ha_stats_cbk, + children[i], + children[i]->mops->stats, + local->flags); + return 0; + } + + STACK_UNWIND (frame, + op_ret, + op_errno, + stats); + return 0; +} + +/* ha_stats: stats management op. Allocates the per-call local, saves flags for retries and winds to the first child whose pvt->state byte is set; unwinds (-1, ENOTCONN, NULL) when no child is marked. */ +int32_t +ha_stats (call_frame_t *frame, + xlator_t *this, + int32_t flags) +{ + ha_local_t *local = NULL; + ha_private_t *pvt = NULL; + xlator_t **children = NULL; + int i = 0; + + local = frame->local = CALLOC (1, sizeof (*local)); + pvt = this->private; + children = pvt->children; + for (i = 0; i < pvt->child_count; i++) { + if (pvt->state[i]) + break; + } + + if (i == pvt->child_count) { + STACK_UNWIND (frame, -1, ENOTCONN, NULL); + return 0; + } + local->flags = flags; + + STACK_WIND (frame, + ha_stats_cbk, + children[i], + children[i]->mops->stats, + flags); + return 0; +} + + +int32_t +ha_getspec_cbk (call_frame_t *frame, + void *cookie, + xlator_t *this, + int32_t op_ret, + int32_t op_errno, + char *spec_data) +{ + ha_local_t *local = NULL; + ha_private_t *pvt = NULL; + call_frame_t *prev_frame = NULL; + xlator_t **children = NULL; + int i = 0; + + local = frame->local; + pvt = this->private; + prev_frame = cookie; + children = pvt->children; + + if (op_ret == -1 && op_errno == ENOTCONN) { + for (i = 0; i < pvt->child_count; i++) { + if (prev_frame->this == children[i]) + 
break; + } + i++; + for (; i < pvt->child_count; i++) { + if (pvt->state[i]) + break; + } + + if (i == pvt->child_count) { + STACK_UNWIND (frame, -1, ENOTCONN, NULL); + return 0; + } + STACK_WIND (frame, + ha_getspec_cbk, + children[i], + children[i]->mops->getspec, + local->pattern, + local->flags); + return 0; + } + + STACK_UNWIND (frame, + op_ret, + op_errno, + spec_data); + return 0; +} + +int32_t +ha_getspec (call_frame_t *frame, + xlator_t *this, + const char *key, + int32_t flags) +{ + ha_local_t *local = NULL; + ha_private_t *pvt = NULL; + xlator_t **children = NULL; + int i = 0; + + local = frame->local = CALLOC (1, sizeof (*local)); + pvt = this->private; + children = pvt->children; + + local = frame->local = CALLOC (1, sizeof (*local)); + for (i = 0; i < pvt->child_count; i++) { + if (pvt->state[i]) + break; + } + + if (i == pvt->child_count) { + STACK_UNWIND (frame, -1, ENOTCONN, NULL); + return 0; + } + local->flags = flags; + local->pattern = (char *)key; + + STACK_WIND (frame, + ha_getspec_cbk, + children[i], + children[i]->mops->getspec, + key, flags); + return 0; +} + +int32_t +ha_closedir (xlator_t *this, + fd_t *fd) +{ + hafd_t *hafdp = NULL; + int op_errno = 0; + uint64_t tmp_hafdp = 0; + + op_errno = fd_ctx_del (fd, this, &tmp_hafdp); + if (op_errno != 0) { + gf_log (this->name, GF_LOG_ERROR, "fd_ctx_del() error"); + return 0; + } + hafdp = (hafd_t *)(long)tmp_hafdp; + + FREE (hafdp->fdstate); + FREE (hafdp->path); + LOCK_DESTROY (&hafdp->lock); + return 0; +} + +int32_t +ha_close (xlator_t *this, + fd_t *fd) +{ + hafd_t *hafdp = NULL; + int op_errno = 0; + uint64_t tmp_hafdp = 0; + + op_errno = fd_ctx_del (fd, this, &tmp_hafdp); + if (op_errno != 0) { + gf_log (this->name, GF_LOG_ERROR, "fd_ctx_del() error"); + return 0; + } + hafdp = (hafd_t *)(long)tmp_hafdp; + + FREE (hafdp->fdstate); + FREE (hafdp->path); + LOCK_DESTROY (&hafdp->lock); + return 0; +} + +/* notify */ +int32_t +notify (xlator_t *this, + int32_t event, + void *data, + ...) 
+{ + ha_private_t *pvt = NULL; + int32_t i = 0, upcnt = 0; + + pvt = this->private; + if (pvt == NULL) { + gf_log (this->name, GF_LOG_DEBUG, "got notify before init()"); + return 0; + } + + /* data is compared against pvt->children to identify the child the event is about. NOTE(review): if it matches none, i equals child_count and children[i] / state[i] below index one past the arrays. */ + switch (event) + { + case GF_EVENT_CHILD_DOWN: + { + for (i = 0; i < pvt->child_count; i++) { + if (data == pvt->children[i]) + break; + } + gf_log (this->name, GF_LOG_DEBUG, "GF_EVENT_CHILD_DOWN from %s", pvt->children[i]->name); + pvt->state[i] = 0; + /* pass the event on only when no child is left marked up */ + for (i = 0; i < pvt->child_count; i++) { + if (pvt->state[i]) + break; + } + if (i == pvt->child_count) { + default_notify (this, event, data); + } + } + break; + case GF_EVENT_CHILD_UP: + { + for (i = 0; i < pvt->child_count; i++) { + if (data == pvt->children[i]) + break; + } + + gf_log (this->name, GF_LOG_DEBUG, "GF_EVENT_CHILD_UP from %s", pvt->children[i]->name); + + pvt->state[i] = 1; + + /* pass the event on only when exactly one child is now marked up */ + for (i = 0; i < pvt->child_count; i++) { + if (pvt->state[i]) + upcnt++; + } + + if (upcnt == 1) { + default_notify (this, event, data); + } + } + break; + + default: + { + default_notify (this, event, data); + } + } + + return 0; +} + +/* init: translator set-up. Requires at least one child (returns -1 otherwise) and only warns when the volume has no parent. */ +int +init (xlator_t *this) +{ + ha_private_t *pvt = NULL; + xlator_list_t *trav = NULL; + int count = 0, ret = 0; + + if (!this->children) { + gf_log (this->name,GF_LOG_ERROR, + "FATAL: ha should have one or more child defined"); + return -1; + } + + if (!this->parents) { + gf_log (this->name, GF_LOG_WARNING, + "dangling volume. 
check volfile "); + } + + pvt = CALLOC (1, sizeof (ha_private_t)); + if (pvt == NULL) { + gf_log (this->name, GF_LOG_ERROR, "out of memory"); + return -1; + } + + /* "preferred-subvolume" is optional; -1 means none was configured */ + ret = dict_get_int32 (this->options, "preferred-subvolume", + &pvt->pref_subvol); + if (ret < 0) { + pvt->pref_subvol = -1; + } + + trav = this->children; + while (trav) { + count++; + trav = trav->next; + } + + pvt->child_count = count; + pvt->children = CALLOC (count, sizeof (xlator_t*)); + /* one state byte per child, all initially 0; notify() sets and clears them */ + pvt->state = CALLOC (1, count); + if (pvt->children == NULL || pvt->state == NULL) { + gf_log (this->name, GF_LOG_ERROR, "out of memory"); + if (pvt->children != NULL) { + FREE (pvt->children); + } + if (pvt->state != NULL) { + FREE (pvt->state); + } + FREE (pvt); + return -1; + } + + trav = this->children; + count = 0; + while (trav) { + pvt->children[count] = trav->xlator; + count++; + trav = trav->next; + } + + this->private = pvt; + return 0; +} + +/* fini: releases everything init() allocated: the children array, the state bytes and the private structure itself. */ +void +fini (xlator_t *this) +{ + ha_private_t *priv = NULL; + + priv = this->private; + if (priv == NULL) + return; + if (priv->children != NULL) { + FREE (priv->children); + } + if (priv->state != NULL) { + FREE (priv->state); + } + FREE (priv); + /* notify() treats a NULL private as "not initialised" */ + this->private = NULL; + return; +} + + +/* NOTE(review): ha_inodelk and ha_entrylk are defined in this file but are not listed in this table; confirm whether that is intentional. */ +struct xlator_fops fops = { + .lookup = ha_lookup, + .stat = ha_stat, + .readlink = ha_readlink, + .mknod = ha_mknod, + .mkdir = ha_mkdir, + .unlink = ha_unlink, + .rmdir = ha_rmdir, + .symlink = ha_symlink, + .rename = ha_rename, + .link = ha_link, + .chmod = ha_chmod, + .chown = ha_chown, + .truncate = ha_truncate, + .utimens = ha_utimens, + .create = ha_create, + .open = ha_open, + .readv = ha_readv, + .writev = ha_writev, + .statfs = ha_statfs, + .flush = ha_flush, + .fsync = ha_fsync, + .setxattr = ha_setxattr, + .getxattr = ha_getxattr, + .removexattr = ha_removexattr, + .opendir = ha_opendir, + .readdir = ha_readdir, + .getdents = ha_getdents, + .fsyncdir = ha_fsyncdir, + .access = ha_access, + .ftruncate = ha_ftruncate, + .fstat = ha_fstat, + .lk = ha_lk, + .fchmod = ha_fchmod, + .fchown = ha_fchown, + .setdents = ha_setdents, + .lookup_cbk = ha_lookup_cbk, + .checksum = ha_checksum, + .xattrop = ha_xattrop, + .fxattrop = ha_fxattrop +}; + +struct xlator_mops mops = { + .stats = ha_stats, + .getspec = ha_getspec, +}; + +struct xlator_cbks cbks = { + .release = ha_close, + .releasedir = ha_closedir, + .forget = ha_forget, +}; diff --git a/xlators/cluster/ha/src/ha.h b/xlators/cluster/ha/src/ha.h new file mode 100644 index 000000000..77a04f165 --- /dev/null 
+++ b/xlators/cluster/ha/src/ha.h @@ -0,0 +1,59 @@ +/* + Copyright (c) 2008 Z RESEARCH, Inc. <http://www.zresearch.com> + This file is part of GlusterFS. + + GlusterFS is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published + by the Free Software Foundation; either version 3 of the License, + or (at your option) any later version. + + GlusterFS is distributed in the hope that it will be useful, but + WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program. If not, see + <http://www.gnu.org/licenses/>. +*/ + +#ifndef __HA_H_ +#define __HA_H_ + +typedef struct { /* per-call state kept in frame->local */ + call_stub_t *stub; /* stub of the fop in flight, created by the fop_*_stub() calls */ + int32_t op_ret, op_errno; /* result accumulated across children */ + int32_t active, tries, revalidate, revalidate_error; /* active: index into children[] of the child being used */ + int32_t call_count; /* replies still outstanding */ + char *state, *pattern; /* state: per-call copy of per-child flags; pattern: saved getspec key */ + dict_t *dict; + loc_t *loc; + struct stat buf; + fd_t *fd; + inode_t *inode; + int32_t flags; /* saved stats/getspec flags */ + int32_t first_success; +} ha_local_t; + +typedef struct { /* translator-wide state kept in this->private */ + char *state; /* one byte per child: 1 after CHILD_UP, 0 after CHILD_DOWN */ + xlator_t **children; /* child_count entries, in volfile order */ + int child_count, pref_subvol; /* pref_subvol: "preferred-subvolume" option, -1 if unset */ +} ha_private_t; + +typedef struct { /* per-fd state kept in the fd context */ + char *fdstate; /* one byte per child: 1 once open/opendir succeeded there */ + char *path; /* strdup() of the path the fd was opened with */ + gf_lock_t lock; /* taken while fdstate is copied */ + int active; +} hafd_t; + +#define HA_ACTIVE_CHILD(this, local) (((ha_private_t *)(this)->private)->children[(local)->active]) + +extern int ha_alloc_init_fd (call_frame_t *frame, fd_t *fd); + +extern int ha_handle_cbk (call_frame_t *frame, void *cookie, int op_ret, int op_errno) ; + +extern int ha_alloc_init_inode (call_frame_t *frame, inode_t *inode); + +#endif diff --git a/xlators/cluster/map/Makefile.am b/xlators/cluster/map/Makefile.am new file mode 100644 index 000000000..d471a3f92 --- /dev/null +++ b/xlators/cluster/map/Makefile.am @@ -0,0 +1,3 @@ +SUBDIRS = src + +CLEANFILES = diff --git a/xlators/cluster/map/src/Makefile.am b/xlators/cluster/map/src/Makefile.am new file mode 
100644 index 000000000..44ee4d9ee --- /dev/null +++ b/xlators/cluster/map/src/Makefile.am @@ -0,0 +1,15 @@ +xlator_LTLIBRARIES = map.la +xlatordir = $(libdir)/glusterfs/$(PACKAGE_VERSION)/xlator/cluster + +map_la_LDFLAGS = -module -avoidversion + +map_la_SOURCES = map.c map-helper.c +map_la_LIBADD = $(top_builddir)/libglusterfs/src/libglusterfs.la + +noinst_HEADERS = map.h + +AM_CFLAGS = -fPIC -D_FILE_OFFSET_BITS=64 -D_GNU_SOURCE -Wall -D$(GF_HOST_OS) \ + -I$(top_srcdir)/libglusterfs/src -shared -nostartfiles $(GF_CFLAGS) + +CLEANFILES = + diff --git a/xlators/cluster/map/src/map-helper.c b/xlators/cluster/map/src/map-helper.c new file mode 100644 index 000000000..4e51219d4 --- /dev/null +++ b/xlators/cluster/map/src/map-helper.c @@ -0,0 +1,357 @@ +/* + Copyright (c) 2009 Z RESEARCH, Inc. <http://www.zresearch.com> + This file is part of GlusterFS. + + GlusterFS is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published + by the Free Software Foundation; either version 3 of the License, + or (at your option) any later version. + + GlusterFS is distributed in the hope that it will be useful, but + WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program. If not, see + <http://www.gnu.org/licenses/>. 
+*/ + +#ifndef _CONFIG_H +#define _CONFIG_H +#include "config.h" +#endif + +#include "xlator.h" +#include "map.h" + + +xlator_t * +map_subvol_next (xlator_t *this, xlator_t *prev) +{ + map_private_t *priv = NULL; + xlator_t *next = NULL; + int i = 0; + + priv = this->private; + + for (i = 0; i < priv->child_count; i++) { + if (priv->xlarray[i].xl == prev) { + if ((i + 1) < priv->child_count) + next = priv->xlarray[i + 1].xl; + break; + } + } + + return next; +} + +int +map_subvol_cnt (xlator_t *this, xlator_t *subvol) +{ + int i = 0; + int ret = -1; + map_private_t *priv = NULL; + + priv = this->private; + + for (i = 0; i < priv->child_count; i++) { + if (subvol == priv->xlarray[i].xl) { + ret = i; + break; + } + } + + return ret; +} + +int +map_itransform (xlator_t *this, xlator_t *subvol, uint64_t x, uint64_t *y_p) +{ + map_private_t *priv = NULL; + int cnt = 0; + int max = 0; + uint64_t y = 0; + + if (x == ((uint64_t) -1)) { + y = (uint64_t) -1; + goto out; + } + + priv = this->private; + + max = priv->child_count; + cnt = map_subvol_cnt (this, subvol); + + y = ((x * max) + cnt); + +out: + if (y_p) + *y_p = y; + + return 0; +} + + +int +map_deitransform (xlator_t *this, uint64_t y, xlator_t **subvol_p, + uint64_t *x_p) +{ + int cnt = 0; + int max = 0; + uint64_t x = 0; + xlator_t *subvol = 0; + map_private_t *priv = NULL; + + priv = this->private; + max = priv->child_count; + + cnt = y % max; + x = y / max; + + subvol = priv->xlarray[cnt].xl; + + if (subvol_p) + *subvol_p = subvol; + + if (x_p) + *x_p = x; + + return 0; +} + + +xlator_t * +get_mapping_subvol_from_path (xlator_t *this, const char *path) +{ + map_private_t *priv = NULL; + struct map_pattern *map = NULL; + + /* To make sure we handle '/' properly */ + if (!strcmp (path, "/")) + return NULL; + + priv = this->private; + + map = priv->map; + while (map) { + if (!strncmp (map->directory, path, map->dir_len)) { + if ((path[map->dir_len] == '/') || + (path[map->dir_len] == '\0')) { + return map->xl; + } 
+ } + + map = map->next; + } + + return priv->default_xl; +} + +xlator_t * +get_mapping_subvol_from_ctx (xlator_t *this, inode_t *inode) +{ + uint64_t subvol = 0; + int ret = -1; + + ret = inode_ctx_get (inode, this, &subvol); + if (ret != 0) + return NULL; + + return (xlator_t *)(long)subvol; +} + +int +check_multiple_volume_entry (xlator_t *this, + xlator_t *subvol) +{ + int ret = -1; + int idx = 0; + map_private_t *priv = NULL; + + priv = this->private; + + for (idx = 0; idx < priv->child_count; idx++) { + if (priv->xlarray[idx].xl == subvol) { + if (priv->xlarray[idx].mapped) { + gf_log (this->name, GF_LOG_ERROR, + "subvolume '%s' is already mapped", + subvol->name); + goto out; + } + priv->xlarray[idx].mapped = 1; + ret = 0; + goto out; + } + } + + gf_log (this->name, GF_LOG_ERROR, + "subvolume '%s' is not found", + subvol->name); + + out: + return ret; +} + +int +verify_dir_and_assign_subvol (xlator_t *this, + const char *directory, + const char *subvol) +{ + int default_flag = 0; + int ret = -1; + int idx = 0; + map_private_t *priv = NULL; + xlator_list_t *trav = NULL; + struct map_pattern *tmp_map = NULL; + + priv = this->private; + + /* check if directory is valid, ie, its a top level dir, and + * not includes a '*' in it. + */ + if (!strcmp ("*", directory)) { + default_flag = 1; + } else { + if (directory[0] != '/') { + gf_log (this->name, GF_LOG_ERROR, + "map takes absolute path, starting with '/'. " + "not '%s'", directory); + goto out; + } + for (idx = 1; idx < (strlen (directory) - 1); idx++) { + if (directory[idx] == '/') { + gf_log (this->name, GF_LOG_ERROR, + "map takes only top level directory, " + "not '%s'", directory); + goto out; + } + } + } + + /* Assign proper subvolume */ + trav = this->children; + while (trav) { + if (!strcmp (trav->xlator->name, subvol)) { + + /* Check if there is another directory for + * same volume, if yes, return error. 
+ */ + ret = check_multiple_volume_entry (this, + trav->xlator); + if (ret != 0) { + goto out; + } + + ret = 0; + if (default_flag) { + if (priv->default_xl) { + ret = -1; + gf_log (this->name, GF_LOG_ERROR, + "'*' specified more than " + "once. don't confuse me!!!"); + } + + priv->default_xl = trav->xlator; + goto out; + } + + tmp_map = CALLOC (1, sizeof (struct map_pattern)); + tmp_map->xl = trav->xlator; + tmp_map->dir_len = strlen (directory); + + /* make sure that the top level directory starts + * with '/' and ends without '/' + */ + tmp_map->directory = strdup (directory); + if (directory[tmp_map->dir_len - 1] == '/') { + tmp_map->dir_len--; + } + + if (!priv->map) + priv->map = tmp_map; + else { + struct map_pattern *trav_map = NULL; + trav_map = priv->map; + while (trav_map->next) + trav_map = trav_map->next; + trav_map->next = tmp_map; + } + + goto out; + } + + trav = trav->next; + } + + gf_log (this->name, GF_LOG_ERROR, + "map volume '%s' is not proper subvolume", subvol); + + out: + return ret; +} + +int +assign_default_subvol (xlator_t *this, const char *default_xl) +{ + int ret = -1; + map_private_t *priv = NULL; + xlator_list_t *trav = NULL; + + priv = this->private; + trav = this->children; + + while (trav) { + if (!strcmp (trav->xlator->name, default_xl)) { + ret = check_multiple_volume_entry (this, + trav->xlator); + if (ret != 0) { + goto out; + } + if (priv->default_xl) + gf_log (this->name, GF_LOG_WARNING, + "default-volume option provided, " + "overriding earlier '*' option"); + priv->default_xl = trav->xlator; + return 0; + } + trav = trav->next; + } + + gf_log (this->name, GF_LOG_ERROR, + "default-volume value is not an valid subvolume. 
check again"); + out: + return -1; +} + +void +verify_if_all_subvolumes_got_used (xlator_t *this) +{ + int idx = 0; + map_private_t *priv = NULL; + + priv = this->private; + + for (idx = 0; idx < priv->child_count; idx++) { + if (!priv->xlarray[idx].mapped) { + if (!priv->default_xl) { + priv->default_xl = priv->xlarray[idx].xl; + priv->xlarray[idx].mapped = 1; + } else { + gf_log (this->name, GF_LOG_WARNING, + "subvolume '%s' is not mapped to " + "any directory", + priv->xlarray[idx].xl->name); + } + } + } + + if (!priv->default_xl) { + gf_log (this->name, GF_LOG_WARNING, + "default subvolume not specified, filesystem " + "may not work properly. Check 'map' translator " + "documentation for more info"); + } + + return ; +} diff --git a/xlators/cluster/map/src/map.c b/xlators/cluster/map/src/map.c new file mode 100644 index 000000000..8c4b7c83c --- /dev/null +++ b/xlators/cluster/map/src/map.c @@ -0,0 +1,2193 @@ +/* + Copyright (c) 2009 Z RESEARCH, Inc. <http://www.zresearch.com> + This file is part of GlusterFS. + + GlusterFS is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published + by the Free Software Foundation; either version 3 of the License, + or (at your option) any later version. + + GlusterFS is distributed in the hope that it will be useful, but + WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program. If not, see + <http://www.gnu.org/licenses/>. 
+*/ + +#ifndef _CONFIG_H +#define _CONFIG_H +#include "config.h" +#endif + +#include "xlator.h" +#include "map.h" + +/* For <op>_cbk functions */ +#include "defaults.c" + + +int32_t +map_stat (call_frame_t *frame, + xlator_t *this, + loc_t *loc) +{ + int32_t op_errno = 1; + xlator_t *subvol = NULL; + + VALIDATE_OR_GOTO (frame, err); + VALIDATE_OR_GOTO (this, err); + VALIDATE_OR_GOTO (loc, err); + VALIDATE_OR_GOTO (loc->inode, err); + VALIDATE_OR_GOTO (loc->path, err); + + subvol = get_mapping_subvol_from_ctx (this, loc->inode); + if (!subvol) { + op_errno = EINVAL; + goto err; + } + + STACK_WIND (frame, + default_stat_cbk, + subvol, + subvol->fops->stat, + loc); + + return 0; + err: + STACK_UNWIND (frame, -1, op_errno, NULL, NULL); + + return 0; +} + + +int32_t +map_chmod (call_frame_t *frame, + xlator_t *this, + loc_t *loc, + mode_t mode) +{ + int32_t op_errno = 1; + xlator_t *subvol = NULL; + + VALIDATE_OR_GOTO (frame, err); + VALIDATE_OR_GOTO (this, err); + VALIDATE_OR_GOTO (loc, err); + VALIDATE_OR_GOTO (loc->inode, err); + VALIDATE_OR_GOTO (loc->path, err); + + subvol = get_mapping_subvol_from_ctx (this, loc->inode); + if (!subvol) { + op_errno = EINVAL; + goto err; + } + + STACK_WIND (frame, + default_chmod_cbk, + subvol, + subvol->fops->chmod, + loc, + mode); + return 0; + err: + STACK_UNWIND (frame, -1, op_errno, NULL, NULL); + + return 0; +} + +int32_t +map_fchmod (call_frame_t *frame, + xlator_t *this, + fd_t *fd, + mode_t mode) +{ + int32_t op_errno = 1; + xlator_t *subvol = NULL; + + VALIDATE_OR_GOTO (frame, err); + VALIDATE_OR_GOTO (this, err); + VALIDATE_OR_GOTO (fd, err); + VALIDATE_OR_GOTO (fd->inode, err); + + subvol = get_mapping_subvol_from_ctx (this, fd->inode); + if (!subvol) { + op_errno = EINVAL; + goto err; + } + + STACK_WIND (frame, + default_fchmod_cbk, + subvol, + subvol->fops->fchmod, + fd, + mode); + return 0; + err: + STACK_UNWIND (frame, -1, op_errno, NULL, NULL); + + return 0; +} + +int32_t +map_chown (call_frame_t *frame, + xlator_t 
*this, + loc_t *loc, + uid_t uid, + gid_t gid) +{ + int32_t op_errno = 1; + xlator_t *subvol = NULL; + + VALIDATE_OR_GOTO (frame, err); + VALIDATE_OR_GOTO (this, err); + VALIDATE_OR_GOTO (loc, err); + VALIDATE_OR_GOTO (loc->inode, err); + VALIDATE_OR_GOTO (loc->path, err); + + subvol = get_mapping_subvol_from_ctx (this, loc->inode); + if (!subvol) { + op_errno = EINVAL; + goto err; + } + + STACK_WIND (frame, + default_chown_cbk, + subvol, + subvol->fops->chown, + loc, + uid, + gid); + return 0; + err: + STACK_UNWIND (frame, -1, op_errno, NULL, NULL); + + return 0; +} + +int32_t +map_fchown (call_frame_t *frame, + xlator_t *this, + fd_t *fd, + uid_t uid, + gid_t gid) +{ + int32_t op_errno = 1; + xlator_t *subvol = NULL; + + VALIDATE_OR_GOTO (frame, err); + VALIDATE_OR_GOTO (this, err); + VALIDATE_OR_GOTO (fd, err); + VALIDATE_OR_GOTO (fd->inode, err); + + subvol = get_mapping_subvol_from_ctx (this, fd->inode); + if (!subvol) { + op_errno = EINVAL; + goto err; + } + + STACK_WIND (frame, + default_fchown_cbk, + subvol, + subvol->fops->fchown, + fd, + uid, + gid); + return 0; + err: + STACK_UNWIND (frame, -1, op_errno, NULL, NULL); + + return 0; +} + +int32_t +map_truncate (call_frame_t *frame, + xlator_t *this, + loc_t *loc, + off_t offset) +{ + int32_t op_errno = 1; + xlator_t *subvol = NULL; + + VALIDATE_OR_GOTO (frame, err); + VALIDATE_OR_GOTO (this, err); + VALIDATE_OR_GOTO (loc, err); + VALIDATE_OR_GOTO (loc->inode, err); + VALIDATE_OR_GOTO (loc->path, err); + + subvol = get_mapping_subvol_from_ctx (this, loc->inode); + if (!subvol) { + op_errno = EINVAL; + goto err; + } + + STACK_WIND (frame, + default_truncate_cbk, + subvol, + subvol->fops->truncate, + loc, + offset); + return 0; + err: + STACK_UNWIND (frame, -1, op_errno, NULL, NULL); + + return 0; +} + +int32_t +map_ftruncate (call_frame_t *frame, + xlator_t *this, + fd_t *fd, + off_t offset) +{ + int32_t op_errno = 1; + xlator_t *subvol = NULL; + + VALIDATE_OR_GOTO (frame, err); + VALIDATE_OR_GOTO (this, 
err); + VALIDATE_OR_GOTO (fd, err); + VALIDATE_OR_GOTO (fd->inode, err); + + subvol = get_mapping_subvol_from_ctx (this, fd->inode); + if (!subvol) { + op_errno = EINVAL; + goto err; + } + + STACK_WIND (frame, + default_ftruncate_cbk, + subvol, + subvol->fops->ftruncate, + fd, + offset); + return 0; + err: + STACK_UNWIND (frame, -1, op_errno, NULL, NULL); + + return 0; +} + +int32_t +map_utimens (call_frame_t *frame, + xlator_t *this, + loc_t *loc, + struct timespec tv[2]) +{ + int32_t op_errno = 1; + xlator_t *subvol = NULL; + + VALIDATE_OR_GOTO (frame, err); + VALIDATE_OR_GOTO (this, err); + VALIDATE_OR_GOTO (loc, err); + VALIDATE_OR_GOTO (loc->inode, err); + VALIDATE_OR_GOTO (loc->path, err); + + subvol = get_mapping_subvol_from_ctx (this, loc->inode); + if (!subvol) { + op_errno = EINVAL; + goto err; + } + + STACK_WIND (frame, + default_utimens_cbk, + subvol, + subvol->fops->utimens, + loc, + tv); + return 0; + err: + STACK_UNWIND (frame, -1, op_errno, NULL, NULL); + + return 0; +} + +int32_t +map_access (call_frame_t *frame, + xlator_t *this, + loc_t *loc, + int32_t mask) +{ + int32_t op_errno = 1; + xlator_t *subvol = NULL; + + VALIDATE_OR_GOTO (frame, err); + VALIDATE_OR_GOTO (this, err); + VALIDATE_OR_GOTO (loc, err); + VALIDATE_OR_GOTO (loc->inode, err); + VALIDATE_OR_GOTO (loc->path, err); + + subvol = get_mapping_subvol_from_ctx (this, loc->inode); + if (!subvol) { + op_errno = EINVAL; + goto err; + } + + STACK_WIND (frame, + default_access_cbk, + subvol, + subvol->fops->access, + loc, + mask); + return 0; + err: + STACK_UNWIND (frame, -1, op_errno, NULL, NULL); + + return 0; +} + + +int32_t +map_readlink (call_frame_t *frame, + xlator_t *this, + loc_t *loc, + size_t size) +{ + int32_t op_errno = 1; + xlator_t *subvol = NULL; + + VALIDATE_OR_GOTO (frame, err); + VALIDATE_OR_GOTO (this, err); + VALIDATE_OR_GOTO (loc, err); + VALIDATE_OR_GOTO (loc->inode, err); + VALIDATE_OR_GOTO (loc->path, err); + + subvol = get_mapping_subvol_from_ctx (this, 
loc->inode); + if (!subvol) { + op_errno = EINVAL; + goto err; + } + + STACK_WIND (frame, + default_readlink_cbk, + subvol, + subvol->fops->readlink, + loc, + size); + return 0; + err: + STACK_UNWIND (frame, -1, op_errno, NULL, NULL); + + return 0; +} + +int32_t +map_unlink (call_frame_t *frame, + xlator_t *this, + loc_t *loc) +{ + int32_t op_errno = 1; + xlator_t *subvol = NULL; + + VALIDATE_OR_GOTO (frame, err); + VALIDATE_OR_GOTO (this, err); + VALIDATE_OR_GOTO (loc, err); + VALIDATE_OR_GOTO (loc->inode, err); + VALIDATE_OR_GOTO (loc->path, err); + + subvol = get_mapping_subvol_from_ctx (this, loc->inode); + if (!subvol) { + op_errno = EINVAL; + goto err; + } + + STACK_WIND (frame, + default_unlink_cbk, + subvol, + subvol->fops->unlink, + loc); + return 0; + err: + STACK_UNWIND (frame, -1, op_errno, NULL, NULL); + + return 0; +} + +int32_t +map_rmdir (call_frame_t *frame, + xlator_t *this, + loc_t *loc) +{ + int32_t op_errno = 1; + xlator_t *subvol = NULL; + + VALIDATE_OR_GOTO (frame, err); + VALIDATE_OR_GOTO (this, err); + VALIDATE_OR_GOTO (loc, err); + VALIDATE_OR_GOTO (loc->inode, err); + VALIDATE_OR_GOTO (loc->path, err); + + subvol = get_mapping_subvol_from_ctx (this, loc->inode); + if (!subvol) { + op_errno = EINVAL; + goto err; + } + + STACK_WIND (frame, + default_rmdir_cbk, + subvol, + subvol->fops->rmdir, + loc); + return 0; + err: + STACK_UNWIND (frame, -1, op_errno, NULL, NULL); + + return 0; +} + + +int32_t +map_rename (call_frame_t *frame, + xlator_t *this, + loc_t *oldloc, + loc_t *newloc) +{ + int32_t op_errno = 1; + xlator_t *old_subvol = NULL; + xlator_t *new_subvol = NULL; + + VALIDATE_OR_GOTO (frame, err); + VALIDATE_OR_GOTO (this, err); + VALIDATE_OR_GOTO (oldloc, err); + VALIDATE_OR_GOTO (oldloc->inode, err); + VALIDATE_OR_GOTO (oldloc->path, err); + VALIDATE_OR_GOTO (newloc, err); + + old_subvol = get_mapping_subvol_from_ctx (this, oldloc->inode); + if (!old_subvol) { + op_errno = EINVAL; + goto err; + } + + if (newloc->path) { + new_subvol 
= get_mapping_subvol_from_path (this, + newloc->path); + if (new_subvol && (new_subvol != old_subvol)) { + op_errno = EXDEV; + goto err; + } + } + + STACK_WIND (frame, + default_rename_cbk, + old_subvol, + old_subvol->fops->rename, + oldloc, newloc); + return 0; + err: + STACK_UNWIND (frame, -1, op_errno, NULL, NULL); + + return 0; +} + + +int32_t +map_link (call_frame_t *frame, + xlator_t *this, + loc_t *oldloc, + loc_t *newloc) +{ + int32_t op_errno = 1; + xlator_t *old_subvol = NULL; + xlator_t *new_subvol = NULL; + + VALIDATE_OR_GOTO (frame, err); + VALIDATE_OR_GOTO (this, err); + VALIDATE_OR_GOTO (oldloc, err); + VALIDATE_OR_GOTO (oldloc->inode, err); + VALIDATE_OR_GOTO (oldloc->path, err); + VALIDATE_OR_GOTO (newloc, err); + + old_subvol = get_mapping_subvol_from_ctx (this, oldloc->inode); + if (!old_subvol) { + op_errno = EINVAL; + goto err; + } + + if (newloc->path) { + new_subvol = get_mapping_subvol_from_path (this, + newloc->path); + if (new_subvol && (new_subvol != old_subvol)) { + op_errno = EXDEV; + goto err; + } + } + + STACK_WIND (frame, + default_link_cbk, + old_subvol, + old_subvol->fops->link, + oldloc, newloc); + return 0; + err: + STACK_UNWIND (frame, -1, op_errno, NULL, NULL); + + return 0; +} + + +int32_t +map_open (call_frame_t *frame, + xlator_t *this, + loc_t *loc, + int32_t flags, fd_t *fd) +{ + int32_t op_errno = 1; + xlator_t *subvol = NULL; + + VALIDATE_OR_GOTO (frame, err); + VALIDATE_OR_GOTO (this, err); + VALIDATE_OR_GOTO (loc, err); + VALIDATE_OR_GOTO (loc->inode, err); + VALIDATE_OR_GOTO (loc->path, err); + + subvol = get_mapping_subvol_from_ctx (this, loc->inode); + if (!subvol) { + op_errno = EINVAL; + goto err; + } + + STACK_WIND (frame, + default_open_cbk, + subvol, + subvol->fops->open, + loc, flags, fd); + return 0; + err: + STACK_UNWIND (frame, -1, op_errno, NULL, NULL); + + return 0; +} + +int32_t +map_readv (call_frame_t *frame, + xlator_t *this, + fd_t *fd, + size_t size, + off_t offset) +{ + int32_t op_errno = 1; + 
xlator_t *subvol = NULL; + + VALIDATE_OR_GOTO (frame, err); + VALIDATE_OR_GOTO (this, err); + VALIDATE_OR_GOTO (fd, err); + VALIDATE_OR_GOTO (fd->inode, err); + + subvol = get_mapping_subvol_from_ctx (this, fd->inode); + if (!subvol) { + op_errno = EINVAL; + goto err; + } + + STACK_WIND (frame, + default_readv_cbk, + subvol, + subvol->fops->readv, + fd, + size, + offset); + return 0; + err: + STACK_UNWIND (frame, -1, op_errno, NULL, NULL); + + return 0; +} + +int32_t +map_writev (call_frame_t *frame, + xlator_t *this, + fd_t *fd, + struct iovec *vector, + int32_t count, + off_t off) +{ + int32_t op_errno = 1; + xlator_t *subvol = NULL; + + VALIDATE_OR_GOTO (frame, err); + VALIDATE_OR_GOTO (this, err); + VALIDATE_OR_GOTO (fd, err); + VALIDATE_OR_GOTO (fd->inode, err); + + subvol = get_mapping_subvol_from_ctx (this, fd->inode); + if (!subvol) { + op_errno = EINVAL; + goto err; + } + + STACK_WIND (frame, + default_writev_cbk, + subvol, + subvol->fops->writev, + fd, + vector, + count, + off); + return 0; + err: + STACK_UNWIND (frame, -1, op_errno, NULL, NULL); + + return 0; +} + +int32_t +map_flush (call_frame_t *frame, + xlator_t *this, + fd_t *fd) +{ + int32_t op_errno = 1; + xlator_t *subvol = NULL; + + VALIDATE_OR_GOTO (frame, err); + VALIDATE_OR_GOTO (this, err); + VALIDATE_OR_GOTO (fd, err); + VALIDATE_OR_GOTO (fd->inode, err); + + subvol = get_mapping_subvol_from_ctx (this, fd->inode); + if (!subvol) { + op_errno = EINVAL; + goto err; + } + + STACK_WIND (frame, + default_flush_cbk, + subvol, + subvol->fops->flush, + fd); + return 0; + err: + STACK_UNWIND (frame, -1, op_errno, NULL, NULL); + + return 0; +} + + +int32_t +map_fsync (call_frame_t *frame, + xlator_t *this, + fd_t *fd, + int32_t flags) +{ + int32_t op_errno = 1; + xlator_t *subvol = NULL; + + VALIDATE_OR_GOTO (frame, err); + VALIDATE_OR_GOTO (this, err); + VALIDATE_OR_GOTO (fd, err); + VALIDATE_OR_GOTO (fd->inode, err); + + subvol = get_mapping_subvol_from_ctx (this, fd->inode); + if (!subvol) { + 
op_errno = EINVAL; + goto err; + } + + STACK_WIND (frame, + default_fsync_cbk, + subvol, + subvol->fops->fsync, + fd, + flags); + return 0; + err: + STACK_UNWIND (frame, -1, op_errno, NULL, NULL); + + return 0; +} + +int32_t +map_fstat (call_frame_t *frame, + xlator_t *this, + fd_t *fd) +{ + int32_t op_errno = 1; + xlator_t *subvol = NULL; + + VALIDATE_OR_GOTO (frame, err); + VALIDATE_OR_GOTO (this, err); + VALIDATE_OR_GOTO (fd, err); + VALIDATE_OR_GOTO (fd->inode, err); + + subvol = get_mapping_subvol_from_ctx (this, fd->inode); + if (!subvol) { + op_errno = EINVAL; + goto err; + } + + STACK_WIND (frame, + default_fstat_cbk, + subvol, + subvol->fops->fstat, + fd); + return 0; + err: + STACK_UNWIND (frame, -1, op_errno, NULL, NULL); + + return 0; +} + +int32_t +map_getdents (call_frame_t *frame, + xlator_t *this, + fd_t *fd, + size_t size, + off_t offset, + int32_t flag) +{ + int32_t op_errno = 1; + xlator_t *subvol = NULL; + + VALIDATE_OR_GOTO (frame, err); + VALIDATE_OR_GOTO (this, err); + VALIDATE_OR_GOTO (fd, err); + VALIDATE_OR_GOTO (fd->inode, err); + + subvol = get_mapping_subvol_from_ctx (this, fd->inode); + if (!subvol) { + op_errno = EINVAL; + goto err; + } + + STACK_WIND (frame, + default_getdents_cbk, + subvol, + subvol->fops->getdents, + fd, + size, + offset, + flag); + return 0; + err: + STACK_UNWIND (frame, -1, op_errno, NULL, NULL); + + return 0; +} + +int32_t +map_setdents (call_frame_t *frame, + xlator_t *this, + fd_t *fd, + int32_t flags, + dir_entry_t *entries, + int32_t count) +{ + int32_t op_errno = 1; + xlator_t *subvol = NULL; + + VALIDATE_OR_GOTO (frame, err); + VALIDATE_OR_GOTO (this, err); + VALIDATE_OR_GOTO (fd, err); + VALIDATE_OR_GOTO (fd->inode, err); + + subvol = get_mapping_subvol_from_ctx (this, fd->inode); + if (!subvol) { + op_errno = EINVAL; + goto err; + } + + STACK_WIND (frame, + default_setdents_cbk, + subvol, + subvol->fops->setdents, + fd, + flags, + entries, + count); + return 0; + err: + STACK_UNWIND (frame, -1, op_errno, 
NULL, NULL); + + return 0; +} + + +int32_t +map_fsyncdir (call_frame_t *frame, + xlator_t *this, + fd_t *fd, + int32_t flags) +{ + int32_t op_errno = 1; + xlator_t *subvol = NULL; + + VALIDATE_OR_GOTO (frame, err); + VALIDATE_OR_GOTO (this, err); + VALIDATE_OR_GOTO (fd, err); + VALIDATE_OR_GOTO (fd->inode, err); + + subvol = get_mapping_subvol_from_ctx (this, fd->inode); + if (!subvol) { + op_errno = EINVAL; + goto err; + } + + STACK_WIND (frame, + default_fsyncdir_cbk, + subvol, + subvol->fops->fsyncdir, + fd, + flags); + return 0; + err: + STACK_UNWIND (frame, -1, op_errno, NULL, NULL); + + return 0; +} + + + + +int32_t +map_setxattr (call_frame_t *frame, + xlator_t *this, + loc_t *loc, + dict_t *dict, + int32_t flags) +{ + /* TODO: support for 'get' 'put' API */ + int32_t op_errno = 1; + xlator_t *subvol = NULL; + + VALIDATE_OR_GOTO (frame, err); + VALIDATE_OR_GOTO (this, err); + VALIDATE_OR_GOTO (loc, err); + VALIDATE_OR_GOTO (loc->inode, err); + VALIDATE_OR_GOTO (loc->path, err); + + subvol = get_mapping_subvol_from_ctx (this, loc->inode); + if (!subvol) { + op_errno = EINVAL; + goto err; + } + + STACK_WIND (frame, + default_setxattr_cbk, + subvol, + subvol->fops->setxattr, + loc, + dict, + flags); + return 0; + err: + STACK_UNWIND (frame, -1, op_errno, NULL, NULL); + + return 0; +} + +int32_t +map_getxattr (call_frame_t *frame, + xlator_t *this, + loc_t *loc, + const char *name) +{ + /* TODO: support for 'get' 'put' API */ + int32_t op_errno = 1; + xlator_t *subvol = NULL; + + VALIDATE_OR_GOTO (frame, err); + VALIDATE_OR_GOTO (this, err); + VALIDATE_OR_GOTO (loc, err); + VALIDATE_OR_GOTO (loc->inode, err); + VALIDATE_OR_GOTO (loc->path, err); + + subvol = get_mapping_subvol_from_ctx (this, loc->inode); + if (!subvol) { + op_errno = EINVAL; + goto err; + } + + STACK_WIND (frame, + default_getxattr_cbk, + subvol, + subvol->fops->getxattr, + loc, + name); + return 0; + err: + STACK_UNWIND (frame, -1, op_errno, NULL, NULL); + + return 0; +} + +int32_t 
+map_xattrop (call_frame_t *frame, + xlator_t *this, + loc_t *loc, + gf_xattrop_flags_t flags, + dict_t *dict) +{ + int32_t op_errno = 1; + xlator_t *subvol = NULL; + + VALIDATE_OR_GOTO (frame, err); + VALIDATE_OR_GOTO (this, err); + VALIDATE_OR_GOTO (loc, err); + VALIDATE_OR_GOTO (loc->inode, err); + VALIDATE_OR_GOTO (loc->path, err); + + subvol = get_mapping_subvol_from_ctx (this, loc->inode); + if (!subvol) { + op_errno = EINVAL; + goto err; + } + + STACK_WIND (frame, + default_xattrop_cbk, + subvol, + subvol->fops->xattrop, + loc, + flags, + dict); + return 0; + err: + STACK_UNWIND (frame, -1, op_errno, NULL, NULL); + + return 0; +} + +int32_t +map_fxattrop (call_frame_t *frame, + xlator_t *this, + fd_t *fd, + gf_xattrop_flags_t flags, + dict_t *dict) +{ + int32_t op_errno = 1; + xlator_t *subvol = NULL; + + VALIDATE_OR_GOTO (frame, err); + VALIDATE_OR_GOTO (this, err); + VALIDATE_OR_GOTO (fd, err); + VALIDATE_OR_GOTO (fd->inode, err); + + subvol = get_mapping_subvol_from_ctx (this, fd->inode); + if (!subvol) { + op_errno = EINVAL; + goto err; + } + + STACK_WIND (frame, + default_fxattrop_cbk, + subvol, + subvol->fops->fxattrop, + fd, + flags, + dict); + return 0; + err: + STACK_UNWIND (frame, -1, op_errno, NULL, NULL); + + return 0; +} + +int32_t +map_removexattr (call_frame_t *frame, + xlator_t *this, + loc_t *loc, + const char *name) +{ + int32_t op_errno = 1; + xlator_t *subvol = NULL; + + VALIDATE_OR_GOTO (frame, err); + VALIDATE_OR_GOTO (this, err); + VALIDATE_OR_GOTO (loc, err); + VALIDATE_OR_GOTO (loc->inode, err); + VALIDATE_OR_GOTO (loc->path, err); + + subvol = get_mapping_subvol_from_ctx (this, loc->inode); + if (!subvol) { + op_errno = EINVAL; + goto err; + } + + STACK_WIND (frame, + default_removexattr_cbk, + subvol, + subvol->fops->removexattr, + loc, + name); + return 0; + err: + STACK_UNWIND (frame, -1, op_errno, NULL, NULL); + + return 0; +} + +int32_t +map_lk (call_frame_t *frame, + xlator_t *this, + fd_t *fd, + int32_t cmd, + struct flock 
*lock) +{ + int32_t op_errno = 1; + xlator_t *subvol = NULL; + + VALIDATE_OR_GOTO (frame, err); + VALIDATE_OR_GOTO (this, err); + VALIDATE_OR_GOTO (fd, err); + VALIDATE_OR_GOTO (fd->inode, err); + + subvol = get_mapping_subvol_from_ctx (this, fd->inode); + if (!subvol) { + op_errno = EINVAL; + goto err; + } + + STACK_WIND (frame, + default_lk_cbk, + subvol, + subvol->fops->lk, + fd, + cmd, + lock); + return 0; + err: + STACK_UNWIND (frame, -1, op_errno, NULL, NULL); + + return 0; +} + + +int32_t +map_inodelk (call_frame_t *frame, xlator_t *this, + loc_t *loc, int32_t cmd, struct flock *lock) +{ + int32_t op_errno = 1; + xlator_t *subvol = NULL; + + VALIDATE_OR_GOTO (frame, err); + VALIDATE_OR_GOTO (this, err); + VALIDATE_OR_GOTO (loc, err); + VALIDATE_OR_GOTO (loc->inode, err); + VALIDATE_OR_GOTO (loc->path, err); + + subvol = get_mapping_subvol_from_ctx (this, loc->inode); + if (!subvol) { + op_errno = EINVAL; + goto err; + } + + STACK_WIND (frame, + default_inodelk_cbk, + subvol, + subvol->fops->inodelk, + loc, cmd, lock); + return 0; + err: + STACK_UNWIND (frame, -1, op_errno, NULL, NULL); + + return 0; +} + + +int32_t +map_finodelk (call_frame_t *frame, xlator_t *this, + fd_t *fd, int32_t cmd, struct flock *lock) +{ + int32_t op_errno = 1; + xlator_t *subvol = NULL; + + VALIDATE_OR_GOTO (frame, err); + VALIDATE_OR_GOTO (this, err); + VALIDATE_OR_GOTO (fd, err); + VALIDATE_OR_GOTO (fd->inode, err); + + subvol = get_mapping_subvol_from_ctx (this, fd->inode); + if (!subvol) { + op_errno = EINVAL; + goto err; + } + + STACK_WIND (frame, + default_finodelk_cbk, + subvol, + subvol->fops->finodelk, + fd, cmd, lock); + return 0; + err: + STACK_UNWIND (frame, -1, op_errno, NULL, NULL); + + return 0; +} + +int32_t +map_entrylk (call_frame_t *frame, xlator_t *this, + loc_t *loc, const char *basename, + entrylk_cmd cmd, entrylk_type type) +{ + int32_t op_errno = 1; + xlator_t *subvol = NULL; + + VALIDATE_OR_GOTO (frame, err); + VALIDATE_OR_GOTO (this, err); + 
VALIDATE_OR_GOTO (loc, err); + VALIDATE_OR_GOTO (loc->inode, err); + VALIDATE_OR_GOTO (loc->path, err); + + subvol = get_mapping_subvol_from_ctx (this, loc->inode); + if (!subvol) { + op_errno = EINVAL; + goto err; + } + + STACK_WIND (frame, default_entrylk_cbk, + subvol, + subvol->fops->entrylk, + loc, basename, cmd, type); + return 0; + err: + STACK_UNWIND (frame, -1, op_errno, NULL, NULL); + + return 0; +} + +int32_t +map_fentrylk (call_frame_t *frame, xlator_t *this, + fd_t *fd, const char *basename, + entrylk_cmd cmd, entrylk_type type) +{ + int32_t op_errno = 1; + xlator_t *subvol = NULL; + + VALIDATE_OR_GOTO (frame, err); + VALIDATE_OR_GOTO (this, err); + VALIDATE_OR_GOTO (fd, err); + VALIDATE_OR_GOTO (fd->inode, err); + + subvol = get_mapping_subvol_from_ctx (this, fd->inode); + if (!subvol) { + op_errno = EINVAL; + goto err; + } + + STACK_WIND (frame, default_fentrylk_cbk, + subvol, + subvol->fops->fentrylk, + fd, basename, cmd, type); + return 0; + err: + STACK_UNWIND (frame, -1, op_errno, NULL, NULL); + + return 0; +} + +int32_t +map_checksum (call_frame_t *frame, + xlator_t *this, + loc_t *loc, + int32_t flag) +{ + int32_t op_errno = 1; + xlator_t *subvol = NULL; + + VALIDATE_OR_GOTO (frame, err); + VALIDATE_OR_GOTO (this, err); + VALIDATE_OR_GOTO (loc, err); + VALIDATE_OR_GOTO (loc->inode, err); + VALIDATE_OR_GOTO (loc->path, err); + + subvol = get_mapping_subvol_from_ctx (this, loc->inode); + if (!subvol) { + op_errno = EINVAL; + goto err; + } + + STACK_WIND (frame, + default_checksum_cbk, + subvol, + subvol->fops->checksum, + loc, + flag); + return 0; + err: + STACK_UNWIND (frame, -1, op_errno, NULL, NULL); + + return 0; +} + +static int32_t +map_newentry_cbk (call_frame_t *frame, + void *cookie, + xlator_t *this, + int32_t op_ret, + int32_t op_errno, + inode_t *inode, + struct stat *buf) +{ + call_frame_t *prev = NULL; + prev = cookie; + + map_itransform (this, prev->this, buf->st_ino, &buf->st_ino); + + STACK_UNWIND (frame, op_ret, op_errno, inode, 
buf); + return 0; + +} + + +int32_t +map_mknod (call_frame_t *frame, + xlator_t *this, + loc_t *loc, + mode_t mode, + dev_t rdev) +{ + int32_t op_errno = 1; + xlator_t *subvol = NULL; + + VALIDATE_OR_GOTO (frame, err); + VALIDATE_OR_GOTO (this, err); + VALIDATE_OR_GOTO (loc, err); + VALIDATE_OR_GOTO (loc->path, err); + VALIDATE_OR_GOTO (loc->inode, err); + + subvol = get_mapping_subvol_from_path (this, loc->path); + if (!subvol) { + op_errno = EINVAL; + goto err; + } + + op_errno = inode_ctx_put (loc->inode, this, (uint64_t)(long)subvol); + if (op_errno != 0) { + gf_log (this->name, GF_LOG_ERROR, + "%s: failed to set subvolume ptr in inode ctx", + loc->path); + } + + STACK_WIND (frame, + map_newentry_cbk, + subvol, + subvol->fops->mknod, + loc, mode, rdev); + return 0; + err: + STACK_UNWIND (frame, -1, op_errno, NULL, NULL); + + return 0; +} + +int32_t +map_mkdir (call_frame_t *frame, + xlator_t *this, + loc_t *loc, + mode_t mode) +{ + int32_t op_errno = 1; + xlator_t *subvol = NULL; + + VALIDATE_OR_GOTO (frame, err); + VALIDATE_OR_GOTO (this, err); + VALIDATE_OR_GOTO (loc, err); + VALIDATE_OR_GOTO (loc->path, err); + VALIDATE_OR_GOTO (loc->inode, err); + + subvol = get_mapping_subvol_from_path (this, loc->path); + if (!subvol) { + op_errno = EINVAL; + goto err; + } + + op_errno = inode_ctx_put (loc->inode, this, (uint64_t)(long)subvol); + if (op_errno != 0) { + gf_log (this->name, GF_LOG_ERROR, + "%s: failed to set subvolume ptr in inode ctx", + loc->path); + } + + STACK_WIND (frame, + map_newentry_cbk, + subvol, + subvol->fops->mkdir, + loc, mode); + return 0; + err: + STACK_UNWIND (frame, -1, op_errno, NULL, NULL); + + return 0; +} + + +int32_t +map_symlink (call_frame_t *frame, + xlator_t *this, + const char *linkpath, + loc_t *loc) +{ + int32_t op_errno = 1; + xlator_t *subvol = NULL; + + VALIDATE_OR_GOTO (frame, err); + VALIDATE_OR_GOTO (this, err); + VALIDATE_OR_GOTO (loc, err); + VALIDATE_OR_GOTO (loc->path, err); + VALIDATE_OR_GOTO (loc->inode, err); + + 
subvol = get_mapping_subvol_from_path (this, loc->path); + if (!subvol) { + op_errno = EINVAL; + goto err; + } + + op_errno = inode_ctx_put (loc->inode, this, (uint64_t)(long)subvol); + if (op_errno != 0) { + gf_log (this->name, GF_LOG_ERROR, + "%s: failed to set subvolume ptr in inode ctx", + loc->path); + } + + STACK_WIND (frame, + map_newentry_cbk, + subvol, + subvol->fops->symlink, + linkpath, loc); + return 0; + err: + STACK_UNWIND (frame, -1, op_errno, NULL, NULL); + + return 0; +} + + +static int32_t +map_create_cbk (call_frame_t *frame, + void *cookie, + xlator_t *this, + int32_t op_ret, + int32_t op_errno, + fd_t *fd, + inode_t *inode, + struct stat *buf) +{ + call_frame_t *prev = NULL; + prev = cookie; + + map_itransform (this, prev->this, buf->st_ino, &buf->st_ino); + STACK_UNWIND (frame, op_ret, op_errno, fd, inode, buf); + return 0; +} + +int32_t +map_create (call_frame_t *frame, + xlator_t *this, + loc_t *loc, + int32_t flags, + mode_t mode, fd_t *fd) +{ + int32_t op_errno = 1; + xlator_t *subvol = NULL; + + VALIDATE_OR_GOTO (frame, err); + VALIDATE_OR_GOTO (this, err); + VALIDATE_OR_GOTO (loc, err); + VALIDATE_OR_GOTO (loc->path, err); + VALIDATE_OR_GOTO (loc->inode, err); + + subvol = get_mapping_subvol_from_path (this, loc->path); + if (!subvol) { + op_errno = EINVAL; + goto err; + } + + op_errno = inode_ctx_put (loc->inode, this, (uint64_t)(long)subvol); + if (op_errno != 0) { + gf_log (this->name, GF_LOG_ERROR, + "%s: failed to set subvolume ptr in inode ctx", + loc->path); + } + + STACK_WIND (frame, map_create_cbk, + subvol, + subvol->fops->create, + loc, flags, mode, fd); + return 0; + err: + STACK_UNWIND (frame, -1, op_errno, NULL, NULL); + + return 0; +} + +int32_t +map_single_lookup_cbk (call_frame_t *frame, + void *cookie, + xlator_t *this, + int32_t op_ret, + int32_t op_errno, + inode_t *inode, + struct stat *buf, + dict_t *dict) +{ + call_frame_t *prev = NULL; + prev = cookie; + + map_itransform (this, prev->this, buf->st_ino, 
&buf->st_ino); + + STACK_UNWIND (frame, op_ret, op_errno, inode, buf, dict); + + return 0; +} + +int32_t +map_lookup_cbk (call_frame_t *frame, + void *cookie, + xlator_t *this, + int32_t op_ret, + int32_t op_errno, + inode_t *inode, + struct stat *buf, + dict_t *dict) +{ + int callcnt = 0; + map_local_t *local = NULL; + inode_t *tmp_inode = NULL; + dict_t *tmp_dict = NULL; + + local = frame->local; + LOCK (&frame->lock); + { + callcnt = --local->call_count; + if ((op_ret == 0) && (local->op_ret == -1)) { + local->op_ret = 0; + local->stbuf = *buf; + if (dict) + local->dict = dict_ref (dict); + local->inode = inode_ref (inode); + } + if (op_ret == -1) + local->op_errno = op_errno; + + } + UNLOCK (&frame->lock); + + if (!callcnt) { + tmp_dict = local->dict; + tmp_inode = local->inode; + + STACK_UNWIND (frame, local->op_ret, + local->op_errno, local->inode, + &local->stbuf, local->dict); + + inode_unref (local->inode); + if (tmp_dict) + dict_unref (tmp_dict); + } + + return 0; +} + +int32_t +map_lookup (call_frame_t *frame, + xlator_t *this, + loc_t *loc, + dict_t *xattr_req) +{ + int32_t op_errno = EINVAL; + xlator_t *subvol = NULL; + map_local_t *local = NULL; + map_private_t *priv = NULL; + xlator_list_t *trav = NULL; + + VALIDATE_OR_GOTO (frame, err); + VALIDATE_OR_GOTO (this, err); + VALIDATE_OR_GOTO (loc, err); + VALIDATE_OR_GOTO (loc->path, err); + VALIDATE_OR_GOTO (loc->inode, err); + + priv = this->private; + + if (loc->inode->ino == 1) + goto root_inode; + + subvol = get_mapping_subvol_from_ctx (this, loc->inode); + if (!subvol) { + subvol = get_mapping_subvol_from_path (this, loc->path); + if (!subvol) { + goto err; + } + + op_errno = inode_ctx_put (loc->inode, this, + (uint64_t)(long)subvol); + if (op_errno != 0) { + gf_log (this->name, GF_LOG_ERROR, + "%s: failed to set subvolume in inode ctx", + loc->path); + } + } + + /* Just one callback */ + STACK_WIND (frame, + map_single_lookup_cbk, + subvol, + subvol->fops->lookup, + loc, + xattr_req); + + return 
0; + + root_inode: + local = CALLOC (1, sizeof (map_local_t)); + + frame->local = local; + local->call_count = priv->child_count; + local->op_ret = -1; + + trav = this->children; + while (trav) { + STACK_WIND (frame, + map_lookup_cbk, + trav->xlator, + trav->xlator->fops->lookup, + loc, + xattr_req); + trav = trav->next; + } + + return 0; + + err: + STACK_UNWIND (frame, -1, op_errno, NULL, NULL); + + return 0; +} +/* + * unify_normalize_stats - + */ +void +map_normalize_stats (struct statvfs *buf, + unsigned long bsize, + unsigned long frsize) +{ + double factor; + + if (buf->f_bsize != bsize) { + factor = ((double) buf->f_bsize) / bsize; + buf->f_bsize = bsize; + buf->f_bfree = (fsblkcnt_t) (factor * buf->f_bfree); + buf->f_bavail = (fsblkcnt_t) (factor * buf->f_bavail); + } + + if (buf->f_frsize != frsize) { + factor = ((double) buf->f_frsize) / frsize; + buf->f_frsize = frsize; + buf->f_blocks = (fsblkcnt_t) (factor * buf->f_blocks); + } +} + + +int32_t +map_statfs_cbk (call_frame_t *frame, + void *cookie, + xlator_t *this, + int32_t op_ret, + int32_t op_errno, + struct statvfs *stbuf) +{ + struct statvfs *dict_buf = NULL; + map_local_t *local = NULL; + int this_call_cnt = 0; + unsigned long bsize; + unsigned long frsize; + + local = frame->local; + + LOCK (&frame->lock); + { + this_call_cnt = --local->call_count; + + if (op_ret == -1) { + local->op_errno = op_errno; + goto unlock; + } + local->op_ret = 0; + + /* when a call is successfull, add it to local->dict */ + dict_buf = &local->statvfs; + + if (dict_buf->f_bsize != 0) { + bsize = max (dict_buf->f_bsize, + stbuf->f_bsize); + + frsize = max (dict_buf->f_frsize, + stbuf->f_frsize); + map_normalize_stats(dict_buf, bsize, frsize); + map_normalize_stats(stbuf, bsize, frsize); + } else { + dict_buf->f_bsize = stbuf->f_bsize; + dict_buf->f_frsize = stbuf->f_frsize; + } + + dict_buf->f_blocks += stbuf->f_blocks; + dict_buf->f_bfree += stbuf->f_bfree; + dict_buf->f_bavail += stbuf->f_bavail; + dict_buf->f_files += 
stbuf->f_files; + dict_buf->f_ffree += stbuf->f_ffree; + dict_buf->f_favail += stbuf->f_favail; + dict_buf->f_fsid = stbuf->f_fsid; + dict_buf->f_flag = stbuf->f_flag; + dict_buf->f_namemax = stbuf->f_namemax; + } +unlock: + UNLOCK (&frame->lock); + + if (!this_call_cnt) { + STACK_UNWIND (frame, local->op_ret, local->op_errno, + &local->statvfs); + } + + return 0; +} + +int32_t +map_statfs (call_frame_t *frame, + xlator_t *this, + loc_t *loc) +{ + int32_t op_errno = EINVAL; + xlator_t *subvol = NULL; + map_local_t *local = NULL; + map_private_t *priv = NULL; + xlator_list_t *trav = NULL; + + VALIDATE_OR_GOTO (frame, err); + VALIDATE_OR_GOTO (this, err); + VALIDATE_OR_GOTO (loc, err); + VALIDATE_OR_GOTO (loc->path, err); + VALIDATE_OR_GOTO (loc->inode, err); + + if (loc->inode->ino == 1) + goto root_inode; + subvol = get_mapping_subvol_from_ctx (this, loc->inode); + if (!subvol) { + goto err; + } + + /* Just one callback */ + STACK_WIND (frame, + default_statfs_cbk, + subvol, + subvol->fops->statfs, + loc); + + return 0; + + root_inode: + local = CALLOC (1, sizeof (map_local_t)); + + priv = this->private; + frame->local = local; + local->call_count = priv->child_count; + local->op_ret = -1; + + trav = this->children; + while (trav) { + STACK_WIND (frame, + map_statfs_cbk, + trav->xlator, + trav->xlator->fops->statfs, + loc); + trav = trav->next; + } + + return 0; + err: + STACK_UNWIND (frame, -1, op_errno, NULL); + + return 0; +} + +int32_t +map_opendir_cbk (call_frame_t *frame, + void *cookie, + xlator_t *this, + int32_t op_ret, + int32_t op_errno, + fd_t *fd) +{ + int callcnt = 0; + map_local_t *local = NULL; + fd_t *local_fd = NULL; + + local = frame->local; + LOCK (&frame->lock); + { + callcnt = --local->call_count; + + if (op_ret == -1) { + local->op_errno = op_errno; + goto unlock; + } + + local->op_ret = 0; + } + unlock: + UNLOCK (&frame->lock); + + if (!callcnt) { + local_fd = local->fd; + local->fd = NULL; + + STACK_UNWIND (frame, local->op_ret, + 
local->op_errno, local_fd); + + fd_unref (local_fd); + } + return 0; +} + + +int32_t +map_opendir (call_frame_t *frame, + xlator_t *this, + loc_t *loc, fd_t *fd) +{ + int32_t op_errno = EINVAL; + xlator_t *subvol = NULL; + map_local_t *local = NULL; + map_private_t *priv = NULL; + xlator_list_t *trav = NULL; + + VALIDATE_OR_GOTO (frame, err); + VALIDATE_OR_GOTO (this, err); + VALIDATE_OR_GOTO (fd, err); + VALIDATE_OR_GOTO (fd->inode, err); + + if (loc->inode->ino == 1) + goto root_inode; + + subvol = get_mapping_subvol_from_ctx (this, fd->inode); + if (!subvol) { + goto err; + } + + /* Just one callback */ + STACK_WIND (frame, + default_opendir_cbk, + subvol, + subvol->fops->opendir, + loc, fd); + return 0; + + root_inode: + local = CALLOC (1, sizeof (map_local_t)); + + priv = this->private; + frame->local = local; + local->call_count = priv->child_count; + local->op_ret = -1; + local->fd = fd_ref (fd); + + trav = this->children; + while (trav) { + STACK_WIND (frame, + map_opendir_cbk, + trav->xlator, + trav->xlator->fops->opendir, + loc, fd); + trav = trav->next; + } + + return 0; + err: + STACK_UNWIND (frame, -1, op_errno, NULL); + + return 0; +} + + +int32_t +map_single_readdir_cbk (call_frame_t *frame, + void *cookie, + xlator_t *this, + int32_t op_ret, + int32_t op_errno, + gf_dirent_t *entries) +{ + call_frame_t *prev = NULL; + gf_dirent_t *orig_entry = NULL; + + prev = cookie; + + list_for_each_entry (orig_entry, &entries->list, list) { + map_itransform (this, prev->this, orig_entry->d_ino, + &orig_entry->d_ino); + } + STACK_UNWIND (frame, op_ret, op_errno, entries); + + return 0; +} + + +int +map_readdir_cbk (call_frame_t *frame, void *cookie, xlator_t *this, + int op_ret, int op_errno, gf_dirent_t *orig_entries) +{ + map_local_t *local = NULL; + gf_dirent_t entries; + gf_dirent_t *orig_entry = NULL; + gf_dirent_t *entry = NULL; + call_frame_t *prev = NULL; + xlator_t *subvol = NULL; + xlator_t *next = NULL; + int count = 0; + fd_t *local_fd = NULL; + + 
INIT_LIST_HEAD (&entries.list); + prev = cookie; + local = frame->local; + + if (op_ret < 0) + goto done; + + list_for_each_entry (orig_entry, &orig_entries->list, list) { + subvol = prev->this; + + entry = gf_dirent_for_name (orig_entry->d_name); + if (!entry) { + gf_log (this->name, GF_LOG_ERROR, + "memory allocation failed :("); + goto unwind; + } + + map_itransform (this, subvol, orig_entry->d_ino, + &entry->d_ino); + map_itransform (this, subvol, orig_entry->d_off, + &entry->d_off); + + entry->d_type = orig_entry->d_type; + entry->d_len = orig_entry->d_len; + + list_add_tail (&entry->list, &entries.list); + count++; + } + + op_ret = count; + +done: + if (count == 0) { + next = map_subvol_next (this, prev->this); + if (!next) { + goto unwind; + } + + STACK_WIND (frame, map_readdir_cbk, + next, next->fops->readdir, + local->fd, local->size, 0); + return 0; + } + +unwind: + if (op_ret < 0) + op_ret = 0; + + local_fd = local->fd; + local->fd = NULL; + + STACK_UNWIND (frame, op_ret, op_errno, &entries); + + fd_unref (local_fd); + + gf_dirent_free (&entries); + + return 0; +} + + +int32_t +map_readdir (call_frame_t *frame, + xlator_t *this, + fd_t *fd, + size_t size, + off_t yoff) +{ + int32_t op_errno = EINVAL; + xlator_t *subvol = NULL; + map_local_t *local = NULL; + map_private_t *priv = NULL; + xlator_t *xvol = NULL; + off_t xoff = 0; + + VALIDATE_OR_GOTO (frame, err); + VALIDATE_OR_GOTO (this, err); + VALIDATE_OR_GOTO (fd, err); + VALIDATE_OR_GOTO (fd->inode, err); + + if (fd->inode->ino == 1) + goto root_inode; + + subvol = get_mapping_subvol_from_ctx (this, fd->inode); + if (!subvol) { + goto err; + } + + /* Just one callback */ + + STACK_WIND (frame, + map_single_readdir_cbk, + subvol, + subvol->fops->readdir, + fd, size, yoff); + return 0; + + root_inode: + /* readdir on '/' */ + local = CALLOC (1, sizeof (map_local_t)); + if (!local) { + gf_log (this->name, GF_LOG_ERROR, + "memory allocation failed :("); + op_errno = ENOMEM; + goto err; + } + + priv = 
this->private; + frame->local = local; + local->op_errno = ENOENT; + local->op_ret = -1; + + local->fd = fd_ref (fd); + local->size = size; + + map_deitransform (this, yoff, &xvol, (uint64_t *)&xoff); + + STACK_WIND (frame, map_readdir_cbk, + xvol, xvol->fops->readdir, + fd, size, xoff); + + return 0; + err: + STACK_UNWIND (frame, -1, op_errno, NULL); + + return 0; +} + + +#if 0 +/* TODO : do it later as currently only unify uses this mop and mostly + unify will be used below map */ +int32_t +map_stats_cbk (call_frame_t *frame, + void *cookie, + xlator_t *this, + int32_t op_ret, + int32_t op_errno, + struct xlator_stats *stats) +{ + STACK_UNWIND (frame, op_ret, op_errno, stats); + return 0; +} + + +int32_t +map_stats (call_frame_t *frame, + xlator_t *this, + int32_t flags) +{ + STACK_WIND (frame, + map_stats_cbk, + subvol, + subvol->mops->stats, + flags); + return 0; + err: + STACK_UNWIND (frame, -1, op_errno, NULL, NULL); + + return 0; +} +#endif /* if 0 */ + + +/* TODO: define the behavior of notify */ + + +void +fini (xlator_t *this) +{ + map_private_t *priv = NULL; + struct map_pattern *trav_map = NULL; + struct map_pattern *tmp_map = NULL; + + priv = this->private; + + if (priv) { + if (priv->xlarray) + FREE (priv->xlarray); + + trav_map = priv->map; + while (trav_map) { + tmp_map = trav_map; + trav_map = trav_map->next; + FREE (tmp_map); + } + + FREE(priv); + } + + return; +} + +int +init (xlator_t *this) +{ + map_private_t *priv = NULL; + xlator_list_t *trav = NULL; + int count = 0; + int ret = -1; + char *pattern_string = NULL; + char *map_pair_str = NULL; + char *tmp_str = NULL; + char *tmp_str1 = NULL; + char *dup_map_pair = NULL; + char *dir_str = NULL; + char *subvol_str = NULL; + char *default_xl = NULL; + + if (!this->children) { + gf_log (this->name,GF_LOG_ERROR, + "FATAL: map should have one or more child defined"); + return -1; + } + + if (!this->parents) { + gf_log (this->name, GF_LOG_WARNING, + "dangling volume. 
check volfile "); + } + + priv = CALLOC (1, sizeof (map_private_t)); + this->private = priv; + + /* allocate xlator array */ + trav = this->children; + while (trav) { + count++; + trav = trav->next; + } + priv->xlarray = CALLOC (1, sizeof (struct map_xlator_array) * count); + priv->child_count = count; + + /* build xlator array */ + count = 0; + trav = this->children; + while (trav) { + priv->xlarray[count++].xl = trav->xlator; + trav = trav->next; + } + + /* map dir1:brick1;dir2:brick2;dir3:brick3;*:brick4 */ + ret = dict_get_str (this->options, "map-directory", &pattern_string); + if (ret != 0) { + gf_log (this->name, GF_LOG_ERROR, + "map.pattern not given, can't continue"); + goto err; + } + map_pair_str = strtok_r (pattern_string, ";", &tmp_str); + while (map_pair_str) { + dup_map_pair = strdup (map_pair_str); + dir_str = strtok_r (dup_map_pair, ":", &tmp_str1); + if (!dir_str) { + gf_log (this->name, GF_LOG_ERROR, + "directory string invalid"); + goto err; + } + subvol_str = strtok_r (NULL, ":", &tmp_str1); + if (!subvol_str) { + gf_log (this->name, GF_LOG_ERROR, + "mapping subvolume string invalid"); + goto err; + } + ret = verify_dir_and_assign_subvol (this, + dir_str, + subvol_str); + if (ret != 0) { + gf_log (this->name, GF_LOG_ERROR, + "verification failed"); + goto err; + } + + FREE (dup_map_pair); + + map_pair_str = strtok_r (NULL, ";", &tmp_str); + } + + /* default-volume brick4 */ + ret = dict_get_str (this->options, "default-volume", &default_xl); + if (ret == 0) { + ret = assign_default_subvol (this, default_xl); + if (ret != 0) { + gf_log (this->name, GF_LOG_ERROR, + "assigning default failed"); + goto err; + } + } + + verify_if_all_subvolumes_got_used (this); + + return 0; + err: + fini (this); + return -1; +} + + +struct xlator_fops fops = { + .lookup = map_lookup, + .mknod = map_mknod, + .create = map_create, + + .stat = map_stat, + .chmod = map_chmod, + .chown = map_chown, + .fchown = map_fchown, + .fchmod = map_fchmod, + .fstat = map_fstat, + 
.utimens = map_utimens, + .truncate = map_truncate, + .ftruncate = map_ftruncate, + .access = map_access, + .readlink = map_readlink, + .setxattr = map_setxattr, + .getxattr = map_getxattr, + .removexattr = map_removexattr, + .open = map_open, + .readv = map_readv, + .writev = map_writev, + .flush = map_flush, + .fsync = map_fsync, + .statfs = map_statfs, + .lk = map_lk, + .opendir = map_opendir, + .readdir = map_readdir, + .fsyncdir = map_fsyncdir, + .symlink = map_symlink, + .unlink = map_unlink, + .link = map_link, + .mkdir = map_mkdir, + .rmdir = map_rmdir, + .rename = map_rename, + .inodelk = map_inodelk, + .finodelk = map_finodelk, + .entrylk = map_entrylk, + .fentrylk = map_fentrylk, + .xattrop = map_xattrop, + .fxattrop = map_fxattrop, + .setdents = map_setdents, + .getdents = map_getdents, + .checksum = map_checksum, +}; + +struct xlator_mops mops = { +}; + +struct xlator_cbks cbks = { +}; + +struct volume_options options[] = { + { .key = {"map-directory"}, + .type = GF_OPTION_TYPE_ANY + }, + { .key = {"default-volume"}, + .type = GF_OPTION_TYPE_XLATOR + }, + + { .key = {NULL} } +}; diff --git a/xlators/cluster/map/src/map.h b/xlators/cluster/map/src/map.h new file mode 100644 index 000000000..0f1aabfd6 --- /dev/null +++ b/xlators/cluster/map/src/map.h @@ -0,0 +1,76 @@ +/* + Copyright (c) 2008 Z RESEARCH, Inc. <http://www.zresearch.com> + This file is part of GlusterFS. + + GlusterFS is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published + by the Free Software Foundation; either version 3 of the License, + or (at your option) any later version. + + GlusterFS is distributed in the hope that it will be useful, but + WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program. 
If not, see + <http://www.gnu.org/licenses/>. +*/ + +#ifndef __MAP_H__ +#define __MAP_H__ + +#include "xlator.h" + +struct map_pattern { + struct map_pattern *next; + xlator_t *xl; + char *directory; + int dir_len; +}; + +struct map_xlator_array { + xlator_t *xl; + int mapped; /* yes/no */ +}; + +typedef struct { + struct map_pattern *map; + xlator_t *default_xl; + struct map_xlator_array *xlarray; + int child_count; +} map_private_t; + +typedef struct { + int32_t op_ret; + int32_t op_errno; + int call_count; + struct statvfs statvfs; + struct stat stbuf; + inode_t *inode; + dict_t *dict; + fd_t *fd; + + size_t size; +} map_local_t; + +xlator_t *map_subvol_next (xlator_t *this, xlator_t *prev); +int map_subvol_cnt (xlator_t *this, xlator_t *subvol); + +int map_itransform (xlator_t *this, xlator_t *subvol, + uint64_t x, uint64_t *y_p); +int map_deitransform (xlator_t *this, uint64_t y, + xlator_t **subvol_p, uint64_t *x_p); + + +xlator_t *get_mapping_subvol_from_path (xlator_t *this, const char *path); +xlator_t *get_mapping_subvol_from_ctx (xlator_t *this, inode_t *inode); + +int check_multiple_volume_entry (xlator_t *this, xlator_t *subvol); +int verify_dir_and_assign_subvol (xlator_t *this, + const char *directory, const char *subvol); +int assign_default_subvol (xlator_t *this, const char *default_xl); +void verify_if_all_subvolumes_got_used (xlator_t *this); + + +#endif /* __MAP_H__ */ diff --git a/xlators/cluster/stripe/Makefile.am b/xlators/cluster/stripe/Makefile.am new file mode 100644 index 000000000..d471a3f92 --- /dev/null +++ b/xlators/cluster/stripe/Makefile.am @@ -0,0 +1,3 @@ +SUBDIRS = src + +CLEANFILES = diff --git a/xlators/cluster/stripe/src/Makefile.am b/xlators/cluster/stripe/src/Makefile.am new file mode 100644 index 000000000..60e0a1568 --- /dev/null +++ b/xlators/cluster/stripe/src/Makefile.am @@ -0,0 +1,14 @@ + +xlator_LTLIBRARIES = stripe.la +xlatordir = $(libdir)/glusterfs/$(PACKAGE_VERSION)/xlator/cluster + +stripe_la_LDFLAGS = -module 
-avoidversion + +stripe_la_SOURCES = stripe.c +stripe_la_LIBADD = $(top_builddir)/libglusterfs/src/libglusterfs.la + +AM_CFLAGS = -fPIC -D_FILE_OFFSET_BITS=64 -D_GNU_SOURCE -Wall -D$(GF_HOST_OS)\ + -I$(top_srcdir)/libglusterfs/src -shared -nostartfiles $(GF_CFLAGS) + +CLEANFILES = + diff --git a/xlators/cluster/stripe/src/stripe.c b/xlators/cluster/stripe/src/stripe.c new file mode 100644 index 000000000..83787ca2a --- /dev/null +++ b/xlators/cluster/stripe/src/stripe.c @@ -0,0 +1,3286 @@ +/* + Copyright (c) 2007, 2008, 2009 Z RESEARCH, Inc. <http://www.zresearch.com> + This file is part of GlusterFS. + + GlusterFS is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published + by the Free Software Foundation; either version 3 of the License, + or (at your option) any later version. + + GlusterFS is distributed in the hope that it will be useful, but + WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program. If not, see + <http://www.gnu.org/licenses/>. +*/ + +/** + * xlators/cluster/stripe: + * Stripe translator, stripes the data accross its child nodes, + * as per the options given in the volfile. The striping works + * fairly simple. It writes files at different offset as per + * calculation. So, 'ls -l' output at the real posix level will + * show file size bigger than the actual size. But when one does + * 'df' or 'du <file>', real size of the file on the server is shown. + * + * WARNING: + * Stripe translator can't regenerate data if a child node gets disconnected. + * So, no 'self-heal' for stripe. Hence the advice, use stripe only when its + * very much necessary, or else, use it in combination with AFR, to have a + * backup copy. 
+ */ + +#ifndef _CONFIG_H +#define _CONFIG_H +#include "config.h" +#endif + +#include "xlator.h" +#include "logging.h" +#include "defaults.h" +#include "compat.h" +#include "compat-errno.h" +#include <fnmatch.h> +#include <signal.h> + +#define STRIPE_CHECK_INODE_CTX_AND_UNWIND_ON_ERR(_loc) do { \ + if (!(_loc && _loc->inode)) { \ + STACK_UNWIND (frame, -1, EINVAL, NULL, NULL, NULL); \ + return 0; \ + } \ +} while(0) + +/** + * struct stripe_options : This keeps the pattern and the block-size + * information, which is used for striping on a file. + */ +struct stripe_options { + struct stripe_options *next; + char path_pattern[256]; + uint64_t block_size; +}; + +/** + * Private structure for stripe translator + */ +struct stripe_private { + struct stripe_options *pattern; + xlator_t **xl_array; + uint64_t block_size; + gf_lock_t lock; + uint8_t nodes_down; + int8_t first_child_down; + int8_t child_count; + int8_t state[256]; /* Current state of the child node, + 0 for down, 1 for up */ + gf_boolean_t xattr_supported; /* 0 for no, 1 for yes, default yes */ +}; + +/** + * Used to keep info about the replies received from fops->readv calls + */ +struct readv_replies { + struct iovec *vector; + int32_t count; //count of vector + int32_t op_ret; //op_ret of readv + int32_t op_errno; + struct stat stbuf; /* 'stbuf' is also a part of reply */ +}; + +/** + * Local structure to be passed with all the frames in case of STACK_WIND + */ +struct stripe_local; /* this itself is used inside the structure; */ + +struct stripe_local { + struct stripe_local *next; + call_frame_t *orig_frame; + + /* Used by _cbk functions */ + struct stat stbuf; + struct readv_replies *replies; + struct statvfs statvfs_buf; + dir_entry_t *entry; + struct xlator_stats stats; + + int8_t revalidate; + int8_t failed; + int8_t unwind; + + int32_t node_index; + int32_t call_count; + int32_t wind_count; /* used instead of child_cound + in case of read and write */ + int32_t op_ret; + int32_t op_errno; + 
int32_t count; + int32_t flags; + char *name; + inode_t *inode; + + loc_t loc; + loc_t loc2; + + /* For File I/O fops */ + dict_t *dict; + + /* General usage */ + off_t offset; + off_t stripe_size; + + int8_t *list; + struct flock lock; + fd_t *fd; + void *value; +}; + +typedef struct stripe_local stripe_local_t; +typedef struct stripe_private stripe_private_t; + +/** + * stripe_get_matching_bs - Get the matching block size for the given path. + */ +int32_t +stripe_get_matching_bs (const char *path, + struct stripe_options *opts, + uint64_t default_bs) +{ + struct stripe_options *trav = NULL; + char *pathname = NULL; + uint64_t block_size = 0; + + block_size = default_bs; + pathname = strdup (path); + trav = opts; + + while (trav) { + if (fnmatch (trav->path_pattern, + pathname, FNM_NOESCAPE) == 0) { + block_size = trav->block_size; + break; + } + trav = trav->next; + } + free (pathname); + + return block_size; +} + + +/* + * stripe_common_cbk - + */ +int32_t +stripe_common_cbk (call_frame_t *frame, + void *cookie, + xlator_t *this, + int32_t op_ret, + int32_t op_errno) +{ + STACK_UNWIND (frame, op_ret, op_errno); + return 0; +} + +/** + * stripe_stack_unwind_cbk - This function is used for all the _cbk without + * any extra arguments (other than the minimum given) + * This is called from functions like fsync,unlink,rmdir etc. 
+ * + */ +int32_t +stripe_stack_unwind_cbk (call_frame_t *frame, + void *cookie, + xlator_t *this, + int32_t op_ret, + int32_t op_errno) +{ + int32_t callcnt = 0; + stripe_local_t *local = frame->local; + + LOCK (&frame->lock); + { + callcnt = --local->call_count; + + if (op_ret == -1) { + gf_log (this->name, GF_LOG_WARNING, + "%s returned %s", + ((call_frame_t *)cookie)->this->name, + strerror (op_errno)); + local->op_errno = op_errno; + if (op_errno == ENOTCONN) + local->failed = 1; + } + if (op_ret >= 0) + local->op_ret = op_ret; + } + UNLOCK (&frame->lock); + + if (!callcnt) { + if (local->failed) + local->op_ret = -1; + + if (local->loc.path) + loc_wipe (&local->loc); + if (local->loc2.path) + loc_wipe (&local->loc2); + + STACK_UNWIND (frame, local->op_ret, local->op_errno); + } + return 0; +} + +int32_t +stripe_common_buf_cbk (call_frame_t *frame, + void *cookie, + xlator_t *this, + int32_t op_ret, + int32_t op_errno, + struct stat *buf) +{ + STACK_UNWIND (frame, op_ret, op_errno, buf); + return 0; +} + +/** + * stripe_stack_unwind_buf_cbk - This function is used for all the _cbk with + * 'struct stat *buf' as extra argument (other than minimum) + * This is called from functions like, chmod, fchmod, chown, fchown, + * truncate, ftruncate, utimens etc. 
+ * + * @cookie - this argument should be always 'xlator_t *' of child node + */ +int32_t +stripe_stack_unwind_buf_cbk (call_frame_t *frame, + void *cookie, + xlator_t *this, + int32_t op_ret, + int32_t op_errno, + struct stat *buf) +{ + int32_t callcnt = 0; + stripe_local_t *local = frame->local; + + LOCK (&frame->lock); + { + callcnt = --local->call_count; + + if (op_ret == -1) { + gf_log (this->name, GF_LOG_WARNING, + "%s returned error %s", + ((call_frame_t *)cookie)->this->name, + strerror (op_errno)); + local->op_errno = op_errno; + if (op_errno == ENOTCONN) + local->failed = 1; + } + + if (op_ret == 0) { + local->op_ret = 0; + if (local->stbuf.st_blksize == 0) { + local->stbuf = *buf; + /* Because st_blocks gets added again */ + local->stbuf.st_blocks = 0; + } + + if (FIRST_CHILD(this) == + ((call_frame_t *)cookie)->this) { + /* Always, pass the inode number of + first child to the above layer */ + local->stbuf.st_ino = buf->st_ino; + local->stbuf.st_mtime = buf->st_mtime; + } + + local->stbuf.st_blocks += buf->st_blocks; + if (local->stbuf.st_size < buf->st_size) + local->stbuf.st_size = buf->st_size; + if (local->stbuf.st_blksize != buf->st_blksize) { + /* TODO: add to blocks in terms of + original block size */ + } + } + } + UNLOCK (&frame->lock); + + if (!callcnt) { + if (local->failed) + local->op_ret = -1; + + if (local->loc.path) + loc_wipe (&local->loc); + if (local->loc2.path) + loc_wipe (&local->loc2); + + STACK_UNWIND (frame, local->op_ret, local->op_errno, + &local->stbuf); + } + + return 0; +} + +/* In case of symlink, mknod, the file is created on just first node */ +int32_t +stripe_common_inode_cbk (call_frame_t *frame, + void *cookie, + xlator_t *this, + int32_t op_ret, + int32_t op_errno, + inode_t *inode, + struct stat *buf) +{ + STACK_UNWIND (frame, op_ret, op_errno, inode, buf); + return 0; +} + +/** + * stripe_stack_unwind_inode_cbk - This is called by the function like, + * link (), symlink (), mkdir (), mknod () + * This creates a 
inode for new inode. It keeps a list of all + * the inodes received from the child nodes. It is used while + * forwarding any fops to child nodes. + * + */ +int32_t +stripe_stack_unwind_inode_cbk (call_frame_t *frame, + void *cookie, + xlator_t *this, + int32_t op_ret, + int32_t op_errno, + inode_t *inode, + struct stat *buf) +{ + int32_t callcnt = 0; + stripe_local_t *local = frame->local; + + LOCK (&frame->lock); + { + callcnt = --local->call_count; + + if (op_ret == -1) { + gf_log (this->name, GF_LOG_WARNING, + "%s returned error %s", + ((call_frame_t *)cookie)->this->name, + strerror (op_errno)); + local->op_errno = op_errno; + if (op_errno == ENOTCONN) + local->failed = 1; + } + + if (op_ret >= 0) { + local->op_ret = 0; + + if (local->stbuf.st_blksize == 0) { + local->inode = inode; + local->stbuf = *buf; + /* Because st_blocks gets added again */ + local->stbuf.st_blocks = 0; + } + if (FIRST_CHILD(this) == + ((call_frame_t *)cookie)->this) { + local->stbuf.st_ino = buf->st_ino; + local->stbuf.st_mtime = buf->st_mtime; + } + + local->stbuf.st_blocks += buf->st_blocks; + if (local->stbuf.st_size < buf->st_size) + local->stbuf.st_size = buf->st_size; + if (local->stbuf.st_blksize != buf->st_blksize) { + /* TODO: add to blocks in terms of + original block size */ + } + } + } + UNLOCK (&frame->lock); + + if (!callcnt) { + if (local->failed) + local->op_ret = -1; + + STACK_UNWIND (frame, local->op_ret, local->op_errno, + local->inode, &local->stbuf); + } + + return 0; +} + +int32_t +stripe_stack_unwind_inode_lookup_cbk (call_frame_t *frame, + void *cookie, + xlator_t *this, + int32_t op_ret, + int32_t op_errno, + inode_t *inode, + struct stat *buf, + dict_t *dict) +{ + int32_t callcnt = 0; + dict_t *tmp_dict = NULL; + stripe_local_t *local = frame->local; + + LOCK (&frame->lock); + { + callcnt = --local->call_count; + + if (op_ret == -1) { + if (op_errno != ENOENT) + gf_log (this->name, GF_LOG_WARNING, + "%s returned error %s", + ((call_frame_t 
*)cookie)->this->name, + strerror (op_errno)); + local->op_errno = op_errno; + if (op_errno == ENOTCONN) + local->failed = 1; + } + + if (op_ret >= 0) { + local->op_ret = 0; + + if (local->stbuf.st_blksize == 0) { + local->inode = inode; + local->stbuf = *buf; + /* Because st_blocks gets added again */ + local->stbuf.st_blocks = 0; + } + if (FIRST_CHILD(this) == + ((call_frame_t *)cookie)->this) { + local->stbuf.st_ino = buf->st_ino; + local->stbuf.st_mtime = buf->st_mtime; + if (local->dict) + dict_unref (local->dict); + local->dict = dict_ref (dict); + } else { + if (!local->dict) + local->dict = dict_ref (dict); + } + local->stbuf.st_blocks += buf->st_blocks; + if (local->stbuf.st_size < buf->st_size) + local->stbuf.st_size = buf->st_size; + if (local->stbuf.st_blksize != buf->st_blksize) { + /* TODO: add to blocks in terms of + original block size */ + } + } + } + UNLOCK (&frame->lock); + + if (!callcnt) { + if (local->failed) + local->op_ret = -1; + + tmp_dict = local->dict; + STACK_UNWIND (frame, local->op_ret, local->op_errno, + local->inode, &local->stbuf, local->dict); + if (tmp_dict) + dict_unref (tmp_dict); + } + + return 0; +} + + +/** + * stripe_lookup - + */ +int32_t +stripe_lookup (call_frame_t *frame, + xlator_t *this, + loc_t *loc, + dict_t *xattr_req) +{ + stripe_local_t *local = NULL; + xlator_list_t *trav = NULL; + stripe_private_t *priv = this->private; + char send_lookup_to_all = 0; + + if (!(loc && loc->inode)) { + gf_log (this->name, GF_LOG_ERROR, + "wrong argument, returning EINVAL"); + STACK_UNWIND (frame, -1, EINVAL, NULL, NULL, NULL); + return 0; + } + + /* Initialization */ + local = CALLOC (1, sizeof (stripe_local_t)); + ERR_ABORT (local); + local->op_ret = -1; + frame->local = local; + + if ((!loc->inode->st_mode) || + S_ISDIR (loc->inode->st_mode) || + S_ISREG (loc->inode->st_mode)) + send_lookup_to_all = 1; + + if (send_lookup_to_all) { + /* Everytime in stripe lookup, all child nodes + should be looked up */ + local->call_count = 
priv->child_count; + trav = this->children; + while (trav) { + STACK_WIND (frame, + stripe_stack_unwind_inode_lookup_cbk, + trav->xlator, + trav->xlator->fops->lookup, + loc, xattr_req); + trav = trav->next; + } + } else { + local->call_count = 1; + + STACK_WIND (frame, + stripe_stack_unwind_inode_lookup_cbk, + FIRST_CHILD(this), + FIRST_CHILD(this)->fops->lookup, + loc, xattr_req); + } + + return 0; +} + +/** + * stripe_stat - + */ +int32_t +stripe_stat (call_frame_t *frame, + xlator_t *this, + loc_t *loc) +{ + int send_lookup_to_all = 0; + xlator_list_t *trav = NULL; + stripe_local_t *local = NULL; + stripe_private_t *priv = this->private; + + STRIPE_CHECK_INODE_CTX_AND_UNWIND_ON_ERR (loc); + + if (S_ISDIR (loc->inode->st_mode) || S_ISREG (loc->inode->st_mode)) + send_lookup_to_all = 1; + + if (!send_lookup_to_all) { + STACK_WIND (frame, + stripe_common_buf_cbk, + FIRST_CHILD(this), + FIRST_CHILD(this)->fops->stat, + loc); + } else { + /* Initialization */ + local = CALLOC (1, sizeof (stripe_local_t)); + ERR_ABORT (local); + local->op_ret = -1; + frame->local = local; + local->inode = loc->inode; + local->call_count = priv->child_count; + + trav = this->children; + while (trav) { + STACK_WIND (frame, + stripe_stack_unwind_buf_cbk, + trav->xlator, + trav->xlator->fops->stat, + loc); + trav = trav->next; + } + } + return 0; +} + + +/** + * stripe_chmod - + */ +int32_t +stripe_chmod (call_frame_t *frame, + xlator_t *this, + loc_t *loc, + mode_t mode) +{ + int send_fop_to_all = 0; + xlator_list_t *trav = NULL; + stripe_local_t *local = NULL; + stripe_private_t *priv = this->private; + + STRIPE_CHECK_INODE_CTX_AND_UNWIND_ON_ERR (loc); + + if (priv->first_child_down) { + gf_log (this->name, GF_LOG_WARNING, + "First node down, returning ENOTCONN"); + STACK_UNWIND (frame, -1, ENOTCONN, NULL); + return 0; + } + + if (S_ISDIR (loc->inode->st_mode) || S_ISREG (loc->inode->st_mode)) + send_fop_to_all = 1; + + if (!send_fop_to_all) { + STACK_WIND (frame, + 
stripe_common_buf_cbk, + FIRST_CHILD(this), + FIRST_CHILD(this)->fops->chmod, + loc, mode); + } else { + /* Initialization */ + local = CALLOC (1, sizeof (stripe_local_t)); + ERR_ABORT (local); + local->op_ret = -1; + frame->local = local; + local->inode = loc->inode; + local->call_count = priv->child_count; + + trav = this->children; + while (trav) { + STACK_WIND (frame, + stripe_stack_unwind_buf_cbk, + trav->xlator, + trav->xlator->fops->chmod, + loc, mode); + trav = trav->next; + } + } + return 0; +} + + +/** + * stripe_chown - + */ +int32_t +stripe_chown (call_frame_t *frame, + xlator_t *this, + loc_t *loc, + uid_t uid, + gid_t gid) +{ + int send_fop_to_all = 0; + xlator_list_t *trav = NULL; + stripe_local_t *local = NULL; + stripe_private_t *priv = this->private; + + STRIPE_CHECK_INODE_CTX_AND_UNWIND_ON_ERR (loc); + + if (priv->first_child_down) { + gf_log (this->name, GF_LOG_WARNING, + "First node down, returning ENOTCONN"); + STACK_UNWIND (frame, -1, ENOTCONN, NULL); + return 0; + } + + if (S_ISDIR (loc->inode->st_mode) || S_ISREG (loc->inode->st_mode)) + send_fop_to_all = 1; + + trav = this->children; + if (!send_fop_to_all) { + STACK_WIND (frame, + stripe_common_buf_cbk, + trav->xlator, + trav->xlator->fops->chown, + loc, uid, gid); + } else { + /* Initialization */ + local = CALLOC (1, sizeof (stripe_local_t)); + ERR_ABORT (local); + local->op_ret = -1; + frame->local = local; + local->inode = loc->inode; + local->call_count = priv->child_count; + + trav = this->children; + while (trav) { + STACK_WIND (frame, + stripe_stack_unwind_buf_cbk, + trav->xlator, + trav->xlator->fops->chown, + loc, uid, gid); + trav = trav->next; + } + } + + return 0; +} + + +/** + * stripe_statfs_cbk - + */ +int32_t +stripe_statfs_cbk (call_frame_t *frame, + void *cookie, + xlator_t *this, + int32_t op_ret, + int32_t op_errno, + struct statvfs *stbuf) +{ + stripe_local_t *local = (stripe_local_t *)frame->local; + int32_t callcnt; + LOCK(&frame->lock); + { + callcnt = 
--local->call_count; + + if (op_ret != 0 && op_errno != ENOTCONN) { + local->op_errno = op_errno; + } + if (op_ret == 0) { + struct statvfs *dict_buf = &local->statvfs_buf; + dict_buf->f_bsize = stbuf->f_bsize; + dict_buf->f_frsize = stbuf->f_frsize; + dict_buf->f_blocks += stbuf->f_blocks; + dict_buf->f_bfree += stbuf->f_bfree; + dict_buf->f_bavail += stbuf->f_bavail; + dict_buf->f_files += stbuf->f_files; + dict_buf->f_ffree += stbuf->f_ffree; + dict_buf->f_favail += stbuf->f_favail; + dict_buf->f_fsid = stbuf->f_fsid; + dict_buf->f_flag = stbuf->f_flag; + dict_buf->f_namemax = stbuf->f_namemax; + local->op_ret = 0; + } + } + UNLOCK (&frame->lock); + + if (!callcnt) { + STACK_UNWIND (frame, local->op_ret, + local->op_errno, &local->statvfs_buf); + } + + return 0; +} + + +/** + * stripe_statfs - + */ +int32_t +stripe_statfs (call_frame_t *frame, + xlator_t *this, + loc_t *loc) +{ + stripe_local_t *local = NULL; + xlator_list_t *trav = this->children; + + /* Initialization */ + local = CALLOC (1, sizeof (stripe_local_t)); + ERR_ABORT (local); + local->op_ret = -1; + local->op_errno = ENOTCONN; + frame->local = local; + + local->call_count = ((stripe_private_t *)this->private)->child_count; + while (trav) { + STACK_WIND (frame, + stripe_statfs_cbk, + trav->xlator, + trav->xlator->fops->statfs, + loc); + trav = trav->next; + } + + return 0; +} + + +/** + * stripe_truncate - + */ +int32_t +stripe_truncate (call_frame_t *frame, + xlator_t *this, + loc_t *loc, + off_t offset) +{ + int send_fop_to_all = 0; + stripe_local_t *local = NULL; + stripe_private_t *priv = this->private; + xlator_list_t *trav = this->children; + + STRIPE_CHECK_INODE_CTX_AND_UNWIND_ON_ERR (loc); + + if (priv->first_child_down) { + gf_log (this->name, GF_LOG_WARNING, + "First node down, returning ENOTCONN"); + STACK_UNWIND (frame, -1, ENOTCONN, NULL); + return 0; + } + + if (S_ISDIR (loc->inode->st_mode) || S_ISREG (loc->inode->st_mode)) + send_fop_to_all = 1; + + if (!send_fop_to_all) { + 
STACK_WIND (frame, + stripe_common_buf_cbk, + trav->xlator, + trav->xlator->fops->truncate, + loc, + offset); + } else { + /* Initialization */ + local = CALLOC (1, sizeof (stripe_local_t)); + ERR_ABORT (local); + local->op_ret = -1; + frame->local = local; + local->inode = loc->inode; + local->call_count = priv->child_count; + + while (trav) { + STACK_WIND (frame, + stripe_stack_unwind_buf_cbk, + trav->xlator, + trav->xlator->fops->truncate, + loc, + offset); + trav = trav->next; + } + } + + return 0; +} + + +/** + * stripe_utimens - + */ +int32_t +stripe_utimens (call_frame_t *frame, + xlator_t *this, + loc_t *loc, + struct timespec tv[2]) +{ + int send_fop_to_all = 0; + stripe_local_t *local = NULL; + stripe_private_t *priv = this->private; + xlator_list_t *trav = this->children; + + STRIPE_CHECK_INODE_CTX_AND_UNWIND_ON_ERR (loc); + + if (priv->first_child_down) { + gf_log (this->name, GF_LOG_WARNING, + "First node down, returning ENOTCONN"); + STACK_UNWIND (frame, -1, ENOTCONN, NULL); + return 0; + } + + if (S_ISDIR (loc->inode->st_mode) || S_ISREG (loc->inode->st_mode)) + send_fop_to_all = 1; + + if (!send_fop_to_all) { + STACK_WIND (frame, + stripe_common_buf_cbk, + trav->xlator, + trav->xlator->fops->utimens, + loc, tv); + } else { + /* Initialization */ + local = CALLOC (1, sizeof (stripe_local_t)); + ERR_ABORT (local); + local->op_ret = -1; + frame->local = local; + local->inode = loc->inode; + local->call_count = priv->child_count; + + while (trav) { + STACK_WIND (frame, + stripe_stack_unwind_buf_cbk, + trav->xlator, + trav->xlator->fops->utimens, + loc, tv); + trav = trav->next; + } + } + return 0; +} + + +int32_t +stripe_first_rename_cbk (call_frame_t *frame, + void *cookie, + xlator_t *this, + int32_t op_ret, + int32_t op_errno, + struct stat *buf) +{ + stripe_local_t *local = frame->local; + xlator_list_t *trav = this->children; + + if (op_ret == -1) + { + STACK_UNWIND (frame, op_ret, op_errno, buf); + return 0; + } + + local->op_ret = 0; + 
local->stbuf = *buf; + local->call_count--; + trav = trav->next; /* Skip first child */ + + while (trav) { + STACK_WIND (frame, + stripe_stack_unwind_buf_cbk, + trav->xlator, + trav->xlator->fops->rename, + &local->loc, &local->loc2); + trav = trav->next; + } + + return 0; +} +/** + * stripe_rename - + */ +int32_t +stripe_rename (call_frame_t *frame, + xlator_t *this, + loc_t *oldloc, + loc_t *newloc) +{ + stripe_private_t *priv = this->private; + stripe_local_t *local = NULL; + xlator_list_t *trav = this->children; + + STRIPE_CHECK_INODE_CTX_AND_UNWIND_ON_ERR (oldloc); + + if (priv->first_child_down) { + gf_log (this->name, GF_LOG_WARNING, + "First node down, returning ENOTCONN"); + STACK_UNWIND (frame, -1, EIO, NULL); + return 0; + } + + /* Initialization */ + local = CALLOC (1, sizeof (stripe_local_t)); + ERR_ABORT (local); + local->op_ret = -1; + local->inode = oldloc->inode; + loc_copy (&local->loc, oldloc); + loc_copy (&local->loc2, newloc); + + local->call_count = priv->child_count; + + frame->local = local; + + STACK_WIND (frame, + stripe_first_rename_cbk, + trav->xlator, + trav->xlator->fops->rename, + oldloc, newloc); + + return 0; +} + + +/** + * stripe_access - + */ +int32_t +stripe_access (call_frame_t *frame, + xlator_t *this, + loc_t *loc, + int32_t mask) +{ + STRIPE_CHECK_INODE_CTX_AND_UNWIND_ON_ERR (loc); + + STACK_WIND (frame, + stripe_common_cbk, + FIRST_CHILD(this), + FIRST_CHILD(this)->fops->access, + loc, mask); + + return 0; +} + + +/** + * stripe_readlink_cbk - + */ +int32_t +stripe_readlink_cbk (call_frame_t *frame, + void *cookie, + xlator_t *this, + int32_t op_ret, + int32_t op_errno, + const char *path) +{ + STACK_UNWIND (frame, op_ret, op_errno, path); + + return 0; +} + + +/** + * stripe_readlink - + */ +int32_t +stripe_readlink (call_frame_t *frame, + xlator_t *this, + loc_t *loc, + size_t size) +{ + stripe_private_t *priv = this->private; + + STRIPE_CHECK_INODE_CTX_AND_UNWIND_ON_ERR (loc); + + if (priv->first_child_down) { + gf_log 
(this->name, GF_LOG_WARNING, + "First node down, returning ENOTCONN"); + STACK_UNWIND (frame, -1, ENOTCONN, NULL); + return 0; + } + + STACK_WIND (frame, + stripe_readlink_cbk, + FIRST_CHILD(this), + FIRST_CHILD(this)->fops->readlink, + loc, size); + + return 0; +} + + +/** + * stripe_unlink - + */ +int32_t +stripe_unlink (call_frame_t *frame, + xlator_t *this, + loc_t *loc) +{ + int send_fop_to_all = 0; + stripe_local_t *local = NULL; + stripe_private_t *priv = this->private; + xlator_list_t *trav = this->children; + + STRIPE_CHECK_INODE_CTX_AND_UNWIND_ON_ERR (loc); + + if (priv->first_child_down) { + gf_log (this->name, GF_LOG_WARNING, + "First node down, returning EIO"); + STACK_UNWIND (frame, -1, EIO); + return 0; + } + + if (S_ISDIR (loc->inode->st_mode) || S_ISREG (loc->inode->st_mode)) + send_fop_to_all = 1; + + if (!send_fop_to_all) { + STACK_WIND (frame, + stripe_common_cbk, + trav->xlator, + trav->xlator->fops->unlink, + loc); + } else { + /* Initialization */ + local = CALLOC (1, sizeof (stripe_local_t)); + ERR_ABORT (local); + local->op_ret = -1; + frame->local = local; + local->call_count = priv->child_count; + + while (trav) { + STACK_WIND (frame, + stripe_stack_unwind_cbk, + trav->xlator, + trav->xlator->fops->unlink, + loc); + trav = trav->next; + } + } + + return 0; +} + + +int32_t +stripe_first_rmdir_cbk (call_frame_t *frame, + void *cookie, + xlator_t *this, + int32_t op_ret, + int32_t op_errno) +{ + xlator_list_t *trav = this->children; + stripe_local_t *local = frame->local; + + if (op_ret == -1) + { + STACK_UNWIND (frame, op_ret, op_errno); + return 0; + } + + local->call_count--; /* First child successful */ + trav = trav->next; /* Skip first child */ + + while (trav) { + STACK_WIND (frame, + stripe_stack_unwind_cbk, + trav->xlator, + trav->xlator->fops->rmdir, + &local->loc); + trav = trav->next; + } + + return 0; +} + +/** + * stripe_rmdir - + */ +int32_t +stripe_rmdir (call_frame_t *frame, + xlator_t *this, + loc_t *loc) +{ + 
stripe_local_t *local = NULL; + xlator_list_t *trav = this->children; + stripe_private_t *priv = this->private; + + STRIPE_CHECK_INODE_CTX_AND_UNWIND_ON_ERR (loc); + + if (priv->first_child_down) { + gf_log (this->name, GF_LOG_WARNING, + "First node down, returning EIO"); + STACK_UNWIND (frame, -1, EIO); + return 0; + } + + /* Initialization */ + local = CALLOC (1, sizeof (stripe_local_t)); + ERR_ABORT (local); + local->op_ret = -1; + frame->local = local; + local->inode = loc->inode; + loc_copy (&local->loc, loc); + local->call_count = priv->child_count; + + STACK_WIND (frame, + stripe_first_rmdir_cbk, + trav->xlator, + trav->xlator->fops->rmdir, + loc); + + return 0; +} + + +/** + * stripe_setxattr - + */ +int32_t +stripe_setxattr (call_frame_t *frame, + xlator_t *this, + loc_t *loc, + dict_t *dict, + int32_t flags) +{ + stripe_private_t *priv = this->private; + + STRIPE_CHECK_INODE_CTX_AND_UNWIND_ON_ERR (loc); + + if (priv->first_child_down) { + gf_log (this->name, GF_LOG_WARNING, + "First node down, returning ENOTCONN"); + STACK_UNWIND (frame, -1, ENOTCONN); + return 0; + } + + STACK_WIND (frame, + stripe_common_cbk, + FIRST_CHILD(this), + FIRST_CHILD(this)->fops->setxattr, + loc, dict, flags); + + return 0; +} + + +int32_t +stripe_mknod_ifreg_fail_unlink_cbk (call_frame_t *frame, + void *cookie, + xlator_t *this, + int32_t op_ret, + int32_t op_errno) +{ + int32_t callcnt = 0; + stripe_local_t *local = frame->local; + + LOCK (&frame->lock); + { + callcnt = --local->call_count; + } + UNLOCK (&frame->lock); + + if (!callcnt) { + loc_wipe (&local->loc); + STACK_UNWIND (frame, local->op_ret, local->op_errno, + local->inode, &local->stbuf); + } + + return 0; +} + + +/** + */ +int32_t +stripe_mknod_ifreg_setxattr_cbk (call_frame_t *frame, + void *cookie, + xlator_t *this, + int32_t op_ret, + int32_t op_errno) +{ + int32_t callcnt = 0; + stripe_local_t *local = frame->local; + stripe_private_t *priv = this->private; + xlator_list_t *trav = this->children; + + LOCK 
(&frame->lock); + { + callcnt = --local->call_count; + + if (op_ret == -1) { + gf_log (this->name, GF_LOG_WARNING, + "%s returned error %s", + ((call_frame_t *)cookie)->this->name, + strerror (op_errno)); + local->op_ret = -1; + local->op_errno = op_errno; + } + } + UNLOCK (&frame->lock); + + if (!callcnt) { + if (local->op_ret == -1) { + local->call_count = priv->child_count; + while (trav) { + STACK_WIND (frame, + stripe_mknod_ifreg_fail_unlink_cbk, + trav->xlator, + trav->xlator->fops->unlink, + &local->loc); + trav = trav->next; + } + return 0; + } + + loc_wipe (&local->loc); + STACK_UNWIND (frame, local->op_ret, local->op_errno, + local->inode, &local->stbuf); + } + return 0; +} + +/** + */ +int32_t +stripe_mknod_ifreg_cbk (call_frame_t *frame, + void *cookie, + xlator_t *this, + int32_t op_ret, + int32_t op_errno, + inode_t *inode, + struct stat *buf) +{ + int ret = 0; + int32_t callcnt = 0; + stripe_local_t *local = frame->local; + stripe_private_t *priv = this->private; + + LOCK (&frame->lock); + { + callcnt = --local->call_count; + + if (op_ret == -1) { + gf_log (this->name, GF_LOG_WARNING, + "%s returned error %s", + ((call_frame_t *)cookie)->this->name, + strerror (op_errno)); + local->failed = 1; + local->op_errno = op_errno; + } + + if (op_ret >= 0) { + local->op_ret = op_ret; + /* Get the mapping in inode private */ + /* Get the stat buf right */ + if (local->stbuf.st_blksize == 0) { + local->stbuf = *buf; + /* Because st_blocks gets added again */ + local->stbuf.st_blocks = 0; + } + + /* Always, pass the inode number of first child + to the above layer */ + if (FIRST_CHILD(this) == + ((call_frame_t *)cookie)->this) + local->stbuf.st_ino = buf->st_ino; + + local->stbuf.st_blocks += buf->st_blocks; + if (local->stbuf.st_size < buf->st_size) + local->stbuf.st_size = buf->st_size; + if (local->stbuf.st_blksize != buf->st_blksize) { + /* TODO: add to blocks in terms of + original block size */ + } + } + } + UNLOCK (&frame->lock); + + if (!callcnt) { + if 
(local->failed) + local->op_ret = -1; + + if ((local->op_ret != -1) && priv->xattr_supported) { + /* Send a setxattr request to nodes where the + files are created */ + int32_t index = 0; + char size_key[256] = {0,}; + char index_key[256] = {0,}; + char count_key[256] = {0,}; + xlator_list_t *trav = this->children; + dict_t *dict = NULL; + + sprintf (size_key, + "trusted.%s.stripe-size", this->name); + sprintf (count_key, + "trusted.%s.stripe-count", this->name); + sprintf (index_key, + "trusted.%s.stripe-index", this->name); + + local->call_count = priv->child_count; + + while (trav) { + dict = get_new_dict (); + dict_ref (dict); + /* TODO: check return value */ + ret = dict_set_int64 (dict, size_key, + local->stripe_size); + ret = dict_set_int32 (dict, count_key, + local->call_count); + ret = dict_set_int32 (dict, index_key, index); + + STACK_WIND (frame, + stripe_mknod_ifreg_setxattr_cbk, + trav->xlator, + trav->xlator->fops->setxattr, + &local->loc, dict, 0); + + dict_unref (dict); + index++; + trav = trav->next; + } + } else { + /* Create itself has failed.. so return + without setxattring */ + loc_wipe (&local->loc); + STACK_UNWIND (frame, local->op_ret, local->op_errno, + local->inode, &local->stbuf); + } + } + + return 0; +} + + +/** + * stripe_mknod - + */ +int32_t +stripe_mknod (call_frame_t *frame, + xlator_t *this, + loc_t *loc, + mode_t mode, + dev_t rdev) +{ + stripe_private_t *priv = this->private; + stripe_local_t *local = NULL; + xlator_list_t *trav = NULL; + + if (priv->first_child_down) { + gf_log (this->name, GF_LOG_WARNING, + "First node down, returning EIO"); + STACK_UNWIND (frame, -1, EIO, NULL, NULL); + return 0; + } + + if (S_ISREG(mode)) { + /* NOTE: on older kernels (older than 2.6.9), + creat() fops is sent as mknod() + open(). 
Hence handling + S_IFREG files is necessary */ + if (priv->nodes_down) { + gf_log (this->name, GF_LOG_WARNING, + "Some node down, returning EIO"); + STACK_UNWIND (frame, -1, EIO, loc->inode, NULL); + return 0; + } + + /* Initialization */ + local = CALLOC (1, sizeof (stripe_local_t)); + ERR_ABORT (local); + local->op_ret = -1; + local->op_errno = ENOTCONN; + local->stripe_size = stripe_get_matching_bs (loc->path, + priv->pattern, + priv->block_size); + frame->local = local; + local->inode = loc->inode; + loc_copy (&local->loc, loc); + + /* Everytime in stripe lookup, all child nodes should + be looked up */ + local->call_count = + ((stripe_private_t *)this->private)->child_count; + + trav = this->children; + while (trav) { + STACK_WIND (frame, + stripe_mknod_ifreg_cbk, + trav->xlator, + trav->xlator->fops->mknod, + loc, mode, rdev); + trav = trav->next; + } + + /* This case is handled, no need to continue further. */ + return 0; + } + + + STACK_WIND (frame, + stripe_common_inode_cbk, + FIRST_CHILD(this), + FIRST_CHILD(this)->fops->mknod, + loc, mode, rdev); + + return 0; +} + + +/** + * stripe_mkdir - + */ +int32_t +stripe_mkdir (call_frame_t *frame, + xlator_t *this, + loc_t *loc, + mode_t mode) +{ + stripe_private_t *priv = this->private; + stripe_local_t *local = NULL; + xlator_list_t *trav = NULL; + + if (priv->first_child_down) { + gf_log (this->name, GF_LOG_WARNING, + "First node down, returning EIO"); + STACK_UNWIND (frame, -1, EIO, NULL, NULL); + return 0; + } + + /* Initialization */ + local = CALLOC (1, sizeof (stripe_local_t)); + ERR_ABORT (local); + local->op_ret = -1; + local->call_count = priv->child_count; + frame->local = local; + + /* Everytime in stripe lookup, all child nodes should be looked up */ + trav = this->children; + while (trav) { + STACK_WIND (frame, + stripe_stack_unwind_inode_cbk, + trav->xlator, + trav->xlator->fops->mkdir, + loc, mode); + trav = trav->next; + } + + return 0; +} + + +/** + * stripe_symlink - + */ +int32_t 
+stripe_symlink (call_frame_t *frame, + xlator_t *this, + const char *linkpath, + loc_t *loc) +{ + stripe_private_t *priv = this->private; + + if (priv->first_child_down) { + gf_log (this->name, GF_LOG_WARNING, + "First node down, returning EIO"); + STACK_UNWIND (frame, -1, EIO, NULL, NULL); + return 0; + } + + /* send symlink to only first node */ + STACK_WIND (frame, + stripe_common_inode_cbk, + FIRST_CHILD(this), + FIRST_CHILD(this)->fops->symlink, + linkpath, loc); + + return 0; +} + +/** + * stripe_link - + */ +int32_t +stripe_link (call_frame_t *frame, + xlator_t *this, + loc_t *oldloc, + loc_t *newloc) +{ + int send_fop_to_all = 0; + stripe_private_t *priv = this->private; + stripe_local_t *local = NULL; + xlator_list_t *trav = this->children; + + STRIPE_CHECK_INODE_CTX_AND_UNWIND_ON_ERR (oldloc); + + if (priv->first_child_down) { + gf_log (this->name, GF_LOG_WARNING, + "First node down, returning EIO"); + STACK_UNWIND (frame, -1, EIO, NULL, NULL); + return 0; + } + + + if (S_ISREG (oldloc->inode->st_mode)) + send_fop_to_all = 1; + + if (!send_fop_to_all) { + STACK_WIND (frame, + stripe_common_inode_cbk, + trav->xlator, + trav->xlator->fops->link, + oldloc, newloc); + } else { + /* Initialization */ + local = CALLOC (1, sizeof (stripe_local_t)); + ERR_ABORT (local); + local->op_ret = -1; + frame->local = local; + local->call_count = priv->child_count; + + /* Everytime in stripe lookup, all child + nodes should be looked up */ + while (trav) { + STACK_WIND (frame, + stripe_stack_unwind_inode_cbk, + trav->xlator, + trav->xlator->fops->link, + oldloc, newloc); + trav = trav->next; + } + } + + return 0; +} + +int32_t +stripe_create_fail_unlink_cbk (call_frame_t *frame, + void *cookie, + xlator_t *this, + int32_t op_ret, + int32_t op_errno) +{ + int32_t callcnt = 0; + fd_t *lfd = NULL; + stripe_local_t *local = frame->local; + + LOCK (&frame->lock); + { + callcnt = --local->call_count; + } + UNLOCK (&frame->lock); + + if (!callcnt) { + lfd = local->fd; + loc_wipe 
(&local->loc); + STACK_UNWIND (frame, local->op_ret, local->op_errno, + local->fd, local->inode, &local->stbuf); + fd_unref (lfd); + } + return 0; +} + + +/** + * stripe_create_setxattr_cbk - + */ +int32_t +stripe_create_setxattr_cbk (call_frame_t *frame, + void *cookie, + xlator_t *this, + int32_t op_ret, + int32_t op_errno) +{ + fd_t *lfd = NULL; + int32_t callcnt = 0; + stripe_local_t *local = frame->local; + stripe_private_t *priv = this->private; + xlator_list_t *trav = this->children; + + LOCK (&frame->lock); + { + callcnt = --local->call_count; + + if (op_ret == -1) { + gf_log (this->name, GF_LOG_WARNING, + "%s returned error %s", + ((call_frame_t *)cookie)->this->name, + strerror (op_errno)); + local->op_ret = -1; + local->op_errno = op_errno; + } + } + UNLOCK (&frame->lock); + + if (!callcnt) { + if (local->op_ret == -1) { + local->call_count = priv->child_count; + while (trav) { + STACK_WIND (frame, + stripe_create_fail_unlink_cbk, + trav->xlator, + trav->xlator->fops->unlink, + &local->loc); + trav = trav->next; + } + + return 0; + } + + lfd = local->fd; + loc_wipe (&local->loc); + STACK_UNWIND (frame, local->op_ret, local->op_errno, + local->fd, local->inode, &local->stbuf); + fd_unref (lfd); + } + + return 0; +} + +/** + * stripe_create_cbk - + */ +int32_t +stripe_create_cbk (call_frame_t *frame, + void *cookie, + xlator_t *this, + int32_t op_ret, + int32_t op_errno, + fd_t *fd, + inode_t *inode, + struct stat *buf) +{ + int32_t callcnt = 0; + stripe_local_t *local = frame->local; + stripe_private_t *priv = this->private; + fd_t *lfd = NULL; + + LOCK (&frame->lock); + { + callcnt = --local->call_count; + + if (op_ret == -1) { + gf_log (this->name, GF_LOG_WARNING, + "%s returned error %s", + ((call_frame_t *)cookie)->this->name, + strerror (op_errno)); + local->failed = 1; + local->op_errno = op_errno; + } + + if (op_ret >= 0) { + local->op_ret = op_ret; + /* Get the mapping in inode private */ + /* Get the stat buf right */ + if 
(local->stbuf.st_blksize == 0) { + local->stbuf = *buf; + /* Because st_blocks gets added again */ + local->stbuf.st_blocks = 0; + } + + /* Always, pass the inode number of first + child to the above layer */ + if (FIRST_CHILD(this) == + ((call_frame_t *)cookie)->this) + local->stbuf.st_ino = buf->st_ino; + + local->stbuf.st_blocks += buf->st_blocks; + if (local->stbuf.st_size < buf->st_size) + local->stbuf.st_size = buf->st_size; + if (local->stbuf.st_blksize != buf->st_blksize) { + /* TODO: add to blocks in terms of + original block size */ + } + } + } + UNLOCK (&frame->lock); + + if (!callcnt) { + if (local->failed) + local->op_ret = -1; + + if (local->op_ret >= 0) { + fd_ctx_set (local->fd, this, local->stripe_size); + } + + if ((local->op_ret != -1) && + local->stripe_size && priv->xattr_supported) { + /* Send a setxattr request to nodes where + the files are created */ + int ret = 0; + int32_t index = 0; + char size_key[256] = {0,}; + char index_key[256] = {0,}; + char count_key[256] = {0,}; + xlator_list_t *trav = this->children; + dict_t *dict = NULL; + + sprintf (size_key, + "trusted.%s.stripe-size", this->name); + sprintf (count_key, + "trusted.%s.stripe-count", this->name); + sprintf (index_key, + "trusted.%s.stripe-index", this->name); + + local->call_count = priv->child_count; + + while (trav) { + dict = get_new_dict (); + dict_ref (dict); + + /* TODO: check return values */ + ret = dict_set_int64 (dict, size_key, + local->stripe_size); + ret = dict_set_int32 (dict, count_key, + local->call_count); + ret = dict_set_int32 (dict, index_key, index); + + STACK_WIND (frame, + stripe_create_setxattr_cbk, + trav->xlator, + trav->xlator->fops->setxattr, + &local->loc, + dict, + 0); + + dict_unref (dict); + index++; + trav = trav->next; + } + } else { + /* Create itself has failed.. 
so return + without setxattring */ + lfd = local->fd; + loc_wipe (&local->loc); + STACK_UNWIND (frame, local->op_ret, local->op_errno, + local->fd, local->inode, &local->stbuf); + + fd_unref (lfd); + } + } + + return 0; +} + + +/** + * stripe_create - If a block-size is specified for the 'name', create the + * file in all the child nodes. If not, create it in only first child. + * + * @name- complete path of the file to be created. + */ +int32_t +stripe_create (call_frame_t *frame, + xlator_t *this, + loc_t *loc, + int32_t flags, + mode_t mode, + fd_t *fd) +{ + stripe_private_t *priv = this->private; + stripe_local_t *local = NULL; + xlator_list_t *trav = NULL; + + /* files created in O_APPEND mode does not allow lseek() on fd */ + flags &= ~O_APPEND; + + if (priv->first_child_down || priv->nodes_down) { + gf_log (this->name, GF_LOG_WARNING, + "First node down, returning EIO"); + STACK_UNWIND (frame, -1, EIO, fd, loc->inode, NULL); + return 0; + } + + /* Initialization */ + local = CALLOC (1, sizeof (stripe_local_t)); + ERR_ABORT (local); + local->op_ret = -1; + local->op_errno = ENOTCONN; + local->stripe_size = stripe_get_matching_bs (loc->path, + priv->pattern, + priv->block_size); + frame->local = local; + local->inode = loc->inode; + loc_copy (&local->loc, loc); + local->fd = fd_ref (fd); + + local->call_count = ((stripe_private_t *)this->private)->child_count; + + trav = this->children; + while (trav) { + STACK_WIND (frame, + stripe_create_cbk, + trav->xlator, + trav->xlator->fops->create, + loc, flags, mode, fd); + trav = trav->next; + } + + return 0; +} + +/** + * stripe_open_cbk - + */ +int32_t +stripe_open_cbk (call_frame_t *frame, + void *cookie, + xlator_t *this, + int32_t op_ret, + int32_t op_errno, + fd_t *fd) +{ + int32_t callcnt = 0; + stripe_local_t *local = frame->local; + + LOCK (&frame->lock); + { + callcnt = --local->call_count; + + if (op_ret == -1) { + local->failed = 1; + gf_log (this->name, GF_LOG_WARNING, + "%s returned error %s", + 
((call_frame_t *)cookie)->this->name, + strerror (op_errno)); + local->op_ret = -1; + local->op_errno = op_errno; + } + + if (op_ret >= 0) + local->op_ret = op_ret; + } + UNLOCK (&frame->lock); + + if (!callcnt) { + if (local->failed) + local->op_ret = -1; + + if (local->op_ret >= 0) { + fd_ctx_set (local->fd, this, local->stripe_size); + } + loc_wipe (&local->loc); + STACK_UNWIND (frame, local->op_ret, local->op_errno, fd); + } + + return 0; +} + + +/** + * stripe_getxattr_cbk - + */ +int32_t +stripe_open_getxattr_cbk (call_frame_t *frame, + void *cookie, + xlator_t *this, + int32_t op_ret, + int32_t op_errno, + dict_t *dict) +{ + int32_t callcnt = 0; + stripe_local_t *local = frame->local; + xlator_list_t *trav = this->children; + stripe_private_t *priv = this->private; + + LOCK (&frame->lock); + { + callcnt = --local->call_count; + + if (op_ret == -1) { + gf_log (this->name, GF_LOG_WARNING, + "%s returned error %s", + ((call_frame_t *)cookie)->this->name, + strerror (op_errno)); + local->op_ret = -1; + local->op_errno = op_errno; + if (op_errno == ENOTCONN) + local->failed = 1; + } + } + UNLOCK (&frame->lock); + + if (!callcnt) { + if (!local->failed && (local->op_ret != -1)) { + /* If getxattr doesn't fails, call open */ + char size_key[256] = {0,}; + data_t *stripe_size_data = NULL; + + sprintf (size_key, + "trusted.%s.stripe-size", this->name); + stripe_size_data = dict_get (dict, size_key); + + if (stripe_size_data) { + local->stripe_size = + data_to_int64 (stripe_size_data); + /* + if (local->stripe_size != priv->block_size) { + gf_log (this->name, GF_LOG_WARNING, + "file(%s) is having different " + "block-size", local->loc.path); + } + */ + } else { + /* if the file was created using earlier + versions of stripe */ + gf_log (this->name, GF_LOG_CRITICAL, + "[CRITICAL] Seems like file(%s) " + "created using earlier version", + local->loc.path); + } + } + + local->call_count = priv->child_count; + + while (trav) { + STACK_WIND (frame, + stripe_open_cbk, + 
trav->xlator, + trav->xlator->fops->open, + &local->loc, local->flags, local->fd); + trav = trav->next; + } + } + + return 0; +} + +/** + * stripe_open - + */ +int32_t +stripe_open (call_frame_t *frame, + xlator_t *this, + loc_t *loc, + int32_t flags, + fd_t *fd) +{ + stripe_local_t *local = NULL; + stripe_private_t *priv = this->private; + xlator_list_t *trav = this->children; + + STRIPE_CHECK_INODE_CTX_AND_UNWIND_ON_ERR (loc); + + /* files opened in O_APPEND mode does not allow lseek() on fd */ + flags &= ~O_APPEND; + + if (priv->first_child_down) { + gf_log (this->name, GF_LOG_WARNING, + "First node down, returning ENOTCONN"); + STACK_UNWIND (frame, -1, ENOTCONN, NULL); + return 0; + } + + /* Initialization */ + local = CALLOC (1, sizeof (stripe_local_t)); + ERR_ABORT (local); + local->fd = fd; + frame->local = local; + local->inode = loc->inode; + loc_copy (&local->loc, loc); + + /* Striped files */ + local->flags = flags; + local->call_count = priv->child_count; + local->stripe_size = stripe_get_matching_bs (loc->path, + priv->pattern, + priv->block_size); + + if (priv->xattr_supported) { + while (trav) { + STACK_WIND (frame, + stripe_open_getxattr_cbk, + trav->xlator, + trav->xlator->fops->getxattr, + loc, NULL); + trav = trav->next; + } + } else { + while (trav) { + STACK_WIND (frame, + stripe_open_cbk, + trav->xlator, + trav->xlator->fops->open, + &local->loc, local->flags, local->fd); + trav = trav->next; + } + } + + return 0; +} + +/** + * stripe_opendir_cbk - + */ +int32_t +stripe_opendir_cbk (call_frame_t *frame, + void *cookie, + xlator_t *this, + int32_t op_ret, + int32_t op_errno, + fd_t *fd) +{ + int32_t callcnt = 0; + stripe_local_t *local = frame->local; + + LOCK (&frame->lock); + { + callcnt = --local->call_count; + + if (op_ret == -1) { + gf_log (this->name, GF_LOG_WARNING, + "%s returned error %s", + ((call_frame_t *)cookie)->this->name, + strerror (op_errno)); + local->op_ret = -1; + local->failed = 1; + local->op_errno = op_errno; + } + + if 
(op_ret >= 0) + local->op_ret = op_ret; + } + UNLOCK (&frame->lock); + + if (!callcnt) { + STACK_UNWIND (frame, local->op_ret, + local->op_errno, local->fd); + } + + return 0; +} + + +/** + * stripe_opendir - + */ +int32_t +stripe_opendir (call_frame_t *frame, + xlator_t *this, + loc_t *loc, + fd_t *fd) +{ + stripe_local_t *local = NULL; + stripe_private_t *priv = this->private; + xlator_list_t *trav = this->children; + + STRIPE_CHECK_INODE_CTX_AND_UNWIND_ON_ERR (loc); + + if (priv->first_child_down) { + gf_log (this->name, GF_LOG_WARNING, + "First node down, returning EIO"); + STACK_UNWIND (frame, -1, EIO, NULL); + return 0; + } + + /* Initialization */ + local = CALLOC (1, sizeof (stripe_local_t)); + ERR_ABORT (local); + frame->local = local; + local->inode = loc->inode; + local->fd = fd; + local->call_count = priv->child_count; + + while (trav) { + STACK_WIND (frame, + stripe_opendir_cbk, + trav->xlator, + trav->xlator->fops->opendir, + loc, fd); + trav = trav->next; + } + + return 0; +} + + +/** + * stripe_getxattr_cbk - + */ +int32_t +stripe_getxattr_cbk (call_frame_t *frame, + void *cookie, + xlator_t *this, + int32_t op_ret, + int32_t op_errno, + dict_t *value) +{ + STACK_UNWIND (frame, op_ret, op_errno, value); + return 0; +} + + +/** + * stripe_getxattr - + */ +int32_t +stripe_getxattr (call_frame_t *frame, + xlator_t *this, + loc_t *loc, + const char *name) +{ + STRIPE_CHECK_INODE_CTX_AND_UNWIND_ON_ERR (loc); + + STACK_WIND (frame, + stripe_getxattr_cbk, + FIRST_CHILD(this), + FIRST_CHILD(this)->fops->getxattr, + loc, name); + + return 0; +} + +/** + * stripe_removexattr - + */ +int32_t +stripe_removexattr (call_frame_t *frame, + xlator_t *this, + loc_t *loc, + const char *name) +{ + stripe_private_t *priv = this->private; + + STRIPE_CHECK_INODE_CTX_AND_UNWIND_ON_ERR (loc); + + if (priv->first_child_down) { + gf_log (this->name, GF_LOG_WARNING, + "First node down, returning ENOTCONN"); + STACK_UNWIND (frame, -1, ENOTCONN, NULL); + return 0; + } + + 
STACK_WIND (frame, + stripe_common_cbk, + FIRST_CHILD(this), + FIRST_CHILD(this)->fops->removexattr, + loc, name); + + return 0; +} + + +/** + * stripe_lk_cbk - + */ +int32_t +stripe_lk_cbk (call_frame_t *frame, + void *cookie, + xlator_t *this, + int32_t op_ret, + int32_t op_errno, + struct flock *lock) +{ + int32_t callcnt = 0; + stripe_local_t *local = frame->local; + + LOCK (&frame->lock); + { + callcnt = --local->call_count; + if (op_ret == -1) { + gf_log (this->name, GF_LOG_WARNING, + "%s returned error %s", + ((call_frame_t *)cookie)->this->name, + strerror (op_errno)); + local->op_errno = op_errno; + if (op_errno == ENOTCONN) + local->failed = 1; + } + if (op_ret == 0 && local->op_ret == -1) { + /* First successful call, copy the *lock */ + local->op_ret = 0; + local->lock = *lock; + } + } + UNLOCK (&frame->lock); + + if (!callcnt) { + if (local->failed) + local->op_ret = -1; + STACK_UNWIND (frame, local->op_ret, + local->op_errno, &local->lock); + } + return 0; +} + + +/** + * stripe_lk - + */ +int32_t +stripe_lk (call_frame_t *frame, + xlator_t *this, + fd_t *fd, + int32_t cmd, + struct flock *lock) +{ + stripe_local_t *local = NULL; + xlator_list_t *trav = this->children; + stripe_private_t *priv = this->private; + + STRIPE_CHECK_INODE_CTX_AND_UNWIND_ON_ERR (fd); + /* Initialization */ + local = CALLOC (1, sizeof (stripe_local_t)); + ERR_ABORT (local); + local->op_ret = -1; + frame->local = local; + + local->call_count = priv->child_count; + + while (trav) { + STACK_WIND (frame, + stripe_lk_cbk, + trav->xlator, + trav->xlator->fops->lk, + fd, cmd, lock); + trav = trav->next; + } + + return 0; +} + +/** + * stripe_writedir - + */ +int32_t +stripe_setdents (call_frame_t *frame, + xlator_t *this, + fd_t *fd, + int32_t flags, + dir_entry_t *entries, + int32_t count) +{ + stripe_local_t *local = NULL; + stripe_private_t *priv = this->private; + xlator_list_t *trav = this->children; + + STRIPE_CHECK_INODE_CTX_AND_UNWIND_ON_ERR (fd); + + /* Initialization */ + 
local = CALLOC (1, sizeof (stripe_local_t)); + ERR_ABORT (local); + local->op_ret = -1; + frame->local = local; + local->call_count = priv->child_count; + + while (trav) { + STACK_WIND (frame, + stripe_stack_unwind_cbk, + trav->xlator, + trav->xlator->fops->setdents, + fd, flags, entries, count); + trav = trav->next; + } + + return 0; +} + + +/** + * stripe_flush - + */ +int32_t +stripe_flush (call_frame_t *frame, + xlator_t *this, + fd_t *fd) +{ + stripe_local_t *local = NULL; + stripe_private_t *priv = this->private; + xlator_list_t *trav = this->children; + + STRIPE_CHECK_INODE_CTX_AND_UNWIND_ON_ERR (fd); + + /* Initialization */ + local = CALLOC (1, sizeof (stripe_local_t)); + ERR_ABORT (local); + local->op_ret = -1; + frame->local = local; + local->call_count = priv->child_count; + + while (trav) { + STACK_WIND (frame, + stripe_stack_unwind_cbk, + trav->xlator, + trav->xlator->fops->flush, + fd); + trav = trav->next; + } + + return 0; +} + + +/** + * stripe_close - + */ +int32_t +stripe_release (xlator_t *this, + fd_t *fd) +{ + return 0; +} + + +/** + * stripe_fsync - + */ +int32_t +stripe_fsync (call_frame_t *frame, + xlator_t *this, + fd_t *fd, + int32_t flags) +{ + stripe_local_t *local = NULL; + stripe_private_t *priv = this->private; + xlator_list_t *trav = this->children; + + STRIPE_CHECK_INODE_CTX_AND_UNWIND_ON_ERR (fd); + + /* Initialization */ + local = CALLOC (1, sizeof (stripe_local_t)); + ERR_ABORT (local); + local->op_ret = -1; + frame->local = local; + local->call_count = priv->child_count; + + while (trav) { + STACK_WIND (frame, + stripe_stack_unwind_cbk, + trav->xlator, + trav->xlator->fops->fsync, + fd, flags); + trav = trav->next; + } + + return 0; +} + + +/** + * stripe_fstat - + */ +int32_t +stripe_fstat (call_frame_t *frame, + xlator_t *this, + fd_t *fd) +{ + stripe_local_t *local = NULL; + stripe_private_t *priv = this->private; + xlator_list_t *trav = this->children; + + STRIPE_CHECK_INODE_CTX_AND_UNWIND_ON_ERR (fd); + + /* 
Initialization */ + local = CALLOC (1, sizeof (stripe_local_t)); + ERR_ABORT (local); + local->op_ret = -1; + frame->local = local; + local->inode = fd->inode; + local->call_count = priv->child_count; + + while (trav) { + STACK_WIND (frame, + stripe_stack_unwind_buf_cbk, + trav->xlator, + trav->xlator->fops->fstat, + fd); + trav = trav->next; + } + + return 0; +} + + +/** + * stripe_fchmod - + */ +int32_t +stripe_fchmod (call_frame_t *frame, + xlator_t *this, + fd_t *fd, + mode_t mode) +{ + stripe_local_t *local = NULL; + stripe_private_t *priv = this->private; + xlator_list_t *trav = this->children; + + STRIPE_CHECK_INODE_CTX_AND_UNWIND_ON_ERR (fd); + + /* Initialization */ + local = CALLOC (1, sizeof (stripe_local_t)); + ERR_ABORT (local); + local->op_ret = -1; + frame->local = local; + local->inode = fd->inode; + local->call_count = priv->child_count; + + while (trav) { + STACK_WIND (frame, + stripe_stack_unwind_buf_cbk, + trav->xlator, + trav->xlator->fops->fchmod, + fd, mode); + trav = trav->next; + } + + return 0; +} + + +/** + * stripe_fchown - + */ +int32_t +stripe_fchown (call_frame_t *frame, + xlator_t *this, + fd_t *fd, + uid_t uid, + gid_t gid) +{ + stripe_local_t *local = NULL; + stripe_private_t *priv = this->private; + xlator_list_t *trav = this->children; + + STRIPE_CHECK_INODE_CTX_AND_UNWIND_ON_ERR (fd); + + /* Initialization */ + local = CALLOC (1, sizeof (stripe_local_t)); + ERR_ABORT (local); + local->op_ret = -1; + frame->local = local; + local->inode = fd->inode; + local->call_count = priv->child_count; + + while (trav) { + STACK_WIND (frame, + stripe_stack_unwind_buf_cbk, + trav->xlator, + trav->xlator->fops->fchown, + fd, uid, gid); + trav = trav->next; + } + + return 0; +} + + +/** + * stripe_ftruncate - + */ +int32_t +stripe_ftruncate (call_frame_t *frame, + xlator_t *this, + fd_t *fd, + off_t offset) +{ + stripe_local_t *local = NULL; + stripe_private_t *priv = this->private; + xlator_list_t *trav = this->children; + + 
STRIPE_CHECK_INODE_CTX_AND_UNWIND_ON_ERR (fd); + + /* Initialization */ + local = CALLOC (1, sizeof (stripe_local_t)); + ERR_ABORT (local); + local->op_ret = -1; + frame->local = local; + local->inode = fd->inode; + local->call_count = priv->child_count; + + while (trav) { + STACK_WIND (frame, + stripe_stack_unwind_buf_cbk, + trav->xlator, + trav->xlator->fops->ftruncate, + fd, offset); + trav = trav->next; + } + + return 0; +} + + +/** + * stripe_releasedir - + */ +int32_t +stripe_releasedir (xlator_t *this, + fd_t *fd) +{ + return 0; +} + + +/** + * stripe_fsyncdir - + */ +int32_t +stripe_fsyncdir (call_frame_t *frame, + xlator_t *this, + fd_t *fd, + int32_t flags) +{ + stripe_local_t *local = NULL; + stripe_private_t *priv = this->private; + xlator_list_t *trav = this->children; + + STRIPE_CHECK_INODE_CTX_AND_UNWIND_ON_ERR (fd); + + /* Initialization */ + local = CALLOC (1, sizeof (stripe_local_t)); + ERR_ABORT (local); + local->op_ret = -1; + frame->local = local; + local->call_count = priv->child_count; + + while (trav) { + STACK_WIND (frame, + stripe_stack_unwind_cbk, + trav->xlator, + trav->xlator->fops->fsyncdir, + fd, + flags); + trav = trav->next; + } + + return 0; +} + + +/** + * stripe_single_readv_cbk - This function is used as return fn, when the + * file name doesn't match the pattern specified for striping. + */ +int32_t +stripe_single_readv_cbk (call_frame_t *frame, + void *cookie, + xlator_t *this, + int32_t op_ret, + int32_t op_errno, + struct iovec *vector, + int32_t count, + struct stat *stbuf) +{ + STACK_UNWIND (frame, op_ret, op_errno, vector, count, stbuf); + return 0; +} + +/** + * stripe_readv_cbk - get all the striped reads, and order it properly, send it + * to above layer after putting it in a single vector. 
+ */ +int32_t +stripe_readv_cbk (call_frame_t *frame, + void *cookie, + xlator_t *this, + int32_t op_ret, + int32_t op_errno, + struct iovec *vector, + int32_t count, + struct stat *stbuf) +{ + int32_t index = 0; + int32_t callcnt = 0; + call_frame_t *main_frame = NULL; + stripe_local_t *main_local = NULL; + stripe_local_t *local = frame->local; + + /* frame is a per-stripe copy: its local names the reply slot + * (node_index) and the frame the read was issued on (orig_frame). + */ + index = local->node_index; + main_frame = local->orig_frame; + main_local = main_frame->local; + + LOCK (&main_frame->lock); + { + main_local->replies[index].op_ret = op_ret; + main_local->replies[index].op_errno = op_errno; + if (op_ret >= 0) { + main_local->replies[index].stbuf = *stbuf; + main_local->replies[index].count = count; + main_local->replies[index].vector = + iov_dup (vector, count); + + if (frame->root->rsp_refs) + dict_copy (frame->root->rsp_refs, + main_frame->root->rsp_refs); + } + callcnt = ++main_local->call_count; + } + UNLOCK(&main_frame->lock); + + /* The last stripe to answer assembles the combined reply. */ + if (callcnt == main_local->wind_count) { + int32_t final_count = 0; + struct iovec *final_vec = NULL; + struct stat tmp_stbuf = {0,}; + dict_t *refs = main_frame->root->rsp_refs; + + op_ret = 0; + memcpy (&tmp_stbuf, &main_local->replies[0].stbuf, + sizeof (struct stat)); + for (index=0; index < main_local->wind_count; index++) { + /* TODO: check whether each stripe returned 'expected' + * number of bytes + */ + if (main_local->replies[index].op_ret == -1) { + op_ret = -1; + op_errno = main_local->replies[index].op_errno; + break; + } + op_ret += main_local->replies[index].op_ret; + final_count += main_local->replies[index].count; + /* TODO: Do I need to send anything more in stbuf? 
*/ + if (tmp_stbuf.st_size < + main_local->replies[index].stbuf.st_size) { + tmp_stbuf.st_size = + main_local->replies[index].stbuf.st_size; + } + } + if (op_ret != -1) { + /* Concatenate the per-stripe iovec arrays in stripe + * order; each duplicate is released once copied. + */ + final_vec = CALLOC (final_count, + sizeof (struct iovec)); + ERR_ABORT (final_vec); + final_count = 0; + + for (index=0; + index < main_local->wind_count; index++) { + memcpy (final_vec + final_count, + main_local->replies[index].vector, + (main_local->replies[index].count * + sizeof (struct iovec))); + final_count += + main_local->replies[index].count; + + free (main_local->replies[index].vector); + } + } else { + /* A stripe failed, so no data is handed to the caller. + * Release the iovec arrays duplicated for the stripes + * that did succeed; slots that were never filled are + * still NULL (replies is zero-allocated by + * stripe_readv) and free() ignores them. + */ + for (index=0; + index < main_local->wind_count; index++) { + free (main_local->replies[index].vector); + } + final_vec = NULL; + final_count = 0; + } + /* */ + FREE (main_local->replies); + refs = main_frame->root->rsp_refs; + STACK_UNWIND (main_frame, op_ret, op_errno, + final_vec, final_count, &tmp_stbuf); + + dict_unref (refs); + if (final_vec) + free (final_vec); + } + + STACK_DESTROY (frame->root); + return 0; +} + +/** + * stripe_readv - + */ +int32_t +stripe_readv (call_frame_t *frame, + xlator_t *this, + fd_t *fd, + size_t size, + off_t offset) +{ + int32_t index = 0; + int32_t num_stripe = 0; + size_t frame_size = 0; + off_t rounded_end = 0; + uint64_t stripe_size = 0; + off_t rounded_start = 0; + off_t frame_offset = offset; + stripe_local_t *local = NULL; + call_frame_t *rframe = NULL; + stripe_local_t *rlocal = NULL; + xlator_list_t *trav = this->children; + stripe_private_t *priv = this->private; + + fd_ctx_get (fd, this, &stripe_size); + if (!stripe_size) { + STACK_UNWIND (frame, -1, EINVAL, NULL, 0, NULL); + return 0; + } + + /* The file is stripe across the child nodes. Send the read request + * to the child nodes appropriately after checking which region of + * the file is in which child node. Always '0-<stripe_size>' part of + * the file resides in the first child. 
+ */ + rounded_start = floor (offset, stripe_size); + rounded_end = roof (offset+size, stripe_size); + num_stripe = (rounded_end - rounded_start) / stripe_size; + + local = CALLOC (1, sizeof (stripe_local_t)); + ERR_ABORT (local); + local->wind_count = num_stripe; + frame->local = local; + frame->root->rsp_refs = dict_ref (get_new_dict ()); + + /* This is where all the vectors should be copied. */ + local->replies = CALLOC (1, num_stripe * + sizeof (struct readv_replies)); + ERR_ABORT (local->replies); + + for (index = 0; + index < ((offset / stripe_size) % priv->child_count); + index++) { + trav = trav->next; + } + + for (index = 0; index < num_stripe; index++) { + rframe = copy_frame (frame); + rlocal = CALLOC (1, sizeof (stripe_local_t)); + ERR_ABORT (rlocal); + + frame_size = min (roof (frame_offset+1, stripe_size), + (offset + size)) - frame_offset; + + rlocal->node_index = index; + rlocal->orig_frame = frame; + rframe->local = rlocal; + STACK_WIND (rframe, + stripe_readv_cbk, + trav->xlator, + trav->xlator->fops->readv, + fd, frame_size, frame_offset); + + frame_offset += frame_size; + + trav = trav->next ? 
trav->next : this->children; + } + + return 0; +} + + +/** + * stripe_writev_cbk - + */ +int32_t +stripe_writev_cbk (call_frame_t *frame, + void *cookie, + xlator_t *this, + int32_t op_ret, + int32_t op_errno, + struct stat *stbuf) +{ + int32_t callcnt = 0; + stripe_local_t *local = frame->local; + LOCK(&frame->lock); + { + callcnt = ++local->call_count; + + if (op_ret == -1) { + gf_log (this->name, GF_LOG_WARNING, + "%s returned error %s", + ((call_frame_t *)cookie)->this->name, + strerror (op_errno)); + local->op_errno = op_errno; + local->op_ret = -1; + } + if (op_ret >= 0) { + local->op_ret += op_ret; + local->stbuf = *stbuf; + } + } + UNLOCK (&frame->lock); + + if ((callcnt == local->wind_count) && local->unwind) { + STACK_UNWIND (frame, local->op_ret, + local->op_errno, &local->stbuf); + } + return 0; +} + + +/** + * stripe_single_writev_cbk - + */ +int32_t +stripe_single_writev_cbk (call_frame_t *frame, + void *cookie, + xlator_t *this, + int32_t op_ret, + int32_t op_errno, + struct stat *stbuf) +{ + STACK_UNWIND (frame, op_ret, op_errno, stbuf); + return 0; +} +/** + * stripe_writev - + */ +int32_t +stripe_writev (call_frame_t *frame, + xlator_t *this, + fd_t *fd, + struct iovec *vector, + int32_t count, + off_t offset) +{ + int32_t idx = 0; + int32_t total_size = 0; + int32_t offset_offset = 0; + int32_t remaining_size = 0; + int32_t tmp_count = count; + off_t fill_size = 0; + uint64_t stripe_size = 0; + struct iovec *tmp_vec = vector; + stripe_private_t *priv = this->private; + stripe_local_t *local = NULL; + xlator_list_t *trav = NULL; + + fd_ctx_get (fd, this, &stripe_size); + if (!stripe_size) { + STACK_UNWIND (frame, -1, EINVAL, NULL); + return 0; + } + + /* File has to be stripped across the child nodes */ + for (idx = 0; idx< count; idx ++) { + total_size += tmp_vec[idx].iov_len; + } + remaining_size = total_size; + + local = CALLOC (1, sizeof (stripe_local_t)); + ERR_ABORT (local); + frame->local = local; + local->stripe_size = stripe_size; + + 
while (1) { + /* Send striped chunk of the vector to child + nodes appropriately. */ + trav = this->children; + + idx = (((offset + offset_offset) / + local->stripe_size) % priv->child_count); + while (idx) { + trav = trav->next; + idx--; + } + fill_size = (local->stripe_size - + ((offset + offset_offset) % local->stripe_size)); + if (fill_size > remaining_size) + fill_size = remaining_size; + + remaining_size -= fill_size; + + tmp_count = iov_subset (vector, count, offset_offset, + offset_offset + fill_size, NULL); + tmp_vec = CALLOC (tmp_count, sizeof (struct iovec)); + ERR_ABORT (tmp_vec); + tmp_count = iov_subset (vector, count, offset_offset, + offset_offset + fill_size, tmp_vec); + + local->wind_count++; + if (remaining_size == 0) + local->unwind = 1; + + STACK_WIND(frame, + stripe_writev_cbk, + trav->xlator, + trav->xlator->fops->writev, + fd, tmp_vec, tmp_count, offset + offset_offset); + FREE (tmp_vec); + offset_offset += fill_size; + if (remaining_size == 0) + break; + } + + return 0; +} + + + +/* Management operations */ + +/** + * stripe_stats_cbk - Add all the fields received from different clients. + * Once all the clients return, send stats to above layer. 
+ * + */ +int32_t +stripe_stats_cbk (call_frame_t *frame, + void *cookie, + xlator_t *this, + int32_t op_ret, + int32_t op_errno, + struct xlator_stats *stats) +{ + int32_t callcnt = 0; + stripe_local_t *local = frame->local; + + LOCK(&frame->lock); + { + callcnt = --local->call_count; + + if (op_ret == -1) { + gf_log (this->name, GF_LOG_WARNING, + "%s returned error %s", + ((call_frame_t *)cookie)->this->name, + strerror (op_errno)); + local->op_ret = -1; + local->op_errno = op_errno; + } + if (op_ret == 0) { + if (local->op_ret == -2) { + /* This is to make sure this is the + first time */ + local->stats = *stats; + local->op_ret = 0; + } else { + local->stats.nr_files += stats->nr_files; + local->stats.free_disk += stats->free_disk; + local->stats.disk_usage += stats->disk_usage; + local->stats.nr_clients += stats->nr_clients; + } + } + } + UNLOCK (&frame->lock); + + if (!callcnt) { + STACK_UNWIND (frame, local->op_ret, local->op_errno, + &local->stats); + } + + return 0; +} + +/** + * stripe_stats - + */ +int32_t +stripe_stats (call_frame_t *frame, + xlator_t *this, + int32_t flags) +{ + stripe_local_t *local = NULL; + xlator_list_t *trav = this->children; + + local = CALLOC (1, sizeof (stripe_local_t)); + ERR_ABORT (local); + frame->local = local; + local->op_ret = -2; /* to be used as a flag in _cbk */ + local->call_count = ((stripe_private_t*)this->private)->child_count; + while (trav) { + STACK_WIND (frame, + stripe_stats_cbk, + trav->xlator, + trav->xlator->mops->stats, + flags); + trav = trav->next; + } + return 0; +} + +/** + * notify + */ +int32_t +notify (xlator_t *this, + int32_t event, + void *data, + ...) 
+{ + stripe_private_t *priv = this->private; + int down_client = 0; + int i = 0; + + if (!priv) + return 0; + + switch (event) + { + case GF_EVENT_CHILD_UP: + { + /* get an index number to set */ + for (i = 0; i < priv->child_count; i++) { + if (data == priv->xl_array[i]) + break; + } + /* i == child_count means data is not one of our + * subvolumes; do not write past the tracked states. + */ + if (i < priv->child_count) + priv->state[i] = 1; + for (i = 0; i < priv->child_count; i++) { + if (!priv->state[i]) + down_client++; + } + + LOCK (&priv->lock); + { + priv->nodes_down = down_client; + + /* Only the first child's state is passed on. */ + if (data == FIRST_CHILD (this)) { + priv->first_child_down = 0; + default_notify (this, event, data); + } + } + UNLOCK (&priv->lock); + } + break; + case GF_EVENT_CHILD_DOWN: + { + /* get an index number to set */ + for (i = 0; i < priv->child_count; i++) { + if (data == priv->xl_array[i]) + break; + } + /* Same guard as for CHILD_UP: ignore an unknown sender. */ + if (i < priv->child_count) + priv->state[i] = 0; + for (i = 0; i < priv->child_count; i++) { + if (!priv->state[i]) + down_client++; + } + + LOCK (&priv->lock); + { + priv->nodes_down = down_client; + + if (data == FIRST_CHILD (this)) { + priv->first_child_down = 1; + default_notify (this, event, data); + } + } + UNLOCK (&priv->lock); + } + break; + + default: + { + /* */ + default_notify (this, event, data); + } + break; + } + + return 0; +} +/** + * init - This function is called when xlator-graph gets initialized. + * The option given in volfiles are parsed here. + * @this - + */ +int32_t +init (xlator_t *this) +{ + stripe_private_t *priv = NULL; + xlator_list_t *trav = NULL; + data_t *data = NULL; + int32_t count = 0; + + trav = this->children; + while (trav) { + count++; + trav = trav->next; + } + + if (!count) { + gf_log (this->name, GF_LOG_ERROR, + "stripe configured without \"subvolumes\" option. " + "exiting"); + return -1; + } + + if (!this->parents) { + gf_log (this->name, GF_LOG_WARNING, + "dangling volume. 
check volfile "); + } + + priv = CALLOC (1, sizeof (stripe_private_t)); + ERR_ABORT (priv); + priv->xl_array = CALLOC (1, count * sizeof (xlator_t *)); + ERR_ABORT (priv->xl_array); + priv->child_count = count; + LOCK_INIT (&priv->lock); + + trav = this->children; + count = 0; + while (trav) { + priv->xl_array[count++] = trav->xlator; + trav = trav->next; + } + + if (count > 256) { + gf_log (this->name, GF_LOG_ERROR, + "maximum number of stripe subvolumes supported " + "is 256"); + return -1; + } + + priv->block_size = (128 * GF_UNIT_KB); + /* option stripe-pattern *avi:1GB,*pdf:4096 */ + data = dict_get (this->options, "block-size"); + if (!data) { + gf_log (this->name, GF_LOG_DEBUG, + "No \"option block-size <x>\" given, defaulting " + "to 128KB"); + } else { + char *tmp_str = NULL; + char *tmp_str1 = NULL; + char *dup_str = NULL; + char *stripe_str = NULL; + char *pattern = NULL; + char *num = NULL; + struct stripe_options *temp_stripeopt = NULL; + struct stripe_options *stripe_opt = NULL; + + /* Get the pattern for striping. 
+ "option block-size *avi:10MB" etc */ + stripe_str = strtok_r (data->data, ",", &tmp_str); + while (stripe_str) { + dup_str = strdup (stripe_str); + stripe_opt = CALLOC (1, + sizeof (struct stripe_options)); + ERR_ABORT (stripe_opt); + pattern = strtok_r (dup_str, ":", &tmp_str1); + num = strtok_r (NULL, ":", &tmp_str1); + if (num && + (gf_string2bytesize (num, + &stripe_opt->block_size) + != 0)) { + gf_log (this->name, GF_LOG_ERROR, + "invalid number format \"%s\"", + num); + return -1; + } else if (!num && (gf_string2bytesize ( + pattern, + &stripe_opt->block_size) + != 0)) { + /* Possible that there is no pattern given */ + stripe_opt->block_size = (128 * GF_UNIT_KB); + pattern = "*"; + } + memcpy (stripe_opt->path_pattern, + pattern, strlen (pattern)); + + gf_log (this->name, GF_LOG_DEBUG, + "block-size : pattern %s : size %"PRId64, + stripe_opt->path_pattern, + stripe_opt->block_size); + + if (!priv->pattern) { + priv->pattern = stripe_opt; + } else { + temp_stripeopt = priv->pattern; + while (temp_stripeopt->next) + temp_stripeopt = temp_stripeopt->next; + temp_stripeopt->next = stripe_opt; + } + stripe_str = strtok_r (NULL, ",", &tmp_str); + } + } + + priv->xattr_supported = 1; + data = dict_get (this->options, "use-xattr"); + if (data) { + if (gf_string2boolean (data->data, + &priv->xattr_supported) == -1) { + gf_log (this->name, GF_LOG_ERROR, + "error setting hard check for extended " + "attribute"); + //return -1; + } + } + + /* notify related */ + priv->nodes_down = priv->child_count; + this->private = priv; + + return 0; +} + +/** + * fini - Free all the private variables + * @this - + */ +void +fini (xlator_t *this) +{ + stripe_private_t *priv = this->private; + struct stripe_options *prev = NULL; + struct stripe_options *trav = priv->pattern; + while (trav) { + prev = trav; + trav = trav->next; + FREE (prev); + } + FREE (priv->xl_array); + LOCK_DESTROY (&priv->lock); + FREE (priv); + return; +} + + +struct xlator_fops fops = { + .stat = stripe_stat, 
+ .unlink = stripe_unlink, + .symlink = stripe_symlink, + .rename = stripe_rename, + .link = stripe_link, + .chmod = stripe_chmod, + .chown = stripe_chown, + .truncate = stripe_truncate, + .utimens = stripe_utimens, + .create = stripe_create, + .open = stripe_open, + .readv = stripe_readv, + .writev = stripe_writev, + .statfs = stripe_statfs, + .flush = stripe_flush, + .fsync = stripe_fsync, + .setxattr = stripe_setxattr, + .getxattr = stripe_getxattr, + .removexattr = stripe_removexattr, + .access = stripe_access, + .ftruncate = stripe_ftruncate, + .fstat = stripe_fstat, + .readlink = stripe_readlink, + .mkdir = stripe_mkdir, + .rmdir = stripe_rmdir, + .lk = stripe_lk, + .opendir = stripe_opendir, + .fsyncdir = stripe_fsyncdir, + .fchmod = stripe_fchmod, + .fchown = stripe_fchown, + .lookup = stripe_lookup, + .setdents = stripe_setdents, + .mknod = stripe_mknod, +}; + +struct xlator_mops mops = { + .stats = stripe_stats, +}; + +struct xlator_cbks cbks = { + .release = stripe_release, + .releasedir = stripe_releasedir +}; + + +struct volume_options options[] = { + { .key = {"block-size"}, + .type = GF_OPTION_TYPE_ANY + }, + { .key = {"use-xattr"}, + .type = GF_OPTION_TYPE_BOOL + }, + { .key = {NULL} }, +}; diff --git a/xlators/cluster/unify/Makefile.am b/xlators/cluster/unify/Makefile.am new file mode 100644 index 000000000..d471a3f92 --- /dev/null +++ b/xlators/cluster/unify/Makefile.am @@ -0,0 +1,3 @@ +SUBDIRS = src + +CLEANFILES = diff --git a/xlators/cluster/unify/src/Makefile.am b/xlators/cluster/unify/src/Makefile.am new file mode 100644 index 000000000..b9e6f63e9 --- /dev/null +++ b/xlators/cluster/unify/src/Makefile.am @@ -0,0 +1,16 @@ + +xlator_LTLIBRARIES = unify.la +xlatordir = $(libdir)/glusterfs/$(PACKAGE_VERSION)/xlator/cluster + +unify_la_LDFLAGS = -module -avoidversion + +unify_la_SOURCES = unify.c unify-self-heal.c +unify_la_LIBADD = $(top_builddir)/libglusterfs/src/libglusterfs.la + +noinst_HEADERS = unify.h + +AM_CFLAGS = -fPIC 
-D_FILE_OFFSET_BITS=64 -D_GNU_SOURCE -Wall -D$(GF_HOST_OS) \ + -I$(top_srcdir)/libglusterfs/src -shared -nostartfiles $(GF_CFLAGS) + +CLEANFILES = + diff --git a/xlators/cluster/unify/src/unify-self-heal.c b/xlators/cluster/unify/src/unify-self-heal.c new file mode 100644 index 000000000..4885dd91a --- /dev/null +++ b/xlators/cluster/unify/src/unify-self-heal.c @@ -0,0 +1,1225 @@ +/* + Copyright (c) 2007, 2008 Z RESEARCH, Inc. <http://www.zresearch.com> + This file is part of GlusterFS. + + GlusterFS is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published + by the Free Software Foundation; either version 3 of the License, + or (at your option) any later version. + + GlusterFS is distributed in the hope that it will be useful, but + WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program. If not, see + <http://www.gnu.org/licenses/>. +*/ + +/** + * unify-self-heal.c : + * This file implements few functions which enables 'unify' translator + * to be consistent in its behaviour when + * > a node fails, + * > a node gets added, + * > a failed node comes back + * > a new namespace server is added (ie, an fresh namespace server). + * + * This functionality of 'unify' will enable glusterfs to support storage + * system failure, and maintain consistancy. This works both ways, ie, when + * an entry (either file or directory) is found on namespace server, and not + * on storage nodes, its created in storage nodes and vica-versa. 
+ * + * The two fops, where it can be implemented are 'getdents ()' and 'lookup ()' + * + */ + +#ifndef _CONFIG_H +#define _CONFIG_H +#include "config.h" +#endif + +#include "glusterfs.h" +#include "unify.h" +#include "dict.h" +#include "xlator.h" +#include "hashfn.h" +#include "logging.h" +#include "stack.h" +#include "common-utils.h" + +int32_t +unify_sh_getdents_cbk (call_frame_t *frame, + void *cookie, + xlator_t *this, + int32_t op_ret, + int32_t op_errno, + dir_entry_t *entry, + int32_t count); + +int32_t +unify_sh_ns_getdents_cbk (call_frame_t *frame, + void *cookie, + xlator_t *this, + int32_t op_ret, + int32_t op_errno, + dir_entry_t *entry, + int32_t count); + +int32_t +unify_bgsh_getdents_cbk (call_frame_t *frame, + void *cookie, + xlator_t *this, + int32_t op_ret, + int32_t op_errno, + dir_entry_t *entry, + int32_t count); + +int32_t +unify_bgsh_ns_getdents_cbk (call_frame_t *frame, + void *cookie, + xlator_t *this, + int32_t op_ret, + int32_t op_errno, + dir_entry_t *entry, + int32_t count); + +/** + * unify_local_wipe - free all the extra allocation of local->* here. 
+ */ +static void +unify_local_wipe (unify_local_t *local) +{ + /* Free the strdup'd variables in the local structure */ + if (local->name) { + FREE (local->name); + } + + if (local->sh_struct) { + if (local->sh_struct->offset_list) + FREE (local->sh_struct->offset_list); + + if (local->sh_struct->entry_list) + FREE (local->sh_struct->entry_list); + + if (local->sh_struct->count_list) + FREE (local->sh_struct->count_list); + + FREE (local->sh_struct); + } + + loc_wipe (&local->loc1); + loc_wipe (&local->loc2); +} + +/** + * unify_sh_setdents_cbk - reply to the setdents sent to each storage node + * by unify_sh_ns_getdents_cbk. When the last reply arrives the batch in + * entry_list[0] is freed; then either the next batch is requested from + * the namespace node, or (local->flags set) the frame is unwound. + */ +int32_t +unify_sh_setdents_cbk (call_frame_t *frame, + void *cookie, + xlator_t *this, + int32_t op_ret, + int32_t op_errno) +{ + int32_t callcnt = -1; + unify_local_t *local = frame->local; + inode_t *inode = NULL; + dict_t *tmp_dict = NULL; + dir_entry_t *prev, *entry, *trav; + + LOCK (&frame->lock); + { + /* if local->call_count == 0, that means, setdents on + * storagenodes is still pending. + */ + if (local->call_count) + callcnt = --local->call_count; + } + UNLOCK (&frame->lock); + + if (callcnt == 0) { + if (local->sh_struct->entry_list[0]) { + prev = entry = local->sh_struct->entry_list[0]; + if (!entry) + return 0; + trav = entry->next; + while (trav) { + prev->next = trav->next; + FREE (trav->name); + if (S_ISLNK (trav->buf.st_mode)) + FREE (trav->link); + FREE (trav); + trav = prev->next; + } + FREE (entry); + } + + if (!local->flags) { + if (local->sh_struct->count_list[0] >= + UNIFY_SELF_HEAL_GETDENTS_COUNT) { + /* count == size, that means, there are more entries + to read from */ + //local->call_count = 0; + local->sh_struct->offset_list[0] += + UNIFY_SELF_HEAL_GETDENTS_COUNT; + STACK_WIND (frame, + unify_sh_ns_getdents_cbk, + NS(this), + NS(this)->fops->getdents, + local->fd, + UNIFY_SELF_HEAL_GETDENTS_COUNT, + local->sh_struct->offset_list[0], + GF_GET_DIR_ONLY); + } + } else { + inode = local->loc1.inode; + fd_unref (local->fd); + /* Keep our own handle on the dict: local must not be + * touched once the frame has been unwound. + */ + tmp_dict = local->dict; + + unify_local_wipe (local); + + STACK_UNWIND (frame, local->op_ret, 
local->op_errno, + inode, &local->stbuf, local->dict); + if (tmp_dict) + dict_unref (tmp_dict); + } + } + + return 0; +} + + +/** + * unify_sh_ns_getdents_cbk - reply to getdents on the namespace node; + * the returned entries are sent to every storage node with setdents. + */ +int32_t +unify_sh_ns_getdents_cbk (call_frame_t *frame, + void *cookie, + xlator_t *this, + int32_t op_ret, + int32_t op_errno, + dir_entry_t *entry, + int32_t count) +{ + unify_local_t *local = frame->local; + unify_private_t *priv = this->private; + long index = 0; + unsigned long final = 0; + dir_entry_t *tmp = CALLOC (1, sizeof (dir_entry_t)); + + ERR_ABORT (tmp); + + /* tmp becomes a private list head that takes over the chain + * hanging off entry. + */ + local->sh_struct->entry_list[0] = tmp; + local->sh_struct->count_list[0] = count; + if (entry) { + tmp->next = entry->next; + entry->next = NULL; + } + + /* A short (or empty) batch means this is the last one. */ + if ((count < UNIFY_SELF_HEAL_GETDENTS_COUNT) || !entry) { + final = 1; + } + + LOCK (&frame->lock); + { + /* local->call_count will be '0' till now. make it 1 so, it + can be UNWIND'ed for the last call. */ + local->call_count = priv->child_count; + if (final) + local->flags = 1; + } + UNLOCK (&frame->lock); + + for (index = 0; index < priv->child_count; index++) + { + STACK_WIND_COOKIE (frame, + unify_sh_setdents_cbk, + (void *)index, + priv->xl_array[index], + priv->xl_array[index]->fops->setdents, + local->fd, GF_SET_DIR_ONLY, + local->sh_struct->entry_list[0], count); + } + + return 0; +} + +int32_t +unify_sh_ns_setdents_cbk (call_frame_t *frame, + void *cookie, + xlator_t *this, + int32_t op_ret, + int32_t op_errno) +{ + int32_t callcnt = -1; + unify_local_t *local = frame->local; + unify_private_t *priv = this->private; + long index = (long)cookie; + dir_entry_t *prev, *entry, *trav; + + LOCK (&frame->lock); + { + if (local->sh_struct->entry_list[index]) { + prev = entry = local->sh_struct->entry_list[index]; + trav = entry->next; + while (trav) { + prev->next = trav->next; + FREE (trav->name); + if (S_ISLNK (trav->buf.st_mode)) + FREE (trav->link); + FREE (trav); + trav = prev->next; + } + FREE (entry); + } + } + UNLOCK (&frame->lock); + + if (local->sh_struct->count_list[index] < + UNIFY_SELF_HEAL_GETDENTS_COUNT) { + LOCK 
(&frame->lock); + { + callcnt = --local->call_count; + } + UNLOCK (&frame->lock); + } else { + /* count == size, that means, there are more entries + to read from */ + local->sh_struct->offset_list[index] += + UNIFY_SELF_HEAL_GETDENTS_COUNT; + STACK_WIND_COOKIE (frame, + unify_sh_getdents_cbk, + cookie, + priv->xl_array[index], + priv->xl_array[index]->fops->getdents, + local->fd, + UNIFY_SELF_HEAL_GETDENTS_COUNT, + local->sh_struct->offset_list[index], + GF_GET_ALL); + + gf_log (this->name, GF_LOG_DEBUG, + "readdir on (%s) with offset %"PRId64"", + priv->xl_array[index]->name, + local->sh_struct->offset_list[index]); + } + + if (!callcnt) { + /* All storage nodes have done unified setdents on NS node. + * Now, do getdents from NS and do setdents on storage nodes. + */ + + /* sh_struct->offset_list is no longer required for + storage nodes now */ + local->sh_struct->offset_list[0] = 0; /* reset */ + + STACK_WIND (frame, + unify_sh_ns_getdents_cbk, + NS(this), + NS(this)->fops->getdents, + local->fd, + UNIFY_SELF_HEAL_GETDENTS_COUNT, + 0, /* In this call, do send '0' as offset */ + GF_GET_DIR_ONLY); + } + + return 0; +} + + +/** + * unify_sh_getdents_cbk - + */ +int32_t +unify_sh_getdents_cbk (call_frame_t *frame, + void *cookie, + xlator_t *this, + int32_t op_ret, + int32_t op_errno, + dir_entry_t *entry, + int32_t count) +{ + int32_t callcnt = -1; + unify_local_t *local = frame->local; + unify_private_t *priv = this->private; + long index = (long)cookie; + dir_entry_t *tmp = NULL; + + if (op_ret >= 0 && count > 0) { + /* There is some dentry found, just send the dentry to NS */ + tmp = CALLOC (1, sizeof (dir_entry_t)); + local->sh_struct->entry_list[index] = tmp; + local->sh_struct->count_list[index] = count; + if (entry) { + tmp->next = entry->next; + entry->next = NULL; + } + STACK_WIND_COOKIE (frame, + unify_sh_ns_setdents_cbk, + cookie, + NS(this), + NS(this)->fops->setdents, + local->fd, + GF_SET_IF_NOT_PRESENT, + local->sh_struct->entry_list[index], + 
count); + return 0; + } + + if (count < UNIFY_SELF_HEAL_GETDENTS_COUNT) { + LOCK (&frame->lock); + { + callcnt = --local->call_count; + } + UNLOCK (&frame->lock); + } else { + /* count == size, that means, there are more entries + to read from */ + local->sh_struct->offset_list[index] += + UNIFY_SELF_HEAL_GETDENTS_COUNT; + STACK_WIND_COOKIE (frame, + unify_sh_getdents_cbk, + cookie, + priv->xl_array[index], + priv->xl_array[index]->fops->getdents, + local->fd, + UNIFY_SELF_HEAL_GETDENTS_COUNT, + local->sh_struct->offset_list[index], + GF_GET_ALL); + + gf_log (this->name, GF_LOG_DEBUG, + "readdir on (%s) with offset %"PRId64"", + priv->xl_array[index]->name, + local->sh_struct->offset_list[index]); + } + + if (!callcnt) { + /* All storage nodes have done unified setdents on NS node. + * Now, do getdents from NS and do setdents on storage nodes. + */ + + /* sh_struct->offset_list is no longer required for + storage nodes now */ + local->sh_struct->offset_list[0] = 0; /* reset */ + + STACK_WIND (frame, + unify_sh_ns_getdents_cbk, + NS(this), + NS(this)->fops->getdents, + local->fd, + UNIFY_SELF_HEAL_GETDENTS_COUNT, + 0, /* In this call, do send '0' as offset */ + GF_GET_DIR_ONLY); + } + + return 0; +} + +/** + * unify_sh_opendir_cbk - + * + * @cookie: + */ +int32_t +unify_sh_opendir_cbk (call_frame_t *frame, + void *cookie, + xlator_t *this, + int32_t op_ret, + int32_t op_errno, + fd_t *fd) +{ + int32_t callcnt = 0; + unify_local_t *local = frame->local; + unify_private_t *priv = this->private; + int16_t index = 0; + inode_t *inode = NULL; + dict_t *tmp_dict = NULL; + + LOCK (&frame->lock); + { + callcnt = --local->call_count; + + if (op_ret >= 0) { + local->op_ret = op_ret; + } else { + gf_log (this->name, GF_LOG_WARNING, "failed"); + local->failed = 1; + } + } + UNLOCK (&frame->lock); + + if (!callcnt) { + local->call_count = priv->child_count + 1; + + if (!local->failed) { + /* send getdents() namespace after finishing + storage nodes */ + local->call_count--; + + 
fd_bind (fd); + + if (local->call_count) { + /* Used as the offset index. This list keeps + * track of offset sent to each node during + * STACK_WIND. + */ + local->sh_struct->offset_list = + calloc (priv->child_count, + sizeof (off_t)); + ERR_ABORT (local->sh_struct->offset_list); + + local->sh_struct->entry_list = + calloc (priv->child_count, + sizeof (dir_entry_t *)); + ERR_ABORT (local->sh_struct->entry_list); + + local->sh_struct->count_list = + calloc (priv->child_count, + sizeof (int)); + ERR_ABORT (local->sh_struct->count_list); + + /* Send getdents on all the fds */ + for (index = 0; + index < priv->child_count; index++) { + STACK_WIND_COOKIE (frame, + unify_sh_getdents_cbk, + (void *)(long)index, + priv->xl_array[index], + priv->xl_array[index]->fops->getdents, + local->fd, + UNIFY_SELF_HEAL_GETDENTS_COUNT, + 0, /* In this call, do send '0' as offset */ + GF_GET_ALL); + } + + /* did stack wind, so no need to unwind here */ + return 0; + } /* (local->call_count) */ + } /* (!local->failed) */ + + /* Opendir failed on one node. */ + inode = local->loc1.inode; + fd_unref (local->fd); + tmp_dict = local->dict; + + unify_local_wipe (local); + /* Only 'self-heal' failed, lookup() was successful. */ + local->op_ret = 0; + + /* This is lookup_cbk ()'s UNWIND. */ + STACK_UNWIND (frame, local->op_ret, local->op_errno, inode, + &local->stbuf, local->dict); + if (tmp_dict) + dict_unref (tmp_dict); + } + + return 0; +} + +/** + * gf_sh_checksum_cbk - + * + * @frame: frame used in lookup. get a copy of it, and use that copy. + * @this: pointer to unify xlator. + * @inode: pointer to inode, for which the consistency check is required. 
+ * + */ +int32_t +unify_sh_checksum_cbk (call_frame_t *frame, + void *cookie, + xlator_t *this, + int32_t op_ret, + int32_t op_errno, + uint8_t *file_checksum, + uint8_t *dir_checksum) +{ + unify_local_t *local = frame->local; + unify_private_t *priv = this->private; + int16_t index = 0; + int32_t callcnt = 0; + inode_t *inode = NULL; + dict_t *tmp_dict = NULL; + + LOCK (&frame->lock); + { + callcnt = --local->call_count; + if (op_ret >= 0) { + if (NS(this) == (xlator_t *)cookie) { + memcpy (local->sh_struct->ns_file_checksum, + file_checksum, ZR_FILENAME_MAX); + memcpy (local->sh_struct->ns_dir_checksum, + dir_checksum, ZR_FILENAME_MAX); + } else { + if (local->entry_count == 0) { + /* Initialize the dir_checksum to be + * used for comparision with other + * storage nodes. Should be done for + * the first successful call *only*. + */ + /* Using 'entry_count' as a flag */ + local->entry_count = 1; + memcpy (local->sh_struct->dir_checksum, + dir_checksum, ZR_FILENAME_MAX); + } + + /* Reply from the storage nodes */ + for (index = 0; + index < ZR_FILENAME_MAX; index++) { + /* Files should be present in + only one node */ + local->sh_struct->file_checksum[index] ^= file_checksum[index]; + + /* directory structure should be + same accross */ + if (local->sh_struct->dir_checksum[index] != dir_checksum[index]) + local->failed = 1; + } + } + } + } + UNLOCK (&frame->lock); + + if (!callcnt) { + for (index = 0; index < ZR_FILENAME_MAX ; index++) { + if (local->sh_struct->file_checksum[index] != + local->sh_struct->ns_file_checksum[index]) { + local->failed = 1; + break; + } + if (local->sh_struct->dir_checksum[index] != + local->sh_struct->ns_dir_checksum[index]) { + local->failed = 1; + break; + } + } + + if (local->failed) { + /* Log it, it should be a rare event */ + gf_log (this->name, GF_LOG_WARNING, + "Self-heal triggered on directory %s", + local->loc1.path); + + /* Any self heal will be done at directory level */ + local->call_count = 0; + local->op_ret = -1; + 
local->failed = 0; + + local->fd = fd_create (local->loc1.inode, + frame->root->pid); + + local->call_count = priv->child_count + 1; + + for (index = 0; + index < (priv->child_count + 1); index++) { + STACK_WIND_COOKIE (frame, + unify_sh_opendir_cbk, + priv->xl_array[index]->name, + priv->xl_array[index], + priv->xl_array[index]->fops->opendir, + &local->loc1, + local->fd); + } + /* opendir can be done on the directory */ + return 0; + } + + /* no mismatch */ + inode = local->loc1.inode; + tmp_dict = local->dict; + + unify_local_wipe (local); + + /* This is lookup_cbk ()'s UNWIND. */ + STACK_UNWIND (frame, + local->op_ret, + local->op_errno, + inode, + &local->stbuf, + local->dict); + if (tmp_dict) + dict_unref (tmp_dict); + } + + return 0; +} + +/* Foreground self-heal part over */ + +/* Background self-heal part */ + +int32_t +unify_bgsh_setdents_cbk (call_frame_t *frame, + void *cookie, + xlator_t *this, + int32_t op_ret, + int32_t op_errno) +{ + int32_t callcnt = -1; + unify_local_t *local = frame->local; + dir_entry_t *prev, *entry, *trav; + + LOCK (&frame->lock); + { + /* if local->call_count == 0, that means, setdents + on storagenodes is still pending. 
*/ + if (local->call_count) + callcnt = --local->call_count; + } + UNLOCK (&frame->lock); + + + if (callcnt == 0) { + if (local->sh_struct->entry_list[0]) { + prev = entry = local->sh_struct->entry_list[0]; + trav = entry->next; + while (trav) { + prev->next = trav->next; + FREE (trav->name); + if (S_ISLNK (trav->buf.st_mode)) + FREE (trav->link); + FREE (trav); + trav = prev->next; + } + FREE (entry); + } + + if (!local->flags) { + if (local->sh_struct->count_list[0] >= + UNIFY_SELF_HEAL_GETDENTS_COUNT) { + /* count == size, that means, there are more + entries to read from */ + //local->call_count = 0; + local->sh_struct->offset_list[0] += + UNIFY_SELF_HEAL_GETDENTS_COUNT; + STACK_WIND (frame, + unify_bgsh_ns_getdents_cbk, + NS(this), + NS(this)->fops->getdents, + local->fd, + UNIFY_SELF_HEAL_GETDENTS_COUNT, + local->sh_struct->offset_list[0], + GF_GET_DIR_ONLY); + } + } else { + fd_unref (local->fd); + unify_local_wipe (local); + STACK_DESTROY (frame->root); + } + } + + return 0; +} + + +int32_t +unify_bgsh_ns_getdents_cbk (call_frame_t *frame, + void *cookie, + xlator_t *this, + int32_t op_ret, + int32_t op_errno, + dir_entry_t *entry, + int32_t count) +{ + unify_local_t *local = frame->local; + unify_private_t *priv = this->private; + long index = 0; + unsigned long final = 0; + dir_entry_t *tmp = CALLOC (1, sizeof (dir_entry_t)); + + local->sh_struct->entry_list[0] = tmp; + local->sh_struct->count_list[0] = count; + if (entry) { + tmp->next = entry->next; + entry->next = NULL; + } + + if ((count < UNIFY_SELF_HEAL_GETDENTS_COUNT) || !entry) { + final = 1; + } + + LOCK (&frame->lock); + { + /* local->call_count will be '0' till now. make it 1 so, + it can be UNWIND'ed for the last call. 
*/ + local->call_count = priv->child_count; + if (final) + local->flags = 1; + } + UNLOCK (&frame->lock); + + for (index = 0; index < priv->child_count; index++) + { + STACK_WIND_COOKIE (frame, + unify_bgsh_setdents_cbk, + (void *)index, + priv->xl_array[index], + priv->xl_array[index]->fops->setdents, + local->fd, GF_SET_DIR_ONLY, + local->sh_struct->entry_list[0], count); + } + + return 0; +} + +int32_t +unify_bgsh_ns_setdents_cbk (call_frame_t *frame, + void *cookie, + xlator_t *this, + int32_t op_ret, + int32_t op_errno) +{ + int32_t callcnt = -1; + unify_local_t *local = frame->local; + unify_private_t *priv = this->private; + long index = (long)cookie; + dir_entry_t *prev, *entry, *trav; + + if (local->sh_struct->entry_list[index]) { + prev = entry = local->sh_struct->entry_list[index]; + if (!entry) + return 0; + trav = entry->next; + while (trav) { + prev->next = trav->next; + FREE (trav->name); + if (S_ISLNK (trav->buf.st_mode)) + FREE (trav->link); + FREE (trav); + trav = prev->next; + } + FREE (entry); + } + + if (local->sh_struct->count_list[index] < + UNIFY_SELF_HEAL_GETDENTS_COUNT) { + LOCK (&frame->lock); + { + callcnt = --local->call_count; + } + UNLOCK (&frame->lock); + } else { + /* count == size, that means, there are more entries + to read from */ + local->sh_struct->offset_list[index] += + UNIFY_SELF_HEAL_GETDENTS_COUNT; + STACK_WIND_COOKIE (frame, + unify_bgsh_getdents_cbk, + cookie, + priv->xl_array[index], + priv->xl_array[index]->fops->getdents, + local->fd, + UNIFY_SELF_HEAL_GETDENTS_COUNT, + local->sh_struct->offset_list[index], + GF_GET_ALL); + + gf_log (this->name, GF_LOG_DEBUG, + "readdir on (%s) with offset %"PRId64"", + priv->xl_array[index]->name, + local->sh_struct->offset_list[index]); + } + + if (!callcnt) { + /* All storage nodes have done unified setdents on NS node. + * Now, do getdents from NS and do setdents on storage nodes. 
+ */ + + /* sh_struct->offset_list is no longer required for + storage nodes now */ + local->sh_struct->offset_list[0] = 0; /* reset */ + + STACK_WIND (frame, + unify_bgsh_ns_getdents_cbk, + NS(this), + NS(this)->fops->getdents, + local->fd, + UNIFY_SELF_HEAL_GETDENTS_COUNT, + 0, /* In this call, do send '0' as offset */ + GF_GET_DIR_ONLY); + } + + return 0; +} + + +/** + * unify_bgsh_getdents_cbk - + */ +int32_t +unify_bgsh_getdents_cbk (call_frame_t *frame, + void *cookie, + xlator_t *this, + int32_t op_ret, + int32_t op_errno, + dir_entry_t *entry, + int32_t count) +{ + int32_t callcnt = -1; + unify_local_t *local = frame->local; + unify_private_t *priv = this->private; + long index = (long)cookie; + dir_entry_t *tmp = NULL; + + if (op_ret >= 0 && count > 0) { + /* There is some dentry found, just send the dentry to NS */ + tmp = CALLOC (1, sizeof (dir_entry_t)); + local->sh_struct->entry_list[index] = tmp; + local->sh_struct->count_list[index] = count; + if (entry) { + tmp->next = entry->next; + entry->next = NULL; + } + STACK_WIND_COOKIE (frame, + unify_bgsh_ns_setdents_cbk, + cookie, + NS(this), + NS(this)->fops->setdents, + local->fd, + GF_SET_IF_NOT_PRESENT, + local->sh_struct->entry_list[index], + count); + return 0; + } + + if (count < UNIFY_SELF_HEAL_GETDENTS_COUNT) { + LOCK (&frame->lock); + { + callcnt = --local->call_count; + } + UNLOCK (&frame->lock); + } else { + /* count == size, that means, there are more entries to read from */ + local->sh_struct->offset_list[index] += + UNIFY_SELF_HEAL_GETDENTS_COUNT; + + STACK_WIND_COOKIE (frame, + unify_bgsh_getdents_cbk, + cookie, + priv->xl_array[index], + priv->xl_array[index]->fops->getdents, + local->fd, + UNIFY_SELF_HEAL_GETDENTS_COUNT, + local->sh_struct->offset_list[index], + GF_GET_ALL); + + gf_log (this->name, GF_LOG_DEBUG, + "readdir on (%s) with offset %"PRId64"", + priv->xl_array[index]->name, + local->sh_struct->offset_list[index]); + } + + if (!callcnt) { + /* All storage nodes have done unified 
setdents on NS node. + * Now, do getdents from NS and do setdents on storage nodes. + */ + + /* sh_struct->offset_list is no longer required for + storage nodes now */ + local->sh_struct->offset_list[0] = 0; /* reset */ + + STACK_WIND (frame, + unify_bgsh_ns_getdents_cbk, + NS(this), + NS(this)->fops->getdents, + local->fd, + UNIFY_SELF_HEAL_GETDENTS_COUNT, + 0, /* In this call, do send '0' as offset */ + GF_GET_DIR_ONLY); + } + + return 0; +} + +/** + * unify_bgsh_opendir_cbk - + * + * @cookie: + */ +int32_t +unify_bgsh_opendir_cbk (call_frame_t *frame, + void *cookie, + xlator_t *this, + int32_t op_ret, + int32_t op_errno, + fd_t *fd) +{ + unify_local_t *local = frame->local; + unify_private_t *priv = this->private; + int32_t callcnt = 0; + int16_t index = 0; + + LOCK (&frame->lock); + { + callcnt = --local->call_count; + + if (op_ret >= 0) { + local->op_ret = op_ret; + } else { + local->failed = 1; + } + } + UNLOCK (&frame->lock); + + if (!callcnt) { + local->call_count = priv->child_count + 1; + + if (!local->failed) { + /* send getdents() namespace after finishing + storage nodes */ + local->call_count--; + callcnt = local->call_count; + + fd_bind (fd); + + if (local->call_count) { + /* Used as the offset index. This list keeps + track of offset sent to each node during + STACK_WIND. 
*/ + local->sh_struct->offset_list = + calloc (priv->child_count, + sizeof (off_t)); + ERR_ABORT (local->sh_struct->offset_list); + + local->sh_struct->entry_list = + calloc (priv->child_count, + sizeof (dir_entry_t *)); + ERR_ABORT (local->sh_struct->entry_list); + + local->sh_struct->count_list = + calloc (priv->child_count, + sizeof (int)); + ERR_ABORT (local->sh_struct->count_list); + + /* Send getdents on all the fds */ + for (index = 0; + index < priv->child_count; index++) { + STACK_WIND_COOKIE (frame, + unify_bgsh_getdents_cbk, + (void *)(long)index, + priv->xl_array[index], + priv->xl_array[index]->fops->getdents, + local->fd, + UNIFY_SELF_HEAL_GETDENTS_COUNT, + 0, /* In this call, do send '0' as offset */ + GF_GET_ALL); + } + /* did a stack wind, so no need to unwind here */ + return 0; + } /* (local->call_count) */ + } /* (!local->failed) */ + + /* Opendir failed on one node. */ + fd_unref (local->fd); + + unify_local_wipe (local); + STACK_DESTROY (frame->root); + } + + return 0; +} + +/** + * gf_bgsh_checksum_cbk - + * + * @frame: frame used in lookup. get a copy of it, and use that copy. + * @this: pointer to unify xlator. + * @inode: pointer to inode, for which the consistency check is required. + * + */ +int32_t +unify_bgsh_checksum_cbk (call_frame_t *frame, + void *cookie, + xlator_t *this, + int32_t op_ret, + int32_t op_errno, + uint8_t *file_checksum, + uint8_t *dir_checksum) +{ + unify_local_t *local = frame->local; + unify_private_t *priv = this->private; + int16_t index = 0; + int32_t callcnt = 0; + + LOCK (&frame->lock); + { + callcnt = --local->call_count; + if (op_ret >= 0) { + if (NS(this) == (xlator_t *)cookie) { + memcpy (local->sh_struct->ns_file_checksum, + file_checksum, ZR_FILENAME_MAX); + memcpy (local->sh_struct->ns_dir_checksum, + dir_checksum, ZR_FILENAME_MAX); + } else { + if (local->entry_count == 0) { + /* Initialize the dir_checksum to be + * used for comparision with other + * storage nodes. 
Should be done for + * the first successful call *only*. + */ + /* Using 'entry_count' as a flag */ + local->entry_count = 1; + memcpy (local->sh_struct->dir_checksum, + dir_checksum, ZR_FILENAME_MAX); + } + + /* Reply from the storage nodes */ + for (index = 0; + index < ZR_FILENAME_MAX; index++) { + /* Files should be present in only + one node */ + local->sh_struct->file_checksum[index] ^= file_checksum[index]; + + /* directory structure should be same + accross */ + if (local->sh_struct->dir_checksum[index] != dir_checksum[index]) + local->failed = 1; + } + } + } + } + UNLOCK (&frame->lock); + + if (!callcnt) { + for (index = 0; index < ZR_FILENAME_MAX ; index++) { + if (local->sh_struct->file_checksum[index] != + local->sh_struct->ns_file_checksum[index]) { + local->failed = 1; + break; + } + if (local->sh_struct->dir_checksum[index] != + local->sh_struct->ns_dir_checksum[index]) { + local->failed = 1; + break; + } + } + + if (local->failed) { + /* Log it, it should be a rare event */ + gf_log (this->name, GF_LOG_WARNING, + "Self-heal triggered on directory %s", + local->loc1.path); + + /* Any self heal will be done at the directory level */ + local->op_ret = -1; + local->failed = 0; + + local->fd = fd_create (local->loc1.inode, + frame->root->pid); + local->call_count = priv->child_count + 1; + + for (index = 0; + index < (priv->child_count + 1); index++) { + STACK_WIND_COOKIE (frame, + unify_bgsh_opendir_cbk, + priv->xl_array[index]->name, + priv->xl_array[index], + priv->xl_array[index]->fops->opendir, + &local->loc1, + local->fd); + } + + /* opendir can be done on the directory */ + return 0; + } + + /* no mismatch */ + unify_local_wipe (local); + STACK_DESTROY (frame->root); + } + + return 0; +} + +/* Background self-heal part over */ + + + + +/** + * zr_unify_self_heal - + * + * @frame: frame used in lookup. get a copy of it, and use that copy. + * @this: pointer to unify xlator. + * @inode: pointer to inode, for which the consistency check is required. 
+ * + */ +int32_t +zr_unify_self_heal (call_frame_t *frame, + xlator_t *this, + unify_local_t *local) +{ + unify_private_t *priv = this->private; + call_frame_t *bg_frame = NULL; + unify_local_t *bg_local = NULL; + inode_t *tmp_inode = NULL; + dict_t *tmp_dict = NULL; + int16_t index = 0; + + if (local->inode_generation < priv->inode_generation) { + /* Any self heal will be done at the directory level */ + /* Update the inode's generation to the current generation + value. */ + local->inode_generation = priv->inode_generation; + inode_ctx_put (local->loc1.inode, this, + (uint64_t)(long)local->inode_generation); + + if (priv->self_heal == ZR_UNIFY_FG_SELF_HEAL) { + local->op_ret = 0; + local->failed = 0; + local->call_count = priv->child_count + 1; + local->sh_struct = + calloc (1, sizeof (struct unify_self_heal_struct)); + + /* +1 is for NS */ + for (index = 0; + index < (priv->child_count + 1); index++) { + STACK_WIND_COOKIE (frame, + unify_sh_checksum_cbk, + priv->xl_array[index], + priv->xl_array[index], + priv->xl_array[index]->fops->checksum, + &local->loc1, + 0); + } + + /* Self-heal in foreground, hence no need + to UNWIND here */ + return 0; + } + + /* Self Heal done in background */ + bg_frame = copy_frame (frame); + INIT_LOCAL (bg_frame, bg_local); + loc_copy (&bg_local->loc1, &local->loc1); + bg_local->op_ret = 0; + bg_local->failed = 0; + bg_local->call_count = priv->child_count + 1; + bg_local->sh_struct = + calloc (1, sizeof (struct unify_self_heal_struct)); + + /* +1 is for NS */ + for (index = 0; index < (priv->child_count + 1); index++) { + STACK_WIND_COOKIE (bg_frame, + unify_bgsh_checksum_cbk, + priv->xl_array[index], + priv->xl_array[index], + priv->xl_array[index]->fops->checksum, + &bg_local->loc1, + 0); + } + } + + /* generation number matches, self heal already done or + * self heal done in background: just do STACK_UNWIND + */ + tmp_inode = local->loc1.inode; + tmp_dict = local->dict; + + unify_local_wipe (local); + + /* This is lookup_cbk 
()'s UNWIND. */ + STACK_UNWIND (frame, + local->op_ret, + local->op_errno, + tmp_inode, + &local->stbuf, + local->dict); + + if (tmp_dict) + dict_unref (tmp_dict); + + return 0; +} + diff --git a/xlators/cluster/unify/src/unify.c b/xlators/cluster/unify/src/unify.c new file mode 100644 index 000000000..e2a5e14b1 --- /dev/null +++ b/xlators/cluster/unify/src/unify.c @@ -0,0 +1,4451 @@ +/* + Copyright (c) 2006, 2007, 2008, 2009 Z RESEARCH, Inc. <http://www.zresearch.com> + This file is part of GlusterFS. + + GlusterFS is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published + by the Free Software Foundation; either version 3 of the License, + or (at your option) any later version. + + GlusterFS is distributed in the hope that it will be useful, but + WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program. If not, see + <http://www.gnu.org/licenses/>. +*/ + +/** + * xlators/cluster/unify: + * - This xlator is one of the main translator in GlusterFS, which + * actually does the clustering work of the file system. One need to + * understand that, unify assumes file to be existing in only one of + * the child node, and directories to be present on all the nodes. + * + * NOTE: + * Now, unify has support for global namespace, which is used to keep a + * global view of fs's namespace tree. The stat for directories are taken + * just from the namespace, where as for files, just 'st_ino' is taken from + * Namespace node, and other stat info is taken from the actual storage node. + * Also Namespace node helps to keep consistant inode for files across + * glusterfs (re-)mounts. 
+ */ + +#ifndef _CONFIG_H +#define _CONFIG_H +#include "config.h" +#endif + +#include "glusterfs.h" +#include "unify.h" +#include "dict.h" +#include "xlator.h" +#include "hashfn.h" +#include "logging.h" +#include "stack.h" +#include "defaults.h" +#include "common-utils.h" +#include <signal.h> +#include <libgen.h> +#include "compat-errno.h" +#include "compat.h" + +#define UNIFY_CHECK_INODE_CTX_AND_UNWIND_ON_ERR(_loc) do { \ + if (!(_loc && _loc->inode)) { \ + STACK_UNWIND (frame, -1, EINVAL, NULL, NULL, NULL); \ + return 0; \ + } \ +} while(0) + + +#define UNIFY_CHECK_FD_CTX_AND_UNWIND_ON_ERR(_fd) do { \ + if (!(_fd && !fd_ctx_get (_fd, this, NULL))) { \ + STACK_UNWIND (frame, -1, EBADFD, NULL, NULL); \ + return 0; \ + } \ +} while(0) + +#define UNIFY_CHECK_FD_AND_UNWIND_ON_ERR(_fd) do { \ + if (!_fd) { \ + STACK_UNWIND (frame, -1, EBADFD, NULL, NULL); \ + return 0; \ + } \ +} while(0) + +/** + * unify_local_wipe - free all the extra allocation of local->* here. + */ +static void +unify_local_wipe (unify_local_t *local) +{ + /* Free the strdup'd variables in the local structure */ + if (local->name) { + FREE (local->name); + } + loc_wipe (&local->loc1); + loc_wipe (&local->loc2); +} + + + +/* + * unify_normalize_stats - + */ +void +unify_normalize_stats (struct statvfs *buf, + unsigned long bsize, + unsigned long frsize) +{ + double factor; + + if (buf->f_bsize != bsize) { + factor = ((double) buf->f_bsize) / bsize; + buf->f_bsize = bsize; + buf->f_bfree = (fsblkcnt_t) (factor * buf->f_bfree); + buf->f_bavail = (fsblkcnt_t) (factor * buf->f_bavail); + } + + if (buf->f_frsize != frsize) { + factor = ((double) buf->f_frsize) / frsize; + buf->f_frsize = frsize; + buf->f_blocks = (fsblkcnt_t) (factor * buf->f_blocks); + } +} + + +xlator_t * +unify_loc_subvol (loc_t *loc, xlator_t *this) +{ + unify_private_t *priv = NULL; + xlator_t *subvol = NULL; + int16_t *list = NULL; + long index = 0; + xlator_t *subvol_i = NULL; + int ret = 0; + uint64_t tmp_list = 0; + + priv = 
this->private; + subvol = NS (this); + + if (!S_ISDIR (loc->inode->st_mode)) { + ret = inode_ctx_get (loc->inode, this, &tmp_list); + list = (int16_t *)(long)tmp_list; + if (!list) + goto out; + + for (index = 0; list[index] != -1; index++) { + subvol_i = priv->xl_array[list[index]]; + if (subvol_i != NS (this)) { + subvol = subvol_i; + break; + } + } + } +out: + return subvol; +} + + + +/** + * unify_statfs_cbk - + */ +int32_t +unify_statfs_cbk (call_frame_t *frame, + void *cookie, + xlator_t *this, + int32_t op_ret, + int32_t op_errno, + struct statvfs *stbuf) +{ + int32_t callcnt = 0; + struct statvfs *dict_buf = NULL; + unsigned long bsize; + unsigned long frsize; + unify_local_t *local = (unify_local_t *)frame->local; + call_frame_t *prev_frame = cookie; + + LOCK (&frame->lock); + { + if (op_ret >= 0) { + /* when a call is successfull, add it to local->dict */ + dict_buf = &local->statvfs_buf; + + if (dict_buf->f_bsize != 0) { + bsize = max (dict_buf->f_bsize, + stbuf->f_bsize); + + frsize = max (dict_buf->f_frsize, + stbuf->f_frsize); + unify_normalize_stats(dict_buf, bsize, frsize); + unify_normalize_stats(stbuf, bsize, frsize); + } else { + dict_buf->f_bsize = stbuf->f_bsize; + dict_buf->f_frsize = stbuf->f_frsize; + } + + dict_buf->f_blocks += stbuf->f_blocks; + dict_buf->f_bfree += stbuf->f_bfree; + dict_buf->f_bavail += stbuf->f_bavail; + dict_buf->f_files += stbuf->f_files; + dict_buf->f_ffree += stbuf->f_ffree; + dict_buf->f_favail += stbuf->f_favail; + dict_buf->f_fsid = stbuf->f_fsid; + dict_buf->f_flag = stbuf->f_flag; + dict_buf->f_namemax = stbuf->f_namemax; + local->op_ret = op_ret; + } else { + /* fop on storage node has failed due to some error */ + if (op_errno != ENOTCONN) { + gf_log (this->name, GF_LOG_ERROR, + "child(%s): %s", + prev_frame->this->name, + strerror (op_errno)); + } + local->op_errno = op_errno; + } + callcnt = --local->call_count; + } + UNLOCK (&frame->lock); + + if (!callcnt) { + STACK_UNWIND (frame, local->op_ret, 
local->op_errno, + &local->statvfs_buf); + } + + return 0; +} + +/** + * unify_statfs - + */ +int32_t +unify_statfs (call_frame_t *frame, + xlator_t *this, + loc_t *loc) +{ + unify_local_t *local = NULL; + xlator_list_t *trav = this->children; + + INIT_LOCAL (frame, local); + local->call_count = ((unify_private_t *)this->private)->child_count; + + while(trav) { + STACK_WIND (frame, + unify_statfs_cbk, + trav->xlator, + trav->xlator->fops->statfs, + loc); + trav = trav->next; + } + + return 0; +} + +/** + * unify_buf_cbk - + */ +int32_t +unify_buf_cbk (call_frame_t *frame, + void *cookie, + xlator_t *this, + int32_t op_ret, + int32_t op_errno, + struct stat *buf) +{ + int32_t callcnt = 0; + unify_private_t *priv = this->private; + unify_local_t *local = frame->local; + call_frame_t *prev_frame = cookie; + + LOCK (&frame->lock); + { + callcnt = --local->call_count; + + if (op_ret == -1) { + gf_log (this->name, GF_LOG_ERROR, + "%s(): child(%s): path(%s): %s", + gf_fop_list[frame->root->op], + prev_frame->this->name, + (local->loc1.path)?local->loc1.path:"", + strerror (op_errno)); + + local->op_errno = op_errno; + if ((op_errno == ENOENT) && priv->optimist) + local->op_ret = 0; + } + + if (op_ret >= 0) { + local->op_ret = 0; + + if (NS (this) == prev_frame->this) { + local->st_ino = buf->st_ino; + /* If the entry is directory, get the stat + from NS node */ + if (S_ISDIR (buf->st_mode) || + !local->stbuf.st_blksize) { + local->stbuf = *buf; + } + } + + if ((!S_ISDIR (buf->st_mode)) && + (NS (this) != prev_frame->this)) { + /* If file, take the stat info from Storage + node. 
*/ + local->stbuf = *buf; + } + } + } + UNLOCK (&frame->lock); + + if (!callcnt) { + /* If the inode number is not filled, operation should + fail */ + if (!local->st_ino) + local->op_ret = -1; + + local->stbuf.st_ino = local->st_ino; + unify_local_wipe (local); + STACK_UNWIND (frame, local->op_ret, local->op_errno, + &local->stbuf); + } + + return 0; +} + +#define check_if_dht_linkfile(s) ((s->st_mode & ~S_IFMT) == S_ISVTX) + +/** + * unify_lookup_cbk - + */ +int32_t +unify_lookup_cbk (call_frame_t *frame, + void *cookie, + xlator_t *this, + int32_t op_ret, + int32_t op_errno, + inode_t *inode, + struct stat *buf, + dict_t *dict) +{ + int32_t callcnt = 0; + unify_private_t *priv = this->private; + unify_local_t *local = frame->local; + inode_t *tmp_inode = NULL; + dict_t *local_dict = NULL; + + LOCK (&frame->lock); + { + callcnt = --local->call_count; + + if (op_ret == -1) { + if ((op_errno != ENOTCONN) && (op_errno != ENOENT)) { + gf_log (this->name, GF_LOG_ERROR, + "child(%s): path(%s): %s", + priv->xl_array[(long)cookie]->name, + local->loc1.path, strerror (op_errno)); + local->op_errno = op_errno; + local->failed = 1; + + } else if (local->revalidate && + !(priv->optimist && (op_errno == ENOENT))) { + + gf_log (this->name, + (op_errno == ENOTCONN) ? 
+ GF_LOG_DEBUG:GF_LOG_ERROR, + "child(%s): path(%s): %s", + priv->xl_array[(long)cookie]->name, + local->loc1.path, strerror (op_errno)); + local->op_errno = op_errno; + local->failed = 1; + } + } + + if (op_ret == 0) { + local->op_ret = 0; + + if (check_if_dht_linkfile(buf)) { + gf_log (this->name, GF_LOG_CRITICAL, + "file %s may be DHT link file on %s, " + "make sure the backend is not shared " + "between unify and DHT", + local->loc1.path, + priv->xl_array[(long)cookie]->name); + } + + if (local->stbuf.st_mode && local->stbuf.st_blksize) { + /* make sure we already have a stbuf + stored in local->stbuf */ + if (S_ISDIR (local->stbuf.st_mode) && + !S_ISDIR (buf->st_mode)) { + gf_log (this->name, GF_LOG_CRITICAL, + "[CRITICAL] '%s' is directory " + "on namespace, non-directory " + "on node '%s', returning EIO", + local->loc1.path, + priv->xl_array[(long)cookie]->name); + local->return_eio = 1; + } + if (!S_ISDIR (local->stbuf.st_mode) && + S_ISDIR (buf->st_mode)) { + gf_log (this->name, GF_LOG_CRITICAL, + "[CRITICAL] '%s' is directory " + "on node '%s', non-directory " + "on namespace, returning EIO", + local->loc1.path, + priv->xl_array[(long)cookie]->name); + local->return_eio = 1; + } + } + + if (!local->revalidate && !S_ISDIR (buf->st_mode)) { + /* This is the first time lookup on file*/ + if (!local->list) { + /* list is not allocated, allocate + the max possible range */ + local->list = CALLOC (1, 2 * (priv->child_count + 2)); + if (!local->list) { + gf_log (this->name, + GF_LOG_CRITICAL, + "Not enough memory"); + STACK_UNWIND (frame, -1, + ENOMEM, inode, + NULL, NULL); + return 0; + } + } + /* update the index of the list */ + local->list [local->index++] = + (int16_t)(long)cookie; + } + + if ((!local->dict) && dict && + (priv->xl_array[(long)cookie] != NS(this))) { + local->dict = dict_ref (dict); + } + + /* index of NS node is == total child count */ + if (priv->child_count == (int16_t)(long)cookie) { + /* Take the inode number from namespace */ + 
local->st_ino = buf->st_ino; + if (S_ISDIR (buf->st_mode) || + !(local->stbuf.st_blksize)) { + local->stbuf = *buf; + } + } else if (!S_ISDIR (buf->st_mode)) { + /* If file, then get the stat from + storage node */ + local->stbuf = *buf; + } + + if (local->st_nlink < buf->st_nlink) { + local->st_nlink = buf->st_nlink; + } + } + } + UNLOCK (&frame->lock); + + if (!callcnt) { + local_dict = local->dict; + if (local->return_eio) { + gf_log (this->name, GF_LOG_CRITICAL, + "[CRITICAL] Unable to fix the path (%s) with " + "self-heal, try manual verification. " + "returning EIO.", local->loc1.path); + unify_local_wipe (local); + STACK_UNWIND (frame, -1, EIO, inode, NULL, NULL); + if (local_dict) { + dict_unref (local_dict); + } + return 0; + } + + if (!local->stbuf.st_blksize) { + /* Inode not present */ + local->op_ret = -1; + } else { + if (!local->revalidate && + !S_ISDIR (local->stbuf.st_mode)) { + /* If its a file, big array is useless, + allocate the smaller one */ + int16_t *list = NULL; + list = CALLOC (1, 2 * (local->index + 1)); + ERR_ABORT (list); + memcpy (list, local->list, 2 * local->index); + /* Make the end of the list as -1 */ + FREE (local->list); + local->list = list; + local->list [local->index] = -1; + /* Update the inode's ctx with proper array */ + /* TODO: log on failure */ + inode_ctx_put (local->loc1.inode, this, + (uint64_t)(long)local->list); + } + + if (S_ISDIR(local->loc1.inode->st_mode)) { + /* lookup is done for directory */ + if (local->failed && priv->self_heal) { + /* Triggering self-heal */ + /* means, self-heal required for this + inode */ + local->inode_generation = 0; + priv->inode_generation++; + } + } else { + local->stbuf.st_ino = local->st_ino; + } + + local->stbuf.st_nlink = local->st_nlink; + } + if (local->op_ret == -1) { + if (!local->revalidate && local->list) + FREE (local->list); + } + + if ((local->op_ret >= 0) && local->failed && + local->revalidate) { + /* Done revalidate, but it failed */ + if (op_errno != ENOTCONN) { 
+ gf_log (this->name, GF_LOG_ERROR, + "Revalidate failed for path(%s): %s", + local->loc1.path, strerror (op_errno)); + } + local->op_ret = -1; + } + + if ((priv->self_heal && !priv->optimist) && + (!local->revalidate && (local->op_ret == 0) && + S_ISDIR(local->stbuf.st_mode))) { + /* Let the self heal be done here */ + zr_unify_self_heal (frame, this, local); + local_dict = NULL; + } else { + /* either no self heal, or op_ret == -1 (failure) */ + tmp_inode = local->loc1.inode; + unify_local_wipe (local); + STACK_UNWIND (frame, local->op_ret, local->op_errno, + tmp_inode, &local->stbuf, local->dict); + } + if (local_dict) { + dict_unref (local_dict); + } + } + + return 0; +} + +/** + * unify_lookup - + */ +int32_t +unify_lookup (call_frame_t *frame, + xlator_t *this, + loc_t *loc, + dict_t *xattr_req) +{ + unify_local_t *local = NULL; + unify_private_t *priv = this->private; + int16_t *list = NULL; + long index = 0; + + if (!(loc && loc->inode)) { + gf_log (this->name, GF_LOG_ERROR, + "%s: Argument not right", loc?loc->path:"(null)"); + STACK_UNWIND (frame, -1, EINVAL, NULL, NULL, NULL); + return 0; + } + + /* Initialization */ + INIT_LOCAL (frame, local); + loc_copy (&local->loc1, loc); + if (local->loc1.path == NULL) { + gf_log (this->name, GF_LOG_CRITICAL, "Not enough memory :O"); + STACK_UNWIND (frame, -1, ENOMEM, loc->inode, NULL, NULL); + return 0; + } + + if (!inode_ctx_get (loc->inode, this, NULL) && + loc->inode->st_mode && + !S_ISDIR (loc->inode->st_mode)) { + uint64_t tmp_list = 0; + /* check if revalidate or fresh lookup */ + inode_ctx_get (loc->inode, this, &tmp_list); + local->list = (int16_t *)(long)tmp_list; + } + + if (local->list) { + list = local->list; + for (index = 0; list[index] != -1; index++); + if (index != 2) { + if (index < 2) { + gf_log (this->name, GF_LOG_ERROR, + "returning ESTALE for %s: file " + "count is %ld", loc->path, index); + /* Print where all the file is present */ + for (index = 0; + local->list[index] != -1; index++) { + 
gf_log (this->name, GF_LOG_ERROR, + "%s: found on %s", loc->path, + priv->xl_array[list[index]]->name); + } + unify_local_wipe (local); + STACK_UNWIND (frame, -1, ESTALE, + NULL, NULL, NULL); + return 0; + } else { + /* There are more than 2 presences */ + /* Just log and continue */ + gf_log (this->name, GF_LOG_ERROR, + "%s: file count is %ld", + loc->path, index); + /* Print where all the file is present */ + for (index = 0; + local->list[index] != -1; index++) { + gf_log (this->name, GF_LOG_ERROR, + "%s: found on %s", loc->path, + priv->xl_array[list[index]]->name); + } + } + } + + /* is revalidate */ + local->revalidate = 1; + + for (index = 0; list[index] != -1; index++) + local->call_count++; + + for (index = 0; list[index] != -1; index++) { + char need_break = (list[index+1] == -1); + STACK_WIND_COOKIE (frame, + unify_lookup_cbk, + (void *)(long)list[index], //cookie + priv->xl_array [list[index]], + priv->xl_array [list[index]]->fops->lookup, + loc, + xattr_req); + if (need_break) + break; + } + } else { + if (loc->inode->st_mode) { + if (inode_ctx_get (loc->inode, this, NULL)) { + inode_ctx_get (loc->inode, this, + &local->inode_generation); + } + } + /* This is first call, there is no list */ + /* call count should be all child + 1 namespace */ + local->call_count = priv->child_count + 1; + + for (index = 0; index <= priv->child_count; index++) { + STACK_WIND_COOKIE (frame, + unify_lookup_cbk, + (void *)index, //cookie + priv->xl_array[index], + priv->xl_array[index]->fops->lookup, + loc, + xattr_req); + } + } + + return 0; +} + +/** + * unify_stat - if directory, get the stat directly from NameSpace child. + * if file, check for a hint and send it only there (also to NS). + * if its a fresh stat, then do it on all the nodes. + * + * NOTE: for all the call, sending cookie as xlator pointer, which will be + * used in cbk. 
+ */ +int32_t +unify_stat (call_frame_t *frame, + xlator_t *this, + loc_t *loc) +{ + unify_local_t *local = NULL; + unify_private_t *priv = this->private; + int16_t index = 0; + int16_t *list = NULL; + uint64_t tmp_list = 0; + + UNIFY_CHECK_INODE_CTX_AND_UNWIND_ON_ERR (loc); + + /* Initialization */ + INIT_LOCAL (frame, local); + loc_copy (&local->loc1, loc); + if (local->loc1.path == NULL) { + gf_log (this->name, GF_LOG_CRITICAL, "Not enough memory :O"); + STACK_UNWIND (frame, -1, ENOMEM, NULL); + return 0; + } + local->st_ino = loc->inode->ino; + if (S_ISDIR (loc->inode->st_mode)) { + /* Directory */ + local->call_count = 1; + STACK_WIND (frame, unify_buf_cbk, NS(this), + NS(this)->fops->stat, loc); + } else { + /* File */ + inode_ctx_get (loc->inode, this, &tmp_list); + list = (int16_t *)(long)tmp_list; + + for (index = 0; list[index] != -1; index++) + local->call_count++; + + for (index = 0; list[index] != -1; index++) { + char need_break = (list[index+1] == -1); + STACK_WIND (frame, + unify_buf_cbk, + priv->xl_array[list[index]], + priv->xl_array[list[index]]->fops->stat, + loc); + if (need_break) + break; + } + } + + return 0; +} + +/** + * unify_access_cbk - + */ +int32_t +unify_access_cbk (call_frame_t *frame, + void *cookie, + xlator_t *this, + int32_t op_ret, + int32_t op_errno) +{ + STACK_UNWIND (frame, op_ret, op_errno); + return 0; +} + + +/** + * unify_access - Send request to only namespace, which has all the + * attributes set for the file. 
+ */ +int32_t +unify_access (call_frame_t *frame, + xlator_t *this, + loc_t *loc, + int32_t mask) +{ + UNIFY_CHECK_INODE_CTX_AND_UNWIND_ON_ERR (loc); + + STACK_WIND (frame, + unify_access_cbk, + NS(this), + NS(this)->fops->access, + loc, + mask); + + return 0; +} + +int32_t +unify_mkdir_cbk (call_frame_t *frame, + void *cookie, + xlator_t *this, + int32_t op_ret, + int32_t op_errno, + inode_t *inode, + struct stat *buf) +{ + int32_t callcnt = 0; + unify_private_t *priv = this->private; + unify_local_t *local = frame->local; + inode_t *tmp_inode = NULL; + + LOCK (&frame->lock); + { + callcnt = --local->call_count; + + if ((op_ret == -1) && !(priv->optimist && + (op_errno == ENOENT || + op_errno == EEXIST))) { + /* TODO: Decrement the inode_generation of + * this->inode's parent inode, hence the missing + * directory is created properly by self-heal. + * Currently, there is no way to get the parent + * inode directly. + */ + gf_log (this->name, GF_LOG_ERROR, + "child(%s): path(%s): %s", + priv->xl_array[(long)cookie]->name, + local->loc1.path, strerror (op_errno)); + if (op_errno != EEXIST) + local->failed = 1; + local->op_errno = op_errno; + } + + if (op_ret >= 0) + local->op_ret = 0; + + } + UNLOCK (&frame->lock); + + if (!callcnt) { + if (!local->failed) { + inode_ctx_put (local->loc1.inode, this, + priv->inode_generation); + } + + tmp_inode = local->loc1.inode; + unify_local_wipe (local); + + STACK_UNWIND (frame, local->op_ret, local->op_errno, + tmp_inode, &local->stbuf); + } + + return 0; +} + +/** + * unify_ns_mkdir_cbk - + */ +int32_t +unify_ns_mkdir_cbk (call_frame_t *frame, + void *cookie, + xlator_t *this, + int32_t op_ret, + int32_t op_errno, + inode_t *inode, + struct stat *buf) +{ + unify_private_t *priv = this->private; + unify_local_t *local = frame->local; + long index = 0; + + if (op_ret == -1) { + /* No need to send mkdir request to other servers, + * as namespace action failed + */ + gf_log (this->name, GF_LOG_ERROR, + "namespace: path(%s): %s", + 
local->name, strerror (op_errno)); + unify_local_wipe (local); + STACK_UNWIND (frame, op_ret, op_errno, inode, NULL); + return 0; + } + + /* Create one inode for this entry */ + local->op_ret = 0; + local->stbuf = *buf; + + local->call_count = priv->child_count; + + /* Send mkdir request to all the nodes now */ + for (index = 0; index < priv->child_count; index++) { + STACK_WIND_COOKIE (frame, + unify_mkdir_cbk, + (void *)index, //cookie + priv->xl_array[index], + priv->xl_array[index]->fops->mkdir, + &local->loc1, + local->mode); + } + + return 0; +} + + +/** + * unify_mkdir - + */ +int32_t +unify_mkdir (call_frame_t *frame, + xlator_t *this, + loc_t *loc, + mode_t mode) +{ + unify_local_t *local = NULL; + + /* Initialization */ + INIT_LOCAL (frame, local); + local->mode = mode; + + loc_copy (&local->loc1, loc); + + if (local->loc1.path == NULL) { + gf_log (this->name, GF_LOG_CRITICAL, "Not enough memory :O"); + STACK_UNWIND (frame, -1, ENOMEM, NULL, NULL); + return 0; + } + + STACK_WIND (frame, + unify_ns_mkdir_cbk, + NS(this), + NS(this)->fops->mkdir, + loc, + mode); + return 0; +} + +/** + * unify_rmdir_cbk - + */ +int32_t +unify_rmdir_cbk (call_frame_t *frame, + void *cookie, + xlator_t *this, + int32_t op_ret, + int32_t op_errno) +{ + int32_t callcnt = 0; + unify_private_t *priv = this->private; + unify_local_t *local = frame->local; + + LOCK (&frame->lock); + { + callcnt = --local->call_count; + if (op_ret == 0 || (priv->optimist && (op_errno == ENOENT))) + local->op_ret = 0; + if (op_ret == -1) + local->op_errno = op_errno; + } + UNLOCK (&frame->lock); + + if (!callcnt) { + unify_local_wipe (local); + STACK_UNWIND (frame, local->op_ret, local->op_errno); + } + + return 0; +} + +/** + * unify_ns_rmdir_cbk - + */ +int32_t +unify_ns_rmdir_cbk (call_frame_t *frame, + void *cookie, + xlator_t *this, + int32_t op_ret, + int32_t op_errno) +{ + int16_t index = 0; + unify_private_t *priv = this->private; + unify_local_t *local = frame->local; + + if (op_ret == -1) { 
+ /* No need to send rmdir request to other servers, + * as namespace action failed + */ + gf_log (this->name, + ((op_errno != ENOTEMPTY) ? + GF_LOG_ERROR : GF_LOG_DEBUG), + "namespace: path(%s): %s", + local->loc1.path, strerror (op_errno)); + unify_local_wipe (local); + STACK_UNWIND (frame, op_ret, op_errno); + return 0; + } + + local->call_count = priv->child_count; + + for (index = 0; index < priv->child_count; index++) { + STACK_WIND (frame, + unify_rmdir_cbk, + priv->xl_array[index], + priv->xl_array[index]->fops->rmdir, + &local->loc1); + } + + return 0; +} + +/** + * unify_rmdir - + */ +int32_t +unify_rmdir (call_frame_t *frame, + xlator_t *this, + loc_t *loc) +{ + unify_local_t *local = NULL; + + UNIFY_CHECK_INODE_CTX_AND_UNWIND_ON_ERR (loc); + + /* Initialization */ + INIT_LOCAL (frame, local); + + loc_copy (&local->loc1, loc); + if (local->loc1.path == NULL) { + gf_log (this->name, GF_LOG_CRITICAL, "Not enough memory :O"); + STACK_UNWIND (frame, -1, ENOMEM); + return 0; + } + + STACK_WIND (frame, + unify_ns_rmdir_cbk, + NS(this), + NS(this)->fops->rmdir, + loc); + + return 0; +} + +/** + * unify_open_cbk - + */ +int32_t +unify_open_cbk (call_frame_t *frame, + void *cookie, + xlator_t *this, + int32_t op_ret, + int32_t op_errno, + fd_t *fd) +{ + int32_t callcnt = 0; + unify_local_t *local = frame->local; + + LOCK (&frame->lock); + { + if (op_ret >= 0) { + local->op_ret = op_ret; + if (NS(this) != (xlator_t *)cookie) { + /* Store child node's ptr, used in + all the f*** / FileIO calls */ + fd_ctx_set (fd, this, (uint64_t)(long)cookie); + } + } + if (op_ret == -1) { + local->op_errno = op_errno; + local->failed = 1; + } + callcnt = --local->call_count; + } + UNLOCK (&frame->lock); + + if (!callcnt) { + if ((local->failed == 1) && (local->op_ret >= 0)) { + local->call_count = 1; + /* return -1 to user */ + local->op_ret = -1; + //local->op_errno = EIO; + + if (!fd_ctx_get (local->fd, this, NULL)) { + gf_log (this->name, GF_LOG_ERROR, + "Open success on child 
node, " + "failed on namespace"); + } else { + gf_log (this->name, GF_LOG_ERROR, + "Open success on namespace, " + "failed on child node"); + } + } + + unify_local_wipe (local); + STACK_UNWIND (frame, local->op_ret, + local->op_errno, local->fd); + } + + return 0; +} + +#ifdef GF_DARWIN_HOST_OS +/** + * unify_create_lookup_cbk - + */ +int32_t +unify_open_lookup_cbk (call_frame_t *frame, + void *cookie, + xlator_t *this, + int32_t op_ret, + int32_t op_errno, + inode_t *inode, + struct stat *buf, + dict_t *dict) +{ + int32_t callcnt = 0; + int16_t index = 0; + unify_private_t *priv = this->private; + unify_local_t *local = frame->local; + + LOCK (&frame->lock); + { + callcnt = --local->call_count; + if ((op_ret == -1) && (op_errno != ENOENT)) { + gf_log (this->name, GF_LOG_ERROR, + "child(%s): path(%s): %s", + priv->xl_array[(long)cookie]->name, + local->loc1.path, strerror (op_errno)); + local->op_errno = op_errno; + } + + if (op_ret >= 0) { + local->op_ret = op_ret; + local->index++; + if (NS(this) == priv->xl_array[(long)cookie]) { + local->list[0] = (int16_t)(long)cookie; + } else { + local->list[1] = (int16_t)(long)cookie; + } + if (S_ISDIR (buf->st_mode)) + local->failed = 1; + } + } + UNLOCK (&frame->lock); + + if (!callcnt) { + int16_t file_list[3] = {0,}; + local->op_ret = -1; + + file_list[0] = local->list[0]; + file_list[1] = local->list[1]; + file_list[2] = -1; + + if (local->index != 2) { + /* Lookup failed, can't do open */ + gf_log (this->name, GF_LOG_ERROR, + "%s: present on %d nodes", + local->name, local->index); + + if (local->index < 2) { + unify_local_wipe (local); + gf_log (this->name, GF_LOG_ERROR, + "returning as file found on less " + "than 2 nodes"); + STACK_UNWIND (frame, local->op_ret, + local->op_errno, local->fd); + return 0; + } + } + + if (local->failed) { + /* Open on directory, return EISDIR */ + unify_local_wipe (local); + STACK_UNWIND (frame, -1, EISDIR, local->fd); + return 0; + } + + /* Everything is perfect :) */ + 
local->call_count = 2; + + for (index = 0; file_list[index] != -1; index++) { + char need_break = (file_list[index+1] == -1); + STACK_WIND_COOKIE (frame, + unify_open_cbk, + priv->xl_array[file_list[index]], + priv->xl_array[file_list[index]], + priv->xl_array[file_list[index]]->fops->open, + &local->loc1, + local->flags, + local->fd); + if (need_break) + break; + } + } + + return 0; +} + + +int32_t +unify_open_readlink_cbk (call_frame_t *frame, + void *cookie, + xlator_t *this, + int32_t op_ret, + int32_t op_errno, + const char *path) +{ + int16_t index = 0; + unify_private_t *priv = this->private; + unify_local_t *local = frame->local; + + if (op_ret == -1) { + STACK_UNWIND (frame, -1, ENOENT); + return 0; + } + + if (path[0] == '/') { + local->name = strdup (path); + ERR_ABORT (local->name); + } else { + char *tmp_str = strdup (local->loc1.path); + char *tmp_base = dirname (tmp_str); + local->name = CALLOC (1, ZR_PATH_MAX); + strcpy (local->name, tmp_base); + strncat (local->name, "/", 1); + strcat (local->name, path); + FREE (tmp_str); + } + + local->list = CALLOC (1, sizeof (int16_t) * 3); + ERR_ABORT (local->list); + local->call_count = priv->child_count + 1; + local->op_ret = -1; + for (index = 0; index <= priv->child_count; index++) { + /* Send the lookup to all the nodes including namespace */ + STACK_WIND_COOKIE (frame, + unify_open_lookup_cbk, + (void *)(long)index, + priv->xl_array[index], + priv->xl_array[index]->fops->lookup, + &local->loc1, + NULL); + } + + return 0; +} +#endif /* GF_DARWIN_HOST_OS */ + +/** + * unify_open - + */ +int32_t +unify_open (call_frame_t *frame, + xlator_t *this, + loc_t *loc, + int32_t flags, + fd_t *fd) +{ + unify_private_t *priv = this->private; + unify_local_t *local = NULL; + int16_t *list = NULL; + int16_t index = 0; + int16_t file_list[3] = {0,}; + uint64_t tmp_list = 0; + + UNIFY_CHECK_INODE_CTX_AND_UNWIND_ON_ERR (loc); + + /* Init */ + INIT_LOCAL (frame, local); + loc_copy (&local->loc1, loc); + local->fd = fd; + 
local->flags = flags; + inode_ctx_get (loc->inode, this, &tmp_list); + list = (int16_t *)(long)tmp_list; + + local->list = list; + file_list[0] = priv->child_count; /* Thats namespace */ + file_list[2] = -1; + for (index = 0; list[index] != -1; index++) { + local->call_count++; + if (list[index] != priv->child_count) + file_list[1] = list[index]; + } + + if (local->call_count != 2) { + /* If the lookup was done for file */ + gf_log (this->name, GF_LOG_ERROR, + "%s: entry_count is %d", + loc->path, local->call_count); + for (index = 0; local->list[index] != -1; index++) + gf_log (this->name, GF_LOG_ERROR, "%s: found on %s", + loc->path, priv->xl_array[list[index]]->name); + + if (local->call_count < 2) { + gf_log (this->name, GF_LOG_ERROR, + "returning EIO as file found on onlyone node"); + STACK_UNWIND (frame, -1, EIO, fd); + return 0; + } + } + +#ifdef GF_DARWIN_HOST_OS + /* Handle symlink here */ + if (S_ISLNK (loc->inode->st_mode)) { + /* Callcount doesn't matter here */ + STACK_WIND (frame, + unify_open_readlink_cbk, + NS(this), + NS(this)->fops->readlink, + loc, ZR_PATH_MAX); + return 0; + } +#endif /* GF_DARWIN_HOST_OS */ + + local->call_count = 2; + for (index = 0; file_list[index] != -1; index++) { + char need_break = (file_list[index+1] == -1); + STACK_WIND_COOKIE (frame, + unify_open_cbk, + priv->xl_array[file_list[index]], //cookie + priv->xl_array[file_list[index]], + priv->xl_array[file_list[index]]->fops->open, + loc, + flags, + fd); + if (need_break) + break; + } + + return 0; +} + + +int32_t +unify_create_unlink_cbk (call_frame_t *frame, + void *cookie, + xlator_t *this, + int32_t op_ret, + int32_t op_errno) +{ + unify_local_t *local = frame->local; + inode_t *inode = local->loc1.inode; + + unify_local_wipe (local); + + STACK_UNWIND (frame, local->op_ret, local->op_errno, local->fd, + inode, &local->stbuf); + + return 0; +} + +/** + * unify_create_open_cbk - + */ +int32_t +unify_create_open_cbk (call_frame_t *frame, + void *cookie, + xlator_t *this, 
+ int32_t op_ret, + int32_t op_errno, + fd_t *fd) +{ + int ret = 0; + int32_t callcnt = 0; + unify_local_t *local = frame->local; + inode_t *inode = NULL; + xlator_t *child = NULL; + uint64_t tmp_value = 0; + + LOCK (&frame->lock); + { + if (op_ret >= 0) { + local->op_ret = op_ret; + if (NS(this) != (xlator_t *)cookie) { + /* Store child node's ptr, used in all + the f*** / FileIO calls */ + /* TODO: log on failure */ + ret = fd_ctx_get (fd, this, &tmp_value); + cookie = (void *)(long)tmp_value; + } else { + /* NOTE: open successful on namespace. + * fd's ctx can be used to identify open + * failure on storage subvolume. cool + * ide ;) */ + local->failed = 0; + } + } else { + gf_log (this->name, GF_LOG_ERROR, + "child(%s): path(%s): %s", + ((xlator_t *)cookie)->name, + local->loc1.path, strerror (op_errno)); + local->op_errno = op_errno; + local->failed = 1; + } + callcnt = --local->call_count; + } + UNLOCK (&frame->lock); + + if (!callcnt) { + if (local->failed == 1 && (local->op_ret >= 0)) { + local->call_count = 1; + /* return -1 to user */ + local->op_ret = -1; + local->op_errno = EIO; + local->fd = fd; + local->call_count = 1; + + if (!fd_ctx_get (local->fd, this, &tmp_value)) { + child = (xlator_t *)(long)tmp_value; + + gf_log (this->name, GF_LOG_ERROR, + "Create success on child node, " + "failed on namespace"); + + STACK_WIND (frame, + unify_create_unlink_cbk, + child, + child->fops->unlink, + &local->loc1); + } else { + gf_log (this->name, GF_LOG_ERROR, + "Create success on namespace, " + "failed on child node"); + + STACK_WIND (frame, + unify_create_unlink_cbk, + NS(this), + NS(this)->fops->unlink, + &local->loc1); + } + return 0; + } + inode = local->loc1.inode; + unify_local_wipe (local); + STACK_UNWIND (frame, local->op_ret, local->op_errno, fd, + inode, &local->stbuf); + } + return 0; +} + +/** + * unify_create_lookup_cbk - + */ +int32_t +unify_create_lookup_cbk (call_frame_t *frame, + void *cookie, + xlator_t *this, + int32_t op_ret, + int32_t 
op_errno, + inode_t *inode, + struct stat *buf, + dict_t *dict) +{ + int32_t callcnt = 0; + int16_t index = 0; + unify_private_t *priv = this->private; + unify_local_t *local = frame->local; + + LOCK (&frame->lock); + { + callcnt = --local->call_count; + if (op_ret == -1) { + gf_log (this->name, GF_LOG_ERROR, + "child(%s): path(%s): %s", + priv->xl_array[(long)cookie]->name, + local->loc1.path, strerror (op_errno)); + local->op_errno = op_errno; + local->failed = 1; + } + + if (op_ret >= 0) { + local->op_ret = op_ret; + local->list[local->index++] = (int16_t)(long)cookie; + if (NS(this) == priv->xl_array[(long)cookie]) { + local->st_ino = buf->st_ino; + } else { + local->stbuf = *buf; + } + } + } + UNLOCK (&frame->lock); + + if (!callcnt) { + int16_t *list = local->list; + int16_t file_list[3] = {0,}; + local->op_ret = -1; + + local->list [local->index] = -1; + file_list[0] = list[0]; + file_list[1] = list[1]; + file_list[2] = -1; + + local->stbuf.st_ino = local->st_ino; + /* TODO: log on failure */ + inode_ctx_put (local->loc1.inode, this, + (uint64_t)(long)local->list); + + if (local->index != 2) { + /* Lookup failed, can't do open */ + gf_log (this->name, GF_LOG_ERROR, + "%s: present on %d nodes", + local->loc1.path, local->index); + file_list[0] = priv->child_count; + for (index = 0; list[index] != -1; index++) { + gf_log (this->name, GF_LOG_ERROR, + "%s: found on %s", local->loc1.path, + priv->xl_array[list[index]]->name); + if (list[index] != priv->child_count) + file_list[1] = list[index]; + } + + if (local->index < 2) { + unify_local_wipe (local); + gf_log (this->name, GF_LOG_ERROR, + "returning EIO as file found on " + "only one node"); + STACK_UNWIND (frame, -1, EIO, + local->fd, inode, NULL); + return 0; + } + } + /* Everything is perfect :) */ + local->call_count = 2; + + for (index = 0; file_list[index] != -1; index++) { + char need_break = (file_list[index+1] == -1); + STACK_WIND_COOKIE (frame, + unify_create_open_cbk, + 
priv->xl_array[file_list[index]], + priv->xl_array[file_list[index]], + priv->xl_array[file_list[index]]->fops->open, + &local->loc1, + local->flags, + local->fd); + if (need_break) + break; + } + } + + return 0; +} + + +/** + * unify_create_cbk - + */ +int32_t +unify_create_cbk (call_frame_t *frame, + void *cookie, + xlator_t *this, + int32_t op_ret, + int32_t op_errno, + fd_t *fd, + inode_t *inode, + struct stat *buf) +{ + int ret = 0; + unify_local_t *local = frame->local; + call_frame_t *prev_frame = cookie; + inode_t *tmp_inode = NULL; + + if (op_ret == -1) { + /* send unlink () on Namespace */ + local->op_errno = op_errno; + local->op_ret = -1; + local->call_count = 1; + gf_log (this->name, GF_LOG_ERROR, + "create failed on %s (file %s, error %s), " + "sending unlink to namespace", + prev_frame->this->name, + local->loc1.path, strerror (op_errno)); + + STACK_WIND (frame, + unify_create_unlink_cbk, + NS(this), + NS(this)->fops->unlink, + &local->loc1); + + return 0; + } + + if (op_ret >= 0) { + local->op_ret = op_ret; + local->stbuf = *buf; + /* Just inode number should be from NS node */ + local->stbuf.st_ino = local->st_ino; + + /* TODO: log on failure */ + ret = fd_ctx_set (fd, this, (uint64_t)(long)prev_frame->this); + } + + tmp_inode = local->loc1.inode; + unify_local_wipe (local); + STACK_UNWIND (frame, local->op_ret, local->op_errno, local->fd, + tmp_inode, &local->stbuf); + + return 0; +} + +/** + * unify_ns_create_cbk - + * + */ +int32_t +unify_ns_create_cbk (call_frame_t *frame, + void *cookie, + xlator_t *this, + int32_t op_ret, + int32_t op_errno, + fd_t *fd, + inode_t *inode, + struct stat *buf) +{ + struct sched_ops *sched_ops = NULL; + xlator_t *sched_xl = NULL; + unify_local_t *local = frame->local; + unify_private_t *priv = this->private; + int16_t *list = NULL; + int16_t index = 0; + + if (op_ret == -1) { + /* No need to send create request to other servers, as + namespace action failed. Handle exclusive create here. 
*/ + if ((op_errno != EEXIST) || + ((op_errno == EEXIST) && + ((local->flags & O_EXCL) == O_EXCL))) { + /* If its just a create call without O_EXCL, + don't do this */ + gf_log (this->name, GF_LOG_ERROR, + "namespace: path(%s): %s", + local->loc1.path, strerror (op_errno)); + unify_local_wipe (local); + STACK_UNWIND (frame, op_ret, op_errno, fd, inode, buf); + return 0; + } + } + + if (op_ret >= 0) { + /* Get the inode number from the NS node */ + local->st_ino = buf->st_ino; + + local->op_ret = -1; + + /* Start the mapping list */ + list = CALLOC (1, sizeof (int16_t) * 3); + ERR_ABORT (list); + inode_ctx_put (inode, this, (uint64_t)(long)list); + list[0] = priv->child_count; + list[2] = -1; + + /* This means, file doesn't exist anywhere in the Filesystem */ + sched_ops = priv->sched_ops; + + /* Send create request to the scheduled node now */ + sched_xl = sched_ops->schedule (this, local->loc1.path); + if (sched_xl == NULL) + { + /* send unlink () on Namespace */ + local->op_errno = ENOTCONN; + local->op_ret = -1; + local->call_count = 1; + gf_log (this->name, GF_LOG_ERROR, + "no node online to schedule create:(file %s) " + "sending unlink to namespace", + (local->loc1.path)?local->loc1.path:""); + + STACK_WIND (frame, + unify_create_unlink_cbk, + NS(this), + NS(this)->fops->unlink, + &local->loc1); + + return 0; + } + + for (index = 0; index < priv->child_count; index++) + if (sched_xl == priv->xl_array[index]) + break; + list[1] = index; + + STACK_WIND (frame, unify_create_cbk, + sched_xl, sched_xl->fops->create, + &local->loc1, local->flags, local->mode, fd); + } else { + /* File already exists, and there is no O_EXCL flag */ + + gf_log (this->name, GF_LOG_DEBUG, + "File(%s) already exists on namespace, sending " + "open instead", local->loc1.path); + + local->list = CALLOC (1, sizeof (int16_t) * 3); + ERR_ABORT (local->list); + local->call_count = priv->child_count + 1; + local->op_ret = -1; + for (index = 0; index <= priv->child_count; index++) { + /* Send 
lookup() to all nodes including namespace */ + STACK_WIND_COOKIE (frame, + unify_create_lookup_cbk, + (void *)(long)index, + priv->xl_array[index], + priv->xl_array[index]->fops->lookup, + &local->loc1, + NULL); + } + } + return 0; +} + +/** + * unify_create - create a file in global namespace first, so other + * clients can see them. Create the file in storage nodes in background. + */ +int32_t +unify_create (call_frame_t *frame, + xlator_t *this, + loc_t *loc, + int32_t flags, + mode_t mode, + fd_t *fd) +{ + unify_local_t *local = NULL; + + /* Initialization */ + INIT_LOCAL (frame, local); + local->mode = mode; + local->flags = flags; + local->fd = fd; + + loc_copy (&local->loc1, loc); + if (local->loc1.path == NULL) { + gf_log (this->name, GF_LOG_CRITICAL, "Not enough memory :O"); + STACK_UNWIND (frame, -1, ENOMEM, fd, loc->inode, NULL); + return 0; + } + + STACK_WIND (frame, + unify_ns_create_cbk, + NS(this), + NS(this)->fops->create, + loc, + flags | O_EXCL, + mode, + fd); + + return 0; +} + + +/** + * unify_opendir_cbk - + */ +int32_t +unify_opendir_cbk (call_frame_t *frame, + void *cookie, + xlator_t *this, + int32_t op_ret, + int32_t op_errno, + fd_t *fd) +{ + STACK_UNWIND (frame, op_ret, op_errno, fd); + + return 0; +} + +/** + * unify_opendir - + */ +int32_t +unify_opendir (call_frame_t *frame, + xlator_t *this, + loc_t *loc, + fd_t *fd) +{ + UNIFY_CHECK_INODE_CTX_AND_UNWIND_ON_ERR (loc); + + STACK_WIND (frame, unify_opendir_cbk, + NS(this), NS(this)->fops->opendir, loc, fd); + + return 0; +} + + +/** + * unify_chmod - + */ +int32_t +unify_chmod (call_frame_t *frame, + xlator_t *this, + loc_t *loc, + mode_t mode) +{ + unify_local_t *local = NULL; + unify_private_t *priv = this->private; + int32_t index = 0; + int32_t callcnt = 0; + uint64_t tmp_list = 0; + + UNIFY_CHECK_INODE_CTX_AND_UNWIND_ON_ERR (loc); + + /* Initialization */ + INIT_LOCAL (frame, local); + + loc_copy (&local->loc1, loc); + local->st_ino = loc->inode->ino; + + if (S_ISDIR 
(loc->inode->st_mode)) { + local->call_count = priv->child_count + 1; + + for (index = 0; index < (priv->child_count + 1); index++) { + STACK_WIND (frame, + unify_buf_cbk, + priv->xl_array[index], + priv->xl_array[index]->fops->chmod, + loc, mode); + } + } else { + inode_ctx_get (loc->inode, this, &tmp_list); + local->list = (int16_t *)(long)tmp_list; + + for (index = 0; local->list[index] != -1; index++) { + local->call_count++; + callcnt++; + } + + for (index = 0; local->list[index] != -1; index++) { + STACK_WIND (frame, + unify_buf_cbk, + priv->xl_array[local->list[index]], + priv->xl_array[local->list[index]]->fops->chmod, + loc, + mode); + if (!--callcnt) + break; + } + } + + return 0; +} + +/** + * unify_chown - + */ +int32_t +unify_chown (call_frame_t *frame, + xlator_t *this, + loc_t *loc, + uid_t uid, + gid_t gid) +{ + unify_local_t *local = NULL; + unify_private_t *priv = this->private; + int32_t index = 0; + int32_t callcnt = 0; + uint64_t tmp_list = 0; + + UNIFY_CHECK_INODE_CTX_AND_UNWIND_ON_ERR (loc); + + /* Initialization */ + INIT_LOCAL (frame, local); + loc_copy (&local->loc1, loc); + local->st_ino = loc->inode->ino; + + if (S_ISDIR (loc->inode->st_mode)) { + local->call_count = priv->child_count + 1; + + for (index = 0; index < (priv->child_count + 1); index++) { + STACK_WIND (frame, + unify_buf_cbk, + priv->xl_array[index], + priv->xl_array[index]->fops->chown, + loc, uid, gid); + } + } else { + inode_ctx_get (loc->inode, this, &tmp_list); + local->list = (int16_t *)(long)tmp_list; + + for (index = 0; local->list[index] != -1; index++) { + local->call_count++; + callcnt++; + } + + for (index = 0; local->list[index] != -1; index++) { + STACK_WIND (frame, + unify_buf_cbk, + priv->xl_array[local->list[index]], + priv->xl_array[local->list[index]]->fops->chown, + loc, uid, gid); + if (!--callcnt) + break; + } + } + + return 0; +} + + +/** + * unify_truncate_cbk - + */ +int32_t +unify_truncate_cbk (call_frame_t *frame, + void *cookie, + xlator_t *this, 
+ int32_t op_ret, + int32_t op_errno, + struct stat *buf) +{ + int32_t callcnt = 0; + unify_private_t *priv = this->private; + unify_local_t *local = frame->local; + call_frame_t *prev_frame = cookie; + + LOCK (&frame->lock); + { + callcnt = --local->call_count; + + if (op_ret == -1) { + gf_log (this->name, GF_LOG_ERROR, + "child(%s): path(%s): %s", + prev_frame->this->name, + (local->loc1.path)?local->loc1.path:"", + strerror (op_errno)); + local->op_errno = op_errno; + if (!((op_errno == ENOENT) && priv->optimist)) + local->op_ret = -1; + } + + if (op_ret >= 0) { + if (NS (this) == prev_frame->this) { + local->st_ino = buf->st_ino; + /* If the entry is directory, get the + stat from NS node */ + if (S_ISDIR (buf->st_mode) || + !local->stbuf.st_blksize) { + local->stbuf = *buf; + } + } + + if ((!S_ISDIR (buf->st_mode)) && + (NS (this) != prev_frame->this)) { + /* If file, take the stat info from + Storage node. */ + local->stbuf = *buf; + } + } + } + UNLOCK (&frame->lock); + + if (!callcnt) { + if (local->st_ino) + local->stbuf.st_ino = local->st_ino; + else + local->op_ret = -1; + unify_local_wipe (local); + STACK_UNWIND (frame, local->op_ret, local->op_errno, + &local->stbuf); + } + + return 0; +} + +/** + * unify_truncate - + */ +int32_t +unify_truncate (call_frame_t *frame, + xlator_t *this, + loc_t *loc, + off_t offset) +{ + unify_local_t *local = NULL; + unify_private_t *priv = this->private; + int32_t index = 0; + int32_t callcnt = 0; + uint64_t tmp_list = 0; + + UNIFY_CHECK_INODE_CTX_AND_UNWIND_ON_ERR (loc); + + /* Initialization */ + INIT_LOCAL (frame, local); + loc_copy (&local->loc1, loc); + local->st_ino = loc->inode->ino; + + if (S_ISDIR (loc->inode->st_mode)) { + local->call_count = 1; + + STACK_WIND (frame, + unify_buf_cbk, + NS(this), + NS(this)->fops->stat, + loc); + } else { + local->op_ret = 0; + inode_ctx_get (loc->inode, this, &tmp_list); + local->list = (int16_t *)(long)tmp_list; + + for (index = 0; local->list[index] != -1; index++) { + 
local->call_count++; + callcnt++; + } + + /* Don't send truncate to NS node */ + STACK_WIND (frame, unify_truncate_cbk, NS(this), + NS(this)->fops->stat, loc); + callcnt--; + + for (index = 0; local->list[index] != -1; index++) { + if (NS(this) != priv->xl_array[local->list[index]]) { + STACK_WIND (frame, + unify_truncate_cbk, + priv->xl_array[local->list[index]], + priv->xl_array[local->list[index]]->fops->truncate, + loc, + offset); + if (!--callcnt) + break; + } + } + } + + return 0; +} + +/** + * unify_utimens - + */ +int32_t +unify_utimens (call_frame_t *frame, + xlator_t *this, + loc_t *loc, + struct timespec tv[2]) +{ + unify_local_t *local = NULL; + unify_private_t *priv = this->private; + int32_t index = 0; + int32_t callcnt = 0; + uint64_t tmp_list = 0; + + UNIFY_CHECK_INODE_CTX_AND_UNWIND_ON_ERR (loc); + + /* Initialization */ + INIT_LOCAL (frame, local); + loc_copy (&local->loc1, loc); + local->st_ino = loc->inode->ino; + + if (S_ISDIR (loc->inode->st_mode)) { + local->call_count = priv->child_count + 1; + + for (index = 0; index < (priv->child_count + 1); index++) { + STACK_WIND (frame, + unify_buf_cbk, + priv->xl_array[index], + priv->xl_array[index]->fops->utimens, + loc, tv); + } + } else { + inode_ctx_get (loc->inode, this, &tmp_list); + local->list = (int16_t *)(long)tmp_list; + + for (index = 0; local->list[index] != -1; index++) { + local->call_count++; + callcnt++; + } + + for (index = 0; local->list[index] != -1; index++) { + STACK_WIND (frame, + unify_buf_cbk, + priv->xl_array[local->list[index]], + priv->xl_array[local->list[index]]->fops->utimens, + loc, + tv); + if (!--callcnt) + break; + } + } + + return 0; +} + +/** + * unify_readlink_cbk - + */ +int32_t +unify_readlink_cbk (call_frame_t *frame, + void *cookie, + xlator_t *this, + int32_t op_ret, + int32_t op_errno, + const char *path) +{ + STACK_UNWIND (frame, op_ret, op_errno, path); + return 0; +} + +/** + * unify_readlink - Read the link only from the storage node. 
+ */ +int32_t +unify_readlink (call_frame_t *frame, + xlator_t *this, + loc_t *loc, + size_t size) +{ + unify_private_t *priv = this->private; + int32_t entry_count = 0; + int16_t *list = NULL; + int16_t index = 0; + uint64_t tmp_list = 0; + + UNIFY_CHECK_INODE_CTX_AND_UNWIND_ON_ERR (loc); + + inode_ctx_get (loc->inode, this, &tmp_list); + list = (int16_t *)(long)tmp_list; + + for (index = 0; list[index] != -1; index++) + entry_count++; + + if (entry_count >= 2) { + for (index = 0; list[index] != -1; index++) { + if (priv->xl_array[list[index]] != NS(this)) { + STACK_WIND (frame, + unify_readlink_cbk, + priv->xl_array[list[index]], + priv->xl_array[list[index]]->fops->readlink, + loc, + size); + break; + } + } + } else { + gf_log (this->name, GF_LOG_ERROR, + "returning ENOENT, no softlink files found " + "on storage node"); + STACK_UNWIND (frame, -1, ENOENT, NULL); + } + + return 0; +} + + +/** + * unify_unlink_cbk - + */ +int32_t +unify_unlink_cbk (call_frame_t *frame, + void *cookie, + xlator_t *this, + int32_t op_ret, + int32_t op_errno) +{ + int32_t callcnt = 0; + unify_private_t *priv = this->private; + unify_local_t *local = frame->local; + + LOCK (&frame->lock); + { + callcnt = --local->call_count; + if (op_ret == 0 || ((op_errno == ENOENT) && priv->optimist)) + local->op_ret = 0; + if (op_ret == -1) + local->op_errno = op_errno; + } + UNLOCK (&frame->lock); + + if (!callcnt) { + unify_local_wipe (local); + STACK_UNWIND (frame, local->op_ret, local->op_errno); + } + + return 0; +} + + +/** + * unify_unlink - + */ +int32_t +unify_unlink (call_frame_t *frame, + xlator_t *this, + loc_t *loc) +{ + unify_private_t *priv = this->private; + unify_local_t *local = NULL; + int16_t *list = NULL; + int16_t index = 0; + uint64_t tmp_list = 0; + + UNIFY_CHECK_INODE_CTX_AND_UNWIND_ON_ERR (loc); + + /* Initialization */ + INIT_LOCAL (frame, local); + loc_copy (&local->loc1, loc); + + inode_ctx_get (loc->inode, this, &tmp_list); + list = (int16_t *)(long)tmp_list; + + for 
(index = 0; list[index] != -1; index++) + local->call_count++; + + if (local->call_count) { + for (index = 0; list[index] != -1; index++) { + char need_break = (list[index+1] == -1); + STACK_WIND (frame, + unify_unlink_cbk, + priv->xl_array[list[index]], + priv->xl_array[list[index]]->fops->unlink, + loc); + if (need_break) + break; + } + } else { + gf_log (this->name, GF_LOG_ERROR, + "%s: returning ENOENT", loc->path); + STACK_UNWIND (frame, -1, ENOENT); + } + + return 0; +} + + +/** + * unify_readv_cbk - + */ +int32_t +unify_readv_cbk (call_frame_t *frame, + void *cookie, + xlator_t *this, + int32_t op_ret, + int32_t op_errno, + struct iovec *vector, + int32_t count, + struct stat *stbuf) +{ + STACK_UNWIND (frame, op_ret, op_errno, vector, count, stbuf); + return 0; +} + +/** + * unify_readv - + */ +int32_t +unify_readv (call_frame_t *frame, + xlator_t *this, + fd_t *fd, + size_t size, + off_t offset) +{ + UNIFY_CHECK_FD_CTX_AND_UNWIND_ON_ERR (fd); + xlator_t *child = NULL; + uint64_t tmp_child = 0; + + fd_ctx_get (fd, this, &tmp_child); + child = (xlator_t *)(long)tmp_child; + + STACK_WIND (frame, + unify_readv_cbk, + child, + child->fops->readv, + fd, + size, + offset); + + + return 0; +} + +/** + * unify_writev_cbk - + */ +int32_t +unify_writev_cbk (call_frame_t *frame, + void *cookie, + xlator_t *this, + int32_t op_ret, + int32_t op_errno, + struct stat *stbuf) +{ + STACK_UNWIND (frame, op_ret, op_errno, stbuf); + return 0; +} + +/** + * unify_writev - + */ +int32_t +unify_writev (call_frame_t *frame, + xlator_t *this, + fd_t *fd, + struct iovec *vector, + int32_t count, + off_t off) +{ + UNIFY_CHECK_FD_CTX_AND_UNWIND_ON_ERR (fd); + xlator_t *child = NULL; + uint64_t tmp_child = 0; + + fd_ctx_get (fd, this, &tmp_child); + child = (xlator_t *)(long)tmp_child; + + STACK_WIND (frame, + unify_writev_cbk, + child, + child->fops->writev, + fd, + vector, + count, + off); + + return 0; +} + +/** + * unify_ftruncate - + */ +int32_t +unify_ftruncate (call_frame_t 
*frame, + xlator_t *this, + fd_t *fd, + off_t offset) +{ + xlator_t *child = NULL; + unify_local_t *local = NULL; + uint64_t tmp_child = 0; + + UNIFY_CHECK_FD_CTX_AND_UNWIND_ON_ERR(fd); + + /* Initialization */ + INIT_LOCAL (frame, local); + local->op_ret = 0; + + fd_ctx_get (fd, this, &tmp_child); + child = (xlator_t *)(long)tmp_child; + + local->call_count = 2; + + STACK_WIND (frame, unify_truncate_cbk, + child, child->fops->ftruncate, + fd, offset); + + STACK_WIND (frame, unify_truncate_cbk, + NS(this), NS(this)->fops->fstat, + fd); + + return 0; +} + + +/** + * unify_fchmod - + */ +int32_t +unify_fchmod (call_frame_t *frame, + xlator_t *this, + fd_t *fd, + mode_t mode) +{ + unify_local_t *local = NULL; + xlator_t *child = NULL; + uint64_t tmp_child = 0; + + UNIFY_CHECK_FD_AND_UNWIND_ON_ERR(fd); + + /* Initialization */ + INIT_LOCAL (frame, local); + local->st_ino = fd->inode->ino; + + if (!fd_ctx_get (fd, this, &tmp_child)) { + /* If its set, then its file */ + child = (xlator_t *)(long)tmp_child; + + local->call_count = 2; + + STACK_WIND (frame, unify_buf_cbk, child, + child->fops->fchmod, fd, mode); + + STACK_WIND (frame, unify_buf_cbk, NS(this), + NS(this)->fops->fchmod, fd, mode); + + } else { + /* this is an directory */ + local->call_count = 1; + + STACK_WIND (frame, unify_buf_cbk, + NS(this), NS(this)->fops->fchmod, fd, mode); + } + + return 0; +} + +/** + * unify_fchown - + */ +int32_t +unify_fchown (call_frame_t *frame, + xlator_t *this, + fd_t *fd, + uid_t uid, + gid_t gid) +{ + unify_local_t *local = NULL; + xlator_t *child = NULL; + uint64_t tmp_child = 0; + + UNIFY_CHECK_FD_AND_UNWIND_ON_ERR(fd); + + /* Initialization */ + INIT_LOCAL (frame, local); + local->st_ino = fd->inode->ino; + + if (!fd_ctx_get (fd, this, &tmp_child)) { + /* If its set, then its file */ + child = (xlator_t *)(long)tmp_child; + + local->call_count = 2; + + STACK_WIND (frame, unify_buf_cbk, child, + child->fops->fchown, fd, uid, gid); + + STACK_WIND (frame, unify_buf_cbk, 
NS(this), + NS(this)->fops->fchown, fd, uid, gid); + } else { + local->call_count = 1; + + STACK_WIND (frame, unify_buf_cbk, + NS(this), NS(this)->fops->fchown, + fd, uid, gid); + } + + return 0; +} + +/** + * unify_flush_cbk - + */ +int32_t +unify_flush_cbk (call_frame_t *frame, + void *cookie, + xlator_t *this, + int32_t op_ret, + int32_t op_errno) +{ + STACK_UNWIND (frame, op_ret, op_errno); + return 0; +} + +/** + * unify_flush - + */ +int32_t +unify_flush (call_frame_t *frame, + xlator_t *this, + fd_t *fd) +{ + UNIFY_CHECK_FD_CTX_AND_UNWIND_ON_ERR (fd); + xlator_t *child = NULL; + uint64_t tmp_child = 0; + + fd_ctx_get (fd, this, &tmp_child); + child = (xlator_t *)(long)tmp_child; + + STACK_WIND (frame, unify_flush_cbk, child, + child->fops->flush, fd); + + return 0; +} + + +/** + * unify_fsync_cbk - + */ +int32_t +unify_fsync_cbk (call_frame_t *frame, + void *cookie, + xlator_t *this, + int32_t op_ret, + int32_t op_errno) +{ + STACK_UNWIND (frame, op_ret, op_errno); + return 0; +} + +/** + * unify_fsync - + */ +int32_t +unify_fsync (call_frame_t *frame, + xlator_t *this, + fd_t *fd, + int32_t flags) +{ + UNIFY_CHECK_FD_CTX_AND_UNWIND_ON_ERR (fd); + xlator_t *child = NULL; + uint64_t tmp_child = 0; + + fd_ctx_get (fd, this, &tmp_child); + child = (xlator_t *)(long)tmp_child; + + STACK_WIND (frame, unify_fsync_cbk, child, + child->fops->fsync, fd, flags); + + return 0; +} + +/** + * unify_fstat - Send fstat FOP to Namespace only if its directory, and to + * both namespace and the storage node if its a file. 
+ */ +int32_t +unify_fstat (call_frame_t *frame, + xlator_t *this, + fd_t *fd) +{ + unify_local_t *local = NULL; + xlator_t *child = NULL; + uint64_t tmp_child = 0; + + UNIFY_CHECK_FD_AND_UNWIND_ON_ERR(fd); + + INIT_LOCAL (frame, local); + local->st_ino = fd->inode->ino; + + if (!fd_ctx_get (fd, this, &tmp_child)) { + /* If its set, then its file */ + child = (xlator_t *)(long)tmp_child; + local->call_count = 2; + + STACK_WIND (frame, unify_buf_cbk, child, + child->fops->fstat, fd); + + STACK_WIND (frame, unify_buf_cbk, NS(this), + NS(this)->fops->fstat, fd); + + } else { + /* this is an directory */ + local->call_count = 1; + STACK_WIND (frame, unify_buf_cbk, NS(this), + NS(this)->fops->fstat, fd); + } + + return 0; +} + +/** + * unify_getdents_cbk - + */ +int32_t +unify_getdents_cbk (call_frame_t *frame, + void *cookie, + xlator_t *this, + int32_t op_ret, + int32_t op_errno, + dir_entry_t *entry, + int32_t count) +{ + STACK_UNWIND (frame, op_ret, op_errno, entry, count); + return 0; +} + +/** + * unify_getdents - send the FOP request to all the nodes. + */ +int32_t +unify_getdents (call_frame_t *frame, + xlator_t *this, + fd_t *fd, + size_t size, + off_t offset, + int32_t flag) +{ + UNIFY_CHECK_FD_AND_UNWIND_ON_ERR (fd); + + STACK_WIND (frame, unify_getdents_cbk, NS(this), + NS(this)->fops->getdents, fd, size, offset, flag); + + return 0; +} + + +/** + * unify_readdir_cbk - + */ +int32_t +unify_readdir_cbk (call_frame_t *frame, + void *cookie, + xlator_t *this, + int32_t op_ret, + int32_t op_errno, + gf_dirent_t *buf) +{ + STACK_UNWIND (frame, op_ret, op_errno, buf); + + return 0; +} + +/** + * unify_readdir - send the FOP request to all the nodes. 
+ */ +int32_t +unify_readdir (call_frame_t *frame, + xlator_t *this, + fd_t *fd, + size_t size, + off_t offset) +{ + UNIFY_CHECK_FD_AND_UNWIND_ON_ERR (fd); + + STACK_WIND (frame, unify_readdir_cbk, NS(this), + NS(this)->fops->readdir, fd, size, offset); + + return 0; +} + + +/** + * unify_fsyncdir_cbk - + */ +int32_t +unify_fsyncdir_cbk (call_frame_t *frame, + void *cookie, + xlator_t *this, + int32_t op_ret, + int32_t op_errno) +{ + STACK_UNWIND (frame, op_ret, op_errno); + + return 0; +} + +/** + * unify_fsyncdir - + */ +int32_t +unify_fsyncdir (call_frame_t *frame, + xlator_t *this, + fd_t *fd, + int32_t flags) +{ + UNIFY_CHECK_FD_AND_UNWIND_ON_ERR (fd); + + STACK_WIND (frame, unify_fsyncdir_cbk, + NS(this), NS(this)->fops->fsyncdir, fd, flags); + + return 0; +} + +/** + * unify_lk_cbk - UNWIND frame with the proper return arguments. + */ +int32_t +unify_lk_cbk (call_frame_t *frame, + void *cookie, + xlator_t *this, + int32_t op_ret, + int32_t op_errno, + struct flock *lock) +{ + STACK_UNWIND (frame, op_ret, op_errno, lock); + return 0; +} + +/** + * unify_lk - Send it to all the storage nodes, (should be 1) which has file. 
+ */ +int32_t +unify_lk (call_frame_t *frame, + xlator_t *this, + fd_t *fd, + int32_t cmd, + struct flock *lock) +{ + UNIFY_CHECK_FD_CTX_AND_UNWIND_ON_ERR (fd); + xlator_t *child = NULL; + uint64_t tmp_child = 0; + + fd_ctx_get (fd, this, &tmp_child); + child = (xlator_t *)(long)tmp_child; + + STACK_WIND (frame, unify_lk_cbk, child, + child->fops->lk, fd, cmd, lock); + + return 0; +} + + +int32_t +unify_setxattr_cbk (call_frame_t *frame, + void *cookie, + xlator_t *this, + int32_t op_ret, + int32_t op_errno); + +static int32_t +unify_setxattr_file_cbk (call_frame_t *frame, + void *cookie, + xlator_t *this, + int32_t op_ret, + int32_t op_errno) +{ + unify_private_t *private = this->private; + unify_local_t *local = frame->local; + xlator_t *sched_xl = NULL; + struct sched_ops *sched_ops = NULL; + + if (op_ret == -1) { + if (!ENOTSUP) + gf_log (this->name, GF_LOG_ERROR, + "setxattr with XATTR_CREATE on ns: " + "path(%s) key(%s): %s", + local->loc1.path, local->name, + strerror (op_errno)); + unify_local_wipe (local); + STACK_UNWIND (frame, op_ret, op_errno); + return 0; + } + + LOCK (&frame->lock); + { + local->failed = 0; + local->op_ret = 0; + local->op_errno = 0; + local->call_count = 1; + } + UNLOCK (&frame->lock); + + /* schedule XATTR_CREATE on one of the child node */ + sched_ops = private->sched_ops; + + /* Send create request to the scheduled node now */ + sched_xl = sched_ops->schedule (this, local->name); + if (!sched_xl) { + STACK_UNWIND (frame, -1, ENOTCONN); + return 0; + } + + STACK_WIND (frame, + unify_setxattr_cbk, + sched_xl, + sched_xl->fops->setxattr, + &local->loc1, + local->dict, + local->flags); + return 0; +} + +/** + * unify_setxattr_cbk - When all the child nodes return, UNWIND frame. 
+ */ +int32_t +unify_setxattr_cbk (call_frame_t *frame, + void *cookie, + xlator_t *this, + int32_t op_ret, + int32_t op_errno) +{ + int32_t callcnt = 0; + unify_local_t *local = frame->local; + call_frame_t *prev_frame = cookie; + dict_t *dict = NULL; + + LOCK (&frame->lock); + { + callcnt = --local->call_count; + + if (op_ret == -1) { + gf_log (this->name, (((op_errno == ENOENT) || + (op_errno == ENOTSUP))? + GF_LOG_DEBUG : GF_LOG_ERROR), + "child(%s): path(%s): %s", + prev_frame->this->name, + (local->loc1.path)?local->loc1.path:"", + strerror (op_errno)); + if (local->failed == -1) { + local->failed = 1; + } + local->op_errno = op_errno; + } else { + local->failed = 0; + local->op_ret = op_ret; + } + } + UNLOCK (&frame->lock); + + if (!callcnt) { + if (local->failed && local->name && + ZR_FILE_CONTENT_REQUEST(local->name)) { + dict = get_new_dict (); + dict_set (dict, local->dict->members_list->key, + data_from_dynptr(NULL, 0)); + dict_ref (dict); + + local->call_count = 1; + + STACK_WIND (frame, + unify_setxattr_file_cbk, + NS(this), + NS(this)->fops->setxattr, + &local->loc1, + dict, + XATTR_CREATE); + + dict_unref (dict); + return 0; + } + + unify_local_wipe (local); + STACK_UNWIND (frame, local->op_ret, local->op_errno); + } + + return 0; +} + +/** + * unify_sexattr - This function should be sent to all the storage nodes, + * which contains the file, (excluding namespace). 
+ */ +int32_t +unify_setxattr (call_frame_t *frame, + xlator_t *this, + loc_t *loc, + dict_t *dict, + int32_t flags) +{ + unify_private_t *priv = this->private; + unify_local_t *local = NULL; + int16_t *list = NULL; + int16_t index = 0; + int32_t call_count = 0; + uint64_t tmp_list = 0; + data_pair_t *trav = dict->members_list; + + UNIFY_CHECK_INODE_CTX_AND_UNWIND_ON_ERR (loc); + + /* Initialization */ + INIT_LOCAL (frame, local); + local->failed = -1; + loc_copy (&local->loc1, loc); + + if (S_ISDIR (loc->inode->st_mode)) { + + if (trav && trav->key && ZR_FILE_CONTENT_REQUEST(trav->key)) { + /* direct the storage xlators to change file + content only if file exists */ + local->flags = flags; + local->dict = dict; + local->name = strdup (trav->key); + flags |= XATTR_REPLACE; + } + + local->call_count = priv->child_count; + for (index = 0; index < priv->child_count; index++) { + STACK_WIND (frame, + unify_setxattr_cbk, + priv->xl_array[index], + priv->xl_array[index]->fops->setxattr, + loc, dict, flags); + } + return 0; + } + + inode_ctx_get (loc->inode, this, &tmp_list); + list = (int16_t *)(long)tmp_list; + + for (index = 0; list[index] != -1; index++) { + if (NS(this) != priv->xl_array[list[index]]) { + local->call_count++; + call_count++; + } + } + + if (local->call_count) { + for (index = 0; list[index] != -1; index++) { + if (priv->xl_array[list[index]] != NS(this)) { + STACK_WIND (frame, + unify_setxattr_cbk, + priv->xl_array[list[index]], + priv->xl_array[list[index]]->fops->setxattr, + loc, + dict, + flags); + if (!--call_count) + break; + } + } + return 0; + } + + /* No entry in storage nodes */ + gf_log (this->name, GF_LOG_DEBUG, + "returning ENOENT, file not found on storage node."); + STACK_UNWIND (frame, -1, ENOENT); + + return 0; +} + + +/** + * unify_getxattr_cbk - This function is called from only one child, so, no + * need of any lock or anything else, just send it to above layer + */ +int32_t +unify_getxattr_cbk (call_frame_t *frame, + void 
*cookie, + xlator_t *this, + int32_t op_ret, + int32_t op_errno, + dict_t *value) +{ + int32_t callcnt = 0; + dict_t *local_value = NULL; + unify_local_t *local = frame->local; + call_frame_t *prev_frame = cookie; + + LOCK (&frame->lock); + { + callcnt = --local->call_count; + + if (op_ret == -1) { + local->op_errno = op_errno; + gf_log (this->name, + (((op_errno == ENOENT) || + (op_errno == ENODATA) || + (op_errno == ENOTSUP)) ? + GF_LOG_DEBUG : GF_LOG_ERROR), + "child(%s): path(%s): %s", + prev_frame->this->name, + (local->loc1.path)?local->loc1.path:"", + strerror (op_errno)); + } else { + if (!local->dict) + local->dict = dict_ref (value); + local->op_ret = op_ret; + } + } + UNLOCK (&frame->lock); + + if (!callcnt) { + local_value = local->dict; + local->dict = NULL; + + STACK_UNWIND (frame, local->op_ret, local->op_errno, + local_value); + + if (local_value) + dict_unref (local_value); + } + + return 0; +} + + +/** + * unify_getxattr - This FOP is sent to only the storage node. + */ +int32_t +unify_getxattr (call_frame_t *frame, + xlator_t *this, + loc_t *loc, + const char *name) +{ + unify_private_t *priv = this->private; + int16_t *list = NULL; + int16_t index = 0; + int16_t count = 0; + unify_local_t *local = NULL; + uint64_t tmp_list = 0; + + UNIFY_CHECK_INODE_CTX_AND_UNWIND_ON_ERR (loc); + INIT_LOCAL (frame, local); + + if (S_ISDIR (loc->inode->st_mode)) { + local->call_count = priv->child_count; + for (index = 0; index < priv->child_count; index++) + STACK_WIND (frame, + unify_getxattr_cbk, + priv->xl_array[index], + priv->xl_array[index]->fops->getxattr, + loc, + name); + return 0; + } + + inode_ctx_get (loc->inode, this, &tmp_list); + list = (int16_t *)(long)tmp_list; + + for (index = 0; list[index] != -1; index++) { + if (NS(this) != priv->xl_array[list[index]]) { + local->call_count++; + count++; + } + } + + if (count) { + for (index = 0; list[index] != -1; index++) { + if (priv->xl_array[list[index]] != NS(this)) { + STACK_WIND (frame, + 
unify_getxattr_cbk, + priv->xl_array[list[index]], + priv->xl_array[list[index]]->fops->getxattr, + loc, + name); + if (!--count) + break; + } + } + } else { + dict_t *tmp_dict = get_new_dict (); + gf_log (this->name, GF_LOG_DEBUG, + "%s: returning ENODATA, no file found on storage node", + loc->path); + STACK_UNWIND (frame, -1, ENODATA, tmp_dict); + dict_destroy (tmp_dict); + } + + return 0; +} + +/** + * unify_removexattr_cbk - Wait till all the child node returns the call + * and then UNWIND to above layer. + */ +int32_t +unify_removexattr_cbk (call_frame_t *frame, + void *cookie, + xlator_t *this, + int32_t op_ret, + int32_t op_errno) +{ + int32_t callcnt = 0; + unify_local_t *local = frame->local; + call_frame_t *prev_frame = cookie; + + LOCK (&frame->lock); + { + callcnt = --local->call_count; + if (op_ret == -1) { + local->op_errno = op_errno; + if (op_errno != ENOTSUP) + gf_log (this->name, GF_LOG_ERROR, + "child(%s): path(%s): %s", + prev_frame->this->name, + local->loc1.path, strerror (op_errno)); + } else { + local->op_ret = op_ret; + } + } + UNLOCK (&frame->lock); + + if (!callcnt) { + STACK_UNWIND (frame, local->op_ret, local->op_errno); + } + + return 0; +} + +/** + * unify_removexattr - Send it to all the child nodes which has the files. 
+ */ +int32_t +unify_removexattr (call_frame_t *frame, + xlator_t *this, + loc_t *loc, + const char *name) +{ + unify_private_t *priv = this->private; + unify_local_t *local = NULL; + int16_t *list = NULL; + int16_t index = 0; + int32_t call_count = 0; + uint64_t tmp_list = 0; + + UNIFY_CHECK_INODE_CTX_AND_UNWIND_ON_ERR (loc); + + /* Initialization */ + INIT_LOCAL (frame, local); + + if (S_ISDIR (loc->inode->st_mode)) { + local->call_count = priv->child_count; + for (index = 0; index < priv->child_count; index++) + STACK_WIND (frame, + unify_removexattr_cbk, + priv->xl_array[index], + priv->xl_array[index]->fops->removexattr, + loc, + name); + + return 0; + } + + inode_ctx_get (loc->inode, this, &tmp_list); + list = (int16_t *)(long)tmp_list; + + for (index = 0; list[index] != -1; index++) { + if (NS(this) != priv->xl_array[list[index]]) { + local->call_count++; + call_count++; + } + } + + if (local->call_count) { + for (index = 0; list[index] != -1; index++) { + if (priv->xl_array[list[index]] != NS(this)) { + STACK_WIND (frame, + unify_removexattr_cbk, + priv->xl_array[list[index]], + priv->xl_array[list[index]]->fops->removexattr, + loc, + name); + if (!--call_count) + break; + } + } + return 0; + } + + gf_log (this->name, GF_LOG_DEBUG, + "%s: returning ENOENT, not found on storage node.", loc->path); + STACK_UNWIND (frame, -1, ENOENT); + + return 0; +} + + +int32_t +unify_mknod_unlink_cbk (call_frame_t *frame, + void *cookie, + xlator_t *this, + int32_t op_ret, + int32_t op_errno) +{ + unify_local_t *local = frame->local; + + if (op_ret == -1) + gf_log (this->name, GF_LOG_ERROR, + "%s: %s", local->loc1.path, strerror (op_errno)); + + unify_local_wipe (local); + /* No log required here as this -1 is for mknod call */ + STACK_UNWIND (frame, -1, local->op_errno, NULL, NULL); + return 0; +} + +/** + * unify_mknod_cbk - + */ +int32_t +unify_mknod_cbk (call_frame_t *frame, + void *cookie, + xlator_t *this, + int32_t op_ret, + int32_t op_errno, + inode_t *inode, + 
struct stat *buf) +{ + unify_local_t *local = frame->local; + + if (op_ret == -1) { + gf_log (this->name, GF_LOG_ERROR, + "mknod failed on storage node, sending unlink to " + "namespace"); + local->op_errno = op_errno; + STACK_WIND (frame, + unify_mknod_unlink_cbk, + NS(this), + NS(this)->fops->unlink, + &local->loc1); + return 0; + } + + local->stbuf = *buf; + local->stbuf.st_ino = local->st_ino; + unify_local_wipe (local); + STACK_UNWIND (frame, op_ret, op_errno, inode, &local->stbuf); + return 0; +} + +/** + * unify_ns_mknod_cbk - + */ +int32_t +unify_ns_mknod_cbk (call_frame_t *frame, + void *cookie, + xlator_t *this, + int32_t op_ret, + int32_t op_errno, + inode_t *inode, + struct stat *buf) +{ + struct sched_ops *sched_ops = NULL; + xlator_t *sched_xl = NULL; + unify_local_t *local = frame->local; + unify_private_t *priv = this->private; + int16_t *list = NULL; + int16_t index = 0; + call_frame_t *prev_frame = cookie; + + if (op_ret == -1) { + /* No need to send mknod request to other servers, + * as namespace action failed + */ + gf_log (this->name, GF_LOG_ERROR, + "child(%s): path(%s): %s", + prev_frame->this->name, local->loc1.path, + strerror (op_errno)); + unify_local_wipe (local); + STACK_UNWIND (frame, op_ret, op_errno, inode, buf); + return 0; + } + + /* Create one inode for this entry */ + local->op_ret = 0; + local->stbuf = *buf; + local->st_ino = buf->st_ino; + + list = CALLOC (1, sizeof (int16_t) * 3); + ERR_ABORT (list); + list[0] = priv->child_count; + list[2] = -1; + inode_ctx_put (inode, this, (uint64_t)(long)list); + + sched_ops = priv->sched_ops; + + /* Send mknod request to scheduled node now */ + sched_xl = sched_ops->schedule (this, local->loc1.path); + if (!sched_xl) { + gf_log (this->name, GF_LOG_ERROR, + "mknod failed on storage node, no node online " + "at the moment, sending unlink to NS"); + local->op_errno = ENOTCONN; + STACK_WIND (frame, + unify_mknod_unlink_cbk, + NS(this), + NS(this)->fops->unlink, + &local->loc1); + + return 0; 
+ } + + for (index = 0; index < priv->child_count; index++) + if (sched_xl == priv->xl_array[index]) + break; + list[1] = index; + + STACK_WIND (frame, unify_mknod_cbk, + sched_xl, sched_xl->fops->mknod, + &local->loc1, local->mode, local->dev); + + return 0; +} + +/** + * unify_mknod - Create a device on namespace first, and later create on + * the storage node. + */ +int32_t +unify_mknod (call_frame_t *frame, + xlator_t *this, + loc_t *loc, + mode_t mode, + dev_t rdev) +{ + unify_local_t *local = NULL; + + /* Initialization */ + INIT_LOCAL (frame, local); + local->mode = mode; + local->dev = rdev; + loc_copy (&local->loc1, loc); + if (local->loc1.path == NULL) { + gf_log (this->name, GF_LOG_CRITICAL, "Not enough memory :O"); + STACK_UNWIND (frame, -1, ENOMEM, loc->inode, NULL); + return 0; + } + + STACK_WIND (frame, + unify_ns_mknod_cbk, + NS(this), + NS(this)->fops->mknod, + loc, + mode, + rdev); + + return 0; +} + +int32_t +unify_symlink_unlink_cbk (call_frame_t *frame, + void *cookie, + xlator_t *this, + int32_t op_ret, + int32_t op_errno) +{ + unify_local_t *local = frame->local; + if (op_ret == -1) + gf_log (this->name, GF_LOG_ERROR, + "%s: %s", local->loc1.path, strerror (op_errno)); + + unify_local_wipe (local); + STACK_UNWIND (frame, -1, local->op_errno, NULL, NULL); + return 0; +} + +/** + * unify_symlink_cbk - + */ +int32_t +unify_symlink_cbk (call_frame_t *frame, + void *cookie, + xlator_t *this, + int32_t op_ret, + int32_t op_errno, + inode_t *inode, + struct stat *buf) +{ + unify_local_t *local = frame->local; + + if (op_ret == -1) { + /* Symlink on storage node failed, hence send unlink + to the NS node */ + local->op_errno = op_errno; + gf_log (this->name, GF_LOG_ERROR, + "symlink on storage node failed, sending unlink " + "to namespace"); + + STACK_WIND (frame, + unify_symlink_unlink_cbk, + NS(this), + NS(this)->fops->unlink, + &local->loc1); + + return 0; + } + + local->stbuf = *buf; + local->stbuf.st_ino = local->st_ino; + unify_local_wipe 
(local); + STACK_UNWIND (frame, op_ret, op_errno, inode, &local->stbuf); + + return 0; +} + +/** + * unify_ns_symlink_cbk - + */ +int32_t +unify_ns_symlink_cbk (call_frame_t *frame, + void *cookie, + xlator_t *this, + int32_t op_ret, + int32_t op_errno, + inode_t *inode, + struct stat *buf) +{ + + struct sched_ops *sched_ops = NULL; + xlator_t *sched_xl = NULL; + int16_t *list = NULL; + unify_local_t *local = frame->local; + unify_private_t *priv = this->private; + int16_t index = 0; + + if (op_ret == -1) { + /* No need to send symlink request to other servers, + * as namespace action failed + */ + gf_log (this->name, GF_LOG_ERROR, + "namespace: path(%s): %s", + local->loc1.path, strerror (op_errno)); + unify_local_wipe (local); + STACK_UNWIND (frame, op_ret, op_errno, NULL, buf); + return 0; + } + + /* Create one inode for this entry */ + local->op_ret = 0; + local->st_ino = buf->st_ino; + + /* Start the mapping list */ + + list = CALLOC (1, sizeof (int16_t) * 3); + ERR_ABORT (list); + list[0] = priv->child_count; //namespace's index + list[2] = -1; + inode_ctx_put (inode, this, (uint64_t)(long)list); + + sched_ops = priv->sched_ops; + + /* Send symlink request to all the nodes now */ + sched_xl = sched_ops->schedule (this, local->loc1.path); + if (!sched_xl) { + /* Symlink on storage node failed, hence send unlink + to the NS node */ + local->op_errno = ENOTCONN; + gf_log (this->name, GF_LOG_ERROR, + "symlink on storage node failed, no node online, " + "sending unlink to namespace"); + + STACK_WIND (frame, + unify_symlink_unlink_cbk, + NS(this), + NS(this)->fops->unlink, + &local->loc1); + + return 0; + } + + for (index = 0; index < priv->child_count; index++) + if (sched_xl == priv->xl_array[index]) + break; + list[1] = index; + + STACK_WIND (frame, + unify_symlink_cbk, + sched_xl, + sched_xl->fops->symlink, + local->name, + &local->loc1); + + return 0; +} + +/** + * unify_symlink - + */ +int32_t +unify_symlink (call_frame_t *frame, + xlator_t *this, + const 
char *linkpath, + loc_t *loc) +{ + unify_local_t *local = NULL; + + /* Initialization */ + INIT_LOCAL (frame, local); + loc_copy (&local->loc1, loc); + local->name = strdup (linkpath); + + if ((local->name == NULL) || + (local->loc1.path == NULL)) { + gf_log (this->name, GF_LOG_CRITICAL, "Not enough memory :O"); + STACK_UNWIND (frame, -1, ENOMEM, loc->inode, NULL); + return 0; + } + + STACK_WIND (frame, + unify_ns_symlink_cbk, + NS(this), + NS(this)->fops->symlink, + linkpath, + loc); + + return 0; +} + + +int32_t +unify_rename_unlink_cbk (call_frame_t *frame, + void *cookie, + xlator_t *this, + int32_t op_ret, + int32_t op_errno) +{ + int32_t callcnt = 0; + unify_local_t *local = frame->local; + call_frame_t *prev_frame = cookie; + + if (op_ret == -1) { + gf_log (this->name, GF_LOG_ERROR, + "child(%s): path(%s -> %s): %s", + prev_frame->this->name, + local->loc1.path, local->loc2.path, + strerror (op_errno)); + + } + LOCK (&frame->lock); + { + callcnt = --local->call_count; + } + UNLOCK (&frame->lock); + + if (!callcnt) { + local->stbuf.st_ino = local->st_ino; + unify_local_wipe (local); + STACK_UNWIND (frame, local->op_ret, local->op_errno, + &local->stbuf); + } + return 0; +} + +int32_t +unify_ns_rename_undo_cbk (call_frame_t *frame, + void *cookie, + xlator_t *this, + int32_t op_ret, + int32_t op_errno, + struct stat *buf) +{ + unify_local_t *local = frame->local; + + if (op_ret == -1) { + gf_log (this->name, GF_LOG_ERROR, + "namespace: path(%s -> %s): %s", + local->loc1.path, local->loc2.path, + strerror (op_errno)); + } + + local->stbuf.st_ino = local->st_ino; + unify_local_wipe (local); + STACK_UNWIND (frame, local->op_ret, local->op_errno, &local->stbuf); + return 0; +} + +int32_t +unify_rename_cbk (call_frame_t *frame, + void *cookie, + xlator_t *this, + int32_t op_ret, + int32_t op_errno, + struct stat *buf) +{ + int32_t index = 0; + int32_t callcnt = 0; + int16_t *list = NULL; + unify_private_t *priv = this->private; + unify_local_t *local = 
frame->local; + call_frame_t *prev_frame = cookie; + + LOCK (&frame->lock); + { + callcnt = --local->call_count; + if (op_ret >= 0) { + if (!S_ISDIR (buf->st_mode)) + local->stbuf = *buf; + local->op_ret = op_ret; + } else { + gf_log (this->name, GF_LOG_ERROR, + "child(%s): path(%s -> %s): %s", + prev_frame->this->name, + local->loc1.path, local->loc2.path, + strerror (op_errno)); + local->op_errno = op_errno; + } + } + UNLOCK (&frame->lock); + + if (!callcnt) { + local->stbuf.st_ino = local->st_ino; + if (S_ISDIR (local->loc1.inode->st_mode)) { + unify_local_wipe (local); + STACK_UNWIND (frame, local->op_ret, local->op_errno, &local->stbuf); + return 0; + } + + if (local->op_ret == -1) { + /* TODO: check this logic */ + + /* Rename failed in storage node, successful on NS, + * hence, rename back the entries in NS */ + /* NOTE: this will be done only if the destination + * doesn't exists, if the destination exists, the + * job of correcting NS is left to self-heal + */ + if (!local->index) { + loc_t tmp_oldloc = { + /* its actual 'newloc->path' */ + .path = local->loc2.path, + .inode = local->loc1.inode, + .parent = local->loc2.parent + }; + + loc_t tmp_newloc = { + /* Actual 'oldloc->path' */ + .path = local->loc1.path, + .parent = local->loc1.parent + }; + + gf_log (this->name, GF_LOG_ERROR, + "rename succussful on namespace, on " + "stroage node failed, reverting back"); + + STACK_WIND (frame, + unify_ns_rename_undo_cbk, + NS(this), + NS(this)->fops->rename, + &tmp_oldloc, + &tmp_newloc); + return 0; + } + } else { + /* Rename successful on storage nodes */ + + int32_t idx = 0; + int16_t *tmp_list = NULL; + uint64_t tmp_list_int64 = 0; + if (local->loc2.inode) { + inode_ctx_get (local->loc2.inode, + this, &tmp_list_int64); + list = (int16_t *)(long)tmp_list_int64; + + } + + if (list) { + for (index = 0; list[index] != -1; index++); + tmp_list = CALLOC (1, index * 2); + memcpy (tmp_list, list, index * 2); + + for (index = 0; list[index] != -1; index++) { + /* 
TODO: Check this logic. */ + /* If the destination file exists in + * the same storage node where we sent + * 'rename' call, no need to send + * unlink + */ + for (idx = 0; + local->list[idx] != -1; idx++) { + if (tmp_list[index] == local->list[idx]) { + tmp_list[index] = priv->child_count; + continue; + } + } + + if (NS(this) != priv->xl_array[tmp_list[index]]) { + local->call_count++; + callcnt++; + } + } + + if (local->call_count) { + if (callcnt > 1) + gf_log (this->name, + GF_LOG_ERROR, + "%s->%s: more (%d) " + "subvolumes have the " + "newloc entry", + local->loc1.path, + local->loc2.path, + callcnt); + + for (index=0; + tmp_list[index] != -1; index++) { + if (NS(this) != priv->xl_array[tmp_list[index]]) { + STACK_WIND (frame, + unify_rename_unlink_cbk, + priv->xl_array[tmp_list[index]], + priv->xl_array[tmp_list[index]]->fops->unlink, + &local->loc2); + if (!--callcnt) + break; + } + } + + FREE (tmp_list); + return 0; + } + if (tmp_list) + FREE (tmp_list); + } + } + + /* Need not send 'unlink' to storage node */ + unify_local_wipe (local); + STACK_UNWIND (frame, local->op_ret, + local->op_errno, &local->stbuf); + } + + return 0; +} + +int32_t +unify_ns_rename_cbk (call_frame_t *frame, + void *cookie, + xlator_t *this, + int32_t op_ret, + int32_t op_errno, + struct stat *buf) +{ + int32_t index = 0; + int32_t callcnt = 0; + int16_t *list = NULL; + unify_private_t *priv = this->private; + unify_local_t *local = frame->local; + + if (op_ret == -1) { + /* Free local->new_inode */ + gf_log (this->name, GF_LOG_ERROR, + "namespace: path(%s -> %s): %s", + local->loc1.path, local->loc2.path, + strerror (op_errno)); + + unify_local_wipe (local); + STACK_UNWIND (frame, op_ret, op_errno, buf); + return 0; + } + + local->stbuf = *buf; + local->st_ino = buf->st_ino; + + /* Everything is fine. 
*/ + if (S_ISDIR (buf->st_mode)) { + local->call_count = priv->child_count; + for (index=0; index < priv->child_count; index++) { + STACK_WIND (frame, + unify_rename_cbk, + priv->xl_array[index], + priv->xl_array[index]->fops->rename, + &local->loc1, + &local->loc2); + } + + return 0; + } + + local->call_count = 0; + /* send rename */ + list = local->list; + for (index=0; list[index] != -1; index++) { + if (NS(this) != priv->xl_array[list[index]]) { + local->call_count++; + callcnt++; + } + } + + if (local->call_count) { + for (index=0; list[index] != -1; index++) { + if (NS(this) != priv->xl_array[list[index]]) { + STACK_WIND (frame, + unify_rename_cbk, + priv->xl_array[list[index]], + priv->xl_array[list[index]]->fops->rename, + &local->loc1, + &local->loc2); + if (!--callcnt) + break; + } + } + } else { + /* file doesn't seem to be present in storage nodes */ + gf_log (this->name, GF_LOG_CRITICAL, + "CRITICAL: source file not in storage node, " + "rename successful on namespace :O"); + unify_local_wipe (local); + STACK_UNWIND (frame, -1, EIO, NULL); + } + return 0; +} + + +/** + * unify_rename - One of the tricky function. 
The deadliest of all :O + */ +int32_t +unify_rename (call_frame_t *frame, + xlator_t *this, + loc_t *oldloc, + loc_t *newloc) +{ + unify_local_t *local = NULL; + uint64_t tmp_list = 0; + + /* Initialization: local keeps its own copies of both locs; a NULL path in + * either copy is reported to the caller as ENOMEM. The rename itself is + * wound to the namespace node and continues in unify_ns_rename_cbk. + * NOTE(review): the ENOMEM path unwinds without wiping local - confirm the + * partially copied locs cannot leak. + */ + INIT_LOCAL (frame, local); + loc_copy (&local->loc1, oldloc); + loc_copy (&local->loc2, newloc); + + if ((local->loc1.path == NULL) || + (local->loc2.path == NULL)) { + gf_log (this->name, GF_LOG_CRITICAL, "Not enough memory :O"); + STACK_UNWIND (frame, -1, ENOMEM, NULL); + return 0; + } + + inode_ctx_get (oldloc->inode, this, &tmp_list); + local->list = (int16_t *)(long)tmp_list; + + STACK_WIND (frame, + unify_ns_rename_cbk, + NS(this), + NS(this)->fops->rename, + oldloc, + newloc); + return 0; +} + +/** + * unify_link_cbk - reply to the link() wound by unify_ns_link_cbk. + * + * Copies *buf into local->stbuf when op_ret >= 0; the st_ino assignment that + * follows is not part of that if, so st_ino is overwritten with + * local->st_ino on every path. Then wipes local and unwinds with + * &local->stbuf. + * NOTE(review): local->stbuf is read after unify_local_wipe () - confirm the + * wipe leaves it intact. + */ +int32_t +unify_link_cbk (call_frame_t *frame, + void *cookie, + xlator_t *this, + int32_t op_ret, + int32_t op_errno, + inode_t *inode, + struct stat *buf) +{ + unify_local_t *local = frame->local; + + if (op_ret >= 0) + local->stbuf = *buf; + local->stbuf.st_ino = local->st_ino; + + unify_local_wipe (local); + STACK_UNWIND (frame, op_ret, op_errno, inode, &local->stbuf); + + return 0; +} + +/** + * unify_ns_link_cbk - reply to the link() sent to the namespace node. + * + * On op_ret == -1 the error is logged, local is wiped and the error is + * unwound. Otherwise buf->st_ino is saved in local->st_ino and link() is + * wound to every xlator named in local->list (terminated by -1) that is not + * the namespace node. + * NOTE(review): if the list names no such xlator nothing is wound and the + * frame is not unwound here - confirm that cannot happen. + */ +int32_t +unify_ns_link_cbk (call_frame_t *frame, + void *cookie, + xlator_t *this, + int32_t op_ret, + int32_t op_errno, + inode_t *inode, + struct stat *buf) +{ + unify_private_t *priv = this->private; + unify_local_t *local = frame->local; + int16_t *list = local->list; + int16_t index = 0; + + if (op_ret == -1) { + /* No need to send link request to other servers, + * as namespace action failed + */ + gf_log (this->name, GF_LOG_ERROR, + "namespace: path(%s -> %s): %s", + local->loc1.path, local->loc2.path, + strerror (op_errno)); + unify_local_wipe (local); + STACK_UNWIND (frame, op_ret, op_errno, inode, buf); + return 0; + } + + /* Update inode for this entry; unify_link_cbk reports this st_ino */ + local->op_ret = 0; + local->st_ino = buf->st_ino; + + /* Send link request to the node now. need_break is computed before the + * wind; presumably because the callback may run and wipe local during + * STACK_WIND - confirm. + */ + for (index = 0; list[index] != -1; index++) { + 
char need_break = (list[index+1] == -1); + if (priv->xl_array[list[index]] != NS (this)) { + STACK_WIND (frame, + unify_link_cbk, + priv->xl_array[list[index]], + priv->xl_array[list[index]]->fops->link, + &local->loc1, + &local->loc2); + } + if (need_break) + break; + } + + return 0; +} + +/** + * unify_link - hard link entry point. + * + * Both locs go through UNIFY_CHECK_INODE_CTX_AND_UNWIND_ON_ERR (defined + * outside this section), are copied into local, the node list is read from + * the inode ctx of oldloc and link() is wound to the namespace node; + * unify_ns_link_cbk continues from there. + */ +int32_t +unify_link (call_frame_t *frame, + xlator_t *this, + loc_t *oldloc, + loc_t *newloc) +{ + unify_local_t *local = NULL; + uint64_t tmp_list = 0; + + UNIFY_CHECK_INODE_CTX_AND_UNWIND_ON_ERR (oldloc); + UNIFY_CHECK_INODE_CTX_AND_UNWIND_ON_ERR (newloc); + + /* Initialization */ + INIT_LOCAL (frame, local); + + loc_copy (&local->loc1, oldloc); + loc_copy (&local->loc2, newloc); + + inode_ctx_get (oldloc->inode, this, &tmp_list); + local->list = (int16_t *)(long)tmp_list; + + STACK_WIND (frame, + unify_ns_link_cbk, + NS(this), + NS(this)->fops->link, + oldloc, + newloc); + + return 0; +} + + +/** + * unify_checksum_cbk - unwinds the checksum reply unchanged + */ +int32_t +unify_checksum_cbk (call_frame_t *frame, + void *cookie, + xlator_t *this, + int32_t op_ret, + int32_t op_errno, + uint8_t *fchecksum, + uint8_t *dchecksum) +{ + STACK_UNWIND (frame, op_ret, op_errno, fchecksum, dchecksum); + + return 0; +} + +/** + * unify_checksum - winds checksum to the namespace node only + */ +int32_t +unify_checksum (call_frame_t *frame, + xlator_t *this, + loc_t *loc, + int32_t flag) +{ + STACK_WIND (frame, + unify_checksum_cbk, + NS(this), + NS(this)->fops->checksum, + loc, + flag); + + return 0; +} + + +/** + * unify_finodelk_cbk - unwinds the finodelk reply unchanged + */ +int +unify_finodelk_cbk (call_frame_t *frame, void *cookie, xlator_t *this, + int32_t op_ret, int32_t op_errno) +{ + STACK_UNWIND (frame, op_ret, op_errno); + return 0; +} + +/** + * unify_finodelk - winds finodelk to the xlator stored in the fd ctx of fd + */ +int +unify_finodelk (call_frame_t *frame, xlator_t *this, + fd_t *fd, int cmd, struct flock *flock) +{ + UNIFY_CHECK_FD_CTX_AND_UNWIND_ON_ERR (fd); + xlator_t *child = NULL; + uint64_t tmp_child = 0; + + fd_ctx_get (fd, this, &tmp_child); + child = (xlator_t *)(long)tmp_child; + + STACK_WIND (frame, 
unify_finodelk_cbk, + child, child->fops->finodelk, + fd, cmd, flock); + + return 0; +} + + + +/** + * unify_fentrylk_cbk - unwinds the fentrylk reply unchanged + */ +int +unify_fentrylk_cbk (call_frame_t *frame, void *cookie, xlator_t *this, + int32_t op_ret, int32_t op_errno) +{ + STACK_UNWIND (frame, op_ret, op_errno); + return 0; +} + +/** + * unify_fentrylk - winds fentrylk to the xlator stored in the fd ctx of fd + */ +int +unify_fentrylk (call_frame_t *frame, xlator_t *this, + fd_t *fd, const char *basename, + entrylk_cmd cmd, entrylk_type type) + +{ + UNIFY_CHECK_FD_CTX_AND_UNWIND_ON_ERR (fd); + xlator_t *child = NULL; + uint64_t tmp_child = 0; + + fd_ctx_get (fd, this, &tmp_child); + child = (xlator_t *)(long)tmp_child; + + STACK_WIND (frame, unify_fentrylk_cbk, + child, child->fops->fentrylk, + fd, basename, cmd, type); + + return 0; +} + + + +/** + * unify_fxattrop_cbk - unwinds the fxattrop reply (including xattr) unchanged + */ +int +unify_fxattrop_cbk (call_frame_t *frame, void *cookie, xlator_t *this, + int32_t op_ret, int32_t op_errno, dict_t *xattr) +{ + STACK_UNWIND (frame, op_ret, op_errno, xattr); + return 0; +} + +/** + * unify_fxattrop - winds fxattrop to the xlator stored in the fd ctx of fd + */ +int +unify_fxattrop (call_frame_t *frame, xlator_t *this, + fd_t *fd, gf_xattrop_flags_t optype, dict_t *xattr) +{ + UNIFY_CHECK_FD_CTX_AND_UNWIND_ON_ERR (fd); + xlator_t *child = NULL; + uint64_t tmp_child = 0; + + fd_ctx_get (fd, this, &tmp_child); + child = (xlator_t *)(long)tmp_child; + + STACK_WIND (frame, unify_fxattrop_cbk, + child, child->fops->fxattrop, + fd, optype, xattr); + + return 0; +} + + +/** + * unify_inodelk_cbk - unwinds the inodelk reply unchanged + */ +int +unify_inodelk_cbk (call_frame_t *frame, void *cookie, xlator_t *this, + int32_t op_ret, int32_t op_errno) +{ + STACK_UNWIND (frame, op_ret, op_errno); + return 0; +} + + +/** + * unify_inodelk - winds inodelk to the xlator returned by unify_loc_subvol () + * NOTE(review): child is used without a NULL check - confirm + * unify_loc_subvol () cannot return NULL. + */ +int +unify_inodelk (call_frame_t *frame, xlator_t *this, + loc_t *loc, int cmd, struct flock *flock) +{ + xlator_t *child = NULL; + + child = unify_loc_subvol (loc, this); + + STACK_WIND (frame, unify_inodelk_cbk, + child, child->fops->inodelk, + loc, cmd, flock); + + return 0; +} + + + +/** + * unify_entrylk_cbk - unwinds the entrylk reply unchanged + 
 */ +int +unify_entrylk_cbk (call_frame_t *frame, void *cookie, xlator_t *this, + int32_t op_ret, int32_t op_errno) +{ + STACK_UNWIND (frame, op_ret, op_errno); + return 0; +} + +/** + * unify_entrylk - winds entrylk to the xlator returned by unify_loc_subvol () + */ +int +unify_entrylk (call_frame_t *frame, xlator_t *this, + loc_t *loc, const char *basename, + entrylk_cmd cmd, entrylk_type type) + +{ + xlator_t *child = NULL; + + child = unify_loc_subvol (loc, this); + + STACK_WIND (frame, unify_entrylk_cbk, + child, child->fops->entrylk, + loc, basename, cmd, type); + + return 0; +} + + + +/** + * unify_xattrop_cbk - unwinds the xattrop reply (including xattr) unchanged + */ +int +unify_xattrop_cbk (call_frame_t *frame, void *cookie, xlator_t *this, + int32_t op_ret, int32_t op_errno, dict_t *xattr) +{ + STACK_UNWIND (frame, op_ret, op_errno, xattr); + return 0; +} + +/** + * unify_xattrop - winds xattrop to the xlator returned by unify_loc_subvol () + */ +int +unify_xattrop (call_frame_t *frame, xlator_t *this, + loc_t *loc, gf_xattrop_flags_t optype, dict_t *xattr) +{ + xlator_t *child = NULL; + + child = unify_loc_subvol (loc, this); + + STACK_WIND (frame, unify_xattrop_cbk, + child, child->fops->xattrop, + loc, optype, xattr); + + return 0; +} + + +/** + * notify + */ +int32_t +notify (xlator_t *this, + int32_t event, + void *data, + ...) 
+{
+        unify_private_t  *priv     = this->private;
+        struct sched_ops *sched    = NULL;
+        int16_t           child_up = 0;
+
+        /* Nothing to track before init () has published the private data */
+        if (!priv) {
+                return 0;
+        }
+
+        sched = priv->sched_ops;
+        if (!sched) {
+                gf_log (this->name, GF_LOG_CRITICAL, "No scheduler :O");
+                raise (SIGTERM);
+                return 0;
+        }
+        /* Events from the namespace node: only CHILD_UP reaches the
+         * scheduler and nothing is passed on to the parents.
+         */
+        if (priv->namespace == data) {
+                if (event == GF_EVENT_CHILD_UP) {
+                        sched->notify (this, event, data);
+                }
+                return 0;
+        }
+
+        switch (event)
+        {
+        case GF_EVENT_CHILD_UP:
+        {
+                /* Call scheduler's update () to enable it for scheduling */
+                sched->notify (this, event, data);
+
+                LOCK (&priv->lock);
+                {
+                        /* Increment the inode's generation, which is
+                           used for self_heal */
+                        ++priv->inode_generation;
+                        ++priv->num_child_up;
+                }
+                UNLOCK (&priv->lock);
+
+                /* Only the first child coming up is announced upwards */
+                if (!priv->is_up) {
+                        default_notify (this, event, data);
+                        priv->is_up = 1;
+                }
+        }
+        break;
+        case GF_EVENT_CHILD_DOWN:
+        {
+                /* Call scheduler's update () to disable the child node
+                 * for scheduling
+                 */
+                sched->notify (this, event, data);
+                LOCK (&priv->lock);
+                {
+                        /* Sample the counter while the lock is held so the
+                         * zero test below sees the value produced by this
+                         * decrement and not a later update.
+                         */
+                        child_up = --priv->num_child_up;
+                }
+                UNLOCK (&priv->lock);
+
+                if (child_up == 0) {
+                        /* Send CHILD_DOWN to upper layer */
+                        default_notify (this, event, data);
+                        priv->is_up = 0;
+                }
+        }
+        break;
+
+        default:
+        {
+                default_notify (this, event, data);
+        }
+        break;
+        }
+
+        return 0;
+}
+
+/**
+ * unify_init_cleanup - release the private state built by init ().
+ *
+ * Follows the teardown order of fini () minus the scheduler call, and
+ * clears this->private first so that nothing keeps pointing at freed
+ * memory once init () has failed.
+ */
+static void
+unify_init_cleanup (xlator_t *this, unify_private_t *priv)
+{
+        this->private = NULL;
+        LOCK_DESTROY (&priv->lock);
+        FREE (priv->xl_array);
+        FREE (priv);
+}
+
+/**
+ * init - This function is called first in the xlator, while initializing.
+ *   All the config file options are checked and appropriate flags are set.
+ *
+ * @this - the unify xlator being initialised
+ *
+ * Returns 0 on success and -1 on any configuration or initialisation
+ * error; on failure this->private is left NULL.
+ */
+int32_t
+init (xlator_t *this)
+{
+        int32_t          ret       = 0;
+        int32_t          count     = 0;
+        data_t          *scheduler = NULL;
+        data_t          *data      = NULL;
+        xlator_t        *ns_xl     = NULL;
+        xlator_list_t   *trav      = NULL;
+        xlator_list_t   *xlparent  = NULL;
+        xlator_list_t   *parent    = NULL;
+        unify_private_t *_private  = NULL;
+
+        /* Check for number of child nodes, if there is no child nodes, exit */
+        if (!this->children) {
+                gf_log (this->name, GF_LOG_ERROR,
+                        "No child nodes specified. check \"subvolumes \" "
+                        "option in volfile");
+                return -1;
+        }
+
+        if (!this->parents) {
+                gf_log (this->name, GF_LOG_WARNING,
+                        "dangling volume. check volfile ");
+        }
+
+        /* Check for 'scheduler' in volume */
+        scheduler = dict_get (this->options, "scheduler");
+        if (!scheduler) {
+                gf_log (this->name, GF_LOG_ERROR,
+                        "\"option scheduler <x>\" is missing in volfile");
+                return -1;
+        }
+
+        /* Setting "option namespace <node>" */
+        data = dict_get (this->options, "namespace");
+        if(!data) {
+                gf_log (this->name, GF_LOG_CRITICAL,
+                        "namespace option not specified, Exiting");
+                return -1;
+        }
+        /* Search namespace in the child node, if found, exit */
+        trav = this->children;
+        while (trav) {
+                if (strcmp (trav->xlator->name, data->data) == 0)
+                        break;
+                trav = trav->next;
+        }
+        if (trav) {
+                gf_log (this->name, GF_LOG_CRITICAL,
+                        "namespace node used as a subvolume, Exiting");
+                return -1;
+        }
+
+        /* Search for the namespace node, if found, continue */
+        ns_xl = this->next;
+        while (ns_xl) {
+                if (strcmp (ns_xl->name, data->data) == 0)
+                        break;
+                ns_xl = ns_xl->next;
+        }
+        if (!ns_xl) {
+                gf_log (this->name, GF_LOG_CRITICAL,
+                        "namespace node not found in volfile, Exiting");
+                return -1;
+        }
+
+        gf_log (this->name, GF_LOG_DEBUG,
+                "namespace node specified as %s", data->data);
+
+        _private = CALLOC (1, sizeof (*_private));
+        ERR_ABORT (_private);
+        _private->sched_ops = get_scheduler (this, scheduler->data);
+        if (!_private->sched_ops) {
+                gf_log (this->name, GF_LOG_CRITICAL,
+                        "Error while loading scheduler. Exiting");
+                FREE (_private);
+                return -1;
+        }
+
+        if (ns_xl->parents) {
+                gf_log (this->name, GF_LOG_CRITICAL,
+                        "Namespace node should not be a child of any other node. Exiting");
+                FREE (_private);
+                return -1;
+        }
+
+        _private->namespace = ns_xl;
+
+        /* update _private structure */
+        {
+                count = 0;
+                trav = this->children;
+                /* Get the number of child count */
+                while (trav) {
+                        count++;
+                        trav = trav->next;
+                }
+
+                gf_log (this->name, GF_LOG_DEBUG,
+                        "Child node count is %d", count);
+
+                _private->child_count = count;
+                if (count == 1) {
+                        /* TODO: Should I error out here? */
+                        gf_log (this->name, GF_LOG_CRITICAL,
+                                "WARNING: You have defined only one "
+                                "\"subvolumes\" for unify volume. It may not "
+                                "be the desired config, review your volume "
+                                "volfile. If this is how you are testing it,"
+                                " you may hit some performance penalty");
+                }
+
+                /* One pointer per child plus a last slot for the namespace
+                 * node: the element is a pointer, not a whole xlator_t.
+                 */
+                _private->xl_array = CALLOC (count + 1,
+                                             sizeof (*_private->xl_array));
+                ERR_ABORT (_private->xl_array);
+
+                count = 0;
+                trav = this->children;
+                while (trav) {
+                        _private->xl_array[count++] = trav->xlator;
+                        trav = trav->next;
+                }
+                _private->xl_array[count] = _private->namespace;
+
+                /* self-heal part, start with generation '1' */
+                _private->inode_generation = 1;
+                /* Because, Foreground part is tested well */
+                _private->self_heal = ZR_UNIFY_FG_SELF_HEAL;
+                data = dict_get (this->options, "self-heal");
+                if (data) {
+                        if (strcasecmp (data->data, "off") == 0)
+                                _private->self_heal = ZR_UNIFY_SELF_HEAL_OFF;
+
+                        if (strcasecmp (data->data, "foreground") == 0)
+                                _private->self_heal = ZR_UNIFY_FG_SELF_HEAL;
+
+                        if (strcasecmp (data->data, "background") == 0)
+                                _private->self_heal = ZR_UNIFY_BG_SELF_HEAL;
+                }
+
+                /* optimist - ask bulde for more about it */
+                data = dict_get (this->options, "optimist");
+                if (data) {
+                        if (gf_string2boolean (data->data,
+                                               &_private->optimist) == -1) {
+                                gf_log (this->name, GF_LOG_ERROR,
+                                        "optimist accepts only boolean "
+                                        "options");
+                        }
+                }
+
+                LOCK_INIT (&_private->lock);
+        }
+
+        /* Now that everything is fine.
+         */
+        this->private = (void *)_private;
+        {
+                /* Initialize scheduler, if everything else is successful */
+                ret = _private->sched_ops->init (this);
+                if (ret == -1) {
+                        gf_log (this->name, GF_LOG_CRITICAL,
+                                "Initializing scheduler failed, Exiting");
+                        /* this->private is already published: reset it and
+                         * release the array and the lock together with it.
+                         */
+                        unify_init_cleanup (this, _private);
+                        return -1;
+                }
+
+                ret = 0;
+
+                /* This section is required because some fops may look
+                 * for 'xl->parent' variable
+                 */
+                xlparent = CALLOC (1, sizeof (*xlparent));
+                ERR_ABORT (xlparent);
+                xlparent->xlator = this;
+                if (!ns_xl->parents) {
+                        ns_xl->parents = xlparent;
+                } else {
+                        parent = ns_xl->parents;
+                        while (parent->next)
+                                parent = parent->next;
+                        parent->next = xlparent;
+                }
+                /* Initialize the namespace volume */
+                if (!ns_xl->ready) {
+                        ret = xlator_tree_init (ns_xl);
+                        if (ret) {
+                                gf_log (this->name, GF_LOG_ERROR,
+                                        "initializing namespace node failed, "
+                                        "Exiting");
+                                /* Same order as fini (): the scheduler is
+                                 * shut down while this->private is still
+                                 * set.  xlparent stays linked into
+                                 * ns_xl->parents.
+                                 */
+                                _private->sched_ops->fini (this);
+                                unify_init_cleanup (this, _private);
+                                return -1;
+                        }
+                }
+        }
+
+        /* Tell namespace node that init is done */
+        ns_xl->notify (ns_xl, GF_EVENT_PARENT_UP, this);
+
+        return 0;
+}
+
+/**
+ * fini - Free all the allocated memory
+ *
+ * Does nothing when this->private is NULL (init () failed or never ran).
+ */
+void
+fini (xlator_t *this)
+{
+        unify_private_t *priv = this->private;
+
+        if (!priv)
+                return;
+
+        priv->sched_ops->fini (this);
+        this->private = NULL;
+        LOCK_DESTROY (&priv->lock);
+        FREE (priv->xl_array);
+        FREE (priv);
+        return;
+}
+
+
+struct xlator_fops fops = {
+        .stat        = unify_stat,
+        .chmod       = unify_chmod,
+        .readlink    = unify_readlink,
+        .mknod       = unify_mknod,
+        .mkdir       = unify_mkdir,
+        .unlink      = unify_unlink,
+        .rmdir       = unify_rmdir,
+        .symlink     = unify_symlink,
+        .rename      = unify_rename,
+        .link        = unify_link,
+        .chown       = unify_chown,
+        .truncate    = unify_truncate,
+        .create      = unify_create,
+        .open        = unify_open,
+        .readv       = unify_readv,
+        .writev      = unify_writev,
+        .statfs      = unify_statfs,
+        .flush       = unify_flush,
+        .fsync       = unify_fsync,
+        .setxattr    = unify_setxattr,
+        .getxattr    = unify_getxattr,
+        .removexattr = unify_removexattr,
+        .opendir     = unify_opendir,
+        .readdir     = unify_readdir,
+        .fsyncdir    = unify_fsyncdir,
+        .access      = unify_access,
+        .ftruncate   = 
unify_ftruncate, + .fstat = unify_fstat, + .lk = unify_lk, + .fchown = unify_fchown, + .fchmod = unify_fchmod, + .utimens = unify_utimens, + .lookup = unify_lookup, + .getdents = unify_getdents, + .checksum = unify_checksum, + .inodelk = unify_inodelk, + .finodelk = unify_finodelk, + .entrylk = unify_entrylk, + .fentrylk = unify_fentrylk, + .xattrop = unify_xattrop, + .fxattrop = unify_fxattrop +}; + +struct xlator_mops mops = { +}; + +struct xlator_cbks cbks = { +}; + +struct volume_options options[] = { + { .key = { "namespace" }, + .type = GF_OPTION_TYPE_XLATOR + }, + { .key = { "scheduler" }, + .value = { "alu", "rr", "random", "nufa", "switch" }, + .type = GF_OPTION_TYPE_STR + }, + { .key = {"self-heal"}, + .value = { "foreground", "background", "off" }, + .type = GF_OPTION_TYPE_STR + }, + /* TODO: remove it some time later */ + { .key = {"optimist"}, + .type = GF_OPTION_TYPE_BOOL + }, + + { .key = {NULL} }, +}; diff --git a/xlators/cluster/unify/src/unify.h b/xlators/cluster/unify/src/unify.h new file mode 100644 index 000000000..bc18dc53f --- /dev/null +++ b/xlators/cluster/unify/src/unify.h @@ -0,0 +1,132 @@ +/* + Copyright (c) 2006, 2007, 2008 Z RESEARCH, Inc. <http://www.zresearch.com> + This file is part of GlusterFS. + + GlusterFS is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published + by the Free Software Foundation; either version 3 of the License, + or (at your option) any later version. + + GlusterFS is distributed in the hope that it will be useful, but + WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program. If not, see + <http://www.gnu.org/licenses/>. 
+*/ + +#ifndef _CONFIG_H +#define _CONFIG_H +#include "config.h" +#endif + +#ifndef _UNIFY_H +#define _UNIFY_H + +#include "scheduler.h" +#include "list.h" + +#define MAX_DIR_ENTRY_STRING (32 * 1024) + +#define ZR_UNIFY_SELF_HEAL_OFF 0 +#define ZR_UNIFY_FG_SELF_HEAL 1 +#define ZR_UNIFY_BG_SELF_HEAL 2 + +/* Sometimes one should use completely random numbers.. its good :p */ +#define UNIFY_SELF_HEAL_GETDENTS_COUNT 1024 + +#define NS(xl) (((unify_private_t *)xl->private)->namespace) + +/* This is used to allocate memory for local structure */ +#define INIT_LOCAL(fr, loc) \ +do { \ + loc = CALLOC (1, sizeof (unify_local_t)); \ + ERR_ABORT (loc); \ + if (!loc) { \ + STACK_UNWIND (fr, -1, ENOMEM); \ + return 0; \ + } \ + fr->local = loc; \ + loc->op_ret = -1; \ + loc->op_errno = ENOENT; \ +} while (0) + + + +struct unify_private { + /* Update this structure depending on requirement */ + void *scheduler; /* THIS SHOULD BE THE FIRST VARIABLE, + if xlator is using scheduler */ + struct sched_ops *sched_ops; /* Scheduler options */ + xlator_t *namespace; /* ptr to namespace xlator */ + xlator_t **xl_array; + gf_boolean_t optimist; + int16_t child_count; + int16_t num_child_up; + uint8_t self_heal; + uint8_t is_up; + uint64_t inode_generation; + gf_lock_t lock; +}; +typedef struct unify_private unify_private_t; + +struct unify_self_heal_struct { + uint8_t dir_checksum[ZR_FILENAME_MAX]; + uint8_t ns_dir_checksum[ZR_FILENAME_MAX]; + uint8_t file_checksum[ZR_FILENAME_MAX]; + uint8_t ns_file_checksum[ZR_FILENAME_MAX]; + off_t *offset_list; + int *count_list; + dir_entry_t **entry_list; +}; + + +struct _unify_local_t { + int32_t call_count; + int32_t op_ret; + int32_t op_errno; + mode_t mode; + off_t offset; + dev_t dev; + uid_t uid; + gid_t gid; + int32_t flags; + int32_t entry_count; + int32_t count; // dir_entry_t count; + fd_t *fd; + struct stat stbuf; + struct statvfs statvfs_buf; + struct timespec tv[2]; + char *name; + int32_t revalidate; + + ino_t st_ino; + nlink_t 
st_nlink; + + dict_t *dict; + + int16_t *list; + int16_t *new_list; /* Used only in case of rename */ + int16_t index; + + int32_t failed; + int32_t return_eio; /* Used in case of different st-mode + present for a given path */ + + uint64_t inode_generation; /* used to store the per directory + * inode_generation. Got from inode's ctx + * of directory inodes + */ + + struct unify_self_heal_struct *sh_struct; + loc_t loc1, loc2; +}; +typedef struct _unify_local_t unify_local_t; + +int32_t zr_unify_self_heal (call_frame_t *frame, + xlator_t *this, + unify_local_t *local); + +#endif /* _UNIFY_H */ diff --git a/xlators/debug/Makefile.am b/xlators/debug/Makefile.am new file mode 100644 index 000000000..16cf893a1 --- /dev/null +++ b/xlators/debug/Makefile.am @@ -0,0 +1,3 @@ +SUBDIRS = trace error-gen + +CLEANFILES = diff --git a/xlators/debug/error-gen/Makefile.am b/xlators/debug/error-gen/Makefile.am new file mode 100644 index 000000000..d471a3f92 --- /dev/null +++ b/xlators/debug/error-gen/Makefile.am @@ -0,0 +1,3 @@ +SUBDIRS = src + +CLEANFILES = diff --git a/xlators/debug/error-gen/src/Makefile.am b/xlators/debug/error-gen/src/Makefile.am new file mode 100644 index 000000000..1bd7f332c --- /dev/null +++ b/xlators/debug/error-gen/src/Makefile.am @@ -0,0 +1,14 @@ + +xlator_LTLIBRARIES = error-gen.la +xlatordir = $(libdir)/glusterfs/$(PACKAGE_VERSION)/xlator/debug + +error_gen_la_LDFLAGS = -module -avoidversion + +error_gen_la_SOURCES = error-gen.c +error_gen_la_LIBADD = $(top_builddir)/libglusterfs/src/libglusterfs.la + +AM_CFLAGS = -fPIC -D_FILE_OFFSET_BITS=64 -D_GNU_SOURCE -Wall -D$(GF_HOST_OS)\ + -I$(top_srcdir)/libglusterfs/src -shared -nostartfiles $(GF_CFLAGS) + +CLEANFILES = + diff --git a/xlators/debug/error-gen/src/error-gen.c b/xlators/debug/error-gen/src/error-gen.c new file mode 100644 index 000000000..9c0b3253e --- /dev/null +++ b/xlators/debug/error-gen/src/error-gen.c @@ -0,0 +1,1780 @@ +/* + Copyright (c) 2008 Z RESEARCH, Inc. 
<http://www.zresearch.com> + This file is part of GlusterFS. + + GlusterFS is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published + by the Free Software Foundation; either version 3 of the License, + or (at your option) any later version. + + GlusterFS is distributed in the hope that it will be useful, but + WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program. If not, see + <http://www.gnu.org/licenses/>. +*/ + +#ifndef _CONFIG_H +#define _CONFIG_H +#include "config.h" +#endif + +#include "xlator.h" + +typedef struct { + int op_count; /* number of error_gen () calls so far */ +} eg_t; + /* error_gen - counts the call and returns 0, except on every 10th call, where it returns ENOTCONN when (count / 10) is even and EIO when it is odd. */ +int error_gen (xlator_t *this) +{ + eg_t *egp = NULL; + int count = 0; + egp = this->private; + count = ++egp->op_count; /* NOTE(review): unsynchronised increment - confirm fops cannot call this concurrently */ + if((count % 10) == 0) { + count = count / 10; + if ((count % 2) == 0) + return ENOTCONN; + else + return EIO; + } + return 0; +} + /* lookup pair: the _cbk unwinds the reply unchanged; the fop asks error_gen () and either unwinds -1 with that errno or winds to FIRST_CHILD. */ +static int32_t +error_gen_lookup_cbk (call_frame_t *frame, + void *cookie, + xlator_t *this, + int32_t op_ret, + int32_t op_errno, + inode_t *inode, + struct stat *buf, + dict_t *dict) +{ + STACK_UNWIND (frame, + op_ret, + op_errno, + inode, + buf, + dict); + return 0; +} + +int32_t +error_gen_lookup (call_frame_t *frame, + xlator_t *this, + loc_t *loc, + dict_t *xattr_req) +{ + int op_errno = 0; + op_errno = error_gen(this); + if (op_errno) { + GF_ERROR(this, "unwind(-1, %s)", strerror (op_errno)); + STACK_UNWIND (frame, -1, op_errno, NULL, NULL, NULL); + return 0; + } + STACK_WIND (frame, + error_gen_lookup_cbk, + FIRST_CHILD(this), + FIRST_CHILD(this)->fops->lookup, + loc, + xattr_req); + return 0; +} + + /* forget never consults error_gen (): it always returns 0 */ +int32_t +error_gen_forget (xlator_t *this, + inode_t *inode) +{ + return 0; +} + +int32_t +error_gen_stat_cbk (call_frame_t *frame, + void *cookie, + xlator_t *this, + int32_t op_ret, + 
int32_t op_errno, + struct stat *buf) +{ + STACK_UNWIND (frame, + op_ret, + op_errno, + buf); + return 0; +} + +int32_t +error_gen_stat (call_frame_t *frame, + xlator_t *this, + loc_t *loc) +{ + int op_errno = 0; + op_errno = error_gen(this); + if (op_errno) { + GF_ERROR(this, "unwind(-1, %s)", strerror (op_errno)); + STACK_UNWIND (frame, -1, op_errno, NULL); + return 0; + } + STACK_WIND (frame, + error_gen_stat_cbk, + FIRST_CHILD(this), + FIRST_CHILD(this)->fops->stat, + loc); + return 0; +} + +int32_t +error_gen_chmod_cbk (call_frame_t *frame, + void *cookie, + xlator_t *this, + int32_t op_ret, + int32_t op_errno, + struct stat *buf) +{ + STACK_UNWIND (frame, + op_ret, + op_errno, + buf); + return 0; +} + +int32_t +error_gen_chmod (call_frame_t *frame, + xlator_t *this, + loc_t *loc, + mode_t mode) +{ + int op_errno = 0; + op_errno = error_gen(this); + if (op_errno) { + GF_ERROR(this, "unwind(-1, %s)", strerror (op_errno)); + STACK_UNWIND (frame, -1, op_errno, NULL); + return 0; + } + STACK_WIND (frame, + error_gen_chmod_cbk, + FIRST_CHILD(this), + FIRST_CHILD(this)->fops->chmod, + loc, + mode); + return 0; +} + + +int32_t +error_gen_fchmod_cbk (call_frame_t *frame, + void *cookie, + xlator_t *this, + int32_t op_ret, + int32_t op_errno, + struct stat *buf) +{ + STACK_UNWIND (frame, + op_ret, + op_errno, + buf); + return 0; +} + +int32_t +error_gen_fchmod (call_frame_t *frame, + xlator_t *this, + fd_t *fd, + mode_t mode) +{ + int op_errno = 0; + op_errno = error_gen(this); + if (op_errno) { + GF_ERROR(this, "unwind(-1, %s)", strerror (op_errno)); + STACK_UNWIND (frame, -1, op_errno, NULL); + return 0; + } + STACK_WIND (frame, + error_gen_fchmod_cbk, + FIRST_CHILD(this), + FIRST_CHILD(this)->fops->fchmod, + fd, + mode); + return 0; +} + +int32_t +error_gen_chown_cbk (call_frame_t *frame, + void *cookie, + xlator_t *this, + int32_t op_ret, + int32_t op_errno, + struct stat *buf) +{ + STACK_UNWIND (frame, + op_ret, + op_errno, + buf); + return 0; +} + +int32_t 
+error_gen_chown (call_frame_t *frame, + xlator_t *this, + loc_t *loc, + uid_t uid, + gid_t gid) +{ + int op_errno = 0; + op_errno = error_gen(this); + if (op_errno) { + GF_ERROR(this, "unwind(-1, %s)", strerror (op_errno)); + STACK_UNWIND (frame, -1, op_errno, NULL); + return 0; + } + STACK_WIND (frame, + error_gen_chown_cbk, + FIRST_CHILD(this), + FIRST_CHILD(this)->fops->chown, + loc, + uid, + gid); + return 0; +} + +int32_t +error_gen_fchown_cbk (call_frame_t *frame, + void *cookie, + xlator_t *this, + int32_t op_ret, + int32_t op_errno, + struct stat *buf) +{ + STACK_UNWIND (frame, + op_ret, + op_errno, + buf); + return 0; +} + +int32_t +error_gen_fchown (call_frame_t *frame, + xlator_t *this, + fd_t *fd, + uid_t uid, + gid_t gid) +{ + int op_errno = 0; + op_errno = error_gen(this); + if (op_errno) { + GF_ERROR(this, "unwind(-1, %s)", strerror (op_errno)); + STACK_UNWIND (frame, -1, op_errno, NULL); + return 0; + } + STACK_WIND (frame, + error_gen_fchown_cbk, + FIRST_CHILD(this), + FIRST_CHILD(this)->fops->fchown, + fd, + uid, + gid); + return 0; +} + +int32_t +error_gen_truncate_cbk (call_frame_t *frame, + void *cookie, + xlator_t *this, + int32_t op_ret, + int32_t op_errno, + struct stat *buf) +{ + STACK_UNWIND (frame, + op_ret, + op_errno, + buf); + return 0; +} + +int32_t +error_gen_truncate (call_frame_t *frame, + xlator_t *this, + loc_t *loc, + off_t offset) +{ + int op_errno = 0; + op_errno = error_gen(this); + if (op_errno) { + GF_ERROR(this, "unwind(-1, %s)", strerror (op_errno)); + STACK_UNWIND (frame, -1, op_errno, NULL); + return 0; + } + STACK_WIND (frame, + error_gen_truncate_cbk, + FIRST_CHILD(this), + FIRST_CHILD(this)->fops->truncate, + loc, + offset); + return 0; +} + +int32_t +error_gen_ftruncate_cbk (call_frame_t *frame, + void *cookie, + xlator_t *this, + int32_t op_ret, + int32_t op_errno, + struct stat *buf) +{ + STACK_UNWIND (frame, + op_ret, + op_errno, + buf); + return 0; +} + +int32_t +error_gen_ftruncate (call_frame_t *frame, + 
xlator_t *this, + fd_t *fd, + off_t offset) +{ + int op_errno = 0; + op_errno = error_gen(this); + if (op_errno) { + GF_ERROR(this, "unwind(-1, %s)", strerror (op_errno)); + STACK_UNWIND (frame, -1, op_errno, NULL); + return 0; + } + STACK_WIND (frame, + error_gen_ftruncate_cbk, + FIRST_CHILD(this), + FIRST_CHILD(this)->fops->ftruncate, + fd, + offset); + return 0; +} + +int32_t +error_gen_utimens_cbk (call_frame_t *frame, + void *cookie, + xlator_t *this, + int32_t op_ret, + int32_t op_errno, + struct stat *buf) +{ + STACK_UNWIND (frame, + op_ret, + op_errno, + buf); + return 0; +} + + +int32_t +error_gen_utimens (call_frame_t *frame, + xlator_t *this, + loc_t *loc, + struct timespec tv[2]) +{ + int op_errno = 0; + op_errno = error_gen(this); + if (op_errno) { + GF_ERROR(this, "unwind(-1, %s)", strerror (op_errno)); + STACK_UNWIND (frame, -1, op_errno, NULL); + return 0; + } + STACK_WIND (frame, + error_gen_utimens_cbk, + FIRST_CHILD(this), + FIRST_CHILD(this)->fops->utimens, + loc, + tv); + return 0; +} + +int32_t +error_gen_access_cbk (call_frame_t *frame, + void *cookie, + xlator_t *this, + int32_t op_ret, + int32_t op_errno) +{ + STACK_UNWIND (frame, + op_ret, + op_errno); + return 0; +} + +int32_t +error_gen_access (call_frame_t *frame, + xlator_t *this, + loc_t *loc, + int32_t mask) +{ + int op_errno = 0; + op_errno = error_gen(this); + if (op_errno) { + GF_ERROR(this, "unwind(-1, %s)", strerror (op_errno)); + STACK_UNWIND (frame, -1, op_errno); + return 0; + } + STACK_WIND (frame, + error_gen_access_cbk, + FIRST_CHILD(this), + FIRST_CHILD(this)->fops->access, + loc, + mask); + return 0; +} + + +int32_t +error_gen_readlink_cbk (call_frame_t *frame, + void *cookie, + xlator_t *this, + int32_t op_ret, + int32_t op_errno, + const char *path) +{ + STACK_UNWIND (frame, + op_ret, + op_errno, + path); + return 0; +} + +int32_t +error_gen_readlink (call_frame_t *frame, + xlator_t *this, + loc_t *loc, + size_t size) +{ + int op_errno = 0; + op_errno = 
error_gen(this); + if (op_errno) { + GF_ERROR(this, "unwind(-1, %s)", strerror (op_errno)); + STACK_UNWIND (frame, -1, op_errno, NULL); + return 0; + } + STACK_WIND (frame, + error_gen_readlink_cbk, + FIRST_CHILD(this), + FIRST_CHILD(this)->fops->readlink, + loc, + size); + return 0; +} + + +int32_t +error_gen_mknod_cbk (call_frame_t *frame, + void *cookie, + xlator_t *this, + int32_t op_ret, + int32_t op_errno, + inode_t *inode, + struct stat *buf) +{ + STACK_UNWIND (frame, + op_ret, + op_errno, + inode, + buf); + return 0; +} + +int32_t +error_gen_mknod (call_frame_t *frame, + xlator_t *this, + loc_t *loc, + mode_t mode, + dev_t rdev) +{ + int op_errno = 0; + op_errno = error_gen(this); + if (op_errno) { + GF_ERROR(this, "unwind(-1, %s)", strerror (op_errno)); + STACK_UNWIND (frame, -1, op_errno, NULL, NULL); + return 0; + } + STACK_WIND (frame, + error_gen_mknod_cbk, + FIRST_CHILD(this), + FIRST_CHILD(this)->fops->mknod, + loc, mode, rdev); + return 0; +} + +int32_t +error_gen_mkdir_cbk (call_frame_t *frame, + void *cookie, + xlator_t *this, + int32_t op_ret, + int32_t op_errno, + inode_t *inode, + struct stat *buf) +{ + STACK_UNWIND (frame, + op_ret, + op_errno, + inode, + buf); + return 0; +} + +int32_t +error_gen_mkdir (call_frame_t *frame, + xlator_t *this, + loc_t *loc, + mode_t mode) +{ + int op_errno = 0; + op_errno = error_gen(this); + if (op_errno) { + GF_ERROR(this, "unwind(-1, %s)", strerror (op_errno)); + STACK_UNWIND (frame, -1, op_errno, NULL, NULL); + return 0; + } + STACK_WIND (frame, + error_gen_mkdir_cbk, + FIRST_CHILD(this), + FIRST_CHILD(this)->fops->mkdir, + loc, mode); + return 0; +} + +int32_t +error_gen_unlink_cbk (call_frame_t *frame, + void *cookie, + xlator_t *this, + int32_t op_ret, + int32_t op_errno) +{ + STACK_UNWIND (frame, op_ret, op_errno); + return 0; +} + +int32_t +error_gen_unlink (call_frame_t *frame, + xlator_t *this, + loc_t *loc) +{ + int op_errno = 0; + op_errno = error_gen(this); + if (op_errno) { + GF_ERROR(this, 
"unwind(-1, %s)", strerror (op_errno)); + STACK_UNWIND (frame, -1, op_errno); + return 0; + } + STACK_WIND (frame, + error_gen_unlink_cbk, + FIRST_CHILD(this), + FIRST_CHILD(this)->fops->unlink, + loc); + return 0; +} + +int32_t +error_gen_rmdir_cbk (call_frame_t *frame, + void *cookie, + xlator_t *this, + int32_t op_ret, + int32_t op_errno) +{ + STACK_UNWIND (frame, + op_ret, + op_errno); + return 0; +} + +int32_t +error_gen_rmdir (call_frame_t *frame, + xlator_t *this, + loc_t *loc) +{ + int op_errno = 0; + op_errno = error_gen(this); + if (op_errno) { + GF_ERROR(this, "unwind(-1, %s)", strerror (op_errno)); + STACK_UNWIND (frame, -1, op_errno); + return 0; + } + STACK_WIND (frame, + error_gen_rmdir_cbk, + FIRST_CHILD(this), + FIRST_CHILD(this)->fops->rmdir, + loc); + return 0; +} + + +int32_t +error_gen_symlink_cbk (call_frame_t *frame, + void *cookie, + xlator_t *this, + int32_t op_ret, + int32_t op_errno, + inode_t *inode, + struct stat *buf) +{ + STACK_UNWIND (frame, op_ret, op_errno, inode, buf); + return 0; +} + +int32_t +error_gen_symlink (call_frame_t *frame, + xlator_t *this, + const char *linkpath, + loc_t *loc) +{ + int op_errno = 0; + op_errno = error_gen(this); + if (op_errno) { + GF_ERROR(this, "unwind(-1, %s)", strerror (op_errno)); + STACK_UNWIND (frame, -1, op_errno, NULL, NULL); + return 0; + } + STACK_WIND (frame, + error_gen_symlink_cbk, + FIRST_CHILD(this), + FIRST_CHILD(this)->fops->symlink, + linkpath, loc); + return 0; +} + + +int32_t +error_gen_rename_cbk (call_frame_t *frame, + void *cookie, + xlator_t *this, + int32_t op_ret, + int32_t op_errno, + struct stat *buf) +{ + STACK_UNWIND (frame, op_ret, op_errno, buf); + return 0; +} + +int32_t +error_gen_rename (call_frame_t *frame, + xlator_t *this, + loc_t *oldloc, + loc_t *newloc) +{ + int op_errno = 0; + op_errno = error_gen(this); + if (op_errno) { + GF_ERROR(this, "unwind(-1, %s)", strerror (op_errno)); + STACK_UNWIND (frame, -1, op_errno, NULL); + return 0; + } + STACK_WIND (frame, + 
error_gen_rename_cbk, + FIRST_CHILD(this), + FIRST_CHILD(this)->fops->rename, + oldloc, newloc); + return 0; +} + + +int32_t +error_gen_link_cbk (call_frame_t *frame, + void *cookie, + xlator_t *this, + int32_t op_ret, + int32_t op_errno, + inode_t *inode, + struct stat *buf) +{ + STACK_UNWIND (frame, op_ret, op_errno, inode, buf); + return 0; +} + +int32_t +error_gen_link (call_frame_t *frame, + xlator_t *this, + loc_t *oldloc, + loc_t *newloc) +{ + int op_errno = 0; + op_errno = error_gen(this); + if (op_errno) { + GF_ERROR(this, "unwind(-1, %s)", strerror (op_errno)); + STACK_UNWIND (frame, -1, op_errno, NULL, NULL); + return 0; + } + STACK_WIND (frame, + error_gen_link_cbk, + FIRST_CHILD(this), + FIRST_CHILD(this)->fops->link, + oldloc, newloc); + return 0; +} + + +int32_t +error_gen_create_cbk (call_frame_t *frame, + void *cookie, + xlator_t *this, + int32_t op_ret, + int32_t op_errno, + fd_t *fd, + inode_t *inode, + struct stat *buf) +{ + STACK_UNWIND (frame, op_ret, op_errno, fd, inode, buf); + return 0; +} + +int32_t +error_gen_create (call_frame_t *frame, + xlator_t *this, + loc_t *loc, + int32_t flags, + mode_t mode, fd_t *fd) +{ + int op_errno = 0; + op_errno = error_gen(this); + if (op_errno) { + GF_ERROR(this, "unwind(-1, %s)", strerror (op_errno)); + STACK_UNWIND (frame, -1, op_errno, NULL, NULL, NULL); + return 0; + } + + STACK_WIND (frame, error_gen_create_cbk, + FIRST_CHILD(this), + FIRST_CHILD(this)->fops->create, + loc, flags, mode, fd); + return 0; +} + +int32_t +error_gen_open_cbk (call_frame_t *frame, + void *cookie, + xlator_t *this, + int32_t op_ret, + int32_t op_errno, + fd_t *fd) +{ + STACK_UNWIND (frame, + op_ret, + op_errno, + fd); + return 0; +} + +int32_t +error_gen_open (call_frame_t *frame, + xlator_t *this, + loc_t *loc, + int32_t flags, fd_t *fd) +{ + int op_errno = 0; + op_errno = error_gen(this); + if (op_errno) { + GF_ERROR(this, "unwind(-1, %s)", strerror (op_errno)); + STACK_UNWIND (frame, -1, op_errno, NULL); + return 0; + } 
+ + STACK_WIND (frame, + error_gen_open_cbk, + FIRST_CHILD(this), + FIRST_CHILD(this)->fops->open, + loc, flags, fd); + return 0; +} +/* readv callback: hands the received op_ret, op_errno, vector, count and stbuf to STACK_UNWIND unchanged. */ +int32_t +error_gen_readv_cbk (call_frame_t *frame, + void *cookie, + xlator_t *this, + int32_t op_ret, + int32_t op_errno, + struct iovec *vector, + int32_t count, + struct stat *stbuf) +{ + STACK_UNWIND (frame, + op_ret, + op_errno, + vector, + count, + stbuf); + return 0; +} +/* readv fop: when error_gen() returns a non-zero errno, logs it and unwinds with -1, a NULL vector, count 0 and a NULL stat; otherwise winds to the first child's readv. */ +int32_t +error_gen_readv (call_frame_t *frame, + xlator_t *this, + fd_t *fd, + size_t size, + off_t offset) +{ + int op_errno = 0; + op_errno = error_gen(this); + if (op_errno) { + GF_ERROR(this, "unwind(-1, %s)", strerror (op_errno)); + STACK_UNWIND (frame, -1, op_errno, NULL, 0, NULL); + return 0; + } + + + STACK_WIND (frame, + error_gen_readv_cbk, + FIRST_CHILD(this), + FIRST_CHILD(this)->fops->readv, + fd, + size, + offset); + return 0; +} + +/* writev callback: hands the received op_ret, op_errno and stbuf to STACK_UNWIND unchanged. */ +int32_t +error_gen_writev_cbk (call_frame_t *frame, + void *cookie, + xlator_t *this, + int32_t op_ret, + int32_t op_errno, + struct stat *stbuf) +{ + STACK_UNWIND (frame, + op_ret, + op_errno, + stbuf); + return 0; +} +/* writev fop: when error_gen() returns a non-zero errno, logs it and unwinds with -1 and a NULL stat; otherwise winds to the first child's writev. */ +int32_t +error_gen_writev (call_frame_t *frame, + xlator_t *this, + fd_t *fd, + struct iovec *vector, + int32_t count, + off_t off) +{ + int op_errno = 0; + op_errno = error_gen(this); + if (op_errno) { + GF_ERROR(this, "unwind(-1, %s)", strerror (op_errno)); + STACK_UNWIND (frame, -1, op_errno, NULL); + return 0; + } + + + STACK_WIND (frame, + error_gen_writev_cbk, + FIRST_CHILD(this), + FIRST_CHILD(this)->fops->writev, + fd, + vector, + count, + off); + return 0; +} +/* flush callback: hands the received op_ret and op_errno to STACK_UNWIND unchanged. */ +int32_t +error_gen_flush_cbk (call_frame_t *frame, + void *cookie, + xlator_t *this, + int32_t op_ret, + int32_t op_errno) +{ + STACK_UNWIND (frame, + op_ret, + op_errno); + return 0; +} +/* flush fop: when error_gen() returns a non-zero errno, logs it and unwinds with -1; the pass-through wind follows. */ +int32_t +error_gen_flush (call_frame_t *frame, + xlator_t *this, + fd_t *fd) +{ + int op_errno = 0; + op_errno = error_gen(this); + if (op_errno) { + GF_ERROR(this, "unwind(-1, %s)", strerror (op_errno)); + STACK_UNWIND (frame, -1, 
op_errno); + return 0; + } + + STACK_WIND (frame, + error_gen_flush_cbk, + FIRST_CHILD(this), + FIRST_CHILD(this)->fops->flush, + fd); + return 0; +} + +/* fsync callback: hands the received op_ret and op_errno to STACK_UNWIND unchanged. */ +int32_t +error_gen_fsync_cbk (call_frame_t *frame, + void *cookie, + xlator_t *this, + int32_t op_ret, + int32_t op_errno) +{ + STACK_UNWIND (frame, + op_ret, + op_errno); + return 0; +} +/* fsync fop: when error_gen() returns a non-zero errno, logs it and unwinds with -1; otherwise winds to the first child's fsync. */ +int32_t +error_gen_fsync (call_frame_t *frame, + xlator_t *this, + fd_t *fd, + int32_t flags) +{ + int op_errno = 0; + op_errno = error_gen(this); + if (op_errno) { + GF_ERROR(this, "unwind(-1, %s)", strerror (op_errno)); + STACK_UNWIND (frame, -1, op_errno); + return 0; + } + + STACK_WIND (frame, + error_gen_fsync_cbk, + FIRST_CHILD(this), + FIRST_CHILD(this)->fops->fsync, + fd, + flags); + return 0; +} +/* fstat callback: hands the received op_ret, op_errno and buf to STACK_UNWIND unchanged. */ +int32_t +error_gen_fstat_cbk (call_frame_t *frame, + void *cookie, + xlator_t *this, + int32_t op_ret, + int32_t op_errno, + struct stat *buf) +{ + STACK_UNWIND (frame, + op_ret, + op_errno, + buf); + return 0; +} +/* fstat fop: when error_gen() returns a non-zero errno, logs it and unwinds with -1 and a NULL stat; otherwise winds to the first child's fstat. */ +int32_t +error_gen_fstat (call_frame_t *frame, + xlator_t *this, + fd_t *fd) +{ + int op_errno = 0; + op_errno = error_gen(this); + if (op_errno) { + GF_ERROR(this, "unwind(-1, %s)", strerror (op_errno)); + STACK_UNWIND (frame, -1, op_errno, NULL); + return 0; + } + + STACK_WIND (frame, + error_gen_fstat_cbk, + FIRST_CHILD(this), + FIRST_CHILD(this)->fops->fstat, + fd); + return 0; +} +/* opendir callback: hands the received op_ret, op_errno and fd to STACK_UNWIND unchanged. */ +int32_t +error_gen_opendir_cbk (call_frame_t *frame, + void *cookie, + xlator_t *this, + int32_t op_ret, + int32_t op_errno, + fd_t *fd) +{ + STACK_UNWIND (frame, + op_ret, + op_errno, + fd); + return 0; +} +/* opendir fop: when error_gen() returns a non-zero errno, logs it and unwinds with -1 and a NULL fd; otherwise winds to the first child's opendir. */ +int32_t +error_gen_opendir (call_frame_t *frame, + xlator_t *this, + loc_t *loc, fd_t *fd) +{ + int op_errno = 0; + op_errno = error_gen(this); + if (op_errno) { + GF_ERROR(this, "unwind(-1, %s)", strerror (op_errno)); + STACK_UNWIND (frame, -1, op_errno, NULL); + return 0; + } + + STACK_WIND (frame, + error_gen_opendir_cbk, + FIRST_CHILD(this), + FIRST_CHILD(this)->fops->opendir, + loc, fd); + return 0; +} + + +int32_t 
+error_gen_getdents_cbk (call_frame_t *frame, + void *cookie, + xlator_t *this, + int32_t op_ret, + int32_t op_errno, + dir_entry_t *entries, + int32_t count) +{ + STACK_UNWIND (frame, + op_ret, + op_errno, + entries, + count); + return 0; +} +/* getdents fop: when error_gen() returns a non-zero errno, logs it and unwinds with -1, NULL entries and count 0; otherwise winds to the first child's getdents. */ +int32_t +error_gen_getdents (call_frame_t *frame, + xlator_t *this, + fd_t *fd, + size_t size, + off_t offset, + int32_t flag) +{ + int op_errno = 0; + op_errno = error_gen(this); + if (op_errno) { + GF_ERROR(this, "unwind(-1, %s)", strerror (op_errno)); + STACK_UNWIND (frame, -1, op_errno, NULL, 0); + return 0; + } + + STACK_WIND (frame, + error_gen_getdents_cbk, + FIRST_CHILD(this), + FIRST_CHILD(this)->fops->getdents, + fd, + size, + offset, + flag); + return 0; +} + +/* setdents callback: hands the received op_ret and op_errno to STACK_UNWIND unchanged; setdents carries no result payload. */ +int32_t +error_gen_setdents_cbk (call_frame_t *frame, + void *cookie, + xlator_t *this, + int32_t op_ret, + int32_t op_errno) +{ + STACK_UNWIND (frame, + op_ret, + op_errno); + return 0; +} +/* setdents fop: when error_gen() returns a non-zero errno, logs it and unwinds with exactly the (op_ret, op_errno) pair that error_gen_setdents_cbk declares; otherwise winds to the first child's setdents. */ +int32_t +error_gen_setdents (call_frame_t *frame, + xlator_t *this, + fd_t *fd, + int32_t flags, + dir_entry_t *entries, + int32_t count) +{ + int op_errno = 0; + op_errno = error_gen(this); + if (op_errno) { + GF_ERROR(this, "unwind(-1, %s)", strerror (op_errno)); + STACK_UNWIND (frame, -1, op_errno); + return 0; + } + + STACK_WIND (frame, + error_gen_setdents_cbk, + FIRST_CHILD(this), + FIRST_CHILD(this)->fops->setdents, + fd, + flags, + entries, + count); + return 0; +} + +/* fsyncdir callback: hands the received op_ret and op_errno to STACK_UNWIND unchanged. */ +int32_t +error_gen_fsyncdir_cbk (call_frame_t *frame, + void *cookie, + xlator_t *this, + int32_t op_ret, + int32_t op_errno) +{ + STACK_UNWIND (frame, + op_ret, + op_errno); + return 0; +} +/* fsyncdir fop: when error_gen() returns a non-zero errno, logs it and unwinds with -1; otherwise winds to the first child's fsyncdir. */ +int32_t +error_gen_fsyncdir (call_frame_t *frame, + xlator_t *this, + fd_t *fd, + int32_t flags) +{ + int op_errno = 0; + op_errno = error_gen(this); + if (op_errno) { + GF_ERROR(this, "unwind(-1, %s)", strerror (op_errno)); + STACK_UNWIND (frame, -1, op_errno); + return 0; + } + + STACK_WIND (frame, + error_gen_fsyncdir_cbk, + FIRST_CHILD(this), + FIRST_CHILD(this)->fops->fsyncdir, + fd, + 
flags); + return 0; +} + +/* statfs callback: hands the received op_ret, op_errno and buf to STACK_UNWIND unchanged. */ +int32_t +error_gen_statfs_cbk (call_frame_t *frame, + void *cookie, + xlator_t *this, + int32_t op_ret, + int32_t op_errno, + struct statvfs *buf) +{ + STACK_UNWIND (frame, + op_ret, + op_errno, + buf); + return 0; +} +/* statfs fop: when error_gen() returns a non-zero errno, logs it and unwinds with -1 and a NULL buffer; otherwise winds to the first child's statfs. */ +int32_t +error_gen_statfs (call_frame_t *frame, + xlator_t *this, + loc_t *loc) +{ + int op_errno = 0; + op_errno = error_gen(this); + if (op_errno) { + GF_ERROR(this, "unwind(-1, %s)", strerror (op_errno)); + STACK_UNWIND (frame, -1, op_errno, NULL); + return 0; + } + + STACK_WIND (frame, + error_gen_statfs_cbk, + FIRST_CHILD(this), + FIRST_CHILD(this)->fops->statfs, + loc); + return 0; +} + +/* setxattr callback: hands the received op_ret and op_errno to STACK_UNWIND unchanged. */ +int32_t +error_gen_setxattr_cbk (call_frame_t *frame, + void *cookie, + xlator_t *this, + int32_t op_ret, + int32_t op_errno) +{ + STACK_UNWIND (frame, + op_ret, + op_errno); + return 0; +} +/* setxattr fop: when error_gen() returns a non-zero errno, logs it and unwinds with -1; otherwise winds to the first child's setxattr. */ +int32_t +error_gen_setxattr (call_frame_t *frame, + xlator_t *this, + loc_t *loc, + dict_t *dict, + int32_t flags) +{ + int op_errno = 0; + op_errno = error_gen(this); + if (op_errno) { + GF_ERROR(this, "unwind(-1, %s)", strerror (op_errno)); + STACK_UNWIND (frame, -1, op_errno); + return 0; + } + + STACK_WIND (frame, + error_gen_setxattr_cbk, + FIRST_CHILD(this), + FIRST_CHILD(this)->fops->setxattr, + loc, + dict, + flags); + return 0; +} +/* getxattr callback: hands the received op_ret, op_errno and dict to STACK_UNWIND unchanged. */ +int32_t +error_gen_getxattr_cbk (call_frame_t *frame, + void *cookie, + xlator_t *this, + int32_t op_ret, + int32_t op_errno, + dict_t *dict) +{ + STACK_UNWIND (frame, + op_ret, + op_errno, + dict); + return 0; +} +/* getxattr fop: when error_gen() returns a non-zero errno, logs it and unwinds with -1 and a NULL dict; otherwise winds to the first child's getxattr. */ +int32_t +error_gen_getxattr (call_frame_t *frame, + xlator_t *this, + loc_t *loc, + const char *name) +{ + int op_errno = 0; + op_errno = error_gen(this); + if (op_errno) { + GF_ERROR(this, "unwind(-1, %s)", strerror (op_errno)); + STACK_UNWIND (frame, -1, op_errno, NULL); + return 0; + } + + STACK_WIND (frame, + error_gen_getxattr_cbk, + FIRST_CHILD(this), + FIRST_CHILD(this)->fops->getxattr, + loc, + name); + return 0; +} + +int32_t +error_gen_xattrop_cbk (call_frame_t *frame, + void 
*cookie, + xlator_t *this, + int32_t op_ret, + int32_t op_errno, + dict_t *dict) +{ + STACK_UNWIND (frame, op_ret, op_errno, dict); + return 0; +} +/* xattrop fop: when error_gen() returns a non-zero errno, logs it and unwinds with -1 and a NULL dict; otherwise winds to the first child's xattrop. */ +int32_t +error_gen_xattrop (call_frame_t *frame, + xlator_t *this, + loc_t *loc, + gf_xattrop_flags_t flags, + dict_t *dict) +{ + int op_errno = 0; + op_errno = error_gen(this); + if (op_errno) { + GF_ERROR(this, "unwind(-1, %s)", strerror (op_errno)); + STACK_UNWIND (frame, -1, op_errno, NULL); + return 0; + } + + STACK_WIND (frame, + error_gen_xattrop_cbk, + FIRST_CHILD(this), + FIRST_CHILD(this)->fops->xattrop, + loc, flags, dict); + return 0; +} +/* fxattrop callback: hands the received op_ret, op_errno and dict to STACK_UNWIND unchanged. */ +int32_t +error_gen_fxattrop_cbk (call_frame_t *frame, + void *cookie, + xlator_t *this, + int32_t op_ret, + int32_t op_errno, + dict_t *dict) +{ + STACK_UNWIND (frame, op_ret, op_errno, dict); + return 0; +} +/* fxattrop fop: when error_gen() returns a non-zero errno, logs it and unwinds with -1 and a NULL dict; otherwise winds to the first child's fxattrop. */ +int32_t +error_gen_fxattrop (call_frame_t *frame, + xlator_t *this, + fd_t *fd, + gf_xattrop_flags_t flags, + dict_t *dict) +{ + int op_errno = 0; + op_errno = error_gen(this); + if (op_errno) { + GF_ERROR(this, "unwind(-1, %s)", strerror (op_errno)); + STACK_UNWIND (frame, -1, op_errno, NULL); + return 0; + } + + STACK_WIND (frame, + error_gen_fxattrop_cbk, + FIRST_CHILD(this), + FIRST_CHILD(this)->fops->fxattrop, + fd, flags, dict); + return 0; +} +/* removexattr callback: hands the received op_ret and op_errno to STACK_UNWIND unchanged; removexattr carries no result payload. */ +int32_t +error_gen_removexattr_cbk (call_frame_t *frame, + void *cookie, + xlator_t *this, + int32_t op_ret, + int32_t op_errno) +{ + STACK_UNWIND (frame, + op_ret, + op_errno); + return 0; +} +/* removexattr fop: when error_gen() returns a non-zero errno, logs it and unwinds with exactly the (op_ret, op_errno) pair that error_gen_removexattr_cbk declares; otherwise winds to the first child's removexattr. */ +int32_t +error_gen_removexattr (call_frame_t *frame, + xlator_t *this, + loc_t *loc, + const char *name) +{ + int op_errno = 0; + op_errno = error_gen(this); + if (op_errno) { + GF_ERROR(this, "unwind(-1, %s)", strerror (op_errno)); + STACK_UNWIND (frame, -1, op_errno); + return 0; + } + + STACK_WIND (frame, + error_gen_removexattr_cbk, + FIRST_CHILD(this), + FIRST_CHILD(this)->fops->removexattr, + loc, + name); + return 0; +} + +int32_t +error_gen_lk_cbk (call_frame_t *frame, + void *cookie, + xlator_t *this, + 
int32_t op_ret, + int32_t op_errno, + struct flock *lock) +{ + STACK_UNWIND (frame, + op_ret, + op_errno, + lock); + return 0; +} +/* lk fop: when error_gen() returns a non-zero errno, logs it and unwinds with -1 and a NULL lock; otherwise winds to the first child's lk. */ +int32_t +error_gen_lk (call_frame_t *frame, + xlator_t *this, + fd_t *fd, + int32_t cmd, + struct flock *lock) +{ + int op_errno = 0; + op_errno = error_gen(this); + if (op_errno) { + GF_ERROR(this, "unwind(-1, %s)", strerror (op_errno)); + STACK_UNWIND (frame, -1, op_errno, NULL); + return 0; + } + + STACK_WIND (frame, + error_gen_lk_cbk, + FIRST_CHILD(this), + FIRST_CHILD(this)->fops->lk, + fd, + cmd, + lock); + return 0; +} + +/* inodelk callback: hands the received op_ret and op_errno to STACK_UNWIND unchanged. */ +int32_t +error_gen_inodelk_cbk (call_frame_t *frame, void *cookie, + xlator_t *this, int32_t op_ret, int32_t op_errno) + +{ + STACK_UNWIND (frame, op_ret, op_errno); + return 0; +} + +/* inodelk fop: when error_gen() returns a non-zero errno, logs it and unwinds with -1; otherwise winds to the first child's inodelk. */ +int32_t +error_gen_inodelk (call_frame_t *frame, xlator_t *this, + loc_t *loc, int32_t cmd, struct flock *lock) +{ + int op_errno = 0; + op_errno = error_gen(this); + if (op_errno) { + GF_ERROR(this, "unwind(-1, %s)", strerror (op_errno)); + STACK_UNWIND (frame, -1, op_errno); + return 0; + } + + STACK_WIND (frame, + error_gen_inodelk_cbk, + FIRST_CHILD(this), + FIRST_CHILD(this)->fops->inodelk, + loc, cmd, lock); + return 0; +} + +/* finodelk callback: hands the received op_ret and op_errno to STACK_UNWIND unchanged. */ +int32_t +error_gen_finodelk_cbk (call_frame_t *frame, void *cookie, + xlator_t *this, int32_t op_ret, int32_t op_errno) + +{ + STACK_UNWIND (frame, op_ret, op_errno); + return 0; +} + +/* finodelk fop: when error_gen() returns a non-zero errno, logs it and unwinds with -1; otherwise winds to the first child's finodelk. */ +int32_t +error_gen_finodelk (call_frame_t *frame, xlator_t *this, + fd_t *fd, int32_t cmd, struct flock *lock) +{ + int op_errno = 0; + op_errno = error_gen(this); + if (op_errno) { + GF_ERROR(this, "unwind(-1, %s)", strerror (op_errno)); + STACK_UNWIND (frame, -1, op_errno); + return 0; + } + + STACK_WIND (frame, + error_gen_finodelk_cbk, + FIRST_CHILD(this), + FIRST_CHILD(this)->fops->finodelk, + fd, cmd, lock); + return 0; +} + +/* entrylk callback: hands the received op_ret and op_errno to STACK_UNWIND unchanged. */ +int32_t +error_gen_entrylk_cbk (call_frame_t *frame, void *cookie, + xlator_t *this, int32_t op_ret, int32_t op_errno) + +{ + STACK_UNWIND (frame, op_ret, op_errno); + return 0; +} 
+/* entrylk fop: when error_gen() returns a non-zero errno, logs it and unwinds with -1; otherwise winds to the first child's entrylk. */ +int32_t +error_gen_entrylk (call_frame_t *frame, xlator_t *this, + loc_t *loc, const char *basename, + entrylk_cmd cmd, entrylk_type type) +{ + int op_errno = 0; + op_errno = error_gen(this); + if (op_errno) { + GF_ERROR(this, "unwind(-1, %s)", strerror (op_errno)); + STACK_UNWIND (frame, -1, op_errno); + return 0; + } + + STACK_WIND (frame, error_gen_entrylk_cbk, + FIRST_CHILD(this), + FIRST_CHILD(this)->fops->entrylk, + loc, basename, cmd, type); + return 0; +} +/* fentrylk callback: hands the received op_ret and op_errno to STACK_UNWIND unchanged. */ +int32_t +error_gen_fentrylk_cbk (call_frame_t *frame, void *cookie, + xlator_t *this, int32_t op_ret, int32_t op_errno) + +{ + STACK_UNWIND (frame, op_ret, op_errno); + return 0; +} +/* fentrylk fop: when error_gen() returns a non-zero errno, logs it and unwinds with -1; otherwise winds to the first child's fentrylk. */ +int32_t +error_gen_fentrylk (call_frame_t *frame, xlator_t *this, + fd_t *fd, const char *basename, + entrylk_cmd cmd, entrylk_type type) +{ + int op_errno = 0; + op_errno = error_gen(this); + if (op_errno) { + GF_ERROR(this, "unwind(-1, %s)", strerror (op_errno)); + STACK_UNWIND (frame, -1, op_errno); + return 0; + } + + STACK_WIND (frame, error_gen_fentrylk_cbk, + FIRST_CHILD(this), + FIRST_CHILD(this)->fops->fentrylk, + fd, basename, cmd, type); + return 0; +} + + +/* Management operations */ +/* stats callback: hands the received op_ret, op_errno and stats to STACK_UNWIND unchanged. */ +int32_t +error_gen_stats_cbk (call_frame_t *frame, + void *cookie, + xlator_t *this, + int32_t op_ret, + int32_t op_errno, + struct xlator_stats *stats) +{ + STACK_UNWIND (frame, + op_ret, + op_errno, + stats); + return 0; +} + +/* stats mop: when error_gen() returns a non-zero errno, logs it and unwinds with -1 and NULL stats; otherwise winds to the first child's mops->stats. */ +int32_t +error_gen_stats (call_frame_t *frame, + xlator_t *this, + int32_t flags) +{ + int op_errno = 0; + op_errno = error_gen(this); + if (op_errno) { + GF_ERROR(this, "unwind(-1, %s)", strerror (op_errno)); + STACK_UNWIND (frame, -1, op_errno, NULL); + return 0; + } + + STACK_WIND (frame, + error_gen_stats_cbk, + FIRST_CHILD(this), + FIRST_CHILD(this)->mops->stats, + flags); + return 0; +} + + +/* getspec callback: hands the received op_ret, op_errno and spec_data to STACK_UNWIND unchanged. */ +int32_t +error_gen_getspec_cbk (call_frame_t *frame, + void *cookie, + xlator_t *this, + int32_t op_ret, + int32_t op_errno, + char *spec_data) +{ + STACK_UNWIND (frame, + op_ret, + op_errno, + 
spec_data); + return 0; +} + +/* getspec mop: when error_gen() returns a non-zero errno, logs it and unwinds with -1 and NULL spec data; otherwise winds to the first child's mops->getspec. */ +int32_t +error_gen_getspec (call_frame_t *frame, + xlator_t *this, + const char *key, + int32_t flags) +{ + int op_errno = 0; + op_errno = error_gen(this); + if (op_errno) { + GF_ERROR(this, "unwind(-1, %s)", strerror (op_errno)); + STACK_UNWIND (frame, -1, op_errno, NULL); + return 0; + } + + STACK_WIND (frame, + error_gen_getspec_cbk, + FIRST_CHILD(this), + FIRST_CHILD(this)->mops->getspec, + key, flags); + return 0; +} + +/* checksum callback: hands the received op_ret, op_errno and both checksum buffers to STACK_UNWIND unchanged. */ +int32_t +error_gen_checksum_cbk (call_frame_t *frame, + void *cookie, + xlator_t *this, + int32_t op_ret, + int32_t op_errno, + uint8_t *file_checksum, + uint8_t *dir_checksum) +{ + STACK_UNWIND (frame, + op_ret, + op_errno, + file_checksum, + dir_checksum); + return 0; +} + +/* checksum fop: when error_gen() returns a non-zero errno, logs it and unwinds with -1 and two NULL checksums; otherwise winds to the first child's checksum. */ +int32_t +error_gen_checksum (call_frame_t *frame, + xlator_t *this, + loc_t *loc, + int32_t flag) +{ + int op_errno = 0; + op_errno = error_gen(this); + if (op_errno) { + GF_ERROR(this, "unwind(-1, %s)", strerror (op_errno)); + STACK_UNWIND (frame, -1, op_errno, NULL, NULL); + return 0; + } + + STACK_WIND (frame, + error_gen_checksum_cbk, + FIRST_CHILD(this), + FIRST_CHILD(this)->fops->checksum, + loc, + flag); + return 0; +} +/* readdir callback: hands the received op_ret, op_errno and entries to STACK_UNWIND unchanged. */ +int32_t +error_gen_readdir_cbk (call_frame_t *frame, + void *cookie, + xlator_t *this, + int32_t op_ret, + int32_t op_errno, + gf_dirent_t *entries) +{ + STACK_UNWIND (frame, op_ret, op_errno, entries); + return 0; +} + +/* readdir fop: when error_gen() returns a non-zero errno, logs it and unwinds with -1 and NULL entries; otherwise winds to the first child's readdir. */ +int32_t +error_gen_readdir (call_frame_t *frame, + xlator_t *this, + fd_t *fd, + size_t size, + off_t off) +{ + int op_errno = 0; + op_errno = error_gen(this); + if (op_errno) { + GF_ERROR(this, "unwind(-1, %s)", strerror (op_errno)); + STACK_UNWIND (frame, -1, op_errno, NULL); + return 0; + } + + STACK_WIND (frame, + error_gen_readdir_cbk, + FIRST_CHILD(this), + FIRST_CHILD(this)->fops->readdir, + fd, size, off); + return 0; +} +/* closedir hook: does nothing and returns 0. */ +int32_t +error_gen_closedir (xlator_t *this, + fd_t *fd) +{ + return 0; +} +/* close hook: does nothing and returns 0. */ +int32_t +error_gen_close (xlator_t *this, + fd_t *fd) +{ + return 0; +} + +int +init 
(xlator_t *this) +{ + eg_t *pvt = NULL; + + if (!this->children || this->children->next) { + gf_log (this->name, GF_LOG_ERROR, + "error-gen not configured with one subvolume"); + return -1; + } + + if (!this->parents) { + gf_log (this->name, GF_LOG_WARNING, + "dangling volume. check volfile "); + } + + pvt = CALLOC (1, sizeof (eg_t)); + this->private = pvt; + return 0; +} + +void +fini (xlator_t *this) +{ + gf_log (this->name, GF_LOG_DEBUG, "fini called"); + return; +} + + +struct xlator_fops fops = { + .lookup = error_gen_lookup, + .stat = error_gen_stat, + .readlink = error_gen_readlink, + .mknod = error_gen_mknod, + .mkdir = error_gen_mkdir, + .unlink = error_gen_unlink, + .rmdir = error_gen_rmdir, + .symlink = error_gen_symlink, + .rename = error_gen_rename, + .link = error_gen_link, + .chmod = error_gen_chmod, + .chown = error_gen_chown, + .truncate = error_gen_truncate, + .utimens = error_gen_utimens, + .create = error_gen_create, + .open = error_gen_open, + .readv = error_gen_readv, + .writev = error_gen_writev, + .statfs = error_gen_statfs, + .flush = error_gen_flush, + .fsync = error_gen_fsync, + .setxattr = error_gen_setxattr, + .getxattr = error_gen_getxattr, + .removexattr = error_gen_removexattr, + .opendir = error_gen_opendir, + .readdir = error_gen_readdir, + .getdents = error_gen_getdents, + .fsyncdir = error_gen_fsyncdir, + .access = error_gen_access, + .ftruncate = error_gen_ftruncate, + .fstat = error_gen_fstat, + .lk = error_gen_lk, + .fchmod = error_gen_fchmod, + .fchown = error_gen_fchown, + .setdents = error_gen_setdents, + .lookup_cbk = error_gen_lookup_cbk, + .checksum = error_gen_checksum, + .xattrop = error_gen_xattrop, + .fxattrop = error_gen_fxattrop, + .inodelk = error_gen_inodelk, + .finodelk = error_gen_finodelk, + .entrylk = error_gen_entrylk, + .fentrylk = error_gen_fentrylk +}; + +struct xlator_mops mops = { + .stats = error_gen_stats, + .getspec = error_gen_getspec, +}; + +struct xlator_cbks cbks = { + .release = error_gen_close, 
+ .releasedir = error_gen_closedir, +}; diff --git a/xlators/debug/trace/Makefile.am b/xlators/debug/trace/Makefile.am new file mode 100644 index 000000000..d471a3f92 --- /dev/null +++ b/xlators/debug/trace/Makefile.am @@ -0,0 +1,3 @@ +SUBDIRS = src + +CLEANFILES = diff --git a/xlators/debug/trace/src/Makefile.am b/xlators/debug/trace/src/Makefile.am new file mode 100644 index 000000000..0f1679a04 --- /dev/null +++ b/xlators/debug/trace/src/Makefile.am @@ -0,0 +1,14 @@ +# Builds trace.la from trace.c as a libtool module installed under xlator/debug. +xlator_LTLIBRARIES = trace.la +xlatordir = $(libdir)/glusterfs/$(PACKAGE_VERSION)/xlator/debug +# libtool spells the no-versioning flag -avoid-version. +trace_la_LDFLAGS = -module -avoid-version + +trace_la_SOURCES = trace.c +trace_la_LIBADD = $(top_builddir)/libglusterfs/src/libglusterfs.la + +AM_CFLAGS = -fPIC -D_FILE_OFFSET_BITS=64 -D_GNU_SOURCE -Wall -D$(GF_HOST_OS)\ + -I$(top_srcdir)/libglusterfs/src -shared -nostartfiles $(GF_CFLAGS) + +CLEANFILES = + diff --git a/xlators/debug/trace/src/trace.c b/xlators/debug/trace/src/trace.c new file mode 100644 index 000000000..3ccf11a83 --- /dev/null +++ b/xlators/debug/trace/src/trace.c @@ -0,0 +1,2321 @@ +/* + Copyright (c) 2006, 2007, 2008 Z RESEARCH, Inc. <http://www.zresearch.com> + This file is part of GlusterFS. + + GlusterFS is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published + by the Free Software Foundation; either version 3 of the License, + or (at your option) any later version. + + GlusterFS is distributed in the hope that it will be useful, but + WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program. If not, see + <http://www.gnu.org/licenses/>. 
+*/ + +#ifndef _CONFIG_H +#define _CONFIG_H +#include "config.h" +#endif + +/** + * xlators/debug/trace : + * This translator logs all the arguments to the fops/mops and also + * their _cbk functions, which later passes the call to next layer. + * Very helpful translator for debugging. + */ + +#include <time.h> +#include <errno.h> +#include "glusterfs.h" +#include "xlator.h" +#include "common-utils.h" +/* Logs an error naming the file, function and condition text when cond is true; it does not return, so execution continues after the log. */ +#define ERR_EINVAL_NORETURN(cond) \ +do \ + { \ + if ((cond)) \ + { \ + gf_log ("ERROR", GF_LOG_ERROR, \ + "%s: %s: (%s) is true", \ + __FILE__, __FUNCTION__, #cond); \ + } \ + } while (0) +/* Private state type for this file: a single int32_t debug flag. */ +typedef struct trace_private { + int32_t debug_flag; +} trace_private_t; +/* Table indexed by GF_FOP_* value; the callbacks test the enabled member before logging. */ +struct { + char *name; + int enabled; +} trace_fop_names[GF_FOP_MAXVALUE]; +/* create callback: when GF_FOP_CREATE tracing is enabled, logs fd, inode number and the stat fields on success (op_ret >= 0) or op_ret/op_errno otherwise, then unwinds with the unchanged arguments. NOTE(review): localtime() may return shared static storage - confirm these callbacks never run concurrently. */ +int32_t +trace_create_cbk (call_frame_t *frame, + void *cookie, + xlator_t *this, + int32_t op_ret, + int32_t op_errno, + fd_t *fd, + inode_t *inode, + struct stat *buf) +{ + char atime_buf[256], mtime_buf[256], ctime_buf[256]; + ERR_EINVAL_NORETURN (!this); + + if (trace_fop_names[GF_FOP_CREATE].enabled) { + if (op_ret >= 0) { + strftime (atime_buf, 256, "[%b %d %H:%M:%S]", localtime (&buf->st_atime)); + strftime (mtime_buf, 256, "[%b %d %H:%M:%S]", localtime (&buf->st_mtime)); + strftime (ctime_buf, 256, "[%b %d %H:%M:%S]", localtime (&buf->st_ctime)); + + gf_log (this->name, GF_LOG_NORMAL, + "%"PRId64": (op_ret=%d, fd=%p, ino=%"PRIu64"), " + "*buf {st_dev=%"GF_PRI_DEV", st_ino=%"PRIu64", " + "st_mode=%d, st_nlink=%"GF_PRI_NLINK", st_uid=%d, " + "st_gid=%d, st_rdev=%"GF_PRI_DEV", st_size=%"PRId64", " + "st_blksize=%"GF_PRI_BLKSIZE", st_blocks=%"PRId64", " + "st_atime=%s, st_mtime=%s, st_ctime=%s})", + frame->root->unique, op_ret, fd, inode->ino, buf->st_dev, + buf->st_ino, buf->st_mode, buf->st_nlink, + buf->st_uid, buf->st_gid, buf->st_rdev, buf->st_size, + buf->st_blksize, + buf->st_blocks, atime_buf, mtime_buf, ctime_buf); + } else { + gf_log (this->name, GF_LOG_NORMAL, + "%"PRId64": (op_ret=%d, op_errno=%d)", + 
frame->root->unique, op_ret, op_errno); + } + } + + STACK_UNWIND (frame, op_ret, op_errno, fd, inode, buf); + return 0; +} +/* open callback: when GF_FOP_OPEN tracing is enabled, logs op_ret, op_errno and the fd pointer, then unwinds with the unchanged arguments. */ +int32_t +trace_open_cbk (call_frame_t *frame, + void *cookie, + xlator_t *this, + int32_t op_ret, + int32_t op_errno, + fd_t *fd) +{ + ERR_EINVAL_NORETURN (!this); + + if (trace_fop_names[GF_FOP_OPEN].enabled) { + gf_log (this->name, GF_LOG_NORMAL, + "%"PRId64": (op_ret=%d, op_errno=%d, *fd=%p)", + frame->root->unique, op_ret, op_errno, fd); + } + + STACK_UNWIND (frame, op_ret, op_errno, fd); + return 0; +} +/* stat callback: when GF_FOP_STAT tracing is enabled, logs the stat fields with formatted timestamps on success (op_ret >= 0) or op_ret/op_errno otherwise, then unwinds with the unchanged arguments. */ +int32_t +trace_stat_cbk (call_frame_t *frame, + void *cookie, + xlator_t *this, + int32_t op_ret, + int32_t op_errno, + struct stat *buf) +{ + char atime_buf[256], mtime_buf[256], ctime_buf[256]; + ERR_EINVAL_NORETURN (!this); + + if (trace_fop_names[GF_FOP_STAT].enabled) { + + if (op_ret >= 0) { + strftime (atime_buf, 256, "[%b %d %H:%M:%S]", localtime (&buf->st_atime)); + strftime (mtime_buf, 256, "[%b %d %H:%M:%S]", localtime (&buf->st_mtime)); + strftime (ctime_buf, 256, "[%b %d %H:%M:%S]", localtime (&buf->st_ctime)); + + gf_log (this->name, GF_LOG_NORMAL, + "%"PRId64": (op_ret=%d, buf {st_dev=%"GF_PRI_DEV", " + "st_ino=%"PRIu64", st_mode=%d, st_nlink=%"GF_PRI_NLINK", " + "st_uid=%d, st_gid=%d, st_rdev=%"GF_PRI_DEV", st_size=%"PRId64 + ", st_blksize=%"GF_PRI_BLKSIZE", st_blocks=%"PRId64", " + "st_atime=%s, st_mtime=%s, st_ctime=%s})", + frame->root->unique, op_ret, buf->st_dev, buf->st_ino, + buf->st_mode, buf->st_nlink, buf->st_uid, buf->st_gid, + buf->st_rdev, buf->st_size, buf->st_blksize, + buf->st_blocks, atime_buf, mtime_buf, ctime_buf); + } else { + gf_log (this->name, GF_LOG_NORMAL, + "%"PRId64": (op_ret=%d, op_errno=%d)", + frame->root->unique, op_ret, op_errno); + } + } + + STACK_UNWIND (frame, op_ret, op_errno, buf); + return 0; +} +/* readv callback: the header starts here and the body continues below. */ +int32_t +trace_readv_cbk (call_frame_t *frame, + void *cookie, + xlator_t *this, + int32_t op_ret, + int32_t op_errno, + struct iovec *vector, + int32_t count, + struct stat *buf) +{ + 
char atime_buf[256], mtime_buf[256], ctime_buf[256]; + ERR_EINVAL_NORETURN (!this); + /* Log only when tracing of GF_FOP_READ is switched on. */ + if (trace_fop_names[GF_FOP_READ].enabled) { + /* Success path formats the three timestamps and logs the stat fields; failure path logs op_ret/op_errno. */ + if (op_ret >= 0) { + strftime (atime_buf, 256, "[%b %d %H:%M:%S]", localtime (&buf->st_atime)); + strftime (mtime_buf, 256, "[%b %d %H:%M:%S]", localtime (&buf->st_mtime)); + strftime (ctime_buf, 256, "[%b %d %H:%M:%S]", localtime (&buf->st_ctime)); + + gf_log (this->name, GF_LOG_NORMAL, + "%"PRId64": (op_ret=%d, *buf {st_dev=%"GF_PRI_DEV", " + "st_ino=%"PRIu64", st_mode=%d, st_nlink=%"GF_PRI_NLINK", " + "st_uid=%d, st_gid=%d, st_rdev=%"GF_PRI_DEV", " + "st_size=%"PRId64", st_blksize=%"GF_PRI_BLKSIZE", " + "st_blocks=%"PRId64", st_atime=%s, st_mtime=%s, st_ctime=%s})", + frame->root->unique, op_ret, buf->st_dev, buf->st_ino, + buf->st_mode, buf->st_nlink, buf->st_uid, buf->st_gid, + buf->st_rdev, buf->st_size, buf->st_blksize, buf->st_blocks, + atime_buf, mtime_buf, ctime_buf); + } else { + gf_log (this->name, GF_LOG_NORMAL, + "%"PRId64": (op_ret=%d, op_errno=%d)", + frame->root->unique, op_ret, op_errno); + } + } + /* The unwind arguments are exactly what this callback received. */ + STACK_UNWIND (frame, op_ret, op_errno, vector, count, buf); + return 0; +} +/* writev callback: when GF_FOP_WRITE tracing is enabled, logs inode number, size, blocks and timestamps on success (op_ret >= 0) or op_ret/op_errno otherwise, then unwinds with the unchanged arguments. */ +int32_t +trace_writev_cbk (call_frame_t *frame, + void *cookie, + xlator_t *this, + int32_t op_ret, + int32_t op_errno, + struct stat *buf) +{ + char atime_buf[256], mtime_buf[256], ctime_buf[256]; + ERR_EINVAL_NORETURN (!this); + + if (trace_fop_names[GF_FOP_WRITE].enabled) { + if (op_ret >= 0) { + strftime (atime_buf, 256, "[%b %d %H:%M:%S]", localtime (&buf->st_atime)); + strftime (mtime_buf, 256, "[%b %d %H:%M:%S]", localtime (&buf->st_mtime)); + strftime (ctime_buf, 256, "[%b %d %H:%M:%S]", localtime (&buf->st_ctime)); + + gf_log (this->name, GF_LOG_NORMAL, + "%"PRId64": (op_ret=%d, *buf {st_ino=%"PRIu64", " + "st_size=%"PRId64", st_blocks=%"PRId64", st_atime=%s, " + "st_mtime=%s, st_ctime=%s})", + frame->root->unique, op_ret, buf->st_ino, buf->st_size, + buf->st_blocks, atime_buf, mtime_buf, ctime_buf); + } else { + gf_log (this->name, 
GF_LOG_NORMAL, + "%"PRId64": (op_ret=%d, op_errno=%d)", + frame->root->unique, op_ret, op_errno); + } + } + + STACK_UNWIND (frame, op_ret, op_errno, buf); + return 0; +} +/* getdents callback: when GF_FOP_GETDENTS tracing is enabled, logs op_ret, op_errno and count, then unwinds with the unchanged arguments. */ +int32_t +trace_getdents_cbk (call_frame_t *frame, + void *cookie, + xlator_t *this, + int32_t op_ret, + int32_t op_errno, + dir_entry_t *entries, + int32_t count) +{ + ERR_EINVAL_NORETURN (!this ); + + if (trace_fop_names[GF_FOP_GETDENTS].enabled) { + gf_log (this->name, GF_LOG_NORMAL, + "%"PRId64": (op_ret=%d, op_errno=%d, count=%d)", + frame->root->unique, op_ret, op_errno, count); + } + + STACK_UNWIND (frame, op_ret, op_errno, entries, count); + return 0; +} +/* readdir callback: when GF_FOP_READDIR tracing is enabled, logs op_ret and op_errno, then unwinds with the unchanged arguments. */ +int32_t +trace_readdir_cbk (call_frame_t *frame, + void *cookie, + xlator_t *this, + int32_t op_ret, + int32_t op_errno, + gf_dirent_t *buf) +{ + ERR_EINVAL_NORETURN (!this ); + + if (trace_fop_names[GF_FOP_READDIR].enabled) { + gf_log (this->name, GF_LOG_NORMAL, + "%"PRId64" :(op_ret=%d, op_errno=%d)", + frame->root->unique, op_ret, op_errno); + } + + STACK_UNWIND (frame, op_ret, op_errno, buf); + + return 0; +} +/* fsync callback: when GF_FOP_FSYNC tracing is enabled, logs op_ret and op_errno, then unwinds with the unchanged arguments. */ +int32_t +trace_fsync_cbk (call_frame_t *frame, + void *cookie, + xlator_t *this, + int32_t op_ret, + int32_t op_errno) +{ + ERR_EINVAL_NORETURN (!this ); + + if (trace_fop_names[GF_FOP_FSYNC].enabled) { + gf_log (this->name, GF_LOG_NORMAL, + "%"PRId64": (op_ret=%d, op_errno=%d)", + frame->root->unique, op_ret, op_errno); + } + + STACK_UNWIND (frame, op_ret, op_errno); + return 0; +} +/* chown callback: when GF_FOP_CHOWN tracing is enabled, formats the three timestamps and logs on success (op_ret >= 0); the log call continues below. */ +int32_t +trace_chown_cbk (call_frame_t *frame, + void *cookie, + xlator_t *this, + int32_t op_ret, + int32_t op_errno, + struct stat *buf) +{ + char atime_buf[256], mtime_buf[256], ctime_buf[256]; + ERR_EINVAL_NORETURN (!this ); + + if (trace_fop_names[GF_FOP_CHOWN].enabled) { + if (op_ret >= 0) { + strftime (atime_buf, 256, "[%b %d %H:%M:%S]", localtime (&buf->st_atime)); + strftime (mtime_buf, 256, "[%b %d %H:%M:%S]", localtime (&buf->st_mtime)); + strftime (ctime_buf, 256, "[%b %d %H:%M:%S]", localtime (&buf->st_ctime)); + + gf_log 
(this->name, GF_LOG_NORMAL, + "%"PRId64": (op_ret=%d, *buf {st_ino=%"PRIu64", st_mode=%d, " + "st_uid=%d, st_gid=%d, st_atime=%s, st_mtime=%s, st_ctime=%s})", + frame->root->unique, op_ret, buf->st_ino, buf->st_mode, + buf->st_uid, buf->st_gid, atime_buf, mtime_buf, ctime_buf); + } else { + gf_log (this->name, GF_LOG_NORMAL, + "%"PRId64": (op_ret=%d, op_errno=%d)", + frame->root->unique, op_ret, op_errno); + } + } + + STACK_UNWIND (frame, op_ret, op_errno, buf); + return 0; +} +/* chmod callback: when GF_FOP_CHMOD tracing is enabled, logs inode number, mode and timestamps on success (op_ret >= 0) or op_ret/op_errno otherwise, then unwinds with the unchanged arguments. */ +int32_t +trace_chmod_cbk (call_frame_t *frame, + void *cookie, + xlator_t *this, + int32_t op_ret, + int32_t op_errno, + struct stat *buf) +{ + char atime_buf[256], mtime_buf[256], ctime_buf[256]; + ERR_EINVAL_NORETURN (!this ); + + if (trace_fop_names[GF_FOP_CHMOD].enabled) { + if (op_ret >= 0) { + strftime (atime_buf, 256, "[%b %d %H:%M:%S]", localtime (&buf->st_atime)); + strftime (mtime_buf, 256, "[%b %d %H:%M:%S]", localtime (&buf->st_mtime)); + strftime (ctime_buf, 256, "[%b %d %H:%M:%S]", localtime (&buf->st_ctime)); + + gf_log (this->name, GF_LOG_NORMAL, + "%"PRId64": (op_ret=%d, *buf {st_ino=%"PRIu64", st_mode=%d, " + "st_atime=%s, st_mtime=%s, st_ctime=%s})", + frame->root->unique, op_ret, buf->st_ino, buf->st_mode, + atime_buf, mtime_buf, ctime_buf); + } else { + gf_log (this->name, GF_LOG_NORMAL, + "%"PRId64": (op_ret=%d, op_errno=%d)", + frame->root->unique, op_ret, op_errno); + } + } + + STACK_UNWIND (frame, op_ret, op_errno, buf); + return 0; +} +/* fchmod callback: when GF_FOP_FCHMOD tracing is enabled, formats the three timestamps on success (op_ret >= 0); the log call continues below. */ +int32_t +trace_fchmod_cbk (call_frame_t *frame, + void *cookie, + xlator_t *this, + int32_t op_ret, + int32_t op_errno, + struct stat *buf) +{ + char atime_buf[256], mtime_buf[256], ctime_buf[256]; + ERR_EINVAL_NORETURN (!this ); + + if (trace_fop_names[GF_FOP_FCHMOD].enabled) { + if (op_ret >= 0) { + strftime (atime_buf, 256, "[%b %d %H:%M:%S]", localtime (&buf->st_atime)); + strftime (mtime_buf, 256, "[%b %d %H:%M:%S]", localtime (&buf->st_mtime)); + strftime (ctime_buf, 256, "[%b %d %H:%M:%S]", localtime (&buf->st_ctime)); + + 
gf_log (this->name, GF_LOG_NORMAL, + "%"PRId64": (op_ret=%d, *buf {st_ino=%"PRIu64", st_mode=%d, " + "st_atime=%s, st_mtime=%s, st_ctime=%s})", + frame->root->unique, op_ret, buf->st_ino, buf->st_mode, + atime_buf, mtime_buf, ctime_buf); + } else { + gf_log (this->name, GF_LOG_NORMAL, + "%"PRId64": (op_ret=%d, op_errno=%d)", + frame->root->unique, op_ret, op_errno); + } + } + + STACK_UNWIND (frame, op_ret, op_errno, buf); + return 0; +} +/* fchown callback: when GF_FOP_FCHOWN tracing is enabled, logs inode number, mode, uid, gid and timestamps on success (op_ret >= 0) or op_ret/op_errno otherwise, then unwinds with the unchanged arguments. */ +int32_t +trace_fchown_cbk (call_frame_t *frame, + void *cookie, + xlator_t *this, + int32_t op_ret, + int32_t op_errno, + struct stat *buf) +{ + char atime_buf[256], mtime_buf[256], ctime_buf[256]; + ERR_EINVAL_NORETURN (!this ); + + if (trace_fop_names[GF_FOP_FCHOWN].enabled) { + if (op_ret >= 0) { + strftime (atime_buf, 256, "[%b %d %H:%M:%S]", localtime (&buf->st_atime)); + strftime (mtime_buf, 256, "[%b %d %H:%M:%S]", localtime (&buf->st_mtime)); + strftime (ctime_buf, 256, "[%b %d %H:%M:%S]", localtime (&buf->st_ctime)); + + gf_log (this->name, GF_LOG_NORMAL, + "%"PRId64": (op_ret=%d, *buf {st_ino=%"PRIu64", st_mode=%d, " + "st_uid=%d, st_gid=%d, st_atime=%s, st_mtime=%s, st_ctime=%s})", + frame->root->unique, op_ret, buf->st_ino, buf->st_mode, + buf->st_uid, buf->st_gid, atime_buf, mtime_buf, ctime_buf); + } else { + gf_log (this->name, GF_LOG_NORMAL, + "%"PRId64": (op_ret=%d, op_errno=%d)", + frame->root->unique, op_ret, op_errno); + } + } + + STACK_UNWIND (frame, op_ret, op_errno, buf); + return 0; +} +/* unlink callback: when GF_FOP_UNLINK tracing is enabled, logs op_ret and op_errno, then unwinds with the unchanged arguments. */ +int32_t +trace_unlink_cbk (call_frame_t *frame, + void *cookie, + xlator_t *this, + int32_t op_ret, + int32_t op_errno) +{ + ERR_EINVAL_NORETURN (!this ); + + if (trace_fop_names[GF_FOP_UNLINK].enabled) { + gf_log (this->name, GF_LOG_NORMAL, + "%"PRId64": (op_ret=%d, op_errno=%d)", + frame->root->unique, op_ret, op_errno); + } + + STACK_UNWIND (frame, op_ret, op_errno); + return 0; +} +/* rename callback: the header starts here and the body continues below. */ +int32_t +trace_rename_cbk (call_frame_t *frame, + void *cookie, + xlator_t *this, + int32_t op_ret, + int32_t op_errno, + struct stat 
*buf) +{ + ERR_EINVAL_NORETURN (!this ); + + if (trace_fop_names[GF_FOP_RENAME].enabled) { + gf_log (this->name, GF_LOG_NORMAL, + "%"PRId64": (op_ret=%d, op_errno=%d, buf {st_ino=%"PRIu64"})", + frame->root->unique, op_ret, op_errno, + (buf? buf->st_ino : 0)); + } + + STACK_UNWIND (frame, op_ret, op_errno, buf); + return 0; +} +/* readlink callback: when GF_FOP_READLINK tracing is enabled, logs op_ret, op_errno and the link target, then unwinds with the unchanged arguments. A NULL buf is logged as "(null)" rather than handed to %s. */ +int32_t +trace_readlink_cbk (call_frame_t *frame, + void *cookie, + xlator_t *this, + int32_t op_ret, + int32_t op_errno, + const char *buf) +{ + ERR_EINVAL_NORETURN (!this ); + + if (trace_fop_names[GF_FOP_READLINK].enabled) { + gf_log (this->name, GF_LOG_NORMAL, + "%"PRId64": (op_ret=%d, op_errno=%d, buf=%s)", + frame->root->unique, op_ret, op_errno, (buf? buf : "(null)")); + } + + STACK_UNWIND (frame, op_ret, op_errno, buf); + return 0; +} +/* lookup callback: when GF_FOP_LOOKUP tracing is enabled, logs the inode number and stat fields on success (op_ret >= 0) or op_ret/op_errno otherwise, then unwinds with the unchanged arguments including xattr. */ +int32_t +trace_lookup_cbk (call_frame_t *frame, + void *cookie, + xlator_t *this, + int32_t op_ret, + int32_t op_errno, + inode_t *inode, + struct stat *buf, + dict_t *xattr) +{ + ERR_EINVAL_NORETURN (!this ); + + if (trace_fop_names[GF_FOP_LOOKUP].enabled) { + if (op_ret >= 0) { + gf_log (this->name, GF_LOG_NORMAL, + "%"PRId64": (op_ret=%d, ino=%"PRIu64", " + "*buf {st_dev=%"GF_PRI_DEV", st_ino=%"PRIu64", st_mode=%d, " + "st_nlink=%"GF_PRI_NLINK", st_uid=%d, st_gid=%d, " + "st_rdev=%"GF_PRI_DEV", st_size=%"PRId64", " + "st_blksize=%"GF_PRI_BLKSIZE", st_blocks=%"PRId64"})", + frame->root->unique, op_ret, inode->ino, buf->st_dev, buf->st_ino, + buf->st_mode, buf->st_nlink, buf->st_uid, buf->st_gid, + buf->st_rdev, buf->st_size, buf->st_blksize, buf->st_blocks); + } else { + gf_log (this->name, GF_LOG_NORMAL, + "%"PRId64": (op_ret=%d, op_errno=%d)", + frame->root->unique, op_ret, op_errno); + } + } + + STACK_UNWIND (frame, op_ret, op_errno, inode, buf, xattr); + return 0; +} +/* symlink callback: the header starts here and the body continues below. */ +int32_t +trace_symlink_cbk (call_frame_t *frame, + void *cookie, + xlator_t *this, + int32_t op_ret, + int32_t op_errno, + inode_t *inode, + struct stat *buf) +{ + char atime_buf[256], mtime_buf[256], ctime_buf[256]; + ERR_EINVAL_NORETURN 
(!this ); + + if (trace_fop_names[GF_FOP_SYMLINK].enabled) { + if (op_ret >= 0) { + strftime (atime_buf, 256, "[%b %d %H:%M:%S]", localtime (&buf->st_atime)); + strftime (mtime_buf, 256, "[%b %d %H:%M:%S]", localtime (&buf->st_mtime)); + strftime (ctime_buf, 256, "[%b %d %H:%M:%S]", localtime (&buf->st_ctime)); + + gf_log (this->name, GF_LOG_NORMAL, + "%"PRId64": (op_ret=%d, ino=%"PRIu64", *buf {st_ino=%"PRIu64", " + "st_mode=%d, st_nlink=%"GF_PRI_NLINK", st_uid=%d, st_gid=%d, " + "st_size=%"PRId64", st_blocks=%"PRId64", st_atime=%s, " + "st_mtime=%s, st_ctime=%s})", + frame->root->unique, op_ret, inode->ino, buf->st_ino, + buf->st_mode, buf->st_nlink, buf->st_uid, buf->st_gid, + buf->st_size, buf->st_blocks, atime_buf, mtime_buf, ctime_buf); + } else { + gf_log (this->name, GF_LOG_NORMAL, + "%"PRId64": (op_ret=%d, op_errno=%d)", + frame->root->unique, op_ret, op_errno); + } + } + + STACK_UNWIND (frame, op_ret, op_errno, inode, buf); + return 0; +} + +int32_t +trace_mknod_cbk (call_frame_t *frame, + void *cookie, + xlator_t *this, + int32_t op_ret, + int32_t op_errno, + inode_t *inode, + struct stat *buf) +{ + char atime_buf[256], mtime_buf[256], ctime_buf[256]; + ERR_EINVAL_NORETURN (!this ); + + if (trace_fop_names[GF_FOP_MKNOD].enabled) { + if (op_ret >= 0) { + strftime (atime_buf, 256, "[%b %d %H:%M:%S]", localtime (&buf->st_atime)); + strftime (mtime_buf, 256, "[%b %d %H:%M:%S]", localtime (&buf->st_mtime)); + strftime (ctime_buf, 256, "[%b %d %H:%M:%S]", localtime (&buf->st_ctime)); + + gf_log (this->name, GF_LOG_NORMAL, + "%"PRId64": (op_ret=%d, ino=%"PRIu64", *buf {st_dev=%"GF_PRI_DEV + ", st_ino=%"PRIu64", st_mode=%d, st_nlink=%"GF_PRI_NLINK", " + "st_uid=%d, st_gid=%d, st_rdev=%"GF_PRI_DEV", st_size=%"PRId64 + ", st_blksize=%"GF_PRI_BLKSIZE", st_blocks=%"PRId64", st_atime=%s, " + "st_mtime=%s, st_ctime=%s})", + frame->root->unique, op_ret, inode->ino, buf->st_dev, buf->st_ino, + buf->st_mode, buf->st_nlink, buf->st_uid, buf->st_gid, + buf->st_rdev, 
buf->st_size, buf->st_blksize, buf->st_blocks, + atime_buf, mtime_buf, ctime_buf); + } else { + gf_log (this->name, GF_LOG_NORMAL, + "%"PRId64": (op_ret=%d, op_errno=%d)", + frame->root->unique, op_ret, op_errno); + } + } + + STACK_UNWIND (frame, op_ret, op_errno, inode, buf); + return 0; +} + + +int32_t +trace_mkdir_cbk (call_frame_t *frame, + void *cookie, + xlator_t *this, + int32_t op_ret, + int32_t op_errno, + inode_t *inode, + struct stat *buf) +{ + ERR_EINVAL_NORETURN (!this ); + + if (trace_fop_names[GF_FOP_MKDIR].enabled) { + gf_log (this->name, GF_LOG_NORMAL, + "%"PRId64": (op_ret=%d, op_errno=%d, ino=%"PRIu64"", + frame->root->unique, op_ret, op_errno, + (inode? inode->ino : 0)); + } + + STACK_UNWIND (frame, op_ret, op_errno, inode, buf); + return 0; +} + +int32_t +trace_link_cbk (call_frame_t *frame, + void *cookie, + xlator_t *this, + int32_t op_ret, + int32_t op_errno, + inode_t *inode, + struct stat *buf) +{ + ERR_EINVAL_NORETURN (!this ); + + if (trace_fop_names[GF_FOP_LINK].enabled) { + if (op_ret >= 0) { + gf_log (this->name, GF_LOG_NORMAL, + "%"PRId64": (op_ret=%d, ino=%"PRIu64", " + "*buf {st_nlink=%"GF_PRI_NLINK"})", + frame->root->unique, op_ret, inode->ino, buf->st_nlink); + } else { + gf_log (this->name, GF_LOG_NORMAL, + "%"PRId64": (op_ret=%d, op_errno=%d)", + frame->root->unique, op_ret, op_errno); + } + } + + STACK_UNWIND (frame, op_ret, op_errno, inode, buf); + return 0; +} + +int32_t +trace_flush_cbk (call_frame_t *frame, + void *cookie, + xlator_t *this, + int32_t op_ret, + int32_t op_errno) +{ + ERR_EINVAL_NORETURN (!this ); + + if (trace_fop_names[GF_FOP_FLUSH].enabled) { + gf_log (this->name, GF_LOG_NORMAL, + "%"PRId64": (op_ret=%d, op_errno=%d)", + frame->root->unique, op_ret, op_errno); + } + + STACK_UNWIND (frame, op_ret, op_errno); + return 0; +} + + +int32_t +trace_opendir_cbk (call_frame_t *frame, + void *cookie, + xlator_t *this, + int32_t op_ret, + int32_t op_errno, + fd_t *fd) +{ + ERR_EINVAL_NORETURN (!this ); + + if 
(trace_fop_names[GF_FOP_OPENDIR].enabled) { + gf_log (this->name, GF_LOG_NORMAL, + "%"PRId64": (op_ret=%d, op_errno=%d, fd=%p)", + frame->root->unique, op_ret, op_errno, fd); + } + + STACK_UNWIND (frame, op_ret, op_errno, fd); + return 0; +} + +int32_t +trace_rmdir_cbk (call_frame_t *frame, + void *cookie, + xlator_t *this, + int32_t op_ret, + int32_t op_errno) +{ + ERR_EINVAL_NORETURN (!this ); + + if (trace_fop_names[GF_FOP_RMDIR].enabled) { + gf_log (this->name, GF_LOG_NORMAL, + "%"PRId64": (op_ret=%d, op_errno=%d)", + frame->root->unique, op_ret, op_errno); + } + + STACK_UNWIND (frame, op_ret, op_errno); + return 0; +} + +int32_t +trace_truncate_cbk (call_frame_t *frame, + void *cookie, + xlator_t *this, + int32_t op_ret, + int32_t op_errno, + struct stat *buf) +{ + ERR_EINVAL_NORETURN (!this ); + + if (trace_fop_names[GF_FOP_TRUNCATE].enabled) { + if (op_ret >= 0) { + gf_log (this->name, GF_LOG_NORMAL, + "%"PRId64": (op_ret=%d, *buf {st_size=%"PRId64", st_blksize=%" + GF_PRI_BLKSIZE", st_blocks=%"PRId64"})", + frame->root->unique, op_ret, buf->st_size, buf->st_blksize, + buf->st_blocks); + } else { + gf_log (this->name, GF_LOG_NORMAL, + "%"PRId64": (op_ret=%d, op_errno=%d)", + frame->root->unique, op_ret, op_errno); + } + } + + STACK_UNWIND (frame, op_ret, op_errno, buf); + return 0; +} + +int32_t +trace_utimens_cbk (call_frame_t *frame, + void *cookie, + xlator_t *this, + int32_t op_ret, + int32_t op_errno, + struct stat *buf) +{ + char atime_buf[256], mtime_buf[256], ctime_buf[256]; + ERR_EINVAL_NORETURN (!this ); + + if (trace_fop_names[GF_FOP_UTIMENS].enabled) { + if (op_ret >= 0) { + strftime (atime_buf, 256, "[%b %d %H:%M:%S]", localtime (&buf->st_atime)); + strftime (mtime_buf, 256, "[%b %d %H:%M:%S]", localtime (&buf->st_mtime)); + strftime (ctime_buf, 256, "[%b %d %H:%M:%S]", localtime (&buf->st_ctime)); + + gf_log (this->name, GF_LOG_NORMAL, + "%"PRId64": (op_ret=%d, *buf {st_atime=%s, st_mtime=%s, " + "st_ctime=%s})", + frame->root->unique, op_ret, 
atime_buf, mtime_buf, ctime_buf); + } else { + gf_log (this->name, GF_LOG_NORMAL, + "%"PRId64": (op_ret=%d, op_errno=%d)", + frame->root->unique, op_ret, op_errno); + } + } + + STACK_UNWIND (frame, op_ret, op_errno, buf); + return 0; +} + +int32_t +trace_statfs_cbk (call_frame_t *frame, + void *cookie, + xlator_t *this, + int32_t op_ret, + int32_t op_errno, + struct statvfs *buf) +{ + ERR_EINVAL_NORETURN (!this); + + if (trace_fop_names[GF_FOP_STATFS].enabled) { + if (op_ret >= 0) { + gf_log (this->name, GF_LOG_NORMAL, + "%"PRId64": ({f_bsize=%lu, f_frsize=%lu, f_blocks=%"GF_PRI_FSBLK + ", f_bfree=%"GF_PRI_FSBLK", f_bavail=%"GF_PRI_FSBLK", " + "f_files=%"GF_PRI_FSBLK", f_ffree=%"GF_PRI_FSBLK", f_favail=%" + GF_PRI_FSBLK", f_fsid=%lu, f_flag=%lu, f_namemax=%lu}) => ret=%d", + frame->root->unique, buf->f_bsize, buf->f_frsize, buf->f_blocks, + buf->f_bfree, buf->f_bavail, buf->f_files, buf->f_ffree, + buf->f_favail, buf->f_fsid, buf->f_flag, buf->f_namemax, op_ret); + } else { + gf_log (this->name, GF_LOG_NORMAL, + "%"PRId64": (op_ret=%d, op_errno=%d)", + frame->root->unique, op_ret, op_errno); + } + } + + STACK_UNWIND (frame, op_ret, op_errno, buf); + return 0; +} + +int32_t +trace_setxattr_cbk (call_frame_t *frame, + void *cookie, + xlator_t *this, + int32_t op_ret, + int32_t op_errno) +{ + ERR_EINVAL_NORETURN (!this ); + + if (trace_fop_names[GF_FOP_SETXATTR].enabled) { + gf_log (this->name, GF_LOG_NORMAL, + "%"PRId64": (op_ret=%d, op_errno=%d)", + frame->root->unique, op_ret, op_errno); + } + + STACK_UNWIND (frame, op_ret, op_errno); + return 0; +} + +int32_t +trace_getxattr_cbk (call_frame_t *frame, + void *cookie, + xlator_t *this, + int32_t op_ret, + int32_t op_errno, + dict_t *dict) +{ + ERR_EINVAL_NORETURN (!this || !dict); + + if (trace_fop_names[GF_FOP_GETXATTR].enabled) { + gf_log (this->name, GF_LOG_NORMAL, + "%"PRId64": (op_ret=%d, op_errno=%d, dict=%p)", + frame->root->unique, op_ret, op_errno, dict); + } + + STACK_UNWIND (frame, op_ret, op_errno, 
dict); + return 0; +} + +int32_t +trace_removexattr_cbk (call_frame_t *frame, + void *cookie, + xlator_t *this, + int32_t op_ret, + int32_t op_errno) +{ + ERR_EINVAL_NORETURN (!this ); + + if (trace_fop_names[GF_FOP_REMOVEXATTR].enabled) { + gf_log (this->name, GF_LOG_NORMAL, + "%"PRId64": (op_ret=%d, op_errno=%d)", + frame->root->unique, op_ret, op_errno); + } + + STACK_UNWIND (frame, op_ret, op_errno); + return 0; +} + + +int32_t +trace_fsyncdir_cbk (call_frame_t *frame, + void *cookie, + xlator_t *this, + int32_t op_ret, + int32_t op_errno) +{ + ERR_EINVAL_NORETURN (!this ); + + if (trace_fop_names[GF_FOP_FSYNCDIR].enabled) { + gf_log (this->name, GF_LOG_NORMAL, + "%"PRId64": (op_ret=%d, op_errno=%d)", + frame->root->unique, op_ret, op_errno); + } + + STACK_UNWIND (frame, op_ret, op_errno); + return 0; +} + +int32_t +trace_access_cbk (call_frame_t *frame, + void *cookie, + xlator_t *this, + int32_t op_ret, + int32_t op_errno) +{ + ERR_EINVAL_NORETURN (!this ); + + if (trace_fop_names[GF_FOP_ACCESS].enabled) { + gf_log (this->name, GF_LOG_NORMAL, + "%"PRId64": (op_ret=%d, op_errno=%d)", + frame->root->unique, op_ret, op_errno); + } + + STACK_UNWIND (frame, op_ret, op_errno); + return 0; +} + +int32_t +trace_ftruncate_cbk (call_frame_t *frame, + void *cookie, + xlator_t *this, + int32_t op_ret, + int32_t op_errno, + struct stat *buf) +{ + ERR_EINVAL_NORETURN (!this ); + + if (trace_fop_names[GF_FOP_FTRUNCATE].enabled) { + if (op_ret >= 0) { + gf_log (this->name, GF_LOG_NORMAL, + "%"PRId64": (op_ret=%d, *buf {st_size=%"PRId64", " + "st_blksize=%"GF_PRI_BLKSIZE", st_blocks=%"PRId64"})", + frame->root->unique, op_ret, buf->st_size, buf->st_blksize, + buf->st_blocks); + } else { + gf_log (this->name, GF_LOG_NORMAL, + "%"PRId64": (op_ret=%d, op_errno=%d)", + frame->root->unique, op_ret, op_errno); + } + } + + STACK_UNWIND (frame, op_ret, op_errno, buf); + return 0; +} + +int32_t +trace_fstat_cbk (call_frame_t *frame, + void *cookie, + xlator_t *this, + int32_t op_ret, 
+ int32_t op_errno, + struct stat *buf) +{ + char atime_buf[256], mtime_buf[256], ctime_buf[256]; + ERR_EINVAL_NORETURN (!this ); + + if (trace_fop_names[GF_FOP_FSTAT].enabled) { + if (op_ret >= 0) { + strftime (atime_buf, 256, "[%b %d %H:%M:%S]", localtime (&buf->st_atime)); + strftime (mtime_buf, 256, "[%b %d %H:%M:%S]", localtime (&buf->st_mtime)); + strftime (ctime_buf, 256, "[%b %d %H:%M:%S]", localtime (&buf->st_ctime)); + + gf_log (this->name, GF_LOG_NORMAL, + "%"PRId64": (op_ret=%d, *buf {st_dev=%"GF_PRI_DEV", " + "st_ino=%"PRIu64", st_mode=%d, st_nlink=%"GF_PRI_NLINK", " + "st_uid=%d, st_gid=%d, st_rdev=%"GF_PRI_DEV", st_size=%"PRId64", " + "st_blksize=%"GF_PRI_BLKSIZE", st_blocks=%"PRId64", st_atime=%s, " + "st_mtime=%s, st_ctime=%s})", + frame->root->unique, op_ret, buf->st_dev, buf->st_ino, + buf->st_mode, buf->st_nlink, buf->st_uid, buf->st_gid, + buf->st_rdev, buf->st_size, buf->st_blksize, + buf->st_blocks, atime_buf, mtime_buf, ctime_buf); + } else { + gf_log (this->name, GF_LOG_NORMAL, + "%"PRId64": (op_ret=%d, op_errno=%d)", + frame->root->unique, op_ret, op_errno); + } + } + + STACK_UNWIND (frame, op_ret, op_errno, buf); + return 0; +} + +int32_t +trace_lk_cbk (call_frame_t *frame, + void *cookie, + xlator_t *this, + int32_t op_ret, + int32_t op_errno, + struct flock *lock) +{ + ERR_EINVAL_NORETURN (!this ); + + if (trace_fop_names[GF_FOP_LK].enabled) { + if (op_ret >= 0) { + gf_log (this->name, GF_LOG_NORMAL, + "%"PRId64": (op_ret=%d, {l_type=%d, l_whence=%d, " + "l_start=%"PRId64", l_len=%"PRId64", l_pid=%u})", + frame->root->unique, op_ret, lock->l_type, lock->l_whence, + lock->l_start, lock->l_len, lock->l_pid); + } else { + gf_log (this->name, GF_LOG_NORMAL, + "%"PRId64": (op_ret=%d, op_errno=%d)", + frame->root->unique, op_ret, op_errno); + } + } + + STACK_UNWIND (frame, op_ret, op_errno, lock); + return 0; +} + + +int32_t +trace_setdents_cbk (call_frame_t *frame, + void *cookie, + xlator_t *this, + int32_t op_ret, + int32_t op_errno) +{ + 
ERR_EINVAL_NORETURN (!this ); + + if (trace_fop_names[GF_FOP_SETDENTS].enabled) { + gf_log (this->name, GF_LOG_NORMAL, + "%"PRId64": op_ret=%d, op_errno=%d", + frame->root->unique, op_ret, op_errno); + } + + STACK_UNWIND (frame, op_ret, op_errno); + return 0; +} + +int32_t +trace_entrylk_cbk (call_frame_t *frame, + void *cookie, + xlator_t *this, + int32_t op_ret, + int32_t op_errno) +{ + ERR_EINVAL_NORETURN (!this ); + + if (trace_fop_names[GF_FOP_ENTRYLK].enabled) { + gf_log (this->name, GF_LOG_NORMAL, + "%"PRId64": op_ret=%d, op_errno=%d", + frame->root->unique, op_ret, op_errno); + } + + STACK_UNWIND (frame, op_ret, op_errno); + return 0; +} + + +int32_t +trace_xattrop_cbk (call_frame_t *frame, + void *cookie, + xlator_t *this, + int32_t op_ret, + int32_t op_errno, + dict_t *dict) +{ + ERR_EINVAL_NORETURN (!this || !dict); + + if (trace_fop_names[GF_FOP_XATTROP].enabled) { + gf_log (this->name, GF_LOG_NORMAL, + "%"PRId64": (op_ret=%d, op_errno=%d)", + frame->root->unique, op_ret, op_errno); + } + + STACK_UNWIND (frame, op_ret, op_errno, dict); + return 0; +} + +int32_t +trace_fxattrop_cbk (call_frame_t *frame, + void *cookie, + xlator_t *this, + int32_t op_ret, + int32_t op_errno, + dict_t *dict) +{ + ERR_EINVAL_NORETURN (!this || !dict); + + if (trace_fop_names[GF_FOP_FXATTROP].enabled) { + gf_log (this->name, GF_LOG_NORMAL, + "%"PRId64": (op_ret=%d, op_errno=%d)", + frame->root->unique, op_ret, op_errno); + } + + STACK_UNWIND (frame, op_ret, op_errno, dict); + return 0; +} + +int32_t +trace_inodelk_cbk (call_frame_t *frame, + void *cookie, + xlator_t *this, + int32_t op_ret, + int32_t op_errno) +{ + ERR_EINVAL_NORETURN (!this ); + + if (trace_fop_names[GF_FOP_INODELK].enabled) { + gf_log (this->name, GF_LOG_NORMAL, + "%"PRId64": op_ret=%d, op_errno=%d", + frame->root->unique, op_ret, op_errno); + } + + STACK_UNWIND (frame, op_ret, op_errno); + return 0; +} + + +int32_t +trace_entrylk (call_frame_t *frame, xlator_t *this, + loc_t *loc, const char *basename, + 
entrylk_cmd cmd, entrylk_type type) +{ + ERR_EINVAL_NORETURN (!this || !loc || !basename); + + if (trace_fop_names[GF_FOP_ENTRYLK].enabled) { + gf_log (this->name, GF_LOG_NORMAL, + "%"PRId64": (loc= {path=%s, ino=%"PRIu64"} basename=%s, cmd=%s, type=%s)", + frame->root->unique, loc->path, loc->inode->ino, basename, + ((cmd == ENTRYLK_LOCK) ? "ENTRYLK_LOCK" : "ENTRYLK_UNLOCK"), + ((type == ENTRYLK_RDLCK) ? "ENTRYLK_RDLCK" : "ENTRYLK_WRLCK")); + } + + STACK_WIND (frame, + trace_entrylk_cbk, + FIRST_CHILD (this), + FIRST_CHILD (this)->fops->entrylk, + loc, basename, cmd, type); + return 0; +} + +int32_t +trace_inodelk (call_frame_t *frame, + xlator_t *this, + loc_t *loc, int32_t cmd, struct flock *flock) +{ + ERR_EINVAL_NORETURN (!this || !loc); + + if (trace_fop_names[GF_FOP_INODELK].enabled) { + gf_log (this->name, GF_LOG_NORMAL, + "%"PRId64": (loc {path=%s, ino=%"PRIu64"}, cmd=%s)", + frame->root->unique, loc->path, loc->inode->ino, + ((cmd == F_SETLK)? "F_SETLK" : "unknown")); + } + + STACK_WIND (frame, + trace_inodelk_cbk, + FIRST_CHILD (this), + FIRST_CHILD (this)->fops->inodelk, + loc, cmd, flock); + return 0; +} + + +int32_t +trace_finodelk_cbk (call_frame_t *frame, + void *cookie, + xlator_t *this, + int32_t op_ret, + int32_t op_errno) +{ + ERR_EINVAL_NORETURN (!this ); + + if (trace_fop_names[GF_FOP_FINODELK].enabled) { + gf_log (this->name, GF_LOG_NORMAL, + "%"PRId64": op_ret=%d, op_errno=%d", + frame->root->unique, op_ret, op_errno); + } + + STACK_UNWIND (frame, op_ret, op_errno); + return 0; +} + +int32_t +trace_finodelk (call_frame_t *frame, + xlator_t *this, + fd_t *fd, int32_t cmd, struct flock *flock) +{ + ERR_EINVAL_NORETURN (!this || !fd); + + if (trace_fop_names[GF_FOP_FINODELK].enabled) { + gf_log (this->name, GF_LOG_NORMAL, + "%"PRId64": (fd=%p, cmd=%s)", + frame->root->unique, fd, + ((cmd == F_SETLK) ? 
"F_SETLK" : "unknown")); + } + + STACK_WIND (frame, + trace_finodelk_cbk, + FIRST_CHILD (this), + FIRST_CHILD (this)->fops->finodelk, + fd, cmd, flock); + return 0; +} + + +int32_t +trace_xattrop (call_frame_t *frame, + xlator_t *this, + loc_t *loc, + gf_xattrop_flags_t flags, + dict_t *dict) +{ + ERR_EINVAL_NORETURN (!this || !loc); + + if (trace_fop_names[GF_FOP_XATTROP].enabled) { + gf_log (this->name, GF_LOG_NORMAL, + "%"PRId64": (path=%s, ino=%"PRIu64" flags=%d)", + frame->root->unique, loc->path, loc->inode->ino, flags); + + } + + STACK_WIND (frame, trace_xattrop_cbk, + FIRST_CHILD(this), + FIRST_CHILD(this)->fops->xattrop, + loc, flags, dict); + + return 0; +} + +int32_t +trace_fxattrop (call_frame_t *frame, + xlator_t *this, + fd_t *fd, + gf_xattrop_flags_t flags, + dict_t *dict) +{ + ERR_EINVAL_NORETURN (!this || !fd); + + if (trace_fop_names[GF_FOP_FXATTROP].enabled) { + gf_log (this->name, GF_LOG_NORMAL, + "%"PRId64": (fd=%p, flags=%d)", + frame->root->unique, fd, flags); + + } + + STACK_WIND (frame, trace_fxattrop_cbk, + FIRST_CHILD(this), + FIRST_CHILD(this)->fops->fxattrop, + fd, flags, dict); + + return 0; +} + +int32_t +trace_lookup (call_frame_t *frame, + xlator_t *this, + loc_t *loc, + dict_t *xattr_req) +{ + ERR_EINVAL_NORETURN (!this || !loc); + + if (trace_fop_names[GF_FOP_LOOKUP].enabled) { + /* TODO: print all the keys mentioned in xattr_req */ + gf_log (this->name, GF_LOG_NORMAL, + "%"PRId64": (loc {path=%s, ino=%"PRIu64"})", + frame->root->unique, loc->path, + loc->inode->ino); + } + + STACK_WIND (frame, trace_lookup_cbk, + FIRST_CHILD(this), + FIRST_CHILD(this)->fops->lookup, + loc, xattr_req); + + return 0; +} + +int32_t +trace_stat (call_frame_t *frame, + xlator_t *this, + loc_t *loc) +{ + ERR_EINVAL_NORETURN (!this || !loc ); + + + if (trace_fop_names[GF_FOP_STAT].enabled) { + gf_log (this->name, GF_LOG_NORMAL, + "%"PRId64": (loc {path=%s, ino=%"PRIu64"})", + frame->root->unique, loc->path, loc->inode->ino); + } + + STACK_WIND (frame, + 
trace_stat_cbk, + FIRST_CHILD(this), + FIRST_CHILD(this)->fops->stat, + loc); + + return 0; +} + +int32_t +trace_readlink (call_frame_t *frame, + xlator_t *this, + loc_t *loc, + size_t size) +{ + ERR_EINVAL_NORETURN (!this || !loc || (size < 1)); + + if (trace_fop_names[GF_FOP_READLINK].enabled) { + gf_log (this->name, GF_LOG_NORMAL, + "%"PRId64": (loc {path=%s, ino=%"PRIu64"}, size=%"GF_PRI_SIZET")", + frame->root->unique, loc->path, loc->inode->ino, size); + } + + STACK_WIND (frame, + trace_readlink_cbk, + FIRST_CHILD(this), + FIRST_CHILD(this)->fops->readlink, + loc, + size); + + return 0; +} + +int32_t +trace_mknod (call_frame_t *frame, + xlator_t *this, + loc_t *loc, + mode_t mode, + dev_t dev) +{ + ERR_EINVAL_NORETURN (!this || !loc->path); + + if (trace_fop_names[GF_FOP_MKNOD].enabled) { + gf_log (this->name, GF_LOG_NORMAL, + "%"PRId64": (loc {path=%s, ino=%"PRIu64"}, mode=%d, dev=%"GF_PRI_DEV")", + frame->root->unique, loc->path, loc->inode->ino, mode, dev); + } + + STACK_WIND (frame, + trace_mknod_cbk, + FIRST_CHILD(this), + FIRST_CHILD(this)->fops->mknod, + loc, + mode, + dev); + + return 0; +} + +int32_t +trace_mkdir (call_frame_t *frame, + xlator_t *this, + loc_t *loc, + mode_t mode) +{ + ERR_EINVAL_NORETURN (!this || !loc || !loc->path); + + if (trace_fop_names[GF_FOP_MKDIR].enabled) { + gf_log (this->name, GF_LOG_NORMAL, + "%"PRId64": (path=%s, ino=%"PRIu64", mode=%d)", + frame->root->unique, loc->path, + ((loc->inode)? 
loc->inode->ino : 0), mode); + } + + STACK_WIND (frame, + trace_mkdir_cbk, + FIRST_CHILD(this), + FIRST_CHILD(this)->fops->mkdir, + loc, + mode); + return 0; +} + +int32_t +trace_unlink (call_frame_t *frame, + xlator_t *this, + loc_t *loc) +{ + ERR_EINVAL_NORETURN (!this || !loc); + + if (trace_fop_names[GF_FOP_UNLINK].enabled) { + gf_log (this->name, GF_LOG_NORMAL, + "%"PRId64": (loc {path=%s, ino=%"PRIu64"})", + frame->root->unique, loc->path, loc->inode->ino); + } + + STACK_WIND (frame, + trace_unlink_cbk, + FIRST_CHILD(this), + FIRST_CHILD(this)->fops->unlink, + loc); + return 0; +} + +int32_t +trace_rmdir (call_frame_t *frame, + xlator_t *this, + loc_t *loc) +{ + ERR_EINVAL_NORETURN (!this || !loc); + + if (trace_fop_names[GF_FOP_RMDIR].enabled) { + gf_log (this->name, GF_LOG_NORMAL, + "%"PRId64": (loc {path=%s, ino=%"PRIu64"})", + frame->root->unique, loc->path, loc->inode->ino); + } + + STACK_WIND (frame, + trace_rmdir_cbk, + FIRST_CHILD(this), + FIRST_CHILD(this)->fops->rmdir, + loc); + + return 0; +} + +int32_t +trace_symlink (call_frame_t *frame, + xlator_t *this, + const char *linkpath, + loc_t *loc) +{ + ERR_EINVAL_NORETURN (!this || !linkpath || !loc || !loc->path); + + if (trace_fop_names[GF_FOP_SYMLINK].enabled) { + gf_log (this->name, GF_LOG_NORMAL, + "%"PRId64": (linkpath=%s, loc {path=%s, ino=%"PRIu64"})", + frame->root->unique, linkpath, loc->path, + ((loc->inode)? 
loc->inode->ino : 0)); + } + + STACK_WIND (frame, + trace_symlink_cbk, + FIRST_CHILD(this), + FIRST_CHILD(this)->fops->symlink, + linkpath, + loc); + + return 0; +} + +int32_t +trace_rename (call_frame_t *frame, + xlator_t *this, + loc_t *oldloc, + loc_t *newloc) +{ + ERR_EINVAL_NORETURN (!this || !oldloc || !newloc); + + if (trace_fop_names[GF_FOP_RENAME].enabled) { + gf_log (this->name, GF_LOG_NORMAL, + "%"PRId64": (oldloc {path=%s, ino=%"PRIu64"}, " + "newloc{path=%s, ino=%"PRIu64"})", + frame->root->unique, oldloc->path, oldloc->ino, + newloc->path, newloc->ino); + } + + STACK_WIND (frame, + trace_rename_cbk, + FIRST_CHILD(this), + FIRST_CHILD(this)->fops->rename, + oldloc, + newloc); + + return 0; +} + +int32_t +trace_link (call_frame_t *frame, + xlator_t *this, + loc_t *oldloc, + loc_t *newloc) +{ + ERR_EINVAL_NORETURN (!this || !oldloc || !newloc); + + if (trace_fop_names[GF_FOP_LINK].enabled) { + gf_log (this->name, GF_LOG_NORMAL, + "%"PRId64": (oldloc {path=%s, ino=%"PRIu64"}, " + "newloc {path=%s, ino=%"PRIu64"})", + frame->root->unique, oldloc->path, oldloc->inode->ino, + newloc->path, newloc->inode->ino); + } + + STACK_WIND (frame, + trace_link_cbk, + FIRST_CHILD(this), + FIRST_CHILD(this)->fops->link, + oldloc, + newloc); + return 0; +} + +int32_t +trace_chmod (call_frame_t *frame, + xlator_t *this, + loc_t *loc, + mode_t mode) +{ + ERR_EINVAL_NORETURN (!this || !loc); + + if (trace_fop_names[GF_FOP_CHMOD].enabled) { + gf_log (this->name, GF_LOG_NORMAL, + "%"PRId64": (loc {path=%s, ino=%"PRIu64"}, mode=%o)", + frame->root->unique, loc->path, loc->inode->ino, mode); + } + + STACK_WIND (frame, + trace_chmod_cbk, + FIRST_CHILD(this), + FIRST_CHILD(this)->fops->chmod, + loc, + mode); + + return 0; +} + +int32_t +trace_chown (call_frame_t *frame, + xlator_t *this, + loc_t *loc, + uid_t uid, + gid_t gid) +{ + ERR_EINVAL_NORETURN (!this || !loc); + + if (trace_fop_names[GF_FOP_CHOWN].enabled) { + gf_log (this->name, GF_LOG_NORMAL, + "%"PRId64": (loc {path=%s, 
ino=%"PRIu64"}, uid=%d, gid=%d)", + frame->root->unique, loc->path, loc->inode->ino, uid, gid); + } + + STACK_WIND (frame, + trace_chown_cbk, + FIRST_CHILD(this), + FIRST_CHILD(this)->fops->chown, + loc, + uid, + gid); + + return 0; +} + +int32_t +trace_truncate (call_frame_t *frame, + xlator_t *this, + loc_t *loc, + off_t offset) +{ + ERR_EINVAL_NORETURN (!this || !loc); + + if (trace_fop_names[GF_FOP_TRUNCATE].enabled) { + gf_log (this->name, GF_LOG_NORMAL, + "%"PRId64": (loc {path=%s, ino=%"PRIu64"}, offset=%"PRId64")", + frame->root->unique, loc->path, loc->inode->ino, offset); + } + + STACK_WIND (frame, + trace_truncate_cbk, + FIRST_CHILD(this), + FIRST_CHILD(this)->fops->truncate, + loc, + offset); + + return 0; +} + +int32_t +trace_utimens (call_frame_t *frame, + xlator_t *this, + loc_t *loc, + struct timespec tv[2]) +{ + char actime_str[256]; + char modtime_str[256]; + + ERR_EINVAL_NORETURN (!this || !loc || !tv); + + if (trace_fop_names[GF_FOP_UTIMENS].enabled) { + strftime (actime_str, 256, "[%b %d %H:%M:%S]", localtime (&tv[0].tv_sec)); + strftime (modtime_str, 256, "[%b %d %H:%M:%S]", localtime (&tv[1].tv_sec)); + + gf_log (this->name, GF_LOG_NORMAL, + "%"PRId64": (loc {path=%s, ino=%"PRIu64"}, " + "*tv=%p {actime=%s, modtime=%s})", + frame->root->unique, loc->path, loc->inode->ino, + tv, actime_str, modtime_str); + } + + STACK_WIND (frame, + trace_utimens_cbk, + FIRST_CHILD(this), + FIRST_CHILD(this)->fops->utimens, + loc, + tv); + + return 0; +} + +int32_t +trace_open (call_frame_t *frame, + xlator_t *this, + loc_t *loc, + int32_t flags, + fd_t *fd) +{ + ERR_EINVAL_NORETURN (!this || !loc); + + if (trace_fop_names[GF_FOP_OPEN].enabled) { + gf_log (this->name, GF_LOG_NORMAL, + "%"PRId64": (loc {path=%s, ino=%"PRIu64"}, flags=%d, fd=%p)", + frame->root->unique, loc->path, loc->inode->ino, flags, fd); + } + + STACK_WIND (frame, + trace_open_cbk, + FIRST_CHILD(this), + FIRST_CHILD(this)->fops->open, + loc, + flags, + fd); + return 0; +} + +int32_t 
+trace_create (call_frame_t *frame, + xlator_t *this, + loc_t *loc, + int32_t flags, + mode_t mode, + fd_t *fd) +{ + ERR_EINVAL_NORETURN (!this || !loc->path); + + if (trace_fop_names[GF_FOP_CREATE].enabled) { + gf_log (this->name, GF_LOG_NORMAL, + "%"PRId64": (loc {path=%s, ino=%"PRIu64"}, flags=0%o mode=0%o)", + frame->root->unique, loc->path, loc->inode->ino, flags, mode); + } + + STACK_WIND (frame, + trace_create_cbk, + FIRST_CHILD(this), + FIRST_CHILD(this)->fops->create, + loc, + flags, + mode, + fd); + return 0; +} + +int32_t +trace_readv (call_frame_t *frame, + xlator_t *this, + fd_t *fd, + size_t size, + off_t offset) +{ + ERR_EINVAL_NORETURN (!this || !fd || (size < 1)); + + if (trace_fop_names[GF_FOP_READ].enabled) { + gf_log (this->name, GF_LOG_NORMAL, + "%"PRId64": (*fd=%p, size=%"GF_PRI_SIZET", offset=%"PRId64")", + frame->root->unique, fd, size, offset); + } + + STACK_WIND (frame, + trace_readv_cbk, + FIRST_CHILD(this), + FIRST_CHILD(this)->fops->readv, + fd, + size, + offset); + return 0; +} + +int32_t +trace_writev (call_frame_t *frame, + xlator_t *this, + fd_t *fd, + struct iovec *vector, + int32_t count, + off_t offset) +{ + ERR_EINVAL_NORETURN (!this || !fd || !vector || (count < 1)); + + if (trace_fop_names[GF_FOP_WRITE].enabled) { + gf_log (this->name, GF_LOG_NORMAL, + "%"PRId64": (*fd=%p, *vector=%p, count=%d, offset=%"PRId64")", + frame->root->unique, fd, vector, count, offset); + } + + STACK_WIND (frame, + trace_writev_cbk, + FIRST_CHILD(this), + FIRST_CHILD(this)->fops->writev, + fd, + vector, + count, + offset); + return 0; +} + +int32_t +trace_statfs (call_frame_t *frame, + xlator_t *this, + loc_t *loc) +{ + ERR_EINVAL_NORETURN (!this || !loc); + + if (trace_fop_names[GF_FOP_STATFS].enabled) { + gf_log (this->name, GF_LOG_NORMAL, + "%"PRId64": (loc {path=%s, ino=%"PRIu64"})", + frame->root->unique, loc->path, + ((loc->inode)? 
loc->inode->ino : 0)); + } + + STACK_WIND (frame, + trace_statfs_cbk, + FIRST_CHILD(this), FIRST_CHILD(this)->fops->statfs, + loc); + return 0; +} + +int32_t +trace_flush (call_frame_t *frame, + xlator_t *this, + fd_t *fd) +{ + ERR_EINVAL_NORETURN (!this || !fd); + + if (trace_fop_names[GF_FOP_FLUSH].enabled) { + gf_log (this->name, GF_LOG_NORMAL, + "%"PRId64": (*fd=%p)", + frame->root->unique, fd); + } + + STACK_WIND (frame, + trace_flush_cbk, + FIRST_CHILD(this), + FIRST_CHILD(this)->fops->flush, + fd); + return 0; +} + + +int32_t +trace_fsync (call_frame_t *frame, + xlator_t *this, + fd_t *fd, + int32_t flags) +{ + ERR_EINVAL_NORETURN (!this || !fd); + + if (trace_fop_names[GF_FOP_FSYNC].enabled) { + gf_log (this->name, GF_LOG_NORMAL, + "%"PRId64": (flags=%d, *fd=%p)", + frame->root->unique, flags, fd); + } + + STACK_WIND (frame, + trace_fsync_cbk, + FIRST_CHILD(this), + FIRST_CHILD(this)->fops->fsync, + fd, + flags); + return 0; +} + +int32_t +trace_setxattr (call_frame_t *frame, + xlator_t *this, + loc_t *loc, + dict_t *dict, + int32_t flags) +{ + ERR_EINVAL_NORETURN (!this || !loc || !dict); + + if (trace_fop_names[GF_FOP_SETXATTR].enabled) { + gf_log (this->name, GF_LOG_NORMAL, + "%"PRId64": (loc {path=%s, ino=%"PRIu64"}, dict=%p, flags=%d)", + frame->root->unique, loc->path, + ((loc->inode)? loc->inode->ino : 0), dict, flags); + } + + STACK_WIND (frame, + trace_setxattr_cbk, + FIRST_CHILD(this), + FIRST_CHILD(this)->fops->setxattr, + loc, + dict, + flags); + return 0; +} + +int32_t +trace_getxattr (call_frame_t *frame, + xlator_t *this, + loc_t *loc, + const char *name) +{ + ERR_EINVAL_NORETURN (!this || !loc); + + if (trace_fop_names[GF_FOP_GETXATTR].enabled) { + gf_log (this->name, GF_LOG_NORMAL, + "%"PRId64": (loc {path=%s, ino=%"PRIu64"}), name=%s", + frame->root->unique, loc->path, + ((loc->inode)? 
loc->inode->ino : 0), name); + } + + STACK_WIND (frame, + trace_getxattr_cbk, + FIRST_CHILD(this), + FIRST_CHILD(this)->fops->getxattr, + loc, + name); + return 0; +} + +int32_t +trace_removexattr (call_frame_t *frame, + xlator_t *this, + loc_t *loc, + const char *name) +{ + ERR_EINVAL_NORETURN (!this || !loc || !name); + + if (trace_fop_names[GF_FOP_REMOVEXATTR].enabled) { + gf_log (this->name, GF_LOG_NORMAL, + "%"PRId64": (loc {path=%s, ino=%"PRIu64"}, name=%s)", + frame->root->unique, loc->path, + ((loc->inode)? loc->inode->ino : 0), name); + } + + STACK_WIND (frame, + trace_removexattr_cbk, + FIRST_CHILD(this), + FIRST_CHILD(this)->fops->removexattr, + loc, + name); + + return 0; +} + +int32_t +trace_opendir (call_frame_t *frame, + xlator_t *this, + loc_t *loc, + fd_t *fd) +{ + ERR_EINVAL_NORETURN (!this || !loc ); + + if (trace_fop_names[GF_FOP_OPENDIR].enabled) { + gf_log (this->name, GF_LOG_NORMAL, + "%"PRId64":( loc {path=%s, ino=%"PRIu64"}, fd=%p)", + frame->root->unique, loc->path, loc->inode->ino, fd); + } + + STACK_WIND (frame, + trace_opendir_cbk, + FIRST_CHILD(this), + FIRST_CHILD(this)->fops->opendir, + loc, + fd); + return 0; +} + +int32_t +trace_getdents (call_frame_t *frame, + xlator_t *this, + fd_t *fd, + size_t size, + off_t offset, + int32_t flag) +{ + ERR_EINVAL_NORETURN (!this || !fd); + + if (trace_fop_names[GF_FOP_GETDENTS].enabled) { + gf_log (this->name, GF_LOG_NORMAL, + "%"PRId64": (fd=%p, size=%"GF_PRI_SIZET", offset=%"PRId64", flag=0x%x)", + frame->root->unique, fd, size, offset, flag); + } + + STACK_WIND (frame, + trace_getdents_cbk, + FIRST_CHILD(this), + FIRST_CHILD(this)->fops->getdents, + fd, + size, + offset, + flag); + return 0; +} + + +int32_t +trace_readdir (call_frame_t *frame, + xlator_t *this, + fd_t *fd, + size_t size, + off_t offset) +{ + ERR_EINVAL_NORETURN (!this || !fd); + + if (trace_fop_names[GF_FOP_READDIR].enabled) { + gf_log (this->name, GF_LOG_NORMAL, + "%"PRId64": (fd=%p, size=%"GF_PRI_SIZET", 
offset=%"PRId64")", + frame->root->unique, fd, size, offset); + } + + STACK_WIND (frame, + trace_readdir_cbk, + FIRST_CHILD(this), + FIRST_CHILD(this)->fops->readdir, + fd, + size, + offset); + + return 0; +} + + +int32_t +trace_fsyncdir (call_frame_t *frame, + xlator_t *this, + fd_t *fd, + int32_t datasync) +{ + ERR_EINVAL_NORETURN (!this || !fd); + + if (trace_fop_names[GF_FOP_FSYNCDIR].enabled) { + gf_log (this->name, GF_LOG_NORMAL, + "%"PRId64": (datasync=%d, *fd=%p)", + frame->root->unique, datasync, fd); + } + + STACK_WIND (frame, + trace_fsyncdir_cbk, + FIRST_CHILD(this), + FIRST_CHILD(this)->fops->fsyncdir, + fd, + datasync); + return 0; +} + +int32_t +trace_access (call_frame_t *frame, + xlator_t *this, + loc_t *loc, + int32_t mask) +{ + ERR_EINVAL_NORETURN (!this || !loc); + + if (trace_fop_names[GF_FOP_ACCESS].enabled) { + gf_log (this->name, GF_LOG_NORMAL, + "%"PRId64": (*loc {path=%s, ino=%"PRIu64"}, mask=0%o)", + frame->root->unique, loc->path, + ((loc->inode)? loc->inode->ino : 0), mask); + } + + STACK_WIND (frame, + trace_access_cbk, + FIRST_CHILD(this), + FIRST_CHILD(this)->fops->access, + loc, + mask); + return 0; +} + +int32_t +trace_ftruncate (call_frame_t *frame, + xlator_t *this, + fd_t *fd, + off_t offset) +{ + ERR_EINVAL_NORETURN (!this || !fd); + + if (trace_fop_names[GF_FOP_FTRUNCATE].enabled) { + gf_log (this->name, GF_LOG_NORMAL, + "%"PRId64": (offset=%"PRId64", *fd=%p)", + frame->root->unique, offset, fd); + } + + STACK_WIND (frame, + trace_ftruncate_cbk, + FIRST_CHILD(this), + FIRST_CHILD(this)->fops->ftruncate, + fd, + offset); + + return 0; +} + +int32_t +trace_fchown (call_frame_t *frame, + xlator_t *this, + fd_t *fd, + uid_t uid, + gid_t gid) +{ + ERR_EINVAL_NORETURN (!this || !fd); + + if (trace_fop_names[GF_FOP_FCHOWN].enabled) { + gf_log (this->name, GF_LOG_NORMAL, + "%"PRId64": (*fd=%p, uid=%d, gid=%d)", + frame->root->unique, fd, uid, gid); + } + + STACK_WIND (frame, + trace_fchown_cbk, + FIRST_CHILD(this), + 
FIRST_CHILD(this)->fops->fchown, + fd, + uid, + gid); + return 0; +} + +int32_t +trace_fchmod (call_frame_t *frame, + xlator_t *this, + fd_t *fd, + mode_t mode) +{ + ERR_EINVAL_NORETURN (!this || !fd); + + if (trace_fop_names[GF_FOP_FCHMOD].enabled) { + gf_log (this->name, GF_LOG_NORMAL, + "%"PRId64": (mode=%o, *fd=%p)", + frame->root->unique, mode, fd); + } + + STACK_WIND (frame, + trace_fchmod_cbk, + FIRST_CHILD(this), + FIRST_CHILD(this)->fops->fchmod, + fd, + mode); + return 0; +} + +int32_t +trace_fstat (call_frame_t *frame, + xlator_t *this, + fd_t *fd) +{ + ERR_EINVAL_NORETURN (!this || !fd); + + if (trace_fop_names[GF_FOP_FSTAT].enabled) { + gf_log (this->name, GF_LOG_NORMAL, + "%"PRId64": (*fd=%p)", + frame->root->unique, fd); + } + + STACK_WIND (frame, + trace_fstat_cbk, + FIRST_CHILD(this), + FIRST_CHILD(this)->fops->fstat, + fd); + return 0; +} + +int32_t +trace_lk (call_frame_t *frame, + xlator_t *this, + fd_t *fd, + int32_t cmd, + struct flock *lock) +{ + ERR_EINVAL_NORETURN (!this || !fd); + + if (trace_fop_names[GF_FOP_LK].enabled) { + gf_log (this->name, GF_LOG_NORMAL, + "%"PRId64": (*fd=%p, cmd=%d, lock {l_type=%d, l_whence=%d, " + "l_start=%"PRId64", l_len=%"PRId64", l_pid=%u})", + frame->root->unique, fd, cmd, lock->l_type, lock->l_whence, + lock->l_start, lock->l_len, lock->l_pid); + } + + STACK_WIND (frame, + trace_lk_cbk, + FIRST_CHILD(this), + FIRST_CHILD(this)->fops->lk, + fd, + cmd, + lock); + return 0; +} + +int32_t +trace_setdents (call_frame_t *frame, + xlator_t *this, + fd_t *fd, + int32_t flags, + dir_entry_t *entries, + int32_t count) +{ + if (trace_fop_names[GF_FOP_SETDENTS].enabled) { + gf_log (this->name, GF_LOG_NORMAL, + "%"PRId64": (*fd=%p, flags=%d, count=%d", + frame->root->unique, fd, flags, count); + } + + STACK_WIND (frame, + trace_setdents_cbk, + FIRST_CHILD(this), + FIRST_CHILD(this)->fops->setdents, + fd, + flags, + entries, + count); + return 0; +} + + +int32_t +trace_checksum_cbk (call_frame_t *frame, + void *cookie, + 
xlator_t *this, + int32_t op_ret, + int32_t op_errno, + uint8_t *fchecksum, + uint8_t *dchecksum) +{ + gf_log (this->name, GF_LOG_NORMAL, + "%"PRId64": op_ret (%d), op_errno(%d)", + frame->root->unique, op_ret, op_errno); + + STACK_UNWIND (frame, op_ret, op_errno, fchecksum, dchecksum); + + return 0; +} + +int32_t +trace_checksum (call_frame_t *frame, + xlator_t *this, + loc_t *loc, + int32_t flag) +{ + gf_log (this->name, GF_LOG_NORMAL, + "%"PRId64": loc->path (%s) flag (%d)", + frame->root->unique, loc->path, flag); + + STACK_WIND (frame, + trace_checksum_cbk, + FIRST_CHILD(this), + FIRST_CHILD(this)->fops->checksum, + loc, + flag); + + return 0; +} + + +int32_t +trace_stats_cbk (call_frame_t *frame, + void *cookie, + xlator_t *this, + int32_t op_ret, + int32_t op_errno, + struct xlator_stats *stats) +{ + gf_log (this->name, GF_LOG_NORMAL, + "%"PRId64": op_ret (%d), op_errno(%d)", + frame->root->unique, op_ret, op_errno); + + STACK_UNWIND (frame, op_ret, op_errno, stats); + return 0; +} + +int32_t +trace_stats (call_frame_t *frame, + xlator_t *this, + int32_t flags) +{ + ERR_EINVAL_NORETURN (!this); + + gf_log (this->name, GF_LOG_NORMAL, + "%"PRId64": (flags=%d)", + frame->root->unique, flags); + + STACK_WIND (frame, + trace_stats_cbk, + FIRST_CHILD(this), + FIRST_CHILD(this)->mops->stats, + flags); + + return 0; +} + +void +enable_all_calls (int enabled) +{ + int i; + for (i = 0; i < GF_FOP_MAXVALUE; i++) + trace_fop_names[i].enabled = enabled; +} + +void +enable_call (const char *name, int enabled) +{ + int i; + for (i = 0; i < GF_FOP_MAXVALUE; i++) + if (!strcasecmp(trace_fop_names[i].name, name)) + trace_fop_names[i].enabled = enabled; +} + + +/* + include = 1 for "include-ops" + = 0 for "exclude-ops" +*/ +void +process_call_list (const char *list, int include) +{ + enable_all_calls (include ? 
0 : 1); + + char *call = strsep ((char **)&list, ","); + while (call) { + enable_call (call, include); + call = strsep ((char **)&list, ","); + } +} + + +int32_t +init (xlator_t *this) +{ + dict_t *options = this->options; + char *includes = NULL, *excludes = NULL; + + if (!this) + return -1; + + if (!this->children || this->children->next) { + gf_log (this->name, GF_LOG_ERROR, + "trace translator requires one subvolume"); + return -1; + } + if (!this->parents) { + gf_log (this->name, GF_LOG_WARNING, + "dangling volume. check volfile "); + } + + + includes = data_to_str (dict_get (options, "include-ops")); + excludes = data_to_str (dict_get (options, "exclude-ops")); + + { + int i; + for (i = 0; i < GF_FOP_MAXVALUE; i++) { + trace_fop_names[i].name = (gf_fop_list[i] ? + gf_fop_list[i] : ":O"); + trace_fop_names[i].enabled = 1; + } + } + + if (includes && excludes) { + gf_log (this->name, + GF_LOG_ERROR, + "must specify only one of 'include-ops' and 'exclude-ops'"); + return -1; + } + if (includes) + process_call_list (includes, 1); + if (excludes) + process_call_list (excludes, 0); + + gf_log_set_loglevel (GF_LOG_NORMAL); + + /* Set this translator's inode table pointer to child node's pointer. 
*/ + this->itable = FIRST_CHILD (this)->itable; + + return 0; +} + +void +fini (xlator_t *this) +{ + if (!this) + return; + + gf_log (this->name, GF_LOG_NORMAL, + "trace translator unloaded"); + return; +} + +struct xlator_fops fops = { + .stat = trace_stat, + .readlink = trace_readlink, + .mknod = trace_mknod, + .mkdir = trace_mkdir, + .unlink = trace_unlink, + .rmdir = trace_rmdir, + .symlink = trace_symlink, + .rename = trace_rename, + .link = trace_link, + .chmod = trace_chmod, + .chown = trace_chown, + .truncate = trace_truncate, + .utimens = trace_utimens, + .open = trace_open, + .readv = trace_readv, + .writev = trace_writev, + .statfs = trace_statfs, + .flush = trace_flush, + .fsync = trace_fsync, + .setxattr = trace_setxattr, + .getxattr = trace_getxattr, + .removexattr = trace_removexattr, + .opendir = trace_opendir, + .readdir = trace_readdir, + .fsyncdir = trace_fsyncdir, + .access = trace_access, + .ftruncate = trace_ftruncate, + .fstat = trace_fstat, + .create = trace_create, + .fchown = trace_fchown, + .fchmod = trace_fchmod, + .lk = trace_lk, + .inodelk = trace_inodelk, + .finodelk = trace_finodelk, + .entrylk = trace_entrylk, + .lookup = trace_lookup, + .setdents = trace_setdents, + .getdents = trace_getdents, + .checksum = trace_checksum, + .xattrop = trace_xattrop, + .fxattrop = trace_fxattrop, +}; + +struct xlator_mops mops = { + .stats = trace_stats, +}; + +struct xlator_cbks cbks = { +}; + +struct volume_options options[] = { + { .key = {"include-ops", "include"}, + .type = GF_OPTION_TYPE_STR, + /*.value = { ""} */ + }, + { .key = {"exclude-ops", "exclude"}, + .type = GF_OPTION_TYPE_STR + /*.value = { ""} */ + }, + { .key = {NULL} }, +}; + diff --git a/xlators/encryption/Makefile.am b/xlators/encryption/Makefile.am new file mode 100644 index 000000000..2cbde680f --- /dev/null +++ b/xlators/encryption/Makefile.am @@ -0,0 +1,3 @@ +SUBDIRS = rot-13 + +CLEANFILES = diff --git a/xlators/encryption/rot-13/Makefile.am 
b/xlators/encryption/rot-13/Makefile.am new file mode 100644 index 000000000..d471a3f92 --- /dev/null +++ b/xlators/encryption/rot-13/Makefile.am @@ -0,0 +1,3 @@ +SUBDIRS = src + +CLEANFILES = diff --git a/xlators/encryption/rot-13/src/Makefile.am b/xlators/encryption/rot-13/src/Makefile.am new file mode 100644 index 000000000..ba5e623d8 --- /dev/null +++ b/xlators/encryption/rot-13/src/Makefile.am @@ -0,0 +1,14 @@ +xlator_LTLIBRARIES = rot-13.la +xlatordir = $(libdir)/glusterfs/$(PACKAGE_VERSION)/xlator/encryption + +rot_13_la_LDFLAGS = -module -avoid-version + +rot_13_la_SOURCES = rot-13.c +rot_13_la_LIBADD = $(top_builddir)/libglusterfs/src/libglusterfs.la + +noinst_HEADERS = rot-13.h + +AM_CFLAGS = -fPIC -D_FILE_OFFSET_BITS=64 -D_GNU_SOURCE -Wall -D$(GF_HOST_OS) \ + -I$(top_srcdir)/libglusterfs/src -shared -nostartfiles $(GF_CFLAGS) + +CLEANFILES = diff --git a/xlators/encryption/rot-13/src/rot-13.c b/xlators/encryption/rot-13/src/rot-13.c new file mode 100644 index 000000000..7cae46134 --- /dev/null +++ b/xlators/encryption/rot-13/src/rot-13.c @@ -0,0 +1,200 @@ +/* + Copyright (c) 2006, 2007, 2008 Z RESEARCH, Inc. <http://www.zresearch.com> + This file is part of GlusterFS. + + GlusterFS is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published + by the Free Software Foundation; either version 3 of the License, + or (at your option) any later version. + + GlusterFS is distributed in the hope that it will be useful, but + WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program. If not, see + <http://www.gnu.org/licenses/>. 
+*/ + +#include <ctype.h> +#include <sys/uio.h> + +#ifndef _CONFIG_H +#define _CONFIG_H +#include "config.h" +#endif + +#include "glusterfs.h" +#include "xlator.h" +#include "logging.h" + +#include "rot-13.h" + +/* + * This is a rot13 ``encryption'' xlator. It rot13's data when + * writing to disk and rot13's it back when reading it. + * This xlator is meant as an example, NOT FOR PRODUCTION + * USE ;) (hence no error-checking) + */ + +void +rot13 (char *buf, int len) +{ + int i; + for (i = 0; i < len; i++) { + if (buf[i] >= 'a' && buf[i] <= 'z') + buf[i] = 'a' + ((buf[i] - 'a' + 13) % 26); + else if (buf[i] >= 'A' && buf[i] <= 'Z') + buf[i] = 'A' + ((buf[i] - 'A' + 13) % 26); + } +} + +void +rot13_iovec (struct iovec *vector, int count) +{ + int i; + for (i = 0; i < count; i++) { + rot13 (vector[i].iov_base, vector[i].iov_len); + } +} + +int32_t +rot13_readv_cbk (call_frame_t *frame, + void *cookie, + xlator_t *this, + int32_t op_ret, + int32_t op_errno, + struct iovec *vector, + int32_t count, + struct stat *stbuf) +{ + rot_13_private_t *priv = (rot_13_private_t *)this->private; + + if (priv->decrypt_read) + rot13_iovec (vector, count); + + STACK_UNWIND (frame, op_ret, op_errno, vector, count, stbuf); + return 0; +} + +int32_t +rot13_readv (call_frame_t *frame, + xlator_t *this, + fd_t *fd, + size_t size, + off_t offset) +{ + STACK_WIND (frame, + rot13_readv_cbk, + FIRST_CHILD (this), + FIRST_CHILD (this)->fops->readv, + fd, size, offset); + return 0; +} + +int32_t +rot13_writev_cbk (call_frame_t *frame, + void *cookie, + xlator_t *this, + int32_t op_ret, + int32_t op_errno, + struct stat *stbuf) +{ + STACK_UNWIND (frame, op_ret, op_errno, stbuf); + return 0; +} + +int32_t +rot13_writev (call_frame_t *frame, + xlator_t *this, + fd_t *fd, + struct iovec *vector, + int32_t count, + off_t offset) +{ + rot_13_private_t *priv = (rot_13_private_t *)this->private; + if (priv->encrypt_write) + rot13_iovec (vector, count); + + STACK_WIND (frame, + rot13_writev_cbk, + 
FIRST_CHILD (this), + FIRST_CHILD (this)->fops->writev, + fd, vector, count, offset); + return 0; +} + +int32_t +init (xlator_t *this) +{ + data_t *data = NULL; + rot_13_private_t *priv = NULL; + + if (!this->children || this->children->next) { + gf_log ("rot13", GF_LOG_ERROR, + "FATAL: rot13 should have exactly one child"); + return -1; + } + + if (!this->parents) { + gf_log (this->name, GF_LOG_WARNING, + "dangling volume. check volfile "); + } + + priv = CALLOC (sizeof (rot_13_private_t), 1); + ERR_ABORT (priv); + priv->decrypt_read = 1; + priv->encrypt_write = 1; + + data = dict_get (this->options, "encrypt-write"); + if (data) { + if (gf_string2boolean (data->data, &priv->encrypt_write) == -1) { + gf_log (this->name, GF_LOG_ERROR, + "encrypt-write takes only boolean options"); + return -1; + } + } + + data = dict_get (this->options, "decrypt-read"); + if (data) { + if (gf_string2boolean (data->data, &priv->decrypt_read) == -1) { + gf_log (this->name, GF_LOG_ERROR, + "decrypt-read takes only boolean options"); + return -1; + } + } + + this->private = priv; + gf_log ("rot13", GF_LOG_DEBUG, "rot13 xlator loaded"); + return 0; +} + +void +fini (xlator_t *this) +{ + rot_13_private_t *priv = this->private; + + FREE (priv); + + return; +} + +struct xlator_fops fops = { + .readv = rot13_readv, + .writev = rot13_writev +}; + +struct xlator_mops mops = { +}; + + +struct volume_options options[] = { + { .key = {"encrypt-write"}, + .type = GF_OPTION_TYPE_BOOL + }, + { .key = {"decrypt-read"}, + .type = GF_OPTION_TYPE_BOOL + }, + { .key = {NULL} }, +}; diff --git a/xlators/encryption/rot-13/src/rot-13.h b/xlators/encryption/rot-13/src/rot-13.h new file mode 100644 index 000000000..43e60c326 --- /dev/null +++ b/xlators/encryption/rot-13/src/rot-13.h @@ -0,0 +1,33 @@ +/* + Copyright (c) 2006, 2007, 2008 Z RESEARCH, Inc. <http://www.zresearch.com> + This file is part of GlusterFS. 
+ + GlusterFS is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published + by the Free Software Foundation; either version 3 of the License, + or (at your option) any later version. + + GlusterFS is distributed in the hope that it will be useful, but + WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program. If not, see + <http://www.gnu.org/licenses/>. +*/ + +#ifndef __ROT_13_H__ +#define __ROT_13_H__ + +#ifndef _CONFIG_H +#define _CONFIG_H +#include "config.h" +#endif + +typedef struct { /* rot-13 translator settings, kept in this->private */ + gf_boolean_t encrypt_write; /* when true, writev rot13's the outgoing buffers */ + gf_boolean_t decrypt_read; /* when true, readv's callback rot13's the returned buffers */ +} rot_13_private_t; + +#endif /* __ROT_13_H__ */ diff --git a/xlators/features/Makefile.am b/xlators/features/Makefile.am new file mode 100644 index 000000000..9ac9b6f19 --- /dev/null +++ b/xlators/features/Makefile.am @@ -0,0 +1,3 @@ +SUBDIRS = locks trash path-convertor filter quota + +CLEANFILES = diff --git a/xlators/features/filter/Makefile.am b/xlators/features/filter/Makefile.am new file mode 100644 index 000000000..d471a3f92 --- /dev/null +++ b/xlators/features/filter/Makefile.am @@ -0,0 +1,3 @@ +SUBDIRS = src + +CLEANFILES = diff --git a/xlators/features/filter/src/Makefile.am b/xlators/features/filter/src/Makefile.am new file mode 100644 index 000000000..fa0b92214 --- /dev/null +++ b/xlators/features/filter/src/Makefile.am @@ -0,0 +1,13 @@ +xlator_LTLIBRARIES = filter.la +xlatordir = $(libdir)/glusterfs/$(PACKAGE_VERSION)/xlator/features + +filter_la_LDFLAGS = -module -avoid-version + +filter_la_SOURCES = filter.c +filter_la_LIBADD = $(top_builddir)/libglusterfs/src/libglusterfs.la + +AM_CFLAGS = -fPIC -D_FILE_OFFSET_BITS=64 -D_GNU_SOURCE -Wall -D$(GF_HOST_OS) \ + -I$(top_srcdir)/libglusterfs/src -shared -nostartfiles $(GF_CFLAGS) + 
+CLEANFILES = + diff --git a/xlators/features/filter/src/filter.c b/xlators/features/filter/src/filter.c new file mode 100644 index 000000000..67ea45d3a --- /dev/null +++ b/xlators/features/filter/src/filter.c @@ -0,0 +1,1768 @@ +/* + Copyright (c) 2008 Z RESEARCH, Inc. <http://www.zresearch.com> + This file is part of GlusterFS. + + GlusterFS is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published + by the Free Software Foundation; either version 3 of the License, + or (at your option) any later version. + + GlusterFS is distributed in the hope that it will be useful, but + WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program. If not, see + <http://www.gnu.org/licenses/>. +*/ + +#ifndef _CONFIG_H +#define _CONFIG_H +#include "config.h" +#endif + +#include <sys/types.h> +#include <sys/stat.h> +#include <unistd.h> + +#include "glusterfs.h" +#include "logging.h" +#include "dict.h" +#include "xlator.h" + +#define GF_FILTER_NOBODY_UID 65534 +#define GF_FILTER_NOBODY_GID 65534 +#define GF_FILTER_ROOT_UID 0 +#define GF_FILTER_ROOT_GID 0 + +#define GF_MAXIMUM_FILTERING_ALLOWED 32 + +/* + option root-filtering on (off by default) + option translate-uid <uid-range=newuid,uid=newuid> + option translate-gid <gid-range=newgid,gid=newgid> + option read-only <yes|true> + option fixed-uid <uid> + option fixed-gid <gid> + option filter-uid <uid-range,uid> + option filter-gid <gid-range,gid> // not supported yet + +*/ + +struct gf_filter { + /* Flags */ + gf_boolean_t complete_read_only; + char fixed_uid_set; + char fixed_gid_set; + char partial_filter; + + /* Options */ + /* Mapping/Filtering/Translate whatever you want to call */ + int translate_num_uid_entries; + int translate_num_gid_entries; + int 
translate_input_uid[GF_MAXIMUM_FILTERING_ALLOWED][2]; + int translate_output_uid[GF_MAXIMUM_FILTERING_ALLOWED]; + int translate_input_gid[GF_MAXIMUM_FILTERING_ALLOWED][2]; + int translate_output_gid[GF_MAXIMUM_FILTERING_ALLOWED]; + + /* Fixed uid/gid */ + int fixed_uid; + int fixed_gid; + + /* Filter */ + int filter_num_uid_entries; + int filter_num_gid_entries; + int filter_input_uid[GF_MAXIMUM_FILTERING_ALLOWED][2]; + int filter_input_gid[GF_MAXIMUM_FILTERING_ALLOWED][2]; + +}; + +/* update_frame: The main logic of the whole translator. + Return values: + 0: no change + // TRANSLATE + 1: only uid changed + 2: only gid changed + 3: both uid/gid changed + // FILTER + 4: uid in filter range + 5: gid in filter range // not supported yet + 6: complete fs is readonly +*/ + +#define GF_FILTER_NO_CHANGE 0 +#define GF_FILTER_MAP_UID 1 +#define GF_FILTER_MAP_GID 2 +#define GF_FILTER_MAP_BOTH 3 +#define GF_FILTER_FILTER_UID 4 +#define GF_FILTER_FILTER_GID 5 +#define GF_FILTER_RO_FS 6 + +static int32_t +update_frame (call_frame_t *frame, + inode_t *inode, + struct gf_filter *filter) +{ + uid_t uid = 0; + int32_t idx = 0; + int32_t ret = 0; + int32_t dictret = 0; + uint64_t tmp_uid = 0; + + for (idx = 0; idx < filter->translate_num_uid_entries; idx++) { + if ((frame->root->uid >=filter->translate_input_uid[idx][0]) && + (frame->root->uid <=filter->translate_input_uid[idx][1])) { + dictret = inode_ctx_get (inode, frame->this, &tmp_uid); + uid = (uid_t)tmp_uid; + if (dictret == 0) { + if (frame->root->uid != uid) + ret = GF_FILTER_MAP_UID; + } else { + ret = GF_FILTER_MAP_UID; + } + break; + } + } + + for (idx = 0; idx < filter->translate_num_gid_entries; idx++) { + if ((frame->root->gid >=filter->translate_input_gid[idx][0]) && + (frame->root->gid <=filter->translate_input_gid[idx][1])) { + if (ret == GF_FILTER_NO_CHANGE) + ret = GF_FILTER_MAP_GID; + else + ret = GF_FILTER_MAP_BOTH; + break; + } + } + + + if (filter->complete_read_only) + return GF_FILTER_RO_FS; + + if 
(filter->partial_filter) { + dictret = inode_ctx_get (inode, frame->this, &tmp_uid); + uid = (uid_t)tmp_uid; + if (dictret != -1) { + for (idx = 0; idx < filter->filter_num_uid_entries; + idx++) { + if ((uid >=filter->filter_input_uid[idx][0]) && + (uid <=filter->filter_input_uid[idx][1])) { + return GF_FILTER_FILTER_UID; + } + } + } + } + + return ret; +} + +/* if 'root' don't change the uid/gid */ +static int32_t +update_stat (struct stat *stbuf, + struct gf_filter *filter) +{ + int32_t idx = 0; + for (idx = 0; idx < filter->translate_num_uid_entries; idx++) { + if (stbuf->st_uid == GF_FILTER_ROOT_UID) + continue; + if ((stbuf->st_uid >= filter->translate_input_uid[idx][0]) && + (stbuf->st_uid <= filter->translate_input_uid[idx][1])) { + stbuf->st_uid = filter->translate_output_uid[idx]; + break; + } + } + + for (idx = 0; idx < filter->translate_num_gid_entries; idx++) { + if (stbuf->st_gid == GF_FILTER_ROOT_GID) + continue; + if ((stbuf->st_gid >= filter->translate_input_gid[idx][0]) && + (stbuf->st_gid <= filter->translate_input_gid[idx][1])) { + stbuf->st_gid = filter->translate_output_gid[idx]; + break; + } + } + + if (filter->fixed_uid_set) { + stbuf->st_uid = filter->fixed_uid; + } + + if (filter->fixed_gid_set) { + stbuf->st_gid = filter->fixed_gid; + } + + return 0; +} + +static int32_t +filter_lookup_cbk (call_frame_t *frame, + void *cookie, + xlator_t *this, + int32_t op_ret, + int32_t op_errno, + inode_t *inode, + struct stat *buf, + dict_t *dict) +{ + int ret = 0; + if (op_ret >= 0) { + update_stat (buf, this->private); + ret = inode_ctx_put (inode, this, (uint64_t)(long)buf->st_uid); + if (ret == -1) { + gf_log (this->name, GF_LOG_ERROR, + "couldn't set context"); + } + } + STACK_UNWIND (frame, op_ret, op_errno, inode, buf, dict); + return 0; +} + +int32_t +filter_lookup (call_frame_t *frame, + xlator_t *this, + loc_t *loc, + dict_t *xattr_req) +{ + STACK_WIND (frame, + filter_lookup_cbk, + FIRST_CHILD(this), + FIRST_CHILD(this)->fops->lookup, + loc, 
+ xattr_req); + return 0; +} + + +static int32_t +filter_stat_cbk (call_frame_t *frame, + void *cookie, + xlator_t *this, + int32_t op_ret, + int32_t op_errno, + struct stat *buf) +{ + if (op_ret >= 0) { + update_stat (buf, this->private); + } + STACK_UNWIND (frame, op_ret, op_errno, buf); + return 0; +} + +int32_t +filter_stat (call_frame_t *frame, + xlator_t *this, + loc_t *loc) +{ + STACK_WIND (frame, + filter_stat_cbk, + FIRST_CHILD(this), + FIRST_CHILD(this)->fops->stat, + loc); + return 0; +} + +static int32_t +filter_chmod_cbk (call_frame_t *frame, + void *cookie, + xlator_t *this, + int32_t op_ret, + int32_t op_errno, + struct stat *buf) +{ + if (op_ret >= 0) { + update_stat (buf, this->private); + } + STACK_UNWIND (frame, op_ret, op_errno, buf); + return 0; +} + +int32_t +filter_chmod (call_frame_t *frame, + xlator_t *this, + loc_t *loc, + mode_t mode) +{ + int32_t ret = 0; + ret = update_frame (frame, loc->inode, this->private); + switch (ret) { + case GF_FILTER_MAP_UID: + if (loc->inode->st_mode & S_IWGRP) + break; + case GF_FILTER_MAP_BOTH: + if (loc->inode->st_mode & S_IWOTH) + break; + gf_log (this->name, GF_LOG_DEBUG, "%s: returning permission denied", loc->path); + STACK_UNWIND (frame, -1, EPERM, NULL); + return 0; + + case GF_FILTER_FILTER_UID: + case GF_FILTER_FILTER_GID: + case GF_FILTER_RO_FS: + STACK_UNWIND (frame, -1, EROFS, NULL); + return 0; + default: + break; + } + + STACK_WIND (frame, + filter_chmod_cbk, + FIRST_CHILD(this), + FIRST_CHILD(this)->fops->chmod, + loc, + mode); + return 0; +} + + +static int32_t +filter_fchmod_cbk (call_frame_t *frame, + void *cookie, + xlator_t *this, + int32_t op_ret, + int32_t op_errno, + struct stat *buf) +{ + if (op_ret >= 0) { + update_stat (buf, this->private); + } + STACK_UNWIND (frame, + op_ret, + op_errno, + buf); + return 0; +} + +int32_t +filter_fchmod (call_frame_t *frame, + xlator_t *this, + fd_t *fd, + mode_t mode) +{ + STACK_WIND (frame, + filter_fchmod_cbk, + FIRST_CHILD(this), + 
FIRST_CHILD(this)->fops->fchmod, + fd, + mode); + return 0; +} + +static int32_t +filter_chown_cbk (call_frame_t *frame, + void *cookie, + xlator_t *this, + int32_t op_ret, + int32_t op_errno, + struct stat *buf) +{ + if (op_ret >= 0) { + update_stat (buf, this->private); + } + STACK_UNWIND (frame, op_ret, op_errno, buf); + return 0; +} + +int32_t +filter_chown (call_frame_t *frame, + xlator_t *this, + loc_t *loc, + uid_t uid, + gid_t gid) +{ + int32_t ret = 0; + ret = update_frame (frame, loc->inode, this->private); + switch (ret) { + case GF_FILTER_MAP_UID: + if (loc->inode->st_mode & S_IWGRP) + break; + case GF_FILTER_MAP_BOTH: + if (loc->inode->st_mode & S_IWOTH) + break; + gf_log (this->name, GF_LOG_DEBUG, "%s: returning permission denied", loc->path); + STACK_UNWIND (frame, -1, EPERM, NULL); + return 0; + + case GF_FILTER_FILTER_UID: + case GF_FILTER_FILTER_GID: + case GF_FILTER_RO_FS: + STACK_UNWIND (frame, -1, EROFS, NULL); + return 0; + default: + break; + } + + STACK_WIND (frame, + filter_chown_cbk, + FIRST_CHILD(this), + FIRST_CHILD(this)->fops->chown, + loc, + uid, + gid); + return 0; +} + +static int32_t +filter_fchown_cbk (call_frame_t *frame, + void *cookie, + xlator_t *this, + int32_t op_ret, + int32_t op_errno, + struct stat *buf) +{ + if (op_ret >= 0) { + update_stat (buf, this->private); + } + STACK_UNWIND (frame, op_ret, op_errno, buf); + return 0; +} + +int32_t +filter_fchown (call_frame_t *frame, + xlator_t *this, + fd_t *fd, + uid_t uid, + gid_t gid) +{ + STACK_WIND (frame, + filter_fchown_cbk, + FIRST_CHILD(this), + FIRST_CHILD(this)->fops->fchown, + fd, + uid, + gid); + return 0; +} + +static int32_t +filter_truncate_cbk (call_frame_t *frame, + void *cookie, + xlator_t *this, + int32_t op_ret, + int32_t op_errno, + struct stat *buf) +{ + if (op_ret >= 0) { + update_stat (buf, this->private); + } + STACK_UNWIND (frame, op_ret, op_errno, buf); + return 0; +} + +int32_t +filter_truncate (call_frame_t *frame, + xlator_t *this, + loc_t *loc, + 
off_t offset) +{ + int32_t ret = 0; + ret = update_frame (frame, loc->inode, this->private); + switch (ret) { + case GF_FILTER_MAP_UID: + if (loc->inode->st_mode & S_IWGRP) + break; + case GF_FILTER_MAP_BOTH: + if (loc->inode->st_mode & S_IWOTH) + break; + gf_log (this->name, GF_LOG_DEBUG, "%s: returning permission denied", loc->path); + STACK_UNWIND (frame, -1, EPERM, NULL); + return 0; + + case GF_FILTER_FILTER_UID: + case GF_FILTER_FILTER_GID: + case GF_FILTER_RO_FS: + STACK_UNWIND (frame, -1, EROFS, NULL); + return 0; + } + + STACK_WIND (frame, + filter_truncate_cbk, + FIRST_CHILD(this), + FIRST_CHILD(this)->fops->truncate, + loc, + offset); + return 0; +} + +static int32_t +filter_ftruncate_cbk (call_frame_t *frame, + void *cookie, + xlator_t *this, + int32_t op_ret, + int32_t op_errno, + struct stat *buf) +{ + if (op_ret >= 0) { + update_stat (buf, this->private); + } + STACK_UNWIND (frame, op_ret, op_errno, buf); + return 0; +} + +int32_t +filter_ftruncate (call_frame_t *frame, + xlator_t *this, + fd_t *fd, + off_t offset) +{ + STACK_WIND (frame, + filter_ftruncate_cbk, + FIRST_CHILD(this), + FIRST_CHILD(this)->fops->ftruncate, + fd, + offset); + return 0; +} + +int32_t +filter_utimens_cbk (call_frame_t *frame, + void *cookie, + xlator_t *this, + int32_t op_ret, + int32_t op_errno, + struct stat *buf) +{ + if (op_ret >= 0) { + update_stat (buf, this->private); + } + STACK_UNWIND (frame, op_ret, op_errno, buf); + return 0; +} + + +int32_t +filter_utimens (call_frame_t *frame, + xlator_t *this, + loc_t *loc, + struct timespec tv[2]) +{ + int32_t ret = 0; + ret = update_frame (frame, loc->inode, this->private); + switch (ret) { + case GF_FILTER_MAP_UID: + if (loc->inode->st_mode & S_IWGRP) + break; + case GF_FILTER_MAP_BOTH: + if (loc->inode->st_mode & S_IWOTH) + break; + gf_log (this->name, GF_LOG_DEBUG, "%s: returning permission denied", loc->path); + STACK_UNWIND (frame, -1, EPERM, NULL); + return 0; + + case GF_FILTER_FILTER_UID: + case 
GF_FILTER_FILTER_GID: + case GF_FILTER_RO_FS: + STACK_UNWIND (frame, -1, EROFS, NULL); + return 0; + } + + STACK_WIND (frame, + filter_utimens_cbk, + FIRST_CHILD(this), + FIRST_CHILD(this)->fops->utimens, + loc, + tv); + return 0; +} + +static int32_t +filter_readlink_cbk (call_frame_t *frame, + void *cookie, + xlator_t *this, + int32_t op_ret, + int32_t op_errno, + const char *path) +{ + STACK_UNWIND (frame, op_ret, op_errno, path); + return 0; +} + +int32_t +filter_readlink (call_frame_t *frame, + xlator_t *this, + loc_t *loc, + size_t size) +{ + int32_t ret = 0; + ret = update_frame (frame, loc->inode, this->private); + switch (ret) { + case GF_FILTER_MAP_UID: + if (loc->inode->st_mode & S_IRGRP) + break; + case GF_FILTER_MAP_BOTH: + if (loc->inode->st_mode & S_IROTH) + break; + gf_log (this->name, GF_LOG_DEBUG, "%s: returning permission denied", loc->path); + STACK_UNWIND (frame, -1, EPERM, NULL); + return 0; + } + STACK_WIND (frame, + filter_readlink_cbk, + FIRST_CHILD(this), + FIRST_CHILD(this)->fops->readlink, + loc, + size); + return 0; +} + + +static int32_t +filter_mknod_cbk (call_frame_t *frame, + void *cookie, + xlator_t *this, + int32_t op_ret, + int32_t op_errno, + inode_t *inode, + struct stat *buf) +{ + int ret = 0; + + if (op_ret >= 0) { + update_stat (buf, this->private); + ret = inode_ctx_put (inode, this, (uint64_t)(long)buf->st_uid); + if (ret == -1) { + gf_log (this->name, GF_LOG_ERROR, + "couldn't set context"); + } + } + STACK_UNWIND (frame, op_ret, op_errno, inode, buf); + return 0; +} + +int32_t +filter_mknod (call_frame_t *frame, + xlator_t *this, + loc_t *loc, + mode_t mode, + dev_t rdev) +{ + int ret = 0; + inode_t *parent = loc->parent; + ret = update_frame (frame, loc->inode, this->private); + switch (ret) { + case GF_FILTER_MAP_UID: + if (parent->st_mode & S_IWGRP) + break; + case GF_FILTER_MAP_BOTH: + if (parent->st_mode & S_IWOTH) + break; + gf_log (this->name, GF_LOG_DEBUG, "%s: returning permission denied", loc->path); + 
STACK_UNWIND (frame, -1, EPERM); + return 0; + + case GF_FILTER_FILTER_UID: + case GF_FILTER_FILTER_GID: + case GF_FILTER_RO_FS: + STACK_UNWIND (frame, -1, EROFS, NULL, NULL); + return 0; + } + STACK_WIND (frame, + filter_mknod_cbk, + FIRST_CHILD(this), + FIRST_CHILD(this)->fops->mknod, + loc, mode, rdev); + return 0; +} + +static int32_t +filter_mkdir_cbk (call_frame_t *frame, + void *cookie, + xlator_t *this, + int32_t op_ret, + int32_t op_errno, + inode_t *inode, + struct stat *buf) +{ + int ret = 0; + if (op_ret >= 0) { + update_stat (buf, this->private); + ret = inode_ctx_put (inode, this, (uint64_t)(long)buf->st_uid); + if (ret == -1) { + gf_log (this->name, GF_LOG_ERROR, + "couldn't set context"); + } + } + STACK_UNWIND (frame, op_ret, op_errno, inode, buf); + return 0; +} + +int32_t +filter_mkdir (call_frame_t *frame, + xlator_t *this, + loc_t *loc, + mode_t mode) +{ + int ret = 0; + inode_t *parent = loc->parent; + ret = update_frame (frame, loc->inode, this->private); + switch (ret) { + case GF_FILTER_MAP_UID: + if (parent->st_mode & S_IWGRP) + break; + case GF_FILTER_MAP_BOTH: + if (parent->st_mode & S_IWOTH) + break; + gf_log (this->name, GF_LOG_DEBUG, "%s: returning permission denied", loc->path); + STACK_UNWIND (frame, -1, EPERM); + return 0; + + case GF_FILTER_FILTER_UID: + case GF_FILTER_FILTER_GID: + case GF_FILTER_RO_FS: + STACK_UNWIND (frame, -1, EROFS, NULL, NULL); + return 0; + } + STACK_WIND (frame, + filter_mkdir_cbk, + FIRST_CHILD(this), + FIRST_CHILD(this)->fops->mkdir, + loc, mode); + return 0; +} + +static int32_t +filter_unlink_cbk (call_frame_t *frame, + void *cookie, + xlator_t *this, + int32_t op_ret, + int32_t op_errno) +{ + STACK_UNWIND (frame, op_ret, op_errno); + return 0; +} + +int32_t +filter_unlink (call_frame_t *frame, + xlator_t *this, + loc_t *loc) +{ + int32_t ret = 0; + inode_t *parent = loc->parent; + if (!parent) + parent = inode_parent (loc->inode, 0, NULL); + ret = update_frame (frame, loc->inode, this->private); + 
switch (ret) { + case GF_FILTER_MAP_UID: + if (parent->st_mode & S_IWGRP) + break; + if (loc->inode->st_mode & S_IWGRP) + break; + case GF_FILTER_MAP_BOTH: + if (parent->st_mode & S_IWOTH) + break; + if (loc->inode->st_mode & S_IWOTH) + break; + gf_log (this->name, GF_LOG_DEBUG, "%s: returning permission denied", loc->path); + STACK_UNWIND (frame, -1, EPERM); + return 0; + case GF_FILTER_FILTER_UID: + case GF_FILTER_FILTER_GID: + case GF_FILTER_RO_FS: + STACK_UNWIND (frame, -1, EROFS); + return 0; + } + STACK_WIND (frame, + filter_unlink_cbk, + FIRST_CHILD(this), + FIRST_CHILD(this)->fops->unlink, + loc); + return 0; +} + +static int32_t +filter_rmdir_cbk (call_frame_t *frame, + void *cookie, + xlator_t *this, + int32_t op_ret, + int32_t op_errno) +{ + STACK_UNWIND (frame, op_ret, op_errno); + return 0; +} + +int32_t +filter_rmdir (call_frame_t *frame, + xlator_t *this, + loc_t *loc) +{ + int32_t ret = 0; + inode_t *parent = loc->parent; + if (!parent) + parent = inode_parent (loc->inode, 0, NULL); + ret = update_frame (frame, loc->inode, this->private); + switch (ret) { + case GF_FILTER_MAP_UID: + if (parent->st_mode & S_IWGRP) + break; + if (loc->inode->st_mode & S_IWGRP) + break; + case GF_FILTER_MAP_BOTH: + if (parent->st_mode & S_IWOTH) + break; + if (loc->inode->st_mode & S_IWOTH) + break; + gf_log (this->name, GF_LOG_DEBUG, "%s: returning permission denied", loc->path); + STACK_UNWIND (frame, -1, EPERM); + return 0; + case GF_FILTER_FILTER_UID: + case GF_FILTER_FILTER_GID: + case GF_FILTER_RO_FS: + STACK_UNWIND (frame, -1, EROFS); + return 0; + } + STACK_WIND (frame, + filter_rmdir_cbk, + FIRST_CHILD(this), + FIRST_CHILD(this)->fops->rmdir, + loc); + return 0; +} + +static int32_t +filter_symlink_cbk (call_frame_t *frame, + void *cookie, + xlator_t *this, + int32_t op_ret, + int32_t op_errno, + inode_t *inode, + struct stat *buf) +{ + int ret = 0; + if (op_ret >= 0) { + update_stat (buf, this->private); + ret = inode_ctx_put (inode, this, 
(uint64_t)(long)buf->st_uid); + if (ret == -1) { + gf_log (this->name, GF_LOG_ERROR, + "couldn't set context"); + } + } + STACK_UNWIND (frame, op_ret, op_errno, inode, buf); + return 0; +} + +int32_t +filter_symlink (call_frame_t *frame, + xlator_t *this, + const char *linkpath, + loc_t *loc) +{ + int ret = 0; + inode_t *parent = loc->parent; + ret = update_frame (frame, loc->inode, this->private); + switch (ret) { + case GF_FILTER_MAP_UID: + if (parent->st_mode & S_IWGRP) + break; + case GF_FILTER_MAP_BOTH: + if (parent->st_mode & S_IWOTH) + break; + gf_log (this->name, GF_LOG_DEBUG, "%s: returning permission denied", loc->path); + STACK_UNWIND (frame, -1, EPERM, NULL, NULL); /* symlink callback takes inode and buf, as in the EROFS case below */ + return 0; + + case GF_FILTER_FILTER_UID: + case GF_FILTER_FILTER_GID: + case GF_FILTER_RO_FS: + STACK_UNWIND (frame, -1, EROFS, NULL, NULL); + return 0; + } + STACK_WIND (frame, + filter_symlink_cbk, + FIRST_CHILD(this), + FIRST_CHILD(this)->fops->symlink, + linkpath, loc); + return 0; +} + + +static int32_t +filter_rename_cbk (call_frame_t *frame, + void *cookie, + xlator_t *this, + int32_t op_ret, + int32_t op_errno, + struct stat *buf) +{ + if (op_ret >= 0) { + update_stat (buf, this->private); + } + STACK_UNWIND (frame, op_ret, op_errno, buf); + return 0; +} + +int32_t +filter_rename (call_frame_t *frame, + xlator_t *this, + loc_t *oldloc, + loc_t *newloc) +{ + int32_t ret = 0; + inode_t *parent = oldloc->parent; + if (!parent) + parent = inode_parent (oldloc->inode, 0, NULL); + ret = update_frame (frame, oldloc->inode, this->private); + switch (ret) { + case GF_FILTER_MAP_UID: + if (parent->st_mode & S_IWGRP) + break; + if (oldloc->inode->st_mode & S_IWGRP) + break; + case GF_FILTER_MAP_BOTH: + if (parent->st_mode & S_IWOTH) + break; + if (oldloc->inode->st_mode & S_IWOTH) + break; + gf_log (this->name, GF_LOG_DEBUG, + "%s -> %s: returning permission denied", oldloc->path, newloc->path); + STACK_UNWIND (frame, -1, EPERM, NULL); /* rename callback takes a stat buf, as in the EROFS case */ + return 0; + + case GF_FILTER_FILTER_UID: + case GF_FILTER_FILTER_GID: + 
case GF_FILTER_RO_FS: + STACK_UNWIND (frame, -1, EROFS, NULL); + return 0; + } + STACK_WIND (frame, + filter_rename_cbk, + FIRST_CHILD(this), + FIRST_CHILD(this)->fops->rename, + oldloc, newloc); + return 0; +} + + +static int32_t +filter_link_cbk (call_frame_t *frame, + void *cookie, + xlator_t *this, + int32_t op_ret, + int32_t op_errno, + inode_t *inode, + struct stat *buf) +{ + int ret = 0; + if (op_ret >= 0) { + update_stat (buf, this->private); + ret = inode_ctx_put (inode, this, (uint64_t)(long)buf->st_uid); + if (ret == -1) { + gf_log (this->name, GF_LOG_ERROR, + "couldn't set context"); + } + } + STACK_UNWIND (frame, op_ret, op_errno, inode, buf); + return 0; +} + +int32_t +filter_link (call_frame_t *frame, + xlator_t *this, + loc_t *oldloc, + loc_t *newloc) +{ + int ret = 0; + ret = update_frame (frame, oldloc->inode, this->private); + switch (ret) { + case GF_FILTER_FILTER_UID: + case GF_FILTER_FILTER_GID: + case GF_FILTER_RO_FS: + STACK_UNWIND (frame, -1, EROFS, NULL, NULL); + return 0; + } + STACK_WIND (frame, + filter_link_cbk, + FIRST_CHILD(this), + FIRST_CHILD(this)->fops->link, + oldloc, newloc); + return 0; +} + + +static int32_t +filter_create_cbk (call_frame_t *frame, + void *cookie, + xlator_t *this, + int32_t op_ret, + int32_t op_errno, + fd_t *fd, + inode_t *inode, + struct stat *buf) +{ + int ret = 0; + if (op_ret >= 0) { + update_stat (buf, this->private); + ret = inode_ctx_put (inode, this, (uint64_t)(long)buf->st_uid); + if (ret == -1) { + gf_log (this->name, GF_LOG_ERROR, + "couldn't set context"); + } + } + STACK_UNWIND (frame, op_ret, op_errno, fd, inode, buf); + return 0; +} + +int32_t +filter_create (call_frame_t *frame, + xlator_t *this, + loc_t *loc, + int32_t flags, + mode_t mode, fd_t *fd) +{ + int ret = 0; + inode_t *parent = loc->parent; + ret = update_frame (frame, loc->inode, this->private); + switch (ret) { + case GF_FILTER_MAP_UID: + if (parent->st_mode & S_IWGRP) + break; + case GF_FILTER_MAP_BOTH: + if (parent->st_mode & 
S_IWOTH) + break; + gf_log (this->name, GF_LOG_DEBUG, "%s: returning permission denied", loc->path); + STACK_UNWIND (frame, -1, EPERM, NULL, NULL, NULL); /* create callback takes fd, inode and buf, as in the EROFS case below */ + return 0; + + case GF_FILTER_FILTER_UID: + case GF_FILTER_FILTER_GID: + case GF_FILTER_RO_FS: + STACK_UNWIND (frame, -1, EROFS, NULL, NULL, NULL); + return 0; + } + STACK_WIND (frame, filter_create_cbk, + FIRST_CHILD(this), + FIRST_CHILD(this)->fops->create, + loc, flags, mode, fd); + return 0; +} + +static int32_t +filter_open_cbk (call_frame_t *frame, + void *cookie, + xlator_t *this, + int32_t op_ret, + int32_t op_errno, + fd_t *fd) +{ + STACK_UNWIND (frame, op_ret, op_errno, fd); + return 0; +} + +int32_t +filter_open (call_frame_t *frame, + xlator_t *this, + loc_t *loc, + int32_t flags, + fd_t *fd) +{ + int32_t ret = 0; + ret = update_frame (frame, loc->inode, this->private); + switch (ret) { + case GF_FILTER_MAP_UID: + if (loc->inode->st_mode & S_IWGRP) + break; + if (!((flags & O_WRONLY) || (flags & O_RDWR)) + && (loc->inode->st_mode & S_IRGRP)) + break; + case GF_FILTER_MAP_BOTH: + if (loc->inode->st_mode & S_IWOTH) + break; + if (!((flags & O_WRONLY) || (flags & O_RDWR)) + && (loc->inode->st_mode & S_IROTH)) + break; + gf_log (this->name, GF_LOG_DEBUG, + "%s: returning permission denied (mode: 0%o, flag=0%o)", + loc->path, loc->inode->st_mode, flags); + STACK_UNWIND (frame, -1, EPERM, fd); + return 0; + case GF_FILTER_FILTER_UID: + case GF_FILTER_FILTER_GID: + case GF_FILTER_RO_FS: + if (!((flags & O_WRONLY) || (flags & O_RDWR))) + break; + STACK_UNWIND (frame, -1, EROFS, NULL); + return 0; + + } + STACK_WIND (frame, + filter_open_cbk, + FIRST_CHILD(this), + FIRST_CHILD(this)->fops->open, + loc, flags, fd); + return 0; +} + +static int32_t +filter_readv_cbk (call_frame_t *frame, + void *cookie, + xlator_t *this, + int32_t op_ret, + int32_t op_errno, + struct iovec *vector, + int32_t count, + struct stat *stbuf) +{ + if (op_ret >= 0) { + update_stat (stbuf, this->private); + } + STACK_UNWIND (frame, + op_ret, + op_errno, 
+ vector, + count, + stbuf); + return 0; +} + +int32_t +filter_readv (call_frame_t *frame, + xlator_t *this, + fd_t *fd, + size_t size, + off_t offset) +{ + STACK_WIND (frame, + filter_readv_cbk, + FIRST_CHILD(this), + FIRST_CHILD(this)->fops->readv, + fd, + size, + offset); + return 0; +} + + +static int32_t +filter_writev_cbk (call_frame_t *frame, + void *cookie, + xlator_t *this, + int32_t op_ret, + int32_t op_errno, + struct stat *stbuf) +{ + if (op_ret >= 0) { + update_stat (stbuf, this->private); + } + STACK_UNWIND (frame, + op_ret, + op_errno, + stbuf); + return 0; +} + +int32_t +filter_writev (call_frame_t *frame, + xlator_t *this, + fd_t *fd, + struct iovec *vector, + int32_t count, + off_t off) +{ + int32_t ret = 0; + ret = update_frame (frame, fd->inode, this->private); + switch (ret) { + case GF_FILTER_FILTER_UID: + case GF_FILTER_FILTER_GID: + case GF_FILTER_RO_FS: + STACK_UNWIND (frame, -1, EROFS, NULL); + return 0; + } + + STACK_WIND (frame, + filter_writev_cbk, + FIRST_CHILD(this), + FIRST_CHILD(this)->fops->writev, + fd, + vector, + count, + off); + return 0; +} + +static int32_t +filter_fstat_cbk (call_frame_t *frame, + void *cookie, + xlator_t *this, + int32_t op_ret, + int32_t op_errno, + struct stat *buf) +{ + if (op_ret >= 0) { + update_stat (buf, this->private); + } + STACK_UNWIND (frame, + op_ret, + op_errno, + buf); + return 0; +} + +int32_t +filter_fstat (call_frame_t *frame, + xlator_t *this, + fd_t *fd) +{ + STACK_WIND (frame, + filter_fstat_cbk, + FIRST_CHILD(this), + FIRST_CHILD(this)->fops->fstat, + fd); + return 0; +} + +static int32_t +filter_opendir_cbk (call_frame_t *frame, + void *cookie, + xlator_t *this, + int32_t op_ret, + int32_t op_errno, + fd_t *fd) +{ + STACK_UNWIND (frame, + op_ret, + op_errno, + fd); + return 0; +} + +int32_t +filter_opendir (call_frame_t *frame, + xlator_t *this, + loc_t *loc, fd_t *fd) +{ + int32_t ret = 0; + ret = update_frame (frame, loc->inode, this->private); + switch (ret) { + case 
GF_FILTER_MAP_UID: + if (loc->inode->st_mode & S_IWGRP) + break; + if (loc->inode->st_mode & S_IRGRP) + break; + case GF_FILTER_MAP_BOTH: + if (loc->inode->st_mode & S_IWOTH) + break; + if (loc->inode->st_mode & S_IROTH) + break; + gf_log (this->name, GF_LOG_DEBUG, "%s: returning permission denied", loc->path); + STACK_UNWIND (frame, -1, EPERM, fd); + return 0; + } + STACK_WIND (frame, + filter_opendir_cbk, + FIRST_CHILD(this), + FIRST_CHILD(this)->fops->opendir, + loc, fd); + return 0; +} + + +static int32_t +filter_setxattr_cbk (call_frame_t *frame, + void *cookie, + xlator_t *this, + int32_t op_ret, + int32_t op_errno) +{ + STACK_UNWIND (frame, + op_ret, + op_errno); + return 0; +} + +int32_t +filter_setxattr (call_frame_t *frame, + xlator_t *this, + loc_t *loc, + dict_t *dict, + int32_t flags) +{ + + int32_t ret = 0; + ret = update_frame (frame, loc->inode, this->private); + switch (ret) { + case GF_FILTER_MAP_UID: + if (loc->inode->st_mode & S_IWGRP) + break; + case GF_FILTER_MAP_BOTH: + if (loc->inode->st_mode & S_IWOTH) + break; + gf_log (this->name, GF_LOG_DEBUG, "%s: returning permission denied", loc->path); + STACK_UNWIND (frame, -1, EPERM); + return 0; + case GF_FILTER_FILTER_UID: + case GF_FILTER_FILTER_GID: + case GF_FILTER_RO_FS: + STACK_UNWIND (frame, -1, EROFS); + return 0; + } + + STACK_WIND (frame, + filter_setxattr_cbk, + FIRST_CHILD(this), + FIRST_CHILD(this)->fops->setxattr, + loc, + dict, + flags); + return 0; +} + +static int32_t +filter_getxattr_cbk (call_frame_t *frame, + void *cookie, + xlator_t *this, + int32_t op_ret, + int32_t op_errno, + dict_t *dict) +{ + STACK_UNWIND (frame, + op_ret, + op_errno, + dict); + return 0; +} + +int32_t +filter_getxattr (call_frame_t *frame, + xlator_t *this, + loc_t *loc, + const char *name) +{ + int32_t ret = 0; + ret = update_frame (frame, loc->inode, this->private); + switch (ret) { + case GF_FILTER_MAP_UID: + if (loc->inode->st_mode & S_IRGRP) + break; + case GF_FILTER_MAP_BOTH: + if 
(loc->inode->st_mode & S_IROTH) + break; + gf_log (this->name, GF_LOG_DEBUG, "%s: returning permission denied", loc->path); + STACK_UNWIND (frame, -1, EPERM, NULL); + return 0; + } + + STACK_WIND (frame, + filter_getxattr_cbk, + FIRST_CHILD(this), + FIRST_CHILD(this)->fops->getxattr, + loc, + name); + return 0; +} + +static int32_t +filter_removexattr_cbk (call_frame_t *frame, + void *cookie, + xlator_t *this, + int32_t op_ret, + int32_t op_errno) +{ + STACK_UNWIND (frame, op_ret, op_errno); + return 0; +} + +int32_t +filter_removexattr (call_frame_t *frame, + xlator_t *this, + loc_t *loc, + const char *name) +{ + int32_t ret = 0; + ret = update_frame (frame, loc->inode, this->private); + switch (ret) { + case GF_FILTER_MAP_UID: + if (loc->inode->st_mode & S_IWGRP) + break; + case GF_FILTER_MAP_BOTH: + if (loc->inode->st_mode & S_IWOTH) + break; + gf_log (this->name, GF_LOG_DEBUG, "%s: returning permission denied", loc->path); + STACK_UNWIND (frame, -1, EPERM); + return 0; + case GF_FILTER_FILTER_UID: + case GF_FILTER_FILTER_GID: + case GF_FILTER_RO_FS: + STACK_UNWIND (frame, -1, EROFS); + return 0; + } + + STACK_WIND (frame, + filter_removexattr_cbk, + FIRST_CHILD(this), + FIRST_CHILD(this)->fops->removexattr, + loc, + name); + return 0; +} + +int32_t +init (xlator_t *this) +{ + char *value = NULL; + char *tmp_str = NULL; + char *tmp_str1 = NULL; + char *tmp_str2 = NULL; + char *dup_str = NULL; + char *input_value_str1 = NULL; + char *input_value_str2 = NULL; + char *output_value_str = NULL; + int32_t input_value = 0; + int32_t output_value = 0; + data_t *option_data = NULL; + struct gf_filter *filter = NULL; + gf_boolean_t tmp_bool = 0; + + if (!this->children || this->children->next) { + gf_log (this->name, + GF_LOG_ERROR, + "translator not configured with exactly one child"); + return -1; + } + + if (!this->parents) { + gf_log (this->name, GF_LOG_WARNING, + "dangling volume. 
check volfile "); + } + + filter = CALLOC (sizeof (*filter), 1); + ERR_ABORT (filter); + + if (dict_get (this->options, "read-only")) { + value = data_to_str (dict_get (this->options, "read-only")); + if (gf_string2boolean (value, &filter->complete_read_only) == -1) { + gf_log (this->name, GF_LOG_ERROR, + "wrong value provided for 'read-only'"); + return -1; + } + } + + if (dict_get (this->options, "root-squashing")) { + value = data_to_str (dict_get (this->options, "root-squashing")); + if (gf_string2boolean (value, &tmp_bool) == -1) { + gf_log (this->name, GF_LOG_ERROR, + "wrong value provided for 'root-squashing'"); + return -1; + } + if (tmp_bool) { + filter->translate_num_uid_entries = 1; + filter->translate_num_gid_entries = 1; + filter->translate_input_uid[0][0] = GF_FILTER_ROOT_UID; /* root */ + filter->translate_input_uid[0][1] = GF_FILTER_ROOT_UID; /* root */ + filter->translate_input_gid[0][0] = GF_FILTER_ROOT_GID; /* root */ + filter->translate_input_gid[0][1] = GF_FILTER_ROOT_GID; /* root */ + filter->translate_output_uid[0] = GF_FILTER_NOBODY_UID; + filter->translate_output_gid[0] = GF_FILTER_NOBODY_GID; + } + } + + if (dict_get (this->options, "translate-uid")) { + option_data = dict_get (this->options, "translate-uid"); + value = strtok_r (option_data->data, ",", &tmp_str); + while (value) { + dup_str = strdup (value); + input_value_str1 = strtok_r (dup_str, "=", &tmp_str1); + if (input_value_str1) { + /* Check for n-m */ + char *temp_string = strdup (input_value_str1); + input_value_str2 = strtok_r (temp_string, "-", &tmp_str2); + if (gf_string2int (input_value_str2, &input_value) != 0) { + gf_log (this->name, GF_LOG_ERROR, + "invalid number format \"%s\"", + input_value_str2); + return -1; + } + filter->translate_input_uid[filter->translate_num_uid_entries][0] = input_value; + input_value_str2 = strtok_r (NULL, "-", &tmp_str2); + if (input_value_str2) { + if (gf_string2int (input_value_str2, &input_value) != 0) { + gf_log (this->name, 
GF_LOG_ERROR, + "invalid number format \"%s\"", + input_value_str2); + return -1; + } + } + filter->translate_input_uid[filter->translate_num_uid_entries][1] = input_value; + FREE (temp_string); + output_value_str = strtok_r (NULL, "=", &tmp_str1); + if (output_value_str) { + if (gf_string2int (output_value_str, &output_value) != 0) { + gf_log (this->name, GF_LOG_ERROR, + "invalid number format \"%s\"", + output_value_str); + return -1; + } + } else { + gf_log (this->name, GF_LOG_ERROR, + "mapping string not valid"); + return -1; + } + } else { + gf_log (this->name, GF_LOG_ERROR, + "mapping string not valid"); + return -1; + } + filter->translate_output_uid[filter->translate_num_uid_entries] = output_value; + gf_log (this->name, + GF_LOG_DEBUG, + "pair %d: input uid '%d' will be changed to uid '%d'", + filter->translate_num_uid_entries, input_value, output_value); + + filter->translate_num_uid_entries++; + if (filter->translate_num_uid_entries == GF_MAXIMUM_FILTERING_ALLOWED) + break; + value = strtok_r (NULL, ",", &tmp_str); + FREE (dup_str); + } + } + + tmp_str1 = NULL; + tmp_str2 = NULL; + tmp_str = NULL; + + if (dict_get (this->options, "translate-gid")) { + option_data = dict_get (this->options, "translate-gid"); + value = strtok_r (option_data->data, ",", &tmp_str); + while (value) { + dup_str = strdup (value); + input_value_str1 = strtok_r (dup_str, "=", &tmp_str1); + if (input_value_str1) { + /* Check for n-m */ + char *temp_string = strdup (input_value_str1); + input_value_str2 = strtok_r (temp_string, "-", &tmp_str2); + if (gf_string2int (input_value_str2, &input_value) != 0) { + gf_log (this->name, GF_LOG_ERROR, + "invalid number format \"%s\"", + input_value_str2); + return -1; + } + filter->translate_input_gid[filter->translate_num_gid_entries][0] = input_value; + input_value_str2 = strtok_r (NULL, "-", &tmp_str2); + if (input_value_str2) { + if (gf_string2int (input_value_str2, &input_value) != 0) { + gf_log (this->name, GF_LOG_ERROR, + "invalid 
number format \"%s\"", + input_value_str2); + return -1; + } + } + filter->translate_input_gid[filter->translate_num_gid_entries][1] = input_value; + FREE (temp_string); + output_value_str = strtok_r (NULL, "=", &tmp_str1); + if (output_value_str) { + if (gf_string2int (output_value_str, &output_value) != 0) { + gf_log (this->name, GF_LOG_ERROR, + "invalid number format \"%s\"", + output_value_str); + return -1; + } + } else { + gf_log (this->name, GF_LOG_ERROR, + "translate-gid value not valid"); + return -1; + } + } else { + gf_log (this->name, GF_LOG_ERROR, + "translate-gid value not valid"); + return -1; + } + + filter->translate_output_gid[filter->translate_num_gid_entries] = output_value; + + gf_log (this->name, GF_LOG_DEBUG, + "pair %d: input gid '%d' will be changed to gid '%d'", + filter->translate_num_gid_entries, input_value, output_value); + + filter->translate_num_gid_entries++; + if (filter->translate_num_gid_entries == GF_MAXIMUM_FILTERING_ALLOWED) + break; + value = strtok_r (NULL, ",", &tmp_str); + FREE (dup_str); + } + } + + tmp_str = NULL; + tmp_str1 = NULL; + + if (dict_get (this->options, "filter-uid")) { + option_data = dict_get (this->options, "filter-uid"); + value = strtok_r (option_data->data, ",", &tmp_str); + while (value) { + dup_str = strdup (value); + /* Check for n-m */ + input_value_str1 = strtok_r (dup_str, "-", &tmp_str1); + if (gf_string2int (input_value_str1, &input_value) != 0) { + gf_log (this->name, GF_LOG_ERROR, + "invalid number format \"%s\"", + input_value_str1); + return -1; + } + filter->filter_input_uid[filter->filter_num_uid_entries][0] = input_value; + input_value_str1 = strtok_r (NULL, "-", &tmp_str1); + if (input_value_str1) { + if (gf_string2int (input_value_str1, &input_value) != 0) { + gf_log (this->name, GF_LOG_ERROR, + "invalid number format \"%s\"", + input_value_str1); + return -1; + } + } + filter->filter_input_uid[filter->filter_num_uid_entries][1] = input_value; + + gf_log (this->name, + GF_LOG_DEBUG, + 
"filter [%d]: input uid(s) '%s' will be filtered", + filter->filter_num_uid_entries, dup_str); + + filter->filter_num_uid_entries++; + if (filter->filter_num_uid_entries == GF_MAXIMUM_FILTERING_ALLOWED) + break; + value = strtok_r (NULL, ",", &tmp_str); + FREE (dup_str); + } + filter->partial_filter = 1; + } + + tmp_str = NULL; + tmp_str1 = NULL; + + if (dict_get (this->options, "filter-gid")) { + option_data = dict_get (this->options, "filter-gid"); + value = strtok_r (option_data->data, ",", &tmp_str); + while (value) { + dup_str = strdup (value); + /* Check for n-m */ + input_value_str1 = strtok_r (dup_str, "-", &tmp_str1); + if (gf_string2int (input_value_str1, &input_value) != 0) { + gf_log (this->name, GF_LOG_ERROR, + "invalid number format \"%s\"", + input_value_str1); + return -1; + } + filter->filter_input_gid[filter->filter_num_gid_entries][0] = input_value; + input_value_str1 = strtok_r (NULL, "-", &tmp_str1); + if (input_value_str1) { + if (gf_string2int (input_value_str1, &input_value) != 0) { + gf_log (this->name, GF_LOG_ERROR, + "invalid number format \"%s\"", + input_value_str1); + return -1; + } + } + filter->filter_input_gid[filter->filter_num_gid_entries][1] = input_value; + + gf_log (this->name, + GF_LOG_DEBUG, + "filter [%d]: input gid(s) '%s' will be filtered", + filter->filter_num_gid_entries, dup_str); + + filter->filter_num_gid_entries++; + if (filter->filter_num_gid_entries == GF_MAXIMUM_FILTERING_ALLOWED) + break; + value = strtok_r (NULL, ",", &tmp_str); + FREE (dup_str); + } + gf_log (this->name, GF_LOG_ERROR, "this option is not supported currently.. 
exiting"); + return -1; + filter->partial_filter = 1; + } + + if (dict_get (this->options, "fixed-uid")) { + option_data = dict_get (this->options, "fixed-uid"); + if (gf_string2int (option_data->data, &input_value) != 0) { + gf_log (this->name, GF_LOG_ERROR, + "invalid number format \"%s\"", + option_data->data); + return -1; + } + filter->fixed_uid = input_value; + filter->fixed_uid_set = 1; + } + + if (dict_get (this->options, "fixed-gid")) { + option_data = dict_get (this->options, "fixed-gid"); + if (gf_string2int (option_data->data, &input_value) != 0) { + gf_log (this->name, GF_LOG_ERROR, + "invalid number format \"%s\"", + option_data->data); + return -1; + } + filter->fixed_gid = input_value; + filter->fixed_gid_set = 1; + } + + this->private = filter; + return 0; +} + + +void +fini (xlator_t *this) +{ + struct gf_filter *filter = this->private; + + FREE (filter); + + return; +} + + +struct xlator_fops fops = { + .lookup = filter_lookup, + .stat = filter_stat, + .fstat = filter_fstat, + .chmod = filter_chmod, + .fchmod = filter_fchmod, + .readlink = filter_readlink, + .mknod = filter_mknod, + .mkdir = filter_mkdir, + .unlink = filter_unlink, + .rmdir = filter_rmdir, + .symlink = filter_symlink, + .rename = filter_rename, + .link = filter_link, + .chown = filter_chown, + .fchown = filter_fchown, + .truncate = filter_truncate, + .ftruncate = filter_ftruncate, + .create = filter_create, + .open = filter_open, + .readv = filter_readv, + .writev = filter_writev, + .setxattr = filter_setxattr, + .getxattr = filter_getxattr, + .removexattr = filter_removexattr, + .opendir = filter_opendir, + .utimens = filter_utimens, +}; + +struct xlator_mops mops = { +}; + +struct xlator_cbks cbks = { +}; + +struct volume_options options[] = { + { .key = { "root-squashing" }, + .type = GF_OPTION_TYPE_BOOL + }, + { .key = { "read-only" }, + .type = GF_OPTION_TYPE_BOOL + }, + { .key = { "fixed-uid" }, + .type = GF_OPTION_TYPE_INT + }, + { .key = { "fixed-gid" }, + .type = 
GF_OPTION_TYPE_INT + }, + { .key = { "translate-uid" }, + .type = GF_OPTION_TYPE_ANY + }, + { .key = { "translate-gid" }, + .type = GF_OPTION_TYPE_ANY + }, + { .key = { "filter-uid" }, + .type = GF_OPTION_TYPE_ANY + }, + { .key = { "filter-gid" }, + .type = GF_OPTION_TYPE_ANY + }, + { .key = {NULL} }, +}; diff --git a/xlators/features/locks/Makefile.am b/xlators/features/locks/Makefile.am new file mode 100644 index 000000000..d471a3f92 --- /dev/null +++ b/xlators/features/locks/Makefile.am @@ -0,0 +1,3 @@ +SUBDIRS = src + +CLEANFILES = diff --git a/xlators/features/locks/src/Makefile.am b/xlators/features/locks/src/Makefile.am new file mode 100644 index 000000000..ec4a953eb --- /dev/null +++ b/xlators/features/locks/src/Makefile.am @@ -0,0 +1,20 @@ +xlator_LTLIBRARIES = locks.la +xlatordir = $(libdir)/glusterfs/$(PACKAGE_VERSION)/xlator/features + +locks_la_LDFLAGS = -module -avoidversion + +locks_la_SOURCES = common.c posix.c internal.c +locks_la_LIBADD = $(top_builddir)/libglusterfs/src/libglusterfs.la + +noinst_HEADERS = locks.h common.h + +AM_CFLAGS = -fPIC -D_FILE_OFFSET_BITS=64 -D_GNU_SOURCE -Wall -fno-strict-aliasing -D$(GF_HOST_OS) \ + -I$(top_srcdir)/libglusterfs/src $(GF_CFLAGS) -shared -nostartfiles + +CLEANFILES = + +uninstall-local: + rm -f $(DESTDIR)$(xlatordir)/posix-locks.so + +install-data-hook: + ln -sf locks.so $(DESTDIR)$(xlatordir)/posix-locks.so \ No newline at end of file diff --git a/xlators/features/locks/src/common.c b/xlators/features/locks/src/common.c new file mode 100644 index 000000000..9ac1250cc --- /dev/null +++ b/xlators/features/locks/src/common.c @@ -0,0 +1,561 @@ +/* + Copyright (c) 2006, 2007, 2008 Z RESEARCH, Inc. <http://www.zresearch.com> + This file is part of GlusterFS. + + GlusterFS is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published + by the Free Software Foundation; either version 3 of the License, + or (at your option) any later version. 
+ + GlusterFS is distributed in the hope that it will be useful, but + WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program. If not, see + <http://www.gnu.org/licenses/>. +*/ + +#include <unistd.h> +#include <fcntl.h> +#include <limits.h> +#include <pthread.h> + +#ifndef _CONFIG_H +#define _CONFIG_H +#include "config.h" +#endif + +#include "glusterfs.h" +#include "compat.h" +#include "xlator.h" +#include "inode.h" +#include "logging.h" +#include "common-utils.h" + +#include "locks.h" + + +int +pl_is_lock_grantable (pl_inode_t *pl_inode, posix_lock_t *lock, + gf_lk_domain_t dom); +static void +__insert_and_merge (pl_inode_t *pl_inode, posix_lock_t *lock, + gf_lk_domain_t dom); + + +pl_inode_t * +pl_inode_get (xlator_t *this, inode_t *inode) +{ + pl_inode_t *pl_inode = NULL; + mode_t st_mode = 0; + uint64_t tmp_pl_inode = 0; + int ret = 0; + + LOCK (&inode->lock); + { + ret = inode_ctx_get (inode, this, &tmp_pl_inode); + if (ret == 0) { + pl_inode = (pl_inode_t *)(long)tmp_pl_inode; + goto out; + } + + pl_inode = CALLOC (1, sizeof (*pl_inode)); + if (!pl_inode) { + gf_log (this->name, GF_LOG_ERROR, + "out of memory :("); + goto out; + } + + st_mode = inode->st_mode; + if ((st_mode & S_ISGID) && !(st_mode & S_IXGRP)) + pl_inode->mandatory = 1; + + + pthread_mutex_init (&pl_inode->mutex, NULL); + + INIT_LIST_HEAD (&pl_inode->dir_list); + INIT_LIST_HEAD (&pl_inode->ext_list); + INIT_LIST_HEAD (&pl_inode->int_list); + INIT_LIST_HEAD (&pl_inode->rw_list); + + ret = inode_ctx_put (inode, this, (uint64_t)(long)pl_inode); + } +out: + UNLOCK (&inode->lock); + return pl_inode; +} + + +/* Create a new posix_lock_t */ +posix_lock_t * +new_posix_lock (struct flock *flock, transport_t *transport, pid_t client_pid) +{ + posix_lock_t *lock = NULL; + + lock = CALLOC (1, 
sizeof (posix_lock_t)); + if (!lock) { + return NULL; + } + + lock->fl_start = flock->l_start; + lock->fl_type = flock->l_type; + + if (flock->l_len == 0) + lock->fl_end = LLONG_MAX; + else + lock->fl_end = flock->l_start + flock->l_len - 1; + + lock->transport = transport; + lock->client_pid = client_pid; + + INIT_LIST_HEAD (&lock->list); + + return lock; +} + + +/* Delete a lock from the inode's lock list */ +void +__delete_lock (pl_inode_t *pl_inode, posix_lock_t *lock) +{ + list_del_init (&lock->list); +} + + +/* Destroy a posix_lock */ +void +__destroy_lock (posix_lock_t *lock) +{ + free (lock); +} + + +/* Convert a posix_lock to a struct flock; an end of LLONG_MAX means "to EOF" and maps back to l_len 0, the inverse of new_posix_lock */ +void +posix_lock_to_flock (posix_lock_t *lock, struct flock *flock) +{ + flock->l_pid = lock->client_pid; + flock->l_type = lock->fl_type; + flock->l_start = lock->fl_start; + + if (lock->fl_end == LLONG_MAX) + flock->l_len = 0; + else + flock->l_len = lock->fl_end - lock->fl_start + 1; +} + + +/* Insert the lock into the inode's lock list */ +void +pl_insert_lock (pl_inode_t *pl_inode, posix_lock_t *lock, gf_lk_domain_t dom) +{ + list_add_tail (&lock->list, DOMAIN_HEAD (pl_inode, dom)); + + return; +} + + +/* Return true if the locks overlap, false otherwise */ +int +locks_overlap (posix_lock_t *l1, posix_lock_t *l2) +{ + /* + Note: + FUSE always gives us absolute offsets, so no need to worry + about SEEK_CUR or SEEK_END + */ + + return ((l1->fl_end >= l2->fl_start) && + (l2->fl_end >= l1->fl_start)); +} + + +/* Return true if the locks have the same owner */ +int +same_owner (posix_lock_t *l1, posix_lock_t *l2) +{ + return ((l1->client_pid == l2->client_pid) && + (l1->transport == l2->transport)); +} + + +/* Delete all F_UNLCK locks */ +void +__delete_unlck_locks (pl_inode_t *pl_inode, gf_lk_domain_t dom) +{ + posix_lock_t *l = NULL; + posix_lock_t *tmp = NULL; + + list_for_each_entry_safe (l, tmp, DOMAIN_HEAD (pl_inode, dom), list) { + if (l->fl_type == F_UNLCK) { + __delete_lock (pl_inode, l); + __destroy_lock 
(l); + } + } +} + + +/* Add two locks: returns a newly allocated lock spanning both ranges (only fl_start and fl_end are set), or NULL if allocation fails */ +static posix_lock_t * +add_locks (posix_lock_t *l1, posix_lock_t *l2) +{ + posix_lock_t *sum = NULL; + + sum = CALLOC (1, sizeof (posix_lock_t)); + if (!sum) + return NULL; + + sum->fl_start = min (l1->fl_start, l2->fl_start); + sum->fl_end = max (l1->fl_end, l2->fl_end); + + return sum; +} + +/* Subtract two locks: the result holds up to three newly allocated pieces, unused slots stay NULL */ +struct _values { + posix_lock_t *locks[3]; +}; + +/* {small} must always be contained inside {big} */ +static struct _values +subtract_locks (posix_lock_t *big, posix_lock_t *small) +{ + struct _values v = { .locks = {0, 0, 0} }; + + if ((big->fl_start == small->fl_start) && + (big->fl_end == small->fl_end)) { + /* both edges coincide with big */ + v.locks[0] = CALLOC (1, sizeof (posix_lock_t)); + ERR_ABORT (v.locks[0]); + memcpy (v.locks[0], big, sizeof (posix_lock_t)); + v.locks[0]->fl_type = small->fl_type; + } + else if ((small->fl_start > big->fl_start) && + (small->fl_end < big->fl_end)) { + /* both edges lie inside big */ + v.locks[0] = CALLOC (1, sizeof (posix_lock_t)); + ERR_ABORT (v.locks[0]); + v.locks[1] = CALLOC (1, sizeof (posix_lock_t)); + ERR_ABORT (v.locks[1]); + v.locks[2] = CALLOC (1, sizeof (posix_lock_t)); + ERR_ABORT (v.locks[2]); + + memcpy (v.locks[0], big, sizeof (posix_lock_t)); + v.locks[0]->fl_end = small->fl_start - 1; + + memcpy (v.locks[1], small, sizeof (posix_lock_t)); + memcpy (v.locks[2], big, sizeof (posix_lock_t)); + v.locks[2]->fl_start = small->fl_end + 1; + } + /* one edge coincides with big */ + else if (small->fl_start == big->fl_start) { + v.locks[0] = CALLOC (1, sizeof (posix_lock_t)); + ERR_ABORT (v.locks[0]); + v.locks[1] = CALLOC (1, sizeof (posix_lock_t)); + ERR_ABORT (v.locks[1]); + + memcpy (v.locks[0], big, sizeof (posix_lock_t)); + v.locks[0]->fl_start = small->fl_end + 1; + + memcpy (v.locks[1], small, sizeof (posix_lock_t)); + } + else if (small->fl_end == big->fl_end) { + v.locks[0] = CALLOC (1, sizeof (posix_lock_t)); + ERR_ABORT (v.locks[0]); + 
v.locks[1] = CALLOC (1, sizeof (posix_lock_t)); + ERR_ABORT (v.locks[1]); + + memcpy (v.locks[0], big, sizeof (posix_lock_t)); + v.locks[0]->fl_end = small->fl_start - 1; + + memcpy (v.locks[1], small, sizeof (posix_lock_t)); + } + else { + gf_log ("posix-locks", GF_LOG_DEBUG, + "unexpected case in subtract_locks"); + } + + return v; +} + +/* + Scan the {dom} lock list of {pl_inode} and return the first + non-blocked lock that overlaps {lock}, NULL if there is none + (the search always starts at the head of the list) +*/ +static posix_lock_t * +first_overlap (pl_inode_t *pl_inode, posix_lock_t *lock, + gf_lk_domain_t dom) +{ + posix_lock_t *l = NULL; + + list_for_each_entry (l, DOMAIN_HEAD (pl_inode, dom), list) { + if (l->blocked) + continue; + + if (locks_overlap (l, lock)) + return l; + } + + return NULL; +} + + + +/* Return true if lock is grantable: no non-blocked overlapping lock of a different owner conflicts with it (a conflict needs at least one write lock, and an unlock never conflicts) */ +int +pl_is_lock_grantable (pl_inode_t *pl_inode, posix_lock_t *lock, + gf_lk_domain_t dom) +{ + posix_lock_t *l = NULL; + int ret = 1; + + list_for_each_entry (l, DOMAIN_HEAD (pl_inode, dom), list) { + if (!l->blocked && locks_overlap (lock, l)) { + if (((l->fl_type == F_WRLCK) + || (lock->fl_type == F_WRLCK)) + && (lock->fl_type != F_UNLCK) + && !same_owner (l, lock)) { + ret = 0; + break; + } + } + } + return ret; +} + + +extern void do_blocked_rw (pl_inode_t *); + + +static void +__insert_and_merge (pl_inode_t *pl_inode, posix_lock_t *lock, + gf_lk_domain_t dom) +{ + posix_lock_t *conf = NULL; + posix_lock_t *t = NULL; + posix_lock_t *sum = NULL; + int i = 0; + struct _values v = { .locks = {0, 0, 0} }; + + list_for_each_entry_safe (conf, t, DOMAIN_HEAD (pl_inode, dom), list) { + if (!locks_overlap (conf, lock)) + continue; + + if (same_owner (conf, lock)) { + if (conf->fl_type == lock->fl_type) { + sum = add_locks (lock, conf); /* NOTE(review): add_locks returns NULL on allocation failure; sum is dereferenced below without a check */ + + sum->fl_type = lock->fl_type; + sum->transport = lock->transport; + sum->client_pid = lock->client_pid; + + __delete_lock (pl_inode, conf); + __destroy_lock (conf); + + __destroy_lock (lock); + 
__insert_and_merge (pl_inode, sum, dom); + + return; + } else { + sum = add_locks (lock, conf); + + sum->fl_type = conf->fl_type; + sum->transport = conf->transport; + sum->client_pid = conf->client_pid; + + v = subtract_locks (sum, lock); + + __delete_lock (pl_inode, conf); + __destroy_lock (conf); + + __delete_lock (pl_inode, lock); + __destroy_lock (lock); + + __destroy_lock (sum); + + for (i = 0; i < 3; i++) { + if (!v.locks[i]) + continue; + + if (v.locks[i]->fl_type == F_UNLCK) { + __destroy_lock (v.locks[i]); + continue; + } + __insert_and_merge (pl_inode, + v.locks[i], dom); + } + + __delete_unlck_locks (pl_inode, dom); + return; + } + } + + if (lock->fl_type == F_UNLCK) { + continue; + } + + if ((conf->fl_type == F_RDLCK) && (lock->fl_type == F_RDLCK)) { + pl_insert_lock (pl_inode, lock, dom); + return; + } + } + + /* no conflicts, so just insert */ + if (lock->fl_type != F_UNLCK) { + pl_insert_lock (pl_inode, lock, dom); + } else { + __destroy_lock (lock); + } +} + + +void +__grant_blocked_locks (xlator_t *this, pl_inode_t *pl_inode, + gf_lk_domain_t dom, struct list_head *granted) +{ + struct list_head tmp_list; + posix_lock_t *l = NULL; + posix_lock_t *tmp = NULL; + posix_lock_t *conf = NULL; + + INIT_LIST_HEAD (&tmp_list); + + list_for_each_entry_safe (l, tmp, DOMAIN_HEAD (pl_inode, dom), list) { + if (l->blocked) { + conf = first_overlap (pl_inode, l, dom); + if (conf) + continue; + + l->blocked = 0; + list_move_tail (&l->list, &tmp_list); + } + } + + list_for_each_entry_safe (l, tmp, &tmp_list, list) { + list_del_init (&l->list); + + if (pl_is_lock_grantable (pl_inode, l, dom)) { + conf = CALLOC (1, sizeof (*conf)); + + if (!conf) { + l->blocked = 1; + pl_insert_lock (pl_inode, l, dom); + continue; + } + + conf->frame = l->frame; + l->frame = NULL; + + posix_lock_to_flock (l, &conf->user_flock); + + gf_log (this->name, GF_LOG_DEBUG, + "%s (pid=%d) %"PRId64" - %"PRId64" => Granted", + l->fl_type == F_UNLCK ? 
"Unlock" : "Lock", + l->client_pid, + l->user_flock.l_start, + l->user_flock.l_len); + + __insert_and_merge (pl_inode, l, dom); + + list_add (&conf->list, granted); + } else { + l->blocked = 1; + pl_insert_lock (pl_inode, l, dom); + } + } +} + + +void +grant_blocked_locks (xlator_t *this, pl_inode_t *pl_inode, gf_lk_domain_t dom) +{ + struct list_head granted_list; + posix_lock_t *tmp = NULL; + posix_lock_t *lock = NULL; + + INIT_LIST_HEAD (&granted_list); + + pthread_mutex_lock (&pl_inode->mutex); + { + __grant_blocked_locks (this, pl_inode, dom, &granted_list); + } + pthread_mutex_unlock (&pl_inode->mutex); + + list_for_each_entry_safe (lock, tmp, &granted_list, list) { + list_del_init (&lock->list); + + STACK_UNWIND (lock->frame, 0, 0, &lock->user_flock); + + FREE (lock); + } + + return; +} + + +int +pl_setlk (xlator_t *this, pl_inode_t *pl_inode, posix_lock_t *lock, + int can_block, gf_lk_domain_t dom) +{ + int ret = 0; + + errno = 0; + + pthread_mutex_lock (&pl_inode->mutex); + { + if (pl_is_lock_grantable (pl_inode, lock, dom)) { + gf_log (this->name, GF_LOG_DEBUG, + "%s (pid=%d) %"PRId64" - %"PRId64" => OK", + lock->fl_type == F_UNLCK ? "Unlock" : "Lock", + lock->client_pid, + lock->user_flock.l_start, + lock->user_flock.l_len); + __insert_and_merge (pl_inode, lock, dom); + } else if (can_block) { + gf_log (this->name, GF_LOG_DEBUG, + "%s (pid=%d) %"PRId64" - %"PRId64" => Blocked", + lock->fl_type == F_UNLCK ? "Unlock" : "Lock", + lock->client_pid, + lock->user_flock.l_start, + lock->user_flock.l_len); + lock->blocked = 1; + pl_insert_lock (pl_inode, lock, dom); + ret = -1; + } else { + gf_log (this->name, GF_LOG_DEBUG, + "%s (pid=%d) %"PRId64" - %"PRId64" => NOK", + lock->fl_type == F_UNLCK ? 
"Unlock" : "Lock", + lock->client_pid, + lock->user_flock.l_start, + lock->user_flock.l_len); + errno = EAGAIN; + ret = -1; + } + } + pthread_mutex_unlock (&pl_inode->mutex); + + grant_blocked_locks (this, pl_inode, dom); + + do_blocked_rw (pl_inode); + + return ret; +} + + +posix_lock_t * +pl_getlk (pl_inode_t *pl_inode, posix_lock_t *lock, gf_lk_domain_t dom) +{ + posix_lock_t *conf = NULL; + + conf = first_overlap (pl_inode, lock, dom); + + if (conf == NULL) { + lock->fl_type = F_UNLCK; + return lock; + } + + return conf; +} diff --git a/xlators/features/locks/src/common.h b/xlators/features/locks/src/common.h new file mode 100644 index 000000000..135f33011 --- /dev/null +++ b/xlators/features/locks/src/common.h @@ -0,0 +1,59 @@ +/* + Copyright (c) 2006, 2007, 2008 Z RESEARCH, Inc. <http://www.zresearch.com> + This file is part of GlusterFS. + + GlusterFS is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published + by the Free Software Foundation; either version 3 of the License, + or (at your option) any later version. + + GlusterFS is distributed in the hope that it will be useful, but + WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program. If not, see + <http://www.gnu.org/licenses/>. 
+*/ + +#ifndef __COMMON_H__ +#define __COMMON_H__ + +posix_lock_t * +new_posix_lock (struct flock *flock, transport_t *transport, pid_t client_pid); + +pl_inode_t * +pl_inode_get (xlator_t *this, inode_t *inode); + +posix_lock_t * +pl_getlk (pl_inode_t *inode, posix_lock_t *lock, gf_lk_domain_t domain); + +int +pl_setlk (xlator_t *this, pl_inode_t *inode, posix_lock_t *lock, + int can_block, gf_lk_domain_t domain); + +int +pl_is_lock_grantable (pl_inode_t *pl_inode, posix_lock_t *lock, + gf_lk_domain_t dom); + +void +pl_insert_lock (pl_inode_t *pl_inode, posix_lock_t *lock, gf_lk_domain_t dom); + +void +grant_blocked_locks (xlator_t *this, pl_inode_t *inode, gf_lk_domain_t domain); + +void +posix_lock_to_flock (posix_lock_t *lock, struct flock *flock); + +int +locks_overlap (posix_lock_t *l1, posix_lock_t *l2); + +int +same_owner (posix_lock_t *l1, posix_lock_t *l2); + +void __delete_lock (pl_inode_t *, posix_lock_t *); + +void __destroy_lock (posix_lock_t *); + +#endif /* __COMMON_H__ */ diff --git a/xlators/features/locks/src/internal.c b/xlators/features/locks/src/internal.c new file mode 100644 index 000000000..7f454a78e --- /dev/null +++ b/xlators/features/locks/src/internal.c @@ -0,0 +1,762 @@ +/* + Copyright (c) 2006, 2007, 2008 Z RESEARCH, Inc. <http://www.zresearch.com> + This file is part of GlusterFS. + + GlusterFS is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published + by the Free Software Foundation; either version 3 of the License, + or (at your option) any later version. + + GlusterFS is distributed in the hope that it will be useful, but + WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program. If not, see + <http://www.gnu.org/licenses/>. 
+*/ + +#ifndef _CONFIG_H +#define _CONFIG_H +#include "config.h" +#endif + +#include "glusterfs.h" +#include "compat.h" +#include "xlator.h" +#include "inode.h" +#include "logging.h" +#include "common-utils.h" +#include "list.h" + +#include "locks.h" +#include "common.h" + + + +static int +delete_locks_of_transport (pl_inode_t *pinode, transport_t *trans) +{ + posix_lock_t *tmp = NULL; + posix_lock_t *l = NULL; + + list_for_each_entry_safe (l, tmp, &pinode->dir_list, list) { + if (l->transport == trans) { + __delete_lock (pinode, tmp); + __destroy_lock (tmp); + } + } + + return 0; +} + + +static posix_lock_t * +__find_exact_matching_lock (pl_inode_t *pinode, posix_lock_t *lock) +{ + posix_lock_t *l = NULL; + posix_lock_t *match = NULL; + + list_for_each_entry (l, DOMAIN_HEAD (pinode, GF_LOCK_INTERNAL), list) { + if (same_owner (l, lock) + && (l->fl_start == lock->fl_start) + && (l->fl_end == lock->fl_end)) { + match = l; + break; + } + } + + return match; +} + +/** + * pl_inodelk: + * + * This fop provides fcntl-style locking on files for internal + * purposes. Locks held through this fop reside in a domain different + * from those held by applications. This fop is for the use of AFR. + */ + + +static int +pl_inodelk_common (call_frame_t *frame, xlator_t *this, + inode_t *inode, int32_t cmd, struct flock *flock) +{ + int32_t op_ret = -1; + int32_t op_errno = 0; + int can_block = 0; + + posix_locks_private_t * priv = NULL; + transport_t * transport = NULL; + pid_t client_pid = -1; + pl_inode_t * pinode = NULL; + + posix_lock_t * reqlock = NULL; + posix_lock_t * matchlock = NULL; /* steady, fire! 
*/ + + VALIDATE_OR_GOTO (frame, unwind); + VALIDATE_OR_GOTO (inode, unwind); + VALIDATE_OR_GOTO (flock, unwind); + + if ((flock->l_start < 0) || (flock->l_len < 0)) { + op_errno = EINVAL; + goto unwind; + } + + transport = frame->root->trans; + client_pid = frame->root->pid; + + priv = (posix_locks_private_t *) this->private; + + VALIDATE_OR_GOTO (priv, unwind); + + pinode = pl_inode_get (this, inode); + if (!pinode) { + gf_log (this->name, GF_LOG_ERROR, + "out of memory :("); + op_errno = ENOMEM; + goto unwind; + } + + if (client_pid == 0) { + /* + special case: this means release all locks + from this transport + */ + + gf_log (this->name, GF_LOG_DEBUG, + "releasing all locks from transport %p", transport); + + delete_locks_of_transport (pinode, transport); + goto unwind; + } + + reqlock = new_posix_lock (flock, transport, client_pid); + if (!reqlock) { + gf_log (this->name, GF_LOG_ERROR, + "out of memory :("); + op_ret = -1; + op_errno = ENOMEM; + goto unwind; + } + + pthread_mutex_lock (&pinode->mutex); + { + switch (cmd) { + case F_SETLKW: + can_block = 1; + reqlock->frame = frame; + reqlock->this = this; + + /* fall through */ + + case F_SETLK: + memcpy (&reqlock->user_flock, flock, sizeof (struct flock)); + + switch (flock->l_type) { + + case F_WRLCK: + if (!pl_is_lock_grantable (pinode, reqlock, GF_LOCK_INTERNAL)) { + if (can_block) { + gf_log (this->name, GF_LOG_DEBUG, + "%s (pid=%d) %"PRId64" - %"PRId64" => blocked", + reqlock->fl_type == F_UNLCK ? "unlock" : "lock", + reqlock->client_pid, + reqlock->user_flock.l_start, + reqlock->user_flock.l_len); + pl_insert_lock (pinode, reqlock, GF_LOCK_INTERNAL); + + goto unlock; + } + + __destroy_lock (reqlock); + + + gf_log (this->name, GF_LOG_DEBUG, + "%s (pid=%d) %"PRId64" - %"PRId64" => NOK", + reqlock->fl_type == F_UNLCK ? 
"unlock" : "lock", + reqlock->client_pid, reqlock->user_flock.l_start, + reqlock->user_flock.l_len); + op_errno = EAGAIN; + + goto unlock; + } + + gf_log (this->name, GF_LOG_DEBUG, + "%s (pid=%d) %"PRId64" - %"PRId64" => OK", + reqlock->fl_type == F_UNLCK ? "unlock" : "lock", + reqlock->client_pid, + reqlock->user_flock.l_start, + reqlock->user_flock.l_len); + pl_insert_lock (pinode, reqlock, GF_LOCK_INTERNAL); + + break; + + case F_UNLCK: + matchlock = __find_exact_matching_lock (pinode, reqlock); + + __destroy_lock (reqlock); + if (!matchlock) { + op_errno = EINVAL; + goto unlock; + } + + __delete_lock (pinode, matchlock); + __destroy_lock (matchlock); + + break; + + default: + op_errno = ENOTSUP; + gf_log (this->name, GF_LOG_ERROR, + "lock type %d not supported for [F]INODELK", + flock->l_type); + goto unlock; + } + + + break; + + default: + op_errno = ENOTSUP; + gf_log (this->name, GF_LOG_ERROR, + "lock command F_GETLK not supported for [F]INODELK (cmd=%d)", + cmd); + goto unlock; + } + + op_ret = 0; + + unlock: + if (pinode) + pthread_mutex_unlock (&pinode->mutex); + } + +unwind: + STACK_UNWIND (frame, op_ret, op_errno); + return 0; +} + + +int +pl_inodelk (call_frame_t *frame, xlator_t *this, + loc_t *loc, int32_t cmd, struct flock *flock) +{ + return pl_inodelk_common (frame, this, loc->inode, cmd, flock); +} + + +int +pl_finodelk (call_frame_t *frame, xlator_t *this, + fd_t *fd, int32_t cmd, struct flock *flock) +{ + return pl_inodelk_common (frame, this, fd->inode, cmd, flock); +} + + +/** + * types_conflict - do two types of lock conflict? + * @t1: type + * @t2: type + * + * two read locks do not conflict + * any other case conflicts + */ + +static int +types_conflict (entrylk_type t1, entrylk_type t2) +{ + return !((t1 == ENTRYLK_RDLCK) && (t2 == ENTRYLK_RDLCK)); +} + +/** + * all_names - does a basename represent all names? + * @basename: name to check + */ + +#define all_names(basename) ((basename == NULL) ? 
1 : 0) + +/** + * names_conflict - do two names conflict? + * @n1: name + * @n2: name + */ + +static int +names_conflict (const char *n1, const char *n2) +{ + return all_names (n1) || all_names (n2) || !strcmp (n1, n2); +} + + +static int +names_equal (const char *n1, const char *n2) +{ + return (n1 == NULL && n2 == NULL) || (n1 && n2 && !strcmp (n1, n2)); +} + +/** + * lock_grantable - is this lock grantable? + * @inode: inode in which to look + * @basename: name we're trying to lock + * @type: type of lock + */ + +static pl_entry_lock_t * +__lock_grantable (pl_inode_t *pinode, const char *basename, entrylk_type type) +{ + pl_entry_lock_t *lock = NULL; + + if (list_empty (&pinode->dir_list)) + return NULL; + + list_for_each_entry (lock, &pinode->dir_list, inode_list) { + if (names_conflict (lock->basename, basename) && + types_conflict (lock->type, type)) + return lock; + } + + return NULL; +} + +/** + * find_most_matching_lock - find the lock struct which most matches in order of: + * lock on the exact basename || + * an all_names lock + * + * + * @inode: inode in which to look + * @basename: name to search for + */ + +static pl_entry_lock_t * +__find_most_matching_lock (pl_inode_t *pinode, const char *basename) +{ + pl_entry_lock_t *lock; + pl_entry_lock_t *all = NULL; + pl_entry_lock_t *exact = NULL; + + if (list_empty (&pinode->dir_list)) + return NULL; + + list_for_each_entry (lock, &pinode->dir_list, inode_list) { + if (all_names (lock->basename)) + all = lock; + else if (names_equal (lock->basename, basename)) + exact = lock; + } + + return (exact ? 
exact : all); +} + + +/** + * insert_new_lock - insert a new dir lock into the inode with the given parameters + * @pinode: inode to insert into + * @basename: basename for the lock + * @type: type of the lock + */ + +static pl_entry_lock_t * +new_entrylk_lock (pl_inode_t *pinode, const char *basename, entrylk_type type, + transport_t *trans) +{ + pl_entry_lock_t *newlock = NULL; + + newlock = CALLOC (sizeof (pl_entry_lock_t), 1); + if (!newlock) { + goto out; + } + + newlock->basename = basename ? strdup (basename) : NULL; + newlock->type = type; + newlock->trans = trans; + + if (type == ENTRYLK_RDLCK) + newlock->read_count = 1; + + INIT_LIST_HEAD (&newlock->inode_list); + INIT_LIST_HEAD (&newlock->blocked_locks); + +out: + return newlock; +} + +/** + * lock_name - lock a name in a directory + * @inode: inode for the directory in which to lock + * @basename: name of the entry to lock + * if null, lock the entire directory + * + * the entire directory being locked is represented as: a single + * pl_entry_lock_t present in the entrylk_locks list with its + * basename = NULL + */ + +int +__lock_name (pl_inode_t *pinode, const char *basename, entrylk_type type, + call_frame_t *frame, xlator_t *this, int nonblock) +{ + pl_entry_lock_t *lock = NULL; + pl_entry_lock_t *conf = NULL; + + transport_t *trans = NULL; + + int ret = -EINVAL; + + trans = frame->root->trans; + + conf = __lock_grantable (pinode, basename, type); + if (conf) { + ret = -EAGAIN; + if (nonblock) + goto out; + + lock = new_entrylk_lock (pinode, basename, type, trans); + + if (!lock) { + ret = -ENOMEM; + goto out; + } + + gf_log (this->name, GF_LOG_DEBUG, + "blocking lock: {pinode=%p, basename=%s}", + pinode, basename); + + lock->frame = frame; + lock->this = this; + lock->blocked = 1; + + list_add (&lock->blocked_locks, &conf->blocked_locks); + + + goto out; + } + + switch (type) { + case ENTRYLK_RDLCK: + lock = __find_most_matching_lock (pinode, basename); + + if (lock && names_equal (lock->basename, 
basename)) { + lock->read_count++; + + FREE (lock->basename); + FREE (lock); + + lock = NULL; + } else { + lock = new_entrylk_lock (pinode, basename, type, trans); + + if (!lock) { + ret = -ENOMEM; + goto out; + } + + list_add (&lock->inode_list, &pinode->dir_list); + } + break; + + case ENTRYLK_WRLCK: + lock = new_entrylk_lock (pinode, basename, type, trans); + + if (!lock) { + ret = -ENOMEM; + goto out; + } + + list_add (&lock->inode_list, &pinode->dir_list); + break; + } + + ret = 0; +out: + return ret; +} + + +/** + * unlock_name - unlock a name in a directory + * @inode: inode for the directory to unlock in + * @basename: name of the entry to unlock + * if null, unlock the entire directory + */ + +pl_entry_lock_t * +__unlock_name (pl_inode_t *pinode, const char *basename, entrylk_type type) +{ + pl_entry_lock_t *lock = NULL; + pl_entry_lock_t *ret_lock = NULL; + + lock = __find_most_matching_lock (pinode, basename); + + if (!lock) { + gf_log ("locks", GF_LOG_DEBUG, + "unlock on %s (type=%s) attempted but no matching lock found", + basename, type == ENTRYLK_RDLCK ? 
"ENTRYLK_RDLCK" : + "ENTRYLK_WRLCK"); + goto out; + } + + if (names_equal (lock->basename, basename) + && lock->type == type) { + if (type == ENTRYLK_RDLCK) { + lock->read_count--; + } + if (type == ENTRYLK_WRLCK || lock->read_count == 0) { + list_del (&lock->inode_list); + ret_lock = lock; + } + } else { + gf_log ("locks", GF_LOG_ERROR, + "unlock for a non-existing lock!"); + goto out; + } + +out: + return ret_lock; +} + + +void +__grant_blocked_entry_locks (xlator_t *this, pl_inode_t *pl_inode, + pl_entry_lock_t *lock, + struct list_head *granted) +{ + int bl_ret = 0; + pl_entry_lock_t *bl = NULL; + pl_entry_lock_t *tmp = NULL; + + list_for_each_entry_safe (bl, tmp, &lock->blocked_locks, + blocked_locks) { + list_del_init (&bl->blocked_locks); + + /* TODO: error checking */ + + gf_log ("locks", GF_LOG_DEBUG, + "trying to unblock: {pinode=%p, basename=%s}", + pl_inode, bl->basename); + + bl_ret = __lock_name (pl_inode, bl->basename, bl->type, + bl->frame, bl->this, 0); + + if (bl_ret == 0) { + list_add (&bl->blocked_locks, granted); + } else { + if (bl->basename) + FREE (bl->basename); + FREE (bl); + } + } + return; +} + + +void +grant_blocked_entry_locks (xlator_t *this, pl_inode_t *pl_inode, + pl_entry_lock_t *unlocked) +{ + struct list_head granted_list; + pl_entry_lock_t *tmp = NULL; + pl_entry_lock_t *lock = NULL; + + INIT_LIST_HEAD (&granted_list); + + pthread_mutex_lock (&pl_inode->mutex); + { + __grant_blocked_entry_locks (this, pl_inode, unlocked, + &granted_list); + } + pthread_mutex_unlock (&pl_inode->mutex); + + list_for_each_entry_safe (lock, tmp, &granted_list, blocked_locks) { + list_del_init (&lock->blocked_locks); + + STACK_UNWIND (lock->frame, 0, 0); + + FREE (lock->basename); + FREE (lock); + } + + FREE (unlocked->basename); + FREE (unlocked); + + return; +} + + +/** + * release_entry_locks_for_transport: release all entry locks from this + * transport for this loc_t + */ + +static int +release_entry_locks_for_transport (xlator_t *this, 
pl_inode_t *pinode, + transport_t *trans) +{ + pl_entry_lock_t *lock; + pl_entry_lock_t *tmp; + struct list_head granted; + + INIT_LIST_HEAD (&granted); + + pthread_mutex_lock (&pinode->mutex); + { + if (list_empty (&pinode->dir_list)) { + goto unlock; + } + + list_for_each_entry_safe (lock, tmp, &pinode->dir_list, + inode_list) { + if (lock->trans != trans) + continue; + + list_del_init (&lock->inode_list); + __grant_blocked_entry_locks (this, pinode, lock, + &granted); + + FREE (lock->basename); + FREE (lock); + } + } +unlock: + pthread_mutex_unlock (&pinode->mutex); + + list_for_each_entry_safe (lock, tmp, &granted, blocked_locks) { + list_del_init (&lock->blocked_locks); + + STACK_UNWIND (lock->frame, 0, 0); + + FREE (lock->basename); + FREE (lock); + } + + return 0; +} + + +/** + * pl_entrylk: + * + * Locking on names (directory entries) + */ + +int +pl_entrylk_common (call_frame_t *frame, xlator_t *this, + inode_t *inode, const char *basename, + entrylk_cmd cmd, entrylk_type type) +{ + int32_t op_ret = -1; + int32_t op_errno = 0; + + transport_t * transport = NULL; + pid_t pid = -1; + + pl_inode_t * pinode = NULL; + int ret = -1; + pl_entry_lock_t *unlocked = NULL; + char unwind = 1; + + pinode = pl_inode_get (this, inode); + if (!pinode) { + gf_log (this->name, GF_LOG_ERROR, + "out of memory :("); + op_errno = ENOMEM; + goto out; + } + + pid = frame->root->pid; + transport = frame->root->trans; + + if (pid == 0) { + /* + this is a special case that means release + all locks from this transport + */ + + gf_log (this->name, GF_LOG_DEBUG, + "releasing locks for transport %p", transport); + + release_entry_locks_for_transport (this, pinode, transport); + op_ret = 0; + + goto out; + } + + switch (cmd) { + case ENTRYLK_LOCK: + pthread_mutex_lock (&pinode->mutex); + { + ret = __lock_name (pinode, basename, type, + frame, this, 0); + } + pthread_mutex_unlock (&pinode->mutex); + + if (ret < 0) { + if (ret == -EAGAIN) + unwind = 0; + op_errno = -ret; + goto out; + } + 
+ break; + + case ENTRYLK_LOCK_NB: + pthread_mutex_lock (&pinode->mutex); + { + ret = __lock_name (pinode, basename, type, + frame, this, 1); + } + pthread_mutex_unlock (&pinode->mutex); + + if (ret < 0) { + op_errno = -ret; + goto out; + } + + break; + + case ENTRYLK_UNLOCK: + pthread_mutex_lock (&pinode->mutex); + { + unlocked = __unlock_name (pinode, basename, type); + } + pthread_mutex_unlock (&pinode->mutex); + + if (unlocked) + grant_blocked_entry_locks (this, pinode, unlocked); + + break; + + default: + gf_log (this->name, GF_LOG_ERROR, + "unexpected case!"); + goto out; + } + + op_ret = 0; +out: + if (unwind) { + STACK_UNWIND (frame, op_ret, op_errno); + } + + return 0; +} + + +int +pl_entrylk (call_frame_t *frame, xlator_t *this, + loc_t *loc, const char *basename, + entrylk_cmd cmd, entrylk_type type) +{ + return pl_entrylk_common (frame, this, loc->inode, basename, cmd, type); +} + + +int +pl_fentrylk (call_frame_t *frame, xlator_t *this, + fd_t *fd, const char *basename, + entrylk_cmd cmd, entrylk_type type) +{ + return pl_entrylk_common (frame, this, fd->inode, basename, cmd, type); +} diff --git a/xlators/features/locks/src/locks.h b/xlators/features/locks/src/locks.h new file mode 100644 index 000000000..8ed7bb63f --- /dev/null +++ b/xlators/features/locks/src/locks.h @@ -0,0 +1,111 @@ +/* + Copyright (c) 2006, 2007, 2008 Z RESEARCH, Inc. <http://www.zresearch.com> + This file is part of GlusterFS. + + GlusterFS is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published + by the Free Software Foundation; either version 3 of the License, + or (at your option) any later version. + + GlusterFS is distributed in the hope that it will be useful, but + WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + General Public License for more details. 
+ + You should have received a copy of the GNU General Public License + along with this program. If not, see + <http://www.gnu.org/licenses/>. +*/ + +#ifndef __POSIX_LOCKS_H__ +#define __POSIX_LOCKS_H__ + +#ifndef _CONFIG_H +#define _CONFIG_H +#include "config.h" +#endif + +#include "compat-errno.h" +#include "transport.h" +#include "stack.h" +#include "call-stub.h" + +struct __pl_fd; + +struct __posix_lock { + struct list_head list; + + short fl_type; + off_t fl_start; + off_t fl_end; + + short blocked; /* waiting to acquire */ + struct flock user_flock; /* the flock supplied by the user */ + xlator_t *this; /* required for blocked locks */ + fd_t *fd; + + call_frame_t *frame; + + /* These two together serve to uniquely identify each process + across nodes */ + + transport_t *transport; /* to identify client node */ + pid_t client_pid; /* pid of client process */ +}; +typedef struct __posix_lock posix_lock_t; + +struct __pl_rw_req_t { + struct list_head list; + call_stub_t *stub; + posix_lock_t region; +}; +typedef struct __pl_rw_req_t pl_rw_req_t; + + +struct __entry_lock { + struct list_head inode_list; /* list_head back to pl_inode_t */ + struct list_head blocked_locks; /* locks blocked due to this lock */ + + call_frame_t *frame; + xlator_t *this; + int blocked; + + const char *basename; + entrylk_type type; + unsigned int read_count; /* number of read locks */ + transport_t *trans; +}; +typedef struct __entry_lock pl_entry_lock_t; + + +/* The "simulated" inode. 
This contains a list of all the locks associated + with this file */ + +struct __pl_inode { + pthread_mutex_t mutex; + + struct list_head dir_list; /* list of entry locks */ + struct list_head ext_list; /* list of fcntl locks */ + struct list_head int_list; /* list of internal locks */ + struct list_head rw_list; /* list of waiting r/w requests */ + int mandatory; /* if mandatory locking is enabled */ +}; +typedef struct __pl_inode pl_inode_t; + +#define DOMAIN_HEAD(pl_inode, dom) (dom == GF_LOCK_POSIX \ + ? &pl_inode->ext_list \ + : &pl_inode->int_list) + + +struct __pl_fd { + gf_boolean_t nonblocking; /* whether O_NONBLOCK has been set */ +}; +typedef struct __pl_fd pl_fd_t; + + +typedef struct { + gf_boolean_t mandatory; /* if mandatory locking is enabled */ +} posix_locks_private_t; + + +#endif /* __POSIX_LOCKS_H__ */ diff --git a/xlators/features/locks/src/posix.c b/xlators/features/locks/src/posix.c new file mode 100644 index 000000000..e2b336607 --- /dev/null +++ b/xlators/features/locks/src/posix.c @@ -0,0 +1,834 @@ +/* + Copyright (c) 2006, 2007, 2008 Z RESEARCH, Inc. <http://www.zresearch.com> + This file is part of GlusterFS. + + GlusterFS is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published + by the Free Software Foundation; either version 3 of the License, + or (at your option) any later version. + + GlusterFS is distributed in the hope that it will be useful, but + WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program. If not, see + <http://www.gnu.org/licenses/>. 
+*/ + +#include <unistd.h> +#include <fcntl.h> +#include <limits.h> +#include <pthread.h> + +#ifndef _CONFIG_H +#define _CONFIG_H +#include "config.h" +#endif + +#include "glusterfs.h" +#include "compat.h" +#include "xlator.h" +#include "inode.h" +#include "logging.h" +#include "common-utils.h" + +#include "locks.h" +#include "common.h" + +#ifndef LLONG_MAX +#define LLONG_MAX LONG_LONG_MAX /* compat with old gcc */ +#endif /* LLONG_MAX */ + +/* Forward declarations */ + + +void do_blocked_rw (pl_inode_t *); +static int __rw_allowable (pl_inode_t *, posix_lock_t *, glusterfs_fop_t); + +struct _truncate_ops { + loc_t loc; + fd_t *fd; + off_t offset; + enum {TRUNCATE, FTRUNCATE} op; +}; + + +int +pl_truncate_cbk (call_frame_t *frame, void *cookie, xlator_t *this, + int32_t op_ret, int32_t op_errno, struct stat *buf) +{ + struct _truncate_ops *local = NULL; + + local = frame->local; + + if (local->op == TRUNCATE) + loc_wipe (&local->loc); + + STACK_UNWIND (frame, op_ret, op_errno, buf); + return 0; +} + + +static int +truncate_allowed (pl_inode_t *pl_inode, + transport_t *transport, pid_t client_pid, + off_t offset) +{ + posix_lock_t *l = NULL; + posix_lock_t region = {.list = {0, }, }; + int ret = 1; + + region.fl_start = offset; + region.fl_end = LLONG_MAX; + region.transport = transport; + region.client_pid = client_pid; + + pthread_mutex_lock (&pl_inode->mutex); + { + list_for_each_entry (l, &pl_inode->ext_list, list) { + if (!l->blocked + && locks_overlap (®ion, l) + && !same_owner (®ion, l)) { + ret = 0; + break; + } + } + } + pthread_mutex_unlock (&pl_inode->mutex); + + return ret; +} + + +static int +truncate_stat_cbk (call_frame_t *frame, void *cookie, xlator_t *this, + int32_t op_ret, int32_t op_errno, struct stat *buf) +{ + posix_locks_private_t *priv = NULL; + struct _truncate_ops *local = NULL; + inode_t *inode = NULL; + pl_inode_t *pl_inode = NULL; + + + priv = this->private; + local = frame->local; + + if (op_ret != 0) { + gf_log (this->name, 
GF_LOG_ERROR, + "got error (errno=%d, stderror=%s) from child", + op_errno, strerror (op_errno)); + goto unwind; + } + + if (local->op == TRUNCATE) + inode = local->loc.inode; + else + inode = local->fd->inode; + + pl_inode = pl_inode_get (this, inode); + if (!pl_inode) { + gf_log (this->name, GF_LOG_ERROR, + "unable to get pl_inode from %p", inode); + op_errno = ENOMEM; + goto unwind; + } + + if (priv->mandatory + && pl_inode->mandatory + && !truncate_allowed (pl_inode, frame->root->trans, + frame->root->pid, local->offset)) { + op_errno = EAGAIN; + goto unwind; + } + + switch (local->op) { + case TRUNCATE: + STACK_WIND (frame, pl_truncate_cbk, FIRST_CHILD (this), + FIRST_CHILD (this)->fops->truncate, + &local->loc, local->offset); + break; + case FTRUNCATE: + STACK_WIND (frame, pl_truncate_cbk, FIRST_CHILD (this), + FIRST_CHILD (this)->fops->ftruncate, + local->fd, local->offset); + break; + } + + return 0; + +unwind: + if (local->op == TRUNCATE) + loc_wipe (&local->loc); + + STACK_UNWIND (frame, -1, ENOMEM, buf); + return 0; +} + + +int +pl_truncate (call_frame_t *frame, xlator_t *this, + loc_t *loc, off_t offset) +{ + struct _truncate_ops *local = NULL; + + local = CALLOC (1, sizeof (struct _truncate_ops)); + if (!local) { + gf_log (this->name, GF_LOG_ERROR, + "out of memory :("); + goto unwind; + } + + local->op = TRUNCATE; + local->offset = offset; + loc_copy (&local->loc, loc); + + frame->local = local; + + STACK_WIND (frame, truncate_stat_cbk, FIRST_CHILD (this), + FIRST_CHILD (this)->fops->stat, loc); + + return 0; + +unwind: + STACK_UNWIND (frame, -1, ENOMEM, NULL); + + return 0; +} + + +int +pl_ftruncate (call_frame_t *frame, xlator_t *this, + fd_t *fd, off_t offset) +{ + struct _truncate_ops *local = NULL; + + local = CALLOC (1, sizeof (struct _truncate_ops)); + if (!local) { + gf_log (this->name, GF_LOG_ERROR, + "out of memory :("); + goto unwind; + } + + local->op = FTRUNCATE; + local->offset = offset; + local->fd = fd; + + frame->local = local; + + 
STACK_WIND (frame, truncate_stat_cbk, FIRST_CHILD(this),
+                    FIRST_CHILD(this)->fops->fstat, fd);
+        return 0;
+
+unwind:
+        STACK_UNWIND (frame, -1, ENOMEM, NULL);
+
+        return 0;
+}
+
+
+/*
+ * Remove and destroy every lock on pl_inode's ext_list and int_list whose
+ * transport and client_pid both match the given owner.
+ *
+ * pl_flush() calls this with pl_inode->mutex held.
+ */
+static void
+__delete_locks_of_owner (pl_inode_t *pl_inode,
+                         transport_t *transport, pid_t pid)
+{
+        posix_lock_t *tmp = NULL;
+        posix_lock_t *l = NULL;
+
+        /* TODO: what if it is a blocked lock with pending l->frame */
+
+        list_for_each_entry_safe (l, tmp, &pl_inode->ext_list, list) {
+                if ((l->transport == transport)
+                    && (l->client_pid == pid)) {
+                        __delete_lock (pl_inode, l);
+                        __destroy_lock (l);
+                }
+        }
+
+        list_for_each_entry_safe (l, tmp, &pl_inode->int_list, list) {
+                if ((l->transport == transport)
+                    && (l->client_pid == pid)) {
+                        __delete_lock (pl_inode, l);
+                        __destroy_lock (l);
+                }
+        }
+
+        return;
+}
+
+
+/* Pass the child's flush result straight back to the parent. */
+int
+pl_flush_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
+              int32_t op_ret, int32_t op_errno)
+{
+        STACK_UNWIND (frame, op_ret, op_errno);
+
+        return 0;
+}
+
+
+/*
+ * flush: drop every lock held on the inode by the calling
+ * (transport, pid) owner, give blocked lock requests and queued
+ * reads/writes a chance to proceed, then forward the flush to the child.
+ *
+ * Unwinds with EBADFD when no pl_inode can be obtained for fd->inode.
+ */
+int
+pl_flush (call_frame_t *frame, xlator_t *this,
+          fd_t *fd)
+{
+        pl_inode_t *pl_inode = NULL;
+
+        pl_inode = pl_inode_get (this, fd->inode);
+        if (!pl_inode) {
+                gf_log (this->name, GF_LOG_ERROR, "returning EBADFD");
+                STACK_UNWIND (frame, -1, EBADFD);
+                return 0;
+        }
+
+        pthread_mutex_lock (&pl_inode->mutex);
+        {
+                __delete_locks_of_owner (pl_inode, frame->root->trans,
+                                         frame->root->pid);
+        }
+        pthread_mutex_unlock (&pl_inode->mutex);
+
+        grant_blocked_locks (this, pl_inode, GF_LOCK_POSIX);
+        grant_blocked_locks (this, pl_inode, GF_LOCK_INTERNAL);
+
+        do_blocked_rw (pl_inode);
+
+        STACK_WIND (frame, pl_flush_cbk, FIRST_CHILD(this),
+                    FIRST_CHILD(this)->fops->flush, fd);
+        return 0;
+}
+
+
+/* Pass the child's open result straight back to the parent. */
+int
+pl_open_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
+             int32_t op_ret, int32_t op_errno, fd_t *fd)
+{
+        STACK_UNWIND (frame, op_ret, op_errno, fd);
+
+        return 0;
+}
+
+
+/* open: forward to the child with O_TRUNC masked out of the flags. */
+int
+pl_open (call_frame_t *frame, xlator_t *this,
+         loc_t *loc, int32_t flags, fd_t *fd)
+{
+        /* why isn't O_TRUNC being handled ? */
+        STACK_WIND (frame, pl_open_cbk,
+                    FIRST_CHILD(this), FIRST_CHILD(this)->fops->open,
+                    loc, flags & ~O_TRUNC, fd);
+
+        return 0;
+}
+
+
+/* Pass the child's create result straight back to the parent. */
+int
+pl_create_cbk (call_frame_t *frame, void *cookie,
+               xlator_t *this, int32_t op_ret, int32_t op_errno,
+               fd_t *fd, inode_t *inode, struct stat *buf)
+{
+        STACK_UNWIND (frame, op_ret, op_errno, fd, inode, buf);
+
+        return 0;
+}
+
+
+/* create: forwarded to the child unchanged. */
+int
+pl_create (call_frame_t *frame, xlator_t *this,
+           loc_t *loc, int32_t flags, mode_t mode, fd_t *fd)
+{
+        STACK_WIND (frame, pl_create_cbk,
+                    FIRST_CHILD (this), FIRST_CHILD (this)->fops->create,
+                    loc, flags, mode, fd);
+        return 0;
+}
+
+
+/* Pass the child's readv result straight back to the parent. */
+int
+pl_readv_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
+              int32_t op_ret, int32_t op_errno,
+              struct iovec *vector, int32_t count, struct stat *stbuf)
+{
+        STACK_UNWIND (frame, op_ret, op_errno, vector, count, stbuf);
+
+        return 0;
+}
+
+/* Pass the child's writev result straight back to the parent. */
+int
+pl_writev_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
+               int32_t op_ret, int32_t op_errno, struct stat *stbuf)
+{
+        STACK_UNWIND (frame, op_ret, op_errno, stbuf);
+
+        return 0;
+}
+
+
+/*
+ * Resume every queued read/write on pl_inode->rw_list that
+ * __rw_allowable() now permits.
+ *
+ * Eligible requests are moved to a private list while pl_inode->mutex is
+ * held; the stubs are resumed only after the mutex has been released.
+ */
+void
+do_blocked_rw (pl_inode_t *pl_inode)
+{
+        struct list_head  wind_list;
+        pl_rw_req_t      *rw = NULL;
+        pl_rw_req_t      *tmp = NULL;
+
+        INIT_LIST_HEAD (&wind_list);
+
+        pthread_mutex_lock (&pl_inode->mutex);
+        {
+                list_for_each_entry_safe (rw, tmp, &pl_inode->rw_list, list) {
+                        if (__rw_allowable (pl_inode, &rw->region,
+                                            rw->stub->fop)) {
+                                list_del_init (&rw->list);
+                                list_add_tail (&rw->list, &wind_list);
+                        }
+                }
+        }
+        pthread_mutex_unlock (&pl_inode->mutex);
+
+        list_for_each_entry_safe (rw, tmp, &wind_list, list) {
+                list_del_init (&rw->list);
+                call_resume (rw->stub);
+                free (rw);
+        }
+
+        return;
+}
+
+
+/*
+ * Decide whether a read or write covering `region` may go ahead.
+ *
+ * Walks pl_inode->ext_list and returns 0 as soon as a lock of another
+ * owner overlaps the region, except that a read is not stopped by a lock
+ * whose fl_type is not F_WRLCK.  Returns 1 otherwise.
+ *
+ * Callers in this file invoke it with pl_inode->mutex held.
+ */
+static int
+__rw_allowable (pl_inode_t *pl_inode, posix_lock_t *region,
+                glusterfs_fop_t op)
+{
+        posix_lock_t *l = NULL;
+        int           ret = 1;
+
+        list_for_each_entry (l, &pl_inode->ext_list, list) {
+                if (locks_overlap (l, region) && !same_owner (l, region)) {
+                        if ((op == GF_FOP_READ) && (l->fl_type != F_WRLCK))
+                                continue;
+                        ret = 0;
+                        break;
+                }
+        }
+
+        return ret;
+}
+
+
+/* Continuation used when a queued readv is resumed: wind it to the child. */
+int
+pl_readv_cont (call_frame_t *frame, xlator_t *this,
+               fd_t *fd, size_t size, off_t offset)
+{
+        STACK_WIND (frame, pl_readv_cbk,
+                    FIRST_CHILD (this), FIRST_CHILD (this)->fops->readv,
+                    fd, size, offset);
+
+        return 0;
+}
+
+
+/*
+ * readv: enforce mandatory locking before reading.
+ *
+ * When mandatory locking is off (translator-wide or for this inode) the
+ * read is wound to the child directly.  Otherwise:
+ *   - if no conflicting lock exists, the read is wound to the child;
+ *   - if the fd is O_NONBLOCK, the call unwinds with EWOULDBLOCK;
+ *   - else the request is queued on pl_inode->rw_list and is resumed
+ *     later by do_blocked_rw().
+ *
+ * Every path either winds, unwinds or queues the frame.
+ */
+int
+pl_readv (call_frame_t *frame, xlator_t *this,
+          fd_t *fd, size_t size, off_t offset)
+{
+        posix_locks_private_t *priv = NULL;
+        pl_inode_t            *pl_inode = NULL;
+        pl_rw_req_t           *rw = NULL;
+        posix_lock_t           region = {.list = {0, }, };
+        int                    op_ret = 0;
+        int                    op_errno = 0;
+        int                    wind_needed = 1;
+
+        priv = this->private;
+
+        pl_inode = pl_inode_get (this, fd->inode);
+        if (!pl_inode) {
+                /* same treatment pl_lk() gives a NULL pl_inode */
+                gf_log (this->name, GF_LOG_ERROR,
+                        "out of memory :(");
+                op_ret = -1;
+                op_errno = ENOMEM;
+                goto unwind;
+        }
+
+        if (priv->mandatory && pl_inode->mandatory) {
+                region.fl_start   = offset;
+                region.fl_end     = offset + size - 1;
+                region.transport  = frame->root->trans;
+                region.client_pid = frame->root->pid;
+
+                pthread_mutex_lock (&pl_inode->mutex);
+                {
+                        wind_needed = __rw_allowable (pl_inode, &region,
+                                                      GF_FOP_READ);
+                        if (wind_needed)
+                                goto unlock;
+
+                        if (fd->flags & O_NONBLOCK) {
+                                gf_log (this->name, GF_LOG_DEBUG,
+                                        "returning EWOULDBLOCK");
+                                op_errno = EWOULDBLOCK;
+                                op_ret = -1;
+                                goto unlock;
+                        }
+
+                        rw = CALLOC (1, sizeof (*rw));
+                        if (!rw) {
+                                gf_log (this->name, GF_LOG_ERROR,
+                                        "out of memory :(");
+                                op_errno = ENOMEM;
+                                op_ret = -1;
+                                goto unlock;
+                        }
+
+                        rw->stub = fop_readv_stub (frame, pl_readv_cont,
+                                                   fd, size, offset);
+                        if (!rw->stub) {
+                                gf_log (this->name, GF_LOG_ERROR,
+                                        "out of memory :(");
+                                op_errno = ENOMEM;
+                                op_ret = -1;
+                                free (rw);
+                                goto unlock;
+                        }
+
+                        rw->region = region;
+
+                        list_add_tail (&rw->list, &pl_inode->rw_list);
+                }
+        unlock:
+                pthread_mutex_unlock (&pl_inode->mutex);
+        }
+
+        /* An allowable read must still reach the child; previously it was
+           dropped here and the frame was never completed. */
+        if (wind_needed) {
+                STACK_WIND (frame, pl_readv_cbk,
+                            FIRST_CHILD (this),
+                            FIRST_CHILD (this)->fops->readv,
+                            fd, size, offset);
+                return 0;
+        }
+
+unwind:
+        /* op_ret == 0 here means the request was queued, not failed */
+        if (op_ret == -1)
+                STACK_UNWIND (frame, -1, op_errno, NULL, 0, NULL);
+
+        return 0;
+}
+
+
+/* Continuation used when a queued writev is resumed: wind it to the child. */
+int
+pl_writev_cont (call_frame_t *frame, xlator_t *this, fd_t *fd,
+                struct iovec *vector, int count, off_t offset)
+{
+        STACK_WIND (frame, pl_writev_cbk,
+                    FIRST_CHILD (this), FIRST_CHILD (this)->fops->writev,
+                    fd, vector, count, offset);
+
+        return 0;
+}
+
+
+/*
+ * writev: enforce mandatory locking before writing.
+ *
+ * Same decision table as pl_readv(): wind when mandatory locking is off
+ * or the write is allowable, unwind with EWOULDBLOCK for O_NONBLOCK fds,
+ * otherwise queue the request for do_blocked_rw().
+ */
+int
+pl_writev (call_frame_t *frame, xlator_t *this, fd_t *fd,
+           struct iovec *vector, int32_t count, off_t offset)
+{
+        posix_locks_private_t *priv = NULL;
+        pl_inode_t            *pl_inode = NULL;
+        pl_rw_req_t           *rw = NULL;
+        posix_lock_t           region = {.list = {0, }, };
+        int                    op_ret = 0;
+        int                    op_errno = 0;
+        int                    wind_needed = 1;
+
+        priv = this->private;
+
+        pl_inode = pl_inode_get (this, fd->inode);
+        if (!pl_inode) {
+                /* same treatment pl_lk() gives a NULL pl_inode */
+                gf_log (this->name, GF_LOG_ERROR,
+                        "out of memory :(");
+                op_ret = -1;
+                op_errno = ENOMEM;
+                goto unwind;
+        }
+
+        if (priv->mandatory && pl_inode->mandatory) {
+                region.fl_start   = offset;
+                region.fl_end     = offset + iov_length (vector, count) - 1;
+                region.transport  = frame->root->trans;
+                region.client_pid = frame->root->pid;
+
+                pthread_mutex_lock (&pl_inode->mutex);
+                {
+                        wind_needed = __rw_allowable (pl_inode, &region,
+                                                      GF_FOP_WRITE);
+                        if (wind_needed)
+                                goto unlock;
+
+                        if (fd->flags & O_NONBLOCK) {
+                                gf_log (this->name, GF_LOG_DEBUG,
+                                        "returning EWOULDBLOCK");
+                                op_errno = EWOULDBLOCK;
+                                op_ret = -1;
+                                goto unlock;
+                        }
+
+                        rw = CALLOC (1, sizeof (*rw));
+                        if (!rw) {
+                                gf_log (this->name, GF_LOG_ERROR,
+                                        "out of memory :(");
+                                op_errno = ENOMEM;
+                                op_ret = -1;
+                                goto unlock;
+                        }
+
+                        rw->stub = fop_writev_stub (frame, pl_writev_cont,
+                                                    fd, vector, count, offset);
+                        if (!rw->stub) {
+                                gf_log (this->name, GF_LOG_ERROR,
+                                        "out of memory :(");
+                                op_errno = ENOMEM;
+                                op_ret = -1;
+                                free (rw);
+                                goto unlock;
+                        }
+
+                        rw->region = region;
+
+                        list_add_tail (&rw->list, &pl_inode->rw_list);
+                }
+        unlock:
+                pthread_mutex_unlock (&pl_inode->mutex);
+        }
+
+        /* An allowable write must still reach the child; previously it was
+           dropped here and the frame was never completed. */
+        if (wind_needed) {
+                STACK_WIND (frame, pl_writev_cbk,
+                            FIRST_CHILD (this),
+                            FIRST_CHILD (this)->fops->writev,
+                            fd, vector, count, offset);
+                return 0;
+        }
+
+unwind:
+        /* argument list matches pl_writev_cbk (op_ret, op_errno, stbuf) */
+        if (op_ret == -1)
+                STACK_UNWIND (frame, -1, op_errno, NULL);
+
+        return 0;
+}
+
+
+/*
+ * lk: fcntl-style lock request on an open fd.
+ *
+ *   F_GETLK   - report the conflicting lock (via pl_getlk) in `flock`.
+ *   F_SETLK   - try to set the lock; EAGAIN when pl_setlk() returns -1.
+ *   F_SETLKW  - as F_SETLK, but when pl_setlk() returns -1 the frame is
+ *               left pending (no unwind here) with the request holding
+ *               frame/this/fd.
+ *
+ * Any other command unwinds with EINVAL.  The request lock is destroyed
+ * on every path on which it was not handed to pl_setlk() successfully or
+ * left pending.
+ */
+int
+pl_lk (call_frame_t *frame, xlator_t *this,
+       fd_t *fd, int32_t cmd, struct flock *flock)
+{
+        transport_t  *transport = NULL;
+        pid_t         client_pid = 0;
+        pl_inode_t   *pl_inode = NULL;
+        int           op_ret = 0;
+        int           op_errno = 0;
+        int           can_block = 0;
+        posix_lock_t *reqlock = NULL;
+        posix_lock_t *conf = NULL;
+        int           ret = 0;
+
+        transport  = frame->root->trans;
+        client_pid = frame->root->pid;
+
+        pl_inode = pl_inode_get (this, fd->inode);
+        if (!pl_inode) {
+                gf_log (this->name, GF_LOG_ERROR,
+                        "out of memory :(");
+                op_ret = -1;
+                op_errno = ENOMEM;
+                goto unwind;
+        }
+
+        reqlock = new_posix_lock (flock, transport, client_pid);
+        if (!reqlock) {
+                gf_log (this->name, GF_LOG_ERROR,
+                        "out of memory :(");
+                op_ret = -1;
+                op_errno = ENOMEM;
+                goto unwind;
+        }
+
+        switch (cmd) {
+
+#if F_GETLK != F_GETLK64
+        case F_GETLK64:
+#endif
+        case F_GETLK:
+                conf = pl_getlk (pl_inode, reqlock, GF_LOCK_POSIX);
+                posix_lock_to_flock (conf, flock);
+                __destroy_lock (reqlock);
+
+                break;
+
+#if F_SETLKW != F_SETLKW64
+        case F_SETLKW64:
+#endif
+        case F_SETLKW:
+                can_block = 1;
+                reqlock->frame = frame;
+                reqlock->this  = this;
+                reqlock->fd    = fd;
+
+                /* fall through */
+
+#if F_SETLK != F_SETLK64
+        case F_SETLK64:
+#endif
+        case F_SETLK:
+                memcpy (&reqlock->user_flock, flock, sizeof (struct flock));
+                ret = pl_setlk (this, pl_inode, reqlock,
+                                can_block, GF_LOCK_POSIX);
+
+                if (ret == -1) {
+                        /* blocking request: the frame stays pending */
+                        if (can_block)
+                                goto out;
+
+                        gf_log (this->name, GF_LOG_DEBUG, "returning EAGAIN");
+                        op_ret = -1;
+                        op_errno = EAGAIN;
+                        __destroy_lock (reqlock);
+                }
+
+                break;
+
+        default:
+                /* do not report success for, or leak the request of, a
+                   command this translator does not implement */
+                gf_log (this->name, GF_LOG_ERROR,
+                        "unexpected lock command %d", cmd);
+                op_ret = -1;
+                op_errno = EINVAL;
+                __destroy_lock (reqlock);
+
+                break;
+        }
+
+unwind:
+        STACK_UNWIND (frame, op_ret, op_errno, flock);
+out:
+        return 0;
+}
+
+
+/* TODO: this function just logs, no action required??
*/
+/*
+ * forget callback: log any request or lock still attached to the inode's
+ * pl_inode, then free the pl_inode.
+ *
+ * Does nothing when pl_inode_get() yields no pl_inode.
+ */
+int
+pl_forget (xlator_t *this,
+           inode_t *inode)
+{
+        pl_inode_t *pl_inode = NULL;
+
+        pl_inode = pl_inode_get (this, inode);
+        if (!pl_inode) {
+                /* nothing to inspect or free */
+                return 0;
+        }
+
+        if (!list_empty (&pl_inode->rw_list)) {
+                gf_log (this->name, GF_LOG_CRITICAL,
+                        "pending R/W requests found!");
+        }
+
+        if (!list_empty (&pl_inode->ext_list)) {
+                gf_log (this->name, GF_LOG_CRITICAL,
+                        "Pending fcntl locks found!");
+        }
+
+        if (!list_empty (&pl_inode->int_list)) {
+                gf_log (this->name, GF_LOG_CRITICAL,
+                        "Pending internal locks found!");
+        }
+
+        if (!list_empty (&pl_inode->dir_list)) {
+                gf_log (this->name, GF_LOG_CRITICAL,
+                        "Pending entry locks found!");
+        }
+
+        FREE (pl_inode);
+
+        return 0;
+}
+
+
+/*
+ * Translator constructor.
+ *
+ * Requires exactly one child, and the bottom-most translator reached by
+ * following first children must have a type starting with "storage/".
+ * Reads the boolean option "mandatory-locks" into priv->mandatory.
+ *
+ * Returns 0 on success, -1 on misconfiguration or allocation failure;
+ * nothing stays allocated on failure.
+ */
+int
+init (xlator_t *this)
+{
+        posix_locks_private_t *priv = NULL;
+        xlator_list_t         *trav = NULL;
+        data_t                *mandatory = NULL;
+
+        if (!this->children || this->children->next) {
+                gf_log (this->name, GF_LOG_ERROR,
+                        "FATAL: posix-locks should have exactly one child");
+                return -1;
+        }
+
+        if (!this->parents) {
+                gf_log (this->name, GF_LOG_WARNING,
+                        "dangling volume. check volfile ");
+        }
+
+        trav = this->children;
+        while (trav->xlator->children)
+                trav = trav->xlator->children;
+
+        if (strncmp ("storage/", trav->xlator->type, 8)) {
+                gf_log (this->name, GF_LOG_ERROR,
+                        "'posix-locks' not loaded over storage translator");
+                return -1;
+        }
+
+        priv = CALLOC (1, sizeof (*priv));
+        if (!priv) {
+                gf_log (this->name, GF_LOG_ERROR,
+                        "out of memory :(");
+                return -1;
+        }
+
+        mandatory = dict_get (this->options, "mandatory-locks");
+        if (mandatory) {
+                if (gf_string2boolean (mandatory->data,
+                                       &priv->mandatory) == -1) {
+                        gf_log (this->name, GF_LOG_ERROR,
+                                "'mandatory-locks' takes only boolean "
+                                "options");
+                        /* priv is not published yet; release it here */
+                        FREE (priv);
+                        return -1;
+                }
+        }
+
+        this->private = priv;
+        return 0;
+}
+
+
+/* Translator destructor: release the private structure set up by init(). */
+int
+fini (xlator_t *this)
+{
+        posix_locks_private_t *priv = NULL;
+
+        priv = this->private;
+        free (priv);
+
+        return 0;
+}
+
+
+/* Lock fops referenced by the table below; their definitions are not in
+   this part of the file. */
+int
+pl_inodelk (call_frame_t *frame, xlator_t *this,
+            loc_t *loc, int32_t cmd, struct flock *flock);
+
+int
+pl_finodelk (call_frame_t *frame, xlator_t *this,
+             fd_t *fd, int32_t cmd, struct flock *flock);
+
+int
+pl_entrylk (call_frame_t *frame, xlator_t *this,
+            loc_t *loc, const char *basename,
+            entrylk_cmd cmd, entrylk_type type);
+
+int
+pl_fentrylk (call_frame_t *frame, xlator_t *this,
+             fd_t *fd, const char *basename,
+             entrylk_cmd cmd, entrylk_type type);
+
+/* File operations this translator provides. */
+struct xlator_fops fops = {
+        .create      = pl_create,
+        .truncate    = pl_truncate,
+        .ftruncate   = pl_ftruncate,
+        .open        = pl_open,
+        .readv       = pl_readv,
+        .writev      = pl_writev,
+        .lk          = pl_lk,
+        .inodelk     = pl_inodelk,
+        .finodelk    = pl_finodelk,
+        .entrylk     = pl_entrylk,
+        .fentrylk    = pl_fentrylk,
+        .flush       = pl_flush,
+};
+
+
+struct xlator_mops mops = {
+};
+
+
+struct xlator_cbks cbks = {
+        .forget      = pl_forget,
+};
+
+
+/* Accepted volfile options; both spellings name the same boolean. */
+struct volume_options options[] = {
+        { .key  = { "mandatory-locks", "mandatory" },
+          .type = GF_OPTION_TYPE_BOOL
+        },
+        { .key = {NULL} },
+};
diff --git a/xlators/features/locks/tests/unit-test.c b/xlators/features/locks/tests/unit-test.c
new file mode 100644
index 000000000..6a1bfbf68
--- /dev/null
+++ b/xlators/features/locks/tests/unit-test.c
@@
-0,0 +1,84 @@
+/*
+  Copyright (c) 2006, 2007, 2008 Z RESEARCH, Inc. <http://www.zresearch.com>
+  This file is part of GlusterFS.
+
+  GlusterFS is free software; you can redistribute it and/or modify
+  it under the terms of the GNU General Public License as published
+  by the Free Software Foundation; either version 3 of the License,
+  or (at your option) any later version.
+
+  GlusterFS is distributed in the hope that it will be useful, but
+  WITHOUT ANY WARRANTY; without even the implied warranty of
+  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+  General Public License for more details.
+
+  You should have received a copy of the GNU General Public License
+  along with this program. If not, see
+  <http://www.gnu.org/licenses/>.
+*/
+
+#ifndef _CONFIG_H
+#define _CONFIG_H
+#include "config.h"
+#endif
+
+#include "glusterfs.h"
+#include "compat.h"
+#include "xlator.h"
+#include "inode.h"
+#include "logging.h"
+#include "common-utils.h"
+#include "list.h"
+
+#include "locks.h"
+#include "common.h"
+
+/* Jump to main()'s exit label, leaving ret non-zero, when cond is false. */
+#define expect(cond) do { if (!(cond)) { goto out; } } while (0)
+
+extern int lock_name (pl_inode_t *, const char *, entrylk_type);
+extern int unlock_name (pl_inode_t *, const char *, entrylk_type);
+
+/*
+ * Exercises lock_name()/unlock_name() on a single in-memory pl_inode.
+ * Exit status is 0 when every expectation holds and 1 at the first one
+ * that does not.
+ */
+int main (int argc, char **argv)
+{
+        int         ret = 1;
+        int         r = -1;
+        pl_inode_t *pinode = NULL;
+
+        /* (nmemb, size) order; fail the test rather than use a NULL inode */
+        pinode = CALLOC (1, sizeof (*pinode));
+        expect (pinode != NULL);
+
+        pthread_mutex_init (&pinode->dir_lock_mutex, NULL);
+        INIT_LIST_HEAD (&pinode->gf_dir_locks);
+
+        /* while a write lock taken with a NULL name is held, a write lock
+           on "foo" must fail with -EAGAIN */
+        r = lock_name (pinode, NULL, ENTRYLK_WRLCK); expect (r == 0);
+        {
+                r = lock_name (pinode, "foo", ENTRYLK_WRLCK); expect (r == -EAGAIN);
+        }
+        r = unlock_name (pinode, NULL, ENTRYLK_WRLCK); expect (r == 0);
+
+        /* two read locks on "foo" may coexist; a write lock on it must
+           fail with -EAGAIN while they are held */
+        r = lock_name (pinode, "foo", ENTRYLK_RDLCK); expect (r == 0);
+        {
+                r = lock_name (pinode, "foo", ENTRYLK_RDLCK); expect (r == 0);
+                {
+                        r = lock_name (pinode, "foo", ENTRYLK_WRLCK); expect (r == -EAGAIN);
+                }
+                r = unlock_name (pinode, "foo", ENTRYLK_RDLCK); expect (r == 0);
+        }
+        r = unlock_name (pinode, "foo", ENTRYLK_RDLCK); expect (r == 0);
+
+        /* with both read locks released, the write lock succeeds */
+        r = lock_name (pinode, "foo", ENTRYLK_WRLCK); expect (r == 0);
+        r = unlock_name (pinode, "foo", ENTRYLK_WRLCK); expect (r == 0);
+
+        /* a held write lock on "baz" refuses a read lock on "baz" */
+        r = lock_name (pinode, "baz", ENTRYLK_WRLCK); expect (r == 0);
+        r = lock_name (pinode, "baz", ENTRYLK_RDLCK); expect (r == -EAGAIN);
+
+        ret = 0;
+out:
+        return ret;
+}
diff --git a/xlators/features/path-convertor/Makefile.am b/xlators/features/path-convertor/Makefile.am
new file mode 100644
index 000000000..d471a3f92
--- /dev/null
+++ b/xlators/features/path-convertor/Makefile.am
@@ -0,0 +1,3 @@
+SUBDIRS = src
+
+CLEANFILES =
diff --git a/xlators/features/path-convertor/src/Makefile.am b/xlators/features/path-convertor/src/Makefile.am
new file mode 100644
index 000000000..1fde19352
--- /dev/null
+++ b/xlators/features/path-convertor/src/Makefile.am
@@ -0,0 +1,14 @@
+
+xlator_LTLIBRARIES = path-converter.la
+xlatordir = $(libdir)/glusterfs/$(PACKAGE_VERSION)/xlator/features
+
+path_converter_la_LDFLAGS = -module -avoidversion
+
+path_converter_la_SOURCES = path.c
+path_converter_la_LIBADD = $(top_builddir)/libglusterfs/src/libglusterfs.la
+
+AM_CFLAGS = -fPIC -D_FILE_OFFSET_BITS=64 -D_GNU_SOURCE -Wall -D$(GF_HOST_OS)\
+	-I$(top_srcdir)/libglusterfs/src -shared -nostartfiles $(GF_CFLAGS)
+
+CLEANFILES =
+
diff --git a/xlators/features/path-convertor/src/path.c b/xlators/features/path-convertor/src/path.c
new file mode 100644
index 000000000..41ef1d8a8
--- /dev/null
+++ b/xlators/features/path-convertor/src/path.c
@@ -0,0 +1,1217 @@
+/*
+  Copyright (c) 2008 Z RESEARCH, Inc. <http://www.zresearch.com>
+  This file is part of GlusterFS.
+
+  GlusterFS is free software; you can redistribute it and/or modify
+  it under the terms of the GNU General Public License as published
+  by the Free Software Foundation; either version 3 of the License,
+  or (at your option) any later version.
+
+  GlusterFS is distributed in the hope that it will be useful, but
+  WITHOUT ANY WARRANTY; without even the implied warranty of
+  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+  General Public License for more details.
+
+  You should have received a copy of the GNU General Public License
+  along with this program. If not, see
+  <http://www.gnu.org/licenses/>.
+*/
+
+/* TODO: add gf_log to all the cases returning errors */
+
+#ifndef _CONFIG_H
+#define _CONFIG_H
+#include "config.h"
+#endif
+
+/**
+ * xlators/features/path-translator:
+ *    This translator converts the path it gets into user specified targets.
+ */
+
+#include <sys/types.h>
+#include <regex.h>
+#include <time.h>
+#include <errno.h>
+#include "glusterfs.h"
+#include "xlator.h"
+
+/* Per-translator state filled in by init(). */
+typedef struct path_private
+{
+        int32_t this_len;
+        int32_t start_off;   /* option "start-offset" */
+        int32_t end_off;     /* option "end-offset"; 0 disables conversion */
+        char *this;
+        char *that;          /* option "replace-with", or "" (not owned) */
+        char *path;
+        regex_t *preg;       /* compiled option "regex", or NULL */
+} path_private_t;
+
+/*
+ * Convert a file-content xattr key so that the path formed by `path`
+ * followed by the key's suffix has every '/' between start_off and
+ * end_off removed.
+ *
+ * Returns `name` itself when nothing is converted, otherwise a newly
+ * allocated key (ZR_FILE_CONTENT_STR + converted path) that the caller
+ * must FREE.  Callers tell the two apart by comparing the result with
+ * `name`.
+ */
+static char *
+name_this_to_that (xlator_t *xl, const char *path, const char *name)
+{
+        path_private_t *priv = xl->private;
+        char priv_path[ZR_PATH_MAX] = {0,};
+        char *tmp_name = NULL;
+        int32_t path_len = strlen (path);
+        int32_t name_len = strlen (name) - ZR_FILE_CONTENT_STRLEN;
+        int32_t total_len = path_len + name_len;
+        int32_t i = 0, j = 0;
+
+        if (path_len >= priv->end_off)
+                return (char *)name;
+
+        if (!priv->end_off || (total_len <= priv->end_off))
+                return (char *)name;
+
+        /* priv_path is a fixed-size scratch buffer: leave keys that would
+           not fit (with their NUL) unconverted instead of overrunning it */
+        if (total_len >= ZR_PATH_MAX)
+                return (char *)name;
+
+        /* +1 for the NUL: when no '/' is dropped the result is exactly
+           ZR_FILE_CONTENT_STRLEN + total_len characters long */
+        tmp_name = CALLOC (1, (total_len + ZR_FILE_CONTENT_STRLEN + 1));
+        ERR_ABORT (tmp_name);
+
+        /* Get the complete path for the file first */
+        strcpy (tmp_name, path);
+        strcat (tmp_name, name + ZR_FILE_CONTENT_STRLEN);
+
+        /* bytes before start_off are kept as they are */
+        memcpy (priv_path, tmp_name, priv->start_off);
+
+        /* bytes in [start_off, end_off) are kept unless they are '/' */
+        j = priv->start_off;
+        for (i = priv->start_off; i < priv->end_off; i++) {
+                if (tmp_name[i] == '/')
+                        continue;
+                priv_path[j++] = tmp_name[i];
+        }
+
+        /* bytes from end_off on are kept as they are */
+        memcpy ((priv_path + j),
+                (tmp_name + priv->end_off),
+                (total_len - priv->end_off));
+        priv_path[(total_len - (priv->end_off - j))] = '\0';
+
+        strcpy (tmp_name, ZR_FILE_CONTENT_STR);
+        strcat (tmp_name, priv_path);
+
+        return tmp_name;
+}
+
+/* This function should return
+ *  NULL -
+ *  converted path - if path match
+ *  same path - if it doesn't match
+ *
+ * A converted path is newly allocated and must be FREEd by the caller;
+ * callers tell it apart by comparing the result with `path`.
+ *
+ * NOTE(review): a path longer than start_off but not longer than end_off
+ * comes back as just its first start_off bytes.  That is what the
+ * original code did and it is preserved here; presumably it lets the
+ * components being flattened resolve to the prefix directory - confirm.
+ */
+static char *
+path_this_to_that (xlator_t *xl, const char *path)
+{
+        path_private_t *priv = xl->private;
+        char *priv_path = NULL;
+        int32_t path_len = strlen (path);
+        int32_t i = 0, j = 0;
+
+        if (!priv->end_off || (path_len <= priv->start_off))
+                return (char *)path;
+
+        /* +1 for the NUL: when no '/' is dropped the converted path is as
+           long as the original and the terminator lands at [path_len] */
+        priv_path = CALLOC (1, path_len + 1);
+        ERR_ABORT (priv_path);
+
+        if (priv->start_off)
+                memcpy (priv_path, path, priv->start_off);
+
+        if (path_len > priv->end_off) {
+                j = priv->start_off;
+                for (i = priv->start_off; i < priv->end_off; i++) {
+                        if (path[i] == '/')
+                                continue;
+                        priv_path[j++] = path[i];
+                }
+                memcpy ((priv_path + j),
+                        (path + priv->end_off),
+                        (path_len - priv->end_off));
+                priv_path[(path_len - (priv->end_off - j))] = '\0';
+        }
+
+        return priv_path;
+}
+
+/*
+ * Point loc->path at its converted form for the duration of a wind.
+ * The caller's own pointer is stored in *saved.  Returns 0 on success and
+ * -1, leaving loc untouched, when path_this_to_that() yields NULL.
+ */
+static int
+path_loc_convert (xlator_t *this, loc_t *loc, char **saved)
+{
+        char *converted = NULL;
+
+        *saved = (char *)loc->path;
+
+        converted = path_this_to_that (this, loc->path);
+        if (!converted)
+                return -1;
+
+        loc->path = converted;
+        return 0;
+}
+
+/*
+ * Undo path_loc_convert(): give loc its original path back and free the
+ * converted copy if one was allocated.
+ */
+static void
+path_loc_restore (loc_t *loc, char *saved)
+{
+        char *converted = (char *)loc->path;
+
+        loc->path = saved;
+        if (converted != saved)
+                FREE (converted);
+}
+
+/*
+ * Callbacks.  Every *_cbk below unwinds to the parent with exactly the
+ * result arguments it was given.
+ */
+
+int32_t
+path_create_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
+                 int32_t op_ret, int32_t op_errno,
+                 fd_t *fd, inode_t *inode, struct stat *buf)
+{
+        STACK_UNWIND (frame, op_ret, op_errno, fd, inode, buf);
+        return 0;
+}
+
+int32_t
+path_open_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
+               int32_t op_ret, int32_t op_errno, fd_t *fd)
+{
+        STACK_UNWIND (frame, op_ret, op_errno, fd);
+        return 0;
+}
+
+int32_t
+path_getdents_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
+                   int32_t op_ret, int32_t op_errno,
+                   dir_entry_t *entries, int32_t count)
+{
+        STACK_UNWIND (frame, op_ret, op_errno, entries, count);
+        return 0;
+}
+
+int32_t
+path_readdir_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
+                  int32_t op_ret, int32_t op_errno, gf_dirent_t *buf)
+{
+        STACK_UNWIND (frame, op_ret, op_errno, buf);
+        return 0;
+}
+
+
+int32_t
+path_readlink_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
+                   int32_t op_ret, int32_t op_errno, const char *buf)
+{
+        STACK_UNWIND (frame, op_ret, op_errno, buf);
+        return 0;
+}
+
+int32_t
+path_lookup_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
+                 int32_t op_ret, int32_t op_errno,
+                 inode_t *inode, struct stat *buf, dict_t *xattr)
+{
+        STACK_UNWIND (frame, op_ret, op_errno, inode, buf, xattr);
+        return 0;
+}
+
+
+int32_t
+path_symlink_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
+                  int32_t op_ret, int32_t op_errno,
+                  inode_t *inode, struct stat *buf)
+{
+        STACK_UNWIND (frame, op_ret, op_errno, inode, buf);
+        return 0;
+}
+
+int32_t
+path_mknod_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
+                int32_t op_ret, int32_t op_errno,
+                inode_t *inode, struct stat *buf)
+{
+        STACK_UNWIND (frame, op_ret, op_errno, inode, buf);
+        return 0;
+}
+
+
+int32_t
+path_mkdir_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
+                int32_t op_ret, int32_t op_errno,
+                inode_t *inode, struct stat *buf)
+{
+        STACK_UNWIND (frame, op_ret, op_errno, inode, buf);
+        return 0;
+}
+
+int32_t
+path_link_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
+               int32_t op_ret, int32_t op_errno,
+               inode_t *inode, struct stat *buf)
+{
+        STACK_UNWIND (frame, op_ret, op_errno, inode, buf);
+        return 0;
+}
+
+int32_t
+path_opendir_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
+                  int32_t op_ret, int32_t op_errno, fd_t *fd)
+{
+        STACK_UNWIND (frame, op_ret, op_errno, fd);
+        return 0;
+}
+
+
+int32_t
+path_common_buf_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
+                     int32_t op_ret, int32_t op_errno, struct stat *buf)
+{
+        STACK_UNWIND (frame, op_ret, op_errno, buf);
+        return 0;
+}
+
+int32_t
+path_common_dict_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
+                      int32_t op_ret, int32_t op_errno, dict_t *dict)
+{
+        STACK_UNWIND (frame, op_ret, op_errno, dict);
+        return 0;
+}
+
+int32_t
+path_common_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
+                 int32_t op_ret, int32_t op_errno)
+{
+        STACK_UNWIND (frame, op_ret, op_errno);
+        return 0;
+}
+
+/*
+ * File operations.  Each one follows the same three steps:
+ *   1. swap loc->path for its converted form (unwind with ENOENT if the
+ *      conversion yields nothing),
+ *   2. wind the operation to the first child,
+ *   3. hand the caller's path back and free the converted copy.
+ */
+
+int32_t
+path_lookup (call_frame_t *frame, xlator_t *this,
+             loc_t *loc, dict_t *xattr_req)
+{
+        char *orig_path = NULL;
+
+        if (path_loc_convert (this, loc, &orig_path) != 0) {
+                STACK_UNWIND (frame, -1, ENOENT, NULL, NULL);
+                return 0;
+        }
+
+        STACK_WIND (frame, path_lookup_cbk,
+                    FIRST_CHILD(this),
+                    FIRST_CHILD(this)->fops->lookup,
+                    loc, xattr_req);
+
+        path_loc_restore (loc, orig_path);
+        return 0;
+}
+
+int32_t
+path_stat (call_frame_t *frame, xlator_t *this, loc_t *loc)
+{
+        char *orig_path = NULL;
+
+        if (path_loc_convert (this, loc, &orig_path) != 0) {
+                STACK_UNWIND (frame, -1, ENOENT, NULL, NULL);
+                return 0;
+        }
+
+        STACK_WIND (frame, path_common_buf_cbk,
+                    FIRST_CHILD(this),
+                    FIRST_CHILD(this)->fops->stat,
+                    loc);
+
+        path_loc_restore (loc, orig_path);
+        return 0;
+}
+
+int32_t
+path_readlink (call_frame_t *frame, xlator_t *this,
+               loc_t *loc, size_t size)
+{
+        char *orig_path = NULL;
+
+        if (path_loc_convert (this, loc, &orig_path) != 0) {
+                STACK_UNWIND (frame, -1, ENOENT, NULL, NULL);
+                return 0;
+        }
+
+        STACK_WIND (frame, path_readlink_cbk,
+                    FIRST_CHILD(this),
+                    FIRST_CHILD(this)->fops->readlink,
+                    loc, size);
+
+        path_loc_restore (loc, orig_path);
+        return 0;
+}
+
+int32_t
+path_mknod (call_frame_t *frame, xlator_t *this,
+            loc_t *loc, mode_t mode, dev_t dev)
+{
+        char *orig_path = NULL;
+
+        if (path_loc_convert (this, loc, &orig_path) != 0) {
+                STACK_UNWIND (frame, -1, ENOENT, NULL, NULL);
+                return 0;
+        }
+
+        STACK_WIND (frame, path_mknod_cbk,
+                    FIRST_CHILD(this),
+                    FIRST_CHILD(this)->fops->mknod,
+                    loc, mode, dev);
+
+        path_loc_restore (loc, orig_path);
+        return 0;
+}
+
+int32_t
+path_mkdir (call_frame_t *frame, xlator_t *this,
+            loc_t *loc, mode_t mode)
+{
+        char *orig_path = NULL;
+
+        if (path_loc_convert (this, loc, &orig_path) != 0) {
+                STACK_UNWIND (frame, -1, ENOENT, NULL, NULL);
+                return 0;
+        }
+
+        STACK_WIND (frame, path_mkdir_cbk,
+                    FIRST_CHILD(this),
+                    FIRST_CHILD(this)->fops->mkdir,
+                    loc, mode);
+
+        path_loc_restore (loc, orig_path);
+        return 0;
+}
+
+int32_t
+path_unlink (call_frame_t *frame, xlator_t *this, loc_t *loc)
+{
+        char *orig_path = NULL;
+
+        if (path_loc_convert (this, loc, &orig_path) != 0) {
+                STACK_UNWIND (frame, -1, ENOENT, NULL, NULL);
+                return 0;
+        }
+
+        STACK_WIND (frame, path_common_cbk,
+                    FIRST_CHILD(this),
+                    FIRST_CHILD(this)->fops->unlink,
+                    loc);
+
+        path_loc_restore (loc, orig_path);
+        return 0;
+}
+
+int32_t
+path_rmdir (call_frame_t *frame, xlator_t *this, loc_t *loc)
+{
+        char *orig_path = NULL;
+
+        if (path_loc_convert (this, loc, &orig_path) != 0) {
+                STACK_UNWIND (frame, -1, ENOENT, NULL, NULL);
+                return 0;
+        }
+
+        STACK_WIND (frame, path_common_cbk,
+                    FIRST_CHILD(this),
+                    FIRST_CHILD(this)->fops->rmdir,
+                    loc);
+
+        path_loc_restore (loc, orig_path);
+        return 0;
+}
+
+int32_t
+path_symlink (call_frame_t *frame, xlator_t *this,
+              const char *linkpath, loc_t *loc)
+{
+        char *orig_path = NULL;
+
+        if (path_loc_convert (this, loc, &orig_path) != 0) {
+                STACK_UNWIND (frame, -1, ENOENT, NULL, NULL);
+                return 0;
+        }
+
+        STACK_WIND (frame, path_symlink_cbk,
+                    FIRST_CHILD(this),
+                    FIRST_CHILD(this)->fops->symlink,
+                    linkpath, loc);
+
+        path_loc_restore (loc, orig_path);
+        return 0;
+}
+
+int32_t
+path_rename (call_frame_t *frame, xlator_t *this,
+             loc_t *oldloc, loc_t *newloc)
+{
+        char *orig_old = NULL;
+        char *orig_new = NULL;
+
+        if (path_loc_convert (this, oldloc, &orig_old) != 0) {
+                STACK_UNWIND (frame, -1, ENOENT, NULL, NULL);
+                return 0;
+        }
+
+        if (path_loc_convert (this, newloc, &orig_new) != 0) {
+                /* do not leave oldloc pointing at our converted copy */
+                path_loc_restore (oldloc, orig_old);
+                STACK_UNWIND (frame, -1, ENOENT, NULL, NULL);
+                return 0;
+        }
+
+        STACK_WIND (frame, path_common_buf_cbk,
+                    FIRST_CHILD(this),
+                    FIRST_CHILD(this)->fops->rename,
+                    oldloc, newloc);
+
+        path_loc_restore (oldloc, orig_old);
+        path_loc_restore (newloc, orig_new);
+        return 0;
+}
+
+int32_t
+path_link (call_frame_t *frame, xlator_t *this,
+           loc_t *oldloc, loc_t *newloc)
+{
+        char *orig_old = NULL;
+        char *orig_new = NULL;
+
+        if (path_loc_convert (this, oldloc, &orig_old) != 0) {
+                STACK_UNWIND (frame, -1, ENOENT, NULL, NULL);
+                return 0;
+        }
+
+        if (path_loc_convert (this, newloc, &orig_new) != 0) {
+                /* do not leave oldloc pointing at our converted copy */
+                path_loc_restore (oldloc, orig_old);
+                STACK_UNWIND (frame, -1, ENOENT, NULL, NULL);
+                return 0;
+        }
+
+        STACK_WIND (frame, path_link_cbk,
+                    FIRST_CHILD(this),
+                    FIRST_CHILD(this)->fops->link,
+                    oldloc, newloc);
+
+        path_loc_restore (oldloc, orig_old);
+        path_loc_restore (newloc, orig_new);
+        return 0;
+}
+
+int32_t
+path_chmod (call_frame_t *frame, xlator_t *this,
+            loc_t *loc, mode_t mode)
+{
+        char *orig_path = NULL;
+
+        if (path_loc_convert (this, loc, &orig_path) != 0) {
+                STACK_UNWIND (frame, -1, ENOENT, NULL, NULL);
+                return 0;
+        }
+
+        STACK_WIND (frame, path_common_buf_cbk,
+                    FIRST_CHILD(this),
+                    FIRST_CHILD(this)->fops->chmod,
+                    loc, mode);
+
+        path_loc_restore (loc, orig_path);
+        return 0;
+}
+
+int32_t
+path_chown (call_frame_t *frame, xlator_t *this,
+            loc_t *loc, uid_t uid, gid_t gid)
+{
+        char *orig_path = NULL;
+
+        if (path_loc_convert (this, loc, &orig_path) != 0) {
+                STACK_UNWIND (frame, -1, ENOENT, NULL, NULL);
+                return 0;
+        }
+
+        STACK_WIND (frame, path_common_buf_cbk,
+                    FIRST_CHILD(this),
+                    FIRST_CHILD(this)->fops->chown,
+                    loc, uid, gid);
+
+        path_loc_restore (loc, orig_path);
+        return 0;
+}
+
+int32_t
+path_truncate (call_frame_t *frame, xlator_t *this,
+               loc_t *loc, off_t offset)
+{
+        char *orig_path = NULL;
+
+        if (path_loc_convert (this, loc, &orig_path) != 0) {
+                STACK_UNWIND (frame, -1, ENOENT, NULL, NULL);
+                return 0;
+        }
+
+        STACK_WIND (frame, path_common_buf_cbk,
+                    FIRST_CHILD(this),
+                    FIRST_CHILD(this)->fops->truncate,
+                    loc, offset);
+
+        path_loc_restore (loc, orig_path);
+        return 0;
+}
+
+int32_t
+path_utimens (call_frame_t *frame, xlator_t *this,
+              loc_t *loc, struct timespec tv[2])
+{
+        char *orig_path = NULL;
+
+        if (path_loc_convert (this, loc, &orig_path) != 0) {
+                STACK_UNWIND (frame, -1, ENOENT, NULL, NULL);
+                return 0;
+        }
+
+        STACK_WIND (frame, path_common_buf_cbk,
+                    FIRST_CHILD(this),
+                    FIRST_CHILD(this)->fops->utimens,
+                    loc, tv);
+
+        path_loc_restore (loc, orig_path);
+        return 0;
+}
+
+int32_t
+path_open (call_frame_t *frame, xlator_t *this,
+           loc_t *loc, int32_t flags, fd_t *fd)
+{
+        char *orig_path = NULL;
+
+        if (path_loc_convert (this, loc, &orig_path) != 0) {
+                STACK_UNWIND (frame, -1, ENOENT, NULL, NULL);
+                return 0;
+        }
+
+        STACK_WIND (frame, path_open_cbk,
+                    FIRST_CHILD(this),
+                    FIRST_CHILD(this)->fops->open,
+                    loc, flags, fd);
+
+        path_loc_restore (loc, orig_path);
+        return 0;
+}
+
+int32_t
+path_create (call_frame_t *frame, xlator_t *this,
+             loc_t *loc, int32_t flags, mode_t mode, fd_t *fd)
+{
+        char *orig_path = NULL;
+
+        if (path_loc_convert (this, loc, &orig_path) != 0) {
+                STACK_UNWIND (frame, -1, ENOENT, NULL, NULL);
+                return 0;
+        }
+
+        STACK_WIND (frame, path_create_cbk,
+                    FIRST_CHILD(this),
+                    FIRST_CHILD(this)->fops->create,
+                    loc, flags, mode, fd);
+
+        path_loc_restore (loc, orig_path);
+        return 0;
+}
+
+/*
+ * setxattr: besides converting loc->path, a file-content key in the first
+ * dict member is converted for the duration of the wind.  The caller's
+ * key is put back before the temporary one is freed, so the dict never
+ * keeps a pointer to freed memory.
+ */
+int32_t
+path_setxattr (call_frame_t *frame, xlator_t *this,
+               loc_t *loc, dict_t *dict, int32_t flags)
+{
+        char        *tmp_name = NULL;
+        char        *orig_key = NULL;
+        data_pair_t *trav = dict->members_list;
+        char        *orig_path = NULL;
+
+        if (path_loc_convert (this, loc, &orig_path) != 0) {
+                STACK_UNWIND (frame, -1, ENOENT, NULL, NULL);
+                return 0;
+        }
+
+        /* an empty dict has no first member to inspect */
+        if (trav && ZR_FILE_CONTENT_REQUEST(trav->key)) {
+                tmp_name = name_this_to_that (this, loc->path, trav->key);
+                if (tmp_name != trav->key) {
+                        orig_key  = trav->key;
+                        trav->key = tmp_name;
+                } else {
+                        tmp_name = NULL;
+                }
+        }
+
+        STACK_WIND (frame, path_common_cbk,
+                    FIRST_CHILD(this),
+                    FIRST_CHILD(this)->fops->setxattr,
+                    loc, dict, flags);
+
+        path_loc_restore (loc, orig_path);
+
+        if (tmp_name) {
+                trav->key = orig_key;
+                FREE (tmp_name);
+        }
+
+        return 0;
+}
+
+/*
+ * getxattr: a file-content key is converted like the path.
+ * NOTE(review): `name` is guarded against NULL on the assumption that
+ * callers may pass NULL to ask for every attribute - confirm.
+ */
+int32_t
+path_getxattr (call_frame_t *frame, xlator_t *this,
+               loc_t *loc, const char *name)
+{
+        char *tmp_name = (char *)name;
+        char *orig_path = NULL;
+
+        if (path_loc_convert (this, loc, &orig_path) != 0) {
+                STACK_UNWIND (frame, -1, ENOENT, NULL, NULL);
+                return 0;
+        }
+
+        if (name && ZR_FILE_CONTENT_REQUEST(name)) {
+                tmp_name = name_this_to_that (this, loc->path, name);
+        }
+
+        STACK_WIND (frame, path_common_dict_cbk,
+                    FIRST_CHILD(this),
+                    FIRST_CHILD(this)->fops->getxattr,
+                    loc, tmp_name);
+
+        path_loc_restore (loc, orig_path);
+
+        if (tmp_name != name)
+                FREE (tmp_name);
+
+        return 0;
+}
+
+/* removexattr: a file-content key is converted like the path. */
+int32_t
+path_removexattr (call_frame_t *frame, xlator_t *this,
+                  loc_t *loc, const char *name)
+{
+        char *tmp_name = (char *)name;
+        char *orig_path = NULL;
+
+        if (path_loc_convert (this, loc, &orig_path) != 0) {
+                STACK_UNWIND (frame, -1, ENOENT, NULL, NULL);
+                return 0;
+        }
+
+        if (name && ZR_FILE_CONTENT_REQUEST(name)) {
+                tmp_name = name_this_to_that (this, loc->path, name);
+        }
+
+        STACK_WIND (frame, path_common_cbk,
+                    FIRST_CHILD(this),
+                    FIRST_CHILD(this)->fops->removexattr,
+                    loc, tmp_name);
+
+        path_loc_restore (loc, orig_path);
+
+        if (tmp_name != name)
+                FREE (tmp_name);
+
+        return 0;
+}
+
+int32_t
+path_opendir (call_frame_t *frame, xlator_t *this,
+              loc_t *loc, fd_t *fd)
+{
+        char *orig_path = NULL;
+
+        if (path_loc_convert (this, loc, &orig_path) != 0) {
+                STACK_UNWIND (frame, -1, ENOENT, NULL, NULL);
+                return 0;
+        }
+
+        STACK_WIND (frame, path_opendir_cbk,
+                    FIRST_CHILD(this),
+                    FIRST_CHILD(this)->fops->opendir,
+                    loc, fd);
+
+        path_loc_restore (loc, orig_path);
+        return 0;
+}
+
+int32_t
+path_access (call_frame_t *frame, xlator_t *this,
+             loc_t *loc, int32_t mask)
+{
+        char *orig_path = NULL;
+
+        if (path_loc_convert (this, loc, &orig_path) != 0) {
+                STACK_UNWIND (frame, -1, ENOENT, NULL, NULL);
+                return 0;
+        }
+
+        STACK_WIND (frame, path_common_cbk,
+                    FIRST_CHILD(this),
+                    FIRST_CHILD(this)->fops->access,
+                    loc, mask);
+
+        path_loc_restore (loc, orig_path);
+        return 0;
+}
+
+int32_t
+path_checksum_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
+                   int32_t op_ret, int32_t op_errno,
+                   uint8_t *fchecksum, uint8_t *dchecksum)
+{
+        STACK_UNWIND (frame, op_ret, op_errno, fchecksum, dchecksum);
+        return 0;
+}
+
+int32_t
+path_checksum (call_frame_t *frame, xlator_t *this,
+               loc_t *loc, int32_t flag)
+{
+        char *orig_path = NULL;
+
+        if (path_loc_convert (this, loc, &orig_path) != 0) {
+                STACK_UNWIND (frame, -1, ENOENT, NULL, NULL);
+                return 0;
+        }
+
+        STACK_WIND (frame, path_checksum_cbk,
+                    FIRST_CHILD(this),
+                    FIRST_CHILD(this)->fops->checksum,
+                    loc, flag);
+
+        path_loc_restore (loc, orig_path);
+        return 0;
+}
+
+
+int32_t
+path_entrylk (call_frame_t *frame, xlator_t *this,
+              loc_t *loc, const char *basename,
+              entrylk_cmd cmd, entrylk_type type)
+{
+        char *orig_path = NULL;
+
+        if (path_loc_convert (this, loc, &orig_path) != 0) {
+                STACK_UNWIND (frame, -1, ENOENT, NULL, NULL);
+                return 0;
+        }
+
+        STACK_WIND (frame, path_common_cbk,
+                    FIRST_CHILD(this),
+                    FIRST_CHILD(this)->fops->entrylk,
+                    loc, basename, cmd, type);
+
+        path_loc_restore (loc, orig_path);
+        return 0;
+}
+
+int32_t
+path_inodelk (call_frame_t *frame, xlator_t *this,
+              loc_t *loc, int32_t cmd, struct flock *lock)
+{
+        char *orig_path = NULL;
+
+        if (path_loc_convert (this, loc, &orig_path) != 0) {
+                STACK_UNWIND (frame, -1, ENOENT, NULL, NULL);
+                return 0;
+        }
+
+        STACK_WIND (frame, path_common_cbk,
+                    FIRST_CHILD(this),
+                    FIRST_CHILD(this)->fops->inodelk,
+                    loc, cmd, lock);
+
+        path_loc_restore (loc, orig_path);
+        return 0;
+}
+
+
+int32_t
+path_xattrop (call_frame_t *frame, xlator_t *this,
+              loc_t *loc, gf_xattrop_flags_t flags, dict_t *dict)
+{
+        char *orig_path = NULL;
+
+        if (path_loc_convert (this, loc, &orig_path) != 0) {
+                STACK_UNWIND (frame, -1, ENOENT, NULL, NULL);
+                return 0;
+        }
+
+        STACK_WIND (frame, path_common_dict_cbk,
+                    FIRST_CHILD(this),
+                    FIRST_CHILD(this)->fops->xattrop,
+                    loc, flags, dict);
+
+        path_loc_restore (loc, orig_path);
+        return 0;
+}
+
+
+/*
+ * Translator constructor.
+ *
+ * Requires exactly one subvolume.  Reads "start-offset" and "end-offset",
+ * and, when "regex" is present, compiles it and records "replace-with"
+ * (or "" when that option is absent).
+ *
+ * Returns 0 on success and -1 on misconfiguration; nothing stays
+ * allocated on failure.
+ */
+int32_t
+init (xlator_t *this)
+{
+        dict_t *options = this->options;
+        path_private_t *priv = NULL;
+
+        if (!this->children || this->children->next) {
+                gf_log (this->name, GF_LOG_ERROR,
+                        "path translator requires exactly one subvolume");
+                return -1;
+        }
+
+        if (!this->parents) {
+                gf_log (this->name, GF_LOG_WARNING,
+                        "dangling volume. check volfile ");
+        }
+
+        priv = CALLOC (1, sizeof (*priv));
+        ERR_ABORT (priv);
+        if (dict_get (options, "start-offset")) {
+                priv->start_off = data_to_int32 (dict_get (options,
+                                                           "start-offset"));
+        }
+        if (dict_get (options, "end-offset")) {
+                priv->end_off = data_to_int32 (dict_get (options,
+                                                         "end-offset"));
+        }
+
+        /* the conversion routines copy [0, start_off) and then walk
+           [start_off, end_off); an inverted range would make them write
+           past the text they were given */
+        if (priv->end_off && (priv->start_off > priv->end_off)) {
+                gf_log (this->name, GF_LOG_ERROR,
+                        "'start-offset' must not be greater than "
+                        "'end-offset'");
+                FREE (priv);
+                return -1;
+        }
+
+        if (dict_get (options, "regex")) {
+                int32_t ret = 0;
+                priv->preg = CALLOC (1, sizeof (regex_t));
+                ERR_ABORT (priv->preg);
+                ret = regcomp (priv->preg,
+                               data_to_str (dict_get (options, "regex")),
+                               REG_EXTENDED);
+                if (ret) {
+                        gf_log (this->name, GF_LOG_ERROR,
+                                "Failed to compile the 'option regex'");
+                        /* regcomp failed, so there is nothing to regfree;
+                           only the two allocations are released */
+                        FREE (priv->preg);
+                        FREE (priv);
+                        return -1;
+                }
+                if (dict_get (options, "replace-with")) {
+                        priv->that = data_to_str (dict_get (options,
+                                                            "replace-with"));
+                } else {
+                        priv->that = "";
+                }
+        }
+
+        this->private = priv;
+        return 0;
+}
+
+/*
+ * Translator destructor: release what init() allocated.  priv->that is
+ * not freed: init() points it at a string literal or at data obtained
+ * through data_to_str().
+ */
+void
+fini (xlator_t *this)
+{
+        path_private_t *priv = this->private;
+
+        if (!priv)
+                return;
+
+        if (priv->preg) {
+                regfree (priv->preg);
+                FREE (priv->preg);
+        }
+
+        FREE (priv);
+        this->private = NULL;
+
+        return;
+}
+
+/* File operations this translator intercepts. */
+struct xlator_fops fops = {
+        .stat        = path_stat,
+        .readlink    = path_readlink,
+        .mknod       = path_mknod,
+        .mkdir       = path_mkdir,
+        .unlink      = path_unlink,
+        .rmdir       = path_rmdir,
+        .symlink     = path_symlink,
+        .rename      = path_rename,
+        .link        = path_link,
+        .chmod       = path_chmod,
+        .chown       = path_chown,
+        .truncate    = path_truncate,
+        .utimens     = path_utimens,
+        .open        = path_open,
+        .setxattr    = path_setxattr,
+        .getxattr    = path_getxattr,
+        .removexattr = path_removexattr,
+        .opendir     = path_opendir,
+        .access      = path_access,
+        .create      = path_create,
+        .lookup      = path_lookup,
+        .checksum    = path_checksum,
+        .xattrop     = path_xattrop,
+        .entrylk     = path_entrylk,
+        .inodelk     = path_inodelk,
+};
+
+
+struct xlator_mops mops = {
+};
+
+
+struct xlator_cbks cbks = {
+};
+
+/* Accepted volfile options and their permitted ranges. */
+struct volume_options options[] = {
+        { .key  = {"start-offset"},
+          .type = GF_OPTION_TYPE_INT,
+          .min  = 0,
+          .max  = 4095
+        },
+        { .key  = {"end-offset"},
+          .type = GF_OPTION_TYPE_INT,
+          .min  = 1,
+          .max  = 4096
+        },
+        { .key  = {"replace-with"},
+          .type = GF_OPTION_TYPE_ANY
+        },
+        { .key = {NULL} },
+};
diff --git
a/xlators/features/quota/Makefile.am b/xlators/features/quota/Makefile.am new file mode 100644 index 000000000..d471a3f92 --- /dev/null +++ b/xlators/features/quota/Makefile.am @@ -0,0 +1,3 @@ +SUBDIRS = src + +CLEANFILES = diff --git a/xlators/features/quota/src/Makefile.am b/xlators/features/quota/src/Makefile.am new file mode 100644 index 000000000..886d83964 --- /dev/null +++ b/xlators/features/quota/src/Makefile.am @@ -0,0 +1,13 @@ +xlator_LTLIBRARIES = quota.la +xlatordir = $(libdir)/glusterfs/$(PACKAGE_VERSION)/xlator/features + +quota_la_LDFLAGS = -module -avoidversion + +quota_la_SOURCES = quota.c +quota_la_LIBADD = $(top_builddir)/libglusterfs/src/libglusterfs.la + +AM_CFLAGS = -fPIC -D_FILE_OFFSET_BITS=64 -D_GNU_SOURCE -Wall -D$(GF_HOST_OS) \ + -I$(top_srcdir)/libglusterfs/src -shared -nostartfiles $(GF_CFLAGS) + +CLEANFILES = + diff --git a/xlators/features/quota/src/quota.c b/xlators/features/quota/src/quota.c new file mode 100644 index 000000000..c898899b5 --- /dev/null +++ b/xlators/features/quota/src/quota.c @@ -0,0 +1,1056 @@ +/* + Copyright (c) 2008, 2009 Z RESEARCH, Inc. <http://www.zresearch.com> + This file is part of GlusterFS. + + GlusterFS is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published + by the Free Software Foundation; either version 3 of the License, + or (at your option) any later version. + + GlusterFS is distributed in the hope that it will be useful, but + WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program. If not, see + <http://www.gnu.org/licenses/>. 
+*/ + +#ifndef _CONFIG_H +#define _CONFIG_H +#include "config.h" +#endif + +#include <sys/time.h> + +#include "xlator.h" +#include "defaults.h" +#include "common-utils.h" + +struct quota_local { + struct stat stbuf; + inode_t *inode; + char *path; + fd_t *fd; + off_t offset; + int32_t count; + struct iovec *vector; + dict_t *refs; + loc_t loc; +}; + + +struct quota_priv { + char only_first_time; /* Used to make sure a call is done only one time */ + gf_lock_t lock; /* Used while updating variables */ + + uint64_t disk_usage_limit; /* Used for Disk usage quota */ + uint64_t current_disk_usage; /* Keep the current usage value */ + + uint32_t min_free_disk_limit; /* user specified limit, in %*/ + uint32_t current_free_disk; /* current free disk space available, in % */ + uint32_t refresh_interval; /* interval in seconds */ + uint32_t min_disk_last_updated_time; /* used for interval calculation */ +}; + + +int +quota_statvfs_cbk (call_frame_t *frame, void *cookie, xlator_t *this, + int32_t op_ret, int32_t op_errno, struct statvfs *stbuf) +{ + struct quota_priv *priv = this->private; + + if (op_ret >= 0) { + priv->current_free_disk = + (stbuf->f_bavail * 100) / stbuf->f_blocks; + } + + STACK_DESTROY (frame->root); + return 0; +} + + +static void +build_root_loc (xlator_t *this, loc_t *loc) +{ + loc->path = "/"; +} + + +void +gf_quota_usage_subtract (xlator_t *this, size_t size) +{ + struct quota_priv *priv = NULL; + + priv = this->private; + + LOCK (&priv->lock); + { + if (priv->current_disk_usage < size) + priv->current_disk_usage = 0; + else + priv->current_disk_usage -= size; + } + UNLOCK (&priv->lock); +} + + +void +gf_quota_usage_add (xlator_t *this, size_t size) +{ + struct quota_priv *priv = this->private; + + LOCK (&priv->lock); + { + priv->current_disk_usage += size; + } + UNLOCK (&priv->lock); +} + + +void +gf_quota_update_current_free_disk (xlator_t *this) +{ + call_frame_t *frame = NULL; + call_pool_t *pool = NULL; + loc_t loc; + + pool = this->ctx->pool; + 
frame = create_frame (this, pool); + + build_root_loc (this, &loc); + + STACK_WIND (frame, quota_statvfs_cbk, + this->children->xlator, + this->children->xlator->fops->statfs, &loc); + + return ; +} + + +int +gf_quota_check_free_disk (xlator_t *this) +{ + struct quota_priv * priv = NULL; + struct timeval tv = {0, 0}; + + priv = this->private; + if (priv->min_free_disk_limit) { + gettimeofday (&tv, NULL); + if (tv.tv_sec > (priv->refresh_interval + + priv->min_disk_last_updated_time)) { + priv->min_disk_last_updated_time = tv.tv_sec; + gf_quota_update_current_free_disk (this); + } + if (priv->current_free_disk <= priv->min_free_disk_limit) + return -1; + } + + return 0; +} + + +int +quota_truncate_cbk (call_frame_t *frame, void *cookie, xlator_t *this, + int32_t op_ret, int32_t op_errno, struct stat *buf) +{ + struct quota_priv *priv = this->private; + struct quota_local *local = NULL; + + local = frame->local; + + if ((op_ret >= 0) && priv->disk_usage_limit) { + gf_quota_usage_subtract (this, (local->stbuf.st_blocks - + buf->st_blocks) * 512); + loc_wipe (&local->loc); + } + + STACK_UNWIND (frame, op_ret, op_errno, buf); + return 0; +} + + +int +quota_truncate_stat_cbk (call_frame_t *frame, void *cookie, xlator_t *this, + int32_t op_ret, int32_t op_errno, struct stat *buf) +{ + struct quota_local *local = NULL; + struct quota_priv *priv = NULL; + + priv = this->private; + local = frame->local; + + if (op_ret >= 0) { + local->stbuf = *buf; + } + + STACK_WIND (frame, quota_truncate_cbk, + FIRST_CHILD (this), FIRST_CHILD (this)->fops->truncate, + &local->loc, local->offset); + return 0; +} + + +int +quota_truncate (call_frame_t *frame, xlator_t *this, + loc_t *loc, off_t offset) +{ + struct quota_local *local = NULL; + struct quota_priv *priv = NULL; + + priv = this->private; + + if (priv->disk_usage_limit) { + local = CALLOC (1, sizeof (struct quota_local)); + frame->local = local; + + loc_copy (&local->loc, loc); + local->offset = offset; + + STACK_WIND (frame, 
quota_truncate_stat_cbk, + FIRST_CHILD(this), + FIRST_CHILD(this)->fops->stat, loc); + return 0; + } + + STACK_WIND (frame, quota_truncate_cbk, + FIRST_CHILD(this), + FIRST_CHILD(this)->fops->truncate, + loc, offset); + return 0; +} + + +int +quota_ftruncate_cbk (call_frame_t *frame, void *cookie, xlator_t *this, + int32_t op_ret, int32_t op_errno, struct stat *buf) +{ + struct quota_priv *priv = NULL; + struct quota_local *local = NULL; + + local = frame->local; + priv = this->private; + + if ((op_ret >= 0) && priv->disk_usage_limit) { + gf_quota_usage_subtract (this, (local->stbuf.st_blocks - + buf->st_blocks) * 512); + fd_unref (local->fd); + } + + STACK_UNWIND (frame, op_ret, op_errno, buf); + return 0; +} + + +int +quota_ftruncate_fstat_cbk (call_frame_t *frame, void *cookie, xlator_t *this, + int32_t op_ret, int32_t op_errno, struct stat *buf) +{ + struct quota_local *local = NULL; + struct quota_priv *priv = NULL; + + priv = this->private; + local = frame->local; + + if (op_ret >= 0) { + local->stbuf = *buf; + } + + STACK_WIND (frame, quota_ftruncate_cbk, + FIRST_CHILD (this), FIRST_CHILD (this)->fops->ftruncate, + local->fd, local->offset); + return 0; +} + + +int +quota_ftruncate (call_frame_t *frame, xlator_t *this, + fd_t *fd, off_t offset) +{ + struct quota_local *local = NULL; + struct quota_priv *priv = NULL; + + + priv = this->private; + + if (priv->disk_usage_limit) { + local = CALLOC (1, sizeof (struct quota_local)); + frame->local = local; + + local->fd = fd_ref (fd); + local->offset = offset; + + STACK_WIND (frame, quota_ftruncate_fstat_cbk, + FIRST_CHILD(this), + FIRST_CHILD(this)->fops->fstat, fd); + return 0; + } + + STACK_WIND (frame, quota_ftruncate_cbk, + FIRST_CHILD(this), + FIRST_CHILD(this)->fops->ftruncate, + fd, offset); + return 0; +} + + +int +quota_mknod_cbk (call_frame_t *frame, void *cookie, xlator_t *this, + int32_t op_ret, int32_t op_errno, + inode_t *inode, struct stat *buf) +{ + struct quota_priv *priv = NULL; + + priv = 
this->private; + + if ((op_ret >= 0) && priv->disk_usage_limit) { + gf_quota_usage_add (this, buf->st_blocks * 512); + } + + STACK_UNWIND (frame, op_ret, op_errno, inode, buf); + return 0; +} + + +int +quota_mknod (call_frame_t *frame, xlator_t *this, + loc_t *loc, mode_t mode, dev_t rdev) +{ + struct quota_priv *priv = NULL; + + priv = this->private; + + if (gf_quota_check_free_disk (this) == -1) { + gf_log (this->name, GF_LOG_ERROR, + "min-free-disk limit (%u) crossed, current available is %u", + priv->min_free_disk_limit, priv->current_free_disk); + STACK_UNWIND (frame, -1, ENOSPC, NULL, NULL); + return 0; + } + + if (priv->current_disk_usage > priv->disk_usage_limit) { + gf_log (this->name, GF_LOG_ERROR, + "Disk usage limit (%"PRIu64") crossed, current usage is %"PRIu64"", + priv->disk_usage_limit, priv->current_disk_usage); + STACK_UNWIND (frame, -1, ENOSPC, NULL, NULL); + return 0; + } + + STACK_WIND (frame, quota_mknod_cbk, + FIRST_CHILD(this), + FIRST_CHILD(this)->fops->mknod, + loc, mode, rdev); + return 0; +} + + +int +quota_mkdir_cbk (call_frame_t *frame, void *cookie, xlator_t *this, + int32_t op_ret, int32_t op_errno, inode_t *inode, + struct stat *buf) +{ + struct quota_priv *priv = NULL; + + priv = this->private; + + if ((op_ret >= 0) && priv->disk_usage_limit) { + gf_quota_usage_subtract (this, buf->st_blocks * 512); + } + + STACK_UNWIND (frame, op_ret, op_errno, inode, buf); + return 0; +} + + +int +quota_mkdir (call_frame_t *frame, xlator_t *this, loc_t *loc, mode_t mode) +{ + struct quota_priv *priv = NULL; + + priv = this->private; + + if (gf_quota_check_free_disk (this) == -1) { + gf_log (this->name, GF_LOG_ERROR, + "min-free-disk limit (%u) crossed, current available is %u", + priv->min_free_disk_limit, priv->current_free_disk); + STACK_UNWIND (frame, -1, ENOSPC, NULL, NULL); + return 0; + + } + + if (priv->current_disk_usage > priv->disk_usage_limit) { + gf_log (this->name, GF_LOG_ERROR, + "Disk usage limit (%"PRIu64") crossed, current usage 
is %"PRIu64"", + priv->disk_usage_limit, priv->current_disk_usage); + STACK_UNWIND (frame, -1, ENOSPC, NULL, NULL); + return 0; + } + + STACK_WIND (frame, quota_mkdir_cbk, + FIRST_CHILD(this), + FIRST_CHILD(this)->fops->mkdir, + loc, mode); + + return 0; +} + + +int +quota_unlink_cbk (call_frame_t *frame, void *cookie, xlator_t *this, + int32_t op_ret, int32_t op_errno) +{ + struct quota_local *local = NULL; + + local = frame->local; + + if (local) { + if (op_ret >= 0) { + gf_quota_usage_subtract (this, + local->stbuf.st_blocks * 512); + } + loc_wipe (&local->loc); + } + + STACK_UNWIND (frame, op_ret, op_errno); + return 0; +} + + +int +quota_unlink_stat_cbk (call_frame_t *frame, void *cookie, xlator_t *this, + int32_t op_ret, int32_t op_errno, struct stat *buf) +{ + struct quota_local *local = NULL; + + local = frame->local; + + if (op_ret >= 0) { + if (buf->st_nlink == 1) { + local->stbuf = *buf; + } + } + + STACK_WIND (frame, quota_unlink_cbk, + FIRST_CHILD(this), + FIRST_CHILD(this)->fops->unlink, + &local->loc); + + return 0; +} + + +int +quota_unlink (call_frame_t *frame, xlator_t *this, loc_t *loc) +{ + struct quota_local *local = NULL; + struct quota_priv *priv = NULL; + + priv = this->private; + + if (priv->disk_usage_limit) { + local = CALLOC (1, sizeof (struct quota_local)); + frame->local = local; + + loc_copy (&local->loc, loc); + + STACK_WIND (frame, + quota_unlink_stat_cbk, + FIRST_CHILD(this), + FIRST_CHILD(this)->fops->stat, + loc); + return 0; + } + + STACK_WIND (frame, quota_unlink_cbk, + FIRST_CHILD(this), + FIRST_CHILD(this)->fops->unlink, + loc); + return 0; +} + + +int +quota_rmdir_cbk (call_frame_t *frame, void *cookie, xlator_t *this, + int32_t op_ret, int32_t op_errno) +{ + struct quota_local *local = NULL; + + local = frame->local; + + if (local) { + if (op_ret >= 0) { + gf_quota_usage_subtract (this, local->stbuf.st_blocks * 512); + } + loc_wipe (&local->loc); + } + + STACK_UNWIND (frame, op_ret, op_errno); + return 0; +} + + +int 
+quota_rmdir_stat_cbk (call_frame_t *frame, void *cookie, xlator_t *this, + int32_t op_ret, int32_t op_errno, struct stat *buf) +{ + struct quota_local *local = NULL; + + local = frame->local; + + if (op_ret >= 0) { + local->stbuf = *buf; + } + + STACK_WIND (frame, quota_rmdir_cbk, + FIRST_CHILD(this), + FIRST_CHILD(this)->fops->rmdir, + &local->loc); + + return 0; +} + + +int +quota_rmdir (call_frame_t *frame, xlator_t *this, loc_t *loc) +{ + struct quota_local *local = NULL; + struct quota_priv *priv = NULL; + + priv = this->private; + + if (priv->disk_usage_limit) { + local = CALLOC (1, sizeof (struct quota_local)); + frame->local = local; + + loc_copy (&local->loc, loc); + + STACK_WIND (frame, quota_rmdir_stat_cbk, + FIRST_CHILD(this), + FIRST_CHILD(this)->fops->stat, loc); + return 0; + } + + STACK_WIND (frame, quota_rmdir_cbk, + FIRST_CHILD(this), + FIRST_CHILD(this)->fops->rmdir, + loc); + return 0; +} + + +int +quota_symlink_cbk (call_frame_t *frame, void *cookie, xlator_t *this, + int32_t op_ret, int32_t op_errno, inode_t *inode, + struct stat *buf) +{ + struct quota_priv *priv = NULL; + + priv = this->private; + + if ((op_ret >= 0) && priv->disk_usage_limit) { + gf_quota_usage_add (this, buf->st_blocks * 512); + } + + STACK_UNWIND (frame, op_ret, op_errno, inode, buf); + return 0; +} + + +int +quota_symlink (call_frame_t *frame, xlator_t *this, + const char *linkpath, loc_t *loc) +{ + struct quota_priv *priv = NULL; + + priv = this->private; + + if (gf_quota_check_free_disk (this) == -1) { + gf_log (this->name, GF_LOG_ERROR, + "min-free-disk limit (%u) crossed, current available is %u", + priv->min_free_disk_limit, priv->current_free_disk); + STACK_UNWIND (frame, -1, ENOSPC, NULL, NULL); + return 0; + + } + if (priv->current_disk_usage > priv->disk_usage_limit) { + gf_log (this->name, GF_LOG_ERROR, + "Disk usage limit (%"PRIu64") crossed, current usage is %"PRIu64"", + priv->disk_usage_limit, priv->current_disk_usage); + STACK_UNWIND (frame, -1, ENOSPC, 
NULL, NULL); + return 0; + } + + STACK_WIND (frame, quota_symlink_cbk, + FIRST_CHILD(this), + FIRST_CHILD(this)->fops->symlink, + linkpath, loc); + return 0; +} + + +int +quota_create_cbk (call_frame_t *frame, void *cookie, xlator_t *this, + int32_t op_ret, int32_t op_errno, + fd_t *fd, inode_t *inode, struct stat *buf) +{ + struct quota_priv *priv = this->private; + int ret = 0; + + if ((op_ret >= 0) && priv->disk_usage_limit) { + gf_quota_usage_add (this, buf->st_blocks * 512); + + ret = fd_ctx_set (fd, this, 1); + } + + STACK_UNWIND (frame, op_ret, op_errno, fd, inode, buf); + return 0; +} + + +int +quota_create (call_frame_t *frame, xlator_t *this, + loc_t *loc, int32_t flags, mode_t mode, fd_t *fd) +{ + struct quota_priv *priv = NULL; + + priv = this->private; + + if (gf_quota_check_free_disk (this) == -1) { + gf_log (this->name, GF_LOG_ERROR, + "min-free-disk limit (%u) crossed, current available is %u", + priv->min_free_disk_limit, priv->current_free_disk); + STACK_UNWIND (frame, -1, ENOSPC, NULL, NULL, NULL); + return 0; + + } + if (priv->current_disk_usage > priv->disk_usage_limit) { + gf_log (this->name, GF_LOG_ERROR, + "Disk usage limit (%"PRIu64") crossed, current usage is %"PRIu64"", + priv->disk_usage_limit, priv->current_disk_usage); + STACK_UNWIND (frame, -1, ENOSPC, NULL, NULL, NULL); + return 0; + } + + STACK_WIND (frame, quota_create_cbk, + FIRST_CHILD(this), + FIRST_CHILD(this)->fops->create, + loc, flags, mode, fd); + return 0; +} + + +int +quota_open_cbk (call_frame_t *frame, void *cookie, xlator_t *this, + int32_t op_ret, int32_t op_errno, fd_t *fd) +{ + int ret = 0; + + if (op_ret >= 0) + ret = fd_ctx_set (fd, this, 1); + + STACK_UNWIND (frame, op_ret, op_errno, fd); + return 0; +} + + +int +quota_open (call_frame_t *frame, xlator_t *this, + loc_t *loc, int32_t flags, fd_t *fd) +{ + STACK_WIND (frame, quota_open_cbk, + FIRST_CHILD(this), + FIRST_CHILD(this)->fops->open, + loc, flags, fd); + return 0; +} + + +int +quota_writev_cbk 
(call_frame_t *frame, void *cookie, xlator_t *this, + int32_t op_ret, int32_t op_errno, struct stat *stbuf) +{ + struct quota_priv *priv = NULL; + struct quota_local *local = NULL; + + + priv = this->private; + local = frame->local; + + if (priv->disk_usage_limit) { + if (op_ret >= 0) { + gf_quota_usage_add (this, (stbuf->st_blocks - + local->stbuf.st_blocks) * 512); + } + fd_unref (local->fd); + dict_unref (local->refs); + } + + STACK_UNWIND (frame, op_ret, op_errno, stbuf); + return 0; +} + + +int +quota_writev_fstat_cbk (call_frame_t *frame, void *cookie, xlator_t *this, + int32_t op_ret, int32_t op_errno, struct stat *buf) +{ + struct quota_local *local = NULL; + struct quota_priv *priv = NULL; + int iovlen = 0; + + + local = frame->local; + priv = this->private; + + if (op_ret >= 0) { + if (priv->current_disk_usage > priv->disk_usage_limit) { + iovlen = iov_length (local->vector, local->count); + + if (iovlen > (buf->st_blksize - (buf->st_size % buf->st_blksize))) { + fd_unref (local->fd); + dict_unref (local->refs); + STACK_UNWIND (frame, -1, ENOSPC, NULL); + return 0; + } + } + local->stbuf = *buf; + } + + STACK_WIND (frame, quota_writev_cbk, + FIRST_CHILD(this), + FIRST_CHILD(this)->fops->writev, + local->fd, local->vector, local->count, local->offset); + + return 0; +} + + +int +quota_writev (call_frame_t *frame, xlator_t *this, fd_t *fd, + struct iovec *vector, int32_t count, off_t off) +{ + struct quota_local *local = NULL; + struct quota_priv *priv = NULL; + + priv = this->private; + + if (gf_quota_check_free_disk (this) == -1) { + gf_log (this->name, GF_LOG_ERROR, + "min-free-disk limit (%u) crossed, current available is %u", + priv->min_free_disk_limit, priv->current_free_disk); + STACK_UNWIND (frame, -1, ENOSPC, NULL); + return 0; + } + + if (priv->disk_usage_limit) { + local = CALLOC (1, sizeof (struct quota_local)); + local->fd = fd_ref (fd); + local->refs = dict_ref (frame->root->req_refs); + local->vector = vector; + local->count = count; + 
local->offset = off; + frame->local = local; + + STACK_WIND (frame, quota_writev_fstat_cbk, + FIRST_CHILD(this), + FIRST_CHILD(this)->fops->fstat, fd); + return 0; + } + + STACK_WIND (frame, quota_writev_cbk, + FIRST_CHILD(this), + FIRST_CHILD(this)->fops->writev, + fd, vector, count, off); + return 0; +} + + +int +quota_removexattr_cbk (call_frame_t *frame, void *cookie, xlator_t *this, + int32_t op_ret, int32_t op_errno) +{ + if (op_ret == -1) { + gf_log (this->name, GF_LOG_CRITICAL, + "failed to remove the disk-usage value: %s", + strerror (op_errno)); + } + + STACK_DESTROY (frame->root); + return 0; +} + + +int +quota_setxattr_cbk (call_frame_t *frame, void *cookie, xlator_t *this, + int32_t op_ret, int32_t op_errno) +{ + if (op_ret == -1) { + gf_log (this->name, GF_LOG_CRITICAL, + "failed to set the disk-usage value: %s", + strerror (op_errno)); + } + + STACK_DESTROY (frame->root); + return 0; +} + + +int +quota_statfs_cbk (call_frame_t *frame, void *cookie, xlator_t *this, + int32_t op_ret, int32_t op_errno, struct statvfs *statvfs) +{ + struct quota_priv *priv = NULL; + uint64_t f_blocks = 0; + int64_t f_bfree = 0; + uint64_t f_bused = 0; + + + priv = this->private; + + if (op_ret != 0) + goto unwind; + + f_blocks = priv->disk_usage_limit / statvfs->f_frsize; + f_bused = priv->current_disk_usage / statvfs->f_frsize; + + if (f_blocks && (f_blocks < statvfs->f_blocks)) + statvfs->f_blocks = f_blocks; + + f_bfree = (statvfs->f_blocks - f_bused); + + if (f_bfree >= 0) + statvfs->f_bfree = statvfs->f_bavail = f_bfree; + else + statvfs->f_bfree = statvfs->f_bavail = 0; + +unwind: + STACK_UNWIND (frame, op_ret, op_errno, statvfs); + return 0; +} + + +int +quota_statfs (call_frame_t *frame, xlator_t *this, loc_t *loc) +{ + STACK_WIND (frame, quota_statfs_cbk, + FIRST_CHILD (this), FIRST_CHILD (this)->fops->statfs, loc); + + return 0; +} + + +int +quota_getxattr_cbk (call_frame_t *frame, void *cookie, xlator_t *this, + int32_t op_ret, int32_t op_errno, dict_t *value) 
+{ + data_t *data = NULL; + struct quota_priv *priv = this->private; + + if (op_ret >= 0) { + data = dict_get (value, "trusted.glusterfs-quota-du"); + if (data) { + LOCK (&priv->lock); + { + priv->current_disk_usage = data_to_uint64 (data); + } + UNLOCK (&priv->lock); + + return 0; + } + } + + STACK_DESTROY (frame->root); + + return 0; +} + + +void +gf_quota_get_disk_usage (xlator_t *this) +{ + call_frame_t *frame = NULL; + call_pool_t *pool = NULL; + loc_t loc; + + pool = this->ctx->pool; + frame = create_frame (this, pool); + build_root_loc (this, &loc); + + STACK_WIND (frame, quota_getxattr_cbk, + this->children->xlator, + this->children->xlator->fops->getxattr, + &loc, + "trusted.glusterfs-quota-du"); + return ; +} + + +void +gf_quota_cache_sync (xlator_t *this) +{ + struct quota_priv *priv = NULL; + call_frame_t *frame = NULL; + dict_t *dict = get_new_dict (); + loc_t loc; + + + priv = this->private; + build_root_loc (this, &loc); + + frame = create_frame (this, this->ctx->pool); + dict_set (dict, "trusted.glusterfs-quota-du", + data_from_uint64 (priv->current_disk_usage)); + + STACK_WIND (frame, quota_setxattr_cbk, + this->children->xlator, + this->children->xlator->fops->setxattr, + &loc, dict, 0); +} + + +int +quota_release (xlator_t *this, fd_t *fd) +{ + gf_quota_cache_sync (this); + + return 0; +} + + +/* notify */ +int32_t +notify (xlator_t *this, + int32_t event, + void *data, + ...) 
+{ + struct quota_priv *priv = this->private; + + switch (event) + { + case GF_EVENT_CHILD_UP: + if (priv->only_first_time) { + priv->only_first_time = 0; + if (priv->disk_usage_limit) { + gf_quota_get_disk_usage (this); + } + } + default: + default_notify (this, event, data); + break; + } + + return 0; +} + + +int32_t +init (xlator_t *this) +{ + int ret = 0; + data_t *data = NULL; + struct quota_priv *_private = NULL; + + if (!this->children || this->children->next) { + gf_log (this->name, GF_LOG_ERROR, + "FATAL: quota should have exactly one child"); + return -1; + } + + if (!this->parents) { + gf_log (this->name, GF_LOG_WARNING, + "dangling volume. check volfile "); + } + + _private = CALLOC (1, sizeof (struct quota_priv)); + _private->disk_usage_limit = 0; + data = dict_get (this->options, "disk-usage-limit"); + if (data) { + if (gf_string2bytesize (data->data, &_private->disk_usage_limit) != 0) { + gf_log (this->name, GF_LOG_ERROR, + "invalid number '%s' for disk-usage limit", data->data); + ret = -1; + goto out; + } + + LOCK_INIT (&_private->lock); + _private->current_disk_usage = 0; + } + + _private->min_free_disk_limit = 0; + data = dict_get (this->options, "min-free-disk-limit"); + if (data) { + if (gf_string2percent (data->data, &_private->min_free_disk_limit) != 0) { + gf_log (this->name, GF_LOG_ERROR, + "invalid percent '%s' for min-free-disk limit", data->data); + ret = -1; + goto out; + } + _private->refresh_interval = 20; /* 20seconds is default */ + data = dict_get (this->options, "refresh-interval"); + if (data) { + if (gf_string2time (data->data, + &_private->refresh_interval)!= 0) { + gf_log (this->name, GF_LOG_ERROR, + "invalid time '%s' for refresh " + "interval", data->data); + ret = -1; + goto out; + } + } + } + + _private->only_first_time = 1; + this->private = (void *)_private; + ret = 0; + out: + return ret; +} + +void +fini (xlator_t *this) +{ + struct quota_priv *_private = this->private; + + if (_private) { + gf_quota_cache_sync (this); 
/*
 * Volume options understood by the quota translator, as read by init():
 *
 *   min-free-disk-limit  percentage; parsed with gf_string2percent()
 *   refresh-interval     time; parsed with gf_string2time(), defaults to
 *                        20 seconds and is only read when
 *                        min-free-disk-limit is set
 *   disk-usage-limit     size in bytes; parsed with gf_string2bytesize()
 *
 * The entry with a NULL key ends the table.
 */
struct volume_options options[] = {
        { .key  = {"min-free-disk-limit"},
          .type = GF_OPTION_TYPE_PERCENT
        },
        { .key  = {"refresh-interval"},
          .type = GF_OPTION_TYPE_TIME
        },
        { .key  = {"disk-usage-limit"},
          .type = GF_OPTION_TYPE_SIZET
        },
        { .key  = {NULL} },
};
+ + GlusterFS is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published + by the Free Software Foundation; either version 3 of the License, + or (at your option) any later version. + + GlusterFS is distributed in the hope that it will be useful, but + WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program. If not, see + <http://www.gnu.org/licenses/>. +*/ + +#ifndef _CONFIG_H +#define _CONFIG_H +#include "config.h" +#endif + + +#include "glusterfs.h" +#include "logging.h" +#include "dict.h" +#include "xlator.h" +#include "defaults.h" + +#include <libgen.h> + +/* TODO: currently it can work only above posix, no other translators + * between them. Not a good thing. Try making more reliable methods. + */ + +struct trash_struct { + inode_t *inode; + loc_t loc1; + loc_t loc2; + char origpath[ZR_PATH_MAX]; + char newpath[ZR_PATH_MAX]; + char oldpath[ZR_PATH_MAX]; // used only in case of rename +}; +typedef struct trash_struct trash_local_t; + +struct trash_priv { + char trash_dir[ZR_PATH_MAX]; +}; +typedef struct trash_priv trash_private_t; + +int32_t +trash_unlink_rename_cbk (call_frame_t *frame, + void *cookie, + xlator_t *this, + int32_t op_ret, + int32_t op_errno, + struct stat *buf); +int32_t +trash_rename_rename_cbk (call_frame_t *frame, + void *cookie, + xlator_t *this, + int32_t op_ret, + int32_t op_errno, + struct stat *buf); + +/** + * trash_common_unwind_cbk - + */ +int32_t +trash_common_unwind_cbk (call_frame_t *frame, + void *cookie, + xlator_t *this, + int32_t op_ret, + int32_t op_errno) +{ + trash_local_t *local = frame->local; + + if (local->loc1.path) + loc_wipe (&local->loc1); + + if (local->loc2.path) + loc_wipe (&local->loc2); + + STACK_UNWIND (frame, op_ret, op_errno); + return 
0; +} + +/** + * trash_common_unwind_buf_cbk - + */ +int32_t +trash_common_unwind_buf_cbk (call_frame_t *frame, + void *cookie, + xlator_t *this, + int32_t op_ret, + int32_t op_errno, + struct stat *buf) +{ + trash_local_t *local = frame->local; + + if (local->loc1.path) + loc_wipe (&local->loc1); + + if (local->loc2.path) + loc_wipe (&local->loc2); + + STACK_UNWIND (frame, op_ret, op_errno, buf); + return 0; +} + +int32_t +trash_mkdir_cbk (call_frame_t *frame, + void *cookie, + xlator_t *this, + int32_t op_ret, + int32_t op_errno, + inode_t *inode, + struct stat *stbuf) +{ + trash_local_t *local = frame->local; + char *tmp_str = strdup (local->newpath); + int32_t count = 0; + char *tmp_path = NULL; + char *tmp_dirname = NULL; + + if (op_ret == -1 && op_errno == ENOENT) { + tmp_dirname = strchr (tmp_str, '/'); + while (tmp_dirname) { + count = tmp_dirname - tmp_str; + if (count == 0) + count = 1; + tmp_path = CALLOC (1, count + 1); + ERR_ABORT (tmp_path); + memcpy (tmp_path, local->newpath, count); + loc_t tmp_loc = { + .inode = NULL, + .path = tmp_path, + }; + + /* TODO:create the directory with proper permissions */ + STACK_WIND_COOKIE (frame, + trash_mkdir_cbk, + tmp_path, + this->children->xlator, + this->children->xlator->fops->mkdir, + &tmp_loc, + 0777); + tmp_dirname = strchr (tmp_str + count + 1, '/'); + } + free (cookie); + free (tmp_str); + return 0; + } + char *dir_name = dirname (tmp_str); + if (strcmp((char*)cookie, dir_name) == 0) { + loc_t new_loc = { + .inode = NULL, + .path = local->newpath + }; + STACK_WIND (frame, + trash_unlink_rename_cbk, + this->children->xlator, + this->children->xlator->fops->rename, + &local->loc2, + &new_loc); + + } + free (cookie); /* strdup (dir_name) was sent here :) */ + free (tmp_str); + return 0; +} + +/** + * trash_unlink_rename_cbk - + */ +int32_t +trash_unlink_rename_cbk (call_frame_t *frame, + void *cookie, + xlator_t *this, + int32_t op_ret, + int32_t op_errno, + struct stat *buf) +{ + trash_local_t *local = 
frame->local; + if (op_ret == -1 && op_errno == ENOENT) { + /* check for the errno, if its ENOENT create directory and call + * rename later + */ + char *tmp_str = strdup (local->newpath); + char *dir_name = dirname (tmp_str); + loc_t tmp_loc = { + .inode = NULL, + .path = dir_name, + }; + /* TODO: create the directory with proper permissions */ + STACK_WIND_COOKIE (frame, + trash_mkdir_cbk, + strdup (dir_name), + this->children->xlator, + this->children->xlator->fops->mkdir, + &tmp_loc, + 0777); + free (tmp_str); + } else if (op_ret == -1 && op_errno == ENOTDIR) { + gf_log (this->name, GF_LOG_WARNING, + "Target exists, cannot keep the copy, deleting"); + STACK_WIND (frame, + trash_common_unwind_cbk, + this->children->xlator, + this->children->xlator->fops->unlink, + &local->loc2); + } else if (op_ret == -1 && op_errno == EISDIR) { + gf_log (this->name, GF_LOG_WARNING, + "Target exists as a directory, cannot keep the copy, " + "deleting"); + STACK_WIND (frame, + trash_common_unwind_cbk, + this->children->xlator, + this->children->xlator->fops->unlink, + &local->loc2); + } else { + /* */ + STACK_UNWIND (frame, 0, op_errno); + } + + return 0; +} + + +/** + * trash_unlink - + */ +int32_t +trash_unlink (call_frame_t *frame, + xlator_t *this, + loc_t *loc) +{ + trash_private_t *priv = this->private; + trash_local_t *local = NULL; + time_t utime = 0; + struct tm *tm = NULL; + char timestr[256]; + + if (strncmp (loc->path, priv->trash_dir, + strlen(priv->trash_dir)) == 0) { + /* Trying to rename from the trash can dir, do the + actual unlink */ + STACK_WIND (frame, + trash_common_unwind_cbk, + this->children->xlator, + this->children->xlator->fops->unlink, + loc); + } else { + local = CALLOC (1, sizeof (trash_local_t)); + if (!local) { + STACK_UNWIND (frame, -1, ENOMEM); + return 0; + } + frame->local = local; + + loc_copy (&local->loc2, loc); + + strcpy (local->newpath, priv->trash_dir); + strcat (local->newpath, loc->path); + + utime = time (NULL); + tm = localtime 
(&utime); + strftime (timestr, 256, ".%Y%m%d%H%M%S", tm); + strcat (local->newpath, timestr); + + { + loc_t new_loc = { + .inode = NULL, + .path = local->newpath + }; + STACK_WIND (frame, + trash_unlink_rename_cbk, + this->children->xlator, + this->children->xlator->fops->rename, + loc, + &new_loc); + } + } + return 0; +} + +/* */ +int32_t +trash_rename_mkdir_cbk (call_frame_t *frame, + void *cookie, + xlator_t *this, + int32_t op_ret, + int32_t op_errno, + inode_t *inode, + struct stat *stbuf) +{ + trash_local_t *local = frame->local; + char *tmp_str = strdup (local->newpath); + + if (op_ret == -1 && op_errno == ENOENT) { + int32_t count = 0; + char *tmp_path = NULL; + char *tmp_dirname = strchr (tmp_str, '/'); + + while (tmp_dirname) { + count = tmp_dirname - tmp_str; + if (count == 0) + count = 1; + tmp_path = CALLOC (1, count + 2); + ERR_ABORT (tmp_path); + memcpy (tmp_path, local->newpath, count); + loc_t tmp_loc = { + .inode = NULL, + .path = tmp_path, + }; + + /* TODO:create the directory with proper permissions */ + STACK_WIND_COOKIE (frame, + trash_rename_mkdir_cbk, + tmp_path, + this->children->xlator, + this->children->xlator->fops->mkdir, + &tmp_loc, + 0777); + tmp_dirname = strchr (tmp_str + count + 1, '/'); + } + free (cookie); + free (tmp_str); + return 0; + } + char *dir_name = dirname (tmp_str); + if (strcmp((char*)cookie, dir_name) == 0) { + loc_t new_loc = { + .inode = NULL, + .path = local->newpath + }; + STACK_WIND (frame, + trash_rename_rename_cbk, + this->children->xlator, + this->children->xlator->fops->rename, + &local->loc2, + &new_loc); + + } + free (cookie); /* strdup (dir_name) was sent here :) */ + free (tmp_str); + return 0; +} + + +/** + * trash_unlink_rename_cbk - + */ +int32_t +trash_rename_rename_cbk (call_frame_t *frame, + void *cookie, + xlator_t *this, + int32_t op_ret, + int32_t op_errno, + struct stat *buf) +{ + trash_local_t *local = frame->local; + if (op_ret == -1 && op_errno == ENOENT) { + /* check for the errno, if its 
ENOENT create directory and call + * rename later + */ + char *tmp_str = strdup (local->newpath); + char *dir_name = dirname (tmp_str); + loc_t tmp_loc = { + .inode = NULL, + .path = dir_name, + }; + /* TODO: create the directory with proper permissions */ + STACK_WIND_COOKIE (frame, + trash_rename_mkdir_cbk, + strdup (dir_name), + this->children->xlator, + this->children->xlator->fops->mkdir, + &tmp_loc, + 0777); + free (tmp_str); + return 0; + } else if (op_ret == -1 && op_errno == ENOTDIR) { + gf_log (this->name, GF_LOG_WARNING, + "Target exists, cannot keep the dest entry %s, " + "renaming", + local->loc2.path); + } else if (op_ret == -1 && op_errno == EISDIR) { + gf_log (this->name, GF_LOG_WARNING, + "Target exists as a directory, cannot keep the " + "copy %s, renaming", + local->loc2.path); + } + loc_t new_loc = { + .inode = NULL, + .parent = local->loc2.parent, + .path = local->loc2.path, + }; + STACK_WIND (frame, + trash_common_unwind_buf_cbk, + this->children->xlator, + this->children->xlator->fops->rename, + &local->loc1, + &new_loc); + + return 0; +} + +/** + * trash_rename_lookup_cbk - + */ +int32_t +trash_rename_lookup_cbk (call_frame_t *frame, + void *cookie, + xlator_t *this, + int32_t op_ret, + int32_t op_errno, + inode_t *inode, + struct stat *buf, + dict_t *xattr) +{ + trash_local_t *local = frame->local; + + if (op_ret == -1) { + STACK_WIND (frame, + trash_common_unwind_buf_cbk, + this->children->xlator, + this->children->xlator->fops->rename, + &local->loc1, + &local->loc2); + return 0; + } + + loc_t oldloc = { + .parent = local->loc2.parent, + .inode = inode, + .path = local->loc2.path, + }; + loc_t newloc = { + .inode = NULL, + .path = local->newpath + }; + STACK_WIND (frame, + trash_rename_rename_cbk, + this->children->xlator, + this->children->xlator->fops->rename, + &oldloc, + &newloc); + + return 0; +} + + +/** + * trash_rename - + */ +int32_t +trash_rename (call_frame_t *frame, + xlator_t *this, + loc_t *oldloc, + loc_t *newloc) +{ + 
trash_private_t *priv = this->private; + trash_local_t *local = NULL; + time_t utime = 0; + struct tm *tm = NULL; + char timestr[256]; + + if (strncmp (oldloc->path, priv->trash_dir, + strlen(priv->trash_dir)) == 0) { + /* Trying to rename from the trash can dir, + do the actual rename */ + STACK_WIND (frame, + trash_common_unwind_buf_cbk, + this->children->xlator, + this->children->xlator->fops->rename, + oldloc, + newloc); + } else { + /* Trying to rename a regular file from GlusterFS */ + local = CALLOC (1, sizeof (trash_local_t)); + if (!local) { + STACK_UNWIND (frame, -1, ENOMEM, NULL); + return 0; + } + frame->local = local; + loc_copy (&local->loc1, oldloc); + loc_copy (&local->loc2, newloc); + + strcpy (local->newpath, priv->trash_dir); + strcat (local->newpath, newloc->path); + + utime = time (NULL); + tm = localtime (&utime); + strftime (timestr, 256, ".%Y%m%d%H%M%S", tm); + strcat (local->newpath, timestr); + + /* Send a lookup call on newloc, to ensure we are not + overwriting */ + STACK_WIND (frame, + trash_rename_lookup_cbk, + this->children->xlator, + this->children->xlator->fops->lookup, + newloc, + 0); + } + return 0; +} + +/** + * trash_init - + */ +int32_t +init (xlator_t *this) +{ + data_t *trash_dir = NULL; + xlator_list_t *trav = NULL; + trash_private_t *_priv = NULL; + + /* Create .trashcan directory in init */ + if (!this->children || this->children->next) { + gf_log (this->name, GF_LOG_ERROR, + "not configured with exactly one child. exiting"); + return -1; + } + + if (!this->parents) { + gf_log (this->name, GF_LOG_WARNING, + "dangling volume. 
check volfile "); + } + + trav = this->children; + while (trav->xlator->children) + trav = trav->xlator->children; + + if (strncmp ("storage/", trav->xlator->type, 8)) + { + gf_log (this->name, GF_LOG_ERROR, + "'trash' translator not loaded over storage " + "translator, not a supported setup"); + return -1; + } + + _priv = CALLOC (1, sizeof (*_priv)); + ERR_ABORT (_priv); + + trash_dir = dict_get (this->options, "trash-dir"); + if (!trash_dir) { + gf_log (this->name, GF_LOG_WARNING, + "no option specified for 'trash-dir', " + "using \"/.trashcan/\""); + strcpy (_priv->trash_dir, "/.trashcan"); + } else { + /* Need a path with '/' as the first char, if not + given, append it */ + if (trash_dir->data[0] == '/') { + strcpy (_priv->trash_dir, trash_dir->data); + } else { + strcpy (_priv->trash_dir, "/"); + strcat (_priv->trash_dir, trash_dir->data); + } + } + + this->private = (void *)_priv; + return 0; +} + +void +fini (xlator_t *this) +{ + trash_private_t *priv = this->private; + FREE (priv); + return; +} + + +struct xlator_fops fops = { + .unlink = trash_unlink, + .rename = trash_rename, +}; + +struct xlator_mops mops = { + +}; + +struct xlator_cbks cbks = { +}; + +struct volume_options options[] = { + { .key = { "trash-dir" }, + .type = GF_OPTION_TYPE_PATH + }, + { .key = {NULL} }, +}; diff --git a/xlators/meta/Makefile.am b/xlators/meta/Makefile.am new file mode 100644 index 000000000..e1c45f305 --- /dev/null +++ b/xlators/meta/Makefile.am @@ -0,0 +1 @@ +SUBDIRS=src \ No newline at end of file diff --git a/xlators/meta/src/Makefile.am b/xlators/meta/src/Makefile.am new file mode 100644 index 000000000..385ff553f --- /dev/null +++ b/xlators/meta/src/Makefile.am @@ -0,0 +1,10 @@ +xlator_PROGRAMS = meta.so +xlatordir = $(libdir)/glusterfs/$(PACKAGE_VERSION)/xlator/ + +meta_so_SOURCES = meta.c tree.c misc.c view.c +noinst_HEADERS = meta.h tree.h misc.h view.h + +AM_CFLAGS = -fPIC -D_FILE_OFFSET_BITS=64 -D_GNU_SOURCE -Wall \ + -I$(top_srcdir)/libglusterfs/src -shared 
-nostartfiles + +CLEANFILES = diff --git a/xlators/meta/src/meta.c b/xlators/meta/src/meta.c new file mode 100644 index 000000000..ce49ed2c4 --- /dev/null +++ b/xlators/meta/src/meta.c @@ -0,0 +1,1285 @@ +/* + Copyright (c) 2006, 2007, 2008 Z RESEARCH, Inc. <http://www.zresearch.com> + This file is part of GlusterFS. + + GlusterFS is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published + by the Free Software Foundation; either version 3 of the License, + or (at your option) any later version. + + GlusterFS is distributed in the hope that it will be useful, but + WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program. If not, see + <http://www.gnu.org/licenses/>. +*/ + +#include <sys/types.h> +#include <sys/stat.h> +#include <unistd.h> + +#ifndef _CONFIG_H +#define _CONFIG_H +#include "config.h" +#endif + +#include "glusterfs.h" +#include "dict.h" +#include "xlator.h" + +#include "meta.h" +#include "view.h" + +int32_t +meta_getattr_cbk (call_frame_t *frame, + void *cookie, + xlator_t *this, + int32_t op_ret, + int32_t op_errno, + struct stat *buf) +{ + STACK_UNWIND (frame, op_ret, op_errno, buf); + return 0; +} + +int32_t +meta_getattr (call_frame_t *frame, + xlator_t *this, + const char *path) +{ + meta_private_t *priv = (meta_private_t *) this->private; + meta_dirent_t *root = priv->tree; + meta_dirent_t *file = lookup_meta_entry (root, path, NULL); + + if (file) { + if (file->fops && file->fops->getattr) { + STACK_WIND (frame, meta_getattr_cbk, + this, file->fops->getattr, path); + return 0; + } + else { + STACK_UNWIND (frame, 0, 0, file->stbuf); + return 0; + } + } + else { + STACK_WIND (frame, meta_getattr_cbk, + FIRST_CHILD(this), + FIRST_CHILD(this)->fops->getattr, + path); + return 0; 
+ } +} + +int32_t +meta_chmod_cbk (call_frame_t *frame, + void *cookie, + xlator_t *this, + int32_t op_ret, + int32_t op_errno, + struct stat *buf) +{ + STACK_UNWIND (frame, + op_ret, + op_errno, + buf); + return 0; +} + +int32_t +meta_chmod (call_frame_t *frame, + xlator_t *this, + const char *path, + mode_t mode) +{ + STACK_WIND (frame, + meta_chmod_cbk, + FIRST_CHILD(this), + FIRST_CHILD(this)->fops->chmod, + path, + mode); + return 0; +} + +int32_t +meta_chown_cbk (call_frame_t *frame, + void *cookie, + xlator_t *this, + int32_t op_ret, + int32_t op_errno, + struct stat *buf) +{ + STACK_UNWIND (frame, + op_ret, + op_errno, + buf); + return 0; +} + +int32_t +meta_chown (call_frame_t *frame, + xlator_t *this, + const char *path, + uid_t uid, + gid_t gid) +{ + STACK_WIND (frame, + meta_chown_cbk, + FIRST_CHILD(this), + FIRST_CHILD(this)->fops->chown, + path, + uid, + gid); + return 0; +} + + +int32_t +meta_truncate_cbk (call_frame_t *frame, + void *cookie, + xlator_t *this, + int32_t op_ret, + int32_t op_errno, + struct stat *buf) +{ + STACK_UNWIND (frame, + op_ret, + op_errno, + buf); + return 0; +} + +int32_t +meta_truncate (call_frame_t *frame, + xlator_t *this, + const char *path, + off_t offset) +{ + STACK_WIND (frame, + meta_truncate_cbk, + FIRST_CHILD(this), + FIRST_CHILD(this)->fops->truncate, + path, + offset); + return 0; +} + + +int32_t +meta_ftruncate_cbk (call_frame_t *frame, + void *cookie, + xlator_t *this, + int32_t op_ret, + int32_t op_errno, + struct stat *buf) +{ + STACK_UNWIND (frame, + op_ret, + op_errno, + buf); + return 0; +} + +int32_t +meta_ftruncate (call_frame_t *frame, + xlator_t *this, + dict_t *fd, + off_t offset) +{ + STACK_WIND (frame, + meta_ftruncate_cbk, + FIRST_CHILD(this), + FIRST_CHILD(this)->fops->ftruncate, + fd, + offset); + return 0; +} + + +int32_t +meta_utimes_cbk (call_frame_t *frame, + void *cookie, + xlator_t *this, + int32_t op_ret, + int32_t op_errno, + struct stat *buf) +{ + STACK_UNWIND (frame, + op_ret, + 
op_errno, + buf); + return 0; +} + +int32_t +meta_utimes (call_frame_t *frame, + xlator_t *this, + const char *path, + struct timespec *buf) +{ + STACK_WIND (frame, + meta_utimes_cbk, + FIRST_CHILD(this), + FIRST_CHILD(this)->fops->utimes, + path, + buf); + return 0; +} + + +int32_t +meta_access_cbk (call_frame_t *frame, + void *cookie, + xlator_t *this, + int32_t op_ret, + int32_t op_errno) +{ + STACK_UNWIND (frame, + op_ret, + op_errno); + return 0; +} + +int32_t +meta_access (call_frame_t *frame, + xlator_t *this, + const char *path, + mode_t mode) +{ + STACK_WIND (frame, + meta_access_cbk, + FIRST_CHILD(this), + FIRST_CHILD(this)->fops->access, + path, + mode); + return 0; +} + +int32_t +meta_readlink_cbk (call_frame_t *frame, + void *cookie, + xlator_t *this, + int32_t op_ret, + int32_t op_errno, + char *dest) +{ + STACK_UNWIND (frame, + op_ret, + op_errno, + dest); + return 0; +} + +int32_t +meta_readlink (call_frame_t *frame, + xlator_t *this, + const char *path, + size_t size) +{ + STACK_WIND (frame, + meta_readlink_cbk, + FIRST_CHILD(this), + FIRST_CHILD(this)->fops->readlink, + path, + size); + return 0; +} + +int32_t +meta_mknod_cbk (call_frame_t *frame, + void *cookie, + xlator_t *this, + int32_t op_ret, + int32_t op_errno, + struct stat *buf) +{ + STACK_UNWIND (frame, + op_ret, + op_errno, + buf); + return 0; +} + +int32_t +meta_mknod (call_frame_t *frame, + xlator_t *this, + const char *path, + mode_t mode, + dev_t dev) +{ + STACK_WIND (frame, + meta_mknod_cbk, + FIRST_CHILD(this), + FIRST_CHILD(this)->fops->mknod, + path, + mode, + dev); + return 0; +} + +int32_t +meta_mkdir_cbk (call_frame_t *frame, + void *cookie, + xlator_t *this, + int32_t op_ret, + int32_t op_errno, + struct stat *buf) +{ + STACK_UNWIND (frame, + op_ret, + op_errno, + buf); + return 0; +} + +int32_t +meta_mkdir (call_frame_t *frame, + xlator_t *this, + const char *path, + mode_t mode) +{ + STACK_WIND (frame, + meta_mkdir_cbk, + FIRST_CHILD(this), + 
FIRST_CHILD(this)->fops->mkdir, + path, + mode); + return 0; +} + +int32_t +meta_unlink_cbk (call_frame_t *frame, + void *cookie, + xlator_t *this, + int32_t op_ret, + int32_t op_errno) +{ + STACK_UNWIND (frame, + op_ret, + op_errno); + return 0; +} + +int32_t +meta_unlink (call_frame_t *frame, + xlator_t *this, + const char *path) +{ + STACK_WIND (frame, + meta_unlink_cbk, + FIRST_CHILD(this), + FIRST_CHILD(this)->fops->unlink, + path); + return 0; +} + +int32_t +meta_rmdir_cbk (call_frame_t *frame, + void *cookie, + xlator_t *this, + int32_t op_ret, + int32_t op_errno) +{ + STACK_UNWIND (frame, + op_ret, + op_errno); + return 0; +} + +int32_t +meta_rmdir (call_frame_t *frame, + xlator_t *this, + const char *path) +{ + STACK_WIND (frame, + meta_rmdir_cbk, + FIRST_CHILD(this), + FIRST_CHILD(this)->fops->rmdir, + path); + return 0; +} + +int32_t +meta_symlink_cbk (call_frame_t *frame, + void *cookie, + xlator_t *this, + int32_t op_ret, + int32_t op_errno, + struct stat *buf) +{ + STACK_UNWIND (frame, + op_ret, + op_errno, + buf); + return 0; +} + +int32_t +meta_symlink (call_frame_t *frame, + xlator_t *this, + const char *oldpath, + const char *newpath) +{ + STACK_WIND (frame, + meta_symlink_cbk, + FIRST_CHILD(this), + FIRST_CHILD(this)->fops->symlink, + oldpath, + newpath); + return 0; +} + +int32_t +meta_rename_cbk (call_frame_t *frame, + void *cookie, + xlator_t *this, + int32_t op_ret, + int32_t op_errno) +{ + STACK_UNWIND (frame, + op_ret, + op_errno); + return 0; +} + +int32_t +meta_rename (call_frame_t *frame, + xlator_t *this, + const char *oldpath, + const char *newpath) +{ + STACK_WIND (frame, + meta_rename_cbk, + FIRST_CHILD(this), + FIRST_CHILD(this)->fops->rename, + oldpath, + newpath); + return 0; +} + +int32_t +meta_link_cbk (call_frame_t *frame, + void *cookie, + xlator_t *this, + int32_t op_ret, + int32_t op_errno, + struct stat *buf) +{ + STACK_UNWIND (frame, + op_ret, + op_errno, + buf); + return 0; +} + +int32_t +meta_link (call_frame_t *frame, + 
xlator_t *this, + const char *oldpath, + const char *newpath) +{ + STACK_WIND (frame, + meta_link_cbk, + FIRST_CHILD(this), + FIRST_CHILD(this)->fops->link, + oldpath, + newpath); + return 0; +} + +struct _open_local { + const char *path; +}; + +int32_t +meta_open_cbk (call_frame_t *frame, void *cookie, + xlator_t *this, int32_t op_ret, int32_t op_errno, + dict_t *ctx, struct stat *buf) +{ + struct _open_local *local = frame->local; + if (local) + dict_set (ctx, this->name, str_to_data (local->path)); + STACK_UNWIND (frame, op_ret, op_errno, ctx, buf); + return 0; +} + +int32_t +meta_open (call_frame_t *frame, xlator_t *this, + const char *path, int32_t flags, mode_t mode) +{ + meta_private_t *priv = (meta_private_t *) this->private; + meta_dirent_t *root = priv->tree; + meta_dirent_t *file = lookup_meta_entry (root, path, NULL); + + if (file) { + if (file->fops && file->fops->open) { + struct _open_local *local = CALLOC (1, sizeof (struct _open_local)); + ERR_ABORT (local); + local->path = strdup (path); + frame->local = local; + STACK_WIND (frame, meta_open_cbk, + this, file->fops->open, + path, flags, mode); + return 0; + } + else { + dict_t *ctx = get_new_dict (); + dict_ref (ctx); + dict_set (ctx, this->name, str_to_data (strdup (path))); + STACK_UNWIND (frame, 0, 0, ctx, file->stbuf); + return 0; + } + } + else { + STACK_WIND (frame, meta_open_cbk, + FIRST_CHILD(this), FIRST_CHILD(this)->fops->open, + path, flags, mode); + return 0; + } +} + +int32_t +meta_create (call_frame_t *frame, xlator_t *this, + const char *path, int32_t flags, mode_t mode) +{ + meta_private_t *priv = (meta_private_t *) this->private; + meta_dirent_t *root = priv->tree; + meta_dirent_t *file = lookup_meta_entry (root, path, NULL); + + if (file) { + if (file->fops && file->fops->create) { + struct _open_local *local = CALLOC (1, sizeof (struct _open_local)); + ERR_ABORT (local); + local->path = strdup (path); + frame->local = local; + STACK_WIND (frame, meta_open_cbk, + this, 
file->fops->create, + path, flags, mode); + return 0; + } + else { + STACK_UNWIND (frame, -1, 0, NULL, NULL); + return 0; + } + } + else { + STACK_WIND (frame, meta_open_cbk, + FIRST_CHILD(this), FIRST_CHILD(this)->fops->create, + path, flags, mode); + return 0; + } +} + +int32_t +meta_readv_cbk (call_frame_t *frame, + void *cookie, + xlator_t *this, + int32_t op_ret, + int32_t op_errno, + struct iovec *vector, + int32_t count) +{ + STACK_UNWIND (frame, + op_ret, + op_errno, + vector, + count); + return 0; +} + +int32_t +meta_readv (call_frame_t *frame, + xlator_t *this, + dict_t *fd, + size_t size, + off_t offset) +{ + meta_private_t *priv = (meta_private_t *) this->private; + meta_dirent_t *root = priv->tree; + data_t *path_data = dict_get (fd, this->name); + + if (path_data) { + const char *path = data_to_str (path_data); + meta_dirent_t *file = lookup_meta_entry (root, path, NULL); + + if (file && file->fops && file->fops->readv) { + STACK_WIND (frame, meta_readv_cbk, + this, file->fops->readv, + fd, size, offset); + return 0; + } + } + else { + STACK_WIND (frame, meta_readv_cbk, + FIRST_CHILD(this), FIRST_CHILD(this)->fops->readv, + fd, size, offset); + return 0; + } +} + +int32_t +meta_writev_cbk (call_frame_t *frame, void *cookie, + xlator_t *this, int32_t op_ret, + int32_t op_errno) +{ + STACK_UNWIND (frame, op_ret, op_errno); + return 0; +} + +int32_t +meta_writev (call_frame_t *frame, xlator_t *this, + dict_t *fd, + struct iovec *vector, int32_t count, off_t offset) +{ + meta_private_t *priv = (meta_private_t *) this->private; + meta_dirent_t *root = priv->tree; + data_t *path_data = dict_get (fd, this->name); + + if (path_data) { + const char *path = data_to_str (path_data); + meta_dirent_t *file = lookup_meta_entry (root, path, NULL); + + if (file && file->fops && file->fops->writev) { + STACK_WIND (frame, meta_writev_cbk, + this, file->fops->writev, + fd, vector, count, offset); + return 0; + } + } + else { + STACK_WIND (frame, meta_readv_cbk, + 
FIRST_CHILD(this), FIRST_CHILD(this)->fops->writev, + fd, vector, count, offset); + return 0; + } +} + +int32_t +meta_flush_cbk (call_frame_t *frame, + void *cookie, + xlator_t *this, + int32_t op_ret, + int32_t op_errno) +{ + STACK_UNWIND (frame, + op_ret, + op_errno); + return 0; +} + +int32_t +meta_flush (call_frame_t *frame, + xlator_t *this, + dict_t *fd) +{ + meta_private_t *priv = (meta_private_t *) this->private; + meta_dirent_t *root = priv->tree; + data_t *path_data = dict_get (fd, this->name); + + if (path_data) { + const char *path = data_to_str (path_data); + meta_dirent_t *file = lookup_meta_entry (root, path, NULL); + + if (file) { + if (file->fops && file->fops->flush) { + STACK_WIND (frame, meta_flush_cbk, + this, file->fops->flush, + fd); + return 0; + } + else { + STACK_UNWIND (frame, 0, 0); + return 0; + } + } + } + else { + STACK_WIND (frame, meta_flush_cbk, + FIRST_CHILD(this), FIRST_CHILD(this)->fops->flush, + fd); + return 0; + } +} + +int32_t +meta_release_cbk (call_frame_t *frame, + void *cookie, + xlator_t *this, + int32_t op_ret, + int32_t op_errno) +{ + STACK_UNWIND (frame, + op_ret, + op_errno); + return 0; +} + +int32_t +meta_release (call_frame_t *frame, + xlator_t *this, + dict_t *fd) +{ + meta_private_t *priv = (meta_private_t *) this->private; + meta_dirent_t *root = priv->tree; + data_t *path_data = dict_get (fd, this->name); + + if (path_data) { + const char *path = data_to_str (path_data); + meta_dirent_t *file = lookup_meta_entry (root, path, NULL); + + if (file) { + dict_unref (fd); + STACK_UNWIND (frame, 0, 0); + return 0; + } + } + else { + STACK_WIND (frame, meta_release_cbk, + FIRST_CHILD(this), FIRST_CHILD(this)->fops->release, + fd); + return 0; + } +} + +int32_t +meta_fsync_cbk (call_frame_t *frame, + void *cookie, + xlator_t *this, + int32_t op_ret, + int32_t op_errno) +{ + STACK_UNWIND (frame, + op_ret, + op_errno); + return 0; +} + +int32_t +meta_fsync (call_frame_t *frame, + xlator_t *this, + dict_t *fd, + int32_t 
flags) +{ + STACK_WIND (frame, + meta_fsync_cbk, + FIRST_CHILD(this), + FIRST_CHILD(this)->fops->fsync, + fd, + flags); + return 0; +} + +int32_t +meta_fgetattr_cbk (call_frame_t *frame, + void *cookie, + xlator_t *this, + int32_t op_ret, + int32_t op_errno, + struct stat *buf) +{ + STACK_UNWIND (frame, + op_ret, + op_errno, + buf); + return 0; +} + +int32_t +meta_fgetattr (call_frame_t *frame, + xlator_t *this, + dict_t *fd) +{ + STACK_WIND (frame, + meta_fgetattr_cbk, + FIRST_CHILD(this), + FIRST_CHILD(this)->fops->fgetattr, + fd); + return 0; +} + +int32_t +meta_opendir_cbk (call_frame_t *frame, + void *cookie, + xlator_t *this, + int32_t op_ret, + int32_t op_errno, + dict_t *fd) +{ + STACK_UNWIND (frame, + op_ret, + op_errno, + fd); + return 0; +} + +int32_t +meta_opendir (call_frame_t *frame, + xlator_t *this, + const char *path) +{ + meta_private_t *priv = (meta_private_t *) this->private; + meta_dirent_t *root = priv->tree; + meta_dirent_t *dir = lookup_meta_entry (root, path, NULL); + + if (dir) { + dict_t *ctx = get_new_dict (); + dict_set (ctx, this->name, str_to_data (strdup (path))); + STACK_UNWIND (frame, 0, 0, ctx); + return 0; + } + else { + STACK_WIND (frame, meta_opendir_cbk, + FIRST_CHILD(this), FIRST_CHILD(this)->fops->opendir, + path); + return 0; + } +} + +int32_t +meta_readdir_cbk (call_frame_t *frame, + void *cookie, + xlator_t *this, + int32_t op_ret, + int32_t op_errno, + dir_entry_t *entries, + int32_t count) +{ + meta_private_t *priv = (meta_private_t *)this->private; + + if ((int) cookie == 1) { + dir_entry_t *dir = CALLOC (1, sizeof (dir_entry_t)); + ERR_ABORT (dir); + + dir->name = strdup (".meta"); + memcpy (&dir->buf, priv->tree->stbuf, sizeof (struct stat)); + dir->next = entries->next; + entries->next = dir; + + STACK_UNWIND (frame, op_ret, op_errno, entries, count+1); + return 0; + } + + STACK_UNWIND (frame, op_ret, op_errno, entries, count); + return 0; +} + +int32_t +meta_readdir (call_frame_t *frame, + xlator_t *this, + const 
char *path) +{ + meta_private_t *priv = (meta_private_t *) this->private; + meta_dirent_t *root = priv->tree; + + meta_dirent_t *dir = lookup_meta_entry (root, path, NULL); + if (dir) { + if (dir->fops && dir->fops->readdir) { + STACK_WIND (frame, meta_readdir_cbk, + this, dir->fops->readdir, path); + return 0; + } + else { + int count = 0; + dir = dir->children; + dir_entry_t *entries = NULL; + + while (dir) { + dir_entry_t *d = CALLOC (1, sizeof (dir_entry_t)); + ERR_ABORT (d); + d->name = dir->name; + d->buf = *dir->stbuf; + d->next = entries; + entries = d; + count++; + dir = dir->next; + } + + dir_entry_t *header = CALLOC (1, sizeof (dir_entry_t)); + ERR_ABORT (header); + header->next = entries; + STACK_UNWIND (frame, 0, 0, header, count); + return 0; + } + } + else { + if (!strcmp (path, "/")) { + STACK_WIND_COOKIE (frame, meta_readdir_cbk, + (int) 1, /* cookie to tell _cbk to add .meta entry */ + FIRST_CHILD(this), FIRST_CHILD(this)->fops->readdir, + path); + } + else { + STACK_WIND (frame, meta_readdir_cbk, + FIRST_CHILD(this), FIRST_CHILD(this)->fops->readdir, + path); + } + } + return 0; +} + +int32_t +meta_releasedir_cbk (call_frame_t *frame, + void *cookie, + xlator_t *this, + int32_t op_ret, + int32_t op_errno) +{ + STACK_UNWIND (frame, + op_ret, + op_errno); + return 0; +} + +int32_t +meta_releasedir (call_frame_t *frame, + xlator_t *this, + dict_t *fd) +{ + STACK_WIND (frame, + meta_releasedir_cbk, + FIRST_CHILD(this), + FIRST_CHILD(this)->fops->releasedir, + fd); + return 0; +} + +int32_t +meta_fsyncdir_cbk (call_frame_t *frame, + void *cookie, + xlator_t *this, + int32_t op_ret, + int32_t op_errno) +{ + STACK_UNWIND (frame, + op_ret, + op_errno); + return 0; +} + +int32_t +meta_fsyncdir (call_frame_t *frame, + xlator_t *this, + dict_t *fd, + int32_t flags) +{ + STACK_WIND (frame, + meta_fsyncdir_cbk, + FIRST_CHILD(this), + FIRST_CHILD(this)->fops->fsyncdir, + fd, + flags); + return 0; +} + +int32_t +meta_statfs_cbk (call_frame_t *frame, + void 
*cookie, + xlator_t *this, + int32_t op_ret, + int32_t op_errno, + struct statvfs *buf) +{ + STACK_UNWIND (frame, + op_ret, + op_errno, + buf); + return 0; +} + +int32_t +meta_statfs (call_frame_t *frame, + xlator_t *this, + const char *path) +{ + STACK_WIND (frame, + meta_statfs_cbk, + FIRST_CHILD(this), + FIRST_CHILD(this)->fops->statfs, + path); + return 0; +} + +int32_t +meta_setxattr_cbk (call_frame_t *frame, + void *cookie, + xlator_t *this, + int32_t op_ret, + int32_t op_errno) +{ + STACK_UNWIND (frame, + op_ret, + op_errno); + return 0; +} + +int32_t +meta_setxattr (call_frame_t *frame, + xlator_t *this, + const char *path, + const char *name, + const char *value, + size_t size, + int32_t flags) +{ + STACK_WIND (frame, + meta_setxattr_cbk, + FIRST_CHILD(this), + FIRST_CHILD(this)->fops->setxattr, + path, + name, + value, + size, + flags); + return 0; +} + +int32_t +meta_getxattr_cbk (call_frame_t *frame, + void *cookie, + xlator_t *this, + int32_t op_ret, + int32_t op_errno, + char *value) +{ + STACK_UNWIND (frame, + op_ret, + op_errno, + value); + return 0; +} + +int32_t +meta_getxattr (call_frame_t *frame, + xlator_t *this, + const char *path, + const char *name, + size_t size) +{ + STACK_WIND (frame, + meta_getxattr_cbk, + FIRST_CHILD(this), + FIRST_CHILD(this)->fops->getxattr, + path, + name, + size); + return 0; +} + +int32_t +meta_listxattr_cbk (call_frame_t *frame, + void *cookie, + xlator_t *this, + int32_t op_ret, + int32_t op_errno, + char *value) +{ + STACK_UNWIND (frame, + op_ret, + op_errno, + value); + return 0; +} + +int32_t +meta_listxattr (call_frame_t *frame, + xlator_t *this, + const char *path, + size_t size) +{ + STACK_WIND (frame, + meta_listxattr_cbk, + FIRST_CHILD(this), + FIRST_CHILD(this)->fops->listxattr, + path, + size); + return 0; +} + +int32_t +meta_removexattr_cbk (call_frame_t *frame, + void *cookie, + xlator_t *this, + int32_t op_ret, + int32_t op_errno) +{ + STACK_UNWIND (frame, + op_ret, + op_errno); + return 0; +} + 
+int32_t +meta_removexattr (call_frame_t *frame, + xlator_t *this, + const char *path, + const char *name) +{ + STACK_WIND (frame, + meta_removexattr_cbk, + FIRST_CHILD(this), + FIRST_CHILD(this)->fops->removexattr, + path, + name); + return 0; +} + +int32_t +meta_lk_cbk (call_frame_t *frame, + void *cookie, + xlator_t *this, + int32_t op_ret, + int32_t op_errno, + struct flock *lock) +{ + STACK_UNWIND (frame, + op_ret, + op_errno, + lock); + return 0; +} + +int32_t +meta_lk (call_frame_t *frame, + xlator_t *this, + dict_t *file, + int32_t cmd, + struct flock *lock) +{ + STACK_WIND (frame, + meta_lk_cbk, + FIRST_CHILD(this), + FIRST_CHILD(this)->fops->lk, + file, + cmd, + lock); + return 0; +} + +static void +add_xlator_to_tree (meta_dirent_t *tree, xlator_t *this, + const char *prefix) +{ + char *dir; + asprintf (&dir, "%s/%s", prefix, this->name); + + char *children; + asprintf (&children, "%s/%s", dir, "subvolumes"); + + char *type; + asprintf (&type, "%s/%s", dir, "type"); + + char *view; + asprintf (&view, "%s/%s", dir, "view"); + + insert_meta_entry (tree, dir, S_IFDIR, NULL, NULL); + insert_meta_entry (tree, children, S_IFDIR, NULL, NULL); + meta_dirent_t *v = insert_meta_entry (tree, view, S_IFDIR, NULL, + &meta_xlator_view_fops); + v->view_xlator = this; + meta_dirent_t *t = insert_meta_entry (tree, type, S_IFREG, NULL, + &meta_xlator_type_fops); + t->view_xlator = this; + + xlator_list_t *trav = this->children; + while (trav) { + add_xlator_to_tree (tree, trav->xlator, children); + trav = trav->next; + } +} + +static void +build_meta_tree (xlator_t *this) +{ + meta_private_t *priv = (meta_private_t *) this->private; + priv->tree = CALLOC (1, sizeof (meta_dirent_t)); + ERR_ABORT (priv->tree); + priv->tree->name = strdup (".meta"); + priv->tree->stbuf = new_stbuf (); + priv->tree->stbuf->st_mode = S_IFDIR | S_IRUSR | S_IRGRP | S_IROTH | + S_IXUSR | S_IXGRP | S_IXOTH; + + insert_meta_entry (priv->tree, "/.meta/version", + S_IFREG, NULL, &meta_version_fops); 
+ + insert_meta_entry (priv->tree, "/.meta/xlators", + S_IFDIR, NULL, NULL); + + xlator_list_t *trav = this->children; + while (trav) { + add_xlator_to_tree (priv->tree, trav->xlator, "/.meta/xlators"); + trav = trav->next; + } +} + +int32_t +init (xlator_t *this) +{ + if (this->parent != NULL) { + gf_log ("meta", GF_LOG_ERROR, "FATAL: meta should be the root of the xlator tree"); + return -1; + } + + meta_private_t *priv = CALLOC (1, sizeof (meta_private_t)); + ERR_ABORT (priv); + + data_t *directory = dict_get (this->options, "directory"); + if (directory) { + priv->directory = strdup (data_to_str (directory)); + } + else { + priv->directory = ".meta"; + } + + this->private = priv; + build_meta_tree (this); + + return 0; +} + +int32_t +fini (xlator_t *this) +{ + return 0; +} + +struct xlator_fops fops = { + .getattr = meta_getattr, + .readlink = meta_readlink, + .mknod = meta_mknod, + .mkdir = meta_mkdir, + .unlink = meta_unlink, + .rmdir = meta_rmdir, + .symlink = meta_symlink, + .rename = meta_rename, + .link = meta_link, + .chmod = meta_chmod, + .chown = meta_chown, + .truncate = meta_truncate, + .utimes = meta_utimes, + .open = meta_open, + .readv = meta_readv, + .writev = meta_writev, + .statfs = meta_statfs, + .flush = meta_flush, + .release = meta_release, + .fsync = meta_fsync, + .setxattr = meta_setxattr, + .getxattr = meta_getxattr, + .listxattr = meta_listxattr, + .removexattr = meta_removexattr, + .opendir = meta_opendir, + .readdir = meta_readdir, + .releasedir = meta_releasedir, + .fsyncdir = meta_fsyncdir, + .access = meta_access, + .ftruncate = meta_ftruncate, + .fgetattr = meta_fgetattr, + .create = meta_create, + .lk = meta_lk, +}; + +struct xlator_mops mops = { +}; diff --git a/xlators/meta/src/meta.h b/xlators/meta/src/meta.h new file mode 100644 index 000000000..6823ef85b --- /dev/null +++ b/xlators/meta/src/meta.h @@ -0,0 +1,48 @@ +/* + Copyright (c) 2006, 2007, 2008 Z RESEARCH, Inc. 
<http://www.zresearch.com> + This file is part of GlusterFS. + + GlusterFS is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published + by the Free Software Foundation; either version 3 of the License, + or (at your option) any later version. + + GlusterFS is distributed in the hope that it will be useful, but + WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program. If not, see + <http://www.gnu.org/licenses/>. +*/ + +#ifndef __META_H__ +#define __META_H__ + +#ifndef _CONFIG_H +#define _CONFIG_H +#include "config.h" +#endif + +struct _meta_dirent { + const char *name; + int type; + struct _meta_dirent *children; + struct _meta_dirent *parent; + struct _meta_dirent *next; + struct stat *stbuf; + xlator_t *view_xlator; + struct xlator_fops *fops; +}; +typedef struct _meta_dirent meta_dirent_t; + +typedef struct { + const char *directory; + meta_dirent_t *tree; +} meta_private_t; + +#include "tree.h" +#include "misc.h" + +#endif /* __META_H__ */ diff --git a/xlators/meta/src/misc.c b/xlators/meta/src/misc.c new file mode 100644 index 000000000..9c2f50d34 --- /dev/null +++ b/xlators/meta/src/misc.c @@ -0,0 +1,67 @@ +/* + Copyright (c) 2006, 2007, 2008 Z RESEARCH, Inc. <http://www.zresearch.com> + This file is part of GlusterFS. + + GlusterFS is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published + by the Free Software Foundation; either version 3 of the License, + or (at your option) any later version. + + GlusterFS is distributed in the hope that it will be useful, but + WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. 
See the GNU + General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program. If not, see + <http://www.gnu.org/licenses/>. +*/ + +#include <unistd.h> +#include <sys/uio.h> + +#ifndef _CONFIG_H +#define _CONFIG_H +#include "config.h" +#endif + +#include "xlator.h" +#include "meta.h" + +#define min(x,y) ((x) < (y) ? (x) : (y)) + +/* /.meta/version */ +static const char *version_str = PACKAGE_NAME " " PACKAGE_VERSION "\n"; + +int32_t +meta_version_readv (call_frame_t *frame, xlator_t *this, + dict_t *fd, size_t size, off_t offset) +{ + static int version_size; + version_size = strlen (version_str); + + struct iovec vec; + vec.iov_base = version_str + offset; + vec.iov_len = min (version_size - offset, size); + + STACK_UNWIND (frame, vec.iov_len, 0, &vec, 1); + return 0; +} + +int32_t +meta_version_getattr (call_frame_t *frame, + xlator_t *this, + const char *path) +{ + meta_private_t *priv = (meta_private_t *) this->private; + meta_dirent_t *root = priv->tree; + meta_dirent_t *file = lookup_meta_entry (root, path, NULL); + file->stbuf->st_size = strlen (version_str); + STACK_UNWIND (frame, 0, 0, file->stbuf); +} + +struct xlator_fops meta_version_fops = { + .readv = meta_version_readv, + .getattr = meta_version_getattr +}; + diff --git a/xlators/meta/src/misc.h b/xlators/meta/src/misc.h new file mode 100644 index 000000000..433c604eb --- /dev/null +++ b/xlators/meta/src/misc.h @@ -0,0 +1,31 @@ +/* + Copyright (c) 2006, 2007, 2008 Z RESEARCH, Inc. <http://www.zresearch.com> + This file is part of GlusterFS. + + GlusterFS is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published + by the Free Software Foundation; either version 3 of the License, + or (at your option) any later version. 
+ + GlusterFS is distributed in the hope that it will be useful, but + WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program. If not, see + <http://www.gnu.org/licenses/>. +*/ + +#ifndef __MISC_H__ +#define __MISC_H__ + +#ifndef _CONFIG_H +#define _CONFIG_H +#include "config.h" +#endif + + +struct xlator_fops meta_version_fops; + +#endif /* __MISC_H__ */ diff --git a/xlators/meta/src/tree.c b/xlators/meta/src/tree.c new file mode 100644 index 000000000..ec88c42a0 --- /dev/null +++ b/xlators/meta/src/tree.c @@ -0,0 +1,176 @@ +/* + Copyright (c) 2006, 2007, 2008 Z RESEARCH, Inc. <http://www.zresearch.com> + This file is part of GlusterFS. + + GlusterFS is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published + by the Free Software Foundation; either version 3 of the License, + or (at your option) any later version. + + GlusterFS is distributed in the hope that it will be useful, but + WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program. If not, see + <http://www.gnu.org/licenses/>. 
+*/ + +#ifndef _CONFIG_H +#define _CONFIG_H +#include "config.h" +#endif + +#include <sys/stat.h> +#include <sys/types.h> +#include <unistd.h> +#include <string.h> + +#include "glusterfs.h" +#include "xlator.h" + +#include "meta.h" + +static int +is_meta_path (const char *path) +{ + while (*path == '/') + path++; + if (!strncmp (path, ".meta", strlen (".meta"))) + return 1; + return 0; +} + +struct stat * +new_stbuf (void) +{ + static int next_inode = 0; + struct stat *stbuf = CALLOC (1, sizeof (struct stat)); + + ERR_ABORT (stbuf); + + stbuf->st_dev = 0; + stbuf->st_ino = next_inode++; + stbuf->st_mode = S_IRUSR | S_IRGRP | S_IROTH; + stbuf->st_nlink = 1; + stbuf->st_uid = 0; + stbuf->st_gid = 0; + stbuf->st_rdev = 0; + stbuf->st_size = 0; + stbuf->st_blksize = 0; + stbuf->st_blocks = 0; + stbuf->st_atime = time (NULL); + stbuf->st_atim.tv_nsec = 0; + stbuf->st_mtime = stbuf->st_atime; + stbuf->st_mtim.tv_nsec = 0; + stbuf->st_ctime = stbuf->st_ctime; + stbuf->st_ctim.tv_nsec = 0; + + return stbuf; +} + +/* find an entry among the siblings of an entry */ +static meta_dirent_t * +find_entry (meta_dirent_t *node, const char *dir) +{ + meta_dirent_t *trav = node; + while (trav) { + if (!strcmp (trav->name, dir)) + return trav; + trav = trav->next; + } + return NULL; +} + +/* + * Return the meta_dirent_t corresponding to the pathname. + * + * If pathname does not exist in the meta tree, try to return + * its highest parent that does exist. The part of the + * pathname that is left over is returned in the value-result + * variable {remain}. 
+ * For example, for "/.meta/xlators/brick1/view/foo/bar/baz", + * return the entry for "/.meta/xlators/brick1/view" + * and set remain to "/bar/baz" + */ + +meta_dirent_t * +lookup_meta_entry (meta_dirent_t *root, const char *path, + char **remain) +{ + char *_path = strdup (path); + + if (!is_meta_path (path)) + return NULL; + + meta_dirent_t *trav = root; + char *dir = strtok (_path, "/"); + dir = strtok (NULL, "/"); + + while (dir) { + meta_dirent_t *ntrav; + ntrav = find_entry (trav->children, dir); + if (!ntrav) { + /* we have reached bottom of the meta tree. + Unknown dragons lie further below */ + if (remain) { + char *piece = dir; + while (piece) { + char *tmp = *remain; + if (*remain) + asprintf (remain, "/%s/%s", *remain, piece); + else + asprintf (remain, "/%s", piece); + if (tmp) free (tmp); + piece = strtok (NULL, "/"); + } + } + return trav; + } + dir = strtok (NULL, "/"); + trav = ntrav; + } + + free (_path); + return trav; +} + +meta_dirent_t * +insert_meta_entry (meta_dirent_t *root, const char *path, + int type, struct stat *stbuf, struct xlator_fops *fops) +{ + if (!is_meta_path (path)) + return NULL; + char *slashpos = strrchr (path, '/'); + char *dir = strndup (path, slashpos - path); + meta_dirent_t *parent = lookup_meta_entry (root, dir, NULL); + if (!dir) + return NULL; + + meta_dirent_t *new = CALLOC (1, sizeof (meta_dirent_t)); + ERR_ABORT (new); + new->name = strdup (slashpos+1); + new->type = type; + new->parent = parent; + new->next = parent->children; + parent->children = new; + if (stbuf) + new->stbuf = stbuf; + else + new->stbuf = new_stbuf (); + + new->stbuf->st_mode |= type; + new->fops = fops; + return new; +} + +int main (void) +{ + meta_dirent_t *root = CALLOC (1, sizeof (meta_dirent_t)); + ERR_ABORT (root); + root->name = strdup (".meta"); + + insert_meta_entry (root, "/.meta/version", S_IFREG, NULL, NULL); + return 0; +} diff --git a/xlators/meta/src/tree.h b/xlators/meta/src/tree.h new file mode 100644 index 
000000000..eb2cf0220 --- /dev/null +++ b/xlators/meta/src/tree.h @@ -0,0 +1,35 @@ +/* + Copyright (c) 2006, 2007, 2008 Z RESEARCH, Inc. <http://www.zresearch.com> + This file is part of GlusterFS. + + GlusterFS is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published + by the Free Software Foundation; either version 3 of the License, + or (at your option) any later version. + + GlusterFS is distributed in the hope that it will be useful, but + WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program. If not, see + <http://www.gnu.org/licenses/>. +*/ + +#ifndef __TREE_H__ +#define __TREE_H__ + +#ifndef _CONFIG_H +#define _CONFIG_H +#include "config.h" +#endif + +meta_dirent_t * +insert_meta_entry (meta_dirent_t *root, const char *path, + int type, struct stat *stbuf, struct xlator_fops *fops); +meta_dirent_t * +lookup_meta_entry (meta_dirent_t *root, const char *path, + char **remain); + +#endif /* __TREE_H__ */ diff --git a/xlators/meta/src/view.c b/xlators/meta/src/view.c new file mode 100644 index 000000000..7104d10e9 --- /dev/null +++ b/xlators/meta/src/view.c @@ -0,0 +1,258 @@ +/* + Copyright (c) 2006, 2007, 2008 Z RESEARCH, Inc. <http://www.zresearch.com> + This file is part of GlusterFS. + + GlusterFS is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published + by the Free Software Foundation; either version 3 of the License, + or (at your option) any later version. + + GlusterFS is distributed in the hope that it will be useful, but + WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + General Public License for more details. 
+ + You should have received a copy of the GNU General Public License + along with this program. If not, see + <http://www.gnu.org/licenses/>. +*/ + +#ifndef _CONFIG_H +#define _CONFIG_H +#include "config.h" +#endif + +#include "glusterfs.h" +#include "xlator.h" + +#include "meta.h" + +/* + * This file contains fops for the files and directories in + * an xlator directory + */ + +/* /.meta/xlators/.../type */ + +int32_t +meta_xlator_type_readv (call_frame_t *frame, xlator_t *this, + dict_t *fd, size_t size, off_t offset) +{ + meta_private_t *priv = (meta_private_t *) this->private; + meta_dirent_t *root = priv->tree; + data_t *path_data = dict_get (fd, this->name); + + if (path_data) { + const char *path = data_to_str (path_data); + meta_dirent_t *file = lookup_meta_entry (root, path, NULL); + xlator_t *view_xlator = file->view_xlator; + + int type_size; + type_size = strlen (view_xlator->type); + + struct iovec vec; + vec.iov_base = view_xlator->type + offset; + vec.iov_len = min (type_size - offset, size); + + STACK_UNWIND (frame, vec.iov_len, 0, &vec, 1); + return 0; + } +} + +int32_t +meta_xlator_type_getattr (call_frame_t *frame, + xlator_t *this, + const char *path) +{ + meta_private_t *priv = (meta_private_t *) this->private; + meta_dirent_t *root = priv->tree; + + meta_dirent_t *file = lookup_meta_entry (root, path, NULL); + xlator_t *view_xlator = file->view_xlator; + file->stbuf->st_size = strlen (view_xlator->type); + + STACK_UNWIND (frame, 0, 0, file->stbuf); + return 0; +} + +struct xlator_fops meta_xlator_type_fops = { + .readv = meta_xlator_type_readv, + .getattr = meta_xlator_type_getattr +}; + +/* + * fops for the "view" directory + * {xlator}/view shows the filesystem as it appears + * to {xlator} + */ + +static int32_t +meta_xlator_view_getattr_cbk (call_frame_t *frame, void *cookie, + xlator_t *this, int32_t op_ret, int32_t op_errno, + struct stat *buf) +{ + STACK_UNWIND (frame, op_ret, op_errno, buf); + return 0; +} + +int32_t 
+meta_xlator_view_getattr (call_frame_t *frame, + xlator_t *this, + const char *path) +{ + meta_private_t *priv = (meta_private_t *) this->private; + meta_dirent_t *root = priv->tree; + char *op_path = NULL; + + meta_dirent_t *file = lookup_meta_entry (root, path, &op_path); + + if (op_path) { + STACK_WIND (frame, meta_xlator_view_getattr_cbk, file->view_xlator, + file->view_xlator->fops->getattr, + op_path); + } + else { + STACK_UNWIND (frame, 0, 0, file->stbuf); + } + + return 0; +} + +static int32_t +meta_xlator_view_readdir_cbk (call_frame_t *frame, void *cookie, + xlator_t *this, int32_t op_ret, int32_t op_errno, + dir_entry_t *entries, int32_t count) +{ + STACK_UNWIND (frame, op_ret, op_errno, entries, count); + return 0; +} + +int32_t +meta_xlator_view_readdir (call_frame_t *frame, + xlator_t *this, + const char *path) +{ + meta_private_t *priv = (meta_private_t *) this->private; + meta_dirent_t *root = priv->tree; + char *op_path = NULL; + + meta_dirent_t *dir = lookup_meta_entry (root, path, &op_path); + + STACK_WIND (frame, meta_xlator_view_readdir_cbk, + dir->view_xlator, dir->view_xlator->fops->readdir, + op_path ? 
op_path : "/"); + return 0; +} + +static int32_t +meta_xlator_view_open_cbk (call_frame_t *frame, void *cookie, + xlator_t *this, + int32_t op_ret, int32_t op_errno, + dict_t *ctx, struct stat *buf) +{ + STACK_UNWIND (frame, op_ret, op_errno, ctx, buf); + return 0; +} + +int32_t +meta_xlator_view_open (call_frame_t *frame, xlator_t *this, + const char *path, int32_t flags, mode_t mode) +{ + meta_private_t *priv = (meta_private_t *) this->private; + meta_dirent_t *root = priv->tree; + char *op_path = NULL; + + meta_dirent_t *file = lookup_meta_entry (root, path, &op_path); + STACK_WIND (frame, meta_xlator_view_open_cbk, + file->view_xlator, file->view_xlator->fops->open, + op_path, flags, mode); + return 0; +} + +int32_t +meta_xlator_view_create (call_frame_t *frame, xlator_t *this, + const char *path, int32_t flags, mode_t mode) +{ + meta_private_t *priv = (meta_private_t *) this->private; + meta_dirent_t *root = priv->tree; + char *op_path = NULL; + + meta_dirent_t *file = lookup_meta_entry (root, path, &op_path); + STACK_WIND (frame, meta_xlator_view_open_cbk, + file->view_xlator, file->view_xlator->fops->create, + op_path, flags, mode); + return 0; +} + +static int32_t +meta_xlator_view_readv_cbk (call_frame_t *frame, void *cookie, + xlator_t *this, int32_t op_ret, + int32_t op_errno, struct iovec *vector, + int32_t count) +{ + STACK_UNWIND (frame, op_ret, op_errno, vector, count); + return 0; +} + +int32_t +meta_xlator_view_readv (call_frame_t *frame, xlator_t *this, + dict_t *fd, size_t size, off_t offset) +{ + meta_private_t *priv = (meta_private_t *) this->private; + meta_dirent_t *root = priv->tree; + data_t *path_data = dict_get (fd, this->name); + + if (path_data) { + const char *path = data_to_str (path_data); + meta_dirent_t *file = lookup_meta_entry (root, path, NULL); + + STACK_WIND (frame, meta_xlator_view_readv_cbk, + file->view_xlator, file->view_xlator->fops->readv, + fd, size, offset); + return 0; + } + + STACK_UNWIND (frame, -1, EBADFD, NULL, 
0); + return 0; +} + +static int32_t +meta_xlator_view_writev_cbk (call_frame_t *frame, void *cookie, + xlator_t *this, int32_t op_ret, + int32_t op_errno) +{ + STACK_UNWIND (frame, op_ret, op_errno); + return 0; +} + +int32_t +meta_xlator_view_writev (call_frame_t *frame, xlator_t *this, + dict_t *fd, + struct iovec *vector, int32_t count, off_t offset) +{ + meta_private_t *priv = (meta_private_t *) this->private; + meta_dirent_t *root = priv->tree; + data_t *path_data = dict_get (fd, this->name); + + if (path_data) { + const char *path = data_to_str (path_data); + meta_dirent_t *file = lookup_meta_entry (root, path, NULL); + + STACK_WIND (frame, meta_xlator_view_writev_cbk, + file->view_xlator, file->view_xlator->fops->writev, + fd, vector, count, offset); + return 0; + } + + STACK_UNWIND (frame, -1, EBADFD, NULL, 0); + return 0; +} + +struct xlator_fops meta_xlator_view_fops = { + .getattr = meta_xlator_view_getattr, + .readdir = meta_xlator_view_readdir, + .open = meta_xlator_view_open, + .create = meta_xlator_view_create, + .readv = meta_xlator_view_readv, + .writev = meta_xlator_view_writev +}; diff --git a/xlators/meta/src/view.h b/xlators/meta/src/view.h new file mode 100644 index 000000000..2e1ac3ebf --- /dev/null +++ b/xlators/meta/src/view.h @@ -0,0 +1,32 @@ +/* + Copyright (c) 2006, 2007, 2008 Z RESEARCH, Inc. <http://www.zresearch.com> + This file is part of GlusterFS. + + GlusterFS is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published + by the Free Software Foundation; either version 3 of the License, + or (at your option) any later version. + + GlusterFS is distributed in the hope that it will be useful, but + WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program. 
If not, see + <http://www.gnu.org/licenses/>. +*/ + +#ifndef __VIEW_H__ +#define __VIEW_H__ + +#ifndef _CONFIG_H +#define _CONFIG_H +#include "config.h" +#endif + + +struct xlator_fops meta_xlator_type_fops; +struct xlator_fops meta_xlator_view_fops; + +#endif /* __VIEW_H__ */ diff --git a/xlators/mount/Makefile.am b/xlators/mount/Makefile.am new file mode 100644 index 000000000..945982d95 --- /dev/null +++ b/xlators/mount/Makefile.am @@ -0,0 +1,3 @@ +SUBDIRS = @FUSE_CLIENT_SUBDIR@ + +CLEANFILES = diff --git a/xlators/mount/fuse/Makefile.am b/xlators/mount/fuse/Makefile.am new file mode 100644 index 000000000..3b344b1d7 --- /dev/null +++ b/xlators/mount/fuse/Makefile.am @@ -0,0 +1,3 @@ +SUBDIRS = src utils + +CLEANFILES = diff --git a/xlators/mount/fuse/src/Makefile.am b/xlators/mount/fuse/src/Makefile.am new file mode 100644 index 000000000..9d8d45e4f --- /dev/null +++ b/xlators/mount/fuse/src/Makefile.am @@ -0,0 +1,14 @@ + +noinst_HEADERS = fuse-extra.h + +xlator_LTLIBRARIES = fuse.la +xlatordir = $(libdir)/glusterfs/$(PACKAGE_VERSION)/xlator/mount +fuse_la_SOURCES = fuse-bridge.c fuse-extra.c +fuse_la_LDFLAGS = -module -avoidversion -shared -nostartfiles $(GF_FUSE_LDADD) + +AM_CFLAGS = -fPIC -D_FILE_OFFSET_BITS=64 -D_GNU_SOURCE -D$(GF_HOST_OS) -Wall \ + -I$(top_srcdir)/libglusterfs/src $(GF_CFLAGS) -DFUSE_USE_VERSION=26 + + +CLEANFILES = + diff --git a/xlators/mount/fuse/src/fuse-bridge.c b/xlators/mount/fuse/src/fuse-bridge.c new file mode 100644 index 000000000..8e7055878 --- /dev/null +++ b/xlators/mount/fuse/src/fuse-bridge.c @@ -0,0 +1,2859 @@ +/* + Copyright (c) 2006, 2007, 2008 Z RESEARCH, Inc. <http://www.zresearch.com> + This file is part of GlusterFS. + + GlusterFS is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published + by the Free Software Foundation; either version 3 of the License, + or (at your option) any later version. 
+ + GlusterFS is distributed in the hope that it will be useful, but + WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program. If not, see + <http://www.gnu.org/licenses/>. +*/ + +/* + * TODO: + * Need to free_state() when fuse_reply_err() + return. + * Check loc->path for "" after fuse_loc_fill in all fops + * (now being done in getattr, lookup) or better - make + * fuse_loc_fill() and inode_path() return success/failure. + */ + +#include <stdint.h> +#include <signal.h> +#include <pthread.h> + +#ifndef _CONFIG_H +#define _CONFIG_H +#include "config.h" +#endif /* _CONFIG_H */ + +#include "glusterfs.h" +#include "logging.h" +#include "xlator.h" +#include "glusterfs.h" +#include "defaults.h" +#include "common-utils.h" + +#include <fuse/fuse_lowlevel.h> + +#include "fuse-extra.h" +#include "list.h" +#include "dict.h" + +#include "compat.h" +#include "compat-errno.h" + +/* TODO: when supporting posix acl, remove this definition */ +#define DISABLE_POSIX_ACL + +#define ZR_MOUNTPOINT_OPT "mountpoint" +#define ZR_DIRECT_IO_OPT "direct-io-mode" + +#define BIG_FUSE_CHANNEL_SIZE 1048576 + +struct fuse_private { + int fd; + struct fuse *fuse; + struct fuse_session *se; + struct fuse_chan *ch; + char *volfile; + size_t volfile_size; + char *mount_point; + data_t *buf; + pthread_t fuse_thread; + char fuse_thread_started; + uint32_t direct_io_mode; + uint32_t entry_timeout; + uint32_t attribute_timeout; + +}; +typedef struct fuse_private fuse_private_t; + +#define _FI_TO_FD(fi) ((fd_t *)((long)fi->fh)) + +#define FI_TO_FD(fi) ((_FI_TO_FD (fi))?(fd_ref (_FI_TO_FD(fi))):((fd_t *) 0)) + +#define FUSE_FOP(state, ret, op_num, fop, args ...) \ + do { \ + call_frame_t *frame = get_call_frame_for_req (state, 1); \ + xlator_t *xl = frame->this->children ? 
\ + frame->this->children->xlator : NULL; \ + dict_t *refs = frame->root->req_refs; \ + frame->root->state = state; \ + frame->root->op = op_num; \ + STACK_WIND (frame, ret, xl, xl->fops->fop, args); \ + dict_unref (refs); \ + } while (0) + + +typedef struct { + void *pool; + xlator_t *this; + inode_table_t *itable; + loc_t loc; + loc_t loc2; + fuse_req_t req; + int32_t flags; + off_t off; + size_t size; + unsigned long nlookup; + fd_t *fd; + dict_t *dict; + char *name; + char is_revalidate; +} fuse_state_t; + +int fuse_chan_receive (struct fuse_chan *ch, + char *buf, + int32_t size); + + +static void +free_state (fuse_state_t *state) +{ + loc_wipe (&state->loc); + + loc_wipe (&state->loc2); + + if (state->dict) { + dict_unref (state->dict); + state->dict = (void *)0xaaaaeeee; + } + if (state->name) { + FREE (state->name); + state->name = NULL; + } + if (state->fd) { + fd_unref (state->fd); + state->fd = (void *)0xfdfdfdfd; + } +#ifdef DEBUG + memset (state, 0x90, sizeof (*state)); +#endif + FREE (state); + state = NULL; +} + + +fuse_state_t * +state_from_req (fuse_req_t req) +{ + fuse_state_t *state; + xlator_t *this = NULL; + + this = fuse_req_userdata (req); + + state = (void *)calloc (1, sizeof (*state)); + ERR_ABORT (state); + state->pool = this->ctx->pool; + state->itable = this->itable; + state->req = req; + state->this = this; + + return state; +} + +static pid_t +get_pid_from_req (fuse_req_t req) +{ + const struct fuse_ctx *ctx = NULL; + ctx = fuse_req_ctx(req); + return ctx->pid; +} + +static call_frame_t * +get_call_frame_for_req (fuse_state_t *state, char d) +{ + call_pool_t *pool = state->pool; + fuse_req_t req = state->req; + const struct fuse_ctx *ctx = NULL; + call_frame_t *frame = NULL; + xlator_t *this = NULL; + fuse_private_t *priv = NULL; + + + if (req) { + this = fuse_req_userdata (req); + } else { + this = state->this; + } + priv = this->private; + + frame = create_frame (this, pool); + + if (req) { + ctx = fuse_req_ctx(req); + + 
frame->root->uid = ctx->uid; + frame->root->gid = ctx->gid; + frame->root->pid = ctx->pid; + frame->root->unique = req_callid (req); + } + + if (d) { + frame->root->req_refs = dict_ref (get_new_dict ()); + dict_set (frame->root->req_refs, NULL, priv->buf); + } + + frame->root->type = GF_OP_TYPE_FOP_REQUEST; + + return frame; +} + + +GF_MUST_CHECK static int32_t +fuse_loc_fill (loc_t *loc, + fuse_state_t *state, + ino_t ino, + ino_t par, + const char *name) +{ + inode_t *inode = NULL, *parent = NULL; + int32_t ret = -1; + char *path = NULL; + + /* resistance against multiple invocation of loc_fill not to get + reference leaks via inode_search() */ + + inode = loc->inode; + + if (!inode) { + if (ino) + inode = inode_search (state->itable, ino, NULL); + if (par && name) + inode = inode_search (state->itable, par, name); + + loc->inode = inode; + if (inode) + loc->ino = inode->ino; + } + + parent = loc->parent; + if (!parent) { + if (inode) + parent = inode_parent (inode, par, name); + else + parent = inode_search (state->itable, par, NULL); + loc->parent = parent; + } + + if (name && parent) { + ret = inode_path (parent, name, &path); + if (ret <= 0) { + gf_log ("glusterfs-fuse", GF_LOG_ERROR, + "inode_path failed for %"PRId64"/%s", + parent->ino, name); + goto fail; + } else { + loc->path = path; + } + } else if (inode) { + ret = inode_path (inode, NULL, &path); + if (ret <= 0) { + gf_log ("glusterfs-fuse", GF_LOG_ERROR, + "inode_path failed for %"PRId64, + inode->ino); + goto fail; + } else { + loc->path = path; + } + } + if (loc->path) { + loc->name = strrchr (loc->path, '/'); + if (loc->name) + loc->name++; + else loc->name = ""; + } + + if ((ino != 1) && + (parent == NULL)) { + gf_log ("fuse-bridge", GF_LOG_ERROR, + "failed to search parent for %"PRId64"/%s (%"PRId64")", + (ino_t)par, name, (ino_t)ino); + ret = -1; + goto fail; + } + ret = 0; +fail: + return ret; +} + + +static int +need_fresh_lookup (int32_t op_ret, int32_t op_errno, + loc_t *loc, struct stat 
*buf) +{ + if (op_ret == -1) { + gf_log ("fuse-bridge", + (op_errno == ENOENT)? GF_LOG_DEBUG: GF_LOG_WARNING, + "revalidate of %s failed (%s)", + loc->path, strerror (op_errno)); + return 1; + } + + if (loc->inode->ino != buf->st_ino) { + gf_log ("fuse-bridge", GF_LOG_WARNING, + "inode num of %s changed %"PRId64" -> %"PRId64, + loc->path, loc->inode->ino, buf->st_ino); + return 1; + } + + if ((loc->inode->st_mode & S_IFMT) ^ (buf->st_mode & S_IFMT)) { + gf_log ("fuse-bridge", GF_LOG_WARNING, + "inode mode of %s changed 0%o -> 0%o", + loc->path, loc->inode->st_mode, buf->st_mode); + return 1; + } + + return 0; +} + + +static int +fuse_lookup_cbk (call_frame_t *frame, + void *cookie, + xlator_t *this, + int32_t op_ret, + int32_t op_errno, + inode_t *inode, + struct stat *stat, + dict_t *dict); + +static int +fuse_entry_cbk (call_frame_t *frame, + void *cookie, + xlator_t *this, + int32_t op_ret, + int32_t op_errno, + inode_t *inode, + struct stat *buf) +{ + fuse_state_t *state; + fuse_req_t req; + struct fuse_entry_param e = {0, }; + fuse_private_t *priv = this->private; + + state = frame->root->state; + req = state->req; + + if (!op_ret && state->loc.ino == 1) { + buf->st_ino = 1; + } + + if (state->is_revalidate == 1 + && need_fresh_lookup (op_ret, op_errno, &state->loc, buf)) { + inode_unref (state->loc.inode); + state->loc.inode = inode_new (state->itable); + state->is_revalidate = 2; + + STACK_WIND (frame, fuse_lookup_cbk, + FIRST_CHILD (this), + FIRST_CHILD (this)->fops->lookup, + &state->loc, state->dict); + + return 0; + } + + if (op_ret == 0) { + gf_log ("glusterfs-fuse", GF_LOG_DEBUG, + "%"PRId64": %s() %s => %"PRId64" (%"PRId64")", + frame->root->unique, gf_fop_list[frame->root->op], + state->loc.path, buf->st_ino, state->loc.ino); + + inode_link (inode, state->loc.parent, state->loc.name, buf); + + inode_lookup (inode); + + /* TODO: make these timeouts configurable (via meta?) 
*/ + e.ino = inode->ino; + +#ifdef GF_DARWIN_HOST_OS + e.generation = 0; +#else + e.generation = buf->st_ctime; +#endif + + e.entry_timeout = priv->entry_timeout; + e.attr_timeout = priv->attribute_timeout; + e.attr = *buf; + e.attr.st_blksize = BIG_FUSE_CHANNEL_SIZE; + + if (!e.ino || !buf->st_ino) { + gf_log ("glusterfs-fuse", GF_LOG_ERROR, + "%"PRId64": %s() %s returning inode 0", + frame->root->unique, + gf_fop_list[frame->root->op], state->loc.path); + } + + if (state->loc.parent) + fuse_reply_entry (req, &e); + else + fuse_reply_attr (req, buf, priv->attribute_timeout); + } else { + gf_log ("glusterfs-fuse", + (op_errno == ENOENT ? GF_LOG_DEBUG : GF_LOG_ERROR), + "%"PRId64": %s() %s => -1 (%s)", frame->root->unique, + gf_fop_list[frame->root->op], state->loc.path, + strerror (op_errno)); + fuse_reply_err (req, op_errno); + } + + free_state (state); + STACK_DESTROY (frame->root); + return 0; +} + + +static int +fuse_lookup_cbk (call_frame_t *frame, + void *cookie, + xlator_t *this, + int32_t op_ret, + int32_t op_errno, + inode_t *inode, + struct stat *stat, + dict_t *dict) +{ + fuse_entry_cbk (frame, cookie, this, op_ret, op_errno, inode, stat); + return 0; +} + + +static void +fuse_lookup (fuse_req_t req, + fuse_ino_t par, + const char *name) +{ + fuse_state_t *state; + int32_t ret = -1; + + state = state_from_req (req); + + ret = fuse_loc_fill (&state->loc, state, 0, par, name); + + if (ret < 0) { + gf_log ("glusterfs-fuse", GF_LOG_ERROR, + "%"PRId64": LOOKUP %"PRId64"/%s (fuse_loc_fill() failed)", + req_callid (req), (ino_t)par, name); + free_state (state); + fuse_reply_err (req, EINVAL); + return; + } + + if (!state->loc.inode) { + gf_log ("glusterfs-fuse", GF_LOG_DEBUG, + "%"PRId64": LOOKUP %s", req_callid (req), + state->loc.path); + + state->loc.inode = inode_new (state->itable); + /* to differntiate in entry_cbk what kind of call it is */ + state->is_revalidate = -1; + } else { + gf_log ("glusterfs-fuse", GF_LOG_DEBUG, + "%"PRId64": LOOKUP 
%s(%"PRId64")", req_callid (req), + state->loc.path, state->loc.inode->ino); + state->is_revalidate = 1; + } + + state->dict = dict_new(); + + FUSE_FOP (state, fuse_lookup_cbk, GF_FOP_LOOKUP, + lookup, &state->loc, state->dict); +} + + +static void +fuse_forget (fuse_req_t req, + fuse_ino_t ino, + unsigned long nlookup) +{ + inode_t *fuse_inode; + fuse_state_t *state; + + if (ino == 1) { + fuse_reply_none (req); + return; + } + + state = state_from_req (req); + fuse_inode = inode_search (state->itable, ino, NULL); + if (fuse_inode) { + gf_log ("glusterfs-fuse", GF_LOG_DEBUG, + "got forget on inode (%lu)", ino); + inode_forget (fuse_inode, nlookup); + inode_unref (fuse_inode); + } else { + gf_log ("glusterfs-fuse", GF_LOG_ERROR, + "got forget, but inode (%lu) not found", ino); + } + + free_state (state); + fuse_reply_none (req); +} + + +static int +fuse_attr_cbk (call_frame_t *frame, + void *cookie, + xlator_t *this, + int32_t op_ret, + int32_t op_errno, + struct stat *buf) +{ + fuse_state_t *state; + fuse_req_t req; + fuse_private_t *priv = this->private; + + state = frame->root->state; + req = state->req; + + if (op_ret == 0) { + gf_log ("glusterfs-fuse", + (buf->st_ino ? GF_LOG_DEBUG : GF_LOG_ERROR), + "%"PRId64": %s() %s => %"PRId64, frame->root->unique, + gf_fop_list[frame->root->op], + state->loc.path ? state->loc.path : "ERR", + buf->st_ino); + + /* TODO: make these timeouts configurable via meta */ + /* TODO: what if the inode number has changed by now */ + buf->st_blksize = BIG_FUSE_CHANNEL_SIZE; + + fuse_reply_attr (req, buf, priv->attribute_timeout); + } else { + gf_log ("glusterfs-fuse", GF_LOG_ERROR, + "%"PRId64": %s() %s => -1 (%s)", frame->root->unique, + gf_fop_list[frame->root->op], + state->loc.path ? 
state->loc.path : "ERR",
+                        strerror (op_errno));
+
+                fuse_reply_err (req, op_errno);
+        }
+
+        free_state (state);
+        STACK_DESTROY (frame->root);
+        return 0;
+}
+
+
+/*
+ * GETATTR handler.
+ *
+ * The root inode (ino 1) is always served through a LOOKUP. For any
+ * other inode, FSTAT is used when this process has a non-directory fd
+ * open on it, and STAT otherwise. Every early-error path replies to
+ * the request and releases {state}.
+ */
+static void
+fuse_getattr (fuse_req_t req,
+              fuse_ino_t ino,
+              struct fuse_file_info *fi)
+{
+        fuse_state_t *state;
+        fd_t         *fd = NULL;
+        int32_t       ret = -1;
+
+        state = state_from_req (req);
+
+        if (ino == 1) {
+                ret = fuse_loc_fill (&state->loc, state, ino, 0, NULL);
+                if (ret < 0) {
+                        gf_log ("glusterfs-fuse", GF_LOG_ERROR,
+                                "%"PRId64": GETATTR %"PRId64" (fuse_loc_fill() failed)",
+                                req_callid(req), (ino_t)ino);
+                        fuse_reply_err (req, EINVAL);
+                        free_state (state);
+                        return;
+                }
+
+                if (state->loc.inode)
+                        state->is_revalidate = 1;
+                else
+                        state->is_revalidate = -1;
+
+                state->dict = dict_new();
+
+                FUSE_FOP (state, fuse_lookup_cbk, GF_FOP_LOOKUP,
+                          lookup, &state->loc, state->dict);
+                return;
+        }
+
+        ret = fuse_loc_fill (&state->loc, state, ino, 0, NULL);
+
+        if (!state->loc.inode) {
+                gf_log ("glusterfs-fuse", GF_LOG_ERROR,
+                        "%"PRId64": GETATTR %"PRId64" (%s) (fuse_loc_fill() returned NULL inode)",
+                        req_callid (req), (int64_t)ino, state->loc.path);
+                fuse_reply_err (req, EINVAL);
+                /* this path used to return without releasing state */
+                free_state (state);
+                return;
+        }
+
+        fd = fd_lookup (state->loc.inode, get_pid_from_req (req));
+        state->fd = fd;
+        if (!fd || S_ISDIR (state->loc.inode->st_mode)) {
+                /* this is the @ret of fuse_loc_fill, checked here
+                   to permit fstat() to happen even when fuse_loc_fill fails
+                */
+                if (ret < 0) {
+                        gf_log ("glusterfs-fuse", GF_LOG_ERROR,
+                                "%"PRId64": GETATTR %"PRId64" (fuse_loc_fill() failed)",
+                                req_callid(req), (ino_t)ino);
+                        fuse_reply_err (req, EINVAL);
+                        free_state (state);
+                        return;
+                }
+
+                gf_log ("glusterfs-fuse", GF_LOG_DEBUG,
+                        "%"PRId64": GETATTR %"PRId64" (%s)",
+                        req_callid (req), (int64_t)ino, state->loc.path);
+
+
+                FUSE_FOP (state, fuse_attr_cbk, GF_FOP_STAT,
+                          stat, &state->loc);
+        } else {
+
+                gf_log ("glusterfs-fuse", GF_LOG_DEBUG,
+                        "%"PRId64": FGETATTR %"PRId64" (%s/%p)",
+                        req_callid (req), (int64_t)ino, state->loc.path, fd);
+
+                FUSE_FOP (state,fuse_attr_cbk, GF_FOP_FSTAT,
+                          fstat, fd);
+        }
+}
+
+
+/*
+ * Common callback for fops that return an fd. On success the fd is
+ * handed to FUSE in fi.fh with an extra reference and bound; if
+ * fuse_reply_open() reports -ENOENT that reference is dropped again
+ * and the fd is not bound. On failure the errno is replied. The
+ * state and the call stack are released in every case.
+ */
+static int
+fuse_fd_cbk (call_frame_t *frame,
+             void *cookie,
+             xlator_t *this,
+             int32_t op_ret,
+             int32_t op_errno,
+             fd_t *fd)
+{
+        fuse_state_t *state;
+        fuse_req_t req;
+        fuse_private_t *priv = this->private;
+
+        state = frame->root->state;
+        req = state->req;
+
+        if (op_ret >= 0) {
+                struct fuse_file_info fi = {0, };
+
+                fi.fh = (unsigned long) fd;
+                fi.flags = state->flags;
+
+                if (!S_ISDIR (fd->inode->st_mode)) {
+                        if ((fi.flags & 3) && priv->direct_io_mode)
+                                fi.direct_io = 1;
+                }
+
+                gf_log ("glusterfs-fuse", GF_LOG_DEBUG,
+                        "%"PRId64": %s() %s => %p", frame->root->unique,
+                        gf_fop_list[frame->root->op], state->loc.path, fd);
+
+                fd_ref (fd);
+                if (fuse_reply_open (req, &fi) == -ENOENT) {
+                        gf_log ("glusterfs-fuse", GF_LOG_WARNING,
+                                "open() got EINTR");
+                        fd_unref (fd);
+                        goto out;
+                }
+
+                fd_bind (fd);
+        } else {
+                gf_log ("glusterfs-fuse", GF_LOG_ERROR,
+                        "%"PRId64": %s() %s => -1 (%s)", frame->root->unique,
+                        gf_fop_list[frame->root->op], state->loc.path,
+                        strerror (op_errno));
+
+                fuse_reply_err (req, op_errno);
+        }
+out:
+        free_state (state);
+        STACK_DESTROY (frame->root);
+        return 0;
+}
+
+
+
+static void
+do_chmod (fuse_req_t req,
+          fuse_ino_t ino,
+          struct stat *attr,
+          struct fuse_file_info *fi)
+{
+        fuse_state_t *state = state_from_req (req);
+        fd_t *fd = NULL;
+        int32_t ret = -1;
+
+        if (fi) {
+                fd = FI_TO_FD (fi);
+                state->fd = fd;
+        }
+
+        if (fd) {
+                gf_log ("glusterfs-fuse", GF_LOG_DEBUG,
+                        "%"PRId64": FCHMOD %p", req_callid (req), fd);
+
+                FUSE_FOP (state, fuse_attr_cbk, GF_FOP_FCHMOD,
+                          fchmod, fd, attr->st_mode);
+        } else {
+                ret = fuse_loc_fill (&state->loc, state, ino, 0, NULL);
+
+                if ((state->loc.inode == NULL) ||
+                    (ret < 0)) {
+                        gf_log ("glusterfs-fuse", GF_LOG_ERROR,
+                                "%"PRId64": CHMOD %"PRId64" (%s) (fuse_loc_fill() failed)",
+                                req_callid (req), (int64_t)ino,
+                                state->loc.path);
+                        fuse_reply_err (req, EINVAL);
+                        free_state (state);
+ return; + } + + + gf_log ("glusterfs-fuse", GF_LOG_DEBUG, + "%"PRId64": CHMOD %s", req_callid (req), + state->loc.path); + + FUSE_FOP (state, fuse_attr_cbk, GF_FOP_CHMOD, + chmod, &state->loc, attr->st_mode); + } +} + + +static void +do_chown (fuse_req_t req, + fuse_ino_t ino, + struct stat *attr, + int valid, + struct fuse_file_info *fi) +{ + fuse_state_t *state; + fd_t *fd = NULL; + int32_t ret = -1; + uid_t uid = (valid & FUSE_SET_ATTR_UID) ? attr->st_uid : (uid_t) -1; + gid_t gid = (valid & FUSE_SET_ATTR_GID) ? attr->st_gid : (gid_t) -1; + + state = state_from_req (req); + + if (fi) { + fd = FI_TO_FD (fi); + state->fd = fd; + } + + if (fd) { + gf_log ("glusterfs-fuse", GF_LOG_DEBUG, + "%"PRId64": FCHOWN %p", req_callid (req), fd); + + FUSE_FOP (state, fuse_attr_cbk, GF_FOP_FCHOWN, + fchown, fd, uid, gid); + } else { + ret = fuse_loc_fill (&state->loc, state, ino, 0, NULL); + if ((state->loc.inode == NULL) || + (ret < 0)) { + gf_log ("glusterfs-fuse", GF_LOG_ERROR, + "%"PRId64": CHOWN %"PRId64" (%s) (fuse_loc_fill() failed)", + req_callid (req), (int64_t)ino, + state->loc.path); + fuse_reply_err (req, EINVAL); + free_state (state); + return; + } + + gf_log ("glusterfs-fuse", GF_LOG_DEBUG, + "%"PRId64": CHOWN %s", req_callid (req), + state->loc.path); + + FUSE_FOP (state, fuse_attr_cbk, GF_FOP_CHOWN, + chown, &state->loc, uid, gid); + } +} + + +static void +do_truncate (fuse_req_t req, + fuse_ino_t ino, + struct stat *attr, + struct fuse_file_info *fi) +{ + fuse_state_t *state; + fd_t *fd = NULL; + int32_t ret = -1; + + state = state_from_req (req); + + if (fi) { + fd = FI_TO_FD (fi); + state->fd = fd; + } + if (fd) { + gf_log ("glusterfs-fuse", GF_LOG_DEBUG, + "%"PRId64": FTRUNCATE %p/%"PRId64, req_callid (req), + fd, attr->st_size); + + FUSE_FOP (state, fuse_attr_cbk, GF_FOP_FTRUNCATE, + ftruncate, fd, attr->st_size); + } else { + ret = fuse_loc_fill (&state->loc, state, ino, 0, NULL); + if ((state->loc.inode == NULL) || + (ret < 0)) { + gf_log 
("glusterfs-fuse", GF_LOG_ERROR, + "%"PRId64": TRUNCATE %s/%"PRId64" (fuse_loc_fill() failed)", + req_callid (req), state->loc.path, + attr->st_size); + fuse_reply_err (req, EINVAL); + free_state (state); + return; + } + + gf_log ("glusterfs-fuse", GF_LOG_DEBUG, + "%"PRId64": TRUNCATE %s/%"PRId64"(%lu)", + req_callid (req), + state->loc.path, attr->st_size, ino); + + FUSE_FOP (state, fuse_attr_cbk, GF_FOP_TRUNCATE, + truncate, &state->loc, attr->st_size); + } + + return; +} + + +static void +do_utimes (fuse_req_t req, + fuse_ino_t ino, + struct stat *attr) +{ + fuse_state_t *state; + + struct timespec tv[2]; + int32_t ret = -1; + + tv[0].tv_sec = attr->st_atime; + tv[0].tv_nsec = ST_ATIM_NSEC(attr); + tv[1].tv_sec = attr->st_mtime; + tv[1].tv_nsec = ST_ATIM_NSEC(attr); + + state = state_from_req (req); + ret = fuse_loc_fill (&state->loc, state, ino, 0, NULL); + if ((state->loc.inode == NULL) || + (ret < 0)) { + gf_log ("glusterfs-fuse", GF_LOG_ERROR, + "%"PRId64": UTIMENS %s (fuse_loc_fill() failed)", + req_callid (req), state->loc.path); + fuse_reply_err (req, EINVAL); + free_state (state); + return; + } + + gf_log ("glusterfs-fuse", GF_LOG_DEBUG, + "%"PRId64": UTIMENS (%lu)%s", req_callid (req), + ino, state->loc.path); + + FUSE_FOP (state, fuse_attr_cbk, GF_FOP_UTIMENS, + utimens, &state->loc, tv); +} + + +static void +fuse_setattr (fuse_req_t req, + fuse_ino_t ino, + struct stat *attr, + int valid, + struct fuse_file_info *fi) +{ + + if (valid & FUSE_SET_ATTR_MODE) + do_chmod (req, ino, attr, fi); + else if (valid & (FUSE_SET_ATTR_UID | FUSE_SET_ATTR_GID)) + do_chown (req, ino, attr, valid, fi); + else if (valid & FUSE_SET_ATTR_SIZE) + do_truncate (req, ino, attr, fi); + else if (valid & (FUSE_SET_ATTR_ATIME | FUSE_SET_ATTR_MTIME)) + do_utimes (req, ino, attr); + else + fuse_getattr (req, ino, fi); +} + + +static int gf_fuse_xattr_enotsup_log; + +static int +fuse_err_cbk (call_frame_t *frame, + void *cookie, + xlator_t *this, + int32_t op_ret, + int32_t 
op_errno) +{ + fuse_state_t *state = frame->root->state; + fuse_req_t req = state->req; + + if (op_ret == 0) { + gf_log ("glusterfs-fuse", GF_LOG_DEBUG, + "%"PRId64": %s() %s => 0", frame->root->unique, + gf_fop_list[frame->root->op], + state->loc.path ? state->loc.path : "ERR"); + + fuse_reply_err (req, 0); + } else { + if (frame->root->op == GF_FOP_SETXATTR) { + op_ret = gf_compat_setxattr (state->dict); + if (op_ret == 0) + op_errno = 0; + if (op_errno == ENOTSUP) { + gf_fuse_xattr_enotsup_log++; + if (!(gf_fuse_xattr_enotsup_log % GF_UNIVERSAL_ANSWER)) + gf_log ("glusterfs-fuse", GF_LOG_CRITICAL, + "[ ERROR ] Extended attribute not supported by the backend storage"); + } + } else { + if ((frame->root->op == GF_FOP_REMOVEXATTR) + && (op_errno == ENOATTR)) { + goto nolog; + } + gf_log ("glusterfs-fuse", GF_LOG_ERROR, + "%"PRId64": %s() %s => -1 (%s)", + frame->root->unique, + gf_fop_list[frame->root->op], + state->loc.path ? state->loc.path : "ERR", + strerror (op_errno)); + } + nolog: + + fuse_reply_err (req, op_errno); + } + + free_state (state); + STACK_DESTROY (frame->root); + + return 0; +} + + + +static int +fuse_unlink_cbk (call_frame_t *frame, + void *cookie, + xlator_t *this, + int32_t op_ret, + int32_t op_errno) +{ + fuse_state_t *state = frame->root->state; + fuse_req_t req = state->req; + + if (op_ret == 0) + inode_unlink (state->loc.inode, state->loc.parent, + state->loc.name); + + if (op_ret == 0) { + gf_log ("glusterfs-fuse", GF_LOG_DEBUG, + "%"PRId64": %s() %s => 0", frame->root->unique, + gf_fop_list[frame->root->op], state->loc.path); + + fuse_reply_err (req, 0); + } else { + gf_log ("glusterfs-fuse", + (op_errno != ENOTEMPTY ? 
GF_LOG_ERROR : GF_LOG_DEBUG), + "%"PRId64": %s() %s => -1 (%s)", frame->root->unique, + gf_fop_list[frame->root->op], state->loc.path, + strerror (op_errno)); + + fuse_reply_err (req, op_errno); + } + + free_state (state); + STACK_DESTROY (frame->root); + + return 0; +} + + +static void +fuse_access (fuse_req_t req, + fuse_ino_t ino, + int mask) +{ + fuse_state_t *state; + int32_t ret = -1; + + state = state_from_req (req); + + ret = fuse_loc_fill (&state->loc, state, ino, 0, NULL); + if ((state->loc.inode == NULL) || + (ret < 0)) { + gf_log ("glusterfs-fuse", GF_LOG_ERROR, + "%"PRId64": ACCESS %"PRId64" (%s) (fuse_loc_fill() failed)", + req_callid (req), (int64_t)ino, state->loc.path); + fuse_reply_err (req, EINVAL); + free_state (state); + return; + } + + gf_log ("glusterfs-fuse", GF_LOG_DEBUG, + "%"PRId64" ACCESS %s/%lu mask=%d", req_callid (req), + state->loc.path, ino, mask); + + FUSE_FOP (state, fuse_err_cbk, + GF_FOP_ACCESS, access, + &state->loc, mask); + + return; +} + + + +static int +fuse_readlink_cbk (call_frame_t *frame, + void *cookie, + xlator_t *this, + int32_t op_ret, + int32_t op_errno, + const char *linkname) +{ + fuse_state_t *state = frame->root->state; + fuse_req_t req = state->req; + + if (op_ret > 0) { + ((char *)linkname)[op_ret] = '\0'; + + gf_log ("glusterfs-fuse", GF_LOG_DEBUG, + "%"PRId64": %s => %s", frame->root->unique, + state->loc.path, linkname); + + fuse_reply_readlink(req, linkname); + } else { + gf_log ("glusterfs-fuse", GF_LOG_ERROR, + "%"PRId64": %s => -1 (%s)", frame->root->unique, + state->loc.path, strerror(op_errno)); + + fuse_reply_err(req, op_errno); + } + + free_state (state); + STACK_DESTROY (frame->root); + + return 0; +} + + +static void +fuse_readlink (fuse_req_t req, + fuse_ino_t ino) +{ + fuse_state_t *state; + int32_t ret = -1; + + state = state_from_req (req); + ret = fuse_loc_fill (&state->loc, state, ino, 0, NULL); + if ((state->loc.inode == NULL) || + (ret < 0)) { + gf_log ("glusterfs-fuse", GF_LOG_ERROR, + 
"%"PRId64" READLINK %s/%"PRId64" (fuse_loc_fill() returned NULL inode)", + req_callid (req), state->loc.path, + state->loc.inode->ino); + fuse_reply_err (req, EINVAL); + free_state (state); + return; + } + + gf_log ("glusterfs-fuse", GF_LOG_DEBUG, + "%"PRId64" READLINK %s/%"PRId64, req_callid (req), + state->loc.path, state->loc.inode->ino); + + FUSE_FOP (state, fuse_readlink_cbk, GF_FOP_READLINK, + readlink, &state->loc, 4096); + + return; +} + + +static void +fuse_mknod (fuse_req_t req, + fuse_ino_t par, + const char *name, + mode_t mode, + dev_t rdev) +{ + fuse_state_t *state; + int32_t ret = -1; + + state = state_from_req (req); + ret = fuse_loc_fill (&state->loc, state, 0, par, name); + if (ret < 0) { + gf_log ("glusterfs-fuse", GF_LOG_ERROR, + "%"PRId64" MKNOD %s (fuse_loc_fill() failed)", + req_callid (req), state->loc.path); + fuse_reply_err (req, EINVAL); + free_state (state); + return; + } + + state->loc.inode = inode_new (state->itable); + + gf_log ("glusterfs-fuse", GF_LOG_DEBUG, + "%"PRId64": MKNOD %s", req_callid (req), + state->loc.path); + + FUSE_FOP (state, fuse_entry_cbk, GF_FOP_MKNOD, + mknod, &state->loc, mode, rdev); + + return; +} + + +static void +fuse_mkdir (fuse_req_t req, + fuse_ino_t par, + const char *name, + mode_t mode) +{ + fuse_state_t *state; + int32_t ret = -1; + + state = state_from_req (req); + ret = fuse_loc_fill (&state->loc, state, 0, par, name); + if (ret < 0) { + gf_log ("glusterfs-fuse", GF_LOG_ERROR, + "%"PRId64" MKDIR %s (fuse_loc_fill() failed)", + req_callid (req), state->loc.path); + fuse_reply_err (req, EINVAL); + free_state (state); + return; + } + + state->loc.inode = inode_new (state->itable); + + gf_log ("glusterfs-fuse", GF_LOG_DEBUG, + "%"PRId64": MKDIR %s", req_callid (req), + state->loc.path); + + FUSE_FOP (state, fuse_entry_cbk, GF_FOP_MKDIR, + mkdir, &state->loc, mode); + + return; +} + + +static void +fuse_unlink (fuse_req_t req, + fuse_ino_t par, + const char *name) +{ + fuse_state_t *state; + int32_t ret 
= -1; + + state = state_from_req (req); + + ret = fuse_loc_fill (&state->loc, state, 0, par, name); + + if ((state->loc.inode == NULL) || + (ret < 0)) { + gf_log ("glusterfs-fuse", GF_LOG_ERROR, + "%"PRId64": UNLINK %s (fuse_loc_fill() returned NULL inode)", + req_callid (req), state->loc.path); + fuse_reply_err (req, EINVAL); + free_state (state); + return; + } + + gf_log ("glusterfs-fuse", GF_LOG_DEBUG, + "%"PRId64": UNLINK %s", req_callid (req), + state->loc.path); + + FUSE_FOP (state, fuse_unlink_cbk, GF_FOP_UNLINK, + unlink, &state->loc); + + return; +} + + +static void +fuse_rmdir (fuse_req_t req, + fuse_ino_t par, + const char *name) +{ + fuse_state_t *state; + int32_t ret = -1; + + state = state_from_req (req); + ret = fuse_loc_fill (&state->loc, state, 0, par, name); + if ((state->loc.inode == NULL) || + (ret < 0)) { + gf_log ("glusterfs-fuse", GF_LOG_DEBUG, + "%"PRId64": RMDIR %s (fuse_loc_fill() failed)", + req_callid (req), state->loc.path); + fuse_reply_err (req, EINVAL); + free_state (state); + return; + } + + gf_log ("glusterfs-fuse", GF_LOG_DEBUG, + "%"PRId64": RMDIR %s", req_callid (req), + state->loc.path); + + FUSE_FOP (state, fuse_unlink_cbk, GF_FOP_RMDIR, + rmdir, &state->loc); + + return; +} + + +static void +fuse_symlink (fuse_req_t req, + const char *linkname, + fuse_ino_t par, + const char *name) +{ + fuse_state_t *state; + int32_t ret = -1; + + state = state_from_req (req); + ret = fuse_loc_fill (&state->loc, state, 0, par, name); + if (ret < 0) { + gf_log ("glusterfs-fuse", GF_LOG_ERROR, + "%"PRId64" SYMLINK %s -> %s (fuse_loc_fill() failed)", + req_callid (req), state->loc.path, linkname); + fuse_reply_err (req, EINVAL); + free_state (state); + return; + } + + state->loc.inode = inode_new (state->itable); + + gf_log ("glusterfs-fuse", GF_LOG_DEBUG, + "%"PRId64": SYMLINK %s -> %s", req_callid (req), + state->loc.path, linkname); + + FUSE_FOP (state, fuse_entry_cbk, GF_FOP_SYMLINK, + symlink, linkname, &state->loc); + + return; +} + + +int 
+fuse_rename_cbk (call_frame_t *frame, + void *cookie, + xlator_t *this, + int32_t op_ret, + int32_t op_errno, + struct stat *buf) +{ + fuse_state_t *state = frame->root->state; + fuse_req_t req = state->req; + + if (op_ret == 0) { + gf_log ("glusterfs-fuse", GF_LOG_DEBUG, + "%"PRId64": %s -> %s => 0 (buf->st_ino=%"PRId64" , loc->ino=%"PRId64")", + frame->root->unique, state->loc.path, state->loc2.path, + buf->st_ino, state->loc.ino); + + { + /* ugly ugly - to stay blind to situation where + rename happens on a new inode + */ + buf->st_ino = state->loc.ino; + buf->st_mode = state->loc.inode->st_mode; + } + inode_rename (state->itable, + state->loc.parent, state->loc.name, + state->loc2.parent, state->loc2.name, + state->loc.inode, buf); + + fuse_reply_err (req, 0); + } else { + gf_log ("glusterfs-fuse", + (op_errno == ENOTEMPTY ? GF_LOG_DEBUG : GF_LOG_ERROR), + "%"PRId64": %s -> %s => -1 (%s)", frame->root->unique, + state->loc.path, state->loc2.path, + strerror (op_errno)); + fuse_reply_err (req, op_errno); + } + + free_state (state); + STACK_DESTROY (frame->root); + return 0; +} + + +static void +fuse_rename (fuse_req_t req, + fuse_ino_t oldpar, + const char *oldname, + fuse_ino_t newpar, + const char *newname) +{ + fuse_state_t *state; + int32_t ret = -1; + + state = state_from_req (req); + + ret = fuse_loc_fill (&state->loc, state, 0, oldpar, oldname); + if ((state->loc.inode == NULL) || + (ret < 0)) { + gf_log ("glusterfs-fuse", GF_LOG_ERROR, + "for %s %"PRId64": RENAME `%s' -> `%s' (fuse_loc_fill() failed)", + state->loc.path, req_callid (req), state->loc.path, + state->loc2.path); + + fuse_reply_err (req, EINVAL); + free_state (state); + return; + } + + ret = fuse_loc_fill (&state->loc2, state, 0, newpar, newname); + if (ret < 0) { + gf_log ("glusterfs-fuse", GF_LOG_ERROR, + "for %s %"PRId64": RENAME `%s' -> `%s' (fuse_loc_fill() failed)", + state->loc.path, req_callid (req), state->loc.path, + state->loc2.path); + + fuse_reply_err (req, EINVAL); + 
free_state (state); + return; + } + + gf_log ("glusterfs-fuse", GF_LOG_DEBUG, + "%"PRId64": RENAME `%s (%"PRId64")' -> `%s (%"PRId64")'", + req_callid (req), state->loc.path, state->loc.ino, + state->loc2.path, state->loc2.ino); + + FUSE_FOP (state, fuse_rename_cbk, GF_FOP_RENAME, + rename, &state->loc, &state->loc2); + + return; +} + + +static void +fuse_link (fuse_req_t req, + fuse_ino_t ino, + fuse_ino_t par, + const char *name) +{ + fuse_state_t *state; + int32_t ret = -1; + + state = state_from_req (req); + + ret = fuse_loc_fill (&state->loc, state, 0, par, name); + ret = fuse_loc_fill (&state->loc2, state, ino, 0, NULL); + + if ((state->loc2.inode == NULL) || + (ret < 0)) { + gf_log ("glusterfs-fuse", GF_LOG_ERROR, + "fuse_loc_fill() failed for %s %"PRId64": LINK %s %s", + state->loc2.path, req_callid (req), + state->loc2.path, state->loc.path); + fuse_reply_err (req, EINVAL); + free_state (state); + return; + } + + state->loc.inode = inode_ref (state->loc2.inode); + gf_log ("glusterfs-fuse", GF_LOG_DEBUG, + "%"PRId64": LINK() %s (%"PRId64") -> %s (%"PRId64")", + req_callid (req), state->loc2.path, state->loc2.ino, + state->loc.path, state->loc.ino); + + FUSE_FOP (state, fuse_entry_cbk, GF_FOP_LINK, + link, &state->loc2, &state->loc); + + return; +} + + +static int +fuse_create_cbk (call_frame_t *frame, + void *cookie, + xlator_t *this, + int32_t op_ret, + int32_t op_errno, + fd_t *fd, + inode_t *inode, + struct stat *buf) +{ + fuse_state_t *state = frame->root->state; + fuse_req_t req = state->req; + fuse_private_t *priv = this->private; + + struct fuse_file_info fi = {0, }; + struct fuse_entry_param e = {0, }; + + fi.flags = state->flags; + if (op_ret >= 0) { + fi.fh = (unsigned long) fd; + + if ((fi.flags & 3) && priv->direct_io_mode) + fi.direct_io = 1; + + gf_log ("glusterfs-fuse", GF_LOG_DEBUG, + "%"PRId64": %s() %s => %p (ino=%"PRId64")", + frame->root->unique, gf_fop_list[frame->root->op], + state->loc.path, fd, buf->st_ino); + + e.ino = buf->st_ino; 
+ +#ifdef GF_DARWIN_HOST_OS + e.generation = 0; +#else + e.generation = buf->st_ctime; +#endif + + e.entry_timeout = priv->entry_timeout; + e.attr_timeout = priv->attribute_timeout; + e.attr = *buf; + e.attr.st_blksize = BIG_FUSE_CHANNEL_SIZE; + + fi.keep_cache = 0; + + inode_link (inode, state->loc.parent, + state->loc.name, buf); + + inode_lookup (inode); + + fd_ref (fd); + if (fuse_reply_create (req, &e, &fi) == -ENOENT) { + gf_log ("glusterfs-fuse", GF_LOG_WARNING, + "create() got EINTR"); + inode_forget (inode, 1); + fd_unref (fd); + goto out; + } + + fd_bind (fd); + } else { + gf_log ("glusterfs-fuse", GF_LOG_ERROR, + "%"PRId64": %s => -1 (%s)", req_callid (req), + state->loc.path, strerror (op_errno)); + fuse_reply_err (req, op_errno); + } +out: + free_state (state); + STACK_DESTROY (frame->root); + + return 0; +} + + +static void +fuse_create (fuse_req_t req, + fuse_ino_t par, + const char *name, + mode_t mode, + struct fuse_file_info *fi) +{ + fuse_state_t *state; + fd_t *fd; + int32_t ret = -1; + + state = state_from_req (req); + state->flags = fi->flags; + + ret = fuse_loc_fill (&state->loc, state, 0, par, name); + if (ret < 0) { + gf_log ("glusterfs-fuse", GF_LOG_ERROR, + "%"PRId64" CREATE %s (fuse_loc_fill() failed)", + req_callid (req), state->loc.path); + fuse_reply_err (req, EINVAL); + free_state (state); + return; + } + + state->loc.inode = inode_new (state->itable); + + fd = fd_create (state->loc.inode, get_pid_from_req (req)); + state->fd = fd; + fd->flags = state->flags; + + gf_log ("glusterfs-fuse", GF_LOG_DEBUG, + "%"PRId64": CREATE %s", req_callid (req), + state->loc.path); + + FUSE_FOP (state, fuse_create_cbk, GF_FOP_CREATE, + create, &state->loc, state->flags, mode, fd); + + return; +} + + +static void +fuse_open (fuse_req_t req, + fuse_ino_t ino, + struct fuse_file_info *fi) +{ + fuse_state_t *state; + fd_t *fd; + int32_t ret = -1; + + state = state_from_req (req); + state->flags = fi->flags; + + ret = fuse_loc_fill (&state->loc, state, 
ino, 0, NULL); + if ((state->loc.inode == NULL) || + (ret < 0)) { + gf_log ("glusterfs-fuse", GF_LOG_ERROR, + "%"PRId64": OPEN %s (fuse_loc_fill() failed)", + req_callid (req), state->loc.path); + + fuse_reply_err (req, EINVAL); + free_state (state); + return; + } + + + fd = fd_create (state->loc.inode, get_pid_from_req (req)); + state->fd = fd; + fd->flags = fi->flags; + + gf_log ("glusterfs-fuse", GF_LOG_DEBUG, + "%"PRId64": OPEN %s", req_callid (req), + state->loc.path); + + FUSE_FOP (state, fuse_fd_cbk, GF_FOP_OPEN, + open, &state->loc, fi->flags, fd); + + return; +} + + +static int +fuse_readv_cbk (call_frame_t *frame, + void *cookie, + xlator_t *this, + int32_t op_ret, + int32_t op_errno, + struct iovec *vector, + int32_t count, + struct stat *stbuf) +{ + fuse_state_t *state = frame->root->state; + fuse_req_t req = state->req; + + if (op_ret >= 0) { + gf_log ("glusterfs-fuse", GF_LOG_DEBUG, + "%"PRId64": READ => %d/%"GF_PRI_SIZET",%"PRId64"/%"PRId64, + frame->root->unique, + op_ret, state->size, state->off, stbuf->st_size); + + fuse_reply_vec (req, vector, count); + } else { + gf_log ("glusterfs-fuse", GF_LOG_ERROR, + "%"PRId64": READ => %d (%s)", frame->root->unique, + op_ret, strerror (op_errno)); + + fuse_reply_err (req, op_errno); + } + + free_state (state); + STACK_DESTROY (frame->root); + + return 0; +} + +static void +fuse_readv (fuse_req_t req, + fuse_ino_t ino, + size_t size, + off_t off, + struct fuse_file_info *fi) +{ + fuse_state_t *state; + fd_t *fd = NULL; + state = state_from_req (req); + state->size = size; + state->off = off; + + fd = FI_TO_FD (fi); + state->fd = fd; + + gf_log ("glusterfs-fuse", GF_LOG_DEBUG, + "%"PRId64": READ (%p, size=%"GF_PRI_SIZET", offset=%"PRId64")", + req_callid (req), fd, size, off); + + FUSE_FOP (state, fuse_readv_cbk, GF_FOP_READ, + readv, fd, size, off); + +} + + +static int +fuse_writev_cbk (call_frame_t *frame, + void *cookie, + xlator_t *this, + int32_t op_ret, + int32_t op_errno, + struct stat *stbuf) +{ + 
fuse_state_t *state = frame->root->state; + fuse_req_t req = state->req; + + if (op_ret >= 0) { + gf_log ("glusterfs-fuse", GF_LOG_DEBUG, + "%"PRId64": WRITE => %d/%"GF_PRI_SIZET",%"PRId64"/%"PRId64, + frame->root->unique, + op_ret, state->size, state->off, stbuf->st_size); + + fuse_reply_write (req, op_ret); + } else { + gf_log ("glusterfs-fuse", GF_LOG_ERROR, + "%"PRId64": WRITE => -1 (%s)", frame->root->unique, + strerror(op_errno)); + + fuse_reply_err (req, op_errno); + } + + free_state (state); + STACK_DESTROY (frame->root); + + return 0; +} + + +static void +fuse_write (fuse_req_t req, + fuse_ino_t ino, + const char *buf, + size_t size, + off_t off, + struct fuse_file_info *fi) +{ + fuse_state_t *state; + struct iovec vector; + fd_t *fd = NULL; + + state = state_from_req (req); + state->size = size; + state->off = off; + fd = FI_TO_FD (fi); + state->fd = fd; + vector.iov_base = (void *)buf; + vector.iov_len = size; + + gf_log ("glusterfs-fuse", GF_LOG_DEBUG, + "%"PRId64": WRITE (%p, size=%"GF_PRI_SIZET", offset=%"PRId64")", + req_callid (req), fd, size, off); + + FUSE_FOP (state, fuse_writev_cbk, GF_FOP_WRITE, + writev, fd, &vector, 1, off); + return; +} + + +static void +fuse_flush (fuse_req_t req, + fuse_ino_t ino, + struct fuse_file_info *fi) +{ + fuse_state_t *state; + fd_t *fd = NULL; + + state = state_from_req (req); + fd = FI_TO_FD (fi); + state->fd = fd; + + gf_log ("glusterfs-fuse", GF_LOG_DEBUG, + "%"PRId64": FLUSH %p", req_callid (req), fd); + + FUSE_FOP (state, fuse_err_cbk, GF_FOP_FLUSH, + flush, fd); + + return; +} + + +static void +fuse_release (fuse_req_t req, + fuse_ino_t ino, + struct fuse_file_info *fi) +{ + fuse_state_t *state; + + state = state_from_req (req); + state->fd = FI_TO_FD (fi); + + gf_log ("glusterfs-fuse", GF_LOG_DEBUG, + "%"PRId64": RELEASE %p", req_callid (req), state->fd); + + fd_unref (state->fd); + + fuse_reply_err (req, 0); + + free_state (state); + return; +} + + +static void +fuse_fsync (fuse_req_t req, + fuse_ino_t 
ino, + int datasync, + struct fuse_file_info *fi) +{ + fuse_state_t *state; + fd_t *fd = NULL; + + state = state_from_req (req); + fd = FI_TO_FD (fi); + state->fd = fd; + + gf_log ("glusterfs-fuse", GF_LOG_DEBUG, + "%"PRId64": FSYNC %p", req_callid (req), fd); + + FUSE_FOP (state, fuse_err_cbk, GF_FOP_FSYNC, + fsync, fd, datasync); + + return; +} + + +static void +fuse_opendir (fuse_req_t req, + fuse_ino_t ino, + struct fuse_file_info *fi) +{ + fuse_state_t *state; + fd_t *fd; + int32_t ret = -1; + + state = state_from_req (req); + ret = fuse_loc_fill (&state->loc, state, ino, 0, NULL); + if ((state->loc.inode == NULL) || + (ret < 0)) { + gf_log ("glusterfs-fuse", GF_LOG_ERROR, + "%"PRId64": OPENDIR %s (fuse_loc_fill() failed)", + req_callid (req), state->loc.path); + + fuse_reply_err (req, EINVAL); + free_state (state); + return; + } + + fd = fd_create (state->loc.inode, get_pid_from_req (req)); + state->fd = fd; + + gf_log ("glusterfs-fuse", GF_LOG_DEBUG, + "%"PRId64": OPENDIR %s", req_callid (req), + state->loc.path); + + FUSE_FOP (state, fuse_fd_cbk, GF_FOP_OPENDIR, + opendir, &state->loc, fd); +} + +static int +fuse_readdir_cbk (call_frame_t *frame, + void *cookie, + xlator_t *this, + int32_t op_ret, + int32_t op_errno, + gf_dirent_t *entries) +{ + fuse_state_t *state = frame->root->state; + fuse_req_t req = state->req; + int size = 0; + int entry_size = 0; + char *buf = NULL; + gf_dirent_t *entry = NULL; + struct stat stbuf = {0, }; + + if (op_ret < 0) { + gf_log ("glusterfs-fuse", GF_LOG_ERROR, + "%"PRId64": READDIR => -1 (%s)", frame->root->unique, + strerror (op_errno)); + + fuse_reply_err (req, op_errno); + goto out; + } + + gf_log ("glusterfs-fuse", GF_LOG_DEBUG, + "%"PRId64": READDIR => %d/%"GF_PRI_SIZET",%"PRId64, + frame->root->unique, op_ret, state->size, state->off); + + list_for_each_entry (entry, &entries->list, list) { + size += fuse_dirent_size (strlen (entry->d_name)); + } + + buf = CALLOC (1, size); + if (!buf) { + gf_log ("glusterfs-fuse", 
GF_LOG_ERROR, + "%"PRId64": READDIR => -1 (%s)", frame->root->unique, + strerror (ENOMEM)); + fuse_reply_err (req, -ENOMEM); + goto out; + } + + size = 0; + list_for_each_entry (entry, &entries->list, list) { + stbuf.st_ino = entry->d_ino; + entry_size = fuse_dirent_size (strlen (entry->d_name)); + fuse_add_direntry (req, buf + size, entry_size, + entry->d_name, &stbuf, + entry->d_off); + size += entry_size; + } + + fuse_reply_buf (req, (void *)buf, size); + +out: + free_state (state); + STACK_DESTROY (frame->root); + if (buf) + FREE (buf); + return 0; + +} + +static void +fuse_readdir (fuse_req_t req, + fuse_ino_t ino, + size_t size, + off_t off, + struct fuse_file_info *fi) +{ + fuse_state_t *state; + fd_t *fd = NULL; + + state = state_from_req (req); + state->size = size; + state->off = off; + fd = FI_TO_FD (fi); + state->fd = fd; + + gf_log ("glusterfs-fuse", GF_LOG_DEBUG, + "%"PRId64": READDIR (%p, size=%"GF_PRI_SIZET", offset=%"PRId64")", + req_callid (req), fd, size, off); + + FUSE_FOP (state, fuse_readdir_cbk, GF_FOP_READDIR, + readdir, fd, size, off); +} + + +static void +fuse_releasedir (fuse_req_t req, + fuse_ino_t ino, + struct fuse_file_info *fi) +{ + fuse_state_t *state; + + state = state_from_req (req); + state->fd = FI_TO_FD (fi); + + gf_log ("glusterfs-fuse", GF_LOG_DEBUG, + "%"PRId64": RELEASEDIR %p", req_callid (req), state->fd); + + fd_unref (state->fd); + + fuse_reply_err (req, 0); + + free_state (state); + + return; +} + + +static void +fuse_fsyncdir (fuse_req_t req, + fuse_ino_t ino, + int datasync, + struct fuse_file_info *fi) +{ + fuse_state_t *state; + fd_t *fd = NULL; + + fd = FI_TO_FD (fi); + + state = state_from_req (req); + state->fd = fd; + + FUSE_FOP (state, fuse_err_cbk, GF_FOP_FSYNCDIR, + fsyncdir, fd, datasync); + + return; +} + + +static int +fuse_statfs_cbk (call_frame_t *frame, + void *cookie, + xlator_t *this, + int32_t op_ret, + int32_t op_errno, + struct statvfs *buf) +{ + fuse_state_t *state = frame->root->state; + 
fuse_req_t req = state->req; + + /* + Filesystems (like ZFS on solaris) reports + different ->f_frsize and ->f_bsize. Old coreutils + df tools use statfs() and do not see ->f_frsize. + the ->f_blocks, ->f_bavail and ->f_bfree are + w.r.t ->f_frsize and not ->f_bsize which makes the + df tools report wrong values. + + Scale the block counts to match ->f_bsize. + */ + /* TODO: with old coreutils, f_bsize is taken from stat()'s st_blksize + * so the df with old coreutils this wont work :( + */ + + if (op_ret == 0) { +#ifndef GF_DARWIN_HOST_OS + /* MacFUSE doesn't respect anyof these tweaks */ + buf->f_blocks *= buf->f_frsize; + buf->f_blocks /= BIG_FUSE_CHANNEL_SIZE; + + buf->f_bavail *= buf->f_frsize; + buf->f_bavail /= BIG_FUSE_CHANNEL_SIZE; + + buf->f_bfree *= buf->f_frsize; + buf->f_bfree /= BIG_FUSE_CHANNEL_SIZE; + + buf->f_frsize = buf->f_bsize = BIG_FUSE_CHANNEL_SIZE; +#endif /* GF_DARWIN_HOST_OS */ + fuse_reply_statfs (req, buf); + + } else { + gf_log ("glusterfs-fuse", GF_LOG_ERROR, + "%"PRId64": ERR => -1 (%s)", frame->root->unique, + strerror(op_errno)); + fuse_reply_err (req, op_errno); + } + + free_state (state); + STACK_DESTROY (frame->root); + + return 0; +} + + +static void +fuse_statfs (fuse_req_t req, + fuse_ino_t ino) +{ + fuse_state_t *state; + int32_t ret = -1; + + state = state_from_req (req); + ret = fuse_loc_fill (&state->loc, state, 1, 0, NULL); + if ((state->loc.inode == NULL) || + (ret < 0)) { + gf_log ("glusterfs-fuse", GF_LOG_ERROR, + "%"PRId64": STATFS (fuse_loc_fill() fail)", + req_callid (req)); + + fuse_reply_err (req, EINVAL); + free_state (state); + return; + } + + gf_log ("glusterfs-fuse", GF_LOG_DEBUG, + "%"PRId64": STATFS", req_callid (req)); + + FUSE_FOP (state, fuse_statfs_cbk, GF_FOP_STATFS, + statfs, &state->loc); +} + + +static void +fuse_setxattr (fuse_req_t req, + fuse_ino_t ino, + const char *name, + const char *value, + size_t size, + int flags) +{ + fuse_state_t *state; + char *dict_value = NULL; + int32_t ret = -1; + 
+#ifdef DISABLE_POSIX_ACL
+        if (!strncmp (name, "system.", 7)) {
+                fuse_reply_err (req, EOPNOTSUPP);
+                return;
+        }
+#endif
+
+        state = state_from_req (req);
+        state->size = size;
+        ret = fuse_loc_fill (&state->loc, state, ino, 0, NULL);
+        if ((state->loc.inode == NULL) ||
+            (ret < 0)) {
+                gf_log ("glusterfs-fuse", GF_LOG_ERROR,
+                        "%"PRId64": SETXATTR %s/%"PRId64" (%s) (fuse_loc_fill() failed)",
+                        req_callid (req),
+                        state->loc.path, (int64_t)ino, name);
+
+                fuse_reply_err (req, EINVAL);
+                free_state (state);
+                return;
+        }
+
+        state->dict = get_new_dict ();
+
+        dict_value = memdup (value, size);
+        dict_set (state->dict, (char *)name,
+                  data_from_dynptr ((void *)dict_value, size));
+        dict_ref (state->dict);
+
+        gf_log ("glusterfs-fuse", GF_LOG_DEBUG,
+                "%"PRId64": SETXATTR %s/%"PRId64" (%s)", req_callid (req),
+                state->loc.path, (int64_t)ino, name);
+
+        FUSE_FOP (state, fuse_err_cbk, GF_FOP_SETXATTR,
+                  setxattr, &state->loc, state->dict, flags);
+
+        return;
+}
+
+
+/*
+ * fuse_xattr_cbk - completion callback shared by GETXATTR and LISTXATTR.
+ *
+ * @frame:    call frame; frame->root->state carries the fuse_state_t
+ * @cookie:   unused
+ * @this:     fuse xlator (this->private is the fuse_private_t)
+ * @op_ret:   >= 0 on success, negative on failure
+ * @op_errno: errno describing the failure
+ * @dict:     attribute name -> value pairs returned by the child
+ *
+ * state->name != NULL means the request was a getxattr for that key,
+ * otherwise it was a listxattr.  state->size == 0 means the caller only
+ * asked for the required buffer length.
+ *
+ * Exactly one fuse reply is sent for the request on every path; the state
+ * and the call stack are always released before returning 0.
+ */
+static int
+fuse_xattr_cbk (call_frame_t *frame,
+                void *cookie,
+                xlator_t *this,
+                int32_t op_ret,
+                int32_t op_errno,
+                dict_t *dict)
+{
+        int need_to_free_dict = 0;
+        int32_t ret = op_ret;
+        char *value = "";
+        fuse_state_t *state = frame->root->state;
+        fuse_req_t req = state->req;
+
+#ifdef GF_DARWIN_HOST_OS
+        /* This is needed in MacFuse, where MacOSX Finder needs some specific
+         * keys to be supported from FS
+         */
+        int32_t dummy_ret = 0;
+        if (state->name) {
+                if (!dict) {
+                        dict = get_new_dict ();
+                        need_to_free_dict = 1;
+                }
+                dummy_ret = gf_compat_getxattr (state->name, dict);
+                if (dummy_ret != -1)
+                        ret = dummy_ret;
+        } else {
+                if (!dict) {
+                        dict = get_new_dict ();
+                        need_to_free_dict = 1;
+                }
+                dummy_ret = gf_compat_listxattr (ret, dict, state->size);
+                if (dummy_ret != -1)
+                        ret = dummy_ret;
+        }
+#endif /* DARWIN */
+
+        if (ret >= 0) {
+                gf_log ("glusterfs-fuse", GF_LOG_DEBUG,
+                        "%"PRId64": %s() %s => %d", frame->root->unique,
+                        gf_fop_list[frame->root->op], state->loc.path, op_ret);
+
+                /* if successful */
+                if (state->name) {
+                        /* if callback for getxattr */
+                        data_t *value_data = dict_get (dict, state->name);
+                        if (value_data) {
+                                ret = value_data->len; /* Don't return the value for '\0' */
+                                value = value_data->data;
+
+                                /* linux kernel limits the size of xattr value to 64k */
+                                if (ret > GLUSTERFS_XATTR_LEN_MAX) {
+                                        fuse_reply_err (req, E2BIG);
+                                } else if (state->size) {
+                                        /* if callback for getxattr and asks for value */
+                                        fuse_reply_buf (req, value, ret);
+                                } else {
+                                        /* if callback for getxattr and asks for value length only */
+                                        fuse_reply_xattr (req, ret);
+                                } /* if(ret >...)...else if...else */
+                        } else if (!strcmp (state->name, "user.glusterfs-booster-volfile")) {
+                                fuse_private_t *priv = this->private;
+
+                                /* The volfile is read once and cached in priv */
+                                if (!priv->volfile) {
+                                        int32_t fd = -1;
+                                        ssize_t nread = 0;
+                                        size_t left = 0;
+                                        size_t chunk = 0;
+                                        struct stat st;
+                                        char *file = NULL;
+
+                                        memset (&st, 0, sizeof (st));
+                                        fd = fileno (this->ctx->specfp);
+                                        if (fstat (fd, &st) != 0) {
+                                                gf_log (this->name,
+                                                        GF_LOG_ERROR,
+                                                        "fstat on fd (%d) failed (%s)", fd, strerror (errno));
+                                                fuse_reply_err (req, ENODATA);
+                                                /* the request is answered; do not
+                                                 * fall through to a second reply */
+                                                goto out;
+                                        }
+
+                                        file = CALLOC (1, st.st_size);
+                                        if (file == NULL) {
+                                                fuse_reply_err (req, ENOMEM);
+                                                goto out;
+                                        }
+
+                                        priv->volfile_size = st.st_size;
+                                        priv->volfile = file;
+                                        lseek (fd, 0, SEEK_SET);
+
+                                        /* never ask read() for more than what is
+                                         * left in the buffer, even if the file grew
+                                         * after fstat() */
+                                        left = st.st_size;
+                                        while (left > 0) {
+                                                chunk = (left < GF_UNIT_KB) ? left : GF_UNIT_KB;
+                                                nread = read (fd, file, chunk);
+                                                if (nread <= 0)
+                                                        break;
+                                                file += nread;
+                                                left -= nread;
+                                        }
+                                }
+
+                                if (priv->volfile_size > GLUSTERFS_XATTR_LEN_MAX) {
+                                        fuse_reply_err (req, E2BIG);
+                                } else if (state->size) {
+                                        /* if callback for getxattr and asks for value */
+                                        fuse_reply_buf (req, priv->volfile, priv->volfile_size);
+                                } else {
+                                        /* if callback for getxattr and asks for value length only */
+                                        fuse_reply_xattr (req, priv->volfile_size);
+                                } /* if(ret >...)...else if...else */
+                        } else if (!strcmp (state->name, "user.glusterfs-booster-path")) {
+                                if (state->size) {
+                                        fuse_reply_buf (req, state->loc.path, strlen (state->loc.path) + 1);
+                                } else {
+                                        fuse_reply_xattr (req, strlen (state->loc.path) + 1);
+                                }
+                        } else {
+                                fuse_reply_err (req, ENODATA);
+                        } /* if(value_data)...else */
+                } else {
+                        /* if callback for listxattr */
+                        int32_t len = 0;
+                        data_pair_t *trav = dict->members_list;
+                        while (trav) {
+                                len += strlen (trav->key) + 1;
+                                trav = trav->next;
+                        } /* while(trav) */
+                        value = alloca (len + 1);
+                        ERR_ABORT (value);
+                        len = 0;
+                        trav = dict->members_list;
+                        while (trav) {
+                                strcpy (value + len, trav->key);
+                                value[len + strlen(trav->key)] = '\0';
+                                len += strlen (trav->key) + 1;
+                                trav = trav->next;
+                        } /* while(trav) */
+                        if (state->size) {
+                                /* if callback for listxattr and asks for list of keys */
+                                fuse_reply_buf (req, value, len);
+                        } else {
+                                /* if callback for listxattr and asks for length of keys only */
+                                fuse_reply_xattr (req, len);
+                        } /* if(state->size)...else */
+                } /* if(state->name)...else */
+        } else {
+                /* if failure - no need to check if listxattr or getxattr */
+                if (op_errno != ENODATA) {
+                        if (op_errno == ENOTSUP)
+                        {
+                                /* rate-limited: log once every GF_UNIVERSAL_ANSWER hits */
+                                gf_fuse_xattr_enotsup_log++;
+                                if (!(gf_fuse_xattr_enotsup_log % GF_UNIVERSAL_ANSWER))
+                                        gf_log ("glusterfs-fuse", GF_LOG_ERROR,
+                                                "[ ERROR ] Extended attribute not supported by the backend storage");
+                        }
+                        else
+                        {
+                                gf_log ("glusterfs-fuse", GF_LOG_ERROR,
+                                        "%"PRId64": %s() %s => -1 (%s)",
+                                        frame->root->unique,
+                                        gf_fop_list[frame->root->op],
+                                        state->loc.path, strerror(op_errno));
+                        }
+                } else {
+                        gf_log ("glusterfs-fuse", GF_LOG_DEBUG,
+                                "%"PRId64": %s() %s => -1 (%s)",
+                                frame->root->unique,
+                                gf_fop_list[frame->root->op], state->loc.path,
+                                strerror(op_errno));
+                } /* if(op_errno!= ENODATA)...else */
+
+                fuse_reply_err (req, op_errno);
+        } /* if(op_ret>=0)...else */
+
+out:
+        if (need_to_free_dict)
+                dict_unref (dict);
+
+        free_state (state);
+        STACK_DESTROY (frame->root);
+
+        return 0;
+}
+
+
+static void
+fuse_getxattr (fuse_req_t req,
+               fuse_ino_t ino,
+               const char *name,
+               size_t size)
+{
+        fuse_state_t *state;
+        int32_t ret = -1;
+
+#ifdef DISABLE_POSIX_ACL
+        if (!strncmp (name, "system.", 7)) {
+                fuse_reply_err (req, ENODATA);
+                return;
+        }
+#endif
+
+        state = state_from_req (req);
+        state->size = size;
+
state->name = strdup (name); /* a non-NULL name makes fuse_xattr_cbk treat the reply as getxattr */
+
+        ret = fuse_loc_fill (&state->loc, state, ino, 0, NULL);
+        if ((state->loc.inode == NULL) ||
+            (ret < 0)) {
+                gf_log ("glusterfs-fuse", GF_LOG_ERROR,
+                        "%"PRId64": GETXATTR %s/%"PRId64" (%s) (fuse_loc_fill() failed)",
+                        req_callid (req), state->loc.path, (int64_t)ino, name);
+
+                fuse_reply_err (req, EINVAL);
+                free_state (state);
+                return;
+        }
+
+        gf_log ("glusterfs-fuse", GF_LOG_DEBUG,
+                "%"PRId64": GETXATTR %s/%"PRId64" (%s)", req_callid (req),
+                state->loc.path, (int64_t)ino, name);
+
+        FUSE_FOP (state, fuse_xattr_cbk, GF_FOP_GETXATTR,
+                  getxattr, &state->loc, name);
+
+        return;
+}
+
+/*
+ * fuse_listxattr - FUSE listxattr handler.
+ *
+ * Resolves @ino into state->loc; replies EINVAL and frees the state when
+ * that fails.  Otherwise winds a getxattr fop with a NULL key and lets
+ * fuse_xattr_cbk build the reply.  state->name is not set here, which is
+ * what fuse_xattr_cbk tests to pick its listxattr branch (assumes
+ * state_from_req() returns a zeroed state -- TODO confirm).
+ *
+ * @size: caller's buffer size, stored in state->size (0 = length query).
+ */
+static void
+fuse_listxattr (fuse_req_t req,
+                fuse_ino_t ino,
+                size_t size)
+{
+        fuse_state_t *state;
+        int32_t ret = -1;
+
+        state = state_from_req (req);
+        state->size = size;
+        ret = fuse_loc_fill (&state->loc, state, ino, 0, NULL);
+        if ((state->loc.inode == NULL) ||
+            (ret < 0)) {
+                gf_log ("glusterfs-fuse", GF_LOG_ERROR,
+                        "%"PRId64": LISTXATTR %s/%"PRId64" (fuse_loc_fill() failed)",
+                        req_callid (req), state->loc.path, (int64_t)ino);
+
+                fuse_reply_err (req, EINVAL);
+                free_state (state);
+                return;
+        }
+
+        gf_log ("glusterfs-fuse", GF_LOG_DEBUG,
+                "%"PRId64": LISTXATTR %s/%"PRId64, req_callid (req),
+                state->loc.path, (int64_t)ino);
+
+        FUSE_FOP (state, fuse_xattr_cbk, GF_FOP_GETXATTR,
+                  getxattr, &state->loc, NULL);
+
+        return;
+}
+
+/*
+ * fuse_removexattr - FUSE removexattr handler.
+ *
+ * Resolves @ino into state->loc (EINVAL reply on failure) and winds a
+ * removexattr fop for @name; fuse_err_cbk sends the reply.
+ */
+static void
+fuse_removexattr (fuse_req_t req,
+                  fuse_ino_t ino,
+                  const char *name)
+
+{
+        fuse_state_t *state;
+        int32_t ret = -1;
+
+        state = state_from_req (req);
+        ret = fuse_loc_fill (&state->loc, state, ino, 0, NULL);
+        if ((state->loc.inode == NULL) ||
+            (ret < 0)) {
+                gf_log ("glusterfs-fuse", GF_LOG_ERROR,
+                        "%"PRId64": REMOVEXATTR %s/%"PRId64" (%s) (fuse_loc_fill() failed)",
+                        req_callid (req), state->loc.path, (int64_t)ino, name);
+
+                fuse_reply_err (req, EINVAL);
+                free_state (state);
+                return;
+        }
+
+        gf_log ("glusterfs-fuse", GF_LOG_DEBUG,
+                "%"PRId64": REMOVEXATTR %s/%"PRId64" (%s)",
req_callid (req),
+                state->loc.path, (int64_t)ino, name);
+
+        FUSE_FOP (state, fuse_err_cbk, GF_FOP_REMOVEXATTR,
+                  removexattr, &state->loc, name);
+
+        return;
+}
+
+/* Counts ENOSYS replies to lock fops; used by the getlk and setlk
+ * callbacks to log the "load posix-locks" hint only once every
+ * GF_UNIVERSAL_ANSWER occurrences. */
+static int gf_fuse_lk_enosys_log;
+/*
+ * fuse_getlk_cbk - completion callback for F_GETLK.
+ *
+ * On op_ret == 0 replies with @lock through fuse_reply_lock(); otherwise
+ * logs (rate-limited for ENOSYS) and replies with op_errno.  The state and
+ * the call stack are released in both cases.
+ */
+static int
+fuse_getlk_cbk (call_frame_t *frame,
+                void *cookie,
+                xlator_t *this,
+                int32_t op_ret,
+                int32_t op_errno,
+                struct flock *lock)
+{
+        fuse_state_t *state = frame->root->state;
+
+        if (op_ret == 0) {
+                gf_log ("glusterfs-fuse", GF_LOG_DEBUG,
+                        "%"PRId64": ERR => 0", frame->root->unique);
+                fuse_reply_lock (state->req, lock);
+        } else {
+                if (op_errno == ENOSYS) {
+                        gf_fuse_lk_enosys_log++;
+                        if (!(gf_fuse_lk_enosys_log % GF_UNIVERSAL_ANSWER)) {
+                                gf_log ("glusterfs-fuse", GF_LOG_ERROR,
+                                        "[ ERROR ] loading 'features/posix-locks' on server side may help your application");
+                        }
+                } else {
+                        gf_log ("glusterfs-fuse", GF_LOG_ERROR,
+                                "%"PRId64": ERR => -1 (%s)",
+                                frame->root->unique, strerror (op_errno));
+                }
+                fuse_reply_err (state->req, op_errno);
+        }
+
+        free_state (state);
+        STACK_DESTROY (frame->root);
+
+        return 0;
+}
+
+/*
+ * fuse_getlk - FUSE getlk handler: winds an lk fop with F_GETLK on the fd
+ * taken from @fi; fuse_getlk_cbk sends the reply.  @ino is not used.
+ */
+static void
+fuse_getlk (fuse_req_t req,
+            fuse_ino_t ino,
+            struct fuse_file_info *fi,
+            struct flock *lock)
+{
+        fuse_state_t *state;
+        fd_t *fd = NULL;
+
+        fd = FI_TO_FD (fi);
+        state = state_from_req (req);
+        state->req = req;
+        state->fd = fd;
+
+        gf_log ("glusterfs-fuse", GF_LOG_DEBUG,
+                "%"PRId64": GETLK %p", req_callid (req), fd);
+
+        FUSE_FOP (state, fuse_getlk_cbk, GF_FOP_LK,
+                  lk, fd, F_GETLK, lock);
+
+        return;
+}
+
+/*
+ * fuse_setlk_cbk - completion callback for F_SETLK / F_SETLKW.
+ *
+ * Replies 0 on success, op_errno otherwise.  EAGAIN failures are logged at
+ * DEBUG level, other errors at ERROR (ENOSYS is rate-limited).
+ */
+static int
+fuse_setlk_cbk (call_frame_t *frame,
+                void *cookie,
+                xlator_t *this,
+                int32_t op_ret,
+                int32_t op_errno,
+                struct flock *lock)
+{
+        fuse_state_t *state = frame->root->state;
+
+        if (op_ret == 0) {
+                gf_log ("glusterfs-fuse", GF_LOG_DEBUG,
+                        "%"PRId64": ERR => 0", frame->root->unique);
+                fuse_reply_err (state->req, 0);
+        } else {
+                if (op_errno == ENOSYS) {
+                        gf_fuse_lk_enosys_log++;
+                        if (!(gf_fuse_lk_enosys_log % GF_UNIVERSAL_ANSWER)) {
+                                gf_log ("glusterfs-fuse", GF_LOG_ERROR,
+                                        "[
ERROR ] loading 'features/posix-locks' on server side may help your application");
+                        }
+                } else {
+                        gf_log ("glusterfs-fuse",
+                                (op_errno == EAGAIN) ? GF_LOG_DEBUG : GF_LOG_ERROR,
+                                "%"PRId64": ERR => -1 (%s)",
+                                frame->root->unique, strerror (op_errno));
+                }
+
+                fuse_reply_err (state->req, op_errno);
+        }
+
+        free_state (state);
+        STACK_DESTROY (frame->root);
+
+        return 0;
+}
+
+/*
+ * fuse_setlk - FUSE setlk handler.
+ *
+ * Winds an lk fop on the fd taken from @fi, using F_SETLKW when @sleep is
+ * non-zero and F_SETLK otherwise; fuse_setlk_cbk sends the reply.
+ */
+static void
+fuse_setlk (fuse_req_t req,
+            fuse_ino_t ino,
+            struct fuse_file_info *fi,
+            struct flock *lock,
+            int sleep)
+{
+        fuse_state_t *state;
+        fd_t *fd = NULL;
+
+        fd = FI_TO_FD (fi);
+        state = state_from_req (req);
+        state->req = req;
+        state->fd = fd;
+
+        gf_log ("glusterfs-fuse", GF_LOG_DEBUG,
+                "%"PRId64": SETLK %p (sleep=%d)", req_callid (req), fd,
+                sleep);
+
+        FUSE_FOP (state, fuse_setlk_cbk, GF_FOP_LK,
+                  lk, fd, (sleep ? F_SETLKW : F_SETLK), lock);
+
+        return;
+}
+
+/*
+ * fuse_init - libfuse init hook.  @data is the xlator passed as userdata to
+ * fuse_lowlevel_new(); a fresh inode table is created for it.  @conn is
+ * not used.
+ */
+static void
+fuse_init (void *data, struct fuse_conn_info *conn)
+{
+        xlator_t *this_xl = NULL;
+
+        if (data == NULL) {
+                return ;
+        }
+
+        this_xl = data;
+
+        this_xl->itable = inode_table_new (0, this_xl);
+
+        return ;
+}
+/* fuse_destroy - libfuse destroy hook; intentionally a no-op. */
+static void
+fuse_destroy (void *data)
+{
+
+}
+/* Dispatch table handed to fuse_lowlevel_new() in init(). */
+static struct fuse_lowlevel_ops fuse_ops = {
+        .init         = fuse_init,
+        .destroy      = fuse_destroy,
+        .lookup       = fuse_lookup,
+        .forget       = fuse_forget,
+        .getattr      = fuse_getattr,
+        .setattr      = fuse_setattr,
+        .opendir      = fuse_opendir,
+        .readdir      = fuse_readdir,
+        .releasedir   = fuse_releasedir,
+        .access       = fuse_access,
+        .readlink     = fuse_readlink,
+        .mknod        = fuse_mknod,
+        .mkdir        = fuse_mkdir,
+        .unlink       = fuse_unlink,
+        .rmdir        = fuse_rmdir,
+        .symlink      = fuse_symlink,
+        .rename       = fuse_rename,
+        .link         = fuse_link,
+        .create       = fuse_create,
+        .open         = fuse_open,
+        .read         = fuse_readv,
+        .write        = fuse_write,
+        .flush        = fuse_flush,
+        .release      = fuse_release,
+        .fsync        = fuse_fsync,
+        .fsyncdir     = fuse_fsyncdir,
+        .statfs       = fuse_statfs,
+        .setxattr     = fuse_setxattr,
+        .getxattr     = fuse_getxattr,
+        .listxattr    = fuse_listxattr,
+        .removexattr  = fuse_removexattr,
+
.getlk = fuse_getlk, + .setlk = fuse_setlk +}; + + +static void * +fuse_thread_proc (void *data) +{ + char *mount_point = NULL; + xlator_t *this = data; + fuse_private_t *priv = this->private; + int32_t res = 0; + data_t *buf = priv->buf; + int32_t ref = 0; + size_t chan_size = fuse_chan_bufsize (priv->ch); + char *recvbuf = CALLOC (1, chan_size); + ERR_ABORT (recvbuf); + + while (!fuse_session_exited (priv->se)) { + + + res = fuse_chan_receive (priv->ch, + recvbuf, + chan_size); + + if (res == -1) { + if (errno != EINTR) { + gf_log ("glusterfs-fuse", GF_LOG_ERROR, + "fuse_chan_receive() returned -1 (%d)", errno); + } + if (errno == ENODEV) + break; + continue; + } + + buf = priv->buf; + + if (res && res != -1) { + if (buf->len < (res)) { + if (buf->data) { + FREE (buf->data); + buf->data = NULL; + } + buf->data = CALLOC (1, res); + ERR_ABORT (buf->data); + buf->len = res; + } + memcpy (buf->data, recvbuf, res); // evil evil + + fuse_session_process (priv->se, + buf->data, + res, + priv->ch); + } + + LOCK (&buf->lock); + ref = buf->refcount; + UNLOCK (&buf->lock); + if (1) { + data_unref (buf); + + priv->buf = data_ref (data_from_dynptr (NULL, 0)); + } + } + if (dict_get (this->options, ZR_MOUNTPOINT_OPT)) + mount_point = data_to_str (dict_get (this->options, + ZR_MOUNTPOINT_OPT)); + if (mount_point) { + gf_log (this->name, GF_LOG_WARNING, + "unmounting %s", mount_point); + dict_del (this->options, ZR_MOUNTPOINT_OPT); + } + fuse_session_remove_chan (priv->ch); + fuse_session_destroy (priv->se); + // fuse_unmount (priv->mount_point, priv->ch); + + raise (SIGTERM); + + return NULL; +} + + +int32_t +notify (xlator_t *this, int32_t event, + void *data, ...) +{ + + switch (event) + { + case GF_EVENT_CHILD_UP: + +#ifndef GF_DARWIN_HOST_OS + /* + * This is because macfuse sends statfs() once the fuse thread + * gets activated, and by that time if the client is not + * connected, it give 'Device not configured' error. 
Hence, + * create thread only when client sends CHILD_UP (ie, client + * is connected). + */ + + /* TODO: somehow, try to get the mountpoint active as soon as + * init() is complete, so that the hang effect when the + * server is not not started is removed. + */ + + /* This code causes problem with 'automount' too */ + /* case GF_EVENT_CHILD_CONNECTING: */ +#endif /* DARWIN */ + + { + fuse_private_t *private = this->private; + int32_t ret = 0; + + if (!private->fuse_thread_started) + { + private->fuse_thread_started = 1; + + ret = pthread_create (&private->fuse_thread, NULL, + fuse_thread_proc, this); + + if (ret != 0) + gf_log ("glusterfs-fuse", GF_LOG_ERROR, + "pthread_create() failed (%s)", strerror (errno)); + assert (ret == 0); + } + break; + } + case GF_EVENT_PARENT_UP: + { + default_notify (this, GF_EVENT_PARENT_UP, data); + } + default: + break; + } + return 0; +} + +int +init (xlator_t *this_xl) +{ + int ret = 0; + dict_t *options = NULL; + char *value_string = NULL; + fuse_private_t *priv = NULL; + struct stat stbuf = {0,}; + +#ifdef GF_DARWIN_HOST_OS + int fuse_argc = 9; + char *fuse_argv[] = {"glusterfs", + "-o", "allow_other", + "-o", "default_permissions", + "-o", "fsname=glusterfs", + "-o", "local", + NULL}; + +#elif GF_LINUX_HOST_OS /* ! DARWIN_OS */ + int fuse_argc = 19; + + char *fuse_argv[] = {"glusterfs", + "-o", "nonempty", + "-o", "max_readahead=1048576", + "-o", "max_read=1048576", + "-o", "max_write=1048576", + "-o", "allow_other", + "-o", "default_permissions", + "-o", "fsname=glusterfs", + "-o", "dev", + "-o", "suid", + NULL}; + +#else /* BSD || SOLARIS */ + /* BSD fuse doesn't support '-o dev', '-o nonempty' option */ + int fuse_argc = 15; + + char *fuse_argv[] = {"glusterfs", + "-o", "max_readahead=1048576", + "-o", "max_read=1048576", + "-o", "max_write=1048576", + "-o", "allow_other", + "-o", "default_permissions", + "-o", "fsname=glusterfs", + "-o", "suid", + NULL}; + +#endif /* ! DARWIN_OS || ! 
LINUX */ + struct fuse_args args = FUSE_ARGS_INIT (fuse_argc, fuse_argv); + + if (this_xl == NULL) + return -1; + + if (this_xl->options == NULL) + return -1; + + options = this_xl->options; + + if (this_xl->name == NULL) + this_xl->name = strdup ("fuse"); + + priv = CALLOC (1, sizeof (*priv)); + ERR_ABORT (priv); + this_xl->private = (void *) priv; + + +#ifdef GF_DARWIN_HOST_OS + if (dict_get (options, "macfuse-local")) { + /* This way, GlusterFS will be detected as 'servers' instead + * of 'devices'. This method is useful if you want to do + * 'umount <mount_point>' over network, instead of 'eject'ing + * it from desktop. Works better for servers + */ + /* Make the '-o local' in argv as NULL, so that its not + in effect */ + fuse_argv[--args.argc] = NULL; + fuse_argv[--args.argc] = NULL; + } +#endif /* ! DARWIN */ + + /* get options from option dictionary */ + ret = dict_get_str (options, ZR_MOUNTPOINT_OPT, &value_string); + if (value_string == NULL) { + gf_log ("fuse", GF_LOG_ERROR, + "mandatory option mountpoint is not specified"); + return -1; + } + + if (stat (value_string, &stbuf) != 0) { + if (errno == ENOENT) { + gf_log (this_xl->name, GF_LOG_ERROR , + "%s %s does not exist", + ZR_MOUNTPOINT_OPT, value_string); + } else if (errno == ENOTCONN) { + gf_log (this_xl->name, GF_LOG_ERROR , + "mountpoint %s seems to have a stale " + "mount, run 'umount %s' and try again", + value_string, value_string); + } else { + gf_log (this_xl->name, GF_LOG_ERROR , + "%s %s : stat returned %s", + ZR_MOUNTPOINT_OPT, + value_string, strerror (errno)); + } + return -1; + } + + if (S_ISDIR (stbuf.st_mode) == 0) { + gf_log (this_xl->name, GF_LOG_ERROR , + "%s %s is not a directory", + ZR_MOUNTPOINT_OPT, value_string); + return -1; + } + priv->mount_point = strdup (value_string); + + + ret = dict_get_uint32 (options, "attribute-timeout", + &priv->attribute_timeout); + if (!priv->attribute_timeout) + priv->attribute_timeout = 1; /* default */ + + ret = dict_get_uint32 (options, 
"entry-timeout", + &priv->entry_timeout); + if (!priv->entry_timeout) + priv->entry_timeout = 1; /* default */ + + + priv->direct_io_mode = 1; + ret = dict_get_str (options, ZR_DIRECT_IO_OPT, &value_string); + if (value_string) { + ret = gf_string2boolean (value_string, &priv->direct_io_mode); + } + + priv->ch = fuse_mount (priv->mount_point, &args); + if (priv->ch == NULL) { + if (errno == ENOTCONN) { + gf_log ("glusterfs-fuse", GF_LOG_ERROR, + "A stale mount present on %s. " + "run 'umount %s' and try again", + priv->mount_point, + priv->mount_point); + } else { + if (errno == ENOENT) { + gf_log ("glusterfs-fuse", GF_LOG_ERROR, + "unable to mount on %s. run " + "'modprobe fuse' and try again", + priv->mount_point); + } else { + gf_log ("glusterfs-fuse", GF_LOG_ERROR, + "fuse_mount() failed with error %s " + "on mount point %s", + strerror (errno), + priv->mount_point); + } + } + + goto cleanup_exit; + } + + priv->se = fuse_lowlevel_new (&args, &fuse_ops, + sizeof (fuse_ops), this_xl); + if (priv->se == NULL) { + gf_log ("glusterfs-fuse", GF_LOG_ERROR, + "fuse_lowlevel_new() failed with error %s on " + "mount point %s", + strerror (errno), priv->mount_point); + goto umount_exit; + } + + ret = fuse_set_signal_handlers (priv->se); + if (ret == -1) { + gf_log ("glusterfs-fuse", GF_LOG_ERROR, + "fuse_set_signal_handlers() failed on mount point %s", + priv->mount_point); + goto umount_exit; + } + + fuse_opt_free_args (&args); + + fuse_session_add_chan (priv->se, priv->ch); + + priv->fd = fuse_chan_fd (priv->ch); + priv->buf = data_ref (data_from_dynptr (NULL, 0)); + + this_xl->ctx->top = this_xl; + return 0; + +umount_exit: + fuse_unmount (priv->mount_point, priv->ch); +cleanup_exit: + fuse_opt_free_args (&args); + FREE (priv->mount_point); + FREE (priv); + return -1; +} + + +void +fini (xlator_t *this_xl) +{ + fuse_private_t *priv = NULL; + char *mount_point = NULL; + + if (this_xl == NULL) + return; + + if ((priv = this_xl->private) == NULL) + return; + + if 
(dict_get (this_xl->options, ZR_MOUNTPOINT_OPT)) + mount_point = data_to_str (dict_get (this_xl->options, + ZR_MOUNTPOINT_OPT)); + if (mount_point != NULL) { + gf_log (this_xl->name, GF_LOG_WARNING, + "unmounting '%s'", mount_point); + + dict_del (this_xl->options, ZR_MOUNTPOINT_OPT); + fuse_session_exit (priv->se); + fuse_unmount (mount_point, priv->ch); + } +} + +struct xlator_fops fops = { +}; + +struct xlator_cbks cbks = { +}; + +struct xlator_mops mops = { +}; + +struct volume_options options[] = { + { .key = {"direct-io-mode"}, + .type = GF_OPTION_TYPE_BOOL + }, + { .key = {"macfuse-local"}, + .type = GF_OPTION_TYPE_BOOL + }, + { .key = {"mountpoint", "mount-point"}, + .type = GF_OPTION_TYPE_PATH + }, + { .key = {"attribute-timeout"}, + .type = GF_OPTION_TYPE_TIME, + .min = 0, + .max = 3600 + }, + { .key = {"entry-timeout"}, + .type = GF_OPTION_TYPE_TIME, + .min = 0, + .max = 3600 + }, + { .key = {NULL} }, +}; diff --git a/xlators/mount/fuse/src/fuse-extra.c b/xlators/mount/fuse/src/fuse-extra.c new file mode 100644 index 000000000..93574d174 --- /dev/null +++ b/xlators/mount/fuse/src/fuse-extra.c @@ -0,0 +1,137 @@ +/* + Copyright (c) 2006, 2007, 2008 Z RESEARCH, Inc. <http://www.zresearch.com> + This file is part of GlusterFS. + + GlusterFS is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published + by the Free Software Foundation; either version 3 of the License, + or (at your option) any later version. + + GlusterFS is distributed in the hope that it will be useful, but + WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program. If not, see + <http://www.gnu.org/licenses/>. 
+*/
+
+#ifndef _CONFIG_H
+#define _CONFIG_H
+#include "config.h"
+#endif /* _CONFIG_H */
+
+#include "fuse-extra.h"
+#include "common-utils.h"
+#include <stdio.h>
+#include <pthread.h>
+#include <stdlib.h>
+#include <string.h>
+#include "common-utils.h" /* NOTE(review): duplicate of the include above */
+
+struct fuse_req;
+struct fuse_ll;
+/*
+ * Local definition of the request object behind fuse_req_t.
+ * NOTE(review): presumably a copy of libfuse's private struct and so must
+ * match the layout of the libfuse in use -- verify when upgrading.
+ */
+struct fuse_req {
+        struct fuse_ll *f;
+        uint64_t unique; /* returned by req_callid(), used as reply id */
+        int ctr; /* decremented in free_req(); request destroyed at 0 */
+        pthread_mutex_t lock;
+        struct fuse_ctx ctx;
+        struct fuse_chan *ch; /* channel the reply is written to */
+        int interrupted;
+        union {
+                struct {
+                        uint64_t unique;
+                } i;
+                struct {
+                        fuse_interrupt_func_t func;
+                        void *data;
+                } ni;
+        } u;
+        struct fuse_req *next;
+        struct fuse_req *prev;
+};
+/* Session-wide state reachable from every request through req->f;
+ * free_req() takes its lock while unlinking a request. */
+struct fuse_ll {
+        int debug;
+        int allow_root;
+        struct fuse_lowlevel_ops op;
+        int got_init;
+        void *userdata;
+        uid_t owner;
+        struct fuse_conn_info conn;
+        struct fuse_req list;
+        struct fuse_req interrupts;
+        pthread_mutex_t lock;
+        int got_destroy;
+};
+/* Header placed in front of every reply sent by fuse_reply_vec(). */
+struct fuse_out_header {
+        uint32_t len; /* total reply length, header included */
+        int32_t error;
+        uint64_t unique; /* copied from the request */
+};
+/* req_callid - returns the unique id of @req. */
+uint64_t req_callid (fuse_req_t req)
+{
+        return req->unique;
+}
+/* destroy_req - destroys the request's mutex and frees the request. */
+static void destroy_req(fuse_req_t req)
+{
+        pthread_mutex_destroy (&req->lock);
+        FREE (req);
+}
+/* list_del_req - unlinks @req from its doubly linked list; the caller is
+ * expected to hold the list owner's lock, as free_req() does. */
+static void list_del_req(struct fuse_req *req)
+{
+        struct fuse_req *prev = req->prev;
+        struct fuse_req *next = req->next;
+        prev->next = next;
+        next->prev = prev;
+}
+/*
+ * free_req - releases a request after its reply has been sent: clears the
+ * interrupt handler, unlinks the request under f->lock and destroys it
+ * when its counter drops to zero.
+ */
+static void
+free_req (fuse_req_t req)
+{
+        int ctr;
+        struct fuse_ll *f = req->f;
+
+        pthread_mutex_lock(&req->lock);
+        req->u.ni.func = NULL;
+        req->u.ni.data = NULL;
+        pthread_mutex_unlock(&req->lock);
+
+        pthread_mutex_lock(&f->lock);
+        list_del_req(req);
+        ctr = --req->ctr;
+        pthread_mutex_unlock(&f->lock);
+        if (!ctr)
+                destroy_req(req);
+}
+/*
+ * fuse_reply_vec - sends a successful reply made of @count iovec entries.
+ *
+ * A fuse_out_header (error = 0) is prepended as iov[0], the caller's
+ * vectors are copied after it, and the whole array is written to the
+ * request's channel.  The request is released afterwards, so @req must not
+ * be used again.  The iovec array lives on the stack (alloca).
+ *
+ * Returns the result of fuse_chan_send().
+ */
+int32_t
+fuse_reply_vec (fuse_req_t req,
+                struct iovec *vector,
+                int32_t count)
+{
+        int32_t error = 0;
+        struct fuse_out_header out;
+        struct iovec *iov;
+        int res;
+
+        iov = alloca ((count + 1) * sizeof (*vector));
+        out.unique = req->unique;
+        out.error = error;
+        iov[0].iov_base = &out;
+        iov[0].iov_len = sizeof(struct fuse_out_header);
+        memcpy (&iov[1], vector, count * sizeof (*vector));
+        count++; /* account for the header entry */
+        out.len = iov_length(iov, count);
+        res = fuse_chan_send(req->ch, iov, count);
+        free_req(req);
+
+        return res;
+}
diff --git a/xlators/mount/fuse/src/fuse-extra.h b/xlators/mount/fuse/src/fuse-extra.h
new file mode 100644
index 000000000..0e8052b5a
--- /dev/null
+++ b/xlators/mount/fuse/src/fuse-extra.h
@@ -0,0 +1,42 @@
+/*
+   Copyright (c) 2007, 2008 Z RESEARCH, Inc. <http://www.zresearch.com>
+   This file is part of GlusterFS.
+
+   GlusterFS is free software; you can redistribute it and/or modify
+   it under the terms of the GNU General Public License as published
+   by the Free Software Foundation; either version 3 of the License,
+   or (at your option) any later version.
+
+   GlusterFS is distributed in the hope that it will be useful, but
+   WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   General Public License for more details.
+
+   You should have received a copy of the GNU General Public License
+   along with this program.  If not, see
+   <http://www.gnu.org/licenses/>.
+*/
+
+#ifndef _FUSE_EXTRA_H
+#define _FUSE_EXTRA_H
+
+#ifndef _CONFIG_H
+#define _CONFIG_H
+#include "config.h"
+#endif /* _CONFIG_H */
+
+#include <stdlib.h>
+#include <fuse/fuse_lowlevel.h>
+/* Largest xattr value the fuse bridge will return (64 KiB). */
+#define GLUSTERFS_XATTR_LEN_MAX  65536
+/* Returns the unique id of a fuse request. */
+uint64_t req_callid (fuse_req_t req);
+
+size_t fuse_dirent_size (size_t dname_len);
+/* Sends a successful reply built from @count iovec entries and releases
+ * the request. */
+int32_t
+fuse_reply_vec (fuse_req_t req,
+                struct iovec *vector,
+                int32_t count);
+
+#endif /* _FUSE_EXTRA_H */
diff --git a/xlators/mount/fuse/utils/Makefile.am b/xlators/mount/fuse/utils/Makefile.am
new file mode 100644
index 000000000..1217c30da
--- /dev/null
+++ b/xlators/mount/fuse/utils/Makefile.am
@@ -0,0 +1,10 @@
+utildir = $(destdir)/sbin
+
+if GF_DARWIN_HOST_OS
+util_SCRIPTS = mount_glusterfs
+else
+util_SCRIPTS = mount.glusterfs
+endif
+
+CLEANFILES =
+
diff --git a/xlators/mount/fuse/utils/mount.glusterfs.in b/xlators/mount/fuse/utils/mount.glusterfs.in
new file mode 100755
index 000000000..481fd265f
--- /dev/null
+++ b/xlators/mount/fuse/utils/mount.glusterfs.in
@@ -0,0 +1,152 @@
+#!/bin/sh
+# (C) 2006, 2007, 2008 Z RESEARCH Inc. <http://www.zresearch.com>
+#
+# This program is free software; you can redistribute it and/or
+# modify it under the terms of the GNU General Public License as
+# published by the Free Software Foundation; either version 2 of
+# the License, or (at your option) any later version.
+#
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+# GNU General Public License for more details.
+#
+# You should have received a copy of the GNU General Public
+# License along with this program; if not, write to the Free
+# Software Foundation, Inc., 51 Franklin Street, Fifth Floor,
+# Boston, MA 02110-1301 USA
+
+# _init: defines the log level names and the default log level.
+# Arguments are accepted but not used.
+_init ()
+{
+    # log level definitions
+    LOG_NONE=NONE;
+    LOG_CRITICAL=CRITICAL;
+    LOG_ERROR=ERROR;
+    LOG_WARNING=WARNING;
+    LOG_DEBUG=DEBUG;
+
+    # set default log level to WARNING
+    log_level=$LOG_WARNING;
+}
+# start_glusterfs: builds the glusterfs command line from the variables set by main() and execs it.
+start_glusterfs ()
+{
+    prefix="@prefix@";
+    exec_prefix=@exec_prefix@;
+    cmd_line=$(echo "@sbindir@/glusterfs");
+
+    if [ -n "$log_level_str" ]; then
+        case "$log_level_str" in
+            "ERROR")
+                log_level=$LOG_ERROR;
+                ;;
+            "DEBUG")
+                log_level=$LOG_DEBUG;
+                ;;
+            "CRITICAL")
+                log_level=$LOG_CRITICAL;
+                ;;
+            "WARNING")
+                log_level=$LOG_WARNING;
+                ;;
+            "NONE")
+                log_level=$LOG_NONE;
+                ;;
+            *)
+                echo "invalid log level $log_level_str, using ERROR";
+                log_level=$LOG_ERROR;
+                ;;
+        esac
+    fi
+    cmd_line=$(echo "$cmd_line --log-level=$log_level");
+
+    if [ -n "$log_file" ]; then
+        cmd_line=$(echo "$cmd_line --log-file=$log_file");
+    fi
+
+    if [ -n "$direct_io_mode" ]; then
+        cmd_line=$(echo "$cmd_line --direct-io-mode=$direct_io_mode");
+    fi
+
+    if [ -z "$volfile_loc" ]; then
+        if [ -n "$transport" ]; then
+            cmd_line=$(echo "$cmd_line \
+--volfile-server=$server_ip \
+--volfile-server-port=$server_port \
+--volfile-server-transport=$transport");
+        else
+            cmd_line=$(echo "$cmd_line \
+--volfile-server=$server_ip \
+--volfile-server-port=$server_port");
+        fi
+    else
+        cmd_line=$(echo "$cmd_line --volfile=$volfile_loc");
+    fi
+
+    if [ -n "$volume_name" ]; then
+        cmd_line=$(echo "$cmd_line --volume-name=$volume_name");
+    fi
+
+    if [ -n "$volume_id" ]; then
+        cmd_line=$(echo "$cmd_line --volfile-id=$volume_id");
+    fi
+
+    cmd_line=$(echo "$cmd_line $mount_point");
+    exec $cmd_line; # unquoted on purpose: the string is split into arguments
+}
+
+# main: $1 is a readable volfile or server[:port], $2 the mount point; glusterfs options are taken from the -o list.
+main ()
+{
+    options=$(echo "$@" | sed -n 's/.*\-o[ ]*\([^ ]*\).*/\1/p');
+    new_log_level=$(echo "$options" | sed -n 's/.*log-level=\([^,]*\).*/\1/p');
+
+    [ -n "$new_log_level" ] && {
+        log_level_str="$new_log_level";
+    }
+    log_file=$(echo "$options" | sed -n 's/.*log-file=\([^,]*\).*/\1/p');
+
+    transport=$(echo "$options" | sed -n 's/.*transport=\([^,]*\).*/\1/p');
+
+    direct_io_mode=$(echo "$options" | sed -n 's/.*direct-io-mode=\([^,]*\).*/\1/p');
+
+    volume_name=$(echo "$options" | sed -n 's/.*volume-name=\([^,]*\).*/\1/p');
+
+    volume_id=$(echo "$options" | sed -n 's/.*volume-id=\([^,]*\).*/\1/p');
+
+    volfile_loc="$1";
+    # not a readable file: treat $1 as server[:port], default port 6996
+    [ -r "$volfile_loc" ] || {
+        server_ip=$(echo "$volfile_loc" | sed -n 's/\([^\:]*\).*/\1/p');
+        server_port=$(echo "$volfile_loc" | sed -n 's/.*:\([^ ]*\).*/\1/p');
+        [ -n "$server_port" ] || {
+            server_port="6996";
+        }
+
+        volfile_loc="";
+    }
+    new_fs_options=$(echo "$options" | sed -e 's/[,]*log-file=[^,]*//' \
+        -e 's/[,]*log-level=[^,]*//' \
+        -e 's/[,]*volume-name=[^,]*//' \
+        -e 's/[,]*direct-io-mode=[^,]*//' \
+        -e 's/[,]*transport=[^,]*//' \
+        -e 's/[,]*volume-id=[^,]*//');
+    # following line is product of love towards sed
+    # $2=$(echo "$@" | sed -n 's/[^ ]* \([^ ]*\).*/\1/p');
+
+    mount_point="$2";
+
+    # Simple check to avoid multiple identical mounts
+    if grep -q "glusterfs $mount_point fuse" /etc/mtab; then
+        echo "$0: according to mtab, GlusterFS is already mounted on $mount_point"
+        exit 1
+    fi
+    # NOTE(review): fs_options is computed but start_glusterfs never reads it
+    fs_options=$(echo "$fs_options,$new_fs_options");
+
+    start_glusterfs;
+}
+
+_init "$@" && main "$@";
diff --git a/xlators/mount/fuse/utils/mount_glusterfs.in b/xlators/mount/fuse/utils/mount_glusterfs.in
new file mode 100755
index 000000000..1376a8897
--- /dev/null
+++ b/xlators/mount/fuse/utils/mount_glusterfs.in
@@ -0,0 +1,181 @@
+#!/bin/sh
+# (C) 2008 Z RESEARCH Inc. <http://www.zresearch.com>
+#
+# This program is free software; you can redistribute it and/or
+# modify it under the terms of the GNU General Public License as
+# published by the Free Software Foundation; either version 2 of
+# the License, or (at your option) any later version.
+# +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public +# License along with this program; if not, write to the Free +# Software Foundation, Inc., 51 Franklin Street, Fifth Floor, +# Boston, MA 02110-1301 USA + + + +_init () +{ + # log level definitions + LOG_NONE=NONE; + LOG_CRITICAL=CRITICAL; + LOG_ERROR=ERROR; + LOG_WARNING=WARNING; + LOG_DEBUG=DEBUG; + + # set default log level to ERROR + log_level=$LOG_WARNING; +} + +start_glusterfs () +{ + prefix="@prefix@"; + exec_prefix=@exec_prefix@; + cmd_line=$(echo "@sbindir@/glusterfs"); + + if [ -n "$log_level_str" ]; then + case "$log_level_str" in + "ERROR") + log_level=$LOG_ERROR; + ;; + "DEBUG") + log_level=$LOG_DEBUG; + ;; + "CRITICAL") + log_level=$LOG_CRITICAL; + ;; + "WARNING") + log_level=$LOG_WARNING; + ;; + "NONE") + log_level=$LOG_NONE; + ;; + *) + echo "invalid log level $log_level_str, using ERROR"; + log_level=$LOG_WARNING; + ;; + esac + fi + cmd_line=$(echo "$cmd_line --log-level=$log_level"); + + if [ -n "$log_file" ]; then + cmd_line=$(echo "$cmd_line --log-file=$log_file"); + fi + + if [ -n "$direct_io_mode" ]; then + cmd_line=$(echo "$cmd_line --direct-io-mode=$direct_io_mode"); + fi + + if [ -z "$volfile_loc" ]; then + if [ -n "$transport" ]; then + cmd_line=$(echo "$cmd_line \ +--volfile-server=$server_ip \ +--volfile-server-port=$server_port \ +--volfile-server-transport=$transport"); + else + cmd_line=$(echo "$cmd_line \ +--volfile-server=$server_ip \ +--volfile-server-port=$server_port"); + fi + else + cmd_line=$(echo "$cmd_line --volfile=$volfile_loc"); + fi + + if [ -n "$volume_name" ]; then + cmd_line=$(echo "$cmd_line --volume-name=$volume_name"); + fi + + if [ -n "$volume_id" ]; then + cmd_line=$(echo "$cmd_line 
--volfile-id=$volume_id"); + fi + + cmd_line=$(echo "$cmd_line $mount_point"); + exec $cmd_line; +} + + +main () +{ + + new_log_level="" + log_file="" + transport="" + direct_io_mode="" + volume_name="" + new_fs_options="" + + while getopts o: opt; do + case "$opt" in + o) + options=$(echo $OPTARG | sed -n 's/.*\-o[ ]*\([^ ]*\).*/\1/p'); + [ -z $new_log_level ] && { + new_log_level=$(echo "$options" | sed -n 's/.*log-level=\([^,]*\).*/\1/p'); + } + + [ -z $log_file ] && { + log_file=$(echo "$options" | sed -n 's/.*log-file=\([^,]*\).*/\1/p'); + } + + [ -z $transport ] && { + transport=$(echo "$options" | sed -n 's/.*transport=\([^,]*\).*/\1/p'); + } + + [ -z $direct_io_mode ] && { + direct_io_mode=$(echo "$options" | sed -n 's/.*direct-io-mode=\([^,]*\).*/\1/p'); + } + + [ -z $volume_name ] && { + volume_name=$(echo "$options" | sed -n 's/.*volume-name=\([^,]*\).*/\1/p'); + } + + [ -z $volume_id ] && { + volume_id=$(echo "$options" | sed -n 's/.*volume-id=\([^,]*\).*/\1/p'); + } + + this_option=$(echo "$options" | sed -e 's/[,]*log-file=[^,]*//' \ + -e 's/[,]*log-level=[^,]*//' \ + -e 's/[,]*volume-name=[^,]*//' \ + -e 's/[,]*direct-io-mode=[^,]*//' \ + -e 's/[,]*transport=[^,]*//' \ + -e 's/[,]*volume-id=[^,]*//'); + new_fs_options="$new_fs_options $this_option"; + ;; + esac + done + + [ -n "$new_log_level" ] && { + log_level_str="$new_log_level"; + } + + # TODO: use getopt. 
This is very much darwin specific + volfile_loc="$1"; + while [ "$volfile_loc" == "-o" ] ; do + shift ; + shift ; + volfile_loc="$1"; + done + + [ -r "$volfile_loc" ] || { + server_ip=$(echo "$volfile_loc" | sed -n 's/\([^\:]*\).*/\1/p'); + server_port=$(echo "$volfile_loc" | sed -n 's/.*:\([^ ]*\).*/\1/p'); + [ -n "$server_port" ] || { + server_port="6996"; + } + + volfile_loc=""; + } + # following line is product of love towards sed + # $2=$(echo "$@" | sed -n 's/[^ ]* \([^ ]*\).*/\1/p'); + + mount_point="$2"; + + fs_options=$(echo "$fs_options,$new_fs_options"); + + start_glusterfs; +} + +_init "$@" && main "$@"; diff --git a/xlators/performance/Makefile.am b/xlators/performance/Makefile.am new file mode 100644 index 000000000..f7504bbe8 --- /dev/null +++ b/xlators/performance/Makefile.am @@ -0,0 +1,3 @@ +SUBDIRS = write-behind read-ahead io-threads io-cache symlink-cache + +CLEANFILES = diff --git a/xlators/performance/io-cache/Makefile.am b/xlators/performance/io-cache/Makefile.am new file mode 100644 index 000000000..d471a3f92 --- /dev/null +++ b/xlators/performance/io-cache/Makefile.am @@ -0,0 +1,3 @@ +SUBDIRS = src + +CLEANFILES = diff --git a/xlators/performance/io-cache/src/Makefile.am b/xlators/performance/io-cache/src/Makefile.am new file mode 100644 index 000000000..b1bf5bfbf --- /dev/null +++ b/xlators/performance/io-cache/src/Makefile.am @@ -0,0 +1,14 @@ +xlator_LTLIBRARIES = io-cache.la +xlatordir = $(libdir)/glusterfs/$(PACKAGE_VERSION)/xlator/performance + +io_cache_la_LDFLAGS = -module -avoidversion + +io_cache_la_SOURCES = io-cache.c page.c ioc-inode.c +io_cache_la_LIBADD = $(top_builddir)/libglusterfs/src/libglusterfs.la + +noinst_HEADERS = io-cache.h + +AM_CFLAGS = -fPIC -D_FILE_OFFSET_BITS=64 -D_GNU_SOURCE -Wall -D$(GF_HOST_OS) \ + -I$(top_srcdir)/libglusterfs/src -shared -nostartfiles $(GF_CFLAGS) + +CLEANFILES = diff --git a/xlators/performance/io-cache/src/io-cache.c b/xlators/performance/io-cache/src/io-cache.c new file mode 100644 index 
000000000..f367cdb88
--- /dev/null
+++ b/xlators/performance/io-cache/src/io-cache.c
@@ -0,0 +1,1478 @@
+/*
+   Copyright (c) 2007, 2008, 2009 Z RESEARCH, Inc. <http://www.zresearch.com>
+   This file is part of GlusterFS.
+
+   GlusterFS is free software; you can redistribute it and/or modify
+   it under the terms of the GNU General Public License as published
+   by the Free Software Foundation; either version 3 of the License,
+   or (at your option) any later version.
+
+   GlusterFS is distributed in the hope that it will be useful, but
+   WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   General Public License for more details.
+
+   You should have received a copy of the GNU General Public License
+   along with this program.  If not, see
+   <http://www.gnu.org/licenses/>.
+*/
+
+#ifndef _CONFIG_H
+#define _CONFIG_H
+#include "config.h"
+#endif
+
+#include "glusterfs.h"
+#include "logging.h"
+#include "dict.h"
+#include "xlator.h"
+#include "io-cache.h"
+#include <assert.h>
+#include <sys/time.h>
+/* Forward declaration; the definition is further down in this file. */
+static uint32_t
+ioc_get_priority (ioc_table_t *table,
+                  const char *path);
+/* NOTE(review): this repeats the prototype above and is redundant. */
+static uint32_t
+ioc_get_priority (ioc_table_t *table,
+                  const char *path);
+/* ioc_inode_reupdate - appends @ioc_inode to the table's LRU list for its
+ * weight and returns it.  ioc_get_inode() calls it with the table lock
+ * held. */
+static inline ioc_inode_t *
+ioc_inode_reupdate (ioc_inode_t *ioc_inode)
+{
+        ioc_table_t *table = ioc_inode->table;
+
+        list_add_tail (&ioc_inode->inode_lru,
+                       &table->inode_lru[ioc_inode->weight]);
+
+        return ioc_inode;
+}
+/* ioc_get_inode - looks up the ioc_inode stored under key @name in @dict.
+ * If it is found and not on any LRU list it is re-queued under the table
+ * lock.  Returns NULL when the key is absent. */
+static inline ioc_inode_t *
+ioc_get_inode (dict_t *dict,
+               char *name)
+{
+        ioc_inode_t *ioc_inode = NULL;
+        data_t *ioc_inode_data = dict_get (dict, name);
+        ioc_table_t *table = NULL;
+
+        if (ioc_inode_data) {
+                ioc_inode = data_to_ptr (ioc_inode_data);
+                table = ioc_inode->table;
+
+                ioc_table_lock (table);
+                {
+                        if (list_empty (&ioc_inode->inode_lru)) {
+                                ioc_inode = ioc_inode_reupdate (ioc_inode);
+                        }
+                }
+                ioc_table_unlock (table);
+        }
+
+        return ioc_inode;
+}
+/* ioc_inode_need_revalidate - returns 1 when the time elapsed since
+ * ioc_inode->tv has reached the table's cache_timeout, 0 otherwise.
+ * NOTE(review): the gettimeofday() result is stored but never checked. */
+int32_t
+ioc_inode_need_revalidate (ioc_inode_t *ioc_inode)
+{
+        int8_t need_revalidate = 0;
+        struct timeval tv = {0,};
+        int32_t ret = -1;
+        ioc_table_t *table = ioc_inode->table;
+
+        ret = gettimeofday (&tv, NULL);
+
+        if (time_elapsed (&tv, &ioc_inode->tv) >= table->cache_timeout)
+                need_revalidate = 1;
+
+        return need_revalidate;
+}
+
+/*
+ * __ioc_inode_flush - flush all the cached pages of the given inode
+ *
+ * @ioc_inode: inode whose page list is walked and destroyed
+ *
+ * Returns the sum of the values returned by ioc_page_destroy(), skipping
+ * pages for which it returned -1.
+ *
+ * assumes lock is held
+ */
+int32_t
+__ioc_inode_flush (ioc_inode_t *ioc_inode)
+{
+        ioc_page_t *curr = NULL, *next = NULL;
+        int32_t destroy_size = 0;
+        int32_t ret = 0;
+
+        list_for_each_entry_safe (curr, next, &ioc_inode->pages, pages) {
+                ret = ioc_page_destroy (curr);
+
+                if (ret != -1)
+                        destroy_size += ret;
+        }
+
+        return destroy_size;
+}
+/* ioc_inode_flush - flushes every cached page of @ioc_inode under the
+ * inode lock, then subtracts the flushed amount from the table's
+ * cache_used under the table lock. */
+void
+ioc_inode_flush (ioc_inode_t *ioc_inode)
+{
+        int32_t destroy_size = 0;
+
+        ioc_inode_lock (ioc_inode);
+        {
+                destroy_size = __ioc_inode_flush (ioc_inode);
+        }
+        ioc_inode_unlock (ioc_inode);
+
+        if (destroy_size) {
+                ioc_table_lock (ioc_inode->table);
+                {
+                        ioc_inode->table->cache_used -= destroy_size;
+                }
+                ioc_table_unlock (ioc_inode->table);
+        }
+
+        return;
+}
+
+/*
+ * ioc_utimens_cbk - completion callback for utimens; passes the result
+ * straight up to the parent translator.
+ *
+ * @frame: call frame to unwind
+ * @cookie: unused
+ * @this: io-cache xlator
+ * @op_ret: result of the child's utimens
+ * @op_errno: errno reported by the child
+ * @stbuf: attributes returned by the child
+ *
+ */
+int32_t
+ioc_utimens_cbk (call_frame_t *frame,
+                 void *cookie,
+                 xlator_t *this,
+                 int32_t op_ret,
+                 int32_t op_errno,
+                 struct stat *stbuf)
+{
+        STACK_UNWIND (frame, op_ret, op_errno, stbuf);
+        return 0;
+}
+
+/*
+ * ioc_utimens - utimens fop: flushes the inode's cached pages, if the inode
+ * has an io-cache context, then winds the call to the first child.
+ *
+ * @frame: call frame
+ * @this: io-cache xlator
+ * @loc: location whose timestamps are being changed
+ * @tv: new timestamps, passed through unchanged
+ *
+ */
+int32_t
+ioc_utimens (call_frame_t *frame,
+             xlator_t *this,
+             loc_t *loc,
+             struct timespec *tv)
+{
+        uint64_t ioc_inode = 0;
+        inode_ctx_get (loc->inode, this, &ioc_inode);
+
+        if (ioc_inode)
+                ioc_inode_flush ((ioc_inode_t *)(long)ioc_inode);
+
+        STACK_WIND (frame, ioc_utimens_cbk,
+                    FIRST_CHILD (this),
+                    FIRST_CHILD (this)->fops->utimens,
+                    loc, tv);
+        return 0;
+}
+
+int32_t
+ioc_lookup_cbk (call_frame_t *frame,
+                void *cookie,
+                xlator_t *this,
+                int32_t op_ret,
+
int32_t op_errno, + inode_t *inode, + struct stat *stbuf, + dict_t *dict) +{ + ioc_inode_t *ioc_inode = NULL; + ioc_local_t *local = frame->local; + ioc_table_t *table = this->private; + ioc_page_t *page = NULL; + data_t *page_data = NULL; + data_t *content_data = NULL; + char *src = NULL; + char *dst = NULL; + char need_unref = 0; + uint8_t cache_still_valid = 0; + uint32_t weight = 0; + uint64_t tmp_ioc_inode = 0; + char *buf = NULL; + char *tmp = NULL; + int i; + + if (op_ret != 0) + goto out; + + inode_ctx_get (inode, this, &tmp_ioc_inode); + ioc_inode = (ioc_inode_t *)(long)tmp_ioc_inode; + if (ioc_inode) { + cache_still_valid = ioc_cache_still_valid (ioc_inode, + stbuf); + + if (!cache_still_valid) { + ioc_inode_flush (ioc_inode); + } + /* update the time-stamp of revalidation */ + ioc_inode_lock (ioc_inode); + { + gettimeofday (&ioc_inode->tv, NULL); + } + ioc_inode_unlock (ioc_inode); + + ioc_table_lock (ioc_inode->table); + { + list_move_tail (&ioc_inode->inode_lru, + &table->inode_lru[ioc_inode->weight]); + } + ioc_table_unlock (ioc_inode->table); + } + + if (local && stbuf->st_size && + local->need_xattr >= stbuf->st_size) { + if (!ioc_inode) { + weight = ioc_get_priority (table, + local->file_loc.path); + ioc_inode = ioc_inode_update (table, + inode, weight); + inode_ctx_put (inode, this, + (uint64_t)(long)ioc_inode); + } + + ioc_inode_lock (ioc_inode); + { + content_data = dict_get (dict, "glusterfs.content"); + page = ioc_page_get (ioc_inode, 0); + + if (content_data) { + if (page) { + dict_unref (page->ref); + free (page->vector); + page->vector = NULL; + + ioc_table_lock (table); + { + table->cache_used -= + page->size; + } + ioc_table_unlock (table); + } else { + page = ioc_page_create (ioc_inode, 0); + } + + dst = CALLOC (1, stbuf->st_size); + page->ref = dict_ref (get_new_dict ()); + page_data = data_from_dynptr (dst, + stbuf->st_size); + dict_set (page->ref, NULL, page_data); + + src = data_to_ptr (content_data); + memcpy (dst, src, 
stbuf->st_size); + + page->vector = CALLOC (1, + sizeof (*page->vector)); + page->vector->iov_base = dst; + page->vector->iov_len = stbuf->st_size; + page->count = 1; + + page->waitq = NULL; + page->size = stbuf->st_size; + page->ready = 1; + + ioc_table_lock (table); + { + table->cache_used += page->size; + } + ioc_table_unlock (table); + + } else { + if (!(page && page->ready)) { + gf_log (this->name, GF_LOG_DEBUG, + "page not present"); + + ioc_inode_unlock (ioc_inode); + STACK_WIND (frame, + ioc_lookup_cbk, + FIRST_CHILD (this), + FIRST_CHILD (this)->fops->lookup, + &local->file_loc, + local->xattr_req); + return 0; + } + buf = CALLOC (1, stbuf->st_size); + tmp = buf; + + for (i = 0; i < page->count; i++) { + memcpy (tmp, page->vector[i].iov_base, + page->vector[i].iov_len); + tmp += page->vector[i].iov_len; + } + + gf_log (this->name, GF_LOG_DEBUG, + "serving file %s from cache", + local->file_loc.path); + + if (!dict) { + need_unref = 1; + dict = dict_ref ( + get_new_dict ()); + } + dict_set (dict, "glusterfs.content", + data_from_dynptr (buf, + stbuf->st_size)); + } + + ioc_inode->mtime = stbuf->st_mtime; + gettimeofday (&ioc_inode->tv, NULL); + } + ioc_inode_unlock (ioc_inode); + + if (content_data && + ioc_need_prune (ioc_inode->table)) { + ioc_prune (ioc_inode->table); + } + } + + out: + STACK_UNWIND (frame, op_ret, op_errno, inode, stbuf, dict); + + if (need_unref) { + dict_unref (dict); + } + + return 0; +} + +int32_t +ioc_lookup (call_frame_t *frame, + xlator_t *this, + loc_t *loc, + dict_t *xattr_req) +{ + uint64_t content_limit = 0; + + if (GF_FILE_CONTENT_REQUESTED(xattr_req, &content_limit)) { + uint64_t tmp_ioc_inode = 0; + ioc_inode_t *ioc_inode = NULL; + ioc_page_t *page = NULL; + ioc_local_t *local = CALLOC (1, sizeof (*local)); + + local->need_xattr = content_limit; + local->file_loc.path = loc->path; + local->file_loc.inode = loc->inode; + frame->local = local; + + inode_ctx_get (loc->inode, this, &tmp_ioc_inode); + ioc_inode = (ioc_inode_t 
*)(long)tmp_ioc_inode; + + if (ioc_inode) { + ioc_inode_lock (ioc_inode); + { + page = ioc_page_get (ioc_inode, 0); + if ((content_limit <= + ioc_inode->table->page_size) && + page && page->ready) { + local->need_xattr = -1; + } + } + ioc_inode_unlock (ioc_inode); + } + } + + STACK_WIND (frame, + ioc_lookup_cbk, + FIRST_CHILD (this), + FIRST_CHILD (this)->fops->lookup, + loc, + xattr_req); + return 0; +} + +/* + * ioc_forget - + * + * @frame: + * @this: + * @inode: + * + */ +int32_t +ioc_forget (xlator_t *this, + inode_t *inode) +{ + uint64_t ioc_inode = 0; + + inode_ctx_get (inode, this, &ioc_inode); + + if (ioc_inode) + ioc_inode_destroy ((ioc_inode_t *)(long)ioc_inode); + + return 0; +} + + +/* + * ioc_cache_validate_cbk - + * + * @frame: + * @cookie: + * @this: + * @op_ret: + * @op_errno: + * @buf + * + */ +int32_t +ioc_cache_validate_cbk (call_frame_t *frame, + void *cookie, + xlator_t *this, + int32_t op_ret, + int32_t op_errno, + struct stat *stbuf) +{ + ioc_local_t *local = frame->local; + ioc_inode_t *ioc_inode = NULL; + size_t destroy_size = 0; + struct stat *local_stbuf = stbuf; + + ioc_inode = local->inode; + + if ((op_ret == -1) || + ((op_ret >= 0) && !ioc_cache_still_valid(ioc_inode, stbuf))) { + gf_log (ioc_inode->table->xl->name, GF_LOG_DEBUG, + "cache for inode(%p) is invalid. flushing all pages", + ioc_inode); + /* NOTE: only pages with no waiting frames are flushed by + * ioc_inode_flush. 
page_fault will be generated for all + * the pages which have waiting frames by ioc_inode_wakeup() + */ + ioc_inode_lock (ioc_inode); + { + destroy_size = __ioc_inode_flush (ioc_inode); + if (op_ret >= 0) + ioc_inode->mtime = stbuf->st_mtime; + } + ioc_inode_unlock (ioc_inode); + local_stbuf = NULL; + } + + if (destroy_size) { + ioc_table_lock (ioc_inode->table); + { + ioc_inode->table->cache_used -= destroy_size; + } + ioc_table_unlock (ioc_inode->table); + } + + if (op_ret < 0) + local_stbuf = NULL; + + ioc_inode_lock (ioc_inode); + { + gettimeofday (&ioc_inode->tv, NULL); + } + ioc_inode_unlock (ioc_inode); + + ioc_inode_wakeup (frame, ioc_inode, local_stbuf); + + /* any page-fault initiated by ioc_inode_wakeup() will have its own + * fd_ref on fd, safe to unref validate frame's private copy + */ + fd_unref (local->fd); + + STACK_DESTROY (frame->root); + + return 0; +} + +static int32_t +ioc_wait_on_inode (ioc_inode_t *ioc_inode, + ioc_page_t *page) +{ + ioc_waitq_t *waiter = NULL, *trav = NULL; + uint32_t page_found = 0; + + trav = ioc_inode->waitq; + + while (trav) { + if (trav->data == page) { + page_found = 1; + break; + } + trav = trav->next; + } + + if (!page_found) { + waiter = CALLOC (1, sizeof (ioc_waitq_t)); + ERR_ABORT (waiter); + waiter->data = page; + waiter->next = ioc_inode->waitq; + ioc_inode->waitq = waiter; + } + + return 0; +} + +/* + * ioc_cache_validate - + * + * @frame: + * @ioc_inode: + * @fd: + * + */ +static int32_t +ioc_cache_validate (call_frame_t *frame, + ioc_inode_t *ioc_inode, + fd_t *fd, + ioc_page_t *page) +{ + call_frame_t *validate_frame = NULL; + ioc_local_t *validate_local = NULL; + + validate_local = CALLOC (1, sizeof (ioc_local_t)); + ERR_ABORT (validate_local); + validate_frame = copy_frame (frame); + validate_local->fd = fd_ref (fd); + validate_local->inode = ioc_inode; + validate_frame->local = validate_local; + + STACK_WIND (validate_frame, + ioc_cache_validate_cbk, + FIRST_CHILD (frame->this), + FIRST_CHILD 
(frame->this)->fops->fstat, + fd); + + return 0; +} + +static inline uint32_t +is_match (const char *path, + const char *pattern) +{ + char *pathname = strdup (path); + int32_t ret = 0; + + ret = fnmatch (pattern, path, FNM_NOESCAPE); + + free (pathname); + + return (ret == 0); +} + +static uint32_t +ioc_get_priority (ioc_table_t *table, + const char *path) +{ + uint32_t priority = 0; + struct ioc_priority *curr = NULL; + + list_for_each_entry (curr, &table->priority_list, list) { + if (is_match (path, curr->pattern)) + priority = curr->priority; + } + + return priority; +} + +/* + * ioc_open_cbk - open callback for io cache + * + * @frame: call frame + * @cookie: + * @this: + * @op_ret: + * @op_errno: + * @fd: + * + */ +int32_t +ioc_open_cbk (call_frame_t *frame, + void *cookie, + xlator_t *this, + int32_t op_ret, + int32_t op_errno, + fd_t *fd) +{ + uint64_t tmp_ioc_inode = 0; + ioc_local_t *local = frame->local; + ioc_table_t *table = this->private; + ioc_inode_t *ioc_inode = NULL; + inode_t *inode = local->file_loc.inode; + uint32_t weight = 0; + const char *path = local->file_loc.path; + + if (op_ret != -1) { + /* look for ioc_inode corresponding to this fd */ + LOCK (&fd->inode->lock); + //{ + + inode_ctx_get (fd->inode, this, &tmp_ioc_inode); + ioc_inode = (ioc_inode_t *)(long)tmp_ioc_inode; + + if (!ioc_inode) { + /* this is the first time someone is opening this + file, assign weight + */ + weight = ioc_get_priority (table, path); + + ioc_inode = ioc_inode_update (table, inode, weight); + inode_ctx_put (fd->inode, this, + (uint64_t)(long)ioc_inode); + } else { + ioc_table_lock (ioc_inode->table); + //{ + list_move_tail (&ioc_inode->inode_lru, + &table->inode_lru[ioc_inode->weight]); + //} + ioc_table_unlock (ioc_inode->table); + } + + //} + UNLOCK (&fd->inode->lock); + + /* If mandatory locking has been enabled on this file, + we disable caching on it */ + if (((inode->st_mode & S_ISGID) && + !(inode->st_mode & S_IXGRP))) { + fd_ctx_set (fd, this, 1); + } 
+ + /* If O_DIRECT open, we disable caching on it */ + if ((local->flags & O_DIRECT)){ + /* O_DIRECT is only for one fd, not the inode + * as a whole + */ + fd_ctx_set (fd, this, 1); + } + } + + FREE (local); + frame->local = NULL; + + STACK_UNWIND (frame, op_ret, op_errno, fd); + + return 0; +} + +/* + * ioc_create_cbk - create callback for io cache + * + * @frame: call frame + * @cookie: + * @this: + * @op_ret: + * @op_errno: + * @fd: + * @inode: + * @buf: + * + */ +int32_t +ioc_create_cbk (call_frame_t *frame, + void *cookie, + xlator_t *this, + int32_t op_ret, + int32_t op_errno, + fd_t *fd, + inode_t *inode, + struct stat *buf) +{ + ioc_local_t *local = frame->local; + ioc_table_t *table = this->private; + ioc_inode_t *ioc_inode = NULL; + uint32_t weight = 0; + const char *path = local->file_loc.path; + + if (op_ret != -1) { + { + /* assign weight */ + weight = ioc_get_priority (table, path); + + ioc_inode = ioc_inode_update (table, inode, weight); + LOCK (&fd->inode->lock); + { + inode_ctx_put (fd->inode, this, + (uint64_t)(long)ioc_inode); + } + UNLOCK (&fd->inode->lock); + } + /* If mandatory locking has been enabled on this file, + we disable caching on it */ + if ((inode->st_mode & S_ISGID) && + !(inode->st_mode & S_IXGRP)) { + fd_ctx_set (fd, this, 1); + } + + /* If O_DIRECT open, we disable caching on it */ + if (local->flags & O_DIRECT){ + /* O_DIRECT is only for one fd, not the inode + * as a whole + */ + fd_ctx_set (fd, this, 1); + } + + } + + frame->local = NULL; + FREE (local); + + STACK_UNWIND (frame, op_ret, op_errno, fd, inode, buf); + + return 0; +} + +/* + * ioc_open - open fop for io cache + * @frame: + * @this: + * @loc: + * @flags: + * + */ +int32_t +ioc_open (call_frame_t *frame, + xlator_t *this, + loc_t *loc, + int32_t flags, + fd_t *fd) +{ + + ioc_local_t *local = CALLOC (1, sizeof (ioc_local_t)); + ERR_ABORT (local); + + local->flags = flags; + local->file_loc.path = loc->path; + local->file_loc.inode = loc->inode; + + frame->local = 
local; + + STACK_WIND (frame, + ioc_open_cbk, + FIRST_CHILD(this), + FIRST_CHILD(this)->fops->open, + loc, + flags, + fd); + + return 0; +} + +/* + * ioc_create - create fop for io cache + * + * @frame: + * @this: + * @loc: + * @flags: + * @mode: + * + */ +int32_t +ioc_create (call_frame_t *frame, + xlator_t *this, + loc_t *loc, + int32_t flags, + mode_t mode, + fd_t *fd) +{ + ioc_local_t *local = CALLOC (1, sizeof (ioc_local_t)); + ERR_ABORT (local); + + local->flags = flags; + local->file_loc.path = loc->path; + frame->local = local; + + STACK_WIND (frame, ioc_create_cbk, + FIRST_CHILD(this), + FIRST_CHILD(this)->fops->create, + loc, flags, mode, fd); + return 0; +} + + + + +/* + * ioc_release - release fop for io cache + * + * @frame: + * @this: + * @fd: + * + */ +int32_t +ioc_release (xlator_t *this, + fd_t *fd) +{ + return 0; +} + +/* + * ioc_readv_disabled_cbk + * @frame: + * @cookie: + * @this: + * @op_ret: + * @op_errno: + * @vector: + * @count: + * + */ +int32_t +ioc_readv_disabled_cbk (call_frame_t *frame, + void *cookie, + xlator_t *this, + int32_t op_ret, + int32_t op_errno, + struct iovec *vector, + int32_t count, + struct stat *stbuf) +{ + STACK_UNWIND (frame, op_ret, op_errno, vector, count, stbuf); + return 0; +} + + +int32_t +ioc_need_prune (ioc_table_t *table) +{ + int64_t cache_difference = 0; + + ioc_table_lock (table); + { + cache_difference = table->cache_used - table->cache_size; + } + ioc_table_unlock (table); + + if (cache_difference > 0) + return 1; + else + return 0; +} + +/* + * dispatch_requests - + * + * @frame: + * @ioc_inode: + * + * + */ +static void +dispatch_requests (call_frame_t *frame, + ioc_inode_t *ioc_inode, + fd_t *fd, + off_t offset, + size_t size) +{ + ioc_local_t *local = frame->local; + ioc_table_t *table = ioc_inode->table; + ioc_page_t *trav = NULL; + ioc_waitq_t *waitq = NULL; + off_t rounded_offset = 0; + off_t rounded_end = 0; + off_t trav_offset = 0; + int32_t fault = 0; + int8_t need_validate = 0; + int8_t
might_need_validate = 0; /* if a page exists, do we need + to validate it? */ + + rounded_offset = floor (offset, table->page_size); + rounded_end = roof (offset + size, table->page_size); + trav_offset = rounded_offset; + + /* once a frame does read, it should be waiting on something */ + local->wait_count++; + + /* Requested region can fall in three different pages, + * 1. Ready - region is already in cache, we just have to serve it. + * 2. In-transit - page fault has been generated on this page, we need + * to wait till the page is ready + * 3. Fault - page is not in cache, we have to generate a page fault + */ + + might_need_validate = ioc_inode_need_revalidate (ioc_inode); + + while (trav_offset < rounded_end) { + size_t trav_size = 0; + off_t local_offset = 0; + + ioc_inode_lock (ioc_inode); + //{ + + /* look for requested region in the cache */ + trav = ioc_page_get (ioc_inode, trav_offset); + + local_offset = max (trav_offset, offset); + trav_size = min (((offset+size) - local_offset), + table->page_size); + + if (!trav) { + /* page not in cache, we need to generate page fault */ + trav = ioc_page_create (ioc_inode, trav_offset); + fault = 1; + if (!trav) { + gf_log (frame->this->name, GF_LOG_CRITICAL, + "ioc_page_create returned NULL"); + } + } + + ioc_wait_on_page (trav, frame, local_offset, trav_size); + + if (trav->ready) { + /* page found in cache */ + if (!might_need_validate) { + /* fresh enough */ + gf_log (frame->this->name, GF_LOG_DEBUG, + "cache hit for trav_offset=%"PRId64"" + "/local_offset=%"PRId64"", + trav_offset, local_offset); + waitq = ioc_page_wakeup (trav); + } else { + /* if waitq already exists, fstat revalidate is + already on the way */ + if (!ioc_inode->waitq) { + need_validate = 1; + } + ioc_wait_on_inode (ioc_inode, trav); + } + } + + //} + ioc_inode_unlock (ioc_inode); + + ioc_waitq_return (waitq); + waitq = NULL; + + if (fault) { + fault = 0; + /* new page created, increase the table->cache_used */ + ioc_page_fault (ioc_inode, 
frame, fd, trav_offset); + } + + if (need_validate) { + need_validate = 0; + gf_log (frame->this->name, GF_LOG_DEBUG, + "sending validate request for " + "inode(%"PRId64") at offset=%"PRId64"", + fd->inode->ino, trav_offset); + ioc_cache_validate (frame, ioc_inode, fd, trav); + } + + trav_offset += table->page_size; + } + + ioc_frame_return (frame); + + if (ioc_need_prune (ioc_inode->table)) { + ioc_prune (ioc_inode->table); + } + + return; +} + + +/* + * ioc_readv - + * + * @frame: + * @this: + * @fd: + * @size: + * @offset: + * + */ +int32_t +ioc_readv (call_frame_t *frame, + xlator_t *this, + fd_t *fd, + size_t size, + off_t offset) +{ + uint64_t tmp_ioc_inode = 0; + ioc_inode_t *ioc_inode = NULL; + ioc_local_t *local = NULL; + uint32_t weight = 0; + + inode_ctx_get (fd->inode, this, &tmp_ioc_inode); + ioc_inode = (ioc_inode_t *)(long)tmp_ioc_inode; + if (!ioc_inode) { + /* caching disabled, go ahead with normal readv */ + STACK_WIND (frame, + ioc_readv_disabled_cbk, + FIRST_CHILD (frame->this), + FIRST_CHILD (frame->this)->fops->readv, + fd, + size, + offset); + return 0; + } + + if (!fd_ctx_get (fd, this, NULL)) { + /* disable caching for this fd, go ahead with normal readv */ + STACK_WIND (frame, + ioc_readv_disabled_cbk, + FIRST_CHILD (frame->this), + FIRST_CHILD (frame->this)->fops->readv, + fd, + size, + offset); + return 0; + } + + local = (ioc_local_t *) CALLOC (1, sizeof (ioc_local_t)); + ERR_ABORT (local); + INIT_LIST_HEAD (&local->fill_list); + + frame->local = local; + local->pending_offset = offset; + local->pending_size = size; + local->offset = offset; + local->size = size; + local->inode = ioc_inode; + + gf_log (this->name, GF_LOG_DEBUG, + "NEW REQ (%p) offset = %"PRId64" && size = %"GF_PRI_SIZET"", + frame, offset, size); + + weight = ioc_inode->weight; + + ioc_table_lock (ioc_inode->table); + { + list_move_tail (&ioc_inode->inode_lru, + &ioc_inode->table->inode_lru[weight]); + } + ioc_table_unlock (ioc_inode->table); + + dispatch_requests 
(frame, ioc_inode, fd, offset, size); + + return 0; +} + +/* + * ioc_writev_cbk - + * + * @frame: + * @cookie: + * @this: + * @op_ret: + * @op_errno: + * + */ +int32_t +ioc_writev_cbk (call_frame_t *frame, + void *cookie, + xlator_t *this, + int32_t op_ret, + int32_t op_errno, + struct stat *stbuf) +{ + ioc_local_t *local = frame->local; + uint64_t ioc_inode = 0; + + inode_ctx_get (local->fd->inode, this, &ioc_inode); + + if (ioc_inode) + ioc_inode_flush ((ioc_inode_t *)(long)ioc_inode); + + STACK_UNWIND (frame, op_ret, op_errno, stbuf); + return 0; +} + +/* + * ioc_writev + * + * @frame: + * @this: + * @fd: + * @vector: + * @count: + * @offset: + * + */ +int32_t +ioc_writev (call_frame_t *frame, + xlator_t *this, + fd_t *fd, + struct iovec *vector, + int32_t count, + off_t offset) +{ + ioc_local_t *local = NULL; + uint64_t ioc_inode = 0; + + local = CALLOC (1, sizeof (ioc_local_t)); + ERR_ABORT (local); + + /* TODO: why is it not fd_ref'ed */ + local->fd = fd; + frame->local = local; + + inode_ctx_get (fd->inode, this, &ioc_inode); + if (ioc_inode) + ioc_inode_flush ((ioc_inode_t *)(long)ioc_inode); + + STACK_WIND (frame, + ioc_writev_cbk, + FIRST_CHILD(this), + FIRST_CHILD(this)->fops->writev, + fd, + vector, + count, + offset); + + return 0; +} + +/* + * ioc_truncate_cbk - + * + * @frame: + * @cookie: + * @this: + * @op_ret: + * @op_errno: + * @buf: + * + */ +int32_t +ioc_truncate_cbk (call_frame_t *frame, + void *cookie, + xlator_t *this, + int32_t op_ret, + int32_t op_errno, + struct stat *buf) +{ + + STACK_UNWIND (frame, op_ret, op_errno, buf); + return 0; +} + +/* + * ioc_truncate - + * + * @frame: + * @this: + * @loc: + * @offset: + * + */ +int32_t +ioc_truncate (call_frame_t *frame, + xlator_t *this, + loc_t *loc, + off_t offset) +{ + uint64_t ioc_inode = 0; + inode_ctx_get (loc->inode, this, &ioc_inode); + + if (ioc_inode) + ioc_inode_flush ((ioc_inode_t *)(long)ioc_inode); + + STACK_WIND (frame, + ioc_truncate_cbk, + FIRST_CHILD(this), + 
FIRST_CHILD(this)->fops->truncate, + loc, + offset); + return 0; +} + +/* + * ioc_ftruncate - + * + * @frame: + * @this: + * @fd: + * @offset: + * + */ +int32_t +ioc_ftruncate (call_frame_t *frame, + xlator_t *this, + fd_t *fd, + off_t offset) +{ + uint64_t ioc_inode = 0; + inode_ctx_get (fd->inode, this, &ioc_inode); + + if (ioc_inode) + ioc_inode_flush ((ioc_inode_t *)(long)ioc_inode); + + STACK_WIND (frame, + ioc_truncate_cbk, + FIRST_CHILD(this), + FIRST_CHILD(this)->fops->ftruncate, + fd, + offset); + return 0; +} + +int32_t +ioc_lk_cbk (call_frame_t *frame, + void *cookie, + xlator_t *this, + int32_t op_ret, + int32_t op_errno, + struct flock *lock) +{ + STACK_UNWIND (frame, op_ret, op_errno, lock); + return 0; +} + +int32_t +ioc_lk (call_frame_t *frame, + xlator_t *this, + fd_t *fd, + int32_t cmd, + struct flock *lock) +{ + ioc_inode_t *ioc_inode = NULL; + uint64_t tmp_inode = 0; + + inode_ctx_get (fd->inode, this, &tmp_inode); + ioc_inode = (ioc_inode_t *)(long)tmp_inode; + if (!ioc_inode) { + gf_log (this->name, GF_LOG_ERROR, + "inode context is NULL: returning EBADFD"); + STACK_UNWIND (frame, -1, EBADFD, NULL); + return 0; + } + + ioc_inode_lock (ioc_inode); + { + gettimeofday (&ioc_inode->tv, NULL); + } + ioc_inode_unlock (ioc_inode); + + STACK_WIND (frame, ioc_lk_cbk, + FIRST_CHILD (this), + FIRST_CHILD (this)->fops->lk, fd, cmd, lock); + return 0; +} + +int32_t +ioc_get_priority_list (const char *opt_str, struct list_head *first) +{ + int32_t max_pri = 0; + char *tmp_str = NULL; + char *tmp_str1 = NULL; + char *tmp_str2 = NULL; + char *dup_str = NULL; + char *stripe_str = NULL; + char *pattern = NULL; + char *priority = NULL; + char *string = strdup (opt_str); + struct ioc_priority *curr = NULL; + + /* Get the pattern for cache priority. 
+ * "option priority *.jpg:1,abc*:2" etc + */ + /* NOTE: the maximum priority returned here is used by init () + * to size table->inode_lru at run time + */ + stripe_str = strtok_r (string, ",", &tmp_str); + while (stripe_str) { + curr = CALLOC (1, sizeof (struct ioc_priority)); + ERR_ABORT (curr); + list_add_tail (&curr->list, first); + + dup_str = strdup (stripe_str); + pattern = strtok_r (dup_str, ":", &tmp_str1); + if (!pattern) + return -1; + priority = strtok_r (NULL, ":", &tmp_str1); + if (!priority) + return -1; + gf_log ("io-cache", + GF_LOG_DEBUG, + "ioc priority : pattern %s : priority %s", + pattern, + priority); + curr->pattern = strdup (pattern); + curr->priority = strtol (priority, &tmp_str2, 0); + if (tmp_str2 && (*tmp_str2)) + return -1; + else + max_pri = max (max_pri, curr->priority); + stripe_str = strtok_r (NULL, ",", &tmp_str); + } + + return max_pri; +} + +/* + * init - + * @this: + * + */ +int32_t +init (xlator_t *this) +{ + ioc_table_t *table; + dict_t *options = this->options; + uint32_t index = 0; + char *page_size_string = NULL; + char *cache_size_string = NULL; + + if (!this->children || this->children->next) { + gf_log (this->name, GF_LOG_ERROR, + "FATAL: io-cache not configured with exactly " + "one child"); + return -1; + } + + if (!this->parents) { + gf_log (this->name, GF_LOG_WARNING, + "dangling volume. 
check volfile "); + } + + table = (void *) CALLOC (1, sizeof (*table)); + ERR_ABORT (table); + + table->xl = this; + table->page_size = IOC_PAGE_SIZE; + table->cache_size = IOC_CACHE_SIZE; + + if (dict_get (options, "page-size")) + page_size_string = data_to_str (dict_get (options, + "page-size")); + + if (page_size_string) { + if (gf_string2bytesize (page_size_string, + &table->page_size) != 0) { + gf_log ("io-cache", GF_LOG_ERROR, + "invalid number format \"%s\" of " + "\"option page-size\"", + page_size_string); + return -1; + } + gf_log (this->name, GF_LOG_DEBUG, + "using page-size %"PRIu64"", table->page_size); + } + + if (dict_get (options, "cache-size")) + cache_size_string = data_to_str (dict_get (options, + "cache-size")); + if (cache_size_string) { + if (gf_string2bytesize (cache_size_string, + &table->cache_size) != 0) { + gf_log ("io-cache", GF_LOG_ERROR, + "invalid number format \"%s\" of " + "\"option cache-size\"", + cache_size_string); + return -1; + } + + gf_log (this->name, GF_LOG_DEBUG, + "using cache-size %"PRIu64"", table->cache_size); + } + + table->cache_timeout = 1; + + if (dict_get (options, "cache-timeout")) { + table->cache_timeout = + data_to_uint32 (dict_get (options, + "cache-timeout")); + gf_log (this->name, GF_LOG_DEBUG, + "Using %d seconds to revalidate cache", + table->cache_timeout); + } + + INIT_LIST_HEAD (&table->priority_list); + if (dict_get (options, "priority")) { + char *option_list = data_to_str (dict_get (options, + "priority")); + gf_log (this->name, GF_LOG_DEBUG, + "option path %s", option_list); + /* parse the list of pattern:priority */ + table->max_pri = ioc_get_priority_list (option_list, + &table->priority_list); + + if (table->max_pri == -1) + return -1; + } + table->max_pri ++; + INIT_LIST_HEAD (&table->inodes); + + table->inode_lru = CALLOC (table->max_pri, sizeof (struct list_head)); + ERR_ABORT (table->inode_lru); + for (index = 0; index < (table->max_pri); index++) + INIT_LIST_HEAD (&table->inode_lru[index]); 
+ + pthread_mutex_init (&table->table_lock, NULL); + this->private = table; + return 0; +} + +/* + * fini - + * + * @this: + * + */ +void +fini (xlator_t *this) +{ + ioc_table_t *table = this->private; + + pthread_mutex_destroy (&table->table_lock); + FREE (table); + + this->private = NULL; + return; +} + +struct xlator_fops fops = { + .open = ioc_open, + .create = ioc_create, + .readv = ioc_readv, + .writev = ioc_writev, + .truncate = ioc_truncate, + .ftruncate = ioc_ftruncate, + .utimens = ioc_utimens, + .lookup = ioc_lookup, + .lk = ioc_lk +}; + +struct xlator_mops mops = { +}; + +struct xlator_cbks cbks = { + .forget = ioc_forget, + .release = ioc_release +}; + +struct volume_options options[] = { + { .key = {"priority"}, + .type = GF_OPTION_TYPE_ANY + }, + { .key = {"cache-timeout", "force-revalidate-timeout"}, + .type = GF_OPTION_TYPE_INT, + .min = 0, + .max = 60 + }, + { .key = {"page-size"}, + .type = GF_OPTION_TYPE_SIZET, + .min = 16 * GF_UNIT_KB, + .max = 4 * GF_UNIT_MB + }, + { .key = {"cache-size"}, + .type = GF_OPTION_TYPE_SIZET, + .min = 4 * GF_UNIT_MB, + .max = 6 * GF_UNIT_GB + }, + { .key = {NULL} }, +}; diff --git a/xlators/performance/io-cache/src/io-cache.h b/xlators/performance/io-cache/src/io-cache.h new file mode 100644 index 000000000..e997f6e7c --- /dev/null +++ b/xlators/performance/io-cache/src/io-cache.h @@ -0,0 +1,330 @@ +/* + Copyright (c) 2007, 2008 Z RESEARCH, Inc. <http://www.zresearch.com> + This file is part of GlusterFS. + + GlusterFS is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published + by the Free Software Foundation; either version 3 of the License, + or (at your option) any later version. + + GlusterFS is distributed in the hope that it will be useful, but + WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + General Public License for more details. 
+ + You should have received a copy of the GNU General Public License + along with this program. If not, see + <http://www.gnu.org/licenses/>. +*/ + +#ifndef __IO_CACHE_H +#define __IO_CACHE_H + +#ifndef _CONFIG_H +#define _CONFIG_H +#include "config.h" +#endif + +#include <sys/types.h> +#include "compat-errno.h" + +#include "glusterfs.h" +#include "logging.h" +#include "dict.h" +#include "xlator.h" +#include "common-utils.h" +#include "call-stub.h" +#include <sys/time.h> +#include <fnmatch.h> + +#define IOC_PAGE_SIZE (1024 * 128) /* 128KB */ +#define IOC_CACHE_SIZE (32 * 1024 * 1024) + +struct ioc_table; +struct ioc_local; +struct ioc_page; +struct ioc_inode; + +struct ioc_priority { + struct list_head list; + char *pattern; + uint32_t priority; +}; + +/* + * ioc_waitq - this structure is used to represent the frames + * waiting on a page + * + * @next: pointer to next object in waitq + * @data: pointer to the waiter (a frame; ioc_wait_on_inode () stores a page) + */ +struct ioc_waitq { + struct ioc_waitq *next; + void *data; + off_t pending_offset; + size_t pending_size; +}; + +/* + * ioc_fill - + * + */ +struct ioc_fill { + struct list_head list; /* list of ioc_fill structures of a frame */ + off_t offset; + size_t size; + struct iovec *vector; + int32_t count; + dict_t *refs; +}; + +struct ioc_local { + mode_t mode; + int32_t flags; + loc_t file_loc; + off_t offset; + size_t size; + int32_t op_ret; + int32_t op_errno; + struct list_head fill_list; /* list of ioc_fill structures */ + off_t pending_offset; /* offset from which this frame should continue */ + size_t pending_size; /* size of data this frame is waiting on */ + struct ioc_inode *inode; + int32_t wait_count; + pthread_mutex_t local_lock; + struct ioc_waitq *waitq; + void *stub; + fd_t *fd; + int32_t need_xattr; + dict_t *xattr_req; +}; + +/* + * ioc_page - structure to store page of data from file + * + */ +struct ioc_page { + struct list_head pages; + struct list_head page_lru; + struct ioc_inode *inode; /* inode this page belongs to
*/ + struct ioc_priority *priority; + char dirty; + char ready; + struct iovec *vector; + int32_t count; + off_t offset; + size_t size; + struct ioc_waitq *waitq; + dict_t *ref; + pthread_mutex_t page_lock; +}; + +struct ioc_inode { + struct ioc_table *table; + struct list_head pages; /* list of pages of this inode */ + struct list_head inode_list; /* list of inodes, maintained by io-cache translator */ + struct list_head inode_lru; + struct list_head page_lru; + struct ioc_waitq *waitq; + pthread_mutex_t inode_lock; + uint32_t weight; /* weight of the inode, increases on each read */ + time_t mtime; /* mtime of the server file when last cached */ + struct timeval tv; /* time-stamp at last re-validate */ +}; + +struct ioc_table { + uint64_t page_size; + uint64_t cache_size; + uint64_t cache_used; + struct list_head inodes; /* list of inodes cached */ + struct list_head active; + struct list_head *inode_lru; + struct list_head priority_list; + int32_t readv_count; + pthread_mutex_t table_lock; + xlator_t *xl; + uint32_t inode_count; + int32_t cache_timeout; + int32_t max_pri; +}; + +typedef struct ioc_table ioc_table_t; +typedef struct ioc_local ioc_local_t; +typedef struct ioc_page ioc_page_t; +typedef struct ioc_inode ioc_inode_t; +typedef struct ioc_waitq ioc_waitq_t; +typedef struct ioc_fill ioc_fill_t; + +void * +str_to_ptr (char *string); + +char * +ptr_to_str (void *ptr); + +int32_t +ioc_readv_disabled_cbk (call_frame_t *frame, + void *cookie, + xlator_t *this, + int32_t op_ret, + int32_t op_errno, + struct iovec *vector, + int32_t count, + struct stat *stbuf); + +ioc_page_t * +ioc_page_get (ioc_inode_t *ioc_inode, + off_t offset); + +ioc_page_t * +ioc_page_create (ioc_inode_t *ioc_inode, + off_t offset); + +void +ioc_page_fault (ioc_inode_t *ioc_inode, + call_frame_t *frame, + fd_t *fd, + off_t offset); +void +ioc_wait_on_page (ioc_page_t *page, + call_frame_t *frame, + off_t offset, + size_t size); + +ioc_waitq_t * +ioc_page_wakeup (ioc_page_t *page); + 
+void +ioc_page_flush (ioc_page_t *page); + +ioc_waitq_t * +ioc_page_error (ioc_page_t *page, + int32_t op_ret, + int32_t op_errno); +void +ioc_page_purge (ioc_page_t *page); + +void +ioc_frame_return (call_frame_t *frame); + +void +ioc_waitq_return (ioc_waitq_t *waitq); + +void +ioc_frame_fill (ioc_page_t *page, + call_frame_t *frame, + off_t offset, + size_t size); + +#define ioc_inode_lock(ioc_inode) \ + do { \ + gf_log (ioc_inode->table->xl->name, GF_LOG_DEBUG, \ + "locked inode(%p)", ioc_inode); \ + pthread_mutex_lock (&ioc_inode->inode_lock); \ + } while (0) + + +#define ioc_inode_unlock(ioc_inode) \ + do { \ + gf_log (ioc_inode->table->xl->name, GF_LOG_DEBUG, \ + "unlocked inode(%p)", ioc_inode); \ + pthread_mutex_unlock (&ioc_inode->inode_lock); \ + } while (0) + + +#define ioc_table_lock(table) \ + do { \ + gf_log (table->xl->name, GF_LOG_DEBUG, \ + "locked table(%p)", table); \ + pthread_mutex_lock (&table->table_lock); \ + } while (0) + + +#define ioc_table_unlock(table) \ + do { \ + gf_log (table->xl->name, GF_LOG_DEBUG, \ + "unlocked table(%p)", table); \ + pthread_mutex_unlock (&table->table_lock); \ + } while (0) + + +#define ioc_local_lock(local) \ + do { \ + gf_log (local->inode->table->xl->name, GF_LOG_DEBUG, \ + "locked local(%p)", local); \ + pthread_mutex_lock (&local->local_lock); \ + } while (0) + + +#define ioc_local_unlock(local) \ + do { \ + gf_log (local->inode->table->xl->name, GF_LOG_DEBUG, \ + "unlocked local(%p)", local); \ + pthread_mutex_unlock (&local->local_lock); \ + } while (0) + + +#define ioc_page_lock(page) \ + do { \ + gf_log (page->inode->table->xl->name, GF_LOG_DEBUG, \ + "locked page(%p)", page); \ + pthread_mutex_lock (&page->page_lock); \ + } while (0) + + +#define ioc_page_unlock(page) \ + do { \ + gf_log (page->inode->table->xl->name, GF_LOG_DEBUG, \ + "unlocked page(%p)", page); \ + pthread_mutex_unlock (&page->page_lock); \ + } while (0) + + +static inline uint64_t +time_elapsed (struct timeval *now, + struct 
timeval *then) +{ + uint64_t sec = now->tv_sec - then->tv_sec; /* whole seconds only; tv_usec is ignored */ + + if (sec) + return sec; + + return 0; +} + +ioc_inode_t * +ioc_inode_search (ioc_table_t *table, + inode_t *inode); + +void +ioc_inode_destroy (ioc_inode_t *ioc_inode); + +ioc_inode_t * +ioc_inode_update (ioc_table_t *table, + inode_t *inode, + uint32_t weight); + +int64_t +ioc_page_destroy (ioc_page_t *page); + +int32_t +__ioc_inode_flush (ioc_inode_t *ioc_inode); + +void +ioc_inode_flush (ioc_inode_t *ioc_inode); + +void +ioc_inode_wakeup (call_frame_t *frame, + ioc_inode_t *ioc_inode, + struct stat *stbuf); + +int8_t +ioc_cache_still_valid (ioc_inode_t *ioc_inode, + struct stat *stbuf); + +int32_t +ioc_prune (ioc_table_t *table); + +int32_t +ioc_need_prune (ioc_table_t *table); + +#endif /* __IO_CACHE_H */ diff --git a/xlators/performance/io-cache/src/ioc-inode.c b/xlators/performance/io-cache/src/ioc-inode.c new file mode 100644 index 000000000..2e2e561dd --- /dev/null +++ b/xlators/performance/io-cache/src/ioc-inode.c @@ -0,0 +1,201 @@ +/* + Copyright (c) 2007, 2008, 2009 Z RESEARCH, Inc. <http://www.zresearch.com> + This file is part of GlusterFS. + + GlusterFS is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published + by the Free Software Foundation; either version 3 of the License, + or (at your option) any later version. + + GlusterFS is distributed in the hope that it will be useful, but + WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program. If not, see + <http://www.gnu.org/licenses/>.
+*/ + +#ifndef _CONFIG_H +#define _CONFIG_H +#include "config.h" +#endif + +#include "io-cache.h" + + +/* + * str_to_ptr - convert a string to pointer + * @string: string + * + */ +void * +str_to_ptr (char *string) +{ + void *ptr = (void *)strtoul (string, NULL, 16); + return ptr; +} + + +/* + * ptr_to_str - convert a pointer to string + * @ptr: pointer + * + */ +char * +ptr_to_str (void *ptr) +{ + char *str; + asprintf (&str, "%p", ptr); + return str; +} + +void +ioc_inode_wakeup (call_frame_t *frame, + ioc_inode_t *ioc_inode, + struct stat *stbuf) +{ + ioc_waitq_t *waiter = NULL, *waited = NULL; + ioc_waitq_t *page_waitq = NULL; + int8_t cache_still_valid = 1; + ioc_local_t *local = frame->local; + int8_t need_fault = 0; + ioc_page_t *waiter_page = NULL; + + ioc_inode_lock (ioc_inode); + { + waiter = ioc_inode->waitq; + ioc_inode->waitq = NULL; + } + ioc_inode_unlock (ioc_inode); + + if (stbuf) + cache_still_valid = ioc_cache_still_valid (ioc_inode, stbuf); + else + cache_still_valid = 0; + + if (!waiter) { + gf_log (frame->this->name, GF_LOG_DEBUG, + "cache validate called without any " + "page waiting to be validated"); + } + + while (waiter) { + waiter_page = waiter->data; + page_waitq = NULL; + + if (waiter_page) { + if (cache_still_valid) { + /* cache valid, wake up page */ + ioc_inode_lock (ioc_inode); + { + page_waitq = + ioc_page_wakeup (waiter_page); + } + ioc_inode_unlock (ioc_inode); + if (page_waitq) + ioc_waitq_return (page_waitq); + } else { + /* cache invalid, generate page fault and set + * page->ready = 0, to avoid double faults + */ + ioc_inode_lock (ioc_inode); + + if (waiter_page->ready) { + waiter_page->ready = 0; + need_fault = 1; + } else { + gf_log (frame->this->name, + GF_LOG_DEBUG, + "validate frame(%p) is waiting" + "for in-transit page = %p", + frame, waiter_page); + } + + ioc_inode_unlock (ioc_inode); + + if (need_fault) { + need_fault = 0; + ioc_page_fault (ioc_inode, frame, + local->fd, + waiter_page->offset); + } + } + } + + waited 
= waiter; + waiter = waiter->next; + + waited->data = NULL; + free (waited); + } +} + +/* + * ioc_inode_update - create a new ioc_inode_t structure and add it to + * the table. fill in the fields which are derived + * from inode_t corresponding to the file + * + * @table: io-table structure + * @inode: inode structure + * @weight: index of the table->inode_lru[] list the inode is added to + * + * not for external reference + */ +ioc_inode_t * +ioc_inode_update (ioc_table_t *table, + inode_t *inode, + uint32_t weight) +{ + ioc_inode_t *ioc_inode = CALLOC (1, sizeof (ioc_inode_t)); + ERR_ABORT (ioc_inode); + + ioc_inode->table = table; + ioc_inode->weight = weight; + + /* initialize the page lists and the lock before the inode is + * published on the table lists, where other threads can lock it */ + INIT_LIST_HEAD (&ioc_inode->pages); + INIT_LIST_HEAD (&ioc_inode->page_lru); + pthread_mutex_init (&ioc_inode->inode_lock, NULL); + + ioc_table_lock (table); + + table->inode_count++; + list_add (&ioc_inode->inode_list, &table->inodes); + list_add_tail (&ioc_inode->inode_lru, &table->inode_lru[weight]); + + gf_log (table->xl->name, + GF_LOG_DEBUG, + "adding to inode_lru[%d]", weight); + + ioc_table_unlock (table); + + return ioc_inode; +} + + +/* + * ioc_inode_destroy - destroy an ioc_inode_t object. + * + * @inode: inode to destroy + * + * to be called only from ioc_forget. + */ +void +ioc_inode_destroy (ioc_inode_t *ioc_inode) +{ + ioc_table_t *table = ioc_inode->table; + + ioc_table_lock (table); + table->inode_count--; + list_del (&ioc_inode->inode_list); + list_del (&ioc_inode->inode_lru); + ioc_table_unlock (table); + + ioc_inode_flush (ioc_inode); + + pthread_mutex_destroy (&ioc_inode->inode_lock); + free (ioc_inode); +} + diff --git a/xlators/performance/io-cache/src/page.c b/xlators/performance/io-cache/src/page.c new file mode 100644 index 000000000..e549f0bb5 --- /dev/null +++ b/xlators/performance/io-cache/src/page.c @@ -0,0 +1,778 @@ +/* + Copyright (c) 2007, 2008, 2009 Z RESEARCH, Inc. <http://www.zresearch.com> + This file is part of GlusterFS.
+ + GlusterFS is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published + by the Free Software Foundation; either version 3 of the License, + or (at your option) any later version. + + GlusterFS is distributed in the hope that it will be useful, but + WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program. If not, see + <http://www.gnu.org/licenses/>. +*/ + +#ifndef _CONFIG_H +#define _CONFIG_H +#include "config.h" +#endif + +#include "glusterfs.h" +#include "logging.h" +#include "dict.h" +#include "xlator.h" +#include "io-cache.h" +#include <assert.h> +#include <sys/time.h> + +ioc_page_t * +ioc_page_get (ioc_inode_t *ioc_inode, + off_t offset) +{ + int8_t found = 0; + ioc_page_t *page = NULL; + ioc_table_t *table = ioc_inode->table; + off_t rounded_offset = floor (offset, table->page_size); + + if (list_empty (&ioc_inode->pages)) { + return NULL; + } + + list_for_each_entry (page, &ioc_inode->pages, pages) { + if (page->offset == rounded_offset) { + found = 1; + break; + } + } + + /* was previously returning ioc_inode itself.., + * 1st of its type and found one more downstairs :O */ + if (!found){ + page = NULL; + } else { + /* push the page to the end of the lru list */ + list_move_tail (&page->page_lru, &ioc_inode->page_lru); + } + + return page; +} + + +/* + * ioc_page_destroy - + * + * @page: + * + */ +int64_t +ioc_page_destroy (ioc_page_t *page) +{ + int64_t page_size = 0; + + page_size = page->size; + + if (page->waitq) { + /* frames waiting on this page, do not destroy this page */ + page_size = -1; + } else { + + list_del (&page->pages); + list_del (&page->page_lru); + + gf_log (page->inode->table->xl->name, GF_LOG_DEBUG, + "destroying page = %p, offset = %"PRId64" " + "&& inode = %p", 
+ page, page->offset, page->inode); + + if (page->vector){ + dict_unref (page->ref); + free (page->vector); + page->vector = NULL; + } + + page->inode = NULL; + + } + + if (page_size != -1) { + pthread_mutex_destroy (&page->page_lock); + free (page); + } + + return page_size; +} + +/* + * ioc_prune - prune the cache. we have a limit to the number of pages we + * can have in-memory. + * + * @table: ioc_table_t of this translator + * + */ +int32_t +ioc_prune (ioc_table_t *table) +{ + ioc_inode_t *curr = NULL, *next_ioc_inode = NULL; + ioc_page_t *page = NULL, *next = NULL; + int32_t ret = -1; + int32_t index = 0; + uint64_t size_to_prune = 0; + uint64_t size_pruned = 0; + + ioc_table_lock (table); + { + size_to_prune = table->cache_used - table->cache_size; + /* take out the least recently used inode */ + for (index=0; index < table->max_pri; index++) { + list_for_each_entry_safe (curr, next_ioc_inode, + &table->inode_lru[index], + inode_lru) { + /* prune page-by-page for this inode, till + * we reach the equilibrium */ + ioc_inode_lock (curr); + /* { */ + + list_for_each_entry_safe (page, next, + &curr->page_lru, + page_lru) { + /* done with all pages, and not + * reached equilibrium yet?? + * continue with next inode in + * lru_list */ + size_pruned += page->size; + ret = ioc_page_destroy (page); + + if (ret != -1) + table->cache_used -= ret; + + gf_log (table->xl->name, + GF_LOG_DEBUG, + "index = %d && table->cache_" + "used = %"PRIu64" && table->" + "cache_size = %"PRIu64, + index, table->cache_used, + table->cache_size); + + if (size_pruned >= size_to_prune) + break; + } /* list_for_each_entry_safe(page...) */ + if (list_empty (&curr->pages)) { + list_del_init (&curr->inode_lru); + } + + /* } */ + ioc_inode_unlock (curr); + + if (size_pruned >= size_to_prune) + break; + } /* list_for_each_entry_safe (curr...) */ + + if (size_pruned >= size_to_prune) + break; + } /* for(index=0;...) 
*/ + + } /* ioc_inode_table locked region end */ + ioc_table_unlock (table); + + return 0; +} + +/* + * ioc_page_create - create a new page and link it at the tail of the + * inode's pages and page_lru lists. + * + * @ioc_inode: inode the page is created for; NULL makes the call return NULL + * @offset: file offset, rounded with floor() to table->page_size + * + */ +ioc_page_t * +ioc_page_create (ioc_inode_t *ioc_inode, + off_t offset) +{ + ioc_table_t *table = NULL; + ioc_page_t *page = NULL; + off_t rounded_offset = 0; + ioc_page_t *newpage = NULL; + + /* check the inode before dereferencing it or allocating the page, + * so that a NULL inode neither crashes nor leaks */ + if (!ioc_inode) + return NULL; + + table = ioc_inode->table; + rounded_offset = floor (offset, table->page_size); + + newpage = CALLOC (1, sizeof (*newpage)); + ERR_ABORT (newpage); + + newpage->offset = rounded_offset; + newpage->inode = ioc_inode; + pthread_mutex_init (&newpage->page_lock, NULL); + + list_add_tail (&newpage->page_lru, &ioc_inode->page_lru); + list_add_tail (&newpage->pages, &ioc_inode->pages); + + page = newpage; + + gf_log ("io-cache", GF_LOG_DEBUG, + "returning new page %p", page); + return page; +} + +/* + * ioc_wait_on_page - pause a frame to wait till the arrival of a page. + * here we need to handle the case when the frame who calls wait_on_page + * himself has caused page_fault + * + * @page: page to wait on + * @frame: call frame who is waiting on page + * + */ +void +ioc_wait_on_page (ioc_page_t *page, + call_frame_t *frame, + off_t offset, + size_t size) +{ + ioc_waitq_t *waitq = NULL; + ioc_local_t *local = frame->local; + + waitq = CALLOC (1, sizeof (*waitq)); + ERR_ABORT (waitq); + + gf_log (frame->this->name, GF_LOG_DEBUG, + "frame(%p) waiting on page = %p, offset=%"PRId64", " + "size=%"GF_PRI_SIZET"", + frame, page, offset, size); + + waitq->data = frame; + waitq->next = page->waitq; + waitq->pending_offset = offset; + waitq->pending_size = size; + page->waitq = waitq; + /* one frame can wait only once on a given page, + * local->wait_count is number of pages a frame is waiting on */ + ioc_local_lock (local); + { + local->wait_count++; + } + ioc_local_unlock (local); +} + + +/* + * ioc_cache_still_valid - see if cached pages ioc_inode are still valid + * against given stbuf + * + *
@ioc_inode: + * @stbuf: + * + * assumes ioc_inode is locked + */ +int8_t +ioc_cache_still_valid (ioc_inode_t *ioc_inode, + struct stat *stbuf) +{ + int8_t cache_still_valid = 1; + +#if 0 + if (!stbuf || (stbuf->st_mtime != ioc_inode->mtime) || + (stbuf->st_mtim.tv_nsec != ioc_inode->stbuf.st_mtim.tv_nsec)) + cache_still_valid = 0; + +#else + if (!stbuf || (stbuf->st_mtime != ioc_inode->mtime)) + cache_still_valid = 0; + +#endif + +#if 0 + /* talk with avati@zresearch.com to enable this section */ + if (!ioc_inode->mtime && stbuf) { + cache_still_valid = 1; + ioc_inode->mtime = stbuf->st_mtime; + } +#endif + + return cache_still_valid; +} + + +void +ioc_waitq_return (ioc_waitq_t *waitq) +{ + ioc_waitq_t *trav = NULL; + ioc_waitq_t *next = NULL; + call_frame_t *frame = NULL; + + for (trav = waitq; trav; trav = next) { + next = trav->next; + + frame = trav->data; + ioc_frame_return (frame); + free (trav); + } +} + + +int +ioc_fault_cbk (call_frame_t *frame, + void *cookie, + xlator_t *this, + int32_t op_ret, + int32_t op_errno, + struct iovec *vector, + int32_t count, + struct stat *stbuf) +{ + ioc_local_t *local = frame->local; + off_t offset = local->pending_offset; + ioc_inode_t *ioc_inode = local->inode; + ioc_table_t *table = ioc_inode->table; + ioc_page_t *page = NULL; + off_t trav_offset = 0; + size_t payload_size = 0; + int32_t destroy_size = 0; + size_t page_size = 0; + ioc_waitq_t *waitq = NULL; + + trav_offset = offset; + payload_size = op_ret; + + ioc_inode_lock (ioc_inode); + { + if (op_ret == -1 || + (op_ret >= 0 && + !ioc_cache_still_valid(ioc_inode, stbuf))) { + gf_log (ioc_inode->table->xl->name, GF_LOG_DEBUG, + "cache for inode(%p) is invalid. 
flushing " + "all pages", ioc_inode); + destroy_size = __ioc_inode_flush (ioc_inode); + } + + if (op_ret >= 0) + ioc_inode->mtime = stbuf->st_mtime; + + gettimeofday (&ioc_inode->tv, NULL); + + if (op_ret < 0) { + /* error, readv returned -1 */ + page = ioc_page_get (ioc_inode, offset); + if (page) + waitq = ioc_page_error (page, op_ret, + op_errno); + } else { + gf_log (ioc_inode->table->xl->name, GF_LOG_DEBUG, + "op_ret = %d", op_ret); + page = ioc_page_get (ioc_inode, offset); + if (!page) { + /* page was flushed */ + /* some serious bug ? */ + gf_log (this->name, GF_LOG_DEBUG, + "wasted copy: %"PRId64"[+%"PRId64"] " + "ioc_inode=%p", offset, + table->page_size, ioc_inode); + } else { + if (page->vector) { + dict_unref (page->ref); + free (page->vector); + page->vector = NULL; + } + + /* keep a copy of the page for our cache */ + page->vector = iov_dup (vector, count); + page->count = count; + if (frame->root->rsp_refs) { + dict_ref (frame->root->rsp_refs); + page->ref = frame->root->rsp_refs; + } else { + /* TODO: we have got a response to + * our request and no data */ + gf_log (this->name, GF_LOG_CRITICAL, + "frame>root>rsp_refs is null"); + } /* if(frame->root->rsp_refs) */ + + /* page->size should indicate exactly how + * much the readv call to the child + * translator returned. 
earlier op_ret + * from child translator was used, which + * gave rise to a bug where reads from + * io-cached volume were resulting in 0 + * byte replies */ + page_size = iov_length(vector, count); + + page->size = page_size; + + if (page->waitq) { + /* wake up all the frames waiting on + * this page, including + * the frame which triggered fault */ + waitq = ioc_page_wakeup (page); + } /* if(page->waitq) */ + } /* if(!page)...else */ + } /* if(op_ret < 0)...else */ + } /* ioc_inode locked region end */ + ioc_inode_unlock (ioc_inode); + + ioc_waitq_return (waitq); + + if (page_size) { + ioc_table_lock (table); + { + table->cache_used += page_size; + } + ioc_table_unlock (table); + } + + if (destroy_size) { + ioc_table_lock (table); + { + table->cache_used -= destroy_size; + } + ioc_table_unlock (table); + } + + if (ioc_need_prune (ioc_inode->table)) { + ioc_prune (ioc_inode->table); + } + + gf_log (this->name, GF_LOG_DEBUG, "fault frame %p returned", frame); + pthread_mutex_destroy (&local->local_lock); + + fd_unref (local->fd); + + STACK_DESTROY (frame->root); + return 0; +} + +/* + * ioc_page_fault - + * + * @ioc_inode: + * @frame: + * @fd: + * @offset: + * + */ +void +ioc_page_fault (ioc_inode_t *ioc_inode, + call_frame_t *frame, + fd_t *fd, + off_t offset) +{ + ioc_table_t *table = ioc_inode->table; + call_frame_t *fault_frame = copy_frame (frame); + ioc_local_t *fault_local = CALLOC (1, sizeof (ioc_local_t)); + ERR_ABORT (fault_local); + + /* NOTE: copy_frame() means, the frame the fop whose fd_ref we + * are using till now won't be valid till we get reply from server. 
+ * we unref this fd, in fault_cbk */ + fault_local->fd = fd_ref (fd); + + fault_frame->local = fault_local; + pthread_mutex_init (&fault_local->local_lock, NULL); + + INIT_LIST_HEAD (&fault_local->fill_list); + fault_local->pending_offset = offset; + fault_local->pending_size = table->page_size; + fault_local->inode = ioc_inode; + + gf_log (frame->this->name, GF_LOG_DEBUG, + "stack winding page fault for offset = %"PRId64" with " + "frame %p", offset, fault_frame); + + STACK_WIND (fault_frame, ioc_fault_cbk, + FIRST_CHILD(fault_frame->this), + FIRST_CHILD(fault_frame->this)->fops->readv, + fd, table->page_size, offset); + return; +} + +void +ioc_frame_fill (ioc_page_t *page, + call_frame_t *frame, + off_t offset, + size_t size) +{ + ioc_local_t *local = frame->local; + ioc_fill_t *fill = NULL; + off_t src_offset = 0; + off_t dst_offset = 0; + ssize_t copy_size = 0; + ioc_inode_t *ioc_inode = page->inode; + + gf_log (frame->this->name, GF_LOG_DEBUG, + "frame (%p) offset = %"PRId64" && size = %"GF_PRI_SIZET" " + "&& page->size = %"GF_PRI_SIZET" && wait_count = %d", + frame, offset, size, page->size, local->wait_count); + + /* immediately move this page to the end of the page_lru list */ + list_move_tail (&page->page_lru, &ioc_inode->page_lru); + /* fill local->pending_size bytes from local->pending_offset */ + if (local->op_ret != -1 && page->size) { + if (offset > page->offset) + /* offset is offset in file, convert it to offset in + * page */ + src_offset = offset - page->offset; + /*FIXME: since offset is the offset within page is the + * else case valid? */ + else + /* local->pending_offset is in previous page. 
do not + * fill until we have filled all previous pages */ + dst_offset = page->offset - offset; + + /* we have to copy from offset to either end of this page + * or till the requested size */ + copy_size = min (page->size - src_offset, + size - dst_offset); + + if (copy_size < 0) { + /* if page contains fewer bytes and the required offset + is beyond the page size in the page */ + copy_size = src_offset = 0; + } + + gf_log (page->inode->table->xl->name, GF_LOG_DEBUG, + "copy_size = %"GF_PRI_SIZET" && src_offset = " + "%"PRId64" && dst_offset = %"PRId64"", + copy_size, src_offset, dst_offset); + + { + ioc_fill_t *new = CALLOC (1, sizeof (*new)); + ERR_ABORT (new); + new->offset = page->offset; + new->size = copy_size; + new->refs = dict_ref (page->ref); + new->count = iov_subset (page->vector, + page->count, + src_offset, + src_offset + copy_size, + NULL); + new->vector = CALLOC (new->count, + sizeof (struct iovec)); + ERR_ABORT (new->vector); + new->count = iov_subset (page->vector, + page->count, + src_offset, + src_offset + copy_size, + new->vector); + + + + /* add the ioc_fill to fill_list for this frame */ + if (list_empty (&local->fill_list)) { + /* if list is empty, then this is the first + * time we are filling frame, add the + * ioc_fill_t to the end of list */ + list_add_tail (&new->list, &local->fill_list); + } else { + int8_t found = 0; + /* list is not empty, we need to look for + * where this offset fits in list */ + list_for_each_entry (fill, &local->fill_list, + list) { + if (fill->offset > new->offset) { + found = 1; + break; + } + } + + if (found) { + found = 0; + list_add_tail (&new->list, + &fill->list); + } else { + list_add_tail (&new->list, + &local->fill_list); + } + } + } + local->op_ret += copy_size; + } +} + +/* + * ioc_frame_unwind - frame unwinds only from here + * + * @frame: call frame to unwind + * + * to be used only by ioc_frame_return(), when a frame has + * finished waiting on all pages, required + * + */ +static void 
+ioc_frame_unwind (call_frame_t *frame) +{ + ioc_local_t *local = frame->local; + ioc_fill_t *fill = NULL, *next = NULL; + int32_t count = 0; + struct iovec *vector = NULL; + int32_t copied = 0; + dict_t *refs = NULL; + struct stat stbuf = {0,}; + int32_t op_ret = 0; + + // ioc_local_lock (local); + refs = get_new_dict (); + + frame->local = NULL; + + if (list_empty (&local->fill_list)) { + gf_log (frame->this->name, GF_LOG_DEBUG, + "frame(%p) has 0 entries in local->fill_list " + "(offset = %"PRId64" && size = %"GF_PRI_SIZET")", + frame, local->offset, local->size); + } + + list_for_each_entry (fill, &local->fill_list, list) { + count += fill->count; + } + + vector = CALLOC (count, sizeof (*vector)); + ERR_ABORT (vector); + + list_for_each_entry_safe (fill, next, &local->fill_list, list) { + memcpy (((char *)vector) + copied, + fill->vector, + fill->count * sizeof (*vector)); + + copied += (fill->count * sizeof (*vector)); + + dict_copy (fill->refs, refs); + + list_del (&fill->list); + dict_unref (fill->refs); + free (fill->vector); + free (fill); + } + + frame->root->rsp_refs = dict_ref (refs); + + op_ret = iov_length (vector, count); + gf_log (frame->this->name, GF_LOG_DEBUG, + "frame(%p) unwinding with op_ret=%d", frame, op_ret); + + // ioc_local_unlock (local); + + STACK_UNWIND (frame, + op_ret, + local->op_errno, + vector, + count, + &stbuf); + + dict_unref (refs); + + pthread_mutex_destroy (&local->local_lock); + free (local); + free (vector); + + return; +} + +/* + * ioc_frame_return - + * @frame: + * + * to be called only when a frame is waiting on an in-transit page + */ +void +ioc_frame_return (call_frame_t *frame) +{ + ioc_local_t *local = frame->local; + int32_t wait_count; + assert (local->wait_count > 0); + + ioc_local_lock (local); + { + wait_count = --local->wait_count; + } + ioc_local_unlock (local); + + if (!wait_count) { + ioc_frame_unwind (frame); + } + + return; +} + +/* + * ioc_page_wakeup - + * @page: + * + * to be called only when a frame 
is waiting on an in-transit page + */ +ioc_waitq_t * +ioc_page_wakeup (ioc_page_t *page) +{ + ioc_waitq_t *waitq = NULL, *trav = NULL; + call_frame_t *frame = NULL; + + waitq = page->waitq; + page->waitq = NULL; + + trav = waitq; + page->ready = 1; + + gf_log (page->inode->table->xl->name, GF_LOG_DEBUG, + "page is %p && waitq = %p", page, waitq); + + for (trav = waitq; trav; trav = trav->next) { + frame = trav->data; + ioc_frame_fill (page, frame, trav->pending_offset, + trav->pending_size); + } + + return waitq; +} + + +/* + * ioc_page_error - + * @page: + * @op_ret: + * @op_errno: + * + */ +ioc_waitq_t * +ioc_page_error (ioc_page_t *page, + int32_t op_ret, + int32_t op_errno) +{ + ioc_waitq_t *waitq = NULL, *trav = NULL; + call_frame_t *frame = NULL; + int64_t ret = 0; + ioc_table_t *table = NULL; + ioc_local_t *local = NULL; + + waitq = page->waitq; + page->waitq = NULL; + + gf_log (page->inode->table->xl->name, GF_LOG_DEBUG, + "page error for page = %p & waitq = %p", page, waitq); + + for (trav = waitq; trav; trav = trav->next) { + + frame = trav->data; + + local = frame->local; + ioc_local_lock (local); + { + if (local->op_ret != -1) { + local->op_ret = op_ret; + local->op_errno = op_errno; + } + } + ioc_local_unlock (local); + } + + table = page->inode->table; + ret = ioc_page_destroy (page); + + if (ret != -1) { + table->cache_used -= ret; + } + + return waitq; +} diff --git a/xlators/performance/io-threads/Makefile.am b/xlators/performance/io-threads/Makefile.am new file mode 100644 index 000000000..d471a3f92 --- /dev/null +++ b/xlators/performance/io-threads/Makefile.am @@ -0,0 +1,3 @@ +SUBDIRS = src + +CLEANFILES = diff --git a/xlators/performance/io-threads/src/Makefile.am b/xlators/performance/io-threads/src/Makefile.am new file mode 100644 index 000000000..38dea3eb7 --- /dev/null +++ b/xlators/performance/io-threads/src/Makefile.am @@ -0,0 +1,14 @@ +xlator_LTLIBRARIES = io-threads.la +xlatordir = 
$(libdir)/glusterfs/$(PACKAGE_VERSION)/xlator/performance + +io_threads_la_LDFLAGS = -module -avoidversion + +io_threads_la_SOURCES = io-threads.c +io_threads_la_LIBADD = $(top_builddir)/libglusterfs/src/libglusterfs.la + +noinst_HEADERS = io-threads.h + +AM_CFLAGS = -fPIC -D_FILE_OFFSET_BITS=64 -D_GNU_SOURCE -Wall -D$(GF_HOST_OS)\ + -I$(top_srcdir)/libglusterfs/src -shared -nostartfiles $(GF_CFLAGS) + +CLEANFILES = diff --git a/xlators/performance/io-threads/src/io-threads.c b/xlators/performance/io-threads/src/io-threads.c new file mode 100644 index 000000000..5acdd627d --- /dev/null +++ b/xlators/performance/io-threads/src/io-threads.c @@ -0,0 +1,1254 @@ +/* + Copyright (c) 2006, 2007, 2008, 2009 Z RESEARCH, Inc. <http://www.zresearch.com> + This file is part of GlusterFS. + + GlusterFS is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published + by the Free Software Foundation; either version 3 of the License, + or (at your option) any later version. + + GlusterFS is distributed in the hope that it will be useful, but + WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program. If not, see + <http://www.gnu.org/licenses/>. 
+*/ + +#ifndef _CONFIG_H +#define _CONFIG_H +#include "config.h" +#endif + +#include "call-stub.h" +#include "glusterfs.h" +#include "logging.h" +#include "dict.h" +#include "xlator.h" +#include "io-threads.h" + +static void +iot_queue (iot_worker_t *worker, + call_stub_t *stub); + +static call_stub_t * +iot_dequeue (iot_worker_t *worker); + +static iot_worker_t * +iot_schedule (iot_conf_t *conf, + iot_file_t *file, + ino_t ino) +{ + int32_t cnt = (ino % conf->thread_count); + iot_worker_t *trav = conf->workers.next; + + for (; cnt; cnt--) + trav = trav->next; + + if (file) + file->worker = trav; + trav->fd_count++; + return trav; +} + +int32_t +iot_open_cbk (call_frame_t *frame, + void *cookie, + xlator_t *this, + int32_t op_ret, + int32_t op_errno, + fd_t *fd) +{ + iot_conf_t *conf = this->private; + + if (op_ret >= 0) { + iot_file_t *file = CALLOC (1, sizeof (*file)); + ERR_ABORT (file); + + iot_schedule (conf, file, fd->inode->ino); + file->fd = fd; + + fd_ctx_set (fd, this, (uint64_t)(long)file); + + pthread_mutex_lock (&conf->files_lock); + file->next = &conf->files; + file->prev = file->next->prev; + file->next->prev = file; + file->prev->next = file; + pthread_mutex_unlock (&conf->files_lock); + } + STACK_UNWIND (frame, op_ret, op_errno, fd); + return 0; +} + +int32_t +iot_open (call_frame_t *frame, + xlator_t *this, + loc_t *loc, + int32_t flags, + fd_t *fd) +{ + STACK_WIND (frame, + iot_open_cbk, + FIRST_CHILD(this), + FIRST_CHILD(this)->fops->open, + loc, + flags, + fd); + return 0; +} + + +int32_t +iot_create_cbk (call_frame_t *frame, + void *cookie, + xlator_t *this, + int32_t op_ret, + int32_t op_errno, + fd_t *fd, + inode_t *inode, + struct stat *stbuf) +{ + iot_conf_t *conf = this->private; + + if (op_ret >= 0) { + iot_file_t *file = CALLOC (1, sizeof (*file)); + ERR_ABORT (file); + + iot_schedule (conf, file, fd->inode->ino); + file->fd = fd; + + fd_ctx_set (fd, this, (uint64_t)(long)file); + + pthread_mutex_lock (&conf->files_lock); + file->next = 
&conf->files; + file->prev = file->next->prev; + file->next->prev = file; + file->prev->next = file; + pthread_mutex_unlock (&conf->files_lock); + } + STACK_UNWIND (frame, op_ret, op_errno, fd, inode, stbuf); + return 0; +} + +int32_t +iot_create (call_frame_t *frame, + xlator_t *this, + loc_t *loc, + int32_t flags, + mode_t mode, + fd_t *fd) +{ + STACK_WIND (frame, + iot_create_cbk, + FIRST_CHILD(this), + FIRST_CHILD(this)->fops->create, + loc, + flags, + mode, + fd); + return 0; +} + + + +int32_t +iot_readv_cbk (call_frame_t *frame, + void *cookie, + xlator_t *this, + int32_t op_ret, + int32_t op_errno, + struct iovec *vector, + int32_t count, + struct stat *stbuf) +{ + iot_local_t *local = frame->local; + + local->frame_size = 0; //iov_length (vector, count); + + STACK_UNWIND (frame, op_ret, op_errno, vector, count, stbuf); + + return 0; +} + +static int32_t +iot_readv_wrapper (call_frame_t *frame, + xlator_t *this, + fd_t *fd, + size_t size, + off_t offset) +{ + STACK_WIND (frame, + iot_readv_cbk, + FIRST_CHILD(this), + FIRST_CHILD(this)->fops->readv, + fd, + size, + offset); + return 0; +} + +int32_t +iot_readv (call_frame_t *frame, + xlator_t *this, + fd_t *fd, + size_t size, + off_t offset) +{ + call_stub_t *stub; + iot_local_t *local = NULL; + iot_file_t *file = NULL; + iot_worker_t *worker = NULL; + uint64_t tmp_file = 0; + + if (fd_ctx_get (fd, this, &tmp_file)) { + gf_log (this->name, GF_LOG_ERROR, + "fd context is NULL, returning EBADFD"); + STACK_UNWIND (frame, -1, EBADFD); + return 0; + } + + file = (iot_file_t *)(long)tmp_file; + worker = file->worker; + + local = CALLOC (1, sizeof (*local)); + ERR_ABORT (local); + frame->local = local; + + stub = fop_readv_stub (frame, + iot_readv_wrapper, + fd, + size, + offset); + if (!stub) { + gf_log (this->name, GF_LOG_ERROR, + "cannot get readv call stub"); + STACK_UNWIND (frame, -1, ENOMEM, NULL, 0); + return 0; + } + + iot_queue (worker, stub); + + return 0; +} + +int32_t +iot_flush_cbk (call_frame_t *frame, 
+ void *cookie, + xlator_t *this, + int32_t op_ret, + int32_t op_errno) +{ + STACK_UNWIND (frame, op_ret, op_errno); + return 0; +} + +static int32_t +iot_flush_wrapper (call_frame_t *frame, + xlator_t *this, + fd_t *fd) +{ + STACK_WIND (frame, + iot_flush_cbk, + FIRST_CHILD(this), + FIRST_CHILD(this)->fops->flush, + fd); + return 0; +} + +int32_t +iot_flush (call_frame_t *frame, + xlator_t *this, + fd_t *fd) +{ + call_stub_t *stub; + iot_local_t *local = NULL; + iot_file_t *file = NULL; + iot_worker_t *worker = NULL; + uint64_t tmp_file = 0; + + if (fd_ctx_get (fd, this, &tmp_file)) { + gf_log (this->name, GF_LOG_ERROR, + "fd context is NULL, returning EBADFD"); + STACK_UNWIND (frame, -1, EBADFD); + return 0; + } + + file = (iot_file_t *)(long)tmp_file; + worker = file->worker; + + local = CALLOC (1, sizeof (*local)); + ERR_ABORT (local); + + frame->local = local; + + stub = fop_flush_stub (frame, + iot_flush_wrapper, + fd); + if (!stub) { + gf_log (this->name, GF_LOG_ERROR, "cannot get flush_cbk call stub"); + STACK_UNWIND (frame, -1, ENOMEM); + return 0; + } + iot_queue (worker, stub); + + return 0; +} + +int32_t +iot_fsync_cbk (call_frame_t *frame, + void *cookie, + xlator_t *this, + int32_t op_ret, + int32_t op_errno) +{ + STACK_UNWIND (frame, op_ret, op_errno); + return 0; +} + +static int32_t +iot_fsync_wrapper (call_frame_t *frame, + xlator_t *this, + fd_t *fd, + int32_t datasync) +{ + STACK_WIND (frame, + iot_fsync_cbk, + FIRST_CHILD (this), + FIRST_CHILD (this)->fops->fsync, + fd, + datasync); + return 0; +} + +int32_t +iot_fsync (call_frame_t *frame, + xlator_t *this, + fd_t *fd, + int32_t datasync) +{ + call_stub_t *stub; + iot_local_t *local = NULL; + iot_file_t *file = NULL; + iot_worker_t *worker = NULL; + uint64_t tmp_file = 0; + + if (fd_ctx_get (fd, this, &tmp_file)) { + gf_log (this->name, GF_LOG_ERROR, + "fd context is NULL, returning EBADFD"); + STACK_UNWIND (frame, -1, EBADFD); + return 0; + } + + file = (iot_file_t *)(long)tmp_file; + worker 
= file->worker; + + local = CALLOC (1, sizeof (*local)); + ERR_ABORT (local); + + frame->local = local; + + stub = fop_fsync_stub (frame, + iot_fsync_wrapper, + fd, + datasync); + if (!stub) { + gf_log (this->name, GF_LOG_ERROR, "cannot get fsync_cbk call stub"); + STACK_UNWIND (frame, -1, ENOMEM); + return 0; + } + iot_queue (worker, stub); + + return 0; +} + +int32_t +iot_writev_cbk (call_frame_t *frame, + void *cookie, + xlator_t *this, + int32_t op_ret, + int32_t op_errno, + struct stat *stbuf) +{ + iot_local_t *local = frame->local; + + local->frame_size = 0; /* hehe, caught me! */ + + STACK_UNWIND (frame, op_ret, op_errno, stbuf); + return 0; +} + +static int32_t +iot_writev_wrapper (call_frame_t *frame, + xlator_t *this, + fd_t *fd, + struct iovec *vector, + int32_t count, + off_t offset) +{ + STACK_WIND (frame, + iot_writev_cbk, + FIRST_CHILD(this), + FIRST_CHILD(this)->fops->writev, + fd, + vector, + count, + offset); + return 0; +} + +int32_t +iot_writev (call_frame_t *frame, + xlator_t *this, + fd_t *fd, + struct iovec *vector, + int32_t count, + off_t offset) +{ + call_stub_t *stub; + iot_local_t *local = NULL; + iot_file_t *file = NULL; + iot_worker_t *worker = NULL; + uint64_t tmp_file = 0; + + if (fd_ctx_get (fd, this, &tmp_file)) { + gf_log (this->name, GF_LOG_ERROR, + "fd context is NULL, returning EBADFD"); + STACK_UNWIND (frame, -1, EBADFD); + return 0; + } + + file = (iot_file_t *)(long)tmp_file; + worker = file->worker; + + local = CALLOC (1, sizeof (*local)); + ERR_ABORT (local); + + if (frame->root->req_refs) + local->frame_size = dict_serialized_length (frame->root->req_refs); + else + local->frame_size = iov_length (vector, count); + frame->local = local; + + stub = fop_writev_stub (frame, iot_writev_wrapper, + fd, vector, count, offset); + + if (!stub) { + gf_log (this->name, GF_LOG_ERROR, "cannot get writev call stub"); + STACK_UNWIND (frame, -1, ENOMEM); + return 0; + } + + iot_queue (worker, stub); + + return 0; +} + + +int32_t 
+iot_lk_cbk (call_frame_t *frame, + void *cookie, + xlator_t *this, + int32_t op_ret, + int32_t op_errno, + struct flock *flock) +{ + STACK_UNWIND (frame, op_ret, op_errno, flock); + return 0; +} + + +static int32_t +iot_lk_wrapper (call_frame_t *frame, + xlator_t *this, + fd_t *fd, + int32_t cmd, + struct flock *flock) +{ + STACK_WIND (frame, + iot_lk_cbk, + FIRST_CHILD(this), + FIRST_CHILD(this)->fops->lk, + fd, + cmd, + flock); + return 0; +} + + +int32_t +iot_lk (call_frame_t *frame, + xlator_t *this, + fd_t *fd, + int32_t cmd, + struct flock *flock) +{ + call_stub_t *stub; + iot_local_t *local = NULL; + iot_file_t *file = NULL; + iot_worker_t *worker = NULL; + uint64_t tmp_file = 0; + + if (fd_ctx_get (fd, this, &tmp_file)) { + gf_log (this->name, GF_LOG_ERROR, + "fd context is NULL, returning EBADFD"); + STACK_UNWIND (frame, -1, EBADFD); + return 0; + } + + file = (iot_file_t *)(long)tmp_file; + worker = file->worker; + + local = CALLOC (1, sizeof (*local)); + ERR_ABORT (local); + frame->local = local; + + stub = fop_lk_stub (frame, iot_lk_wrapper, + fd, cmd, flock); + + if (!stub) { + gf_log (this->name, GF_LOG_ERROR, "cannot get fop_lk call stub"); + STACK_UNWIND (frame, -1, ENOMEM, NULL); + return 0; + } + + iot_queue (worker, stub); + + return 0; +} + + +int32_t +iot_stat_cbk (call_frame_t *frame, + void *cookie, + xlator_t *this, + int32_t op_ret, + int32_t op_errno, + struct stat *buf) +{ + STACK_UNWIND (frame, op_ret, op_errno, buf); + return 0; +} + + +static int32_t +iot_stat_wrapper (call_frame_t *frame, + xlator_t *this, + loc_t *loc) +{ + STACK_WIND (frame, + iot_stat_cbk, + FIRST_CHILD(this), + FIRST_CHILD(this)->fops->stat, + loc); + return 0; +} + +int32_t +iot_stat (call_frame_t *frame, + xlator_t *this, + loc_t *loc) +{ + call_stub_t *stub; + iot_local_t *local = NULL; + iot_worker_t *worker = NULL; + iot_conf_t *conf; + fd_t *fd = NULL; + + conf = this->private; + + local = CALLOC (1, sizeof (*local)); + ERR_ABORT (local); + frame->local = 
local; + + fd = fd_lookup (loc->inode, frame->root->pid); + + if (fd == NULL) { + STACK_WIND(frame, + iot_stat_cbk, + FIRST_CHILD(this), + FIRST_CHILD(this)->fops->stat, + loc); + return 0; + } + + fd_unref (fd); + + worker = iot_schedule (conf, NULL, loc->inode->ino); + + stub = fop_stat_stub (frame, + iot_stat_wrapper, + loc); + if (!stub) { + gf_log (this->name, GF_LOG_ERROR, "cannot get fop_stat call stub"); + STACK_UNWIND (frame, -1, ENOMEM, NULL); + return 0; + } + iot_queue (worker, stub); + + return 0; +} + + +int32_t +iot_fstat_cbk (call_frame_t *frame, + void *cookie, + xlator_t *this, + int32_t op_ret, + int32_t op_errno, + struct stat *buf) +{ + STACK_UNWIND (frame, op_ret, op_errno, buf); + return 0; +} + +static int32_t +iot_fstat_wrapper (call_frame_t *frame, + xlator_t *this, + fd_t *fd) +{ + STACK_WIND (frame, + iot_fstat_cbk, + FIRST_CHILD(this), + FIRST_CHILD(this)->fops->fstat, + fd); + return 0; +} + +int32_t +iot_fstat (call_frame_t *frame, + xlator_t *this, + fd_t *fd) +{ + call_stub_t *stub; + iot_local_t *local = NULL; + iot_file_t *file = NULL; + iot_worker_t *worker = NULL; + uint64_t tmp_file = 0; + + if (fd_ctx_get (fd, this, &tmp_file)) { + gf_log (this->name, GF_LOG_ERROR, + "fd context is NULL, returning EBADFD"); + STACK_UNWIND (frame, -1, EBADFD); + return 0; + } + + file = (iot_file_t *)(long)tmp_file; + worker = file->worker; + + local = CALLOC (1, sizeof (*local)); + ERR_ABORT (local); + frame->local = local; + stub = fop_fstat_stub (frame, + iot_fstat_wrapper, + fd); + if (!stub) { + gf_log (this->name, GF_LOG_ERROR, "cannot get fop_fstat call stub"); + STACK_UNWIND (frame, -1, ENOMEM, NULL); + return 0; + } + + iot_queue (worker, stub); + + return 0; +} + +int32_t +iot_truncate_cbk (call_frame_t *frame, + void *cookie, + xlator_t *this, + int32_t op_ret, + int32_t op_errno, + struct stat *buf) +{ + STACK_UNWIND (frame, op_ret, op_errno, buf); + return 0; +} + +static int32_t +iot_truncate_wrapper (call_frame_t *frame, + 
xlator_t *this, + loc_t *loc, + off_t offset) +{ + STACK_WIND (frame, + iot_truncate_cbk, + FIRST_CHILD(this), + FIRST_CHILD(this)->fops->truncate, + loc, + offset); + return 0; +} + +int32_t +iot_truncate (call_frame_t *frame, + xlator_t *this, + loc_t *loc, + off_t offset) +{ + call_stub_t *stub; + iot_local_t *local = NULL; + iot_worker_t *worker = NULL; + iot_conf_t *conf; + fd_t *fd = NULL; + + conf = this->private; + local = CALLOC (1, sizeof (*local)); + ERR_ABORT (local); + frame->local = local; + + fd = fd_lookup (loc->inode, frame->root->pid); + + if (fd == NULL) { + STACK_WIND(frame, + iot_truncate_cbk, + FIRST_CHILD(this), + FIRST_CHILD(this)->fops->truncate, + loc, + offset); + return 0; + } + + fd_unref (fd); + + worker = iot_schedule (conf, NULL, loc->inode->ino); + + stub = fop_truncate_stub (frame, + iot_truncate_wrapper, + loc, + offset); + if (!stub) { + gf_log (this->name, GF_LOG_ERROR, "cannot get fop_stat call stub"); + STACK_UNWIND (frame, -1, ENOMEM, NULL); + return 0; + } + iot_queue (worker, stub); + + return 0; +} + +int32_t +iot_ftruncate_cbk (call_frame_t *frame, + void *cookie, + xlator_t *this, + int32_t op_ret, + int32_t op_errno, + struct stat *buf) +{ + STACK_UNWIND (frame, op_ret, op_errno, buf); + return 0; +} + +static int32_t +iot_ftruncate_wrapper (call_frame_t *frame, + xlator_t *this, + fd_t *fd, + off_t offset) +{ + STACK_WIND (frame, + iot_ftruncate_cbk, + FIRST_CHILD(this), + FIRST_CHILD(this)->fops->ftruncate, + fd, + offset); + return 0; +} + +int32_t +iot_ftruncate (call_frame_t *frame, + xlator_t *this, + fd_t *fd, + off_t offset) +{ + call_stub_t *stub; + iot_local_t *local = NULL; + iot_file_t *file = NULL; + iot_worker_t *worker = NULL; + uint64_t tmp_file = 0; + + if (fd_ctx_get (fd, this, &tmp_file)) { + gf_log (this->name, GF_LOG_ERROR, + "fd context is NULL, returning EBADFD"); + STACK_UNWIND (frame, -1, EBADFD); + return 0; + } + + file = (iot_file_t *)(long)tmp_file; + worker = file->worker; + + local = 
CALLOC (1, sizeof (*local)); + ERR_ABORT (local); + frame->local = local; + + stub = fop_ftruncate_stub (frame, + iot_ftruncate_wrapper, + fd, + offset); + if (!stub) { + gf_log (this->name, GF_LOG_ERROR, "cannot get fop_ftruncate call stub"); + STACK_UNWIND (frame, -1, ENOMEM, NULL); + return 0; + } + iot_queue (worker, stub); + + return 0; +} + +int32_t +iot_utimens_cbk (call_frame_t *frame, + void *cookie, + xlator_t *this, + int32_t op_ret, + int32_t op_errno, + struct stat *buf) +{ + STACK_UNWIND (frame, op_ret, op_errno, buf); + return 0; +} + +static int32_t +iot_utimens_wrapper (call_frame_t *frame, + xlator_t *this, + loc_t *loc, + struct timespec tv[2]) +{ + STACK_WIND (frame, + iot_utimens_cbk, + FIRST_CHILD(this), + FIRST_CHILD(this)->fops->utimens, + loc, + tv); + + return 0; +} + +int32_t +iot_utimens (call_frame_t *frame, + xlator_t *this, + loc_t *loc, + struct timespec tv[2]) +{ + call_stub_t *stub; + iot_local_t *local = NULL; + iot_worker_t *worker = NULL; + iot_conf_t *conf; + fd_t *fd = NULL; + + conf = this->private; + + local = CALLOC (1, sizeof (*local)); + ERR_ABORT (local); + frame->local = local; + + fd = fd_lookup (loc->inode, frame->root->pid); + + if (fd == NULL) { + STACK_WIND(frame, + iot_utimens_cbk, + FIRST_CHILD(this), + FIRST_CHILD(this)->fops->utimens, + loc, + tv); + return 0; + } + + fd_unref (fd); + + worker = iot_schedule (conf, NULL, loc->inode->ino); + + stub = fop_utimens_stub (frame, + iot_utimens_wrapper, + loc, + tv); + if (!stub) { + gf_log (this->name, GF_LOG_ERROR, "cannot get fop_utimens call stub"); + STACK_UNWIND (frame, -1, ENOMEM, NULL); + return 0; + } + iot_queue (worker, stub); + + return 0; +} + + +int32_t +iot_checksum_cbk (call_frame_t *frame, + void *cookie, + xlator_t *this, + int32_t op_ret, + int32_t op_errno, + uint8_t *file_checksum, + uint8_t *dir_checksum) +{ + STACK_UNWIND (frame, op_ret, op_errno, file_checksum, dir_checksum); + return 0; +} + +static int32_t +iot_checksum_wrapper (call_frame_t 
*frame, + xlator_t *this, + loc_t *loc, + int32_t flags) +{ + STACK_WIND (frame, + iot_checksum_cbk, + FIRST_CHILD(this), + FIRST_CHILD(this)->fops->checksum, + loc, + flags); + + return 0; +} + +int32_t +iot_checksum (call_frame_t *frame, + xlator_t *this, + loc_t *loc, + int32_t flags) +{ + call_stub_t *stub = NULL; + iot_local_t *local = NULL; + iot_worker_t *worker = NULL; + iot_conf_t *conf = NULL; + + conf = this->private; + + local = CALLOC (1, sizeof (*local)); + frame->local = local; + + worker = iot_schedule (conf, NULL, conf->misc_thread_index++); + + stub = fop_checksum_stub (frame, + iot_checksum_wrapper, + loc, + flags); + if (!stub) { + gf_log (this->name, GF_LOG_ERROR, "cannot get fop_checksum call stub"); + STACK_UNWIND (frame, -1, ENOMEM, NULL, NULL); + return 0; + } + iot_queue (worker, stub); + + return 0; +} + + +int32_t +iot_unlink_cbk (call_frame_t *frame, + void *cookie, + xlator_t *this, + int32_t op_ret, + int32_t op_errno) +{ + STACK_UNWIND (frame, op_ret, op_errno); + return 0; +} + +static int32_t +iot_unlink_wrapper (call_frame_t *frame, + xlator_t *this, + loc_t *loc) +{ + STACK_WIND (frame, + iot_unlink_cbk, + FIRST_CHILD(this), + FIRST_CHILD(this)->fops->unlink, + loc); + + return 0; +} + +int32_t +iot_unlink (call_frame_t *frame, + xlator_t *this, + loc_t *loc) +{ + call_stub_t *stub = NULL; + iot_local_t *local = NULL; + iot_worker_t *worker = NULL; + iot_conf_t *conf = NULL; + + conf = this->private; + + local = CALLOC (1, sizeof (*local)); + frame->local = local; + + worker = iot_schedule (conf, NULL, conf->misc_thread_index++); + + stub = fop_unlink_stub (frame, iot_unlink_wrapper, loc); + if (!stub) { + gf_log (this->name, GF_LOG_ERROR, "cannot get fop_unlink call stub"); + STACK_UNWIND (frame, -1, ENOMEM); + return 0; + } + iot_queue (worker, stub); + + return 0; +} + +int32_t +iot_release (xlator_t *this, + fd_t *fd) +{ + iot_file_t *file = NULL; + iot_conf_t *conf = NULL; + uint64_t tmp_file = 0; + int ret = 0; + + conf = 
this->private; + ret = fd_ctx_del (fd, this, &tmp_file); + if (ret) + return 0; + + file = (iot_file_t *)(long)tmp_file; + + pthread_mutex_lock (&conf->files_lock); + { + (file->prev)->next = file->next; + (file->next)->prev = file->prev; + } + pthread_mutex_unlock (&conf->files_lock); + + FREE (file); + return 0; +} + + +static void +iot_queue (iot_worker_t *worker, + call_stub_t *stub) +{ + iot_queue_t *queue; + iot_conf_t *conf = worker->conf; + iot_local_t *local = stub->frame->local; + size_t frame_size = local->frame_size; + + queue = CALLOC (1, sizeof (*queue)); + ERR_ABORT (queue); + queue->stub = stub; + + pthread_mutex_lock (&conf->lock); + + /* + while (worker->queue_size >= worker->queue_limit) + pthread_cond_wait (&worker->q_cond, &worker->lock); + */ + if (conf->cache_size) { + while (frame_size && (conf->current_size >= conf->cache_size)) + pthread_cond_wait (&conf->q_cond, &conf->lock); + } + + queue->next = &worker->queue; + queue->prev = worker->queue.prev; + + queue->next->prev = queue; + queue->prev->next = queue; + + /* dq_cond */ + worker->queue_size++; + worker->q++; + + conf->current_size += local->frame_size; + + pthread_cond_broadcast (&worker->dq_cond); + + pthread_mutex_unlock (&conf->lock); +} + +static call_stub_t * +iot_dequeue (iot_worker_t *worker) +{ + call_stub_t *stub = NULL; + iot_queue_t *queue = NULL; + iot_conf_t *conf = worker->conf; + iot_local_t *local = NULL; + + + pthread_mutex_lock (&conf->lock); + + while (!worker->queue_size) + /* + pthread_cond_wait (&worker->dq_cond, &worker->lock); + */ + pthread_cond_wait (&worker->dq_cond, &conf->lock); + + queue = worker->queue.next; + + queue->next->prev = queue->prev; + queue->prev->next = queue->next; + + stub = queue->stub; + local = stub->frame->local; + + worker->queue_size--; + worker->dq++; + + /* q_cond */ + conf->current_size -= local->frame_size; + + pthread_cond_broadcast (&conf->q_cond); + + pthread_mutex_unlock (&conf->lock); + + FREE (queue); + + return stub; +} + 
+static void * +iot_worker (void *arg) +{ + iot_worker_t *worker = arg; + + while (1) { + call_stub_t *stub; + + stub = iot_dequeue (worker); + call_resume (stub); + } +} + +#if 0 +static void * +iot_reply (void *arg) +{ + iot_worker_t *reply = arg; + + while (1) { + call_stub_t *stub; + + stub = iot_dequeue (reply); + FREE (stub->frame->local); + stub->frame->local = NULL; + call_resume (stub); + } +} +#endif + +static void +workers_init (iot_conf_t *conf) +{ + int i; + + conf->workers.next = &conf->workers; + conf->workers.prev = &conf->workers; + + for (i=0; i<conf->thread_count; i++) { + + iot_worker_t *worker = CALLOC (1, sizeof (*worker)); + ERR_ABORT (worker); + + worker->next = &conf->workers; + worker->prev = conf->workers.prev; + worker->next->prev = worker; + worker->prev->next = worker; + + worker->queue.next = &worker->queue; + worker->queue.prev = &worker->queue; + + /* + pthread_mutex_init (&worker->lock, NULL); + pthread_cond_init (&worker->q_cond, NULL); + */ + pthread_cond_init (&worker->dq_cond, NULL); + + /* + worker->queue_limit = conf->queue_limit; + */ + + worker->conf = conf; + + pthread_create (&worker->thread, NULL, iot_worker, worker); + } +} + +int32_t +init (xlator_t *this) +{ + iot_conf_t *conf; + dict_t *options = this->options; + + if (!this->children || this->children->next) { + gf_log ("io-threads", + GF_LOG_ERROR, + "FATAL: iot not configured with exactly one child"); + return -1; + } + + if (!this->parents) { + gf_log (this->name, GF_LOG_WARNING, + "dangling volume. 
check volfile "); + } + + conf = (void *) CALLOC (1, sizeof (*conf)); + ERR_ABORT (conf); + + conf->thread_count = 1; + + if (dict_get (options, "thread-count")) { + conf->thread_count = data_to_int32 (dict_get (options, + "thread-count")); + gf_log ("io-threads", + GF_LOG_DEBUG, + "Using conf->thread_count = %d", + conf->thread_count); + } + + pthread_mutex_init (&conf->lock, NULL); + pthread_cond_init (&conf->q_cond, NULL); + + conf->files.next = &conf->files; + conf->files.prev = &conf->files; + pthread_mutex_init (&conf->files_lock, NULL); + + workers_init (conf); + + this->private = conf; + return 0; +} + +void +fini (xlator_t *this) +{ + iot_conf_t *conf = this->private; + + FREE (conf); + + this->private = NULL; + return; +} + +struct xlator_fops fops = { + .open = iot_open, + .create = iot_create, + .readv = iot_readv, + .writev = iot_writev, + .flush = iot_flush, + .fsync = iot_fsync, + .lk = iot_lk, + .stat = iot_stat, + .fstat = iot_fstat, + .truncate = iot_truncate, + .ftruncate = iot_ftruncate, + .utimens = iot_utimens, + .checksum = iot_checksum, + .unlink = iot_unlink, +}; + +struct xlator_mops mops = { +}; + +struct xlator_cbks cbks = { + .release = iot_release, +}; + +struct volume_options options[] = { + { .key = {"thread-count"}, + .type = GF_OPTION_TYPE_INT, + .min = 1, + .max = 32 + }, + { .key = {NULL} }, +}; diff --git a/xlators/performance/io-threads/src/io-threads.h b/xlators/performance/io-threads/src/io-threads.h new file mode 100644 index 000000000..6595d3e27 --- /dev/null +++ b/xlators/performance/io-threads/src/io-threads.h @@ -0,0 +1,99 @@ +/* + Copyright (c) 2006, 2007, 2008 Z RESEARCH, Inc. <http://www.zresearch.com> + This file is part of GlusterFS. + + GlusterFS is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published + by the Free Software Foundation; either version 3 of the License, + or (at your option) any later version. 
+ + GlusterFS is distributed in the hope that it will be useful, but + WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program. If not, see + <http://www.gnu.org/licenses/>. +*/ + +#ifndef __IOT_H +#define __IOT_H + +#ifndef _CONFIG_H +#define _CONFIG_H +#include "config.h" +#endif + + +#include "compat-errno.h" +#include "glusterfs.h" +#include "logging.h" +#include "dict.h" +#include "xlator.h" +#include "common-utils.h" + +#define min(a,b) ((a)<(b)?(a):(b)) +#define max(a,b) ((a)>(b)?(a):(b)) + +struct iot_conf; +struct iot_worker; +struct iot_queue; +struct iot_local; +struct iot_file; + +struct iot_local { + struct iot_file *file; + size_t frame_size; +}; + +struct iot_queue { + struct iot_queue *next, *prev; + call_stub_t *stub; +}; + +struct iot_worker { + struct iot_worker *next, *prev; + struct iot_queue queue; + struct iot_conf *conf; + int64_t q,dq; + pthread_cond_t dq_cond; + /* + pthread_cond_t q_cond; + pthread_mutex_t lock; + */ + int32_t fd_count; + int32_t queue_size; + /* + int32_t queue_limit; + */ + pthread_t thread; +}; + +struct iot_file { + struct iot_file *next, *prev; /* all open files via this xlator */ + struct iot_worker *worker; + fd_t *fd; + int32_t pending_ops; +}; + +struct iot_conf { + int32_t thread_count; + int32_t misc_thread_index; /* Used to schedule the miscellaneous calls like checksum */ + struct iot_worker workers; + struct iot_file files; + pthread_mutex_t files_lock; + + uint64_t cache_size; + off_t current_size; + pthread_cond_t q_cond; + pthread_mutex_t lock; +}; + +typedef struct iot_file iot_file_t; +typedef struct iot_conf iot_conf_t; +typedef struct iot_local iot_local_t; +typedef struct iot_worker iot_worker_t; +typedef struct iot_queue iot_queue_t; + +#endif /* __IOT_H */ diff --git 
a/xlators/performance/read-ahead/Makefile.am b/xlators/performance/read-ahead/Makefile.am new file mode 100644 index 000000000..d471a3f92 --- /dev/null +++ b/xlators/performance/read-ahead/Makefile.am @@ -0,0 +1,3 @@ +SUBDIRS = src + +CLEANFILES = diff --git a/xlators/performance/read-ahead/src/Makefile.am b/xlators/performance/read-ahead/src/Makefile.am new file mode 100644 index 000000000..7bb902282 --- /dev/null +++ b/xlators/performance/read-ahead/src/Makefile.am @@ -0,0 +1,14 @@ +xlator_LTLIBRARIES = read-ahead.la +xlatordir = $(libdir)/glusterfs/$(PACKAGE_VERSION)/xlator/performance + +read_ahead_la_LDFLAGS = -module -avoidversion + +read_ahead_la_SOURCES = read-ahead.c page.c +read_ahead_la_LIBADD = $(top_builddir)/libglusterfs/src/libglusterfs.la + +noinst_HEADERS = read-ahead.h + +AM_CFLAGS = -fPIC -D_FILE_OFFSET_BITS=64 -D_GNU_SOURCE -Wall -D$(GF_HOST_OS)\ + -I$(top_srcdir)/libglusterfs/src -shared -nostartfiles $(GF_CFLAGS) + +CLEANFILES = diff --git a/xlators/performance/read-ahead/src/page.c b/xlators/performance/read-ahead/src/page.c new file mode 100644 index 000000000..3b8d4d209 --- /dev/null +++ b/xlators/performance/read-ahead/src/page.c @@ -0,0 +1,487 @@ +/* + Copyright (c) 2006, 2007, 2008, 2009 Z RESEARCH, Inc. <http://www.zresearch.com> + This file is part of GlusterFS. + + GlusterFS is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published + by the Free Software Foundation; either version 3 of the License, + or (at your option) any later version. + + GlusterFS is distributed in the hope that it will be useful, but + WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program. If not, see + <http://www.gnu.org/licenses/>. 
+*/ + +#ifndef _CONFIG_H +#define _CONFIG_H +#include "config.h" +#endif + +#include "glusterfs.h" +#include "logging.h" +#include "dict.h" +#include "xlator.h" +#include "read-ahead.h" +#include <assert.h> + + +ra_page_t * +ra_page_get (ra_file_t *file, + off_t offset) +{ + ra_page_t *page = NULL; + off_t rounded_offset = 0; + + page = file->pages.next; + rounded_offset = floor (offset, file->page_size); + + while (page != &file->pages && page->offset < rounded_offset) + page = page->next; + + if (page == &file->pages || page->offset != rounded_offset) + page = NULL; + + return page; +} + + +ra_page_t * +ra_page_create (ra_file_t *file, off_t offset) +{ + ra_page_t *page = NULL; + off_t rounded_offset = 0; + ra_page_t *newpage = NULL; + + page = file->pages.next; + rounded_offset = floor (offset, file->page_size); + + while (page != &file->pages && page->offset < rounded_offset) + page = page->next; + + if (page == &file->pages || page->offset != rounded_offset) { + newpage = CALLOC (1, sizeof (*newpage)); + if (!newpage) + return NULL; + + newpage->offset = rounded_offset; + newpage->prev = page->prev; + newpage->next = page; + newpage->file = file; + page->prev->next = newpage; + page->prev = newpage; + + page = newpage; + } + + return page; +} + + +void +ra_wait_on_page (ra_page_t *page, call_frame_t *frame) +{ + ra_waitq_t *waitq = NULL; + ra_local_t *local = NULL; + + + local = frame->local; + waitq = CALLOC (1, sizeof (*waitq)); + if (!waitq) { + gf_log (frame->this->name, GF_LOG_ERROR, + "out of memory :("); + return; + } + + waitq->data = frame; + waitq->next = page->waitq; + page->waitq = waitq; + + ra_local_lock (local); + { + local->wait_count++; + } + ra_local_unlock (local); +} + + +void +ra_waitq_return (ra_waitq_t *waitq) +{ + ra_waitq_t *trav = NULL; + ra_waitq_t *next = NULL; + call_frame_t *frame = NULL; + + for (trav = waitq; trav; trav = next) { + next = trav->next; + + frame = trav->data; + ra_frame_return (frame); + free (trav); + } +} + + 
+int +ra_fault_cbk (call_frame_t *frame, void *cookie, xlator_t *this, + int32_t op_ret, int32_t op_errno, struct iovec *vector, + int32_t count, struct stat *stbuf) +{ + ra_local_t *local = NULL; + off_t pending_offset = 0; + ra_file_t *file = NULL; + ra_page_t *page = NULL; + off_t trav_offset = 0; + size_t payload_size = 0; + ra_waitq_t *waitq = NULL; + fd_t *fd = NULL; + int ret = 0; + uint64_t tmp_file = 0; + + local = frame->local; + fd = local->fd; + + ret = fd_ctx_get (fd, this, &tmp_file); + + file = (ra_file_t *)(long)tmp_file; + pending_offset = local->pending_offset; + trav_offset = pending_offset; + payload_size = op_ret; + + ra_file_lock (file); + { + if (op_ret >= 0) + file->stbuf = *stbuf; + + if (op_ret < 0) { + page = ra_page_get (file, pending_offset); + if (page) + waitq = ra_page_error (page, op_ret, op_errno); + goto unlock; + } + + page = ra_page_get (file, pending_offset); + if (!page) { + gf_log (this->name, GF_LOG_DEBUG, + "wasted copy: %"PRId64"[+%"PRId64"] file=%p", + pending_offset, file->page_size, file); + goto unlock; + } + + if (page->vector) { + dict_unref (page->ref); + free (page->vector); + } + + page->vector = iov_dup (vector, count); + page->count = count; + page->ref = dict_ref (frame->root->rsp_refs); + page->ready = 1; + + page->size = iov_length (vector, count); + + waitq = ra_page_wakeup (page); + } +unlock: + ra_file_unlock (file); + + ra_waitq_return (waitq); + + fd_unref (local->fd); + + free (frame->local); + frame->local = NULL; + + STACK_DESTROY (frame->root); + return 0; +} + + +void +ra_page_fault (ra_file_t *file, + call_frame_t *frame, + off_t offset) +{ + call_frame_t *fault_frame = NULL; + ra_local_t *fault_local = NULL; + + fault_frame = copy_frame (frame); + fault_local = CALLOC (1, sizeof (ra_local_t)); + + fault_frame->local = fault_local; + fault_local->pending_offset = offset; + fault_local->pending_size = file->page_size; + + fault_local->fd = fd_ref (file->fd); + + STACK_WIND (fault_frame, 
ra_fault_cbk, + FIRST_CHILD (fault_frame->this), + FIRST_CHILD (fault_frame->this)->fops->readv, + file->fd, file->page_size, offset); + return; +} + +void +ra_frame_fill (ra_page_t *page, call_frame_t *frame) +{ + ra_local_t *local = NULL; + ra_fill_t *fill = NULL; + off_t src_offset = 0; + off_t dst_offset = 0; + ssize_t copy_size = 0; + ra_fill_t *new = NULL; + + + local = frame->local; + fill = &local->fill; + + if (local->op_ret != -1 && page->size) { + if (local->offset > page->offset) + src_offset = local->offset - page->offset; + else + dst_offset = page->offset - local->offset; + + copy_size = min (page->size - src_offset, + local->size - dst_offset); + + if (copy_size < 0) { + /* if page contains fewer bytes and the required offset + is beyond the page size in the page */ + copy_size = src_offset = 0; + } + + fill = fill->next; + while (fill != &local->fill) { + if (fill->offset > page->offset) { + break; + } + fill = fill->next; + } + + new = CALLOC (1, sizeof (*new)); + + new->offset = page->offset; + new->size = copy_size; + new->refs = dict_ref (page->ref); + new->count = iov_subset (page->vector, page->count, + src_offset, src_offset+copy_size, + NULL); + new->vector = CALLOC (new->count, sizeof (struct iovec)); + + new->count = iov_subset (page->vector, page->count, + src_offset, src_offset+copy_size, + new->vector); + + new->next = fill; + new->prev = new->next->prev; + new->next->prev = new; + new->prev->next = new; + + local->op_ret += copy_size; + } +} + + +void +ra_frame_unwind (call_frame_t *frame) +{ + ra_local_t *local = NULL; + ra_fill_t *fill = NULL; + int32_t count = 0; + struct iovec *vector; + int32_t copied = 0; + dict_t *refs = NULL; + ra_fill_t *next = NULL; + fd_t *fd = NULL; + ra_file_t *file = NULL; + int ret = 0; + uint64_t tmp_file = 0; + + local = frame->local; + fill = local->fill.next; + + refs = get_new_dict (); + + frame->local = NULL; + + while (fill != &local->fill) { + count += fill->count; + fill = fill->next; + } + + 
vector = CALLOC (count, sizeof (*vector)); + + fill = local->fill.next; + + while (fill != &local->fill) { + next = fill->next; + + memcpy (((char *)vector) + copied, fill->vector, + fill->count * sizeof (*vector)); + + copied += (fill->count * sizeof (*vector)); + dict_copy (fill->refs, refs); + + fill->next->prev = fill->prev; + fill->prev->next = fill->prev; + + dict_unref (fill->refs); + free (fill->vector); + free (fill); + + fill = next; + } + + frame->root->rsp_refs = dict_ref (refs); + + fd = local->fd; + ret = fd_ctx_get (fd, frame->this, &tmp_file); + file = (ra_file_t *)(long)tmp_file; + + STACK_UNWIND (frame, local->op_ret, local->op_errno, + vector, count, &file->stbuf); + + dict_unref (refs); + pthread_mutex_destroy (&local->local_lock); + free (local); + free (vector); + + return; +} + +/* + * ra_frame_return - + * @frame: + * + */ +void +ra_frame_return (call_frame_t *frame) +{ + ra_local_t *local = NULL; + int32_t wait_count = 0; + + local = frame->local; + assert (local->wait_count > 0); + + ra_local_lock (local); + { + wait_count = --local->wait_count; + } + ra_local_unlock (local); + + if (!wait_count) + ra_frame_unwind (frame); + + return; +} + +/* + * ra_page_wakeup - + * @page: + * + */ +ra_waitq_t * +ra_page_wakeup (ra_page_t *page) +{ + ra_waitq_t *waitq = NULL, *trav = NULL; + call_frame_t *frame; + + waitq = page->waitq; + page->waitq = NULL; + + trav = waitq; + for (trav = waitq; trav; trav = trav->next) { + frame = trav->data; + ra_frame_fill (page, frame); + } + + return waitq; +} + +/* + * ra_page_purge - + * @page: + * + */ +void +ra_page_purge (ra_page_t *page) +{ + page->prev->next = page->next; + page->next->prev = page->prev; + + if (page->ref) { + dict_unref (page->ref); + } + free (page->vector); + free (page); +} + +/* + * ra_page_error - + * @page: + * @op_ret: + * @op_errno: + * + */ +ra_waitq_t * +ra_page_error (ra_page_t *page, int32_t op_ret, int32_t op_errno) +{ + + ra_waitq_t *waitq = NULL; + ra_waitq_t *trav = NULL; + 
call_frame_t *frame = NULL; + ra_local_t *local = NULL; + + waitq = page->waitq; + page->waitq = NULL; + + trav = waitq; + for (trav = waitq; trav; trav = trav->next) { + frame = trav->data; + + local = frame->local; + if (local->op_ret != -1) { + local->op_ret = op_ret; + local->op_errno = op_errno; + } + } + + ra_page_purge (page); + + return waitq; +} + +/* + * ra_file_destroy - + * @file: + * + */ +void +ra_file_destroy (ra_file_t *file) +{ + ra_conf_t *conf = NULL; + ra_page_t *trav = NULL; + + conf = file->conf; + + ra_conf_lock (conf); + { + file->prev->next = file->next; + file->next->prev = file->prev; + } + ra_conf_unlock (conf); + + trav = file->pages.next; + while (trav != &file->pages) { + ra_page_error (trav, -1, EINVAL); + trav = file->pages.next; + } + + pthread_mutex_destroy (&file->file_lock); + free (file); +} + diff --git a/xlators/performance/read-ahead/src/read-ahead.c b/xlators/performance/read-ahead/src/read-ahead.c new file mode 100644 index 000000000..0060e00fd --- /dev/null +++ b/xlators/performance/read-ahead/src/read-ahead.c @@ -0,0 +1,890 @@ +/* + Copyright (c) 2006, 2007, 2008, 2009 Z RESEARCH, Inc. <http://www.zresearch.com> + This file is part of GlusterFS. + + GlusterFS is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published + by the Free Software Foundation; either version 3 of the License, + or (at your option) any later version. + + GlusterFS is distributed in the hope that it will be useful, but + WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program. If not, see + <http://www.gnu.org/licenses/>. 
+*/ + +/* + TODO: + - handle O_DIRECT + - maintain offset, flush on lseek + - ensure efficient memory managment in case of random seek +*/ + +#ifndef _CONFIG_H +#define _CONFIG_H +#include "config.h" +#endif + +#include "glusterfs.h" +#include "logging.h" +#include "dict.h" +#include "xlator.h" +#include "read-ahead.h" +#include <assert.h> +#include <sys/time.h> + + +static void +read_ahead (call_frame_t *frame, + ra_file_t *file); + + +int +ra_open_cbk (call_frame_t *frame, void *cookie, xlator_t *this, + int32_t op_ret, int32_t op_errno, fd_t *fd) +{ + ra_conf_t *conf = NULL; + ra_file_t *file = NULL; + int ret = 0; + + conf = this->private; + + if (op_ret == -1) { + goto unwind; + } + + file = CALLOC (1, sizeof (*file)); + if (!file) { + gf_log (this->name, GF_LOG_ERROR, + "out of memory :("); + goto unwind; + } + + ret = fd_ctx_set (fd, this, (uint64_t)(long)file); + + /* If mandatory locking has been enabled on this file, + we disable caching on it */ + + if ((fd->inode->st_mode & S_ISGID) && !(fd->inode->st_mode & S_IXGRP)) + file->disabled = 1; + + /* If O_DIRECT open, we disable caching on it */ + + if ((fd->flags & O_DIRECT) || (fd->flags & O_WRONLY)) + file->disabled = 1; + + file->offset = (unsigned long long) 0; + file->conf = conf; + file->pages.next = &file->pages; + file->pages.prev = &file->pages; + file->pages.offset = (unsigned long long) 0; + file->pages.file = file; + + ra_conf_lock (conf); + { + file->next = conf->files.next; + conf->files.next = file; + file->next->prev = file; + file->prev = &conf->files; + } + ra_conf_unlock (conf); + + file->fd = fd; + file->page_count = conf->page_count; + file->page_size = conf->page_size; + pthread_mutex_init (&file->file_lock, NULL); + + if (!file->disabled) { + file->page_count = 1; + } + +unwind: + STACK_UNWIND (frame, op_ret, op_errno, fd); + + return 0; +} + + +int +ra_create_cbk (call_frame_t *frame, void *cookie, xlator_t *this, + int32_t op_ret, int32_t op_errno, + fd_t *fd, inode_t *inode, 
struct stat *buf) +{ + ra_conf_t *conf = NULL; + ra_file_t *file = NULL; + int ret = 0; + + conf = this->private; + + if (op_ret == -1) { + goto unwind; + } + + file = CALLOC (1, sizeof (*file)); + if (!file) { + gf_log (this->name, GF_LOG_ERROR, + "out of memory :("); + goto unwind; + } + + ret = fd_ctx_set (fd, this, (uint64_t)(long)file); + + /* If mandatory locking has been enabled on this file, + we disable caching on it */ + + if ((fd->inode->st_mode & S_ISGID) && !(fd->inode->st_mode & S_IXGRP)) + file->disabled = 1; + + /* If O_DIRECT open, we disable caching on it */ + + if ((fd->flags & O_DIRECT) || (fd->flags & O_WRONLY)) + file->disabled = 1; + + file->offset = (unsigned long long) 0; + //file->size = fd->inode->buf.st_size; + file->conf = conf; + file->pages.next = &file->pages; + file->pages.prev = &file->pages; + file->pages.offset = (unsigned long long) 0; + file->pages.file = file; + + ra_conf_lock (conf); + { + file->next = conf->files.next; + conf->files.next = file; + file->next->prev = file; + file->prev = &conf->files; + } + ra_conf_unlock (conf); + + file->fd = fd; + file->page_count = conf->page_count; + file->page_size = conf->page_size; + pthread_mutex_init (&file->file_lock, NULL); + +unwind: + STACK_UNWIND (frame, op_ret, op_errno, fd, inode, buf); + + return 0; +} + + +int +ra_open (call_frame_t *frame, xlator_t *this, + loc_t *loc, int32_t flags, fd_t *fd) +{ + STACK_WIND (frame, ra_open_cbk, + FIRST_CHILD (this), + FIRST_CHILD (this)->fops->open, + loc, flags, fd); + + return 0; +} + +int +ra_create (call_frame_t *frame, xlator_t *this, + loc_t *loc, int32_t flags, mode_t mode, fd_t *fd) +{ + STACK_WIND (frame, ra_create_cbk, + FIRST_CHILD(this), + FIRST_CHILD(this)->fops->create, + loc, flags, mode, fd); + + return 0; +} + +/* free cache pages between offset and offset+size, + does not touch pages with frames waiting on it +*/ + +static void +flush_region (call_frame_t *frame, + ra_file_t *file, + off_t offset, + off_t size) +{ + 
ra_page_t *trav = NULL; + ra_page_t *next = NULL; + + + ra_file_lock (file); + { + trav = file->pages.next; + while (trav != &file->pages + && trav->offset < (offset + size)) { + + next = trav->next; + if (trav->offset >= offset && !trav->waitq) { + ra_page_purge (trav); + } + trav = next; + } + } + ra_file_unlock (file); +} + + + +int +ra_release (xlator_t *this, + fd_t *fd) +{ + uint64_t tmp_file = 0; + int ret = 0; + + ret = fd_ctx_del (fd, this, &tmp_file); + + if (!ret) { + ra_file_destroy ((ra_file_t *)(long)tmp_file); + } + + return 0; +} + + +void +read_ahead (call_frame_t *frame, ra_file_t *file) +{ + off_t ra_offset = 0; + size_t ra_size = 0; + off_t trav_offset = 0; + ra_page_t *trav = NULL; + off_t cap = 0; + char fault = 0; + + if (!file->page_count) + return; + + ra_size = file->page_size * file->page_count; + ra_offset = floor (file->offset, file->page_size); + cap = file->size ? file->size : file->offset + ra_size; + + while (ra_offset < min (file->offset + ra_size, cap)) { + + ra_file_lock (file); + { + trav = ra_page_get (file, ra_offset); + } + ra_file_unlock (file); + + if (!trav) + break; + + ra_offset += file->page_size; + } + + if (trav) + /* comfortable enough */ + return; + + trav_offset = ra_offset; + + trav = file->pages.next; + cap = file->size ? 
file->size : ra_offset + ra_size; + + while (trav_offset < min(ra_offset + ra_size, cap)) { + fault = 0; + ra_file_lock (file); + { + trav = ra_page_get (file, trav_offset); + if (!trav) { + fault = 1; + trav = ra_page_create (file, trav_offset); + if (trav) + trav->dirty = 1; + } + } + ra_file_unlock (file); + + if (!trav) { + /* OUT OF MEMORY */ + break; + } + + if (fault) { + gf_log (frame->this->name, GF_LOG_DEBUG, + "RA at offset=%"PRId64, trav_offset); + ra_page_fault (file, frame, trav_offset); + } + trav_offset += file->page_size; + } + + return; +} + + +int +ra_need_atime_cbk (call_frame_t *frame, void *cookie, xlator_t *this, + int32_t op_ret, int32_t op_errno, struct iovec *vector, + int32_t count, struct stat *stbuf) +{ + STACK_DESTROY (frame->root); + return 0; +} + + +static void +dispatch_requests (call_frame_t *frame, + ra_file_t *file) +{ + ra_local_t *local = NULL; + ra_conf_t *conf = NULL; + off_t rounded_offset = 0; + off_t rounded_end = 0; + off_t trav_offset = 0; + ra_page_t *trav = NULL; + call_frame_t *ra_frame = NULL; + char need_atime_update = 1; + char fault = 0; + + + local = frame->local; + conf = file->conf; + + rounded_offset = floor (local->offset, file->page_size); + rounded_end = roof (local->offset + local->size, file->page_size); + + trav_offset = rounded_offset; + trav = file->pages.next; + + while (trav_offset < rounded_end) { + fault = 0; + + ra_file_lock (file); + { + trav = ra_page_get (file, trav_offset); + if (!trav) { + trav = ra_page_create (file, trav_offset); + fault = 1; + need_atime_update = 0; + } + + if (!trav) + goto unlock; + + if (trav->ready) { + gf_log (frame->this->name, GF_LOG_DEBUG, + "HIT at offset=%"PRId64".", + trav_offset); + ra_frame_fill (trav, frame); + } else { + gf_log (frame->this->name, GF_LOG_DEBUG, + "IN-TRANSIT at offset=%"PRId64".", + trav_offset); + ra_wait_on_page (trav, frame); + need_atime_update = 0; + } + } + unlock: + ra_file_unlock (file); + + if (fault) { + gf_log (frame->this->name, 
GF_LOG_DEBUG,
+                                "MISS at offset=%"PRId64".",
+                                trav_offset);
+                        ra_page_fault (file, frame, trav_offset);
+                }
+
+                trav_offset += file->page_size;
+        }
+
+        if (need_atime_update && conf->force_atime_update) {
+                /* TODO: use utimens() since readv() can confuse underlying
+                   io-cache and others */
+                ra_frame = copy_frame (frame);
+                STACK_WIND (ra_frame, ra_need_atime_cbk,
+                            FIRST_CHILD (frame->this),
+                            FIRST_CHILD (frame->this)->fops->readv,
+                            file->fd, 1, 1);
+        }
+
+        return ;
+}
+
+
+/*
+ * ra_readv_disabled_cbk - callback for reads that bypass read-ahead;
+ * forwards the child's result to the caller unchanged.
+ */
+int
+ra_readv_disabled_cbk (call_frame_t *frame, void *cookie, xlator_t *this,
+                       int32_t op_ret, int32_t op_errno,
+                       struct iovec *vector, int32_t count, struct stat *stbuf)
+{
+        STACK_UNWIND (frame, op_ret, op_errno, vector, count, stbuf);
+
+        return 0;
+}
+
+
+/*
+ * ra_readv - read entry point of the read-ahead translator
+ * @frame: call frame of the request
+ * @this: this translator; this->private is the ra_conf_t
+ * @fd: fd being read; its context holds the ra_file_t set at open/create
+ * @size: number of bytes requested
+ * @offset: file offset of the request
+ *
+ * A read whose offset equals file->offset is treated as sequential and
+ * grows file->page_count, capped at conf->page_count. Any other offset
+ * resets the counters and flushes the cached pages.
+ *
+ * An fd with no read-ahead context, or one marked disabled, is passed
+ * straight to the child through ra_readv_disabled_cbk.
+ */
+int
+ra_readv (call_frame_t *frame, xlator_t *this,
+          fd_t *fd, size_t size, off_t offset)
+{
+        ra_file_t    *file = NULL;
+        ra_local_t   *local = NULL;
+        ra_conf_t    *conf = NULL;
+        int           op_errno = 0;
+        int           ret = 0;
+        char          expected_offset = 1;
+        uint64_t      tmp_file = 0;
+
+        conf = this->private;
+
+        gf_log (this->name, GF_LOG_DEBUG,
+                "NEW REQ at offset=%"PRId64" for size=%"GF_PRI_SIZET"",
+                offset, size);
+
+        ret = fd_ctx_get (fd, this, &tmp_file);
+        file = (ra_file_t *)(long)tmp_file;
+
+        if (!file) {
+                /* no read-ahead context on this fd (the other fops in
+                   this file guard against this too): pass through */
+                STACK_WIND (frame, ra_readv_disabled_cbk,
+                            FIRST_CHILD (this),
+                            FIRST_CHILD (this)->fops->readv,
+                            fd, size, offset);
+                return 0;
+        }
+
+        if (file->offset != offset) {
+                gf_log (this->name, GF_LOG_DEBUG,
+                        "unexpected offset (%"PRId64" != %"PRId64") resetting",
+                        file->offset, offset);
+
+                expected_offset = file->expected = file->page_count = 0;
+        } else {
+                gf_log (this->name, GF_LOG_DEBUG,
+                        "expected offset (%"PRId64") when page_count=%d",
+                        offset, file->page_count);
+
+                if (file->expected < (conf->page_size * conf->page_count)) {
+                        file->expected += size;
+                        file->page_count = min ((file->expected / file->page_size),
+                                                conf->page_count);
+                }
+        }
+
+        if (!expected_offset) {
+                /* non-sequential access: drop every page with no waiters */
+                flush_region (frame, file, 0, file->pages.prev->offset + 1);
+        }
+
+        if (file->disabled) {
+                STACK_WIND (frame, ra_readv_disabled_cbk,
+                            FIRST_CHILD (frame->this),
+                            FIRST_CHILD (frame->this)->fops->readv,
+
file->fd, size, offset); + return 0; + } + + local = (void *) CALLOC (1, sizeof (*local)); + if (!local) { + gf_log (this->name, GF_LOG_ERROR, + "out of memory :("); + op_errno = ENOMEM; + goto unwind; + } + + local->fd = fd; + local->offset = offset; + local->size = size; + local->wait_count = 1; + + local->fill.next = &local->fill; + local->fill.prev = &local->fill; + + pthread_mutex_init (&local->local_lock, NULL); + + frame->local = local; + + dispatch_requests (frame, file); + + flush_region (frame, file, 0, floor (offset, file->page_size)); + + read_ahead (frame, file); + + ra_frame_return (frame); + + file->offset = offset + size; + + return 0; + +unwind: + STACK_UNWIND (frame, -1, op_errno, NULL, 0, NULL); + + return 0; +} + + +int +ra_flush_cbk (call_frame_t *frame, + void *cookie, + xlator_t *this, + int32_t op_ret, + int32_t op_errno) +{ + STACK_UNWIND (frame, op_ret, op_errno); + return 0; +} + + +int +ra_flush (call_frame_t *frame, xlator_t *this, fd_t *fd) +{ + ra_file_t *file = NULL; + int ret = 0; + uint64_t tmp_file = 0; + + ret = fd_ctx_get (fd, this, &tmp_file); + file = (ra_file_t *)(long)tmp_file; + + if (file) { + flush_region (frame, file, 0, file->pages.prev->offset+1); + } + + STACK_WIND (frame, ra_flush_cbk, + FIRST_CHILD (this), + FIRST_CHILD (this)->fops->flush, + fd); + return 0; +} + + +int +ra_fsync (call_frame_t *frame, xlator_t *this, fd_t *fd, + int32_t datasync) +{ + ra_file_t *file = NULL; + int ret = 0; + uint64_t tmp_file = 0; + + ret = fd_ctx_get (fd, this, &tmp_file); + file = (ra_file_t *)(long)tmp_file; + + if (file) { + flush_region (frame, file, 0, file->pages.prev->offset+1); + } + + STACK_WIND (frame, ra_flush_cbk, + FIRST_CHILD (this), + FIRST_CHILD (this)->fops->fsync, + fd, datasync); + return 0; +} + + +int +ra_writev_cbk (call_frame_t *frame, void *cookie, xlator_t *this, + int32_t op_ret, int32_t op_errno, struct stat *stbuf) +{ + fd_t *fd = NULL; + ra_file_t *file = NULL; + int ret = 0; + uint64_t tmp_file = 0; + 
+ fd = frame->local; + + ret = fd_ctx_get (fd, this, &tmp_file); + file = (ra_file_t *)(long)tmp_file; + + if (file) { + flush_region (frame, file, 0, file->pages.prev->offset+1); + } + + frame->local = NULL; + STACK_UNWIND (frame, op_ret, op_errno, stbuf); + return 0; +} + + +int +ra_writev (call_frame_t *frame, xlator_t *this, fd_t *fd, + struct iovec *vector, int32_t count, off_t offset) +{ + ra_file_t *file = NULL; + int ret = 0; + uint64_t tmp_file = 0; + + ret = fd_ctx_get (fd, this, &tmp_file); + file = (ra_file_t *)(long)tmp_file; + + if (file) { + flush_region (frame, file, 0, file->pages.prev->offset+1); + + /* reset the read-ahead counters too */ + file->expected = file->page_count = 0; + } + + frame->local = fd; + + STACK_WIND (frame, ra_writev_cbk, + FIRST_CHILD(this), + FIRST_CHILD(this)->fops->writev, + fd, vector, count, offset); + + return 0; +} + + +int +ra_attr_cbk (call_frame_t *frame, void *cookie, xlator_t *this, + int32_t op_ret, int32_t op_errno, struct stat *buf) +{ + STACK_UNWIND (frame, op_ret, op_errno, buf); + return 0; +} + + +int +ra_truncate (call_frame_t *frame, xlator_t *this, + loc_t *loc, off_t offset) +{ + ra_file_t *file = NULL; + fd_t *iter_fd = NULL; + inode_t *inode = NULL; + int ret = 0; + uint64_t tmp_file = 0; + + inode = loc->inode; + + LOCK (&inode->lock); + { + list_for_each_entry (iter_fd, &inode->fd_list, inode_list) { + ret = fd_ctx_get (iter_fd, this, &tmp_file); + file = (ra_file_t *)(long)tmp_file; + + if (!file) + continue; + flush_region (frame, file, 0, + file->pages.prev->offset + 1); + } + } + UNLOCK (&inode->lock); + + STACK_WIND (frame, ra_attr_cbk, + FIRST_CHILD (this), + FIRST_CHILD (this)->fops->truncate, + loc, offset); + return 0; +} + + +int +ra_fstat (call_frame_t *frame, xlator_t *this, + fd_t *fd) +{ + ra_file_t *file = NULL; + fd_t *iter_fd = NULL; + inode_t *inode = NULL; + int ret = 0; + uint64_t tmp_file = 0; + + inode = fd->inode; + + LOCK (&inode->lock); + { + list_for_each_entry (iter_fd, 
&inode->fd_list, inode_list) { + ret = fd_ctx_get (iter_fd, this, &tmp_file); + file = (ra_file_t *)(long)tmp_file; + + if (!file) + continue; + flush_region (frame, file, 0, + file->pages.prev->offset + 1); + } + } + UNLOCK (&inode->lock); + + STACK_WIND (frame, ra_attr_cbk, + FIRST_CHILD (this), + FIRST_CHILD (this)->fops->fstat, + fd); + return 0; +} + + +int +ra_fchown (call_frame_t *frame, xlator_t *this, + fd_t *fd, uid_t uid, gid_t gid) +{ + ra_file_t *file = NULL; + fd_t *iter_fd = NULL; + inode_t *inode = NULL; + int ret = 0; + uint64_t tmp_file = 0; + + inode = fd->inode; + + LOCK (&inode->lock); + { + list_for_each_entry (iter_fd, &inode->fd_list, inode_list) { + ret = fd_ctx_get (iter_fd, this, &tmp_file); + file = (ra_file_t *)(long)tmp_file; + + if (!file) + continue; + flush_region (frame, file, 0, + file->pages.prev->offset + 1); + } + } + UNLOCK (&inode->lock); + + STACK_WIND (frame, ra_attr_cbk, + FIRST_CHILD (this), + FIRST_CHILD (this)->fops->fchown, + fd, uid, gid); + return 0; +} + + +int +ra_ftruncate (call_frame_t *frame, xlator_t *this, + fd_t *fd, off_t offset) +{ + ra_file_t *file = NULL; + fd_t *iter_fd = NULL; + inode_t *inode = NULL; + int ret = 0; + uint64_t tmp_file = 0; + + inode = fd->inode; + + LOCK (&inode->lock); + { + list_for_each_entry (iter_fd, &inode->fd_list, inode_list) { + ret = fd_ctx_get (iter_fd, this, &tmp_file); + file = (ra_file_t *)(long)tmp_file; + if (!file) + continue; + flush_region (frame, file, 0, + file->pages.prev->offset + 1); + } + } + UNLOCK (&inode->lock); + + STACK_WIND (frame, ra_attr_cbk, + FIRST_CHILD (this), + FIRST_CHILD (this)->fops->ftruncate, + fd, offset); + return 0; +} + + +int +init (xlator_t *this) +{ + ra_conf_t *conf; + dict_t *options = this->options; + char *page_size_string = NULL; + char *page_count_string = NULL; + + if (!this->children || this->children->next) { + gf_log (this->name, GF_LOG_ERROR, + "FATAL: read-ahead not configured with exactly one child"); + return -1; + } + + 
if (!this->parents) { + gf_log (this->name, GF_LOG_WARNING, + "dangling volume. check volfile "); + } + + conf = (void *) CALLOC (1, sizeof (*conf)); + ERR_ABORT (conf); + conf->page_size = 256 * 1024; + conf->page_count = 2; + + if (dict_get (options, "page-size")) + page_size_string = data_to_str (dict_get (options, + "page-size")); + if (page_size_string) + { + if (gf_string2bytesize (page_size_string, &conf->page_size) != 0) + { + gf_log ("read-ahead", + GF_LOG_ERROR, + "invalid number format \"%s\" of \"option page-size\"", + page_size_string); + return -1; + } + + gf_log (this->name, GF_LOG_DEBUG, "Using conf->page_size = %"PRIu64"", + conf->page_size); + } + + if (dict_get (options, "page-count")) + page_count_string = data_to_str (dict_get (options, + "page-count")); + if (page_count_string) + { + if (gf_string2uint_base10 (page_count_string, &conf->page_count) != 0) + { + gf_log ("read-ahead", + GF_LOG_ERROR, + "invalid number format \"%s\" of \"option page-count\"", + page_count_string); + return -1; + } + gf_log (this->name, GF_LOG_DEBUG, "Using conf->page_count = %u", + conf->page_count); + } + + if (dict_get (options, "force-atime-update")) { + char *force_atime_update_str = data_to_str (dict_get (options, + "force-atime-update")); + if (gf_string2boolean (force_atime_update_str, &conf->force_atime_update) == -1) { + gf_log (this->name, GF_LOG_ERROR, + "'force-atime-update' takes only boolean options"); + return -1; + } + if (conf->force_atime_update) + gf_log (this->name, GF_LOG_DEBUG, "Forcing atime updates on cache hit"); + } + + conf->files.next = &conf->files; + conf->files.prev = &conf->files; + + pthread_mutex_init (&conf->conf_lock, NULL); + this->private = conf; + return 0; +} + +void +fini (xlator_t *this) +{ + ra_conf_t *conf = this->private; + + pthread_mutex_destroy (&conf->conf_lock); + FREE (conf); + + this->private = NULL; + return; +} + +struct xlator_fops fops = { + .open = ra_open, + .create = ra_create, + .readv = ra_readv, + 
.writev = ra_writev, + .flush = ra_flush, + .fsync = ra_fsync, + .truncate = ra_truncate, + .ftruncate = ra_ftruncate, + .fstat = ra_fstat, + .fchown = ra_fchown, +}; + +struct xlator_mops mops = { +}; + +struct xlator_cbks cbks = { + .release = ra_release, +}; + +struct volume_options options[] = { + { .key = {"force-atime-update"}, + .type = GF_OPTION_TYPE_BOOL + }, + { .key = {"page-size"}, + .type = GF_OPTION_TYPE_SIZET, + .min = 64 * GF_UNIT_KB, + .max = 2 * GF_UNIT_MB + }, + { .key = {"page-count"}, + .type = GF_OPTION_TYPE_INT, + .min = 1, + .max = 16 + }, + { .key = {NULL} }, +}; diff --git a/xlators/performance/read-ahead/src/read-ahead.h b/xlators/performance/read-ahead/src/read-ahead.h new file mode 100644 index 000000000..d624ca8ab --- /dev/null +++ b/xlators/performance/read-ahead/src/read-ahead.h @@ -0,0 +1,194 @@ +/* + Copyright (c) 2006, 2007, 2008 Z RESEARCH, Inc. <http://www.zresearch.com> + This file is part of GlusterFS. + + GlusterFS is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published + by the Free Software Foundation; either version 3 of the License, + or (at your option) any later version. + + GlusterFS is distributed in the hope that it will be useful, but + WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program. If not, see + <http://www.gnu.org/licenses/>. 
+*/ + +#ifndef __READ_AHEAD_H +#define __READ_AHEAD_H + +#ifndef _CONFIG_H +#define _CONFIG_H +#include "config.h" +#endif + + +#include "glusterfs.h" +#include "logging.h" +#include "dict.h" +#include "xlator.h" +#include "common-utils.h" + +struct ra_conf; +struct ra_local; +struct ra_page; +struct ra_file; +struct ra_waitq; + + +struct ra_waitq { + struct ra_waitq *next; + void *data; +}; + + +struct ra_fill { + struct ra_fill *next; + struct ra_fill *prev; + off_t offset; + size_t size; + struct iovec *vector; + int32_t count; + dict_t *refs; +}; + + +struct ra_local { + mode_t mode; + struct ra_fill fill; + off_t offset; + size_t size; + int32_t op_ret; + int32_t op_errno; + off_t pending_offset; + size_t pending_size; + fd_t *fd; + int32_t wait_count; + pthread_mutex_t local_lock; +}; + + +struct ra_page { + struct ra_page *next; + struct ra_page *prev; + struct ra_file *file; + char dirty; + char ready; + struct iovec *vector; + int32_t count; + off_t offset; + size_t size; + struct ra_waitq *waitq; + dict_t *ref; +}; + + +struct ra_file { + struct ra_file *next; + struct ra_file *prev; + struct ra_conf *conf; + fd_t *fd; + int disabled; + size_t expected; + struct ra_page pages; + off_t offset; + size_t size; + int32_t refcount; + pthread_mutex_t file_lock; + struct stat stbuf; + uint64_t page_size; + uint32_t page_count; +}; + + +struct ra_conf { + uint64_t page_size; + uint32_t page_count; + void *cache_block; + struct ra_file files; + gf_boolean_t force_atime_update; + pthread_mutex_t conf_lock; +}; + + +typedef struct ra_conf ra_conf_t; +typedef struct ra_local ra_local_t; +typedef struct ra_page ra_page_t; +typedef struct ra_file ra_file_t; +typedef struct ra_waitq ra_waitq_t; +typedef struct ra_fill ra_fill_t; + +ra_page_t * +ra_page_get (ra_file_t *file, + off_t offset); +ra_page_t * +ra_page_create (ra_file_t *file, + off_t offset); +void +ra_page_fault (ra_file_t *file, + call_frame_t *frame, + off_t offset); +void +ra_wait_on_page (ra_page_t 
*page, + call_frame_t *frame); +ra_waitq_t * +ra_page_wakeup (ra_page_t *page); + +void +ra_page_flush (ra_page_t *page); + +ra_waitq_t * +ra_page_error (ra_page_t *page, + int32_t op_ret, + int32_t op_errno); +void +ra_page_purge (ra_page_t *page); + +void +ra_frame_return (call_frame_t *frame); +void +ra_frame_fill (ra_page_t *page, + call_frame_t *frame); + +void +ra_file_destroy (ra_file_t *file); + +static inline void +ra_file_lock (ra_file_t *file) +{ + pthread_mutex_lock (&file->file_lock); +} + +static inline void +ra_file_unlock (ra_file_t *file) +{ + pthread_mutex_unlock (&file->file_lock); +} + +static inline void +ra_conf_lock (ra_conf_t *conf) +{ + pthread_mutex_lock (&conf->conf_lock); +} + +static inline void +ra_conf_unlock (ra_conf_t *conf) +{ + pthread_mutex_unlock (&conf->conf_lock); +} +static inline void +ra_local_lock (ra_local_t *local) +{ + pthread_mutex_lock (&local->local_lock); +} + +static inline void +ra_local_unlock (ra_local_t *local) +{ + pthread_mutex_unlock (&local->local_lock); +} + +#endif /* __READ_AHEAD_H */ diff --git a/xlators/performance/stat-prefetch/Makefile.am b/xlators/performance/stat-prefetch/Makefile.am new file mode 100644 index 000000000..af437a64d --- /dev/null +++ b/xlators/performance/stat-prefetch/Makefile.am @@ -0,0 +1 @@ +SUBDIRS = src diff --git a/xlators/performance/stat-prefetch/src/Makefile.am b/xlators/performance/stat-prefetch/src/Makefile.am new file mode 100644 index 000000000..e52f2df48 --- /dev/null +++ b/xlators/performance/stat-prefetch/src/Makefile.am @@ -0,0 +1,11 @@ +xlator_PROGRAMS = stat-prefetch.so +xlatordir = $(libdir)/glusterfs/$(PACKAGE_VERSION)/xlator/performance + +stat_prefetch_so_SOURCES = stat-prefetch.c +noinst_HEADERS = stat-prefetch.h + +AM_CFLAGS = -fPIC -D_FILE_OFFSET_BITS=64 -D_GNU_SOURCE -Wall \ + -I$(top_srcdir)/libglusterfs/src -shared -nostartfiles + +CLEANFILES = + diff --git a/xlators/performance/stat-prefetch/src/stat-prefetch.c 
b/xlators/performance/stat-prefetch/src/stat-prefetch.c new file mode 100644 index 000000000..f2a78f676 --- /dev/null +++ b/xlators/performance/stat-prefetch/src/stat-prefetch.c @@ -0,0 +1,508 @@ +/* + Copyright (c) 2006, 2007, 2008 Z RESEARCH, Inc. <http://www.zresearch.com> + This file is part of GlusterFS. + + GlusterFS is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published + by the Free Software Foundation; either version 3 of the License, + or (at your option) any later version. + + GlusterFS is distributed in the hope that it will be useful, but + WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program. If not, see + <http://www.gnu.org/licenses/>. +*/ + +#ifndef _CONFIG_H +#define _CONFIG_H +#include "config.h" +#endif + +#include "glusterfs.h" +#include "stat-prefetch.h" +#include "dict.h" +#include "xlator.h" +#include <sys/time.h> + +struct sp_cache { + struct sp_cache *next; + struct sp_cache *prev; + pid_t pid; + long long tv_time; + char *dirname; + dir_entry_t entries; + int32_t count; + pthread_mutex_t lock; +}; + +static void +stat_prefetch_cache_flush (struct sp_cache *cache, int32_t force) +{ + struct sp_cache *trav; + struct timeval tv; + long long tv_time; + + gettimeofday (&tv, NULL); + tv_time = (tv.tv_usec + (tv.tv_sec * 1000000)); + + pthread_mutex_lock (&cache->lock); + + trav = cache->next; + while (trav != cache) { + struct sp_cache *next = trav->next; + { + if (tv_time > trav->tv_time || force) { + gf_log ("stat-prefetch", + GF_LOG_DEBUG, + "flush on: %s", + trav->dirname); + dir_entry_t *entries; + + trav->prev->next = trav->next; + trav->next->prev = trav->prev; + + entries = trav->entries.next; + + while (entries) { + dir_entry_t *nextentry = entries->next; + { 
+                                                free (entries->name);
+                                                free (entries);
+                                        }
+                                        entries = nextentry;
+                                }
+                                free (trav->dirname);
+                                free (trav);
+                        }
+                }
+                trav = next;
+        }
+
+        pthread_mutex_unlock (&cache->lock);
+}
+
+/*
+ * stat_prefetch_cache_fill - store a directory listing in the cache
+ * @cache: list head of the cache; cache->tv_time is the entry lifetime
+ * @pid: pid recorded on a newly created cache node
+ * @dirname: heap-allocated directory name; ownership passes to this
+ *           function (kept in a new node, or freed if a node for the
+ *           directory already exists)
+ * @entries: list head whose chain (entries->next) is moved into the
+ *           cache; entries->next is set to NULL on return
+ *
+ * Any entries previously cached for the directory are freed and replaced.
+ * The node's expiry time is set to the current time plus cache->tv_time.
+ *
+ * The whole update runs under cache->lock.
+ *
+ * Returns 0.
+ */
+static int32_t
+stat_prefetch_cache_fill (struct sp_cache *cache,
+                          pid_t pid,
+                          char *dirname,
+                          dir_entry_t *entries)
+{
+        struct sp_cache *trav;
+        struct timeval tv;
+
+        /* take the lock; this used to be an unlock, which left the list
+           unprotected and unlocked the mutex twice */
+        pthread_mutex_lock (&cache->lock);
+        trav = cache->next;
+        while (trav != cache) {
+                //      if (trav->pid == pid && !strcmp (trav->dirname, dirname)) {
+                if (!strcmp (trav->dirname, dirname)) {
+                        break;
+                }
+                trav = trav->next;
+        }
+
+        if (trav == cache) {
+                /* no node for this directory yet: append one at the tail */
+                trav = CALLOC (1, sizeof (*trav));
+                ERR_ABORT (trav);
+                trav->pid = pid;
+                trav->dirname = dirname;
+
+                trav->prev = cache->prev;
+                trav->next = cache;
+                trav->next->prev = trav;
+                trav->prev->next = trav;
+        } else {
+                /* existing node already has its own copy of the name */
+                free (dirname);
+        }
+
+        /* discard whatever was cached for this directory before */
+        while (trav->entries.next) {
+                dir_entry_t *tmp = trav->entries.next;
+
+                trav->entries.next = trav->entries.next->next;
+                free (tmp->name);
+                free (tmp);
+        }
+        /* steal the caller's chain */
+        trav->entries.next = entries->next;
+        entries->next = NULL;
+
+        gettimeofday (&tv, NULL);
+        trav->tv_time = (tv.tv_usec + (tv.tv_sec * 1000000)) + cache->tv_time;
+
+        pthread_mutex_unlock (&cache->lock);
+        return 0;
+}
+
+static int32_t
+stat_prefetch_cache_lookup (struct sp_cache *cache,
+                            pid_t pid,
+                            const char *path,
+                            struct stat *buf)
+{
+        struct sp_cache *trav;
+        char *dirname = strdup (path);
+        char *filename = strrchr (dirname, '/');
+        dir_entry_t *entries;
+        dir_entry_t *prev = NULL;
+
+        *filename = '\0';
+        filename ++;
+
+        pthread_mutex_lock (&cache->lock);
+        trav = cache->next;
+        while (trav != cache) {
+                //    if ((trav->pid == pid) && !strcmp (dirname, trav->dirname))
+                if (!strcmp (dirname, trav->dirname))
+                        break;
+                trav = trav->next;
+        }
+        if (trav == cache) {
+                free (dirname);
+                pthread_mutex_unlock (&cache->lock);
+                return -1;
+        }
+
+        entries = trav->entries.next;
+        prev = &trav->entries;
+        while (entries) {
+                if (!strcmp (entries->name, filename))
+ break; + prev = entries; + entries = entries->next; + } + if (!entries) { + free (dirname); + pthread_mutex_unlock (&cache->lock); + return -1; + } + + *buf = entries->buf; + prev->next = entries->next; + free (entries->name); + free (entries); + free (dirname); + + pthread_mutex_unlock (&cache->lock); + + return 0; +} + + +int32_t +stat_prefetch_readdir_cbk (call_frame_t *frame, + void *cookie, + xlator_t *this, + int32_t op_ret, + int32_t op_errno, + dir_entry_t *entries, + int32_t count) +{ + char *path = frame->local; + pid_t pid = frame->root->pid; + frame->local = NULL; + + STACK_UNWIND (frame, op_ret, op_errno, entries, count); + + if (op_ret == 0) + stat_prefetch_cache_fill (this->private, + pid, + path, + entries); + else + free (path); + + return 0; +} + +int32_t +stat_prefetch_readdir (call_frame_t *frame, + xlator_t *this, + const char *path) +{ + stat_prefetch_cache_flush (this->private, 0); + + frame->local = strdup (path); + STACK_WIND (frame, + stat_prefetch_readdir_cbk, + FIRST_CHILD(this), + FIRST_CHILD(this)->fops->readdir, + path); + return 0; +} + + +int32_t +stat_prefetch_getattr_cbk (call_frame_t *frame, + void *cookie, + xlator_t *this, + int32_t op_ret, + int32_t op_errno, + struct stat *buf) +{ + STACK_UNWIND (frame, op_ret, op_errno, buf); + return 0; +} + +int32_t +stat_prefetch_getattr (call_frame_t *frame, + struct xlator *this, + const char *path) +{ + struct stat buf; + pid_t pid = frame->root->pid; + stat_prefetch_cache_flush (this->private, 0); + + if (stat_prefetch_cache_lookup (this->private, + pid, + path, + &buf) == 0) { + STACK_UNWIND (frame, 0, 0, &buf); + return 0; + } + + STACK_WIND (frame, + stat_prefetch_getattr_cbk, + FIRST_CHILD(this), + FIRST_CHILD(this)->fops->getattr, + path); + + return 0; +} + + +int32_t +stat_prefetch_unlink_cbk (call_frame_t *frame, + void *cookie, + xlator_t *this, + int32_t op_ret, + int32_t op_errno) +{ + STACK_UNWIND (frame, op_ret, op_errno); + return 0; +} + +int32_t +stat_prefetch_unlink 
(call_frame_t *frame, + struct xlator *this, + const char *path) +{ + stat_prefetch_cache_flush (this->private, 1); + + STACK_WIND (frame, + stat_prefetch_unlink_cbk, + FIRST_CHILD(this), + FIRST_CHILD(this)->fops->unlink, + path); + + return 0; +} + + +int32_t +stat_prefetch_chmod_cbk (call_frame_t *frame, + void *cookie, + xlator_t *this, + int32_t op_ret, + int32_t op_errno, + struct stat *buf) +{ + STACK_UNWIND (frame, op_ret, op_errno, buf); + return 0; +} + +int32_t +stat_prefetch_chmod (call_frame_t *frame, + struct xlator *this, + const char *path, + mode_t mode) +{ + stat_prefetch_cache_flush (this->private, 1); + + STACK_WIND (frame, + stat_prefetch_chmod_cbk, + FIRST_CHILD(this), + FIRST_CHILD(this)->fops->chmod, + path, + mode); + + return 0; +} + + +int32_t +stat_prefetch_chown_cbk (call_frame_t *frame, + void *cookie, + xlator_t *this, + int32_t op_ret, + int32_t op_errno, + struct stat *buf) +{ + STACK_UNWIND (frame, op_ret, op_errno, buf); + return 0; +} + +int32_t +stat_prefetch_chown (call_frame_t *frame, + struct xlator *this, + const char *path, + uid_t uid, + gid_t gid) +{ + stat_prefetch_cache_flush (this->private, 1); + + STACK_WIND (frame, + stat_prefetch_chown_cbk, + FIRST_CHILD(this), + FIRST_CHILD(this)->fops->chown, + path, + uid, + gid); + + return 0; +} + + +int32_t +stat_prefetch_utimes_cbk (call_frame_t *frame, + void *cookie, + xlator_t *this, + int32_t op_ret, + int32_t op_errno, + struct stat *buf) +{ + STACK_UNWIND (frame, op_ret, op_errno, buf); + return 0; +} + +int32_t +stat_prefetch_utimes (call_frame_t *frame, + struct xlator *this, + const char *path, + struct timespec *tvp) +{ + stat_prefetch_cache_flush (this->private, 1); + + STACK_WIND (frame, + stat_prefetch_utimes_cbk, + FIRST_CHILD(this), + FIRST_CHILD(this)->fops->utimes, + path, + tvp); + + return 0; +} + + +int32_t +stat_prefetch_truncate_cbk (call_frame_t *frame, + void *cookie, + xlator_t *this, + int32_t op_ret, + int32_t op_errno, + struct stat *buf) +{ + 
STACK_UNWIND (frame, op_ret, op_errno, buf); + return 0; +} + +int32_t +stat_prefetch_truncate (call_frame_t *frame, + struct xlator *this, + const char *path, + off_t offset) +{ + stat_prefetch_cache_flush (this->private, 1); + + STACK_WIND (frame, + stat_prefetch_truncate_cbk, + FIRST_CHILD(this), + FIRST_CHILD(this)->fops->truncate, + path, + offset); + + return 0; +} + + +int32_t +stat_prefetch_rename_cbk (call_frame_t *frame, + void *cookie, + xlator_t *this, + int32_t op_ret, + int32_t op_errno) +{ + STACK_UNWIND (frame, op_ret, op_errno); + return 0; +} + +int32_t +stat_prefetch_rename (call_frame_t *frame, + struct xlator *this, + const char *oldpath, + const char *newpath) +{ + stat_prefetch_cache_flush (this->private, 1); + + STACK_WIND (frame, + stat_prefetch_rename_cbk, + FIRST_CHILD(this), + FIRST_CHILD(this)->fops->rename, + oldpath, + newpath); + + return 0; +} + +int32_t +init (struct xlator *this) +{ + struct sp_cache *cache; + dict_t *options = this->options; + + if (!this->children || this->children->next) { + gf_log ("stat-prefetch", + GF_LOG_ERROR, + "FATAL: translator %s does not have exactly one child node", + this->name); + return -1; + } + + cache = (void *) CALLOC (1, sizeof (*cache)); + ERR_ABORT (cache); + cache->next = cache->prev = cache; + + cache->tv_time = 1 * 1000000; + + if (dict_get (options, "cache-seconds")) { + cache->tv_time = (data_to_int64 (dict_get (options, "cache-seconds")) * + 1000000); + } + + pthread_mutex_init (&cache->lock, NULL); + + this->private = cache; + return 0; +} + +void +fini (struct xlator *this) +{ + return; +} + + +struct xlator_fops fops = { + .getattr = stat_prefetch_getattr, + .readdir = stat_prefetch_readdir, + .unlink = stat_prefetch_unlink, + .chmod = stat_prefetch_chmod, + .chown = stat_prefetch_chown, + .rename = stat_prefetch_rename, + .utimes = stat_prefetch_utimes, + .truncate = stat_prefetch_truncate, +}; + +struct xlator_mops mops = { +}; diff --git 
a/xlators/performance/stat-prefetch/src/stat-prefetch.h b/xlators/performance/stat-prefetch/src/stat-prefetch.h new file mode 100644 index 000000000..7d9645a2a --- /dev/null +++ b/xlators/performance/stat-prefetch/src/stat-prefetch.h @@ -0,0 +1,32 @@ +/* + Copyright (c) 2006, 2007, 2008 Z RESEARCH, Inc. <http://www.zresearch.com> + This file is part of GlusterFS. + + GlusterFS is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published + by the Free Software Foundation; either version 3 of the License, + or (at your option) any later version. + + GlusterFS is distributed in the hope that it will be useful, but + WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program. If not, see + <http://www.gnu.org/licenses/>. +*/ + +#ifndef _STAT_PREFETCH_H_ +#define _STAT_PREFETCH_H_ + +#ifndef _CONFIG_H +#define _CONFIG_H +#include "config.h" +#endif + +#include <stdio.h> +#include <sys/time.h> +#include "xlator.h" + +#endif /* _STAT_PREFETCH_H_ */ diff --git a/xlators/performance/symlink-cache/Makefile.am b/xlators/performance/symlink-cache/Makefile.am new file mode 100644 index 000000000..d471a3f92 --- /dev/null +++ b/xlators/performance/symlink-cache/Makefile.am @@ -0,0 +1,3 @@ +SUBDIRS = src + +CLEANFILES = diff --git a/xlators/performance/symlink-cache/src/Makefile.am b/xlators/performance/symlink-cache/src/Makefile.am new file mode 100644 index 000000000..b8b257c18 --- /dev/null +++ b/xlators/performance/symlink-cache/src/Makefile.am @@ -0,0 +1,12 @@ +xlator_LTLIBRARIES = symlink-cache.la +xlatordir = $(libdir)/glusterfs/$(PACKAGE_VERSION)/xlator/performance + +symlink_cache_la_LDFLAGS = -module -avoidversion + +symlink_cache_la_SOURCES = symlink-cache.c +symlink_cache_la_LIBADD = 
$(top_builddir)/libglusterfs/src/libglusterfs.la + +AM_CFLAGS = -fPIC -D_FILE_OFFSET_BITS=64 -D_GNU_SOURCE -Wall -D$(GF_HOST_OS)\ + -I$(top_srcdir)/libglusterfs/src -shared -nostartfiles $(GF_CFLAGS) + +CLEANFILES = diff --git a/xlators/performance/symlink-cache/src/symlink-cache.c b/xlators/performance/symlink-cache/src/symlink-cache.c new file mode 100644 index 000000000..fc207a627 --- /dev/null +++ b/xlators/performance/symlink-cache/src/symlink-cache.c @@ -0,0 +1,399 @@ +/* + Copyright (c) 2008 Z RESEARCH, Inc. <http://www.zresearch.com> + This file is part of GlusterFS. + + GlusterFS is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published + by the Free Software Foundation; either version 3 of the License, + or (at your option) any later version. + + GlusterFS is distributed in the hope that it will be useful, but + WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program. If not, see + <http://www.gnu.org/licenses/>. 
+*/
+
+
+#ifndef _CONFIG_H
+#define _CONFIG_H
+#include "config.h"
+#endif
+
+#include "glusterfs.h"
+#include "logging.h"
+#include "dict.h"
+#include "xlator.h"
+#include "list.h"
+#include "compat.h"
+#include "compat-errno.h"
+#include "common-utils.h"
+
+/* Per-inode cache entry: the cached link target and the ctime it belongs to. */
+struct symlink_cache {
+        time_t ctime;
+        char *readlink;
+};
+
+
+/*
+ * Read this translator's inode context into *ctx.
+ *
+ * *ctx is written only when inode_ctx_get() succeeds, so callers must
+ * initialise it (to NULL) beforehand.  Returns inode_ctx_get()'s result.
+ */
+static int
+symlink_inode_ctx_get (inode_t *inode, xlator_t *this, void **ctx)
+{
+        int ret = 0;
+        uint64_t tmp_ctx = 0;
+        ret = inode_ctx_get (inode, this, &tmp_ctx);
+        if (-1 == ret)
+                gf_log (this->name, GF_LOG_ERROR, "dict get failed");
+        else
+                *ctx = (void *)(long)tmp_ctx;
+
+        return ret;
+}
+
+
+/*
+ * Store ctx (NULL clears it) as this translator's inode context.
+ * Returns inode_ctx_put()'s result so callers can detect failure.
+ */
+static int
+symlink_inode_ctx_set (inode_t *inode, xlator_t *this, void *ctx)
+{
+        int ret = 0;
+        ret = inode_ctx_put (inode, this, (uint64_t)(long) ctx);
+        if (-1 == ret)
+                gf_log (this->name, GF_LOG_ERROR, "dict set failed");
+
+        return ret;
+}
+
+
+/*
+ * Fill in the link target of an existing cache entry that has none yet.
+ * Does nothing when the inode has no entry; an already cached target is
+ * kept.  Always returns 0.
+ */
+int
+sc_cache_update (xlator_t *this, inode_t *inode, const char *link)
+{
+        struct symlink_cache *sc = NULL;
+
+        symlink_inode_ctx_get (inode, this, VOID(&sc));
+        if (!sc)
+                return 0;
+
+        if (!sc->readlink) {
+                gf_log (this->name, GF_LOG_DEBUG,
+                        "updating cache: %s", link);
+
+                sc->readlink = strdup (link);
+        } else {
+                gf_log (this->name, GF_LOG_DEBUG,
+                        "not updating existing cache: %s with %s",
+                        sc->readlink, link);
+        }
+
+        return 0;
+}
+
+
+/*
+ * Create or overwrite the cache entry of inode.
+ *
+ * link may be NULL, in which case only the ctime from buf is recorded.
+ * Returns 0 on success and -1 on failure.
+ */
+int
+sc_cache_set (xlator_t *this, inode_t *inode, struct stat *buf,
+              const char *link)
+{
+        struct symlink_cache *sc = NULL;
+        int ret = -1;
+        int need_set = 0;
+
+
+        symlink_inode_ctx_get (inode, this, VOID(&sc));
+        if (!sc) {
+                need_set = 1;
+                sc = CALLOC (1, sizeof (*sc));
+                if (!sc) {
+                        gf_log (this->name, GF_LOG_ERROR,
+                                "out of memory :(");
+                        goto err;
+                }
+        }
+
+        if (sc->readlink) {
+                gf_log (this->name, GF_LOG_DEBUG,
+                        "replacing old cache: %s with new cache: %s",
+                        sc->readlink, link ? link : "(null)");
+                FREE (sc->readlink);
+                sc->readlink = NULL;
+        }
+
+        if (link) {
+                sc->readlink = strdup (link);
+                if (!sc->readlink) {
+                        gf_log (this->name, GF_LOG_ERROR,
+                                "out of memory :(");
+                        goto err;
+                }
+        }
+
+        sc->ctime = buf->st_ctime;
+
+        /* link is NULL when called from sc_cache_validate(); never hand NULL to %s */
+        gf_log (this->name, GF_LOG_DEBUG,
+                "setting symlink cache: %s", link ? link : "(null)");
+
+        if (need_set) {
+                ret = symlink_inode_ctx_set (inode, this, sc);
+
+                if (ret < 0) {
+                        gf_log (this->name, GF_LOG_ERROR,
+                                "could not set inode context (%s)",
+                                strerror (-ret));
+                        goto err;
+                }
+        }
+
+        return 0;
+err:
+
+        /* Free only an entry allocated by this call.  An entry that came
+         * from the inode context is still referenced there and must stay
+         * alive (its readlink is already NULL at this point). */
+        if (sc && need_set) {
+                if (sc->readlink)
+                        FREE (sc->readlink);
+                sc->readlink = NULL;
+                FREE (sc);
+        }
+
+        return -1;
+}
+
+
+/*
+ * Drop the cache entry of inode, if any.  Always returns 0.
+ */
+int
+sc_cache_flush (xlator_t *this, inode_t *inode)
+{
+        struct symlink_cache *sc = NULL;
+
+        symlink_inode_ctx_get (inode, this, VOID(&sc));
+        if (!sc)
+                return 0;
+
+        if (sc->readlink) {
+                gf_log (this->name, GF_LOG_DEBUG,
+                        "flushing cache: %s", sc->readlink);
+
+                FREE (sc->readlink);
+                sc->readlink = NULL;
+        }
+
+        /* Clear the context before freeing, otherwise the next context
+         * lookup on this inode returns a dangling pointer. */
+        symlink_inode_ctx_set (inode, this, NULL);
+        FREE (sc);
+
+        return 0;
+}
+
+
+/*
+ * Reconcile the cache entry of inode with a fresh stat buf.
+ *
+ * Non-symlinks lose their entry; symlinks get an entry created on first
+ * sight, and a cached target is discarded when the ctime has changed.
+ * Always returns 0.
+ */
+int
+sc_cache_validate (xlator_t *this, inode_t *inode, struct stat *buf)
+{
+        struct symlink_cache *sc = NULL;
+        uint64_t tmp_sc = 0;
+
+        if (!S_ISLNK (buf->st_mode)) {
+                sc_cache_flush (this, inode);
+                return 0;
+        }
+
+        symlink_inode_ctx_get (inode, this, VOID(&sc));
+
+        if (!sc) {
+                sc_cache_set (this, inode, buf, NULL);
+                inode_ctx_get (inode, this, &tmp_sc);
+
+                /* re-read the entry before testing it; sc itself is
+                 * still NULL here */
+                sc = (struct symlink_cache *)(long)tmp_sc;
+                if (!sc) {
+                        gf_log (this->name, GF_LOG_ERROR,
+                                "out of memory :(");
+                        return 0;
+                }
+        }
+
+        if (sc->ctime == buf->st_ctime)
+                return 0;
+
+        /* STALE */
+        if (sc->readlink) {
+                gf_log (this->name, GF_LOG_DEBUG,
+                        "flushing cache: %s", sc->readlink);
+
+                FREE (sc->readlink);
+                sc->readlink = NULL;
+        }
+
+        sc->ctime = buf->st_ctime;
+
+        return 0;
+}
+
+
+
+/*
+ * Hand out a strdup()ed copy of the cached link target through *link.
+ * *link is left untouched on a cache miss; the caller frees the copy.
+ * Always returns 0.
+ */
+int
+sc_cache_get (xlator_t *this, inode_t *inode, char **link)
+{
+        struct symlink_cache *sc = NULL;
+
+        symlink_inode_ctx_get (inode, this, VOID(&sc));
+
+        if (!sc)
+                return 0;
+
+        if (link && sc->readlink)
+                *link = strdup (sc->readlink);
+        return 0;
+}
+
+
+int
+sc_readlink_cbk (call_frame_t *frame, void *cookie,
+                 xlator_t *this, int op_ret, int op_errno,
+                 const char *link)
+{
+        if
(op_ret > 0) + sc_cache_update (this, frame->local, link); + + inode_unref (frame->local); + frame->local = NULL; + + STACK_UNWIND (frame, op_ret, op_errno, link); + return 0; +} + + +int +sc_readlink (call_frame_t *frame, xlator_t *this, + loc_t *loc, size_t size) +{ + char *link = NULL; + + sc_cache_get (this, loc->inode, &link); + + if (link) { + /* cache hit */ + gf_log (this->name, GF_LOG_DEBUG, + "cache hit %s -> %s", + loc->path, link); + STACK_UNWIND (frame, strlen (link) + 1, 0, link); + FREE (link); + return 0; + } + + frame->local = inode_ref (loc->inode); + + STACK_WIND (frame, sc_readlink_cbk, + FIRST_CHILD(this), + FIRST_CHILD(this)->fops->readlink, + loc, size); + + return 0; +} + + +int +sc_symlink_cbk (call_frame_t *frame, void *cookie, + xlator_t *this, int op_ret, int op_errno, + inode_t *inode, struct stat *buf) +{ + if (op_ret == 0) { + if (frame->local) { + sc_cache_set (this, inode, buf, frame->local); + } + } + + STACK_UNWIND (frame, op_ret, op_errno, inode, buf); + return 0; +} + + +int +sc_symlink (call_frame_t *frame, xlator_t *this, + const char *dst, loc_t *src) +{ + frame->local = strdup (dst); + + STACK_WIND (frame, sc_symlink_cbk, + FIRST_CHILD(this), + FIRST_CHILD(this)->fops->symlink, + dst, src); + + return 0; +} + + +int +sc_lookup_cbk (call_frame_t *frame, void *cookie, + xlator_t *this, int op_ret, int op_errno, + inode_t *inode, struct stat *buf, dict_t *xattr) +{ + if (op_ret == 0) + sc_cache_validate (this, inode, buf); + else + sc_cache_flush (this, inode); + + STACK_UNWIND (frame, op_ret, op_errno, inode, buf, xattr); + return 0; +} + + +int +sc_lookup (call_frame_t *frame, xlator_t *this, + loc_t *loc, dict_t *xattr_req) +{ + STACK_WIND (frame, sc_lookup_cbk, + FIRST_CHILD(this), + FIRST_CHILD(this)->fops->lookup, + loc, xattr_req); + + return 0; +} + + +int +sc_forget (xlator_t *this, + inode_t *inode) +{ + sc_cache_flush (this, inode); + + return 0; +} + + +int32_t +init (xlator_t *this) +{ + + if (!this->children || 
this->children->next) + { + gf_log (this->name, GF_LOG_ERROR, + "FATAL: volume (%s) not configured with exactly one " + "child", this->name); + return -1; + } + + if (!this->parents) { + gf_log (this->name, GF_LOG_WARNING, + "dangling volume. check volfile "); + } + + return 0; +} + + +void +fini (xlator_t *this) +{ + return; +} + + +struct xlator_fops fops = { + .lookup = sc_lookup, + .symlink = sc_symlink, + .readlink = sc_readlink, +}; + +struct xlator_mops mops = { +}; + +struct xlator_cbks cbks = { + .forget = sc_forget, +}; + +struct volume_options options[] = { + { .key = {NULL} }, +}; diff --git a/xlators/performance/write-behind/Makefile.am b/xlators/performance/write-behind/Makefile.am new file mode 100644 index 000000000..d471a3f92 --- /dev/null +++ b/xlators/performance/write-behind/Makefile.am @@ -0,0 +1,3 @@ +SUBDIRS = src + +CLEANFILES = diff --git a/xlators/performance/write-behind/src/Makefile.am b/xlators/performance/write-behind/src/Makefile.am new file mode 100644 index 000000000..f800abad5 --- /dev/null +++ b/xlators/performance/write-behind/src/Makefile.am @@ -0,0 +1,12 @@ +xlator_LTLIBRARIES = write-behind.la +xlatordir = $(libdir)/glusterfs/$(PACKAGE_VERSION)/xlator/performance + +write_behind_la_LDFLAGS = -module -avoidversion + +write_behind_la_SOURCES = write-behind.c +write_behind_la_LIBADD = $(top_builddir)/libglusterfs/src/libglusterfs.la + +AM_CFLAGS = -fPIC -D_FILE_OFFSET_BITS=64 -D_GNU_SOURCE -Wall -D$(GF_HOST_OS)\ + -I$(top_srcdir)/libglusterfs/src -shared -nostartfiles $(GF_CFLAGS) + +CLEANFILES = diff --git a/xlators/performance/write-behind/src/write-behind.c b/xlators/performance/write-behind/src/write-behind.c new file mode 100644 index 000000000..04a447d49 --- /dev/null +++ b/xlators/performance/write-behind/src/write-behind.c @@ -0,0 +1,1444 @@ +/* + Copyright (c) 2006, 2007, 2008, 2009 Z RESEARCH, Inc. <http://www.zresearch.com> + This file is part of GlusterFS. 
+ + GlusterFS is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published + by the Free Software Foundation; either version 3 of the License, + or (at your option) any later version. + + GlusterFS is distributed in the hope that it will be useful, but + WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program. If not, see + <http://www.gnu.org/licenses/>. +*/ + +/*TODO: check for non null wb_file_data before getting wb_file */ + + +#ifndef _CONFIG_H +#define _CONFIG_H +#include "config.h" +#endif + +#include "glusterfs.h" +#include "logging.h" +#include "dict.h" +#include "xlator.h" +#include "list.h" +#include "compat.h" +#include "compat-errno.h" +#include "common-utils.h" + +#define MAX_VECTOR_COUNT 8 + +typedef struct list_head list_head_t; +struct wb_conf; +struct wb_page; +struct wb_file; + + +struct wb_conf { + uint64_t aggregate_size; + uint64_t window_size; + uint64_t disable_till; + gf_boolean_t enable_O_SYNC; + gf_boolean_t flush_behind; +}; + + +typedef struct wb_local { + list_head_t winds; + struct wb_file *file; + list_head_t unwind_frames; + int op_ret; + int op_errno; + call_frame_t *frame; +} wb_local_t; + + +typedef struct write_request { + call_frame_t *frame; + off_t offset; + /* int32_t op_ret; + int32_t op_errno; */ + struct iovec *vector; + int32_t count; + dict_t *refs; + char write_behind; + char stack_wound; + char got_reply; + list_head_t list; + list_head_t winds; + /* list_head_t unwinds;*/ +} wb_write_request_t; + + +struct wb_file { + int disabled; + uint64_t disable_till; + off_t offset; + size_t window_size; + int32_t refcount; + int32_t op_ret; + int32_t op_errno; + list_head_t request; + fd_t *fd; + gf_lock_t lock; + xlator_t *this; +}; + + +typedef struct 
wb_conf wb_conf_t; +typedef struct wb_page wb_page_t; +typedef struct wb_file wb_file_t; + + +int32_t +wb_process_queue (call_frame_t *frame, wb_file_t *file, char flush_all); + +int32_t +wb_sync (call_frame_t *frame, wb_file_t *file, list_head_t *winds); + +int32_t +wb_sync_all (call_frame_t *frame, wb_file_t *file); + +int32_t +__wb_mark_winds (list_head_t *list, list_head_t *winds, size_t aggregate_size); + + +wb_file_t * +wb_file_create (xlator_t *this, + fd_t *fd) +{ + wb_file_t *file = NULL; + wb_conf_t *conf = this->private; + + file = CALLOC (1, sizeof (*file)); + INIT_LIST_HEAD (&file->request); + + /* fd_ref() not required, file should never decide the existance of + * an fd */ + file->fd= fd; + file->disable_till = conf->disable_till; + file->this = this; + file->refcount = 1; + + fd_ctx_set (fd, this, (uint64_t)(long)file); + + return file; +} + +void +wb_file_destroy (wb_file_t *file) +{ + int32_t refcount = 0; + + LOCK (&file->lock); + { + refcount = --file->refcount; + } + UNLOCK (&file->lock); + + if (!refcount){ + LOCK_DESTROY (&file->lock); + FREE (file); + } + + return; +} + + +int32_t +wb_sync_cbk (call_frame_t *frame, + void *cookie, + xlator_t *this, + int32_t op_ret, + int32_t op_errno, + struct stat *stbuf) +{ + wb_local_t *local = NULL; + list_head_t *winds = NULL; + wb_file_t *file = NULL; + wb_write_request_t *request = NULL, *dummy = NULL; + + local = frame->local; + winds = &local->winds; + file = local->file; + + LOCK (&file->lock); + { + list_for_each_entry_safe (request, dummy, winds, winds) { + request->got_reply = 1; + if (!request->write_behind && (op_ret == -1)) { + wb_local_t *per_request_local = request->frame->local; + per_request_local->op_ret = op_ret; + per_request_local->op_errno = op_errno; + } + + /* + request->op_ret = op_ret; + request->op_errno = op_errno; + */ + } + } + UNLOCK (&file->lock); + + if (op_ret == -1) + { + file->op_ret = op_ret; + file->op_errno = op_errno; + } + + wb_process_queue (frame, file, 0); + + 
/* safe place to do fd_unref */ + fd_unref (file->fd); + + STACK_DESTROY (frame->root); + + return 0; +} + +int32_t +wb_sync_all (call_frame_t *frame, wb_file_t *file) +{ + list_head_t winds; + int32_t bytes = 0; + + INIT_LIST_HEAD (&winds); + + LOCK (&file->lock); + { + bytes = __wb_mark_winds (&file->request, &winds, 0); + } + UNLOCK (&file->lock); + + wb_sync (frame, file, &winds); + + return bytes; +} + + +int32_t +wb_sync (call_frame_t *frame, wb_file_t *file, list_head_t *winds) +{ + wb_write_request_t *dummy = NULL, *request = NULL, *first_request = NULL, *next = NULL; + size_t total_count = 0, count = 0; + size_t copied = 0; + call_frame_t *sync_frame = NULL; + dict_t *refs = NULL; + wb_local_t *local = NULL; + struct iovec *vector = NULL; + int32_t bytes = 0; + size_t bytecount = 0; + + list_for_each_entry (request, winds, winds) + { + total_count += request->count; + bytes += iov_length (request->vector, request->count); + } + + if (!total_count) { + return 0; + } + + list_for_each_entry_safe (request, dummy, winds, winds) { + if (!vector) { + vector = MALLOC (VECTORSIZE (MAX_VECTOR_COUNT)); + refs = get_new_dict (); + + local = CALLOC (1, sizeof (*local)); + INIT_LIST_HEAD (&local->winds); + + first_request = request; + } + + count += request->count; + bytecount = VECTORSIZE (request->count); + memcpy (((char *)vector)+copied, + request->vector, + bytecount); + copied += bytecount; + + if (request->refs) { + dict_copy (request->refs, refs); + } + + next = NULL; + if (request->winds.next != winds) { + next = list_entry (request->winds.next, struct write_request, winds); + } + + list_del_init (&request->winds); + list_add_tail (&request->winds, &local->winds); + + if (!next || ((count + next->count) > MAX_VECTOR_COUNT)) { + sync_frame = copy_frame (frame); + sync_frame->local = local; + local->file = file; + sync_frame->root->req_refs = dict_ref (refs); + fd_ref (file->fd); + STACK_WIND (sync_frame, + wb_sync_cbk, + FIRST_CHILD(sync_frame->this), + 
FIRST_CHILD(sync_frame->this)->fops->writev, + file->fd, vector, + count, first_request->offset); + + dict_unref (refs); + FREE (vector); + first_request = NULL; + refs = NULL; + vector = NULL; + copied = count = 0; + } + } + + return bytes; +} + + +int32_t +wb_stat_cbk (call_frame_t *frame, + void *cookie, + xlator_t *this, + int32_t op_ret, + int32_t op_errno, + struct stat *buf) +{ + wb_local_t *local = NULL; + + local = frame->local; + + if (local->file) + fd_unref (local->file->fd); + + STACK_UNWIND (frame, op_ret, op_errno, buf); + + return 0; +} + + +int32_t +wb_stat (call_frame_t *frame, + xlator_t *this, + loc_t *loc) +{ + wb_file_t *file = NULL; + fd_t *iter_fd = NULL; + wb_local_t *local = NULL; + uint64_t tmp_file = 0; + + if (loc->inode) + { + iter_fd = fd_lookup (loc->inode, frame->root->pid); + if (iter_fd) { + if (!fd_ctx_get (iter_fd, this, &tmp_file)) { + file = (wb_file_t *)(long)tmp_file; + } else { + fd_unref (iter_fd); + } + } + if (file) { + wb_sync_all (frame, file); + } + } + + local = CALLOC (1, sizeof (*local)); + local->file = file; + + frame->local = local; + + STACK_WIND (frame, wb_stat_cbk, + FIRST_CHILD(this), + FIRST_CHILD(this)->fops->stat, + loc); + return 0; +} + + +int32_t +wb_fstat (call_frame_t *frame, + xlator_t *this, + fd_t *fd) +{ + wb_file_t *file = NULL; + wb_local_t *local = NULL; + uint64_t tmp_file = 0; + + if (fd_ctx_get (fd, this, &tmp_file)) { + gf_log (this->name, GF_LOG_ERROR, "returning EBADFD"); + STACK_UNWIND (frame, -1, EBADFD, NULL); + return 0; + } + + file = (wb_file_t *)(long)tmp_file; + if (file) { + fd_ref (file->fd); + wb_sync_all (frame, file); + } + + local = CALLOC (1, sizeof (*local)); + local->file = file; + + frame->local = local; + + STACK_WIND (frame, + wb_stat_cbk, + FIRST_CHILD(this), + FIRST_CHILD(this)->fops->fstat, + fd); + return 0; +} + + +int32_t +wb_truncate_cbk (call_frame_t *frame, + void *cookie, + xlator_t *this, + int32_t op_ret, + int32_t op_errno, + struct stat *buf) +{ + 
wb_local_t *local = NULL; + + local = frame->local; + if (local->file) + fd_unref (local->file->fd); + + STACK_UNWIND (frame, op_ret, op_errno, buf); + return 0; +} + + +int32_t +wb_truncate (call_frame_t *frame, + xlator_t *this, + loc_t *loc, + off_t offset) +{ + wb_file_t *file = NULL; + fd_t *iter_fd = NULL; + wb_local_t *local = NULL; + uint64_t tmp_file = 0; + + if (loc->inode) + { + iter_fd = fd_lookup (loc->inode, frame->root->pid); + if (iter_fd) { + if (!fd_ctx_get (iter_fd, this, &tmp_file)){ + file = (wb_file_t *)(long)tmp_file; + } else { + fd_unref (iter_fd); + } + } + + if (file) + { + wb_sync_all (frame, file); + } + } + + local = CALLOC (1, sizeof (*local)); + local->file = file; + + frame->local = local; + + STACK_WIND (frame, + wb_truncate_cbk, + FIRST_CHILD(this), + FIRST_CHILD(this)->fops->truncate, + loc, + offset); + return 0; +} + + +int32_t +wb_ftruncate (call_frame_t *frame, + xlator_t *this, + fd_t *fd, + off_t offset) +{ + wb_file_t *file = NULL; + wb_local_t *local = NULL; + uint64_t tmp_file = 0; + + if (fd_ctx_get (fd, this, &tmp_file)) { + gf_log (this->name, GF_LOG_ERROR, "returning EBADFD"); + STACK_UNWIND (frame, -1, EBADFD, NULL); + return 0; + } + + file = (wb_file_t *)(long)tmp_file; + if (file) + wb_sync_all (frame, file); + + local = CALLOC (1, sizeof (*local)); + local->file = file; + + if (file) + fd_ref (file->fd); + + frame->local = local; + + STACK_WIND (frame, + wb_truncate_cbk, + FIRST_CHILD(this), + FIRST_CHILD(this)->fops->ftruncate, + fd, + offset); + return 0; +} + + +int32_t +wb_utimens_cbk (call_frame_t *frame, + void *cookie, + xlator_t *this, + int32_t op_ret, + int32_t op_errno, + struct stat *buf) +{ + wb_local_t *local = NULL; + + local = frame->local; + if (local->file) + fd_unref (local->file->fd); + + STACK_UNWIND (frame, op_ret, op_errno, buf); + return 0; +} + + +int32_t +wb_utimens (call_frame_t *frame, + xlator_t *this, + loc_t *loc, + struct timespec tv[2]) +{ + wb_file_t *file = NULL; + fd_t 
*iter_fd = NULL;
+        wb_local_t *local = NULL;
+        uint64_t tmp_file = 0;
+
+        if (loc->inode) {
+                iter_fd = fd_lookup (loc->inode, frame->root->pid);
+                if (iter_fd) {
+                        if (!fd_ctx_get (iter_fd, this, &tmp_file)) {
+                                file = (wb_file_t *)(long)tmp_file;
+                        } else {
+                                fd_unref (iter_fd);
+                        }
+                }
+
+                if (file)
+                        wb_sync_all (frame, file);
+        }
+
+        local = CALLOC (1, sizeof (*local));
+        local->file = file;
+
+        frame->local = local;
+
+        STACK_WIND (frame,
+                    wb_utimens_cbk,
+                    FIRST_CHILD(this),
+                    FIRST_CHILD(this)->fops->utimens,
+                    loc,
+                    tv);
+        return 0;
+}
+
+/*
+ * open() reply: on success attach a wb_file to the fd and decide whether
+ * write-behind has to be disabled for it.  frame->local, when set, holds
+ * the open flags saved by wb_open().
+ */
+int32_t
+wb_open_cbk (call_frame_t *frame,
+             void *cookie,
+             xlator_t *this,
+             int32_t op_ret,
+             int32_t op_errno,
+             fd_t *fd)
+{
+        int32_t flags = 0;
+        wb_file_t *file = NULL;
+        wb_conf_t *conf = this->private;
+
+        if (op_ret != -1)
+        {
+                file = wb_file_create (this, fd);
+
+                /* If mandatory locking has been enabled on this file,
+                   we disable caching on it */
+
+                if ((fd->inode->st_mode & S_ISGID) && !(fd->inode->st_mode & S_IXGRP))
+                        file->disabled = 1;
+
+                /* Disable caching for O_DIRECT, for read-only opens and,
+                   if configured, for O_SYNC */
+                if (frame->local)
+                {
+                        flags = *((int32_t *)frame->local);
+                        /* O_RDONLY is 0, so the access mode has to be
+                           extracted with O_ACCMODE before comparing;
+                           (flags & O_RDONLY) == O_RDONLY is always true */
+                        if (((flags & O_DIRECT) == O_DIRECT) ||
+                            ((flags & O_ACCMODE) == O_RDONLY) ||
+                            (((flags & O_SYNC) == O_SYNC) &&
+                             conf->enable_O_SYNC == _gf_true)) {
+                                file->disabled = 1;
+                        }
+                }
+
+                LOCK_INIT (&file->lock);
+        }
+
+        STACK_UNWIND (frame, op_ret, op_errno, fd);
+        return 0;
+}
+
+
+/*
+ * open() fop: remember the open flags in frame->local for wb_open_cbk()
+ * and pass the call down.  Fails the open with ENOMEM when the flags
+ * cannot be saved.
+ */
+int32_t
+wb_open (call_frame_t *frame,
+         xlator_t *this,
+         loc_t *loc,
+         int32_t flags,
+         fd_t *fd)
+{
+        frame->local = CALLOC (1, sizeof(int32_t));
+        if (frame->local == NULL) {
+                STACK_UNWIND (frame, -1, ENOMEM, NULL);
+                return 0;
+        }
+        *((int32_t *)frame->local) = flags;
+
+        STACK_WIND (frame,
+                    wb_open_cbk,
+                    FIRST_CHILD(this),
+                    FIRST_CHILD(this)->fops->open,
+                    loc, flags, fd);
+        return 0;
+}
+
+
+int32_t
+wb_create_cbk (call_frame_t *frame,
+               void *cookie,
+               xlator_t *this,
+               int32_t op_ret,
+               int32_t op_errno,
+               fd_t *fd,
+               inode_t *inode,
+               struct stat *buf)
+{
+        wb_file_t *file = NULL;
+
+        if (op_ret != -1)
+        {
+                file = wb_file_create (this, fd);
+                /*
+                 * If
mandatory locking has been enabled on this file, + * we disable caching on it + */ + if ((fd->inode->st_mode & S_ISGID) && + !(fd->inode->st_mode & S_IXGRP)) + { + file->disabled = 1; + } + + LOCK_INIT (&file->lock); + } + + STACK_UNWIND (frame, op_ret, op_errno, fd, inode, buf); + return 0; +} + + +int32_t +wb_create (call_frame_t *frame, + xlator_t *this, + loc_t *loc, + int32_t flags, + mode_t mode, + fd_t *fd) +{ + STACK_WIND (frame, + wb_create_cbk, + FIRST_CHILD(this), + FIRST_CHILD(this)->fops->create, + loc, flags, mode, fd); + return 0; +} + + +int32_t +__wb_cleanup_queue (wb_file_t *file) +{ + wb_write_request_t *request = NULL, *dummy = NULL; + int32_t bytes = 0; + + list_for_each_entry_safe (request, dummy, &file->request, list) + { + if (request->got_reply && request->write_behind) + { + bytes += iov_length (request->vector, request->count); + list_del_init (&request->list); + + FREE (request->vector); + dict_unref (request->refs); + + FREE (request); + } + } + + return bytes; +} + + +int32_t +__wb_mark_wind_all (list_head_t *list, list_head_t *winds) +{ + wb_write_request_t *request = NULL; + size_t size = 0; + + list_for_each_entry (request, list, list) + { + if (!request->stack_wound) + { + size += iov_length (request->vector, request->count); + request->stack_wound = 1; + list_add_tail (&request->winds, winds); + } + } + + return size; +} + + +size_t +__wb_get_aggregate_size (list_head_t *list) +{ + wb_write_request_t *request = NULL; + size_t size = 0; + + list_for_each_entry (request, list, list) + { + if (!request->stack_wound) + { + size += iov_length (request->vector, request->count); + } + } + + return size; +} + +uint32_t +__wb_get_incomplete_writes (list_head_t *list) +{ + wb_write_request_t *request = NULL; + uint32_t count = 0; + + list_for_each_entry (request, list, list) + { + if (request->stack_wound && !request->got_reply) + { + count++; + } + } + + return count; +} + +int32_t +__wb_mark_winds (list_head_t *list, list_head_t *winds, 
size_t aggregate_conf) +{ + size_t aggregate_current = 0; + uint32_t incomplete_writes = 0; + + incomplete_writes = __wb_get_incomplete_writes (list); + + aggregate_current = __wb_get_aggregate_size (list); + + if ((incomplete_writes == 0) || (aggregate_current >= aggregate_conf)) + { + __wb_mark_wind_all (list, winds); + } + + return aggregate_current; +} + + +size_t +__wb_get_window_size (list_head_t *list) +{ + wb_write_request_t *request = NULL; + size_t size = 0; + + list_for_each_entry (request, list, list) + { + if (request->write_behind && !request->got_reply) + { + size += iov_length (request->vector, request->count); + } + } + + return size; +} + + +size_t +__wb_mark_unwind_till (list_head_t *list, list_head_t *unwinds, size_t size) +{ + size_t written_behind = 0; + wb_write_request_t *request = NULL; + + list_for_each_entry (request, list, list) + { + if (written_behind <= size) + { + if (!request->write_behind) + { + wb_local_t *local = request->frame->local; + written_behind += iov_length (request->vector, request->count); + request->write_behind = 1; + list_add_tail (&local->unwind_frames, unwinds); + } + } + else + { + break; + } + } + + return written_behind; +} + + +int32_t +__wb_mark_unwinds (list_head_t *list, list_head_t *unwinds, size_t window_conf) +{ + size_t window_current = 0; + + window_current = __wb_get_window_size (list); + if (window_current <= window_conf) + { + window_current += __wb_mark_unwind_till (list, unwinds, + window_conf - window_current); + } + + return window_current; +} + + +int32_t +wb_stack_unwind (list_head_t *unwinds) +{ + struct stat buf = {0,}; + wb_local_t *local = NULL, *dummy = NULL; + + list_for_each_entry_safe (local, dummy, unwinds, unwind_frames) + { + list_del_init (&local->unwind_frames); + STACK_UNWIND (local->frame, local->op_ret, local->op_errno, &buf); + } + + return 0; +} + + +int32_t +wb_do_ops (call_frame_t *frame, wb_file_t *file, list_head_t *winds, list_head_t *unwinds) +{ + /* copy the frame 
before calling wb_stack_unwind, since this request containing current frame might get unwound */
+        /* call_frame_t *sync_frame = copy_frame (frame); */
+
+        wb_stack_unwind (unwinds);
+        wb_sync (frame, file, winds);
+
+        return 0;
+}
+
+
+/*
+ * Run one pass over the request queue of file: drop finished
+ * write-behind requests, pick the requests to wind and the frames to
+ * unwind while holding file->lock, then perform both outside the lock.
+ *
+ * A non-zero flush_all winds everything queued regardless of the
+ * configured aggregate size.  Returns -1 when file is NULL, else 0.
+ */
+int32_t
+wb_process_queue (call_frame_t *frame, wb_file_t *file, char flush_all)
+{
+        list_head_t winds, unwinds;
+        size_t size = 0;
+        wb_conf_t *conf = NULL;
+
+        /* validate file before touching file->this */
+        if (!file)
+        {
+                return -1;
+        }
+
+        conf = file->this->private;
+
+        INIT_LIST_HEAD (&winds);
+        INIT_LIST_HEAD (&unwinds);
+
+        size = flush_all ? 0 : conf->aggregate_size;
+        LOCK (&file->lock);
+        {
+                __wb_cleanup_queue (file);
+                __wb_mark_winds (&file->request, &winds, size);
+                __wb_mark_unwinds (&file->request, &unwinds, conf->window_size);
+        }
+        UNLOCK (&file->lock);
+
+        wb_do_ops (frame, file, &winds, &unwinds);
+        return 0;
+}
+
+
+/*
+ * Queue a write on file.
+ *
+ * A request holding a copy of vector is appended to file->request, and
+ * a wb_local_t carrying the reply to give (op_ret = byte count of the
+ * vector, op_errno = 0) is installed as frame->local.  file->offset is
+ * moved to the end of this write.  Returns the new request.
+ */
+wb_write_request_t *
+wb_enqueue (wb_file_t *file,
+            call_frame_t *frame,
+            struct iovec *vector,
+            int32_t count,
+            off_t offset)
+{
+        wb_write_request_t *request = NULL;
+        wb_local_t *local = CALLOC (1, sizeof (*local));
+
+        request = CALLOC (1, sizeof (*request));
+
+        INIT_LIST_HEAD (&request->list);
+        INIT_LIST_HEAD (&request->winds);
+
+        request->frame = frame;
+        request->vector = iov_dup (vector, count);
+        request->count = count;
+        request->offset = offset;
+        request->refs = dict_ref (frame->root->req_refs);
+
+        frame->local = local;
+        local->frame = frame;
+        local->op_ret = iov_length (vector, count);
+        local->op_errno = 0;
+        INIT_LIST_HEAD (&local->unwind_frames);
+
+        LOCK (&file->lock);
+        {
+                list_add_tail (&request->list, &file->request);
+                file->offset = offset + iov_length (vector, count);
+        }
+        UNLOCK (&file->lock);
+
+        return request;
+}
+
+
+/* Reply handler for writes wound directly (write-behind disabled): pass the result up unchanged. */
+int32_t
+wb_writev_cbk (call_frame_t *frame,
+               void *cookie,
+               xlator_t *this,
+               int32_t op_ret,
+               int32_t op_errno,
+               struct stat *stbuf)
+{
+        STACK_UNWIND (frame, op_ret, op_errno, stbuf);
+        return 0;
+}
+
+
+int32_t
+wb_writev (call_frame_t *frame,
+           xlator_t *this,
+           fd_t *fd,
+           struct iovec *vector,
+
int32_t count, + off_t offset) +{ + wb_file_t *file = NULL; + char offset_expected = 1, wb_disabled = 0; + call_frame_t *process_frame = NULL; + size_t size = 0; + uint64_t tmp_file = 0; + + if (vector != NULL) + size = iov_length (vector, count); + + if (fd_ctx_get (fd, this, &tmp_file)) { + gf_log (this->name, GF_LOG_ERROR, "returning EBADFD"); + STACK_UNWIND (frame, -1, EBADFD, NULL); + return 0; + } + + file = (wb_file_t *)(long)tmp_file; + if (!file) { + gf_log (this->name, GF_LOG_ERROR, + "wb_file not found for fd %p", fd); + STACK_UNWIND (frame, -1, EBADFD, NULL); + return 0; + } + + LOCK (&file->lock); + { + if (file->disabled || file->disable_till) { + if (size > file->disable_till) { + file->disable_till = 0; + } else { + file->disable_till -= size; + } + wb_disabled = 1; + } + + if (file->offset != offset) + offset_expected = 0; + } + UNLOCK (&file->lock); + + if (wb_disabled) { + STACK_WIND (frame, + wb_writev_cbk, + FIRST_CHILD (frame->this), + FIRST_CHILD (frame->this)->fops->writev, + file->fd, + vector, + count, + offset); + return 0; + } + + process_frame = copy_frame (frame); + + if (!offset_expected) + wb_process_queue (process_frame, file, 1); + + wb_enqueue (file, frame, vector, count, offset); + wb_process_queue (process_frame, file, 0); + + STACK_DESTROY (process_frame->root); + + return 0; +} + + +int32_t +wb_readv_cbk (call_frame_t *frame, + void *cookie, + xlator_t *this, + int32_t op_ret, + int32_t op_errno, + struct iovec *vector, + int32_t count, + struct stat *stbuf) +{ + wb_local_t *local = NULL; + + local = frame->local; + + STACK_UNWIND (frame, op_ret, op_errno, vector, count, stbuf); + return 0; +} + + +int32_t +wb_readv (call_frame_t *frame, + xlator_t *this, + fd_t *fd, + size_t size, + off_t offset) +{ + wb_file_t *file = NULL; + wb_local_t *local = NULL; + uint64_t tmp_file = 0; + + if (fd_ctx_get (fd, this, &tmp_file)) { + gf_log (this->name, GF_LOG_ERROR, "returning EBADFD"); + STACK_UNWIND (frame, -1, EBADFD, NULL); + return 
0;
+        }
+
+        file = (wb_file_t *)(long)tmp_file;
+        if (file)
+                wb_sync_all (frame, file);
+
+        local = CALLOC (1, sizeof (*local));
+        local->file = file;
+
+        frame->local = local;
+
+        STACK_WIND (frame,
+                    wb_readv_cbk,
+                    FIRST_CHILD(this),
+                    FIRST_CHILD(this)->fops->readv,
+                    fd, size, offset);
+
+        return 0;
+}
+
+
+/*
+ * Reply handler for the background flush issued by wb_flush() when
+ * flush-behind is enabled.  The caller has already been answered, so
+ * the result is not propagated; a latched write error on the file is
+ * cleared and the frame is destroyed.
+ */
+int32_t
+wb_ffr_bg_cbk (call_frame_t *frame,
+               void *cookie,
+               xlator_t *this,
+               int32_t op_ret,
+               int32_t op_errno)
+{
+        wb_local_t *local = NULL;
+        wb_file_t *file = NULL;
+
+        local = frame->local;
+        file = local->file;
+
+        if (file) {
+                if (file->op_ret == -1)
+                {
+                        op_ret = file->op_ret;
+                        op_errno = file->op_errno;
+
+                        file->op_ret = 0;
+                }
+
+                /* Drop the reference taken in wb_flush() last: it may be
+                   the final one, and file must not be touched after
+                   the fd is released. */
+                fd_unref (file->fd);
+        }
+
+        STACK_DESTROY (frame->root);
+        return 0;
+}
+
+
+/*
+ * Reply handler for a foreground flush.  A write error latched on the
+ * file overrides the flush result and is then cleared.
+ */
+int32_t
+wb_ffr_cbk (call_frame_t *frame,
+            void *cookie,
+            xlator_t *this,
+            int32_t op_ret,
+            int32_t op_errno)
+{
+        wb_local_t *local = NULL;
+        wb_file_t *file = NULL;
+
+        local = frame->local;
+        file = local->file;
+
+        if (file) {
+                if (file->op_ret == -1)
+                {
+                        op_ret = file->op_ret;
+                        op_errno = file->op_errno;
+
+                        file->op_ret = 0;
+                }
+
+                /* corresponds to the fd_ref() done in wb_flush(); done
+                   after the last access to file for the reason given in
+                   wb_ffr_bg_cbk() */
+                fd_unref (file->fd);
+        }
+
+        STACK_UNWIND (frame, op_ret, op_errno);
+        return 0;
+}
+
+
+int32_t
+wb_flush (call_frame_t *frame,
+          xlator_t *this,
+          fd_t *fd)
+{
+        wb_conf_t *conf = NULL;
+        wb_file_t *file = NULL;
+        call_frame_t *flush_frame = NULL;
+        wb_local_t *local = NULL;
+        uint64_t tmp_file = 0;
+
+        conf = this->private;
+
+        if (fd_ctx_get (fd, this, &tmp_file)) {
+                gf_log (this->name, GF_LOG_ERROR, "returning EBADFD");
+                STACK_UNWIND (frame, -1, EBADFD);
+                return 0;
+        }
+
+        file = (wb_file_t *)(long)tmp_file;
+
+        local = CALLOC (1, sizeof (*local));
+        local->file = file;
+        if (file)
+                fd_ref (file->fd);
+
+        if (&file->request != file->request.next) {
+                gf_log (this->name, GF_LOG_DEBUG,
+                        "request queue is not empty, it has to be synced");
+        }
+
+        if (conf->flush_behind &&
+            (!file->disabled) && (file->disable_till == 0)) {
+
flush_frame = copy_frame (frame);
+                STACK_UNWIND (frame, file->op_ret,
+                              file->op_errno); // liar! liar! :O
+
+                flush_frame->local = local;
+                wb_sync_all (flush_frame, file);
+
+                STACK_WIND (flush_frame,
+                            wb_ffr_bg_cbk,
+                            FIRST_CHILD(this),
+                            FIRST_CHILD(this)->fops->flush,
+                            fd);
+        } else {
+                wb_sync_all (frame, file);
+
+                frame->local = local;
+                STACK_WIND (frame,
+                            wb_ffr_cbk,
+                            FIRST_CHILD(this),
+                            FIRST_CHILD(this)->fops->flush,
+                            fd);
+        }
+
+        return 0;
+}
+
+
+/*
+ * fsync() reply: a write error latched on the file overrides the fsync
+ * result and is then cleared.  local->file may be NULL (wb_fsync()
+ * tolerates a NULL file), in which case the result is passed up as is.
+ */
+int32_t
+wb_fsync_cbk (call_frame_t *frame,
+              void *cookie,
+              xlator_t *this,
+              int32_t op_ret,
+              int32_t op_errno)
+{
+        wb_local_t *local = NULL;
+        wb_file_t *file = NULL;
+
+        local = frame->local;
+        file = local->file;
+
+        if (file && (file->op_ret == -1))
+        {
+                op_ret = file->op_ret;
+                op_errno = file->op_errno;
+
+                file->op_ret = 0;
+        }
+
+        STACK_UNWIND (frame, op_ret, op_errno);
+        return 0;
+}
+
+/*
+ * fsync() fop: wind every queued write of the fd, then pass the fsync
+ * down.  Unwinds with EBADFD when the fd carries no write-behind context.
+ */
+int32_t
+wb_fsync (call_frame_t *frame,
+          xlator_t *this,
+          fd_t *fd,
+          int32_t datasync)
+{
+        wb_file_t *file = NULL;
+        wb_local_t *local = NULL;
+        uint64_t tmp_file = 0;
+
+        if (fd_ctx_get (fd, this, &tmp_file)) {
+                gf_log (this->name, GF_LOG_ERROR, "returning EBADFD");
+                STACK_UNWIND (frame, -1, EBADFD);
+                return 0;
+        }
+
+        file = (wb_file_t *)(long)tmp_file;
+        if (file)
+                wb_sync_all (frame, file);
+
+        local = CALLOC (1, sizeof (*local));
+        local->file = file;
+
+        frame->local = local;
+
+        STACK_WIND (frame,
+                    wb_fsync_cbk,
+                    FIRST_CHILD(this),
+                    FIRST_CHILD(this)->fops->fsync,
+                    fd, datasync);
+        return 0;
+}
+
+
+/*
+ * release() callback: drop the wb_file attached to the fd, if there is
+ * one.  Always returns 0.
+ */
+int32_t
+wb_release (xlator_t *this,
+            fd_t *fd)
+{
+        uint64_t file = 0;
+
+        /* an fd without a context (e.g. a failed open) has nothing to
+           destroy; wb_file_destroy() would dereference NULL */
+        if ((fd_ctx_get (fd, this, &file) == 0) && file)
+                wb_file_destroy ((wb_file_t *)(long)file);
+
+        return 0;
+}
+
+
+/*
+ * Translator constructor: validate the graph position, parse the
+ * options into a wb_conf_t and publish it as this->private.
+ *
+ * Returns 0 on success.  On any failure nothing is published, the
+ * configuration is freed and -1 is returned.
+ */
+int32_t
+init (xlator_t *this)
+{
+        dict_t *options = NULL;
+        wb_conf_t *conf = NULL;
+        char *aggregate_size_string = NULL;
+        char *window_size_string = NULL;
+        char *flush_behind_string = NULL;
+        char *disable_till_string = NULL;
+        char *enable_O_SYNC_string = NULL;
+        int32_t ret = -1;
+
+        if ((this->children == NULL)
+            || this->children->next) {
+                gf_log (this->name, GF_LOG_ERROR,
+                        "FATAL: write-behind (%s) not configured with exactly one child",
+                        this->name);
+                return -1;
+        }
+
+        if (this->parents == NULL) {
+                gf_log (this->name, GF_LOG_WARNING,
+                        "dangling volume. check volfile");
+        }
+
+        options = this->options;
+
+        conf = CALLOC (1, sizeof (*conf));
+        if (conf == NULL) {
+                gf_log (this->name, GF_LOG_ERROR,
+                        "out of memory :(");
+                return -1;
+        }
+
+        conf->enable_O_SYNC = _gf_false;
+        ret = dict_get_str (options, "enable-O_SYNC",
+                            &enable_O_SYNC_string);
+        if (ret == 0) {
+                ret = gf_string2boolean (enable_O_SYNC_string,
+                                         &conf->enable_O_SYNC);
+                if (ret == -1) {
+                        gf_log (this->name, GF_LOG_ERROR,
+                                "'enable-O_SYNC' takes only boolean arguments");
+                        goto err;
+                }
+        }
+
+        /* configure 'options aggregate-size <size>' */
+        conf->aggregate_size = 0;
+        ret = dict_get_str (options, "block-size",
+                            &aggregate_size_string);
+        if (ret == 0) {
+                ret = gf_string2bytesize (aggregate_size_string,
+                                          &conf->aggregate_size);
+                if (ret != 0) {
+                        gf_log (this->name, GF_LOG_ERROR,
+                                "invalid number format \"%s\" of \"option aggregate-size\"",
+                                aggregate_size_string);
+                        goto err;
+                }
+        }
+
+        gf_log (this->name, GF_LOG_DEBUG,
+                "using aggregate-size = %"PRIu64"",
+                conf->aggregate_size);
+
+        conf->disable_till = 1;
+        ret = dict_get_str (options, "disable-for-first-nbytes",
+                            &disable_till_string);
+        if (ret == 0) {
+                ret = gf_string2bytesize (disable_till_string,
+                                          &conf->disable_till);
+                if (ret != 0) {
+                        gf_log (this->name, GF_LOG_ERROR,
+                                "invalid number format \"%s\" of \"option disable-for-first-nbytes\"",
+                                disable_till_string);
+                        goto err;
+                }
+        }
+
+        gf_log (this->name, GF_LOG_DEBUG,
+                "disabling write-behind for first %"PRIu64" bytes",
+                conf->disable_till);
+
+        /* configure 'option window-size <size>' */
+        conf->window_size = 0;
+        ret = dict_get_str (options, "cache-size",
+                            &window_size_string);
+        if (ret == 0) {
+                ret = gf_string2bytesize (window_size_string,
+                                          &conf->window_size);
+                if (ret != 0) {
+                        gf_log (this->name, GF_LOG_ERROR,
+                                "invalid number format \"%s\" of \"option window-size\"",
+                                window_size_string);
+                        goto err;
+                }
+        }
+
+        if (!conf->window_size && conf->aggregate_size) {
+                gf_log (this->name, GF_LOG_WARNING,
+                        "setting window-size to be equal to aggregate-size(%"PRIu64")",
+                        conf->aggregate_size);
+                conf->window_size = conf->aggregate_size;
+        }
+
+        if (conf->window_size < conf->aggregate_size) {
+                /* arguments in the order the message names them */
+                gf_log (this->name, GF_LOG_ERROR,
+                        "aggregate-size(%"PRIu64") cannot be more than window-size"
+                        "(%"PRIu64")", conf->aggregate_size, conf->window_size);
+                goto err;
+        }
+
+        /* configure 'option flush-behind <on/off>' */
+        conf->flush_behind = 0;
+        ret = dict_get_str (options, "flush-behind",
+                            &flush_behind_string);
+        if (ret == 0) {
+                ret = gf_string2boolean (flush_behind_string,
+                                         &conf->flush_behind);
+                if (ret == -1) {
+                        gf_log (this->name, GF_LOG_ERROR,
+                                "'flush-behind' takes only boolean arguments");
+                        goto err;
+                }
+
+                if (conf->flush_behind) {
+                        gf_log (this->name, GF_LOG_DEBUG,
+                                "enabling flush-behind");
+                }
+        }
+        this->private = conf;
+        return 0;
+
+err:
+        /* single exit for every failure after conf was allocated */
+        FREE (conf);
+        return -1;
+}
+
+
+/* Translator destructor: release the configuration allocated by init(). */
+void
+fini (xlator_t *this)
+{
+        wb_conf_t *conf = this->private;
+
+        FREE (conf);
+        return;
+}
+
+
+struct xlator_fops fops = {
+        .writev      = wb_writev,
+        .open        = wb_open,
+        .create      = wb_create,
+        .readv       = wb_readv,
+        .flush       = wb_flush,
+        .fsync       = wb_fsync,
+        .stat        = wb_stat,
+        .fstat       = wb_fstat,
+        .truncate    = wb_truncate,
+        .ftruncate   = wb_ftruncate,
+        .utimens     = wb_utimens,
+};
+
+struct xlator_mops mops = {
+};
+
+struct xlator_cbks cbks = {
+        .release  = wb_release
+};
+
+struct volume_options options[] = {
+        { .key  = {"flush-behind"},
+          .type = GF_OPTION_TYPE_BOOL
+        },
+        { .key  = {"block-size", "aggregate-size"},
+          .type = GF_OPTION_TYPE_SIZET,
+          .min  = 128 * GF_UNIT_KB,
+          .max  = 4 * GF_UNIT_MB
+        },
+        { .key  = {"cache-size", "window-size"},
+          .type = GF_OPTION_TYPE_SIZET,
+          .min  = 512 * GF_UNIT_KB,
+          .max  = 1 * GF_UNIT_GB
+        },
+        { .key  = {"disable-for-first-nbytes"},
+          .type = GF_OPTION_TYPE_SIZET,
+          .min  = 1,
+          .max  =
1 * GF_UNIT_MB, + }, + { .key = {"enable-O_SYNC"}, + .type = GF_OPTION_TYPE_BOOL, + }, + { .key = {NULL} }, +}; diff --git a/xlators/protocol/Makefile.am b/xlators/protocol/Makefile.am new file mode 100644 index 000000000..745e277c2 --- /dev/null +++ b/xlators/protocol/Makefile.am @@ -0,0 +1,3 @@ +SUBDIRS = client server + +CLEANFILES = diff --git a/xlators/protocol/client/Makefile.am b/xlators/protocol/client/Makefile.am new file mode 100644 index 000000000..d471a3f92 --- /dev/null +++ b/xlators/protocol/client/Makefile.am @@ -0,0 +1,3 @@ +SUBDIRS = src + +CLEANFILES = diff --git a/xlators/protocol/client/src/Makefile.am b/xlators/protocol/client/src/Makefile.am new file mode 100644 index 000000000..fb720942c --- /dev/null +++ b/xlators/protocol/client/src/Makefile.am @@ -0,0 +1,16 @@ + +xlator_LTLIBRARIES = client.la +xlatordir = $(libdir)/glusterfs/$(PACKAGE_VERSION)/xlator/protocol + +client_la_LDFLAGS = -module -avoidversion + +client_la_SOURCES = client-protocol.c saved-frames.c +client_la_LIBADD = $(top_builddir)/libglusterfs/src/libglusterfs.la + +noinst_HEADERS = client-protocol.h saved-frames.h + +AM_CFLAGS = -fPIC -D_FILE_OFFSET_BITS=64 -D_GNU_SOURCE -Wall -D$(GF_HOST_OS) \ + -I$(top_srcdir)/libglusterfs/src -shared -nostartfiles $(GF_CFLAGS) + +CLEANFILES = + diff --git a/xlators/protocol/client/src/client-protocol.c b/xlators/protocol/client/src/client-protocol.c new file mode 100644 index 000000000..5c93bd6f1 --- /dev/null +++ b/xlators/protocol/client/src/client-protocol.c @@ -0,0 +1,6671 @@ +/* + Copyright (c) 2006, 2007, 2008, 2009 Z RESEARCH, Inc. <http://www.zresearch.com> + This file is part of GlusterFS. + + GlusterFS is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published + by the Free Software Foundation; either version 3 of the License, + or (at your option) any later version. 
+ + GlusterFS is distributed in the hope that it will be useful, but + WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program. If not, see + <http://www.gnu.org/licenses/>. +*/ + +#ifndef _CONFIG_H +#define _CONFIG_H +#include "config.h" +#endif +#include <inttypes.h> + + +#include "glusterfs.h" +#include "client-protocol.h" +#include "compat.h" +#include "dict.h" +#include "protocol.h" +#include "transport.h" +#include "xlator.h" +#include "logging.h" +#include "timer.h" +#include "defaults.h" +#include "compat.h" +#include "compat-errno.h" + +#include <sys/resource.h> +#include <inttypes.h> + +/* for default_*_cbk functions */ +#include "defaults.c" +#include "saved-frames.h" + + +int protocol_client_cleanup (transport_t *trans); +int protocol_client_interpret (xlator_t *this, transport_t *trans, + char *hdr_p, size_t hdrlen, + char *buf_p, size_t buflen); +int +protocol_client_xfer (call_frame_t *frame, xlator_t *this, transport_t *trans, + int type, int op, + gf_hdr_common_t *hdr, size_t hdrlen, + struct iovec *vector, int count, + dict_t *refs); + +static gf_op_t gf_fops[]; +static gf_op_t gf_mops[]; +static gf_op_t gf_cbks[]; + + +static ino_t +this_ino_get_from_inode (inode_t *inode, xlator_t *this) +{ + ino_t ino = 0; + int32_t ret = 0; + + GF_VALIDATE_OR_GOTO ("client", this, out); + GF_VALIDATE_OR_GOTO (this->name, inode, out); + + if (inode->ino == 1) { + ino = 1; + goto out; + } + + ret = inode_ctx_get (inode, this, &ino); + + if (inode->ino && ret < 0) { + gf_log (this->name, GF_LOG_ERROR, + "(%"PRId64"): failed to get remote inode number", + inode->ino); + } + +out: + return ino; +} + + +static ino_t +this_ino_get (loc_t *loc, xlator_t *this, int32_t which) +{ + ino_t ino = 0; + int32_t ret = 0; + inode_t *inode = NULL; + + GF_VALIDATE_OR_GOTO 
("client", this, out); + + if (which == GF_CLIENT_INODE_SELF) { + inode = loc->inode; + } else if (which == GF_CLIENT_INODE_PARENT) { + inode = loc->parent; + } + GF_VALIDATE_OR_GOTO (this->name, inode, out); + + if (inode->ino == 1) { + ino = 1; + goto out; + } + + ret = inode_ctx_get (inode, this, &ino); + + if (inode->ino && ret < 0) { + gf_log (this->name, GF_LOG_ERROR, + "%s(%s - %"PRId64") failed to get remote inode number", + loc->path, + (which == GF_CLIENT_INODE_SELF? "self" : "parent"), + inode->ino); + } + +out: + return ino; +} + + +static void +this_ino_set (loc_t *loc, xlator_t *this, ino_t ino) +{ + ino_t old_ino = 0; + int32_t ret = -1; + inode_t *inode = NULL; + + GF_VALIDATE_OR_GOTO ("client", this, out); + + inode = loc->inode; + GF_VALIDATE_OR_GOTO (this->name, inode, out); + + ret = inode_ctx_get (inode, this, &old_ino); + + if (old_ino != ino) { + if (old_ino) + gf_log (this->name, GF_LOG_DEBUG, + "%s: inode number changed from %"PRId64" " + "to %"PRId64, + loc->path, old_ino, ino); + + ret = inode_ctx_put (inode, this, ino); + if (ret < 0) { + gf_log (this->name, GF_LOG_ERROR, + "%s (%"PRId64"): failed to set remote " + "inode number to inode ctx", + loc->path, ino); + } + } +out: + return; +} + + +static int +this_fd_get (fd_t *file, xlator_t *this, int64_t *remote_fd) +{ + int ret = 0; + int dict_ret = -1; + uint64_t tmp_fd = 0; + + GF_VALIDATE_OR_GOTO ("client", this, out); + GF_VALIDATE_OR_GOTO (this->name, file, out); + GF_VALIDATE_OR_GOTO (this->name, remote_fd, out); + + dict_ret = fd_ctx_get (file, this, &tmp_fd); + + if (dict_ret < 0) { + ret = -1; + } + *remote_fd = (int64_t)tmp_fd; +out: + return ret; +} + + +static void +this_fd_set (fd_t *file, xlator_t *this, loc_t *loc, int64_t fd) +{ + uint64_t old_fd = 0; + int32_t ret = -1; + + GF_VALIDATE_OR_GOTO ("client", this, out); + GF_VALIDATE_OR_GOTO (this->name, file, out); + + ret = fd_ctx_get (file, this, &old_fd); + if (ret >= 0) { + gf_log (this->name, GF_LOG_WARNING, + "%s 
(%"PRId64"): trying duplicate remote fd set. " + "%"PRId64" over-rides %"PRId64, + loc->path, loc->inode->ino, fd, old_fd); + } + + ret = fd_ctx_set (file, this, (uint64_t)fd); + if (ret < 0) { + gf_log (this->name, GF_LOG_ERROR, + "%s (%"PRId64"): failed to set remote fd", + loc->path, loc->inode->ino); + } +out: + return; +} + + +static int +client_local_wipe (client_local_t *local) +{ + if (local) { + loc_wipe (&local->loc); + + if (local->fd) + fd_unref (local->fd); + + free (local); + } + + return 0; +} + +/* + * lookup_frame - lookup call frame corresponding to a given callid + * @trans: transport object + * @callid: call id of the frame + * + * not for external reference + */ + +static call_frame_t * +lookup_frame (transport_t *trans, int32_t op, int8_t type, int64_t callid) +{ + client_connection_t *conn = NULL; + call_frame_t *frame = NULL; + + conn = trans->xl_private; + + pthread_mutex_lock (&conn->lock); + { + frame = saved_frames_get (conn->saved_frames, + op, type, callid); + } + pthread_mutex_unlock (&conn->lock); + + return frame; +} + + +/* + * call_bail - timer callback (data is the transport). Re-arms itself every + * 10 seconds and disconnects the transport when frames are pending and both + * the send and the receive timeout have elapsed. + */ +static void +call_bail (void *data) +{ + client_connection_t *conn = NULL; + struct timeval current; + int32_t bail_out = 0; + transport_t *trans = NULL; + + GF_VALIDATE_OR_GOTO("client", data, out); + trans = data; + + conn = trans->xl_private; + + /* sample the time once, before taking the connection lock */ + gettimeofday (&current, NULL); + pthread_mutex_lock (&conn->lock); + { + /* Chaining to get call-always functionality from + call-once timer */ + if (conn->timer) { + struct timeval timeout = {0,}; + gf_timer_cbk_t timer_cbk = conn->timer->cbk; + + timeout.tv_sec = 10; + timeout.tv_usec = 0; + + gf_timer_call_cancel (trans->xl->ctx, conn->timer); + conn->timer = gf_timer_call_after (trans->xl->ctx, + timeout, + timer_cbk, + trans); + if (conn->timer == NULL) { + gf_log (trans->xl->name, GF_LOG_DEBUG, + "Cannot create bailout timer"); + } + } + + if (((conn->saved_frames->count > 0) && + (RECEIVE_TIMEOUT(conn, current)) && + (SEND_TIMEOUT(conn, current)))) { + + struct tm 
last_sent_tm, last_received_tm; + char last_sent[32] = {0,}, last_received[32] = {0,}; + + bail_out = 1; + + localtime_r (&conn->last_sent.tv_sec, + &last_sent_tm); + localtime_r (&conn->last_received.tv_sec, + &last_received_tm); + + strftime (last_sent, 32, + "%Y-%m-%d %H:%M:%S", &last_sent_tm); + strftime (last_received, 32, + "%Y-%m-%d %H:%M:%S", &last_received_tm); + + gf_log (trans->xl->name, GF_LOG_ERROR, + "activating bail-out. pending frames = %d. " + "last sent = %s. last received = %s. " + "transport-timeout = %d", + (int32_t) conn->saved_frames->count, + last_sent, last_received, + conn->transport_timeout); + } + } + + if (bail_out) { + conn->ping_started = 0; + } + + pthread_mutex_unlock (&conn->lock); + + if (bail_out) { + gf_log (trans->xl->name, GF_LOG_CRITICAL, + "bailing transport"); + transport_disconnect (trans); + } +out: + return; +} + + +void +save_frame (transport_t *trans, call_frame_t *frame, + int32_t op, int8_t type, uint64_t callid) +{ + client_connection_t *conn = NULL; + struct timeval timeout = {0, }; + + + conn = trans->xl_private; + + saved_frames_put (conn->saved_frames, frame, op, type, callid); + + if (conn->timer == NULL) { + timeout.tv_sec = 10; + timeout.tv_usec = 0; + conn->timer = gf_timer_call_after (trans->xl->ctx, timeout, + call_bail, (void *) trans); + } +} + + +int +client_get_forgets (xlator_t *this, client_forget_t *forget) +{ + call_frame_t *fr = NULL; + gf_hdr_common_t *hdr = NULL; + size_t hdrlen = 0; + gf_cbk_forget_req_t *req = NULL; + int ret = -1; + client_conf_t *conf = NULL; + int count = 0; + int index = 0; + + conf = this->private; + + if (conf->forget.count > 0) { + count = conf->forget.count; + + hdrlen = gf_hdr_len (req, (count * sizeof (int64_t))); + hdr = gf_hdr_new (req, (count * sizeof (int64_t))); + GF_VALIDATE_OR_GOTO (this->name, hdr, out); + + req = gf_param (hdr); + + req->count = hton32 (count); + for (index = 0; index < count; index++) { + req->ino_array[index] = + hton64 
(conf->forget.ino_array[index]); + } + + fr = create_frame (this, this->ctx->pool); + GF_VALIDATE_OR_GOTO (this->name, fr, out); + + conf->forget.frames_in_transit++; + + forget->frame = fr; + forget->hdr = hdr; + forget->hdrlen = hdrlen; + + ret = count; + + conf->forget.count = 0; + } + out: + /* ret stays negative only when nothing was handed over to *forget; + release a header built before create_frame() failed (hdr is NULL + in every other failure case, and free (NULL) is a no-op) */ + if (ret < 0) + free (hdr); + return ret; +} + + +void +client_ping_timer_expired (void *data) +{ + xlator_t *this = NULL; + transport_t *trans = NULL; + client_conf_t *conf = NULL; + client_connection_t *conn = NULL; + + trans = data; + this = trans->xl; + conf = this->private; + conn = trans->xl_private; + + gf_log (this->name, GF_LOG_ERROR, + "ping timer expired! bailing transport"); + + pthread_mutex_lock (&conn->lock); + { + if (conn->ping_timer) + gf_timer_call_cancel (trans->xl->ctx, + conn->ping_timer); + + conn->ping_started = 0; + conn->ping_timer = NULL; + } + pthread_mutex_unlock (&conn->lock); + transport_disconnect (trans); +} + + +void +client_start_ping (void *data) +{ + xlator_t *this = NULL; + transport_t *trans = NULL; + client_conf_t *conf = NULL; + client_connection_t *conn = NULL; + int32_t ret = -1; + gf_hdr_common_t *hdr = NULL; + struct timeval timeout = {0, }; + call_frame_t *dummy_frame = NULL; + size_t hdrlen = -1; + gf_mop_ping_req_t *req = NULL; + + + trans = data; + this = trans->xl; + conf = this->private; + conn = trans->xl_private; + + pthread_mutex_lock (&conn->lock); + { + if ((conn->saved_frames->count == 0) || + !conn->connected) { + /* using goto looked ugly here, + * hence getting out this way */ + if (conn->ping_timer) + gf_timer_call_cancel (trans->xl->ctx, + conn->ping_timer); + conn->ping_timer = NULL; + conn->ping_started = 0; + /* unlock */ + pthread_mutex_unlock (&conn->lock); + return; + } + + if (conn->saved_frames->count < 0) { + gf_log (this->name, GF_LOG_ERROR, + "saved_frames->count is %"PRId64, + conn->saved_frames->count); + conn->saved_frames->count = 0; + } + timeout.tv_sec = conn->ping_timeout; + timeout.tv_usec = 0; + + conn->ping_timer = + 
gf_timer_call_after (trans->xl->ctx, timeout, + client_ping_timer_expired, + (void *) trans); + + if (conn->ping_timer == NULL) { + gf_log (this->name, GF_LOG_ERROR, + "unable to setup timer"); + } else + conn->ping_started = 1; + } + pthread_mutex_unlock (&conn->lock); + + hdrlen = gf_hdr_len (req, 0); + hdr = gf_hdr_new (req, 0); + + dummy_frame = create_frame (this, this->ctx->pool); + dummy_frame->local = trans; + + ret = protocol_client_xfer (dummy_frame, this, trans, + GF_OP_TYPE_MOP_REQUEST, GF_MOP_PING, + hdr, hdrlen, NULL, 0, NULL); +} + + +int +client_ping_cbk (call_frame_t *frame, gf_hdr_common_t *hdr, size_t hdrlen, + char *buf, size_t buflen) +{ + xlator_t *this = NULL; + transport_t *trans = NULL; + client_conf_t *conf = NULL; + client_connection_t *conn = NULL; + struct timeval timeout = {0, }; + int op_ret = 0; + + trans = frame->local; frame->local = NULL; + this = trans->xl; + conf = this->private; + conn = trans->xl_private; + + op_ret = ntoh32 (hdr->rsp.op_ret); + + if (op_ret == -1) { + /* timer expired and transport bailed out */ + gf_log (this->name, GF_LOG_ERROR, "timer must have expired"); + goto out; + } + + pthread_mutex_lock (&conn->lock); + { + timeout.tv_sec = conn->ping_timeout; + timeout.tv_usec = 0; + + gf_timer_call_cancel (trans->xl->ctx, + conn->ping_timer); + + conn->ping_timer = + gf_timer_call_after (trans->xl->ctx, timeout, + client_start_ping, (void *)trans); + if (conn->ping_timer == NULL) + gf_log (this->name, GF_LOG_ERROR, + "gf_timer_call_after() returned NULL"); + } + pthread_mutex_unlock (&conn->lock); +out: + STACK_DESTROY (frame->root); + return 0; +} + + +int +protocol_client_xfer (call_frame_t *frame, xlator_t *this, transport_t *trans, + int type, int op, + gf_hdr_common_t *hdr, size_t hdrlen, + struct iovec *vector, int count, + dict_t *refs) +{ + client_conf_t *conf = NULL; + client_connection_t *conn = NULL; + uint64_t callid = 0; + int32_t ret = -1; + int start_ping = 0; + gf_hdr_common_t rsphdr = {0, }; + 
client_forget_t forget = {0, }; + uint8_t send_forget = 0; + + + conf = this->private; + + if (!trans) { + /* default to bulk op since it is 'safer' */ + trans = conf->transport[CHANNEL_BULK]; + } + conn = trans->xl_private; + + if (!((type == GF_OP_TYPE_CBK_REQUEST) && + (op == GF_CBK_FORGET))) + { + LOCK (&conf->forget.lock); + { + ret = client_get_forgets (this, &forget); + if (ret <= 0) + send_forget = 0; + else + send_forget = 1; + } + UNLOCK (&conf->forget.lock); + + if (send_forget) { + ret = protocol_client_xfer (forget.frame, this, NULL, + GF_OP_TYPE_CBK_REQUEST, + GF_CBK_FORGET, + forget.hdr, forget.hdrlen, + NULL, 0, NULL); + } + } + + pthread_mutex_lock (&conn->lock); + { + callid = ++conn->callid; + + hdr->callid = hton64 (callid); + hdr->op = hton32 (op); + hdr->type = hton32 (type); + + if (frame) { + hdr->req.uid = hton32 (frame->root->uid); + hdr->req.gid = hton32 (frame->root->gid); + hdr->req.pid = hton32 (frame->root->pid); + } + + if (conn->connected == 0) + transport_connect (trans); + + ret = -1; + + if (conn->connected || + ((type == GF_OP_TYPE_MOP_REQUEST) && + (op == GF_MOP_SETVOLUME))) { + ret = transport_submit (trans, (char *)hdr, hdrlen, + vector, count, refs); + } + + if ((ret >= 0) && frame) { + /* TODO: check this logic */ + gettimeofday (&conn->last_sent, NULL); + save_frame (trans, frame, op, type, callid); + } + + if (!conn->ping_started && (ret >= 0)) { + start_ping = 1; + } + } + pthread_mutex_unlock (&conn->lock); + + if (start_ping) + client_start_ping ((void *) trans); + + if (frame && (ret < 0)) { + rsphdr.op = op; + rsphdr.rsp.op_ret = hton32 (-1); + rsphdr.rsp.op_errno = hton32 (ENOTCONN); + + if (type == GF_OP_TYPE_FOP_REQUEST) { + rsphdr.type = GF_OP_TYPE_FOP_REPLY; + gf_fops[op] (frame, &rsphdr, sizeof (rsphdr), NULL, 0); + } else if (type == GF_OP_TYPE_MOP_REQUEST) { + rsphdr.type = GF_OP_TYPE_MOP_REPLY; + gf_mops[op] (frame, &rsphdr, sizeof (rsphdr), NULL, 0); + } else { + rsphdr.type = GF_OP_TYPE_CBK_REPLY; + 
gf_cbks[op] (frame, &rsphdr, sizeof (rsphdr), NULL, 0); + } + } + + return ret; +} + + + +/** + * client_create - create function for client protocol + * @frame: call frame + * @this: this translator structure + * @path: complete path to file + * @flags: create flags + * @mode: create mode + * + * external reference through client_protocol_xlator->fops->create + */ + +int +client_create (call_frame_t *frame, xlator_t *this, + loc_t *loc, int32_t flags, + mode_t mode, fd_t *fd) +{ + gf_hdr_common_t *hdr = NULL; + gf_fop_create_req_t *req = NULL; + size_t hdrlen = 0; + size_t pathlen = 0; + size_t baselen = 0; + int32_t ret = -1; + ino_t par = 0; + client_conf_t *conf = NULL; + client_local_t *local = NULL; + + + conf = this->private; + + if (conf->child) { + STACK_WIND (frame, default_create_cbk, + conf->child, + conf->child->fops->create, + loc, flags, mode, fd); + return 0; + } + + local = calloc (1, sizeof (*local)); + GF_VALIDATE_OR_GOTO(this->name, local, unwind); + + local->fd = fd_ref (fd); + loc_copy (&local->loc, loc); + + frame->local = local; + + pathlen = STRLEN_0(loc->path); + baselen = STRLEN_0(loc->name); + par = this_ino_get (loc, this, GF_CLIENT_INODE_PARENT); + + hdrlen = gf_hdr_len (req, pathlen + baselen); + hdr = gf_hdr_new (req, pathlen + baselen); + GF_VALIDATE_OR_GOTO(this->name, hdr, unwind); + + req = gf_param (hdr); + + req->flags = hton32 (flags); + req->mode = hton32 (mode); + req->par = hton64 (par); + strcpy (req->path, loc->path); + strcpy (req->bname + pathlen, loc->name); + + ret = protocol_client_xfer (frame, this, + CLIENT_CHANNEL (this, CHANNEL_LOWLAT), + GF_OP_TYPE_FOP_REQUEST, GF_FOP_CREATE, + hdr, hdrlen, NULL, 0, NULL); + return ret; +unwind: + if (hdr) + free (hdr); + STACK_UNWIND(frame, -1, EINVAL, fd, NULL, NULL); + return 0; + +} + +/** + * client_open - open function for client protocol + * @frame: call frame + * @this: this translator structure + * @loc: location of file + * @flags: open flags + * @mode: open modes + * 
+ * external reference through client_protocol_xlator->fops->open + */ +int +client_open (call_frame_t *frame, xlator_t *this, + loc_t *loc, int32_t flags, fd_t *fd) +{ + int ret = -1; + gf_hdr_common_t *hdr = NULL; + size_t hdrlen = 0; + gf_fop_open_req_t *req = NULL; + size_t pathlen = 0; + ino_t ino = 0; + client_conf_t *conf = NULL; + client_local_t *local = NULL; + + conf = this->private; + if (conf->child) { + /* */ + STACK_WIND (frame, default_open_cbk, + conf->child, + conf->child->fops->open, + loc, flags, fd); + + return 0; + } + + local = calloc (1, sizeof (*local)); + GF_VALIDATE_OR_GOTO(this->name, local, unwind); + + local->fd = fd_ref (fd); + loc_copy (&local->loc, loc); + + frame->local = local; + + pathlen = STRLEN_0(loc->path); + ino = this_ino_get (loc, this, GF_CLIENT_INODE_SELF); + + hdrlen = gf_hdr_len (req, pathlen); + hdr = gf_hdr_new (req, pathlen); + GF_VALIDATE_OR_GOTO(this->name, hdr, unwind); + + req = gf_param (hdr); + + req->ino = hton64 (ino); + req->flags = hton32 (flags); + strcpy (req->path, loc->path); + + ret = protocol_client_xfer (frame, this, + CLIENT_CHANNEL (this, CHANNEL_LOWLAT), + GF_OP_TYPE_FOP_REQUEST, GF_FOP_OPEN, + hdr, hdrlen, NULL, 0, NULL); + + return ret; +unwind: + if (hdr) + free (hdr); + STACK_UNWIND(frame, -1, EINVAL, fd); + return 0; + +} + + +/** + * client_stat - stat function for client protocol + * @frame: call frame + * @this: this translator structure + * @loc: location + * + * external reference through client_protocol_xlator->fops->stat + */ +int32_t +client_stat (call_frame_t *frame, + xlator_t *this, + loc_t *loc) +{ + gf_hdr_common_t *hdr = NULL; + gf_fop_stat_req_t *req = NULL; + size_t hdrlen = -1; + int32_t ret = -1; + size_t pathlen = 0; + ino_t ino = 0; + client_conf_t *conf = this->private; + + if (conf->child) { + /* */ + STACK_WIND (frame, + default_stat_cbk, + conf->child, + conf->child->fops->stat, + loc); + + return 0; + } + + pathlen = STRLEN_0(loc->path); + ino = this_ino_get (loc, 
this, GF_CLIENT_INODE_SELF); + + hdrlen = gf_hdr_len (req, pathlen); + hdr = gf_hdr_new (req, pathlen); + GF_VALIDATE_OR_GOTO(this->name, hdr, unwind); + + req = gf_param (hdr); + + req->ino = hton64 (ino); + strcpy (req->path, loc->path); + + ret = protocol_client_xfer (frame, this, + CLIENT_CHANNEL (this, CHANNEL_LOWLAT), + GF_OP_TYPE_FOP_REQUEST, GF_FOP_STAT, + hdr, hdrlen, NULL, 0, NULL); + + return ret; +unwind: + if (hdr) + free (hdr); + STACK_UNWIND(frame, -1, EINVAL, NULL); + return 0; + +} + + +/** + * client_readlink - readlink function for client protocol + * @frame: call frame + * @this: this translator structure + * @loc: location + * @size: + * + * external reference through client_protocol_xlator->fops->readlink + */ +int32_t +client_readlink (call_frame_t *frame, + xlator_t *this, + loc_t *loc, + size_t size) +{ + gf_hdr_common_t *hdr = NULL; + gf_fop_readlink_req_t *req = NULL; + size_t hdrlen = -1; + int ret = -1; + size_t pathlen = 0; + ino_t ino = 0; + client_conf_t *conf = this->private; + + if (conf->child) { + /* */ + STACK_WIND (frame, + default_readlink_cbk, + conf->child, + conf->child->fops->readlink, + loc, + size); + + return 0; + } + + pathlen = STRLEN_0(loc->path); + ino = this_ino_get (loc, this, GF_CLIENT_INODE_SELF); + + hdrlen = gf_hdr_len (req, pathlen); + hdr = gf_hdr_new (req, pathlen); + GF_VALIDATE_OR_GOTO(this->name, hdr, unwind); + + req = gf_param (hdr); + + req->ino = hton64 (ino); + req->size = hton32 (size); + strcpy (req->path, loc->path); + + ret = protocol_client_xfer (frame, this, + CLIENT_CHANNEL (this, CHANNEL_LOWLAT), + GF_OP_TYPE_FOP_REQUEST, GF_FOP_READLINK, + hdr, hdrlen, NULL, 0, NULL); + + return ret; +unwind: + if (hdr) + free (hdr); + STACK_UNWIND(frame, -1, EINVAL, NULL); + return 0; + +} + + +/** + * client_mknod - mknod function for client protocol + * @frame: call frame + * @this: this translator structure + * @path: pathname of node + * @mode: + * @dev: + * + * external reference through 
client_protocol_xlator->fops->mknod + */ +int32_t +client_mknod (call_frame_t *frame, + xlator_t *this, + loc_t *loc, + mode_t mode, + dev_t dev) +{ + gf_hdr_common_t *hdr = NULL; + gf_fop_mknod_req_t *req = NULL; + size_t hdrlen = -1; + int ret = -1; + size_t pathlen = 0; + size_t baselen = 0; + ino_t par = 0; + client_conf_t *conf = this->private; + client_local_t *local = NULL; + + if (conf->child) { + /* */ + STACK_WIND (frame, + default_mknod_cbk, + conf->child, + conf->child->fops->mknod, + loc, mode, dev); + + return 0; + } + + local = calloc (1, sizeof (*local)); + GF_VALIDATE_OR_GOTO(this->name, local, unwind); + + loc_copy (&local->loc, loc); + + frame->local = local; + + pathlen = STRLEN_0(loc->path); + baselen = STRLEN_0(loc->name); + par = this_ino_get (loc, this, GF_CLIENT_INODE_PARENT); + + hdrlen = gf_hdr_len (req, pathlen + baselen); + hdr = gf_hdr_new (req, pathlen + baselen); + GF_VALIDATE_OR_GOTO(this->name, hdr, unwind); + + req = gf_param (hdr); + + req->par = hton64 (par); + req->mode = hton32 (mode); + req->dev = hton64 (dev); + strcpy (req->path, loc->path); + strcpy (req->bname + pathlen, loc->name); + + ret = protocol_client_xfer (frame, this, + CLIENT_CHANNEL (this, CHANNEL_LOWLAT), + GF_OP_TYPE_FOP_REQUEST, GF_FOP_MKNOD, + hdr, hdrlen, NULL, 0, NULL); + + return ret; +unwind: + if (hdr) + free (hdr); + STACK_UNWIND(frame, -1, EINVAL, loc->inode, NULL); + return 0; + +} + + +/** + * client_mkdir - mkdir function for client protocol + * @frame: call frame + * @this: this translator structure + * @path: pathname of directory + * @mode: + * + * external reference through client_protocol_xlator->fops->mkdir + */ +int32_t +client_mkdir (call_frame_t *frame, + xlator_t *this, + loc_t *loc, + mode_t mode) +{ + gf_hdr_common_t *hdr = NULL; + gf_fop_mkdir_req_t *req = NULL; + size_t hdrlen = -1; + int ret = -1; + size_t pathlen = 0; + size_t baselen = 0; + ino_t par = 0; + client_conf_t *conf = this->private; + client_local_t *local = NULL; + + 
if (conf->child) { + /* */ + STACK_WIND (frame, + default_mkdir_cbk, + conf->child, + conf->child->fops->mkdir, + loc, mode); + + return 0; + } + + local = calloc (1, sizeof (*local)); + GF_VALIDATE_OR_GOTO(this->name, local, unwind); + + loc_copy (&local->loc, loc); + + frame->local = local; + + pathlen = STRLEN_0(loc->path); + baselen = STRLEN_0(loc->name); + par = this_ino_get (loc, this, GF_CLIENT_INODE_PARENT); + + hdrlen = gf_hdr_len (req, pathlen + baselen); + hdr = gf_hdr_new (req, pathlen + baselen); + GF_VALIDATE_OR_GOTO(this->name, hdr, unwind); + + req = gf_param (hdr); + + req->par = hton64 (par); + req->mode = hton32 (mode); + strcpy (req->path, loc->path); + strcpy (req->bname + pathlen, loc->name); + + ret = protocol_client_xfer (frame, this, + CLIENT_CHANNEL (this, CHANNEL_LOWLAT), + GF_OP_TYPE_FOP_REQUEST, GF_FOP_MKDIR, + hdr, hdrlen, NULL, 0, NULL); + + return ret; +unwind: + if (hdr) + free (hdr); + STACK_UNWIND(frame, -1, EINVAL, loc->inode, NULL); + return 0; + +} + + + +/** + * client_unlink - unlink function for client protocol + * @frame: call frame + * @this: this translator structure + * @loc: location of file + * + * external reference through client_protocol_xlator->fops->unlink + */ +int32_t +client_unlink (call_frame_t *frame, + xlator_t *this, + loc_t *loc) +{ + gf_hdr_common_t *hdr = NULL; + gf_fop_unlink_req_t *req = NULL; + size_t hdrlen = -1; + int ret = -1; + size_t pathlen = 0; + size_t baselen = 0; + ino_t par = 0; + client_conf_t *conf = this->private; + + if (conf->child) { + /* */ + STACK_WIND (frame, + default_unlink_cbk, + conf->child, + conf->child->fops->unlink, + loc); + + return 0; + } + + pathlen = STRLEN_0(loc->path); + baselen = STRLEN_0(loc->name); + par = this_ino_get (loc, this, GF_CLIENT_INODE_PARENT); + + hdrlen = gf_hdr_len (req, pathlen + baselen); + hdr = gf_hdr_new (req, pathlen + baselen); + GF_VALIDATE_OR_GOTO(this->name, hdr, unwind); + + req = gf_param (hdr); + + req->par = hton64 (par); + strcpy 
(req->path, loc->path); + strcpy (req->bname + pathlen, loc->name); + + ret = protocol_client_xfer (frame, this, + CLIENT_CHANNEL (this, CHANNEL_BULK), + GF_OP_TYPE_FOP_REQUEST, GF_FOP_UNLINK, + hdr, hdrlen, NULL, 0, NULL); + + return ret; +unwind: + if (hdr) + free (hdr); + STACK_UNWIND(frame, -1, EINVAL); + return 0; + +} + +/** + * client_rmdir - rmdir function for client protocol + * @frame: call frame + * @this: this translator structure + * @loc: location + * + * external reference through client_protocol_xlator->fops->rmdir + */ +int32_t +client_rmdir (call_frame_t *frame, + xlator_t *this, + loc_t *loc) +{ + gf_hdr_common_t *hdr = NULL; + gf_fop_rmdir_req_t *req = NULL; + size_t hdrlen = -1; + int ret = -1; + size_t pathlen = 0; + size_t baselen = 0; + ino_t par = 0; + client_conf_t *conf = this->private; + + if (conf->child) { + /* */ + STACK_WIND (frame, + default_rmdir_cbk, + conf->child, + conf->child->fops->rmdir, + loc); + + return 0; + } + + pathlen = STRLEN_0(loc->path); + baselen = STRLEN_0(loc->name); + par = this_ino_get (loc, this, GF_CLIENT_INODE_PARENT); + + hdrlen = gf_hdr_len (req, pathlen + baselen); + hdr = gf_hdr_new (req, pathlen + baselen); + GF_VALIDATE_OR_GOTO(this->name, hdr, unwind); + + req = gf_param (hdr); + + req->par = hton64 (par); + strcpy (req->path, loc->path); + strcpy (req->bname + pathlen, loc->name); + + ret = protocol_client_xfer (frame, this, + CLIENT_CHANNEL (this, CHANNEL_BULK), + GF_OP_TYPE_FOP_REQUEST, GF_FOP_RMDIR, + hdr, hdrlen, NULL, 0, NULL); + + return ret; +unwind: + if (hdr) + free (hdr); + STACK_UNWIND(frame, -1, EINVAL); + return 0; + +} + + + +/** + * client_symlink - symlink function for client protocol + * @frame: call frame + * @this: this translator structure + * @oldpath: pathname of target + * @newpath: pathname of symlink + * + * external reference through client_protocol_xlator->fops->symlink + */ +int32_t +client_symlink (call_frame_t *frame, + xlator_t *this, + const char *linkname, + loc_t 
*loc) +{ + int ret = -1; + gf_hdr_common_t *hdr = NULL; + gf_fop_symlink_req_t *req = NULL; + size_t hdrlen = 0; + size_t pathlen = 0; + size_t newlen = 0; + size_t baselen = 0; + ino_t par = 0; + client_conf_t *conf = this->private; + client_local_t *local = NULL; + + if (conf->child) { + /* */ + STACK_WIND (frame, + default_symlink_cbk, + conf->child, + conf->child->fops->symlink, + linkname, loc); + + return 0; + } + + local = calloc (1, sizeof (*local)); + GF_VALIDATE_OR_GOTO(this->name, local, unwind); + + loc_copy (&local->loc, loc); + + frame->local = local; + + pathlen = STRLEN_0 (loc->path); + baselen = STRLEN_0 (loc->name); + newlen = STRLEN_0 (linkname); + par = this_ino_get (loc, this, GF_CLIENT_INODE_PARENT); + + hdrlen = gf_hdr_len (req, pathlen + baselen + newlen); + hdr = gf_hdr_new (req, pathlen + baselen + newlen); + GF_VALIDATE_OR_GOTO(this->name, hdr, unwind); + + req = gf_param (hdr); + + req->par = hton64 (par); + strcpy (req->path, loc->path); + strcpy (req->bname + pathlen, loc->name); + strcpy (req->linkname + pathlen + baselen, linkname); + + ret = protocol_client_xfer (frame, this, + CLIENT_CHANNEL (this, CHANNEL_LOWLAT), + GF_OP_TYPE_FOP_REQUEST, GF_FOP_SYMLINK, + hdr, hdrlen, NULL, 0, NULL); + return ret; +unwind: + if (hdr) + free (hdr); + STACK_UNWIND(frame, -1, EINVAL, loc->inode, NULL); + return 0; + +} + + +/** + * client_rename - rename function for client protocol + * @frame: call frame + * @this: this translator structure + * @oldloc: location of old pathname + * @newloc: location of new pathname + * + * external reference through client_protocol_xlator->fops->rename + */ +int32_t +client_rename (call_frame_t *frame, + xlator_t *this, + loc_t *oldloc, + loc_t *newloc) +{ + int ret = -1; + gf_hdr_common_t *hdr = NULL; + gf_fop_rename_req_t *req = NULL; + size_t hdrlen = 0; + size_t oldpathlen = 0; + size_t oldbaselen = 0; + size_t newpathlen = 0; + size_t newbaselen = 0; + ino_t oldpar = 0; + ino_t newpar = 0; + client_conf_t 
*conf = this->private; + + if (conf->child) { + /* */ + STACK_WIND (frame, + default_rename_cbk, + conf->child, + conf->child->fops->rename, + oldloc, newloc); + + return 0; + } + + oldpathlen = STRLEN_0(oldloc->path); + oldbaselen = STRLEN_0(oldloc->name); + newpathlen = STRLEN_0(newloc->path); + newbaselen = STRLEN_0(newloc->name); + oldpar = this_ino_get (oldloc, this, GF_CLIENT_INODE_PARENT); + newpar = this_ino_get (newloc, this, GF_CLIENT_INODE_PARENT); + + hdrlen = gf_hdr_len (req, (oldpathlen + oldbaselen + + newpathlen + newbaselen)); + hdr = gf_hdr_new (req, (oldpathlen + oldbaselen + + newpathlen + newbaselen)); + + GF_VALIDATE_OR_GOTO(this->name, hdr, unwind); + + req = gf_param (hdr); + + req->oldpar = hton64 (oldpar); + req->newpar = hton64 (newpar); + + strcpy (req->oldpath, oldloc->path); + strcpy (req->oldbname + oldpathlen, oldloc->name); + strcpy (req->newpath + oldpathlen + oldbaselen, newloc->path); + strcpy (req->newbname + oldpathlen + oldbaselen + newpathlen, + newloc->name); + + ret = protocol_client_xfer (frame, this, + CLIENT_CHANNEL (this, CHANNEL_LOWLAT), + GF_OP_TYPE_FOP_REQUEST, GF_FOP_RENAME, + hdr, hdrlen, NULL, 0, NULL); + return ret; +unwind: + if (hdr) + free (hdr); + STACK_UNWIND(frame, -1, EINVAL, NULL); + return 0; + +} + + + +/** + * client_link - link function for client protocol + * @frame: call frame + * @this: this translator structure + * @oldloc: location of old pathname + * @newpath: new pathname + * + * external reference through client_protocol_xlator->fops->link + */ + +int32_t +client_link (call_frame_t *frame, + xlator_t *this, + loc_t *oldloc, + loc_t *newloc) +{ + int ret = -1; + gf_hdr_common_t *hdr = NULL; + gf_fop_link_req_t *req = NULL; + size_t hdrlen = 0; + size_t oldpathlen = 0; + size_t newpathlen = 0; + size_t newbaselen = 0; + ino_t oldino = 0; + ino_t newpar = 0; + client_conf_t *conf = this->private; + client_local_t *local = NULL; + + if (conf->child) { + /* */ + STACK_WIND (frame, + 
default_link_cbk, + conf->child, + conf->child->fops->link, + oldloc, newloc); + + return 0; + } + + local = calloc (1, sizeof (*local)); + GF_VALIDATE_OR_GOTO(this->name, local, unwind); + + loc_copy (&local->loc, oldloc); + + frame->local = local; + + oldpathlen = STRLEN_0(oldloc->path); + newpathlen = STRLEN_0(newloc->path); + newbaselen = STRLEN_0(newloc->name); + oldino = this_ino_get (oldloc, this, GF_CLIENT_INODE_SELF); + newpar = this_ino_get (newloc, this, GF_CLIENT_INODE_PARENT); + + hdrlen = gf_hdr_len (req, oldpathlen + newpathlen + newbaselen); + hdr = gf_hdr_new (req, oldpathlen + newpathlen + newbaselen); + GF_VALIDATE_OR_GOTO(this->name, hdr, unwind); + + req = gf_param (hdr); + + strcpy (req->oldpath, oldloc->path); + strcpy (req->newpath + oldpathlen, newloc->path); + strcpy (req->newbname + oldpathlen + newpathlen, newloc->name); + + req->oldino = hton64 (oldino); + req->newpar = hton64 (newpar); + + ret = protocol_client_xfer (frame, this, + CLIENT_CHANNEL (this, CHANNEL_LOWLAT), + GF_OP_TYPE_FOP_REQUEST, GF_FOP_LINK, + hdr, hdrlen, NULL, 0, NULL); + return ret; +unwind: + if (hdr) + free (hdr); + STACK_UNWIND(frame, -1, EINVAL, oldloc->inode, NULL); + return 0; +} + + + +/** + * client_chmod - chmod function for client protocol + * @frame: call frame + * @this: this translator structure + * @loc: location + * @mode: + * + * external reference through client_protocol_xlator->fops->chmod + */ +int32_t +client_chmod (call_frame_t *frame, + xlator_t *this, + loc_t *loc, + mode_t mode) +{ + gf_hdr_common_t *hdr = NULL; + gf_fop_chmod_req_t *req = NULL; + size_t hdrlen = -1; + int ret = -1; + size_t pathlen = 0; + ino_t ino = 0; + client_conf_t *conf = this->private; + + if (conf->child) { + /* */ + STACK_WIND (frame, + default_chmod_cbk, + conf->child, + conf->child->fops->chmod, + loc, + mode); + + return 0; + } + + pathlen = STRLEN_0(loc->path); + ino = this_ino_get (loc, this, GF_CLIENT_INODE_SELF); + + hdrlen = gf_hdr_len (req, pathlen); + hdr = 
gf_hdr_new (req, pathlen); + GF_VALIDATE_OR_GOTO(this->name, hdr, unwind); + + req = gf_param (hdr); + + req->ino = hton64 (ino); + req->mode = hton32 (mode); + strcpy (req->path, loc->path); + + ret = protocol_client_xfer (frame, this, + CLIENT_CHANNEL (this, CHANNEL_LOWLAT), + GF_OP_TYPE_FOP_REQUEST, GF_FOP_CHMOD, + hdr, hdrlen, NULL, 0, NULL); + + return ret; +unwind: + if (hdr) + free (hdr); + STACK_UNWIND(frame, -1, EINVAL, NULL); + return 0; + +} + + +/** + * client_chown - chown function for client protocol + * @frame: call frame + * @this: this translator structure + * @loc: location + * @uid: uid of new owner + * @gid: gid of new owner group + * + * external reference through client_protocol_xlator->fops->chown + */ +int32_t +client_chown (call_frame_t *frame, + xlator_t *this, + loc_t *loc, + uid_t uid, + gid_t gid) +{ + gf_hdr_common_t *hdr = NULL; + gf_fop_chown_req_t *req = NULL; + size_t hdrlen = -1; + int ret = -1; + size_t pathlen = 0; + ino_t ino = 0; + client_conf_t *conf = this->private; + + if (conf->child) { + /* */ + STACK_WIND (frame, + default_chown_cbk, + conf->child, + conf->child->fops->chown, + loc, + uid, + gid); + + return 0; + } + + pathlen = STRLEN_0(loc->path); + ino = this_ino_get (loc, this, GF_CLIENT_INODE_SELF); + + hdrlen = gf_hdr_len (req, pathlen); + hdr = gf_hdr_new (req, pathlen); + GF_VALIDATE_OR_GOTO(this->name, hdr, unwind); + + req = gf_param (hdr); + + req->ino = hton64 (ino); + req->uid = hton32 (uid); + req->gid = hton32 (gid); + strcpy (req->path, loc->path); + + ret = protocol_client_xfer (frame, this, + CLIENT_CHANNEL (this, CHANNEL_LOWLAT), + GF_OP_TYPE_FOP_REQUEST, GF_FOP_CHOWN, + hdr, hdrlen, NULL, 0, NULL); + + return ret; +unwind: + if (hdr) + free (hdr); + STACK_UNWIND(frame, -1, EINVAL, NULL); + return 0; + +} + +/** + * client_truncate - truncate function for client protocol + * @frame: call frame + * @this: this translator structure + * @loc: location + * @offset: + * + * external reference through 
client_protocol_xlator->fops->truncate + */ +int32_t +client_truncate (call_frame_t *frame, + xlator_t *this, + loc_t *loc, + off_t offset) +{ + gf_hdr_common_t *hdr = NULL; + gf_fop_truncate_req_t *req = NULL; + size_t hdrlen = -1; + int ret = -1; + size_t pathlen = 0; + ino_t ino = 0; + client_conf_t *conf = this->private; + + if (conf->child) { + /* */ + STACK_WIND (frame, + default_truncate_cbk, + conf->child, + conf->child->fops->truncate, + loc, + offset); + + return 0; + } + + pathlen = STRLEN_0(loc->path); + ino = this_ino_get (loc, this, GF_CLIENT_INODE_SELF); + + hdrlen = gf_hdr_len (req, pathlen); + hdr = gf_hdr_new (req, pathlen); + GF_VALIDATE_OR_GOTO(this->name, hdr, unwind); + + req = gf_param (hdr); + + req->ino = hton64 (ino); + req->offset = hton64 (offset); + strcpy (req->path, loc->path); + + ret = protocol_client_xfer (frame, this, + CLIENT_CHANNEL (this, CHANNEL_BULK), + GF_OP_TYPE_FOP_REQUEST, GF_FOP_TRUNCATE, + hdr, hdrlen, NULL, 0, NULL); + + return ret; +unwind: + if (hdr) + free (hdr); + STACK_UNWIND(frame, -1, EINVAL, NULL); + return 0; + +} + + + +/** + * client_utimes - utimes function for client protocol + * @frame: call frame + * @this: this translator structure + * @loc: location + * @tvp: + * + * external reference through client_protocol_xlator->fops->utimes + */ +int32_t +client_utimens (call_frame_t *frame, + xlator_t *this, + loc_t *loc, + struct timespec *tvp) +{ + gf_hdr_common_t *hdr = NULL; + gf_fop_utimens_req_t *req = NULL; + size_t hdrlen = -1; + int ret = -1; + size_t pathlen = 0; + ino_t ino = 0; + client_conf_t *conf = this->private; + + if (conf->child) { + /* */ + STACK_WIND (frame, + default_utimens_cbk, + conf->child, + conf->child->fops->utimens, + loc, + tvp); + + return 0; + } + + pathlen = STRLEN_0(loc->path); + ino = this_ino_get (loc, this, GF_CLIENT_INODE_SELF); + + hdrlen = gf_hdr_len (req, pathlen); + hdr = gf_hdr_new (req, pathlen); + GF_VALIDATE_OR_GOTO(this->name, hdr, unwind); + + req = gf_param 
(hdr); + + req->ino = hton64 (ino); + gf_timespec_from_timespec (req->tv, tvp); + strcpy (req->path, loc->path); + + ret = protocol_client_xfer (frame, this, + CLIENT_CHANNEL (this, CHANNEL_LOWLAT), + GF_OP_TYPE_FOP_REQUEST, GF_FOP_UTIMENS, + hdr, hdrlen, NULL, 0, NULL); + + return ret; +unwind: + if (hdr) + free (hdr); + STACK_UNWIND(frame, -1, EINVAL, NULL); + return 0; + +} + + + +/** + * client_readv - readv function for client protocol + * @frame: call frame + * @this: this translator structure + * @fd: file descriptor structure + * @size: + * @offset: + * + * external reference through client_protocol_xlator->fops->readv + */ +int32_t +client_readv (call_frame_t *frame, + xlator_t *this, + fd_t *fd, + size_t size, + off_t offset) +{ + gf_hdr_common_t *hdr = NULL; + gf_fop_read_req_t *req = NULL; + size_t hdrlen = 0; + int64_t remote_fd = -1; + int ret = -1; + client_conf_t *conf = this->private; + + if (conf->child) { + /* */ + STACK_WIND (frame, + default_readv_cbk, + conf->child, + conf->child->fops->readv, + fd, + size, + offset); + + return 0; + } + + ret = this_fd_get (fd, this, &remote_fd); + if (ret == -1) { + gf_log (this->name, GF_LOG_DEBUG, + "(%"PRId64"): failed to get remote fd, returning EBADFD", + fd->inode->ino); + STACK_UNWIND (frame, -1, EBADFD, NULL, 0, NULL); + return 0; + } + + hdrlen = gf_hdr_len (req, 0); + hdr = gf_hdr_new (req, 0); + GF_VALIDATE_OR_GOTO(this->name, hdr, unwind); + + req = gf_param (hdr); + + req->fd = hton64 (remote_fd); + req->size = hton32 (size); + req->offset = hton64 (offset); + + ret = protocol_client_xfer (frame, this, + CLIENT_CHANNEL (this, CHANNEL_BULK), + GF_OP_TYPE_FOP_REQUEST, GF_FOP_READ, + hdr, hdrlen, NULL, 0, NULL); + + return 0; +unwind: + if (hdr) + free (hdr); + STACK_UNWIND(frame, -1, EINVAL, NULL, 0, NULL); + return 0; + +} + + +/** + * client_writev - writev function for client protocol + * @frame: call frame + * @this: this translator structure + * @fd: file descriptor structure + * @vector: + * 
@count: + * @offset: + * + * external reference through client_protocol_xlator->fops->writev + */ +int32_t +client_writev (call_frame_t *frame, + xlator_t *this, + fd_t *fd, + struct iovec *vector, + int32_t count, + off_t offset) +{ + gf_hdr_common_t *hdr = NULL; + gf_fop_write_req_t *req = NULL; + size_t hdrlen = 0; + int64_t remote_fd = -1; + int ret = -1; + client_conf_t *conf = this->private; + + if (conf->child) { + /* */ + STACK_WIND (frame, + default_writev_cbk, + conf->child, + conf->child->fops->writev, + fd, + vector, + count, + offset); + + return 0; + } + + ret = this_fd_get (fd, this, &remote_fd); + if (ret == -1) { + gf_log (this->name, GF_LOG_DEBUG, + "(%"PRId64"): failed to get remote fd. returning EBADFD", + fd->inode->ino); + STACK_UNWIND (frame, -1, EBADFD, NULL); + return 0; + } + + hdrlen = gf_hdr_len (req, 0); + hdr = gf_hdr_new (req, 0); + GF_VALIDATE_OR_GOTO(this->name, hdr, unwind); + + req = gf_param (hdr); + + req->fd = hton64 (remote_fd); + req->size = hton32 (iov_length (vector, count)); + req->offset = hton64 (offset); + + ret = protocol_client_xfer (frame, this, + CLIENT_CHANNEL (this, CHANNEL_BULK), + GF_OP_TYPE_FOP_REQUEST, GF_FOP_WRITE, + hdr, hdrlen, vector, count, + frame->root->req_refs); + + return ret; +unwind: + if (hdr) + free (hdr); + STACK_UNWIND(frame, -1, EINVAL, NULL); + return 0; + +} + + +/** + * client_statfs - statfs function for client protocol + * @frame: call frame + * @this: this translator structure + * @loc: location + * + * external reference through client_protocol_xlator->fops->statfs + */ +int32_t +client_statfs (call_frame_t *frame, + xlator_t *this, + loc_t *loc) +{ + gf_hdr_common_t *hdr = NULL; + gf_fop_statfs_req_t *req = NULL; + size_t hdrlen = -1; + int ret = -1; + size_t pathlen = 0; + ino_t ino = 0; + client_conf_t *conf = this->private; + + if (conf->child) { + /* */ + STACK_WIND (frame, + default_statfs_cbk, + conf->child, + conf->child->fops->statfs, + loc); + + return 0; + } + + pathlen = 
STRLEN_0(loc->path); + ino = this_ino_get (loc, this, GF_CLIENT_INODE_SELF); + + hdrlen = gf_hdr_len (req, pathlen); + hdr = gf_hdr_new (req, pathlen); + GF_VALIDATE_OR_GOTO(this->name, hdr, unwind); + + req = gf_param (hdr); + + req->ino = hton64 (ino); + strcpy (req->path, loc->path); + + ret = protocol_client_xfer (frame, this, + CLIENT_CHANNEL (this, CHANNEL_LOWLAT), + GF_OP_TYPE_FOP_REQUEST, GF_FOP_STATFS, + hdr, hdrlen, NULL, 0, NULL); + + return ret; +unwind: + if (hdr) + free (hdr); + STACK_UNWIND(frame, -1, EINVAL, NULL); + return 0; + +} + + +/** + * client_flush - flush function for client protocol + * @frame: call frame + * @this: this translator structure + * @fd: file descriptor structure + * + * external reference through client_protocol_xlator->fops->flush + */ + +int32_t +client_flush (call_frame_t *frame, + xlator_t *this, + fd_t *fd) +{ + gf_hdr_common_t *hdr = NULL; + gf_fop_flush_req_t *req = NULL; + size_t hdrlen = 0; + int64_t remote_fd = -1; + int ret = -1; + client_conf_t *conf = this->private; + + if (conf->child) { + /* */ + STACK_WIND (frame, + default_flush_cbk, + conf->child, + conf->child->fops->flush, + fd); + + return 0; + } + + ret = this_fd_get (fd, this, &remote_fd); + if (ret == -1) { + gf_log (this->name, GF_LOG_DEBUG, + "(%"PRId64"): failed to get remote fd. 
returning EBADFD", + fd->inode->ino); + STACK_UNWIND (frame, -1, EBADFD); + return 0; + } + + hdrlen = gf_hdr_len (req, 0); + hdr = gf_hdr_new (req, 0); + GF_VALIDATE_OR_GOTO(this->name, hdr, unwind); + + req = gf_param (hdr); + + req->fd = hton64 (remote_fd); + + ret = protocol_client_xfer (frame, this, + CLIENT_CHANNEL (this, CHANNEL_BULK), + GF_OP_TYPE_FOP_REQUEST, GF_FOP_FLUSH, + hdr, hdrlen, NULL, 0, NULL); + + return 0; +unwind: + if (hdr) + free (hdr); + STACK_UNWIND(frame, -1, EINVAL); + return 0; + +} + + + + +/** + * client_fsync - fsync function for client protocol + * @frame: call frame + * @this: this translator structure + * @fd: file descriptor structure + * @flags: + * + * external reference through client_protocol_xlator->fops->fsync + */ + +int32_t +client_fsync (call_frame_t *frame, + xlator_t *this, + fd_t *fd, + int32_t flags) +{ + gf_hdr_common_t *hdr = NULL; + gf_fop_fsync_req_t *req = NULL; + size_t hdrlen = 0; + int64_t remote_fd = -1; + int32_t ret = -1; + client_conf_t *conf = this->private; + + if (conf->child) { + /* */ + STACK_WIND (frame, + default_fsync_cbk, + conf->child, + conf->child->fops->fsync, + fd, + flags); + + return 0; + } + + ret = this_fd_get (fd, this, &remote_fd); + if (ret == -1) { + gf_log (this->name, GF_LOG_DEBUG, + "(%"PRId64"): failed to get remote fd. 
returning EBADFD", + fd->inode->ino); + STACK_UNWIND(frame, -1, EBADFD); + return 0; + } + + hdrlen = gf_hdr_len (req, 0); + hdr = gf_hdr_new (req, 0); + GF_VALIDATE_OR_GOTO(this->name, hdr, unwind); + + req = gf_param (hdr); + + req->fd = hton64 (remote_fd); + req->data = hton32 (flags); + + ret = protocol_client_xfer (frame, this, + CLIENT_CHANNEL (this, CHANNEL_BULK), + GF_OP_TYPE_FOP_REQUEST, GF_FOP_FSYNC, + hdr, hdrlen, NULL, 0, NULL); + + return ret; +unwind: + if (hdr) + free (hdr); + STACK_UNWIND(frame, -1, EINVAL); + return 0; + +} + +int32_t +client_xattrop (call_frame_t *frame, + xlator_t *this, + loc_t *loc, + gf_xattrop_flags_t flags, + dict_t *dict) +{ + gf_hdr_common_t *hdr = NULL; + gf_fop_xattrop_req_t *req = NULL; + size_t hdrlen = 0; + size_t dict_len = 0; + int32_t ret = -1; + size_t pathlen = 0; + ino_t ino = 0; + client_conf_t *conf = NULL; + + GF_VALIDATE_OR_GOTO("client", this, unwind); + + conf = this->private; + if (conf->child) { + /* */ + STACK_WIND (frame, + default_xattrop_cbk, + conf->child, + conf->child->fops->xattrop, + loc, + flags, + dict); + + return 0; + } + + GF_VALIDATE_OR_GOTO(this->name, loc, unwind); + + if (dict) { + dict_len = dict_serialized_length (dict); + if (dict_len < 0) { + gf_log (this->name, GF_LOG_ERROR, + "failed to get serialized length of dict(%p)", + dict); + goto unwind; + } + } + + pathlen = STRLEN_0(loc->path); + ino = this_ino_get (loc, this, GF_CLIENT_INODE_SELF); + + hdrlen = gf_hdr_len (req, dict_len + pathlen); + hdr = gf_hdr_new (req, dict_len + pathlen); + GF_VALIDATE_OR_GOTO(this->name, hdr, unwind); + + req = gf_param (hdr); + + req->flags = hton32 (flags); + req->dict_len = hton32 (dict_len); + if (dict) { + ret = dict_serialize (dict, req->dict); + if (ret < 0) { + gf_log (this->name, GF_LOG_ERROR, + "failed to serialize dictionary(%p)", + dict); + goto unwind; + } + } + req->ino = hton64 (ino); + strcpy (req->path + dict_len, loc->path); + + ret = protocol_client_xfer (frame, this, + 
CLIENT_CHANNEL (this, CHANNEL_BULK), + GF_OP_TYPE_FOP_REQUEST, GF_FOP_XATTROP, + hdr, hdrlen, NULL, 0, NULL); + return ret; +unwind: + if (hdr) + free (hdr); + + STACK_UNWIND(frame, -1, EINVAL, NULL); + return 0; +} + + +int32_t +client_fxattrop (call_frame_t *frame, + xlator_t *this, + fd_t *fd, + gf_xattrop_flags_t flags, + dict_t *dict) +{ + gf_hdr_common_t *hdr = NULL; + gf_fop_fxattrop_req_t *req = NULL; + size_t hdrlen = 0; + size_t dict_len = 0; + int64_t remote_fd = -1; + int32_t ret = -1; + ino_t ino = 0; + client_conf_t *conf = this->private; + + if (conf->child) { + /* */ + STACK_WIND (frame, + default_fxattrop_cbk, + conf->child, + conf->child->fops->fxattrop, + fd, + flags, + dict); + + return 0; + } + + if (dict) { + dict_len = dict_serialized_length (dict); + if (dict_len < 0) { + gf_log (this->name, GF_LOG_ERROR, + "failed to get serialized length of dict(%p)", + dict); + goto unwind; + } + } + + if (fd) { + ret = this_fd_get (fd, this, &remote_fd); + if (ret == -1) { + gf_log (this->name, GF_LOG_DEBUG, + "(%"PRId64"): failed to get remote fd. 
returning EBADFD", + fd->inode->ino); + goto unwind; + } + ino = fd->inode->ino; + } + + hdrlen = gf_hdr_len (req, dict_len); + hdr = gf_hdr_new (req, dict_len); + GF_VALIDATE_OR_GOTO(this->name, hdr, unwind); + + req = gf_param (hdr); + + req->flags = hton32 (flags); + req->dict_len = hton32 (dict_len); + if (dict) { + ret = dict_serialize (dict, req->dict); + if (ret < 0) { + gf_log (this->name, GF_LOG_ERROR, + "failed to serialize dictionary(%p)", + dict); + goto unwind; + } + } + req->fd = hton64 (remote_fd); + req->ino = hton64 (ino); + + ret = protocol_client_xfer (frame, this, + CLIENT_CHANNEL (this, CHANNEL_BULK), + GF_OP_TYPE_FOP_REQUEST, GF_FOP_FXATTROP, + hdr, hdrlen, NULL, 0, NULL); + return ret; +unwind: + if (hdr) + free (hdr); + + STACK_UNWIND (frame, -1, EBADFD, NULL); + return 0; + +} + + +/** + * client_setxattr - setxattr function for client protocol + * @frame: call frame + * @this: this translator structure + * @loc: location + * @dict: dictionary which contains key:value to be set. 
+ * @flags: + * + * external reference through client_protocol_xlator->fops->setxattr + */ +int32_t +client_setxattr (call_frame_t *frame, + xlator_t *this, + loc_t *loc, + dict_t *dict, + int32_t flags) +{ + gf_hdr_common_t *hdr = NULL; + gf_fop_setxattr_req_t *req = NULL; + size_t hdrlen = 0; + size_t dict_len = 0; + int ret = -1; + size_t pathlen = 0; + ino_t ino = 0; + client_conf_t *conf = this->private; + + if (conf->child) { + /* */ + STACK_WIND (frame, + default_setxattr_cbk, + conf->child, + conf->child->fops->setxattr, + loc, + dict, + flags); + + return 0; + } + + dict_len = dict_serialized_length (dict); + if (dict_len < 0) { + gf_log (this->name, GF_LOG_ERROR, + "failed to get serialized length of dict(%p)", + dict); + goto unwind; + } + + pathlen = STRLEN_0(loc->path); + ino = this_ino_get (loc, this, GF_CLIENT_INODE_SELF); + + hdrlen = gf_hdr_len (req, dict_len + pathlen); + hdr = gf_hdr_new (req, dict_len + pathlen); + GF_VALIDATE_OR_GOTO(this->name, hdr, unwind); + + req = gf_param (hdr); + + req->ino = hton64 (ino); + req->flags = hton32 (flags); + req->dict_len = hton32 (dict_len); + + ret = dict_serialize (dict, req->dict); + if (ret < 0) { + gf_log (this->name, GF_LOG_ERROR, + "failed to serialize dictionary(%p)", + dict); + goto unwind; + } + + strcpy (req->path + dict_len, loc->path); + + ret = protocol_client_xfer (frame, this, + CLIENT_CHANNEL (this, CHANNEL_BULK), + GF_OP_TYPE_FOP_REQUEST, GF_FOP_SETXATTR, + hdr, hdrlen, NULL, 0, NULL); + return ret; +unwind: + if (hdr) + free (hdr); + + STACK_UNWIND(frame, -1, EINVAL); + return 0; +} + +/** + * client_getxattr - getxattr function for client protocol + * @frame: call frame + * @this: this translator structure + * @loc: location structure + * + * external reference through client_protocol_xlator->fops->getxattr + */ +int32_t +client_getxattr (call_frame_t *frame, + xlator_t *this, + loc_t *loc, + const char *name) +{ + int ret = -1; + gf_hdr_common_t *hdr = NULL; + gf_fop_getxattr_req_t 
*req = NULL; + size_t hdrlen = 0; + size_t pathlen = 0; + size_t namelen = 0; + ino_t ino = 0; + client_conf_t *conf = this->private; + + if (conf->child) { + /* */ + STACK_WIND (frame, + default_getxattr_cbk, + conf->child, + conf->child->fops->getxattr, + loc, + name); + + return 0; + } + + pathlen = STRLEN_0(loc->path); + if (name) + namelen = STRLEN_0(name); + + ino = this_ino_get (loc, this, GF_CLIENT_INODE_SELF); + + hdrlen = gf_hdr_len (req, pathlen + namelen); + hdr = gf_hdr_new (req, pathlen + namelen); + GF_VALIDATE_OR_GOTO(frame->this->name, hdr, unwind); + + req = gf_param (hdr); + + req->ino = hton64 (ino); + req->namelen = hton32 (namelen); + strcpy (req->path, loc->path); + if (name) + strcpy (req->name + pathlen, name); + + ret = protocol_client_xfer (frame, this, + CLIENT_CHANNEL (this, CHANNEL_LOWLAT), + GF_OP_TYPE_FOP_REQUEST, GF_FOP_GETXATTR, + hdr, hdrlen, NULL, 0, NULL); + return ret; +unwind: + if (hdr) + free (hdr); + + STACK_UNWIND(frame, -1, EINVAL, NULL); + return 0; +} + +/** + * client_removexattr - removexattr function for client protocol + * @frame: call frame + * @this: this translator structure + * @loc: location structure + * @name: + * + * external reference through client_protocol_xlator->fops->removexattr + */ +int32_t +client_removexattr (call_frame_t *frame, + xlator_t *this, + loc_t *loc, + const char *name) +{ + int ret = -1; + gf_hdr_common_t *hdr = NULL; + gf_fop_removexattr_req_t *req = NULL; + size_t hdrlen = 0; + size_t namelen = 0; + size_t pathlen = 0; + ino_t ino = 0; + client_conf_t *conf = this->private; + + if (conf->child) { + /* */ + STACK_WIND (frame, + default_removexattr_cbk, + conf->child, + conf->child->fops->removexattr, + loc, + name); + + return 0; + } + + pathlen = STRLEN_0(loc->path); + namelen = STRLEN_0(name); + ino = this_ino_get (loc, this, GF_CLIENT_INODE_SELF); + + hdrlen = gf_hdr_len (req, pathlen + namelen); + hdr = gf_hdr_new (req, pathlen + namelen); + GF_VALIDATE_OR_GOTO(frame->this->name, 
hdr, unwind); + + req = gf_param (hdr); + + req->ino = hton64 (ino); + strcpy (req->path, loc->path); + strcpy (req->name + pathlen, name); + + ret = protocol_client_xfer (frame, this, + CLIENT_CHANNEL (this, CHANNEL_LOWLAT), + GF_OP_TYPE_FOP_REQUEST, GF_FOP_REMOVEXATTR, + hdr, hdrlen, NULL, 0, NULL); + return ret; +unwind: + if (hdr) + free (hdr); + STACK_UNWIND(frame, -1, EINVAL); + return 0; +} + + +/** + * client_opendir - opendir function for client protocol + * @frame: call frame + * @this: this translator structure + * @loc: location structure + * + * external reference through client_protocol_xlator->fops->opendir + */ +int32_t +client_opendir (call_frame_t *frame, + xlator_t *this, + loc_t *loc, + fd_t *fd) +{ + gf_fop_opendir_req_t *req = NULL; + gf_hdr_common_t *hdr = NULL; + size_t hdrlen = 0; + int ret = -1; + ino_t ino = 0; + size_t pathlen = 0; + client_conf_t *conf = this->private; + client_local_t *local = NULL; + + if (conf->child) { + /* */ + STACK_WIND (frame, + default_opendir_cbk, + conf->child, + conf->child->fops->opendir, + loc, fd); + + return 0; + } + + local = calloc (1, sizeof (*local)); + GF_VALIDATE_OR_GOTO(this->name, local, unwind); + + loc_copy (&local->loc, loc); + local->fd = fd_ref (fd); + + frame->local = local; + + ino = this_ino_get (loc, this, GF_CLIENT_INODE_SELF); + pathlen = STRLEN_0(loc->path); + + hdrlen = gf_hdr_len (req, pathlen); + hdr = gf_hdr_new (req, pathlen); + GF_VALIDATE_OR_GOTO(frame->this->name, hdr, unwind); + + req = gf_param (hdr); + + req->ino = hton64 (ino); + strcpy (req->path, loc->path); + + ret = protocol_client_xfer (frame, this, + CLIENT_CHANNEL (this, CHANNEL_LOWLAT), + GF_OP_TYPE_FOP_REQUEST, GF_FOP_OPENDIR, + hdr, hdrlen, NULL, 0, NULL); + + return ret; +unwind: + if (hdr) + free (hdr); + STACK_UNWIND(frame, -1, EINVAL, fd); + return 0; + +} + + +/** + * client_readdir - readdir function for client protocol + * @frame: call frame + * @this: this translator structure + * + * external reference 
through client_protocol_xlator->fops->readdir + */ + +int32_t +client_getdents (call_frame_t *frame, + xlator_t *this, + fd_t *fd, + size_t size, + off_t offset, + int32_t flag) +{ + gf_hdr_common_t *hdr = NULL; + gf_fop_getdents_req_t *req = NULL; + size_t hdrlen = 0; + int64_t remote_fd = -1; + int ret = -1; + client_conf_t *conf = this->private; + + if (conf->child) { + /* */ + STACK_WIND (frame, + default_getdents_cbk, + conf->child, + conf->child->fops->getdents, + fd, + size, + offset, + flag); + + return 0; + } + + ret = this_fd_get (fd, this, &remote_fd); + if (ret == -1) { + gf_log (this->name, GF_LOG_DEBUG, + "(%"PRId64"): failed to get remote fd. returning EBADFD", + fd->inode->ino); + STACK_UNWIND (frame, -1, EBADFD, NULL); + return 0; + } + + hdrlen = gf_hdr_len (req, 0); + hdr = gf_hdr_new (req, 0); + GF_VALIDATE_OR_GOTO(frame->this->name, hdr, unwind); + + req = gf_param (hdr); + GF_VALIDATE_OR_GOTO(frame->this->name, hdr, unwind); + + req->fd = hton64 (remote_fd); + req->size = hton32 (size); + req->offset = hton64 (offset); + req->flags = hton32 (flag); + + ret = protocol_client_xfer (frame, this, + CLIENT_CHANNEL (this, CHANNEL_BULK), + GF_OP_TYPE_FOP_REQUEST, GF_FOP_GETDENTS, + hdr, hdrlen, NULL, 0, NULL); + + return 0; +unwind: + STACK_UNWIND(frame, -1, EINVAL, NULL, 0); + return 0; +} + +/** + * client_readdir - readdir function for client protocol + * @frame: call frame + * @this: this translator structure + * + * external reference through client_protocol_xlator->fops->readdir + */ + +int32_t +client_readdir (call_frame_t *frame, + xlator_t *this, + fd_t *fd, + size_t size, + off_t offset) +{ + gf_hdr_common_t *hdr = NULL; + gf_fop_readdir_req_t *req = NULL; + size_t hdrlen = 0; + int64_t remote_fd = -1; + int ret = -1; + client_conf_t *conf = this->private; + + if (conf->child) { + /* */ + STACK_WIND (frame, + default_readdir_cbk, + conf->child, + conf->child->fops->readdir, + fd, size, offset); + + return 0; + } + + ret = this_fd_get (fd, 
this, &remote_fd); + if (ret == -1) { + gf_log (this->name, GF_LOG_DEBUG, + "(%"PRId64"): failed to get remote fd. returning EBADFD", + fd->inode->ino); + goto unwind; + } + + hdrlen = gf_hdr_len (req, 0); + hdr = gf_hdr_new (req, 0); + GF_VALIDATE_OR_GOTO(this->name, hdr, unwind); + + req = gf_param (hdr); + GF_VALIDATE_OR_GOTO(this->name, hdr, unwind); + + req->fd = hton64 (remote_fd); + req->size = hton32 (size); + req->offset = hton64 (offset); + + ret = protocol_client_xfer (frame, this, + CLIENT_CHANNEL (this, CHANNEL_LOWLAT), + GF_OP_TYPE_FOP_REQUEST, GF_FOP_READDIR, + hdr, hdrlen, NULL, 0, NULL); + + return 0; +unwind: + if (hdr) + free (hdr); + STACK_UNWIND (frame, -1, EBADFD, NULL); + return 0; + +} + + + +/** + * client_fsyncdir - fsyncdir function for client protocol + * @frame: call frame + * @this: this translator structure + * @fd: file descriptor structure + * @flags: + * + * external reference through client_protocol_xlator->fops->fsyncdir + */ + +int32_t +client_fsyncdir (call_frame_t *frame, + xlator_t *this, + fd_t *fd, + int32_t flags) +{ + gf_hdr_common_t *hdr = NULL; + gf_fop_fsyncdir_req_t *req = NULL; + size_t hdrlen = 0; + int64_t remote_fd = -1; + int32_t ret = -1; + client_conf_t *conf = this->private; + + if (conf->child) { + /* */ + STACK_WIND (frame, + default_fsyncdir_cbk, + conf->child, + conf->child->fops->fsyncdir, + fd, + flags); + + return 0; + } + + ret = this_fd_get (fd, this, &remote_fd); + if (ret == -1) { + gf_log (this->name, GF_LOG_DEBUG, + "(%"PRId64"): failed to get remote fd. 
returning EBADFD", + fd->inode->ino); + goto unwind; + } + + hdrlen = gf_hdr_len (req, 0); + hdr = gf_hdr_new (req, 0); + GF_VALIDATE_OR_GOTO(this->name, hdr, unwind); + + req = gf_param (hdr); + + req->data = hton32 (flags); + req->fd = hton64 (remote_fd); + + ret = protocol_client_xfer (frame, this, + CLIENT_CHANNEL (this, CHANNEL_BULK), + GF_OP_TYPE_FOP_REQUEST, GF_FOP_FSYNCDIR, + hdr, hdrlen, NULL, 0, NULL); + + return ret; +unwind: + STACK_UNWIND (frame, -1, EBADFD); + return 0; +} + + +/** + * client_access - access function for client protocol + * @frame: call frame + * @this: this translator structure + * @loc: location structure + * @mode: + * + * external reference through client_protocol_xlator->fops->access + */ +int32_t +client_access (call_frame_t *frame, + xlator_t *this, + loc_t *loc, + int32_t mask) +{ + gf_hdr_common_t *hdr = NULL; + gf_fop_access_req_t *req = NULL; + size_t hdrlen = -1; + int ret = -1; + ino_t ino = 0; + size_t pathlen = 0; + client_conf_t *conf = this->private; + + if (conf->child) { + /* */ + STACK_WIND (frame, + default_access_cbk, + conf->child, + conf->child->fops->access, + loc, + mask); + + return 0; + } + + ino = this_ino_get (loc, this, GF_CLIENT_INODE_SELF); + pathlen = STRLEN_0(loc->path); + + hdrlen = gf_hdr_len (req, pathlen); + hdr = gf_hdr_new (req, pathlen); + GF_VALIDATE_OR_GOTO(this->name, hdr, unwind); + + req = gf_param (hdr); + + req->ino = hton64 (ino); + req->mask = hton32 (mask); + strcpy (req->path, loc->path); + + ret = protocol_client_xfer (frame, this, + CLIENT_CHANNEL (this, CHANNEL_LOWLAT), + GF_OP_TYPE_FOP_REQUEST, GF_FOP_ACCESS, + hdr, hdrlen, NULL, 0, NULL); + + return ret; +unwind: + if (hdr) + free (hdr); + + STACK_UNWIND(frame, -1, EINVAL); + return 0; + +} + + +/** + * client_ftrucate - ftruncate function for client protocol + * @frame: call frame + * @this: this translator structure + * @fd: file descriptor structure + * @offset: offset to truncate to + * + * external reference through 
client_protocol_xlator->fops->ftruncate + */ + +int32_t +client_ftruncate (call_frame_t *frame, + xlator_t *this, + fd_t *fd, + off_t offset) +{ + gf_hdr_common_t *hdr = NULL; + gf_fop_ftruncate_req_t *req = NULL; + int64_t remote_fd = -1; + size_t hdrlen = -1; + int ret = -1; + client_conf_t *conf = this->private; + + if (conf->child) { + /* */ + STACK_WIND (frame, + default_ftruncate_cbk, + conf->child, + conf->child->fops->ftruncate, + fd, + offset); + + return 0; + } + + ret = this_fd_get (fd, this, &remote_fd); + if (ret == -1) { + gf_log (this->name, GF_LOG_DEBUG, + "(%"PRId64"): failed to get remote fd. returning EBADFD", + fd->inode->ino); + STACK_UNWIND (frame, -1, EBADFD, NULL); + return 0; + } + + hdrlen = gf_hdr_len (req, 0); + hdr = gf_hdr_new (req, 0); + GF_VALIDATE_OR_GOTO(this->name, hdr, unwind); + + req = gf_param (hdr); + + req->fd = hton64 (remote_fd); + req->offset = hton64 (offset); + + ret = protocol_client_xfer (frame, this, + CLIENT_CHANNEL (this, CHANNEL_BULK), + GF_OP_TYPE_FOP_REQUEST, GF_FOP_FTRUNCATE, + hdr, hdrlen, NULL, 0, NULL); + + return ret; +unwind: + if (hdr) + free (hdr); + + STACK_UNWIND(frame, -1, EINVAL, NULL); + return 0; + +} + + +/** + * client_fstat - fstat function for client protocol + * @frame: call frame + * @this: this translator structure + * @fd: file descriptor structure + * + * external reference through client_protocol_xlator->fops->fstat + */ + +int32_t +client_fstat (call_frame_t *frame, + xlator_t *this, + fd_t *fd) +{ + gf_hdr_common_t *hdr = NULL; + gf_fop_fstat_req_t *req = NULL; + int64_t remote_fd = -1; + size_t hdrlen = -1; + int ret = -1; + client_conf_t *conf = this->private; + + if (conf->child) { + /* */ + STACK_WIND (frame, + default_fstat_cbk, + conf->child, + conf->child->fops->fstat, + fd); + + return 0; + } + + ret = this_fd_get (fd, this, &remote_fd); + if (ret == -1) { + gf_log (this->name, GF_LOG_DEBUG, + "(%"PRId64"): failed to get remote fd. 
returning EBADFD", + fd->inode->ino); + STACK_UNWIND (frame, -1, EBADFD, NULL); + return 0; + } + + hdrlen = gf_hdr_len (req, 0); + hdr = gf_hdr_new (req, 0); + GF_VALIDATE_OR_GOTO(this->name, hdr, unwind); + + req = gf_param (hdr); + + req->fd = hton64 (remote_fd); + + ret = protocol_client_xfer (frame, this, + CLIENT_CHANNEL (this, CHANNEL_BULK), + GF_OP_TYPE_FOP_REQUEST, GF_FOP_FSTAT, + hdr, hdrlen, NULL, 0, NULL); + + return ret; +unwind: + if (hdr) + free (hdr); + + STACK_UNWIND(frame, -1, EINVAL, NULL); + return 0; + +} + + +/** + * client_lk - lk function for client protocol + * @frame: call frame + * @this: this translator structure + * @fd: file descriptor structure + * @cmd: lock command + * @lock: + * + * external reference through client_protocol_xlator->fops->lk + */ +int32_t +client_lk (call_frame_t *frame, + xlator_t *this, + fd_t *fd, + int32_t cmd, + struct flock *flock) +{ + int ret = -1; + gf_hdr_common_t *hdr = NULL; + gf_fop_lk_req_t *req = NULL; + size_t hdrlen = 0; + int64_t remote_fd = -1; + int32_t gf_cmd = 0; + int32_t gf_type = 0; + client_conf_t *conf = this->private; + + if (conf->child) { + /* */ + STACK_WIND (frame, + default_lk_cbk, + conf->child, + conf->child->fops->lk, + fd, + cmd, + flock); + + return 0; + } + + ret = this_fd_get (fd, this, &remote_fd); + if (ret == -1) { + gf_log (this->name, GF_LOG_DEBUG, + "(%"PRId64"): failed to get remote fd. 
returning EBADFD", + fd->inode->ino); + STACK_UNWIND(frame, -1, EBADFD, NULL); + return 0; + } + + if (cmd == F_GETLK || cmd == F_GETLK64) + gf_cmd = GF_LK_GETLK; + else if (cmd == F_SETLK || cmd == F_SETLK64) + gf_cmd = GF_LK_SETLK; + else if (cmd == F_SETLKW || cmd == F_SETLKW64) + gf_cmd = GF_LK_SETLKW; + else { + gf_log (this->name, GF_LOG_ERROR, + "Unknown cmd (%d)!", gf_cmd); + goto unwind; + } + + switch (flock->l_type) { + case F_RDLCK: + gf_type = GF_LK_F_RDLCK; + break; + case F_WRLCK: + gf_type = GF_LK_F_WRLCK; + break; + case F_UNLCK: + gf_type = GF_LK_F_UNLCK; + break; + } + + hdrlen = gf_hdr_len (req, 0); + hdr = gf_hdr_new (req, 0); + GF_VALIDATE_OR_GOTO(this->name, hdr, unwind); + + req = gf_param (hdr); + + req->fd = hton64 (remote_fd); + req->cmd = hton32 (gf_cmd); + req->type = hton32 (gf_type); + gf_flock_from_flock (&req->flock, flock); + + ret = protocol_client_xfer (frame, this, + CLIENT_CHANNEL (this, CHANNEL_BULK), + GF_OP_TYPE_FOP_REQUEST, GF_FOP_LK, + hdr, hdrlen, NULL, 0, NULL); + return ret; +unwind: + if (hdr) + free (hdr); + + STACK_UNWIND(frame, -1, EINVAL, NULL); + return 0; +} + + +/** + * client_inodelk - inodelk function for client protocol + * @frame: call frame + * @this: this translator structure + * @inode: inode structure + * @cmd: lock command + * @lock: flock struct + * + * external reference through client_protocol_xlator->fops->inodelk + */ +int32_t +client_inodelk (call_frame_t *frame, + xlator_t *this, + loc_t *loc, + int32_t cmd, + struct flock *flock) +{ + int ret = -1; + gf_hdr_common_t *hdr = NULL; + gf_fop_inodelk_req_t *req = NULL; + size_t hdrlen = 0; + int32_t gf_cmd = 0; + int32_t gf_type = 0; + ino_t ino = 0; + size_t pathlen = 0; + client_conf_t *conf = this->private; + + if (conf->child) { + /* */ + STACK_WIND (frame, + default_inodelk_cbk, + conf->child, + conf->child->fops->inodelk, + loc, cmd, flock); + + return 0; + } + + pathlen = STRLEN_0(loc->path); + ino = this_ino_get (loc, this, 
GF_CLIENT_INODE_SELF); + + if (cmd == F_GETLK || cmd == F_GETLK64) + gf_cmd = GF_LK_GETLK; + else if (cmd == F_SETLK || cmd == F_SETLK64) + gf_cmd = GF_LK_SETLK; + else if (cmd == F_SETLKW || cmd == F_SETLKW64) + gf_cmd = GF_LK_SETLKW; + else { + gf_log (this->name, GF_LOG_ERROR, + "Unknown cmd (%d)!", gf_cmd); + goto unwind; + } + + switch (flock->l_type) { + case F_RDLCK: + gf_type = GF_LK_F_RDLCK; + break; + case F_WRLCK: + gf_type = GF_LK_F_WRLCK; + break; + case F_UNLCK: + gf_type = GF_LK_F_UNLCK; + break; + } + + hdrlen = gf_hdr_len (req, pathlen); + hdr = gf_hdr_new (req, pathlen); + GF_VALIDATE_OR_GOTO(this->name, hdr, unwind); + + req = gf_param (hdr); + + strcpy (req->path, loc->path); + + req->ino = hton64 (ino); + + req->cmd = hton32 (gf_cmd); + req->type = hton32 (gf_type); + gf_flock_from_flock (&req->flock, flock); + + + ret = protocol_client_xfer (frame, this, + CLIENT_CHANNEL (this, CHANNEL_BULK), + GF_OP_TYPE_FOP_REQUEST, + GF_FOP_INODELK, + hdr, hdrlen, NULL, 0, NULL); + return ret; +unwind: + if (hdr) + free (hdr); + + STACK_UNWIND(frame, -1, EINVAL); + return 0; + +} + + +/** + * client_finodelk - finodelk function for client protocol + * @frame: call frame + * @this: this translator structure + * @inode: inode structure + * @cmd: lock command + * @lock: flock struct + * + * external reference through client_protocol_xlator->fops->finodelk + */ +int32_t +client_finodelk (call_frame_t *frame, + xlator_t *this, + fd_t *fd, + int32_t cmd, + struct flock *flock) +{ + int ret = -1; + gf_hdr_common_t *hdr = NULL; + gf_fop_finodelk_req_t *req = NULL; + size_t hdrlen = 0; + int32_t gf_cmd = 0; + int32_t gf_type = 0; + int64_t remote_fd = -1; + client_conf_t *conf = this->private; + + if (conf->child) { + /* */ + STACK_WIND (frame, + default_finodelk_cbk, + conf->child, + conf->child->fops->finodelk, + fd, cmd, flock); + + return 0; + } + + ret = this_fd_get (fd, this, &remote_fd); + if (ret == -1) { + gf_log (this->name, GF_LOG_DEBUG, + "(%"PRId64"): 
failed to get remote fd. returning EBADFD", + fd->inode->ino); + STACK_UNWIND(frame, -1, EBADFD); + return 0; + } + + if (cmd == F_GETLK || cmd == F_GETLK64) + gf_cmd = GF_LK_GETLK; + else if (cmd == F_SETLK || cmd == F_SETLK64) + gf_cmd = GF_LK_SETLK; + else if (cmd == F_SETLKW || cmd == F_SETLKW64) + gf_cmd = GF_LK_SETLKW; + else { + gf_log (this->name, GF_LOG_ERROR, + "Unknown cmd (%d)!", gf_cmd); + goto unwind; + } + + switch (flock->l_type) { + case F_RDLCK: + gf_type = GF_LK_F_RDLCK; + break; + case F_WRLCK: + gf_type = GF_LK_F_WRLCK; + break; + case F_UNLCK: + gf_type = GF_LK_F_UNLCK; + break; + } + + hdrlen = gf_hdr_len (req, 0); + hdr = gf_hdr_new (req, 0); + GF_VALIDATE_OR_GOTO(this->name, hdr, unwind); + + req = gf_param (hdr); + + req->fd = hton64 (remote_fd); + + req->cmd = hton32 (gf_cmd); + req->type = hton32 (gf_type); + gf_flock_from_flock (&req->flock, flock); + + ret = protocol_client_xfer (frame, this, + CLIENT_CHANNEL (this, CHANNEL_BULK), + GF_OP_TYPE_FOP_REQUEST, + GF_FOP_FINODELK, + hdr, hdrlen, NULL, 0, NULL); + return ret; +unwind: + if (hdr) + free (hdr); + + STACK_UNWIND(frame, -1, EINVAL); + return 0; +} + + +int32_t +client_entrylk (call_frame_t *frame, + xlator_t *this, + loc_t *loc, + const char *name, + entrylk_cmd cmd, + entrylk_type type) +{ + gf_hdr_common_t *hdr = NULL; + gf_fop_entrylk_req_t *req = NULL; + size_t pathlen = 0; + size_t hdrlen = -1; + int ret = -1; + ino_t ino = 0; + size_t namelen = 0; + client_conf_t *conf = this->private; + + if (conf->child) { + /* */ + STACK_WIND (frame, default_entrylk_cbk, + conf->child, + conf->child->fops->entrylk, + loc, name, cmd, type); + + return 0; + } + + pathlen = STRLEN_0(loc->path); + if (name) + namelen = STRLEN_0(name); + + ino = this_ino_get (loc, this, GF_CLIENT_INODE_SELF); + + hdrlen = gf_hdr_len (req, pathlen + namelen); + hdr = gf_hdr_new (req, pathlen + namelen); + GF_VALIDATE_OR_GOTO(this->name, hdr, unwind); + + req = gf_param (hdr); + + req->ino = hton64 (ino); + 
req->namelen = hton64 (namelen); + + strcpy (req->path, loc->path); + if (name) + strcpy (req->name + pathlen, name); + + req->cmd = hton32 (cmd); + req->type = hton32 (type); + + ret = protocol_client_xfer (frame, this, + CLIENT_CHANNEL (this, CHANNEL_LOWLAT), + GF_OP_TYPE_FOP_REQUEST, GF_FOP_ENTRYLK, + hdr, hdrlen, NULL, 0, NULL); + + return ret; +unwind: + if (hdr) + free (hdr); + + STACK_UNWIND(frame, -1, EINVAL); + return 0; + +} + + +int32_t +client_fentrylk (call_frame_t *frame, + xlator_t *this, + fd_t *fd, + const char *name, + entrylk_cmd cmd, + entrylk_type type) +{ + gf_hdr_common_t *hdr = NULL; + gf_fop_fentrylk_req_t *req = NULL; + int64_t remote_fd = -1; + size_t namelen = 0; + size_t hdrlen = -1; + int ret = -1; + client_conf_t *conf = this->private; + + if (conf->child) { + /* */ + STACK_WIND (frame, default_fentrylk_cbk, + conf->child, + conf->child->fops->fentrylk, + fd, name, cmd, type); + + return 0; + } + + if (name) + namelen = STRLEN_0(name); + + ret = this_fd_get (fd, this, &remote_fd); + if (ret == -1) { + gf_log (this->name, GF_LOG_DEBUG, + "(%"PRId64"): failed to get remote fd. 
returning EBADFD", + fd->inode->ino); + STACK_UNWIND(frame, -1, EBADFD); + return 0; + } + + hdrlen = gf_hdr_len (req, namelen); + hdr = gf_hdr_new (req, namelen); + GF_VALIDATE_OR_GOTO(this->name, hdr, unwind); + + req = gf_param (hdr); + + req->fd = hton64 (remote_fd); + req->namelen = hton64 (namelen); + + if (name) + strcpy (req->name, name); + + req->cmd = hton32 (cmd); + req->type = hton32 (type); + + ret = protocol_client_xfer (frame, this, + CLIENT_CHANNEL (this, CHANNEL_LOWLAT), + GF_OP_TYPE_FOP_REQUEST, GF_FOP_FENTRYLK, + hdr, hdrlen, NULL, 0, NULL); + + return ret; +unwind: + if (hdr) + free (hdr); + + STACK_UNWIND(frame, -1, EINVAL); + return 0; +} + + +/* + * client_lookup - lookup function for client protocol + * @frame: call frame + * @this: + * @loc: location + * + * not for external reference + */ +int32_t +client_lookup (call_frame_t *frame, + xlator_t *this, + loc_t *loc, + dict_t *xattr_req) +{ + gf_hdr_common_t *hdr = NULL; + gf_fop_lookup_req_t *req = NULL; + size_t hdrlen = -1; + int ret = -1; + ino_t ino = 0; + ino_t par = 0; + size_t dictlen = 0; + size_t pathlen = 0; + size_t baselen = 0; + int32_t op_ret = -1; + int32_t op_errno = EINVAL; + client_conf_t *conf = this->private; + client_local_t *local = NULL; + + if (conf->child) { + /* */ + STACK_WIND (frame, + default_lookup_cbk, + conf->child, + conf->child->fops->lookup, + loc, + xattr_req); + + return 0; + } + + local = calloc (1, sizeof (*local)); + GF_VALIDATE_OR_GOTO(this->name, local, unwind); + + loc_copy (&local->loc, loc); + + frame->local = local; + + GF_VALIDATE_OR_GOTO (this->name, loc, unwind); + GF_VALIDATE_OR_GOTO (this->name, loc->path, unwind); + + if (loc->ino != 1) { + par = this_ino_get (loc, this, GF_CLIENT_INODE_PARENT); + GF_VALIDATE_OR_GOTO (this->name, loc->name, unwind); + baselen = STRLEN_0(loc->name); + } else { + ino = 1; + } + + pathlen = STRLEN_0(loc->path); + + if (xattr_req) { + dictlen = dict_serialized_length (xattr_req); + if (dictlen < 0) { + gf_log 
(this->name, GF_LOG_ERROR, + "failed to get serialized length of dict(%p)", + xattr_req); + ret = dictlen; + goto unwind; + } + } + + hdrlen = gf_hdr_len (req, pathlen + baselen + dictlen); + hdr = gf_hdr_new (req, pathlen + baselen + dictlen); + GF_VALIDATE_OR_GOTO (this->name, hdr, unwind); + + req = gf_param (hdr); + + req->ino = hton64 (ino); + req->par = hton64 (par); + strcpy (req->path, loc->path); + if (baselen) + strcpy (req->path + pathlen, loc->name); + + if (dictlen) { + ret = dict_serialize (xattr_req, req->dict + baselen + pathlen); + if (ret < 0) { + gf_log (this->name, GF_LOG_ERROR, + "failed to serialize dictionary(%p)", + xattr_req); + goto unwind; + } + } + + req->dictlen = hton32 (dictlen); + + ret = protocol_client_xfer (frame, this, + CLIENT_CHANNEL (this, CHANNEL_LOWLAT), + GF_OP_TYPE_FOP_REQUEST, GF_FOP_LOOKUP, + hdr, hdrlen, NULL, 0, NULL); + return ret; + +unwind: + STACK_UNWIND (frame, op_ret, op_errno, loc->inode, NULL, NULL); + return ret; +} + + + +/* + * client_fchmod + * + */ +int32_t +client_fchmod (call_frame_t *frame, + xlator_t *this, + fd_t *fd, + mode_t mode) +{ + gf_hdr_common_t *hdr = NULL; + gf_fop_fchmod_req_t *req = NULL; + int64_t remote_fd = -1; + size_t hdrlen = -1; + int ret = -1; + int32_t op_errno = EINVAL; + int32_t op_ret = -1; + client_conf_t *conf = this->private; + + if (conf->child) { + /* */ + STACK_WIND (frame, + default_fchmod_cbk, + conf->child, + conf->child->fops->fchmod, + fd, + mode); + + return 0; + } + + GF_VALIDATE_OR_GOTO (this->name, fd, unwind); + + ret = this_fd_get (fd, this, &remote_fd); + if (ret == -1) { + op_errno = EBADFD; + gf_log (this->name, GF_LOG_DEBUG, + "(%"PRId64"): failed to get remote fd. 
returning EBADFD", + fd->inode->ino); + goto unwind; + } + + hdrlen = gf_hdr_len (req, 0); + hdr = gf_hdr_new (req, 0); + GF_VALIDATE_OR_GOTO (this->name, hdr, unwind); + + req = gf_param (hdr); + + req->fd = hton64 (remote_fd); + req->mode = hton32 (mode); + + ret = protocol_client_xfer (frame, this, + CLIENT_CHANNEL (this, CHANNEL_LOWLAT), + GF_OP_TYPE_FOP_REQUEST, GF_FOP_FCHMOD, + hdr, hdrlen, NULL, 0, NULL); + + return 0; + +unwind: + STACK_UNWIND (frame, op_ret, op_errno, NULL); + return 0; +} + + +/* + * client_fchown - + * + * @frame: + * @this: + * @fd: + * @uid: + * @gid: + * + */ +int32_t +client_fchown (call_frame_t *frame, + xlator_t *this, + fd_t *fd, + uid_t uid, + gid_t gid) +{ + gf_hdr_common_t *hdr = NULL; + gf_fop_fchown_req_t *req = NULL; + int64_t remote_fd = 0; + size_t hdrlen = -1; + int32_t op_ret = -1; + int32_t op_errno = EINVAL; + int32_t ret = -1; + client_conf_t *conf = this->private; + + if (conf->child) { + /* */ + STACK_WIND (frame, + default_fchown_cbk, + conf->child, + conf->child->fops->fchown, + fd, + uid, + gid); + + return 0; + } + + GF_VALIDATE_OR_GOTO (this->name, fd, unwind); + + ret = this_fd_get (fd, this, &remote_fd); + if (ret == -1) { + op_errno = EBADFD; + gf_log (this->name, GF_LOG_DEBUG, + "(%"PRId64"): failed to get remote fd. 
returning EBADFD", + fd->inode->ino); + goto unwind; + } + + hdrlen = gf_hdr_len (req, 0); + hdr = gf_hdr_new (req, 0); + GF_VALIDATE_OR_GOTO (this->name, hdr, unwind); + + req = gf_param (hdr); + + req->fd = hton64 (remote_fd); + req->uid = hton32 (uid); + req->gid = hton32 (gid); + + ret = protocol_client_xfer (frame, this, + CLIENT_CHANNEL (this, CHANNEL_LOWLAT), + GF_OP_TYPE_FOP_REQUEST, GF_FOP_FCHOWN, + hdr, hdrlen, NULL, 0, NULL); + + return 0; + +unwind: + STACK_UNWIND (frame, op_ret, op_errno, NULL); + return 0; + +} + +/** + * client_setdents - + */ +int32_t +client_setdents (call_frame_t *frame, + xlator_t *this, + fd_t *fd, + int32_t flags, + dir_entry_t *entries, + int32_t count) +{ + gf_hdr_common_t *hdr = NULL; + gf_fop_setdents_req_t *req = NULL; + int64_t remote_fd = 0; + char *buffer = NULL; + char *ptr = NULL; + data_t *buf_data = NULL; + dict_t *reply_dict = NULL; + dir_entry_t *trav = NULL; + uint32_t len = 0; + int32_t buf_len = 0; + int32_t ret = -1; + int32_t op_ret = -1; + int32_t op_errno = EINVAL; + int32_t vec_count = 0; + size_t hdrlen = -1; + struct iovec vector[1]; + client_conf_t *conf = this->private; + + if (conf->child) { + /* */ + STACK_WIND (frame, + default_setdents_cbk, + conf->child, + conf->child->fops->setdents, + fd, + flags, + entries, + count); + + return 0; + } + + GF_VALIDATE_OR_GOTO (this->name, fd, unwind); + + ret = this_fd_get (fd, this, &remote_fd); + if (ret == -1) { + gf_log (this->name, GF_LOG_DEBUG, + "(%"PRId64"): failed to get remote fd. 
returning EBADFD", + fd->inode->ino); + op_errno = EBADFD; + goto unwind; + } + + GF_VALIDATE_OR_GOTO (this->name, entries, unwind); + GF_VALIDATE_OR_GOTO (this->name, count, unwind); + + trav = entries->next; + while (trav) { + len += strlen (trav->name); + len += 1; + len += strlen (trav->link); + len += 1; + len += 256; // max possible for statbuf; + trav = trav->next; + } + buffer = CALLOC (1, len); + GF_VALIDATE_OR_GOTO (this->name, buffer, unwind); + + ptr = buffer; + + trav = entries->next; + while (trav) { + int32_t this_len = 0; + char *tmp_buf = NULL; + struct stat *stbuf = &trav->buf; + { + /* Convert the stat buf to string */ + uint64_t dev = stbuf->st_dev; + uint64_t ino = stbuf->st_ino; + uint32_t mode = stbuf->st_mode; + uint32_t nlink = stbuf->st_nlink; + uint32_t uid = stbuf->st_uid; + uint32_t gid = stbuf->st_gid; + uint64_t rdev = stbuf->st_rdev; + uint64_t size = stbuf->st_size; + uint32_t blksize = stbuf->st_blksize; + uint64_t blocks = stbuf->st_blocks; + + uint32_t atime = stbuf->st_atime; + uint32_t mtime = stbuf->st_mtime; + uint32_t ctime = stbuf->st_ctime; + + uint32_t atime_nsec = ST_ATIM_NSEC(stbuf); + uint32_t mtime_nsec = ST_MTIM_NSEC(stbuf); + uint32_t ctime_nsec = ST_CTIM_NSEC(stbuf); + + asprintf (&tmp_buf, + GF_STAT_PRINT_FMT_STR, + dev, + ino, + mode, + nlink, + uid, + gid, + rdev, + size, + blksize, + blocks, + atime, + atime_nsec, + mtime, + mtime_nsec, + ctime, + ctime_nsec); + } + this_len = sprintf (ptr, "%s/%s%s\n", + trav->name, + tmp_buf, + trav->link); + + FREE (tmp_buf); + trav = trav->next; + ptr += this_len; + } + buf_len = strlen (buffer); + + hdrlen = gf_hdr_len (req, 0); + hdr = gf_hdr_new (req, 0); + GF_VALIDATE_OR_GOTO (this->name, hdr, unwind); + + req = gf_param (hdr); + + req->fd = hton64 (remote_fd); + req->flags = hton32 (flags); + req->count = hton32 (count); + + { + buf_data = get_new_data (); + GF_VALIDATE_OR_GOTO (this->name, buf_data, unwind); + reply_dict = get_new_dict(); + GF_VALIDATE_OR_GOTO 
(this->name, reply_dict, unwind); + + buf_data->data = buffer; + buf_data->len = buf_len; + dict_set (reply_dict, NULL, buf_data); + frame->root->rsp_refs = dict_ref (reply_dict); + vector[0].iov_base = buffer; + vector[0].iov_len = buf_len; + vec_count = 1; + } + + ret = protocol_client_xfer (frame, this, + CLIENT_CHANNEL (this, CHANNEL_BULK), + GF_OP_TYPE_FOP_REQUEST, GF_FOP_SETDENTS, + hdr, hdrlen, vector, vec_count, + frame->root->rsp_refs); + + return ret; +unwind: + STACK_UNWIND (frame, op_ret, op_errno); + return 0; +} + + +/* + * CBKs + */ +/* + * client_forget - forget function for client protocol + * @this: + * @inode: + * + * not for external reference + */ +int32_t +client_forget (xlator_t *this, + inode_t *inode) +{ + ino_t ino = 0; + client_conf_t *conf = NULL; + client_forget_t forget = {0,}; + uint8_t send_forget = 0; + int32_t ret = -1; + + GF_VALIDATE_OR_GOTO ("client", this, out); + conf = this->private; + + if (conf->child) { + /* */ + /* Yenu beda */ + return 0; + } + + GF_VALIDATE_OR_GOTO (this->name, inode, out); + ino = this_ino_get_from_inode (inode, this); + + LOCK (&conf->forget.lock); + { + conf->forget.ino_array[conf->forget.count++] = ino; + + if ((!conf->forget.frames_in_transit) || + (conf->forget.count >= CLIENT_PROTO_FORGET_LIMIT)) { + ret = client_get_forgets (this, &forget); + if (ret <= 0) + send_forget = 0; + else + send_forget = 1; + } + } + UNLOCK (&conf->forget.lock); + + if (send_forget) { + ret = protocol_client_xfer (forget.frame, this, + CLIENT_CHANNEL (this,CHANNEL_BULK), + GF_OP_TYPE_CBK_REQUEST, + GF_CBK_FORGET, + forget.hdr, forget.hdrlen, + NULL, 0, NULL); + } +out: + return 0; +} + +/** + * client_releasedir - releasedir function for client protocol + * @this: this translator structure + * @fd: file descriptor structure + * + * external reference through client_protocol_xlator->cbks->releasedir + */ + +int32_t +client_releasedir (xlator_t *this, fd_t *fd) +{ + call_frame_t *fr = NULL; + int32_t ret = -1; + int64_t 
remote_fd = 0; + char key[32] = {0,}; + gf_hdr_common_t *hdr = NULL; + size_t hdrlen = 0; + gf_cbk_releasedir_req_t *req = NULL; + client_conf_t *conf = NULL; + + + GF_VALIDATE_OR_GOTO ("client", this, out); + GF_VALIDATE_OR_GOTO (this->name, fd, out); + + conf = this->private; + if (conf->child) { + return 0; + } + + ret = this_fd_get (fd, this, &remote_fd); + if (ret == -1){ + gf_log (this->name, GF_LOG_DEBUG, + "(%"PRId64"): failed to get remote fd.", + fd->inode->ino); + goto out; + } + + hdrlen = gf_hdr_len (req, 0); + hdr = gf_hdr_new (req, 0); + GF_VALIDATE_OR_GOTO (this->name, hdr, out); + + req = gf_param (hdr); + + req->fd = hton64 (remote_fd); + + { + sprintf (key, "%p", fd); + + pthread_mutex_lock (&conf->mutex); + { + dict_del (conf->saved_fds, key); + } + pthread_mutex_unlock (&conf->mutex); + } + + fr = create_frame (this, this->ctx->pool); + GF_VALIDATE_OR_GOTO (this->name, fr, out); + + ret = protocol_client_xfer (fr, this, + CLIENT_CHANNEL (this, CHANNEL_BULK), + GF_OP_TYPE_CBK_REQUEST, GF_CBK_RELEASEDIR, + hdr, hdrlen, NULL, 0, NULL); +out: + return ret; +} + + +/** + * client_release - release function for client protocol + * @this: this translator structure + * @fd: file descriptor structure + * + * external reference through client_protocol_xlator->cbks->release + * + */ +int +client_release (xlator_t *this, fd_t *fd) +{ + call_frame_t *fr = NULL; + int32_t ret = -1; + int64_t remote_fd = 0; + char key[32] = {0,}; + gf_hdr_common_t *hdr = NULL; + size_t hdrlen = 0; + gf_cbk_release_req_t *req = NULL; + client_conf_t *conf = NULL; + + GF_VALIDATE_OR_GOTO ("client", this, out); + GF_VALIDATE_OR_GOTO (this->name, fd, out); + + conf = this->private; + + if (conf->child) { + return 0; + } + + ret = this_fd_get (fd, this, &remote_fd); + if (ret == -1) { + gf_log (this->name, GF_LOG_DEBUG, + "(%"PRId64"): failed to get remote fd.", + fd->inode->ino); + goto out; + } + + hdrlen = gf_hdr_len (req, 0); + hdr = gf_hdr_new (req, 0); + GF_VALIDATE_OR_GOTO 
(this->name, hdr, out); + req = gf_param (hdr); + + req->fd = hton64 (remote_fd); + + { + sprintf (key, "%p", fd); + + pthread_mutex_lock (&conf->mutex); + { + dict_del (conf->saved_fds, key); + } + pthread_mutex_unlock (&conf->mutex); + } + + fr = create_frame (this, this->ctx->pool); + GF_VALIDATE_OR_GOTO (this->name, fr, out); + + ret = protocol_client_xfer (fr, this, + CLIENT_CHANNEL (this, CHANNEL_BULK), + GF_OP_TYPE_CBK_REQUEST, GF_CBK_RELEASE, + hdr, hdrlen, NULL, 0, NULL); +out: + return ret; +} + +/* + * MGMT_OPS + */ + +/** + * client_stats - stats function for client protocol + * @frame: call frame + * @this: this translator structure + * @flags: + * + * external reference through client_protocol_xlator->mops->stats + */ + +int32_t +client_stats (call_frame_t *frame, + xlator_t *this, + int32_t flags) +{ + gf_hdr_common_t *hdr = NULL; + gf_mop_stats_req_t *req = NULL; + size_t hdrlen = -1; + int ret = -1; + client_conf_t *conf = NULL; + + GF_VALIDATE_OR_GOTO ("client", this, unwind); + + conf = this->private; + if (conf->child) { + /* */ + STACK_WIND (frame, + default_stats_cbk, + conf->child, + conf->child->mops->stats, + flags); + + return 0; + } + + + hdrlen = gf_hdr_len (req, 0); + hdr = gf_hdr_new (req, 0); + GF_VALIDATE_OR_GOTO (this->name, hdr, unwind); + + req = gf_param (hdr); + + req->flags = hton32 (flags); + + ret = protocol_client_xfer (frame, this, + CLIENT_CHANNEL (this, CHANNEL_BULK), + GF_OP_TYPE_MOP_REQUEST, GF_MOP_STATS, + hdr, hdrlen, NULL, 0, NULL); + + return ret; +unwind: + STACK_UNWIND (frame, -1, EINVAL, NULL); + return 0; +} + + +/* Callbacks */ + +int32_t +client_fxattrop_cbk (call_frame_t *frame, + gf_hdr_common_t *hdr, size_t hdrlen, + char *buf, size_t buflen) +{ + gf_fop_xattrop_rsp_t *rsp = NULL; + int32_t op_ret = 0; + int32_t gf_errno = 0; + int32_t op_errno = 0; + int32_t dict_len = 0; + dict_t *dict = NULL; + int32_t ret = -1; + char *dictbuf = NULL; + + rsp = gf_param (hdr); + GF_VALIDATE_OR_GOTO(frame->this->name, 
rsp, fail); + + op_ret = ntoh32 (hdr->rsp.op_ret); + + if (op_ret >= 0) { + op_ret = -1; + dict_len = ntoh32 (rsp->dict_len); + + if (dict_len > 0) { + dictbuf = memdup (rsp->dict, dict_len); + GF_VALIDATE_OR_GOTO(frame->this->name, dictbuf, fail); + + dict = dict_new(); + GF_VALIDATE_OR_GOTO(frame->this->name, dict, fail); + + ret = dict_unserialize (dictbuf, dict_len, &dict); + if (ret < 0) { + gf_log (frame->this->name, GF_LOG_ERROR, + "failed to serialize dictionary(%p)", + dict); + op_errno = -ret; + goto fail; + } else { + dict->extra_free = dictbuf; + dictbuf = NULL; + } + } + op_ret = 0; + } + gf_errno = ntoh32 (hdr->rsp.op_errno); + op_errno = gf_error_to_errno (gf_errno); + +fail: + STACK_UNWIND (frame, op_ret, op_errno, dict); + + if (dictbuf) + free (dictbuf); + + if (dict) + dict_unref (dict); + + return 0; +} + +int32_t +client_xattrop_cbk (call_frame_t *frame, + gf_hdr_common_t *hdr, size_t hdrlen, + char *buf, size_t buflen) +{ + gf_fop_xattrop_rsp_t *rsp = NULL; + int32_t op_ret = -1; + int32_t gf_errno = EINVAL; + int32_t op_errno = 0; + int32_t dict_len = 0; + dict_t *dict = NULL; + int32_t ret = -1; + char *dictbuf = NULL; + + rsp = gf_param (hdr); + GF_VALIDATE_OR_GOTO(frame->this->name, rsp, fail); + + op_ret = ntoh32 (hdr->rsp.op_ret); + if (op_ret >= 0) { + op_ret = -1; + dict_len = ntoh32 (rsp->dict_len); + + if (dict_len > 0) { + dictbuf = memdup (rsp->dict, dict_len); + GF_VALIDATE_OR_GOTO(frame->this->name, dictbuf, fail); + + dict = get_new_dict(); + GF_VALIDATE_OR_GOTO(frame->this->name, dict, fail); + dict_ref (dict); + + ret = dict_unserialize (dictbuf, dict_len, &dict); + if (ret < 0) { + gf_log (frame->this->name, GF_LOG_ERROR, + "failed to serialize dictionary(%p)", + dict); + goto fail; + } else { + dict->extra_free = dictbuf; + dictbuf = NULL; + } + } + op_ret = 0; + } + gf_errno = ntoh32 (hdr->rsp.op_errno); + op_errno = gf_error_to_errno (gf_errno); + + +fail: + STACK_UNWIND (frame, op_ret, op_errno, dict); + + if (dictbuf) + 
free (dictbuf); + if (dict) + dict_unref (dict); + + return 0; +} + +/* + * client_chown_cbk - + * + * @frame: + * @args: + * + * not for external reference + */ +int32_t +client_fchown_cbk (call_frame_t *frame, + gf_hdr_common_t *hdr, size_t hdrlen, + char *buf, size_t buflen) +{ + struct stat stbuf = {0, }; + gf_fop_fchown_rsp_t *rsp = NULL; + int32_t op_ret = 0; + int32_t op_errno = 0; + + rsp = gf_param (hdr); + + op_ret = ntoh32 (hdr->rsp.op_ret); + op_errno = gf_error_to_errno (ntoh32 (hdr->rsp.op_errno)); + + if (op_ret == 0) { + gf_stat_to_stat (&rsp->stat, &stbuf); + } + + STACK_UNWIND (frame, op_ret, op_errno, &stbuf); + + return 0; +} + + +/* + * client_fchmod_cbk + * + * @frame: + * @args: + * + * not for external reference + */ +int32_t +client_fchmod_cbk (call_frame_t *frame, + gf_hdr_common_t *hdr, size_t hdrlen, + char *buf, size_t buflen) +{ + struct stat stbuf = {0, }; + gf_fop_fchmod_rsp_t *rsp = NULL; + int32_t op_ret = 0; + int32_t op_errno = 0; + + rsp = gf_param (hdr); + + op_ret = ntoh32 (hdr->rsp.op_ret); + op_errno = gf_error_to_errno (ntoh32 (hdr->rsp.op_errno)); + + if (op_ret == 0) { + gf_stat_to_stat (&rsp->stat, &stbuf); + } + + STACK_UNWIND (frame, op_ret, op_errno, &stbuf); + + return 0; +} + + +/* + * client_create_cbk - create callback function for client protocol + * @frame: call frame + * @args: arguments in dictionary + * + * not for external reference + */ + +int +client_create_cbk (call_frame_t *frame, + gf_hdr_common_t *hdr, size_t hdrlen, + char *buf, size_t buflen) +{ + gf_fop_create_rsp_t *rsp = NULL; + int32_t op_ret = 0; + int32_t op_errno = 0; + fd_t *fd = NULL; + inode_t *inode = NULL; + struct stat stbuf = {0, }; + int64_t remote_fd = 0; + char key[32] = {0, }; + int32_t ret = -1; + client_local_t *local = NULL; + client_conf_t *conf = NULL; + + + local = frame->local; frame->local = NULL; + conf = frame->this->private; + fd = local->fd; + inode = local->loc.inode; + + rsp = gf_param (hdr); + + op_ret = ntoh32 
(hdr->rsp.op_ret); + op_errno = ntoh32 (hdr->rsp.op_errno); + + if (op_ret >= 0) { + remote_fd = ntoh64 (rsp->fd); + gf_stat_to_stat (&rsp->stat, &stbuf); + } + + if (op_ret >= 0) { + this_ino_set (&local->loc, frame->this, stbuf.st_ino); + this_fd_set (fd, frame->this, &local->loc, remote_fd); + + sprintf (key, "%p", fd); + + pthread_mutex_lock (&conf->mutex); + { + ret = dict_set_str (conf->saved_fds, key, ""); + } + pthread_mutex_unlock (&conf->mutex); + + if (ret < 0) { + free (key); + gf_log (frame->this->name, GF_LOG_ERROR, + "%s (%"PRId64"): failed to save remote fd", + local->loc.path, stbuf.st_ino); + } + } + + STACK_UNWIND (frame, op_ret, op_errno, fd, inode, &stbuf); + + client_local_wipe (local); + + return 0; +} + + +/* + * client_open_cbk - open callback for client protocol + * @frame: call frame + * @args: argument dictionary + * + * not for external reference + */ +int32_t +client_open_cbk (call_frame_t *frame, + gf_hdr_common_t *hdr, size_t hdrlen, + char *buf, size_t buflen) +{ + int32_t op_ret = -1; + int32_t op_errno = ENOTCONN; + fd_t *fd = NULL; + int64_t remote_fd = 0; + gf_fop_open_rsp_t *rsp = NULL; + char key[32] = {0,}; + int32_t ret = -1; + client_local_t *local = NULL; + client_conf_t *conf = NULL; + + + local = frame->local; frame->local = NULL; + conf = frame->this->private; + fd = local->fd; + + rsp = gf_param (hdr); + + op_ret = ntoh32 (hdr->rsp.op_ret); + op_errno = ntoh32 (hdr->rsp.op_errno); + + if (op_ret >= 0) { + remote_fd = ntoh64 (rsp->fd); + } + + if (op_ret >= 0) { + this_fd_set (fd, frame->this, &local->loc, remote_fd); + + sprintf (key, "%p", fd); + + pthread_mutex_lock (&conf->mutex); + { + ret = dict_set_str (conf->saved_fds, key, ""); + } + pthread_mutex_unlock (&conf->mutex); + + if (ret < 0) { + gf_log (frame->this->name, GF_LOG_ERROR, + "%s (%"PRId64"): failed to save remote fd", + local->loc.path, local->loc.inode->ino); + free (key); + } + + } + + STACK_UNWIND (frame, op_ret, op_errno, fd); + + client_local_wipe 
(local); + + return 0; +} + +/* + * client_stat_cbk - stat callback for client protocol + * @frame: call frame + * @args: arguments dictionary + * + * not for external reference + */ +int +client_stat_cbk (call_frame_t *frame, + gf_hdr_common_t *hdr, size_t hdrlen, + char *buf, size_t buflen) +{ + struct stat stbuf = {0, }; + gf_fop_stat_rsp_t *rsp = NULL; + int32_t op_ret = 0; + int32_t op_errno = 0; + + rsp = gf_param (hdr); + + op_ret = ntoh32 (hdr->rsp.op_ret); + op_errno = gf_error_to_errno (ntoh32 (hdr->rsp.op_errno)); + + if (op_ret == 0) { + gf_stat_to_stat (&rsp->stat, &stbuf); + } + + STACK_UNWIND (frame, op_ret, op_errno, &stbuf); + + return 0; +} + +/* + * client_utimens_cbk - utimens callback for client protocol + * @frame: call frame + * @args: argument dictionary + * + * not for external reference + */ + +int32_t +client_utimens_cbk (call_frame_t *frame, + gf_hdr_common_t *hdr, size_t hdrlen, + char *buf, size_t buflen) +{ + struct stat stbuf = {0, }; + gf_fop_utimens_rsp_t *rsp = NULL; + int32_t op_ret = 0; + int32_t op_errno = 0; + + rsp = gf_param (hdr); + + op_ret = ntoh32 (hdr->rsp.op_ret); + op_errno = gf_error_to_errno (ntoh32 (hdr->rsp.op_errno)); + + if (op_ret == 0) { + gf_stat_to_stat (&rsp->stat, &stbuf); + } + + STACK_UNWIND (frame, op_ret, op_errno, &stbuf); + + return 0; +} + +/* + * client_chmod_cbk - chmod for client protocol + * @frame: call frame + * @args: argument dictionary + * + * not for external reference + */ +int32_t +client_chmod_cbk (call_frame_t *frame, + gf_hdr_common_t *hdr, size_t hdrlen, + char *buf, size_t buflen) +{ + struct stat stbuf = {0, }; + gf_fop_chmod_rsp_t *rsp = NULL; + int32_t op_ret = 0; + int32_t op_errno = 0; + + rsp = gf_param (hdr); + + op_ret = ntoh32 (hdr->rsp.op_ret); + op_errno = gf_error_to_errno (ntoh32 (hdr->rsp.op_errno)); + + if (op_ret == 0) { + gf_stat_to_stat (&rsp->stat, &stbuf); + } + + STACK_UNWIND (frame, op_ret, op_errno, &stbuf); + + return 0; +} + +/* + * client_chown_cbk - chown 
for client protocol + * @frame: call frame + * @args: argument dictionary + * + * not for external reference + */ +int32_t +client_chown_cbk (call_frame_t *frame, + gf_hdr_common_t *hdr, size_t hdrlen, + char *buf, size_t buflen) +{ + struct stat stbuf = {0, }; + gf_fop_chown_rsp_t *rsp = NULL; + int32_t op_ret = 0; + int32_t op_errno = 0; + + rsp = gf_param (hdr); + + op_ret = ntoh32 (hdr->rsp.op_ret); + op_errno = gf_error_to_errno (ntoh32 (hdr->rsp.op_errno)); + + if (op_ret == 0) { + gf_stat_to_stat (&rsp->stat, &stbuf); + } + + STACK_UNWIND (frame, op_ret, op_errno, &stbuf); + + return 0; +} + +/* + * client_mknod_cbk - mknod callback for client protocol + * @frame: call frame + * @args: argument dictionary + * + * not for external reference + */ +int32_t +client_mknod_cbk (call_frame_t *frame, + gf_hdr_common_t *hdr, size_t hdrlen, + char *buf, size_t buflen) +{ + gf_fop_mknod_rsp_t *rsp = NULL; + int32_t op_ret = 0; + int32_t op_errno = 0; + struct stat stbuf = {0, }; + inode_t *inode = NULL; + client_local_t *local = NULL; + + local = frame->local; + frame->local = NULL; + inode = local->loc.inode; + + rsp = gf_param (hdr); + + op_ret = ntoh32 (hdr->rsp.op_ret); + op_errno = gf_error_to_errno (ntoh32 (hdr->rsp.op_errno)); + + if (op_ret >= 0) { + gf_stat_to_stat (&rsp->stat, &stbuf); + this_ino_set (&local->loc, frame->this, stbuf.st_ino); + } + + STACK_UNWIND (frame, op_ret, op_errno, inode, &stbuf); + + client_local_wipe (local); + + return 0; +} + +/* + * client_symlink_cbk - symlink callback for client protocol + * @frame: call frame + * @args: argument dictionary + * + * not for external reference + */ +int32_t +client_symlink_cbk (call_frame_t *frame, + gf_hdr_common_t *hdr, size_t hdrlen, + char *buf, size_t buflen) +{ + gf_fop_symlink_rsp_t *rsp = NULL; + int32_t op_ret = 0; + int32_t op_errno = 0; + struct stat stbuf = {0, }; + inode_t *inode = NULL; + client_local_t *local = NULL; + + local = frame->local; + frame->local = NULL; + inode = 
local->loc.inode; + + rsp = gf_param (hdr); + + op_ret = ntoh32 (hdr->rsp.op_ret); + op_errno = gf_error_to_errno (ntoh32 (hdr->rsp.op_errno)); + + if (op_ret >= 0) { + gf_stat_to_stat (&rsp->stat, &stbuf); + this_ino_set (&local->loc, frame->this, stbuf.st_ino); + } + + STACK_UNWIND (frame, op_ret, op_errno, inode, &stbuf); + + client_local_wipe (local); + + return 0; +} + +/* + * client_link_cbk - link callback for client protocol + * @frame: call frame + * @args: argument dictionary + * + * not for external reference + */ +int32_t +client_link_cbk (call_frame_t *frame, + gf_hdr_common_t *hdr, size_t hdrlen, + char *buf, size_t buflen) +{ + gf_fop_link_rsp_t *rsp = NULL; + int32_t op_ret = 0; + int32_t op_errno = 0; + struct stat stbuf = {0, }; + inode_t *inode = NULL; + client_local_t *local = NULL; + + local = frame->local; + frame->local = NULL; + inode = local->loc.inode; + + rsp = gf_param (hdr); + + op_ret = ntoh32 (hdr->rsp.op_ret); + op_errno = gf_error_to_errno (ntoh32 (hdr->rsp.op_errno)); + + if (op_ret >= 0) { + gf_stat_to_stat (&rsp->stat, &stbuf); + } + + STACK_UNWIND (frame, op_ret, op_errno, inode, &stbuf); + + client_local_wipe (local); + + return 0; +} + +/* + * client_truncate_cbk - truncate callback for client protocol + * @frame: call frame + * @args: argument dictionary + * + * not for external reference + */ + +int32_t +client_truncate_cbk (call_frame_t *frame, + gf_hdr_common_t *hdr, size_t hdrlen, + char *buf, size_t buflen) +{ + struct stat stbuf = {0, }; + gf_fop_truncate_rsp_t *rsp = NULL; + int32_t op_ret = 0; + int32_t op_errno = 0; + + rsp = gf_param (hdr); + + op_ret = ntoh32 (hdr->rsp.op_ret); + op_errno = gf_error_to_errno (ntoh32 (hdr->rsp.op_errno)); + + if (op_ret == 0) { + gf_stat_to_stat (&rsp->stat, &stbuf); + } + + STACK_UNWIND (frame, op_ret, op_errno, &stbuf); + + return 0; +} + +/* client_fstat_cbk - fstat callback for client protocol + * @frame: call frame + * @args: argument dictionary + * + * not for external 
reference + */ + +int32_t +client_fstat_cbk (call_frame_t *frame, + gf_hdr_common_t *hdr, size_t hdrlen, + char *buf, size_t buflen) +{ + struct stat stbuf = {0, }; + gf_fop_fstat_rsp_t *rsp = NULL; + int32_t op_ret = 0; + int32_t op_errno = 0; + + rsp = gf_param (hdr); + + op_ret = ntoh32 (hdr->rsp.op_ret); + op_errno = gf_error_to_errno (ntoh32 (hdr->rsp.op_errno)); + + if (op_ret == 0) { + gf_stat_to_stat (&rsp->stat, &stbuf); + } + + STACK_UNWIND (frame, op_ret, op_errno, &stbuf); + + return 0; +} + +/* + * client_ftruncate_cbk - ftruncate callback for client protocol + * @frame: call frame + * @args: argument dictionary + * + * not for external reference + */ +int32_t +client_ftruncate_cbk (call_frame_t *frame, + gf_hdr_common_t *hdr, size_t hdrlen, + char *buf, size_t buflen) +{ + struct stat stbuf = {0, }; + gf_fop_ftruncate_rsp_t *rsp = NULL; + int32_t op_ret = 0; + int32_t op_errno = 0; + + rsp = gf_param (hdr); + + op_ret = ntoh32 (hdr->rsp.op_ret); + op_errno = gf_error_to_errno (ntoh32 (hdr->rsp.op_errno)); + + if (op_ret == 0) { + gf_stat_to_stat (&rsp->stat, &stbuf); + } + + STACK_UNWIND (frame, op_ret, op_errno, &stbuf); + + return 0; +} + +/* client_readv_cbk - readv callback for client protocol + * @frame: call frame + * @args: argument dictionary + * + * not for external referece + */ + +int32_t +client_readv_cbk (call_frame_t *frame, + gf_hdr_common_t *hdr, size_t hdrlen, + char *buf, size_t buflen) +{ + gf_fop_read_rsp_t *rsp = NULL; + int32_t op_ret = 0; + int32_t op_errno = 0; + struct iovec vector = {0, }; + struct stat stbuf = {0, }; + dict_t *refs = NULL; + + rsp = gf_param (hdr); + + op_ret = ntoh32 (hdr->rsp.op_ret); + op_errno = gf_error_to_errno (ntoh32 (hdr->rsp.op_errno)); + + if (op_ret != -1) { + gf_stat_to_stat (&rsp->stat, &stbuf); + vector.iov_base = buf; + vector.iov_len = buflen; + + refs = get_new_dict (); + dict_set (refs, NULL, data_from_dynptr (buf, 0)); + frame->root->rsp_refs = dict_ref (refs); + } + + STACK_UNWIND 
(frame, op_ret, op_errno, &vector, 1, &stbuf); + + if (refs) + dict_unref (refs); + + return 0; +} + +/* + * client_write_cbk - write callback for client protocol + * @frame: cal frame + * @args: argument dictionary + * + * not for external reference + */ + +int32_t +client_write_cbk (call_frame_t *frame, + gf_hdr_common_t *hdr, size_t hdrlen, + char *buf, size_t buflen) +{ + gf_fop_write_rsp_t *rsp = NULL; + int32_t op_ret = 0; + int32_t op_errno = 0; + struct stat stbuf = {0, }; + + rsp = gf_param (hdr); + + op_ret = ntoh32 (hdr->rsp.op_ret); + op_errno = gf_error_to_errno (ntoh32 (hdr->rsp.op_errno)); + + if (op_ret >= 0) + gf_stat_to_stat (&rsp->stat, &stbuf); + + STACK_UNWIND (frame, op_ret, op_errno, &stbuf); + + return 0; +} + + +int32_t +client_readdir_cbk (call_frame_t *frame, + gf_hdr_common_t *hdr, size_t hdrlen, + char *buf, size_t buflen) +{ + gf_fop_readdir_rsp_t *rsp = NULL; + int32_t op_ret = 0; + int32_t op_errno = 0; + uint32_t buf_size = 0; + gf_dirent_t entries; + + rsp = gf_param (hdr); + + op_ret = ntoh32 (hdr->rsp.op_ret); + op_errno = ntoh32 (hdr->rsp.op_errno); + + INIT_LIST_HEAD (&entries.list); + if (op_ret > 0) { + buf_size = ntoh32 (rsp->size); + gf_dirent_unserialize (&entries, rsp->buf, buf_size); + } + + STACK_UNWIND (frame, op_ret, op_errno, &entries); + + gf_dirent_free (&entries); + + return 0; +} + +/* + * client_fsync_cbk - fsync callback for client protocol + * + * @frame: call frame + * @args: argument dictionary + * + * not for external reference + */ +int32_t +client_fsync_cbk (call_frame_t *frame, + gf_hdr_common_t *hdr, size_t hdrlen, + char *buf, size_t buflen) +{ + struct stat stbuf = {0, }; + gf_fop_fsync_rsp_t *rsp = NULL; + int32_t op_ret = 0; + int32_t op_errno = 0; + + rsp = gf_param (hdr); + + op_ret = ntoh32 (hdr->rsp.op_ret); + op_errno = gf_error_to_errno (ntoh32 (hdr->rsp.op_errno)); + + STACK_UNWIND (frame, op_ret, op_errno, &stbuf); + + return 0; +} + + +/* + * client_unlink_cbk - unlink callback for client 
protocol + * @frame: call frame + * @args: argument dictionary + * + * not for external reference + */ +int32_t +client_unlink_cbk (call_frame_t *frame, + gf_hdr_common_t *hdr, size_t hdrlen, + char *buf, size_t buflen) +{ + gf_fop_unlink_rsp_t *rsp = NULL; + int32_t op_ret = 0; + int32_t op_errno = 0; + + rsp = gf_param (hdr); + + op_ret = ntoh32 (hdr->rsp.op_ret); + op_errno = gf_error_to_errno (ntoh32 (hdr->rsp.op_errno)); + + STACK_UNWIND (frame, op_ret, op_errno); + + return 0; +} + +/* + * client_rename_cbk - rename callback for client protocol + * @frame: call frame + * @args: argument dictionary + * + * not for external reference + */ +int32_t +client_rename_cbk (call_frame_t *frame, + gf_hdr_common_t *hdr, size_t hdrlen, + char *buf, size_t buflen) +{ + struct stat stbuf = {0, }; + gf_fop_rename_rsp_t *rsp = NULL; + int32_t op_ret = 0; + int32_t op_errno = 0; + + rsp = gf_param (hdr); + + op_ret = ntoh32 (hdr->rsp.op_ret); + op_errno = gf_error_to_errno (ntoh32 (hdr->rsp.op_errno)); + + if (op_ret == 0) { + gf_stat_to_stat (&rsp->stat, &stbuf); + } + + STACK_UNWIND (frame, op_ret, op_errno, &stbuf); + + return 0; +} + + +/* + * client_readlink_cbk - readlink callback for client protocol + * + * @frame: call frame + * @args: argument dictionary + * + * not for external reference + */ +int32_t +client_readlink_cbk (call_frame_t *frame, + gf_hdr_common_t *hdr, size_t hdrlen, + char *buf, size_t buflen) +{ + gf_fop_readlink_rsp_t *rsp = NULL; + int32_t op_ret = 0; + int32_t op_errno = 0; + char *link = NULL; + + rsp = gf_param (hdr); + + op_ret = ntoh32 (hdr->rsp.op_ret); + op_errno = gf_error_to_errno (ntoh32 (hdr->rsp.op_errno)); + + if (op_ret > 0) { + link = rsp->path; + } + + STACK_UNWIND (frame, op_ret, op_errno, link); + return 0; +} + +/* + * client_mkdir_cbk - mkdir callback for client protocol + * @frame: call frame + * @args: argument dictionary + * + * not for external reference + */ +int32_t +client_mkdir_cbk (call_frame_t *frame, + 
gf_hdr_common_t *hdr, size_t hdrlen, + char *buf, size_t buflen) +{ + gf_fop_mkdir_rsp_t *rsp = NULL; + int32_t op_ret = 0; + int32_t op_errno = 0; + struct stat stbuf = {0, }; + inode_t *inode = NULL; + client_local_t *local = NULL; + + local = frame->local; + inode = local->loc.inode; + frame->local = NULL; + + rsp = gf_param (hdr); + + op_ret = ntoh32 (hdr->rsp.op_ret); + op_errno = gf_error_to_errno (ntoh32 (hdr->rsp.op_errno)); + + if (op_ret >= 0) { + gf_stat_to_stat (&rsp->stat, &stbuf); + this_ino_set (&local->loc, frame->this, stbuf.st_ino); + } + + STACK_UNWIND (frame, op_ret, op_errno, inode, &stbuf); + + client_local_wipe (local); + + return 0; +} + +/* + * client_flush_cbk - flush callback for client protocol + * + * @frame: call frame + * @args: argument dictionary + * + * not for external reference + */ + +int32_t +client_flush_cbk (call_frame_t *frame, + gf_hdr_common_t *hdr, size_t hdrlen, + char *buf, size_t buflen) +{ + int32_t op_ret = 0; + int32_t op_errno = 0; + + op_ret = ntoh32 (hdr->rsp.op_ret); + op_errno = gf_error_to_errno (ntoh32 (hdr->rsp.op_errno)); + + STACK_UNWIND (frame, op_ret, op_errno); + + return 0; +} + + +/* + * client_opendir_cbk - opendir callback for client protocol + * @frame: call frame + * @args: argument dictionary + * + * not for external reference + */ +int +client_opendir_cbk (call_frame_t *frame, + gf_hdr_common_t *hdr, size_t hdrlen, + char *buf, size_t buflen) +{ + int32_t op_ret = -1; + int32_t op_errno = ENOTCONN; + fd_t *fd = NULL; + int64_t remote_fd = 0; + gf_fop_opendir_rsp_t *rsp = NULL; + char key[32] = {0,}; + int32_t ret = -1; + client_local_t *local = NULL; + client_conf_t *conf = NULL; + + + local = frame->local; frame->local = NULL; + conf = frame->this->private; + fd = local->fd; + + rsp = gf_param (hdr); + + op_ret = ntoh32 (hdr->rsp.op_ret); + op_errno = ntoh32 (hdr->rsp.op_errno); + + if (op_ret >= 0) { + remote_fd = ntoh64 (rsp->fd); + } + + if (op_ret >= 0) { + this_fd_set (fd, frame->this, 
&local->loc, remote_fd); + + sprintf (key, "%p", fd); + + pthread_mutex_lock (&conf->mutex); + { + ret = dict_set_str (conf->saved_fds, key, ""); + } + pthread_mutex_unlock (&conf->mutex); + + if (ret < 0) { + free (key); + gf_log (frame->this->name, GF_LOG_ERROR, + "%s (%"PRId64"): failed to save remote fd", + local->loc.path, local->loc.inode->ino); + } + } + + STACK_UNWIND (frame, op_ret, op_errno, fd); + + client_local_wipe (local); + + return 0; +} + + +/* + * client_rmdir_cbk - rmdir callback for client protocol + * @frame: call frame + * @args: argument dictionary + * + * not for external reference + */ + +int +client_rmdir_cbk (call_frame_t *frame, + gf_hdr_common_t *hdr, size_t hdrlen, + char *buf, size_t buflen) +{ + gf_fop_rmdir_rsp_t *rsp = NULL; + int32_t op_ret = 0; + int32_t op_errno = 0; + + rsp = gf_param (hdr); + + op_ret = ntoh32 (hdr->rsp.op_ret); + op_errno = gf_error_to_errno (ntoh32 (hdr->rsp.op_errno)); + + STACK_UNWIND (frame, op_ret, op_errno); + + return 0; +} + +/* + * client_access_cbk - access callback for client protocol + * @frame: call frame + * @args: argument dictionary + * + * not for external reference + */ +int32_t +client_access_cbk (call_frame_t *frame, + gf_hdr_common_t *hdr, size_t hdrlen, + char *buf, size_t buflen) +{ + gf_fop_access_rsp_t *rsp = NULL; + int32_t op_ret = 0; + int32_t op_errno = 0; + + rsp = gf_param (hdr); + + op_ret = ntoh32 (hdr->rsp.op_ret); + op_errno = gf_error_to_errno (ntoh32 (hdr->rsp.op_errno)); + + STACK_UNWIND (frame, op_ret, op_errno); + + return 0; +} + + + +/* + * client_lookup_cbk - lookup callback for client protocol + * + * @frame: call frame + * @args: arguments dictionary + * + * not for external reference + */ +int32_t +client_lookup_cbk (call_frame_t *frame, + gf_hdr_common_t *hdr, size_t hdrlen, + char *buf, size_t buflen) +{ + struct stat stbuf = {0, }; + inode_t *inode = NULL; + dict_t *xattr = NULL; + gf_fop_lookup_rsp_t *rsp = NULL; + int32_t op_ret = 0; + int32_t op_errno = 0; + 
size_t dict_len = 0; + char *dictbuf = NULL; + int32_t ret = -1; + int32_t gf_errno = 0; + client_local_t *local = NULL; + + local = frame->local; + inode = local->loc.inode; + frame->local = NULL; + + rsp = gf_param (hdr); + + op_ret = ntoh32 (hdr->rsp.op_ret); + + if (op_ret == 0) { + op_ret = -1; + gf_stat_to_stat (&rsp->stat, &stbuf); + this_ino_set (&local->loc, frame->this, stbuf.st_ino); + + dict_len = ntoh32 (rsp->dict_len); + + if (dict_len > 0) { + dictbuf = memdup (rsp->dict, dict_len); + GF_VALIDATE_OR_GOTO(frame->this->name, dictbuf, fail); + + xattr = dict_new(); + GF_VALIDATE_OR_GOTO(frame->this->name, xattr, fail); + + ret = dict_unserialize (dictbuf, dict_len, &xattr); + if (ret < 0) { + gf_log (frame->this->name, GF_LOG_ERROR, + "%s (%"PRId64"): failed to unserialize dictionary", + local->loc.path, inode->ino); + goto fail; + } else { + xattr->extra_free = dictbuf; + dictbuf = NULL; + } + } + op_ret = 0; + } + gf_errno = ntoh32 (hdr->rsp.op_errno); + op_errno = gf_error_to_errno (gf_errno); + +fail: + STACK_UNWIND (frame, op_ret, op_errno, inode, &stbuf, xattr); + + client_local_wipe (local); + + if (dictbuf) + free (dictbuf); + + if (xattr) + dict_unref (xattr); + + return 0; +} + +static dir_entry_t * +gf_bin_to_direntry (char *buf, size_t count) +{ + int32_t idx = 0, bread = 0; + size_t rcount = 0; + char *ender = NULL, *buffer = NULL; + char tmp_buf[512] = {0,}; + dir_entry_t *trav = NULL, *prev = NULL; + dir_entry_t *thead = NULL, *head = NULL; + + thead = CALLOC (1, sizeof (dir_entry_t)); + GF_VALIDATE_OR_GOTO("client-protocol", thead, fail); + + buffer = buf; + prev = thead; + + for (idx = 0; idx < count ; idx++) { + bread = 0; + trav = CALLOC (1, sizeof (dir_entry_t)); + GF_VALIDATE_OR_GOTO("client-protocol", trav, fail); + + ender = strchr (buffer, '/'); + if (!ender) + break; + rcount = ender - buffer; + trav->name = CALLOC (1, rcount + 2); + GF_VALIDATE_OR_GOTO("client-protocol", trav->name, fail); + + strncpy (trav->name, buffer, 
rcount); + bread = rcount + 1; + buffer += bread; + + ender = strchr (buffer, '\n'); + if (!ender) + break; + rcount = ender - buffer; + strncpy (tmp_buf, buffer, rcount); + bread = rcount + 1; + buffer += bread; + + gf_string_to_stat (tmp_buf, &trav->buf); + + ender = strchr (buffer, '\n'); + if (!ender) + break; + rcount = ender - buffer; + *ender = '\0'; + if (S_ISLNK (trav->buf.st_mode)) + trav->link = strdup (buffer); + else + trav->link = ""; + + bread = rcount + 1; + buffer += bread; + + prev->next = trav; + prev = trav; + } + + head = thead; +fail: + return head; +} + +int32_t +gf_free_direntry(dir_entry_t *head) +{ + dir_entry_t *prev = NULL, *trav = NULL; + + prev = head; + GF_VALIDATE_OR_GOTO("client-protocol", prev, fail); + + trav = head->next; + while (trav) { + prev->next = trav->next; + FREE (trav->name); + if (S_ISLNK (trav->buf.st_mode)) + FREE (trav->link); + FREE (trav); + trav = prev->next; + } + FREE (head); +fail: + return 0; +} +/* + * client_getdents_cbk - readdir callback for client protocol + * @frame: call frame + * @args: argument dictionary + * + * not for external reference + */ +int32_t +client_getdents_cbk (call_frame_t *frame, + gf_hdr_common_t *hdr, size_t hdrlen, + char *buf, size_t buflen) +{ + gf_fop_getdents_rsp_t *rsp = NULL; + int32_t op_ret = 0; + int32_t op_errno = 0; + int32_t gf_errno = 0; + int32_t nr_count = 0; + dir_entry_t *entry = NULL; + + rsp = gf_param (hdr); + + op_ret = ntoh32 (hdr->rsp.op_ret); + gf_errno = ntoh32 (hdr->rsp.op_errno); + op_errno = gf_error_to_errno (gf_errno); + + if (op_ret >= 0) { + nr_count = ntoh32 (rsp->count); + entry = gf_bin_to_direntry(buf, nr_count); + if (entry == NULL) { + op_ret = -1; + op_errno = EINVAL; + } + } + + STACK_UNWIND (frame, op_ret, op_errno, entry, nr_count); + + if (op_ret >= 0) { + /* Free the buffer */ + FREE (buf); + gf_free_direntry(entry); + } + + return 0; +} + +/* + * client_statfs_cbk - statfs callback for client protocol + * @frame: call frame + * @args: 
argument dictionary + * + * not for external reference + */ +int32_t +client_statfs_cbk (call_frame_t *frame, + gf_hdr_common_t *hdr, size_t hdrlen, + char *buf, size_t buflen) +{ + struct statvfs stbuf = {0, }; + gf_fop_statfs_rsp_t *rsp = NULL; + int32_t op_ret = 0; + int32_t op_errno = 0; + + rsp = gf_param (hdr); + + op_ret = ntoh32 (hdr->rsp.op_ret); + op_errno = gf_error_to_errno (ntoh32 (hdr->rsp.op_errno)); + + if (op_ret == 0) + { + gf_statfs_to_statfs (&rsp->statfs, &stbuf); + } + + STACK_UNWIND (frame, op_ret, op_errno, &stbuf); + + return 0; +} + +/* + * client_fsyncdir_cbk - fsyncdir callback for client protocol + * @frame: call frame + * @args: argument dictionary + * + * not for external reference + */ +int32_t +client_fsyncdir_cbk (call_frame_t *frame, + gf_hdr_common_t *hdr, size_t hdrlen, + char *buf, size_t buflen) +{ + int32_t op_ret = 0; + int32_t op_errno = 0; + + op_ret = ntoh32 (hdr->rsp.op_ret); + op_errno = gf_error_to_errno (ntoh32 (hdr->rsp.op_errno)); + + STACK_UNWIND (frame, op_ret, op_errno); + + return 0; +} + +/* + * client_setxattr_cbk - setxattr callback for client protocol + * @frame: call frame + * @args: argument dictionary + * + * not for external reference + */ +int32_t +client_setxattr_cbk (call_frame_t *frame, + gf_hdr_common_t *hdr, size_t hdrlen, + char *buf, size_t buflen) +{ + gf_fop_setxattr_rsp_t *rsp = NULL; + int32_t op_ret = 0; + int32_t op_errno = 0; + + rsp = gf_param (hdr); + + op_ret = ntoh32 (hdr->rsp.op_ret); + op_errno = gf_error_to_errno (ntoh32 (hdr->rsp.op_errno)); + + STACK_UNWIND (frame, op_ret, op_errno); + + return 0; +} + +/* + * client_getxattr_cbk - getxattr callback for client protocol + * @frame: call frame + * @args: argument dictionary + * + * not for external reference + */ +int32_t +client_getxattr_cbk (call_frame_t *frame, + gf_hdr_common_t *hdr, size_t hdrlen, + char *buf, size_t buflen) +{ + gf_fop_getxattr_rsp_t *rsp = NULL; + int32_t op_ret = 0; + int32_t gf_errno = 0; + int32_t op_errno 
= 0; + int32_t dict_len = 0; + dict_t *dict = NULL; + int32_t ret = -1; + char *dictbuf = NULL; + client_local_t *local = NULL; + + local = frame->local; + frame->local = NULL; + + rsp = gf_param (hdr); + GF_VALIDATE_OR_GOTO(frame->this->name, rsp, fail); + + op_ret = ntoh32 (hdr->rsp.op_ret); + + if (op_ret >= 0) { + op_ret = -1; + dict_len = ntoh32 (rsp->dict_len); + + if (dict_len > 0) { + dictbuf = memdup (rsp->dict, dict_len); + GF_VALIDATE_OR_GOTO(frame->this->name, dictbuf, fail); + + dict = dict_new(); + GF_VALIDATE_OR_GOTO(frame->this->name, dict, fail); + + ret = dict_unserialize (dictbuf, dict_len, &dict); + if (ret < 0) { + gf_log (frame->this->name, GF_LOG_ERROR, + "%s (%"PRId64"): failed to " + "unserialize xattr dictionary", + local->loc.path, local->loc.inode->ino); + goto fail; + } else { + dict->extra_free = dictbuf; + dictbuf = NULL; + } + } + op_ret = 0; + } + gf_errno = ntoh32 (hdr->rsp.op_errno); + op_errno = gf_error_to_errno (gf_errno); +fail: + STACK_UNWIND (frame, op_ret, op_errno, dict); + + client_local_wipe (local); + + if (dictbuf) + free (dictbuf); + + if (dict) + dict_unref (dict); + + return 0; +} + +/* + * client_removexattr_cbk - removexattr callback for client protocol + * @frame: call frame + * @args: argument dictionary + * + * not for external reference + */ +int32_t +client_removexattr_cbk (call_frame_t *frame, + gf_hdr_common_t *hdr, size_t hdrlen, + char *buf, size_t buflen) +{ + int32_t op_ret = 0; + int32_t op_errno = 0; + + op_ret = ntoh32 (hdr->rsp.op_ret); + op_errno = gf_error_to_errno (ntoh32 (hdr->rsp.op_errno)); + + STACK_UNWIND (frame, op_ret, op_errno); + + return 0; +} + +/* + * client_lk_cbk - lk callback for client protocol + * @frame: call frame + * @args: argument dictionary + * + * not for external reference + */ +int32_t +client_lk_common_cbk (call_frame_t *frame, + gf_hdr_common_t *hdr, size_t hdrlen, + char *buf, size_t buflen) +{ + struct flock lock = {0,}; + gf_fop_lk_rsp_t *rsp = NULL; + int32_t 
op_ret = 0; + int32_t op_errno = 0; + + rsp = gf_param (hdr); + + op_ret = ntoh32 (hdr->rsp.op_ret); + op_errno = gf_error_to_errno (ntoh32 (hdr->rsp.op_errno)); + + if (op_ret >= 0) { + gf_flock_to_flock (&rsp->flock, &lock); + } + + STACK_UNWIND (frame, op_ret, op_errno, &lock); + return 0; +} + + +/* + * client_gf_file_lk_cbk - gf_file_lk callback for client protocol + * @frame: call frame + * @args: argument dictionary + * + * not for external reference + */ +int32_t +client_inodelk_cbk (call_frame_t *frame, + gf_hdr_common_t *hdr, size_t hdrlen, + char *buf, size_t buflen) +{ + gf_fop_inodelk_rsp_t *rsp = NULL; + int32_t op_ret = 0; + int32_t op_errno = 0; + + rsp = gf_param (hdr); + + op_ret = ntoh32 (hdr->rsp.op_ret); + op_errno = gf_error_to_errno (ntoh32 (hdr->rsp.op_errno)); + + STACK_UNWIND (frame, op_ret, op_errno); + return 0; +} + + +int32_t +client_finodelk_cbk (call_frame_t *frame, + gf_hdr_common_t *hdr, size_t hdrlen, + char *buf, size_t buflen) +{ + gf_fop_finodelk_rsp_t *rsp = NULL; + int32_t op_ret = 0; + int32_t op_errno = 0; + + rsp = gf_param (hdr); + + op_ret = ntoh32 (hdr->rsp.op_ret); + op_errno = gf_error_to_errno (ntoh32 (hdr->rsp.op_errno)); + + STACK_UNWIND (frame, op_ret, op_errno); + return 0; +} + + +/* + * client_entrylk_cbk - entrylk callback for client protocol + * @frame: call frame + * @args: argument dictionary + * + * not for external reference + */ +int32_t +client_entrylk_cbk (call_frame_t *frame, + gf_hdr_common_t *hdr, size_t hdrlen, + char *buf, size_t buflen) +{ + gf_fop_entrylk_rsp_t *rsp = NULL; + int32_t op_ret = 0; + int32_t op_errno = 0; + + rsp = gf_param (hdr); + + op_ret = ntoh32 (hdr->rsp.op_ret); + op_errno = gf_error_to_errno (ntoh32 (hdr->rsp.op_errno)); + + STACK_UNWIND (frame, op_ret, op_errno); + return 0; +} + +int32_t +client_fentrylk_cbk (call_frame_t *frame, + gf_hdr_common_t *hdr, size_t hdrlen, + char *buf, size_t buflen) +{ + gf_fop_fentrylk_rsp_t *rsp = NULL; + int32_t op_ret = 0; + int32_t 
op_errno = 0; + + rsp = gf_param (hdr); + + op_ret = ntoh32 (hdr->rsp.op_ret); + op_errno = gf_error_to_errno (ntoh32 (hdr->rsp.op_errno)); + + STACK_UNWIND (frame, op_ret, op_errno); + return 0; +} + + +/** + * client_writedir_cbk - + * + * @frame: + * @args: + * + * not for external reference + */ +int32_t +client_setdents_cbk (call_frame_t *frame, + gf_hdr_common_t *hdr, size_t hdrlen, + char *buf, size_t buflen) +{ + int32_t op_ret = 0; + int32_t op_errno = 0; + + op_ret = ntoh32 (hdr->rsp.op_ret); + op_errno = gf_error_to_errno (ntoh32 (hdr->rsp.op_errno)); + + STACK_UNWIND (frame, op_ret, op_errno); + + return 0; +} + + + +/* + * client_stats_cbk - stats callback for client protocol + * + * @frame: call frame + * @args: argument dictionary + * + * not for external reference + */ + +int32_t +client_stats_cbk (call_frame_t *frame, + gf_hdr_common_t *hdr, size_t hdrlen, + char *buf, size_t buflen) +{ + struct xlator_stats stats = {0,}; + gf_mop_stats_rsp_t *rsp = NULL; + char *buffer = NULL; + int32_t op_ret = 0; + int32_t op_errno = 0; + + rsp = gf_param (hdr); + + op_ret = ntoh32 (hdr->rsp.op_ret); + op_errno = gf_error_to_errno (ntoh32 (hdr->rsp.op_errno)); + + if (op_ret >= 0) + { + buffer = rsp->buf; + + sscanf (buffer, "%"SCNx64",%"SCNx64",%"SCNx64",%"SCNx64 + ",%"SCNx64",%"SCNx64",%"SCNx64",%"SCNx64"\n", + &stats.nr_files, + &stats.disk_usage, + &stats.free_disk, + &stats.total_disk_size, + &stats.read_usage, + &stats.write_usage, + &stats.disk_speed, + &stats.nr_clients); + } + + STACK_UNWIND (frame, op_ret, op_errno, &stats); + return 0; +} + +/* + * client_getspec - getspec function for client protocol + * @frame: call frame + * @this: client protocol xlator structure + * @flag: + * + * external reference through client_protocol_xlator->fops->getspec + */ +int32_t +client_getspec (call_frame_t *frame, + xlator_t *this, + const char *key, + int32_t flag) +{ + gf_hdr_common_t *hdr = NULL; + gf_mop_getspec_req_t *req = NULL; + size_t hdrlen = -1; + int 
keylen = 0; + int ret = -1; + + if (key) + keylen = STRLEN_0(key); + + hdrlen = gf_hdr_len (req, keylen); + hdr = gf_hdr_new (req, keylen); + GF_VALIDATE_OR_GOTO(this->name, hdr, unwind); + + req = gf_param (hdr); + req->flags = hton32 (flag); + req->keylen = hton32 (keylen); + if (keylen) + strcpy (req->key, key); + + ret = protocol_client_xfer (frame, this, + CLIENT_CHANNEL (this, CHANNEL_BULK), + GF_OP_TYPE_MOP_REQUEST, GF_MOP_GETSPEC, + hdr, hdrlen, NULL, 0, NULL); + + return ret; +unwind: + if (hdr) + free (hdr); + STACK_UNWIND(frame, -1, EINVAL, NULL); + return 0; +} + + +/* + * client_getspec_cbk - getspec callback for client protocol + * + * @frame: call frame + * @args: argument dictionary + * + * not for external reference + */ + +int32_t +client_getspec_cbk (call_frame_t *frame, + gf_hdr_common_t *hdr, size_t hdrlen, + char *buf, size_t buflen) +{ + gf_mop_getspec_rsp_t *rsp = NULL; + char *spec_data = NULL; + int32_t op_ret = 0; + int32_t op_errno = 0; + int32_t gf_errno = 0; + + op_ret = ntoh32 (hdr->rsp.op_ret); + gf_errno = ntoh32 (hdr->rsp.op_errno); + op_errno = gf_error_to_errno (gf_errno); + rsp = gf_param (hdr); + + if (op_ret >= 0) { + spec_data = rsp->spec; + } + + STACK_UNWIND (frame, op_ret, op_errno, spec_data); + return 0; +} + +int32_t +client_checksum (call_frame_t *frame, + xlator_t *this, + loc_t *loc, + int32_t flag) +{ + gf_hdr_common_t *hdr = NULL; + gf_fop_checksum_req_t *req = NULL; + size_t hdrlen = -1; + int ret = -1; + client_conf_t *conf = this->private; + ino_t ino = 0; + + if (conf->child) { + STACK_WIND (frame, + default_checksum_cbk, + conf->child, + conf->child->fops->checksum, + loc, + flag); + + return 0; + } + + hdrlen = gf_hdr_len (req, strlen (loc->path) + 1); + hdr = gf_hdr_new (req, strlen (loc->path) + 1); + req = gf_param (hdr); + + ino = this_ino_get (loc, this, GF_CLIENT_INODE_SELF); + req->ino = hton64 (ino); + req->flag = hton32 (flag); + strcpy (req->path, loc->path); + + ret = protocol_client_xfer (frame, 
this, + CLIENT_CHANNEL (this, CHANNEL_BULK), + GF_OP_TYPE_FOP_REQUEST, GF_FOP_CHECKSUM, + hdr, hdrlen, NULL, 0, NULL); + + return ret; +} + +int32_t +client_checksum_cbk (call_frame_t *frame, + gf_hdr_common_t *hdr, size_t hdrlen, + char *buf, size_t buflen) +{ + gf_fop_checksum_rsp_t *rsp = NULL; + int32_t op_ret = 0; + int32_t op_errno = 0; + int32_t gf_errno = 0; + unsigned char *fchecksum = NULL; + unsigned char *dchecksum = NULL; + + rsp = gf_param (hdr); + + op_ret = ntoh32 (hdr->rsp.op_ret); + gf_errno = ntoh32 (hdr->rsp.op_errno); + op_errno = gf_error_to_errno (gf_errno); + + if (op_ret >= 0) { + fchecksum = rsp->fchecksum; + dchecksum = rsp->dchecksum + ZR_FILENAME_MAX; + } + + STACK_UNWIND (frame, op_ret, op_errno, fchecksum, dchecksum); + return 0; +} + + +/* + * client_setspec_cbk - setspec callback for client protocol + * @frame: call frame + * @args: argument dictionary + * + * not for external reference + */ + +int32_t +client_setspec_cbk (call_frame_t *frame, + gf_hdr_common_t *hdr, size_t hdrlen, + char *buf, size_t buflen) +{ + int32_t op_ret = 0; + int32_t op_errno = 0; + + op_ret = ntoh32 (hdr->rsp.op_ret); + op_errno = gf_error_to_errno (ntoh32 (hdr->rsp.op_errno)); + + STACK_UNWIND (frame, op_ret, op_errno); + + return 0; +} + +/* + * client_setvolume_cbk - setvolume callback for client protocol + * @frame: call frame + * @args: argument dictionary + * + * not for external reference + */ +int +client_setvolume_cbk (call_frame_t *frame, + gf_hdr_common_t *hdr, size_t hdrlen, + char *buf, size_t buflen) +{ + gf_mop_setvolume_rsp_t *rsp = NULL; + client_connection_t *conn = NULL; + client_conf_t *conf = NULL; + glusterfs_ctx_t *ctx = NULL; + xlator_t *this = NULL; + xlator_list_t *parent = NULL; + transport_t *trans = NULL; + dict_t *reply = NULL; + char *remote_subvol = NULL; + char *remote_error = NULL; + char *process_uuid = NULL; + int32_t ret = -1; + int32_t op_ret = -1; + int32_t op_errno = EINVAL; + int32_t dict_len = 0; + + + trans = 
frame->local; frame->local = NULL; + this = frame->this; + conf = this->private; + conn = trans->xl_private; + + rsp = gf_param (hdr); + + op_ret = ntoh32 (hdr->rsp.op_ret); + op_errno = gf_error_to_errno (ntoh32 (hdr->rsp.op_errno)); + + if (op_ret < 0 && op_errno == ENOTCONN) { + gf_log (this->name, GF_LOG_ERROR, + "setvolume failed (%s)", + strerror (op_errno)); + goto out; + } + + reply = dict_new (); + GF_VALIDATE_OR_GOTO(this->name, reply, out); + + dict_len = ntoh32 (rsp->dict_len); + ret = dict_unserialize (rsp->buf, dict_len, &reply); + if (ret < 0) { + gf_log (frame->this->name, GF_LOG_ERROR, + "failed to unserialize buffer(%p) to dictionary", + rsp->buf); + goto out; + } + + ret = dict_get_str (reply, "ERROR", &remote_error); + if (ret < 0) { + gf_log (this->name, GF_LOG_ERROR, + "failed to get ERROR string from reply dictionary"); + } + + ret = dict_get_str (reply, "process-uuid", &process_uuid); + if (ret < 0) { + gf_log (this->name, GF_LOG_DEBUG, + "failed to get 'process-uuid' from reply dictionary"); + } + + if (op_ret < 0) { + gf_log (trans->xl->name, GF_LOG_ERROR, + "SETVOLUME on remote-host failed: %s", + remote_error ? 
remote_error : strerror (op_errno)); + errno = op_errno; + if (op_errno == ENOTCONN) + goto out; + } else { + ctx = get_global_ctx_ptr (); + if (process_uuid && !strcmp (ctx->process_uuid,process_uuid)) { + ret = dict_get_str (this->options, "remote-subvolume", + &remote_subvol); + if (!remote_subvol) + goto out; + + gf_log (this->name, GF_LOG_WARNING, + "attaching to the local volume '%s'", + remote_subvol); + + /* TODO: */ + conf->child = xlator_search_by_name (this, + remote_subvol); + } + gf_log (trans->xl->name, GF_LOG_INFO, + "connection and handshake succeeded"); + + pthread_mutex_lock (&(conn->lock)); + { + conn->connected = 1; + } + pthread_mutex_unlock (&(conn->lock)); + + parent = trans->xl->parents; + while (parent) { + parent->xlator->notify (parent->xlator, + GF_EVENT_CHILD_UP, + trans->xl); + parent = parent->next; + } + } + +out: + STACK_DESTROY (frame->root); + + if (reply) + dict_unref (reply); + + return op_ret; +} + +/* + * client_enosys_cbk - + * @frame: call frame + * + * not for external reference + */ +int +client_enosys_cbk (call_frame_t *frame, + gf_hdr_common_t *hdr, size_t hdrlen, + char *buf, size_t buflen) +{ + STACK_DESTROY (frame->root); + return 0; +} + + +void +client_protocol_reconnect (void *trans_ptr) +{ + transport_t *trans = NULL; + client_connection_t *conn = NULL; + struct timeval tv = {0, 0}; + + trans = trans_ptr; + conn = trans->xl_private; + pthread_mutex_lock (&conn->lock); + { + if (conn->reconnect) + gf_timer_call_cancel (trans->xl->ctx, + conn->reconnect); + conn->reconnect = 0; + + if (conn->connected == 0) { + tv.tv_sec = 10; + + gf_log (trans->xl->name, GF_LOG_DEBUG, + "attempting reconnect"); + transport_connect (trans); + + conn->reconnect = + gf_timer_call_after (trans->xl->ctx, tv, + client_protocol_reconnect, + trans); + } else { + gf_log (trans->xl->name, GF_LOG_DEBUG, + "breaking reconnect chain"); + } + } + pthread_mutex_unlock (&conn->lock); +} + +/* + * client_protocol_cleanup - cleanup function + * 
@trans: transport object + * + */ +int +protocol_client_cleanup (transport_t *trans) +{ + client_connection_t *conn = NULL; + struct saved_frames *saved_frames = NULL; + + conn = trans->xl_private; + + gf_log (trans->xl->name, GF_LOG_DEBUG, + "cleaning up state in transport object %p", trans); + + pthread_mutex_lock (&conn->lock); + { + saved_frames = conn->saved_frames; + conn->saved_frames = saved_frames_new (); + +/* + trav = conn->saved_fds->members_list; + this = trans->xl; + + while (trav) { + fd_t *fd_tmp = (fd_t *)(long) strtoul (trav->key, + NULL, 0); + fd_ctx_del (fd_tmp, this, NULL); + trav = trav->next; + } + + dict_destroy (conn->saved_fds); + + conn->saved_fds = get_new_dict_full (64); +*/ + /* bailout logic cleanup */ + memset (&(conn->last_sent), 0, + sizeof (conn->last_sent)); + + memset (&(conn->last_received), 0, + sizeof (conn->last_received)); + + if (conn->timer) { + gf_timer_call_cancel (trans->xl->ctx, conn->timer); + conn->timer = NULL; + } + + if (conn->reconnect == NULL) { + /* :O This part is empty.. any thing missing? 
*/ + } + } + pthread_mutex_unlock (&conn->lock); + + saved_frames_destroy (trans->xl, saved_frames, + gf_fops, gf_mops, gf_cbks); + + return 0; +} + + +/* cbk callbacks */ +int +client_releasedir_cbk (call_frame_t *frame, + gf_hdr_common_t *hdr, size_t hdrlen, + char *buf, size_t buflen) +{ + STACK_DESTROY (frame->root); + return 0; +} + + +int +client_release_cbk (call_frame_t *frame, + gf_hdr_common_t *hdr, size_t hdrlen, + char *buf, size_t buflen) +{ + STACK_DESTROY (frame->root); + return 0; +} + + +int +client_forget_cbk (call_frame_t *frame, + gf_hdr_common_t *hdr, size_t hdrlen, + char *buf, size_t buflen) +{ + client_conf_t *conf = NULL; + client_forget_t forget = {0, }; + uint8_t send_forget = 0; + int32_t ret = -1; + + + conf = frame->this->private; + LOCK (&conf->forget.lock); + { + conf->forget.frames_in_transit--; + + ret = client_get_forgets (frame->this, &forget); + if (ret <= 0) + send_forget = 0; + else + send_forget = 1; + } + UNLOCK (&conf->forget.lock); + + if (send_forget) { + ret = protocol_client_xfer (forget.frame, frame->this, + CLIENT_CHANNEL (frame->this, + CHANNEL_BULK), + GF_OP_TYPE_CBK_REQUEST, + GF_CBK_FORGET, + forget.hdr, forget.hdrlen, + NULL, 0, NULL); + } + + STACK_DESTROY (frame->root); + return 0; +} + + +static gf_op_t gf_fops[] = { + [GF_FOP_STAT] = client_stat_cbk, + [GF_FOP_READLINK] = client_readlink_cbk, + [GF_FOP_MKNOD] = client_mknod_cbk, + [GF_FOP_MKDIR] = client_mkdir_cbk, + [GF_FOP_UNLINK] = client_unlink_cbk, + [GF_FOP_RMDIR] = client_rmdir_cbk, + [GF_FOP_SYMLINK] = client_symlink_cbk, + [GF_FOP_RENAME] = client_rename_cbk, + [GF_FOP_LINK] = client_link_cbk, + [GF_FOP_CHMOD] = client_chmod_cbk, + [GF_FOP_CHOWN] = client_chown_cbk, + [GF_FOP_TRUNCATE] = client_truncate_cbk, + [GF_FOP_OPEN] = client_open_cbk, + [GF_FOP_READ] = client_readv_cbk, + [GF_FOP_WRITE] = client_write_cbk, + [GF_FOP_STATFS] = client_statfs_cbk, + [GF_FOP_FLUSH] = client_flush_cbk, + [GF_FOP_FSYNC] = client_fsync_cbk, + [GF_FOP_SETXATTR] = 
client_setxattr_cbk, + [GF_FOP_GETXATTR] = client_getxattr_cbk, + [GF_FOP_REMOVEXATTR] = client_removexattr_cbk, + [GF_FOP_OPENDIR] = client_opendir_cbk, + [GF_FOP_GETDENTS] = client_getdents_cbk, + [GF_FOP_FSYNCDIR] = client_fsyncdir_cbk, + [GF_FOP_ACCESS] = client_access_cbk, + [GF_FOP_CREATE] = client_create_cbk, + [GF_FOP_FTRUNCATE] = client_ftruncate_cbk, + [GF_FOP_FSTAT] = client_fstat_cbk, + [GF_FOP_LK] = client_lk_common_cbk, + [GF_FOP_UTIMENS] = client_utimens_cbk, + [GF_FOP_FCHMOD] = client_fchmod_cbk, + [GF_FOP_FCHOWN] = client_fchown_cbk, + [GF_FOP_LOOKUP] = client_lookup_cbk, + [GF_FOP_SETDENTS] = client_setdents_cbk, + [GF_FOP_READDIR] = client_readdir_cbk, + [GF_FOP_INODELK] = client_inodelk_cbk, + [GF_FOP_FINODELK] = client_finodelk_cbk, + [GF_FOP_ENTRYLK] = client_entrylk_cbk, + [GF_FOP_FENTRYLK] = client_fentrylk_cbk, + [GF_FOP_CHECKSUM] = client_checksum_cbk, + [GF_FOP_XATTROP] = client_xattrop_cbk, + [GF_FOP_FXATTROP] = client_fxattrop_cbk, +}; + +static gf_op_t gf_mops[] = { + [GF_MOP_SETVOLUME] = client_setvolume_cbk, + [GF_MOP_GETVOLUME] = client_enosys_cbk, + [GF_MOP_STATS] = client_stats_cbk, + [GF_MOP_SETSPEC] = client_setspec_cbk, + [GF_MOP_GETSPEC] = client_getspec_cbk, + [GF_MOP_PING] = client_ping_cbk, +}; + +static gf_op_t gf_cbks[] = { + [GF_CBK_FORGET] = client_forget_cbk, + [GF_CBK_RELEASE] = client_release_cbk, + [GF_CBK_RELEASEDIR] = client_releasedir_cbk +}; + +/* + * client_protocol_interpret - protocol interpreter + * @trans: transport object + * @blk: data block + * + */ +int +protocol_client_interpret (xlator_t *this, transport_t *trans, + char *hdr_p, size_t hdrlen, + char *buf_p, size_t buflen) +{ + int ret = -1; + call_frame_t *frame = NULL; + gf_hdr_common_t *hdr = NULL; + uint64_t callid = 0; + int type = -1; + int op = -1; + + + hdr = (gf_hdr_common_t *)hdr_p; + + type = ntoh32 (hdr->type); + op = ntoh32 (hdr->op); + callid = ntoh64 (hdr->callid); + + frame = lookup_frame (trans, op, type, callid); + if (frame == NULL) 
{ + gf_log (this->name, GF_LOG_ERROR, + "no frame for callid=%"PRId64" type=%d op=%d", + callid, type, op); + return 0; + } + + switch (type) { + case GF_OP_TYPE_FOP_REPLY: + if ((op > GF_FOP_MAXVALUE) || + (op < 0)) { + gf_log (trans->xl->name, GF_LOG_WARNING, + "invalid fop '%d'", op); + } else { + ret = gf_fops[op] (frame, hdr, hdrlen, buf_p, buflen); + } + break; + case GF_OP_TYPE_MOP_REPLY: + if ((op > GF_MOP_MAXVALUE) || + (op < 0)) { + gf_log (trans->xl->name, GF_LOG_WARNING, + "invalid fop '%d'", op); + } else { + ret = gf_mops[op] (frame, hdr, hdrlen, buf_p, buflen); + } + break; + case GF_OP_TYPE_CBK_REPLY: + if ((op > GF_CBK_MAXVALUE) || + (op < 0)) { + gf_log (trans->xl->name, GF_LOG_WARNING, + "invalid cbk '%d'", op); + } else { + ret = gf_cbks[op] (frame, hdr, hdrlen, buf_p, buflen); + } + break; + default: + gf_log (trans->xl->name, GF_LOG_ERROR, + "invalid packet type: %d", type); + break; + } + + return ret; +} + +/* + * init - initiliazation function. called during loading of client protocol + * @this: + * + */ +int32_t +init (xlator_t *this) +{ + transport_t *trans = NULL; + client_conf_t *conf = NULL; + client_connection_t *conn = NULL; + int32_t transport_timeout = 0; + int32_t ping_timeout = 0; + data_t *remote_subvolume = NULL; + int32_t ret = -1; + int i = 0; + + if (this->children) { + gf_log (this->name, GF_LOG_ERROR, + "FATAL: client protocol translator cannot have " + "subvolumes"); + goto out; + } + + if (!this->parents) { + gf_log (this->name, GF_LOG_WARNING, + "dangling volume. 
check volfile "); + } + + remote_subvolume = dict_get (this->options, "remote-subvolume"); + if (remote_subvolume == NULL) { + gf_log (this->name, GF_LOG_ERROR, + "missing 'option remote-subvolume'."); + goto out; + } + + ret = dict_get_int32 (this->options, "transport-timeout", + &transport_timeout); + if (ret >= 0) { + gf_log (this->name, GF_LOG_DEBUG, + "setting transport-timeout to %d", transport_timeout); + } else { + gf_log (this->name, GF_LOG_DEBUG, + "defaulting transport-timeout to 42"); + transport_timeout = 42; + } + + ret = dict_get_int32 (this->options, "ping-timeout", + &ping_timeout); + if (ret >= 0) { + gf_log (this->name, GF_LOG_DEBUG, + "setting ping-timeout to %d", ping_timeout); + } else { + gf_log (this->name, GF_LOG_DEBUG, + "defaulting ping-timeout to 10"); + ping_timeout = 10; + } + + conf = CALLOC (1, sizeof (client_conf_t)); + + LOCK_INIT (&conf->forget.lock); + pthread_mutex_init (&conf->mutex, NULL); + conf->saved_fds = get_new_dict_full (64); + + this->private = conf; + + for (i = 0; i < CHANNEL_MAX; i++) { + trans = transport_load (this->options, this); + if (trans == NULL) { + gf_log (this->name, GF_LOG_ERROR, + "Failed to load transport"); + ret = -1; + goto out; + } + + conn = CALLOC (1, sizeof (*conn)); + + conn->saved_frames = saved_frames_new (); + + conn->callid = 1; + + memset (&(conn->last_sent), 0, sizeof (conn->last_sent)); + memset (&(conn->last_received), 0, + sizeof (conn->last_received)); + + conn->transport_timeout = transport_timeout; + conn->ping_timeout = ping_timeout; + + pthread_mutex_init (&conn->lock, NULL); + + trans->xl_private = conn; + conf->transport[i] = transport_ref (trans); + } + +#ifndef GF_DARWIN_HOST_OS + { + struct rlimit lim; + + lim.rlim_cur = 1048576; + lim.rlim_max = 1048576; + + ret = setrlimit (RLIMIT_NOFILE, &lim); + if (ret == -1) { + gf_log (this->name, GF_LOG_WARNING, + "WARNING: Failed to set 'ulimit -n 1M': %s", + strerror(errno)); + lim.rlim_cur = 65536; + lim.rlim_max = 65536; + + ret = 
setrlimit (RLIMIT_NOFILE, &lim); + if (ret == -1) { + gf_log (this->name, GF_LOG_ERROR, + "Failed to set max open fd to 64k: %s", + strerror(errno)); + } else { + gf_log (this->name, GF_LOG_ERROR, + "max open fd set to 64k"); + } + + } + } +#endif + ret = 0; +out: + return ret; +} + +/* + * fini - finish function called during unloading of client protocol + * @this: + * + */ +void +fini (xlator_t *this) +{ + /* TODO: Check if its enough.. how to call transport's fini () */ + client_conf_t *conf = NULL; + + conf = this->private; + this->private = NULL; + + if (conf) { + LOCK_DESTROY (&conf->forget.lock); + FREE (conf); + } + return; +} + + +int +protocol_client_handshake (xlator_t *this, transport_t *trans) +{ + gf_hdr_common_t *hdr = NULL; + gf_mop_setvolume_req_t *req = NULL; + dict_t *options = NULL; + int32_t ret = -1; + int hdrlen = 0; + int dict_len = 0; + call_frame_t *fr = NULL; + char *process_uuid_xl; + + options = this->options; + ret = dict_set_str (options, "version", PACKAGE_VERSION); + if (ret < 0) { + gf_log (this->name, GF_LOG_ERROR, + "failed to set version(%s) in options dictionary", + PACKAGE_VERSION); + } + + asprintf (&process_uuid_xl, "%s-%s", this->ctx->process_uuid, + this->name); + ret = dict_set_dynstr (options, "process-uuid", + process_uuid_xl); + if (ret < 0) { + gf_log (this->name, GF_LOG_ERROR, + "failed to set process-uuid(%s) in options dictionary", + PACKAGE_VERSION); + } + + dict_len = dict_serialized_length (options); + if (dict_len < 0) { + gf_log (this->name, GF_LOG_ERROR, + "failed to get serialized length of dict(%p)", + options); + ret = dict_len; + goto fail; + } + + hdrlen = gf_hdr_len (req, dict_len); + hdr = gf_hdr_new (req, dict_len); + GF_VALIDATE_OR_GOTO(this->name, hdr, fail); + + req = gf_param (hdr); + + ret = dict_serialize (options, req->buf); + if (ret < 0) { + gf_log (this->name, GF_LOG_ERROR, + "failed to serialize dictionary(%p)", + options); + goto fail; + } + + req->dict_len = hton32 (dict_len); + fr = 
create_frame (this, this->ctx->pool); + GF_VALIDATE_OR_GOTO(this->name, fr, fail); + + fr->local = trans; + ret = protocol_client_xfer (fr, this, trans, + GF_OP_TYPE_MOP_REQUEST, GF_MOP_SETVOLUME, + hdr, hdrlen, NULL, 0, NULL); + return ret; +fail: + if (hdr) + free (hdr); + return ret; +} + + +int +protocol_client_pollout (xlator_t *this, transport_t *trans) +{ + client_connection_t *conn = NULL; + + conn = trans->xl_private; + + pthread_mutex_lock (&conn->lock); + { + gettimeofday (&conn->last_sent, NULL); + } + pthread_mutex_unlock (&conn->lock); + + return 0; +} + + +int +protocol_client_pollin (xlator_t *this, transport_t *trans) +{ + client_connection_t *conn = NULL; + int ret = -1; + char *buf = NULL; + size_t buflen = 0; + char *hdr = NULL; + size_t hdrlen = 0; + int connected = 0; + + conn = trans->xl_private; + + pthread_mutex_lock (&conn->lock); + { + gettimeofday (&conn->last_received, NULL); + connected = conn->connected; + } + pthread_mutex_unlock (&conn->lock); + + ret = transport_receive (trans, &hdr, &hdrlen, &buf, &buflen); + + if (ret == 0) + { + ret = protocol_client_interpret (this, trans, hdr, hdrlen, + buf, buflen); + } + + /* TODO: use mem-pool */ + FREE (hdr); + + return ret; +} + + +/* + * client_protocol_notify - notify function for client protocol + * @this: + * @trans: transport object + * @event + * + */ + +int32_t +notify (xlator_t *this, + int32_t event, + void *data, + ...) 
+{ + int ret = -1; + transport_t *trans = NULL; + client_connection_t *conn = NULL; + + trans = data; + + switch (event) { + case GF_EVENT_POLLOUT: + { + ret = protocol_client_pollout (this, trans); + + break; + } + case GF_EVENT_POLLIN: + { + ret = protocol_client_pollin (this, trans); + + break; + } + /* no break for ret check to happen below */ + case GF_EVENT_POLLERR: + { + ret = -1; + protocol_client_cleanup (trans); + } + + conn = trans->xl_private; + if (conn->connected) { + xlator_list_t *parent = NULL; + + gf_log (this->name, GF_LOG_INFO, "disconnected"); + + parent = this->parents; + while (parent) { + parent->xlator->notify (parent->xlator, + GF_EVENT_CHILD_DOWN, + this); + parent = parent->next; + } + + conn->connected = 0; + if (conn->reconnect == 0) + client_protocol_reconnect (trans); + } + break; + + case GF_EVENT_PARENT_UP: + { + xlator_list_t *parent = NULL; + client_conf_t *conf = NULL; + int i = 0; + transport_t *trans = NULL; + + conf = this->private; + for (i = 0; i < CHANNEL_MAX; i++) { + trans = conf->transport[i]; + if (!trans) { + gf_log (this->name, GF_LOG_DEBUG, + "transport init failed"); + return -1; + } + + conn = trans->xl_private; + + gf_log (this->name, GF_LOG_DEBUG, + "got GF_EVENT_PARENT_UP, attempting connect " + "on transport"); + + client_protocol_reconnect (trans); + } + + /* Let the connection/re-connection happen in + * background, for now, don't hang here, + * tell the parents that i am all ok.. 
+ */ + parent = trans->xl->parents; + while (parent) { + parent->xlator->notify (parent->xlator, + GF_EVENT_CHILD_CONNECTING, + trans->xl); + parent = parent->next; + } + } + break; + + case GF_EVENT_CHILD_UP: + { + char *handshake = NULL; + + ret = dict_get_str (this->options, "disable-handshake", + &handshake); + gf_log (this->name, GF_LOG_DEBUG, + "got GF_EVENT_CHILD_UP"); + if ((ret < 0) || + (strcasecmp (handshake, "on"))) { + ret = protocol_client_handshake (this, trans); + } else { + conn = trans->xl_private; + conn->connected = 1; + ret = default_notify (this, event, trans); + } + + if (ret) + transport_disconnect (trans); + + } + break; + + default: + gf_log (this->name, GF_LOG_DEBUG, + "got %d, calling default_notify ()", event); + + default_notify (this, event, data); + break; + } + + return ret; +} + + +struct xlator_fops fops = { + .stat = client_stat, + .readlink = client_readlink, + .mknod = client_mknod, + .mkdir = client_mkdir, + .unlink = client_unlink, + .rmdir = client_rmdir, + .symlink = client_symlink, + .rename = client_rename, + .link = client_link, + .chmod = client_chmod, + .chown = client_chown, + .truncate = client_truncate, + .utimens = client_utimens, + .open = client_open, + .readv = client_readv, + .writev = client_writev, + .statfs = client_statfs, + .flush = client_flush, + .fsync = client_fsync, + .setxattr = client_setxattr, + .getxattr = client_getxattr, + .removexattr = client_removexattr, + .opendir = client_opendir, + .readdir = client_readdir, + .fsyncdir = client_fsyncdir, + .access = client_access, + .ftruncate = client_ftruncate, + .fstat = client_fstat, + .create = client_create, + .lk = client_lk, + .inodelk = client_inodelk, + .finodelk = client_finodelk, + .entrylk = client_entrylk, + .fentrylk = client_fentrylk, + .lookup = client_lookup, + .fchmod = client_fchmod, + .fchown = client_fchown, + .setdents = client_setdents, + .getdents = client_getdents, + .checksum = client_checksum, + .xattrop = client_xattrop, + 
.fxattrop = client_fxattrop, +}; + +struct xlator_mops mops = { + .stats = client_stats, + .getspec = client_getspec, +}; + +struct xlator_cbks cbks = { + .forget = client_forget, + .release = client_release, + .releasedir = client_releasedir +}; + + +struct volume_options options[] = { + { .key = {"username"}, + .type = GF_OPTION_TYPE_ANY + }, + { .key = {"password"}, + .type = GF_OPTION_TYPE_ANY + }, + { .key = {"transport-type"}, + .value = {"tcp", "socket", "ib-verbs", "unix", "ib-sdp", + "tcp/client", "ib-verbs/client"}, + .type = GF_OPTION_TYPE_STR + }, + { .key = {"remote-host"}, + .type = GF_OPTION_TYPE_ANY + }, + { .key = {"remote-subvolume"}, + .type = GF_OPTION_TYPE_ANY + }, + { .key = {"transport-timeout"}, + .type = GF_OPTION_TYPE_TIME, + .min = 5, + .max = 1013, + }, + { .key = {"ping-timeout"}, + .type = GF_OPTION_TYPE_TIME, + .min = 5, + .max = 1013, + }, + { .key = {NULL} }, +}; diff --git a/xlators/protocol/client/src/client-protocol.h b/xlators/protocol/client/src/client-protocol.h new file mode 100644 index 000000000..c90cc980d --- /dev/null +++ b/xlators/protocol/client/src/client-protocol.h @@ -0,0 +1,173 @@ +/* + Copyright (c) 2006, 2007 Z RESEARCH, Inc. <http://www.zresearch.com> + This file is part of GlusterFS. + + GlusterFS is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published + by the Free Software Foundation; either version 3 of the License, + or (at your option) any later version. + + GlusterFS is distributed in the hope that it will be useful, but + WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program. If not, see + <http://www.gnu.org/licenses/>. 
+*/ + +#ifndef _CLIENT_PROTOCOL_H +#define _CLIENT_PROTOCOL_H + +#ifndef _CONFIG_H +#define _CONFIG_H +#include "config.h" +#endif + +#include <stdio.h> +#include <arpa/inet.h> +#include "inode.h" +#include "timer.h" +#include "byte-order.h" + +#define CLIENT_PROTO_FORGET_LIMIT 128 +#define CLIENT_PORT_CIELING 1023 + +#define GF_CLIENT_INODE_SELF 0 +#define GF_CLIENT_INODE_PARENT 1 + +#define CLIENT_CONF(this) ((client_conf_t *)(this->private)) + +#define RECEIVE_TIMEOUT(_cprivate,_current) \ + ((_cprivate->last_received.tv_sec + \ + _cprivate->transport_timeout) < \ + _current.tv_sec) + +#define SEND_TIMEOUT(_cprivate,_current) \ + ((_cprivate->last_sent.tv_sec + \ + _cprivate->transport_timeout) < \ + _current.tv_sec) + +enum { + CHANNEL_BULK = 0, + CHANNEL_LOWLAT = 1, + CHANNEL_MAX +}; +#define CLIENT_CHANNEL(xl,id) \ + (((client_conf_t *)(xl->private))->transport[id]) + +struct client_connection; +typedef struct client_connection client_connection_t; + +#include "stack.h" +#include "xlator.h" +#include "transport.h" +#include "protocol.h" + +struct _client_conf { + transport_t *transport[CHANNEL_MAX]; + xlator_t *child; + + /* enhancement for 'forget', a must required where lot + of stats happening */ + struct { + uint64_t ino_array[CLIENT_PROTO_FORGET_LIMIT + 4]; + uint32_t count; + uint32_t frames_in_transit; + gf_lock_t lock; + } forget; + dict_t *saved_fds; + pthread_mutex_t mutex; +}; +typedef struct _client_conf client_conf_t; + +/* This will be stored in transport_t->xl_private */ +struct client_connection { + pthread_mutex_t lock; + uint64_t callid; + struct saved_frames *saved_frames; + int32_t transport_timeout; + int32_t ping_started; + int32_t ping_timeout; + gf_timer_t *reconnect; + char connected; + uint64_t max_block_size; + struct timeval last_sent; + struct timeval last_received; + gf_timer_t *timer; + gf_timer_t *ping_timer; +}; + +typedef struct { + loc_t loc; + loc_t loc2; + fd_t *fd; +} client_local_t; + +typedef struct { + gf_hdr_common_t 
*hdr; + size_t hdrlen; + call_frame_t *frame; +} client_forget_t; + +static inline void +gf_string_to_stat(char *string, struct stat *stbuf) +{ + uint64_t dev = 0; + uint64_t ino = 0; + uint32_t mode = 0; + uint32_t nlink = 0; + uint32_t uid = 0; + uint32_t gid = 0; + uint64_t rdev = 0; + uint64_t size = 0; + uint32_t blksize = 0; + uint64_t blocks = 0; + uint32_t atime = 0; + uint32_t atime_nsec = 0; + uint32_t mtime = 0; + uint32_t mtime_nsec = 0; + uint32_t ctime = 0; + uint32_t ctime_nsec = 0; + + sscanf (string, GF_STAT_PRINT_FMT_STR, + &dev, + &ino, + &mode, + &nlink, + &uid, + &gid, + &rdev, + &size, + &blksize, + &blocks, + &atime, + &atime_nsec, + &mtime, + &mtime_nsec, + &ctime, + &ctime_nsec); + + stbuf->st_dev = dev; + stbuf->st_ino = ino; + stbuf->st_mode = mode; + stbuf->st_nlink = nlink; + stbuf->st_uid = uid; + stbuf->st_gid = gid; + stbuf->st_rdev = rdev; + stbuf->st_size = size; + stbuf->st_blksize = blksize; + stbuf->st_blocks = blocks; + + stbuf->st_atime = atime; + stbuf->st_mtime = mtime; + stbuf->st_ctime = ctime; + + ST_ATIM_NSEC_SET(stbuf, atime_nsec); + ST_MTIM_NSEC_SET(stbuf, mtime_nsec); + ST_CTIM_NSEC_SET(stbuf, ctime_nsec); + +} + +#endif diff --git a/xlators/protocol/client/src/saved-frames.c b/xlators/protocol/client/src/saved-frames.c new file mode 100644 index 000000000..0d1366d82 --- /dev/null +++ b/xlators/protocol/client/src/saved-frames.c @@ -0,0 +1,178 @@ +/* + Copyright (c) 2008 Z RESEARCH, Inc. <http://www.zresearch.com> + This file is part of GlusterFS. + + GlusterFS is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published + by the Free Software Foundation; either version 3 of the License, + or (at your option) any later version. + + GlusterFS is distributed in the hope that it will be useful, but + WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. 
See the GNU + General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program. If not, see + <http://www.gnu.org/licenses/>. +*/ + + +#include "saved-frames.h" +#include "common-utils.h" +#include "protocol.h" +#include "xlator.h" + + + +struct saved_frames * +saved_frames_new (void) +{ + struct saved_frames *saved_frames = NULL; + + saved_frames = CALLOC (sizeof (*saved_frames), 1); + if (!saved_frames) { + return NULL; + } + + INIT_LIST_HEAD (&saved_frames->fops.list); + INIT_LIST_HEAD (&saved_frames->mops.list); + INIT_LIST_HEAD (&saved_frames->cbks.list); + + return saved_frames; +} + + +struct saved_frame * +get_head_frame_for_type (struct saved_frames *frames, int8_t type) +{ + struct saved_frame *head_frame = NULL; + + switch (type) { + case GF_OP_TYPE_FOP_REQUEST: + case GF_OP_TYPE_FOP_REPLY: + head_frame = &frames->fops; + break; + case GF_OP_TYPE_MOP_REQUEST: + case GF_OP_TYPE_MOP_REPLY: + head_frame = &frames->mops; + break; + case GF_OP_TYPE_CBK_REQUEST: + case GF_OP_TYPE_CBK_REPLY: + head_frame = &frames->cbks; + break; + } + + return head_frame; +} + + +int +saved_frames_put (struct saved_frames *frames, call_frame_t *frame, + int32_t op, int8_t type, int64_t callid) +{ + struct saved_frame *saved_frame = NULL; + struct saved_frame *head_frame = NULL; + + head_frame = get_head_frame_for_type (frames, type); + + saved_frame = CALLOC (sizeof (*saved_frame), 1); + if (!saved_frame) { + return -ENOMEM; + } + + INIT_LIST_HEAD (&saved_frame->list); + saved_frame->frame = frame; + saved_frame->op = op; + saved_frame->type = type; + saved_frame->callid = callid; + +// gettimeofday (&saved_frame->saved_at, NULL); + + list_add (&saved_frame->list, &head_frame->list); + frames->count++; + + return 0; +} + + +call_frame_t * +saved_frames_get (struct saved_frames *frames, int32_t op, + int8_t type, int64_t callid) +{ + struct saved_frame *saved_frame = NULL; + struct saved_frame *tmp = NULL; 
+ struct saved_frame *head_frame = NULL; + call_frame_t *frame = NULL; + + head_frame = get_head_frame_for_type (frames, type); + + list_for_each_entry (tmp, &head_frame->list, list) { + if (tmp->callid == callid) { + list_del_init (&tmp->list); + frames->count--; + saved_frame = tmp; + break; + } + } + + if (saved_frame) + frame = saved_frame->frame; + + FREE (saved_frame); + + return frame; +} + + +void +saved_frames_unwind (xlator_t *this, struct saved_frames *saved_frames, + struct saved_frame *head, + gf_op_t gf_ops[], char *gf_op_list[]) +{ + struct saved_frame *trav = NULL; + struct saved_frame *tmp = NULL; + + gf_hdr_common_t hdr = {0, }; + call_frame_t *frame = NULL; + dict_t *reply = NULL; + + reply = get_new_dict(); + dict_ref (reply); + + hdr.rsp.op_ret = hton32 (-1); + hdr.rsp.op_errno = hton32 (ENOTCONN); + + list_for_each_entry_safe (trav, tmp, &head->list, list) { + gf_log (this->name, GF_LOG_ERROR, + "forced unwinding frame type(%d) op(%s)", + trav->type, gf_op_list[trav->op]); + + hdr.type = hton32 (trav->type); + hdr.op = hton32 (trav->op); + + frame = trav->frame; + frame->root->rsp_refs = reply; + + saved_frames->count--; + + gf_ops[trav->op] (frame, &hdr, sizeof (hdr), NULL, 0); + + list_del_init (&trav->list); + FREE (trav); + } + + dict_unref (reply); +} + + +void +saved_frames_destroy (xlator_t *this, struct saved_frames *frames, + gf_op_t gf_fops[], gf_op_t gf_mops[], gf_op_t gf_cbks[]) +{ + saved_frames_unwind (this, frames, &frames->fops, gf_fops, gf_fop_list); + saved_frames_unwind (this, frames, &frames->mops, gf_mops, gf_mop_list); + saved_frames_unwind (this, frames, &frames->cbks, gf_cbks, gf_cbk_list); + + FREE (frames); +} diff --git a/xlators/protocol/client/src/saved-frames.h b/xlators/protocol/client/src/saved-frames.h new file mode 100644 index 000000000..e402feba3 --- /dev/null +++ b/xlators/protocol/client/src/saved-frames.h @@ -0,0 +1,74 @@ +/* + Copyright (c) 2008 Z RESEARCH, Inc. 
<http://www.zresearch.com> + This file is part of GlusterFS. + + GlusterFS is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published + by the Free Software Foundation; either version 3 of the License, + or (at your option) any later version. + + GlusterFS is distributed in the hope that it will be useful, but + WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program. If not, see + <http://www.gnu.org/licenses/>. +*/ + +#ifndef _SAVED_FRAMES_H +#define _SAVED_FRAMES_H + +#ifndef _CONFIG_H +#define _CONFIG_H +#include "config.h" +#endif + +#include <stdint.h> +#include <sys/time.h> +#include "stack.h" +#include "list.h" +#include "protocol.h" + +/* UGLY: have common typedef b/w saved-frames.c and protocol-client.c */ +typedef int32_t (*gf_op_t) (call_frame_t *frame, + gf_hdr_common_t *hdr, size_t hdrlen, + char *buf, size_t buflen); + + +struct saved_frame { + union { + struct list_head list; + struct { + struct saved_frame *frame_next; + struct saved_frame *frame_prev; + }; + }; + + struct timeval saved_at; + call_frame_t *frame; + int32_t op; + int8_t type; + uint64_t callid; +}; + + +struct saved_frames { + int64_t count; + struct saved_frame fops; + struct saved_frame mops; + struct saved_frame cbks; +}; + + +struct saved_frames *saved_frames_new (); +int saved_frames_put (struct saved_frames *frames, call_frame_t *frame, + int32_t op, int8_t type, int64_t callid); +call_frame_t *saved_frames_get (struct saved_frames *frames, int32_t op, + int8_t type, int64_t callid); +void saved_frames_destroy (xlator_t *this, struct saved_frames *frames, + gf_op_t gf_fops[], gf_op_t gf_mops[], + gf_op_t gf_cbks[]); + +#endif /* _SAVED_FRAMES_H */ diff --git a/xlators/protocol/server/Makefile.am 
b/xlators/protocol/server/Makefile.am new file mode 100644 index 000000000..d471a3f92 --- /dev/null +++ b/xlators/protocol/server/Makefile.am @@ -0,0 +1,3 @@ +SUBDIRS = src + +CLEANFILES = diff --git a/xlators/protocol/server/src/Makefile.am b/xlators/protocol/server/src/Makefile.am new file mode 100644 index 000000000..dcd92aeed --- /dev/null +++ b/xlators/protocol/server/src/Makefile.am @@ -0,0 +1,18 @@ + +xlator_LTLIBRARIES = server.la +xlatordir = $(libdir)/glusterfs/$(PACKAGE_VERSION)/xlator/protocol + +server_la_LDFLAGS = -module -avoidversion + +server_la_SOURCES = server-protocol.c server-dentry.c server-helpers.c +server_la_LIBADD = $(top_builddir)/libglusterfs/src/libglusterfs.la + +noinst_HEADERS = server-protocol.h server-helpers.h + +AM_CFLAGS = -fPIC -D_FILE_OFFSET_BITS=64 -D_GNU_SOURCE -Wall -D$(GF_HOST_OS) \ + -I$(top_srcdir)/libglusterfs/src -shared -nostartfiles \ + -DDATADIR=\"$(localstatedir)\" -DCONFDIR=\"$(sysconfdir)/glusterfs\" \ + $(GF_CFLAGS) + +CLEANFILES = + diff --git a/xlators/protocol/server/src/server-dentry.c b/xlators/protocol/server/src/server-dentry.c new file mode 100644 index 000000000..d3a69a393 --- /dev/null +++ b/xlators/protocol/server/src/server-dentry.c @@ -0,0 +1,413 @@ +#include "glusterfs.h" +#include "xlator.h" +#include "server-protocol.h" +#include "server-helpers.h" +#include <libgen.h> + +/* SERVER_DENTRY_STATE_PREPARE - prepare a fresh state for use + * + * @state - an empty state + * @loc - loc_t which needs to resolved + * @parent - most immediate parent of @loc available in dentry cache + * @resolved - component of @loc->path which has been resolved + * through dentry cache + */ +#define SERVER_DENTRY_STATE_PREPARE(_state,_loc,_parent,_resolved) do { \ + size_t pathlen = 0; \ + size_t resolvedlen = 0; \ + char *path = NULL; \ + int pad = 0; \ + pathlen = strlen (_loc->path) + 1; \ + path = CALLOC (1, pathlen); \ + _state->loc.parent = inode_ref (_parent); \ + _state->loc.inode = inode_new (_state->itable); \ + 
if (_resolved) { \ + resolvedlen = strlen (_resolved); \ + strncpy (path, _resolved, resolvedlen); \ + _state->resolved = memdup (path, pathlen); \ + if (resolvedlen == 1) /* only root resolved */ \ + pad = 0; \ + else { \ + pad = 1; \ + path[resolvedlen] = '/'; \ + } \ + strcpy_till (path + resolvedlen + pad, loc->path + resolvedlen + pad, '/'); \ + } else { \ + strncpy (path, _loc->path, pathlen); \ + } \ + _state->loc.path = path; \ + _state->loc.name = strrchr (path, '/'); \ + if (_state->loc.name) \ + _state->loc.name++; \ + _state->path = strdup (_loc->path); \ + }while (0); + +/* SERVER_DENTRY_UPDATE_STATE - update a server_state_t, to prepare state + * for new lookup + * + * @state - state to be updated. + */ +#define SERVER_DENTRY_UPDATE_STATE(_state) do { \ + char *path = NULL; \ + size_t pathlen = 0; \ + strcpy (_state->resolved, _state->loc.path); \ + pathlen = strlen (_state->loc.path); \ + if (!strcmp (_state->resolved, _state->path)) { \ + free (_state->resolved); \ + _state->resolved = NULL; \ + goto resume; \ + } \ + \ + path = (char *)(_state->loc.path + pathlen); \ + path[0] = '/'; \ + strcpy_till (path + 1, \ + _state->path + pathlen + 1, '/'); \ + _state->loc.name = strrchr (_state->loc.path, '/'); \ + if (_state->loc.name) \ + _state->loc.name++; \ + inode_unref (_state->loc.parent); \ + _state->loc.parent = inode_ref (_state->loc.inode); \ + inode_unref (_state->loc.inode); \ + _state->loc.inode = inode_new (_state->itable); \ + }while (0); + +/* NOTE: should be used only for a state which was created by __do_path_resolve + * using any other state will result in double free corruption. 
+ */ +#define SERVER_STATE_CLEANUP(_state) do { \ + if (_state->resolved) \ + free (_state->resolved); \ + if (_state->path) \ + free (_state->path); \ + server_loc_wipe (&_state->loc); \ + free_state (_state); \ + } while (0); + +/* strcpy_till - copy @dname to @dest, until 'delim' is encountered in @dest + * @dest - destination string + * @dname - source string + * @delim - delimiter character + * + * return - NULL is returned if '0' is encountered in @dname, otherwise returns + * a pointer to remaining string begining in @dest. + */ +static char * +strcpy_till (char *dest, const char *dname, char delim) +{ + char *src = NULL; + int idx = 0; + char *ret = NULL; + + src = (char *)dname; + while (src[idx] && (src[idx] != delim)) { + dest[idx] = src[idx]; + idx++; + } + + dest[idx] = 0; + + if (src[idx] == 0) + ret = NULL; + else + ret = &(src[idx]); + + return ret; +} + +/* __server_path_to_parenti - derive parent inode for @path. if immediate parent is + * not available in the dentry cache, return nearest + * available parent inode and set @reslv to the path of + * the returned directory. + * + * @itable - inode table + * @path - path whose parent has to be looked up. + * @reslv - if immediate parent is not available, reslv will be set to path of the + * resolved parent. + * + * return - should never return NULL. should at least return '/' inode. 
+ */ +static inode_t * +__server_path_to_parenti (inode_table_t *itable, + const char *path, + char **reslv) +{ + char *resolved_till = NULL; + char *strtokptr = NULL; + char *component = NULL; + char *next_component = NULL; + char *pathdup = NULL; + inode_t *curr = NULL; + inode_t *parent = NULL; + size_t pathlen = 0; + + + pathlen = STRLEN_0 (path); + resolved_till = CALLOC (1, pathlen); + + GF_VALIDATE_OR_GOTO("server-dentry", resolved_till, out); + pathdup = strdup (path); + GF_VALIDATE_OR_GOTO("server-dentry", pathdup, out); + + parent = inode_ref (itable->root); + curr = NULL; + + component = strtok_r (pathdup, "/", &strtokptr); + + while (component) { + curr = inode_search (itable, parent->ino, component); + if (!curr) { + /* if current component was the last component + set it to NULL + */ + component = strtok_r (NULL, "/", &strtokptr); + break; + } + + /* It is OK to append the component even if it is the + last component in the path, because, if 'next_component' + returns NULL, @parent will remain the same and + @resolved_till will not be sent back + */ + + strcat (resolved_till, "/"); + strcat (resolved_till, component); + + next_component = strtok_r (NULL, "/", &strtokptr); + + if (next_component) { + inode_unref (parent); + parent = curr; + curr = NULL; + } else { + /* will break */ + inode_unref (curr); + } + + component = next_component; + } + + free (pathdup); + + if (component) { + *reslv = resolved_till; + } else { + free (resolved_till); + } +out: + return parent; +} + + +/* __do_path_resolve_cbk - + * + * @frame - + * @cookie - + * @this - + * @op_ret - + * @op_errno - + * @inode - + * @stbuf - + * @dict - + * + */ +static int32_t +__do_path_resolve_cbk (call_frame_t *frame, + void *cookie, + xlator_t *this, + int32_t op_ret, + int32_t op_errno, + inode_t *inode, + struct stat *stbuf, + dict_t *dict) +{ + server_state_t *state = NULL; + call_stub_t *stub = NULL; + inode_t *parent = NULL; + + stub = frame->local; + state = CALL_STATE(frame); + + 
parent = state->loc.parent; + + if (op_ret == -1) { + if (strcmp (state->path, state->loc.path)) + parent = NULL; + + server_stub_resume (stub, op_ret, op_errno, NULL, parent); + goto cleanup; + } else { + if (inode->ino == 0) { + gf_log (BOUND_XL(frame)->name, GF_LOG_DEBUG, + "looked up for %s (%"PRId64"/%s)", + state->loc.path, state->loc.parent->ino, state->loc.name); + inode_link (inode, state->loc.parent, state->loc.name, stbuf); + inode_lookup (inode); + } + + if (state->resolved) { + SERVER_DENTRY_UPDATE_STATE(state); + + gf_log (BOUND_XL(frame)->name, GF_LOG_DEBUG, + "looking up for %s (%"PRId64"/%s)", + state->loc.path, state->loc.parent->ino, state->loc.name); + + STACK_WIND (frame, + __do_path_resolve_cbk, + BOUND_XL(frame), + BOUND_XL(frame)->fops->lookup, + &(state->loc), + 0); + + goto out; + } + resume: + /* we are done, call stub_resume() to do rest of the job */ + server_stub_resume (stub, op_ret, op_errno, inode, parent); + cleanup: + SERVER_STATE_CLEANUP(state); + /* stub will be freed by stub_resume, leave no traces */ + frame->local = NULL; + STACK_DESTROY (frame->root); + } +out: + return 0; +} + +/* __do_path_resolve - resolve @loc->path into @loc->inode and @loc->parent. also + * update the dentry cache + * + * @stub - call stub to resume after resolving @loc->path + * @loc - loc to resolve before resuming @stub. + * + * return - return value of __do_path_resolve doesn't matter to the caller, if @stub + * is not NULL. + */ +static int32_t +__do_path_resolve (call_stub_t *stub, + const loc_t *loc) +{ + int32_t ret = -1; + char *resolved = NULL; + call_frame_t *new_frame = NULL; + server_state_t *state = NULL, *new_state = NULL; + inode_t *parent = NULL; + + state = CALL_STATE(stub->frame); + parent = loc->parent; + if (parent) { + inode_ref (parent); + gf_log (BOUND_XL(stub->frame)->name, GF_LOG_DEBUG, + "loc->parent(%"PRId64") already present. 
sending lookup " + "for %"PRId64"/%s", parent->ino, parent->ino, loc->name); + resolved = strdup (loc->path); + resolved = dirname (resolved); + } else { + parent = __server_path_to_parenti (state->itable, loc->path, &resolved); + } + + if (parent == NULL) { + /* fire in the bush.. run! run!! run!!! */ + gf_log ("server", + GF_LOG_CRITICAL, + "failed to get parent inode number"); + goto panic; + } + + if (resolved) { + gf_log (BOUND_XL(stub->frame)->name, + GF_LOG_DEBUG, + "resolved path(%s) till %"PRId64"(%s). " + "sending lookup for remaining path", + loc->path, parent->ino, resolved); + } + + { + new_frame = server_copy_frame (stub->frame); + new_state = CALL_STATE(new_frame); + + SERVER_DENTRY_STATE_PREPARE(new_state, loc, parent, resolved); + + if (parent) + inode_unref (parent); /* __server_path_to_parenti()'s inode_ref */ + free (resolved); + /* now interpret state as: + * state->path - compelete pathname to resolve + * state->resolved - pathname resolved from dentry cache + */ + new_frame->local = stub; + STACK_WIND (new_frame, + __do_path_resolve_cbk, + BOUND_XL(new_frame), + BOUND_XL(new_frame)->fops->lookup, + &(new_state->loc), + 0); + goto out; + } +panic: + server_stub_resume (stub, -1, ENOENT, NULL, NULL); +out: + return ret; +} + + +/* + * do_path_lookup - transform a pathname into inode, with the compelete + * dentry tree upto inode built. + * + * @stub - call stub to resume after completing pathname to inode transform + * @loc - location. valid fields that do_path_lookup() uses in @loc are + * @loc->path - pathname + * @loc->ino - inode number + * + * return - do_path_lookup returns only after complete dentry tree is built + * upto @loc->path. 
+ */ +int32_t +do_path_lookup (call_stub_t *stub, + const loc_t *loc) +{ + char *pathname = NULL; + char *directory = NULL; + inode_t *inode = NULL; + inode_t *parent = NULL; + server_state_t *state = NULL; + + state = CALL_STATE(stub->frame); + + inode = inode_from_path (state->itable, loc->path); + pathname = strdup (loc->path); + directory = dirname (pathname); + parent = inode_from_path (state->itable, directory); + + if (inode && parent) { + gf_log (BOUND_XL(stub->frame)->name, + GF_LOG_DEBUG, + "resolved path(%s) to %"PRId64"/%"PRId64"(%s)", + loc->path, parent->ino, inode->ino, loc->name); + server_stub_resume (stub, 0, 0, inode, parent); + inode_unref (inode); + inode_unref (parent); + } else { + gf_log (BOUND_XL(stub->frame)->name, + GF_LOG_DEBUG, + "resolved path(%s) to %p(%"PRId64")/%p(%"PRId64")", + loc->path, parent, (parent ? parent->ino : 0), + inode, (inode ? inode->ino : 0)); + if (parent) { + inode_unref (parent); + } else if (inode) { + inode_unref (inode); + gf_log (BOUND_XL(stub->frame)->name, + GF_LOG_ERROR, + "undesired behaviour. inode(%"PRId64") for %s " + "exists without parent (%s)", + inode->ino, loc->path, directory); + } + __do_path_resolve (stub, loc); + } + + if (pathname) + free (pathname); + + return 0; +} diff --git a/xlators/protocol/server/src/server-helpers.c b/xlators/protocol/server/src/server-helpers.c new file mode 100644 index 000000000..b51c11aa9 --- /dev/null +++ b/xlators/protocol/server/src/server-helpers.c @@ -0,0 +1,586 @@ +/* + Copyright (c) 2006, 2007, 2008 Z RESEARCH, Inc. <http://www.zresearch.com> + This file is part of GlusterFS. + + GlusterFS is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published + by the Free Software Foundation; either version 3 of the License, + or (at your option) any later version. 
+ + GlusterFS is distributed in the hope that it will be useful, but + WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program. If not, see + <http://www.gnu.org/licenses/>. +*/ + +#ifndef _CONFIG_H +#define _CONFIG_H +#include "config.h" +#endif + +#include "server-protocol.h" +#include "server-helpers.h" + + +/* server_loc_fill - derive a loc_t for a given inode number + * + * NOTE: make sure that @loc is empty, because any pointers it holds with reference will + * be leaked after returning from here. + */ +int +server_loc_fill (loc_t *loc, server_state_t *state, + ino_t ino, ino_t par, + const char *name, const char *path) +{ + inode_t *inode = NULL; + inode_t *parent = NULL; + int32_t ret = -1; + char *dentry_path = NULL; + + + GF_VALIDATE_OR_GOTO ("server", loc, out); + GF_VALIDATE_OR_GOTO ("server", state, out); + GF_VALIDATE_OR_GOTO ("server", path, out); + + /* anything beyond this point is success */ + ret = 0; + loc->ino = ino; + inode = loc->inode; + if (inode == NULL) { + if (ino) + inode = inode_search (state->itable, ino, NULL); + + if ((inode == NULL) && + (par && name)) + inode = inode_search (state->itable, par, name); + + loc->inode = inode; + if (inode) + loc->ino = inode->ino; + } + + parent = loc->parent; + if (parent == NULL) { + if (inode) + parent = inode_parent (inode, par, name); + else + parent = inode_search (state->itable, par, NULL); + loc->parent = parent; + } + + if (name && parent) { + ret = inode_path (parent, name, &dentry_path); + if (ret < 0) { + gf_log (state->bound_xl->name, GF_LOG_DEBUG, + "failed to build path for %"PRId64"/%s: %s", + parent->ino, name, strerror (-ret)); + } + } else if (inode) { + ret = inode_path (inode, NULL, &dentry_path); + if (ret < 0) { + gf_log (state->bound_xl->name, GF_LOG_DEBUG, + "failed to 
build path for %"PRId64": %s", + inode->ino, strerror (-ret)); + + inode_unref (loc->inode); + loc->inode = NULL; + } + } + + if (dentry_path) { + if (strcmp (dentry_path, path)) { + gf_log (state->bound_xl->name, GF_LOG_DEBUG, + "paths differ for inode(%"PRId64"): " + "client path = %s. dentry path = %s", + ino, path, dentry_path); + } + + loc->path = dentry_path; + loc->name = strrchr (loc->path, '/'); + if (loc->name) + loc->name++; + } else { + loc->path = strdup (path); + loc->name = strrchr (loc->path, '/'); + if (loc->name) + loc->name++; + } + +out: + return ret; +} + +/* + * stat_to_str - convert struct stat to a ASCII string + * @stbuf: struct stat pointer + * + * not for external reference + */ +char * +stat_to_str (struct stat *stbuf) +{ + char *tmp_buf = NULL; + + uint64_t dev = stbuf->st_dev; + uint64_t ino = stbuf->st_ino; + uint32_t mode = stbuf->st_mode; + uint32_t nlink = stbuf->st_nlink; + uint32_t uid = stbuf->st_uid; + uint32_t gid = stbuf->st_gid; + uint64_t rdev = stbuf->st_rdev; + uint64_t size = stbuf->st_size; + uint32_t blksize = stbuf->st_blksize; + uint64_t blocks = stbuf->st_blocks; + uint32_t atime = stbuf->st_atime; + uint32_t mtime = stbuf->st_mtime; + uint32_t ctime = stbuf->st_ctime; + + uint32_t atime_nsec = ST_ATIM_NSEC(stbuf); + uint32_t mtime_nsec = ST_MTIM_NSEC(stbuf); + uint32_t ctime_nsec = ST_CTIM_NSEC(stbuf); + + + asprintf (&tmp_buf, + GF_STAT_PRINT_FMT_STR, + dev, + ino, + mode, + nlink, + uid, + gid, + rdev, + size, + blksize, + blocks, + atime, + atime_nsec, + mtime, + mtime_nsec, + ctime, + ctime_nsec); + + return tmp_buf; +} + + +void +server_loc_wipe (loc_t *loc) +{ + if (loc->parent) + inode_unref (loc->parent); + if (loc->inode) + inode_unref (loc->inode); + if (loc->path) + free ((char *)loc->path); +} + +void +free_state (server_state_t *state) +{ + transport_t *trans = NULL; + + trans = state->trans; + + if (state->fd) + fd_unref (state->fd); + + transport_unref (trans); + + if (state->xattr_req) + dict_unref 
(state->xattr_req); + + FREE (state); +} + + +call_frame_t * +server_copy_frame (call_frame_t *frame) +{ + call_frame_t *new_frame = NULL; + server_state_t *state = NULL, *new_state = NULL; + + state = frame->root->state; + + new_frame = copy_frame (frame); + + new_state = CALLOC (1, sizeof (server_state_t)); + + new_frame->root->op = frame->root->op; + new_frame->root->type = frame->root->type; + new_frame->root->trans = state->trans; + new_frame->root->state = new_state; + + new_state->bound_xl = state->bound_xl; + new_state->trans = transport_ref (state->trans); + new_state->itable = state->itable; + + return new_frame; +} + +int32_t +gf_add_locker (struct _lock_table *table, + loc_t *loc, + fd_t *fd, + pid_t pid) +{ + int32_t ret = -1; + struct _locker *new = NULL; + uint8_t dir = 0; + + new = CALLOC (1, sizeof (struct _locker)); + if (new == NULL) { + gf_log ("server", GF_LOG_ERROR, + "failed to allocate memory for \'struct _locker\'"); + goto out; + } + INIT_LIST_HEAD (&new->lockers); + + if (fd == NULL) { + loc_copy (&new->loc, loc); + dir = S_ISDIR (new->loc.inode->st_mode); + } else { + new->fd = fd_ref (fd); + dir = S_ISDIR (fd->inode->st_mode); + } + + new->pid = pid; + + LOCK (&table->lock); + { + if (dir) + list_add_tail (&new->lockers, &table->dir_lockers); + else + list_add_tail (&new->lockers, &table->file_lockers); + } + UNLOCK (&table->lock); +out: + return ret; +} + +int32_t +gf_del_locker (struct _lock_table *table, + loc_t *loc, + fd_t *fd, + pid_t pid) +{ + struct _locker *locker = NULL, *tmp = NULL; + int32_t ret = 0; + uint8_t dir = 0; + struct list_head *head = NULL; + struct list_head del; + + INIT_LIST_HEAD (&del); + + if (fd) { + dir = S_ISDIR (fd->inode->st_mode); + } else { + dir = S_ISDIR (loc->inode->st_mode); + } + + LOCK (&table->lock); + { + if (dir) { + head = &table->dir_lockers; + } else { + head = &table->file_lockers; + } + + list_for_each_entry_safe (locker, tmp, head, lockers) { + if (locker->fd && + fd && + (locker->fd == 
fd) && (locker->pid == pid)) { + list_move_tail (&locker->lockers, &del); + } else if (locker->loc.inode && + loc && + (locker->loc.inode == loc->inode) && + (locker->pid == pid)) { + list_move_tail (&locker->lockers, &del); + } + } + } + UNLOCK (&table->lock); + + tmp = NULL; + locker = NULL; + + list_for_each_entry_safe (locker, tmp, &del, lockers) { + list_del_init (&locker->lockers); + if (locker->fd) + fd_unref (locker->fd); + else + loc_wipe (&locker->loc); + + free (locker); + } + + return ret; +} + +int32_t +gf_direntry_to_bin (dir_entry_t *head, + char **bufferp) +{ + dir_entry_t *trav = NULL; + uint32_t len = 0; + uint32_t this_len = 0; + char *buffer = NULL; + size_t buflen = -1; + char *ptr = NULL; + char *tmp_buf = NULL; + + trav = head->next; + while (trav) { + len += strlen (trav->name); + len += 1; + len += strlen (trav->link); + len += 1; /* for '\n' */ + len += 256; // max possible for statbuf; + trav = trav->next; + } + + buffer = CALLOC (1, len); + if (buffer == NULL) { + gf_log ("server", GF_LOG_ERROR, + "failed to allocate memory for buffer"); + goto out; + } + + ptr = buffer; + trav = head->next; + while (trav) { + tmp_buf = stat_to_str (&trav->buf); + /* tmp_buf will have \n before \0 */ + + this_len = sprintf (ptr, "%s/%s%s\n", + trav->name, tmp_buf, + trav->link); + + FREE (tmp_buf); + trav = trav->next; + ptr += this_len; + } + if (bufferp) + *bufferp = buffer; + buflen = strlen (buffer); + +out: + return buflen; +} + + +static struct _lock_table * +gf_lock_table_new (void) +{ + struct _lock_table *new = NULL; + + new = CALLOC (1, sizeof (struct _lock_table)); + if (new == NULL) { + gf_log ("server-protocol", GF_LOG_CRITICAL, + "failed to allocate memory for new lock table"); + goto out; + } + INIT_LIST_HEAD (&new->dir_lockers); + INIT_LIST_HEAD (&new->file_lockers); + LOCK_INIT (&new->lock); +out: + return new; +} + + +int +server_connection_destroy (xlator_t *this, server_connection_t *conn) +{ + + call_frame_t *frame = NULL, *tmp_frame 
= NULL; + xlator_t *bound_xl = NULL; + int32_t ret = -1; + server_state_t *state = NULL; + struct list_head file_lockers; + struct list_head dir_lockers; + struct _lock_table *ltable = NULL; + struct _locker *locker = NULL, *tmp = NULL; + struct flock flock = {0,}; + + + bound_xl = (xlator_t *) (conn->bound_xl); + + if (bound_xl) { + /* trans will have ref_count = 1 after this call, but its + ok since this function is called in + GF_EVENT_TRANSPORT_CLEANUP */ + frame = create_frame (this, this->ctx->pool); + + pthread_mutex_lock (&(conn->lock)); + { + if (conn->ltable) { + ltable = conn->ltable; + conn->ltable = NULL; + } + } + pthread_mutex_unlock (&conn->lock); + + INIT_LIST_HEAD (&file_lockers); + INIT_LIST_HEAD (&dir_lockers); + + LOCK (<able->lock); + { + list_splice_init (<able->file_lockers, + &file_lockers); + + list_splice_init (<able->dir_lockers, &dir_lockers); + } + UNLOCK (<able->lock); + free (ltable); + + flock.l_type = F_UNLCK; + flock.l_start = 0; + flock.l_len = 0; + list_for_each_entry_safe (locker, + tmp, &file_lockers, lockers) { + tmp_frame = copy_frame (frame); + /* + pid = 0 is a special case that tells posix-locks + to release all locks from this transport + */ + tmp_frame->root->pid = 0; + tmp_frame->root->trans = conn; + + if (locker->fd) { + STACK_WIND (tmp_frame, server_nop_cbk, + bound_xl, + bound_xl->fops->finodelk, + locker->fd, F_SETLK, &flock); + fd_unref (locker->fd); + } else { + STACK_WIND (tmp_frame, server_nop_cbk, + bound_xl, + bound_xl->fops->inodelk, + &(locker->loc), F_SETLK, &flock); + loc_wipe (&locker->loc); + } + + list_del_init (&locker->lockers); + free (locker); + } + + tmp = NULL; + locker = NULL; + list_for_each_entry_safe (locker, tmp, &dir_lockers, lockers) { + tmp_frame = copy_frame (frame); + + tmp_frame->root->pid = 0; + tmp_frame->root->trans = conn; + + if (locker->fd) { + STACK_WIND (tmp_frame, server_nop_cbk, + bound_xl, + bound_xl->fops->fentrylk, + locker->fd, NULL, + ENTRYLK_UNLOCK, ENTRYLK_WRLCK); + 
fd_unref (locker->fd); + } else { + STACK_WIND (tmp_frame, server_nop_cbk, + bound_xl, + bound_xl->fops->entrylk, + &(locker->loc), NULL, + ENTRYLK_UNLOCK, ENTRYLK_WRLCK); + loc_wipe (&locker->loc); + } + + list_del_init (&locker->lockers); + free (locker); + } + + state = CALL_STATE (frame); + if (state) + free (state); + STACK_DESTROY (frame->root); + + pthread_mutex_lock (&(conn->lock)); + { + if (conn->fdtable) { + gf_fd_fdtable_destroy (conn->fdtable); + conn->fdtable = NULL; + } + } + pthread_mutex_unlock (&conn->lock); + + } + + gf_log (this->name, GF_LOG_INFO, "destroyed connection of %s", + conn->id); + + FREE (conn->id); + FREE (conn); + + return ret; +} + + +server_connection_t * +server_connection_get (xlator_t *this, const char *id) +{ + server_connection_t *conn = NULL; + server_connection_t *trav = NULL; + server_conf_t *conf = NULL; + + conf = this->private; + + pthread_mutex_lock (&conf->mutex); + { + list_for_each_entry (trav, &conf->conns, list) { + if (!strcmp (id, trav->id)) { + conn = trav; + break; + } + } + + if (!conn) { + conn = (void *) CALLOC (1, sizeof (*conn)); + + conn->id = strdup (id); + conn->fdtable = gf_fd_fdtable_alloc (); + conn->ltable = gf_lock_table_new (); + + pthread_mutex_init (&conn->lock, NULL); + + list_add (&conn->list, &conf->conns); + } + + conn->ref++; + } + pthread_mutex_unlock (&conf->mutex); + + return conn; +} + + +void +server_connection_put (xlator_t *this, server_connection_t *conn) +{ + server_conf_t *conf = NULL; + server_connection_t *todel = NULL; + + conf = this->private; + + pthread_mutex_lock (&conf->mutex); + { + conn->ref--; + + if (!conn->ref) { + list_del_init (&conn->list); + todel = conn; + } + } + pthread_mutex_unlock (&conf->mutex); + + if (todel) { + server_connection_destroy (this, todel); + } + + return; +} diff --git a/xlators/protocol/server/src/server-helpers.h b/xlators/protocol/server/src/server-helpers.h new file mode 100644 index 000000000..36c0ce98e --- /dev/null +++ 
b/xlators/protocol/server/src/server-helpers.h @@ -0,0 +1,77 @@ +/* + Copyright (c) 2006, 2007, 2008 Z RESEARCH, Inc. <http://www.zresearch.com> + This file is part of GlusterFS. + + GlusterFS is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published + by the Free Software Foundation; either version 3 of the License, + or (at your option) any later version. + + GlusterFS is distributed in the hope that it will be useful, but + WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program. If not, see + <http://www.gnu.org/licenses/>. +*/ + +#ifndef __SERVER_HELPERS_H__ +#define __SERVER_HELPERS_H__ + +#define CALL_STATE(frame) ((server_state_t *)frame->root->state) + +#define BOUND_XL(frame) ((xlator_t *) CALL_STATE(frame)->bound_xl) + +#define TRANSPORT_FROM_FRAME(frame) ((transport_t *) CALL_STATE(frame)->trans) + +#define SERVER_CONNECTION(frame) \ + ((server_connection_t *) TRANSPORT_FROM_FRAME(frame)->xl_private) + +#define SERVER_CONF(frame) \ + ((server_conf_t *)TRANSPORT_FROM_FRAME(frame)->xl->private) + +#define TRANSPORT_FROM_XLATOR(this) ((((server_conf_t *)this->private))->trans) + +#define INODE_LRU_LIMIT(this) \ + (((server_conf_t *)(this->private))->inode_lru_limit) + +#define IS_ROOT_INODE(inode) (inode == inode->table->root) + +#define IS_NOT_ROOT(pathlen) ((pathlen > 2)? 
1 : 0) + +int32_t +server_loc_fill (loc_t *loc, + server_state_t *state, + ino_t ino, + ino_t par, + const char *name, + const char *path); + +char * +stat_to_str (struct stat *stbuf); + +call_frame_t * +server_copy_frame (call_frame_t *frame); + +void free_state (server_state_t *state); + +void server_loc_wipe (loc_t *loc); + +int32_t +gf_add_locker (struct _lock_table *table, + loc_t *loc, + fd_t *fd, + pid_t pid); + +int32_t +gf_del_locker (struct _lock_table *table, + loc_t *loc, + fd_t *fd, + pid_t pid); + +int32_t +gf_direntry_to_bin (dir_entry_t *head, + char **bufferp); +#endif /* __SERVER_HELPERS_H__ */ diff --git a/xlators/protocol/server/src/server-protocol.c b/xlators/protocol/server/src/server-protocol.c new file mode 100644 index 000000000..a5198c1ed --- /dev/null +++ b/xlators/protocol/server/src/server-protocol.c @@ -0,0 +1,7984 @@ +/* + Copyright (c) 2006, 2007, 2008 Z RESEARCH, Inc. <http://www.zresearch.com> + This file is part of GlusterFS. + + GlusterFS is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published + by the Free Software Foundation; either version 3 of the License, + or (at your option) any later version. + + GlusterFS is distributed in the hope that it will be useful, but + WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program. If not, see + <http://www.gnu.org/licenses/>. 
+*/ + + +#ifndef _CONFIG_H +#define _CONFIG_H +#include "config.h" +#endif +#include <time.h> +#include <sys/uio.h> +#include <sys/resource.h> + +#include <libgen.h> + +#include "transport.h" +#include "fnmatch.h" +#include "xlator.h" +#include "protocol.h" +#include "server-protocol.h" +#include "server-helpers.h" +#include "call-stub.h" +#include "defaults.h" +#include "list.h" +#include "dict.h" +#include "compat.h" +#include "compat-errno.h" + + +static void +protocol_server_reply (call_frame_t *frame, + int type, int op, + gf_hdr_common_t *hdr, size_t hdrlen, + struct iovec *vector, int count, + dict_t *refs) +{ + server_state_t *state = NULL; + xlator_t *bound_xl = NULL; + transport_t *trans = NULL; + + bound_xl = BOUND_XL(frame); + state = CALL_STATE(frame); + trans = state->trans; + + hdr->callid = hton64 (frame->root->unique); + hdr->type = hton32 (type); + hdr->op = hton32 (op); + + transport_submit (trans, (char *)hdr, hdrlen, vector, count, refs); + /* TODO: If transport submit fails, there is no reply sent to client, + * its bailed out as of now.. loggically, only this frame should fail. + */ + + STACK_DESTROY (frame->root); + + if (state) + free_state (state); + +} + + +/* + * server_fchmod_cbk + */ +int32_t +server_fchmod_cbk (call_frame_t *frame, + void *cookie, + xlator_t *this, + int32_t op_ret, + int32_t op_errno, + struct stat *stbuf) +{ + gf_hdr_common_t *hdr = NULL; + gf_fop_fchmod_rsp_t *rsp = NULL; + size_t hdrlen = 0; + int32_t gf_errno = 0; + server_state_t *state = NULL; + + hdrlen = gf_hdr_len (rsp, 0); + hdr = gf_hdr_new (rsp, 0); + rsp = gf_param (hdr); + + hdr->rsp.op_ret = hton32 (op_ret); + gf_errno = gf_errno_to_error (op_errno); + hdr->rsp.op_errno = hton32 (gf_errno); + + if (op_ret == 0) { + gf_stat_from_stat (&rsp->stat, stbuf); + } else { + state = CALL_STATE(frame); + + gf_log (this->name, GF_LOG_DEBUG, + "%"PRId64": FCHMOD %"PRId64" (%"PRId64") ==> %"PRId32" (%s)", + frame->root->unique, state->fd_no, + state->fd ? 
state->fd->inode->ino : 0, op_ret, + strerror (op_errno)); + } + + protocol_server_reply (frame, GF_OP_TYPE_FOP_REPLY, GF_FOP_FCHMOD, + hdr, hdrlen, NULL, 0, NULL); + + return 0; +} + +/* + * server_fchmod + * + */ +int32_t +server_fchmod (call_frame_t *frame, + xlator_t *bound_xl, + gf_hdr_common_t *hdr, size_t hdrlen, + char *buf, size_t buflen) +{ + server_connection_t *conn = NULL; + gf_fop_fchmod_req_t *req = NULL; + server_state_t *state = NULL; + + conn = SERVER_CONNECTION(frame); + + req = gf_param (hdr); + state = CALL_STATE(frame); + { + state->fd_no = ntoh64 (req->fd); + if (state->fd_no >= 0) + state->fd = gf_fd_fdptr_get (conn->fdtable, + state->fd_no); + + state->mode = ntoh32 (req->mode); + } + + GF_VALIDATE_OR_GOTO(bound_xl->name, state->fd, fail); + + STACK_WIND (frame, + server_fchmod_cbk, + BOUND_XL(frame), + BOUND_XL(frame)->fops->fchmod, + state->fd, + state->mode); + + return 0; +fail: + server_fchmod_cbk (frame, NULL, frame->this, + -1, EINVAL, NULL); + return 0; +} + + +/* + * server_fchown_cbk + */ +int32_t +server_fchown_cbk (call_frame_t *frame, + void *cookie, + xlator_t *this, + int32_t op_ret, + int32_t op_errno, + struct stat *stbuf) +{ + gf_hdr_common_t *hdr = NULL; + gf_fop_fchown_rsp_t *rsp = NULL; + size_t hdrlen = 0; + int32_t gf_errno = 0; + server_state_t *state = NULL; + + hdrlen = gf_hdr_len (rsp, 0); + hdr = gf_hdr_new (rsp, 0); + rsp = gf_param (hdr); + + hdr->rsp.op_ret = hton32 (op_ret); + gf_errno = gf_errno_to_error (op_errno); + hdr->rsp.op_errno = hton32 (gf_errno); + + if (op_ret == 0) { + gf_stat_from_stat (&rsp->stat, stbuf); + } else { + state = CALL_STATE(frame); + + gf_log (this->name, GF_LOG_DEBUG, + "%"PRId64": FCHOWN %"PRId64" (%"PRId64") ==> %"PRId32" (%s)", + frame->root->unique, state->fd_no, + state->fd ? 
state->fd->inode->ino : 0, op_ret, + strerror (op_errno)); + } + + protocol_server_reply (frame, GF_OP_TYPE_FOP_REPLY, GF_FOP_FCHOWN, + hdr, hdrlen, NULL, 0, NULL); + + return 0; +} + +/* + * server_fchown + * + */ +int32_t +server_fchown (call_frame_t *frame, + xlator_t *bound_xl, + gf_hdr_common_t *hdr, size_t hdrlen, + char *buf, size_t buflen) +{ + server_connection_t *conn = NULL; + gf_fop_fchown_req_t *req = NULL; + server_state_t *state = NULL; + + conn = SERVER_CONNECTION(frame); + + req = gf_param (hdr); + state = CALL_STATE(frame); + { + state->fd_no = ntoh64 (req->fd); + if (state->fd_no >= 0) + state->fd = gf_fd_fdptr_get (conn->fdtable, + state->fd_no); + + state->uid = ntoh32 (req->uid); + state->gid = ntoh32 (req->gid); + } + + GF_VALIDATE_OR_GOTO(bound_xl->name, state->fd, fail); + + STACK_WIND (frame, + server_fchown_cbk, + BOUND_XL(frame), + BOUND_XL(frame)->fops->fchown, + state->fd, + state->uid, + state->gid); + + return 0; +fail: + server_fchown_cbk (frame, NULL, frame->this, + -1, EINVAL, NULL); + return 0; +} + +/* + * server_setdents_cbk - writedir callback for server protocol + * @frame: call frame + * @cookie: + * @this: + * @op_ret: return value + * @op_errno: errno + * + * not for external reference + */ +int32_t +server_setdents_cbk (call_frame_t *frame, + void *cookie, + xlator_t *this, + int32_t op_ret, + int32_t op_errno) +{ + gf_hdr_common_t *hdr = NULL; + gf_fop_setdents_rsp_t *rsp = NULL; + size_t hdrlen = 0; + int32_t gf_errno = 0; + + hdrlen = gf_hdr_len (rsp, 0); + hdr = gf_hdr_new (rsp, 0); + rsp = gf_param (hdr); + + hdr->rsp.op_ret = hton32 (op_ret); + gf_errno = gf_errno_to_error (op_errno); + hdr->rsp.op_errno = hton32 (gf_errno); + + protocol_server_reply (frame, GF_OP_TYPE_FOP_REPLY, GF_FOP_SETDENTS, + hdr, hdrlen, NULL, 0, NULL); + + return 0; +} + +/* + * server_lk_cbk - lk callback for server protocol + * @frame: call frame + * @cookie: + * @this: + * @op_ret: + * @op_errno: + * @lock: + * + * not for external 
reference + */ +int32_t +server_lk_cbk (call_frame_t *frame, + void *cookie, + xlator_t *this, + int32_t op_ret, + int32_t op_errno, + struct flock *lock) +{ + gf_hdr_common_t *hdr = NULL; + gf_fop_lk_rsp_t *rsp = NULL; + size_t hdrlen = 0; + int32_t gf_errno = 0; + server_state_t *state = NULL; + + hdrlen = gf_hdr_len (rsp, 0); + hdr = gf_hdr_new (rsp, 0); + rsp = gf_param (hdr); + + hdr->rsp.op_ret = hton32 (op_ret); + gf_errno = gf_errno_to_error (op_errno); + hdr->rsp.op_errno = hton32 (gf_errno); + + if (op_ret == 0) { + gf_flock_from_flock (&rsp->flock, lock); + } else if (op_errno != ENOSYS) { + state = CALL_STATE(frame); + + gf_log (this->name, GF_LOG_DEBUG, + "%"PRId64": LK %"PRId64" (%"PRId64") ==> %"PRId32" (%s)", + frame->root->unique, state->fd_no, + state->fd ? state->fd->inode->ino : 0, op_ret, + strerror (op_errno)); + } + + protocol_server_reply (frame, GF_OP_TYPE_FOP_REPLY, GF_FOP_LK, + hdr, hdrlen, NULL, 0, NULL); + + return 0; +} + + +int32_t +server_inodelk_cbk (call_frame_t *frame, void *cookie, + xlator_t *this, int32_t op_ret, int32_t op_errno) +{ + server_connection_t *conn = NULL; + gf_hdr_common_t *hdr = NULL; + gf_fop_inodelk_rsp_t *rsp = NULL; + server_state_t *state = NULL; + size_t hdrlen = 0; + int32_t gf_errno = 0; + + conn = SERVER_CONNECTION(frame); + + state = CALL_STATE(frame); + + hdrlen = gf_hdr_len (rsp, 0); + hdr = gf_hdr_new (rsp, 0); + rsp = gf_param (hdr); + + hdr->rsp.op_ret = hton32 (op_ret); + gf_errno = gf_errno_to_error (op_errno); + hdr->rsp.op_errno = hton32 (gf_errno); + + if (op_ret >= 0) { + if (state->flock.l_type == F_UNLCK) + gf_del_locker (conn->ltable, + &state->loc, NULL, frame->root->pid); + else + gf_add_locker (conn->ltable, + &state->loc, NULL, frame->root->pid); + } else if (op_errno != ENOSYS) { + gf_log (this->name, GF_LOG_DEBUG, + "%"PRId64": INODELK %s (%"PRId64") ==> %"PRId32" (%s)", + frame->root->unique, state->loc.path, + state->loc.inode ? 
state->loc.inode->ino : 0, op_ret, + strerror (op_errno)); + } + + server_loc_wipe (&state->loc); + + protocol_server_reply (frame, GF_OP_TYPE_FOP_REPLY, GF_FOP_INODELK, + hdr, hdrlen, NULL, 0, NULL); + + return 0; +} + + +int32_t +server_finodelk_cbk (call_frame_t *frame, void *cookie, + xlator_t *this, int32_t op_ret, int32_t op_errno) +{ + server_connection_t *conn = NULL; + gf_hdr_common_t *hdr = NULL; + gf_fop_finodelk_rsp_t *rsp = NULL; + server_state_t *state = NULL; + size_t hdrlen = 0; + int32_t gf_errno = 0; + + conn = SERVER_CONNECTION(frame); + + hdrlen = gf_hdr_len (rsp, 0); + hdr = gf_hdr_new (rsp, 0); + rsp = gf_param (hdr); + + hdr->rsp.op_ret = hton32 (op_ret); + gf_errno = gf_errno_to_error (op_errno); + hdr->rsp.op_errno = hton32 (gf_errno); + + state = CALL_STATE(frame); + + if (op_ret >= 0) { + if (state->flock.l_type == F_UNLCK) + gf_del_locker (conn->ltable, + NULL, state->fd, frame->root->pid); + else + gf_add_locker (conn->ltable, + NULL, state->fd, frame->root->pid); + } else if (op_errno != ENOSYS) { + gf_log (this->name, GF_LOG_DEBUG, + "%"PRId64": FINODELK %"PRId64" (%"PRId64") ==> %"PRId32" (%s)", + frame->root->unique, state->fd_no, + state->fd ? 
state->fd->inode->ino : 0, op_ret, + strerror (op_errno)); + } + + protocol_server_reply (frame, GF_OP_TYPE_FOP_REPLY, GF_FOP_FINODELK, + hdr, hdrlen, NULL, 0, NULL); + + return 0; +} + + +/* + * server_entrylk_cbk - + * @frame: call frame + * @cookie: + * @this: + * @op_ret: + * @op_errno: + * @lock: + * + * not for external reference + */ +int32_t +server_entrylk_cbk (call_frame_t *frame, void *cookie, + xlator_t *this, int32_t op_ret, int32_t op_errno) +{ + server_connection_t *conn = NULL; + gf_hdr_common_t *hdr = NULL; + gf_fop_entrylk_rsp_t *rsp = NULL; + server_state_t *state = NULL; + size_t hdrlen = 0; + int32_t gf_errno = 0; + + conn = SERVER_CONNECTION(frame); + + state = CALL_STATE(frame); + + hdrlen = gf_hdr_len (rsp, 0); + hdr = gf_hdr_new (rsp, 0); + rsp = gf_param (hdr); + + hdr->rsp.op_ret = hton32 (op_ret); + gf_errno = gf_errno_to_error (op_errno); + hdr->rsp.op_errno = hton32 (gf_errno); + + if (op_ret >= 0) { + if (state->cmd == ENTRYLK_UNLOCK) + gf_del_locker (conn->ltable, + &state->loc, NULL, frame->root->pid); + else + gf_add_locker (conn->ltable, + &state->loc, NULL, frame->root->pid); + } else if (op_errno != ENOSYS) { + gf_log (this->name, GF_LOG_DEBUG, + "%"PRId64": INODELK %s (%"PRId64") ==> %"PRId32" (%s)", + frame->root->unique, state->loc.path, + state->loc.inode ? 
state->loc.inode->ino : 0, op_ret, + strerror (op_errno)); + } + + server_loc_wipe (&state->loc); + + protocol_server_reply (frame, GF_OP_TYPE_FOP_REPLY, GF_FOP_ENTRYLK, + hdr, hdrlen, NULL, 0, NULL); + + return 0; +} + + +int32_t +server_fentrylk_cbk (call_frame_t *frame, void *cookie, + xlator_t *this, int32_t op_ret, int32_t op_errno) +{ + server_connection_t *conn = NULL; + gf_hdr_common_t *hdr = NULL; + gf_fop_fentrylk_rsp_t *rsp = NULL; + server_state_t *state = NULL; + size_t hdrlen = 0; + int32_t gf_errno = 0; + + conn = SERVER_CONNECTION(frame); + + hdrlen = gf_hdr_len (rsp, 0); + hdr = gf_hdr_new (rsp, 0); + rsp = gf_param (hdr); + + hdr->rsp.op_ret = hton32 (op_ret); + gf_errno = gf_errno_to_error (op_errno); + hdr->rsp.op_errno = hton32 (gf_errno); + + if (op_ret >= 0) { + state = CALL_STATE(frame); + if (state->cmd == ENTRYLK_UNLOCK) + gf_del_locker (conn->ltable, + NULL, state->fd, frame->root->pid); + else + gf_add_locker (conn->ltable, + NULL, state->fd, frame->root->pid); + } else if (op_errno != ENOSYS) { + gf_log (this->name, GF_LOG_DEBUG, + "%"PRId64": FENTRYLK %"PRId64" (%"PRId64") ==> %"PRId32" (%s)", + frame->root->unique, state->fd_no, + state->fd ? 
state->fd->inode->ino : 0, op_ret, + strerror (op_errno)); + } + + protocol_server_reply (frame, GF_OP_TYPE_FOP_REPLY, GF_FOP_FENTRYLK, + hdr, hdrlen, NULL, 0, NULL); + + return 0; +} + + +/* + * server_access_cbk - access callback for server protocol + * @frame: call frame + * @cookie: + * @this: + * @op_ret: + * @op_errno: + * + * not for external reference + */ +int32_t +server_access_cbk (call_frame_t *frame, + void *cookie, + xlator_t *this, + int32_t op_ret, + int32_t op_errno) +{ + gf_hdr_common_t *hdr = NULL; + gf_fop_access_rsp_t *rsp = NULL; + server_state_t *state = NULL; + size_t hdrlen = 0; + int32_t gf_errno = 0; + + state = CALL_STATE(frame); + + hdrlen = gf_hdr_len (rsp, 0); + hdr = gf_hdr_new (rsp, 0); + rsp = gf_param (hdr); + + hdr->rsp.op_ret = hton32 (op_ret); + gf_errno = gf_errno_to_error (op_errno); + hdr->rsp.op_errno = hton32 (gf_errno); + + server_loc_wipe (&(state->loc)); + + protocol_server_reply (frame, GF_OP_TYPE_FOP_REPLY, GF_FOP_ACCESS, + hdr, hdrlen, NULL, 0, NULL); + + return 0; +} + +/* + * server_utimens_cbk - utimens callback for server protocol + * @frame: call frame + * @cookie: + * @this: + * @op_ret: + * @op_errno: + * @stbuf: + * + * not for external reference + */ +int32_t +server_utimens_cbk (call_frame_t *frame, + void *cookie, + xlator_t *this, + int32_t op_ret, + int32_t op_errno, + struct stat *stbuf) +{ + gf_hdr_common_t *hdr = NULL; + gf_fop_utimens_rsp_t *rsp = NULL; + server_state_t *state = NULL; + size_t hdrlen = 0; + int32_t gf_errno = 0; + state = CALL_STATE(frame); + + hdrlen = gf_hdr_len (rsp, 0); + hdr = gf_hdr_new (rsp, 0); + rsp = gf_param (hdr); + + hdr->rsp.op_ret = hton32 (op_ret); + gf_errno = gf_errno_to_error (op_errno); + hdr->rsp.op_errno = hton32 (gf_errno); + + if (op_ret == 0) + gf_stat_from_stat (&rsp->stat, stbuf); + + server_loc_wipe (&(state->loc)); + + protocol_server_reply (frame, GF_OP_TYPE_FOP_REPLY, GF_FOP_UTIMENS, + hdr, hdrlen, NULL, 0, NULL); + + return 0; +} + +/* + * 
server_chmod_cbk - chmod callback for server protocol + * @frame: call frame + * @cookie: + * @this: + * @op_ret: + * @op_errno: + * @stbuf: + * + * not for external reference + */ +int32_t +server_chmod_cbk (call_frame_t *frame, + void *cookie, + xlator_t *this, + int32_t op_ret, + int32_t op_errno, + struct stat *stbuf) +{ + gf_hdr_common_t *hdr = NULL; + gf_fop_chmod_rsp_t *rsp = NULL; + server_state_t *state = NULL; + size_t hdrlen = 0; + int32_t gf_errno = 0; + + state = CALL_STATE(frame); + + hdrlen = gf_hdr_len (rsp, 0); + hdr = gf_hdr_new (rsp, 0); + rsp = gf_param (hdr); + + hdr->rsp.op_ret = hton32 (op_ret); + gf_errno = gf_errno_to_error (op_errno); + hdr->rsp.op_errno = hton32 (gf_errno); + + if (op_ret == 0) + gf_stat_from_stat (&rsp->stat, stbuf); + + server_loc_wipe (&(state->loc)); + + protocol_server_reply (frame, GF_OP_TYPE_FOP_REPLY, GF_FOP_CHMOD, + hdr, hdrlen, NULL, 0, NULL); + + return 0; +} + +/* + * server_chown_cbk - chown callback for server protocol + * @frame: call frame + * @cookie: + * @this: + * @op_ret: + * @op_errno: + * @stbuf: + * + * not for external reference + */ +int32_t +server_chown_cbk (call_frame_t *frame, + void *cookie, + xlator_t *this, + int32_t op_ret, + int32_t op_errno, + struct stat *stbuf) +{ + gf_hdr_common_t *hdr = NULL; + gf_fop_chown_rsp_t *rsp = NULL; + server_state_t *state = NULL; + int32_t gf_errno = 0; + size_t hdrlen = 0; + + state = CALL_STATE(frame); + + hdrlen = gf_hdr_len (rsp, 0); + hdr = gf_hdr_new (rsp, 0); + rsp = gf_param (hdr); + + hdr->rsp.op_ret = hton32 (op_ret); + gf_errno = gf_errno_to_error (op_errno); + hdr->rsp.op_errno = hton32 (gf_errno); + + if (op_ret == 0) + gf_stat_from_stat (&rsp->stat, stbuf); + + server_loc_wipe (&(state->loc)); + + protocol_server_reply (frame, GF_OP_TYPE_FOP_REPLY, GF_FOP_CHOWN, + hdr, hdrlen, NULL, 0, NULL); + + return 0; +} + +/* + * server_rmdir_cbk - rmdir callback for server protocol + * @frame: call frame + * @cookie: + * @this: + * @op_ret: + * 
@op_errno: + * + * not for external reference + */ +int32_t +server_rmdir_cbk (call_frame_t *frame, + void *cookie, + xlator_t *this, + int32_t op_ret, + int32_t op_errno) +{ + gf_hdr_common_t *hdr = NULL; + gf_fop_rmdir_rsp_t *rsp = NULL; + server_state_t *state = NULL; + int32_t gf_errno = 0; + size_t hdrlen = 0; + + state = CALL_STATE(frame); + + if (op_ret == 0) { + inode_unlink (state->loc.inode, state->loc.parent, + state->loc.name); + } else { + gf_log (this->name, GF_LOG_DEBUG, + "%"PRId64": RMDIR %s (%"PRId64") ==> %"PRId32" (%s)", + frame->root->unique, state->loc.path, + state->loc.inode ? state->loc.inode->ino : 0, + op_ret, strerror (op_errno)); + } + + hdrlen = gf_hdr_len (rsp, 0); + hdr = gf_hdr_new (rsp, 0); + rsp = gf_param (hdr); + + hdr->rsp.op_ret = hton32 (op_ret); + gf_errno = gf_errno_to_error (op_errno); + hdr->rsp.op_errno = hton32 (gf_errno); + + server_loc_wipe (&(state->loc)); + + protocol_server_reply (frame, GF_OP_TYPE_FOP_REPLY, GF_FOP_RMDIR, + hdr, hdrlen, NULL, 0, NULL); + + return 0; +} + +/* + * server_mkdir_cbk - mkdir callback for server protocol + * @frame: call frame + * @cookie: + * @this: + * @op_ret: + * @op_errno: + * @stbuf: + * + * not for external reference + */ +int32_t +server_mkdir_cbk (call_frame_t *frame, + void *cookie, + xlator_t *this, + int32_t op_ret, + int32_t op_errno, + inode_t *inode, + struct stat *stbuf) +{ + gf_hdr_common_t *hdr = NULL; + gf_fop_mkdir_rsp_t *rsp = NULL; + server_state_t *state = NULL; + size_t hdrlen = 0; + int32_t gf_errno = 0; + + state = CALL_STATE(frame); + + hdrlen = gf_hdr_len (rsp, 0); + hdr = gf_hdr_new (rsp, 0); + rsp = gf_param (hdr); + + hdr->rsp.op_ret = hton32 (op_ret); + gf_errno = gf_errno_to_error (op_errno); + hdr->rsp.op_errno = hton32 (gf_errno); + + if (op_ret >= 0) { + gf_stat_from_stat (&rsp->stat, stbuf); + inode_link (inode, state->loc.parent, state->loc.name, stbuf); + inode_lookup (inode); + } else { + gf_log (this->name, GF_LOG_DEBUG, + "%"PRId64": MKDIR %s 
==> %"PRId32" (%s)", + frame->root->unique, state->loc.path, + op_ret, strerror (op_errno)); + } + + server_loc_wipe (&(state->loc)); + + protocol_server_reply (frame, GF_OP_TYPE_FOP_REPLY, GF_FOP_MKDIR, + hdr, hdrlen, NULL, 0, NULL); + + return 0; +} + +/* + * server_mknod_cbk - mknod callback for server protocol + * @frame: call frame + * @cookie: + * @this: + * @op_ret: + * @op_errno: + * @stbuf: + * + * not for external reference + */ +int32_t +server_mknod_cbk (call_frame_t *frame, + void *cookie, + xlator_t *this, + int32_t op_ret, + int32_t op_errno, + inode_t *inode, + struct stat *stbuf) +{ + gf_hdr_common_t *hdr = NULL; + gf_fop_mknod_rsp_t *rsp = NULL; + server_state_t *state = NULL; + int32_t gf_errno = 0; + size_t hdrlen = 0; + + state = CALL_STATE(frame); + + hdrlen = gf_hdr_len (rsp, 0); + hdr = gf_hdr_new (rsp, 0); + rsp = gf_param (hdr); + + hdr->rsp.op_ret = hton32 (op_ret); + gf_errno = gf_errno_to_error (op_errno); + hdr->rsp.op_errno = hton32 (gf_errno); + + if (op_ret >= 0) { + gf_stat_from_stat (&rsp->stat, stbuf); + inode_link (inode, state->loc.parent, state->loc.name, stbuf); + inode_lookup (inode); + } else { + gf_log (this->name, GF_LOG_DEBUG, + "%"PRId64": MKNOD %s ==> %"PRId32" (%s)", + frame->root->unique, state->loc.path, + op_ret, strerror (op_errno)); + } + + server_loc_wipe (&(state->loc)); + + protocol_server_reply (frame, GF_OP_TYPE_FOP_REPLY, GF_FOP_MKNOD, + hdr, hdrlen, NULL, 0, NULL); + + return 0; +} + +/* + * server_fsyncdir_cbk - fsyncdir callback for server protocol + * @frame: call frame + * @cookie: + * @this: + * @op_ret: + * @op_errno: + * + * not for external reference + */ +int32_t +server_fsyncdir_cbk (call_frame_t *frame, + void *cookie, + xlator_t *this, + int32_t op_ret, + int32_t op_errno) +{ + gf_hdr_common_t *hdr = NULL; + gf_fop_fsyncdir_rsp_t *rsp = NULL; + size_t hdrlen = 0; + int32_t gf_errno = 0; + server_state_t *state = NULL; + + hdrlen = gf_hdr_len (rsp, 0); + hdr = gf_hdr_new (rsp, 0); + rsp = 
gf_param (hdr); + + if (op_ret < 0) { + state = CALL_STATE(frame); + + gf_log (this->name, GF_LOG_DEBUG, + "%"PRId64": FSYNCDIR %"PRId64" (%"PRId64") ==> %"PRId32" (%s)", + frame->root->unique, state->fd_no, + state->fd ? state->fd->inode->ino : 0, op_ret, + strerror (op_errno)); + } + + hdr->rsp.op_ret = hton32 (op_ret); + gf_errno = gf_errno_to_error (op_errno); + hdr->rsp.op_errno = hton32 (gf_errno); + + protocol_server_reply (frame, GF_OP_TYPE_FOP_REPLY, GF_FOP_FSYNCDIR, + hdr, hdrlen, NULL, 0, NULL); + + return 0; +} + + +/* + * server_getdents_cbk - readdir callback for server protocol + * @frame: call frame + * @cookie: + * @this: + * @op_ret: return value + * @op_errno: errno + * @entries: + * @count: + * + * not for external reference + */ +int32_t +server_getdents_cbk (call_frame_t *frame, + void *cookie, + xlator_t *this, + int32_t op_ret, + int32_t op_errno, + dir_entry_t *entries, + int32_t count) +{ + gf_hdr_common_t *hdr = NULL; + gf_fop_getdents_rsp_t *rsp = NULL; + size_t hdrlen = 0; + int32_t vec_count = 0; + int32_t gf_errno = 0; + int32_t ret = -1; + dict_t *reply_dict = NULL; + char *buffer = NULL; + size_t buflen = 0; + struct iovec vector[1]; + server_state_t *state = NULL; + + state = CALL_STATE(frame); + + if (op_ret >= 0) { + buflen = gf_direntry_to_bin (entries, &buffer); + if (buflen < 0) { + gf_log (this->name, GF_LOG_ERROR, + "fd - %"PRId64" (%"PRId64"): failed to convert " + "entries list to string buffer", + state->fd_no, state->fd->inode->ino); + op_ret = -1; + op_errno = EINVAL; + goto out; + } + + reply_dict = dict_new (); + if (reply_dict == NULL) { + gf_log (this->name, GF_LOG_ERROR, + "fd - %"PRId64" (%"PRId64"): failed to get new dict", + state->fd_no, state->fd->inode->ino); + op_ret = -1; + op_errno = ENOMEM; + goto out; + } + + ret = dict_set_dynptr (reply_dict, NULL, + buffer, buflen); + if (ret < 0) { + gf_log (this->name, GF_LOG_ERROR, + "fd - %"PRId64" (%"PRId64"): failed to set read buffer " + "to reply dictionary", + 
state->fd_no, state->fd->inode->ino); + op_ret = -1; + op_errno = -ret; + goto out; + } + frame->root->rsp_refs = reply_dict; + vector[0].iov_base = buffer; + vector[0].iov_len = buflen; + vec_count = 1; + } else { + gf_log (this->name, GF_LOG_DEBUG, + "%"PRId64": GETDENTS %"PRId64" (%"PRId64"): %"PRId32" (%s)", + frame->root->unique, + state->fd_no, + state->fd ? state->fd->inode->ino : 0, + op_ret, strerror (op_errno)); + vector[0].iov_base = NULL; + vector[0].iov_len = 0; + } + +out: + hdrlen = gf_hdr_len (rsp, 0); + hdr = gf_hdr_new (rsp, 0); + rsp = gf_param (hdr); + + rsp->count = hton32 (count); + + hdr->rsp.op_ret = hton32 (op_ret); + gf_errno = gf_errno_to_error (op_errno); + hdr->rsp.op_errno = hton32 (gf_errno); + + protocol_server_reply (frame, GF_OP_TYPE_FOP_REPLY, GF_FOP_GETDENTS, + hdr, hdrlen, vector, vec_count, + frame->root->rsp_refs); + + if (reply_dict) + dict_unref (reply_dict); + + return 0; +} + + +/* + * server_readdir_cbk - getdents callback for server protocol + * @frame: call frame + * @cookie: + * @this: + * @op_ret: + * @op_errno: + * + * not for external reference + */ +int32_t +server_readdir_cbk (call_frame_t *frame, + void *cookie, + xlator_t *this, + int32_t op_ret, + int32_t op_errno, + gf_dirent_t *entries) +{ + gf_hdr_common_t *hdr = NULL; + gf_fop_readdir_rsp_t *rsp = NULL; + size_t hdrlen = 0; + size_t buf_size = 0; + int32_t gf_errno = 0; + server_state_t *state = NULL; + + if (op_ret > 0) + buf_size = gf_dirent_serialize (entries, NULL, 0); + + hdrlen = gf_hdr_len (rsp, buf_size); + hdr = gf_hdr_new (rsp, buf_size); + rsp = gf_param (hdr); + + hdr->rsp.op_ret = hton32 (op_ret); + gf_errno = gf_errno_to_error (op_errno); + hdr->rsp.op_errno = hton32 (gf_errno); + + if (op_ret > 0) { + rsp->size = hton32 (buf_size); + gf_dirent_serialize (entries, rsp->buf, buf_size); + } else { + state = CALL_STATE(frame); + + gf_log (this->name, GF_LOG_DEBUG, + "%"PRId64": READDIR %"PRId64" (%"PRId64") ==> %"PRId32" (%s)", + 
frame->root->unique, state->fd_no, + state->fd ? state->fd->inode->ino : 0, op_ret, + strerror (op_errno)); + } + + protocol_server_reply (frame, GF_OP_TYPE_FOP_REPLY, GF_FOP_READDIR, + hdr, hdrlen, NULL, 0, NULL); + + return 0; +} + + +/* + * server_releasedir_cbk - releasedir callback for server protocol + * @frame: call frame + * @cookie: + * @this: + * @op_ret: return value + * @op_errno: errno + * + * not for external reference + */ +int32_t +server_releasedir_cbk (call_frame_t *frame, + void *cookie, + xlator_t *this, + int32_t op_ret, + int32_t op_errno) +{ + gf_hdr_common_t *hdr = NULL; + gf_cbk_releasedir_rsp_t *rsp = NULL; + size_t hdrlen = 0; + int32_t gf_errno = 0; + + hdrlen = gf_hdr_len (rsp, 0); + hdr = gf_hdr_new (rsp, 0); + rsp = gf_param (hdr); + + hdr->rsp.op_ret = hton32 (op_ret); + gf_errno = gf_errno_to_error (op_errno); + hdr->rsp.op_errno = hton32 (gf_errno); + + protocol_server_reply (frame, GF_OP_TYPE_CBK_REPLY, GF_CBK_RELEASEDIR, + hdr, hdrlen, NULL, 0, NULL); + + return 0; +} + + +/* + * server_opendir_cbk - opendir callback for server protocol + * @frame: call frame + * @cookie: + * @this: + * @op_ret: return value + * @op_errno: errno + * @fd: file descriptor structure of opened directory + * + * not for external reference + */ +int32_t +server_opendir_cbk (call_frame_t *frame, + void *cookie, + xlator_t *this, + int32_t op_ret, + int32_t op_errno, + fd_t *fd) +{ + server_connection_t *conn = NULL; + gf_hdr_common_t *hdr = NULL; + gf_fop_opendir_rsp_t *rsp = NULL; + server_state_t *state = NULL; + size_t hdrlen = 0; + int32_t gf_errno = 0; + + conn = SERVER_CONNECTION(frame); + + state = CALL_STATE(frame); + + if (op_ret >= 0) { + fd_bind (fd); + + state->fd_no = gf_fd_unused_get (conn->fdtable, fd); + } else { + gf_log (this->name, GF_LOG_DEBUG, + "%"PRId64": OPENDIR %s (%"PRId64") ==> %"PRId32" (%s)", + frame->root->unique, state->loc.path, + state->loc.inode ? 
state->loc.inode->ino : 0, + op_ret, strerror (op_errno)); + + /* NOTE: corresponding to fd_create()'s ref */ + if (state->fd) + fd_unref (state->fd); + } + + hdrlen = gf_hdr_len (rsp, 0); + hdr = gf_hdr_new (rsp, 0); + rsp = gf_param (hdr); + + hdr->rsp.op_ret = hton32 (op_ret); + gf_errno = gf_errno_to_error (op_errno); + hdr->rsp.op_errno = hton32 (gf_errno); + rsp->fd = hton64 (state->fd_no); + + server_loc_wipe (&(state->loc)); + + protocol_server_reply (frame, GF_OP_TYPE_FOP_REPLY, GF_FOP_OPENDIR, + hdr, hdrlen, NULL, 0, NULL); + + return 0; +} + +/* + * server_statfs_cbk - statfs callback for server protocol + * @frame: call frame + * @cookie: + * @this: + * @op_ret: return value + * @op_errno: errno + * @buf: + * + * not for external reference + */ +int32_t +server_statfs_cbk (call_frame_t *frame, + void *cookie, + xlator_t *this, + int32_t op_ret, + int32_t op_errno, + struct statvfs *buf) +{ + gf_hdr_common_t *hdr = NULL; + gf_fop_statfs_rsp_t *rsp = NULL; + server_state_t *state = NULL; + size_t hdrlen = 0; + int32_t gf_errno = 0; + + state = CALL_STATE(frame); + + hdrlen = gf_hdr_len (rsp, 0); + hdr = gf_hdr_new (rsp, 0); + rsp = gf_param (hdr); + + hdr->rsp.op_ret = hton32 (op_ret); + gf_errno = gf_errno_to_error (op_errno); + hdr->rsp.op_errno = hton32 (gf_errno); + + if (op_ret >= 0) { + gf_statfs_from_statfs (&rsp->statfs, buf); + } + + server_loc_wipe (&(state->loc)); + + protocol_server_reply (frame, GF_OP_TYPE_FOP_REPLY, GF_FOP_STATFS, + hdr, hdrlen, NULL, 0, NULL); + + return 0; +} + +/* + * server_removexattr_cbk - removexattr callback for server protocol + * @frame: call frame + * @cookie: + * @this: + * @op_ret: return value + * @op_errno: errno + * + * not for external reference + */ +int32_t +server_removexattr_cbk (call_frame_t *frame, + void *cookie, + xlator_t *this, + int32_t op_ret, + int32_t op_errno) +{ + gf_hdr_common_t *hdr = NULL; + gf_fop_removexattr_rsp_t *rsp = NULL; + server_state_t *state = NULL; + size_t hdrlen = 0; + 
int32_t gf_errno = 0; + + state = CALL_STATE(frame); + + hdrlen = gf_hdr_len (rsp, 0); + hdr = gf_hdr_new (rsp, 0); + rsp = gf_param (hdr); + + hdr->rsp.op_ret = hton32 (op_ret); + gf_errno = gf_errno_to_error (op_errno); + hdr->rsp.op_errno = hton32 (gf_errno); + + server_loc_wipe (&(state->loc)); + + protocol_server_reply (frame, GF_OP_TYPE_FOP_REPLY, GF_FOP_REMOVEXATTR, + hdr, hdrlen, NULL, 0, NULL); + + return 0; +} + +/* + * server_getxattr_cbk - getxattr callback for server protocol + * @frame: call frame + * @cookie: + * @this: + * @op_ret: return value + * @op_errno: errno + * @value: + * + * not for external reference + */ +int32_t +server_getxattr_cbk (call_frame_t *frame, + void *cookie, + xlator_t *this, + int32_t op_ret, + int32_t op_errno, + dict_t *dict) +{ + gf_hdr_common_t *hdr = NULL; + gf_fop_getxattr_rsp_t *rsp = NULL; + server_state_t *state = NULL; + size_t hdrlen = 0; + int32_t len = 0; + int32_t gf_errno = 0; + int32_t ret = -1; + + state = CALL_STATE(frame); + + if (op_ret >= 0) { + len = dict_serialized_length (dict); + if (len < 0) { + gf_log (this->name, GF_LOG_ERROR, + "%s (%"PRId64"): failed to get serialized length of " + "reply dict", + state->loc.path, state->ino); + op_ret = -1; + op_errno = EINVAL; + len = 0; + } + } + + hdrlen = gf_hdr_len (rsp, len + 1); + hdr = gf_hdr_new (rsp, len + 1); + rsp = gf_param (hdr); + + if (op_ret >= 0) { + ret = dict_serialize (dict, rsp->dict); + if (len < 0) { + gf_log (this->name, GF_LOG_ERROR, + "%s (%"PRId64"): failed to serialize reply dict", + state->loc.path, state->ino); + op_ret = -1; + op_errno = -ret; + } + } + rsp->dict_len = hton32 (len); + + hdr->rsp.op_ret = hton32 (op_ret); + gf_errno = gf_errno_to_error (op_errno); + hdr->rsp.op_errno = hton32 (gf_errno); + + + server_loc_wipe (&(state->loc)); + + protocol_server_reply (frame, GF_OP_TYPE_FOP_REPLY, GF_FOP_GETXATTR, + hdr, hdrlen, NULL, 0, NULL); + + return 0; +} + +/* + * server_setxattr_cbk - setxattr callback for server protocol 
+ * @frame: call frame + * @cookie: + * @this: + * @op_ret: return value + * @op_errno: errno + * + * not for external reference + */ +int32_t +server_setxattr_cbk (call_frame_t *frame, + void *cookie, + xlator_t *this, + int32_t op_ret, + int32_t op_errno) +{ + gf_hdr_common_t *hdr = NULL; + gf_fop_setxattr_rsp_t *rsp = NULL; + server_state_t *state = NULL; + size_t hdrlen = 0; + int32_t gf_errno = 0; + state = CALL_STATE(frame); + + hdrlen = gf_hdr_len (rsp, 0); + hdr = gf_hdr_new (rsp, 0); + rsp = gf_param (hdr); + + hdr->rsp.op_ret = hton32 (op_ret); + gf_errno = gf_errno_to_error (op_errno); + hdr->rsp.op_errno = hton32 (gf_errno); + + server_loc_wipe (&(state->loc)); + + protocol_server_reply (frame, GF_OP_TYPE_FOP_REPLY, GF_FOP_SETXATTR, + hdr, hdrlen, NULL, 0, NULL); + + return 0; +} + + +/* + * server_rename_cbk - rename callback for server protocol + * @frame: call frame + * @cookie: + * @this: + * @op_ret: return value + * @op_errno: errno + * + * not for external reference + */ +int32_t +server_rename_cbk (call_frame_t *frame, + void *cookie, + xlator_t *this, + int32_t op_ret, + int32_t op_errno, + struct stat *stbuf) +{ + gf_hdr_common_t *hdr = NULL; + gf_fop_rename_rsp_t *rsp = NULL; + server_state_t *state = NULL; + size_t hdrlen = 0; + int32_t gf_errno = 0; + + state = CALL_STATE(frame); + + hdrlen = gf_hdr_len (rsp, 0); + hdr = gf_hdr_new (rsp, 0); + rsp = gf_param (hdr); + + hdr->rsp.op_ret = hton32 (op_ret); + gf_errno = gf_errno_to_error (op_errno); + hdr->rsp.op_errno = hton32 (gf_errno); + + if (op_ret == 0) { + stbuf->st_ino = state->loc.inode->ino; + stbuf->st_mode = state->loc.inode->st_mode; + + gf_log (state->bound_xl->name, GF_LOG_DEBUG, + "%"PRId64": RENAME_CBK (%"PRId64") %"PRId64"/%s " + "==> %"PRId64"/%s", + frame->root->unique, state->loc.inode->ino, + state->loc.parent->ino, state->loc.name, + state->loc2.parent->ino, state->loc2.name); + + inode_rename (state->itable, + state->loc.parent, state->loc.name, + state->loc2.parent, 
state->loc2.name, + state->loc.inode, stbuf); + gf_stat_from_stat (&rsp->stat, stbuf); + } + + server_loc_wipe (&(state->loc)); + server_loc_wipe (&(state->loc2)); + + protocol_server_reply (frame, GF_OP_TYPE_FOP_REPLY, GF_FOP_RENAME, + hdr, hdrlen, NULL, 0, NULL); + + return 0; +} + + +/* + * server_unlink_cbk - unlink callback for server protocol + * @frame: call frame + * @cookie: + * @this: + * @op_ret: return value + * @op_errno: errno + * + * not for external reference + */ +int32_t +server_unlink_cbk (call_frame_t *frame, + void *cookie, + xlator_t *this, + int32_t op_ret, + int32_t op_errno) +{ + gf_hdr_common_t *hdr = NULL; + gf_fop_unlink_rsp_t *rsp = NULL; + server_state_t *state = NULL; + size_t hdrlen = 0; + int32_t gf_errno = 0; + + state = CALL_STATE(frame); + + if (op_ret == 0) { + gf_log (state->bound_xl->name, GF_LOG_DEBUG, + "%"PRId64": UNLINK_CBK %"PRId64"/%s (%"PRId64")", + frame->root->unique, state->loc.parent->ino, + state->loc.name, state->loc.inode->ino); + + inode_unlink (state->loc.inode, state->loc.parent, + state->loc.name); + } else { + gf_log (this->name, GF_LOG_DEBUG, + "%"PRId64": UNLINK %s (%"PRId64") ==> %"PRId32" (%s)", + frame->root->unique, state->loc.path, + state->loc.inode ? 
state->loc.inode->ino : 0, + op_ret, strerror (op_errno)); + } + + hdrlen = gf_hdr_len (rsp, 0); + hdr = gf_hdr_new (rsp, 0); + rsp = gf_param (hdr); + + hdr->rsp.op_ret = hton32 (op_ret); + gf_errno = gf_errno_to_error (op_errno); + hdr->rsp.op_errno = hton32 (gf_errno); + + server_loc_wipe (&(state->loc)); + + protocol_server_reply (frame, GF_OP_TYPE_FOP_REPLY, GF_FOP_UNLINK, + hdr, hdrlen, NULL, 0, NULL); + + return 0; +} + +/* + * server_symlink_cbk - symlink callback for server protocol + * @frame: call frame + * @cookie: + * @this: + * @op_ret: return value + * @op_errno: errno + * + * not for external reference + */ +int32_t +server_symlink_cbk (call_frame_t *frame, + void *cookie, + xlator_t *this, + int32_t op_ret, + int32_t op_errno, + inode_t *inode, + struct stat *stbuf) +{ + gf_hdr_common_t *hdr = NULL; + gf_fop_symlink_rsp_t *rsp = NULL; + server_state_t *state = NULL; + size_t hdrlen = 0; + int32_t gf_errno = 0; + + state = CALL_STATE(frame); + + hdrlen = gf_hdr_len (rsp, 0); + hdr = gf_hdr_new (rsp, 0); + rsp = gf_param (hdr); + + hdr->rsp.op_ret = hton32 (op_ret); + gf_errno = gf_errno_to_error (op_errno); + hdr->rsp.op_errno = hton32 (gf_errno_to_error (op_errno)); + + if (op_ret >= 0) { + gf_stat_from_stat (&rsp->stat, stbuf); + inode_link (inode, state->loc.parent, state->loc.name, stbuf); + inode_lookup (inode); + } else { + gf_log (this->name, GF_LOG_DEBUG, + "%"PRId64": SYMLINK %s (%"PRId64") ==> %"PRId32" (%s)", + frame->root->unique, state->loc.path, + state->loc.inode ? 
state->loc.inode->ino : 0, + op_ret, strerror (op_errno)); + } + + server_loc_wipe (&(state->loc)); + + protocol_server_reply (frame, GF_OP_TYPE_FOP_REPLY, GF_FOP_SYMLINK, + hdr, hdrlen, NULL, 0, NULL); + + return 0; +} + + +/* + * server_link_cbk - link callback for server protocol + * @frame: call frame + * @this: + * @op_ret: + * @op_errno: + * @stbuf: + * + * not for external reference + */ +int32_t +server_link_cbk (call_frame_t *frame, + void *cookie, + xlator_t *this, + int32_t op_ret, + int32_t op_errno, + inode_t *inode, + struct stat *stbuf) +{ + gf_hdr_common_t *hdr = NULL; + gf_fop_link_rsp_t *rsp = NULL; + server_state_t *state = NULL; + int32_t gf_errno = 0; + size_t hdrlen = 0; + + state = CALL_STATE(frame); + + hdrlen = gf_hdr_len (rsp, 0); + hdr = gf_hdr_new (rsp, 0); + rsp = gf_param (hdr); + + hdr->rsp.op_ret = hton32 (op_ret); + gf_errno = gf_errno_to_error (op_errno); + hdr->rsp.op_errno = hton32 (gf_errno); + + if (op_ret == 0) { + stbuf->st_ino = state->loc.inode->ino; + gf_stat_from_stat (&rsp->stat, stbuf); + gf_log (state->bound_xl->name, GF_LOG_DEBUG, + "%"PRId64": LINK (%"PRId64") %"PRId64"/%s ==> %"PRId64"/%s", + frame->root->unique, inode->ino, state->loc2.parent->ino, + state->loc2.name, state->loc.parent->ino, state->loc.name); + + inode_link (inode, state->loc2.parent, + state->loc2.name, stbuf); + } else { + gf_log (state->bound_xl->name, GF_LOG_DEBUG, + "%"PRId64": LINK (%"PRId64") %"PRId64"/%s ==> %"PRId64"/%s " + " ==> %"PRId32" (%s)", + frame->root->unique, inode->ino, state->loc2.parent->ino, + state->loc2.name, state->loc.parent->ino, state->loc.name, + op_ret, strerror (op_errno)); + } + + server_loc_wipe (&(state->loc)); + server_loc_wipe (&(state->loc2)); + + protocol_server_reply (frame, GF_OP_TYPE_FOP_REPLY, GF_FOP_LINK, + hdr, hdrlen, NULL, 0, NULL); + + return 0; +} + + +/* + * server_truncate_cbk - truncate callback for server protocol + * @frame: call frame + * @cookie: + * @this: + * @op_ret: + * @op_errno: + * 
@stbuf: + * + * not for external reference + */ +int32_t +server_truncate_cbk (call_frame_t *frame, + void *cookie, + xlator_t *this, + int32_t op_ret, + int32_t op_errno, + struct stat *stbuf) +{ + gf_hdr_common_t *hdr = NULL; + gf_fop_truncate_rsp_t *rsp = NULL; + server_state_t *state = NULL; + size_t hdrlen = 0; + int32_t gf_errno = 0; + + state = CALL_STATE(frame); + + hdrlen = gf_hdr_len (rsp, 0); + hdr = gf_hdr_new (rsp, 0); + rsp = gf_param (hdr); + + hdr->rsp.op_ret = hton32 (op_ret); + gf_errno = gf_errno_to_error (op_errno); + hdr->rsp.op_errno = hton32 (gf_errno); + + if (op_ret == 0) { + gf_stat_from_stat (&rsp->stat, stbuf); + } else { + gf_log (this->name, GF_LOG_DEBUG, + "%"PRId64": TRUNCATE %s (%"PRId64") ==> %"PRId32" (%s)", + frame->root->unique, state->loc.path, + state->loc.inode ? state->loc.inode->ino : 0, + op_ret, strerror (op_errno)); + } + + server_loc_wipe (&(state->loc)); + + protocol_server_reply (frame, GF_OP_TYPE_FOP_REPLY, GF_FOP_TRUNCATE, + hdr, hdrlen, NULL, 0, NULL); + + return 0; +} + +/* + * server_fstat_cbk - fstat callback for server protocol + * @frame: call frame + * @cookie: + * @this: + * @op_ret: + * @op_errno: + * @stbuf: + * + * not for external reference + */ +int32_t +server_fstat_cbk (call_frame_t *frame, + void *cookie, + xlator_t *this, + int32_t op_ret, + int32_t op_errno, + struct stat *stbuf) +{ + gf_hdr_common_t *hdr = NULL; + gf_fop_fstat_rsp_t *rsp = NULL; + size_t hdrlen = 0; + int32_t gf_errno = 0; + server_state_t *state = NULL; + + hdrlen = gf_hdr_len (rsp, 0); + hdr = gf_hdr_new (rsp, 0); + rsp = gf_param (hdr); + + hdr->rsp.op_ret = hton32 (op_ret); + gf_errno = gf_errno_to_error (op_errno); + hdr->rsp.op_errno = hton32 (gf_errno); + + if (op_ret == 0) { + gf_stat_from_stat (&rsp->stat, stbuf); + } else { + state = CALL_STATE(frame); + + gf_log (this->name, GF_LOG_DEBUG, + "%"PRId64": FSTAT %"PRId64" (%"PRId64") ==> %"PRId32" (%s)", + frame->root->unique, state->fd_no, + state->fd ? 
state->fd->inode->ino : 0, op_ret, + strerror (op_errno)); + } + + protocol_server_reply (frame, GF_OP_TYPE_FOP_REPLY, GF_FOP_FSTAT, + hdr, hdrlen, NULL, 0, NULL); + + return 0; +} + +/* + * server_ftruncate_cbk - ftruncate callback for server protocol + * @frame: call frame + * @cookie: + * @this: + * @op_ret: + * @op_errno: + * @stbuf: + * + * not for external reference + */ +int32_t +server_ftruncate_cbk (call_frame_t *frame, + void *cookie, + xlator_t *this, + int32_t op_ret, + int32_t op_errno, + struct stat *stbuf) +{ + gf_hdr_common_t *hdr = NULL; + gf_fop_ftruncate_rsp_t *rsp = NULL; + size_t hdrlen = 0; + int32_t gf_errno = 0; + server_state_t *state = NULL; + + hdrlen = gf_hdr_len (rsp, 0); + hdr = gf_hdr_new (rsp, 0); + rsp = gf_param (hdr); + + hdr->rsp.op_ret = hton32 (op_ret); + gf_errno = gf_errno_to_error (op_errno); + hdr->rsp.op_errno = hton32 (gf_errno); + + if (op_ret == 0) { + gf_stat_from_stat (&rsp->stat, stbuf); + } else { + state = CALL_STATE(frame); + + gf_log (this->name, GF_LOG_DEBUG, + "%"PRId64": FTRUNCATE %"PRId64" (%"PRId64") ==> %"PRId32" (%s)", + frame->root->unique, state->fd_no, + state->fd ? state->fd->inode->ino : 0, op_ret, + strerror (op_errno)); + } + + protocol_server_reply (frame, GF_OP_TYPE_FOP_REPLY, GF_FOP_FTRUNCATE, + hdr, hdrlen, NULL, 0, NULL); + + return 0; +} + + +/* + * server_flush_cbk - flush callback for server protocol + * @frame: call frame + * @cookie: + * @this: + * @op_ret: + * @op_errno: + * + * not for external reference + */ +int32_t +server_flush_cbk (call_frame_t *frame, + void *cookie, + xlator_t *this, + int32_t op_ret, + int32_t op_errno) +{ + gf_hdr_common_t *hdr = NULL; + gf_fop_flush_rsp_t *rsp = NULL; + size_t hdrlen = 0; + int32_t gf_errno = 0; + server_state_t *state = NULL; + + if (op_ret < 0) { + state = CALL_STATE(frame); + + gf_log (this->name, GF_LOG_DEBUG, + "%"PRId64": FLUSH %"PRId64" (%"PRId64") ==> %"PRId32" (%s)", + frame->root->unique, state->fd_no, + state->fd ? 
state->fd->inode->ino : 0, op_ret, + strerror (op_errno)); + } + + hdrlen = gf_hdr_len (rsp, 0); + hdr = gf_hdr_new (rsp, 0); + + hdr->rsp.op_ret = hton32 (op_ret); + gf_errno = gf_errno_to_error (op_errno); + hdr->rsp.op_errno = hton32 (gf_errno); + + protocol_server_reply (frame, GF_OP_TYPE_FOP_REPLY, GF_FOP_FLUSH, + hdr, hdrlen, NULL, 0, NULL); + + return 0; +} + +/* + * server_fsync_cbk - fsync callback for server protocol + * @frame: call frame + * @cookie: + * @this: + * @op_ret: + * @op_errno: + * + * not for external reference + */ +int32_t +server_fsync_cbk (call_frame_t *frame, + void *cookie, + xlator_t *this, + int32_t op_ret, + int32_t op_errno) +{ + gf_hdr_common_t *hdr = NULL; + gf_fop_fsync_rsp_t *rsp = NULL; + size_t hdrlen = 0; + int32_t gf_errno = 0; + server_state_t *state = NULL; + + if (op_ret < 0) { + state = CALL_STATE(frame); + + gf_log (this->name, GF_LOG_DEBUG, + "%"PRId64": FSYNC %"PRId64" (%"PRId64") ==> %"PRId32" (%s)", + frame->root->unique, state->fd_no, + state->fd ? 
state->fd->inode->ino : 0, op_ret, + strerror (op_errno)); + } + + hdrlen = gf_hdr_len (rsp, 0); + hdr = gf_hdr_new (rsp, 0); + + hdr->rsp.op_ret = hton32 (op_ret); + gf_errno = gf_errno_to_error (op_errno); + hdr->rsp.op_errno = hton32 (gf_errno); + + protocol_server_reply (frame, GF_OP_TYPE_FOP_REPLY, GF_FOP_FSYNC, + hdr, hdrlen, NULL, 0, NULL); + + return 0; +} + +/* + * server_release_cbk - rleease callback for server protocol + * @frame: call frame + * @cookie: + * @this: + * @op_ret: + * @op_errno: + * + * not for external reference + */ +int32_t +server_release_cbk (call_frame_t *frame, + void *cookie, + xlator_t *this, + int32_t op_ret, + int32_t op_errno) +{ + gf_hdr_common_t *hdr = NULL; + gf_cbk_release_rsp_t *rsp = NULL; + size_t hdrlen = 0; + int32_t gf_errno = 0; + + hdrlen = gf_hdr_len (rsp, 0); + hdr = gf_hdr_new (rsp, 0); + + hdr->rsp.op_ret = hton32 (op_ret); + gf_errno = gf_errno_to_error (op_errno); + hdr->rsp.op_errno = hton32 (gf_errno); + + protocol_server_reply (frame, GF_OP_TYPE_CBK_REPLY, GF_CBK_RELEASE, + hdr, hdrlen, NULL, 0, NULL); + + return 0; +} + + +/* + * server_writev_cbk - writev callback for server protocol + * @frame: call frame + * @cookie: + * @this: + * @op_ret: + * @op_errno: + * + * not for external reference + */ + +int32_t +server_writev_cbk (call_frame_t *frame, + void *cookie, + xlator_t *this, + int32_t op_ret, + int32_t op_errno, + struct stat *stbuf) +{ + gf_hdr_common_t *hdr = NULL; + gf_fop_write_rsp_t *rsp = NULL; + size_t hdrlen = 0; + int32_t gf_errno = 0; + server_state_t *state = NULL; + + hdrlen = gf_hdr_len (rsp, 0); + hdr = gf_hdr_new (rsp, 0); + rsp = gf_param (hdr); + + hdr->rsp.op_ret = hton32 (op_ret); + gf_errno = gf_errno_to_error (op_errno); + hdr->rsp.op_errno = hton32 (gf_errno_to_error (op_errno)); + + if (op_ret >= 0) { + gf_stat_from_stat (&rsp->stat, stbuf); + } else { + state = CALL_STATE(frame); + + gf_log (this->name, GF_LOG_DEBUG, + "%"PRId64": WRITEV %"PRId64" (%"PRId64") ==> %"PRId32" 
(%s)", + frame->root->unique, state->fd_no, + state->fd ? state->fd->inode->ino : 0, op_ret, + strerror (op_errno)); + } + + protocol_server_reply (frame, + GF_OP_TYPE_FOP_REPLY, GF_FOP_WRITE, + hdr, hdrlen, NULL, 0, NULL); + + return 0; +} + + +/* + * server_readv_cbk - readv callback for server protocol + * @frame: call frame + * @cookie: + * @this: + * @op_ret: + * @op_errno: + * @vector: + * @count: + * + * not for external reference + */ +int32_t +server_readv_cbk (call_frame_t *frame, + void *cookie, + xlator_t *this, + int32_t op_ret, + int32_t op_errno, + struct iovec *vector, + int32_t count, + struct stat *stbuf) +{ + gf_hdr_common_t *hdr = NULL; + gf_fop_read_rsp_t *rsp = NULL; + size_t hdrlen = 0; + int32_t gf_errno = 0; + server_state_t *state = NULL; + + hdrlen = gf_hdr_len (rsp, 0); + hdr = gf_hdr_new (rsp, 0); + rsp = gf_param (hdr); + + hdr->rsp.op_ret = hton32 (op_ret); + gf_errno = gf_errno_to_error (op_errno); + hdr->rsp.op_errno = hton32 (gf_errno); + + if (op_ret >= 0) { + gf_stat_from_stat (&rsp->stat, stbuf); + } else { + state = CALL_STATE(frame); + + gf_log (this->name, GF_LOG_DEBUG, + "%"PRId64": READV %"PRId64" (%"PRId64") ==> %"PRId32" (%s)", + frame->root->unique, state->fd_no, + state->fd ? 
state->fd->inode->ino : 0, op_ret, + strerror (op_errno)); + } + + protocol_server_reply (frame, GF_OP_TYPE_FOP_REPLY, GF_FOP_READ, + hdr, hdrlen, vector, count, + frame->root->rsp_refs); + + return 0; +} + + +/* + * server_open_cbk - open callback for server protocol + * + * On success the fd is bound and given a slot in the connection's fd + * table; on failure the error is logged and the reference held in + * state->fd is dropped. The slot number is returned in a GF_FOP_OPEN reply. + * + * @frame: call frame; supplies the connection and the request state + * @cookie: not used by this callback + * @this: translator whose name tags the log message + * @op_ret: result of the open; negative on failure + * @op_errno: errno of the open; passed through gf_errno_to_error() + * @fd: opened file descriptor, used only when op_ret >= 0 + * + * not for external reference + */ +int32_t +server_open_cbk (call_frame_t *frame, + void *cookie, + xlator_t *this, + int32_t op_ret, + int32_t op_errno, + fd_t *fd) +{ + server_connection_t *conn = NULL; + gf_hdr_common_t *hdr = NULL; + gf_fop_open_rsp_t *rsp = NULL; + server_state_t *state = NULL; + size_t hdrlen = 0; + int32_t gf_errno = 0; + + conn = SERVER_CONNECTION(frame); + + state = CALL_STATE(frame); + + if (op_ret >= 0) { + fd_bind (fd); + + state->fd_no = gf_fd_unused_get (conn->fdtable, fd); /* NOTE(review): result is not checked for a negative value here, unlike server_create_cbk - confirm this is intended */ + } else { + gf_log (this->name, GF_LOG_DEBUG, + "%"PRId64": OPEN %s (%"PRId64") ==> %"PRId32" (%s)", + frame->root->unique, state->loc.path, + state->loc.inode ?
state->loc.inode->ino : 0, + op_ret, strerror (op_errno)); + + /* NOTE: corresponding to fd_create()'s ref */ + if (state->fd) + fd_unref (state->fd); + } + + hdrlen = gf_hdr_len (rsp, 0); + hdr = gf_hdr_new (rsp, 0); + rsp = gf_param (hdr); + + hdr->rsp.op_ret = hton32 (op_ret); + gf_errno = gf_errno_to_error (op_errno); + hdr->rsp.op_errno = hton32 (gf_errno); + rsp->fd = hton64 (state->fd_no); + + server_loc_wipe (&(state->loc)); + + protocol_server_reply (frame, GF_OP_TYPE_FOP_REPLY, GF_FOP_OPEN, + hdr, hdrlen, NULL, 0, NULL); + + return 0; +} + + +/* + * server_create_cbk - create callback for server + * + * On success the new inode is linked under state->loc.parent, the fd is + * bound and given a slot in the connection's fd table, and the stat is + * copied into the reply. On failure the error is logged and the reference + * held in state->fd is dropped. + * + * @frame: call frame; supplies the connection and the request state + * @cookie: not used by this callback + * @this: translator structure; its name tags the failure log message + * @op_ret: result of the create; negative on failure + * @op_errno: errno of the create; passed through gf_errno_to_error() + * @fd: file descriptor + * @inode: inode structure + * @stbuf: struct stat of created file + * + * not for external reference + */ +int32_t +server_create_cbk (call_frame_t *frame, + void *cookie, + xlator_t *this, + int32_t op_ret, + int32_t op_errno, + fd_t *fd, + inode_t *inode, + struct stat *stbuf) +{ + server_connection_t *conn = NULL; + gf_hdr_common_t *hdr = NULL; + gf_fop_create_rsp_t *rsp = NULL; + server_state_t *state = NULL; + size_t hdrlen = 0; + int32_t gf_errno = 0; + + conn = SERVER_CONNECTION(frame); + + state = CALL_STATE(frame); + + if (op_ret >= 0) { + gf_log (state->bound_xl->name, GF_LOG_DEBUG, + "%"PRId64": CREATE %"PRId64"/%s (%"PRId64")", + frame->root->unique, state->loc.parent->ino, + state->loc.name, stbuf->st_ino); + + inode_link (inode, state->loc.parent, state->loc.name, stbuf); + inode_lookup (inode); + + fd_bind (fd); + + state->fd_no = gf_fd_unused_get (conn->fdtable, fd); + + if ((state->fd_no < 0) || (fd == 0)) { /* NOTE(review): fd was already handed to fd_bind() above, so the fd == 0 test comes late */ + op_ret = state->fd_no; + op_errno = errno; + } + } else { + gf_log (this->name, GF_LOG_DEBUG, + "%"PRId64": CREATE %s (%"PRId64") ==> %"PRId32" (%s)", + frame->root->unique, state->loc.path, + state->loc.inode ?
state->loc.inode->ino : 0, + op_ret, strerror (op_errno)); + + /* NOTE: corresponding to fd_create()'s ref */ + if (state->fd) + fd_unref (state->fd); + + } + + hdrlen = gf_hdr_len (rsp, 0); + hdr = gf_hdr_new (rsp, 0); + rsp = gf_param (hdr); + + hdr->rsp.op_ret = hton32 (op_ret); + gf_errno = gf_errno_to_error (op_errno); + hdr->rsp.op_errno = hton32 (gf_errno); + rsp->fd = hton64 (state->fd_no); + + if (op_ret >= 0) + gf_stat_from_stat (&rsp->stat, stbuf); + + server_loc_wipe (&(state->loc)); + + protocol_server_reply (frame, GF_OP_TYPE_FOP_REPLY, GF_FOP_CREATE, + hdr, hdrlen, NULL, 0, NULL); + + return 0; +} + +/* + * server_readlink_cbk - readlink callback for server protocol + * + * On success the link target, including its terminating NUL, is copied + * into the reply; on failure the error is logged at DEBUG level. + * + * @frame: call frame; supplies the request state + * @cookie: not used by this callback + * @this: translator whose name tags the log message + * @op_ret: result of the readlink; negative on failure + * @op_errno: errno of the readlink; passed through gf_errno_to_error() + * @buf: NUL-terminated link target, read only when op_ret >= 0 + * + * not for external reference + */ +int32_t +server_readlink_cbk (call_frame_t *frame, + void *cookie, + xlator_t *this, + int32_t op_ret, + int32_t op_errno, + const char *buf) +{ + gf_hdr_common_t *hdr = NULL; + gf_fop_readlink_rsp_t *rsp = NULL; + server_state_t *state = NULL; + size_t hdrlen = 0; + size_t linklen = 0; + int32_t gf_errno = 0; + + state = CALL_STATE(frame); + + if (op_ret >= 0) { + linklen = strlen (buf) + 1; + } else { + gf_log (this->name, GF_LOG_DEBUG, + "%"PRId64": READLINK %s (%"PRId64") ==> %"PRId32" (%s)", + frame->root->unique, state->loc.path, + state->loc.inode ?
state->loc.inode->ino : 0, + op_ret, strerror (op_errno)); + } + + hdrlen = gf_hdr_len (rsp, linklen); + hdr = gf_hdr_new (rsp, linklen); + rsp = gf_param (hdr); + + hdr->rsp.op_ret = hton32 (op_ret); + gf_errno = gf_errno_to_error (op_errno); + hdr->rsp.op_errno = hton32 (gf_errno_to_error (op_errno)); + + if (op_ret >= 0) + strcpy (rsp->path, buf); + + server_loc_wipe (&(state->loc)); + + protocol_server_reply (frame, GF_OP_TYPE_FOP_REPLY, GF_FOP_READLINK, + hdr, hdrlen, NULL, 0, NULL); + + return 0; +} + +/* + * server_stat_cbk - stat callback for server protocol + * @frame: call frame + * @cookie: + * @this: + * @op_ret: + * @op_errno: + * @stbuf: + * + * not for external reference + */ +int32_t +server_stat_cbk (call_frame_t *frame, + void *cookie, + xlator_t *this, + int32_t op_ret, + int32_t op_errno, + struct stat *stbuf) +{ + gf_hdr_common_t *hdr = NULL; + gf_fop_stat_rsp_t *rsp = NULL; + server_state_t *state = NULL; + size_t hdrlen = 0; + int32_t gf_errno = 0; + + state = CALL_STATE(frame); + + hdrlen = gf_hdr_len (rsp, 0); + hdr = gf_hdr_new (rsp, 0); + rsp = gf_param (hdr); + + hdr->rsp.op_ret = hton32 (op_ret); + gf_errno = gf_errno_to_error (op_errno); + hdr->rsp.op_errno = hton32 (gf_errno_to_error (op_errno)); + + if (op_ret == 0) { + gf_stat_from_stat (&rsp->stat, stbuf); + } else { + gf_log (this->name, GF_LOG_DEBUG, + "%"PRId64": STAT %s (%"PRId64") ==> %"PRId32" (%s)", + frame->root->unique, state->loc.path, + state->loc.inode ? 
state->loc.inode->ino : 0, + op_ret, strerror (op_errno)); + } + + server_loc_wipe (&(state->loc)); + + protocol_server_reply (frame, GF_OP_TYPE_FOP_REPLY, GF_FOP_STAT, + hdr, hdrlen, NULL, 0, NULL); + + return 0; +} + +/* + * server_forget_cbk - forget callback for server protocol + * + * Sends a GF_CBK_FORGET reply header carrying op_ret and the converted + * op_errno. + * + * @frame: call frame handed to protocol_server_reply() + * @cookie: not used by this callback + * @this: not used by this callback + * @op_ret: result of the forget + * @op_errno: errno of the forget; passed through gf_errno_to_error() + * + * not for external reference + */ +int32_t +server_forget_cbk (call_frame_t *frame, + void *cookie, + xlator_t *this, + int32_t op_ret, + int32_t op_errno) +{ + gf_hdr_common_t *hdr = NULL; + gf_cbk_forget_rsp_t *rsp = NULL; + size_t hdrlen = 0; + int32_t gf_errno = 0; + + hdrlen = gf_hdr_len (rsp, 0); + hdr = gf_hdr_new (rsp, 0); + rsp = gf_param (hdr); + + hdr->rsp.op_ret = hton32 (op_ret); + gf_errno = gf_errno_to_error (op_errno); + hdr->rsp.op_errno = hton32 (gf_errno); + + protocol_server_reply (frame, GF_OP_TYPE_CBK_REPLY, GF_CBK_FORGET, + hdr, hdrlen, NULL, 0, NULL); + + return 0; +} + + +/* + * server_lookup_cbk - lookup callback for server protocol + * + * When the lookup failed with ESTALE it is wound again to the bound + * translator, with a fresh inode unless the root inode was looked up, + * and with this function as the callback once more. Otherwise the reply + * dict is serialized into the response, the stat is copied and a new + * inode is linked on success, and a GF_FOP_LOOKUP reply is sent. + * + * @frame: call frame; supplies the request state and bound translator + * @cookie: not used by this callback + * @this: translator whose name tags the log messages + * @op_ret: result of the lookup; 0 on success + * @op_errno: errno of the lookup; passed through gf_errno_to_error() + * @inode: looked-up inode, read only when op_ret == 0 + * @stbuf: stat of the looked-up entry, read only when op_ret == 0 + * @dict: optional reply dict; may be NULL + * + * not for external reference + */ +int32_t +server_lookup_cbk (call_frame_t *frame, + void *cookie, + xlator_t *this, + int32_t op_ret, + int32_t op_errno, + inode_t *inode, + struct stat *stbuf, + dict_t *dict) +{ + gf_hdr_common_t *hdr = NULL; + gf_fop_lookup_rsp_t *rsp = NULL; + server_state_t *state = NULL; + inode_t *root_inode = NULL; + int32_t dict_len = 0; + size_t hdrlen = 0; + int32_t gf_errno = 0; + int32_t ret = -1; + + state = CALL_STATE(frame); + if ((op_errno == ESTALE) && (op_ret == -1)) { + /* Send lookup again with new ctx dictionary */ + loc_t loc = {0,}; + + root_inode = BOUND_XL(frame)->itable->root; + if (state->loc.inode != root_inode) { + if (state->loc.inode) + inode_unref (state->loc.inode); + state->loc.inode = inode_new (BOUND_XL(frame)->itable); + } + loc.inode = state->loc.inode; + loc.path = state->path; +
state->is_revalidate = 2; + STACK_WIND (frame, server_lookup_cbk, + BOUND_XL(frame), + BOUND_XL(frame)->fops->lookup, + &loc, + state->xattr_req); + return 0; + } + + if (dict) { + dict_len = dict_serialized_length (dict); + if (dict_len < 0) { + /* guard the inode the same way the DEBUG log below does */ + gf_log (this->name, GF_LOG_ERROR, + "%s (%"PRId64"): failed to get serialized " + "length of reply dict", + state->loc.path, + state->loc.inode ? state->loc.inode->ino : 0); + op_ret = -1; + op_errno = EINVAL; + dict_len = 0; + } + } + + hdrlen = gf_hdr_len (rsp, dict_len); + hdr = gf_hdr_new (rsp, dict_len); + rsp = gf_param (hdr); + + if ((op_ret >= 0) && dict) { + ret = dict_serialize (dict, rsp->dict); + if (ret < 0) { + /* guard the inode the same way the DEBUG log below does */ + gf_log (this->name, GF_LOG_ERROR, + "%s (%"PRId64"): failed to serialize reply dict", + state->loc.path, + state->loc.inode ? state->loc.inode->ino : 0); + op_ret = -1; + op_errno = -ret; + dict_len = 0; + } + } + rsp->dict_len = hton32 (dict_len); + + hdr->rsp.op_ret = hton32 (op_ret); + gf_errno = gf_errno_to_error (op_errno); + hdr->rsp.op_errno = hton32 (gf_errno); + + if (op_ret == 0) { + root_inode = BOUND_XL(frame)->itable->root; + if (inode == root_inode) { + /* we just looked up root ("/") */ + stbuf->st_ino = 1; + if (inode->st_mode == 0) + inode->st_mode = stbuf->st_mode; + } + + gf_stat_from_stat (&rsp->stat, stbuf); + + /* an inode number of 0 means the inode is not linked yet */ + if (inode->ino == 0) { + inode_link (inode, state->loc.parent, + state->loc.name, stbuf); + inode_lookup (inode); + } + } else { + gf_log (this->name, GF_LOG_DEBUG, + "%"PRId64": LOOKUP %s (%"PRId64") ==> %"PRId32" (%s)", + frame->root->unique, state->loc.path, + state->loc.inode ?
state->loc.inode->ino : 0, + op_ret, strerror (op_errno)); + } + + server_loc_wipe (&state->loc); + protocol_server_reply (frame, GF_OP_TYPE_FOP_REPLY, GF_FOP_LOOKUP, + hdr, hdrlen, NULL, 0, NULL); + + return 0; +} + +int32_t +server_xattrop_cbk (call_frame_t *frame, + void *cookie, + xlator_t *this, + int32_t op_ret, + int32_t op_errno, + dict_t *dict) +{ + gf_hdr_common_t *hdr = NULL; + gf_fop_xattrop_rsp_t *rsp = NULL; + server_state_t *state = NULL; + size_t hdrlen = 0; + int32_t len = 0; + int32_t gf_errno = 0; + int32_t ret = -1; + + state = CALL_STATE(frame); + + if (op_ret < 0) { + gf_log (this->name, GF_LOG_DEBUG, + "%"PRId64": XATTROP %s (%"PRId64") ==> %"PRId32" (%s)", + frame->root->unique, state->loc.path, + state->loc.inode ? state->loc.inode->ino : 0, + op_ret, strerror (op_errno)); + } + + if ((op_ret >= 0) && dict) { + len = dict_serialized_length (dict); + if (len < 0) { + gf_log (this->name, GF_LOG_ERROR, + "%s (%"PRId64"): failed to get serialized length" + " for reply dict", + state->loc.path, state->loc.inode->ino); + op_ret = -1; + op_errno = EINVAL; + len = 0; + } + } + + hdrlen = gf_hdr_len (rsp, len + 1); + hdr = gf_hdr_new (rsp, len + 1); + rsp = gf_param (hdr); + + if ((op_ret >= 0) && dict) { + ret = dict_serialize (dict, rsp->dict); + if (ret < 0) { + gf_log (this->name, GF_LOG_ERROR, + "%s (%"PRId64"): failed to serialize reply dict", + state->loc.path, state->loc.inode->ino); + op_ret = -1; + op_errno = -ret; + len = 0; + } + } + rsp->dict_len = hton32 (len); + + hdr->rsp.op_ret = hton32 (op_ret); + gf_errno = gf_errno_to_error (op_errno); + hdr->rsp.op_errno = hton32 (gf_errno); + + server_loc_wipe (&(state->loc)); + + protocol_server_reply (frame, GF_OP_TYPE_FOP_REPLY, GF_FOP_XATTROP, + hdr, hdrlen, NULL, 0, NULL); + + return 0; +} + +int32_t +server_fxattrop_cbk (call_frame_t *frame, + void *cookie, + xlator_t *this, + int32_t op_ret, + int32_t op_errno, + dict_t *dict) +{ + gf_hdr_common_t *hdr = NULL; + gf_fop_xattrop_rsp_t *rsp 
= NULL; + size_t hdrlen = 0; + int32_t len = 0; + int32_t gf_errno = 0; + int32_t ret = -1; + server_state_t *state = NULL; + + state = CALL_STATE(frame); + + if (op_ret < 0) { + gf_log (this->name, GF_LOG_DEBUG, + "%"PRId64": FXATTROP %"PRId64" (%"PRId64") ==> %"PRId32" (%s)", + frame->root->unique, state->fd_no, + state->fd ? state->fd->inode->ino : 0, op_ret, + strerror (op_errno)); + } + + if ((op_ret >= 0) && dict) { + len = dict_serialized_length (dict); + if (len < 0) { + /* guard the fd the same way the DEBUG log above does */ + gf_log (this->name, GF_LOG_ERROR, + "fd - %"PRId64" (%"PRId64"): failed to get " + "serialized length for reply dict", + state->fd_no, + state->fd ? state->fd->inode->ino : 0); + op_ret = -1; + op_errno = EINVAL; + len = 0; + } + } + + hdrlen = gf_hdr_len (rsp, len + 1); + hdr = gf_hdr_new (rsp, len + 1); + rsp = gf_param (hdr); + + if ((op_ret >= 0) && dict) { + ret = dict_serialize (dict, rsp->dict); + if (ret < 0) { + /* guard the fd the same way the DEBUG log above does */ + gf_log (this->name, GF_LOG_ERROR, + "fd - %"PRId64" (%"PRId64"): failed to " + "serialize reply dict", + state->fd_no, + state->fd ? state->fd->inode->ino : 0); + op_ret = -1; + op_errno = -ret; + len = 0; + } + } + rsp->dict_len = hton32 (len); + + hdr->rsp.op_ret = hton32 (op_ret); + gf_errno = gf_errno_to_error (op_errno); + hdr->rsp.op_errno = hton32 (gf_errno); + + + protocol_server_reply (frame, GF_OP_TYPE_FOP_REPLY, GF_FOP_FXATTROP, + hdr, hdrlen, NULL, 0, NULL); + + return 0; +} + +/* + * server_stub_resume - callback used whenever a fop does STACK_WIND to + * fops->lookup in order to look up the inode for a + * pathname. This happens when the fop searched the + * inode table for the pathname and the search failed. 
+ * + * Dispatches on stub->fop. For a failed lookup the fop is answered through + * its server_*_cbk with op_ret -1 and ENOENT, the stub's loc is released + * and the stub is freed. Otherwise the missing parent/inode fields of the + * stub's loc are filled in and the stub is resumed. + * + * @stub: call stub of the pending fop; nothing is done when it is NULL + * @op_ret: result of the lookup; negative on failure + * @op_errno: errno of the lookup; only written to the log + * @inode: looked-up inode; referenced into the stub's loc when the loc + * has no inode yet + * @parent: parent inode; referenced into the stub's loc when the loc + * has no parent yet + * + * not for external reference + */ +int32_t +server_stub_resume (call_stub_t *stub, + int32_t op_ret, + int32_t op_errno, + inode_t *inode, + inode_t *parent) +{ + inode_t *server_inode = inode; + + if (!stub) { + return 0; + } + switch (stub->fop) + { + case GF_FOP_RENAME: + if (stub->args.rename.old.inode == NULL) { + loc_t *newloc = NULL; + /* now we are called by lookup of oldpath. */ + if (op_ret < 0) { + gf_log (stub->frame->this->name, GF_LOG_ERROR, + "%"PRId64": RENAME (%s -> %s) on %s " + "returning error: " + "%"PRId32" (%"PRId32")", + stub->frame->root->unique, + stub->args.rename.old.path, + stub->args.rename.new.path, + BOUND_XL(stub->frame)->name, + op_ret, op_errno); + + /* lookup of oldpath failed, UNWIND to + * server_rename_cbk with ret=-1 and + * errno=ENOENT + */ + server_rename_cbk (stub->frame, + NULL, + stub->frame->this, + -1, + ENOENT, + NULL); + server_loc_wipe (&stub->args.rename.old); + server_loc_wipe (&stub->args.rename.new); + FREE (stub); + return 0; + } + + if (stub->args.rename.old.parent == NULL) + stub->args.rename.old.parent = + inode_ref (parent); + + /* store inode information of oldpath in our stub + * and search for newpath in inode table.
+ */ + if (server_inode) { + stub->args.rename.old.inode = + inode_ref (server_inode); + + stub->args.rename.old.ino = + server_inode->ino; + } + + /* now lookup for newpath */ + newloc = &stub->args.rename.new; + + if (newloc->parent == NULL) { + /* lookup for newpath */ + do_path_lookup (stub, newloc); + break; + } else { + /* found newpath in inode cache */ + call_resume (stub); + break; + } + } else { + /* we are called by the lookup of newpath */ + if (stub->args.rename.new.parent == NULL) + stub->args.rename.new.parent = + inode_ref (parent); + } + + /* after looking up for oldpath as well as newpath, + * we are ready to resume */ + { + call_resume (stub); + } + break; + + case GF_FOP_OPEN: + { + if (op_ret < 0) { + gf_log (stub->frame->this->name, GF_LOG_ERROR, + "%"PRId64": OPEN (%s) on %s returning error: " + "%"PRId32" (%"PRId32")", + stub->frame->root->unique, + stub->args.open.loc.path, + BOUND_XL(stub->frame)->name, + op_ret, op_errno); + + server_open_cbk (stub->frame, + NULL, + stub->frame->this, + -1, + ENOENT, + NULL); + FREE (stub->args.open.loc.path); + FREE (stub); + return 0; + } + if (stub->args.open.loc.parent == NULL) + stub->args.open.loc.parent = inode_ref (parent); + + if (server_inode && (stub->args.open.loc.inode == NULL)) { + stub->args.open.loc.inode = inode_ref (server_inode); + stub->args.open.loc.ino = server_inode->ino; + } + call_resume (stub); + break; + } + + case GF_FOP_LOOKUP: + { + if (op_ret < 0) { + gf_log (stub->frame->this->name, + GF_LOG_DEBUG, + "%"PRId64": LOOKUP (%s) on %s returning error: " + "%"PRId32" (%"PRId32")", + stub->frame->root->unique, + stub->args.lookup.loc.path, + BOUND_XL(stub->frame)->name, + op_ret, op_errno); + + server_lookup_cbk (stub->frame, + NULL, + stub->frame->this, + -1, ENOENT, + NULL, NULL, + NULL); + server_loc_wipe (&stub->args.lookup.loc); + FREE (stub); + return 0; + } + + if (stub->args.lookup.loc.parent == NULL) + stub->args.lookup.loc.parent = inode_ref (parent); + + if 
(server_inode && (stub->args.lookup.loc.inode == NULL)) { + stub->args.lookup.loc.inode = inode_ref (server_inode); + stub->args.lookup.loc.ino = server_inode->ino; + } + + call_resume (stub); + + break; + } + + case GF_FOP_STAT: + { + if (op_ret < 0) { + gf_log (stub->frame->this->name, GF_LOG_ERROR, + "%"PRId64": STAT (%s) on %s returning error: " + "%"PRId32" (%"PRId32")", + stub->frame->root->unique, + stub->args.stat.loc.path, + BOUND_XL(stub->frame)->name, + op_ret, op_errno); + server_stat_cbk (stub->frame, + NULL, + stub->frame->this, + -1, ENOENT, + NULL); + server_loc_wipe (&stub->args.stat.loc); + FREE (stub); + return 0; + } + + /* TODO:reply from here only, we already have stat structure */ + if (stub->args.stat.loc.parent == NULL) + stub->args.stat.loc.parent = inode_ref (parent); + + if (server_inode && (stub->args.stat.loc.inode == NULL)) { + stub->args.stat.loc.inode = inode_ref (server_inode); + stub->args.stat.loc.ino = server_inode->ino; + } + call_resume (stub); + break; + } + + case GF_FOP_XATTROP: + { + if (op_ret < 0) { + gf_log (stub->frame->this->name, GF_LOG_ERROR, + "%"PRId64": XATTROP (%s) on %s returning error: " + "%"PRId32" (%"PRId32")", + stub->frame->root->unique, + stub->args.xattrop.loc.path, + BOUND_XL(stub->frame)->name, + op_ret, op_errno); + server_xattrop_cbk (stub->frame, + NULL, + stub->frame->this, + -1, ENOENT, + NULL); + server_loc_wipe (&stub->args.xattrop.loc); + FREE (stub); + return 0; + } + + if (stub->args.xattrop.loc.parent == NULL) + stub->args.xattrop.loc.parent = inode_ref (parent); + + if (server_inode && (stub->args.xattrop.loc.inode == NULL)) { + stub->args.xattrop.loc.inode = + inode_ref (server_inode); + + stub->args.xattrop.loc.ino = server_inode->ino; + } + call_resume (stub); + break; + } + + case GF_FOP_UNLINK: + { + if (op_ret < 0) { + gf_log (stub->frame->this->name, GF_LOG_ERROR, + "%"PRId64": UNLINK (%s) on %s returning error: " + "%"PRId32" (%"PRId32")", + stub->frame->root->unique, + 
stub->args.unlink.loc.path, + BOUND_XL(stub->frame)->name, + op_ret, op_errno); + server_unlink_cbk (stub->frame, NULL, + stub->frame->this, + -1, ENOENT); + server_loc_wipe (&stub->args.unlink.loc); + FREE (stub); + return 0; + } + + if (stub->args.unlink.loc.parent == NULL) + stub->args.unlink.loc.parent = inode_ref (parent); + + if (server_inode && (stub->args.unlink.loc.inode == NULL)) { + stub->args.unlink.loc.inode = inode_ref (server_inode); + stub->args.unlink.loc.ino = server_inode->ino; + } + call_resume (stub); + break; + } + + case GF_FOP_SYMLINK: + { + if ((op_ret < 0) && (parent == NULL)) { + gf_log (stub->frame->this->name, GF_LOG_ERROR, + "%"PRId64": SYMLINK (%s -> %s) on %s returning error: " + "%"PRId32" (%"PRId32")", + stub->frame->root->unique, + stub->args.symlink.loc.path, + stub->args.symlink.linkname, + BOUND_XL(stub->frame)->name, + op_ret, op_errno); + server_symlink_cbk (stub->frame, NULL, + stub->frame->this, + -1, ENOENT, + NULL, NULL); + server_loc_wipe (&stub->args.symlink.loc); + FREE (stub); + return 0; + } + + if (stub->args.symlink.loc.parent == NULL) + stub->args.symlink.loc.parent = inode_ref (parent); + + if (server_inode && (stub->args.symlink.loc.inode == NULL)) { + stub->args.symlink.loc.inode = + inode_ref (server_inode); + stub->args.symlink.loc.ino = server_inode->ino; + } + call_resume (stub); + break; + } + + case GF_FOP_RMDIR: + { + if (op_ret < 0) { + gf_log (stub->frame->this->name, GF_LOG_ERROR, + "%"PRId64": RMDIR (%s) on %s returning error: " + "%"PRId32" (%"PRId32")", + stub->frame->root->unique, + stub->args.rmdir.loc.path, + BOUND_XL(stub->frame)->name, + op_ret, op_errno); + server_rmdir_cbk (stub->frame, + NULL, + stub->frame->this, + -1, + ENOENT); + server_loc_wipe (&stub->args.rmdir.loc); + FREE (stub); + return 0; + } + + if (stub->args.rmdir.loc.parent == NULL) + stub->args.rmdir.loc.parent = inode_ref (parent); + + if (server_inode && (stub->args.rmdir.loc.inode == NULL)) { + stub->args.rmdir.loc.inode 
= inode_ref (server_inode); + stub->args.rmdir.loc.ino = server_inode->ino; + } + call_resume (stub); + break; + } + + case GF_FOP_CHMOD: + { + if (op_ret < 0) { + gf_log (stub->frame->this->name, GF_LOG_ERROR, + "%"PRId64": CHMOD (%s) on %s returning error: " + "%"PRId32" (%"PRId32")", + stub->frame->root->unique, + stub->args.chmod.loc.path, + BOUND_XL(stub->frame)->name, + op_ret, op_errno); + server_chmod_cbk (stub->frame, + NULL, + stub->frame->this, + -1, + ENOENT, + NULL); + server_loc_wipe (&stub->args.chmod.loc); + FREE (stub); + return 0; + } + + if (stub->args.chmod.loc.parent == NULL) + stub->args.chmod.loc.parent = inode_ref (parent); + + if (server_inode && (stub->args.chmod.loc.inode == NULL)) { + stub->args.chmod.loc.inode = inode_ref (server_inode); + stub->args.chmod.loc.ino = server_inode->ino; + } + call_resume (stub); + break; + } + + case GF_FOP_CHOWN: + { + if (op_ret < 0) { + gf_log (stub->frame->this->name, GF_LOG_ERROR, + "%"PRId64": CHOWN (%s) on %s returning ENOENT: " + "%"PRId32" (%"PRId32")", + stub->frame->root->unique, + stub->args.chown.loc.path, + BOUND_XL(stub->frame)->name, + op_ret, op_errno); + server_chown_cbk (stub->frame, + NULL, + stub->frame->this, + -1, + ENOENT, + NULL); + server_loc_wipe (&stub->args.chown.loc); + FREE (stub); + return 0; + } + + if (stub->args.chown.loc.parent == NULL) + stub->args.chown.loc.parent = inode_ref (parent); + + if (server_inode && (stub->args.chown.loc.inode == NULL)) { + stub->args.chown.loc.inode = inode_ref (server_inode); + stub->args.chown.loc.ino = server_inode->ino; + } + call_resume (stub); + break; + } + + case GF_FOP_LINK: + { + if (stub->args.link.oldloc.inode == NULL) { + if (op_ret < 0) { + gf_log (stub->frame->this->name, GF_LOG_ERROR, + "%"PRId64": LINK (%s -> %s) on %s returning " + "error for oldloc: " + "%"PRId32" (%"PRId32")", + stub->frame->root->unique, + stub->args.link.oldloc.path, + stub->args.link.newloc.path, + BOUND_XL(stub->frame)->name, + op_ret, op_errno); + + 
server_link_cbk (stub->frame, + NULL, + stub->frame->this, + -1, ENOENT, + NULL, NULL); + server_loc_wipe (&stub->args.link.oldloc); + server_loc_wipe (&stub->args.link.newloc); + FREE (stub); + return 0; + } + + if (stub->args.link.oldloc.parent == NULL) + stub->args.link.oldloc.parent = + inode_ref (parent); + + if (server_inode && + (stub->args.link.oldloc.inode == NULL)) { + stub->args.link.oldloc.inode = + inode_ref (server_inode); + stub->args.link.oldloc.ino = server_inode->ino; + } + + if (stub->args.link.newloc.parent == NULL) { + do_path_lookup (stub, + &(stub->args.link.newloc)); + break; + } + } else { + /* we are called by the lookup of newpath */ + if ((op_ret < 0) && (parent == NULL)) { + gf_log (stub->frame->this->name, GF_LOG_ERROR, + "%"PRId64": LINK (%s -> %s) on %s returning " + "error for newloc: " + "%"PRId32" (%"PRId32")", + stub->frame->root->unique, + stub->args.link.oldloc.path, + stub->args.link.newloc.path, + BOUND_XL(stub->frame)->name, + op_ret, op_errno); + + server_link_cbk (stub->frame, NULL, + stub->frame->this, + -1, ENOENT, + NULL, NULL); + + server_loc_wipe (&stub->args.link.oldloc); + server_loc_wipe (&stub->args.link.newloc); + FREE (stub); + break; + } + + if (stub->args.link.newloc.parent == NULL) { + stub->args.link.newloc.parent = + inode_ref (parent); + } + + if (server_inode && + (stub->args.link.newloc.inode == NULL)) { + /* as new.inode doesn't get forget, it + * needs to be unref'd here */ + stub->args.link.newloc.inode = + inode_ref (server_inode); + stub->args.link.newloc.ino = server_inode->ino; + } + } + call_resume (stub); + break; + } + + case GF_FOP_TRUNCATE: + { + if (op_ret < 0) { + gf_log (stub->frame->this->name, GF_LOG_ERROR, + "%"PRId64": TRUNCATE (%s) on %s returning error: " + "%"PRId32" (%"PRId32")", + stub->frame->root->unique, + stub->args.truncate.loc.path, + BOUND_XL(stub->frame)->name, + op_ret, op_errno); + + server_truncate_cbk (stub->frame, + NULL, + stub->frame->this, + -1, ENOENT, + NULL); + 
server_loc_wipe (&stub->args.truncate.loc); + FREE (stub); + return 0; + } + + if (stub->args.truncate.loc.parent == NULL) + stub->args.truncate.loc.parent = inode_ref (parent); + + if (server_inode && (stub->args.truncate.loc.inode == NULL)) { + stub->args.truncate.loc.inode = + inode_ref (server_inode); + stub->args.truncate.loc.ino = server_inode->ino; + } + call_resume (stub); + break; + } + + case GF_FOP_STATFS: + { + if (op_ret < 0) { + gf_log (stub->frame->this->name, GF_LOG_ERROR, + "%"PRId64": STATFS (%s) on %s returning ENOENT: " + "%"PRId32" (%"PRId32")", + stub->frame->root->unique, + stub->args.statfs.loc.path, + BOUND_XL(stub->frame)->name, + op_ret, op_errno); + + server_statfs_cbk (stub->frame, + NULL, + stub->frame->this, + -1, ENOENT, + NULL); + server_loc_wipe (&stub->args.statfs.loc); + FREE (stub); + return 0; + } + + if (stub->args.statfs.loc.parent == NULL) + stub->args.statfs.loc.parent = inode_ref (parent); + + if (server_inode && (stub->args.statfs.loc.inode == NULL)) { + stub->args.statfs.loc.inode = inode_ref (server_inode); + stub->args.statfs.loc.ino = server_inode->ino; + } + call_resume (stub); + break; + } + + case GF_FOP_SETXATTR: + { + dict_t *dict = stub->args.setxattr.dict; + if (op_ret < 0) { + gf_log (stub->frame->this->name, GF_LOG_ERROR, + "%"PRId64": SETXATTR (%s) on %s returning error: " + "%"PRId32" (%"PRId32")", + stub->frame->root->unique, + stub->args.setxattr.loc.path, + BOUND_XL(stub->frame)->name, + op_ret, op_errno); + + server_setxattr_cbk (stub->frame, + NULL, + stub->frame->this, + -1, ENOENT); + + server_loc_wipe (&stub->args.setxattr.loc); + dict_unref (dict); + FREE (stub); + return 0; + } + + if (stub->args.setxattr.loc.parent == NULL) + stub->args.setxattr.loc.parent = inode_ref (parent); + + if (server_inode && (stub->args.setxattr.loc.inode == NULL)) { + stub->args.setxattr.loc.inode = + inode_ref (server_inode); + stub->args.setxattr.loc.ino = server_inode->ino; + } + call_resume (stub); + break; + } + + 
case GF_FOP_GETXATTR: + { + if (op_ret < 0) { + gf_log (stub->frame->this->name, GF_LOG_ERROR, + "%"PRId64": GETXATTR (%s) on %s for key %s " + "returning error: %"PRId32" (%"PRId32")", + stub->frame->root->unique, + stub->args.getxattr.loc.path, + BOUND_XL(stub->frame)->name, + stub->args.getxattr.name ? + stub->args.getxattr.name : "<nul>", + op_ret, op_errno); + + server_getxattr_cbk (stub->frame, + NULL, + stub->frame->this, + -1, ENOENT, + NULL); + server_loc_wipe (&stub->args.getxattr.loc); + FREE (stub); + return 0; + } + + if (stub->args.getxattr.loc.parent == NULL) + stub->args.getxattr.loc.parent = inode_ref (parent); + + if (server_inode && (stub->args.getxattr.loc.inode == NULL)) { + stub->args.getxattr.loc.inode = + inode_ref (server_inode); + stub->args.getxattr.loc.ino = server_inode->ino; + } + call_resume (stub); + break; + } + + case GF_FOP_REMOVEXATTR: + { + if (op_ret < 0) { + gf_log (stub->frame->this->name, GF_LOG_ERROR, + "%"PRId64": REMOVEXATTR (%s) on %s for key %s " + "returning error: %"PRId32" (%"PRId32")", + stub->frame->root->unique, + stub->args.removexattr.loc.path, + BOUND_XL(stub->frame)->name, + stub->args.removexattr.name, + op_ret, op_errno); + + server_removexattr_cbk (stub->frame, + NULL, + stub->frame->this, + -1, + ENOENT); + server_loc_wipe (&stub->args.removexattr.loc); + FREE (stub); + return 0; + } + + if (stub->args.removexattr.loc.parent == NULL) + stub->args.removexattr.loc.parent = inode_ref (parent); + + if (server_inode && + (stub->args.removexattr.loc.inode == NULL)) { + stub->args.removexattr.loc.inode = + inode_ref (server_inode); + stub->args.removexattr.loc.ino = server_inode->ino; + } + call_resume (stub); + break; + } + + case GF_FOP_OPENDIR: + { + if (op_ret < 0) { + gf_log (stub->frame->this->name, GF_LOG_ERROR, + "%"PRId64": OPENDIR (%s) on %s returning error: " + "%"PRId32" (%"PRId32")", + stub->frame->root->unique, + stub->args.opendir.loc.path, + BOUND_XL(stub->frame)->name, + op_ret, op_errno); + + 
server_opendir_cbk (stub->frame, + NULL, + stub->frame->this, + -1, ENOENT, + NULL); + server_loc_wipe (&stub->args.opendir.loc); + FREE (stub); + return 0; + } + + if (stub->args.opendir.loc.parent == NULL) + stub->args.opendir.loc.parent = inode_ref (parent); + + if (server_inode && (stub->args.opendir.loc.inode == NULL)) { + stub->args.opendir.loc.inode = + inode_ref (server_inode); + stub->args.opendir.loc.ino = server_inode->ino; + } + call_resume (stub); + break; + } + + case GF_FOP_ACCESS: + { + if (op_ret < 0) { + gf_log (stub->frame->this->name, GF_LOG_ERROR, + "%"PRId64": ACCESS (%s) on %s returning error: " + "%"PRId32" (%"PRId32")", + stub->frame->root->unique, + stub->args.access.loc.path, + BOUND_XL(stub->frame)->name, + op_ret, op_errno); + + server_access_cbk (stub->frame, + NULL, + stub->frame->this, + -1, ENOENT); + server_loc_wipe (&stub->args.access.loc); + FREE (stub); + return 0; + } + + if (stub->args.access.loc.parent == NULL) + stub->args.access.loc.parent = inode_ref (parent); + + if (server_inode && (stub->args.access.loc.inode == NULL)) { + stub->args.access.loc.inode = inode_ref (server_inode); + stub->args.access.loc.ino = server_inode->ino; + } + call_resume (stub); + break; + } + + + case GF_FOP_UTIMENS: + { + if (op_ret < 0) { + gf_log (stub->frame->this->name, GF_LOG_ERROR, + "%"PRId64": UTIMENS (%s) on %s returning error: " + "%"PRId32" (%"PRId32")", + stub->frame->root->unique, + stub->args.utimens.loc.path, + BOUND_XL(stub->frame)->name, + op_ret, op_errno); + + server_utimens_cbk (stub->frame, + NULL, + stub->frame->this, + -1, ENOENT, + NULL); + server_loc_wipe (&stub->args.utimens.loc); + FREE (stub); + return 0; + } + + if (stub->args.utimens.loc.parent == NULL) + stub->args.utimens.loc.parent = inode_ref (parent); + + if (server_inode && (stub->args.utimens.loc.inode == NULL)) { + stub->args.utimens.loc.inode = + inode_ref (server_inode); + stub->args.utimens.loc.ino = server_inode->ino; + } + call_resume (stub); + break; + 
} + + case GF_FOP_READLINK: + { + if (op_ret < 0) { + gf_log (stub->frame->this->name, GF_LOG_ERROR, + "%"PRId64": READLINK (%s) on %s returning error: " + "%"PRId32" (%"PRId32")", + stub->frame->root->unique, + stub->args.readlink.loc.path, + BOUND_XL(stub->frame)->name, + op_ret, op_errno); + + server_readlink_cbk (stub->frame, + NULL, + stub->frame->this, + -1, ENOENT, + NULL); + server_loc_wipe (&stub->args.readlink.loc); + FREE (stub); + return 0; + } + + if (stub->args.readlink.loc.parent == NULL) + stub->args.readlink.loc.parent = inode_ref (parent); + + if (server_inode && (stub->args.readlink.loc.inode == NULL)) { + stub->args.readlink.loc.inode = + inode_ref (server_inode); + stub->args.readlink.loc.ino = server_inode->ino; + } + call_resume (stub); + break; + } + case GF_FOP_MKDIR: + { + if ((op_ret < 0) && (parent == NULL)) { + gf_log (stub->frame->this->name, GF_LOG_ERROR, + "%"PRId64": MKDIR (%s) on %s returning error: " + "%"PRId32" (%"PRId32")", + stub->frame->root->unique, + stub->args.mkdir.loc.path, + BOUND_XL(stub->frame)->name, + op_ret, op_errno); + + server_mkdir_cbk (stub->frame, + NULL, + stub->frame->this, + -1, ENOENT, + NULL, NULL); + server_loc_wipe (&stub->args.mkdir.loc); + FREE (stub); + break; + } + + if (stub->args.mkdir.loc.parent == NULL) + stub->args.mkdir.loc.parent = inode_ref (parent); + + if (server_inode && (stub->args.mkdir.loc.inode == NULL)) { + stub->args.mkdir.loc.inode = inode_ref (server_inode); + stub->args.mkdir.loc.ino = server_inode->ino; + } + + call_resume (stub); + break; + } + + case GF_FOP_CREATE: + { + if ((op_ret < 0) && (parent == NULL)) { + gf_log (stub->frame->this->name, GF_LOG_ERROR, + "%"PRId64": CREATE (%s) on %s returning error: " + "%"PRId32" (%"PRId32")", + stub->frame->root->unique, + stub->args.create.loc.path, + BOUND_XL(stub->frame)->name, + op_ret, op_errno); + + server_create_cbk (stub->frame, + NULL, + stub->frame->this, + -1, ENOENT, + NULL, NULL, + NULL); + if (stub->args.create.fd) + 
fd_unref (stub->args.create.fd); + server_loc_wipe (&stub->args.create.loc); + FREE (stub); + break; + } + + if (stub->args.create.loc.parent == NULL) + stub->args.create.loc.parent = inode_ref (parent); + + if (server_inode && (stub->args.create.loc.inode == NULL)) { + stub->args.create.loc.inode = inode_ref (server_inode); + stub->args.create.loc.ino = server_inode->ino; + } + + call_resume (stub); + break; + } + + case GF_FOP_MKNOD: + { + if ((op_ret < 0) && (parent == NULL)) { + gf_log (stub->frame->this->name, GF_LOG_ERROR, + "%"PRId64": MKNOD (%s) on %s returning error: " + "%"PRId32" (%"PRId32")", + stub->frame->root->unique, + stub->args.mknod.loc.path, + BOUND_XL(stub->frame)->name, + op_ret, op_errno); + + server_mknod_cbk (stub->frame, + NULL, + stub->frame->this, + -1, ENOENT, + NULL, NULL); + server_loc_wipe (&stub->args.mknod.loc); + FREE (stub); + break; + } + + if (stub->args.mknod.loc.parent == NULL) + stub->args.mknod.loc.parent = inode_ref (parent); + + if (server_inode && (stub->args.mknod.loc.inode == NULL)) { + stub->args.mknod.loc.inode = inode_ref (server_inode); + stub->args.mknod.loc.ino = server_inode->ino; + } + call_resume (stub); + break; + } + case GF_FOP_ENTRYLK: + { + if (op_ret < 0) { + gf_log (stub->frame->this->name, GF_LOG_ERROR, + "%"PRId64": ENTRYLK (%s) on %s for key %s returning " + "error: %"PRId32" (%"PRId32")", + stub->frame->root->unique, + stub->args.entrylk.loc.path, + BOUND_XL(stub->frame)->name, + stub->args.entrylk.name ? 
+ stub->args.entrylk.name : "<nul>", + op_ret, op_errno); + + server_entrylk_cbk (stub->frame, + NULL, + stub->frame->this, + -1, ENOENT); + server_loc_wipe (&stub->args.entrylk.loc); + FREE (stub); + break; + } + + if (stub->args.entrylk.loc.parent == NULL) + stub->args.entrylk.loc.parent = inode_ref (parent); + + if (server_inode && (stub->args.entrylk.loc.inode == NULL)) { + stub->args.entrylk.loc.inode = inode_ref (server_inode); + stub->args.entrylk.loc.ino = server_inode->ino; + } + call_resume (stub); + break; + } + case GF_FOP_INODELK: + { + if (op_ret < 0) { + gf_log (stub->frame->this->name, GF_LOG_ERROR, + "%"PRId64": INODELK (%s) on %s returning error: " + "%"PRId32" (%"PRId32")", + stub->frame->root->unique, + stub->args.inodelk.loc.path, + BOUND_XL(stub->frame)->name, + op_ret, op_errno); + + server_inodelk_cbk (stub->frame, + NULL, + stub->frame->this, + -1, ENOENT); + server_loc_wipe (&stub->args.inodelk.loc); + FREE (stub); + break; + } + + if (stub->args.inodelk.loc.parent == NULL) + stub->args.inodelk.loc.parent = inode_ref (parent); + + if (server_inode && (stub->args.inodelk.loc.inode == NULL)) { + stub->args.inodelk.loc.inode = + inode_ref (server_inode); + stub->args.inodelk.loc.ino = server_inode->ino; + } + call_resume (stub); + break; + } + default: + call_resume (stub); + } + + return 0; +} + +static int +server_lookup_resume (call_frame_t *frame, + xlator_t *this, + loc_t *loc, + dict_t *xattr_req) +{ + server_state_t *state = NULL; + + state = CALL_STATE(frame); + + if ((state->loc.parent == NULL) && + (loc->parent)) + state->loc.parent = inode_ref (loc->parent); + + if (state->loc.inode == NULL) { + if (loc->inode == NULL) + state->loc.inode = inode_new (state->itable); + else + /* FIXME: why another lookup? 
*/ + state->loc.inode = inode_ref (loc->inode); + } else { + if (loc->inode && (state->loc.inode != loc->inode)) { + if (state->loc.inode) + inode_unref (state->loc.inode); + state->loc.inode = inode_ref (loc->inode); + } + } + + gf_log (BOUND_XL(frame)->name, GF_LOG_DEBUG, + "%"PRId64": LOOKUP \'%"PRId64"/%s\'", + frame->root->unique, state->par, state->bname); + + STACK_WIND (frame, + server_lookup_cbk, + BOUND_XL(frame), + BOUND_XL(frame)->fops->lookup, + &(state->loc), + xattr_req); + return 0; +} + +/* + * server_lookup - lookup function for server protocol + * @frame: call frame + * @bound_xl: + * @params: parameter dictionary + * + * not for external reference + */ +int +server_lookup (call_frame_t *frame, + xlator_t *bound_xl, + gf_hdr_common_t *hdr, size_t hdrlen, + char *buf, size_t buflen) +{ + gf_fop_lookup_req_t *req = NULL; + server_state_t *state = NULL; + call_stub_t *lookup_stub = NULL; + int32_t ret = -1; + size_t pathlen = 0, baselen = 0; + size_t dictlen = 0; + dict_t *xattr_req = NULL; + char *req_dictbuf = NULL; + + req = gf_param (hdr); + + state = CALL_STATE(frame); + { + + pathlen = STRLEN_0 (req->path); + dictlen = ntoh32 (req->dictlen); + + /* NOTE: lookup() uses req->ino only to identify if a lookup() + * is requested for 'root' or not + */ + state->ino = ntoh64 (req->ino); + if (state->ino != 1) + state->ino = 0; + + state->par = ntoh64 (req->par); + state->path = req->path; + if (IS_NOT_ROOT(pathlen)) { + state->bname = req->bname + pathlen; + baselen = STRLEN_0 (state->bname); + } + + if (dictlen) { + /* Unserialize the dictionary */ + req_dictbuf = memdup (req->dict + pathlen + baselen, dictlen); + GF_VALIDATE_OR_GOTO(bound_xl->name, req_dictbuf, fail); + + xattr_req = dict_new (); + GF_VALIDATE_OR_GOTO(bound_xl->name, xattr_req, fail); + + ret = dict_unserialize (req_dictbuf, dictlen, &xattr_req); + if (ret < 0) { + gf_log (bound_xl->name, GF_LOG_ERROR, + "%"PRId64": %s (%"PRId64"): failed to " + "unserialize request buffer to 
dictionary", + frame->root->unique, state->loc.path, + state->ino); + free (req_dictbuf); + goto fail; + } else{ + xattr_req->extra_free = req_dictbuf; + state->xattr_req = xattr_req; + xattr_req = NULL; + } + } + } + + ret = server_loc_fill (&state->loc, state, + state->ino, state->par, state->bname, + state->path); + + if (state->loc.inode) { + /* revalidate */ + state->is_revalidate = 1; + } else { + /* fresh lookup or inode was previously pruned out */ + state->is_revalidate = -1; + } + + lookup_stub = fop_lookup_stub (frame, server_lookup_resume, + &(state->loc), state->xattr_req); + GF_VALIDATE_OR_GOTO(bound_xl->name, lookup_stub, fail); + + if ((state->loc.parent == NULL) && + IS_NOT_ROOT(pathlen)) + do_path_lookup (lookup_stub, &(state->loc)); + else + call_resume (lookup_stub); + + return 0; +fail: + server_lookup_cbk (frame, NULL, frame->this, + -1,EINVAL, + NULL, NULL, NULL); + if (xattr_req) + dict_unref (xattr_req); + + return 0; +} + + +/* + * server_forget - forget function for server protocol + * @frame: call frame + * @bound_xl: + * @params: parameter dictionary + * + * not for external reference + */ +int32_t +server_forget (call_frame_t *frame, xlator_t *bound_xl, + gf_hdr_common_t *hdr, size_t hdrlen, + char *buf, size_t buflen) +{ + int index = 0; + ino_t ino = 0; + int32_t count = 0; + inode_t *inode = NULL; + gf_cbk_forget_req_t *req = NULL; + + req = gf_param (hdr); + count = ntoh32 (req->count); + + for (index = 0; index < count; index++) { + + ino = ntoh64 (req->ino_array[index]); + + if (!ino) + continue; + + inode = inode_search (bound_xl->itable, ino, NULL); + + if (inode) { + inode_forget (inode, 0); + inode_unref (inode); + } else { + gf_log (bound_xl->name, GF_LOG_DEBUG, + "%"PRId64": FORGET %"PRId64" not found " + "in inode table", + frame->root->unique, ino); + } + + gf_log (bound_xl->name, GF_LOG_DEBUG, + "%"PRId64": FORGET \'%"PRId64"\'", + frame->root->unique, ino); + } + + server_forget_cbk (frame, NULL, bound_xl, 0, 0); + + 
return 0; +} + + + +int32_t +server_stat_resume (call_frame_t *frame, + xlator_t *this, + loc_t *loc) +{ + server_state_t *state = NULL; + + state = CALL_STATE(frame); + + gf_log (BOUND_XL(frame)->name, GF_LOG_DEBUG, + "%"PRId64": STAT \'%s (%"PRId64")\'", + frame->root->unique, state->loc.path, state->loc.ino); + + STACK_WIND (frame, + server_stat_cbk, + BOUND_XL(frame), + BOUND_XL(frame)->fops->stat, + loc); + return 0; +} + +/* + * server_stat - stat function for server + * @frame: call frame + * @bound_xl: translator this server is bound to + * @params: parameters dictionary + * + * not for external reference + */ +int32_t +server_stat (call_frame_t *frame, + xlator_t *bound_xl, + gf_hdr_common_t *hdr, size_t hdrlen, + char *buf, size_t buflen) +{ + call_stub_t *stat_stub = NULL; + gf_fop_stat_req_t *req = NULL; + server_state_t *state = NULL; + int32_t ret = -1; + size_t pathlen = 0; + + req = gf_param (hdr); + state = CALL_STATE(frame); + + state->ino = ntoh64 (req->ino); + state->path = req->path; + pathlen = STRLEN_0(state->path); + + ret = server_loc_fill (&(state->loc), state, + state->ino, state->par, state->bname, + state->path); + + stat_stub = fop_stat_stub (frame, + server_stat_resume, + &(state->loc)); + GF_VALIDATE_OR_GOTO(bound_xl->name, stat_stub, fail); + + if (((state->loc.parent == NULL) && IS_NOT_ROOT(pathlen)) || + (state->loc.inode == NULL)) { + do_path_lookup (stat_stub, &(state->loc)); + } else { + call_resume (stat_stub); + } + return 0; +fail: + server_stat_cbk (frame, NULL, frame->this, + -1, EINVAL, + NULL); + return 0; +} + + +int32_t +server_readlink_resume (call_frame_t *frame, + xlator_t *this, + loc_t *loc, + size_t size) +{ + server_state_t *state = NULL; + + state = CALL_STATE(frame); + + gf_log (BOUND_XL(frame)->name, GF_LOG_DEBUG, + "%"PRId64": READLINK \'%s (%"PRId64")\'", + frame->root->unique, state->loc.path, state->loc.ino); + + STACK_WIND (frame, + server_readlink_cbk, + BOUND_XL(frame), + 
BOUND_XL(frame)->fops->readlink, + loc, + size); + return 0; +} + +/* + * server_readlink - readlink function for server + * @frame: call frame + * @bound_xl: translator this server is bound to + * @params: parameters dictionary + * + * not for external reference + */ +int32_t +server_readlink (call_frame_t *frame, + xlator_t *bound_xl, + gf_hdr_common_t *hdr, size_t hdrlen, + char *buf, size_t buflen) +{ + call_stub_t *readlink_stub = NULL; + gf_fop_readlink_req_t *req = NULL; + server_state_t *state = NULL; + int32_t ret = -1; + + req = gf_param (hdr); + state = CALL_STATE(frame); + + state->size = ntoh32 (req->size); + + state->ino = ntoh64 (req->ino); + state->path = req->path; + + ret = server_loc_fill (&(state->loc), state, + state->ino, 0, NULL, state->path); + + readlink_stub = fop_readlink_stub (frame, + server_readlink_resume, + &(state->loc), + state->size); + GF_VALIDATE_OR_GOTO(bound_xl->name, readlink_stub, fail); + + if ((state->loc.parent == NULL) || + (state->loc.inode == NULL)) { + do_path_lookup (readlink_stub, &(state->loc)); + } else { + call_resume (readlink_stub); + } + return 0; +fail: + server_readlink_cbk (frame, NULL,frame->this, + -1, EINVAL, + NULL); + return 0; +} + +int32_t +server_create_resume (call_frame_t *frame, + xlator_t *this, + loc_t *loc, + int32_t flags, + mode_t mode, + fd_t *fd) +{ + server_state_t *state = CALL_STATE(frame); + + if (state->loc.parent == NULL) + state->loc.parent = inode_ref (loc->parent); + + state->loc.inode = inode_new (state->itable); + GF_VALIDATE_OR_GOTO(BOUND_XL(frame)->name, state->loc.inode, fail); + + state->fd = fd_create (state->loc.inode, frame->root->pid); + GF_VALIDATE_OR_GOTO(BOUND_XL(frame)->name, state->fd, fail); + + state->fd->flags = flags; + state->fd = fd_ref (state->fd); + + gf_log (BOUND_XL(frame)->name, GF_LOG_DEBUG, + "%"PRId64": CREATE \'%"PRId64"/%s\'", + frame->root->unique, state->par, state->bname); + + STACK_WIND (frame, + server_create_cbk, + BOUND_XL(frame), + 
BOUND_XL(frame)->fops->create, + &(state->loc), + flags, + mode, + state->fd); + + return 0; +fail: + server_create_cbk (frame, NULL, frame->this, + -1, EINVAL, + NULL, NULL, NULL); + return 0; +} + + +/* + * server_create - create function for server + * @frame: call frame + * @bound_xl: translator this server is bound to + * @params: parameters dictionary + * + * not for external reference + */ +int32_t +server_create (call_frame_t *frame, xlator_t *bound_xl, + gf_hdr_common_t *hdr, size_t hdrlen, + char *buf, size_t buflen) +{ + gf_fop_create_req_t *req = NULL; + server_state_t *state = NULL; + call_stub_t *create_stub = NULL; + int32_t ret = -1; + size_t pathlen = 0; + + req = gf_param (hdr); + state = CALL_STATE(frame); + { + pathlen = STRLEN_0(req->path); + + state->par = ntoh64 (req->par); + state->path = req->path; + if (IS_NOT_ROOT(pathlen)) + state->bname = req->bname + pathlen; + + state->mode = ntoh32 (req->mode); + state->flags = ntoh32 (req->flags); + } + + ret = server_loc_fill (&(state->loc), state, + 0, state->par, state->bname, + state->path); + + create_stub = fop_create_stub (frame, server_create_resume, + &(state->loc), state->flags, + state->mode, state->fd); + GF_VALIDATE_OR_GOTO(bound_xl->name, create_stub, fail); + + if (state->loc.parent == NULL) { + do_path_lookup (create_stub, &state->loc); + } else { + call_resume (create_stub); + } + return 0; +fail: + server_create_cbk (frame, NULL, frame->this, + -1, EINVAL, + NULL, NULL, NULL); + return 0; +} + + +int32_t +server_open_resume (call_frame_t *frame, + xlator_t *this, + loc_t *loc, + int32_t flags, + fd_t *fd) +{ + server_state_t *state = CALL_STATE(frame); + fd_t *new_fd = NULL; + + new_fd = fd_create (loc->inode, frame->root->pid); + GF_VALIDATE_OR_GOTO(BOUND_XL(frame)->name, new_fd, fail); + + new_fd->flags = flags; + + state->fd = fd_ref (new_fd); + + gf_log (BOUND_XL(frame)->name, GF_LOG_DEBUG, + "%"PRId64": OPEN \'%s (%"PRId64")\'", + frame->root->unique, state->path, state->ino); 
+ + STACK_WIND (frame, + server_open_cbk, + BOUND_XL(frame), + BOUND_XL(frame)->fops->open, + loc, + flags, + state->fd); + + return 0; +fail: + server_open_cbk (frame, NULL, frame->this, + -1, EINVAL, + NULL); + return 0; +} + +/* + * server_open - open function for server protocol + * @frame: call frame + * @bound_xl: translator this server protocol is bound to + * @params: parameters dictionary + * + * not for external reference + */ +int32_t +server_open (call_frame_t *frame, xlator_t *bound_xl, + gf_hdr_common_t *hdr, size_t hdrlen, + char *buf, size_t buflen) +{ + call_stub_t *open_stub = NULL; + gf_fop_open_req_t *req = NULL; + server_state_t *state = NULL; + int32_t ret = -1; + size_t pathlen = 0; + + req = gf_param (hdr); + state = CALL_STATE(frame); + { + state->ino = ntoh64 (req->ino); + state->path = req->path; + pathlen = STRLEN_0(state->path); + state->flags = ntoh32 (req->flags); + } + ret = server_loc_fill (&(state->loc), state, + state->ino, 0, NULL, state->path); + + open_stub = fop_open_stub (frame, + server_open_resume, + &(state->loc), state->flags, NULL); + GF_VALIDATE_OR_GOTO(bound_xl->name, open_stub, fail); + + if (((state->loc.parent == NULL) && IS_NOT_ROOT(pathlen)) || + (state->loc.inode == NULL)) { + do_path_lookup (open_stub, &state->loc); + } else { + call_resume (open_stub); + } + return 0; +fail: + server_open_cbk (frame, NULL, frame->this, + -1, EINVAL, + NULL); + return 0; +} + + +/* + * server_readv - readv function for server protocol + * @frame: call frame + * @bound_xl: + * @params: parameter dictionary + * + * not for external reference + */ +int32_t +server_readv (call_frame_t *frame, xlator_t *bound_xl, + gf_hdr_common_t *hdr, size_t hdrlen, + char *buf, size_t buflen) +{ + gf_fop_read_req_t *req = NULL; + server_state_t *state = NULL; + server_connection_t *conn = NULL; + + conn = SERVER_CONNECTION(frame); + + req = gf_param (hdr); + + state = CALL_STATE(frame); + { + state->fd_no = ntoh64 (req->fd); + if (state->fd_no >= 
0) + state->fd = gf_fd_fdptr_get (conn->fdtable, + state->fd_no); + + state->size = ntoh32 (req->size); + state->offset = ntoh64 (req->offset); + } + + GF_VALIDATE_OR_GOTO(bound_xl->name, state->fd, fail); + + gf_log (bound_xl->name, GF_LOG_DEBUG, + "%"PRId64": READV \'fd=%"PRId64" (%"PRId64"); " + "offset=%"PRId64"; size=%"PRId64, + frame->root->unique, state->fd_no, state->fd->inode->ino, + state->offset, (int64_t)state->size); + + STACK_WIND (frame, + server_readv_cbk, + BOUND_XL(frame), + BOUND_XL(frame)->fops->readv, + state->fd, state->size, state->offset); + return 0; +fail: + server_readv_cbk (frame, NULL, frame->this, + -1, EINVAL, NULL, 0, NULL); + return 0; +} + + +/* + * server_writev - writev function for server + * @frame: call frame + * @bound_xl: + * @params: parameter dictionary + * + * not for external reference + */ +int32_t +server_writev (call_frame_t *frame, xlator_t *bound_xl, + gf_hdr_common_t *hdr, size_t hdrlen, + char *buf, size_t buflen) +{ + server_connection_t *conn = NULL; + gf_fop_write_req_t *req = NULL; + struct iovec iov = {0, }; + dict_t *refs = NULL; + server_state_t *state = NULL; + int32_t ret = -1; + + conn = SERVER_CONNECTION(frame); + + req = gf_param (hdr); + state = CALL_STATE(frame); + { + state->fd_no = ntoh64 (req->fd); + if (state->fd_no >= 0) + state->fd = gf_fd_fdptr_get (conn->fdtable, + state->fd_no); + + state->offset = ntoh64 (req->offset); + } + + GF_VALIDATE_OR_GOTO(bound_xl->name, state->fd, fail); + + iov.iov_base = buf; + iov.iov_len = buflen; + + refs = dict_new (); + GF_VALIDATE_OR_GOTO(bound_xl->name, refs, fail); + + ret = dict_set_dynptr (refs, NULL, buf, buflen); + if (ret < 0) { + gf_log (frame->this->name, GF_LOG_ERROR, + "fd - %"PRId64" (%"PRId64"): failed to set buffer entry " + "to req_refs", + state->fd_no, state->fd->inode->ino); + goto fail; + } else { + buf = NULL; + } + + frame->root->req_refs = refs; + + gf_log (bound_xl->name, GF_LOG_DEBUG, + "%"PRId64": WRITEV \'fd=%"PRId64" (%"PRId64"); 
" + "offset=%"PRId64"; size=%"PRId64, + frame->root->unique, state->fd_no, state->fd->inode->ino, + state->offset, (int64_t)buflen); + + STACK_WIND (frame, + server_writev_cbk, + BOUND_XL(frame), + BOUND_XL(frame)->fops->writev, + state->fd, &iov, 1, state->offset); + + if (refs) + dict_unref (refs); + return 0; +fail: + server_writev_cbk (frame, NULL, frame->this, + -1, EINVAL, NULL); + + if (buf) + free (buf); + + if (refs) + dict_unref (refs); + + return 0; +} + + + +/* + * server_release - release function for server protocol + * @frame: call frame + * @bound_xl: + * @params: parameter dictionary + * + * not for external reference + */ +int32_t +server_release (call_frame_t *frame, xlator_t *bound_xl, + gf_hdr_common_t *hdr, size_t hdrlen, + char *buf, size_t buflen) +{ + gf_cbk_release_req_t *req = NULL; + server_state_t *state = NULL; + server_connection_t *conn = NULL; + + conn = SERVER_CONNECTION(frame); + + req = gf_param (hdr); + state = CALL_STATE(frame); + + state->fd_no = ntoh64 (req->fd); + state->fd = gf_fd_fdptr_get (conn->fdtable, + state->fd_no); + + GF_VALIDATE_OR_GOTO(bound_xl->name, state->fd, fail); + + gf_fd_put (conn->fdtable, + state->fd_no); + + gf_log (bound_xl->name, GF_LOG_DEBUG, + "%"PRId64": RELEASE \'fd=%"PRId64" (%"PRId64")\'", + frame->root->unique, state->fd_no, state->fd->inode->ino); + + STACK_WIND (frame, + server_release_cbk, + BOUND_XL(frame), + BOUND_XL(frame)->fops->flush, + state->fd); + return 0; +fail: + server_release_cbk (frame, NULL, frame->this, + -1, EINVAL); + return 0; +} + + +/* + * server_fsync - fsync function for server protocol + * @frame: call frame + * @bound_xl: + * @params: parameters dictionary + * + * not for external reference + */ +int32_t +server_fsync (call_frame_t *frame, + xlator_t *bound_xl, + gf_hdr_common_t *hdr, size_t hdrlen, + char *buf, size_t buflen) +{ + gf_fop_fsync_req_t *req = NULL; + server_state_t *state = NULL; + server_connection_t *conn = NULL; + + conn = SERVER_CONNECTION(frame); 
+ + req = gf_param (hdr); + state = CALL_STATE(frame); + { + state->fd_no = ntoh64 (req->fd); + if (state->fd_no >= 0) + state->fd = gf_fd_fdptr_get (conn->fdtable, + state->fd_no); + + state->flags = ntoh32 (req->data); + } + + GF_VALIDATE_OR_GOTO(bound_xl->name, state->fd, fail); + + gf_log (bound_xl->name, GF_LOG_DEBUG, + "%"PRId64": FSYNC \'fd=%"PRId64" (%"PRId64")\'", + frame->root->unique, state->fd_no, state->fd->inode->ino); + + STACK_WIND (frame, + server_fsync_cbk, + BOUND_XL(frame), + BOUND_XL(frame)->fops->fsync, + state->fd, state->flags); + return 0; +fail: + server_fsync_cbk (frame, NULL, frame->this, + -1, EINVAL); + + return 0; +} + + +/* + * server_flush - flush function for server protocol + * @frame: call frame + * @bound_xl: + * @params: parameter dictionary + * + * not for external reference + */ +int32_t +server_flush (call_frame_t *frame, xlator_t *bound_xl, + gf_hdr_common_t *hdr, size_t hdrlen, + char *buf, size_t buflen) +{ + gf_fop_flush_req_t *req = NULL; + server_state_t *state = NULL; + server_connection_t *conn = NULL; + + conn = SERVER_CONNECTION(frame); + + req = gf_param (hdr); + state = CALL_STATE(frame); + { + state->fd_no = ntoh64 (req->fd); + if (state->fd_no >= 0) + state->fd = gf_fd_fdptr_get (conn->fdtable, + state->fd_no); + } + + GF_VALIDATE_OR_GOTO(bound_xl->name, state->fd, fail); + + gf_log (bound_xl->name, GF_LOG_DEBUG, + "%"PRId64": FLUSH \'fd=%"PRId64" (%"PRId64")\'", + frame->root->unique, state->fd_no, state->fd->inode->ino); + + STACK_WIND (frame, + server_flush_cbk, + BOUND_XL(frame), + BOUND_XL(frame)->fops->flush, + state->fd); + return 0; + +fail: + server_flush_cbk (frame, NULL, frame->this, + -1, EINVAL); + + return 0; +} + + +/* + * server_ftruncate - ftruncate function for server protocol + * @frame: call frame + * @bound_xl: + * @params: parameters dictionary + * + * not for external reference + */ +int32_t +server_ftruncate (call_frame_t *frame, + xlator_t *bound_xl, + gf_hdr_common_t *hdr, size_t 
hdrlen, + char *buf, size_t buflen) +{ + gf_fop_ftruncate_req_t *req = NULL; + server_state_t *state = NULL; + server_connection_t *conn = NULL; + + conn = SERVER_CONNECTION(frame); + + req = gf_param (hdr); + + state = CALL_STATE(frame); + { + state->fd_no = ntoh64 (req->fd); + if (state->fd_no >= 0) + state->fd = gf_fd_fdptr_get (conn->fdtable, + state->fd_no); + + state->offset = ntoh64 (req->offset); + } + + GF_VALIDATE_OR_GOTO(bound_xl->name, state->fd, fail); + + gf_log (bound_xl->name, GF_LOG_DEBUG, + "%"PRId64": FTRUNCATE \'fd=%"PRId64" (%"PRId64"); " + "offset=%"PRId64"\'", + frame->root->unique, state->fd_no, state->fd->inode->ino, + state->offset); + + STACK_WIND (frame, + server_ftruncate_cbk, + bound_xl, + bound_xl->fops->ftruncate, + state->fd, + state->offset); + return 0; +fail: + server_ftruncate_cbk (frame, NULL, frame->this, + -1, EINVAL, NULL); + + return 0; +} + + +/* + * server_fstat - fstat function for server protocol + * @frame: call frame + * @bound_xl: + * @params: parameter dictionary + * + * not for external reference + */ +int32_t +server_fstat (call_frame_t *frame, + xlator_t *bound_xl, + gf_hdr_common_t *hdr, size_t hdrlen, + char *buf, size_t buflen) +{ + gf_fop_fstat_req_t *req = NULL; + server_state_t *state = NULL; + server_connection_t *conn = NULL; + + conn = SERVER_CONNECTION(frame); + + req = gf_param (hdr); + state = CALL_STATE(frame); + { + state->fd_no = ntoh64 (req->fd); + if (state->fd_no >= 0) + state->fd = gf_fd_fdptr_get (conn->fdtable, + state->fd_no); + } + + + if (state->fd == NULL) { + gf_log (frame->this->name, GF_LOG_ERROR, + "fd - %"PRId64": unresolved fd", + state->fd_no); + + server_fstat_cbk (frame, NULL, frame->this, + -1, EINVAL, NULL); + + goto out; + } + + gf_log (bound_xl->name, GF_LOG_DEBUG, + "%"PRId64": FSTAT \'fd=%"PRId64" (%"PRId64")\'", + frame->root->unique, state->fd_no, state->fd->inode->ino); + + STACK_WIND (frame, + server_fstat_cbk, + bound_xl, + bound_xl->fops->fstat, + state->fd); +out: + 
return 0; +} + + +int32_t +server_truncate_resume (call_frame_t *frame, + xlator_t *this, + loc_t *loc, + off_t offset) +{ + server_state_t *state = NULL; + + state = CALL_STATE(frame); + + gf_log (BOUND_XL(frame)->name, GF_LOG_DEBUG, + "%"PRId64": TRUNCATE \'%s (%"PRId64")\'", + frame->root->unique, state->path, state->ino); + + STACK_WIND (frame, + server_truncate_cbk, + BOUND_XL(frame), + BOUND_XL(frame)->fops->truncate, + loc, + offset); + return 0; +} + + +/* + * server_truncate - truncate function for server protocol + * @frame: call frame + * @bound_xl: + * @params: + * + * not for external reference + */ +int32_t +server_truncate (call_frame_t *frame, + xlator_t *bound_xl, + gf_hdr_common_t *hdr, size_t hdrlen, + char *buf, size_t buflen) +{ + call_stub_t *truncate_stub = NULL; + gf_fop_truncate_req_t *req = NULL; + server_state_t *state = NULL; + int32_t ret = -1; + size_t pathlen = 0; + + req = gf_param (hdr); + state = CALL_STATE(frame); + { + pathlen = STRLEN_0(req->path); + state->offset = ntoh64 (req->offset); + + state->path = req->path; + state->ino = ntoh64 (req->ino); + } + + + ret = server_loc_fill (&(state->loc), state, + state->ino, 0, NULL, state->path); + + truncate_stub = fop_truncate_stub (frame, + server_truncate_resume, + &(state->loc), + state->offset); + if ((state->loc.parent == NULL) || + (state->loc.inode == NULL)) { + do_path_lookup (truncate_stub, &(state->loc)); + } else { + call_resume (truncate_stub); + } + + return 0; +} + + + + + +int32_t +server_unlink_resume (call_frame_t *frame, + xlator_t *this, + loc_t *loc) +{ + server_state_t *state = NULL; + + state = CALL_STATE(frame); + + if (state->loc.parent == NULL) + state->loc.parent = inode_ref (loc->parent); + + if (state->loc.inode == NULL) + state->loc.inode = inode_ref (loc->inode); + + gf_log (BOUND_XL(frame)->name, GF_LOG_DEBUG, + "%"PRId64": UNLINK \'%"PRId64"/%s (%"PRId64")\'", + frame->root->unique, state->par, state->path, + state->loc.inode->ino); + + STACK_WIND 
(frame, + server_unlink_cbk, + BOUND_XL(frame), + BOUND_XL(frame)->fops->unlink, + loc); + return 0; +} + +/* + * server_unlink - unlink function for server protocol + * @frame: call frame + * @bound_xl: + * @params: parameter dictionary + * + * not for external reference + */ +int32_t +server_unlink (call_frame_t *frame, + xlator_t *bound_xl, + gf_hdr_common_t *hdr, size_t hdrlen, + char *buf, size_t buflen) +{ + call_stub_t *unlink_stub = NULL; + gf_fop_unlink_req_t *req = NULL; + server_state_t *state = NULL; + int32_t ret = -1; + size_t pathlen = 0; + + req = gf_param (hdr); + state = CALL_STATE(frame); + + pathlen = STRLEN_0(req->path); + + state->par = ntoh64 (req->par); + state->path = req->path; + if (IS_NOT_ROOT(pathlen)) + state->bname = req->bname + pathlen; + + ret = server_loc_fill (&(state->loc), state, + 0, state->par, state->bname, + state->path); + + unlink_stub = fop_unlink_stub (frame, + server_unlink_resume, + &(state->loc)); + + if ((state->loc.parent == NULL) || + (state->loc.inode == NULL)) { + do_path_lookup (unlink_stub, &state->loc); + } else { + call_resume (unlink_stub); + } + + return 0; +} + + + + + +int32_t +server_setxattr_resume (call_frame_t *frame, + xlator_t *this, + loc_t *loc, + dict_t *dict, + int32_t flags) +{ + server_state_t *state = NULL; + + state = CALL_STATE(frame); + + gf_log (BOUND_XL(frame)->name, GF_LOG_DEBUG, + "%"PRId64": SETXATTR \'%s (%"PRId64")\'", + frame->root->unique, state->path, state->ino); + + STACK_WIND (frame, + server_setxattr_cbk, + BOUND_XL(frame), + BOUND_XL(frame)->fops->setxattr, + loc, + dict, + flags); + return 0; +} + +/* + * server_setxattr - setxattr function for server protocol + * @frame: call frame + * @bound_xl: + * @params: parameter dictionary + * + * not for external reference + */ + +int32_t +server_setxattr (call_frame_t *frame, + xlator_t *bound_xl, + gf_hdr_common_t *hdr, size_t hdrlen, + char *buf, size_t buflen) +{ + call_stub_t *setxattr_stub = NULL; + gf_fop_setxattr_req_t 
*req = NULL; + dict_t *dict = NULL; + server_state_t *state = NULL; + int32_t ret = -1; + size_t pathlen = 0; + size_t dict_len = 0; + char *req_dictbuf = NULL; + + req = gf_param (hdr); + state = CALL_STATE(frame); + { + dict_len = ntoh32 (req->dict_len); + + state->path = req->path + dict_len; + + pathlen = STRLEN_0(state->path); + state->ino = ntoh64 (req->ino); + + state->flags = ntoh32 (req->flags); + } + ret = server_loc_fill (&(state->loc), state, + state->ino, 0, NULL, state->path); + + { + /* Unserialize the dictionary */ + req_dictbuf = memdup (req->dict, dict_len); + GF_VALIDATE_OR_GOTO(bound_xl->name, req_dictbuf, fail); + + dict = dict_new (); + GF_VALIDATE_OR_GOTO(bound_xl->name, dict, fail); + + ret = dict_unserialize (req_dictbuf, dict_len, &dict); + if (ret < 0) { + gf_log (bound_xl->name, GF_LOG_ERROR, + "%"PRId64": %s (%"PRId64"): failed to " + "unserialize request buffer to dictionary", + frame->root->unique, state->loc.path, + state->ino); + free (req_dictbuf); + goto fail; + } else{ + dict->extra_free = req_dictbuf; + } + } + + setxattr_stub = fop_setxattr_stub (frame, + server_setxattr_resume, + &(state->loc), + dict, + state->flags); + GF_VALIDATE_OR_GOTO(bound_xl->name, setxattr_stub, fail); + + if (((state->loc.parent == NULL) && IS_NOT_ROOT(pathlen)) || + (state->loc.inode == NULL)) { + do_path_lookup (setxattr_stub, &(state->loc)); + } else { + call_resume (setxattr_stub); + } + + if (dict) + dict_unref (dict); + + return 0; +fail: + if (dict) + dict_unref (dict); + + server_setxattr_cbk (frame, NULL, frame->this, + -1, ENOENT); + return 0; + +} + + + +int32_t +server_fxattrop (call_frame_t *frame, + xlator_t *bound_xl, + gf_hdr_common_t *hdr, size_t hdrlen, + char *buf, size_t buflen) +{ + server_connection_t *conn = NULL; + gf_fop_fxattrop_req_t *req = NULL; + dict_t *dict = NULL; + server_state_t *state = NULL; + size_t dict_len = 0; + char *req_dictbuf = NULL; + int32_t ret = -1; + + conn = SERVER_CONNECTION(frame); + + req = 
gf_param (hdr); + state = CALL_STATE(frame); + { + state->fd_no = ntoh64 (req->fd); + if (state->fd_no >= 0) + state->fd = gf_fd_fdptr_get (conn->fdtable, + state->fd_no); + + dict_len = ntoh32 (req->dict_len); + state->ino = ntoh64 (req->ino); + state->flags = ntoh32 (req->flags); + } + + if (dict_len) { + /* Unserialize the dictionary */ + req_dictbuf = memdup (req->dict, dict_len); + GF_VALIDATE_OR_GOTO(bound_xl->name, req_dictbuf, fail); + + dict = dict_new (); + GF_VALIDATE_OR_GOTO(bound_xl->name, dict, fail); + + ret = dict_unserialize (req_dictbuf, dict_len, &dict); + if (ret < 0) { + gf_log (bound_xl->name, GF_LOG_ERROR, + "fd - %"PRId64" (%"PRId64"): failed to unserialize " + "request buffer to dictionary", + state->fd_no, state->fd->inode->ino); + free (req_dictbuf); + goto fail; + } else { + dict->extra_free = req_dictbuf; + } + } + + gf_log (bound_xl->name, GF_LOG_DEBUG, + "%"PRId64": FXATTROP \'fd=%"PRId64" (%"PRId64")\'", + frame->root->unique, state->fd_no, state->fd->inode->ino); + + STACK_WIND (frame, + server_fxattrop_cbk, + bound_xl, + bound_xl->fops->fxattrop, + state->fd, + state->flags, + dict); + if (dict) + dict_unref (dict); + return 0; +fail: + if (dict) + dict_unref (dict); + + server_fxattrop_cbk (frame, NULL, frame->this, + -1, EINVAL, NULL); + return 0; +} + +int32_t +server_xattrop_resume (call_frame_t *frame, + xlator_t *this, + loc_t *loc, + gf_xattrop_flags_t flags, + dict_t *dict) +{ + server_state_t *state = NULL; + + state = CALL_STATE(frame); + + gf_log (BOUND_XL(frame)->name, GF_LOG_DEBUG, + "%"PRId64": XATTROP \'%s (%"PRId64")\'", + frame->root->unique, state->path, state->ino); + + STACK_WIND (frame, + server_xattrop_cbk, + BOUND_XL(frame), + BOUND_XL(frame)->fops->xattrop, + loc, + flags, + dict); + return 0; +} + +int32_t +server_xattrop (call_frame_t *frame, + xlator_t *bound_xl, + gf_hdr_common_t *hdr, size_t hdrlen, + char *buf, size_t buflen) +{ + gf_fop_xattrop_req_t *req = NULL; + dict_t *dict = NULL; + 
server_state_t *state = NULL; + call_stub_t *xattrop_stub = NULL; + int32_t ret = -1; + size_t pathlen = 0; + size_t dict_len = 0; + char *req_dictbuf = NULL; + + req = gf_param (hdr); + state = CALL_STATE(frame); + { + dict_len = ntoh32 (req->dict_len); + state->ino = ntoh64 (req->ino); + state->path = req->path + dict_len; + pathlen = STRLEN_0(state->path); + state->flags = ntoh32 (req->flags); + } + + ret = server_loc_fill (&(state->loc), state, + state->ino, 0, NULL, state->path); + + if (dict_len) { + /* Unserialize the dictionary */ + req_dictbuf = memdup (req->dict, dict_len); + GF_VALIDATE_OR_GOTO(bound_xl->name, req_dictbuf, fail); + + dict = dict_new (); + GF_VALIDATE_OR_GOTO(bound_xl->name, dict, fail); + + ret = dict_unserialize (req_dictbuf, dict_len, &dict); + if (ret < 0) { + gf_log (bound_xl->name, GF_LOG_ERROR, + "%s (%"PRId64"): failed to unserialize " + "request buffer to dictionary", + state->loc.path, state->ino); + goto fail; + } else { + dict->extra_free = req_dictbuf; + } + } + xattrop_stub = fop_xattrop_stub (frame, + server_xattrop_resume, + &(state->loc), + state->flags, + dict); + GF_VALIDATE_OR_GOTO(bound_xl->name, xattrop_stub, fail); + + if (((state->loc.parent == NULL) && IS_NOT_ROOT(pathlen)) || + (state->loc.inode == NULL)) { + do_path_lookup (xattrop_stub, &(state->loc)); + } else { + call_resume (xattrop_stub); + } + + if (dict) + dict_unref (dict); + return 0; +fail: + if (dict) + dict_unref (dict); + + server_xattrop_cbk (frame, NULL, frame->this, + -1, EINVAL, + NULL); + return 0; +} + + +int32_t +server_getxattr_resume (call_frame_t *frame, + xlator_t *this, + loc_t *loc, + const char *name) +{ + server_state_t *state = NULL; + + state = CALL_STATE(frame); + + gf_log (BOUND_XL(frame)->name, GF_LOG_DEBUG, + "%"PRId64": GETXATTR \'%s (%"PRId64")\'", + frame->root->unique, state->path, state->ino); + + STACK_WIND (frame, + server_getxattr_cbk, + BOUND_XL(frame), + BOUND_XL(frame)->fops->getxattr, + loc, + name); + return 0; +} + 
+/* + * server_getxattr - getxattr function for server protocol + * @frame: call frame + * @bound_xl: + * @params: parameter dictionary + * + * not for external reference + */ +int32_t +server_getxattr (call_frame_t *frame, + xlator_t *bound_xl, + gf_hdr_common_t *hdr, size_t hdrlen, + char *buf, size_t buflen) +{ + gf_fop_getxattr_req_t *req = NULL; + call_stub_t *getxattr_stub = NULL; + server_state_t *state = NULL; + int32_t ret = -1; + size_t namelen = 0; + size_t pathlen = 0; + + req = gf_param (hdr); + state = CALL_STATE(frame); + { + pathlen = STRLEN_0(req->path); + + state->path = req->path; + state->ino = ntoh64 (req->ino); + + namelen = ntoh32 (req->namelen); + if (namelen) + state->name = (req->name + pathlen); + } + + ret = server_loc_fill (&(state->loc), state, + state->ino, 0, NULL, state->path); + + getxattr_stub = fop_getxattr_stub (frame, + server_getxattr_resume, + &(state->loc), + state->name); + + if (((state->loc.parent == NULL) && IS_NOT_ROOT(pathlen)) || + (state->loc.inode == NULL)) { + do_path_lookup (getxattr_stub, &(state->loc)); + } else { + call_resume (getxattr_stub); + } + + return 0; +} + + + +int32_t +server_removexattr_resume (call_frame_t *frame, + xlator_t *this, + loc_t *loc, + const char *name) +{ + server_state_t *state = NULL; + + state = CALL_STATE(frame); + + gf_log (BOUND_XL(frame)->name, GF_LOG_DEBUG, + "%"PRId64": REMOVEXATTR \'%s (%"PRId64")\'", + frame->root->unique, state->path, state->ino); + + STACK_WIND (frame, + server_removexattr_cbk, + BOUND_XL(frame), + BOUND_XL(frame)->fops->removexattr, + loc, + name); + return 0; +} + +/* + * server_removexattr - removexattr function for server protocol + * @frame: call frame + * @bound_xl: + * @params: parameter dictionary + * + * not for external reference + */ +int32_t +server_removexattr (call_frame_t *frame, + xlator_t *bound_xl, + gf_hdr_common_t *hdr, size_t hdrlen, + char *buf, size_t buflen) +{ + gf_fop_removexattr_req_t *req = NULL; + call_stub_t *removexattr_stub 
= NULL; + server_state_t *state = NULL; + int32_t ret = -1; + size_t pathlen = 0; + + req = gf_param (hdr); + state = CALL_STATE(frame); + { + pathlen = STRLEN_0(req->path); + + state->path = req->path; + state->ino = ntoh64 (req->ino); + + state->name = (req->name + pathlen); + } + + ret = server_loc_fill (&(state->loc), state, + state->ino, 0, NULL, state->path); + + removexattr_stub = fop_removexattr_stub (frame, + server_removexattr_resume, + &(state->loc), + state->name); + + if (((state->loc.parent == NULL) && IS_NOT_ROOT(pathlen)) || + (state->loc.inode == NULL)) { + do_path_lookup (removexattr_stub, &(state->loc)); + } else { + call_resume (removexattr_stub); + } + + return 0; +} + + +/* + * server_statfs - statfs function for server protocol + * @frame: call frame + * @bound_xl: + * @params: parameter dictionary + * + * not for external reference + */ +int32_t +server_statfs (call_frame_t *frame, + xlator_t *bound_xl, + gf_hdr_common_t *hdr, size_t hdrlen, + char *buf, size_t buflen) +{ + gf_fop_statfs_req_t *req = NULL; + server_state_t *state = NULL; + int32_t ret = -1; + + req = gf_param (hdr); + + state = CALL_STATE(frame); + state->ino = ntoh64 (req->ino); + state->path = req->path; + + ret = server_loc_fill (&state->loc, state, + state->ino, 0, NULL, state->path); + + gf_log (BOUND_XL(frame)->name, GF_LOG_DEBUG, + "%"PRId64": STATFS \'%s (%"PRId64")\'", + frame->root->unique, state->path, state->ino); + + STACK_WIND (frame, + server_statfs_cbk, + BOUND_XL(frame), + BOUND_XL(frame)->fops->statfs, + &(state->loc)); + + return 0; +} + + + +int32_t +server_opendir_resume (call_frame_t *frame, + xlator_t *this, + loc_t *loc, + fd_t *fd) +{ + server_state_t *state = CALL_STATE(frame); + fd_t *new_fd = NULL; + + new_fd = fd_create (loc->inode, frame->root->pid); + state->fd = fd_ref (new_fd); + + gf_log (BOUND_XL(frame)->name, GF_LOG_DEBUG, + "%"PRId64": OPENDIR \'%s (%"PRId64")\'", + frame->root->unique, state->path, state->ino); + + STACK_WIND (frame, + 
server_opendir_cbk, + BOUND_XL(frame), + BOUND_XL(frame)->fops->opendir, + loc, + state->fd); + return 0; +} + + +/* + * server_opendir - opendir function for server protocol + * @frame: call frame + * @bound_xl: + * @params: parameter dictionary + * + * not for external reference + */ +int32_t +server_opendir (call_frame_t *frame, xlator_t *bound_xl, + gf_hdr_common_t *hdr, size_t hdrlen, + char *buf, size_t buflen) +{ + call_stub_t *opendir_stub = NULL; + gf_fop_opendir_req_t *req = NULL; + server_state_t *state = NULL; + int32_t ret = -1; + size_t pathlen = 0; + + req = gf_param (hdr); + state = CALL_STATE(frame); + { + state->path = req->path; + pathlen = STRLEN_0(state->path); + state->ino = ntoh64 (req->ino); + } + + ret = server_loc_fill (&state->loc, state, + state->ino, 0, NULL, state->path); + + opendir_stub = fop_opendir_stub (frame, + server_opendir_resume, + &(state->loc), + NULL); + + if (((state->loc.parent == NULL) && IS_NOT_ROOT(pathlen)) || + (state->loc.inode == NULL)) { + do_path_lookup (opendir_stub, &(state->loc)); + } else { + call_resume (opendir_stub); + } + + return 0; +} + + +/* + * server_releasedir - releasedir function for server protocol + * @frame: call frame + * @bound_xl: + * @params: parameter dictionary + * + * not for external reference + */ +int32_t +server_releasedir (call_frame_t *frame, xlator_t *bound_xl, + gf_hdr_common_t *hdr, size_t hdrlen, + char *buf, size_t buflen) +{ + gf_cbk_releasedir_req_t *req = NULL; + server_state_t *state = NULL; + server_connection_t *conn = NULL; + + conn = SERVER_CONNECTION(frame); + + req = gf_param (hdr); + state = CALL_STATE(frame); + + state->fd_no = ntoh64 (req->fd); + state->fd = gf_fd_fdptr_get (conn->fdtable, + state->fd_no); + + if (state->fd == NULL) { + gf_log (frame->this->name, GF_LOG_ERROR, + "fd - %"PRId64": unresolved fd", + state->fd_no); + + server_releasedir_cbk (frame, NULL, frame->this, + -1, EINVAL); + goto out; + } + + gf_log (bound_xl->name, GF_LOG_DEBUG, + 
"%"PRId64": RELEASEDIR \'fd=%"PRId64" (%"PRId64")\'", + frame->root->unique, state->fd_no, state->fd->inode->ino); + + gf_fd_put (conn->fdtable, state->fd_no); + + server_releasedir_cbk (frame, NULL, frame->this, + 0, 0); +out: + return 0; +} + + +/* + * server_readdir - readdir function for server protocol + * @frame: call frame + * @bound_xl: + * @params: parameter dictionary + * + * not for external reference + */ +int32_t +server_getdents (call_frame_t *frame, + xlator_t *bound_xl, + gf_hdr_common_t *hdr, size_t hdrlen, + char *buf, size_t buflen) +{ + gf_fop_getdents_req_t *req = NULL; + server_state_t *state = NULL; + server_connection_t *conn = NULL; + + conn = SERVER_CONNECTION(frame); + + req = gf_param (hdr); + state = CALL_STATE(frame); + { + state->fd_no = ntoh64 (req->fd); + if (state->fd_no >= 0) + state->fd = gf_fd_fdptr_get (conn->fdtable, + state->fd_no); + + state->size = ntoh32 (req->size); + state->offset = ntoh64 (req->offset); + state->flags = ntoh32 (req->flags); + } + + + if (state->fd == NULL) { + gf_log (frame->this->name, GF_LOG_ERROR, + "fd - %"PRId64": unresolved fd", + state->fd_no); + + server_getdents_cbk (frame, NULL, frame->this, + -1, EINVAL, NULL, 0); + + goto out; + } + + gf_log (bound_xl->name, GF_LOG_DEBUG, + "%"PRId64": GETDENTS \'fd=%"PRId64" (%"PRId64"); " + "offset=%"PRId64"; size=%"PRId64, + frame->root->unique, state->fd_no, state->fd->inode->ino, + state->offset, (int64_t)state->size); + + STACK_WIND (frame, + server_getdents_cbk, + bound_xl, + bound_xl->fops->getdents, + state->fd, + state->size, + state->offset, + state->flags); +out: + return 0; +} + + +/* + * server_readdir - readdir function for server protocol + * @frame: call frame + * @bound_xl: + * @params: parameter dictionary + * + * not for external reference + */ +int32_t +server_readdir (call_frame_t *frame, xlator_t *bound_xl, + gf_hdr_common_t *hdr, size_t hdrlen, + char *buf, size_t buflen) +{ + gf_fop_readdir_req_t *req = NULL; + server_state_t *state 
= NULL; + server_connection_t *conn = NULL; + + conn = SERVER_CONNECTION(frame); + + req = gf_param (hdr); + state = CALL_STATE(frame); + { + state->fd_no = ntoh64 (req->fd); + if (state->fd_no >= 0) + state->fd = gf_fd_fdptr_get (conn->fdtable, + state->fd_no); + + state->size = ntoh32 (req->size); + state->offset = ntoh64 (req->offset); + } + + + if (state->fd == NULL) { + gf_log (frame->this->name, GF_LOG_ERROR, + "fd - %"PRId64": unresolved fd", + state->fd_no); + + server_readdir_cbk (frame, NULL, frame->this, + -1, EINVAL, NULL); + + goto out; + } + + gf_log (bound_xl->name, GF_LOG_DEBUG, + "%"PRId64": READDIR \'fd=%"PRId64" (%"PRId64"); " + "offset=%"PRId64"; size=%"PRId64, + frame->root->unique, state->fd_no, state->fd->inode->ino, + state->offset, (int64_t)state->size); + + STACK_WIND (frame, + server_readdir_cbk, + bound_xl, + bound_xl->fops->readdir, + state->fd, state->size, state->offset); +out: + return 0; +} + + + +/* + * server_fsyncdir - fsyncdir function for server protocol + * @frame: call frame + * @bound_xl: + * @params: parameter dictionary + * + * not for external reference + */ +int32_t +server_fsyncdir (call_frame_t *frame, + xlator_t *bound_xl, + gf_hdr_common_t *hdr, size_t hdrlen, + char *buf, size_t buflen) +{ + gf_fop_fsyncdir_req_t *req = NULL; + server_state_t *state = NULL; + server_connection_t *conn = NULL; + + conn = SERVER_CONNECTION(frame); + + req = gf_param (hdr); + state = CALL_STATE(frame); + { + state->fd_no = ntoh64 (req->fd); + if (state->fd_no >= 0) + state->fd = gf_fd_fdptr_get (conn->fdtable, + state->fd_no); + + state->flags = ntoh32 (req->data); + } + + if (state->fd == NULL) { + gf_log (frame->this->name, GF_LOG_ERROR, + "fd - %"PRId64": unresolved fd", + state->fd_no); + + server_fsyncdir_cbk (frame, NULL, frame->this, + -1, EINVAL); + goto out; + } + + gf_log (bound_xl->name, GF_LOG_DEBUG, + "%"PRId64": FSYNCDIR \'fd=%"PRId64" (%"PRId64")\'", + frame->root->unique, state->fd_no, state->fd->inode->ino); + + 
STACK_WIND (frame, + server_fsyncdir_cbk, + bound_xl, + bound_xl->fops->fsyncdir, + state->fd, state->flags); +out: + return 0; +} + + +int32_t +server_mknod_resume (call_frame_t *frame, + xlator_t *this, + loc_t *loc, + mode_t mode, + dev_t dev) +{ + server_state_t *state = NULL; + + state = CALL_STATE(frame); + + if (state->loc.parent == NULL) + state->loc.parent = inode_ref (loc->parent); + + state->loc.inode = inode_new (state->itable); + + gf_log (BOUND_XL(frame)->name, GF_LOG_DEBUG, + "%"PRId64": MKNOD \'%"PRId64"/%s\'", + frame->root->unique, state->par, state->bname); + + STACK_WIND (frame, + server_mknod_cbk, + BOUND_XL(frame), + BOUND_XL(frame)->fops->mknod, + &(state->loc), mode, dev); + + return 0; +} +/* + * server_mknod - mknod function for server protocol + * @frame: call frame + * @bound_xl: + * @params: parameter dictionary + * + * not for external reference + */ +int32_t +server_mknod (call_frame_t *frame, + xlator_t *bound_xl, + gf_hdr_common_t *hdr, size_t hdrlen, + char *buf, size_t buflen) +{ + gf_fop_mknod_req_t *req = NULL; + server_state_t *state = NULL; + call_stub_t *mknod_stub = NULL; + int32_t ret = -1; + size_t pathlen = 0; + + req = gf_param (hdr); + state = CALL_STATE(frame); + { + pathlen = STRLEN_0(req->path); + + state->par = ntoh64 (req->par); + state->path = req->path; + if (IS_NOT_ROOT(pathlen)) + state->bname = req->bname + pathlen; + + state->mode = ntoh32 (req->mode); + state->dev = ntoh64 (req->dev); + } + ret = server_loc_fill (&(state->loc), state, + 0, state->par, state->bname, + state->path); + + mknod_stub = fop_mknod_stub (frame, server_mknod_resume, + &(state->loc), state->mode, state->dev); + + if (state->loc.parent == NULL) { + do_path_lookup (mknod_stub, &(state->loc)); + } else { + call_resume (mknod_stub); + } + + return 0; +} + +int32_t +server_mkdir_resume (call_frame_t *frame, + xlator_t *this, + loc_t *loc, + mode_t mode) + +{ + server_state_t *state = NULL; + + state = CALL_STATE(frame); + + if 
(state->loc.parent == NULL) + state->loc.parent = inode_ref (loc->parent); + + state->loc.inode = inode_new (state->itable); + + gf_log (BOUND_XL(frame)->name, GF_LOG_DEBUG, + "%"PRId64": MKDIR \'%"PRId64"/%s\'", + frame->root->unique, state->par, state->bname); + + STACK_WIND (frame, + server_mkdir_cbk, + BOUND_XL(frame), + BOUND_XL(frame)->fops->mkdir, + &(state->loc), + state->mode); + + return 0; +} + +/* + * server_mkdir - mkdir function for server protocol + * @frame: call frame + * @bound_xl: + * @params: + * + * not for external reference + */ +int32_t +server_mkdir (call_frame_t *frame, + xlator_t *bound_xl, + gf_hdr_common_t *hdr, size_t hdrlen, + char *buf, size_t buflen) +{ + gf_fop_mkdir_req_t *req = NULL; + server_state_t *state = NULL; + call_stub_t *mkdir_stub = NULL; + int32_t ret = -1; + size_t pathlen = 0; + + req = gf_param (hdr); + state = CALL_STATE(frame); + { + pathlen = STRLEN_0(req->path); + state->mode = ntoh32 (req->mode); + + state->path = req->path; + state->bname = req->bname + pathlen; + state->par = ntoh64 (req->par); + } + + + ret = server_loc_fill (&(state->loc), state, + 0, state->par, state->bname, + state->path); + + mkdir_stub = fop_mkdir_stub (frame, server_mkdir_resume, + &(state->loc), state->mode); + + if (state->loc.parent == NULL) { + do_path_lookup (mkdir_stub, &(state->loc)); + } else { + call_resume (mkdir_stub); + } + + return 0; +} + + +int32_t +server_rmdir_resume (call_frame_t *frame, + xlator_t *this, + loc_t *loc) +{ + server_state_t *state = NULL; + + state = CALL_STATE(frame); + + if (state->loc.parent == NULL) + state->loc.parent = inode_ref (loc->parent); + + if (state->loc.inode == NULL) + state->loc.inode = inode_ref (loc->inode); + + gf_log (BOUND_XL(frame)->name, GF_LOG_DEBUG, + "%"PRId64": RMDIR \'%"PRId64"/%s\'", + frame->root->unique, state->par, state->bname); + + STACK_WIND (frame, + server_rmdir_cbk, + BOUND_XL(frame), + BOUND_XL(frame)->fops->rmdir, + loc); + return 0; +} + +/* + * server_rmdir - 
rmdir function for server protocol + * @frame: call frame + * @bound_xl: + * @params: + * + * not for external reference + */ +int32_t +server_rmdir (call_frame_t *frame, + xlator_t *bound_xl, + gf_hdr_common_t *hdr, size_t hdrlen, + char *buf, size_t buflen) +{ + call_stub_t *rmdir_stub = NULL; + gf_fop_rmdir_req_t *req = NULL; + server_state_t *state = NULL; + int32_t ret = -1; + size_t pathlen = 0; + + req = gf_param (hdr); + state = CALL_STATE(frame); + { + pathlen = STRLEN_0(req->path); + state->path = req->path; + state->par = ntoh64 (req->par); + state->bname = req->bname + pathlen; + } + + + ret = server_loc_fill (&(state->loc), state, + state->ino, state->par, state->bname, + state->path); + + rmdir_stub = fop_rmdir_stub (frame, + server_rmdir_resume, + &(state->loc)); + + if ((state->loc.parent == NULL) || + (state->loc.inode == NULL)) { + do_path_lookup (rmdir_stub, &(state->loc)); + } else { + call_resume (rmdir_stub); + } + + return 0; +} + + + +int32_t +server_chown_resume (call_frame_t *frame, + xlator_t *this, + loc_t *loc, + uid_t uid, + gid_t gid) +{ + server_state_t *state = NULL; + + state = CALL_STATE(frame); + + gf_log (BOUND_XL(frame)->name, GF_LOG_DEBUG, + "%"PRId64": CHOWN \'%s (%"PRId64")\'", + frame->root->unique, state->path, state->ino); + + STACK_WIND (frame, server_chown_cbk, + BOUND_XL(frame), + BOUND_XL(frame)->fops->chown, + loc, uid, gid); + return 0; +} + + +/* + * server_chown - chown function for server protocol + * @frame: call frame + * @bound_xl: + * @params: parameter dictionary + * + * not for external reference + */ +int32_t +server_chown (call_frame_t *frame, + xlator_t *bound_xl, + gf_hdr_common_t *hdr, size_t hdrlen, + char *buf, size_t buflen) +{ + call_stub_t *chown_stub = NULL; + gf_fop_chown_req_t *req = NULL; + server_state_t *state = NULL; + int32_t ret = -1; + size_t pathlen = 0; + + req = gf_param (hdr); + state = CALL_STATE(frame); + { + state->ino = ntoh64 (req->ino); + state->path = req->path; + pathlen = 
STRLEN_0(state->path); + state->uid = ntoh32 (req->uid); + state->gid = ntoh32 (req->gid); + } + + + ret = server_loc_fill (&(state->loc), state, + state->ino, 0, NULL, state->path); + + chown_stub = fop_chown_stub (frame, + server_chown_resume, + &(state->loc), + state->uid, + state->gid); + + if (((state->loc.parent == NULL) && IS_NOT_ROOT(pathlen)) || + (state->loc.inode == NULL)) { + do_path_lookup (chown_stub, &(state->loc)); + } else { + call_resume (chown_stub); + } + + return 0; +} + + +int32_t +server_chmod_resume (call_frame_t *frame, + xlator_t *this, + loc_t *loc, + mode_t mode) +{ + server_state_t *state = NULL; + + state = CALL_STATE(frame); + + gf_log (BOUND_XL(frame)->name, GF_LOG_DEBUG, + "%"PRId64": CHMOD \'%s (%"PRId64")\'", + frame->root->unique, state->path, state->ino); + + STACK_WIND (frame, + server_chmod_cbk, + BOUND_XL(frame), + BOUND_XL(frame)->fops->chmod, + loc, + mode); + return 0; + +} + +/* + * server_chmod - chmod function for server protocol + * @frame: call frame + * @bound_xl: + * @params: parameter dictionary + * + * not for external reference + */ +int32_t +server_chmod (call_frame_t *frame, + xlator_t *bound_xl, + gf_hdr_common_t *hdr, size_t hdrlen, + char *buf, size_t buflen) +{ + call_stub_t *chmod_stub = NULL; + gf_fop_chmod_req_t *req = NULL; + server_state_t *state = NULL; + int32_t ret = -1; + size_t pathlen = 0; + + req = gf_param (hdr); + + state = CALL_STATE(frame); + { + state->ino = ntoh64 (req->ino); + state->path = req->path; + pathlen = STRLEN_0(state->path); + + state->mode = ntoh32 (req->mode); + } + + ret = server_loc_fill (&(state->loc), state, + state->ino, 0, NULL, state->path); + + chmod_stub = fop_chmod_stub (frame, + server_chmod_resume, + &(state->loc), + state->mode); + + if (((state->loc.parent == NULL) && IS_NOT_ROOT(pathlen)) || + (state->loc.inode == NULL)) { + do_path_lookup (chmod_stub, &(state->loc)); + } else { + call_resume (chmod_stub); + } + + return 0; +} + + +int32_t 
+server_utimens_resume (call_frame_t *frame, + xlator_t *this, + loc_t *loc, + struct timespec *tv) +{ + server_state_t *state = NULL; + + state = CALL_STATE(frame); + + gf_log (BOUND_XL(frame)->name, GF_LOG_DEBUG, + "%"PRId64": UTIMENS \'%s (%"PRId64")\'", + frame->root->unique, state->path, state->ino); + + STACK_WIND (frame, + server_utimens_cbk, + BOUND_XL(frame), + BOUND_XL(frame)->fops->utimens, + loc, + tv); + return 0; +} + +/* + * server_utimens - utimens function for server protocol + * @frame: call frame + * @bound_xl: + * @params: parameter dictionary + * + * not for external reference + */ +int32_t +server_utimens (call_frame_t *frame, + xlator_t *bound_xl, + gf_hdr_common_t *hdr, size_t hdrlen, + char *buf, size_t buflen) +{ + call_stub_t *utimens_stub = NULL; + gf_fop_utimens_req_t *req = NULL; + server_state_t *state = NULL; + int32_t ret = -1; + size_t pathlen = 0; + + req = gf_param (hdr); + state = CALL_STATE(frame); + { + state->ino = ntoh64 (req->ino); + state->path = req->path; + pathlen = STRLEN_0(state->path); + + gf_timespec_to_timespec (req->tv, state->tv); + } + + + ret = server_loc_fill (&(state->loc), state, + state->ino, 0, NULL, state->path); + + utimens_stub = fop_utimens_stub (frame, + server_utimens_resume, + &(state->loc), + state->tv); + + if (((state->loc.parent == NULL) && IS_NOT_ROOT(pathlen)) || + (state->loc.inode == NULL)) { + do_path_lookup (utimens_stub, &(state->loc)); + } else { + call_resume (utimens_stub); + } + + return 0; +} + + + +int32_t +server_inodelk_resume (call_frame_t *frame, + xlator_t *this, + loc_t *loc, int32_t cmd, + struct flock *flock) +{ + server_state_t *state = NULL; + + state = CALL_STATE(frame); + if (state->loc.inode == NULL) { + state->loc.inode = inode_ref (loc->inode); + } + + if (state->loc.parent == NULL) { + state->loc.parent = inode_ref (loc->parent); + } + + gf_log (BOUND_XL(frame)->name, GF_LOG_DEBUG, + "%"PRId64": INODELK \'%s (%"PRId64")\'", + frame->root->unique, state->path, 
state->ino); + + STACK_WIND (frame, + server_inodelk_cbk, + BOUND_XL(frame), + BOUND_XL(frame)->fops->inodelk, + loc, cmd, flock); + return 0; + +} + + +int32_t +server_inodelk (call_frame_t *frame, + xlator_t *bound_xl, + gf_hdr_common_t *hdr, size_t hdrlen, + char *buf, size_t buflen) +{ + call_stub_t *inodelk_stub = NULL; + gf_fop_inodelk_req_t *req = NULL; + server_state_t *state = NULL; + size_t pathlen = 0; + + req = gf_param (hdr); + state = CALL_STATE(frame); + { + state->cmd = ntoh32 (req->cmd); + switch (state->cmd) { + case GF_LK_GETLK: + state->cmd = F_GETLK; + break; + case GF_LK_SETLK: + state->cmd = F_SETLK; + break; + case GF_LK_SETLKW: + state->cmd = F_SETLKW; + break; + } + + state->type = ntoh32 (req->type); + + pathlen = STRLEN_0(req->path); + + state->path = req->path; + state->ino = ntoh64 (req->ino); + + gf_flock_to_flock (&req->flock, &state->flock); + + switch (state->type) { + case GF_LK_F_RDLCK: + state->flock.l_type = F_RDLCK; + break; + case GF_LK_F_WRLCK: + state->flock.l_type = F_WRLCK; + break; + case GF_LK_F_UNLCK: + state->flock.l_type = F_UNLCK; + break; + } + + } + + server_loc_fill (&(state->loc), state, + state->ino, 0, NULL, state->path); + + inodelk_stub = fop_inodelk_stub (frame, + server_inodelk_resume, + &state->loc, state->cmd, &state->flock); + + if ((state->loc.parent == NULL) || + (state->loc.inode == NULL)) { + do_path_lookup (inodelk_stub, &(state->loc)); + } else { + call_resume (inodelk_stub); + } + + return 0; +} + + +int32_t +server_finodelk (call_frame_t *frame, + xlator_t *bound_xl, + gf_hdr_common_t *hdr, size_t hdrlen, + char *buf, size_t buflen) +{ + gf_fop_finodelk_req_t *req = NULL; + server_state_t *state = NULL; + server_connection_t *conn = NULL; + + conn = SERVER_CONNECTION(frame); + + req = gf_param (hdr); + state = CALL_STATE(frame); + { + state->fd_no = ntoh64 (req->fd); + if (state->fd_no >= 0) + state->fd = gf_fd_fdptr_get (conn->fdtable, + state->fd_no); + + state->cmd = ntoh32 (req->cmd); + 
switch (state->cmd) { + case GF_LK_GETLK: + state->cmd = F_GETLK; + break; + case GF_LK_SETLK: + state->cmd = F_SETLK; + break; + case GF_LK_SETLKW: + state->cmd = F_SETLKW; + break; + } + + state->type = ntoh32 (req->type); + + gf_flock_to_flock (&req->flock, &state->flock); + + switch (state->type) { + case GF_LK_F_RDLCK: + state->flock.l_type = F_RDLCK; + break; + case GF_LK_F_WRLCK: + state->flock.l_type = F_WRLCK; + break; + case GF_LK_F_UNLCK: + state->flock.l_type = F_UNLCK; + break; + } + + } + + if (state->fd == NULL) { + gf_log (frame->this->name, GF_LOG_ERROR, + "fd - %"PRId64": unresolved fd", + state->fd_no); + + server_finodelk_cbk (frame, NULL, frame->this, + -1, EINVAL); + return -1; + } + + gf_log (BOUND_XL(frame)->name, GF_LOG_DEBUG, + "%"PRId64": FINODELK \'fd=%"PRId64" (%"PRId64")\'", + frame->root->unique, state->fd_no, state->fd->inode->ino); + + STACK_WIND (frame, server_finodelk_cbk, + BOUND_XL(frame), + BOUND_XL(frame)->fops->finodelk, + state->fd, state->cmd, &state->flock); + return 0; +} + + +int32_t +server_entrylk_resume (call_frame_t *frame, + xlator_t *this, + loc_t *loc, const char *name, + entrylk_cmd cmd, entrylk_type type) +{ + server_state_t *state = NULL; + + state = CALL_STATE(frame); + + if (state->loc.inode == NULL) + state->loc.inode = inode_ref (loc->inode); + + if ((state->loc.parent == NULL) && + (loc->parent)) + state->loc.parent = inode_ref (loc->parent); + + gf_log (BOUND_XL(frame)->name, GF_LOG_DEBUG, + "%"PRId64": ENTRYLK \'%s (%"PRId64") \'", + frame->root->unique, state->path, state->ino); + + STACK_WIND (frame, + server_entrylk_cbk, + BOUND_XL(frame), + BOUND_XL(frame)->fops->entrylk, + loc, name, cmd, type); + return 0; + +} + +/* + * server_entrylk - entrylk function for server protocol + * @frame: call frame + * @bound_xl: + * @params: parameter dictionary + * + * not for external reference + */ +int32_t +server_entrylk (call_frame_t *frame, + xlator_t *bound_xl, + gf_hdr_common_t *hdr, size_t hdrlen, + char 
*buf, size_t buflen) +{ + gf_fop_entrylk_req_t *req = NULL; + server_state_t *state = NULL; + call_stub_t *entrylk_stub = NULL; + size_t pathlen = 0; + size_t namelen = 0; + + req = gf_param (hdr); + state = CALL_STATE(frame); + { + pathlen = STRLEN_0(req->path); + + state->path = req->path; + state->ino = ntoh64 (req->ino); + namelen = ntoh64 (req->namelen); + if (namelen) + state->name = req->name + pathlen; + + state->cmd = ntoh32 (req->cmd); + state->type = ntoh32 (req->type); + } + + + server_loc_fill (&(state->loc), state, + state->ino, 0, NULL, state->path); + + entrylk_stub = fop_entrylk_stub (frame, + server_entrylk_resume, + &state->loc, state->name, state->cmd, + state->type); + + if (((state->loc.parent == NULL) && IS_NOT_ROOT(pathlen)) || + (state->loc.inode == NULL)) { + do_path_lookup (entrylk_stub, &(state->loc)); + } else { + call_resume (entrylk_stub); + } + + return 0; +} + + +int32_t +server_fentrylk (call_frame_t *frame, + xlator_t *bound_xl, + gf_hdr_common_t *hdr, size_t hdrlen, + char *buf, size_t buflen) +{ + gf_fop_fentrylk_req_t *req = NULL; + server_state_t *state = NULL; + size_t namelen = 0; + server_connection_t *conn = NULL; + + conn = SERVER_CONNECTION(frame); + + req = gf_param (hdr); + state = CALL_STATE(frame); + { + state->fd_no = ntoh64 (req->fd); + if (state->fd_no >= 0) + state->fd = gf_fd_fdptr_get (conn->fdtable, + state->fd_no); + + state->cmd = ntoh32 (req->cmd); + state->type = ntoh32 (req->type); + namelen = ntoh64 (req->namelen); + + if (namelen) + state->name = req->name; + } + + if (state->fd == NULL) { + gf_log (frame->this->name, GF_LOG_ERROR, + "fd - %"PRId64": unresolved fd", + state->fd_no); + + server_fentrylk_cbk (frame, NULL, frame->this, + -1, EINVAL); + return -1; + } + + gf_log (BOUND_XL(frame)->name, GF_LOG_DEBUG, + "%"PRId64": FENTRYLK \'fd=%"PRId64" (%"PRId64")\'", + frame->root->unique, state->fd_no, state->fd->inode->ino); + + STACK_WIND (frame, server_fentrylk_cbk, + BOUND_XL(frame), + 
BOUND_XL(frame)->fops->fentrylk, + state->fd, state->name, state->cmd, state->type); + return 0; +} + + +int32_t +server_access_resume (call_frame_t *frame, + xlator_t *this, + loc_t *loc, + int32_t mask) +{ + server_state_t *state = NULL; + + state = CALL_STATE(frame); + + gf_log (BOUND_XL(frame)->name, GF_LOG_DEBUG, + "%"PRId64": ACCESS \'%s (%"PRId64")\'", + frame->root->unique, state->path, state->ino); + + STACK_WIND (frame, + server_access_cbk, + BOUND_XL(frame), + BOUND_XL(frame)->fops->access, + loc, + mask); + return 0; +} + +/* + * server_access - access function for server protocol + * @frame: call frame + * @bound_xl: + * @params: parameter dictionary + * + * not for external reference + */ +int32_t +server_access (call_frame_t *frame, + xlator_t *bound_xl, + gf_hdr_common_t *hdr, size_t hdrlen, + char *buf, size_t buflen) +{ + call_stub_t *access_stub = NULL; + gf_fop_access_req_t *req = NULL; + server_state_t *state = NULL; + int32_t ret = -1; + size_t pathlen = 0; + + req = gf_param (hdr); + state = CALL_STATE(frame); + + state->mask = ntoh32 (req->mask); + + state->ino = ntoh64 (req->ino); + state->path = req->path; + pathlen = STRLEN_0(state->path); + + ret = server_loc_fill (&(state->loc), state, + state->ino, 0, NULL, state->path); + + access_stub = fop_access_stub (frame, + server_access_resume, + &(state->loc), + state->mask); + + if (((state->loc.parent == NULL) && IS_NOT_ROOT(pathlen)) || + (state->loc.inode == NULL)) { + do_path_lookup (access_stub, &(state->loc)); + } else { + call_resume (access_stub); + } + + return 0; +} + + +int32_t +server_symlink_resume (call_frame_t *frame, + xlator_t *this, + const char *linkname, + loc_t *loc) +{ + server_state_t *state = NULL; + + state = CALL_STATE(frame); + if (state->loc.parent == NULL) + state->loc.parent = inode_ref (loc->parent); + + state->loc.inode = inode_new (BOUND_XL(frame)->itable); + + gf_log (BOUND_XL(frame)->name, GF_LOG_DEBUG, + "%"PRId64": SYMLINK \'%"PRId64"/%s \'", + 
frame->root->unique, state->par, state->bname); + + STACK_WIND (frame, + server_symlink_cbk, + BOUND_XL(frame), + BOUND_XL(frame)->fops->symlink, + linkname, + &(state->loc)); + + return 0; +} + +/* + * server_symlink- symlink function for server protocol + * @frame: call frame + * @bound_xl: + * @params: parameter dictionary + * + * not for external reference + */ + +int32_t +server_symlink (call_frame_t *frame, + xlator_t *bound_xl, + gf_hdr_common_t *hdr, size_t hdrlen, + char *buf, size_t buflen) +{ + server_state_t *state = NULL; + gf_fop_symlink_req_t *req = NULL; + call_stub_t *symlink_stub = NULL; + int32_t ret = -1; + size_t pathlen = 0; + size_t baselen = 0; + + req = gf_param (hdr); + state = CALL_STATE(frame); + { + pathlen = STRLEN_0(req->path); + baselen = STRLEN_0(req->bname + pathlen); + + state->par = ntoh64 (req->par); + state->path = req->path; + state->bname = req->bname + pathlen; + + state->name = (req->linkname + pathlen + baselen); + } + + ret = server_loc_fill (&(state->loc), state, + 0, state->par, state->bname, + state->path); + + symlink_stub = fop_symlink_stub (frame, server_symlink_resume, + state->name, &(state->loc)); + + if (state->loc.parent == NULL) { + do_path_lookup (symlink_stub, &(state->loc)); + } else { + call_resume (symlink_stub); + } + + return 0; +} + +int32_t +server_link_resume (call_frame_t *frame, + xlator_t *this, + loc_t *oldloc, + loc_t *newloc) +{ + server_state_t *state = NULL; + + state = CALL_STATE(frame); + + if (state->loc.parent == NULL) + state->loc.parent = inode_ref (oldloc->parent); + + if (state->loc.inode == NULL) { + state->loc.inode = inode_ref (oldloc->inode); + } else if (state->loc.inode != oldloc->inode) { + if (state->loc.inode) + inode_unref (state->loc.inode); + state->loc.inode = inode_ref (oldloc->inode); + } + + if (state->loc2.parent == NULL) + state->loc2.parent = inode_ref (newloc->parent); + + state->loc2.inode = inode_ref (state->loc.inode); + + gf_log (BOUND_XL(frame)->name, 
GF_LOG_DEBUG, + "%"PRId64": LINK \'%"PRId64"/%s ==> %s (%"PRId64")\'", + frame->root->unique, state->par2, state->bname2, + state->path, state->ino); + + STACK_WIND (frame, + server_link_cbk, + BOUND_XL(frame), + BOUND_XL(frame)->fops->link, + &(state->loc), + &(state->loc2)); + return 0; +} + +/* + * server_link - link function for server protocol + * @frame: call frame + * @bound_xl: + * @params: + * + * not for external reference + */ +int32_t +server_link (call_frame_t *frame, + xlator_t *this, + gf_hdr_common_t *hdr, size_t hdrlen, + char *buf, size_t buflen) +{ + gf_fop_link_req_t *req = NULL; + server_state_t *state = NULL; + call_stub_t *link_stub = NULL; + int32_t ret = -1; + size_t oldpathlen = 0; + size_t newpathlen = 0; + size_t newbaselen = 0; + + req = gf_param (hdr); + + state = CALL_STATE(frame); + { + oldpathlen = STRLEN_0(req->oldpath); + newpathlen = STRLEN_0(req->newpath + oldpathlen); + newbaselen = STRLEN_0(req->newbname + oldpathlen + newpathlen); + + state->path = req->oldpath; + state->path2 = req->newpath + oldpathlen; + state->bname2 = req->newbname + oldpathlen + newpathlen; + state->ino = ntoh64 (req->oldino); + state->par2 = ntoh64 (req->newpar); + } + + ret = server_loc_fill (&(state->loc), state, + state->ino, 0, NULL, + state->path); + ret = server_loc_fill (&(state->loc2), state, + 0, state->par2, state->bname2, + state->path2); + + link_stub = fop_link_stub (frame, server_link_resume, + &(state->loc), &(state->loc2)); + + if ((state->loc.parent == NULL) || + (state->loc.inode == NULL)) { + do_path_lookup (link_stub, &(state->loc)); + } else if (state->loc2.parent == NULL) { + do_path_lookup (link_stub, &(state->loc2)); + } else { + call_resume (link_stub); + } + + return 0; +} + + +int32_t +server_rename_resume (call_frame_t *frame, + xlator_t *this, + loc_t *oldloc, + loc_t *newloc) +{ + server_state_t *state = NULL; + + state = CALL_STATE(frame); + + if (state->loc.parent == NULL) + state->loc.parent = inode_ref 
(oldloc->parent); + + if (state->loc.inode == NULL) { + state->loc.inode = inode_ref (oldloc->inode); + } + + if (state->loc2.parent == NULL) + state->loc2.parent = inode_ref (newloc->parent); + + + gf_log (BOUND_XL(frame)->name, GF_LOG_DEBUG, + "%"PRId64": RENAME %s (%"PRId64"/%s) ==> %s (%"PRId64"/%s)", + frame->root->unique, state->path, state->par, state->bname, + state->path2, state->par2, state->bname2); + + STACK_WIND (frame, + server_rename_cbk, + BOUND_XL(frame), + BOUND_XL(frame)->fops->rename, + &(state->loc), + &(state->loc2)); + return 0; +} + +/* + * server_rename - rename function for server protocol + * @frame: call frame + * @bound_xl: + * @params: parameter dictionary + * + * not for external reference + */ +int32_t +server_rename (call_frame_t *frame, + xlator_t *bound_xl, + gf_hdr_common_t *hdr, size_t hdrlen, + char *buf, size_t buflen) +{ + gf_fop_rename_req_t *req = NULL; + server_state_t *state = NULL; + call_stub_t *rename_stub = NULL; + int32_t ret = -1; + size_t oldpathlen = 0; + size_t oldbaselen = 0; + size_t newpathlen = 0; + size_t newbaselen = 0; + + req = gf_param (hdr); + + state = CALL_STATE(frame); + { + oldpathlen = STRLEN_0(req->oldpath); + oldbaselen = STRLEN_0(req->oldbname + oldpathlen); + newpathlen = STRLEN_0(req->newpath + oldpathlen + oldbaselen); + newbaselen = STRLEN_0(req->newbname + oldpathlen + + oldbaselen + newpathlen); + + state->path = req->oldpath; + state->bname = req->oldbname + oldpathlen; + state->path2 = req->newpath + oldpathlen + oldbaselen; + state->bname2 = (req->newbname + oldpathlen + oldbaselen + + newpathlen); + + state->par = ntoh64 (req->oldpar); + state->par2 = ntoh64 (req->newpar); + } + + ret = server_loc_fill (&(state->loc), state, + 0, state->par, state->bname, + state->path); + ret = server_loc_fill (&(state->loc2), state, + 0, state->par2, state->bname2, + state->path2); + + rename_stub = fop_rename_stub (frame, + server_rename_resume, + &(state->loc), + &(state->loc2)); + + if 
((state->loc.parent == NULL) || + (state->loc.inode == NULL)){ + do_path_lookup (rename_stub, &(state->loc)); + } else if ((state->loc2.parent == NULL)){ + do_path_lookup (rename_stub, &(state->loc2)); + } else { + /* we have found inode for both oldpath and newpath in + * inode cache. lets continue with fops->rename() */ + call_resume (rename_stub); + } + + return 0; +} + + +/* + * server_lk - lk function for server protocol + * @frame: call frame + * @bound_xl: + * @params: parameter dictionary + * + * not for external reference + */ + +int32_t +server_lk (call_frame_t *frame, + xlator_t *bound_xl, + gf_hdr_common_t *hdr, size_t hdrlen, + char *buf, size_t buflen) +{ + struct flock lock = {0, }; + gf_fop_lk_req_t *req = NULL; + server_state_t *state = NULL; + server_connection_t *conn = NULL; + + conn = SERVER_CONNECTION(frame); + + req = gf_param (hdr); + state = CALL_STATE(frame); + { + state->fd_no = ntoh64 (req->fd); + if (state->fd_no >= 0) + state->fd = gf_fd_fdptr_get (conn->fdtable, + state->fd_no); + + state->cmd = ntoh32 (req->cmd); + state->type = ntoh32 (req->type); + } + + + if (state->fd == NULL) { + gf_log (frame->this->name, GF_LOG_ERROR, + "fd - %"PRId64": unresolved fd", + state->fd_no); + + server_lk_cbk (frame, NULL, frame->this, + -1, EINVAL, NULL); + + goto out; + } + + switch (state->cmd) { + case GF_LK_GETLK: + state->cmd = F_GETLK; + break; + case GF_LK_SETLK: + state->cmd = F_SETLK; + break; + case GF_LK_SETLKW: + state->cmd = F_SETLKW; + break; + } + + switch (state->type) { + case GF_LK_F_RDLCK: + lock.l_type = F_RDLCK; + break; + case GF_LK_F_WRLCK: + lock.l_type = F_WRLCK; + break; + case GF_LK_F_UNLCK: + lock.l_type = F_UNLCK; + break; + default: + gf_log (bound_xl->name, GF_LOG_ERROR, + "fd - %"PRId64" (%"PRId64"): Unknown lock type: %"PRId32"!", + state->fd_no, state->fd->inode->ino, state->type); + break; + } + + gf_flock_to_flock (&req->flock, &lock); + + gf_log (BOUND_XL(frame)->name, GF_LOG_DEBUG, + "%"PRId64": LK 
\'fd=%"PRId64" (%"PRId64")\'", + frame->root->unique, state->fd_no, state->fd->inode->ino); + + STACK_WIND (frame, server_lk_cbk, + BOUND_XL(frame), + BOUND_XL(frame)->fops->lk, + state->fd, state->cmd, &lock); + +out: + return 0; +} + + +/* + * server_writedir - + * + * @frame: + * @bound_xl: + * @params: + * + */ +int32_t +server_setdents (call_frame_t *frame, + xlator_t *bound_xl, + gf_hdr_common_t *hdr, size_t hdrlen, + char *buf, size_t buflen) +{ + server_connection_t *conn = NULL; + gf_fop_setdents_req_t *req = NULL; + server_state_t *state = NULL; + dir_entry_t *entry = NULL; + dir_entry_t *trav = NULL; + dir_entry_t *prev = NULL; + int32_t count = 0; + int32_t i = 0; + int32_t bread = 0; + char *ender = NULL; + char *buffer_ptr = NULL; + char tmp_buf[512] = {0,}; + + conn = SERVER_CONNECTION(frame); + + req = gf_param (hdr); + state = CALL_STATE(frame); + + state->fd_no = ntoh64 (req->fd); + if (state->fd_no >= 0) + state->fd = gf_fd_fdptr_get (conn->fdtable, + state->fd_no); + + state->nr_count = ntoh32 (req->count); + + if (state->fd == NULL) { + gf_log (frame->this->name, GF_LOG_ERROR, + "fd - %"PRId64": unresolved fd", + state->fd_no); + + server_setdents_cbk (frame, NULL, frame->this, + -1, EINVAL); + + goto out; + } + + if (buf == NULL) { + gf_log (frame->this->name, GF_LOG_ERROR, + "fd - %"PRId64" (%"PRId64"): received a null buffer, " + "returning EINVAL", + state->fd_no, state->fd->inode->ino); + + server_setdents_cbk (frame, NULL, frame->this, + -1, EINVAL); + + goto out; + } + + entry = CALLOC (1, sizeof (dir_entry_t)); + ERR_ABORT (entry); + prev = entry; + buffer_ptr = buf; + + for (i = 0; i < state->nr_count ; i++) { + bread = 0; + trav = CALLOC (1, sizeof (dir_entry_t)); + ERR_ABORT (trav); + + ender = strchr (buffer_ptr, '/'); + if (!ender) + break; + count = ender - buffer_ptr; + trav->name = CALLOC (1, count + 2); + ERR_ABORT (trav->name); + + strncpy (trav->name, buffer_ptr, count); + bread = count + 1; + buffer_ptr += bread; + + ender = 
strchr (buffer_ptr, '\n'); + if (!ender) + break; + count = ender - buffer_ptr; + strncpy (tmp_buf, buffer_ptr, count); + bread = count + 1; + buffer_ptr += bread; + + /* TODO: use str_to_stat instead */ + { + uint64_t dev; + uint64_t ino; + uint32_t mode; + uint32_t nlink; + uint32_t uid; + uint32_t gid; + uint64_t rdev; + uint64_t size; + uint32_t blksize; + uint64_t blocks; + uint32_t atime; + uint32_t atime_nsec; + uint32_t mtime; + uint32_t mtime_nsec; + uint32_t ctime; + uint32_t ctime_nsec; + + sscanf (tmp_buf, GF_STAT_PRINT_FMT_STR, + &dev, + &ino, + &mode, + &nlink, + &uid, + &gid, + &rdev, + &size, + &blksize, + &blocks, + &atime, + &atime_nsec, + &mtime, + &mtime_nsec, + &ctime, + &ctime_nsec); + + trav->buf.st_dev = dev; + trav->buf.st_ino = ino; + trav->buf.st_mode = mode; + trav->buf.st_nlink = nlink; + trav->buf.st_uid = uid; + trav->buf.st_gid = gid; + trav->buf.st_rdev = rdev; + trav->buf.st_size = size; + trav->buf.st_blksize = blksize; + trav->buf.st_blocks = blocks; + + trav->buf.st_atime = atime; + trav->buf.st_mtime = mtime; + trav->buf.st_ctime = ctime; + + ST_ATIM_NSEC_SET(&trav->buf, atime_nsec); + ST_MTIM_NSEC_SET(&trav->buf, mtime_nsec); + ST_CTIM_NSEC_SET(&trav->buf, ctime_nsec); + + } + + ender = strchr (buffer_ptr, '\n'); + if (!ender) + break; + count = ender - buffer_ptr; + *ender = '\0'; + if (S_ISLNK (trav->buf.st_mode)) { + trav->link = strdup (buffer_ptr); + } else + trav->link = ""; + bread = count + 1; + buffer_ptr += bread; + + prev->next = trav; + prev = trav; + } + + + gf_log (bound_xl->name, GF_LOG_DEBUG, + "%"PRId64": SETDENTS \'fd=%"PRId64" (%"PRId64"); count=%"PRId64, + frame->root->unique, state->fd_no, state->fd->inode->ino, + (int64_t)state->nr_count); + + STACK_WIND (frame, + server_setdents_cbk, + BOUND_XL(frame), + BOUND_XL(frame)->fops->setdents, + state->fd, + state->flags, + entry, + state->nr_count); + + + /* Free the variables allocated in this fop here */ + trav = entry->next; + prev = entry; + while (trav) { 
+ prev->next = trav->next; + FREE (trav->name); + if (S_ISLNK (trav->buf.st_mode)) + FREE (trav->link); + FREE (trav); + trav = prev->next; + } + FREE (entry); + +out: + return 0; +} + + + +/* xxx_MOPS */ + +/* Management Calls */ +/* + * mop_getspec - getspec function for server protocol + * @frame: call frame + * @bound_xl: + * @params: + * + */ +int32_t +mop_getspec (call_frame_t *frame, + xlator_t *bound_xl, + gf_hdr_common_t *hdr, size_t hdrlen, + char *buf, size_t buflen) +{ + gf_hdr_common_t *_hdr = NULL; + gf_mop_getspec_rsp_t *rsp = NULL; + int32_t ret = -1; + int32_t op_errno = ENOENT; + int32_t gf_errno = 0; + int32_t spec_fd = -1; + size_t file_len = 0; + size_t _hdrlen = 0; + char tmp_filename[ZR_FILENAME_MAX] = {0,}; + char data_key[256] = {0,}; + char *filename = NULL; + struct stat stbuf = {0,}; + peer_info_t *peerinfo = NULL; + transport_t *trans = NULL; + + gf_mop_getspec_req_t *req = NULL; + uint32_t flags = 0; + uint32_t keylen = 0; + char *key = NULL; + + req = gf_param (hdr); + flags = ntoh32 (req->flags); + keylen = ntoh32 (req->keylen); + if (keylen) { + key = req->key; + } + + trans = TRANSPORT_FROM_FRAME(frame); + + peerinfo = &(trans->peerinfo); + /* Inform users that this option is changed now */ + ret = dict_get_str (frame->this->options, "client-volume-filename", + &filename); + if (ret == 0) { + gf_log (trans->xl->name, GF_LOG_WARNING, + "option 'client-volume-specfile' is changed to " + "'volume-filename.<key>' which now takes 'key' as an " + "option to choose/fetch different files from server. " + "Refer documentation or contact developers for more " + "info. Currently defaulting to given file '%s'", + filename); + } + + if (key && !filename) { + sprintf (data_key, "volume-filename.%s", key); + ret = dict_get_str (frame->this->options, data_key, &filename); + if (ret < 0) { + gf_log (trans->xl->name, GF_LOG_ERROR, + "failed to get corresponding volume file " + "for the key '%s'. 
using default file %s", + key, GLUSTERFSD_SPEC_PATH); + } + } + if (!filename) { + filename = GLUSTERFSD_SPEC_PATH; + if (!key) + gf_log (trans->xl->name, GF_LOG_WARNING, + "using default volume file %s", + GLUSTERFSD_SPEC_PATH); + } + + { + sprintf (tmp_filename, "%s.%s", + filename, peerinfo->identifier); + + /* Try for ip specific client volfile. + * If not found, then go for, regular client file. + */ + ret = open (tmp_filename, O_RDONLY); + spec_fd = ret; + if (spec_fd < 0) { + gf_log (trans->xl->name, GF_LOG_DEBUG, + "Unable to open %s (%s)", + tmp_filename, strerror (errno)); + /* fall back */ + ret = open (filename, O_RDONLY); + spec_fd = ret; + if (spec_fd < 0) { + gf_log (trans->xl->name, GF_LOG_ERROR, + "Unable to open %s (%s)", + filename, strerror (errno)); + goto fail; + } + } else { + /* Successful */ + filename = tmp_filename; + } + } + + /* to allocate the proper buffer to hold the file data */ + { + ret = stat (filename, &stbuf); + if (ret < 0){ + gf_log (trans->xl->name, GF_LOG_ERROR, + "Unable to stat %s (%s)", + filename, strerror (errno)); + goto fail; + } + + file_len = stbuf.st_size; + } + +fail: + op_errno = errno; + + _hdrlen = gf_hdr_len (rsp, file_len + 1); + _hdr = gf_hdr_new (rsp, file_len + 1); + rsp = gf_param (_hdr); + + _hdr->rsp.op_ret = hton32 (ret); + gf_errno = gf_errno_to_error (op_errno); + _hdr->rsp.op_errno = hton32 (gf_errno); + + if (file_len) { + read (spec_fd, rsp->spec, file_len); + close (spec_fd); + } + protocol_server_reply (frame, GF_OP_TYPE_MOP_REPLY, GF_MOP_GETSPEC, + _hdr, _hdrlen, NULL, 0, NULL); + + return 0; +} + +int32_t +server_checksum_cbk (call_frame_t *frame, + void *cookie, + xlator_t *this, + int32_t op_ret, + int32_t op_errno, + uint8_t *fchecksum, + uint8_t *dchecksum) +{ + gf_hdr_common_t *hdr = NULL; + gf_fop_checksum_rsp_t *rsp = NULL; + size_t hdrlen = 0; + int32_t gf_errno = 0; + + hdrlen = gf_hdr_len (rsp, ZR_FILENAME_MAX + 1 + ZR_FILENAME_MAX + 1); + hdr = gf_hdr_new (rsp, ZR_FILENAME_MAX + 1 
+ ZR_FILENAME_MAX + 1); + rsp = gf_param (hdr); + + hdr->rsp.op_ret = hton32 (op_ret); + gf_errno = gf_errno_to_error (op_errno); + hdr->rsp.op_errno = hton32 (gf_errno); + + if (op_ret >= 0) { + memcpy (rsp->fchecksum, fchecksum, ZR_FILENAME_MAX); + rsp->fchecksum[ZR_FILENAME_MAX] = '\0'; + memcpy (rsp->dchecksum + ZR_FILENAME_MAX, + dchecksum, ZR_FILENAME_MAX); + rsp->dchecksum[ZR_FILENAME_MAX + ZR_FILENAME_MAX] = '\0'; + } + + protocol_server_reply (frame, GF_OP_TYPE_FOP_REPLY, GF_FOP_CHECKSUM, + hdr, hdrlen, NULL, 0, NULL); + + return 0; +} + +int32_t +server_checksum (call_frame_t *frame, + xlator_t *bound_xl, + gf_hdr_common_t *hdr, size_t hdrlen, + char *buf, size_t buflen) +{ + loc_t loc = {0,}; + int32_t flag = 0; + gf_fop_checksum_req_t *req = NULL; + + req = gf_param (hdr); + + loc.path = req->path; + loc.ino = ntoh64 (req->ino); + loc.inode = NULL; + flag = ntoh32 (req->flag); + + gf_log (bound_xl->name, GF_LOG_DEBUG, + "%"PRId64": CHECKSUM \'%s (%"PRId64")\'", + frame->root->unique, loc.path, loc.ino); + + STACK_WIND (frame, + server_checksum_cbk, + BOUND_XL(frame), + BOUND_XL(frame)->fops->checksum, + &loc, + flag); + + return 0; +} + + +/* + * mop_unlock - unlock management function for server protocol + * @frame: call frame + * @bound_xl: + * @params: parameter dictionary + * + */ +int32_t +mop_getvolume (call_frame_t *frame, + xlator_t *bound_xl, + gf_hdr_common_t *hdr, size_t hdrlen, + char *buf, size_t buflen) +{ + return 0; +} + +struct __get_xl_struct { + const char *name; + xlator_t *reply; +}; + +void __check_and_set (xlator_t *each, + void *data) +{ + if (!strcmp (each->name, + ((struct __get_xl_struct *) data)->name)) + ((struct __get_xl_struct *) data)->reply = each; +} + +static xlator_t * +get_xlator_by_name (xlator_t *some_xl, + const char *name) +{ + struct __get_xl_struct get = { + .name = name, + .reply = NULL + }; + + xlator_foreach (some_xl, __check_and_set, &get); + + return get.reply; +} + + +/* + * mop_setvolume - setvolume 
management function for server protocol + * @frame: call frame + * @bound_xl: + * @params: parameter dictionary + * + */ +int +mop_setvolume (call_frame_t *frame, xlator_t *bound_xl, + gf_hdr_common_t *req_hdr, size_t req_hdrlen, + char *req_buf, size_t req_buflen) +{ + server_connection_t *conn = NULL; + server_conf_t *conf = NULL; + gf_hdr_common_t *rsp_hdr = NULL; + gf_mop_setvolume_req_t *req = NULL; + gf_mop_setvolume_rsp_t *rsp = NULL; + peer_info_t *peerinfo = NULL; + int32_t ret = -1; + int32_t op_ret = -1; + int32_t op_errno = EINVAL; + int32_t gf_errno = 0; + dict_t *reply = NULL; + dict_t *config_params = NULL; + dict_t *params = NULL; + char *name = NULL; + char *version = NULL; + char *process_uuid = NULL; + xlator_t *xl = NULL; + transport_t *trans = NULL; + size_t rsp_hdrlen = -1; + size_t dict_len = -1; + size_t req_dictlen = -1; + + params = dict_new (); + reply = dict_new (); + + req = gf_param (req_hdr); + req_dictlen = ntoh32 (req->dict_len); + ret = dict_unserialize (req->buf, req_dictlen, ¶ms); + + config_params = dict_copy_with_ref (frame->this->options, NULL); + trans = TRANSPORT_FROM_FRAME(frame); + conf = SERVER_CONF(frame); + + if (ret < 0) { + ret = dict_set_str (reply, "ERROR", + "Internal error: failed to unserialize " + "request dictionary"); + if (ret < 0) + gf_log (bound_xl->name, GF_LOG_ERROR, + "failed to set error msg \"%s\"", + "Internal error: failed to unserialize " + "request dictionary"); + + op_ret = -1; + op_errno = EINVAL; + goto fail; + } + + ret = dict_get_str (params, "process-uuid", &process_uuid); + if (ret < 0) { + ret = dict_set_str (reply, "ERROR", + "UUID not specified"); + if (ret < 0) + gf_log (bound_xl->name, GF_LOG_ERROR, + "failed to set error msg"); + + op_ret = -1; + op_errno = EINVAL; + goto fail; + } + + + conn = server_connection_get (frame->this, process_uuid); + if (trans->xl_private != conn) + trans->xl_private = conn; + + ret = dict_get_str (params, "version", &version); + if (ret < 0) { + ret = 
dict_set_str (reply, "ERROR", + "No version number specified"); + if (ret < 0) + gf_log (trans->xl->name, GF_LOG_ERROR, + "failed to set error msg"); + + op_ret = -1; + op_errno = EINVAL; + goto fail; + } + + ret = strcmp (version, PACKAGE_VERSION); + if (ret != 0) { + char *msg = NULL; + asprintf (&msg, + "Version mismatch: client(%s) Vs server (%s)", + version, PACKAGE_VERSION); + ret = dict_set_dynstr (reply, "ERROR", msg); + if (ret < 0) + gf_log (trans->xl->name, GF_LOG_ERROR, + "failed to set error msg"); + + op_ret = -1; + op_errno = EINVAL; + goto fail; + } + + + ret = dict_get_str (params, + "remote-subvolume", &name); + if (ret < 0) { + ret = dict_set_str (reply, "ERROR", + "No remote-subvolume option specified"); + if (ret < 0) + gf_log (trans->xl->name, GF_LOG_ERROR, + "failed to set error msg"); + + op_ret = -1; + op_errno = EINVAL; + goto fail; + } + + xl = get_xlator_by_name (frame->this, name); + if (xl == NULL) { + char *msg = NULL; + asprintf (&msg, "remote-subvolume \"%s\" is not found", name); + ret = dict_set_dynstr (reply, "ERROR", msg); + if (ret < 0) + gf_log (trans->xl->name, GF_LOG_ERROR, + "failed to set error msg"); + + op_ret = -1; + op_errno = ENOENT; + goto fail; + } + + peerinfo = &trans->peerinfo; + ret = dict_set_static_ptr (params, "peer-info", peerinfo); + if (ret < 0) + gf_log (trans->xl->name, GF_LOG_ERROR, + "failed to set peer-info"); + + if (conf->auth_modules == NULL) { + gf_log (trans->xl->name, GF_LOG_ERROR, + "Authentication module not initialized"); + } + + ret = gf_authenticate (params, config_params, + conf->auth_modules); + if (ret == AUTH_ACCEPT) { + gf_log (trans->xl->name, GF_LOG_INFO, + "accepted client from %s", + peerinfo->identifier); + op_ret = 0; + conn->bound_xl = xl; + ret = dict_set_str (reply, "ERROR", "Success"); + if (ret < 0) + gf_log (trans->xl->name, GF_LOG_ERROR, + "failed to set error msg"); + } else { + gf_log (trans->xl->name, GF_LOG_ERROR, + "Cannot authenticate client from %s", + 
peerinfo->identifier); + op_ret = -1; + op_errno = EACCES; + ret = dict_set_str (reply, "ERROR", "Authentication failed"); + if (ret < 0) + gf_log (bound_xl->name, GF_LOG_ERROR, + "failed to set error msg"); + + goto fail; + } + + if (conn->bound_xl == NULL) { + ret = dict_set_str (reply, "ERROR", + "Check volfile and handshake " + "options in protocol/client"); + if (ret < 0) + gf_log (trans->xl->name, GF_LOG_ERROR, + "failed to set error msg"); + + op_ret = -1; + op_errno = EACCES; + goto fail; + } + + if ((conn->bound_xl != NULL) && + (ret >= 0) && + (conn->bound_xl->itable == NULL)) { + /* create inode table for this bound_xl, if one doesn't + already exist */ + int32_t lru_limit = 1024; + + lru_limit = INODE_LRU_LIMIT (frame->this); + + gf_log (trans->xl->name, GF_LOG_DEBUG, + "creating inode table with lru_limit=%"PRId32", " + "xlator=%s", lru_limit, conn->bound_xl->name); + + conn->bound_xl->itable = + inode_table_new (lru_limit, + conn->bound_xl); + } + + ret = dict_set_str (reply, "process-uuid", + xl->ctx->process_uuid); + +fail: + dict_len = dict_serialized_length (reply); + if (dict_len < 0) { + gf_log (xl->name, GF_LOG_ERROR, + "failed to get serialized length of reply dict"); + op_ret = -1; + op_errno = EINVAL; + dict_len = 0; + } + + rsp_hdr = gf_hdr_new (rsp, dict_len); + rsp_hdrlen = gf_hdr_len (rsp, dict_len); + rsp = gf_param (rsp_hdr); + + if (dict_len) { + ret = dict_serialize (reply, rsp->buf); + if (ret < 0) { + gf_log (xl->name, GF_LOG_ERROR, + "failed to serialize reply dict"); + op_ret = -1; + op_errno = -ret; + } + } + rsp->dict_len = hton32 (dict_len); + + rsp_hdr->rsp.op_ret = hton32 (op_ret); + gf_errno = gf_errno_to_error (op_errno); + rsp_hdr->rsp.op_errno = hton32 (gf_errno); + + protocol_server_reply (frame, GF_OP_TYPE_MOP_REPLY, GF_MOP_SETVOLUME, + rsp_hdr, rsp_hdrlen, NULL, 0, NULL); + + dict_unref (params); + dict_unref (reply); + dict_unref (config_params); + + return 0; +} + +/* + * server_mop_stats_cbk - stats callback for 
server management operation + * @frame: call frame + * @cookie: + * @this: + * @op_ret: return value + * @op_errno: errno + * @stats:err + * + * not for external reference + */ + +int32_t +server_mop_stats_cbk (call_frame_t *frame, + void *cookie, + xlator_t *xl, + int32_t ret, + int32_t op_errno, + struct xlator_stats *stats) +{ + /* TODO: get this information from somewhere else, not extern */ + gf_hdr_common_t *hdr = NULL; + gf_mop_stats_rsp_t *rsp = NULL; + char buffer[256] = {0,}; + int64_t glusterfsd_stats_nr_clients = 0; + size_t hdrlen = 0; + size_t buf_len = 0; + int32_t gf_errno = 0; + + if (ret >= 0) { + sprintf (buffer, + "%"PRIx64",%"PRIx64",%"PRIx64 + ",%"PRIx64",%"PRIx64",%"PRIx64 + ",%"PRIx64",%"PRIx64"\n", + stats->nr_files, + stats->disk_usage, + stats->free_disk, + stats->total_disk_size, + stats->read_usage, + stats->write_usage, + stats->disk_speed, + glusterfsd_stats_nr_clients); + + buf_len = strlen (buffer); + } + + hdrlen = gf_hdr_len (rsp, buf_len + 1); + hdr = gf_hdr_new (rsp, buf_len + 1); + rsp = gf_param (hdr); + + hdr->rsp.op_ret = hton32 (ret); + gf_errno = gf_errno_to_error (op_errno); + hdr->rsp.op_errno = hton32 (gf_errno); + + strcpy (rsp->buf, buffer); + + protocol_server_reply (frame, GF_OP_TYPE_MOP_REPLY, GF_MOP_STATS, + hdr, hdrlen, NULL, 0, NULL); + + return 0; +} + + +/* + * mop_unlock - unlock management function for server protocol + * @frame: call frame + * @bound_xl: + * @params: parameter dictionary + * + */ +static int32_t +mop_stats (call_frame_t *frame, + xlator_t *bound_xl, + gf_hdr_common_t *hdr, size_t hdrlen, + char *buf, size_t buflen) +{ + int32_t flag = 0; + gf_mop_stats_req_t *req = NULL; + + req = gf_param (hdr); + + flag = ntoh32 (req->flags); + + STACK_WIND (frame, + server_mop_stats_cbk, + bound_xl, + bound_xl->mops->stats, + flag); + + return 0; +} + +int32_t +mop_ping (call_frame_t *frame, + xlator_t *bound_xl, + gf_hdr_common_t *hdr, size_t hdrlen, + char *buf, size_t buflen) +{ + gf_hdr_common_t 
*rsp_hdr = NULL; + gf_mop_ping_rsp_t *rsp = NULL; + size_t rsp_hdrlen = 0; + + rsp_hdrlen = gf_hdr_len (rsp, 0); + rsp_hdr = gf_hdr_new (rsp, 0); + + hdr->rsp.op_ret = 0; + + protocol_server_reply (frame, GF_OP_TYPE_MOP_REPLY, GF_MOP_PING, + rsp_hdr, rsp_hdrlen, NULL, 0, NULL); + + return 0; +} +/* + * unknown_op_cbk - This function is called when a opcode for unknown + * type is called. Helps to keep the backward/forward + * compatiblity + * @frame: call frame + * @type: + * @opcode: + * + */ + +int32_t +unknown_op_cbk (call_frame_t *frame, + int32_t type, + int32_t opcode) +{ + gf_hdr_common_t *hdr = NULL; + gf_fop_flush_rsp_t *rsp = NULL; + size_t hdrlen = 0; + int32_t gf_errno = 0; + hdrlen = gf_hdr_len (rsp, 0); + hdr = gf_hdr_new (rsp, 0); + rsp = gf_param (hdr); + + hdr->rsp.op_ret = hton32 (-1); + gf_errno = gf_errno_to_error (ENOSYS); + hdr->rsp.op_errno = hton32 (gf_errno); + + protocol_server_reply (frame, type, opcode, + hdr, hdrlen, NULL, 0, NULL); + + return 0; +} + +/* + * get_frame_for_transport - get call frame for specified transport object + * + * @trans: transport object + * + */ +static call_frame_t * +get_frame_for_transport (transport_t *trans) +{ + call_frame_t *frame = NULL; + call_pool_t *pool = NULL; + server_connection_t *conn = NULL; + server_state_t *state = NULL;; + + GF_VALIDATE_OR_GOTO("server", trans, out); + + if (trans->xl && trans->xl->ctx) + pool = trans->xl->ctx->pool; + GF_VALIDATE_OR_GOTO("server", pool, out); + + frame = create_frame (trans->xl, pool); + GF_VALIDATE_OR_GOTO("server", frame, out); + + state = CALLOC (1, sizeof (*state)); + GF_VALIDATE_OR_GOTO("server", state, out); + + conn = trans->xl_private; + if (conn) { + if (conn->bound_xl) + state->itable = conn->bound_xl->itable; + state->bound_xl = conn->bound_xl; + } + + state->trans = transport_ref (trans); + + frame->root->trans = conn; + frame->root->state = state; /* which socket */ + frame->root->unique = 0; /* which call */ + +out: + return frame; +} + +/* + 
* get_frame_for_call - create a frame into the capable of + * generating and replying the reply packet by itself. + * By making a call with this frame, the last UNWIND + * function will have all needed state from its + * frame_t->root to send reply. + * @trans: + * @blk: + * @params: + * + * not for external reference + */ +static call_frame_t * +get_frame_for_call (transport_t *trans, gf_hdr_common_t *hdr) +{ + call_frame_t *frame = NULL; + + frame = get_frame_for_transport (trans); + + frame->root->op = ntoh32 (hdr->op); + frame->root->type = ntoh32 (hdr->type); + + frame->root->uid = ntoh32 (hdr->req.uid); + frame->root->unique = ntoh64 (hdr->callid); /* which call */ + frame->root->gid = ntoh32 (hdr->req.gid); + frame->root->pid = ntoh32 (hdr->req.pid); + + return frame; +} + +/* + * prototype of operations function for each of mop and + * fop at server protocol level + * + * @frame: call frame pointer + * @bound_xl: the xlator that this frame is bound to + * @params: parameters dictionary + * + * to be used by protocol interpret, _not_ for exterenal reference + */ +typedef int32_t (*gf_op_t) (call_frame_t *frame, xlator_t *bould_xl, + gf_hdr_common_t *hdr, size_t hdrlen, + char *buf, size_t buflen); + + +static gf_op_t gf_fops[] = { + [GF_FOP_STAT] = server_stat, + [GF_FOP_READLINK] = server_readlink, + [GF_FOP_MKNOD] = server_mknod, + [GF_FOP_MKDIR] = server_mkdir, + [GF_FOP_UNLINK] = server_unlink, + [GF_FOP_RMDIR] = server_rmdir, + [GF_FOP_SYMLINK] = server_symlink, + [GF_FOP_RENAME] = server_rename, + [GF_FOP_LINK] = server_link, + [GF_FOP_CHMOD] = server_chmod, + [GF_FOP_CHOWN] = server_chown, + [GF_FOP_TRUNCATE] = server_truncate, + [GF_FOP_OPEN] = server_open, + [GF_FOP_READ] = server_readv, + [GF_FOP_WRITE] = server_writev, + [GF_FOP_STATFS] = server_statfs, + [GF_FOP_FLUSH] = server_flush, + [GF_FOP_FSYNC] = server_fsync, + [GF_FOP_SETXATTR] = server_setxattr, + [GF_FOP_GETXATTR] = server_getxattr, + [GF_FOP_REMOVEXATTR] = server_removexattr, + 
[GF_FOP_OPENDIR] = server_opendir, + [GF_FOP_GETDENTS] = server_getdents, + [GF_FOP_FSYNCDIR] = server_fsyncdir, + [GF_FOP_ACCESS] = server_access, + [GF_FOP_CREATE] = server_create, + [GF_FOP_FTRUNCATE] = server_ftruncate, + [GF_FOP_FSTAT] = server_fstat, + [GF_FOP_LK] = server_lk, + [GF_FOP_UTIMENS] = server_utimens, + [GF_FOP_FCHMOD] = server_fchmod, + [GF_FOP_FCHOWN] = server_fchown, + [GF_FOP_LOOKUP] = server_lookup, + [GF_FOP_SETDENTS] = server_setdents, + [GF_FOP_READDIR] = server_readdir, + [GF_FOP_INODELK] = server_inodelk, + [GF_FOP_FINODELK] = server_finodelk, + [GF_FOP_ENTRYLK] = server_entrylk, + [GF_FOP_FENTRYLK] = server_fentrylk, + [GF_FOP_CHECKSUM] = server_checksum, + [GF_FOP_XATTROP] = server_xattrop, + [GF_FOP_FXATTROP] = server_fxattrop, +}; + + + +static gf_op_t gf_mops[] = { + [GF_MOP_SETVOLUME] = mop_setvolume, + [GF_MOP_GETVOLUME] = mop_getvolume, + [GF_MOP_STATS] = mop_stats, + [GF_MOP_GETSPEC] = mop_getspec, + [GF_MOP_PING] = mop_ping, +}; + +static gf_op_t gf_cbks[] = { + [GF_CBK_FORGET] = server_forget, + [GF_CBK_RELEASE] = server_release, + [GF_CBK_RELEASEDIR] = server_releasedir +}; + +int +protocol_server_interpret (xlator_t *this, transport_t *trans, + char *hdr_p, size_t hdrlen, char *buf, + size_t buflen) +{ + server_connection_t *conn = NULL; + gf_hdr_common_t *hdr = NULL; + xlator_t *bound_xl = NULL; + call_frame_t *frame = NULL; + peer_info_t *peerinfo = NULL; + int32_t type = -1; + int32_t op = -1; + int32_t ret = -1; + + hdr = (gf_hdr_common_t *)hdr_p; + type = ntoh32 (hdr->type); + op = ntoh32 (hdr->op); + + conn = trans->xl_private; + if (conn) + bound_xl = conn->bound_xl; + + peerinfo = &trans->peerinfo; + switch (type) { + case GF_OP_TYPE_FOP_REQUEST: + if ((op < 0) || + (op > GF_FOP_MAXVALUE)) { + gf_log (this->name, GF_LOG_ERROR, + "invalid fop %"PRId32" from client %s", + op, peerinfo->identifier); + break; + } + if (bound_xl == NULL) { + gf_log (this->name, GF_LOG_ERROR, + "Received fop %"PRId32" before " + 
"authentication.", op); + break; + } + frame = get_frame_for_call (trans, hdr); + ret = gf_fops[op] (frame, bound_xl, hdr, hdrlen, buf, buflen); + break; + + case GF_OP_TYPE_MOP_REQUEST: + if (op < 0 || op > GF_MOP_MAXVALUE) { + gf_log (this->name, GF_LOG_ERROR, + "invalid mop %"PRId32" from client %s", + op, peerinfo->identifier); + break; + } + frame = get_frame_for_call (trans, hdr); + ret = gf_mops[op] (frame, bound_xl, hdr, hdrlen, buf, buflen); + break; + + case GF_OP_TYPE_CBK_REQUEST: + if (op < 0 || op > GF_CBK_MAXVALUE) { + gf_log (this->name, GF_LOG_ERROR, + "invalid cbk %"PRId32" from client %s", + op, peerinfo->identifier); + break; + } + if (bound_xl == NULL) { + gf_log (this->name, GF_LOG_ERROR, + "Received cbk %d before authentication.", op); + break; + } + + frame = get_frame_for_call (trans, hdr); + ret = gf_cbks[op] (frame, bound_xl, hdr, hdrlen, buf, buflen); + break; + + default: + break; + } + + return ret; +} + + +/* + * server_nop_cbk - nop callback for server protocol + * @frame: call frame + * @cookie: + * @this: + * @op_ret: return value + * @op_errno: errno + * + * not for external reference + */ +int +server_nop_cbk (call_frame_t *frame, void *cookie, + xlator_t *this, int32_t op_ret, int32_t op_errno) +{ + server_state_t *state = NULL; + + state = CALL_STATE(frame); + + if (state) + free_state (state); + STACK_DESTROY (frame->root); + return 0; +} + + +static void +get_auth_types (dict_t *this, + char *key, + data_t *value, + void *data) +{ + dict_t *auth_dict = data; + char *saveptr = NULL, *tmp = NULL; + char *key_cpy = NULL; + int32_t ret = -1; + + key_cpy = strdup (key); + GF_VALIDATE_OR_GOTO("server", key_cpy, out); + + tmp = strtok_r (key_cpy, ".", &saveptr); + ret = strcmp (tmp, "auth"); + if (ret == 0) { + tmp = strtok_r (NULL, ".", &saveptr); + if (strcmp (tmp, "ip") == 0) { + /* TODO: backward compatibility, remove when + newer versions are available */ + tmp = "addr"; + gf_log ("server", GF_LOG_WARNING, + "assuming 'auth.ip' 
to be 'auth.addr'"); + } + ret = dict_set_dynptr (auth_dict, tmp, NULL, 0); + if (ret < 0) { + gf_log ("server", GF_LOG_ERROR, + "failed to dict_set_dynptr"); + } + } + + FREE (key_cpy); +out: + return; +} + + +static int +validate_auth_options (xlator_t *this, dict_t *dict) +{ + int ret = -1; + int error = 0; + xlator_list_t *trav = NULL; + data_pair_t *pair = NULL; + char *saveptr = NULL, *tmp = NULL; + char *key_cpy = NULL; + + trav = this->children; + while (trav) { + error = -1; + for (pair = dict->members_list; pair; pair = pair->next) { + key_cpy = strdup (pair->key); + tmp = strtok_r (key_cpy, ".", &saveptr); + ret = strcmp (tmp, "auth"); + if (ret == 0) { + /* for module type */ + tmp = strtok_r (NULL, ".", &saveptr); + /* for volume name */ + tmp = strtok_r (NULL, ".", &saveptr); + } + + if (strcmp (tmp, trav->xlator->name) == 0) { + error = 0; + free (key_cpy); + break; + } + free (key_cpy); + } + if (-1 == error) { + gf_log (this->name, GF_LOG_ERROR, + "volume '%s' defined as subvolume, but no " + "authentication defined for the same", + trav->xlator->name); + break; + } + trav = trav->next; + } + + return error; +} + + +/* + * init - called during server protocol initialization + * + * @this: + * + */ +int +init (xlator_t *this) +{ + int32_t ret = -1; + transport_t *trans = NULL; + server_conf_t *conf = NULL; + + if (this->children == NULL) { + gf_log (this->name, GF_LOG_ERROR, + "protocol/server should have subvolume"); + goto out; + } + + trans = transport_load (this->options, this); + if (trans == NULL) { + gf_log (this->name, GF_LOG_ERROR, + "failed to load transport"); + goto out; + } + + ret = transport_listen (trans); + if (ret == -1) { + gf_log (this->name, GF_LOG_ERROR, + "failed to bind/listen on socket"); + goto out; + } + + conf = CALLOC (1, sizeof (server_conf_t)); + GF_VALIDATE_OR_GOTO(this->name, conf, out); + + INIT_LIST_HEAD (&conf->conns); + pthread_mutex_init (&conf->mutex, NULL); + + conf->trans = trans; + + conf->auth_modules = 
dict_new (); + GF_VALIDATE_OR_GOTO(this->name, conf->auth_modules, out); + + dict_foreach (this->options, get_auth_types, + conf->auth_modules); + ret = validate_auth_options (this, this->options); + if (ret == -1) { + /* logging already done in validate_auth_options function. */ + goto out; + } + + ret = gf_auth_init (this, conf->auth_modules); + if (ret) { + dict_unref (conf->auth_modules); + goto out; + } + + this->private = conf; + + ret = dict_get_int32 (this->options, "inode-lru-limit", + &conf->inode_lru_limit); + if (ret < 0) { + conf->inode_lru_limit = 1024; + } + + ret = dict_get_int32 (this->options, "limits.transaction-size", + &conf->max_block_size); + if (ret < 0) { + gf_log (this->name, GF_LOG_DEBUG, + "defaulting limits.transaction-size to %d", + DEFAULT_BLOCK_SIZE); + conf->max_block_size = DEFAULT_BLOCK_SIZE; + } + +#ifndef GF_DARWIN_HOST_OS + { + struct rlimit lim; + + lim.rlim_cur = 1048576; + lim.rlim_max = 1048576; + + if (setrlimit (RLIMIT_NOFILE, &lim) == -1) { + gf_log (this->name, GF_LOG_WARNING, + "WARNING: Failed to set 'ulimit -n 1M': %s", + strerror(errno)); + lim.rlim_cur = 65536; + lim.rlim_max = 65536; + + if (setrlimit (RLIMIT_NOFILE, &lim) == -1) { + gf_log (this->name, GF_LOG_ERROR, + "Failed to set max open fd to 64k: %s", + strerror(errno)); + } else { + gf_log (this->name, GF_LOG_ERROR, + "max open fd set to 64k"); + } + } + } +#endif + this->ctx->top = this; + + ret = 0; +out: + return ret; +} + + + +int +protocol_server_pollin (xlator_t *this, transport_t *trans) +{ + char *hdr = NULL; + size_t hdrlen = 0; + char *buf = NULL; + size_t buflen = 0; + int ret = -1; + + + ret = transport_receive (trans, &hdr, &hdrlen, &buf, &buflen); + + if (ret == 0) + ret = protocol_server_interpret (this, trans, hdr, + hdrlen, buf, buflen); + + /* TODO: use mem-pool */ + FREE (hdr); + + return ret; +} + + +/* + * fini - finish function for server protocol, called before + * unloading server protocol. 
+ * + * @this: + * + */ +void +fini (xlator_t *this) +{ + server_conf_t *conf = this->private; + + GF_VALIDATE_OR_GOTO(this->name, conf, out); + + if (conf->auth_modules) { + dict_unref (conf->auth_modules); + } + + FREE (conf); + this->private = NULL; +out: + return; +} + +/* + * server_protocol_notify - notify function for server protocol + * @this: + * @trans: + * @event: + * + */ +int +notify (xlator_t *this, int32_t event, void *data, ...) +{ + int ret = 0; + transport_t *trans = data; + + switch (event) { + case GF_EVENT_POLLIN: + ret = protocol_server_pollin (this, trans); + break; + case GF_EVENT_POLLERR: + { + peer_info_t *peerinfo = NULL; + + peerinfo = &(trans->peerinfo); + gf_log (trans->xl->name, GF_LOG_INFO, "%s disconnected", + peerinfo->identifier); + + ret = -1; + transport_disconnect (trans); + } + break; + + case GF_EVENT_TRANSPORT_CLEANUP: + { + if (trans->xl_private) + server_connection_put (this, trans->xl_private); + } + break; + + default: + default_notify (this, event, data); + break; + } + + return ret; +} + + +struct xlator_mops mops = { +}; + +struct xlator_fops fops = { +}; + +struct xlator_cbks cbks = { +}; + +struct volume_options options[] = { + { .key = {"transport-type"}, + .value = {"tcp", "socket", "ib-verbs", "unix", "ib-sdp", + "tcp/server", "ib-verbs/server"}, + .type = GF_OPTION_TYPE_STR + }, + { .key = {"volume-filename.*"}, + .type = GF_OPTION_TYPE_PATH, + }, + { .key = {"inode-lru-limit"}, + .type = GF_OPTION_TYPE_INT, + .min = 0, + .max = (1 * GF_UNIT_MB) + }, + { .key = {"client-volume-filename"}, + .type = GF_OPTION_TYPE_PATH + }, + { .key = {NULL} }, +}; diff --git a/xlators/protocol/server/src/server-protocol.h b/xlators/protocol/server/src/server-protocol.h new file mode 100644 index 000000000..cc5f6f951 --- /dev/null +++ b/xlators/protocol/server/src/server-protocol.h @@ -0,0 +1,143 @@ +/* + Copyright (c) 2006, 2007, 2008 Z RESEARCH, Inc. <http://www.zresearch.com> + This file is part of GlusterFS. 
+ + GlusterFS is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published + by the Free Software Foundation; either version 3 of the License, + or (at your option) any later version. + + GlusterFS is distributed in the hope that it will be useful, but + WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program. If not, see + <http://www.gnu.org/licenses/>. +*/ + +#ifndef _SERVER_PROTOCOL_H_ +#define _SERVER_PROTOCOL_H_ + +#ifndef _CONFIG_H +#define _CONFIG_H +#include "config.h" +#endif + +#include <pthread.h> + +#include "glusterfs.h" +#include "xlator.h" +#include "logging.h" +#include "call-stub.h" +#include "authenticate.h" +#include "fd.h" +#include "byte-order.h" + +#define DEFAULT_BLOCK_SIZE 4194304 /* 4MB */ +#define GLUSTERFSD_SPEC_PATH CONFDIR "/glusterfs-client.vol" + +typedef struct _server_state server_state_t; + +struct _locker { + struct list_head lockers; + loc_t loc; + fd_t *fd; + pid_t pid; +}; + +struct _lock_table { + struct list_head file_lockers; + struct list_head dir_lockers; + gf_lock_t lock; + size_t count; +}; + + +/* private structure per connection (transport object) + * used as transport_t->xl_private + */ +struct _server_connection { + struct list_head list; + char *id; + int ref; + pthread_mutex_t lock; + char disconnected; + fdtable_t *fdtable; + struct _lock_table *ltable; + xlator_t *bound_xl; +}; + +typedef struct _server_connection server_connection_t; + + +server_connection_t * +server_connection_get (xlator_t *this, const char *id); + +void +server_connection_put (xlator_t *this, server_connection_t *conn); + +int +server_connection_destroy (xlator_t *this, server_connection_t *conn); + +int +server_nop_cbk (call_frame_t *frame, void *cookie, + xlator_t *this, 
int32_t op_ret, int32_t op_errno); + + +typedef struct { + dict_t *auth_modules; + transport_t *trans; + int32_t max_block_size; + int32_t inode_lru_limit; + pthread_mutex_t mutex; + struct list_head conns; +} server_conf_t; + + +struct _server_state { + transport_t *trans; + xlator_t *bound_xl; + loc_t loc; + loc_t loc2; + int flags; + fd_t *fd; + size_t size; + off_t offset; + mode_t mode; + dev_t dev; + uid_t uid; + gid_t gid; + size_t nr_count; + int cmd; + int type; + char *name; + int name_len; + inode_table_t *itable; + int64_t fd_no; + ino_t ino; + ino_t par; + ino_t ino2; + ino_t par2; + char *path; + char *path2; + char *bname; + char *bname2; + int mask; + char is_revalidate; + dict_t *xattr_req; + struct flock flock; + struct timespec tv[2]; + char *resolved; +}; + + +int +server_stub_resume (call_stub_t *stub, int32_t op_ret, int32_t op_errno, + inode_t *inode, inode_t *parent); + +int +do_path_lookup (call_stub_t *stub, const loc_t *loc); + +#endif diff --git a/xlators/storage/Makefile.am b/xlators/storage/Makefile.am new file mode 100644 index 000000000..59b968969 --- /dev/null +++ b/xlators/storage/Makefile.am @@ -0,0 +1,3 @@ +SUBDIRS = posix $(BDB_SUBDIR) + +CLEANFILES = diff --git a/xlators/storage/bdb/Makefile.am b/xlators/storage/bdb/Makefile.am new file mode 100644 index 000000000..d471a3f92 --- /dev/null +++ b/xlators/storage/bdb/Makefile.am @@ -0,0 +1,3 @@ +SUBDIRS = src + +CLEANFILES = diff --git a/xlators/storage/bdb/src/Makefile.am b/xlators/storage/bdb/src/Makefile.am new file mode 100644 index 000000000..c0ab394bc --- /dev/null +++ b/xlators/storage/bdb/src/Makefile.am @@ -0,0 +1,18 @@ + +xlator_LTLIBRARIES = bdb.la +xlatordir = $(libdir)/glusterfs/$(PACKAGE_VERSION)/xlator/storage + +bdb_la_LDFLAGS = -module -avoidversion + +bdb_la_SOURCES = bctx.c bdb-ll.c bdb.c +bdb_la_LIBADD = $(top_builddir)/libglusterfs/src/libglusterfs.la + +noinst_HEADERS = bdb.h + +AM_CFLAGS = -fPIC -D_FILE_OFFSET_BITS=64 -D__USE_FILE_OFFSET64 -D_GNU_SOURCE 
-D$(GF_HOST_OS) -Wall \ + -I$(top_srcdir)/libglusterfs/src -shared -nostartfiles $(GF_CFLAGS) + +AM_LDFLAGS = -ldb + +CLEANFILES = + diff --git a/xlators/storage/bdb/src/bctx.c b/xlators/storage/bdb/src/bctx.c new file mode 100644 index 000000000..2bfa3ea87 --- /dev/null +++ b/xlators/storage/bdb/src/bctx.c @@ -0,0 +1,394 @@ +/* + Copyright (c) 2008 Z RESEARCH, Inc. <http://www.zresearch.com> + This file is part of GlusterFS. + + GlusterFS is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published + by the Free Software Foundation; either version 3 of the License, + or (at your option) any later version. + + GlusterFS is distributed in the hope that it will be useful, but + WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program. If not, see + <http://www.gnu.org/licenses/>. 
+*/ + +#include <list.h> +#include <bdb.h> +#include <libgen.h> /* for dirname */ + +static void +__destroy_bctx (bctx_t *bctx) +{ + if (bctx->directory) + FREE (bctx->directory); + + if (bctx->db_path) + FREE (bctx->db_path); + + FREE (bctx); +} + +static void +__unhash_bctx (bctx_t *bctx) +{ + list_del_init (&bctx->b_hash); +} + +static int32_t +bctx_table_prune (bctx_table_t *table) +{ + int32_t ret = 0; + struct list_head purge = {0,}; + struct list_head *next = NULL; + bctx_t *entry = NULL; + bctx_t *del = NULL, *tmp = NULL; + + if (!table) + return 0; + + INIT_LIST_HEAD (&purge); + + LOCK (&table->lock); + { + if ((table->lru_limit) && + (table->lru_size > table->lru_limit)) { + while (table->lru_size > table->lru_limit) { + next = table->b_lru.next; + entry = list_entry (next, bctx_t, list); + + list_move_tail (next, &table->purge); + __unhash_bctx (entry); + + table->lru_size--; + ret++; + } + } + list_move_tail (&purge, &table->purge); + list_del_init (&table->purge); + } + UNLOCK (&table->lock); + + { + list_for_each_entry_safe (del, tmp, &purge, list) { + list_del_init (&del->list); + if (del->dbp) { + ret = del->dbp->close (del->dbp, 0); + if (ret != 0) { + gf_log (table->this->name, GF_LOG_ERROR, + "failed to close db on path (%s): %s", + del->directory, db_strerror (ret)); + } else { + gf_log (table->this->name, GF_LOG_WARNING, + "close db for path %s; table->lru_count = %d", + del->directory, table->lru_size); + } + } + __destroy_bctx (del); + } + } + + return ret; +} + + +/* struct bdb_ctx related */ +static inline uint32_t +bdb_key_hash (char *key, uint32_t hash_size) +{ + uint32_t hash = 0; + + hash = *key; + + if (hash) { + for (key += 1; *key != '\0'; key++) { + hash = (hash << 5) - hash + *key; + } + } + + return (hash + *key) % hash_size; +} + +static void +__hash_bctx (bctx_t *bctx) +{ + bctx_table_t *table = NULL; + char *key = NULL; + + table = bctx->table; + + MAKE_KEY_FROM_PATH (key, bctx->directory); + bctx->key_hash = bdb_key_hash (key, 
table->hash_size); + + list_del_init (&bctx->b_hash); + list_add (&bctx->b_hash, &table->b_hash[bctx->key_hash]); +} + +static inline bctx_t * +__bctx_passivate (bctx_t *bctx) +{ + if (bctx->dbp) { + list_move_tail (&bctx->list, &(bctx->table->b_lru)); + bctx->table->lru_size++; + } else { + list_move_tail (&bctx->list, &bctx->table->purge); + __unhash_bctx (bctx); + } + return bctx; +} + +static inline bctx_t * +__bctx_activate (bctx_t *bctx) +{ + list_move (&bctx->list, &bctx->table->active); + bctx->table->lru_size--; + + return bctx; +} + +static bctx_t * +__bdb_ctx_unref (bctx_t *bctx) +{ + assert (bctx->ref); + + --bctx->ref; + + if (!bctx->ref) + bctx = __bctx_passivate (bctx); + + return bctx; +} + + +bctx_t * +bctx_unref (bctx_t *bctx) +{ + bctx_table_t *table = NULL; + + if (!bctx && !bctx->table) + return NULL; + + table = bctx->table; + + LOCK (&table->lock); + { + bctx = __bdb_ctx_unref (bctx); + } + UNLOCK (&table->lock); + + bctx_table_prune (table); + + return bctx; +} + +/* + * NOTE: __bdb_ctx_ref() is called only after holding table->lock and bctx->lock, in that order + */ +static inline bctx_t * +__bctx_ref (bctx_t *bctx) +{ + if (!bctx->ref) + __bctx_activate (bctx); + + bctx->ref++; + + return bctx; +} + +bctx_t * +bctx_ref (bctx_t *bctx) +{ + LOCK (&(bctx->table->lock)); + { + __bctx_ref (bctx); + } + UNLOCK (&(bctx->table->lock)); + + return bctx; +} + + +#define BDB_THIS(table) (table->this) + +static inline bctx_t * +__create_bctx (bctx_table_t *table, + const char *path) +{ + bctx_t *bctx = NULL; + char *db_path = NULL; + + bctx = CALLOC (1, sizeof (*bctx)); + GF_VALIDATE_OR_GOTO ("bctx", bctx, out); + + bctx->table = table; + bctx->directory = strdup (path); + GF_VALIDATE_OR_GOTO ("bctx", bctx->directory, out); + + MAKE_REAL_PATH_TO_STORAGE_DB (db_path, BDB_THIS (table), path); + + bctx->db_path = strdup (db_path); + GF_VALIDATE_OR_GOTO ("bctx", bctx->directory, out); + + INIT_LIST_HEAD (&bctx->c_list); + INIT_LIST_HEAD (&bctx->list); + 
INIT_LIST_HEAD (&bctx->b_hash); + + LOCK_INIT (&bctx->lock); + + __hash_bctx (bctx); + + list_add (&bctx->list, &table->b_lru); + table->lru_size++; + +out: + return bctx; +} + +/* bctx_lookup - lookup bctx_t for the directory @directory. (see description of bctx_t in bdb.h) + * + * @table: bctx_table_t for this instance of bdb. + * @directory: directory for which bctx_t is being looked up. + */ +bctx_t * +bctx_lookup (bctx_table_t *table, + const char *directory) +{ + char *key = NULL; + uint32_t key_hash = 0; + bctx_t *trav = NULL, *bctx = NULL, *tmp = NULL; + int32_t need_break = 0; + + GF_VALIDATE_OR_GOTO ("bctx", table, out); + GF_VALIDATE_OR_GOTO ("bctx", directory, out); + + MAKE_KEY_FROM_PATH (key, directory); + key_hash = bdb_key_hash (key, table->hash_size); + + LOCK (&table->lock); + { + if (!list_empty (&table->b_hash[key_hash])) { + list_for_each_entry_safe (trav, tmp, &table->b_hash[key_hash], b_hash) { + LOCK(&trav->lock); + if (!strcmp(trav->directory, directory)) { + bctx = __bctx_ref (trav); + need_break = 1; + } + UNLOCK(&trav->lock); + if (need_break) + break; + } + } + + if (!bctx) { + bctx = __create_bctx (table, directory); + bctx = __bctx_ref (bctx); + } + } + UNLOCK (&table->lock); +out: + return bctx; +} + + +bctx_t * +bctx_parent (bctx_table_t *table, + const char *path) +{ + char *pathname = NULL, *directory = NULL; + bctx_t *bctx = NULL; + + GF_VALIDATE_OR_GOTO ("bctx", table, out); + GF_VALIDATE_OR_GOTO ("bctx", path, out); + + pathname = strdup (path); + GF_VALIDATE_OR_GOTO ("bctx", pathname, out); + directory = dirname (pathname); + + bctx = bctx_lookup (table, directory); + GF_VALIDATE_OR_GOTO ("bctx", bctx, out); + +out: + if (pathname) + free (pathname); + return bctx; +} + +inline int32_t +bdb_db_rename (bctx_table_t *table, + const char *oldpath, + const char *newpath) +{ + DB_ENV *dbenv = NULL; + int32_t ret = -1; + + GF_VALIDATE_OR_GOTO ("bctx", table, out); + GF_VALIDATE_OR_GOTO ("bctx", oldpath, out); + GF_VALIDATE_OR_GOTO 
("bctx", newpath, out); + + dbenv = table->dbenv; + GF_VALIDATE_OR_GOTO ("bctx", dbenv, out); + + LOCK (&table->lock); + { + ret = dbenv->dbrename (dbenv, NULL, oldpath, NULL, newpath, 0); + + if (ret != 0) { + gf_log ("bctx", + GF_LOG_ERROR, + "failed to rename %s to %s: %s", + oldpath, newpath, db_strerror (ret)); + } else { + gf_log ("bctx", + GF_LOG_DEBUG, + "successfully renamed %s to %s: %s", + oldpath, newpath, db_strerror (ret)); + } + } + UNLOCK (&table->lock); + +out: + return ret; +} + +bctx_t * +bctx_rename (bctx_t *bctx, + const char *db_newpath) +{ + bctx_table_t *table = NULL; + int32_t ret = -1; + + table = bctx->table; + + LOCK (&table->lock); + { + __unhash_bctx (bctx); + list_del_init (&bctx->list); + if (bctx->dbp) { + ret = bctx->dbp->close (bctx->dbp, 0); + if (ret != 0) { + gf_log ("bdb-ll", + GF_LOG_ERROR, + "failed to close db for directory %s (%s)", + bctx->directory, db_strerror (ret)); + } + bctx->dbp = NULL; + } + } + UNLOCK (&table->lock); + + ret = bdb_db_rename (table, bctx->db_path, db_newpath); + + if (ret != 0) { + gf_log ("bctx", + GF_LOG_ERROR, + "bdb_db_rename failed for directory %s", + bctx->directory); + bctx = NULL; + } + + return bctx; +} diff --git a/xlators/storage/bdb/src/bdb-ll.c b/xlators/storage/bdb/src/bdb-ll.c new file mode 100644 index 000000000..40e7d1877 --- /dev/null +++ b/xlators/storage/bdb/src/bdb-ll.c @@ -0,0 +1,1455 @@ +/* + Copyright (c) 2008, 2009 Z RESEARCH, Inc. <http://www.zresearch.com> + This file is part of GlusterFS. + + GlusterFS is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published + by the Free Software Foundation; either version 3 of the License, + or (at your option) any later version. + + GlusterFS is distributed in the hope that it will be useful, but + WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + General Public License for more details. 
+ + You should have received a copy of the GNU General Public License + along with this program. If not, see + <http://www.gnu.org/licenses/>. +*/ + +#include <libgen.h> +#include "bdb.h" +#include <list.h> +/* + * implement the procedures to interact with bdb */ + +/**************************************************************** + * + * General wrappers and utility procedures for bdb xlator + * + ****************************************************************/ +#define BDB_LL_PAGE_SIZE_DEFAULT 4096 +#define BDB_LL_PAGE_SIZE_MIN 4096 +#define BDB_LL_PAGE_SIZE_MAX 65536 + +ino_t +bdb_inode_transform (ino_t parent, + bctx_t *bctx) +{ + struct bdb_private *private = NULL; + ino_t ino = -1; + + GF_VALIDATE_OR_GOTO ("bdb-ll", bctx, out); + + private = bctx->table->this->private; + + LOCK (&private->ino_lock); + ino = ++private->next_ino; + UNLOCK (&private->ino_lock); +out: + return ino; +} + + +/*********************************************************** + * + * bdb storage database utilities + * + **********************************************************/ + +/* + * bdb_db_open - opens a storage db. + * + * @ctx: context specific to the directory for which we are supposed to open db + * + * see, if we have empty slots to open a db. 
+ * if (no-empty-slots), then prune open dbs and close as many as possible + * if (empty-slot-available), tika muchkonDu db open maaDu + * + * NOTE: illi baro munche lock hiDkobEku + */ +static DB * +bdb_db_open (bctx_t *bctx) +{ + DB *storage_dbp = NULL; + int32_t op_ret = -1; + bctx_table_t *table = NULL; + + GF_VALIDATE_OR_GOTO ("bdb-ll", bctx, out); + + table = bctx->table; + GF_VALIDATE_OR_GOTO ("bdb-ll", table, out); + + /* we have to do the following, we can't deny someone of db_open ;) */ + op_ret = db_create (&storage_dbp, table->dbenv, 0); + if (op_ret != 0) { + gf_log ("bdb-ll", GF_LOG_ERROR, + "failed to do db_create for directory %s (%s)", + bctx->directory, db_strerror (op_ret)); + storage_dbp = NULL; + goto out; + } + + if (table->page_size) { + op_ret = storage_dbp->set_pagesize (storage_dbp, + table->page_size); + if (op_ret != 0) { + gf_log ("bdb-ll", GF_LOG_ERROR, + "failed to set the page_size (%"PRIu64") for directory %s (%s)", + table->page_size, bctx->directory, db_strerror (op_ret)); + } else { + gf_log ("bdb-ll", GF_LOG_DEBUG, + "page-size (%"PRIu64") set on DB", + table->page_size); + } + } + + op_ret = storage_dbp->open (storage_dbp, + NULL, + bctx->db_path, + NULL, + table->access_mode, + table->dbflags, + 0); + if (op_ret != 0 ) { + gf_log ("bdb-ll", + GF_LOG_ERROR, + "failed to open storage-db for directory %s (%s)", + bctx->db_path, db_strerror (op_ret)); + storage_dbp = NULL; + } + +out: + return storage_dbp; +} + + + +int32_t +bdb_cursor_close (bctx_t *bctx, + DBC *cursorp) +{ + int32_t ret = -1; + + GF_VALIDATE_OR_GOTO ("bdb-ll", bctx, out); + GF_VALIDATE_OR_GOTO ("bdb-ll", cursorp, out); + + LOCK (&bctx->lock); + { +#ifdef HAVE_BDB_CURSOR_GET + ret = cursorp->close (cursorp); +#else + ret = cursorp->c_close (cursorp); +#endif + if ((ret != 0)) { + gf_log ("bdb-ll", + GF_LOG_ERROR, + "failed to close db cursor for directory %s (%s)", + bctx->directory, db_strerror (ret)); + } + } + UNLOCK (&bctx->lock); + +out: + return ret; +} + + 
+int32_t +bdb_cursor_open (bctx_t *bctx, + DBC **cursorpp) +{ + int32_t ret = -1; + + GF_VALIDATE_OR_GOTO ("bdb-ll", bctx, out); + GF_VALIDATE_OR_GOTO ("bdb-ll", cursorpp, out); + + LOCK (&bctx->lock); + { + if (bctx->dbp) { + /* do nothing, just continue */ + ret = 0; + } else { + bctx->dbp = bdb_db_open (bctx); + if (!bctx->dbp) { + gf_log ("bdb-ll", + GF_LOG_ERROR, + "failed to open storage db for %s", + bctx->directory); + ret = -1; + } else { + ret = 0; + } + } + + if (ret == 0) { + /* all set, lets open cursor */ + ret = bctx->dbp->cursor (bctx->dbp, NULL, cursorpp, 0); + if (ret != 0) { + gf_log ("bdb-ll", + GF_LOG_ERROR, + "failed to create a cursor for %s (%s)", + bctx->directory, db_strerror (ret)); + } + } + } + UNLOCK (&bctx->lock); + +out: + return ret; +} + + +/* cache related */ +static bdb_cache_t * +bdb_cache_lookup (bctx_t *bctx, + char *path) +{ + bdb_cache_t *bcache = NULL; + bdb_cache_t *trav = NULL; + char *key = NULL; + + GF_VALIDATE_OR_GOTO ("bdb-ll", bctx, out); + GF_VALIDATE_OR_GOTO ("bdb-ll", path, out); + + MAKE_KEY_FROM_PATH (key, path); + + LOCK (&bctx->lock); + { + list_for_each_entry (trav, &bctx->c_list, c_list) { + if (!strcmp (trav->key, key)){ + bcache = trav; + break; + } + } + } + UNLOCK (&bctx->lock); + +out: + return bcache; +} + +static int32_t +bdb_cache_insert (bctx_t *bctx, + DBT *key, + DBT *data) +{ + bdb_cache_t *bcache = NULL; + int32_t ret = -1; + + GF_VALIDATE_OR_GOTO ("bdb-ll", bctx, out); + GF_VALIDATE_OR_GOTO ("bdb-ll", key, out); + GF_VALIDATE_OR_GOTO ("bdb-ll", data, out); + + LOCK (&bctx->lock); + { + if (bctx->c_count > 5) { + /* most of the times, we enter here */ + /* FIXME: ugly, not supposed to disect any of the + * 'struct list_head' directly */ + if (!list_empty (&bctx->c_list)) { + bcache = list_entry (bctx->c_list.prev, bdb_cache_t, c_list); + list_del_init (&bcache->c_list); + } + if (bcache->key) { + free (bcache->key); + bcache->key = strdup ((char *)key->data); + GF_VALIDATE_OR_GOTO ("bdb-ll", 
bcache->key, unlock); + } else { + /* should never come here */ + gf_log ("bdb-ll", + GF_LOG_CRITICAL, + "bcache->key (null)"); + } /* if(bcache->key)...else */ + if (bcache->data) { + free (bcache->data); + bcache->data = memdup (data->data, data->size); + GF_VALIDATE_OR_GOTO ("bdb-ll", bcache->data, unlock); + bcache->size = data->size; + } else { + /* should never come here */ + gf_log ("bdb-ll", + GF_LOG_CRITICAL, + "bcache->data (null)"); + } /* if(bcache->data)...else */ + list_add (&bcache->c_list, &bctx->c_list); + ret = 0; + } else { + /* we will be entering here very rarely */ + bcache = CALLOC (1, sizeof (*bcache)); + GF_VALIDATE_OR_GOTO ("bdb-ll", bcache, unlock); + bcache->key = strdup ((char *)(key->data)); + GF_VALIDATE_OR_GOTO ("bdb-ll", bcache->key, unlock); + bcache->data = memdup (data->data, data->size); + GF_VALIDATE_OR_GOTO ("bdb-ll", bcache->data, unlock); + bcache->size = data->size; + list_add (&bcache->c_list, &bctx->c_list); + bctx->c_count++; + ret = 0; + } /* if(private->c_count < 5)...else */ + } +unlock: + UNLOCK (&bctx->lock); +out: + return ret; +} + +static int32_t +bdb_cache_delete (bctx_t *bctx, + char *key) +{ + bdb_cache_t *bcache = NULL; + bdb_cache_t *trav = NULL; + + GF_VALIDATE_OR_GOTO ("bdb-ll", bctx, out); + GF_VALIDATE_OR_GOTO ("bdb-ll", key, out); + + LOCK (&bctx->lock); + { + list_for_each_entry (trav, &bctx->c_list, c_list) { + if (!strcmp (trav->key, key)){ + bctx->c_count--; + bcache = trav; + break; + } + } + + if (bcache) { + list_del_init (&bcache->c_list); + free (bcache->key); + free (bcache->data); + free (bcache); + } + } + UNLOCK (&bctx->lock); + +out: + return 0; +} + +void * +bdb_db_stat (bctx_t *bctx, + DB_TXN *txnid, + uint32_t flags) +{ + DB *storage = NULL; + void *stat = NULL; + int32_t ret = -1; + + LOCK (&bctx->lock); + { + if (bctx->dbp == NULL) { + bctx->dbp = bdb_db_open (bctx); + storage = bctx->dbp; + } else { + /* we are just fine, lets continue */ + storage = bctx->dbp; + } /* 
if(bctx->dbp==NULL)...else */ + } + UNLOCK (&bctx->lock); + + GF_VALIDATE_OR_GOTO ("bdb-ll", storage, out); + + ret = storage->stat (storage, txnid, &stat, flags); + + if (ret != 0) { + gf_log ("bdb-ll", + GF_LOG_ERROR, + "failed to do DB->stat() on db file %s: %s", + bctx->db_path, db_strerror (ret)); + } else { + gf_log ("bdb-ll", + GF_LOG_DEBUG, + "successfully called DB->stat() on db file %s", + bctx->db_path); + } +out: + return stat; + +} + +/* bdb_storage_get - retrieve a key/value pair corresponding to @path from the corresponding + * db file. + * + * @bctx: bctx_t * corresponding to the parent directory of @path. (should always be a valid + * bctx). bdb_storage_get should never be called if @bctx = NULL. + * @txnid: NULL if bdb_storage_get is not embedded in an explicit transaction or a valid + * DB_TXN *, when embedded in an explicit transaction. + * @path: path of the file to read from (translated to a database key using MAKE_KEY_FROM_PATH) + * @buf: char ** - pointer to a pointer to char. a read buffer is created in this procedure + * and pointer to the buffer is passed through @buf to the caller. + * @size: size of the file content to be read. + * @offset: offset from which the file content to be read. + * + * NOTE: bdb_storage_get tries to open DB, if @bctx->dbp == NULL (@bctx->dbp == NULL, + * nobody has opened DB till now or DB was closed by bdb_table_prune()). + * + * NOTE: if private->cache is set (bdb xlator's internal caching enabled), then bdb_storage_get + * first looks up the cache for key/value pair. if bdb_lookup_cache fails, then only + * DB->get() is called. also, inserts a newly read key/value pair to cache through + * bdb_insert_to_cache. + * + * return: 'number of bytes read' on success or -1 on error. + * + * also see: bdb_lookup_cache, bdb_insert_to_cache for details about bdb xlator's internal cache. 
+ */ +int32_t +bdb_db_get (bctx_t *bctx, + DB_TXN *txnid, + const char *path, + char **buf, + size_t size, + off_t offset) +{ + DB *storage = NULL; + DBT key = {0,}; + DBT value = {0,}; + int32_t ret = -1; + char *key_string = NULL; + bdb_cache_t *bcache = NULL; + int32_t db_flags = 0; + uint8_t need_break = 0; + int32_t retries = 1; + + GF_VALIDATE_OR_GOTO ("bdb-ll", bctx, out); + GF_VALIDATE_OR_GOTO ("bdb-ll", path, out); + + MAKE_KEY_FROM_PATH (key_string, path); + + if (bctx->cache && + ((bcache = bdb_cache_lookup (bctx, key_string)) != NULL)) { + if (buf) { + *buf = CALLOC (1, bcache->size); + GF_VALIDATE_OR_GOTO ("bdb-ll", buf, out); + memcpy (*buf, (bcache->data + offset), bcache->size); + } + ret = bcache->size; + } else { + LOCK (&bctx->lock); + { + if (bctx->dbp == NULL) { + bctx->dbp = bdb_db_open (bctx); + storage = bctx->dbp; + } else { + /* we are just fine, lets continue */ + storage = bctx->dbp; + } /* if(bctx->dbp==NULL)...else */ + } + UNLOCK (&bctx->lock); + + GF_VALIDATE_OR_GOTO ("bdb-ll", storage, out); + + key.data = (char *)key_string; + key.size = strlen (key_string); + key.flags = DB_DBT_USERMEM; + + if (bctx->cache){ + /* we are called to return the size of the file */ + value.flags = DB_DBT_MALLOC; + } else { + if (size) { + value.flags = DB_DBT_MALLOC | DB_DBT_PARTIAL; + } else { + value.flags = DB_DBT_MALLOC; + } + value.dlen = size; + value.doff = offset; + } + + do { + /* TODO: we prefer to give our own buffer to value.data + * and ask bdb to fill in it */ + ret = storage->get (storage, txnid, &key, &value, db_flags); + + if (ret == DB_NOTFOUND) { + gf_log ("bdb-ll", + GF_LOG_DEBUG, + "failed to do DB->get() for key: %s." + " key not found in storage DB", key_string); + ret = -1; + need_break = 1; + } else if (ret == DB_LOCK_DEADLOCK) { + retries++; + gf_log ("bdb-ll", + GF_LOG_ERROR, + "deadlock detected in DB->put. 
retrying DB->put (%d)", + retries); + }else if (ret == 0) { + /* successfully read data, lets set everything in place + * and return */ + if (buf) { + *buf = CALLOC (1, value.size); + ERR_ABORT (*buf); + memcpy (*buf, value.data, value.size); + } + ret = value.size; + if (bctx->cache) + bdb_cache_insert (bctx, &key, &value); + free (value.data); + need_break = 1; + } else { + gf_log ("bdb-ll", + GF_LOG_ERROR, + "failed to do DB->get() for key %s: %s", + key_string, db_strerror (ret)); + ret = -1; + need_break = 1; + } + } while (!need_break); + } +out: + return ret; +}/* bdb_db_get */ + +/* bdb_storage_put - insert a key/value specified to the corresponding DB. + * + * @bctx: bctx_t * corresponding to the parent directory of @path. + * (should always be a valid bctx). bdb_storage_put should never be called if @bctx = NULL. + * @txnid: NULL if bdb_storage_put is not embedded in an explicit transaction or a valid + * DB_TXN *, when embedded in an explicit transaction. + * @key_string: key of the database entry. + * @buf: pointer to the buffer data to be written as data for @key_string. + * @size: size of @buf. + * @offset: offset in the key's data to be modified with provided data. + * @flags: valid flags are BDB_TRUNCATE_RECORD (to reduce the data of @key_string to 0 size). + * + * NOTE: bdb_storage_put tries to open DB, if @bctx->dbp == NULL (@bctx->dbp == NULL, + * nobody has opened DB till now or DB was closed by bdb_table_prune()). + * + * NOTE: bdb_storage_put deletes the key/value from bdb xlator's internal cache. + * + * return: 0 on success or -1 on error. + * + * also see: bdb_cache_delete for details on how a cached key/value pair is removed. 
+ */ +int32_t +bdb_db_put (bctx_t *bctx, + DB_TXN *txnid, + const char *key_string, + const char *buf, + size_t size, + off_t offset, + int32_t flags) +{ + DB *storage = NULL; + DBT key = {0,}, value = {0,}; + int32_t ret = -1; + int32_t db_flags = DB_AUTO_COMMIT; + uint8_t need_break = 0; + int32_t retries = 1; + + LOCK (&bctx->lock); + { + if (bctx->dbp == NULL) { + bctx->dbp = bdb_db_open (bctx); + storage = bctx->dbp; + } else { + /* we are just fine, lets continue */ + storage = bctx->dbp; + } + } + UNLOCK (&bctx->lock); + + GF_VALIDATE_OR_GOTO ("bdb-ll", storage, out); + + if (bctx->cache) { + ret = bdb_cache_delete (bctx, (char *)key_string); + GF_VALIDATE_OR_GOTO ("bdb-ll", (ret == 0), out); + } + + key.data = (void *)key_string; + key.size = strlen (key_string); + + /* NOTE: bdb lets us expand the file, suppose value.size > value.len, then value.len bytes + * from value.doff offset and value.size bytes will be written from value.doff and + * data from value.doff + value.dlen will be pushed value.doff + value.size + */ + value.data = (void *)buf; + + if (flags & BDB_TRUNCATE_RECORD) { + value.size = size; + value.doff = 0; + value.dlen = offset; + } else { + value.size = size; + value.dlen = size; + value.doff = offset; + } + value.flags = DB_DBT_PARTIAL; + if (buf == NULL && size == 0) + /* truncate called us */ + value.flags = 0; + + do { + ret = storage->put (storage, txnid, &key, &value, db_flags); + if (ret == DB_LOCK_DEADLOCK) { + retries++; + gf_log ("bdb-ll", + GF_LOG_ERROR, + "deadlock detected in DB->put. retrying DB->put (%d)", + retries); + } else if (ret) { + /* write failed */ + gf_log ("bdb-ll", + GF_LOG_ERROR, + "failed to do DB->put() for key %s: %s", + key_string, db_strerror (ret)); + need_break = 1; + } else { + /* successfully wrote */ + ret = 0; + need_break = 1; + } + } while (!need_break); +out: + return ret; +}/* bdb_db_put */ + + +/* bdb_storage_del - delete a key/value pair corresponding to @path from corresponding db file. 
+ * + * @bctx: bctx_t * corresponding to the parent directory of @path. + * (should always be a valid bctx). bdb_storage_del should never be called + * if @bctx = NULL. + * @txnid: NULL if bdb_storage_del is not embedded in an explicit transaction or a + * valid DB_TXN *, when embedded in an explicit transaction. + * @path: path to the file, whose key/value pair has to be deleted. + * + * NOTE: bdb_storage_del tries to open DB, if @bctx->dbp == NULL (@bctx->dbp == NULL, + * nobody has opened DB till now or DB was closed by bdb_table_prune()). + * + * return: 0 on success or -1 on error. + */ +int32_t +bdb_db_del (bctx_t *bctx, + DB_TXN *txnid, + const char *path) +{ + DB *storage = NULL; + DBT key = {0,}; + char *key_string = NULL; + int32_t ret = -1; + int32_t db_flags = 0; + uint8_t need_break = 0; + int32_t retries = 1; + + MAKE_KEY_FROM_PATH (key_string, path); + + LOCK (&bctx->lock); + { + if (bctx->dbp == NULL) { + bctx->dbp = bdb_db_open (bctx); + storage = bctx->dbp; + } else { + /* we are just fine, lets continue */ + storage = bctx->dbp; + } + } + UNLOCK (&bctx->lock); + + GF_VALIDATE_OR_GOTO ("bdb-ll", storage, out); + + ret = bdb_cache_delete (bctx, key_string); + GF_VALIDATE_OR_GOTO ("bdb-ll", (ret == 0), out); + + key.data = key_string; + key.size = strlen (key_string); + key.flags = DB_DBT_USERMEM; + + do { + ret = storage->del (storage, txnid, &key, db_flags); + + if (ret == DB_NOTFOUND) { + gf_log ("bdb-ll", + GF_LOG_DEBUG, + "failed to delete %s from storage db, doesn't exist in storage DB", + path); + need_break = 1; + } else if (ret == DB_LOCK_DEADLOCK) { + retries++; + gf_log ("bdb-ll", + GF_LOG_ERROR, + "deadlock detected in DB->put. 
retrying DB->put (%d)", + retries); + }else if (ret == 0) { + /* successfully deleted the entry */ + gf_log ("bdb-ll", + GF_LOG_DEBUG, + "deleted %s from storage db", path); + ret = 0; + need_break = 1; + } else { + gf_log ("bdb-ll", + GF_LOG_ERROR, + "failed to delete %s from storage db: %s", + path, db_strerror (ret)); + ret = -1; + need_break = 1; + } + } while (!need_break); +out: + return ret; +} + +/* NOTE: bdb version compatibility wrapper */ +int32_t +bdb_cursor_get (DBC *cursorp, + DBT *key, + DBT *value, + int32_t flags) +{ + int32_t ret = -1; + + GF_VALIDATE_OR_GOTO ("bdb-ll", cursorp, out); + +#ifdef HAVE_BDB_CURSOR_GET + ret = cursorp->get (cursorp, key, value, flags); +#else + ret = cursorp->c_get (cursorp, key, value, flags); +#endif + if ((ret != 0) && (ret != DB_NOTFOUND)) { + gf_log ("bdb-ll", + GF_LOG_ERROR, + "failed to CURSOR->get() for key %s (%s)", + (char *)key->data, db_strerror (ret)); + } + +out: + return ret; +}/* bdb_cursor_get */ + + +int32_t +bdb_dirent_size (DBT *key) +{ + return ALIGN (24 /* FIX MEEEE!!! */ + key->size); +} + + +/* bdb_extract_bfd - translate a fd_t to a bfd (either a 'struct bdb_bfd' or 'struct bdb_dir') + * + * @fd->ctx is with bdb specific file handle during a successful bdb_open (also bdb_create) + * or bdb_opendir. + * + * return: 'struct bdb_bfd *' or 'struct bdb_dir *' on success, or NULL on failure. + */ +inline void * +bdb_extract_bfd (fd_t *fd, + xlator_t *this) +{ + uint64_t tmp_bfd = 0; + void *bfd = NULL; + + GF_VALIDATE_OR_GOTO ("bdb-ll", fd, out); + GF_VALIDATE_OR_GOTO ("bdb-ll", this, out); + + fd_ctx_get (fd, this, &tmp_bfd); + bfd = (void *)(long)bfd; + +out: + return bfd; +} + +/* bdb_dbenv_init - initialize DB_ENV + * + * initialization includes: + * 1. opening DB_ENV (db_env_create(), DB_ENV->open()). + * NOTE: see private->envflags for flags used. + * 2. 
DB_ENV->set_lg_dir - set log directory to be used for storing log files + * (log files are the files in which transaction logs are written by db). + * 3. DB_ENV->set_flags (DB_LOG_AUTOREMOVE) - set DB_ENV to automatically clear + * the unwanted log files (flushed at each checkpoint). + * 4. DB_ENV->set_errfile - set errfile to be used by db to report detailed error logs. + * used only for debbuging purpose. + * + * return: returns a valid DB_ENV * on success or NULL on error. + * + */ +static DB_ENV * +bdb_dbenv_init (xlator_t *this, + char *directory) +{ + /* Create a DB environment */ + DB_ENV *dbenv = NULL; + int32_t ret = 0; + bdb_private_t *private = NULL; + int32_t fatal_flags = 0; + + VALIDATE_OR_GOTO (this, out); + VALIDATE_OR_GOTO (directory, out); + + private = this->private; + VALIDATE_OR_GOTO (private, out); + + ret = db_env_create (&dbenv, 0); + VALIDATE_OR_GOTO ((ret == 0), out); + + /* NOTE: set_errpfx returns 'void' */ + dbenv->set_errpfx(dbenv, this->name); + + ret = dbenv->set_lk_detect (dbenv, DB_LOCK_DEFAULT); + VALIDATE_OR_GOTO ((ret == 0), out); + + ret = dbenv->open(dbenv, directory, + private->envflags, + S_IRUSR | S_IWUSR); + if ((ret != 0) && (ret != DB_RUNRECOVERY)) { + gf_log (this->name, + GF_LOG_CRITICAL, + "failed to open DB environment (%s)", + db_strerror (ret)); + dbenv = NULL; + goto out; + } else if (ret == DB_RUNRECOVERY) { + fatal_flags = ((private->envflags & (~DB_RECOVER)) | DB_RECOVER_FATAL); + ret = dbenv->open(dbenv, directory, + fatal_flags, + S_IRUSR | S_IWUSR); + if (ret != 0) { + gf_log (this->name, + GF_LOG_ERROR, + "failed to open DB environment (%s) with DB_REOVER_FATAL", + db_strerror (ret)); + dbenv = NULL; + goto out; + } else { + gf_log (this->name, + GF_LOG_WARNING, + "opened DB environment after DB_RECOVER_FATAL: %s", + db_strerror (ret)); + } + } else { + gf_log (this->name, + GF_LOG_DEBUG, + "DB environment successfull opened: %s", + db_strerror (ret)); + } + + + +#if (DB_VERSION_MAJOR == 4 && \ + 
DB_VERSION_MINOR == 7) + if (private->log_auto_remove) { + ret = dbenv->log_set_config (dbenv, DB_LOG_AUTO_REMOVE, 1); + } else { + ret = dbenv->log_set_config (dbenv, DB_LOG_AUTO_REMOVE, 0); + } +#else + if (private->log_auto_remove) { + ret = dbenv->set_flags (dbenv, DB_LOG_AUTOREMOVE, 1); + } else { + ret = dbenv->set_flags (dbenv, DB_LOG_AUTOREMOVE, 0); + } +#endif + if (ret != 0) { + gf_log ("bctx", + GF_LOG_ERROR, + "failed to set DB_LOG_AUTOREMOVE on dbenv: %s", db_strerror (ret)); + } else { + gf_log ("bctx", + GF_LOG_DEBUG, + "DB_LOG_AUTOREMOVE set on dbenv"); + } + + if (private->transaction) { + ret = dbenv->set_flags(dbenv, DB_AUTO_COMMIT, 1); + + if (ret != 0) { + gf_log ("bctx", + GF_LOG_ERROR, + "failed to set DB_AUTO_COMMIT on dbenv: %s", + db_strerror (ret)); + } else { + gf_log ("bctx", + GF_LOG_DEBUG, + "DB_AUTO_COMMIT set on dbenv"); + } + + if (private->txn_timeout) { + ret = dbenv->set_timeout (dbenv, + private->txn_timeout, + DB_SET_TXN_TIMEOUT); + if (ret != 0) { + gf_log ("bctx", + GF_LOG_ERROR, + "failed to set TXN_TIMEOUT to %d milliseconds " + "on dbenv: %s", + private->txn_timeout, db_strerror (ret)); + } else { + gf_log ("bctx", + GF_LOG_DEBUG, + "TXN_TIMEOUT set to %d milliseconds", + private->txn_timeout); + } + } + + if (private->lock_timeout) { + ret = dbenv->set_timeout(dbenv, + private->txn_timeout, + DB_SET_LOCK_TIMEOUT); + + if (ret != 0) { + gf_log ("bctx", + GF_LOG_ERROR, + "failed to set LOCK_TIMEOUT to %d milliseconds " + "on dbenv: %s", + private->lock_timeout, db_strerror (ret)); + } else { + gf_log ("bctx", + GF_LOG_DEBUG, + "LOCK_TIMEOUT set to %d milliseconds", + private->lock_timeout); + } + } + + ret = dbenv->set_lg_dir (dbenv, private->logdir); + + if (ret != 0) { + gf_log ("bctx", + GF_LOG_ERROR, + "failed to set log directory for dbenv: %s", db_strerror (ret)); + } else { + gf_log ("bctx", + GF_LOG_DEBUG, + "set dbenv log dir to %s", private->logdir); + } + + } + + if (private->errfile) { + private->errfp = fopen 
(private->errfile, "a+"); + if (private->errfp) { + dbenv->set_errfile (dbenv, private->errfp); + } else { + gf_log ("bctx", + GF_LOG_ERROR, + "failed to open errfile: %s", strerror (errno)); + } + } + +out: + return dbenv; +} + +#define BDB_ENV(this) ((((struct bdb_private *)this->private)->b_table)->dbenv) + +/* bdb_checkpoint - during transactional usage, db does not directly write the data to db + * files, instead db writes a 'log' (similar to a journal entry) into a + * log file. db normally clears the log files during opening of an + * environment. since we expect a filesystem server to run for a pretty + * long duration and flushing 'log's during dbenv->open would prove very + * costly, if we accumulate the log entries for one complete run of + * glusterfs server. to flush the logs frequently, db provides a mechanism + * called 'checkpointing'. when we do a checkpoint, db flushes the logs to + * disk (writes changes to db files) and we can also clear the accumulated + * log files after checkpointing. NOTE: removing unwanted log files is not + * part of dbenv->txn_checkpoint() call. + * + * @data: xlator_t of the current instance of bdb xlator. + * + * bdb_checkpoint is called in a different thread from the main glusterfs thread. bdb + * xlator creates the checkpoint thread after successfully opening the db environment. + * NOTE: bdb_checkpoint thread shares the DB_ENV handle with the filesystem thread. + * + * db environment checkpointing frequency is controlled by + * 'option checkpoint-timeout <time-in-seconds>' in volfile. + * + * NOTE: checkpointing thread is started only if 'option transaction on' specified in + * volfile. checkpointing is not valid for non-transactional environments. 
+ * + */ +static void * +bdb_checkpoint (void *data) +{ + xlator_t *this = NULL; + struct bdb_private *private = NULL; + DB_ENV *dbenv = NULL; + int32_t ret = 0; + uint32_t active = 0; + + this = (xlator_t *) data; + dbenv = BDB_ENV(this); + private = this->private; + + for (;;sleep (private->checkpoint_timeout)) { + LOCK (&private->active_lock); + active = private->active; + UNLOCK (&private->active_lock); + + if (active) { + ret = dbenv->txn_checkpoint (dbenv, 1024, 0, 0); + if (ret) { + gf_log ("bctx", + GF_LOG_ERROR, + "failed to checkpoint environment: %s", db_strerror (ret)); + } else { + gf_log ("bctx", + GF_LOG_DEBUG, + "checkpointing successful"); + } + } else { + ret = dbenv->txn_checkpoint (dbenv, 1024, 0, 0); + if (ret) { + gf_log ("bctx", + GF_LOG_ERROR, + "failed to do final checkpoint environment: %s", + db_strerror (ret)); + } else { + gf_log ("bctx", + GF_LOG_DEBUG, + "final checkpointing successful"); + } + break; + } + } + + return NULL; +} + +static inline void +BDB_CACHE_INIT (xlator_t *this, + dict_t *options, + struct bdb_private *private) +{ + /* cache is always on */ + private->cache = ON; +} + +static inline void +BDB_LOG_REMOVE_INIT(xlator_t *this, + dict_t *options, + struct bdb_private *private) +{ + private->log_auto_remove = 1; + gf_log (this->name, + GF_LOG_DEBUG, + "DB_ENV will use DB_LOG_AUTO_REMOVE"); +} + +static inline void +BDB_ERRFILE_INIT (xlator_t *this, + dict_t *options, + struct bdb_private *private) +{ + data_t *errfile = NULL; + + errfile = dict_get (options, "errfile"); + if (errfile) { + private->errfile = strdup (errfile->data); + gf_log (this->name, + GF_LOG_DEBUG, + "using errfile: %s", private->errfile); + } +} + +static inline void +BDB_TABLE_INIT (xlator_t *this, + dict_t *options, + struct bdb_private *private) +{ + bctx_table_t *table = NULL; + int32_t idx = 0; + + data_t *lru_limit = NULL; + data_t *page_size = NULL; + + table = CALLOC (1, sizeof (*table)); + if (table) { + INIT_LIST_HEAD(&(table->b_lru)); + 
INIT_LIST_HEAD(&(table->active)); + INIT_LIST_HEAD(&(table->purge)); + + LOCK_INIT (&table->lock); + LOCK_INIT (&table->checkpoint_lock); + + table->transaction = private->transaction; + table->access_mode = private->access_mode; + table->dbflags = private->dbflags; + table->this = this; + + { + lru_limit = dict_get (options, "lru-limit"); + + /* TODO: set max lockers and max txns to accomodate + * for more than lru_limit */ + if (lru_limit) { + table->lru_limit = strtol (lru_limit->data, NULL, 0); + gf_log ("bdb-ll", + GF_LOG_DEBUG, + "setting bctx lru limit to %d", table->lru_limit); + } else { + table->lru_limit = BDB_DEFAULT_LRU_LIMIT; + } + } + + { + page_size = dict_get (options, "page-size"); + + if (page_size) + { + if (gf_string2bytesize (page_size->data, + &table->page_size) != 0) { + gf_log ("bdb-ll", + GF_LOG_ERROR, + "invalid number format \"%s\"" + " of \"option page-size\"", + page_size->data); + } + + if (!(table->page_size >= BDB_LL_PAGE_SIZE_MIN && + table->page_size <= BDB_LL_PAGE_SIZE_MAX)) { + gf_log ("bdb-ll", + GF_LOG_ERROR, + "pagesize %s is out of range." 
+ "Allowed pagesize is between %d and %d", + page_size->data, + BDB_LL_PAGE_SIZE_MIN, + BDB_LL_PAGE_SIZE_MAX); + } + } + else { + table->page_size = BDB_LL_PAGE_SIZE_DEFAULT; + } + gf_log ("bdb-ll", + GF_LOG_DEBUG, "using page-size %"PRIu64, + table->page_size); + } + + table->hash_size = BDB_DEFAULT_HASH_SIZE; + table->b_hash = CALLOC (BDB_DEFAULT_HASH_SIZE, sizeof (struct list_head)); + + for (idx = 0; idx < table->hash_size; idx++) + INIT_LIST_HEAD(&(table->b_hash[idx])); + + private->b_table = table; + } else { + gf_log ("bdb-ll", + GF_LOG_CRITICAL, + "failed to allocate bctx table: out of memory"); + } +} + +static inline void +BDB_DIRECTORY_INIT (xlator_t *this, + dict_t *options, + struct bdb_private *private) +{ + data_t *directory = NULL; + data_t *logdir = NULL; + int32_t op_ret = -1; + struct stat stbuf = {0}; + + directory = dict_get (options, "directory"); + + if (directory) { + logdir = dict_get (options, "logdir"); + + if (logdir == NULL) { + gf_log ("bdb-ll", + GF_LOG_DEBUG, + "using default logdir as database home"); + private->logdir = strdup (directory->data); + + } else { + private->logdir = strdup (logdir->data); + gf_log ("bdb-ll", + GF_LOG_DEBUG, + "using logdir: %s", private->logdir); + umask (000); + if (mkdir (private->logdir, 0777) == 0) { + gf_log ("bdb-ll", GF_LOG_WARNING, + "logdir specified (%s) not exists, created", + private->logdir); + } + + op_ret = stat (private->logdir, &stbuf); + if ((op_ret != 0) || !S_ISDIR (stbuf.st_mode)) { + gf_log ("bdb-ll", + GF_LOG_ERROR, + "specified logdir doesn't exist, " + "using default (environment home directory: %s)", + directory->data); + private->logdir = strdup (directory->data); + } + } + + private->b_table->dbenv = bdb_dbenv_init (this, directory->data); + + if (!private->b_table->dbenv) { + gf_log ("bdb-ll", GF_LOG_ERROR, + "failed to initialize db environment"); + FREE (private); + op_ret = -1; + } else { + if (private->transaction) { + /* all well, start the checkpointing thread */ + 
LOCK_INIT (&private->active_lock); + + LOCK (&private->active_lock); + private->active = 1; + UNLOCK (&private->active_lock); + pthread_create (&private->checkpoint_thread, NULL, + bdb_checkpoint, this); + } + } + } +} + +static inline void +BDB_DIR_MODE_INIT (xlator_t *this, + dict_t *options, + struct bdb_private *private) +{ + data_t *dir_mode = NULL; + char *endptr = NULL; + + dir_mode = dict_get (options, "dir-mode"); + + if (dir_mode) { + private->dir_mode = strtol (dir_mode->data, &endptr, 8); + if ((*endptr) || + (!IS_VALID_FILE_MODE(private->dir_mode))) { + gf_log (this->name, + GF_LOG_DEBUG, + "invalid dir-mode %o. setting to default %o", + private->dir_mode, + DEFAULT_DIR_MODE); + private->dir_mode = DEFAULT_DIR_MODE; + } else { + gf_log (this->name, + GF_LOG_DEBUG, + "setting dir-mode to %o", private->dir_mode); + private->dir_mode = private->dir_mode; + } + } else { + private->dir_mode = DEFAULT_DIR_MODE; + } + + private->dir_mode = private->dir_mode | S_IFDIR; +} + +static inline void +BDB_FILE_MODE_INIT (xlator_t *this, + dict_t *options, + struct bdb_private *private) +{ + data_t *file_mode = NULL; + char *endptr = NULL; + + file_mode = dict_get (options, "file-mode"); + + if (file_mode) { + private->file_mode = strtol (file_mode->data, &endptr, 8); + + if ((*endptr) || + (!IS_VALID_FILE_MODE(private->file_mode))) { + gf_log (this->name, + GF_LOG_DEBUG, + "invalid file-mode %o. 
setting to default %o", + private->file_mode, + DEFAULT_FILE_MODE); + private->file_mode = DEFAULT_FILE_MODE; + } else { + gf_log (this->name, + GF_LOG_DEBUG, + "setting file-mode to %o", private->file_mode); + private->file_mode = private->file_mode; + } + } else { + private->file_mode = DEFAULT_FILE_MODE; + } + + private->symlink_mode = private->file_mode | S_IFLNK; + private->file_mode = private->file_mode | S_IFREG; +} + +static inline void +BDB_CHECKPOINT_TIMEOUT_INIT (xlator_t *this, + dict_t *options, + struct bdb_private *private) +{ + data_t *checkpoint_timeout = NULL; + + checkpoint_timeout = dict_get (options, "checkpoint-timeout"); + + private->checkpoint_timeout = BDB_DEFAULT_CHECKPOINT_TIMEOUT; + + if (checkpoint_timeout) { + private->checkpoint_timeout = strtol (checkpoint_timeout->data, NULL, 0); + + if (private->checkpoint_timeout < 5 || private->checkpoint_timeout > 60) { + gf_log (this->name, + GF_LOG_WARNING, + "checkpoint-timeout %d seconds too %s", + private->checkpoint_timeout, + (private->checkpoint_timeout < 5)?"low":"high"); + } else { + gf_log (this->name, + GF_LOG_DEBUG, + "setting checkpoint-timeout to %d seconds", + private->checkpoint_timeout); + } + } else { + gf_log (this->name, + GF_LOG_DEBUG, + "setting checkpoint-timeout to default: %d seconds", + private->checkpoint_timeout); + } +} + +static inline void +BDB_LOCK_TIMEOUT_INIT (xlator_t *this, + dict_t *options, + struct bdb_private *private) +{ + data_t *lock_timeout = NULL; + + lock_timeout = dict_get (options, "lock-timeout"); + + if (lock_timeout) { + private->lock_timeout = strtol (lock_timeout->data, NULL, 0); + + if (private->lock_timeout > 4260000) { + /* db allows us to DB_SET_LOCK_TIMEOUT to be set to a + * maximum of 71 mins (4260000 milliseconds) */ + gf_log (this->name, + GF_LOG_DEBUG, + "lock-timeout %d, out of range", + private->lock_timeout); + private->lock_timeout = 0; + } else { + gf_log (this->name, + GF_LOG_DEBUG, + "setting lock-timeout to %d milliseconds", 
+ private->lock_timeout); + } + } +} + +static inline void +BDB_TRANSACTION_TIMEOUT_INIT (xlator_t *this, + dict_t *options, + struct bdb_private *private) +{ + data_t *txn_timeout = NULL; + txn_timeout = dict_get (options, "transaction-timeout"); + + if (txn_timeout) { + private->txn_timeout = strtol (txn_timeout->data, NULL, 0); + + if (private->txn_timeout > 4260000) { + /* db allows us to DB_SET_TXN_TIMEOUT to be set to a maximum + * of 71 mins (4260000 milliseconds) */ + gf_log (this->name, + GF_LOG_DEBUG, + "transaction-timeout %d, out of range", + private->txn_timeout); + private->txn_timeout = 0; + } else { + gf_log (this->name, + GF_LOG_DEBUG, + "setting transaction-timeout to %d milliseconds", + private->txn_timeout); + } + } +} + +static inline void +BDB_TRANSACTION_INIT (xlator_t *this, + dict_t *options, + struct bdb_private *private) +{ + data_t *mode = NULL; + + mode = dict_get (options, "mode"); + + if (mode && !strcmp (mode->data, "off")) { + gf_log (this->name, + GF_LOG_DEBUG, + "cache mode selected"); + private->envflags = DB_CREATE | DB_INIT_LOG | + DB_INIT_MPOOL | DB_THREAD; + private->dbflags = DB_CREATE | DB_THREAD; + private->transaction = OFF; + } else { + gf_log (this->name, + GF_LOG_DEBUG, + "persistant mode selected"); + private->transaction = ON; + private->envflags = DB_CREATE | DB_INIT_LOCK | DB_INIT_LOG | + DB_INIT_MPOOL | DB_INIT_TXN | DB_RECOVER | DB_THREAD; + private->dbflags = DB_CREATE | DB_THREAD; + } +} + +static inline void +BDB_ACCESS_MODE_INIT (xlator_t *this, + dict_t *options, + struct bdb_private *private) +{ + data_t *access_mode = NULL; + + access_mode = dict_get (options, "access-mode"); + + if (access_mode && !strcmp (access_mode->data, "btree")) { + gf_log (this->name, + GF_LOG_DEBUG, + "using access mode BTREE"); + private->access_mode = DB_BTREE; + } else { + gf_log (this->name, + GF_LOG_DEBUG, + "using access mode HASH"); + private->access_mode = DB_HASH; + } +} + + +/* bdb_db_init - initialize bdb xlator + * + * 
reads the options from @options dictionary and sets appropriate values in @this->private. + * also initializes DB_ENV. + * + * return: 0 on success or -1 on error (with logging the error through gf_log()). + */ +int +bdb_db_init (xlator_t *this, + dict_t *options) +{ + /* create a db entry for root */ + int32_t op_ret = 0; + bdb_private_t *private = NULL; + + private = this->private; + + BDB_CACHE_INIT (this, options, private); + + BDB_ACCESS_MODE_INIT (this, options, private); + + BDB_TRANSACTION_INIT (this, options, private); + + BDB_TRANSACTION_TIMEOUT_INIT (this, options, private); + + BDB_LOCK_TIMEOUT_INIT (this, options, private); + + { + LOCK_INIT (&private->ino_lock); + private->next_ino = 2; + } + + BDB_CHECKPOINT_TIMEOUT_INIT (this, options, private); + + BDB_FILE_MODE_INIT (this, options, private); + + BDB_DIR_MODE_INIT (this, options, private); + + BDB_TABLE_INIT (this, options, private); + + BDB_ERRFILE_INIT (this, options, private); + + BDB_LOG_REMOVE_INIT (this, options, private); + + BDB_DIRECTORY_INIT (this, options, private); + + return op_ret; +} diff --git a/xlators/storage/bdb/src/bdb.c b/xlators/storage/bdb/src/bdb.c new file mode 100644 index 000000000..e820e867a --- /dev/null +++ b/xlators/storage/bdb/src/bdb.c @@ -0,0 +1,3371 @@ +/* + Copyright (c) 2008, 2009 Z RESEARCH, Inc. <http://www.zresearch.com> + This file is part of GlusterFS. + + GlusterFS is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published + by the Free Software Foundation; either version 3 of the License, + or (at your option) any later version. + + GlusterFS is distributed in the hope that it will be useful, but + WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program. 
If not, see + <http://www.gnu.org/licenses/>. +*/ + +/* bdb based storage translator - named as 'bdb' translator + * + * + * There can be only two modes for files existing on bdb translator: + * 1. DIRECTORY - directories are stored by bdb as regular directories on background + * file-system. directories also have an entry in the ns_db.db of their parent directory. + * 2. REGULAR FILE - regular files are stored as records in the storage_db.db present in + * the directory. regular files also have an entry in ns_db.db + * + * Internally bdb has a maximum of three different types of logical files associated with + * each directory: + * 1. storage_db.db - storage database, used to store the data corresponding to regular + * files in the form of key/value pair. file-name is the 'key' and data + * is 'value'. + * 2. directory (all subdirectories) - any subdirectory will have a regular directory entry. + */ +#ifndef _CONFIG_H +#define _CONFIG_H +#include "config.h" +#endif + +#define __XOPEN_SOURCE 500 + +#include <stdint.h> +#include <sys/time.h> +#include <errno.h> +#include <ftw.h> +#include <libgen.h> + +#include "glusterfs.h" +#include "dict.h" +#include "logging.h" +#include "bdb.h" +#include "xlator.h" +#include "defaults.h" +#include "common-utils.h" + +/* to be used only by fops, nobody else */ +#define BDB_ENV(this) ((((struct bdb_private *)this->private)->b_table)->dbenv) +#define B_TABLE(this) (((struct bdb_private *)this->private)->b_table) + + +int32_t +bdb_mknod (call_frame_t *frame, + xlator_t *this, + loc_t *loc, + mode_t mode, + dev_t dev) +{ + int32_t op_ret = -1; + int32_t op_errno = EINVAL; + char *key_string = NULL; /* after translating loc->path to DB key */ + char *db_path = NULL; + bctx_t *bctx = NULL; + struct stat stbuf = {0,}; + + + GF_VALIDATE_OR_GOTO ("bdb", frame, out); + GF_VALIDATE_OR_GOTO ("bdb", this, out); + GF_VALIDATE_OR_GOTO (this->name, loc, out); + + if (!S_ISREG(mode)) { + gf_log (this->name, + GF_LOG_DEBUG, + "mknod for 
non-regular file"); + op_ret = -1; + op_errno = EPERM; + goto out; + } /* if(!S_ISREG(mode)) */ + + bctx = bctx_parent (B_TABLE(this), loc->path); + + if (bctx == NULL) { + gf_log (this->name, + GF_LOG_ERROR, + "failed to get bctx for path: %s", loc->path); + op_ret = -1; + op_errno = ENOENT; + goto out; + } /* if(bctx == NULL) */ + + MAKE_REAL_PATH_TO_STORAGE_DB (db_path, this, bctx->directory); + + op_ret = lstat (db_path, &stbuf); + op_errno = errno; + if (op_ret != 0) { + gf_log (this->name, GF_LOG_ERROR, + "failed to lstat on %s (%s)", + db_path, strerror (op_errno)); + goto out; + } + + MAKE_KEY_FROM_PATH (key_string, loc->path); + op_ret = bdb_db_put (bctx, NULL, key_string, NULL, 0, 0, 0); + if (op_ret == 0) { /* 0 is success, as the closing comment of this if/else states */ + /* create successful */ + stbuf.st_ino = bdb_inode_transform (stbuf.st_ino, bctx); + stbuf.st_mode = mode; + stbuf.st_size = 0; + stbuf.st_blocks = BDB_COUNT_BLOCKS (stbuf.st_size, stbuf.st_blksize); + } else { + gf_log (this->name, + GF_LOG_ERROR, + "bdb_db_put() failed for path: %s", loc->path); + op_ret = -1; + op_errno = ENOENT; + }/* if (!op_ret)...else */ + +out: + if (bctx) { + /* NOTE: bctx_unref always returns success, + * see description of bctx_unref for more details */ + bctx_unref (bctx); + } + + frame->root->rsp_refs = NULL; + + STACK_UNWIND (frame, op_ret, op_errno, loc->inode, &stbuf); + return 0; +} + +static inline int32_t +is_dir_empty (xlator_t *this, + loc_t *loc) +{ + int32_t ret = 1; + bctx_t *bctx = NULL; + DIR *dir = NULL; + char *real_path = NULL; + void *dbstat = NULL; + struct dirent *entry = NULL; + + GF_VALIDATE_OR_GOTO ("bdb", this, out); + GF_VALIDATE_OR_GOTO (this->name, loc, out); + + bctx = bctx_lookup (B_TABLE(this), loc->path); + if (bctx == NULL) { + gf_log (this->name, + GF_LOG_DEBUG, + "failed to get bctx from inode for dir: %s," + "assuming empty directory", + loc->path); + ret = 1; + goto out; + } + + dbstat = bdb_db_stat (bctx, NULL, 0); + if (dbstat) { + switch (bctx->table->access_mode) + { + case DB_HASH:
+ ret = (((DB_HASH_STAT *)dbstat)->hash_nkeys == 0); + break; + case DB_BTREE: + case DB_RECNO: + ret = (((DB_BTREE_STAT *)dbstat)->bt_nkeys == 0); + break; + case DB_QUEUE: + ret = (((DB_QUEUE_STAT *)dbstat)->qs_nkeys == 0); + break; + case DB_UNKNOWN: + gf_log (this->name, + GF_LOG_CRITICAL, + "unknown access-mode set for db"); + ret = 0; + } + } else { + gf_log (this->name, + GF_LOG_ERROR, + "failed to get db stat for db at path: %s", loc->path); + ret = 1; + goto out; + } + + MAKE_REAL_PATH (real_path, this, loc->path); + dir = opendir (real_path); + if (dir == NULL) { + gf_log (this->name, + GF_LOG_DEBUG, + "failed to opendir(%s)", loc->path); + ret = 0; + goto out; + } + + while ((entry = readdir (dir))) { + if ((!IS_BDB_PRIVATE_FILE(entry->d_name)) && + (!IS_DOT_DOTDOT(entry->d_name))) { + gf_log (this->name, + GF_LOG_DEBUG, + "directory (%s) not empty, has a non-db entry", + loc->path); + ret = 0; + break; + }/* if(!IS_BDB_PRIVATE_FILE()) */ + } /* while(true) */ + closedir (dir); +out: + if (bctx) { + /* NOTE: bctx_unref always returns success, + * see description of bctx_unref for more details */ + bctx_unref (bctx); + } + + return ret; +} + +int32_t +bdb_rename (call_frame_t *frame, + xlator_t *this, + loc_t *oldloc, + loc_t *newloc) +{ + struct bdb_private *private = NULL; + bctx_table_t *table = NULL; + bctx_t *oldbctx = NULL; + bctx_t *newbctx = NULL; + bctx_t *tmpbctx = NULL; + int32_t op_ret = -1; + int32_t op_errno = ENOENT; + int32_t read_size = 0; + struct stat stbuf = {0,}; + struct stat old_stbuf = {0,}; + DB_TXN *txnid = NULL; + char *real_newpath = NULL; + char *real_oldpath = NULL; + char *oldkey = NULL; + char *newkey = NULL; + char *buf = NULL; /* pointer to temporary buffer, where + * the contents of a file are read, if + * file being renamed is a regular file */ + char *real_db_newpath = NULL; + char *tmp_db_newpath = NULL; + + GF_VALIDATE_OR_GOTO ("bdb", frame, out); + GF_VALIDATE_OR_GOTO ("bdb", this, out); + GF_VALIDATE_OR_GOTO 
(this->name, newloc, out); + GF_VALIDATE_OR_GOTO (this->name, oldloc, out); + + private = this->private; + table = private->b_table; + + MAKE_REAL_PATH (real_oldpath, this, oldloc->path); + + if (S_ISREG (oldloc->inode->st_mode)) { + oldbctx = bctx_parent (B_TABLE(this), oldloc->path); + MAKE_REAL_PATH (real_newpath, this, newloc->path); + + op_ret = lstat (real_newpath, &stbuf); + + if ((op_ret == 0) && (S_ISDIR (stbuf.st_mode))) { + op_ret = -1; + op_errno = EISDIR; + goto out; + } + if (op_ret == 0) { + /* destination is a symlink */ + MAKE_KEY_FROM_PATH (oldkey, oldloc->path); + MAKE_KEY_FROM_PATH (newkey, newloc->path); + + op_ret = unlink (real_newpath); + op_errno = errno; + if (op_ret != 0) { + gf_log (this->name, + GF_LOG_ERROR, + "failed to unlink %s (%s)", + newloc->path, strerror (op_errno)); + goto out; + } + newbctx = bctx_parent (B_TABLE (this), newloc->path); + GF_VALIDATE_OR_GOTO (this->name, newbctx, out); + + op_ret = bdb_txn_begin (BDB_ENV(this), &txnid); + + if ((read_size = + bdb_db_get (oldbctx, txnid, oldkey, &buf, 0, 0)) < 0) { + bdb_txn_abort (txnid); + } else if ((op_ret = + bdb_db_del (oldbctx, txnid, oldkey)) != 0) { + bdb_txn_abort (txnid); + } else if ((op_ret = bdb_db_put (newbctx, txnid, + newkey, buf, + read_size, 0, 0)) != 0) { + bdb_txn_abort (txnid); + } else { + bdb_txn_commit (txnid); + } + + /* NOTE: bctx_unref always returns success, + * see description of bctx_unref for more details */ + bctx_unref (newbctx); + } else { + /* destination doesn't exist or a regular file */ + MAKE_KEY_FROM_PATH (oldkey, oldloc->path); + MAKE_KEY_FROM_PATH (newkey, newloc->path); + + newbctx = bctx_parent (B_TABLE (this), newloc->path); + GF_VALIDATE_OR_GOTO (this->name, newbctx, out); + + op_ret = bdb_txn_begin (BDB_ENV(this), &txnid); + + if ((read_size = bdb_db_get (oldbctx, txnid, + oldkey, &buf, + 0, 0)) < 0) { + bdb_txn_abort (txnid); + } else if ((op_ret = bdb_db_del (oldbctx, + txnid, oldkey)) != 0) { + bdb_txn_abort (txnid); + } else 
if ((op_ret = bdb_db_put (newbctx, txnid, + newkey, buf, + read_size, 0, 0)) != 0) { + bdb_txn_abort (txnid); + } else { + bdb_txn_commit (txnid); + } + + /* NOTE: bctx_unref always returns success, + * see description of bctx_unref for more details */ + bctx_unref (newbctx); + } + /* NOTE: bctx_unref always returns success, + * see description of bctx_unref for more details */ + bctx_unref (oldbctx); + } else if (S_ISLNK (oldloc->inode->st_mode)) { + MAKE_REAL_PATH (real_newpath, this, newloc->path); + op_ret = lstat (real_newpath, &stbuf); + if ((op_ret == 0) && (S_ISDIR (stbuf.st_mode))) { + op_ret = -1; + op_errno = EISDIR; + goto out; + } + + if (op_ret == 0){ + /* destination exists and is also a symlink */ + MAKE_REAL_PATH (real_oldpath, this, oldloc->path); + op_ret = rename (real_oldpath, real_newpath); + op_errno = errno; + + if (op_ret != 0) { + gf_log (this->name, + GF_LOG_ERROR, + "failed to rename symlink %s (%s)", + oldloc->path, strerror (op_errno)); + } + goto out; + } + + /* destination doesn't exist */ + MAKE_REAL_PATH (real_oldpath, this, oldloc->path); + MAKE_KEY_FROM_PATH (newkey, newloc->path); + newbctx = bctx_parent (B_TABLE (this), newloc->path); + GF_VALIDATE_OR_GOTO (this->name, newbctx, out); + + op_ret = bdb_db_del (newbctx, txnid, newkey); + if (op_ret != 0) { + /* no problem */ + } + op_ret = rename (real_oldpath, real_newpath); + op_errno = errno; + if (op_ret != 0) { + gf_log (this->name, + GF_LOG_ERROR, + "failed to rename %s to %s (%s)", + oldloc->path, newloc->path, strerror (op_errno)); + goto out; + } + /* NOTE: bctx_unref always returns success, + * see description of bctx_unref for more details */ + bctx_unref (newbctx); + } else if (S_ISDIR (oldloc->inode->st_mode) && + (old_stbuf.st_nlink == 2)) { + + tmp_db_newpath = tempnam (private->export_path, "rename_temp"); + GF_VALIDATE_OR_GOTO (this->name, tmp_db_newpath, out); + + MAKE_REAL_PATH (real_newpath, this, newloc->path); + + MAKE_REAL_PATH_TO_STORAGE_DB 
(real_db_newpath, this, newloc->path); + + oldbctx = bctx_lookup (B_TABLE(this), oldloc->path); + op_ret = -1; + op_errno = EINVAL; + GF_VALIDATE_OR_GOTO (this->name, oldbctx, out); + + op_ret = lstat (real_newpath, &stbuf); + if ((op_ret == 0) && + S_ISDIR (stbuf.st_mode) && + is_dir_empty (this, newloc)) { + + tmpbctx = bctx_rename (oldbctx, tmp_db_newpath); + op_ret = -1; + op_errno = ENOENT; + GF_VALIDATE_OR_GOTO (this->name, tmpbctx, out); + + op_ret = rename (real_oldpath, real_newpath); + op_errno = errno; + if (op_ret != 0) { + gf_log (this->name, + GF_LOG_ERROR, + "rename directory %s to %s failed: %s", + oldloc->path, newloc->path, + strerror (errno)); + op_ret = bdb_db_rename (table, + tmp_db_newpath, + oldbctx->db_path); + if (op_ret != 0) { + gf_log (this->name, + GF_LOG_ERROR, + "renaming temp database back to old db failed" + " for directory %s", oldloc->path); + goto out; + } else { + /* this is a error case, set op_errno & op_ret */ + op_ret = -1; + op_errno = ENOENT; /* TODO: errno */ + } + } + op_ret = bdb_db_rename (table, tmp_db_newpath, real_db_newpath); + if (op_ret != 0) { + gf_log (this->name, + GF_LOG_ERROR, + "renaming temp database to new db failed" + " for directory %s", oldloc->path); + goto out; + } + } else if ((op_ret != 0) && (errno == ENOENT)) { + tmp_db_newpath = tempnam (private->export_path, "rename_temp"); + GF_VALIDATE_OR_GOTO (this->name, tmp_db_newpath, out); + + tmpbctx = bctx_rename (oldbctx, tmp_db_newpath); + op_ret = -1; + op_errno = ENOENT; + GF_VALIDATE_OR_GOTO (this->name, tmpbctx, out); + + op_ret = rename (real_oldpath, real_newpath); + op_errno = errno; + if (op_ret != 0) { + gf_log (this->name, + GF_LOG_ERROR, + "rename directory %s to %s failed: %s", + oldloc->path, newloc->path, + strerror (errno)); + op_ret = bdb_db_rename (table, + tmp_db_newpath, + oldbctx->db_path); + if (op_ret != 0) { + gf_log (this->name, + GF_LOG_ERROR, + "renaming temp database back to old db failed" + " for directory %s", 
oldloc->path); + goto out; + } else { + /* this is a error case, set op_errno & op_ret */ + op_ret = -1; + op_errno = ENOENT; /* TODO: errno */ + } + } else { + op_ret = bdb_db_rename (table, + tmp_db_newpath, + real_db_newpath); + if (op_ret != 0) { + gf_log (this->name, + GF_LOG_ERROR, + "renaming temp database to new db failed" + " for directory %s", oldloc->path); + goto out; + } else { + /* this is a error case, set op_errno & op_ret */ + op_ret = -1; + op_errno = ENOENT; /* TODO: errno */ + } + } + } + } else { + gf_log (this->name, + GF_LOG_CRITICAL, + "rename called on non-existent file type"); + op_ret = -1; + op_errno = EPERM; + } + +out: + frame->root->rsp_refs = NULL; + STACK_UNWIND (frame, op_ret, op_errno, &stbuf); + return 0; +} + +int32_t +bdb_link (call_frame_t *frame, + xlator_t *this, + loc_t *oldloc, + loc_t *newloc) +{ + frame->root->rsp_refs = NULL; + STACK_UNWIND (frame, -1, EPERM, NULL, NULL); + return 0; +} + +int32_t +is_space_left (xlator_t *this, + size_t size) +{ + struct bdb_private *private = this->private; + struct statvfs stbuf = {0,}; + int32_t ret = -1; + fsblkcnt_t req_blocks = 0; + fsblkcnt_t usable_blocks = 0; + + ret = statvfs (private->export_path, &stbuf); + if (ret != 0) { + gf_log (this->name, + GF_LOG_ERROR, + "failed to do statvfs on %s", private->export_path); + return 0; + } else { + req_blocks = (size / stbuf.f_frsize) + 1; + + usable_blocks = (stbuf.f_bfree - BDB_ENOSPC_THRESHOLD); + + gf_log (this->name, GF_LOG_DEBUG, + "requested size: %"GF_PRI_SIZET"\nfree blocks: %"PRIu64"\nblock size: %lu\nfrag size: %lu", + size, stbuf.f_bfree, stbuf.f_bsize, stbuf.f_frsize); + + if (req_blocks < usable_blocks) + return 1; + else + return 0; + } +} + +int32_t +bdb_create (call_frame_t *frame, + xlator_t *this, + loc_t *loc, + int32_t flags, + mode_t mode, + fd_t *fd) +{ + int32_t op_ret = -1; + int32_t op_errno = EPERM; + char *db_path = NULL; + struct stat stbuf = {0,}; + bctx_t *bctx = NULL; + struct bdb_private *private = 
NULL; + char *key_string = NULL; + struct bdb_fd *bfd = NULL; + + GF_VALIDATE_OR_GOTO ("bdb", frame, out); + GF_VALIDATE_OR_GOTO ("bdb", this, out); + GF_VALIDATE_OR_GOTO (this->name, loc, out); + GF_VALIDATE_OR_GOTO (this->name, fd, out); + + private = this->private; + + bctx = bctx_parent (B_TABLE(this), loc->path); + op_errno = ENOENT; + GF_VALIDATE_OR_GOTO (this->name, bctx, out); + + MAKE_REAL_PATH_TO_STORAGE_DB (db_path, this, bctx->directory); + op_ret = lstat (db_path, &stbuf); + op_errno = errno; + if (op_ret != 0) { + gf_log (this->name, GF_LOG_ERROR, + "failed to lstat on %s (%s)", + db_path, strerror (op_errno)); + goto out; + } + + MAKE_KEY_FROM_PATH (key_string, loc->path); + op_ret = bdb_db_put (bctx, NULL, key_string, NULL, 0, 0, 0); + op_errno = EINVAL; + GF_VALIDATE_OR_GOTO (this->name, (op_ret == 0), out); + + /* create successful */ + bfd = CALLOC (1, sizeof (*bfd)); + op_ret = -1; + op_errno = ENOMEM; + GF_VALIDATE_OR_GOTO (this->name, bfd, out); + + /* NOTE: bdb_get_bctx_from () returns bctx with a ref */ + bfd->ctx = bctx; + bfd->key = strdup (key_string); + op_ret = -1; + op_errno = ENOMEM; + GF_VALIDATE_OR_GOTO (this->name, bfd->key, out); + + BDB_SET_BFD (this, fd, bfd); + + stbuf.st_ino = bdb_inode_transform (stbuf.st_ino, bctx); + stbuf.st_mode = private->file_mode; + stbuf.st_size = 0; + stbuf.st_nlink = 1; + stbuf.st_blocks = BDB_COUNT_BLOCKS (stbuf.st_size, stbuf.st_blksize); + op_ret = 0; + op_errno = 0; +out: + frame->root->rsp_refs = NULL; + STACK_UNWIND (frame, op_ret, op_errno, fd, loc->inode, &stbuf); + + return 0; +} + + +/* bdb_open + * + * as input parameters bdb_open gets the file name, i.e key. bdb_open should effectively + * do: store key, open storage db, store storage-db pointer. 
+ * + */ +int32_t +bdb_open (call_frame_t *frame, + xlator_t *this, + loc_t *loc, + int32_t flags, + fd_t *fd) +{ + int32_t op_ret = -1; + int32_t op_errno = EINVAL; + bctx_t *bctx = NULL; + char *key_string = NULL; + struct bdb_fd *bfd = NULL; + + GF_VALIDATE_OR_GOTO ("bdb", frame, out); + GF_VALIDATE_OR_GOTO ("bdb", this, out); + GF_VALIDATE_OR_GOTO (this->name, loc, out); + GF_VALIDATE_OR_GOTO (this->name, fd, out); + + bctx = bctx_parent (B_TABLE(this), loc->path); + op_errno = EBADFD; + GF_VALIDATE_OR_GOTO (this->name, bctx, out); + + bfd = CALLOC (1, sizeof (*bfd)); + op_errno = ENOMEM; + GF_VALIDATE_OR_GOTO (this->name, bfd, out); + + /* NOTE: bctx_parent () returns bctx with a ref */ + bfd->ctx = bctx; + + MAKE_KEY_FROM_PATH (key_string, loc->path); + bfd->key = strdup (key_string); + op_ret = -1; + op_errno = ENOMEM; + GF_VALIDATE_OR_GOTO (this->name, bfd->key, out); + + BDB_SET_BFD (this, fd, bfd); + op_ret = 0; +out: + frame->root->rsp_refs = NULL; + STACK_UNWIND (frame, op_ret, op_errno, fd); + + return 0; +} + +int32_t +bdb_readv (call_frame_t *frame, + xlator_t *this, + fd_t *fd, + size_t size, + off_t offset) +{ + int32_t op_ret = -1; + int32_t op_errno = EINVAL; + struct iovec vec = {0,}; + struct stat stbuf = {0,}; + struct bdb_fd *bfd = NULL; + dict_t *reply_dict = NULL; + char *buf = NULL; + data_t *buf_data = NULL; + char *db_path = NULL; + int32_t read_size = 0; + + GF_VALIDATE_OR_GOTO ("bdb", frame, out); + GF_VALIDATE_OR_GOTO ("bdb", this, out); + GF_VALIDATE_OR_GOTO (this->name, fd, out); + + bfd = bdb_extract_bfd (fd, this); + op_errno = EBADFD; + GF_VALIDATE_OR_GOTO (this->name, bfd, out); + + MAKE_REAL_PATH_TO_STORAGE_DB (db_path, this, bfd->ctx->directory); + op_ret = lstat (db_path, &stbuf); + op_errno = errno; + if (op_ret != 0) { + gf_log (this->name, GF_LOG_ERROR, + "failed to lstat on %s (%s)", + db_path, strerror (op_errno)); + goto out; + } + + /* we are ready to go */ + op_ret = bdb_db_get (bfd->ctx, NULL, + bfd->key, &buf, + 
size, offset); + read_size = op_ret; + if (op_ret == -1) { + gf_log (this->name, + GF_LOG_ERROR, + "failed to do db_storage_get()"); + op_ret = -1; + op_errno = ENOENT; + goto out; + } else if (op_ret == 0) { + goto out; + } + + buf_data = get_new_data (); + op_ret = -1; + op_errno = ENOMEM; + GF_VALIDATE_OR_GOTO (this->name, buf_data, out); + + reply_dict = get_new_dict (); + op_ret = -1; + op_errno = ENOMEM; + GF_VALIDATE_OR_GOTO (this->name, reply_dict, out); + + buf_data->data = buf; + + if (size < read_size) { + op_ret = size; + read_size = size; + } + + buf_data->len = op_ret; + + dict_set (reply_dict, NULL, buf_data); + + frame->root->rsp_refs = dict_ref (reply_dict); + + vec.iov_base = buf; + vec.iov_len = read_size; + + stbuf.st_ino = fd->inode->ino; + stbuf.st_size = op_ret ; + stbuf.st_blocks = BDB_COUNT_BLOCKS (stbuf.st_size, stbuf.st_blksize); + op_ret = size; +out: + STACK_UNWIND (frame, op_ret, op_errno, &vec, 1, &stbuf); + + if (reply_dict) + dict_unref (reply_dict); + + return 0; +} + + +int32_t +bdb_writev (call_frame_t *frame, + xlator_t *this, + fd_t *fd, + struct iovec *vector, + int32_t count, + off_t offset) +{ + int32_t op_ret = -1; + int32_t op_errno = EINVAL; + struct stat stbuf = {0,}; + struct bdb_fd *bfd = NULL; + int32_t idx = 0; + off_t c_off = offset; + int32_t c_ret = -1; + char *db_path = NULL; + size_t total_size = 0; + + GF_VALIDATE_OR_GOTO ("bdb", frame, out); + GF_VALIDATE_OR_GOTO ("bdb", this, out); + GF_VALIDATE_OR_GOTO (this->name, fd, out); + GF_VALIDATE_OR_GOTO (this->name, vector, out); + + bfd = bdb_extract_bfd (fd, this); + op_errno = EBADFD; + GF_VALIDATE_OR_GOTO (this->name, bfd, out); + + MAKE_REAL_PATH_TO_STORAGE_DB (db_path, this, bfd->ctx->directory); + op_ret = lstat (db_path, &stbuf); + op_errno = errno; + if (op_ret != 0) { + gf_log (this->name, GF_LOG_ERROR, + "failed to lstat on %s (%s)", + db_path, strerror (op_errno)); + goto out; + } + + + for (idx = 0; idx < count; idx++) + total_size += 
vector[idx].iov_len; + + if (!is_space_left (this, total_size)) { + gf_log (this->name, + GF_LOG_ERROR, + "requested storage for %"GF_PRI_SIZET", ENOSPC", total_size); + op_ret = -1; + op_errno = ENOSPC; + goto out; + } + + + /* we are ready to go */ + for (idx = 0; idx < count; idx++) { + c_ret = bdb_db_put (bfd->ctx, NULL, + bfd->key, vector[idx].iov_base, + vector[idx].iov_len, c_off, 0); + if (c_ret != 0) { + gf_log (this->name, + GF_LOG_ERROR, + "failed to do bdb_db_put at offset: %"PRIu64" for file: %s", + c_off, bfd->key); + break; + } else { + c_off += vector[idx].iov_len; + } + op_ret += vector[idx].iov_len; + } /* for(idx=0;...)... */ + + if (c_ret) { + /* write failed */ + gf_log (this->name, + GF_LOG_ERROR, + "failed to do bdb_db_put(): %s", + db_strerror (op_ret)); + op_ret = -1; + op_errno = EBADFD; /* TODO: search for a more meaningful errno */ + goto out; + } + /* NOTE: we want to increment stbuf->st_size, as stored in db */ + stbuf.st_size = op_ret; + stbuf.st_blocks = BDB_COUNT_BLOCKS (stbuf.st_size, stbuf.st_blksize); + op_errno = 0; + +out: + frame->root->rsp_refs = NULL; + STACK_UNWIND (frame, op_ret, op_errno, &stbuf); + return 0; +} + +int32_t +bdb_flush (call_frame_t *frame, + xlator_t *this, + fd_t *fd) +{ + int32_t op_ret = -1; + int32_t op_errno = EPERM; + struct bdb_fd *bfd = NULL; + + GF_VALIDATE_OR_GOTO ("bdb", frame, out); + GF_VALIDATE_OR_GOTO ("bdb", this, out); + GF_VALIDATE_OR_GOTO (this->name, fd, out); + + bfd = bdb_extract_bfd (fd, this); + op_errno = EBADFD; + GF_VALIDATE_OR_GOTO (this->name, bfd, out); + + /* do nothing */ + op_ret = 0; + op_errno = 0; + +out: + frame->root->rsp_refs = NULL; + STACK_UNWIND (frame, op_ret, op_errno); + return 0; +} + +int32_t +bdb_release (xlator_t *this, + fd_t *fd) +{ + int32_t op_ret = -1; + int32_t op_errno = EBADFD; + struct bdb_fd *bfd = NULL; + + if ((bfd = bdb_extract_bfd (fd, this)) == NULL){ + gf_log (this->name, + GF_LOG_ERROR, + "failed to extract %s specific information from 
fd:%p", this->name, fd); + op_ret = -1; + op_errno = EBADFD; + } else { + bctx_unref (bfd->ctx); + bfd->ctx = NULL; + + if (bfd->key) + free (bfd->key); /* we did strdup() in bdb_open() */ + free (bfd); + op_ret = 0; + op_errno = 0; + } /* if((fd->ctx == NULL)...)...else */ + + return 0; +}/* bdb_release */ + + +int32_t +bdb_fsync (call_frame_t *frame, + xlator_t *this, + fd_t *fd, + int32_t datasync) +{ + frame->root->rsp_refs = NULL; + STACK_UNWIND (frame, 0, 0); + return 0; +}/* bdb_fsync */ + +static int gf_bdb_lk_log; + +int32_t +bdb_lk (call_frame_t *frame, + xlator_t *this, + fd_t *fd, + int32_t cmd, + struct flock *lock) +{ + struct flock nullock = {0, }; + + gf_bdb_lk_log++; + if (!(gf_bdb_lk_log % GF_UNIVERSAL_ANSWER)) { + gf_log (this->name, GF_LOG_ERROR, + "\"features/posix-locks\" translator is not loaded, you need to use it"); + } + + frame->root->rsp_refs = NULL; + STACK_UNWIND (frame, -1, ENOSYS, &nullock); + return 0; +}/* bdb_lk */ + +/* bdb_lookup + * + * there are four possibilities for a file being looked up: + * 1. file exists and is a directory. + * 2. file exists and is a symlink. + * 3. file exists and is a regular file. + * 4. file does not exist. + * case 1 and 2 are handled by doing lstat() on the @loc. if the file is a directory or symlink, + * lstat() succeeds. lookup continues to check if the @loc belongs to case-3 only if lstat() fails. + * to check for case 3, bdb_lookup does a bdb_db_get() for the given @loc. (see description of + * bdb_db_get() for more details on how @loc is transformed into db handle and key). if check + * for case 1, 2 and 3 fail, we proceed to conclude that file doesn't exist (case 4). + * + * @frame: call frame. + * @this: xlator_t of this instance of bdb xlator. + * @loc: loc_t specifying the file to operate upon. + * @need_xattr: if need_xattr != 0, we are asked to return all the extended attributed of @loc, + * if any exist, in a dictionary. 
if @loc is a regular file and need_xattr is set, then + * we look for value of need_xattr. if need_xattr > sizo-of-the-file @loc, then the file + * content of @loc is returned in dictionary of xattr with 'glusterfs.content' as + * dictionary key. + * + * NOTE: bdb currently supports only directories, symlinks and regular files. + * + * NOTE: bdb_lookup returns the 'struct stat' of underlying file itself, in case of directory and + * symlink (st_ino is modified as bdb allocates its own set of inodes of all files). for + * regular files, bdb uses 'struct stat' of the database file in which the @loc is stored + * as templete and modifies st_ino (see bdb_inode_transform for more details), st_mode (can + * be set in volfile 'option file-mode <mode>'), st_size (exact size of the @loc + * contents), st_blocks (block count on the underlying filesystem to accomodate st_size, + * see BDB_COUNT_BLOCKS in bdb.h for more details). + */ +int32_t +bdb_lookup (call_frame_t *frame, + xlator_t *this, + loc_t *loc, + dict_t *xattr_req) +{ + struct stat stbuf = {0, }; + int32_t op_ret = -1; + int32_t op_errno = ENOENT; + dict_t *xattr = NULL; + char *pathname = NULL; + char *directory = NULL; + char *real_path = NULL; + bctx_t *bctx = NULL; + char *db_path = NULL; + struct bdb_private *private = NULL; + char *key_string = NULL; + int32_t entry_size = 0; + char *file_content = NULL; + data_t *file_content_data = NULL; + uint64_t need_xattr = 0; + + GF_VALIDATE_OR_GOTO ("bdb", frame, out); + GF_VALIDATE_OR_GOTO ("bdb", this, out); + GF_VALIDATE_OR_GOTO (this->name, loc, out); + + private = this->private; + + MAKE_REAL_PATH (real_path, this, loc->path); + + pathname = strdup (loc->path); + GF_VALIDATE_OR_GOTO (this->name, pathname, out); + + directory = dirname (pathname); + GF_VALIDATE_OR_GOTO (this->name, directory, out); + + if (!strcmp (directory, loc->path)) { + /* SPECIAL CASE: looking up root */ + op_ret = lstat (real_path, &stbuf); + op_errno = errno; + if (op_ret != 0) { + 
gf_log (this->name, GF_LOG_ERROR, + "failed to lstat on %s (%s)", + real_path, strerror (op_errno)); + goto out; + } + + /* bctx_lookup() returns NULL only when its time to wind up, + * we should shutdown functioning */ + bctx = bctx_lookup (B_TABLE(this), (char *)loc->path); + op_ret = -1; + op_errno = EINVAL; + GF_VALIDATE_OR_GOTO (this->name, bctx, out); + + stbuf.st_ino = 1; + stbuf.st_mode = private->dir_mode; + } else { + MAKE_KEY_FROM_PATH (key_string, loc->path); + op_ret = lstat (real_path, &stbuf); + if ((op_ret == 0) && (S_ISDIR (stbuf.st_mode))){ + bctx = bctx_lookup (B_TABLE(this), (char *)loc->path); + op_ret = -1; + op_errno = ENOMEM; + GF_VALIDATE_OR_GOTO (this->name, bctx, out); + + if (loc->ino) { + /* revalidating directory inode */ + gf_log (this->name, + GF_LOG_DEBUG, + "revalidating directory %s", (char *)loc->path); + stbuf.st_ino = loc->ino; + } else { + stbuf.st_ino = bdb_inode_transform (stbuf.st_ino, bctx); + } + stbuf.st_mode = private->dir_mode; + op_ret = 0; + op_errno = 0; + goto out; + } else if (op_ret == 0) { + /* a symlink */ + gf_log (this->name, + GF_LOG_DEBUG, + "lookup called for symlink: %s", loc->path); + bctx = bctx_parent (B_TABLE(this), loc->path); + op_ret = -1; + op_errno = ENOMEM; + GF_VALIDATE_OR_GOTO (this->name, bctx, out); + + if (loc->ino) { + stbuf.st_ino = loc->ino; + } else { + stbuf.st_ino = bdb_inode_transform (stbuf.st_ino, bctx); + } + stbuf.st_mode = private->symlink_mode; + op_ret = 0; + op_errno = 0; + goto out; + } + + /* for regular files */ + bctx = bctx_parent (B_TABLE(this), loc->path); + op_ret = -1; + op_errno = ENOENT; + GF_VALIDATE_OR_GOTO (this->name, bctx, out); + + if (GF_FILE_CONTENT_REQUESTED(xattr_req, &need_xattr)) { + entry_size = bdb_db_get (bctx, + NULL, + loc->path, + &file_content, + 0, 0); + } else { + entry_size = bdb_db_get (bctx, + NULL, + loc->path, + NULL, + 0, 0); + } + + op_ret = entry_size; + op_errno = ENOENT; + if (op_ret == -1) { + gf_log (this->name, + GF_LOG_DEBUG, + 
"returning ENOENT for %s", loc->path); + goto out; + } + + MAKE_REAL_PATH_TO_STORAGE_DB (db_path, this, bctx->directory); + op_ret = lstat (db_path, &stbuf); + op_errno = errno; + if (op_ret != 0) { + gf_log (this->name, GF_LOG_ERROR, + "failed to lstat on %s (%s)", + db_path, strerror (op_errno)); + goto out; + } + + if ((need_xattr >= entry_size) + && (entry_size) && (file_content)) { + file_content_data = data_from_dynptr (file_content, + entry_size); + xattr = get_new_dict (); + dict_set (xattr, "glusterfs.content", + file_content_data); + } else { + if (file_content) + free (file_content); + } + + if (loc->ino) { + /* revalidate */ + stbuf.st_ino = loc->ino; + stbuf.st_size = entry_size; + stbuf.st_blocks = BDB_COUNT_BLOCKS (stbuf.st_size, stbuf.st_blksize); + } else { + /* fresh lookup, create an inode number */ + stbuf.st_ino = bdb_inode_transform (stbuf.st_ino, bctx); + stbuf.st_size = entry_size; + stbuf.st_blocks = BDB_COUNT_BLOCKS (stbuf.st_size, stbuf.st_blksize); + }/* if(inode->ino)...else */ + stbuf.st_nlink = 1; + stbuf.st_mode = private->file_mode; + } + op_ret = 0; +out: + if (bctx) { + /* NOTE: bctx_unref always returns success, + * see description of bctx_unref for more details */ + bctx_unref (bctx); + } + + if (pathname) + free (pathname); + + if (xattr) + dict_ref (xattr); + + frame->root->rsp_refs = NULL; + STACK_UNWIND (frame, op_ret, op_errno, loc->inode, &stbuf, xattr); + + if (xattr) + dict_unref (xattr); + + return 0; + +}/* bdb_lookup */ + +int32_t +bdb_stat (call_frame_t *frame, + xlator_t *this, + loc_t *loc) +{ + + struct stat stbuf = {0,}; + char *real_path = NULL; + int32_t op_ret = -1; + int32_t op_errno = EINVAL; + struct bdb_private *private = NULL; + char *db_path = NULL; + bctx_t *bctx = NULL; + + GF_VALIDATE_OR_GOTO ("bdb", frame, out); + GF_VALIDATE_OR_GOTO ("bdb", this, out); + GF_VALIDATE_OR_GOTO (this->name, loc, out); + + private = this->private; + GF_VALIDATE_OR_GOTO (this->name, private, out); + + MAKE_REAL_PATH 
(real_path, this, loc->path); + + op_ret = lstat (real_path, &stbuf); + op_errno = errno; + if (op_ret == 0) { + /* directory or symlink */ + stbuf.st_ino = loc->inode->ino; + if (S_ISDIR(stbuf.st_mode)) + stbuf.st_mode = private->dir_mode; + else + stbuf.st_mode = private->symlink_mode; + /* we are done, lets unwind the stack */ + goto out; + } + + bctx = bctx_parent (B_TABLE(this), loc->path); + op_ret = -1; + op_errno = ENOENT; + GF_VALIDATE_OR_GOTO (this->name, bctx, out); + + MAKE_REAL_PATH_TO_STORAGE_DB (db_path, this, bctx->directory); + op_ret = lstat (db_path, &stbuf); + op_errno = errno; + if (op_ret != 0) { + gf_log (this->name, GF_LOG_ERROR, + "failed to lstat on %s (%s)", + db_path, strerror (op_errno)); + goto out; + } + + stbuf.st_size = bdb_db_get (bctx, NULL, loc->path, NULL, 0, 0); + stbuf.st_blocks = BDB_COUNT_BLOCKS (stbuf.st_size, stbuf.st_blksize); + stbuf.st_ino = loc->inode->ino; + +out: + if (bctx) { + /* NOTE: bctx_unref always returns success, + * see description of bctx_unref for more details */ + bctx_unref (bctx); + } + + frame->root->rsp_refs = NULL; + STACK_UNWIND (frame, op_ret, op_errno, &stbuf); + + return 0; +}/* bdb_stat */ + + + +/* bdb_opendir - in the world of bdb, open/opendir is all about opening correspondind databases. + * opendir in particular, opens the database for the directory which is + * to be opened. after opening the database, a cursor to the database is also created. + * cursor helps us get the dentries one after the other, and cursor maintains the state + * about current positions in directory. pack 'pointer to db', 'pointer to the + * cursor' into struct bdb_dir and store it in fd->ctx, we get from our parent xlator. + * + * @frame: call frame + * @this: our information, as we filled during init() + * @loc: location information + * @fd: file descriptor structure (glusterfs internal) + * + * return value - immaterial, async call. 
+ * + */ +int32_t +bdb_opendir (call_frame_t *frame, + xlator_t *this, + loc_t *loc, + fd_t *fd) +{ + char *real_path = NULL; + int32_t op_ret = -1; + int32_t op_errno = EINVAL; + bctx_t *bctx = NULL; + struct bdb_dir *bfd = NULL; + + GF_VALIDATE_OR_GOTO ("bdb", frame, out); + GF_VALIDATE_OR_GOTO ("bdb", this, out); + GF_VALIDATE_OR_GOTO (this->name, loc, out); + GF_VALIDATE_OR_GOTO (this->name, fd, out); + + MAKE_REAL_PATH (real_path, this, loc->path); + + bctx = bctx_lookup (B_TABLE(this), (char *)loc->path); + op_errno = EBADFD; + GF_VALIDATE_OR_GOTO (this->name, bctx, out); + + bfd = CALLOC (1, sizeof (*bfd)); + op_errno = ENOMEM; + GF_VALIDATE_OR_GOTO (this->name, bfd, out); + + bfd->dir = opendir (real_path); + op_errno = errno; + GF_VALIDATE_OR_GOTO (this->name, bfd->dir, out); + + /* NOTE: bctx_lookup() return bctx with ref */ + bfd->ctx = bctx; + + bfd->path = strdup (real_path); + op_errno = ENOMEM; + GF_VALIDATE_OR_GOTO (this->name, bfd->path, out); + + BDB_SET_BFD (this, fd, bfd); + op_ret = 0; +out: + frame->root->rsp_refs = NULL; + STACK_UNWIND (frame, op_ret, op_errno, fd); + + return 0; +}/* bdb_opendir */ + + +int32_t +bdb_getdents (call_frame_t *frame, + xlator_t *this, + fd_t *fd, + size_t size, + off_t off, + int32_t flag) +{ + int32_t op_ret = -1; + int32_t op_errno = EINVAL; + int32_t ret = -1; + int32_t real_path_len = 0; + int32_t entry_path_len = 0; + int32_t count = 0; + char *real_path = NULL; + char *entry_path = NULL; + char *db_path = NULL; + dir_entry_t entries = {0, }; + dir_entry_t *tmp = NULL; + DIR *dir = NULL; + struct dirent *dirent = NULL; + struct bdb_dir *bfd = NULL; + struct stat db_stbuf = {0,}; + struct stat buf = {0,}; + DBC *cursorp = NULL; + size_t tmp_name_len = 0; + + GF_VALIDATE_OR_GOTO ("bdb", frame, out); + GF_VALIDATE_OR_GOTO ("bdb", this, out); + GF_VALIDATE_OR_GOTO (this->name, fd, out); + + bfd = bdb_extract_bfd (fd, this); + op_errno = EBADFD; + GF_VALIDATE_OR_GOTO (this->name, bfd, out); + + MAKE_REAL_PATH 
(real_path, this, bfd->path); + dir = bfd->dir; + + while ((dirent = readdir (dir))) { + if (!dirent) + break; + + if (IS_BDB_PRIVATE_FILE(dirent->d_name)) { + continue; + } + + tmp_name_len = strlen (dirent->d_name); + if (entry_path_len < (real_path_len + 1 + (tmp_name_len) + 1)) { + entry_path_len = real_path_len + tmp_name_len + 1024; + entry_path = realloc (entry_path, entry_path_len); + op_errno = ENOMEM; + GF_VALIDATE_OR_GOTO (this->name, entry_path, out); + } + + strncpy (&entry_path[real_path_len+1], dirent->d_name, tmp_name_len); + op_ret = stat (entry_path, &buf); + op_errno = errno; + if (op_ret != 0) { + gf_log (this->name, GF_LOG_ERROR, + "failed to lstat on %s (%s)", + entry_path, strerror (op_errno)); + goto out; + } + + if ((flag == GF_GET_DIR_ONLY) && + (ret != -1 && !S_ISDIR(buf.st_mode))) { + continue; + } + + tmp = CALLOC (1, sizeof (*tmp)); + op_errno = ENOMEM; + GF_VALIDATE_OR_GOTO (this->name, tmp, out); + + tmp->name = strdup (dirent->d_name); + op_errno = ENOMEM; + GF_VALIDATE_OR_GOTO (this->name, dirent->d_name, out); + + memcpy (&tmp->buf, &buf, sizeof (buf)); + + tmp->buf.st_ino = -1; + if (S_ISLNK(tmp->buf.st_mode)) { + char linkpath[ZR_PATH_MAX] = {0,}; + ret = readlink (entry_path, linkpath, ZR_PATH_MAX); + if (ret != -1) { + linkpath[ret] = '\0'; + tmp->link = strdup (linkpath); + } + } else { + tmp->link = ""; + } + + count++; + + tmp->next = entries.next; + entries.next = tmp; + /* if size is 0, count can never be = size, so entire dir is read */ + + if (count == size) + break; + } + + if ((flag != GF_GET_DIR_ONLY) && (count < size)) { + /* read from db */ + op_ret = bdb_cursor_open (bfd->ctx, &cursorp); + op_errno = EINVAL; + GF_VALIDATE_OR_GOTO (this->name, (op_ret == 0), out); + + MAKE_REAL_PATH_TO_STORAGE_DB (db_path, this, bfd->ctx->directory); + op_ret = lstat (db_path, &db_stbuf); + op_errno = errno; + if (op_ret != 0) { + gf_log (this->name, GF_LOG_ERROR, + "failed to lstat on %s (%s)", + db_path, strerror (op_errno)); + 
goto out; + } + + /* read all the entries in database, one after the other and put into dictionary */ + while (1) { + DBT key = {0,}, value = {0,}; + + key.flags = DB_DBT_MALLOC; + value.flags = DB_DBT_MALLOC; + op_ret = bdb_cursor_get (cursorp, &key, &value, DB_NEXT); + + if (op_ret == DB_NOTFOUND) { + gf_log (this->name, + GF_LOG_DEBUG, + "end of list of key/value pair in db for directory: %s", + bfd->ctx->directory); + op_ret = 0; + op_errno = 0; + break; + } else if (op_ret != 0){ + gf_log (this->name, + GF_LOG_ERROR, + "failed to do cursor get for directory %s: %s", + bfd->ctx->directory, db_strerror (op_ret)); + op_ret = -1; + op_errno = ENOENT; + break; + } + /* successfully read */ + tmp = CALLOC (1, sizeof (*tmp)); + op_errno = ENOMEM; + GF_VALIDATE_OR_GOTO (this->name, tmp, out); + + tmp->name = CALLOC (1, key.size + 1); + op_errno = ENOMEM; + GF_VALIDATE_OR_GOTO (this->name, tmp->name, out); + + memcpy (tmp->name, key.data, key.size); + tmp->buf = db_stbuf; + tmp->buf.st_size = bdb_db_get (bfd->ctx, NULL, + tmp->name, NULL, + 0, 0); + tmp->buf.st_blocks = BDB_COUNT_BLOCKS (tmp->buf.st_size, \ + tmp->buf.st_blksize); + /* FIXME: wat will be the effect of this? 
*/ + tmp->buf.st_ino = -1; + count++; + + tmp->next = entries.next; + tmp->link = ""; + entries.next = tmp; + /* if size is 0, count can never be = size, so entire dir is read */ + if (count == size) + break; + + free (key.data); + } /* while(1){ } */ + bdb_cursor_close (bfd->ctx, cursorp); + } else { + /* do nothing */ + } + FREE (entry_path); + op_ret = 0; + +out: + frame->root->rsp_refs = NULL; + STACK_UNWIND (frame, op_ret, op_errno, &entries, count); + + while (entries.next) { + tmp = entries.next; + entries.next = entries.next->next; + FREE (tmp->name); + FREE (tmp); + } + return 0; +}/* bdb_getdents */ + + +int32_t +bdb_releasedir (xlator_t *this, + fd_t *fd) +{ + int32_t op_ret = 0; + int32_t op_errno = 0; + struct bdb_dir *bfd = NULL; + + if ((bfd = bdb_extract_bfd (fd, this)) == NULL) { + gf_log (this->name, + GF_LOG_ERROR, + "failed to extract fd data from fd=%p", fd); + op_ret = -1; + op_errno = EBADF; + } else { + if (bfd->path) { + free (bfd->path); + } else { + gf_log (this->name, GF_LOG_ERROR, "bfd->path was NULL. 
fd=%p bfd=%p", + fd, bfd); + } + + if (bfd->dir) { + closedir (bfd->dir); + } else { + gf_log (this->name, + GF_LOG_ERROR, + "bfd->dir is NULL."); + } + if (bfd->ctx) { + bctx_unref (bfd->ctx); + } else { + gf_log (this->name, + GF_LOG_ERROR, + "bfd->ctx is NULL"); + } + free (bfd); + } + + return 0; +}/* bdb_releasedir */ + + +int32_t +bdb_readlink (call_frame_t *frame, + xlator_t *this, + loc_t *loc, + size_t size) +{ + char *dest = NULL; + int32_t op_ret = -1; + int32_t op_errno = EPERM; + char *real_path = NULL; + + GF_VALIDATE_OR_GOTO ("bdb", frame, out); + GF_VALIDATE_OR_GOTO ("bdb", this, out); + GF_VALIDATE_OR_GOTO (this->name, loc, out); + + dest = alloca (size + 1); + GF_VALIDATE_OR_GOTO (this->name, dest, out); + + MAKE_REAL_PATH (real_path, this, loc->path); + + op_ret = readlink (real_path, dest, size); + + if (op_ret > 0) + dest[op_ret] = 0; + + op_errno = errno; + + if (op_ret == -1) { + gf_log (this->name, + GF_LOG_DEBUG, + "readlink failed on %s: %s", + loc->path, strerror (op_errno)); + } +out: + frame->root->rsp_refs = NULL; + STACK_UNWIND (frame, op_ret, op_errno, dest); + + return 0; +}/* bdb_readlink */ + + +int32_t +bdb_mkdir (call_frame_t *frame, + xlator_t *this, + loc_t *loc, + mode_t mode) +{ + int32_t op_ret = -1; + int32_t ret = -1; + int32_t op_errno = EINVAL; + char *real_path = NULL; + struct stat stbuf = {0, }; + bctx_t *bctx = NULL; + + GF_VALIDATE_OR_GOTO ("bdb", frame, out); + GF_VALIDATE_OR_GOTO ("bdb", this, out); + GF_VALIDATE_OR_GOTO (this->name, loc, out); + + MAKE_REAL_PATH (real_path, this, loc->path); + + op_ret = mkdir (real_path, mode); + op_errno = errno; + if (op_ret != 0) { + gf_log (this->name, GF_LOG_ERROR, + "failed to mkdir %s (%s)", + real_path, strerror (op_errno)); + goto out; + } + + op_ret = chown (real_path, frame->root->uid, frame->root->gid); + op_errno = errno; + if (op_ret != 0) { + gf_log (this->name, GF_LOG_ERROR, + "failed to chmod on %s (%s)", + real_path, strerror (op_errno)); + goto err; + } + + 
op_ret = lstat (real_path, &stbuf); + op_errno = errno; + if (op_ret != 0) { + gf_log (this->name, GF_LOG_ERROR, + "failed to lstat on %s (%s)", + real_path, strerror (op_errno)); + goto err; + } + + bctx = bctx_lookup (B_TABLE(this), (char *)loc->path); + op_errno = ENOMEM; + GF_VALIDATE_OR_GOTO (this->name, bctx, err); + + stbuf.st_ino = bdb_inode_transform (stbuf.st_ino, bctx); + + goto out; + +err: + ret = rmdir (real_path); + if (ret != 0) { + gf_log (this->name, + GF_LOG_ERROR, + "failed to rmdir the directory created (%s)", + strerror (errno)); + } + + +out: + if (bctx) { + /* NOTE: bctx_unref always returns success, + * see description of bctx_unref for more details */ + bctx_unref (bctx); + } + + frame->root->rsp_refs = NULL; + STACK_UNWIND (frame, op_ret, op_errno, loc->inode, &stbuf); + + return 0; +}/* bdb_mkdir */ + + +int32_t +bdb_unlink (call_frame_t *frame, + xlator_t *this, + loc_t *loc) +{ + int32_t op_ret = -1; + int32_t op_errno = EINVAL; + bctx_t *bctx = NULL; + char *real_path = NULL; + + GF_VALIDATE_OR_GOTO ("bdb", frame, out); + GF_VALIDATE_OR_GOTO ("bdb", this, out); + GF_VALIDATE_OR_GOTO (this->name, loc, out); + + bctx = bctx_parent (B_TABLE(this), loc->path); + op_errno = ENOENT; + GF_VALIDATE_OR_GOTO (this->name, bctx, out); + + op_ret = bdb_db_del (bctx, NULL, loc->path); + if (op_ret == DB_NOTFOUND) { + MAKE_REAL_PATH (real_path, this, loc->path); + op_ret = unlink (real_path); + op_errno = errno; + if (op_ret != 0) { + gf_log (this->name, GF_LOG_ERROR, + "failed to unlink on %s (%s)", + real_path, strerror (op_errno)); + goto out; + } + + } else if (op_ret == 0) { + op_errno = 0; + } +out: + if (bctx) { + /* NOTE: bctx_unref always returns success, + * see description of bctx_unref for more details */ + bctx_unref (bctx); + } + + frame->root->rsp_refs = NULL; + STACK_UNWIND (frame, op_ret, op_errno); + + return 0; +}/* bdb_unlink */ + + + +int32_t +bdb_do_rmdir (xlator_t *this, + loc_t *loc) +{ + char *real_path = NULL; + int32_t ret 
= -1; + bctx_t *bctx = NULL; + DB_ENV *dbenv = NULL; + + GF_VALIDATE_OR_GOTO ("bdb", this, out); + GF_VALIDATE_OR_GOTO (this->name, loc, out); + + dbenv = BDB_ENV(this); + GF_VALIDATE_OR_GOTO (this->name, dbenv, out); + + MAKE_REAL_PATH (real_path, this, loc->path); + + bctx = bctx_lookup (B_TABLE(this), loc->path); + GF_VALIDATE_OR_GOTO (this->name, bctx, out); + + LOCK(&bctx->lock); + { + if (bctx->dbp == NULL) { + goto unlock; + } + + ret = bctx->dbp->close (bctx->dbp, 0); + GF_VALIDATE_OR_GOTO (this->name, (ret == 0), unlock); + + bctx->dbp = NULL; + + ret = dbenv->dbremove (dbenv, NULL, bctx->db_path, NULL, 0); + if (ret != 0) { + gf_log (this->name, + GF_LOG_ERROR, + "failed to DB_ENV->dbremove() on path %s: %s", + loc->path, db_strerror (ret)); + } + } +unlock: + UNLOCK(&bctx->lock); + + if (ret) { + gf_log (this->name, + GF_LOG_ERROR, + "failed to remove db %s: %s", bctx->db_path, db_strerror (ret)); + ret = -1; + goto out; + } + gf_log (this->name, + GF_LOG_DEBUG, + "removed db %s", bctx->db_path); + ret = rmdir (real_path); + +out: + if (bctx) { + /* NOTE: bctx_unref always returns success, + * see description of bctx_unref for more details */ + bctx_unref (bctx); + } + + return ret; +} + +int32_t +bdb_rmdir (call_frame_t *frame, + xlator_t *this, + loc_t *loc) +{ + int32_t op_ret = -1; + int32_t op_errno = ENOTEMPTY; + + if (!is_dir_empty (this, loc)) { + gf_log (this->name, + GF_LOG_DEBUG, + "rmdir: directory %s not empty", loc->path); + op_errno = ENOTEMPTY; + op_ret = -1; + goto out; + } + + op_ret = bdb_do_rmdir (this, loc); + if (op_ret != 0) { + gf_log (this->name, GF_LOG_ERROR, + "failed to bdb_do_rmdir on %s", + loc->path); + goto out; + } + +out: + frame->root->rsp_refs = NULL; + STACK_UNWIND (frame, op_ret, op_errno); + + return 0; +} /* bdb_rmdir */ + +int32_t +bdb_symlink (call_frame_t *frame, + xlator_t *this, + const char *linkname, + loc_t *loc) +{ + int32_t op_ret = -1; + int32_t op_errno = EINVAL; + char *real_path = NULL; + struct stat 
stbuf = {0,}; + struct bdb_private *private = NULL; + bctx_t *bctx = NULL; + + GF_VALIDATE_OR_GOTO ("bdb", frame, out); + GF_VALIDATE_OR_GOTO ("bdb", this, out); + GF_VALIDATE_OR_GOTO (this->name, loc, out); + GF_VALIDATE_OR_GOTO (this->name, linkname, out); + + private = this->private; + GF_VALIDATE_OR_GOTO (this->name, private, out); + + MAKE_REAL_PATH (real_path, this, loc->path); + op_ret = symlink (linkname, real_path); + op_errno = errno; + if (op_ret == 0) { + op_ret = lstat (real_path, &stbuf); + op_errno = errno; + if (op_ret != 0) { + gf_log (this->name, GF_LOG_ERROR, + "failed to lstat on %s (%s)", + real_path, strerror (op_errno)); + goto err; + } + + bctx = bctx_parent (B_TABLE(this), loc->path); + GF_VALIDATE_OR_GOTO (this->name, bctx, err); + + stbuf.st_ino = bdb_inode_transform (stbuf.st_ino, bctx); + stbuf.st_mode = private->symlink_mode; + + goto out; + } +err: + op_ret = unlink (real_path); + op_errno = errno; + if (op_ret != 0) { + gf_log (this->name, + GF_LOG_ERROR, + "failed to unlink the previously created symlink (%s)", + strerror (op_errno)); + } + op_ret = -1; + op_errno = ENOENT; +out: + if (bctx) { + /* NOTE: bctx_unref always returns success, + * see description of bctx_unref for more details */ + bctx_unref (bctx); + } + + frame->root->rsp_refs = NULL; + STACK_UNWIND (frame, op_ret, op_errno, loc->inode, &stbuf); + + return 0; +} /* bdb_symlink */ + +int32_t +bdb_chmod (call_frame_t *frame, + xlator_t *this, + loc_t *loc, + mode_t mode) +{ + int32_t op_ret = -1; + int32_t op_errno = EINVAL; + char *real_path = NULL; + struct stat stbuf = {0,}; + + GF_VALIDATE_OR_GOTO ("bdb", frame, out); + GF_VALIDATE_OR_GOTO ("bdb", this, out); + GF_VALIDATE_OR_GOTO (this->name, loc, out); + + MAKE_REAL_PATH (real_path, this, loc->path); + op_ret = lstat (real_path, &stbuf); + op_errno = errno; + if (op_ret != 0) { + gf_log (this->name, GF_LOG_ERROR, + "failed to lstat on %s (%s)", + real_path, strerror (op_errno)); + goto out; + } + + /* directory or 
symlink */ + op_ret = chmod (real_path, mode); + op_errno = errno; + +out: + frame->root->rsp_refs = NULL; + STACK_UNWIND (frame, op_ret, op_errno, &stbuf); + + return 0; +}/* bdb_chmod */ + + +int32_t +bdb_chown (call_frame_t *frame, + xlator_t *this, + loc_t *loc, + uid_t uid, + gid_t gid) +{ + int32_t op_ret = -1; + int32_t op_errno = EINVAL; + char *real_path = NULL; + struct stat stbuf = {0,}; + + GF_VALIDATE_OR_GOTO ("bdb", frame, out); + GF_VALIDATE_OR_GOTO ("bdb", this, out); + GF_VALIDATE_OR_GOTO (this->name, loc, out); + + MAKE_REAL_PATH (real_path, this, loc->path); + op_ret = lstat (real_path, &stbuf); + op_errno = errno; + if (op_ret != 0) { + gf_log (this->name, GF_LOG_ERROR, + "failed to lstat on %s (%s)", + real_path, strerror (op_errno)); + goto out; + } + + /* directory or symlink */ + op_ret = lchown (real_path, uid, gid); + op_errno = errno; +out: + frame->root->rsp_refs = NULL; + STACK_UNWIND (frame, op_ret, op_errno, &stbuf); + + return 0; +}/* bdb_chown */ + + +int32_t +bdb_truncate (call_frame_t *frame, + xlator_t *this, + loc_t *loc, + off_t offset) +{ + int32_t op_ret = -1; + int32_t op_errno = EINVAL; + char *real_path = NULL; + struct stat stbuf = {0,}; + char *db_path = NULL; + bctx_t *bctx = NULL; + char *key_string = NULL; + + GF_VALIDATE_OR_GOTO ("bdb", frame, out); + GF_VALIDATE_OR_GOTO ("bdb", this, out); + GF_VALIDATE_OR_GOTO (this->name, loc, out); + + bctx = bctx_parent (B_TABLE(this), loc->path); + op_errno = ENOENT; + GF_VALIDATE_OR_GOTO (this->name, bctx, out); + + MAKE_REAL_PATH (real_path, this, loc->path); + MAKE_KEY_FROM_PATH (key_string, loc->path); + + /* now truncate */ + MAKE_REAL_PATH_TO_STORAGE_DB (db_path, this, bctx->directory); + op_ret = lstat (db_path, &stbuf); + op_errno = errno; + if (op_ret != 0) { + gf_log (this->name, GF_LOG_ERROR, + "failed to lstat on %s (%s)", + db_path, strerror (op_errno)); + goto out; + } + + if (loc->inode->ino) { + stbuf.st_ino = loc->inode->ino; + }else { + stbuf.st_ino = 
bdb_inode_transform (stbuf.st_ino, bctx); + } + + op_ret = bdb_db_put (bctx, NULL, key_string, NULL, 0, 1, 0); + if (op_ret == -1) { + gf_log (this->name, + GF_LOG_DEBUG, + "failed to do bdb_db_put: %s", + db_strerror (op_ret)); + op_ret = -1; + op_errno = EINVAL; /* TODO: better errno */ + } + +out: + if (bctx) { + /* NOTE: bctx_unref always returns success, + * see description of bctx_unref for more details */ + bctx_unref (bctx); + } + + frame->root->rsp_refs = NULL; + STACK_UNWIND (frame, op_ret, op_errno, &stbuf); + + return 0; +}/* bdb_truncate */ + + +int32_t +bdb_utimens (call_frame_t *frame, + xlator_t *this, + loc_t *loc, + struct timespec ts[2]) +{ + int32_t op_ret = -1; + int32_t op_errno = EPERM; + char *real_path = NULL; + struct stat stbuf = {0,}; + struct timeval tv[2] = {{0,},}; + + GF_VALIDATE_OR_GOTO ("bdb", frame, out); + GF_VALIDATE_OR_GOTO ("bdb", this, out); + GF_VALIDATE_OR_GOTO (this->name, loc, out); + + MAKE_REAL_PATH (real_path, this, loc->path); + op_ret = lstat (real_path, &stbuf); + op_errno = errno; + if (op_ret != 0) { + op_errno = EPERM; + gf_log (this->name, GF_LOG_ERROR, + "failed to lstat on %s (%s)", + real_path, strerror (op_errno)); + goto out; + } + + /* directory or symlink */ + tv[0].tv_sec = ts[0].tv_sec; + tv[0].tv_usec = ts[0].tv_nsec / 1000; + tv[1].tv_sec = ts[1].tv_sec; + tv[1].tv_usec = ts[1].tv_nsec / 1000; + + op_ret = lutimes (real_path, tv); + if (op_ret == -1 && errno == ENOSYS) { + op_ret = utimes (real_path, tv); + } + op_errno = errno; + if (op_ret == -1) { + gf_log (this->name, + GF_LOG_WARNING, + "utimes on %s failed: %s", + loc->path, strerror (op_errno)); + goto out; + } + + op_ret = lstat (real_path, &stbuf); + op_errno = errno; + if (op_ret != 0) { + gf_log (this->name, GF_LOG_ERROR, + "failed to lstat on %s (%s)", + real_path, strerror (op_errno)); + goto out; + } + + stbuf.st_ino = loc->inode->ino; + +out: + frame->root->rsp_refs = NULL; + STACK_UNWIND (frame, op_ret, op_errno, &stbuf); + + return 0; 
+}/* bdb_utimens */ + +int32_t +bdb_statfs (call_frame_t *frame, + xlator_t *this, + loc_t *loc) + +{ + int32_t op_ret = -1; + int32_t op_errno = EINVAL; + char *real_path = NULL; + struct statvfs buf = {0, }; + + GF_VALIDATE_OR_GOTO ("bdb", frame, out); + GF_VALIDATE_OR_GOTO ("bdb", this, out); + GF_VALIDATE_OR_GOTO (this->name, loc, out); + + MAKE_REAL_PATH (real_path, this, loc->path); + + op_ret = statvfs (real_path, &buf); + op_errno = errno; +out: + frame->root->rsp_refs = NULL; + STACK_UNWIND (frame, op_ret, op_errno, &buf); + return 0; +}/* bdb_statfs */ + +static int gf_bdb_xattr_log; + +/* bdb_setxattr - set extended attributes. + * + * bdb allows setxattr operation only on directories. + * bdb reservers 'glusterfs.file.<attribute-name>' to operate on the content of the files + * under the specified directory. 'glusterfs.file.<attribute-name>' transforms to contents of + * file of name '<attribute-name>' under specified directory. + * + * @frame: call frame. + * @this: xlator_t of this instance of bdb xlator. + * @loc: loc_t specifying the file to operate upon. + * @dict: list of extended attributes to set on @loc. + * @flags: can be XATTR_REPLACE (replace an existing extended attribute only if it exists) or + * XATTR_CREATE (create an extended attribute only if it doesn't already exist). 
+ * + * + */ +int32_t +bdb_setxattr (call_frame_t *frame, + xlator_t *this, + loc_t *loc, + dict_t *dict, + int flags) +{ + int32_t op_ret = -1; + int32_t op_errno = EINVAL; + data_pair_t *trav = dict->members_list; + bctx_t *bctx = NULL; + char *real_path = NULL; + char *key = NULL; + + GF_VALIDATE_OR_GOTO ("bdb", frame, out); + GF_VALIDATE_OR_GOTO ("bdb", this, out); + GF_VALIDATE_OR_GOTO (this->name, loc, out); + GF_VALIDATE_OR_GOTO (this->name, dict, out); + + MAKE_REAL_PATH (real_path, this, loc->path); + if (!S_ISDIR (loc->inode->st_mode)) { + op_ret = -1; + op_errno = EPERM; + goto out; + } + + while (trav) { + if (ZR_FILE_CONTENT_REQUEST(trav->key) ) { + bctx = bctx_lookup (B_TABLE(this), loc->path); + op_errno = EINVAL; + GF_VALIDATE_OR_GOTO (this->name, bctx, out); + + key = &(trav->key[15]); + + if (flags & XATTR_REPLACE) { + /* replace only if previously exists, otherwise error out */ + op_ret = bdb_db_get (bctx, NULL, key, + NULL, 0, 0); + if (op_ret == -1) { + /* key doesn't exist in database */ + gf_log (this->name, + GF_LOG_DEBUG, + "cannot XATTR_REPLACE, xattr %s doesn't exist " + "on path %s", key, loc->path); + op_ret = -1; + op_errno = ENOENT; + break; + } + op_ret = bdb_db_put (bctx, NULL, + key, trav->value->data, + trav->value->len, + op_ret, BDB_TRUNCATE_RECORD); + if (op_ret != 0) { + op_ret = -1; + op_errno = EINVAL; + break; + } + } else { + /* fresh create */ + op_ret = bdb_db_put (bctx, NULL, key, + trav->value->data, + trav->value->len, + 0, 0); + if (op_ret != 0) { + op_ret = -1; + op_errno = EINVAL; + break; + } else { + op_ret = 0; + op_errno = 0; + } /* if(op_ret!=0)...else */ + } /* if(flags&XATTR_REPLACE)...else */ + if (bctx) { + /* NOTE: bctx_unref always returns success, + * see description of bctx_unref for more details */ + bctx_unref (bctx); + } + } else { + /* do plain setxattr */ + op_ret = lsetxattr (real_path, + trav->key, + trav->value->data, + trav->value->len, + flags); + op_errno = errno; + if ((op_ret == -1) && 
(op_errno != ENOENT)) { + if (op_errno == ENOTSUP) { + gf_bdb_xattr_log++; + if (!(gf_bdb_xattr_log % GF_UNIVERSAL_ANSWER)) { + gf_log (this->name, GF_LOG_WARNING, + "Extended Attributes support not present."\ + "Please check"); + } + } else { + gf_log (this->name, GF_LOG_DEBUG, + "setxattr failed on %s (%s)", + loc->path, strerror (op_errno)); + } + break; + } + } /* if(ZR_FILE_CONTENT_REQUEST())...else */ + trav = trav->next; + }/* while(trav) */ +out: + frame->root->rsp_refs = NULL; + + STACK_UNWIND (frame, op_ret, op_errno); + return 0; +}/* bdb_setxattr */ + + +/* bdb_gettxattr - get extended attributes. + * + * bdb allows getxattr operation only on directories. + * bdb_getxattr retrieves the whole content of the file, when glusterfs.file.<attribute-name> + * is specified. + * + * @frame: call frame. + * @this: xlator_t of this instance of bdb xlator. + * @loc: loc_t specifying the file to operate upon. + * @name: name of extended attributes to get for @loc. + * + * NOTE: see description of bdb_setxattr for details on how + * 'glusterfs.file.<attribute-name>' is handles by bdb. 
+ */ +int32_t +bdb_getxattr (call_frame_t *frame, + xlator_t *this, + loc_t *loc, + const char *name) +{ + int32_t op_ret = 0; + int32_t op_errno = 0; + dict_t *dict = NULL; + bctx_t *bctx = NULL; + char *buf = NULL; + char *key_string = NULL; + int32_t list_offset = 0; + size_t size = 0; + size_t remaining_size = 0; + char *real_path = NULL; + char key[1024] = {0,}; + char *value = NULL; + char *list = NULL; + + GF_VALIDATE_OR_GOTO ("bdb", frame, out); + GF_VALIDATE_OR_GOTO ("bdb", this, out); + GF_VALIDATE_OR_GOTO (this->name, loc, out); + GF_VALIDATE_OR_GOTO (this->name, name, out); + + dict = get_new_dict (); + GF_VALIDATE_OR_GOTO (this->name, dict, out); + + if (!S_ISDIR (loc->inode->st_mode)) { + gf_log (this->name, + GF_LOG_DEBUG, + "operation not permitted on a non-directory file: %s", loc->path); + op_ret = -1; + op_errno = ENODATA; + goto out; + } + + if (name && ZR_FILE_CONTENT_REQUEST(name)) { + bctx = bctx_lookup (B_TABLE(this), loc->path); + op_errno = EINVAL; + GF_VALIDATE_OR_GOTO (this->name, bctx, out); + + key_string = (char *)&(name[15]); + + op_ret = bdb_db_get (bctx, NULL, key_string, &buf, 0, 0); + if (op_ret == -1) { + gf_log (this->name, + GF_LOG_DEBUG, + "failed to db get on directory: %s for key: %s", + bctx->directory, name); + op_ret = -1; + op_errno = ENODATA; + goto out; + } + + dict_set (dict, (char *)name, data_from_dynptr (buf, op_ret)); + } else { + MAKE_REAL_PATH (real_path, this, loc->path); + size = llistxattr (real_path, NULL, 0); + op_errno = errno; + if (size <= 0) { + /* There are no extended attributes, send an empty dictionary */ + if (size == -1 && op_errno != ENODATA) { + if (op_errno == ENOTSUP) { + gf_bdb_xattr_log++; + if (!(gf_bdb_xattr_log % GF_UNIVERSAL_ANSWER)) + gf_log (this->name, + GF_LOG_WARNING, + "Extended Attributes support not present."\ + "Please check"); + } else { + gf_log (this->name, + GF_LOG_WARNING, + "llistxattr failed on %s (%s)", + loc->path, strerror (op_errno)); + } + } + op_ret = -1; + 
op_errno = ENODATA; + } else { + list = alloca (size + 1); + op_errno = ENOMEM; + GF_VALIDATE_OR_GOTO (this->name, list, out); + + size = llistxattr (real_path, list, size); + op_ret = size; + op_errno = errno; + if (size == -1) { + gf_log (this->name, + GF_LOG_ERROR, + "llistxattr failed on %s (%s)", + loc->path, strerror (errno)); + goto out; + } + remaining_size = size; + list_offset = 0; + while (remaining_size > 0) { + if(*(list+list_offset) == '\0') + break; + strcpy (key, list + list_offset); + op_ret = lgetxattr (real_path, key, NULL, 0); + if (op_ret == -1) + break; + value = CALLOC (op_ret + 1, sizeof(char)); + GF_VALIDATE_OR_GOTO (this->name, value, out); + + op_ret = lgetxattr (real_path, key, value, op_ret); + if (op_ret == -1) + break; + value [op_ret] = '\0'; + dict_set (dict, key, data_from_dynptr (value, op_ret)); + remaining_size -= strlen (key) + 1; + list_offset += strlen (key) + 1; + } /* while(remaining_size>0) */ + } /* if(size <= 0)...else */ + } /* if(name...)...else */ + +out: + if(bctx) { + /* NOTE: bctx_unref always returns success, + * see description of bctx_unref for more details */ + bctx_unref (bctx); + } + + if (dict) + dict_ref (dict); + + STACK_UNWIND (frame, op_ret, op_errno, dict); + + if (dict) + dict_unref (dict); + + return 0; +}/* bdb_getxattr */ + + +int32_t +bdb_removexattr (call_frame_t *frame, + xlator_t *this, + loc_t *loc, + const char *name) +{ + int32_t op_ret = -1; + int32_t op_errno = EINVAL; + bctx_t *bctx = NULL; + char *real_path = NULL; + + GF_VALIDATE_OR_GOTO ("bdb", frame, out); + GF_VALIDATE_OR_GOTO ("bdb", this, out); + GF_VALIDATE_OR_GOTO (this->name, loc, out); + GF_VALIDATE_OR_GOTO (this->name, name, out); + + if (!S_ISDIR(loc->inode->st_mode)) { + gf_log (this->name, + GF_LOG_WARNING, + "operation not permitted on non-directory files"); + op_ret = -1; + op_errno = EPERM; + goto out; + } + + if (ZR_FILE_CONTENT_REQUEST(name)) { + bctx = bctx_lookup (B_TABLE(this), loc->path); + op_errno = EINVAL; + 
GF_VALIDATE_OR_GOTO (this->name, bctx, out); + + op_ret = bdb_db_del (bctx, NULL, name); + if (op_ret == -1) { + gf_log (this->name, + GF_LOG_ERROR, + "failed to delete %s from db of %s directory", + name, loc->path); + op_errno = EINVAL; /* TODO: errno */ + goto out; + } + } else { + MAKE_REAL_PATH(real_path, this, loc->path); + op_ret = lremovexattr (real_path, name); + op_errno = errno; + if (op_ret == -1) { + if (op_errno == ENOTSUP) { + gf_bdb_xattr_log++; + if (!(gf_bdb_xattr_log % GF_UNIVERSAL_ANSWER)) + gf_log (this->name, GF_LOG_WARNING, + "Extended Attributes support not present." + "Please check"); + } else { + gf_log (this->name, + GF_LOG_WARNING, + "%s: %s", + loc->path, strerror (op_errno)); + } + } /* if(op_ret == -1) */ + } /* if (ZR_FILE_CONTENT_REQUEST(name))...else */ + +out: + if (bctx) { + /* NOTE: bctx_unref always returns success, + * see description of bctx_unref for more details */ + bctx_unref (bctx); + } + + frame->root->rsp_refs = NULL; + STACK_UNWIND (frame, op_ret, op_errno); + return 0; +}/* bdb_removexattr */ + + +int32_t +bdb_fsyncdir (call_frame_t *frame, + xlator_t *this, + fd_t *fd, + int datasync) +{ + int32_t op_ret = -1; + int32_t op_errno = EINVAL; + struct bdb_fd *bfd = NULL; + + GF_VALIDATE_OR_GOTO ("bdb", frame, out); + GF_VALIDATE_OR_GOTO ("bdb", this, out); + GF_VALIDATE_OR_GOTO (this->name, fd, out); + + frame->root->rsp_refs = NULL; + + bfd = bdb_extract_bfd (fd, this); + op_errno = EBADFD; + GF_VALIDATE_OR_GOTO (this->name, bfd, out); + +out: + STACK_UNWIND (frame, op_ret, op_errno); + + return 0; +}/* bdb_fsycndir */ + + +int32_t +bdb_access (call_frame_t *frame, + xlator_t *this, + loc_t *loc, + int32_t mask) +{ + int32_t op_ret = -1; + int32_t op_errno = EINVAL; + char *real_path = NULL; + + GF_VALIDATE_OR_GOTO ("bdb", frame, out); + GF_VALIDATE_OR_GOTO ("bdb", this, out); + GF_VALIDATE_OR_GOTO (this->name, loc, out); + + MAKE_REAL_PATH (real_path, this, loc->path); + + op_ret = access (real_path, mask); + op_errno 
= errno; + /* TODO: implement for db entries */ +out: + frame->root->rsp_refs = NULL; + STACK_UNWIND (frame, op_ret, op_errno); + return 0; +}/* bdb_access */ + + +int32_t +bdb_ftruncate (call_frame_t *frame, + xlator_t *this, + fd_t *fd, + off_t offset) +{ + int32_t op_ret = -1; + int32_t op_errno = EPERM; + struct stat buf = {0,}; + + GF_VALIDATE_OR_GOTO ("bdb", frame, out); + GF_VALIDATE_OR_GOTO ("bdb", this, out); + GF_VALIDATE_OR_GOTO (this->name, fd, out); + /* TODO: impelement */ +out: + frame->root->rsp_refs = NULL; + STACK_UNWIND (frame, op_ret, op_errno, &buf); + + return 0; +} + +int32_t +bdb_fchown (call_frame_t *frame, + xlator_t *this, + fd_t *fd, + uid_t uid, + gid_t gid) +{ + int32_t op_ret = -1; + int32_t op_errno = EPERM; + struct stat buf = {0,}; + + GF_VALIDATE_OR_GOTO ("bdb", frame, out); + GF_VALIDATE_OR_GOTO ("bdb", this, out); + GF_VALIDATE_OR_GOTO (this->name, fd, out); + + /* TODO: implement */ +out: + STACK_UNWIND (frame, op_ret, op_errno, &buf); + + return 0; +} + + +int32_t +bdb_fchmod (call_frame_t *frame, + xlator_t *this, + fd_t *fd, + mode_t mode) +{ + int32_t op_ret = -1; + int32_t op_errno = EPERM; + struct stat buf = {0,}; + + GF_VALIDATE_OR_GOTO ("bdb", frame, out); + GF_VALIDATE_OR_GOTO ("bdb", this, out); + GF_VALIDATE_OR_GOTO (this->name, fd, out); + + /* TODO: impelement */ +out: + frame->root->rsp_refs = NULL; + STACK_UNWIND (frame, op_ret, op_errno, &buf); + + return 0; +} + +int32_t +bdb_setdents (call_frame_t *frame, + xlator_t *this, + fd_t *fd, + int32_t flags, + dir_entry_t *entries, + int32_t count) +{ + int32_t op_ret = -1, op_errno = EINVAL; + char *entry_path = NULL; + int32_t real_path_len = 0; + int32_t entry_path_len = 0; + int32_t ret = 0; + struct bdb_dir *bfd = NULL; + dir_entry_t *trav = NULL; + + GF_VALIDATE_OR_GOTO ("bdb", frame, out); + GF_VALIDATE_OR_GOTO ("bdb", this, out); + GF_VALIDATE_OR_GOTO (this->name, fd, out); + GF_VALIDATE_OR_GOTO (this->name, entries, out); + + frame->root->rsp_refs = NULL; + 
+ bfd = bdb_extract_bfd (fd, this); + op_errno = EBADFD; + GF_VALIDATE_OR_GOTO (this->name, bfd, out); + + real_path_len = strlen (bfd->path); + entry_path_len = real_path_len + 256; + entry_path = CALLOC (1, entry_path_len); + GF_VALIDATE_OR_GOTO (this->name, entry_path, out); + + strcpy (entry_path, bfd->path); + entry_path[real_path_len] = '/'; + + trav = entries->next; + while (trav) { + char pathname[ZR_PATH_MAX] = {0,}; + strcpy (pathname, entry_path); + strcat (pathname, trav->name); + + if (S_ISDIR(trav->buf.st_mode)) { + /* If the entry is directory, create it by calling 'mkdir'. If + * directory is not present, it will be created, if its present, + * no worries even if it fails. + */ + ret = mkdir (pathname, trav->buf.st_mode); + if ((ret == -1) && (errno != EEXIST)) { + gf_log (this->name, + GF_LOG_ERROR, + "failed to created directory %s: %s", + pathname, strerror(errno)); + goto loop; + } + + gf_log (this->name, + GF_LOG_DEBUG, + "Creating directory %s with mode (0%o)", + pathname, + trav->buf.st_mode); + /* Change the mode + * NOTE: setdents tries its best to restore the state + * of storage. 
if chmod and chown fail, they can be + * ignored now */ + ret = chmod (pathname, trav->buf.st_mode); + if (ret != 0) { + op_ret = -1; + op_errno = errno; + gf_log (this->name, + GF_LOG_ERROR, + "chmod failed on %s (%s)", + pathname, strerror (errno)); + goto loop; + } + /* change the ownership */ + ret = chown (pathname, trav->buf.st_uid, trav->buf.st_gid); + if (ret != 0) { + op_ret = -1; + op_errno = errno; + gf_log (this->name, + GF_LOG_ERROR, + "chown failed on %s (%s)", + pathname, strerror (errno)); + goto loop; + } + } else if ((flags == GF_SET_IF_NOT_PRESENT) || + (flags != GF_SET_DIR_ONLY)) { + /* Create a 0 byte file here */ + if (S_ISREG (trav->buf.st_mode)) { + op_ret = bdb_db_put (bfd->ctx, NULL, + trav->name, NULL, 0, 0, 0); + if (op_ret != 0) { + /* create successful */ + gf_log (this->name, + GF_LOG_ERROR, + "failed to create file %s", + pathname); + } /* if (!op_ret)...else */ + } else if (S_ISLNK (trav->buf.st_mode)) { + /* TODO: impelement */; + } else { + gf_log (this->name, + GF_LOG_ERROR, + "storage/bdb allows to create regular files only" + "file %s (mode = %d) cannot be created", + pathname, trav->buf.st_mode); + } /* if(S_ISREG())...else */ + } /* if(S_ISDIR())...else if */ + loop: + /* consider the next entry */ + trav = trav->next; + } /* while(trav) */ + +out: + frame->root->rsp_refs = NULL; + STACK_UNWIND (frame, op_ret, op_errno); + + FREE (entry_path); + return 0; +} + +int32_t +bdb_fstat (call_frame_t *frame, + xlator_t *this, + fd_t *fd) +{ + int32_t op_ret = -1; + int32_t op_errno = EINVAL; + struct stat stbuf = {0,}; + struct bdb_fd *bfd = NULL; + bctx_t *bctx = NULL; + char *db_path = NULL; + + GF_VALIDATE_OR_GOTO ("bdb", frame, out); + GF_VALIDATE_OR_GOTO ("bdb", this, out); + GF_VALIDATE_OR_GOTO (this->name, fd, out); + + bfd = bdb_extract_bfd (fd, this); + op_errno = EBADFD; + GF_VALIDATE_OR_GOTO (this->name, bfd, out); + + bctx = bfd->ctx; + + MAKE_REAL_PATH_TO_STORAGE_DB (db_path, this, bctx->directory); + op_ret = lstat 
(db_path, &stbuf); + op_errno = errno; + if (op_ret != 0) { + gf_log (this->name, GF_LOG_ERROR, + "failed to lstat on %s (%s)", + db_path, strerror (op_errno)); + goto out; + } + + stbuf.st_ino = fd->inode->ino; + stbuf.st_size = bdb_db_get (bctx, NULL, bfd->key, NULL, 0, 0); + stbuf.st_blocks = BDB_COUNT_BLOCKS (stbuf.st_size, stbuf.st_blksize); + +out: + frame->root->rsp_refs = NULL; + + STACK_UNWIND (frame, op_ret, op_errno, &stbuf); + return 0; +} + + +int32_t +bdb_readdir (call_frame_t *frame, + xlator_t *this, + fd_t *fd, + size_t size, + off_t off) +{ + struct bdb_dir *bfd = NULL; + int32_t op_ret = -1; + int32_t op_errno = EINVAL; + size_t filled = 0; + gf_dirent_t *this_entry = NULL; + gf_dirent_t entries; + struct dirent *entry = NULL; + off_t in_case = 0; + int32_t this_size = 0; + DBC *cursorp = NULL; + int32_t count = 0; + + GF_VALIDATE_OR_GOTO ("bdb", frame, out); + GF_VALIDATE_OR_GOTO ("bdb", this, out); + GF_VALIDATE_OR_GOTO (this->name, fd, out); + + INIT_LIST_HEAD (&entries.list); + + bfd = bdb_extract_bfd (fd, this); + op_errno = EBADFD; + GF_VALIDATE_OR_GOTO (this->name, bfd, out); + + op_errno = ENOMEM; + + while (filled <= size) { + this_entry = NULL; + entry = NULL; + in_case = 0; + this_size = 0; + + in_case = telldir (bfd->dir); + entry = readdir (bfd->dir); + if (!entry) + break; + + if (IS_BDB_PRIVATE_FILE(entry->d_name)) + continue; + + this_size = dirent_size (entry); + + if (this_size + filled > size) { + seekdir (bfd->dir, in_case); + break; + } + + count++; + + this_entry = gf_dirent_for_name (entry->d_name); + this_entry->d_ino = entry->d_ino; + + this_entry->d_off = -1; + + this_entry->d_type = entry->d_type; + this_entry->d_len = entry->d_reclen; + + + list_add (&this_entry->list, &entries.list); + + filled += this_size; + } + op_ret = filled; + op_errno = 0; + if (filled >= size) { + goto out; + } + + /* hungry kyaa? 
*/ + op_ret = bdb_cursor_open (bfd->ctx, &cursorp); + op_errno = EBADFD; + GF_VALIDATE_OR_GOTO (this->name, (op_ret == 0), out); + + /* TODO: fix d_off, don't use bfd->offset. wrong method */ + if (strlen (bfd->offset)) { + DBT key = {0,}, value = {0,}; + key.data = bfd->offset; + key.size = strlen (bfd->offset); + key.flags = DB_DBT_USERMEM; + value.dlen = 0; + value.doff = 0; + value.flags = DB_DBT_PARTIAL; + + op_ret = bdb_cursor_get (cursorp, &key, &value, DB_SET); + op_errno = EBADFD; + GF_VALIDATE_OR_GOTO (this->name, (op_ret == 0), out); + + } else { + /* first time or last time, do nothing */ + } + + while (filled <= size) { + DBT key = {0,}, value = {0,}; + this_entry = NULL; + + key.flags = DB_DBT_MALLOC; + value.dlen = 0; + value.doff = 0; + value.flags = DB_DBT_PARTIAL; + op_ret = bdb_cursor_get (cursorp, &key, &value, DB_NEXT); + + if (op_ret == DB_NOTFOUND) { + /* we reached end of the directory */ + op_ret = 0; + op_errno = 0; + break; + } else if (op_ret != 0) { + gf_log (this->name, + GF_LOG_DEBUG, + "database error during readdir"); + op_ret = -1; + op_errno = ENOENT; + break; + } /* if (op_ret == DB_NOTFOUND)...else if...else */ + + if (key.data == NULL) { + /* NOTE: currently ignore when we get key.data == NULL. 
+ * TODO: we should not get key.data = NULL */ + gf_log (this->name, + GF_LOG_DEBUG, + "null key read from db"); + continue; + }/* if(key.data)...else */ + count++; + this_size = bdb_dirent_size (&key); + if (this_size + filled > size) + break; + /* TODO - consider endianness here */ + this_entry = gf_dirent_for_name ((const char *)key.data); + /* FIXME: bug, if someone is going to use ->d_ino */ + this_entry->d_ino = -1; + this_entry->d_off = 0; + this_entry->d_type = 0; + this_entry->d_len = key.size; + + if (key.data) { + strncpy (bfd->offset, key.data, key.size); + bfd->offset [key.size] = '\0'; + free (key.data); + } + + list_add (&this_entry->list, &entries.list); + + filled += this_size; + }/* while */ + bdb_cursor_close (bfd->ctx, cursorp); + op_ret = filled; + op_errno = 0; +out: + frame->root->rsp_refs = NULL; + gf_log (this->name, + GF_LOG_DEBUG, + "read %"GF_PRI_SIZET" bytes for %d entries", filled, count); + STACK_UNWIND (frame, count, op_errno, &entries); + + gf_dirent_free (&entries); + + return 0; +} + + +int32_t +bdb_stats (call_frame_t *frame, + xlator_t *this, + int32_t flags) + +{ + int32_t op_ret = 0; + int32_t op_errno = 0; + + struct xlator_stats xlstats = {0, }, *stats = NULL; + struct statvfs buf; + struct timeval tv; + struct bdb_private *private = NULL; + int64_t avg_read = 0; + int64_t avg_write = 0; + int64_t _time_ms = 0; + + GF_VALIDATE_OR_GOTO ("bdb", frame, out); + GF_VALIDATE_OR_GOTO ("bdb", this, out); + + private = (struct bdb_private *)(this->private); + stats = &xlstats; + + op_ret = statvfs (private->export_path, &buf); + op_errno = errno; + if (op_ret != 0) { + gf_log (this->name, GF_LOG_ERROR, + "failed to statvfs on %s (%s)", + private->export_path, strerror (op_errno)); + goto out; + } + + stats->nr_files = private->stats.nr_files; + stats->nr_clients = private->stats.nr_clients; /* client info is maintained at FSd */ + stats->free_disk = buf.f_bfree * buf.f_bsize; /* Number of Free block in the filesystem. 
*/ + stats->total_disk_size = buf.f_blocks * buf.f_bsize; /* */ + stats->disk_usage = (buf.f_blocks - buf.f_bavail) * buf.f_bsize; + + /* Calculate read and write usage */ + gettimeofday (&tv, NULL); + + /* Read */ + _time_ms = (tv.tv_sec - private->init_time.tv_sec) * 1000 + + ((tv.tv_usec - private->init_time.tv_usec) / 1000); + + avg_read = (_time_ms) ? (private->read_value / _time_ms) : 0; /* KBps */ + avg_write = (_time_ms) ? (private->write_value / _time_ms) : 0; /* KBps */ + + _time_ms = (tv.tv_sec - private->prev_fetch_time.tv_sec) * 1000 + + ((tv.tv_usec - private->prev_fetch_time.tv_usec) / 1000); + if (_time_ms && ((private->interval_read / _time_ms) > private->max_read)) { + private->max_read = (private->interval_read / _time_ms); + } + if (_time_ms && ((private->interval_write / _time_ms) > private->max_write)) { + private->max_write = private->interval_write / _time_ms; + } + + stats->read_usage = avg_read / private->max_read; + stats->write_usage = avg_write / private->max_write; + + gettimeofday (&(private->prev_fetch_time), NULL); + private->interval_read = 0; + private->interval_write = 0; + +out: + frame->root->rsp_refs = NULL; + STACK_UNWIND (frame, op_ret, op_errno, stats); + return 0; +} + + +int32_t +bdb_inodelk (call_frame_t *frame, xlator_t *this, + loc_t *loc, int32_t cmd, struct flock *lock) +{ + frame->root->rsp_refs = NULL; + + gf_log (this->name, GF_LOG_CRITICAL, + "\"features/posix-locks\" translator is not loaded. You need to use it for proper functioning of GlusterFS"); + + STACK_UNWIND (frame, -1, ENOSYS); + return 0; +} + + +int32_t +bdb_finodelk (call_frame_t *frame, xlator_t *this, + fd_t *fd, int32_t cmd, struct flock *lock) +{ + frame->root->rsp_refs = NULL; + + gf_log (this->name, GF_LOG_CRITICAL, + "\"features/posix-locks\" translator is not loaded. 
You need to use it for proper functioning of GlusterFS"); + + STACK_UNWIND (frame, -1, ENOSYS); + return 0; +} + + +int32_t +bdb_entrylk (call_frame_t *frame, xlator_t *this, + loc_t *loc, const char *basename, entrylk_cmd cmd, + entrylk_type type) +{ + frame->root->rsp_refs = NULL; + + gf_log (this->name, GF_LOG_CRITICAL, + "\"features/posix-locks\" translator is not loaded. You need to use it for proper functioning of GlusterFS"); + + STACK_UNWIND (frame, -1, ENOSYS); + return 0; +} + + +int32_t +bdb_fentrylk (call_frame_t *frame, xlator_t *this, + fd_t *fd, const char *basename, entrylk_cmd cmd, + entrylk_type type) +{ + frame->root->rsp_refs = NULL; + + gf_log (this->name, GF_LOG_CRITICAL, + "\"features/posix-locks\" translator is not loaded. You need to use it for proper functioning of GlusterFS"); + + STACK_UNWIND (frame, -1, ENOSYS); + return 0; +} + + +int32_t +bdb_checksum (call_frame_t *frame, + xlator_t *this, + loc_t *loc, + int32_t flag) +{ + char *real_path = NULL; + DIR *dir = NULL; + struct dirent *dirent = NULL; + uint8_t file_checksum[ZR_FILENAME_MAX] = {0,}; + uint8_t dir_checksum[ZR_FILENAME_MAX] = {0,}; + int32_t op_ret = -1; + int32_t op_errno = EINVAL; + int32_t i = 0, length = 0; + bctx_t *bctx = NULL; + DBC *cursorp = NULL; + char *data = NULL; + + GF_VALIDATE_OR_GOTO ("bdb", frame, out); + GF_VALIDATE_OR_GOTO ("bdb", this, out); + GF_VALIDATE_OR_GOTO (this->name, loc, out); + + MAKE_REAL_PATH (real_path, this, loc->path); + + { + dir = opendir (real_path); + op_errno = errno; + GF_VALIDATE_OR_GOTO (this->name, dir, out); + while ((dirent = readdir (dir))) { + if (!dirent) + break; + + if (IS_BDB_PRIVATE_FILE(dirent->d_name)) + continue; + + length = strlen (dirent->d_name); + for (i = 0; i < length; i++) + dir_checksum[i] ^= dirent->d_name[i]; + } /* while((dirent...)) */ + closedir (dir); + } + + { + bctx = bctx_lookup (B_TABLE(this), (char *)loc->path); + op_errno = EINVAL; + GF_VALIDATE_OR_GOTO (this->name, bctx, out); + + op_ret = 
bdb_cursor_open (bctx, &cursorp); + op_errno = EINVAL; + GF_VALIDATE_OR_GOTO (this->name, (op_ret == 0), out); + + while (1) { + DBT key = {0,}, value = {0,}; + + key.flags = DB_DBT_MALLOC; + value.doff = 0; + value.dlen = 0; + op_ret = bdb_cursor_get (cursorp, &key, &value, DB_NEXT); + + if (op_ret == DB_NOTFOUND) { + gf_log (this->name, + GF_LOG_DEBUG, + "end of list of key/value pair in db for " + "directory: %s", bctx->directory); + op_ret = 0; + op_errno = 0; + break; + } else if (op_ret == 0){ + /* successfully read */ + data = key.data; + length = key.size; + for (i = 0; i < length; i++) + file_checksum[i] ^= data[i]; + + free (key.data); + } else { + gf_log (this->name, + GF_LOG_ERROR, + "failed to do cursor get for directory %s: %s", + bctx->directory, db_strerror (op_ret)); + op_ret = -1; + op_errno = ENOENT; + break; + }/* if(op_ret == DB_NOTFOUND)...else if...else */ + } /* while(1) */ + bdb_cursor_close (bctx, cursorp); + } +out: + if (bctx) { + /* NOTE: bctx_unref always returns success, + * see description of bctx_unref for more details */ + bctx_unref (bctx); + } + + frame->root->rsp_refs = NULL; + STACK_UNWIND (frame, op_ret, op_errno, file_checksum, dir_checksum); + + return 0; +} + +/** + * notify - when parent sends PARENT_UP, send CHILD_UP event from here + */ +int32_t +notify (xlator_t *this, + int32_t event, + void *data, + ...) 
+{ + switch (event) + { + case GF_EVENT_PARENT_UP: + { + /* Tell the parent that bdb xlator is up */ + assert ((this->private != NULL) && + (BDB_ENV(this) != NULL)); + default_notify (this, GF_EVENT_CHILD_UP, data); + } + break; + default: + /* */ + break; + } + return 0; +} + + + +/** + * init - + */ +int32_t +init (xlator_t *this) +{ + int32_t ret = -1; + struct stat buf = {0,}; + struct bdb_private *_private = NULL; + data_t *directory = NULL; + bctx_t *bctx = NULL; + + GF_VALIDATE_OR_GOTO ("bdb", this, out); + + _private = CALLOC (1, sizeof (*_private)); + GF_VALIDATE_OR_GOTO (this->name, _private, out); + + if (this->children) { + gf_log (this->name, + GF_LOG_ERROR, + "FATAL: storage/bdb cannot have subvolumes"); + FREE (_private); + goto out;; + } + + if (!this->parents) { + gf_log (this->name, GF_LOG_WARNING, + "dangling volume. check volfile "); + } + + directory = dict_get (this->options, "directory"); + if (!directory) { + gf_log (this->name, GF_LOG_ERROR, + "export directory not specified in volfile"); + FREE (_private); + goto out; + } + umask (000); // umask `masking' is done at the client side + /* // * No need to create directory, sys admin should do it himself + if (mkdir (directory->data, 0777) == 0) { + gf_log (this->name, GF_LOG_WARNING, + "directory specified not exists, created"); + } + */ + + /* Check whether the specified directory exists, if not create it. 
*/ + ret = stat (directory->data, &buf); + if ((ret != 0) || !S_ISDIR (buf.st_mode)) { + gf_log (this->name, GF_LOG_ERROR, + "specified directory '%s' doesn't exists, Exiting", directory->data); + FREE (_private); + goto out; + } else { + ret = 0; + } + + + _private->export_path = strdup (directory->data); + _private->export_path_length = strlen (_private->export_path); + + { + /* Stats related variables */ + gettimeofday (&_private->init_time, NULL); + gettimeofday (&_private->prev_fetch_time, NULL); + _private->max_read = 1; + _private->max_write = 1; + } + + this->private = (void *)_private; + { + ret = bdb_db_init (this, this->options); + + if (ret == -1){ + gf_log (this->name, + GF_LOG_DEBUG, + "failed to initialize database"); + goto out; + } else { + bctx = bctx_lookup (_private->b_table, "/"); + /* NOTE: we are not doing bctx_unref() for root bctx, + * let it remain in active list forever */ + if (!bctx) { + gf_log (this->name, + GF_LOG_ERROR, + "failed to allocate memory for root (/) bctx: out of memory"); + goto out; + } else { + ret = 0; + } + } + } +out: + return ret; +} + +void +bctx_cleanup (struct list_head *head) +{ + bctx_t *trav = NULL; + bctx_t *tmp = NULL; + DB *storage = NULL; + + list_for_each_entry_safe (trav, tmp, head, list) { + LOCK (&trav->lock); + storage = trav->dbp; + trav->dbp = NULL; + list_del_init (&trav->list); + UNLOCK (&trav->lock); + + if (storage) { + storage->close (storage, 0); + storage = NULL; + } + } + return; +} + +void +fini (xlator_t *this) +{ + struct bdb_private *private = NULL; + int32_t idx = 0; + int32_t ret = 0; + private = this->private; + + if (B_TABLE(this)) { + /* close all the dbs from lru list */ + bctx_cleanup (&(B_TABLE(this)->b_lru)); + for (idx = 0; idx < B_TABLE(this)->hash_size; idx++) + bctx_cleanup (&(B_TABLE(this)->b_hash[idx])); + + if (BDB_ENV(this)) { + LOCK (&private->active_lock); + private->active = 0; + UNLOCK (&private->active_lock); + + ret = pthread_join (private->checkpoint_thread, 
NULL); + if (ret != 0) { + gf_log (this->name, + GF_LOG_CRITICAL, + "failed to join checkpoint thread"); + } + + /* TODO: pick each of the 'struct bctx' from private->b_hash + * and close all the databases that are open */ + BDB_ENV(this)->close (BDB_ENV(this), 0); + } else { + /* impossible to reach here */ + } + + FREE (B_TABLE(this)); + } + FREE (private); + return; +} + +struct xlator_mops mops = { + .stats = bdb_stats, +}; + +struct xlator_fops fops = { + .lookup = bdb_lookup, + .stat = bdb_stat, + .opendir = bdb_opendir, + .readdir = bdb_readdir, + .readlink = bdb_readlink, + .mknod = bdb_mknod, + .mkdir = bdb_mkdir, + .unlink = bdb_unlink, + .rmdir = bdb_rmdir, + .symlink = bdb_symlink, + .rename = bdb_rename, + .link = bdb_link, + .chmod = bdb_chmod, + .chown = bdb_chown, + .truncate = bdb_truncate, + .utimens = bdb_utimens, + .create = bdb_create, + .open = bdb_open, + .readv = bdb_readv, + .writev = bdb_writev, + .statfs = bdb_statfs, + .flush = bdb_flush, + .fsync = bdb_fsync, + .setxattr = bdb_setxattr, + .getxattr = bdb_getxattr, + .removexattr = bdb_removexattr, + .fsyncdir = bdb_fsyncdir, + .access = bdb_access, + .ftruncate = bdb_ftruncate, + .fstat = bdb_fstat, + .lk = bdb_lk, + .inodelk = bdb_inodelk, + .finodelk = bdb_finodelk, + .entrylk = bdb_entrylk, + .fentrylk = bdb_fentrylk, + .fchown = bdb_fchown, + .fchmod = bdb_fchmod, + .setdents = bdb_setdents, + .getdents = bdb_getdents, + .checksum = bdb_checksum, +}; + +struct xlator_cbks cbks = { + .release = bdb_release, + .releasedir = bdb_releasedir +}; + +#if 0 +struct volume_options options[] = { + { "directory", GF_OPTION_TYPE_PATH, 0, }, + { "logdir", GF_OPTION_TYPE_PATH, 0, }, + { "errfile", GF_OPTION_TYPE_PATH, 0, }, + { "dir-mode", GF_OPTION_TYPE_ANY, 0, }, // base 8 number + { "file-mode", GF_OPTION_TYPE_ANY, 0, }, // base 8 number + { "page-size", GF_OPTION_TYPE_SIZET, -1, }, + { "lru-limit", GF_OPTION_TYPE_INT, -1, }, + { "lock-timeout", GF_OPTION_TYPE_TIME, 0, }, + { 
"checkpoint-timeout", GF_OPTION_TYPE_TIME, 0, }, + { "transaction-timeout", GF_OPTION_TYPE_TIME, 0, }, + { "mode", GF_OPTION_TYPE_BOOL, 0, }, // Should be 'cache' ?? + { "access-mode", GF_OPTION_TYPE_STR, 0, 0, 0, "btree"}, + { NULL, 0, } +}; + +#endif /* #if 0 */ diff --git a/xlators/storage/bdb/src/bdb.h b/xlators/storage/bdb/src/bdb.h new file mode 100644 index 000000000..f2d962680 --- /dev/null +++ b/xlators/storage/bdb/src/bdb.h @@ -0,0 +1,439 @@ +/* + Copyright (c) 2008, 2009 Z RESEARCH, Inc. <http://www.zresearch.com> + This file is part of GlusterFS. + + GlusterFS is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published + by the Free Software Foundation; either version 3 of the License, + or (at your option) any later version. + + GlusterFS is distributed in the hope that it will be useful, but + WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program. If not, see + <http://www.gnu.org/licenses/>. 
+*/ + +#ifndef _BDB_H +#define _BDB_H + +#ifndef _CONFIG_H +#define _CONFIG_H +#include "config.h" +#endif + +#include <stdio.h> +#include <dirent.h> +#include <unistd.h> +#include <sys/types.h> +#include <dirent.h> + +#include <db.h> + +#ifdef linux +#ifdef __GLIBC__ +#include <sys/fsuid.h> +#else +#include <unistd.h> +#endif +#endif + +#ifdef HAVE_SYS_XATTR_H +#include <sys/xattr.h> +#endif + +#ifdef HAVE_SYS_EXTATTR_H +#include <sys/extattr.h> +#endif + +#include <pthread.h> +#include "xlator.h" +#include "inode.h" +#include "compat.h" +#include "compat-errno.h" + +#define GLFS_BDB_STORAGE "/glusterfs_storage.db" + +/* numbers are not so reader-friendly, so lets have ON and OFF macros */ +#define ON 1 +#define OFF 0 + +#define BDB_DEFAULT_LRU_LIMIT 100 +#define BDB_DEFAULT_HASH_SIZE 100 + +#define BDB_ENOSPC_THRESHOLD 25600 + +#define BDB_DEFAULT_CHECKPOINT_TIMEOUT 30 + +#define BCTX_ENV(bctx) (bctx->table->dbenv) +/* MAKE_REAL_PATH(var,this,path) + * make the real path on the underlying file-system + * + * @var: destination to hold the real path + * @this: pointer to xlator_t corresponding to bdb xlator + * @path: path, as seen from mount-point + */ +#define MAKE_REAL_PATH(var, this, path) do { \ + int base_len = ((struct bdb_private *)this->private)->export_path_length; \ + var = alloca (strlen (path) + base_len + 2); \ + strcpy (var, ((struct bdb_private *)this->private)->export_path); \ + strcpy (&var[base_len], path); \ + } while (0) + +/* MAKE_REAL_PATH_TO_STORAGE_DB(var,this,path) + * make the real path to the storage-database file on file-system + * + * @var: destination to hold the real path + * @this: pointer to xlator_t corresponding to bdb xlator + * @path: path of the directory, as seen from mount-point + */ +#define MAKE_REAL_PATH_TO_STORAGE_DB(var, this, path) do { \ + int base_len = ((struct bdb_private *)this->private)->export_path_length; \ + var = alloca (strlen (path) + base_len + strlen (GLFS_BDB_STORAGE) + 1); /* +1 for the terminating NUL */ \ + strcpy (var, ((struct
bdb_private *)this->private)->export_path); \ + strcpy (&var[base_len], path); \ + strcat (var, GLFS_BDB_STORAGE); \ + } while (0) + +/* MAKE_KEY_FROM_PATH(key,path) + * make a 'key', which we use as key in the underlying database by using the path + * + * @key: destination to hold the key + * @path: path to file as seen from mount-point + */ +#define MAKE_KEY_FROM_PATH(key, path) do { \ + char *tmp = alloca (strlen (path) + 1); /* +1 for the terminating NUL */ \ + strcpy (tmp, path); \ + key = basename (tmp); \ + }while (0); + +/* BDB_DO_LSTAT(path,stbuf,dirent) + * construct real-path to a dirent and do lstat on the real-path + * + * @path: path to the directory whose readdir is currently in progress + * @stbuf: a 'struct stat *' + * @dirent: a 'struct dirent *' + */ +#define BDB_DO_LSTAT(path, stbuf, dirent) do { \ + char tmp_real_path[GF_PATH_MAX]; \ + strcpy(tmp_real_path, path); \ + strcat (tmp_real_path, "/"); \ + strcat(tmp_real_path, dirent->d_name); \ + ret = lstat (tmp_real_path, stbuf); \ + } while(0); + +/* IS_BDB_PRIVATE_FILE(name) + * check if a given 'name' is bdb xlator's internal file name + * + * @name: basename of a file. + * + * bdb xlator reserves file names 'glusterfs_storage.db', + * 'glusterfs_ns.db'(used by bdb xlator itself), 'log.*', '__db.*' (used by libdb) + */ +#define IS_BDB_PRIVATE_FILE(name) ((!strncmp(name, "__db.", 5)) || \ + (!strcmp(name, "glusterfs_storage.db")) || \ + (!strcmp(name, "glusterfs_ns.db")) || \ + (!strncmp(name, "log.0000", 8))) + +/* check if 'name' is exactly the '.' or '..' entry (whole-string compare, so other dot-files do not match) */ +#define IS_DOT_DOTDOT(name) ((!strcmp(name, ".")) || (!strcmp(name, ".."))) + +/* BDB_SET_BCTX(this,inode,bctx) + * put a stamp on inode. d00d, you are using bdb.. huhaha. + * pointer to 'struct bdb_ctx' is stored in inode's ctx of all directories. + * this will happen either in lookup() or mkdir(). + * + * @this: pointer xlator_t of bdb xlator. + * @inode: inode where 'struct bdb_ctx *' has to be stored.
+ * @bctx: a 'struct bdb_ctx *' + */ +#define BDB_SET_BCTX(this,inode,bctx) do{ \ + inode_ctx_put(inode, this, (uint64_t)(long)bctx); \ + }while (0); + +/* MAKE_BCTX_FROM_INODE(this,bctx,inode) + * extract bdb xlator's 'struct bdb_ctx *' from an inode's ctx. + * valid only if done for directory inodes, otherwise bctx = NULL. + * + * @this: pointer xlator_t of bdb xlator. + * @bctx: a 'struct bdb_ctx *' + * @inode: inode from where 'struct bdb_ctx *' has to be extracted. + */ +#define MAKE_BCTX_FROM_INODE(this,bctx,inode) do{ \ + uint64_t tmp_bctx = 0; \ + inode_ctx_get (inode, this, &tmp_bctx); \ + if (ret == 0) \ + bctx = (void *)(long)tmp_bctx; \ + }while (0); + +#define BDB_SET_BFD(this,fd,bfd) do{ \ + fd_ctx_set (fd, this, (uint64_t)(long)bfd); \ + }while (0); + +/* maximum number of open dbs that bdb xlator will ever have */ +#define BDB_MAX_OPEN_DBS 100 + +/* convert file size to block-count */ +#define BDB_COUNT_BLOCKS(size,blksize) (((size + blksize - 1)/blksize) - 1) + +/* file permissions, again macros are more readable */ +#define RWXRWXRWX 0777 +#define DEFAULT_FILE_MODE 0644 +#define DEFAULT_DIR_MODE 0755 + +/* see, if have a valid file permissions specification in @mode */ +#define IS_VALID_FILE_MODE(mode) (!(mode & (~RWXRWXRWX))) +#define IS_VALID_DIR_MODE(mode) (!(mode & (~(RWXRWXRWX)))) + +/* maximum retries for a failed transactional operation */ +#define BDB_MAX_RETRIES 10 + +typedef struct bctx_table bctx_table_t; +typedef struct bdb_ctx bctx_t; +typedef struct bdb_cache bdb_cache_t; +typedef struct bdb_private bdb_private_t; + +struct bctx_table { + uint64_t dbflags; /* flags to be used for opening each database */ + uint64_t cache; /* cache: can be either ON or OFF */ + gf_lock_t lock; /* used to lock the 'struct bctx_table *' */ + gf_lock_t checkpoint_lock; /* lock for checkpointing */ + struct list_head *b_hash; /* hash table of 'struct bdb_ctx' */ + struct list_head active; /* list of active 'struct bdb_ctx' */ + struct list_head b_lru; /*
lru list of inactive 'struct bdb_ctx' */ + struct list_head purge; + uint32_t lru_limit; + uint32_t lru_size; + uint32_t hash_size; + DBTYPE access_mode; /* access mode for accessing the databases, + * can be DB_HASH, DB_BTREE */ + DB_ENV *dbenv; /* DB_ENV under which every db operation + * is carried over */ + int32_t transaction; + xlator_t *this; + + uint64_t page_size; /* page-size of DB, + * DB->set_pagesize(), should be set before DB->open */ +}; + +struct bdb_ctx { + /* controller members */ + struct list_head list; /* lru list of 'struct bdb_ctx's, + * a bdb_ctx can exist in one of b_hash or lru lists */ + struct list_head b_hash; /* directory 'name' hashed list of 'struct bdb_ctx's */ + + struct bctx_table *table; + int32_t ref; /* reference count */ + gf_lock_t lock; /* used to lock this 'struct bdb_ctx' */ + + char *directory; /* directory path */ + DB *dbp; /* pointer to open database, that resides inside this directory */ + uint32_t cache; /* cache ON or OFF */ + + /* per directory cache, bdb xlator's internal cache */ + struct list_head c_list; /* linked list of cached records */ + int32_t c_count; /* number of cached records */ + + int32_t key_hash; /* index to hash table list, to which this ctx belongs */ + char *db_path; /* absolute path to db file */ +}; + +struct bdb_fd { + struct bdb_ctx *ctx; /* pointer to bdb_ctx of the parent directory */ + char *key; /* name of the file. NOTE: basename, not the complete path */ + int32_t flags; /* open flags */ +}; + +struct bdb_dir { + struct bdb_ctx *ctx; /* pointer to bdb_ctx of this directory */ + DIR *dir; /* open directory pointer, as returned by opendir() */ + char offset[NAME_MAX]; /* FIXME: readdir offset, too crude. must go */ + char *path; /* path to this directory */ +}; + +/* cache */ +struct bdb_cache { + struct list_head c_list; /* list of 'struct bdb_cache' under a 'struct bdb_ctx' */ + char *key; /* name of the file this cache holds. 
NOTE: basename of file */ + char *data; /* file content */ + size_t size; /* size of the file content that this cache holds */ +}; + + +struct bdb_private { + inode_table_t *itable; /* pointer to inode table that we use */ + int32_t temp; /**/ + char is_stateless; /**/ + char *export_path; /* path to the export directory + * (option directory <export-path>) */ + int32_t export_path_length; /* length of 'export_path' string */ + + /* statistics */ + struct xlator_stats stats; /* Statistics, provides activity of the server */ + + struct timeval prev_fetch_time; + struct timeval init_time; + int32_t max_read; /* */ + int32_t max_write; /* */ + int64_t interval_read; /* Used to calculate the max_read value */ + int64_t interval_write; /* Used to calculate the max_write value */ + int64_t read_value; /* Total read, from init */ + int64_t write_value; /* Total write, from init */ + + /* bdb xlator specific private data */ + uint64_t envflags; /* flags used for opening DB_ENV for this xlator */ + uint64_t dbflags; /* flags to be used for opening each database */ + uint64_t cache; /* cache: can be either ON or OFF */ + uint32_t transaction; /* transaction: can be either ON or OFF */ + uint32_t active; + gf_lock_t active_lock; + struct bctx_table *b_table; + DBTYPE access_mode; /* access mode for accessing the databases, + * can be DB_HASH, DB_BTREE + * (option access-mode <mode>) */ + mode_t file_mode; /* mode for each and every file stored on bdb + * (option file-mode <mode>) */ + mode_t dir_mode; /* mode for each and every directory stored on bdb + * (option dir-mode <mode>) */ + mode_t symlink_mode; /* mode for each and every symlink stored on bdb */ + pthread_t checkpoint_thread; /* pthread_t object used for creating checkpoint + * thread */ + int32_t checkpoint_timeout; /* time duration between two consecutive checkpoint + * operations. 
+ * (option checkpoint-timeout <time-in-seconds>) */ + ino_t next_ino; /* inode number allocation counter */ + gf_lock_t ino_lock; /* lock to protect 'next_ino' */ + char *logdir; /* environment log directory + * (option logdir <directory>) */ + char *errfile; /* errfile path, used by environment to + * print detailed error log. + * (option errfile <errfile-path>) */ + FILE *errfp; /* DB_ENV->set_errfile() expects us to fopen + * the errfile before doing DB_ENV->set_errfile() */ + uint32_t txn_timeout; /* used by DB_ENV->set_timeout to set the timeout for + * a transactionally encapsulated DB->operation() to + * timeout before waiting for locks to be released. + * (option transaction-timeout <time-in-milliseconds>) + */ + uint32_t lock_timeout; + uint32_t log_auto_remove; /* DB_AUTO_LOG_REMOVE flag for DB_ENV*/ + uint32_t log_region_max; +}; + + +static inline int32_t +bdb_txn_begin (DB_ENV *dbenv, + DB_TXN **ptxnid) +{ + return dbenv->txn_begin (dbenv, NULL, ptxnid, 0); +} + +static inline int32_t +bdb_txn_abort (DB_TXN *txnid) +{ + return txnid->abort (txnid); +} + +static inline int32_t +bdb_txn_commit (DB_TXN *txnid) +{ + return txnid->commit (txnid, 0); +} + +inline void * +bdb_extract_bfd (fd_t *fd, xlator_t *this); + + +void * +bdb_db_stat (bctx_t *bctx, + DB_TXN *txnid, + uint32_t flags); + +int32_t +bdb_db_get(struct bdb_ctx *bctx, + DB_TXN *txnid, + const char *key_string, + char **buf, + size_t size, + off_t offset); + +#define BDB_TRUNCATE_RECORD 0xcafebabe + +int32_t +bdb_db_put (struct bdb_ctx *bctx, + DB_TXN *txnid, + const char *key_string, + const char *buf, + size_t size, + off_t offset, + int32_t flags); + +int32_t +bdb_db_del (struct bdb_ctx *bctx, + DB_TXN *txnid, + const char *path); + +ino_t +bdb_inode_transform (ino_t parent, + struct bdb_ctx *bctx); + + +int32_t +bdb_cursor_open (struct bdb_ctx *bctx, + DBC **cursorp); + +int32_t +bdb_cursor_get (DBC *cursorp, + DBT *key, + DBT *value, + int32_t flags); + + +int32_t +bdb_cursor_close 
(struct bdb_ctx *ctx, + DBC *cursorp); + + +int32_t +bdb_dirent_size (DBT *key); + +int32_t +dirent_size (struct dirent *entry); + +int +bdb_db_init (xlator_t *this, + dict_t *options); + +void +bdb_dbs_from_dict_close (dict_t *this, + char *key, + data_t *value, + void *data); + +bctx_t * +bctx_lookup (struct bctx_table *table, + const char *path); + +bctx_t * +bctx_parent +(struct bctx_table *table, + const char *path); + +bctx_t * +bctx_unref (bctx_t *ctx); + +bctx_t * +bctx_ref (bctx_t *ctx); + +bctx_t * +bctx_rename (bctx_t *bctx, + const char *db_newpath); + +int32_t +bdb_db_rename (bctx_table_t *table, + const char *tmp_db_newpath, + const char *real_db_newpath); +#endif /* _BDB_H */ diff --git a/xlators/storage/posix/Makefile.am b/xlators/storage/posix/Makefile.am new file mode 100644 index 000000000..d471a3f92 --- /dev/null +++ b/xlators/storage/posix/Makefile.am @@ -0,0 +1,3 @@ +SUBDIRS = src + +CLEANFILES = diff --git a/xlators/storage/posix/src/Makefile.am b/xlators/storage/posix/src/Makefile.am new file mode 100644 index 000000000..2859e09aa --- /dev/null +++ b/xlators/storage/posix/src/Makefile.am @@ -0,0 +1,17 @@ + +xlator_LTLIBRARIES = posix.la +xlatordir = $(libdir)/glusterfs/$(PACKAGE_VERSION)/xlator/storage + +posix_la_LDFLAGS = -module -avoidversion + +posix_la_SOURCES = posix.c xattr-cache.c +posix_la_LIBADD = $(top_builddir)/libglusterfs/src/libglusterfs.la + +noinst_HEADERS = posix.h xattr-cache.h + +AM_CFLAGS = -fPIC -fno-strict-aliasing -D_FILE_OFFSET_BITS=64 -D_GNU_SOURCE -D$(GF_HOST_OS) -Wall \ + -I$(top_srcdir)/libglusterfs/src -shared -nostartfiles \ + $(GF_CFLAGS) + +CLEANFILES = + diff --git a/xlators/storage/posix/src/posix.c b/xlators/storage/posix/src/posix.c new file mode 100644 index 000000000..159f02dde --- /dev/null +++ b/xlators/storage/posix/src/posix.c @@ -0,0 +1,3715 @@ +/* + Copyright (c) 2006, 2007, 2008, 2009 Z RESEARCH, Inc. <http://www.zresearch.com> + This file is part of GlusterFS. 
+ + GlusterFS is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published + by the Free Software Foundation; either version 3 of the License, + or (at your option) any later version. + + GlusterFS is distributed in the hope that it will be useful, but + WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program. If not, see + <http://www.gnu.org/licenses/>. +*/ + +#ifndef _CONFIG_H +#define _CONFIG_H +#include "config.h" +#endif + +#define __XOPEN_SOURCE 500 + +#include <stdint.h> +#include <sys/time.h> +#include <sys/resource.h> +#include <errno.h> +#include <ftw.h> + +#ifndef GF_BSD_HOST_OS +#include <alloca.h> +#endif /* GF_BSD_HOST_OS */ + +#include "glusterfs.h" +#include "dict.h" +#include "logging.h" +#include "posix.h" +#include "xlator.h" +#include "defaults.h" +#include "common-utils.h" +#include "compat-errno.h" +#include "compat.h" +#include "byte-order.h" + +#undef HAVE_SET_FSID +#ifdef HAVE_SET_FSID + +#define DECLARE_OLD_FS_ID_VAR uid_t old_fsuid; gid_t old_fsgid; + +#define SET_FS_ID(uid, gid) do { \ + old_fsuid = setfsuid (uid); \ + old_fsgid = setfsgid (gid); \ + } while (0) + +#define SET_TO_OLD_FS_ID() do { \ + setfsuid (old_fsuid); \ + setfsgid (old_fsgid); \ + } while (0) + +#else + +#define DECLARE_OLD_FS_ID_VAR +#define SET_FS_ID(uid, gid) +#define SET_TO_OLD_FS_ID() + +#endif + +typedef struct { + xlator_t *this; + const char *real_path; + dict_t *xattr; + struct stat *stbuf; + loc_t *loc; +} posix_xattr_filler_t; + +int +posix_forget (xlator_t *this, inode_t *inode) +{ + uint64_t tmp_cache = 0; + if (!inode_ctx_del (inode, this, &tmp_cache)) + dict_destroy ((dict_t *)(long)tmp_cache); + + return 0; +} + +static void +_posix_xattr_get_set (dict_t *xattr_req, + char *key, + 
data_t *data, + void *xattrargs) +{ + posix_xattr_filler_t *filler = xattrargs; + char *value = NULL; + ssize_t xattr_size = -1; + int ret = -1; + char *databuf = NULL; + int _fd = -1; + loc_t *loc = NULL; + ssize_t req_size = 0; + + + /* should size be put into the data_t ? */ + if (!strcmp (key, "glusterfs.content")) { + /* file content request */ + req_size = data_to_uint64 (data); + if (req_size >= filler->stbuf->st_size) { + _fd = open (filler->real_path, O_RDONLY); + + if (_fd == -1) { + gf_log (filler->this->name, GF_LOG_ERROR, + "opening file %s failed: %s", + filler->real_path, strerror (errno)); + goto err; + } + + databuf = calloc (1, filler->stbuf->st_size); + + if (!databuf) { + gf_log (filler->this->name, GF_LOG_ERROR, + "out of memory :("); + goto err; + } + + ret = read (_fd, databuf, filler->stbuf->st_size); + if (ret == -1) { + gf_log (filler->this->name, GF_LOG_ERROR, + "read on file %s failed: %s", + filler->real_path, strerror (errno)); + goto err; + } + + ret = close (_fd); + _fd = -1; + if (ret == -1) { + gf_log (filler->this->name, GF_LOG_ERROR, + "close on file %s failed: %s", + filler->real_path, strerror (errno)); + goto err; + } + + ret = dict_set_bin (filler->xattr, key, + databuf, filler->stbuf->st_size); + if (ret < 0) { + goto err; + } + + /* To avoid double free in cleanup below */ + databuf = NULL; + err: + if (_fd != -1) + close (_fd); + if (databuf) + FREE (databuf); + } + } else if (!strcmp (key, GLUSTERFS_OPEN_FD_COUNT)) { + loc = filler->loc; + if (!list_empty (&loc->inode->fd_list)) { + ret = dict_set_uint32 (filler->xattr, key, 1); + } else { + ret = dict_set_uint32 (filler->xattr, key, 0); + } + } else { + xattr_size = lgetxattr (filler->real_path, key, NULL, 0); + + if (xattr_size > 0) { + value = calloc (1, xattr_size + 1); + + lgetxattr (filler->real_path, key, value, xattr_size); + + value[xattr_size] = '\0'; + ret = dict_set_bin (filler->xattr, key, + value, xattr_size); + if (ret < 0) + gf_log (filler->this->name, 
GF_LOG_ERROR, + "dict set failed. path: %s, key: %s", + filler->real_path, key); + } + } +} + + +dict_t * +posix_lookup_xattr_fill (xlator_t *this, const char *real_path, loc_t *loc, + dict_t *xattr_req, struct stat *buf) +{ + dict_t *xattr = NULL; + posix_xattr_filler_t filler = {0, }; + + xattr = get_new_dict(); + if (!xattr) { + gf_log (this->name, GF_LOG_ERROR, + "memory allocation failed :("); + goto out; + } + + filler.this = this; + filler.real_path = real_path; + filler.xattr = xattr; + filler.stbuf = buf; + filler.loc = loc; + + dict_foreach (xattr_req, _posix_xattr_get_set, &filler); +out: + return xattr; +} + + +int32_t +posix_lookup (call_frame_t *frame, xlator_t *this, + loc_t *loc, dict_t *xattr_req) +{ + struct stat buf = {0, }; + char * real_path = NULL; + int32_t op_ret = -1; + int32_t op_errno = 0; + dict_t * xattr = NULL; + + struct posix_private *priv = NULL; + + VALIDATE_OR_GOTO (frame, out); + VALIDATE_OR_GOTO (this, out); + VALIDATE_OR_GOTO (loc, out); + VALIDATE_OR_GOTO (loc->path, out); + + MAKE_REAL_PATH (real_path, this, loc->path); + + priv = this->private; + + op_ret = lstat (real_path, &buf); + op_errno = errno; + + if (op_ret == -1) { + if (op_errno != ENOENT) { + gf_log (this->name, GF_LOG_WARNING, + "lstat on %s failed: %s", + loc->path, strerror (op_errno)); + } + goto out; + } + + /* Make sure we don't access another mountpoint inside export dir. + * It may cause inode number to repeat from single export point, + * which leads to severe problems.. 
+ */ + if (priv->base_stdev != buf.st_dev) { + op_errno = ENOENT; + gf_log (this->name, GF_LOG_WARNING, + "%s: different mountpoint/device, returning " + "ENOENT", loc->path); + goto out; + } + + if (xattr_req && (op_ret == 0)) { + xattr = posix_lookup_xattr_fill (this, real_path, loc, + xattr_req, &buf); + } + + op_ret = 0; +out: + frame->root->rsp_refs = NULL; + + if (xattr) + dict_ref (xattr); + + STACK_UNWIND (frame, op_ret, op_errno, loc->inode, &buf, xattr); + + if (xattr) + dict_unref (xattr); + + return 0; +} + + +int32_t +posix_stat (call_frame_t *frame, + xlator_t *this, + loc_t *loc) +{ + struct stat buf = {0,}; + char * real_path = NULL; + int32_t op_ret = -1; + int32_t op_errno = 0; + + DECLARE_OLD_FS_ID_VAR; + + VALIDATE_OR_GOTO (frame, out); + VALIDATE_OR_GOTO (this, out); + VALIDATE_OR_GOTO (loc, out); + + SET_FS_ID (frame->root->uid, frame->root->gid); + MAKE_REAL_PATH (real_path, this, loc->path); + + op_ret = lstat (real_path, &buf); + + if (op_ret == -1) { + op_errno = errno; + gf_log (this->name, GF_LOG_WARNING, + "lstat on %s: %s", loc->path, strerror (op_errno)); + goto out; + } + + op_ret = 0; + + out: + SET_TO_OLD_FS_ID(); + frame->root->rsp_refs = NULL; + STACK_UNWIND (frame, op_ret, op_errno, &buf); + + return 0; +} + +int32_t +posix_opendir (call_frame_t *frame, xlator_t *this, + loc_t *loc, fd_t *fd) +{ + char * real_path = NULL; + int32_t op_ret = -1; + int32_t op_errno = 0; + DIR * dir = NULL; + struct posix_fd * pfd = NULL; + + DECLARE_OLD_FS_ID_VAR; + + VALIDATE_OR_GOTO (frame, out); + VALIDATE_OR_GOTO (this, out); + VALIDATE_OR_GOTO (loc, out); + VALIDATE_OR_GOTO (loc->path, out); + VALIDATE_OR_GOTO (fd, out); + + SET_FS_ID (frame->root->uid, frame->root->gid); + MAKE_REAL_PATH (real_path, this, loc->path); + + dir = opendir (real_path); + + if (dir == NULL) { + op_errno = errno; + gf_log (this->name, GF_LOG_ERROR, + "opendir failed on %s (%s)", + loc->path, strerror (op_errno)); + goto out; + } + + op_ret = dirfd (dir); + if 
(op_ret < 0) { + op_errno = errno; + gf_log (this->name, GF_LOG_ERROR, + "dirfd() failed on %s (%s)", + loc->path, strerror (op_errno)); + goto out; + } + + /* allocate the posix_fd itself; sizeof (*fd) would size it as an fd_t */ + pfd = CALLOC (1, sizeof (*pfd)); + if (!pfd) { + op_errno = errno; + /* op_ret still holds the dirfd() result; mark failure so 'out' cleans up */ + op_ret = -1; + gf_log (this->name, GF_LOG_ERROR, + "out of memory :("); + goto out; + } + + pfd->dir = dir; + pfd->fd = dirfd (dir); + pfd->path = strdup (real_path); + if (!pfd->path) { + op_errno = errno; + /* op_ret still holds the dirfd() result; mark failure so 'out' cleans up */ + op_ret = -1; + gf_log (this->name, GF_LOG_ERROR, + "out of memory :("); + goto out; + } + + fd_ctx_set (fd, this, (uint64_t)(long)pfd); + + frame->root->rsp_refs = NULL; + + op_ret = 0; + + out: + if (op_ret == -1) { + if (dir) { + closedir (dir); + dir = NULL; + } + if (pfd) { + if (pfd->path) + FREE (pfd->path); + FREE (pfd); + pfd = NULL; + } + } + + SET_TO_OLD_FS_ID (); + STACK_UNWIND (frame, op_ret, op_errno, fd); + return 0; +} + + +int32_t +posix_getdents (call_frame_t *frame, xlator_t *this, + fd_t *fd, size_t size, off_t off, int32_t flag) +{ + int32_t op_ret = -1; + int32_t op_errno = 0; + char * real_path = NULL; + dir_entry_t entries = {0, }; + dir_entry_t * tmp = NULL; + DIR * dir = NULL; + struct dirent * dirent = NULL; + int real_path_len = -1; + int entry_path_len = -1; + char * entry_path = NULL; + int count = 0; + struct posix_fd * pfd = NULL; + uint64_t tmp_pfd = 0; + struct stat buf = {0,}; + int ret = -1; + char tmp_real_path[ZR_PATH_MAX]; + char linkpath[ZR_PATH_MAX]; + + DECLARE_OLD_FS_ID_VAR ; + + VALIDATE_OR_GOTO (frame, out); + VALIDATE_OR_GOTO (this, out); + VALIDATE_OR_GOTO (fd, out); + + SET_FS_ID (frame->root->uid, frame->root->gid); + + ret = fd_ctx_get (fd, this, &tmp_pfd); + if (ret < 0) { + op_errno = -ret; + gf_log (this->name, GF_LOG_ERROR, + "fd %p does not have context in %s", + fd, this->name); + goto out; + } + pfd = (struct posix_fd *)(long)tmp_pfd; + if (!pfd->path) { + op_errno = EBADFD; + gf_log (this->name, GF_LOG_ERROR, + "pfd does not have path set (possibly file " + "fd, fd=%p)", fd); + goto out; + } + + real_path = pfd->path; +
real_path_len = strlen (real_path); + + entry_path_len = real_path_len + NAME_MAX; + entry_path = CALLOC (1, entry_path_len); + + if (!entry_path) { + op_errno = errno; + gf_log (this->name, GF_LOG_ERROR, + "out of memory :("); + goto out; + } + + strncpy (entry_path, real_path, entry_path_len); + entry_path[real_path_len] = '/'; + + dir = pfd->dir; + + if (!dir) { + op_errno = EBADFD; + gf_log (this->name, GF_LOG_ERROR, + "pfd does not have dir set (possibly file fd, " + "fd=%p, path=`%s'", + fd, real_path); + goto out; + } + + /* TODO: check for all the type of flag, and behave appropriately */ + + while ((dirent = readdir (dir))) { + if (!dirent) + break; + + /* This helps in self-heal, when only directories + needs to be replicated */ + + /* This is to reduce the network traffic, in case only + directory is needed from posix */ + + strncpy (tmp_real_path, real_path, ZR_PATH_MAX); + strncat (tmp_real_path, "/", + ZR_PATH_MAX - strlen (tmp_real_path)); + + strncat (tmp_real_path, dirent->d_name, + ZR_PATH_MAX - strlen (tmp_real_path)); + ret = lstat (tmp_real_path, &buf); + + if ((flag == GF_GET_DIR_ONLY) + && (ret != -1 && !S_ISDIR(buf.st_mode))) { + continue; + } + + tmp = CALLOC (1, sizeof (*tmp)); + + if (!tmp) { + op_errno = errno; + gf_log (this->name, GF_LOG_ERROR, + "out of memory :("); + goto out; + } + + tmp->name = strdup (dirent->d_name); + if (!tmp->name) { + op_errno = errno; + gf_log (this->name, GF_LOG_ERROR, + "out of memory :("); + goto out; + } + + if (entry_path_len < + (real_path_len + 1 + strlen (tmp->name) + 1)) { + entry_path_len = (real_path_len + + strlen (tmp->name) + 1024); + + entry_path = realloc (entry_path, entry_path_len); + } + + strcpy (&entry_path[real_path_len+1], tmp->name); + + ret = lstat (entry_path, &tmp->buf); + + if (ret == -1) { + op_errno = errno; + gf_log (this->name, GF_LOG_ERROR, + "lstat on %s failed: %s", + entry_path, strerror (op_errno)); + goto out; + } + + if (S_ISLNK(tmp->buf.st_mode)) { + + ret = readlink 
(entry_path, linkpath, ZR_PATH_MAX); + if (ret != -1) { + linkpath[ret] = '\0'; + tmp->link = strdup (linkpath); + } + } else { + tmp->link = ""; + } + + count++; + + tmp->next = entries.next; + entries.next = tmp; + + /* if size is 0, count can never be = size, so entire + dir is read */ + if (count == size) + break; + } + + FREE (entry_path); + + op_ret = 0; + + out: + SET_TO_OLD_FS_ID (); + + if (op_ret == -1) { + if (entry_path) + FREE (entry_path); + } + + frame->root->rsp_refs = NULL; + STACK_UNWIND (frame, op_ret, op_errno, &entries, count); + + if (op_ret == 0) { + while (entries.next) { + tmp = entries.next; + entries.next = entries.next->next; + FREE (tmp->name); + FREE (tmp); + } + } + + return 0; +} + + +int32_t +posix_releasedir (xlator_t *this, + fd_t *fd) +{ + int32_t op_ret = -1; + int32_t op_errno = 0; + struct posix_fd * pfd = NULL; + uint64_t tmp_pfd = 0; + int ret = 0; + + VALIDATE_OR_GOTO (this, out); + VALIDATE_OR_GOTO (fd, out); + + ret = fd_ctx_del (fd, this, &tmp_pfd); + if (ret < 0) { + op_errno = -ret; + gf_log (this->name, GF_LOG_ERROR, + "pfd from fd=%p is NULL", fd); + goto out; + } + + pfd = (struct posix_fd *)(long)tmp_pfd; + if (!pfd->dir) { + op_errno = EINVAL; + gf_log (this->name, GF_LOG_ERROR, + "pfd->dir is NULL for fd=%p path=%s", + fd, pfd->path ? pfd->path : "<NULL>"); + goto out; + } + + ret = closedir (pfd->dir); + if (ret == -1) { + op_errno = errno; + gf_log (this->name, GF_LOG_ERROR, + "closedir on %p failed", pfd->dir); + goto out; + } + pfd->dir = NULL; + + if (!pfd->path) { + op_errno = EBADFD; + gf_log (this->name, GF_LOG_ERROR, + "pfd->path was NULL. 
fd=%p pfd=%p", + fd, pfd); + goto out; + } + + op_ret = 0; + + out: + if (pfd) { + if (pfd->path) + FREE (pfd->path); + FREE (pfd); + } + + return 0; +} + + +int32_t +posix_readlink (call_frame_t *frame, xlator_t *this, + loc_t *loc, size_t size) +{ + char * dest = NULL; + int32_t op_ret = -1; + int32_t op_errno = 0; + char * real_path = NULL; + + DECLARE_OLD_FS_ID_VAR; + + VALIDATE_OR_GOTO (frame, out); + + SET_FS_ID (frame->root->uid, frame->root->gid); + + dest = alloca (size + 1); + + MAKE_REAL_PATH (real_path, this, loc->path); + + op_ret = readlink (real_path, dest, size); + if (op_ret == -1) { + op_errno = errno; + gf_log (this->name, GF_LOG_WARNING, + "readlink on %s: %s", loc->path, strerror (op_errno)); + goto out; + } + + dest[op_ret] = 0; + + out: + SET_TO_OLD_FS_ID (); + frame->root->rsp_refs = NULL; + + STACK_UNWIND (frame, op_ret, op_errno, dest); + + return 0; +} + +int32_t +posix_mknod (call_frame_t *frame, xlator_t *this, + loc_t *loc, mode_t mode, dev_t dev) +{ + int tmp_fd = 0; + int32_t op_ret = -1; + int32_t op_errno = 0; + char * real_path = 0; + struct stat stbuf = { 0, }; + + DECLARE_OLD_FS_ID_VAR; + + VALIDATE_OR_GOTO (frame, out); + VALIDATE_OR_GOTO (this, out); + VALIDATE_OR_GOTO (loc, out); + + SET_FS_ID (frame->root->uid, frame->root->gid); + MAKE_REAL_PATH (real_path, this, loc->path); + + op_ret = mknod (real_path, mode, dev); + + if (op_ret == -1) { + op_errno = errno; + if ((op_errno == EINVAL) && S_ISREG (mode)) { + /* Over Darwin, mknod with (S_IFREG|mode) + doesn't work */ + tmp_fd = creat (real_path, mode); + if (tmp_fd == -1) + goto out; + close (tmp_fd); + } else { + + gf_log (this->name, GF_LOG_ERROR, + "mknod on %s: %s", loc->path, + strerror (op_errno)); + goto out; + } + } + +#ifndef HAVE_SET_FSID + op_ret = lchown (real_path, frame->root->uid, frame->root->gid); + if (op_ret == -1) { + op_errno = errno; + gf_log (this->name, GF_LOG_ERROR, + "lchown on %s: %s", loc->path, strerror (op_errno)); + goto out; + } +#endif + + 
op_ret = lstat (real_path, &stbuf); + + if (op_ret == -1) { + op_errno = errno; + gf_log (this->name, GF_LOG_ERROR, + "mknod on %s: %s", loc->path, strerror (op_errno)); + goto out; + } + + op_ret = 0; + + out: + SET_TO_OLD_FS_ID (); + + frame->root->rsp_refs = NULL; + STACK_UNWIND (frame, op_ret, op_errno, loc->inode, &stbuf); + + return 0; +} + +int32_t +posix_mkdir (call_frame_t *frame, xlator_t *this, + loc_t *loc, mode_t mode) +{ + int32_t op_ret = -1; + int32_t op_errno = 0; + char * real_path = NULL; + struct stat stbuf = {0, }; + + DECLARE_OLD_FS_ID_VAR; + + VALIDATE_OR_GOTO (frame, out); + VALIDATE_OR_GOTO (this, out); + VALIDATE_OR_GOTO (loc, out); + + SET_FS_ID (frame->root->uid, frame->root->gid); + MAKE_REAL_PATH (real_path, this, loc->path); + + op_ret = mkdir (real_path, mode); + if (op_ret == -1) { + op_errno = errno; + gf_log (this->name, GF_LOG_WARNING, + "mkdir of %s: %s", loc->path, strerror (op_errno)); + goto out; + } + +#ifndef HAVE_SET_FSID + op_ret = chown (real_path, frame->root->uid, frame->root->gid); + if (op_ret == -1) { + op_errno = errno; + gf_log (this->name, GF_LOG_ERROR, + "chown on %s: %s", loc->path, strerror (op_errno)); + goto out; + } +#endif + + op_ret = lstat (real_path, &stbuf); + if (op_ret == -1) { + op_errno = errno; + gf_log (this->name, GF_LOG_ERROR, + "lstat on %s: %s", loc->path, strerror (op_errno)); + goto out; + } + + op_ret = 0; + + out: + SET_TO_OLD_FS_ID (); + + frame->root->rsp_refs = NULL; + STACK_UNWIND (frame, op_ret, op_errno, loc->inode, &stbuf); + + return 0; +} + + +int32_t +posix_unlink (call_frame_t *frame, xlator_t *this, + loc_t *loc) +{ + int32_t op_ret = -1; + int32_t op_errno = 0; + char * real_path = NULL; + + xattr_cache_handle_t handle = {{0,}, 0}; + + DECLARE_OLD_FS_ID_VAR; + + VALIDATE_OR_GOTO (frame, out); + VALIDATE_OR_GOTO (this, out); + VALIDATE_OR_GOTO (loc, out); + + SET_FS_ID (frame->root->uid, frame->root->gid); + MAKE_REAL_PATH (real_path, this, loc->path); + + loc_copy 
(&handle.loc, loc); + { + posix_xattr_cache_flush (this, &handle); + } + loc_wipe (&handle.loc); + + op_ret = unlink (real_path); + if (op_ret == -1) { + op_errno = errno; + gf_log (this->name, GF_LOG_WARNING, + "unlink of %s: %s", loc->path, strerror (op_errno)); + goto out; + } + + op_ret = 0; + + out: + SET_TO_OLD_FS_ID (); + frame->root->rsp_refs = NULL; + STACK_UNWIND (frame, op_ret, op_errno); + + return 0; +} + +int32_t +posix_rmdir (call_frame_t *frame, xlator_t *this, + loc_t *loc) +{ + int32_t op_ret = -1; + int32_t op_errno = 0; + char * real_path = 0; + + xattr_cache_handle_t handle = {{0,}, 0}; + + DECLARE_OLD_FS_ID_VAR; + + VALIDATE_OR_GOTO (frame, out); + VALIDATE_OR_GOTO (this, out); + VALIDATE_OR_GOTO (loc, out); + + SET_FS_ID (frame->root->uid, frame->root->gid); + MAKE_REAL_PATH (real_path, this, loc->path); + + loc_copy (&handle.loc, loc); + { + posix_xattr_cache_flush (this, &handle); + } + loc_wipe (&handle.loc); + + op_ret = rmdir (real_path); + op_errno = errno; + + if (op_errno == EEXIST) + /* Solaris sets errno = EEXIST instead of ENOTEMPTY */ + op_errno = ENOTEMPTY; + + if (op_ret == -1 && op_errno != ENOTEMPTY) { + gf_log (this->name, GF_LOG_WARNING, + "rmdir of %s: %s", loc->path, strerror (op_errno)); + goto out; + } + + out: + SET_TO_OLD_FS_ID (); + frame->root->rsp_refs = NULL; + STACK_UNWIND (frame, op_ret, op_errno); + + return 0; +} + +int32_t +posix_symlink (call_frame_t *frame, xlator_t *this, + const char *linkname, loc_t *loc) +{ + int32_t op_ret = -1; + int32_t op_errno = 0; + char * real_path = 0; + struct stat stbuf = { 0, }; + + DECLARE_OLD_FS_ID_VAR; + + VALIDATE_OR_GOTO (frame, out); + VALIDATE_OR_GOTO (this, out); + VALIDATE_OR_GOTO (linkname, out); + VALIDATE_OR_GOTO (loc, out); + + SET_FS_ID (frame->root->uid, frame->root->gid); + MAKE_REAL_PATH (real_path, this, loc->path); + + op_ret = symlink (linkname, real_path); + + if (op_ret == -1) { + op_errno = errno; + gf_log (this->name, GF_LOG_WARNING, + "symlink of %s 
--> %s: %s", + loc->path, linkname, strerror (op_errno)); + goto out; + } + +#ifndef HAVE_SET_FSID + op_ret = lchown (real_path, frame->root->uid, frame->root->gid); + if (op_ret == -1) { + op_errno = errno; + gf_log (this->name, GF_LOG_WARNING, + "lchown failed on %s: %s", + loc->path, strerror (op_errno)); + goto out; + } +#endif + op_ret = lstat (real_path, &stbuf); + if (op_ret == -1) { + op_errno = errno; + gf_log (this->name, GF_LOG_WARNING, + "lstat failed on %s: %s", + loc->path, strerror (op_errno)); + goto out; + } + + op_ret = 0; + + out: + SET_TO_OLD_FS_ID (); + + frame->root->rsp_refs = NULL; + STACK_UNWIND (frame, op_ret, op_errno, loc->inode, &stbuf); + + return 0; +} + + +int +posix_rename (call_frame_t *frame, xlator_t *this, + loc_t *oldloc, loc_t *newloc) +{ + int32_t op_ret = -1; + int32_t op_errno = 0; + char * real_oldpath = NULL; + char * real_newpath = NULL; + struct stat stbuf = {0, }; + + xattr_cache_handle_t handle = {{0,}, 0}; + + DECLARE_OLD_FS_ID_VAR; + + VALIDATE_OR_GOTO (frame, out); + VALIDATE_OR_GOTO (this, out); + VALIDATE_OR_GOTO (oldloc, out); + VALIDATE_OR_GOTO (newloc, out); + + SET_FS_ID (frame->root->uid, frame->root->gid); + MAKE_REAL_PATH (real_oldpath, this, oldloc->path); + MAKE_REAL_PATH (real_newpath, this, newloc->path); + + loc_copy (&handle.loc, oldloc); + { + posix_xattr_cache_flush (this, &handle); + } + loc_wipe (&handle.loc); + + op_ret = rename (real_oldpath, real_newpath); + if (op_ret == -1) { + op_errno = errno; + gf_log (this->name, + (op_errno == ENOTEMPTY ? 
GF_LOG_DEBUG : GF_LOG_ERROR), + "rename of %s to %s failed: %s", + oldloc->path, newloc->path, strerror (op_errno)); + goto out; + } + + op_ret = lstat (real_newpath, &stbuf); + if (op_ret == -1) { + op_errno = errno; + gf_log (this->name, GF_LOG_WARNING, + "lstat on %s failed: %s", + real_newpath, strerror (op_errno)); + goto out; + } + + op_ret = 0; + + out: + SET_TO_OLD_FS_ID (); + + frame->root->rsp_refs = NULL; + STACK_UNWIND (frame, op_ret, op_errno, &stbuf); + + return 0; +} + + +int +posix_link (call_frame_t *frame, xlator_t *this, + loc_t *oldloc, loc_t *newloc) +{ + int32_t op_ret = -1; + int32_t op_errno = 0; + char * real_oldpath = 0; + char * real_newpath = 0; + struct stat stbuf = {0, }; + + + DECLARE_OLD_FS_ID_VAR; + + VALIDATE_OR_GOTO (frame, out); + VALIDATE_OR_GOTO (this, out); + VALIDATE_OR_GOTO (oldloc, out); + VALIDATE_OR_GOTO (newloc, out); + + SET_FS_ID (frame->root->uid, frame->root->gid); + MAKE_REAL_PATH (real_oldpath, this, oldloc->path); + MAKE_REAL_PATH (real_newpath, this, newloc->path); + + op_ret = link (real_oldpath, real_newpath); + if (op_ret == -1) { + op_errno = errno; + gf_log (this->name, GF_LOG_WARNING, + "link %s to %s failed: %s", + oldloc->path, newloc->path, strerror (op_errno)); + goto out; + } + + op_ret = lstat (real_newpath, &stbuf); + if (op_ret == -1) { + op_errno = errno; + gf_log (this->name, GF_LOG_WARNING, + "lstat on %s failed: %s", + real_newpath, strerror (op_errno)); + goto out; + } + + op_ret = 0; + + out: + SET_TO_OLD_FS_ID (); + + frame->root->rsp_refs = NULL; + STACK_UNWIND (frame, op_ret, op_errno, oldloc->inode, &stbuf); + + return 0; +} + + +int +posix_chmod (call_frame_t *frame, xlator_t *this, + loc_t *loc, mode_t mode) +{ + int32_t op_ret = -1; + int32_t op_errno = 0; + char * real_path = 0; + struct stat stbuf = {0,}; + + DECLARE_OLD_FS_ID_VAR; + + VALIDATE_OR_GOTO (frame, out); + VALIDATE_OR_GOTO (this, out); + VALIDATE_OR_GOTO (loc, out); + + SET_FS_ID (frame->root->uid, frame->root->gid); + 
MAKE_REAL_PATH (real_path, this, loc->path); + + if (S_ISLNK (loc->inode->st_mode)) { + /* chmod on a link should always succeed */ + op_ret = lstat (real_path, &stbuf); + if (op_ret == -1) { + op_errno = errno; + gf_log (this->name, GF_LOG_ERROR, + "lstat on %s failed: %s", + real_path, strerror (op_errno)); + goto out; + } + op_ret = 0; + goto out; + } + + op_ret = lchmod (real_path, mode); + if ((op_ret == -1) && (errno == ENOSYS)) { + gf_log (this->name, GF_LOG_DEBUG, + "lchmod not implemented, falling back to chmod"); + op_ret = chmod (real_path, mode); + } + + if (op_ret == -1) { + op_errno = errno; + gf_log (this->name, GF_LOG_ERROR, "chmod on %s failed: %s", + loc->path, strerror (op_errno)); + goto out; + } + + op_ret = lstat (real_path, &stbuf); + if (op_ret == -1) { + op_errno = errno; + gf_log (this->name, GF_LOG_ERROR, "lstat on %s failed: %s", + real_path, strerror (op_errno)); + goto out; + } + + op_ret = 0; + + out: + SET_TO_OLD_FS_ID (); + + frame->root->rsp_refs = NULL; + STACK_UNWIND (frame, op_ret, op_errno, &stbuf); + + return 0; +} + + +int +posix_chown (call_frame_t *frame, xlator_t *this, + loc_t *loc, uid_t uid, gid_t gid) +{ + int32_t op_ret = -1; + int32_t op_errno = 0; + char * real_path = 0; + struct stat stbuf = {0,}; + + DECLARE_OLD_FS_ID_VAR; + + VALIDATE_OR_GOTO (frame, out); + VALIDATE_OR_GOTO (this, out); + VALIDATE_OR_GOTO (loc, out); + + SET_FS_ID (frame->root->uid, frame->root->gid); + MAKE_REAL_PATH (real_path, this, loc->path); + + op_ret = lchown (real_path, uid, gid); + if (op_ret == -1) { + op_errno = errno; + gf_log (this->name, GF_LOG_WARNING, + "lchown on %s failed: %s", + loc->path, strerror (op_errno)); + goto out; + } + + op_ret = lstat (real_path, &stbuf); + if (op_ret == -1) { + op_errno = errno; + gf_log (this->name, GF_LOG_WARNING, + "lstat on %s failed: %s", + real_path, strerror (op_errno)); + goto out; + } + + op_ret = 0; + + out: + SET_TO_OLD_FS_ID (); + frame->root->rsp_refs = NULL; + STACK_UNWIND (frame, 
op_ret, op_errno, &stbuf); + + return 0; +} + + +int32_t +posix_truncate (call_frame_t *frame, + xlator_t *this, + loc_t *loc, + off_t offset) +{ + int32_t op_ret = -1; + int32_t op_errno = 0; + char * real_path = 0; + struct stat stbuf = {0,}; + + DECLARE_OLD_FS_ID_VAR; + + VALIDATE_OR_GOTO (frame, out); + VALIDATE_OR_GOTO (this, out); + VALIDATE_OR_GOTO (loc, out); + + SET_FS_ID (frame->root->uid, frame->root->gid); + MAKE_REAL_PATH (real_path, this, loc->path); + + op_ret = truncate (real_path, offset); + if (op_ret == -1) { + op_errno = errno; + gf_log (this->name, GF_LOG_WARNING, + "truncate on %s failed: %s", + loc->path, strerror (op_errno)); + goto out; + } + + op_ret = lstat (real_path, &stbuf); + if (op_ret == -1) { + op_errno = errno; + gf_log (this->name, GF_LOG_WARNING, "lstat on %s failed: %s", + real_path, strerror (op_errno)); + goto out; + } + + op_ret = 0; + + out: + SET_TO_OLD_FS_ID (); + + frame->root->rsp_refs = NULL; + STACK_UNWIND (frame, op_ret, op_errno, &stbuf); + + return 0; +} + + +int +posix_utimens (call_frame_t *frame, xlator_t *this, + loc_t *loc, struct timespec ts[2]) +{ + int32_t op_ret = -1; + int32_t op_errno = 0; + char * real_path = 0; + struct stat stbuf = {0,}; + struct timeval tv[2] = {{0,},{0,}}; + + DECLARE_OLD_FS_ID_VAR; + + VALIDATE_OR_GOTO (frame, out); + VALIDATE_OR_GOTO (this, out); + VALIDATE_OR_GOTO (loc, out); + + SET_FS_ID (frame->root->uid, frame->root->gid); + MAKE_REAL_PATH (real_path, this, loc->path); + + tv[0].tv_sec = ts[0].tv_sec; + tv[0].tv_usec = ts[0].tv_nsec / 1000; + tv[1].tv_sec = ts[1].tv_sec; + tv[1].tv_usec = ts[1].tv_nsec / 1000; + + op_ret = lutimes (real_path, tv); + if ((op_ret == -1) && (errno == ENOSYS)) { + op_ret = utimes (real_path, tv); + } + + if (op_ret == -1) { + op_errno = errno; + gf_log (this->name, GF_LOG_WARNING, + "utimes on %s: %s", real_path, strerror (op_errno)); + goto out; + } + + op_ret = lstat (real_path, &stbuf); + if (op_ret == -1) { + op_errno = errno; + gf_log 
(this->name, GF_LOG_WARNING, + "lstat on %s: %s", real_path, strerror (op_errno)); + goto out; + } + + op_ret = 0; + + out: + SET_TO_OLD_FS_ID (); + + frame->root->rsp_refs = NULL; + STACK_UNWIND (frame, op_ret, op_errno, &stbuf); + + return 0; +} + +int32_t +posix_create (call_frame_t *frame, xlator_t *this, + loc_t *loc, int32_t flags, mode_t mode, + fd_t *fd) +{ + int32_t op_ret = -1; + int32_t op_errno = 0; + int32_t _fd = -1; + int _flags = 0; + char * real_path = NULL; + struct stat stbuf = {0, }; + struct posix_fd * pfd = NULL; + struct posix_private * priv = NULL; + + DECLARE_OLD_FS_ID_VAR; + + VALIDATE_OR_GOTO (frame, out); + VALIDATE_OR_GOTO (this, out); + VALIDATE_OR_GOTO (this->private, out); + VALIDATE_OR_GOTO (loc, out); + VALIDATE_OR_GOTO (fd, out); + + priv = this->private; + + SET_FS_ID (frame->root->uid, frame->root->gid); + MAKE_REAL_PATH (real_path, this, loc->path); + + if (!flags) { + _flags = O_CREAT | O_RDWR | O_EXCL; + } + else { + _flags = flags | O_CREAT; + } + + if (priv->o_direct) + flags |= O_DIRECT; + + _fd = open (real_path, _flags, mode); + + if (_fd == -1) { + op_errno = errno; + gf_log (this->name, GF_LOG_WARNING, + "open on %s: %s", loc->path, strerror (op_errno)); + goto out; + } + +#ifndef HAVE_SET_FSID + op_ret = chown (real_path, frame->root->uid, frame->root->gid); + if (op_ret == -1) { + op_errno = errno; + gf_log (this->name, GF_LOG_WARNING, + "chown on %s failed: %s", + real_path, strerror (op_errno)); + goto out; + } +#endif + + op_ret = fstat (_fd, &stbuf); + if (op_ret == -1) { + op_errno = errno; + gf_log (this->name, GF_LOG_WARNING, + "fstat on %d failed: %s", _fd, strerror (op_errno)); + goto out; + } + + op_ret = -1; + pfd = CALLOC (1, sizeof (*pfd)); + + if (!pfd) { + op_errno = errno; + gf_log (this->name, GF_LOG_ERROR, + "out of memory :("); + close (_fd); + goto out; + } + + pfd->flags = flags; + pfd->fd = _fd; + + fd_ctx_set (fd, this, (uint64_t)(long)pfd); + + ((struct posix_private 
*)this->private)->stats.nr_files++; + + op_ret = 0; + + out: + SET_TO_OLD_FS_ID (); + + frame->root->rsp_refs = NULL; + STACK_UNWIND (frame, op_ret, op_errno, fd, loc->inode, &stbuf); + + return 0; +} + +int32_t +posix_open (call_frame_t *frame, xlator_t *this, + loc_t *loc, int32_t flags, fd_t *fd) +{ + int32_t op_ret = -1; + int32_t op_errno = 0; + char * real_path = NULL; + int32_t _fd = -1; + struct posix_fd * pfd = NULL; + struct posix_private * priv = NULL; + + DECLARE_OLD_FS_ID_VAR; + + VALIDATE_OR_GOTO (frame, out); + VALIDATE_OR_GOTO (this, out); + VALIDATE_OR_GOTO (this->private, out); + VALIDATE_OR_GOTO (loc, out); + VALIDATE_OR_GOTO (fd, out); + + priv = this->private; + + SET_FS_ID (frame->root->uid, frame->root->gid); + MAKE_REAL_PATH (real_path, this, loc->path); + + if (priv->o_direct) + flags |= O_DIRECT; + + _fd = open (real_path, flags, 0); + if (_fd == -1) { + op_errno = errno; + gf_log (this->name, GF_LOG_WARNING, + "open on %s: %s", real_path, strerror (op_errno)); + goto out; + } + + pfd = CALLOC (1, sizeof (*pfd)); + + if (!pfd) { + op_errno = errno; + gf_log (this->name, GF_LOG_ERROR, + "out of memory :("); + goto out; + } + + pfd->flags = flags; + pfd->fd = _fd; + + fd_ctx_set (fd, this, (uint64_t)(long)pfd); + + ((struct posix_private *)this->private)->stats.nr_files++; + +#ifndef HAVE_SET_FSID + if (flags & O_CREAT) { + op_ret = chown (real_path, frame->root->uid, frame->root->gid); + if (op_ret == -1) { + op_errno = errno; + gf_log (this->name, GF_LOG_WARNING, + "chown on %s failed: %s", + real_path, strerror (op_errno)); + goto out; + } + } +#endif + + op_ret = 0; + + out: + if (op_ret == -1) { + if (_fd != -1) { + close (_fd); + _fd = -1; + } + } + + SET_TO_OLD_FS_ID (); + + frame->root->rsp_refs = NULL; + STACK_UNWIND (frame, op_ret, op_errno, fd); + + return 0; +} + +#define ALIGN_BUF(ptr,bound) ((void *)((unsigned long)(ptr + bound - 1) & \ + (unsigned long)(~(bound - 1)))) + +int +posix_readv (call_frame_t *frame, xlator_t *this, 
+ fd_t *fd, size_t size, off_t offset) +{ + uint64_t tmp_pfd = 0; + int32_t op_ret = -1; + int32_t op_errno = 0; + char * buf = NULL; + char * alloc_buf = NULL; + int _fd = -1; + struct posix_private * priv = NULL; + dict_t * reply_dict = NULL; + struct iovec vec = {0,}; + struct posix_fd * pfd = NULL; + struct stat stbuf = {0,}; + int align = 1; + int ret = -1; + int dict_ret = -1; + + VALIDATE_OR_GOTO (frame, out); + VALIDATE_OR_GOTO (this, out); + VALIDATE_OR_GOTO (fd, out); + VALIDATE_OR_GOTO (this->private, out); + + priv = this->private; + + ret = fd_ctx_get (fd, this, &tmp_pfd); + if (ret < 0) { + op_errno = -ret; + gf_log (this->name, GF_LOG_ERROR, + "pfd is NULL from fd=%p", fd); + goto out; + } + pfd = (struct posix_fd *)(long)tmp_pfd; + + if (!size) { + op_errno = EINVAL; + gf_log (this->name, GF_LOG_ERROR, "size == 0"); + goto out; + } + + if (pfd->flags & O_DIRECT) { + align = 4096; /* align to page boundary */ + } + + alloc_buf = MALLOC (1 * (size + align)); + if (!alloc_buf) { + op_errno = errno; + gf_log (this->name, GF_LOG_ERROR, + "out of memory :("); + goto out; + } + + /* page aligned buffer */ + buf = ALIGN_BUF (alloc_buf, align); + + _fd = pfd->fd; + + op_ret = lseek (_fd, offset, SEEK_SET); + if (op_ret == -1) { + op_errno = errno; + gf_log (this->name, GF_LOG_ERROR, + "lseek(%"PRId64") failed: %s", + offset, strerror (op_errno)); + goto out; + } + + op_ret = read (_fd, buf, size); + if (op_ret == -1) { + op_errno = errno; + gf_log (this->name, GF_LOG_WARNING, + "read failed: %s", strerror (op_errno)); + goto out; + } + + priv->read_value += size; + priv->interval_read += size; + + vec.iov_base = buf; + vec.iov_len = op_ret; + + op_ret = -1; + reply_dict = get_new_dict (); + if (!reply_dict) { + gf_log (this->name, GF_LOG_ERROR, + "out of memory :("); + goto out; + } + dict_ref (reply_dict); + + dict_ret = dict_set_ptr (reply_dict, NULL, alloc_buf); + if (dict_ret < 0) { + op_errno = -dict_ret; + gf_log (this->name, GF_LOG_ERROR, "could not 
dict_set: (%s)", + strerror (op_errno)); + goto out; + } + + /* + * readv successful, and we need to get the stat of the file + * we read from + */ + + op_ret = fstat (_fd, &stbuf); + if (op_ret == -1) { + op_errno = errno; + gf_log (this->name, GF_LOG_WARNING, + "fstat failed: %s", strerror (op_errno)); + goto out; + } + + op_ret = 0; + out: + if (op_ret == -1) { + frame->root->rsp_refs = NULL; + + if (reply_dict) { + dict_unref (reply_dict); + reply_dict = NULL; + } + + if ((alloc_buf != NULL) && (dict_ret != -1)) + FREE (alloc_buf); + } + + if (reply_dict) + frame->root->rsp_refs = reply_dict; + + STACK_UNWIND (frame, op_ret, op_errno, &vec, 1, &stbuf); + + if (reply_dict) + dict_unref (reply_dict); + + return 0; +} + + +int32_t +posix_writev (call_frame_t *frame, xlator_t *this, + fd_t *fd, struct iovec *vector, int32_t count, off_t offset) +{ + int32_t op_ret = -1; + int32_t op_errno = 0; + int _fd = -1; + struct posix_private * priv = NULL; + struct posix_fd * pfd = NULL; + struct stat stbuf = {0,}; + int ret = -1; + + int idx = 0; + int align = 4096; + int max_buf_size = 0; + int retval = 0; + char * buf = NULL; + char * alloc_buf = NULL; + uint64_t tmp_pfd = 0; + + VALIDATE_OR_GOTO (frame, out); + VALIDATE_OR_GOTO (this, out); + VALIDATE_OR_GOTO (fd, out); + VALIDATE_OR_GOTO (vector, out); + VALIDATE_OR_GOTO (this->private, out); + + priv = this->private; + + VALIDATE_OR_GOTO (priv, out); + + ret = fd_ctx_get (fd, this, &tmp_pfd); + if (ret < 0) { + gf_log (this->name, GF_LOG_ERROR, + "pfd is NULL from fd=%p", fd); + op_errno = -ret; + goto out; + } + pfd = (struct posix_fd *)(long)tmp_pfd; + + _fd = pfd->fd; + + op_ret = lseek (_fd, offset, SEEK_SET); + + if (op_ret == -1) { + op_errno = errno; + gf_log (this->name, GF_LOG_ERROR, + "lseek(%"PRId64") failed: %s", + offset, strerror (op_errno)); + goto out; + } + + /* Check for the O_DIRECT flag during open() */ + if (pfd->flags & O_DIRECT) { + /* This is O_DIRECT'd file */ + op_ret = -1; + for (idx = 0; idx 
< count; idx++) { + if (max_buf_size < vector[idx].iov_len) + max_buf_size = vector[idx].iov_len; + } + + alloc_buf = MALLOC (1 * (max_buf_size + align)); + if (!alloc_buf) { + op_errno = errno; + gf_log (this->name, GF_LOG_ERROR, + "out of memory :("); + goto out; + } + + for (idx = 0; idx < count; idx++) { + /* page aligned buffer */ + buf = ALIGN_BUF (alloc_buf, align); + + memcpy (buf, vector[idx].iov_base, + vector[idx].iov_len); + + /* not sure whether writev works on O_DIRECT'd fd */ + retval = write (_fd, buf, vector[idx].iov_len); + + if (retval == -1) { + if (op_ret == -1) { + op_errno = errno; + gf_log (this->name, GF_LOG_WARNING, + "O_DIRECT enabled: %s", + strerror (op_errno)); + goto out; + } + + break; + } + if (op_ret == -1) + op_ret = 0; + op_ret += retval; + } + + } else /* if (O_DIRECT) */ { + + /* This is not O_DIRECT'd fd */ + op_ret = writev (_fd, vector, count); + if (op_ret == -1) { + op_errno = errno; + gf_log (this->name, GF_LOG_WARNING, + "writev failed: %s", + strerror (op_errno)); + goto out; + } + } + + priv->write_value += op_ret; + priv->interval_write += op_ret; + + if (op_ret >= 0) { + /* wiretv successful, we also need to get the stat of + * the file we wrote to + */ + ret = fstat (_fd, &stbuf); + if (ret == -1) { + op_ret = -1; + op_errno = errno; + gf_log (this->name, GF_LOG_ERROR, "fstat failed: %s", + strerror (op_errno)); + goto out; + } + } + + out: + if (alloc_buf) { + FREE (alloc_buf); + } + + frame->root->rsp_refs = NULL; + STACK_UNWIND (frame, op_ret, op_errno, &stbuf); + + return 0; +} + + +int32_t +posix_statfs (call_frame_t *frame, xlator_t *this, + loc_t *loc) +{ + char * real_path = NULL; + int32_t op_ret = -1; + int32_t op_errno = 0; + struct statvfs buf = {0, }; + struct posix_private * priv = NULL; + + VALIDATE_OR_GOTO (frame, out); + VALIDATE_OR_GOTO (this, out); + VALIDATE_OR_GOTO (loc, out); + VALIDATE_OR_GOTO (this->private, out); + + MAKE_REAL_PATH (real_path, this, loc->path); + + priv = this->private; + + 
op_ret = statvfs (real_path, &buf); + + if (op_ret == -1) { + op_errno = errno; + gf_log (this->name, GF_LOG_ERROR, "statvfs failed: %s", + strerror (op_errno)); + goto out; + } + + if (!priv->export_statfs) { + buf.f_blocks = 0; + buf.f_bfree = 0; + buf.f_bavail = 0; + buf.f_files = 0; + buf.f_ffree = 0; + buf.f_favail = 0; + } + + op_ret = 0; + + out: + frame->root->rsp_refs = NULL; + STACK_UNWIND (frame, op_ret, op_errno, &buf); + return 0; +} + + +int32_t +posix_flush (call_frame_t *frame, xlator_t *this, + fd_t *fd) +{ + int32_t op_ret = -1; + int32_t op_errno = 0; + int _fd = -1; + struct posix_fd * pfd = NULL; + int ret = -1; + uint64_t tmp_pfd = 0; + + VALIDATE_OR_GOTO (frame, out); + VALIDATE_OR_GOTO (this, out); + VALIDATE_OR_GOTO (fd, out); + + ret = fd_ctx_get (fd, this, &tmp_pfd); + if (ret < 0) { + op_errno = -ret; + gf_log (this->name, GF_LOG_ERROR, + "pfd is NULL on fd=%p", fd); + goto out; + } + pfd = (struct posix_fd *)(long)tmp_pfd; + + _fd = pfd->fd; + + /* do nothing */ + posix_xattr_cache_flush_all (this); + + op_ret = 0; + + out: + frame->root->rsp_refs = NULL; + STACK_UNWIND (frame, op_ret, op_errno); + + return 0; +} + + +int32_t +posix_release (xlator_t *this, + fd_t *fd) +{ + int32_t op_ret = -1; + int32_t op_errno = 0; + int _fd = -1; + struct posix_private * priv = NULL; + struct posix_fd * pfd = NULL; + int ret = -1; + uint64_t tmp_pfd = 0; + xattr_cache_handle_t handle = {{0,},0}; + + VALIDATE_OR_GOTO (this, out); + VALIDATE_OR_GOTO (fd, out); + + priv = this->private; + + priv->stats.nr_files--; + + ret = fd_ctx_get (fd, this, &tmp_pfd); + if (ret < 0) { + op_errno = -ret; + gf_log (this->name, GF_LOG_ERROR, + "pfd is NULL from fd=%p", fd); + goto out; + } + pfd = (struct posix_fd *)(long)tmp_pfd; + + handle.fd = fd; + posix_xattr_cache_flush (this, &handle); + + _fd = pfd->fd; + + op_ret = close (_fd); + if (op_ret == -1) { + op_errno = errno; + gf_log (this->name, GF_LOG_WARNING, + "close(): %s", strerror (op_errno)); + goto out; + 
} + + if (pfd->dir) { + op_ret = -1; + op_errno = EBADF; + gf_log (this->name, GF_LOG_ERROR, + "pfd->dir is %p (not NULL) for file fd=%p", + pfd->dir, fd); + goto out; + } + + op_ret = 0; + + out: + if (pfd) + FREE (pfd); + + return 0; +} + + +int32_t +posix_fsync (call_frame_t *frame, xlator_t *this, + fd_t *fd, int32_t datasync) +{ + int32_t op_ret = -1; + int32_t op_errno = 0; + int _fd = -1; + struct posix_fd * pfd = NULL; + int ret = -1; + uint64_t tmp_pfd = 0; + + DECLARE_OLD_FS_ID_VAR; + + VALIDATE_OR_GOTO (frame, out); + VALIDATE_OR_GOTO (this, out); + VALIDATE_OR_GOTO (fd, out); + + SET_FS_ID (frame->root->uid, frame->root->gid); + +#ifdef GF_DARWIN_HOST_OS + /* Always return success in case of fsync in MAC OS X */ + op_ret = 0; + goto out; +#endif + + ret = fd_ctx_get (fd, this, &tmp_pfd); + if (ret < 0) { + op_errno = -ret; + gf_log (this->name, GF_LOG_ERROR, "pfd not found in fd's ctx"); + goto out; + } + pfd = (struct posix_fd *)(long)tmp_pfd; + + _fd = pfd->fd; + + if (datasync) { + ; +#ifdef HAVE_FDATASYNC + op_ret = fdatasync (_fd); +#endif + } else { + op_ret = fsync (_fd); + if (op_ret == -1) { + op_errno = errno; + gf_log (this->name, GF_LOG_WARNING, "fsync: %s", + strerror (op_errno)); + } + } + + op_ret = 0; + + out: + SET_TO_OLD_FS_ID (); + frame->root->rsp_refs = NULL; + + STACK_UNWIND (frame, op_ret, op_errno); + + return 0; +} + +static int gf_posix_xattr_enotsup_log; + +int +set_file_contents (xlator_t *this, char *real_path, + data_pair_t *trav, int flags) +{ + char * key = NULL; + char real_filepath[ZR_PATH_MAX] = {0,}; + int32_t file_fd = -1; + int op_ret = 0; + int ret = -1; + + key = &(trav->key[15]); + sprintf (real_filepath, "%s/%s", real_path, key); + + if (flags & XATTR_REPLACE) { + /* if file exists, replace it + * else, error out */ + file_fd = open (real_filepath, O_TRUNC|O_WRONLY); + + if (file_fd == -1) { + goto create; + } + + if (trav->value->len) { + ret = write (file_fd, trav->value->data, + trav->value->len); + if (ret 
== -1) { + op_ret = -errno; + gf_log (this->name, GF_LOG_ERROR, + "write failed while doing setxattr " + "for key %s on path %s: %s", + key, real_filepath, strerror (errno)); + goto out; + } + + ret = close (file_fd); + if (ret == -1) { + op_ret = -errno; + gf_log (this->name, GF_LOG_ERROR, + "close failed on %s: %s", + real_filepath, strerror (errno)); + goto out; + } + } + + create: /* we know file doesn't exist, create it */ + + file_fd = open (real_filepath, O_CREAT|O_WRONLY, 0644); + + if (file_fd == -1) { + op_ret = -errno; + gf_log (this->name, GF_LOG_ERROR, + "failed to open file %s with O_CREAT: %s", + key, strerror (errno)); + goto out; + } + + ret = write (file_fd, trav->value->data, trav->value->len); + if (ret == -1) { + op_ret = -errno; + gf_log (this->name, GF_LOG_ERROR, + "write failed on %s while setxattr with " + "key %s: %s", + real_filepath, key, strerror (errno)); + goto out; + } + + ret = close (file_fd); + if (ret == -1) { + op_ret = -errno; + gf_log (this->name, GF_LOG_ERROR, + "close failed on %s while setxattr with " + "key %s: %s", + real_filepath, key, strerror (errno)); + goto out; + } + } + + out: + return op_ret; +} + +int +handle_pair (xlator_t *this, char *real_path, + data_pair_t *trav, int flags) +{ + int sys_ret = -1; + int ret = 0; + + if (ZR_FILE_CONTENT_REQUEST(trav->key)) { + ret = set_file_contents (this, real_path, trav, flags); + } else { + sys_ret = lsetxattr (real_path, trav->key, trav->value->data, + trav->value->len, flags); + + if (sys_ret < 0) { + if (errno == ENOTSUP) { + GF_LOG_OCCASIONALLY(gf_posix_xattr_enotsup_log, + this->name,GF_LOG_WARNING, + "Extended attributes not " + "supported"); + } else if (errno == ENOENT) { + gf_log (this->name, GF_LOG_DEBUG, + "setxattr on %s failed: %s", real_path, + strerror (errno)); + } else { + +#ifdef GF_DARWIN_HOST_OS + gf_log (this->name, + ((errno == EINVAL) ? + GF_LOG_DEBUG : GF_LOG_WARNING), + "%s: key:%s error:%s", + real_path, trav->key, + strerror (errno)); +#else /* ! 
DARWIN */ + gf_log (this->name, GF_LOG_WARNING, + "%s: key:%s error:%s", + real_path, trav->key, + strerror (errno)); +#endif /* DARWIN */ + } + + ret = -errno; + goto out; + } + } + out: + return ret; +} + +int32_t +posix_setxattr (call_frame_t *frame, xlator_t *this, + loc_t *loc, dict_t *dict, int flags) +{ + int32_t op_ret = -1; + int32_t op_errno = 0; + char * real_path = NULL; + data_pair_t * trav = NULL; + int ret = -1; + + DECLARE_OLD_FS_ID_VAR; + SET_FS_ID (frame->root->uid, frame->root->gid); + + VALIDATE_OR_GOTO (frame, out); + VALIDATE_OR_GOTO (this, out); + VALIDATE_OR_GOTO (loc, out); + VALIDATE_OR_GOTO (dict, out); + + MAKE_REAL_PATH (real_path, this, loc->path); + + trav = dict->members_list; + + while (trav) { + ret = handle_pair (this, real_path, trav, flags); + if (ret < 0) { + op_errno = -ret; + goto out; + } + trav = trav->next; + } + + op_ret = 0; + + out: + SET_TO_OLD_FS_ID (); + frame->root->rsp_refs = NULL; + STACK_UNWIND (frame, op_ret, op_errno); + + return 0; +} + +int +get_file_contents (xlator_t *this, char *real_path, + const char *name, char **contents) +{ + char real_filepath[ZR_PATH_MAX] = {0,}; + char * key = NULL; + int32_t file_fd = -1; + struct stat stbuf = {0,}; + int op_ret = 0; + int ret = -1; + + key = (char *) &(name[15]); + sprintf (real_filepath, "%s/%s", real_path, key); + + op_ret = lstat (real_filepath, &stbuf); + if (op_ret == -1) { + op_ret = -errno; + gf_log (this->name, GF_LOG_ERROR, "lstat failed on %s: %s", + real_filepath, strerror (errno)); + goto out; + } + + file_fd = open (real_filepath, O_RDONLY); + + if (file_fd == -1) { + op_ret = -errno; + gf_log (this->name, GF_LOG_ERROR, "open failed on %s: %s", + real_filepath, strerror (errno)); + goto out; + } + + *contents = CALLOC (stbuf.st_size + 1, sizeof(char)); + + if (! 
*contents) { + op_ret = -errno; + gf_log (this->name, GF_LOG_ERROR, "out of memory :("); + goto out; + } + + ret = read (file_fd, *contents, stbuf.st_size); + if (ret <= 0) { + op_ret = -1; + gf_log (this->name, GF_LOG_ERROR, "read on %s failed", + real_filepath); + goto out; + } + + *contents[stbuf.st_size] = '\0'; + + op_ret = close (file_fd); + file_fd = -1; + if (op_ret == -1) { + op_ret = -errno; + gf_log (this->name, GF_LOG_ERROR, "close on %s failed: %s", + real_filepath, strerror (errno)); + goto out; + } + + out: + if (op_ret < 0) { + if (*contents) + FREE (*contents); + if (file_fd != -1) + close (file_fd); + } + + return op_ret; +} + +/** + * posix_getxattr - this function returns a dictionary with all the + * key:value pair present as xattr. used for + * both 'listxattr' and 'getxattr'. + */ +int32_t +posix_getxattr (call_frame_t *frame, xlator_t *this, + loc_t *loc, const char *name) +{ + int32_t op_ret = -1; + int32_t op_errno = ENOENT; + int32_t list_offset = 0; + size_t size = 0; + size_t remaining_size = 0; + char key[1024] = {0,}; + char * value = NULL; + char * list = NULL; + char * real_path = NULL; + dict_t * dict = NULL; + char * file_contents = NULL; + int ret = -1; + + DECLARE_OLD_FS_ID_VAR; + + VALIDATE_OR_GOTO (frame, out); + VALIDATE_OR_GOTO (this, out); + VALIDATE_OR_GOTO (loc, out); + + SET_FS_ID (frame->root->uid, frame->root->gid); + MAKE_REAL_PATH (real_path, this, loc->path); + + if (loc->inode && S_ISDIR(loc->inode->st_mode) && name && + ZR_FILE_CONTENT_REQUEST(name)) { + ret = get_file_contents (this, real_path, name, + &file_contents); + if (ret < 0) { + op_errno = -ret; + gf_log (this->name, GF_LOG_ERROR, + "getting file contents failed: %s", + strerror (op_errno)); + goto out; + } + } + + /* Get the total size */ + dict = get_new_dict (); + if (!dict) { + gf_log (this->name, GF_LOG_ERROR, "out of memory :("); + goto out; + } + + size = llistxattr (real_path, NULL, 0); + if (size == -1) { + op_errno = errno; + if ((errno == 
ENOTSUP) || (errno == ENOSYS)) { + GF_LOG_OCCASIONALLY (gf_posix_xattr_enotsup_log, + this->name, GF_LOG_WARNING, + "Extended attributes not " + "supported."); + } + else { + gf_log (this->name, GF_LOG_ERROR, + "listxattr failed on %s: %s", + real_path, strerror (op_errno)); + } + goto out; + } + + if (size == 0) + goto done; + + list = alloca (size + 1); + if (!list) { + op_errno = errno; + gf_log (this->name, GF_LOG_ERROR, "out of memory :("); + goto out; + } + + size = llistxattr (real_path, list, size); + + remaining_size = size; + list_offset = 0; + while (remaining_size > 0) { + if(*(list + list_offset) == '\0') + break; + + strcpy (key, list + list_offset); + op_ret = lgetxattr (real_path, key, NULL, 0); + if (op_ret == -1) + break; + + value = CALLOC (op_ret + 1, sizeof(char)); + if (!value) { + op_errno = errno; + gf_log (this->name, GF_LOG_ERROR, "out of memory :("); + goto out; + } + + op_ret = lgetxattr (real_path, key, value, op_ret); + if (op_ret == -1) + break; + + value [op_ret] = '\0'; + dict_set (dict, key, data_from_dynptr (value, op_ret)); + remaining_size -= strlen (key) + 1; + list_offset += strlen (key) + 1; + + } /* while (remaining_size > 0) */ + + done: + op_ret = size; + + if (dict) { + dict_ref (dict); + } + + out: + SET_TO_OLD_FS_ID (); + frame->root->rsp_refs = NULL; + STACK_UNWIND (frame, op_ret, op_errno, dict); + + if (dict) + dict_unref (dict); + + return 0; +} + +int32_t +posix_removexattr (call_frame_t *frame, xlator_t *this, + loc_t *loc, const char *name) +{ + int32_t op_ret = -1; + int32_t op_errno = 0; + char * real_path = NULL; + + DECLARE_OLD_FS_ID_VAR; + + MAKE_REAL_PATH (real_path, this, loc->path); + + SET_FS_ID (frame->root->uid, frame->root->gid); + + op_ret = lremovexattr (real_path, name); + + if (op_ret == -1) { + op_errno = errno; + if (op_errno != ENOATTR && op_errno != EPERM) + gf_log (this->name, GF_LOG_WARNING, + "removexattr on %s: %s", loc->path, + strerror (op_errno)); + goto out; + } + + op_ret = 0; + + 
out: + SET_TO_OLD_FS_ID (); + + frame->root->rsp_refs = NULL; + STACK_UNWIND (frame, op_ret, op_errno); + return 0; +} + + +int32_t +posix_fsyncdir (call_frame_t *frame, xlator_t *this, + fd_t *fd, int datasync) +{ + int32_t op_ret = -1; + int32_t op_errno = 0; + struct posix_fd * pfd = NULL; + int _fd = -1; + int ret = -1; + uint64_t tmp_pfd = 0; + + VALIDATE_OR_GOTO (frame, out); + VALIDATE_OR_GOTO (this, out); + VALIDATE_OR_GOTO (fd, out); + + ret = fd_ctx_get (fd, this, &tmp_pfd); + if (ret < 0) { + op_errno = -ret; + gf_log (this->name, GF_LOG_ERROR, + "pfd is NULL, fd=%p", fd); + goto out; + } + pfd = (struct posix_fd *)(long)tmp_pfd; + + _fd = pfd->fd; + + op_ret = 0; + + out: + frame->root->rsp_refs = NULL; + STACK_UNWIND (frame, op_ret, op_errno); + + return 0; +} + + +void +posix_print_xattr (dict_t *this, + char *key, + data_t *value, + void *data) +{ + gf_log ("posix", GF_LOG_TRACE, + "(key/val) = (%s/%d)", key, data_to_int32 (value)); +} + + +/** + * add_array - add two arrays of 32-bit numbers (stored in network byte order) + * dest = dest + src + * @count: number of 32-bit numbers + * FIXME: handle overflow + */ + +static void +__add_array (int32_t *dest, int32_t *src, int count) +{ + int i = 0; + for (i = 0; i < count; i++) { + dest[i] = hton32 (ntoh32 (dest[i]) + ntoh32 (src[i])); + } +} + + +/** + * xattrop - xattr operations - for internal use by GlusterFS + * @optype: ADD_ARRAY: + * dict should contain: + * "key" ==> array of 32-bit numbers + */ + + +int +posix_xattrop_common (call_frame_t *frame, xlator_t *this, + xattr_cache_handle_t *handle, + gf_xattrop_flags_t optype, dict_t *xattr) +{ + int32_t *array = NULL; + + int ret = 0; + int count = 0; + + int op_ret = 0; + int op_errno = 0; + + data_pair_t *trav = NULL; + + VALIDATE_OR_GOTO (frame, out); + VALIDATE_OR_GOTO (xattr, out); + VALIDATE_OR_GOTO (this, out); + + trav = xattr->members_list; + + while (trav) { + count = trav->value->len / sizeof (int32_t); + array = CALLOC (count, sizeof 
(int32_t)); + + ret = posix_xattr_cache_read (this, handle, trav->key, + array, trav->value->len); + + switch (optype) { + + case GF_XATTROP_ADD_ARRAY: + __add_array (array, (int32_t *) trav->value->data, + trav->value->len / 4); + break; + + default: + gf_log (this->name, GF_LOG_ERROR, + "unknown xattrop type %d", + optype); + + op_ret = -1; + op_errno = EINVAL; + goto out; + } + + ret = posix_xattr_cache_write (this, handle, trav->key, + array, trav->value->len); + + ret = dict_set_bin (xattr, trav->key, array, + trav->value->len); + + if (ret != 0) { + gf_log (this->name, GF_LOG_ERROR, + "key=%s (%s)", + trav->key, strerror (-ret)); + op_ret = -1; + op_errno = EINVAL; + goto out; + } + + trav = trav->next; + array = NULL; + } + +out: + if (array) + FREE (array); + + STACK_UNWIND (frame, op_ret, op_errno, xattr); + return 0; +} + + +int +posix_xattrop (call_frame_t *frame, xlator_t *this, + loc_t *loc, gf_xattrop_flags_t optype, dict_t *xattr) +{ + xattr_cache_handle_t handle = {{0,}, 0}; + int ret = -1; + + loc_copy (&handle.loc, loc); + { + ret = posix_xattrop_common (frame, this, &handle, optype, xattr); + } + loc_wipe (&handle.loc); + + return ret; +} + + +int +posix_fxattrop (call_frame_t *frame, xlator_t *this, + fd_t *fd, gf_xattrop_flags_t optype, dict_t *xattr) +{ + int ret = -1; + xattr_cache_handle_t handle = {{0,}, 0}; + + handle.fd = fd; + + ret = posix_xattrop_common (frame, this, &handle, optype, xattr); + + return ret; +} + + +int +posix_access (call_frame_t *frame, xlator_t *this, + loc_t *loc, int32_t mask) +{ + int32_t op_ret = -1; + int32_t op_errno = 0; + char * real_path = NULL; + + DECLARE_OLD_FS_ID_VAR; + SET_FS_ID (frame->root->uid, frame->root->gid); + + VALIDATE_OR_GOTO (frame, out); + VALIDATE_OR_GOTO (this, out); + VALIDATE_OR_GOTO (loc, out); + + MAKE_REAL_PATH (real_path, this, loc->path); + + op_ret = access (real_path, mask & 07); + if (op_ret == -1) { + op_errno = errno; + gf_log (this->name, GF_LOG_WARNING, "access failed on %s: 
%s", + loc->path, strerror (op_errno)); + goto out; + } + + op_ret = 0; + + out: + SET_TO_OLD_FS_ID (); + + frame->root->rsp_refs = NULL; + STACK_UNWIND (frame, op_ret, op_errno); + return 0; +} + + +int32_t +posix_ftruncate (call_frame_t *frame, xlator_t *this, + fd_t *fd, off_t offset) +{ + int32_t op_ret = -1; + int32_t op_errno = 0; + int _fd = -1; + struct stat buf = {0,}; + struct posix_fd * pfd = NULL; + int ret = -1; + uint64_t tmp_pfd = 0; + + DECLARE_OLD_FS_ID_VAR; + SET_FS_ID (frame->root->uid, frame->root->gid); + + VALIDATE_OR_GOTO (frame, out); + VALIDATE_OR_GOTO (this, out); + VALIDATE_OR_GOTO (fd, out); + + ret = fd_ctx_get (fd, this, &tmp_pfd); + if (ret < 0) { + gf_log (this->name, GF_LOG_ERROR, + "pfd is NULL, fd=%p", fd); + op_errno = -ret; + goto out; + } + pfd = (struct posix_fd *)(long)tmp_pfd; + + _fd = pfd->fd; + + op_ret = ftruncate (_fd, offset); + + if (op_ret == -1) { + op_errno = errno; + gf_log (this->name, GF_LOG_WARNING, "ftruncate failed: %s", + strerror (errno)); + goto out; + } + + op_ret = fstat (_fd, &buf); + if (op_ret == -1) { + op_errno = errno; + gf_log (this->name, GF_LOG_WARNING, "fstat failed: %s", + strerror (errno)); + goto out; + } + + op_ret = 0; + + out: + SET_TO_OLD_FS_ID (); + + frame->root->rsp_refs = NULL; + STACK_UNWIND (frame, op_ret, op_errno, &buf); + + return 0; +} + +int32_t +posix_fchown (call_frame_t *frame, xlator_t *this, + fd_t *fd, uid_t uid, gid_t gid) +{ + int32_t op_ret = -1; + int32_t op_errno = 0; + int _fd = -1; + struct stat buf = {0,}; + struct posix_fd * pfd = NULL; + int ret = -1; + uint64_t tmp_pfd = 0; + + DECLARE_OLD_FS_ID_VAR; + + SET_FS_ID (frame->root->uid, frame->root->gid); + + VALIDATE_OR_GOTO (frame, out); + VALIDATE_OR_GOTO (this, out); + VALIDATE_OR_GOTO (fd, out); + + ret = fd_ctx_get (fd, this, &tmp_pfd); + if (ret < 0) { + gf_log (this->name, GF_LOG_ERROR, + "pfd is NULL, fd=%p", fd); + op_errno = -ret; + goto out; + } + pfd = (struct posix_fd *)(long)tmp_pfd; + + _fd = 
/**
 * same_file_type - compare the file-type bits of two mode values.
 *
 * Only the bits selected by S_IFMT take part in the comparison;
 * permission bits are ignored.
 *
 * Returns 1 when both modes describe the same type of file, else 0.
 */
static int
same_file_type (mode_t m1, mode_t m2)
{
        mode_t type1 = m1 & S_IFMT;
        mode_t type2 = m2 & S_IFMT;

        return (type1 == type2) ? 1 : 0;
}
+ "stat failed while trying to make sure entry %s " + "is a directory: %s", pathname, strerror (errno)); + goto out; + } + + if (!same_file_type (mode, stbuf.st_mode)) { + op_ret = -EEXIST; + gf_log (this->name, GF_LOG_CRITICAL, + "entry %s is a different type of file " + "than expected", pathname); + goto out; + } + out: + return op_ret; +} + +static int +create_entry (xlator_t *this, int32_t flags, + dir_entry_t *entry, char *pathname) +{ + int op_ret = 0; + int ret = -1; + struct timeval tv[2] = {{0,0},{0,0}}; + + if (S_ISDIR (entry->buf.st_mode)) { + /* + * If the entry is directory, create it by + * calling 'mkdir'. If the entry is already + * present, check if it is a directory, + * and issue a warning if otherwise. + */ + + ret = mkdir (pathname, entry->buf.st_mode); + if (ret == -1) { + if (errno == EEXIST) { + op_ret = ensure_file_type (this, pathname, + entry->buf.st_mode); + } + else { + op_ret = -errno; + gf_log (this->name, GF_LOG_DEBUG, + "mkdir %s with mode (0%o) failed: %s", + pathname, entry->buf.st_mode, + strerror (errno)); + goto out; + } + } + + } else if ((flags & GF_SET_IF_NOT_PRESENT) + || !(flags & GF_SET_DIR_ONLY)) { + + /* create a 0-byte file here */ + + if (S_ISREG (entry->buf.st_mode)) { + ret = open (pathname, O_CREAT|O_EXCL, + entry->buf.st_mode); + + if (ret == -1) { + if (errno == EEXIST) { + op_ret = ensure_file_type (this, + pathname, + entry->buf.st_mode); + } + else { + op_ret = -errno; + gf_log (this->name, GF_LOG_ERROR, + "Error creating file %s with " + "mode (0%o): %s", + pathname, entry->buf.st_mode, + strerror (errno)); + goto out; + } + } + + close (ret); + + } else if (S_ISLNK (entry->buf.st_mode)) { + ret = symlink (entry->link, pathname); + + if (ret == -1) { + if (errno == EEXIST) { + op_ret = ensure_file_type (this, + pathname, + entry->buf.st_mode); + } + else { + op_ret = -errno; + gf_log (this->name, GF_LOG_ERROR, + "error creating symlink %s: %s" + , pathname, strerror (errno)); + goto out; + } + } + + } else if 
(S_ISBLK (entry->buf.st_mode) || + S_ISCHR (entry->buf.st_mode) || + S_ISFIFO (entry->buf.st_mode) || + S_ISSOCK (entry->buf.st_mode)) { + + ret = mknod (pathname, entry->buf.st_mode, + entry->buf.st_dev); + + if (ret == -1) { + if (errno == EEXIST) { + op_ret = ensure_file_type (this, + pathname, + entry->buf.st_mode); + } else { + op_ret = -errno; + gf_log (this->name, GF_LOG_ERROR, + "error creating device file " + "%s: %s", + pathname, strerror (errno)); + goto out; + } + } + } else { + gf_log (this->name, GF_LOG_ERROR, + "invalid mode 0%o for %s", entry->buf.st_mode, + pathname); + op_ret = -EINVAL; + goto out; + } + } + + /* + * Preserve atime and mtime + */ + + if (!S_ISLNK (entry->buf.st_mode)) { + tv[0].tv_sec = entry->buf.st_atime; + tv[1].tv_sec = entry->buf.st_mtime; + ret = utimes (pathname, tv); + if (ret == -1) { + op_ret = -errno; + gf_log (this->name, GF_LOG_ERROR, + "utimes %s failed: %s", + pathname, strerror (errno)); + goto out; + } + } + +out: + return op_ret; + +} + + +int +posix_setdents (call_frame_t *frame, xlator_t *this, + fd_t *fd, int32_t flags, dir_entry_t *entries, + int32_t count) +{ + char * real_path = NULL; + char * entry_path = NULL; + int32_t real_path_len = -1; + int32_t entry_path_len = -1; + int32_t ret = 0; + int32_t op_ret = -1; + int32_t op_errno = 0; + struct posix_fd * pfd = {0, }; + struct timeval tv[2] = {{0, }, {0, }}; + uint64_t tmp_pfd = 0; + char pathname[ZR_PATH_MAX] = {0,}; + dir_entry_t * trav = NULL; + + VALIDATE_OR_GOTO (frame, out); + VALIDATE_OR_GOTO (this, out); + VALIDATE_OR_GOTO (fd, out); + VALIDATE_OR_GOTO (entries, out); + + tv[0].tv_sec = tv[0].tv_usec = 0; + tv[1].tv_sec = tv[1].tv_usec = 0; + + ret = fd_ctx_get (fd, this, &tmp_pfd); + if (ret < 0) { + op_errno = -ret; + gf_log (this->name, GF_LOG_ERROR, + "fd's ctx not found on fd=%p for %s", + fd, this->name); + goto out; + } + pfd = (struct posix_fd *)(long)tmp_pfd; + + real_path = pfd->path; + + if (!real_path) { + op_errno = EINVAL; + gf_log 
(this->name, GF_LOG_ERROR, + "path is NULL on pfd=%p fd=%p", pfd, fd); + goto out; + } + + real_path_len = strlen (real_path); + entry_path_len = real_path_len + 256; + entry_path = CALLOC (1, entry_path_len); + + if (!entry_path) { + op_errno = errno; + gf_log (this->name, GF_LOG_ERROR, "out of memory :("); + goto out; + } + + strcpy (entry_path, real_path); + entry_path[real_path_len] = '/'; + + posix_xattr_cache_flush_all (this); + + /* fd exists, and everything looks fine */ + /** + * create an entry for each one present in '@entries' + * - if flag is set (ie, if its namespace), create both directories + * and files + * - if not set, create only directories. + * + * after the entry is created, change the mode and ownership of the + * entry according to the stat present in entries->buf. + */ + + trav = entries->next; + while (trav) { + strcpy (pathname, entry_path); + strcat (pathname, trav->name); + + ret = create_entry (this, flags, trav, pathname); + if (ret < 0) { + op_errno = -ret; + goto out; + } + + /* TODO: handle another flag, GF_SET_OVERWRITE */ + + /* Change the mode */ + if (!S_ISLNK (trav->buf.st_mode)) { + ret = chmod (pathname, trav->buf.st_mode); + if (ret == -1) { + op_errno = errno; + gf_log (this->name, GF_LOG_ERROR, + "chmod on %s failed: %s", pathname, + strerror (op_errno)); + goto out; + } + } + + /* change the ownership */ + ret = lchown (pathname, trav->buf.st_uid, trav->buf.st_gid); + if (ret == -1) { + op_errno = errno; + gf_log (this->name, GF_LOG_ERROR, + "chmod on %s failed: %s", pathname, + strerror (op_errno)); + goto out; + } + + if (flags & GF_SET_EPOCH_TIME) { + ret = utimes (pathname, tv); + if (ret == -1) { + op_errno = errno; + gf_log (this->name, GF_LOG_ERROR, + "utimes on %s failed: %s", pathname, + strerror (op_errno)); + goto out; + } + } + + /* consider the next entry */ + trav = trav->next; + } + + op_ret = 0; + out: + frame->root->rsp_refs = NULL; + STACK_UNWIND (frame, op_ret, op_errno); + if (entry_path) + FREE 
(entry_path); + + return 0; +} + +int32_t +posix_fstat (call_frame_t *frame, xlator_t *this, + fd_t *fd) +{ + int _fd = -1; + int32_t op_ret = -1; + int32_t op_errno = 0; + struct stat buf = {0,}; + struct posix_fd * pfd = NULL; + uint64_t tmp_pfd = 0; + int ret = -1; + + DECLARE_OLD_FS_ID_VAR; + SET_FS_ID (frame->root->uid, frame->root->gid); + + VALIDATE_OR_GOTO (frame, out); + VALIDATE_OR_GOTO (this, out); + VALIDATE_OR_GOTO (fd, out); + + ret = fd_ctx_get (fd, this, &tmp_pfd); + if (ret < 0) { + gf_log (this->name, GF_LOG_ERROR, + "pfd is NULL, fd=%p", fd); + op_errno = -ret; + goto out; + } + pfd = (struct posix_fd *)(long)tmp_pfd; + + _fd = pfd->fd; + + op_ret = fstat (_fd, &buf); + + if (op_ret == -1) { + op_errno = errno; + gf_log (this->name, GF_LOG_WARNING, "fstat failed: %s", + strerror (op_errno)); + goto out; + } + + op_ret = 0; + + out: + SET_TO_OLD_FS_ID (); + frame->root->rsp_refs = NULL; + STACK_UNWIND (frame, op_ret, op_errno, &buf); + return 0; +} + +static int gf_posix_lk_log; + +int32_t +posix_lk (call_frame_t *frame, xlator_t *this, + fd_t *fd, int32_t cmd, struct flock *lock) +{ + struct flock nullock = {0, }; + frame->root->rsp_refs = NULL; + + gf_posix_lk_log++; + + GF_LOG_OCCASIONALLY (gf_posix_lk_log, this->name, GF_LOG_ERROR, + "\"features/posix-locks\" translator is " + "not loaded, you need to use it"); + + STACK_UNWIND (frame, -1, ENOSYS, &nullock); + return 0; +} + +int32_t +posix_inodelk (call_frame_t *frame, xlator_t *this, + loc_t *loc, int32_t cmd, struct flock *lock) +{ + frame->root->rsp_refs = NULL; + + gf_log (this->name, GF_LOG_CRITICAL, + "\"features/posix-locks\" translator is not loaded. 
" + "You need to use it for proper functioning of GlusterFS"); + + STACK_UNWIND (frame, -1, ENOSYS); + return 0; +} + +int32_t +posix_finodelk (call_frame_t *frame, xlator_t *this, + fd_t *fd, int32_t cmd, struct flock *lock) +{ + frame->root->rsp_refs = NULL; + + gf_log (this->name, GF_LOG_CRITICAL, + "\"features/posix-locks\" translator is not loaded. " + "You need to use it for proper functioning of GlusterFS"); + + STACK_UNWIND (frame, -1, ENOSYS); + return 0; +} + + +int32_t +posix_entrylk (call_frame_t *frame, xlator_t *this, + loc_t *loc, const char *basename, entrylk_cmd cmd, + entrylk_type type) +{ + frame->root->rsp_refs = NULL; + + gf_log (this->name, GF_LOG_CRITICAL, + "\"features/posix-locks\" translator is not loaded. " + "You need to use it for proper functioning of GlusterFS"); + + STACK_UNWIND (frame, -1, ENOSYS); + return 0; +} + +int32_t +posix_fentrylk (call_frame_t *frame, xlator_t *this, + fd_t *fd, const char *basename, entrylk_cmd cmd, + entrylk_type type) +{ + frame->root->rsp_refs = NULL; + + gf_log (this->name, GF_LOG_CRITICAL, + "\"features/posix-locks\" translator is not loaded. 
" + " You need to use it for proper functioning of GlusterFS"); + + STACK_UNWIND (frame, -1, ENOSYS); + return 0; +} + + +int32_t +posix_readdir (call_frame_t *frame, xlator_t *this, + fd_t *fd, size_t size, off_t off) +{ + uint64_t tmp_pfd = 0; + struct posix_fd * pfd = NULL; + DIR * dir = NULL; + int ret = -1; + size_t filled = 0; + int count = 0; + + int32_t op_ret = -1; + int32_t op_errno = 0; + + gf_dirent_t * this_entry = NULL; + gf_dirent_t entries; + struct dirent * entry = NULL; + off_t in_case = -1; + int32_t this_size = -1; + + + VALIDATE_OR_GOTO (frame, out); + VALIDATE_OR_GOTO (this, out); + VALIDATE_OR_GOTO (fd, out); + + INIT_LIST_HEAD (&entries.list); + + ret = fd_ctx_get (fd, this, &tmp_pfd); + if (ret < 0) { + gf_log (this->name, GF_LOG_ERROR, + "pfd is NULL, fd=%p", fd); + op_errno = -ret; + goto out; + } + pfd = (struct posix_fd *)(long)tmp_pfd; + + dir = pfd->dir; + + if (!dir) { + gf_log (this->name, GF_LOG_ERROR, + "dir is NULL for fd=%p", fd); + op_errno = EINVAL; + goto out; + } + + + if (!off) { + rewinddir (dir); + } else { + seekdir (dir, off); + } + + while (filled <= size) { + in_case = telldir (dir); + + if (in_case == -1) { + op_errno = errno; + gf_log (this->name, GF_LOG_ERROR, + "telldir failed: %s", + strerror (errno)); + goto out; + } + + errno = 0; + entry = readdir (dir); + + if (!entry) { + if (errno == EBADF) { + op_errno = errno; + gf_log (this->name, GF_LOG_ERROR, + "readdir failed: %s", + strerror (op_errno)); + goto out; + } + break; + } + + this_size = dirent_size (entry); + + if (this_size + filled > size) { + seekdir (dir, in_case); + break; + } + + + this_entry = gf_dirent_for_name (entry->d_name); + + if (!this_entry) { + gf_log (this->name, GF_LOG_ERROR, + "could not create gf_dirent for entry %s (%s)", + entry->d_name, strerror (errno)); + goto out; + } + this_entry->d_off = telldir (dir); + this_entry->d_ino = entry->d_ino; + + list_add_tail (&this_entry->list, &entries.list); + + filled += this_size; + count ++; 
+ } + + op_ret = count; + + out: + frame->root->rsp_refs = NULL; + STACK_UNWIND (frame, op_ret, op_errno, &entries); + + gf_dirent_free (&entries); + + return 0; +} + + +int32_t +posix_stats (call_frame_t *frame, xlator_t *this, + int32_t flags) + +{ + int32_t op_ret = -1; + int32_t op_errno = 0; + + struct xlator_stats xlstats = {0, }; + struct xlator_stats * stats = NULL; + struct statvfs buf = {0,}; + struct timeval tv = {0,}; + struct posix_private * priv = (struct posix_private *)this->private; + + int64_t avg_read = 0; + int64_t avg_write = 0; + int64_t _time_ms = 0; + + DECLARE_OLD_FS_ID_VAR; + + SET_FS_ID (frame->root->uid, frame->root->gid); + + VALIDATE_OR_GOTO (frame, out); + VALIDATE_OR_GOTO (this, out); + + stats = &xlstats; + + op_ret = statvfs (priv->base_path, &buf); + + if (op_ret == -1) { + op_errno = errno; + gf_log (this->name, GF_LOG_ERROR, "statvfs failed: %s", + strerror (op_errno)); + goto out; + } + + /* client info is maintained at FSd */ + stats->nr_clients = priv->stats.nr_clients; + stats->nr_files = priv->stats.nr_files; + + /* number of free block in the filesystem. */ + stats->free_disk = buf.f_bfree * buf.f_bsize; + + stats->total_disk_size = buf.f_blocks * buf.f_bsize; + stats->disk_usage = (buf.f_blocks - buf.f_bavail) * buf.f_bsize; + + /* Calculate read and write usage */ + op_ret = gettimeofday (&tv, NULL); + if (op_ret == -1) { + op_errno = errno; + gf_log (this->name, GF_LOG_ERROR, + "gettimeofday failed: %s", strerror (errno)); + goto out; + } + + /* Read */ + _time_ms = (tv.tv_sec - priv->init_time.tv_sec) * 1000 + + ((tv.tv_usec - priv->init_time.tv_usec) / 1000); + + avg_read = (_time_ms) ? (priv->read_value / _time_ms) : 0; /* KBps */ + avg_write = (_time_ms) ? 
(priv->write_value / _time_ms) : 0; /* KBps */ + + _time_ms = (tv.tv_sec - priv->prev_fetch_time.tv_sec) * 1000 + + ((tv.tv_usec - priv->prev_fetch_time.tv_usec) / 1000); + + if (_time_ms && ((priv->interval_read / _time_ms) > priv->max_read)) { + priv->max_read = (priv->interval_read / _time_ms); + } + + if (_time_ms && + ((priv->interval_write / _time_ms) > priv->max_write)) { + priv->max_write = priv->interval_write / _time_ms; + } + + stats->read_usage = avg_read / priv->max_read; + stats->write_usage = avg_write / priv->max_write; + + op_ret = gettimeofday (&(priv->prev_fetch_time), NULL); + if (op_ret == -1) { + op_errno = errno; + gf_log (this->name, GF_LOG_ERROR, "gettimeofday failed: %s", + strerror (op_errno)); + goto out; + } + + priv->interval_read = 0; + priv->interval_write = 0; + + op_ret = 0; + + out: + SET_TO_OLD_FS_ID (); + + frame->root->rsp_refs = NULL; + STACK_UNWIND (frame, op_ret, op_errno, stats); + return 0; +} + +int32_t +posix_checksum (call_frame_t *frame, xlator_t *this, + loc_t *loc, int32_t flag) +{ + char * real_path = NULL; + DIR * dir = NULL; + struct dirent * dirent = NULL; + uint8_t file_checksum[ZR_FILENAME_MAX] = {0,}; + uint8_t dir_checksum[ZR_FILENAME_MAX] = {0,}; + int32_t op_ret = -1; + int32_t op_errno = 0; + int i = 0; + int length = 0; + + struct stat buf = {0,}; + char tmp_real_path[ZR_PATH_MAX] = {0,}; + int ret = -1; + + MAKE_REAL_PATH (real_path, this, loc->path); + + dir = opendir (real_path); + + if (!dir){ + op_errno = errno; + gf_log (this->name, GF_LOG_DEBUG, + "opendir() failed on `%s': %s", + real_path, strerror (op_errno)); + goto out; + } + + while ((dirent = readdir (dir))) { + errno = 0; + if (!dirent) { + if (errno != 0) { + op_errno = errno; + gf_log (this->name, GF_LOG_DEBUG, + "readdir() failed: %s", + strerror (errno)); + goto out; + } + break; + } + + length = strlen (dirent->d_name); + + strcpy (tmp_real_path, real_path); + strcat (tmp_real_path, "/"); + strcat (tmp_real_path, dirent->d_name); + ret 
= lstat (tmp_real_path, &buf); + + if (ret == -1) + continue; + + if (S_ISDIR (buf.st_mode)) { + for (i = 0; i < length; i++) + dir_checksum[i] ^= dirent->d_name[i]; + } else { + for (i = 0; i < length; i++) + file_checksum[i] ^= dirent->d_name[i]; + } + } + closedir (dir); + + op_ret = 0; + + out: + frame->root->rsp_refs = NULL; + STACK_UNWIND (frame, op_ret, op_errno, file_checksum, dir_checksum); + + return 0; +} + +/** + * notify - when parent sends PARENT_UP, send CHILD_UP event from here + */ +int32_t +notify (xlator_t *this, + int32_t event, + void *data, + ...) +{ + switch (event) + { + case GF_EVENT_PARENT_UP: + { + /* Tell the parent that posix xlator is up */ + default_notify (this, GF_EVENT_CHILD_UP, data); + } + break; + default: + /* */ + break; + } + return 0; +} + +/** + * init - + */ +int +init (xlator_t *this) +{ + int ret = 0; + int op_ret = -1; + gf_boolean_t tmp_bool = 0; + struct stat buf = {0,}; + struct posix_private * _private = NULL; + data_t * dir_data = NULL; + data_t * tmp_data = NULL; + + dir_data = dict_get (this->options, "directory"); + + if (this->children) { + gf_log (this->name, GF_LOG_ERROR, + "FATAL: storage/posix cannot have subvolumes"); + ret = -1; + goto out; + } + + if (!this->parents) { + gf_log (this->name, GF_LOG_WARNING, + "dangling volume. check volfile "); + } + + if (!dir_data) { + gf_log (this->name, GF_LOG_ERROR, + "export directory not specified in volfile"); + ret = -1; + goto out; + } + + umask (000); // umask `masking' is done at the client side + + /* Check whether the specified directory exists, if not create it. 
*/ + op_ret = lstat (dir_data->data, &buf); + if ((ret != 0) || !S_ISDIR (buf.st_mode)) { + gf_log (this->name, GF_LOG_ERROR, + "directory '%s' doesn't exists, Exiting", + dir_data->data); + ret = -1; + goto out; + } + + + /* Check for Extended attribute support, if not present, log it */ + op_ret = lsetxattr (dir_data->data, + "trusted.glusterfs.test", "working", 8, 0); + if (op_ret < 0) { + tmp_data = dict_get (this->options, + "mandate-attribute"); + if (tmp_data) { + if (gf_string2boolean (tmp_data->data, + &tmp_bool) == -1) { + gf_log (this->name, GF_LOG_ERROR, + "wrong option provided for key " + "\"mandate-xattr\""); + ret = -1; + goto out; + } + if (!tmp_bool) { + gf_log (this->name, GF_LOG_WARNING, + "Extended attribute not supported, " + "starting as per option"); + } else { + gf_log (this->name, GF_LOG_CRITICAL, + "Extended attribute not supported, " + "exiting"); + ret = -1; + goto out; + } + } else { + gf_log (this->name, GF_LOG_CRITICAL, + "Extended attribute not supported, exiting"); + ret = -1; + goto out; + } + } + + _private = CALLOC (1, sizeof (*_private)); + if (!_private) { + gf_log (this->name, GF_LOG_ERROR, + "out of memory :("); + ret = -1; + goto out; + } + + _private->base_path = strdup (dir_data->data); + _private->base_path_length = strlen (_private->base_path); + _private->base_stdev = buf.st_dev; + + _private->xattr_cache = posix_xattr_cache_init (16); + if (!_private->xattr_cache) { + gf_log (this->name, GF_LOG_ERROR, + "out of memory :("); + ret = -1; + goto out; + } + + { + /* Stats related variables */ + gettimeofday (&_private->init_time, NULL); + gettimeofday (&_private->prev_fetch_time, NULL); + _private->max_read = 1; + _private->max_write = 1; + } + + _private->export_statfs = 1; + tmp_data = dict_get (this->options, "export-statfs-size"); + if (tmp_data) { + if (gf_string2boolean (tmp_data->data, + &_private->export_statfs) == -1) { + ret = -1; + gf_log (this->name, GF_LOG_ERROR, + "'export-statfs-size' takes only boolean " + 
"options"); + goto out; + } + if (!_private->export_statfs) + gf_log (this->name, GF_LOG_DEBUG, + "'statfs()' returns dummy size"); + } + + tmp_data = dict_get (this->options, "o-direct"); + if (tmp_data) { + if (gf_string2boolean (tmp_data->data, + &_private->o_direct) == -1) { + ret = -1; + gf_log (this->name, GF_LOG_ERROR, + "wrong option provided for 'o-direct'"); + goto out; + } + if (_private->o_direct) + gf_log (this->name, GF_LOG_DEBUG, + "o-direct mode is enabled (O_DIRECT " + "for every open)"); + } + +#ifndef GF_DARWIN_HOST_OS + { + struct rlimit lim; + lim.rlim_cur = 1048576; + lim.rlim_max = 1048576; + + if (setrlimit (RLIMIT_NOFILE, &lim) == -1) { + gf_log (this->name, GF_LOG_WARNING, + "WARNING: Failed to set 'ulimit -n " + " 1048576': %s", strerror(errno)); + lim.rlim_cur = 65536; + lim.rlim_max = 65536; + + if (setrlimit (RLIMIT_NOFILE, &lim) == -1) { + gf_log (this->name, GF_LOG_ERROR, + "Failed to set max open fd to " + "64k: %s", strerror(errno)); + } + else { + gf_log (this->name, GF_LOG_ERROR, + "max open fd set to 64k"); + } + } + } +#endif + + this->private = (void *)_private; + + out: + return ret; +} + +void +fini (xlator_t *this) +{ + struct posix_private *priv = this->private; + lremovexattr (priv->base_path, "trusted.glusterfs.test"); + FREE (priv); + return; +} + +struct xlator_mops mops = { + .stats = posix_stats, +}; + +struct xlator_fops fops = { + .lookup = posix_lookup, + .stat = posix_stat, + .opendir = posix_opendir, + .readdir = posix_readdir, + .readlink = posix_readlink, + .mknod = posix_mknod, + .mkdir = posix_mkdir, + .unlink = posix_unlink, + .rmdir = posix_rmdir, + .symlink = posix_symlink, + .rename = posix_rename, + .link = posix_link, + .chmod = posix_chmod, + .chown = posix_chown, + .truncate = posix_truncate, + .utimens = posix_utimens, + .create = posix_create, + .open = posix_open, + .readv = posix_readv, + .writev = posix_writev, + .statfs = posix_statfs, + .flush = posix_flush, + .fsync = posix_fsync, + .setxattr 
= posix_setxattr, + .getxattr = posix_getxattr, + .removexattr = posix_removexattr, + .fsyncdir = posix_fsyncdir, + .access = posix_access, + .ftruncate = posix_ftruncate, + .fstat = posix_fstat, + .lk = posix_lk, + .inodelk = posix_inodelk, + .finodelk = posix_finodelk, + .entrylk = posix_entrylk, + .fentrylk = posix_fentrylk, + .fchown = posix_fchown, + .fchmod = posix_fchmod, + .setdents = posix_setdents, + .getdents = posix_getdents, + .checksum = posix_checksum, + .xattrop = posix_xattrop, + .fxattrop = posix_fxattrop, +}; + +struct xlator_cbks cbks = { + .release = posix_release, + .releasedir = posix_releasedir, + .forget = posix_forget +}; + +struct volume_options options[] = { + { .key = {"o-direct"}, + .type = GF_OPTION_TYPE_BOOL }, + { .key = {"directory"}, + .type = GF_OPTION_TYPE_PATH }, + { .key = {"export-statfs-size"}, + .type = GF_OPTION_TYPE_BOOL }, + { .key = {"mandate-attribute"}, + .type = GF_OPTION_TYPE_BOOL }, + { .key = {NULL} } +}; diff --git a/xlators/storage/posix/src/posix.h b/xlators/storage/posix/src/posix.h new file mode 100644 index 000000000..b162139c9 --- /dev/null +++ b/xlators/storage/posix/src/posix.h @@ -0,0 +1,110 @@ +/* + Copyright (c) 2006, 2007, 2008 Z RESEARCH, Inc. <http://www.zresearch.com> + This file is part of GlusterFS. + + GlusterFS is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published + by the Free Software Foundation; either version 3 of the License, + or (at your option) any later version. + + GlusterFS is distributed in the hope that it will be useful, but + WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program. If not, see + <http://www.gnu.org/licenses/>. 
/**
 * posix_fd - internal structure common to file and directory fd's
 */
struct posix_fd {
        /* fd returned by the kernel */
        int      fd;

        /* flags for open/creat */
        int32_t  flags;

        /* used by setdents/getdents */
        char    *path;

        /* handle returned by the kernel */
        DIR     *dir;
};
/* Export directory stored in the translator's private structure. */
#define POSIX_BASE_PATH(this) \
        (((struct posix_private *)(this)->private)->base_path)

/* Length of the export directory string. */
#define POSIX_BASE_PATH_LEN(this) \
        (((struct posix_private *)(this)->private)->base_path_length)

/*
 * Build "<base_path><path>" in a buffer obtained from alloca() and
 * store its address in 'var'.  The buffer lives on the caller's stack
 * and must not be used after the calling function returns.
 *
 * Every argument is parenthesized so that expressions such as '&xl'
 * expand correctly.  'path' and 'this' are evaluated more than once,
 * so they must not have side effects.
 */
#define MAKE_REAL_PATH(var, this, path) do {                              \
        (var) = alloca (strlen (path) + POSIX_BASE_PATH_LEN(this) + 2);   \
        strcpy ((var), POSIX_BASE_PATH(this));                            \
        strcpy (&(var)[POSIX_BASE_PATH_LEN(this)], (path));               \
        } while (0)
+*/
+
+#include "byte-order.h"
+
+#include "xattr-cache.h"
+#include "posix.h"
+#include "compat-errno.h"
+
+/*
+ * Read extended attribute `key` of the object named by `handle` into
+ * `value` (at most `len` bytes).
+ *
+ * If the handle carries a path, lgetxattr() is called on the path built
+ * by MAKE_REAL_PATH; otherwise fgetxattr() is called on the descriptor
+ * stored in the posix_fd found in the fd context.
+ *
+ * Returns the lgetxattr()/fgetxattr() result, -errno if that call
+ * failed, or -EBADFD if fd_ctx_get() fails.
+ */
+static int
+__hgetxattr (xattr_cache_handle_t *handle, xlator_t *this,
+             const char *key, void *value, size_t len)
+{
+        char *            real_path = NULL;
+        struct posix_fd * pfd       = NULL;
+        uint64_t          tmp_pfd   = 0;
+        int               op_ret    = -1;
+        int               ret       = -1;
+        int               _fd       = -1;
+
+        if (handle->loc.path) {
+                MAKE_REAL_PATH (real_path, this, handle->loc.path);
+                op_ret = lgetxattr (real_path, key, value, len);
+
+                /* errno is read immediately, before any other call */
+                if (op_ret == -1)
+                        op_ret = -errno;
+        } else {
+                ret = fd_ctx_get (handle->fd, this, &tmp_pfd);
+                if (ret < 0) {
+                        gf_log (this->name, GF_LOG_ERROR,
+                                "failed to get pfd from fd=%p",
+                                handle->fd);
+                        op_ret = -EBADFD;
+                        goto out;
+                }
+                pfd = (struct posix_fd *)(long)tmp_pfd;
+                _fd = pfd->fd;
+
+                op_ret = fgetxattr (_fd, key, value, len);
+                if (op_ret == -1)
+                        op_ret = -errno;
+        }
+
+out:
+        return op_ret;
+}
+
+
+/*
+ * Write `len` bytes from `value` as extended attribute `key` of the
+ * object named by `handle`, passing `flags` through to the system call.
+ *
+ * Path handles use lsetxattr() on the path built by MAKE_REAL_PATH;
+ * other handles use fsetxattr() on the descriptor stored in the
+ * posix_fd found in the fd context.
+ *
+ * Returns the lsetxattr()/fsetxattr() result, -errno if that call
+ * failed, or -EBADFD if fd_ctx_get() fails.
+ */
+static int
+__hsetxattr (xattr_cache_handle_t *handle, xlator_t *this,
+             const char *key, void *value, size_t len, int flags)
+{
+        char *            real_path = NULL;
+        struct posix_fd * pfd       = NULL;
+        uint64_t          tmp_pfd   = 0;
+        int               op_ret    = -1;
+        int               ret       = -1;
+        int               _fd       = -1;
+
+        if (handle->loc.path) {
+                MAKE_REAL_PATH (real_path, this, handle->loc.path);
+
+                op_ret = lsetxattr (real_path, key, value, len, flags);
+                if (op_ret == -1)
+                        op_ret = -errno;
+        } else {
+                ret = fd_ctx_get (handle->fd, this, &tmp_pfd);
+                if (ret < 0) {
+                        gf_log (this->name, GF_LOG_ERROR,
+                                "failed to get pfd from fd=%p",
+                                handle->fd);
+
+                        op_ret = -EBADFD;
+                        goto out;
+                }
+                pfd = (struct posix_fd *)(long)tmp_pfd;
+
+                _fd = pfd->fd;
+
+                op_ret = fsetxattr (_fd, key, value, len, flags);
+                if (op_ret == -1)
+                        op_ret = -errno;
+        }
+
+out:
+        return op_ret;
+}
+
+
+/*
+ * Find the cache entry holding `key` for `inode`.
+ *
+ * On a hit the entry's access counter is incremented and the entry is
+ * returned; on a miss NULL is returned.  Entries without a key (never
+ * populated, or already flushed) are skipped so that strcmp() is never
+ * handed a NULL pointer.
+ */
+static xattr_cache_entry_t *
+__cache_lookup (xattr_cache_t *cache, inode_t *inode, char *key)
+{
+        xattr_cache_entry_t *entry = NULL;
+        size_t               i     = 0;
+
+        for (i = 0; i < cache->size; i++) {
+                entry = cache->entries[i];
+
+                if ((entry->inode == inode)
+                    && entry->key
+                    && (!strcmp (entry->key, key))) {
+                        entry->nraccess++;
+                        return entry;
+                }
+        }
+
+        return NULL;
+}
+
+
+/*
+ * Return the entry with the smallest access counter; the first such
+ * entry wins on ties.  The cache must hold at least one entry.
+ *
+ * The entry is not modified here: the caller flushes and repopulates
+ * it, which resets the counter anyway.
+ */
+static xattr_cache_entry_t *
+__cache_least_used_entry (xattr_cache_t *cache)
+{
+        xattr_cache_entry_t *lue = cache->entries[0];
+        size_t               i   = 0;
+
+        for (i = 1; i < cache->size; i++) {
+                if (cache->entries[i]->nraccess < lue->nraccess)
+                        lue = cache->entries[i];
+        }
+
+        return lue;
+}
+
+
+/*
+ * Return the inode a handle refers to: loc.inode for a path handle,
+ * fd->inode for an fd handle, NULL if the handle has neither.
+ */
+static inode_t *
+__inode_for_handle (xattr_cache_handle_t *handle)
+{
+        inode_t *inode = NULL;
+
+        if (handle->loc.path)
+                inode = handle->loc.inode;
+        else if (handle->fd)
+                inode = handle->fd->inode;
+
+        return inode;
+}
+
+
+/*
+ * Release a handle created by __copy_handle(): wipe its loc if it has
+ * a path, then free the handle itself.
+ */
+static void
+__free_handle (xattr_cache_handle_t *handle)
+{
+        if (handle->loc.path)
+                loc_wipe (&handle->loc);
+
+        FREE (handle);
+}
+
+
+/*
+ * Allocate a copy of `handle`: the loc is copied with loc_copy() for a
+ * path handle, otherwise only the fd pointer is copied.
+ *
+ * Returns the new handle (released with __free_handle()), or NULL if
+ * the allocation fails.
+ */
+static xattr_cache_handle_t *
+__copy_handle (xattr_cache_handle_t *handle)
+{
+        xattr_cache_handle_t *hnew = calloc (1, sizeof (*hnew));
+
+        if (!hnew)
+                return NULL;
+
+        if (handle->loc.path)
+                loc_copy (&hnew->loc, &handle->loc);
+        else
+                hnew->fd = handle->fd;
+
+        return hnew;
+}
+
+
+/*
+ * Fill the (empty) `entry` with the on-disk value of `key` for
+ * `handle`, using a zero-initialised buffer of `len` bytes.
+ *
+ * The entry is populated all-or-nothing: if any allocation fails,
+ * -ENOMEM is returned and the entry is left untouched (entry->array
+ * stays NULL).
+ *
+ * A failing __hgetxattr() does NOT prevent population: the entry is
+ * still installed with whatever the buffer holds (zeros if nothing was
+ * read) and the negative result is returned to the caller.
+ */
+static int
+__cache_populate_entry (xattr_cache_entry_t *entry, xlator_t *this,
+                        xattr_cache_handle_t *handle, char *key, size_t len)
+{
+        int32_t              *array    = NULL;
+        char                 *key_copy = NULL;
+        xattr_cache_handle_t *hcopy    = NULL;
+        int                   op_ret   = -ENOMEM;
+
+        array    = calloc (1, len);
+        key_copy = strdup (key);
+        hcopy    = __copy_handle (handle);
+
+        if (!array || !key_copy || !hcopy)
+                goto err;
+
+        op_ret = __hgetxattr (handle, this, key, array, len);
+
+        entry->array    = array;
+        entry->key      = key_copy;
+        entry->inode    = __inode_for_handle (handle);
+        entry->handle   = hcopy;
+        entry->len      = len;
+        entry->nraccess = 1;
+
+        return op_ret;
+
+err:
+        if (array) {
+                FREE (array);
+        }
+
+        if (key_copy) {
+                FREE (key_copy);
+        }
+
+        if (hcopy)
+                __free_handle (hcopy);
+
+        return op_ret;
+}
+
+
+/*
+ * Write a dirty entry back with __hsetxattr(), then reset the entry to
+ * the empty state, releasing its key, array and handle.
+ *
+ * The entry is always emptied, even if the write-back fails; a failed
+ * write-back is logged.
+ *
+ * Returns 0 if the entry was clean, otherwise the __hsetxattr() result.
+ */
+static int
+__cache_flush_entry (xattr_cache_entry_t *entry, xlator_t *this)
+{
+        int ret = 0;
+
+        if (entry->dirty) {
+                ret = __hsetxattr (entry->handle, this,
+                                   entry->key, entry->array, entry->len, 0);
+                if (ret < 0)
+                        gf_log (this->name, GF_LOG_ERROR,
+                                "failed to write back cached xattr %s "
+                                "(error %d)", entry->key, ret);
+        }
+
+        entry->len      = 0;
+        entry->nraccess = 0;
+        entry->dirty    = 0;
+        entry->inode    = NULL;
+
+        if (entry->key) {
+                FREE (entry->key);
+                entry->key = NULL;
+        }
+
+        if (entry->array) {
+                FREE (entry->array);
+                entry->array = NULL;
+        }
+
+        if (entry->handle) {
+                __free_handle (entry->handle);
+                entry->handle = NULL;
+        }
+
+        return ret;
+}
+
+
+/*
+ * Log, at debug level, `str` followed by the `len`-byte `array`
+ * rendered as "[ v0 v1 ... ]"; each value goes through ntoh32() first.
+ * Nothing is logged if the scratch buffer cannot be allocated.
+ */
+static void
+__print_array (char *str, xlator_t *this, int32_t *array, size_t len)
+{
+        char   *ptr   = NULL;
+        char   *buf   = NULL;
+        size_t  i     = 0;
+        size_t  count = 0;
+
+        count = len / sizeof (int32_t);
+
+        /*
+         * Worst case per value is "-2147483648" plus a space, i.e. 12
+         * bytes; add "[ ", "]" and the terminating NUL.
+         */
+        buf = malloc (count * 12 + 4);
+        if (!buf)
+                return;
+
+        ptr = buf;
+        ptr += sprintf (ptr, "[ ");
+        for (i = 0; i < count; i++)
+                ptr += sprintf (ptr, "%d ", ntoh32 (array[i]));
+        ptr += sprintf (ptr, "]");
+
+        gf_log (this->name, GF_LOG_DEBUG,
+                "%s%s", str, buf);
+
+        FREE (buf);
+}
+
+
+/*
+ * Copy the cached value of xattr `key` for `handle` into `array`
+ * (`len` bytes).
+ *
+ * On a miss the least-used entry is flushed, recycled and populated
+ * from disk.  If the cached value is shorter than `len`, the remainder
+ * of `array` is zero-filled instead of reading past the cached buffer.
+ *
+ * Returns 0 on success, -1 if the handle has no inode, or -ENOMEM if
+ * the entry could not be populated.
+ */
+int
+posix_xattr_cache_read (xlator_t *this, xattr_cache_handle_t *handle,
+                        char *key, int32_t *array, size_t len)
+{
+        xattr_cache_entry_t *entry  = NULL;
+        xattr_cache_entry_t *purgee = NULL;
+
+        xattr_cache_t *cache  = NULL;
+        inode_t       *inode  = NULL;
+        size_t         ncopy  = 0;
+
+        int op_ret = -1;
+
+        inode = __inode_for_handle (handle);
+
+        if (!inode) {
+                gf_log (this->name, GF_LOG_DEBUG,
+                        "handle has no inode!");
+                goto out;
+        }
+
+        cache = ((struct posix_private *) (this->private))->xattr_cache;
+
+        pthread_mutex_lock (&cache->lock);
+        {
+                entry = __cache_lookup (cache, inode, key);
+
+                if (entry) {
+                        if (handle->loc.path)
+                                gf_log (this->name, GF_LOG_DEBUG,
+                                        "cache hit for %s", handle->loc.path);
+                        else if (handle->fd)
+                                gf_log (this->name, GF_LOG_DEBUG,
+                                        "cache hit for fd=%p", handle->fd);
+                } else {
+                        purgee = __cache_least_used_entry (cache);
+
+                        if (purgee->handle && purgee->handle->loc.path)
+                                gf_log (this->name, GF_LOG_DEBUG,
+                                        "flushing and purging entry for %s",
+                                        purgee->handle->loc.path);
+                        else if (purgee->handle && purgee->handle->fd)
+                                gf_log (this->name, GF_LOG_DEBUG,
+                                        "flushing and purging entry for fd=%p",
+                                        purgee->handle->fd);
+                        __cache_flush_entry (purgee, this);
+
+                        if (handle->loc.path)
+                                gf_log (this->name, GF_LOG_DEBUG,
+                                        "populating entry for %s",
+                                        handle->loc.path);
+                        else if (handle->fd)
+                                gf_log (this->name, GF_LOG_DEBUG,
+                                        "populating entry for fd=%p",
+                                        handle->fd);
+
+                        /*
+                         * A getxattr failure still yields a populated
+                         * entry, so the return value is deliberately
+                         * not checked; only an allocation failure
+                         * leaves the array unset.
+                         */
+                        __cache_populate_entry (purgee, this, handle, key,
+                                                len);
+                        if (!purgee->array) {
+                                gf_log (this->name, GF_LOG_ERROR,
+                                        "out of memory populating xattr "
+                                        "cache entry for %s", key);
+                                op_ret = -ENOMEM;
+                                goto unlock;
+                        }
+
+                        entry = purgee;
+                }
+
+                /* never read beyond what the entry actually holds */
+                ncopy = (len < entry->len) ? len : entry->len;
+                memcpy (array, entry->array, ncopy);
+                if (ncopy < len)
+                        memset ((char *) array + ncopy, 0, len - ncopy);
+
+                __print_array ("read array: ", this, array, len);
+
+                op_ret = 0;
+        }
+unlock:
+        pthread_mutex_unlock (&cache->lock);
+
+out:
+        return op_ret;
+}
+
+
+/*
+ * Store `len` bytes of `array` as the value of xattr `key` for
+ * `handle`.
+ *
+ * If the key is cached, the entry is updated (its buffer grown when
+ * needed) and marked dirty; otherwise the value is written straight to
+ * disk.
+ *
+ * Returns 0 on success, -1 if the handle has no inode, -ENOMEM if the
+ * cached buffer could not be grown, or the negative __hsetxattr()
+ * result if the write-through fails.
+ */
+int posix_xattr_cache_write (xlator_t *this, xattr_cache_handle_t *handle,
+                             char *key, int32_t *array, size_t len)
+{
+        xattr_cache_t       * cache = NULL;
+        xattr_cache_entry_t * entry = NULL;
+        int32_t             * grown = NULL;
+
+        inode_t *inode = NULL;
+
+        int op_ret = -1;
+
+        inode = __inode_for_handle (handle);
+
+        if (!inode) {
+                gf_log (this->name, GF_LOG_DEBUG,
+                        "handle has no inode!");
+                goto out;
+        }
+
+        cache = ((struct posix_private *) (this->private))->xattr_cache;
+
+        pthread_mutex_lock (&cache->lock);
+        {
+                entry = __cache_lookup (cache, inode, key);
+
+                if (entry) {
+                        if (len > entry->len) {
+                                /* grow the buffer rather than overrun it */
+                                grown = calloc (1, len);
+                                if (!grown) {
+                                        op_ret = -ENOMEM;
+                                        goto unlock;
+                                }
+
+                                FREE (entry->array);
+                                entry->array = grown;
+                        }
+
+                        memcpy (entry->array, array, len);
+                        entry->len   = len;
+                        entry->dirty = 1;
+
+                        op_ret = 0;
+                } else {
+                        /*
+                         * This case shouldn't usually happen, since the
+                         * entry should have been brought into the cache
+                         * by the previous read (xattrop always does a
+                         * read & write).
+                         *
+                         * If we've reached here, it means things are
+                         * happening very quickly and the entry was
+                         * flushed after read but before this write. In
+                         * that case, let's just write this to disk
+                         */
+
+                        op_ret = __hsetxattr (handle, this, key, array,
+                                              len, 0);
+                }
+
+                __print_array ("wrote array: ", this, array, len);
+        }
+unlock:
+        pthread_mutex_unlock (&cache->lock);
+
+out:
+        return op_ret;
+}
+
+
+/*
+ * Flush and drop every cache entry belonging to the inode of `handle`.
+ *
+ * All matching entries are processed even if one write-back fails.
+ *
+ * Returns 0 on success, -EINVAL if the handle has no inode, or the
+ * first negative write-back result.
+ */
+int posix_xattr_cache_flush (xlator_t *this, xattr_cache_handle_t *handle)
+{
+        xattr_cache_t       *cache = NULL;
+        xattr_cache_entry_t *entry = NULL;
+
+        size_t   i     = 0;
+        inode_t *inode = NULL;
+
+        int ret    = 0;
+        int op_ret = 0;
+
+        inode = __inode_for_handle (handle);
+        if (!inode) {
+                gf_log (this->name, GF_LOG_DEBUG,
+                        "handle has no inode!");
+                op_ret = -EINVAL;
+                goto out;
+        }
+
+        cache = ((struct posix_private *) (this->private))->xattr_cache;
+
+        pthread_mutex_lock (&cache->lock);
+        {
+                for (i = 0; i < cache->size; i++) {
+                        entry = cache->entries[i];
+
+                        if (!entry || !entry->handle
+                            || (entry->inode != inode))
+                                continue;
+
+                        if (entry->handle->loc.path)
+                                gf_log (this->name, GF_LOG_DEBUG,
+                                        "force flushing entry for %s",
+                                        entry->handle->loc.path);
+
+                        else if (entry->handle->fd)
+                                gf_log (this->name, GF_LOG_DEBUG,
+                                        "force flushing entry for fd=%p",
+                                        entry->handle->fd);
+
+                        ret = __cache_flush_entry (entry, this);
+                        if ((ret < 0) && (op_ret == 0))
+                                op_ret = ret;
+                }
+        }
+        pthread_mutex_unlock (&cache->lock);
+
+out:
+        return op_ret;
+}
+
+
+/*
+ * Flush and drop every populated entry in the cache.
+ *
+ * All entries are processed even if one write-back fails.
+ *
+ * Returns 0 on success, or the first negative write-back result.
+ */
+int
+posix_xattr_cache_flush_all (xlator_t *this)
+{
+        xattr_cache_t       *cache = NULL;
+        xattr_cache_entry_t *entry = NULL;
+
+        size_t i      = 0;
+        int    ret    = 0;
+        int    op_ret = 0;
+
+        cache = ((struct posix_private *) (this->private))->xattr_cache;
+
+        pthread_mutex_lock (&cache->lock);
+        {
+                gf_log (this->name, GF_LOG_DEBUG,
+                        "flushing entire xattr cache: ");
+
+                for (i = 0; i < cache->size; i++) {
+                        entry = cache->entries[i];
+
+                        if (!entry || !entry->handle)
+                                continue;
+
+                        if (entry->handle->loc.path)
+                                gf_log (this->name, GF_LOG_DEBUG,
+                                        " force flushing entry for %s",
+                                        entry->handle->loc.path);
+
+                        else if (entry->handle->fd)
+                                gf_log (this->name, GF_LOG_DEBUG,
+                                        " force flushing entry for fd=%p",
+                                        entry->handle->fd);
+
+                        ret = __cache_flush_entry (entry, this);
+                        if ((ret < 0) && (op_ret == 0))
+                                op_ret = ret;
+                }
+        }
+        pthread_mutex_unlock (&cache->lock);
+
+        return op_ret;
+}
+
+
+/*
+ * Create a cache with `size` empty entries and an initialised lock.
+ *
+ * Returns the new cache, or NULL if `size` is 0 (eviction needs at
+ * least one entry), an allocation fails, or the mutex cannot be
+ * initialised.  Nothing is leaked on failure.
+ */
+xattr_cache_t *
+posix_xattr_cache_init (size_t size)
+{
+        size_t          i     = 0;
+        xattr_cache_t * cache = NULL;
+
+        if (size == 0)
+                goto err;
+
+        cache = CALLOC (1, sizeof (xattr_cache_t));
+        if (!cache)
+                goto err;
+
+        cache->entries = CALLOC (size, sizeof (xattr_cache_entry_t *));
+        if (!cache->entries)
+                goto err;
+
+        cache->size = size;
+
+        for (i = 0; i < size; i++) {
+                cache->entries[i] = calloc (1, sizeof (xattr_cache_entry_t));
+                if (!cache->entries[i])
+                        goto err;
+        }
+
+        if (pthread_mutex_init (&cache->lock, NULL) != 0)
+                goto err;
+
+        return cache;
+
+err:
+        if (cache) {
+                if (cache->entries) {
+                        for (i = 0; i < size; i++) {
+                                if (cache->entries[i]) {
+                                        FREE (cache->entries[i]);
+                                }
+                        }
+
+                        FREE (cache->entries);
+                }
+
+                FREE (cache);
+        }
+
+        /* never hand a freed pointer back to the caller */
+        return NULL;
+}
diff --git a/xlators/storage/posix/src/xattr-cache.h b/xlators/storage/posix/src/xattr-cache.h
new file mode 100644
index 000000000..3e12742a9
--- /dev/null
+++ b/xlators/storage/posix/src/xattr-cache.h
@@ -0,0 +1,65 @@
+/*
+  Copyright (c) 2009 Z RESEARCH, Inc. <http://www.zresearch.com>
+  This file is part of GlusterFS.
+
+  GlusterFS is free software; you can redistribute it and/or modify
+  it under the terms of the GNU General Public License as published
+  by the Free Software Foundation; either version 3 of the License,
+  or (at your option) any later version.
+
+  GlusterFS is distributed in the hope that it will be useful, but
+  WITHOUT ANY WARRANTY; without even the implied warranty of
+  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+  General Public License for more details.
+
+  You should have received a copy of the GNU General Public License
+  along with this program.  If not, see
+  <http://www.gnu.org/licenses/>.
+*/
+
+#ifndef __XATTR_CACHE_H__
+#define __XATTR_CACHE_H__
+
+
+#include "glusterfs.h"
+#include "inode.h"
+
+typedef struct __xattr_cache_handle {
+        loc_t loc;
+        fd_t *fd;
+} xattr_cache_handle_t;
+
+
+typedef struct __xattr_cache_entry {
+        char *key;              /* name of the xattr */
+        int32_t *array;         /* value */
+        size_t len;             /* length of array in bytes */
+        inode_t *inode;         /* inode for which the entry is for */
+
+        xattr_cache_handle_t *handle;
+        unsigned char dirty;
+        unsigned long nraccess; /* number of times accessed */
+} xattr_cache_entry_t;
+
+
+typedef struct __xattr_cache {
+        size_t size;
+        pthread_mutex_t lock;
+        xattr_cache_entry_t **entries;
+} xattr_cache_t;
+
+
+xattr_cache_t * posix_xattr_cache_init (size_t size);
+
+int posix_xattr_cache_read (xlator_t *this, xattr_cache_handle_t *handle,
+                            char *key, int32_t *array, size_t len);
+
+int posix_xattr_cache_write (xlator_t *this, xattr_cache_handle_t *handle,
+                             char *key, int32_t *array, size_t len);
+
+int posix_xattr_cache_flush (xlator_t *this, xattr_cache_handle_t *handle);
+
+int posix_xattr_cache_flush_all (xlator_t *this);
+
+
+#endif /* __XATTR_CACHE_H__ */
-- cgit