diff options
author | Niels de Vos <ndevos@redhat.com> | 2014-01-28 10:06:13 +0100 |
---|---|---|
committer | Anand Avati <avati@redhat.com> | 2014-02-27 21:56:48 -0800 |
commit | b0515e2a4a08b657ef7e9715fb8c6222c700e78c (patch) | |
tree | 2186a50d4abf9c8a216e585b835c7abcfeb898c3 | |
parent | 6535bafe588ea901ac15d31ddb6550a2ba9cd915 (diff) |
write-behind: track filesize when doing extending writes
A program that calls mmap() on a newly created sparse file, may receive
a SIGBUS signal. If SIGBUS is not handled, a segmentation fault will
occur and the program will exit.
A bug in the write-behind translator can cause the creation of a sparse
file created with open(), seek(), write() to be cached. The last write()
may not be sent to the server, until write-behind deems this necessary.
* open(.., O_TRUNC, ...)/creat() the file, it is 0 bytes big
* seek() into the file, use offset 31
* write() 1 byte to the file
* the range from byte 0-30 are unwritten so called 'sparse'
The following illustration tries to capture this:
Legend:
[ = start of file
_ = unallocated/unwritten bytes
# = allocated bytes in the file
] = end of file
[_______________#]
| |
'- byte 0 '- byte 31
Without this change, reading from byte 0-30 will return an error, and
reading the same area through an mmap()'d pointer will trigger a SIGBUS.
Reading from this range did not trigger the outstanding write() to be
flushed. The brick that receives the read() (translated over the network
from mmap()) does not know that the file has been extended, and returns
-EINVAL. This error gets transported back from the brick to the
glusterfs-fuse client, and translated by the Linux kernel/VFS into
SIGBUS triggered by mmap().
In order to solve this, a new attribute to the wb_inode structure is
introduced; the current size of the file. All FOPs that can modify the
size, are expected to update wb_inode->size. This makes it possible for
extending writes with an offset bigger than EOF to mark the unwritten
area as modified/pending.
Change-Id: If5ba6646732e6be26568541ea9b12852a5d0b988
BUG: 1058663
Signed-off-by: Niels de Vos <ndevos@redhat.com>
Reviewed-on: http://review.gluster.org/6835
Tested-by: Gluster Build System <jenkins@build.gluster.com>
Reviewed-by: Raghavendra G <rgowdapp@redhat.com>
Reviewed-by: Anand Avati <avati@redhat.com>
-rw-r--r-- | tests/bugs/bug-1058663.c | 111 | ||||
-rw-r--r-- | tests/bugs/bug-1058663.t | 29 | ||||
-rw-r--r-- | xlators/performance/write-behind/src/write-behind.c | 165 |
3 files changed, 301 insertions, 4 deletions
diff --git a/tests/bugs/bug-1058663.c b/tests/bugs/bug-1058663.c new file mode 100644 index 00000000000..631afecce1f --- /dev/null +++ b/tests/bugs/bug-1058663.c @@ -0,0 +1,111 @@ +#include <stdlib.h> +#include <stdio.h> +#include <stdint.h> +#include <fcntl.h> +#include <sys/stat.h> +#include <sys/mman.h> +#include <signal.h> + +#define FILE_SIZE 1048576 + +/* number of tests to run */ +#define RUN_LOOP 1000 + +/* number of SIGBUS before exiting */ +#define MAX_SIGBUS 1 +static int expect_sigbus = 0; +static int sigbus_received = 0; + +/* test for truncate()/seek()/write()/mmap() + * There should ne no SIGBUS triggered. + */ +void seek_write(char *filename) +{ + int fd; + uint8_t* map; + int i; + + fd = open(filename, O_RDWR|O_CREAT|O_TRUNC, 0600); + lseek(fd, FILE_SIZE - 1, SEEK_SET); + write(fd, "\xff", 1); + + map = mmap(NULL, FILE_SIZE, PROT_READ, MAP_PRIVATE, fd, 0); + for (i = 0; i < (FILE_SIZE - 1); i++) { + if (map[i] != 0) /* should never be true */ + abort(); + } + munmap(map, FILE_SIZE); + + close(fd); +} + +int read_after_eof(char *filename) +{ + int ret = 0; + int fd; + char* data; + uint8_t* map; + + fd = open(filename, O_RDWR|O_CREAT|O_TRUNC, 0600); + lseek(fd, FILE_SIZE - 1, SEEK_SET); + write(fd, "\xff", 1); + + /* trigger verify that reading after EOF fails */ + ret = read(fd, data, FILE_SIZE / 2); + if (ret != 0) + return 1; + + /* map an area of 1 byte after FILE_SIZE */ + map = mmap(NULL, 1, PROT_READ, MAP_PRIVATE, fd, FILE_SIZE); + /* map[0] is an access after EOF, it should trigger SIGBUS */ + if (map[0] != 0) + /* it is expected that we exit before we get here */ + if (!sigbus_received) + return 1; + munmap(map, FILE_SIZE); + + close(fd); + + return ret; +} + +/* signal handler for SIGBUS */ +void catch_sigbus(int signum) +{ + switch (signum) { + case SIGBUS: + sigbus_received++; + if (!expect_sigbus) + exit(EXIT_FAILURE); + if (sigbus_received >= MAX_SIGBUS) + exit(EXIT_SUCCESS); + break; + default: + printf("Unexpected signal received: %d\n", signum); + } +} + +int main(int argc, char** argv) +{ + int i = 0; + + if (argc == 1) { + printf("Usage: %s <filename>\n", argv[0]); + return EXIT_FAILURE; + } + + signal(SIGBUS, catch_sigbus); + + /* the next test should not trigger SIGBUS */ + expect_sigbus = 0; + for (i = 0; i < RUN_LOOP; i++) { + seek_write(argv[1]); + } + + /* the next test should trigger SIGBUS */ + expect_sigbus = 1; + if (read_after_eof(argv[1])) + return EXIT_FAILURE; + + return EXIT_SUCCESS; +} diff --git a/tests/bugs/bug-1058663.t b/tests/bugs/bug-1058663.t new file mode 100644 index 00000000000..5ca348e773c --- /dev/null +++ b/tests/bugs/bug-1058663.t @@ -0,0 +1,29 @@ +#!/bin/bash + +. $(dirname $0)/../include.rc +. $(dirname $0)/../volume.rc + +cleanup; + +## Start and create a volume +TEST glusterd; +TEST pidof glusterd; +TEST $CLI volume info; + +TEST $CLI volume create $V0 $H0:$B0/$V0; +TEST $CLI volume start $V0; + +TEST glusterfs --entry-timeout=0 --attribute-timeout=0 -s $H0 --volfile-id $V0 $M0 + +# compile the test program and run it +gcc $(dirname $0)/bug-1058663.c -o $(dirname $0)/bug-1058663; +TEST $(dirname $0)/bug-1058663 $M0/bug-1058663.bin; +rm -f $(dirname $0)/M0/bug-1058663.bin; + +TEST umount $M0; + +TEST $CLI volume stop $V0; +TEST $CLI volume delete $V0; + +cleanup; + diff --git a/xlators/performance/write-behind/src/write-behind.c b/xlators/performance/write-behind/src/write-behind.c index b9cef1152a9..00457338dd7 100644 --- a/xlators/performance/write-behind/src/write-behind.c +++ b/xlators/performance/write-behind/src/write-behind.c @@ -108,6 +108,7 @@ typedef struct wb_inode { after it arrived (i.e, those that have a liability generation higher than itself) */ + size_t size; /* Size of the file to catch write after EOF. */ gf_lock_t lock; xlator_t *this; } wb_inode_t; @@ -503,8 +504,23 @@ wb_enqueue_common (wb_inode_t *wb_inode, call_stub_t *stub, int tempted) switch (stub->fop) { case GF_FOP_WRITE: - req->ordering.off = stub->args.offset; - req->ordering.size = req->write_size; + LOCK (&wb_inode->lock); + { + if (wb_inode->size < stub->args.offset) { + req->ordering.off = wb_inode->size; + req->ordering.size = stub->args.offset + + req->write_size + - wb_inode->size; + } else { + req->ordering.off = stub->args.offset; + req->ordering.size = req->write_size; + } + + if (wb_inode->size < stub->args.offset + req->write_size) + wb_inode->size = stub->args.offset + + req->write_size; + } + UNLOCK (&wb_inode->lock); req->fd = fd_ref (stub->args.fd); @@ -519,10 +535,20 @@ wb_enqueue_common (wb_inode_t *wb_inode, call_stub_t *stub, int tempted) case GF_FOP_TRUNCATE: req->ordering.off = stub->args.offset; req->ordering.size = 0; /* till infinity */ + LOCK (&wb_inode->lock); + { + wb_inode->size = req->ordering.off; + } + UNLOCK (&wb_inode->lock); break; case GF_FOP_FTRUNCATE: req->ordering.off = stub->args.offset; req->ordering.size = 0; /* till infinity */ + LOCK (&wb_inode->lock); + { + wb_inode->size = req->ordering.off; + } + UNLOCK (&wb_inode->lock); req->fd = fd_ref (stub->args.fd); @@ -1196,6 +1222,20 @@ wb_process_queue (wb_inode_t *wb_inode) } +void +wb_set_inode_size(wb_inode_t *wb_inode, struct iatt *postbuf) +{ + GF_ASSERT (wb_inode); + GF_ASSERT (postbuf); + + LOCK (&wb_inode->lock); + { + wb_inode->size = postbuf->ia_size; + } + UNLOCK (&wb_inode->lock); +} + + int wb_writev_cbk (call_frame_t *frame, void *cookie, xlator_t *this, int32_t op_ret, int32_t op_errno, @@ -1576,11 +1616,29 @@ noqueue: } +int32_t +wb_truncate_cbk (call_frame_t *frame, void *cookie, xlator_t *this, + int32_t op_ret, int32_t op_errno, struct iatt *prebuf, + struct iatt *postbuf, dict_t *xdata) +{ + GF_ASSERT (frame->local); + + if (op_ret == 0) + wb_set_inode_size (frame->local, postbuf); + + frame->local = NULL; + + STACK_UNWIND_STRICT (truncate, frame, op_ret, op_errno, prebuf, + postbuf, xdata); + return 0; +} + + int wb_truncate_helper (call_frame_t *frame, xlator_t *this, loc_t *loc, off_t offset, dict_t *xdata) { - STACK_WIND (frame, default_truncate_cbk, FIRST_CHILD(this), + STACK_WIND (frame, wb_truncate_cbk, FIRST_CHILD(this), FIRST_CHILD(this)->fops->truncate, loc, offset, xdata); return 0; } @@ -1597,6 +1655,8 @@ wb_truncate (call_frame_t *frame, xlator_t *this, loc_t *loc, off_t offset, if (!wb_inode) goto unwind; + frame->local = wb_inode; + stub = fop_truncate_stub (frame, wb_truncate_helper, loc, offset, xdata); if (!stub) @@ -1619,11 +1679,29 @@ unwind: } +int32_t +wb_ftruncate_cbk (call_frame_t *frame, void *cookie, xlator_t *this, + int32_t op_ret, int32_t op_errno, struct iatt *prebuf, + struct iatt *postbuf, dict_t *xdata) +{ + GF_ASSERT (frame->local); + + if (op_ret == 0) + wb_set_inode_size (frame->local, postbuf); + + frame->local = NULL; + + STACK_UNWIND_STRICT (ftruncate, frame, op_ret, op_errno, prebuf, + postbuf, xdata); + return 0; +} + + int wb_ftruncate_helper (call_frame_t *frame, xlator_t *this, fd_t *fd, off_t offset, dict_t *xdata) { - STACK_WIND (frame, default_ftruncate_cbk, FIRST_CHILD(this), + STACK_WIND (frame, wb_ftruncate_cbk, FIRST_CHILD(this), FIRST_CHILD(this)->fops->ftruncate, fd, offset, xdata); return 0; } @@ -1646,6 +1724,8 @@ wb_ftruncate (call_frame_t *frame, xlator_t *this, fd_t *fd, off_t offset, if (wb_fd_err (fd, this, &op_errno)) goto unwind; + frame->local = wb_inode; + stub = fop_ftruncate_stub (frame, wb_ftruncate_helper, fd, offset, xdata); if (!stub) { @@ -1663,6 +1743,8 @@ wb_ftruncate (call_frame_t *frame, xlator_t *this, fd_t *fd, off_t offset, return 0; unwind: + frame->local = NULL; + STACK_UNWIND_STRICT (ftruncate, frame, -1, op_errno, NULL, NULL, NULL); if (stub) @@ -1763,6 +1845,81 @@ noqueue: } +int32_t +wb_create (call_frame_t *frame, xlator_t *this, loc_t *loc, int32_t flags, + mode_t mode, mode_t umask, fd_t *fd, dict_t *xdata) +{ + wb_inode_t *wb_inode = NULL; + + wb_inode = wb_inode_create (this, fd->inode); + if (!wb_inode) + goto unwind; + + if (((flags & O_RDWR) || (flags & O_WRONLY)) && (flags & O_TRUNC)) + wb_inode->size = 0; + + STACK_WIND_TAIL (frame, FIRST_CHILD(this), + FIRST_CHILD(this)->fops->create, loc, flags, mode, + umask, fd, xdata); + return 0; + +unwind: + STACK_UNWIND_STRICT (create, frame, -1, ENOMEM, NULL, NULL, NULL, NULL, + NULL, NULL); + return 0; +} + + +int32_t +wb_open (call_frame_t *frame, xlator_t *this, loc_t *loc, int32_t flags, + fd_t *fd, dict_t *xdata) +{ + wb_inode_t *wb_inode = NULL; + + wb_inode = wb_inode_create (this, fd->inode); + if (!wb_inode) + goto unwind; + + if (((flags & O_RDWR) || (flags & O_WRONLY)) && (flags & O_TRUNC)) + wb_inode->size = 0; + + STACK_WIND_TAIL (frame, FIRST_CHILD(this), + FIRST_CHILD(this)->fops->open, loc, flags, fd, xdata); + return 0; + +unwind: + STACK_UNWIND_STRICT (open, frame, -1, ENOMEM, NULL, NULL); + return 0; +} + + +int32_t +wb_lookup_cbk (call_frame_t *frame, void *cookie, xlator_t *this, + int32_t op_ret, int32_t op_errno, inode_t *inode, + struct iatt *buf, dict_t *xdata, struct iatt *postparent) +{ + if (op_ret == 0) { + wb_inode_t *wb_inode = wb_inode_ctx_get (this, inode); + if (wb_inode) + wb_set_inode_size (wb_inode, buf); + } + + STACK_UNWIND_STRICT (lookup, frame, op_ret, op_errno, inode, buf, + xdata, postparent); + return 0; +} + + +int32_t +wb_lookup (call_frame_t *frame, xlator_t *this, loc_t *loc, + dict_t *xdata) +{ + STACK_WIND (frame, wb_lookup_cbk, FIRST_CHILD(this), + FIRST_CHILD(this)->fops->lookup, loc, xdata); + return 0; +} + + int wb_forget (xlator_t *this, inode_t *inode) { |