diff options
Diffstat (limited to 'xlators/storage/posix/src/posix-inode-fd-ops.c')
-rw-r--r-- | xlators/storage/posix/src/posix-inode-fd-ops.c | 268 |
1 files changed, 268 insertions, 0 deletions
diff --git a/xlators/storage/posix/src/posix-inode-fd-ops.c b/xlators/storage/posix/src/posix-inode-fd-ops.c index 9e2b37f582c..dafd1855ef9 100644 --- a/xlators/storage/posix/src/posix-inode-fd-ops.c +++ b/xlators/storage/posix/src/posix-inode-fd-ops.c @@ -1960,6 +1960,274 @@ out: } int32_t +posix_copy_file_range(call_frame_t *frame, xlator_t *this, fd_t *fd_in, + off64_t off_in, fd_t *fd_out, off64_t off_out, size_t len, + uint32_t flags, dict_t *xdata) +{ + int32_t op_ret = -1; + int32_t op_errno = 0; + int _fd_in = -1; + int _fd_out = -1; + struct posix_private *priv = NULL; + struct posix_fd *pfd_in = NULL; + struct posix_fd *pfd_out = NULL; + struct iatt preop_dst = { + 0, + }; + struct iatt postop_dst = { + 0, + }; + struct iatt stbuf = { + 0, + }; + int ret = -1; + dict_t *rsp_xdata = NULL; + int is_append = 0; + gf_boolean_t locked = _gf_false; + gf_boolean_t update_atomic = _gf_false; + posix_inode_ctx_t *ctx = NULL; + + VALIDATE_OR_GOTO(frame, out); + VALIDATE_OR_GOTO(this, out); + VALIDATE_OR_GOTO(fd_in, out); + VALIDATE_OR_GOTO(fd_in->inode, out); + VALIDATE_OR_GOTO(fd_out, out); + VALIDATE_OR_GOTO(fd_out->inode, out); + VALIDATE_OR_GOTO(this->private, out); + + priv = this->private; + + VALIDATE_OR_GOTO(priv, out); + DISK_SPACE_CHECK_AND_GOTO(frame, priv, xdata, op_ret, op_errno, out); + + if (posix_check_dev_file(this, fd_in->inode, "copy_file_range", &op_errno)) + goto out; + + if (posix_check_dev_file(this, fd_out->inode, "copy_file_range", &op_errno)) + goto out; + + ret = posix_fd_ctx_get(fd_in, this, &pfd_in, &op_errno); + if (ret < 0) { + gf_msg(this->name, GF_LOG_WARNING, ret, P_MSG_PFD_NULL, + "pfd is NULL from fd=%p", fd_in); + goto out; + } + + _fd_in = pfd_in->fd; + + ret = posix_fd_ctx_get(fd_out, this, &pfd_out, &op_errno); + if (ret < 0) { + gf_msg(this->name, GF_LOG_WARNING, ret, P_MSG_PFD_NULL, + "pfd is NULL from fd=%p", fd_out); + goto out; + } + + _fd_out = pfd_out->fd; + + /* + * Currently, the internal write is checked via xdata which + * is set by some xlator above. It could be due to several of + * the reasons such as healing or a snapshot operation happening + * using copy_file_range. As of now (i.e. writing the patch with + * this change) none of the xlators above posix are using the + * internal write with copy_file_range. In future it might + * change. Atleast as of now the hope is that, when that happens + * this functon or fop does not require additional changes for + * handling internal writes. + */ + ret = posix_check_internal_writes(this, fd_out, _fd_out, xdata); + if (ret < 0) { + gf_msg(this->name, GF_LOG_ERROR, 0, 0, + "possible overwrite from internal client, fd=%p", fd_out); + op_ret = -1; + op_errno = EBUSY; + goto out; + } + + if (xdata) { + if (dict_get(xdata, GLUSTERFS_WRITE_UPDATE_ATOMIC)) + update_atomic = _gf_true; + } + + /* + * The update_atomic option is to instruct posix to do prestat, + * write and poststat atomically. This is to prevent any modification to + * ia_size and ia_blocks until poststat and the diff in their values + * between pre and poststat could be of use for some translators. + * This is similar to the atomic write operation. atmoic write is + * (i.e. prestat + write + poststat) used by shard as of now. In case, + * some xlator needs copy_file_range to be atomic from prestat and postat + * prespective (i.e. prestat + copy_file_range + poststat) then it has + * to send "GLUSTERFS_WRITE_UPDATE_ATOMIC" key in xdata. + */ + + op_ret = posix_inode_ctx_get_all(fd_out->inode, this, &ctx); + if (op_ret < 0) { + op_errno = ENOMEM; + goto out; + } + + if (update_atomic) { + ret = pthread_mutex_lock(&ctx->write_atomic_lock); + if (!ret) + locked = _gf_true; + else { + gf_msg(this->name, GF_LOG_ERROR, errno, P_MSG_MUTEX_FAILED, + "failed to hold write atomic lock on %s", + uuid_utoa(fd_out->inode->gfid)); + goto out; + } + } + + op_ret = posix_fdstat(this, fd_out->inode, _fd_out, &preop_dst); + if (op_ret == -1) { + op_errno = errno; + gf_msg(this->name, GF_LOG_ERROR, errno, P_MSG_FSTAT_FAILED, + "pre-operation fstat failed on fd=%p", fd_out); + goto out; + } + + /* + * Since, only the destination file (fd_out) is undergoing + * modification, the write related tests are done on that. + * i.e. this is treater similar to as if the destination file + * undergoing write fop from maintenance perspective. + */ + if (xdata) { + op_ret = posix_cs_maintenance(this, fd_out, NULL, &_fd_out, &preop_dst, + NULL, xdata, &rsp_xdata, _gf_false); + if (op_ret < 0) { + gf_msg(this->name, GF_LOG_ERROR, 0, 0, + "file state check failed, fd %p", fd_out); + op_errno = EIO; + goto out; + } + } + + /* + * NOTE: This is just doing a single execution of copy_file_range + * system call. If the returned value of this system call is less + * than len, then should we keep doing it in a for loop until the + * copy_file_range of all the len bytes is done? + * Check the example program provided in the man page of + * copy_file_range. + * If so, then a separate variables for both off_in and off_out + * should be used which are initialized to off_in and off_out + * that this function call receives, but then advanced by the + * value returned by sys_copy_file_range and then use that as + * off_in and off_out for next instance of copy_file_range execution. + */ + op_ret = sys_copy_file_range(_fd_in, &off_in, _fd_out, &off_out, len, + flags); + + if (op_ret < 0) { + op_errno = -op_ret; + op_ret = -1; + gf_msg(this->name, GF_LOG_ERROR, op_errno, P_MSG_COPY_FILE_RANGE_FAILED, + "copy_file_range failed: fd_in: %p (gfid: %s) ," + " fd_out %p (gfid:%s)", + fd_in, uuid_utoa(fd_in->inode->gfid), fd_out, + uuid_utoa(fd_out->inode->gfid)); + goto out; + } + + /* + * Let this be as it is for now. This function collects + * infomration such as open fd count etc. So, even though + * is_append does not apply to copy_file_range, for now, + * allowing it to be recorded in the dict as _gf_false. + */ + rsp_xdata = _fill_writev_xdata(fd_out, xdata, this, is_append); + + /* copy_file_range successful, we also need to get the stat of + * the file we wrote to (i.e. destination file or fd_out). + */ + ret = posix_fdstat(this, fd_out->inode, _fd_out, &postop_dst); + if (ret == -1) { + op_ret = -1; + op_errno = errno; + gf_msg(this->name, GF_LOG_ERROR, errno, P_MSG_FSTAT_FAILED, + "post-operation fstat failed on fd=%p", fd_out); + goto out; + } + + /* + * Also perform the stat on the source fd (i.e. fd_in). For now, + * allowing it to be done within the locked region if the request + * is for atomic operation (and update) of copy_file_range. + */ + ret = posix_fdstat(this, fd_in->inode, _fd_in, &stbuf); + if (ret == -1) { + op_ret = -1; + op_errno = errno; + gf_msg(this->name, GF_LOG_ERROR, errno, P_MSG_FSTAT_FAILED, + "post-operation fstat failed on fd=%p", fd_in); + goto out; + } + + /* + * The core logic of what time attributes are to be updated + * on a fop is decided at client side xlator utime. + * All the remaining fops call posix_set_ctime function + * to update the {a,m,c}time. But, for all the other fops, + * the operation is happening on only one file (or inode). + * But here, there are 2 fds (source and destination). Hence + * the new function below to update the appropriate times for + * both the source and the destination file. + * For the source file, if at all anything has to be updated, + * it would be atime (as that file is only read, not updated). + * For the destination file, the attributes that require the + * modification would be mtime and ctime. + * What times have to be changed is actually determined by + * utime xlator. But, all of them would be in frame->root->flags. + * So, currently posix assumes that, the atime flag is for + * the source file and the other 2 flags are for the destination + * file. Since, the assumption is rigid (i.e. atime for source + * and {m,c}time for destination), the below function is called + * posix_set_ctime_cfr (cfr standing for copy_file_range). + * FUTURE TODO: + * In future, some other functionality or fop might operate + * simultaneously on 2 files. Then, depending upon what that new + * fop does or what are its requirements, the below function might + * require changes to become generic for consumption in case of + * simultaneous operations on 2 files. + */ + posix_set_ctime_cfr(frame, this, NULL, pfd_in->fd, fd_in->inode, &stbuf, + NULL, pfd_out->fd, fd_out->inode, &postop_dst); + + if (locked) { + pthread_mutex_unlock(&ctx->write_atomic_lock); + locked = _gf_false; + } + + /* + * Record copy_file_range in priv->write_value for now. + * If not needed, remove below section of code along with + * this comment (or add comment to explain why it is not + * needed). + */ + LOCK(&priv->lock); + { + priv->write_value += op_ret; + } + UNLOCK(&priv->lock); + +out: + + if (locked) { + pthread_mutex_unlock(&ctx->write_atomic_lock); + locked = _gf_false; + } + + STACK_UNWIND_STRICT(copy_file_range, frame, op_ret, op_errno, &stbuf, + &preop_dst, &postop_dst, rsp_xdata); + + if (rsp_xdata) + dict_unref(rsp_xdata); + return 0; +} + +int32_t posix_statfs(call_frame_t *frame, xlator_t *this, loc_t *loc, dict_t *xdata) { char *real_path = NULL; |