summaryrefslogtreecommitdiffstats
path: root/xlators/storage/posix/src/posix-common.c
diff options
context:
space:
mode:
Diffstat (limited to 'xlators/storage/posix/src/posix-common.c')
-rw-r--r--xlators/storage/posix/src/posix-common.c355
1 files changed, 275 insertions, 80 deletions
diff --git a/xlators/storage/posix/src/posix-common.c b/xlators/storage/posix/src/posix-common.c
index 9c9d52e3609..f10722ec3fb 100644
--- a/xlators/storage/posix/src/posix-common.c
+++ b/xlators/storage/posix/src/posix-common.c
@@ -26,7 +26,6 @@
#include <signal.h>
#include <sys/uio.h>
#include <unistd.h>
-#include <ftw.h>
#ifndef GF_BSD_HOST_OS
#include <alloca.h>
@@ -36,30 +35,22 @@
#include <fcntl.h>
#endif /* HAVE_LINKAT */
-#include "glusterfs.h"
-#include "checksum.h"
-#include "dict.h"
-#include "logging.h"
-#include "posix.h"
#include "posix-inode-handle.h"
-#include "xlator.h"
-#include "defaults.h"
-#include "common-utils.h"
-#include "compat-errno.h"
-#include "compat.h"
-#include "byte-order.h"
-#include "syscall.h"
-#include "statedump.h"
-#include "locking.h"
-#include "timer.h"
+#include <glusterfs/compat-errno.h>
+#include <glusterfs/compat.h>
+#include <glusterfs/byte-order.h>
+#include <glusterfs/syscall.h>
+#include <glusterfs/statedump.h>
+#include <glusterfs/locking.h>
+#include <glusterfs/timer.h>
#include "glusterfs3-xdr.h"
-#include "hashfn.h"
#include "posix-aio.h"
-#include "glusterfs-acl.h"
+#include <glusterfs/glusterfs-acl.h>
#include "posix-messages.h"
-#include "events.h"
+#include <glusterfs/events.h>
#include "posix-gfid-path.h"
-#include "compat-uuid.h"
+#include <glusterfs/compat-uuid.h>
+#include "timer-wheel.h"
extern char *marker_xattrs[];
#define ALIGN_SIZE 4096
@@ -110,13 +101,13 @@ posix_priv(xlator_t *this)
struct posix_private *priv = NULL;
char key_prefix[GF_DUMP_MAX_BUF_LEN];
- (void)snprintf(key_prefix, GF_DUMP_MAX_BUF_LEN, "%s.%s", this->type,
- this->name);
- gf_proc_dump_add_section(key_prefix);
-
if (!this)
return 0;
+ (void)snprintf(key_prefix, GF_DUMP_MAX_BUF_LEN, "%s.%s", this->type,
+ this->name);
+ gf_proc_dump_add_section("%s", key_prefix);
+
priv = this->private;
if (!priv)
@@ -124,9 +115,9 @@ posix_priv(xlator_t *this)
gf_proc_dump_write("base_path", "%s", priv->base_path);
gf_proc_dump_write("base_path_length", "%d", priv->base_path_length);
- gf_proc_dump_write("max_read", "%d", priv->read_value);
- gf_proc_dump_write("max_write", "%d", priv->write_value);
- gf_proc_dump_write("nr_files", "%ld", priv->nr_files);
+ gf_proc_dump_write("max_read", "%" PRId64, GF_ATOMIC_GET(priv->read_value));
+ gf_proc_dump_write("max_write", "%" PRId64,
+ GF_ATOMIC_GET(priv->write_value));
return 0;
}
@@ -143,11 +134,60 @@ posix_inode(xlator_t *this)
int32_t
posix_notify(xlator_t *this, int32_t event, void *data, ...)
{
+ xlator_t *victim = data;
+ struct posix_private *priv = this->private;
+ int ret = 0;
+ struct timespec sleep_till = {
+ 0,
+ };
+ glusterfs_ctx_t *ctx = this->ctx;
+
switch (event) {
case GF_EVENT_PARENT_UP: {
- /* Tell the parent that posix xlator is up */
+ /* Notify the parent that posix xlator is up */
default_notify(this, GF_EVENT_CHILD_UP, data);
} break;
+
+ case GF_EVENT_PARENT_DOWN: {
+ if (!victim->cleanup_starting)
+ break;
+
+ if (priv->janitor) {
+ pthread_mutex_lock(&priv->janitor_mutex);
+ {
+ priv->janitor_task_stop = _gf_true;
+ ret = gf_tw_del_timer(this->ctx->tw->timer_wheel,
+ priv->janitor);
+ if (!ret) {
+ timespec_now_realtime(&sleep_till);
+ sleep_till.tv_sec += 1;
+ /* Wait until janitor_task_done resets the
+ * janitor_task_stop flag to _gf_false */
+ while (priv->janitor_task_stop) {
+ (void)pthread_cond_timedwait(&priv->janitor_cond,
+ &priv->janitor_mutex,
+ &sleep_till);
+ timespec_now_realtime(&sleep_till);
+ sleep_till.tv_sec += 1;
+ }
+ }
+ }
+ pthread_mutex_unlock(&priv->janitor_mutex);
+ GF_FREE(priv->janitor);
+ }
+ priv->janitor = NULL;
+ pthread_mutex_lock(&ctx->fd_lock);
+ {
+ while (priv->rel_fdcount > 0) {
+ pthread_cond_wait(&priv->fd_cond, &ctx->fd_lock);
+ }
+ }
+ pthread_mutex_unlock(&ctx->fd_lock);
+
+ gf_log(this->name, GF_LOG_INFO, "Sending CHILD_DOWN for brick %s",
+ victim->name);
+ default_notify(this->parents->xlator, GF_EVENT_CHILD_DOWN, data);
+ } break;
default:
/* */
break;
@@ -333,15 +373,31 @@ posix_reconfigure(xlator_t *this, dict_t *options)
" fallback to <hostname>:<export>");
}
- GF_OPTION_RECONF("reserve", priv->disk_reserve, options, uint32, out);
- if (priv->disk_reserve)
- posix_spawn_disk_space_check_thread(this);
+ GF_OPTION_RECONF("reserve", priv->disk_reserve, options, percent_or_size,
+ out);
+ /* reserve is either a percentage or a size in bytes; values below 100 are treated as a percentage */
+ priv->disk_unit = 0;
+ if (priv->disk_reserve < 100.0)
+ priv->disk_unit = 'p';
+
+ if (priv->disk_reserve) {
+ ret = posix_spawn_disk_space_check_thread(this);
+ if (ret) {
+ gf_msg(this->name, GF_LOG_INFO, 0, P_MSG_DISK_SPACE_CHECK_FAILED,
+ "Getting disk space check from thread failed");
+ goto out;
+ }
+ }
GF_OPTION_RECONF("health-check-interval", priv->health_check_interval,
options, uint32, out);
GF_OPTION_RECONF("health-check-timeout", priv->health_check_timeout,
options, uint32, out);
- posix_spawn_health_check_thread(this);
+ if (priv->health_check_interval) {
+ ret = posix_spawn_health_check_thread(this);
+ if (ret)
+ goto out;
+ }
GF_OPTION_RECONF("shared-brick-count", priv->shared_brick_count, options,
int32, out);
@@ -496,6 +552,30 @@ posix_create_unlink_dir(xlator_t *this)
return 0;
}
+int
+posix_create_open_directory_based_fd(xlator_t *this, int pdirfd, char *dir_name)
+{
+ int ret = -1;
+ /* Open dir_name relative to pdirfd as a read-only directory; if it is missing (ENOENT), create it with mode 0700 and open it again. */
+ ret = sys_openat(pdirfd, dir_name, (O_DIRECTORY | O_RDONLY), 0);
+ if (ret < 0 && errno == ENOENT) { /* only ENOENT triggers creation; any other open failure goes to out without a log */
+ ret = sys_mkdirat(pdirfd, dir_name, 0700);
+ if (ret < 0) { /* mkdirat failed: log with errno and return the negative result */
+ gf_msg(this->name, GF_LOG_ERROR, errno, P_MSG_HANDLE_CREATE,
+ "Creating directory %s failed", dir_name);
+ goto out;
+ }
+ ret = sys_openat(pdirfd, dir_name, (O_DIRECTORY | O_RDONLY), 0);
+ if (ret < 0 && errno != EEXIST) { /* NOTE(review): a re-open failure with errno == EEXIST is not logged, yet ret < 0 is still returned */
+ gf_msg(this->name, GF_LOG_ERROR, errno, P_MSG_HANDLE_CREATE,
+ "error mkdir hash-1 %s ", dir_name); /* NOTE(review): this reports the failed re-open, not a mkdir failure */
+ goto out;
+ }
+ }
+out:
+ return ret; /* value returned by sys_openat on success; negative on any failure */
+}
+
/**
* init -
*/
@@ -522,7 +602,7 @@ posix_init(xlator_t *this)
uuid_t gfid = {
0,
};
- uuid_t rootgfid = {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1};
+ static uuid_t rootgfid = {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1};
char *guuid = NULL;
int32_t uid = -1;
int32_t gid = -1;
@@ -532,6 +612,15 @@ posix_init(xlator_t *this)
int force_directory = -1;
int create_mask = -1;
int create_directory_mask = -1;
+ char dir_handle[PATH_MAX] = {
+ 0,
+ };
+ int i;
+ char fhash[4] = {
+ 0,
+ };
+ int hdirfd = -1;
+ char value;
dir_data = dict_get(this->options, "directory");
@@ -572,7 +661,12 @@ posix_init(xlator_t *this)
}
_private->base_path = gf_strdup(dir_data->data);
- _private->base_path_length = strlen(_private->base_path);
+ _private->base_path_length = dir_data->len - 1;
+
+ _private->dirfd = -1;
+ _private->mount_lock = -1;
+ for (i = 0; i < 256; i++)
+ _private->arrdfd[i] = -1;
ret = dict_get_str(this->options, "hostname", &_private->hostname);
if (ret) {
@@ -588,16 +682,11 @@ posix_init(xlator_t *this)
}
/* Check for Extended attribute support, if not present, log it */
- op_ret = sys_lsetxattr(dir_data->data, "trusted.glusterfs.test", "working",
- 8, 0);
- if (op_ret != -1) {
- ret = sys_lremovexattr(dir_data->data, "trusted.glusterfs.test");
- if (ret) {
- gf_msg(this->name, GF_LOG_DEBUG, errno, P_MSG_INVALID_OPTION,
- "failed to remove xattr: "
- "trusted.glusterfs.test");
- }
- } else {
+ size = sys_lgetxattr(dir_data->data, "user.x", &value, sizeof(value));
+
+ if ((size == -1) && (errno == EOPNOTSUPP)) {
+ gf_msg(this->name, GF_LOG_DEBUG, 0, P_MSG_XDATA_GETXATTR,
+ "getxattr returned %zd", size);
tmp_data = dict_get(this->options, "mandate-attribute");
if (tmp_data) {
if (gf_string2boolean(tmp_data->data, &tmp_bool) == -1) {
@@ -757,6 +846,8 @@ posix_init(xlator_t *this)
}
LOCK_INIT(&_private->lock);
+ GF_ATOMIC_INIT(_private->read_value, 0);
+ GF_ATOMIC_INIT(_private->write_value, 0);
_private->export_statfs = 1;
tmp_data = dict_get(this->options, "export-statfs-size");
@@ -844,8 +935,9 @@ posix_init(xlator_t *this)
/* performing open dir on brick dir locks the brick dir
* and prevents it from being unmounted
*/
- _private->mount_lock = sys_opendir(dir_data->data);
- if (!_private->mount_lock) {
+ _private->mount_lock = sys_open(dir_data->data, (O_DIRECTORY | O_RDONLY),
+ 0);
+ if (_private->mount_lock < 0) {
ret = -1;
op_errno = errno;
gf_msg(this->name, GF_LOG_ERROR, 0, P_MSG_DIR_OPERATION_FAILED,
@@ -889,6 +981,28 @@ posix_init(xlator_t *this)
}
this->private = (void *)_private;
+ snprintf(dir_handle, sizeof(dir_handle), "%s/%s", _private->base_path,
+ GF_HIDDEN_PATH);
+ hdirfd = posix_create_open_directory_based_fd(this, _private->mount_lock,
+ dir_handle);
+ if (hdirfd < 0) {
+ gf_msg(this->name, GF_LOG_ERROR, errno, P_MSG_HANDLE_CREATE,
+ "error open directory failed for dir %s", dir_handle);
+ ret = -1;
+ goto out;
+ }
+ _private->dirfd = hdirfd;
+ for (i = 0; i < 256; i++) {
+ snprintf(fhash, sizeof(fhash), "%02x", i);
+ _private->arrdfd[i] = posix_create_open_directory_based_fd(this, hdirfd,
+ fhash);
+ if (_private->arrdfd[i] < 0) {
+ gf_msg(this->name, GF_LOG_ERROR, errno, P_MSG_HANDLE_CREATE,
+ "error openat failed for file %s", fhash);
+ ret = -1;
+ goto out;
+ }
+ }
op_ret = posix_handle_init(this);
if (op_ret == -1) {
@@ -946,27 +1060,45 @@ posix_init(xlator_t *this)
_private->disk_space_check_active = _gf_false;
_private->disk_space_full = 0;
- GF_OPTION_INIT("reserve", _private->disk_reserve, uint32, out);
- if (_private->disk_reserve)
- posix_spawn_disk_space_check_thread(this);
+
+ GF_OPTION_INIT("reserve", _private->disk_reserve, percent_or_size, out);
+
+ /* reserve is either a percentage or a size in bytes; values below 100 are treated as a percentage */
+ _private->disk_unit = 0;
+ if (_private->disk_reserve < 100.0)
+ _private->disk_unit = 'p';
+
+ if (_private->disk_reserve) {
+ ret = posix_spawn_disk_space_check_thread(this);
+ if (ret) {
+ gf_msg(this->name, GF_LOG_INFO, 0, P_MSG_DISK_SPACE_CHECK_FAILED,
+ "Getting disk space check from thread failed ");
+ goto out;
+ }
+ }
_private->health_check_active = _gf_false;
GF_OPTION_INIT("health-check-interval", _private->health_check_interval,
uint32, out);
GF_OPTION_INIT("health-check-timeout", _private->health_check_timeout,
uint32, out);
- if (_private->health_check_interval)
- posix_spawn_health_check_thread(this);
-
- pthread_mutex_init(&_private->janitor_lock, NULL);
- pthread_cond_init(&_private->janitor_cond, NULL);
- INIT_LIST_HEAD(&_private->janitor_fds);
-
- posix_spawn_janitor_thread(this);
+ if (_private->health_check_interval) {
+ ret = posix_spawn_health_check_thread(this);
+ if (ret)
+ goto out;
+ }
+ posix_janitor_timer_start(this);
pthread_mutex_init(&_private->fsync_mutex, NULL);
pthread_cond_init(&_private->fsync_cond, NULL);
+ pthread_mutex_init(&_private->janitor_mutex, NULL);
+ pthread_cond_init(&_private->janitor_cond, NULL);
+ pthread_cond_init(&_private->fd_cond, NULL);
INIT_LIST_HEAD(&_private->fsyncs);
+ _private->rel_fdcount = 0;
+ ret = posix_spawn_ctx_janitor_thread(this);
+ if (ret)
+ goto out;
ret = gf_thread_create(&_private->fsyncer, NULL, posix_fsyncer, this,
"posixfsy");
@@ -1040,9 +1172,27 @@ posix_init(xlator_t *this)
out);
GF_OPTION_INIT("ctime", _private->ctime, bool, out);
+
out:
if (ret) {
if (_private) {
+ if (_private->dirfd >= 0) {
+ sys_close(_private->dirfd);
+ _private->dirfd = -1;
+ }
+
+ for (i = 0; i < 256; i++) {
+ if (_private->arrdfd[i] >= 0) {
+ sys_close(_private->arrdfd[i]);
+ _private->arrdfd[i] = -1;
+ }
+ }
+ /*unlock brick dir*/
+ if (_private->mount_lock >= 0) {
+ (void)sys_close(_private->mount_lock);
+ _private->mount_lock = -1;
+ }
+
GF_FREE(_private->base_path);
GF_FREE(_private->hostname);
@@ -1061,36 +1211,84 @@ void
posix_fini(xlator_t *this)
{
struct posix_private *priv = this->private;
+ gf_boolean_t health_check = _gf_false;
+ glusterfs_ctx_t *ctx = this->ctx;
+ uint32_t count;
+ int ret = 0;
+ int i = 0;
+
if (!priv)
return;
LOCK(&priv->lock);
- if (priv->health_check_active) {
+ {
+ health_check = priv->health_check_active;
priv->health_check_active = _gf_false;
- pthread_cancel(priv->health_check);
- priv->health_check = 0;
}
UNLOCK(&priv->lock);
+
+ if (priv->dirfd >= 0) {
+ sys_close(priv->dirfd);
+ priv->dirfd = -1;
+ }
+
+ for (i = 0; i < 256; i++) {
+ if (priv->arrdfd[i] >= 0) {
+ sys_close(priv->arrdfd[i]);
+ priv->arrdfd[i] = -1;
+ }
+ }
+
+ if (health_check) {
+ (void)gf_thread_cleanup_xint(priv->health_check);
+ priv->health_check = 0;
+ }
+
if (priv->disk_space_check) {
priv->disk_space_check_active = _gf_false;
- pthread_cancel(priv->disk_space_check);
+ (void)gf_thread_cleanup_xint(priv->disk_space_check);
priv->disk_space_check = 0;
}
+
if (priv->janitor) {
- (void)gf_thread_cleanup_xint(priv->janitor);
- priv->janitor = 0;
+ /* TODO: make sure the synctask has also completed */
+ ret = gf_tw_del_timer(this->ctx->tw->timer_wheel, priv->janitor);
+ if (ret < 0) {
+ gf_msg(this->name, GF_LOG_ERROR, errno, P_MSG_TIMER_DELETE_FAILED,
+ "Failed to delete janitor timer");
+ }
+ GF_FREE(priv->janitor);
+ priv->janitor = NULL;
}
+
+ pthread_mutex_lock(&ctx->fd_lock);
+ {
+ count = --ctx->pxl_count;
+ if (count == 0) {
+ pthread_cond_signal(&ctx->fd_cond);
+ }
+ }
+ pthread_mutex_unlock(&ctx->fd_lock);
+
+ if (count == 0) {
+ pthread_join(ctx->janitor, NULL);
+ }
+
if (priv->fsyncer) {
(void)gf_thread_cleanup_xint(priv->fsyncer);
priv->fsyncer = 0;
}
/*unlock brick dir*/
- if (priv->mount_lock)
- (void)sys_closedir(priv->mount_lock);
+ if (priv->mount_lock >= 0) {
+ (void)sys_close(priv->mount_lock);
+ priv->mount_lock = -1;
+ }
GF_FREE(priv->base_path);
LOCK_DESTROY(&priv->lock);
- pthread_mutex_destroy(&priv->janitor_lock);
pthread_mutex_destroy(&priv->fsync_mutex);
+ pthread_cond_destroy(&priv->fsync_cond);
+ pthread_mutex_destroy(&priv->janitor_mutex);
+ pthread_cond_destroy(&priv->janitor_cond);
GF_FREE(priv->hostname);
GF_FREE(priv->trash_path);
GF_FREE(priv);
@@ -1099,7 +1297,7 @@ posix_fini(xlator_t *this)
return;
}
-struct volume_options options[] = {
+struct volume_options posix_options[] = {
{.key = {"o-direct"}, .type = GF_OPTION_TYPE_BOOL},
{.key = {"directory"},
.type = GF_OPTION_TYPE_PATH,
@@ -1162,7 +1360,7 @@ struct volume_options options[] = {
{.key = {"health-check-timeout"},
.type = GF_OPTION_TYPE_INT,
.min = 0,
- .default_value = "10",
+ .default_value = "20",
.validate = GF_OPT_VALIDATE_MIN,
.description =
"Interval in seconds to wait aio_write finish for health check, "
@@ -1170,11 +1368,11 @@ struct volume_options options[] = {
.op_version = {GD_OP_VERSION_4_0_0},
.flags = OPT_FLAG_SETTABLE | OPT_FLAG_DOC},
{.key = {"reserve"},
- .type = GF_OPTION_TYPE_INT,
+ .type = GF_OPTION_TYPE_PERCENT_OR_SIZET,
.min = 0,
.default_value = "1",
.validate = GF_OPT_VALIDATE_MIN,
- .description = "Percentage of disk space to be reserved."
+ .description = "Percentage/Size of disk space to be reserved."
" Set to 0 to disable",
.op_version = {GD_OP_VERSION_3_13_0},
.flags = OPT_FLAG_SETTABLE | OPT_FLAG_DOC},
@@ -1268,24 +1466,21 @@ struct volume_options options[] = {
.min = 0000,
.max = 0777,
.default_value = "0000",
- .validate = GF_OPT_VALIDATE_MIN,
- .validate = GF_OPT_VALIDATE_MAX,
+ .validate = GF_OPT_VALIDATE_BOTH,
.description = "Mode bit permission that will always be set on a file."},
{.key = {"force-directory-mode"},
.type = GF_OPTION_TYPE_INT,
.min = 0000,
.max = 0777,
.default_value = "0000",
- .validate = GF_OPT_VALIDATE_MIN,
- .validate = GF_OPT_VALIDATE_MAX,
+ .validate = GF_OPT_VALIDATE_BOTH,
.description = "Mode bit permission that will be always set on directory"},
{.key = {"create-mask"},
.type = GF_OPTION_TYPE_INT,
.min = 0000,
.max = 0777,
.default_value = "0777",
- .validate = GF_OPT_VALIDATE_MIN,
- .validate = GF_OPT_VALIDATE_MAX,
+ .validate = GF_OPT_VALIDATE_BOTH,
.description = "Any bit not set here will be removed from the"
"modes set on a file when it is created"},
{.key = {"create-directory-mask"},
@@ -1293,8 +1488,7 @@ struct volume_options options[] = {
.min = 0000,
.max = 0777,
.default_value = "0777",
- .validate = GF_OPT_VALIDATE_MIN,
- .validate = GF_OPT_VALIDATE_MAX,
+ .validate = GF_OPT_VALIDATE_BOTH,
.description = "Any bit not set here will be removed from the"
"modes set on a directory when it is created"},
{.key = {"max-hardlinks"},
@@ -1317,7 +1511,7 @@ struct volume_options options[] = {
"SHA256 checksum. MD5 otherwise."},
{.key = {"ctime"},
.type = GF_OPTION_TYPE_BOOL,
- .default_value = "off",
+ .default_value = "on",
.flags = OPT_FLAG_SETTABLE | OPT_FLAG_DOC,
.op_version = {GD_OP_VERSION_4_1_0},
.tags = {"ctime"},
@@ -1326,4 +1520,5 @@ struct volume_options options[] = {
"are stored in xattr to keep it consistent across replica and "
"distribute set. The time attributes stored at the backend are "
"not considered "},
- {.key = {NULL}}};
+ {.key = {NULL}},
+};