1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
|
/*
Copyright (c) 2015 Red Hat, Inc. <http://www.redhat.com>
This file is part of GlusterFS.
This file is licensed to you under your choice of the GNU Lesser
General Public License, version 3 or any later version (LGPLv3 or
later), or the GNU General Public License, version 2 (GPLv2), in all
cases as published by the Free Software Foundation.
*/
#ifndef __BIT_ROT_STUB_H__
#define __BIT_ROT_STUB_H__
#include <glusterfs/glusterfs.h>
#include <glusterfs/logging.h>
#include <glusterfs/dict.h>
#include <glusterfs/xlator.h>
#include <glusterfs/defaults.h>
#include <glusterfs/call-stub.h>
#include "bit-rot-stub-mem-types.h"
#include <glusterfs/syscall.h>
#include <glusterfs/common-utils.h>
#include "bit-rot-common.h"
#include "bit-rot-stub-messages.h"
#include "glusterfs3-xdr.h"
#include <glusterfs/syncop.h>
#include <glusterfs/syncop-utils.h>
/* 1 MiB, typed as size_t. NOTE(review): going by the name this is the stack
 * size handed to the bad-object thread -- confirm at the thread creation. */
#define BAD_OBJECT_THREAD_STACK_SIZE ((size_t)(1024 * 1024))
/* 64 KiB. NOTE(review): presumably the buffer size used when dumping state
 * as a string -- confirm against the users of this macro. */
#define BR_STUB_DUMP_STR_SIZE 65536
/* Path buffer sizes: PATH_MAX plus 1 KiB / 2 KiB of headroom.
 * BR_PATH_MAX_EXTRA sizes br_stub_private_t.stub_basepath. */
#define BR_PATH_MAX_EXTRA (PATH_MAX + 1024)
#define BR_PATH_MAX_PLUS (PATH_MAX + 2048)
/*
 * The quarantine directory name was originally misspelled ("quanrantine").
 * Both spellings are defined: OLD_BR_STUB_QUARANTINE_DIR carries the
 * misspelled name, BR_STUB_QUARANTINE_DIR the corrected one.
 * NOTE(review): presumably the old name is kept so that a directory created
 * under the misspelling can still be found -- confirm against the users.
 */
#define OLD_BR_STUB_QUARANTINE_DIR GF_HIDDEN_PATH "/quanrantine"
#define BR_STUB_QUARANTINE_DIR GF_HIDDEN_PATH "/quarantine"
/*
 * Do not reference frame->local in the cbk unless it has been initialized.
 *
 * When versioning is active (priv->do_versioning is set), frame->local is
 * assigned the sentinel value 0x1, which marks the versioning flag between
 * the call path and the cbk path. When versioning is not active, control
 * jumps to @label and frame->local is left untouched.
 *
 * @frame and @priv are pointer expressions; they are parenthesized so that
 * arguments such as "&obj" expand correctly.
 */
#define BR_STUB_VER_NOT_ACTIVE_THEN_GOTO(frame, priv, label)                   \
    do {                                                                       \
        if ((priv)->do_versioning)                                             \
            (frame)->local = (void *)0x1;                                      \
        else                                                                   \
            goto label;                                                        \
    } while (0)
/*
 * Jump to @label when versioning is disabled (priv->do_versioning is unset)
 * or when @cond evaluates to true.
 *
 * @cond is parenthesized so that an argument with lower precedence than
 * "||" (for example a conditional expression "a ? b : c") is evaluated as
 * a unit instead of swallowing the versioning check.
 */
#define BR_STUB_VER_COND_GOTO(priv, cond, label)                               \
    do {                                                                       \
        if (!(priv)->do_versioning || (cond))                                  \
            goto label;                                                        \
    } while (0)
/*
 * Cbk-side check: set @flag to _gf_true when frame->local is non-NULL, and
 * reset frame->local to NULL if it only holds the 0x1 sentinel. A
 * frame->local holding any other non-NULL value is left in place. @flag is
 * not written when frame->local is NULL.
 */
#define BR_STUB_VER_ENABLED_IN_CALLPATH(frame, flag)                           \
    do {                                                                       \
        if ((frame)->local)                                                    \
            (flag) = _gf_true;                                                 \
        if ((frame)->local == (void *)0x1)                                     \
            (frame)->local = NULL;                                             \
    } while (0)
/*
 * Clear frame->local if (and only if) it holds the 0x1 sentinel; any other
 * value, including a real pointer, is left untouched.
 */
#define BR_STUB_RESET_LOCAL_NULL(frame)                                        \
    do {                                                                       \
        if ((frame)->local == (void *)0x1)                                     \
            (frame)->local = NULL;                                             \
    } while (0)
/*
 * Function type (not a pointer type) for a versioning callback.
 * NOTE(review): the parameter list looks like the usual
 * (frame, cookie, this, op_ret, op_errno, xdata) callback shape -- the
 * parameters are unnamed here, so confirm against an implementation.
 */
typedef int(br_stub_version_cbk)(call_frame_t *, void *, xlator_t *, int32_t,
int32_t, dict_t *);
/*
 * Per-inode state of the stub. A pointer to this structure is what
 * br_stub_set_inode_ctx() stores in the inode context.
 */
typedef struct br_stub_inode_ctx {
/* bit mask of I_DIRTY / I_MODIFIED: does the inode need a writeback to
 * disk, and has it been modified? */
int need_writeback;
/* ongoing version */
unsigned long currentversion;
/* compared against BR_SIGN_REOPEN_WAIT when deciding whether a release
 * may be triggered */
int info_sign;
/* list of open fds or fds participating in write operations */
struct list_head fd_list;
/* set to _gf_true once the object has been marked bad */
gf_boolean_t bad_object;
} br_stub_inode_ctx_t;
/* Per-fd state of the stub, stored in the fd context. */
typedef struct br_stub_fd {
/* the fd this context belongs to */
fd_t *fd;
/* list linkage. NOTE(review): presumably linked into
 * br_stub_inode_ctx_t.fd_list -- confirm at the insertion site */
struct list_head list;
/* NOTE(review): looks like directory-stream state used while listing the
 * bad-object directory (handle plus end-of-directory offset) -- confirm */
struct bad_object_dir {
DIR *dir;
off_t dir_eof;
} bad_object;
} br_stub_fd_t;
/* Bits kept in br_stub_inode_ctx_t.need_writeback. */
#define I_DIRTY (1 << 0) /* inode needs writeback */
#define I_MODIFIED (1 << 1) /* inode has been marked as modified */
#define WRITEBACK_DURABLE 1 /* writeback is durable */
/**
 * Per-call local state of the stub.
 *
 * This could just have been a plain struct without a union, but the union
 * leaves room for additional variants in the future.
 */
typedef struct br_stub_local {
call_stub_t *fopstub; /* stub for original fop */
int versioningtype; /* not much used atm */
union {
/* the only variant so far: what the operation acts on */
struct br_stub_ctx {
fd_t *fd;
uuid_t gfid;
inode_t *inode;
unsigned long version;
} context;
} u;
} br_stub_local_t;
/* Versioning type bits.
 * NOTE(review): presumably the values stored in
 * br_stub_local_t.versioningtype -- confirm against the users. */
#define BR_STUB_NO_VERSIONING (1 << 0)
#define BR_STUB_INCREMENTAL_VERSIONING (1 << 1)
/* Translator-wide private state of the stub. */
typedef struct br_stub_private {
/* versioning switch tested by the BR_STUB_VER_* macros */
gf_boolean_t do_versioning;
uint32_t boot[2];
/* NOTE(review): presumably the export (brick) path -- confirm */
char export[PATH_MAX];
/* NOTE(review): lock/cond presumably guard and signal squeue -- confirm */
pthread_mutex_t lock;
pthread_cond_t cond;
struct list_head squeue; /* ordered signing queue */
pthread_t signth;
/* thread, lock, condition variable and queue grouped for bad objects */
struct bad_objects_container {
pthread_t thread;
pthread_mutex_t bad_lock;
pthread_cond_t bad_cond;
struct list_head bad_queue;
} container;
struct mem_pool *local_pool;
/* path buffer sized PATH_MAX + 1024 (BR_PATH_MAX_EXTRA) */
char stub_basepath[BR_PATH_MAX_EXTRA];
uuid_t bad_object_dir_gfid;
} br_stub_private_t;
/*
 * fd-context helpers, defined outside this header.
 * NOTE(review): going by the "__" naming used for the inode helpers in this
 * file, the "__" variants presumably expect the caller to hold the lock --
 * confirm against the definitions.
 */
br_stub_fd_t *
br_stub_fd_new(void);
int
__br_stub_fd_ctx_set(xlator_t *this, fd_t *fd, br_stub_fd_t *br_stub_fd);
br_stub_fd_t *
__br_stub_fd_ctx_get(xlator_t *this, fd_t *fd);
br_stub_fd_t *
br_stub_fd_ctx_get(xlator_t *this, fd_t *fd);
int32_t
br_stub_fd_ctx_set(xlator_t *this, fd_t *fd, br_stub_fd_t *br_stub_fd);
/* Read the bad-object flag of @ctx. No locking is done here. */
static inline gf_boolean_t
__br_stub_is_bad_object(br_stub_inode_ctx_t *ctx)
{
    gf_boolean_t is_bad = ctx->bad_object;

    return is_bad;
}
/* Flag the object described by @ctx as bad. No locking is done here. */
static inline void
__br_stub_mark_object_bad(br_stub_inode_ctx_t *ctx)
{
    gf_boolean_t *flag = &ctx->bad_object;

    *flag = _gf_true;
}
/* inode writeback helpers */

/* Set the I_DIRTY bit in @ctx; other bits are preserved. */
static inline void
__br_stub_mark_inode_dirty(br_stub_inode_ctx_t *ctx)
{
    ctx->need_writeback = ctx->need_writeback | I_DIRTY;
}
/* Clear the I_DIRTY bit in @ctx; other bits are preserved. */
static inline void
__br_stub_mark_inode_synced(br_stub_inode_ctx_t *ctx)
{
    ctx->need_writeback = ctx->need_writeback & ~I_DIRTY;
}
/* Return non-zero (the I_DIRTY bit itself) when @ctx is dirty, else 0. */
static inline int
__br_stub_is_inode_dirty(br_stub_inode_ctx_t *ctx)
{
    int dirty_bit = ctx->need_writeback & I_DIRTY;

    return dirty_bit;
}
/* inode modification markers */

/* Set the I_MODIFIED bit in @ctx; other bits are preserved. */
static inline void
__br_stub_set_inode_modified(br_stub_inode_ctx_t *ctx)
{
    ctx->need_writeback = ctx->need_writeback | I_MODIFIED;
}
/* Clear the I_MODIFIED bit in @ctx; other bits are preserved. */
static inline void
__br_stub_unset_inode_modified(br_stub_inode_ctx_t *ctx)
{
    ctx->need_writeback = ctx->need_writeback & ~I_MODIFIED;
}
/* Return non-zero (the I_MODIFIED bit itself) when @ctx is marked as
 * modified, else 0. */
static inline int
__br_stub_is_inode_modified(br_stub_inode_ctx_t *ctx)
{
    int modified_bit = ctx->need_writeback & I_MODIFIED;

    return modified_bit;
}
/**
 * Allocate a fresh br_stub_fd_t for @fd and store it in the fd context.
 *
 * @this:   translator instance (used for the fd context and for logging)
 * @fd:     fd the new context is attached to
 * @fd_ctx: on success receives the newly created context; not written on
 *          failure
 *
 * Returns 0 on success, -1 if the allocation failed, or the non-zero result
 * of br_stub_fd_ctx_set() if the context could not be stored. In the latter
 * case the new object is released here, since neither the fd nor the caller
 * ever got a reference to it.
 */
static inline int
br_stub_require_release_call(xlator_t *this, fd_t *fd, br_stub_fd_t **fd_ctx)
{
    int32_t ret = 0;
    br_stub_fd_t *br_stub_fd = NULL;

    br_stub_fd = br_stub_fd_new();
    if (!br_stub_fd)
        return -1;

    br_stub_fd->fd = fd;
    INIT_LIST_HEAD(&br_stub_fd->list);

    ret = br_stub_fd_ctx_set(this, fd, br_stub_fd);
    if (ret) {
        gf_msg(this->name, GF_LOG_WARNING, 0, BRS_MSG_SET_CONTEXT_FAILED,
               "could not set fd context (for release callback");
        /* Nobody else holds this object: free it instead of leaking it.
         * NOTE(review): assumes br_stub_fd_new() allocates through the
         * GF_CALLOC/GF_MALLOC family, so GF_FREE is the matching release --
         * confirm against its definition. */
        GF_FREE(br_stub_fd);
        return ret;
    }

    *fd_ctx = br_stub_fd;
    return ret;
}
/* get/set inode context helpers */

/*
 * Fetch this translator's inode-context value for @inode into @ctx via
 * __inode_ctx_get() and hand back its return value. No lock is taken here;
 * br_stub_get_inode_ctx() is the variant that takes inode->lock.
 */
static inline int
__br_stub_get_inode_ctx(xlator_t *this, inode_t *inode, uint64_t *ctx)
{
    int rc = __inode_ctx_get(inode, this, ctx);

    return rc;
}
/*
 * Locked wrapper around __br_stub_get_inode_ctx(): takes inode->lock for
 * the duration of the lookup and returns the lookup's result unchanged.
 */
static inline int
br_stub_get_inode_ctx(xlator_t *this, inode_t *inode, uint64_t *ctx)
{
    int rc = -1;

    LOCK(&inode->lock);
    rc = __br_stub_get_inode_ctx(this, inode, ctx);
    UNLOCK(&inode->lock);

    return rc;
}
/*
 * Store the address of @ctx as this translator's inode-context value for
 * @inode. The pointer is widened to the 64-bit slot through uintptr_t.
 * Returns whatever inode_ctx_set() returns.
 */
static inline int
br_stub_set_inode_ctx(xlator_t *this, inode_t *inode, br_stub_inode_ctx_t *ctx)
{
    uint64_t slot_value = (uint64_t)(uintptr_t)ctx;

    return inode_ctx_set(inode, this, &slot_value);
}
/* version get/set helpers */

/* Return the version that follows the ongoing one (currentversion + 1);
 * @ctx itself is not modified. */
static inline unsigned long
__br_stub_writeback_version(br_stub_inode_ctx_t *ctx)
{
    unsigned long next_version = ctx->currentversion;

    next_version += 1;
    return next_version;
}
/*
 * Advance the ongoing version of @ctx to @version.
 *
 * The version only ever moves forward: if @version is not strictly greater
 * than the current one, @ctx is left unchanged and a warning carrying both
 * values is logged.
 */
static inline void
__br_stub_set_ongoing_version(br_stub_inode_ctx_t *ctx, unsigned long version)
{
    if (ctx->currentversion < version) {
        ctx->currentversion = version;
    } else {
        /* The two values are separated explicitly; the adjacent literals
         * used to be glued together as "...%lunew version...". */
        gf_msg("bit-rot-stub", GF_LOG_WARNING, 0, BRS_MSG_CHANGE_VERSION_FAILED,
               "current version: %lu, "
               "new version: %lu",
               ctx->currentversion, version);
    }
}
/**
 * Decide whether a release can be triggered for the inode behind @ctx.
 *
 * All of the following must hold:
 *   - the inode is marked as modified,
 *   - no fd is left on ctx->fd_list,
 *   - ctx->info_sign is not BR_SIGN_REOPEN_WAIT.
 *
 * When they do, the dirty bit is asserted to be clear, the current version
 * is passed through htonl() into *@version (if @version is non-NULL) and 1
 * is returned; otherwise 0 is returned and *@version is not written.
 *
 * @inode is not used by this function.
 *
 * NOTE(review): htonl() operates on 32-bit values, so where unsigned long
 * is wider than 32 bits only the low 32 bits of the version take part.
 */
static inline int
__br_stub_can_trigger_release(inode_t *inode, br_stub_inode_ctx_t *ctx,
                              unsigned long *version)
{
    if (!__br_stub_is_inode_modified(ctx))
        return 0;

    if (!list_empty(&ctx->fd_list))
        return 0;

    if (ctx->info_sign == BR_SIGN_REOPEN_WAIT)
        return 0;

    GF_ASSERT(__br_stub_is_inode_dirty(ctx) == 0);

    if (version)
        *version = htonl(ctx->currentversion);

    return 1;
}
/**
 * Read the ongoing version of @inode into *@version under inode->lock.
 *
 * Returns the result of __inode_ctx_get(); when that is negative no context
 * is available and *@version is left untouched. @version must not be NULL.
 */
static inline int32_t
br_stub_get_ongoing_version(xlator_t *this, inode_t *inode,
                            unsigned long *version)
{
    int32_t ret = 0;
    uint64_t ctx_addr = 0;
    br_stub_inode_ctx_t *ctx = NULL;

    LOCK(&inode->lock);
    {
        ret = __inode_ctx_get(inode, this, &ctx_addr);
        if (ret >= 0) {
            /* Recover the pointer through uintptr_t, mirroring the cast
             * used by br_stub_set_inode_ctx(); "long" may be narrower
             * than a pointer on some ABIs. */
            ctx = (br_stub_inode_ctx_t *)(uintptr_t)ctx_addr;
            *version = ctx->currentversion;
        }
    }
    UNLOCK(&inode->lock);

    return ret;
}
/**
 * Fetch the current version from the inode and return the context.
 *
 * inode->lock should be held before invoking this, as the returned context
 * *needs* to stay valid in the caller.
 *
 * Returns NULL when __inode_ctx_get() fails; otherwise the context, with
 * its current version copied into *@version if @version is non-NULL.
 */
static inline br_stub_inode_ctx_t *
__br_stub_get_ongoing_version_ctx(xlator_t *this, inode_t *inode,
                                  unsigned long *version)
{
    uint64_t ctx_addr = 0;
    br_stub_inode_ctx_t *ctx = NULL;

    if (__inode_ctx_get(inode, this, &ctx_addr) < 0)
        return NULL;

    /* Recover the pointer through uintptr_t, mirroring the cast used by
     * br_stub_set_inode_ctx(); "long" may be narrower than a pointer on
     * some ABIs. */
    ctx = (br_stub_inode_ctx_t *)(uintptr_t)ctx_addr;
    if (version)
        *version = ctx->currentversion;

    return ctx;
}
/* filter for xattr fetch */

/*
 * Return 1 when @name designates one of the internal version xattrs
 * (BITROT_CURRENT_VERSION_KEY or BITROT_SIGNING_VERSION_KEY), 0 otherwise.
 * A NULL @name yields 0.
 *
 * Only the first SLEN(key) characters are compared. NOTE(review): assuming
 * SLEN() is the key's string length, a longer name that merely starts with
 * one of the keys also matches.
 */
static inline int
br_stub_is_internal_xattr(const char *name)
{
    if (!name)
        return 0;

    if (strncmp(name, BITROT_CURRENT_VERSION_KEY,
                SLEN(BITROT_CURRENT_VERSION_KEY)) == 0)
        return 1;

    if (strncmp(name, BITROT_SIGNING_VERSION_KEY,
                SLEN(BITROT_SIGNING_VERSION_KEY)) == 0)
        return 1;

    return 0;
}
/*
 * Delete the bit-rot keys (bad-object marker, current version, signing
 * version and signing xattr size) from @xattr. A NULL @xattr is a no-op.
 */
static inline void
br_stub_remove_vxattrs(dict_t *xattr)
{
    if (!xattr)
        return;

    dict_del(xattr, BITROT_OBJECT_BAD_KEY);
    dict_del(xattr, BITROT_CURRENT_VERSION_KEY);
    dict_del(xattr, BITROT_SIGNING_VERSION_KEY);
    dict_del(xattr, BITROT_SIGNING_XATTR_SIZE_KEY);
}
/**
 * Report whether @inode is marked as a bad object in its inode context.
 *
 * Return values:
 *    0 => as per the inode context the object is not bad
 *   -1 => failed to get the inode context itself (an error is logged)
 *   -2 => as per the inode context the object is bad
 *
 * Both negative values mean the fop which called this function is failed
 * and the error is returned upwards. If more errors have to be handled in
 * the future, these values can be turned into an enum.
 */
static inline int
br_stub_is_bad_object(xlator_t *this, inode_t *inode)
{
    int bad_object = 0;
    uint64_t ctx_addr = 0;
    br_stub_inode_ctx_t *ctx = NULL;

    if (br_stub_get_inode_ctx(this, inode, &ctx_addr)) {
        gf_msg(this->name, GF_LOG_ERROR, 0, BRS_MSG_GET_INODE_CONTEXT_FAILED,
               "failed to get the inode context for the inode %s",
               uuid_utoa(inode->gfid));
        return -1;
    }

    /* Recover the pointer through uintptr_t, mirroring the cast used by
     * br_stub_set_inode_ctx(); "long" may be narrower than a pointer on
     * some ABIs. */
    ctx = (br_stub_inode_ctx_t *)(uintptr_t)ctx_addr;

    LOCK(&inode->lock);
    {
        if (__br_stub_is_bad_object(ctx))
            bad_object = -2;
    }
    UNLOCK(&inode->lock);

    return bad_object;
}
/**
 * Mark @inode as a bad object in its inode context, under inode->lock.
 *
 * Returns 0 on success. If the inode context cannot be fetched, an error
 * is logged, nothing is marked and the non-zero result of
 * br_stub_get_inode_ctx() is returned.
 */
static inline int32_t
br_stub_mark_object_bad(xlator_t *this, inode_t *inode)
{
    int32_t ret = -1;
    uint64_t ctx_addr = 0;
    br_stub_inode_ctx_t *ctx = NULL;

    ret = br_stub_get_inode_ctx(this, inode, &ctx_addr);
    if (ret) {
        gf_msg(this->name, GF_LOG_ERROR, 0, BRS_MSG_GET_INODE_CONTEXT_FAILED,
               "failed to get the "
               "inode context for the inode %s",
               uuid_utoa(inode->gfid));
        return ret;
    }

    /* Recover the pointer through uintptr_t, mirroring the cast used by
     * br_stub_set_inode_ctx(); "long" may be narrower than a pointer on
     * some ABIs. */
    ctx = (br_stub_inode_ctx_t *)(uintptr_t)ctx_addr;

    LOCK(&inode->lock);
    {
        __br_stub_mark_object_bad(ctx);
    }
    UNLOCK(&inode->lock);

    return ret;
}
/**
 * Tag @xdata with GLUSTERFS_BAD_INODE = 1 when @inode is known to be bad
 * (br_stub_is_bad_object() returned -2).
 *
 * dict_set_int32() may fail; its result is handed to the caller, who has
 * to decide what to do with it. In every other case (object not bad, or
 * inode context unavailable) nothing is set and 0 is returned.
 */
static inline int32_t
br_stub_mark_xdata_bad_object(xlator_t *this, inode_t *inode, dict_t *xdata)
{
    if (br_stub_is_bad_object(this, inode) != -2)
        return 0;

    return dict_set_int32(xdata, GLUSTERFS_BAD_INODE, 1);
}
/*
 * Functions defined outside this header. br_stub_dir_create() was declared
 * twice in this list; it is now declared once.
 * NOTE(review): going by the names, these cover fd tracking on the inode,
 * the signing state, and the bookkeeping of bad objects by gfid -- confirm
 * against the definitions.
 */
int32_t
br_stub_add_fd_to_inode(xlator_t *this, fd_t *fd, br_stub_inode_ctx_t *ctx);

br_sign_state_t
__br_stub_inode_sign_state(br_stub_inode_ctx_t *ctx, glusterfs_fop_t fop,
                           fd_t *fd);

int
br_stub_dir_create(xlator_t *this, br_stub_private_t *priv);

int
br_stub_add(xlator_t *this, uuid_t gfid);

int32_t
br_stub_create_stub_gfid(xlator_t *this, char *stub_gfid_path, uuid_t gfid);
/*
 * Call-stub queue helpers and the worker entry point, defined outside this
 * header.
 * NOTE(review): presumably br_stub_worker() is the thread routine draining
 * the queue fed by br_stub_worker_enqueue() -- confirm.
 */
call_stub_t *
__br_stub_dequeue(struct list_head *callstubs);
void
__br_stub_enqueue(struct list_head *callstubs, call_stub_t *stub);
void
br_stub_worker_enqueue(xlator_t *this, call_stub_t *stub);
void *
br_stub_worker(void *data);
/* fop wrappers with lookup/readdir-shaped signatures, defined elsewhere */
int32_t
br_stub_lookup_wrapper(call_frame_t *frame, xlator_t *this, loc_t *loc,
dict_t *xattr_req);
int32_t
br_stub_readdir_wrapper(call_frame_t *frame, xlator_t *this, fd_t *fd,
size_t size, off_t off, dict_t *xdata);
/* NOTE(review): the remaining helpers look like bad-object bookkeeping
 * (removal by gfid, path listing, xattr fill, gfid-to-path) -- confirm */
int
br_stub_del(xlator_t *this, uuid_t gfid);
int
br_stub_bad_objects_path(xlator_t *this, fd_t *fd, gf_dirent_t *entries,
dict_t **dict);
void
br_stub_entry_xattr_fill(xlator_t *this, char *hpath, gf_dirent_t *entry,
dict_t *dict);
int
br_stub_get_path_of_gfid(xlator_t *this, inode_t *parent, inode_t *inode,
uuid_t gfid, char **path);
#endif /* __BIT_ROT_STUB_H__ */
|