summaryrefslogtreecommitdiffstats
path: root/xlators/storage/bdb/src/bdb.h
blob: c9db02c10e6a4459fc7183b079b6644a4d209ac3 (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
/*
  Copyright (c) 2008-2009 Z RESEARCH, Inc. <http://www.zresearch.com>
  This file is part of GlusterFS.

  GlusterFS is free software; you can redistribute it and/or modify
  it under the terms of the GNU General Public License as published
  by the Free Software Foundation; either version 3 of the License,
  or (at your option) any later version.

  GlusterFS is distributed in the hope that it will be useful, but
  WITHOUT ANY WARRANTY; without even the implied warranty of
  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  General Public License for more details.

  You should have received a copy of the GNU General Public License
  along with this program.  If not, see
  <http://www.gnu.org/licenses/>.
*/

#ifndef _BDB_H
#define _BDB_H

#ifndef _CONFIG_H
#define _CONFIG_H
#include "config.h"
#endif

#include <errno.h>
#include <stdio.h>
#include <string.h>
#include <dirent.h>
#include <unistd.h>
#include <sys/types.h>
#include <dirent.h>

#include <db.h>

#ifdef linux
#ifdef __GLIBC__
#include <sys/fsuid.h>
#else
#include <unistd.h>
#endif
#endif

#ifdef HAVE_SYS_XATTR_H
#include <sys/xattr.h>
#endif

#ifdef HAVE_SYS_EXTATTR_H
#include <sys/extattr.h>
#endif

#include <pthread.h>
#include "xlator.h"
#include "inode.h"
#include "compat.h"
#include "compat-errno.h"

/* file name of the per-directory storage database. It carries a leading
 * '/' because MAKE_REAL_PATH_TO_STORAGE_DB() appends it verbatim to a
 * directory path. */
#define BDB_STORAGE    "/glusterfs_storage.db"

/* numbers are not so reader-friendly, so let's have ON and OFF macros */
#define ON  1
#define OFF 0

/* NOTE(review): presumably the defaults for 'lru_limit' and 'hash_size'
 * of 'struct bctx_table' -- confirm against the option parsing code */
#define BDB_DEFAULT_LRU_LIMIT 100
#define BDB_DEFAULT_HASH_SIZE 100

/* NOTE(review): the name suggests a free-space threshold below which
 * ENOSPC is reported; the unit is not visible in this header -- confirm */
#define BDB_ENOSPC_THRESHOLD 25600

/* NOTE(review): presumably the default for 'checkpoint_interval' of
 * 'struct bdb_private', which is documented there as seconds -- confirm */
#define BDB_DEFAULT_CHECKPOINT_INTERVAL 30

/* BCTX_ENV(bctx)
 * fetch the DB_ENV that a 'struct bdb_ctx' operates under, reached through
 * the table the ctx belongs to.
 *
 * @bctx: a 'struct bdb_ctx *'. The argument is parenthesized in the
 *        expansion so that expressions such as '&ctx' can be passed.
 */
#define BCTX_ENV(bctx) (((bctx)->table)->dbenv)

/* BDB_EXPORT_PATH_LEN(_private)
 * read 'export_path_length' out of an untyped pointer to
 * 'struct bdb_private'.
 *
 * @_private: pointer to a 'struct bdb_private' (typically this->private).
 *            Parenthesized so the cast applies to the whole argument
 *            expression, not just to its first operand.
 */
#define BDB_EXPORT_PATH_LEN(_private) \
        (((struct bdb_private *)(_private))->export_path_length)

/* BDB_EXPORT_PATH(_private)
 * read the 'export_path' string out of an untyped pointer to
 * 'struct bdb_private'.
 *
 * @_private: pointer to a 'struct bdb_private' (typically this->private).
 *            Parenthesized so the cast applies to the whole argument
 *            expression, not just to its first operand.
 */
#define BDB_EXPORT_PATH(_private) \
        (((struct bdb_private *)(_private))->export_path)
/* MAKE_REAL_PATH(var,this,path)
 * build the path on the underlying file-system that corresponds to a
 * path seen from the mount-point: the export directory immediately
 * followed by @path.
 *
 * @var:  'char *' lvalue that receives the result. The buffer comes from
 *        alloca(), so it lives on the caller's stack and must not be
 *        freed or used after the calling function returns.
 * @this: pointer to xlator_t corresponding to bdb xlator
 * @path: path, as seen from mount-point (evaluated more than once)
 */
#define MAKE_REAL_PATH(var, this, path) do {                            \
                int export_len = BDB_EXPORT_PATH_LEN(this->private);    \
                var = alloca (export_len + strlen (path) + 2);          \
                strcpy (var, BDB_EXPORT_PATH(this->private));           \
                /* overwrite from the end of the export prefix */       \
                strcpy (var + export_len, path);                        \
        } while (0)

/* MAKE_REAL_PATH_TO_STORAGE_DB(var,this,path)
 * make the real path to the storage-database file on file-system:
 * export directory + @path + BDB_STORAGE.
 *
 * @var:  'char *' lvalue that receives the result. The buffer comes from
 *        alloca(), so it lives on the caller's stack and must not be
 *        freed or used after the calling function returns.
 * @this: pointer to xlator_t corresponding to bdb xlator
 * @path: path of the directory, as seen from mount-point
 *        (evaluated more than once)
 *
 * The buffer is sized with sizeof (BDB_STORAGE) rather than strlen() so
 * that the terminating NUL written by strcat() has room; sizing it with
 * strlen() left the buffer one byte short.
 */
#define MAKE_REAL_PATH_TO_STORAGE_DB(var, this, path) do {              \
                int base_len = BDB_EXPORT_PATH_LEN(this->private);      \
                var = alloca (strlen (path) +                           \
                              base_len +                                \
                              sizeof (BDB_STORAGE));                    \
                strcpy (var, BDB_EXPORT_PATH(this->private));           \
                strcpy (&var[base_len], path);                          \
                strcat (var, BDB_STORAGE);                              \
        } while (0)

/* MAKE_KEY_FROM_PATH(key,path)
 * make a 'key', which we use as key in the underlying database by using
 * the path: the key is basename() of a private copy of @path.
 *
 * @key:  'char *' lvalue that receives the key. It points at whatever
 *        basename() returns for the stack copy, so treat it as valid only
 *        until the calling function returns and do not free it.
 * @path: path to file as seen from mount-point (evaluated more than once)
 *
 * The copy is sized strlen (path) + 1 so that strcpy()'s terminating NUL
 * fits. The scratch variable has a macro-specific name so that a caller
 * variable called 'tmp' can be passed as @key.
 *
 * NOTE: the expansion ends in ';' as it always has; that is kept so call
 * sites written without their own ';' keep compiling. Avoid using this
 * macro as the unbraced body of an if/else.
 */
#define MAKE_KEY_FROM_PATH(key, path) do {                              \
                char *bdb_key_path_copy_ = alloca (strlen (path) + 1);  \
                strcpy (bdb_key_path_copy_, (path));                    \
                (key) = basename (bdb_key_path_copy_);                  \
        }while (0);

/* BDB_DO_LSTAT(path,stbuf,dirent)
 * construct real-path to a dirent ("<path>/<d_name>") and do lstat on the
 * real-path.
 *
 * @path:   path to the directory whose readdir is currently in progress
 * @stbuf:  a 'struct stat *'
 * @dirent: a 'struct dirent *'
 *
 * The result is stored in a variable named 'ret' that must already exist
 * in the caller's scope: it receives lstat()'s return value, or -1 with
 * errno set to ENAMETOOLONG when the joined path does not fit in
 * GF_PATH_MAX bytes (the path is built with snprintf() so the fixed-size
 * buffer can never be overrun).
 *
 * NOTE: the expansion ends in ';' as it always has; that is kept so call
 * sites written without their own ';' keep compiling.
 */
#define BDB_DO_LSTAT(path, stbuf, dirent) do {                          \
                char tmp_real_path[GF_PATH_MAX];                        \
                int  tmp_real_len = snprintf (tmp_real_path,            \
                                              sizeof (tmp_real_path),   \
                                              "%s/%s", (path),          \
                                              (dirent)->d_name);        \
                if ((tmp_real_len < 0) ||                               \
                    ((size_t) tmp_real_len >= sizeof (tmp_real_path))) {\
                        errno = ENAMETOOLONG;                           \
                        ret = -1;                                       \
                } else {                                                \
                        ret = lstat (tmp_real_path, (stbuf));           \
                }                                                       \
        } while(0);

/* IS_BDB_PRIVATE_FILE(name)
 * evaluates to 1 when 'name' is one of the file names reserved for the
 * bdb xlator's internal use, 0 otherwise.
 *
 * @name: basename of a file (evaluated more than once).
 *
 * Reserved names:
 *   - anything starting with "__db."      (used by libdb)
 *   - exactly "glusterfs_storage.db"      (used by bdb xlator itself)
 *   - exactly "glusterfs_ns.db"           (used by bdb xlator itself)
 *   - anything starting with "log.0000"   (used by libdb)
 */
#define IS_BDB_PRIVATE_FILE(name)                                       \
        ((strncmp ((name), "__db.", 5) == 0)                            \
         || (strcmp ((name), "glusterfs_storage.db") == 0)              \
         || (strcmp ((name), "glusterfs_ns.db") == 0)                   \
         || (strncmp ((name), "log.0000", 8) == 0))

/* check if 'name' is exactly the '.' or '..' entry.
 *
 * Whole-string comparison is used on purpose: a prefix comparison would
 * also match every name that merely starts with a dot (".hidden",
 * "..data", ...) and treat those regular entries as '.'/'..'.
 *
 * @name: NUL-terminated entry name (evaluated more than once).
 */
#define IS_DOT_DOTDOT(name) \
        ((strcmp ((name), ".") == 0) || (strcmp ((name), "..") == 0))

/* BDB_SET_BCTX(this,inode,bctx)
 * remember a 'struct bdb_ctx *' in an inode's ctx: the pointer is
 * converted to a uint64_t and handed to inode_ctx_put().
 * per the original note, this is done for all directory inodes, either in
 * lookup() or mkdir().
 *
 * @this:  pointer xlator_t of bdb xlator.
 * @inode: inode where 'struct bdb_ctx *' has to be stored.
 * @bctx:  a 'struct bdb_ctx *'
 *
 * NOTE: the expansion itself ends in ';'.
 */
#define BDB_SET_BCTX(this,inode,bctx) do{                               \
                uint64_t bctx_as_u64 = (uint64_t)(long)bctx;            \
                inode_ctx_put(inode, this, bctx_as_u64);                \
        }while (0);

/* MAKE_BCTX_FROM_INODE(this,bctx,inode)
 * extract bdb xlator's 'struct bdb_ctx *' from an inode's ctx.
 * valid only if done for directory inodes.
 *
 * @this:  pointer xlator_t of bdb xlator.
 * @bctx:  a 'struct bdb_ctx *' lvalue. It is assigned only when
 *         inode_ctx_get() returns 0; otherwise it is left untouched, so
 *         callers should initialise it to NULL beforehand.
 * @inode: inode from where 'struct bdb_ctx *' has to be extracted.
 *
 * The return value of inode_ctx_get() is captured in a macro-local
 * variable and tested directly, so the outcome does not depend on an
 * unrelated 'ret' variable in the caller's scope.
 * NOTE(review): assumes inode_ctx_get() returns 0 on success -- confirm
 * against its definition.
 *
 * NOTE: the expansion itself ends in ';'.
 */
#define MAKE_BCTX_FROM_INODE(this,bctx,inode) do{                       \
                uint64_t tmp_bctx = 0;                                  \
                int      tmp_bctx_ret = inode_ctx_get ((inode), (this), \
                                                       &tmp_bctx);      \
                if (tmp_bctx_ret == 0)                                  \
                        (bctx) = (void *)(long)tmp_bctx;                \
        }while (0);

/* BDB_SET_BFD(this,fd,bfd)
 * remember a bdb per-fd pointer in an fd's ctx: the pointer is converted
 * to a uint64_t and handed to fd_ctx_set().
 *
 * @this: pointer xlator_t of bdb xlator.
 * @fd:   fd in whose ctx the pointer has to be stored.
 * @bfd:  pointer to store.
 *
 * NOTE: the expansion itself ends in ';'.
 */
#define BDB_SET_BFD(this,fd,bfd) do{                            \
                uint64_t bfd_as_u64 = (uint64_t)(long)bfd;      \
                fd_ctx_set (fd, this, bfd_as_u64);              \
        }while (0);

/* maximum number of open dbs that bdb xlator will ever have */
#define BDB_MAX_OPEN_DBS 100

/* convert file size to block-count.
 *
 * Computes (size rounded up to whole blocks of 'blksize') minus one.
 * Both arguments are parenthesized so that expression arguments such as
 * 'a + b' are divided as a whole; each is evaluated more than once.
 * NOTE(review): for size == 0 with signed operands the result is -1 --
 * confirm that callers expect the "minus one".
 */
#define BDB_COUNT_BLOCKS(size,blksize) \
        ((((size) + (blksize) - 1) / (blksize)) - 1)

/* file permissions, again macros are more readable */
#define RWXRWXRWX         0777
#define DEFAULT_FILE_MODE 0644
#define DEFAULT_DIR_MODE  0755

/* see, if we have a valid file permissions specification in @mode:
 * true (1) when no bit outside RWXRWXRWX is set, false (0) otherwise.
 * @mode is parenthesized so that expressions like 'a | b' are tested as a
 * whole. */
#define IS_VALID_FILE_MODE(mode) (!((mode) & (~(RWXRWXRWX))))
#define IS_VALID_DIR_MODE(mode)  (!((mode) & (~(RWXRWXRWX))))

/* maximum retries for a failed transactional operation */
#define BDB_MAX_RETRIES 10

/* bounds (and default) for the database page-size, in the same unit as
 * the value passed to PAGE_SIZE_IN_RANGE() */
#define BDB_LL_PAGE_SIZE_DEFAULT    4096
#define BDB_LL_PAGE_SIZE_MIN        4096
#define BDB_LL_PAGE_SIZE_MAX        65536

/* PAGE_SIZE_IN_RANGE(_page_size)
 * true when BDB_LL_PAGE_SIZE_MIN <= _page_size <= BDB_LL_PAGE_SIZE_MAX.
 *
 * Both bounds are checked against the macro argument itself, so the
 * macro no longer needs a variable named 'table' in the caller's scope
 * and cannot silently test a different value than the one passed in.
 * @_page_size is evaluated twice.
 */
#define PAGE_SIZE_IN_RANGE(_page_size)                  \
        (((_page_size) >= BDB_LL_PAGE_SIZE_MIN)         \
         && ((_page_size) <= BDB_LL_PAGE_SIZE_MAX))

/* short type names for the structures defined below in this header */
typedef struct bctx_table bctx_table_t;
typedef struct bdb_ctx    bctx_t;
typedef struct bdb_cache  bdb_cache_t;
typedef struct bdb_private bdb_private_t;

/* table of 'struct bdb_ctx' objects: holds the shared DB_ENV, the
 * settings used when opening databases, and the hash/active/lru/purge
 * lists that the ctxs are kept on. */
struct bctx_table {
        /* flags to be used for opening each database */
        uint64_t            dbflags;

        /* cache: can be either ON or OFF */
        uint64_t            cache;

        /* used to lock the 'struct bctx_table *' */
        gf_lock_t           lock;

        /* lock for checkpointing */
        gf_lock_t           checkpoint_lock;

        /* hash table of 'struct bdb_ctx' */
        struct list_head   *b_hash;

        /* list of active 'struct bdb_ctx' */
        struct list_head    active;

        /* lru list of inactive 'struct bdb_ctx' */
        struct list_head    b_lru;

        /* NOTE(review): presumably ctxs queued for destruction -- confirm */
        struct list_head    purge;

        /* NOTE(review): presumably the maximum and current length of
         * 'b_lru', and the number of buckets in 'b_hash' -- confirm */
        uint32_t            lru_limit;
        uint32_t            lru_size;
        uint32_t            hash_size;

        /* access mode for accessing the databases, can be DB_HASH, DB_BTREE */
        DBTYPE              access_mode;

        /* DB_ENV under which every db operation is carried over */
        DB_ENV             *dbenv;

        /* NOTE(review): presumably transaction ON or OFF, mirroring
         * 'transaction' in 'struct bdb_private' -- confirm */
        int32_t             transaction;

        /* NOTE(review): presumably the bdb xlator owning this table */
        xlator_t           *this;

        /* page-size of DB, DB->set_pagesize(), should be set before DB->open */
        uint64_t            page_size;
};

/* per-directory context of the bdb xlator: ties a directory path to the
 * database that resides inside it, plus the bookkeeping (list links,
 * reference count, lock, record cache) needed to manage it from a
 * 'struct bctx_table'. */
struct bdb_ctx {
        /* controller members */

        /* lru list of 'struct bdb_ctx's, a bdb_ctx can exist in one of
         * b_hash or lru lists
         * NOTE(review): presumably this is the link used for the table's
         * 'active' / 'b_lru' / 'purge' lists -- confirm */
        struct list_head   list;

        /* directory 'name' hashed list of 'struct bdb_ctx's */
        struct list_head   b_hash;

        /* table this ctx belongs to (BCTX_ENV() reaches the DB_ENV
         * through it) */
        struct bctx_table *table;
        int32_t            ref;         /* reference count */
        gf_lock_t          lock;        /* used to lock this 'struct bdb_ctx' */

        char              *directory;   /* directory path */

        /* pointer to open database, that resides inside this directory */
        DB                *dbp;
        uint32_t           cache;       /* cache ON or OFF */

        /* per directory cache, bdb xlator's internal cache */
        struct list_head   c_list;      /* linked list of cached records */
        int32_t            c_count;     /* number of cached records */

        /* index to hash table list, to which this ctx belongs */
        int32_t            key_hash;
        char              *db_path;     /* absolute path to db file */
};

/* per-open-file state of the bdb xlator: which directory ctx the file
 * lives in, its key (basename) and the open flags. */
struct bdb_fd {
        /* pointer to bdb_ctx of the parent directory */
        struct bdb_ctx *ctx;

        /* name of the file. NOTE: basename, not the complete path */
        char           *key;
        int32_t         flags;          /* open flags */
};

/* per-open-directory state of the bdb xlator. */
struct bdb_dir {
        /* pointer to bdb_ctx of this directory */
        struct bdb_ctx *ctx;

        /* open directory pointer, as returned by opendir() */
        DIR            *dir;

        /* FIXME: readdir offset, too crude. must go
         * NOTE(review): the buffer is NAME_MAX bytes, not NAME_MAX + 1 --
         * confirm that a maximum-length name plus its terminating NUL is
         * never copied in here */
        char            offset[NAME_MAX];
        char           *path;             /* path to this directory */
};

/* cache: one cached record (file name + file content) kept on the
 * 'c_list' of a 'struct bdb_ctx' */
struct bdb_cache {
        /* list of 'struct bdb_cache' under a 'struct bdb_ctx' */
        struct list_head c_list;

        /* name of the file this cache holds. NOTE: basename of file */
        char            *key;
        char            *data;            /* file content */

        /* size of the file content that this cache holds */
        size_t           size;
};


/* private data of one bdb xlator instance (reached through
 * this->private, see BDB_EXPORT_PATH() / BDB_EXPORT_PATH_LEN()): export
 * directory, statistics, and the Berkeley DB environment settings taken
 * from the volume options. */
struct bdb_private {
        /* pointer to inode table that we use */
        inode_table_t      *itable;
        int32_t             temp;               /* NOTE(review): purpose not documented */
        char                is_stateless;       /* NOTE(review): purpose not documented */

        /* path to the export directory
         * (option directory <export-path>) */
        char               *export_path;

        /* length of 'export_path' string */
        int32_t             export_path_length;

        /* statistics */
        /* Statistics, provides activity of the server */
        struct xlator_stats stats;

        /* NOTE(review): presumably the time of the previous statistics
         * fetch and of xlator init -- confirm */
        struct timeval      prev_fetch_time;
        struct timeval      init_time;
        int32_t             max_read;           /* NOTE(review): undocumented; see interval_read */
        int32_t             max_write;          /* NOTE(review): undocumented; see interval_write */

        /* Used to calculate the max_read value */
        int64_t             interval_read;

        /* Used to calculate the max_write value */
        int64_t             interval_write;
        int64_t             read_value;         /* Total read, from init */
        int64_t             write_value;        /* Total write, from init */

        /* bdb xlator specific private data */

        /* flags used for opening DB_ENV for this xlator */
        uint64_t            envflags;

        /* flags to be used for opening each database */
        uint64_t            dbflags;

        /* cache: can be either ON or OFF */
        uint64_t            cache;

        /* transaction: can be either ON or OFF */
        uint32_t            transaction;

        /* NOTE(review): 'active' and the lock that presumably protects
         * it are undocumented -- confirm their meaning */
        uint32_t            active;
        gf_lock_t           active_lock;

        /* table of 'struct bdb_ctx' used by this xlator */
        struct bctx_table  *b_table;

        /* access mode for accessing the databases, can be DB_HASH, DB_BTREE
         * (option access-mode <mode>) */
        DBTYPE              access_mode;

        /* mode for each and every file stored on bdb
         * (option file-mode <mode>) */
        mode_t              file_mode;

        /* mode for each and every directory stored on bdb
         * (option dir-mode <mode>) */
        mode_t              dir_mode;

        /* mode for each and every symlink stored on bdb */
        mode_t              symlink_mode;

        /* pthread_t object used for creating checkpoint thread */
        pthread_t           checkpoint_thread;

        /* time duration between two consecutive checkpoint operations.
         * (option checkpoint-interval <time-in-seconds>) */
        uint32_t             checkpoint_interval;

        /* inode number allocation counter */
        ino_t               next_ino;

        /* lock to protect 'next_ino' */
        gf_lock_t           ino_lock;

        /* environment log directory (option logdir <directory>) */
        char               *logdir;

        /* errfile path, used by environment to print detailed error log.
         * (option errfile <errfile-path>) */
        char               *errfile;

        /* DB_ENV->set_errfile() expects us to fopen
         * the errfile before doing DB_ENV->set_errfile() */
        FILE               *errfp;

        /* used by DB_ENV->set_timeout to set the timeout for
         * a transactionally encapsulated DB->operation() to
         * timeout before waiting for locks to be released.
         * (option transaction-timeout <time-in-milliseconds>)
         */
        uint32_t            txn_timeout;

        /* NOTE(review): presumably the lock timeout also handed to
         * DB_ENV->set_timeout -- confirm */
        uint32_t            lock_timeout;

        /* DB_AUTO_LOG_REMOVE flag for DB_ENV */
        uint32_t            log_auto_remove;

        /* NOTE(review): presumably the maximum size of the log region of
         * the DB_ENV -- confirm */
        uint32_t            log_region_max;
};


/* bdb_txn_begin
 * thin wrapper around DB_ENV->txn_begin(): starts a transaction in @dbenv
 * and stores its handle in @ptxnid. NULL and 0 are passed for the
 * remaining two arguments (per the Berkeley DB API: no parent
 * transaction, default flags).
 *
 * @dbenv:  environment to start the transaction in; dereferenced, must
 *          not be NULL.
 * @ptxnid: where the new transaction handle is stored.
 *
 * returns whatever DB_ENV->txn_begin() returns.
 */
static inline int32_t
bdb_txn_begin (DB_ENV *dbenv,
               DB_TXN **ptxnid)
{
        int32_t status = dbenv->txn_begin (dbenv, NULL, ptxnid, 0);

        return status;
}

/* bdb_txn_abort
 * thin wrapper around DB_TXN->abort().
 *
 * @txnid: transaction to abort; dereferenced, must not be NULL.
 *
 * returns whatever DB_TXN->abort() returns.
 */
static inline int32_t
bdb_txn_abort (DB_TXN *txnid)
{
        int32_t status = txnid->abort (txnid);

        return status;
}

/* bdb_txn_commit
 * thin wrapper around DB_TXN->commit(), called with flags 0.
 *
 * @txnid: transaction to commit; dereferenced, must not be NULL.
 *
 * returns whatever DB_TXN->commit() returns.
 */
static inline int32_t
bdb_txn_commit (DB_TXN *txnid)
{
        int32_t status = txnid->commit (txnid, 0);

        return status;
}

/* bdb_extract_bfd
 * NOTE(review): presumably returns the pointer previously stored for
 * (@fd, @this) with BDB_SET_BFD() -- confirm against the definition.
 *
 * Declared without 'inline': the body is not visible in this header, and
 * ISO C requires a function declared 'inline' with external linkage to be
 * defined in the same translation unit. A plain prototype is compatible
 * with the out-of-line definition.
 */
void *
bdb_extract_bfd (fd_t *fd, xlator_t *this);


/* Record-level database operations on the database of a directory ctx.
 * NOTE(review): the implementations are not visible in this header; the
 * descriptions below are inferred from names and signatures only --
 * confirm against the definitions. */

/* presumably fetches statistics of @bctx's database; ownership of the
 * returned pointer is not visible here */
void *
bdb_db_stat (bctx_t *bctx,
             DB_TXN *txnid,
             uint32_t flags);

/* presumably reads up to @size bytes at @offset of the record named
 * @key_string into *@buf; return convention not visible here */
int32_t
bdb_db_get(struct bdb_ctx *bctx,
           DB_TXN *txnid,
           const char *key_string,
           char **buf,
           size_t size,
           off_t offset);

/* presumably a value for the 'flags' argument of bdb_db_put() that
 * requests truncating the record -- confirm */
#define BDB_TRUNCATE_RECORD 0xcafebabe

/* presumably writes @size bytes from @buf at @offset of the record named
 * @key_string; return convention not visible here */
int32_t
bdb_db_put (struct bdb_ctx *bctx,
            DB_TXN *txnid,
            const char *key_string,
            const char *buf,
            size_t size,
            off_t offset,
            int32_t flags);

/* presumably deletes the record for @path from @bctx's database */
int32_t
bdb_db_del (struct bdb_ctx *bctx,
            DB_TXN *txnid,
            const char *path);

/* NOTE(review): the implementations of the functions below are not
 * visible in this header; the descriptions are inferred from names and
 * signatures only -- confirm against the definitions. */

/* presumably derives an inode number from the @parent inode number and
 * @bctx */
ino_t
bdb_inode_transform (ino_t parent,
                     struct bdb_ctx *bctx);


/* presumably opens a cursor on @bctx's database and stores it in
 * *@cursorp */
int32_t
bdb_cursor_open (struct bdb_ctx *bctx,
                 DBC **cursorp);

/* presumably fetches a key/value pair through @cursorp, with @flags
 * selecting the cursor operation */
int32_t
bdb_cursor_get (DBC *cursorp,
                DBT *key,
                DBT *value,
                int32_t flags);


/* presumably closes a cursor obtained from bdb_cursor_open() */
int32_t
bdb_cursor_close (struct bdb_ctx *ctx,
                  DBC *cursorp);


/* presumably the size of the directory entry needed to hold @key */
int32_t
bdb_dirent_size (DBT *key);

/* presumably the size of the directory entry needed to hold @entry */
int32_t
dirent_size (struct dirent *entry);

/* NOTE(review): the implementations of the functions below are not
 * visible in this header; the descriptions are inferred from names and
 * signatures only -- confirm against the definitions. */

/* presumably sets up the Berkeley DB side of the xlator @this from the
 * volume @options */
int
bdb_db_init (xlator_t *this,
             dict_t *options);

/* the signature looks like a per-entry dict callback that presumably
 * closes the database held in @value -- confirm */
void
bdb_dbs_from_dict_close (dict_t *this,
                         char *key,
                         data_t *value,
                         void *data);

/* presumably finds (or creates) the ctx of the directory @path in
 * @table; reference ownership of the result is not visible here */
bctx_t *
bctx_lookup (struct bctx_table *table,
             const char *path);

/* presumably like bctx_lookup(), but for the parent directory of @path */
bctx_t *
bctx_parent
(struct bctx_table *table,
 const char *path);

/* presumably drops one reference on @ctx (see 'ref' in struct bdb_ctx) */
bctx_t *
bctx_unref (bctx_t *ctx);

/* presumably takes one reference on @ctx (see 'ref' in struct bdb_ctx) */
bctx_t *
bctx_ref (bctx_t *ctx);

#endif /* _BDB_H */