Diffstat (limited to 'doc')
-rw-r--r--  doc/Makefile.am                                       |   4
-rw-r--r--  doc/README.md                                         |  10
-rw-r--r--  doc/debugging/analyzing-regression-cores.md (renamed from doc/developer-guide/coredump-analysis.md) |  53
-rw-r--r--  doc/debugging/coredump-analysis.md                    |  31
-rw-r--r--  doc/debugging/gfid-to-path.md                         |  45
-rw-r--r--  doc/debugging/split-brain.md                          |  75
-rw-r--r--  doc/debugging/statedump.md                            |  77
-rw-r--r--  doc/developer-guide/Language-Bindings.md              |   1
-rw-r--r--  doc/developer-guide/README.md (renamed from doc/developer-guide/Developers-Index.md) |  23
-rw-r--r--  doc/developer-guide/Using-Gluster-Test-Framework.md   |   1
-rw-r--r--  doc/developer-guide/afr-locks-evolution.md            |   6
-rw-r--r--  doc/developer-guide/afr-self-heal-daemon.md           |   2
-rw-r--r--  doc/developer-guide/bd-xlator.md                      | 469
-rw-r--r--  doc/developer-guide/brickmux-thread-reduction.md      |  64
-rw-r--r--  doc/developer-guide/coding-standard.md                | 129
-rw-r--r--  doc/developer-guide/commit-guidelines.md              | 136
-rw-r--r--  doc/developer-guide/datastructure-inode.md            |  61
-rw-r--r--  doc/developer-guide/datastructure-iobuf.md            |  36
-rw-r--r--  doc/developer-guide/datastructure-mem-pool.md         |   8
-rw-r--r--  doc/developer-guide/dirops-transactions-in-dht.md     |   3
-rw-r--r--  doc/developer-guide/fuse-interrupt.md                 | 211
-rw-r--r--  doc/developer-guide/identifying-resource-leaks.md     |  24
-rw-r--r--  doc/developer-guide/logging-guidelines.md             |   2
-rw-r--r--  doc/developer-guide/network_compression.md            |  20
-rw-r--r--  doc/developer-guide/options-to-contribute.md          | 212
-rw-r--r--  doc/developer-guide/syncop.md                         |   2
-rw-r--r--  doc/developer-guide/thread-naming.md                  |   6
-rw-r--r--  doc/developer-guide/translator-development.md         |   4
-rw-r--r--  doc/developer-guide/xlator-classification.md          | 221
-rw-r--r--  doc/features/ctime.md                                 |  32
-rw-r--r--  doc/gluster.8                                         |  43
-rw-r--r--  doc/glusterd.8                                        |   4
-rw-r--r--  doc/glusterfs.8                                       |  13
-rw-r--r--  doc/glusterfsd.8                                      |   9
-rw-r--r--  doc/mount.glusterfs.8                                 |  18
35 files changed, 1293 insertions, 762 deletions
diff --git a/doc/Makefile.am b/doc/Makefile.am
index 7c04d74019a..de68c20b4d7 100644
--- a/doc/Makefile.am
+++ b/doc/Makefile.am
@@ -1,9 +1,9 @@
EXTRA_DIST = glusterfs.8 mount.glusterfs.8 gluster.8 \
glusterd.8 glusterfsd.8
-man8_MANS = glusterfs.8 mount.glusterfs.8
+man8_MANS = glusterfs.8 mount.glusterfs.8 gluster.8
if WITH_SERVER
-man8_MANS += gluster.8 glusterd.8 glusterfsd.8
+man8_MANS += glusterd.8 glusterfsd.8
endif
CLEANFILES =
diff --git a/doc/README.md b/doc/README.md
index 6294df94b67..6aa28642ef4 100644
--- a/doc/README.md
+++ b/doc/README.md
@@ -1,3 +1,11 @@
+## Developer Guide
+
+Gluster contributors can learn about the internals by visiting the [Developer Guide section](developer-guide). While it is not comprehensive, it can help you get started.
+
+Also, while coding, keep the [Coding Standard](developer-guide/coding-standard.md) in mind.
+
+When you are ready to commit your changes, make sure you meet our [commit message standard](developer-guide/commit-guidelines.md).
+
## Admin Guide ##
The gluster administration guide is maintained at [github](https://github.com/gluster/glusterdocs). The browsable admin guide can be found [here](http://docs.gluster.org/en/latest/Administrator%20Guide/).
@@ -10,7 +18,7 @@ The Gluster features which are 'in progress' or implemented can be found at [git
## Upgrade Guide ##
-The gluster upgrade guide is maintained at [github](https://github.com/gluster/glusterdocs). The browsable upgrade guide can be found [here](http://docs.gluster.org/en/latest/Upgrade-Guide/README/)
+The gluster upgrade guide is maintained at [github](https://github.com/gluster/glusterdocs). The browsable upgrade guide can be found [here](http://docs.gluster.org/en/latest/Upgrade-Guide)
The doc patch has to be sent against the above mentioned repository.
diff --git a/doc/developer-guide/coredump-analysis.md b/doc/debugging/analyzing-regression-cores.md
index 16fa9165fd0..5e10f41c6eb 100644
--- a/doc/developer-guide/coredump-analysis.md
+++ b/doc/debugging/analyzing-regression-cores.md
@@ -1,36 +1,35 @@
-This document explains how to analyze core-dumps obtained from regression
-machines, with examples.
-1) Download the core-tarball and extract it.
-2) 'cd' into directory where the tarball is extracted.
-~~~
-[root@atalur Downloads]# pwd
-/home/atalur/Downloads
-[root@atalur Downloads]# ls
+# Analyzing Regression Cores
+This document explains how to analyze core-dumps obtained from regression machines, with examples.
+1. Download the core-tarball and extract it.
+2. `cd` into the directory where the tarball is extracted.
+```
+[sh]# pwd
+/home/user/Downloads
+[sh]# ls
build build-install-20150625_05_42_39.tar.bz2 lib64 usr
-~~~
-3) Determine the core file you need to examine. There can be more than one core file.
-You can list them from './build/install/cores' directory.
-~~~
-[root@atalur Downloads]# ls build/install/cores/
+```
+3. Determine the core file you need to examine. There can be more than one core file. You can list them in the `./build/install/cores` directory.
+```
+[sh]# ls build/install/cores/
core.9341 liblist.txt liblist.txt.tmp
-~~~
+```
In case you are unsure which binary generated the core-file, executing 'file' command on it will help.
-~~~
-[root@atalur Downloads]# file ./build/install/cores/core.9341
+```
+[sh]# file ./build/install/cores/core.9341
./build/install/cores/core.9341: ELF 64-bit LSB core file x86-64, version 1 (SYSV), SVR4-style, from '/build/install/sbin/glusterfsd -s slave26.cloud.gluster.org --volfile-id patchy'
-~~~
-As seen, the core file was generated by glusterfsd binary, and path to it is provided (/build/install/sbin/glusterfsd).
-4) Now, run the following command on the core:
-~~~
+```
+As seen, the core file was generated by the glusterfsd binary, and the path to it is provided (/build/install/sbin/glusterfsd).
+
+4. Now, run the following command on the core:
+```
gdb -ex 'set sysroot ./' -ex 'core-file ./build/install/cores/core.xxx' <target, say ./build/install/sbin/glusterd>
In this case,
gdb -ex 'set sysroot ./' -ex 'core-file ./build/install/cores/core.9341' ./build/install/sbin/glusterfsd
-~~~
-5) You can cross check if all shared libraries are available and loaded by using 'info sharedlibrary' command from
-inside gdb.
-6) Once verified, usual gdb commands based on requirement can be used to debug the core.
-'bt' or 'backtrace' from gdb of core used in examples:
-~~~
+```
+5. You can cross-check that all shared libraries are available and loaded by using the `info sharedlibrary` command from inside gdb.
+6. Once verified, the usual gdb commands can be used as needed to debug the core.
+   `bt` or `backtrace` from gdb of the core used in these examples:
+```
Core was generated by `/build/install/sbin/glusterfsd -s slave26.cloud.gluster.org --volfile-id patchy'.
Program terminated with signal SIGABRT, Aborted.
#0 0x00007f512a54e625 in raise () from ./lib64/libc.so.6
@@ -52,4 +51,4 @@ Program terminated with signal SIGABRT, Aborted.
#12 0x00007f512a55f8f0 in ?? () from ./lib64/libc.so.6
#13 0x0000000000000000 in ?? ()
(gdb)
-~~~
+```
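+
+For repeated triage, the same steps can be run non-interactively. A minimal
+sketch, assuming the same core and binary as above (the output file name is
+illustrative):
+```
+gdb -batch -ex 'set sysroot ./' \
+    -ex 'core-file ./build/install/cores/core.9341' \
+    -ex 'info sharedlibrary' -ex 'bt' \
+    ./build/install/sbin/glusterfsd > backtrace.txt
+```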
diff --git a/doc/debugging/coredump-analysis.md b/doc/debugging/coredump-analysis.md
deleted file mode 100644
index f9ecf73216e..00000000000
--- a/doc/debugging/coredump-analysis.md
+++ /dev/null
@@ -1,31 +0,0 @@
-This document explains how to analyze core-dumps obtained from regression
-machines, with examples.
-1) Download the core-tarball and extract it.
-2) 'cd' into the root of extracted tarball.
-~~~
-[root@atalur Downloads]# pwd
-/home/atalur/Downloads
-[root@atalur Downloads]# ls
-build build-install-20150625_05_42_39.tar.bz2 lib64 usr
-~~~
-3) Determine the core file you need to examine. There can be more than one core file.
-You can list them from './build/install/cores' directory.
-~~~
-[root@atalur Downloads]# ls build/install/cores/
-core.9341 liblist.txt liblist.txt.tmp
-~~~
-In case you are unsure which binary generated the core-file, executing 'file' command on it will help.
-~~~
-[root@atalur Downloads]# file ./build/install/cores/core.9341
-./build/install/cores/core.9341: ELF 64-bit LSB core file x86-64, version 1 (SYSV), SVR4-style, from '/build/install/sbin/glusterfsd -s slave26.cloud.gluster.org --volfile-id patchy'
-~~~
-As seen, the core file was generated by glusterfsd binary, and path to it is provide (/build/install/sbin/glusterfsd).
-4) Now, run the following command on the core:
-~~~
-gdb -ex 'set sysroot ./' -ex 'core-file ./build/install/cores/core.xxx' <target, say ./build/install/sbin/glusterd>
-In this case,
-gdb -ex 'set sysroot ./' -ex 'core-file ./build/install/cores/core.9341' ./build/install/sbin/glusterfsd
-~~~
-5) You can cross check if all shared libraries are available and loaded by using 'info sharedlibrary' command from
-inside gdb.
-6) Once verified, usual gdb commands based on requirement can be used to debug the core.
diff --git a/doc/debugging/gfid-to-path.md b/doc/debugging/gfid-to-path.md
index 09c459e52c8..1917bf2cca1 100644
--- a/doc/debugging/gfid-to-path.md
+++ b/doc/debugging/gfid-to-path.md
@@ -1,37 +1,37 @@
-#Convert GFID to Path
+# Convert GFID to Path
GlusterFS internal file identifier (GFID) is a uuid that is unique to each
file across the entire cluster. This is analogous to inode number in a
normal filesystem. The GFID of a file is stored in its xattr named
`trusted.gfid`.
-####Special mount using [gfid-access translator][1]:
-~~~
+#### Special mount using [gfid-access translator][1]:
+```
mount -t glusterfs -o aux-gfid-mount vm1:test /mnt/testvol
-~~~
+```
Assuming you have the `GFID` of a file from the changelog (or somewhere else).
For trying this out, you can get the `GFID` of a file from the mountpoint:
-~~~
+```
getfattr -n glusterfs.gfid.string /mnt/testvol/dir/file
-~~~
+```
---
-###Get file path from GFID (Method 1):
+### Get file path from GFID (Method 1):
**(Lists hardlinks delimited by `:`, returns path as seen from mountpoint)**
-####Turn on build-pgfid option
-~~~
+#### Turn on build-pgfid option
+```
gluster volume set test build-pgfid on
-~~~
+```
Read virtual xattr `glusterfs.ancestry.path` which contains the file path
-~~~
+```
getfattr -n glusterfs.ancestry.path -e text /mnt/testvol/.gfid/<GFID>
-~~~
+```
**Example:**
-~~~
+```
[root@vm1 glusterfs]# ls -il /mnt/testvol/dir/
total 1
10610563327990022372 -rw-r--r--. 2 root root 3 Jul 17 18:05 file
@@ -46,28 +46,23 @@ glusterfs.gfid.string="11118443-1894-4273-9340-4b212fa1c0e4"
getfattr: Removing leading '/' from absolute path names
# file: mnt/testvol/.gfid/11118443-1894-4273-9340-4b212fa1c0e4
glusterfs.ancestry.path="/dir/file:/dir/file3"
-~~~
+```
---
-###Get file path from GFID (Method 2):
+### Get file path from GFID (Method 2):
**(Does not list all hardlinks, returns backend brick path)**
-~~~
+```
getfattr -n trusted.glusterfs.pathinfo -e text /mnt/testvol/.gfid/<GFID>
-~~~
+```
**Example:**
-~~~
+```
[root@vm1 glusterfs]# getfattr -n trusted.glusterfs.pathinfo -e text /mnt/testvol/.gfid/11118443-1894-4273-9340-4b212fa1c0e4
getfattr: Removing leading '/' from absolute path names
# file: mnt/testvol/.gfid/11118443-1894-4273-9340-4b212fa1c0e4
trusted.glusterfs.pathinfo="(<DISTRIBUTE:test-dht> <POSIX(/mnt/brick-test/b):vm1:/mnt/brick-test/b/dir//file3>)"
-~~~
+```
---
-###Get file path from GFID (Method 3):
-https://gist.github.com/semiosis/4392640
-
----
-####References and links:
+#### References and links:
[posix: placeholders for GFID to path conversion](http://review.gluster.org/5951)
-[1]: https://github.com/gluster/glusterfs/blob/master/doc/features/gfid-access.md
diff --git a/doc/debugging/split-brain.md b/doc/debugging/split-brain.md
index b0d938e26bc..6b122c40551 100644
--- a/doc/debugging/split-brain.md
+++ b/doc/debugging/split-brain.md
@@ -1,33 +1,36 @@
-Steps to recover from File split-brain.
-======================================
-
-Quick Start:
-============
-1. Get the path of the file that is in split-brain:
-> It can be obtained either by
-> a) The command `gluster volume heal info split-brain`.
-> b) Identify the files for which file operations performed
- from the client keep failing with Input/Output error.
-
-2. Close the applications that opened this file from the mount point.
+# Steps to recover from File split-brain
+This document contains steps to recover from a file split-brain.
+## Quick Start:
+### Step 1. Get the path of the file that is in split-brain:
+It can be obtained either by:
+1. Using the command `gluster volume heal info split-brain`.
+2. Identifying the files for which file operations performed from the client keep failing with Input/Output errors.
+
+### Step 2. Close the applications that opened this file from the mount point.
In case of VMs, they need to be powered-off.
-3. Decide on the correct copy:
-> This is done by observing the afr changelog extended attributes of the file on
+### Step 3. Decide on the correct copy:
+This is done by observing the afr changelog extended attributes of the file on
the bricks using the getfattr command; then identifying the type of split-brain
(data split-brain, metadata split-brain, entry split-brain or split-brain due to
gfid-mismatch); and finally determining which of the bricks contains the 'good copy'
of the file.
-> `getfattr -d -m . -e hex <file-path-on-brick>`.
+```
+getfattr -d -m . -e hex <file-path-on-brick>
+```
+
It is also possible that one brick might contain the correct data while the
other might contain the correct metadata.
-4. Reset the relevant extended attribute on the brick(s) that contains the
-'bad copy' of the file data/metadata using the setfattr command.
-> `setfattr -n <attribute-name> -v <attribute-value> <file-path-on-brick>`
+### Step 4. Reset the relevant extended attribute on the brick(s) that contains the 'bad copy' of the file data/metadata using the setfattr command.
+```
+setfattr -n <attribute-name> -v <attribute-value> <file-path-on-brick>
+```
-5. Trigger self-heal on the file by performing lookup from the client:
-> `ls -l <file-path-on-gluster-mount>`
+### Step 5. Trigger self-heal on the file by performing lookup from the client:
+```
+ls -l <file-path-on-gluster-mount>
+```
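+
+After the lookup, the heal result can be verified from the CLI (the volume
+name `test` is illustrative):
+```
+gluster volume heal test info
+```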
Detailed Instructions for steps 3 through 5:
===========================================
@@ -36,13 +39,15 @@ afr changelog extended attributes.
Execute `getfattr -d -m . -e hex <file-path-on-brick>`
-* Example:
+Example:
+```
[root@store3 ~]# getfattr -d -e hex -m. brick-a/file.txt
\#file: brick-a/file.txt
security.selinux=0x726f6f743a6f626a6563745f723a66696c655f743a733000
trusted.afr.vol-client-2=0x000000000000000000000000
trusted.afr.vol-client-3=0x000000000200000000000000
trusted.gfid=0x307a5c9efddd4e7c96e94fd4bcdcbd1b
+```
The extended attributes with `trusted.afr.<volname>-client-<subvolume-index>`
are used by afr to maintain the changelog of the file. The values of the
@@ -51,10 +56,11 @@ client (fuse or nfs-server) processes. When the glusterfs client modifies a file
or directory, the client contacts each brick and updates the changelog extended
attribute according to the response of the brick.
-'subvolume-index' is nothing but (brick number - 1) in
+`subvolume-index` is nothing but (brick number - 1) in
`gluster volume info <volname>` output.
-* Example:
+Example:
+```
[root@pranithk-laptop ~]# gluster volume info vol
Volume Name: vol
Type: Distributed-Replicate
@@ -71,6 +77,7 @@ attribute according to the response of the brick.
brick-f: pranithk-laptop:/gfs/brick-f
brick-g: pranithk-laptop:/gfs/brick-g
brick-h: pranithk-laptop:/gfs/brick-h
+```
In the example above:
```
@@ -91,12 +98,15 @@ present in all the other bricks in it's replica set as seen by that brick.
In the example volume given above, all files in brick-a will have 2 entries,
one for itself and the other for the file present in its replica pair, i.e. brick-b:
+```
trusted.afr.vol-client-0=0x000000000000000000000000 -->changelog for itself (brick-a)
trusted.afr.vol-client-1=0x000000000000000000000000 -->changelog for brick-b as seen by brick-a
-
+```
Likewise, all files in brick-b will have:
+```
trusted.afr.vol-client-0=0x000000000000000000000000 -->changelog for brick-a as seen by brick-b
trusted.afr.vol-client-1=0x000000000000000000000000 -->changelog for itself (brick-b)
+```
The same can be extended for other replica pairs.
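+
+Each changelog value packs three 8-hex-digit counters: data, metadata and
+entry, in that order. A minimal shell sketch to split a value for easier
+reading (the value shown is illustrative):
+```
+val=0x000003d70000000100000000
+echo "data=${val:2:8} metadata=${val:10:8} entry=${val:18:8}"
+```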
@@ -122,7 +132,8 @@ When a file split-brain happens it could be either data split-brain or
meta-data split-brain or both. When a split-brain happens the changelog of the
file would be something like this:
-* Example:(Lets consider both data, metadata split-brain on same file).
+Example: (Let's consider both data and metadata split-brain on the same file.)
+```
[root@pranithk-laptop vol]# getfattr -d -m . -e hex /gfs/brick-?/a
getfattr: Removing leading '/' from absolute path names
\#file: gfs/brick-a/a
@@ -133,10 +144,11 @@ trusted.gfid=0x80acdbd886524f6fbefa21fc356fed57
trusted.afr.vol-client-0=0x000003b00000000100000000
trusted.afr.vol-client-1=0x000000000000000000000000
trusted.gfid=0x80acdbd886524f6fbefa21fc356fed57
+```
-###Observations:
+### Observations:
-####According to changelog extended attributes on file /gfs/brick-a/a:
+#### According to changelog extended attributes on file /gfs/brick-a/a:
The first 8 digits of trusted.afr.vol-client-0 are all
zeros (0x00000000................), and the first 8 digits of
trusted.afr.vol-client-1 are not all zeros (0x000003d7................).
@@ -149,7 +161,7 @@ trusted.afr.vol-client-1 are not all zeros (0x........00000001........).
So the changelog on /gfs/brick-a/a implies that some metadata operations succeeded
on itself but failed on /gfs/brick-b/a.
-####According to Changelog extended attributes on file /gfs/brick-b/a:
+#### According to Changelog extended attributes on file /gfs/brick-b/a:
The first 8 digits of trusted.afr.vol-client-0 are not all
zeros (0x000003b0................), and the first 8 digits of
trusted.afr.vol-client-1 are all zeros (0x00000000................).
@@ -205,6 +217,7 @@ Hence execute
`setfattr -n trusted.afr.vol-client-1 -v 0x000003d70000000000000000 /gfs/brick-a/a`
Thus after the above operations are done, the changelogs look like this:
+```
[root@pranithk-laptop vol]# getfattr -d -m . -e hex /gfs/brick-?/a
getfattr: Removing leading '/' from absolute path names
\#file: gfs/brick-a/a
@@ -216,7 +229,7 @@ trusted.gfid=0x80acdbd886524f6fbefa21fc356fed57
trusted.afr.vol-client-0=0x000000000000000100000000
trusted.afr.vol-client-1=0x000000000000000000000000
trusted.gfid=0x80acdbd886524f6fbefa21fc356fed57
-
+```
Triggering Self-heal:
---------------------
@@ -243,9 +256,9 @@ needs to be removed.The gfid-link files are present in the .glusterfs folder
in the top-level directory of the brick. If the gfid of the file is
0x307a5c9efddd4e7c96e94fd4bcdcbd1b (the trusted.gfid extended attribute got
from the getfattr command earlier),the gfid-link file can be found at
-> /gfs/brick-a/.glusterfs/30/7a/307a5c9efddd4e7c96e94fd4bcdcbd1b
+`/gfs/brick-a/.glusterfs/30/7a/307a5c9efddd4e7c96e94fd4bcdcbd1b`
-####Word of caution:
+#### Word of caution:
Before deleting the gfid-link, we have to ensure that there are no hard links
to the file present on that brick. If hard-links exist, they must be deleted as
well.
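+
+A quick check for extra hard links before deleting the gfid-link (the path is
+from the example above; a link count of 2 means only the file and its
+gfid-link remain):
+```
+stat -c %h /gfs/brick-a/.glusterfs/30/7a/307a5c9efddd4e7c96e94fd4bcdcbd1b
+find /gfs/brick-a -samefile /gfs/brick-a/.glusterfs/30/7a/307a5c9efddd4e7c96e94fd4bcdcbd1b
+```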
diff --git a/doc/debugging/statedump.md b/doc/debugging/statedump.md
index 9939576e270..9dfdce15fad 100644
--- a/doc/debugging/statedump.md
+++ b/doc/debugging/statedump.md
@@ -1,21 +1,30 @@
-#Statedump
+# Statedump
+A statedump is a file generated by a glusterfs process that captures the state of various internal data structures, such as active inodes, fds, mempools, iobufs, and per-xlator memory allocation stats for different data types.
-##How to generate statedump
-We can find the directory where statedump files are created using 'gluster --print-statedumpdir' command.
+## How to generate statedump
+We can find the directory where statedump files are created using the `gluster --print-statedumpdir` command.
Create that directory if it is not already present, based on the type of installation.
Let's call this directory `statedump-directory`.
-We can generate statedump using 'kill -USR1 <pid-of-gluster-process>'.
+We can generate a statedump using `kill -USR1 <pid-of-gluster-process>`.
The gluster process here is any of the glusterd/glusterfs/glusterfsd processes.
There are also commands to generate statedumps for brick processes/nfs server/quotad.
-For bricks: `gluster volume statedump <volname>`
+For bricks:
+```
+gluster volume statedump <volname>
+```
-For nfs server: `gluster volume statedump <volname> nfs`
+For nfs server:
+```
+gluster volume statedump <volname> nfs
+```
-For quotad: `gluster volume statedump <volname> quotad`
+For quotad:
+```
+gluster volume statedump <volname> quotad
+```
For brick processes, files will be created in `statedump-directory` with the name of the file as `hyphenated-brick-path.<pid>.dump.timestamp`. For all other processes it will be `glusterdump.<pid>.dump.timestamp`.
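+
+A quick way to generate and then locate a fresh statedump (the process picked
+here is just an example):
+```
+dumpdir=$(gluster --print-statedumpdir)
+kill -USR1 $(pidof glusterd)
+ls -lt "$dumpdir" | head
+```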
@@ -24,21 +33,21 @@ processes could have used the `SIGUSR1` signal already for other purposes.
To generate a statedump for processes using libgfapi, the below command can be
executed from one of the nodes in the gluster cluster to which the libgfapi
application is connected.
-
- gluster volume statedump <volname> client <hostname>:<process id>
-
+```
+gluster volume statedump <volname> client <hostname>:<process id>
+```
The statedumps can be found in the `statedump-directory`, the name of the
statedumps being `glusterdump.<pid>.dump.timestamp`. For a process there can be
multiple such files created depending on the number of times the volume is
accessed by the process (related to the number of `glfs_init()` calls).
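+
+For example (the volume name, hostname and pid are illustrative):
+```
+gluster volume statedump testvol client client01.example.com:4325
+```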
-##How to read statedump
+## How to read statedump
We shall see snippets of each type of statedump.
First and last lines of the file have starting and ending time of writing the statedump file. Times will be in UTC timezone.
mallinfo return status is printed in the following format. Please read man mallinfo for more information about what each field means.
-###Mallinfo
+### Mallinfo
```
[mallinfo]
mallinfo_arena=100020224 /* Non-mmapped space allocated (bytes) */
@@ -53,7 +62,7 @@ mallinfo_fordblks=3310112 /* Total free space (bytes) */
mallinfo_keepcost=133712 /* Top-most, releasable space (bytes) */
```
-###Data structure allocation stats
+### Data structure allocation stats
For every xlator loaded in the call-graph, per-data-structure memory usage is displayed in the following format:
For xlator with name: glusterfs
@@ -74,7 +83,7 @@ max_num_allocs=3 #Maximum number of active allocations at any point in the life
total_allocs=7 #Number of times this data is allocated in the life of the process.
```
-###Mempools
+### Mempools
Mempools are an optimization to reduce the number of allocations of a data type. If we create a mem-pool of, let's say, 1024 elements for a data type, new elements will be allocated from the heap using syscalls like calloc only if all the 1024 elements in the pool are in active use.
@@ -94,7 +103,7 @@ cur-stdalloc=0 #Denotes the number of allocations made from heap once cold-count
max-stdalloc=0 #Maximum number of allocations from heap that are in active use at any point in the life of the process.
```
-###Iobufs
+### Iobufs
```
[iobuf.global]
iobuf_pool=0x1f0d970 #The memory pool for iobufs
@@ -105,7 +114,7 @@ iobuf_pool.arena_cnt=8 #Total number of arenas in the pool
iobuf_pool.request_misses=0 #The number of iobufs that were stdalloc'd (as they exceeded the default max page size provided by iobuf_pool).
```
-There are 3 lists of arenas
+There are 3 lists of arenas:
1. Arena list: arenas allocated during iobuf pool creation and the arenas that are in use (active_cnt != 0) will be part of this list.
2. Purge list: arenas that can be purged (no active iobufs, active_cnt == 0).
@@ -142,7 +151,7 @@ arena.6.active_iobuf.2.ptr=0x7fdb92189000
At any given point in time if there are lots of filled arenas then that could be a sign of iobuf leaks.
-###Call stack
+### Call stack
All the fops received by gluster are handled using call-stacks. Call stack contains the information about uid/gid/pid etc of the process that is executing the fop. Each call-stack contains different call-frames per xlator which handles that fop.
```
@@ -157,7 +166,7 @@ op=LOOKUP #Fop
type=1 #Type of the op i.e. FOP/MGMT-OP
cnt=9 #Number of frames in this stack.
```
-###Call-frame
+### Call-frame
Each frame will have information about which xlator the frame belongs to, what function it wound to/from and what it will unwind to. It also mentions whether the unwind happened or not. If we observe hangs in the system and want to find out which xlator is causing them, take a statedump and see what is the final xlator which is yet to be unwound.
```
@@ -172,7 +181,7 @@ wind_to=priv->children[i]->fops->lookup
unwind_to=afr_lookup_cbk #Parent xlator function to which unwind happened
```
-###History of operations in Fuse
+### History of operations in Fuse
Fuse maintains history of operations that happened in fuse.
@@ -188,7 +197,7 @@ TIME=2014-07-09 16:44:57.523394
message=[0] fuse_getattr_resume: 4591, STAT, path: (/iozone.tmp), gfid: (3afb4968-5100-478d-91e9-76264e634c9f)
```
-###Xlator configuration
+### Xlator configuration
```
[cluster/replicate.r2-replicate-0] #Xlator type, name information
child_count=2 #Number of children to the xlator
@@ -208,7 +217,7 @@ favorite_child=-1
wait_count=1
```
-###Graph/inode table
+### Graph/inode table
```
[active graph - 1]
@@ -220,7 +229,7 @@ conn.1.bound_xl./data/brick01a/homegfs.lru_size=183 #Number of inodes present in
conn.1.bound_xl./data/brick01a/homegfs.purge_size=0 #Number of inodes present in purge list
```
-###Inode
+### Inode
```
[conn.1.bound_xl./data/brick01a/homegfs.active.324] #324th inode in active inode list
gfid=e6d337cf-97eb-44b3-9492-379ba3f6ad42 #Gfid of the inode
@@ -239,7 +248,7 @@ ia_type=2
Ref by xl:.fuse=1
Ref by xl:.patchy-client-0=-1
```
-###Inode context
+### Inode context
For each inode per xlator some context could be stored. This context can also be printed in the statedump. Here is the inode ctx of locks xlator
```
[xlator.features.locks.homegfs-locks.inode]
@@ -256,12 +265,12 @@ lock-dump.domain.domain=homegfs-replicate-0 #Domain name where entry/data operat
inodelk.inodelk[0](ACTIVE)=type=WRITE, whence=0, start=11141120, len=131072, pid = 18446744073709551615, owner=080b1ada117f0000, client=0xb7fc30, connection-id=compute-30-029.com-3505-2014/06/29-14:46:12:477358-homegfs-client-0-0-1, granted at Sun Jun 29 11:10:36 2014 #Active lock information
```
-##FAQ
-###How to debug Memory leaks using statedump?
+## FAQ
+### How to debug Memory leaks using statedump?
-####Using memory accounting feature:
+#### Using memory accounting feature:
-`https://bugzilla.redhat.com/show_bug.cgi?id=1120151` is one of the bugs which was debugged using statedump to see which data-structure is leaking. Here is the process used to find what the leak is using statedump. According to the bug the observation is that the process memory usage is increasing whenever one of the bricks is wiped in a replicate volume and a `full` self-heal is invoked to heal the contents. Statedump of the process is taken using kill -USR1 `<pid-of-gluster-self-heal-daemon>`.
+[Bug 1120151](https://bugzilla.redhat.com/show_bug.cgi?id=1120151) is one of the bugs which was debugged using statedump to see which data structure was leaking. Here is the process used to find the leak. According to the bug, the observation was that process memory usage increased whenever one of the bricks was wiped in a replicate volume and a `full` self-heal was invoked to heal the contents. A statedump of the process was taken using `kill -USR1 <pid-of-gluster-self-heal-daemon>`.
```
grep -w num_allocs glusterdump.5225.dump.1405493251
num_allocs=77078
@@ -284,10 +293,10 @@ grep of the statedump revealed too many allocations for the following data-types
3. gf_common_mt_mem_pool.
Checking the afr code for allocations with the tag `gf_common_mt_char` showed that the `data-self-heal` code path does not free one such allocation. `gf_common_mt_mem_pool` suggests that there is a leak in pool memory. The `replicate-0:dict_t`, `glusterfs:data_t` and `glusterfs:data_pair_t` pools are using a lot of memory, i.e. cold_count is `0` and there are too many allocations. Checking the source code of dict.c revealed that `key` in `dict` is allocated with `gf_common_mt_char` i.e. tag `2.`, and the value is created using gf_asprintf which in turn uses `gf_common_mt_asprintf` i.e. tag `1.`. Browsing the code for leaks in the self-heal code paths led to a line which overwrites a variable with a new dictionary even when it was already holding a reference to another dictionary. After fixing these leaks, the same test was run to verify that none of the `num_allocs` values increase, even after healing a 10,000-file directory hierarchy, in the statedump of the self-heal daemon.
-Please check http://review.gluster.org/8316 for more info about patch/code.
+Please check this [patch](http://review.gluster.org/8316) for more info about the fix.
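+
+When comparing dumps taken before and after a test run, a quick diff of the
+allocation counters highlights what is growing (the file names are
+illustrative):
+```
+grep -w num_allocs glusterdump.5225.dump.before > before.txt
+grep -w num_allocs glusterdump.5225.dump.after > after.txt
+diff before.txt after.txt
+```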
-####Debugging leaks in memory pools:
-Statedump output of memory pools was used to test and verify the fixes to https://bugzilla.redhat.com/show_bug.cgi?id=1134221. On code analysis, dict_t objects were found to be leaking (in terms of not being unref'd enough number of times, during name self-heal. The test involved creating 100 files on plain replicate volume, removing them from one of the bricks's backend, and then triggering lookup on them from the mount point. Statedump of the mount process was taken before executing the test case and after it, after compiling glusterfs with -DDEBUG flags (to have cold count set to 0 by default).
+#### Debugging leaks in memory pools:
+Statedump output of memory pools was used to test and verify the fixes to [Bug 1134221](https://bugzilla.redhat.com/show_bug.cgi?id=1134221). On code analysis, dict_t objects were found to be leaking (in terms of not being unref'd enough times) during name self-heal. The test involved creating 100 files on a plain replicate volume, removing them from one of the brick's backends, and then triggering lookup on them from the mount point. A statedump of the mount process was taken before and after executing the test case, after compiling glusterfs with -DDEBUG flags (to have cold count set to 0 by default).
Statedump output of the fuse mount process before the test case was executed:
@@ -319,7 +328,7 @@ cur-stdalloc=214
max-stdalloc=220
```
-Here, with cold count being 0 by default, cur-stdalloc indicated the number of dict_t objects that were allocated in heap using mem_get(), and yet to be freed using mem_put() (refer to https://github.com/gluster/glusterfs/blob/master/doc/data-structures/mem-pool.md for more details on how mempool works). After the test case (name selfheal of 100 files), there was a rise in the cur-stdalloc value (from 14 to 214) for dict_t.
+Here, with cold count being 0 by default, `cur-stdalloc` indicated the number of `dict_t` objects that were allocated in heap using `mem_get()`, and yet to be freed using `mem_put()` (refer to this [page](../developer-guide/datastructure-mem-pool.md) for more details on how mempool works). After the test case (name selfheal of 100 files), there was a rise in the cur-stdalloc value (from 14 to 214) for `dict_t`.
After these leaks were fixed, glusterfs was again compiled with -DDEBUG flags, and the same steps were performed again and statedump was taken before and after executing the test case, of the mount. This was done to ascertain the validity of the fix. And the following are the results:
@@ -353,8 +362,8 @@ max-stdalloc=119
```
The value of cur-stdalloc remained 14 before and after the test, indicating that the fix indeed does what it's supposed to do.
-###How to debug hangs because of frame-loss?
-`https://bugzilla.redhat.com/show_bug.cgi?id=994959` is one of the bugs where statedump was helpful in finding where the frame was lost. Here is the process used to find where the hang is using statedump.
+### How to debug hangs because of frame-loss?
+[Bug 994959](https://bugzilla.redhat.com/show_bug.cgi?id=994959) is one of the bugs where statedump was helpful in finding where the frame was lost. Here is the process used, with statedump, to find where the hang was.
When the hang was observed, statedumps were taken for all the processes. In the mount's statedump the following stack is seen:
```
[global.callpool.stack.1.frame.1]
@@ -402,4 +411,4 @@ unwind_to=qr_readdirp_cbk
```
`unwind_to` shows that the call was unwound to `afr_readdirp_cbk` from the client xlator.
Inspecting that function revealed that afr was not unwinding the stack when the fop failed.
-Check http://review.gluster.org/5531 for more info about patch/code changes.
+Check this [patch](http://review.gluster.org/5531) for more info about the fix.
diff --git a/doc/developer-guide/Language-Bindings.md b/doc/developer-guide/Language-Bindings.md
index 33f2e58504c..951f5fae2f6 100644
--- a/doc/developer-guide/Language-Bindings.md
+++ b/doc/developer-guide/Language-Bindings.md
@@ -1,3 +1,4 @@
+# Language Bindings
GlusterFS 3.4 introduced the libgfapi client API for C programs. This
page lists bindings to the libgfapi C library from other languages.
diff --git a/doc/developer-guide/Developers-Index.md b/doc/developer-guide/README.md
index 4c6346e83d4..aaf9c7476b0 100644
--- a/doc/developer-guide/Developers-Index.md
+++ b/doc/developer-guide/README.md
@@ -18,11 +18,9 @@ code check-in.
the GPL v2 and the LGPL v3 or later
- [GlusterFS Coding Standards](./coding-standard.md)
-Developing
-----------
+- If you are not sure where to start and what to do, we have a small
+ write-up on what you can pick. [Check it out](./options-to-contribute.md)
-- [Language Bindings](./Language Bindings.md) - Connect to
- GlusterFS using various language bindings
Adding File operations
----------------------
@@ -53,20 +51,29 @@ Daemon Management Framework
Translators
-----------
-- [Block Device Tanslator](./bd-xlator.md)
- [Performance/write-Behind Translator](./write-behind.md)
- [Translator Development](./translator-development.md)
- [Storage/posix Translator](./posix.md)
-- [Compression translator](./network_compression.md)
+
+
+Brick multiplex
+---------------
+
+- [Brick mux resource reduction](./brickmux-thread-reduction.md)
+
+Fuse
+----
+
+- [Interrupt handling](./fuse-interrupt.md)
Testing/Debugging
-----------------
- [Unit Tests in GlusterFS](./unittest.md)
- [Using the Gluster Test
- Framework](./Using Gluster Test Framework.md) - Step by
+ Framework](./Using-Gluster-Test-Framework.md) - Step by
step instructions for running the Gluster Test Framework
-- [Coredump Analysis](./coredump-analysis.md) - Steps to analize coredumps generated by regression machines.
+- [Coredump Analysis](../debugging/analyzing-regression-cores.md) - Steps to analyze coredumps generated by regression machines.
- [Identifying Resource Leaks](./identifying-resource-leaks.md)
Release Process
diff --git a/doc/developer-guide/Using-Gluster-Test-Framework.md b/doc/developer-guide/Using-Gluster-Test-Framework.md
index 96fa9247e84..d2bb1c391da 100644
--- a/doc/developer-guide/Using-Gluster-Test-Framework.md
+++ b/doc/developer-guide/Using-Gluster-Test-Framework.md
@@ -1,3 +1,4 @@
+# Using Gluster Test Framework
Description
-----------
diff --git a/doc/developer-guide/afr-locks-evolution.md b/doc/developer-guide/afr-locks-evolution.md
index 7d2a136d871..2dabbcfeb13 100644
--- a/doc/developer-guide/afr-locks-evolution.md
+++ b/doc/developer-guide/afr-locks-evolution.md
@@ -32,10 +32,10 @@ AFR makes use of locks xlator extensively:
* For Entry self-heal, it is `entrylk(NULL name, parent inode)`. Specifying NULL for the name takes full lock on the directory referred to by the inode.
* For data self-heal, there is a bit of history as to how locks evolved:
-###Initial version (say version 1) :
+### Initial version (say version 1):
There was no concept of a selfheal daemon (shd). Only client lookups triggered heals, so AFR always took `inodelk(0,0,DATA_DOMAIN)` for healing. The issue with this approach was that when a heal was in progress, I/O from clients was blocked.
-###version 2:
+### version 2:
shd was introduced. We needed to allow I/O to go through when a heal was in progress, provided the ranges did not overlap. To that extent, the following approach was adopted:
+ 1.shd takes (full inodelk in DATA_DOMAIN). Thus client FOPS are blocked and cannot modify changelog-xattrs
@@ -79,7 +79,7 @@ It modifies data but the FOP succeeds only on brick 2. writev returns success, a
and thus goes ahead and copies stale 128Kb from brick 1 to brick2. Thus as far as application is concerned, `writev` returned success but bricks have stale data.
What needs to be done is that `writev` must return success only if it succeeded on at least one source brick (brick b1 in this case). Otherwise, the heal still happens in the reverse direction, but as far as the application is concerned, it received an error.
-###Note on lock **domains**
+### Note on lock **domains**
We have used conceptual names in this document like DATA_DOMAIN/ METADATA_DOMAIN/ SELF_HEAL_DOMAIN. In the code, these are mapped to strings that are based on the AFR xlator name like so:
DATA_DOMAIN --->"vol_name-replicate-n"
diff --git a/doc/developer-guide/afr-self-heal-daemon.md b/doc/developer-guide/afr-self-heal-daemon.md
index b85ddd1c856..65940d420b7 100644
--- a/doc/developer-guide/afr-self-heal-daemon.md
+++ b/doc/developer-guide/afr-self-heal-daemon.md
@@ -39,7 +39,7 @@ When a client (mount) performs an operation on the file, the index xlator presen
and removes it in post-op phase if the operation is successful. Thus if an entry is present inside the .glusterfs/indices/xattrop/ directory when there is no I/O
happening on the file, it means the file needs healing (or at least an examination, if the brick crashed after the post-op completed but just before the removal of the hardlink).
-####Index heal steps:
+#### Index heal steps:
<pre><code>
In shd process of *each node* {
opendir +readdir (.glusterfs/indices/xattrop/)
diff --git a/doc/developer-guide/bd-xlator.md b/doc/developer-guide/bd-xlator.md
deleted file mode 100644
index 1771fb6e24b..00000000000
--- a/doc/developer-guide/bd-xlator.md
+++ /dev/null
@@ -1,469 +0,0 @@
-#Block device translator
-
-Block device translator (BD xlator) is a translator added to GlusterFS which provides block backend for GlusterFS. This replaces the existing bd_map translator in GlusterFS that provided similar but very limited functionality. GlusterFS expects the underlying brick to be formatted with a POSIX compatible file system. BD xlator changes that and allows for having bricks that are raw block devices like LVM which needn’t have any file systems on them. Hence with BD xlator, it becomes possible to build a GlusterFS volume comprising of bricks that are logical volumes (LV).
-
-##bd
-
-BD xlator maps underlying LVs to files and hence the LVs appear as files to GlusterFS clients. Though BD volume externally appears very similar to the usual Posix volume, not all operations are supported or possible for the files on a BD volume. Only those operations that make sense for a block device are supported and the exact semantics are described in subsequent sections.
-
-While Posix volume takes a file system directory as brick, BD volume needs a volume group (VG) as brick. In the usual use case of BD volume, a file created on BD volume will result in an LV being created in the brick VG. In addition to a VG, BD volume also needs a file system directory that should be specified at the volume creation time. This directory is necessary for supporting the notion of directories and directory hierarchy for the BD volume. Metadata about LVs (size, mapping info) is stored in this directory.
-
-BD xlator was mainly developed to use block devices directly as VM images when GlusterFS is used as storage for KVM virtualization. Some of the salient points of BD xlator are
-
-* Since BD supports file level snapshots and clones by leveraging the snapshot and clone capabilities of LVM, it can be used to fully off-load snapshot and cloning operations from QEMU to the storage (GlusterFS) itself.
-
-* BD understands dm-thin LVs and hence can support files that are backed by thinly provisioned LVs. This capability of BD xlator translates to having thinly provisioned raw VM images.
-
-* BD enables thin LVs from a thin pool to be used from multiple nodes that have visibility to GlusterFS BD volume. Thus thin pool can be used as a VM image repository allowing access/visibility to it from multiple nodes.
-
-* BD supports true zerofill by using BLKZEROOUT ioctl on underlying block devices. Thus BD allows SCSI WRITESAME to be used on underlying block device if the device supports it.
-
-Though BD xlator is primarily intended to be used with block devices, it does provide full Posix xlator compatibility for files that are created on BD volume but are not backed by or mapped to a block device. Such files which don’t have a block device mapping exist on the Posix directory that is specified during BD volume creation. BD xlator is available from GlusterFS-3.5 release.
-
-###Compiling BD translator
-
-BD xlator needs lvm2 development library. –enable-bd-xlator option can be used with `./configure` script to explicitly enable BD translator. The following snippet from the output of configure script shows that BD xlator is enabled for compilation.
-
-
-#####GlusterFS configure summary
-
- …
- Block Device xlator : yes
-
-
-###Creating a BD volume
-
-BD supports hosting of both linear LV and thin LV within the same volume. However seperate examples are provided below. As noted above, the prerequisite for a BD volume is VG which is created from a loop device here, but it can be any other device too.
-
-
-* Creating BD volume with linear LV backend
-
-* Create a loop device
-
-
- [root@node ~]# dd if=/dev/zero of=bd-loop count=1024 bs=1M
-
- [root@node ~]# losetup /dev/loop0 bd-loop
-
-
-* Prepare a brick by creating a VG
-
- [root@node ~]# pvcreate /dev/loop0
-
- [root@node ~]# vgcreate bd-vg /dev/loop0
-
-
-* Create the BD volume
-
-* Create a POSIX directory first
-
-
- [root@node ~]# mkdir /bd-meta
-
-It is recommended that this directory is created on an LV in the brick VG itself so that both data and metadata live together on the same device.
-
-
-* Create and mount the volume
-
- [root@node ~]# gluster volume create bd node:/bd-meta?bd-vg force
-
-
-The general syntax for specifying the brick is `host:/posix-dir?volume-group-name` where “?” is the separator.
-
-
-
- [root@node ~]# gluster volume start bd
- [root@node ~]# gluster volume info bd
- Volume Name: bd
- Type: Distribute
- Volume ID: cb042d2a-f435-4669-b886-55f5927a4d7f
- Status: Started
- Xlator 1: BD
- Capability 1: offload_copy
- Capability 2: offload_snapshot
- Number of Bricks: 1
- Transport-type: tcp
- Bricks:
- Brick1: node:/bd-meta
- Brick1 VG: bd-vg
-
-
-
- [root@node ~]# mount -t glusterfs node:/bd /mnt
-
-* Create a file that is backed by an LV
-
- [root@node ~]# ls /mnt
-
- [root@node ~]#
-
-Since the volume is empty now, so is the underlying VG.
-
- [root@node ~]# lvdisplay bd-vg
- [root@node ~]#
-
-Creating a file that is mapped to an LV is a 2 step operation. First the file should be created on the mount point and a specific extended attribute should be set to map the file to LV.
-
- [root@node ~]# touch /mnt/lv
- [root@node ~]# setfattr -n “user.glusterfs.bd” -v “lv” /mnt/lv
-
-Now an LV got created in the VG brick and the file /mnt/lv maps to this LV. Any read/write to this file ends up as read/write to the underlying LV.
-
- [root@node ~]# lvdisplay bd-vg
- — Logical volume —
- LV Path /dev/bd-vg/6ff0f25f-2776-4d19-adfb-df1a3cab8287
- LV Name 6ff0f25f-2776-4d19-adfb-df1a3cab8287
- VG Name bd-vg
- LV UUID PjMPcc-RkD5-RADz-6ixG-UYsk-oclz-vL0nv6
- LV Write Access read/write
- LV Creation host, time node, 2013-11-26 16:15:45 +0530
- LV Status available
- open 0
- LV Size 4.00 MiB
- Current LE 1
- Segments 1
- Allocation inherit
- Read ahead sectors 0
- Block device 253:6
-
-The file gets created with default LV size which is 1 LE which is 4MB in this case.
-
- [root@node ~]# ls -lh /mnt/lv
- -rw-r–r–. 1 root root 4.0M Nov 26 16:15 /mnt/lv
-
-truncate can be used to set the required file size.
-
- [root@node ~]# truncate /mnt/lv -s 256M
- [root@node ~]# lvdisplay bd-vg
- — Logical volume —
- LV Path /dev/bd-vg/6ff0f25f-2776-4d19-adfb-df1a3cab8287
- LV Name 6ff0f25f-2776-4d19-adfb-df1a3cab8287
- VG Name bd-vg
- LV UUID PjMPcc-RkD5-RADz-6ixG-UYsk-oclz-vL0nv6
- LV Write Access read/write
- LV Creation host, time node, 2013-11-26 16:15:45 +0530
- LV Status available
- # open 0
- LV Size 256.00 MiB
- Current LE 64
- Segments 1
- Allocation inherit
- Read ahead sectors 0
- Block device 253:6
-
-
- [root@node ~]# ls -lh /mnt/lv
- -rw-r–r–. 1 root root 256M Nov 26 16:15 /mnt/lv
-
- currently LV size has been set to 256
-
-The size of the file/LV can be specified during creation/mapping time itself like this:
-
- setfattr -n “user.glusterfs.bd” -v “lv:256MB” /mnt/lv
-
-2. Creating BD volume with thin LV backend
-
-* Create a loop device
-
-
- [root@node ~]# dd if=/dev/zero of=bd-loop-thin count=1024 bs=1M
-
- [root@node ~]# losetup /dev/loop0 bd-loop-thin
-
-
-* Prepare a brick by creating a VG and thin pool
-
-
- [root@node ~]# pvcreate /dev/loop0
-
- [root@node ~]# vgcreate bd-vg-thin /dev/loop0
-
-
-* Create a thin pool
-
-
- [root@node ~]# lvcreate –thin bd-vg-thin -L 1000M
-
- Rounding up size to full physical extent 4.00 MiB
- Logical volume “lvol0″ created
-
-lvdisplay shows the thin pool
-
- [root@node ~]# lvdisplay bd-vg-thin
- — Logical volume —
- LV Name lvol0
- VG Name bd-vg-thin
- LV UUID HVa3EM-IVMS-QG2g-oqU6-1UxC-RgqS-g8zhVn
- LV Write Access read/write
- LV Creation host, time node, 2013-11-26 16:39:06 +0530
- LV Pool transaction ID 0
- LV Pool metadata lvol0_tmeta
- LV Pool data lvol0_tdata
- LV Pool chunk size 64.00 KiB
- LV Zero new blocks yes
- LV Status available
- # open 0
- LV Size 1000.00 MiB
- Allocated pool data 0.00%
- Allocated metadata 0.88%
- Current LE 250
- Segments 1
- Allocation inherit
- Read ahead sectors auto
- Block device 253:9
-
-* Create the BD volume
-
-* Create a POSIX directory first
-
-
- [root@node ~]# mkdir /bd-meta-thin
-
-* Create and mount the volume
-
- [root@node ~]# gluster volume create bd-thin node:/bd-meta-thin?bd-vg-thin force
-
- [root@node ~]# gluster volume start bd-thin
-
-
- [root@node ~]# gluster volume info bd-thin
- Volume Name: bd-thin
- Type: Distribute
- Volume ID: 27aa7eb0-4ffa-497e-b639-7cbda0128793
- Status: Started
- Xlator 1: BD
- Capability 1: thin
- Capability 2: offload_copy
- Capability 3: offload_snapshot
- Number of Bricks: 1
- Transport-type: tcp
- Bricks:
- Brick1: node:/bd-meta-thin
- Brick1 VG: bd-vg-thin
-
-
- [root@node ~]# mount -t glusterfs node:/bd-thin /mnt
-
-* Create a file that is backed by a thin LV
-
-
- [root@node ~]# ls /mnt
-
- [root@node ~]#
-
-Creating a file that is mapped to a thin LV is a 2 step operation. First the file should be created on the mount point and a specific extended attribute should be set to map the file to a thin LV.
-
- [root@node ~]# touch /mnt/thin-lv
-
- [root@node ~]# setfattr -n “user.glusterfs.bd” -v “thin:256MB” /mnt/thin-lv
-
-Now /mnt/thin-lv is a thin provisioned file that is backed by a thin LV and size has been set to 256.
-
- [root@node ~]# lvdisplay bd-vg-thin
- — Logical volume —
- LV Name lvol0
- VG Name bd-vg-thin
- LV UUID HVa3EM-IVMS-QG2g-oqU6-1UxC-RgqS-g8zhVn
- LV Write Access read/write
- LV Creation host, time node, 2013-11-26 16:39:06 +0530
- LV Pool transaction ID 1
- LV Pool metadata lvol0_tmeta
- LV Pool data lvol0_tdata
- LV Pool chunk size 64.00 KiB
- LV Zero new blocks yes
- LV Status available
- # open 0
- LV Size 000.00 MiB
- Allocated pool data 0.00%
- Allocated metadata 0.98%
- Current LE 250
- Segments 1
- Allocation inherit
- Read ahead sectors auto
- Block device 253:9
-
-
-
-
- — Logical volume —
- LV Path dev/bd-vg-thin/081b01d1-1436-4306-9baf-41c7bf5a2c73
- LV Name 081b01d1-1436-4306-9baf-41c7bf5a2c73
- VG Name bd-vg-thin
- LV UUID coxpTY-2UZl-9293-8H2X-eAZn-wSp6-csZIeB
- LV Write Access read/write
- LV Creation host, time node, 2013-11-26 16:43:19 +0530
- LV Pool name lvol0
- LV Status available
- # open 0
- LV Size 256.00 MiB
- Mapped size 0.00%
- Current LE 64
- Segments 1
- Allocation inherit
- Read ahead sectors auto
- Block device 253:10
-
-
-
-
-
-As can be seen from above, creation of a file resulted in creation of a thin LV in the brick.
-
-
-###Improvisation on BD translator:
-
-First version of BD xlator ( block backend) had few limitations such as
-
-* Creation of directories not supported
-* Supports only single brick
-* Does not use extended attributes (and client gfid) like posix xlator
-* Creation of special files (symbolic links, device nodes etc) not
- supported
-
-Basic limitation of not allowing directory creation was blocking
-oVirt/VDSM to consume BD xlator as part of Gluster domain since VDSM
-creates multi-level directories when GlusterFS is used as storage
-backend for storing VM images.
-
-To overcome these limitations a new BD xlator with following
-improvements are implemented.
-
-* New hybrid BD xlator that handles both regular files and block device
- files
-* The volume will have both POSIX and BD bricks. Regular files are
- created on POSIX bricks, block devices are created on the BD brick (VG)
-* BD xlator leverages exiting POSIX xlator for most POSIX calls and
- hence sits above the POSIX xlator
-* Block device file is differentiated from regular file by an extended
- attribute
-* The xattr 'user.glusterfs.bd' (BD_XATTR) plays a role in mapping a
- posix file to Logical Volume (LV).
-* When a client sends a request to set BD_XATTR on a posix file, a new
- LV is created and mapped to posix file. So every block device will
- have a representative file in POSIX brick with 'user.glusterfs.bd'
- (BD_XATTR) set.
-* Here after all operations on this file results in LV related
- operations.
-
-For example, opening a file that has BD_XATTR set results in opening
-the LV block device, reading results in reading the corresponding LV
-block device.
-
-When BD xlator gets request to set BD_XATTR via setxattr call, it
-creates a LV and information about this LV is placed in the xattr of the
-posix file. xattr "user.glusterfs.bd" used to identify that posix file
-is mapped to BD.
-
-Usage:
-Server side:
-
- [root@host1 ~]# gluster volume create bdvol host1:/storage/vg1_info?vg1 host2:/storage/vg2_info?vg2
-
-It creates a distributed gluster volume 'bdvol' with Volume Group vg1
-using posix brick /storage/vg1_info in host1 and Volume Group vg2 using
-/storage/vg2_info in host2.
-
-
- [root@host1 ~]# gluster volume start bdvol
-
-Client side:
-
- [root@node ~]# mount -t glusterfs host1:/bdvol /media
- [root@node ~]# touch /media/posix
-
-It creates regular posix file 'posix' in either host1:/vg1 or host2:/vg2 brick
-
- [root@node ~]# mkdir /media/image
-
- [root@node ~]# touch /media/image/lv1
-
-
-It also creates regular posix file 'lv1' in either host1:/vg1 or
-host2:/vg2 brick
-
- [root@node ~]# setfattr -n "user.glusterfs.bd" -v "lv" /media/image/lv1
-
- [root@node ~]#
-
-
-Above setxattr results in creating a new LV in corresponding brick's VG
-and it sets 'user.glusterfs.bd' with value 'lv:<default-extent-size''
-
-
- [root@node ~]# truncate -s5G /media/image/lv1
-
-
-It results in resizig LV 'lv1'to 5G
-
-New BD xlator code is placed in `xlators/storage/bd` directory.
-
-Also add volume-uuid to the VG so that same VG cannot be used for other
-bricks/volumes. After deleting a gluster volume, one has to manually
-remove the associated tag using vgchange <vg-name> --deltag
-`<trusted.glusterfs.volume-id:<volume-id>>`
-
-
-#### Exposing volume capabilities
-
-With multiple storage translators (posix and bd) being supported in GlusterFS, it becomes
-necessary to know the volume type so that user can issue appropriate calls that are relevant
-only to the a given volume type. Hence there needs to be a way to expose the type of
-the storage translator of the volume to the user.
-
-BD xlator is capable of providing server offloaded file copy, server/storage offloaded
-zeroing of a file etc. This capabilities should be visible to the client/user, so that these
-features can be exploited.
-
-BD xlator exports capability information through gluster volume info (and --xml) output. For eg:
-
-`snip of gluster volume info output for a BD based volume`
-
- Xlator 1: BD
- Capability 1: thin
-
-`snip of gluster volume info --xml output for a BD based volume`
-
- <xlators>
- <xlator>
- <name>BD</name>
- <capabilities>
- <capability>thin</capability>
- </capabilities>
- </xlator>
- </xlators>
-
-But this capability information should also exposed through some other means so that a host
-which is not part of Gluster peer could also avail this capabilities.
-
-* Type
-
-BD translator supports both regular files and block device, i,e., one can create files on
-GlusterFS volume backed by BD translator and this file could end up as regular posix file or
-a logical volume (block device) based on the user''s choice. User can do a setxattr on the
-created file to convert it to a logical volume.
-
-Users of BD backed volume like QEMU would like to know that it is working with BD type of volume
-so that it can issue an additional setxattr call after creating a VM image on GlusterFS backend.
-This is necessary to ensure that the created VM image is backed by LV instead of file.
-
-There are different ways to expose this information (BD type of volume) to user.
-One way is to export it via a `getxattr` call. That said, When a client issues getxattr("volume_type")
-on a root gfid, bd xlator will return 1 implying its BD xlator. But posix xlator will return ENODATA
-and client code can interpret this as posix xlator. Also capability list can be returned via
-getxattr("caps") for root gfid.
-
-* Capabilities
-
-BD xlator supports new features such as server offloaded file copy, thin provisioned VM images etc.
-
-There is no standard way of exploiting these features from client side (such as syscall
-to exploit server offloaded copy). So these features need to be exported to the client so that
-they can be used. BD xlator latest version exports these capabilities information through
-gluster volume info (and --xml) output. But if a client is not part of GlusterFS peer
-it can''t run volume info command to get the list of capabilities of a given GlusterFS volume.
-For example, GlusterFS block driver in qemu need to get the capability list so that these features are used.
-
-
-
-Parts of this documentation were originally published here
-#http://raobharata.wordpress.com/2013/11/27/glusterfs-block-device-translator/
diff --git a/doc/developer-guide/brickmux-thread-reduction.md b/doc/developer-guide/brickmux-thread-reduction.md
new file mode 100644
index 00000000000..7d76e8ff579
--- /dev/null
+++ b/doc/developer-guide/brickmux-thread-reduction.md
@@ -0,0 +1,64 @@
+# Resource usage reduction in brick multiplexing
+
+Each brick is represented by a graph of translators in a brick process.
+Each translator in the graph has its own set of threads, mem pools
+and other system resource allocations. Most of the time, not all of these
+resources are put to full use. Reducing the resource consumption
+of each brick is a problem in itself that needs to be addressed. The other
+aspect is the sharing of resources across brick graphs; this becomes
+critical in the brick multiplexing scenario. In this document we will discuss
+only the threads.
+
+If a brick mux process hosts 50 bricks, there are at least 600+ threads created
+in that process. Some of these are global threads that are shared by all the
+brick graphs, and others are per-translator threads. The global threads like
+synctask threads, timer threads, sigwaiter, poller etc. are configurable and
+do not need to be reduced. The number of per-translator threads keeps growing as the
+number of bricks in the process increases. Each brick spawns at least 10
+threads:
+- io-threads
+- posix threads:
+ 1. Janitor
+ 2. Fsyncer
+ 3. Helper
+ 4. aio-thread
+- changelog and bitrot threads (even when the features are not enabled)
+
+## io-threads
+
+io-threads should be made global to the process; having 16+ threads for
+each brick does not make sense. But the io-threads translator is loaded in
+the graph, and the position of the io-threads translator decides from which
+point on the fops will be parallelised across threads. We cannot entirely move
+the io-threads to libglusterfs and say the multiplexing happens from
+the master translator or so. Hence, the io-threads orchestrator code
+is moved to libglusterfs, which ensures there is only one set of
+io-threads that is shared among the io-threads translators in each brick.
+This poses performance issues due to lock contention in the io-threads
+layer. This shall also be addressed by having multiple locks instead of
+one global lock for io-threads.
+
+## Posix threads
+Most of the posix threads execute tasks in a timely manner, hence they can be
+replaced with a timer whose handler registers a task with the synctask
+framework; once the task is complete, the timer is registered again. With this
+we can eliminate the need for one thread per task. The problem with using
+synctasks is the performance impact they will have due to make/swapcontext.
+For tasks that do not involve a network wait, we need not do makecontext;
+instead, the task function with its arg can be stored and executed when a
+synctask thread is free. We need to implement an API in synctask to execute
+atomic tasks (no network wait) without the overhead of make/swapcontext. This
+will solve the performance impact associated with using the synctask framework.
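+
+As a rough illustration of this idea (a generic sketch with hypothetical
+names, not the actual synctask API), an atomic task can be stored as a
+function pointer plus argument and invoked directly by a free worker thread,
+with no make/swapcontext involved:
+
+```
+/* Hypothetical sketch: atomic tasks stored as {fn, arg} and executed
+ * directly on a worker thread's own stack (no context switching). */
+#include <pthread.h>
+#include <stdlib.h>
+
+struct atomic_task {
+    int (*fn)(void *);          /* must not block on network wait */
+    void *arg;
+    struct atomic_task *next;
+};
+
+static struct atomic_task *task_list;
+static pthread_mutex_t task_lock = PTHREAD_MUTEX_INITIALIZER;
+static pthread_cond_t task_cond = PTHREAD_COND_INITIALIZER;
+
+/* Submit an atomic task (stands in for the proposed synctask API). */
+int
+atomic_task_submit(int (*fn)(void *), void *arg)
+{
+    struct atomic_task *task = malloc(sizeof(*task));
+    if (!task)
+        return -1;
+    task->fn = fn;
+    task->arg = arg;
+    pthread_mutex_lock(&task_lock);
+    task->next = task_list;
+    task_list = task;
+    pthread_cond_signal(&task_cond);
+    pthread_mutex_unlock(&task_lock);
+    return 0;
+}
+
+/* Worker thread: pops tasks and calls them as plain functions. */
+void *
+atomic_task_worker(void *data)
+{
+    for (;;) {
+        pthread_mutex_lock(&task_lock);
+        while (!task_list)
+            pthread_cond_wait(&task_cond, &task_lock);
+        struct atomic_task *task = task_list;
+        task_list = task->next;
+        pthread_mutex_unlock(&task_lock);
+        task->fn(task->arg);    /* plain call, no make/swapcontext */
+        free(task);
+    }
+    return NULL;
+}
+```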
+
+The other challenge is to cancel all the tasks pending from a translator.
+This is important to cleanly detach a brick. For this, we need to implement an
+API in synctask that can cancel all the tasks from a given translator.
+
+In the future, this will be replaced to use the global thread-pool (once
+implemented).
+
+## Changelog and bitrot threads
+
+In the initial implementation, the threads are not created if the feature is
+not enabled. We need to share threads across changelog instances if we plan
+to enable these features in the brick mux scenario.
+
diff --git a/doc/developer-guide/coding-standard.md b/doc/developer-guide/coding-standard.md
index 446e3424d16..031c6c0da99 100644
--- a/doc/developer-guide/coding-standard.md
+++ b/doc/developer-guide/coding-standard.md
@@ -1,6 +1,30 @@
GlusterFS Coding Standards
==========================
+Before you get started
+----------------------
+Before starting with the other parts of the coding standard, install `clang-format`:
+
+On Fedora:
+```
+$ dnf install clang
+```
+On debian/Ubuntu:
+```
+$ apt-get install clang
+```
+Once you are done with all the local changes, you need to run the below set of
+commands before submitting the patch for review.
+```
+$ git add $file # if any
+$ git commit -a -s -m "commit message"
+$ git show --pretty="format:" --name-only | grep -v "contrib/" | egrep "\.[ch]$" | xargs clang-format -i
+$ git diff # see if there are any changes
+$ git commit -a --amend # get the format changes done
+$ ./submit-for-review.sh
+```
+
+
Structure definitions should have a comment per member
------------------------------------------------------
@@ -26,6 +50,54 @@ DBTYPE access_mode; /* access mode for accessing
*/
```
+Structure members should be aligned based on the padding requirements
+---------------------------------------------------------------------
+
+The compiler will make sure that structure members have optimum alignment,
+but at the expense of suboptimal padding. More important is to optimize the
+padding. The compiler won't do that for you.
+
+This will also help utilize the memory better.
+
+*Bad:*
+```
+struct bad {
+ bool b; /* 0 */
+ /* 1..7 pad */
+ void *p; /* 8..15 */
+ char c; /* 16 */
+ char a[16]; /* 17..32 */
+ /* 33..39 pad */
+ int64_t ii; /* 40..47 */
+ int32_t i; /* 48..51 */
+ /* 52..55 pad */
+ int64_t iii; /* 56..63 */
+};
+```
+
+*Good:*
+```
+struct good {
+ int64_t ii; /* explicit 64-bit types */
+ void *p; /* may be 64- or 32-bit */
+ long l; /* may be 64- or 32-bit */
+ int i; /* 32-bit */
+ short s; /* 16-bit */
+ char c; /* 8-bit */
+ bool b; /* 8-bit */
+ char a[1024];
+};
+```
+Make sure the members with the most stringent alignment requirements come
+earliest (i.e., pointers and perhaps uint64_t etc.), and those with less
+stringent alignment requirements at the end (uint16_t/uint8_t and char). Also
+note that a long array (if any) should be at the end of the structure,
+regardless of its element type.
+
+Also note, if your structure's overall size crosses the 1k-4k limit, it is
+recommended to mention, as a comment at the top, why the particular structure
+needs so much memory.
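+
+To see the effect of reordering, a standalone sketch like the following can be
+compiled and run (the sizes in the comments assume a typical LP64 system):
+
+```
+#include <stdbool.h>
+#include <stdint.h>
+#include <stdio.h>
+
+struct bad {            /* 64 bytes on a typical LP64 system */
+    bool b;
+    void *p;
+    char c;
+    char a[16];
+    int64_t ii;
+    int32_t i;
+    int64_t iii;
+};
+
+struct good {           /* 48 bytes: the same members, reordered */
+    int64_t ii;
+    int64_t iii;
+    void *p;
+    int32_t i;
+    char c;
+    bool b;
+    char a[16];         /* array last, regardless of element type */
+};
+
+int
+main(void)
+{
+    printf("bad: %zu, good: %zu\n", sizeof(struct bad), sizeof(struct good));
+    return 0;
+}
+```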
+
Use \_typename for struct tags and typename\_t for typedefs
---------------------------------------------------------
@@ -108,6 +180,28 @@ instead.
gf_boolean_t port_inuse[65536]; /* 256KB, this actually happened */
```
+NOTE: Ideally, limit stack arrays to less than 256 bytes.
+
+
+Character array initializing
+----------------------------
+
+It is recommended to initialize character arrays to the empty string.
+
+*Good:*
+```
+char msg[1024] = "";
+```
+
+Not recommended, even though it means the same:
+
+```
+char msg[1024] = {0,};
+```
+
+We recommend the empty-string form above over the structure-style initialization.
+
+
Validate all arguments to a function
------------------------------------
@@ -272,6 +366,37 @@ strcpy (entry_path, real_path);
strncpy (entry_path, real_path, entry_path_len);
```
+Do not use memset prior to sprintf/snprintf/vsnprintf etc...
+------------------------------------------------------------
+snprintf (and other similar string functions) terminates the buffer with a
+'\0' (null character). Hence, there is no need to do a memset before using
+snprintf. (Of course, you need to account for one extra byte for the null
+character in your allocation.)
+
+Note: Similarly, if you are pre-allocating memory for the buffer, use
+GF_MALLOC instead of GF_CALLOC, since the latter is a bit costlier.
+
+*Bad:*
+
+```
+char buffer[x];
+memset (buffer, 0, x);
+bytes_read = snprintf (buffer, sizeof buffer, "bad standard");
+```
+
+*Good:*
+```
+char buffer[x];
+bytes_read = snprintf (buffer, sizeof (buffer), "good standard");
+```
+
+And it is always good to initialize the char array if the string is static.
+
+E.g.
+```
+char buffer[] = "good standard";
+```
+
No dead or commented code
-------------------------
@@ -495,8 +620,8 @@ If a value isn't supposed/expected to change, there's no cost to adding a
### Avoid global variables (including 'static' auto variables)
Almost all state in Gluster is contextual and should be contained in the
-appropriate structure reflecting its scope (e.g. call\_frame\_t, call\_stack\_t,
-xlator\_t, glusterfs\_ctx\_t). With dynamic loading and graph switches in play,
+appropriate structure reflecting its scope (e.g. `call_frame_t`, `call_stack_t`,
+`xlator_t`, `glusterfs_ctx_t`). With dynamic loading and graph switches in play,
each global requires careful consideration of when it should be initialized or
reinitialized, when it might _accidentally_ be reinitialized, when its value
might become stale, and so on. A few global variables are needed to serve as
diff --git a/doc/developer-guide/commit-guidelines.md b/doc/developer-guide/commit-guidelines.md
new file mode 100644
index 00000000000..38bbe525cbd
--- /dev/null
+++ b/doc/developer-guide/commit-guidelines.md
@@ -0,0 +1,136 @@
+## Git Commit Good Practice
+
+The following document is based on experience doing code development, bug troubleshooting and code review across a number of projects using Git. The document is mostly borrowed from [OpenStack](https://wiki.openstack.org/wiki/GitCommitMessages), but made more meaningful in the context of the GlusterFS project.
+
+This topic can be split into two areas of concern
+
+* The structured set/split of the code changes
+* The information provided in the commit message
+
+### Executive Summary
+The points and examples that will be raised in this document ought to clearly demonstrate the value in splitting up changes into a sequence of individual commits, and the importance in writing good commit messages to go along with them. If these guidelines were widely applied it would result in a significant improvement in the quality of the GlusterFS Git history. Both a carrot & stick will be required to effect changes. This document intends to be the carrot by alerting people to the benefits, while anyone doing Gerrit code review can act as the stick ;-P
+
+In other words, when reviewing a change in Gerrit:
+* Do not simply look at the correctness of the code.
+* Review the commit message itself and request improvements to its content.
+* Look out for commits which are mixing multiple logical changes and require the submitter to split them into separate commits.
+* Ensure whitespace changes are not mixed in with functional changes.
+* Ensure no-op code refactoring is done separately from functional changes.
+
+And so on.
+
+It might be mentioned that Gerrit's handling of patch series is not entirely perfect. Let that not become a valid reason to avoid creating patch series. The tools being used should be subservient to developers' needs, and since they are open source they can be fixed / improved. Software source code is "read mostly, write occasionally" and thus the most important criterion is to improve the long term maintainability by the large pool of developers in the community, and not to sacrifice too much for the sake of the single author who may never touch the code again.
+
+And now, the long detailed guidelines & examples of good & bad practice:
+
+### Structural split of changes
+The cardinal rule for creating good commits is to ensure there is only one "logical change" per commit. There are many reasons why this is an important rule:
+
+* The smaller the amount of code being changed, the quicker & easier it is to review & identify potential flaws.
+* If a change is found to be flawed later, it may be necessary to revert the broken commit. This is much easier to do if there are not other unrelated code changes entangled with the original commit.
+* When troubleshooting problems using Git's bisect capability, small well defined changes will aid in isolating exactly where the code problem was introduced.
+* When browsing history using Git annotate/blame, small well defined changes also aid in isolating exactly where & why a piece of code came from.
+
+#### Things to avoid when creating commits
+With the above points in mind, there are some commonly encountered examples of bad things to avoid
+
+* Mixing whitespace changes with functional code changes.
+
+The whitespace changes will obscure the important functional changes, making it harder for a reviewer to correctly determine whether the change is correct. Solution: Create 2 commits, one with the whitespace changes, one with the functional changes. Typically the whitespace change would be done first, but that need not be a hard rule.
+
+* Mixing two unrelated functional changes.
+
+Again the reviewer will find it harder to identify flaws if two unrelated changes are mixed together. If it becomes necessary to later revert a broken commit, the two unrelated changes will need to be untangled, with further risk of bug creation.
+
+* Sending large new features in a single giant commit.
+
+It may well be the case that the code for a new feature is only useful when all of it is present. This does not, however, imply that the entire feature should be provided in a single commit. New features often entail refactoring existing code. It is highly desirable that any refactoring is done in commits which are separate from those implementing the new feature. This helps reviewers and test suites validate that the refactoring has no unintentional functional changes.
+
+Even the newly written code can often be split up into multiple pieces that can be independently reviewed. For example, changes which add new internal fops or library functions, can be in self-contained commits. Again this leads to easier code review. It also allows other developers to cherry-pick small parts of the work, if the entire new feature is not immediately ready for merge. This will encourage the author & reviewers to think about the generic library functions' design, and not simply pick a design that is easier for their currently chosen internal implementation.
+
+The basic rule to follow is
+
+If a code change can be split into a sequence of patches/commits, then it should be split. Less is not more. More is more.
+
+##### Examples of bad practice
+
+TODO: Pick glusterfs specific example.
+
+
+##### Examples of good practice
+
+
+### Information in commit messages
+As important as the content of the change, is the content of the commit message describing it. When writing a commit message there are some important things to remember
+
+* Do not assume the reviewer understands what the original problem was.
+
+When reading bug reports, after a number of back & forth comments, it is often as clear as mud what the root cause of the problem is. The commit message should have a clear statement as to what the original problem is. The bug is merely interesting historical background on /how/ the problem was identified. It should be possible to review a proposed patch for correctness without needing to read the bug ticket.
+
+* Do not assume the reviewer has access to external web services/site.
+
+In 6 months time when someone is on a train/plane/coach/beach/pub troubleshooting a problem & browsing Git history, there is no guarantee they will have access to the online bug tracker, or online blueprint documents. The great step forward with distributed SCM is that you no longer need to be "online" to have access to all information about the code repository. The commit message should be totally self-contained, to maintain that benefit.
+
+* Do not assume the code is self-evident/self-documenting.
+
+What is self-evident to one person, might be clear as mud to another person. Always document what the original problem was and how it is being fixed, for any change except the most obvious typos, or whitespace only commits.
+
+* Describe why a change is being made.
+
+A common mistake is to just document how the code has been written, without describing /why/ the developer chose to do it that way. By all means describe the overall code structure, particularly for large changes, but more importantly describe the intent/motivation behind the changes.
+
+* Read the commit message to see if it hints at improved code structure.
+
+Often when describing a large commit message, it becomes obvious that a commit should have in fact been split into 2 or more parts. Don't be afraid to go back and rebase the change to split it up into separate commits.
+
+* Ensure sufficient information to decide whether to review.
+
+When Gerrit sends out email alerts for new patch submissions there is minimal information included, principally the commit message and the list of files changed. Given the high volume of patches, it is not reasonable to expect all reviewers to examine the patches in detail. The commit message must thus contain sufficient information to alert the potential reviewers to the fact that this is a patch they need to look at.
+
+* The first commit line is the most important.
+
+In Git commits the first line of the commit message has special significance. It is used as the email subject line, in git annotate messages, gitk viewer annotations, merge commit messages and many more places where space is at a premium. As well as summarizing the change itself, it should take care to detail what part of the code is affected, e.g. whether it is 'afr', 'dht' or any other translator. Or in some cases, a change can touch all these components, and then the prefix can be 'coverity:', 'txn-framework:', 'new-fop:', etc.
+
+* Describe any limitations of the current code.
+
+If the code being changed still has future scope for improvements, or any known limitations, then mention these in the commit message. This demonstrates to the reviewer that the broader picture has been considered and what tradeoffs have been made in terms of short-term goals vs. long-term wishes.
+
+* Do not include patch set-specific comments.
+
+In other words, if you rebase your change please don't add "Patch set 2: rebased" to your commit message. That isn't going to be relevant once your change has merged. Please do make a note of that in Gerrit as a comment on your change, however. It helps reviewers know what changed between patch sets. This also applies to comments such as "Added unit tests", "Fixed localization problems", or any other such patch set to patch set changes that don't affect the overall intent of your commit.
+
+**The main rule to follow is:**
+
+The commit message must contain all the information required to fully understand & review the patch for correctness. Less is not more. More is more.
+
+
+#### Including external references
+
+The commit message is primarily targeted towards human interpretation, but there is always some metadata provided for machine use. In the case of GlusterFS this includes at least the 'Change-id', "bug"/"feature" ID references and "Signed-off-by" tag (generated by 'git commit -s').
+
+The 'Change-id' line is a unique hash describing the change, which is generated by a Git commit hook. This should not be changed when rebasing a commit following review feedback, since it is used by Gerrit, to track versions of a patch.
+
+The 'bug' line can reference a bug in a few ways. Gerrit creates a link to the bug when viewing the patch on review.gluster.org so that reviewers can quickly access the bug/issue on Bugzilla or Github.
+
+**Fixes: bz#1601166** -- use 'Fixes: bz#NNNNN' if the commit is intended to fully fix and close the bug being referenced.
+**Fixes: #411** -- use 'Fixes: #NNN' if the patch fixes the github issue completely.
+
+**Updates: bz#1193929** -- use 'Updates: bz#NNNN' if the commit is only a partial fix and more work is needed.
+**Updates: #175** -- use 'Updates: #NNNN' if the commit is only a partial fix and more work is needed for the feature completion.
+
+We encourage the use of `Co-Authored-By: name <name@example.com>` in commit messages to indicate people who worked on a particular patch. It's a convention for recognizing multiple authors, and our projects would encourage the stats tools to observe it when collecting statistics.
+
+### Summary of Git commit message structure
+
+* Provide a brief description of the change in the first line.
+* The first line should be limited to 50 characters and should not end with a period.
+
+* Insert a single blank line after the first line.
+
+* Provide a detailed description of the change in the following lines, breaking paragraphs where needed.
+
+* Subsequent lines should be wrapped at 72 characters.
+
+* Put the 'Change-id', 'Fixes: bz#NNNNN' and 'Signed-off-by: <>' lines at the very end.
+
+TODO: Add good examples
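+
+Meanwhile, for illustration, a hypothetical commit message following the above
+structure might look like this (the component, bug number and Change-Id below
+are placeholders, not real references):
+
+```
+dht: fix stale layout reference during directory rename
+
+When a directory is renamed while a rebalance is in progress, the old
+layout could still be referenced from the inode ctx, leading to lookups
+being wound to the wrong subvolume.
+
+Refresh the layout from the inode ctx after the rename completes, so
+that subsequent lookups use the updated layout.
+
+Fixes: bz#1000000
+Change-Id: I0123456789abcdef0123456789abcdef01234567
+Signed-off-by: Jane Developer <jane@example.com>
+```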
diff --git a/doc/developer-guide/datastructure-inode.md b/doc/developer-guide/datastructure-inode.md
index a340ab9ca8e..45d7a941e5f 100644
--- a/doc/developer-guide/datastructure-inode.md
+++ b/doc/developer-guide/datastructure-inode.md
@@ -1,6 +1,6 @@
-#Inode and dentry management in GlusterFS:
+# Inode and dentry management in GlusterFS:
-##Background
+## Background
Filesystems internally refer to files and directories via inodes. Inodes
are unique identifiers of the entities stored in a filesystem. Whenever an
application has to operate on a file/directory (read/modify), the filesystem
@@ -41,11 +41,10 @@ struct _inode_table {
};
```
-#Life-cycle
+# Life-cycle
```
-
inode_table_new (size_t lru_limit, xlator_t *xl)
-
+```
This is a function which allocates a new inode table. Usually the top xlators in
the graph such as protocol/server (for bricks), fuse and nfs (for fuse and nfs
mounts) and libgfapi do inode managements. Hence they are the ones which will
@@ -59,11 +58,8 @@ new inode table.
Thus an allocated inode table is destroyed only when the filesystem daemon is
killed or unmounted.
-```
-
-#what it contains.
-```
+# What it contains
Inode table in glusterfs mainly contains a hash table for maintaining inodes.
In general a file/directory is considered to be existing if there is a
corresponding inode present in the inode table. If a inode for a file/directory
@@ -76,21 +72,21 @@ size of the hash table (as of now it is hard coded to 14057. The hash value of
a inode is calculated using its gfid).
Apart from the hash table, inode table also maintains 3 important list of inodes
-1) Active list:
+1. Active list:
Active list contains all the active inodes (i.e inodes which are currently part
of some fop).
-2) Lru list:
+2. Lru list:
Least recently used inodes list. A limit can be set for the size of the lru
list. For bricks it is 16384 and for clients it is infinity.
-3) Purge list:
+3. Purge list:
List of all the inodes which have to be purged (i.e inodes which have to be
deleted from the inode table due to unlink/rmdir/forget).
And at last it also contains the mem-pool for allocating inodes, dentries so
that frequent malloc/calloc and free of the data structures can be avoided.
-```
-#Data structure (inode)
+
+# Data structure (inode)
```
struct _inode {
inode_table_t *table; /* the table this inode belongs to */
@@ -108,7 +104,7 @@ struct _inode {
struct _inode_ctx *_ctx; /* place holder for keeping the
information about the inode by different xlators */
};
-
+```
As said above, inodes are internal way of identifying the files/directories. A
inode uniquely represents a file/directory. A new inode is created whenever a
create/mkdir/symlink/mknod operations are performed. Apart from that a new inode
@@ -128,9 +124,9 @@ inodes are those inodes whose refcount is greater than zero. Whenever some
operation comes on a file/directory, and the resolver tries to find the inode
for it, it increments the refcount of the inode before returning the inode. The
refcount of an inode can be incremented by calling the below function
-
+```
inode_ref (inode_t *inode)
-
+```
Any xlator which wants to operate on a inode as part of some fop (or wants the
inode in the callback), should hold a ref on the inode.
Once the fop is completed before sending the reply of the fop to the above
@@ -139,18 +135,18 @@ zero, it is removed from the active inodes list and put into LRU list maintained
by the inode table. Thus in short if some fop is happening on a file/directory,
the corresponding inode will be in the active list or it will be in the LRU
list.
-```
-#Life Cycle
+
+# Life Cycle
A new inode is created whenever a new file/directory/symlink is created OR a
successful lookup of an existing entry is done. The xlators which does inode
management (as of now protocol/server, fuse, nfs, gfapi) will perform inode_link
operation upon successful lookup or successful creation of a new entry.
-
+```
inode_link (inode_t *inode, inode_t *parent, const char *name,
struct iatt *buf);
-
+```
inode_link actually adds the inode to the inode table (to be precise it adds
the inode to the hash table maintained by the inode table. The hash value is
calculated based on the gfid). Copies the gfid to the inode (the gfid is
@@ -160,7 +156,7 @@ A inode is removed from the inode table and eventually destroyed when unlink
or rmdir operation is performed on a file/directory, or the the lru limit of
the inode table has been exceeded.
-#Data structure (dentry)
+# Data structure (dentry)
```
struct _dentry {
@@ -170,22 +166,22 @@ struct _dentry {
char *name; /* name of the directory entry */
inode_t *parent; /* directory of the entry */
};
-
+```
A dentry is the presence of an entry for a file/directory within its parent
directory. A dentry usually points to the inode to which it belongs to. In
glusterfs a dentry contains the following fields.
-1) a hook using which it can add itself to the list of
+1. A hook using which it can add itself to the list of
the dentries maintained by the inode to which it points to.
-2) A hash table pointer.
-3) Pointer to the inode to which it belongs to.
-4) Name of the dentry
-5) Pointer to the inode of the parent directory in which the dentry is present
+2. A hash table pointer.
+3. Pointer to the inode to which it belongs.
+4. Name of the dentry
+5. Pointer to the inode of the parent directory in which the dentry is present
A new dentry is created when a new file/directory/symlink is created or a hard
link to an existing file is created.
-
+```
__dentry_create (inode_t *inode, inode_t *parent, const char *name);
-
+```
A dentry holds a refcount on the parent
directory so that the parent inode is never removed from the active inode's list
and put to the lru list (If the lru limit of the lru list is exceeded, there is
@@ -212,15 +208,14 @@ deleted due to file removal or lru limit being exceeded the inode is retired
purge list maintained by the inode table), the nlookup count is set to 0 via
inode_forget api. The inode table, then prunes all the inodes from the purge
list by destroying the inode contexts maintained by each xlator.
-
+```
unlinking of the dentry is done via inode_unlink;
void
inode_unlink (inode_t *inode, inode_t *parent, const char *name);
-
+```
If the inode has multiple hard links, then the unlink operation performed by
the application results just in the removal of the dentry with the name provided
by the application. For the inode to be removed, all the dentries of the inode
should be unlinked.
-```
diff --git a/doc/developer-guide/datastructure-iobuf.md b/doc/developer-guide/datastructure-iobuf.md
index 5f521f1485f..03604e3672c 100644
--- a/doc/developer-guide/datastructure-iobuf.md
+++ b/doc/developer-guide/datastructure-iobuf.md
@@ -1,6 +1,6 @@
-#Iobuf-pool
-##Datastructures
-###iobuf
+# Iobuf-pool
+## Datastructures
+### iobuf
Short for IO Buffer. It is one allocatable unit for the consumers of the IOBUF
API, each unit hosts @page_size(defined in arena structure) bytes of memory. As
initial step of processing a fop, the IO buffer passed onto GlusterFS by the
@@ -28,7 +28,7 @@ struct iobuf {
};
```
-###iobref
+### iobref
There may be need of multiple iobufs for a single fop, like in vectored read/write.
Hence multiple iobufs(default 16) are encapsulated under one iobref.
```
@@ -40,7 +40,7 @@ struct iobref {
int used; /* number of iobufs added to this iobref */
};
```
-###iobuf_arenas
+### iobuf_arenas
One region of memory MMAPed from the operating system. Each region MMAPs
@arena_size bytes of memory, and hosts @arena_size / @page_size IOBUFs.
The same sized iobufs are grouped into one arena, for sanity of access.
@@ -77,7 +77,7 @@ struct iobuf_arena {
};
```
-###iobuf_pool
+### iobuf_pool
Pool of Iobufs. As there may be many Io buffers required by the filesystem,
a pool of iobufs are preallocated and kept, if these preallocated ones are
exhausted only then the standard malloc/free is called, thus improving the
@@ -139,8 +139,8 @@ arenas in the purge list are destroyed only if there is atleast one arena in
(e.g: If there is an arena (page_size=128KB, count=32) in purge list, this arena
is destroyed(munmap) only if there is an arena in 'arenas' list with page_size=128KB).
-##APIs
-###iobuf_get
+## APIs
+### iobuf_get
```
struct iobuf *iobuf_get (struct iobuf_pool *iobuf_pool);
@@ -149,7 +149,7 @@ Creates a new iobuf of the default page size(128KB hard coded as of yet).
Also takes a reference(increments ref count), hence no need of doing it
explicitly after getting iobuf.
-###iobuf_get2
+### iobuf_get2
```
struct iobuf * iobuf_get2 (struct iobuf_pool *iobuf_pool, size_t page_size);
@@ -179,7 +179,7 @@ if (requested iobuf size > Max iobuf size in the pool(1MB as of yet))
Also takes a reference(increments ref count), hence no need of doing it
explicitly after getting iobuf.
-###iobuf_ref
+### iobuf_ref
```
struct iobuf *iobuf_ref (struct iobuf *iobuf);
@@ -188,7 +188,7 @@ struct iobuf *iobuf_ref (struct iobuf *iobuf);
xlator/function/, its a good practice to take a reference so that iobuf is not
deleted by the allocator.
-###iobuf_unref
+### iobuf_unref
```
void iobuf_unref (struct iobuf *iobuf);
```
@@ -203,33 +203,33 @@ Unreference the iobuf, if the ref count is zero iobuf is considered free.
Every iobuf_ref should have a corresponding iobuf_unref, and also every
iobuf_get/2 should have a correspondning iobuf_unref.
-###iobref_new
+### iobref_new
```
struct iobref *iobref_new ();
```
Creates a new iobref structure and returns its pointer.
-###iobref_ref
+### iobref_ref
```
struct iobref *iobref_ref (struct iobref *iobref);
```
Take a reference on the iobref.
-###iobref_unref
+### iobref_unref
```
void iobref_unref (struct iobref *iobref);
```
Decrements the reference count of the iobref. If the ref count is 0, then unref
all the iobufs(iobuf_unref) in the iobref, and destroy the iobref.
-###iobref_add
+### iobref_add
```
int iobref_add (struct iobref *iobref, struct iobuf *iobuf);
```
Adds the given iobuf into the iobref, it takes a ref on the iobuf before adding
it, hence explicit iobuf_ref is not required if adding to the iobref.
-###iobref_merge
+### iobref_merge
```
int iobref_merge (struct iobref *to, struct iobref *from);
```
@@ -239,13 +239,13 @@ on all the iobufs added to the 'to' iobref. Hence iobref_unref should be
performed both on 'from' and 'to' iobrefs (performing iobref_unref only on 'to'
will not free the iobufs and may result in leak).
-###iobref_clear
+### iobref_clear
```
void iobref_clear (struct iobref *iobref);
```
Unreference all the iobufs in the iobref, and also unref the iobref.
-##Iobuf Leaks
+## Iobuf Leaks
If all iobuf_refs/iobuf_new do not have correspondning iobuf_unref, then the
iobufs are not freed and recurring execution of such code path may lead to huge
memory leaks. The easiest way to identify if a memory leak is caused by iobufs
diff --git a/doc/developer-guide/datastructure-mem-pool.md b/doc/developer-guide/datastructure-mem-pool.md
index c71aa2a8ddd..225567cbf9f 100644
--- a/doc/developer-guide/datastructure-mem-pool.md
+++ b/doc/developer-guide/datastructure-mem-pool.md
@@ -1,5 +1,5 @@
-#Mem-pool
-##Background
+# Mem-pool
+## Background
There was a time when every fop in glusterfs used to incur cost of allocations/de-allocations for every stack wind/unwind between xlators because stack/frame/*_localt_t in every wind/unwind was allocated and de-allocated. Because of all these system calls in the fop path there was lot of latency and the worst part is that most of the times the number of frames/stacks active at any time wouldn't cross a threshold. So it was decided that this threshold number of frames/stacks would be allocated in the beginning of the process only once. Get one of them from the pool of stacks/frames whenever `STACK_WIND` is performed and put it back into the pool in `STACK_UNWIND`/`STACK_DESTROY` without incurring any extra system calls. The data structures are allocated only when threshold number of such items are in active use i.e. pool is in complete use.% increase in the performance once this was added to all the common data structures (inode/fd/dict etc) in xlators throughout the stack was tremendous.
## Data structure
@@ -27,7 +27,7 @@ will be served from here until all the elements in the pool are in use i.e. cold
};
```
-##Life-cycle
+## Life-cycle
```
mem_pool_new (data_type, unsigned long count)
@@ -120,5 +120,5 @@ mem_pool_destroy (struct mem_pool *pool)
Deletes this pool from the `global_list` maintained by `glusterfs-ctx` and frees all the memory allocated in `mem_pool_new`.
-###How to pick pool-size
+### How to pick pool-size
This varies from work-load to work-load. Create the mem-pool with some random size and run the work-load. Take the statedump after the work-load is complete. In the statedump if `max_alloc` is always less than `cold_count` may be reduce the size of the pool closer to `max_alloc`. On the otherhand if there are lots of `pool-misses` then increase the `pool_size` by `max_stdalloc` to achieve better 'hit-rate' of the pool.
diff --git a/doc/developer-guide/dirops-transactions-in-dht.md b/doc/developer-guide/dirops-transactions-in-dht.md
index 83f63be3f45..909a97001aa 100644
--- a/doc/developer-guide/dirops-transactions-in-dht.md
+++ b/doc/developer-guide/dirops-transactions-in-dht.md
@@ -1,3 +1,4 @@
+# Dirops transactions in DHT
Need for transactions during operations on directories arise from two
basic design elements of DHT:
@@ -269,4 +270,4 @@ And the examples are:
* Both _dst_ and _dst/dst_ have same gfid - _gfid-src_. As observed earlier, symptom might be directory listing being incomplete
- mkdir (dst) vs renamedir ("src", "dst")
- rmdir (src) vs renamedir ("src", "dst")
- - rmdir (dst) vs renamedir ("src", "dst") \ No newline at end of file
+ - rmdir (dst) vs renamedir ("src", "dst")
diff --git a/doc/developer-guide/fuse-interrupt.md b/doc/developer-guide/fuse-interrupt.md
new file mode 100644
index 00000000000..ec991b81ec5
--- /dev/null
+++ b/doc/developer-guide/fuse-interrupt.md
@@ -0,0 +1,211 @@
+# Fuse interrupt handling
+
+## Conventions followed
+
+- *FUSE* refers to the "wire protocol" between kernel and userspace and
+ related specifications.
+- *fuse* refers to the kernel subsystem and also to the GlusterFS translator.
+
+## FUSE interrupt handling spec
+
+The [Linux kernel FUSE documentation](https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/tree/Documentation/filesystems/fuse.txt?h=v4.18#n148)
+describes how interrupt handling happens in fuse.
+
+## Interrupt handling in the fuse translator
+
+### Declarations
+
+This document describes the internal API in the fuse translator with which
+interrupts can be handled.
+
+The API is internal (to be used only in fuse-bridge.c; the functions are
+not exported to a header file).
+
+```
+enum fuse_interrupt_state {
+ /* ... */
+ INTERRUPT_SQUELCHED,
+ INTERRUPT_HANDLED,
+ /* ... */
+};
+typedef enum fuse_interrupt_state fuse_interrupt_state_t;
+struct fuse_interrupt_record;
+typedef struct fuse_interrupt_record fuse_interrupt_record_t;
+typedef void (*fuse_interrupt_handler_t)(xlator_t *this,
+ fuse_interrupt_record_t *);
+struct fuse_interrupt_record {
+ fuse_in_header_t fuse_in_header;
+ void *data;
+ /*
+ ...
+ */
+};
+
+fuse_interrupt_record_t *
+fuse_interrupt_record_new(fuse_in_header_t *finh,
+ fuse_interrupt_handler_t handler);
+
+void
+fuse_interrupt_record_insert(xlator_t *this, fuse_interrupt_record_t *fir);
+
+gf_boolean_t
+fuse_interrupt_finish_fop(call_frame_t *frame, xlator_t *this,
+ gf_boolean_t sync, void **datap);
+
+void
+fuse_interrupt_finish_interrupt(xlator_t *this, fuse_interrupt_record_t *fir,
+ fuse_interrupt_state_t intstat,
+ gf_boolean_t sync, void **datap);
+```
+
+The code demonstrates the usage of the API through `fuse_flush()`. (It's a
+dummy implementation only for demonstration purposes.) Flush is chosen
+because a `FLUSH` interrupt is easy to trigger (see
+*tests/features/interrupt.t*). Interrupt handling for flush is switched on
+by `--fuse-flush-handle-interrupt` (a hidden glusterfs command line flag).
+The implementation of flush interrupt is contained in the
+`fuse_flush_interrupt_handler()` function and blocks guarded by the
+
+```
+if (priv->flush_handle_interrupt) { ...
+```
+
+conditional (where `priv` is a `*fuse_private_t`).
+
+### Overview
+
+"Regular" fuse fops and interrupt handlers interact via a list containing
+interrupt records.
+
+If a fop wishes to have its interrupts handled, it needs to set up an
+interrupt record and insert it into the list; also when it's to finish
+(ie. in its "cbk" stage) it needs to delete the record from the list.
+
+If no interrupt happens, that's basically all there is to it - a list insertion
+and a deletion.
+
+However, if an interrupt comes for the fop, the interrupt FUSE request
+will carry the data identifying an ongoing fop (that is, its `unique`),
+and based on that, the interrupt record will be looked up in the list, and
+the specific interrupt handler (a member of the interrupt record) will be
+called.
+
+Usually the fop needs to share some data with the interrupt handler to
+enable it to perform its task (also shared via the interrupt record).
+The interrupt API offers two approaches to manage shared data:
+- _Async or reference-counting strategy_: from the point on when the interrupt
+ record is inserted to the list, it's owned jointly by the regular fop and
+ the prospective interrupt handler. Both of them need to check before they
+ return if the other is still holding a reference; if not, then they are
+ responsible for reclaiming the shared data.
+- _Sync or borrow strategy_: the interrupt handler is considered a borrower
+ of the shared data. The interrupt handler should not reclaim the shared
+ data. The fop will wait for the interrupt handler to finish (ie., the borrow
+ to be returned), then it has to reclaim the shared data.
+
+The user of the interrupt API needs to call the following functions to
+instrument this control flow:
+- `fuse_interrupt_record_insert()` in the fop to insert the interrupt record to
+  the list;
+- `fuse_interrupt_finish_fop()` in the fop (cbk) and
+- `fuse_interrupt_finish_interrupt()` in the interrupt handler
+
+to perform the needed synchronization at the end of their tenure. The data management
+strategies are implemented by the `fuse_interrupt_finish_*()` functions (which
+have an argument to specify which strategy to use); these routines take care
+of freeing the interrupt record itself, while the reclamation of the shared data
+is left to the API user.
+
+### Usage
+
+A given FUSE fop can be enabled to handle interrupts via the following
+steps (a schematic sketch follows the list):
+
+- Define a handler function (of type `fuse_interrupt_handler_t`).
+ It should implement the interrupt handling logic and in the end
+ call (directly or as async callback) `fuse_interrupt_finish_interrupt()`.
+ The `intstat` argument to `fuse_interrupt_finish_interrupt` should be
+ either `INTERRUPT_SQUELCHED` or `INTERRUPT_HANDLED`.
+ - `INTERRUPT_SQUELCHED` means that the interrupt could not be delivered
+ and the fop is going on uninterrupted.
+ - `INTERRUPT_HANDLED` means that the interrupt was actually handled. In
+ this case the fop will be answered from interrupt context with errno
+ `EINTR` (that is, the fop should not send a response to the kernel).
+
+ (the enum `fuse_interrupt_state` includes further members, which are reserved
+ for internal use).
+
+ We return to the `sync` and `datap` arguments later.
+- In the `fuse_<FOP>` function create an interrupt record using
+ `fuse_interrupt_record_new()`, passing the incoming `fuse_in_header` and
+ the above handler function to it.
+ - Arbitrary further data can be referred to via the `data` member of the
+ interrupt record that is to be passed on from fop context to
+ interrupt context.
+- When it's set up, pass the interrupt record to
+ `fuse_interrupt_record_insert()`.
+- In `fuse_<FOP>_cbk` call `fuse_interrupt_finish_fop()`.
+ - `fuse_interrupt_finish_fop()` returns a Boolean according to whether the
+ interrupt was handled. If it was, then the FUSE request is already
+ answered and the stack gets destroyed in `fuse_interrupt_finish_fop` so
+ `fuse_<FOP>_cbk()` can just return (zero). Otherwise follow the standard
+ cbk logic (answer the FUSE request and destroy the stack -- these are
+ typically accomplished by `fuse_err_cbk()`).
+- The last two arguments of `fuse_interrupt_finish_fop()` and
+ `fuse_interrupt_finish_interrupt()` are `gf_boolean_t sync` and
+ `void **datap`.
+ - `sync` represents the strategy for freeing the interrupt record. The
+ interrupt handler and the fop handler are in race to get at the interrupt
+ record first (interrupt handler for purposes of doing the interrupt
+ handling, fop handler for purposes of deactivating the interrupt record
+ upon completion of the fop handling).
+ - If `sync` is true, then the fop handler will wait for the interrupt
+ handler to finish and it takes care of freeing.
+ - If `sync` is false, the loser of the above race will perform freeing.
+
+ Freeing is done within the respective interrupt finish routines, except
+ for the `data` field of the interrupt record; with respect to that, see
+ the discussion of the `datap` parameter below. The strategy has to be
+ consensual, that is, `fuse_interrupt_finish_fop()` and
+ `fuse_interrupt_finish_interrupt()` must pass the same value for `sync`.
+ If dismantling the resources associated with the interrupt record is
+ simple, `sync = _gf_false` is the suggested choice; `sync = _gf_true` can
+ be useful in the opposite case, when dismantling those resources would
+ be inconvenient to implement in two places or to enact in non-fop context.
+ - If `datap` is `NULL`, the `data` member of the interrupt record will be
+ freed within the interrupt finish routine. If it points to a valid
+ `void *` pointer, and if caller is doing the cleanup (see `sync` above),
+ then that pointer will be directed to the `data` member of the interrupt
+ record and it's up to the caller what it's doing with it.
+ - If `sync` is true, interrupt handler can use `datap = NULL`, and
+ fop handler will have `datap` point to a valid pointer.
+ - If `sync` is false, and handlers pass a pointer to a pointer for
+ `datap`, they should check if the pointed pointer is NULL before
+ attempting to deal with the data.
+
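+Putting the steps together, a schematic (non-compilable, purely illustrative)
+fop could look like the following; the `fuse_myfop*` names, the simplified
+signatures and the shared `data` payload are hypothetical:
+
+```
+/* Illustrative sketch only; fuse_myfop* names are hypothetical. */
+static void
+fuse_myfop_interrupt_handler(xlator_t *this, fuse_interrupt_record_t *fir)
+{
+    /* ... deliver the interrupt using fir->data ... */
+
+    /* With INTERRUPT_HANDLED, interrupt context answers the kernel
+     * with errno EINTR. */
+    fuse_interrupt_finish_interrupt(this, fir, INTERRUPT_HANDLED,
+                                    _gf_false, NULL);
+}
+
+static void
+fuse_myfop(xlator_t *this, fuse_in_header_t *finh, void *msg)
+{
+    fuse_interrupt_record_t *fir =
+        fuse_interrupt_record_new(finh, fuse_myfop_interrupt_handler);
+    fir->data = NULL;   /* ... state shared with the handler ... */
+    fuse_interrupt_record_insert(this, fir);
+
+    /* ... wind the fop ... */
+}
+
+static int
+fuse_myfop_cbk(call_frame_t *frame, void *cookie, xlator_t *this,
+               int32_t op_ret, int32_t op_errno, dict_t *xdata)
+{
+    if (fuse_interrupt_finish_fop(frame, this, _gf_false, NULL)) {
+        /* Interrupt context already answered the kernel with EINTR;
+         * the stack is destroyed, nothing more to do. */
+        return 0;
+    }
+
+    /* ... normal reply path (e.g. as in fuse_err_cbk()) ... */
+    return 0;
+}
+```
+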
+### FUSE answer for the interrupted fop
+
+The kernel acknowledges a successful interruption for a given FUSE request
+if the filesystem daemon answers it with errno EINTR; upon that, the syscall
+which induced the request will be abruptly terminated with an interrupt, rather
+than returning a value.
+
+In glusterfs, this can be arranged in two ways.
+
+- If the interrupt handler wins the race for the interrupt record, ie.
+ `fuse_interrupt_finish_fop()` returns true to `fuse_<FOP>_cbk()`, then, as
+ said above, `fuse_<FOP>_cbk()` does not need to answer the FUSE request.
+ That's because then the interrupt handler will take care about answering
+ it (with errno EINTR).
+- If `fuse_interrupt_finish_fop()` returns false to `fuse_<FOP>_cbk()`, then
+ this return value does not inform the fop handler whether there was an interrupt
+  or not. This return value occurs both when the fop handler won the race for the
+ interrupt record against the interrupt handler, and when there was no interrupt
+ at all.
+
+ However, the internal logic of the fop handler might detect from other
+ circumstances that an interrupt was delivered. For example, the fop handler
+ might be sleeping, waiting for some data to arrive, so that a premature
+ wakeup (with no data present) occurs if the interrupt handler intervenes. In
+ such cases it's the responsibility of the fop handler to reply the FUSE
+  request with errno EINTR.
diff --git a/doc/developer-guide/identifying-resource-leaks.md b/doc/developer-guide/identifying-resource-leaks.md
index 851fc4424bc..950cae79b0a 100644
--- a/doc/developer-guide/identifying-resource-leaks.md
+++ b/doc/developer-guide/identifying-resource-leaks.md
@@ -174,3 +174,27 @@ In this case, the resource leak can be addressed by adding a single line to the
Running the same Valgrind command and comparing the output will show that the
memory leak in `xlators/meta/src/meta.c:init` is not reported anymore.
+
+### Running DRD, the Valgrind thread error detector
+
+When configuring GlusterFS with:
+
+```shell
+./configure --enable-valgrind
+```
+
+the default Valgrind tool (Memcheck) is enabled. But it's also possible to select
+one of Memcheck or DRD by using:
+
+```shell
+./configure --enable-valgrind=memcheck
+```
+
+or:
+
+```shell
+./configure --enable-valgrind=drd
+```
+
+respectively. When using DRD, it's recommended to consult
+https://valgrind.org/docs/manual/drd-manual.html before running.
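+
+Once built with DRD enabled, a process can be run in the foreground under
+Valgrind; for example (a sketch; the server, volume name and mount point are
+placeholders):
+
+```shell
+valgrind --tool=drd --log-file=drd.out glusterfs -N \
+    --volfile-server=server1 --volfile-id=myvol /mnt/glusterfs
+```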
diff --git a/doc/developer-guide/logging-guidelines.md b/doc/developer-guide/logging-guidelines.md
index 58adf944b67..0e6b2588535 100644
--- a/doc/developer-guide/logging-guidelines.md
+++ b/doc/developer-guide/logging-guidelines.md
@@ -62,7 +62,7 @@ There are 2 interfaces provided to log messages,
headers (like the time stamp, dom, errnum etc.). The primary users of
the above interfaces are, when printing the final graph, or printing
the configuration when a process is about dump core or abort, or
- printing the backtrace when a process recieves a critical signal
+ printing the backtrace when a process receives a critical signal
- These interfaces should not be used outside the scope of the users
above, unless you know what you are doing
diff --git a/doc/developer-guide/network_compression.md b/doc/developer-guide/network_compression.md
index 7327591ef63..1222a765276 100644
--- a/doc/developer-guide/network_compression.md
+++ b/doc/developer-guide/network_compression.md
@@ -1,9 +1,9 @@
-#On-Wire Compression + Decompression
+# On-Wire Compression + Decompression
The 'compression translator' compresses and decompresses data in-flight
between client and bricks.
-###Working
+### Working
When a writev call occurs, the client compresses the data before sending it to
brick. On the brick, compressed data is decompressed. Similarly, when a readv
call occurs, the brick compresses the data before sending it to client. On the
@@ -19,7 +19,7 @@ During normal operation, this is the format of data sent over wire:
The trailer contains the CRC32 checksum and length of original uncompressed
data. This is used for validation.
-###Usage
+### Usage
Turning on compression xlator:
@@ -27,7 +27,7 @@ Turning on compression xlator:
gluster volume set <vol_name> network.compression on
~~~
-###Configurable parameters (optional)
+### Configurable parameters (optional)
**Compression level**
~~~
@@ -35,10 +35,10 @@ gluster volume set <vol_name> network.compression.compression-level 8
~~~
~~~
-0 : no compression
-1 : best speed
-9 : best compression
--1 : default compression
+ 0 : no compression
+ 1 : best speed
+ 9 : best compression
+-1 : default compression
~~~
**Minimum file size**
@@ -55,7 +55,7 @@ Other less frequently used parameters include `network.compression.mem-level`
and `network.compression.window-size`. More details can about these options
can be found by running `gluster volume set help` command.
-###Known Issues and Limitations
+### Known Issues and Limitations
* Compression translator cannot work with striped volumes.
* Mount point hangs when writing a file with write-behind xlator turned on. To
@@ -65,7 +65,7 @@ set`performance.strict-write-ordering` to on.
distribute volumes. This limitation is caused by AFR not being able to
propagate xdata. This issue has been fixed in glusterfs versions > 3.5
-###TODO
+### TODO
Although zlib offers high compression ratio, it is very slow. We can make the
translator pluggable to add support for other compression methods such as
[lz4 compression](https://code.google.com/p/lz4/)
diff --git a/doc/developer-guide/options-to-contribute.md b/doc/developer-guide/options-to-contribute.md
new file mode 100644
index 00000000000..3f0d84e7645
--- /dev/null
+++ b/doc/developer-guide/options-to-contribute.md
@@ -0,0 +1,212 @@
+# A guide for contributors
+
+While you have gone through 'how to contribute' guides, if you are
+not sure what to work on, but really want to help the project, you
+have now landed on the right document :-)
+
+### Basic
+
+Instead of planning to fix **all** the below issues in one patch,
+we recommend you have a constant, continuous flow of improvements
+for the project. We recommend you pick 1 file (or just a few files) at
+a time to address the below issues.
+Pick any `.c` (or `.h`) file, and you can send a patch which fixes **any**
+of the below themes. Ideally, fix all such occurrences in the file, even
+though the reviewers would review even a single-line change patch
+from you.
+
+1. Check the variable definitions, and if there is a very large array
+definition at the top of the function, see if you can re-scope
+the variable to the relevant sections (if it helps).
+
+Most of the time, some of these arrays may be used for 'error' handling,
+and it is possible to use them only in that scope.
+
+Reference: https://review.gluster.org/20846/
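+
+For example (a generic sketch, not from the actual codebase; `do_io()` and
+`log_error()` are hypothetical helpers), an array used only on the error path
+can be moved into that block:
+
+```
+#include <limits.h>
+#include <stdio.h>
+
+int do_io(int fd);                  /* hypothetical helpers */
+void log_error(const char *msg);
+
+int
+do_work(int fd)
+{
+    int ret = do_io(fd);
+    if (ret < 0) {
+        /* The large buffer now lives only in the error path,
+         * instead of for the whole lifetime of the function. */
+        char errmsg[PATH_MAX];
+        snprintf(errmsg, sizeof(errmsg), "io failed on fd %d", fd);
+        log_error(errmsg);
+    }
+    return ret;
+}
+```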
+
+
+2. Check for complete string initialization at the beginning of a function.
+Ideally, there is no reason to initialize a string. Fix it across the file.
+
+Example:
+
+`char new_path_name[PATH_MAX] = {0};` to `char new_path_name[PATH_MAX];`
+
+
+3. Change `calloc()` to `malloc()` wherever it makes sense.
+
+When allocating a structure where you expect certain (or most of the)
+variables to be 0 (or NULL), it makes sense to use calloc(). Otherwise,
+there is an extra cost to `memset()` the whole object after allocating it.
+While it is not a significant improvement in performance, in code which gets
+hit 1000s of times in a second, it adds some value.
+
+Reference: https://review.gluster.org/20878/
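+
+A generic sketch of the pattern (plain libc calls shown here; in the codebase
+the equivalents would be the GF_CALLOC/GF_MALLOC wrappers):
+
+```
+#include <stdlib.h>
+
+struct request {
+    int fd;
+    size_t size;
+    void *buf;
+};
+
+struct request *
+request_new(int fd, size_t size, void *buf)
+{
+    /* calloc(1, sizeof(*req)) would zero memory that is overwritten
+     * immediately below anyway, so malloc() is sufficient here. */
+    struct request *req = malloc(sizeof(*req));
+    if (!req)
+        return NULL;
+    req->fd = fd;
+    req->size = size;
+    req->buf = buf;
+    return req;
+}
+```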
+
+
+4. You can consider using `snprintf()`, instead of `strncpy()` while dealing
+with strings.
+
+strncpy() won't null terminate if the dest buffer isn't big enough; snprintf()
+does. While most of the string operations in the code are on arrays of a larger
+size than required, strncpy() does an extra copy of 0s at the end of the
+string, up to the size of the array. It makes sense to use `snprintf()`,
+which doesn't suffer from that behavior.
+
+Also check the return value from snprintf() for truncation and handle it
+accordingly, as in the sketch after the reference below.
+
+Reference: https://review.gluster.org/20925/
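+
+A short sketch of checking the snprintf() return value for truncation
+(`build_path()` is a made-up example function):
+
+```
+#include <stdio.h>
+
+/* Returns 0 on success, -1 if the result did not fit in 'path'. */
+int
+build_path(char *path, size_t len, const char *dir, const char *name)
+{
+    int ret = snprintf(path, len, "%s/%s", dir, name);
+    if (ret < 0 || (size_t)ret >= len)
+        return -1;  /* encoding error or truncated output */
+    return 0;
+}
+```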
+
+
+5. Now, pick a `.h` file and see if a structure is very large; see
+if re-aligning its members as per the [coding-standard](./coding-standard.md) gives any size
+benefit. If yes, go ahead and change it. Make sure you check all the structures
+in the file for a similar pattern.
+
+Reference: [Check this section](https://github.com/gluster/glusterfs/blob/master/doc/developer-guide/coding-standard.md#structure-members-should-be-aligned-based-on-the-padding-requirements)
+
+
+### If you are up for more :-)
+
+Good progress! Glad you are interested to know more. We are surely interested
+in next level of contributions from you!
+
+#### Coverity
+
+Visit [Coverity Dashboard](https://scan.coverity.com/projects/gluster-glusterfs?tab=overview).
+
+Now, if the number of defects is not 0, you have an opportunity to contribute.
+
+You get all the details on why the particular defect is mentioned there, and
+the most probable hint on how to fix it. Do it!
+
+Reference: https://review.gluster.org/21394/
+
+Use the same reference id (789278) in the patch, so we can capture all such
+fixes in a single bugzilla.
+
+#### Clang-Scan
+
+Clang-Scan is a tool which scans the .c files and reports possible issues,
+similar to Coverity, but it is a different tool. Over the years we have seen
+that the two report very different sets of issues, and hence there is value in
+fixing both.
+
+GlusterFS project gets tested with clang-scan job every night, and the report is
+posted in the [job details page](https://build.gluster.org/job/clang-scan/lastCompletedBuild/clangScanBuildBugs/).
+As long as the number is not 0 in the report here, you have an opportunity to
+contribute! Similar to coverity dashboard, click on 'Details' to find out the
+reason behind that report, and send a patch.
+
+Reference: https://review.gluster.org/21025
+
+Again, you can use reference Id (1622665) for these patches!
+
+
+### I am good with programming, I would like to do more than above!
+
+#### Locked regions / Critical sections
+
+In the file you open, see if a lock is taken only to increment or decrement
+a flag, counter etc. If yes, then we recommend you convert it to atomic
+operations. It is a simple activity, but if you know programming, you would
+know the benefit here.
+
+NOTE: There may not always be a possibility to do this! You may have to check
+with the developers first before going ahead.
+
+Reference: https://review.gluster.org/21221/
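+
+A generic before/after sketch using C11 atomics (the codebase has its own
+atomic wrappers; plain `stdatomic.h` is used here only to show the idea):
+
+```
+#include <pthread.h>
+#include <stdatomic.h>
+
+/* Before: a mutex taken just to bump a counter. */
+static pthread_mutex_t lock = PTHREAD_MUTEX_INITIALIZER;
+static int nrequests;
+
+void
+count_request_locked(void)
+{
+    pthread_mutex_lock(&lock);
+    nrequests++;
+    pthread_mutex_unlock(&lock);
+}
+
+/* After: a lock-free atomic increment. */
+static atomic_int nrequests_atomic;
+
+void
+count_request_atomic(void)
+{
+    atomic_fetch_add(&nrequests_atomic, 1);
+}
+```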
+
+
+#### ASan (address sanitizer)
+
+[The job](https://build.gluster.org/job/asan/) runs regression with asan builds,
+and you can also run glusterfs with asan on your workload to identify the leaks.
+If there are any leaks reported, feel free to check them, and send us a patch.
+
+You can also run `valgrind` and let us know what it reports.
+
+Reference: https://review.gluster.org/21397
+
+
+#### Porting to different architecture
+
+This is something which we are not focusing right now, happy to collaborate!
+
+Reference: https://review.gluster.org/21276
+
+
+#### Fix 'TODO/FIXME' in codebase
+
+There are a few pending features or validations which have been pending for
+some time. You can pick them up in a given file, and choose to fix them.
+
+
+### I don't know C, but I am interested to contribute in some way!
+
+You are most welcome! Our community is open to your contribution! The first
+thing which comes to our mind is **documentation**. Next is **testing** or
+validation.
+
+If you have some hardware and want to run performance comparisons with
+different versions or options, helping us tune better is also a great help.
+
+
+#### Documentation
+
+1. We have some documentation in the [glusterfs repo](../); go through it, and
+see if you can help us keep it up-to-date.
+
+2. The https://docs.gluster.org is powered by https://github.com/gluster/glusterdocs
+repo. You can check out the repo, and help in keeping that up-to-date.
+
+3. [Our website](https://gluster.org) is maintained by https://github.com/gluster/glusterweb
+repo. Help us to keep this up-to-date, and add content there.
+
+4. Write blogs about Gluster, and your experience, and make world know little
+more about Gluster, and your use-case, and how it helped to solve the problem.
+
+
+#### Testing
+
+1. There is a regression test suite in glusterfs, which runs with every patch, and is
+triggered by just running `./run-tests.sh` from the root of the project repo.
+
+You can add more test cases to match your use-case, and send them as a patch, so
+you can make sure all future patches in glusterfs keep your use-case intact.
+
+2. [Glusto-Tests](https://github.com/gluster/glusto-tests): This is another testing
+framework written for gluster; it makes use of a clustered setup to test different
+use-cases, and helps to validate many bugs.
+
+
+#### Ansible
+
+The Gluster organization has a rich set of Ansible roles, which are actively maintained.
+Feel free to check them out here - https://github.com/gluster/gluster-ansible
+
+
+#### Monitoring
+
+We have a prometheus repo, and are actively working on adding more metrics. Add what
+you need @ https://github.com/gluster/gluster-prometheus
+
+
+#### Health-Report
+
+This is a project where, at any given point in time, if you want to run some
+set of commands locally and get an output to analyze the status, they can be
+added.
+Contribute @ https://github.com/gluster/gluster-health-report
+
+
+### All this C/bash/python is old-school, I want something in containers.
+
+We have something for you too :-)
+
+Please visit our https://github.com/gluster/gcs repo to check how you can help,
+and how gluster can help you in the container world.
+
+
+### Note
+
+For any queries, the best way is to contact us through the mailing list, <mailto:gluster-devel@gluster.org>
diff --git a/doc/developer-guide/syncop.md b/doc/developer-guide/syncop.md
index 4e30451f30e..bcc8bd08e01 100644
--- a/doc/developer-guide/syncop.md
+++ b/doc/developer-guide/syncop.md
@@ -1,4 +1,4 @@
-#syncop framework
+# syncop framework
A coroutines-based, cooperative multi-tasking framework.
## Topics
diff --git a/doc/developer-guide/thread-naming.md b/doc/developer-guide/thread-naming.md
index 204cd7681b4..513140d4437 100644
--- a/doc/developer-guide/thread-naming.md
+++ b/doc/developer-guide/thread-naming.md
@@ -29,10 +29,10 @@ gf_thread_create_detached (pthread_t *thread,
As max name length for a thread in POSIX is only 16 characters including the
'\0' character, you have to be a little creative with naming. Also, it is
important that all Gluster threads have common prefix. Considering these
-conditions, we have "gluster" as prefix for all the threads created by these
+conditions, we have "glfs_" as prefix for all the threads created by these
wrapper functions. It is responsibility of the owner of thread to provide the
suffix part of the name. It does not have to be a descriptive name, as it has
-only 8 letters to work with. However, it should be unique enough such that it
+only 10 letters to work with. However, it should be unique enough such that it
can be matched with a table which describes it.
If n number of threads are spwaned to perform same function, it is must that the
@@ -87,6 +87,7 @@ such that it can be matched with a table below without ambiguity.
- posixfsy - posix fsync
- posixhc - posix heal
- posixjan - posix janitor
+- posixrsv - posix reserve
- quiesce - quiesce dequeue
- rdmaAsyn - rdma async event handler
- rdmaehan - rdma completion handler
@@ -99,6 +100,5 @@ such that it can be matched with a table below without ambiguity.
- spoller - socket poller
- sprocN - syncop worker thread
- tbfclock - token bucket filter token generator thread
-- tierfixl - tier fix layout
- timer - timer thread
- upreaper - upcall reaper
diff --git a/doc/developer-guide/translator-development.md b/doc/developer-guide/translator-development.md
index 3bf7e153354..f75935519f6 100644
--- a/doc/developer-guide/translator-development.md
+++ b/doc/developer-guide/translator-development.md
@@ -472,7 +472,7 @@ hello
Now let's interrupt the process and see where we are.
```
-^C
+
Program received signal SIGINT, Interrupt.
0x0000003a0060b3dc in pthread_cond_wait@@GLIBC_2.3.2 ()
from /lib64/libpthread.so.0
@@ -680,4 +680,4 @@ Original author's site:
Gluster community site:
- * [Translators](http://www.gluster.org/community/documentation/index.php/Translators)
+ * [Translators](https://docs.gluster.org/en/latest/Quick-Start-Guide/Architecture/#translators)
diff --git a/doc/developer-guide/xlator-classification.md b/doc/developer-guide/xlator-classification.md
new file mode 100644
index 00000000000..6073df9375f
--- /dev/null
+++ b/doc/developer-guide/xlator-classification.md
@@ -0,0 +1,221 @@
+# xlator categories and expectations
+
+The purpose of this document is to define a category for various xlators
+and expectations around what each category means from the perspective of
+health and maintenance of an xlator.
+
+This is needed to ensure certain categories are kept in good health, and it
+helps the community and contributors focus their efforts around the same.
+
+This document also provides implementation details for xlator developers to
+declare a category for any xlator.
+
+## Table of contents
+1. Audience
+2. Categories (and expectations of each category)
+3. Implementation and usage details
+
+## Audience
+
+This document is intended for the following community participants:
+- New xlator contributors
+- Existing xlator maintainers
+- Packaging and gluster management stack maintainers
+
+For a more user-facing understanding, it is recommended to read section (TBD)
+in the gluster documentation.
+
+## Categories
+1. Experimental (E)
+2. TechPreview (TP)
+3. Maintained (M)
+4. Deprecated (D)
+5. Obsolete (O)
+
+### Experimental (E)
+
+Developed in the experimental branch, for exploring new features. These xlators
+are NEVER packaged as a part of releases; interested users and contributors can
+build and work with these from sources. In the future, these may be available as
+a package based on a weekly build of the same.
+
+#### Quality expectations
+- Compiles or passes smoke tests
+- Does not break nightly experimental regressions
+ - NOTE: If a nightly is broken, then all patches that were merged are reverted
+ till the errant patch is found and subsequently fixed
+
+### TechPreview (TP)
+
+Xlators in master or release branches that are not deemed fit to be in
+production deployments, but are feature complete enough to invite feedback and
+host user data.
+
+These xlators will be worked on with priority by maintainers/authors who are
+involved in making them more stable than xlators in the Experimental/Deprecated/
+Obsolete categories.
+
+There is no guarantee that these xlators will move to the Maintained state; they
+may just be obsoleted based on feedback, other project goals, or technical
+alternatives.
+
+#### Quality expectations
+- Same as Maintained, minus
+ - Performance, Scale, other(?)
+  - *TBD* *NOTE* Need inputs. The intention is all quality goals as in Maintained,
+ other than the list above (which for now has scale and performance)
+
+### Maintained (M)
+
+These xlators are part of the core Gluster functionality and are actively
+maintained. They are part of master and release branches and are higher in
+priority for maintainers and other interested contributors.
+
+#### Quality expectations
+
+NOTE: A short note on what each of these means is added here; details to follow.
+
+NOTE: Not all of the following are mandated out of the gate; consider them a
+desirable state to reach as we progress on each
+
+- Bug backlog: Actively address bug backlog
+- Enhancement backlog: Actively maintain outstanding enhancement backlog (need
+ not be acted on, but should be visible to all)
+- Review backlog: Actively keep this below desired counts and states
+- Static code health: Actively meet near-zero issues in this regard
+ - Coverity, spellcheck and other checks
+- Runtime code health: Actively meet defined coverage levels in this regard
+ - Coverage, others?
+ - Per-patch regressions
+ - Glusto runs
+ - Performance
+ - Scalability
+- Technical specifications: Implementation details should be documented and
+  updated at a regular cadence (even per patch that changes the assumptions
+  made here)
+- User documentation: User-facing details should be kept current in the
+  documentation
+- Debuggability: Steps, tools, procedures should be documented and maintained
+ each release/patch as applicable
+- Troubleshooting: Steps, tools, procedures should be documented and maintained
+ each release/patch as applicable
+ - Steps/guides for self service
+ - Knowledge base for problems
+- Other common criteria that will apply: Required metrics/desired states to be
+  defined per criterion
+ - Monitoring, usability, statedump, and other such xlator expectations
+
+### Deprecated (D)
+
+Xlators on master or release branches that would be obsoleted and/or replaced
+with similar or other functionality in the next major release.
+
+#### Quality expectations
+- Retain status quo when moved to this state, till it is moved to Obsolete
+- Provide migration steps if the feature provided by the xlator is replaced
+  with other xlators
+
+### Obsolete (O)
+
+Xlator/code still in tree, but not packaged or shipped or maintained in any
+form. This is noted as a category till the code is removed from the tree.
+
+The tests for these xlators will not be executed, and the health of their code will no longer be tracked.
+
+#### Quality expectations
+- None
+
+## Implementation and usage details
+
+### How to specify an xlator's category
+
+While defining the 'xlator_api_t' structure for the corresponding xlator, set
+the category field as below:
+
+```
+diff --git a/xlators/performance/nl-cache/src/nl-cache.c b/xlators/performance/nl-cache/src/nl-cache.c
+index 0f0e53bac2..8267d6897c 100644
+--- a/xlators/performance/nl-cache/src/nl-cache.c
++++ b/xlators/performance/nl-cache/src/nl-cache.c
+@@ -869,4 +869,5 @@ xlator_api_t xlator_api = {
+ .cbks = &nlc_cbks,
+ .options = nlc_options,
+ .identifier = "nl-cache",
++ .category = GF_TECH_PREVIEW,
+ };
+diff --git a/xlators/performance/quick-read/src/quick-read.c b/xlators/performance/quick-read/src/quick-read.c
+index 8d39720e7f..235de27c19 100644
+--- a/xlators/performance/quick-read/src/quick-read.c
++++ b/xlators/performance/quick-read/src/quick-read.c
+@@ -1702,4 +1702,5 @@ xlator_api_t xlator_api = {
+ .cbks = &qr_cbks,
+ .options = qr_options,
+ .identifier = "quick-read",
++ .category = GF_MAINTAINED,
+ };
+```
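+
+For orientation, below is a minimal sketch of what a full declaration could
+look like; the `example_*` symbols are hypothetical placeholders rather than a
+real xlator, and the exact field set may differ from the in-tree structure:
+
+```c
+/* Sketch only: an xlator declaring itself as Tech Preview.
+ * All example_* names are placeholders, not real symbols. */
+xlator_api_t xlator_api = {
+    .init = example_init,        /* set-up entry point */
+    .fini = example_fini,        /* tear-down entry point */
+    .fops = &example_fops,       /* file-operation table */
+    .cbks = &example_cbks,       /* callback table */
+    .options = example_options,  /* volume_options array */
+    .identifier = "example",
+    .category = GF_TECH_PREVIEW, /* the classification discussed here */
+};
+```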
+
+Similarly, if a particular option is in a different state than the xlator
+itself, one can set the same field in the options structure too.
+
+```
+diff --git a/xlators/cluster/afr/src/afr.c b/xlators/cluster/afr/src/afr.c
+index 0e86e33d03..81996743d1 100644
+--- a/xlators/cluster/afr/src/afr.c
++++ b/xlators/cluster/afr/src/afr.c
+@@ -772,6 +772,7 @@ struct volume_options options[] = {
+ .description = "Maximum latency for shd halo replication in msec."
+ },
+ { .key = {"halo-enabled"},
++ .category = GF_TECH_PREVIEW,
+ .type = GF_OPTION_TYPE_BOOL,
+ .default_value = "False",
+
+```
+
+
+### User experience using the categories
+
+#### Ability to use a category
+
+This section details when each category of xlators can be used, and the
+specifics around how each category is enabled.
+
+1. Maintained category xlators can be used by default; this implies that volumes
+created with these xlators enabled will throw no warnings and need no user
+intervention to use the xlator.
+
+2. Tech Preview category xlators need cluster configuration changes to allow
+these xlators to be used in volumes; further, logs will contain a message
+stating TP xlators are in use. Without the cluster configured to allow TP
+xlators, volumes created or edited to use such xlators would result in errors.
+ - (TBD) Cluster configuration option
+ - (TBD) Warning message
+ - (TBD) Code mechanics on how this is achieved
+
+3. Deprecated category xlators can be used by default, but will throw a warning
+in the logs that such xlators are in use and will be obsoleted in the future.
+ - (TBD) Warning message
+
+4. Obsolete category xlators will not be packaged and hence cannot be used from
+release builds.
+
+5. Experimental category xlators will not be packaged and hence cannot be used
+from release builds. If running experimental (weekly or other such) builds,
+these will throw a warning in the logs stating experimental xlators are in use.
+ - (TBD) Warning message
+
+#### Ability to query xlator category
+
+(TBD) Need to provide the ability to query xlator categories, or list xlators
+and their respective categories.
+
+#### User facing changes
+
+User-facing changes that are expected due to this change include the following:
+- Cluster-wide option to enable TP xlators, or more generically a given
+  category of xlators
+- Errors in commands that fail due to invalid categories
+- Warning messages in logs to denote certain categories of xlators are in use
+- (TBD) Ability to query xlators and their respective categories
diff --git a/doc/features/ctime.md b/doc/features/ctime.md
index 6c6cad90055..74a77abed4b 100644
--- a/doc/features/ctime.md
+++ b/doc/features/ctime.md
@@ -1,25 +1,25 @@
-#Consistent time attributes in gluster across replica/distribute
+# Consistent time attributes in gluster across replica/distribute
-####Problem:
+#### Problem:
Traditionally gluster has been using time attributes (ctime, atime, mtime) of files/dirs from bricks. The problem with this approach is that it is not consistent across replica and distribute bricks. And applications which depend on it break, as replica might not always return time attributes from the same brick.
Tar especially gives "file changed as we read it" whenever it detects ctime differences when stat is served from different bricks. The way we have been trying to solve it is to serve the stat structures from the same brick in afr, and max-time in dht. But it doesn't avoid the problem completely. Because there is no way to change ctime at the moment (lutimes() only allows mtime, atime), there is little we can do to make sure ctimes match after self-heals/xattr updates/rebalance.
-####Solution Proposed:
+#### Solution Proposed:
Store time attributes (ctime, mtime, atime) as an xattr of the file. The xattr is updated based
on the fop. If a filesystem fop changes only mtime and ctime, update only those in xattr for
that file.
-####Design Overview:
+#### Design Overview:
1) As part of each fop, the top layer will generate a time stamp and pass it down along
with other information
-> - This will bring a dependency for NTP synced clients along with servers
-> - There can be a diff in time if the fop stuck in the xlator for various reason,
+ - This will bring a dependency on NTP-synced clients along with servers
+ - There can be a diff in time if the fop is stuck in the xlator for various reasons,
for ex: because of locks.
2) On the server, the posix layer stores the value in memory (inode ctx) and will sync the data periodically to the disk as an extended attr
-> - Of course sync call also will force it. And fop comes for an inode which is not linked, we do the sync immediately.
+ - Of course a sync call will also force it. And if a fop comes for an inode which is not linked, we do the sync immediately.
3) Each time inodes are created or initialized, the data is read from disk and stored in the inode ctx.
@@ -29,19 +29,19 @@ for ex: because of locks.
6) File ops that change the parent directory's time attributes need to be consistent across all the distributed directories across the subvolumes. (for eg: a create call will change the ctime and mtime of the parent dir)
-> - This has to handle separately because we only send the fop to the hashed subvolume.
-> - We can asynchronously send the timeupdate setattr fop to the other subvoumes and change the values for parent directory if the file fops is successful on hashed subvolume.
-> - This will have a window where the times are inconsistent across dht subvolume (Please provide your suggestions)
+ - This has to be handled separately because we only send the fop to the hashed subvolume.
+ - We can asynchronously send the time-update setattr fop to the other subvolumes and change the values for the parent directory if the file fop is successful on the hashed subvolume.
+ - This will have a window where the times are inconsistent across dht subvolumes (Please provide your suggestions)
7) Currently we have a couple of mount options for time attributes, like noatime, relatime, nodiratime etc. But we have not explicitly handled those options even if they are given as mount options when mounting gluster.
-####Implementation Overview:
+#### Implementation Overview:
This feature involves changes in the following xlators.
-> - utime xlator
-> - posix xlator
+ - utime xlator
+ - posix xlator
-#####utime xlator:
+##### utime xlator:
This is a new client-side xlator which does the following tasks.
1. It generates a time stamp and passes it down in frame->root->ctime and over the network.
@@ -50,7 +50,7 @@ This is a new client side xlator which does following tasks.
Patches:
1. https://review.gluster.org/#/c/19857/
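+
+To make task 1 concrete, here is a rough sketch (not the merged
+implementation; the function name is hypothetical, and it assumes the call
+stack carries a 'ctime' timespec as described above) of stamping the time on
+the client side:
+
+```c
+#include <time.h>
+
+/* Sketch: record the current wall-clock time in the call stack
+ * before winding the fop down, so the server-side posix xlator
+ * can persist it as an xattr. */
+static void
+utime_stamp_ctime(call_frame_t *frame)
+{
+    struct timespec now;
+
+    clock_gettime(CLOCK_REALTIME, &now); /* NTP-synced client clock */
+    frame->root->ctime = now;            /* travels over the network */
+}
+```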
-#####posix xlator:
+##### posix xlator:
The following tasks are done in the posix xlator:
1. Provides APIs to set and get the xattr from the backend. It also caches the xattr in the inode context. During get, it updates the time attributes stored in the xattr into the iatt structure.
@@ -61,7 +61,7 @@ Following tasks are done in posix xlator:
2. https://review.gluster.org/#/c/19795/
3. https://review.gluster.org/#/c/19796/
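+
+Correspondingly, a rough sketch of the posix-side persistence (the xattr key
+and helper name here are assumed placeholders, not taken from the patches
+above):
+
+```c
+#include <sys/xattr.h>
+
+/* Sketch: flush the cached time attributes from the inode ctx to
+ * disk as a single extended attribute on the backend file. */
+static int
+posix_sync_mdata(const char *path, const void *mdata, size_t len)
+{
+    return lsetxattr(path, "trusted.glusterfs.mdata", mdata, len, 0);
+}
+```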
-####Pending Work:
+#### Pending Work:
1. Handling of time-related mount options (noatime, relatime, etc)
2. flag based create (depending on flags in open, create behaviour might change)
3. Changes in dht for directory sync across multiple subvolumes
diff --git a/doc/gluster.8 b/doc/gluster.8
index 3c523260c2d..ba595edca15 100644
--- a/doc/gluster.8
+++ b/doc/gluster.8
@@ -38,10 +38,10 @@ Display information about all volumes, or the specified volume.
\fB\ volume list \fR
List all volumes in cluster
.TP
-\fB\ volume status [all | <VOLNAME> [nfs|shd|<BRICK>|quotad|tierd]] [detail|clients|mem|inode|fd|callpool|tasks|client-list] \fR
+\fB\ volume status [all | <VOLNAME> [nfs|shd|<BRICK>|quotad]] [detail|clients|mem|inode|fd|callpool|tasks|client-list] \fR
Display status of all or specified volume(s)/brick
.TP
-\fB\ volume create <NEW-VOLNAME> [stripe <COUNT>] [replica <COUNT>] [disperse [<COUNT>]] [redundancy <COUNT>] [transport <tcp|rdma|tcp,rdma>] <NEW-BRICK> ... \fR
+\fB\ volume create <NEW-VOLNAME> [stripe <COUNT>] [[replica <COUNT> [arbiter <COUNT>]]|[replica 2 thin-arbiter 1]] [disperse [<COUNT>]] [disperse-data <COUNT>] [redundancy <COUNT>] [transport <tcp|rdma|tcp,rdma>] <NEW-BRICK> ... <TA-BRICK> \fR
Create a new volume of the specified type using the specified bricks and transport type (the default transport type is tcp).
To create a volume with both transports (tcp and rdma), give 'transport tcp,rdma' as an option.
.TP
@@ -113,6 +113,9 @@ Rotate the log file for corresponding volume/brick.
\fB\ volume profile <VOLNAME> {start|info [peek|incremental [peek]|cumulative|clear]|stop} [nfs] \fR
Profile operations on the volume. Once started, volume profile <volname> info provides cumulative statistics of the FOPs performed.
.TP
+\fB\ volume top <VOLNAME> {open|read|write|opendir|readdir|clear} [nfs|brick <brick>] [list-cnt <value>] | {read-perf|write-perf} [bs <size> count <count>] [brick <brick>] [list-cnt <value>] \fR
+Generates a profile of a volume representing the performance and bottlenecks/hotspots of each brick.
+.TP
\fB\ volume statedump <VOLNAME> [[nfs|quotad] [all|mem|iobuf|callpool|priv|fd|inode|history]... | [client <hostname:process-id>]] \fR
Dumps the in memory state of the specified process or the bricks of the volume.
.TP
@@ -134,32 +137,6 @@ List all the nodes in the pool (including localhost)
.TP
\fB\ peer help \fR
Display help for the peer command.
-.SS "Tier Commands"
-.TP
-\fB\ volume tier <VOLNAME> attach [<replica COUNT>] <NEW-BRICK>... \fR
-Attach to an existing volume a tier of specified type using the specified bricks.
-.TP
-\fB\ volume tier <VOLNAME> start [force] \fR
-Start the tier service for <VOLNAME>
-.TP
-\fB\ volume tier <VOLNAME> status \fR
-Display statistics on data migration between the hot and cold tiers.
-.TP
-\fB\ volume tier <VOLNAME> stop [force] \fR
-Stop the tier service for <VOLNAME>
-.TP
-\fB\ volume tier <VOLNAME> detach start\fR
-Begin detaching the hot tier from the volume. Data will be moved from the hot tier to the cold tier.
-.TP
-\fB\ volume tier <VOLNAME> detach commit [force]\fR
-Commit detaching the hot tier from the volume. The volume will revert to its original state before the hot tier was attached.
-.TP
-\fB\ volume tier <VOLNAME> detach status\fR
-Check status of data movement from the hot to cold tier.
-.TP
-\fB\ volume tier <VOLNAME> detach stop\fR
-Stop detaching the hot tier from the volume.
-
.SS "Quota Commands"
.TP
\fB\ volume quota <VOLNAME> enable \fR
@@ -214,8 +191,10 @@ NOTE: valid units of time and their symbols are : hours(h/hr), minutes(m/min), s
\fB\ system:: execute gsec_create\fR
Generates pem keys which are required for push-pem
.TP
-\fB\ volume geo-replication <MASTER_VOL> <SLAVE_HOST>::<SLAVE_VOL> create [push-pem] [force]\fR
+\fB\ volume geo-replication <MASTER_VOL> <SLAVE_HOST>::<SLAVE_VOL> create [[ssh-port n][[no-verify]|[push-pem]]] [force]\fR
Create a new geo-replication session from <MASTER_VOL> to <SLAVE_HOST> host machine having <SLAVE_VOL>.
+Use ssh-port n if a custom SSH port is configured on the slave nodes.
+Use no-verify if the rsa-keys of nodes in the master volume are distributed to slave nodes through an external agent.
Use push-pem to push the keys automatically.
.TP
\fB\ volume geo-replication <MASTER_VOL> <SLAVE_HOST>::<SLAVE_VOL> {start|stop} [force] \fR
@@ -239,6 +218,12 @@ Use "!<OPTION>" to reset option <OPTION> to default value.
\fB\ volume bitrot <VOLNAME> {enable|disable} \fR
Enable/disable bitrot for volume <VOLNAME>
.TP
+\fB\ volume bitrot <VOLNAME> signing-time <time-in-secs> \fR
+Waiting time for an object, after the last fd is closed, before the signing process starts.
+.TP
+\fB\ volume bitrot <VOLNAME> signer-threads <count> \fR
+Number of signing process threads. Usually set to the number of available cores.
+.TP
\fB\ volume bitrot <VOLNAME> scrub-throttle {lazy|normal|aggressive} \fR
Scrub-throttle value is a measure of how fast or slow the scrubber scrubs the filesystem for volume <VOLNAME>
.TP
diff --git a/doc/glusterd.8 b/doc/glusterd.8
index 3ef7c2b72d1..e3768c78761 100644
--- a/doc/glusterd.8
+++ b/doc/glusterd.8
@@ -30,8 +30,8 @@ File to use for logging.
\fB\-L <LOGLEVEL>, \fB\-\-log\-level=<LOGLEVEL>\fR
Logging severity. Valid options are TRACE, DEBUG, INFO, WARNING, ERROR and CRITICAL (the default is INFO).
.TP
-\fB\-L, \fB\-\-localtime\-logging=on|off\fR
-Enable or disable localtime log timestamps. Valid options are on and off (the default is off).
+\fB\-\-localtime\-logging\fR
+Enable localtime log timestamps.
.TP
\fB\-\-debug\fR
Run the program in debug mode. This option sets \fB\-\-no\-daemon\fR, \fB\-\-log\-level\fR to DEBUG
diff --git a/doc/glusterfs.8 b/doc/glusterfs.8
index 592dedb6759..3d359ea85e4 100644
--- a/doc/glusterfs.8
+++ b/doc/glusterfs.8
@@ -53,8 +53,8 @@ Maximum number of connect attempts to server. This option should be provided wit
\fB\-\-acl\fR
Mount the filesystem with POSIX ACL support.
.TP
-\fB\-L, \fB\-\-localtime\-logging=on|off\fR
-Enable or disable localtime log timestamps. Valid options are on and off (the default is off).
+\fB\-\-localtime\-logging\fR
+Enable localtime log timestamps.
.TP
\fB\-\-debug\fR
Run in debug mode. This option sets \fB\-\-no\-daemon\fR, \fB\-\-log\-level\fR to DEBUG,
@@ -63,8 +63,8 @@ and \fB\-\-log\-file\fR to console.
\fB\-\-enable\-ino32=BOOL\fR
Use 32-bit inodes when mounting to work around applications that don't support 64-bit inodes.
.TP
-\fB\-\-fopen\-keep\-cache\fR
-Do not purge the cache on file open.
+\fB\-\-fopen\-keep\-cache[=BOOL]\fR
+Do not purge the cache on file open (default: false).
.TP
\fB\-\-mac\-compat=BOOL\fR
Provide stubs for attributes needed for seamless operation on Macs (the default is off).
@@ -139,6 +139,11 @@ Enable fuse in-kernel writeback cache.
\fB\-\-negative\-timeout=SECONDS\fR
Set negative timeout to SECONDS in fuse kernel module (the default is 0).
.TP
+\fB\-\-auto\-invalidation=BOOL\fR
+Controls whether the fuse kernel module can auto-invalidate attribute, dentry and
+page-cache entries. Disable this only if the same files/directories are not
+accessed across two different mounts concurrently [default: on].
+.TP
\fB\-\-volfile-check\fR
Enable strict volume file checking.
diff --git a/doc/glusterfsd.8 b/doc/glusterfsd.8
index c5a95d1611f..bc1de2a8c80 100644
--- a/doc/glusterfsd.8
+++ b/doc/glusterfsd.8
@@ -51,8 +51,8 @@ Server to get the volume from. This option overrides \fB\-\-volfile option
.PP
.TP
-\fB\-L, \fB\-\-localtime\-logging=on|off\fR
-Enable or disable localtime log timestamps. Valid options are on and off (the default is off).
+\fB\-\-localtime\-logging\fR
+Enable localtime log timestamps.
.TP
\fB\-\-debug\fR
Run in debug mode. This option sets \fB\-\-no\-daemon\fR, \fB\-\-log\-level\fR to DEBUG
@@ -107,6 +107,11 @@ Enable/Disable direct-io mode in fuse module [default: enable]
.TP
\fB\-\-resolve-gids\fR
Resolve all auxiliary groups in fuse translator (max 32 otherwise)
+.TP
+\fB\-\-auto\-invalidation=BOOL\fR
+Controls whether the fuse kernel module can auto-invalidate attribute, dentry and
+page-cache entries. Disable this only if the same files/directories are not
+accessed across two different mounts concurrently [default: on]
.SS "Miscellaneous Options"
.PP
diff --git a/doc/mount.glusterfs.8 b/doc/mount.glusterfs.8
index 367f02d9b1a..ce16e9e40b7 100644
--- a/doc/mount.glusterfs.8
+++ b/doc/mount.glusterfs.8
@@ -44,8 +44,8 @@ INFO and NONE [default: INFO]
\fBacl
Mount the filesystem with POSIX ACL support
.TP
-\fBfopen\-keep\-cache
-Do not purge the cache on file open
+\fBfopen\-keep\-cache[=BOOL]
+Do not purge the cache on file open (default: false)
.TP
\fBworm
Mount the filesystem in 'worm' mode
@@ -122,6 +122,15 @@ Provide list of backup volfile servers in the following format [default: None]
\fBDeprecated\fR option - placed here for backward compatibility [default: 1]
.TP
.TP
+\fBlru-limit=\fRN
+Set the fuse module's limit for the number of inodes kept in the LRU list to N [default: 65536]
+.TP
+.TP
+\fBinvalidate-limit=\fRN
+Suspend fuse invalidations implied by 'lru-limit' if the number of outstanding
+invalidations reaches N
+.TP
+.TP
\fBbackground-qlen=\fRN
Set fuse module's background queue length to N [default: 64]
.TP
@@ -142,6 +151,11 @@ Enable fuse in-kernel writeback cache [default: off]
.TP
\fBattr\-times\-granularity=\fRNS
Declare supported granularity of file attribute [default: 0]
+.TP
+\fBauto\-invalidation=\fRBOOL
+Controls whether the fuse kernel module can auto-invalidate attribute, dentry and
+page-cache entries. Disable this only if the same files/directories are not
+accessed across two different mounts concurrently [default: on]
.PP
.SH FILES
.TP