path: root/tests/functional/dht/test_add_brick_rebalance_with_self_heal_in_progress.py
#  Copyright (C) 2020 Red Hat, Inc. <http://www.redhat.com>
#
#  This program is free software; you can redistribute it and/or modify
#  it under the terms of the GNU General Public License as published by
#  the Free Software Foundation; either version 2 of the License, or
#  any later version.
#
#  This program is distributed in the hope that it will be useful,
#  but WITHOUT ANY WARRANTY; without even the implied warranty of
#  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
#  GNU General Public License for more details.
#
#  You should have received a copy of the GNU General Public License along
#  with this program; if not, write to the Free Software Foundation, Inc.,
#  51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.

from random import choice
from glusto.core import Glusto as g
from glustolibs.gluster.gluster_base_class import GlusterBaseClass, runs_on
from glustolibs.gluster.exceptions import ExecutionError
from glustolibs.gluster.brick_libs import get_all_bricks, bring_bricks_online
from glustolibs.gluster.heal_libs import monitor_heal_completion
from glustolibs.gluster.rebalance_ops import (
    rebalance_start, wait_for_rebalance_to_complete)
from glustolibs.gluster.volume_libs import expand_volume
from glustolibs.io.utils import (collect_mounts_arequal, validate_io_procs,
                                 wait_for_io_to_complete)
from glustolibs.misc.misc_libs import kill_process


@runs_on([['distributed-replicated', 'distributed-arbiter'], ['glusterfs']])
class TestAddBrickRebalanceWithSelfHeal(GlusterBaseClass):

    def setUp(self):

        self.get_super_method(self, 'setUp')()

        # Setup Volume
        if not self.setup_volume_and_mount_volume([self.mounts[0]]):
            raise ExecutionError("Failed to Setup and mount volume")

        self.is_io_running = False
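        # Used by tearDown to decide whether pending I/O has to be waited on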

    def tearDown(self):

        # If I/O processes are still running, wait for them to complete
        if self.is_io_running:
            if not wait_for_io_to_complete(self.list_of_io_processes,
                                           [self.mounts[0]]):
                raise ExecutionError("Failed to wait for I/O to complete")

        if not self.unmount_volume_and_cleanup_volume([self.mounts[0]]):
            raise ExecutionError("Failed to Cleanup Volume")

        # Calling GlusterBaseClass tearDown
        self.get_super_method(self, 'tearDown')()

    def test_add_brick_rebalance_with_self_heal_in_progress(self):
        """
        Test case:
        1. Create a volume, start it and mount it.
        2. Start creating a few files on mount point.
        3. While file creation is going on, kill one of the bricks
           in the replica pair.
        4. After file creation is complete, collect arequal checksum
           on mount point.
        5. Bring back the brick online by starting volume with force.
        6. Check if all bricks are online and if heal is in progress.
        7. Add bricks to the volume and start rebalance.
        8. Wait for rebalance and heal to complete on volume.
        9. Collect arequal checksum on mount point and compare
           it with the one taken in step 4.
        """
        # Start I/O from the mount point (runs asynchronously; validated later)
        cmd = ("cd %s; for i in {1..1000} ; do "
               "dd if=/dev/urandom of=file$i bs=10M count=1; done"
               % self.mounts[0].mountpoint)
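        # Keep the async proc handle so the I/O can be validated below and,
        # if the test aborts early, waited on in tearDown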
        self.list_of_io_processes = [
            g.run_async(self.mounts[0].client_system, cmd)]
        self.is_io_running = True

        # Get the list of all bricks so one can be picked to kill
        brick_list = get_all_bricks(self.mnode, self.volname)
        self.assertIsNotNone(brick_list, "Failed to get the brick list of "
                             "volume %s" % self.volname)

        # Kill the brick process of a randomly chosen brick
        brick = choice(brick_list)
        node, _ = brick.split(":")
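        # Killing glusterfsd takes every brick hosted on that node offline;
        # client I/O keeps going through the surviving replica bricks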
        ret = kill_process(node, process_names="glusterfsd")
        self.assertTrue(ret, "Failed to kill brick process of brick %s"
                        % brick)

        # Validate that the I/O completed successfully
        ret = validate_io_procs(self.list_of_io_processes, [self.mounts[0]])
        self.assertTrue(ret, "IO failed on some of the clients")
        self.is_io_running = False

        # Collect arequal checksum before ops
        arequal_checksum_before = collect_mounts_arequal(self.mounts[0])

        # Bring back the brick online by starting volume with force
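        # 'volume_start_force' runs "gluster volume start <volname> force",
        # which restarts only the brick processes that are currently offline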
        ret = bring_bricks_online(self.mnode, self.volname, brick_list,
                                  bring_bricks_online_methods=[
                                      'volume_start_force'])
        self.assertTrue(ret, "Error in bringing back brick online")
        g.log.info('All bricks are online now')

        # Add brick to volume
        ret = expand_volume(self.mnode, self.volname, self.servers,
                            self.all_servers_info)
        self.assertTrue(ret, "Failed to add brick on volume %s"
                        % self.volname)

        # Trigger rebalance with force
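        # force makes rebalance migrate files even when the migration would
        # leave the destination brick with less free space than the source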
        ret, _, _ = rebalance_start(self.mnode, self.volname,
                                    force=True)
        self.assertEqual(ret, 0, "Failed to start rebalance on the volume %s"
                         % self.volname)

        # Wait for rebalance to complete
        ret = wait_for_rebalance_to_complete(self.mnode, self.volname,
                                             timeout=1200)
        self.assertTrue(ret, "Rebalance is not yet complete on the volume "
                             "%s" % self.volname)
        g.log.info("Rebalance successfully completed")

        # Wait for heal to complete
        ret = monitor_heal_completion(self.mnode, self.volname)
        self.assertTrue(ret, "heal has not yet completed")
        g.log.info("Self heal completed")

        # Check for data loss by comparing arequal before and after ops
        arequal_checksum_after = collect_mounts_arequal(self.mounts[0])
        self.assertEqual(arequal_checksum_before, arequal_checksum_after,
                         "arequal checksum is NOT MATCHNG")
        g.log.info("arequal checksum is SAME")