From 967ed15514ee4d9225547461c25c97aea4b34c37 Mon Sep 17 00:00:00 2001
From: Leela Venkaiah G
Date: Wed, 10 Feb 2021 17:18:46 +0530
Subject: [Test] Verify metadata and data self heal

Test: test_metadata_heal_from_shd
Steps:
1. Create, mount and run IO on volume
2. Set `self-heal-daemon` to `off`, bring bricks down cyclically and perform
   metadata operations
3. Set `self-heal-daemon` to `on` and wait for heal completion
4. Validate arequal checksum on backend bricks

Test: test_metadata_heal_from_heal_cmd
Steps:
1. Create, mount and run IO on volume
2. Set `self-heal-daemon` to `off`, bring bricks down cyclically and perform
   metadata operations
3. Set `self-heal-daemon` to `on`, invoke `gluster vol heal`
4. Validate arequal checksum on backend bricks

Test: test_data_heal_from_shd
Steps:
1. Create, mount and run IO on volume
2. Set `self-heal-daemon` to `off`, bring bricks down cyclically and perform
   data operations
3. Set `self-heal-daemon` to `on` and wait for heal completion
4. Validate arequal checksum on backend bricks

Change-Id: I24411d964fb6252ae5b621c6569e791b54dcc311
Signed-off-by: Leela Venkaiah G
---
 .../arbiter/test_verify_metadata_and_data_heal.py | 297 +++++++++++++++++++++
 1 file changed, 297 insertions(+)
 create mode 100644 tests/functional/arbiter/test_verify_metadata_and_data_heal.py

diff --git a/tests/functional/arbiter/test_verify_metadata_and_data_heal.py b/tests/functional/arbiter/test_verify_metadata_and_data_heal.py
new file mode 100644
index 000000000..d48e36e73
--- /dev/null
+++ b/tests/functional/arbiter/test_verify_metadata_and_data_heal.py
@@ -0,0 +1,297 @@
+# Copyright (C) 2021 Red Hat, Inc.
+#
+# This program is free software; you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+# the Free Software Foundation; either version 2 of the License, or
+# any later version.
+#
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+# GNU General Public License for more details.
+#
+# You should have received a copy of the GNU General Public License along
+# with this program; if not, write to the Free Software Foundation, Inc.,
+# 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+
+from glusto.core import Glusto as g
+
+from glustolibs.gluster.brick_libs import (bring_bricks_offline,
+                                           bring_bricks_online,
+                                           get_online_bricks_list)
+from glustolibs.gluster.exceptions import ExecutionError
+from glustolibs.gluster.gluster_base_class import GlusterBaseClass, runs_on
+from glustolibs.gluster.glusterdir import mkdir
+from glustolibs.gluster.heal_libs import (
+    is_heal_complete, is_volume_in_split_brain, monitor_heal_completion,
+    wait_for_self_heal_daemons_to_be_online)
+from glustolibs.gluster.heal_ops import (disable_self_heal_daemon,
+                                         enable_self_heal_daemon, trigger_heal)
+from glustolibs.gluster.lib_utils import (add_user, collect_bricks_arequal,
+                                          del_user, group_add, group_del)
+from glustolibs.gluster.volume_libs import get_subvols
+from glustolibs.io.utils import list_all_files_and_dirs_mounts
+
+
+@runs_on([['arbiter', 'replicated'], ['glusterfs']])
+class TestMetadataAndDataHeal(GlusterBaseClass):
+    '''Description: Verify shd heals files after performing metadata and data
+    operations while a brick was down'''
+    def _dac_helper(self, host, option):
+        '''Helper for creating and deleting users and groups'''
+
+        # Permission/ownership changes are required only for `test_metadata*`
+        # tests; they use dedicated group and user names
+        if 'metadata' not in self.test_dir:
+            return
+
+        if option == 'create':
+            # Groups
+            for group in ('qa_func', 'qa_system'):
+                if not group_add(host, group):
+                    raise ExecutionError('Unable to {} group {} on '
+                                         '{}'.format(option, group, host))
+
+            # User
+            if not add_user(host, 'qa_all', group='qa_func'):
+                raise ExecutionError('Unable to {} user {} under {} on '
+                                     '{}'.format(option, 'qa_all', 'qa_func',
+                                                 host))
+        elif option == 'delete':
+            # Groups
+            for group in ('qa_func', 'qa_system'):
+                if not group_del(host, group):
+                    raise ExecutionError('Unable to {} group {} on '
+                                         '{}'.format(option, group, host))
+
+            # User
+            if not del_user(host, 'qa_all'):
+                raise ExecutionError('Unable to {} user on {}'.format(
+                    option, host))
+
+    def setUp(self):
+        self.get_super_method(self, 'setUp')()
+
+        # A single mount is enough for all the tests
+        self.mounts = self.mounts[0:1]
+        self.client = self.mounts[0].client_system
+
+        # Use testcase name as test directory
+        self.test_dir = self.id().split('.')[-1]
+        self.fqpath = self.mounts[0].mountpoint + '/' + self.test_dir
+
+        if not self.setup_volume_and_mount_volume(mounts=self.mounts):
+            raise ExecutionError('Failed to setup and mount '
+                                 '{}'.format(self.volname))
+
+        # Create group and user names required for the test
+        self._dac_helper(host=self.client, option='create')
+
+    def tearDown(self):
+        # Delete group and user names created as part of setup
+        self._dac_helper(host=self.client, option='delete')
+
+        if not self.unmount_volume_and_cleanup_volume(mounts=self.mounts):
+            raise ExecutionError('Not able to unmount and cleanup '
+                                 '{}'.format(self.volname))
+
+        self.get_super_method(self, 'tearDown')()
+
+    def _perform_io_and_disable_self_heal(self):
+        '''Steps common to all tests: perform IO and disable self heal'''
+        ret = mkdir(self.client, self.fqpath)
+        self.assertTrue(ret,
+                        'Directory creation failed on {}'.format(self.client))
+        self.io_cmd = 'cat /dev/urandom | tr -dc [:space:][:print:] | head -c '
+        # Create 6 dirs, 6 files and 6 files in each subdir with 10K data
+        file_io = ('''cd {0}; for i in `seq 1 6`;
+            do mkdir dir.$i; {1} 10K > file.$i;
+            for j in `seq 1 6`;
+            do {1} 10K > dir.$i/file.$j; done;
+            done;'''.format(self.fqpath, self.io_cmd))
+        ret, _, err = g.run(self.client, file_io)
+        self.assertEqual(ret, 0,
+                         'Unable to create directories and data files')
+        self.assertFalse(err, '{0} failed with {1}'.format(file_io, err))
+
+        # Disable self heal daemon
+        self.assertTrue(disable_self_heal_daemon(self.mnode, self.volname),
+                        'Disabling self-heal-daemon failed')
+
+    def _perform_brick_ops_and_enable_self_heal(self, op_type):
+        '''Steps common to all tests: bring bricks offline cyclically, perform
+        metadata/data operations and enable self heal'''
+        # First brick in the subvol will always be online and used for self
+        # heal, so make keys match brick index
+        self.op_cmd = {
+            # Metadata operations (owner and permission changes)
+            'metadata': {
+                2:
+                '''cd {0}; for i in `seq 1 3`; do chown -R qa_all:qa_func \
+                dir.$i file.$i; chmod -R 555 dir.$i file.$i; done;''',
+                3:
+                '''cd {0}; for i in `seq 1 3`; do chown -R :qa_system \
+                dir.$i file.$i; chmod -R 777 dir.$i file.$i; done;''',
+                # 4 - Will be used for final data consistency check
+                4:
+                '''cd {0}; for i in `seq 1 6`; do chown -R qa_all:qa_system \
+                dir.$i file.$i; chmod -R 777 dir.$i file.$i; done;''',
+            },
+            # Data operations (append data to the files)
+            'data': {
+                2:
+                '''cd {0}; for i in `seq 1 3`;
+                do {1} 2K >> file.$i;
+                for j in `seq 1 3`;
+                do {1} 2K >> dir.$i/file.$j; done;
+                done;''',
+                3:
+                '''cd {0}; for i in `seq 1 3`;
+                do {1} 3K >> file.$i;
+                for j in `seq 1 3`;
+                do {1} 3K >> dir.$i/file.$j; done;
+                done;''',
+                # 4 - Will be used for final data consistency check
+                4:
+                '''cd {0}; for i in `seq 1 6`;
+                do {1} 4K >> file.$i;
+                for j in `seq 1 6`;
+                do {1} 4K >> dir.$i/file.$j; done;
+                done;''',
+            },
+        }
+        bricks = get_online_bricks_list(self.mnode, self.volname)
+        self.assertIsNotNone(bricks,
+                             'Not able to get list of bricks in the volume')
+
+        # Keep the first brick always online and start operations from the
+        # second brick
+        for index, brick in enumerate(bricks[1:], start=2):
+
+            # Bring brick offline
+            ret = bring_bricks_offline(self.volname, brick)
+            self.assertTrue(ret, 'Unable to bring {} offline'.format(brick))
+
+            # Perform metadata/data operation
+            cmd = self.op_cmd[op_type][index].format(self.fqpath, self.io_cmd)
+            ret, _, err = g.run(self.client, cmd)
+            self.assertEqual(ret, 0, '{0} failed with {1}'.format(cmd, err))
+            self.assertFalse(err, '{0} failed with {1}'.format(cmd, err))
+
+            # Bring brick online
+            ret = bring_bricks_online(
+                self.mnode,
+                self.volname,
+                brick,
+                bring_bricks_online_methods='volume_start_force')
+            self.assertTrue(ret, 'Unable to bring {} online'.format(brick))
+
+        # Assert metadata/data operations resulted in pending heals
+        self.assertFalse(is_heal_complete(self.mnode, self.volname))
+
+        # Enable self heal daemon and wait for it to be online
+        self.assertTrue(enable_self_heal_daemon(self.mnode, self.volname),
+                        'Enabling self heal daemon failed')
+        self.assertTrue(
+            wait_for_self_heal_daemons_to_be_online(self.mnode, self.volname),
+            'Not all self heal daemons are online')
+
+    def _validate_heal_completion_and_arequal(self, op_type):
+        '''Steps common to all tests: validate heal completion, verify arequal
+        checksums, perform additional IO and verify arequal again'''
+
+        # Validate heal completion
+        self.assertTrue(monitor_heal_completion(self.mnode, self.volname),
+                        'Self heal is not completed within timeout')
+        self.assertFalse(
+            is_volume_in_split_brain(self.mnode, self.volname),
+            'Volume is in split brain even after heal completion')
+
+        subvols = get_subvols(self.mnode, self.volname)['volume_subvols']
+        self.assertTrue(subvols, 'Not able to get list of subvols')
+        arbiter = self.volume_type.find('arbiter') >= 0
+        stop = len(subvols[0]) - 1 if arbiter else len(subvols[0])
+
+        # Validate arequal
+        self._validate_arequal_and_perform_lookup(subvols, stop)
+
+        # Perform some additional metadata/data operations
+        cmd = self.op_cmd[op_type][4].format(self.fqpath, self.io_cmd)
+        ret, _, err = g.run(self.client, cmd)
+        self.assertEqual(ret, 0, '{0} failed with {1}'.format(cmd, err))
+        self.assertFalse(err, '{0} failed with {1}'.format(cmd, err))
+
+        # Validate arequal after additional operations
+        self._validate_arequal_and_perform_lookup(subvols, stop)
+
+    def _validate_arequal_and_perform_lookup(self, subvols, stop):
+        '''Steps common to all tests: validate arequal checksums across brick
+        backends and perform a lookup of all files from the mount'''
+        for subvol in subvols:
+            ret, arequal = collect_bricks_arequal(subvol[0:stop])
+            self.assertTrue(
+                ret, 'Unable to get `arequal` checksum on '
+                '{}'.format(subvol[0:stop]))
+            self.assertEqual(
+                len(set(arequal)), 1, 'Mismatch of `arequal` '
+                'checksum among {}'.format(subvol[0:stop]))
+
+        # Perform a lookup of all files and directories on mounts
+        self.assertTrue(list_all_files_and_dirs_mounts(self.mounts),
+                        'Failed to list all files and dirs from mount')
+
+    def test_metadata_heal_from_shd(self):
+        '''Description: Verify files heal after switching on `self-heal-daemon`
+        when metadata operations are performed while a brick was down
+
+        Steps:
+        1. Create, mount and run IO on volume
+        2. Set `self-heal-daemon` to `off`, bring bricks down cyclically and
+           perform metadata operations
+        3. Set `self-heal-daemon` to `on` and wait for heal completion
+        4. Validate arequal checksum on backend bricks
+        '''
+        op_type = 'metadata'
+        self._perform_io_and_disable_self_heal()
+        self._perform_brick_ops_and_enable_self_heal(op_type=op_type)
+        self._validate_heal_completion_and_arequal(op_type=op_type)
+        g.log.info('Pass: Verification of metadata heal after switching on '
+                   '`self heal daemon` is complete')
+
+    def test_metadata_heal_from_heal_cmd(self):
+        '''Description: Verify files heal after triggering heal command when
+        metadata operations are performed while a brick was down
+
+        Steps:
+        1. Create, mount and run IO on volume
+        2. Set `self-heal-daemon` to `off`, bring bricks down cyclically and
+           perform metadata operations
+        3. Set `self-heal-daemon` to `on`, invoke `gluster vol heal`
+        4. Validate arequal checksum on backend bricks
+        '''
+        op_type = 'metadata'
+        self._perform_io_and_disable_self_heal()
+        self._perform_brick_ops_and_enable_self_heal(op_type=op_type)
+
+        # Invoke `gluster volume heal` to trigger index heal
+        self.assertTrue(trigger_heal(self.mnode, self.volname),
+                        'Unable to trigger index heal on the volume')
+
+        self._validate_heal_completion_and_arequal(op_type=op_type)
+        g.log.info('Pass: Verification of metadata heal via the heal command '
+                   'is complete')
+
+    def test_data_heal_from_shd(self):
+        '''Description: Verify files heal after switching on `self-heal-daemon`
+        when data operations are performed while a brick was down
+
+        Steps:
+        1. Create, mount and run IO on volume
+        2. Set `self-heal-daemon` to `off`, bring bricks down cyclically and
+           perform data operations
+        3. Set `self-heal-daemon` to `on` and wait for heal completion
+        4. Validate arequal checksum on backend bricks
+        '''
+        op_type = 'data'
+        self._perform_io_and_disable_self_heal()
+        self._perform_brick_ops_and_enable_self_heal(op_type=op_type)
+        self._validate_heal_completion_and_arequal(op_type=op_type)
+        g.log.info('Pass: Verification of data heal after switching on '
+                   '`self heal daemon` is complete')
--
cgit
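
Note (reviewer illustration, not part of the patch): in
_perform_brick_ops_and_enable_self_heal the op_cmd dictionary is keyed by the
brick's position in the subvol because the first brick is never brought
offline and serves as the heal source, and key 4 is reserved for the post-heal
consistency run. A minimal standalone sketch of that keying, using
hypothetical brick names purely for illustration:

    # Illustration only: hypothetical brick names, no gluster calls involved;
    # mirrors the loop in _perform_brick_ops_and_enable_self_heal.
    bricks = ['node1:/bricks/brick1', 'node2:/bricks/brick2',
              'node3:/bricks/brick3']
    op_cmd_keys = {2: 'ops run while the second brick is offline',
                   3: 'ops run while the third brick is offline'}
    # bricks[0] stays online throughout, so iteration starts at the second
    # brick and each dictionary key matches that brick's 1-based position.
    for index, brick in enumerate(bricks[1:], start=2):
        print(index, brick, '->', op_cmd_keys[index])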