From 642af0a8889aafcf1a2fc0e5ad4bd73eb4921f8d Mon Sep 17 00:00:00 2001 From: "a.y.volkov" Date: Mon, 1 Aug 2022 09:16:36 +0300 Subject: [PATCH] Add test for network failover Signed-off-by: a.y.volkov --- pytest_tests/helpers/iptables_helper.py | 16 +++ pytest_tests/helpers/ssh_helper.py | 24 +++- pytest_tests/pytest.ini | 4 +- pytest_tests/testsuites/conftest.py | 8 ++ pytest_tests/testsuites/failovers/__init__.py | 0 .../testsuites/failovers/failover_utils.py | 49 ++++++++ .../failovers/test_failover_network.py | 118 ++++++++++++++++++ .../failovers/test_failover_storage.py | 84 ++++--------- .../lib/python_keywords/storage_policy.py | 11 +- 9 files changed, 247 insertions(+), 67 deletions(-) create mode 100644 pytest_tests/helpers/iptables_helper.py create mode 100644 pytest_tests/testsuites/failovers/__init__.py create mode 100644 pytest_tests/testsuites/failovers/failover_utils.py create mode 100644 pytest_tests/testsuites/failovers/test_failover_network.py diff --git a/pytest_tests/helpers/iptables_helper.py b/pytest_tests/helpers/iptables_helper.py new file mode 100644 index 00000000..3fb4c2bd --- /dev/null +++ b/pytest_tests/helpers/iptables_helper.py @@ -0,0 +1,16 @@ +from ssh_helper import HostClient + + +class IpTablesHelper: + + @staticmethod + def drop_input_traffic_to_port(client: HostClient, ports: list[str]): + for port in ports: + cmd_output = client.exec(cmd=f'iptables -A INPUT -p tcp --dport {port} -j DROP') + assert cmd_output.rc == 0 + + @staticmethod + def restore_input_traffic_to_port(client: HostClient, ports: list[str]): + for port in ports: + cmd_output = client.exec(cmd=f'iptables -D INPUT -p tcp --dport {port} -j DROP') + assert cmd_output.rc == 0 diff --git a/pytest_tests/helpers/ssh_helper.py b/pytest_tests/helpers/ssh_helper.py index dad4c943..ffc06228 100644 --- a/pytest_tests/helpers/ssh_helper.py +++ b/pytest_tests/helpers/ssh_helper.py @@ -66,15 +66,19 @@ class HostClient: TIMEOUT_RESTORE_CONNECTION = 10, 24 def __init__(self, ip: str, login: str, password: Optional[str] = None, - private_key_path: Optional[str] = None, init_ssh_client=True) -> None: + private_key_path: Optional[str] = None, private_key_passphrase: Optional[str] = None, + init_ssh_client=True) -> None: self.ip = ip self.login = login self.password = password self.private_key_path = private_key_path + self.private_key_passphrase = private_key_passphrase if init_ssh_client: self.create_connection(self.SSH_CONNECTION_ATTEMPTS) def exec(self, cmd: str, verify=True, timeout=90) -> SSHCommand: + if self.login != 'root': + cmd = f'sudo {cmd}' cmd_result = self._inner_exec(cmd, timeout) if verify: assert cmd_result.rc == 0, f'Non zero rc from command: "{cmd}"' @@ -110,6 +114,15 @@ class HostClient: self.login, self.password = keep_user, keep_password self.create_connection() + @contextmanager + def create_ssh_connection(self) -> 'SSHClient': + if not self.ssh_client: + self.create_connection() + try: + yield self.ssh_client + finally: + self.drop() + @allure.step('Restore connection') def restore_ssh_connection(self): retry_time, retry_count = self.TIMEOUT_RESTORE_CONNECTION @@ -151,13 +164,13 @@ class HostClient: try: if self.private_key_path: logging.info( - f"Trying to connect to host {self.ip} using SSH key " + f"Trying to connect to host {self.ip} as {self.login} using SSH key " f"{self.private_key_path} (attempt {attempt})" ) self.ssh_client.connect( hostname=self.ip, username=self.login, - pkey=RSAKey.from_private_key_file(self.private_key_path, self.password), + pkey=RSAKey.from_private_key_file(self.private_key_path, self.private_key_passphrase), timeout=self.CONNECTION_TIMEOUT ) else: @@ -175,7 +188,7 @@ class HostClient: except AuthenticationException as auth_err: logging.error(f'Host: {self.ip}. {auth_err}') - + self.drop() raise auth_err except ( @@ -186,6 +199,7 @@ class HostClient: OSError ) as ssh_err: exc_err = ssh_err + self.drop() logging.error(f'Host: {self.ip}, connection error. {exc_err}') raise HostIsNotAvailable(self.ip, exc_err) @@ -197,8 +211,6 @@ class HostClient: def _inner_exec(self, cmd: str, timeout: int) -> SSHCommand: if not self.ssh_client: self.create_connection() - if self.login != "root": - cmd = f"sudo {cmd}" for _ in range(self.SSH_CONNECTION_ATTEMPTS): try: _, stdout, stderr = self.ssh_client.exec_command(cmd, timeout=timeout) diff --git a/pytest_tests/pytest.ini b/pytest_tests/pytest.ini index d6a73ad1..81231bc7 100644 --- a/pytest_tests/pytest.ini +++ b/pytest_tests/pytest.ini @@ -18,4 +18,6 @@ markers = long: long tests (with long execution time) node_mgmt: neofs control commands acl: tests for basic and extended ACL - failover: tests for system recovery after a failure \ No newline at end of file + failover: tests for system recovery after a failure + failover_panic: tests for system recovery after panic reboot of a node + failover_net: tests for network failure diff --git a/pytest_tests/testsuites/conftest.py b/pytest_tests/testsuites/conftest.py index 9a7b410f..ca3cb5b1 100644 --- a/pytest_tests/testsuites/conftest.py +++ b/pytest_tests/testsuites/conftest.py @@ -23,6 +23,13 @@ deco.keyword = robot_keyword_adapter logger = logging.getLogger('NeoLogger') +@pytest.fixture(scope='session') +def free_storage_check(): + if os.getenv('FREE_STORAGE', default='False').lower() not in ('true', '1'): + pytest.skip('Test only works on SberCloud infrastructure') + yield + + @pytest.fixture(scope='session', autouse=True) @allure.title('Check binary versions') def check_binary_versions(request): @@ -81,6 +88,7 @@ def init_wallet_with_address(prepare_tmp_dir): def prepare_wallet_and_deposit(init_wallet_with_address): wallet, addr, _ = init_wallet_with_address logger.info(f'Init wallet: {wallet},\naddr: {addr}') + allure.attach.file(wallet, os.path.basename(wallet), allure.attachment_type.JSON) if not FREE_STORAGE: deposit = 30 diff --git a/pytest_tests/testsuites/failovers/__init__.py b/pytest_tests/testsuites/failovers/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/pytest_tests/testsuites/failovers/failover_utils.py b/pytest_tests/testsuites/failovers/failover_utils.py new file mode 100644 index 00000000..90d523a8 --- /dev/null +++ b/pytest_tests/testsuites/failovers/failover_utils.py @@ -0,0 +1,49 @@ +import logging +from time import sleep +from typing import Optional + +import allure + +from common import NEOFS_NETMAP_DICT +from python_keywords.node_management import node_healthcheck +from storage_policy import get_nodes_with_object + +logger = logging.getLogger('NeoLogger') + + +@allure.step('Wait for object replication') +def wait_object_replication_on_nodes(wallet: str, cid: str, oid: str, expected_copies: int, + excluded_nodes: Optional[list[str]] = None) -> list[str]: + excluded_nodes = excluded_nodes or [] + sleep_interval, attempts = 10, 18 + nodes = [] + for __attempt in range(attempts): + nodes = get_nodes_with_object(wallet, cid, oid, skip_nodes=excluded_nodes) + if len(nodes) == expected_copies: + return nodes + sleep(sleep_interval) + raise AssertionError(f'Expected {expected_copies} copies of object, but found {len(nodes)}. ' + f'Waiting time {sleep_interval * attempts}') + + +@allure.step('Wait for storage node returned to cluster') +def wait_all_storage_node_returned(): + sleep_interval, attempts = 10, 12 + for __attempt in range(attempts): + if is_all_storage_node_returned(): + return + sleep(sleep_interval) + raise AssertionError('Storage node(s) is broken') + + +def is_all_storage_node_returned() -> bool: + with allure.step('Run health check for all storage nodes'): + for node_name in NEOFS_NETMAP_DICT.keys(): + try: + health_check = node_healthcheck(node_name) + except Exception as err: + logger.warning(f'Node healthcheck fails with error {err}') + return False + if health_check.health_status != 'READY' or health_check.network_status != 'ONLINE': + return False + return True diff --git a/pytest_tests/testsuites/failovers/test_failover_network.py b/pytest_tests/testsuites/failovers/test_failover_network.py new file mode 100644 index 00000000..6790135b --- /dev/null +++ b/pytest_tests/testsuites/failovers/test_failover_network.py @@ -0,0 +1,118 @@ +import logging +from random import choices +from time import sleep + +import allure +import pytest + +from common import STORAGE_NODE_SSH_PRIVATE_KEY_PATH, STORAGE_NODE_SSH_USER, STORAGE_NODE_SSH_PASSWORD, \ + NEOFS_NETMAP_DICT +from iptables_helper import IpTablesHelper +from python_keywords.container import create_container +from python_keywords.neofs_verbs import get_object, put_object +from python_keywords.utility_keywords import generate_file, get_file_hash +from ssh_helper import HostClient +from wellknown_acl import PUBLIC_ACL +from .failover_utils import wait_all_storage_node_returned, wait_object_replication_on_nodes + +logger = logging.getLogger('NeoLogger') +STORAGE_NODE_COMMUNICATION_PORT = '8080' +STORAGE_NODE_COMMUNICATION_PORT_TLS = '8082' +PORTS_TO_BLOCK = [STORAGE_NODE_COMMUNICATION_PORT, STORAGE_NODE_COMMUNICATION_PORT_TLS] +blocked_hosts = [] + + +@pytest.fixture(autouse=True) +@allure.step('Install iptables if needed') +def install_iptables_if_needed(): + check_command = 'iptables --version' + install_command = 'apt-get --yes install iptables' + for node_config in NEOFS_NETMAP_DICT.values(): + host = node_config.get('rpc').split(':')[0] + client = HostClient(ip=host, login=STORAGE_NODE_SSH_USER, + password=STORAGE_NODE_SSH_PASSWORD, + private_key_path=STORAGE_NODE_SSH_PRIVATE_KEY_PATH) + with client.create_ssh_connection(): + try: + client.exec(check_command) + except AssertionError as err: + logger.info(f'Command {check_command} fails with error {err}') + client.exec(install_command) + client.exec(check_command) + + +@pytest.fixture(autouse=True) +@allure.step('Restore network') +def restore_network(): + yield + + not_empty = len(blocked_hosts) != 0 + for host in list(blocked_hosts): + with allure.step(f'Start storage node {host}'): + client = HostClient(ip=host, login=STORAGE_NODE_SSH_USER, + password=STORAGE_NODE_SSH_PASSWORD, + private_key_path=STORAGE_NODE_SSH_PRIVATE_KEY_PATH) + with client.create_ssh_connection(): + IpTablesHelper.restore_input_traffic_to_port(client, PORTS_TO_BLOCK) + blocked_hosts.remove(host) + if not_empty: + wait_all_storage_node_returned() + + +@allure.title('Block Storage node traffic') +@pytest.mark.failover +@pytest.mark.failover_net +def test_block_storage_node_traffic(prepare_wallet_and_deposit, free_storage_check): + """ + Block storage nodes traffic using iptables and wait for replication for objects. + """ + wallet = prepare_wallet_and_deposit + placement_rule = 'REP 2 IN X CBF 2 SELECT 2 FROM * AS X' + excluded_nodes = [] + wakeup_node_timeout = 10 # timeout to let nodes detect that traffic has blocked + nodes_to_block_count = 2 + + source_file_path = generate_file() + cid = create_container(wallet, rule=placement_rule, basic_acl=PUBLIC_ACL) + oid = put_object(wallet, source_file_path, cid) + nodes = wait_object_replication_on_nodes(wallet, cid, oid, 2) + + logger.info(f'Nodes are {nodes}') + random_nodes = [(node, node.split(':')[0]) for node in nodes] + if nodes_to_block_count > len(nodes): + random_nodes = [(node, node.split(':')[0]) for node in choices(nodes, k=2)] + + for random_node, random_node_ip in random_nodes: + client = HostClient(ip=random_node_ip, login=STORAGE_NODE_SSH_USER, + password=STORAGE_NODE_SSH_PASSWORD, + private_key_path=STORAGE_NODE_SSH_PRIVATE_KEY_PATH) + + with allure.step(f'Block incoming traffic for node {random_node} on port {PORTS_TO_BLOCK}'): + with client.create_ssh_connection(): + IpTablesHelper.drop_input_traffic_to_port(client, PORTS_TO_BLOCK) + blocked_hosts.append(random_node_ip) + excluded_nodes.append(random_node) + sleep(wakeup_node_timeout) + + new_nodes = wait_object_replication_on_nodes(wallet, cid, oid, 2, excluded_nodes=excluded_nodes) + + assert random_node not in new_nodes + + got_file_path = get_object(wallet, cid, oid, endpoint=new_nodes[0]) + assert get_file_hash(source_file_path) == get_file_hash(got_file_path) + + for random_node, random_node_ip in random_nodes: + client = HostClient(ip=random_node_ip, login=STORAGE_NODE_SSH_USER, + password=STORAGE_NODE_SSH_PASSWORD, + private_key_path=STORAGE_NODE_SSH_PRIVATE_KEY_PATH) + + with allure.step(f'Unblock incoming traffic for node {random_node} on port {PORTS_TO_BLOCK}'): + with client.create_ssh_connection(): + IpTablesHelper.restore_input_traffic_to_port(client, PORTS_TO_BLOCK) + blocked_hosts.remove(random_node_ip) + sleep(wakeup_node_timeout) + + new_nodes = wait_object_replication_on_nodes(wallet, cid, oid, 2) + + got_file_path = get_object(wallet, cid, oid, endpoint=new_nodes[0]) + assert get_file_hash(source_file_path) == get_file_hash(got_file_path) diff --git a/pytest_tests/testsuites/failovers/test_failover_storage.py b/pytest_tests/testsuites/failovers/test_failover_storage.py index 97d677de..50458e87 100644 --- a/pytest_tests/testsuites/failovers/test_failover_storage.py +++ b/pytest_tests/testsuites/failovers/test_failover_storage.py @@ -1,91 +1,54 @@ import logging import os -from time import sleep import allure import pytest -from common import NEOFS_NETMAP_DICT + +from common import STORAGE_NODE_SSH_PRIVATE_KEY_PATH, STORAGE_NODE_SSH_USER, STORAGE_NODE_SSH_PASSWORD from python_keywords.container import create_container from python_keywords.neofs_verbs import get_object, put_object -from python_keywords.node_management import node_healthcheck from python_keywords.utility_keywords import generate_file, get_file_hash from sbercloud_helper import SberCloud from ssh_helper import HostClient, HostIsNotAvailable -from storage_policy import get_nodes_with_object from wellknown_acl import PUBLIC_ACL +from .failover_utils import wait_all_storage_node_returned, wait_object_replication_on_nodes logger = logging.getLogger('NeoLogger') stopped_hosts = [] -@pytest.fixture(scope='session') -def free_storage_check(): - if os.getenv('FREE_STORAGE', default='False').lower() not in ('true', '1'): - pytest.skip('Test only works on SberCloud infrastructure') - yield - - @pytest.fixture(scope='session') def sbercloud_client(): with allure.step('Connect to SberCloud'): try: yield SberCloud(f'{os.getcwd()}/configuration/sbercloud.yaml') - except Exception: - pytest.fail('SberCloud infrastructure not available') + except Exception as err: + pytest.fail(f'SberCloud infrastructure not available. Error\n{err}') @pytest.fixture(autouse=True) +@allure.step('Return all storage nodes') def return_all_storage_nodes_fixture(sbercloud_client): yield return_all_storage_nodes(sbercloud_client) def panic_reboot_host(ip: str = None): - ssh = HostClient(ip=ip, login="root", private_key_path=f"{os.getcwd()}/configuration/id_rsa") - ssh.exec('echo 1 > /proc/sys/kernel/sysrq') + ssh = HostClient(ip=ip, login=STORAGE_NODE_SSH_USER, + password=STORAGE_NODE_SSH_PASSWORD, + private_key_path=STORAGE_NODE_SSH_PRIVATE_KEY_PATH) + ssh.exec('sudo sh -c "echo 1 > /proc/sys/kernel/sysrq"') with pytest.raises(HostIsNotAvailable): - ssh.exec('echo b > /proc/sysrq-trigger', timeout=1) + ssh.exec('sudo sh -c "echo b > /proc/sysrq-trigger"', timeout=1) def return_all_storage_nodes(sbercloud_client: SberCloud): - for host in stopped_hosts: + for host in list(stopped_hosts): with allure.step(f'Start storage node {host}'): sbercloud_client.start_node(node_ip=host.split(':')[-2]) + stopped_hosts.remove(host) + wait_all_storage_node_returned() - stopped_hosts.clear() - - -def is_all_storage_node_returned() -> bool: - with allure.step('Run health check for all storage nodes'): - for node_name in NEOFS_NETMAP_DICT.keys(): - try: - health_check = node_healthcheck(node_name) - except (AssertionError, HostIsNotAvailable, TimeoutError): - return False - if health_check.health_status != 'READY' or health_check.network_status != 'ONLINE': - return False - return True - - -def wait_all_storage_node_returned(): - sleep_interval, attempts = 10, 12 - for __attempt in range(attempts): - if is_all_storage_node_returned(): - return - sleep(sleep_interval) - raise AssertionError('Storage node(s) is broken') - - -def wait_object_replication(wallet, cid, oid, expected_copies: int, excluded_nodes: [str] = None) -> [str]: - excluded_nodes = excluded_nodes or [] - sleep_interval, attempts = 10, 12 - nodes = [] - for __attempt in range(attempts): - nodes = [node for node in get_nodes_with_object(wallet, cid, oid) if node not in excluded_nodes] - if len(nodes) == expected_copies: - return nodes - sleep(sleep_interval) - raise AssertionError(f'Expected {expected_copies} copies of object, but found {len(nodes)} ') @allure.title('Lost and return nodes') @@ -98,14 +61,14 @@ def test_lost_storage_node(prepare_wallet_and_deposit, sbercloud_client: SberClo source_file_path = generate_file() cid = create_container(wallet, rule=placement_rule, basic_acl=PUBLIC_ACL) oid = put_object(wallet, source_file_path, cid) - nodes = wait_object_replication(wallet, cid, oid, 2) + nodes = wait_object_replication_on_nodes(wallet, cid, oid, 2) new_nodes = [] for node in nodes: stopped_hosts.append(node) with allure.step(f'Stop storage node {node}'): sbercloud_client.stop_node(node_ip=node.split(':')[-2], hard=hard_reboot) - new_nodes = wait_object_replication(wallet, cid, oid, 2, excluded_nodes=[node]) + new_nodes = wait_object_replication_on_nodes(wallet, cid, oid, 2, excluded_nodes=[node]) assert not [node for node in nodes if node in new_nodes] got_file_path = get_object(wallet, cid, oid, endpoint=new_nodes[0]) @@ -114,7 +77,7 @@ def test_lost_storage_node(prepare_wallet_and_deposit, sbercloud_client: SberClo with allure.step(f'Return storage nodes'): return_all_storage_nodes(sbercloud_client) - new_nodes = wait_object_replication(wallet, cid, oid, 2) + new_nodes = wait_object_replication_on_nodes(wallet, cid, oid, 2) got_file_path = get_object(wallet, cid, oid, endpoint=new_nodes[0]) assert get_file_hash(source_file_path) == get_file_hash(got_file_path) @@ -122,6 +85,7 @@ def test_lost_storage_node(prepare_wallet_and_deposit, sbercloud_client: SberClo @allure.title('Panic storage node(s)') @pytest.mark.parametrize('sequence', [True, False]) +@pytest.mark.failover_panic @pytest.mark.failover def test_panic_storage_node(prepare_wallet_and_deposit, free_storage_check, sequence: bool): wallet = prepare_wallet_and_deposit @@ -130,18 +94,22 @@ def test_panic_storage_node(prepare_wallet_and_deposit, free_storage_check, sequ cid = create_container(wallet, rule=placement_rule, basic_acl=PUBLIC_ACL) oid = put_object(wallet, source_file_path, cid) - nodes = wait_object_replication(wallet, cid, oid, 2) + nodes = wait_object_replication_on_nodes(wallet, cid, oid, 2) allure.attach('\n'.join(nodes), 'Current nodes with object', allure.attachment_type.TEXT) for node in nodes: with allure.step(f'Hard reboot host {node} via magic SysRq option'): panic_reboot_host(ip=node.split(':')[-2]) if sequence: - new_nodes = wait_object_replication(wallet, cid, oid, 2, excluded_nodes=[node]) + try: + new_nodes = wait_object_replication_on_nodes(wallet, cid, oid, 2, excluded_nodes=[node]) + except AssertionError: + new_nodes = wait_object_replication_on_nodes(wallet, cid, oid, 2) + allure.attach('\n'.join(new_nodes), f'Nodes with object after {node} fail', - allure.attachment_type.TEXT) + allure.attachment_type.TEXT) if not sequence: - new_nodes = wait_object_replication(wallet, cid, oid, 2, excluded_nodes=nodes) + new_nodes = wait_object_replication_on_nodes(wallet, cid, oid, 2) allure.attach('\n'.join(new_nodes), 'Nodes with object after nodes fail', allure.attachment_type.TEXT) got_file_path = get_object(wallet, cid, oid, endpoint=new_nodes[0]) diff --git a/robot/resources/lib/python_keywords/storage_policy.py b/robot/resources/lib/python_keywords/storage_policy.py index 49f143b0..70324c74 100644 --- a/robot/resources/lib/python_keywords/storage_policy.py +++ b/robot/resources/lib/python_keywords/storage_policy.py @@ -5,6 +5,8 @@ that storage policies are kept. """ +from typing import Optional + from robot.api import logger from robot.api.deco import keyword @@ -86,7 +88,7 @@ def get_complex_object_copies(wallet: str, cid: str, oid: str): @keyword('Get Nodes With Object') -def get_nodes_with_object(wallet: str, cid: str, oid: str): +def get_nodes_with_object(wallet: str, cid: str, oid: str, skip_nodes: Optional[list[str]] = None) -> list[str]: """ The function returns list of nodes which store the given object. @@ -95,11 +97,16 @@ def get_nodes_with_object(wallet: str, cid: str, oid: str): we request the nodes cid (str): ID of the container which store the object oid (str): object ID + skip_nodes (list): list of nodes that should be excluded from check Returns: (list): nodes which store the object """ + nodes_to_search = NEOFS_NETMAP + if skip_nodes: + nodes_to_search = [node for node in NEOFS_NETMAP if node not in skip_nodes] + nodes_list = [] - for node in NEOFS_NETMAP: + for node in nodes_to_search: try: res = neofs_verbs.head_object(wallet, cid, oid, endpoint=node,