From f97bfed183c077cfb966c147f625e401c91b2d38 Mon Sep 17 00:00:00 2001 From: "a.y.volkov" Date: Wed, 3 Aug 2022 18:20:50 +0300 Subject: [PATCH] Add test for adding node to cluster Signed-off-by: a.y.volkov --- pytest_tests/pytest.ini | 1 + pytest_tests/testsuites/conftest.py | 57 +++++-- .../failovers/test_failover_network.py | 6 +- .../failovers/test_failover_storage.py | 5 +- .../network/test_node_management.py | 149 ++++++++++++++++-- .../lib/python_keywords/cli_helpers.py | 6 +- robot/resources/lib/python_keywords/epoch.py | 15 +- .../lib/python_keywords}/failover_utils.py | 0 .../lib/python_keywords/node_management.py | 9 +- 9 files changed, 218 insertions(+), 30 deletions(-) rename {pytest_tests/testsuites/failovers => robot/resources/lib/python_keywords}/failover_utils.py (100%) diff --git a/pytest_tests/pytest.ini b/pytest_tests/pytest.ini index 81231bc7..b154f9db 100644 --- a/pytest_tests/pytest.ini +++ b/pytest_tests/pytest.ini @@ -21,3 +21,4 @@ markers = failover: tests for system recovery after a failure failover_panic: tests for system recovery after panic reboot of a node failover_net: tests for network failure + add_nodes: add nodes to cluster diff --git a/pytest_tests/testsuites/conftest.py b/pytest_tests/testsuites/conftest.py index 6dfbb896..15ea04ca 100644 --- a/pytest_tests/testsuites/conftest.py +++ b/pytest_tests/testsuites/conftest.py @@ -11,7 +11,7 @@ import wallet from cli_helpers import _cmd_run from common import ASSETS_DIR, FREE_STORAGE, MAINNET_WALLET_PATH, NEOFS_NETMAP_DICT from payment_neogo import neofs_deposit, transfer_mainnet_gas -from python_keywords.node_management import node_healthcheck +from python_keywords.node_management import node_healthcheck, create_ssh_client from sbercloud_helper import SberCloudConfig @@ -26,24 +26,37 @@ logger = logging.getLogger('NeoLogger') @pytest.fixture(scope='session') def cloud_infrastructure_check(): - cloud_config = SberCloudConfig.from_env() - if not cloud_config.project_id: + if not is_cloud_infrastructure(): pytest.skip('Test only works on SberCloud infrastructure') yield +def is_cloud_infrastructure(): + cloud_config = SberCloudConfig.from_env() + return cloud_config.project_id is not None + + @pytest.fixture(scope='session', autouse=True) @allure.title('Check binary versions') def check_binary_versions(request): environment_dir = request.config.getoption('--alluredir') - + is_cloud = is_cloud_infrastructure() # Collect versions of neo binaries binaries = ['neo-go', 'neofs-cli', 'neofs-authmate'] - env_out = {} - for binary in binaries: - out = _cmd_run(f'{binary} --version') - version = re.search(r'version[:\s]*(.+)', out, re.IGNORECASE) - env_out[binary.upper()] = version.group(1) if version else 'Unknown' + env_out = _get_binaries_version_local(binaries) + + if is_cloud: + binaries = ['neo-go', + 'neofs-adm', + 'neofs-cli', + 'neofs-http-gw', + 'neofs-ir', + 'neofs-lens', + 'neofs-node', + 'neofs-s3-authmate', + 'neofs-s3-gw', + 'neogo-morph-cn'] + env_out = _get_binaries_version_remote(binaries) # Get version of aws binary out = _cmd_run('aws --version') @@ -56,6 +69,32 @@ def check_binary_versions(request): out_file.write(f'{env}={env_value}\n') +def _get_binaries_version_local(binaries: list) -> dict: + env_out = {} + for binary in binaries: + out = _cmd_run(f'{binary} --version') + version = re.search(r'version[:\s]*(.+)', out, re.IGNORECASE) + env_out[binary.upper()] = version.group(1) if version else 'Unknown' + return env_out + + +def _get_binaries_version_remote(binaries: list) -> dict: + env_out = {} + + for node_name in NEOFS_NETMAP_DICT: + with create_ssh_client(node_name) as ssh_client: + for binary in binaries: + out = ssh_client.exec(f'{binary} --version').stdout + version = re.search(r'version[:\s]*(.+)', out, re.IGNORECASE) + version = version.group(1) if version else 'Unknown' + if not env_out.get(binary.upper()): + env_out[binary.upper()] = version + else: + msg = f'Expected binary {binary} versions on node s1 and {node_name} are the same' + assert env_out[binary.upper()] == version, msg + return env_out + + @pytest.fixture(scope='session', autouse=True) @allure.title('Run health check for all storage nodes') def run_health_check(): diff --git a/pytest_tests/testsuites/failovers/test_failover_network.py b/pytest_tests/testsuites/failovers/test_failover_network.py index efcbf86d..fd501f3e 100644 --- a/pytest_tests/testsuites/failovers/test_failover_network.py +++ b/pytest_tests/testsuites/failovers/test_failover_network.py @@ -5,15 +5,15 @@ from time import sleep import allure import pytest -from common import STORAGE_NODE_SSH_PRIVATE_KEY_PATH, STORAGE_NODE_SSH_USER, STORAGE_NODE_SSH_PASSWORD, \ - NEOFS_NETMAP_DICT +from common import (STORAGE_NODE_SSH_PRIVATE_KEY_PATH, STORAGE_NODE_SSH_USER, + STORAGE_NODE_SSH_PASSWORD, NEOFS_NETMAP_DICT) +from failover_utils import wait_all_storage_node_returned, wait_object_replication_on_nodes from iptables_helper import IpTablesHelper from python_keywords.container import create_container from python_keywords.neofs_verbs import get_object, put_object from python_keywords.utility_keywords import generate_file, get_file_hash from ssh_helper import HostClient from wellknown_acl import PUBLIC_ACL -from .failover_utils import wait_all_storage_node_returned, wait_object_replication_on_nodes logger = logging.getLogger('NeoLogger') STORAGE_NODE_COMMUNICATION_PORT = '8080' diff --git a/pytest_tests/testsuites/failovers/test_failover_storage.py b/pytest_tests/testsuites/failovers/test_failover_storage.py index 5ee56b8d..e67546f5 100644 --- a/pytest_tests/testsuites/failovers/test_failover_storage.py +++ b/pytest_tests/testsuites/failovers/test_failover_storage.py @@ -5,13 +5,14 @@ import pytest from common import (STORAGE_NODE_SSH_PRIVATE_KEY_PATH, STORAGE_NODE_SSH_USER, STORAGE_NODE_SSH_PASSWORD) +from failover_utils import wait_all_storage_node_returned, wait_object_replication_on_nodes from python_keywords.container import create_container from python_keywords.neofs_verbs import get_object, put_object from python_keywords.utility_keywords import generate_file, get_file_hash from sbercloud_helper import SberCloud, SberCloudConfig -from ssh_helper import HostClient, HostIsNotAvailable +from ssh_helper import HostClient from wellknown_acl import PUBLIC_ACL -from .failover_utils import wait_all_storage_node_returned, wait_object_replication_on_nodes + logger = logging.getLogger('NeoLogger') stopped_hosts = [] diff --git a/pytest_tests/testsuites/network/test_node_management.py b/pytest_tests/testsuites/network/test_node_management.py index 70d4284c..9e52b5d8 100644 --- a/pytest_tests/testsuites/network/test_node_management.py +++ b/pytest_tests/testsuites/network/test_node_management.py @@ -6,24 +6,22 @@ import allure import pytest from data_formatters import get_wallet_public_key from common import (COMPLEX_OBJ_SIZE, MAINNET_BLOCK_TIME, NEOFS_CONTRACT_CACHE_TIMEOUT, - NEOFS_NETMAP_DICT, STORAGE_WALLET_PASS) + NEOFS_NETMAP_DICT, STORAGE_RPC_ENDPOINT_1, STORAGE_WALLET_PASS) from epoch import tick_epoch from python_keywords.container import create_container, get_container -from python_keywords.neofs_verbs import (delete_object, get_object, - head_object, put_object) -from python_keywords.node_management import (drop_object, get_netmap_snapshot, - get_locode, - node_healthcheck, - node_set_status, node_shard_list, - node_shard_set_mode, - start_nodes_remote, - stop_nodes_remote) +from python_keywords.failover_utils import wait_object_replication_on_nodes +from python_keywords.neofs_verbs import delete_object, get_object, head_object, put_object +from python_keywords.node_management import (create_ssh_client, drop_object, get_netmap_snapshot, + get_locode, node_healthcheck, node_set_status, + node_shard_list, node_shard_set_mode, + start_nodes_remote, stop_nodes_remote) from storage_policy import get_nodes_with_object, get_simple_object_copies from utility import placement_policy_from_container, robot_time_to_int, wait_for_gc_pass_on_storage_nodes from utility_keywords import generate_file from wellknown_acl import PUBLIC_ACL logger = logging.getLogger('NeoLogger') +check_nodes = [] @pytest.fixture @@ -72,6 +70,137 @@ def after_run_set_all_nodes_online(): logger.error(f"Node status change fails with error:\n{err}") +def wait_for_service_started(ssh_client, service_name: str): + expected_state = 'active (running)' + for __attempt in range(10): + output = ssh_client.exec(f'systemctl status {service_name}') + if expected_state in output.stdout: + return + sleep(3) + raise AssertionError(f'Service {service_name} is not in {expected_state} state') + + +@pytest.fixture +def return_nodes_after_test_run(): + yield + return_nodes() + + +def cleanup_node(node_to_cleanup, alive_node): + exclude_node_from_network_map(node_to_cleanup, alive_node) + + with create_ssh_client(node_to_cleanup) as ssh_client: + ssh_client.exec(f'systemctl stop neofs-storage.service') + ssh_client.exec('rm -rf /srv/neofs/*') + sleep(robot_time_to_int(MAINNET_BLOCK_TIME)) + + +@allure.step('Return node to cluster') +def return_nodes(alive_node: str = None): + for node in list(check_nodes): + with create_ssh_client(node) as ssh_client: + ssh_client.exec(f'systemctl start neofs-storage.service') + wait_for_service_started(ssh_client, 'neofs-storage.service') + sleep(robot_time_to_int(MAINNET_BLOCK_TIME)) + + with allure.step(f'Move node {node} to online state'): + node_set_status(node, status='online', retry=True) + + check_nodes.remove(node) + sleep(robot_time_to_int(MAINNET_BLOCK_TIME)) + for __attempt in range(3): + try: + tick_epoch() + break + except RuntimeError: + sleep(3) + + check_node_in_map(node, alive_node) + + +def exclude_node_from_network_map(node_to_exclude, alive_node): + node_wallet_path = NEOFS_NETMAP_DICT[node_to_exclude]['wallet_path'] + node_netmap_key = get_wallet_public_key( + node_wallet_path, + STORAGE_WALLET_PASS, + format="base58" + ) + + with allure.step(f'Move node {node_to_exclude} to offline state'): + node_set_status(node_to_exclude, status='offline') + + sleep(robot_time_to_int(MAINNET_BLOCK_TIME)) + tick_epoch() + + snapshot = get_netmap_snapshot(node_name=alive_node) + assert node_netmap_key not in snapshot, f'Expected node with key {node_netmap_key} not in network map' + + +def include_node_to_network_map(node_to_include, alive_node): + with allure.step(f'Move node {node_to_include} to online state'): + node_set_status(node_to_include, status='online') + + sleep(robot_time_to_int(MAINNET_BLOCK_TIME)) + tick_epoch() + + check_node_in_map(node_to_include, alive_node) + + +@allure.step('Check node {node_name} in network map') +def check_node_in_map(node_name: str, alive_node: str = None): + alive_node = alive_node or node_name + node_wallet_path = NEOFS_NETMAP_DICT[node_name]['wallet_path'] + node_netmap_key = get_wallet_public_key( + node_wallet_path, + STORAGE_WALLET_PASS, + format="base58" + ) + + logger.info(f'Node {node_name} netmap key: {node_netmap_key}') + + snapshot = get_netmap_snapshot(node_name=alive_node) + assert node_netmap_key in snapshot, f'Expected node with key {node_netmap_key} in network map' + + +@allure.title('Add one node to cluster') +@pytest.mark.add_nodes +@pytest.mark.node_mgmt +def test_add_nodes(prepare_tmp_dir, prepare_wallet_and_deposit, return_nodes_after_test_run): + wallet = prepare_wallet_and_deposit + placement_rule_3 = 'REP 3 IN X CBF 1 SELECT 3 FROM * AS X' + placement_rule_4 = 'REP 4 IN X CBF 1 SELECT 4 FROM * AS X' + source_file_path = generate_file() + + additional_node = choice(list( + node for node, node_config in NEOFS_NETMAP_DICT.items() if node_config.get('rpc') != STORAGE_RPC_ENDPOINT_1)) + alive_node = choice([node for node in NEOFS_NETMAP_DICT if node != additional_node]) + + check_node_in_map(additional_node, alive_node) + + with allure.step(f'Exclude node {additional_node} from map and clean it up'): + cleanup_node(additional_node, alive_node) + check_nodes.append(additional_node) + + cid = create_container(wallet, rule=placement_rule_3, basic_acl=PUBLIC_ACL) + oid = put_object(wallet, source_file_path, cid, endpoint=NEOFS_NETMAP_DICT[alive_node].get('rpc')) + wait_object_replication_on_nodes(wallet, cid, oid, 3) + + return_nodes(alive_node) + + with allure.step('Check data could be replicated to new node'): + random_node = choice([node for node in NEOFS_NETMAP_DICT if node not in (additional_node, alive_node)]) + exclude_node_from_network_map(random_node, alive_node) + + wait_object_replication_on_nodes(wallet, cid, oid, 3, excluded_nodes=[random_node]) + include_node_to_network_map(random_node, alive_node) + wait_object_replication_on_nodes(wallet, cid, oid, 3) + + with allure.step('Check container could be created with new node'): + cid = create_container(wallet, rule=placement_rule_4, basic_acl=PUBLIC_ACL) + oid = put_object(wallet, source_file_path, cid, endpoint=NEOFS_NETMAP_DICT[alive_node].get('rpc')) + wait_object_replication_on_nodes(wallet, cid, oid, 4) + + @allure.title('Control Operations with storage nodes') @pytest.mark.node_mgmt def test_nodes_management(prepare_tmp_dir): diff --git a/robot/resources/lib/python_keywords/cli_helpers.py b/robot/resources/lib/python_keywords/cli_helpers.py index b4c06d7c..f44d9af7 100644 --- a/robot/resources/lib/python_keywords/cli_helpers.py +++ b/robot/resources/lib/python_keywords/cli_helpers.py @@ -16,6 +16,8 @@ import pexpect from robot.api import logger ROBOT_AUTO_KEYWORDS = False +COLOR_GREEN = "\033[92m" +COLOR_OFF = "\033[0m" def _cmd_run(cmd: str, timeout: int = 30) -> str: @@ -26,7 +28,7 @@ def _cmd_run(cmd: str, timeout: int = 30) -> str: compl_proc = None start_time = datetime.now() try: - logger.info(f"Executing command: {cmd}") + logger.info(f"{COLOR_GREEN}Executing command: {cmd}{COLOR_OFF}") start_time = datetime.utcnow() compl_proc = subprocess.run(cmd, check=True, universal_newlines=True, stdout=subprocess.PIPE, stderr=subprocess.STDOUT, @@ -35,7 +37,7 @@ def _cmd_run(cmd: str, timeout: int = 30) -> str: output = compl_proc.stdout return_code = compl_proc.returncode end_time = datetime.utcnow() - logger.info(f"Output: {output}") + logger.info(f"{COLOR_GREEN}Output: {output}{COLOR_OFF}") _attach_allure_log(cmd, output, return_code, start_time, end_time) return output diff --git a/robot/resources/lib/python_keywords/epoch.py b/robot/resources/lib/python_keywords/epoch.py index 33e9e594..26f35223 100644 --- a/robot/resources/lib/python_keywords/epoch.py +++ b/robot/resources/lib/python_keywords/epoch.py @@ -1,5 +1,7 @@ #!/usr/bin/python3.9 +import sys +import allure from robot.api import logger from robot.api.deco import keyword @@ -28,18 +30,27 @@ def tick_epoch(): # If neofs-adm is available, then we tick epoch with it (to be consistent with UAT tests) cmd = f"{NEOFS_ADM_EXEC} morph force-new-epoch -c {NEOFS_ADM_CONFIG_PATH}" logger.info(f"Executing shell command: {cmd}") + out = '' + err = '' try: out = wrappers.run_sh(cmd) logger.info(f"Command completed with output: {out}") except Exception as exc: logger.error(exc) + err = str(exc) raise RuntimeError("Failed to tick epoch") from exc - + finally: + if 'allure' in sys.modules: + allure.attach(( + f'COMMAND: {cmd}\n' + f'OUTPUT:\n {out}\n' + f'ERROR: {err}\n' + ), 'Tick Epoch', allure.attachment_type.TEXT) return # Otherwise we tick epoch using transaction cur_epoch = get_epoch() return contract.invoke_contract_multisig( contract.get_netmap_contract_hash(MORPH_ENDPOINT), - f"newEpoch int:{cur_epoch+1}", + f"newEpoch int:{cur_epoch + 1}", IR_WALLET_PATH, IR_WALLET_PASS, MORPH_ENDPOINT) diff --git a/pytest_tests/testsuites/failovers/failover_utils.py b/robot/resources/lib/python_keywords/failover_utils.py similarity index 100% rename from pytest_tests/testsuites/failovers/failover_utils.py rename to robot/resources/lib/python_keywords/failover_utils.py diff --git a/robot/resources/lib/python_keywords/node_management.py b/robot/resources/lib/python_keywords/node_management.py index 51a9a3ea..910aea16 100644 --- a/robot/resources/lib/python_keywords/node_management.py +++ b/robot/resources/lib/python_keywords/node_management.py @@ -178,7 +178,7 @@ def node_healthcheck(node_name: str) -> HealthStatus: @keyword('Set status for node') -def node_set_status(node_name: str, status: str) -> None: +def node_set_status(node_name: str, status: str, retry=False) -> None: """ The function sets particular status for given node. Args: @@ -188,7 +188,12 @@ def node_set_status(node_name: str, status: str) -> None: (void) """ command = f"control set-status --status {status}" - run_control_command(node_name, command) + try: + run_control_command(node_name, command) + except AssertionError as err: + if not retry: + raise AssertionError(f'Command control set-status failed with error {err}') from err + run_control_command(node_name, command) @keyword('Get netmap snapshot')