From b0c9502bd33287e1c5d4178688c71b08f1e4e751 Mon Sep 17 00:00:00 2001
From: Dmitriy Zayakin
Date: Mon, 20 Nov 2023 13:49:54 +0300
Subject: [PATCH] Add Maintenance tests

Signed-off-by: Dmitriy Zayakin
---
Review notes (placed after the three-dash line, so they stay out of the commit message):

- The line structure of this patch was restored from a copy whose newlines had been
  collapsed. Leading whitespace on context lines is reconstructed; verify with
  `git apply --check` before applying.
- basic_operations: the loop target is now `index, (operation, kw)`; the flat three-name
  target raised ValueError on the first iteration. Keyword arguments are built per call
  instead of accumulating in the shared dict, and the temporary file is removed in
  `finally`.
- restore_online_mode_node: skips the restore when no node was chosen.
- frostfs_cli fixture: title no longer duplicates the title of frostfs_cli_remote.
- test_maintenance_mode: the invalid escape sequence in the first step title is escaped;
  the runtime text is unchanged.
- test_mode_transitions: the second "Set node mode to offline" step now sets the mode
  (it only asserted the state before); redundant parentheses around `with` items removed.
- Hunk line counts and the diffstat were updated for these edits. The post-image blob id
  on the test file's "index" line was not recomputed.

 pytest.ini                                    |   1 +
 .../failovers/test_failover_storage.py        | 369 +++++++++++++++++-
 2 files changed, 365 insertions(+), 5 deletions(-)

diff --git a/pytest.ini b/pytest.ini
index 37431ab1..a8ed28f3 100644
--- a/pytest.ini
+++ b/pytest.ini
@@ -13,6 +13,7 @@ markers =
 # controlling markers
     no_healthcheck: skip healthcheck for this test
 # functional markers
+    maintenance: tests for change mode node
     container: tests for container creation
     grpc_api: standard gRPC API tests
     grpc_control: tests related to using frostfs-cli control commands
diff --git a/pytest_tests/testsuites/failovers/test_failover_storage.py b/pytest_tests/testsuites/failovers/test_failover_storage.py
index 3fae0cf3..27fdafdb 100644
--- a/pytest_tests/testsuites/failovers/test_failover_storage.py
+++ b/pytest_tests/testsuites/failovers/test_failover_storage.py
@@ -1,16 +1,30 @@
 import logging
+import os
 import random
 from datetime import datetime
 from time import sleep
 
 import allure
 import pytest
-from frostfs_testlib.resources.common import MORPH_BLOCK_TIME
+from frostfs_testlib.cli import FrostfsCli
+from frostfs_testlib.cli.netmap_parser import NetmapParser
+from frostfs_testlib.resources.cli import FROSTFS_CLI_EXEC
+from frostfs_testlib.resources.common import DEFAULT_WALLET_CONFIG, MORPH_BLOCK_TIME
 from frostfs_testlib.resources.wellknown_acl import PUBLIC_ACL
 from frostfs_testlib.s3 import S3ClientWrapper, VersioningStatus
-from frostfs_testlib.shell import CommandOptions
-from frostfs_testlib.steps.cli.container import StorageContainer, StorageContainerInfo, create_container
-from frostfs_testlib.steps.cli.object import get_object, put_object_to_random_node
+from frostfs_testlib.steps.cli.container import (
+    StorageContainer,
+    StorageContainerInfo,
+    create_container,
+    search_nodes_with_container,
+)
+from frostfs_testlib.steps.cli.object import (
+    delete_object,
+    get_object,
+    put_object,
+    put_object_to_random_node,
+    search_object,
+)
 from frostfs_testlib.steps.node_management import (
     check_node_in_map,
     check_node_not_in_map,
@@ -23,7 +37,7 @@ from frostfs_testlib.steps.s3 import s3_helper
 from frostfs_testlib.storage.cluster import Cluster, ClusterNode, S3Gate, StorageNode
 from frostfs_testlib.storage.controllers import ClusterStateController, ShardsWatcher
 from frostfs_testlib.storage.dataclasses.object_size import ObjectSize
-from frostfs_testlib.storage.dataclasses.storage_object_info import StorageObjectInfo
+from frostfs_testlib.storage.dataclasses.storage_object_info import ModeNode, StorageObjectInfo
 from frostfs_testlib.storage.dataclasses.wallet import WalletInfo
 from frostfs_testlib.testing.cluster_test_base import ClusterTestBase
 from frostfs_testlib.testing.test_control import expect_not_raises
@@ -773,3 +787,348 @@ class TestStorageDataLoss(ClusterTestBase):
 
         with allure.step("Delete bucket"):
             s3_client.delete_bucket(bucket)
+
+
+@pytest.mark.maintenance
+class TestMaintenanceMode(ClusterTestBase):
+    """Tests for switching a storage node between online, offline and maintenance modes."""
+
+    # Node whose mode the running test changes; restore_online_mode_node resets it on teardown.
+    change_node: ClusterNode = None
+
+    @pytest.fixture()
+    @allure.title("Init Frostfs CLI remote")
+    def frostfs_cli_remote(self, node_under_test: ClusterNode) -> FrostfsCli:
+        """Return a FrostfsCli that uses the host shell and wallet of the storage node under test."""
+        host = node_under_test.host
+        service_config = host.get_service_config(node_under_test.storage_node.name)
+        wallet_path = service_config.attributes["wallet_path"]
+        wallet_password = service_config.attributes["wallet_password"]
+
+        shell = host.get_shell()
+        wallet_config_path = f"/tmp/{node_under_test.storage_node.name}-config.yaml"
+        wallet_config = f'wallet: {wallet_path}\npassword: "{wallet_password}"'
+        # The CLI takes the wallet through a config file, so write one via the host shell first.
+        shell.exec(f"echo '{wallet_config}' > {wallet_config_path}")
+        cli = FrostfsCli(shell=shell, frostfs_cli_exec_path=FROSTFS_CLI_EXEC, config_file=wallet_config_path)
+        return cli
+
+    @pytest.fixture()
+    @allure.title("Init Frostfs CLI")
+    def frostfs_cli(self) -> FrostfsCli:
+        """Return a FrostfsCli that uses the test shell and the default wallet config."""
+        cli = FrostfsCli(shell=self.shell, frostfs_cli_exec_path=FROSTFS_CLI_EXEC, config_file=DEFAULT_WALLET_CONFIG)
+        return cli
+
+    @pytest.fixture()
+    def restore_online_mode_node(self, cluster_state_controller: ClusterStateController, default_wallet: str):
+        """Set the node stored in self.change_node back to online once the test is over."""
+        yield
+        # A test that fails before choosing a node leaves change_node unset: nothing to restore.
+        if self.change_node is None:
+            return
+        cluster_state_controller.set_mode_node(cluster_node=self.change_node, wallet=default_wallet, status="online")
+        self.tick_epoch(wait_block=2)
+
+    def basic_operations(self, wallet, cid, oid, shell, endpoint, matchs, object_size):
+        """
+        Run get, search, delete and put against one endpoint and check how each one ends.
+
+        matchs holds one expected message per operation, in that order. Each operation must
+        raise RuntimeError matching its message, except search when the message contains
+        "Found": then the search is only expected to complete without raising.
+        """
+        file_path = generate_file(object_size)
+        default_kw = {"wallet": wallet, "cid": cid, "shell": shell, "endpoint": endpoint}
+        operations = {
+            get_object: {"oid": oid},
+            search_object: {},
+            delete_object: {"oid": oid},
+            put_object: {"path": file_path},
+        }
+        try:
+            for index, (operation, kw) in enumerate(operations.items()):
+                with allure.step(f"Run {operation.__name__} object, waiting response - {matchs[index]}"):
+                    # Fresh kwargs for every call, so one operation's extras never reach the next one.
+                    call_kw = {**default_kw, **kw}
+                    if operation == search_object and "Found" in matchs[index]:
+                        operation(**call_kw)
+                        continue
+                    with pytest.raises(RuntimeError, match=matchs[index]):
+                        operation(**call_kw)
+        finally:
+            os.remove(file_path)
+
+    @allure.title("Test of basic node operations in maintenance mode")
+    def test_maintenance_mode(
+        self,
+        default_wallet: str,
+        simple_object_size: ObjectSize,
+        cluster_state_controller: ClusterStateController,
+        restore_online_mode_node: None,
+    ):
+        """Check object operations on a node in maintenance mode and on one other cluster node."""
+        with allure.step("Create container and create\\put object"):
+            cid = create_container(
+                wallet=default_wallet,
+                shell=self.shell,
+                endpoint=self.cluster.default_rpc_endpoint,
+                rule="REP 1 CBF 1",
+            )
+            node = search_nodes_with_container(
+                wallet=default_wallet,
+                cid=cid,
+                shell=self.shell,
+                endpoint=self.cluster.default_rpc_endpoint,
+                cluster=self.cluster,
+            )
+            self.change_node = node[0]
+            file_path = generate_file(simple_object_size.value)
+            oid = put_object(
+                wallet=default_wallet,
+                path=file_path,
+                cid=cid,
+                shell=self.shell,
+                endpoint=self.change_node.storage_node.get_rpc_endpoint(),
+            )
+        with allure.step("Enable MaintenanceModeAllowed:"):
+            cluster_state_controller.set_mode_node(
+                cluster_node=self.change_node, wallet=default_wallet, status="maintenance"
+            )
+
+        other_nodes = list(set(self.cluster.cluster_nodes) - set(node))
+        node_and_match = {
+            self.change_node: ["node is under maintenance"] * 4,
+            other_nodes[0]: [
+                "object not found",
+                "Found 0 objects",
+                "object not found",
+                "node is under maintenance",
+            ],
+        }
+        with allure.step("Run basic operations"):
+            for cluster_node, matchs in node_and_match.items():
+                self.basic_operations(
+                    wallet=default_wallet,
+                    cid=cid,
+                    oid=oid,
+                    shell=self.shell,
+                    endpoint=cluster_node.storage_node.get_rpc_endpoint(),
+                    matchs=matchs,
+                    object_size=simple_object_size.value,
+                )
+
+    @pytest.mark.sanity
+    @allure.title("MAINTENANCE and OFFLINE mode transitions")
+    def test_mode_transitions(
+        self,
+        cluster_state_controller: ClusterStateController,
+        node_under_test: ClusterNode,
+        default_wallet: str,
+        frostfs_cli: FrostfsCli,
+        restore_online_mode_node: None,
+    ):
+        """Check the netmap state of a node after mode changes combined with service restarts."""
+        self.change_node = node_under_test
+        cluster_nodes = list(set(self.cluster.cluster_nodes) - {node_under_test})
+
+        with allure.step("Tick epoch"):
+            self.tick_epochs(epochs_to_tick=2, alive_node=cluster_nodes[0].storage_node, wait_block=2)
+
+        with allure.step("Set node mode to offline"):
+            cluster_state_controller.set_mode_node(
+                cluster_node=node_under_test,
+                wallet=default_wallet,
+                status=ModeNode.OFFLINE.value,
+            )
+
+        with allure.step("Tick epoch to update the network map"):
+            self.tick_epochs(epochs_to_tick=2, alive_node=cluster_nodes[0].storage_node, wait_block=2)
+
+        with allure.step("Check node mode = offline, after update the network map"):
+            netmap = frostfs_cli.netmap.snapshot(
+                rpc_endpoint=cluster_nodes[0].storage_node.get_rpc_endpoint(), wallet=default_wallet
+            ).stdout
+            netmap = NetmapParser.snapshot_all_nodes(netmap)
+            assert node_under_test.host_ip not in [
+                netmap_node.node for netmap_node in netmap
+            ], f"Node {node_under_test.host_ip} not in state offline"
+
+        with allure.step("Restart storage service"):
+            cluster_state_controller.stop_storage_service(node_under_test)
+            cluster_state_controller.start_storage_service(node_under_test)
+
+        with allure.step("Tick epoch after restart storage service and set mode to offline"):
+            self.tick_epochs(epochs_to_tick=2, alive_node=cluster_nodes[0].storage_node, wait_block=2)
+
+        with allure.step("Check node mode = online, after restart storage service"):
+            netmap = frostfs_cli.netmap.snapshot(
+                rpc_endpoint=cluster_nodes[0].storage_node.get_rpc_endpoint(), wallet=default_wallet
+            ).stdout
+            node_state = NetmapParser.snapshot_one_node(netmap, node_under_test).node_status
+            assert node_state == ModeNode.ONLINE.value.upper(), (
+                f"Node actual state - {node_state}, expect - {ModeNode.ONLINE.value}"
+            )
+
+        with allure.step("Tick epoch"):
+            self.tick_epochs(epochs_to_tick=2, alive_node=cluster_nodes[0].storage_node, wait_block=2)
+
+        with allure.step("Set node mode to maintenance"):
+            cluster_state_controller.set_mode_node(
+                cluster_node=node_under_test, wallet=default_wallet, status=ModeNode.MAINTENANCE.value
+            )
+        with allure.step("Restart storage service"):
+            cluster_state_controller.stop_storage_service(node_under_test)
+            cluster_state_controller.start_storage_service(node_under_test)
+
+        with allure.step("Tick epoch after restart storage service"):
+            self.tick_epochs(epochs_to_tick=2, alive_node=cluster_nodes[0].storage_node, wait_block=2)
+
+        with allure.step("Check node mode = maintenance, after restart storage service and tick epoch"):
+            netmap = frostfs_cli.netmap.snapshot(
+                rpc_endpoint=cluster_nodes[0].storage_node.get_rpc_endpoint(), wallet=default_wallet
+            ).stdout
+            node_state = NetmapParser.snapshot_one_node(netmap, node_under_test).node_status
+            assert node_state == ModeNode.MAINTENANCE.value.upper(), (
+                f"Node actual state - {node_state}, expect - {ModeNode.MAINTENANCE.value}"
+            )
+
+        with allure.step("Tick epoch"):
+            self.tick_epochs(epochs_to_tick=2, alive_node=cluster_nodes[0].storage_node, wait_block=2)
+
+        with allure.step("Set node mode to offline"):
+            cluster_state_controller.set_mode_node(
+                cluster_node=node_under_test,
+                wallet=default_wallet,
+                status=ModeNode.OFFLINE.value,
+            )
+
+        with allure.step("Stop storage service"):
+            cluster_state_controller.stop_storage_service(node_under_test)
+
+        with allure.step("Tick epoch"):
+            self.tick_epochs(epochs_to_tick=2, alive_node=cluster_nodes[0].storage_node, wait_block=2)
+
+        with allure.step("Start storage service"):
+            cluster_state_controller.start_storage_service(node_under_test)
+
+        with allure.step("Check node mode = offline, after tick epoch and start storage service"):
+            # NOTE(review): the first offline check in this test looks for the node's absence from the
+            # snapshot; confirm snapshot_one_node can still return an offline node here.
+            netmap = frostfs_cli.netmap.snapshot(
+                rpc_endpoint=cluster_nodes[0].storage_node.get_rpc_endpoint(), wallet=default_wallet
+            ).stdout
+            node_state = NetmapParser.snapshot_one_node(netmap, node_under_test).node_status
+            assert node_state == ModeNode.OFFLINE.value.upper(), (
+                f"Node actual state - {node_state}, expect - {ModeNode.OFFLINE.value}"
+            )
+
+        with allure.step("Tick epoch"):
+            self.tick_epochs(epochs_to_tick=2, alive_node=cluster_nodes[0].storage_node, wait_block=2)
+
+        with allure.step("Check node mode = online, after start storage service and tick epoch"):
+            netmap = frostfs_cli.netmap.snapshot(
+                rpc_endpoint=cluster_nodes[0].storage_node.get_rpc_endpoint(), wallet=default_wallet
+            ).stdout
+            node_state = NetmapParser.snapshot_one_node(netmap, node_under_test).node_status
+            assert node_state == ModeNode.ONLINE.value.upper(), (
+                f"Node actual state - {node_state}, expect - {ModeNode.ONLINE.value}"
+            )
+
+        with allure.step("Tick epoch"):
+            self.tick_epochs(epochs_to_tick=2, alive_node=cluster_nodes[0].storage_node, wait_block=2)
+
+        with allure.step("Set node mode to maintenance"):
+            cluster_state_controller.set_mode_node(
+                cluster_node=node_under_test, wallet=default_wallet, status=ModeNode.MAINTENANCE.value
+            )
+
+        with allure.step("Stop storage service"):
+            cluster_state_controller.stop_storage_service(node_under_test)
+
+        with allure.step("Tick epoch"):
+            self.tick_epochs(epochs_to_tick=2, alive_node=cluster_nodes[0].storage_node, wait_block=2)
+
+        with allure.step("Start storage service"):
+            cluster_state_controller.start_storage_service(node_under_test)
+
+        with allure.step("Check node mode = maintenance"):
+            netmap = frostfs_cli.netmap.snapshot(
+                rpc_endpoint=cluster_nodes[0].storage_node.get_rpc_endpoint(), wallet=default_wallet
+            ).stdout
+            node_state = NetmapParser.snapshot_one_node(netmap, node_under_test).node_status
+
+            assert (
+                node_state == ModeNode.MAINTENANCE.value.upper()
+            ), f"Node actual state - {node_state}, expect - {ModeNode.MAINTENANCE.value}"
+
+        with allure.step("Tick epoch"):
+            self.tick_epochs(epochs_to_tick=2, alive_node=cluster_nodes[0].storage_node, wait_block=2)
+
+        with allure.step("Check node mode = maintenance"):
+            netmap = frostfs_cli.netmap.snapshot(
+                rpc_endpoint=cluster_nodes[0].storage_node.get_rpc_endpoint(), wallet=default_wallet
+            ).stdout
+            node_state = NetmapParser.snapshot_one_node(netmap, node_under_test).node_status
+            assert node_state == ModeNode.MAINTENANCE.value.upper(), (
+                f"Node actual state - {node_state}, expect - {ModeNode.MAINTENANCE.value}"
+            )
+
+    @allure.title("A node cannot go into maintenance if maintenance is prohibited globally in the network")
+    def test_maintenance_globally_forbidden(
+        self,
+        cluster_state_controller: ClusterStateController,
+        node_under_test: ClusterNode,
+        frostfs_cli_remote: FrostfsCli,
+        frostfs_cli: FrostfsCli,
+        default_wallet: str,
+        restore_online_mode_node: None,
+    ):
+        """Check that maintenance is rejected while disallowed network-wide and accepted once allowed."""
+        self.change_node = node_under_test
+        control_endpoint = node_under_test.service(StorageNode).get_control_endpoint()
+
+        with allure.step("Set MaintenanceModeAllowed = false"):
+            cluster_state_controller.set_maintenance_mode_allowed("false", node_under_test)
+
+        with allure.step("Set status node - maintenance"):
+            with pytest.raises(RuntimeError, match="maintenance mode is not allowed by the network"):
+                frostfs_cli_remote.control.set_status(endpoint=control_endpoint, status="maintenance")
+
+        with allure.step("Set MaintenanceModeAllowed = true"):
+            cluster_state_controller.set_maintenance_mode_allowed("true", node_under_test)
+
+        with allure.step("Set status node - maintenance"):
+            output = frostfs_cli_remote.control.set_status(endpoint=control_endpoint, status="maintenance")
+            assert "update request successfully sent" in output.stdout, f"Response = {output}"
+
+        with allure.step("Tick epoch"):
+            self.tick_epoch(wait_block=2)
+
+        with allure.step("Check state node = maintenance "):
+            netmap_node = NetmapParser.snapshot_one_node(
+                frostfs_cli.netmap.snapshot(
+                    rpc_endpoint=node_under_test.storage_node.get_rpc_endpoint(), wallet=default_wallet
+                ).stdout,
+                node_under_test,
+            )
+            assert (
+                netmap_node.node_status == ModeNode.MAINTENANCE.value.upper()
+            ), f"Node actual state - {netmap_node.node_status}, expect - {ModeNode.MAINTENANCE.value}"
+
+        with allure.step("Set status node - online "):
+            frostfs_cli_remote.control.set_status(endpoint=control_endpoint, status="online")
+
+        with allure.step("Tick epoch"):
+            self.tick_epoch()
+
+        with allure.step("Check state node: online"):
+            netmap_node = NetmapParser.snapshot_one_node(
+                frostfs_cli.netmap.snapshot(
+                    rpc_endpoint=node_under_test.storage_node.get_rpc_endpoint(), wallet=default_wallet
+                ).stdout,
+                node_under_test,
+            )
+            assert (
+                netmap_node.node_status == ModeNode.ONLINE.value.upper()
+            ), f"Node actual state - {netmap_node.node_status}, expect - {ModeNode.ONLINE.value}"