import logging
import os.path
import random
import time
from collections.abc import Iterator

import allure
import pytest
from frostfs_testlib.resources.common import MORPH_BLOCK_TIME
from frostfs_testlib.resources.wellknown_acl import PUBLIC_ACL
from frostfs_testlib.steps.cli.container import (
    StorageContainer,
    StorageContainerInfo,
    create_container,
)
from frostfs_testlib.steps.cli.object import get_object
from frostfs_testlib.steps.node_management import check_node_in_map, check_node_not_in_map
from frostfs_testlib.storage.cluster import ClusterNode, StorageNode
from frostfs_testlib.storage.controllers import ClusterStateController
from frostfs_testlib.storage.dataclasses.storage_object_info import StorageObjectInfo
from frostfs_testlib.storage.dataclasses.wallet import WalletInfo
from frostfs_testlib.testing.cluster_test_base import ClusterTestBase
from frostfs_testlib.testing.test_control import wait_for_success
from frostfs_testlib.utils import datetime_utils
from frostfs_testlib.utils.failover_utils import wait_for_host_offline, wait_object_replication
from frostfs_testlib.utils.file_utils import get_file_hash
from pytest import FixtureRequest

logger = logging.getLogger("NeoLogger")


@pytest.mark.failover
@pytest.mark.failover_server
class TestFailoverServer(ClusterTestBase):
    """Failover tests that hard-stop a node's host and verify stored objects stay intact."""

    @wait_for_success(max_wait_time=120, interval=1)
    def wait_node_not_in_map(self, *args, **kwargs):
        """Run ``check_node_not_in_map`` under ``wait_for_success``.

        All arguments are forwarded unchanged to ``check_node_not_in_map``.
        NOTE(review): ``wait_for_success`` presumably retries the check until it
        stops raising or ``max_wait_time`` elapses -- confirm against frostfs_testlib.
        """
        check_node_not_in_map(*args, **kwargs)

    @wait_for_success(max_wait_time=120, interval=1)
    def wait_node_in_map(self, *args, **kwargs):
        """Run ``check_node_in_map`` under ``wait_for_success``.

        All arguments are forwarded unchanged to ``check_node_in_map``.
        NOTE(review): ``wait_for_success`` presumably retries the check until it
        stops raising or ``max_wait_time`` elapses -- confirm against frostfs_testlib.
        """
        check_node_in_map(*args, **kwargs)

    @allure.step("Create {count_containers} containers and {count_files} objects")
    @pytest.fixture
    def containers(
        self,
        request: FixtureRequest,
        default_wallet: str,
    ) -> list[StorageContainer]:
        """Create ``request.param`` public containers and wrap each in a ``StorageContainer``.

        The container count comes from indirect parametrization (``request.param``).
        Every container uses the placement rule ``REP 2 CBF 2 SELECT 2 FROM * AS X``
        and the ``PUBLIC_ACL`` basic ACL, and is created with ``default_wallet``.
        """
        placement_rule = "REP 2 CBF 2 SELECT 2 FROM * AS X"

        containers = []

        for _ in range(request.param):
            cont_id = create_container(
                default_wallet,
                shell=self.shell,
                endpoint=self.cluster.default_rpc_endpoint,
                rule=placement_rule,
                basic_acl=PUBLIC_ACL,
            )
            wallet = WalletInfo(path=default_wallet)
            storage_cont_info = StorageContainerInfo(id=cont_id, wallet_file=wallet)
            containers.append(
                StorageContainer(
                    storage_container_info=storage_cont_info,
                    shell=self.shell,
                    cluster=self.cluster,
                )
            )

        return containers

    # Function scope (the default) is required here: this fixture requests the
    # function-scoped ``containers`` fixture, and pytest raises ScopeMismatch when
    # a broader-scoped fixture requests a narrower-scoped one.
    @allure.step("Create object and delete after test")
    @pytest.fixture
    def storage_objects(
        self,
        request: FixtureRequest,
        containers: list[StorageContainer],
        simple_object_size: int,
        complex_object_size: int,
    ) -> Iterator[list[StorageObjectInfo]]:
        """Generate ``request.param`` objects in every container and yield them as one list.

        Each object's size is picked at random between ``simple_object_size`` and
        ``complex_object_size``. The local source files are removed before the
        list is yielded; later checks rely on ``file_hash`` stored on each object
        rather than on the file itself.
        """
        count_object = request.param
        object_size = [simple_object_size, complex_object_size]
        object_list = []
        for cont in containers:
            for _ in range(count_object):
                object_list.append(cont.generate_object(size=random.choice(object_size)))

        for storage_object in object_list:
            os.remove(storage_object.file_path)

        yield object_list

    @allure.step("Select random node to stop and start it after test")
    @pytest.fixture
    def node_to_stop(
        self, node_under_test: ClusterNode, cluster_state_controller: ClusterStateController
    ) -> Iterator[ClusterNode]:
        """Yield the node under test and start all stopped hosts again on teardown."""
        yield node_under_test
        with allure.step(f"start {node_under_test.storage_node}"):
            cluster_state_controller.start_stopped_hosts()

    @allure.step("Upload object with nodes and compare")
    def get_corrupted_objects_list(
        self, nodes: list[StorageNode], storage_objects: list[StorageObjectInfo]
    ) -> list[StorageObjectInfo]:
        """Fetch every object from every node and collect those whose hash differs.

        Args:
            nodes: storage nodes whose RPC endpoints are queried.
            storage_objects: objects to fetch; each must carry ``file_hash``.

        Returns:
            Objects whose downloaded content hash does not match ``file_hash``.
            An object appears once per node on which the mismatch was seen.
        """
        corrupted_objects = []
        for node in nodes:
            for storage_object in storage_objects:
                got_file_path = get_object(
                    storage_object.wallet_file_path,
                    storage_object.cid,
                    storage_object.oid,
                    endpoint=node.get_rpc_endpoint(),
                    shell=self.shell,
                    timeout="60s",
                )
                if storage_object.file_hash != get_file_hash(got_file_path):
                    corrupted_objects.append(storage_object)
                # The downloaded copy is only needed for the hash comparison.
                os.remove(got_file_path)

        return corrupted_objects

    def check_objects_replication(
        self, storage_objects: list[StorageObjectInfo], storage_nodes: list[StorageNode]
    ) -> None:
        """Call ``wait_object_replication`` with an expected count of 2 for every object.

        Args:
            storage_objects: objects whose replication is awaited.
            storage_nodes: nodes passed to ``wait_object_replication`` as ``nodes``.
        """
        for storage_object in storage_objects:
            wait_object_replication(
                storage_object.cid,
                storage_object.oid,
                2,
                shell=self.shell,
                nodes=storage_nodes,
                sleep_interval=45,
                attempts=60,
            )

    @allure.title("Full shutdown node")
    @pytest.mark.parametrize("containers, storage_objects", [(5, 10)], indirect=True)
    def test_complete_node_shutdown(
        self,
        containers: list[StorageContainer],
        storage_objects: list[StorageObjectInfo],
        default_wallet: str,
        node_to_stop: ClusterNode,
        cluster_state_controller: ClusterStateController,
    ):
        """Hard-stop a node, tick epochs until it leaves the netmap, and check objects.

        Objects are verified on the remaining nodes twice: right after the host
        goes offline (while the node is still in the netmap) and again after
        ``netmap_cleaner_threshold + 2`` epochs, once the node is gone from the map.
        """
        with allure.step(f"Remove {node_to_stop} from the list of nodes"):
            alive_nodes = list(set(self.cluster.cluster_nodes) - {node_to_stop})
            storage_nodes = [node.storage_node for node in alive_nodes]

        with allure.step("Tick epoch"):
            self.tick_epochs(1, storage_nodes[0])

        with allure.step("Wait 2 block time"):
            time.sleep(datetime_utils.parse_time(MORPH_BLOCK_TIME) * 2)

        cluster_state_controller.stop_node_host(node=node_to_stop, mode="hard")

        with allure.step(f"Check if the node {node_to_stop.storage_node} has stopped"):
            wait_for_host_offline(self.shell, node_to_stop.storage_node)

        with allure.step("Verify that there are no corrupted objects"):
            corrupted_objects_list = self.get_corrupted_objects_list(storage_nodes, storage_objects)
            assert not corrupted_objects_list

        with allure.step(f"check {node_to_stop.storage_node} in map"):
            self.wait_node_in_map(
                node_to_stop.storage_node, self.shell, alive_node=storage_nodes[0]
            )

        # Tick two epochs more than the cleaner threshold reported by the IR node.
        count_tick_epoch = int(alive_nodes[0].ir_node.get_netmap_cleaner_threshold()) + 2

        with allure.step(f"Tick {count_tick_epoch} epoch, in {storage_nodes[0]} node"):
            for _ in range(count_tick_epoch):
                self.tick_epoch(storage_nodes[0])
                time.sleep(datetime_utils.parse_time(MORPH_BLOCK_TIME) * 2)

        with allure.step(f"Check if the node {node_to_stop.storage_node} has stopped"):
            wait_for_host_offline(self.shell, node_to_stop.storage_node)

        with allure.step(f"Check {node_to_stop} in not map"):
            self.wait_node_not_in_map(
                node_to_stop.storage_node, self.shell, alive_node=storage_nodes[0]
            )

        with allure.step(
            f"Verify that there are no corrupted objects after {count_tick_epoch} epoch"
        ):
            corrupted_objects_list = self.get_corrupted_objects_list(storage_nodes, storage_objects)
            assert not corrupted_objects_list

    @allure.title("Temporarily disable a node")
    @pytest.mark.parametrize("containers, storage_objects", [(5, 10)], indirect=True)
    def test_temporarily_disable_a_node(
        self,
        containers: list[StorageContainer],
        storage_objects: list[StorageObjectInfo],
        default_wallet: str,
        node_to_stop: ClusterNode,
        cluster_state_controller: ClusterStateController,
    ):
        """Hard-stop a node, bring it back, and check objects before and after the restart.

        Objects are always fetched from the nodes that stayed up; the restarted
        node itself is not queried.
        """
        with allure.step(f"Remove {node_to_stop} from the list of nodes"):
            storage_nodes = list(set(self.cluster.storage_nodes) - {node_to_stop.storage_node})

        with allure.step("Tick epoch"):
            self.tick_epochs(1, storage_nodes[0])

        with allure.step("Wait 2 block time"):
            time.sleep(datetime_utils.parse_time(MORPH_BLOCK_TIME) * 2)

        cluster_state_controller.stop_node_host(node=node_to_stop, mode="hard")

        with allure.step(f"Check if the node {node_to_stop} has stopped"):
            wait_for_host_offline(self.shell, node_to_stop.storage_node)

        with allure.step("Verify that there are no corrupted objects"):
            corrupted_objects_list = self.get_corrupted_objects_list(storage_nodes, storage_objects)
            assert not corrupted_objects_list

        with allure.step(f"Check {node_to_stop} in map"):
            self.wait_node_in_map(
                node_to_stop.storage_node, self.shell, alive_node=storage_nodes[0]
            )

        cluster_state_controller.start_node_host(node_to_stop)

        with allure.step("Verify that there are no corrupted objects"):
            corrupted_objects_list = self.get_corrupted_objects_list(storage_nodes, storage_objects)
            assert not corrupted_objects_list