From ae57672c7d5842f375d7c50f8df9fcae1186efef Mon Sep 17 00:00:00 2001 From: Andrey Berezin Date: Tue, 31 Oct 2023 14:28:44 +0300 Subject: [PATCH] [#129] Updates for failover Signed-off-by: Andrey Berezin --- pytest_tests/testsuites/conftest.py | 16 +++- .../failovers/test_failover_storage.py | 82 +++++++------------ 2 files changed, 44 insertions(+), 54 deletions(-) diff --git a/pytest_tests/testsuites/conftest.py b/pytest_tests/testsuites/conftest.py index 559778d..5aab052 100644 --- a/pytest_tests/testsuites/conftest.py +++ b/pytest_tests/testsuites/conftest.py @@ -8,7 +8,9 @@ import allure import pytest import yaml from dateutil import parser +from frostfs_testlib.healthcheck.interfaces import Healthcheck from frostfs_testlib.hosting import Hosting +from frostfs_testlib.plugins import load_plugin from frostfs_testlib.reporter import AllureHandler, get_reporter from frostfs_testlib.resources.common import ( ASSETS_DIR, @@ -182,8 +184,18 @@ def s3_policy(request: pytest.FixtureRequest): @pytest.fixture(scope="session") -def cluster_state_controller(client_shell: Shell, cluster: Cluster) -> ClusterStateController: - controller = ClusterStateController(client_shell, cluster) +@allure.title("[Session] Create healthcheck object") +def healthcheck(cluster: Cluster) -> Healthcheck: + healthcheck_cls = load_plugin( + "frostfs.testlib.healthcheck", cluster.cluster_nodes[0].host.config.healthcheck_plugin_name + ) + + return healthcheck_cls() + + +@pytest.fixture(scope="session") +def cluster_state_controller(client_shell: Shell, cluster: Cluster, healthcheck: Healthcheck) -> ClusterStateController: + controller = ClusterStateController(client_shell, cluster, healthcheck) yield controller diff --git a/pytest_tests/testsuites/failovers/test_failover_storage.py b/pytest_tests/testsuites/failovers/test_failover_storage.py index d9f6b68..151b2e2 100644 --- a/pytest_tests/testsuites/failovers/test_failover_storage.py +++ b/pytest_tests/testsuites/failovers/test_failover_storage.py @@ -5,11 +5,9 @@ from time import sleep import allure import pytest -from frostfs_testlib.hosting import Host from frostfs_testlib.resources.common import MORPH_BLOCK_TIME from frostfs_testlib.resources.wellknown_acl import PUBLIC_ACL from frostfs_testlib.s3 import AwsCliClient, Boto3ClientWrapper, S3ClientWrapper, VersioningStatus -from frostfs_testlib.shell import CommandOptions from frostfs_testlib.steps.cli.container import StorageContainer, StorageContainerInfo, create_container from frostfs_testlib.steps.cli.object import get_object, put_object_to_random_node from frostfs_testlib.steps.node_management import ( @@ -21,7 +19,7 @@ from frostfs_testlib.steps.node_management import ( wait_for_node_to_be_ready, ) from frostfs_testlib.steps.s3 import s3_helper -from frostfs_testlib.storage.cluster import Cluster, ClusterNode, StorageNode +from frostfs_testlib.storage.cluster import Cluster, ClusterNode, S3Gate, StorageNode from frostfs_testlib.storage.controllers import ClusterStateController, ShardsWatcher from frostfs_testlib.storage.dataclasses.object_size import ObjectSize from frostfs_testlib.storage.dataclasses.storage_object_info import StorageObjectInfo @@ -57,26 +55,11 @@ def after_run_return_all_stopped_hosts(cluster_state_controller: ClusterStateCon cluster_state_controller.start_stopped_hosts() -@allure.step("Return all stopped storage services after test") +@allure.step("Return all stopped services after test") @pytest.fixture(scope="function") def after_run_return_all_stopped_services(cluster_state_controller: ClusterStateController): yield - cluster_state_controller.start_stopped_storage_services() - - -@allure.step("Return all stopped S3 GateWay services after test") -@pytest.fixture(scope="function") -def after_run_return_all_stopped_s3(cluster_state_controller: ClusterStateController): - yield - cluster_state_controller.start_stopped_s3_gates() - - -def panic_reboot_host(host: Host) -> None: - shell = host.get_shell() - shell.exec('sudo sh -c "echo 1 > /proc/sys/kernel/sysrq"') - - options = CommandOptions(close_stdin=True, timeout=1, check=False) - shell.exec('sudo sh -c "echo b > /proc/sysrq-trigger"', options) + cluster_state_controller.start_all_stopped_services() @pytest.mark.failover @@ -209,22 +192,21 @@ class TestFailoverStorage(ClusterTestBase): s3_client: S3ClientWrapper, simple_object_size: ObjectSize, cluster_state_controller: ClusterStateController, - after_run_return_all_stopped_s3, after_run_return_all_stopped_services, ): default_node = self.cluster.cluster_nodes[0] with allure.step("Turn S3 GW off on default node"): - cluster_state_controller.stop_s3_gate(default_node) + cluster_state_controller.stop_service_of_type(default_node, S3Gate) with allure.step("Turn off storage on default node"): - cluster_state_controller.stop_storage_service(default_node) + cluster_state_controller.stop_service_of_type(default_node, StorageNode) with allure.step("Turn on S3 GW on default node"): - cluster_state_controller.start_s3_gate(default_node) + cluster_state_controller.start_service_of_type(default_node, S3Gate) with allure.step("Turn on storage on default node"): - cluster_state_controller.start_storage_service(default_node) + cluster_state_controller.start_service_of_type(default_node, StorageNode) with allure.step("Create bucket with REP 1 SELECT 1 policy"): bucket = s3_client.create_bucket( @@ -240,13 +222,13 @@ class TestFailoverStorage(ClusterTestBase): with allure.step("Turn off all storage nodes except default"): for node in self.cluster.cluster_nodes[1:]: with allure.step(f"Stop storage service on node: {node}"): - cluster_state_controller.stop_storage_service(node) + cluster_state_controller.stop_service_of_type(node, StorageNode) with allure.step("Check that object is available"): s3_helper.check_objects_in_bucket(s3_client, bucket, expected_objects=[file_name]) with allure.step("Start storage nodes"): - cluster_state_controller.start_stopped_storage_services() + cluster_state_controller.start_all_stopped_services() @pytest.mark.failover @@ -320,7 +302,7 @@ class TestEmptyMap(ClusterTestBase): def empty_map_stop_service_teardown(self, cluster_state_controller: ClusterStateController): yield with allure.step("Return all storage nodes to network map"): - cluster_state_controller.start_stopped_storage_services() + cluster_state_controller.start_all_stopped_services() for node in stopped_nodes: check_node_in_map(node, shell=self.shell, alive_node=node) @@ -365,7 +347,7 @@ class TestEmptyMap(ClusterTestBase): s3_helper.check_objects_in_bucket(s3_client, bucket, bucket_objects) with allure.step("Stop all storage nodes"): - cluster_state_controller.stop_all_storage_services() + cluster_state_controller.stop_services_of_type(StorageNode) with allure.step("Remove all nodes from network map"): remove_nodes_from_map_morph( @@ -383,7 +365,7 @@ class TestEmptyMap(ClusterTestBase): first_node = self.cluster.cluster_nodes[0].service(StorageNode) with allure.step("Start first node and check network map"): - cluster_state_controller.start_storage_service(self.cluster.cluster_nodes[0]) + cluster_state_controller.start_service_of_type(self.cluster.cluster_nodes[0], StorageNode) wait_for_node_to_be_ready(first_node) for check_node in self.cluster.storage_nodes: @@ -392,7 +374,7 @@ class TestEmptyMap(ClusterTestBase): for node in self.cluster.cluster_nodes[1:]: storage_node = node.service(StorageNode) - cluster_state_controller.start_storage_service(node) + cluster_state_controller.start_service_of_type(node, StorageNode) wait_for_node_to_be_ready(storage_node) sleep(datetime_utils.parse_time(MORPH_BLOCK_TIME)) @@ -420,9 +402,7 @@ class TestEmptyMap(ClusterTestBase): object_versions.append(put_object) with allure.step("Stop all storage nodes"): - for node in self.cluster.cluster_nodes: - with allure.step(f"Stop storage service on node: {node}"): - cluster_state_controller.stop_storage_service(node) + cluster_state_controller.stop_services_of_type(StorageNode) with allure.step("Delete blobovnicza and fstree from all nodes"): for node in self.cluster.storage_nodes: @@ -430,7 +410,7 @@ class TestEmptyMap(ClusterTestBase): node.delete_fstree() with allure.step("Start all storage nodes"): - cluster_state_controller.start_stopped_storage_services() + cluster_state_controller.start_all_stopped_services() # need to get Delete Marker first with allure.step("Delete the object from the bucket"): @@ -462,9 +442,7 @@ class TestEmptyMap(ClusterTestBase): s3_helper.check_objects_in_bucket(s3_client, bucket, expected_objects=[file_name]) with allure.step("Stop all storage nodes"): - for node in self.cluster.cluster_nodes: - with allure.step(f"Stop storage service on node: {node}"): - cluster_state_controller.stop_storage_service(node) + cluster_state_controller.stop_services_of_type(StorageNode) with allure.step("Delete blobovnicza and fstree from all nodes"): for node in self.cluster.storage_nodes: @@ -472,7 +450,7 @@ class TestEmptyMap(ClusterTestBase): node.delete_fstree() with allure.step("Start all storage nodes"): - cluster_state_controller.start_stopped_storage_services() + cluster_state_controller.start_all_stopped_services() with allure.step("Delete the object from the bucket"): s3_client.delete_object(bucket, file_name) @@ -507,16 +485,14 @@ class TestEmptyMap(ClusterTestBase): s3_helper.check_objects_in_bucket(s3_client, bucket, expected_objects=[file_name]) with allure.step("Stop all storage nodes"): - for node in self.cluster.cluster_nodes: - with allure.step(f"Stop storage service on node: {node}"): - cluster_state_controller.stop_storage_service(node) + cluster_state_controller.stop_services_of_type(StorageNode) with allure.step("Delete pilorama.db from all nodes"): for node in self.cluster.storage_nodes: node.delete_pilorama() with allure.step("Start all storage nodes"): - cluster_state_controller.start_stopped_storage_services() + cluster_state_controller.start_all_stopped_services() with allure.step("Check list objects first time"): objects_list = s3_client.list_objects(bucket) @@ -578,7 +554,7 @@ class TestStorageDataLoss(ClusterTestBase): ) with allure.step("Stop storage services on all nodes"): - cluster_state_controller.stop_all_storage_services() + cluster_state_controller.stop_services_of_type(StorageNode) with allure.step("Delete metabase from all nodes"): for node in cluster_state_controller.cluster.storage_nodes: @@ -594,7 +570,11 @@ class TestStorageDataLoss(ClusterTestBase): storage_node.save_config(config, config_file_path) with allure.step("Start storage services on all nodes"): - cluster_state_controller.start_stopped_storage_services() + cluster_state_controller.start_all_stopped_services() + + with allure.step("Wait for tree rebalance"): + # TODO: Use product metric when we have proper ones for this check + sleep(30) with allure.step("Delete objects from bucket"): with allure.step("Delete simple object from bucket"): @@ -655,13 +635,13 @@ class TestStorageDataLoss(ClusterTestBase): shards_watcher.take_shards_snapshot() with allure.step(f"Stop storage service on node {node_under_test}"): - cluster_state_controller.stop_storage_service(node_under_test) + cluster_state_controller.stop_service_of_type(node_under_test, StorageNode) with allure.step(f"Delete write cache from node {node_under_test}"): node_under_test.storage_node.delete_write_cache() with allure.step(f"Start storage service on node {node_under_test}"): - cluster_state_controller.start_storage_service(node_under_test) + cluster_state_controller.start_all_stopped_services() with allure.step("Objects should be available"): for storage_object in storage_objects: @@ -710,7 +690,7 @@ class TestStorageDataLoss(ClusterTestBase): with allure.step("Stop one node and wait for rebalance connection of s3 gate to storage service"): current_node = self.cluster.cluster_nodes[0] - cluster_state_controller.stop_storage_service(current_node) + cluster_state_controller.stop_service_of_type(current_node, StorageNode) # waiting for rebalance connection of s3 gate to storage service sleep(60) @@ -752,15 +732,13 @@ class TestStorageDataLoss(ClusterTestBase): piloramas_list_before_removing = self.get_piloramas_list(node_to_check) with allure.step("Stop all storage nodes"): - for node in self.cluster.cluster_nodes: - with allure.step(f"Stop storage service on node: {node}"): - cluster_state_controller.stop_storage_service(node) + cluster_state_controller.stop_services_of_type(StorageNode) with allure.step("Delete pilorama.db from one node"): node_to_check.delete_pilorama() with allure.step("Start all storage nodes"): - cluster_state_controller.start_stopped_storage_services() + cluster_state_controller.start_all_stopped_services() with allure.step("Tick epoch to trigger sync and then wait for 1 minute"): self.tick_epochs(1)