From d986fe2fb6dcf456b142224131afcbbab66a0bb7 Mon Sep 17 00:00:00 2001 From: Andrey Berezin Date: Wed, 25 Oct 2023 15:47:29 +0300 Subject: [PATCH] [#124] Use CSC in case of node shutdown Signed-off-by: Andrey Berezin --- pytest_tests/testsuites/conftest.py | 4 +- .../failovers/test_failover_storage.py | 205 +++++++++--------- requirements.txt | 2 +- 3 files changed, 104 insertions(+), 107 deletions(-) diff --git a/pytest_tests/testsuites/conftest.py b/pytest_tests/testsuites/conftest.py index 2c7a919..3fd97ed 100644 --- a/pytest_tests/testsuites/conftest.py +++ b/pytest_tests/testsuites/conftest.py @@ -276,10 +276,10 @@ def after_deploy_healthcheck(cluster: Cluster): parallel(readiness_on_node, cluster.cluster_nodes) -SERVICE_ACTIVE_TIME = 15 +SERVICE_ACTIVE_TIME = 20 -@wait_for_success(60 * SERVICE_ACTIVE_TIME * 2, 15) +@wait_for_success(60 * SERVICE_ACTIVE_TIME * 3, 60) @allure.step("Check readiness on node {cluster_node}") def readiness_on_node(cluster_node: ClusterNode): # TODO: Move to healtcheck classes diff --git a/pytest_tests/testsuites/failovers/test_failover_storage.py b/pytest_tests/testsuites/failovers/test_failover_storage.py index 8200a6c..d9f6b68 100644 --- a/pytest_tests/testsuites/failovers/test_failover_storage.py +++ b/pytest_tests/testsuites/failovers/test_failover_storage.py @@ -1,4 +1,5 @@ import logging +import random from datetime import datetime from time import sleep @@ -8,7 +9,7 @@ from frostfs_testlib.hosting import Host from frostfs_testlib.resources.common import MORPH_BLOCK_TIME from frostfs_testlib.resources.wellknown_acl import PUBLIC_ACL from frostfs_testlib.s3 import AwsCliClient, Boto3ClientWrapper, S3ClientWrapper, VersioningStatus -from frostfs_testlib.shell import CommandOptions, Shell +from frostfs_testlib.shell import CommandOptions from frostfs_testlib.steps.cli.container import StorageContainer, StorageContainerInfo, create_container from frostfs_testlib.steps.cli.object import get_object, put_object_to_random_node from frostfs_testlib.steps.node_management import ( @@ -28,7 +29,7 @@ from frostfs_testlib.storage.dataclasses.wallet import WalletInfo from frostfs_testlib.testing.cluster_test_base import ClusterTestBase from frostfs_testlib.testing.test_control import expect_not_raises from frostfs_testlib.utils import datetime_utils -from frostfs_testlib.utils.failover_utils import wait_all_storage_nodes_returned, wait_object_replication +from frostfs_testlib.utils.failover_utils import wait_object_replication from frostfs_testlib.utils.file_keeper import FileKeeper from frostfs_testlib.utils.file_utils import generate_file, get_file_hash @@ -51,9 +52,9 @@ def file_keeper(): @allure.step("Return all stopped hosts") @pytest.fixture(scope="function", autouse=True) -def after_run_return_all_stopped_hosts(client_shell: Shell, cluster: Cluster) -> str: - yield "After this test stopped services will be started automatically via fixture" - return_stopped_hosts(client_shell, cluster) +def after_run_return_all_stopped_hosts(cluster_state_controller: ClusterStateController) -> str: + yield + cluster_state_controller.start_stopped_hosts() @allure.step("Return all stopped storage services after test") @@ -78,16 +79,8 @@ def panic_reboot_host(host: Host) -> None: shell.exec('sudo sh -c "echo b > /proc/sysrq-trigger"', options) -def return_stopped_hosts(shell: Shell, cluster: Cluster) -> None: - for node in list(stopped_nodes): - with allure.step(f"Start host {node}"): - node.host.start_host() - stopped_nodes.remove(node) - - wait_all_storage_nodes_returned(shell, cluster) - - @pytest.mark.failover +@pytest.mark.failover_storage class TestFailoverStorage(ClusterTestBase): @allure.title("Shutdown and start node (stop_mode={stop_mode})") @pytest.mark.parametrize("stop_mode", ["hard", "soft"]) @@ -98,52 +91,68 @@ class TestFailoverStorage(ClusterTestBase): stop_mode: str, require_multiple_hosts, simple_object_size: ObjectSize, + cluster: Cluster, + cluster_state_controller: ClusterStateController, ): wallet = default_wallet placement_rule = "REP 2 IN X CBF 2 SELECT 2 FROM * AS X" source_file_path = generate_file(simple_object_size.value) - cid = create_container( - wallet, - shell=self.shell, - endpoint=self.cluster.default_rpc_endpoint, - rule=placement_rule, - basic_acl=PUBLIC_ACL, - ) - oid = put_object_to_random_node(wallet, source_file_path, cid, shell=self.shell, cluster=self.cluster) - nodes = wait_object_replication(cid, oid, 2, shell=self.shell, nodes=self.cluster.storage_nodes) + stopped_hosts_nodes = [] - for node in nodes: - stopped_nodes.append(node) - - with allure.step(f"Stop host {node}"): - node.host.stop_host(stop_mode) - - new_nodes = wait_object_replication( - cid, - oid, - 2, + with allure.step(f"Create container and put object"): + cid = create_container( + wallet, shell=self.shell, - nodes=list(set(self.cluster.storage_nodes) - {*stopped_nodes}), + endpoint=self.cluster.default_rpc_endpoint, + rule=placement_rule, + basic_acl=PUBLIC_ACL, ) - assert all(old_node not in new_nodes for old_node in nodes) + oid = put_object_to_random_node(wallet, source_file_path, cid, shell=self.shell, cluster=self.cluster) + + with allure.step(f"Wait for replication and get nodes with object"): + nodes_with_object = wait_object_replication(cid, oid, 2, shell=self.shell, nodes=self.cluster.storage_nodes) + + with allure.step(f"Stop 2 nodes with object and wait replication one by one"): + for storage_node in random.sample(nodes_with_object, 2): + stopped_hosts_nodes.append(storage_node) + + cluster_node = cluster.node(storage_node) + cluster_state_controller.stop_node_host(cluster_node, stop_mode) + + replicated_nodes = wait_object_replication( + cid, + oid, + 2, + shell=self.shell, + nodes=list(set(self.cluster.storage_nodes) - {*stopped_hosts_nodes}), + ) with allure.step("Check object data is not corrupted"): - got_file_path = get_object(wallet, cid, oid, endpoint=new_nodes[0].get_rpc_endpoint(), shell=self.shell) + got_file_path = get_object( + wallet, cid, oid, endpoint=replicated_nodes[0].get_rpc_endpoint(), shell=self.shell + ) assert get_file_hash(source_file_path) == get_file_hash(got_file_path) with allure.step("Return all hosts"): - return_stopped_hosts(self.shell, self.cluster) + cluster_state_controller.start_stopped_hosts() with allure.step("Check object data is not corrupted"): - new_nodes = wait_object_replication(cid, oid, 2, shell=self.shell, nodes=self.cluster.storage_nodes) - got_file_path = get_object(wallet, cid, oid, shell=self.shell, endpoint=new_nodes[0].get_rpc_endpoint()) + replicated_nodes = wait_object_replication(cid, oid, 2, shell=self.shell, nodes=self.cluster.storage_nodes) + got_file_path = get_object( + wallet, cid, oid, shell=self.shell, endpoint=replicated_nodes[0].get_rpc_endpoint() + ) assert get_file_hash(source_file_path) == get_file_hash(got_file_path) @allure.title("Panic reboot nodes (sequenced_reboots={sequence})") @pytest.mark.parametrize("sequence", [True, False]) @pytest.mark.failover_panic def test_panic_storage_node_host( - self, default_wallet, require_multiple_hosts, sequence: bool, simple_object_size: ObjectSize + self, + default_wallet, + require_multiple_hosts, + sequence: bool, + simple_object_size: ObjectSize, + cluster_state_controller: ClusterStateController, ): wallet = default_wallet placement_rule = "REP 2 IN X CBF 2 SELECT 2 FROM * AS X" @@ -157,38 +166,29 @@ class TestFailoverStorage(ClusterTestBase): ) oid = put_object_to_random_node(wallet, source_file_path, cid, shell=self.shell, cluster=self.cluster) - nodes = wait_object_replication(cid, oid, 2, shell=self.shell, nodes=self.cluster.storage_nodes) + nodes_with_object = wait_object_replication(cid, oid, 2, shell=self.shell, nodes=self.cluster.storage_nodes) allure.attach( - "\n".join([str(node) for node in nodes]), + "\n".join([str(node) for node in nodes_with_object]), "Current nodes with object", allure.attachment_type.TEXT, ) new_nodes: list[StorageNode] = [] - for node in nodes: - with allure.step(f"Hard reboot host {node} via magic SysRq option"): - panic_reboot_host(node.host) + with allure.step(f"Hard reboot hosts via magic SysRq option"): + for storage_node in nodes_with_object: + cluster_state_controller.panic_reboot_host(self.cluster.node(storage_node)) if sequence: - try: - new_nodes = wait_object_replication( - cid, - oid, - 2, - shell=self.shell, - nodes=list(set(self.cluster.storage_nodes) - {node}), - ) - except AssertionError: - new_nodes = wait_object_replication( - cid, - oid, - 2, - shell=self.shell, - nodes=self.cluster.storage_nodes, - ) + new_nodes = wait_object_replication( + cid, + oid, + 2, + shell=self.shell, + nodes=self.cluster.storage_nodes, + ) allure.attach( "\n".join([str(new_node) for new_node in new_nodes]), - f"Nodes with object after {node} fail", + f"Nodes with object after {storage_node} fail", allure.attachment_type.TEXT, ) @@ -213,19 +213,18 @@ class TestFailoverStorage(ClusterTestBase): after_run_return_all_stopped_services, ): default_node = self.cluster.cluster_nodes[0] - default_s3gate = self.cluster.s3_gates[0] with allure.step("Turn S3 GW off on default node"): - default_s3gate.stop_service() + cluster_state_controller.stop_s3_gate(default_node) with allure.step("Turn off storage on default node"): cluster_state_controller.stop_storage_service(default_node) with allure.step("Turn on S3 GW on default node"): - default_s3gate.start_service() + cluster_state_controller.start_s3_gate(default_node) with allure.step("Turn on storage on default node"): - cluster_state_controller.start_stopped_storage_services() + cluster_state_controller.start_storage_service(default_node) with allure.step("Create bucket with REP 1 SELECT 1 policy"): bucket = s3_client.create_bucket( @@ -246,6 +245,9 @@ class TestFailoverStorage(ClusterTestBase): with allure.step("Check that object is available"): s3_helper.check_objects_in_bucket(s3_client, bucket, expected_objects=[file_name]) + with allure.step("Start storage nodes"): + cluster_state_controller.start_stopped_storage_services() + @pytest.mark.failover @pytest.mark.failover_empty_map @@ -259,7 +261,7 @@ class TestEmptyMap(ClusterTestBase): def empty_map_offline_teardown(self): yield with allure.step("Return all storage nodes to network map"): - for node in list(stopped_nodes): + for node in stopped_nodes: include_node_to_network_map(node, node, shell=self.shell, cluster=self.cluster) stopped_nodes.remove(node) @@ -315,19 +317,12 @@ class TestEmptyMap(ClusterTestBase): @allure.step("Teardown after EmptyMap stop service test") @pytest.fixture() - def empty_map_stop_service_teardown(self): + def empty_map_stop_service_teardown(self, cluster_state_controller: ClusterStateController): yield with allure.step("Return all storage nodes to network map"): - for node in list(list(stopped_nodes)): - with allure.step(f"Start node {node}"): - node.start_service() - with allure.step(f"Waiting status ready for node {node}"): - wait_for_node_to_be_ready(node) - - sleep(datetime_utils.parse_time(MORPH_BLOCK_TIME)) - self.tick_epochs(1) + cluster_state_controller.start_stopped_storage_services() + for node in stopped_nodes: check_node_in_map(node, shell=self.shell, alive_node=node) - stopped_nodes.remove(node) @pytest.mark.failover_empty_map_stop_service @allure.title("Empty network map via stop all storage services (s3_client={s3_client})") @@ -337,6 +332,7 @@ class TestEmptyMap(ClusterTestBase): bucket: str, simple_object_size: ObjectSize, empty_map_stop_service_teardown, + cluster_state_controller: ClusterStateController, ): """ The test makes network map empty (stop storage service on all nodes @@ -369,39 +365,40 @@ class TestEmptyMap(ClusterTestBase): s3_helper.check_objects_in_bucket(s3_client, bucket, bucket_objects) with allure.step("Stop all storage nodes"): - for node in self.cluster.storage_nodes: - with allure.step(f"Stop storage service on node: {node}"): - stopped_nodes.append(node) - node.stop_service() + cluster_state_controller.stop_all_storage_services() with allure.step("Remove all nodes from network map"): - remove_nodes_from_map_morph(shell=self.shell, cluster=self.cluster, remove_nodes=stopped_nodes) + remove_nodes_from_map_morph( + shell=self.shell, cluster=self.cluster, remove_nodes=self.cluster.services(StorageNode) + ) with allure.step("Return all storage nodes to network map"): - self.return_nodes_after_stop_with_check_empty_map(stopped_nodes) + self.return_nodes_after_stop_with_check_empty_map(cluster_state_controller) with allure.step("Check that object exists in bucket"): s3_helper.check_objects_in_bucket(s3_client, bucket, bucket_objects) @allure.step("Return all nodes to cluster with check empty map first") - def return_nodes_after_stop_with_check_empty_map(self, return_nodes=None) -> None: - first_node = True - for node in list(return_nodes): - with allure.step(f"Start node {node}"): - node.start_service() - with allure.step(f"Waiting status ready for node {node}"): - wait_for_node_to_be_ready(node) + def return_nodes_after_stop_with_check_empty_map(self, cluster_state_controller: ClusterStateController) -> None: + first_node = self.cluster.cluster_nodes[0].service(StorageNode) - with allure.step("Make sure that network map is empty"): - if first_node: - for check_node in list(return_nodes): - check_node_not_in_map(check_node, shell=self.shell, alive_node=node) - first_node = False + with allure.step("Start first node and check network map"): + cluster_state_controller.start_storage_service(self.cluster.cluster_nodes[0]) + wait_for_node_to_be_ready(first_node) + + for check_node in self.cluster.storage_nodes: + check_node_not_in_map(check_node, shell=self.shell, alive_node=first_node) + + for node in self.cluster.cluster_nodes[1:]: + storage_node = node.service(StorageNode) + + cluster_state_controller.start_storage_service(node) + wait_for_node_to_be_ready(storage_node) sleep(datetime_utils.parse_time(MORPH_BLOCK_TIME)) self.tick_epochs(1) - check_node_in_map(node, shell=self.shell, alive_node=node) - stopped_nodes.remove(node) + + check_node_in_map(storage_node, shell=self.shell, alive_node=first_node) @allure.title("Object loss from fstree/blobovnicza (versioning=enabled, s3_client={s3_client})") def test_s3_fstree_blobovnicza_loss_versioning_on( @@ -537,11 +534,11 @@ class TestEmptyMap(ClusterTestBase): @pytest.mark.failover_data_loss class TestStorageDataLoss(ClusterTestBase): @allure.step("Get list of all piloramas on node") - def get_piloramas_list(self, cluster_state_controller, node) -> list: - data_directory_path = cluster_state_controller.get_data_directory() + def get_piloramas_list(self, node: StorageNode) -> list: + data_directory_path = node.get_data_directory() cmd = f"sudo ls -1 {data_directory_path}/meta*/pilorama*" - shell = cluster_state_controller.host.get_shell() + shell = node.host.get_shell() stdout = shell.exec(cmd).stdout piloramas = stdout.split("\n") @@ -590,11 +587,11 @@ class TestStorageDataLoss(ClusterTestBase): with allure.step("Enable resync_metabase option for storage services"): for storage_node in cluster_state_controller.cluster.storage_nodes: with allure.step(f"Enable resync_metabase option for {storage_node}"): - config_file_path, config = storage_node.get_shard_config_path() + config_file_path, config = storage_node.get_shards_config() if not config["storage"]["shard"]["default"]["resync_metabase"]: file_keeper.add(storage_node, config_file_path) config["storage"]["shard"]["default"]["resync_metabase"] = True - storage_node.save_config(config) + storage_node.save_config(config, config_file_path) with allure.step("Start storage services on all nodes"): cluster_state_controller.start_stopped_storage_services() @@ -752,7 +749,7 @@ class TestStorageDataLoss(ClusterTestBase): node_to_check = self.cluster.storage_nodes[0] piloramas_list_before_removing = {} with allure.step("Get list of all pilorama.db"): - piloramas_list_before_removing = self.get_piloramas_list(node_to_check, cluster_state_controller) + piloramas_list_before_removing = self.get_piloramas_list(node_to_check) with allure.step("Stop all storage nodes"): for node in self.cluster.cluster_nodes: @@ -771,7 +768,7 @@ class TestStorageDataLoss(ClusterTestBase): piloramas_list_afrer_removing = {} with allure.step("Get list of all pilorama.db after sync"): - piloramas_list_afrer_removing = self.get_piloramas_list(node_to_check, cluster_state_controller) + piloramas_list_afrer_removing = self.get_piloramas_list(node_to_check) assert piloramas_list_afrer_removing == piloramas_list_before_removing, "List of pilorama.db is different" with allure.step("Check bucket versioning"): diff --git a/requirements.txt b/requirements.txt index 623d60c..2f28a1a 100644 --- a/requirements.txt +++ b/requirements.txt @@ -11,6 +11,6 @@ pyyaml==6.0.1 pytest==7.1.2 pytest-lazy-fixture==0.6.3 python-dateutil==2.8.2 -requests==2.28.0 +requests==2.28.1 tenacity==8.0.1 urllib3==1.26.9 \ No newline at end of file