[#129] Updates for failover

Signed-off-by: Andrey Berezin <a.berezin@yadro.com>
(cherry picked from commit ae57672c7d)
This commit is contained in:
Andrey Berezin 2023-10-31 14:28:44 +03:00 committed by Dmitry Anurin
parent e8289a3f83
commit cd06a073a2
2 changed files with 44 additions and 54 deletions

View file

@ -8,7 +8,9 @@ import allure
import pytest import pytest
import yaml import yaml
from dateutil import parser from dateutil import parser
from frostfs_testlib.healthcheck.interfaces import Healthcheck
from frostfs_testlib.hosting import Hosting from frostfs_testlib.hosting import Hosting
from frostfs_testlib.plugins import load_plugin
from frostfs_testlib.reporter import AllureHandler, get_reporter from frostfs_testlib.reporter import AllureHandler, get_reporter
from frostfs_testlib.resources.common import ( from frostfs_testlib.resources.common import (
ASSETS_DIR, ASSETS_DIR,
@ -182,8 +184,18 @@ def s3_policy(request: pytest.FixtureRequest):
@pytest.fixture(scope="session") @pytest.fixture(scope="session")
def cluster_state_controller(client_shell: Shell, cluster: Cluster) -> ClusterStateController: @allure.title("[Session] Create healthcheck object")
controller = ClusterStateController(client_shell, cluster) def healthcheck(cluster: Cluster) -> Healthcheck:
healthcheck_cls = load_plugin(
"frostfs.testlib.healthcheck", cluster.cluster_nodes[0].host.config.healthcheck_plugin_name
)
return healthcheck_cls()
@pytest.fixture(scope="session")
def cluster_state_controller(client_shell: Shell, cluster: Cluster, healthcheck: Healthcheck) -> ClusterStateController:
controller = ClusterStateController(client_shell, cluster, healthcheck)
yield controller yield controller

View file

@ -5,11 +5,9 @@ from time import sleep
import allure import allure
import pytest import pytest
from frostfs_testlib.hosting import Host
from frostfs_testlib.resources.common import MORPH_BLOCK_TIME from frostfs_testlib.resources.common import MORPH_BLOCK_TIME
from frostfs_testlib.resources.wellknown_acl import PUBLIC_ACL from frostfs_testlib.resources.wellknown_acl import PUBLIC_ACL
from frostfs_testlib.s3 import AwsCliClient, Boto3ClientWrapper, S3ClientWrapper, VersioningStatus from frostfs_testlib.s3 import AwsCliClient, Boto3ClientWrapper, S3ClientWrapper, VersioningStatus
from frostfs_testlib.shell import CommandOptions
from frostfs_testlib.steps.cli.container import StorageContainer, StorageContainerInfo, create_container from frostfs_testlib.steps.cli.container import StorageContainer, StorageContainerInfo, create_container
from frostfs_testlib.steps.cli.object import get_object, put_object_to_random_node from frostfs_testlib.steps.cli.object import get_object, put_object_to_random_node
from frostfs_testlib.steps.node_management import ( from frostfs_testlib.steps.node_management import (
@ -21,7 +19,7 @@ from frostfs_testlib.steps.node_management import (
wait_for_node_to_be_ready, wait_for_node_to_be_ready,
) )
from frostfs_testlib.steps.s3 import s3_helper from frostfs_testlib.steps.s3 import s3_helper
from frostfs_testlib.storage.cluster import Cluster, ClusterNode, StorageNode from frostfs_testlib.storage.cluster import Cluster, ClusterNode, S3Gate, StorageNode
from frostfs_testlib.storage.controllers import ClusterStateController, ShardsWatcher from frostfs_testlib.storage.controllers import ClusterStateController, ShardsWatcher
from frostfs_testlib.storage.dataclasses.object_size import ObjectSize from frostfs_testlib.storage.dataclasses.object_size import ObjectSize
from frostfs_testlib.storage.dataclasses.storage_object_info import StorageObjectInfo from frostfs_testlib.storage.dataclasses.storage_object_info import StorageObjectInfo
@ -57,26 +55,11 @@ def after_run_return_all_stopped_hosts(cluster_state_controller: ClusterStateCon
cluster_state_controller.start_stopped_hosts() cluster_state_controller.start_stopped_hosts()
@allure.step("Return all stopped storage services after test") @allure.step("Return all stopped services after test")
@pytest.fixture(scope="function") @pytest.fixture(scope="function")
def after_run_return_all_stopped_services(cluster_state_controller: ClusterStateController): def after_run_return_all_stopped_services(cluster_state_controller: ClusterStateController):
yield yield
cluster_state_controller.start_stopped_storage_services() cluster_state_controller.start_all_stopped_services()
@allure.step("Return all stopped S3 GateWay services after test")
@pytest.fixture(scope="function")
def after_run_return_all_stopped_s3(cluster_state_controller: ClusterStateController):
yield
cluster_state_controller.start_stopped_s3_gates()
def panic_reboot_host(host: Host) -> None:
shell = host.get_shell()
shell.exec('sudo sh -c "echo 1 > /proc/sys/kernel/sysrq"')
options = CommandOptions(close_stdin=True, timeout=1, check=False)
shell.exec('sudo sh -c "echo b > /proc/sysrq-trigger"', options)
@pytest.mark.failover @pytest.mark.failover
@ -209,22 +192,21 @@ class TestFailoverStorage(ClusterTestBase):
s3_client: S3ClientWrapper, s3_client: S3ClientWrapper,
simple_object_size: ObjectSize, simple_object_size: ObjectSize,
cluster_state_controller: ClusterStateController, cluster_state_controller: ClusterStateController,
after_run_return_all_stopped_s3,
after_run_return_all_stopped_services, after_run_return_all_stopped_services,
): ):
default_node = self.cluster.cluster_nodes[0] default_node = self.cluster.cluster_nodes[0]
with allure.step("Turn S3 GW off on default node"): with allure.step("Turn S3 GW off on default node"):
cluster_state_controller.stop_s3_gate(default_node) cluster_state_controller.stop_service_of_type(default_node, S3Gate)
with allure.step("Turn off storage on default node"): with allure.step("Turn off storage on default node"):
cluster_state_controller.stop_storage_service(default_node) cluster_state_controller.stop_service_of_type(default_node, StorageNode)
with allure.step("Turn on S3 GW on default node"): with allure.step("Turn on S3 GW on default node"):
cluster_state_controller.start_s3_gate(default_node) cluster_state_controller.start_service_of_type(default_node, S3Gate)
with allure.step("Turn on storage on default node"): with allure.step("Turn on storage on default node"):
cluster_state_controller.start_storage_service(default_node) cluster_state_controller.start_service_of_type(default_node, StorageNode)
with allure.step("Create bucket with REP 1 SELECT 1 policy"): with allure.step("Create bucket with REP 1 SELECT 1 policy"):
bucket = s3_client.create_bucket( bucket = s3_client.create_bucket(
@ -240,13 +222,13 @@ class TestFailoverStorage(ClusterTestBase):
with allure.step("Turn off all storage nodes except default"): with allure.step("Turn off all storage nodes except default"):
for node in self.cluster.cluster_nodes[1:]: for node in self.cluster.cluster_nodes[1:]:
with allure.step(f"Stop storage service on node: {node}"): with allure.step(f"Stop storage service on node: {node}"):
cluster_state_controller.stop_storage_service(node) cluster_state_controller.stop_service_of_type(node, StorageNode)
with allure.step("Check that object is available"): with allure.step("Check that object is available"):
s3_helper.check_objects_in_bucket(s3_client, bucket, expected_objects=[file_name]) s3_helper.check_objects_in_bucket(s3_client, bucket, expected_objects=[file_name])
with allure.step("Start storage nodes"): with allure.step("Start storage nodes"):
cluster_state_controller.start_stopped_storage_services() cluster_state_controller.start_all_stopped_services()
@pytest.mark.failover @pytest.mark.failover
@ -320,7 +302,7 @@ class TestEmptyMap(ClusterTestBase):
def empty_map_stop_service_teardown(self, cluster_state_controller: ClusterStateController): def empty_map_stop_service_teardown(self, cluster_state_controller: ClusterStateController):
yield yield
with allure.step("Return all storage nodes to network map"): with allure.step("Return all storage nodes to network map"):
cluster_state_controller.start_stopped_storage_services() cluster_state_controller.start_all_stopped_services()
for node in stopped_nodes: for node in stopped_nodes:
check_node_in_map(node, shell=self.shell, alive_node=node) check_node_in_map(node, shell=self.shell, alive_node=node)
@ -365,7 +347,7 @@ class TestEmptyMap(ClusterTestBase):
s3_helper.check_objects_in_bucket(s3_client, bucket, bucket_objects) s3_helper.check_objects_in_bucket(s3_client, bucket, bucket_objects)
with allure.step("Stop all storage nodes"): with allure.step("Stop all storage nodes"):
cluster_state_controller.stop_all_storage_services() cluster_state_controller.stop_services_of_type(StorageNode)
with allure.step("Remove all nodes from network map"): with allure.step("Remove all nodes from network map"):
remove_nodes_from_map_morph( remove_nodes_from_map_morph(
@ -383,7 +365,7 @@ class TestEmptyMap(ClusterTestBase):
first_node = self.cluster.cluster_nodes[0].service(StorageNode) first_node = self.cluster.cluster_nodes[0].service(StorageNode)
with allure.step("Start first node and check network map"): with allure.step("Start first node and check network map"):
cluster_state_controller.start_storage_service(self.cluster.cluster_nodes[0]) cluster_state_controller.start_service_of_type(self.cluster.cluster_nodes[0], StorageNode)
wait_for_node_to_be_ready(first_node) wait_for_node_to_be_ready(first_node)
for check_node in self.cluster.storage_nodes: for check_node in self.cluster.storage_nodes:
@ -392,7 +374,7 @@ class TestEmptyMap(ClusterTestBase):
for node in self.cluster.cluster_nodes[1:]: for node in self.cluster.cluster_nodes[1:]:
storage_node = node.service(StorageNode) storage_node = node.service(StorageNode)
cluster_state_controller.start_storage_service(node) cluster_state_controller.start_service_of_type(node, StorageNode)
wait_for_node_to_be_ready(storage_node) wait_for_node_to_be_ready(storage_node)
sleep(datetime_utils.parse_time(MORPH_BLOCK_TIME)) sleep(datetime_utils.parse_time(MORPH_BLOCK_TIME))
@ -420,9 +402,7 @@ class TestEmptyMap(ClusterTestBase):
object_versions.append(put_object) object_versions.append(put_object)
with allure.step("Stop all storage nodes"): with allure.step("Stop all storage nodes"):
for node in self.cluster.cluster_nodes: cluster_state_controller.stop_services_of_type(StorageNode)
with allure.step(f"Stop storage service on node: {node}"):
cluster_state_controller.stop_storage_service(node)
with allure.step("Delete blobovnicza and fstree from all nodes"): with allure.step("Delete blobovnicza and fstree from all nodes"):
for node in self.cluster.storage_nodes: for node in self.cluster.storage_nodes:
@ -430,7 +410,7 @@ class TestEmptyMap(ClusterTestBase):
node.delete_fstree() node.delete_fstree()
with allure.step("Start all storage nodes"): with allure.step("Start all storage nodes"):
cluster_state_controller.start_stopped_storage_services() cluster_state_controller.start_all_stopped_services()
# need to get Delete Marker first # need to get Delete Marker first
with allure.step("Delete the object from the bucket"): with allure.step("Delete the object from the bucket"):
@ -462,9 +442,7 @@ class TestEmptyMap(ClusterTestBase):
s3_helper.check_objects_in_bucket(s3_client, bucket, expected_objects=[file_name]) s3_helper.check_objects_in_bucket(s3_client, bucket, expected_objects=[file_name])
with allure.step("Stop all storage nodes"): with allure.step("Stop all storage nodes"):
for node in self.cluster.cluster_nodes: cluster_state_controller.stop_services_of_type(StorageNode)
with allure.step(f"Stop storage service on node: {node}"):
cluster_state_controller.stop_storage_service(node)
with allure.step("Delete blobovnicza and fstree from all nodes"): with allure.step("Delete blobovnicza and fstree from all nodes"):
for node in self.cluster.storage_nodes: for node in self.cluster.storage_nodes:
@ -472,7 +450,7 @@ class TestEmptyMap(ClusterTestBase):
node.delete_fstree() node.delete_fstree()
with allure.step("Start all storage nodes"): with allure.step("Start all storage nodes"):
cluster_state_controller.start_stopped_storage_services() cluster_state_controller.start_all_stopped_services()
with allure.step("Delete the object from the bucket"): with allure.step("Delete the object from the bucket"):
s3_client.delete_object(bucket, file_name) s3_client.delete_object(bucket, file_name)
@ -507,16 +485,14 @@ class TestEmptyMap(ClusterTestBase):
s3_helper.check_objects_in_bucket(s3_client, bucket, expected_objects=[file_name]) s3_helper.check_objects_in_bucket(s3_client, bucket, expected_objects=[file_name])
with allure.step("Stop all storage nodes"): with allure.step("Stop all storage nodes"):
for node in self.cluster.cluster_nodes: cluster_state_controller.stop_services_of_type(StorageNode)
with allure.step(f"Stop storage service on node: {node}"):
cluster_state_controller.stop_storage_service(node)
with allure.step("Delete pilorama.db from all nodes"): with allure.step("Delete pilorama.db from all nodes"):
for node in self.cluster.storage_nodes: for node in self.cluster.storage_nodes:
node.delete_pilorama() node.delete_pilorama()
with allure.step("Start all storage nodes"): with allure.step("Start all storage nodes"):
cluster_state_controller.start_stopped_storage_services() cluster_state_controller.start_all_stopped_services()
with allure.step("Check list objects first time"): with allure.step("Check list objects first time"):
objects_list = s3_client.list_objects(bucket) objects_list = s3_client.list_objects(bucket)
@ -578,7 +554,7 @@ class TestStorageDataLoss(ClusterTestBase):
) )
with allure.step("Stop storage services on all nodes"): with allure.step("Stop storage services on all nodes"):
cluster_state_controller.stop_all_storage_services() cluster_state_controller.stop_services_of_type(StorageNode)
with allure.step("Delete metabase from all nodes"): with allure.step("Delete metabase from all nodes"):
for node in cluster_state_controller.cluster.storage_nodes: for node in cluster_state_controller.cluster.storage_nodes:
@ -594,7 +570,11 @@ class TestStorageDataLoss(ClusterTestBase):
storage_node.save_config(config, config_file_path) storage_node.save_config(config, config_file_path)
with allure.step("Start storage services on all nodes"): with allure.step("Start storage services on all nodes"):
cluster_state_controller.start_stopped_storage_services() cluster_state_controller.start_all_stopped_services()
with allure.step("Wait for tree rebalance"):
# TODO: Use product metric when we have proper ones for this check
sleep(30)
with allure.step("Delete objects from bucket"): with allure.step("Delete objects from bucket"):
with allure.step("Delete simple object from bucket"): with allure.step("Delete simple object from bucket"):
@ -655,13 +635,13 @@ class TestStorageDataLoss(ClusterTestBase):
shards_watcher.take_shards_snapshot() shards_watcher.take_shards_snapshot()
with allure.step(f"Stop storage service on node {node_under_test}"): with allure.step(f"Stop storage service on node {node_under_test}"):
cluster_state_controller.stop_storage_service(node_under_test) cluster_state_controller.stop_service_of_type(node_under_test, StorageNode)
with allure.step(f"Delete write cache from node {node_under_test}"): with allure.step(f"Delete write cache from node {node_under_test}"):
node_under_test.storage_node.delete_write_cache() node_under_test.storage_node.delete_write_cache()
with allure.step(f"Start storage service on node {node_under_test}"): with allure.step(f"Start storage service on node {node_under_test}"):
cluster_state_controller.start_storage_service(node_under_test) cluster_state_controller.start_all_stopped_services()
with allure.step("Objects should be available"): with allure.step("Objects should be available"):
for storage_object in storage_objects: for storage_object in storage_objects:
@ -710,7 +690,7 @@ class TestStorageDataLoss(ClusterTestBase):
with allure.step("Stop one node and wait for rebalance connection of s3 gate to storage service"): with allure.step("Stop one node and wait for rebalance connection of s3 gate to storage service"):
current_node = self.cluster.cluster_nodes[0] current_node = self.cluster.cluster_nodes[0]
cluster_state_controller.stop_storage_service(current_node) cluster_state_controller.stop_service_of_type(current_node, StorageNode)
# waiting for rebalance connection of s3 gate to storage service # waiting for rebalance connection of s3 gate to storage service
sleep(60) sleep(60)
@ -752,15 +732,13 @@ class TestStorageDataLoss(ClusterTestBase):
piloramas_list_before_removing = self.get_piloramas_list(node_to_check) piloramas_list_before_removing = self.get_piloramas_list(node_to_check)
with allure.step("Stop all storage nodes"): with allure.step("Stop all storage nodes"):
for node in self.cluster.cluster_nodes: cluster_state_controller.stop_services_of_type(StorageNode)
with allure.step(f"Stop storage service on node: {node}"):
cluster_state_controller.stop_storage_service(node)
with allure.step("Delete pilorama.db from one node"): with allure.step("Delete pilorama.db from one node"):
node_to_check.delete_pilorama() node_to_check.delete_pilorama()
with allure.step("Start all storage nodes"): with allure.step("Start all storage nodes"):
cluster_state_controller.start_stopped_storage_services() cluster_state_controller.start_all_stopped_services()
with allure.step("Tick epoch to trigger sync and then wait for 1 minute"): with allure.step("Tick epoch to trigger sync and then wait for 1 minute"):
self.tick_epochs(1) self.tick_epochs(1)