Updates for failover #129
2 changed files with 44 additions and 54 deletions
|
@@ -8,7 +8,9 @@ import allure
|
||||||
import pytest
|
import pytest
|
||||||
import yaml
|
import yaml
|
||||||
from dateutil import parser
|
from dateutil import parser
|
||||||
|
from frostfs_testlib.healthcheck.interfaces import Healthcheck
|
||||||
from frostfs_testlib.hosting import Hosting
|
from frostfs_testlib.hosting import Hosting
|
||||||
|
from frostfs_testlib.plugins import load_plugin
|
||||||
from frostfs_testlib.reporter import AllureHandler, get_reporter
|
from frostfs_testlib.reporter import AllureHandler, get_reporter
|
||||||
from frostfs_testlib.resources.common import (
|
from frostfs_testlib.resources.common import (
|
||||||
ASSETS_DIR,
|
ASSETS_DIR,
|
||||||
|
@@ -182,8 +184,18 @@ def s3_policy(request: pytest.FixtureRequest):
|
||||||
|
|
||||||
|
|
||||||
@pytest.fixture(scope="session")
|
@pytest.fixture(scope="session")
|
||||||
def cluster_state_controller(client_shell: Shell, cluster: Cluster) -> ClusterStateController:
|
@allure.title("[Session] Create healthcheck object")
|
||||||
controller = ClusterStateController(client_shell, cluster)
|
def healthcheck(cluster: Cluster) -> Healthcheck:
|
||||||
|
healthcheck_cls = load_plugin(
|
||||||
|
"frostfs.testlib.healthcheck", cluster.cluster_nodes[0].host.config.healthcheck_plugin_name
|
||||||
|
)
|
||||||
|
|
||||||
|
return healthcheck_cls()
|
||||||
|
|
||||||
|
|
||||||
|
@pytest.fixture(scope="session")
|
||||||
|
def cluster_state_controller(client_shell: Shell, cluster: Cluster, healthcheck: Healthcheck) -> ClusterStateController:
|
||||||
|
controller = ClusterStateController(client_shell, cluster, healthcheck)
|
||||||
yield controller
|
yield controller
|
||||||
|
|
||||||
|
|
||||||
|
|
|
@@ -5,11 +5,9 @@ from time import sleep
|
||||||
|
|
||||||
import allure
|
import allure
|
||||||
import pytest
|
import pytest
|
||||||
from frostfs_testlib.hosting import Host
|
|
||||||
from frostfs_testlib.resources.common import MORPH_BLOCK_TIME
|
from frostfs_testlib.resources.common import MORPH_BLOCK_TIME
|
||||||
from frostfs_testlib.resources.wellknown_acl import PUBLIC_ACL
|
from frostfs_testlib.resources.wellknown_acl import PUBLIC_ACL
|
||||||
from frostfs_testlib.s3 import AwsCliClient, Boto3ClientWrapper, S3ClientWrapper, VersioningStatus
|
from frostfs_testlib.s3 import AwsCliClient, Boto3ClientWrapper, S3ClientWrapper, VersioningStatus
|
||||||
from frostfs_testlib.shell import CommandOptions
|
|
||||||
from frostfs_testlib.steps.cli.container import StorageContainer, StorageContainerInfo, create_container
|
from frostfs_testlib.steps.cli.container import StorageContainer, StorageContainerInfo, create_container
|
||||||
from frostfs_testlib.steps.cli.object import get_object, put_object_to_random_node
|
from frostfs_testlib.steps.cli.object import get_object, put_object_to_random_node
|
||||||
from frostfs_testlib.steps.node_management import (
|
from frostfs_testlib.steps.node_management import (
|
||||||
|
@@ -21,7 +19,7 @@ from frostfs_testlib.steps.node_management import (
|
||||||
wait_for_node_to_be_ready,
|
wait_for_node_to_be_ready,
|
||||||
)
|
)
|
||||||
from frostfs_testlib.steps.s3 import s3_helper
|
from frostfs_testlib.steps.s3 import s3_helper
|
||||||
from frostfs_testlib.storage.cluster import Cluster, ClusterNode, StorageNode
|
from frostfs_testlib.storage.cluster import Cluster, ClusterNode, S3Gate, StorageNode
|
||||||
from frostfs_testlib.storage.controllers import ClusterStateController, ShardsWatcher
|
from frostfs_testlib.storage.controllers import ClusterStateController, ShardsWatcher
|
||||||
from frostfs_testlib.storage.dataclasses.object_size import ObjectSize
|
from frostfs_testlib.storage.dataclasses.object_size import ObjectSize
|
||||||
from frostfs_testlib.storage.dataclasses.storage_object_info import StorageObjectInfo
|
from frostfs_testlib.storage.dataclasses.storage_object_info import StorageObjectInfo
|
||||||
|
@@ -57,26 +55,11 @@ def after_run_return_all_stopped_hosts(cluster_state_controller: ClusterStateCon
|
||||||
cluster_state_controller.start_stopped_hosts()
|
cluster_state_controller.start_stopped_hosts()
|
||||||
|
|
||||||
|
|
||||||
@allure.step("Return all stopped storage services after test")
|
@allure.step("Return all stopped services after test")
|
||||||
@pytest.fixture(scope="function")
|
@pytest.fixture(scope="function")
|
||||||
def after_run_return_all_stopped_services(cluster_state_controller: ClusterStateController):
|
def after_run_return_all_stopped_services(cluster_state_controller: ClusterStateController):
|
||||||
yield
|
yield
|
||||||
cluster_state_controller.start_stopped_storage_services()
|
cluster_state_controller.start_all_stopped_services()
|
||||||
|
|
||||||
|
|
||||||
@allure.step("Return all stopped S3 GateWay services after test")
|
|
||||||
@pytest.fixture(scope="function")
|
|
||||||
def after_run_return_all_stopped_s3(cluster_state_controller: ClusterStateController):
|
|
||||||
yield
|
|
||||||
cluster_state_controller.start_stopped_s3_gates()
|
|
||||||
|
|
||||||
|
|
||||||
def panic_reboot_host(host: Host) -> None:
|
|
||||||
shell = host.get_shell()
|
|
||||||
shell.exec('sudo sh -c "echo 1 > /proc/sys/kernel/sysrq"')
|
|
||||||
|
|
||||||
options = CommandOptions(close_stdin=True, timeout=1, check=False)
|
|
||||||
shell.exec('sudo sh -c "echo b > /proc/sysrq-trigger"', options)
|
|
||||||
|
|
||||||
|
|
||||||
@pytest.mark.failover
|
@pytest.mark.failover
|
||||||
|
@@ -209,22 +192,21 @@ class TestFailoverStorage(ClusterTestBase):
|
||||||
s3_client: S3ClientWrapper,
|
s3_client: S3ClientWrapper,
|
||||||
simple_object_size: ObjectSize,
|
simple_object_size: ObjectSize,
|
||||||
cluster_state_controller: ClusterStateController,
|
cluster_state_controller: ClusterStateController,
|
||||||
after_run_return_all_stopped_s3,
|
|
||||||
after_run_return_all_stopped_services,
|
after_run_return_all_stopped_services,
|
||||||
):
|
):
|
||||||
default_node = self.cluster.cluster_nodes[0]
|
default_node = self.cluster.cluster_nodes[0]
|
||||||
|
|
||||||
with allure.step("Turn S3 GW off on default node"):
|
with allure.step("Turn S3 GW off on default node"):
|
||||||
cluster_state_controller.stop_s3_gate(default_node)
|
cluster_state_controller.stop_service_of_type(default_node, S3Gate)
|
||||||
|
|
||||||
with allure.step("Turn off storage on default node"):
|
with allure.step("Turn off storage on default node"):
|
||||||
cluster_state_controller.stop_storage_service(default_node)
|
cluster_state_controller.stop_service_of_type(default_node, StorageNode)
|
||||||
|
|
||||||
with allure.step("Turn on S3 GW on default node"):
|
with allure.step("Turn on S3 GW on default node"):
|
||||||
cluster_state_controller.start_s3_gate(default_node)
|
cluster_state_controller.start_service_of_type(default_node, S3Gate)
|
||||||
|
|
||||||
with allure.step("Turn on storage on default node"):
|
with allure.step("Turn on storage on default node"):
|
||||||
cluster_state_controller.start_storage_service(default_node)
|
cluster_state_controller.start_service_of_type(default_node, StorageNode)
|
||||||
|
|
||||||
with allure.step("Create bucket with REP 1 SELECT 1 policy"):
|
with allure.step("Create bucket with REP 1 SELECT 1 policy"):
|
||||||
bucket = s3_client.create_bucket(
|
bucket = s3_client.create_bucket(
|
||||||
|
@@ -240,13 +222,13 @@ class TestFailoverStorage(ClusterTestBase):
|
||||||
with allure.step("Turn off all storage nodes except default"):
|
with allure.step("Turn off all storage nodes except default"):
|
||||||
for node in self.cluster.cluster_nodes[1:]:
|
for node in self.cluster.cluster_nodes[1:]:
|
||||||
with allure.step(f"Stop storage service on node: {node}"):
|
with allure.step(f"Stop storage service on node: {node}"):
|
||||||
cluster_state_controller.stop_storage_service(node)
|
cluster_state_controller.stop_service_of_type(node, StorageNode)
|
||||||
|
|
||||||
with allure.step("Check that object is available"):
|
with allure.step("Check that object is available"):
|
||||||
s3_helper.check_objects_in_bucket(s3_client, bucket, expected_objects=[file_name])
|
s3_helper.check_objects_in_bucket(s3_client, bucket, expected_objects=[file_name])
|
||||||
|
|
||||||
with allure.step("Start storage nodes"):
|
with allure.step("Start storage nodes"):
|
||||||
cluster_state_controller.start_stopped_storage_services()
|
cluster_state_controller.start_all_stopped_services()
|
||||||
|
|
||||||
|
|
||||||
@pytest.mark.failover
|
@pytest.mark.failover
|
||||||
|
@@ -320,7 +302,7 @@ class TestEmptyMap(ClusterTestBase):
|
||||||
def empty_map_stop_service_teardown(self, cluster_state_controller: ClusterStateController):
|
def empty_map_stop_service_teardown(self, cluster_state_controller: ClusterStateController):
|
||||||
yield
|
yield
|
||||||
with allure.step("Return all storage nodes to network map"):
|
with allure.step("Return all storage nodes to network map"):
|
||||||
cluster_state_controller.start_stopped_storage_services()
|
cluster_state_controller.start_all_stopped_services()
|
||||||
for node in stopped_nodes:
|
for node in stopped_nodes:
|
||||||
check_node_in_map(node, shell=self.shell, alive_node=node)
|
check_node_in_map(node, shell=self.shell, alive_node=node)
|
||||||
|
|
||||||
|
@@ -365,7 +347,7 @@ class TestEmptyMap(ClusterTestBase):
|
||||||
s3_helper.check_objects_in_bucket(s3_client, bucket, bucket_objects)
|
s3_helper.check_objects_in_bucket(s3_client, bucket, bucket_objects)
|
||||||
|
|
||||||
with allure.step("Stop all storage nodes"):
|
with allure.step("Stop all storage nodes"):
|
||||||
cluster_state_controller.stop_all_storage_services()
|
cluster_state_controller.stop_services_of_type(StorageNode)
|
||||||
|
|
||||||
with allure.step("Remove all nodes from network map"):
|
with allure.step("Remove all nodes from network map"):
|
||||||
remove_nodes_from_map_morph(
|
remove_nodes_from_map_morph(
|
||||||
|
@@ -383,7 +365,7 @@ class TestEmptyMap(ClusterTestBase):
|
||||||
first_node = self.cluster.cluster_nodes[0].service(StorageNode)
|
first_node = self.cluster.cluster_nodes[0].service(StorageNode)
|
||||||
|
|
||||||
with allure.step("Start first node and check network map"):
|
with allure.step("Start first node and check network map"):
|
||||||
cluster_state_controller.start_storage_service(self.cluster.cluster_nodes[0])
|
cluster_state_controller.start_service_of_type(self.cluster.cluster_nodes[0], StorageNode)
|
||||||
wait_for_node_to_be_ready(first_node)
|
wait_for_node_to_be_ready(first_node)
|
||||||
|
|
||||||
for check_node in self.cluster.storage_nodes:
|
for check_node in self.cluster.storage_nodes:
|
||||||
|
@@ -392,7 +374,7 @@ class TestEmptyMap(ClusterTestBase):
|
||||||
for node in self.cluster.cluster_nodes[1:]:
|
for node in self.cluster.cluster_nodes[1:]:
|
||||||
storage_node = node.service(StorageNode)
|
storage_node = node.service(StorageNode)
|
||||||
|
|
||||||
cluster_state_controller.start_storage_service(node)
|
cluster_state_controller.start_service_of_type(node, StorageNode)
|
||||||
wait_for_node_to_be_ready(storage_node)
|
wait_for_node_to_be_ready(storage_node)
|
||||||
|
|
||||||
sleep(datetime_utils.parse_time(MORPH_BLOCK_TIME))
|
sleep(datetime_utils.parse_time(MORPH_BLOCK_TIME))
|
||||||
|
@@ -420,9 +402,7 @@ class TestEmptyMap(ClusterTestBase):
|
||||||
object_versions.append(put_object)
|
object_versions.append(put_object)
|
||||||
|
|
||||||
with allure.step("Stop all storage nodes"):
|
with allure.step("Stop all storage nodes"):
|
||||||
for node in self.cluster.cluster_nodes:
|
cluster_state_controller.stop_services_of_type(StorageNode)
|
||||||
with allure.step(f"Stop storage service on node: {node}"):
|
|
||||||
cluster_state_controller.stop_storage_service(node)
|
|
||||||
|
|
||||||
with allure.step("Delete blobovnicza and fstree from all nodes"):
|
with allure.step("Delete blobovnicza and fstree from all nodes"):
|
||||||
for node in self.cluster.storage_nodes:
|
for node in self.cluster.storage_nodes:
|
||||||
|
@@ -430,7 +410,7 @@ class TestEmptyMap(ClusterTestBase):
|
||||||
node.delete_fstree()
|
node.delete_fstree()
|
||||||
|
|
||||||
with allure.step("Start all storage nodes"):
|
with allure.step("Start all storage nodes"):
|
||||||
cluster_state_controller.start_stopped_storage_services()
|
cluster_state_controller.start_all_stopped_services()
|
||||||
|
|
||||||
# need to get Delete Marker first
|
# need to get Delete Marker first
|
||||||
with allure.step("Delete the object from the bucket"):
|
with allure.step("Delete the object from the bucket"):
|
||||||
|
@@ -462,9 +442,7 @@ class TestEmptyMap(ClusterTestBase):
|
||||||
s3_helper.check_objects_in_bucket(s3_client, bucket, expected_objects=[file_name])
|
s3_helper.check_objects_in_bucket(s3_client, bucket, expected_objects=[file_name])
|
||||||
|
|
||||||
with allure.step("Stop all storage nodes"):
|
with allure.step("Stop all storage nodes"):
|
||||||
for node in self.cluster.cluster_nodes:
|
cluster_state_controller.stop_services_of_type(StorageNode)
|
||||||
with allure.step(f"Stop storage service on node: {node}"):
|
|
||||||
cluster_state_controller.stop_storage_service(node)
|
|
||||||
|
|
||||||
with allure.step("Delete blobovnicza and fstree from all nodes"):
|
with allure.step("Delete blobovnicza and fstree from all nodes"):
|
||||||
for node in self.cluster.storage_nodes:
|
for node in self.cluster.storage_nodes:
|
||||||
|
@@ -472,7 +450,7 @@ class TestEmptyMap(ClusterTestBase):
|
||||||
node.delete_fstree()
|
node.delete_fstree()
|
||||||
|
|
||||||
with allure.step("Start all storage nodes"):
|
with allure.step("Start all storage nodes"):
|
||||||
cluster_state_controller.start_stopped_storage_services()
|
cluster_state_controller.start_all_stopped_services()
|
||||||
|
|
||||||
with allure.step("Delete the object from the bucket"):
|
with allure.step("Delete the object from the bucket"):
|
||||||
s3_client.delete_object(bucket, file_name)
|
s3_client.delete_object(bucket, file_name)
|
||||||
|
@@ -507,16 +485,14 @@ class TestEmptyMap(ClusterTestBase):
|
||||||
s3_helper.check_objects_in_bucket(s3_client, bucket, expected_objects=[file_name])
|
s3_helper.check_objects_in_bucket(s3_client, bucket, expected_objects=[file_name])
|
||||||
|
|
||||||
with allure.step("Stop all storage nodes"):
|
with allure.step("Stop all storage nodes"):
|
||||||
for node in self.cluster.cluster_nodes:
|
cluster_state_controller.stop_services_of_type(StorageNode)
|
||||||
with allure.step(f"Stop storage service on node: {node}"):
|
|
||||||
cluster_state_controller.stop_storage_service(node)
|
|
||||||
|
|
||||||
with allure.step("Delete pilorama.db from all nodes"):
|
with allure.step("Delete pilorama.db from all nodes"):
|
||||||
for node in self.cluster.storage_nodes:
|
for node in self.cluster.storage_nodes:
|
||||||
node.delete_pilorama()
|
node.delete_pilorama()
|
||||||
|
|
||||||
with allure.step("Start all storage nodes"):
|
with allure.step("Start all storage nodes"):
|
||||||
cluster_state_controller.start_stopped_storage_services()
|
cluster_state_controller.start_all_stopped_services()
|
||||||
|
|
||||||
with allure.step("Check list objects first time"):
|
with allure.step("Check list objects first time"):
|
||||||
objects_list = s3_client.list_objects(bucket)
|
objects_list = s3_client.list_objects(bucket)
|
||||||
|
@@ -578,7 +554,7 @@ class TestStorageDataLoss(ClusterTestBase):
|
||||||
)
|
)
|
||||||
|
|
||||||
with allure.step("Stop storage services on all nodes"):
|
with allure.step("Stop storage services on all nodes"):
|
||||||
cluster_state_controller.stop_all_storage_services()
|
cluster_state_controller.stop_services_of_type(StorageNode)
|
||||||
|
|
||||||
with allure.step("Delete metabase from all nodes"):
|
with allure.step("Delete metabase from all nodes"):
|
||||||
for node in cluster_state_controller.cluster.storage_nodes:
|
for node in cluster_state_controller.cluster.storage_nodes:
|
||||||
|
@@ -594,7 +570,11 @@ class TestStorageDataLoss(ClusterTestBase):
|
||||||
storage_node.save_config(config, config_file_path)
|
storage_node.save_config(config, config_file_path)
|
||||||
|
|
||||||
with allure.step("Start storage services on all nodes"):
|
with allure.step("Start storage services on all nodes"):
|
||||||
cluster_state_controller.start_stopped_storage_services()
|
cluster_state_controller.start_all_stopped_services()
|
||||||
|
|
||||||
|
with allure.step("Wait for tree rebalance"):
|
||||||
|
# TODO: Use product metric when we have proper ones for this check
|
||||||
|
sleep(30)
|
||||||
|
|
||||||
with allure.step("Delete objects from bucket"):
|
with allure.step("Delete objects from bucket"):
|
||||||
with allure.step("Delete simple object from bucket"):
|
with allure.step("Delete simple object from bucket"):
|
||||||
|
@@ -655,13 +635,13 @@ class TestStorageDataLoss(ClusterTestBase):
|
||||||
shards_watcher.take_shards_snapshot()
|
shards_watcher.take_shards_snapshot()
|
||||||
|
|
||||||
with allure.step(f"Stop storage service on node {node_under_test}"):
|
with allure.step(f"Stop storage service on node {node_under_test}"):
|
||||||
cluster_state_controller.stop_storage_service(node_under_test)
|
cluster_state_controller.stop_service_of_type(node_under_test, StorageNode)
|
||||||
|
|
||||||
with allure.step(f"Delete write cache from node {node_under_test}"):
|
with allure.step(f"Delete write cache from node {node_under_test}"):
|
||||||
node_under_test.storage_node.delete_write_cache()
|
node_under_test.storage_node.delete_write_cache()
|
||||||
|
|
||||||
with allure.step(f"Start storage service on node {node_under_test}"):
|
with allure.step(f"Start storage service on node {node_under_test}"):
|
||||||
cluster_state_controller.start_storage_service(node_under_test)
|
cluster_state_controller.start_all_stopped_services()
|
||||||
|
|
||||||
with allure.step("Objects should be available"):
|
with allure.step("Objects should be available"):
|
||||||
for storage_object in storage_objects:
|
for storage_object in storage_objects:
|
||||||
|
@@ -710,7 +690,7 @@ class TestStorageDataLoss(ClusterTestBase):
|
||||||
|
|
||||||
with allure.step("Stop one node and wait for rebalance connection of s3 gate to storage service"):
|
with allure.step("Stop one node and wait for rebalance connection of s3 gate to storage service"):
|
||||||
current_node = self.cluster.cluster_nodes[0]
|
current_node = self.cluster.cluster_nodes[0]
|
||||||
cluster_state_controller.stop_storage_service(current_node)
|
cluster_state_controller.stop_service_of_type(current_node, StorageNode)
|
||||||
# waiting for rebalance connection of s3 gate to storage service
|
# waiting for rebalance connection of s3 gate to storage service
|
||||||
sleep(60)
|
sleep(60)
|
||||||
|
|
||||||
|
@@ -752,15 +732,13 @@ class TestStorageDataLoss(ClusterTestBase):
|
||||||
piloramas_list_before_removing = self.get_piloramas_list(node_to_check)
|
piloramas_list_before_removing = self.get_piloramas_list(node_to_check)
|
||||||
|
|
||||||
with allure.step("Stop all storage nodes"):
|
with allure.step("Stop all storage nodes"):
|
||||||
for node in self.cluster.cluster_nodes:
|
cluster_state_controller.stop_services_of_type(StorageNode)
|
||||||
with allure.step(f"Stop storage service on node: {node}"):
|
|
||||||
cluster_state_controller.stop_storage_service(node)
|
|
||||||
|
|
||||||
with allure.step("Delete pilorama.db from one node"):
|
with allure.step("Delete pilorama.db from one node"):
|
||||||
node_to_check.delete_pilorama()
|
node_to_check.delete_pilorama()
|
||||||
|
|
||||||
with allure.step("Start all storage nodes"):
|
with allure.step("Start all storage nodes"):
|
||||||
cluster_state_controller.start_stopped_storage_services()
|
cluster_state_controller.start_all_stopped_services()
|
||||||
|
|
||||||
with allure.step("Tick epoch to trigger sync and then wait for 1 minute"):
|
with allure.step("Tick epoch to trigger sync and then wait for 1 minute"):
|
||||||
self.tick_epochs(1)
|
self.tick_epochs(1)
|
||||||
|
|
Loading…
Reference in a new issue