frostfs-testcases/pytest_tests/testsuites/failovers/test_failover_storage.py

import logging
import os
import random
from datetime import datetime
from time import sleep
import allure
import pytest
from frostfs_testlib.cli import FrostfsCli
from frostfs_testlib.cli.netmap_parser import NetmapParser
from frostfs_testlib.resources.cli import FROSTFS_CLI_EXEC
from frostfs_testlib.resources.common import DEFAULT_WALLET_CONFIG, MORPH_BLOCK_TIME
from frostfs_testlib.resources.wellknown_acl import PUBLIC_ACL
from frostfs_testlib.s3 import S3ClientWrapper, VersioningStatus
from frostfs_testlib.steps.cli.container import (
StorageContainer,
StorageContainerInfo,
create_container,
search_nodes_with_container,
)
from frostfs_testlib.steps.cli.object import (
delete_object,
get_object,
put_object,
put_object_to_random_node,
search_object,
)
from frostfs_testlib.steps.node_management import (
check_node_in_map,
check_node_not_in_map,
exclude_node_from_network_map,
include_node_to_network_map,
remove_nodes_from_map_morph,
wait_for_node_to_be_ready,
)
from frostfs_testlib.steps.s3 import s3_helper
from frostfs_testlib.storage.cluster import Cluster, ClusterNode, S3Gate, StorageNode
from frostfs_testlib.storage.controllers import ClusterStateController, ShardsWatcher
from frostfs_testlib.storage.dataclasses.object_size import ObjectSize
from frostfs_testlib.storage.dataclasses.storage_object_info import ModeNode, StorageObjectInfo
from frostfs_testlib.storage.dataclasses.wallet import WalletInfo
from frostfs_testlib.testing.cluster_test_base import ClusterTestBase
from frostfs_testlib.testing.test_control import expect_not_raises
from frostfs_testlib.utils import datetime_utils
from frostfs_testlib.utils.failover_utils import wait_object_replication
from frostfs_testlib.utils.file_keeper import FileKeeper
from frostfs_testlib.utils.file_utils import generate_file, get_file_hash
logger = logging.getLogger("NeoLogger")
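# Module-level list shared with the EmptyMap teardown fixtures below so that
# nodes excluded from the network map can be returned even after a failed test.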
stopped_nodes: list[StorageNode] = []
@pytest.fixture(scope="function")
@allure.title("Provide File Keeper")
def file_keeper():
keeper = FileKeeper()
yield keeper
keeper.restore_files()
@allure.step("Return all stopped hosts")
@pytest.fixture(scope="function", autouse=True)
def after_run_return_all_stopped_hosts(cluster_state_controller: ClusterStateController) -> None:
yield
cluster_state_controller.start_stopped_hosts()
@allure.step("Return all stopped services after test")
@pytest.fixture(scope="function")
def after_run_return_all_stopped_services(cluster_state_controller: ClusterStateController):
yield
cluster_state_controller.start_all_stopped_services()
@pytest.mark.failover
@pytest.mark.failover_storage
class TestFailoverStorage(ClusterTestBase):
@allure.title("Shutdown and start node (stop_mode={stop_mode})")
@pytest.mark.parametrize("stop_mode", ["hard", "soft"])
@pytest.mark.failover_reboot
def test_lose_storage_node_host(
self,
default_wallet,
stop_mode: str,
require_multiple_hosts,
simple_object_size: ObjectSize,
cluster: Cluster,
cluster_state_controller: ClusterStateController,
):
wallet = default_wallet
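        # REP 2 ... SELECT 2 FROM *: the object should end up with exactly 2 replicas
        # somewhere in the cluster, which is what wait_object_replication checks below.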
placement_rule = "REP 2 IN X CBF 2 SELECT 2 FROM * AS X"
source_file_path = generate_file(simple_object_size.value)
stopped_hosts_nodes = []
with allure.step(f"Create container and put object"):
cid = create_container(
wallet,
shell=self.shell,
endpoint=self.cluster.default_rpc_endpoint,
rule=placement_rule,
basic_acl=PUBLIC_ACL,
)
oid = put_object_to_random_node(wallet, source_file_path, cid, shell=self.shell, cluster=self.cluster)
with allure.step(f"Wait for replication and get nodes with object"):
nodes_with_object = wait_object_replication(cid, oid, 2, shell=self.shell, nodes=self.cluster.storage_nodes)
with allure.step(f"Stop 2 nodes with object and wait replication one by one"):
for storage_node in random.sample(nodes_with_object, 2):
stopped_hosts_nodes.append(storage_node)
cluster_node = cluster.node(storage_node)
cluster_state_controller.stop_node_host(cluster_node, stop_mode)
replicated_nodes = wait_object_replication(
cid,
oid,
2,
shell=self.shell,
nodes=list(set(self.cluster.storage_nodes) - {*stopped_hosts_nodes}),
)
with allure.step("Check object data is not corrupted"):
got_file_path = get_object(
wallet, cid, oid, endpoint=replicated_nodes[0].get_rpc_endpoint(), shell=self.shell
)
assert get_file_hash(source_file_path) == get_file_hash(got_file_path)
with allure.step("Return all hosts"):
cluster_state_controller.start_stopped_hosts()
with allure.step("Check object data is not corrupted"):
replicated_nodes = wait_object_replication(cid, oid, 2, shell=self.shell, nodes=self.cluster.storage_nodes)
got_file_path = get_object(
wallet, cid, oid, shell=self.shell, endpoint=replicated_nodes[0].get_rpc_endpoint()
)
assert get_file_hash(source_file_path) == get_file_hash(got_file_path)
@allure.title("Do not ignore unhealthy tree endpoints (s3_client={s3_client})")
def test_unhealthy_tree(
self,
s3_client: S3ClientWrapper,
simple_object_size: ObjectSize,
cluster_state_controller: ClusterStateController,
after_run_return_all_stopped_services,
):
default_node = self.cluster.cluster_nodes[0]
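        # The restart order below brings the S3 gateway up while the local storage node
        # (and therefore its tree endpoint) is still down, so the gateway has to use
        # tree endpoints of other nodes instead of silently ignoring the unhealthy one.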
with allure.step("Turn S3 GW off on default node"):
cluster_state_controller.stop_service_of_type(default_node, S3Gate)
with allure.step("Turn off storage on default node"):
cluster_state_controller.stop_service_of_type(default_node, StorageNode)
with allure.step("Turn on S3 GW on default node"):
cluster_state_controller.start_service_of_type(default_node, S3Gate)
with allure.step("Turn on storage on default node"):
cluster_state_controller.start_service_of_type(default_node, StorageNode)
with allure.step("Create bucket with REP 1 SELECT 1 policy"):
bucket = s3_client.create_bucket(
location_constraint="load-1-1",
)
file_path = generate_file(simple_object_size.value)
file_name = s3_helper.object_key_from_file_path(file_path)
with allure.step("Put object into bucket"):
            s3_client.put_object(bucket, file_path)
s3_helper.check_objects_in_bucket(s3_client, bucket, expected_objects=[file_name])
with allure.step("Turn off all storage nodes except default"):
for node in self.cluster.cluster_nodes[1:]:
with allure.step(f"Stop storage service on node: {node}"):
cluster_state_controller.stop_service_of_type(node, StorageNode)
with allure.step("Check that object is available"):
s3_helper.check_objects_in_bucket(s3_client, bucket, expected_objects=[file_name])
with allure.step("Start storage nodes"):
cluster_state_controller.start_all_stopped_services()
@pytest.mark.failover
@pytest.mark.failover_empty_map
class TestEmptyMap(ClusterTestBase):
"""
    A set of tests that empty the network map and verify that objects can still be read afterwards
"""
@allure.step("Teardown after EmptyMap offline test")
@pytest.fixture()
def empty_map_offline_teardown(self):
yield
with allure.step("Return all storage nodes to network map"):
for node in stopped_nodes:
include_node_to_network_map(node, node, shell=self.shell, cluster=self.cluster)
stopped_nodes.remove(node)
    @pytest.mark.failover_empty_map_offline
@allure.title("Empty network map via offline all storage nodes (s3_client={s3_client})")
def test_offline_all_storage_nodes(
self,
s3_client: S3ClientWrapper,
bucket: str,
simple_object_size: ObjectSize,
empty_map_offline_teardown,
):
"""
        The test empties the network map (sets OFFLINE status on all storage nodes),
        then returns all nodes to the map and checks that the object can still be read through S3.
Steps:
1. Check that bucket is empty
2: PUT object into bucket
3: Check that object exists in bucket
4: Exclude all storage nodes from network map (set status OFFLINE)
5: Return all storage nodes to network map
6: Check that we can read object from #2
Args:
bucket: bucket which contains tested object
simple_object_size: size of object
"""
file_path = generate_file(simple_object_size.value)
file_name = s3_helper.object_key_from_file_path(file_path)
bucket_objects = [file_name]
objects_list = s3_client.list_objects(bucket)
assert not objects_list, f"Expected empty bucket, got {objects_list}"
with allure.step("Put object into bucket"):
s3_client.put_object(bucket, file_path)
with allure.step("Check that object exists in bucket"):
s3_helper.check_objects_in_bucket(s3_client, bucket, bucket_objects)
storage_nodes = self.cluster.storage_nodes
with allure.step("Exclude all storage nodes from network map"):
for node in storage_nodes:
stopped_nodes.append(node)
exclude_node_from_network_map(node, node, shell=self.shell, cluster=self.cluster)
with allure.step("Return all storage nodes to network map"):
for node in storage_nodes:
include_node_to_network_map(node, node, shell=self.shell, cluster=self.cluster)
stopped_nodes.remove(node)
with allure.step("Check that we can read object"):
s3_helper.check_objects_in_bucket(s3_client, bucket, bucket_objects)
@allure.step("Teardown after EmptyMap stop service test")
@pytest.fixture()
def empty_map_stop_service_teardown(self, cluster_state_controller: ClusterStateController):
yield
with allure.step("Return all storage nodes to network map"):
cluster_state_controller.start_all_stopped_services()
for node in stopped_nodes:
check_node_in_map(node, shell=self.shell, alive_node=node)
@pytest.mark.failover_empty_map_stop_service
@allure.title("Empty network map via stop all storage services (s3_client={s3_client})")
def test_stop_all_storage_nodes(
self,
s3_client: S3ClientWrapper,
bucket: str,
simple_object_size: ObjectSize,
empty_map_stop_service_teardown,
cluster_state_controller: ClusterStateController,
):
"""
        The test empties the network map (stops the storage service on all nodes,
        then uses 'frostfs-adm morph delete-nodes' to remove them from the map),
        then starts all services again and checks that the object can still be read through S3.
Steps:
1. Check that bucket is empty
2: PUT object into bucket
3: Check that object exists in bucket
4: Exclude all storage nodes from network map (stop storage service
and manual exclude from map)
5: Return all storage nodes to network map
6: Check that we can read object from #2
Args:
bucket: bucket which contains tested object
simple_object_size: size of object
"""
file_path = generate_file(simple_object_size.value)
file_name = s3_helper.object_key_from_file_path(file_path)
bucket_objects = [file_name]
objects_list = s3_client.list_objects(bucket)
assert not objects_list, f"Expected empty bucket, got {objects_list}"
with allure.step("Put object into bucket"):
s3_client.put_object(bucket, file_path)
with allure.step("Check that object exists in bucket"):
s3_helper.check_objects_in_bucket(s3_client, bucket, bucket_objects)
with allure.step("Stop all storage nodes"):
cluster_state_controller.stop_services_of_type(StorageNode)
with allure.step("Remove all nodes from network map"):
remove_nodes_from_map_morph(
shell=self.shell, cluster=self.cluster, remove_nodes=self.cluster.services(StorageNode)
)
with allure.step("Return all storage nodes to network map"):
self.return_nodes_after_stop_with_check_empty_map(cluster_state_controller)
with allure.step("Check that object exists in bucket"):
s3_helper.check_objects_in_bucket(s3_client, bucket, bucket_objects)
@allure.step("Return all nodes to cluster with check empty map first")
def return_nodes_after_stop_with_check_empty_map(self, cluster_state_controller: ClusterStateController) -> None:
first_node = self.cluster.cluster_nodes[0].service(StorageNode)
with allure.step("Start first node and check network map"):
cluster_state_controller.start_service_of_type(self.cluster.cluster_nodes[0], StorageNode)
wait_for_node_to_be_ready(first_node)
for check_node in self.cluster.storage_nodes:
check_node_not_in_map(check_node, shell=self.shell, alive_node=first_node)
for node in self.cluster.cluster_nodes[1:]:
storage_node = node.service(StorageNode)
cluster_state_controller.start_service_of_type(node, StorageNode)
wait_for_node_to_be_ready(storage_node)
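            # Wait one morph block so the node's bootstrap request is processed,
            # then tick an epoch so the node reappears in the network map.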
sleep(datetime_utils.parse_time(MORPH_BLOCK_TIME))
self.tick_epochs(1)
check_node_in_map(storage_node, shell=self.shell, alive_node=first_node)
@allure.title("Object loss from fstree/blobovnicza (versioning=enabled, s3_client={s3_client})")
def test_s3_fstree_blobovnicza_loss_versioning_on(
self,
s3_client: S3ClientWrapper,
simple_object_size: ObjectSize,
cluster_state_controller: ClusterStateController,
):
bucket = s3_client.create_bucket()
s3_helper.set_bucket_versioning(s3_client, bucket, VersioningStatus.ENABLED)
file_path = generate_file(simple_object_size.value)
file_name = s3_helper.object_key_from_file_path(file_path)
object_versions = []
with allure.step("Put object into one bucket"):
put_object = s3_client.put_object(bucket, file_path)
s3_helper.check_objects_in_bucket(s3_client, bucket, expected_objects=[file_name])
object_versions.append(put_object)
with allure.step("Stop all storage nodes"):
cluster_state_controller.stop_services_of_type(StorageNode)
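        # blobovnicza keeps small objects in an embedded database, fstree keeps larger
        # ones as plain files; removing both simulates full loss of object payloads.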
with allure.step("Delete blobovnicza and fstree from all nodes"):
for node in self.cluster.storage_nodes:
node.delete_blobovnicza()
node.delete_fstree()
with allure.step("Start all storage nodes"):
cluster_state_controller.start_all_stopped_services()
# need to get Delete Marker first
with allure.step("Delete the object from the bucket"):
delete_object = s3_client.delete_object(bucket, file_name)
object_versions.append(delete_object["VersionId"])
# and now delete all versions of object (including Delete Markers)
with allure.step("Delete all versions of the object from the bucket"):
for version in object_versions:
delete_object = s3_client.delete_object(bucket, file_name, version_id=version)
with allure.step("Delete bucket"):
s3_client.delete_bucket(bucket)
@allure.title("Object loss from fstree/blobovnicza (versioning=disabled, s3_client={s3_client})")
def test_s3_fstree_blobovnicza_loss_versioning_off(
self,
s3_client: S3ClientWrapper,
simple_object_size: ObjectSize,
cluster_state_controller: ClusterStateController,
):
bucket = s3_client.create_bucket()
file_path = generate_file(simple_object_size.value)
file_name = s3_helper.object_key_from_file_path(file_path)
with allure.step("Put object into one bucket"):
s3_client.put_object(bucket, file_path)
s3_helper.check_objects_in_bucket(s3_client, bucket, expected_objects=[file_name])
with allure.step("Stop all storage nodes"):
cluster_state_controller.stop_services_of_type(StorageNode)
with allure.step("Delete blobovnicza and fstree from all nodes"):
for node in self.cluster.storage_nodes:
node.delete_blobovnicza()
node.delete_fstree()
with allure.step("Start all storage nodes"):
cluster_state_controller.start_all_stopped_services()
with allure.step("Delete the object from the bucket"):
s3_client.delete_object(bucket, file_name)
with allure.step("Delete bucket"):
s3_client.delete_bucket(bucket)
@pytest.mark.skip(reason="Need to increase cache lifetime")
@pytest.mark.parametrize(
# versioning should NOT be VersioningStatus.SUSPENDED, it needs to be undefined
"versioning_status",
[VersioningStatus.ENABLED, VersioningStatus.UNDEFINED],
)
@allure.title(
"After Pilorama.db loss on all nodes list objects should return nothing in second listing (versioning_status={versioning_status}, s3_client={s3_client})"
)
def test_s3_pilorama_loss(
self,
s3_client: S3ClientWrapper,
simple_object_size: ObjectSize,
versioning_status: VersioningStatus,
cluster_state_controller: ClusterStateController,
):
bucket = s3_client.create_bucket()
s3_helper.set_bucket_versioning(s3_client, bucket, versioning_status)
file_path = generate_file(simple_object_size.value)
file_name = s3_helper.object_key_from_file_path(file_path)
with allure.step("Put object into one bucket"):
s3_client.put_object(bucket, file_path)
s3_helper.check_objects_in_bucket(s3_client, bucket, expected_objects=[file_name])
with allure.step("Stop all storage nodes"):
cluster_state_controller.stop_services_of_type(StorageNode)
with allure.step("Delete pilorama.db from all nodes"):
for node in self.cluster.storage_nodes:
node.delete_pilorama()
with allure.step("Start all storage nodes"):
cluster_state_controller.start_all_stopped_services()
with allure.step("Check list objects first time"):
objects_list = s3_client.list_objects(bucket)
assert objects_list, f"Expected not empty bucket"
with allure.step("Check list objects second time"):
objects_list = s3_client.list_objects(bucket)
assert not objects_list, f"Expected empty bucket, got {objects_list}"
with allure.step("Delete bucket"):
s3_client.delete_bucket(bucket)
@pytest.mark.failover
@pytest.mark.failover_data_loss
class TestStorageDataLoss(ClusterTestBase):
@allure.step("Get list of all piloramas on node")
def get_piloramas_list(self, node: StorageNode) -> list:
data_directory_path = node.get_data_directory()
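        # Collect pilorama (tree service) database files from every shard metadata directory.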
cmd = f"sudo ls -1 {data_directory_path}/meta*/pilorama*"
shell = node.host.get_shell()
stdout = shell.exec(cmd).stdout
        piloramas = stdout.strip().split("\n")
return piloramas
@allure.title(
"After metabase loss on all nodes operations on objects and buckets should be still available via S3 (s3_client={s3_client})"
)
@pytest.mark.metabase_loss
def test_metabase_loss(
self,
s3_client: S3ClientWrapper,
simple_object_size: ObjectSize,
complex_object_size: ObjectSize,
cluster_state_controller: ClusterStateController,
after_run_return_all_stopped_services: str,
file_keeper: FileKeeper,
):
allure.dynamic.description(after_run_return_all_stopped_services)
with allure.step("Create bucket"):
bucket = s3_client.create_bucket()
with allure.step("Put objects into bucket"):
simple_object_path = generate_file(simple_object_size.value)
simple_object_key = s3_helper.object_key_from_file_path(simple_object_path)
complex_object_path = generate_file(complex_object_size.value)
complex_object_key = s3_helper.object_key_from_file_path(complex_object_path)
s3_client.put_object(bucket, simple_object_path)
s3_client.put_object(bucket, complex_object_path)
with allure.step("Check objects are in bucket"):
s3_helper.check_objects_in_bucket(
s3_client, bucket, expected_objects=[simple_object_key, complex_object_key]
)
with allure.step("Stop storage services on all nodes"):
cluster_state_controller.stop_services_of_type(StorageNode)
with allure.step("Delete metabase from all nodes"):
for node in cluster_state_controller.cluster.storage_nodes:
node.delete_metabase()
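        # With resync_metabase enabled, each shard rebuilds its metabase from blobstor
        # contents on startup, which should compensate for the deletion above.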
with allure.step("Enable resync_metabase option for storage services"):
for storage_node in cluster_state_controller.cluster.storage_nodes:
with allure.step(f"Enable resync_metabase option for {storage_node}"):
config_file_path, config = storage_node.get_shards_config()
if not config["storage"]["shard"]["default"]["resync_metabase"]:
file_keeper.add(storage_node, config_file_path)
config["storage"]["shard"]["default"]["resync_metabase"] = True
storage_node.save_config(config, config_file_path)
with allure.step("Start storage services on all nodes"):
cluster_state_controller.start_all_stopped_services()
with allure.step("Wait for tree rebalance"):
# TODO: Use product metric when we have proper ones for this check
sleep(30)
with allure.step("Delete objects from bucket"):
with allure.step("Delete simple object from bucket"):
with expect_not_raises():
s3_client.delete_object(bucket, simple_object_key)
with allure.step("Delete complex object from bucket"):
with expect_not_raises():
s3_client.delete_object(bucket, complex_object_key)
with allure.step("Delete bucket"):
with expect_not_raises():
s3_client.delete_bucket(bucket)
@allure.title("Write cache loss on one node should not affect shards and should not produce errors in log")
@pytest.mark.write_cache_loss
def test_write_cache_loss_on_one_node(
self,
node_under_test: ClusterNode,
simple_object_size: ObjectSize,
cluster_state_controller: ClusterStateController,
shards_watcher: ShardsWatcher,
default_wallet: str,
test_start_time: datetime,
after_run_return_all_stopped_services: str,
):
exception_messages = []
allure.dynamic.description(after_run_return_all_stopped_services)
with allure.step(f"Create container on node {node_under_test}"):
locode = node_under_test.storage_node.get_un_locode()
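        # Pin the container to the node under test by filtering on its UN-LOCODE attribute.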
placement_rule = f"""REP 1 IN X
CBF 1
SELECT 1 FROM C AS X
FILTER 'UN-LOCODE' EQ '{locode}' AS C"""
cid = create_container(
default_wallet,
self.shell,
node_under_test.storage_node.get_rpc_endpoint(),
rule=placement_rule,
)
container = StorageContainer(
StorageContainerInfo(cid, WalletInfo(default_wallet)),
self.shell,
cluster_state_controller.cluster,
)
with allure.step(f"Put couple objects to container on node {node_under_test}"):
storage_objects: list[StorageObjectInfo] = []
for _ in range(5):
storage_object = container.generate_object(
simple_object_size.value,
endpoint=node_under_test.storage_node.get_rpc_endpoint(),
)
storage_objects.append(storage_object)
with allure.step("Take shards snapshot"):
shards_watcher.take_shards_snapshot()
with allure.step(f"Stop storage service on node {node_under_test}"):
cluster_state_controller.stop_service_of_type(node_under_test, StorageNode)
with allure.step(f"Delete write cache from node {node_under_test}"):
node_under_test.storage_node.delete_write_cache()
with allure.step(f"Start storage service on node {node_under_test}"):
cluster_state_controller.start_all_stopped_services()
with allure.step("Objects should be available"):
for storage_object in storage_objects:
get_object(
storage_object.wallet_file_path,
container.get_id(),
storage_object.oid,
self.shell,
node_under_test.storage_node.get_rpc_endpoint(),
)
with allure.step("No shards should have new errors"):
shards_watcher.take_shards_snapshot()
shards_with_errors = shards_watcher.get_shards_with_new_errors()
if shards_with_errors:
exception_messages.append(f"Shards have new errors: {shards_with_errors}")
with allure.step("No shards should have degraded status"):
snapshot = shards_watcher.get_shards_snapshot()
for shard in snapshot:
status = snapshot[shard]["mode"]
if status != "read-write":
exception_messages.append(f"Shard {shard} changed status to {status}")
with allure.step("No related errors should be in log"):
if node_under_test.host.is_message_in_logs(
message_regex=r"\Wno such file or directory\W", since=test_start_time
):
exception_messages.append(f"Node {node_under_test} have shard errors in logs")
with allure.step("Pass test if no errors found"):
assert not exception_messages, "\n".join(exception_messages)
@allure.title(
"Loss of one node should trigger use of tree and storage service in another node (s3_client={s3_client})"
)
def test_s3_one_endpoint_loss(
self,
bucket,
s3_client: S3ClientWrapper,
simple_object_size: ObjectSize,
after_run_return_all_stopped_services,
cluster_state_controller: ClusterStateController,
):
# TODO: need to check that s3 gate is connected to localhost (such metric will be supported in 1.3)
with allure.step("Stop one node and wait for rebalance connection of s3 gate to storage service"):
current_node = self.cluster.cluster_nodes[0]
cluster_state_controller.stop_service_of_type(current_node, StorageNode)
# waiting for rebalance connection of s3 gate to storage service
sleep(60)
file_path = generate_file(simple_object_size.value)
file_name = s3_helper.object_key_from_file_path(file_path)
with allure.step("Put object into one bucket"):
            s3_client.put_object(bucket, file_path)
s3_helper.check_objects_in_bucket(s3_client, bucket, expected_objects=[file_name])
@allure.title("After Pilorama.db loss on one node object is retrievable (s3_client={s3_client})")
def test_s3_one_pilorama_loss(
self,
s3_client: S3ClientWrapper,
simple_object_size: ObjectSize,
cluster_state_controller: ClusterStateController,
):
bucket = s3_client.create_bucket(
location_constraint="load-1-4",
grant_read="uri=http://acs.amazonaws.com/groups/global/AllUsers",
)
s3_helper.set_bucket_versioning(s3_client, bucket, VersioningStatus.ENABLED)
with allure.step("Check bucket versioning"):
bucket_versioning = s3_client.get_bucket_versioning_status(bucket)
assert bucket_versioning == "Enabled", "Bucket should have enabled versioning"
file_path = generate_file(simple_object_size.value)
file_name = s3_helper.object_key_from_file_path(file_path)
object_versions = []
with allure.step("Put object into one bucket"):
put_object = s3_client.put_object(bucket, file_path)
s3_helper.check_objects_in_bucket(s3_client, bucket, expected_objects=[file_name])
object_versions.append(put_object)
node_to_check = self.cluster.storage_nodes[0]
piloramas_list_before_removing = {}
with allure.step("Get list of all pilorama.db"):
piloramas_list_before_removing = self.get_piloramas_list(node_to_check)
with allure.step("Stop all storage nodes"):
cluster_state_controller.stop_services_of_type(StorageNode)
with allure.step("Delete pilorama.db from one node"):
node_to_check.delete_pilorama()
with allure.step("Start all storage nodes"):
cluster_state_controller.start_all_stopped_services()
with allure.step("Tick epoch to trigger sync and then wait for 1 minute"):
self.tick_epochs(1)
sleep(120)
        piloramas_list_after_removing = {}
        with allure.step("Get list of all pilorama.db after sync"):
            piloramas_list_after_removing = self.get_piloramas_list(node_to_check)
            assert piloramas_list_after_removing == piloramas_list_before_removing, "List of pilorama.db is different"
with allure.step("Check bucket versioning"):
bucket_versioning = s3_client.get_bucket_versioning_status(bucket)
assert bucket_versioning == "Enabled", "Bucket should have enabled versioning"
with allure.step("Check list objects"):
objects_list = s3_client.list_objects(bucket)
assert objects_list, f"Expected not empty bucket"
with allure.step("Delete the object from the bucket"):
delete_object = s3_client.delete_object(bucket, file_name)
assert "DeleteMarker" in delete_object.keys(), "Delete markers not found"
with allure.step("Check list objects"):
objects_list = s3_client.list_objects_versions(bucket)
assert objects_list, f"Expected not empty bucket"
object_versions.append(delete_object["VersionId"])
# and now delete all versions of object (including Delete Markers)
with allure.step("Delete all versions of the object from the bucket"):
for version in object_versions:
delete_object = s3_client.delete_object(bucket, file_name, version_id=version)
with allure.step("Check list objects"):
objects_list = s3_client.list_objects_versions(bucket)
assert not objects_list, f"Expected empty bucket"
with allure.step("Delete bucket"):
s3_client.delete_bucket(bucket)
@pytest.mark.maintenance
class TestMaintenanceMode(ClusterTestBase):
change_node: ClusterNode = None
@pytest.fixture()
@allure.title("Init Frostfs CLI remote")
def frostfs_cli_remote(self, node_under_test: ClusterNode) -> FrostfsCli:
host = node_under_test.host
service_config = host.get_service_config(node_under_test.storage_node.name)
wallet_path = service_config.attributes["wallet_path"]
wallet_password = service_config.attributes["wallet_password"]
shell = host.get_shell()
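        # Write a temporary wallet config on the remote host so the CLI there can
        # authenticate control requests with the storage node's wallet.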
wallet_config_path = f"/tmp/{node_under_test.storage_node.name}-config.yaml"
wallet_config = f'wallet: {wallet_path}\npassword: "{wallet_password}"'
shell.exec(f"echo '{wallet_config}' > {wallet_config_path}")
cli = FrostfsCli(shell=shell, frostfs_cli_exec_path=FROSTFS_CLI_EXEC, config_file=wallet_config_path)
return cli
@pytest.fixture()
@allure.title("Init Frostfs CLI remote")
def frostfs_cli(self) -> FrostfsCli:
cli = FrostfsCli(shell=self.shell, frostfs_cli_exec_path=FROSTFS_CLI_EXEC, config_file=DEFAULT_WALLET_CONFIG)
return cli
@pytest.fixture()
def restore_online_mode_node(self, cluster_state_controller: ClusterStateController, default_wallet: str):
yield
cluster_state_controller.set_mode_node(cluster_node=self.change_node, wallet=default_wallet, status="online")
self.tick_epoch(wait_block=2)
def basic_operations(self, wallet, cid, oid, shell, endpoint, matchs, object_size):
file_path = generate_file(object_size)
default_kw = {"wallet": wallet, "cid": cid, "shell": shell, "endpoint": endpoint}
operations = {
get_object: {"oid": oid},
search_object: {},
delete_object: {"oid": oid},
put_object: {"path": file_path},
}
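        # Each operation is paired by index with its expected response in `matchs`;
        # a successful search ("Found ...") is executed without expecting an error.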
        for index, (operation, kw) in enumerate(operations.items()):
with allure.step(f"Run {operation.__name__} object, waiting response - {matchs[index]}"):
default_kw.update(kw)
if operation == search_object and "Found" in matchs[index]:
operation(**default_kw)
continue
with pytest.raises(RuntimeError, match=matchs[index]):
operation(**default_kw)
os.remove(file_path)
@allure.title("Test of basic node operations in maintenance mode")
def test_maintenance_mode(
self,
default_wallet: str,
simple_object_size: ObjectSize,
cluster_state_controller: ClusterStateController,
restore_online_mode_node: None,
):
with allure.step("Create container and create\put object"):
cid = create_container(
wallet=default_wallet,
shell=self.shell,
endpoint=self.cluster.default_rpc_endpoint,
rule="REP 1 CBF 1",
)
node = search_nodes_with_container(
wallet=default_wallet,
cid=cid,
shell=self.shell,
endpoint=self.cluster.default_rpc_endpoint,
cluster=self.cluster,
)
self.change_node = node[0]
file_path = generate_file(simple_object_size.value)
oid = put_object(
wallet=default_wallet,
path=file_path,
cid=cid,
shell=self.shell,
endpoint=self.change_node.storage_node.get_rpc_endpoint(),
)
with allure.step("Enable MaintenanceModeAllowed:"):
cluster_state_controller.set_mode_node(
cluster_node=self.change_node, wallet=default_wallet, status="maintenance"
)
other_nodes = list(set(self.cluster.cluster_nodes) - set(node))
node_and_match = {
self.change_node: ["node is under maintenance"] * 4,
other_nodes[0]: [
"object not found",
"Found 0 objects",
"object not found",
"node is under maintenance",
],
}
with allure.step("Run basic operations"):
for cluster_node, matchs in node_and_match.items():
self.basic_operations(
wallet=default_wallet,
cid=cid,
oid=oid,
shell=self.shell,
endpoint=cluster_node.storage_node.get_rpc_endpoint(),
matchs=matchs,
object_size=simple_object_size.value,
)
@pytest.mark.sanity
@allure.title("MAINTENANCE and OFFLINE mode transitions")
def test_mode_transitions(
self,
cluster_state_controller: ClusterStateController,
node_under_test: ClusterNode,
default_wallet: str,
frostfs_cli: FrostfsCli,
restore_online_mode_node: None,
):
self.change_node = node_under_test
cluster_nodes = list(set(self.cluster.cluster_nodes) - {node_under_test})
with allure.step("Tick epoch"):
self.tick_epochs(epochs_to_tick=2, alive_node=cluster_nodes[0].storage_node, wait_block=2)
with allure.step("Set node mode to offline"):
cluster_state_controller.set_mode_node(
cluster_node=node_under_test,
wallet=default_wallet,
status=ModeNode.OFFLINE.value,
)
with allure.step("Tick epoch to update the network map"):
self.tick_epochs(epochs_to_tick=2, alive_node=cluster_nodes[0].storage_node, wait_block=2)
with ((allure.step("Check node mode = offline, after update the network map"))):
netmap = frostfs_cli.netmap.snapshot(
rpc_endpoint=cluster_nodes[0].storage_node.get_rpc_endpoint(), wallet=default_wallet
).stdout
netmap = NetmapParser.snapshot_all_nodes(netmap)
assert node_under_test.host_ip not in [
netmap_node.node for netmap_node in netmap
], f"Node {node_under_test.host_ip} not in state offline"
with allure.step("Restart storage service"):
cluster_state_controller.stop_storage_service(node_under_test)
cluster_state_controller.start_storage_service(node_under_test)
with allure.step("Tick epoch after restart storage service and set mode to offline"):
self.tick_epochs(epochs_to_tick=2, alive_node=cluster_nodes[0].storage_node, wait_block=2)
with (allure.step("Check node mode = online, after restart storage service")):
netmap = frostfs_cli.netmap.snapshot(
rpc_endpoint=cluster_nodes[0].storage_node.get_rpc_endpoint(), wallet=default_wallet
).stdout
node_state = NetmapParser.snapshot_one_node(netmap, node_under_test).node_status
assert node_state == ModeNode.ONLINE.value.upper(), (
f"Node actual state - {node_state}, " f"expect - {ModeNode.ONLINE.value}"
)
with allure.step("Tick epoch"):
self.tick_epochs(epochs_to_tick=2, alive_node=cluster_nodes[0].storage_node, wait_block=2)
with allure.step("Set node mode to maintenance"):
cluster_state_controller.set_mode_node(
cluster_node=node_under_test, wallet=default_wallet, status=ModeNode.MAINTENANCE.value
)
with allure.step("Restart storage service"):
cluster_state_controller.stop_storage_service(node_under_test)
cluster_state_controller.start_storage_service(node_under_test)
with allure.step("Tick epoch after restart storage service"):
self.tick_epochs(epochs_to_tick=2, alive_node=cluster_nodes[0].storage_node, wait_block=2)
with allure.step("Check node mode = maintenance, after restart storage service and tick epoch"):
netmap = frostfs_cli.netmap.snapshot(
rpc_endpoint=cluster_nodes[0].storage_node.get_rpc_endpoint(), wallet=default_wallet
).stdout
node_state = NetmapParser.snapshot_one_node(netmap, node_under_test).node_status
assert node_state == ModeNode.MAINTENANCE.value.upper(), (
f"Node actual state - {node_state}, " f"expect - {ModeNode.MAINTENANCE.value}"
)
with allure.step("Tick epoch"):
self.tick_epochs(epochs_to_tick=2, alive_node=cluster_nodes[0].storage_node, wait_block=2)
with allure.step("Set node mode to offline"):
netmap = frostfs_cli.netmap.snapshot(
rpc_endpoint=cluster_nodes[0].storage_node.get_rpc_endpoint(), wallet=default_wallet
).stdout
node_state = NetmapParser.snapshot_one_node(netmap, node_under_test).node_status
assert node_state == ModeNode.OFFLINE.value.upper(), (
f"Node actual state - {node_state}, " f"expect - {ModeNode.OFFLINE.value}"
)
with allure.step("Stop storage service"):
cluster_state_controller.stop_storage_service(node_under_test)
with allure.step("Tick epoch"):
self.tick_epochs(epochs_to_tick=2, alive_node=cluster_nodes[0].storage_node, wait_block=2)
with allure.step("Start storage service"):
cluster_state_controller.start_storage_service(node_under_test)
with allure.step("Check node mode = offline, after tick epoch and start storage service"):
netmap = frostfs_cli.netmap.snapshot(
rpc_endpoint=cluster_nodes[0].storage_node.get_rpc_endpoint(), wallet=default_wallet
).stdout
node_state = NetmapParser.snapshot_one_node(netmap, node_under_test).node_status
assert node_state == ModeNode.OFFLINE.value.upper(), (
f"Node actual state - {node_state}, " f"expect - {ModeNode.OFFLINE.value}"
)
with allure.step("Tick epoch"):
self.tick_epochs(epochs_to_tick=2, alive_node=cluster_nodes[0].storage_node, wait_block=2)
with allure.step("Check node mode = online, after start storage service and tick epoch"):
netmap = frostfs_cli.netmap.snapshot(
rpc_endpoint=cluster_nodes[0].storage_node.get_rpc_endpoint(), wallet=default_wallet
).stdout
node_state = NetmapParser.snapshot_one_node(netmap, node_under_test).node_status
assert node_state == ModeNode.ONLINE.value.upper(), (
f"Node actual state - {node_state}, " f"expect - {ModeNode.ONLINE.value}"
)
with allure.step("Tick epoch"):
self.tick_epochs(epochs_to_tick=2, alive_node=cluster_nodes[0].storage_node, wait_block=2)
with allure.step("Set node mode to maintenance"):
cluster_state_controller.set_mode_node(
cluster_node=node_under_test, wallet=default_wallet, status=ModeNode.MAINTENANCE.value
)
with allure.step("Stop storage service"):
cluster_state_controller.stop_storage_service(node_under_test)
with allure.step("Tick epoch"):
self.tick_epochs(epochs_to_tick=2, alive_node=cluster_nodes[0].storage_node, wait_block=2)
with allure.step("Start storage service"):
cluster_state_controller.start_storage_service(node_under_test)
with allure.step("Check node mode = maintenance"):
netmap = frostfs_cli.netmap.snapshot(
rpc_endpoint=cluster_nodes[0].storage_node.get_rpc_endpoint(), wallet=default_wallet
).stdout
node_state = NetmapParser.snapshot_one_node(netmap, node_under_test).node_status
assert (
node_state == ModeNode.MAINTENANCE.value.upper()
), f"Node actual state - {node_state}, expect - {ModeNode.MAINTENANCE.value}"
with allure.step("Tick epoch"):
self.tick_epochs(epochs_to_tick=2, alive_node=cluster_nodes[0].storage_node, wait_block=2)
with allure.step("Check node mode = maintenance"):
netmap = frostfs_cli.netmap.snapshot(
rpc_endpoint=cluster_nodes[0].storage_node.get_rpc_endpoint(), wallet=default_wallet
).stdout
node_state = NetmapParser.snapshot_one_node(netmap, node_under_test).node_status
assert node_state == ModeNode.MAINTENANCE.value.upper(), (
f"Node actual state - {node_state}, " f"expect - {ModeNode.MAINTENANCE.value}"
)
@allure.title("A node cannot go into maintenance if maintenance is prohibited globally in the network")
def test_maintenance_globally_forbidden(
self,
cluster_state_controller: ClusterStateController,
node_under_test: ClusterNode,
frostfs_cli_remote: FrostfsCli,
frostfs_cli: FrostfsCli,
default_wallet: str,
restore_online_mode_node: None,
):
self.change_node = node_under_test
control_endpoint = node_under_test.service(StorageNode).get_control_endpoint()
with allure.step("Set MaintenanceModeAllowed = false"):
cluster_state_controller.set_maintenance_mode_allowed("false", node_under_test)
with allure.step("Set status node - maintenance"):
with pytest.raises(RuntimeError, match="maintenance mode is not allowed by the network"):
frostfs_cli_remote.control.set_status(endpoint=control_endpoint, status="maintenance")
with allure.step("Set MaintenanceModeAllowed = true"):
cluster_state_controller.set_maintenance_mode_allowed("true", node_under_test)
with allure.step("Set status node - maintenance"):
output = frostfs_cli_remote.control.set_status(endpoint=control_endpoint, status="maintenance")
assert "update request successfully sent" in output.stdout, f"Response = {output}"
with allure.step("Tick epoch"):
self.tick_epoch(wait_block=2)
with allure.step("Check state node = maintenance "):
netmap_node = NetmapParser.snapshot_one_node(
frostfs_cli.netmap.snapshot(
rpc_endpoint=node_under_test.storage_node.get_rpc_endpoint(), wallet=default_wallet
).stdout,
node_under_test,
)
assert (
netmap_node.node_status == ModeNode.MAINTENANCE.value.upper()
), f"Node actual state - {netmap_node.node_status}, expect - {ModeNode.MAINTENANCE.value}"
with allure.step("Set status node - online "):
frostfs_cli_remote.control.set_status(endpoint=control_endpoint, status="online")
with allure.step("Tick epoch"):
self.tick_epoch()
with allure.step("Check state node: online"):
netmap_node = NetmapParser.snapshot_one_node(
frostfs_cli.netmap.snapshot(
rpc_endpoint=node_under_test.storage_node.get_rpc_endpoint(), wallet=default_wallet
).stdout,
node_under_test,
)
assert (
netmap_node.node_status == ModeNode.ONLINE.value.upper()
), f"Node actual state - {netmap_node.node_status}, expect - {ModeNode.ONLINE.value}"