Add tests with empty map #22

Merged
anikeev-yadro merged 1 commit from anikeev-yadro/frostfs-testcases:anikeev/add_dataloss_test into master 2023-03-20 10:02:23 +00:00
4 changed files with 279 additions and 20 deletions

View file

@@ -1,4 +1,5 @@
 import random
+import pathlib
 import re
 from dataclasses import dataclass
 from typing import Any
@@ -110,7 +111,14 @@ class InnerRingNode(NodeBase):
         since frostfs network will still treat it as "node"
         """
         pass

+    def get_netmap_cleaner_threshold(self) -> str:
+        config_file = self.get_remote_config_path()
+        contents = self.host.get_shell().exec(f"cat {config_file}").stdout
+        config = yaml.safe_load(contents)
+        value = config["netmap_cleaner"]["threshold"]
+        return value
+
 class S3Gate(NodeBase):
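
A note on the new helper: `get_netmap_cleaner_threshold` reads `netmap_cleaner.threshold` from the inner ring node's YAML config, i.e. (as I read the IR config) the number of epochs an unavailable node may miss before the netmap cleaner drops it from the map. A minimal usage sketch, hypothetical and not part of this PR; the `ir_nodes` attribute name and the skip condition are assumptions:

```python
import pytest

def test_needs_lenient_cleaner(cluster):  # hypothetical test using the helper
    ir_node = cluster.ir_nodes[0]  # attribute name assumed
    threshold = int(ir_node.get_netmap_cleaner_threshold())
    if threshold < 2:
        pytest.skip(f"netmap_cleaner.threshold={threshold} is too low for this scenario")
```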

View file

@@ -2,17 +2,24 @@ import logging
 import random
 import re
 import time
+from time import sleep
 from dataclasses import dataclass
 from typing import Optional

 import allure
-from frostfs_testlib.cli import FrostfsCli
+from frostfs_testlib.cli import FrostfsAdm, FrostfsCli
 from frostfs_testlib.shell import Shell
 from frostfs_testlib.utils import datetime_utils

 from pytest_tests.helpers.cluster import Cluster, StorageNode
 from pytest_tests.helpers.epoch import tick_epoch
-from pytest_tests.resources.common import FROSTFS_CLI_EXEC, MORPH_BLOCK_TIME
+from pytest_tests.resources.common import (
+    FROSTFS_ADM_CONFIG_PATH,
+    FROSTFS_ADM_EXEC,
+    FROSTFS_CLI_EXEC,
+    MORPH_BLOCK_TIME,
+)

 logger = logging.getLogger("NeoLogger")
@@ -211,6 +218,62 @@ def check_node_in_map(
         node_netmap_key in snapshot
     ), f"Expected node with key {node_netmap_key} to be in network map"

+
+@allure.step("Check node {node} NOT in network map")
+def check_node_not_in_map(
+    node: StorageNode, shell: Shell, alive_node: Optional[StorageNode] = None
+) -> None:
+    alive_node = alive_node or node
+
+    node_netmap_key = node.get_wallet_public_key()
+    logger.info(f"Node ({node.label}) netmap key: {node_netmap_key}")
+
+    snapshot = get_netmap_snapshot(alive_node, shell)
+    assert (
+        node_netmap_key not in snapshot
+    ), f"Expected node with key {node_netmap_key} to be NOT in network map"
+
+
+@allure.step("Wait for node {node} to be ready")
+def wait_for_node_to_be_ready(node: StorageNode) -> None:
+    timeout, attempts = 30, 6
+    for _ in range(attempts):
+        try:
+            health_check = storage_node_healthcheck(node)
+            if health_check.health_status == "READY":
+                return
+        except Exception as err:
+            logger.warning(f"Node {node} is not ready:\n{err}")
+        sleep(timeout)
+    raise AssertionError(
+        f"Node {node} hasn't gone to the READY state after {timeout * attempts} seconds"
+    )
@allure.step("Remove nodes from network map trough cli-adm morph command")
def remove_nodes_from_map_morph(shell: Shell, cluster: Cluster, remove_nodes: list[StorageNode], alive_node: Optional[StorageNode] = None):
"""
Move node to the Offline state in the candidates list and tick an epoch to update the netmap
using frostfs-adm
Args:
shell: local shell to make queries about current epoch. Remote shell will be used to tick new one
cluster: cluster instance under test
alive_node: node to send requests to (first node in cluster by default)
remove_nodes: list of nodes which would be removed from map
"""
alive_node = alive_node if alive_node else remove_nodes[0]
remote_shell = alive_node.host.get_shell()
node_netmap_keys = list(map(StorageNode.get_wallet_public_key, remove_nodes))
logger.info(f"Nodes netmap keys are: {' '.join(node_netmap_keys)}")
if FROSTFS_ADM_EXEC and FROSTFS_ADM_CONFIG_PATH:
# If frostfs-adm is available, then we tick epoch with it (to be consistent with UAT tests)
frostfsadm = FrostfsAdm(
shell=remote_shell,
frostfs_adm_exec_path=FROSTFS_ADM_EXEC,
config_file=FROSTFS_ADM_CONFIG_PATH,
)
frostfsadm.morph.remove_nodes(node_netmap_keys)
+
+
 def _run_control_command_with_retries(node: StorageNode, command: str, retries: int = 0) -> str:
     for attempt in range(1 + retries):  # original attempt + specified retries
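
For reviewers who haven't used the testlib wrapper: `frostfsadm.morph.remove_nodes(...)` shells out to the `frostfs-adm morph remove-nodes` subcommand on the remote host. A rough equivalent without the wrapper, as a sketch; the exact `--config` flag spelling is an assumption:

```python
# Roughly what the wrapper executes on the remote host (flag name assumed):
keys = " ".join(node_netmap_keys)
remote_shell.exec(
    f"{FROSTFS_ADM_EXEC} --config {FROSTFS_ADM_CONFIG_PATH} morph remove-nodes {keys}"
)
```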

View file

@@ -1,11 +1,15 @@
+import os
 import logging
+from time import sleep

 import allure
 import pytest
+from frostfs_testlib.analytics import test_case
 from frostfs_testlib.hosting import Host
 from frostfs_testlib.resources.common import PUBLIC_ACL
 from frostfs_testlib.shell import CommandOptions
+from frostfs_testlib.utils import datetime_utils

+from pytest_tests.resources.common import FROSTFS_CONTRACT_CACHE_TIMEOUT, MORPH_BLOCK_TIME
 from pytest_tests.helpers.cluster import Cluster, StorageNode
 from pytest_tests.helpers.container import create_container
 from pytest_tests.helpers.failover_utils import (
@@ -16,6 +20,34 @@ from pytest_tests.helpers.file_helper import generate_file, get_file_hash
 from pytest_tests.helpers.frostfs_verbs import get_object, put_object_to_random_node
 from pytest_tests.steps.cluster_test_base import ClusterTestBase
+from pytest_tests.helpers.node_management import (
+    check_node_in_map,
+    check_node_not_in_map,
+    exclude_node_from_network_map,
+    include_node_to_network_map,
+    remove_nodes_from_map_morph,
+    stop_random_storage_nodes,
+    wait_for_node_to_be_ready,
+)
+from pytest_tests.helpers.s3_helper import check_objects_in_bucket
+from pytest_tests.steps import s3_gate_object
+from pytest_tests.steps.s3_gate_base import TestS3GateBase
+from pytest_tests.helpers.aws_cli_client import AwsCliClient
 logger = logging.getLogger("NeoLogger")
 stopped_nodes: list[StorageNode] = []

@@ -173,3 +205,173 @@ class TestFailoverStorage(ClusterTestBase):
             wallet, cid, oid, shell=self.shell, endpoint=new_nodes[0].get_rpc_endpoint()
         )
         assert get_file_hash(source_file_path) == get_file_hash(got_file_path)
+
+
+def pytest_generate_tests(metafunc):
+    if "s3_client" in metafunc.fixturenames:
+        metafunc.parametrize("s3_client", ["aws cli", "boto3"], indirect=True)
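
`pytest_generate_tests` makes every S3 test in this module run twice, once per client backend; `indirect=True` routes the parameter through the `s3_client` fixture (defined in `TestS3GateBase`) instead of handing the raw string to the test. A sketch of how such an indirect parameter is typically consumed; the real fixture's body is an assumption:

```python
import pytest

@pytest.fixture()
def s3_client(request):
    # request.param is "aws cli" or "boto3", injected by the parametrize call above.
    # The real fixture builds the corresponding client; this only shows the routing.
    return {"aws cli": "cli-backed client", "boto3": "boto3-backed client"}[request.param]
```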
+
+
+@pytest.mark.failover
+@pytest.mark.failover_empty_map
+class TestEmptyMap(TestS3GateBase):
+    """
+    A set of tests that make the network map empty and verify
+    that objects can still be read through S3 afterwards.
+    """
+
+    @allure.step("Teardown after EmptyMap offline test")
+    @pytest.fixture()
+    def empty_map_offline_teardown(self):
+        yield
+        with allure.step("Return all storage nodes to network map"):
+            for node in list(stopped_nodes):
+                include_node_to_network_map(node, node, shell=self.shell, cluster=self.cluster)
+                stopped_nodes.remove(node)
+
+    @staticmethod
+    def object_key_from_file_path(full_path: str) -> str:
+        return os.path.basename(full_path)
@test_case.title("Test makes network map empty (offline all storage nodes)")
@test_case.priority(test_case.TestCasePriority.HIGH)
@test_case.suite_name("failovers")
@test_case.suite_section("test_failover_storage")
@pytest.mark.failover_empty_map_offlne
@allure.title("Test makes network map empty (offline all storage nodes)")
def test_offline_all_storage_nodes(self, bucket, simple_object_size, empty_map_offline_teardown):
"""
The test makes network map empty (set offline status on all storage nodes) then returns all nodes to map and checks that object can read through s3.
Steps:
1. Check that bucket is empty
2: PUT object into bucket
3: Check that object exists in bucket
4: Exclude all storage nodes from network map (set status OFFLINE)
5: Return all storage nodes to network map
6: Check that we can read object from #2
Args:
bucket: bucket which contains tested object
simple_object_size: size of object
"""
file_path = generate_file(simple_object_size)
file_name = self.object_key_from_file_path(file_path)
bucket_objects = [file_name]
objects_list = s3_gate_object.list_objects_s3(self.s3_client, bucket)
assert not objects_list, f"Expected empty bucket, got {objects_list}"
with allure.step("Put object into bucket"):
s3_gate_object.put_object_s3(self.s3_client, bucket, file_path)
with allure.step("Check that object exists in bucket"):
check_objects_in_bucket(self.s3_client, bucket, bucket_objects)
storage_nodes = self.cluster.storage_nodes
with allure.step("Exclude all storage nodes from network map"):
for node in storage_nodes:
exclude_node_from_network_map(
node, node, shell=self.shell, cluster=self.cluster
)
stopped_nodes.append(node)
with allure.step("Return all storage nodes to network map"):
for node in storage_nodes:
include_node_to_network_map(
node, node, shell=self.shell, cluster=self.cluster
)
stopped_nodes.remove(node)
with allure.step("Check that we can read object"):
check_objects_in_bucket(self.s3_client, bucket, bucket_objects)
objects_list = s3_gate_object.list_objects_s3(self.s3_client, bucket)
assert not objects_list, f"Expected empty bucket, got {objects_list}"
@allure.step("Teardown after EmptyMap stop service test")
@pytest.fixture()
def empty_map_stop_service_teardown(self):
yield
with allure.step("Return all storage nodes to network map"):
for node in list(list(stopped_nodes)):
with allure.step(f"Start node {node}"):
node.start_service()
with allure.step(f"Waiting status ready for node {node}"):
wait_for_node_to_be_ready(node)
sleep(datetime_utils.parse_time(MORPH_BLOCK_TIME))
self.tick_epochs(1)
check_node_in_map(node, shell=self.shell, alive_node=node)
stopped_nodes.remove(node)
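
One detail in this teardown (and in `return_nodes_after_stop_with_check_empty_map` below) worth calling out: after restarting a node the code sleeps for one morph block before ticking the epoch, presumably so the node's re-registration settles on chain first. `datetime_utils.parse_time` converts the configured duration string into seconds for `sleep`; the "8s" value below is an assumed example, not the real config value:

```python
from time import sleep
from frostfs_testlib.utils import datetime_utils

MORPH_BLOCK_TIME = "8s"  # assumed example; the tests read this from resources.common
sleep(datetime_utils.parse_time(MORPH_BLOCK_TIME))  # parse_time("8s") -> 8
```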
@test_case.title("Test makes network map empty (stop storage service on all nodes)")
@test_case.priority(test_case.TestCasePriority.HIGH)
@test_case.suite_name("failovers")
@test_case.suite_section("test_failover_storage")
@pytest.mark.failover_empty_map_stop_service
@allure.title("Test makes network map empty (stop storage service on all nodes)")
def test_stop_all_storage_nodes(self, bucket, simple_object_size, empty_map_stop_service_teardown):
"""
The test makes network map empty (stop storage service on all nodes
then use 'frostfs-adm morph delete-nodes' to delete nodes from map)
then start all services and checks that object can read through s3.
Steps:
1. Check that bucket is empty
2: PUT object into bucket
3: Check that object exists in bucket
4: Exclude all storage nodes from network map (stop storage service
and manual exclude from map)
5: Return all storage nodes to network map
6: Check that we can read object from #2
Args:
bucket: bucket which contains tested object
simple_object_size: size of object
"""
file_path = generate_file(simple_object_size)
file_name = self.object_key_from_file_path(file_path)
bucket_objects = [file_name]
objects_list = s3_gate_object.list_objects_s3(self.s3_client, bucket)
assert not objects_list, f"Expected empty bucket, got {objects_list}"
with allure.step("Put object into bucket"):
s3_gate_object.put_object_s3(self.s3_client, bucket, file_path)
with allure.step("Check that object exists in bucket"):
check_objects_in_bucket(self.s3_client, bucket, bucket_objects)
with allure.step("Stop all storage nodes"):
for node in self.cluster.storage_nodes:
with allure.step(f"Stop storage service on node: {node}"):
node.stop_service()
stopped_nodes.append(node)
with allure.step(f"Remove all nodes from network map"):
remove_nodes_from_map_morph(shell=self.shell, cluster=self.cluster, remove_nodes=stopped_nodes)
with allure.step("Return all storage nodes to network map"):
self.return_nodes_after_stop_with_check_empty_map(stopped_nodes)
with allure.step("Check that object exists in bucket"):
check_objects_in_bucket(self.s3_client, bucket, bucket_objects)
@allure.step("Return all nodes to cluster with check empty map first")
def return_nodes_after_stop_with_check_empty_map(self, return_nodes = None) -> None:
first_node = True
for node in list(return_nodes):
with allure.step(f"Start node {node}"):
node.start_service()
with allure.step(f"Waiting status ready for node {node}"):
wait_for_node_to_be_ready(node)
with allure.step(f"We need to make sure that network map is empty"):
if first_node:
for check_node in list(return_nodes):
check_node_not_in_map(check_node, shell=self.shell, alive_node=node)
first_node = False
sleep(datetime_utils.parse_time(MORPH_BLOCK_TIME))
self.tick_epochs(1)
check_node_in_map(node, shell=self.shell, alive_node=node)
stopped_nodes.remove(node)
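
Both tests coordinate with their teardown fixtures through the module-level `stopped_nodes` list: the test appends nodes as it takes them down, the happy path drains the list, and the fixture restores whatever is left when something fails mid-test. The pattern in miniature, with hypothetical names independent of this PR:

```python
import pytest

stopped: list[str] = []  # module-level registry shared by test and fixture

@pytest.fixture()
def cleanup_stopped():
    yield
    # Runs after the test: restore anything the test did not restore itself.
    for item in list(stopped):  # iterate over a copy while mutating the list
        print(f"restoring {item}")  # stand-in for include_node_to_network_map(...)
        stopped.remove(item)

def test_something(cleanup_stopped):
    stopped.append("node-1")
    # ... failover steps; on success the test restores the node and drains the list
    stopped.remove("node-1")
```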

View file

@@ -32,6 +32,7 @@ from pytest_tests.helpers.node_management import (
     node_shard_set_mode,
     storage_node_healthcheck,
     storage_node_set_status,
+    wait_for_node_to_be_ready,
 )
 from pytest_tests.helpers.storage_policy import get_nodes_with_object, get_simple_object_copies
 from pytest_tests.helpers.utility import (
@@ -109,7 +110,7 @@ class TestNodeManagement(ClusterTestBase):
             with allure.step(f"Start node {node}"):
                 node.start_service()
             with allure.step(f"Waiting status ready for node {node}"):
-                self.wait_for_node_to_be_ready(node)
+                wait_for_node_to_be_ready(node)

             # We need to wait for node to establish notifications from morph-chain
             # Otherwise it will hang up when we will try to set status
@@ -451,21 +452,6 @@ class TestNodeManagement(ClusterTestBase):
             f"Node {node} hasn't gone to the READY and ONLINE state after {timeout * attempts} second"
         )

-    @allure.step("Wait for node {node} is ready")
-    def wait_for_node_to_be_ready(self, node: StorageNode) -> None:
-        timeout, attempts = 30, 6
-        for _ in range(attempts):
-            try:
-                health_check = storage_node_healthcheck(node)
-                if health_check.health_status == "READY":
-                    return
-            except Exception as err:
-                logger.warning(f"Node {node} is not ready:\n{err}")
-            sleep(timeout)
-        raise AssertionError(
-            f"Node {node} hasn't gone to the READY state after {timeout * attempts} seconds"
-        )
-
     @allure.step("Wait for {expected_copies} object copies in the wallet")
     def wait_for_expected_object_copies(
         self, wallet: str, cid: str, oid: str, expected_copies: int = 2