From eb464f422c541477426e2d77e7959f2df8a8a5d0 Mon Sep 17 00:00:00 2001
From: anikeev-yadro
Date: Thu, 9 Mar 2023 13:19:41 +0300
Subject: [PATCH] Add tests with empty map

Signed-off-by: anikeev-yadro
---
 pytest_tests/helpers/cluster.py               |  10 +-
 pytest_tests/helpers/node_management.py       |  67 +++++-
 .../failovers/test_failover_storage.py        | 204 +++++++++++++++++-
 .../network/test_node_management.py           |  18 +-
 4 files changed, 279 insertions(+), 20 deletions(-)

diff --git a/pytest_tests/helpers/cluster.py b/pytest_tests/helpers/cluster.py
index 20704cc..cafe317 100644
--- a/pytest_tests/helpers/cluster.py
+++ b/pytest_tests/helpers/cluster.py
@@ -1,4 +1,5 @@
 import random
+import yaml
 import re
 from dataclasses import dataclass
 from typing import Any
@@ -110,7 +111,14 @@ class InnerRingNode(NodeBase):
     since frostfs network will still treat it as "node"
     """
 
-    pass
+    def get_netmap_cleaner_threshold(self) -> str:
+        config_file = self.get_remote_config_path()
+        contents = self.host.get_shell().exec(f"cat {config_file}").stdout
+
+        config = yaml.safe_load(contents)
+        value = config["netmap_cleaner"]["threshold"]
+
+        return value
 
 
 class S3Gate(NodeBase):
diff --git a/pytest_tests/helpers/node_management.py b/pytest_tests/helpers/node_management.py
index a4e9984..b92c4cd 100644
--- a/pytest_tests/helpers/node_management.py
+++ b/pytest_tests/helpers/node_management.py
@@ -2,17 +2,23 @@ import logging
 import random
 import re
 import time
+from time import sleep
 from dataclasses import dataclass
 from typing import Optional
 
 import allure
-from frostfs_testlib.cli import FrostfsCli
+from frostfs_testlib.cli import FrostfsAdm, FrostfsCli
 from frostfs_testlib.shell import Shell
 from frostfs_testlib.utils import datetime_utils
 
 from pytest_tests.helpers.cluster import Cluster, StorageNode
 from pytest_tests.helpers.epoch import tick_epoch
-from pytest_tests.resources.common import FROSTFS_CLI_EXEC, MORPH_BLOCK_TIME
+from pytest_tests.resources.common import (
+    FROSTFS_ADM_CONFIG_PATH,
+    FROSTFS_ADM_EXEC,
+    FROSTFS_CLI_EXEC,
+    MORPH_BLOCK_TIME,
+)
 
 logger = logging.getLogger("NeoLogger")
 
@@ -211,6 +217,71 @@ def check_node_in_map(
         node_netmap_key in snapshot
     ), f"Expected node with key {node_netmap_key} to be in network map"
+
+
+@allure.step("Check node {node} NOT in network map")
+def check_node_not_in_map(
+    node: StorageNode, shell: Shell, alive_node: Optional[StorageNode] = None
+) -> None:
+    alive_node = alive_node or node
+
+    node_netmap_key = node.get_wallet_public_key()
+    logger.info(f"Node ({node.label}) netmap key: {node_netmap_key}")
+
+    snapshot = get_netmap_snapshot(alive_node, shell)
+    assert (
+        node_netmap_key not in snapshot
+    ), f"Expected node with key {node_netmap_key} to be NOT in network map"
+
+
+@allure.step("Wait for node {node} to be ready")
+def wait_for_node_to_be_ready(node: StorageNode) -> None:
+    timeout, attempts = 30, 6
+    for _ in range(attempts):
+        try:
+            health_check = storage_node_healthcheck(node)
+            if health_check.health_status == "READY":
+                return
+        except Exception as err:
+            logger.warning(f"Node {node} is not ready:\n{err}")
+        sleep(timeout)
+    raise AssertionError(
+        f"Node {node} hasn't gone to the READY state after {timeout * attempts} seconds"
+    )
+
+
+@allure.step("Remove nodes from network map through frostfs-adm morph command")
+def remove_nodes_from_map_morph(
+    shell: Shell,
+    cluster: Cluster,
+    remove_nodes: list[StorageNode],
+    alive_node: Optional[StorageNode] = None,
+):
+    """
+    Move nodes to the Offline state in the candidates list and tick an epoch
+    to update the netmap, using frostfs-adm.
+
+    Args:
+        shell: local shell to make queries about current epoch. Remote shell will be used to tick new one
+        cluster: cluster instance under test
+        alive_node: node to send requests to (first node in cluster by default)
+        remove_nodes: list of nodes which would be removed from map
+    """
+
+    alive_node = alive_node if alive_node else remove_nodes[0]
+    remote_shell = alive_node.host.get_shell()
+
+    node_netmap_keys = list(map(StorageNode.get_wallet_public_key, remove_nodes))
+    logger.info(f"Nodes netmap keys are: {' '.join(node_netmap_keys)}")
+
+    if FROSTFS_ADM_EXEC and FROSTFS_ADM_CONFIG_PATH:
+        # If frostfs-adm is available, tick the epoch with it (to be consistent with UAT tests)
+        frostfsadm = FrostfsAdm(
+            shell=remote_shell,
+            frostfs_adm_exec_path=FROSTFS_ADM_EXEC,
+            config_file=FROSTFS_ADM_CONFIG_PATH,
+        )
+        frostfsadm.morph.remove_nodes(node_netmap_keys)
 
 
 def _run_control_command_with_retries(node: StorageNode, command: str, retries: int = 0) -> str:
    for attempt in range(1 + retries):  # original attempt + specified retries
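Before the test changes, a quick orientation on how the helpers above compose. The choreography the new failover tests rely on is: stop a node, verify absence from the map, start it, wait for READY, verify presence. A sketch under the assumption that `cluster` and `shell` come from the surrounding test fixtures (they are not part of this patch), with epoch handling elided:

```python
from pytest_tests.helpers.node_management import (
    check_node_in_map,
    check_node_not_in_map,
    wait_for_node_to_be_ready,
)

# Assumed fixture objects; in the suite these come from ClusterTestBase.
node, witness = cluster.storage_nodes[0], cluster.storage_nodes[1]

node.stop_service()
# Map membership only changes on an epoch tick, so absence is checked through
# a node that is still alive (epoch ticking elided in this sketch).
check_node_not_in_map(node, shell, alive_node=witness)

node.start_service()
wait_for_node_to_be_ready(node)  # polls healthcheck until status is READY
check_node_in_map(node, shell, alive_node=node)
```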
diff --git a/pytest_tests/testsuites/failovers/test_failover_storage.py b/pytest_tests/testsuites/failovers/test_failover_storage.py
index 167d1b9..655e619 100644
--- a/pytest_tests/testsuites/failovers/test_failover_storage.py
+++ b/pytest_tests/testsuites/failovers/test_failover_storage.py
@@ -1,11 +1,15 @@
+import os
 import logging
+from time import sleep
 
 import allure
 import pytest
+from frostfs_testlib.analytics import test_case
 from frostfs_testlib.hosting import Host
 from frostfs_testlib.resources.common import PUBLIC_ACL
 from frostfs_testlib.shell import CommandOptions
-
+from frostfs_testlib.utils import datetime_utils
+from pytest_tests.resources.common import FROSTFS_CONTRACT_CACHE_TIMEOUT, MORPH_BLOCK_TIME
 from pytest_tests.helpers.cluster import Cluster, StorageNode
 from pytest_tests.helpers.container import create_container
 from pytest_tests.helpers.failover_utils import (
@@ -16,6 +20,20 @@ from pytest_tests.helpers.file_helper import generate_file, get_file_hash
 from pytest_tests.helpers.frostfs_verbs import get_object, put_object_to_random_node
 from pytest_tests.steps.cluster_test_base import ClusterTestBase
 
+from pytest_tests.helpers.aws_cli_client import AwsCliClient
+from pytest_tests.helpers.node_management import (
+    check_node_in_map,
+    check_node_not_in_map,
+    exclude_node_from_network_map,
+    include_node_to_network_map,
+    remove_nodes_from_map_morph,
+    stop_random_storage_nodes,
+    wait_for_node_to_be_ready,
+)
+from pytest_tests.helpers.s3_helper import check_objects_in_bucket
+from pytest_tests.steps import s3_gate_object
+from pytest_tests.steps.s3_gate_base import TestS3GateBase
+
 logger = logging.getLogger("NeoLogger")
 
 stopped_nodes: list[StorageNode] = []
@@ -173,3 +191,186 @@ class TestFailoverStorage(ClusterTestBase):
             wallet, cid, oid, shell=self.shell, endpoint=new_nodes[0].get_rpc_endpoint()
         )
         assert get_file_hash(source_file_path) == get_file_hash(got_file_path)
+
+
+def pytest_generate_tests(metafunc):
+    if "s3_client" in metafunc.fixturenames:
+        metafunc.parametrize("s3_client", ["aws cli", "boto3"], indirect=True)
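The `pytest_generate_tests` hook above uses pytest's indirect parametrization: each string is routed through the `s3_client` fixture (provided by `TestS3GateBase`) rather than passed to the test verbatim. A minimal self-contained sketch of the mechanism, with illustrative names in place of the real fixture:

```python
import pytest


def pytest_generate_tests(metafunc):
    # Parametrize every test that requests the "client" fixture.
    if "client" in metafunc.fixturenames:
        metafunc.parametrize("client", ["aws cli", "boto3"], indirect=True)


@pytest.fixture()
def client(request):
    # With indirect=True the parameter arrives in request.param;
    # a real fixture would build the matching S3 client here.
    return {"backend": request.param}


def test_backend_selected(client):
    assert client["backend"] in ("aws cli", "boto3")
```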
+
+
+@pytest.mark.failover
+@pytest.mark.failover_empty_map
+class TestEmptyMap(TestS3GateBase):
+    """
+    A set of tests that empty the network map and verify that objects can still be read through S3 afterwards.
+    """
+
+    @allure.step("Teardown after EmptyMap offline test")
+    @pytest.fixture()
+    def empty_map_offline_teardown(self):
+        yield
+        with allure.step("Return all storage nodes to network map"):
+            for node in list(stopped_nodes):
+                include_node_to_network_map(
+                    node, node, shell=self.shell, cluster=self.cluster
+                )
+                stopped_nodes.remove(node)
+
+    @staticmethod
+    def object_key_from_file_path(full_path: str) -> str:
+        return os.path.basename(full_path)
+
+    @test_case.title("Test makes network map empty (offline all storage nodes)")
+    @test_case.priority(test_case.TestCasePriority.HIGH)
+    @test_case.suite_name("failovers")
+    @test_case.suite_section("test_failover_storage")
+    @pytest.mark.failover_empty_map_offline
+    @allure.title("Test makes network map empty (offline all storage nodes)")
+    def test_offline_all_storage_nodes(
+        self, bucket, simple_object_size, empty_map_offline_teardown
+    ):
+        """
+        The test empties the network map (sets OFFLINE status on all storage nodes),
+        then returns all nodes to the map and checks that the object can still be
+        read through S3.
+
+        Steps:
+        1. Check that the bucket is empty
+        2. PUT an object into the bucket
+        3. Check that the object exists in the bucket
+        4. Exclude all storage nodes from the network map (set status OFFLINE)
+        5. Return all storage nodes to the network map
+        6. Check that the object from step 2 can be read
+
+        Args:
+            bucket: bucket which contains tested object
+            simple_object_size: size of object
+        """
+        file_path = generate_file(simple_object_size)
+        file_name = self.object_key_from_file_path(file_path)
+        bucket_objects = [file_name]
+
+        objects_list = s3_gate_object.list_objects_s3(self.s3_client, bucket)
+        assert not objects_list, f"Expected empty bucket, got {objects_list}"
+
+        with allure.step("Put object into bucket"):
+            s3_gate_object.put_object_s3(self.s3_client, bucket, file_path)
+
+        with allure.step("Check that object exists in bucket"):
+            check_objects_in_bucket(self.s3_client, bucket, bucket_objects)
+
+        storage_nodes = self.cluster.storage_nodes
+        with allure.step("Exclude all storage nodes from network map"):
+            for node in storage_nodes:
+                exclude_node_from_network_map(
+                    node, node, shell=self.shell, cluster=self.cluster
+                )
+                stopped_nodes.append(node)
+
+        with allure.step("Return all storage nodes to network map"):
+            for node in storage_nodes:
+                include_node_to_network_map(
+                    node, node, shell=self.shell, cluster=self.cluster
+                )
+                stopped_nodes.remove(node)
+
+        with allure.step("Check that we can read object"):
+            check_objects_in_bucket(self.s3_client, bucket, bucket_objects)
+
+    @allure.step("Teardown after EmptyMap stop service test")
+    @pytest.fixture()
+    def empty_map_stop_service_teardown(self):
+        yield
+        with allure.step("Return all storage nodes to network map"):
+            for node in list(stopped_nodes):
+                with allure.step(f"Start node {node}"):
+                    node.start_service()
+                with allure.step(f"Waiting status ready for node {node}"):
+                    wait_for_node_to_be_ready(node)
+
+                sleep(datetime_utils.parse_time(MORPH_BLOCK_TIME))
+                self.tick_epochs(1)
+                check_node_in_map(node, shell=self.shell, alive_node=node)
+                stopped_nodes.remove(node)
+
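The teardown above is the re-registration recipe used throughout these tests: restart the service, wait for READY, let one morph block pass so the node's bootstrap request lands, tick an epoch, then assert map membership. Condensed into a standalone helper for readability (the helper name is hypothetical; the calls are exactly the ones the fixture makes):

```python
from time import sleep

from frostfs_testlib.utils import datetime_utils

from pytest_tests.helpers.node_management import (
    check_node_in_map,
    wait_for_node_to_be_ready,
)
from pytest_tests.resources.common import MORPH_BLOCK_TIME


def return_node_to_map(node, shell, tick_epochs) -> None:
    """Hypothetical helper mirroring the teardown sequence above."""
    node.start_service()
    wait_for_node_to_be_ready(node)
    # One morph block for the node's bootstrap request to be accepted...
    sleep(datetime_utils.parse_time(MORPH_BLOCK_TIME))
    # ...and one epoch tick, since the netmap is only rebuilt on epoch change.
    tick_epochs(1)
    check_node_in_map(node, shell=shell, alive_node=node)
```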
+    @test_case.title("Test makes network map empty (stop storage service on all nodes)")
+    @test_case.priority(test_case.TestCasePriority.HIGH)
+    @test_case.suite_name("failovers")
+    @test_case.suite_section("test_failover_storage")
+    @pytest.mark.failover_empty_map_stop_service
+    @allure.title("Test makes network map empty (stop storage service on all nodes)")
+    def test_stop_all_storage_nodes(
+        self, bucket, simple_object_size, empty_map_stop_service_teardown
+    ):
+        """
+        The test empties the network map (stops the storage service on all nodes,
+        then uses 'frostfs-adm morph remove-nodes' to delete the nodes from the map),
+        then starts all services again and checks that the object can still be read
+        through S3.
+
+        Steps:
+        1. Check that the bucket is empty
+        2. PUT an object into the bucket
+        3. Check that the object exists in the bucket
+        4. Exclude all storage nodes from the network map (stop the storage service,
+           then manually exclude the nodes from the map)
+        5. Return all storage nodes to the network map
+        6. Check that the object from step 2 can be read
+
+        Args:
+            bucket: bucket which contains tested object
+            simple_object_size: size of object
+        """
+        file_path = generate_file(simple_object_size)
+        file_name = self.object_key_from_file_path(file_path)
+        bucket_objects = [file_name]
+
+        objects_list = s3_gate_object.list_objects_s3(self.s3_client, bucket)
+        assert not objects_list, f"Expected empty bucket, got {objects_list}"
+
+        with allure.step("Put object into bucket"):
+            s3_gate_object.put_object_s3(self.s3_client, bucket, file_path)
+
+        with allure.step("Check that object exists in bucket"):
+            check_objects_in_bucket(self.s3_client, bucket, bucket_objects)
+
+        with allure.step("Stop all storage nodes"):
+            for node in self.cluster.storage_nodes:
+                with allure.step(f"Stop storage service on node: {node}"):
+                    node.stop_service()
+                    stopped_nodes.append(node)
+
+        with allure.step("Remove all nodes from network map"):
+            remove_nodes_from_map_morph(
+                shell=self.shell, cluster=self.cluster, remove_nodes=stopped_nodes
+            )
+
+        with allure.step("Return all storage nodes to network map"):
+            self.return_nodes_after_stop_with_check_empty_map(stopped_nodes)
+
+        with allure.step("Check that object exists in bucket"):
+            check_objects_in_bucket(self.s3_client, bucket, bucket_objects)
+
+    @allure.step("Return all nodes to cluster with check empty map first")
+    def return_nodes_after_stop_with_check_empty_map(
+        self, return_nodes: list[StorageNode] = None
+    ) -> None:
+        first_node = True
+        for node in list(return_nodes):
+            with allure.step(f"Start node {node}"):
+                node.start_service()
+            with allure.step(f"Waiting status ready for node {node}"):
+                wait_for_node_to_be_ready(node)
+
+            with allure.step("Make sure that network map is empty"):
+                if first_node:
+                    for check_node in list(return_nodes):
+                        check_node_not_in_map(
+                            check_node, shell=self.shell, alive_node=node
+                        )
+                    first_node = False
+
+            sleep(datetime_utils.parse_time(MORPH_BLOCK_TIME))
+            self.tick_epochs(1)
+            check_node_in_map(node, shell=self.shell, alive_node=node)
+            stopped_nodes.remove(node)
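For reference, the `FrostfsAdm` wrapper called in `remove_nodes_from_map_morph` drives the `frostfs-adm morph remove-nodes` subcommand. A rough plain-`subprocess` equivalent is sketched below; the flag spelling is inferred from the wrapper's constructor arguments and should be read as an assumption rather than verified CLI documentation:

```python
import subprocess


def remove_nodes_via_cli(netmap_keys: list[str], adm_exec: str, config_path: str) -> None:
    # Assumed CLI shape: frostfs-adm -c <config> morph remove-nodes <key> [<key> ...]
    cmd = [adm_exec, "-c", config_path, "morph", "remove-nodes", *netmap_keys]
    subprocess.run(cmd, check=True)
```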
diff --git a/pytest_tests/testsuites/network/test_node_management.py b/pytest_tests/testsuites/network/test_node_management.py
index 888a2b0..4e82120 100644
--- a/pytest_tests/testsuites/network/test_node_management.py
+++ b/pytest_tests/testsuites/network/test_node_management.py
@@ -32,6 +32,7 @@ from pytest_tests.helpers.node_management import (
     node_shard_set_mode,
     storage_node_healthcheck,
     storage_node_set_status,
+    wait_for_node_to_be_ready,
 )
 from pytest_tests.helpers.storage_policy import get_nodes_with_object, get_simple_object_copies
 from pytest_tests.helpers.utility import (
@@ -109,7 +110,7 @@ class TestNodeManagement(ClusterTestBase):
             with allure.step(f"Start node {node}"):
                 node.start_service()
             with allure.step(f"Waiting status ready for node {node}"):
-                self.wait_for_node_to_be_ready(node)
+                wait_for_node_to_be_ready(node)
 
         # We need to wait for node to establish notifications from morph-chain
         # Otherwise it will hang up when we will try to set status
@@ -451,21 +452,6 @@ class TestNodeManagement(ClusterTestBase):
             f"Node {node} hasn't gone to the READY and ONLINE state after {timeout * attempts} second"
         )
 
-    @allure.step("Wait for node {node} is ready")
-    def wait_for_node_to_be_ready(self, node: StorageNode) -> None:
-        timeout, attempts = 30, 6
-        for _ in range(attempts):
-            try:
-                health_check = storage_node_healthcheck(node)
-                if health_check.health_status == "READY":
-                    return
-            except Exception as err:
-                logger.warning(f"Node {node} is not ready:\n{err}")
-            sleep(timeout)
-        raise AssertionError(
-            f"Node {node} hasn't gone to the READY state after {timeout * attempts} seconds"
-        )
-
     @allure.step("Wait for {expected_copies} object copies in the wallet")
     def wait_for_expected_object_copies(
         self, wallet: str, cid: str, oid: str, expected_copies: int = 2
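A closing note on selection: the marks introduced by this patch (`failover_empty_map`, `failover_empty_map_offline`, `failover_empty_map_stop_service`) let you run just the new tests. One way to do that programmatically, assuming the repository layout shown in the diffstat:

```python
import sys

import pytest

# Run only the empty-map failover tests by their mark.
sys.exit(pytest.main(["-m", "failover_empty_map", "pytest_tests/testsuites/failovers"]))
```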