From d986fe2fb6dcf456b142224131afcbbab66a0bb7 Mon Sep 17 00:00:00 2001
From: Andrey Berezin
Date: Wed, 25 Oct 2023 15:47:29 +0300
Subject: [PATCH] [#124] Use CSC in case of node shutdown

Signed-off-by: Andrey Berezin
---
 pytest_tests/testsuites/conftest.py            |   4 +-
 .../failovers/test_failover_storage.py         | 205 +++++++++---------
 requirements.txt                               |   2 +-
 3 files changed, 104 insertions(+), 107 deletions(-)

diff --git a/pytest_tests/testsuites/conftest.py b/pytest_tests/testsuites/conftest.py
index 2c7a9191..3fd97ed0 100644
--- a/pytest_tests/testsuites/conftest.py
+++ b/pytest_tests/testsuites/conftest.py
@@ -276,10 +276,10 @@ def after_deploy_healthcheck(cluster: Cluster):
     parallel(readiness_on_node, cluster.cluster_nodes)
 
 
-SERVICE_ACTIVE_TIME = 15
+SERVICE_ACTIVE_TIME = 20
 
 
-@wait_for_success(60 * SERVICE_ACTIVE_TIME * 2, 15)
+@wait_for_success(60 * SERVICE_ACTIVE_TIME * 3, 60)
 @allure.step("Check readiness on node {cluster_node}")
 def readiness_on_node(cluster_node: ClusterNode):
     # TODO: Move to healtcheck classes
diff --git a/pytest_tests/testsuites/failovers/test_failover_storage.py b/pytest_tests/testsuites/failovers/test_failover_storage.py
index 8200a6c3..d9f6b68f 100644
--- a/pytest_tests/testsuites/failovers/test_failover_storage.py
+++ b/pytest_tests/testsuites/failovers/test_failover_storage.py
@@ -1,4 +1,5 @@
 import logging
+import random
 from datetime import datetime
 from time import sleep
 
@@ -8,7 +9,7 @@ from frostfs_testlib.hosting import Host
 from frostfs_testlib.resources.common import MORPH_BLOCK_TIME
 from frostfs_testlib.resources.wellknown_acl import PUBLIC_ACL
 from frostfs_testlib.s3 import AwsCliClient, Boto3ClientWrapper, S3ClientWrapper, VersioningStatus
-from frostfs_testlib.shell import CommandOptions, Shell
+from frostfs_testlib.shell import CommandOptions
 from frostfs_testlib.steps.cli.container import StorageContainer, StorageContainerInfo, create_container
 from frostfs_testlib.steps.cli.object import get_object, put_object_to_random_node
 from frostfs_testlib.steps.node_management import (
@@ -28,7 +29,7 @@ from frostfs_testlib.storage.dataclasses.wallet import WalletInfo
 from frostfs_testlib.testing.cluster_test_base import ClusterTestBase
 from frostfs_testlib.testing.test_control import expect_not_raises
 from frostfs_testlib.utils import datetime_utils
-from frostfs_testlib.utils.failover_utils import wait_all_storage_nodes_returned, wait_object_replication
+from frostfs_testlib.utils.failover_utils import wait_object_replication
 from frostfs_testlib.utils.file_keeper import FileKeeper
 from frostfs_testlib.utils.file_utils import generate_file, get_file_hash
 
@@ -51,9 +52,9 @@ def file_keeper():
 
 @allure.step("Return all stopped hosts")
 @pytest.fixture(scope="function", autouse=True)
-def after_run_return_all_stopped_hosts(client_shell: Shell, cluster: Cluster) -> str:
-    yield "After this test stopped services will be started automatically via fixture"
-    return_stopped_hosts(client_shell, cluster)
+def after_run_return_all_stopped_hosts(cluster_state_controller: ClusterStateController) -> str:
+    yield
+    cluster_state_controller.start_stopped_hosts()
 
 
 @allure.step("Return all stopped storage services after test")
@@ -78,16 +79,8 @@ def panic_reboot_host(host: Host) -> None:
     shell.exec('sudo sh -c "echo b > /proc/sysrq-trigger"', options)
 
 
-def return_stopped_hosts(shell: Shell, cluster: Cluster) -> None:
-    for node in list(stopped_nodes):
-        with allure.step(f"Start host {node}"):
-            node.host.start_host()
-            stopped_nodes.remove(node)
-
-    wait_all_storage_nodes_returned(shell, cluster)
-
-
 @pytest.mark.failover
+@pytest.mark.failover_storage
 class TestFailoverStorage(ClusterTestBase):
     @allure.title("Shutdown and start node (stop_mode={stop_mode})")
     @pytest.mark.parametrize("stop_mode", ["hard", "soft"])
@@ -98,52 +91,68 @@ class TestFailoverStorage(ClusterTestBase):
         stop_mode: str,
         require_multiple_hosts,
         simple_object_size: ObjectSize,
+        cluster: Cluster,
+        cluster_state_controller: ClusterStateController,
     ):
         wallet = default_wallet
         placement_rule = "REP 2 IN X CBF 2 SELECT 2 FROM * AS X"
         source_file_path = generate_file(simple_object_size.value)
-        cid = create_container(
-            wallet,
-            shell=self.shell,
-            endpoint=self.cluster.default_rpc_endpoint,
-            rule=placement_rule,
-            basic_acl=PUBLIC_ACL,
-        )
-        oid = put_object_to_random_node(wallet, source_file_path, cid, shell=self.shell, cluster=self.cluster)
-        nodes = wait_object_replication(cid, oid, 2, shell=self.shell, nodes=self.cluster.storage_nodes)
+        stopped_hosts_nodes = []
 
-        for node in nodes:
-            stopped_nodes.append(node)
-
-            with allure.step(f"Stop host {node}"):
-                node.host.stop_host(stop_mode)
-
-            new_nodes = wait_object_replication(
-                cid,
-                oid,
-                2,
+        with allure.step(f"Create container and put object"):
+            cid = create_container(
+                wallet,
                 shell=self.shell,
-                nodes=list(set(self.cluster.storage_nodes) - {*stopped_nodes}),
+                endpoint=self.cluster.default_rpc_endpoint,
+                rule=placement_rule,
+                basic_acl=PUBLIC_ACL,
             )
-        assert all(old_node not in new_nodes for old_node in nodes)
+            oid = put_object_to_random_node(wallet, source_file_path, cid, shell=self.shell, cluster=self.cluster)
+
+        with allure.step(f"Wait for replication and get nodes with object"):
+            nodes_with_object = wait_object_replication(cid, oid, 2, shell=self.shell, nodes=self.cluster.storage_nodes)
+
+        with allure.step(f"Stop 2 nodes with object and wait replication one by one"):
+            for storage_node in random.sample(nodes_with_object, 2):
+                stopped_hosts_nodes.append(storage_node)
+
+                cluster_node = cluster.node(storage_node)
+                cluster_state_controller.stop_node_host(cluster_node, stop_mode)
+
+                replicated_nodes = wait_object_replication(
+                    cid,
+                    oid,
+                    2,
+                    shell=self.shell,
+                    nodes=list(set(self.cluster.storage_nodes) - {*stopped_hosts_nodes}),
+                )
 
         with allure.step("Check object data is not corrupted"):
-            got_file_path = get_object(wallet, cid, oid, endpoint=new_nodes[0].get_rpc_endpoint(), shell=self.shell)
+            got_file_path = get_object(
+                wallet, cid, oid, endpoint=replicated_nodes[0].get_rpc_endpoint(), shell=self.shell
+            )
             assert get_file_hash(source_file_path) == get_file_hash(got_file_path)
 
         with allure.step("Return all hosts"):
-            return_stopped_hosts(self.shell, self.cluster)
+            cluster_state_controller.start_stopped_hosts()
 
         with allure.step("Check object data is not corrupted"):
-            new_nodes = wait_object_replication(cid, oid, 2, shell=self.shell, nodes=self.cluster.storage_nodes)
-            got_file_path = get_object(wallet, cid, oid, shell=self.shell, endpoint=new_nodes[0].get_rpc_endpoint())
+            replicated_nodes = wait_object_replication(cid, oid, 2, shell=self.shell, nodes=self.cluster.storage_nodes)
+            got_file_path = get_object(
+                wallet, cid, oid, shell=self.shell, endpoint=replicated_nodes[0].get_rpc_endpoint()
+            )
             assert get_file_hash(source_file_path) == get_file_hash(got_file_path)
 
     @allure.title("Panic reboot nodes (sequenced_reboots={sequence})")
     @pytest.mark.parametrize("sequence", [True, False])
     @pytest.mark.failover_panic
     def test_panic_storage_node_host(
-        self, default_wallet, require_multiple_hosts, sequence: bool, simple_object_size: ObjectSize
+        self,
+        default_wallet,
+        require_multiple_hosts,
+        sequence: bool,
+        simple_object_size: ObjectSize,
+        cluster_state_controller: ClusterStateController,
     ):
         wallet = default_wallet
         placement_rule = "REP 2 IN X CBF 2 SELECT 2 FROM * AS X"
@@ -157,38 +166,29 @@ class TestFailoverStorage(ClusterTestBase):
         )
         oid = put_object_to_random_node(wallet, source_file_path, cid, shell=self.shell, cluster=self.cluster)
-        nodes = wait_object_replication(cid, oid, 2, shell=self.shell, nodes=self.cluster.storage_nodes)
+        nodes_with_object = wait_object_replication(cid, oid, 2, shell=self.shell, nodes=self.cluster.storage_nodes)
         allure.attach(
-            "\n".join([str(node) for node in nodes]),
+            "\n".join([str(node) for node in nodes_with_object]),
             "Current nodes with object",
             allure.attachment_type.TEXT,
         )
 
         new_nodes: list[StorageNode] = []
-        for node in nodes:
-            with allure.step(f"Hard reboot host {node} via magic SysRq option"):
-                panic_reboot_host(node.host)
+        with allure.step(f"Hard reboot hosts via magic SysRq option"):
+            for storage_node in nodes_with_object:
+                cluster_state_controller.panic_reboot_host(self.cluster.node(storage_node))
                 if sequence:
-                    try:
-                        new_nodes = wait_object_replication(
-                            cid,
-                            oid,
-                            2,
-                            shell=self.shell,
-                            nodes=list(set(self.cluster.storage_nodes) - {node}),
-                        )
-                    except AssertionError:
-                        new_nodes = wait_object_replication(
-                            cid,
-                            oid,
-                            2,
-                            shell=self.shell,
-                            nodes=self.cluster.storage_nodes,
-                        )
+                    new_nodes = wait_object_replication(
+                        cid,
+                        oid,
+                        2,
+                        shell=self.shell,
+                        nodes=self.cluster.storage_nodes,
+                    )
 
                     allure.attach(
                         "\n".join([str(new_node) for new_node in new_nodes]),
-                        f"Nodes with object after {node} fail",
+                        f"Nodes with object after {storage_node} fail",
                         allure.attachment_type.TEXT,
                     )
 
@@ -213,19 +213,18 @@ class TestFailoverStorage(ClusterTestBase):
         after_run_return_all_stopped_services,
     ):
         default_node = self.cluster.cluster_nodes[0]
-        default_s3gate = self.cluster.s3_gates[0]
 
         with allure.step("Turn S3 GW off on default node"):
-            default_s3gate.stop_service()
+            cluster_state_controller.stop_s3_gate(default_node)
 
         with allure.step("Turn off storage on default node"):
             cluster_state_controller.stop_storage_service(default_node)
 
         with allure.step("Turn on S3 GW on default node"):
-            default_s3gate.start_service()
+            cluster_state_controller.start_s3_gate(default_node)
 
         with allure.step("Turn on storage on default node"):
-            cluster_state_controller.start_stopped_storage_services()
+            cluster_state_controller.start_storage_service(default_node)
 
         with allure.step("Create bucket with REP 1 SELECT 1 policy"):
             bucket = s3_client.create_bucket(
@@ -246,6 +245,9 @@ class TestFailoverStorage(ClusterTestBase):
         with allure.step("Check that object is available"):
             s3_helper.check_objects_in_bucket(s3_client, bucket, expected_objects=[file_name])
 
+        with allure.step("Start storage nodes"):
+            cluster_state_controller.start_stopped_storage_services()
+
 
 @pytest.mark.failover
 @pytest.mark.failover_empty_map
@@ -259,7 +261,7 @@ class TestEmptyMap(ClusterTestBase):
     def empty_map_offline_teardown(self):
         yield
         with allure.step("Return all storage nodes to network map"):
-            for node in list(stopped_nodes):
+            for node in stopped_nodes:
                 include_node_to_network_map(node, node, shell=self.shell, cluster=self.cluster)
                 stopped_nodes.remove(node)
 
@@ -315,19 +317,12 @@ class TestEmptyMap(ClusterTestBase):
 
     @allure.step("Teardown after EmptyMap stop service test")
     @pytest.fixture()
-    def empty_map_stop_service_teardown(self):
+    def empty_map_stop_service_teardown(self, cluster_state_controller: ClusterStateController):
         yield
         with allure.step("Return all storage nodes to network map"):
-            for node in list(list(stopped_nodes)):
-                with allure.step(f"Start node {node}"):
-                    node.start_service()
-                with allure.step(f"Waiting status ready for node {node}"):
-                    wait_for_node_to_be_ready(node)
-
-                sleep(datetime_utils.parse_time(MORPH_BLOCK_TIME))
-                self.tick_epochs(1)
+            cluster_state_controller.start_stopped_storage_services()
+            for node in stopped_nodes:
                 check_node_in_map(node, shell=self.shell, alive_node=node)
-                stopped_nodes.remove(node)
 
     @pytest.mark.failover_empty_map_stop_service
     @allure.title("Empty network map via stop all storage services (s3_client={s3_client})")
     def test_s3_all_maps_stop_service(
         self,
         s3_client: S3ClientWrapper,
         bucket: str,
         simple_object_size: ObjectSize,
         empty_map_stop_service_teardown,
+        cluster_state_controller: ClusterStateController,
     ):
         """
         The test makes network map empty (stop storage service on all nodes
@@ -369,39 +365,40 @@ class TestEmptyMap(ClusterTestBase):
             s3_helper.check_objects_in_bucket(s3_client, bucket, bucket_objects)
 
         with allure.step("Stop all storage nodes"):
-            for node in self.cluster.storage_nodes:
-                with allure.step(f"Stop storage service on node: {node}"):
-                    stopped_nodes.append(node)
-                    node.stop_service()
+            cluster_state_controller.stop_all_storage_services()
 
         with allure.step("Remove all nodes from network map"):
-            remove_nodes_from_map_morph(shell=self.shell, cluster=self.cluster, remove_nodes=stopped_nodes)
+            remove_nodes_from_map_morph(
+                shell=self.shell, cluster=self.cluster, remove_nodes=self.cluster.services(StorageNode)
+            )
 
         with allure.step("Return all storage nodes to network map"):
-            self.return_nodes_after_stop_with_check_empty_map(stopped_nodes)
+            self.return_nodes_after_stop_with_check_empty_map(cluster_state_controller)
 
         with allure.step("Check that object exists in bucket"):
             s3_helper.check_objects_in_bucket(s3_client, bucket, bucket_objects)
 
     @allure.step("Return all nodes to cluster with check empty map first")
-    def return_nodes_after_stop_with_check_empty_map(self, return_nodes=None) -> None:
-        first_node = True
-        for node in list(return_nodes):
-            with allure.step(f"Start node {node}"):
-                node.start_service()
-            with allure.step(f"Waiting status ready for node {node}"):
-                wait_for_node_to_be_ready(node)
+    def return_nodes_after_stop_with_check_empty_map(self, cluster_state_controller: ClusterStateController) -> None:
+        first_node = self.cluster.cluster_nodes[0].service(StorageNode)
 
-            with allure.step("Make sure that network map is empty"):
-                if first_node:
-                    for check_node in list(return_nodes):
-                        check_node_not_in_map(check_node, shell=self.shell, alive_node=node)
-                    first_node = False
+        with allure.step("Start first node and check network map"):
+            cluster_state_controller.start_storage_service(self.cluster.cluster_nodes[0])
+            wait_for_node_to_be_ready(first_node)
+
+            for check_node in self.cluster.storage_nodes:
+                check_node_not_in_map(check_node, shell=self.shell, alive_node=first_node)
+
+        for node in self.cluster.cluster_nodes[1:]:
+            storage_node = node.service(StorageNode)
+
+            cluster_state_controller.start_storage_service(node)
+            wait_for_node_to_be_ready(storage_node)
 
             sleep(datetime_utils.parse_time(MORPH_BLOCK_TIME))
             self.tick_epochs(1)
-            check_node_in_map(node, shell=self.shell, alive_node=node)
-            stopped_nodes.remove(node)
+
+            check_node_in_map(storage_node, shell=self.shell, alive_node=first_node)
 
(versioning=enabled, s3_client={s3_client})") def test_s3_fstree_blobovnicza_loss_versioning_on( @@ -537,11 +534,11 @@ class TestEmptyMap(ClusterTestBase): @pytest.mark.failover_data_loss class TestStorageDataLoss(ClusterTestBase): @allure.step("Get list of all piloramas on node") - def get_piloramas_list(self, cluster_state_controller, node) -> list: - data_directory_path = cluster_state_controller.get_data_directory() + def get_piloramas_list(self, node: StorageNode) -> list: + data_directory_path = node.get_data_directory() cmd = f"sudo ls -1 {data_directory_path}/meta*/pilorama*" - shell = cluster_state_controller.host.get_shell() + shell = node.host.get_shell() stdout = shell.exec(cmd).stdout piloramas = stdout.split("\n") @@ -590,11 +587,11 @@ class TestStorageDataLoss(ClusterTestBase): with allure.step("Enable resync_metabase option for storage services"): for storage_node in cluster_state_controller.cluster.storage_nodes: with allure.step(f"Enable resync_metabase option for {storage_node}"): - config_file_path, config = storage_node.get_shard_config_path() + config_file_path, config = storage_node.get_shards_config() if not config["storage"]["shard"]["default"]["resync_metabase"]: file_keeper.add(storage_node, config_file_path) config["storage"]["shard"]["default"]["resync_metabase"] = True - storage_node.save_config(config) + storage_node.save_config(config, config_file_path) with allure.step("Start storage services on all nodes"): cluster_state_controller.start_stopped_storage_services() @@ -752,7 +749,7 @@ class TestStorageDataLoss(ClusterTestBase): node_to_check = self.cluster.storage_nodes[0] piloramas_list_before_removing = {} with allure.step("Get list of all pilorama.db"): - piloramas_list_before_removing = self.get_piloramas_list(node_to_check, cluster_state_controller) + piloramas_list_before_removing = self.get_piloramas_list(node_to_check) with allure.step("Stop all storage nodes"): for node in self.cluster.cluster_nodes: @@ -771,7 +768,7 @@ class TestStorageDataLoss(ClusterTestBase): piloramas_list_afrer_removing = {} with allure.step("Get list of all pilorama.db after sync"): - piloramas_list_afrer_removing = self.get_piloramas_list(node_to_check, cluster_state_controller) + piloramas_list_afrer_removing = self.get_piloramas_list(node_to_check) assert piloramas_list_afrer_removing == piloramas_list_before_removing, "List of pilorama.db is different" with allure.step("Check bucket versioning"): diff --git a/requirements.txt b/requirements.txt index 623d60cd..2f28a1ad 100644 --- a/requirements.txt +++ b/requirements.txt @@ -11,6 +11,6 @@ pyyaml==6.0.1 pytest==7.1.2 pytest-lazy-fixture==0.6.3 python-dateutil==2.8.2 -requests==2.28.0 +requests==2.28.1 tenacity==8.0.1 urllib3==1.26.9 \ No newline at end of file