Move shared code to testlib

Signed-off-by: Andrey Berezin <a.berezin@yadro.com>
Andrey Berezin 2023-05-14 13:43:59 +03:00
parent d97a02d1d3
commit 997e768e92
69 changed files with 9213 additions and 64 deletions

src/frostfs_testlib/storage/controllers/background_load_controller.py

@@ -0,0 +1,207 @@
import frostfs_testlib.resources.optionals as optionals
from frostfs_testlib.load.k6 import K6
from frostfs_testlib.load.load_config import (
    EndpointSelectionStrategy,
    K6ProcessAllocationStrategy,
    LoadParams,
    LoadScenario,
    LoadType,
)
from frostfs_testlib.load.load_steps import init_s3_client, prepare_k6_instances
from frostfs_testlib.reporter import get_reporter
from frostfs_testlib.resources.load_params import (
    K6_TEARDOWN_PERIOD,
    LOAD_NODE_SSH_PASSWORD,
    LOAD_NODE_SSH_PRIVATE_KEY_PASSPHRASE,
    LOAD_NODE_SSH_PRIVATE_KEY_PATH,
    LOAD_NODE_SSH_USER,
    LOAD_NODES,
)
from frostfs_testlib.shell.interfaces import SshCredentials
from frostfs_testlib.storage.cluster import ClusterNode
from frostfs_testlib.storage.dataclasses.frostfs_services import S3Gate, StorageNode
from frostfs_testlib.storage.dataclasses.wallet import WalletInfo
from frostfs_testlib.testing.test_control import run_optionally

reporter = get_reporter()


class BackgroundLoadController:
    k6_instances: list[K6]
    k6_dir: str
    load_params: LoadParams
    load_nodes: list[str]
    verification_params: LoadParams
    nodes_under_load: list[ClusterNode]
    ssh_credentials: SshCredentials
    loaders_wallet: WalletInfo
    endpoints: list[str]

    def __init__(
        self,
        k6_dir: str,
        load_params: LoadParams,
        loaders_wallet: WalletInfo,
        nodes_under_load: list[ClusterNode],
    ) -> None:
        self.k6_dir = k6_dir
        self.load_params = load_params
        self.nodes_under_load = nodes_under_load
        self.load_nodes = LOAD_NODES
        self.loaders_wallet = loaders_wallet

        if load_params.endpoint_selection_strategy is None:
            raise RuntimeError("endpoint_selection_strategy should not be None")

        self.endpoints = self._get_endpoints(
            load_params.load_type, load_params.endpoint_selection_strategy
        )
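        # Parameters for the follow-up VERIFY run, which checks the data
        # written during the main load (see verify() below).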
        self.verification_params = LoadParams(
            clients=load_params.readers,
            scenario=LoadScenario.VERIFY,
            registry_file=load_params.registry_file,
            verify_time=load_params.verify_time,
            load_type=load_params.load_type,
            load_id=load_params.load_id,
            working_dir=load_params.working_dir,
            endpoint_selection_strategy=load_params.endpoint_selection_strategy,
            k6_process_allocation_strategy=load_params.k6_process_allocation_strategy,
        )
        self.ssh_credentials = SshCredentials(
            LOAD_NODE_SSH_USER,
            LOAD_NODE_SSH_PASSWORD,
            LOAD_NODE_SSH_PRIVATE_KEY_PATH,
            LOAD_NODE_SSH_PRIVATE_KEY_PASSPHRASE,
        )

    @run_optionally(optionals.OPTIONAL_BACKGROUND_LOAD_ENABLED, [])
    def _get_endpoints(
        self, load_type: LoadType, endpoint_selection_strategy: EndpointSelectionStrategy
    ):
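        # Map every (load type, selection strategy) pair to a concrete endpoint
        # list; set() collapses duplicates when nodes share an endpoint.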
        all_endpoints = {
            LoadType.gRPC: {
                EndpointSelectionStrategy.ALL: list(
                    set(
                        endpoint
                        for node_under_load in self.nodes_under_load
                        for endpoint in node_under_load.service(StorageNode).get_all_rpc_endpoint()
                    )
                ),
                EndpointSelectionStrategy.FIRST: list(
                    set(
                        node_under_load.service(StorageNode).get_rpc_endpoint()
                        for node_under_load in self.nodes_under_load
                    )
                ),
            },
            # for some reason xk6 prepends the http scheme on its own, so strip it here
            LoadType.S3: {
                EndpointSelectionStrategy.ALL: list(
                    set(
                        endpoint.replace("http://", "")
                        for node_under_load in self.nodes_under_load
                        for endpoint in node_under_load.service(S3Gate).get_all_endpoints()
                    )
                ),
                EndpointSelectionStrategy.FIRST: list(
                    set(
                        node_under_load.service(S3Gate).get_endpoint().replace("http://", "")
                        for node_under_load in self.nodes_under_load
                    )
                ),
            },
        }

        return all_endpoints[load_type][endpoint_selection_strategy]

    @run_optionally(optionals.OPTIONAL_BACKGROUND_LOAD_ENABLED)
    @reporter.step_deco("Prepare background load instances")
    def prepare(self):
        if self.load_params.load_type == LoadType.S3:
            init_s3_client(
                self.load_nodes,
                self.load_params,
                self.k6_dir,
                self.ssh_credentials,
                self.nodes_under_load,
                self.loaders_wallet,
            )

        self._prepare(self.load_params)

    def _prepare(self, load_params: LoadParams):
        self.k6_instances = prepare_k6_instances(
            load_nodes=LOAD_NODES,
            ssh_credentials=self.ssh_credentials,
            k6_dir=self.k6_dir,
            load_params=load_params,
            endpoints=self.endpoints,
            loaders_wallet=self.loaders_wallet,
        )

    @run_optionally(optionals.OPTIONAL_BACKGROUND_LOAD_ENABLED)
    @reporter.step_deco("Start background load")
    def start(self):
        if self.load_params.preset is None:
            raise RuntimeError("Preset should not be None at the moment of start")

        with reporter.step(
            f"Start background load on nodes {self.nodes_under_load}: "
            f"writers = {self.load_params.writers}, "
            f"obj_size = {self.load_params.object_size}, "
            f"load_time = {self.load_params.load_time}, "
            f"prepare_json = {self.load_params.preset.pregen_json}, "
            f"endpoints = {self.endpoints}"
        ):
            for k6_load_instance in self.k6_instances:
                k6_load_instance.start()

    @run_optionally(optionals.OPTIONAL_BACKGROUND_LOAD_ENABLED)
    @reporter.step_deco("Stop background load")
    def stop(self):
        for k6_load_instance in self.k6_instances:
            k6_load_instance.stop()

    @run_optionally(optionals.OPTIONAL_BACKGROUND_LOAD_ENABLED, True)
    def is_running(self) -> bool:
        for k6_load_instance in self.k6_instances:
            if not k6_load_instance.is_running:
                return False

        return True

    def wait_until_finish(self):
        if self.load_params.load_time is None:
            raise RuntimeError("load_time should not be None")

        for k6_instance in self.k6_instances:
            k6_instance.wait_until_finished(self.load_params.load_time + int(K6_TEARDOWN_PERIOD))

    @run_optionally(optionals.OPTIONAL_BACKGROUND_LOAD_ENABLED)
    def verify(self):
        if self.verification_params.verify_time is None:
            raise RuntimeError("verify_time should not be None")

        self._prepare(self.verification_params)
        with reporter.step("Verify background load data"):
            for k6_verify_instance in self.k6_instances:
                k6_verify_instance.start()
                k6_verify_instance.wait_until_finished(self.verification_params.verify_time)

    @run_optionally(optionals.OPTIONAL_BACKGROUND_LOAD_ENABLED)
    @reporter.step_deco("K6 run results")
    def get_results(self) -> dict:
        results = {}
        for k6_instance in self.k6_instances:
            if k6_instance.load_params.k6_process_allocation_strategy is None:
                raise RuntimeError("k6_process_allocation_strategy should not be None")

            result = k6_instance.get_results()
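            # Key the results to match how k6 processes were allocated:
            # one entry per load node, or one per endpoint.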
            keys_map = {
                K6ProcessAllocationStrategy.PER_LOAD_NODE: k6_instance.load_node,
                K6ProcessAllocationStrategy.PER_ENDPOINT: k6_instance.endpoints[0],
            }
            key = keys_map[k6_instance.load_params.k6_process_allocation_strategy]
            results[key] = result

        return results
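
For orientation, a minimal usage sketch of BackgroundLoadController (not part of the commit; k6_dir, load_params, loaders_wallet and nodes_under_load are assumed to come from the surrounding test fixtures):

    controller = BackgroundLoadController(k6_dir, load_params, loaders_wallet, nodes_under_load)
    controller.prepare()                 # provision k6 instances (and an S3 client for S3 load)
    controller.start()                   # start all k6 instances
    controller.wait_until_finish()       # block for load_time + K6_TEARDOWN_PERIOD
    controller.stop()
    controller.verify()                  # run the VERIFY scenario over the same registry file
    results = controller.get_results()   # dict keyed per load node or per endpoint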

src/frostfs_testlib/storage/controllers/cluster_state_controller.py

@@ -0,0 +1,130 @@
import time

import allure

import frostfs_testlib.resources.optionals as optionals
from frostfs_testlib.reporter import get_reporter
from frostfs_testlib.shell import CommandOptions, Shell
from frostfs_testlib.steps import epoch
from frostfs_testlib.storage.cluster import Cluster, ClusterNode, StorageNode
from frostfs_testlib.storage.controllers.disk_controller import DiskController
from frostfs_testlib.testing.test_control import run_optionally, wait_for_success
from frostfs_testlib.utils.failover_utils import (
    wait_all_storage_nodes_returned,
    wait_for_host_offline,
    wait_for_host_online,
    wait_for_node_online,
)

reporter = get_reporter()


class ClusterStateController:
    def __init__(self, shell: Shell, cluster: Cluster) -> None:
        self.stopped_nodes: list[ClusterNode] = []
        self.detached_disks: dict[str, DiskController] = {}
        self.stopped_storage_nodes: list[StorageNode] = []
        self.cluster = cluster
        self.shell = shell

    @run_optionally(optionals.OPTIONAL_FAILOVER_ENABLED)
    @reporter.step_deco("Stop host of node {node}")
    def stop_node_host(self, node: ClusterNode, mode: str):
        with allure.step(f"Stop host {node.host.config.address}"):
            node.host.stop_host(mode=mode)
            wait_for_host_offline(self.shell, node.storage_node)
            self.stopped_nodes.append(node)

    @run_optionally(optionals.OPTIONAL_FAILOVER_ENABLED)
    @reporter.step_deco("Start host of node {node}")
    def start_node_host(self, node: ClusterNode):
        with allure.step(f"Start host {node.host.config.address}"):
            node.host.start_host()
            wait_for_host_online(self.shell, node.storage_node)
            wait_for_node_online(node.storage_node)
            self.stopped_nodes.remove(node)

    @run_optionally(optionals.OPTIONAL_FAILOVER_ENABLED)
    @reporter.step_deco("Start stopped hosts")
    def start_stopped_hosts(self):
        for node in self.stopped_nodes:
            node.host.start_host()
        self.stopped_nodes = []
        wait_all_storage_nodes_returned(self.shell, self.cluster)

    @run_optionally(optionals.OPTIONAL_FAILOVER_ENABLED)
    @reporter.step_deco("Detach disk {device} at {mountpoint} on node {node}")
    def detach_disk(self, node: StorageNode, device: str, mountpoint: str):
        disk_controller = self._get_disk_controller(node, device, mountpoint)
        self.detached_disks[disk_controller.id] = disk_controller
        disk_controller.detach()

    @run_optionally(optionals.OPTIONAL_FAILOVER_ENABLED)
    @reporter.step_deco("Attach disk {device} at {mountpoint} on node {node}")
    def attach_disk(self, node: StorageNode, device: str, mountpoint: str):
        disk_controller = self._get_disk_controller(node, device, mountpoint)
        disk_controller.attach()
        self.detached_disks.pop(disk_controller.id, None)

    @run_optionally(optionals.OPTIONAL_FAILOVER_ENABLED)
    @reporter.step_deco("Restore detached disks")
    def restore_disks(self):
        for disk_controller in self.detached_disks.values():
            disk_controller.attach()
        self.detached_disks = {}

    @run_optionally(optionals.OPTIONAL_FAILOVER_ENABLED)
    @reporter.step_deco("Stop storage service on {node}")
    def stop_storage_service(self, node: ClusterNode):
        node.storage_node.stop_service()
        self.stopped_storage_nodes.append(node.storage_node)

    @run_optionally(optionals.OPTIONAL_FAILOVER_ENABLED)
    @reporter.step_deco("Start storage service on {node}")
    def start_storage_service(self, node: ClusterNode):
        node.storage_node.start_service()
        self.stopped_storage_nodes.remove(node.storage_node)

    @run_optionally(optionals.OPTIONAL_FAILOVER_ENABLED)
    @reporter.step_deco("Start stopped storage services")
    def start_stopped_storage_services(self):
        for node in self.stopped_storage_nodes:
            node.start_service()
        self.stopped_storage_nodes = []

    @run_optionally(optionals.OPTIONAL_FAILOVER_ENABLED)
    @reporter.step_deco("Hard reboot host {node} via magic SysRq option")
    def panic_reboot_host(self, node: ClusterNode):
        shell = node.host.get_shell()
        shell.exec('sudo sh -c "echo 1 > /proc/sys/kernel/sysrq"')
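        # The "b" trigger reboots immediately without syncing disks, so the SSH
        # connection dies mid-command: use a short timeout and skip the exit-code check.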
        options = CommandOptions(close_stdin=True, timeout=1, check=False)
        shell.exec('sudo sh -c "echo b > /proc/sysrq-trigger"', options)

        # Let things settle: a short wait prevents ssh from getting stuck during the panic
        time.sleep(10)

        wait_for_host_online(self.shell, node.storage_node)
        wait_for_node_online(node.storage_node)
@reporter.step_deco("Wait up to {timeout} seconds for nodes on cluster to align epochs")
def wait_for_epochs_align(self, timeout=60):
@wait_for_success(timeout, 5, None, True)
def check_epochs():
epochs_by_node = epoch.get_epochs_from_nodes(self.shell, self.cluster)
assert (
len(set(epochs_by_node.values())) == 1
), f"unaligned epochs found: {epochs_by_node}"
check_epochs()

    def _get_disk_controller(
        self, node: StorageNode, device: str, mountpoint: str
    ) -> DiskController:
        disk_controller_id = DiskController.get_id(node, device)
        if disk_controller_id in self.detached_disks:
            disk_controller = self.detached_disks[disk_controller_id]
        else:
            disk_controller = DiskController(node, device, mountpoint)

        return disk_controller
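
Likewise, a minimal usage sketch of ClusterStateController (not part of the commit; shell, cluster and node are assumed test fixtures, and "hard" is an illustrative guess at a host-specific mode value):

    controller = ClusterStateController(shell, cluster)
    controller.stop_node_host(node, mode="hard")  # power off one host, wait until it is offline
    controller.start_node_host(node)              # bring it back, wait for the node to return
    controller.wait_for_epochs_align()            # ensure all nodes converge on the same epoch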