Move shared code to testlib

Signed-off-by: Andrey Berezin <a.berezin@yadro.com>
Andrey Berezin 2023-05-14 13:43:59 +03:00
parent d97a02d1d3
commit 997e768e92
69 changed files with 9213 additions and 64 deletions

src/frostfs_testlib/storage/controllers/background_load_controller.py

@@ -0,0 +1,207 @@
import frostfs_testlib.resources.optionals as optionals
from frostfs_testlib.load.k6 import K6
from frostfs_testlib.load.load_config import (
    EndpointSelectionStrategy,
    K6ProcessAllocationStrategy,
    LoadParams,
    LoadScenario,
    LoadType,
)
from frostfs_testlib.load.load_steps import init_s3_client, prepare_k6_instances
from frostfs_testlib.reporter import get_reporter
from frostfs_testlib.resources.load_params import (
    K6_TEARDOWN_PERIOD,
    LOAD_NODE_SSH_PASSWORD,
    LOAD_NODE_SSH_PRIVATE_KEY_PASSPHRASE,
    LOAD_NODE_SSH_PRIVATE_KEY_PATH,
    LOAD_NODE_SSH_USER,
    LOAD_NODES,
)
from frostfs_testlib.shell.interfaces import SshCredentials
from frostfs_testlib.storage.cluster import ClusterNode
from frostfs_testlib.storage.dataclasses.frostfs_services import S3Gate, StorageNode
from frostfs_testlib.storage.dataclasses.wallet import WalletInfo
from frostfs_testlib.testing.test_control import run_optionally

reporter = get_reporter()


class BackgroundLoadController:
    k6_instances: list[K6]
    k6_dir: str
    load_params: LoadParams
    load_nodes: list[str]
    verification_params: LoadParams
    nodes_under_load: list[ClusterNode]
    ssh_credentials: SshCredentials
    loaders_wallet: WalletInfo
    endpoints: list[str]

    def __init__(
        self,
        k6_dir: str,
        load_params: LoadParams,
        loaders_wallet: WalletInfo,
        nodes_under_load: list[ClusterNode],
    ) -> None:
        self.k6_dir = k6_dir
        self.load_params = load_params
        self.nodes_under_load = nodes_under_load
        self.load_nodes = LOAD_NODES
        self.loaders_wallet = loaders_wallet

        if load_params.endpoint_selection_strategy is None:
            raise RuntimeError("endpoint_selection_strategy should not be None")

        self.endpoints = self._get_endpoints(
            load_params.load_type, load_params.endpoint_selection_strategy
        )
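        # Parameters for the follow-up VERIFY run, which checks the data
        # written during the main load (see verify() below).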
        self.verification_params = LoadParams(
            clients=load_params.readers,
            scenario=LoadScenario.VERIFY,
            registry_file=load_params.registry_file,
            verify_time=load_params.verify_time,
            load_type=load_params.load_type,
            load_id=load_params.load_id,
            working_dir=load_params.working_dir,
            endpoint_selection_strategy=load_params.endpoint_selection_strategy,
            k6_process_allocation_strategy=load_params.k6_process_allocation_strategy,
        )
        self.ssh_credentials = SshCredentials(
            LOAD_NODE_SSH_USER,
            LOAD_NODE_SSH_PASSWORD,
            LOAD_NODE_SSH_PRIVATE_KEY_PATH,
            LOAD_NODE_SSH_PRIVATE_KEY_PASSPHRASE,
        )

    @run_optionally(optionals.OPTIONAL_BACKGROUND_LOAD_ENABLED, [])
    def _get_endpoints(
        self, load_type: LoadType, endpoint_selection_strategy: EndpointSelectionStrategy
    ):
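        # Map every (load type, selection strategy) pair to a concrete endpoint
        # list; set() collapses duplicates when nodes share an endpoint.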
        all_endpoints = {
            LoadType.gRPC: {
                EndpointSelectionStrategy.ALL: list(
                    set(
                        endpoint
                        for node_under_load in self.nodes_under_load
                        for endpoint in node_under_load.service(StorageNode).get_all_rpc_endpoint()
                    )
                ),
                EndpointSelectionStrategy.FIRST: list(
                    set(
                        node_under_load.service(StorageNode).get_rpc_endpoint()
                        for node_under_load in self.nodes_under_load
                    )
                ),
            },
            # for some reason xk6 prepends the http scheme on its own, so strip it here
            LoadType.S3: {
                EndpointSelectionStrategy.ALL: list(
                    set(
                        endpoint.replace("http://", "")
                        for node_under_load in self.nodes_under_load
                        for endpoint in node_under_load.service(S3Gate).get_all_endpoints()
                    )
                ),
                EndpointSelectionStrategy.FIRST: list(
                    set(
                        node_under_load.service(S3Gate).get_endpoint().replace("http://", "")
                        for node_under_load in self.nodes_under_load
                    )
                ),
            },
        }

        return all_endpoints[load_type][endpoint_selection_strategy]

    @run_optionally(optionals.OPTIONAL_BACKGROUND_LOAD_ENABLED)
    @reporter.step_deco("Prepare background load instances")
    def prepare(self):
        if self.load_params.load_type == LoadType.S3:
            init_s3_client(
                self.load_nodes,
                self.load_params,
                self.k6_dir,
                self.ssh_credentials,
                self.nodes_under_load,
                self.loaders_wallet,
            )

        self._prepare(self.load_params)

    def _prepare(self, load_params: LoadParams):
        self.k6_instances = prepare_k6_instances(
            load_nodes=LOAD_NODES,
            ssh_credentials=self.ssh_credentials,
            k6_dir=self.k6_dir,
            load_params=load_params,
            endpoints=self.endpoints,
            loaders_wallet=self.loaders_wallet,
        )

    @run_optionally(optionals.OPTIONAL_BACKGROUND_LOAD_ENABLED)
    @reporter.step_deco("Start background load")
    def start(self):
        if self.load_params.preset is None:
            raise RuntimeError("Preset should not be None at the moment of start")

        with reporter.step(
            f"Start background load on nodes {self.nodes_under_load}: "
            f"writers = {self.load_params.writers}, "
            f"obj_size = {self.load_params.object_size}, "
            f"load_time = {self.load_params.load_time}, "
            f"prepare_json = {self.load_params.preset.pregen_json}, "
            f"endpoints = {self.endpoints}"
        ):
            for k6_load_instance in self.k6_instances:
                k6_load_instance.start()

    @run_optionally(optionals.OPTIONAL_BACKGROUND_LOAD_ENABLED)
    @reporter.step_deco("Stop background load")
    def stop(self):
        for k6_load_instance in self.k6_instances:
            k6_load_instance.stop()

    @run_optionally(optionals.OPTIONAL_BACKGROUND_LOAD_ENABLED, True)
    def is_running(self) -> bool:
        for k6_load_instance in self.k6_instances:
            if not k6_load_instance.is_running:
                return False

        return True

    def wait_until_finish(self):
        if self.load_params.load_time is None:
            raise RuntimeError("load_time should not be None")

        for k6_instance in self.k6_instances:
            k6_instance.wait_until_finished(self.load_params.load_time + int(K6_TEARDOWN_PERIOD))

    @run_optionally(optionals.OPTIONAL_BACKGROUND_LOAD_ENABLED)
    def verify(self):
        if self.verification_params.verify_time is None:
            raise RuntimeError("verify_time should not be None")

        self._prepare(self.verification_params)
        with reporter.step("Verify background load data"):
            for k6_verify_instance in self.k6_instances:
                k6_verify_instance.start()
                k6_verify_instance.wait_until_finished(self.verification_params.verify_time)

    @run_optionally(optionals.OPTIONAL_BACKGROUND_LOAD_ENABLED)
    @reporter.step_deco("K6 run results")
    def get_results(self) -> dict:
        results = {}
        for k6_instance in self.k6_instances:
            if k6_instance.load_params.k6_process_allocation_strategy is None:
                raise RuntimeError("k6_process_allocation_strategy should not be None")

            result = k6_instance.get_results()
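            # Key the results to match how k6 processes were allocated:
            # one entry per load node, or one per endpoint.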
            keys_map = {
                K6ProcessAllocationStrategy.PER_LOAD_NODE: k6_instance.load_node,
                K6ProcessAllocationStrategy.PER_ENDPOINT: k6_instance.endpoints[0],
            }
            key = keys_map[k6_instance.load_params.k6_process_allocation_strategy]
            results[key] = result

        return results
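
For orientation, a minimal usage sketch of BackgroundLoadController (not part of the commit; k6_dir, load_params, loaders_wallet and nodes_under_load are assumed to come from the surrounding test fixtures):

    controller = BackgroundLoadController(k6_dir, load_params, loaders_wallet, nodes_under_load)
    controller.prepare()                 # provision k6 instances (and an S3 client for S3 load)
    controller.start()                   # start all k6 instances
    controller.wait_until_finish()       # block for load_time + K6_TEARDOWN_PERIOD
    controller.stop()
    controller.verify()                  # run the VERIFY scenario over the same registry file
    results = controller.get_results()   # dict keyed per load node or per endpoint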

src/frostfs_testlib/storage/controllers/cluster_state_controller.py

@@ -0,0 +1,130 @@
import time

import allure

import frostfs_testlib.resources.optionals as optionals
from frostfs_testlib.reporter import get_reporter
from frostfs_testlib.shell import CommandOptions, Shell
from frostfs_testlib.steps import epoch
from frostfs_testlib.storage.cluster import Cluster, ClusterNode, StorageNode
from frostfs_testlib.storage.controllers.disk_controller import DiskController
from frostfs_testlib.testing.test_control import run_optionally, wait_for_success
from frostfs_testlib.utils.failover_utils import (
    wait_all_storage_nodes_returned,
    wait_for_host_offline,
    wait_for_host_online,
    wait_for_node_online,
)

reporter = get_reporter()


class ClusterStateController:
    def __init__(self, shell: Shell, cluster: Cluster) -> None:
        self.stopped_nodes: list[ClusterNode] = []
        self.detached_disks: dict[str, DiskController] = {}
        self.stopped_storage_nodes: list[StorageNode] = []
        self.cluster = cluster
        self.shell = shell

    @run_optionally(optionals.OPTIONAL_FAILOVER_ENABLED)
    @reporter.step_deco("Stop host of node {node}")
    def stop_node_host(self, node: ClusterNode, mode: str):
        with allure.step(f"Stop host {node.host.config.address}"):
            node.host.stop_host(mode=mode)
            wait_for_host_offline(self.shell, node.storage_node)
            self.stopped_nodes.append(node)

    @run_optionally(optionals.OPTIONAL_FAILOVER_ENABLED)
    @reporter.step_deco("Start host of node {node}")
    def start_node_host(self, node: ClusterNode):
        with allure.step(f"Start host {node.host.config.address}"):
            node.host.start_host()
            wait_for_host_online(self.shell, node.storage_node)
            wait_for_node_online(node.storage_node)
            self.stopped_nodes.remove(node)

    @run_optionally(optionals.OPTIONAL_FAILOVER_ENABLED)
    @reporter.step_deco("Start stopped hosts")
    def start_stopped_hosts(self):
        for node in self.stopped_nodes:
            node.host.start_host()
        self.stopped_nodes = []
        wait_all_storage_nodes_returned(self.shell, self.cluster)

    @run_optionally(optionals.OPTIONAL_FAILOVER_ENABLED)
    @reporter.step_deco("Detach disk {device} at {mountpoint} on node {node}")
    def detach_disk(self, node: StorageNode, device: str, mountpoint: str):
        disk_controller = self._get_disk_controller(node, device, mountpoint)
        self.detached_disks[disk_controller.id] = disk_controller
        disk_controller.detach()

    @run_optionally(optionals.OPTIONAL_FAILOVER_ENABLED)
    @reporter.step_deco("Attach disk {device} at {mountpoint} on node {node}")
    def attach_disk(self, node: StorageNode, device: str, mountpoint: str):
        disk_controller = self._get_disk_controller(node, device, mountpoint)
        disk_controller.attach()
        self.detached_disks.pop(disk_controller.id, None)

    @run_optionally(optionals.OPTIONAL_FAILOVER_ENABLED)
    @reporter.step_deco("Restore detached disks")
    def restore_disks(self):
        for disk_controller in self.detached_disks.values():
            disk_controller.attach()
        self.detached_disks = {}

    @run_optionally(optionals.OPTIONAL_FAILOVER_ENABLED)
    @reporter.step_deco("Stop storage service on {node}")
    def stop_storage_service(self, node: ClusterNode):
        node.storage_node.stop_service()
        self.stopped_storage_nodes.append(node.storage_node)

    @run_optionally(optionals.OPTIONAL_FAILOVER_ENABLED)
    @reporter.step_deco("Start storage service on {node}")
    def start_storage_service(self, node: ClusterNode):
        node.storage_node.start_service()
        self.stopped_storage_nodes.remove(node.storage_node)

    @run_optionally(optionals.OPTIONAL_FAILOVER_ENABLED)
    @reporter.step_deco("Start stopped storage services")
    def start_stopped_storage_services(self):
        for node in self.stopped_storage_nodes:
            node.start_service()
        self.stopped_storage_nodes = []

    @run_optionally(optionals.OPTIONAL_FAILOVER_ENABLED)
    @reporter.step_deco("Hard reboot host {node} via magic SysRq option")
    def panic_reboot_host(self, node: ClusterNode):
        shell = node.host.get_shell()
        shell.exec('sudo sh -c "echo 1 > /proc/sys/kernel/sysrq"')
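        # The "b" trigger reboots immediately without syncing disks, so the SSH
        # connection dies mid-command: use a short timeout and skip the exit-code check.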
        options = CommandOptions(close_stdin=True, timeout=1, check=False)
        shell.exec('sudo sh -c "echo b > /proc/sysrq-trigger"', options)

        # Let things settle: a short wait prevents ssh from getting stuck during the panic
        time.sleep(10)

        wait_for_host_online(self.shell, node.storage_node)
        wait_for_node_online(node.storage_node)
@reporter.step_deco("Wait up to {timeout} seconds for nodes on cluster to align epochs")
def wait_for_epochs_align(self, timeout=60):
@wait_for_success(timeout, 5, None, True)
def check_epochs():
epochs_by_node = epoch.get_epochs_from_nodes(self.shell, self.cluster)
assert (
len(set(epochs_by_node.values())) == 1
), f"unaligned epochs found: {epochs_by_node}"
check_epochs()

    def _get_disk_controller(
        self, node: StorageNode, device: str, mountpoint: str
    ) -> DiskController:
        disk_controller_id = DiskController.get_id(node, device)
        if disk_controller_id in self.detached_disks:
            disk_controller = self.detached_disks[disk_controller_id]
        else:
            disk_controller = DiskController(node, device, mountpoint)

        return disk_controller
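
Likewise, a minimal usage sketch of ClusterStateController (not part of the commit; shell, cluster and node are assumed test fixtures, and "hard" is an illustrative guess at a host-specific mode value):

    controller = ClusterStateController(shell, cluster)
    controller.stop_node_host(node, mode="hard")  # power off one host, wait until it is offline
    controller.start_node_host(node)              # bring it back, wait for the node to return
    controller.wait_for_epochs_align()            # ensure all nodes converge on the same epoch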