Add test for adding node to cluster

Signed-off-by: a.y.volkov <a.y.volkov@yadro.com>
This commit is contained in:
a.y.volkov 2022-08-03 18:20:50 +03:00 committed by Vladimir Domnich
parent b468a06f4e
commit f97bfed183
9 changed files with 218 additions and 30 deletions

View file

@ -21,3 +21,4 @@ markers =
failover: tests for system recovery after a failure failover: tests for system recovery after a failure
failover_panic: tests for system recovery after panic reboot of a node failover_panic: tests for system recovery after panic reboot of a node
failover_net: tests for network failure failover_net: tests for network failure
add_nodes: tests for adding nodes to cluster

View file

@ -11,7 +11,7 @@ import wallet
from cli_helpers import _cmd_run from cli_helpers import _cmd_run
from common import ASSETS_DIR, FREE_STORAGE, MAINNET_WALLET_PATH, NEOFS_NETMAP_DICT from common import ASSETS_DIR, FREE_STORAGE, MAINNET_WALLET_PATH, NEOFS_NETMAP_DICT
from payment_neogo import neofs_deposit, transfer_mainnet_gas from payment_neogo import neofs_deposit, transfer_mainnet_gas
from python_keywords.node_management import node_healthcheck from python_keywords.node_management import node_healthcheck, create_ssh_client
from sbercloud_helper import SberCloudConfig from sbercloud_helper import SberCloudConfig
@ -26,24 +26,37 @@ logger = logging.getLogger('NeoLogger')
@pytest.fixture(scope='session') @pytest.fixture(scope='session')
def cloud_infrastructure_check(): def cloud_infrastructure_check():
cloud_config = SberCloudConfig.from_env() if not is_cloud_infrastructure():
if not cloud_config.project_id:
pytest.skip('Test only works on SberCloud infrastructure') pytest.skip('Test only works on SberCloud infrastructure')
yield yield
def is_cloud_infrastructure():
    """Tell whether the SberCloud configuration read from the environment has a project id.

    Returns:
        bool: True when ``SberCloudConfig.from_env()`` yields a non-empty ``project_id``.
    """
    cloud_config = SberCloudConfig.from_env()
    # Truthiness (rather than ``is not None``) keeps an empty project id from being
    # treated as a configured cloud project.
    return bool(cloud_config.project_id)
@pytest.fixture(scope='session', autouse=True) @pytest.fixture(scope='session', autouse=True)
@allure.title('Check binary versions') @allure.title('Check binary versions')
def check_binary_versions(request): def check_binary_versions(request):
environment_dir = request.config.getoption('--alluredir') environment_dir = request.config.getoption('--alluredir')
is_cloud = is_cloud_infrastructure()
# Collect versions of neo binaries # Collect versions of neo binaries
binaries = ['neo-go', 'neofs-cli', 'neofs-authmate'] binaries = ['neo-go', 'neofs-cli', 'neofs-authmate']
env_out = {} env_out = _get_binaries_version_local(binaries)
for binary in binaries:
out = _cmd_run(f'{binary} --version') if is_cloud:
version = re.search(r'version[:\s]*(.+)', out, re.IGNORECASE) binaries = ['neo-go',
env_out[binary.upper()] = version.group(1) if version else 'Unknown' 'neofs-adm',
'neofs-cli',
'neofs-http-gw',
'neofs-ir',
'neofs-lens',
'neofs-node',
'neofs-s3-authmate',
'neofs-s3-gw',
'neogo-morph-cn']
env_out = _get_binaries_version_remote(binaries)
# Get version of aws binary # Get version of aws binary
out = _cmd_run('aws --version') out = _cmd_run('aws --version')
@ -56,6 +69,32 @@ def check_binary_versions(request):
out_file.write(f'{env}={env_value}\n') out_file.write(f'{env}={env_value}\n')
def _get_binaries_version_local(binaries: list) -> dict:
env_out = {}
for binary in binaries:
out = _cmd_run(f'{binary} --version')
version = re.search(r'version[:\s]*(.+)', out, re.IGNORECASE)
env_out[binary.upper()] = version.group(1) if version else 'Unknown'
return env_out
def _get_binaries_version_remote(binaries: list) -> dict:
    """Collect versions of binaries from every node listed in ``NEOFS_NETMAP_DICT``.

    Runs ``<binary> --version`` over SSH on each node and checks that every node
    reports the same version as the node that reported it first.

    Args:
        binaries: names of the binaries to query on each node.

    Returns:
        dict: upper-cased binary name mapped to the text following ``version`` in the
        command output, or ``'Unknown'`` when the output contains no such text.

    Raises:
        AssertionError: if two nodes report different versions of the same binary.
    """
    version_pattern = re.compile(r'version[:\s]*(.+)', re.IGNORECASE)
    env_out = {}
    # Remember which node supplied each recorded version so that a mismatch
    # names the actual pair of nodes instead of a hard-coded one.
    reported_by = {}

    for node_name in NEOFS_NETMAP_DICT:
        with create_ssh_client(node_name) as ssh_client:
            for binary in binaries:
                out = ssh_client.exec(f'{binary} --version').stdout
                match = version_pattern.search(out)
                version = match.group(1) if match else 'Unknown'

                key = binary.upper()
                if key not in env_out:
                    env_out[key] = version
                    reported_by[key] = node_name
                    continue

                msg = (f'Expected binary {binary} versions on node {reported_by[key]} '
                       f'and {node_name} are the same')
                assert env_out[key] == version, msg
    return env_out
@pytest.fixture(scope='session', autouse=True) @pytest.fixture(scope='session', autouse=True)
@allure.title('Run health check for all storage nodes') @allure.title('Run health check for all storage nodes')
def run_health_check(): def run_health_check():

View file

@ -5,15 +5,15 @@ from time import sleep
import allure import allure
import pytest import pytest
from common import STORAGE_NODE_SSH_PRIVATE_KEY_PATH, STORAGE_NODE_SSH_USER, STORAGE_NODE_SSH_PASSWORD, \ from common import (STORAGE_NODE_SSH_PRIVATE_KEY_PATH, STORAGE_NODE_SSH_USER,
NEOFS_NETMAP_DICT STORAGE_NODE_SSH_PASSWORD, NEOFS_NETMAP_DICT)
from failover_utils import wait_all_storage_node_returned, wait_object_replication_on_nodes
from iptables_helper import IpTablesHelper from iptables_helper import IpTablesHelper
from python_keywords.container import create_container from python_keywords.container import create_container
from python_keywords.neofs_verbs import get_object, put_object from python_keywords.neofs_verbs import get_object, put_object
from python_keywords.utility_keywords import generate_file, get_file_hash from python_keywords.utility_keywords import generate_file, get_file_hash
from ssh_helper import HostClient from ssh_helper import HostClient
from wellknown_acl import PUBLIC_ACL from wellknown_acl import PUBLIC_ACL
from .failover_utils import wait_all_storage_node_returned, wait_object_replication_on_nodes
logger = logging.getLogger('NeoLogger') logger = logging.getLogger('NeoLogger')
STORAGE_NODE_COMMUNICATION_PORT = '8080' STORAGE_NODE_COMMUNICATION_PORT = '8080'

View file

@ -5,13 +5,14 @@ import pytest
from common import (STORAGE_NODE_SSH_PRIVATE_KEY_PATH, STORAGE_NODE_SSH_USER, from common import (STORAGE_NODE_SSH_PRIVATE_KEY_PATH, STORAGE_NODE_SSH_USER,
STORAGE_NODE_SSH_PASSWORD) STORAGE_NODE_SSH_PASSWORD)
from failover_utils import wait_all_storage_node_returned, wait_object_replication_on_nodes
from python_keywords.container import create_container from python_keywords.container import create_container
from python_keywords.neofs_verbs import get_object, put_object from python_keywords.neofs_verbs import get_object, put_object
from python_keywords.utility_keywords import generate_file, get_file_hash from python_keywords.utility_keywords import generate_file, get_file_hash
from sbercloud_helper import SberCloud, SberCloudConfig from sbercloud_helper import SberCloud, SberCloudConfig
from ssh_helper import HostClient, HostIsNotAvailable from ssh_helper import HostClient
from wellknown_acl import PUBLIC_ACL from wellknown_acl import PUBLIC_ACL
from .failover_utils import wait_all_storage_node_returned, wait_object_replication_on_nodes
logger = logging.getLogger('NeoLogger') logger = logging.getLogger('NeoLogger')
stopped_hosts = [] stopped_hosts = []

View file

@ -6,24 +6,22 @@ import allure
import pytest import pytest
from data_formatters import get_wallet_public_key from data_formatters import get_wallet_public_key
from common import (COMPLEX_OBJ_SIZE, MAINNET_BLOCK_TIME, NEOFS_CONTRACT_CACHE_TIMEOUT, from common import (COMPLEX_OBJ_SIZE, MAINNET_BLOCK_TIME, NEOFS_CONTRACT_CACHE_TIMEOUT,
NEOFS_NETMAP_DICT, STORAGE_WALLET_PASS) NEOFS_NETMAP_DICT, STORAGE_RPC_ENDPOINT_1, STORAGE_WALLET_PASS)
from epoch import tick_epoch from epoch import tick_epoch
from python_keywords.container import create_container, get_container from python_keywords.container import create_container, get_container
from python_keywords.neofs_verbs import (delete_object, get_object, from python_keywords.failover_utils import wait_object_replication_on_nodes
head_object, put_object) from python_keywords.neofs_verbs import delete_object, get_object, head_object, put_object
from python_keywords.node_management import (drop_object, get_netmap_snapshot, from python_keywords.node_management import (create_ssh_client, drop_object, get_netmap_snapshot,
get_locode, get_locode, node_healthcheck, node_set_status,
node_healthcheck, node_shard_list, node_shard_set_mode,
node_set_status, node_shard_list, start_nodes_remote, stop_nodes_remote)
node_shard_set_mode,
start_nodes_remote,
stop_nodes_remote)
from storage_policy import get_nodes_with_object, get_simple_object_copies from storage_policy import get_nodes_with_object, get_simple_object_copies
from utility import placement_policy_from_container, robot_time_to_int, wait_for_gc_pass_on_storage_nodes from utility import placement_policy_from_container, robot_time_to_int, wait_for_gc_pass_on_storage_nodes
from utility_keywords import generate_file from utility_keywords import generate_file
from wellknown_acl import PUBLIC_ACL from wellknown_acl import PUBLIC_ACL
logger = logging.getLogger('NeoLogger') logger = logging.getLogger('NeoLogger')
check_nodes = []
@pytest.fixture @pytest.fixture
@ -72,6 +70,137 @@ def after_run_set_all_nodes_online():
logger.error(f"Node status change fails with error:\n{err}") logger.error(f"Node status change fails with error:\n{err}")
def wait_for_service_started(ssh_client, service_name: str, attempts: int = 10, interval: int = 3):
    """Poll ``systemctl status`` until the service reports ``active (running)``.

    Args:
        ssh_client: client whose ``exec(command)`` result exposes a ``stdout`` string.
        service_name: name of the systemd unit to check.
        attempts: maximum number of status polls.
        interval: seconds to wait between two consecutive polls.

    Raises:
        AssertionError: if none of the polls shows the expected state.
    """
    expected_state = 'active (running)'
    for attempt in range(attempts):
        if attempt:
            # Pause only between polls, so a failure is reported right after the last check.
            sleep(interval)
        output = ssh_client.exec(f'systemctl status {service_name}')
        if expected_state in output.stdout:
            return
    raise AssertionError(f'Service {service_name} is not in {expected_state} state')
@pytest.fixture
def return_nodes_after_test_run():
    """Fixture whose teardown calls ``return_nodes()`` once the test has finished."""
    yield
    return_nodes()
def cleanup_node(node_to_cleanup, alive_node):
    """Exclude a node from the network map, stop its storage service and wipe its data.

    Args:
        node_to_cleanup: name of the node to exclude and clean up.
        alive_node: name of the node used to read the network map snapshot.
    """
    exclude_node_from_network_map(node_to_cleanup, alive_node)

    cleanup_commands = (
        'systemctl stop neofs-storage.service',
        'rm -rf /srv/neofs/*',
    )
    with create_ssh_client(node_to_cleanup) as ssh_client:
        for command in cleanup_commands:
            ssh_client.exec(command)
        # Wait for the interval derived from MAINNET_BLOCK_TIME before returning.
        sleep(robot_time_to_int(MAINNET_BLOCK_TIME))
@allure.step('Return node to cluster')
def return_nodes(alive_node: str = None):
    """Start every node registered in ``check_nodes`` and bring it back online.

    For each node: start the storage service, wait until it is running, set the
    node status to online, remove it from ``check_nodes``, tick the epoch (up to
    three attempts) and check that the node is present in the network map.

    Args:
        alive_node: name of the node used to read the network map snapshot;
            passed through to ``check_node_in_map``.
    """
    service = 'neofs-storage.service'

    # Iterate over a snapshot because check_nodes is modified inside the loop.
    for node in tuple(check_nodes):
        with create_ssh_client(node) as ssh_client:
            ssh_client.exec(f'systemctl start {service}')
            wait_for_service_started(ssh_client, service)
            sleep(robot_time_to_int(MAINNET_BLOCK_TIME))

        with allure.step(f'Move node {node} to online state'):
            node_set_status(node, status='online', retry=True)

        check_nodes.remove(node)
        sleep(robot_time_to_int(MAINNET_BLOCK_TIME))

        # A failed epoch tick is retried; if all attempts fail we still proceed to the check.
        attempts_left = 3
        while attempts_left:
            attempts_left -= 1
            try:
                tick_epoch()
            except RuntimeError:
                sleep(3)
            else:
                break

        check_node_in_map(node, alive_node)
def exclude_node_from_network_map(node_to_exclude, alive_node):
    """Set a node offline, tick the epoch and assert its key left the network map.

    Args:
        node_to_exclude: name of the node to move to the offline state.
        alive_node: name of the node used to read the network map snapshot.

    Raises:
        AssertionError: if the node's key is still found in the snapshot.
    """
    wallet_path = NEOFS_NETMAP_DICT[node_to_exclude]['wallet_path']
    node_netmap_key = get_wallet_public_key(wallet_path, STORAGE_WALLET_PASS, format="base58")

    with allure.step(f'Move node {node_to_exclude} to offline state'):
        node_set_status(node_to_exclude, status='offline')

    sleep(robot_time_to_int(MAINNET_BLOCK_TIME))
    tick_epoch()

    netmap = get_netmap_snapshot(node_name=alive_node)
    assert node_netmap_key not in netmap, f'Expected node with key {node_netmap_key} not in network map'
def include_node_to_network_map(node_to_include, alive_node):
    """Set a node online, tick the epoch and check that it is in the network map.

    Args:
        node_to_include: name of the node to move to the online state.
        alive_node: name of the node used to read the network map snapshot.
    """
    step_title = f'Move node {node_to_include} to online state'
    with allure.step(step_title):
        node_set_status(node_to_include, status='online')

    pause = robot_time_to_int(MAINNET_BLOCK_TIME)
    sleep(pause)
    tick_epoch()

    check_node_in_map(node_to_include, alive_node)
@allure.step('Check node {node_name} in network map')
def check_node_in_map(node_name: str, alive_node: str = None):
    """Assert that the node's public key is present in the network map snapshot.

    Args:
        node_name: name of the node whose key is looked up.
        alive_node: name of the node to take the snapshot from; when not given,
            ``node_name`` itself is used.

    Raises:
        AssertionError: if the key is not found in the snapshot.
    """
    if not alive_node:
        alive_node = node_name

    wallet_path = NEOFS_NETMAP_DICT[node_name]['wallet_path']
    node_netmap_key = get_wallet_public_key(wallet_path, STORAGE_WALLET_PASS, format="base58")
    logger.info(f'Node {node_name} netmap key: {node_netmap_key}')

    netmap = get_netmap_snapshot(node_name=alive_node)
    assert node_netmap_key in netmap, f'Expected node with key {node_netmap_key} in network map'
@allure.title('Add one node to cluster')
@pytest.mark.add_nodes
@pytest.mark.node_mgmt
def test_add_nodes(prepare_tmp_dir, prepare_wallet_and_deposit, return_nodes_after_test_run):
    """Exclude and wipe one node, return it to the cluster and check it is usable again.

    Scenario:
        1. Pick a random node whose RPC endpoint differs from STORAGE_RPC_ENDPOINT_1,
           exclude it from the network map and clean it up.
        2. Put an object into a ``REP 3`` container and wait for 3 copies.
        3. Return the node, then exclude another random node and check the object
           still reaches 3 copies without it; include that node back.
        4. Create a ``REP 4`` container, put an object and wait for 4 copies.
    """
    wallet = prepare_wallet_and_deposit
    placement_rule_3 = 'REP 3 IN X CBF 1 SELECT 3 FROM * AS X'
    placement_rule_4 = 'REP 4 IN X CBF 1 SELECT 4 FROM * AS X'
    source_file_path = generate_file()

    additional_node = choice([
        node for node, node_config in NEOFS_NETMAP_DICT.items()
        if node_config.get('rpc') != STORAGE_RPC_ENDPOINT_1
    ])
    alive_node = choice([node for node in NEOFS_NETMAP_DICT if node != additional_node])

    check_node_in_map(additional_node, alive_node)

    with allure.step(f'Exclude node {additional_node} from map and clean it up'):
        # Register the node for recovery before touching it: if the cleanup fails
        # half-way, the teardown fixture still starts it and sets it online.
        check_nodes.append(additional_node)
        cleanup_node(additional_node, alive_node)

    cid = create_container(wallet, rule=placement_rule_3, basic_acl=PUBLIC_ACL)
    oid = put_object(wallet, source_file_path, cid, endpoint=NEOFS_NETMAP_DICT[alive_node].get('rpc'))
    wait_object_replication_on_nodes(wallet, cid, oid, 3)

    return_nodes(alive_node)

    with allure.step('Check data could be replicated to new node'):
        random_node = choice([node for node in NEOFS_NETMAP_DICT if node not in (additional_node, alive_node)])
        exclude_node_from_network_map(random_node, alive_node)

        wait_object_replication_on_nodes(wallet, cid, oid, 3, excluded_nodes=[random_node])
        include_node_to_network_map(random_node, alive_node)
        wait_object_replication_on_nodes(wallet, cid, oid, 3)

    with allure.step('Check container could be created with new node'):
        cid = create_container(wallet, rule=placement_rule_4, basic_acl=PUBLIC_ACL)
        oid = put_object(wallet, source_file_path, cid, endpoint=NEOFS_NETMAP_DICT[alive_node].get('rpc'))
        wait_object_replication_on_nodes(wallet, cid, oid, 4)
@allure.title('Control Operations with storage nodes') @allure.title('Control Operations with storage nodes')
@pytest.mark.node_mgmt @pytest.mark.node_mgmt
def test_nodes_management(prepare_tmp_dir): def test_nodes_management(prepare_tmp_dir):

View file

@ -16,6 +16,8 @@ import pexpect
from robot.api import logger from robot.api import logger
ROBOT_AUTO_KEYWORDS = False ROBOT_AUTO_KEYWORDS = False
COLOR_GREEN = "\033[92m"
COLOR_OFF = "\033[0m"
def _cmd_run(cmd: str, timeout: int = 30) -> str: def _cmd_run(cmd: str, timeout: int = 30) -> str:
@ -26,7 +28,7 @@ def _cmd_run(cmd: str, timeout: int = 30) -> str:
compl_proc = None compl_proc = None
start_time = datetime.now() start_time = datetime.now()
try: try:
logger.info(f"Executing command: {cmd}") logger.info(f"{COLOR_GREEN}Executing command: {cmd}{COLOR_OFF}")
start_time = datetime.utcnow() start_time = datetime.utcnow()
compl_proc = subprocess.run(cmd, check=True, universal_newlines=True, compl_proc = subprocess.run(cmd, check=True, universal_newlines=True,
stdout=subprocess.PIPE, stderr=subprocess.STDOUT, stdout=subprocess.PIPE, stderr=subprocess.STDOUT,
@ -35,7 +37,7 @@ def _cmd_run(cmd: str, timeout: int = 30) -> str:
output = compl_proc.stdout output = compl_proc.stdout
return_code = compl_proc.returncode return_code = compl_proc.returncode
end_time = datetime.utcnow() end_time = datetime.utcnow()
logger.info(f"Output: {output}") logger.info(f"{COLOR_GREEN}Output: {output}{COLOR_OFF}")
_attach_allure_log(cmd, output, return_code, start_time, end_time) _attach_allure_log(cmd, output, return_code, start_time, end_time)
return output return output

View file

@ -1,5 +1,7 @@
#!/usr/bin/python3.9 #!/usr/bin/python3.9
import sys
import allure
from robot.api import logger from robot.api import logger
from robot.api.deco import keyword from robot.api.deco import keyword
@ -28,13 +30,22 @@ def tick_epoch():
# If neofs-adm is available, then we tick epoch with it (to be consistent with UAT tests) # If neofs-adm is available, then we tick epoch with it (to be consistent with UAT tests)
cmd = f"{NEOFS_ADM_EXEC} morph force-new-epoch -c {NEOFS_ADM_CONFIG_PATH}" cmd = f"{NEOFS_ADM_EXEC} morph force-new-epoch -c {NEOFS_ADM_CONFIG_PATH}"
logger.info(f"Executing shell command: {cmd}") logger.info(f"Executing shell command: {cmd}")
out = ''
err = ''
try: try:
out = wrappers.run_sh(cmd) out = wrappers.run_sh(cmd)
logger.info(f"Command completed with output: {out}") logger.info(f"Command completed with output: {out}")
except Exception as exc: except Exception as exc:
logger.error(exc) logger.error(exc)
err = str(exc)
raise RuntimeError("Failed to tick epoch") from exc raise RuntimeError("Failed to tick epoch") from exc
finally:
if 'allure' in sys.modules:
allure.attach((
f'COMMAND: {cmd}\n'
f'OUTPUT:\n {out}\n'
f'ERROR: {err}\n'
), 'Tick Epoch', allure.attachment_type.TEXT)
return return
# Otherwise we tick epoch using transaction # Otherwise we tick epoch using transaction

View file

@ -178,7 +178,7 @@ def node_healthcheck(node_name: str) -> HealthStatus:
@keyword('Set status for node') @keyword('Set status for node')
def node_set_status(node_name: str, status: str) -> None: def node_set_status(node_name: str, status: str, retry=False) -> None:
""" """
The function sets particular status for given node. The function sets particular status for given node.
Args: Args:
@ -188,6 +188,11 @@ def node_set_status(node_name: str, status: str) -> None:
(void) (void)
""" """
command = f"control set-status --status {status}" command = f"control set-status --status {status}"
try:
run_control_command(node_name, command)
except AssertionError as err:
if not retry:
raise AssertionError(f'Command control set-status failed with error {err}') from err
run_control_command(node_name, command) run_control_command(node_name, command)