2022-07-11 14:11:26 +00:00
import logging
2023-05-31 15:31:35 +00:00
from datetime import datetime
2023-03-09 10:19:41 +00:00
from time import sleep
2022-07-11 14:11:26 +00:00
import allure
import pytest
2023-03-09 10:19:41 +00:00
from frostfs_testlib . analytics import test_case
2023-01-09 12:46:03 +00:00
from frostfs_testlib . hosting import Host
2023-05-15 09:59:33 +00:00
from frostfs_testlib . resources . common import MORPH_BLOCK_TIME
from frostfs_testlib . resources . wellknown_acl import PUBLIC_ACL
2023-05-02 18:46:59 +00:00
from frostfs_testlib . s3 import AwsCliClient , Boto3ClientWrapper , S3ClientWrapper , VersioningStatus
2023-05-15 09:59:33 +00:00
from frostfs_testlib . shell import CommandOptions , Shell
2023-05-31 15:31:35 +00:00
from frostfs_testlib . steps . cli . container import (
StorageContainer ,
StorageContainerInfo ,
create_container ,
)
2023-05-15 09:59:33 +00:00
from frostfs_testlib . steps . cli . object import get_object , put_object_to_random_node
from frostfs_testlib . steps . node_management import (
2023-03-09 10:19:41 +00:00
check_node_in_map ,
check_node_not_in_map ,
exclude_node_from_network_map ,
include_node_to_network_map ,
2023-05-15 09:59:33 +00:00
remove_nodes_from_map_morph ,
2023-03-09 10:19:41 +00:00
wait_for_node_to_be_ready ,
)
2023-05-15 09:59:33 +00:00
from frostfs_testlib . steps . s3 import s3_helper
2023-05-31 15:31:35 +00:00
from frostfs_testlib . storage . cluster import Cluster , ClusterNode , StorageNode
from frostfs_testlib . storage . controllers import ClusterStateController , ShardsWatcher
2023-08-02 11:54:03 +00:00
from frostfs_testlib . storage . dataclasses . object_size import ObjectSize
2023-05-31 15:31:35 +00:00
from frostfs_testlib . storage . dataclasses . storage_object_info import StorageObjectInfo
from frostfs_testlib . storage . dataclasses . wallet import WalletInfo
2023-05-15 09:59:33 +00:00
from frostfs_testlib . testing . cluster_test_base import ClusterTestBase
2023-05-30 13:42:27 +00:00
from frostfs_testlib . testing . test_control import expect_not_raises
2023-05-15 09:59:33 +00:00
from frostfs_testlib . utils import datetime_utils
from frostfs_testlib . utils . failover_utils import (
wait_all_storage_nodes_returned ,
wait_object_replication ,
2023-03-09 10:19:41 +00:00
)
2023-05-30 13:42:27 +00:00
from frostfs_testlib . utils . file_keeper import FileKeeper
2023-05-15 09:59:33 +00:00
from frostfs_testlib . utils . file_utils import generate_file , get_file_hash
2023-03-09 10:19:41 +00:00
2022-09-28 12:07:16 +00:00
# Framework-wide logger shared by the test suite.
logger = logging.getLogger("NeoLogger")

# Storage nodes taken down (or excluded from the network map) by tests in this module.
# Tests append to it; teardown helpers and fixtures drain it while bringing nodes back.
stopped_nodes: list[StorageNode] = []
2022-07-11 14:11:26 +00:00
2023-08-02 11:54:03 +00:00
def pytest_generate_tests(metafunc: pytest.Metafunc):
    """Parametrize the ``s3_client`` fixture with every supported S3 client wrapper.

    Tests that do not request ``s3_client`` are left untouched. The parametrization is
    indirect, i.e. the wrapper class is handed to the ``s3_client`` fixture rather than
    to the test itself.
    """
    if "s3_client" not in metafunc.fixturenames:
        return

    s3_client_types = [AwsCliClient, Boto3ClientWrapper]
    metafunc.parametrize("s3_client", s3_client_types, indirect=True)
2023-05-30 13:42:27 +00:00
@pytest.fixture(scope="function")
@allure.title("Provide File Keeper")
def file_keeper():
    """Provide a fresh ``FileKeeper`` to the test and restore the kept files afterwards."""
    files_keeper = FileKeeper()
    yield files_keeper
    # Teardown: put back every file registered with the keeper during the test.
    files_keeper.restore_files()
2022-10-13 16:13:45 +00:00
@allure.step("Return all stopped hosts")
@pytest.fixture(scope="function", autouse=True)
def after_run_return_all_stopped_hosts(client_shell: Shell, cluster: Cluster) -> str:
    """Autouse fixture that starts every host still listed in ``stopped_nodes`` after a test.

    Yields a short human-readable description of this behaviour, then delegates the
    actual recovery to ``return_stopped_hosts``.
    """
    description = "After this test stopped services will be started automatically via fixture"
    yield description
    # Teardown part: runs once the test body has finished.
    return_stopped_hosts(client_shell, cluster)
2022-07-11 14:11:26 +00:00
2023-05-30 13:42:27 +00:00
@allure.step("Return all stopped storage services after test")
@pytest.fixture(scope="function")
def after_run_return_all_stopped_services(cluster_state_controller: ClusterStateController):
    """Start the storage services the controller reports as stopped once the test is over.

    Yields a human-readable description of this behaviour. Tests in this module annotate
    the fixture value as ``str`` and forward it to ``allure.dynamic.description``, so the
    fixture must yield a string rather than ``None``. Tests that only request the fixture
    for its teardown can ignore the value.
    """
    yield "After this test stopped services will be started automatically via fixture"
    # Teardown part: runs once the test body has finished.
    cluster_state_controller.start_stopped_storage_services()
2023-06-13 09:05:35 +00:00
@allure.step("Return all stopped S3 GateWay services after test")
@pytest.fixture(scope="function")
def after_run_return_all_stopped_s3(cluster_state_controller: ClusterStateController):
    """Ask the cluster state controller to start its stopped S3 gates after the test."""
    yield
    # Teardown part: runs once the test body has finished.
    cluster_state_controller.start_stopped_s3_gates()
2023-06-13 09:05:35 +00:00
2022-10-13 16:13:45 +00:00
def panic_reboot_host(host: Host) -> None:
    """Trigger an immediate reboot of ``host`` through the kernel's magic SysRq interface.

    Args:
        host: host whose shell is used to write to the SysRq control files.
    """
    host_shell = host.get_shell()

    # Writing 1 to kernel.sysrq enables the SysRq functions.
    host_shell.exec('sudo sh -c "echo 1 > /proc/sys/kernel/sysrq"')

    # 'b' asks the kernel to reboot right away. The command is run with a 1 second timeout
    # and without exit-status checking - presumably because the host goes down before the
    # command can report back.
    reboot_options = CommandOptions(close_stdin=True, timeout=1, check=False)
    host_shell.exec('sudo sh -c "echo b > /proc/sysrq-trigger"', reboot_options)
2022-07-11 14:11:26 +00:00
2023-05-15 09:59:33 +00:00
def return_stopped_hosts(shell: Shell, cluster: Cluster) -> None:
    """Start every host recorded in ``stopped_nodes`` and wait for the storage nodes to return.

    Args:
        shell: shell passed on to ``wait_all_storage_nodes_returned``.
        cluster: cluster passed on to ``wait_all_storage_nodes_returned``.
    """
    # Walk over a snapshot because entries are removed from the shared list on the way.
    for stopped_node in tuple(stopped_nodes):
        with allure.step(f"Start host {stopped_node}"):
            stopped_node.host.start_host()
        stopped_nodes.remove(stopped_node)

    wait_all_storage_nodes_returned(shell, cluster)
2022-07-11 14:11:26 +00:00
2022-07-14 06:21:20 +00:00
@pytest.mark.failover
class TestFailoverStorage(ClusterTestBase):
    """Failover tests that stop or reboot hosts/services and check objects stay readable."""

    @allure.title("Shutdown and start node (stop_mode={stop_mode})")
    @pytest.mark.parametrize("stop_mode", ["hard", "soft"])
    @pytest.mark.failover_reboot
    def test_lose_storage_node_host(
        self,
        default_wallet,
        stop_mode: str,
        require_multiple_hosts,
        simple_object_size: ObjectSize,
    ):
        """Stop the hosts holding an object one by one and check the object survives.

        Steps:
            1. Create a container with two replicas and put an object into it.
            2. For every node holding the object: stop its host and wait until the object
               has two replicas on the remaining (not stopped) nodes.
            3. Check none of the original nodes holds the object and the data is intact.
            4. Start the hosts again and check the data once more.
        """
        wallet = default_wallet
        placement_rule = "REP 2 IN X CBF 2 SELECT 2 FROM * AS X"
        source_file_path = generate_file(simple_object_size.value)
        cid = create_container(
            wallet,
            shell=self.shell,
            endpoint=self.cluster.default_rpc_endpoint,
            rule=placement_rule,
            basic_acl=PUBLIC_ACL,
        )
        oid = put_object_to_random_node(
            wallet, source_file_path, cid, shell=self.shell, cluster=self.cluster
        )
        nodes = wait_object_replication(
            cid, oid, 2, shell=self.shell, nodes=self.cluster.storage_nodes
        )

        for node in nodes:
            # Register the node first so that the autouse teardown fixture starts the host
            # again even if this test fails half-way.
            stopped_nodes.append(node)

            with allure.step(f"Stop host {node}"):
                node.host.stop_host(stop_mode)

            new_nodes = wait_object_replication(
                cid,
                oid,
                2,
                shell=self.shell,
                nodes=list(set(self.cluster.storage_nodes) - {*stopped_nodes}),
            )

        assert all(old_node not in new_nodes for old_node in nodes)

        with allure.step("Check object data is not corrupted"):
            got_file_path = get_object(
                wallet, cid, oid, endpoint=new_nodes[0].get_rpc_endpoint(), shell=self.shell
            )
            assert get_file_hash(source_file_path) == get_file_hash(got_file_path)

        with allure.step("Return all hosts"):
            return_stopped_hosts(self.shell, self.cluster)

        with allure.step("Check object data is not corrupted"):
            new_nodes = wait_object_replication(
                cid, oid, 2, shell=self.shell, nodes=self.cluster.storage_nodes
            )
            got_file_path = get_object(
                wallet, cid, oid, shell=self.shell, endpoint=new_nodes[0].get_rpc_endpoint()
            )
            assert get_file_hash(source_file_path) == get_file_hash(got_file_path)

    @allure.title("Panic reboot nodes (sequenced_reboots={sequence})")
    @pytest.mark.parametrize("sequence", [True, False])
    @pytest.mark.failover_panic
    def test_panic_storage_node_host(
        self, default_wallet, require_multiple_hosts, sequence: bool, simple_object_size: ObjectSize
    ):
        """Panic-reboot the hosts holding an object and check the object stays readable.

        With ``sequence=True`` replication is awaited after every single reboot, otherwise
        only once after all hosts were rebooted.
        """
        wallet = default_wallet
        placement_rule = "REP 2 IN X CBF 2 SELECT 2 FROM * AS X"
        source_file_path = generate_file(simple_object_size.value)
        cid = create_container(
            wallet,
            shell=self.shell,
            endpoint=self.cluster.default_rpc_endpoint,
            rule=placement_rule,
            basic_acl=PUBLIC_ACL,
        )
        oid = put_object_to_random_node(
            wallet, source_file_path, cid, shell=self.shell, cluster=self.cluster
        )

        nodes = wait_object_replication(
            cid, oid, 2, shell=self.shell, nodes=self.cluster.storage_nodes
        )
        allure.attach(
            "\n".join(map(str, nodes)),
            "Current nodes with object",
            allure.attachment_type.TEXT,
        )

        new_nodes: list[StorageNode] = []
        for node in nodes:
            with allure.step(f"Hard reboot host {node} via magic SysRq option"):
                panic_reboot_host(node.host)
                if sequence:
                    try:
                        # First look for two replicas on the nodes other than the rebooted one.
                        new_nodes = wait_object_replication(
                            cid,
                            oid,
                            2,
                            shell=self.shell,
                            nodes=list(set(self.cluster.storage_nodes) - {node}),
                        )
                    except AssertionError:
                        # Fall back to the whole cluster, rebooted node included.
                        new_nodes = wait_object_replication(
                            cid,
                            oid,
                            2,
                            shell=self.shell,
                            nodes=self.cluster.storage_nodes,
                        )

                    allure.attach(
                        "\n".join(map(str, new_nodes)),
                        f"Nodes with object after {node} fail",
                        allure.attachment_type.TEXT,
                    )

        if not sequence:
            new_nodes = wait_object_replication(
                cid, oid, 2, shell=self.shell, nodes=self.cluster.storage_nodes
            )
            allure.attach(
                "\n".join(map(str, new_nodes)),
                "Nodes with object after nodes fail",
                allure.attachment_type.TEXT,
            )

        got_file_path = get_object(
            wallet, cid, oid, shell=self.shell, endpoint=new_nodes[0].get_rpc_endpoint()
        )
        assert get_file_hash(source_file_path) == get_file_hash(got_file_path)

    @allure.title("Do not ignore unhealthy tree endpoints (s3_client={s3_client})")
    def test_unhealthy_tree(
        self,
        s3_client: S3ClientWrapper,
        simple_object_size: ObjectSize,
        cluster_state_controller: ClusterStateController,
        after_run_return_all_stopped_s3,
        after_run_return_all_stopped_services,
    ):
        """Restart the S3 gate while the local storage is down, then read through the gate.

        Steps:
            1. Stop the S3 gate and the storage service on the first cluster node.
            2. Start the S3 gate, then the storage service.
            3. Create a bucket, put an object and check it is listed.
            4. Stop the storage service on all other cluster nodes.
            5. Check the object is still listed in the bucket.
        """
        default_node = self.cluster.cluster_nodes[0]
        default_s3gate = self.cluster.s3_gates[0]

        with allure.step("Turn S3 GW off on default node"):
            default_s3gate.stop_service()

        # The gate was stopped directly on the service object, so it is presumably not
        # tracked by cluster_state_controller.start_stopped_s3_gates() (TODO confirm).
        # Restart it here even if stopping the storage service fails, so a failure does
        # not leave the gate down for the following tests.
        try:
            with allure.step("Turn off storage on default node"):
                cluster_state_controller.stop_storage_service(default_node)
        finally:
            with allure.step("Turn on S3 GW on default node"):
                default_s3gate.start_service()

        with allure.step("Turn on storage on default node"):
            cluster_state_controller.start_stopped_storage_services()

        with allure.step("Create bucket with REP 1 SELECT 1 policy"):
            bucket = s3_client.create_bucket(
                location_constraint="load-1-1",
            )

        file_path = generate_file(simple_object_size.value)
        file_name = s3_helper.object_key_from_file_path(file_path)
        with allure.step("Put object into bucket"):
            s3_client.put_object(bucket, file_path)
            s3_helper.check_objects_in_bucket(s3_client, bucket, expected_objects=[file_name])

        with allure.step("Turn off all storage nodes except default"):
            for node in self.cluster.cluster_nodes[1:]:
                with allure.step(f"Stop storage service on node: {node}"):
                    cluster_state_controller.stop_storage_service(node)

        with allure.step("Check that object is available"):
            s3_helper.check_objects_in_bucket(s3_client, bucket, expected_objects=[file_name])
2023-03-09 10:19:41 +00:00
@pytest.mark.failover
@pytest.mark.failover_empty_map
class TestEmptyMap(ClusterTestBase):
    """
    A set of tests that empty the network map (or wipe storage data on all nodes)
    and verify that S3 operations still work afterwards.
    """

    @allure.step("Teardown after EmptyMap offline test")
    @pytest.fixture()
    def empty_map_offline_teardown(self):
        """Include every node still listed in ``stopped_nodes`` back into the network map."""
        yield
        with allure.step("Return all storage nodes to network map"):
            # Iterate over a snapshot: entries are removed from the shared list on the way.
            for node in list(stopped_nodes):
                include_node_to_network_map(node, node, shell=self.shell, cluster=self.cluster)
                stopped_nodes.remove(node)

    @test_case.title("Test makes network map empty (offline all storage nodes)")
    @test_case.priority(test_case.TestCasePriority.HIGH)
    @test_case.suite_name("failovers")
    @test_case.suite_section("test_failover_storage")
    # NOTE(review): the marker name is misspelled ("offlne"). It is kept as-is because
    # markers are used for test selection - confirm against pytest.ini/CI before renaming.
    @pytest.mark.failover_empty_map_offlne
    @allure.title("Empty network map via offline all storage nodes (s3_client={s3_client})")
    def test_offline_all_storage_nodes(
        self,
        s3_client: S3ClientWrapper,
        bucket: str,
        simple_object_size: ObjectSize,
        empty_map_offline_teardown,
    ):
        """
        The test makes the network map empty (sets OFFLINE status on all storage nodes),
        then returns all nodes to the map and checks that the object can be read through S3.

        Steps:
            1. Check that bucket is empty
            2. PUT object into bucket
            3. Check that object exists in bucket
            4. Exclude all storage nodes from network map (set status OFFLINE)
            5. Return all storage nodes to network map
            6. Check that we can read object from #2

        Args:
            bucket: bucket which contains tested object
            simple_object_size: size of object
        """
        file_path = generate_file(simple_object_size.value)
        file_name = s3_helper.object_key_from_file_path(file_path)
        bucket_objects = [file_name]

        objects_list = s3_client.list_objects(bucket)
        assert not objects_list, f"Expected empty bucket, got {objects_list}"

        with allure.step("Put object into bucket"):
            s3_client.put_object(bucket, file_path)

        with allure.step("Check that object exists in bucket"):
            s3_helper.check_objects_in_bucket(s3_client, bucket, bucket_objects)

        storage_nodes = self.cluster.storage_nodes
        with allure.step("Exclude all storage nodes from network map"):
            for node in storage_nodes:
                exclude_node_from_network_map(node, node, shell=self.shell, cluster=self.cluster)
                # Track the node so the teardown fixture can re-include it on failure.
                stopped_nodes.append(node)

        with allure.step("Return all storage nodes to network map"):
            for node in storage_nodes:
                include_node_to_network_map(node, node, shell=self.shell, cluster=self.cluster)
                stopped_nodes.remove(node)

        with allure.step("Check that we can read object"):
            s3_helper.check_objects_in_bucket(s3_client, bucket, bucket_objects)

    @allure.step("Teardown after EmptyMap stop service test")
    @pytest.fixture()
    def empty_map_stop_service_teardown(self):
        """Start every node still listed in ``stopped_nodes`` and wait until it is in the map."""
        yield
        with allure.step("Return all storage nodes to network map"):
            # Iterate over a snapshot: entries are removed from the shared list on the way.
            for node in list(stopped_nodes):
                with allure.step(f"Start node {node}"):
                    node.start_service()
                with allure.step(f"Waiting status ready for node {node}"):
                    wait_for_node_to_be_ready(node)

                sleep(datetime_utils.parse_time(MORPH_BLOCK_TIME))
                self.tick_epochs(1)
                check_node_in_map(node, shell=self.shell, alive_node=node)
                stopped_nodes.remove(node)

    @test_case.title("Test makes network map empty (stop storage service on all nodes)")
    @test_case.priority(test_case.TestCasePriority.HIGH)
    @test_case.suite_name("failovers")
    @test_case.suite_section("test_failover_storage")
    @pytest.mark.failover_empty_map_stop_service
    @allure.title("Empty network map via stop all storage services (s3_client={s3_client})")
    def test_stop_all_storage_nodes(
        self,
        s3_client: S3ClientWrapper,
        bucket: str,
        simple_object_size: ObjectSize,
        empty_map_stop_service_teardown,
    ):
        """
        The test makes the network map empty (stops the storage service on all nodes,
        then uses 'frostfs-adm morph delete-nodes' to delete the nodes from the map),
        then starts all services and checks that the object can be read through S3.

        Steps:
            1. Check that bucket is empty
            2. PUT object into bucket
            3. Check that object exists in bucket
            4. Exclude all storage nodes from network map (stop storage service
               and manually exclude from map)
            5. Return all storage nodes to network map
            6. Check that we can read object from #2

        Args:
            bucket: bucket which contains tested object
            simple_object_size: size of object
        """
        file_path = generate_file(simple_object_size.value)
        file_name = s3_helper.object_key_from_file_path(file_path)
        bucket_objects = [file_name]

        objects_list = s3_client.list_objects(bucket)
        assert not objects_list, f"Expected empty bucket, got {objects_list}"

        with allure.step("Put object into bucket"):
            s3_client.put_object(bucket, file_path)

        with allure.step("Check that object exists in bucket"):
            s3_helper.check_objects_in_bucket(s3_client, bucket, bucket_objects)

        with allure.step("Stop all storage nodes"):
            for node in self.cluster.storage_nodes:
                with allure.step(f"Stop storage service on node: {node}"):
                    node.stop_service()
                    # Track the node so the teardown fixture can restart it on failure.
                    stopped_nodes.append(node)

        with allure.step("Remove all nodes from network map"):
            remove_nodes_from_map_morph(
                shell=self.shell, cluster=self.cluster, remove_nodes=stopped_nodes
            )

        with allure.step("Return all storage nodes to network map"):
            self.return_nodes_after_stop_with_check_empty_map(stopped_nodes)

        with allure.step("Check that object exists in bucket"):
            s3_helper.check_objects_in_bucket(s3_client, bucket, bucket_objects)

    @allure.step("Return all nodes to cluster with check empty map first")
    def return_nodes_after_stop_with_check_empty_map(self, return_nodes=None) -> None:
        """Start the given nodes one by one and wait for each to appear in the network map.

        Right after the first node is started, every node to be returned is checked to be
        absent from the map. Each started node is removed from ``stopped_nodes``.

        Args:
            return_nodes: nodes to start; when omitted (``None``) all nodes currently
                listed in ``stopped_nodes`` are returned.
        """
        # Snapshot the nodes: callers pass the shared ``stopped_nodes`` list, which is
        # mutated inside the loop below.
        nodes_to_return = list(stopped_nodes if return_nodes is None else return_nodes)

        for index, node in enumerate(nodes_to_return):
            with allure.step(f"Start node {node}"):
                node.start_service()
            with allure.step(f"Waiting status ready for node {node}"):
                wait_for_node_to_be_ready(node)

            # The map can only be checked once at least one node is alive to be queried,
            # and it is only expected to be empty before the first node re-registers.
            if index == 0:
                with allure.step("Make sure that network map is empty"):
                    for check_node in nodes_to_return:
                        check_node_not_in_map(check_node, shell=self.shell, alive_node=node)

            sleep(datetime_utils.parse_time(MORPH_BLOCK_TIME))
            self.tick_epochs(1)
            check_node_in_map(node, shell=self.shell, alive_node=node)
            stopped_nodes.remove(node)

    @allure.title("Object loss from fstree/blobovnicza (versioning=enabled, s3_client={s3_client})")
    def test_s3_fstree_blobovnicza_loss_versioning_on(
        self,
        s3_client: S3ClientWrapper,
        simple_object_size: ObjectSize,
        cluster_state_controller: ClusterStateController,
    ):
        """Wipe blobovnicza and fstree on all nodes, then delete all object versions and the bucket."""
        bucket = s3_client.create_bucket()
        s3_helper.set_bucket_versioning(s3_client, bucket, VersioningStatus.ENABLED)

        file_path = generate_file(simple_object_size.value)
        file_name = s3_helper.object_key_from_file_path(file_path)

        object_versions = []
        with allure.step("Put object into one bucket"):
            # The value returned by put_object is used as a version id further below.
            put_object = s3_client.put_object(bucket, file_path)
            s3_helper.check_objects_in_bucket(s3_client, bucket, expected_objects=[file_name])
            object_versions.append(put_object)

        with allure.step("Stop all storage nodes"):
            for node in self.cluster.cluster_nodes:
                with allure.step(f"Stop storage service on node: {node}"):
                    cluster_state_controller.stop_storage_service(node)

        with allure.step("Delete blobovnicza and fstree from all nodes"):
            for node in self.cluster.storage_nodes:
                node.delete_blobovnicza()
                node.delete_fstree()

        with allure.step("Start all storage nodes"):
            cluster_state_controller.start_stopped_storage_services()

        # need to get Delete Marker first
        with allure.step("Delete the object from the bucket"):
            delete_object = s3_client.delete_object(bucket, file_name)
            object_versions.append(delete_object["VersionId"])

        # and now delete all versions of object (including Delete Markers)
        with allure.step("Delete all versions of the object from the bucket"):
            for version in object_versions:
                s3_client.delete_object(bucket, file_name, version_id=version)

        with allure.step("Delete bucket"):
            s3_client.delete_bucket(bucket)

    @allure.title(
        "Object loss from fstree/blobovnicza (versioning=disabled, s3_client={s3_client})"
    )
    def test_s3_fstree_blobovnicza_loss_versioning_off(
        self,
        s3_client: S3ClientWrapper,
        simple_object_size: ObjectSize,
        cluster_state_controller: ClusterStateController,
    ):
        """Wipe blobovnicza and fstree on all nodes, then delete the object and the bucket."""
        bucket = s3_client.create_bucket()

        file_path = generate_file(simple_object_size.value)
        file_name = s3_helper.object_key_from_file_path(file_path)

        with allure.step("Put object into one bucket"):
            s3_client.put_object(bucket, file_path)
            s3_helper.check_objects_in_bucket(s3_client, bucket, expected_objects=[file_name])

        with allure.step("Stop all storage nodes"):
            for node in self.cluster.cluster_nodes:
                with allure.step(f"Stop storage service on node: {node}"):
                    cluster_state_controller.stop_storage_service(node)

        with allure.step("Delete blobovnicza and fstree from all nodes"):
            for node in self.cluster.storage_nodes:
                node.delete_blobovnicza()
                node.delete_fstree()

        with allure.step("Start all storage nodes"):
            cluster_state_controller.start_stopped_storage_services()

        with allure.step("Delete the object from the bucket"):
            s3_client.delete_object(bucket, file_name)

        with allure.step("Delete bucket"):
            s3_client.delete_bucket(bucket)

    @pytest.mark.skip(reason="Need to increase cache lifetime")
    @pytest.mark.parametrize(
        # versioning should NOT be VersioningStatus.SUSPENDED, it needs to be undefined
        "versioning_status",
        [VersioningStatus.ENABLED, VersioningStatus.UNDEFINED],
    )
    @allure.title(
        "After Pilorama.db loss on all nodes list objects should return nothing in second listing (versioning_status={versioning_status}, s3_client={s3_client})"
    )
    def test_s3_pilorama_loss(
        self,
        s3_client: S3ClientWrapper,
        simple_object_size: ObjectSize,
        versioning_status: VersioningStatus,
        cluster_state_controller: ClusterStateController,
    ):
        """Delete pilorama on all nodes and check the first listing is non-empty, the second empty."""
        bucket = s3_client.create_bucket()
        s3_helper.set_bucket_versioning(s3_client, bucket, versioning_status)

        file_path = generate_file(simple_object_size.value)
        file_name = s3_helper.object_key_from_file_path(file_path)

        with allure.step("Put object into one bucket"):
            s3_client.put_object(bucket, file_path)
            s3_helper.check_objects_in_bucket(s3_client, bucket, expected_objects=[file_name])

        with allure.step("Stop all storage nodes"):
            for node in self.cluster.cluster_nodes:
                with allure.step(f"Stop storage service on node: {node}"):
                    cluster_state_controller.stop_storage_service(node)

        with allure.step("Delete pilorama.db from all nodes"):
            for node in self.cluster.storage_nodes:
                node.delete_pilorama()

        with allure.step("Start all storage nodes"):
            cluster_state_controller.start_stopped_storage_services()

        with allure.step("Check list objects first time"):
            objects_list = s3_client.list_objects(bucket)
            assert objects_list, "Expected not empty bucket"

        with allure.step("Check list objects second time"):
            objects_list = s3_client.list_objects(bucket)
            assert not objects_list, f"Expected empty bucket, got {objects_list}"

        with allure.step("Delete bucket"):
            s3_client.delete_bucket(bucket)
2023-05-30 13:42:27 +00:00
@pytest.mark.failover
@pytest.mark.failover_data_loss
class TestStorageDataLoss ( ClusterTestBase ) :
2023-05-30 14:14:26 +00:00
@allure.step("Get list of all piloramas on node")
def get_piloramas_list(self, cluster_state_controller, node) -> list:
    """List the pilorama files found under the ``meta*`` directories of the data directory.

    Args:
        cluster_state_controller: object providing ``get_data_directory()`` and a ``host``
            whose shell runs the listing command.
        node: not used by this method; kept so existing callers keep working.

    Returns:
        One entry per non-empty line printed by ``ls -1``.
    """
    data_directory_path = cluster_state_controller.get_data_directory()
    cmd = f"sudo ls -1 {data_directory_path}/meta*/pilorama*"
    shell = cluster_state_controller.host.get_shell()
    stdout = shell.exec(cmd).stdout

    # `ls` terminates its output with a newline; a plain split("\n") would therefore
    # report an extra empty "file" at the end of the list.
    return [line for line in stdout.splitlines() if line]
2023-05-30 13:42:27 +00:00
@allure.title(
    "After metabase loss on all nodes operations on objects and buckets should be still available via S3 (s3_client={s3_client})"
)
@pytest.mark.metabase_loss
def test_metabase_loss(
    self,
    s3_client: S3ClientWrapper,
    simple_object_size: ObjectSize,
    complex_object_size: ObjectSize,
    cluster_state_controller: ClusterStateController,
    after_run_return_all_stopped_services: str,
    file_keeper: FileKeeper,
):
    """Delete the metabase on all nodes and check objects and bucket can still be deleted.

    Steps:
        1. Create a bucket and put a simple and a complex object into it.
        2. Stop all storage services and delete the metabase on every storage node.
        3. Enable the ``resync_metabase`` option in every node config (the original
           config files are registered with ``file_keeper`` first).
        4. Start the storage services.
        5. Delete both objects and the bucket; none of these may raise.
    """
    allure.dynamic.description(after_run_return_all_stopped_services)

    with allure.step("Create bucket"):
        bucket = s3_client.create_bucket()

    with allure.step("Put objects into bucket"):
        simple_object_path = generate_file(simple_object_size.value)
        simple_object_key = s3_helper.object_key_from_file_path(simple_object_path)

        complex_object_path = generate_file(complex_object_size.value)
        complex_object_key = s3_helper.object_key_from_file_path(complex_object_path)

        s3_client.put_object(bucket, simple_object_path)
        s3_client.put_object(bucket, complex_object_path)

    with allure.step("Check objects are in bucket"):
        s3_helper.check_objects_in_bucket(
            s3_client, bucket, expected_objects=[simple_object_key, complex_object_key]
        )

    with allure.step("Stop storage services on all nodes"):
        cluster_state_controller.stop_all_storage_services()

    with allure.step("Delete metabase from all nodes"):
        for node in cluster_state_controller.cluster.storage_nodes:
            node.delete_metabase()

    with allure.step("Enable resync_metabase option for storage services"):
        for storage_node in cluster_state_controller.cluster.storage_nodes:
            with allure.step(f"Enable resync_metabase option for {storage_node}"):
                config_file_path, config = storage_node.get_config()
                shard_defaults = config["storage"]["shard"]["default"]
                # A config that does not mention the option at all is treated the same
                # as one that disables it, instead of failing with KeyError.
                if not shard_defaults.get("resync_metabase"):
                    # Register the file before changing it so it can be restored later.
                    file_keeper.add(storage_node, config_file_path)
                    shard_defaults["resync_metabase"] = True
                    storage_node.save_config(config)

    with allure.step("Start storage services on all nodes"):
        cluster_state_controller.start_stopped_storage_services()

    with allure.step("Delete objects from bucket"):
        with allure.step("Delete simple object from bucket"):
            with expect_not_raises():
                s3_client.delete_object(bucket, simple_object_key)

        with allure.step("Delete complex object from bucket"):
            with expect_not_raises():
                s3_client.delete_object(bucket, complex_object_key)

    with allure.step("Delete bucket"):
        with expect_not_raises():
            s3_client.delete_bucket(bucket)
2023-05-31 15:31:35 +00:00
@allure.title(
    "Write cache loss on one node should not affect shards and should not produce errors in log"
)
@pytest.mark.write_cache_loss
def test_write_cache_loss_on_one_node(
    self,
    node_under_test: ClusterNode,
    simple_object_size: ObjectSize,
    cluster_state_controller: ClusterStateController,
    shards_watcher: ShardsWatcher,
    default_wallet: str,
    test_start_time: datetime,
    after_run_return_all_stopped_services: str,
):
    """Delete the write cache of one node and check objects, shards and logs stay healthy.

    All detected problems are collected and reported together by the final assertion
    instead of failing on the first one.
    """
    allure.dynamic.description(after_run_return_all_stopped_services)
    problems = []

    with allure.step(f"Create container on node {node_under_test}"):
        locode = node_under_test.storage_node.get_un_locode()
        # Pin the single replica to the node under test via its UN-LOCODE attribute.
        placement_rule = f"""REP 1 IN X
        CBF 1
        SELECT 1 FROM C AS X
        FILTER 'UN-LOCODE' EQ '{locode}' AS C"""
        cid = create_container(
            default_wallet,
            self.shell,
            node_under_test.storage_node.get_rpc_endpoint(),
            rule=placement_rule,
        )
        container_info = StorageContainerInfo(cid, WalletInfo(default_wallet))
        container = StorageContainer(
            container_info,
            self.shell,
            cluster_state_controller.cluster,
        )

    with allure.step(f"Put couple objects to container on node {node_under_test}"):
        storage_objects: list[StorageObjectInfo] = [
            container.generate_object(
                simple_object_size.value,
                endpoint=node_under_test.storage_node.get_rpc_endpoint(),
            )
            for _ in range(5)
        ]

    with allure.step("Take shards snapshot"):
        shards_watcher.take_shards_snapshot()

    with allure.step(f"Stop storage service on node {node_under_test}"):
        cluster_state_controller.stop_storage_service(node_under_test)

    with allure.step(f"Delete write cache from node {node_under_test}"):
        node_under_test.storage_node.delete_write_cache()

    with allure.step(f"Start storage service on node {node_under_test}"):
        cluster_state_controller.start_storage_service(node_under_test)

    with allure.step("Objects should be available"):
        for stored in storage_objects:
            get_object(
                stored.wallet_file_path,
                container.get_id(),
                stored.oid,
                self.shell,
                node_under_test.storage_node.get_rpc_endpoint(),
            )

    with allure.step("No shards should have new errors"):
        shards_watcher.take_shards_snapshot()
        shards_with_errors = shards_watcher.get_shards_with_new_errors()
        if shards_with_errors:
            problems.append(f"Shards have new errors: {shards_with_errors}")

    with allure.step("No shards should have degraded status"):
        snapshot = shards_watcher.get_shards_snapshot()
        for shard in snapshot:
            status = snapshot[shard]["mode"]
            if status == "read-write":
                continue
            problems.append(f"Shard {shard} changed status to {status}")

    with allure.step("No related errors should be in log"):
        has_missing_file_errors = node_under_test.host.is_message_in_logs(
            message_regex=r"\Wno such file or directory\W", since=test_start_time
        )
        if has_missing_file_errors:
            problems.append(f"Node {node_under_test} have shard errors in logs")

    with allure.step("Pass test if no errors found"):
        assert not problems, "\n".join(problems)
2023-05-31 10:16:38 +00:00
@allure.title(
    " Loss of one node should trigger use of tree and storage service in another node (s3_client= {s3_client} ) "
)
def test_s3_one_endpoint_loss(
    self,
    bucket,
    s3_client: S3ClientWrapper,
    simple_object_size: ObjectSize,
    after_run_return_all_stopped_services,
    cluster_state_controller: ClusterStateController,
):
    """Check that an S3 upload still succeeds after one storage service is stopped.

    The storage service on the first cluster node is stopped, the test waits a
    fixed 60 seconds, then a generated file is put into ``bucket`` and the
    bucket listing is checked for the object's key.

    Args:
        bucket: bucket the object is uploaded to.
        s3_client: S3 client wrapper used for the upload.
        simple_object_size: its ``value`` is passed to ``generate_file``.
        after_run_return_all_stopped_services: requested only for its fixture
            side effects; not referenced in the body (presumably restarts the
            stopped service on teardown -- confirm against the fixture).
        cluster_state_controller: used to stop the storage service.
    """
    # TODO: need to check that s3 gate is connected to localhost (such metric will be supported in 1.3)
    with allure.step(
        " Stop one node and wait for rebalance connection of s3 gate to storage service "
    ):
        current_node = self.cluster.cluster_nodes[0]
        cluster_state_controller.stop_storage_service(current_node)
        # waiting for rebalance connection of s3 gate to storage service
        sleep(60)

    file_path = generate_file(simple_object_size.value)
    file_name = s3_helper.object_key_from_file_path(file_path)

    with allure.step(" Put object into one bucket "):
        # The return value of put_object is not needed here; only the listing is verified.
        s3_client.put_object(bucket, file_path)
        s3_helper.check_objects_in_bucket(s3_client, bucket, expected_objects=[file_name])
2023-09-08 10:35:34 +00:00
@allure.title(
    " After Pilorama.db loss on one node object is retrievable (s3_client= {s3_client} ) "
)
def test_s3_one_pilorama_loss(
    self,
    s3_client: S3ClientWrapper,
    simple_object_size: ObjectSize,
    cluster_state_controller: ClusterStateController,
):
    """Check that a versioned bucket stays usable after pilorama.db is deleted on one node.

    Flow:
      1. Create a bucket, enable versioning and upload one generated object.
      2. Record the pilorama list of the first storage node, stop the storage
         service on every cluster node, delete pilorama on that node and start
         the stopped services again.
      3. Tick one epoch, wait 120 seconds and assert the pilorama list equals
         the one recorded before the deletion.
      4. Verify versioning status and listing, delete the object (expecting a
         delete marker), then delete every recorded version and the bucket.

    Args:
        s3_client: S3 client wrapper used for all bucket/object operations.
        simple_object_size: its ``value`` is passed to ``generate_file``.
        cluster_state_controller: used to stop and start storage services.
    """
    bucket = s3_client.create_bucket(
        location_constraint=" load-1-4 ",
        grant_read=" uri=http://acs.amazonaws.com/groups/global/AllUsers ",
    )
    s3_helper.set_bucket_versioning(s3_client, bucket, VersioningStatus.ENABLED)

    with allure.step(" Check bucket versioning "):
        bucket_versioning = s3_client.get_bucket_versioning_status(bucket)
        assert bucket_versioning == " Enabled ", " Bucket should have enabled versioning "

    file_path = generate_file(simple_object_size.value)
    file_name = s3_helper.object_key_from_file_path(file_path)

    # Everything collected here is deleted version-by-version at the end of the test.
    object_versions = []
    with allure.step(" Put object into one bucket "):
        put_object = s3_client.put_object(bucket, file_path)
        s3_helper.check_objects_in_bucket(s3_client, bucket, expected_objects=[file_name])
        # presumably put_object returns the new version id -- confirm against the client wrapper
        object_versions.append(put_object)

    node_to_check = self.cluster.storage_nodes[0]

    with allure.step(" Get list of all pilorama.db "):
        piloramas_before = self.get_piloramas_list(node_to_check, cluster_state_controller)

    with allure.step(" Stop all storage nodes "):
        for node in self.cluster.cluster_nodes:
            with allure.step(f" Stop storage service on node: { node } "):
                cluster_state_controller.stop_storage_service(node)

    with allure.step(" Delete pilorama.db from one node "):
        node_to_check.delete_pilorama()

    with allure.step(" Start all storage nodes "):
        cluster_state_controller.start_stopped_storage_services()

    # The step title states the real wait: the code sleeps 120 seconds, not one minute.
    with allure.step(" Tick epoch to trigger sync and then wait for 2 minutes "):
        self.tick_epochs(1)
        sleep(120)

    with allure.step(" Get list of all pilorama.db after sync "):
        piloramas_after = self.get_piloramas_list(node_to_check, cluster_state_controller)
        assert piloramas_after == piloramas_before, " List of pilorama.db is different "

    with allure.step(" Check bucket versioning "):
        bucket_versioning = s3_client.get_bucket_versioning_status(bucket)
        assert bucket_versioning == " Enabled ", " Bucket should have enabled versioning "

    with allure.step(" Check list objects "):
        objects_list = s3_client.list_objects(bucket)
        assert objects_list, " Expected not empty bucket "

    with allure.step(" Delete the object from the bucket "):
        delete_object = s3_client.delete_object(bucket, file_name)
        assert " DeleteMarker " in delete_object, " Delete markers not found "

    with allure.step(" Check list objects "):
        objects_list = s3_client.list_objects_versions(bucket)
        assert objects_list, " Expected not empty bucket "
        # Remember the delete marker's version so it is removed below as well.
        object_versions.append(delete_object[" VersionId "])

    # and now delete all versions of object (including Delete Markers)
    with allure.step(" Delete all versions of the object from the bucket "):
        for version in object_versions:
            s3_client.delete_object(bucket, file_name, version_id=version)

    with allure.step(" Check list objects "):
        objects_list = s3_client.list_objects_versions(bucket)
        assert not objects_list, " Expected empty bucket "

    with allure.step(" Delete bucket "):
        s3_client.delete_bucket(bucket)