2020-03-23 11:36:32 +00:00
import nose
import random
2020-07-06 21:10:50 +00:00
from nose . plugins . attrib import attr
2020-07-03 11:49:13 +00:00
import uuid
from nose . tools import eq_ as eq
2020-03-23 11:36:32 +00:00
from . import (
2020-05-31 13:07:58 +00:00
get_client
2020-03-23 11:36:32 +00:00
)
# Module-level region name setting; nothing in the visible part of this file reads it.
region_name = ' '
2020-05-26 22:20:31 +00:00
def random_expr(depth):
    """Recursively build a random, fully parenthesised arithmetic expression.

    depth is the complexity of the expression: a depth of 1 yields a single
    numeric literal, and each additional level joins two sub-expressions of
    the previous depth with a randomly chosen operator (+, -, *, /).
    depth must be at least 1; smaller values never reach the base case.

    Returns the expression as a string.
    """
    if depth == 1:
        # base case: a whole number in 1..100 followed by a fractional suffix
        number = int(random.random() * 100) + 1
        return str(number) + " .0 "

    left = random_expr(depth - 1)
    operator = random.choice([' + ', ' - ', ' * ', ' / '])
    right = random_expr(depth - 1)
    return ' ( ' + left + operator + right + ' ) '
def generate_s3select_where_clause(bucket_name, obj_name):
    """Cross-check a random comparison between the s3select engine and Python.

    Two random arithmetic expressions are compared with a randomly chosen
    comparison operator inside a where clause. A count(0) greater than zero
    means the engine evaluated the condition as true; that boolean must equal
    what Python's eval() yields for the same comparison.

    Returns early, without querying, when either expression divides by zero.
    """
    lhs = random_expr(4)
    rhs = random_expr(4)
    comparator = random.choice([' < ', ' > ', ' == ', ' <= ', ' >= ', ' != '])

    # eval() only ever sees text produced by random_expr() above
    try:
        eval(lhs)
        eval(rhs)
    except ZeroDivisionError:
        return

    condition = lhs + comparator + rhs
    s3select_stmt = " select count(0) from stdin where " + condition + " ; "

    raw_result = run_s3select(bucket_name, obj_name, s3select_stmt)
    res = remove_xml_tags_from_result(raw_result).replace(" , ", " ")

    nose.tools.assert_equal(int(res) > 0, eval(condition))
def generate_s3select_expression_projection(bucket_name, obj_name):
    """Cross-check a random arithmetic projection between s3select and Python.

    A random expression is sent to the engine as a projection and is also
    evaluated with Python's eval(); the two results must agree within a
    relative tolerance (epsilon).

    Returns early, without querying, when the expression divides by zero or
    evaluates to zero (the relative comparison divides by the expected value).
    """
    e = random_expr(4)

    # eval() only ever sees text produced by random_expr() above
    try:
        expected = eval(e)
    except ZeroDivisionError:
        return
    if expected == 0:
        return

    raw_result = run_s3select(bucket_name, obj_name, " select " + e + " from stdin; ")
    res = remove_xml_tags_from_result(raw_result).replace(" , ", " ")

    # accuracy level
    epsilon = 0.000001
    actual = float(res.split(" \n ")[1])

    # both results should be close (epsilon); abs() is required because a
    # ratio above 1 gives a negative difference that would otherwise always
    # compare as smaller than epsilon
    assert abs(1 - (actual / expected)) < epsilon
2020-07-06 21:10:50 +00:00
def get_random_string():
    """Return the first six hexadecimal characters of a fresh UUID4, upper-cased."""
    return uuid.uuid4().hex[:6].upper()


# The attribute tag belongs on the test so attribute-based selection picks it
# up like every other test in this file; it used to sit on the helper above.
@attr(' s3select ')
def test_generate_where_clause():
    """Run 100 random where-clause comparisons against a tiny uploaded object."""
    # create small csv file for testing the random expressions
    single_line_csv = create_random_csv_object(1, 1)
    bucket_name = " test "
    obj_name = get_random_string()
    upload_csv_object(bucket_name, obj_name, single_line_csv)

    for _ in range(100):
        generate_s3select_where_clause(bucket_name, obj_name)
2020-07-06 21:10:50 +00:00
@attr(' s3select ')
def test_generate_projection():
    """Run 100 random arithmetic projections against a tiny uploaded object."""
    # a one-row, one-column object is enough: the expressions do not read it
    tiny_csv = create_random_csv_object(1, 1)
    bucket = " test "
    key = get_random_string()
    upload_csv_object(bucket, key, tiny_csv)

    repetitions = 100
    for _ in range(repetitions):
        generate_s3select_expression_projection(bucket, key)
2020-03-23 11:36:32 +00:00
2020-04-09 13:43:11 +00:00
def create_csv_object_for_datetime(rows, columns):
    """Build a text object holding rows x columns random date-time fields.

    Every field is formatted from a year in 1900..2000, a month in 1..12,
    a day in 1..28 (presumably capped at 28 so the day exists in every
    month), an hour in 0..23, and a minute and a second in 0..59.

    Returns the whole object as one string.
    """
    def random_field():
        # arguments are evaluated left to right, keeping the random draw order
        return " {} {:02d} {:02d} - {:02d} {:02d} {:02d} , ".format(
            random.randint(0, 100) + 1900,
            random.randint(1, 12),
            random.randint(1, 28),
            random.randint(0, 23),
            random.randint(0, 59),
            random.randint(0, 59),
        )

    # collect the rows and join once instead of growing a string in a loop
    lines = []
    for _ in range(rows):
        fields = "".join(random_field() for _ in range(columns))
        lines.append(" " + fields + " \n ")
    return " " + "".join(lines)
2020-05-08 13:51:24 +00:00
def create_random_csv_object(rows, columns, col_delim=" , ", record_delim=" \n ", csv_schema=" "):
    """Build a text object of random integers laid out as delimited records.

    rows / columns: number of records and of fields per record.
    col_delim:      text written after every field.
    record_delim:   text written after every record.
    csv_schema:     when non-empty, written first (followed by record_delim)
                    as a header line.

    Every field is a random integer in 0..1000. Returns the object as a string.
    """
    if len(csv_schema) > 0:
        header = csv_schema + record_delim
    else:
        header = " "

    # collect the records and join once instead of growing a string in a loop
    records = []
    for _ in range(rows):
        fields = "".join(
            " {} {} ".format(random.randint(0, 1000), col_delim)
            for _ in range(columns)
        )
        records.append(" " + fields + record_delim)
    return header + "".join(records)
2020-04-28 16:10:24 +00:00
2020-03-23 11:36:32 +00:00
def upload_csv_object(bucket_name, new_key, obj):
    """Create the bucket, store obj under new_key, and verify the upload.

    The object is read back through a second client and compared with the
    uploaded content; a mismatch fails the equality assertion.
    """
    uploader = get_client()
    uploader.create_bucket(Bucket=bucket_name)
    uploader.put_object(Bucket=bucket_name, Key=new_key, Body=obj)

    # validate uploaded object
    downloader = get_client()
    downloaded = downloader.get_object(Bucket=bucket_name, Key=new_key)
    content = downloaded[' Body '].read().decode(' utf-8 ')
    eq(content, obj, ' s3select error[ downloaded object not equal to uploaded objecy ')
2020-03-23 11:36:32 +00:00
2020-05-08 13:51:24 +00:00
def run_s3select(bucket, key, query, column_delim=" , ", row_delim=" \n ", quot_char=' " ', esc_char=' \\ ', csv_header_info=" NONE "):
    """Run an SQL query through select_object_content and return its text.

    bucket / key:    location of the object to query.
    query:           the SQL expression to execute.
    column_delim, row_delim, quot_char, esc_char, csv_header_info:
                     CSV input-serialization settings passed to the service.

    Returns the decoded payloads of all 'Records' events, concatenated in
    the order they arrive.
    """
    s3 = get_client()

    csv_input = {
        " RecordDelimiter ": row_delim,
        " FieldDelimiter ": column_delim,
        " QuoteEscapeCharacter ": esc_char,
        " QuoteCharacter ": quot_char,
        " FileHeaderInfo ": csv_header_info,
    }
    response = s3.select_object_content(
        Bucket=bucket,
        Key=key,
        ExpressionType=' SQL ',
        InputSerialization={" CSV ": csv_input, " CompressionType ": " NONE "},
        OutputSerialization={" CSV ": {}},
        Expression=query,
    )

    # only events carrying records contribute to the result
    chunks = [
        event[' Records '][' Payload '].decode(' utf-8 ')
        for event in response[' Payload ']
        if ' Records ' in event
    ]
    return " " + "".join(chunks)
def remove_xml_tags_from_result(obj):
    """Drop the records of obj that mention the Payload or Records markers.

    obj is split on the record separator; every record containing one of the
    two marker strings is skipped and each remaining record is re-emitted
    followed by the separator.

    Returns the filtered text.
    """
    kept = []
    for rec in obj.split(" \n "):
        # containment test: find() > 0 used to let a marker located at the
        # very start of a record (index 0) slip through
        if " Payload " in rec or " Records " in rec:
            continue
        kept.append(rec + " \n ")  # the separator was removed by split
    return " " + "".join(kept)
2020-03-29 14:23:05 +00:00
2020-05-26 08:36:38 +00:00
def create_list_of_int(column_pos, obj, field_split=" , ", row_split=" \n "):
    """Collect one column of obj as a list of integers.

    column_pos:  1-based position of the wanted column.
    obj:         text made of records separated by row_split, each holding
                 fields separated by field_split.

    Empty records are skipped, and so are records with fewer than column_pos
    fields. Raises ValueError when the selected field is not an integer.
    """
    list_of_int = []
    for rec in obj.split(row_split):
        if not rec:
            continue
        for col_num, col in enumerate(rec.split(field_split), start=1):
            if col_num == column_pos:
                list_of_int.append(int(col))
                # the wanted column was found; no need to scan the rest
                break
    return list_of_int
2020-07-06 21:10:50 +00:00
@attr(' s3select ')
def test_count_operation():
    """count(0) over the whole object must equal the number of generated rows."""
    key = get_random_string()
    bucket = " test "
    expected_rows = 1234

    csv_text = create_random_csv_object(expected_rows, 10)
    upload_csv_object(bucket, key, csv_text)

    raw_result = run_s3select(bucket, key, " select count(0) from stdin; ")
    counted = remove_xml_tags_from_result(raw_result).replace(" , ", " ")

    nose.tools.assert_equal(expected_rows, int(counted))
2020-03-23 11:36:32 +00:00
2020-07-06 21:10:50 +00:00
@attr(' s3select ')
def test_column_sum_min_max():
    """Compare engine min/max/sum aggregates with the same values computed in Python.

    The same random object is uploaded to two buckets. Single-column
    aggregates are checked against the first upload; afterwards two
    count/sum queries check the relation between a filter on the column
    difference and the resulting sums.
    """
    csv_obj = create_random_csv_object(10000, 10)

    csv_obj_name = get_random_string()
    bucket_name = " test "
    upload_csv_object(bucket_name, csv_obj_name, csv_obj)

    csv_obj_name_2 = get_random_string()
    bucket_name_2 = " testbuck2 "
    upload_csv_object(bucket_name_2, csv_obj_name_2, csv_obj)

    # (query, 1-based column read by the query, matching python aggregate)
    aggregate_checks = (
        (" select min(int(_1)) from stdin; ", 1, min),
        (" select min(int(_4)) from stdin; ", 4, min),
        (" select max(int(_4)) from stdin; ", 4, max),
        (" select max(int(_7)) from stdin; ", 7, max),
        (" select sum(int(_4)) from stdin; ", 4, sum),
        (" select sum(int(_7)) from stdin; ", 7, sum),
    )
    for query, column, aggregate in aggregate_checks:
        raw_result = run_s3select(bucket_name, csv_obj_name, query)
        res_s3select = remove_xml_tags_from_result(raw_result).replace(" , ", " ")
        res_target = aggregate(create_list_of_int(column, csv_obj))
        nose.tools.assert_equal(int(res_s3select), int(res_target))

    # the following queries validate, on *random* input, an *accurate* relation
    # between the condition result, the sum operation and the count operation:
    # when every selected row has _1 - _2 == delta, sum(_1) - sum(_2) == count * delta
    relation_checks = (
        (bucket_name_2, csv_obj_name_2,
         " select count(0),sum(int(_1)),sum(int(_2)) from stdin where (int(_1)-int(_2)) == 2; ", 2),
        (bucket_name, csv_obj_name,
         " select count(0),sum(int(_1)),sum(int(_2)) from stdin where (int(_1)-int(_2)) == 4; ", 4),
    )
    for bucket, key, query, delta in relation_checks:
        res_s3select = remove_xml_tags_from_result(run_s3select(bucket, key, query))
        # the fourth piece is whatever follows the last separator; it is not used
        count, sum1, sum2, _ = res_s3select.split(" , ")
        nose.tools.assert_equal(int(count) * delta, int(sum1) - int(sum2))
2020-07-06 21:10:50 +00:00
@attr(' s3select ')
def test_complex_expressions():
    """Check several aggregate projections in one statement, then equivalent filters.

    First part: one statement with three aggregation projections is compared
    with the values computed in Python. Second part: three differently
    written where conditions are expected to produce identical results.
    """
    csv_obj = create_random_csv_object(10000, 10)
    csv_obj_name = get_random_string()
    bucket_name = " test "
    upload_csv_object(bucket_name, csv_obj_name, csv_obj)

    def select(statement):
        # run the statement and flatten the answer onto a single line
        raw_result = run_s3select(bucket_name, csv_obj_name, statement)
        return remove_xml_tags_from_result(raw_result).replace(" \n ", " ")

    # purpose: the engine processes several aggregation projections correctly
    res_s3select = select(" select min(int(_1)),max(int(_2)),min(int(_3))+1 from stdin; ")
    min_1 = min(create_list_of_int(1, csv_obj))
    max_2 = max(create_list_of_int(2, csv_obj))
    min_3 = min(create_list_of_int(3, csv_obj)) + 1
    expected = " {} , {} , {} , ".format(min_1, max_2, min_3)
    nose.tools.assert_equal(res_s3select, expected)

    # purpose: all where conditions select the same group of values, thus the same result
    res_s3select_substr = select(' select min(int(_2)),max(int(_2)) from stdin where substr(_2,1,1) == " 1 " ')
    res_s3select_between_numbers = select(' select min(int(_2)),max(int(_2)) from stdin where int(_2)>=100 and int(_2)<200 ')
    res_s3select_eq_modolu = select(' select min(int(_2)),max(int(_2)) from stdin where int(_2)/100 == 1 or int(_2)/10 == 1 or int(_2) == 1 ')

    nose.tools.assert_equal(res_s3select_substr, res_s3select_between_numbers)
    nose.tools.assert_equal(res_s3select_between_numbers, res_s3select_eq_modolu)
2020-04-04 13:31:54 +00:00
2020-07-06 21:10:50 +00:00
@attr(' s3select ')
def test_alias():
    """The same query written with and without aliases must return the same result.

    The aliased statement defines three aliases, the third built from the
    other two, and also uses an alias in its where clause. Per the original
    author's notes each alias has its own cache that must be invalidated for
    every new row, which is what this comparison exercises.
    """
    csv_obj = create_random_csv_object(10000, 10)
    csv_obj_name = get_random_string()
    bucket_name = " test "
    upload_csv_object(bucket_name, csv_obj_name, csv_obj)

    with_alias = " select int(_1) as a1, int(_2) as a2 , (a1+a2) as a3 from stdin where a3>100 and a3<300; "
    without_alias = " select int(_1),int(_2),int(_1)+int(_2) from stdin where (int(_1)+int(_2))>100 and (int(_1)+int(_2))<300; "

    res_s3select_alias = remove_xml_tags_from_result(
        run_s3select(bucket_name, csv_obj_name, with_alias)
    ).replace(" , ", " ")
    res_s3select_no_alias = remove_xml_tags_from_result(
        run_s3select(bucket_name, csv_obj_name, without_alias)
    ).replace(" , ", " ")

    nose.tools.assert_equal(res_s3select_alias, res_s3select_no_alias)
2020-03-23 11:36:32 +00:00
2020-07-06 21:10:50 +00:00
@attr(' s3select ')
def test_alias_cyclic_refernce():
    """The engine must report an error for aliases that reference each other in a cycle."""
    number_of_rows = 10000
    csv_obj = create_random_csv_object(number_of_rows, 10)
    csv_obj_name = get_random_string()
    bucket_name = " test "
    upload_csv_object(bucket_name, csv_obj_name, csv_obj)

    # a3 depends on a4, a4 on a5, and a5 on a3 again
    cyclic_query = " select int(_1) as a1,int(_2) as a2, a1+a4 as a3, a5+a1 as a4, int(_3)+a3 as a5 from stdin; "
    res_s3select_alias = remove_xml_tags_from_result(
        run_s3select(bucket_name, csv_obj_name, cyclic_query)
    )

    expected_error = " number of calls exceed maximum size, probably a cyclic reference to alias "
    assert expected_error in res_s3select_alias
2020-07-06 21:10:50 +00:00
@attr(' s3select ')
def test_datetime():
    """Validate date-time functions by comparing equivalent selections.

    Different (nested) function calls are used to select the same group of
    rows, so their counts have to match.
    """
    csv_obj = create_csv_object_for_datetime(10000, 1)
    csv_obj_name = get_random_string()
    bucket_name = " test "
    upload_csv_object(bucket_name, csv_obj_name, csv_obj)

    def select(statement):
        return remove_xml_tags_from_result(run_s3select(bucket_name, csv_obj_name, statement))

    # extracting the year must agree with reading the leading characters of the field
    res_s3select_date_time = select(' select count(0) from stdin where extract( " year " ,timestamp(_1)) > 1950 and extract( " year " ,timestamp(_1)) < 1960; ')
    res_s3select_substr = select(' select count(0) from stdin where int(substr(_1,1,4))>1950 and int(substr(_1,1,4))<1960; ')
    nose.tools.assert_equal(res_s3select_date_time, res_s3select_substr)

    # adding two months and measuring the difference must hold for every row
    res_s3select_date_time = select(' select count(0) from stdin where datediff( " month " ,timestamp(_1),dateadd( " month " ,2,timestamp(_1)) ) == 2; ')
    res_s3select_count = select(' select count(0) from stdin; ')
    nose.tools.assert_equal(res_s3select_date_time, res_s3select_count)

    # adding 366 days is expected to be a one-year difference for every row
    res_s3select_date_time = select(' select count(0) from stdin where datediff( " year " ,timestamp(_1),dateadd( " day " , 366 ,timestamp(_1))) == 1 ; ')
    nose.tools.assert_equal(res_s3select_date_time, res_s3select_count)

    # validate that utcnow integrates correctly with the other date-time functions
    res_s3select_date_time_utcnow = select(' select count(0) from stdin where datediff( " hours " ,utcnow(),dateadd( " day " ,1,utcnow())) == 24 ; ')
    nose.tools.assert_equal(res_s3select_date_time_utcnow, res_s3select_count)
2020-04-09 13:43:11 +00:00
2020-07-06 21:10:50 +00:00
@attr(' s3select ')
def test_csv_parser():
    """Check parsing with the default CSV meta-characters on a hand-written row.

    The row mixes empty fields with quoted fields that contain the column
    delimiter and escaped quote characters; each column is selected on its
    own and compared with the expected text.
    """
    # NOTE: the default meta-characters of s3select are meta-characters for
    # python as well, which is why the backslashes are doubled
    csv_obj = ' ,first,,,second,third= " c31,c32,c33 " ,forth= " 1,2,3,4 " ,fifth= " my_string= \\ " any_value \\ " , my_other_string= \\ " aaaa,bbb \\ " " , ' + " \n "
    csv_obj_name = get_random_string()
    bucket_name = " test "
    upload_csv_object(bucket_name, csv_obj_name, csv_obj)

    expectations = (
        # returned value contains the column delimiter
        (" select _6 from stdin; ", ' third= " c31,c32,c33 " , '),
        (" select _7 from stdin; ", ' forth= " 1,2,3,4 " , '),
        # returned value contains delimiter and quote; the escape character
        # by-passes the quote and is itself removed
        (" select _8 from stdin; ", ' fifth= " my_string= " any_value " , my_other_string= " aaaa,bbb " " , '),
        # NULL as first token
        (" select _1 from stdin; ", ' , '),
        # NULL in the middle of the line
        (" select _3 from stdin; ", ' , '),
        # NULL in the middle of the line (successive)
        (" select _4 from stdin; ", ' , '),
        # NULL at the end of the line
        (" select _9 from stdin; ", ' , '),
    )
    for query, expected in expectations:
        raw_result = run_s3select(bucket_name, csv_obj_name, query)
        res_s3select_alias = remove_xml_tags_from_result(raw_result).replace(" \n ", " ")
        nose.tools.assert_equal(res_s3select_alias, expected)
2020-04-28 16:10:24 +00:00
2020-07-06 21:10:50 +00:00
@attr(' s3select ')
def test_csv_definition():
    """Parse an object that uses non-default field and row delimiters.

    The object is generated with custom delimiters and queried with the same
    delimiters; the row count and three aggregates must match the values
    computed in Python.
    """
    number_of_rows = 10000
    field_delim = " | "
    row_delim = " \t "

    # create the object with the custom field separator and row delimiter
    csv_obj = create_random_csv_object(number_of_rows, 10, field_delim, row_delim)
    csv_obj_name = get_random_string()
    bucket_name = " test "
    upload_csv_object(bucket_name, csv_obj_name, csv_obj)

    # purpose: input with a different csv definition is parsed correctly
    counted = remove_xml_tags_from_result(
        run_s3select(bucket_name, csv_obj_name, " select count(0) from stdin; ", field_delim, row_delim)
    ).replace(" , ", " ")
    nose.tools.assert_equal(number_of_rows, int(counted))

    # purpose: tokens are processed correctly
    res_s3select = remove_xml_tags_from_result(
        run_s3select(bucket_name, csv_obj_name, " select min(int(_1)),max(int(_2)),min(int(_3))+1 from stdin; ", field_delim, row_delim)
    ).replace(" \n ", " ")

    min_1 = min(create_list_of_int(1, csv_obj, field_delim, row_delim))
    max_2 = max(create_list_of_int(2, csv_obj, field_delim, row_delim))
    min_3 = min(create_list_of_int(3, csv_obj, field_delim, row_delim)) + 1
    expected = " {} , {} , {} , ".format(min_1, max_2, min_3)

    nose.tools.assert_equal(res_s3select, expected)
2020-04-28 16:10:24 +00:00
2020-07-06 21:10:50 +00:00
@attr(' s3select ')
def test_schema_definition():
    """Validate queries that rely on the CSV header line (schema).

    Selecting by positional column while ignoring the header must equal
    selecting by header name; an unknown column name and an alias that
    collides with a schema column must each produce their error message.
    """
    number_of_rows = 10000
    csv_obj = create_random_csv_object(number_of_rows, 10, csv_schema=" c1,c2,c3,c4,c5,c6,c7,c8,c9,c10 ")
    csv_obj_name = get_random_string()
    bucket_name = " test "
    upload_csv_object(bucket_name, csv_obj_name, csv_obj)

    def select(statement, header_info):
        raw_result = run_s3select(bucket_name, csv_obj_name, statement, csv_header_info=header_info)
        return remove_xml_tags_from_result(raw_result).replace(" \n ", " ")

    # ignore the schema on the first line and retrieve by generic column number
    res_ignore = select(" select _1,_3 from stdin; ", " IGNORE ")
    # use the schema on the first line; the query refers to the attached schema
    res_use = select(" select c1,c3 from stdin; ", " USE ")
    # result of both queries should be the same
    nose.tools.assert_equal(res_ignore, res_use)

    # a column name that does not exist in the schema
    res_multiple_defintion = select(" select c1,c10,int(c11) from stdin; ", " USE ")
    assert res_multiple_defintion.find(" alias {c11} or column not exist in schema ") > 0

    # an alias name that is identical to a column name
    res_multiple_defintion = select(" select int(c1)+int(c2) as c4,c4 from stdin; ", " USE ")
    assert res_multiple_defintion.find(" multiple definition of column {c4} as schema-column and alias ") > 0
2020-07-03 11:49:13 +00:00
2020-10-13 10:06:29 +00:00
@attr(' s3select ')
def test_when_than_else_expressions():
    """Check case/when expressions against equivalent count queries.

    A constant case expression must yield its first branch. Then the number
    of times each branch label appears in a per-row case projection must
    equal the count returned by a query filtering on the same range.
    """
    csv_obj = create_random_csv_object(10000, 10)
    csv_obj_name = get_random_string()
    bucket_name = " test "
    upload_csv_object(bucket_name, csv_obj_name, csv_obj)

    def select(statement):
        raw_result = run_s3select(bucket_name, csv_obj_name, statement)
        return remove_xml_tags_from_result(raw_result).replace(" \n ", " ")

    res_s3select = select(' select case when ((4*3)==(12)) than " case_1_2 " else " case_2_1 " end from stdin where (3*3==9); ')
    nose.tools.assert_equal(res_s3select, " case_1_2, ")

    res_s3select = select(' select case when cast(_1 as int)>100 and cast(_1 as int)<200 than " (100-200) " when cast(_1 as int)>200 and cast(_1 as int)<300 than " (200-300) " else " NONE " end from s3object; ')
    # occurrences of every branch label in the projected output
    count1 = res_s3select.count(" (100-200) ")
    count2 = res_s3select.count(" (200-300) ")
    count3 = res_s3select.count(" NONE ")

    # the same three groups, counted by the engine through where clauses
    res = select(' select count(*) from s3object where cast(_1 as int)>100 and cast(_1 as int)<200 ; ')
    res1 = select(' select count(*) from s3object where cast(_1 as int)>200 and cast(_1 as int)<300 ; ')
    res2 = select(' select count(*) from s3object where cast(_1 as int)<=100 or cast(_1 as int)>=300 or cast(_1 as int)==200 ; ')

    for label_count, engine_count in ((count1, res), (count2, res1), (count3, res2)):
        nose.tools.assert_equal(str(label_count) + ' , ', engine_count)
@attr(' s3select ')
def test_coalesce_expressions():
    """Check coalesce/nullif by comparing them with equivalent plain queries."""
    csv_obj = create_random_csv_object(10000, 10)
    csv_obj_name = get_random_string()
    bucket_name = " test "
    upload_csv_object(bucket_name, csv_obj_name, csv_obj)

    def select(statement):
        raw_result = run_s3select(bucket_name, csv_obj_name, statement)
        return remove_xml_tags_from_result(raw_result).replace(" \n ", " ")

    # counting rows whose two prefixes are equal, written directly and via
    # coalesce(nullif(...), 7) == 7, is expected to give the same count
    res_s3select = select(' select count(*) from s3object where char_length(_3)>2 and char_length(_4)>2 and cast(substr(_3,1,2) as int) == cast(substr(_4,1,2) as int); ')
    res_null = select(' select count(*) from s3object where cast(_3 as int)>99 and cast(_4 as int)>99 and coalesce(nullif(cast(substr(_3,1,2) as int),cast(substr(_4,1,2) as int)),7) == 7; ')
    nose.tools.assert_equal(res_s3select, res_null)

    # nullif(x, x) arguments are expected to be skipped, leaving _2
    res_s3select = select(' select coalesce(nullif(_5,_5),nullif(_1,_1),_2) from stdin; ')
    res_coalesce = select(' select coalesce(_2) from stdin; ')
    nose.tools.assert_equal(res_s3select, res_coalesce)
@attr(' s3select ')
def test_cast_expressions():
    """Check cast(... as int) filters against char_length filters on the same column.

    Each pair of queries is expected to select the same rows, so their
    counts must be equal.
    """
    csv_obj = create_random_csv_object(10000, 10)
    csv_obj_name = get_random_string()
    bucket_name = " test "
    upload_csv_object(bucket_name, csv_obj_name, csv_obj)

    query_pairs = (
        (' select count(*) from s3object where cast(_3 as int)>999; ',
         ' select count(*) from s3object where char_length(_3)>3; '),
        (' select count(*) from s3object where cast(_3 as int)>99 and cast(_3 as int)<1000; ',
         ' select count(*) from s3object where char_length(_3)==3; '),
    )
    for cast_query, length_query in query_pairs:
        res_s3select = remove_xml_tags_from_result(
            run_s3select(bucket_name, csv_obj_name, cast_query)
        ).replace(" \n ", " ")
        res = remove_xml_tags_from_result(
            run_s3select(bucket_name, csv_obj_name, length_query)
        ).replace(" \n ", " ")
        nose.tools.assert_equal(res_s3select, res)
2020-07-03 11:49:13 +00:00
@attr(' s3select ')
def test_version():
    """Compare the engine's version() output with a fixed expected string.

    NOTE: the unconditional return below makes the rest of the body
    unreachable, so this test currently checks nothing.
    """
    return

    row_count = 1
    csv_text = create_random_csv_object(row_count, 10)
    key = get_random_string()
    bucket = " test "
    upload_csv_object(bucket, key, csv_text)

    raw_result = run_s3select(bucket, key, " select version() from stdin; ")
    res_version = remove_xml_tags_from_result(raw_result).replace(" \n ", " ")

    nose.tools.assert_equal(res_version, " 41.a, ")