forked from TrueCloudLab/s3-tests
DHO QA: Random Object Generation Script
Script to generate garbage objects and push them to a bucket. The script takes a config file on the command line (and some other command-line options, parsed using optparse) and generates a bunch of objects in an S3 bucket. It also prints public URLs to stdout.

The number and sizes of the objects are determined by a YAML config file, with each line looking like this:
- [A, B, C]
  A: Number of files in this group
  B: Mean size of files in this group (in bytes)
  C: Standard deviation (normal distribution) of file sizes in this group

Command-line options are:
- S3 access key
- S3 secret key
- seed for PRNG
- output file to write URLs to
- flag to add md5 checksum to url list
This commit is contained in:
parent
262f1eecd1
commit
3a3cbb3d25
3 changed files with 220 additions and 0 deletions
3
generate_objects.conf
Normal file
3
generate_objects.conf
Normal file
|
@ -0,0 +1,3 @@
|
|||
- [3, 20, 5]
|
||||
- [3, 30, 2]
|
||||
|
124
generate_objects.py
Executable file
124
generate_objects.py
Executable file
|
@ -0,0 +1,124 @@
|
|||
#! /usr/bin/python
|
||||
|
||||
from boto.s3.connection import OrdinaryCallingFormat
|
||||
from boto.s3.connection import S3Connection
|
||||
from boto.s3.key import Key
|
||||
from optparse import OptionParser
|
||||
from realistic import RandomContentFile
|
||||
import realistic
|
||||
import random
|
||||
import yaml
|
||||
import boto
|
||||
import sys
|
||||
|
||||
# Default S3 gateway host; parse_opts() installs it as the default for --host.
DHO_HOST = 'objects.dreamhost.com'
|
||||
|
||||
def parse_opts():
    """Build the option parser and parse the process command line.

    OUT:
        the ``(options, args)`` pair returned by OptionParser.parse_args();
        ``options.host`` defaults to DHO_HOST when --host is not given.
    """
    # (flags, keyword settings) for each option, in --help display order.
    option_specs = (
        (('-O', '--outfile'),
         dict(help='write output to FILE. Defaults to STDOUT',
              metavar='FILE')),
        (('-a', '--access-key'),
         dict(dest='access_key', help='use S3 access key KEY',
              metavar='KEY')),
        (('-s', '--secret-key'),
         dict(dest='secret_key', help='use S3 secret key KEY',
              metavar='KEY')),
        (('-b', '--bucket'),
         dict(dest='bucket', help='push objects to BUCKET',
              metavar='BUCKET')),
        (('--checksum',),
         dict(dest='checksum', action='store_true',
              help='include the md5 checksum with the object urls')),
        (('--host',),
         dict(dest='host', help='use S3 gateway at HOST', metavar='HOST')),
        (('--seed',),
         dict(dest='seed',
              help='optional seed for the random number generator')),
    )

    option_parser = OptionParser()
    for flags, settings in option_specs:
        option_parser.add_option(*flags, **settings)
    option_parser.set_defaults(host=DHO_HOST)

    return option_parser.parse_args()
|
||||
|
||||
|
||||
def parse_config(config_files):
    """Load and concatenate the object-group profiles from YAML config files.

    IN:
        iterable of paths to YAML config files; each file is expected to
        contain a top-level list (one entry per group of objects)
    OUT:
        a single list holding the entries of every file, in the order the
        files were given. An empty file contributes nothing.
    """
    configurations = []
    for path in config_files:
        # ``with`` closes the file even if reading or parsing raises.
        with open(path, 'r') as config_file:
            # NOTE: this replaces the original yaml.load() call. safe_load
            # only builds plain YAML types, so a config file cannot make the
            # loader instantiate arbitrary Python objects.
            profiles = yaml.safe_load(config_file)
        # An empty document parses to None; treat it as "no profiles"
        # instead of failing on list + None.
        if profiles is None:
            profiles = []
        configurations = configurations + profiles
    return configurations
|
||||
|
||||
|
||||
def get_bucket(conn, existing_bucket):
    """Return the bucket that objects should be pushed to.

    IN:
        conn: connection object providing get_bucket() / create_bucket()
        existing_bucket: name of a bucket to reuse, or a false value
    OUT:
        the bucket looked up by name when `existing_bucket` is given;
        otherwise a newly created bucket whose name is 64 random bits in
        hex and whose ACL has been set to 'public-read'.
    """
    if not existing_bucket:
        random_name = '%x' % random.getrandbits(64)
        new_bucket = conn.create_bucket(random_name)
        new_bucket.set_acl('public-read')
        return new_bucket

    return conn.get_bucket(existing_bucket)
|
||||
|
||||
|
||||
def connect_s3(host, access_key, secret_key):
    """Create an S3Connection to `host` using the given credentials.

    The connection is built with OrdinaryCallingFormat and is_secure=False.

    IN:
        host: S3 gateway host name
        access_key: S3 access key
        secret_key: S3 secret key
    OUT:
        the new S3Connection object
    """
    connection_settings = dict(
        calling_format=OrdinaryCallingFormat(),
        is_secure=False,
        host=host,
        aws_access_key_id=access_key,
        aws_secret_access_key=secret_key,
    )
    return S3Connection(**connection_settings)
|
||||
|
||||
|
||||
def generate_objects(bucket, quantity, mean, stddev, seed, checksum=False):
    """Upload randomly generated objects to a bucket and return their URLs.

    Object sizes follow a normal distribution described by `mean` and
    `stddev`; object names come from realistic.names(15, 4). Both
    generators are seeded with `seed`.

    IN:
        bucket: boto S3 bucket object
        quantity: number of files
        mean: mean file size in bytes
        stddev: standard deviation from mean file size
        seed: seed for RNG
        checksum: flag to append md5 checksums to the output
    OUT:
        list of urls (strings) to objects valid for 1 hour.
        If `checksum` is true, each output string consists of the url
        followed by a space and the md5 checksum.
    """
    file_generator = realistic.files(mean, stddev, seed)
    name_generator = realistic.names(15, 4, seed=seed)

    def upload_next():
        # Send one generated file under a generated name; return its URL line.
        content = next(file_generator)
        print >> sys.stderr, 'sending file with size %dB' % content.size
        s3_key = Key(bucket)
        s3_key.key = next(name_generator)
        s3_key.set_contents_from_file(content)
        line = s3_key.generate_url(3600)  # valid for 1 hour
        if checksum:
            line = line + ' %s' % s3_key.md5
        return line

    return [upload_next() for _ in xrange(quantity)]
|
||||
|
||||
|
||||
def main():
    """Script entry point: generate the configured objects and report URLs.

    Steps:
      * parse the command line and seed the module-level RNG with --seed
        (or let it self-seed when no seed is given);
      * connect to S3 and obtain the target bucket;
      * write 'bucket: <name>' to the output;
      * for every profile found in the config files named on the command
        line, draw a fresh seed and upload that group of objects;
      * store the newline-joined URL list in the bucket under the key
        'urls' and write it to the output.

    Output goes to the --outfile file when given, otherwise to stdout.
    Progress messages go to stderr.
    """
    (options, args) = parse_opts()

    # SETUP
    random.seed(options.seed if options.seed else None)
    if options.outfile:
        outfile = open(options.outfile, 'w')
    else:
        outfile = sys.stdout

    try:
        conn = connect_s3(options.host, options.access_key, options.secret_key)
        bucket = get_bucket(conn, options.bucket)
        urls = []

        print >> outfile, 'bucket: %s' % bucket.name
        print >> sys.stderr, 'setup complete, generating files'
        for profile in parse_config(args):
            # Each group gets its own seed, derived from the main RNG.
            seed = random.random()
            urls += generate_objects(bucket, profile[0], profile[1], profile[2], seed, options.checksum)
        print >> sys.stderr, 'finished sending files. Saving urls to S3'

        url_string = '\n'.join(urls)
        url_key = Key(bucket)
        url_key.key = 'urls'
        url_key.set_contents_from_string(url_string)
        print >> outfile, url_string
    finally:
        # Flush and release the output file even when an upload fails;
        # stdout is left open for the rest of the process.
        if outfile is not sys.stdout:
            outfile.close()
    print >> sys.stderr, 'done'


if __name__ == '__main__':
    main()
|
||||
|
93
realistic.py
Normal file
93
realistic.py
Normal file
|
@ -0,0 +1,93 @@
|
|||
import hashlib
|
||||
import random
|
||||
import string
|
||||
|
||||
class RandomContentFile(object):
    """Shallow, read-only file-like object with reproducible random content.

    The content is `size` bytes long: pseudo-random bytes produced by a
    generator seeded with `seed`, followed by the MD5 digest of those
    bytes. When `size` is smaller than the digest, the content is just the
    first `size` bytes of the MD5 digest of the empty string.

    Only seek(0), tell() and read() are implemented.
    """

    def __init__(self, size, seed):
        self.seed = seed
        self.random = random.Random(self.seed)
        self.offset = 0
        self.size = size
        self.hash = hashlib.md5()
        self.digest_size = self.hash.digest_size
        self.digest = None

    def seek(self, offset):
        """Rewind to the beginning; only offset 0 is supported."""
        assert offset == 0
        self.random.seed(self.seed)
        self.offset = offset
        # Restart the checksum too. Without this, rewinding after a partial
        # read would feed the already-read bytes into the hash a second
        # time and produce a wrong trailing digest. Re-reading regenerates
        # the same bytes, so the recomputed digest is unchanged otherwise.
        self.hash = hashlib.md5()
        self.digest = None

    def tell(self):
        """Return the current read position in bytes."""
        return self.offset

    def read(self, size=-1):
        """Return up to `size` bytes; a negative `size` reads to the end.

        Returns an empty byte string once the end has been reached.
        """
        if size < 0:
            size = self.size - self.offset

        pieces = []

        # Part 1: pseudo-random payload (everything before the digest).
        random_count = min(size, self.size - self.offset - self.digest_size)
        if random_count > 0:
            self.offset += random_count
            size -= random_count
            getrandbits = self.random.getrandbits
            payload = bytearray()
            while len(payload) < random_count:
                payload.append(getrandbits(8))
            # bytes() gives a byte string on Python 2 and 3 alike, which is
            # what hashlib requires.
            data = bytes(payload)
            if self.hash is not None:
                self.hash.update(data)
            pieces.append(data)

        # Part 2: trailing MD5 digest of the payload.
        digest_count = min(size, self.size - self.offset)
        if digest_count > 0:
            if self.digest is None:
                self.digest = self.hash.digest()
                self.hash = None
            # Position inside the digest where this read resumes, so that
            # the digest can be returned across several read() calls.
            digest_start = self.offset - max(self.size - self.digest_size, 0)
            self.offset += digest_count
            size -= digest_count
            data = self.digest[digest_start:digest_start + digest_count]
            pieces.append(data)

        return b''.join(pieces)
|
||||
|
||||
def files(mean, stddev, seed=None):
    """
    Endless generator of file-like objects with effectively random
    contents. Each file's size is drawn from the normal distribution
    given by `mean` and `stddev`, truncated to an integer; negative
    draws are discarded and redrawn.

    Beware, the file-likeness is very shallow. The objects can be sent
    to S3 with boto's `key.set_contents_from_file`, but they are not
    full file objects.

    The final 128 bits of each file are the MD5 digest of the bytes
    before them, for verifying round-trip data integrity. For example,
    after re-downloading an object into a file called ``foo``, the
    following should print two identical lines:

      python -c 'import sys, hashlib; data=sys.stdin.read(); print hashlib.md5(data[:-16]).hexdigest(); print "".join("%02x" % ord(c) for c in data[-16:])' <foo

    Except for objects shorter than 16 bytes, where the second line
    will be proportionally shorter.
    """
    rng = random.Random(seed)
    while True:
        # Redraw until the truncated size is non-negative.
        size = -1
        while size < 0:
            size = int(rng.normalvariate(mean, stddev))
        content_seed = rng.getrandbits(32)
        yield RandomContentFile(size=size, seed=content_seed)
|
||||
|
||||
def names(mean, stddev, charset=None, seed=None):
    """
    Yields strings that are somewhat plausible as file names, where
    the length of each filename follows the normal distribution with
    `mean` and `stddev`.

    Drawn lengths are truncated to integers and redrawn until they are
    positive, so an empty name is never produced. Characters are picked
    from `charset` (lowercase ASCII letters by default). The same `seed`
    always yields the same sequence of names.
    """
    if charset is None:
        charset = string.ascii_lowercase
    rand = random.Random(seed)
    while True:
        # Reject zero as well as negative lengths: an empty string is not
        # usable as an object name.
        length = 0
        while length <= 0:
            length = int(rand.normalvariate(mean, stddev))
        yield ''.join(rand.choice(charset) for _ in range(length))
|
Loading…
Reference in a new issue