forked from TrueCloudLab/s3-tests
DHO QA: Random Object Generation Script
Script to generate garbage objects and push them to a bucket. The script takes a config file on the command line (plus some other command-line options via optparse) and generates a number of objects in an S3 bucket. It also prints public URLs to stdout. The number and sizes of the objects are determined by a YAML config file in which each line looks like this: - [A, B, C], where A is the number of files in the group, B is the mean size of files in the group (in bytes), and C is the standard deviation (normal distribution) of the file sizes. The command-line options are: S3 access key, S3 secret key, a seed for the PRNG, an output file to write the URLs to, and a flag to add an MD5 checksum to the URL list.
This commit is contained in:
parent
262f1eecd1
commit
3a3cbb3d25
3 changed files with 220 additions and 0 deletions
3
generate_objects.conf
Normal file
3
generate_objects.conf
Normal file
|
@ -0,0 +1,3 @@
|
||||||
|
- [3, 20, 5]
|
||||||
|
- [3, 30, 2]
|
||||||
|
|
124
generate_objects.py
Executable file
124
generate_objects.py
Executable file
|
@ -0,0 +1,124 @@
|
||||||
|
#! /usr/bin/python
|
||||||
|
|
||||||
|
from boto.s3.connection import OrdinaryCallingFormat
|
||||||
|
from boto.s3.connection import S3Connection
|
||||||
|
from boto.s3.key import Key
|
||||||
|
from optparse import OptionParser
|
||||||
|
from realistic import RandomContentFile
|
||||||
|
import realistic
|
||||||
|
import random
|
||||||
|
import yaml
|
||||||
|
import boto
|
||||||
|
import sys
|
||||||
|
|
||||||
|
DHO_HOST = 'objects.dreamhost.com'


def parse_opts():
    """Build the command-line interface and parse sys.argv.

    Returns the ``(options, args)`` pair from optparse; positional
    arguments are the YAML config files to read.
    """
    parser = OptionParser()
    parser.add_option('-O', '--outfile', help='write output to FILE. Defaults to STDOUT', metavar='FILE')
    parser.add_option('-a', '--access-key', dest='access_key', help='use S3 access key KEY', metavar='KEY')
    parser.add_option('-s', '--secret-key', dest='secret_key', help='use S3 secret key KEY', metavar='KEY')
    parser.add_option('-b', '--bucket', dest='bucket', help='push objects to BUCKET', metavar='BUCKET')
    parser.add_option('--checksum', dest='checksum', action='store_true', help='include the md5 checksum with the object urls')
    parser.add_option('--host', dest='host', help='use S3 gateway at HOST', metavar='HOST')
    parser.add_option('--seed', dest='seed', help='optional seed for the random number generator')

    # Talk to the DreamHost gateway unless --host overrides it.
    parser.set_defaults(host=DHO_HOST)

    return parser.parse_args()
|
||||||
|
|
||||||
|
|
||||||
|
def parse_config(config_files):
    """Read and concatenate the YAML object-group configurations.

    Each file must contain a YAML list of ``[count, mean, stddev]``
    entries; the lists from all files are concatenated in order.

    :param config_files: iterable of config file paths
    :return: combined list of configuration entries
    """
    configurations = []
    for path in config_files:
        # 'with' guarantees the handle is closed even if YAML parsing
        # raises (the original leaked the handle on that path); 'path'
        # also avoids shadowing the builtin 'file'.
        with open(path, 'r') as conf_file:
            # safe_load only builds plain Python objects (lists, numbers,
            # strings) — all these configs contain — whereas yaml.load on
            # untrusted input can construct arbitrary objects.
            configurations = configurations + yaml.safe_load(conf_file.read())
    return configurations
|
||||||
|
|
||||||
|
|
||||||
|
def get_bucket(conn, existing_bucket):
    """Return a bucket to fill with generated objects.

    When ``existing_bucket`` names a bucket it is fetched and returned;
    otherwise a new bucket with a random 64-bit hex name is created and
    made publicly readable so the generated URLs can be fetched.
    """
    if not existing_bucket:
        # No bucket requested: invent a random hex name and open it up.
        name = '%x' % random.getrandbits(64)
        new_bucket = conn.create_bucket(name)
        new_bucket.set_acl('public-read')
        return new_bucket
    return conn.get_bucket(existing_bucket)
|
||||||
|
|
||||||
|
|
||||||
|
def connect_s3(host, access_key, secret_key):
    """Open an unencrypted S3 connection to ``host`` with the given keys."""
    # OrdinaryCallingFormat keeps the bucket name in the URL path rather
    # than the hostname, which the gateway expects.
    return S3Connection(
        calling_format=OrdinaryCallingFormat(),
        is_secure=False,
        host=host,
        aws_access_key_id=access_key,
        aws_secret_access_key=secret_key,
        )
|
||||||
|
|
||||||
|
|
||||||
|
def generate_objects(bucket, quantity, mean, stddev, seed, checksum=False):
    """Upload ``quantity`` random objects to ``bucket``.

    Object sizes are drawn from a normal distribution with the given
    mean and standard deviation (in bytes); contents and names come from
    the ``realistic`` helpers, seeded for reproducibility.

    IN:
      boto S3 bucket object
      Number of files
      mean file size in bytes
      standard deviation from mean file size
      seed for RNG
      flag to tell the method to append md5 checksums to the output

    OUT:
      list of urls (strings) to objects valid for 1 hour.
      If "checksum" is true, each output string consists of the url
      followed by the md5 checksum.
    """
    content_source = realistic.files(mean, stddev, seed)
    name_source = realistic.names(15, 4, seed=seed)
    urls = []
    for _ in range(quantity):
        fp = next(content_source)
        sys.stderr.write('sending file with size %dB\n' % fp.size)
        key = Key(bucket)
        key.key = next(name_source)
        key.set_contents_from_file(fp)
        # Signed GET url; expires after one hour.
        entry = key.generate_url(3600)
        if checksum:
            entry += ' %s' % key.md5
        urls.append(entry)

    return urls
|
||||||
|
|
||||||
|
|
||||||
|
def main():
    """Drive the object-generation run described by the command line.

    Parses options, connects to S3, pushes the objects described by the
    YAML config files given as positional arguments, stores the full URL
    list in the bucket under the key ``urls``, and also writes it to the
    chosen output file (or stdout).
    """
    (options, args) = parse_opts()

    # SETUP
    # 'or None' preserves the original behavior: an absent/empty --seed
    # means "seed from system entropy".
    random.seed(options.seed or None)
    outfile = open(options.outfile, 'w') if options.outfile else sys.stdout
    try:
        conn = connect_s3(options.host, options.access_key, options.secret_key)
        bucket = get_bucket(conn, options.bucket)
        urls = []

        outfile.write('bucket: %s\n' % bucket.name)
        sys.stderr.write('setup complete, generating files\n')
        for profile in parse_config(args):
            # Derive an independent per-group seed from the master PRNG so
            # whole runs are reproducible when --seed is given.
            seed = random.random()
            urls += generate_objects(bucket, profile[0], profile[1], profile[2], seed, options.checksum)
        sys.stderr.write('finished sending files. Saving urls to S3\n')

        url_string = '\n'.join(urls)
        url_key = Key(bucket)
        url_key.key = 'urls'
        url_key.set_contents_from_string(url_string)
        outfile.write(url_string + '\n')
        sys.stderr.write('done\n')
    finally:
        # The original never closed the output file; close it here, but
        # never close stdout.
        if outfile is not sys.stdout:
            outfile.close()


if __name__ == '__main__':
    main()
|
||||||
|
|
93
realistic.py
Normal file
93
realistic.py
Normal file
|
@ -0,0 +1,93 @@
|
||||||
|
import hashlib
|
||||||
|
import random
|
||||||
|
import string
|
||||||
|
|
||||||
|
class RandomContentFile(object):
    """Shallow file-like object of pseudo-random bytes.

    The stream is ``size`` bytes long; the final ``digest_size`` bytes
    (fewer when size < digest_size) are the MD5 digest of everything
    before them, so a downloader can verify round-trip integrity.
    Only seek(0), tell(), and read() are supported.
    """

    def __init__(self, size, seed):
        # Seed is kept so seek(0) can replay the identical byte stream.
        self.seed = seed
        self.random = random.Random(self.seed)
        # Current read position within the virtual file.
        self.offset = 0
        self.size = size
        # Running MD5 of the random payload; set to None once the digest
        # has been frozen (see read()).
        self.hash = hashlib.md5()
        self.digest_size = self.hash.digest_size
        self.digest = None

    def seek(self, offset):
        # Only a full rewind is supported (boto calls seek(0) before
        # retrying an upload); reseeding replays the same sequence.
        assert offset == 0
        self.random.seed(self.seed)
        self.offset = offset

    def tell(self):
        # File-protocol position, in bytes from the start.
        return self.offset

    def read(self, size=-1):
        # A negative size means "read to EOF", like a real file.
        if size < 0:
            size = self.size - self.offset

        r = []

        # First serve bytes from the random payload region, which ends
        # digest_size bytes before EOF.
        random_count = min(size, self.size - self.offset - self.digest_size)
        if random_count > 0:
            self.offset += random_count
            size -= random_count
            data = ''.join(chr(self.random.getrandbits(8)) for _ in xrange(random_count))
            if self.hash is not None:
                self.hash.update(data)
            r.append(data)

        # Then serve bytes from the trailing MD5 digest of the payload.
        digest_count = min(size, self.size - self.offset)
        if digest_count > 0:
            if self.digest is None:
                # Freeze the digest on first use and drop the hash object
                # so a later seek(0)/re-read cannot alter it.
                self.digest = self.hash.digest()
                self.hash = None
            self.offset += digest_count
            size -= digest_count
            data = self.digest[:digest_count]
            r.append(data)

        return ''.join(r)
|
||||||
|
|
||||||
|
def files(mean, stddev, seed=None):
    """Endlessly yield file-like objects with effectively random contents.

    The size of each file follows the normal distribution with `mean`
    and `stddev`; negative draws are rejected and redrawn.

    Beware, the file-likeness is very shallow. You can use boto's
    `key.set_contents_from_file` to send these to S3, but they are not
    full file objects.

    The last 128 bits are the MD5 digest of the previous bytes, for
    verifying round-trip data integrity. For example, if you
    re-download the object and place the contents into a file called
    ``foo``, the following should print two identical lines:

      python -c 'import sys, hashlib; data=sys.stdin.read(); print hashlib.md5(data[:-16]).hexdigest(); print "".join("%02x" % ord(c) for c in data[-16:])' <foo

    Except for objects shorter than 16 bytes, where the second line
    will be proportionally shorter.
    """
    rand = random.Random(seed)
    while True:
        # Rejection-sample a non-negative size from the distribution.
        size = -1
        while size < 0:
            size = int(rand.normalvariate(mean, stddev))
        # Each file gets its own content seed so it is reproducible on
        # its own while the stream stays driven by `seed`.
        yield RandomContentFile(size=size, seed=rand.getrandbits(32))
|
||||||
|
|
||||||
|
def names(mean, stddev, charset=None, seed=None):
    """Endlessly yield strings that are somewhat plausible as file names.

    The length of each filename follows the normal distribution with
    `mean` and `stddev`; negative draws are rejected and redrawn.

    :param mean: mean name length
    :param stddev: standard deviation of the name length
    :param charset: characters to draw from (default: ASCII lowercase)
    :param seed: optional PRNG seed for a reproducible name stream
    """
    if charset is None:
        charset = string.ascii_lowercase
    rand = random.Random(seed)
    while True:
        while True:
            length = int(rand.normalvariate(mean, stddev))
            if length >= 0:
                break
        # range (not the Python-2-only xrange) so the generator also runs
        # under Python 3; iteration behavior is identical under Python 2.
        name = ''.join(rand.choice(charset) for _ in range(length))
        yield name
|
Loading…
Reference in a new issue