DHO QA: Random Object Generation Script

Script to generate garbage objects and push them to a bucket.

Script takes a config file on the command line (and some other command line
options using optparse) and generates a bunch of objects in an S3 bucket.
Also prints public URLs to stdout.

The number and sizes of the objects are determined by a YAML config file in
which each line looks like this:
- [A, B, C]
A: Number of files in this group
B: Mean size of files in this group (in bytes)
C: Standard deviation (normal distribution) of file sizes in this group

Command-line options are:
 - S3 access key
 - S3 secret key
 - seed for PRNG
 - output file to write URLs to
 - flag to add md5 checksum to url list
This commit is contained in:
Kyle Marsh 2011-06-29 11:16:42 -07:00
parent 262f1eecd1
commit 3a3cbb3d25
3 changed files with 220 additions and 0 deletions

3
generate_objects.conf Normal file
View file

@ -0,0 +1,3 @@
- [3, 20, 5]
- [3, 30, 2]

124
generate_objects.py Executable file
View file

@ -0,0 +1,124 @@
#! /usr/bin/python
from boto.s3.connection import OrdinaryCallingFormat
from boto.s3.connection import S3Connection
from boto.s3.key import Key
from optparse import OptionParser
from realistic import RandomContentFile
import realistic
import random
import yaml
import boto
import sys
# S3 gateway host used when --host is not given on the command line.
DHO_HOST = 'objects.dreamhost.com'


def parse_opts():
    """Parse the command line with optparse.

    Returns the ``(options, args)`` pair produced by
    ``OptionParser.parse_args()``; ``options.host`` falls back to
    ``DHO_HOST`` when ``--host`` is not supplied.
    """
    option_table = (
        (('-O', '--outfile'),
         dict(help='write output to FILE. Defaults to STDOUT', metavar='FILE')),
        (('-a', '--access-key'),
         dict(dest='access_key', help='use S3 access key KEY', metavar='KEY')),
        (('-s', '--secret-key'),
         dict(dest='secret_key', help='use S3 secret key KEY', metavar='KEY')),
        (('-b', '--bucket'),
         dict(dest='bucket', help='push objects to BUCKET', metavar='BUCKET')),
        (('--checksum',),
         dict(dest='checksum', action='store_true',
              help='include the md5 checksum with the object urls')),
        (('--host',),
         dict(dest='host', help='use S3 gateway at HOST', metavar='HOST')),
        (('--seed',),
         dict(dest='seed', help='optional seed for the random number generator')),
    )

    parser = OptionParser()
    # Register the options in table order so the --help listing is stable.
    for flags, settings in option_table:
        parser.add_option(*flags, **settings)
    parser.set_defaults(host=DHO_HOST)
    return parser.parse_args()
def parse_config(config_files):
    """Load and concatenate the object profiles from YAML config files.

    IN:
        config_files: iterable of paths to YAML files; each file is
            expected to hold a list of profiles (main() reads the first
            three entries of every profile).
    OUT:
        a single list holding the profiles of all files, in the order
        the files were given. Empty files contribute nothing.
    """
    configurations = []
    for path in config_files:
        # 'with' closes the file even when reading or parsing raises.
        with open(path, 'r') as config_file:
            # NOTE(review): yaml.load can construct arbitrary Python
            # objects; consider yaml.safe_load if config files may come
            # from an untrusted source.
            profiles = yaml.load(config_file.read())
        if profiles is None:
            # An empty YAML document parses to None; nothing to add.
            continue
        # Plain list concatenation (rather than extend) keeps rejecting
        # documents whose top level is not a list.
        configurations = configurations + profiles
    return configurations
def get_bucket(conn, existing_bucket):
    """Return the bucket to upload into.

    When `existing_bucket` is truthy it is looked up on `conn` and
    returned untouched. Otherwise a new bucket named with 64 random bits
    in hex (taken from the module-level `random` generator) is created
    and its ACL is set to 'public-read'.
    """
    if not existing_bucket:
        random_name = '%x' % random.getrandbits(64)
        created = conn.create_bucket(random_name)
        created.set_acl('public-read')
        return created
    return conn.get_bucket(existing_bucket)
def connect_s3(host, access_key, secret_key):
    """Build an S3Connection to `host` using the given credentials.

    The connection is created with is_secure=False and an
    OrdinaryCallingFormat instance as its calling format.
    """
    settings = dict(
        aws_access_key_id=access_key,
        aws_secret_access_key=secret_key,
        host=host,
        is_secure=False,
        calling_format=OrdinaryCallingFormat(),
    )
    return S3Connection(**settings)
def generate_objects(bucket, quantity, mean, stddev, seed, checksum=False):
    """Upload `quantity` random objects to `bucket` and return their URLs.

    Object sizes are drawn by realistic.files() from a normal
    distribution described by `mean` and `stddev` (bytes); object names
    come from realistic.names(15, 4). Both generators are seeded with
    `seed`.

    IN:
        bucket:   boto S3 bucket object
        quantity: number of objects to create
        mean:     mean object size in bytes
        stddev:   standard deviation of the object size
        seed:     seed for the size/content and name generators
        checksum: when true, append the md5 checksum to each entry
    OUT:
        list of strings, one per object: a URL valid for one hour,
        followed by a space and the key's md5 when `checksum` is set.
    """
    contents = realistic.files(mean, stddev, seed)
    object_names = realistic.names(15, 4, seed=seed)

    urls = []
    for _ in range(quantity):
        content = next(contents)
        sys.stderr.write('sending file with size %dB' % content.size + '\n')

        key = Key(bucket)
        key.key = next(object_names)
        key.set_contents_from_file(content)

        entry = key.generate_url(3600)  # URL stays valid for one hour
        if checksum:
            entry = entry + ' %s' % key.md5
        urls.append(entry)
    return urls
def main():
    """Generate the configured objects and report their URLs.

    Steps:
      1. parse the command line and seed the module-level RNG,
      2. open the output file (or use stdout) and connect to S3,
      3. for every profile in the config files given as positional
         arguments, upload the objects via generate_objects(),
      4. store the newline-joined URL list in the bucket under the key
         'urls' and write it to the output.

    Progress messages go to stderr. The output file, when one was
    opened, is closed even if an upload fails.
    """
    (options, args) = parse_opts()

    # SETUP
    random.seed(options.seed if options.seed else None)

    if options.outfile:
        outfile = open(options.outfile, 'w')
    else:
        outfile = sys.stdout

    try:
        conn = connect_s3(options.host, options.access_key, options.secret_key)
        bucket = get_bucket(conn, options.bucket)
        urls = []

        outfile.write('bucket: %s' % bucket.name + '\n')
        sys.stderr.write('setup complete, generating files' + '\n')
        for profile in parse_config(args):
            # Every profile gets its own seed, derived from the global RNG.
            seed = random.random()
            urls += generate_objects(bucket, profile[0], profile[1],
                                     profile[2], seed, options.checksum)
        sys.stderr.write('finished sending files. Saving urls to S3' + '\n')

        url_string = '\n'.join(urls)
        url_key = Key(bucket)
        url_key.key = 'urls'
        url_key.set_contents_from_string(url_string)
        outfile.write(url_string + '\n')
        sys.stderr.write('done' + '\n')
    finally:
        # Only close what this function opened; stdout must stay usable.
        if outfile is not sys.stdout:
            outfile.close()


if __name__ == '__main__':
    main()

93
realistic.py Normal file
View file

@ -0,0 +1,93 @@
import hashlib
import random
import string
class RandomContentFile(object):
    """Shallow read-only file-like object with seeded pseudo-random content.

    The content is `size` bytes long: pseudo-random bytes followed by the
    MD5 digest of those bytes. When `size` is smaller than the digest,
    the content is just the first `size` bytes of the digest of empty
    data. Only seek(0), tell() and read() are supported.
    """

    def __init__(self, size, seed):
        """Create a file of `size` bytes whose content derives from `seed`."""
        self.seed = seed
        self.random = random.Random(self.seed)
        self.offset = 0
        self.size = size
        self.hash = hashlib.md5()
        self.digest_size = self.hash.digest_size
        # Set once the random part has been read completely; self.hash is
        # dropped (None) at that point.
        self.digest = None

    def seek(self, offset):
        """Rewind to the start of the file; only offset 0 is supported."""
        assert offset == 0
        self.random.seed(self.seed)
        self.offset = offset
        if self.digest is None:
            # The digest is not final yet, so the hash may hold a partial
            # prefix; restart it or the re-read bytes would be hashed twice.
            self.hash = hashlib.md5()

    def tell(self):
        """Return the current read position."""
        return self.offset

    def read(self, size=-1):
        """Return up to `size` bytes; a negative `size` reads to the end.

        Returns an empty byte string once the end has been reached.
        """
        if size < 0:
            size = self.size - self.offset

        chunks = []

        # Random part: everything before the trailing digest.
        random_count = min(size, self.size - self.offset - self.digest_size)
        if random_count > 0:
            self.offset += random_count
            size -= random_count
            data = bytes(bytearray(
                self.random.getrandbits(8) for _ in range(random_count)))
            if self.hash is not None:
                self.hash.update(data)
            chunks.append(data)

        # Digest part: the (possibly truncated) MD5 of the random part.
        digest_count = min(size, self.size - self.offset)
        if digest_count > 0:
            if self.digest is None:
                self.digest = self.hash.digest()
                self.hash = None
            # Resume inside the digest where the previous read stopped, so
            # a caller reading in small chunks gets the digest in order.
            digest_start = self.offset - max(0, self.size - self.digest_size)
            self.offset += digest_count
            size -= digest_count
            chunks.append(self.digest[digest_start:digest_start + digest_count])

        return b''.join(chunks)
def files(mean, stddev, seed=None):
    """
    Generate an endless stream of file-like objects with effectively
    random contents whose sizes follow the normal distribution given by
    `mean` and `stddev`. Negative size draws are discarded and redrawn.

    The objects are only superficially file-like: they are good enough
    for boto's `key.set_contents_from_file`, but they are not full file
    objects.

    Each object ends with the 128-bit MD5 digest of the bytes before it,
    so a round trip can be verified. For example, after downloading an
    object into a file called ``foo``, this prints two identical lines:

      python -c 'import sys, hashlib; data=sys.stdin.read(); print hashlib.md5(data[:-16]).hexdigest(); print "".join("%02x" % ord(c) for c in data[-16:])' <foo

    For objects shorter than 16 bytes the second line is proportionally
    shorter.
    """
    rng = random.Random(seed)
    while True:
        size = int(rng.normalvariate(mean, stddev))
        if size < 0:
            # Not a usable size; draw again.
            continue
        yield RandomContentFile(size=size, seed=rng.getrandbits(32))
def names(mean, stddev, charset=None, seed=None):
    """
    Yields strings that are somewhat plausible as file names, where
    the length of each filename follows the normal distribution with
    `mean` and `stddev`.

    Characters are picked from `charset` (lowercase ASCII letters when
    omitted) using a generator seeded with `seed`. Length draws below 1
    are discarded and redrawn, so an empty name is never produced.
    """
    if charset is None:
        charset = string.ascii_lowercase
    rand = random.Random(seed)
    while True:
        length = int(rand.normalvariate(mean, stddev))
        if length <= 0:
            # An empty string is not a usable file/object name; redraw.
            continue
        yield ''.join(rand.choice(charset) for _ in range(length))