From 3a3cbb3d25f26fe71716773766289524d7be7c70 Mon Sep 17 00:00:00 2001 From: Kyle Marsh Date: Wed, 29 Jun 2011 11:16:42 -0700 Subject: [PATCH] DHO QA: Random Object Generation Script Script to generate garbage objects and push them to a bucket. Script takes a config file on the command line (and some other command line options using optparse) and generates a bunch of objects in an S3 bucket. Also prints public URLs to stdout. Number and sizes of the objects are determined by a yaml config file with each line looking like this: - [A, B, C] A: Number of files in this group B: Mean size of files in this group (in bytes) C: Standard deviation (normal distribution) of file sizes in this group command line options are: - S3 access key - S3 secret key - seed for PRNG - output file to write URLs to - flag to add md5 checksum to url list --- generate_objects.conf | 3 + generate_objects.py | 124 ++++++++++++++++++++++++++++++++++++++++++ realistic.py | 93 +++++++++++++++++++++++++++++++ 3 files changed, 220 insertions(+) create mode 100644 generate_objects.conf create mode 100755 generate_objects.py create mode 100644 realistic.py diff --git a/generate_objects.conf b/generate_objects.conf new file mode 100644 index 0000000..2be00e3 --- /dev/null +++ b/generate_objects.conf @@ -0,0 +1,3 @@ +- [3, 20, 5] +- [3, 30, 2] + diff --git a/generate_objects.py b/generate_objects.py new file mode 100755 index 0000000..e0c94c1 --- /dev/null +++ b/generate_objects.py @@ -0,0 +1,124 @@ +#! /usr/bin/python + +from boto.s3.connection import OrdinaryCallingFormat +from boto.s3.connection import S3Connection +from boto.s3.key import Key +from optparse import OptionParser +from realistic import RandomContentFile +import realistic +import random +import yaml +import boto +import sys + +DHO_HOST = 'objects.dreamhost.com' + +def parse_opts(): + parser = OptionParser(); + parser.add_option('-O' , '--outfile', help='write output to FILE. 
Defaults to STDOUT', metavar='FILE') + parser.add_option('-a' , '--access-key', dest='access_key', help='use S3 access key KEY', metavar='KEY') + parser.add_option('-s' , '--secret-key', dest='secret_key', help='use S3 secret key KEY', metavar='KEY') + parser.add_option('-b' , '--bucket', dest='bucket', help='push objects to BUCKET', metavar='BUCKET') + parser.add_option('--checksum', dest='checksum', action='store_true', help='include the md5 checksum with the object urls') + parser.add_option('--host', dest='host', help='use S3 gateway at HOST', metavar='HOST') + parser.add_option('--seed', dest='seed', help='optional seed for the random number generator') + + parser.set_defaults(host=DHO_HOST) + + return parser.parse_args() + + +def parse_config(config_files): + configurations = [] + for file in config_files: + FILE = open(file, 'r') + configurations = configurations + yaml.load(FILE.read()) + FILE.close() + return configurations + + +def get_bucket(conn, existing_bucket): + if existing_bucket: + return conn.get_bucket(existing_bucket) + else: + goop = '%x' % random.getrandbits(64) + bucket = conn.create_bucket(goop) + bucket.set_acl('public-read') + return bucket + + +def connect_s3(host, access_key, secret_key): + conn = S3Connection( + calling_format = OrdinaryCallingFormat(), + is_secure = False, + host = host, + aws_access_key_id = access_key, + aws_secret_access_key = secret_key) + + return conn + + +def generate_objects(bucket, quantity, mean, stddev, seed, checksum=False): + """Generate random objects with sizes across a normal distribution + specified by mean and standard deviation and write them to bucket. + IN: + boto S3 bucket object + Number of files + mean file size in bytes + standard deviation from mean file size + seed for RNG + flag to tell the method to append md5 checksums to the output + OUT: + list of urls (strings) to objects valid for 1 hour. + If "checksum" is true, each output string consists of the url + followed by the md5 checksum. 
+ """ + urls = [] + file_generator = realistic.files(mean, stddev, seed) + name_generator = realistic.names(15, 4,seed=seed) + for _ in xrange(quantity): + fp = file_generator.next() + print >> sys.stderr, 'sending file with size %dB' % fp.size + key = Key(bucket) + key.key = name_generator.next() + key.set_contents_from_file(fp) + url = key.generate_url(3600) #valid for 1 hour + if checksum: + url += ' %s' % key.md5 + urls.append(url) + + return urls + + +def main(): + (options, args) = parse_opts(); + + #SETUP + random.seed(options.seed if options.seed else None) + if options.outfile: + OUTFILE = open(options.outfile, 'w') + else: + OUTFILE = sys.stdout + + conn = connect_s3(options.host, options.access_key, options.secret_key) + bucket = get_bucket(conn, options.bucket) + urls = [] + + print >> OUTFILE, 'bucket: %s' % bucket.name + print >> sys.stderr, 'setup complete, generating files' + for profile in parse_config(args): + seed = random.random() + urls += generate_objects(bucket, profile[0], profile[1], profile[2], seed, options.checksum) + print >> sys.stderr, 'finished sending files. 
Saving urls to S3' + + url_string = '\n'.join(urls) + url_key = Key(bucket) + url_key.key = 'urls' + url_key.set_contents_from_string(url_string) + print >> OUTFILE, url_string + print >> sys.stderr, 'done' + + +if __name__ == '__main__': + main() + diff --git a/realistic.py b/realistic.py new file mode 100644 index 0000000..58a7e1a --- /dev/null +++ b/realistic.py @@ -0,0 +1,93 @@ +import hashlib +import random +import string + +class RandomContentFile(object): + def __init__(self, size, seed): + self.seed = seed + self.random = random.Random(self.seed) + self.offset = 0 + self.size = size + self.hash = hashlib.md5() + self.digest_size = self.hash.digest_size + self.digest = None + + def seek(self, offset): + assert offset == 0 + self.random.seed(self.seed) + self.offset = offset + + def tell(self): + return self.offset + + def read(self, size=-1): + if size < 0: + size = self.size - self.offset + + r = [] + + random_count = min(size, self.size - self.offset - self.digest_size) + if random_count > 0: + self.offset += random_count + size -= random_count + data = ''.join(chr(self.random.getrandbits(8)) for _ in xrange(random_count)) + if self.hash is not None: + self.hash.update(data) + r.append(data) + + digest_count = min(size, self.size - self.offset) + if digest_count > 0: + if self.digest is None: + self.digest = self.hash.digest() + self.hash = None + self.offset += digest_count + size -= digest_count + data = self.digest[:digest_count] + r.append(data) + + return ''.join(r) + +def files(mean, stddev, seed=None): + """ + Yields file-like objects with effectively random contents, where + the size of each file follows the normal distribution with `mean` + and `stddev`. + + Beware, the file-likeness is very shallow. You can use boto's + `key.set_contents_from_file` to send these to S3, but they are not + full file objects. + + The last 128 bits are the MD5 digest of the previous bytes, for + verifying round-trip data integrity. 
For example, if you + re-download the object and place the contents into a file called + ``foo``, the following should print two identical lines: + + python -c 'import sys, hashlib; data=sys.stdin.read(); print hashlib.md5(data[:-16]).hexdigest(); print "".join("%02x" % ord(c) for c in data[-16:])' <foo + """ + rand = random.Random(seed) + while True: + while True: + size = int(rand.normalvariate(mean, stddev)) + if size >= 0: + break + yield RandomContentFile(size=size, seed=rand.getrandbits(32)) + +def names(mean, stddev, charset=None, seed=None): + """ + Yields strings that are somewhat plausible as file names, where + the length of each filename follows the normal distribution with + `mean` and `stddev`. + """ + if charset is None: + charset = string.ascii_lowercase + rand = random.Random(seed) + while True: + while True: + length = int(rand.normalvariate(mean, stddev)) + if length >= 0: + break + name = ''.join(rand.choice(charset) for _ in xrange(length)) + yield name