s3-tests/realistic.py
DHO QA: Random Object Generation Script (author: Kyle Marsh, commit 3a3cbb3d25)
Script to generate garbage objects and push them to a bucket.

Script takes a config file on the command line (and some other command line
options using optparse) and generates a bunch of objects in an S3 bucket.
Also prints public URLs to stdout.

Number and sizes of the objects are determined by a yaml config file with each line
looking like this:
- [A, B, C]
A: Number of files in this group
B: Mean size of files in this group (in bytes)
C: Standard deviation (normal distribution) of file sizes in this group

command line options are:
 - S3 access key
 - S3 secret key
 - seed for PRNG
 - output file to write URLs to
 - flag to add md5 checksum to url list
2011-07-08 11:27:56 -07:00

93 lines
3 KiB
Python

import hashlib
import random
import string
class RandomContentFile(object):
    """Shallow file-like object yielding pseudo-random bytes with an MD5 trailer.

    The object produces exactly ``size`` bytes: the first ``size - 16`` bytes
    are pseudo-random (deterministic for a given ``seed``), and the final up-to
    16 bytes are the MD5 digest of everything before them, so a downloader can
    verify round-trip integrity. Objects shorter than 16 bytes consist solely
    of a (truncated) digest of the empty prefix.

    Only ``read``/``seek``/``tell`` are provided — enough for e.g. boto's
    ``key.set_contents_from_file``, but this is not a full file object.
    """

    def __init__(self, size, seed):
        """
        :param size: total number of bytes the file will produce
        :param seed: PRNG seed; two instances with the same size/seed yield
                     identical contents
        """
        self.seed = seed
        self.random = random.Random(self.seed)
        self.offset = 0          # current read position
        self.size = size
        self.hash = hashlib.md5()  # running hash of the random prefix
        self.digest_size = self.hash.digest_size  # 16 for MD5
        self.digest = None       # finalized digest, once the prefix is consumed

    def seek(self, offset):
        """Rewind to the beginning (only offset 0 is supported)."""
        assert offset == 0
        self.random.seed(self.seed)
        self.offset = offset
        # Bug fix: the original reseeded the PRNG but kept the partially
        # updated hash, so a partial read followed by seek(0) + full read
        # produced a digest over duplicated data. Restart the digest too.
        self.hash = hashlib.md5()
        self.digest = None

    def tell(self):
        """Return the current read position in bytes."""
        return self.offset

    def read(self, size=-1):
        """Return up to ``size`` bytes (all remaining bytes if size < 0).

        Random bytes are served first; once only ``digest_size`` bytes remain,
        the MD5 digest of the random prefix is finalized and served as the
        tail of the stream.
        """
        if size < 0:
            size = self.size - self.offset
        r = []
        # How many random (non-digest) bytes are still owed.
        random_count = min(size, self.size - self.offset - self.digest_size)
        if random_count > 0:
            self.offset += random_count
            size -= random_count
            data = bytes(self.random.getrandbits(8) for _ in range(random_count))
            if self.hash is not None:
                self.hash.update(data)
            r.append(data)
        # How many digest-trailer bytes fit in this read.
        digest_count = min(size, self.size - self.offset)
        if digest_count > 0:
            if self.digest is None:
                # Finalize once; drop the hash object so it cannot be reused.
                self.digest = self.hash.digest()
                self.hash = None
            self.offset += digest_count
            size -= digest_count
            data = self.digest[:digest_count]
            r.append(data)
        return b''.join(r)
def files(mean, stddev, seed=None):
    """Generate file-like objects with effectively random contents.

    Each yielded object's size is drawn from the normal distribution with
    the given `mean` and `stddev` (negative draws are discarded and redrawn).

    The file-likeness is very shallow: the objects support only
    read/seek/tell — enough for boto's `key.set_contents_from_file`,
    but they are not full file objects.

    The last 128 bits of each object are the MD5 digest of the preceding
    bytes, for verifying round-trip data integrity. If you re-download an
    object into a file ``foo``, the digest of all-but-the-last-16 bytes
    should equal those last 16 bytes (proportionally truncated for objects
    shorter than 16 bytes).
    """
    prng = random.Random(seed)
    while True:
        # Redraw until the sampled size is non-negative.
        candidate = -1
        while candidate < 0:
            candidate = int(prng.normalvariate(mean, stddev))
        yield RandomContentFile(size=candidate, seed=prng.getrandbits(32))
def names(mean, stddev, charset=None, seed=None):
    """Yield strings that are somewhat plausible as file names.

    The length of each name follows the normal distribution with `mean`
    and `stddev` (negative draws are discarded and redrawn).

    :param mean: mean name length, in characters
    :param stddev: standard deviation of the name length
    :param charset: characters to draw from; defaults to ascii lowercase
    :param seed: PRNG seed for reproducible sequences
    """
    if charset is None:
        charset = string.ascii_lowercase
    rand = random.Random(seed)
    while True:
        while True:
            length = int(rand.normalvariate(mean, stddev))
            if length >= 0:
                break
        # Bug fix: `xrange` is Python-2-only and a NameError on Python 3.
        name = ''.join(rand.choice(charset) for _ in range(length))
        yield name