s3-tests/realistic.py

import hashlib
import random
import string

class RandomContentFile(object):
    """
    Shallow, read-only file-like object with reproducible random contents.

    The contents are fully determined by `size` and `seed`: the leading
    ``size - 16`` bytes are drawn from a ``random.Random`` seeded with
    `seed`, and the trailing bytes are the MD5 digest of those leading
    bytes.  When `size` is smaller than the digest, the contents are
    just the first `size` bytes of the MD5 digest of the empty string.

    Only ``seek(0)``, ``tell()`` and ``read()`` are provided.
    """

    def __init__(self, size, seed):
        """
        `size` is the total length in bytes (digest trailer included);
        `seed` seeds the private PRNG that generates the payload.
        """
        self.seed = seed
        self.random = random.Random(self.seed)
        self.offset = 0
        self.size = size
        self.hash = hashlib.md5()
        self.digest_size = self.hash.digest_size
        self.digest = None

    def seek(self, offset):
        """
        Rewind to the beginning of the file.

        Only ``offset == 0`` is supported; anything else trips the
        assertion.
        """
        assert offset == 0
        self.random.seed(self.seed)
        self.offset = offset
        # The payload is regenerated from scratch after a rewind, so the
        # running hash must start over as well; otherwise bytes read
        # before the rewind would be hashed twice and the trailer would
        # no longer match the payload.
        self.hash = hashlib.md5()
        self.digest = None

    def tell(self):
        """
        Return the current read position, in bytes from the start.
        """
        return self.offset

    def _random_bytes(self, count):
        """
        Return `count` bytes, each drawn from the PRNG as 8 random bits.
        """
        # bytearray -> bytes yields a byte string on both Python 2.6+
        # (where bytes is str) and Python 3.
        buf = bytearray()
        next_bits = self.random.getrandbits
        while len(buf) < count:
            buf.append(next_bits(8))
        return bytes(buf)

    def read(self, size=-1):
        """
        Read up to `size` bytes from the current position.

        A negative `size` (the default) reads everything that is left.
        Returns a byte string, which is empty once the end of the file
        has been reached.
        """
        if size < 0:
            size = self.size - self.offset

        r = []

        # Random payload: everything before the digest trailer.
        random_count = min(size, self.size - self.offset - self.digest_size)
        if random_count > 0:
            data = self._random_bytes(random_count)
            self.offset += random_count
            size -= random_count
            self.hash.update(data)
            r.append(data)

        # Digest trailer: the final `digest_size` bytes (or the whole
        # file, when the file is shorter than one digest).
        digest_count = min(size, self.size - self.offset)
        if digest_count > 0:
            if self.digest is None:
                self.digest = self.hash.digest()
                self.hash = None
            # Resume from where the previous read stopped inside the
            # digest, so a trailer split across reads is returned intact.
            start = self.offset - max(0, self.size - self.digest_size)
            r.append(self.digest[start:start + digest_count])
            self.offset += digest_count
            size -= digest_count

        return b''.join(r)

def files(mean, stddev, seed=None):
    """
    Endless generator of `RandomContentFile` objects.

    Each file's size is drawn from the normal distribution described
    by `mean` and `stddev`, truncated to an integer; negative draws
    are discarded and redrawn.  `seed` seeds the generator's own PRNG,
    which also supplies a 32-bit content seed for every file, so the
    whole sequence is reproducible for a given `seed`.

    The objects are only superficially file-like.  They are good
    enough for boto's `key.set_contents_from_file`, but they are not
    full file objects.

    The final 128 bits of each file are the MD5 digest of all the
    bytes before them, which allows checking round-trip integrity.
    For instance, after downloading an object into a file called
    ``foo``, this (Python 2) command should print the same line twice:

        python -c 'import sys, hashlib; data=sys.stdin.read(); print hashlib.md5(data[:-16]).hexdigest(); print "".join("%02x" % ord(c) for c in data[-16:])' <foo

    The exception is objects shorter than 16 bytes, for which the
    second line is proportionally shorter.
    """
    rng = random.Random(seed)
    while True:
        # Redraw until the truncated size is non-negative.
        size = int(rng.normalvariate(mean, stddev))
        while size < 0:
            size = int(rng.normalvariate(mean, stddev))
        content_seed = rng.getrandbits(32)
        yield RandomContentFile(size=size, seed=content_seed)

def names(mean, stddev, charset=None, seed=None):
    """
    Yields strings that are somewhat plausible as file names, where
    the length of each filename follows the normal distribution with
    `mean` and `stddev`.

    Lengths are truncated to integers and negative draws are redrawn,
    so an empty name is possible.  Characters are picked uniformly
    from `charset`, which defaults to the lowercase ASCII letters.
    `seed` seeds the private PRNG, making the sequence reproducible.

    The generator never terminates on its own.
    """
    if charset is None:
        charset = string.ascii_lowercase
    rand = random.Random(seed)
    while True:
        while True:
            length = int(rand.normalvariate(mean, stddev))
            if length >= 0:
                break
        # range() rather than xrange() so this runs on Python 3 as well;
        # name lengths are small, so the list built on Python 2 is cheap.
        yield ''.join(rand.choice(charset) for _ in range(length))
DHO QA: Random Object Generation Script. Script to generate garbage objects and push them to a bucket. The script takes a config file on the command line (and some other command-line options using optparse) and generates a bunch of objects in an S3 bucket. It also prints public URLs to stdout. The number and sizes of the objects are determined by a YAML config file, with each line looking like this: - [A, B, C], where A is the number of files in this group, B is the mean size of files in this group (in bytes), and C is the standard deviation (normal distribution) of file sizes in this group. The command-line options are: S3 access key; S3 secret key; seed for the PRNG; output file to write URLs to; flag to add the MD5 checksum to the URL list. 2011-06-29 11:16:42 -07:00			`import hashlib`
			`import random`
			`import string`

			`class RandomContentFile(object):`
			`def __init__(self, size, seed):`
			`self.seed = seed`
			`self.random = random.Random(self.seed)`
			`self.offset = 0`
			`self.size = size`
			`self.hash = hashlib.md5()`
			`self.digest_size = self.hash.digest_size`
			`self.digest = None`

			`def seek(self, offset):`
			`assert offset == 0`
			`self.random.seed(self.seed)`
			`self.offset = offset`

			`def tell(self):`
			`return self.offset`

			`def read(self, size=-1):`
			`if size < 0:`
			`size = self.size - self.offset`

			`r = []`

			`random_count = min(size, self.size - self.offset - self.digest_size)`
			`if random_count > 0:`
			`self.offset += random_count`
			`size -= random_count`
			`data = ''.join(chr(self.random.getrandbits(8)) for _ in xrange(random_count))`
			`if self.hash is not None:`
			`self.hash.update(data)`
			`r.append(data)`

			`digest_count = min(size, self.size - self.offset)`
			`if digest_count > 0:`
			`if self.digest is None:`
			`self.digest = self.hash.digest()`
			`self.hash = None`
			`self.offset += digest_count`
			`size -= digest_count`
			`data = self.digest[:digest_count]`
			`r.append(data)`

			`return ''.join(r)`

			`def files(mean, stddev, seed=None):`
			`"""`
			`Yields file-like objects with effectively random contents, where`
			the size of each file follows the normal distribution with `mean`
			and `stddev`.

			`Beware, the file-likeness is very shallow. You can use boto's`
			`key.set_contents_from_file` to send these to S3, but they are not
			`full file objects.`

			`The last 128 bits are the MD5 digest of the previous bytes, for`
			`verifying round-trip data integrity. For example, if you`
			`re-download the object and place the contents into a file called`
			``foo``, the following should print two identical lines:

			`python -c 'import sys, hashlib; data=sys.stdin.read(); print hashlib.md5(data[:-16]).hexdigest(); print "".join("%02x" % ord(c) for c in data[-16:])' <foo`

			`Except for objects shorter than 16 bytes, where the second line`
			`will be proportionally shorter.`
			`"""`
			`rand = random.Random(seed)`
			`while True:`
			`while True:`
			`size = int(rand.normalvariate(mean, stddev))`
			`if size >= 0:`
			`break`
			`yield RandomContentFile(size=size, seed=rand.getrandbits(32))`

			`def names(mean, stddev, charset=None, seed=None):`
			`"""`
			`Yields strings that are somewhat plausible as file names, where`
			`the length of each filename follows the normal distribution with`
			`mean` and `stddev`.
			`"""`
			`if charset is None:`
			`charset = string.ascii_lowercase`
			`rand = random.Random(seed)`
			`while True:`
			`while True:`
			`length = int(rand.normalvariate(mean, stddev))`
			`if length >= 0:`
			`break`
			`name = ''.join(rand.choice(charset) for _ in xrange(length))`
			`yield name`