s3-tests/s3tests/realistic.py

import bunch
import hashlib
import random
import string
import struct
import time

class RandomContentFile(object):
    """
    Shallow, read-only file-like object with reproducible pseudorandom data.

    The stream is exactly `size` bytes long. All bytes except the trailing
    ``digest_size`` ones come from a `random.Random` seeded with `seed`; the
    tail is the MD5 digest of those preceding bytes, so a reader can verify
    the content without knowing the seed. When `size` is smaller than the
    digest, the stream is just the first `size` bytes of the MD5 digest of
    the empty string.

    Only `read`, `tell` and `seek(0)` are provided.
    """

    def __init__(self, size, seed):
        self.seed = seed
        self.random = random.Random(self.seed)
        self.offset = 0
        # Pseudorandom bytes generated but not yet handed out by read().
        self.buffer = b''
        self.size = size
        self.hash = hashlib.md5()
        self.digest_size = self.hash.digest_size
        # Finalized digest; computed the first time the tail is read.
        self.digest = None

    def seek(self, offset):
        """
        Rewind the stream to its beginning.

        Only ``offset == 0`` is supported (enforced with an assertion).
        """
        assert offset == 0
        self.random.seed(self.seed)
        self.offset = offset
        self.buffer = b''
        # Restart the running digest as well; otherwise bytes consumed before
        # the rewind would be hashed a second time on the next pass and the
        # trailing digest would no longer match the content.
        self.hash = hashlib.md5()
        self.digest = None

    def tell(self):
        """Return the current read position in bytes."""
        return self.offset

    def _generate(self):
        """Return the next 1 MiB chunk of pseudorandom bytes."""
        # Drawing 64 bits at a time and packing them in a single call is much
        # faster than producing the data byte by byte.
        size = 1 * 1024 * 1024
        count = size // 8
        words = [self.random.getrandbits(64) for _ in range(count)]
        return struct.pack('%dQ' % count, *words)

    def read(self, size=-1):
        """
        Return up to `size` bytes from the current position.

        A negative or ``None`` `size` reads everything up to the end of the
        stream. An empty string is returned once the end has been reached.
        """
        if size is None or size < 0:
            size = self.size - self.offset

        r = []

        # Part 1: pseudorandom payload, i.e. everything before the digest.
        random_count = min(size, self.size - self.offset - self.digest_size)
        if random_count > 0:
            while len(self.buffer) < random_count:
                self.buffer += self._generate()
            self.offset += random_count
            size -= random_count
            data = self.buffer[:random_count]
            self.buffer = self.buffer[random_count:]
            if self.hash is not None:
                self.hash.update(data)
            r.append(data)

        # Part 2: the trailing digest (possibly only a slice of it).
        digest_count = min(size, self.size - self.offset)
        if digest_count > 0:
            if self.digest is None:
                self.digest = self.hash.digest()
                self.hash = None
            # Position inside the digest where this read resumes, so that a
            # digest split across several read() calls is returned in order
            # rather than restarting from its first byte every time.
            digest_start = max(self.size - self.digest_size, 0)
            pos = self.offset - digest_start
            self.offset += digest_count
            size -= digest_count
            data = self.digest[pos:pos + digest_count]
            r.append(data)

        return b''.join(r)

class FileVerifier(object):
    """
    Write-only sink that checks a stream ends with the MD5 of its own content.

    Feed the data through `write` in any number of pieces, then call `valid`.
    At most the last ``digest_size`` bytes are held back in `buf`; everything
    before them is folded into the running MD5.
    """

    def __init__(self):
        self.size = 0
        self.hash = hashlib.md5()
        # Bytes literal: the buffer is concatenated with the written data and
        # compared against hashlib digests, both of which are byte strings.
        self.buf = b''

    def write(self, data):
        """Consume the next piece of the stream."""
        self.size += len(data)
        self.buf += data
        # Keep only the trailing digest-sized window; anything in front of it
        # is certainly payload and can be hashed now.
        digsz = -1 * self.hash.digest_size
        new_data, self.buf = self.buf[0:digsz], self.buf[digsz:]
        self.hash.update(new_data)

    def valid(self):
        """
        Returns True if this file looks valid. The file is valid if the end
        of the file has the md5 digest for the first part of the file.

        For a file shorter than the digest, the whole content must be a
        prefix of the MD5 digest of the empty string.
        """
        if self.size < self.hash.digest_size:
            return self.hash.digest().startswith(self.buf)

        return self.buf == self.hash.digest()

def files(mean, stddev, seed=None):
    """
    Endless generator of `RandomContentFile` objects.

    Each file's size is drawn from a normal distribution with the given
    `mean` and `stddev` (truncated to an integer; negative draws are
    discarded and redrawn). `seed` initialises the generator that picks
    both the sizes and the per-file content seeds.

    These objects are only barely file-like. They are good enough for
    boto's `key.set_contents_from_file`, but do not expect a complete
    file API.

    The final 128 bits of every file are the MD5 digest of all the bytes
    before them, which allows a round-trip integrity check. Save a
    downloaded object as ``foo`` and run the following (Python 2) command;
    it should print the same line twice:

      python -c 'import sys, hashlib; data=sys.stdin.read(); print hashlib.md5(data[:-16]).hexdigest(); print "".join("%02x" % ord(c) for c in data[-16:])' <foo

    For objects shorter than 16 bytes the second line is proportionally
    shorter.
    """
    rng = random.Random(seed)
    while True:
        # Redraw until the size is non-negative.
        length = int(rng.normalvariate(mean, stddev))
        while length < 0:
            length = int(rng.normalvariate(mean, stddev))
        content_seed = rng.getrandbits(32)
        yield RandomContentFile(size=length, seed=content_seed)

def names(mean, stddev, charset=None, seed=None):
    """
    Yields strings that are somewhat plausible as file names, where
    the length of each filename follows the normal distribution with
    `mean` and `stddev`.

    Lengths are truncated to integers and redrawn until positive, so an
    empty name is never produced. Characters are picked from `charset`
    (lowercase ASCII letters when omitted). `seed` makes the sequence
    reproducible.

    Raises ValueError on first iteration when `stddev` is zero and `mean`
    truncates to a non-positive integer, since no positive length could
    ever be drawn.
    """
    if charset is None:
        charset = string.ascii_lowercase
    if stddev == 0 and int(mean) <= 0:
        # Every draw would equal `mean`; the rejection loop below would spin
        # forever instead of failing.
        raise ValueError(
            "cannot draw a positive name length with mean=%r and stddev=0"
            % (mean,))
    rand = random.Random(seed)
    while True:
        while True:
            length = int(rand.normalvariate(mean, stddev))
            if length > 0:
                break
        name = ''.join(rand.choice(charset) for _ in range(length))
        yield name

def files_varied(groups):
    """
    Yields a weighted-random selection of file-like objects.

    `groups` is a non-empty list or tuple of ``(num, size, stddev)``
    triples. For each triple a `files(size, stddev, ...)` generator is
    created, and `num` files in total are taken from it. On every step one
    group is picked at random, weighted by how many files it still has to
    deliver, and exactly one file from that group is yielded. The generator
    finishes once every group is exhausted.

    The random sources are seeded from the current time, so the sequence
    is not reproducible.
    """
    # Quick data type sanity.
    assert groups and isinstance(groups, (list, tuple))

    total_num = 0
    file_sets = []
    rand = random.Random(time.time())

    # Build the sets for our yield
    for num, size, stddev in groups:
        assert num and size

        file_sets.append(bunch.Bunch(
            num=num,
            size=size,
            stddev=stddev,
            files=files(size, stddev, time.time())
        ))
        total_num += num

    # Falling out of the loop ends the generator normally; raising
    # StopIteration by hand is turned into RuntimeError by newer interpreters.
    while total_num:
        # num is in [0, total_num); walk the groups until it falls inside
        # one group's share of the remaining files.
        num = rand.randrange(total_num)

        chosen = None
        for file_set in file_sets:
            if num < file_set.num:
                chosen = file_set
                break
            num -= file_set.num

        if chosen is None:
            raise RuntimeError("Couldn't find a match.")

        total_num -= 1
        chosen.num -= 1

        # None left in this set! Removal happens outside the scan above, so
        # the list is never modified while it is being iterated.
        if chosen.num == 0:
            file_sets.remove(chosen)

        yield next(chosen.files)
Adding generator files_varied to s3tests.realistic Given a tuple of tuples, construct several files() generators, and yield from those randomly. Randomness is weighted based on the number of files remaining in each group. 2011-07-15 12:03:23 -07:00			`import bunch`
DHO QA: Random Object Generation Script Script to generate garbage objects and push them to a bucket. Script takes a config file on the command line (and some other command line options using optparse) and generates a bunch of objects in an S3 bucket. Also prints public URLs to stdout. Number and sizes of the objects are determined by a yaml config file with each line looking like this: - [A, B, C] A: Number of files in this group B: Mean size of files in this group (in bytes) C: Standard deviation (normal distribution) of file sizes in this group command line options are: - S3 access key - S3 secret key - seed for PRNG - output file to write URLs to - flag to add md5 checksum to url list 2011-06-29 11:16:42 -07:00			`import hashlib`
			`import random`
			`import string`
Do pseudorandom string generation 1 MB at a time. This gives it a ~10x speedup. 2011-07-06 15:27:50 -07:00			`import struct`
Adding generator files_varied to s3tests.realistic Given a tuple of tuples, construct several files() generators, and yield from those randomly. Randomness is weighted based on the number of files remaining in each group. 2011-07-15 12:03:23 -07:00			`import time`
DHO QA: Random Object Generation Script Script to generate garbage objects and push them to a bucket. Script takes a config file on the command line (and some other command line options using optparse) and generates a bunch of objects in an S3 bucket. Also prints public URLs to stdout. Number and sizes of the objects are determined by a yaml config file with each line looking like this: - [A, B, C] A: Number of files in this group B: Mean size of files in this group (in bytes) C: Standard deviation (normal distribution) of file sizes in this group command line options are: - S3 access key - S3 secret key - seed for PRNG - output file to write URLs to - flag to add md5 checksum to url list 2011-06-29 11:16:42 -07:00
			`class RandomContentFile(object):`
			`def __init__(self, size, seed):`
			`self.seed = seed`
			`self.random = random.Random(self.seed)`
			`self.offset = 0`
Do pseudorandom string generation 1 MB at a time. This gives it a ~10x speedup. 2011-07-06 15:27:50 -07:00			`self.buffer = ''`
DHO QA: Random Object Generation Script Script to generate garbage objects and push them to a bucket. Script takes a config file on the command line (and some other command line options using optparse) and generates a bunch of objects in an S3 bucket. Also prints public URLs to stdout. Number and sizes of the objects are determined by a yaml config file with each line looking like this: - [A, B, C] A: Number of files in this group B: Mean size of files in this group (in bytes) C: Standard deviation (normal distribution) of file sizes in this group command line options are: - S3 access key - S3 secret key - seed for PRNG - output file to write URLs to - flag to add md5 checksum to url list 2011-06-29 11:16:42 -07:00			`self.size = size`
			`self.hash = hashlib.md5()`
			`self.digest_size = self.hash.digest_size`
			`self.digest = None`

			`def seek(self, offset):`
			`assert offset == 0`
			`self.random.seed(self.seed)`
			`self.offset = offset`
Do pseudorandom string generation 1 MB at a time. This gives it a ~10x speedup. 2011-07-06 15:27:50 -07:00			`self.buffer = ''`
DHO QA: Random Object Generation Script Script to generate garbage objects and push them to a bucket. Script takes a config file on the command line (and some other command line options using optparse) and generates a bunch of objects in an S3 bucket. Also prints public URLs to stdout. Number and sizes of the objects are determined by a yaml config file with each line looking like this: - [A, B, C] A: Number of files in this group B: Mean size of files in this group (in bytes) C: Standard deviation (normal distribution) of file sizes in this group command line options are: - S3 access key - S3 secret key - seed for PRNG - output file to write URLs to - flag to add md5 checksum to url list 2011-06-29 11:16:42 -07:00
			`def tell(self):`
			`return self.offset`

Do pseudorandom string generation 1 MB at a time. This gives it a ~10x speedup. 2011-07-06 15:27:50 -07:00			`def _generate(self):`
			`# generate and return a chunk of pseudorandom data`
			`# 256 bits = 32 bytes at a time`
			`size = 1*1024*1024`
			`l = [self.random.getrandbits(64) for _ in xrange(size/8)]`
			`s = struct.pack((size/8)*'Q', *l)`
			`return s`

DHO QA: Random Object Generation Script Script to generate garbage objects and push them to a bucket. Script takes a config file on the command line (and some other command line options using optparse) and generates a bunch of objects in an S3 bucket. Also prints public URLs to stdout. Number and sizes of the objects are determined by a yaml config file with each line looking like this: - [A, B, C] A: Number of files in this group B: Mean size of files in this group (in bytes) C: Standard deviation (normal distribution) of file sizes in this group command line options are: - S3 access key - S3 secret key - seed for PRNG - output file to write URLs to - flag to add md5 checksum to url list 2011-06-29 11:16:42 -07:00			`def read(self, size=-1):`
			`if size < 0:`
			`size = self.size - self.offset`

			`r = []`

			`random_count = min(size, self.size - self.offset - self.digest_size)`
			`if random_count > 0:`
Do pseudorandom string generation 1 MB at a time. This gives it a ~10x speedup. 2011-07-06 15:27:50 -07:00			`while len(self.buffer) < random_count:`
			`self.buffer += self._generate()`
DHO QA: Random Object Generation Script Script to generate garbage objects and push them to a bucket. Script takes a config file on the command line (and some other command line options using optparse) and generates a bunch of objects in an S3 bucket. Also prints public URLs to stdout. Number and sizes of the objects are determined by a yaml config file with each line looking like this: - [A, B, C] A: Number of files in this group B: Mean size of files in this group (in bytes) C: Standard deviation (normal distribution) of file sizes in this group command line options are: - S3 access key - S3 secret key - seed for PRNG - output file to write URLs to - flag to add md5 checksum to url list 2011-06-29 11:16:42 -07:00			`self.offset += random_count`
			`size -= random_count`
Do pseudorandom string generation 1 MB at a time. This gives it a ~10x speedup. 2011-07-06 15:27:50 -07:00			`data, self.buffer = self.buffer[:random_count], self.buffer[random_count:]`
DHO QA: Random Object Generation Script Script to generate garbage objects and push them to a bucket. Script takes a config file on the command line (and some other command line options using optparse) and generates a bunch of objects in an S3 bucket. Also prints public URLs to stdout. Number and sizes of the objects are determined by a yaml config file with each line looking like this: - [A, B, C] A: Number of files in this group B: Mean size of files in this group (in bytes) C: Standard deviation (normal distribution) of file sizes in this group command line options are: - S3 access key - S3 secret key - seed for PRNG - output file to write URLs to - flag to add md5 checksum to url list 2011-06-29 11:16:42 -07:00			`if self.hash is not None:`
			`self.hash.update(data)`
			`r.append(data)`

			`digest_count = min(size, self.size - self.offset)`
			`if digest_count > 0:`
			`if self.digest is None:`
			`self.digest = self.hash.digest()`
			`self.hash = None`
			`self.offset += digest_count`
			`size -= digest_count`
			`data = self.digest[:digest_count]`
			`r.append(data)`

			`return ''.join(r)`

dho qa: rand_readwrite Adds the rand_readwrite utility. Updates realistic.py with a file verifier class. Updates generate_objects.py to allow the filename seed to be set. 2011-07-08 13:00:09 -07:00			`class FileVerifier(object):`
			`def __init__(self):`
			`self.size = 0`
			`self.hash = hashlib.md5()`
			`self.buf = ''`

			`def write(self, data):`
			`self.size += len(data)`
			`self.buf += data`
			`digsz = -1*self.hash.digest_size`
			`new_data, self.buf = self.buf[0:digsz], self.buf[digsz:]`
			`self.hash.update(new_data)`

			`def valid(self):`
			`"""`
			`Returns True if this file looks valid. The file is valid if the end`
			`of the file has the md5 digest for the first part of the file.`
			`"""`
fix random file verifier with small files The file verifier will now correctly verify small files (smaller than the md5 digest size), as long as the data that is there matches what the digest would be. 2011-07-13 13:50:26 -07:00			`if self.size < self.hash.digest_size:`
			`return self.hash.digest().startswith(self.buf)`

dho qa: rand_readwrite Adds the rand_readwrite utility. Updates realistic.py with a file verifier class. Updates generate_objects.py to allow the filename seed to be set. 2011-07-08 13:00:09 -07:00			`return self.buf == self.hash.digest()`

DHO QA: Random Object Generation Script Script to generate garbage objects and push them to a bucket. Script takes a config file on the command line (and some other command line options using optparse) and generates a bunch of objects in an S3 bucket. Also prints public URLs to stdout. Number and sizes of the objects are determined by a yaml config file with each line looking like this: - [A, B, C] A: Number of files in this group B: Mean size of files in this group (in bytes) C: Standard deviation (normal distribution) of file sizes in this group command line options are: - S3 access key - S3 secret key - seed for PRNG - output file to write URLs to - flag to add md5 checksum to url list 2011-06-29 11:16:42 -07:00			`def files(mean, stddev, seed=None):`
			`"""`
			`Yields file-like objects with effectively random contents, where`
			the size of each file follows the normal distribution with `mean`
			and `stddev`.

			`Beware, the file-likeness is very shallow. You can use boto's`
			`key.set_contents_from_file` to send these to S3, but they are not
			`full file objects.`

			`The last 128 bits are the MD5 digest of the previous bytes, for`
			`verifying round-trip data integrity. For example, if you`
			`re-download the object and place the contents into a file called`
			``foo``, the following should print two identical lines:

Whitespace cleanup. 2011-07-11 13:19:54 -07:00			`python -c 'import sys, hashlib; data=sys.stdin.read(); print hashlib.md5(data[:-16]).hexdigest(); print "".join("%02x" % ord(c) for c in data[-16:])' <foo`
DHO QA: Random Object Generation Script Script to generate garbage objects and push them to a bucket. Script takes a config file on the command line (and some other command line options using optparse) and generates a bunch of objects in an S3 bucket. Also prints public URLs to stdout. Number and sizes of the objects are determined by a yaml config file with each line looking like this: - [A, B, C] A: Number of files in this group B: Mean size of files in this group (in bytes) C: Standard deviation (normal distribution) of file sizes in this group command line options are: - S3 access key - S3 secret key - seed for PRNG - output file to write URLs to - flag to add md5 checksum to url list 2011-06-29 11:16:42 -07:00
			`Except for objects shorter than 16 bytes, where the second line`
			`will be proportionally shorter.`
			`"""`
			`rand = random.Random(seed)`
			`while True:`
			`while True:`
			`size = int(rand.normalvariate(mean, stddev))`
			`if size >= 0:`
			`break`
			`yield RandomContentFile(size=size, seed=rand.getrandbits(32))`

			`def names(mean, stddev, charset=None, seed=None):`
			`"""`
			`Yields strings that are somewhat plausible as file names, where`
			`the lenght of each filename follows the normal distribution with`
			`mean` and `stddev`.
			`"""`
			`if charset is None:`
			`charset = string.ascii_lowercase`
			`rand = random.Random(seed)`
			`while True:`
			`while True:`
			`length = int(rand.normalvariate(mean, stddev))`
realistic.names shouldn't return a 0-length filename 2011-07-11 14:17:06 -07:00			`if length > 0:`
DHO QA: Random Object Generation Script Script to generate garbage objects and push them to a bucket. Script takes a config file on the command line (and some other command line options using optparse) and generates a bunch of objects in an S3 bucket. Also prints public URLs to stdout. Number and sizes of the objects are determined by a yaml config file with each line looking like this: - [A, B, C] A: Number of files in this group B: Mean size of files in this group (in bytes) C: Standard deviation (normal distribution) of file sizes in this group command line options are: - S3 access key - S3 secret key - seed for PRNG - output file to write URLs to - flag to add md5 checksum to url list 2011-06-29 11:16:42 -07:00			`break`
			`name = ''.join(rand.choice(charset) for _ in xrange(length))`
			`yield name`
Adding generator files_varied to s3tests.realistic Given a tuple of tuples, construct several files() generators, and yield from those randomly. Randomness is weighted based on the number of files remaining in each group. 2011-07-15 12:03:23 -07:00
			`def files_varied(groups):`
			`""" Yields a weighted-random selection of file-like objects. """`
			`# Quick data type sanity.`
			`assert groups and isinstance(groups, (list, tuple))`

			`total_num = 0`
			`file_sets = []`
			`rand = random.Random(time.time())`

			`# Build the sets for our yield`
			`for num, size, stddev in groups:`
			`assert num and size`

			`file_sets.append(bunch.Bunch(`
			`num = num,`
			`size = size,`
			`stddev = stddev,`
			`files = files(size, stddev, time.time())`
			`))`
			`total_num += num`

			`while True:`
			`if not total_num:`
			`raise StopIteration`

			`num = rand.randrange(total_num)`

			`ok = 0`
			`for file_set in file_sets:`
			`if num > file_set.num:`
			`num -= file_set.num`
			`continue`

			`total_num -= 1`
			`file_set.num -= 1`

			`# None left in this set!`
			`if file_set.num == 0:`
			`file_sets.remove(file_set)`

			`ok = 1`
			`yield next(file_set.files)`

			`if not ok:`
			`raise RuntimeError, "Couldn't find a match."`