import hashlib import random import string class RandomContentFile(object): def __init__(self, size, seed): self.seed = seed self.random = random.Random(self.seed) self.offset = 0 self.size = size self.hash = hashlib.md5() self.digest_size = self.hash.digest_size self.digest = None def seek(self, offset): assert offset == 0 self.random.seed(self.seed) self.offset = offset def tell(self): return self.offset def read(self, size=-1): if size < 0: size = self.size - self.offset r = [] random_count = min(size, self.size - self.offset - self.digest_size) if random_count > 0: self.offset += random_count size -= random_count data = ''.join(chr(self.random.getrandbits(8)) for _ in xrange(random_count)) if self.hash is not None: self.hash.update(data) r.append(data) digest_count = min(size, self.size - self.offset) if digest_count > 0: if self.digest is None: self.digest = self.hash.digest() self.hash = None self.offset += digest_count size -= digest_count data = self.digest[:digest_count] r.append(data) return ''.join(r) def files(mean, stddev, seed=None): """ Yields file-like objects with effectively random contents, where the size of each file follows the normal distribution with `mean` and `stddev`. Beware, the file-likeness is very shallow. You can use boto's `key.set_contents_from_file` to send these to S3, but they are not full file objects. The last 128 bits are the MD5 digest of the previous bytes, for verifying round-trip data integrity. For example, if you re-download the object and place the contents into a file called ``foo``, the following should print two identical lines: python -c 'import sys, hashlib; data=sys.stdin.read(); print hashlib.md5(data[:-16]).hexdigest(); print "".join("%02x" % ord(c) for c in data[-16:])' = 0: break yield RandomContentFile(size=size, seed=rand.getrandbits(32)) def names(mean, stddev, charset=None, seed=None): """ Yields strings that are somewhat plausible as file names, where the lenght of each filename follows the normal distribution with `mean` and `stddev`. """ if charset is None: charset = string.ascii_lowercase rand = random.Random(seed) while True: while True: length = int(rand.normalvariate(mean, stddev)) if length >= 0: break name = ''.join(rand.choice(charset) for _ in xrange(length)) yield name