2011-07-15 19:03:23 +00:00
|
|
|
import bunch
|
2011-06-29 18:16:42 +00:00
|
|
|
import hashlib
|
|
|
|
import random
|
|
|
|
import string
|
2011-07-06 22:27:50 +00:00
|
|
|
import struct
|
2011-07-15 19:03:23 +00:00
|
|
|
import time
|
2011-06-29 18:16:42 +00:00
|
|
|
|
2011-07-26 16:41:48 +00:00
|
|
|
|
|
|
|
# Nanoseconds per second: multiplying a time.time() delta (seconds) by this
# converts it to nanoseconds.
NANOSECOND = int(1e9)


class RandomContentFile(object):
    """
    Read-only, shallowly file-like source of reproducible pseudorandom bytes.

    The stream is `size` bytes long.  All but the trailing `digest_size`
    bytes come from a `random.Random` seeded with `seed`; the trailing
    bytes are the MD5 digest of that leading part.  When `size` is smaller
    than the digest, the whole stream is a prefix of the MD5 digest of the
    empty string.

    Every `read` appends `[offset, elapsed_nanoseconds_since_last_seek]`
    to `self.chunks`.
    """

    def __init__(self, size, seed):
        self.size = size
        self.seed = seed
        self.random = random.Random(self.seed)

        # Boto likes to seek once more after it's done reading, so we need
        # to save the last chunks/seek value.
        self.last_chunks = self.chunks = None
        self.last_seek = self.start_time = None

        # Let seek initialize the rest of it, rather than dup code
        self.seek(0)

    def _mark_chunk(self):
        """Record the current offset and the time elapsed since the last seek."""
        elapsed = time.time() - self.last_seek
        self.chunks.append([self.offset, int(round(elapsed * NANOSECOND))])

    def seek(self, offset):
        """
        Rewind to the start of the stream; only `offset == 0` is supported.

        Reseeds the generator so the same bytes are produced again, resets
        the running hash, and moves the previous timing data into
        `start_time` / `last_chunks` before clearing it.
        """
        assert offset == 0
        self.random.seed(self.seed)
        self.offset = offset
        # Bytes literal: identical to '' on Python 2, and the correct type
        # to concatenate with struct.pack() output on Python 3.
        self.buffer = b''

        self.hash = hashlib.md5()
        self.digest_size = self.hash.digest_size
        self.digest = None

        # Save the last seek time as our start time, and the last chunks
        # before emptying.
        self.start_time = self.last_seek
        self.last_chunks = self.chunks
        self.last_seek = time.time()
        self.chunks = []

    def tell(self):
        """Return the current read offset."""
        return self.offset

    def _generate(self):
        """
        Generate and return the next chunk of pseudorandom data.

        Produces whole 64-bit words, at most 1 MiB per call and no more
        than the file size rounded up to a word.  The byte stream is the
        concatenation of consecutive words, so the chunk size does not
        affect the content.
        """
        size = min(self.size, 1 * 1024 * 1024)
        words = (size + 7) // 8
        values = [self.random.getrandbits(64) for _ in range(words)]
        return struct.pack(words * 'Q', *values)

    def read(self, size=-1):
        """
        Return up to `size` bytes from the current offset.

        A negative `size` reads everything that is left.  An empty bytes
        object is returned at end of stream.
        """
        if size < 0:
            size = self.size - self.offset

        r = []

        # Part 1: pseudorandom payload, i.e. everything before the digest.
        random_count = min(size, self.size - self.offset - self.digest_size)
        if random_count > 0:
            while len(self.buffer) < random_count:
                self.buffer += self._generate()
            self.offset += random_count
            size -= random_count
            data, self.buffer = self.buffer[:random_count], self.buffer[random_count:]
            if self.hash is not None:
                self.hash.update(data)
            r.append(data)

        # Part 2: the trailing digest.
        digest_count = min(size, self.size - self.offset)
        if digest_count > 0:
            if self.digest is None:
                self.digest = self.hash.digest()
                self.hash = None
            # Continue from wherever a previous read stopped inside the
            # digest instead of always restarting at its first byte.
            digest_start = self.offset - max(0, self.size - self.digest_size)
            self.offset += digest_count
            size -= digest_count
            r.append(self.digest[digest_start:digest_start + digest_count])

        self._mark_chunk()

        return b''.join(r)
|
|
|
|
|
2011-07-08 20:00:09 +00:00
|
|
|
class FileVerifier(object):
    """
    Write-only sink that checks a stream ends with the MD5 of what precedes it.

    Data passed to `write` is hashed except for the most recent
    `digest_size` bytes, which are held back in `self.buf` as the candidate
    trailing digest.  Every `write` appends
    `[total_size, elapsed_nanoseconds_since_creation]` to `self.chunks`.
    """

    def __init__(self):
        self.size = 0
        self.hash = hashlib.md5()
        # Bytes literal: identical to '' on Python 2, and required on
        # Python 3 so it can be joined with written data and compared
        # against hash.digest().
        self.buf = b''
        self.created_at = time.time()
        self.chunks = []

    def _mark_chunk(self):
        """Record the bytes received so far and the time since creation."""
        elapsed = time.time() - self.created_at
        self.chunks.append([self.size, int(round(elapsed * NANOSECOND))])

    def write(self, data):
        """Consume `data`, hashing all but the trailing digest-sized tail."""
        self.size += len(data)
        self.buf += data
        digest_size = self.hash.digest_size
        # Everything except the last digest_size bytes is definitely
        # payload; the tail may still turn out to be the digest.
        new_data, self.buf = self.buf[:-digest_size], self.buf[-digest_size:]
        self.hash.update(new_data)
        self._mark_chunk()

    def valid(self):
        """
        Returns True if this file looks valid. The file is valid if the end
        of the file has the md5 digest for the first part of the file.

        For a file shorter than the digest, the whole content must be a
        prefix of the digest of the empty payload.
        """
        if self.size < self.hash.digest_size:
            return self.hash.digest().startswith(self.buf)

        return self.buf == self.hash.digest()
|
|
|
|
|
2011-06-29 18:16:42 +00:00
|
|
|
def files(mean, stddev, seed=None):
    """
    Endless generator of `RandomContentFile` objects whose sizes are drawn
    from a normal distribution with the given `mean` and `stddev`.
    Negative draws are discarded and redrawn.

    The objects are only superficially file-like: they are good enough
    for boto's `key.set_contents_from_file`, but they are not complete
    file objects.

    The final 128 bits of each file are the MD5 digest of the bytes before
    them, so round-trip integrity can be verified.  If the downloaded
    object is stored in a file named ``foo``, this command (Python 2
    syntax) should print two identical lines:

        python -c 'import sys, hashlib; data=sys.stdin.read(); print hashlib.md5(data[:-16]).hexdigest(); print "".join("%02x" % ord(c) for c in data[-16:])' <foo

    For objects shorter than 16 bytes the second line is proportionally
    shorter.
    """
    rng = random.Random(seed)
    while True:
        size = int(rng.normalvariate(mean, stddev))
        if size < 0:
            # Rejected draw: try again without consuming a file seed.
            continue
        yield RandomContentFile(size=size, seed=rng.getrandbits(32))
|
|
|
|
|
2011-07-26 16:33:47 +00:00
|
|
|
def names(mean, stddev, charset=None, seed=None):
    """
    Yields strings that are somewhat plausible as file names, where
    the length of each filename follows the normal distribution with
    `mean` and `stddev`.

    Characters are picked from `charset`, which defaults to the lowercase
    ASCII letters.  Draws that would give an empty or negative length are
    discarded and redrawn.  The same `seed` gives the same sequence of
    names.
    """
    if charset is None:
        charset = string.ascii_lowercase
    rand = random.Random(seed)
    while True:
        length = int(rand.normalvariate(mean, stddev))
        if length <= 0:
            continue
        # range() instead of xrange() so this runs on Python 2 and 3.
        yield ''.join(rand.choice(charset) for _ in range(length))
|
2011-07-15 19:03:23 +00:00
|
|
|
|
2011-07-18 22:26:03 +00:00
|
|
|
def files_varied(groups, unlimited=False):
    """
    Yields a weighted-random selection of file-like objects.

    `groups` is a non-empty list or tuple of `(num, size, stddev)` triples.
    Each triple gets its own `files(size, stddev, ...)` generator, and
    `num` is that group's weight in the random selection.

    Unless `unlimited` is true, `num` is also a budget: every yielded file
    uses up one unit of its group, an exhausted group is dropped, and the
    generator finishes once every group is exhausted.  With `unlimited`
    set, the weights never change and the generator never finishes.
    """
    # Quick data type sanity.
    assert groups and isinstance(groups, (list, tuple))

    total_num = 0
    file_sets = []
    rand = random.Random(time.time())

    # Build the sets for our yield
    for num, size, stddev in groups:
        assert num and size  # TODO

        file_sets.append(bunch.Bunch(
            num=num,
            size=size,
            stddev=stddev,
            files=files(size, stddev, time.time()),
        ))
        total_num += num

    # Returning (by falling out of the loop) ends the generator; raising
    # StopIteration here would surface as RuntimeError under PEP 479.
    while total_num:
        # num is in [0, total_num); group i owns a slice of that range
        # exactly file_set.num wide.
        num = rand.randrange(total_num)

        for file_set in file_sets:
            if num >= file_set.num:
                num -= file_set.num
                continue

            if not unlimited:
                total_num -= 1
                file_set.num -= 1

                # None left in this set!  Removing while iterating is safe
                # only because we leave the loop right after the yield.
                if file_set.num == 0:
                    file_sets.remove(file_set)

            yield next(file_set.files)
            # Exactly one file per draw.
            break
        else:
            raise RuntimeError("Couldn't find a match.")
|