Samesite - proxy that can cache partial transfers

Artifact 81ccd8c17003f701bcba441fd031bb95b18483cf11c592fa69c14770b1281927:


#!/usr/bin/env python3
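'''Mirrors files named in a web-server/proxy access log from a root site into
a local directory, redownloading a file only when its recorded HTTP headers
change.

Example invocation (all paths and the URL are illustrative):
    samesite.py -d /var/cache/mirror -r http://upstream.example.com -l /var/log/access.log
'''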

import datetime, optparse, os, re, shelve, sys, urllib.error, urllib.request

parser = optparse.OptionParser()
parser.add_option('-v', '--verbose', action = 'store_true', dest = 'verbose', help = 'turns on verbose status notifications', metavar = 'bool', default = False)
parser.add_option('-d', '--dir', action = 'store', dest = 'dir', help = 'specify directory where the files should be stored', metavar = 'string', default = None)
parser.add_option('-r', '--root', action = 'store', dest = 'root', help = 'specify a site from which data should be mirrored', metavar = 'string', default = None)
parser.add_option('-l', '--log', action = 'store', dest = 'log', help = 'specify a log file to process', metavar = 'string', default = None)
parser.add_option('-e', '--skip-etag', action = 'store_true', dest = 'noetag', help = 'do not process etags', metavar = 'bool', default = False)
(options, args) = parser.parse_args()

assert options.dir, 'Directory not specified'
assert options.root, 'Server not specified'
assert options.log, 'Log file not specified'
assert os.access(options.log, os.R_OK), 'Log file unreadable'

# this is the file index - every url's stored headers live in this file
index = shelve.open(options.dir + '/.index')
desc_fields = ('Content-Length', 'Pragma', 'Last-Modified')
ignore_fields = ('Accept-Ranges', 'Age', 'Cache-Control', 'Connection', 'Content-Type', 'Date', 'Expires', 'Server', 'Via', 'X-Cache', 'X-Cache-Lookup', 'X-Powered-By')
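# desc_fields are stored in the index and compared across runs; ignore_fields
# are dropped silently; any other header triggers an 'Undefined header' warning below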

if not options.noetag:
	desc_fields += ('ETag',)
else:
	ignore_fields += ('ETag',)

block_size = 32768
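# files are transferred in 32 KiB blocks; the download loop below prints one progress dot per block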

while True:
	unchecked_files = set()
	checked_files = 0

	# reading the log and collecting requested URLs for processing
	# check file mtime XXX
	with open(options.log, 'r') as log_file:
		log_line = re.compile(r'^[^ ]+ - - \[.*\] "(GET|HEAD) (.*?)(\?.*)? HTTP/1\.1" (\d+) \d+ "(.*)" "(.*)"$')
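		# matches common/combined log format lines, e.g. (hypothetical):
		# 192.0.2.1 - - [21/Feb/2010:13:37:00 +0300] "GET /pub/file%20name.zip HTTP/1.1" 200 4096 "-" "Wget/1.12"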
		for line in log_file:
			this_line = log_line.match(line.strip())
			if this_line:
				unchecked_files.add(this_line.group(2))

	for url in unchecked_files:
		reload = False
		recheck = False
		print('Checking file:', url)

		# creating empty placeholder in index
		if url not in index:
			print('This one is new.')
			index[url] = {}
			reload = True

		# deriving the file name from the url
		file_name = options.dir + url.replace('%20', ' ')
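		# note: only %20 is decoded here; any other percent-escapes stay verbatim in the file name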

		# forcing a reload if the file is missing or unreadable
		if not reload and not os.access(file_name, os.R_OK):
			print('File not found or inaccessible.')
			reload = True

		# forcing a reload if the file size doesn't match the one recorded in the index
		elif not reload and 'Content-Length' in index[url] and os.stat(file_name).st_size != int(index[url]['Content-Length']):
			print('File size is ', os.stat(file_name).st_size, ' and stored file size is ', index[url]['Content-Length'], '.', sep='')
			reload = True

		# forcing a recheck if the index holds a no-cache Pragma header
		if not reload and 'Pragma' in index[url] and index[url]['Pragma'] == 'no-cache':
			print('Pragma on: recheck imminent.')
			recheck = True

		# skipping the file if no recheck is forced and it was last checked less than four hours ago
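		# (now - 4h - last_check).days is negative exactly when the last check was less than four hours ago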
		if not recheck and not reload and '__time__' in index[url] and (datetime.datetime.now() - datetime.timedelta(hours = 4) - index[url]['__time__']).days < 0:
			continue

		try:
			with urllib.request.urlopen(options.root + url) as source:
				new_headers = {}
				headers = source.info()

				# stripping unneeded headers (XXX make this in place?)
				for header in headers:
					if header in desc_fields:
						if header == 'Pragma' and headers[header] != 'no-cache':
							print('Pragma:', headers[header])
						new_headers[header] = headers[header]
					elif header not in ignore_fields:
						print('Undefined header "', header, '": ', headers[header], sep='')

				# comparing headers with the data found in the index:
				# if any header other than Pragma changed, or headers appeared
				# or disappeared, the file is downloaded in full again
				old_keys = set(index[url].keys())
				old_keys.discard('__time__')
				old_keys.discard('Pragma')
				more_keys = set(new_headers.keys()) - old_keys
				more_keys.discard('Pragma')
				less_keys = old_keys - set(new_headers.keys())
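				# Pragma only toggles rechecking and __time__ is bookkeeping, so neither is allowed to force a redownload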
				if more_keys:
					if old_keys:
						print('New headers appeared:', more_keys)
					reload = True
				elif less_keys:
					print('Headers disappeared:', less_keys)
					reload = True
				else:
					for key in index[url].keys():
						if key not in ('__time__', 'Pragma') and index[url][key] != new_headers[key]:
							print('Header "', key, '" changed from [', index[url][key], '] to [', new_headers[key], ']', sep='')
							reload = True

				# downloading file
				if reload:
					if 'Content-Length' in headers:
						print('Downloading', headers['Content-Length'], 'bytes [', end='')
					else:
						print('Downloading [', end='')
					sys.stdout.flush()

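					# progress indicator: one dot per block, plus a running total after every full MiB (1024*1024 // block_size blocks)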
					# the file is written to a temporary location and moved into place only when the download completes
					with open(options.dir + '/.tmp', 'wb') as temp_file:
						buffer = source.read(block_size)
						blocks = 0
						megs = 0
						while len(buffer) > 0:
							temp_file.write(buffer)
							print('.', end='')
							sys.stdout.flush()
							buffer = source.read(block_size)
							blocks += 1
							if blocks >= 1024*1024 // block_size:
								blocks -= 1024*1024 // block_size
								megs += 1
								print('{}MB'.format(megs), end='')
					print(']')
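					# os.renames also creates any missing intermediate directories on the destination path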
					os.renames(options.dir + '/.tmp', file_name)

				checked_files += 1

				# recording the new headers together with a fresh time mark
				new_headers['__time__'] = datetime.datetime.now()
				index[url] = new_headers
				index.sync()
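				# sync() flushes the shelve to disk so an interrupted run keeps the entries verified so far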

		except urllib.error.HTTPError as error:
			# on an HTTP error nothing actually needs to be done:
			# if a download stalls or fails, the temporary file is simply never moved to its final location
			print(error)

	print('[', len(unchecked_files), '/', checked_files, ']', sep='')

	# restarting the sequence as long as at least one file was checked this pass; stopping otherwise
	if checked_files == 0:
		break