Samesite - proxy that can cache partial transfers: Annotation For samesite.py

Lines of samesite.py from check-in 083ec707ea that are changed by the sequence of edits moving toward check-in 38b25713eb:

                         1: #!/usr/bin/env python3.1
                         2: 
                         3: import datetime, http.cookiejar, optparse, os, sys, shelve, re, urllib.request
                         4: 
                         5: parser = optparse.OptionParser()
                         6: parser.add_option('-v', '--verbose', action = 'store_true', dest = 'verbose', help = 'turns on verbose status notifications', metavar = 'bool', default = False)
                         7: parser.add_option('-d', '--dir', action = 'store', dest = 'dir', help = 'specify directory where the files should be stored', metavar = 'string', default = None)
                         8: parser.add_option('-r', '--root', action = 'store', dest = 'root', help = 'specify a site from which data should be mirrored', metavar = 'string', default = None)
                         9: parser.add_option('-l', '--log', action = 'store', dest = 'log', help = 'specify a log file to process', metavar = 'string', default = None)
                        10: (options, args) = parser.parse_args()
                        11: 
083ec707ea 2010-07-06   12: if not options.dir:
083ec707ea 2010-07-06   13: 	print('Directory not specified')
083ec707ea 2010-07-06   14: 	exit(1)
083ec707ea 2010-07-06   15: 
083ec707ea 2010-07-06   16: if not options.root:
083ec707ea 2010-07-06   17: 	print('Server not specified')
083ec707ea 2010-07-06   18: 	exit(1)
083ec707ea 2010-07-06   19: 
083ec707ea 2010-07-06   20: if not options.log:
083ec707ea 2010-07-06   21: 	print('Log file not specified')
083ec707ea 2010-07-06   22: 	exit(1)
083ec707ea 2010-07-06   23: 
083ec707ea 2010-07-06   24: if not os.access(options.log, os.R_OK):
083ec707ea 2010-07-06   25: 	print('Log file unreadable')
083ec707ea 2010-07-06   26: 	exit(1)
                        27: 
                        28: # this is file index - everything is stored in this file
                        29: index = shelve.open(options.dir + '/.index')
083ec707ea 2010-07-06   30: desc_fields = ('Content-Length', 'ETag', 'Pragma', 'Last-Modified')
083ec707ea 2010-07-06   31: ignore_fields = ('Accept-Ranges', 'Age', 'Cache-Control', 'Connection', 'Content-Type', 'Date', 'Expires', 'Server', 'Via', 'X-Cache', 'X-Cache-Lookup')
                        32: 
                        33: block_size = 32768
                        34: 
                        35: while True:
                        36: 	unchecked_files = set()
                        37: 	checked_files = 0
                        38: 
                        39: 	# reading log and storing found urls for processing
                        40: 	# check file mtime XXX
                        41: 	with open(options.log, 'r') as log_file:
                        42: 		log_line = re.compile('^[^ ]+ - - \[.*] "(GET|HEAD) (.*?)(\?.*)? HTTP/1.1" (\d+) \d+ "(.*)" "(.*)"$')
                        43: 		for line in log_file:
                        44: 			this_line = log_line.match(line.strip())
                        45: 			if this_line:
                        46: 				unchecked_files.add(this_line.group(2))
                        47: 
                        48: 	for url in unchecked_files:
                        49: 		reload = False
                        50: 		recheck = False
                        51: 		print('Checking file:', url)
                        52: 
                        53: 		# creating empty placeholder in index
                        54: 		if not url in index:
                        55: 			print('This one is new.')
                        56: 			index[url] = {}
                        57: 			reload = True
                        58: 
                        59: 		# creating file name from url
                        60: 		file_name = options.dir + re.compile('%20').sub(' ', url)
                        61: 
                        62: 		# forcibly checking file if no file present
                        63: 		if not reload and not os.access(file_name, os.R_OK):
                        64: 			print('File not found or inaccessible.')
                        65: 			reload = True
                        66: 
                        67: 		# forcibly checking file if file size doesn't match with index data
                        68: 		elif not reload and 'Content-Length' in index[url] and os.stat(file_name).st_size != int(index[url]['Content-Length']):
                        69: 			print('File size is ', os.stat(file_name).st_size, ' and stored file size is ', index[url]['Content-Length'], '.', sep='')
                        70: 			reload = True
                        71: 
                        72: 		# forcibly checking file if index holds Pragma header
                        73: 		if not reload and 'Pragma' in index[url] and index[url]['Pragma'] == 'no-cache':
                        74: 			print('Pragma on: recheck imminent.')
                        75: 			recheck = True
                        76: 
                        77: 		# skipping file processing if there's no need to recheck it and we have already checked it within the last 4 hours
                        78: 		if not recheck and not reload and '__time__' in index[url] and (datetime.datetime.now() - datetime.timedelta(hours = 4) - index[url]['__time__']).days < 0:
                        79: 			continue
                        80: 
                        81: 		try:
                        82: 			with urllib.request.urlopen(options.root + url) as source:
                        83: 				new_headers = {}
                        84: 				headers = source.info()
                        85: 
                        86: 				# stripping unneeded headers (XXX make this inplace?)
                        87: 				for header in headers:
                        88: 					if header in desc_fields:
                        89: 						if header == 'Pragma' and headers[header] != 'no-cache':
                        90: 							print('Pragma:', headers[header])
                        91: 						new_headers[header] = headers[header]
                        92: 					elif not header in ignore_fields:
                        93: 						print('Undefined header "', header, '": ', headers[header], sep='')
                        94: 
                        95: 				# comparing headers with data found in index
                        96: 				# if any header has changed (except Pragma) file is fully downloaded
                        97: 				# the same happens if we get more or fewer headers
                        98: 				old_keys = set(index[url].keys())
                        99: 				old_keys.discard('__time__')
                       100: 				old_keys.discard('Pragma')
                       101: 				more_keys = set(new_headers.keys()) - old_keys
                       102: 				more_keys.discard('Pragma')
                       103: 				less_keys = old_keys - set(new_headers.keys())
                       104: 				if len(more_keys) > 0:
                       105: 					if not len(old_keys) == 0:
                       106: 						print('More headers appear:', more_keys)
                       107: 					reload = True
                       108: 				elif len(less_keys) > 0:
                       109: 					print('Less headers appear:', less_keys)
083ec707ea 2010-07-06  110: 					reload = True
                       111: 				else:
                       112: 					for key in index[url].keys():
                       113: 						if key not in ('__time__', 'Pragma') and not index[url][key] == new_headers[key]:
                       114: 							print('Header "', key, '" changed from [', index[url][key], '] to [', new_headers[key], ']', sep='')
                       115: 							reload = True
                       116: 
                       117: 				# downloading file
                       118: 				if reload:
                       119: 					if 'Content-Length' in headers:
                       120: 						print('Downloading', headers['Content-Length'], 'bytes [', end='')
                       121: 					else:
                       122: 						print('Downloading [', end='')
                       123: 					sys.stdout.flush()
                       124: 
                       125: 					# file is created at temporary location and moved in place only when download completes
                       126: 					temp_file = open(options.dir + '/.tmp', 'wb')
                       127: 					buffer = source.read(block_size)
                       128: 					blocks = 0
                       129: 					megs = 0
                       130: 					while len(buffer) > 0:
                       131: 						temp_file.write(buffer)
                       132: 						print('.', end='')
                       133: 						sys.stdout.flush()
                       134: 						buffer = source.read(block_size)
                       135: 						blocks += 1
                       136: 						if blocks > 1024*1024/block_size:
                       137: 							blocks = blocks - 1024*1024/block_size
                       138: 							megs += 1
                       139: 							print('{}Mb'.format(megs), end='')
                       140: 					temp_file.close()
                       141: 					print(']')
                       142: 					os.renames(options.dir + '/.tmp', file_name)
                       143: 
                       144: 				checked_files += 1
                       145: 
                       146: 				# storing new time mark and storing new headers
                       147: 				new_headers['__time__'] = datetime.datetime.now()
                       148: 				index[url] = new_headers
                       149: 				index.sync()
                       150: 
                       151: 		except urllib.error.HTTPError as error:
                       152: 			# in case of error we don't need to do anything actually,
                       153: 			# if the file download stalls or fails, the file is not moved to its location
                       154: 			print(error)
                       155: 
                       156: 	print('[', len(unchecked_files), '/', checked_files, ']')
                       157: 
                       158: 	# restarting the sequence if any files were checked during this pass, stopping otherwise
                       159: 	if checked_files == 0:
                       160: 		break
Annotation For samesite.py

Versions of samesite.py analyzed:

Lines of samesite.py from check-in 083ec707ea that are changed by the sequence of edits moving toward check-in 38b25713eb: