Samesite - proxy that can cache partial transfers

Annotation For samesite.py

Lines of samesite.py from check-in 827033dd7e that are changed by the sequence of edits moving toward check-in 80f8e3804a:

                         1: #!/usr/bin/env python3.1
                         2: 
                         3: import datetime, http.cookiejar, optparse, os, sys, shelve, re, urllib.error, urllib.request
                         4: 
                         5: parser = optparse.OptionParser()
                         6: parser.add_option('-v', '--verbose', action = 'store_true', dest = 'verbose', help = 'turns on verbose status notifications', metavar = 'bool', default = False)
                         7: parser.add_option('-d', '--dir', action = 'store', dest = 'dir', help = 'specify directory where the files should be stored', metavar = 'string', default = None)
                         8: parser.add_option('-r', '--root', action = 'store', dest = 'root', help = 'specify a site from which data should be mirrored', metavar = 'string', default = None)
                         9: parser.add_option('-l', '--log', action = 'store', dest = 'log', help = 'specify a log file to process', metavar = 'string', default = None)
                        10: parser.add_option('-e', '--skip-etag', action = 'store_true', dest = 'noetag', help = 'do not process etags', metavar = 'bool', default = False)
                        11: (options, args) = parser.parse_args()
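
# Aside: optparse was the standard library choice as of python3.1; on 3.2 and
# later, argparse is the idiomatic replacement. A minimal equivalent sketch
# (an assumption for illustration, not part of this check-in):
#
#   import argparse
#   parser = argparse.ArgumentParser()
#   parser.add_argument('-v', '--verbose', action='store_true', default=False)
#   parser.add_argument('-d', '--dir')
#   parser.add_argument('-r', '--root')
#   parser.add_argument('-l', '--log')
#   parser.add_argument('-e', '--skip-etag', action='store_true', dest='noetag', default=False)
#   options = parser.parse_args()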
                        12: 
                        13: assert options.dir, 'Directory not specified'
                        14: assert options.root, 'Server not specified'
827033dd7e 2010-08-04   15: assert options.log, 'Log file not specified'
827033dd7e 2010-08-04   16: assert os.access(options.log, os.R_OK), 'Log file unreadable'
                        17: 
                         18: # this is the file index - everything is stored in this file
827033dd7e 2010-08-04   19: index = shelve.open(options.dir + '/.index')
                        20: desc_fields = ('Content-Length', 'Pragma', 'Last-Modified')
                        21: ignore_fields = ('Accept-Ranges', 'Age', 'Cache-Control', 'Connection', 'Content-Type', 'Date', 'Expires', 'Server', 'Via', 'X-Cache', 'X-Cache-Lookup', 'X-Powered-By')
                        22: 
                        23: if not options.noetag:
                        24: 	desc_fields += 'ETag',
                        25: else:
                        26: 	ignore_fields += 'ETag',
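
# For orientation, a hypothetical index entry as this script builds one (URL
# and values are invented): each URL path maps to the headers kept from the
# last response plus a '__time__' mark recorded after a successful check:
#
#   index['/images/disk.iso'] = {'Content-Length': '733079', 'ETag': '"abc-123"',
#                                'Last-Modified': 'Wed, 04 Aug 2010 10:00:00 GMT',
#                                '__time__': datetime.datetime(2010, 8, 4, 14, 0)}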
                        27: 
827033dd7e 2010-08-04   28: block_size = 32768
827033dd7e 2010-08-04   29: 
827033dd7e 2010-08-04   30: while True:
827033dd7e 2010-08-04   31: 	unchecked_files = set()
827033dd7e 2010-08-04   32: 	checked_files = 0
827033dd7e 2010-08-04   33: 
827033dd7e 2010-08-04   34: 	# reading log and storing found urls for processing
827033dd7e 2010-08-04   35: 	# check file mtime XXX
827033dd7e 2010-08-04   36: 	with open(options.log, 'r') as log_file:
827033dd7e 2010-08-04   37: 		log_line = re.compile(r'^[^ ]+ - - \[.*\] "(GET|HEAD) (.*?)(\?.*)? HTTP/1\.1" (\d+) \d+ "(.*)" "(.*)"$')
827033dd7e 2010-08-04   38: 		for line in log_file:
827033dd7e 2010-08-04   39: 			this_line = log_line.match(line.strip())
827033dd7e 2010-08-04   40: 			if this_line:
827033dd7e 2010-08-04   41: 				unchecked_files.add(this_line.group(2))
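
# For reference, a made-up access-log line in the "combined" format the
# pattern above expects; the path lands in group(2), and the query string
# captured by group(3) is deliberately dropped when the URL is stored:
#
#   1.2.3.4 - - [04/Aug/2010:14:00:00 +0400] "GET /images/disk.iso?mirror=1 HTTP/1.1" 200 733079 "-" "Wget/1.12"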
827033dd7e 2010-08-04   42: 
827033dd7e 2010-08-04   43: 	for url in unchecked_files:
827033dd7e 2010-08-04   44: 		reload = False
827033dd7e 2010-08-04   45: 		recheck = False
827033dd7e 2010-08-04   46: 		info = 'Checking file: ' + url
827033dd7e 2010-08-04   47: 
827033dd7e 2010-08-04   48: 		# creating empty placeholder in index
827033dd7e 2010-08-04   49: 		if url not in index:
827033dd7e 2010-08-04   50: 			info += '\nThis one is new.'
827033dd7e 2010-08-04   51: 			index[url] = {}
827033dd7e 2010-08-04   52: 			reload = True
827033dd7e 2010-08-04   53: 
827033dd7e 2010-08-04   54: 		# creating file name from url
827033dd7e 2010-08-04   55: 		file_name = options.dir + re.compile('%20').sub(' ', url)
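
# Note: the substitution above only decodes %20. A broader alternative (an
# assumption, not what this check-in does) would decode every %-escape after
# an additional 'import urllib.parse':
#
#   file_name = options.dir + urllib.parse.unquote(url)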
827033dd7e 2010-08-04   56: 
827033dd7e 2010-08-04   57: 		# forcibly checking the file if no local copy is present
827033dd7e 2010-08-04   58: 		if not reload and not os.access(file_name, os.R_OK):
827033dd7e 2010-08-04   59: 			info += '\nFile not found or inaccessible.'
827033dd7e 2010-08-04   60: 			reload = True
827033dd7e 2010-08-04   61: 
827033dd7e 2010-08-04   62: 		# forcibly checking the file if its size doesn't match the index data
827033dd7e 2010-08-04   63: 		elif not reload and 'Content-Length' in index[url] and os.stat(file_name).st_size != int(index[url]['Content-Length']):
827033dd7e 2010-08-04   64: 			info += '\nFile size is ' + str(os.stat(file_name).st_size) + ' and stored file size is ' + index[url]['Content-Length'] + '.'
827033dd7e 2010-08-04   65: 			reload = True
827033dd7e 2010-08-04   66: 
827033dd7e 2010-08-04   67: 		# forcibly rechecking the file if the index holds a Pragma header
827033dd7e 2010-08-04   68: 		if not reload and 'Pragma' in index[url] and index[url]['Pragma'] == 'no-cache':
827033dd7e 2010-08-04   69: 			info += '\nPragma on: recheck imminent.'
827033dd7e 2010-08-04   70: 			recheck = True
827033dd7e 2010-08-04   71: 
827033dd7e 2010-08-04   72: 		if options.verbose:
827033dd7e 2010-08-04   73: 			print(info)
827033dd7e 2010-08-04   74: 
827033dd7e 2010-08-04   75: 		# skipping file processing if there's no need to recheck it and it was checked less than 4 hours ago
827033dd7e 2010-08-04   76: 		if not recheck and not reload and '__time__' in index[url] and (datetime.datetime.now() - datetime.timedelta(hours = 4) - index[url]['__time__']).days < 0:
827033dd7e 2010-08-04   77: 			continue
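
# A restatement of the age test above: (now - 4h - stored).days < 0 holds
# exactly when stored > now - 4h, i.e. when the URL was checked within the
# last four hours. timedelta normalizes its fields, so any negative interval
# shorter than a day has days == -1, which makes the sub-day comparison work.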
827033dd7e 2010-08-04   78: 
827033dd7e 2010-08-04   79: 		try:
827033dd7e 2010-08-04   80: 			with urllib.request.urlopen(options.root + url) as source:
827033dd7e 2010-08-04   81: 				new_headers = {}
827033dd7e 2010-08-04   82: 				headers = source.info()
827033dd7e 2010-08-04   83: 				if not options.verbose:
827033dd7e 2010-08-04   84: 					print(info)
827033dd7e 2010-08-04   85: 
827033dd7e 2010-08-04   86: 				# stripping unneeded headers (XXX make this inplace?)
827033dd7e 2010-08-04   87: 				for header in headers:
827033dd7e 2010-08-04   88: 					if header in desc_fields:
827033dd7e 2010-08-04   89: 						if header == 'Pragma' and headers[header] != 'no-cache':
827033dd7e 2010-08-04   90: 							print('Pragma:', headers[header])
827033dd7e 2010-08-04   91: 						new_headers[header] = headers[header]
827033dd7e 2010-08-04   92: 					elif header not in ignore_fields:
827033dd7e 2010-08-04   93: 						print('Undefined header "', header, '": ', headers[header], sep='')
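
# An illustrative pass through the loop above (header names are examples):
# for a response carrying Content-Length, ETag, Date and X-Frobnicate, the
# first two are copied into new_headers, Date is silently ignored, and
# X-Frobnicate is reported as an undefined header.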
827033dd7e 2010-08-04   94: 
827033dd7e 2010-08-04   95: 				# comparing headers with data found in index
827033dd7e 2010-08-04   96: 				# if any header has changed (except Pragma) the file is downloaded in full
827033dd7e 2010-08-04   97: 				# the same applies if we get more or fewer headers
827033dd7e 2010-08-04   98: 				old_keys = set(index[url].keys())
827033dd7e 2010-08-04   99: 				old_keys.discard('__time__')
827033dd7e 2010-08-04  100: 				old_keys.discard('Pragma')
827033dd7e 2010-08-04  101: 				more_keys = set(new_headers.keys()) - old_keys
827033dd7e 2010-08-04  102: 				more_keys.discard('Pragma')
827033dd7e 2010-08-04  103: 				less_keys = old_keys - set(new_headers.keys())
827033dd7e 2010-08-04  104: 				if len(more_keys) > 0:
827033dd7e 2010-08-04  105: 					if old_keys:
827033dd7e 2010-08-04  106: 						print('More headers appear:', more_keys)
827033dd7e 2010-08-04  107: 					reload = True
827033dd7e 2010-08-04  108: 				elif len(less_keys) > 0:
827033dd7e 2010-08-04  109: 					print('Fewer headers appear:', less_keys)
827033dd7e 2010-08-04  110: 				else:
827033dd7e 2010-08-04  111: 					for key in index[url].keys():
827033dd7e 2010-08-04  112: 						if key not in ('__time__', 'Pragma') and index[url][key] != new_headers[key]:
827033dd7e 2010-08-04  113: 							print('Header "', key, '" changed from [', index[url][key], '] to [', new_headers[key], ']', sep='')
827033dd7e 2010-08-04  114: 							reload = True
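
# A small worked example of the set algebra above (values are invented):
# with old_keys = {'Content-Length', 'ETag'} and new_headers holding only
# Content-Length, more_keys is empty and less_keys = {'ETag'}; the loss is
# reported but, as written, does not set reload, so only new or changed
# headers force a full download.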
827033dd7e 2010-08-04  115: 
827033dd7e 2010-08-04  116: 				# downloading file
827033dd7e 2010-08-04  117: 				if reload:
827033dd7e 2010-08-04  118: 					if 'Content-Length' in headers:
827033dd7e 2010-08-04  119: 						print('Downloading', headers['Content-Length'], 'bytes [', end='')
827033dd7e 2010-08-04  120: 					else:
827033dd7e 2010-08-04  121: 						print('Downloading [', end='')
827033dd7e 2010-08-04  122: 					sys.stdout.flush()
827033dd7e 2010-08-04  123: 
827033dd7e 2010-08-04  124: 					# file is created at temporary location and moved in place only when download completes
827033dd7e 2010-08-04  125: 					temp_file = open(options.dir + '/.tmp', 'wb')
827033dd7e 2010-08-04  126: 					buffer = source.read(block_size)
827033dd7e 2010-08-04  127: 					blocks = 0
827033dd7e 2010-08-04  128: 					megs = 0
827033dd7e 2010-08-04  129: 					while len(buffer) > 0:
827033dd7e 2010-08-04  130: 						temp_file.write(buffer)
827033dd7e 2010-08-04  131: 						print('.', end='')
827033dd7e 2010-08-04  132: 						sys.stdout.flush()
827033dd7e 2010-08-04  133: 						buffer = source.read(block_size)
827033dd7e 2010-08-04  134: 						blocks += 1
827033dd7e 2010-08-04  135: 						if blocks > 1024*1024//block_size:
827033dd7e 2010-08-04  136: 							blocks = blocks - 1024*1024//block_size
827033dd7e 2010-08-04  137: 							megs += 1
827033dd7e 2010-08-04  138: 							print('{}Mb'.format(megs), end='')
827033dd7e 2010-08-04  139: 					temp_file.close()
827033dd7e 2010-08-04  140: 					print(']')
827033dd7e 2010-08-04  141: 					os.renames(options.dir + '/.tmp', file_name)
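
# Two observations on the block above (notes, not changes): with block_size
# = 32768 there are 1024*1024//block_size = 32 blocks per megabyte, so an
# 'Mb' marker is printed roughly every 32 blocks; and os.renames() creates
# any missing directories before renaming, the rename itself being atomic on
# POSIX within a single filesystem, which is presumably why the temporary
# file lives under options.dir rather than /tmp.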
827033dd7e 2010-08-04  142: 
827033dd7e 2010-08-04  143: 				checked_files += 1
827033dd7e 2010-08-04  144: 
827033dd7e 2010-08-04  145: 				# storing new time mark and storing new headers
827033dd7e 2010-08-04  146: 				new_headers['__time__'] = datetime.datetime.now()
827033dd7e 2010-08-04  147: 				index[url] = new_headers
827033dd7e 2010-08-04  148: 				index.sync()
827033dd7e 2010-08-04  149: 
827033dd7e 2010-08-04  150: 		except urllib.error.HTTPError as error:
827033dd7e 2010-08-04  151: 			# in case of an HTTP error there is nothing to do:
827033dd7e 2010-08-04  152: 			# if the download stalls or fails, the file is simply never moved into its final location
827033dd7e 2010-08-04  153: 			print(error)
827033dd7e 2010-08-04  154: 
827033dd7e 2010-08-04  155: 	if options.verbose:
827033dd7e 2010-08-04  156: 		print('[', len(unchecked_files), '/', checked_files, ']')
827033dd7e 2010-08-04  157: 
827033dd7e 2010-08-04  158: 	# if any files were checked on this pass, re-read the log and go again; stop once a pass checks nothing
827033dd7e 2010-08-04  159: 	if checked_files == 0:
827033dd7e 2010-08-04  160: 		break
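
A usage sketch (the directory, root URL and log path are invented): options.dir is joined to the request path by plain concatenation, so it should be given without a trailing slash, and options.root supplies the scheme and host that the logged paths lack. The loop re-reads the log on every pass and exits once a pass checks no files.

    python3.1 samesite.py -v -d /srv/mirror -r http://upstream.example.org -l /var/log/nginx/access.log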