Samesite - proxy that can cache partial transfers: Annotation For samesite.py

Origin for each line in samesite.py from check-in 7b27f1db02:

08ae38b6ce 2010-06-25    1: #!/usr/bin/env python3.1
08ae38b6ce 2010-06-25    2: 
08ae38b6ce 2010-06-25    3: import datetime, http.cookiejar, optparse, os, sys, shelve, re, urllib.request
08ae38b6ce 2010-06-25    4: 
08ae38b6ce 2010-06-25    5: parser = optparse.OptionParser()
08ae38b6ce 2010-06-25    6: parser.add_option('-v', '--verbose', action = 'store_true', dest = 'verbose', help = 'turns on verbose status notifications', metavar = 'bool', default = False)
08ae38b6ce 2010-06-25    7: parser.add_option('-d', '--dir', action = 'store', dest = 'dir', help = 'specify directory where the files should be stored', metavar = 'string', default = None)
08ae38b6ce 2010-06-25    8: parser.add_option('-r', '--root', action = 'store', dest = 'root', help = 'specify a site from which data should be mirrored', metavar = 'string', default = None)
08ae38b6ce 2010-06-25    9: parser.add_option('-l', '--log', action = 'store', dest = 'log', help = 'specify a log file to process', metavar = 'string', default = None)
08ae38b6ce 2010-06-25   10: (options, args) = parser.parse_args()
08ae38b6ce 2010-06-25   11: 
08ae38b6ce 2010-06-25   12: if not options.dir:
08ae38b6ce 2010-06-25   13: 	print('Directory not specified')
08ae38b6ce 2010-06-25   14: 	exit(1)
08ae38b6ce 2010-06-25   15: 
08ae38b6ce 2010-06-25   16: if not options.root:
08ae38b6ce 2010-06-25   17: 	print('Server not specified')
08ae38b6ce 2010-06-25   18: 	exit(1)
08ae38b6ce 2010-06-25   19: 
08ae38b6ce 2010-06-25   20: if not options.log:
08ae38b6ce 2010-06-25   21: 	print('Log file not specified')
08ae38b6ce 2010-06-25   22: 	exit(1)
08ae38b6ce 2010-06-25   23: 
08ae38b6ce 2010-06-25   24: if not os.access(options.log, os.R_OK):
08ae38b6ce 2010-06-25   25: 	print('Log file unreadable')
08ae38b6ce 2010-06-25   26: 	exit(1)
08ae38b6ce 2010-06-25   27: 
08ae38b6ce 2010-06-25   28: # this is file index - everything is stored in this file
08ae38b6ce 2010-06-25   29: index = shelve.open(options.dir + '/.index')
08ae38b6ce 2010-06-25   30: desc_fields = ('Content-Length', 'ETag', 'Pragma', 'Last-Modified')
08ae38b6ce 2010-06-25   31: ignore_fields = ('Accept-Ranges', 'Age', 'Cache-Control', 'Connection', 'Content-Type', 'Date', 'Expires', 'Server', 'Via', 'X-Cache', 'X-Cache-Lookup')
08ae38b6ce 2010-06-25   32: 
7b27f1db02 2010-07-01   33: block_size = 32768
7b27f1db02 2010-07-01   34: 
08ae38b6ce 2010-06-25   35: while True:
08ae38b6ce 2010-06-25   36: 	unchecked_files = set()
08ae38b6ce 2010-06-25   37: 	checked_files = 0
08ae38b6ce 2010-06-25   38: 
08ae38b6ce 2010-06-25   39: 	# reading log and storing found urls for processing
08ae38b6ce 2010-06-25   40: 	# check file mtime XXX
08ae38b6ce 2010-06-25   41: 	with open(options.log, 'r') as log_file:
7b27f1db02 2010-07-01   42: 		log_line = re.compile('^[^ ]+ - - \[.*] "(GET|HEAD) (.*?)(\?.*)? HTTP/1.1" (\d+) \d+ "(.*)" "(.*)"$')
08ae38b6ce 2010-06-25   43: 		for line in log_file:
08ae38b6ce 2010-06-25   44: 			this_line = log_line.match(line.strip())
08ae38b6ce 2010-06-25   45: 			if this_line:
7b27f1db02 2010-07-01   46: 				unchecked_files.add(this_line.group(2))
08ae38b6ce 2010-06-25   47: 
08ae38b6ce 2010-06-25   48: 	for url in unchecked_files:
08ae38b6ce 2010-06-25   49: 
08ae38b6ce 2010-06-25   50: 		# creating empty placeholder in index
08ae38b6ce 2010-06-25   51: 		if not url in index:
08ae38b6ce 2010-06-25   52: 			index[url] = {}
08ae38b6ce 2010-06-25   53: 		reload = False
08ae38b6ce 2010-06-25   54: 
08ae38b6ce 2010-06-25   55: 		# creating file name from url
08ae38b6ce 2010-06-25   56: 		file_name = options.dir + re.compile('%20').sub(' ', url)
08ae38b6ce 2010-06-25   57: 
08ae38b6ce 2010-06-25   58: 		# forcibly checking file if no file present
08ae38b6ce 2010-06-25   59: 		if not os.access(file_name, os.R_OK):
08ae38b6ce 2010-06-25   60: 			reload = True
08ae38b6ce 2010-06-25   61: 
08ae38b6ce 2010-06-25   62: 		# forcibly checking file if file size doesn't match with index data
08ae38b6ce 2010-06-25   63: 		elif 'Content-Length' in index[url] and os.stat(file_name).st_size != int(index[url]['Content-Length']):
08ae38b6ce 2010-06-25   64: 			print('File size is', os.stat(file_name).st_size, 'and stored file size is', index[url]['Content-Length'])
08ae38b6ce 2010-06-25   65: 			reload = True
08ae38b6ce 2010-06-25   66: 
08ae38b6ce 2010-06-25   67: 		# forcibly checking file if index holds a 'Pragma: no-cache' header
08ae38b6ce 2010-06-25   68: 		if 'Pragma' in index[url] and index[url]['Pragma'] == 'no-cache':
08ae38b6ce 2010-06-25   69: 			reload = True
08ae38b6ce 2010-06-25   70: 
08ae38b6ce 2010-06-25   71: 		# skipping file processing if there's no need to recheck it and we have already checked it within the last 4 hours
08ae38b6ce 2010-06-25   72: 		if not reload and '__time__' in index[url] and (datetime.datetime.now() - datetime.timedelta(hours = 4) - index[url]['__time__']).days < 0:
08ae38b6ce 2010-06-25   73: 			continue
08ae38b6ce 2010-06-25   74: 		print('Checking file:', url)
08ae38b6ce 2010-06-25   75: 		try:
08ae38b6ce 2010-06-25   76: 			with urllib.request.urlopen(options.root + url) as source:
08ae38b6ce 2010-06-25   77: 				new_headers = {}
08ae38b6ce 2010-06-25   78: 				headers = source.info()
08ae38b6ce 2010-06-25   79: 
08ae38b6ce 2010-06-25   80: 				# stripping unneeded headers (XXX make this inplace?)
08ae38b6ce 2010-06-25   81: 				for header in headers:
08ae38b6ce 2010-06-25   82: 					if header in desc_fields:
08ae38b6ce 2010-06-25   83: 						if header == 'Pragma' and headers[header] != 'no-cache':
08ae38b6ce 2010-06-25   84: 							print('Pragma:', headers[header])
08ae38b6ce 2010-06-25   85: 						new_headers[header] = headers[header]
08ae38b6ce 2010-06-25   86: 					elif not header in ignore_fields:
7b27f1db02 2010-07-01   87: 						print('Undefined header "', header, '": ', headers[header], sep='')
08ae38b6ce 2010-06-25   88: 
08ae38b6ce 2010-06-25   89: 				# comparing headers with data found in index
08ae38b6ce 2010-06-25   90: 				# if any header has changed (except Pragma) the file is downloaded again in full
08ae38b6ce 2010-06-25   91: 				# same if we get more or fewer headers than the index holds
08ae38b6ce 2010-06-25   92: 				old_keys = set(index[url].keys())
08ae38b6ce 2010-06-25   93: 				old_keys.discard('__time__')
08ae38b6ce 2010-06-25   94: 				old_keys.discard('Pragma')
08ae38b6ce 2010-06-25   95: 				more_keys = set(new_headers.keys()) - old_keys
08ae38b6ce 2010-06-25   96: 				more_keys.discard('Pragma')
08ae38b6ce 2010-06-25   97: 				less_keys = old_keys - set(new_headers.keys())
08ae38b6ce 2010-06-25   98: 				if len(more_keys) > 0:
08ae38b6ce 2010-06-25   99: 					print('More headers appear:', more_keys)
08ae38b6ce 2010-06-25  100: 					reload = True
08ae38b6ce 2010-06-25  101: 				elif len(less_keys) > 0:
08ae38b6ce 2010-06-25  102: 					print('Less headers appear:', less_keys)
08ae38b6ce 2010-06-25  103: 					reload = True
08ae38b6ce 2010-06-25  104: 				else:
08ae38b6ce 2010-06-25  105: 					for key in index[url].keys():
08ae38b6ce 2010-06-25  106: 						if key not in ('__time__', 'Pragma') and not index[url][key] == new_headers[key]:
7b27f1db02 2010-07-01  107: 							print('Header "', key, '" changed from [', index[url][key], '] to [', new_headers[key], ']', sep='')
08ae38b6ce 2010-06-25  108: 							reload = True
08ae38b6ce 2010-06-25  109: 
08ae38b6ce 2010-06-25  110: 				# downloading file
08ae38b6ce 2010-06-25  111: 				if reload:
08ae38b6ce 2010-06-25  112: 					if 'Content-Length' in headers:
08ae38b6ce 2010-06-25  113: 						print('Downloading', headers['Content-Length'], 'bytes [', end='')
08ae38b6ce 2010-06-25  114: 					else:
08ae38b6ce 2010-06-25  115: 						print('Downloading [', end='')
08ae38b6ce 2010-06-25  116: 					sys.stdout.flush()
08ae38b6ce 2010-06-25  117: 
08ae38b6ce 2010-06-25  118: 					# file is created at temporary location and moved in place only when download completes
08ae38b6ce 2010-06-25  119: 					temp_file = open(options.dir + '/.tmp', 'wb')
7b27f1db02 2010-07-01  120: 					buffer = source.read(block_size)
7b27f1db02 2010-07-01  121: 					blocks = 0
7b27f1db02 2010-07-01  122: 					megs = 0
08ae38b6ce 2010-06-25  123: 					while len(buffer) > 0:
08ae38b6ce 2010-06-25  124: 						temp_file.write(buffer)
08ae38b6ce 2010-06-25  125: 						print('.', end='')
08ae38b6ce 2010-06-25  126: 						sys.stdout.flush()
7b27f1db02 2010-07-01  127: 						buffer = source.read(block_size)
7b27f1db02 2010-07-01  128: 						blocks += 1
7b27f1db02 2010-07-01  129: 						if blocks > 1024*1024/block_size:
7b27f1db02 2010-07-01  130: 							blocks = blocks - 1024*1024/block_size
7b27f1db02 2010-07-01  131: 							megs += 1
7b27f1db02 2010-07-01  132: 							print('{}Mb'.format(megs), end='')
08ae38b6ce 2010-06-25  133: 					temp_file.close()
08ae38b6ce 2010-06-25  134: 					print(']')
08ae38b6ce 2010-06-25  135: 					os.renames(options.dir + '/.tmp', file_name)
08ae38b6ce 2010-06-25  136: 
08ae38b6ce 2010-06-25  137: 				checked_files += 1
08ae38b6ce 2010-06-25  138: 
08ae38b6ce 2010-06-25  139: 				# storing new time mark and storing new headers
08ae38b6ce 2010-06-25  140: 				new_headers['__time__'] = datetime.datetime.now()
08ae38b6ce 2010-06-25  141: 				index[url] = new_headers
7b27f1db02 2010-07-01  142: 				index.sync()
08ae38b6ce 2010-06-25  143: 
08ae38b6ce 2010-06-25  144: 		except urllib.error.HTTPError as error:
08ae38b6ce 2010-06-25  145: 			# in case of an error we don't actually need to do anything:
08ae38b6ce 2010-06-25  146: 			# if the file download stalls or fails, the file is not moved to its location
08ae38b6ce 2010-06-25  147: 			print(error)
08ae38b6ce 2010-06-25  148: 
08ae38b6ce 2010-06-25  149: 	print('[', len(unchecked_files), '/', checked_files, ']')
08ae38b6ce 2010-06-25  150: 
08ae38b6ce 2010-06-25  151: 	# stopping when no files were checked during this pass, otherwise restarting the sequence
08ae38b6ce 2010-06-25  152: 	if checked_files == 0:
08ae38b6ce 2010-06-25  153: 		break
Annotation For samesite.py

Versions of samesite.py analyzed:

Origin for each line in samesite.py from check-in 7b27f1db02: