Lines of samesite.py from check-in 31a8af9ff1 that are changed by the sequence of edits moving toward check-in a81f1a70fb:
1: #!/usr/bin/env python
2:
3: from __future__ import unicode_literals, print_function
4:
5: import bsddb.dbshelve, copy, datetime, os, BaseHTTPServer, sys, spacemap, re, urllib2
6:
7: class Config:
8: __slots__ = frozenset(['_config', '_default', '_section', 'options', 'root'])
9: _default = {
10: 'general': {
11: 'port': '8008',
12: },
13: '_other': {
14: 'verbose': 'no',
15: 'noetag': 'no',
16: 'noparts': 'no',
17: 'strip': '',
18: 'sub': '',
19: },}
20:
21: # function to read in config file
22: def __init__(self):
23: import ConfigParser, optparse
24:
25: parser = optparse.OptionParser()
26: parser.add_option('-c', '--config', dest = 'config', help = 'config file location', metavar = 'FILE', default = 'samesite.conf')
27: (self.options, args) = parser.parse_args()
28:
29: assert os.access(self.options.config, os.R_OK), "Fatal error: can't read {}".format(self.options.config)
30:
31: configDir = re.compile('^(.*)/[^/]+$').match(self.options.config)
32: if configDir:
33: self.root = configDir.group(1)
34: else:
35: self.root = os.getcwd()
36:
37: self._config = ConfigParser.ConfigParser()
38: self._config.readfp(open(self.options.config))
39:
40: for section in self._config.sections():
41: if section != 'general':
42: if self._config.has_option(section, 'dir'):
43: if re.compile('^/$').match(self._config.get(section, 'dir')):
44: self._config.set(section, 'dir', self.root + os.sep + section)
45: thisDir = re.compile('^(.*)/$').match(self._config.get(section, 'dir'))
46: if thisDir:
47: self._config.set(section, 'dir', thisDir.group(1))
48: if not re.compile('^/(.*)$').match(self._config.get(section, 'dir')):
49: self._config.set(section, 'dir', self.root + os.sep + self._config.get(section, 'dir'))
50: else:
51: self._config.set(section, 'dir', self.root + os.sep + section)
52:
53: if not self._config.has_option(section, 'root'):
54: self._config.set(section, 'root', section)
55:
56: # function to select config file section or create one
57: def section(self, section):
58: if not self._config.has_section(section):
59: self._config.add_section(section)
60: self._section = section
61:
62: # function to get a config parameter; if the parameter doesn't exist, the default
63: # value or None is substituted
64: def __getitem__(self, name):
65: if not self._config.has_option(self._section, name):
66: if self._section in self._default:
67: if name in self._default[self._section]:
68: self._config.set(self._section, name, self._default[self._section][name])
69: else:
70: self._config.set(self._section, name, None)
71: elif name in self._default['_other']:
72: self._config.set(self._section, name, self._default['_other'][name])
73: else:
74: self._config.set(self._section, name, None)
75: return(self._config.get(self._section, name))
76:
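# A minimal sketch (illustrative, not part of the check-in) of the kind of
# config file the Config class above expects; the host name and paths below
# are assumptions:
#
#   [general]
#   port = 8008
#
#   [mirror.example.org]
#   root = upstream.example.org
#   dir = /var/cache/samesite/mirror
#
# With such a file, lookups fall back to the _default tables, e.g.
#   config.section('mirror.example.org')
#   config['root']    # 'upstream.example.org', taken from the file
#   config['noetag']  # 'no', taken from _default['_other']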
77: config = Config()
78:
79: #assert options.port or os.access(options.log, os.R_OK), 'Log file unreadable'
80:
81: const_desc_fields = set(['content-length', 'last-modified', 'pragma'])
82: const_ignore_fields = set([
83: 'accept-ranges', 'age',
84: 'cache-control', 'connection', 'content-type',
85: 'date',
86: 'expires',
87: 'referer',
88: 'server',
89: 'via',
90: 'x-cache', 'x-cache-lookup', 'x-livetool', 'x-powered-by',
91: ])
92:
93: block_size = 4096
94:
95: class MyRequestHandler(BaseHTTPServer.BaseHTTPRequestHandler):
96: def __process(self):
97: # reload means file needs to be reloaded to serve request
98: reload = False
99: # recheck means the file needs to be checked; this also means that if the file has been modified we can still serve the older copy
100: recheck = False
101: # file_stat means file definitely exists
102: file_stat = None
103: # requested_ranges holds data about any range requested
104: requested_ranges = None
105: # record holds data from the index locally; it should be written back upon successful completion
106: record = None
107:
108: myPath = re.compile('^(.*?)(\?.*)$').match(self.path)
109: if myPath:
110: my_path = myPath.group(1)
111: else:
112: my_path = self.path
113:
114: config.section(self.headers['host'])
115:
116: if config['sub'] != None and config['strip'] != None and len(config['strip']) > 0:
117: string = re.compile(config['strip']).sub(config['sub'], my_path)
118: my_path = string
119:
120: info = 'Checking file: ' + my_path
121:
122: if not os.access(config['dir'], os.X_OK):
123: os.mkdir(config['dir'])
124: # this is the file index - everything is stored in this file (one record is sketched below)
125: # _parts - space map of the parts of the file we do not have yet
126: # _time - the last time the file was checked
127: # everything else is just the headers
128: index = bsddb.dbshelve.open(config['dir'] + os.sep + '.index')
129:
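# A sketch of one such index record (the path and values are illustrative,
# not data from the check-in):
#
#   index['/dists/stable/Release'] = {
#       '_parts': spacemap.SpaceMap(),            # empty map: file complete
#       '_time': datetime.datetime(2012, 1, 16),  # last time it was checked
#       'content-length': 4096,
#       'last-modified': 'Mon, 16 Jan 2012 00:00:00 GMT',
#   }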
130: desc_fields = const_desc_fields.copy()
131: ignore_fields = const_ignore_fields.copy()
132: if config['noetag'] == 'no':
133: desc_fields.add('etag')
134: else:
135: ignore_fields.add('etag')
136:
137: proxy_ignored = set([
138: 'accept', 'accept-charset', 'accept-encoding', 'accept-language',
139: 'cache-control', 'connection', 'content-length', 'cookie',
140: 'host',
141: 'if-modified-since', 'if-unmodified-since',
142: 'referer',
143: 'user-agent',
144: 'via',
145: 'x-forwarded-for', 'x-last-hr', 'x-last-http-status-code', 'x-removed', 'x-real-ip', 'x-retry-count',
146: ])
147:
148: print('===============[ {} request ]==='.format(self.command))
149:
150: for header in self.headers:
151: if header in proxy_ignored:
152: pass
153: elif header in ('range'):
154: isRange = re.compile('bytes=(\d+)-(\d+)').match(self.headers[header])
155: if isRange:
156: requested_ranges = spacemap.SpaceMap({int(isRange.group(1)): int(isRange.group(2)) + 1})
157: else:
158: return()
159: elif header in ('pragma'):
160: if my_path in index:
161: index[my_path][header] = self.headers[header]
162: else:
163: print('Unknown header - ', header, ': ', self.headers[header], sep='')
164: return()
165: print(header, self.headers[header])
166:
167: # creating file name from my_path
168: file_name = config['dir'] + os.sep + re.compile('%20').sub(' ', my_path)
169: # partial file or unfinished download
170: temp_name = config['dir'] + os.sep + '.parts' + re.compile('%20').sub(' ', my_path)
171:
172: # creating an empty placeholder in the index
173: # if there's no space map and there's no file in the real directory - we have no file
174: # if there's an empty space map - the file is complete
175: # the space map generally covers every bit of the file we don't possess currently (see the sketch after this block)
176: if not my_path in index:
177: info += '\nThis one is new.'
178: reload = True
179: record = {}
180: else:
181: # forcibly checking file if no file present
182: record = index[my_path]
183: if os.access(file_name, os.R_OK):
184: info += '\nFull file found.'
185: file_stat = os.stat(file_name)
186: elif '_parts' in index[my_path] and os.access(temp_name, os.R_OK):
187: info += '\nPartial file found.'
188: file_stat = os.stat(temp_name)
31a8af9ff1 2012-01-16 189: recheck = True
190: else:
191: info += '\nFile not found or inaccessible.'
192: record['_parts'] = None
193: reload = True
194:
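# A minimal sketch of the space-map convention described above, assuming
# spacemap.SpaceMap behaves the way the calls in this file suggest (the byte
# counts are made up):
#
#   missing = spacemap.SpaceMap({0: 1000})           # whole 1000-byte file missing
#   missing = missing - spacemap.SpaceMap({0: 600})  # first 600 bytes written
#   missing.rewind()
#   missing.pop()                   # -> (600, 1000): the one remaining gap
#   missing == spacemap.SpaceMap()  # True only once nothing is missing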
195: if not '_parts' in record:
196: record['_parts'] = None
197:
198: if record['_parts'] == None:
199: recheck = True
200:
201: # forcibly checking the file if its size doesn't match the index data
202: if not reload:
203: if '_parts' in record and record['_parts'] == spacemap.SpaceMap():
204: if 'content-length' in record and file_stat and file_stat.st_size != int(record['content-length']):
205: info += '\nFile size is {} and stored file size is {}.'.format(file_stat.st_size, record['content-length'])
206: record['_parts'] = None
207: reload = True
208:
209: # forcibly checking file if index holds Pragma header
210: if not reload and 'pragma' in record and record['pragma'] == 'no-cache':
211: info +='\nPragma on: recheck imminent.'
212: recheck = True
213:
214: # forcing a recheck if the file hasn't been checked within the last 4 hours
215: if not recheck and not reload and '_time' in record and (record['_time'] - datetime.datetime.now() + datetime.timedelta(hours = 4)).days < 0:
216: info += '\nFile is old - rechecking.'
217: recheck = True
218:
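# Worked example for the age test above (times are illustrative): if _time is
# 5 hours old, _time - now + 4h is roughly -1 hour, whose .days is -1 (< 0),
# so a recheck is forced; if _time is 1 hour old the result is roughly
# +3 hours, .days is 0, and the cached copy is still considered fresh.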
219: print(info)
220: if reload or recheck:
221:
222: try:
223: request = 'http://' + config['root'] + self.path
224: my_headers = {}
225: for header in ('cache-control', 'cookie', 'referer', 'user-agent'):
226: if header in self.headers:
227: my_headers[header] = self.headers[header]
228:
229: needed = None
230: if self.command not in ('HEAD'):
231: if '_parts' in record and record['_parts'] != None:
232: if config['noparts'] != 'no' or requested_ranges == None or requested_ranges == spacemap.SpaceMap():
233: needed = record['_parts']
234: else:
235: needed = record['_parts'] & requested_ranges
236: elif config['noparts'] =='no' and requested_ranges != None and requested_ranges != spacemap.SpaceMap():
237: needed = requested_ranges
238: ranges = ()
239: print('Missing ranges: {}, requested ranges: {}, needed ranges: {}.'.format(record['_parts'], requested_ranges, needed))
240: if needed != None and len(needed) > 0:
241: needed.rewind()
242: while True:
243: range = needed.pop()
244: if range[0] == None:
245: break
246: ranges += '{}-{}'.format(range[0], range[1] - 1),
247: my_headers['range'] = 'bytes=' + ','.join(ranges)
248:
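# Illustration with assumed values: if needed covers the byte ranges
# {0: 100, 500: 1000}, the loop above yields ranges == ('0-99', '500-999')
# and sends my_headers['range'] == 'bytes=0-99,500-999'; HTTP byte ranges are
# inclusive, hence the "- 1" on each upper bound.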
249: my_headers['Accept-Encoding'] = 'gzip'
250: request = urllib2.Request(request, headers = my_headers)
251:
31a8af9ff1 2012-01-16 252: source = urllib2.urlopen(request)
253: new_record = {}
254: new_record['_parts'] = record['_parts']
255: headers = source.info()
256:
257: if 'content-encoding' in headers and headers['content-encoding'] == 'gzip':
31a8af9ff1 2012-01-16 258: import gzip, StringIO
31a8af9ff1 2012-01-16 259: buf = StringIO.StringIO(source.read())
31a8af9ff1 2012-01-16 260: source = gzip.GzipFile(fileobj=buf)
261:
262: # stripping unneeded headers (XXX make this inplace?)
263: for header in headers:
264: if header in desc_fields:
265: #if header == 'Pragma' and headers[header] != 'no-cache':
266: if header == 'content-length':
267: if 'content-range' not in headers:
268: new_record[header] = int(headers[header])
269: else:
270: new_record[header] = headers[header]
271: elif header == 'content-range':
272: range = re.compile('^bytes (\d+)-(\d+)/(\d+)$').match(headers[header])
273: if range:
274: new_record['content-length'] = int(range.group(3))
275: else:
276: assert False, 'Content-Range unrecognized.'
277: elif not header in ignore_fields:
278: print('Undefined header "', header, '": ', headers[header], sep='')
279:
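# Example with assumed numbers: a partial reply may carry
#   Content-Range: bytes 0-4095/1048576
# in which case the branch above stores content-length = 1048576 (the total
# size of the file), not the size of the fragment received.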
280: # comparing headers with the data found in the index
281: # if any header (except Pragma) has changed, the file is downloaded again in full
282: # the same happens if we get more or fewer headers than before
283: old_keys = set(record.keys())
284: old_keys.discard('_time')
285: old_keys.discard('pragma')
286: more_keys = set(new_record.keys()) - old_keys
287: more_keys.discard('pragma')
288: less_keys = old_keys - set(new_record.keys())
289: if len(more_keys) > 0:
290: if len(old_keys) != 0:
291: print('More headers appear:', more_keys)
292: reload = True
293: elif len(less_keys) > 0:
294: print('Fewer headers appear:', less_keys)
295: else:
296: for key in record.keys():
297: if key[0] != '_' and key != 'pragma' and record[key] != new_record[key]:
298: print('Header "', key, '" changed from [', record[key], '] to [', new_record[key], ']', sep='')
299: print(type(record[key]), type(new_record[key]))
300: reload = True
301:
302: if reload:
303: print('Reloading.')
304: if os.access(temp_name, os.R_OK):
305: os.unlink(temp_name)
306: if os.access(file_name, os.R_OK):
307: os.unlink(file_name)
308: if 'content-length' in new_record:
309: new_record['_parts'] = spacemap.SpaceMap({0: int(new_record['content-length'])})
310: if not new_record['_parts']:
311: new_record['_parts'] = spacemap.SpaceMap()
312: print(new_record)
313:
314: # downloading file or segment
315: if 'content-length' in new_record:
316: if needed == None:
317: needed = new_record['_parts']
318: else:
319: if len(needed) > 1:
320: print("Multipart requests currently not supported.")
321: assert False, 'Skip this one for now.'
322: #else:
323: #assert False, 'No content-length or Content-Range header.'
324:
325: new_record['_time'] = datetime.datetime.now()
326: if self.command not in ('HEAD'):
327: # file is created at temporary location and moved in place only when download completes
328: if not os.access(temp_name, os.R_OK):
329: empty_name = config['dir'] + os.sep + '.tmp'
330: with open(empty_name, 'w+b') as some_file:
331: pass
332: os.renames(empty_name, temp_name)
333: temp_file = open(temp_name, 'r+b')
334: if requested_ranges == None and needed == None:
335: needed = new_record['_parts']
336: needed.rewind()
337: while True:
338: # XXX can make this implicit - one request per range
339: (start, end) = needed.pop()
340: if start == None:
341: break
342: stream_last = start
343: old_record = copy.copy(new_record)
344: if end - start < block_size:
345: req_block_size = end - start
346: else:
347: req_block_size = block_size
348: buffer = source.read(req_block_size)
349: length = len(buffer)
350: while length > 0 and stream_last < end:
351: stream_pos = stream_last + length
352: assert stream_pos <= end, 'Received more data than requested: pos:{} start:{} end:{}.'.format(stream_pos, start, end)
353: temp_file.seek(stream_last)
354: temp_file.write(buffer)
355: x = new_record['_parts'] - spacemap.SpaceMap({stream_last: stream_pos})
356: new_record['_parts'] = new_record['_parts'] - spacemap.SpaceMap({stream_last: stream_pos})
357: index[my_path] = old_record
358: index.sync()
359: old_record = copy.copy(new_record)
360: stream_last = stream_pos
361: if end - stream_last < block_size:
362: req_block_size = end - stream_last
363: buffer = source.read(req_block_size)
364: length = len(buffer)
365: # moving downloaded data to real file
366: temp_file.close()
367:
368: index[my_path] = new_record
369: index.sync()
370:
371: except urllib2.HTTPError as error:
372: # in case of an error we don't actually need to do anything:
373: # if the file download stalls or fails, the file is simply not moved to its final location
374: print(error)
375:
376: print(index[my_path])
377:
378: if not os.access(file_name, os.R_OK) and os.access(temp_name, os.R_OK) and '_parts' in index[my_path] and index[my_path]['_parts'] == spacemap.SpaceMap():
379: # just moving
380: # drop old dirs XXX
381: print('Moving temporary file to new destination.')
382: os.renames(temp_name, file_name)
383:
384: if not my_path in index:
385: self.send_response(502)
386: self.end_headers()
387: return
388:
389: if self.command == 'HEAD':
390: self.send_response(200)
391: if 'content-length' in index[my_path]:
392: self.send_header('content-length', index[my_path]['content-length'])
393: self.send_header('accept-ranges', 'bytes')
394: self.send_header('content-type', 'application/octet-stream')
395: if 'last-modified' in index[my_path]:
396: self.send_header('last-modified', index[my_path]['last-modified'])
397: self.end_headers()
398: else:
399: if ('_parts' in index[my_path] and index[my_path]['_parts'] != spacemap.SpaceMap()) or not os.access(file_name, os.R_OK):
400: file_name = temp_name
401:
402: with open(file_name, 'rb') as real_file:
403: file_stat = os.stat(file_name)
404: if 'range' in self.headers:
405: self.send_response(206)
406: ranges = ()
407: requested_ranges.rewind()
408: while True:
409: pair = requested_ranges.pop()
410: if pair[0] == None:
411: break
412: ranges += '{}-{}'.format(pair[0], str(pair[1] - 1)),
413: self.send_header('content-range', 'bytes {}/{}'.format(','.join(ranges), index[my_path]['content-length']))
414: else:
415: self.send_response(200)
416: self.send_header('content-length', str(file_stat.st_size))
417: requested_ranges = spacemap.SpaceMap({0: file_stat.st_size})
418: if 'last-modified' in index[my_path]:
419: self.send_header('last-modified', index[my_path]['last-modified'])
420: self.send_header('content-type', 'application/octet-stream')
421: self.end_headers()
422: if self.command in ('GET'):
423: if len(requested_ranges) > 0:
424: requested_ranges.rewind()
425: (start, end) = requested_ranges.pop()
426: else:
427: start = 0
428: # XXX ugly hack
429: if 'content-length' in index[my_path]:
430: end = index[my_path]['content-length']
431: else:
432: end = 0
433: real_file.seek(start)
434: if block_size > end - start:
435: req_block_size = end - start
436: else:
437: req_block_size = block_size
438: buffer = real_file.read(req_block_size)
439: length = len(buffer)
440: while length > 0:
441: self.wfile.write(buffer)
442: start += len(buffer)
443: if req_block_size > end - start:
444: req_block_size = end - start
445: if req_block_size == 0:
446: break
447: buffer = real_file.read(req_block_size)
448: length = len(buffer)
449:
450: def do_HEAD(self):
451: return self.__process()
452: def do_GET(self):
453: return self.__process()
454:
455: config.section('general')
456: server = BaseHTTPServer.HTTPServer(('127.0.0.1', int(config['port'])), MyRequestHandler)
457: server.serve_forever()
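
The proxy reads its configuration from the file given with -c (default samesite.conf), listens on 127.0.0.1 at the configured port, and chooses the config section from the request's Host header. A rough way to exercise it, assuming the hypothetical mirror.example.org section sketched earlier and that urllib2 passes an explicit Host header through unchanged:

import urllib2

# hypothetical host and path, matching the sample config section sketched above
req = urllib2.Request('http://127.0.0.1:8008/some/file.tar.gz',
                      headers = {'Host': 'mirror.example.org'})
reply = urllib2.urlopen(req)
print(reply.info())  # headers as served from the local cache or from upstream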