Lines of
samesite.py
from check-in e7b837a681
that are changed by the sequence of edits moving toward
check-in b0975a28fb:
1: #!/usr/bin/env python3.1
2:
3: import datetime, http.cookiejar, os, sys, shelve, spacemap, re, urllib.request
4:
5: class Config:
6: __slots__ = frozenset(['_config', '_default', '_section', 'options', 'root'])
7: _default = {
8: 'general': {
9: 'port': '8008',
10: },
11: '_other': {
12: 'verbose': 'no',
13: 'noetag': 'no',
14: 'noparts': 'no',
15: },}
16:
17: # function to read in config file
18: def __init__(self):
19: import configparser, optparse
20:
21: parser = optparse.OptionParser()
22: parser.add_option('-c', '--config', dest = 'config', help = 'config file location', metavar = 'FILE', default = 'samesite.conf')
23: (self.options, args) = parser.parse_args()
24:
25: assert os.access(self.options.config, os.R_OK), "Fatal error: can't read {}".format(self.options.config)
26:
27: configDir = re.compile('^(.*)/[^/]+$').match(self.options.config)
28: if configDir:
29: self.root = configDir.group(1)
30: else:
31: self.root = os.getcwd()
32:
33: self._config = configparser.ConfigParser()
34: self._config.readfp(open(self.options.config))
35:
36: for section in self._config.sections():
37: if section != 'general':
38: if self._config.has_option(section, 'dir'):
39: if re.compile('^/$').match(self._config.get(section, 'dir')):
40: self._config.set(section, 'dir', self.root + os.sep + section)
41: thisDir = re.compile('^(.*)/$').match(self._config.get(section, 'dir'))
42: if thisDir:
43: self._config.set(section, 'dir', thisDir.group(1))
44: if not re.compile('^/(.*)$').match(self._config.get(section, 'dir')):
45: self._config.set(section, 'dir', self.root + os.sep + self._config.get(section, 'dir'))
46: else:
47: self._config.set(section, 'dir', self.root + os.sep + section)
48:
49: if not self._config.has_option(section, 'root'):
50: self._config.set(section, 'root', section)
51:
52: # function to select config file section or create one
53: def section(self, section):
54: if not self._config.has_section(section):
55: self._config.add_section(section)
56: self._section = section
57:
58: # function to get config parameter, if parameter doesn't exists the default
59: # value or None is substituted
60: def __getitem__(self, name):
61: if not self._config.has_option(self._section, name):
62: if self._section in self._default:
63: if name in self._default[self._section]:
64: self._config.set(self._section, name, self._default[self._section][name])
65: else:
66: self._config.set(self._section, name, None)
67: elif name in self._default['_other']:
68: self._config.set(self._section, name, self._default['_other'][name])
69: else:
70: self._config.set(self._section, name, None)
71: return(self._config.get(self._section, name))
72:
# module-wide configuration, built once at import time
# NOTE: this parses sys.argv and reads the config file, so importing this
# module has side effects
config = Config()

#assert options.port or os.access(options.log, os.R_OK), 'Log file unreadable'

# headers that describe the cached entity and are recorded in the index
const_desc_fields = set(['Content-Length', 'Pragma', 'Last-Modified'])
# headers that are recognised but deliberately not recorded
const_ignore_fields = set(['Accept-Ranges', 'Age', 'Cache-Control', 'Connection', 'Content-Type', 'Date', 'Expires', 'Server', 'Via', 'X-Cache', 'X-Cache-Lookup', 'X-Powered-By'])

# chunk size in bytes for all streaming reads/writes
block_size = 4096
81:
# NOTE(review): the triple-quoted string below is commented-out prototype
# code (a kqueue-based event loop); it is never executed and references an
# 'options' name that no longer exists at module level.
'''
# later, kqueue would be good but later
class Connection:
	__slots__ = frozenset(('__address', '__input', '__socket', '__status', 'error', 'method', 'url', 'http_version'))

	def __init__(self, socket, address):
		self.__address = address
		self.__input = b''
		self.__socket = socket
		self.__status = 0

	def read(self, kev):
		buffer = self.__socket.recv(kev.data)
		exhausted = False
		if len(buffer) == 0:
			eof = True
		else:
			self.__input += buffer
		while not exhausted:
			if self.__status == -1:
				exhausted = True
			elif self.__status == 0:
				endstring = self.__input.find(b'\n')
				if endstring > 0:
					print('Processing request line.')
					line = self.__input[:endstring].decode('ascii')
					self.__input = self.__input[endstring + 1:]
					isRequest = re.compile('(GET) ([^ ]+) HTTP/(1\.0)').match(line)
					if not isRequest:
						self.error = 'Not a HTTP connection.'
						self.__status = -1
					else:
						self.method = isRequest.group(1)
						self.url = isRequest.group(2)
						self.http_version = isRequest.group(3)
						self.__status = 1
				else:
					exhausted = True
			elif self.__status == 1:
				endstring = self.__input.find(b'\n')
				if endstring > 0:
					print('Processing header line.' + repr(self.__input))
					line = self.__input[:endstring].decode('ascii')
					self.__input = self.__input[endstring + 1:]
					isHeader = re.compile('([^:]*): +(.*)').match(line)
					if not isHeader:
						self.error = 'Bad header.'
						return(False)
					# process header here
				elif endstring == 0:
					self.__status = 2
				else:
					exhausted = True

	def write(self, kev):
		pass

if options.port:
	import select, socket

	sock = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
	try:
		sock.bind(('127.0.0.1', int(options.port)))
		sock.listen(-1)

		kq = select.kqueue()
		assert kq.fileno() != -1, "Fatal error: can't initialise kqueue."

		kq.control([select.kevent(sock, select.KQ_FILTER_READ, select.KQ_EV_ADD)], 0)
		timeout = None

		connections = {sock.fileno(): None}

		while True:
			kevs = kq.control(None, 1, timeout)

			for kev in kevs:
				if type(connections[kev.ident]) == Connection:
					print(kev.ident, kev.data, kev.filter, kev.flags)
					assert kev.data != 0, 'No data available.'
					if kev.filter == select.KQ_FILTER_READ:
						connections[kev.ident].read(kev)
					elif kev.filter == select.KQ_FILTER_WRITE:
						connections[kev.ident].write(kev)
					else:
						assert kev.filter in (select.KQ_FILTER_READ, select.KQ_FILTER_WRITE), 'Do we support other filters?'
				else:
					(conn, addr) = sock.accept()
					print('Connection from ' + repr(addr))
					kq.control([select.kevent(conn, select.KQ_FILTER_READ, select.KQ_EV_ADD)], 0)
					connections[conn.fileno()] = Connection(conn, addr)

				if kev.flags >> 15 == 1:
					kq.control([select.kevent(kev.ident, select.KQ_FILTER_READ, select.KQ_EV_DELETE)], 0)
					kq.control([select.kevent(kev.ident, select.KQ_FILTER_WRITE, select.KQ_EV_DELETE)], 0)
					del(connections[kev.ident])
	finally:
		sock.close()
'''
181:
182: # XXX how about rechecking files?
183: if True:
184: import http.server
185:
    class MyRequestHandler(http.server.BaseHTTPRequestHandler):
        """Caching proxy handler: serves GET/HEAD from a local per-host cache
        directory, fetching missing data (whole files or byte ranges) from the
        upstream server named by the section's 'root' option."""

        def __process(self):
            """Common implementation behind do_GET/do_HEAD.

            Consults a shelve index in the section's cache dir, decides whether
            the cached copy must be (re)fetched, downloads missing ranges into a
            '.parts' temp file, then streams the requested bytes to the client.
            """
            # reload means file needs to be reloaded to serve request
            reload = False
            # recheck means file needs to be checked, this also means that if file hav been modified we can serve older copy
            recheck = False
            # file_stat means file definitely exists
            file_stat = None
            # requested_ranges holds data about any range requested
            requested_ranges = None
            # records holds data from index locally, should be written back upon successfull completion
            record = None
            info = 'Checking file: ' + self.path

            # strip any query string from the request path
            myPath = re.compile('^(.*?)(\?.*)$').match(self.path)
            if myPath:
                my_path = myPath.group(1)
            else:
                my_path = self.path

            # configuration section is selected per requested Host header
            config.section(self.headers['Host'])

            if not os.access(config['dir'], os.X_OK):
                os.mkdir(config['dir'])
            # this is file index - everything is stored in this file
            # _parts - list of stored parts of file
            # _time - last time the file was checked
            # everything else is just the headers
            # NOTE(review): opened without writeback=True and never closed;
            # in-place mutation of stored values does not persist — confirm.
            index = shelve.open(config['dir'] + os.sep + '.index')

            desc_fields = const_desc_fields.copy()
            ignore_fields = const_ignore_fields.copy()
            # NOTE(review): config values are strings ('no'/'yes'); the
            # non-empty string 'no' is truthy, so `not config['noetag']` is
            # always False and ETag always lands in ignore_fields — looks
            # inverted; confirm intended yes/no parsing.
            if not config['noetag']:
                desc_fields.add('ETag')
            else:
                ignore_fields.add('ETag')

            # request headers we silently drop instead of forwarding
            proxy_ignored = ('Accept', 'Accept-Encoding',
                'Cache-Control', 'Connection',
                'Host',
                'If-Modified-Since', 'If-Unmodified-Since',
                'User-Agent',
                'Via',
                'X-Forwarded-For',
            )

            print('===============[ {} request ]==='.format(self.command))

            for header in self.headers:
                if header in proxy_ignored:
                    pass
                # NOTE(review): ('Range') is just the string 'Range', so this
                # is a substring test, not tuple membership — e.g. header
                # 'Ran' would also match; likely meant ('Range',).
                elif header in ('Range'):
                    isRange = re.compile('bytes=(\d+)-(\d+)').match(self.headers[header])
                    if isRange:
                        # SpaceMap end bound is exclusive, hence the +1
                        requested_ranges = spacemap.SpaceMap({int(isRange.group(1)): int(isRange.group(2)) + 1})
                    else:
                        # unsupported Range syntax: drop the request
                        return()
                # NOTE(review): same substring-vs-tuple issue as above.
                elif header in ('Pragma'):
                    if my_path in index:
                        # NOTE(review): mutates the shelve value in place —
                        # without writeback=True this is not persisted.
                        index[my_path][header] = self.headers[header]
                else:
                    print('Unknown header - ', header, ': ', self.headers[header], sep='')
                    return()
                print(header, self.headers[header])

            # creating empty placeholder in index
            # if there's no space map and there's no file in real directory - we have no file
            # if there's an empty space map - file is full
            # space map generally covers every bit of file we don't posess currently
            if not my_path in index:
                info += '\nThis one is new.'
                reload = True
                record = {}
            else:
                record = index[my_path]

            if not '_parts' in record:
                record['_parts'] = None

            # creating file name from my_path
            file_name = config['dir'] + os.sep + re.compile('%20').sub(' ', my_path)
            # partial file or unfinished download
            temp_name = config['dir'] + os.sep + '.parts' + re.compile('%20').sub(' ', my_path)

            # forcibly checking file if no file present
            if os.access(file_name, os.R_OK):
                file_stat = os.stat(file_name)
            elif '_parts' in record and os.access(temp_name, os.R_OK):
                file_stat = os.stat(temp_name)
            elif not reload:
                print(record)
                info += '\nFile not found or inaccessible.'
                record['_parts'] = None
                reload = True

            # forcibly checking file if file size doesn't match with index data
            if not reload:
                if '_parts' in record and record['_parts'] == spacemap.SpaceMap():
                    if 'Content-Length' in record and file_stat and file_stat.st_size != int(record['Content-Length']):
                        info += '\nFile size is {} and stored file size is {}.'.format(file_stat.st_size, record['Content-Length'])
                        reload = True

            # forcibly checking file if index holds Pragma header
            if not reload and 'Pragma' in record and record['Pragma'] == 'no-cache':
                info +='\nPragma on: recheck imminent.'
                recheck = True

            # skipping file processing if there's no need to recheck it and we have checked it at least 4 hours ago
            # NOTE(review): the condition is true when _time is *newer* than
            # now-4h (recently checked), yet it sets recheck=True, which
            # triggers a fetch below — this seems to contradict the comment;
            # confirm intended polarity.
            if not recheck and not reload and '_time' in record and (datetime.datetime.now() - datetime.timedelta(hours = 4) - record['_time']).days < 0:
                recheck = True

            print(info)
            if reload or recheck:

                try:
                    request = 'http://' + config['root'] + my_path
                    needed = None
                    # XXX and if we specify full file we don't go partial?
                    if requested_ranges != None:
                        if '_parts' in record and record['_parts'] != None:
                            # fetch only the intersection of what is missing
                            # and what the client asked for
                            needed = record['_parts'] & requested_ranges
                        # NOTE(review): 'no' is truthy (see noetag note); also
                        # record['_parts'] can be None here — len(None) below
                        # would raise; confirm.
                        elif config['noparts']:
                            needed = record['_parts']
                        else:
                            needed = requested_ranges
                        ranges = ()
                        print('Missing ranges: {}, requested ranges: {}, needed ranges: {}.'.format(record['_parts'], requested_ranges, needed))
                        if len(needed) > 0:
                            needed.rewind()
                            while True:
                                range = needed.pop()
                                if range[0] == None:
                                    break
                                # HTTP ranges are inclusive, SpaceMap ends are
                                # exclusive, hence the -1
                                ranges += '{}-{}'.format(range[0], range[1] - 1),
                            request = urllib.request.Request(request, headers = {'Range': 'bytes=' + ','.join(ranges)})

                    with urllib.request.urlopen(request) as source:
                        new_record = {}
                        new_record['_parts'] = record['_parts']
                        headers = source.info()

                        # stripping unneeded headers (XXX make this inplace?)
                        for header in headers:
                            if header in desc_fields:
                                #if header == 'Pragma' and headers[header] != 'no-cache':
                                if header == 'Content-Length':
                                    # for a 206 the real total comes from
                                    # Content-Range instead
                                    if 'Content-Range' not in headers:
                                        new_record[header] = int(headers[header])
                                else:
                                    new_record[header] = headers[header]
                            elif header == 'Content-Range':
                                range = re.compile('^bytes (\d+)-(\d+)/(\d+)$').match(headers[header])
                                if range:
                                    new_record['Content-Length'] = int(range.group(3))
                                else:
                                    assert False, 'Content-Range unrecognized.'
                            elif not header in ignore_fields:
                                print('Undefined header "', header, '": ', headers[header], sep='')

                        # comparing headers with data found in index
                        # if any header has changed (except Pragma) file is fully downloaded
                        # same if we get more or less headers
                        old_keys = set(record.keys())
                        old_keys.discard('_time')
                        old_keys.discard('Pragma')
                        more_keys = set(new_record.keys()) - old_keys
                        more_keys.discard('Pragma')
                        less_keys = old_keys - set(new_record.keys())
                        if len(more_keys) > 0:
                            if not len(old_keys) == 0:
                                print('More headers appear:', more_keys)
                            reload = True
                        elif len(less_keys) > 0:
                            # NOTE(review): fewer headers does not set reload —
                            # possibly intentional, confirm.
                            print('Less headers appear:', less_keys)
                        else:
                            for key in record.keys():
                                if key[0] != '_' and key != 'Pragma' and not record[key] == new_record[key]:
                                    print('Header "', key, '" changed from [', record[key], '] to [', new_record[key], ']', sep='')
                                    print(type(record[key]), type(new_record[key]))
                                    reload = True

                        if reload:
                            print('Reloading.')
                            if os.access(temp_name, os.R_OK):
                                os.unlink(temp_name)
                            if os.access(file_name, os.R_OK):
                                os.unlink(file_name)
                        if new_record['_parts'] == None or reload:
                            # nothing cached yet: the whole file is missing
                            new_record['_parts'] = spacemap.SpaceMap({0: int(new_record['Content-Length'])})
                        print(new_record)

                        # downloading file or segment
                        if 'Content-Length' in new_record:
                            if needed == None:
                                needed = new_record['_parts']
                            else:
                                if len(needed) > 1:
                                    print("Multipart requests currently not supported.")
                                    assert False, 'Skip this one for now.'
                        else:
                            assert False, 'No Content-Length or Content-Range header.'

                        new_record['_time'] = datetime.datetime.now()
                        # NOTE(review): ('HEAD') is a substring test, not a
                        # tuple — works for exact 'HEAD' but see earlier note.
                        if self.command not in ('HEAD'):
                            # file is created at temporary location and moved in place only when download completes
                            if not os.access(temp_name, os.R_OK):
                                empty_name = config['dir'] + os.sep + '.tmp'
                                with open(empty_name, 'w+b') as some_file:
                                    pass
                                os.renames(empty_name, temp_name)
                            temp_file = open(temp_name, 'r+b')
                            needed.rewind()
                            while True:
                                (start, end) = needed.pop()
                                if start == None:
                                    break
                                stream_last = start
                                # NOTE(review): old_record aliases new_record
                                # (same dict object), so the index writes below
                                # persist the *updated* map each iteration —
                                # possibly a copy was intended; confirm.
                                old_record = new_record
                                if end - start < block_size:
                                    req_block_size = end - start
                                else:
                                    req_block_size = block_size
                                buffer = source.read(req_block_size)
                                length = len(buffer)
                                while length > 0 and stream_last < end:
                                    stream_pos = stream_last + length
                                    assert not stream_pos > end, 'Received more data then requested: pos:{} start:{} end:{}.'.format(stream_pos, start, end)
                                    temp_file.seek(stream_last)
                                    temp_file.write(buffer)
                                    # mark the bytes just written as no longer missing
                                    new_record['_parts'] = new_record['_parts'] - spacemap.SpaceMap({stream_last: stream_pos})
                                    index[my_path] = old_record
                                    index.sync()
                                    old_record = new_record
                                    stream_last = stream_pos
                                    if end - stream_last < block_size:
                                        req_block_size = end - stream_last
                                    buffer = source.read(req_block_size)
                                    length = len(buffer)
                            # moving downloaded data to real file
                            temp_file.close()

                    print(new_record)
                    index[my_path] = new_record
                    index.sync()

                except urllib.error.HTTPError as error:
                    # in case of error we don't need to do anything actually,
                    # if file download stalls or fails the file would not be moved to it's location
                    print(error)

                # an empty SpaceMap means no bytes are missing: promote the
                # temp file to its final location
                if '_parts' in index[my_path] and index[my_path]['_parts'] == spacemap.SpaceMap():
                    # just moving
                    # drop old dirs XXX
                    print('Moving temporary file to new destination.')
                    os.renames(temp_name, file_name)

            if self.command == 'HEAD':
                self.send_response(200)
                if 'Content-Length' in index[my_path]:
                    self.send_header('Content-Length', index[my_path]['Content-Length'])
                self.send_header('Accept-Ranges', 'bytes')
                self.send_header('Content-Type', 'application/octet-stream')
                if 'Last-Modified' in index[my_path]:
                    self.send_header('Last-Modified', index[my_path]['Last-Modified'])
                self.end_headers()
            else:
                # serve from the partial file when the download is incomplete
                # or the final file is not in place yet
                if ('_parts' in index[my_path] and index[my_path]['_parts'] != spacemap.SpaceMap()) or not os.access(file_name, os.R_OK):
                    file_name = temp_name

                with open(file_name, 'rb') as real_file:
                    file_stat = os.stat(file_name)
                    if 'Range' in self.headers:
                        self.send_response(206)
                        ranges = ()
                        requested_ranges.rewind()
                        while True:
                            pair = requested_ranges.pop()
                            if pair[0] == None:
                                break
                            ranges += '{}-{}'.format(pair[0], str(pair[1] - 1)),
                        # NOTE(review): a comma-joined range list in a single
                        # Content-Range header is not standard HTTP — confirm
                        # clients tolerate this.
                        self.send_header('Content-Range', 'bytes {}/{}'.format(','.join(ranges), index[my_path]['Content-Length']))
                    else:
                        self.send_response(200)
                        self.send_header('Content-Length', str(file_stat.st_size))
                        requested_ranges = spacemap.SpaceMap({0: file_stat.st_size})
                    self.send_header('Last-Modified', index[my_path]['Last-Modified'])
                    self.send_header('Content-Type', 'application/octet-stream')
                    self.end_headers()
                    # NOTE(review): ('GET') substring test again (see above).
                    if self.command in ('GET'):
                        if len(requested_ranges) > 0:
                            requested_ranges.rewind()
                            (start, end) = requested_ranges.pop()
                        else:
                            start = 0
                            end = index[my_path]['Content-Length']
                        real_file.seek(start)
                        if block_size > end - start:
                            req_block_size = end - start
                        else:
                            req_block_size = block_size
                        buffer = real_file.read(req_block_size)
                        length = len(buffer)
                        while length > 0:
                            self.wfile.write(buffer)
                            start += len(buffer)
                            if req_block_size > end - start:
                                req_block_size = end - start
                            if req_block_size == 0:
                                break
                            buffer = real_file.read(req_block_size)
                            length = len(buffer)

        def do_HEAD(self):
            """Handle HEAD via the shared cache/proxy logic."""
            return self.__process()
        def do_GET(self):
            """Handle GET via the shared cache/proxy logic."""
            return self.__process()
502:
    # listen on loopback only, using the port from the 'general' section
    # (default 8008); serve_forever() blocks until the process is killed
    config.section('general')
    server = http.server.HTTPServer(('127.0.0.1', int(config['port'])), MyRequestHandler)
    server.serve_forever()
506:
# NOTE(review): this branch pairs with 'if True:' above and is therefore
# unreachable dead code. It also references names that are not defined in
# the current module ('options', 'index', 'desc_fields', 'ignore_fields') —
# it is the pre-proxy log-crawler mode kept for reference; confirm whether
# it should be deleted or resurrected behind a real option.
else:
    while True:
        unchecked_files = set()
        checked_files = 0

        # reading log and storing found urls for processing
        # check file mtime XXX
        with open(options.log, 'r') as log_file:
            # matches a combined-format access log line; group(2) is the path
            log_line = re.compile('^[^ ]+ - - \[.*] "(GET|HEAD) (.*?)(\?.*)? HTTP/1.1" (\d+) \d+ "(.*)" "(.*)"$')
            for line in log_file:
                this_line = log_line.match(line.strip())
                if this_line:
                    unchecked_files.add(this_line.group(2))

        for url in unchecked_files:
            reload = False
            recheck = False
            info = 'Checking file: ' + url

            # creating empty placeholder in index
            if not url in index:
                info += '\nThis one is new.'
                index[url] = {}
                reload = True

            # creating file name from url
            file_name = options.dir + re.compile('%20').sub(' ', url)

            # forcibly checking file if no file present
            if not reload and not os.access(file_name, os.R_OK):
                info += '\nFile not found or inaccessible.'
                reload = True

            # forcibly checking file if file size doesn't match with index data
            # NOTE(review): concatenating st_size (an int) into a string here
            # would raise TypeError if this branch ever ran.
            elif not reload and 'Content-Length' in index[url] and os.stat(file_name).st_size != int(index[url]['Content-Length']):
                info += '\nFile size is ' + os.stat(file_name).st_size + ' and stored file size is ' + index[url]['Content-Length'] + '.'
                reload = True

            # forcibly checking file if index hods Pragma header
            if not reload and 'Pragma' in index[url] and index[url]['Pragma'] == 'no-cache':
                info +='\nPragma on: recheck imminent.'
                recheck = True

            # skipping file processing if there's no need to recheck it and we have checked it at least 4 hours ago
            if not recheck and not reload and (options.noupdate or ('_time' in index[url] and (datetime.datetime.now() - datetime.timedelta(hours = 4) - index[url]['_time']).days < 0)):
                if options.verbose:
                    print(info)
                continue
            else:
                print(info)

            try:
                with urllib.request.urlopen(options.root + url) as source:
                    new_headers = {}
                    headers = source.info()

                    # stripping unneeded headers (XXX make this inplace?)
                    for header in headers:
                        if header in desc_fields:
                            if header == 'Pragma' and headers[header] != 'no-cache':
                                print('Pragma:', headers[header])
                            new_headers[header] = headers[header]
                        elif not header in ignore_fields:
                            print('Undefined header "', header, '": ', headers[header], sep='')

                    # comparing headers with data found in index
                    # if any header has changed (except Pragma) file is fully downloaded
                    # same if we get more or less headers
                    old_keys = set(index[url].keys())
                    old_keys.discard('_time')
                    old_keys.discard('Pragma')
                    more_keys = set(new_headers.keys()) - old_keys
                    more_keys.discard('Pragma')
                    less_keys = old_keys - set(new_headers.keys())
                    if len(more_keys) > 0:
                        if not len(old_keys) == 0:
                            print('More headers appear:', more_keys)
                        reload = True
                    elif len(less_keys) > 0:
                        print('Less headers appear:', less_keys)
                    else:
                        for key in index[url].keys():
                            if key[0] != '_' and key != 'Pragma' and not index[url][key] == new_headers[key]:
                                print('Header "', key, '" changed from [', index[url][key], '] to [', new_headers[key], ']', sep='')
                                reload = True

                    # downloading file
                    if reload:
                        if 'Content-Length' in headers:
                            print('Downloading', headers['Content-Length'], 'bytes [', end='')
                        else:
                            print('Downloading [', end='')
                        sys.stdout.flush()

                        # file is created at temporary location and moved in place only when download completes
                        temp_file = open(options.dir + os.sep + '.tmp', 'wb')
                        buffer = source.read(block_size)
                        megablocks = 0
                        blocks = 0
                        megs = 0
                        while len(buffer) > 0:
                            temp_file.write(buffer)
                            buffer = source.read(block_size)
                            blocks += 1
                            # crude progress meter: one dot per ~100KiB, a
                            # megabyte counter every ten dots
                            if blocks > 102400/block_size:
                                megablocks += 1
                                if megablocks > 10:
                                    megablocks = megablocks - 10
                                    megs += 1
                                    print('{}Mb'.format(megs), end='')
                                else:
                                    print('.', end='')
                                blocks = blocks - 102400/block_size
                            sys.stdout.flush()
                        temp_file.close()
                        print(']')
                        os.renames(options.dir + os.sep + '.tmp', file_name)

                        checked_files += 1

                    # storing new time mark and storing new headers
                    new_headers['_time'] = datetime.datetime.now()
                    index[url] = new_headers
                    index.sync()

            except urllib.error.HTTPError as error:
                # in case of error we don't need to do anything actually,
                # if file download stalls or fails the file would not be moved to it's location
                print(error)

        if options.verbose:
            print('[', len(unchecked_files), '/', checked_files, ']')

        # checking if there were any files downloaded, if yes - restarting sequence
        if checked_files == 0:
            break