Samesite - proxy that can cache partial transfers

Annotation For samesite.py
anonymous

Annotation For samesite.py

Lines of samesite.py from check-in c3db1a007e that are changed by the sequence of edits moving toward check-in 90160dbf50:

                         1: #!/usr/bin/env python3.1
                         2: 
                         3: import datetime, http.cookiejar, os, sys, shelve, spacemap, re, urllib.request
                         4: 
                         5: class Config:
                         6: 	__slots__ = frozenset(['_config', '_default', '_section', 'options', 'root'])
                         7: 	_default = {
                         8: 		'general': {
                         9: 			'port': '8008',
                        10: 		},
                        11: 		'_other': {
                        12: 			'verbose': 'no',
                        13: 			'noetag': 'no',
                        14: 			'noparts': 'no',
                        15: 			'strip': '',
                        16: 			'sub': '',
                        17: 	},}
                        18: 
                        19: 	# function to read in config file
                        20: 	def __init__(self):
                        21: 		import configparser, optparse
                        22: 
                        23: 		parser = optparse.OptionParser()
                        24: 		parser.add_option('-c', '--config', dest = 'config', help = 'config file location', metavar = 'FILE', default = 'samesite.conf')
                        25: 		(self.options, args) = parser.parse_args()
                        26: 
                        27: 		assert os.access(self.options.config, os.R_OK), "Fatal error: can't read {}".format(self.options.config)
                        28: 
                        29: 		configDir = re.compile('^(.*)/[^/]+$').match(self.options.config)
                        30: 		if configDir:
                        31: 			self.root = configDir.group(1)
                        32: 		else:
                        33: 			self.root = os.getcwd()
                        34: 
                        35: 		self._config = configparser.ConfigParser()
                        36: 		self._config.readfp(open(self.options.config))
                        37: 
                        38: 		for section in self._config.sections():
                        39: 			if section != 'general':
                        40: 				if self._config.has_option(section, 'dir'):
                        41: 					if re.compile('^/$').match(self._config.get(section, 'dir')):
                        42: 						self._config.set(section, 'dir', self.root + os.sep + section)
                        43: 					thisDir = re.compile('^(.*)/$').match(self._config.get(section, 'dir'))
                        44: 					if thisDir:
                        45: 						self._config.set(section, 'dir', thisDir.group(1))
                        46: 					if not re.compile('^/(.*)$').match(self._config.get(section, 'dir')):
                        47: 						self._config.set(section, 'dir', self.root + os.sep + self._config.get(section, 'dir'))
                        48: 				else:
                        49: 					self._config.set(section, 'dir', self.root + os.sep + section)
                        50: 
                        51: 				if not self._config.has_option(section, 'root'):
                        52: 					self._config.set(section, 'root', section)
                        53: 
                        54: 	# function to select config file section or create one
                        55: 	def section(self, section):
                        56: 		if not self._config.has_section(section):
                        57: 			self._config.add_section(section)
                        58: 		self._section = section
                        59: 
                        60: 	# function to get a config parameter; if the parameter doesn't exist, the default
                        61: 	# value or None is substituted
                        62: 	def __getitem__(self, name):
                        63: 		if not self._config.has_option(self._section, name):
                        64: 			if self._section in self._default:
                        65: 				if name in self._default[self._section]:
                        66: 					self._config.set(self._section, name, self._default[self._section][name])
                        67: 				else:
                        68: 					self._config.set(self._section, name, None)
                        69: 			elif name in self._default['_other']:
                        70: 				self._config.set(self._section, name, self._default['_other'][name])
                        71: 			else:
                        72: 				self._config.set(self._section, name, None)
                        73: 		return(self._config.get(self._section, name))
                        74: 
                        75: config = Config()
                        76: 
                        77: #assert options.port or os.access(options.log, os.R_OK), 'Log file unreadable'
                        78: 
                        79: const_desc_fields = set(['Content-Length', 'Last-Modified', 'Pragma'])
                        80: const_ignore_fields = set([
                        81: 	'Accept-Ranges', 'Age',
                        82: 	'Cache-Control', 'Connection', 'Content-Type',
                        83: 	'Date',
                        84: 	'Expires',
                        85: 	'Referer',
                        86: 	'Server',
                        87: 	'Via',
                        88: 	'X-Cache', 'X-Cache-Lookup', 'X-Powered-By'
                        89: ])
                        90: 
                        91: block_size = 4096
                        92: 
c3db1a007e 2010-09-16   93: '''
c3db1a007e 2010-09-16   94: # later, kqueue would be good but later
c3db1a007e 2010-09-16   95: class Connection:
c3db1a007e 2010-09-16   96: 	__slots__ = frozenset(('__address', '__input', '__socket', '__status', 'error', 'method', 'url', 'http_version'))
c3db1a007e 2010-09-16   97: 
c3db1a007e 2010-09-16   98: 	def __init__(self, socket, address):
c3db1a007e 2010-09-16   99: 		self.__address = address
c3db1a007e 2010-09-16  100: 		self.__input = b''
c3db1a007e 2010-09-16  101: 		self.__socket = socket
c3db1a007e 2010-09-16  102: 		self.__status = 0
c3db1a007e 2010-09-16  103: 
c3db1a007e 2010-09-16  104: 	def read(self, kev):
c3db1a007e 2010-09-16  105: 		buffer = self.__socket.recv(kev.data)
c3db1a007e 2010-09-16  106: 		exhausted = False
c3db1a007e 2010-09-16  107: 		if len(buffer) == 0:
c3db1a007e 2010-09-16  108: 			eof = True
c3db1a007e 2010-09-16  109: 		else:
c3db1a007e 2010-09-16  110: 			self.__input += buffer
c3db1a007e 2010-09-16  111: 			while not exhausted:
c3db1a007e 2010-09-16  112: 				if self.__status == -1:
c3db1a007e 2010-09-16  113: 					exhausted = True
c3db1a007e 2010-09-16  114: 				elif self.__status == 0:
c3db1a007e 2010-09-16  115: 					endstring = self.__input.find(b'\n')
c3db1a007e 2010-09-16  116: 					if endstring > 0:
c3db1a007e 2010-09-16  117: 						print('Processing request line.')
c3db1a007e 2010-09-16  118: 						line = self.__input[:endstring].decode('ascii')
c3db1a007e 2010-09-16  119: 						self.__input = self.__input[endstring + 1:]
c3db1a007e 2010-09-16  120: 						isRequest = re.compile('(GET) ([^ ]+) HTTP/(1\.0)').match(line)
c3db1a007e 2010-09-16  121: 						if not isRequest:
c3db1a007e 2010-09-16  122: 							self.error = 'Not a HTTP connection.'
c3db1a007e 2010-09-16  123: 							self.__status = -1
c3db1a007e 2010-09-16  124: 						else:
c3db1a007e 2010-09-16  125: 							self.method = isRequest.group(1)
c3db1a007e 2010-09-16  126: 							self.url = isRequest.group(2)
c3db1a007e 2010-09-16  127: 							self.http_version = isRequest.group(3)
c3db1a007e 2010-09-16  128: 							self.__status = 1
c3db1a007e 2010-09-16  129: 					else:
c3db1a007e 2010-09-16  130: 						exhausted = True
c3db1a007e 2010-09-16  131: 				elif self.__status == 1:
c3db1a007e 2010-09-16  132: 					endstring = self.__input.find(b'\n')
c3db1a007e 2010-09-16  133: 					if endstring > 0:
c3db1a007e 2010-09-16  134: 						print('Processing header line.' + repr(self.__input))
c3db1a007e 2010-09-16  135: 						line = self.__input[:endstring].decode('ascii')
c3db1a007e 2010-09-16  136: 						self.__input = self.__input[endstring + 1:]
c3db1a007e 2010-09-16  137: 						isHeader = re.compile('([^:]*): +(.*)').match(line)
c3db1a007e 2010-09-16  138: 						if not isHeader:
c3db1a007e 2010-09-16  139: 							self.error = 'Bad header.'
c3db1a007e 2010-09-16  140: 							return(False)
c3db1a007e 2010-09-16  141: 						# process header here
c3db1a007e 2010-09-16  142: 					elif endstring == 0:
c3db1a007e 2010-09-16  143: 						self.__status = 2
c3db1a007e 2010-09-16  144: 					else:
c3db1a007e 2010-09-16  145: 						exhausted = True
c3db1a007e 2010-09-16  146: 
c3db1a007e 2010-09-16  147: 	def write(self, kev):
c3db1a007e 2010-09-16  148: 		pass
c3db1a007e 2010-09-16  149: 
c3db1a007e 2010-09-16  150: if options.port:
c3db1a007e 2010-09-16  151: 	import select, socket
c3db1a007e 2010-09-16  152: 
c3db1a007e 2010-09-16  153: 	sock = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
c3db1a007e 2010-09-16  154: 	try:
c3db1a007e 2010-09-16  155: 		sock.bind(('127.0.0.1', int(options.port)))
c3db1a007e 2010-09-16  156: 		sock.listen(-1)
c3db1a007e 2010-09-16  157: 
c3db1a007e 2010-09-16  158: 		kq = select.kqueue()
c3db1a007e 2010-09-16  159: 		assert kq.fileno() != -1, "Fatal error: can't initialise kqueue."
c3db1a007e 2010-09-16  160: 
c3db1a007e 2010-09-16  161: 		kq.control([select.kevent(sock, select.KQ_FILTER_READ, select.KQ_EV_ADD)], 0)
c3db1a007e 2010-09-16  162: 		timeout = None
c3db1a007e 2010-09-16  163: 
c3db1a007e 2010-09-16  164: 		connections = {sock.fileno(): None}
c3db1a007e 2010-09-16  165: 
c3db1a007e 2010-09-16  166: 		while True:
c3db1a007e 2010-09-16  167: 			kevs = kq.control(None, 1, timeout)
c3db1a007e 2010-09-16  168: 
c3db1a007e 2010-09-16  169: 			for kev in kevs:
c3db1a007e 2010-09-16  170: 				if type(connections[kev.ident]) == Connection:
c3db1a007e 2010-09-16  171: 					print(kev.ident, kev.data, kev.filter, kev.flags)
c3db1a007e 2010-09-16  172: 					assert kev.data != 0, 'No data available.'
c3db1a007e 2010-09-16  173: 					if kev.filter == select.KQ_FILTER_READ:
c3db1a007e 2010-09-16  174: 						connections[kev.ident].read(kev)
c3db1a007e 2010-09-16  175: 					elif kev.filter == select.KQ_FILTER_WRITE:
c3db1a007e 2010-09-16  176: 						connections[kev.ident].write(kev)
c3db1a007e 2010-09-16  177: 					else:
c3db1a007e 2010-09-16  178: 						assert kev.filter in (select.KQ_FILTER_READ, select.KQ_FILTER_WRITE), 'Do we support other filters?'
c3db1a007e 2010-09-16  179: 				else:
c3db1a007e 2010-09-16  180: 					(conn, addr) = sock.accept()
c3db1a007e 2010-09-16  181: 					print('Connection from ' + repr(addr))
c3db1a007e 2010-09-16  182: 					kq.control([select.kevent(conn, select.KQ_FILTER_READ, select.KQ_EV_ADD)], 0)
c3db1a007e 2010-09-16  183: 					connections[conn.fileno()] = Connection(conn, addr)
c3db1a007e 2010-09-16  184: 
c3db1a007e 2010-09-16  185: 				if kev.flags >> 15 == 1:
c3db1a007e 2010-09-16  186: 					kq.control([select.kevent(kev.ident, select.KQ_FILTER_READ, select.KQ_EV_DELETE)], 0)
c3db1a007e 2010-09-16  187: 					kq.control([select.kevent(kev.ident, select.KQ_FILTER_WRITE, select.KQ_EV_DELETE)], 0)
c3db1a007e 2010-09-16  188: 					del(connections[kev.ident])
c3db1a007e 2010-09-16  189: 	finally:
c3db1a007e 2010-09-16  190: 		sock.close()
c3db1a007e 2010-09-16  191: '''
c3db1a007e 2010-09-16  192: 
c3db1a007e 2010-09-16  193: # XXX how about rechecking files?
c3db1a007e 2010-09-16  194: if True:
c3db1a007e 2010-09-16  195: 	import http.server
c3db1a007e 2010-09-16  196: 
c3db1a007e 2010-09-16  197: 	class MyRequestHandler(http.server.BaseHTTPRequestHandler):
c3db1a007e 2010-09-16  198: 		def __process(self):
c3db1a007e 2010-09-16  199: 			# reload means file needs to be reloaded to serve request
c3db1a007e 2010-09-16  200: 			reload = False
c3db1a007e 2010-09-16  201: 			# recheck means file needs to be checked, this also means that if file has been modified we can serve older copy
c3db1a007e 2010-09-16  202: 			recheck = False
c3db1a007e 2010-09-16  203: 			# file_stat means file definitely exists
c3db1a007e 2010-09-16  204: 			file_stat = None
c3db1a007e 2010-09-16  205: 			# requested_ranges holds data about any range requested
c3db1a007e 2010-09-16  206: 			requested_ranges = None
c3db1a007e 2010-09-16  207: 			# record holds data from index locally, should be written back upon successful completion
c3db1a007e 2010-09-16  208: 			record = None
c3db1a007e 2010-09-16  209: 
c3db1a007e 2010-09-16  210: 			myPath = re.compile('^(.*?)(\?.*)$').match(self.path)
c3db1a007e 2010-09-16  211: 			if myPath:
c3db1a007e 2010-09-16  212: 				my_path = myPath.group(1)
c3db1a007e 2010-09-16  213: 			else:
c3db1a007e 2010-09-16  214: 				my_path = self.path
c3db1a007e 2010-09-16  215: 
c3db1a007e 2010-09-16  216: 			config.section(self.headers['Host'])
c3db1a007e 2010-09-16  217: 
c3db1a007e 2010-09-16  218: 			if config['sub'] != None and config['strip'] != None and len(config['strip']) > 0:
c3db1a007e 2010-09-16  219: 				string = re.compile(config['strip']).sub(config['sub'], my_path)
c3db1a007e 2010-09-16  220: 				my_path = string
c3db1a007e 2010-09-16  221: 
c3db1a007e 2010-09-16  222: 			info = 'Checking file: ' + my_path
c3db1a007e 2010-09-16  223: 
c3db1a007e 2010-09-16  224: 			if not os.access(config['dir'], os.X_OK):
c3db1a007e 2010-09-16  225: 				os.mkdir(config['dir'])
c3db1a007e 2010-09-16  226: 			# this is file index - everything is stored in this file
c3db1a007e 2010-09-16  227: 			# _parts - list of stored parts of file
c3db1a007e 2010-09-16  228: 			# _time - last time the file was checked
c3db1a007e 2010-09-16  229: 			# everything else is just the headers
c3db1a007e 2010-09-16  230: 			index = shelve.open(config['dir'] + os.sep + '.index')
c3db1a007e 2010-09-16  231: 
c3db1a007e 2010-09-16  232: 			desc_fields = const_desc_fields.copy()
c3db1a007e 2010-09-16  233: 			ignore_fields = const_ignore_fields.copy()
c3db1a007e 2010-09-16  234: 			if config['noetag'] == 'no':
c3db1a007e 2010-09-16  235: 				desc_fields.add('ETag')
c3db1a007e 2010-09-16  236: 			else:
c3db1a007e 2010-09-16  237: 				ignore_fields.add('ETag')
c3db1a007e 2010-09-16  238: 
c3db1a007e 2010-09-16  239: 			proxy_ignored = set([
c3db1a007e 2010-09-16  240: 				'Accept', 'Accept-Charset', 'Accept-Encoding', 'Accept-Language',
c3db1a007e 2010-09-16  241: 				'Cache-Control', 'Connection', 'Content-Length', 'Cookie',
c3db1a007e 2010-09-16  242: 				'Host',
c3db1a007e 2010-09-16  243: 				'If-Modified-Since', 'If-Unmodified-Since',
c3db1a007e 2010-09-16  244: 				'Referer',
c3db1a007e 2010-09-16  245: 				'User-Agent',
c3db1a007e 2010-09-16  246: 				'Via',
c3db1a007e 2010-09-16  247: 				'X-Forwarded-For', 'X-REMOVED',
c3db1a007e 2010-09-16  248: 			])
c3db1a007e 2010-09-16  249: 
c3db1a007e 2010-09-16  250: 			print('===============[ {} request ]==='.format(self.command))
c3db1a007e 2010-09-16  251: 
c3db1a007e 2010-09-16  252: 			for header in self.headers:
c3db1a007e 2010-09-16  253: 				if header in proxy_ignored:
c3db1a007e 2010-09-16  254: 					pass
c3db1a007e 2010-09-16  255: 				elif header in ('Range'):
c3db1a007e 2010-09-16  256: 					isRange = re.compile('bytes=(\d+)-(\d+)').match(self.headers[header])
c3db1a007e 2010-09-16  257: 					if isRange:
c3db1a007e 2010-09-16  258: 						requested_ranges = spacemap.SpaceMap({int(isRange.group(1)): int(isRange.group(2)) + 1})
c3db1a007e 2010-09-16  259: 					else:
c3db1a007e 2010-09-16  260: 						return()
c3db1a007e 2010-09-16  261: 				elif header in ('Pragma'):
c3db1a007e 2010-09-16  262: 					if my_path in index:
c3db1a007e 2010-09-16  263: 						index[my_path][header] = self.headers[header]
c3db1a007e 2010-09-16  264: 				else:
c3db1a007e 2010-09-16  265: 					print('Unknown header - ', header, ': ', self.headers[header], sep='')
c3db1a007e 2010-09-16  266: 					return()
c3db1a007e 2010-09-16  267: 				print(header, self.headers[header])
c3db1a007e 2010-09-16  268: 
c3db1a007e 2010-09-16  269: 			# creating file name from my_path
c3db1a007e 2010-09-16  270: 			file_name = config['dir'] + os.sep + re.compile('%20').sub(' ', my_path)
c3db1a007e 2010-09-16  271: 			# partial file or unfinished download
c3db1a007e 2010-09-16  272: 			temp_name = config['dir'] + os.sep + '.parts' + re.compile('%20').sub(' ', my_path)
c3db1a007e 2010-09-16  273: 
c3db1a007e 2010-09-16  274: 			# creating empty placeholder in index
c3db1a007e 2010-09-16  275: 			# if there's no space map and there's no file in real directory - we have no file
c3db1a007e 2010-09-16  276: 			# if there's an empty space map - file is full
c3db1a007e 2010-09-16  277: 			# space map generally covers every bit of file we don't currently possess
c3db1a007e 2010-09-16  278: 			if not my_path in index:
c3db1a007e 2010-09-16  279: 				info += '\nThis one is new.'
c3db1a007e 2010-09-16  280: 				reload = True
c3db1a007e 2010-09-16  281: 				record = {}
c3db1a007e 2010-09-16  282: 			else:
c3db1a007e 2010-09-16  283: 				# forcibly checking file if no file present
c3db1a007e 2010-09-16  284: 				record = index[my_path]
c3db1a007e 2010-09-16  285: 				if os.access(file_name, os.R_OK):
c3db1a007e 2010-09-16  286: 					info += '\nFull file found.'
c3db1a007e 2010-09-16  287: 					file_stat = os.stat(file_name)
c3db1a007e 2010-09-16  288: 				elif '_parts' in index[my_path] and os.access(temp_name, os.R_OK):
c3db1a007e 2010-09-16  289: 					info += '\nPartial file found.'
c3db1a007e 2010-09-16  290: 					file_stat = os.stat(temp_name)
c3db1a007e 2010-09-16  291: 					recheck = True
c3db1a007e 2010-09-16  292: 				else:
c3db1a007e 2010-09-16  293: 					info += '\nFile not found or inaccessible.'
c3db1a007e 2010-09-16  294: 					record['_parts'] = None
c3db1a007e 2010-09-16  295: 					reload = True
c3db1a007e 2010-09-16  296: 
c3db1a007e 2010-09-16  297: 			if not '_parts' in record:
c3db1a007e 2010-09-16  298: 				record['_parts'] = None
c3db1a007e 2010-09-16  299: 
c3db1a007e 2010-09-16  300: 			if record['_parts'] == None:
c3db1a007e 2010-09-16  301: 				recheck = True
c3db1a007e 2010-09-16  302: 
c3db1a007e 2010-09-16  303: 			# forcibly checking file if file size doesn't match with index data
c3db1a007e 2010-09-16  304: 			if not reload:
c3db1a007e 2010-09-16  305: 				if '_parts' in record and record['_parts'] == spacemap.SpaceMap():
c3db1a007e 2010-09-16  306: 					if 'Content-Length' in record and file_stat and file_stat.st_size != int(record['Content-Length']):
c3db1a007e 2010-09-16  307: 						info += '\nFile size is {} and stored file size is {}.'.format(file_stat.st_size, record['Content-Length'])
c3db1a007e 2010-09-16  308: 						record['_parts'] = None
c3db1a007e 2010-09-16  309: 						reload = True
c3db1a007e 2010-09-16  310: 
c3db1a007e 2010-09-16  311: 			# forcibly checking file if index holds Pragma header
c3db1a007e 2010-09-16  312: 			if not reload and 'Pragma' in record and record['Pragma'] == 'no-cache':
c3db1a007e 2010-09-16  313: 				info +='\nPragma on: recheck imminent.'
c3db1a007e 2010-09-16  314: 				recheck = True
c3db1a007e 2010-09-16  315: 
c3db1a007e 2010-09-16  316: 			# intended: skip file processing when no recheck is needed and the file was checked less than 4 hours ago (NOTE(review): the condition below sets recheck = True in exactly that case - confirm)
c3db1a007e 2010-09-16  317: 			if not recheck and not reload and '_time' in record and (datetime.datetime.now() - datetime.timedelta(hours = 4) - record['_time']).days < 0:
c3db1a007e 2010-09-16  318: 				recheck = True
c3db1a007e 2010-09-16  319: 
c3db1a007e 2010-09-16  320: 			print(info)
c3db1a007e 2010-09-16  321: 			if reload or recheck:
c3db1a007e 2010-09-16  322: 
c3db1a007e 2010-09-16  323: 				try:
c3db1a007e 2010-09-16  324: 					request = 'http://' + config['root'] + self.path
c3db1a007e 2010-09-16  325: 					my_headers = {}
c3db1a007e 2010-09-16  326: 					for header in ('Cache-Control', 'Cookie', 'Referer', 'User-Agent'):
c3db1a007e 2010-09-16  327: 						if header in self.headers:
c3db1a007e 2010-09-16  328: 							my_headers[header] = self.headers[header]
c3db1a007e 2010-09-16  329: 
c3db1a007e 2010-09-16  330: 					needed = None
c3db1a007e 2010-09-16  331: 					if '_parts' in record and record['_parts'] != None:
c3db1a007e 2010-09-16  332: 						if config['noparts'] != 'no' or requested_ranges == None or requested_ranges == spacemap.SpaceMap():
c3db1a007e 2010-09-16  333: 							needed = record['_parts']
c3db1a007e 2010-09-16  334: 						else:
c3db1a007e 2010-09-16  335: 							needed = record['_parts'] | requested_ranges
c3db1a007e 2010-09-16  336: 					elif config['noparts'] =='no' and requested_ranges != None and requested_ranges != spacemap.SpaceMap():
c3db1a007e 2010-09-16  337: 						needed = requested_ranges
c3db1a007e 2010-09-16  338: 					ranges = ()
c3db1a007e 2010-09-16  339: 					print('Missing ranges: {}, requested ranges: {}, needed ranges: {}.'.format(record['_parts'], requested_ranges, needed))
c3db1a007e 2010-09-16  340: 					if needed != None and len(needed) > 0:
c3db1a007e 2010-09-16  341: 						needed.rewind()
c3db1a007e 2010-09-16  342: 						while True:
c3db1a007e 2010-09-16  343: 							range = needed.pop()
c3db1a007e 2010-09-16  344: 							if range[0] == None:
c3db1a007e 2010-09-16  345: 								break
c3db1a007e 2010-09-16  346: 							ranges += '{}-{}'.format(range[0], range[1] - 1),
c3db1a007e 2010-09-16  347: 						my_headers['Range'] = 'bytes=' + ','.join(ranges)
c3db1a007e 2010-09-16  348: 
c3db1a007e 2010-09-16  349: 					request = urllib.request.Request(request, headers = my_headers)
c3db1a007e 2010-09-16  350: 
c3db1a007e 2010-09-16  351: 					with urllib.request.urlopen(request) as source:
c3db1a007e 2010-09-16  352: 						new_record = {}
c3db1a007e 2010-09-16  353: 						new_record['_parts'] = record['_parts']
c3db1a007e 2010-09-16  354: 						headers = source.info()
c3db1a007e 2010-09-16  355: 
c3db1a007e 2010-09-16  356: 						# stripping unneeded headers (XXX make this inplace?)
c3db1a007e 2010-09-16  357: 						for header in headers:
c3db1a007e 2010-09-16  358: 							if header in desc_fields:
c3db1a007e 2010-09-16  359: 								#if header == 'Pragma' and headers[header] != 'no-cache':
c3db1a007e 2010-09-16  360: 								if header == 'Content-Length':
c3db1a007e 2010-09-16  361: 									if 'Content-Range' not in headers:
c3db1a007e 2010-09-16  362: 										new_record[header] = int(headers[header])
c3db1a007e 2010-09-16  363: 								else:
c3db1a007e 2010-09-16  364: 									new_record[header] = headers[header]
c3db1a007e 2010-09-16  365: 							elif header == 'Content-Range':
c3db1a007e 2010-09-16  366: 								range = re.compile('^bytes (\d+)-(\d+)/(\d+)$').match(headers[header])
c3db1a007e 2010-09-16  367: 								if range:
c3db1a007e 2010-09-16  368: 									new_record['Content-Length'] = int(range.group(3))
c3db1a007e 2010-09-16  369: 								else:	
c3db1a007e 2010-09-16  370: 									assert False, 'Content-Range unrecognized.'
c3db1a007e 2010-09-16  371: 							elif not header in ignore_fields:
c3db1a007e 2010-09-16  372: 								print('Undefined header "', header, '": ', headers[header], sep='')
c3db1a007e 2010-09-16  373: 
c3db1a007e 2010-09-16  374: 						# comparing headers with data found in index
c3db1a007e 2010-09-16  375: 						# if any header has changed (except Pragma) file is fully downloaded
c3db1a007e 2010-09-16  376: 						# same if we get more or less headers
c3db1a007e 2010-09-16  377: 						old_keys = set(record.keys())
c3db1a007e 2010-09-16  378: 						old_keys.discard('_time')
c3db1a007e 2010-09-16  379: 						old_keys.discard('Pragma')
c3db1a007e 2010-09-16  380: 						more_keys = set(new_record.keys()) - old_keys
c3db1a007e 2010-09-16  381: 						more_keys.discard('Pragma')
c3db1a007e 2010-09-16  382: 						less_keys = old_keys - set(new_record.keys())
c3db1a007e 2010-09-16  383: 						if len(more_keys) > 0:
c3db1a007e 2010-09-16  384: 							if not len(old_keys) == 0:
c3db1a007e 2010-09-16  385: 								print('More headers appear:', more_keys)
c3db1a007e 2010-09-16  386: 							reload = True
c3db1a007e 2010-09-16  387: 						elif len(less_keys) > 0:
c3db1a007e 2010-09-16  388: 							print('Less headers appear:', less_keys)
c3db1a007e 2010-09-16  389: 						else:
c3db1a007e 2010-09-16  390: 							for key in record.keys():
c3db1a007e 2010-09-16  391: 								if key[0] != '_' and key != 'Pragma' and not record[key] == new_record[key]:
c3db1a007e 2010-09-16  392: 									print('Header "', key, '" changed from [', record[key], '] to [', new_record[key], ']', sep='')
c3db1a007e 2010-09-16  393: 									print(type(record[key]), type(new_record[key]))
c3db1a007e 2010-09-16  394: 									reload = True
c3db1a007e 2010-09-16  395: 
c3db1a007e 2010-09-16  396: 						if reload:
c3db1a007e 2010-09-16  397: 							print('Reloading.')
c3db1a007e 2010-09-16  398: 							if os.access(temp_name, os.R_OK):
c3db1a007e 2010-09-16  399: 								os.unlink(temp_name)
c3db1a007e 2010-09-16  400: 							if os.access(file_name, os.R_OK):
c3db1a007e 2010-09-16  401: 								os.unlink(file_name)
c3db1a007e 2010-09-16  402: 							new_record['_parts'] = spacemap.SpaceMap({0: int(new_record['Content-Length'])})
c3db1a007e 2010-09-16  403: 						print(new_record)
c3db1a007e 2010-09-16  404: 
c3db1a007e 2010-09-16  405: 						# downloading file or segment
c3db1a007e 2010-09-16  406: 						if 'Content-Length' in new_record:
c3db1a007e 2010-09-16  407: 							if needed == None:
c3db1a007e 2010-09-16  408: 								needed = new_record['_parts']
c3db1a007e 2010-09-16  409: 							else:
c3db1a007e 2010-09-16  410: 								if len(needed) > 1:
c3db1a007e 2010-09-16  411: 									print("Multipart requests currently not supported.")
c3db1a007e 2010-09-16  412: 									assert False, 'Skip this one for now.'
c3db1a007e 2010-09-16  413: 						else:
c3db1a007e 2010-09-16  414: 							assert False, 'No Content-Length or Content-Range header.'
c3db1a007e 2010-09-16  415: 
c3db1a007e 2010-09-16  416: 						new_record['_time'] = datetime.datetime.now()
c3db1a007e 2010-09-16  417: 						if self.command not in ('HEAD'):
c3db1a007e 2010-09-16  418: 							# file is created at temporary location and moved in place only when download completes
c3db1a007e 2010-09-16  419: 							if not os.access(temp_name, os.R_OK):
c3db1a007e 2010-09-16  420: 								empty_name = config['dir'] + os.sep + '.tmp'
c3db1a007e 2010-09-16  421: 								with open(empty_name, 'w+b') as some_file:
c3db1a007e 2010-09-16  422: 									pass
c3db1a007e 2010-09-16  423: 								os.renames(empty_name, temp_name)
c3db1a007e 2010-09-16  424: 							temp_file = open(temp_name, 'r+b')
c3db1a007e 2010-09-16  425: 							if requested_ranges == None and needed == None:
c3db1a007e 2010-09-16  426: 								needed = new_record['_parts']
c3db1a007e 2010-09-16  427: 							needed.rewind()
c3db1a007e 2010-09-16  428: 							while True:
c3db1a007e 2010-09-16  429: 								(start, end) = needed.pop()
c3db1a007e 2010-09-16  430: 								if start == None:
c3db1a007e 2010-09-16  431: 									break
c3db1a007e 2010-09-16  432: 								stream_last = start
c3db1a007e 2010-09-16  433: 								old_record = new_record
c3db1a007e 2010-09-16  434: 								if end - start < block_size:
c3db1a007e 2010-09-16  435: 									req_block_size = end - start
c3db1a007e 2010-09-16  436: 								else:
c3db1a007e 2010-09-16  437: 									req_block_size = block_size
c3db1a007e 2010-09-16  438: 								buffer = source.read(req_block_size)
c3db1a007e 2010-09-16  439: 								length = len(buffer)
c3db1a007e 2010-09-16  440: 								while length > 0 and stream_last < end:
c3db1a007e 2010-09-16  441: 									stream_pos = stream_last + length
c3db1a007e 2010-09-16  442: 									assert not stream_pos > end, 'Received more data then requested: pos:{} start:{} end:{}.'.format(stream_pos, start, end)
c3db1a007e 2010-09-16  443: 									temp_file.seek(stream_last)
c3db1a007e 2010-09-16  444: 									temp_file.write(buffer)
c3db1a007e 2010-09-16  445: 									new_record['_parts'] = new_record['_parts'] - spacemap.SpaceMap({stream_last: stream_pos})
c3db1a007e 2010-09-16  446: 									index[my_path] = old_record
c3db1a007e 2010-09-16  447: 									index.sync()
c3db1a007e 2010-09-16  448: 									old_record = new_record
c3db1a007e 2010-09-16  449: 									stream_last = stream_pos
c3db1a007e 2010-09-16  450: 									if end - stream_last < block_size:
c3db1a007e 2010-09-16  451: 										req_block_size = end - stream_last
c3db1a007e 2010-09-16  452: 									buffer = source.read(req_block_size)
c3db1a007e 2010-09-16  453: 									length = len(buffer)
c3db1a007e 2010-09-16  454: 							# moving downloaded data to real file
c3db1a007e 2010-09-16  455: 							temp_file.close()
c3db1a007e 2010-09-16  456: 
c3db1a007e 2010-09-16  457: 						index[my_path] = new_record
c3db1a007e 2010-09-16  458: 						index.sync()
c3db1a007e 2010-09-16  459: 
c3db1a007e 2010-09-16  460: 				except urllib.error.HTTPError as error:
c3db1a007e 2010-09-16  461: 					# in case of error we don't need to do anything actually,
c3db1a007e 2010-09-16  462: 					# if file download stalls or fails the file would not be moved to its location
c3db1a007e 2010-09-16  463: 					print(error)
c3db1a007e 2010-09-16  464: 
c3db1a007e 2010-09-16  465: 			print(index[my_path])
c3db1a007e 2010-09-16  466: 
c3db1a007e 2010-09-16  467: 			if not os.access(file_name, os.R_OK) and os.access(temp_name, os.R_OK) and '_parts' in index[my_path] and index[my_path]['_parts'] == spacemap.SpaceMap():
c3db1a007e 2010-09-16  468: 				# just moving
c3db1a007e 2010-09-16  469: 				# drop old dirs XXX
c3db1a007e 2010-09-16  470: 				print('Moving temporary file to new destination.')
c3db1a007e 2010-09-16  471: 				os.renames(temp_name, file_name)
c3db1a007e 2010-09-16  472: 
c3db1a007e 2010-09-16  473: 			if not my_path in index:
c3db1a007e 2010-09-16  474: 				self.send_response(502)
c3db1a007e 2010-09-16  475: 				self.end_headers()
c3db1a007e 2010-09-16  476: 				return
c3db1a007e 2010-09-16  477: 
c3db1a007e 2010-09-16  478: 			if self.command == 'HEAD':
c3db1a007e 2010-09-16  479: 				self.send_response(200)
c3db1a007e 2010-09-16  480: 				if 'Content-Length' in index[my_path]:
c3db1a007e 2010-09-16  481: 					self.send_header('Content-Length', index[my_path]['Content-Length'])
c3db1a007e 2010-09-16  482: 				self.send_header('Accept-Ranges', 'bytes')
c3db1a007e 2010-09-16  483: 				self.send_header('Content-Type', 'application/octet-stream')
c3db1a007e 2010-09-16  484: 				if 'Last-Modified' in index[my_path]:
c3db1a007e 2010-09-16  485: 					self.send_header('Last-Modified', index[my_path]['Last-Modified'])
c3db1a007e 2010-09-16  486: 				self.end_headers()
c3db1a007e 2010-09-16  487: 			else:
c3db1a007e 2010-09-16  488: 				if ('_parts' in index[my_path] and index[my_path]['_parts'] != spacemap.SpaceMap()) or not os.access(file_name, os.R_OK):
c3db1a007e 2010-09-16  489: 					file_name = temp_name
c3db1a007e 2010-09-16  490: 
c3db1a007e 2010-09-16  491: 				with open(file_name, 'rb') as real_file:
c3db1a007e 2010-09-16  492: 					file_stat = os.stat(file_name)
c3db1a007e 2010-09-16  493: 					if 'Range' in self.headers:
c3db1a007e 2010-09-16  494: 						self.send_response(206)
c3db1a007e 2010-09-16  495: 						ranges = ()
c3db1a007e 2010-09-16  496: 						requested_ranges.rewind()
c3db1a007e 2010-09-16  497: 						while True:
c3db1a007e 2010-09-16  498: 							pair = requested_ranges.pop()
c3db1a007e 2010-09-16  499: 							if pair[0] == None:
c3db1a007e 2010-09-16  500: 								break
c3db1a007e 2010-09-16  501: 							ranges += '{}-{}'.format(pair[0], str(pair[1] - 1)),
c3db1a007e 2010-09-16  502: 						self.send_header('Content-Range', 'bytes {}/{}'.format(','.join(ranges), index[my_path]['Content-Length']))
c3db1a007e 2010-09-16  503: 					else:
c3db1a007e 2010-09-16  504: 						self.send_response(200)
c3db1a007e 2010-09-16  505: 						self.send_header('Content-Length', str(file_stat.st_size))
c3db1a007e 2010-09-16  506: 						requested_ranges = spacemap.SpaceMap({0: file_stat.st_size})
c3db1a007e 2010-09-16  507: 					if 'Last-Modified' in index[my_path]:
c3db1a007e 2010-09-16  508: 						self.send_header('Last-Modified', index[my_path]['Last-Modified'])
c3db1a007e 2010-09-16  509: 					self.send_header('Content-Type', 'application/octet-stream')
c3db1a007e 2010-09-16  510: 					self.end_headers()
c3db1a007e 2010-09-16  511: 					if self.command in ('GET'):
c3db1a007e 2010-09-16  512: 						if len(requested_ranges) > 0:
c3db1a007e 2010-09-16  513: 							requested_ranges.rewind()
c3db1a007e 2010-09-16  514: 							(start, end) = requested_ranges.pop()
c3db1a007e 2010-09-16  515: 						else:
c3db1a007e 2010-09-16  516: 							start = 0
c3db1a007e 2010-09-16  517: 							end = index[my_path]['Content-Length']
c3db1a007e 2010-09-16  518: 						real_file.seek(start)
c3db1a007e 2010-09-16  519: 						if block_size > end - start:
c3db1a007e 2010-09-16  520: 							req_block_size = end - start
c3db1a007e 2010-09-16  521: 						else:
c3db1a007e 2010-09-16  522: 							req_block_size = block_size
c3db1a007e 2010-09-16  523: 						buffer = real_file.read(req_block_size)
c3db1a007e 2010-09-16  524: 						length = len(buffer)
c3db1a007e 2010-09-16  525: 						while length > 0:
c3db1a007e 2010-09-16  526: 							self.wfile.write(buffer)
c3db1a007e 2010-09-16  527: 							start += len(buffer)
c3db1a007e 2010-09-16  528: 							if req_block_size > end - start:
c3db1a007e 2010-09-16  529: 								req_block_size = end - start
c3db1a007e 2010-09-16  530: 							if req_block_size == 0:
c3db1a007e 2010-09-16  531: 								break
c3db1a007e 2010-09-16  532: 							buffer = real_file.read(req_block_size)
c3db1a007e 2010-09-16  533: 							length = len(buffer)
c3db1a007e 2010-09-16  534: 					
c3db1a007e 2010-09-16  535: 		def do_HEAD(self):  # HEAD is delegated to the shared __process handler, which branches on self.command
c3db1a007e 2010-09-16  536: 			return self.__process()
c3db1a007e 2010-09-16  537: 		def do_GET(self):  # GET is delegated to the shared __process handler, which also serves HEAD
c3db1a007e 2010-09-16  538: 			return self.__process()
c3db1a007e 2010-09-16  539: 
c3db1a007e 2010-09-16  540: 	config.section('general')  # presumably selects the 'general' section for the config['port'] lookup below - verify against Config.section
c3db1a007e 2010-09-16  541: 	server = http.server.HTTPServer(('127.0.0.1', int(config['port'])), MyRequestHandler)  # binds to the loopback address only
c3db1a007e 2010-09-16  542: 	server.serve_forever()  # blocks here, handling requests until the server is shut down
c3db1a007e 2010-09-16  543: 
c3db1a007e 2010-09-16  544: else:
c3db1a007e 2010-09-16  545: 	while True:
c3db1a007e 2010-09-16  546: 		unchecked_files = set()
c3db1a007e 2010-09-16  547: 		checked_files = 0
c3db1a007e 2010-09-16  548: 
c3db1a007e 2010-09-16  549: 		# reading log and storing found urls for processing
c3db1a007e 2010-09-16  550: 		# check file mtime XXX
c3db1a007e 2010-09-16  551: 		with open(options.log, 'r') as log_file:
c3db1a007e 2010-09-16  552: 			log_line = re.compile('^[^ ]+ - - \[.*] "(GET|HEAD) (.*?)(\?.*)? HTTP/1.1" (\d+) \d+ "(.*)" "(.*)"$')
c3db1a007e 2010-09-16  553: 			for line in log_file:
c3db1a007e 2010-09-16  554: 				this_line = log_line.match(line.strip())
c3db1a007e 2010-09-16  555: 				if this_line:
c3db1a007e 2010-09-16  556: 					unchecked_files.add(this_line.group(2))
c3db1a007e 2010-09-16  557: 
c3db1a007e 2010-09-16  558: 		for url in unchecked_files:
c3db1a007e 2010-09-16  559: 			reload = False
c3db1a007e 2010-09-16  560: 			recheck = False
c3db1a007e 2010-09-16  561: 			info = 'Checking file: ' + url
c3db1a007e 2010-09-16  562: 
c3db1a007e 2010-09-16  563: 			# creating empty placeholder in index
c3db1a007e 2010-09-16  564: 			if not url in index:
c3db1a007e 2010-09-16  565: 				info += '\nThis one is new.'
c3db1a007e 2010-09-16  566: 				index[url] = {}
c3db1a007e 2010-09-16  567: 				reload = True
c3db1a007e 2010-09-16  568: 
c3db1a007e 2010-09-16  569: 			# creating file name from url
c3db1a007e 2010-09-16  570: 			file_name = options.dir + re.compile('%20').sub(' ', url)
c3db1a007e 2010-09-16  571: 
c3db1a007e 2010-09-16  572: 			# forcibly checking file if no file present
c3db1a007e 2010-09-16  573: 			if not reload and not os.access(file_name, os.R_OK):
c3db1a007e 2010-09-16  574: 				info += '\nFile not found or inaccessible.'
c3db1a007e 2010-09-16  575: 				reload = True
c3db1a007e 2010-09-16  576: 
c3db1a007e 2010-09-16  577: 			# forcibly checking file if file size doesn't match with index data
c3db1a007e 2010-09-16  578: 			elif not reload and 'Content-Length' in index[url] and os.stat(file_name).st_size != int(index[url]['Content-Length']):
c3db1a007e 2010-09-16  579: 				info += '\nFile size is ' + os.stat(file_name).st_size + ' and stored file size is ' + index[url]['Content-Length'] + '.'
c3db1a007e 2010-09-16  580: 				reload = True
c3db1a007e 2010-09-16  581: 
c3db1a007e 2010-09-16  582: 			# forcibly checking file if index hods Pragma header
c3db1a007e 2010-09-16  583: 			if not reload and 'Pragma' in index[url] and index[url]['Pragma'] == 'no-cache':
c3db1a007e 2010-09-16  584: 				info +='\nPragma on: recheck imminent.'
c3db1a007e 2010-09-16  585: 				recheck = True
c3db1a007e 2010-09-16  586: 
c3db1a007e 2010-09-16  587: 			# skipping file processing if there's no need to recheck it and we have checked it at least 4 hours ago
c3db1a007e 2010-09-16  588: 			if not recheck and not reload and (options.noupdate or ('_time' in index[url] and (datetime.datetime.now() - datetime.timedelta(hours = 4) - index[url]['_time']).days < 0)):
c3db1a007e 2010-09-16  589: 				if options.verbose:
c3db1a007e 2010-09-16  590: 					print(info)
c3db1a007e 2010-09-16  591: 				continue
c3db1a007e 2010-09-16  592: 			else:
c3db1a007e 2010-09-16  593: 				print(info)
c3db1a007e 2010-09-16  594: 
c3db1a007e 2010-09-16  595: 			try:
c3db1a007e 2010-09-16  596: 				with urllib.request.urlopen(options.root + url) as source:
c3db1a007e 2010-09-16  597: 					new_headers = {}
                       598: 					headers = source.info()
                       599: 
                       600: 					# stripping unneeded headers (XXX make this inplace?)
                       601: 					for header in headers:
                       602: 						if header in desc_fields:
c3db1a007e 2010-09-16  603: 							if header == 'Pragma' and headers[header] != 'no-cache':
c3db1a007e 2010-09-16  604: 								print('Pragma:', headers[header])
c3db1a007e 2010-09-16  605: 							new_headers[header] = headers[header]
                       606: 						elif not header in ignore_fields:
                       607: 							print('Undefined header "', header, '": ', headers[header], sep='')
                       608: 
                       609: 					# comparing headers with data found in index
                       610: 					# if any header has changed (except Pragma) file is fully downloaded
                       611: 					# same if we get more or less headers
c3db1a007e 2010-09-16  612: 					old_keys = set(index[url].keys())
                       613: 					old_keys.discard('_time')
                       614: 					old_keys.discard('Pragma')
c3db1a007e 2010-09-16  615: 					more_keys = set(new_headers.keys()) - old_keys
                       616: 					more_keys.discard('Pragma')
c3db1a007e 2010-09-16  617: 					less_keys = old_keys - set(new_headers.keys())
                       618: 					if len(more_keys) > 0:
                       619: 						if not len(old_keys) == 0:
                       620: 							print('More headers appear:', more_keys)
                       621: 						reload = True
                       622: 					elif len(less_keys) > 0:
                       623: 						print('Less headers appear:', less_keys)
                       624: 					else:
c3db1a007e 2010-09-16  625: 						for key in index[url].keys():
c3db1a007e 2010-09-16  626: 							if key[0] != '_' and key != 'Pragma' and not index[url][key] == new_headers[key]:
c3db1a007e 2010-09-16  627: 								print('Header "', key, '" changed from [', index[url][key], '] to [', new_headers[key], ']', sep='')
                       628: 								reload = True
                       629: 
c3db1a007e 2010-09-16  630: 					# downloading file
c3db1a007e 2010-09-16  631: 					if reload:
c3db1a007e 2010-09-16  632: 						if 'Content-Length' in headers:
c3db1a007e 2010-09-16  633: 							print('Downloading', headers['Content-Length'], 'bytes [', end='')
c3db1a007e 2010-09-16  634: 						else:
c3db1a007e 2010-09-16  635: 							print('Downloading [', end='')
c3db1a007e 2010-09-16  636: 						sys.stdout.flush()
c3db1a007e 2010-09-16  637: 
                       638: 						# file is created at temporary location and moved in place only when download completes
c3db1a007e 2010-09-16  639: 						temp_file = open(options.dir + os.sep + '.tmp', 'wb')
c3db1a007e 2010-09-16  640: 						buffer = source.read(block_size)
c3db1a007e 2010-09-16  641: 						megablocks = 0
c3db1a007e 2010-09-16  642: 						blocks = 0
c3db1a007e 2010-09-16  643: 						megs = 0
c3db1a007e 2010-09-16  644: 						while len(buffer) > 0:
c3db1a007e 2010-09-16  645: 							temp_file.write(buffer)
c3db1a007e 2010-09-16  646: 							buffer = source.read(block_size)
c3db1a007e 2010-09-16  647: 							blocks += 1
c3db1a007e 2010-09-16  648: 							if blocks > 102400/block_size:
c3db1a007e 2010-09-16  649: 								megablocks += 1
c3db1a007e 2010-09-16  650: 								if megablocks > 10:
c3db1a007e 2010-09-16  651: 									megablocks = megablocks - 10
c3db1a007e 2010-09-16  652: 									megs += 1
c3db1a007e 2010-09-16  653: 									print('{}Mb'.format(megs), end='')
c3db1a007e 2010-09-16  654: 								else:
c3db1a007e 2010-09-16  655: 									print('.', end='')
c3db1a007e 2010-09-16  656: 								blocks = blocks - 102400/block_size
c3db1a007e 2010-09-16  657: 							sys.stdout.flush()
c3db1a007e 2010-09-16  658: 						temp_file.close()
c3db1a007e 2010-09-16  659: 						print(']')
c3db1a007e 2010-09-16  660: 						os.renames(options.dir + os.sep + '.tmp', file_name)
c3db1a007e 2010-09-16  661: 
c3db1a007e 2010-09-16  662: 						checked_files += 1
c3db1a007e 2010-09-16  663: 
c3db1a007e 2010-09-16  664: 					# storing new time mark and storing new headers
c3db1a007e 2010-09-16  665: 					new_headers['_time'] = datetime.datetime.now()
c3db1a007e 2010-09-16  666: 					index[url] = new_headers
                       667: 					index.sync()
                       668: 
                       669: 			except urllib.error.HTTPError as error:
                       670: 				# in case of error we don't need to do anything actually,
                       671: 				# if file download stalls or fails the file would not be moved to it's location
                       672: 				print(error)
                       673: 
c3db1a007e 2010-09-16  674: 		if options.verbose:
c3db1a007e 2010-09-16  675: 			print('[', len(unchecked_files), '/', checked_files, ']')
c3db1a007e 2010-09-16  676: 
c3db1a007e 2010-09-16  677: 		# checking if there were any files downloaded, if yes - restarting sequence
c3db1a007e 2010-09-16  678: 		if checked_files == 0:
c3db1a007e 2010-09-16  679: 			break