Samesite - proxy that can cache partial transfers

Annotation For samesite.py
anonymous

Annotation For samesite.py

Origin for each line in samesite.py from check-in b0975a28fb:

08ae38b6ce 2010-06-25    1: #!/usr/bin/env python3.1
08ae38b6ce 2010-06-25    2: 
e7b837a681 2010-08-25    3: import datetime, http.cookiejar, os, sys, shelve, spacemap, re, urllib.request
e7b837a681 2010-08-25    4: 
class Config:
	"""Per-site configuration backed by an INI file.

	The file is chosen with the ``-c``/``--config`` command line option
	(default ``samesite.conf``).  While loading, every section other than
	``general`` gets an absolute ``dir`` option without a trailing slash and
	a ``root`` option that defaults to the section name.  Pick a section with
	``section()`` and read its options with ``config[name]``.
	"""

	# '_default' must not be listed here: a slot that shares its name with a
	# class attribute makes the class statement itself raise ValueError.
	__slots__ = frozenset(['_config', '_section', 'options', 'root'])
	# Fallback values: 'general' has its own table, any other section that is
	# not named here uses '_other'.
	_default = {
		'general': {
			'port': '8008',
		},
		'_other': {
			'verbose': 'no',
			'noetag': 'no',
			'noparts': 'no',
		},
	}

	def __init__(self):
		"""Parse the command line, read the config file, normalise sections.

		Exits through optparse's error handler when the config file cannot
		be read.
		"""
		import configparser, optparse

		parser = optparse.OptionParser()
		parser.add_option('-c', '--config', dest = 'config', help = 'config file location', metavar = 'FILE', default = 'samesite.conf')
		(self.options, args) = parser.parse_args()

		# An explicit check instead of assert, which disappears under -O.
		if not os.access(self.options.config, os.R_OK):
			parser.error("Fatal error: can't read {}".format(self.options.config))

		# Relative paths are resolved against the directory of the config
		# file; a bare file name means the current directory.
		(head, slash, tail) = self.options.config.rpartition('/')
		if slash and tail:
			self.root = head
		else:
			self.root = os.getcwd()

		self._config = configparser.ConfigParser()
		with open(self.options.config) as config_file:
			# read_file() replaced readfp(); fall back for interpreters
			# that only have the old name.
			reader = getattr(self._config, 'read_file', None) or self._config.readfp
			reader(config_file)

		for section in self._config.sections():
			if section == 'general':
				continue

			if self._config.has_option(section, 'dir'):
				directory = self._config.get(section, 'dir')
				# a lone slash means "use the default location"
				if directory == '/':
					directory = self.root + os.sep + section
				if directory.endswith('/'):
					directory = directory[:-1]
				if not directory.startswith('/'):
					directory = self.root + os.sep + directory
			else:
				directory = self.root + os.sep + section
			self._config.set(section, 'dir', directory)

			if not self._config.has_option(section, 'root'):
				self._config.set(section, 'root', section)

	def section(self, section):
		"""Make *section* the current one, creating it if it does not exist."""
		if not self._config.has_section(section):
			self._config.add_section(section)
		self._section = section

	def __getitem__(self, name):
		"""Return option *name* of the current section.

		When the option is not set the built-in default is returned, and
		None when there is no default either.
		"""
		if self._config.has_option(self._section, name):
			return self._config.get(self._section, name)
		# The fallback is returned directly instead of being stored first:
		# ConfigParser.set() refuses non-string values such as None.
		defaults = self._default.get(self._section, self._default['_other'])
		return defaults.get(name)
e7b837a681 2010-08-25   72: 
# Module-wide configuration object.  Constructing it parses the command line
# and reads the config file, so both happen as soon as this statement runs.
config = Config()

# Disabled check; `options` is not defined anywhere in the visible code.
#assert options.port or os.access(options.log, os.R_OK), 'Log file unreadable'
e7b837a681 2010-08-25   76: 
# Upstream response headers that are copied into a file's index record.
const_desc_fields = {'Content-Length', 'Pragma', 'Last-Modified'}
# Upstream response headers that are dropped without being reported.
const_ignore_fields = {
	'Accept-Ranges',
	'Age',
	'Cache-Control',
	'Connection',
	'Content-Type',
	'Date',
	'Expires',
	'Server',
	'Via',
	'X-Cache',
	'X-Cache-Lookup',
	'X-Powered-By',
}

# Largest number of bytes moved by a single read while copying data.
block_size = 4096
80f8e3804a 2010-08-20   81: 
# Draft of a kqueue-driven server.  It is wrapped in a bare string literal,
# so none of it is executed.  The draft refers to an `options` object that is
# not defined anywhere in the visible code.
'''
# later, kqueue would be good but later
class Connection:
	__slots__ = frozenset(('__address', '__input', '__socket', '__status', 'error', 'method', 'url', 'http_version'))

	def __init__(self, socket, address):
		self.__address = address
		self.__input = b''
		self.__socket = socket
		self.__status = 0

	def read(self, kev):
		buffer = self.__socket.recv(kev.data)
		exhausted = False
		if len(buffer) == 0:
			eof = True
		else:
			self.__input += buffer
			while not exhausted:
				if self.__status == -1:
					exhausted = True
				elif self.__status == 0:
					endstring = self.__input.find(b'\n')
					if endstring > 0:
						print('Processing request line.')
						line = self.__input[:endstring].decode('ascii')
						self.__input = self.__input[endstring + 1:]
						isRequest = re.compile('(GET) ([^ ]+) HTTP/(1\.0)').match(line)
						if not isRequest:
							self.error = 'Not a HTTP connection.'
							self.__status = -1
						else:
							self.method = isRequest.group(1)
							self.url = isRequest.group(2)
							self.http_version = isRequest.group(3)
							self.__status = 1
					else:
						exhausted = True
				elif self.__status == 1:
					endstring = self.__input.find(b'\n')
					if endstring > 0:
						print('Processing header line.' + repr(self.__input))
						line = self.__input[:endstring].decode('ascii')
						self.__input = self.__input[endstring + 1:]
						isHeader = re.compile('([^:]*): +(.*)').match(line)
						if not isHeader:
							self.error = 'Bad header.'
							return(False)
						# process header here
					elif endstring == 0:
						self.__status = 2
					else:
						exhausted = True

	def write(self, kev):
		pass

if options.port:
	import select, socket

	sock = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
	try:
		sock.bind(('127.0.0.1', int(options.port)))
		sock.listen(-1)

		kq = select.kqueue()
		assert kq.fileno() != -1, "Fatal error: can't initialise kqueue."

		kq.control([select.kevent(sock, select.KQ_FILTER_READ, select.KQ_EV_ADD)], 0)
		timeout = None

		connections = {sock.fileno(): None}

		while True:
			kevs = kq.control(None, 1, timeout)

			for kev in kevs:
				if type(connections[kev.ident]) == Connection:
					print(kev.ident, kev.data, kev.filter, kev.flags)
					assert kev.data != 0, 'No data available.'
					if kev.filter == select.KQ_FILTER_READ:
						connections[kev.ident].read(kev)
					elif kev.filter == select.KQ_FILTER_WRITE:
						connections[kev.ident].write(kev)
					else:
						assert kev.filter in (select.KQ_FILTER_READ, select.KQ_FILTER_WRITE), 'Do we support other filters?'
				else:
					(conn, addr) = sock.accept()
					print('Connection from ' + repr(addr))
					kq.control([select.kevent(conn, select.KQ_FILTER_READ, select.KQ_EV_ADD)], 0)
					connections[conn.fileno()] = Connection(conn, addr)

				if kev.flags >> 15 == 1:
					kq.control([select.kevent(kev.ident, select.KQ_FILTER_READ, select.KQ_EV_DELETE)], 0)
					kq.control([select.kevent(kev.ident, select.KQ_FILTER_WRITE, select.KQ_EV_DELETE)], 0)
					del(connections[kev.ident])
	finally:
		sock.close()
'''
80f8e3804a 2010-08-20  181: 
e7b837a681 2010-08-25  182: # XXX how about rechecking files?
e7b837a681 2010-08-25  183: if True:
80f8e3804a 2010-08-20  184: 	import http.server
80f8e3804a 2010-08-20  185: 
80f8e3804a 2010-08-20  186: 	class MyRequestHandler(http.server.BaseHTTPRequestHandler):
80f8e3804a 2010-08-20  187: 		def __process(self):
80f8e3804a 2010-08-20  188: 			# reload means file needs to be reloaded to serve request
80f8e3804a 2010-08-20  189: 			reload = False
80f8e3804a 2010-08-20  190: 			# recheck means file needs to be checked, this also means that if file hav been modified we can serve older copy
80f8e3804a 2010-08-20  191: 			recheck = False
80f8e3804a 2010-08-20  192: 			# file_stat means file definitely exists
80f8e3804a 2010-08-20  193: 			file_stat = None
80f8e3804a 2010-08-20  194: 			# requested_ranges holds data about any range requested
80f8e3804a 2010-08-20  195: 			requested_ranges = None
80f8e3804a 2010-08-20  196: 			# records holds data from index locally, should be written back upon successfull completion
80f8e3804a 2010-08-20  197: 			record = None
80f8e3804a 2010-08-20  198: 			info = 'Checking file: ' + self.path
80f8e3804a 2010-08-20  199: 
d0071bdbc7 2010-08-20  200: 			myPath = re.compile('^(.*?)(\?.*)$').match(self.path)
d0071bdbc7 2010-08-20  201: 			if myPath:
d0071bdbc7 2010-08-20  202: 				my_path = myPath.group(1)
d0071bdbc7 2010-08-20  203: 			else:
d0071bdbc7 2010-08-20  204: 				my_path = self.path
d0071bdbc7 2010-08-20  205: 
e7b837a681 2010-08-25  206: 			config.section(self.headers['Host'])
e7b837a681 2010-08-25  207: 
e7b837a681 2010-08-25  208: 			if not os.access(config['dir'], os.X_OK):
e7b837a681 2010-08-25  209: 				os.mkdir(config['dir'])
e7b837a681 2010-08-25  210: 			# this is file index - everything is stored in this file
e7b837a681 2010-08-25  211: 			# _parts - list of stored parts of file
e7b837a681 2010-08-25  212: 			# _time - last time the file was checked
e7b837a681 2010-08-25  213: 			# everything else is just the headers
e7b837a681 2010-08-25  214: 			index = shelve.open(config['dir'] + os.sep + '.index')
e7b837a681 2010-08-25  215: 
e7b837a681 2010-08-25  216: 			desc_fields = const_desc_fields.copy()
e7b837a681 2010-08-25  217: 			ignore_fields = const_ignore_fields.copy()
e7b837a681 2010-08-25  218: 			if not config['noetag']:
e7b837a681 2010-08-25  219: 				desc_fields.add('ETag')
e7b837a681 2010-08-25  220: 			else:
e7b837a681 2010-08-25  221: 				ignore_fields.add('ETag')
e7b837a681 2010-08-25  222: 
80f8e3804a 2010-08-20  223: 			proxy_ignored = ('Accept', 'Accept-Encoding',
80f8e3804a 2010-08-20  224: 				'Cache-Control', 'Connection',
80f8e3804a 2010-08-20  225: 				'Host',
fb10031536 2010-08-21  226: 				'If-Modified-Since', 'If-Unmodified-Since',
80f8e3804a 2010-08-20  227: 				'User-Agent',
80f8e3804a 2010-08-20  228: 				'Via',
80f8e3804a 2010-08-20  229: 				'X-Forwarded-For',
80f8e3804a 2010-08-20  230: 			)
80f8e3804a 2010-08-20  231: 
fb10031536 2010-08-21  232: 			print('===============[ {} request ]==='.format(self.command))
80f8e3804a 2010-08-20  233: 
80f8e3804a 2010-08-20  234: 			for header in self.headers:
80f8e3804a 2010-08-20  235: 				if header in proxy_ignored:
80f8e3804a 2010-08-20  236: 					pass
80f8e3804a 2010-08-20  237: 				elif header in ('Range'):
80f8e3804a 2010-08-20  238: 					isRange = re.compile('bytes=(\d+)-(\d+)').match(self.headers[header])
80f8e3804a 2010-08-20  239: 					if isRange:
e7b837a681 2010-08-25  240: 						requested_ranges = spacemap.SpaceMap({int(isRange.group(1)): int(isRange.group(2)) + 1})
80f8e3804a 2010-08-20  241: 					else:
80f8e3804a 2010-08-20  242: 						return()
e7b837a681 2010-08-25  243: 				elif header in ('Pragma'):
e7b837a681 2010-08-25  244: 					if my_path in index:
e7b837a681 2010-08-25  245: 						index[my_path][header] = self.headers[header]
80f8e3804a 2010-08-20  246: 				else:
80f8e3804a 2010-08-20  247: 					print('Unknown header - ', header, ': ', self.headers[header], sep='')
80f8e3804a 2010-08-20  248: 					return()
80f8e3804a 2010-08-20  249: 				print(header, self.headers[header])
e7b837a681 2010-08-25  250: 
b0975a28fb 2010-08-26  251: 			# creating file name from my_path
b0975a28fb 2010-08-26  252: 			file_name = config['dir'] + os.sep + re.compile('%20').sub(' ', my_path)
b0975a28fb 2010-08-26  253: 			# partial file or unfinished download
b0975a28fb 2010-08-26  254: 			temp_name = config['dir'] + os.sep + '.parts' + re.compile('%20').sub(' ', my_path)
b0975a28fb 2010-08-26  255: 
80f8e3804a 2010-08-20  256: 			# creating empty placeholder in index
80f8e3804a 2010-08-20  257: 			# if there's no space map and there's no file in real directory - we have no file
80f8e3804a 2010-08-20  258: 			# if there's an empty space map - file is full
80f8e3804a 2010-08-20  259: 			# space map generally covers every bit of file we don't posess currently
d0071bdbc7 2010-08-20  260: 			if not my_path in index:
80f8e3804a 2010-08-20  261: 				info += '\nThis one is new.'
80f8e3804a 2010-08-20  262: 				reload = True
e7b837a681 2010-08-25  263: 				record = {}
80f8e3804a 2010-08-20  264: 			else:
b0975a28fb 2010-08-26  265: 				# forcibly checking file if no file present
b0975a28fb 2010-08-26  266: 				if os.access(file_name, os.R_OK):
b0975a28fb 2010-08-26  267: 					file_stat = os.stat(file_name)
b0975a28fb 2010-08-26  268: 				elif '_parts' in index[my_path] and os.access(temp_name, os.R_OK):
b0975a28fb 2010-08-26  269: 					file_stat = os.stat(temp_name)
b0975a28fb 2010-08-26  270: 				else:
b0975a28fb 2010-08-26  271: 					info += '\nFile not found or inaccessible.'
b0975a28fb 2010-08-26  272: 					index[my_path]['_parts'] = None
b0975a28fb 2010-08-26  273: 					reload = True
d0071bdbc7 2010-08-20  274: 				record = index[my_path]
b0975a28fb 2010-08-26  275: 
b0975a28fb 2010-08-26  276: 			print(record)
e7b837a681 2010-08-25  277: 
e7b837a681 2010-08-25  278: 			if not '_parts' in record:
e7b837a681 2010-08-25  279: 				record['_parts'] = None
e7b837a681 2010-08-25  280: 
b0975a28fb 2010-08-26  281: 			if record['_parts'] == None:
b0975a28fb 2010-08-26  282: 				recheck = True
80f8e3804a 2010-08-20  283: 
80f8e3804a 2010-08-20  284: 			# forcibly checking file if file size doesn't match with index data
80f8e3804a 2010-08-20  285: 			if not reload:
e7b837a681 2010-08-25  286: 				if '_parts' in record and record['_parts'] == spacemap.SpaceMap():
80f8e3804a 2010-08-20  287: 					if 'Content-Length' in record and file_stat and file_stat.st_size != int(record['Content-Length']):
80f8e3804a 2010-08-20  288: 						info += '\nFile size is {} and stored file size is {}.'.format(file_stat.st_size, record['Content-Length'])
b0975a28fb 2010-08-26  289: 						record['_parts'] = None
80f8e3804a 2010-08-20  290: 						reload = True
80f8e3804a 2010-08-20  291: 
80f8e3804a 2010-08-20  292: 			# forcibly checking file if index holds Pragma header
80f8e3804a 2010-08-20  293: 			if not reload and 'Pragma' in record and record['Pragma'] == 'no-cache':
80f8e3804a 2010-08-20  294: 				info +='\nPragma on: recheck imminent.'
80f8e3804a 2010-08-20  295: 				recheck = True
80f8e3804a 2010-08-20  296: 
80f8e3804a 2010-08-20  297: 			# skipping file processing if there's no need to recheck it and we have checked it at least 4 hours ago
80f8e3804a 2010-08-20  298: 			if not recheck and not reload and '_time' in record and (datetime.datetime.now() - datetime.timedelta(hours = 4) - record['_time']).days < 0:
80f8e3804a 2010-08-20  299: 				recheck = True
80f8e3804a 2010-08-20  300: 
80f8e3804a 2010-08-20  301: 			print(info)
80f8e3804a 2010-08-20  302: 			if reload or recheck:
80f8e3804a 2010-08-20  303: 
80f8e3804a 2010-08-20  304: 				try:
e7b837a681 2010-08-25  305: 					request = 'http://' + config['root'] + my_path
fb10031536 2010-08-21  306: 					needed = None
e7b837a681 2010-08-25  307: 					# XXX and if we specify full file we don't go partial?
80f8e3804a 2010-08-20  308: 					if requested_ranges != None:
80f8e3804a 2010-08-20  309: 						if '_parts' in record and record['_parts'] != None:
b0975a28fb 2010-08-26  310: 							if config['noparts']:
b0975a28fb 2010-08-26  311: 								needed = record['_parts']
b0975a28fb 2010-08-26  312: 							else:
b0975a28fb 2010-08-26  313: 								needed = record['_parts'] | requested_ranges
b0975a28fb 2010-08-26  314: 						elif not config['noparts']:
80f8e3804a 2010-08-20  315: 							needed = requested_ranges
80f8e3804a 2010-08-20  316: 						ranges = ()
fb10031536 2010-08-21  317: 						print('Missing ranges: {}, requested ranges: {}, needed ranges: {}.'.format(record['_parts'], requested_ranges, needed))
b0975a28fb 2010-08-26  318: 						if needed != None and len(needed) > 0:
fb10031536 2010-08-21  319: 							needed.rewind()
fb10031536 2010-08-21  320: 							while True:
fb10031536 2010-08-21  321: 								range = needed.pop()
fb10031536 2010-08-21  322: 								if range[0] == None:
fb10031536 2010-08-21  323: 									break
fb10031536 2010-08-21  324: 								ranges += '{}-{}'.format(range[0], range[1] - 1),
fb10031536 2010-08-21  325: 							request = urllib.request.Request(request, headers = {'Range': 'bytes=' + ','.join(ranges)})
80f8e3804a 2010-08-20  326: 
80f8e3804a 2010-08-20  327: 					with urllib.request.urlopen(request) as source:
80f8e3804a 2010-08-20  328: 						new_record = {}
80f8e3804a 2010-08-20  329: 						new_record['_parts'] = record['_parts']
80f8e3804a 2010-08-20  330: 						headers = source.info()
80f8e3804a 2010-08-20  331: 
80f8e3804a 2010-08-20  332: 						# stripping unneeded headers (XXX make this inplace?)
80f8e3804a 2010-08-20  333: 						for header in headers:
80f8e3804a 2010-08-20  334: 							if header in desc_fields:
80f8e3804a 2010-08-20  335: 								#if header == 'Pragma' and headers[header] != 'no-cache':
80f8e3804a 2010-08-20  336: 								if header == 'Content-Length':
80f8e3804a 2010-08-20  337: 									if 'Content-Range' not in headers:
fb10031536 2010-08-21  338: 										new_record[header] = int(headers[header])
80f8e3804a 2010-08-20  339: 								else:
80f8e3804a 2010-08-20  340: 									new_record[header] = headers[header]
80f8e3804a 2010-08-20  341: 							elif header == 'Content-Range':
80f8e3804a 2010-08-20  342: 								range = re.compile('^bytes (\d+)-(\d+)/(\d+)$').match(headers[header])
80f8e3804a 2010-08-20  343: 								if range:
fb10031536 2010-08-21  344: 									new_record['Content-Length'] = int(range.group(3))
80f8e3804a 2010-08-20  345: 								else:	
80f8e3804a 2010-08-20  346: 									assert False, 'Content-Range unrecognized.'
80f8e3804a 2010-08-20  347: 							elif not header in ignore_fields:
80f8e3804a 2010-08-20  348: 								print('Undefined header "', header, '": ', headers[header], sep='')
80f8e3804a 2010-08-20  349: 
80f8e3804a 2010-08-20  350: 						# comparing headers with data found in index
80f8e3804a 2010-08-20  351: 						# if any header has changed (except Pragma) file is fully downloaded
80f8e3804a 2010-08-20  352: 						# same if we get more or less headers
80f8e3804a 2010-08-20  353: 						old_keys = set(record.keys())
80f8e3804a 2010-08-20  354: 						old_keys.discard('_time')
80f8e3804a 2010-08-20  355: 						old_keys.discard('Pragma')
80f8e3804a 2010-08-20  356: 						more_keys = set(new_record.keys()) - old_keys
80f8e3804a 2010-08-20  357: 						more_keys.discard('Pragma')
80f8e3804a 2010-08-20  358: 						less_keys = old_keys - set(new_record.keys())
80f8e3804a 2010-08-20  359: 						if len(more_keys) > 0:
80f8e3804a 2010-08-20  360: 							if not len(old_keys) == 0:
80f8e3804a 2010-08-20  361: 								print('More headers appear:', more_keys)
80f8e3804a 2010-08-20  362: 							reload = True
80f8e3804a 2010-08-20  363: 						elif len(less_keys) > 0:
80f8e3804a 2010-08-20  364: 							print('Less headers appear:', less_keys)
80f8e3804a 2010-08-20  365: 						else:
80f8e3804a 2010-08-20  366: 							for key in record.keys():
80f8e3804a 2010-08-20  367: 								if key[0] != '_' and key != 'Pragma' and not record[key] == new_record[key]:
80f8e3804a 2010-08-20  368: 									print('Header "', key, '" changed from [', record[key], '] to [', new_record[key], ']', sep='')
fb10031536 2010-08-21  369: 									print(type(record[key]), type(new_record[key]))
80f8e3804a 2010-08-20  370: 									reload = True
80f8e3804a 2010-08-20  371: 
80f8e3804a 2010-08-20  372: 						if reload:
80f8e3804a 2010-08-20  373: 							print('Reloading.')
80f8e3804a 2010-08-20  374: 							if os.access(temp_name, os.R_OK):
80f8e3804a 2010-08-20  375: 								os.unlink(temp_name)
80f8e3804a 2010-08-20  376: 							if os.access(file_name, os.R_OK):
80f8e3804a 2010-08-20  377: 								os.unlink(file_name)
e7b837a681 2010-08-25  378: 							new_record['_parts'] = spacemap.SpaceMap({0: int(new_record['Content-Length'])})
fb10031536 2010-08-21  379: 						print(new_record)
80f8e3804a 2010-08-20  380: 
80f8e3804a 2010-08-20  381: 						# downloading file or segment
80f8e3804a 2010-08-20  382: 						if 'Content-Length' in new_record:
fb10031536 2010-08-21  383: 							if needed == None:
fb10031536 2010-08-21  384: 								needed = new_record['_parts']
80f8e3804a 2010-08-20  385: 							else:
fb10031536 2010-08-21  386: 								if len(needed) > 1:
80f8e3804a 2010-08-20  387: 									print("Multipart requests currently not supported.")
80f8e3804a 2010-08-20  388: 									assert False, 'Skip this one for now.'
80f8e3804a 2010-08-20  389: 						else:
80f8e3804a 2010-08-20  390: 							assert False, 'No Content-Length or Content-Range header.'
80f8e3804a 2010-08-20  391: 
fb10031536 2010-08-21  392: 						new_record['_time'] = datetime.datetime.now()
fb10031536 2010-08-21  393: 						if self.command not in ('HEAD'):
fb10031536 2010-08-21  394: 							# file is created at temporary location and moved in place only when download completes
fb10031536 2010-08-21  395: 							if not os.access(temp_name, os.R_OK):
e7b837a681 2010-08-25  396: 								empty_name = config['dir'] + os.sep + '.tmp'
fb10031536 2010-08-21  397: 								with open(empty_name, 'w+b') as some_file:
fb10031536 2010-08-21  398: 									pass
fb10031536 2010-08-21  399: 								os.renames(empty_name, temp_name)
fb10031536 2010-08-21  400: 							temp_file = open(temp_name, 'r+b')
fb10031536 2010-08-21  401: 							needed.rewind()
fb10031536 2010-08-21  402: 							while True:
fb10031536 2010-08-21  403: 								(start, end) = needed.pop()
fb10031536 2010-08-21  404: 								if start == None:
fb10031536 2010-08-21  405: 									break
fb10031536 2010-08-21  406: 								stream_last = start
fb10031536 2010-08-21  407: 								old_record = new_record
fb10031536 2010-08-21  408: 								if end - start < block_size:
fb10031536 2010-08-21  409: 									req_block_size = end - start
fb10031536 2010-08-21  410: 								else:
fb10031536 2010-08-21  411: 									req_block_size = block_size
fb10031536 2010-08-21  412: 								buffer = source.read(req_block_size)
fb10031536 2010-08-21  413: 								length = len(buffer)
fb10031536 2010-08-21  414: 								while length > 0 and stream_last < end:
fb10031536 2010-08-21  415: 									stream_pos = stream_last + length
fb10031536 2010-08-21  416: 									assert not stream_pos > end, 'Received more data then requested: pos:{} start:{} end:{}.'.format(stream_pos, start, end)
fb10031536 2010-08-21  417: 									temp_file.seek(stream_last)
fb10031536 2010-08-21  418: 									temp_file.write(buffer)
e7b837a681 2010-08-25  419: 									new_record['_parts'] = new_record['_parts'] - spacemap.SpaceMap({stream_last: stream_pos})
fb10031536 2010-08-21  420: 									index[my_path] = old_record
fb10031536 2010-08-21  421: 									index.sync()
fb10031536 2010-08-21  422: 									old_record = new_record
fb10031536 2010-08-21  423: 									stream_last = stream_pos
fb10031536 2010-08-21  424: 									if end - stream_last < block_size:
fb10031536 2010-08-21  425: 										req_block_size = end - stream_last
fb10031536 2010-08-21  426: 									buffer = source.read(req_block_size)
fb10031536 2010-08-21  427: 									length = len(buffer)
fb10031536 2010-08-21  428: 							# moving downloaded data to real file
fb10031536 2010-08-21  429: 							temp_file.close()
fb10031536 2010-08-21  430: 
fb10031536 2010-08-21  431: 						print(new_record)
fb10031536 2010-08-21  432: 						index[my_path] = new_record
fb10031536 2010-08-21  433: 						index.sync()
80f8e3804a 2010-08-20  434: 
80f8e3804a 2010-08-20  435: 				except urllib.error.HTTPError as error:
80f8e3804a 2010-08-20  436: 					# in case of error we don't need to do anything actually,
80f8e3804a 2010-08-20  437: 					# if file download stalls or fails the file would not be moved to it's location
80f8e3804a 2010-08-20  438: 					print(error)
e7b837a681 2010-08-25  439: 
e7b837a681 2010-08-25  440: 			if '_parts' in index[my_path] and index[my_path]['_parts'] == spacemap.SpaceMap():
e7b837a681 2010-08-25  441: 				# just moving
e7b837a681 2010-08-25  442: 				# drop old dirs XXX
e7b837a681 2010-08-25  443: 				print('Moving temporary file to new destination.')
e7b837a681 2010-08-25  444: 				os.renames(temp_name, file_name)
fb10031536 2010-08-21  445: 
80f8e3804a 2010-08-20  446: 			if self.command == 'HEAD':
80f8e3804a 2010-08-20  447: 				self.send_response(200)
d0071bdbc7 2010-08-20  448: 				if 'Content-Length' in index[my_path]:
d0071bdbc7 2010-08-20  449: 					self.send_header('Content-Length', index[my_path]['Content-Length'])
80f8e3804a 2010-08-20  450: 				self.send_header('Accept-Ranges', 'bytes')
80f8e3804a 2010-08-20  451: 				self.send_header('Content-Type', 'application/octet-stream')
d0071bdbc7 2010-08-20  452: 				if 'Last-Modified' in index[my_path]:
d0071bdbc7 2010-08-20  453: 					self.send_header('Last-Modified', index[my_path]['Last-Modified'])
80f8e3804a 2010-08-20  454: 				self.end_headers()
80f8e3804a 2010-08-20  455: 			else:
e7b837a681 2010-08-25  456: 				if ('_parts' in index[my_path] and index[my_path]['_parts'] != spacemap.SpaceMap()) or not os.access(file_name, os.R_OK):
80f8e3804a 2010-08-20  457: 					file_name = temp_name
80f8e3804a 2010-08-20  458: 
80f8e3804a 2010-08-20  459: 				with open(file_name, 'rb') as real_file:
80f8e3804a 2010-08-20  460: 					file_stat = os.stat(file_name)
fb10031536 2010-08-21  461: 					if 'Range' in self.headers:
fb10031536 2010-08-21  462: 						self.send_response(206)
80f8e3804a 2010-08-20  463: 						ranges = ()
80f8e3804a 2010-08-20  464: 						requested_ranges.rewind()
80f8e3804a 2010-08-20  465: 						while True:
80f8e3804a 2010-08-20  466: 							pair = requested_ranges.pop()
80f8e3804a 2010-08-20  467: 							if pair[0] == None:
80f8e3804a 2010-08-20  468: 								break
80f8e3804a 2010-08-20  469: 							ranges += '{}-{}'.format(pair[0], str(pair[1] - 1)),
fb10031536 2010-08-21  470: 						self.send_header('Content-Range', 'bytes {}/{}'.format(','.join(ranges), index[my_path]['Content-Length']))
80f8e3804a 2010-08-20  471: 					else:
fb10031536 2010-08-21  472: 						self.send_response(200)
80f8e3804a 2010-08-20  473: 						self.send_header('Content-Length', str(file_stat.st_size))
e7b837a681 2010-08-25  474: 						requested_ranges = spacemap.SpaceMap({0: file_stat.st_size})
fb10031536 2010-08-21  475: 					self.send_header('Last-Modified', index[my_path]['Last-Modified'])
80f8e3804a 2010-08-20  476: 					self.send_header('Content-Type', 'application/octet-stream')
80f8e3804a 2010-08-20  477: 					self.end_headers()
80f8e3804a 2010-08-20  478: 					if self.command in ('GET'):
fb10031536 2010-08-21  479: 						if len(requested_ranges) > 0:
fb10031536 2010-08-21  480: 							requested_ranges.rewind()
fb10031536 2010-08-21  481: 							(start, end) = requested_ranges.pop()
fb10031536 2010-08-21  482: 						else:
fb10031536 2010-08-21  483: 							start = 0
fb10031536 2010-08-21  484: 							end = index[my_path]['Content-Length']
80f8e3804a 2010-08-20  485: 						real_file.seek(start)
80f8e3804a 2010-08-20  486: 						if block_size > end - start:
80f8e3804a 2010-08-20  487: 							req_block_size = end - start
80f8e3804a 2010-08-20  488: 						else:
80f8e3804a 2010-08-20  489: 							req_block_size = block_size
80f8e3804a 2010-08-20  490: 						buffer = real_file.read(req_block_size)
80f8e3804a 2010-08-20  491: 						length = len(buffer)
80f8e3804a 2010-08-20  492: 						while length > 0:
80f8e3804a 2010-08-20  493: 							self.wfile.write(buffer)
80f8e3804a 2010-08-20  494: 							start += len(buffer)
80f8e3804a 2010-08-20  495: 							if req_block_size > end - start:
80f8e3804a 2010-08-20  496: 								req_block_size = end - start
80f8e3804a 2010-08-20  497: 							if req_block_size == 0:
80f8e3804a 2010-08-20  498: 								break
80f8e3804a 2010-08-20  499: 							buffer = real_file.read(req_block_size)
80f8e3804a 2010-08-20  500: 							length = len(buffer)
80f8e3804a 2010-08-20  501: 					
80f8e3804a 2010-08-20  502: 		def do_HEAD(self):  # HEAD handler: delegates to the private __process method, same as do_GET
80f8e3804a 2010-08-20  503: 			return self.__process()
80f8e3804a 2010-08-20  504: 		def do_GET(self):  # GET handler: delegates to the private __process method, same as do_HEAD
80f8e3804a 2010-08-20  505: 			return self.__process()
80f8e3804a 2010-08-20  506: 
e7b837a681 2010-08-25  507: 	config.section('general')  # presumably makes 'general' the section used by the config['port'] lookup below - confirm against Config.section
e7b837a681 2010-08-25  508: 	server = http.server.HTTPServer(('127.0.0.1', int(config['port'])), MyRequestHandler)  # bound to the loopback address only
80f8e3804a 2010-08-20  509: 	server.serve_forever()  # blocks here, serving requests with MyRequestHandler
80f8e3804a 2010-08-20  510: 
80f8e3804a 2010-08-20  511: else:
80f8e3804a 2010-08-20  512: 	while True:
80f8e3804a 2010-08-20  513: 		unchecked_files = set()
80f8e3804a 2010-08-20  514: 		checked_files = 0
80f8e3804a 2010-08-20  515: 
80f8e3804a 2010-08-20  516: 		# reading log and storing found urls for processing
80f8e3804a 2010-08-20  517: 		# check file mtime XXX
80f8e3804a 2010-08-20  518: 		with open(options.log, 'r') as log_file:
80f8e3804a 2010-08-20  519: 			log_line = re.compile('^[^ ]+ - - \[.*] "(GET|HEAD) (.*?)(\?.*)? HTTP/1.1" (\d+) \d+ "(.*)" "(.*)"$')
80f8e3804a 2010-08-20  520: 			for line in log_file:
80f8e3804a 2010-08-20  521: 				this_line = log_line.match(line.strip())
80f8e3804a 2010-08-20  522: 				if this_line:
80f8e3804a 2010-08-20  523: 					unchecked_files.add(this_line.group(2))
80f8e3804a 2010-08-20  524: 
80f8e3804a 2010-08-20  525: 		for url in unchecked_files:
80f8e3804a 2010-08-20  526: 			reload = False
80f8e3804a 2010-08-20  527: 			recheck = False
80f8e3804a 2010-08-20  528: 			info = 'Checking file: ' + url
80f8e3804a 2010-08-20  529: 
80f8e3804a 2010-08-20  530: 			# creating empty placeholder in index
80f8e3804a 2010-08-20  531: 			if not url in index:
80f8e3804a 2010-08-20  532: 				info += '\nThis one is new.'
80f8e3804a 2010-08-20  533: 				index[url] = {}
80f8e3804a 2010-08-20  534: 				reload = True
80f8e3804a 2010-08-20  535: 
80f8e3804a 2010-08-20  536: 			# creating file name from url
80f8e3804a 2010-08-20  537: 			file_name = options.dir + re.compile('%20').sub(' ', url)
80f8e3804a 2010-08-20  538: 
80f8e3804a 2010-08-20  539: 			# forcibly checking file if no file present
80f8e3804a 2010-08-20  540: 			if not reload and not os.access(file_name, os.R_OK):
80f8e3804a 2010-08-20  541: 				info += '\nFile not found or inaccessible.'
80f8e3804a 2010-08-20  542: 				reload = True
80f8e3804a 2010-08-20  543: 
80f8e3804a 2010-08-20  544: 			# forcibly checking file if file size doesn't match with index data
80f8e3804a 2010-08-20  545: 			elif not reload and 'Content-Length' in index[url] and os.stat(file_name).st_size != int(index[url]['Content-Length']):
80f8e3804a 2010-08-20  546: 				info += '\nFile size is ' + os.stat(file_name).st_size + ' and stored file size is ' + index[url]['Content-Length'] + '.'
80f8e3804a 2010-08-20  547: 				reload = True
80f8e3804a 2010-08-20  548: 
80f8e3804a 2010-08-20  549: 			# forcibly checking file if index hods Pragma header
80f8e3804a 2010-08-20  550: 			if not reload and 'Pragma' in index[url] and index[url]['Pragma'] == 'no-cache':
80f8e3804a 2010-08-20  551: 				info +='\nPragma on: recheck imminent.'
80f8e3804a 2010-08-20  552: 				recheck = True
80f8e3804a 2010-08-20  553: 
80f8e3804a 2010-08-20  554: 			# skipping file processing if there's no need to recheck it and we have checked it at least 4 hours ago
80f8e3804a 2010-08-20  555: 			if not recheck and not reload and (options.noupdate or ('_time' in index[url] and (datetime.datetime.now() - datetime.timedelta(hours = 4) - index[url]['_time']).days < 0)):
80f8e3804a 2010-08-20  556: 				if options.verbose:
80f8e3804a 2010-08-20  557: 					print(info)
80f8e3804a 2010-08-20  558: 				continue
80f8e3804a 2010-08-20  559: 			else:
80f8e3804a 2010-08-20  560: 				print(info)
80f8e3804a 2010-08-20  561: 
80f8e3804a 2010-08-20  562: 			try:
80f8e3804a 2010-08-20  563: 				with urllib.request.urlopen(options.root + url) as source:
80f8e3804a 2010-08-20  564: 					new_headers = {}
80f8e3804a 2010-08-20  565: 					headers = source.info()
80f8e3804a 2010-08-20  566: 
80f8e3804a 2010-08-20  567: 					# stripping unneeded headers (XXX make this inplace?)
80f8e3804a 2010-08-20  568: 					for header in headers:
80f8e3804a 2010-08-20  569: 						if header in desc_fields:
80f8e3804a 2010-08-20  570: 							if header == 'Pragma' and headers[header] != 'no-cache':
80f8e3804a 2010-08-20  571: 								print('Pragma:', headers[header])
80f8e3804a 2010-08-20  572: 							new_headers[header] = headers[header]
80f8e3804a 2010-08-20  573: 						elif not header in ignore_fields:
80f8e3804a 2010-08-20  574: 							print('Undefined header "', header, '": ', headers[header], sep='')
80f8e3804a 2010-08-20  575: 
80f8e3804a 2010-08-20  576: 					# comparing headers with data found in index
80f8e3804a 2010-08-20  577: 					# if any header has changed (except Pragma) file is fully downloaded
80f8e3804a 2010-08-20  578: 					# same if we get more or less headers
80f8e3804a 2010-08-20  579: 					old_keys = set(index[url].keys())
80f8e3804a 2010-08-20  580: 					old_keys.discard('_time')
80f8e3804a 2010-08-20  581: 					old_keys.discard('Pragma')
80f8e3804a 2010-08-20  582: 					more_keys = set(new_headers.keys()) - old_keys
80f8e3804a 2010-08-20  583: 					more_keys.discard('Pragma')
80f8e3804a 2010-08-20  584: 					less_keys = old_keys - set(new_headers.keys())
80f8e3804a 2010-08-20  585: 					if len(more_keys) > 0:
80f8e3804a 2010-08-20  586: 						if not len(old_keys) == 0:
80f8e3804a 2010-08-20  587: 							print('More headers appear:', more_keys)
80f8e3804a 2010-08-20  588: 						reload = True
80f8e3804a 2010-08-20  589: 					elif len(less_keys) > 0:
80f8e3804a 2010-08-20  590: 						print('Less headers appear:', less_keys)
80f8e3804a 2010-08-20  591: 					else:
80f8e3804a 2010-08-20  592: 						for key in index[url].keys():
80f8e3804a 2010-08-20  593: 							if key[0] != '_' and key != 'Pragma' and not index[url][key] == new_headers[key]:
80f8e3804a 2010-08-20  594: 								print('Header "', key, '" changed from [', index[url][key], '] to [', new_headers[key], ']', sep='')
80f8e3804a 2010-08-20  595: 								reload = True
80f8e3804a 2010-08-20  596: 
80f8e3804a 2010-08-20  597: 					# downloading file
80f8e3804a 2010-08-20  598: 					if reload:
80f8e3804a 2010-08-20  599: 						if 'Content-Length' in headers:
80f8e3804a 2010-08-20  600: 							print('Downloading', headers['Content-Length'], 'bytes [', end='')
80f8e3804a 2010-08-20  601: 						else:
80f8e3804a 2010-08-20  602: 							print('Downloading [', end='')
80f8e3804a 2010-08-20  603: 						sys.stdout.flush()
80f8e3804a 2010-08-20  604: 
80f8e3804a 2010-08-20  605: 						# file is created at temporary location and moved in place only when download completes
80f8e3804a 2010-08-20  606: 						temp_file = open(options.dir + os.sep + '.tmp', 'wb')
80f8e3804a 2010-08-20  607: 						buffer = source.read(block_size)
80f8e3804a 2010-08-20  608: 						megablocks = 0
80f8e3804a 2010-08-20  609: 						blocks = 0
80f8e3804a 2010-08-20  610: 						megs = 0
80f8e3804a 2010-08-20  611: 						while len(buffer) > 0:
80f8e3804a 2010-08-20  612: 							temp_file.write(buffer)
80f8e3804a 2010-08-20  613: 							buffer = source.read(block_size)
80f8e3804a 2010-08-20  614: 							blocks += 1
80f8e3804a 2010-08-20  615: 							if blocks > 102400/block_size:
80f8e3804a 2010-08-20  616: 								megablocks += 1
80f8e3804a 2010-08-20  617: 								if megablocks > 10:
80f8e3804a 2010-08-20  618: 									megablocks = megablocks - 10
80f8e3804a 2010-08-20  619: 									megs += 1
80f8e3804a 2010-08-20  620: 									print('{}Mb'.format(megs), end='')
80f8e3804a 2010-08-20  621: 								else:
80f8e3804a 2010-08-20  622: 									print('.', end='')
80f8e3804a 2010-08-20  623: 								blocks = blocks - 102400/block_size
80f8e3804a 2010-08-20  624: 							sys.stdout.flush()
80f8e3804a 2010-08-20  625: 						temp_file.close()
80f8e3804a 2010-08-20  626: 						print(']')
80f8e3804a 2010-08-20  627: 						os.renames(options.dir + os.sep + '.tmp', file_name)
80f8e3804a 2010-08-20  628: 
80f8e3804a 2010-08-20  629: 						checked_files += 1
80f8e3804a 2010-08-20  630: 
80f8e3804a 2010-08-20  631: 					# storing new time mark and storing new headers
80f8e3804a 2010-08-20  632: 					new_headers['_time'] = datetime.datetime.now()
80f8e3804a 2010-08-20  633: 					index[url] = new_headers
80f8e3804a 2010-08-20  634: 					index.sync()
80f8e3804a 2010-08-20  635: 
80f8e3804a 2010-08-20  636: 			except urllib.error.HTTPError as error:
80f8e3804a 2010-08-20  637: 				# in case of error we don't need to do anything actually,
80f8e3804a 2010-08-20  638: 				# if file download stalls or fails the file would not be moved to it's location
80f8e3804a 2010-08-20  639: 				print(error)
80f8e3804a 2010-08-20  640: 
80f8e3804a 2010-08-20  641: 		if options.verbose:
80f8e3804a 2010-08-20  642: 			print('[', len(unchecked_files), '/', checked_files, ']')
80f8e3804a 2010-08-20  643: 
80f8e3804a 2010-08-20  644: 		# checking if there were any files downloaded, if yes - restarting sequence
80f8e3804a 2010-08-20  645: 		if checked_files == 0:
80f8e3804a 2010-08-20  646: 			break