Samesite - proxy that can cache partial transfers

File samesite.py artifact f557383e69 part of check-in cab908195f


#!/usr/bin/env python3.1
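'''samesite - a small caching HTTP proxy that can cache partial transfers.

Files are mirrored under per-site directories; the byte ranges still missing
from each file are tracked in a shelve index, so interrupted and ranged
downloads can be resumed later.
'''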

import datetime, http.cookiejar, os, sys, shelve, spacemap, re, urllib.request
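# spacemap appears to be this project's own module (not stdlib); it tracks the
# byte ranges of a file that are still missing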

class Config:
	__slots__ = frozenset(['_config', '_default', '_section', 'options', 'root'])
	_default = {
		'general': {
			'port': '8008',
		},
		'_other': {
			'verbose': 'no',
			'noetag': 'no',
			'noparts': 'no',
			'strip': '',
			'sub': '',
		},
	}

	# read the config file and normalise per-section 'dir' and 'root' options
	def __init__(self):
		import configparser, optparse

		parser = optparse.OptionParser()
		parser.add_option('-c', '--config', dest = 'config', help = 'config file location', metavar = 'FILE', default = 'samesite.conf')
		(self.options, args) = parser.parse_args()

		assert os.access(self.options.config, os.R_OK), "Fatal error: can't read {}".format(self.options.config)

		configDir = os.path.dirname(self.options.config)
		if configDir:
			self.root = configDir
		else:
			self.root = os.getcwd()

		self._config = configparser.ConfigParser()
		with open(self.options.config) as config_file:
			self._config.readfp(config_file)

		for section in self._config.sections():
			if section != 'general':
				if self._config.has_option(section, 'dir'):
					# a bare '/' means "use the default directory for this section"
					if re.compile('^/$').match(self._config.get(section, 'dir')):
						self._config.set(section, 'dir', self.root + os.sep + section)
					# strip a trailing slash
					thisDir = re.compile('^(.*)/$').match(self._config.get(section, 'dir'))
					if thisDir:
						self._config.set(section, 'dir', thisDir.group(1))
					# anchor relative directories at the config file location
					if not re.compile('^/(.*)$').match(self._config.get(section, 'dir')):
						self._config.set(section, 'dir', self.root + os.sep + self._config.get(section, 'dir'))
				else:
					self._config.set(section, 'dir', self.root + os.sep + section)

				if not self._config.has_option(section, 'root'):
					self._config.set(section, 'root', section)

	# function to select config file section or create one
	def section(self, section):
		if not self._config.has_section(section):
			self._config.add_section(section)
		self._section = section

	# function to get a config parameter; if the parameter doesn't exist, the
	# default value or None is substituted
	def __getitem__(self, name):
		if not self._config.has_option(self._section, name):
			if self._section in self._default:
				if name in self._default[self._section]:
					self._config.set(self._section, name, self._default[self._section][name])
				else:
					self._config.set(self._section, name, None)
			elif name in self._default['_other']:
				self._config.set(self._section, name, self._default['_other'][name])
			else:
				self._config.set(self._section, name, None)
		return(self._config.get(self._section, name))

config = Config()
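
# An illustrative samesite.conf (hypothetical host and paths); sections other
# than 'general' are matched against the Host header of incoming requests:
#
#	[general]
#	port = 8008
#
#	[mirror.example.org]
#	root = mirror.example.org
#	dir = /var/cache/samesite/mirror.example.org
#	noetag = no
#	noparts = no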

#assert options.port or os.access(options.log, os.R_OK), 'Log file unreadable'

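# desc fields describe the cached entity and are stored in the index;
# ignore fields may appear in upstream responses but are deliberately dropped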
const_desc_fields = set(['Content-Length', 'Last-Modified', 'Pragma'])
const_ignore_fields = set([
	'Accept-Ranges', 'Age',
	'Cache-Control', 'Connection', 'Content-Type',
	'Date',
	'Expires',
	'Server',
	'Via',
	'X-Cache', 'X-Cache-Lookup', 'X-Powered-By'
])

block_size = 4096

'''
# later, kqueue would be good but later
class Connection:
	__slots__ = frozenset(('__address', '__input', '__socket', '__status', 'error', 'method', 'url', 'http_version'))

	def __init__(self, socket, address):
		self.__address = address
		self.__input = b''
		self.__socket = socket
		self.__status = 0

	def read(self, kev):
		buffer = self.__socket.recv(kev.data)
		exhausted = False
		if len(buffer) == 0:
			eof = True
		else:
			self.__input += buffer
			while not exhausted:
				if self.__status == -1:
					exhausted = True
				elif self.__status == 0:
					endstring = self.__input.find(b'\n')
					if endstring > 0:
						print('Processing request line.')
						line = self.__input[:endstring].decode('ascii')
						self.__input = self.__input[endstring + 1:]
						isRequest = re.compile('(GET) ([^ ]+) HTTP/(1\.0)').match(line)
						if not isRequest:
							self.error = 'Not an HTTP connection.'
							self.__status = -1
						else:
							self.method = isRequest.group(1)
							self.url = isRequest.group(2)
							self.http_version = isRequest.group(3)
							self.__status = 1
					else:
						exhausted = True
				elif self.__status == 1:
					endstring = self.__input.find(b'\n')
					if endstring > 0:
						print('Processing header line.' + repr(self.__input))
						line = self.__input[:endstring].decode('ascii')
						self.__input = self.__input[endstring + 1:]
						isHeader = re.compile('([^:]*): +(.*)').match(line)
						if not isHeader:
							self.error = 'Bad header.'
							return(False)
						# process header here
					elif endstring == 0:
						self.__status = 2
					else:
						exhausted = True

	def write(self, kev):
		pass

if options.port:
	import select, socket

	sock = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
	try:
		sock.bind(('127.0.0.1', int(options.port)))
		sock.listen(-1)

		kq = select.kqueue()
		assert kq.fileno() != -1, "Fatal error: can't initialise kqueue."

		kq.control([select.kevent(sock, select.KQ_FILTER_READ, select.KQ_EV_ADD)], 0)
		timeout = None

		connections = {sock.fileno(): None}

		while True:
			kevs = kq.control(None, 1, timeout)

			for kev in kevs:
				if type(connections[kev.ident]) == Connection:
					print(kev.ident, kev.data, kev.filter, kev.flags)
					assert kev.data != 0, 'No data available.'
					if kev.filter == select.KQ_FILTER_READ:
						connections[kev.ident].read(kev)
					elif kev.filter == select.KQ_FILTER_WRITE:
						connections[kev.ident].write(kev)
					else:
						assert kev.filter in (select.KQ_FILTER_READ, select.KQ_FILTER_WRITE), 'Do we support other filters?'
				else:
					(conn, addr) = sock.accept()
					print('Connection from ' + repr(addr))
					kq.control([select.kevent(conn, select.KQ_FILTER_READ, select.KQ_EV_ADD)], 0)
					connections[conn.fileno()] = Connection(conn, addr)

				if kev.flags >> 15 == 1:
					kq.control([select.kevent(kev.ident, select.KQ_FILTER_READ, select.KQ_EV_DELETE)], 0)
					kq.control([select.kevent(kev.ident, select.KQ_FILTER_WRITE, select.KQ_EV_DELETE)], 0)
					del(connections[kev.ident])
	finally:
		sock.close()
'''

# XXX how about rechecking files?
if True: # HTTP server mode; the unreachable else branch below preserves the older log-driven batch mode
	import http.server

	class MyRequestHandler(http.server.BaseHTTPRequestHandler):
		def __process(self):
			# reload means the file needs to be fetched again to serve the request
			reload = False
			# recheck means the file needs to be checked; if the file has been
			# modified we can still serve the older copy
			recheck = False
			# file_stat means the file definitely exists
			file_stat = None
			# requested_ranges holds data about any byte range requested
			requested_ranges = None
			# record holds this file's index data locally; it is written back upon successful completion
			record = None

			myPath = re.compile(r'^(.*?)(\?.*)$').match(self.path)
			if myPath:
				my_path = myPath.group(1)
			else:
				my_path = self.path

			config.section(self.headers['Host'])

			if config['sub'] != None and config['strip'] != None and len(config['strip']) > 0:
				my_path = re.compile(config['strip']).sub(config['sub'], my_path)

			info = 'Checking file: ' + my_path

			if not os.access(config['dir'], os.X_OK):
				os.mkdir(config['dir'])
			# this is file index - everything is stored in this file
			# _parts - list of stored parts of file
			# _time - last time the file was checked
			# everything else is just the headers
			index = shelve.open(config['dir'] + os.sep + '.index')
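			# a typical index record might look like (illustrative values):
			#	{'_parts': spacemap.SpaceMap({4096: 8192}), '_time': datetime.datetime(...),
			#	 'Content-Length': 8192, 'Last-Modified': 'Mon, 01 Jan 2024 00:00:00 GMT'}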

			desc_fields = const_desc_fields.copy()
			ignore_fields = const_ignore_fields.copy()
			# config values are strings ('yes'/'no'), so compare explicitly
			if config['noetag'] != 'yes':
				desc_fields.add('ETag')
			else:
				ignore_fields.add('ETag')

			proxy_ignored = set([
				'Accept', 'Accept-Charset', 'Accept-Encoding', 'Accept-Language',
				'Cache-Control', 'Connection', 'Content-Length', 'Cookie',
				'Host',
				'If-Modified-Since', 'If-Unmodified-Since',
				'Referer',
				'User-Agent',
				'Via',
				'X-Forwarded-For', 'X-REMOVED',
			])

			print('===============[ {} request ]==='.format(self.command))

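			# only whitelisted request headers are acted upon below; a single-range
			# request such as 'Range: bytes=0-499' becomes SpaceMap({0: 500}) (the
			# end offset is exclusive), and any unrecognized header aborts the request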
			for header in self.headers:
				if header in proxy_ignored:
					pass
				elif header == 'Range':
					isRange = re.compile(r'bytes=(\d+)-(\d+)').match(self.headers[header])
					if isRange:
						requested_ranges = spacemap.SpaceMap({int(isRange.group(1)): int(isRange.group(2)) + 1})
					else:
						return()
				elif header == 'Pragma':
					if my_path in index:
						# a shelve entry must be reassigned to persist the change
						stored = index[my_path]
						stored[header] = self.headers[header]
						index[my_path] = stored
				else:
					print('Unknown header - ', header, ': ', self.headers[header], sep='')
					return()
				print(header, self.headers[header])

			# creating file name from my_path
			file_name = config['dir'] + os.sep + re.compile('%20').sub(' ', my_path)
			# partial file or unfinished download
			temp_name = config['dir'] + os.sep + '.parts' + re.compile('%20').sub(' ', my_path)

			# creating an empty placeholder in the index
			# if there's no space map and no file in the real directory - we have no file
			# if there's an empty space map - the file is complete
			# the space map covers every bit of the file we don't currently possess
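			# for example (assuming SpaceMap maps the start of each missing range to
			# its end, as the code below does): a new 100-byte file starts as
			# SpaceMap({0: 100}); once bytes 0-49 are stored it shrinks to
			# SpaceMap({50: 100}), and an empty SpaceMap() means the file is complete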
			if not my_path in index:
				info += '\nThis one is new.'
				reload = True
				record = {}
			else:
				record = index[my_path]
				# forcibly checking the file if it is not present
				if os.access(file_name, os.R_OK):
					file_stat = os.stat(file_name)
				elif '_parts' in record and os.access(temp_name, os.R_OK):
					file_stat = os.stat(temp_name)
				else:
					info += '\nFile not found or inaccessible.'
					# reassigned into the shelve later; plain item mutation would not persist
					record['_parts'] = None
					reload = True

			if not '_parts' in record:
				record['_parts'] = None

			if record['_parts'] == None:
				recheck = True

			# forcibly checking file if file size doesn't match with index data
			if not reload:
				if '_parts' in record and record['_parts'] == spacemap.SpaceMap():
					if 'Content-Length' in record and file_stat and file_stat.st_size != int(record['Content-Length']):
						info += '\nFile size is {} and stored file size is {}.'.format(file_stat.st_size, record['Content-Length'])
						record['_parts'] = None
						reload = True

			# forcibly checking file if index holds Pragma header
			if not reload and 'Pragma' in record and record['Pragma'] == 'no-cache':
				info += '\nPragma on: recheck imminent.'
				recheck = True

			# recheck the file if it hasn't been checked within the last 4 hours
			if not recheck and not reload and '_time' in record and (datetime.datetime.now() - datetime.timedelta(hours = 4) - record['_time']).days >= 0:
				recheck = True

			print(info)
			if reload or recheck:

				try:
					request = 'http://' + config['root'] + self.path
					my_headers = {}
					for header in ('Cache-Control', 'Cookie', 'Referer', 'User-Agent'):
						if header in self.headers:
							my_headers[header] = self.headers[header]

					needed = None
					# XXX and if we specify full file we don't go partial?
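					# 'needed' ends up holding the ranges to fetch upstream: those still
					# missing locally plus those the client asked for; it is rendered as a
					# Range header such as 'bytes=0-4095,8192-12287' below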
					if requested_ranges != None:
						if '_parts' in record and record['_parts'] != None:
							# config values are strings ('yes'/'no'), so compare explicitly
							if config['noparts'] == 'yes':
								needed = record['_parts']
							else:
								needed = record['_parts'] | requested_ranges
						elif config['noparts'] != 'yes':
							needed = requested_ranges
						ranges = ()
						print('Missing ranges: {}, requested ranges: {}, needed ranges: {}.'.format(record['_parts'], requested_ranges, needed))
						if needed != None and len(needed) > 0:
							needed.rewind()
							while True:
								range = needed.pop()
								if range[0] == None:
									break
								ranges += '{}-{}'.format(range[0], range[1] - 1),
							my_headers['Range'] = 'bytes=' + ','.join(ranges)

					request = urllib.request.Request(request, headers = my_headers)

					with urllib.request.urlopen(request) as source:
						new_record = {}
						new_record['_parts'] = record['_parts']
						headers = source.info()

						# stripping unneeded headers (XXX make this inplace?)
						for header in headers:
							if header in desc_fields:
								#if header == 'Pragma' and headers[header] != 'no-cache':
								if header == 'Content-Length':
									if 'Content-Range' not in headers:
										new_record[header] = int(headers[header])
								else:
									new_record[header] = headers[header]
							elif header == 'Content-Range':
								# e.g. 'bytes 500-999/2000' gives a total length of 2000
								range = re.compile(r'^bytes (\d+)-(\d+)/(\d+)$').match(headers[header])
								if range:
									new_record['Content-Length'] = int(range.group(3))
								else:
									assert False, 'Content-Range unrecognized.'
							elif not header in ignore_fields:
								print('Undefined header "', header, '": ', headers[header], sep='')

						# comparing headers with data found in the index:
						# if any header except Pragma has changed, the file is downloaded in full again;
						# the same applies when headers appear or disappear
						old_keys = set(record.keys())
						old_keys.discard('_time')
						old_keys.discard('Pragma')
						more_keys = set(new_record.keys()) - old_keys
						more_keys.discard('Pragma')
						less_keys = old_keys - set(new_record.keys())
						if len(more_keys) > 0:
							if not len(old_keys) == 0:
								print('More headers appear:', more_keys)
							reload = True
						elif len(less_keys) > 0:
							print('Fewer headers appear:', less_keys)
						else:
							for key in record.keys():
								if key[0] != '_' and key != 'Pragma' and not record[key] == new_record[key]:
									print('Header "', key, '" changed from [', record[key], '] to [', new_record[key], ']', sep='')
									print(type(record[key]), type(new_record[key]))
									reload = True

						if reload:
							print('Reloading.')
							if os.access(temp_name, os.R_OK):
								os.unlink(temp_name)
							if os.access(file_name, os.R_OK):
								os.unlink(file_name)
							new_record['_parts'] = spacemap.SpaceMap({0: int(new_record['Content-Length'])})
						print(new_record)

						# downloading file or segment
						if 'Content-Length' in new_record:
							if needed == None:
								needed = new_record['_parts']
							else:
								if len(needed) > 1:
									print("Multipart requests currently not supported.")
									assert False, 'Skip this one for now.'
						else:
							assert False, 'No Content-Length or Content-Range header.'

						new_record['_time'] = datetime.datetime.now()
						if self.command != 'HEAD':
							# the file is created at a temporary location and moved
							# into place only when the download completes
							if not os.access(temp_name, os.R_OK):
								empty_name = config['dir'] + os.sep + '.tmp'
								with open(empty_name, 'w+b') as some_file:
									pass
								# os.renames also creates any missing intermediate directories
								os.renames(empty_name, temp_name)
							temp_file = open(temp_name, 'r+b')
							needed.rewind()
							while True:
								(start, end) = needed.pop()
								if start == None:
									break
								stream_last = start
								if end - start < block_size:
									req_block_size = end - start
								else:
									req_block_size = block_size
								buffer = source.read(req_block_size)
								length = len(buffer)
								while length > 0 and stream_last < end:
									stream_pos = stream_last + length
									assert not stream_pos > end, 'Received more data than requested: pos:{} start:{} end:{}.'.format(stream_pos, start, end)
									temp_file.seek(stream_last)
									temp_file.write(buffer)
									# mark the range as present only after it has been written out,
									# so a crash can never record data that isn't on disk
									new_record['_parts'] = new_record['_parts'] - spacemap.SpaceMap({stream_last: stream_pos})
									index[my_path] = new_record
									index.sync()
									stream_last = stream_pos
									if end - stream_last < block_size:
										req_block_size = end - stream_last
									buffer = source.read(req_block_size)
									length = len(buffer)
							temp_file.close()

						print(new_record)
						index[my_path] = new_record
						index.sync()

				except urllib.error.HTTPError as error:
					# in case of an error there is nothing to do: if the download
					# stalls or fails, the file is simply never moved into its final location
					print(error)

			if not os.access(file_name, os.R_OK) and os.access(temp_name, os.R_OK) and '_parts' in index[my_path] and index[my_path]['_parts'] == spacemap.SpaceMap():
				# just moving
				# drop old dirs XXX
				print('Moving temporary file to new destination.')
				os.renames(temp_name, file_name)

			if not my_path in index:
				self.send_response(502)
				self.end_headers()
				return

			if self.command == 'HEAD':
				self.send_response(200)
				if 'Content-Length' in index[my_path]:
					self.send_header('Content-Length', index[my_path]['Content-Length'])
				self.send_header('Accept-Ranges', 'bytes')
				self.send_header('Content-Type', 'application/octet-stream')
				if 'Last-Modified' in index[my_path]:
					self.send_header('Last-Modified', index[my_path]['Last-Modified'])
				self.end_headers()
			else:
				if ('_parts' in index[my_path] and index[my_path]['_parts'] != spacemap.SpaceMap()) or not os.access(file_name, os.R_OK):
					file_name = temp_name

				with open(file_name, 'rb') as real_file:
					file_stat = os.stat(file_name)
					if 'Range' in self.headers:
						self.send_response(206)
						ranges = ()
						requested_ranges.rewind()
						while True:
							pair = requested_ranges.pop()
							if pair[0] == None:
								break
							ranges += '{}-{}'.format(pair[0], pair[1] - 1),
						# only single ranges are parsed above, so the join yields one start-end pair
						self.send_header('Content-Range', 'bytes {}/{}'.format(','.join(ranges), index[my_path]['Content-Length']))
					else:
						self.send_response(200)
						self.send_header('Content-Length', str(file_stat.st_size))
						requested_ranges = spacemap.SpaceMap({0: file_stat.st_size})
					if 'Last-Modified' in index[my_path]:
						self.send_header('Last-Modified', index[my_path]['Last-Modified'])
					self.send_header('Content-Type', 'application/octet-stream')
					self.end_headers()
					if self.command == 'GET':
						if len(requested_ranges) > 0:
							requested_ranges.rewind()
							(start, end) = requested_ranges.pop()
						else:
							start = 0
							end = index[my_path]['Content-Length']
						real_file.seek(start)
						if block_size > end - start:
							req_block_size = end - start
						else:
							req_block_size = block_size
						buffer = real_file.read(req_block_size)
						length = len(buffer)
						while length > 0:
							self.wfile.write(buffer)
							start += len(buffer)
							if req_block_size > end - start:
								req_block_size = end - start
							if req_block_size == 0:
								break
							buffer = real_file.read(req_block_size)
							length = len(buffer)
					
		def do_HEAD(self):
			return self.__process()
		def do_GET(self):
			return self.__process()

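	# example invocation (hypothetical paths): ./samesite.py -c /etc/samesite.conf
	# then point HTTP clients at 127.0.0.1:<port>; the Host header of each request
	# selects the matching config section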
	config.section('general')
	server = http.server.HTTPServer(('127.0.0.1', int(config['port'])), MyRequestHandler)
	server.serve_forever()

else: # unreachable: legacy log-driven batch mode, kept for reference
	while True:
		unchecked_files = set()
		checked_files = 0

		# reading log and storing found urls for processing
		# check file mtime XXX
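		# the log is expected in combined format, e.g. (illustrative line):
		# 192.0.2.1 - - [01/Jan/2024:00:00:00 +0000] "GET /file HTTP/1.1" 200 1234 "-" "agent"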
		with open(options.log, 'r') as log_file:
			log_line = re.compile(r'^[^ ]+ - - \[.*] "(GET|HEAD) (.*?)(\?.*)? HTTP/1.1" (\d+) \d+ "(.*)" "(.*)"$')
			for line in log_file:
				this_line = log_line.match(line.strip())
				if this_line:
					unchecked_files.add(this_line.group(2))

		for url in unchecked_files:
			reload = False
			recheck = False
			info = 'Checking file: ' + url

			# creating empty placeholder in index
			if not url in index:
				info += '\nThis one is new.'
				index[url] = {}
				reload = True

			# creating file name from url
			file_name = options.dir + re.compile('%20').sub(' ', url)

			# forcibly checking file if no file present
			if not reload and not os.access(file_name, os.R_OK):
				info += '\nFile not found or inaccessible.'
				reload = True

			# forcibly checking file if file size doesn't match with index data
			elif not reload and 'Content-Length' in index[url] and os.stat(file_name).st_size != int(index[url]['Content-Length']):
				info += '\nFile size is {} and stored file size is {}.'.format(os.stat(file_name).st_size, index[url]['Content-Length'])
				reload = True

			# forcibly checking the file if the index holds a Pragma header
			if not reload and 'Pragma' in index[url] and index[url]['Pragma'] == 'no-cache':
				info += '\nPragma on: recheck imminent.'
				recheck = True

			# skip the file if no recheck is needed and it was checked within the last 4 hours
			if not recheck and not reload and (options.noupdate or ('_time' in index[url] and (datetime.datetime.now() - datetime.timedelta(hours = 4) - index[url]['_time']).days < 0)):
				if options.verbose:
					print(info)
				continue
			else:
				print(info)

			try:
				with urllib.request.urlopen(options.root + url) as source:
					new_headers = {}
					headers = source.info()

					# stripping unneeded headers (XXX make this inplace?)
					for header in headers:
						if header in desc_fields:
							if header == 'Pragma' and headers[header] != 'no-cache':
								print('Pragma:', headers[header])
							new_headers[header] = headers[header]
						elif not header in ignore_fields:
							print('Undefined header "', header, '": ', headers[header], sep='')

					# comparing headers with data found in the index:
					# if any header except Pragma has changed, the file is downloaded in full again;
					# the same applies when headers appear or disappear
					old_keys = set(index[url].keys())
					old_keys.discard('_time')
					old_keys.discard('Pragma')
					more_keys = set(new_headers.keys()) - old_keys
					more_keys.discard('Pragma')
					less_keys = old_keys - set(new_headers.keys())
					if len(more_keys) > 0:
						if not len(old_keys) == 0:
							print('More headers appear:', more_keys)
						reload = True
					elif len(less_keys) > 0:
						print('Fewer headers appear:', less_keys)
					else:
						for key in index[url].keys():
							if key[0] != '_' and key != 'Pragma' and not index[url][key] == new_headers[key]:
								print('Header "', key, '" changed from [', index[url][key], '] to [', new_headers[key], ']', sep='')
								reload = True

					# downloading file
					if reload:
						if 'Content-Length' in headers:
							print('Downloading', headers['Content-Length'], 'bytes [', end='')
						else:
							print('Downloading [', end='')
						sys.stdout.flush()

						# file is created at temporary location and moved in place only when download completes
						temp_file = open(options.dir + os.sep + '.tmp', 'wb')
						buffer = source.read(block_size)
						megablocks = 0
						blocks = 0
						megs = 0
						while len(buffer) > 0:
							temp_file.write(buffer)
							buffer = source.read(block_size)
							blocks += 1
							if blocks > 102400 // block_size:
								megablocks += 1
								if megablocks > 10:
									megablocks = megablocks - 10
									megs += 1
									print('{}Mb'.format(megs), end='')
								else:
									print('.', end='')
								blocks = blocks - 102400 // block_size
							sys.stdout.flush()
						temp_file.close()
						print(']')
						os.renames(options.dir + os.sep + '.tmp', file_name)

						checked_files += 1

					# storing new time mark and storing new headers
					new_headers['_time'] = datetime.datetime.now()
					index[url] = new_headers
					index.sync()

			except urllib.error.HTTPError as error:
				# in case of an error there is nothing to do: if the download
				# stalls or fails, the file is simply never moved into its final location
				print(error)

		if options.verbose:
			print('[', len(unchecked_files), '/', checked_files, ']')

		# checking if there were any files downloaded, if yes - restarting sequence
		if checked_files == 0:
			break