Index: samesite.py ================================================================== --- samesite.py +++ samesite.py @@ -1,8 +1,10 @@ -#!/usr/bin/env python3.2 +#!/usr/bin/env python + +from __future__ import unicode_literals, print_function -import datetime, http.cookiejar, os, sys, shelve, spacemap, re, urllib.request +import bsddb.dbshelve, copy, datetime, os, BaseHTTPServer, sys, spacemap, re, urllib2 class Config: __slots__ = frozenset(['_config', '_default', '_section', 'options', 'root']) _default = { 'general': { @@ -16,11 +18,11 @@ 'sub': '', },} # function to read in config file def __init__(self): - import configparser, optparse + import ConfigParser, optparse parser = optparse.OptionParser() parser.add_option('-c', '--config', dest = 'config', help = 'config file location', metavar = 'FILE', default = 'samesite.conf') (self.options, args) = parser.parse_args() @@ -30,11 +32,11 @@ if configDir: self.root = configDir.group(1) else: self.root = os.getcwd() - self._config = configparser.ConfigParser() + self._config = ConfigParser.ConfigParser() self._config.readfp(open(self.options.config)) for section in self._config.sections(): if section != 'general': if self._config.has_option(section, 'dir'): @@ -74,27 +76,25 @@ config = Config() #assert options.port or os.access(options.log, os.R_OK), 'Log file unreadable' -const_desc_fields = set(['Content-Length', 'Last-Modified', 'Pragma']) +const_desc_fields = set(['content-length', 'last-modified', 'pragma']) const_ignore_fields = set([ - 'Accept-Ranges', 'Age', - 'Cache-Control', 'Connection', 'Content-Type', - 'Date', - 'Expires', - 'Referer', - 'Server', - 'Via', - 'X-Cache', 'X-Cache-Lookup', 'X-Powered-By', + 'accept-ranges', 'age', + 'cache-control', 'connection', 'content-type', + 'date', + 'expires', + 'referer', + 'server', + 'via', + 'x-cache', 'x-cache-lookup', 'x-livetool', 'x-powered-by', ]) block_size = 4096 -import http.server - -class MyRequestHandler(http.server.BaseHTTPRequestHandler): +class MyRequestHandler(BaseHTTPServer.BaseHTTPRequestHandler): def __process(self): # reload means file needs to be reloaded to serve request reload = False # recheck means file needs to be checked, this also means that if file hav been modified we can serve older copy recheck = False @@ -109,11 +109,11 @@ if myPath: my_path = myPath.group(1) else: my_path = self.path - config.section(self.headers['Host']) + config.section(self.headers['host']) if config['sub'] != None and config['strip'] != None and len(config['strip']) > 0: string = re.compile(config['strip']).sub(config['sub'], my_path) my_path = string @@ -123,42 +123,42 @@ os.mkdir(config['dir']) # this is file index - everything is stored in this file # _parts - list of stored parts of file # _time - last time the file was checked # everything else is just the headers - index = shelve.open(config['dir'] + os.sep + '.index') + index = bsddb.dbshelve.open(config['dir'] + os.sep + '.index') desc_fields = const_desc_fields.copy() ignore_fields = const_ignore_fields.copy() if config['noetag'] == 'no': - desc_fields.add('ETag') + desc_fields.add('etag') else: - ignore_fields.add('ETag') + ignore_fields.add('etag') proxy_ignored = set([ - 'Accept', 'Accept-Charset', 'Accept-Encoding', 'Accept-Language', - 'Cache-Control', 'Connection', 'Content-Length', 'Cookie', - 'Host', - 'If-Modified-Since', 'If-Unmodified-Since', - 'Referer', - 'User-Agent', - 'Via', - 'X-Forwarded-For', 'X-Last-HR', 'X-Last-HTTP-Status-Code', 'X-REMOVED', 'X-Real-IP', 'X-Retry-Count', + 'accept', 'accept-charset', 'accept-encoding', 'accept-language', + 'cache-control', 'connection', 'content-length', 'cookie', + 'host', + 'if-modified-since', 'if-unmodified-since', + 'referer', + 'user-agent', + 'via', + 'x-forwarded-for', 'x-last-hr', 'x-last-http-status-code', 'x-removed', 'x-real-ip', 'x-retry-count', ]) print('===============[ {} request ]==='.format(self.command)) for header in self.headers: if header in proxy_ignored: pass - elif header in ('Range'): + elif header in ('range'): isRange = re.compile('bytes=(\d+)-(\d+)').match(self.headers[header]) if isRange: requested_ranges = spacemap.SpaceMap({int(isRange.group(1)): int(isRange.group(2)) + 1}) else: return() - elif header in ('Pragma'): + elif header in ('pragma'): if my_path in index: index[my_path][header] = self.headers[header] else: print('Unknown header - ', header, ': ', self.headers[header], sep='') return() @@ -199,17 +199,17 @@ recheck = True # forcibly checking file if file size doesn't match with index data if not reload: if '_parts' in record and record['_parts'] == spacemap.SpaceMap(): - if 'Content-Length' in record and file_stat and file_stat.st_size != int(record['Content-Length']): - info += '\nFile size is {} and stored file size is {}.'.format(file_stat.st_size, record['Content-Length']) + if 'content-length' in record and file_stat and file_stat.st_size != int(record['content-length']): + info += '\nFile size is {} and stored file size is {}.'.format(file_stat.st_size, record['content-length']) record['_parts'] = None reload = True # forcibly checking file if index holds Pragma header - if not reload and 'Pragma' in record and record['Pragma'] == 'no-cache': + if not reload and 'pragma' in record and record['pragma'] == 'no-cache': info +='\nPragma on: recheck imminent.' recheck = True # skipping file processing if there's no need to recheck it and we have checked it at least 4 hours ago if not recheck and not reload and '_time' in record and (record['_time'] - datetime.datetime.now() + datetime.timedelta(hours = 4)).days < 0: @@ -220,11 +220,11 @@ if reload or recheck: try: request = 'http://' + config['root'] + self.path my_headers = {} - for header in ('Cache-Control', 'Cookie', 'Referer', 'User-Agent'): + for header in ('cache-control', 'cookie', 'referer', 'user-agent'): if header in self.headers: my_headers[header] = self.headers[header] needed = None if '_parts' in record and record['_parts'] != None: @@ -241,127 +241,128 @@ while True: range = needed.pop() if range[0] == None: break ranges += '{}-{}'.format(range[0], range[1] - 1), - my_headers['Range'] = 'bytes=' + ','.join(ranges) - - request = urllib.request.Request(request, headers = my_headers) - - with urllib.request.urlopen(request) as source: - new_record = {} - new_record['_parts'] = record['_parts'] - headers = source.info() - - # stripping unneeded headers (XXX make this inplace?) - for header in headers: - if header in desc_fields: - #if header == 'Pragma' and headers[header] != 'no-cache': - if header == 'Content-Length': - if 'Content-Range' not in headers: - new_record[header] = int(headers[header]) - else: - new_record[header] = headers[header] - elif header == 'Content-Range': - range = re.compile('^bytes (\d+)-(\d+)/(\d+)$').match(headers[header]) - if range: - new_record['Content-Length'] = int(range.group(3)) - else: - assert False, 'Content-Range unrecognized.' - elif not header in ignore_fields: - print('Undefined header "', header, '": ', headers[header], sep='') - - # comparing headers with data found in index - # if any header has changed (except Pragma) file is fully downloaded - # same if we get more or less headers - old_keys = set(record.keys()) - old_keys.discard('_time') - old_keys.discard('Pragma') - more_keys = set(new_record.keys()) - old_keys - more_keys.discard('Pragma') - less_keys = old_keys - set(new_record.keys()) - if len(more_keys) > 0: - if not len(old_keys) == 0: - print('More headers appear:', more_keys) - reload = True - elif len(less_keys) > 0: - print('Less headers appear:', less_keys) - else: - for key in record.keys(): - if key[0] != '_' and key != 'Pragma' and not record[key] == new_record[key]: - print('Header "', key, '" changed from [', record[key], '] to [', new_record[key], ']', sep='') - print(type(record[key]), type(new_record[key])) - reload = True - - if reload: - print('Reloading.') - if os.access(temp_name, os.R_OK): - os.unlink(temp_name) - if os.access(file_name, os.R_OK): - os.unlink(file_name) - if 'Content-Length' in new_record: - new_record['_parts'] = spacemap.SpaceMap({0: int(new_record['Content-Length'])}) - else: - new_record['_parts'] = spacemap.SpaceMap() - print(new_record) - - # downloading file or segment - if 'Content-Length' in new_record: - if needed == None: - needed = new_record['_parts'] - else: - if len(needed) > 1: - print("Multipart requests currently not supported.") - assert False, 'Skip this one for now.' - #else: - #assert False, 'No Content-Length or Content-Range header.' - - new_record['_time'] = datetime.datetime.now() - if self.command not in ('HEAD'): - # file is created at temporary location and moved in place only when download completes - if not os.access(temp_name, os.R_OK): - empty_name = config['dir'] + os.sep + '.tmp' - with open(empty_name, 'w+b') as some_file: - pass - os.renames(empty_name, temp_name) - temp_file = open(temp_name, 'r+b') - if requested_ranges == None and needed == None: - needed = new_record['_parts'] - needed.rewind() - while True: - (start, end) = needed.pop() - if start == None: - break - stream_last = start - old_record = new_record - if end - start < block_size: - req_block_size = end - start - else: - req_block_size = block_size - buffer = source.read(req_block_size) - length = len(buffer) - while length > 0 and stream_last < end: - stream_pos = stream_last + length - assert not stream_pos > end, 'Received more data then requested: pos:{} start:{} end:{}.'.format(stream_pos, start, end) - temp_file.seek(stream_last) - temp_file.write(buffer) - new_record['_parts'] = new_record['_parts'] - spacemap.SpaceMap({stream_last: stream_pos}) - index[my_path] = old_record - index.sync() - old_record = new_record - stream_last = stream_pos - if end - stream_last < block_size: - req_block_size = end - stream_last - buffer = source.read(req_block_size) - length = len(buffer) - # moving downloaded data to real file - temp_file.close() - - index[my_path] = new_record - index.sync() - - except urllib.error.HTTPError as error: + my_headers['range'] = 'bytes=' + ','.join(ranges) + + request = urllib2.Request(request, headers = my_headers) + + source = urllib2.urlopen(request) + new_record = {} + new_record['_parts'] = record['_parts'] + headers = source.info() + + # stripping unneeded headers (XXX make this inplace?) + for header in headers: + if header in desc_fields: + #if header == 'Pragma' and headers[header] != 'no-cache': + if header == 'content-length': + if 'content-range' not in headers: + new_record[header] = int(headers[header]) + else: + new_record[header] = headers[header] + elif header == 'content-range': + range = re.compile('^bytes (\d+)-(\d+)/(\d+)$').match(headers[header]) + if range: + new_record['content-length'] = int(range.group(3)) + else: + assert False, 'Content-Range unrecognized.' + elif not header in ignore_fields: + print('Undefined header "', header, '": ', headers[header], sep='') + + # comparing headers with data found in index + # if any header has changed (except Pragma) file is fully downloaded + # same if we get more or less headers + old_keys = set(record.keys()) + old_keys.discard('_time') + old_keys.discard('pragma') + more_keys = set(new_record.keys()) - old_keys + more_keys.discard('pragma') + less_keys = old_keys - set(new_record.keys()) + if len(more_keys) > 0: + if len(old_keys) != 0: + print('More headers appear:', more_keys) + reload = True + elif len(less_keys) > 0: + print('Less headers appear:', less_keys) + else: + for key in record.keys(): + if key[0] != '_' and key != 'pragma' and record[key] != new_record[key]: + print('Header "', key, '" changed from [', record[key], '] to [', new_record[key], ']', sep='') + print(type(record[key]), type(new_record[key])) + reload = True + + if reload: + print('Reloading.') + if os.access(temp_name, os.R_OK): + os.unlink(temp_name) + if os.access(file_name, os.R_OK): + os.unlink(file_name) + if 'content-length' in new_record: + new_record['_parts'] = spacemap.SpaceMap({0: int(new_record['content-length'])}) + if not new_record['_parts']: + new_record['_parts'] = spacemap.SpaceMap() + print(new_record) + + # downloading file or segment + if 'content-length' in new_record: + if needed == None: + needed = new_record['_parts'] + else: + if len(needed) > 1: + print("Multipart requests currently not supported.") + assert False, 'Skip this one for now.' + #else: + #assert False, 'No content-length or Content-Range header.' + + new_record['_time'] = datetime.datetime.now() + if self.command not in ('HEAD'): + # file is created at temporary location and moved in place only when download completes + if not os.access(temp_name, os.R_OK): + empty_name = config['dir'] + os.sep + '.tmp' + with open(empty_name, 'w+b') as some_file: + pass + os.renames(empty_name, temp_name) + temp_file = open(temp_name, 'r+b') + if requested_ranges == None and needed == None: + needed = new_record['_parts'] + needed.rewind() + while True: + (start, end) = needed.pop() + if start == None: + break + stream_last = start + old_record = copy.copy(new_record) + if end - start < block_size: + req_block_size = end - start + else: + req_block_size = block_size + buffer = source.read(req_block_size) + length = len(buffer) + while length > 0 and stream_last < end: + stream_pos = stream_last + length + assert stream_pos <= end, 'Received more data then requested: pos:{} start:{} end:{}.'.format(stream_pos, start, end) + temp_file.seek(stream_last) + temp_file.write(buffer) + x = new_record['_parts'] - spacemap.SpaceMap({stream_last: stream_pos}) + new_record['_parts'] = new_record['_parts'] - spacemap.SpaceMap({stream_last: stream_pos}) + index[my_path] = old_record + index.sync() + old_record = copy.copy(new_record) + stream_last = stream_pos + if end - stream_last < block_size: + req_block_size = end - stream_last + buffer = source.read(req_block_size) + length = len(buffer) + # moving downloaded data to real file + temp_file.close() + + index[my_path] = new_record + index.sync() + + except urllib2.HTTPError as error: # in case of error we don't need to do anything actually, # if file download stalls or fails the file would not be moved to it's location print(error) print(index[my_path]) @@ -377,50 +378,50 @@ self.end_headers() return if self.command == 'HEAD': self.send_response(200) - if 'Content-Length' in index[my_path]: - self.send_header('Content-Length', index[my_path]['Content-Length']) - self.send_header('Accept-Ranges', 'bytes') - self.send_header('Content-Type', 'application/octet-stream') - if 'Last-Modified' in index[my_path]: - self.send_header('Last-Modified', index[my_path]['Last-Modified']) + if 'content-length' in index[my_path]: + self.send_header('content-length', index[my_path]['content-length']) + self.send_header('accept-ranges', 'bytes') + self.send_header('content-type', 'application/octet-stream') + if 'last-modified' in index[my_path]: + self.send_header('last-modified', index[my_path]['last-modified']) self.end_headers() else: if ('_parts' in index[my_path] and index[my_path]['_parts'] != spacemap.SpaceMap()) or not os.access(file_name, os.R_OK): file_name = temp_name with open(file_name, 'rb') as real_file: file_stat = os.stat(file_name) - if 'Range' in self.headers: + if 'range' in self.headers: self.send_response(206) ranges = () requested_ranges.rewind() while True: pair = requested_ranges.pop() if pair[0] == None: break ranges += '{}-{}'.format(pair[0], str(pair[1] - 1)), - self.send_header('Content-Range', 'bytes {}/{}'.format(','.join(ranges), index[my_path]['Content-Length'])) + self.send_header('content-range', 'bytes {}/{}'.format(','.join(ranges), index[my_path]['content-length'])) else: self.send_response(200) - self.send_header('Content-Length', str(file_stat.st_size)) + self.send_header('content-length', str(file_stat.st_size)) requested_ranges = spacemap.SpaceMap({0: file_stat.st_size}) - if 'Last-Modified' in index[my_path]: - self.send_header('Last-Modified', index[my_path]['Last-Modified']) - self.send_header('Content-Type', 'application/octet-stream') + if 'last-modified' in index[my_path]: + self.send_header('last-modified', index[my_path]['last-modified']) + self.send_header('content-type', 'application/octet-stream') self.end_headers() if self.command in ('GET'): if len(requested_ranges) > 0: requested_ranges.rewind() (start, end) = requested_ranges.pop() else: start = 0 # XXX ugly hack - if 'Content-Length' in index[my_path]: - end = index[my_path]['Content-Length'] + if 'content-length' in index[my_path]: + end = index[my_path]['content-length'] else: end = 0 real_file.seek(start) if block_size > end - start: req_block_size = end - start @@ -442,7 +443,7 @@ return self.__process() def do_GET(self): return self.__process() config.section('general') -server = http.server.HTTPServer(('127.0.0.1', int(config['port'])), MyRequestHandler) +server = BaseHTTPServer.HTTPServer(('127.0.0.1', int(config['port'])), MyRequestHandler) server.serve_forever()