Index: samesite.py ================================================================== --- samesite.py +++ samesite.py @@ -1,47 +1,86 @@ #!/usr/bin/env python3.1 -import datetime, http.cookiejar, optparse, os, sys, shelve, re, urllib.request - -from spacemap import SpaceMap - -parser = optparse.OptionParser() -parser.add_option('-v', '--verbose', action = 'store_true', dest = 'verbose', help = 'turns on verbose status notifications', metavar = 'bool', default = False) -parser.add_option('-d', '--dir', action = 'store', dest = 'dir', help = 'specify directory where the files should be stored', metavar = 'string', default = None) -parser.add_option('-r', '--root', action = 'store', dest = 'root', help = 'specify a site from which data should be mirrored', metavar = 'string', default = None) -parser.add_option('-l', '--log', action = 'store', dest = 'log', help = 'specify a log file to process', metavar = 'string', default = None) -parser.add_option('-e', '--skip-etag', action = 'store_true', dest = 'noetag', help = 'do not process etags', metavar = 'bool', default = False) -parser.add_option('-p', '--port', action = 'store', dest = 'port', help = 'listen on this port for incoming connections', metavar = 'integer', default = None) -parser.add_option('-n', '--no-update', action = 'store_true', dest = 'noupdate', help = 'do not update already downloaded files', metavar = 'bool', default = 'False') -(options, args) = parser.parse_args() - -assert options.dir, 'Directory not specified' -assert options.root, 'Server not specified' -assert options.log or options.port, 'Log file or port not specified' -assert options.port or os.access(options.log, os.R_OK), 'Log file unreadable' - -optionsDirWithSep = re.compile('^(.*?)/?$').match(options.dir) -if optionsDirWithSep: - options.dir = optionsDirWithSep.group(1) - -# this is file index - everything is stored in this file -# _parts - list of stored parts of file -# _time - last time the file was checked -# everything else is just the headers -index = shelve.open(options.dir + os.sep + '.index') -desc_fields = ('Content-Length', 'Pragma', 'Last-Modified') -ignore_fields = ('Accept-Ranges', 'Age', 'Cache-Control', 'Connection', 'Content-Type', 'Date', 'Expires', 'Server', 'Via', 'X-Cache', 'X-Cache-Lookup', 'X-Powered-By') - -if not options.noetag: - desc_fields += 'ETag', -else: - ignore_fields += 'ETag', +import datetime, http.cookiejar, os, sys, shelve, spacemap, re, urllib.request + +class Config: + __slots__ = frozenset(['_config', '_default', '_section', 'options', 'root']) + _default = { + 'general': { + 'port': '8008', + }, + '_other': { + 'verbose': 'no', + 'noetag': 'no', + 'noparts': 'no', + },} + + # function to read in config file + def __init__(self): + import configparser, optparse + + parser = optparse.OptionParser() + parser.add_option('-c', '--config', dest = 'config', help = 'config file location', metavar = 'FILE', default = 'samesite.conf') + (self.options, args) = parser.parse_args() + + assert os.access(self.options.config, os.R_OK), "Fatal error: can't read {}".format(self.options.config) + + configDir = re.compile('^(.*)/[^/]+$').match(self.options.config) + if configDir: + self.root = configDir.group(1) + else: + self.root = os.getcwd() + + self._config = configparser.ConfigParser() + self._config.readfp(open(self.options.config)) + + for section in self._config.sections(): + if section != 'general': + if self._config.has_option(section, 'dir'): + if re.compile('^/$').match(self._config.get(section, 'dir')): + self._config.set(section, 'dir', self.root + os.sep + section) + thisDir = re.compile('^(.*)/$').match(self._config.get(section, 'dir')) + if thisDir: + self._config.set(section, 'dir', thisDir.group(1)) + if not re.compile('^/(.*)$').match(self._config.get(section, 'dir')): + self._config.set(section, 'dir', self.root + os.sep + self._config.get(section, 'dir')) + else: + self._config.set(section, 'dir', self.root + os.sep + section) + + if not self._config.has_option(section, 'root'): + self._config.set(section, 'root', section) + + # function to select config file section or create one + def section(self, section): + if not self._config.has_section(section): + self._config.add_section(section) + self._section = section + + # function to get config parameter, if parameter doesn't exists the default + # value or None is substituted + def __getitem__(self, name): + if not self._config.has_option(self._section, name): + if self._section in self._default: + if name in self._default[self._section]: + self._config.set(self._section, name, self._default[self._section][name]) + else: + self._config.set(self._section, name, None) + elif name in self._default['_other']: + self._config.set(self._section, name, self._default['_other'][name]) + else: + self._config.set(self._section, name, None) + return(self._config.get(self._section, name)) + +config = Config() + +#assert options.port or os.access(options.log, os.R_OK), 'Log file unreadable' + +const_desc_fields = set(['Content-Length', 'Pragma', 'Last-Modified']) +const_ignore_fields = set(['Accept-Ranges', 'Age', 'Cache-Control', 'Connection', 'Content-Type', 'Date', 'Expires', 'Server', 'Via', 'X-Cache', 'X-Cache-Lookup', 'X-Powered-By']) block_size = 4096 -temp_file_name = options.dir + os.sep + '.tmp' - ''' # later, kqueue would be good but later class Connection: __slots__ = frozenset(('__address', '__input', '__socket', '__status', 'error', 'method', 'url', 'http_version')) @@ -138,11 +177,12 @@ del(connections[kev.ident]) finally: sock.close() ''' -if options.port: +# XXX how about rechecking files? +if True: import http.server class MyRequestHandler(http.server.BaseHTTPRequestHandler): def __process(self): # reload means file needs to be reloaded to serve request @@ -160,10 +200,27 @@ myPath = re.compile('^(.*?)(\?.*)$').match(self.path) if myPath: my_path = myPath.group(1) else: my_path = self.path + + config.section(self.headers['Host']) + + if not os.access(config['dir'], os.X_OK): + os.mkdir(config['dir']) + # this is file index - everything is stored in this file + # _parts - list of stored parts of file + # _time - last time the file was checked + # everything else is just the headers + index = shelve.open(config['dir'] + os.sep + '.index') + + desc_fields = const_desc_fields.copy() + ignore_fields = const_ignore_fields.copy() + if not config['noetag']: + desc_fields.add('ETag') + else: + ignore_fields.add('ETag') proxy_ignored = ('Accept', 'Accept-Encoding', 'Cache-Control', 'Connection', 'Host', 'If-Modified-Since', 'If-Unmodified-Since', @@ -178,13 +235,16 @@ if header in proxy_ignored: pass elif header in ('Range'): isRange = re.compile('bytes=(\d+)-(\d+)').match(self.headers[header]) if isRange: - requested_ranges = SpaceMap({int(isRange.group(1)): int(isRange.group(2)) + 1}) + requested_ranges = spacemap.SpaceMap({int(isRange.group(1)): int(isRange.group(2)) + 1}) else: return() + elif header in ('Pragma'): + if my_path in index: + index[my_path][header] = self.headers[header] else: print('Unknown header - ', header, ': ', self.headers[header], sep='') return() print(header, self.headers[header]) @@ -193,35 +253,36 @@ # if there's an empty space map - file is full # space map generally covers every bit of file we don't posess currently if not my_path in index: info += '\nThis one is new.' reload = True - record = {'_parts': None} + record = {} else: record = index[my_path] - if '_parts' in index[my_path]: - if index[my_path]['_parts'] == {0: -1}: - index[my_path]['_parts'] = None + + if not '_parts' in record: + record['_parts'] = None # creating file name from my_path - file_name = options.dir + os.sep + re.compile('%20').sub(' ', my_path) + file_name = config['dir'] + os.sep + re.compile('%20').sub(' ', my_path) # partial file or unfinished download - temp_name = options.dir + os.sep + '.parts' + re.compile('%20').sub(' ', my_path) + temp_name = config['dir'] + os.sep + '.parts' + re.compile('%20').sub(' ', my_path) # forcibly checking file if no file present if os.access(file_name, os.R_OK): file_stat = os.stat(file_name) elif '_parts' in record and os.access(temp_name, os.R_OK): file_stat = os.stat(temp_name) elif not reload: + print(record) info += '\nFile not found or inaccessible.' - record = {'_parts': None} + record['_parts'] = None reload = True # forcibly checking file if file size doesn't match with index data if not reload: - if '_parts' in record and record['_parts'] == SpaceMap(): + if '_parts' in record and record['_parts'] == spacemap.SpaceMap(): if 'Content-Length' in record and file_stat and file_stat.st_size != int(record['Content-Length']): info += '\nFile size is {} and stored file size is {}.'.format(file_stat.st_size, record['Content-Length']) reload = True # forcibly checking file if index holds Pragma header @@ -235,15 +296,18 @@ print(info) if reload or recheck: try: - request = options.root + my_path + request = 'http://' + config['root'] + my_path needed = None + # XXX and if we specify full file we don't go partial? if requested_ranges != None: if '_parts' in record and record['_parts'] != None: needed = record['_parts'] & requested_ranges + elif config['noparts']: + needed = record['_parts'] else: needed = requested_ranges ranges = () print('Missing ranges: {}, requested ranges: {}, needed ranges: {}.'.format(record['_parts'], requested_ranges, needed)) if len(needed) > 0: @@ -305,11 +369,11 @@ if os.access(temp_name, os.R_OK): os.unlink(temp_name) if os.access(file_name, os.R_OK): os.unlink(file_name) if new_record['_parts'] == None or reload: - new_record['_parts'] = SpaceMap({0: int(new_record['Content-Length'])}) + new_record['_parts'] = spacemap.SpaceMap({0: int(new_record['Content-Length'])}) print(new_record) # downloading file or segment if 'Content-Length' in new_record: if needed == None: @@ -323,11 +387,11 @@ new_record['_time'] = datetime.datetime.now() if self.command not in ('HEAD'): # file is created at temporary location and moved in place only when download completes if not os.access(temp_name, os.R_OK): - empty_name = options.dir + os.sep + '.tmp' + empty_name = config['dir'] + os.sep + '.tmp' with open(empty_name, 'w+b') as some_file: pass os.renames(empty_name, temp_name) temp_file = open(temp_name, 'r+b') needed.rewind() @@ -346,11 +410,11 @@ while length > 0 and stream_last < end: stream_pos = stream_last + length assert not stream_pos > end, 'Received more data then requested: pos:{} start:{} end:{}.'.format(stream_pos, start, end) temp_file.seek(stream_last) temp_file.write(buffer) - new_record['_parts'] = new_record['_parts'] - SpaceMap({stream_last: stream_pos}) + new_record['_parts'] = new_record['_parts'] - spacemap.SpaceMap({stream_last: stream_pos}) index[my_path] = old_record index.sync() old_record = new_record stream_last = stream_pos if end - stream_last < block_size: @@ -357,24 +421,25 @@ req_block_size = end - stream_last buffer = source.read(req_block_size) length = len(buffer) # moving downloaded data to real file temp_file.close() - if new_record['_parts'] == SpaceMap(): - # just moving - # drop old dirs XXX - print('Moving temporary file to new destination.') - os.renames(temp_name, file_name) print(new_record) index[my_path] = new_record index.sync() except urllib.error.HTTPError as error: # in case of error we don't need to do anything actually, # if file download stalls or fails the file would not be moved to it's location print(error) + + if '_parts' in index[my_path] and index[my_path]['_parts'] == spacemap.SpaceMap(): + # just moving + # drop old dirs XXX + print('Moving temporary file to new destination.') + os.renames(temp_name, file_name) if self.command == 'HEAD': self.send_response(200) if 'Content-Length' in index[my_path]: self.send_header('Content-Length', index[my_path]['Content-Length']) @@ -382,11 +447,11 @@ self.send_header('Content-Type', 'application/octet-stream') if 'Last-Modified' in index[my_path]: self.send_header('Last-Modified', index[my_path]['Last-Modified']) self.end_headers() else: - if index[my_path]['_parts'] != SpaceMap(): + if ('_parts' in index[my_path] and index[my_path]['_parts'] != spacemap.SpaceMap()) or not os.access(file_name, os.R_OK): file_name = temp_name with open(file_name, 'rb') as real_file: file_stat = os.stat(file_name) if 'Range' in self.headers: @@ -400,11 +465,11 @@ ranges += '{}-{}'.format(pair[0], str(pair[1] - 1)), self.send_header('Content-Range', 'bytes {}/{}'.format(','.join(ranges), index[my_path]['Content-Length'])) else: self.send_response(200) self.send_header('Content-Length', str(file_stat.st_size)) - requested_ranges = SpaceMap({0: file_stat.st_size}) + requested_ranges = spacemap.SpaceMap({0: file_stat.st_size}) self.send_header('Last-Modified', index[my_path]['Last-Modified']) self.send_header('Content-Type', 'application/octet-stream') self.end_headers() if self.command in ('GET'): if len(requested_ranges) > 0: @@ -433,11 +498,12 @@ def do_HEAD(self): return self.__process() def do_GET(self): return self.__process() - server = http.server.HTTPServer(('127.0.0.1', int(options.port)), MyRequestHandler) + config.section('general') + server = http.server.HTTPServer(('127.0.0.1', int(config['port'])), MyRequestHandler) server.serve_forever() else: while True: unchecked_files = set()