Lines of samesite.py from check-in 8425e2e393 that are changed by the sequence of edits moving toward check-in 601ec56da6:
#!/usr/bin/env python3.2
"""Caching HTTP mirror: serves files from a local cache, fetching from upstream.

Every virtual host (taken from the request's Host header) maps to a section of
the configuration file.  A section provides:

* ``dir``  - directory holding cached files (plus ``.index`` and ``.parts``);
* ``root`` - upstream host name the files are fetched from.

NOTE(review): ``spacemap`` is a project module that is not visible here.  From
its use in this file, ``SpaceMap`` is treated as a set of half-open
``[start, end)`` byte intervals walked with ``rewind()``/``pop()``, where
``pop()`` returns ``(None, None)`` once exhausted - confirm against the module.
"""

import configparser
import contextlib
import datetime
import http.cookiejar
import http.server
import optparse
import os
import re
import shelve
import sys
import urllib.error
import urllib.request

import spacemap


class Config:
    """Configuration file wrapper with per-section defaults.

    Select a section with :meth:`section`, then read options with
    ``config[name]``.
    """

    # '_default' is a class attribute and so must NOT be listed in __slots__:
    # a slot and a class variable of the same name make class creation fail
    # with "ValueError: ... in __slots__ conflicts with class variable".
    __slots__ = frozenset(['_config', '_section', 'options', 'root'])

    # 'general' defaults apply to the [general] section only, '_other'
    # defaults apply to every other (per-host) section.
    _default = {
        'general': {
            'port': '8008',
        },
        '_other': {
            'verbose': 'no',
            'noetag': 'no',
            'noparts': 'no',
            'strip': '',
            'sub': '',
        },
    }

    def __init__(self):
        """Parse the command line, read the config file, normalize sections.

        Exits the process with an error message when the configuration file
        can not be read.
        """
        parser = optparse.OptionParser()
        parser.add_option('-c', '--config', dest='config', help='config file location', metavar='FILE', default='samesite.conf')
        self.options, _ = parser.parse_args()

        # explicit check instead of `assert`, which is stripped under -O
        if not os.access(self.options.config, os.R_OK):
            sys.exit("Fatal error: can't read {}".format(self.options.config))

        # relative cache directories are resolved against the directory
        # holding the config file (or the cwd when the path has no directory)
        head, slash, tail = self.options.config.rpartition('/')
        if slash and tail:
            self.root = head
        else:
            self.root = os.getcwd()

        self._section = 'general'
        self._config = configparser.ConfigParser()
        with open(self.options.config) as config_file:
            self._config.read_file(config_file)

        for section in self._config.sections():
            if section == 'general':
                continue

            if self._config.has_option(section, 'dir'):
                directory = self._config.get(section, 'dir')
                if directory == '/':
                    # a bare '/' means "use the default location"
                    directory = self.root + os.sep + section
                if directory.endswith('/'):
                    directory = directory[:-1]
                if not directory.startswith('/'):
                    directory = self.root + os.sep + directory
            else:
                directory = self.root + os.sep + section
            self._config.set(section, 'dir', directory)

            # the upstream host defaults to the section (virtual host) name
            if not self._config.has_option(section, 'root'):
                self._config.set(section, 'root', section)

    def section(self, section):
        """Select *section* for subsequent lookups, creating it if missing."""
        if not self._config.has_section(section):
            self._config.add_section(section)
        self._section = section

    def __getitem__(self, name):
        """Return option *name* of the selected section.

        Falls back to the built-in default for that option, or ``None`` when
        there is no default either.  Defaults are returned directly rather
        than stored: ConfigParser.set() rejects non-string values such as
        None with TypeError.
        """
        if self._config.has_option(self._section, name):
            return self._config.get(self._section, name)
        defaults = self._default.get(self._section, self._default['_other'])
        return defaults.get(name)


config = Config()

#assert options.port or os.access(options.log, os.R_OK), 'Log file unreadable'

# upstream response headers stored in the index and compared between checks
const_desc_fields = set(['Content-Length', 'Last-Modified', 'Pragma'])
# upstream response headers that are silently dropped
const_ignore_fields = set([
    'Accept-Ranges', 'Age',
    'Cache-Control', 'Connection', 'Content-Type',
    'Date',
    'Expires',
    'Referer',
    'Server',
    'Via',
    'X-Cache', 'X-Cache-Lookup', 'X-Powered-By',
])
# client request headers that do not influence request processing
const_proxy_ignored = set([
    'Accept', 'Accept-Charset', 'Accept-Encoding', 'Accept-Language',
    'Cache-Control', 'Connection', 'Content-Length', 'Cookie',
    'Host',
    'If-Modified-Since', 'If-Unmodified-Since',
    'Referer',
    'User-Agent',
    'Via',
    'X-Forwarded-For', 'X-Last-HR', 'X-Last-HTTP-Status-Code', 'X-REMOVED', 'X-Real-IP', 'X-Retry-Count',
])

block_size = 4096

_RANGE_REQUEST_RE = re.compile(r'bytes=(\d+)-(\d+)')
_CONTENT_RANGE_RE = re.compile(r'^bytes (\d+)-(\d+)/(\d+)$')


def _format_ranges(space):
    """Return the intervals of *space* as HTTP range text, e.g. '0-9,20-29'.

    Intervals are half-open in the space map and inclusive on the wire, hence
    the ``end - 1``.  Leaves the map's cursor at its end.
    """
    parts = []
    space.rewind()
    while True:
        start, end = space.pop()
        if start is None:
            break
        parts.append('{}-{}'.format(start, end - 1))
    return ','.join(parts)


class MyRequestHandler(http.server.BaseHTTPRequestHandler):
    """Serves GET/HEAD requests from the local cache, refreshing it on demand."""

    def __process(self):
        """Resolve host and path of the request, then serve it from the cache.

        Replies 400 to requests without a Host header or with '..' in the
        path, and 502 to hosts that have no cache directory or upstream
        configured.
        """
        host = self.headers['Host']
        if host is None:
            self.send_error(400, 'Host header is required')
            return

        # the query string is not part of the cache key; the upstream request
        # still uses the full self.path
        my_path = self.path.partition('?')[0]

        config.section(host)

        strip = config['strip']
        sub = config['sub']
        if sub is not None and strip:
            my_path = re.compile(strip).sub(sub, my_path)

        # the path is turned into a file name below: refuse to leave the
        # cache directory
        if '..' in my_path.split('/'):
            self.send_error(400, 'Invalid path')
            return

        cache_dir = config['dir']
        if cache_dir is None or config['root'] is None:
            # host without a section in the configuration file
            self.send_error(502, 'Unknown host')
            return

        if not os.access(cache_dir, os.X_OK):
            os.mkdir(cache_dir)

        # this is file index - everything is stored in this file
        # _parts - space map of the parts of file still missing
        # _time - last time the file was checked
        # everything else is just the headers
        with contextlib.closing(shelve.open(cache_dir + os.sep + '.index')) as index:
            self._serve(index, my_path, cache_dir)

    def _serve(self, index, my_path, cache_dir):
        """Refresh the cached copy of *my_path* if needed and send it.

        index     -- open shelf mapping paths to their stored records
        my_path   -- request path without query string (the cache key)
        cache_dir -- directory holding the cached files of this host
        """
        # reload means file needs to be reloaded to serve request
        reload = False
        # recheck means file needs to be checked, this also means that if
        # file has been modified we can serve older copy
        recheck = False
        # file_stat means file definitely exists
        file_stat = None
        # requested_ranges holds data about any range requested
        requested_ranges = None

        info = 'Checking file: ' + my_path

        desc_fields = const_desc_fields.copy()
        ignore_fields = const_ignore_fields.copy()
        if config['noetag'] == 'no':
            desc_fields.add('ETag')
        else:
            ignore_fields.add('ETag')

        print('===============[ {} request ]==='.format(self.command))

        for header in self.headers:
            if header in const_proxy_ignored:
                pass
            elif header == 'Range':
                is_range = _RANGE_REQUEST_RE.match(self.headers[header])
                if is_range:
                    requested_ranges = spacemap.SpaceMap({int(is_range.group(1)): int(is_range.group(2)) + 1})
                else:
                    self.send_error(400, 'Unsupported Range header')
                    return
            elif header == 'Pragma':
                if my_path in index:
                    # a shelf hands out copies: the record must be assigned
                    # back for the change to be stored
                    stored = index[my_path]
                    stored[header] = self.headers[header]
                    index[my_path] = stored
            else:
                print('Unknown header - ', header, ': ', self.headers[header], sep='')
                self.send_error(400, 'Unsupported request header')
                return
            print(header, self.headers[header])

        local_path = my_path.replace('%20', ' ')
        # creating file name from my_path
        file_name = cache_dir + os.sep + local_path
        # partial file or unfinished download
        temp_name = cache_dir + os.sep + '.parts' + local_path

        # creating empty placeholder in index
        # if there's no space map and there's no file in real directory - we have no file
        # if there's an empty space map - file is full
        # space map generally covers every bit of file we don't possess currently
        if my_path not in index:
            info += '\nThis one is new.'
            reload = True
            record = {}
        else:
            # forcibly checking file if no file present
            record = index[my_path]
            if os.access(file_name, os.R_OK):
                info += '\nFull file found.'
                file_stat = os.stat(file_name)
            elif '_parts' in record and os.access(temp_name, os.R_OK):
                info += '\nPartial file found.'
                file_stat = os.stat(temp_name)
                recheck = True
            else:
                info += '\nFile not found or inaccessible.'
                record['_parts'] = None
                reload = True

        if '_parts' not in record:
            record['_parts'] = None

        if record['_parts'] is None:
            recheck = True

        # forcibly checking file if file size doesn't match with index data
        if not reload:
            if record['_parts'] is not None and record['_parts'] == spacemap.SpaceMap():
                if 'Content-Length' in record and file_stat and file_stat.st_size != int(record['Content-Length']):
                    info += '\nFile size is {} and stored file size is {}.'.format(file_stat.st_size, record['Content-Length'])
                    record['_parts'] = None
                    reload = True

        # forcibly checking file if index holds Pragma header
        if not reload and 'Pragma' in record and record['Pragma'] == 'no-cache':
            info += '\nPragma on: recheck imminent.'
            recheck = True

        # rechecking the file if the last check is more than 4 hours old
        if not recheck and not reload and '_time' in record and datetime.datetime.now() - record['_time'] > datetime.timedelta(hours=4):
            info += '\nFile is old - rechecking.'
            recheck = True

        print(info)
        if reload or recheck:
            try:
                upstream_url = 'http://' + config['root'] + self.path
                my_headers = {}
                for header in ('Cache-Control', 'Cookie', 'Referer', 'User-Agent'):
                    if header in self.headers:
                        my_headers[header] = self.headers[header]

                # deciding which byte ranges to ask upstream for
                range_wanted = requested_ranges is not None and requested_ranges != spacemap.SpaceMap()
                needed = None
                if record['_parts'] is not None:
                    if config['noparts'] != 'no' or not range_wanted:
                        needed = record['_parts']
                    else:
                        needed = record['_parts'] & requested_ranges
                elif config['noparts'] == 'no' and range_wanted:
                    needed = requested_ranges
                print('Missing ranges: {}, requested ranges: {}, needed ranges: {}.'.format(record['_parts'], requested_ranges, needed))
                if needed is not None and len(needed) > 0:
                    my_headers['Range'] = 'bytes=' + _format_ranges(needed)

                request = urllib.request.Request(upstream_url, headers=my_headers)

                with urllib.request.urlopen(request) as source:
                    new_record = {'_parts': record['_parts']}
                    headers = source.info()

                    # stripping unneeded headers (XXX make this inplace?)
                    for header in headers:
                        if header in desc_fields:
                            if header == 'Content-Length':
                                # for a partial response the full size comes
                                # from Content-Range instead
                                if 'Content-Range' not in headers:
                                    new_record[header] = int(headers[header])
                            else:
                                new_record[header] = headers[header]
                        elif header == 'Content-Range':
                            content_range = _CONTENT_RANGE_RE.match(headers[header])
                            if content_range:
                                new_record['Content-Length'] = int(content_range.group(3))
                            else:
                                raise AssertionError('Content-Range unrecognized.')
                        elif header not in ignore_fields:
                            print('Undefined header "', header, '": ', headers[header], sep='')

                    # comparing headers with data found in index
                    # if any header has changed (except Pragma) file is fully downloaded
                    # same if we get more headers
                    old_keys = set(record.keys())
                    old_keys.discard('_time')
                    old_keys.discard('Pragma')
                    more_keys = set(new_record.keys()) - old_keys
                    more_keys.discard('Pragma')
                    less_keys = old_keys - set(new_record.keys())
                    if more_keys:
                        if old_keys:
                            print('More headers appear:', more_keys)
                        reload = True
                    elif less_keys:
                        print('Less headers appear:', less_keys)
                    else:
                        for key in record:
                            if key[0] != '_' and key != 'Pragma' and not record[key] == new_record[key]:
                                print('Header "', key, '" changed from [', record[key], '] to [', new_record[key], ']', sep='')
                                print(type(record[key]), type(new_record[key]))
                                reload = True

                    if reload:
                        print('Reloading.')
                        if os.access(temp_name, os.R_OK):
                            os.unlink(temp_name)
                        if os.access(file_name, os.R_OK):
                            os.unlink(file_name)
                        if 'Content-Length' in new_record:
                            # everything is missing
                            new_record['_parts'] = spacemap.SpaceMap({0: int(new_record['Content-Length'])})
                        else:
                            new_record['_parts'] = spacemap.SpaceMap()
                    print(new_record)

                    # downloading file or segment
                    if 'Content-Length' in new_record:
                        if needed is None:
                            needed = new_record['_parts']
                        elif len(needed) > 1:
                            print("Multipart requests currently not supported.")
                            raise AssertionError('Skip this one for now.')

                    new_record['_time'] = datetime.datetime.now()
                    if self.command != 'HEAD':
                        # file is created at temporary location and moved in place only when download completes
                        if not os.access(temp_name, os.R_OK):
                            empty_name = cache_dir + os.sep + '.tmp'
                            with open(empty_name, 'w+b'):
                                pass
                            # renames() also creates missing directories
                            os.renames(empty_name, temp_name)
                        if needed is None:
                            needed = new_record['_parts']
                        if needed is not None:
                            with open(temp_name, 'r+b') as temp_file:
                                needed.rewind()
                                while True:
                                    (start, end) = needed.pop()
                                    if start is None:
                                        break
                                    stream_last = start
                                    while stream_last < end:
                                        buffer = source.read(min(block_size, end - stream_last))
                                        if not buffer:
                                            break
                                        stream_pos = stream_last + len(buffer)
                                        if stream_pos > end:
                                            raise AssertionError('Received more data then requested: pos:{} start:{} end:{}.'.format(stream_pos, start, end))
                                        temp_file.seek(stream_last)
                                        temp_file.write(buffer)
                                        # recording the progress after every
                                        # block so that an interrupted
                                        # download can be resumed
                                        new_record['_parts'] = new_record['_parts'] - spacemap.SpaceMap({stream_last: stream_pos})
                                        index[my_path] = new_record
                                        index.sync()
                                        stream_last = stream_pos

                    index[my_path] = new_record
                    index.sync()

            except urllib.error.URLError as error:
                # in case of error we don't need to do anything actually,
                # if file download stalls or fails the file would not be moved to it's location
                # (URLError also covers HTTPError and plain connection failures)
                print(error)

        # nothing is known about the file and upstream didn't help either
        if my_path not in index:
            self.send_response(502)
            self.end_headers()
            return

        stored = index[my_path]
        print(stored)
        complete = stored.get('_parts') is not None and stored['_parts'] == spacemap.SpaceMap()

        if complete and not os.access(file_name, os.R_OK) and os.access(temp_name, os.R_OK):
            # just moving
            # drop old dirs XXX
            print('Moving temporary file to new destination.')
            os.renames(temp_name, file_name)

        if self.command == 'HEAD':
            self.send_response(200)
            if 'Content-Length' in stored:
                self.send_header('Content-Length', stored['Content-Length'])
            self.send_header('Accept-Ranges', 'bytes')
            self.send_header('Content-Type', 'application/octet-stream')
            if 'Last-Modified' in stored:
                self.send_header('Last-Modified', stored['Last-Modified'])
            self.end_headers()
            return

        # an incomplete file is served from its temporary location
        if ('_parts' in stored and stored['_parts'] != spacemap.SpaceMap()) or not os.access(file_name, os.R_OK):
            file_name = temp_name

        if not os.access(file_name, os.R_OK):
            # there's an index entry, but no data could be obtained
            self.send_response(502)
            self.end_headers()
            return

        with open(file_name, 'rb') as real_file:
            file_stat = os.stat(file_name)
            if 'Range' in self.headers:
                self.send_response(206)
                self.send_header('Content-Range', 'bytes {}/{}'.format(_format_ranges(requested_ranges), stored.get('Content-Length', '*')))
            else:
                self.send_response(200)
                self.send_header('Content-Length', str(file_stat.st_size))
                requested_ranges = spacemap.SpaceMap({0: file_stat.st_size})
            if 'Last-Modified' in stored:
                self.send_header('Last-Modified', stored['Last-Modified'])
            self.send_header('Content-Type', 'application/octet-stream')
            self.end_headers()
            if self.command == 'GET':
                if len(requested_ranges) > 0:
                    requested_ranges.rewind()
                    (start, end) = requested_ranges.pop()
                else:
                    start = 0
                    # XXX ugly hack
                    end = stored.get('Content-Length', 0)
                real_file.seek(start)
                remaining = end - start
                while remaining > 0:
                    buffer = real_file.read(min(block_size, remaining))
                    if not buffer:
                        break
                    self.wfile.write(buffer)
                    remaining -= len(buffer)

    def do_HEAD(self):
        """Handle HEAD requests."""
        return self.__process()

    def do_GET(self):
        """Handle GET requests."""
        return self.__process()


config.section('general')
server = http.server.HTTPServer(('127.0.0.1', int(config['port'])), MyRequestHandler)
server.serve_forever()