Lines of samesite.py from check-in 90160dbf50 that are changed by the sequence of edits moving toward check-in 9a8a46bcf0:
#!/usr/bin/env python3.1
"""Caching HTTP front end that stores fetched files in per-site directories.

The server listens on 127.0.0.1 and picks a config-file section by the
request's ``Host`` header.  Files are served from that section's ``dir``;
whatever is missing or has to be re-validated is fetched from
``http://<root><path>``.

Every site directory holds:

* ``.index`` - a shelve mapping request path -> record.  A record holds the
  stored response headers plus ``_parts`` (a SpaceMap of the byte ranges that
  are still missing; an empty map means the file is complete) and ``_time``
  (when the origin was last contacted).
* ``.parts/<path>`` - the file while its download is unfinished.
* ``<path>`` - the complete file.
"""

import datetime
import http.cookiejar
import http.server
import os
import re
import shelve
import sys
import urllib.error
import urllib.request

import spacemap


class Config:
    """Command-line options plus the settings read from the config file.

    Select a section with :meth:`section`, then read options with
    ``config[name]``.
    """

    # '_default' must NOT be listed here: it is a class attribute, and a name
    # that appears both in __slots__ and in the class body makes the class
    # statement fail with ValueError.
    __slots__ = frozenset(['_config', '_section', 'options', 'root'])
    _default = {
        'general': {
            'port': '8008',
        },
        '_other': {
            'verbose': 'no',
            'noetag': 'no',
            'noparts': 'no',
            'strip': '',
            'sub': '',
        },
    }

    def __init__(self):
        """Parse the command line and load the config file.

        For every section except ``general`` the ``dir`` option is turned
        into a path below (or relative to) the config file's directory and
        ``root`` defaults to the section name.
        """
        import configparser
        import optparse

        parser = optparse.OptionParser()
        parser.add_option('-c', '--config', dest = 'config', help = 'config file location', metavar = 'FILE', default = 'samesite.conf')
        (self.options, args) = parser.parse_args()

        assert os.access(self.options.config, os.R_OK), "Fatal error: can't read {}".format(self.options.config)

        # Relative directories are resolved against the config file location.
        config_dir = re.compile(r'^(.*)/[^/]+$').match(self.options.config)
        if config_dir:
            self.root = config_dir.group(1)
        else:
            self.root = os.getcwd()

        self._config = configparser.ConfigParser()
        # read_file() replaced readfp(); fall back for interpreters that only
        # have the old name.  The file is closed as soon as it is parsed.
        read_file = getattr(self._config, 'read_file', None) or self._config.readfp
        with open(self.options.config) as config_file:
            read_file(config_file)

        for section in self._config.sections():
            if section == 'general':
                continue

            if self._config.has_option(section, 'dir'):
                # a bare "/" means "use the default directory"
                if re.compile(r'^/$').match(self._config.get(section, 'dir')):
                    self._config.set(section, 'dir', self.root + os.sep + section)
                # drop a trailing slash
                trailing = re.compile(r'^(.*)/$').match(self._config.get(section, 'dir'))
                if trailing:
                    self._config.set(section, 'dir', trailing.group(1))
                # anything not starting with "/" is relative to the root
                if not re.compile(r'^/(.*)$').match(self._config.get(section, 'dir')):
                    self._config.set(section, 'dir', self.root + os.sep + self._config.get(section, 'dir'))
            else:
                self._config.set(section, 'dir', self.root + os.sep + section)

            if not self._config.has_option(section, 'root'):
                self._config.set(section, 'root', section)

    def section(self, section):
        """Make *section* the current one, creating it when it is missing."""
        if not self._config.has_section(section):
            self._config.add_section(section)
        self._section = section

    def __getitem__(self, name):
        """Return option *name* of the current section.

        When the option is not set, the built-in default is stored and
        returned: the defaults of the section itself if it has an entry in
        ``_default``, the ``_other`` defaults otherwise.  ``None`` is
        returned when there is no default either.
        """
        if self._config.has_option(self._section, name):
            return self._config.get(self._section, name)

        if self._section in self._default:
            defaults = self._default[self._section]
        else:
            defaults = self._default['_other']

        if name not in defaults:
            # Not written back: current configparser only accepts string
            # values, so set(..., None) would raise TypeError.
            return None

        self._config.set(self._section, name, defaults[name])
        return self._config.get(self._section, name)


config = Config()

# response headers that are stored in the index and compared between checks
const_desc_fields = set(['Content-Length', 'Last-Modified', 'Pragma'])
# response headers that are silently dropped
const_ignore_fields = set([
    'Accept-Ranges', 'Age',
    'Cache-Control', 'Connection', 'Content-Type',
    'Date',
    'Expires',
    'Referer',
    'Server',
    'Via',
    'X-Cache', 'X-Cache-Lookup', 'X-Powered-By'
])
# request headers that are accepted without influencing cache handling
const_proxy_ignored = set([
    'Accept', 'Accept-Charset', 'Accept-Encoding', 'Accept-Language',
    'Cache-Control', 'Connection', 'Content-Length', 'Cookie',
    'Host',
    'If-Modified-Since', 'If-Unmodified-Since',
    'Referer',
    'User-Agent',
    'Via',
    'X-Forwarded-For', 'X-REMOVED',
])

# number of bytes read and written per step
block_size = 4096


def _spans(space_map):
    """Yield ``(start, end)`` pairs of *space_map* from its beginning.

    The map is rewound first and then popped until it reports a pair whose
    start is ``None``.  Pairs are produced lazily, one ``pop()`` per step.
    """
    space_map.rewind()
    while True:
        span = space_map.pop()
        if span[0] is None:
            return
        yield span


class MyRequestHandler(http.server.BaseHTTPRequestHandler):
    """Serves GET and HEAD requests from the local store, refreshing it from
    the origin site when needed."""

    def __process(self):
        """Handle one GET or HEAD request.

        Resolves the site and the stored path, opens the site's index and
        hands over to :meth:`__serve`; the index is always closed afterwards.
        """
        host = self.headers['Host']
        if host is None:
            self.send_error(400, 'Host header is required.')
            return

        # the query string is not part of the stored file name
        my_path = self.path.partition('?')[0]

        config.section(host)

        strip = config['strip']
        sub = config['sub']
        if sub is not None and strip is not None and len(strip) > 0:
            my_path = re.compile(strip).sub(sub, my_path)

        # The path is appended to the site directory verbatim, so parent
        # references would allow access outside of it.
        if '..' in my_path.split('/'):
            self.send_error(400, 'Parent directory references are not allowed.')
            return

        cache_dir = config['dir']
        if cache_dir is None or config['root'] is None:
            # only sections present in the config file get 'dir' and 'root'
            self.send_error(404, 'Site is not configured.')
            return

        if not os.access(cache_dir, os.X_OK):
            os.mkdir(cache_dir)

        # this is file index - everything is stored in this file
        # _parts - list of stored parts of file
        # _time - last time the file was checked
        # everything else is just the headers
        index = shelve.open(cache_dir + os.sep + '.index')
        try:
            self.__serve(index, my_path, cache_dir)
        finally:
            index.close()

    def __serve(self, index, my_path, cache_dir):
        """Decide whether *my_path* must be (re)fetched, do so, then answer."""
        # must_reload: the file has to be downloaded again to serve the request
        must_reload = False
        # must_recheck: the origin has to be asked about the file
        must_recheck = False
        # file_stat is only set when a full or partial file exists on disk
        file_stat = None

        info = 'Checking file: ' + my_path

        print('===============[ {} request ]==='.format(self.command))

        (accepted, requested_ranges) = self.__read_request_headers(index, my_path)
        if not accepted:
            return

        # creating file name from my_path
        file_name = cache_dir + os.sep + my_path.replace('%20', ' ')
        # partial file or unfinished download
        temp_name = cache_dir + os.sep + '.parts' + my_path.replace('%20', ' ')

        # if there's no space map and there's no file in real directory - we have no file
        # if there's an empty space map - file is full
        # space map generally covers every bit of file we don't possess currently
        if my_path not in index:
            info += '\nThis one is new.'
            must_reload = True
            record = {}
        else:
            record = index[my_path]
            if os.access(file_name, os.R_OK):
                info += '\nFull file found.'
                file_stat = os.stat(file_name)
            elif '_parts' in record and os.access(temp_name, os.R_OK):
                info += '\nPartial file found.'
                file_stat = os.stat(temp_name)
                must_recheck = True
            else:
                # indexed, but nothing on disk
                info += '\nFile not found or inaccessible.'
                record['_parts'] = None
                must_reload = True

        if '_parts' not in record:
            record['_parts'] = None

        if record['_parts'] == None:
            must_recheck = True

        # a file marked complete whose size differs from the stored
        # Content-Length is downloaded again
        if not must_reload:
            if '_parts' in record and record['_parts'] == spacemap.SpaceMap():
                if 'Content-Length' in record and file_stat and file_stat.st_size != int(record['Content-Length']):
                    info += '\nFile size is {} and stored file size is {}.'.format(file_stat.st_size, record['Content-Length'])
                    record['_parts'] = None
                    must_reload = True

        # forcibly checking file if index holds Pragma header
        if not must_reload and 'Pragma' in record and record['Pragma'] == 'no-cache':
            info +='\nPragma on: recheck imminent.'
            must_recheck = True

        # NOTE(review): this triggers a recheck when the last check is LESS
        # than 4 hours old and skips it for older entries - looks inverted,
        # confirm the intent before changing it.
        if not must_recheck and not must_reload and '_time' in record and (datetime.datetime.now() - datetime.timedelta(hours = 4) - record['_time']).days < 0:
            must_recheck = True

        print(info)
        if must_reload or must_recheck:
            try:
                self.__refresh(index, my_path, record, must_reload, requested_ranges, file_name, temp_name, cache_dir)
            except urllib.error.HTTPError as error:
                # in case of error we don't need to do anything actually,
                # if file download stalls or fails the file would not be moved to its location
                print(error)

        # Nothing stored means the origin could not provide the file.  This
        # has to be tested before the index entry is read below.
        if my_path not in index:
            self.send_response(502)
            self.end_headers()
            return

        stored = index[my_path]
        print(stored)

        if not os.access(file_name, os.R_OK) and os.access(temp_name, os.R_OK) and '_parts' in stored and stored['_parts'] == spacemap.SpaceMap():
            # download is complete: move the file to its final place
            # drop old dirs XXX
            print('Moving temporary file to new destination.')
            os.renames(temp_name, file_name)

        self.__respond(stored, requested_ranges, file_name, temp_name)

    def __read_request_headers(self, index, my_path):
        """Inspect the request headers.

        Returns ``(accepted, requested_ranges)``.  ``accepted`` is False when
        the request was refused (a 400 reply has been sent already);
        ``requested_ranges`` is a SpaceMap built from a ``Range`` header or
        ``None`` when there is none.
        """
        requested_ranges = None

        for header in self.headers:
            value = self.headers[header]
            if header in const_proxy_ignored:
                pass
            elif header == 'Range':
                is_range = re.compile(r'bytes=(\d+)-(\d+)').match(value)
                if not is_range:
                    self.send_error(400, 'Unsupported Range header.')
                    return (False, None)
                # SpaceMap ends are exclusive, HTTP range ends are inclusive
                requested_ranges = spacemap.SpaceMap({int(is_range.group(1)): int(is_range.group(2)) + 1})
            elif header == 'Pragma':
                if my_path in index:
                    # A shelf hands out copies: the entry has to be assigned
                    # back for the change to be stored.
                    stored = index[my_path]
                    stored[header] = value
                    index[my_path] = stored
            else:
                print('Unknown header - ', header, ': ', value, sep='')
                self.send_error(400, 'Unsupported request header.')
                return (False, None)
            print(header, value)

        return (True, requested_ranges)

    def __refresh(self, index, my_path, record, reload, requested_ranges, file_name, temp_name, cache_dir):
        """Ask the origin for the file and store what it sends.

        *record* is the current index entry (``{'_parts': None}`` for a new
        file) and *reload* tells whether a full download was already decided.
        The response headers are compared with *record*; when headers were
        added or changed the stored data is dropped and everything is
        downloaded again.  Unless the request is a HEAD the received body is
        written into the partial file and the index entry is updated after
        every block.

        Raises urllib.error.HTTPError when the origin answers with an error,
        AssertionError when the response cannot be handled.
        """
        desc_fields = const_desc_fields.copy()
        ignore_fields = const_ignore_fields.copy()
        if config['noetag'] == 'no':
            desc_fields.add('ETag')
        else:
            ignore_fields.add('ETag')

        # only these request headers are passed on to the origin
        my_headers = {}
        for header in ('Cache-Control', 'Cookie', 'Referer', 'User-Agent'):
            if header in self.headers:
                my_headers[header] = self.headers[header]

        # ranges that are going to be requested; None means the whole file
        needed = None
        if '_parts' in record and record['_parts'] != None:
            if config['noparts'] != 'no' or requested_ranges == None or requested_ranges == spacemap.SpaceMap():
                needed = record['_parts']
            else:
                # NOTE(review): union of missing and requested ranges; an
                # intersection may have been intended - confirm against
                # SpaceMap semantics.
                needed = record['_parts'] | requested_ranges
        elif config['noparts'] =='no' and requested_ranges != None and requested_ranges != spacemap.SpaceMap():
            needed = requested_ranges

        print('Missing ranges: {}, requested ranges: {}, needed ranges: {}.'.format(record['_parts'], requested_ranges, needed))
        if needed != None and len(needed) > 0:
            ranges = ['{}-{}'.format(start, end - 1) for (start, end) in _spans(needed)]
            my_headers['Range'] = 'bytes=' + ','.join(ranges)

        request = urllib.request.Request('http://' + config['root'] + self.path, headers = my_headers)

        with urllib.request.urlopen(request) as source:
            new_record = {}
            new_record['_parts'] = record['_parts']
            headers = source.info()

            # stripping unneeded headers (XXX make this inplace?)
            for header in headers:
                if header in desc_fields:
                    if header == 'Content-Length':
                        # for a partial response the full size is taken from
                        # Content-Range instead
                        if 'Content-Range' not in headers:
                            new_record[header] = int(headers[header])
                    else:
                        new_record[header] = headers[header]
                elif header == 'Content-Range':
                    content_range = re.compile(r'^bytes (\d+)-(\d+)/(\d+)$').match(headers[header])
                    if not content_range:
                        raise AssertionError('Content-Range unrecognized.')
                    new_record['Content-Length'] = int(content_range.group(3))
                elif header not in ignore_fields:
                    print('Undefined header "', header, '": ', headers[header], sep='')

            # comparing headers with data found in index
            # if any header has changed (except Pragma) file is fully downloaded
            # same if we get more headers
            old_keys = set(record.keys())
            old_keys.discard('_time')
            old_keys.discard('Pragma')
            more_keys = set(new_record.keys()) - old_keys
            more_keys.discard('Pragma')
            less_keys = old_keys - set(new_record.keys())
            if len(more_keys) > 0:
                if not len(old_keys) == 0:
                    print('More headers appear:', more_keys)
                reload = True
            elif len(less_keys) > 0:
                print('Less headers appear:', less_keys)
            else:
                for key in record.keys():
                    if key[0] != '_' and key != 'Pragma' and not record[key] == new_record[key]:
                        print('Header "', key, '" changed from [', record[key], '] to [', new_record[key], ']', sep='')
                        print(type(record[key]), type(new_record[key]))
                        reload = True

            if reload:
                print('Reloading.')
                if os.access(temp_name, os.R_OK):
                    os.unlink(temp_name)
                if os.access(file_name, os.R_OK):
                    os.unlink(file_name)
                # everything is missing again
                new_record['_parts'] = spacemap.SpaceMap({0: int(new_record['Content-Length'])})
            print(new_record)

            # downloading file or segment
            if 'Content-Length' not in new_record:
                raise AssertionError('No Content-Length or Content-Range header.')
            if needed == None:
                needed = new_record['_parts']
            elif len(needed) > 1:
                print("Multipart requests currently not supported.")
                raise AssertionError('Skip this one for now.')

            new_record['_time'] = datetime.datetime.now()
            if self.command != 'HEAD':
                # file is created at temporary location and moved in place only when download completes
                if not os.access(temp_name, os.R_OK):
                    empty_name = cache_dir + os.sep + '.tmp'
                    with open(empty_name, 'w+b'):
                        pass
                    # renames() also creates the missing directories
                    os.renames(empty_name, temp_name)

                if requested_ranges == None and needed == None:
                    needed = new_record['_parts']

                with open(temp_name, 'r+b') as temp_file:
                    for (start, end) in _spans(needed):
                        stream_last = start
                        req_block_size = min(block_size, end - start)
                        buffer = source.read(req_block_size)
                        while len(buffer) > 0 and stream_last < end:
                            stream_pos = stream_last + len(buffer)
                            if stream_pos > end:
                                raise AssertionError('Received more data then requested: pos:{} start:{} end:{}.'.format(stream_pos, start, end))
                            temp_file.seek(stream_last)
                            temp_file.write(buffer)
                            # the block just written is not missing any more;
                            # progress is stored after every block
                            new_record['_parts'] = new_record['_parts'] - spacemap.SpaceMap({stream_last: stream_pos})
                            index[my_path] = new_record
                            index.sync()
                            stream_last = stream_pos
                            req_block_size = min(block_size, end - stream_last)
                            buffer = source.read(req_block_size)

            index[my_path] = new_record
            index.sync()

    def __respond(self, stored, requested_ranges, file_name, temp_name):
        """Send the reply built from index entry *stored*.

        HEAD gets the headers only.  GET gets the requested range or the
        whole file, read from the partial file while the download is
        incomplete or the final file is unreadable.
        """
        if self.command == 'HEAD':
            self.send_response(200)
            if 'Content-Length' in stored:
                self.send_header('Content-Length', stored['Content-Length'])
            self.send_header('Accept-Ranges', 'bytes')
            self.send_header('Content-Type', 'application/octet-stream')
            if 'Last-Modified' in stored:
                self.send_header('Last-Modified', stored['Last-Modified'])
            self.end_headers()
            return

        if ('_parts' in stored and stored['_parts'] != spacemap.SpaceMap()) or not os.access(file_name, os.R_OK):
            file_name = temp_name

        with open(file_name, 'rb') as real_file:
            file_stat = os.stat(file_name)
            if 'Range' in self.headers:
                self.send_response(206)
                ranges = ['{}-{}'.format(start, str(end - 1)) for (start, end) in _spans(requested_ranges)]
                self.send_header('Content-Range', 'bytes {}/{}'.format(','.join(ranges), stored['Content-Length']))
            else:
                self.send_response(200)
                self.send_header('Content-Length', str(file_stat.st_size))
                requested_ranges = spacemap.SpaceMap({0: file_stat.st_size})
            if 'Last-Modified' in stored:
                self.send_header('Last-Modified', stored['Last-Modified'])
            self.send_header('Content-Type', 'application/octet-stream')
            self.end_headers()

            if self.command == 'GET':
                if len(requested_ranges) > 0:
                    # only the first range is served
                    requested_ranges.rewind()
                    (start, end) = requested_ranges.pop()
                else:
                    start = 0
                    end = stored['Content-Length']
                real_file.seek(start)
                req_block_size = min(block_size, end - start)
                buffer = real_file.read(req_block_size)
                while len(buffer) > 0:
                    self.wfile.write(buffer)
                    start += len(buffer)
                    if req_block_size > end - start:
                        req_block_size = end - start
                    if req_block_size == 0:
                        break
                    buffer = real_file.read(req_block_size)

    def do_HEAD(self):
        """Handle a HEAD request."""
        return self.__process()

    def do_GET(self):
        """Handle a GET request."""
        return self.__process()


config.section('general')
server = http.server.HTTPServer(('127.0.0.1', int(config['port'])), MyRequestHandler)
server.serve_forever()