Lines of samesite.py from check-in 996aa0149d that are changed by the sequence of edits moving toward check-in 7ff9724ae4:
#!/usr/bin/env python3.2
"""samesite - a small caching HTTP proxy.

Serves files for configured hosts out of a local cache directory,
downloading missing files (or missing byte ranges) from the origin server
on demand.  Per-directory metadata lives in a bsddb3 shelve index; the
byte ranges still missing from a partially downloaded file are tracked
with spacemap.SpaceMap objects.
"""

import argparse, os

# --- command line: only the config file location is configurable here ---
parser = argparse.ArgumentParser()
parser.add_argument('-c', '--config', dest = 'config', help = 'config file location', metavar = 'FILE', default = 'samesite.conf')
args = parser.parse_args()
assert os.access(args.config, os.R_OK), "Fatal error: can't read {}".format(args.config)

import configparser
# defaults applied to every host section of the config file
config = configparser.ConfigParser({
    'port': '8008',
    'verbose': 'no',
    'noetag': 'no',    # 'no' means ETag is treated as a descriptive header
    'noparts': 'no',   # 'no' means partial (Range) downloads are allowed
    'strip': '',       # regex stripped from request paths
    'sub': '',         # replacement text for 'strip'
    'proto': 'http',   # protocol used to contact the origin server
})
config.read(args.config)

# cache directories are created next to the config file by default
cache_dir = os.path.realpath(os.path.dirname(args.config))

import re
# normalise host sections: absolute 'dir' without a trailing slash,
# 'root' (origin host name) defaulting to the section name
for section in config.sections():
    if section != 'DEFAULT':
        if 'dir' in config[section]:
            if not config[section]['dir'].startswith('/'):
                config[section]['dir'] = cache_dir + os.sep + section
            if config[section]['dir'].endswith('/'):
                config[section]['dir'] = config[section]['dir'][:-1]
            if not config[section]['dir'].startswith('/'):
                config[section]['dir'] = cache_dir + os.sep + config[section]['dir']
        else:
            config[section]['dir'] = cache_dir + os.sep + section
        if 'root' not in config[section]:
            config[section]['root'] = section

#assert options.port or os.access(options.log, os.R_OK), 'Log file unreadable'

# headers that describe the cached entity and are stored in the index
const_desc_fields = set(['Content-Length', 'Last-Modified', 'Pragma'])
# headers that are recognised but deliberately not stored
const_ignore_fields = set([
    'Accept-Ranges', 'Age',
    'Cache-Control', 'Connection', 'Content-Type',
    'Date',
    'Expires',
    'Referer',
    'Server',
    'Via',
    'X-Cache', 'X-Cache-Lookup', 'X-Livetool', 'X-Powered-By',
])

# read/write chunk size for streaming downloads and responses
block_size = 8192

import bsddb3.dbshelve, copy, datetime, http.server, spacemap, urllib.request, urllib.error

class MyRequestHandler(http.server.BaseHTTPRequestHandler):
    """Answers GET/HEAD requests from the cache, refreshing it on demand."""

    def __process(self):
        """Common implementation behind do_GET and do_HEAD.

        Looks the requested path up in the per-host cache index, re-downloads
        the file (or only the missing ranges) from the origin when needed,
        then serves the response from the cached copy.
        """
        # reload means file needs to be reloaded to serve request
        reload = False
        # recheck means file needs to be checked, this also means that if file hav been modified we can serve older copy
        recheck = False
        # file_stat means file definitely exists
        file_stat = None
        # requested_ranges holds data about any range requested
        requested_ranges = None
        # records holds data from index locally, should be written back upon successfull completion
        record = None

        # strip any query string from the request path
        myPath = re.compile(r'^(.*?)(\?.*)$').match(self.path)
        if myPath:
            my_path = myPath.group(1)
        else:
            my_path = self.path

        # unknown hosts get an on-the-fly section with default settings
        if not config.has_section(self.headers['Host']):
            config.add_section(self.headers['Host'])
            config[self.headers['Host']]['root'] = self.headers['Host']
            config[self.headers['Host']]['dir'] = cache_dir + os.sep + self.headers['Host']
        config_host = config[self.headers['Host']]

        # optional path rewriting configured per host
        if config_host['sub'] != None and config_host['strip'] != None and len(config_host['strip']) > 0:
            string = re.compile(config_host['strip']).sub(config_host['sub'], my_path)
            my_path = string

        my_path_b = my_path.encode('utf-8')
        info = 'Checking file: ' + my_path

        if not os.access(config_host['dir'], os.X_OK):
            os.mkdir(config_host['dir'])
        # this is file index - everything is stored in this file
        # _parts - list of stored parts of file
        # _time - last time the file was checked
        # everything else is just the headers
        index = bsddb3.dbshelve.open(config_host['dir'] + os.sep + '.index')

        desc_fields = const_desc_fields.copy()
        ignore_fields = const_ignore_fields.copy()
        if config_host['noetag'] == 'no':
            desc_fields.add('ETag')
        else:
            ignore_fields.add('ETag')

        # request headers we silently drop instead of forwarding
        proxy_ignored = set([
            'Accept', 'Accept-Charset', 'Accept-Encoding', 'Accept-Language',
            'Cache-Control', 'Connection', 'Content-Length', 'Cookie',
            'Host',
            'If-Modified-Since', 'If-Unmodified-Since',
            'Referer',
            'UA-CPU', 'User-Agent',
            'Via',
            'X-Forwarded-For', 'X-Last-HR', 'X-Last-HTTP-Status-Code', 'X-Old-UID', 'X-Removed', 'X-Real-IP', 'X-Retry-Count',
        ])

        print('===============[ {} request ]==='.format(self.command))

        for header in self.headers:
            if header in proxy_ignored:
                pass
            # NOTE: one-element tuples below — 'in ("Range")' would be a
            # substring test on the string "Range", not a membership test
            elif header in ('Range',):
                isRange = re.compile(r'bytes=(\d+)-(\d+)').match(self.headers[header])
                if isRange:
                    requested_ranges = spacemap.SpaceMap({int(isRange.group(1)): int(isRange.group(2)) + 1})
                else:
                    # unparsable Range request - refuse to answer
                    return
            elif header in ('Pragma',):
                if my_path_b in index:
                    # the shelve hands back a copy: mutate it and write it
                    # back, otherwise the Pragma header is never persisted
                    stored = index[my_path_b]
                    stored[header] = self.headers[header]
                    index[my_path_b] = stored
            else:
                print('Unknown header - ', header, ': ', self.headers[header], sep='')
                return
            print(header, self.headers[header])

        # creating file name from my_path
        file_name = config_host['dir'] + os.sep + my_path.replace('%20', ' ')
        # partial file or unfinished download
        temp_name = config_host['dir'] + os.sep + '.parts' + my_path.replace('%20', ' ')

        # creating empty placeholder in index
        # if there's no space map and there's no file in real directory - we have no file
        # if there's an empty space map - file is full
        # space map generally covers every bit of file we don't posess currently
        if not my_path_b in index:
            info += '\nThis one is new.'
            reload = True
            record = {}
        else:
            # forcibly checking file if no file present
            record = index[my_path_b]
            if os.access(file_name, os.R_OK):
                info += '\nFull file found.'
                file_stat = os.stat(file_name)
            elif '_parts' in index[my_path_b] and os.access(temp_name, os.R_OK):
                info += '\nPartial file found.'
                file_stat = os.stat(temp_name)
                recheck = True
            else:
                info += '\nFile not found or inaccessible.'
                record['_parts'] = None
                reload = True

        if not '_parts' in record:
            record['_parts'] = None

        if record['_parts'] == None:
            recheck = True

        # forcibly checking file if file size doesn't match with index data
        if not reload:
            if '_parts' in record and record['_parts'] == spacemap.SpaceMap():
                if 'content-length' in record and file_stat and file_stat.st_size != int(record['content-length']):
                    info += '\nFile size is {} and stored file size is {}.'.format(file_stat.st_size, record['content-length'])
                    record['_parts'] = None
                    reload = True

        # forcibly checking file if index holds Pragma header
        if not reload and 'pragma' in record and record['pragma'] == 'no-cache':
            info +='\nPragma on: recheck imminent.'
            recheck = True

        # skipping file processing if there's no need to recheck it and we have checked it at least 4 hours ago
        if not recheck and not reload and '_time' in record and (record['_time'] - datetime.datetime.now() + datetime.timedelta(hours = 4)).days < 0:
            info += '\nFile is old - rechecking.'
            recheck = True

        print(info)
        if reload or recheck:

            try:
                request = config_host['proto'] + '://' + config_host['root'] + self.path
                my_headers = {}
                for header in ('Accept', 'Cache-Control', 'Cookie', 'Referer', 'User-Agent'):
                    if header in self.headers:
                        my_headers[header] = self.headers[header]

                # 'needed' is the set of byte ranges we must fetch upstream
                needed = None
                if self.command not in ('HEAD',):
                    if '_parts' in record and record['_parts'] != None:
                        if config_host['noparts'] != 'no' or requested_ranges == None or requested_ranges == spacemap.SpaceMap():
                            needed = record['_parts']
                        else:
                            needed = record['_parts'] & requested_ranges
                    elif config_host['noparts'] =='no' and requested_ranges != None and requested_ranges != spacemap.SpaceMap():
                        needed = requested_ranges
                    ranges = ()
                    print('Missing ranges: {}, requested ranges: {}, needed ranges: {}.'.format(record['_parts'], requested_ranges, needed))
                    if needed != None and len(needed) > 0:
                        needed.rewind()
                        while True:
                            range = needed.pop()
                            if range[0] == None:
                                break
                            ranges += '{}-{}'.format(range[0], range[1] - 1),
                        my_headers['Range'] = 'bytes=' + ','.join(ranges)

                #my_headers['Accept-Encoding'] = 'gzip, compress, deflate, identity; q=0'
                request = urllib.request.Request(request, headers = my_headers)

                source = urllib.request.urlopen(request, timeout = 60)
                new_record = {}
                new_record['_parts'] = record['_parts']
                headers = source.info()

                # transparently decompress gzip-encoded upstream responses
                if 'Content-Encoding' in headers and headers['Content-Encoding'] == 'gzip':
                    import gzip
                    source = gzip.GzipFile(fileobj=source)

                # stripping unneeded headers (XXX make this inplace?)
                for header in headers:
                    if header in desc_fields:
                        #if header == 'Pragma' and headers[header] != 'no-cache':
                        if header == 'Content-Length':
                            # for a 206 reply the total size comes from
                            # Content-Range instead, handled below
                            if 'Content-Range' not in headers:
                                new_record[header] = int(headers[header])
                        else:
                            new_record[header] = headers[header]
                    elif header == 'Content-Range':
                        range = re.compile(r'^bytes (\d+)-(\d+)/(\d+)$').match(headers[header])
                        if range:
                            new_record['Content-Length'] = int(range.group(3))
                        else:
                            assert False, 'Content-Range unrecognized.'
                    elif not header in ignore_fields:
                        print('Undefined header "', header, '": ', headers[header], sep='')

                # comparing headers with data found in index
                # if any header has changed (except Pragma) file is fully downloaded
                # same if we get more or less headers
                old_keys = set(record.keys())
                old_keys.discard('_time')
                old_keys.discard('Pragma')
                more_keys = set(new_record.keys()) - old_keys
                more_keys.discard('Pragma')
                less_keys = old_keys - set(new_record.keys())
                if len(more_keys) > 0:
                    if len(old_keys) != 0:
                        print('More headers appear:', more_keys)
                    reload = True
                elif len(less_keys) > 0:
                    print('Less headers appear:', less_keys)
                else:
                    for key in record.keys():
                        if key[0] != '_' and key != 'Pragma' and record[key] != new_record[key]:
                            print('Header "', key, '" changed from [', record[key], '] to [', new_record[key], ']', sep='')
                            print(type(record[key]), type(new_record[key]))
                            reload = True

                if reload:
                    print('Reloading.')
                    # drop any stale local copies; the space map is reset to
                    # "everything missing" for the new Content-Length
                    if os.access(temp_name, os.R_OK):
                        os.unlink(temp_name)
                    if os.access(file_name, os.R_OK):
                        os.unlink(file_name)
                    if 'Content-Length' in new_record:
                        new_record['_parts'] = spacemap.SpaceMap({0: int(new_record['Content-Length'])})
                if not new_record['_parts']:
                    new_record['_parts'] = spacemap.SpaceMap()
                print(new_record)

                # downloading file or segment
                if 'Content-Length' in new_record:
                    if needed == None:
                        needed = new_record['_parts']
                    else:
                        if len(needed) > 1:
                            print("Multipart requests currently not supported.")
                            assert False, 'Skip this one for now.'
                #else:
                    #assert False, 'No content-length or Content-Range header.'

                new_record['_time'] = datetime.datetime.now()
                if self.command not in ('HEAD',):
                    # file is created at temporary location and moved in place only when download completes
                    if not os.access(temp_name, os.R_OK):
                        empty_name = config_host['dir'] + os.sep + '.tmp'
                        with open(empty_name, 'w+b') as some_file:
                            pass
                        os.renames(empty_name, temp_name)
                    temp_file = open(temp_name, 'r+b')
                    if requested_ranges == None and needed == None:
                        needed = new_record['_parts']
                    needed.rewind()
                    while True:
                        # XXX can make this implicit - one request per range
                        (start, end) = needed.pop()
                        if start == None:
                            break
                        stream_last = start
                        old_record = copy.copy(new_record)
                        if end - start < block_size:
                            req_block_size = end - start
                        else:
                            req_block_size = block_size
                        buffer = source.read(req_block_size)
                        length = len(buffer)
                        while length > 0 and stream_last < end:
                            stream_pos = stream_last + length
                            assert stream_pos <= end, 'Received more data then requested: pos:{} start:{} end:{}.'.format(stream_pos, start, end)
                            temp_file.seek(stream_last)
                            temp_file.write(buffer)
                            # shrink the missing-parts map and checkpoint the
                            # index after every block so progress survives
                            # an interrupted download
                            new_record['_parts'] = new_record['_parts'] - spacemap.SpaceMap({stream_last: stream_pos})
                            index[my_path_b] = old_record
                            index.sync()
                            old_record = copy.copy(new_record)
                            stream_last = stream_pos
                            if end - stream_last < block_size:
                                req_block_size = end - stream_last
                            buffer = source.read(req_block_size)
                            length = len(buffer)
                    # moving downloaded data to real file
                    temp_file.close()

                index[my_path_b] = new_record
                index.sync()

            except urllib.error.HTTPError as error:
                # in case of error we don't need to do anything actually,
                # if file download stalls or fails the file would not be moved to it's location
                print(error, repr(my_headers))

        print(index[my_path_b])

        if not os.access(file_name, os.R_OK) and os.access(temp_name, os.R_OK) and '_parts' in index[my_path_b] and index[my_path_b]['_parts'] == spacemap.SpaceMap():
            # just moving
            # drop old dirs XXX
            print('Moving temporary file to new destination.')
            os.renames(temp_name, file_name)

        if not my_path_b in index:
            # nothing cached and the download failed - bad gateway
            self.send_response(502)
            self.end_headers()
            return

        if self.command == 'HEAD':
            self.send_response(200)
            if 'Content-Length' in index[my_path_b]:
                self.send_header('Content-Length', index[my_path_b]['Content-Length'])
            self.send_header('Accept-Ranges', 'bytes')
            self.send_header('Content-Type', 'application/octet-stream')
            if 'Last-Modified' in index[my_path_b]:
                self.send_header('Last-Modified', index[my_path_b]['Last-Modified'])
            self.end_headers()
        else:
            # serve from the partial file when the download is incomplete
            if ('_parts' in index[my_path_b] and index[my_path_b]['_parts'] != spacemap.SpaceMap()) or not os.access(file_name, os.R_OK):
                file_name = temp_name

            with open(file_name, 'rb') as real_file:
                file_stat = os.stat(file_name)
                if 'Range' in self.headers:
                    self.send_response(206)
                    ranges = ()
                    requested_ranges.rewind()
                    while True:
                        pair = requested_ranges.pop()
                        if pair[0] == None:
                            break
                        ranges += '{}-{}'.format(pair[0], str(pair[1] - 1)),
                    self.send_header('Content-Range', 'bytes {}/{}'.format(','.join(ranges), index[my_path_b]['Content-Length']))
                else:
                    self.send_response(200)
                    self.send_header('Content-Length', str(file_stat.st_size))
                    # no Range header: pretend the whole file was requested
                    requested_ranges = spacemap.SpaceMap({0: file_stat.st_size})
                if 'Last-Modified' in index[my_path_b]:
                    self.send_header('Last-Modified', index[my_path_b]['Last-Modified'])
                self.send_header('Content-Type', 'application/octet-stream')
                self.end_headers()
                if self.command in ('GET',):
                    if len(requested_ranges) > 0:
                        requested_ranges.rewind()
                        (start, end) = requested_ranges.pop()
                    else:
                        start = 0
                        # XXX ugly hack
                        if 'Content-Length' in index[my_path_b]:
                            end = index[my_path_b]['Content-Length']
                        else:
                            end = 0
                    real_file.seek(start)
                    if block_size > end - start:
                        req_block_size = end - start
                    else:
                        req_block_size = block_size
                    buffer = real_file.read(req_block_size)
                    length = len(buffer)
                    while length > 0:
                        self.wfile.write(buffer)
                        start += len(buffer)
                        if req_block_size > end - start:
                            req_block_size = end - start
                        if req_block_size == 0:
                            break
                        buffer = real_file.read(req_block_size)
                        length = len(buffer)

    def do_HEAD(self):
        return self.__process()

    def do_GET(self):
        return self.__process()

server = http.server.HTTPServer(('127.0.0.1', int(config['DEFAULT']['port'])), MyRequestHandler)
server.serve_forever()

#gevent.joinall()