Lines of samesite.py from check-in 82969b1fc2 that are changed by the sequence of edits moving toward check-in d8731957ad:
#!/usr/bin/env python
"""samesite - a small caching HTTP front end (Python 2).

Every configured site (selected by the request's Host header) gets a cache
directory.  Requested files are fetched from the upstream site, stored under
that directory and served from there afterwards.  Per-file metadata lives in
a ``.index`` dbshelve inside the cache directory:

* ``_parts`` - SpaceMap of the byte ranges we do NOT have yet
  (an empty map means the file is complete, None means "unknown"),
* ``_time``  - last time the file was checked against upstream,
* everything else - the descriptive upstream headers.
"""

from __future__ import unicode_literals, print_function

#import gevent.monkey
#gevent.monkey.patch_all()

import BaseHTTPServer
import bsddb.dbshelve
import copy
import datetime
import os
import re
import sys
import urllib2

import spacemap

# Patterns are kept exactly as the original inline expressions; they are only
# compiled once here instead of on every use.
_CONFIG_DIR_RE = re.compile('^(.*)/[^/]+$')
_ROOT_ONLY_RE = re.compile('^/$')
_TRAILING_SLASH_RE = re.compile('^(.*)/$')
_ABSOLUTE_RE = re.compile('^/(.*)$')
_QUERY_RE = re.compile(r'^(.*?)(\?.*)$')
_RANGE_REQUEST_RE = re.compile(r'bytes=(\d+)-(\d+)')
_CONTENT_RANGE_RE = re.compile(r'^bytes (\d+)-(\d+)/(\d+)$')
_SPACE_ESCAPE_RE = re.compile('%20')
# A '..' path segment would let a request escape the cache directory.
_PARENT_REF_RE = re.compile(r'(^|/)\.\.(/|$)')


class Config(object):
    """Access to the configuration file, one section at a time.

    Select a section with ``section(name)`` and read options with
    ``config[name]``.  Options missing from the file fall back to
    ``_default`` and finally to None.
    """

    # '_default' is a class attribute and therefore must not be a slot.
    __slots__ = ('_config', '_section', 'options', 'root')

    _default = {
        'general': {
            'port': '8008',
        },
        '_other': {
            'verbose': 'no',
            'noetag': 'no',
            'noparts': 'no',
            'strip': '',
            'sub': '',
            'proto': 'http',
        },
    }

    def __init__(self):
        """Parse the command line and read the configuration file.

        Raises:
            IOError: the configuration file is not readable.
        """
        import ConfigParser, optparse

        parser = optparse.OptionParser()
        parser.add_option('-c', '--config', dest='config', help='config file location', metavar='FILE', default='samesite.conf')
        self.options, _ = parser.parse_args()

        # Explicit check instead of assert: asserts vanish under "python -O".
        if not os.access(self.options.config, os.R_OK):
            raise IOError("Fatal error: can't read {}".format(self.options.config))

        # Relative cache directories are resolved against the directory that
        # holds the configuration file (or the cwd for a bare file name).
        config_dir = _CONFIG_DIR_RE.match(self.options.config)
        if config_dir:
            self.root = config_dir.group(1)
        else:
            self.root = os.getcwd()

        self._config = ConfigParser.ConfigParser()
        with open(self.options.config) as config_file:
            self._config.readfp(config_file)

        for section in self._config.sections():
            if section == 'general':
                continue
            default_dir = self.root + os.sep + section
            if self._config.has_option(section, 'dir'):
                # a bare '/' means "use the default directory"
                if _ROOT_ONLY_RE.match(self._config.get(section, 'dir')):
                    self._config.set(section, 'dir', default_dir)
                # drop one trailing slash
                trailing = _TRAILING_SLASH_RE.match(self._config.get(section, 'dir'))
                if trailing:
                    self._config.set(section, 'dir', trailing.group(1))
                # make relative directories absolute
                if not _ABSOLUTE_RE.match(self._config.get(section, 'dir')):
                    self._config.set(section, 'dir', self.root + os.sep + self._config.get(section, 'dir'))
            else:
                self._config.set(section, 'dir', default_dir)

            # the upstream host defaults to the section name
            if not self._config.has_option(section, 'root'):
                self._config.set(section, 'root', section)

    def section(self, section):
        """Select the section later lookups read from, creating it if absent."""
        if not self._config.has_section(section):
            self._config.add_section(section)
        self._section = section

    def __getitem__(self, name):
        """Return option *name* of the selected section.

        A missing option is first filled in from ``_default`` (the entry for
        the section itself if there is one, ``'_other'`` otherwise) or with
        None, so the substituted value is remembered in the parser.
        """
        if not self._config.has_option(self._section, name):
            defaults = self._default.get(self._section, self._default['_other'])
            self._config.set(self._section, name, defaults.get(name))
        return self._config.get(self._section, name)


config = Config()

#assert options.port or os.access(options.log, os.R_OK), 'Log file unreadable'

# upstream headers that describe the file and are stored in the index
const_desc_fields = set(['content-length', 'last-modified', 'pragma'])
# upstream headers that are silently dropped
const_ignore_fields = set([
    'accept-ranges', 'age',
    'cache-control', 'connection', 'content-type',
    'date',
    'expires',
    'referer',
    'server',
    'via',
    'x-cache', 'x-cache-lookup', 'x-livetool', 'x-powered-by',
])

# request headers that are accepted but have no influence on processing
_PROXY_IGNORED = frozenset([
    'accept', 'accept-charset', 'accept-encoding', 'accept-language',
    'cache-control', 'connection', 'content-length', 'cookie',
    'host',
    'if-modified-since', 'if-unmodified-since',
    'referer',
    'ua-cpu', 'user-agent',
    'via',
    'x-forwarded-for', 'x-last-hr', 'x-last-http-status-code', 'x-removed', 'x-real-ip', 'x-retry-count',
])

block_size = 8192


def _iter_ranges(space_map):
    """Yield the (start, end) pairs of *space_map* using its rewind/pop cursor."""
    space_map.rewind()
    while True:
        start, end = space_map.pop()
        if start is None:
            break
        yield start, end


def _format_ranges(space_map):
    """Return the ranges of *space_map* as inclusive 'first-last' strings."""
    return ['{}-{}'.format(start, end - 1) for start, end in _iter_ranges(space_map)]


class MyRequestHandler(BaseHTTPServer.BaseHTTPRequestHandler):
    """Serves GET/HEAD requests from the per-site cache, refreshing it from
    the upstream site when the cached copy is missing, partial or stale."""

    def __process(self):
        """Common entry point of do_GET/do_HEAD.

        Resolves the site and the cache path of the request, opens the site's
        index and hands over to _serve(); the index is always closed again.
        """
        query = _QUERY_RE.match(self.path)
        if query:
            my_path = query.group(1)
        else:
            my_path = self.path

        # The site is selected by the Host header; without one there is
        # nothing to look up.
        if 'host' not in self.headers:
            self.send_error(400, 'Host header is required')
            return
        config.section(self.headers['host'])

        # Only sections read from the configuration file get 'dir' and
        # 'root'; any other host has no cache directory and no upstream.
        if config['dir'] is None or config['root'] is None:
            self.send_error(404, 'Site is not configured')
            return

        if config['sub'] is not None and config['strip'] is not None and len(config['strip']) > 0:
            my_path = re.compile(config['strip']).sub(config['sub'], my_path)

        # The path is joined onto the cache directory below and comes straight
        # from the client - never let it climb out of that directory.
        if _PARENT_REF_RE.search(my_path):
            self.send_error(400, 'Parent directory references are not allowed')
            return

        if not os.access(config['dir'], os.X_OK):
            os.mkdir(config['dir'])
        # this is file index - everything is stored in this file
        # _parts - list of stored parts of file
        # _time - last time the file was checked
        # everything else is just the headers
        index = bsddb.dbshelve.open(config['dir'] + os.sep + '.index')
        try:
            self._serve(index, my_path)
        finally:
            index.close()

    def _read_request_headers(self, index, my_path):
        """Validate the request headers.

        Returns (accepted, requested_ranges).  *accepted* is False when an
        error response has already been sent; *requested_ranges* is the
        SpaceMap built from a Range header or None when there was none.
        """
        requested_ranges = None
        for header in self.headers:
            value = self.headers[header]
            if header in _PROXY_IGNORED:
                pass
            elif header == 'range':
                spec = _RANGE_REQUEST_RE.match(value)
                if not spec:
                    # only "bytes=first-last" is understood
                    self.send_error(416, 'Unsupported Range specification')
                    return False, None
                requested_ranges = spacemap.SpaceMap({int(spec.group(1)): int(spec.group(2)) + 1})
            elif header == 'pragma':
                if my_path in index:
                    # The shelf hands out a fresh copy on every read, so the
                    # record has to be written back for the change to stick.
                    stored = index[my_path]
                    stored[header] = value
                    index[my_path] = stored
            else:
                print('Unknown header - ', header, ': ', value, sep='')
                self.send_error(400, 'Unsupported request header')
                return False, None
            print(header, value)
        return True, requested_ranges

    def _serve(self, index, my_path):
        """Decide whether the cached copy is usable, refresh it if needed and
        send the response."""
        print('===============[ {} request ]==='.format(self.command))

        accepted, requested_ranges = self._read_request_headers(index, my_path)
        if not accepted:
            return

        info = 'Checking file: ' + my_path

        # must_reload: the file has to be downloaded again to serve the request
        must_reload = False
        # must_recheck: the file has to be checked against upstream; if that
        # fails an older copy may still be served
        must_recheck = False
        # file_stat is set when a (full or partial) file definitely exists
        file_stat = None

        # creating file name from my_path
        file_name = config['dir'] + os.sep + _SPACE_ESCAPE_RE.sub(' ', my_path)
        # partial file or unfinished download
        temp_name = config['dir'] + os.sep + '.parts' + _SPACE_ESCAPE_RE.sub(' ', my_path)

        # if there's no space map and there's no file in real directory - we have no file
        # if there's an empty space map - file is full
        # space map generally covers every bit of file we don't possess currently
        if my_path not in index:
            info += '\nThis one is new.'
            must_reload = True
            record = {}
        else:
            record = index[my_path]
            if os.access(file_name, os.R_OK):
                info += '\nFull file found.'
                file_stat = os.stat(file_name)
            elif '_parts' in record and os.access(temp_name, os.R_OK):
                info += '\nPartial file found.'
                file_stat = os.stat(temp_name)
                must_recheck = True
            else:
                # forcibly reloading file if no file present
                info += '\nFile not found or inaccessible.'
                record['_parts'] = None
                must_reload = True

        if '_parts' not in record:
            record['_parts'] = None

        if record['_parts'] is None:
            must_recheck = True

        # forcibly reloading file if file size doesn't match with index data
        if not must_reload:
            if record['_parts'] == spacemap.SpaceMap():
                if 'content-length' in record and file_stat and file_stat.st_size != int(record['content-length']):
                    info += '\nFile size is {} and stored file size is {}.'.format(file_stat.st_size, record['content-length'])
                    record['_parts'] = None
                    must_reload = True

        # forcibly checking file if index holds Pragma header
        if not must_reload and 'pragma' in record and record['pragma'] == 'no-cache':
            info += '\nPragma on: recheck imminent.'
            must_recheck = True

        # rechecking files that were last checked more than 4 hours ago
        if not must_recheck and not must_reload and '_time' in record and datetime.datetime.now() - record['_time'] > datetime.timedelta(hours=4):
            info += '\nFile is old - rechecking.'
            must_recheck = True

        print(info)
        if must_reload or must_recheck:
            try:
                self._refresh(index, my_path, record, must_reload, requested_ranges, file_name, temp_name)
            except urllib2.URLError as error:
                # HTTPError is a URLError, so this covers both upstream error
                # statuses and connection failures.
                # in case of error we don't need to do anything actually,
                # if file download stalls or fails the file would not be moved to its location
                print(error)

        # Nothing cached and nothing fetched: upstream is the only source we
        # have.  This has to come before any index[my_path] lookup.
        if my_path not in index:
            self.send_response(502)
            self.end_headers()
            return

        stored = index[my_path]
        print(stored)

        if not os.access(file_name, os.R_OK) and os.access(temp_name, os.R_OK) and '_parts' in stored and stored['_parts'] == spacemap.SpaceMap():
            # just moving
            # drop old dirs XXX
            print('Moving temporary file to new destination.')
            os.renames(temp_name, file_name)

        self._send_cached(stored, file_name, temp_name, requested_ranges)

    def _refresh(self, index, my_path, record, must_reload, requested_ranges, file_name, temp_name):
        """Query upstream, update the index record and download missing data.

        Raises:
            urllib2.URLError: the upstream request failed.
            ValueError: upstream sent a Content-Range that cannot be parsed.
            NotImplementedError: more than one range would have to be fetched.
            IOError: upstream sent more data than was asked for.
        """
        desc_fields = const_desc_fields.copy()
        ignore_fields = const_ignore_fields.copy()
        if config['noetag'] == 'no':
            desc_fields.add('etag')
        else:
            ignore_fields.add('etag')

        url = config['proto'] + '://' + config['root'] + self.path
        my_headers = {}
        for header in ('cache-control', 'cookie', 'referer', 'user-agent'):
            if header in self.headers:
                my_headers[header] = self.headers[header]

        # needed - the byte ranges to ask upstream for
        needed = None
        if self.command != 'HEAD':
            if '_parts' in record and record['_parts'] is not None:
                if config['noparts'] != 'no' or requested_ranges is None or requested_ranges == spacemap.SpaceMap():
                    needed = record['_parts']
                else:
                    needed = record['_parts'] & requested_ranges
            elif config['noparts'] == 'no' and requested_ranges is not None and requested_ranges != spacemap.SpaceMap():
                needed = requested_ranges
            print('Missing ranges: {}, requested ranges: {}, needed ranges: {}.'.format(record['_parts'], requested_ranges, needed))
            if needed is not None and len(needed) > 0:
                my_headers['range'] = 'bytes=' + ','.join(_format_ranges(needed))

        # NOTE(review): a gzip-encoded answer is decoded below while the range
        # bookkeeping counts bytes of the stored file - confirm that upstream
        # never actually compresses these responses.
        my_headers['Accept-Encoding'] = 'gzip, compress, deflate, identity; q=0'
        request = urllib2.Request(url, headers=my_headers)

        response = urllib2.urlopen(request, timeout=60)
        try:
            source = response
            new_record = {}
            new_record['_parts'] = record['_parts']
            headers = response.info()

            if 'content-encoding' in headers and headers['content-encoding'] == 'gzip':
                import gzip
                source = gzip.GzipFile(fileobj=response)

            # stripping unneeded headers (XXX make this inplace?)
            for header in headers:
                if header in desc_fields:
                    if header == 'content-length':
                        # for partial answers the full size comes from Content-Range
                        if 'content-range' not in headers:
                            new_record[header] = int(headers[header])
                    else:
                        new_record[header] = headers[header]
                elif header == 'content-range':
                    content_range = _CONTENT_RANGE_RE.match(headers[header])
                    if not content_range:
                        raise ValueError('Content-Range unrecognized.')
                    new_record['content-length'] = int(content_range.group(3))
                elif header not in ignore_fields:
                    print('Undefined header "', header, '": ', headers[header], sep='')

            # comparing headers with data found in index
            # if any header has changed (except Pragma) file is fully downloaded
            # same if we get more headers
            old_keys = set(record.keys())
            old_keys.discard('_time')
            old_keys.discard('pragma')
            more_keys = set(new_record.keys()) - old_keys
            more_keys.discard('pragma')
            less_keys = old_keys - set(new_record.keys())
            if len(more_keys) > 0:
                if len(old_keys) != 0:
                    print('More headers appear:', more_keys)
                must_reload = True
            elif len(less_keys) > 0:
                print('Less headers appear:', less_keys)
            else:
                for key in record.keys():
                    if key[0] != '_' and key != 'pragma' and record[key] != new_record[key]:
                        print('Header "', key, '" changed from [', record[key], '] to [', new_record[key], ']', sep='')
                        print(type(record[key]), type(new_record[key]))
                        must_reload = True

            if must_reload:
                print('Reloading.')
                if os.access(temp_name, os.R_OK):
                    os.unlink(temp_name)
                if os.access(file_name, os.R_OK):
                    os.unlink(file_name)
                if 'content-length' in new_record:
                    # everything is missing again
                    new_record['_parts'] = spacemap.SpaceMap({0: int(new_record['content-length'])})
            if not new_record['_parts']:
                new_record['_parts'] = spacemap.SpaceMap()
            print(new_record)

            # downloading file or segment
            if 'content-length' in new_record:
                if needed is None:
                    needed = new_record['_parts']
                elif len(needed) > 1:
                    print("Multipart requests currently not supported.")
                    raise NotImplementedError('Skip this one for now.')

            new_record['_time'] = datetime.datetime.now()
            if self.command != 'HEAD':
                # file is created at temporary location and moved in place only when download completes
                if not os.access(temp_name, os.R_OK):
                    empty_name = config['dir'] + os.sep + '.tmp'
                    with open(empty_name, 'w+b'):
                        pass
                    os.renames(empty_name, temp_name)
                # Nothing specific was asked of upstream: the answer carries
                # whatever the record says is missing.
                if needed is None:
                    needed = new_record['_parts']
                with open(temp_name, 'r+b') as temp_file:
                    # XXX can make this implicit - one request per range
                    for start, end in _iter_ranges(needed):
                        stream_last = start
                        old_record = copy.copy(new_record)
                        chunk = source.read(min(block_size, end - start))
                        while len(chunk) > 0 and stream_last < end:
                            stream_pos = stream_last + len(chunk)
                            if stream_pos > end:
                                raise IOError('Received more data then requested: pos:{} start:{} end:{}.'.format(stream_pos, start, end))
                            temp_file.seek(stream_last)
                            temp_file.write(chunk)
                            new_record['_parts'] = new_record['_parts'] - spacemap.SpaceMap({stream_last: stream_pos})
                            # the index deliberately lags one block behind
                            # what has been written to the file
                            index[my_path] = old_record
                            index.sync()
                            old_record = copy.copy(new_record)
                            stream_last = stream_pos
                            chunk = source.read(min(block_size, end - stream_last))

            index[my_path] = new_record
            index.sync()
        finally:
            response.close()

    def _send_cached(self, stored, file_name, temp_name, requested_ranges):
        """Send the response for index record *stored* from the cached file."""
        if self.command == 'HEAD':
            self.send_response(200)
            if 'content-length' in stored:
                self.send_header('content-length', stored['content-length'])
            self.send_header('accept-ranges', 'bytes')
            self.send_header('content-type', 'application/octet-stream')
            if 'last-modified' in stored:
                self.send_header('last-modified', stored['last-modified'])
            self.end_headers()
            return

        # an incomplete file still lives at its temporary location
        if ('_parts' in stored and stored['_parts'] != spacemap.SpaceMap()) or not os.access(file_name, os.R_OK):
            file_name = temp_name

        with open(file_name, 'rb') as real_file:
            file_stat = os.stat(file_name)
            if requested_ranges is not None:
                self.send_response(206)
                self.send_header('content-range', 'bytes {}/{}'.format(','.join(_format_ranges(requested_ranges)), stored['content-length']))
            else:
                self.send_response(200)
                self.send_header('content-length', str(file_stat.st_size))
                requested_ranges = spacemap.SpaceMap({0: file_stat.st_size})
            if 'last-modified' in stored:
                self.send_header('last-modified', stored['last-modified'])
            self.send_header('content-type', 'application/octet-stream')
            self.end_headers()
            if self.command == 'GET':
                if len(requested_ranges) > 0:
                    # only the first requested range is served
                    requested_ranges.rewind()
                    (start, end) = requested_ranges.pop()
                else:
                    start = 0
                    # XXX ugly hack
                    end = stored.get('content-length', 0)
                real_file.seek(start)
                req_block_size = min(block_size, end - start)
                chunk = real_file.read(req_block_size)
                while len(chunk) > 0:
                    self.wfile.write(chunk)
                    start += len(chunk)
                    req_block_size = min(req_block_size, end - start)
                    if req_block_size == 0:
                        break
                    chunk = real_file.read(req_block_size)

    def do_HEAD(self):
        """Handle a HEAD request."""
        return self.__process()

    def do_GET(self):
        """Handle a GET request."""
        return self.__process()


config.section('general')
server = BaseHTTPServer.HTTPServer(('127.0.0.1', int(config['port'])), MyRequestHandler)
server.serve_forever()

#gevent.joinall()