Lines of
samesite.py
from check-in 82969b1fc2
that are changed by the sequence of edits moving toward
check-in d8731957ad:
82969b1fc2 2012-01-25 1: #!/usr/bin/env python
2:
3: from __future__ import unicode_literals, print_function
4:
5: #import gevent.monkey
6: #gevent.monkey.patch_all()
7:
82969b1fc2 2012-01-25 8: import bsddb.dbshelve, copy, datetime, os, BaseHTTPServer, sys, spacemap, re, urllib2
82969b1fc2 2012-01-25 9:
class Config:
    """Per-site configuration backed by a ConfigParser file.

    The config file location comes from the ``-c/--config`` command line
    option (default ``samesite.conf``).  Every section other than
    ``general`` gets a normalised ``dir`` option and a ``root`` option that
    defaults to the section name.

    Usage: call :meth:`section` to pick the active section, then read
    options with ``config[name]``.  Missing options fall back to the
    ``_default`` table and finally to ``None``.
    """

    # `_default` is intentionally NOT listed here: it is a class attribute,
    # and a slot with the same name as a class attribute is rejected with a
    # ValueError as soon as the class is new-style.
    __slots__ = frozenset(['_config', '_section', 'options', 'root'])

    # Fallback values: keyed by section name; '_other' applies to every
    # section that has no table of its own.
    _default = {
        'general': {
            'port': '8008',
        },
        '_other': {
            'verbose': 'no',
            'noetag': 'no',
            'noparts': 'no',
            'strip': '',
            'sub': '',
            'proto': 'http',
        },
    }

    def __init__(self):
        """Parse the command line, read the config file and normalise it.

        Raises:
            IOError: the config file is not readable.
        """
        import ConfigParser
        import optparse

        parser = optparse.OptionParser()
        parser.add_option('-c', '--config', dest='config', help='config file location', metavar='FILE', default='samesite.conf')
        (self.options, args) = parser.parse_args()

        # An explicit raise instead of `assert`: asserts are removed by -O.
        if not os.access(self.options.config, os.R_OK):
            raise IOError("Fatal error: can't read {}".format(self.options.config))

        # The directory holding the config file is the base for relative
        # paths; a bare file name means the current working directory.
        head, slash, tail = self.options.config.rpartition('/')
        if slash and tail:
            self.root = head
        else:
            self.root = os.getcwd()

        self._config = ConfigParser.ConfigParser()
        with open(self.options.config) as handle:
            self._config.readfp(handle)

        for section in self._config.sections():
            if section == 'general':
                continue

            if self._config.has_option(section, 'dir'):
                path = self._config.get(section, 'dir')
                # a lone '/' means "use the default location"
                if path == '/':
                    path = self.root + os.sep + section
                # drop one trailing slash
                if path.endswith('/'):
                    path = path[:-1]
                # relative paths are resolved against the config directory
                if not path.startswith('/'):
                    path = self.root + os.sep + path
            else:
                path = self.root + os.sep + section
            self._config.set(section, 'dir', path)

            if not self._config.has_option(section, 'root'):
                self._config.set(section, 'root', section)

    def section(self, section):
        """Make `section` the active section, creating it if it is missing."""
        if not self._config.has_section(section):
            self._config.add_section(section)
        self._section = section

    def __getitem__(self, name):
        """Return option `name` of the active section.

        When the option is absent the default for the section (or the
        '_other' defaults for sections without their own table) is stored
        in the configuration and returned; with no default, ``None`` is
        stored and returned.
        """
        if not self._config.has_option(self._section, name):
            defaults = self._default.get(self._section, self._default['_other'])
            self._config.set(self._section, name, defaults.get(name))
        return self._config.get(self._section, name)
80:
# Module-wide configuration object; constructing it parses the command line
# and reads the config file as a side effect.
config = Config()

#assert options.port or os.access(options.log, os.R_OK), 'Log file unreadable'
84:
# Upstream response headers that are recorded in the index for each file.
const_desc_fields = {'content-length', 'last-modified', 'pragma'}

# Upstream response headers that are dropped without being reported.
const_ignore_fields = {
    'accept-ranges', 'age',
    'cache-control', 'connection', 'content-type',
    'date',
    'expires',
    'referer',
    'server',
    'via',
    'x-cache', 'x-cache-lookup', 'x-livetool', 'x-powered-by',
}

# Upper bound, in bytes, for a single read while streaming data.
block_size = 8192
98:
class MyRequestHandler(BaseHTTPServer.BaseHTTPRequestHandler):
    """Answer GET and HEAD requests from files kept under ``config['dir']``.

    The request's Host header selects the config section.  Files are
    fetched from, or re-validated against, ``proto://root`` of that section
    and stored on disk; a shelve index next to them (``.index``) keeps the
    bookkeeping for every path.
    """

    # Client request headers that are accepted and not acted upon.
    __proxy_ignored = frozenset([
        'accept', 'accept-charset', 'accept-encoding', 'accept-language',
        'cache-control', 'connection', 'content-length', 'cookie',
        'host',
        'if-modified-since', 'if-unmodified-since',
        'referer',
        'ua-cpu', 'user-agent',
        'via',
        'x-forwarded-for', 'x-last-hr', 'x-last-http-status-code', 'x-removed', 'x-real-ip', 'x-retry-count',
    ])

    def __process(self):
        """Handle one GET or HEAD request.

        Returns ``()`` when the request is abandoned without a response
        (unparsable Range or unknown request header), otherwise ``None``.
        """
        # the query string is not part of the cache key
        my_path = self.path.partition('?')[0]

        config.section(self.headers['host'])

        if config['sub'] is not None and config['strip'] is not None and len(config['strip']) > 0:
            my_path = re.compile(config['strip']).sub(config['sub'], my_path)

        info = 'Checking file: ' + my_path

        if not os.access(config['dir'], os.X_OK):
            os.mkdir(config['dir'])
        # this is file index - everything is stored in this file
        # _parts - list of stored parts of file
        # _time - last time the file was checked
        # everything else is just the headers
        index = bsddb.dbshelve.open(config['dir'] + os.sep + '.index')
        try:
            return self.__handle(index, my_path, info)
        finally:
            # the index is opened per request, so it must be released on
            # every exit path, including exceptions
            index.close()

    def __handle(self, index, my_path, info):
        """Decide whether `my_path` needs refreshing, refresh it and respond."""
        # must_reload means file needs to be reloaded to serve request
        must_reload = False
        # recheck means file needs to be checked, this also means that if
        # file has been modified we can serve older copy
        recheck = False
        # file_stat means file definitely exists
        file_stat = None
        # requested_ranges holds data about any range requested
        requested_ranges = None

        print('===============[ {} request ]==='.format(self.command))

        for header in self.headers:
            if header in self.__proxy_ignored:
                pass
            elif header == 'range':
                is_range = re.compile(r'bytes=(\d+)-(\d+)').match(self.headers[header])
                if is_range:
                    requested_ranges = spacemap.SpaceMap({int(is_range.group(1)): int(is_range.group(2)) + 1})
                else:
                    # unsupported Range syntax: give up without a response
                    return ()
            elif header == 'pragma':
                if my_path in index:
                    # the shelve hands out a copy, so the changed entry has
                    # to be written back to be stored at all
                    entry = index[my_path]
                    entry[header] = self.headers[header]
                    index[my_path] = entry
            else:
                print('Unknown header - ', header, ': ', self.headers[header], sep='')
                return ()
            print(header, self.headers[header])

        # creating file name from my_path
        file_name = config['dir'] + os.sep + my_path.replace('%20', ' ')
        # partial file or unfinished download
        temp_name = config['dir'] + os.sep + '.parts' + my_path.replace('%20', ' ')

        # creating empty placeholder in index
        # if there's no space map and there's no file in real directory - we have no file
        # if there's an empty space map - file is full
        # space map generally covers every bit of file we don't possess currently
        if my_path not in index:
            info += '\nThis one is new.'
            must_reload = True
            record = {}
        else:
            # forcibly checking file if no file present
            record = index[my_path]
            if os.access(file_name, os.R_OK):
                info += '\nFull file found.'
                file_stat = os.stat(file_name)
            elif '_parts' in record and os.access(temp_name, os.R_OK):
                info += '\nPartial file found.'
                file_stat = os.stat(temp_name)
                recheck = True
            else:
                info += '\nFile not found or inaccessible.'
                record['_parts'] = None
                must_reload = True

        if '_parts' not in record:
            record['_parts'] = None

        if record['_parts'] is None:
            recheck = True

        # forcibly checking file if file size doesn't match with index data
        if not must_reload:
            if '_parts' in record and record['_parts'] == spacemap.SpaceMap():
                if 'content-length' in record and file_stat and file_stat.st_size != int(record['content-length']):
                    info += '\nFile size is {} and stored file size is {}.'.format(file_stat.st_size, record['content-length'])
                    record['_parts'] = None
                    must_reload = True

        # forcibly checking file if index holds Pragma header
        if not must_reload and 'pragma' in record and record['pragma'] == 'no-cache':
            info +='\nPragma on: recheck imminent.'
            recheck = True

        # rechecking the file when the last check is more than 4 hours old
        if not recheck and not must_reload and '_time' in record and (record['_time'] - datetime.datetime.now() + datetime.timedelta(hours = 4)).days < 0:
            info += '\nFile is old - rechecking.'
            recheck = True

        print(info)
        if must_reload or recheck:
            self.__refresh(index, my_path, record, must_reload, requested_ranges, file_name, temp_name)

        # nothing stored and nothing fetched: report a gateway failure.
        # This check has to precede any index[my_path] lookup.
        if my_path not in index:
            self.send_response(502)
            self.end_headers()
            return

        entry = index[my_path]
        print(entry)

        if not os.access(file_name, os.R_OK) and os.access(temp_name, os.R_OK) and '_parts' in entry and entry['_parts'] == spacemap.SpaceMap():
            # just moving
            # drop old dirs XXX
            print('Moving temporary file to new destination.')
            os.renames(temp_name, file_name)

        self.__respond(entry, file_name, temp_name, requested_ranges)

    def __refresh(self, index, my_path, record, must_reload, requested_ranges, file_name, temp_name):
        """Query upstream for `my_path`, update the index and download data.

        `record` is the current index entry (with '_parts' always present),
        `must_reload` tells whether stored data is already known to be
        unusable.  Upstream URL errors are printed and otherwise ignored so
        that the caller can fall back to whatever is stored.
        """
        desc_fields = const_desc_fields.copy()
        ignore_fields = const_ignore_fields.copy()
        if config['noetag'] == 'no':
            desc_fields.add('etag')
        else:
            ignore_fields.add('etag')

        response = None
        try:
            url = config['proto'] + '://' + config['root'] + self.path
            my_headers = {}
            for header in ('cache-control', 'cookie', 'referer', 'user-agent'):
                if header in self.headers:
                    my_headers[header] = self.headers[header]

            # working out which byte ranges have to be requested
            needed = None
            if self.command != 'HEAD':
                if '_parts' in record and record['_parts'] is not None:
                    if config['noparts'] != 'no' or requested_ranges is None or requested_ranges == spacemap.SpaceMap():
                        needed = record['_parts']
                    else:
                        needed = record['_parts'] & requested_ranges
                elif config['noparts'] =='no' and requested_ranges is not None and requested_ranges != spacemap.SpaceMap():
                    needed = requested_ranges
            print('Missing ranges: {}, requested ranges: {}, needed ranges: {}.'.format(record['_parts'], requested_ranges, needed))
            if needed is not None and len(needed) > 0:
                ranges = []
                needed.rewind()
                while True:
                    span = needed.pop()
                    if span[0] is None:
                        break
                    # space map ends are exclusive, HTTP ranges inclusive
                    ranges.append('{}-{}'.format(span[0], span[1] - 1))
                my_headers['range'] = 'bytes=' + ','.join(ranges)

            my_headers['Accept-Encoding'] = 'gzip, compress, deflate, identity; q=0'
            response = urllib2.urlopen(urllib2.Request(url, headers = my_headers), timeout = 60)
            source = response
            new_record = {}
            new_record['_parts'] = record['_parts']
            headers = response.info()

            if 'content-encoding' in headers and headers['content-encoding'] == 'gzip':
                import gzip
                source = gzip.GzipFile(fileobj=response)

            # stripping unneeded headers (XXX make this inplace?)
            for header in headers:
                if header in desc_fields:
                    if header == 'content-length':
                        # for partial responses the full size comes from
                        # Content-Range instead
                        if 'content-range' not in headers:
                            new_record[header] = int(headers[header])
                    else:
                        new_record[header] = headers[header]
                elif header == 'content-range':
                    content_range = re.compile(r'^bytes (\d+)-(\d+)/(\d+)$').match(headers[header])
                    if not content_range:
                        raise AssertionError('Content-Range unrecognized.')
                    new_record['content-length'] = int(content_range.group(3))
                elif header not in ignore_fields:
                    print('Undefined header "', header, '": ', headers[header], sep='')

            # comparing headers with data found in index
            # if any header has changed (except Pragma) or new headers
            # appear the file is fully downloaded again
            old_keys = set(record.keys())
            old_keys.discard('_time')
            old_keys.discard('pragma')
            more_keys = set(new_record.keys()) - old_keys
            more_keys.discard('pragma')
            less_keys = old_keys - set(new_record.keys())
            if len(more_keys) > 0:
                if len(old_keys) != 0:
                    print('More headers appear:', more_keys)
                must_reload = True
            elif len(less_keys) > 0:
                print('Less headers appear:', less_keys)
            else:
                for key in record.keys():
                    if key[0] != '_' and key != 'pragma' and record[key] != new_record[key]:
                        print('Header "', key, '" changed from [', record[key], '] to [', new_record[key], ']', sep='')
                        print(type(record[key]), type(new_record[key]))
                        must_reload = True

            if must_reload:
                print('Reloading.')
                if os.access(temp_name, os.R_OK):
                    os.unlink(temp_name)
                if os.access(file_name, os.R_OK):
                    os.unlink(file_name)
                if 'content-length' in new_record:
                    # the whole file is missing now
                    new_record['_parts'] = spacemap.SpaceMap({0: int(new_record['content-length'])})
            if not new_record['_parts']:
                new_record['_parts'] = spacemap.SpaceMap()
            print(new_record)

            # downloading file or segment
            if 'content-length' in new_record:
                if needed is None:
                    needed = new_record['_parts']
                elif len(needed) > 1:
                    print("Multipart requests currently not supported.")
                    raise AssertionError('Skip this one for now.')

            new_record['_time'] = datetime.datetime.now()
            if self.command != 'HEAD':
                # file is created at temporary location and moved in place only when download completes
                if not os.access(temp_name, os.R_OK):
                    empty_name = config['dir'] + os.sep + '.tmp'
                    with open(empty_name, 'w+b'):
                        pass
                    os.renames(empty_name, temp_name)
                with open(temp_name, 'r+b') as temp_file:
                    if requested_ranges is None and needed is None:
                        needed = new_record['_parts']
                    needed.rewind()
                    while True:
                        # XXX can make this implicit - one request per range
                        (start, end) = needed.pop()
                        if start is None:
                            break
                        stream_last = start
                        old_record = copy.copy(new_record)
                        chunk = source.read(min(block_size, end - start))
                        while len(chunk) > 0 and stream_last < end:
                            stream_pos = stream_last + len(chunk)
                            if stream_pos > end:
                                raise AssertionError('Received more data then requested: pos:{} start:{} end:{}.'.format(stream_pos, start, end))
                            temp_file.seek(stream_last)
                            temp_file.write(chunk)
                            new_record['_parts'] = new_record['_parts'] - spacemap.SpaceMap({stream_last: stream_pos})
                            # the index is written one block behind the file
                            index[my_path] = old_record
                            index.sync()
                            old_record = copy.copy(new_record)
                            stream_last = stream_pos
                            chunk = source.read(min(block_size, end - stream_last))

            index[my_path] = new_record
            index.sync()

        except urllib2.URLError as error:
            # in case of error we don't need to do anything actually,
            # if file download stalls or fails the file would not be moved to its location
            # (URLError also covers HTTPError, its subclass)
            print(error)
        finally:
            if response is not None:
                response.close()

    def __respond(self, entry, file_name, temp_name, requested_ranges):
        """Send the stored file described by index `entry` to the client."""
        if self.command == 'HEAD':
            self.send_response(200)
            if 'content-length' in entry:
                self.send_header('content-length', entry['content-length'])
            self.send_header('accept-ranges', 'bytes')
            self.send_header('content-type', 'application/octet-stream')
            if 'last-modified' in entry:
                self.send_header('last-modified', entry['last-modified'])
            self.end_headers()
            return

        # an incomplete download is served from the partial file
        if ('_parts' in entry and entry['_parts'] != spacemap.SpaceMap()) or not os.access(file_name, os.R_OK):
            file_name = temp_name

        with open(file_name, 'rb') as real_file:
            file_stat = os.stat(file_name)
            if 'range' in self.headers:
                self.send_response(206)
                ranges = []
                requested_ranges.rewind()
                while True:
                    pair = requested_ranges.pop()
                    if pair[0] is None:
                        break
                    ranges.append('{}-{}'.format(pair[0], str(pair[1] - 1)))
                self.send_header('content-range', 'bytes {}/{}'.format(','.join(ranges), entry['content-length']))
            else:
                self.send_response(200)
                self.send_header('content-length', str(file_stat.st_size))
                requested_ranges = spacemap.SpaceMap({0: file_stat.st_size})
            if 'last-modified' in entry:
                self.send_header('last-modified', entry['last-modified'])
            self.send_header('content-type', 'application/octet-stream')
            self.end_headers()
            if self.command == 'GET':
                if len(requested_ranges) > 0:
                    requested_ranges.rewind()
                    (start, end) = requested_ranges.pop()
                else:
                    start = 0
                    # XXX ugly hack
                    if 'content-length' in entry:
                        end = entry['content-length']
                    else:
                        end = 0
                real_file.seek(start)
                req_block_size = min(block_size, end - start)
                chunk = real_file.read(req_block_size)
                while len(chunk) > 0:
                    self.wfile.write(chunk)
                    start += len(chunk)
                    if req_block_size > end - start:
                        req_block_size = end - start
                    if req_block_size == 0:
                        break
                    chunk = real_file.read(req_block_size)

    def do_HEAD(self):
        """Entry point for HEAD requests."""
        return self.__process()

    def do_GET(self):
        """Entry point for GET requests."""
        return self.__process()
457:
# The listening port is an option of the [general] section, so that section
# has to be active before config['port'] is read.
config.section('general')
server = BaseHTTPServer.HTTPServer(
    ('127.0.0.1', int(config['port'])),
    MyRequestHandler,
)
# Blocks and serves requests one at a time on the loopback interface.
server.serve_forever()
461:
462: #gevent.joinall()