Lines of samesite.py from check-in 31a8af9ff1 that are changed by the sequence of edits moving toward check-in a81f1a70fb:
1: #!/usr/bin/env python
2:
3: from __future__ import unicode_literals, print_function
4:
5: import bsddb.dbshelve, copy, datetime, os, BaseHTTPServer, sys, spacemap, re, urllib2
6:
7: class Config:
8: __slots__ = frozenset(['_config', '_default', '_section', 'options', 'root'])
9: _default = {
10: 'general': {
11: 'port': '8008',
12: },
13: '_other': {
14: 'verbose': 'no',
15: 'noetag': 'no',
16: 'noparts': 'no',
17: 'strip': '',
18: 'sub': '',
19: },}
20:
21: # function to read in config file
22: def __init__(self):
23: import ConfigParser, optparse
24:
25: parser = optparse.OptionParser()
26: parser.add_option('-c', '--config', dest = 'config', help = 'config file location', metavar = 'FILE', default = 'samesite.conf')
27: (self.options, args) = parser.parse_args()
28:
29: assert os.access(self.options.config, os.R_OK), "Fatal error: can't read {}".format(self.options.config)
30:
31: configDir = re.compile('^(.*)/[^/]+$').match(self.options.config)
32: if configDir:
33: self.root = configDir.group(1)
34: else:
35: self.root = os.getcwd()
36:
37: self._config = ConfigParser.ConfigParser()
38: self._config.readfp(open(self.options.config))
39:
40: for section in self._config.sections():
41: if section != 'general':
42: if self._config.has_option(section, 'dir'):
43: if re.compile('^/$').match(self._config.get(section, 'dir')):
44: self._config.set(section, 'dir', self.root + os.sep + section)
45: thisDir = re.compile('^(.*)/$').match(self._config.get(section, 'dir'))
46: if thisDir:
47: self._config.set(section, 'dir', thisDir.group(1))
48: if not re.compile('^/(.*)$').match(self._config.get(section, 'dir')):
49: self._config.set(section, 'dir', self.root + os.sep + self._config.get(section, 'dir'))
50: else:
51: self._config.set(section, 'dir', self.root + os.sep + section)
52:
53: if not self._config.has_option(section, 'root'):
54: self._config.set(section, 'root', section)
55:
56: # function to select config file section or create one
57: def section(self, section):
58: if not self._config.has_section(section):
59: self._config.add_section(section)
60: self._section = section
61:
62: # function to get a config parameter; if the parameter doesn't exist, the default
63: # value or None is substituted
64: def __getitem__(self, name):
65: if not self._config.has_option(self._section, name):
66: if self._section in self._default:
67: if name in self._default[self._section]:
68: self._config.set(self._section, name, self._default[self._section][name])
69: else:
70: self._config.set(self._section, name, None)
71: elif name in self._default['_other']:
72: self._config.set(self._section, name, self._default['_other'][name])
73: else:
74: self._config.set(self._section, name, None)
75: return(self._config.get(self._section, name))
76:
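# A minimal sketch (illustrative, not part of the check-in) of the kind of
# config file the Config class above expects; the host name and paths below
# are assumptions:
#
#   [general]
#   port = 8008
#
#   [mirror.example.org]
#   root = upstream.example.org
#   dir = /var/cache/samesite/mirror
#
# With such a file, lookups fall back to the _default tables, e.g.
#   config.section('mirror.example.org')
#   config['root']    # 'upstream.example.org', taken from the file
#   config['noetag']  # 'no', taken from _default['_other']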
77: config = Config()
78:
79: #assert options.port or os.access(options.log, os.R_OK), 'Log file unreadable'
80:
81: const_desc_fields = set(['content-length', 'last-modified', 'pragma'])
82: const_ignore_fields = set([
83: 'accept-ranges', 'age',
84: 'cache-control', 'connection', 'content-type',
85: 'date',
86: 'expires',
87: 'referer',
88: 'server',
89: 'via',
90: 'x-cache', 'x-cache-lookup', 'x-livetool', 'x-powered-by',
91: ])
92:
93: block_size = 4096
94:
95: class MyRequestHandler(BaseHTTPServer.BaseHTTPRequestHandler):
96: def __process(self):
97: # reload means file needs to be reloaded to serve request
98: reload = False
99: # recheck means the file needs to be checked; this also means that if the file has been modified we can still serve the older copy
100: recheck = False
101: # file_stat means file definitely exists
102: file_stat = None
103: # requested_ranges holds data about any range requested
104: requested_ranges = None
105: # record holds data from the index locally; it should be written back upon successful completion
106: record = None
107:
108: myPath = re.compile('^(.*?)(\?.*)$').match(self.path)
109: if myPath:
110: my_path = myPath.group(1)
111: else:
112: my_path = self.path
113:
114: config.section(self.headers['host'])
115:
116: if config['sub'] != None and config['strip'] != None and len(config['strip']) > 0:
117: string = re.compile(config['strip']).sub(config['sub'], my_path)
118: my_path = string
119:
120: info = 'Checking file: ' + my_path
121:
122: if not os.access(config['dir'], os.X_OK):
123: os.mkdir(config['dir'])
124: # this is the file index - everything is stored in this file (one record is sketched below)
125: # _parts - space map of the parts of the file we do not have yet
126: # _time - the last time the file was checked
127: # everything else is just the headers
128: index = bsddb.dbshelve.open(config['dir'] + os.sep + '.index')
129:
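# A sketch of one such index record (the path and values are illustrative,
# not data from the check-in):
#
#   index['/dists/stable/Release'] = {
#       '_parts': spacemap.SpaceMap(),            # empty map: file complete
#       '_time': datetime.datetime(2012, 1, 16),  # last time it was checked
#       'content-length': 4096,
#       'last-modified': 'Mon, 16 Jan 2012 00:00:00 GMT',
#   }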
130: desc_fields = const_desc_fields.copy()
131: ignore_fields = const_ignore_fields.copy()
132: if config['noetag'] == 'no':
133: desc_fields.add('etag')
134: else:
135: ignore_fields.add('etag')
136:
137: proxy_ignored = set([
138: 'accept', 'accept-charset', 'accept-encoding', 'accept-language',
139: 'cache-control', 'connection', 'content-length', 'cookie',
140: 'host',
141: 'if-modified-since', 'if-unmodified-since',
142: 'referer',
143: 'user-agent',
144: 'via',
145: 'x-forwarded-for', 'x-last-hr', 'x-last-http-status-code', 'x-removed', 'x-real-ip', 'x-retry-count',
146: ])
147:
148: print('===============[ {} request ]==='.format(self.command))
149:
150: for header in self.headers:
151: if header in proxy_ignored:
152: pass
153: elif header in ('range'):
154: isRange = re.compile('bytes=(\d+)-(\d+)').match(self.headers[header])
155: if isRange:
156: requested_ranges = spacemap.SpaceMap({int(isRange.group(1)): int(isRange.group(2)) + 1})
157: else:
158: return()
159: elif header in ('pragma'):
160: if my_path in index:
161: index[my_path][header] = self.headers[header]
162: else:
163: print('Unknown header - ', header, ': ', self.headers[header], sep='')
164: return()
165: print(header, self.headers[header])
166:
167: # creating file name from my_path
168: file_name = config['dir'] + os.sep + re.compile('%20').sub(' ', my_path)
169: # partial file or unfinished download
170: temp_name = config['dir'] + os.sep + '.parts' + re.compile('%20').sub(' ', my_path)
171:
172: # creating an empty placeholder in the index
173: # if there's no space map and there's no file in the real directory - we have no file
174: # if there's an empty space map - the file is complete
175: # the space map generally covers every bit of the file we don't possess currently (see the sketch after this block)
176: if not my_path in index:
177: info += '\nThis one is new.'
178: reload = True
179: record = {}
180: else:
181: # forcibly checking file if no file present
182: record = index[my_path]
183: if os.access(file_name, os.R_OK):
184: info += '\nFull file found.'
185: file_stat = os.stat(file_name)
186: elif '_parts' in index[my_path] and os.access(temp_name, os.R_OK):
187: info += '\nPartial file found.'
188: file_stat = os.stat(temp_name)
31a8af9ff1 2012-01-16 189: recheck = True
190: else:
191: info += '\nFile not found or inaccessible.'
192: record['_parts'] = None
193: reload = True
194:
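# A minimal sketch of the space-map convention described above, assuming
# spacemap.SpaceMap behaves the way the calls in this file suggest (the byte
# counts are made up):
#
#   missing = spacemap.SpaceMap({0: 1000})           # whole 1000-byte file missing
#   missing = missing - spacemap.SpaceMap({0: 600})  # first 600 bytes written
#   missing.rewind()
#   missing.pop()                   # -> (600, 1000): the one remaining gap
#   missing == spacemap.SpaceMap()  # True only once nothing is missing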
195: if not '_parts' in record:
196: record['_parts'] = None
197:
198: if record['_parts'] == None:
199: recheck = True
200:
201: # forcibly checking the file if its size doesn't match the index data
202: if not reload:
203: if '_parts' in record and record['_parts'] == spacemap.SpaceMap():
204: if 'content-length' in record and file_stat and file_stat.st_size != int(record['content-length']):
205: info += '\nFile size is {} and stored file size is {}.'.format(file_stat.st_size, record['content-length'])
206: record['_parts'] = None
207: reload = True
208:
209: # forcibly checking file if index holds Pragma header
210: if not reload and 'pragma' in record and record['pragma'] == 'no-cache':
211: info +='\nPragma on: recheck imminent.'
212: recheck = True
213:
214: # forcing a recheck if the file hasn't been checked within the last 4 hours
215: if not recheck and not reload and '_time' in record and (record['_time'] - datetime.datetime.now() + datetime.timedelta(hours = 4)).days < 0:
216: info += '\nFile is old - rechecking.'
217: recheck = True
218:
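# Worked example for the age test above (times are illustrative): if _time is
# 5 hours old, _time - now + 4h is roughly -1 hour, whose .days is -1 (< 0),
# so a recheck is forced; if _time is 1 hour old the result is roughly
# +3 hours, .days is 0, and the cached copy is still considered fresh.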
219: print(info)
220: if reload or recheck:
221:
222: try:
223: request = 'http://' + config['root'] + self.path
224: my_headers = {}
225: for header in ('cache-control', 'cookie', 'referer', 'user-agent'):
226: if header in self.headers:
227: my_headers[header] = self.headers[header]
228:
229: needed = None
230: if self.command not in ('HEAD'):
231: if '_parts' in record and record['_parts'] != None:
232: if config['noparts'] != 'no' or requested_ranges == None or requested_ranges == spacemap.SpaceMap():
233: needed = record['_parts']
234: else:
235: needed = record['_parts'] & requested_ranges
236: elif config['noparts'] =='no' and requested_ranges != None and requested_ranges != spacemap.SpaceMap():
237: needed = requested_ranges
238: ranges = ()
239: print('Missing ranges: {}, requested ranges: {}, needed ranges: {}.'.format(record['_parts'], requested_ranges, needed))
240: if needed != None and len(needed) > 0:
241: needed.rewind()
242: while True:
243: range = needed.pop()
244: if range[0] == None:
245: break
246: ranges += '{}-{}'.format(range[0], range[1] - 1),
247: my_headers['range'] = 'bytes=' + ','.join(ranges)
248:
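# Illustration with assumed values: if needed covers the byte ranges
# {0: 100, 500: 1000}, the loop above yields ranges == ('0-99', '500-999')
# and sends my_headers['range'] == 'bytes=0-99,500-999'; HTTP byte ranges are
# inclusive, hence the "- 1" on each upper bound.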
249: my_headers['Accept-Encoding'] = 'gzip'
250: request = urllib2.Request(request, headers = my_headers)
251:
31a8af9ff1 2012-01-16 252: source = urllib2.urlopen(request)
253: new_record = {}
254: new_record['_parts'] = record['_parts']
255: headers = source.info()
256:
257: if 'content-encoding' in headers and headers['content-encoding'] == 'gzip':
31a8af9ff1 2012-01-16 258: import gzip, StringIO
31a8af9ff1 2012-01-16 259: buf = StringIO.StringIO(source.read())
31a8af9ff1 2012-01-16 260: source = gzip.GzipFile(fileobj=buf)
261:
262: # stripping unneeded headers (XXX make this inplace?)
263: for header in headers:
264: if header in desc_fields:
265: #if header == 'Pragma' and headers[header] != 'no-cache':
266: if header == 'content-length':
267: if 'content-range' not in headers:
268: new_record[header] = int(headers[header])
269: else:
270: new_record[header] = headers[header]
271: elif header == 'content-range':
272: range = re.compile('^bytes (\d+)-(\d+)/(\d+)$').match(headers[header])
273: if range:
274: new_record['content-length'] = int(range.group(3))
275: else:
276: assert False, 'Content-Range unrecognized.'
277: elif not header in ignore_fields:
278: print('Undefined header "', header, '": ', headers[header], sep='')
279:
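# Example with assumed numbers: a partial reply may carry
#   Content-Range: bytes 0-4095/1048576
# in which case the branch above stores content-length = 1048576 (the total
# size of the file), not the size of the fragment received.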
280: # comparing headers with the data found in the index
281: # if any header (except Pragma) has changed, the file is downloaded again in full
282: # the same happens if we get more or fewer headers than before
283: old_keys = set(record.keys())
284: old_keys.discard('_time')
285: old_keys.discard('pragma')
286: more_keys = set(new_record.keys()) - old_keys
287: more_keys.discard('pragma')
288: less_keys = old_keys - set(new_record.keys())
289: if len(more_keys) > 0:
290: if len(old_keys) != 0:
291: print('More headers appear:', more_keys)
292: reload = True
293: elif len(less_keys) > 0:
294: print('Fewer headers appear:', less_keys)
295: else:
296: for key in record.keys():
297: if key[0] != '_' and key != 'pragma' and record[key] != new_record[key]:
298: print('Header "', key, '" changed from [', record[key], '] to [', new_record[key], ']', sep='')
299: print(type(record[key]), type(new_record[key]))
300: reload = True
301:
302: if reload:
303: print('Reloading.')
304: if os.access(temp_name, os.R_OK):
305: os.unlink(temp_name)
306: if os.access(file_name, os.R_OK):
307: os.unlink(file_name)
308: if 'content-length' in new_record:
309: new_record['_parts'] = spacemap.SpaceMap({0: int(new_record['content-length'])})
310: if not new_record['_parts']:
311: new_record['_parts'] = spacemap.SpaceMap()
312: print(new_record)
313:
314: # downloading file or segment
315: if 'content-length' in new_record:
316: if needed == None:
317: needed = new_record['_parts']
318: else:
319: if len(needed) > 1:
320: print("Multipart requests currently not supported.")
321: assert False, 'Skip this one for now.'
322: #else:
323: #assert False, 'No content-length or Content-Range header.'
324:
325: new_record['_time'] = datetime.datetime.now()
326: if self.command not in ('HEAD'):
327: # file is created at temporary location and moved in place only when download completes
328: if not os.access(temp_name, os.R_OK):
329: empty_name = config['dir'] + os.sep + '.tmp'
330: with open(empty_name, 'w+b') as some_file:
331: pass
332: os.renames(empty_name, temp_name)
333: temp_file = open(temp_name, 'r+b')
334: if requested_ranges == None and needed == None:
335: needed = new_record['_parts']
336: needed.rewind()
337: while True:
338: # XXX can make this implicit - one request per range
339: (start, end) = needed.pop()
340: if start == None:
341: break
342: stream_last = start
343: old_record = copy.copy(new_record)
344: if end - start < block_size:
345: req_block_size = end - start
346: else:
347: req_block_size = block_size
348: buffer = source.read(req_block_size)
349: length = len(buffer)
350: while length > 0 and stream_last < end:
351: stream_pos = stream_last + length
352: assert stream_pos <= end, 'Received more data than requested: pos:{} start:{} end:{}.'.format(stream_pos, start, end)
353: temp_file.seek(stream_last)
354: temp_file.write(buffer)
355: x = new_record['_parts'] - spacemap.SpaceMap({stream_last: stream_pos})
356: new_record['_parts'] = new_record['_parts'] - spacemap.SpaceMap({stream_last: stream_pos})
357: index[my_path] = old_record
358: index.sync()
359: old_record = copy.copy(new_record)
360: stream_last = stream_pos
361: if end - stream_last < block_size:
362: req_block_size = end - stream_last
363: buffer = source.read(req_block_size)
364: length = len(buffer)
365: # moving downloaded data to real file
366: temp_file.close()
367:
368: index[my_path] = new_record
369: index.sync()
370:
371: except urllib2.HTTPError as error:
372: # in case of an error we don't actually need to do anything:
373: # if the file download stalls or fails, the file is simply not moved to its final location
374: print(error)
375:
376: print(index[my_path])
377:
378: if not os.access(file_name, os.R_OK) and os.access(temp_name, os.R_OK) and '_parts' in index[my_path] and index[my_path]['_parts'] == spacemap.SpaceMap():
379: # just moving
380: # drop old dirs XXX
381: print('Moving temporary file to new destination.')
382: os.renames(temp_name, file_name)
383:
384: if not my_path in index:
385: self.send_response(502)
386: self.end_headers()
387: return
388:
389: if self.command == 'HEAD':
390: self.send_response(200)
391: if 'content-length' in index[my_path]:
392: self.send_header('content-length', index[my_path]['content-length'])
393: self.send_header('accept-ranges', 'bytes')
394: self.send_header('content-type', 'application/octet-stream')
395: if 'last-modified' in index[my_path]:
396: self.send_header('last-modified', index[my_path]['last-modified'])
397: self.end_headers()
398: else:
399: if ('_parts' in index[my_path] and index[my_path]['_parts'] != spacemap.SpaceMap()) or not os.access(file_name, os.R_OK):
400: file_name = temp_name
401:
402: with open(file_name, 'rb') as real_file:
403: file_stat = os.stat(file_name)
404: if 'range' in self.headers:
405: self.send_response(206)
406: ranges = ()
407: requested_ranges.rewind()
408: while True:
409: pair = requested_ranges.pop()
410: if pair[0] == None:
411: break
412: ranges += '{}-{}'.format(pair[0], str(pair[1] - 1)),
413: self.send_header('content-range', 'bytes {}/{}'.format(','.join(ranges), index[my_path]['content-length']))
414: else:
415: self.send_response(200)
416: self.send_header('content-length', str(file_stat.st_size))
417: requested_ranges = spacemap.SpaceMap({0: file_stat.st_size})
418: if 'last-modified' in index[my_path]:
419: self.send_header('last-modified', index[my_path]['last-modified'])
420: self.send_header('content-type', 'application/octet-stream')
421: self.end_headers()
422: if self.command in ('GET'):
423: if len(requested_ranges) > 0:
424: requested_ranges.rewind()
425: (start, end) = requested_ranges.pop()
426: else:
427: start = 0
428: # XXX ugly hack
429: if 'content-length' in index[my_path]:
430: end = index[my_path]['content-length']
431: else:
432: end = 0
433: real_file.seek(start)
434: if block_size > end - start:
435: req_block_size = end - start
436: else:
437: req_block_size = block_size
438: buffer = real_file.read(req_block_size)
439: length = len(buffer)
440: while length > 0:
441: self.wfile.write(buffer)
442: start += len(buffer)
443: if req_block_size > end - start:
444: req_block_size = end - start
445: if req_block_size == 0:
446: break
447: buffer = real_file.read(req_block_size)
448: length = len(buffer)
449:
450: def do_HEAD(self):
451: return self.__process()
452: def do_GET(self):
453: return self.__process()
454:
455: config.section('general')
456: server = BaseHTTPServer.HTTPServer(('127.0.0.1', int(config['port'])), MyRequestHandler)
457: server.serve_forever()
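
The proxy reads its configuration from the file given with -c (default samesite.conf), listens on 127.0.0.1 at the configured port, and chooses the config section from the request's Host header. A rough way to exercise it, assuming the hypothetical mirror.example.org section sketched earlier and that urllib2 passes an explicit Host header through unchanged:

import urllib2

# hypothetical host and path, matching the sample config section sketched above
req = urllib2.Request('http://127.0.0.1:8008/some/file.tar.gz',
                      headers = {'Host': 'mirror.example.org'})
reply = urllib2.urlopen(req)
print(reply.info())  # headers as served from the local cache or from upstream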