Lines of
samesite.py
from check-in b67d2538b2
that are changed by the sequence of edits moving toward
check-in a2857db2b5:
1: #!/usr/bin/env python3.2
2:
3: from __future__ import unicode_literals, print_function
4:
5: #import gevent.monkey
6: #gevent.monkey.patch_all()
7:
8: import argparse, os
9: parser = argparse.ArgumentParser()
10: parser.add_argument('-c', '--config', dest = 'config', help = 'config file location', metavar = 'FILE', default = 'samesite.conf')
11: args = parser.parse_args()
12: assert os.access(args.config, os.R_OK), "Fatal error: can't read {}".format(args.config)
13:
14: import configparser
15: config = configparser.ConfigParser({
16: 'port': '8008',
17: 'verbose': 'no',
18: 'noetag': 'no',
19: 'noparts': 'no',
20: 'strip': '',
21: 'sub': '',
22: 'proto': 'http',
23: })
24: config.read(args.config)
25:
26: cache_dir = os.path.realpath(os.path.dirname(args.config))
27:
28: import re
29: for section in config.sections():
30: if section != 'DEFAULT':
31: if 'dir' in config[section]:
32: if not re.compile('^/.*').match(config[section]['dir']):
33: config[section]['dir'] = cache_dir + os.sep + section
34: thisDir = re.compile('^(.*)/$').match(config[section]['dir'])
35: if thisDir:
36: config[section]['dir'] = thisDir.group(1)
37: if not re.compile('^/(.*)$').match(config[section]['dir']):
38: config[section]['dir'] = cache_dir + os.sep + config[section]['dir']
39: else:
40: config[section]['dir'] = cache_dir + os.sep + section
41:
42: if not 'root' in config[section]:
43: config[section]['root'] = section
44:
45: #assert options.port or os.access(options.log, os.R_OK), 'Log file unreadable'
46:
47: const_desc_fields = set(['Content-Length', 'Last-Modified', 'Pragma'])
48: const_ignore_fields = set([
49: 'Accept-Ranges', 'Age',
50: 'Cache-Control', 'Connection', 'Content-Type',
51: 'Date',
52: 'Expires',
53: 'Referer',
54: 'Server',
55: 'Via',
56: 'X-Cache', 'X-Cache-Lookup', 'X-Livetool', 'X-Powered-By',
57: ])
58:
59: block_size = 8192
60:
61: import bsddb3.dbshelve, copy, datetime, http.server, spacemap, urllib.request, urllib.error
62:
class MyRequestHandler(http.server.BaseHTTPRequestHandler):
    """Serve GET/HEAD requests from a local cache.

    Files are cached per configured host section; missing or stale files
    (tracked through a bsddb3 shelve index plus a spacemap of byte ranges
    still to be fetched) are (re)downloaded from the upstream host before
    being served.
    """

    def __process(self):
        """Common implementation for do_GET and do_HEAD; branches on self.command."""
        # reload means file needs to be reloaded to serve request
        reload = False
        # recheck means file needs to be checked, this also means that if file
        # has been modified we can serve an older copy
        recheck = False
        # file_stat means file definitely exists
        file_stat = None
        # requested_ranges holds data about any range requested
        requested_ranges = None
        # record holds data from the index locally, written back on success
        record = None

        # strip the query string from the requested path
        myPath = re.compile(r'^(.*?)(\?.*)$').match(self.path)
        if myPath:
            my_path = myPath.group(1)
        else:
            my_path = self.path

        # each virtual host has its own config section, keyed by Host header
        config_host = config[self.headers['Host']]

        # optional path rewriting: 'strip' is a regex, 'sub' its replacement
        if config_host['sub'] is not None and config_host['strip'] is not None and len(config_host['strip']) > 0:
            string = re.compile(config_host['strip']).sub(config_host['sub'], my_path)
            my_path = string

        my_path_b = my_path.encode('utf-8')
        info = 'Checking file: ' + my_path

        if not os.access(config_host['dir'], os.X_OK):
            os.mkdir(config_host['dir'])
        # this is file index - everything is stored in this file
        # _parts - list of stored parts of file
        # _time - last time the file was checked
        # everything else is just the headers
        index = bsddb3.dbshelve.open(config_host['dir'] + os.sep + '.index')

        desc_fields = const_desc_fields.copy()
        ignore_fields = const_ignore_fields.copy()
        if config_host['noetag'] == 'no':
            desc_fields.add('ETag')
        else:
            ignore_fields.add('ETag')

        # request headers that are recognised but never forwarded upstream
        proxy_ignored = set([
            'Accept', 'Accept-Charset', 'Accept-Encoding', 'Accept-Language',
            'Cache-Control', 'Connection', 'Content-Length', 'Cookie',
            'Host',
            'If-Modified-Since', 'If-Unmodified-Since',
            'Referer',
            'Ua-Cpu', 'User-Agent',
            'Via',
            'X-Forwarded-For', 'X-Last-HR', 'X-Last-HTTP-Status-Code', 'X-Old-UID', 'X-Removed', 'X-Real-IP', 'X-Retry-Count',
        ])

        print('===============[ {} request ]==='.format(self.command))

        for header in self.headers:
            if header in proxy_ignored:
                pass
            # BUG FIX: was `header in ('Range')` - a substring test against
            # the string 'Range', not tuple membership; compare directly.
            elif header == 'Range':
                isRange = re.compile(r'bytes=(\d+)-(\d+)').match(self.headers[header])
                if isRange:
                    # HTTP ranges are inclusive; SpaceMap ends are exclusive
                    requested_ranges = spacemap.SpaceMap({int(isRange.group(1)): int(isRange.group(2)) + 1})
                else:
                    # unsupported range syntax - bail out
                    return
            elif header == 'Pragma':
                if my_path_b in index:
                    # NOTE(review): this mutates the object returned by the
                    # shelve lookup; without a write-back the change may not
                    # persist - confirm against bsddb3.dbshelve semantics.
                    index[my_path_b][header] = self.headers[header]
            else:
                print('Unknown header - ', header, ': ', self.headers[header], sep='')
                return
            print(header, self.headers[header])

        # creating file name from my_path
        file_name = config_host['dir'] + os.sep + re.compile('%20').sub(' ', my_path)
        # partial file or unfinished download
        temp_name = config_host['dir'] + os.sep + '.parts' + re.compile('%20').sub(' ', my_path)

        # creating empty placeholder in index
        # if there's no space map and there's no file in real directory - we have no file
        # if there's an empty space map - file is full
        # space map generally covers every bit of file we don't possess currently
        if my_path_b not in index:
            info += '\nThis one is new.'
            reload = True
            record = {}
        else:
            # forcibly checking file if no file present
            record = index[my_path_b]
            if os.access(file_name, os.R_OK):
                info += '\nFull file found.'
                file_stat = os.stat(file_name)
            elif '_parts' in index[my_path_b] and os.access(temp_name, os.R_OK):
                info += '\nPartial file found.'
                file_stat = os.stat(temp_name)
                recheck = True
            else:
                info += '\nFile not found or inaccessible.'
                record['_parts'] = None
                reload = True

        if '_parts' not in record:
            record['_parts'] = None

        if record['_parts'] is None:
            recheck = True

        # forcibly checking file if file size doesn't match with index data
        if not reload:
            if '_parts' in record and record['_parts'] == spacemap.SpaceMap():
                # BUG FIX: the index stores 'Content-Length' (capitalised);
                # the old lowercase 'content-length' lookup never matched,
                # so this staleness check never fired.
                if 'Content-Length' in record and file_stat and file_stat.st_size != int(record['Content-Length']):
                    info += '\nFile size is {} and stored file size is {}.'.format(file_stat.st_size, record['Content-Length'])
                    record['_parts'] = None
                    reload = True

        # forcibly checking file if index holds Pragma header
        # BUG FIX: the key is stored as 'Pragma', not 'pragma'.
        if not reload and 'Pragma' in record and record['Pragma'] == 'no-cache':
            info += '\nPragma on: recheck imminent.'
            recheck = True

        # skipping file processing if there's no need to recheck it and we
        # have checked it less than 4 hours ago
        if not recheck and not reload and '_time' in record and (record['_time'] - datetime.datetime.now() + datetime.timedelta(hours = 4)).days < 0:
            info += '\nFile is old - rechecking.'
            recheck = True

        print(info)
        if reload or recheck:

            try:
                request = config_host['proto'] + '://' + config_host['root'] + self.path
                my_headers = {}
                for header in ('Accept', 'Cache-Control', 'Cookie', 'Referer', 'User-Agent'):
                    if header in self.headers:
                        my_headers[header] = self.headers[header]

                needed = None
                # BUG FIX: `not in ('HEAD')` was a substring test on a plain
                # string; only GET/HEAD reach this handler, compare directly.
                if self.command != 'HEAD':
                    if '_parts' in record and record['_parts'] is not None:
                        if config_host['noparts'] != 'no' or requested_ranges is None or requested_ranges == spacemap.SpaceMap():
                            needed = record['_parts']
                        else:
                            needed = record['_parts'] & requested_ranges
                    elif config_host['noparts'] == 'no' and requested_ranges is not None and requested_ranges != spacemap.SpaceMap():
                        needed = requested_ranges
                    ranges = ()
                    print('Missing ranges: {}, requested ranges: {}, needed ranges: {}.'.format(record['_parts'], requested_ranges, needed))
                    if needed is not None and len(needed) > 0:
                        # translate the spacemap back into an HTTP Range header
                        needed.rewind()
                        while True:
                            range = needed.pop()
                            if range[0] is None:
                                break
                            ranges += '{}-{}'.format(range[0], range[1] - 1),
                        my_headers['Range'] = 'bytes=' + ','.join(ranges)

                # ask upstream to avoid compressed encodings (q=0 on all)
                my_headers['Accept-Encoding'] = 'gzip, compress, deflate, identity; q=0'
                request = urllib.request.Request(request, headers = my_headers)

                source = urllib.request.urlopen(request, timeout = 60)
                new_record = {}
                new_record['_parts'] = record['_parts']
                headers = source.info()

                # upstream may still gzip the body despite q=0 - unwrap it
                if 'Content-Encoding' in headers and headers['Content-Encoding'] == 'gzip':
                    import gzip
                    source = gzip.GzipFile(fileobj=source)

                # stripping unneeded headers (XXX make this inplace?)
                for header in headers:
                    if header in desc_fields:
                        #if header == 'Pragma' and headers[header] != 'no-cache':
                        if header == 'Content-Length':
                            # for a range reply the total size comes from
                            # Content-Range instead
                            if 'Content-Range' not in headers:
                                new_record[header] = int(headers[header])
                        else:
                            new_record[header] = headers[header]
                    elif header == 'Content-Range':
                        range = re.compile(r'^bytes (\d+)-(\d+)/(\d+)$').match(headers[header])
                        if range:
                            new_record['Content-Length'] = int(range.group(3))
                        else:
                            assert False, 'Content-Range unrecognized.'
                    elif header not in ignore_fields:
                        print('Undefined header "', header, '": ', headers[header], sep='')

                # comparing headers with data found in index
                # if any header has changed (except Pragma) file is fully downloaded
                # same if we get more or less headers
                old_keys = set(record.keys())
                old_keys.discard('_time')
                old_keys.discard('Pragma')
                more_keys = set(new_record.keys()) - old_keys
                more_keys.discard('Pragma')
                less_keys = old_keys - set(new_record.keys())
                if len(more_keys) > 0:
                    if len(old_keys) != 0:
                        print('More headers appear:', more_keys)
                    reload = True
                elif len(less_keys) > 0:
                    print('Less headers appear:', less_keys)
                else:
                    for key in record.keys():
                        if key[0] != '_' and key != 'Pragma' and record[key] != new_record[key]:
                            print('Header "', key, '" changed from [', record[key], '] to [', new_record[key], ']', sep='')
                            print(type(record[key]), type(new_record[key]))
                            reload = True

                if reload:
                    print('Reloading.')
                    if os.access(temp_name, os.R_OK):
                        os.unlink(temp_name)
                    if os.access(file_name, os.R_OK):
                        os.unlink(file_name)
                    if 'Content-Length' in new_record:
                        # everything is missing: the whole file needs fetching
                        new_record['_parts'] = spacemap.SpaceMap({0: int(new_record['Content-Length'])})
                if not new_record['_parts']:
                    new_record['_parts'] = spacemap.SpaceMap()
                print(new_record)

                # downloading file or segment
                if 'Content-Length' in new_record:
                    if needed is None:
                        needed = new_record['_parts']
                    else:
                        if len(needed) > 1:
                            print("Multipart requests currently not supported.")
                            assert False, 'Skip this one for now.'
                #else:
                    #assert False, 'No content-length or Content-Range header.'

                new_record['_time'] = datetime.datetime.now()
                if self.command != 'HEAD':
                    # file is created at temporary location and moved in place
                    # only when download completes
                    if not os.access(temp_name, os.R_OK):
                        empty_name = config_host['dir'] + os.sep + '.tmp'
                        with open(empty_name, 'w+b') as some_file:
                            pass
                        os.renames(empty_name, temp_name)
                    temp_file = open(temp_name, 'r+b')
                    if requested_ranges is None and needed is None:
                        needed = new_record['_parts']
                    needed.rewind()
                    while True:
                        # XXX can make this implicit - one request per range
                        (start, end) = needed.pop()
                        if start is None:
                            break
                        stream_last = start
                        old_record = copy.copy(new_record)
                        if end - start < block_size:
                            req_block_size = end - start
                        else:
                            req_block_size = block_size
                        buffer = source.read(req_block_size)
                        length = len(buffer)
                        while length > 0 and stream_last < end:
                            stream_pos = stream_last + length
                            assert stream_pos <= end, 'Received more data then requested: pos:{} start:{} end:{}.'.format(stream_pos, start, end)
                            temp_file.seek(stream_last)
                            temp_file.write(buffer)
                            # BUG FIX: removed a dead local (`x = ...`) that
                            # duplicated this parts-map subtraction.
                            new_record['_parts'] = new_record['_parts'] - spacemap.SpaceMap({stream_last: stream_pos})
                            # persist progress after every chunk so a crash
                            # loses at most one block
                            index[my_path_b] = old_record
                            index.sync()
                            old_record = copy.copy(new_record)
                            stream_last = stream_pos
                            if end - stream_last < block_size:
                                req_block_size = end - stream_last
                            buffer = source.read(req_block_size)
                            length = len(buffer)
                    # moving downloaded data to real file
                    temp_file.close()

                index[my_path_b] = new_record
                index.sync()

            except urllib.error.HTTPError as error:
                # in case of error we don't need to do anything actually,
                # if file download stalls or fails the file would not be
                # moved to its location
                print(error, repr(my_headers))

        print(index[my_path_b])

        # an empty spacemap means the download is complete
        if not os.access(file_name, os.R_OK) and os.access(temp_name, os.R_OK) and '_parts' in index[my_path_b] and index[my_path_b]['_parts'] == spacemap.SpaceMap():
            # just moving
            # drop old dirs XXX
            print('Moving temporary file to new destination.')
            os.renames(temp_name, file_name)

        if my_path_b not in index:
            # nothing cached and nothing fetched - upstream failed us
            self.send_response(502)
            self.end_headers()
            return

        if self.command == 'HEAD':
            self.send_response(200)
            if 'Content-Length' in index[my_path_b]:
                self.send_header('Content-Length', index[my_path_b]['Content-Length'])
            self.send_header('Accept-Ranges', 'bytes')
            self.send_header('Content-Type', 'application/octet-stream')
            if 'Last-Modified' in index[my_path_b]:
                self.send_header('Last-Modified', index[my_path_b]['Last-Modified'])
            self.end_headers()
        else:
            # serve the partial file when the full one is incomplete or missing
            if ('_parts' in index[my_path_b] and index[my_path_b]['_parts'] != spacemap.SpaceMap()) or not os.access(file_name, os.R_OK):
                file_name = temp_name

            with open(file_name, 'rb') as real_file:
                file_stat = os.stat(file_name)
                if 'Range' in self.headers:
                    self.send_response(206)
                    ranges = ()
                    requested_ranges.rewind()
                    while True:
                        pair = requested_ranges.pop()
                        if pair[0] is None:
                            break
                        ranges += '{}-{}'.format(pair[0], str(pair[1] - 1)),
                    self.send_header('Content-Range', 'bytes {}/{}'.format(','.join(ranges), index[my_path_b]['Content-Length']))
                else:
                    self.send_response(200)
                    self.send_header('Content-Length', str(file_stat.st_size))
                    # whole file counts as one big requested range below
                    requested_ranges = spacemap.SpaceMap({0: file_stat.st_size})
                if 'Last-Modified' in index[my_path_b]:
                    self.send_header('Last-Modified', index[my_path_b]['Last-Modified'])
                self.send_header('Content-Type', 'application/octet-stream')
                self.end_headers()
                # BUG FIX: was `self.command in ('GET')` - substring test.
                if self.command == 'GET':
                    if len(requested_ranges) > 0:
                        requested_ranges.rewind()
                        (start, end) = requested_ranges.pop()
                    else:
                        start = 0
                        # XXX ugly hack
                        if 'Content-Length' in index[my_path_b]:
                            end = index[my_path_b]['Content-Length']
                        else:
                            end = 0
                    # stream the requested window in block_size chunks
                    real_file.seek(start)
                    if block_size > end - start:
                        req_block_size = end - start
                    else:
                        req_block_size = block_size
                    buffer = real_file.read(req_block_size)
                    length = len(buffer)
                    while length > 0:
                        self.wfile.write(buffer)
                        start += len(buffer)
                        if req_block_size > end - start:
                            req_block_size = end - start
                        if req_block_size == 0:
                            break
                        buffer = real_file.read(req_block_size)
                        length = len(buffer)

    def do_HEAD(self):
        # all logic lives in __process, which branches on self.command
        return self.__process()

    def do_GET(self):
        return self.__process()
422:
# Bind to the loopback interface only; the port comes from the DEFAULT
# section of the configuration file. serve_forever() blocks until killed.
listen_address = ('127.0.0.1', int(config['DEFAULT']['port']))
server = http.server.HTTPServer(listen_address, MyRequestHandler)
server.serve_forever()
425:
426: #gevent.joinall()