Lines of
samesite.py
from check-in 6cf3431e69
that are changed by the sequence of edits moving toward
check-in f57e6e032b:
#!/usr/bin/env python3.3

# samesite - caching http proxy: command-line and configuration handling.

import argparse, os

# The only command-line option is the configuration file location.
parser = argparse.ArgumentParser()
parser.add_argument('-c', '--config', dest = 'config', help = 'config file location', metavar = 'FILE', default = 'samesite.conf')
args = parser.parse_args()
# Use argparse's error channel rather than `assert`: asserts are stripped
# when python runs with -O, which would silently skip this check.
if not os.access(args.config, os.R_OK):
	parser.error("Fatal error: can't read {}".format(args.config))

import configparser
# Defaults applied to every section of the configuration file.
config = configparser.ConfigParser({
	'port': '8008',      # port the proxy listens on
	'verbose': 'no',
	'noetag': 'no',      # 'no' means the ETag header is honoured
	'noparts': 'no',     # 'no' means partial (Range) downloads are allowed
	'strip': '',         # regex stripped from request paths
	'sub': '',           # replacement for the stripped part
	'proto': 'http',     # protocol used to talk to the origin server
})
config.read(args.config)

# Per-host cache directories are created next to the configuration file.
cache_dir = os.path.realpath(os.path.dirname(args.config))
import re

# Pattern for stripping one trailing slash; compiled once, outside the loop.
_trailing_slash = re.compile('^(.*)/$')

# Normalize per-section cache settings: every section ends up with an
# absolute 'dir' (its cache directory) and a 'root' (origin host name,
# defaulting to the section name).  configparser's sections() never
# includes DEFAULT, so no explicit guard is needed.
for section in config.sections():
	if 'dir' in config[section]:
		# A relative 'dir' is replaced by the per-section default
		# (cache_dir/<section>); only absolute paths are kept as-is.
		if not config[section]['dir'].startswith('/'):
			config[section]['dir'] = cache_dir + os.sep + section
		# Strip a single trailing slash, if any.
		thisDir = _trailing_slash.match(config[section]['dir'])
		if thisDir:
			config[section]['dir'] = thisDir.group(1)
		# Fallback for the degenerate case where stripping the slash left
		# a non-absolute path (e.g. dir was exactly '/').
		if not config[section]['dir'].startswith('/'):
			config[section]['dir'] = cache_dir + os.sep + config[section]['dir']
	else:
		config[section]['dir'] = cache_dir + os.sep + section
	if 'root' not in config[section]:
		config[section]['root'] = section
38:
# Entity headers that describe the cached file and are stored in the index.
const_desc_fields = {'Content-Length', 'Last-Modified', 'Pragma'}
# Origin-server headers that are recognized but deliberately not stored.
const_ignore_fields = {
	'Accept-Ranges', 'Age',
	'Cache-Control', 'Connection', 'Content-Type',
	'Date',
	'Expires',
	'Referer',
	'Server',
	'Via',
	'X-Cache', 'X-Cache-Lookup', 'X-Livetool', 'X-Powered-By',
}

# Size of a single read/write chunk when streaming file data.
block_size = 8192

import bsddb3.dbshelve, copy, datetime, http.server, spacemap, urllib.request, urllib.error
56:
class MyRequestHandler(http.server.BaseHTTPRequestHandler):
	"""Caching HTTP proxy handler.

	GET and HEAD requests are served from a per-host cache directory.  Each
	directory carries a bsddb3 shelve file '.index' mapping the utf-8
	encoded request path to a record dict holding entity headers plus two
	service keys:
	  _parts - spacemap.SpaceMap of byte ranges NOT yet downloaded
	           (an empty map means the file is complete)
	  _time  - datetime of the last check against the origin server
	"""

	def __process(self):
		"""Serve one GET or HEAD request, refreshing the cache as needed."""
		# reload means file needs to be reloaded to serve request
		reload = False
		# recheck means file needs to be checked, this also means that if file hav been modified we can serve older copy
		recheck = False
		# file_stat means file definitely exists
		file_stat = None
		# requested_ranges holds data about any range requested
		requested_ranges = None
		# records holds data from index locally, should be written back upon successfull completion
		record = None

		# Strip the query string from the request path.
		myPath = re.compile('^(.*?)(\?.*)$').match(self.path)
		if myPath:
			my_path = myPath.group(1)
		else:
			my_path = self.path

		# Unknown hosts get a config section on the fly with the defaults:
		# origin root = host name, cache dir = cache_dir/<host>.
		if not config.has_section(self.headers['Host']):
			config.add_section(self.headers['Host'])
			config[self.headers['Host']]['root'] = self.headers['Host']
			config[self.headers['Host']]['dir'] = cache_dir + os.sep + self.headers['Host']
		config_host = config[self.headers['Host']]

		# Optional path rewriting: replace the 'strip' regex with 'sub'.
		if config_host['sub'] != None and config_host['strip'] != None and len(config_host['strip']) > 0:
			string = re.compile(config_host['strip']).sub(config_host['sub'], my_path)
			my_path = string

		# Index keys are bytes.
		my_path_b = my_path.encode('utf-8')
		info = 'Checking file: ' + my_path

		if not os.access(config_host['dir'], os.X_OK):
			os.mkdir(config_host['dir'])
		# this is file index - everything is stored in this file
		# _parts - list of stored parts of file
		# _time - last time the file was checked
		# everything else is just the headers
		index = bsddb3.dbshelve.open(config_host['dir'] + os.sep + '.index')

		# Per-request copies: ETag handling depends on the 'noetag' option.
		desc_fields = const_desc_fields.copy()
		ignore_fields = const_ignore_fields.copy()
		if config_host['noetag'] == 'no':
			desc_fields.add('ETag')
		else:
			ignore_fields.add('ETag')

		# Client headers that are silently dropped and never forwarded.
		proxy_ignored = set([
			'Accept', 'Accept-Charset', 'Accept-Encoding', 'Accept-Language',
			'Cache-Control', 'Connection', 'Content-Length', 'Cookie',
			'Host',
			'If-Modified-Since', 'If-None-Match', 'If-Unmodified-Since',
			'Referer',
			'UA-CPU', 'User-Agent',
			'Via',
			'X-Forwarded-For', 'X-Last-HR', 'X-Last-HTTP-Status-Code', 'X-Old-UID', 'X-Removed', 'X-Real-IP', 'X-Retry-Count',
		])

		print('===============[ {} request ]==='.format(self.command))

		for header in self.headers:
			if header in proxy_ignored:
				pass
			# NOTE(review): ('Range') is not a tuple, so this is a substring
			# test against the string 'Range' - likely meant ('Range',).
			elif header in ('Range'):
				isRange = re.compile('bytes=(\d+)-(\d+)').match(self.headers[header])
				if isRange:
					# SpaceMap ends appear exclusive, hence the +1 on the
					# inclusive HTTP range end - TODO confirm with spacemap.
					requested_ranges = spacemap.SpaceMap({int(isRange.group(1)): int(isRange.group(2)) + 1})
				else:
					# Unsupported Range syntax: give up on this request.
					return()
			# NOTE(review): same missing-comma substring pattern as 'Range'.
			elif header in ('Pragma'):
				if my_path_b in index:
					# NOTE(review): this mutates a copy fetched from the
					# shelve; the change may not persist back - verify.
					index[my_path_b][header] = self.headers[header]
			else:
				print('Unknown header - ', header, ': ', self.headers[header], sep='')
				return()
			print(header, self.headers[header])

		# creating file name from my_path
		file_name = config_host['dir'] + os.sep + re.compile('%20').sub(' ', my_path)
		# partial file or unfinished download
		temp_name = config_host['dir'] + os.sep + '.parts' + re.compile('%20').sub(' ', my_path)

		# creating empty placeholder in index
		# if there's no space map and there's no file in real directory - we have no file
		# if there's an empty space map - file is full
		# space map generally covers every bit of file we don't posess currently
		if not my_path_b in index:
			info += '\nThis one is new.'
			reload = True
			record = {}
		else:
			# forcibly checking file if no file present
			record = index[my_path_b]
			if os.access(file_name, os.R_OK):
				info += '\nFull file found.'
				file_stat = os.stat(file_name)
			elif '_parts' in index[my_path_b] and os.access(temp_name, os.R_OK):
				info += '\nPartial file found.'
				file_stat = os.stat(temp_name)
				recheck = True
			else:
				info += '\nFile not found or inaccessible.'
				record['_parts'] = None
				reload = True

		# A None '_parts' (as opposed to an empty SpaceMap) means "unknown".
		if not '_parts' in record:
			record['_parts'] = None

		if record['_parts'] == None:
			recheck = True

		# forcibly checking file if file size doesn't match with index data
		# NOTE(review): lower-case 'content-length' is tested here while
		# records are stored under 'Content-Length', so this branch looks
		# like it can never fire - confirm.
		if not reload:
			if '_parts' in record and record['_parts'] == spacemap.SpaceMap():
				if 'content-length' in record and file_stat and file_stat.st_size != int(record['content-length']):
					info += '\nFile size is {} and stored file size is {}.'.format(file_stat.st_size, record['content-length'])
					record['_parts'] = None
					reload = True

		# forcibly checking file if index holds Pragma header
		# NOTE(review): lower-case 'pragma' vs stored 'Pragma' - same
		# key-case mismatch as above; confirm.
		if not reload and 'pragma' in record and record['pragma'] == 'no-cache':
			info +='\nPragma on: recheck imminent.'
			recheck = True

		# skipping file processing if there's no need to recheck it and we have checked it at least 4 hours ago
		if not recheck and not reload and '_time' in record and (record['_time'] - datetime.datetime.now() + datetime.timedelta(hours = 4)).days < 0:
			info += '\nFile is old - rechecking.'
			recheck = True

		print(info)
		if reload or recheck:

			try:
				# Build the upstream URL from the per-host origin settings.
				request = config_host['proto'] + '://' + config_host['root'] + self.path
				my_headers = {}
				for header in ('Accept', 'Cache-Control', 'Cookie', 'Referer', 'User-Agent'):
					if header in self.headers:
						my_headers[header] = self.headers[header]

				# needed: byte ranges to request from the origin server.
				needed = None
				# NOTE(review): ('HEAD') is a plain string - substring test;
				# works for 'GET'/'HEAD' commands but is fragile.
				if self.command not in ('HEAD'):
					if '_parts' in record and record['_parts'] != None:
						if config_host['noparts'] != 'no' or requested_ranges == None or requested_ranges == spacemap.SpaceMap():
							needed = record['_parts']
						else:
							needed = record['_parts'] & requested_ranges
					elif config_host['noparts'] =='no' and requested_ranges != None and requested_ranges != spacemap.SpaceMap():
						needed = requested_ranges
					ranges = ()
					print('Missing ranges: {}, requested ranges: {}, needed ranges: {}.'.format(record['_parts'], requested_ranges, needed))
					if needed != None and len(needed) > 0:
						needed.rewind()
						while True:
							range = needed.pop()
							if range[0] == None:
								break
							# SpaceMap ends exclusive, HTTP range ends inclusive.
							ranges += '{}-{}'.format(range[0], range[1] - 1),
						my_headers['Range'] = 'bytes=' + ','.join(ranges)

				#my_headers['Accept-Encoding'] = 'gzip, compress, deflate, identity; q=0'
				request = urllib.request.Request(request, headers = my_headers)

				source = urllib.request.urlopen(request, timeout = 60)
				new_record = {}
				new_record['_parts'] = record['_parts']
				headers = source.info()

				# Transparently decompress gzip-encoded bodies.
				if 'Content-Encoding' in headers and headers['Content-Encoding'] == 'gzip':
					import gzip
					source = gzip.GzipFile(fileobj=source)

				# stripping unneeded headers (XXX make this inplace?)
				for header in headers:
					if header in desc_fields:
						#if header == 'Pragma' and headers[header] != 'no-cache':
						if header == 'Content-Length':
							# With a Content-Range present the total size is
							# taken from that header instead.
							if 'Content-Range' not in headers:
								new_record[header] = int(headers[header])
						else:
							new_record[header] = headers[header]
					elif header == 'Content-Range':
						# Total entity size is the trailing /NNN part.
						range = re.compile('^bytes (\d+)-(\d+)/(\d+)$').match(headers[header])
						if range:
							new_record['Content-Length'] = int(range.group(3))
						else:
							assert False, 'Content-Range unrecognized.'
					elif not header in ignore_fields:
						print('Undefined header "', header, '": ', headers[header], sep='')

				# comparing headers with data found in index
				# if any header has changed (except Pragma) file is fully downloaded
				# same if we get more or less headers
				old_keys = set(record.keys())
				old_keys.discard('_time')
				old_keys.discard('Pragma')
				more_keys = set(new_record.keys()) - old_keys
				more_keys.discard('Pragma')
				less_keys = old_keys - set(new_record.keys())
				if len(more_keys) > 0:
					if len(old_keys) != 0:
						print('More headers appear:', more_keys)
					reload = True
				elif len(less_keys) > 0:
					# NOTE(review): fewer headers are only reported, not
					# treated as a change - confirm this is intended.
					print('Less headers appear:', less_keys)
				else:
					for key in record.keys():
						if key[0] != '_' and key != 'Pragma' and record[key] != new_record[key]:
							print('Header "', key, '" changed from [', record[key], '] to [', new_record[key], ']', sep='')
							print(type(record[key]), type(new_record[key]))
							reload = True

				if reload:
					# A changed entity invalidates both the complete file
					# and any partial download.
					print('Reloading.')
					if os.access(temp_name, os.R_OK):
						os.unlink(temp_name)
					if os.access(file_name, os.R_OK):
						os.unlink(file_name)
					if 'Content-Length' in new_record:
						new_record['_parts'] = spacemap.SpaceMap({0: int(new_record['Content-Length'])})
				if not new_record['_parts']:
					new_record['_parts'] = spacemap.SpaceMap()
				print(new_record)

				# downloading file or segment
				if 'Content-Length' in new_record:
					if needed == None:
						needed = new_record['_parts']
					else:
						if len(needed) > 1:
							print("Multipart requests currently not supported.")
							assert False, 'Skip this one for now.'
				#else:
					#assert False, 'No content-length or Content-Range header.'

				new_record['_time'] = datetime.datetime.now()
				# NOTE(review): ('HEAD') substring test again, see above.
				if self.command not in ('HEAD'):
					# file is created at temporary location and moved in place only when download completes
					if not os.access(temp_name, os.R_OK):
						empty_name = config_host['dir'] + os.sep + '.tmp'
						with open(empty_name, 'w+b') as some_file:
							pass
						# os.renames creates any missing parent directories.
						os.renames(empty_name, temp_name)
					temp_file = open(temp_name, 'r+b')
					if requested_ranges == None and needed == None:
						needed = new_record['_parts']
					needed.rewind()
					while True:
						# XXX can make this implicit - one request per range
						(start, end) = needed.pop()
						if start == None:
							break
						stream_last = start
						old_record = copy.copy(new_record)
						if end - start < block_size:
							req_block_size = end - start
						else:
							req_block_size = block_size
						buffer = source.read(req_block_size)
						length = len(buffer)
						while length > 0 and stream_last < end:
							stream_pos = stream_last + length
							assert stream_pos <= end, 'Received more data then requested: pos:{} start:{} end:{}.'.format(stream_pos, start, end)
							temp_file.seek(stream_last)
							temp_file.write(buffer)
							# NOTE(review): 'x' is computed but never used -
							# looks like leftover debugging.
							x = new_record['_parts'] - spacemap.SpaceMap({stream_last: stream_pos})
							new_record['_parts'] = new_record['_parts'] - spacemap.SpaceMap({stream_last: stream_pos})
							# Commit the previous state before advancing so a
							# crash never marks unwritten bytes as present.
							index[my_path_b] = old_record
							index.sync()
							old_record = copy.copy(new_record)
							stream_last = stream_pos
							if end - stream_last < block_size:
								req_block_size = end - stream_last
							buffer = source.read(req_block_size)
							length = len(buffer)
					# moving downloaded data to real file
					temp_file.close()

				index[my_path_b] = new_record
				index.sync()

			except urllib.error.HTTPError as error:
				# in case of error we don't need to do anything actually,
				# if file download stalls or fails the file would not be moved to it's location
				self.send_response(error.code)
				self.end_headers()
				print(error, repr(my_headers))
				return

		print(index[my_path_b])

		# An empty '_parts' map means the download completed: move the
		# temporary file into its final place.
		if not os.access(file_name, os.R_OK) and os.access(temp_name, os.R_OK) and '_parts' in index[my_path_b] and index[my_path_b]['_parts'] == spacemap.SpaceMap():
			# just moving
			# drop old dirs XXX
			print('Moving temporary file to new destination.')
			os.renames(temp_name, file_name)

		# Nothing known about this path even after the refresh attempt.
		if not my_path_b in index:
			self.send_response(502)
			self.end_headers()
			return

		if self.command == 'HEAD':
			self.send_response(200)
			if 'Content-Length' in index[my_path_b]:
				self.send_header('Content-Length', index[my_path_b]['Content-Length'])
			self.send_header('Accept-Ranges', 'bytes')
			self.send_header('Content-Type', 'application/octet-stream')
			if 'Last-Modified' in index[my_path_b]:
				self.send_header('Last-Modified', index[my_path_b]['Last-Modified'])
			self.end_headers()
		else:
			# Serve from the partial file when the download is incomplete
			# or the final file is unreadable.
			if ('_parts' in index[my_path_b] and index[my_path_b]['_parts'] != spacemap.SpaceMap()) or not os.access(file_name, os.R_OK):
				file_name = temp_name

			with open(file_name, 'rb') as real_file:
				file_stat = os.stat(file_name)
				if 'Range' in self.headers:
					self.send_response(206)
					ranges = ()
					requested_ranges.rewind()
					while True:
						pair = requested_ranges.pop()
						if pair[0] == None:
							break
						ranges += '{}-{}'.format(pair[0], str(pair[1] - 1)),
					self.send_header('Content-Range', 'bytes {}/{}'.format(','.join(ranges), index[my_path_b]['Content-Length']))
				else:
					self.send_response(200)
					self.send_header('Content-Length', str(file_stat.st_size))
					# Treat a full GET as one range covering the whole file.
					requested_ranges = spacemap.SpaceMap({0: file_stat.st_size})
				if 'Last-Modified' in index[my_path_b]:
					self.send_header('Last-Modified', index[my_path_b]['Last-Modified'])
				self.send_header('Content-Type', 'application/octet-stream')
				self.end_headers()
				# NOTE(review): ('GET') is a plain string - substring test.
				if self.command in ('GET'):
					if len(requested_ranges) > 0:
						requested_ranges.rewind()
						(start, end) = requested_ranges.pop()
					else:
						start = 0
						# XXX ugly hack
						if 'Content-Length' in index[my_path_b]:
							end = index[my_path_b]['Content-Length']
						else:
							end = 0
					real_file.seek(start)
					if block_size > end - start:
						req_block_size = end - start
					else:
						req_block_size = block_size
					buffer = real_file.read(req_block_size)
					length = len(buffer)
					while length > 0:
						self.wfile.write(buffer)
						start += len(buffer)
						if req_block_size > end - start:
							req_block_size = end - start
						if req_block_size == 0:
							break
						buffer = real_file.read(req_block_size)
						length = len(buffer)

	def do_HEAD(self):
		"""Handle HEAD via the shared cache logic."""
		return self.__process()
	def do_GET(self):
		"""Handle GET via the shared cache logic."""
		return self.__process()
423:
# Serve on the loopback interface only; the port is taken from the
# configuration's DEFAULT section ('port', defaulting to 8008).
listen_address = ('127.0.0.1', int(config['DEFAULT']['port']))
server = http.server.HTTPServer(listen_address, MyRequestHandler)
server.serve_forever()

#gevent.joinall()