Lines of
samesite.py
from check-in 8425e2e393
that are changed by the sequence of edits moving toward
check-in 601ec56da6:
8425e2e393 2011-12-14 1: #!/usr/bin/env python3.2
2:
8425e2e393 2011-12-14 3: import datetime, http.cookiejar, os, sys, shelve, spacemap, re, urllib.request
4:
class Config:
    """Configuration of the mirror, backed by an INI file.

    The file location comes from the ``-c/--config`` command line option
    (default ``samesite.conf``).  Every section other than ``general``
    gets a normalised ``dir`` option and a ``root`` option that defaults to
    the section name.  Options are read with ``config[name]`` after a
    section has been selected with :meth:`section`.
    """

    # '_default' is a class attribute, so it must NOT be listed here: a name
    # that is both in __slots__ and in the class body makes the class
    # statement fail with "ValueError: ... conflicts with class variable".
    __slots__ = frozenset(['_config', '_section', 'options', 'root'])

    # built-in defaults: 'general' applies to the section of that name,
    # '_other' applies to every section that has no entry of its own here
    _default = {
        'general': {
            'port': '8008',
        },
        '_other': {
            'verbose': 'no',
            'noetag': 'no',
            'noparts': 'no',
            'strip': '',
            'sub': '',
        },
    }

    def __init__(self):
        """Parse the command line and read the configuration file.

        Exits through ``SystemExit`` when the configuration file is not
        readable.
        """
        import configparser, optparse

        parser = optparse.OptionParser()
        parser.add_option('-c', '--config', dest='config', help='config file location', metavar='FILE', default='samesite.conf')
        (self.options, args) = parser.parse_args()

        # an explicit check instead of assert: asserts vanish under -O
        if not os.access(self.options.config, os.R_OK):
            raise SystemExit("Fatal error: can't read {}".format(self.options.config))

        # relative directories are resolved against the directory of the
        # configuration file, or the current directory when the file was
        # given without any directory part
        (head, slash, tail) = self.options.config.rpartition('/')
        if slash and tail:
            self.root = head
        else:
            self.root = os.getcwd()

        self._config = configparser.ConfigParser()
        with open(self.options.config) as config_file:
            self._config.read_file(config_file)

        for section in self._config.sections():
            if section == 'general':
                continue

            default_dir = self.root + os.sep + section
            if self._config.has_option(section, 'dir'):
                directory = self._config.get(section, 'dir')
                if directory == '/':
                    # a bare slash asks for the default location
                    directory = default_dir
                else:
                    if directory.endswith('/'):
                        directory = directory[:-1]
                    if not directory.startswith('/'):
                        directory = self.root + os.sep + directory
            else:
                directory = default_dir
            self._config.set(section, 'dir', directory)

            if not self._config.has_option(section, 'root'):
                self._config.set(section, 'root', section)

    def section(self, section):
        """Select the section used by later lookups, creating it if absent."""
        if not self._config.has_section(section):
            self._config.add_section(section)
        self._section = section

    def __getitem__(self, name):
        """Return option *name* of the selected section.

        When the option is not set, the built-in default is stored in the
        section and returned.  When there is no default either, None is
        returned; nothing is stored in that case because ConfigParser
        rejects non-string values.
        """
        if not self._config.has_option(self._section, name):
            # a section with its own defaults never falls back to '_other'
            defaults = self._default.get(self._section, self._default['_other'])
            if name not in defaults:
                return None
            self._config.set(self._section, name, defaults[name])
        return self._config.get(self._section, name)
74:
# Module-wide configuration object.  Constructing it parses the command line
# and reads the configuration file, so this runs as soon as the module loads.
config = Config()

#assert options.port or os.access(options.log, os.R_OK), 'Log file unreadable'
78:
# upstream response headers that are stored in the index for each file
const_desc_fields = {'Content-Length', 'Last-Modified', 'Pragma'}

# upstream response headers that are dropped without being reported
const_ignore_fields = {
    'Accept-Ranges',
    'Age',
    'Cache-Control',
    'Connection',
    'Content-Type',
    'Date',
    'Expires',
    'Referer',
    'Server',
    'Via',
    'X-Cache',
    'X-Cache-Lookup',
    'X-Powered-By',
}

# number of bytes moved by a single read/write step
block_size = 4096
92:
8425e2e393 2011-12-14 93: import http.server
8425e2e393 2011-12-14 94:
class MyRequestHandler(http.server.BaseHTTPRequestHandler):
    """Serve GET and HEAD requests from a local copy of an upstream site.

    The request's Host header selects the configuration section.  Files are
    kept below that section's ``dir``; a shelve file named ``.index`` in the
    same directory holds, per request path, the stored upstream headers plus
    two private keys:

    ``_parts``  space map of the byte ranges still missing (empty = complete)
    ``_time``   time of the last upstream check
    """

    def __process(self):
        """Handle one request: pick the site, open its index, serve the file."""
        # the query string is not part of the local file name
        query_match = re.compile(r'^(.*?)(\?.*)$').match(self.path)
        if query_match:
            my_path = query_match.group(1)
        else:
            my_path = self.path

        config.section(self.headers['Host'])

        if config['sub'] is not None and config['strip'] is not None and len(config['strip']) > 0:
            my_path = re.compile(config['strip']).sub(config['sub'], my_path)

        # NOTE(review): my_path comes straight from the client and is joined
        # onto config['dir'] below without rejecting '..' components.

        if not os.access(config['dir'], os.X_OK):
            os.mkdir(config['dir'])

        # this is file index - everything is stored in this file
        index = shelve.open(config['dir'] + os.sep + '.index')
        try:
            self.__serve(index, my_path)
        finally:
            # the shelf used to stay open after every request
            index.close()

    @staticmethod
    def __format_ranges(space_map):
        """Return the extents of *space_map* as 'first-last' strings.

        Extents are read with rewind()/pop(); pop() yields (start, end) pairs
        with an exclusive end and a pair starting with None when exhausted.
        """
        parts = []
        space_map.rewind()
        while True:
            (first, stop) = space_map.pop()
            if first is None:
                break
            parts.append('{}-{}'.format(first, stop - 1))
        return parts

    def __serve(self, index, my_path):
        """Refresh the local copy of *my_path* if needed and send the reply.

        *index* is the open shelf of the selected site.  The method returns
        without sending anything when the request carries a header it does
        not know or a Range header it cannot parse.
        """
        # NOTE: values that may hold a SpaceMap are still compared to None
        # with ==/!= as before, since SpaceMap defines its own comparison.

        # reload means file needs to be reloaded to serve request
        reload = False
        # recheck means file needs to be checked, this also means that if
        # file has been modified we can serve older copy
        recheck = False
        # file_stat means file definitely exists
        file_stat = None
        # requested_ranges holds data about any range requested
        requested_ranges = None

        info = 'Checking file: ' + my_path

        desc_fields = const_desc_fields.copy()
        ignore_fields = const_ignore_fields.copy()
        if config['noetag'] == 'no':
            desc_fields.add('ETag')
        else:
            ignore_fields.add('ETag')

        # request headers that are accepted but play no role in caching
        proxy_ignored = {
            'Accept', 'Accept-Charset', 'Accept-Encoding', 'Accept-Language',
            'Cache-Control', 'Connection', 'Content-Length', 'Cookie',
            'Host',
            'If-Modified-Since', 'If-Unmodified-Since',
            'Referer',
            'User-Agent',
            'Via',
            'X-Forwarded-For', 'X-Last-HR', 'X-Last-HTTP-Status-Code', 'X-REMOVED', 'X-Real-IP', 'X-Retry-Count',
        }

        print('===============[ {} request ]==='.format(self.command))

        for header in self.headers:
            if header in proxy_ignored:
                pass
            elif header == 'Range':
                range_match = re.compile(r'bytes=(\d+)-(\d+)').match(self.headers[header])
                if range_match:
                    requested_ranges = spacemap.SpaceMap({int(range_match.group(1)): int(range_match.group(2)) + 1})
                else:
                    # NOTE(review): no response is sent on this path
                    return
            elif header == 'Pragma':
                if my_path in index:
                    # a shelf hands out copies, so the entry has to be
                    # written back for the change to be stored
                    cached = index[my_path]
                    cached[header] = self.headers[header]
                    index[my_path] = cached
            else:
                print('Unknown header - ', header, ': ', self.headers[header], sep='')
                # NOTE(review): no response is sent on this path
                return
            print(header, self.headers[header])

        # creating file name from my_path
        file_name = config['dir'] + os.sep + re.compile('%20').sub(' ', my_path)
        # partial file or unfinished download
        temp_name = config['dir'] + os.sep + '.parts' + re.compile('%20').sub(' ', my_path)

        # creating empty placeholder in index
        # if there's no space map and there's no file in real directory - we have no file
        # if there's an empty space map - file is full
        # space map generally covers every bit of file we don't possess currently
        if my_path not in index:
            info += '\nThis one is new.'
            reload = True
            record = {}
        else:
            # forcibly checking file if no file present
            record = index[my_path]
            if os.access(file_name, os.R_OK):
                info += '\nFull file found.'
                file_stat = os.stat(file_name)
            elif '_parts' in record and os.access(temp_name, os.R_OK):
                info += '\nPartial file found.'
                file_stat = os.stat(temp_name)
                recheck = True
            else:
                info += '\nFile not found or inaccessible.'
                record['_parts'] = None
                reload = True

        if '_parts' not in record:
            record['_parts'] = None

        if record['_parts'] == None:
            recheck = True

        # forcibly checking file if file size doesn't match with index data
        if not reload:
            if '_parts' in record and record['_parts'] == spacemap.SpaceMap():
                if 'Content-Length' in record and file_stat and file_stat.st_size != int(record['Content-Length']):
                    info += '\nFile size is {} and stored file size is {}.'.format(file_stat.st_size, record['Content-Length'])
                    record['_parts'] = None
                    reload = True

        # forcibly checking file if index holds Pragma header
        if not reload and 'Pragma' in record and record['Pragma'] == 'no-cache':
            info += '\nPragma on: recheck imminent.'
            recheck = True

        # rechecking a file whose last check is more than 4 hours old
        if not recheck and not reload and '_time' in record and (record['_time'] - datetime.datetime.now() + datetime.timedelta(hours = 4)).days < 0:
            info += '\nFile is old - rechecking.'
            recheck = True

        print(info)
        if reload or recheck:

            try:
                request = 'http://' + config['root'] + self.path
                my_headers = {}
                for header in ('Cache-Control', 'Cookie', 'Referer', 'User-Agent'):
                    if header in self.headers:
                        my_headers[header] = self.headers[header]

                # needed: the byte ranges that are asked from upstream
                needed = None
                if '_parts' in record and record['_parts'] != None:
                    if config['noparts'] != 'no' or requested_ranges == None or requested_ranges == spacemap.SpaceMap():
                        needed = record['_parts']
                    else:
                        needed = record['_parts'] & requested_ranges
                elif config['noparts'] == 'no' and requested_ranges != None and requested_ranges != spacemap.SpaceMap():
                    needed = requested_ranges
                print('Missing ranges: {}, requested ranges: {}, needed ranges: {}.'.format(record['_parts'], requested_ranges, needed))
                if needed != None and len(needed) > 0:
                    my_headers['Range'] = 'bytes=' + ','.join(self.__format_ranges(needed))

                request = urllib.request.Request(request, headers = my_headers)

                with urllib.request.urlopen(request) as source:
                    new_record = {}
                    new_record['_parts'] = record['_parts']
                    headers = source.info()

                    # stripping unneeded headers (XXX make this inplace?)
                    for header in headers:
                        if header in desc_fields:
                            if header == 'Content-Length':
                                # for a partial reply the full size is
                                # taken from Content-Range instead
                                if 'Content-Range' not in headers:
                                    new_record[header] = int(headers[header])
                            else:
                                new_record[header] = headers[header]
                        elif header == 'Content-Range':
                            content_range = re.compile(r'^bytes (\d+)-(\d+)/(\d+)$').match(headers[header])
                            if content_range:
                                new_record['Content-Length'] = int(content_range.group(3))
                            else:
                                assert False, 'Content-Range unrecognized.'
                        elif header not in ignore_fields:
                            print('Undefined header "', header, '": ', headers[header], sep='')

                    # comparing headers with data found in index
                    # if any header has changed (except Pragma) file is fully downloaded
                    # same if we get more headers
                    old_keys = set(record.keys())
                    old_keys.discard('_time')
                    old_keys.discard('Pragma')
                    more_keys = set(new_record.keys()) - old_keys
                    more_keys.discard('Pragma')
                    less_keys = old_keys - set(new_record.keys())
                    if len(more_keys) > 0:
                        if not len(old_keys) == 0:
                            print('More headers appear:', more_keys)
                        reload = True
                    elif len(less_keys) > 0:
                        print('Less headers appear:', less_keys)
                    else:
                        for key in record.keys():
                            if key[0] != '_' and key != 'Pragma' and not record[key] == new_record[key]:
                                print('Header "', key, '" changed from [', record[key], '] to [', new_record[key], ']', sep='')
                                print(type(record[key]), type(new_record[key]))
                                reload = True

                    if reload:
                        print('Reloading.')
                        if os.access(temp_name, os.R_OK):
                            os.unlink(temp_name)
                        if os.access(file_name, os.R_OK):
                            os.unlink(file_name)
                        # everything is missing again
                        if 'Content-Length' in new_record:
                            new_record['_parts'] = spacemap.SpaceMap({0: int(new_record['Content-Length'])})
                        else:
                            new_record['_parts'] = spacemap.SpaceMap()
                    print(new_record)

                    # downloading file or segment
                    if 'Content-Length' in new_record:
                        if needed == None:
                            needed = new_record['_parts']
                        else:
                            if len(needed) > 1:
                                print("Multipart requests currently not supported.")
                                assert False, 'Skip this one for now.'
                    #else:
                        #assert False, 'No Content-Length or Content-Range header.'

                    new_record['_time'] = datetime.datetime.now()
                    if self.command != 'HEAD':
                        # file is created at temporary location and moved in place only when download completes
                        if not os.access(temp_name, os.R_OK):
                            empty_name = config['dir'] + os.sep + '.tmp'
                            with open(empty_name, 'w+b'):
                                pass
                            os.renames(empty_name, temp_name)
                        if requested_ranges == None and needed == None:
                            needed = new_record['_parts']
                        # 'with' closes the file even when the transfer fails
                        with open(temp_name, 'r+b') as temp_file:
                            needed.rewind()
                            while True:
                                (start, end) = needed.pop()
                                if start is None:
                                    break
                                stream_last = start
                                req_block_size = min(block_size, end - start)
                                chunk = source.read(req_block_size)
                                while len(chunk) > 0 and stream_last < end:
                                    stream_pos = stream_last + len(chunk)
                                    assert not stream_pos > end, 'Received more data then requested: pos:{} start:{} end:{}.'.format(stream_pos, start, end)
                                    temp_file.seek(stream_last)
                                    temp_file.write(chunk)
                                    # the index is updated after every
                                    # block that has been written
                                    new_record['_parts'] = new_record['_parts'] - spacemap.SpaceMap({stream_last: stream_pos})
                                    index[my_path] = new_record
                                    index.sync()
                                    stream_last = stream_pos
                                    if end - stream_last < block_size:
                                        req_block_size = end - stream_last
                                    chunk = source.read(req_block_size)

                    index[my_path] = new_record
                    index.sync()

            except urllib.error.URLError as error:
                # in case of error we don't need to do anything actually,
                # if file download stalls or fails the file would not be moved to its location
                # (URLError also covers HTTPError, its subclass)
                print(error)

        # nothing stored and nothing fetched: this check has to come before
        # any index[my_path] lookup, which would raise KeyError
        if my_path not in index:
            self.send_response(502)
            self.end_headers()
            return

        entry = index[my_path]
        print(entry)

        if not os.access(file_name, os.R_OK) and os.access(temp_name, os.R_OK) and '_parts' in entry and entry['_parts'] == spacemap.SpaceMap():
            # just moving
            # drop old dirs XXX
            print('Moving temporary file to new destination.')
            os.renames(temp_name, file_name)

        if self.command == 'HEAD':
            self.send_response(200)
            if 'Content-Length' in entry:
                self.send_header('Content-Length', entry['Content-Length'])
            self.send_header('Accept-Ranges', 'bytes')
            self.send_header('Content-Type', 'application/octet-stream')
            if 'Last-Modified' in entry:
                self.send_header('Last-Modified', entry['Last-Modified'])
            self.end_headers()
        else:
            # an incomplete file is served from its temporary location
            if ('_parts' in entry and entry['_parts'] != spacemap.SpaceMap()) or not os.access(file_name, os.R_OK):
                file_name = temp_name

            with open(file_name, 'rb') as real_file:
                file_stat = os.stat(file_name)
                if 'Range' in self.headers:
                    self.send_response(206)
                    self.send_header('Content-Range', 'bytes {}/{}'.format(','.join(self.__format_ranges(requested_ranges)), entry['Content-Length']))
                else:
                    self.send_response(200)
                    self.send_header('Content-Length', str(file_stat.st_size))
                    requested_ranges = spacemap.SpaceMap({0: file_stat.st_size})
                if 'Last-Modified' in entry:
                    self.send_header('Last-Modified', entry['Last-Modified'])
                self.send_header('Content-Type', 'application/octet-stream')
                self.end_headers()
                if self.command == 'GET':
                    if len(requested_ranges) > 0:
                        # only the first requested range is sent
                        requested_ranges.rewind()
                        (start, end) = requested_ranges.pop()
                    else:
                        start = 0
                        # XXX ugly hack
                        if 'Content-Length' in entry:
                            end = entry['Content-Length']
                        else:
                            end = 0
                    real_file.seek(start)
                    req_block_size = min(block_size, end - start)
                    chunk = real_file.read(req_block_size)
                    while len(chunk) > 0:
                        self.wfile.write(chunk)
                        start += len(chunk)
                        if req_block_size > end - start:
                            req_block_size = end - start
                        if req_block_size == 0:
                            break
                        chunk = real_file.read(req_block_size)

    def do_HEAD(self):
        """Answer a HEAD request."""
        return self.__process()

    def do_GET(self):
        """Answer a GET request."""
        return self.__process()
445:
# the listening port is read from the 'general' section
config.section('general')
listen_address = ('127.0.0.1', int(config['port']))

# requests are handled by MyRequestHandler until the process is stopped
server = http.server.HTTPServer(listen_address, MyRequestHandler)
server.serve_forever()