Lines of
samesite.py
from check-in e7b837a681
that are changed by the sequence of edits moving toward
check-in b0975a28fb:
1: #!/usr/bin/env python3.1
2:
3: import datetime, http.cookiejar, os, sys, shelve, spacemap, re, urllib.request
4:
5: class Config:
6: __slots__ = frozenset(['_config', '_default', '_section', 'options', 'root'])
7: _default = {
8: 'general': {
9: 'port': '8008',
10: },
11: '_other': {
12: 'verbose': 'no',
13: 'noetag': 'no',
14: 'noparts': 'no',
15: },}
16:
17: # function to read in config file
18: def __init__(self):
19: import configparser, optparse
20:
21: parser = optparse.OptionParser()
22: parser.add_option('-c', '--config', dest = 'config', help = 'config file location', metavar = 'FILE', default = 'samesite.conf')
23: (self.options, args) = parser.parse_args()
24:
25: assert os.access(self.options.config, os.R_OK), "Fatal error: can't read {}".format(self.options.config)
26:
27: configDir = re.compile('^(.*)/[^/]+$').match(self.options.config)
28: if configDir:
29: self.root = configDir.group(1)
30: else:
31: self.root = os.getcwd()
32:
33: self._config = configparser.ConfigParser()
34: self._config.readfp(open(self.options.config))
35:
36: for section in self._config.sections():
37: if section != 'general':
38: if self._config.has_option(section, 'dir'):
39: if re.compile('^/$').match(self._config.get(section, 'dir')):
40: self._config.set(section, 'dir', self.root + os.sep + section)
41: thisDir = re.compile('^(.*)/$').match(self._config.get(section, 'dir'))
42: if thisDir:
43: self._config.set(section, 'dir', thisDir.group(1))
44: if not re.compile('^/(.*)$').match(self._config.get(section, 'dir')):
45: self._config.set(section, 'dir', self.root + os.sep + self._config.get(section, 'dir'))
46: else:
47: self._config.set(section, 'dir', self.root + os.sep + section)
48:
49: if not self._config.has_option(section, 'root'):
50: self._config.set(section, 'root', section)
51:
52: # function to select config file section or create one
53: def section(self, section):
54: if not self._config.has_section(section):
55: self._config.add_section(section)
56: self._section = section
57:
58: # function to get config parameter, if parameter doesn't exists the default
59: # value or None is substituted
60: def __getitem__(self, name):
61: if not self._config.has_option(self._section, name):
62: if self._section in self._default:
63: if name in self._default[self._section]:
64: self._config.set(self._section, name, self._default[self._section][name])
65: else:
66: self._config.set(self._section, name, None)
67: elif name in self._default['_other']:
68: self._config.set(self._section, name, self._default['_other'][name])
69: else:
70: self._config.set(self._section, name, None)
71: return(self._config.get(self._section, name))
72:
# module-wide configuration, built once at import time
# NOTE: this parses sys.argv and reads the config file, so importing this
# module has side effects
config = Config()

#assert options.port or os.access(options.log, os.R_OK), 'Log file unreadable'

# headers that describe the cached entity and are recorded in the index
const_desc_fields = set(['Content-Length', 'Pragma', 'Last-Modified'])
# headers that are recognised but deliberately not recorded
const_ignore_fields = set(['Accept-Ranges', 'Age', 'Cache-Control', 'Connection', 'Content-Type', 'Date', 'Expires', 'Server', 'Via', 'X-Cache', 'X-Cache-Lookup', 'X-Powered-By'])

# chunk size in bytes for all streaming reads/writes
block_size = 4096
81:
# NOTE(review): the triple-quoted string below is commented-out prototype
# code (a kqueue-based event loop); it is never executed and references an
# 'options' name that no longer exists at module level.
'''
# later, kqueue would be good but later
class Connection:
	__slots__ = frozenset(('__address', '__input', '__socket', '__status', 'error', 'method', 'url', 'http_version'))

	def __init__(self, socket, address):
		self.__address = address
		self.__input = b''
		self.__socket = socket
		self.__status = 0

	def read(self, kev):
		buffer = self.__socket.recv(kev.data)
		exhausted = False
		if len(buffer) == 0:
			eof = True
		else:
			self.__input += buffer
		while not exhausted:
			if self.__status == -1:
				exhausted = True
			elif self.__status == 0:
				endstring = self.__input.find(b'\n')
				if endstring > 0:
					print('Processing request line.')
					line = self.__input[:endstring].decode('ascii')
					self.__input = self.__input[endstring + 1:]
					isRequest = re.compile('(GET) ([^ ]+) HTTP/(1\.0)').match(line)
					if not isRequest:
						self.error = 'Not a HTTP connection.'
						self.__status = -1
					else:
						self.method = isRequest.group(1)
						self.url = isRequest.group(2)
						self.http_version = isRequest.group(3)
						self.__status = 1
				else:
					exhausted = True
			elif self.__status == 1:
				endstring = self.__input.find(b'\n')
				if endstring > 0:
					print('Processing header line.' + repr(self.__input))
					line = self.__input[:endstring].decode('ascii')
					self.__input = self.__input[endstring + 1:]
					isHeader = re.compile('([^:]*): +(.*)').match(line)
					if not isHeader:
						self.error = 'Bad header.'
						return(False)
					# process header here
				elif endstring == 0:
					self.__status = 2
				else:
					exhausted = True

	def write(self, kev):
		pass

if options.port:
	import select, socket

	sock = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
	try:
		sock.bind(('127.0.0.1', int(options.port)))
		sock.listen(-1)

		kq = select.kqueue()
		assert kq.fileno() != -1, "Fatal error: can't initialise kqueue."

		kq.control([select.kevent(sock, select.KQ_FILTER_READ, select.KQ_EV_ADD)], 0)
		timeout = None

		connections = {sock.fileno(): None}

		while True:
			kevs = kq.control(None, 1, timeout)

			for kev in kevs:
				if type(connections[kev.ident]) == Connection:
					print(kev.ident, kev.data, kev.filter, kev.flags)
					assert kev.data != 0, 'No data available.'
					if kev.filter == select.KQ_FILTER_READ:
						connections[kev.ident].read(kev)
					elif kev.filter == select.KQ_FILTER_WRITE:
						connections[kev.ident].write(kev)
					else:
						assert kev.filter in (select.KQ_FILTER_READ, select.KQ_FILTER_WRITE), 'Do we support other filters?'
				else:
					(conn, addr) = sock.accept()
					print('Connection from ' + repr(addr))
					kq.control([select.kevent(conn, select.KQ_FILTER_READ, select.KQ_EV_ADD)], 0)
					connections[conn.fileno()] = Connection(conn, addr)

				if kev.flags >> 15 == 1:
					kq.control([select.kevent(kev.ident, select.KQ_FILTER_READ, select.KQ_EV_DELETE)], 0)
					kq.control([select.kevent(kev.ident, select.KQ_FILTER_WRITE, select.KQ_EV_DELETE)], 0)
					del(connections[kev.ident])
	finally:
		sock.close()
'''
181:
182: # XXX how about rechecking files?
183: if True:
184: import http.server
185:
    class MyRequestHandler(http.server.BaseHTTPRequestHandler):
        """Caching proxy handler: serves GET/HEAD from a local per-host cache
        directory, fetching missing data (whole files or byte ranges) from the
        upstream server named by the section's 'root' option."""

        def __process(self):
            """Common implementation behind do_GET/do_HEAD.

            Consults a shelve index in the section's cache dir, decides whether
            the cached copy must be (re)fetched, downloads missing ranges into a
            '.parts' temp file, then streams the requested bytes to the client.
            """
            # reload means file needs to be reloaded to serve request
            reload = False
            # recheck means file needs to be checked, this also means that if file hav been modified we can serve older copy
            recheck = False
            # file_stat means file definitely exists
            file_stat = None
            # requested_ranges holds data about any range requested
            requested_ranges = None
            # records holds data from index locally, should be written back upon successfull completion
            record = None
            info = 'Checking file: ' + self.path

            # strip any query string from the request path
            myPath = re.compile('^(.*?)(\?.*)$').match(self.path)
            if myPath:
                my_path = myPath.group(1)
            else:
                my_path = self.path

            # configuration section is selected per requested Host header
            config.section(self.headers['Host'])

            if not os.access(config['dir'], os.X_OK):
                os.mkdir(config['dir'])
            # this is file index - everything is stored in this file
            # _parts - list of stored parts of file
            # _time - last time the file was checked
            # everything else is just the headers
            # NOTE(review): opened without writeback=True and never closed;
            # in-place mutation of stored values does not persist — confirm.
            index = shelve.open(config['dir'] + os.sep + '.index')

            desc_fields = const_desc_fields.copy()
            ignore_fields = const_ignore_fields.copy()
            # NOTE(review): config values are strings ('no'/'yes'); the
            # non-empty string 'no' is truthy, so `not config['noetag']` is
            # always False and ETag always lands in ignore_fields — looks
            # inverted; confirm intended yes/no parsing.
            if not config['noetag']:
                desc_fields.add('ETag')
            else:
                ignore_fields.add('ETag')

            # request headers we silently drop instead of forwarding
            proxy_ignored = ('Accept', 'Accept-Encoding',
                'Cache-Control', 'Connection',
                'Host',
                'If-Modified-Since', 'If-Unmodified-Since',
                'User-Agent',
                'Via',
                'X-Forwarded-For',
            )

            print('===============[ {} request ]==='.format(self.command))

            for header in self.headers:
                if header in proxy_ignored:
                    pass
                # NOTE(review): ('Range') is just the string 'Range', so this
                # is a substring test, not tuple membership — e.g. header
                # 'Ran' would also match; likely meant ('Range',).
                elif header in ('Range'):
                    isRange = re.compile('bytes=(\d+)-(\d+)').match(self.headers[header])
                    if isRange:
                        # SpaceMap end bound is exclusive, hence the +1
                        requested_ranges = spacemap.SpaceMap({int(isRange.group(1)): int(isRange.group(2)) + 1})
                    else:
                        # unsupported Range syntax: drop the request
                        return()
                # NOTE(review): same substring-vs-tuple issue as above.
                elif header in ('Pragma'):
                    if my_path in index:
                        # NOTE(review): mutates the shelve value in place —
                        # without writeback=True this is not persisted.
                        index[my_path][header] = self.headers[header]
                else:
                    print('Unknown header - ', header, ': ', self.headers[header], sep='')
                    return()
                print(header, self.headers[header])

            # creating empty placeholder in index
            # if there's no space map and there's no file in real directory - we have no file
            # if there's an empty space map - file is full
            # space map generally covers every bit of file we don't posess currently
            if not my_path in index:
                info += '\nThis one is new.'
                reload = True
                record = {}
            else:
                record = index[my_path]

            if not '_parts' in record:
                record['_parts'] = None

            # creating file name from my_path
            file_name = config['dir'] + os.sep + re.compile('%20').sub(' ', my_path)
            # partial file or unfinished download
            temp_name = config['dir'] + os.sep + '.parts' + re.compile('%20').sub(' ', my_path)

            # forcibly checking file if no file present
            if os.access(file_name, os.R_OK):
                file_stat = os.stat(file_name)
            elif '_parts' in record and os.access(temp_name, os.R_OK):
                file_stat = os.stat(temp_name)
            elif not reload:
                print(record)
                info += '\nFile not found or inaccessible.'
                record['_parts'] = None
                reload = True

            # forcibly checking file if file size doesn't match with index data
            if not reload:
                if '_parts' in record and record['_parts'] == spacemap.SpaceMap():
                    if 'Content-Length' in record and file_stat and file_stat.st_size != int(record['Content-Length']):
                        info += '\nFile size is {} and stored file size is {}.'.format(file_stat.st_size, record['Content-Length'])
                        reload = True

            # forcibly checking file if index holds Pragma header
            if not reload and 'Pragma' in record and record['Pragma'] == 'no-cache':
                info +='\nPragma on: recheck imminent.'
                recheck = True

            # skipping file processing if there's no need to recheck it and we have checked it at least 4 hours ago
            # NOTE(review): the condition is true when _time is *newer* than
            # now-4h (recently checked), yet it sets recheck=True, which
            # triggers a fetch below — this seems to contradict the comment;
            # confirm intended polarity.
            if not recheck and not reload and '_time' in record and (datetime.datetime.now() - datetime.timedelta(hours = 4) - record['_time']).days < 0:
                recheck = True

            print(info)
            if reload or recheck:

                try:
                    request = 'http://' + config['root'] + my_path
                    needed = None
                    # XXX and if we specify full file we don't go partial?
                    if requested_ranges != None:
                        if '_parts' in record and record['_parts'] != None:
                            # fetch only the intersection of what is missing
                            # and what the client asked for
                            needed = record['_parts'] & requested_ranges
                        # NOTE(review): 'no' is truthy (see noetag note); also
                        # record['_parts'] can be None here — len(None) below
                        # would raise; confirm.
                        elif config['noparts']:
                            needed = record['_parts']
                        else:
                            needed = requested_ranges
                        ranges = ()
                        print('Missing ranges: {}, requested ranges: {}, needed ranges: {}.'.format(record['_parts'], requested_ranges, needed))
                        if len(needed) > 0:
                            needed.rewind()
                            while True:
                                range = needed.pop()
                                if range[0] == None:
                                    break
                                # HTTP ranges are inclusive, SpaceMap ends are
                                # exclusive, hence the -1
                                ranges += '{}-{}'.format(range[0], range[1] - 1),
                            request = urllib.request.Request(request, headers = {'Range': 'bytes=' + ','.join(ranges)})

                    with urllib.request.urlopen(request) as source:
                        new_record = {}
                        new_record['_parts'] = record['_parts']
                        headers = source.info()

                        # stripping unneeded headers (XXX make this inplace?)
                        for header in headers:
                            if header in desc_fields:
                                #if header == 'Pragma' and headers[header] != 'no-cache':
                                if header == 'Content-Length':
                                    # for a 206 the real total comes from
                                    # Content-Range instead
                                    if 'Content-Range' not in headers:
                                        new_record[header] = int(headers[header])
                                else:
                                    new_record[header] = headers[header]
                            elif header == 'Content-Range':
                                range = re.compile('^bytes (\d+)-(\d+)/(\d+)$').match(headers[header])
                                if range:
                                    new_record['Content-Length'] = int(range.group(3))
                                else:
                                    assert False, 'Content-Range unrecognized.'
                            elif not header in ignore_fields:
                                print('Undefined header "', header, '": ', headers[header], sep='')

                        # comparing headers with data found in index
                        # if any header has changed (except Pragma) file is fully downloaded
                        # same if we get more or less headers
                        old_keys = set(record.keys())
                        old_keys.discard('_time')
                        old_keys.discard('Pragma')
                        more_keys = set(new_record.keys()) - old_keys
                        more_keys.discard('Pragma')
                        less_keys = old_keys - set(new_record.keys())
                        if len(more_keys) > 0:
                            if not len(old_keys) == 0:
                                print('More headers appear:', more_keys)
                            reload = True
                        elif len(less_keys) > 0:
                            # NOTE(review): fewer headers does not set reload —
                            # possibly intentional, confirm.
                            print('Less headers appear:', less_keys)
                        else:
                            for key in record.keys():
                                if key[0] != '_' and key != 'Pragma' and not record[key] == new_record[key]:
                                    print('Header "', key, '" changed from [', record[key], '] to [', new_record[key], ']', sep='')
                                    print(type(record[key]), type(new_record[key]))
                                    reload = True

                        if reload:
                            print('Reloading.')
                            if os.access(temp_name, os.R_OK):
                                os.unlink(temp_name)
                            if os.access(file_name, os.R_OK):
                                os.unlink(file_name)
                        if new_record['_parts'] == None or reload:
                            # nothing cached yet: the whole file is missing
                            new_record['_parts'] = spacemap.SpaceMap({0: int(new_record['Content-Length'])})
                        print(new_record)

                        # downloading file or segment
                        if 'Content-Length' in new_record:
                            if needed == None:
                                needed = new_record['_parts']
                            else:
                                if len(needed) > 1:
                                    print("Multipart requests currently not supported.")
                                    assert False, 'Skip this one for now.'
                        else:
                            assert False, 'No Content-Length or Content-Range header.'

                        new_record['_time'] = datetime.datetime.now()
                        # NOTE(review): ('HEAD') is a substring test, not a
                        # tuple — works for exact 'HEAD' but see earlier note.
                        if self.command not in ('HEAD'):
                            # file is created at temporary location and moved in place only when download completes
                            if not os.access(temp_name, os.R_OK):
                                empty_name = config['dir'] + os.sep + '.tmp'
                                with open(empty_name, 'w+b') as some_file:
                                    pass
                                os.renames(empty_name, temp_name)
                            temp_file = open(temp_name, 'r+b')
                            needed.rewind()
                            while True:
                                (start, end) = needed.pop()
                                if start == None:
                                    break
                                stream_last = start
                                # NOTE(review): old_record aliases new_record
                                # (same dict object), so the index writes below
                                # persist the *updated* map each iteration —
                                # possibly a copy was intended; confirm.
                                old_record = new_record
                                if end - start < block_size:
                                    req_block_size = end - start
                                else:
                                    req_block_size = block_size
                                buffer = source.read(req_block_size)
                                length = len(buffer)
                                while length > 0 and stream_last < end:
                                    stream_pos = stream_last + length
                                    assert not stream_pos > end, 'Received more data then requested: pos:{} start:{} end:{}.'.format(stream_pos, start, end)
                                    temp_file.seek(stream_last)
                                    temp_file.write(buffer)
                                    # mark the bytes just written as no longer missing
                                    new_record['_parts'] = new_record['_parts'] - spacemap.SpaceMap({stream_last: stream_pos})
                                    index[my_path] = old_record
                                    index.sync()
                                    old_record = new_record
                                    stream_last = stream_pos
                                    if end - stream_last < block_size:
                                        req_block_size = end - stream_last
                                    buffer = source.read(req_block_size)
                                    length = len(buffer)
                            # moving downloaded data to real file
                            temp_file.close()

                    print(new_record)
                    index[my_path] = new_record
                    index.sync()

                except urllib.error.HTTPError as error:
                    # in case of error we don't need to do anything actually,
                    # if file download stalls or fails the file would not be moved to it's location
                    print(error)

                # an empty SpaceMap means no bytes are missing: promote the
                # temp file to its final location
                if '_parts' in index[my_path] and index[my_path]['_parts'] == spacemap.SpaceMap():
                    # just moving
                    # drop old dirs XXX
                    print('Moving temporary file to new destination.')
                    os.renames(temp_name, file_name)

            if self.command == 'HEAD':
                self.send_response(200)
                if 'Content-Length' in index[my_path]:
                    self.send_header('Content-Length', index[my_path]['Content-Length'])
                self.send_header('Accept-Ranges', 'bytes')
                self.send_header('Content-Type', 'application/octet-stream')
                if 'Last-Modified' in index[my_path]:
                    self.send_header('Last-Modified', index[my_path]['Last-Modified'])
                self.end_headers()
            else:
                # serve from the partial file when the download is incomplete
                # or the final file is not in place yet
                if ('_parts' in index[my_path] and index[my_path]['_parts'] != spacemap.SpaceMap()) or not os.access(file_name, os.R_OK):
                    file_name = temp_name

                with open(file_name, 'rb') as real_file:
                    file_stat = os.stat(file_name)
                    if 'Range' in self.headers:
                        self.send_response(206)
                        ranges = ()
                        requested_ranges.rewind()
                        while True:
                            pair = requested_ranges.pop()
                            if pair[0] == None:
                                break
                            ranges += '{}-{}'.format(pair[0], str(pair[1] - 1)),
                        # NOTE(review): a comma-joined range list in a single
                        # Content-Range header is not standard HTTP — confirm
                        # clients tolerate this.
                        self.send_header('Content-Range', 'bytes {}/{}'.format(','.join(ranges), index[my_path]['Content-Length']))
                    else:
                        self.send_response(200)
                        self.send_header('Content-Length', str(file_stat.st_size))
                        requested_ranges = spacemap.SpaceMap({0: file_stat.st_size})
                    self.send_header('Last-Modified', index[my_path]['Last-Modified'])
                    self.send_header('Content-Type', 'application/octet-stream')
                    self.end_headers()
                    # NOTE(review): ('GET') substring test again (see above).
                    if self.command in ('GET'):
                        if len(requested_ranges) > 0:
                            requested_ranges.rewind()
                            (start, end) = requested_ranges.pop()
                        else:
                            start = 0
                            end = index[my_path]['Content-Length']
                        real_file.seek(start)
                        if block_size > end - start:
                            req_block_size = end - start
                        else:
                            req_block_size = block_size
                        buffer = real_file.read(req_block_size)
                        length = len(buffer)
                        while length > 0:
                            self.wfile.write(buffer)
                            start += len(buffer)
                            if req_block_size > end - start:
                                req_block_size = end - start
                            if req_block_size == 0:
                                break
                            buffer = real_file.read(req_block_size)
                            length = len(buffer)

        def do_HEAD(self):
            """Handle HEAD via the shared cache/proxy logic."""
            return self.__process()
        def do_GET(self):
            """Handle GET via the shared cache/proxy logic."""
            return self.__process()
502:
    # listen on loopback only, using the port from the 'general' section
    # (default 8008); serve_forever() blocks until the process is killed
    config.section('general')
    server = http.server.HTTPServer(('127.0.0.1', int(config['port'])), MyRequestHandler)
    server.serve_forever()
506:
# NOTE(review): this branch pairs with 'if True:' above and is therefore
# unreachable dead code. It also references names that are not defined in
# the current module ('options', 'index', 'desc_fields', 'ignore_fields') —
# it is the pre-proxy log-crawler mode kept for reference; confirm whether
# it should be deleted or resurrected behind a real option.
else:
    while True:
        unchecked_files = set()
        checked_files = 0

        # reading log and storing found urls for processing
        # check file mtime XXX
        with open(options.log, 'r') as log_file:
            # matches a combined-format access log line; group(2) is the path
            log_line = re.compile('^[^ ]+ - - \[.*] "(GET|HEAD) (.*?)(\?.*)? HTTP/1.1" (\d+) \d+ "(.*)" "(.*)"$')
            for line in log_file:
                this_line = log_line.match(line.strip())
                if this_line:
                    unchecked_files.add(this_line.group(2))

        for url in unchecked_files:
            reload = False
            recheck = False
            info = 'Checking file: ' + url

            # creating empty placeholder in index
            if not url in index:
                info += '\nThis one is new.'
                index[url] = {}
                reload = True

            # creating file name from url
            file_name = options.dir + re.compile('%20').sub(' ', url)

            # forcibly checking file if no file present
            if not reload and not os.access(file_name, os.R_OK):
                info += '\nFile not found or inaccessible.'
                reload = True

            # forcibly checking file if file size doesn't match with index data
            # NOTE(review): concatenating st_size (an int) into a string here
            # would raise TypeError if this branch ever ran.
            elif not reload and 'Content-Length' in index[url] and os.stat(file_name).st_size != int(index[url]['Content-Length']):
                info += '\nFile size is ' + os.stat(file_name).st_size + ' and stored file size is ' + index[url]['Content-Length'] + '.'
                reload = True

            # forcibly checking file if index hods Pragma header
            if not reload and 'Pragma' in index[url] and index[url]['Pragma'] == 'no-cache':
                info +='\nPragma on: recheck imminent.'
                recheck = True

            # skipping file processing if there's no need to recheck it and we have checked it at least 4 hours ago
            if not recheck and not reload and (options.noupdate or ('_time' in index[url] and (datetime.datetime.now() - datetime.timedelta(hours = 4) - index[url]['_time']).days < 0)):
                if options.verbose:
                    print(info)
                continue
            else:
                print(info)

            try:
                with urllib.request.urlopen(options.root + url) as source:
                    new_headers = {}
                    headers = source.info()

                    # stripping unneeded headers (XXX make this inplace?)
                    for header in headers:
                        if header in desc_fields:
                            if header == 'Pragma' and headers[header] != 'no-cache':
                                print('Pragma:', headers[header])
                            new_headers[header] = headers[header]
                        elif not header in ignore_fields:
                            print('Undefined header "', header, '": ', headers[header], sep='')

                    # comparing headers with data found in index
                    # if any header has changed (except Pragma) file is fully downloaded
                    # same if we get more or less headers
                    old_keys = set(index[url].keys())
                    old_keys.discard('_time')
                    old_keys.discard('Pragma')
                    more_keys = set(new_headers.keys()) - old_keys
                    more_keys.discard('Pragma')
                    less_keys = old_keys - set(new_headers.keys())
                    if len(more_keys) > 0:
                        if not len(old_keys) == 0:
                            print('More headers appear:', more_keys)
                        reload = True
                    elif len(less_keys) > 0:
                        print('Less headers appear:', less_keys)
                    else:
                        for key in index[url].keys():
                            if key[0] != '_' and key != 'Pragma' and not index[url][key] == new_headers[key]:
                                print('Header "', key, '" changed from [', index[url][key], '] to [', new_headers[key], ']', sep='')
                                reload = True

                    # downloading file
                    if reload:
                        if 'Content-Length' in headers:
                            print('Downloading', headers['Content-Length'], 'bytes [', end='')
                        else:
                            print('Downloading [', end='')
                        sys.stdout.flush()

                        # file is created at temporary location and moved in place only when download completes
                        temp_file = open(options.dir + os.sep + '.tmp', 'wb')
                        buffer = source.read(block_size)
                        megablocks = 0
                        blocks = 0
                        megs = 0
                        while len(buffer) > 0:
                            temp_file.write(buffer)
                            buffer = source.read(block_size)
                            blocks += 1
                            # crude progress meter: one dot per ~100KiB, a
                            # megabyte counter every ten dots
                            if blocks > 102400/block_size:
                                megablocks += 1
                                if megablocks > 10:
                                    megablocks = megablocks - 10
                                    megs += 1
                                    print('{}Mb'.format(megs), end='')
                                else:
                                    print('.', end='')
                                blocks = blocks - 102400/block_size
                            sys.stdout.flush()
                        temp_file.close()
                        print(']')
                        os.renames(options.dir + os.sep + '.tmp', file_name)

                        checked_files += 1

                    # storing new time mark and storing new headers
                    new_headers['_time'] = datetime.datetime.now()
                    index[url] = new_headers
                    index.sync()

            except urllib.error.HTTPError as error:
                # in case of error we don't need to do anything actually,
                # if file download stalls or fails the file would not be moved to it's location
                print(error)

        if options.verbose:
            print('[', len(unchecked_files), '/', checked_files, ']')

        # checking if there were any files downloaded, if yes - restarting sequence
        if checked_files == 0:
            break