Lines of
samesite.py
from check-in b0975a28fb
that are changed by the sequence of edits moving toward
check-in cab908195f:
1: #!/usr/bin/env python3.1
2:
3: import datetime, http.cookiejar, os, sys, shelve, spacemap, re, urllib.request
4:
class Config:
	"""Command-line and config-file handling for the caching proxy.

	Wraps a configparser.ConfigParser and exposes option lookup with
	per-section defaults via __getitem__ / section().
	"""
	__slots__ = frozenset(['_config', '_default', '_section', 'options', 'root'])
	# Built-in fallbacks: 'general' holds daemon-wide options, '_other'
	# holds defaults applied to any other (per-site) section.
	_default = {
		'general': {
			'port': '8008',
		},
		'_other': {
			'verbose': 'no',
			'noetag': 'no',
			'noparts': 'no',
		},}

	# function to read in config file
	def __init__(self):
		"""Parse the command line (-c/--config) and load the config file.

		Also normalises every non-'general' section: ensures it has a
		'dir' option (anchored at the config-file directory, trailing
		slash stripped) and a 'root' option (defaults to section name).
		"""
		import configparser, optparse

		parser = optparse.OptionParser()
		parser.add_option('-c', '--config', dest = 'config', help = 'config file location', metavar = 'FILE', default = 'samesite.conf')
		(self.options, args) = parser.parse_args()

		# NOTE(review): assert is stripped under python -O; raising an
		# explicit error would be more robust for input validation.
		assert os.access(self.options.config, os.R_OK), "Fatal error: can't read {}".format(self.options.config)

		# self.root = directory containing the config file, or cwd when
		# the config path has no directory component
		configDir = re.compile('^(.*)/[^/]+$').match(self.options.config)
		if configDir:
			self.root = configDir.group(1)
		else:
			self.root = os.getcwd()

		self._config = configparser.ConfigParser()
		# NOTE(review): the file object is never closed, and readfp() is
		# deprecated in favour of read_file() on Pythons newer than 3.1.
		self._config.readfp(open(self.options.config))

		for section in self._config.sections():
			if section != 'general':
				if self._config.has_option(section, 'dir'):
					# a literal '/' means "use <config root>/<section>"
					if re.compile('^/$').match(self._config.get(section, 'dir')):
						self._config.set(section, 'dir', self.root + os.sep + section)
					# strip one trailing slash if present
					thisDir = re.compile('^(.*)/$').match(self._config.get(section, 'dir'))
					if thisDir:
						self._config.set(section, 'dir', thisDir.group(1))
					# relative paths are anchored at the config root
					if not re.compile('^/(.*)$').match(self._config.get(section, 'dir')):
						self._config.set(section, 'dir', self.root + os.sep + self._config.get(section, 'dir'))
				else:
					self._config.set(section, 'dir', self.root + os.sep + section)

				# 'root' (the upstream host) defaults to the section name
				if not self._config.has_option(section, 'root'):
					self._config.set(section, 'root', section)

	# function to select config file section or create one
	def section(self, section):
		"""Make *section* the current section, creating it if missing."""
		if not self._config.has_section(section):
			self._config.add_section(section)
		self._section = section

	# function to get config parameter, if parameter doesn't exists the default
	# value or None is substituted
	def __getitem__(self, name):
		"""Return option *name* from the currently selected section.

		Missing options are first materialised in the parser from
		_default (the section's own table if it has one, else '_other'),
		falling back to None.
		NOTE(review): values are raw strings from the file ('yes'/'no'),
		so truth-testing the result directly is misleading: 'no' is truthy.
		"""
		if not self._config.has_option(self._section, name):
			if self._section in self._default:
				if name in self._default[self._section]:
					self._config.set(self._section, name, self._default[self._section][name])
				else:
					self._config.set(self._section, name, None)
			elif name in self._default['_other']:
				self._config.set(self._section, name, self._default['_other'][name])
			else:
				self._config.set(self._section, name, None)
		return(self._config.get(self._section, name))
72:
# Module-level singleton: parses sys.argv and reads the config file at
# import time (side effect - importing this module requires a readable
# config file or it asserts).
config = Config()

#assert options.port or os.access(options.log, os.R_OK), 'Log file unreadable'
76:
# Response headers that describe the cached entity itself: stored in the
# index and compared between checks to detect upstream changes.
const_desc_fields = {'Content-Length', 'Pragma', 'Last-Modified'}
# Response headers that are transport/proxy noise and are dropped.
const_ignore_fields = {
	'Accept-Ranges', 'Age', 'Cache-Control', 'Connection',
	'Content-Type', 'Date', 'Expires', 'Server', 'Via',
	'X-Cache', 'X-Cache-Lookup', 'X-Powered-By',
}

# Chunk size (bytes) used for all streaming reads and writes.
block_size = 4096
81:
# NOTE(review): the triple-quoted string below is a dead kqueue-based
# server prototype kept as a bare string expression (it is never executed
# or referenced); consider moving it to version-control history instead.
'''
# later, kqueue would be good but later
class Connection:
	__slots__ = frozenset(('__address', '__input', '__socket', '__status', 'error', 'method', 'url', 'http_version'))

	def __init__(self, socket, address):
		self.__address = address
		self.__input = b''
		self.__socket = socket
		self.__status = 0

	def read(self, kev):
		buffer = self.__socket.recv(kev.data)
		exhausted = False
		if len(buffer) == 0:
			eof = True
		else:
			self.__input += buffer
		while not exhausted:
			if self.__status == -1:
				exhausted = True
			elif self.__status == 0:
				endstring = self.__input.find(b'\n')
				if endstring > 0:
					print('Processing request line.')
					line = self.__input[:endstring].decode('ascii')
					self.__input = self.__input[endstring + 1:]
					isRequest = re.compile('(GET) ([^ ]+) HTTP/(1\.0)').match(line)
					if not isRequest:
						self.error = 'Not a HTTP connection.'
						self.__status = -1
					else:
						self.method = isRequest.group(1)
						self.url = isRequest.group(2)
						self.http_version = isRequest.group(3)
						self.__status = 1
				else:
					exhausted = True
			elif self.__status == 1:
				endstring = self.__input.find(b'\n')
				if endstring > 0:
					print('Processing header line.' + repr(self.__input))
					line = self.__input[:endstring].decode('ascii')
					self.__input = self.__input[endstring + 1:]
					isHeader = re.compile('([^:]*): +(.*)').match(line)
					if not isHeader:
						self.error = 'Bad header.'
						return(False)
					# process header here
				elif endstring == 0:
					self.__status = 2
				else:
					exhausted = True

	def write(self, kev):
		pass

if options.port:
	import select, socket

	sock = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
	try:
		sock.bind(('127.0.0.1', int(options.port)))
		sock.listen(-1)

		kq = select.kqueue()
		assert kq.fileno() != -1, "Fatal error: can't initialise kqueue."

		kq.control([select.kevent(sock, select.KQ_FILTER_READ, select.KQ_EV_ADD)], 0)
		timeout = None

		connections = {sock.fileno(): None}

		while True:
			kevs = kq.control(None, 1, timeout)

			for kev in kevs:
				if type(connections[kev.ident]) == Connection:
					print(kev.ident, kev.data, kev.filter, kev.flags)
					assert kev.data != 0, 'No data available.'
					if kev.filter == select.KQ_FILTER_READ:
						connections[kev.ident].read(kev)
					elif kev.filter == select.KQ_FILTER_WRITE:
						connections[kev.ident].write(kev)
					else:
						assert kev.filter in (select.KQ_FILTER_READ, select.KQ_FILTER_WRITE), 'Do we support other filters?'
				else:
					(conn, addr) = sock.accept()
					print('Connection from ' + repr(addr))
					kq.control([select.kevent(conn, select.KQ_FILTER_READ, select.KQ_EV_ADD)], 0)
					connections[conn.fileno()] = Connection(conn, addr)

				if kev.flags >> 15 == 1:
					kq.control([select.kevent(kev.ident, select.KQ_FILTER_READ, select.KQ_EV_DELETE)], 0)
					kq.control([select.kevent(kev.ident, select.KQ_FILTER_WRITE, select.KQ_EV_DELETE)], 0)
					del(connections[kev.ident])
	finally:
		sock.close()
'''
181:
# XXX how about rechecking files?
# NOTE(review): 'if True:' permanently selects the HTTP-server branch;
# the log-scanning 'else:' branch at the bottom of the file is unreachable.
if True:
	import http.server
185:
	class MyRequestHandler(http.server.BaseHTTPRequestHandler):
		"""Caching HTTP proxy handler.

		GET/HEAD requests are served from a per-site on-disk cache
		(directory from config['dir']); missing or stale content is
		fetched from the upstream host config['root'], with partial
		(Range) downloads tracked in a shelve index via spacemap.
		"""
		def __process(self):
			"""Serve one GET or HEAD request, refreshing the cache as needed.

			Side effects: may create the cache directory, download data
			into '<dir>/.parts<path>', update the shelve index, and move
			completed files into place.
			"""
			# reload means file needs to be reloaded to serve request
			reload = False
			# recheck means file needs to be checked, this also means that if file hav been modified we can serve older copy
			recheck = False
			# file_stat means file definitely exists
			file_stat = None
			# requested_ranges holds data about any range requested
			requested_ranges = None
			# records holds data from index locally, should be written back upon successfull completion
			record = None
			info = 'Checking file: ' + self.path

			# strip any '?query' suffix from the request path
			myPath = re.compile('^(.*?)(\?.*)$').match(self.path)
			if myPath:
				my_path = myPath.group(1)
			else:
				my_path = self.path

			# the Host header selects the per-site config section
			config.section(self.headers['Host'])

			if not os.access(config['dir'], os.X_OK):
				os.mkdir(config['dir'])
			# this is file index - everything is stored in this file
			# _parts - list of stored parts of file
			# _time - last time the file was checked
			# everything else is just the headers
			index = shelve.open(config['dir'] + os.sep + '.index')

			desc_fields = const_desc_fields.copy()
			ignore_fields = const_ignore_fields.copy()
			# NOTE(review): config values are strings; 'no' is truthy, so
			# this branch never runs and ETag is always ignored - confirm.
			if not config['noetag']:
				desc_fields.add('ETag')
			else:
				ignore_fields.add('ETag')

			# request headers that are silently dropped before proxying
			proxy_ignored = ('Accept', 'Accept-Encoding',
				'Cache-Control', 'Connection',
				'Host',
				'If-Modified-Since', 'If-Unmodified-Since',
				'User-Agent',
				'Via',
				'X-Forwarded-For',
			)

			print('===============[ {} request ]==='.format(self.command))

			for header in self.headers:
				if header in proxy_ignored:
					pass
				# NOTE(review): ('Range') is a plain string, not a tuple,
				# so this is a substring test; it matches 'Range' exactly
				# only by accident - should be ('Range',) or == 'Range'.
				elif header in ('Range'):
					isRange = re.compile('bytes=(\d+)-(\d+)').match(self.headers[header])
					if isRange:
						# SpaceMap end bounds are exclusive, HTTP's are inclusive
						requested_ranges = spacemap.SpaceMap({int(isRange.group(1)): int(isRange.group(2)) + 1})
					else:
						return()
				# NOTE(review): same substring-test issue as 'Range' above
				elif header in ('Pragma'):
					if my_path in index:
						index[my_path][header] = self.headers[header]
				else:
					print('Unknown header - ', header, ': ', self.headers[header], sep='')
					return()
				print(header, self.headers[header])

			# creating file name from my_path
			file_name = config['dir'] + os.sep + re.compile('%20').sub(' ', my_path)
			# partial file or unfinished download
			temp_name = config['dir'] + os.sep + '.parts' + re.compile('%20').sub(' ', my_path)

			# creating empty placeholder in index
			# if there's no space map and there's no file in real directory - we have no file
			# if there's an empty space map - file is full
			# space map generally covers every bit of file we don't posess currently
			if not my_path in index:
				info += '\nThis one is new.'
				reload = True
				record = {}
			else:
				# forcibly checking file if no file present
				if os.access(file_name, os.R_OK):
					file_stat = os.stat(file_name)
				elif '_parts' in index[my_path] and os.access(temp_name, os.R_OK):
					file_stat = os.stat(temp_name)
				else:
					info += '\nFile not found or inaccessible.'
					index[my_path]['_parts'] = None
					reload = True
				record = index[my_path]

			print(record)

			if not '_parts' in record:
				record['_parts'] = None

			if record['_parts'] == None:
				recheck = True

			# forcibly checking file if file size doesn't match with index data
			if not reload:
				if '_parts' in record and record['_parts'] == spacemap.SpaceMap():
					if 'Content-Length' in record and file_stat and file_stat.st_size != int(record['Content-Length']):
						info += '\nFile size is {} and stored file size is {}.'.format(file_stat.st_size, record['Content-Length'])
						record['_parts'] = None
						reload = True

			# forcibly checking file if index holds Pragma header
			if not reload and 'Pragma' in record and record['Pragma'] == 'no-cache':
				info +='\nPragma on: recheck imminent.'
				recheck = True

			# skipping file processing if there's no need to recheck it and we have checked it at least 4 hours ago
			# NOTE(review): this sets recheck=True when the file WAS checked
			# within the last 4 hours, which contradicts the comment above;
			# the condition or the flag looks inverted - confirm intent.
			if not recheck and not reload and '_time' in record and (datetime.datetime.now() - datetime.timedelta(hours = 4) - record['_time']).days < 0:
				recheck = True

			print(info)
			if reload or recheck:

				try:
					request = 'http://' + config['root'] + my_path
					needed = None
					# XXX and if we specify full file we don't go partial?
					if requested_ranges != None:
						# NOTE(review): 'no' is truthy, so config['noparts']
						# always takes the first branch here - confirm.
						if '_parts' in record and record['_parts'] != None:
							if config['noparts']:
								needed = record['_parts']
							else:
								needed = record['_parts'] | requested_ranges
						elif not config['noparts']:
							needed = requested_ranges
						ranges = ()
						print('Missing ranges: {}, requested ranges: {}, needed ranges: {}.'.format(record['_parts'], requested_ranges, needed))
						if needed != None and len(needed) > 0:
							needed.rewind()
							while True:
								range = needed.pop()
								if range[0] == None:
									break
								# back to HTTP's inclusive end bounds
								ranges += '{}-{}'.format(range[0], range[1] - 1),
							request = urllib.request.Request(request, headers = {'Range': 'bytes=' + ','.join(ranges)})

					with urllib.request.urlopen(request) as source:
						new_record = {}
						new_record['_parts'] = record['_parts']
						headers = source.info()

						# stripping unneeded headers (XXX make this inplace?)
						for header in headers:
							if header in desc_fields:
								#if header == 'Pragma' and headers[header] != 'no-cache':
								if header == 'Content-Length':
									# prefer the total size from Content-Range when present
									if 'Content-Range' not in headers:
										new_record[header] = int(headers[header])
								else:
									new_record[header] = headers[header]
							elif header == 'Content-Range':
								range = re.compile('^bytes (\d+)-(\d+)/(\d+)$').match(headers[header])
								if range:
									new_record['Content-Length'] = int(range.group(3))
								else:
									assert False, 'Content-Range unrecognized.'
							elif not header in ignore_fields:
								print('Undefined header "', header, '": ', headers[header], sep='')

						# comparing headers with data found in index
						# if any header has changed (except Pragma) file is fully downloaded
						# same if we get more or less headers
						old_keys = set(record.keys())
						old_keys.discard('_time')
						old_keys.discard('Pragma')
						more_keys = set(new_record.keys()) - old_keys
						more_keys.discard('Pragma')
						less_keys = old_keys - set(new_record.keys())
						if len(more_keys) > 0:
							if not len(old_keys) == 0:
								print('More headers appear:', more_keys)
							reload = True
						elif len(less_keys) > 0:
							print('Less headers appear:', less_keys)
						else:
							for key in record.keys():
								if key[0] != '_' and key != 'Pragma' and not record[key] == new_record[key]:
									print('Header "', key, '" changed from [', record[key], '] to [', new_record[key], ']', sep='')
									print(type(record[key]), type(new_record[key]))
									reload = True

						if reload:
							print('Reloading.')
							if os.access(temp_name, os.R_OK):
								os.unlink(temp_name)
							if os.access(file_name, os.R_OK):
								os.unlink(file_name)
							# whole file is missing: the space map covers everything
							new_record['_parts'] = spacemap.SpaceMap({0: int(new_record['Content-Length'])})
						print(new_record)

						# downloading file or segment
						if 'Content-Length' in new_record:
							if needed == None:
								needed = new_record['_parts']
							else:
								if len(needed) > 1:
									print("Multipart requests currently not supported.")
									assert False, 'Skip this one for now.'
						else:
							assert False, 'No Content-Length or Content-Range header.'

						new_record['_time'] = datetime.datetime.now()
						# NOTE(review): ('HEAD') is a string - substring test, see above
						if self.command not in ('HEAD'):
							# file is created at temporary location and moved in place only when download completes
							if not os.access(temp_name, os.R_OK):
								empty_name = config['dir'] + os.sep + '.tmp'
								with open(empty_name, 'w+b') as some_file:
									pass
								# renames() also creates intermediate directories
								os.renames(empty_name, temp_name)
							temp_file = open(temp_name, 'r+b')
							needed.rewind()
							while True:
								(start, end) = needed.pop()
								if start == None:
									break
								stream_last = start
								old_record = new_record
								if end - start < block_size:
									req_block_size = end - start
								else:
									req_block_size = block_size
								buffer = source.read(req_block_size)
								length = len(buffer)
								while length > 0 and stream_last < end:
									stream_pos = stream_last + length
									assert not stream_pos > end, 'Received more data then requested: pos:{} start:{} end:{}.'.format(stream_pos, start, end)
									temp_file.seek(stream_last)
									temp_file.write(buffer)
									# shrink the missing-parts map by what was just written
									new_record['_parts'] = new_record['_parts'] - spacemap.SpaceMap({stream_last: stream_pos})
									index[my_path] = old_record
									index.sync()
									old_record = new_record
									stream_last = stream_pos
									if end - stream_last < block_size:
										req_block_size = end - stream_last
									buffer = source.read(req_block_size)
									length = len(buffer)
							# moving downloaded data to real file
							temp_file.close()

					print(new_record)
					index[my_path] = new_record
					index.sync()

				except urllib.error.HTTPError as error:
					# in case of error we don't need to do anything actually,
					# if file download stalls or fails the file would not be moved to it's location
					print(error)

			# an empty space map means the download is complete
			if '_parts' in index[my_path] and index[my_path]['_parts'] == spacemap.SpaceMap():
				# just moving
				# drop old dirs XXX
				print('Moving temporary file to new destination.')
				os.renames(temp_name, file_name)

			if self.command == 'HEAD':
				self.send_response(200)
				if 'Content-Length' in index[my_path]:
					self.send_header('Content-Length', index[my_path]['Content-Length'])
				self.send_header('Accept-Ranges', 'bytes')
				self.send_header('Content-Type', 'application/octet-stream')
				if 'Last-Modified' in index[my_path]:
					self.send_header('Last-Modified', index[my_path]['Last-Modified'])
				self.end_headers()
			else:
				# serve from the partial file when download is incomplete
				if ('_parts' in index[my_path] and index[my_path]['_parts'] != spacemap.SpaceMap()) or not os.access(file_name, os.R_OK):
					file_name = temp_name

				with open(file_name, 'rb') as real_file:
					file_stat = os.stat(file_name)
					if 'Range' in self.headers:
						self.send_response(206)
						ranges = ()
						requested_ranges.rewind()
						while True:
							pair = requested_ranges.pop()
							if pair[0] == None:
								break
							ranges += '{}-{}'.format(pair[0], str(pair[1] - 1)),
						self.send_header('Content-Range', 'bytes {}/{}'.format(','.join(ranges), index[my_path]['Content-Length']))
					else:
						self.send_response(200)
						self.send_header('Content-Length', str(file_stat.st_size))
						requested_ranges = spacemap.SpaceMap({0: file_stat.st_size})
					# NOTE(review): KeyError if 'Last-Modified' was never
					# recorded for this path - confirm upstream always sends it
					self.send_header('Last-Modified', index[my_path]['Last-Modified'])
					self.send_header('Content-Type', 'application/octet-stream')
					self.end_headers()
					# NOTE(review): ('GET') is a string - substring test, see above
					if self.command in ('GET'):
						if len(requested_ranges) > 0:
							requested_ranges.rewind()
							(start, end) = requested_ranges.pop()
						else:
							start = 0
							end = index[my_path]['Content-Length']
						real_file.seek(start)
						if block_size > end - start:
							req_block_size = end - start
						else:
							req_block_size = block_size
						buffer = real_file.read(req_block_size)
						length = len(buffer)
						while length > 0:
							self.wfile.write(buffer)
							start += len(buffer)
							if req_block_size > end - start:
								req_block_size = end - start
							if req_block_size == 0:
								break
							buffer = real_file.read(req_block_size)
							length = len(buffer)

		def do_HEAD(self):
			"""Handle HEAD via the shared cache logic."""
			return self.__process()
		def do_GET(self):
			"""Handle GET via the shared cache logic."""
			return self.__process()
506:
	config.section('general')
	# listen on localhost only; port comes from [general] (default 8008)
	server = http.server.HTTPServer(('127.0.0.1', int(config['port'])), MyRequestHandler)
	server.serve_forever()
510:
# NOTE(review): this branch is unreachable (the gate above is 'if True:')
# and references names that no longer exist in this file ('options',
# 'index', 'desc_fields', 'ignore_fields') - it is a leftover from the
# pre-proxy, log-scanning mode and would crash if ever enabled.
else:
	while True:
		unchecked_files = set()
		checked_files = 0

		# reading log and storing found urls for processing
		# check file mtime XXX
		with open(options.log, 'r') as log_file:
			log_line = re.compile('^[^ ]+ - - \[.*] "(GET|HEAD) (.*?)(\?.*)? HTTP/1.1" (\d+) \d+ "(.*)" "(.*)"$')
			for line in log_file:
				this_line = log_line.match(line.strip())
				if this_line:
					unchecked_files.add(this_line.group(2))

		for url in unchecked_files:
			reload = False
			recheck = False
			info = 'Checking file: ' + url

			# creating empty placeholder in index
			if not url in index:
				info += '\nThis one is new.'
				index[url] = {}
				reload = True

			# creating file name from url
			file_name = options.dir + re.compile('%20').sub(' ', url)

			# forcibly checking file if no file present
			if not reload and not os.access(file_name, os.R_OK):
				info += '\nFile not found or inaccessible.'
				reload = True

			# forcibly checking file if file size doesn't match with index data
			elif not reload and 'Content-Length' in index[url] and os.stat(file_name).st_size != int(index[url]['Content-Length']):
				info += '\nFile size is ' + os.stat(file_name).st_size + ' and stored file size is ' + index[url]['Content-Length'] + '.'
				reload = True

			# forcibly checking file if index hods Pragma header
			if not reload and 'Pragma' in index[url] and index[url]['Pragma'] == 'no-cache':
				info +='\nPragma on: recheck imminent.'
				recheck = True

			# skipping file processing if there's no need to recheck it and we have checked it at least 4 hours ago
			if not recheck and not reload and (options.noupdate or ('_time' in index[url] and (datetime.datetime.now() - datetime.timedelta(hours = 4) - index[url]['_time']).days < 0)):
				if options.verbose:
					print(info)
				continue
			else:
				print(info)

			try:
				with urllib.request.urlopen(options.root + url) as source:
					new_headers = {}
					headers = source.info()

					# stripping unneeded headers (XXX make this inplace?)
					for header in headers:
						if header in desc_fields:
							if header == 'Pragma' and headers[header] != 'no-cache':
								print('Pragma:', headers[header])
							new_headers[header] = headers[header]
						elif not header in ignore_fields:
							print('Undefined header "', header, '": ', headers[header], sep='')

					# comparing headers with data found in index
					# if any header has changed (except Pragma) file is fully downloaded
					# same if we get more or less headers
					old_keys = set(index[url].keys())
					old_keys.discard('_time')
					old_keys.discard('Pragma')
					more_keys = set(new_headers.keys()) - old_keys
					more_keys.discard('Pragma')
					less_keys = old_keys - set(new_headers.keys())
					if len(more_keys) > 0:
						if not len(old_keys) == 0:
							print('More headers appear:', more_keys)
						reload = True
					elif len(less_keys) > 0:
						print('Less headers appear:', less_keys)
					else:
						for key in index[url].keys():
							if key[0] != '_' and key != 'Pragma' and not index[url][key] == new_headers[key]:
								print('Header "', key, '" changed from [', index[url][key], '] to [', new_headers[key], ']', sep='')
								reload = True

					# downloading file
					if reload:
						if 'Content-Length' in headers:
							print('Downloading', headers['Content-Length'], 'bytes [', end='')
						else:
							print('Downloading [', end='')
						sys.stdout.flush()

						# file is created at temporary location and moved in place only when download completes
						temp_file = open(options.dir + os.sep + '.tmp', 'wb')
						buffer = source.read(block_size)
						megablocks = 0
						blocks = 0
						megs = 0
						while len(buffer) > 0:
							temp_file.write(buffer)
							buffer = source.read(block_size)
							blocks += 1
							if blocks > 102400/block_size:
								megablocks += 1
								if megablocks > 10:
									megablocks = megablocks - 10
									megs += 1
									print('{}Mb'.format(megs), end='')
								else:
									print('.', end='')
								blocks = blocks - 102400/block_size
							sys.stdout.flush()
						temp_file.close()
						print(']')
						os.renames(options.dir + os.sep + '.tmp', file_name)

						checked_files += 1

					# storing new time mark and storing new headers
					new_headers['_time'] = datetime.datetime.now()
					index[url] = new_headers
					index.sync()

			except urllib.error.HTTPError as error:
				# in case of error we don't need to do anything actually,
				# if file download stalls or fails the file would not be moved to it's location
				print(error)

		if options.verbose:
			print('[', len(unchecked_files), '/', checked_files, ']')

		# checking if there were any files downloaded, if yes - restarting sequence
		if checked_files == 0:
			break