Lines of samesite.py from check-in 827033dd7e that are changed by the sequence of edits moving toward check-in 80f8e3804a:
#!/usr/bin/env python3.1

import datetime, http.cookiejar, optparse, os, sys, shelve, re, urllib.error, urllib.request

parser = optparse.OptionParser()
parser.add_option('-v', '--verbose', action = 'store_true', dest = 'verbose', help = 'turns on verbose status notifications', metavar = 'bool', default = False)
parser.add_option('-d', '--dir', action = 'store', dest = 'dir', help = 'specify directory where the files should be stored', metavar = 'string', default = None)
parser.add_option('-r', '--root', action = 'store', dest = 'root', help = 'specify a site from which data should be mirrored', metavar = 'string', default = None)
parser.add_option('-l', '--log', action = 'store', dest = 'log', help = 'specify a log file to process', metavar = 'string', default = None)
parser.add_option('-e', '--skip-etag', action = 'store_true', dest = 'noetag', help = 'do not process etags', metavar = 'bool', default = False)
(options, args) = parser.parse_args()
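# hypothetical invocation (illustration only, not taken from the source):
#   ./samesite.py -d /srv/mirror -r http://example.org -l /var/log/apache2/access.log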

assert options.dir, 'Directory not specified'
assert options.root, 'Server not specified'
assert options.log, 'Log file not specified'
assert os.access(options.log, os.R_OK), 'Log file unreadable'

# this is the file index - everything is stored in this file
index = shelve.open(options.dir + '/.index')
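# desc_fields are recorded in the index and compared between runs;
# ignore_fields are known headers that are deliberately not tracked;
# anything else the server sends is reported as an undefined header below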
desc_fields = ('Content-Length', 'Pragma', 'Last-Modified')
ignore_fields = ('Accept-Ranges', 'Age', 'Cache-Control', 'Connection', 'Content-Type', 'Date', 'Expires', 'Server', 'Via', 'X-Cache', 'X-Cache-Lookup', 'X-Powered-By')

if not options.noetag:
    desc_fields += 'ETag',
else:
    ignore_fields += 'ETag',

block_size = 32768

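# the scan below repeats until a pass completes without checking any file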
while True:
    unchecked_files = set()
    checked_files = 0

    # reading log and storing found urls for processing
    # check file mtime XXX
    with open(options.log, 'r') as log_file:
        log_line = re.compile(r'^[^ ]+ - - \[.*\] "(GET|HEAD) (.*?)(\?.*)? HTTP/1\.1" (\d+) \d+ "(.*)" "(.*)"$')
        for line in log_file:
            this_line = log_line.match(line.strip())
            if this_line:
                unchecked_files.add(this_line.group(2))

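    # every unique request path found in the log is now checked against the index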
    for url in unchecked_files:
        reload = False
        recheck = False
        info = 'Checking file: ' + url

        # creating empty placeholder in index
        if url not in index:
            info += '\nThis one is new.'
            index[url] = {}
            reload = True

        # creating file name from url (only the %20 escape is decoded)
        file_name = options.dir + url.replace('%20', ' ')

        # forcibly checking file if no file present
        if not reload and not os.access(file_name, os.R_OK):
            info += '\nFile not found or inaccessible.'
            reload = True

        # forcibly checking file if file size doesn't match the index data
        elif not reload and 'Content-Length' in index[url] and os.stat(file_name).st_size != int(index[url]['Content-Length']):
            info += '\nFile size is ' + str(os.stat(file_name).st_size) + ' and stored file size is ' + index[url]['Content-Length'] + '.'
            reload = True

        # forcibly rechecking file if the index holds a no-cache Pragma header
        if not reload and 'Pragma' in index[url] and index[url]['Pragma'] == 'no-cache':
            info += '\nPragma on: recheck imminent.'
            recheck = True

        if options.verbose:
            print(info)

        # skipping file processing if there's no need to recheck it and it was checked less than 4 hours ago
        if not recheck and not reload and '__time__' in index[url] and (datetime.datetime.now() - datetime.timedelta(hours = 4) - index[url]['__time__']).days < 0:
            continue

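        # ask the mirrored site for the file; the body is only read when a download is needed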
        try:
            with urllib.request.urlopen(options.root + url) as source:
                new_headers = {}
                headers = source.info()
                if not options.verbose:
                    print(info)

                # stripping unneeded headers (XXX make this inplace?)
                for header in headers:
                    if header in desc_fields:
                        if header == 'Pragma' and headers[header] != 'no-cache':
                            print('Pragma:', headers[header])
                        new_headers[header] = headers[header]
                    elif header not in ignore_fields:
                        print('Undefined header "', header, '": ', headers[header], sep='')

                # comparing headers with data found in index
                # if any header except Pragma has changed, the file is fully downloaded
                # same if we get more or fewer headers
                old_keys = set(index[url].keys())
                old_keys.discard('__time__')
                old_keys.discard('Pragma')
                more_keys = set(new_headers.keys()) - old_keys
                more_keys.discard('Pragma')
                less_keys = old_keys - set(new_headers.keys())
                if len(more_keys) > 0:
                    if len(old_keys) != 0:
                        print('More headers appear:', more_keys)
                    reload = True
                elif len(less_keys) > 0:
                    print('Fewer headers appear:', less_keys)
                else:
                    for key in index[url].keys():
                        if key not in ('__time__', 'Pragma') and index[url][key] != new_headers[key]:
                            print('Header "', key, '" changed from [', index[url][key], '] to [', new_headers[key], ']', sep='')
                            reload = True

                # downloading file
                if reload:
                    if 'Content-Length' in headers:
                        print('Downloading', headers['Content-Length'], 'bytes [', end='')
                    else:
                        print('Downloading [', end='')
                    sys.stdout.flush()

                    # the file is created at a temporary location and moved into place only when the download completes
                    temp_file = open(options.dir + '/.tmp', 'wb')
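                    # progress indicator: one dot per block read, plus a running megabyte counter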
                    buffer = source.read(block_size)
                    blocks = 0
                    megs = 0
                    while len(buffer) > 0:
                        temp_file.write(buffer)
                        print('.', end='')
                        sys.stdout.flush()
                        buffer = source.read(block_size)
                        blocks += 1
                        if blocks > 1024*1024 // block_size:
                            blocks -= 1024*1024 // block_size
                            megs += 1
                            print('{}Mb'.format(megs), end='')
                    temp_file.close()
                    print(']')
                    os.renames(options.dir + '/.tmp', file_name)

                checked_files += 1

                # storing new time mark and new headers
                new_headers['__time__'] = datetime.datetime.now()
                index[url] = new_headers
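                # sync the shelve right away so the updated entry survives an interrupted run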
                index.sync()

        except urllib.error.HTTPError as error:
            # in case of an error there is nothing to do here:
            # if a download stalls or fails, the file is simply never moved to its final location
            print(error)

    if options.verbose:
        print('[', len(unchecked_files), '/', checked_files, ']')

    # if nothing was checked on this pass the mirror is up to date; otherwise restart the sequence
    if checked_files == 0:
        break
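
For reference, here is how the log-parsing regular expression above behaves on a hypothetical access-log line in combined format (the host, path, and user-agent are made up for illustration):

import re

log_line = re.compile(r'^[^ ]+ - - \[.*\] "(GET|HEAD) (.*?)(\?.*)? HTTP/1\.1" (\d+) \d+ "(.*)" "(.*)"$')
sample = '192.0.2.1 - - [04/Aug/2010:12:00:00 +0000] "GET /files/image%20one.png?ver=2 HTTP/1.1" 200 4096 "-" "Mozilla/5.0"'

match = log_line.match(sample)
print(match.group(1))   # GET
print(match.group(2))   # /files/image%20one.png -- the query string lands in group 3
print(match.group(4))   # 200 -- the status code, which the mirror currently ignores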