#!/usr/bin/env python3.1
"""Mirror the files referenced in an HTTP access log.

The script reads an access log, collects every path requested with GET or
HEAD, and makes sure a copy of each path fetched from the ``--root`` server
exists under the ``--dir`` directory.  Per-file metadata (selected response
headers plus the time of the last check) is kept in a shelve index stored in
``<dir>/.index``.  The log is re-read and re-processed until a pass completes
without checking any file.
"""

import datetime
import http.cookiejar
import optparse
import os
import re
import shelve
import sys
import urllib.error
import urllib.request

# Response headers that describe a file: they are stored in the index and
# compared between checks to decide whether the file has to be re-downloaded.
desc_fields = ('Content-Length', 'ETag', 'Pragma', 'Last-Modified')
# Response headers that are known but deliberately not tracked.
ignore_fields = ('Accept-Ranges', 'Age', 'Cache-Control', 'Connection', 'Content-Type', 'Date', 'Expires', 'Server', 'Via', 'X-Cache', 'X-Cache-Lookup')

# Size of a single read from the server, in bytes.
block_size = 32768
# Number of blocks after which a "<N>Mb" progress marker is printed.
blocks_per_meg = 1024 * 1024 // block_size
# A file that needs no reload is not re-checked more often than this.
recheck_interval = datetime.timedelta(hours=4)

# One access log entry; group 2 is the requested path without the query string.
# Compiled once (raw string, so the backslash escapes reach the regex engine as written).
log_line = re.compile(r'^[^ ]+ - - \[.*] "(GET|HEAD) (.*?)(\?.*)? HTTP/1.1" (\d+) \d+ "(.*)" "(.*)"$')


def parse_options(argv=None):
    """Parse command line options; ``argv=None`` means ``sys.argv[1:]``."""
    parser = optparse.OptionParser()
    parser.add_option('-v', '--verbose', action = 'store_true', dest = 'verbose', help = 'turns on verbose status notifications', metavar = 'bool', default = False)
    parser.add_option('-d', '--dir', action = 'store', dest = 'dir', help = 'specify directory where the files should be stored', metavar = 'string', default = None)
    parser.add_option('-r', '--root', action = 'store', dest = 'root', help = 'specify a site from which data should be mirrored', metavar = 'string', default = None)
    parser.add_option('-l', '--log', action = 'store', dest = 'log', help = 'specify a log file to process', metavar = 'string', default = None)
    options, _args = parser.parse_args(argv)
    return options


def validate_options(options):
    """Return a message describing the first invalid option, or None if all are usable."""
    if not options.dir:
        return 'Directory not specified'
    if not options.root:
        return 'Server not specified'
    if not options.log:
        return 'Log file not specified'
    if not os.access(options.log, os.R_OK):
        return 'Log file unreadable'
    return None


def read_urls(log_path):
    """Return the set of paths requested with GET or HEAD in the access log."""
    urls = set()
    with open(log_path, 'r') as log_file:
        for line in log_file:
            found = log_line.match(line.strip())
            if found:
                urls.add(found.group(2))
    return urls


def local_path(directory, url):
    """Return the file name under ``directory`` that mirrors ``url``.

    Only ``%20`` is decoded, so files that are already mirrored keep their names.
    """
    return directory + url.replace('%20', ' ')


def inside_directory(directory, path):
    """Return True when ``path`` resolves to ``directory`` itself or to something below it."""
    base = os.path.abspath(directory)
    target = os.path.abspath(path)
    return target == base or target.startswith(os.path.join(base, ''))


def needs_reload(file_name, stored):
    """Return True when the local copy alone already shows that a download is required."""
    reload = False

    # forcibly checking file if no file present
    if not os.access(file_name, os.R_OK):
        reload = True

    # forcibly checking file if file size doesn't match with index data
    elif 'Content-Length' in stored:
        size = os.stat(file_name).st_size
        if size != int(stored['Content-Length']):
            print('File size is', size, 'and stored file size is', stored['Content-Length'])
            reload = True

    # forcibly checking file if index holds Pragma: no-cache
    if stored.get('Pragma') == 'no-cache':
        reload = True

    return reload


def checked_recently(stored, now=None):
    """Return True when the index entry was refreshed less than ``recheck_interval`` ago."""
    if '__time__' not in stored:
        return False
    if now is None:
        now = datetime.datetime.now()
    return now - stored['__time__'] < recheck_interval


def describe_headers(headers):
    """Return the subset of ``headers`` listed in ``desc_fields``.

    Headers that are neither tracked nor listed in ``ignore_fields`` are reported.
    """
    kept = {}
    for header in headers:
        if header in desc_fields:
            if header == 'Pragma' and headers[header] != 'no-cache':
                print('Pragma:', headers[header])
            kept[header] = headers[header]
        elif header not in ignore_fields:
            print('Undefined header "', header, '": ', headers[header], sep='')
    return kept


def headers_differ(old_headers, new_headers):
    """Return True when the tracked headers changed since the last check.

    ``__time__`` and ``Pragma`` are never compared.  A header that appeared or
    disappeared counts as a change, as does any header whose value differs.
    """
    old_keys = set(old_headers) - {'__time__', 'Pragma'}
    new_keys = set(new_headers)
    more_keys = new_keys - old_keys
    more_keys.discard('Pragma')
    less_keys = old_keys - new_keys

    if more_keys:
        print('More headers appear:', more_keys)
        return True
    if less_keys:
        print('Less headers appear:', less_keys)
        return True

    changed = False
    for key in sorted(old_keys):
        if old_headers[key] != new_headers[key]:
            print('Header "', key, '" changed from [', old_headers[key], '] to [', new_headers[key], ']', sep='')
            changed = True
    return changed


def download(source, temp_name):
    """Copy everything readable from ``source`` into the file ``temp_name``.

    A dot is printed for every block written and a "<N>Mb" marker after every
    full megabyte.  The file is closed even if reading or writing fails.
    """
    blocks = 0
    megs = 0
    with open(temp_name, 'wb') as temp_file:
        buffer = source.read(block_size)
        while buffer:
            temp_file.write(buffer)
            print('.', end='')
            sys.stdout.flush()
            blocks += 1
            if blocks >= blocks_per_meg:
                blocks = 0
                megs += 1
                print('{}Mb'.format(megs), end='')
            buffer = source.read(block_size)


def check_file(options, index, url):
    """Check one URL against the server and download it when needed.

    Returns True when the server was queried successfully and the index entry
    was refreshed; False when the URL was skipped or the request failed.
    """
    file_name = local_path(options.dir, url)

    # URLs come from the access log, i.e. from whatever clients requested;
    # never write outside of the mirror directory.
    if not inside_directory(options.dir, file_name):
        print('Skipping file outside of mirror directory:', url)
        return False

    # creating empty placeholder in index
    if url not in index:
        index[url] = {}
    stored = index[url]

    reload = needs_reload(file_name, stored)

    # skipping file processing if there's no need to reload it and it was
    # checked less than 4 hours ago
    if not reload and checked_recently(stored):
        return False

    print('Checking file:', url)
    try:
        with urllib.request.urlopen(options.root + url) as source:
            headers = source.info()
            new_headers = describe_headers(headers)

            # if any tracked header has changed the file is fully downloaded
            if headers_differ(stored, new_headers):
                reload = True

            if reload:
                if 'Content-Length' in headers:
                    print('Downloading', headers['Content-Length'], 'bytes [', end='')
                else:
                    print('Downloading [', end='')
                sys.stdout.flush()

                # file is created at temporary location and moved in place
                # only when download completes
                temp_name = options.dir + '/.tmp'
                download(source, temp_name)
                print(']')
                os.renames(temp_name, file_name)

            # storing new time mark and storing new headers
            new_headers['__time__'] = datetime.datetime.now()
            index[url] = new_headers
            index.sync()
            return True

    except urllib.error.URLError as error:
        # HTTPError is a subclass of URLError, so both HTTP error statuses and
        # connection level failures end up here.  Nothing has to be undone:
        # a failed download is never moved to its final location.
        print(error)
        return False


def mirror(options, index):
    """Process the log repeatedly until a pass checks no file at all."""
    while True:
        # reading log and storing found urls for processing
        unchecked_files = read_urls(options.log)
        checked_files = 0

        for url in unchecked_files:
            if check_file(options, index, url):
                checked_files += 1

        print('[', len(unchecked_files), '/', checked_files, ']')

        # if any file was checked the sequence is restarted, otherwise we are done
        if checked_files == 0:
            break


def main(argv=None):
    """Run the mirror; return the process exit status (0 on success, 1 on bad options)."""
    options = parse_options(argv)
    problem = validate_options(options)
    if problem:
        print(problem)
        return 1

    # this is file index - everything is stored in this file
    index = shelve.open(options.dir + '/.index')
    try:
        mirror(options, index)
    finally:
        index.close()
    return 0


if __name__ == '__main__':
    sys.exit(main())