Lines of samesite.py from check-in 08ae38b6ce that are changed by the sequence of edits moving toward check-in 7b27f1db02:
1: #!/usr/bin/env python3.1
2:
3: import datetime, http.cookiejar, optparse, os, sys, shelve, re, urllib.request
4:
5: parser = optparse.OptionParser()
6: parser.add_option('-v', '--verbose', action = 'store_true', dest = 'verbose', help = 'turns on verbose status notifications', metavar = 'bool', default = False)
7: parser.add_option('-d', '--dir', action = 'store', dest = 'dir', help = 'specify directory where the files should be stored', metavar = 'string', default = None)
8: parser.add_option('-r', '--root', action = 'store', dest = 'root', help = 'specify a site from which data should be mirrored', metavar = 'string', default = None)
9: parser.add_option('-l', '--log', action = 'store', dest = 'log', help = 'specify a log file to process', metavar = 'string', default = None)
10: (options, args) = parser.parse_args()
11:
12: if not options.dir:
13:     print('Directory not specified')
14:     exit(1)
15:
16: if not options.root:
17:     print('Server not specified')
18:     exit(1)
19:
20: if not options.log:
21:     print('Log file not specified')
22:     exit(1)
23:
24: if not os.access(options.log, os.R_OK):
25:     print('Log file unreadable')
26:     exit(1)
27:
28: # this is the file index - everything is stored in this file
29: index = shelve.open(options.dir + '/.index')
30: desc_fields = ('Content-Length', 'ETag', 'Pragma', 'Last-Modified')
31: ignore_fields = ('Accept-Ranges', 'Age', 'Cache-Control', 'Connection', 'Content-Type', 'Date', 'Expires', 'Server', 'Via', 'X-Cache', 'X-Cache-Lookup')
32:
33: while True:
34:     unchecked_files = set()
35:     checked_files = 0
36:
37:     # reading log and storing found urls for processing
38:     # check file mtime XXX
39:     with open(options.log, 'r') as log_file:
08ae38b6ce 2010-06-25 40:         log_line = re.compile('^[^ ]+ - - \[.*] "GET (.*?)(\?.*)? HTTP/1.1" (\d+) \d+ "(.*)" "(.*)"$')
41:         for line in log_file:
42:             this_line = log_line.match(line.strip())
43:             if this_line:
08ae38b6ce 2010-06-25 44:                 unchecked_files.add(this_line.group(1))
45:
46:     for url in unchecked_files:
47:
48:         # creating an empty placeholder in the index
49:         if not url in index:
50:             index[url] = {}
51:         reload = False
52:
53:         # creating the file name from the url
54:         file_name = options.dir + re.compile('%20').sub(' ', url)
55:
56:         # forcibly checking the file if no file is present
57:         if not os.access(file_name, os.R_OK):
58:             reload = True
59:
60:         # forcibly checking the file if the file size doesn't match the index data
61:         elif 'Content-Length' in index[url] and os.stat(file_name).st_size != int(index[url]['Content-Length']):
62:             print('File size is', os.stat(file_name).st_size, 'and stored file size is', index[url]['Content-Length'])
63:             reload = True
64:
65:         # forcibly checking the file if the index holds a Pragma header
66:         if 'Pragma' in index[url] and index[url]['Pragma'] == 'no-cache':
67:             reload = True
68:
69:         # skipping file processing if there's no need to recheck it and it was checked less than 4 hours ago
70:         if not reload and '__time__' in index[url] and (datetime.datetime.now() - datetime.timedelta(hours = 4) - index[url]['__time__']).days < 0:
71:             continue
72:         print('Checking file:', url)
73:         try:
74:             with urllib.request.urlopen(options.root + url) as source:
75:                 new_headers = {}
76:                 headers = source.info()
77:
78:                 # stripping unneeded headers (XXX make this in place?)
79:                 for header in headers:
80:                     if header in desc_fields:
81:                         if header == 'Pragma' and headers[header] != 'no-cache':
82:                             print('Pragma:', headers[header])
83:                         new_headers[header] = headers[header]
84:                     elif not header in ignore_fields:
08ae38b6ce 2010-06-25 85:                         print('Undefined header', header, ':', headers[header])
86:
87:                 # comparing headers with the data found in the index
88:                 # if any header has changed (except Pragma) the file is downloaded in full
89:                 # the same happens if we get more or fewer headers
90:                 old_keys = set(index[url].keys())
91:                 old_keys.discard('__time__')
92:                 old_keys.discard('Pragma')
93:                 more_keys = set(new_headers.keys()) - old_keys
94:                 more_keys.discard('Pragma')
95:                 less_keys = old_keys - set(new_headers.keys())
96:                 if len(more_keys) > 0:
97:                     print('More headers appear:', more_keys)
98:                     reload = True
99:                 elif len(less_keys) > 0:
100:                     print('Less headers appear:', less_keys)
101:                     reload = True
102:                 else:
103:                     for key in index[url].keys():
104:                         if key not in ('__time__', 'Pragma') and not index[url][key] == new_headers[key]:
08ae38b6ce 2010-06-25 105:                             print('Header', key, 'changed from', index[url][key], 'to', new_headers[key])
106:                             reload = True
107:
108:                 # downloading file
109:                 if reload:
110:                     if 'Content-Length' in headers:
111:                         print('Downloading', headers['Content-Length'], 'bytes [', end='')
112:                     else:
113:                         print('Downloading [', end='')
114:                     sys.stdout.flush()
115:
116:                     # the file is created at a temporary location and moved into place only when the download completes
117:                     temp_file = open(options.dir + '/.tmp', 'wb')
08ae38b6ce 2010-06-25 118:                     buffer = source.read(4096)
119:                     while len(buffer) > 0:
120:                         temp_file.write(buffer)
121:                         print('.', end='')
122:                         sys.stdout.flush()
08ae38b6ce 2010-06-25 123:                         buffer = source.read(4096)
124:                     temp_file.close()
125:                     print(']')
126:                     os.renames(options.dir + '/.tmp', file_name)
127:
128:                 checked_files += 1
129:
130:                 # storing a new time mark and the new headers
131:                 new_headers['__time__'] = datetime.datetime.now()
132:                 index[url] = new_headers
133:
134:         except urllib.error.HTTPError as error:
135:             # in case of an error we don't actually need to do anything:
136:             # if the file download stalls or fails, the file will not be moved to its location
137:             print(error)
138:
139:     print('[', len(unchecked_files), '/', checked_files, ']')
140:
141:     # checking whether any files were downloaded; if so, restarting the sequence
142:     if checked_files == 0:
143:         break
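
As a quick orientation for the loop above, here is a minimal, self-contained sketch of its two less obvious pieces: the log-parsing regular expression from line 40 and the four-hour freshness test from line 70. The sample access-log entry and the one-hour-old time mark below are hypothetical values chosen only to illustrate the expected shapes; in the script itself they come from the web server log and the shelve index.

import datetime, re

# the same pattern as on line 40; the sample entry is a made-up combined-log-format line
log_line = re.compile('^[^ ]+ - - \[.*] "GET (.*?)(\?.*)? HTTP/1.1" (\d+) \d+ "(.*)" "(.*)"$')
sample = '192.0.2.1 - - [25/Jun/2010:12:00:00 +0000] "GET /images/logo.png?v=2 HTTP/1.1" 200 4096 "-" "Mozilla/5.0"'
this_line = log_line.match(sample)
print(this_line.group(1))    # '/images/logo.png' - the query string (group 2) is discarded

# the freshness test from line 70: the difference is negative only when the stored
# '__time__' mark is newer than "now minus four hours", i.e. the URL was checked
# within the last four hours and is therefore skipped on this pass
last_checked = datetime.datetime.now() - datetime.timedelta(hours = 1)    # hypothetical stored mark
print((datetime.datetime.now() - datetime.timedelta(hours = 4) - last_checked).days < 0)    # True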