50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
|
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
|
+
+
+
-
+
|
# creating empty placeholder in index
if not url in index:
index[url] = {}
reload = False
# creating file name from url
file_name = options.dir + re.compile('%20').sub(' ', url)
print('Checking file:', url)
# forcibly checking file if no file present
if not os.access(file_name, os.R_OK):
print('File not found or inaccessible.')
reload = True
# forcibly checking file if file size doesn't match with index data
elif 'Content-Length' in index[url] and os.stat(file_name).st_size != int(index[url]['Content-Length']):
print('File size is', os.stat(file_name).st_size, 'and stored file size is', index[url]['Content-Length'])
reload = True
# forcibly checking file if index holds a 'Pragma: no-cache' header
if 'Pragma' in index[url] and index[url]['Pragma'] == 'no-cache':
print('Pragma on: recheck iminent.')
reload = True
# skipping file processing if there's no need to recheck it and we have checked it less than 4 hours ago
if not reload and '__time__' in index[url] and (datetime.datetime.now() - datetime.timedelta(hours = 4) - index[url]['__time__']).days < 0:
continue
print('Checking file:', url)
try:
print('Contacting website.')
with urllib.request.urlopen(options.root + url) as source:
new_headers = {}
headers = source.info()
# stripping unneeded headers (XXX make this inplace?)
for header in headers:
if header in desc_fields:
|
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
|
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
|
+
+
+
-
+
|
old_keys = set(index[url].keys())
old_keys.discard('__time__')
old_keys.discard('Pragma')
more_keys = set(new_headers.keys()) - old_keys
more_keys.discard('Pragma')
less_keys = old_keys - set(new_headers.keys())
if len(more_keys) > 0:
if len(old_keys) == 0:
print('No data on that file yet.')
else:
print('More headers appear:', more_keys)
print('More headers appear:', more_keys)
reload = True
elif len(less_keys) > 0:
print('Less headers appear:', less_keys)
reload = True
else:
for key in index[url].keys():
if key not in ('__time__', 'Pragma') and not index[url][key] == new_headers[key]:
|