Before (lines 39-86):
    this_line = log_line.match(line.strip())
    if this_line:
        unchecked_files.add(this_line.group(2))
for url in unchecked_files:
    reload = False
    recheck = False
    print('Checking file:', url)
    # creating an empty placeholder in the index
    if url not in index:
        print('This one is new.')
        index[url] = {}
        reload = True
    # building the file name from the url
    file_name = options.dir + url.replace('%20', ' ')
    # forcibly rechecking the file if it is not present on disk
    if not reload and not os.access(file_name, os.R_OK):
        print('File not found or inaccessible.')
        reload = True
    # forcibly rechecking the file if its size does not match the index data
    elif not reload and 'Content-Length' in index[url] and os.stat(file_name).st_size != int(index[url]['Content-Length']):
        print('File size is ', os.stat(file_name).st_size, ' and stored file size is ', index[url]['Content-Length'], '.', sep='')
        reload = True
    # forcibly rechecking the file if the index holds a Pragma: no-cache header
    if not reload and 'Pragma' in index[url] and index[url]['Pragma'] == 'no-cache':
        print('Pragma on: recheck imminent.')
        recheck = True
    # skipping the file if there is no need to recheck it and it was checked within the last 4 hours
    if not recheck and not reload and '__time__' in index[url] and datetime.datetime.now() - index[url]['__time__'] < datetime.timedelta(hours=4):
        continue
    try:
        with urllib.request.urlopen(options.root + url) as source:
            new_headers = {}
            headers = source.info()
            # stripping unneeded headers (XXX make this in place?)
            for header in headers:
                if header in desc_fields:
                    if header == 'Pragma' and headers[header] != 'no-cache':
                        print('Pragma:', headers[header])
                    new_headers[header] = headers[header]
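For context, the fragment leans on several names defined earlier in the script (log_line, index, desc_fields, options, unchecked_files, checked_files). Below is a minimal sketch of a setup that would make it run; every option name, pattern, and default is an assumption inferred from usage, not the original code:

import datetime
import os
import re
import shelve
import urllib.error
import urllib.request
from optparse import OptionParser

# hypothetical command-line options; the names mirror how the fragment uses them
parser = OptionParser()
parser.add_option('-d', '--dir', dest='dir', default='.', help='local mirror directory')
parser.add_option('-r', '--root', dest='root', default='http://example.com/', help='upstream root URL')
options, args = parser.parse_args()

# persistent per-URL header index; each value is a dict of stored response
# headers plus a '__time__' key holding the datetime of the last check
index = shelve.open('.index')

# response headers worth keeping for each file
desc_fields = ('Content-Length', 'Last-Modified', 'Pragma')

# illustrative access-log pattern: group(2) must capture the requested path
log_line = re.compile('^(\\S+) (\\S+)')

unchecked_files = set()
checked_files = 0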
After (lines 39-91):
    this_line = log_line.match(line.strip())
    if this_line:
        unchecked_files.add(this_line.group(2))
for url in unchecked_files:
    reload = False
    recheck = False
    info = 'Checking file: ' + url
    # creating an empty placeholder in the index
    if url not in index:
        info += '\nThis one is new.'
        index[url] = {}
        reload = True
    # building the file name from the url
    file_name = options.dir + url.replace('%20', ' ')
    # forcibly rechecking the file if it is not present on disk
    if not reload and not os.access(file_name, os.R_OK):
        info += '\nFile not found or inaccessible.'
        reload = True
    # forcibly rechecking the file if its size does not match the index data
    elif not reload and 'Content-Length' in index[url] and os.stat(file_name).st_size != int(index[url]['Content-Length']):
        info += '\nFile size is ' + str(os.stat(file_name).st_size) + ' and stored file size is ' + str(index[url]['Content-Length']) + '.'
        reload = True
    # forcibly rechecking the file if the index holds a Pragma: no-cache header
    if not reload and 'Pragma' in index[url] and index[url]['Pragma'] == 'no-cache':
        info += '\nPragma on: recheck imminent.'
        recheck = True
    if options.verbose:
        print(info)
    # skipping the file if there is no need to recheck it and it was checked within the last 4 hours
    if not recheck and not reload and '__time__' in index[url] and datetime.datetime.now() - index[url]['__time__'] < datetime.timedelta(hours=4):
        continue
    try:
        with urllib.request.urlopen(options.root + url) as source:
            new_headers = {}
            headers = source.info()
            if not options.verbose:
                print(info)
            # stripping unneeded headers (XXX make this in place?)
            for header in headers:
                if header in desc_fields:
                    if header == 'Pragma' and headers[header] != 'no-cache':
                        print('Pragma:', headers[header])
                    new_headers[header] = headers[header]
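The revision buffers the per-file messages in info and prints them either immediately (with --verbose) or only once a file is actually about to be fetched. The standard logging module gives the same two-level behaviour for free; a sketch assuming verbosity is a plain boolean, not the author's actual code:

import logging

def make_logger(verbose):
    # --verbose maps to DEBUG; the default level only lets through the
    # messages emitted when a file is actually going to be fetched
    logging.basicConfig(format='%(message)s',
                        level=logging.DEBUG if verbose else logging.INFO)
    return logging.getLogger('fetcher')

log = make_logger(verbose=False)
log.debug('Checking file: /dists/stable/Release')  # suppressed unless verbose
log.info('Fetching file: /dists/stable/Release')   # always printed

Mapping the flag to a log level keeps every call site unconditional, which is exactly what the hand-rolled info buffer is emulating.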
Before (lines 143-154):
            index.sync()
    except urllib.error.HTTPError as error:
        # in case of an error we don't actually need to do anything:
        # if the download stalls or fails, the file is never moved to its final location
        print(error)
    print('[', len(unchecked_files), '/', checked_files, ']')
# if no files were checked on this pass we are done, otherwise the whole sequence restarts
if checked_files == 0:
    break
After (lines 148-160):
            index.sync()
    except urllib.error.HTTPError as error:
        # in case of an error we don't actually need to do anything:
        # if the download stalls or fails, the file is never moved to its final location
        print(error)
    if options.verbose:
        print('[', len(unchecked_files), '/', checked_files, ']')
# if no files were checked on this pass we are done, otherwise the whole sequence restarts
if checked_files == 0:
    break
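The comment above is the whole error-handling strategy: a failed or stalled download is harmless because the file is only moved into place after the transfer completes. A sketch of that download-to-temp-and-rename pattern; fetch_atomically is a hypothetical name, not a function from the script:

import os
import shutil
import tempfile
import urllib.request

def fetch_atomically(url, destination):
    # download into a temporary file next to the target and rename it only
    # on success, so a stalled or failed transfer never leaves a truncated
    # file at the final path (os.replace is atomic within one filesystem)
    fd, temp_path = tempfile.mkstemp(dir=os.path.dirname(destination) or '.')
    try:
        with os.fdopen(fd, 'wb') as temp, urllib.request.urlopen(url) as source:
            shutil.copyfileobj(source, temp)
        os.replace(temp_path, destination)
    except BaseException:
        os.unlink(temp_path)
        raise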