Index: samesite.py
==================================================================
--- samesite.py
+++ samesite.py
@@ -28,22 +28,24 @@
 # this is file index - everything is stored in this file
 index = shelve.open(options.dir + '/.index')
 
 desc_fields = ('Content-Length', 'ETag', 'Pragma', 'Last-Modified')
 ignore_fields = ('Accept-Ranges', 'Age', 'Cache-Control', 'Connection', 'Content-Type', 'Date', 'Expires', 'Server', 'Via', 'X-Cache', 'X-Cache-Lookup')
+block_size = 32768
+
 while True:
	unchecked_files = set()
	checked_files = 0
 
	# reading log and storing found urls for processing
	# check file mtime XXX
	with open(options.log, 'r') as log_file:
-		log_line = re.compile('^[^ ]+ - - \[.*] "GET (.*?)(\?.*)? HTTP/1.1" (\d+) \d+ "(.*)" "(.*)"$')
+		log_line = re.compile('^[^ ]+ - - \[.*] "(GET|HEAD) (.*?)(\?.*)? HTTP/1.1" (\d+) \d+ "(.*)" "(.*)"$')
		for line in log_file:
			this_line = log_line.match(line.strip())
			if this_line:
-				unchecked_files.add(this_line.group(1))
+				unchecked_files.add(this_line.group(2))
 
	for url in unchecked_files:
 
		# creating empty placeholder in index
		if not url in index:
@@ -80,11 +82,11 @@
				if header in desc_fields:
					if header == 'Pragma' and headers[header] != 'no-cache':
						print('Pragma:', headers[header])
					new_headers[header] = headers[header]
				elif not header in ignore_fields:
-					print('Undefined header', header, ':', headers[header])
+					print('Undefined header "', header, '": ', headers[header], sep='')
 
		# comparing headers with data found in index
		# if any header has changed (except Pragma) file is fully downloaded
		# same if we get more or less headers
		old_keys = set(index[url].keys())
@@ -100,11 +102,11 @@
				print('Less headers appear:', less_keys)
				reload = True
			else:
				for key in index[url].keys():
					if key not in ('__time__', 'Pragma') and not index[url][key] == new_headers[key]:
-						print('Header', key, 'changed from', index[url][key], 'to', new_headers[key])
+						print('Header "', key, '" changed from [', index[url][key], '] to [', new_headers[key], ']', sep='')
						reload = True
 
		# downloading file
		if reload:
			if 'Content-Length' in headers:
@@ -113,25 +115,33 @@
				print('Downloading [', end='')
				sys.stdout.flush()
 
				# file is created at temporary location and moved in place only when download completes
				temp_file = open(options.dir + '/.tmp', 'wb')
-				buffer = source.read(4096)
+				buffer = source.read(block_size)
+				blocks = 0
+				megs = 0
				while len(buffer) > 0:
					temp_file.write(buffer)
					print('.', end='')
					sys.stdout.flush()
-					buffer = source.read(4096)
+					buffer = source.read(block_size)
+					blocks += 1
+					if blocks > 1024*1024/block_size:
+						blocks = blocks - 1024*1024/block_size
+						megs += 1
+						print('{}Mb'.format(megs), end='')
				temp_file.close()
				print(']')
				os.renames(options.dir + '/.tmp', file_name)
				checked_files += 1
 
				# storing new time mark and storing new headers
				new_headers['__time__'] = datetime.datetime.now()
				index[url] = new_headers
+				index.sync()
 
		except urllib.error.HTTPError as error:
			# in case of error we don't need to do anything actually,
			# if file download stalls or fails the file would not be moved to it's location
			print(error)
 