#!/usr/bin/env python3
import datetime, os, re, shelve, spacemap, sys, urllib.error, urllib.request
class Config:
__slots__ = frozenset(['_config', '_default', '_section', 'options', 'root'])
_default = {
'general': {
'port': '8008',
},
'_other': {
'verbose': 'no',
'noetag': 'no',
'noparts': 'no',
'strip': '',
'sub': '',
},}
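# An illustrative samesite.conf (the section and path names are made up;
# the option names are the ones actually read below):
#
#   [general]
#   port = 8008
#
#   [mirror.example.org]
#   dir = mirror.example.org
#   root = mirror.example.org
#   noetag = no
#   noparts = no
#   strip = ^/mirror
#   sub =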
# function to read in config file
def __init__(self):
import configparser, optparse
parser = optparse.OptionParser()
parser.add_option('-c', '--config', dest = 'config', help = 'config file location', metavar = 'FILE', default = 'samesite.conf')
(self.options, args) = parser.parse_args()
assert os.access(self.options.config, os.R_OK), "Fatal error: can't read {}".format(self.options.config)
config_dir = os.path.dirname(self.options.config)
self.root = config_dir if config_dir else os.getcwd()
# allow_no_value lets absent parameters be stored as None in __getitem__
self._config = configparser.ConfigParser(allow_no_value = True)
with open(self.options.config) as config_file:
self._config.read_file(config_file)
for section in self._config.sections():
if section != 'general':
if self._config.has_option(section, 'dir'):
# a bare '/' would mirror into the filesystem root; use root/section instead
if self._config.get(section, 'dir') == '/':
self._config.set(section, 'dir', self.root + os.sep + section)
# drop any trailing slash
self._config.set(section, 'dir', self._config.get(section, 'dir').rstrip('/'))
# relative directories are anchored at the config file location
if not self._config.get(section, 'dir').startswith('/'):
self._config.set(section, 'dir', self.root + os.sep + self._config.get(section, 'dir'))
else:
self._config.set(section, 'dir', self.root + os.sep + section)
if not self._config.has_option(section, 'root'):
self._config.set(section, 'root', section)
# function to select config file section or create one
def section(self, section):
if not self._config.has_section(section):
self._config.add_section(section)
self._section = section
# function to get a config parameter; if the parameter is not set, the
# default value (or None) is substituted
def __getitem__(self, name):
if not self._config.has_option(self._section, name):
if self._section in self._default:
if name in self._default[self._section]:
self._config.set(self._section, name, self._default[self._section][name])
else:
self._config.set(self._section, name, None)
elif name in self._default['_other']:
self._config.set(self._section, name, self._default['_other'][name])
else:
self._config.set(self._section, name, None)
return self._config.get(self._section, name)
config = Config()
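# Usage sketch: config.section('general') selects a section and config['port']
# then returns the configured value, the default above, or None.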
const_desc_fields = set(['Content-Length', 'Last-Modified', 'Pragma'])
const_ignore_fields = set([
'Accept-Ranges', 'Age',
'Cache-Control', 'Connection', 'Content-Type',
'Date',
'Expires',
'Referer',
'Server',
'Via',
'X-Cache', 'X-Cache-Lookup', 'X-Powered-By'
])
block_size = 4096
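# files are fetched and served in block_size-byte chunks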
'''
# later, kqueue would be good but later
class Connection:
__slots__ = frozenset(('__address', '__input', '__socket', '__status', 'error', 'method', 'url', 'http_version'))
def __init__(self, socket, address):
self.__address = address
self.__input = b''
self.__socket = socket
self.__status = 0
def read(self, kev):
buffer = self.__socket.recv(kev.data)
exhausted = False
if len(buffer) == 0:
eof = True
else:
self.__input += buffer
while not exhausted:
if self.__status == -1:
exhausted = True
elif self.__status == 0:
endstring = self.__input.find(b'\n')
if endstring > 0:
print('Processing request line.')
line = self.__input[:endstring].decode('ascii')
self.__input = self.__input[endstring + 1:]
isRequest = re.compile('(GET) ([^ ]+) HTTP/(1\.0)').match(line)
if not isRequest:
self.error = 'Not a HTTP connection.'
self.__status = -1
else:
self.method = isRequest.group(1)
self.url = isRequest.group(2)
self.http_version = isRequest.group(3)
self.__status = 1
else:
exhausted = True
elif self.__status == 1:
endstring = self.__input.find(b'\n')
if endstring > 0:
print('Processing header line.' + repr(self.__input))
line = self.__input[:endstring].decode('ascii')
self.__input = self.__input[endstring + 1:]
isHeader = re.compile('([^:]*): +(.*)').match(line)
if not isHeader:
self.error = 'Bad header.'
return(False)
# process header here
elif endstring == 0:
self.__status = 2
else:
exhausted = True
def write(self, kev):
pass
if options.port:
import select, socket
sock = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
try:
sock.bind(('127.0.0.1', int(options.port)))
sock.listen(-1)
kq = select.kqueue()
assert kq.fileno() != -1, "Fatal error: can't initialise kqueue."
kq.control([select.kevent(sock, select.KQ_FILTER_READ, select.KQ_EV_ADD)], 0)
timeout = None
connections = {sock.fileno(): None}
while True:
kevs = kq.control(None, 1, timeout)
for kev in kevs:
if type(connections[kev.ident]) == Connection:
print(kev.ident, kev.data, kev.filter, kev.flags)
assert kev.data != 0, 'No data available.'
if kev.filter == select.KQ_FILTER_READ:
connections[kev.ident].read(kev)
elif kev.filter == select.KQ_FILTER_WRITE:
connections[kev.ident].write(kev)
else:
assert kev.filter in (select.KQ_FILTER_READ, select.KQ_FILTER_WRITE), 'Do we support other filters?'
else:
(conn, addr) = sock.accept()
print('Connection from ' + repr(addr))
kq.control([select.kevent(conn, select.KQ_FILTER_READ, select.KQ_EV_ADD)], 0)
connections[conn.fileno()] = Connection(conn, addr)
if kev.flags >> 15 == 1:
kq.control([select.kevent(kev.ident, select.KQ_FILTER_READ, select.KQ_EV_DELETE)], 0)
kq.control([select.kevent(kev.ident, select.KQ_FILTER_WRITE, select.KQ_EV_DELETE)], 0)
del(connections[kev.ident])
finally:
sock.close()
'''
# XXX how about rechecking files?
# 'if True' selects the caching proxy mode; the legacy log-crawler mode
# below the matching 'else' is kept but currently unreachable
if True:
import http.server
class MyRequestHandler(http.server.BaseHTTPRequestHandler):
def __process(self):
# reload means the file must be refetched to serve this request
reload = False
# recheck means the file's headers must be verified; if the file has been
# modified upstream we may still serve the older copy
recheck = False
# file_stat is set when the file (full or partial) exists on disk
file_stat = None
# requested_ranges holds data about any Range header in the request
requested_ranges = None
# record holds this file's index data locally; it is written back to the
# index upon successful completion
record = None
myPath = re.compile(r'^(.*?)(\?.*)$').match(self.path)
if myPath:
my_path = myPath.group(1)
else:
my_path = self.path
config.section(self.headers['Host'])
if config['sub'] is not None and config['strip'] is not None and len(config['strip']) > 0:
my_path = re.compile(config['strip']).sub(config['sub'], my_path)
info = 'Checking file: ' + my_path
if not os.access(config['dir'], os.X_OK):
os.mkdir(config['dir'])
# this is the file index - everything about a file is stored here
# _parts - a space map of the parts of the file we do not have yet
# _time - the last time the file was checked
# everything else is just the response headers
index = shelve.open(config['dir'] + os.sep + '.index')
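# An index record for '/some/file' might look like (illustrative values):
#   {'_time': datetime.datetime(...), '_parts': SpaceMap({4096: 10240}),
#    'Content-Length': 10240, 'Last-Modified': 'Mon, 01 Jan 2024 00:00:00 GMT'}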
desc_fields = const_desc_fields.copy()
ignore_fields = const_ignore_fields.copy()
# config values are strings: 'no' (the default) means the ETag is significant
if config['noetag'] == 'no':
desc_fields.add('ETag')
else:
ignore_fields.add('ETag')
proxy_ignored = set([
'Accept', 'Accept-Charset', 'Accept-Encoding', 'Accept-Language',
'Cache-Control', 'Connection', 'Content-Length', 'Cookie',
'Host',
'If-Modified-Since', 'If-Unmodified-Since',
'Referer',
'User-Agent',
'Via',
'X-Forwarded-For', 'X-REMOVED',
])
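# request headers we silently accept from the client; anything else is
# reported and the request is dropped (forwarding is decided separately below)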
print('===============[ {} request ]==='.format(self.command))
for header in self.headers:
if header in proxy_ignored:
pass
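# only a single, fully-specified range ('bytes=N-M') is recognized here;
# spacemap.SpaceMap maps start offsets to end offsets, so 'bytes=0-499'
# becomes SpaceMap({0: 500}), a half-open [0, 500) span (assumed semantics
# of the bundled spacemap module)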
elif header == 'Range':
isRange = re.compile(r'bytes=(\d+)-(\d+)').match(self.headers[header])
if isRange:
requested_ranges = spacemap.SpaceMap({int(isRange.group(1)): int(isRange.group(2)) + 1})
else:
return
elif header == 'Pragma':
if my_path in index:
# shelve does not persist in-place mutation: read, modify, write back
stored = index[my_path]
stored[header] = self.headers[header]
index[my_path] = stored
else:
print('Unknown header - ', header, ': ', self.headers[header], sep='')
return
print(header, self.headers[header])
# creating the file name from my_path (only '%20' is unescaped)
file_name = config['dir'] + os.sep + my_path.replace('%20', ' ')
# partial files and unfinished downloads are kept under .parts
temp_name = config['dir'] + os.sep + '.parts' + my_path.replace('%20', ' ')
# creating an empty placeholder in the index
# if there's no space map and no file in the real directory - we have no file
# if there's an empty space map - the file is complete
# the space map covers every part of the file we don't currently possess
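# e.g. a 10240-byte file with only bytes [0, 4096) on disk would carry
# _parts == SpaceMap({4096: 10240}); an empty SpaceMap() means nothing is missing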
if my_path not in index:
info += '\nThis one is new.'
reload = True
record = {}
else:
# work on a local copy: shelve does not persist in-place mutation
record = index[my_path]
# forcibly reloading the file if it is not present on disk
if os.access(file_name, os.R_OK):
info += '\nFull file found.'
file_stat = os.stat(file_name)
elif '_parts' in record and os.access(temp_name, os.R_OK):
info += '\nPartial file found.'
file_stat = os.stat(temp_name)
else:
info += '\nFile not found or inaccessible.'
record['_parts'] = None
reload = True
if '_parts' not in record:
record['_parts'] = None
if record['_parts'] is None:
recheck = True
# forcibly reloading the file if its size doesn't match the index data
if not reload:
if '_parts' in record and record['_parts'] == spacemap.SpaceMap():
if 'Content-Length' in record and file_stat and file_stat.st_size != int(record['Content-Length']):
info += '\nFile size is {} and stored file size is {}.'.format(file_stat.st_size, record['Content-Length'])
record['_parts'] = None
reload = True
# forcibly rechecking the file if the index holds a Pragma header
if not reload and 'Pragma' in record and record['Pragma'] == 'no-cache':
info += '\nPragma on: recheck imminent.'
recheck = True
# the file is rechecked unless it was already verified within the last 4 hours
if not recheck and not reload and ('_time' not in record or (datetime.datetime.now() - record['_time']) > datetime.timedelta(hours = 4)):
recheck = True
print(info)
if reload or recheck:
try:
request = 'http://' + config['root'] + self.path
my_headers = {}
for header in ('Cache-Control', 'Cookie', 'Referer', 'User-Agent'):
if header in self.headers:
my_headers[header] = self.headers[header]
needed = None
# 'noparts' is a string: 'no' (the default) means partial downloads are allowed
if '_parts' in record and record['_parts'] is not None:
if config['noparts'] != 'no' or requested_ranges is None:
needed = record['_parts']
else:
needed = record['_parts'] | requested_ranges
elif config['noparts'] == 'no':
needed = requested_ranges
ranges = []
print('Missing ranges: {}, requested ranges: {}, needed ranges: {}.'.format(record['_parts'], requested_ranges, needed))
if needed is not None and len(needed) > 0:
needed.rewind()
while True:
span = needed.pop()
if span[0] is None:
break
ranges.append('{}-{}'.format(span[0], span[1] - 1))
my_headers['Range'] = 'bytes=' + ','.join(ranges)
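# e.g. needed spans of {0: 4096, 8192: 10240} turn into the upstream
# request header 'Range: bytes=0-4095,8192-10239'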
request = urllib.request.Request(request, headers = my_headers)
with urllib.request.urlopen(request) as source:
new_record = {}
new_record['_parts'] = record['_parts']
headers = source.info()
# stripping unneeded headers (XXX make this inplace?)
for header in headers:
if header in desc_fields:
if header == 'Content-Length':
if 'Content-Range' not in headers:
new_record[header] = int(headers[header])
else:
new_record[header] = headers[header]
elif header == 'Content-Range':
contentRange = re.compile(r'^bytes (\d+)-(\d+)/(\d+)$').match(headers[header])
if contentRange:
new_record['Content-Length'] = int(contentRange.group(3))
else:
assert False, 'Content-Range unrecognized.'
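# e.g. an upstream reply of 'Content-Range: bytes 4096-10239/10240' sets
# Content-Length to the full size, 10240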
elif header not in ignore_fields:
print('Undefined header "', header, '": ', headers[header], sep='')
# comparing headers with the data found in the index:
# if any header has changed (except Pragma), or new headers have appeared,
# the file is reloaded in full; missing headers are only reported
old_keys = set(record.keys())
old_keys.discard('_time')
old_keys.discard('Pragma')
more_keys = set(new_record.keys()) - old_keys
more_keys.discard('Pragma')
less_keys = old_keys - set(new_record.keys())
if len(more_keys) > 0:
if len(old_keys) > 0:
print('More headers appear:', more_keys)
reload = True
elif len(less_keys) > 0:
print('Fewer headers appear:', less_keys)
else:
for key in record.keys():
if key[0] != '_' and key != 'Pragma' and record[key] != new_record[key]:
print('Header "', key, '" changed from [', record[key], '] to [', new_record[key], ']', sep='')
print(type(record[key]), type(new_record[key]))
reload = True
if reload:
print('Reloading.')
if os.access(temp_name, os.R_OK):
os.unlink(temp_name)
if os.access(file_name, os.R_OK):
os.unlink(file_name)
new_record['_parts'] = spacemap.SpaceMap({0: int(new_record['Content-Length'])})
print(new_record)
# downloading file or segment
if 'Content-Length' in new_record:
if needed is None:
needed = new_record['_parts']
else:
if len(needed) > 1:
print("Multipart requests currently not supported.")
assert False, 'Skip this one for now.'
else:
assert False, 'No Content-Length or Content-Range header.'
new_record['_time'] = datetime.datetime.now()
if self.command != 'HEAD':
# file is created at temporary location and moved in place only when download completes
if not os.access(temp_name, os.R_OK):
empty_name = config['dir'] + os.sep + '.tmp'
with open(empty_name, 'w+b') as some_file:
pass
os.renames(empty_name, temp_name)
temp_file = open(temp_name, 'r+b')
if requested_ranges is None and needed is None:
needed = new_record['_parts']
needed.rewind()
while True:
(start, end) = needed.pop()
if start is None:
break
stream_last = start
# snapshot the record before this chunk: the index written below must
# never claim data that has not yet reached the temporary file
old_record = dict(new_record)
req_block_size = min(block_size, end - start)
buffer = source.read(req_block_size)
length = len(buffer)
while length > 0 and stream_last < end:
stream_pos = stream_last + length
assert stream_pos <= end, 'Received more data than requested: pos:{} start:{} end:{}.'.format(stream_pos, start, end)
temp_file.seek(stream_last)
temp_file.write(buffer)
new_record['_parts'] = new_record['_parts'] - spacemap.SpaceMap({stream_last: stream_pos})
index[my_path] = old_record
index.sync()
old_record = dict(new_record)
stream_last = stream_pos
req_block_size = min(block_size, end - stream_last)
buffer = source.read(req_block_size)
length = len(buffer)
# moving downloaded data to real file
temp_file.close()
print(new_record)
index[my_path] = new_record
index.sync()
except urllib.error.HTTPError as error:
# in case of an error there is nothing to do: if the download stalls or
# fails, the file is simply never moved to its final location
print(error)
if not os.access(file_name, os.R_OK) and os.access(temp_name, os.R_OK) and '_parts' in index[my_path] and index[my_path]['_parts'] == spacemap.SpaceMap():
# just moving
# drop old dirs XXX
print('Moving temporary file to new destination.')
os.renames(temp_name, file_name)
if my_path not in index:
self.send_response(502)
self.end_headers()
return
if self.command == 'HEAD':
self.send_response(200)
if 'Content-Length' in index[my_path]:
self.send_header('Content-Length', index[my_path]['Content-Length'])
self.send_header('Accept-Ranges', 'bytes')
self.send_header('Content-Type', 'application/octet-stream')
if 'Last-Modified' in index[my_path]:
self.send_header('Last-Modified', index[my_path]['Last-Modified'])
self.end_headers()
else:
if ('_parts' in index[my_path] and index[my_path]['_parts'] != spacemap.SpaceMap()) or not os.access(file_name, os.R_OK):
file_name = temp_name
with open(file_name, 'rb') as real_file:
file_stat = os.stat(file_name)
if 'Range' in self.headers:
self.send_response(206)
ranges = []
requested_ranges.rewind()
while True:
pair = requested_ranges.pop()
if pair[0] is None:
break
ranges.append('{}-{}'.format(pair[0], pair[1] - 1))
self.send_header('Content-Range', 'bytes {}/{}'.format(','.join(ranges), index[my_path]['Content-Length']))
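# note: joining several spans into one Content-Range is not standard HTTP
# (that would need multipart/byteranges); only the first span is served below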
else:
self.send_response(200)
self.send_header('Content-Length', str(file_stat.st_size))
requested_ranges = spacemap.SpaceMap({0: file_stat.st_size})
if 'Last-Modified' in index[my_path]:
self.send_header('Last-Modified', index[my_path]['Last-Modified'])
self.send_header('Content-Type', 'application/octet-stream')
self.end_headers()
if self.command == 'GET':
if len(requested_ranges) > 0:
requested_ranges.rewind()
(start, end) = requested_ranges.pop()
else:
start = 0
end = index[my_path]['Content-Length']
real_file.seek(start)
req_block_size = min(block_size, end - start)
buffer = real_file.read(req_block_size)
length = len(buffer)
while length > 0:
self.wfile.write(buffer)
start += len(buffer)
if req_block_size > end - start:
req_block_size = end - start
if req_block_size == 0:
break
buffer = real_file.read(req_block_size)
length = len(buffer)
def do_HEAD(self):
return self.__process()
def do_GET(self):
return self.__process()
config.section('general')
server = http.server.HTTPServer(('127.0.0.1', int(config['port'])), MyRequestHandler)
server.serve_forever()
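# Usage sketch (assumed deployment): point the mirrored site's name at this
# proxy, e.g.
#   curl -H 'Host: mirror.example.org' http://127.0.0.1:8008/some/file
# the handler looks up the [mirror.example.org] config section, fetches
# http://<root><path> upstream and caches the result under that section's dir.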
else:
# legacy standalone mode: crawl an access log and refetch the files it
# mentions (expects options.log, options.dir, options.root, options.noupdate
# and options.verbose, plus an open index - not provided by the code above)
while True:
unchecked_files = set()
checked_files = 0
# reading log and storing found urls for processing
# check file mtime XXX
with open(options.log, 'r') as log_file:
log_line = re.compile(r'^[^ ]+ - - \[.*] "(GET|HEAD) (.*?)(\?.*)? HTTP/1.1" (\d+) \d+ "(.*)" "(.*)"$')
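# matches combined-format access log lines such as (illustrative):
# 1.2.3.4 - - [01/Jan/2024:00:00:00 +0000] "GET /some/file HTTP/1.1" 200 1234 "-" "curl/8"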
for line in log_file:
this_line = log_line.match(line.strip())
if this_line:
unchecked_files.add(this_line.group(2))
for url in unchecked_files:
reload = False
recheck = False
info = 'Checking file: ' + url
# creating empty placeholder in index
if url not in index:
info += '\nThis one is new.'
index[url] = {}
reload = True
# creating file name from url
file_name = options.dir + url.replace('%20', ' ')
# forcibly checking file if no file present
if not reload and not os.access(file_name, os.R_OK):
info += '\nFile not found or inaccessible.'
reload = True
# forcibly checking file if file size doesn't match with index data
elif not reload and 'Content-Length' in index[url] and os.stat(file_name).st_size != int(index[url]['Content-Length']):
info += '\nFile size is {} and stored file size is {}.'.format(os.stat(file_name).st_size, index[url]['Content-Length'])
reload = True
# forcibly rechecking the file if the index holds a Pragma header
if not reload and 'Pragma' in index[url] and index[url]['Pragma'] == 'no-cache':
info += '\nPragma on: recheck imminent.'
recheck = True
# skipping the file if there's no need to recheck and it was checked within the last 4 hours
if not recheck and not reload and (options.noupdate or ('_time' in index[url] and (datetime.datetime.now() - datetime.timedelta(hours = 4) - index[url]['_time']).days < 0)):
if options.verbose:
print(info)
continue
else:
print(info)
try:
with urllib.request.urlopen(options.root + url) as source:
new_headers = {}
headers = source.info()
# stripping unneeded headers (XXX make this inplace?)
for header in headers:
if header in desc_fields:
if header == 'Pragma' and headers[header] != 'no-cache':
print('Pragma:', headers[header])
new_headers[header] = headers[header]
elif header not in ignore_fields:
print('Undefined header "', header, '": ', headers[header], sep='')
# comparing headers with the data found in the index:
# if any header has changed (except Pragma), or new headers have appeared,
# the file is reloaded in full; missing headers are only reported
old_keys = set(index[url].keys())
old_keys.discard('_time')
old_keys.discard('Pragma')
more_keys = set(new_headers.keys()) - old_keys
more_keys.discard('Pragma')
less_keys = old_keys - set(new_headers.keys())
if len(more_keys) > 0:
if len(old_keys) > 0:
print('More headers appear:', more_keys)
reload = True
elif len(less_keys) > 0:
print('Fewer headers appear:', less_keys)
else:
for key in index[url].keys():
if key[0] != '_' and key != 'Pragma' and index[url][key] != new_headers[key]:
print('Header "', key, '" changed from [', index[url][key], '] to [', new_headers[key], ']', sep='')
reload = True
# downloading file
if reload:
if 'Content-Length' in headers:
print('Downloading', headers['Content-Length'], 'bytes [', end='')
else:
print('Downloading [', end='')
sys.stdout.flush()
# file is created at temporary location and moved in place only when download completes
temp_file = open(options.dir + os.sep + '.tmp', 'wb')
buffer = source.read(block_size)
megablocks = 0
blocks = 0
megs = 0
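# progress indicator: one dot per ~100 KiB downloaded, with a running
# megabyte count printed in place of every tenth dot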
while len(buffer) > 0:
temp_file.write(buffer)
buffer = source.read(block_size)
blocks += 1
if blocks > 102400 // block_size:
megablocks += 1
if megablocks > 10:
megablocks = megablocks - 10
megs += 1
print('{}Mb'.format(megs), end='')
else:
print('.', end='')
blocks = blocks - 102400 // block_size
sys.stdout.flush()
temp_file.close()
print(']')
os.renames(options.dir + os.sep + '.tmp', file_name)
checked_files += 1
# storing new time mark and storing new headers
new_headers['_time'] = datetime.datetime.now()
index[url] = new_headers
index.sync()
except urllib.error.HTTPError as error:
# in case of an error there is nothing to do: if the download stalls or
# fails, the file is simply never moved to its final location
print(error)
if options.verbose:
print('[', len(unchecked_files), '/', checked_files, ']')
# checking if there were any files downloaded, if yes - restarting sequence
if checked_files == 0:
break