Lines of
samesite.py
from check-in c3db1a007e
that are changed by the sequence of edits moving toward
check-in 90160dbf50:
1: #!/usr/bin/env python3.1
2:
3: import datetime, http.cookiejar, os, sys, shelve, spacemap, re, urllib.request
4:
class Config:
    """Settings of the proxy, merged from the command line, a config file and built-in defaults.

    The config file has a ``[general]`` section plus one section per served host.  After
    construction every host section has an absolute ``dir`` (where its files are stored) and a
    ``root`` (defaulting to the section name).  Call :meth:`section` to pick the active section,
    then read options with ``config[name]``.

    Attributes:
        options: values parsed from the command line (``options.config`` is the config file path).
        root: directory that relative ``dir`` options are resolved against; the directory part of
            the config file path, or the current working directory when the path has none.
    """

    # '_default' must not be listed here: it is a class attribute, and a name that is both a slot
    # and a class variable makes the class statement itself fail with ValueError.
    __slots__ = frozenset(['_config', '_section', 'options', 'root'])

    # Fallback option values: 'general' applies to the [general] section only,
    # '_other' applies to every section that has no entry of its own in this table.
    _default = {
        'general': {
            'port': '8008',
        },
        '_other': {
            'verbose': 'no',
            'noetag': 'no',
            'noparts': 'no',
            'strip': '',
            'sub': '',
        },
    }

    def __init__(self):
        """Parse the command line, read the config file and normalise per-host options.

        Raises:
            AssertionError: if the config file is not readable.
        """
        import configparser, optparse

        parser = optparse.OptionParser()
        parser.add_option('-c', '--config', dest = 'config', help = 'config file location', metavar = 'FILE', default = 'samesite.conf')
        (self.options, _args) = parser.parse_args()

        # raised explicitly (same exception type as before) so that "python -O" cannot strip the check
        if not os.access(self.options.config, os.R_OK):
            raise AssertionError("Fatal error: can't read {}".format(self.options.config))

        # root is everything before the last '/' of the config path; a path without a
        # directory part (or one ending in '/') falls back to the current directory
        head, slash, tail = self.options.config.rpartition('/')
        if slash and tail:
            self.root = head
        else:
            self.root = os.getcwd()

        self._config = configparser.ConfigParser()
        with open(self.options.config) as config_file:
            # read_file() superseded readfp() in Python 3.2 and readfp() is gone in 3.12;
            # fall back to readfp() only on interpreters that lack read_file()
            read = getattr(self._config, 'read_file', None) or self._config.readfp
            read(config_file)

        for section in self._config.sections():
            if section != 'general':
                if self._config.has_option(section, 'dir'):
                    directory = self._config.get(section, 'dir')
                    # a bare '/' means "use the default location"
                    if directory == '/':
                        directory = self.root + os.sep + section
                    # drop one trailing slash
                    if directory.endswith('/'):
                        directory = directory[:-1]
                    # relative paths live under root
                    if not directory.startswith('/'):
                        directory = self.root + os.sep + directory
                else:
                    directory = self.root + os.sep + section
                self._config.set(section, 'dir', directory)

                if not self._config.has_option(section, 'root'):
                    self._config.set(section, 'root', section)

    def section(self, section):
        """Make ``section`` the active section, creating it if the config file lacks it."""
        if not self._config.has_section(section):
            self._config.add_section(section)
        self._section = section

    def __getitem__(self, name):
        """Return option ``name`` of the active section.

        When the option is not set, the built-in default is stored into the section and
        returned; when there is no default either, ``None`` is returned.  ``None`` is not
        written back because ConfigParser only accepts string values.
        """
        if not self._config.has_option(self._section, name):
            if self._section in self._default:
                defaults = self._default[self._section]
            else:
                defaults = self._default['_other']
            if name not in defaults:
                return None
            self._config.set(self._section, name, defaults[name])
        return self._config.get(self._section, name)
74:
# Module-wide configuration object; constructing it parses the command line and reads the config file.
config = Config()

#assert options.port or os.access(options.log, os.R_OK), 'Log file unreadable'
78:
# Upstream response headers whose values are kept in the index record of a file.
const_desc_fields = {'Content-Length', 'Last-Modified', 'Pragma'}

# Upstream response headers that are neither stored nor reported as "Undefined header".
const_ignore_fields = {
    'Accept-Ranges', 'Age',
    'Cache-Control', 'Connection', 'Content-Type',
    'Date',
    'Expires',
    'Referer',
    'Server',
    'Via',
    'X-Cache', 'X-Cache-Lookup', 'X-Powered-By',
}

# Upper bound, in bytes, of a single read when copying file data.
block_size = 4096
92:
c3db1a007e 2010-09-16 93: '''
c3db1a007e 2010-09-16 94: # disabled prototype: a kqueue-based event loop would be good here, but that is for later
c3db1a007e 2010-09-16 95: class Connection:
c3db1a007e 2010-09-16 96: __slots__ = frozenset(('__address', '__input', '__socket', '__status', 'error', 'method', 'url', 'http_version'))
c3db1a007e 2010-09-16 97:
c3db1a007e 2010-09-16 98: def __init__(self, socket, address):
c3db1a007e 2010-09-16 99: self.__address = address
c3db1a007e 2010-09-16 100: self.__input = b''
c3db1a007e 2010-09-16 101: self.__socket = socket
c3db1a007e 2010-09-16 102: self.__status = 0
c3db1a007e 2010-09-16 103:
c3db1a007e 2010-09-16 104: def read(self, kev):
c3db1a007e 2010-09-16 105: buffer = self.__socket.recv(kev.data)
c3db1a007e 2010-09-16 106: exhausted = False
c3db1a007e 2010-09-16 107: if len(buffer) == 0:
c3db1a007e 2010-09-16 108: eof = True
c3db1a007e 2010-09-16 109: else:
c3db1a007e 2010-09-16 110: self.__input += buffer
c3db1a007e 2010-09-16 111: while not exhausted:
c3db1a007e 2010-09-16 112: if self.__status == -1:
c3db1a007e 2010-09-16 113: exhausted = True
c3db1a007e 2010-09-16 114: elif self.__status == 0:
c3db1a007e 2010-09-16 115: endstring = self.__input.find(b'\n')
c3db1a007e 2010-09-16 116: if endstring > 0:
c3db1a007e 2010-09-16 117: print('Processing request line.')
c3db1a007e 2010-09-16 118: line = self.__input[:endstring].decode('ascii')
c3db1a007e 2010-09-16 119: self.__input = self.__input[endstring + 1:]
c3db1a007e 2010-09-16 120: isRequest = re.compile('(GET) ([^ ]+) HTTP/(1\.0)').match(line)
c3db1a007e 2010-09-16 121: if not isRequest:
c3db1a007e 2010-09-16 122: self.error = 'Not a HTTP connection.'
c3db1a007e 2010-09-16 123: self.__status = -1
c3db1a007e 2010-09-16 124: else:
c3db1a007e 2010-09-16 125: self.method = isRequest.group(1)
c3db1a007e 2010-09-16 126: self.url = isRequest.group(2)
c3db1a007e 2010-09-16 127: self.http_version = isRequest.group(3)
c3db1a007e 2010-09-16 128: self.__status = 1
c3db1a007e 2010-09-16 129: else:
c3db1a007e 2010-09-16 130: exhausted = True
c3db1a007e 2010-09-16 131: elif self.__status == 1:
c3db1a007e 2010-09-16 132: endstring = self.__input.find(b'\n')
c3db1a007e 2010-09-16 133: if endstring > 0:
c3db1a007e 2010-09-16 134: print('Processing header line.' + repr(self.__input))
c3db1a007e 2010-09-16 135: line = self.__input[:endstring].decode('ascii')
c3db1a007e 2010-09-16 136: self.__input = self.__input[endstring + 1:]
c3db1a007e 2010-09-16 137: isHeader = re.compile('([^:]*): +(.*)').match(line)
c3db1a007e 2010-09-16 138: if not isHeader:
c3db1a007e 2010-09-16 139: self.error = 'Bad header.'
c3db1a007e 2010-09-16 140: return(False)
c3db1a007e 2010-09-16 141: # process header here
c3db1a007e 2010-09-16 142: elif endstring == 0:
c3db1a007e 2010-09-16 143: self.__status = 2
c3db1a007e 2010-09-16 144: else:
c3db1a007e 2010-09-16 145: exhausted = True
c3db1a007e 2010-09-16 146:
c3db1a007e 2010-09-16 147: def write(self, kev):
c3db1a007e 2010-09-16 148: pass
c3db1a007e 2010-09-16 149:
c3db1a007e 2010-09-16 150: if options.port:
c3db1a007e 2010-09-16 151: import select, socket
c3db1a007e 2010-09-16 152:
c3db1a007e 2010-09-16 153: sock = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
c3db1a007e 2010-09-16 154: try:
c3db1a007e 2010-09-16 155: sock.bind(('127.0.0.1', int(options.port)))
c3db1a007e 2010-09-16 156: sock.listen(-1)
c3db1a007e 2010-09-16 157:
c3db1a007e 2010-09-16 158: kq = select.kqueue()
c3db1a007e 2010-09-16 159: assert kq.fileno() != -1, "Fatal error: can't initialise kqueue."
c3db1a007e 2010-09-16 160:
c3db1a007e 2010-09-16 161: kq.control([select.kevent(sock, select.KQ_FILTER_READ, select.KQ_EV_ADD)], 0)
c3db1a007e 2010-09-16 162: timeout = None
c3db1a007e 2010-09-16 163:
c3db1a007e 2010-09-16 164: connections = {sock.fileno(): None}
c3db1a007e 2010-09-16 165:
c3db1a007e 2010-09-16 166: while True:
c3db1a007e 2010-09-16 167: kevs = kq.control(None, 1, timeout)
c3db1a007e 2010-09-16 168:
c3db1a007e 2010-09-16 169: for kev in kevs:
c3db1a007e 2010-09-16 170: if type(connections[kev.ident]) == Connection:
c3db1a007e 2010-09-16 171: print(kev.ident, kev.data, kev.filter, kev.flags)
c3db1a007e 2010-09-16 172: assert kev.data != 0, 'No data available.'
c3db1a007e 2010-09-16 173: if kev.filter == select.KQ_FILTER_READ:
c3db1a007e 2010-09-16 174: connections[kev.ident].read(kev)
c3db1a007e 2010-09-16 175: elif kev.filter == select.KQ_FILTER_WRITE:
c3db1a007e 2010-09-16 176: connections[kev.ident].write(kev)
c3db1a007e 2010-09-16 177: else:
c3db1a007e 2010-09-16 178: assert kev.filter in (select.KQ_FILTER_READ, select.KQ_FILTER_WRITE), 'Do we support other filters?'
c3db1a007e 2010-09-16 179: else:
c3db1a007e 2010-09-16 180: (conn, addr) = sock.accept()
c3db1a007e 2010-09-16 181: print('Connection from ' + repr(addr))
c3db1a007e 2010-09-16 182: kq.control([select.kevent(conn, select.KQ_FILTER_READ, select.KQ_EV_ADD)], 0)
c3db1a007e 2010-09-16 183: connections[conn.fileno()] = Connection(conn, addr)
c3db1a007e 2010-09-16 184:
c3db1a007e 2010-09-16 185: if kev.flags >> 15 == 1:
c3db1a007e 2010-09-16 186: kq.control([select.kevent(kev.ident, select.KQ_FILTER_READ, select.KQ_EV_DELETE)], 0)
c3db1a007e 2010-09-16 187: kq.control([select.kevent(kev.ident, select.KQ_FILTER_WRITE, select.KQ_EV_DELETE)], 0)
c3db1a007e 2010-09-16 188: del(connections[kev.ident])
c3db1a007e 2010-09-16 189: finally:
c3db1a007e 2010-09-16 190: sock.close()
c3db1a007e 2010-09-16 191: '''
c3db1a007e 2010-09-16 192:
c3db1a007e 2010-09-16 193: # XXX how about rechecking files?
c3db1a007e 2010-09-16 194: if True:
c3db1a007e 2010-09-16 195: import http.server
c3db1a007e 2010-09-16 196:
class MyRequestHandler(http.server.BaseHTTPRequestHandler):
    """Answer GET and HEAD requests from a per-host file store, fetching from upstream when needed.

    Every host (config section) has a directory holding the stored files, a '.parts' subtree
    with unfinished downloads and a shelve database '.index' with one record per request path.
    """

    def __process(self):
        """Handle one GET or HEAD request.

        Selects the config section named by the Host header, derives the store path from the
        request path, opens the host's index and hands over to __serve().  The index is closed
        again whatever happens, so repeated requests do not pile up open database handles.
        """
        # the query string is not part of the stored file's name
        query_match = re.compile(r'^(.*?)(\?.*)$').match(self.path)
        if query_match:
            my_path = query_match.group(1)
        else:
            my_path = self.path

        config.section(self.headers['Host'])

        # optional per-host rewrite of the path: regex 'strip' replaced by 'sub'
        if config['sub'] is not None and config['strip'] is not None and len(config['strip']) > 0:
            my_path = re.compile(config['strip']).sub(config['sub'], my_path)

        if not os.access(config['dir'], os.X_OK):
            os.mkdir(config['dir'])
        # this is file index - everything is stored in this file
        # _parts - list of stored parts of file
        # _time - last time the file was checked
        # everything else is just the headers
        index = shelve.open(config['dir'] + os.sep + '.index')
        try:
            return self.__serve(index, my_path)
        finally:
            index.close()

    def __serve(self, index, my_path):
        """Decide whether my_path must be fetched or revalidated, do so, then send the response.

        Returns an empty tuple, without sending any response, when the request carries a
        Range header in an unsupported form or a header this proxy does not know.
        """
        # reload means file needs to be reloaded to serve request
        reload = False
        # recheck means file needs to be checked against upstream before it is served
        recheck = False
        # file_stat means file definitely exists
        file_stat = None
        # requested_ranges holds data about any range requested
        requested_ranges = None

        info = 'Checking file: ' + my_path

        desc_fields = const_desc_fields.copy()
        ignore_fields = const_ignore_fields.copy()
        if config['noetag'] == 'no':
            desc_fields.add('ETag')
        else:
            ignore_fields.add('ETag')

        proxy_ignored = {
            'Accept', 'Accept-Charset', 'Accept-Encoding', 'Accept-Language',
            'Cache-Control', 'Connection', 'Content-Length', 'Cookie',
            'Host',
            'If-Modified-Since', 'If-Unmodified-Since',
            'Referer',
            'User-Agent',
            'Via',
            'X-Forwarded-For', 'X-REMOVED',
        }

        print('===============[ {} request ]==='.format(self.command))

        for header in self.headers:
            if header in proxy_ignored:
                pass
            # compared with ==: "header in ('Range')" was a substring test against the string 'Range'
            elif header == 'Range':
                range_match = re.compile(r'bytes=(\d+)-(\d+)').match(self.headers[header])
                if range_match:
                    # HTTP ranges are inclusive, the space map wants an exclusive end
                    requested_ranges = spacemap.SpaceMap({int(range_match.group(1)): int(range_match.group(2)) + 1})
                else:
                    return()
            elif header == 'Pragma':
                if my_path in index:
                    # NOTE(review): the shelf is opened without writeback, so this assignment
                    # changes a temporary copy and the client's Pragma is never persisted.
                    # Left as is - confirm whether it was meant to force a recheck.
                    index[my_path][header] = self.headers[header]
            else:
                print('Unknown header - ', header, ': ', self.headers[header], sep='')
                return()
            print(header, self.headers[header])

        # NOTE(review): my_path comes straight from the request and is not checked for '..'
        # components, so it can name files outside config['dir'].
        # creating file name from my_path
        file_name = config['dir'] + os.sep + re.compile('%20').sub(' ', my_path)
        # partial file or unfinished download
        temp_name = config['dir'] + os.sep + '.parts' + re.compile('%20').sub(' ', my_path)

        # creating empty placeholder in index
        # if there's no space map and there's no file in real directory - we have no file
        # if there's an empty space map - file is full
        # space map generally covers every bit of file we don't possess currently
        if my_path not in index:
            info += '\nThis one is new.'
            reload = True
            record = {}
        else:
            # forcibly checking file if no file present
            record = index[my_path]
            if os.access(file_name, os.R_OK):
                info += '\nFull file found.'
                file_stat = os.stat(file_name)
            elif '_parts' in record and os.access(temp_name, os.R_OK):
                info += '\nPartial file found.'
                file_stat = os.stat(temp_name)
                recheck = True
            else:
                info += '\nFile not found or inaccessible.'
                record['_parts'] = None
                reload = True

        if '_parts' not in record:
            record['_parts'] = None

        if record['_parts'] is None:
            recheck = True

        # forcibly checking file if file size doesn't match with index data
        if not reload:
            if record['_parts'] == spacemap.SpaceMap():
                if 'Content-Length' in record and file_stat and file_stat.st_size != int(record['Content-Length']):
                    info += '\nFile size is {} and stored file size is {}.'.format(file_stat.st_size, record['Content-Length'])
                    record['_parts'] = None
                    reload = True

        # forcibly checking file if index holds Pragma header
        if not reload and 'Pragma' in record and record['Pragma'] == 'no-cache':
            info +='\nPragma on: recheck imminent.'
            recheck = True

        # a complete file is served without asking upstream unless its last check is at least
        # 4 hours old.  The comparison used to be inverted: it rechecked files that had been
        # checked within the last 4 hours and never rechecked older ones.
        if not recheck and not reload and '_time' in record and datetime.datetime.now() - record['_time'] >= datetime.timedelta(hours = 4):
            recheck = True

        print(info)
        if reload or recheck:
            try:
                self.__refresh(index, my_path, record, reload, requested_ranges, desc_fields, ignore_fields, file_name, temp_name)
            except urllib.error.HTTPError as error:
                # in case of error we don't need to do anything actually,
                # if file download stalls or fails the file would not be moved to its location
                print(error)

        self.__respond(index, my_path, requested_ranges, file_name, temp_name)

    def __refresh(self, index, my_path, record, reload, requested_ranges, desc_fields, ignore_fields, file_name, temp_name):
        """Ask upstream for my_path, update its index record and download the missing data.

        record is the current index record (with '_parts' always present), reload tells whether
        a full download is already known to be necessary.  urllib.error.HTTPError from the
        upstream request is passed on to the caller.
        """
        request = 'http://' + config['root'] + self.path
        my_headers = {}
        for header in ('Cache-Control', 'Cookie', 'Referer', 'User-Agent'):
            if header in self.headers:
                my_headers[header] = self.headers[header]

        # needed is the space map of byte ranges that will be requested from upstream
        needed = None
        if record['_parts'] is not None:
            if config['noparts'] != 'no' or requested_ranges is None or requested_ranges == spacemap.SpaceMap():
                needed = record['_parts']
            else:
                needed = record['_parts'] | requested_ranges
        elif config['noparts'] =='no' and requested_ranges is not None and requested_ranges != spacemap.SpaceMap():
            needed = requested_ranges
        print('Missing ranges: {}, requested ranges: {}, needed ranges: {}.'.format(record['_parts'], requested_ranges, needed))
        if needed is not None and len(needed) > 0:
            ranges = []
            needed.rewind()
            while True:
                span = needed.pop()
                if span[0] is None:
                    break
                # back to the inclusive end HTTP uses
                ranges.append('{}-{}'.format(span[0], span[1] - 1))
            my_headers['Range'] = 'bytes=' + ','.join(ranges)

        request = urllib.request.Request(request, headers = my_headers)

        with urllib.request.urlopen(request) as source:
            new_record = {}
            new_record['_parts'] = record['_parts']
            headers = source.info()

            # stripping unneeded headers (XXX make this inplace?)
            for header in headers:
                if header in desc_fields:
                    if header == 'Content-Length':
                        # for a partial answer the full size comes from Content-Range instead
                        if 'Content-Range' not in headers:
                            new_record[header] = int(headers[header])
                    else:
                        new_record[header] = headers[header]
                elif header == 'Content-Range':
                    content_range = re.compile(r'^bytes (\d+)-(\d+)/(\d+)$').match(headers[header])
                    if content_range:
                        new_record['Content-Length'] = int(content_range.group(3))
                    else:
                        raise AssertionError('Content-Range unrecognized.')
                elif header not in ignore_fields:
                    print('Undefined header "', header, '": ', headers[header], sep='')

            # comparing headers with data found in index
            # if any header has changed (except Pragma) file is fully downloaded
            # same if we get more headers
            old_keys = set(record.keys())
            old_keys.discard('_time')
            old_keys.discard('Pragma')
            more_keys = set(new_record.keys()) - old_keys
            more_keys.discard('Pragma')
            less_keys = old_keys - set(new_record.keys())
            if len(more_keys) > 0:
                if not len(old_keys) == 0:
                    print('More headers appear:', more_keys)
                reload = True
            elif len(less_keys) > 0:
                print('Less headers appear:', less_keys)
            else:
                for key in record.keys():
                    if key[0] != '_' and key != 'Pragma' and not record[key] == new_record[key]:
                        print('Header "', key, '" changed from [', record[key], '] to [', new_record[key], ']', sep='')
                        print(type(record[key]), type(new_record[key]))
                        reload = True

            if reload:
                print('Reloading.')
                if os.access(temp_name, os.R_OK):
                    os.unlink(temp_name)
                if os.access(file_name, os.R_OK):
                    os.unlink(file_name)
                # nothing is stored any more: the whole file is missing
                new_record['_parts'] = spacemap.SpaceMap({0: int(new_record['Content-Length'])})
            print(new_record)

            # downloading file or segment
            if 'Content-Length' in new_record:
                if needed is None:
                    needed = new_record['_parts']
                elif len(needed) > 1:
                    print("Multipart requests currently not supported.")
                    raise AssertionError('Skip this one for now.')
            else:
                raise AssertionError('No Content-Length or Content-Range header.')

            new_record['_time'] = datetime.datetime.now()
            if self.command != 'HEAD':
                # file is created at temporary location and moved in place only when download completes
                if not os.access(temp_name, os.R_OK):
                    empty_name = config['dir'] + os.sep + '.tmp'
                    with open(empty_name, 'w+b'):
                        pass
                    # renames() also creates the directories leading to temp_name
                    os.renames(empty_name, temp_name)
                # the with statement closes the file even when the download fails midway
                with open(temp_name, 'r+b') as temp_file:
                    if requested_ranges is None and needed is None:
                        needed = new_record['_parts']
                    needed.rewind()
                    while True:
                        (start, end) = needed.pop()
                        if start is None:
                            break
                        stream_last = start
                        req_block_size = min(block_size, end - start)
                        chunk = source.read(req_block_size)
                        while len(chunk) > 0 and stream_last < end:
                            stream_pos = stream_last + len(chunk)
                            if stream_pos > end:
                                raise AssertionError('Received more data then requested: pos:{} start:{} end:{}.'.format(stream_pos, start, end))
                            temp_file.seek(stream_last)
                            temp_file.write(chunk)
                            # the written span is no longer missing; persist progress after every block
                            new_record['_parts'] = new_record['_parts'] - spacemap.SpaceMap({stream_last: stream_pos})
                            index[my_path] = new_record
                            index.sync()
                            stream_last = stream_pos
                            if end - stream_last < block_size:
                                req_block_size = end - stream_last
                            chunk = source.read(req_block_size)

            index[my_path] = new_record
            index.sync()

    def __respond(self, index, my_path, requested_ranges, file_name, temp_name):
        """Send the response for my_path from the store, or 502 when the index has no record of it.

        The missing-record test comes first: looking the record up before it (as was done
        previously) raised KeyError whenever the very first fetch of a path had failed.
        """
        if my_path not in index:
            self.send_response(502)
            self.end_headers()
            return

        # one lookup instead of unpickling the record for every field
        stored = index[my_path]
        print(stored)

        if not os.access(file_name, os.R_OK) and os.access(temp_name, os.R_OK) and '_parts' in stored and stored['_parts'] == spacemap.SpaceMap():
            # just moving
            # drop old dirs XXX
            print('Moving temporary file to new destination.')
            os.renames(temp_name, file_name)

        if self.command == 'HEAD':
            self.send_response(200)
            if 'Content-Length' in stored:
                self.send_header('Content-Length', stored['Content-Length'])
            self.send_header('Accept-Ranges', 'bytes')
            self.send_header('Content-Type', 'application/octet-stream')
            if 'Last-Modified' in stored:
                self.send_header('Last-Modified', stored['Last-Modified'])
            self.end_headers()
        else:
            # an incomplete file is served from its temporary location
            if ('_parts' in stored and stored['_parts'] != spacemap.SpaceMap()) or not os.access(file_name, os.R_OK):
                file_name = temp_name

            with open(file_name, 'rb') as real_file:
                file_stat = os.stat(file_name)
                if 'Range' in self.headers:
                    self.send_response(206)
                    ranges = []
                    requested_ranges.rewind()
                    while True:
                        pair = requested_ranges.pop()
                        if pair[0] is None:
                            break
                        ranges.append('{}-{}'.format(pair[0], str(pair[1] - 1)))
                    self.send_header('Content-Range', 'bytes {}/{}'.format(','.join(ranges), stored['Content-Length']))
                else:
                    self.send_response(200)
                    self.send_header('Content-Length', str(file_stat.st_size))
                    requested_ranges = spacemap.SpaceMap({0: file_stat.st_size})
                if 'Last-Modified' in stored:
                    self.send_header('Last-Modified', stored['Last-Modified'])
                self.send_header('Content-Type', 'application/octet-stream')
                self.end_headers()
                if self.command == 'GET':
                    if len(requested_ranges) > 0:
                        requested_ranges.rewind()
                        (start, end) = requested_ranges.pop()
                    else:
                        start = 0
                        end = stored['Content-Length']
                    real_file.seek(start)
                    req_block_size = min(block_size, end - start)
                    chunk = real_file.read(req_block_size)
                    while len(chunk) > 0:
                        self.wfile.write(chunk)
                        start += len(chunk)
                        if req_block_size > end - start:
                            req_block_size = end - start
                        if req_block_size == 0:
                            break
                        chunk = real_file.read(req_block_size)

    def do_HEAD(self):
        """Handle a HEAD request."""
        return self.__process()

    def do_GET(self):
        """Handle a GET request."""
        return self.__process()
c3db1a007e 2010-09-16 539:
# make [general] the active section so that config['port'] below reads the listening port
config.section('general')
# listen on the loopback address only; MyRequestHandler answers every request
server = http.server.HTTPServer(('127.0.0.1', int(config['port'])), MyRequestHandler)
# blocks here, handling requests
server.serve_forever()
c3db1a007e 2010-09-16 543:
c3db1a007e 2010-09-16 544: else:
c3db1a007e 2010-09-16 545: while True:
c3db1a007e 2010-09-16 546: unchecked_files = set()
c3db1a007e 2010-09-16 547: checked_files = 0
c3db1a007e 2010-09-16 548:
c3db1a007e 2010-09-16 549: # reading log and storing found urls for processing
c3db1a007e 2010-09-16 550: # check file mtime XXX
c3db1a007e 2010-09-16 551: with open(options.log, 'r') as log_file:
c3db1a007e 2010-09-16 552: log_line = re.compile('^[^ ]+ - - \[.*] "(GET|HEAD) (.*?)(\?.*)? HTTP/1.1" (\d+) \d+ "(.*)" "(.*)"$')
c3db1a007e 2010-09-16 553: for line in log_file:
c3db1a007e 2010-09-16 554: this_line = log_line.match(line.strip())
c3db1a007e 2010-09-16 555: if this_line:
c3db1a007e 2010-09-16 556: unchecked_files.add(this_line.group(2))
c3db1a007e 2010-09-16 557:
c3db1a007e 2010-09-16 558: for url in unchecked_files:
c3db1a007e 2010-09-16 559: reload = False
c3db1a007e 2010-09-16 560: recheck = False
c3db1a007e 2010-09-16 561: info = 'Checking file: ' + url
c3db1a007e 2010-09-16 562:
c3db1a007e 2010-09-16 563: # creating empty placeholder in index
c3db1a007e 2010-09-16 564: if not url in index:
c3db1a007e 2010-09-16 565: info += '\nThis one is new.'
c3db1a007e 2010-09-16 566: index[url] = {}
c3db1a007e 2010-09-16 567: reload = True
c3db1a007e 2010-09-16 568:
c3db1a007e 2010-09-16 569: # creating file name from url
c3db1a007e 2010-09-16 570: file_name = options.dir + re.compile('%20').sub(' ', url)
c3db1a007e 2010-09-16 571:
c3db1a007e 2010-09-16 572: # forcibly checking file if no file present
c3db1a007e 2010-09-16 573: if not reload and not os.access(file_name, os.R_OK):
c3db1a007e 2010-09-16 574: info += '\nFile not found or inaccessible.'
c3db1a007e 2010-09-16 575: reload = True
c3db1a007e 2010-09-16 576:
c3db1a007e 2010-09-16 577: # forcibly checking file if file size doesn't match with index data
c3db1a007e 2010-09-16 578: elif not reload and 'Content-Length' in index[url] and os.stat(file_name).st_size != int(index[url]['Content-Length']):
c3db1a007e 2010-09-16 579: info += '\nFile size is ' + os.stat(file_name).st_size + ' and stored file size is ' + index[url]['Content-Length'] + '.'
c3db1a007e 2010-09-16 580: reload = True
c3db1a007e 2010-09-16 581:
c3db1a007e 2010-09-16 582: # forcibly checking file if index hods Pragma header
c3db1a007e 2010-09-16 583: if not reload and 'Pragma' in index[url] and index[url]['Pragma'] == 'no-cache':
c3db1a007e 2010-09-16 584: info +='\nPragma on: recheck imminent.'
c3db1a007e 2010-09-16 585: recheck = True
c3db1a007e 2010-09-16 586:
c3db1a007e 2010-09-16 587: # skipping file processing if there's no need to recheck it and we have checked it at least 4 hours ago
c3db1a007e 2010-09-16 588: if not recheck and not reload and (options.noupdate or ('_time' in index[url] and (datetime.datetime.now() - datetime.timedelta(hours = 4) - index[url]['_time']).days < 0)):
c3db1a007e 2010-09-16 589: if options.verbose:
c3db1a007e 2010-09-16 590: print(info)
c3db1a007e 2010-09-16 591: continue
c3db1a007e 2010-09-16 592: else:
c3db1a007e 2010-09-16 593: print(info)
c3db1a007e 2010-09-16 594:
c3db1a007e 2010-09-16 595: try:
c3db1a007e 2010-09-16 596: with urllib.request.urlopen(options.root + url) as source:
c3db1a007e 2010-09-16 597: new_headers = {}
598: headers = source.info()
599:
600: # stripping unneeded headers (XXX make this inplace?)
601: for header in headers:
602: if header in desc_fields:
c3db1a007e 2010-09-16 603: if header == 'Pragma' and headers[header] != 'no-cache':
c3db1a007e 2010-09-16 604: print('Pragma:', headers[header])
c3db1a007e 2010-09-16 605: new_headers[header] = headers[header]
606: elif not header in ignore_fields:
607: print('Undefined header "', header, '": ', headers[header], sep='')
608:
609: # comparing headers with data found in index
610: # if any header has changed (except Pragma) file is fully downloaded
611: # same if we get more headers; fewer headers are only reported
c3db1a007e 2010-09-16 612: old_keys = set(index[url].keys())
613: old_keys.discard('_time')
614: old_keys.discard('Pragma')
c3db1a007e 2010-09-16 615: more_keys = set(new_headers.keys()) - old_keys
616: more_keys.discard('Pragma')
c3db1a007e 2010-09-16 617: less_keys = old_keys - set(new_headers.keys())
618: if len(more_keys) > 0:
619: if not len(old_keys) == 0:
620: print('More headers appear:', more_keys)
621: reload = True
622: elif len(less_keys) > 0:
623: print('Less headers appear:', less_keys)
624: else:
c3db1a007e 2010-09-16 625: for key in index[url].keys():
c3db1a007e 2010-09-16 626: if key[0] != '_' and key != 'Pragma' and not index[url][key] == new_headers[key]:
c3db1a007e 2010-09-16 627: print('Header "', key, '" changed from [', index[url][key], '] to [', new_headers[key], ']', sep='')
628: reload = True
629:
c3db1a007e 2010-09-16 630: # downloading file
c3db1a007e 2010-09-16 631: if reload:
c3db1a007e 2010-09-16 632: if 'Content-Length' in headers:
c3db1a007e 2010-09-16 633: print('Downloading', headers['Content-Length'], 'bytes [', end='')
c3db1a007e 2010-09-16 634: else:
c3db1a007e 2010-09-16 635: print('Downloading [', end='')
c3db1a007e 2010-09-16 636: sys.stdout.flush()
c3db1a007e 2010-09-16 637:
638: # file is created at temporary location and moved in place only when download completes
c3db1a007e 2010-09-16 639: temp_file = open(options.dir + os.sep + '.tmp', 'wb')
c3db1a007e 2010-09-16 640: buffer = source.read(block_size)
c3db1a007e 2010-09-16 641: megablocks = 0
c3db1a007e 2010-09-16 642: blocks = 0
c3db1a007e 2010-09-16 643: megs = 0
c3db1a007e 2010-09-16 644: while len(buffer) > 0:
c3db1a007e 2010-09-16 645: temp_file.write(buffer)
c3db1a007e 2010-09-16 646: buffer = source.read(block_size)
c3db1a007e 2010-09-16 647: blocks += 1
c3db1a007e 2010-09-16 648: if blocks > 102400/block_size:
c3db1a007e 2010-09-16 649: megablocks += 1
c3db1a007e 2010-09-16 650: if megablocks > 10:
c3db1a007e 2010-09-16 651: megablocks = megablocks - 10
c3db1a007e 2010-09-16 652: megs += 1
c3db1a007e 2010-09-16 653: print('{}Mb'.format(megs), end='')
c3db1a007e 2010-09-16 654: else:
c3db1a007e 2010-09-16 655: print('.', end='')
c3db1a007e 2010-09-16 656: blocks = blocks - 102400/block_size
c3db1a007e 2010-09-16 657: sys.stdout.flush()
c3db1a007e 2010-09-16 658: temp_file.close()
c3db1a007e 2010-09-16 659: print(']')
c3db1a007e 2010-09-16 660: os.renames(options.dir + os.sep + '.tmp', file_name)
c3db1a007e 2010-09-16 661:
c3db1a007e 2010-09-16 662: checked_files += 1
c3db1a007e 2010-09-16 663:
c3db1a007e 2010-09-16 664: # storing new time mark and storing new headers
c3db1a007e 2010-09-16 665: new_headers['_time'] = datetime.datetime.now()
c3db1a007e 2010-09-16 666: index[url] = new_headers
667: index.sync()
668:
669: except urllib.error.HTTPError as error:
670: # in case of error we don't need to do anything actually,
671: # if file download stalls or fails the file would not be moved to its location
672: print(error)
673:
c3db1a007e 2010-09-16 674: if options.verbose:
c3db1a007e 2010-09-16 675: print('[', len(unchecked_files), '/', checked_files, ']')
c3db1a007e 2010-09-16 676:
c3db1a007e 2010-09-16 677: # checking if there were any files downloaded, if yes - restarting sequence
c3db1a007e 2010-09-16 678: if checked_files == 0:
c3db1a007e 2010-09-16 679: break