08ae38b6ce 2010-06-25 1: #!/usr/bin/env python3.1
08ae38b6ce 2010-06-25 2:
e7b837a681 2010-08-25 3: import datetime, http.cookiejar, os, sys, shelve, spacemap, re, urllib.request
e7b837a681 2010-08-25 4:
class Config:
    """Runtime configuration: command-line options plus the INI config file.

    The config file has a 'general' section and one section per served
    host.  For every host section the constructor normalises 'dir' to a
    path under (or relative to) the directory holding the config file and
    defaults 'root' to the section name.

    Usage: call section(name) to select a section, then read options with
    config[name].  Options missing from the file fall back to _default
    (the entry named after the section, or '_other' for any section that
    has no entry of its own) and finally to None.

    Attributes:
        options: values parsed from the command line (options.config is
            the config file path).
        root: absolute directory containing the config file.
    """

    # '_default' is a class attribute, so it must not be listed as a slot:
    # a slot and a class variable with the same name make the class
    # statement itself fail with ValueError.
    __slots__ = frozenset(['_config', '_section', 'options', 'root'])
    _default = {
        'general': {
            'port': '8008',
        },
        '_other': {
            'verbose': 'no',
            'noetag': 'no',
            'noparts': 'no',
            'strip': '',
            'sub': '',
        },
    }

    def __init__(self):
        """Parse the command line and load the config file.

        Raises:
            AssertionError: the config file is not readable.
        """
        import configparser, optparse

        parser = optparse.OptionParser()
        parser.add_option('-c', '--config', dest = 'config', help = 'config file location', metavar = 'FILE', default = 'samesite.conf')
        (self.options, args) = parser.parse_args()

        # raised explicitly (same exception type as the former assert) so the
        # check is not stripped when Python runs with -O
        if not os.access(self.options.config, os.R_OK):
            raise AssertionError("Fatal error: can't read {}".format(self.options.config))

        # An absolute root keeps relative 'dir' values from being prefixed
        # with the root twice when the config path itself is relative.
        self.root = os.path.dirname(os.path.abspath(self.options.config))
        self._section = 'general'

        self._config = configparser.ConfigParser()
        with open(self.options.config) as config_file:
            # read_file() replaced readfp() in Python 3.2; use whichever exists
            reader = getattr(self._config, 'read_file', None) or self._config.readfp
            reader(config_file)

        for section in self._config.sections():
            if section == 'general':
                continue
            self._config.set(section, 'dir', self._resolve_dir(section))
            if not self._config.has_option(section, 'root'):
                self._config.set(section, 'root', section)

    def _resolve_dir(self, section):
        """Return the cache directory for a host section.

        A missing 'dir' or a bare '/' means "<root>/<section>"; a relative
        value is taken relative to root; an absolute value is kept.  A
        trailing slash is dropped.
        """
        default = os.path.join(self.root, section)
        if not self._config.has_option(section, 'dir'):
            return default
        path = self._config.get(section, 'dir')
        if path == '/':
            return default
        # join() keeps an absolute path as is; normpath() drops a trailing '/'
        return os.path.normpath(os.path.join(self.root, path))

    def section(self, section):
        """Select the section used by later lookups, creating it if absent."""
        if not self._config.has_section(section):
            self._config.add_section(section)
        self._section = section

    def __getitem__(self, name):
        """Return option `name` of the selected section.

        Falls back to the built-in default and then to None.  Defaults are
        returned directly instead of being written into the parser, because
        current configparser versions refuse to store None.
        """
        if self._config.has_option(self._section, name):
            return self._config.get(self._section, name)
        defaults = self._default.get(self._section, self._default['_other'])
        return defaults.get(name)
e7b837a681 2010-08-25 74:
# Module-wide configuration instance, built once at import time from the
# command line and the config file.  Request handlers select the section
# matching the request's Host header before reading options from it.
config = Config()

# Disabled legacy check; `options` is not defined at this point in the file.
#assert options.port or os.access(options.log, os.R_OK), 'Log file unreadable'
e7b837a681 2010-08-25 78:
# Upstream response headers that describe a cached file; they are remembered
# in the index and compared on later checks.
const_desc_fields = {'Content-Length', 'Last-Modified', 'Pragma'}

# Upstream response headers that are neither stored nor reported as unknown.
const_ignore_fields = {
    'Accept-Ranges',
    'Age',
    'Cache-Control',
    'Connection',
    'Content-Type',
    'Date',
    'Expires',
    'Referer',
    'Server',
    'Via',
    'X-Cache',
    'X-Cache-Lookup',
    'X-Powered-By',
}

# Number of bytes moved per read/write when copying file data.
block_size = 4096
80f8e3804a 2010-08-20 92:
# NOTE: an unfinished kqueue-based server prototype (a Connection class with
# a hand-written request parser plus a select.kqueue event loop) used to be
# kept here inside a bare string literal.  It was never executed, referred to
# an `options` object that is not defined in this module, and the literal was
# built and thrown away on every start.  It is available from the revision
# history; the server below is based on http.server instead.
80f8e3804a 2010-08-20 192:
e7b837a681 2010-08-25 193: # XXX how about rechecking files?
e7b837a681 2010-08-25 194: if True:
80f8e3804a 2010-08-20 195: import http.server
80f8e3804a 2010-08-20 196:
class MyRequestHandler(http.server.BaseHTTPRequestHandler):
    """Caching mirror handler: answers GET/HEAD from a per-host cache.

    Every request selects the config section named after its Host header.
    File metadata lives in a shelve index ('.index' inside the section's
    'dir'), keyed by request path.  Each index record holds:
      _parts - space map of the byte ranges not yet downloaded (an empty
               map means the file is complete; None means unknown)
      _time  - when the file was last checked against the upstream server
      anything else - upstream response headers remembered for comparison
    Complete files are stored under 'dir'; partial downloads are stored
    under 'dir'/.parts until they are complete.
    """

    def __process(self):
        """Handle one GET or HEAD request.

        Validates the request, opens the index of the selected host and
        guarantees that the index is closed again whatever happens.
        """
        host = self.headers['Host']
        if host is None:
            # without a Host header there is no config section to select
            self.send_error(400, 'Host header is required')
            return

        # the cache key is the request path without its query string
        my_path = self.path.split('?', 1)[0]

        config.section(host)

        strip, sub = config['strip'], config['sub']
        if sub is not None and strip:
            my_path = re.compile(strip).sub(sub, my_path)

        # the path is joined onto the cache directory below; refuse anything
        # that could climb out of it
        if '..' in my_path.split('/'):
            self.send_error(400, 'Bad request path')
            return

        cache_dir = config['dir']
        if cache_dir is None:
            # a host that has no section in the config file has no 'dir'
            self.send_error(404, 'Host is not configured')
            return

        if not os.access(cache_dir, os.X_OK):
            os.mkdir(cache_dir)

        index = shelve.open(cache_dir + os.sep + '.index')
        try:
            return self.__handle(index, cache_dir, my_path)
        finally:
            index.close()

    def __handle(self, index, cache_dir, my_path):
        """Decide whether upstream must be contacted, refresh, then reply."""
        # reload means file needs to be reloaded to serve request
        reload = False
        # recheck means file needs to be checked against upstream
        recheck = False
        # file_stat is set when a full or partial file exists on disk
        file_stat = None
        # requested_ranges holds data about any range requested
        requested_ranges = None
        # value of the client's Pragma header, if any
        request_pragma = None

        info = 'Checking file: ' + my_path

        proxy_ignored = set([
            'Accept', 'Accept-Charset', 'Accept-Encoding', 'Accept-Language',
            'Cache-Control', 'Connection', 'Content-Length', 'Cookie',
            'Host',
            'If-Modified-Since', 'If-Unmodified-Since',
            'Referer',
            'User-Agent',
            'Via',
            'X-Forwarded-For', 'X-REMOVED',
        ])

        print('===============[ {} request ]==='.format(self.command))

        for header in self.headers:
            value = self.headers[header]
            if header in proxy_ignored:
                pass
            elif header == 'Range':
                # only a single closed range "bytes=a-b" is understood
                is_range = re.match(r'bytes=(\d+)-(\d+)', value)
                if not is_range:
                    self.send_error(400, 'Unsupported Range header')
                    return
                requested_ranges = spacemap.SpaceMap({int(is_range.group(1)): int(is_range.group(2)) + 1})
            elif header == 'Pragma':
                request_pragma = value
            else:
                print('Unknown header - ', header, ': ', value, sep='')
                self.send_error(400, 'Unsupported request header')
                return
            print(header, value)

        local_path = my_path.replace('%20', ' ')
        # creating file name from my_path
        file_name = cache_dir + os.sep + local_path
        # partial file or unfinished download
        temp_name = cache_dir + os.sep + '.parts' + local_path

        if my_path not in index:
            info += '\nThis one is new.'
            reload = True
            record = {}
        else:
            record = index[my_path]
            if request_pragma is not None:
                # shelve hands out a copy of the stored record, so the
                # client's Pragma is applied to the working copy that the
                # checks below actually look at
                record['Pragma'] = request_pragma
            if os.access(file_name, os.R_OK):
                info += '\nFull file found.'
                file_stat = os.stat(file_name)
            elif '_parts' in record and os.access(temp_name, os.R_OK):
                info += '\nPartial file found.'
                file_stat = os.stat(temp_name)
                recheck = True
            else:
                info += '\nFile not found or inaccessible.'
                record['_parts'] = None
                reload = True

        if '_parts' not in record:
            record['_parts'] = None

        if record['_parts'] is None:
            recheck = True

        # forcibly reloading file if file size doesn't match with index data
        if not reload and record['_parts'] == spacemap.SpaceMap():
            if 'Content-Length' in record and file_stat and file_stat.st_size != int(record['Content-Length']):
                info += '\nFile size is {} and stored file size is {}.'.format(file_stat.st_size, record['Content-Length'])
                record['_parts'] = None
                reload = True

        # forcibly checking file if Pragma asks for it
        if not reload and 'Pragma' in record and record['Pragma'] == 'no-cache':
            info +='\nPragma on: recheck imminent.'
            recheck = True

        # rechecking file if the last check is more than 4 hours old
        if not recheck and not reload and '_time' in record:
            if datetime.datetime.now() - record['_time'] > datetime.timedelta(hours = 4):
                recheck = True

        print(info)
        if reload or recheck:
            self.__update(index, cache_dir, my_path, record, requested_ranges, reload, file_name, temp_name)

        # nothing known about the file and upstream gave nothing usable
        if my_path not in index:
            self.send_response(502)
            self.end_headers()
            return

        stored = index[my_path]
        print(stored)

        if not os.access(file_name, os.R_OK) and os.access(temp_name, os.R_OK) and '_parts' in stored and stored['_parts'] == spacemap.SpaceMap():
            # download is complete - just moving
            # drop old dirs XXX
            print('Moving temporary file to new destination.')
            os.renames(temp_name, file_name)

        self.__reply(stored, requested_ranges, file_name, temp_name)

    def __update(self, index, cache_dir, my_path, record, requested_ranges, reload, file_name, temp_name):
        """Query upstream, refresh the index record and download missing data.

        `record` is the working copy of the stored record and always has a
        '_parts' key here.  Upstream errors reported through
        urllib.error.URLError are printed and otherwise ignored; the caller
        then serves whatever the index and the disk already hold.

        Raises:
            AssertionError: the upstream answer cannot be handled (no length
                information, unparsable Content-Range, or more than one
                range would have to be fetched).
        """
        desc_fields = const_desc_fields.copy()
        ignore_fields = const_ignore_fields.copy()
        if config['noetag'] == 'no':
            desc_fields.add('ETag')
        else:
            ignore_fields.add('ETag')

        my_headers = {}
        for header in ('Cache-Control', 'Cookie', 'Referer', 'User-Agent'):
            if header in self.headers:
                my_headers[header] = self.headers[header]

        # choosing what to ask upstream for
        use_parts = config['noparts'] == 'no'
        has_ranges = requested_ranges is not None and requested_ranges != spacemap.SpaceMap()
        needed = None
        if record['_parts'] is not None:
            if use_parts and has_ranges:
                needed = record['_parts'] | requested_ranges
            else:
                needed = record['_parts']
        elif use_parts and has_ranges:
            needed = requested_ranges
        print('Missing ranges: {}, requested ranges: {}, needed ranges: {}.'.format(record['_parts'], requested_ranges, needed))
        if needed is not None and len(needed) > 0:
            spans = []
            needed.rewind()
            while True:
                span = needed.pop()
                if span[0] is None:
                    break
                # space map ends are exclusive, HTTP range ends are inclusive
                spans.append('{}-{}'.format(span[0], span[1] - 1))
            my_headers['Range'] = 'bytes=' + ','.join(spans)

        request = urllib.request.Request('http://' + config['root'] + self.path, headers = my_headers)

        try:
            with urllib.request.urlopen(request) as source:
                new_record = {'_parts': record['_parts']}
                headers = source.info()

                # keeping only the headers that describe the file
                for header in headers:
                    if header in desc_fields:
                        if header == 'Content-Length':
                            # for a partial answer the full size comes from
                            # Content-Range instead
                            if 'Content-Range' not in headers:
                                new_record[header] = int(headers[header])
                        else:
                            new_record[header] = headers[header]
                    elif header == 'Content-Range':
                        content_range = re.match(r'^bytes (\d+)-(\d+)/(\d+)$', headers[header])
                        if not content_range:
                            raise AssertionError('Content-Range unrecognized.')
                        new_record['Content-Length'] = int(content_range.group(3))
                    elif header not in ignore_fields:
                        print('Undefined header "', header, '": ', headers[header], sep='')

                # comparing headers with data found in index
                # if any header has changed (except Pragma) or new headers
                # appear the file is downloaded from scratch
                old_keys = set(record.keys())
                old_keys.discard('_time')
                old_keys.discard('Pragma')
                more_keys = set(new_record.keys()) - old_keys
                more_keys.discard('Pragma')
                less_keys = old_keys - set(new_record.keys())
                if len(more_keys) > 0:
                    if len(old_keys) != 0:
                        print('More headers appear:', more_keys)
                    reload = True
                elif len(less_keys) > 0:
                    print('Less headers appear:', less_keys)
                else:
                    for key in record.keys():
                        if key[0] != '_' and key != 'Pragma' and not record[key] == new_record[key]:
                            print('Header "', key, '" changed from [', record[key], '] to [', new_record[key], ']', sep='')
                            print(type(record[key]), type(new_record[key]))
                            reload = True

                if reload:
                    print('Reloading.')
                    for stale in (temp_name, file_name):
                        if os.access(stale, os.R_OK):
                            os.unlink(stale)
                    # everything is missing again
                    new_record['_parts'] = spacemap.SpaceMap({0: int(new_record['Content-Length'])})
                print(new_record)

                if 'Content-Length' not in new_record:
                    raise AssertionError('No Content-Length or Content-Range header.')
                if needed is None:
                    needed = new_record['_parts']
                elif len(needed) > 1:
                    print("Multipart requests currently not supported.")
                    raise AssertionError('Skip this one for now.')

                new_record['_time'] = datetime.datetime.now()
                if self.command != 'HEAD':
                    # file is created at temporary location and moved in place only when download completes
                    if not os.access(temp_name, os.R_OK):
                        empty_name = cache_dir + os.sep + '.tmp'
                        with open(empty_name, 'w+b'):
                            pass
                        # renames() also creates the missing directories
                        os.renames(empty_name, temp_name)
                    with open(temp_name, 'r+b') as temp_file:
                        needed.rewind()
                        while True:
                            (start, end) = needed.pop()
                            if start is None:
                                break
                            position = start
                            while position < end:
                                buffer = source.read(min(block_size, end - position))
                                if not buffer:
                                    break
                                temp_file.seek(position)
                                temp_file.write(buffer)
                                received = position + len(buffer)
                                # the index is updated after every block so an
                                # interrupted download can be resumed
                                new_record['_parts'] = new_record['_parts'] - spacemap.SpaceMap({position: received})
                                index[my_path] = new_record
                                index.sync()
                                position = received

                index[my_path] = new_record
                index.sync()

        except urllib.error.URLError as error:
            # in case of error we don't need to do anything actually,
            # if file download stalls or fails the file would not be moved to it's location
            print(error)

    def __reply(self, stored, requested_ranges, file_name, temp_name):
        """Send the response for the index record `stored`.

        HEAD gets headers only.  Any other command gets the headers and,
        for GET, the bytes of the requested range (or of the whole file)
        read from the complete file, or from the partial one when the
        download is unfinished.
        """
        if self.command == 'HEAD':
            self.send_response(200)
            if 'Content-Length' in stored:
                self.send_header('Content-Length', stored['Content-Length'])
            self.send_header('Accept-Ranges', 'bytes')
            self.send_header('Content-Type', 'application/octet-stream')
            if 'Last-Modified' in stored:
                self.send_header('Last-Modified', stored['Last-Modified'])
            self.end_headers()
            return

        if ('_parts' in stored and stored['_parts'] != spacemap.SpaceMap()) or not os.access(file_name, os.R_OK):
            file_name = temp_name

        with open(file_name, 'rb') as real_file:
            file_stat = os.stat(file_name)
            if requested_ranges is not None:
                # set only when the client sent a Range header
                self.send_response(206)
                spans = []
                requested_ranges.rewind()
                while True:
                    pair = requested_ranges.pop()
                    if pair[0] is None:
                        break
                    spans.append('{}-{}'.format(pair[0], str(pair[1] - 1)))
                self.send_header('Content-Range', 'bytes {}/{}'.format(','.join(spans), stored['Content-Length']))
            else:
                self.send_response(200)
                self.send_header('Content-Length', str(file_stat.st_size))
                requested_ranges = spacemap.SpaceMap({0: file_stat.st_size})
            if 'Last-Modified' in stored:
                self.send_header('Last-Modified', stored['Last-Modified'])
            self.send_header('Content-Type', 'application/octet-stream')
            self.end_headers()

            if self.command == 'GET':
                if len(requested_ranges) > 0:
                    requested_ranges.rewind()
                    (start, end) = requested_ranges.pop()
                else:
                    start = 0
                    end = stored['Content-Length']
                real_file.seek(start)
                while start < end:
                    buffer = real_file.read(min(block_size, end - start))
                    if not buffer:
                        break
                    self.wfile.write(buffer)
                    start += len(buffer)

    def do_HEAD(self):
        """Serve a HEAD request."""
        return self.__process()

    def do_GET(self):
        """Serve a GET request."""
        return self.__process()
80f8e3804a 2010-08-20 539:
# Start the caching server on the loopback interface; the port is the 'port'
# option of the 'general' config section.  serve_forever() does not return.
config.section('general')
listen_port = int(config['port'])
server = http.server.HTTPServer(('127.0.0.1', listen_port), MyRequestHandler)
server.serve_forever()
80f8e3804a 2010-08-20 543:
80f8e3804a 2010-08-20 544: else:
while True:
    # One pass: collect every URL seen in the access log, refresh each
    # mirrored file whose upstream copy may have changed, and repeat the
    # whole pass until one completes without downloading anything.
    unchecked_files = set()
    checked_files = 0

    # reading log and storing found urls for processing
    # check file mtime XXX
    with open(options.log, 'r') as log_file:
        # Raw string: the pattern itself is unchanged, but \[ \? \d are no
        # longer parsed as (invalid) string escapes. Group 2 is the request
        # path with any query string split off.
        log_line = re.compile(r'^[^ ]+ - - \[.*] "(GET|HEAD) (.*?)(\?.*)? HTTP/1.1" (\d+) \d+ "(.*)" "(.*)"$')
        for line in log_file:
            this_line = log_line.match(line.strip())
            if this_line:
                unchecked_files.add(this_line.group(2))

    for url in unchecked_files:
        reload = False
        recheck = False
        info = 'Checking file: ' + url

        # creating empty placeholder in index
        if url not in index:
            info += '\nThis one is new.'
            index[url] = {}
            reload = True

        # Nothing writes index[url] again until the end of this iteration,
        # so one lookup can be reused for every check below.
        stored = index[url]

        # creating file name from url
        file_name = options.dir + url.replace('%20', ' ')

        # forcibly checking file if no file present
        if not reload and not os.access(file_name, os.R_OK):
            info += '\nFile not found or inaccessible.'
            reload = True

        # forcibly checking file if file size doesn't match with index data
        elif not reload and 'Content-Length' in stored:
            file_size = os.stat(file_name).st_size
            if file_size != int(stored['Content-Length']):
                # format() is required here: st_size is an int and cannot be
                # concatenated to a str with '+'.
                info += '\nFile size is {} and stored file size is {}.'.format(file_size, stored['Content-Length'])
                reload = True

        # forcibly checking file if index holds Pragma header
        if not reload and stored.get('Pragma') == 'no-cache':
            info += '\nPragma on: recheck imminent.'
            recheck = True

        # skipping file processing if there's no need to recheck it and we have checked it less than 4 hours ago
        checked_recently = '_time' in stored and datetime.datetime.now() - stored['_time'] < datetime.timedelta(hours = 4)
        if not recheck and not reload and (options.noupdate or checked_recently):
            if options.verbose:
                print(info)
            continue
        print(info)

        try:
            with urllib.request.urlopen(options.root + url) as source:
                new_headers = {}
                headers = source.info()

                # stripping unneeded headers (XXX make this inplace?)
                for header in headers:
                    if header in desc_fields:
                        if header == 'Pragma' and headers[header] != 'no-cache':
                            print('Pragma:', headers[header])
                        new_headers[header] = headers[header]
                    elif header not in ignore_fields:
                        print('Undefined header "', header, '": ', headers[header], sep='')

                # comparing headers with data found in index
                # if any header has changed (except Pragma) file is fully downloaded
                # same if we get more headers
                # NOTE(review): when headers only disappear the code reports it
                # but does not force a download - confirm this is intended.
                old_keys = set(stored)
                old_keys.discard('_time')
                old_keys.discard('Pragma')
                more_keys = set(new_headers) - old_keys
                more_keys.discard('Pragma')
                less_keys = old_keys - set(new_headers)
                if more_keys:
                    # a brand new entry has no old headers, nothing to report
                    if old_keys:
                        print('More headers appear:', more_keys)
                    reload = True
                elif less_keys:
                    print('Less headers appear:', less_keys)
                else:
                    for key in stored:
                        if key[0] != '_' and key != 'Pragma' and stored[key] != new_headers[key]:
                            print('Header "', key, '" changed from [', stored[key], '] to [', new_headers[key], ']', sep='')
                            reload = True

                # downloading file
                if reload:
                    if 'Content-Length' in headers:
                        print('Downloading', headers['Content-Length'], 'bytes [', end='')
                    else:
                        print('Downloading [', end='')
                    sys.stdout.flush()

                    # file is created at temporary location and moved in place only when download completes
                    temp_name = options.dir + os.sep + '.tmp'
                    # number of blocks read between two progress marks
                    blocks_per_mark = 102400 / block_size
                    megablocks = 0
                    blocks = 0
                    megs = 0
                    # the context manager closes the temporary file even if
                    # reading from the remote side fails half way through
                    with open(temp_name, 'wb') as temp_file:
                        buffer = source.read(block_size)
                        while len(buffer) > 0:
                            temp_file.write(buffer)
                            buffer = source.read(block_size)
                            blocks += 1
                            if blocks > blocks_per_mark:
                                megablocks += 1
                                if megablocks > 10:
                                    megablocks = megablocks - 10
                                    megs += 1
                                    print('{}Mb'.format(megs), end='')
                                else:
                                    print('.', end='')
                                blocks = blocks - blocks_per_mark
                                sys.stdout.flush()
                    print(']')
                    os.renames(temp_name, file_name)

                    checked_files += 1

                # storing new time mark and storing new headers
                new_headers['_time'] = datetime.datetime.now()
                index[url] = new_headers
                index.sync()

        except urllib.error.HTTPError as error:
            # in case of error we don't need to do anything actually,
            # if file download stalls or fails the file would not be moved to its location
            print(error)

    if options.verbose:
        print('[', len(unchecked_files), '/', checked_files, ']')

    # checking if there were any files downloaded, if yes - restarting sequence
    if checked_files == 0:
        break