08ae38b6ce 2010-06-25 1: #!/usr/bin/env python3.1
08ae38b6ce 2010-06-25 2:
08ae38b6ce 2010-06-25 3: import datetime, http.cookiejar, optparse, os, sys, shelve, re, urllib.request
08ae38b6ce 2010-06-25 4:
80f8e3804a 2010-08-20 5: from spacemap import SpaceMap
80f8e3804a 2010-08-20 6:
# Command-line interface.  The script runs in one of two modes:
#   --log  : read an access log and mirror every URL found in it;
#   --port : act as a local caching HTTP proxy on the given port.
parser = optparse.OptionParser()
parser.add_option('-v', '--verbose', action = 'store_true', dest = 'verbose', help = 'turns on verbose status notifications', metavar = 'bool', default = False)
parser.add_option('-d', '--dir', action = 'store', dest = 'dir', help = 'specify directory where the files should be stored', metavar = 'string', default = None)
parser.add_option('-r', '--root', action = 'store', dest = 'root', help = 'specify a site from which data should be mirrored', metavar = 'string', default = None)
parser.add_option('-l', '--log', action = 'store', dest = 'log', help = 'specify a log file to process', metavar = 'string', default = None)
parser.add_option('-e', '--skip-etag', action = 'store_true', dest = 'noetag', help = 'do not process etags', metavar = 'bool', default = False)
parser.add_option('-p', '--port', action = 'store', dest = 'port', help = 'listen on this port for incoming connections', metavar = 'integer', default = None)
# The default must be the boolean False: the string 'False' is truthy and
# would silently enable --no-update on every run.
parser.add_option('-n', '--no-update', action = 'store_true', dest = 'noupdate', help = 'do not update already downloaded files', metavar = 'bool', default = False)
(options, args) = parser.parse_args()

# Validate explicitly instead of with assert: asserts are stripped under -O,
# and parser.error() prints the usage text and exits with status 2.
if not options.dir:
    parser.error('Directory not specified')
if not options.root:
    parser.error('Server not specified')
if not (options.log or options.port):
    parser.error('Log file or port not specified')
if not options.port and not os.access(options.log, os.R_OK):
    parser.error('Log file unreadable')

# Drop a single trailing slash so that paths can be built by concatenation.
if options.dir.endswith('/'):
    options.dir = options.dir[:-1]

# this is file index - everything is stored in this file
# _parts - list of stored parts of file
# _time - last time the file was checked
# everything else is just the headers
index = shelve.open(options.dir + os.sep + '.index')

# Response headers that describe a file and are stored in the index.
desc_fields = ('Content-Length', 'Pragma', 'Last-Modified')
# Response headers that are known and deliberately not stored.
ignore_fields = ('Accept-Ranges', 'Age', 'Cache-Control', 'Connection', 'Content-Type', 'Date', 'Expires', 'Server', 'Via', 'X-Cache', 'X-Cache-Lookup', 'X-Powered-By')

# ETag is either tracked like the other descriptive headers or ignored.
if not options.noetag:
    desc_fields += 'ETag',
else:
    ignore_fields += 'ETag',

# Number of bytes read from the upstream server or from disk per iteration.
block_size = 4096

temp_file_name = options.dir + os.sep + '.tmp'
80f8e3804a 2010-08-20 42:
80f8e3804a 2010-08-20 43: '''
# Disabled draft of a kqueue-based server; kept for a possible later rewrite.
80f8e3804a 2010-08-20 45: class Connection:
80f8e3804a 2010-08-20 46: __slots__ = frozenset(('__address', '__input', '__socket', '__status', 'error', 'method', 'url', 'http_version'))
80f8e3804a 2010-08-20 47:
80f8e3804a 2010-08-20 48: def __init__(self, socket, address):
80f8e3804a 2010-08-20 49: self.__address = address
80f8e3804a 2010-08-20 50: self.__input = b''
80f8e3804a 2010-08-20 51: self.__socket = socket
80f8e3804a 2010-08-20 52: self.__status = 0
80f8e3804a 2010-08-20 53:
80f8e3804a 2010-08-20 54: def read(self, kev):
80f8e3804a 2010-08-20 55: buffer = self.__socket.recv(kev.data)
80f8e3804a 2010-08-20 56: exhausted = False
80f8e3804a 2010-08-20 57: if len(buffer) == 0:
80f8e3804a 2010-08-20 58: eof = True
80f8e3804a 2010-08-20 59: else:
80f8e3804a 2010-08-20 60: self.__input += buffer
80f8e3804a 2010-08-20 61: while not exhausted:
80f8e3804a 2010-08-20 62: if self.__status == -1:
80f8e3804a 2010-08-20 63: exhausted = True
80f8e3804a 2010-08-20 64: elif self.__status == 0:
80f8e3804a 2010-08-20 65: endstring = self.__input.find(b'\n')
80f8e3804a 2010-08-20 66: if endstring > 0:
80f8e3804a 2010-08-20 67: print('Processing request line.')
80f8e3804a 2010-08-20 68: line = self.__input[:endstring].decode('ascii')
80f8e3804a 2010-08-20 69: self.__input = self.__input[endstring + 1:]
80f8e3804a 2010-08-20 70: isRequest = re.compile('(GET) ([^ ]+) HTTP/(1\.0)').match(line)
80f8e3804a 2010-08-20 71: if not isRequest:
80f8e3804a 2010-08-20 72: self.error = 'Not a HTTP connection.'
80f8e3804a 2010-08-20 73: self.__status = -1
80f8e3804a 2010-08-20 74: else:
80f8e3804a 2010-08-20 75: self.method = isRequest.group(1)
80f8e3804a 2010-08-20 76: self.url = isRequest.group(2)
80f8e3804a 2010-08-20 77: self.http_version = isRequest.group(3)
80f8e3804a 2010-08-20 78: self.__status = 1
80f8e3804a 2010-08-20 79: else:
80f8e3804a 2010-08-20 80: exhausted = True
80f8e3804a 2010-08-20 81: elif self.__status == 1:
80f8e3804a 2010-08-20 82: endstring = self.__input.find(b'\n')
80f8e3804a 2010-08-20 83: if endstring > 0:
80f8e3804a 2010-08-20 84: print('Processing header line.' + repr(self.__input))
80f8e3804a 2010-08-20 85: line = self.__input[:endstring].decode('ascii')
80f8e3804a 2010-08-20 86: self.__input = self.__input[endstring + 1:]
80f8e3804a 2010-08-20 87: isHeader = re.compile('([^:]*): +(.*)').match(line)
80f8e3804a 2010-08-20 88: if not isHeader:
80f8e3804a 2010-08-20 89: self.error = 'Bad header.'
80f8e3804a 2010-08-20 90: return(False)
80f8e3804a 2010-08-20 91: # process header here
80f8e3804a 2010-08-20 92: elif endstring == 0:
80f8e3804a 2010-08-20 93: self.__status = 2
80f8e3804a 2010-08-20 94: else:
80f8e3804a 2010-08-20 95: exhausted = True
80f8e3804a 2010-08-20 96:
80f8e3804a 2010-08-20 97: def write(self, kev):
80f8e3804a 2010-08-20 98: pass
80f8e3804a 2010-08-20 99:
80f8e3804a 2010-08-20 100: if options.port:
80f8e3804a 2010-08-20 101: import select, socket
80f8e3804a 2010-08-20 102:
80f8e3804a 2010-08-20 103: sock = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
80f8e3804a 2010-08-20 104: try:
80f8e3804a 2010-08-20 105: sock.bind(('127.0.0.1', int(options.port)))
80f8e3804a 2010-08-20 106: sock.listen(-1)
80f8e3804a 2010-08-20 107:
80f8e3804a 2010-08-20 108: kq = select.kqueue()
80f8e3804a 2010-08-20 109: assert kq.fileno() != -1, "Fatal error: can't initialise kqueue."
80f8e3804a 2010-08-20 110:
80f8e3804a 2010-08-20 111: kq.control([select.kevent(sock, select.KQ_FILTER_READ, select.KQ_EV_ADD)], 0)
80f8e3804a 2010-08-20 112: timeout = None
80f8e3804a 2010-08-20 113:
80f8e3804a 2010-08-20 114: connections = {sock.fileno(): None}
80f8e3804a 2010-08-20 115:
80f8e3804a 2010-08-20 116: while True:
80f8e3804a 2010-08-20 117: kevs = kq.control(None, 1, timeout)
80f8e3804a 2010-08-20 118:
80f8e3804a 2010-08-20 119: for kev in kevs:
80f8e3804a 2010-08-20 120: if type(connections[kev.ident]) == Connection:
80f8e3804a 2010-08-20 121: print(kev.ident, kev.data, kev.filter, kev.flags)
80f8e3804a 2010-08-20 122: assert kev.data != 0, 'No data available.'
80f8e3804a 2010-08-20 123: if kev.filter == select.KQ_FILTER_READ:
80f8e3804a 2010-08-20 124: connections[kev.ident].read(kev)
80f8e3804a 2010-08-20 125: elif kev.filter == select.KQ_FILTER_WRITE:
80f8e3804a 2010-08-20 126: connections[kev.ident].write(kev)
80f8e3804a 2010-08-20 127: else:
80f8e3804a 2010-08-20 128: assert kev.filter in (select.KQ_FILTER_READ, select.KQ_FILTER_WRITE), 'Do we support other filters?'
80f8e3804a 2010-08-20 129: else:
80f8e3804a 2010-08-20 130: (conn, addr) = sock.accept()
80f8e3804a 2010-08-20 131: print('Connection from ' + repr(addr))
80f8e3804a 2010-08-20 132: kq.control([select.kevent(conn, select.KQ_FILTER_READ, select.KQ_EV_ADD)], 0)
80f8e3804a 2010-08-20 133: connections[conn.fileno()] = Connection(conn, addr)
80f8e3804a 2010-08-20 134:
80f8e3804a 2010-08-20 135: if kev.flags >> 15 == 1:
80f8e3804a 2010-08-20 136: kq.control([select.kevent(kev.ident, select.KQ_FILTER_READ, select.KQ_EV_DELETE)], 0)
80f8e3804a 2010-08-20 137: kq.control([select.kevent(kev.ident, select.KQ_FILTER_WRITE, select.KQ_EV_DELETE)], 0)
80f8e3804a 2010-08-20 138: del(connections[kev.ident])
80f8e3804a 2010-08-20 139: finally:
80f8e3804a 2010-08-20 140: sock.close()
80f8e3804a 2010-08-20 141: '''
80f8e3804a 2010-08-20 142:
if options.port:
    import http.server

    class MyRequestHandler(http.server.BaseHTTPRequestHandler):
        """Caching proxy handler.

        Serves files stored under options.dir, consulting options.root when a
        file is missing, looks damaged, or is due for a recheck.
        """

        def __process(self):
            """Handle a single GET or HEAD request for self.path.

            Sends a 501 response and returns early when the request carries a
            header this proxy does not understand.  Upstream HTTP errors are
            printed and the request is then served from whatever is stored.
            """
            # reload means file needs to be reloaded to serve request
            reload = False
            # recheck means file needs to be checked, this also means that if
            # file has been modified we can serve older copy
            recheck = False
            # file_stat means file definitely exists
            file_stat = None
            # requested_ranges holds data about any range requested
            requested_ranges = None
            # record holds data from index locally, should be written back
            # upon successful completion
            record = None
            info = 'Checking file: ' + self.path
            # a stored copy verified more recently than this is trusted as is
            max_age = datetime.timedelta(hours = 4)

            proxy_ignored = ('Accept', 'Accept-Encoding',
                'Cache-Control', 'Connection',
                'Host',
                'User-Agent',
                'Via',
                'X-Forwarded-For',
            )

            print('Command:', self.command)

            for header in self.headers:
                if header in proxy_ignored:
                    pass
                # compare with ==: "header in ('Range')" is a substring test
                # against the string 'Range', not a tuple membership test
                elif header == 'Range':
                    is_range = re.match(r'bytes=(\d+)-(\d+)', self.headers[header])
                    if is_range:
                        requested_ranges = SpaceMap({int(is_range.group(1)): int(is_range.group(2)) + 1})
                    else:
                        # answer explicitly instead of dropping the connection
                        self.send_error(501, 'Unsupported Range header')
                        return
                else:
                    print('Unknown header - ', header, ': ', self.headers[header], sep='')
                    self.send_error(501, 'Unsupported request header')
                    return
                # NOTE(review): the source dump has no indentation; this debug
                # print is assumed to run once per accepted header.
                print(header, self.headers[header])
            print(self.path)

            # creating empty placeholder in index
            # if there's no space map and there's no file in real directory - we have no file
            # if there's an empty space map - file is full
            # space map generally covers every bit of file we don't possess currently
            if self.path not in index:
                info += '\nThis one is new.'
                reload = True
                record = {'_parts': None}
            else:
                record = index[self.path]
                if '_parts' in record:
                    print(record['_parts'])
                    # NOTE(review): without writeback this assignment mutates a
                    # temporary copy returned by shelve and is never stored;
                    # kept as is because the intended effect on `record` is
                    # unclear.
                    if index[self.path]['_parts'] == {0: -1}:
                        index[self.path]['_parts'] = None

            # creating file name from self.path
            local_path = self.path.replace('%20', ' ')
            file_name = options.dir + os.sep + local_path
            # partial file or unfinished download
            temp_name = options.dir + os.sep + '.parts' + local_path

            # forcibly checking file if no file present
            if os.access(file_name, os.R_OK):
                file_stat = os.stat(file_name)
            elif '_parts' in record and os.access(temp_name, os.R_OK):
                file_stat = os.stat(temp_name)
            elif not reload:
                info += '\nFile not found or inaccessible.'
                reload = True

            # forcibly checking file if file size doesn't match with index data
            if not reload:
                if '_parts' in record and record['_parts'] == SpaceMap():
                    if 'Content-Length' in record and file_stat and file_stat.st_size != int(record['Content-Length']):
                        info += '\nFile size is {} and stored file size is {}.'.format(file_stat.st_size, record['Content-Length'])
                        reload = True

            # forcibly checking file if index holds Pragma header
            if not reload and 'Pragma' in record and record['Pragma'] == 'no-cache':
                info +='\nPragma on: recheck imminent.'
                recheck = True

            # rechecking the file when the last check is 4 hours old or older;
            # the previous test was inverted and rechecked only fresh files
            if not recheck and not reload and '_time' in record and datetime.datetime.now() - record['_time'] >= max_age:
                recheck = True

            print(info)
            if reload or recheck:

                try:
                    request = options.root + self.path
                    if requested_ranges is not None:
                        if '_parts' in record and record['_parts'] is not None:
                            needed = record['_parts'] & requested_ranges
                        else:
                            needed = requested_ranges
                        ranges = ()
                        print('Requesting ranges:', ranges)
                        print('Not stored ranges:', record['_parts'])
                        print('Requested ranges:', requested_ranges)
                        print('Needed ranges:', needed)
                        needed.rewind()
                        while True:
                            span = needed.pop()
                            if span[0] is None:
                                break
                            # SpaceMap ends are exclusive, HTTP ranges inclusive
                            ranges += '{}-{}'.format(span[0], span[1] - 1),
                        request = urllib.request.Request(request, headers = {'Range': 'bytes=' + ','.join(ranges)})

                    with urllib.request.urlopen(request) as source:
                        new_record = {}
                        new_record['_parts'] = record['_parts']
                        headers = source.info()

                        # stripping unneeded headers (XXX make this inplace?)
                        for header in headers:
                            if header in desc_fields:
                                print(header, headers[header])
                                if header == 'Content-Length':
                                    # for a partial response the full size
                                    # comes from Content-Range instead
                                    if 'Content-Range' not in headers:
                                        new_record[header] = headers[header]
                                else:
                                    new_record[header] = headers[header]
                            elif header == 'Content-Range':
                                content_range = re.match(r'^bytes (\d+)-(\d+)/(\d+)$', headers[header])
                                if content_range:
                                    new_record['Content-Length'] = content_range.group(3)
                                else:
                                    assert False, 'Content-Range unrecognized.'
                            elif header not in ignore_fields:
                                print('Undefined header "', header, '": ', headers[header], sep='')

                        if new_record['_parts'] is None:
                            new_record['_parts'] = SpaceMap({0: int(new_record['Content-Length'])})
                        print(new_record)

                        # comparing headers with data found in index
                        # if any header has changed (except Pragma) file is fully downloaded
                        # same if we get more or less headers
                        old_keys = set(record.keys())
                        old_keys.discard('_time')
                        old_keys.discard('Pragma')
                        more_keys = set(new_record.keys()) - old_keys
                        more_keys.discard('Pragma')
                        less_keys = old_keys - set(new_record.keys())
                        if len(more_keys) > 0:
                            if not len(old_keys) == 0:
                                print('More headers appear:', more_keys)
                            # NOTE(review): nesting reconstructed - a reload is
                            # assumed whenever new headers appear.
                            reload = True
                        elif len(less_keys) > 0:
                            print('Less headers appear:', less_keys)
                        else:
                            for key in record.keys():
                                if key[0] != '_' and key != 'Pragma' and not record[key] == new_record[key]:
                                    print('Header "', key, '" changed from [', record[key], '] to [', new_record[key], ']', sep='')
                                    reload = True

                        if reload:
                            print('Reloading.')
                            if os.access(temp_name, os.R_OK):
                                os.unlink(temp_name)
                            if os.access(file_name, os.R_OK):
                                os.unlink(file_name)

                        # downloading file or segment
                        if 'Content-Length' in new_record:
                            if requested_ranges is None:
                                requested_ranges = new_record['_parts']
                            else:
                                if len(requested_ranges) > 1:
                                    print("Multipart requests currently not supported.")
                                    assert False, 'Skip this one for now.'
                        else:
                            assert False, 'No Content-Length or Content-Range header.'

                        # NOTE(review): nesting reconstructed - the download is
                        # assumed to happen only when a reload was decided.
                        if reload:
                            new_record['_time'] = datetime.datetime.now()
                            if self.command != 'HEAD':
                                # file is created at temporary location and moved in place only when download completes
                                if not os.access(temp_name, os.R_OK):
                                    empty_name = options.dir + os.sep + '.tmp'
                                    with open(empty_name, 'w+b'):
                                        pass
                                    # renames() also creates missing directories
                                    os.renames(empty_name, temp_name)
                                # the with block closes the file even if the
                                # transfer fails half way
                                with open(temp_name, 'r+b') as temp_file:
                                    requested_ranges.rewind()
                                    while True:
                                        (start, end) = requested_ranges.pop()
                                        if start is None:
                                            break
                                        stream_last = start
                                        buffer = source.read(min(block_size, end - start))
                                        print(buffer)
                                        while len(buffer) > 0 and stream_last < end:
                                            stream_pos = stream_last + len(buffer)
                                            assert not stream_pos > end, 'Received more data then requested: pos:{} start:{} end:{}.'.format(stream_pos, start, end)
                                            print('Writing', len(buffer), 'bytes to temp file at position', stream_last)
                                            temp_file.seek(stream_last)
                                            temp_file.write(buffer)
                                            # the block just written is no longer missing
                                            new_record['_parts'] = new_record['_parts'] - SpaceMap({stream_last: stream_pos})
                                            print(new_record)
                                            index[self.path] = new_record
                                            index.sync()
                                            stream_last = stream_pos
                                            buffer = source.read(min(block_size, end - stream_last))
                                            print(buffer)
                                        print(new_record)
                                        index[self.path] = new_record
                                        index.sync()

                                # moving downloaded data to real file
                                if new_record['_parts'] == SpaceMap():
                                    if not isinstance(request, str):
                                        # just moving
                                        # drop old dirs XXX
                                        print('Moving temporary file to new destination.')
                                        os.renames(temp_name, file_name)

                except urllib.error.HTTPError as error:
                    # in case of error we don't need to do anything actually,
                    # if file download stalls or fails the file would not be moved to its location
                    print(error)

            stored = index[self.path]
            if self.command == 'HEAD':
                self.send_response(200)
                if 'Content-Length' in stored:
                    self.send_header('Content-Length', stored['Content-Length'])
                self.send_header('Accept-Ranges', 'bytes')
                self.send_header('Content-Type', 'application/octet-stream')
                if 'Last-Modified' in stored:
                    self.send_header('Last-Modified', stored['Last-Modified'])
                self.end_headers()
            else:
                # an incomplete file is served from its temporary location
                if stored['_parts'] != SpaceMap():
                    file_name = temp_name

                with open(file_name, 'rb') as real_file:
                    file_stat = os.stat(file_name)
                    # NOTE(review): range replies are still sent with status
                    # 200; HTTP expects 206 for partial content - confirm
                    # before changing.
                    self.send_response(200)
                    # guarded like the HEAD branch: upstream may omit it
                    if 'Last-Modified' in stored:
                        self.send_header('Last-Modified', stored['Last-Modified'])
                    if requested_ranges is not None:
                        ranges = ()
                        requested_ranges.rewind()
                        while True:
                            pair = requested_ranges.pop()
                            if pair[0] is None:
                                break
                            ranges += '{}-{}'.format(pair[0], str(pair[1] - 1)),
                        self.send_header('Content-Range', 'bytes ' + ','.join(ranges) + '/' + stored['Content-Length'])
                    else:
                        self.send_header('Content-Length', str(file_stat.st_size))
                        requested_ranges = SpaceMap({0: file_stat.st_size})
                    self.send_header('Content-Type', 'application/octet-stream')
                    self.end_headers()
                    if self.command == 'GET':
                        # only the first range is served
                        requested_ranges.rewind()
                        (start, end) = requested_ranges.pop()
                        print('Seeking file to position', start)
                        real_file.seek(start)
                        req_block_size = min(block_size, end - start)
                        print('block_size is', req_block_size)
                        buffer = real_file.read(req_block_size)
                        while len(buffer) > 0:
                            self.wfile.write(buffer)
                            start += len(buffer)
                            req_block_size = min(req_block_size, end - start)
                            if req_block_size == 0:
                                break
                            print('block_size is', req_block_size)
                            buffer = real_file.read(req_block_size)

        def do_HEAD(self):
            """Serve a HEAD request."""
            return self.__process()

        def do_GET(self):
            """Serve a GET request."""
            return self.__process()

    server = http.server.HTTPServer(('127.0.0.1', int(options.port)), MyRequestHandler)
    server.serve_forever()
80f8e3804a 2010-08-20 440:
80f8e3804a 2010-08-20 441: else:
80f8e3804a 2010-08-20 442: while True:
80f8e3804a 2010-08-20 443: unchecked_files = set()
80f8e3804a 2010-08-20 444: checked_files = 0
80f8e3804a 2010-08-20 445:
80f8e3804a 2010-08-20 446: # reading log and storing found urls for processing
80f8e3804a 2010-08-20 447: # check file mtime XXX
80f8e3804a 2010-08-20 448: with open(options.log, 'r') as log_file:
80f8e3804a 2010-08-20 449: log_line = re.compile('^[^ ]+ - - \[.*] "(GET|HEAD) (.*?)(\?.*)? HTTP/1.1" (\d+) \d+ "(.*)" "(.*)"$')
80f8e3804a 2010-08-20 450: for line in log_file:
80f8e3804a 2010-08-20 451: this_line = log_line.match(line.strip())
80f8e3804a 2010-08-20 452: if this_line:
80f8e3804a 2010-08-20 453: unchecked_files.add(this_line.group(2))
80f8e3804a 2010-08-20 454:
80f8e3804a 2010-08-20 455: for url in unchecked_files:
80f8e3804a 2010-08-20 456: reload = False
80f8e3804a 2010-08-20 457: recheck = False
80f8e3804a 2010-08-20 458: info = 'Checking file: ' + url
80f8e3804a 2010-08-20 459:
80f8e3804a 2010-08-20 460: # creating empty placeholder in index
80f8e3804a 2010-08-20 461: if not url in index:
80f8e3804a 2010-08-20 462: info += '\nThis one is new.'
80f8e3804a 2010-08-20 463: index[url] = {}
80f8e3804a 2010-08-20 464: reload = True
80f8e3804a 2010-08-20 465:
80f8e3804a 2010-08-20 466: # creating file name from url
80f8e3804a 2010-08-20 467: file_name = options.dir + re.compile('%20').sub(' ', url)
80f8e3804a 2010-08-20 468:
80f8e3804a 2010-08-20 469: # forcibly checking file if no file present
80f8e3804a 2010-08-20 470: if not reload and not os.access(file_name, os.R_OK):
80f8e3804a 2010-08-20 471: info += '\nFile not found or inaccessible.'
80f8e3804a 2010-08-20 472: reload = True
80f8e3804a 2010-08-20 473:
80f8e3804a 2010-08-20 474: # forcibly checking file if file size doesn't match with index data
80f8e3804a 2010-08-20 475: elif not reload and 'Content-Length' in index[url] and os.stat(file_name).st_size != int(index[url]['Content-Length']):
80f8e3804a 2010-08-20 476: info += '\nFile size is ' + os.stat(file_name).st_size + ' and stored file size is ' + index[url]['Content-Length'] + '.'
80f8e3804a 2010-08-20 477: reload = True
80f8e3804a 2010-08-20 478:
80f8e3804a 2010-08-20 479: # forcibly checking file if index hods Pragma header
80f8e3804a 2010-08-20 480: if not reload and 'Pragma' in index[url] and index[url]['Pragma'] == 'no-cache':
80f8e3804a 2010-08-20 481: info +='\nPragma on: recheck imminent.'
80f8e3804a 2010-08-20 482: recheck = True
80f8e3804a 2010-08-20 483:
80f8e3804a 2010-08-20 484: # skipping file processing if there's no need to recheck it and we have checked it at least 4 hours ago
80f8e3804a 2010-08-20 485: if not recheck and not reload and (options.noupdate or ('_time' in index[url] and (datetime.datetime.now() - datetime.timedelta(hours = 4) - index[url]['_time']).days < 0)):
80f8e3804a 2010-08-20 486: if options.verbose:
80f8e3804a 2010-08-20 487: print(info)
80f8e3804a 2010-08-20 488: continue
80f8e3804a 2010-08-20 489: else:
80f8e3804a 2010-08-20 490: print(info)
80f8e3804a 2010-08-20 491:
80f8e3804a 2010-08-20 492: try:
80f8e3804a 2010-08-20 493: with urllib.request.urlopen(options.root + url) as source:
80f8e3804a 2010-08-20 494: new_headers = {}
80f8e3804a 2010-08-20 495: headers = source.info()
80f8e3804a 2010-08-20 496:
80f8e3804a 2010-08-20 497: # stripping unneeded headers (XXX make this inplace?)
80f8e3804a 2010-08-20 498: for header in headers:
80f8e3804a 2010-08-20 499: if header in desc_fields:
80f8e3804a 2010-08-20 500: if header == 'Pragma' and headers[header] != 'no-cache':
80f8e3804a 2010-08-20 501: print('Pragma:', headers[header])
80f8e3804a 2010-08-20 502: new_headers[header] = headers[header]
80f8e3804a 2010-08-20 503: elif not header in ignore_fields:
80f8e3804a 2010-08-20 504: print('Undefined header "', header, '": ', headers[header], sep='')
80f8e3804a 2010-08-20 505:
80f8e3804a 2010-08-20 506: # comparing headers with data found in index
80f8e3804a 2010-08-20 507: # if any header has changed (except Pragma) file is fully downloaded
80f8e3804a 2010-08-20 508: # same if we get more or less headers
80f8e3804a 2010-08-20 509: old_keys = set(index[url].keys())
80f8e3804a 2010-08-20 510: old_keys.discard('_time')
80f8e3804a 2010-08-20 511: old_keys.discard('Pragma')
80f8e3804a 2010-08-20 512: more_keys = set(new_headers.keys()) - old_keys
80f8e3804a 2010-08-20 513: more_keys.discard('Pragma')
80f8e3804a 2010-08-20 514: less_keys = old_keys - set(new_headers.keys())
80f8e3804a 2010-08-20 515: if len(more_keys) > 0:
80f8e3804a 2010-08-20 516: if not len(old_keys) == 0:
80f8e3804a 2010-08-20 517: print('More headers appear:', more_keys)
80f8e3804a 2010-08-20 518: reload = True
80f8e3804a 2010-08-20 519: elif len(less_keys) > 0:
80f8e3804a 2010-08-20 520: print('Less headers appear:', less_keys)
80f8e3804a 2010-08-20 521: else:
80f8e3804a 2010-08-20 522: for key in index[url].keys():
80f8e3804a 2010-08-20 523: if key[0] != '_' and key != 'Pragma' and not index[url][key] == new_headers[key]:
80f8e3804a 2010-08-20 524: print('Header "', key, '" changed from [', index[url][key], '] to [', new_headers[key], ']', sep='')
80f8e3804a 2010-08-20 525: reload = True
80f8e3804a 2010-08-20 526:
80f8e3804a 2010-08-20 527: # downloading file
80f8e3804a 2010-08-20 528: if reload:
80f8e3804a 2010-08-20 529: if 'Content-Length' in headers:
80f8e3804a 2010-08-20 530: print('Downloading', headers['Content-Length'], 'bytes [', end='')
80f8e3804a 2010-08-20 531: else:
80f8e3804a 2010-08-20 532: print('Downloading [', end='')
80f8e3804a 2010-08-20 533: sys.stdout.flush()
80f8e3804a 2010-08-20 534:
80f8e3804a 2010-08-20 535: # file is created at temporary location and moved in place only when download completes
80f8e3804a 2010-08-20 536: temp_file = open(options.dir + os.sep + '.tmp', 'wb')
80f8e3804a 2010-08-20 537: buffer = source.read(block_size)
80f8e3804a 2010-08-20 538: megablocks = 0
80f8e3804a 2010-08-20 539: blocks = 0
80f8e3804a 2010-08-20 540: megs = 0
80f8e3804a 2010-08-20 541: while len(buffer) > 0:
80f8e3804a 2010-08-20 542: temp_file.write(buffer)
80f8e3804a 2010-08-20 543: buffer = source.read(block_size)
80f8e3804a 2010-08-20 544: blocks += 1
80f8e3804a 2010-08-20 545: if blocks > 102400/block_size:
80f8e3804a 2010-08-20 546: megablocks += 1
80f8e3804a 2010-08-20 547: if megablocks > 10:
80f8e3804a 2010-08-20 548: megablocks = megablocks - 10
80f8e3804a 2010-08-20 549: megs += 1
80f8e3804a 2010-08-20 550: print('{}Mb'.format(megs), end='')
80f8e3804a 2010-08-20 551: else:
80f8e3804a 2010-08-20 552: print('.', end='')
80f8e3804a 2010-08-20 553: blocks = blocks - 102400/block_size
80f8e3804a 2010-08-20 554: sys.stdout.flush()
80f8e3804a 2010-08-20 555: temp_file.close()
80f8e3804a 2010-08-20 556: print(']')
80f8e3804a 2010-08-20 557: os.renames(options.dir + os.sep + '.tmp', file_name)
80f8e3804a 2010-08-20 558:
80f8e3804a 2010-08-20 559: checked_files += 1
80f8e3804a 2010-08-20 560:
80f8e3804a 2010-08-20 561: # storing new time mark and storing new headers
80f8e3804a 2010-08-20 562: new_headers['_time'] = datetime.datetime.now()
80f8e3804a 2010-08-20 563: index[url] = new_headers
80f8e3804a 2010-08-20 564: index.sync()
80f8e3804a 2010-08-20 565:
80f8e3804a 2010-08-20 566: except urllib.error.HTTPError as error:
80f8e3804a 2010-08-20 567: # in case of error we don't need to do anything actually,
80f8e3804a 2010-08-20 568: # if file download stalls or fails the file would not be moved to it's location
80f8e3804a 2010-08-20 569: print(error)
80f8e3804a 2010-08-20 570:
80f8e3804a 2010-08-20 571: if options.verbose:
80f8e3804a 2010-08-20 572: print('[', len(unchecked_files), '/', checked_files, ']')
80f8e3804a 2010-08-20 573:
80f8e3804a 2010-08-20 574: # checking if there were any files downloaded, if yes - restarting sequence
80f8e3804a 2010-08-20 575: if checked_files == 0:
80f8e3804a 2010-08-20 576: break