Squid url redirector

Annotation For squid-tagger.py
anonymous

Annotation For squid-tagger.py

Origin for each line in squid-tagger.py from check-in ae30851739:

d500448801 2009-10-05    1: #!/usr/bin/env python3.1
d500448801 2009-10-05    2: 
ae30851739 2010-08-12    3: import postgresql.api, re, sys
d500448801 2009-10-05    4: 
b93dc49210 2009-10-13    5: # wrapper around syslog, can be muted
class Logger:
	"""Thin syslog front end that becomes a no-op when logging is silenced.

	The constructor selects the ``log`` section of the module-level ``config``
	object and reads its ``silent`` option.  The value ``'yes'`` mutes the
	logger; any other value opens syslog under the ident ``squidTag``.
	"""

	__slots__ = frozenset(['_syslog'])

	def __init__(self):
		config.section('log')
		muted = config['silent'] == 'yes'
		if muted:
			# a muted logger never touches syslog, so the module is not imported
			self._syslog = None
			return
		import syslog
		self._syslog = syslog
		self._syslog.openlog('squidTag')

	def info(self, message):
		"""Send ``message`` to syslog at LOG_INFO priority; do nothing if muted."""
		backend = self._syslog
		if not backend:
			return
		backend.syslog(backend.LOG_INFO, message)

	def notice(self, message):
		"""Send ``message`` to syslog at LOG_NOTICE priority; do nothing if muted."""
		backend = self._syslog
		if not backend:
			return
		backend.syslog(backend.LOG_NOTICE, message)
d500448801 2009-10-05   25: 
b93dc49210 2009-10-13   26: # wrapper around database
class tagDB:
	"""Database access object that prepares its two queries on first use.

	The constructor selects the ``database`` section of the module-level
	``config`` object and opens a connection from its ``user``, ``password``,
	``host`` and ``database`` options.
	"""

	__slots__ = frozenset(('_check_stmt', '_db', '_dump_stmt'))

	def __init__(self):
		config.section('database')
		# NOTE(review): the credentials are interpolated verbatim; characters
		# such as '@' or '/' in them would presumably break the IRI - confirm
		# whether they need escaping for the postgresql driver.
		self._db = postgresql.open(
			'pq://{}:{}@{}/{}'.format(
				config['user'],
				config['password'],
				config['host'],
				config['database'],
		) )
		# statements are prepared lazily by check() and dump()
		self._check_stmt = None
		self._dump_stmt = None

	def check(self, site, ip_address):
		"""Return the rule rows selected for ``site`` and ``ip_address``.

		Each row carries ``redirect_url`` and ``regexp`` (in that order); rows
		come back ordered by ``array_length(site, 1)`` descending.  The
		statement is prepared on the first call and reused afterwards.
		"""
		# identity test: never dispatch to the statement object's own __eq__
		if self._check_stmt is None:
			self._check_stmt = self._db.prepare("select redirect_url, regexp from site_rule where site <@ tripdomain($1) and netmask >> $2::text::inet order by array_length(site, 1) desc")
		return self._check_stmt(site, ip_address)

	def dump(self):
		"""Return every ``(untrip(site), tag, regexp)`` row ordered by site, tag.

		The statement is prepared on the first call and reused afterwards.
		"""
		if self._dump_stmt is None:
			self._dump_stmt = self._db.prepare("select untrip(site), tag, regexp from urls natural join site natural join tag order by site, tag")
		return self._dump_stmt()
b93dc49210 2009-10-13   51: 
b93dc49210 2009-10-13   52: # abstract class with basic checking functionality
class Checker:
	"""Synchronous request checker; base class for the other reactors.

	Reads request lines from standard input, looks each one up in the
	database and writes one reply line per request to standard output.
	"""

	__slots__ = frozenset(['_db', '_log'])

	# Request line layout.  Only groups 1 (request id), 3 (site), 4 (url path)
	# and 5 (client ip address) are used.  Compiled once for the whole process
	# and written as a raw string so the backslash escapes reach the regex
	# engine untouched.
	_request_re = re.compile(r'^([0-9]+)\ (http|ftp):\/\/([-\w.:]+)\/([^ ]*)\ ([0-9.]+)\/(-|[\w\.]+)\ (-|\w+)\ (-|GET|HEAD|POST).*$')

	def __init__(self):
		self._db = tagDB()
		self._log = Logger()
		self._log.info('started\n')

	def process(self, id, site, ip_address, url_path, line = None):
		"""Look up one request and write its reply line.

		Rows returned by the database are tried in order.  A row without a
		redirect url is skipped.  A row with a regexp applies only if that
		regexp matches ``url_path`` from its start; a regexp that does not
		compile is logged and the row is skipped.  The first applicable row
		supplies the reply, built by formatting its redirect url with
		``url_path``.  If no row applies the reply is ``'-'``.

		``line`` is accepted for signature compatibility with the subclasses
		and is not used here.  Errors raised while formatting the redirect
		url are not caught.
		"""
		self._log.info('trying {}\n'.format(site))
		reply = '-'
		for row in self._db.check(site, ip_address):
			if row is None or row[0] is None:
				continue
			redirect_url, regexp = row[0], row[1]
			if regexp is not None:
				self._log.info('trying regexp "{}" versus "{}"\n'.format(regexp, url_path))
				# only a compile failure is expected here; anything else
				# (including interpreter exit requests) must propagate
				try:
					pattern = re.compile(regexp)
				except re.error:
					self._log.info("can't compile regexp")
					continue
				if not pattern.match(url_path):
					continue
			reply = redirect_url.format(url_path)
			break
		self.writeline('{} {}\n'.format(id, reply))

	def check(self, line):
		"""Parse one request line and hand it to ``process``.

		Returns True when the line was recognised.  An unrecognised line is
		logged, echoed back unchanged and False is returned.
		"""
		request = self._request_re.match(line)
		if not request:
			self._log.info('bad request\n')
			self.writeline(line)
			return False
		request_id, site, url_path, ip_address = request.group(1, 3, 4, 5)
		self.process(request_id, site, ip_address, url_path, line)
		return True

	def writeline(self, string):
		"""Log ``string`` and write it to standard output, flushing at once."""
		self._log.info('sending: ' + string)
		sys.stdout.write(string)
		sys.stdout.flush()

	def loop(self):
		"""Check standard input line by line until end of file."""
		while True:
			line = sys.stdin.readline()
			if not line:
				break
			self.check(line)
ed7808827d 2009-10-14  109: 
b93dc49210 2009-10-13  110: # threaded checking facility
class CheckerThread(Checker):
	"""Checker that answers queued requests on a background thread.

	The reading side (``loop``/``process``) only appends parsed requests to a
	queue; the thread started by the constructor pops them one at a time and
	runs the base class lookup on each.
	"""

	__slots__ = frozenset(['_lock', '_lock_exit', '_lock_queue', '_queue'])

	def __init__(self):
		import _thread

		# database and logger are set up by the base class
		Checker.__init__(self)

		# _lock stalls the worker: it starts out held, the worker acquires it
		# before every pop, and it is released by whoever knows the queue still
		# has data - the writer after appending, or the worker itself when more
		# than one request is waiting.
		# _lock_exit is taken by the writer when it queues a request and
		# released by the worker once the queue is empty; loop() waits on it.
		# _lock_queue guards the queue itself.
		self._lock = _thread.allocate_lock()
		self._lock_exit = _thread.allocate_lock()
		self._lock_queue = _thread.allocate_lock()
		self._lock.acquire()
		self._queue = []
		_thread.start_new_thread(self._start, ())

	def _start(self):
		"""Worker body: wait for work, pop one request, process it, repeat."""
		while True:
			self._lock.acquire()
			with self._lock_queue:
				# more work remains after this pop, so let the next pass through
				if len(self._queue) > 1 and self._lock.locked():
					self._lock.release()
				request = self._queue.pop(0)
			Checker.process(self, *request)
			with self._lock_queue:
				# queue drained: allow loop() to return
				if not self._queue and self._lock_exit.locked():
					self._lock_exit.release()

	def process(self, id, site, ip_address, url_path, line):
		"""Queue one parsed request for the worker instead of answering it."""
		entry = (id, site, ip_address, url_path)
		with self._lock_queue:
			self._queue.append(entry)
			self._log.info('request {} queued ({})\n'.format(id, line))
			# hold the exit lock while anything is pending
			if not self._lock_exit.locked():
				self._lock_exit.acquire()
			# wake the worker if it is stalled
			if self._lock.locked():
				self._lock.release()

	def loop(self):
		"""Queue standard input lines until end of file, then wait for the worker."""
		for line in iter(sys.stdin.readline, ''):
			self.check(line)
		self._lock_exit.acquire()
ed7808827d 2009-10-14  162: 
26fc9b34d9 2010-08-07  163: # kqueue enabled class for BSD's
class CheckerKqueue(Checker):
	"""Single-threaded checker driven by a kqueue watching standard input.

	Incoming data is split into lines and queued whenever the kqueue reports
	it; queued requests are answered one at a time whenever a poll returns no
	event.
	"""

	__slots__ = frozenset(['_kq', '_select', '_queue'])

	def __init__(self):
		# database and logger are set up by the base class
		Checker.__init__(self)

		# the select module is kept on the instance for use in loop()
		import select
		self._select = select

		# creating kqueue
		self._kq = self._select.kqueue()
		assert self._kq.fileno() != -1, "Fatal error: can't initialise kqueue."

		# watching sys.stdin for data
		self._kq.control([self._select.kevent(sys.stdin, self._select.KQ_FILTER_READ, self._select.KQ_EV_ADD)], 0)

		# requests parsed but not yet answered
		self._queue = []

	def loop(self):
		"""Read and answer requests until end of stream and an empty queue.

		Text after the last newline is kept in a buffer until more data
		arrives.
		"""
		# block in the kqueue until data arrives
		timeout = None
		eof = False
		buffer = ''
		while True:
			# checking if there is any data or waiting for data to arrive
			kevs = self._kq.control(None, 1, timeout)

			for kev in kevs:
				if kev.filter == self._select.KQ_FILTER_READ and kev.data > 0:
					new_buffer = sys.stdin.read(kev.data)
					if not new_buffer:
						# nothing could be read - end of file
						eof = True
					else:
						buffer += new_buffer
						lines = buffer.split('\n')
						# the unterminated tail goes back to the buffer
						buffer = lines.pop()
						for line in lines:
							if self.check(line + '\n'):
								# something is queued: poll without blocking
								timeout = 0

				# end of stream reported: stop watching stdin
				if kev.flags & self._select.KQ_EV_EOF:
					self._kq.control([self._select.kevent(sys.stdin, self._select.KQ_FILTER_READ, self._select.KQ_EV_DELETE)], 0)
					eof = True

			if not kevs and self._queue:
				# no event on this poll - answer one queued request
				req = self._queue.pop(0)
				Checker.process(self, *req)
				if not self._queue:
					# nothing left to process, block on the next poll
					timeout = None

			# if queue is empty and we reached end of stream - we can exit
			if eof and not self._queue:
				break

	def process(self, id, site, ip_address, url_path, line):
		"""Queue one parsed request; ``loop`` answers it later."""
		self._queue.append((id, site, ip_address, url_path))
		self._log.info('request {} queued ({})\n'.format(id, line))
7e3418d94f 2009-10-12  238: 
fc934cead1 2009-10-13  239: # this class processes the config file and substitutes default values
class Config:
	"""Command line options plus config file values with built-in defaults.

	Usage is two-step: ``section(name)`` selects the current section, then
	``config[option]`` reads an option from it.  An option missing from the
	file yields the value from ``_default`` or, failing that, ``None``.
	The parsed command line is available as ``options``.
	"""

	# '_default' must not be listed here: it is a class attribute, and a name
	# that is both in __slots__ and in the class body makes the class
	# statement itself fail with ValueError.
	__slots__ = frozenset(['_config', '_section', 'options'])
	_default = {
		'reactor': {
			'reactor': 'thread',
		},
		'log': {
			'silent': 'no',
		},
		'database': {
			'host': 'localhost',
			'database': 'squidTag',
	},}

	def __init__(self):
		"""Parse the command line and read the config file it names.

		Raises AssertionError when the config file is not readable.
		"""
		import configparser, optparse, os

		parser = optparse.OptionParser()
		parser.add_option('-c', '--config', dest = 'config',
			help = 'config file location', metavar = 'FILE',
			default = '/usr/local/etc/squid-tagger.conf')
		parser.add_option('-d', '--dump', dest = 'dump',
			help = 'dump database', action = 'store_true', metavar = 'bool',
			default = False)

		# positional arguments are not used
		self.options, _ = parser.parse_args()

		assert os.access(self.options.config, os.R_OK), "Fatal error: can't read {}".format(self.options.config)

		self._config = configparser.ConfigParser()
		# read_file() superseded readfp(), which newer interpreters no longer
		# provide; use whichever this interpreter has
		reader = getattr(self._config, 'read_file', None) or self._config.readfp
		with open(self.options.config) as config_file:
			reader(config_file)

	def section(self, section):
		"""Select ``section`` for later lookups, creating it if it is missing."""
		if not self._config.has_section(section):
			self._config.add_section(section)
		self._section = section

	def __getitem__(self, name):
		"""Return option ``name`` of the selected section.

		A value from the config file wins; otherwise the built-in default is
		returned, or None when there is no default either.  The parser is not
		modified: storing None in it is rejected by newer interpreters.
		"""
		if self._config.has_option(self._section, name):
			return self._config.get(self._section, name)
		return self._default.get(self._section, {}).get(name)
d500448801 2009-10-05  291: 
fc934cead1 2009-10-13  292: # initializing and reading in config file
config = Config()

if config.options.dump:
	# dump mode: write the database contents to standard output as CSV
	import csv

	tagdb = tagDB()

	csv_writer = csv.writer(sys.stdout)
	csv_writer.writerow(['site', 'tags', 'regexp'])
	for row in tagdb.dump():
		# the second column is written as a brace-enclosed, comma-joined list
		csv_writer.writerow([row[0], '{' + ','.join(row[1]) + '}', row[2]])

else:
	# redirector mode: pick the reactor named in the config and run it
	config.section('reactor')
	reactors = {
		'thread': CheckerThread,
		'plain': Checker,
		'kqueue': CheckerKqueue,
	}
	reactor = config['reactor']
	if reactor not in reactors:
		# fail with a clear message instead of a NameError on an unbound name
		sys.exit("Fatal error: unknown reactor '{}'".format(reactor))
	checker = reactors[reactor]()

	checker.loop()