0ef24b1937 2011-04-06 1: #!/usr/bin/env python
0ef24b1937 2011-04-06 2:
0ef24b1937 2011-04-06 3: from __future__ import division, print_function, unicode_literals
0ef24b1937 2011-04-06 4:
0ef24b1937 2011-04-06 5: import gevent.monkey
0ef24b1937 2011-04-06 6: gevent.monkey.patch_all()
0ef24b1937 2011-04-06 7:
0ef24b1937 2011-04-06 8: import fcntl, gevent.core, gevent.pool, gevent.queue, gevent.socket, os, psycopg2, re, sys
0ef24b1937 2011-04-06 9:
0ef24b1937 2011-04-06 10: # //inclusion start
0ef24b1937 2011-04-06 11: # Copyright (C) 2010 Daniele Varrazzo <daniele.varrazzo@gmail.com>
0ef24b1937 2011-04-06 12: # and licensed under the MIT license:
0ef24b1937 2011-04-06 13:
def gevent_wait_callback(conn, timeout=None):
    """Wait callback that lets gevent cooperate with Psycopg.

    Polls ``conn`` until it reports ``POLL_OK``.  Whenever the poll result
    asks for read or write readiness, control is handed to gevent through
    ``wait_read``/``wait_write`` on the connection's file descriptor.

    conn    -- connection object exposing ``poll()`` and ``fileno()``
    timeout -- passed through unchanged to the gevent wait calls

    Raises psycopg2.OperationalError on an unrecognised poll state.
    """
    ext = psycopg2.extensions
    while True:
        state = conn.poll()
        if state == ext.POLL_OK:
            return
        if state == ext.POLL_READ:
            gevent.socket.wait_read(conn.fileno(), timeout=timeout)
        elif state == ext.POLL_WRITE:
            gevent.socket.wait_write(conn.fileno(), timeout=timeout)
        else:
            raise psycopg2.OperationalError("Bad result from poll: %r" % state)


# register the callback, or refuse to start when this Psycopg build does not
# expose set_wait_callback
if hasattr(psycopg2.extensions, 'set_wait_callback'):
    psycopg2.extensions.set_wait_callback(gevent_wait_callback)
else:
    raise ImportError("support for coroutines not available in this Psycopg version (%s)" % psycopg2.__version__)
0ef24b1937 2011-04-06 30:
0ef24b1937 2011-04-06 31: # //inclusion end
fc934cead1 2009-10-13 32:
fc934cead1 2009-10-13 33: # this class processes the config file and substitutes default values
class Config(object):
    """Command-line options plus config-file values with built-in defaults.

    Select a section with ``section(name)`` and then read options with
    ``config[option]``.  An option missing from the file falls back to
    ``_default[section][option]`` when present, otherwise to ``None``.
    The parsed command-line options are available as ``options``.
    """

    # '_default' is a class attribute and therefore must not be listed here:
    # on a new-style class a slot name that collides with a class variable
    # raises ValueError when the class is created.
    __slots__ = frozenset(['_config', '_section', 'options'])

    # built-in fallback values, keyed by section and then by option name
    _default = {
        'log': {
            'silent': 'no',
        },
        'database': {
            'host': 'localhost',
            'database': 'squidTag',
        },
    }

    def __init__(self):
        """Parse the command line and read the config file given by -c.

        Raises AssertionError when the config file is not readable.
        """
        import ConfigParser, optparse, os

        parser = optparse.OptionParser()
        parser.add_option('-c', '--config', dest = 'config',
            help = 'config file location', metavar = 'FILE',
            default = '/usr/local/etc/squid-tagger.conf')
        parser.add_option('-d', '--dump', dest = 'dump',
            help = 'dump database', action = 'store_true', metavar = 'bool',
            default = False)
        parser.add_option('-f', '--flush-database', dest = 'flush_db',
            help = 'flush previous database on load', default = False,
            action = 'store_true', metavar = 'bool')
        parser.add_option('-l', '--load', dest = 'load',
            help = 'load database', action = 'store_true', metavar = 'bool',
            default = False)
        parser.add_option('-D', '--dump-conf', dest = 'dump_conf',
            help = 'dump filtering rules', default = False, metavar = 'bool',
            action = 'store_true')
        parser.add_option('-L', '--load-conf', dest = 'load_conf',
            help = 'load filtering rules', default = False, metavar = 'bool',
            action = 'store_true')

        # positional arguments are not used
        self.options, _ = parser.parse_args()

        # Raised explicitly instead of via ``assert`` so that the check is
        # not stripped when Python runs with -O; exception type and message
        # are kept as before.
        if not os.access(self.options.config, os.R_OK):
            raise AssertionError("Fatal error: can't read {}".format(self.options.config))

        self._config = ConfigParser.ConfigParser()
        # close the file as soon as it has been parsed
        with open(self.options.config) as config_file:
            self._config.readfp(config_file)

    def section(self, section):
        """Make ``section`` the current section, creating it when missing."""
        if not self._config.has_section(section):
            self._config.add_section(section)
        self._section = section

    def __getitem__(self, name):
        """Return option ``name`` of the current section.

        When the option is absent, the built-in default (or ``None`` if
        there is none) is stored in the parser first, so later lookups find
        it there.  ``section()`` must have been called beforehand.
        """
        if not self._config.has_option(self._section, name):
            fallback = self._default.get(self._section, {}).get(name)
            self._config.set(self._section, name, fallback)
        return self._config.get(self._section, name)
d500448801 2009-10-05 94:
fc934cead1 2009-10-13 95: # initializing and reading in config file
fc934cead1 2009-10-13 96: config = Config()
d500448801 2009-10-05 97:
39b97ced92 2011-06-05 98: # wrapper around syslog, can be muted
class Logger(object):
    """Thin syslog wrapper that can be muted from the config file.

    When the ``silent`` option of the ``log`` section equals ``yes`` every
    logging call is a no-op; otherwise messages are passed to ``syslog``
    after ``openlog('squidTag')``.
    """

    __slots__ = frozenset(['_syslog'])

    def __init__(self):
        """Read the ``log`` section of the global config and set up syslog."""
        config.section('log')
        if config['silent'] == 'yes':
            # muted: info()/notice() check for None and do nothing
            self._syslog = None
        else:
            import syslog
            self._syslog = syslog
            # str() presumably forces a native string for the ident, since
            # this file enables unicode_literals -- confirm before changing
            self._syslog.openlog(str('squidTag'))

    def info(self, message):
        """Log ``message`` with priority LOG_INFO unless muted."""
        if self._syslog is None:
            return
        self._syslog.syslog(self._syslog.LOG_INFO, message)

    def notice(self, message):
        """Log ``message`` with priority LOG_NOTICE unless muted."""
        if self._syslog is None:
            return
        self._syslog.syslog(self._syslog.LOG_NOTICE, message)
39b97ced92 2011-06-05 118:
39b97ced92 2011-06-05 119: logger = Logger()
39b97ced92 2011-06-05 120:
39b97ced92 2011-06-05 121: # tiny wrapper around a file to make reads from it geventable
39b97ced92 2011-06-05 122: # or should i move this somewhere?
39b97ced92 2011-06-05 123:
class FReadlineQueue(gevent.queue.Queue):
    """Queue that is fed with complete text lines read from a file object.

    The file is switched to non-blocking mode and read from a gevent read
    event.  Each complete line (without its newline) is put on the queue and
    logged; ``None`` is put on the queue once end of file is seen.
    """

    # wrapped file object and the not yet newline-terminated rest of input
    __slots__ = frozenset(['_fd', '_tail'])

    def __init__(self, fd):
        """Wrap file object ``fd`` and start waiting for readable data."""
        gevent.queue.Queue.__init__(self)
        self._fd = fd
        # nothing left over from a previous read yet
        self._tail = ''
        self._install_wait()

    def _install_wait(self):
        """Make the file non-blocking and register the first read event."""
        descriptor = self._fd.fileno()
        flags = fcntl.fcntl(descriptor, fcntl.F_GETFL)
        fcntl.fcntl(descriptor, fcntl.F_SETFL, flags | os.O_NONBLOCK)
        gevent.core.read_event(descriptor, self._wait_helper)

    def _wait_helper(self, ev, evtype):
        """Read event handler: queue the complete lines of one buffer."""
        chunk = self._fd.read(4096)
        # The leftover never contains a newline, so prepending it before the
        # split is the same as gluing it onto the first piece afterwards.
        pieces = (self._tail + chunk.decode('l1')).split('\n')
        # The last piece has no newline yet; keep it for the next read.  At
        # EOF a non-empty leftover stays here and is never queued.
        self._tail = pieces.pop()
        for piece in pieces:
            self.put_nowait(piece)
            logger.info('request: ' + piece)
        if chunk:
            # more data may follow: the handler has to be registered again
            gevent.core.read_event(self._fd.fileno(), self._wait_helper)
        else:
            # an empty read means EOF; None tells the consumer to stop
            self.put_nowait(None)
39b97ced92 2011-06-05 165:
39b97ced92 2011-06-05 166: stdin = FReadlineQueue(sys.stdin)
39b97ced92 2011-06-05 167:
39b97ced92 2011-06-05 168: # wrapper around database
class tagDB(object):
    """Wrapper around the PostgreSQL connection used for all queries."""

    __slots__ = frozenset(['_cursor', '_db'])

    def __init__(self):
        """Connect using the ``database`` config section and open a cursor."""
        config.section('database')
        # Built with keyword syntax on purpose: the keys stay native strings
        # even though this file enables unicode_literals.
        params = dict(
            database = config['database'],
            user = config['user'],
            password = config['password'],
        )
        host = config['host']
        # 'host' is only passed when one is configured
        if host is not None:
            params.update(host = host)
        self._db = psycopg2.connect(**params)
        self._cursor = self._db.cursor()

    def _field_names(self):
        """Return the column names of the cursor's current result set."""
        return [column.name for column in self._cursor.description]

    def check(self, site, ip_address):
        """Return all (redirect_url, regexp) rows for a site and client IP."""
        self._cursor.execute("select * from (select redirect_url, regexp from site_rule where site <@ tripdomain(%s) and netmask >>= %s order by array_length(site, 1) desc) a group by redirect_url, regexp", [site, ip_address])
        return self._cursor.fetchall()

    def dump(self):
        """Return (column names, rows) for the whole ``urls`` table."""
        self._cursor.execute("select untrip(site) as site, tag::text, regexp from urls order by site, tag")
        return self._field_names(), self._cursor.fetchall()

    def load(self, data):
        """Insert rows of (site, tag[, regexp]) into ``urls`` and commit.

        With the --flush-database option the table is emptied first.  A
        two-element row gets a NULL regexp; empty regexps are turned into
        NULL after the insert.  Rows shorter than two elements raise
        IndexError.
        """
        if config.options.flush_db:
            self._cursor.execute('delete from urls;')
        bundle = [
            [row[0], row[1], None if len(row) == 2 else row[2]]
            for row in data
        ]
        self._cursor.executemany("insert into urls (site, tag, regexp) values (tripdomain(%s), %s, %s)", bundle)
        self._cursor.execute("update urls set regexp = NULL where regexp = ''")
        self._db.commit()

    def load_conf(self, csv_data):
        """Replace the contents of ``rules`` with ``csv_data`` and commit.

        Each row needs seven fields in the order netmask, redirect_url,
        from_weekday, to_weekday, from_time, to_time, tag; the two weekday
        fields are converted with int().
        """
        self._cursor.execute('delete from rules;')
        bundle = [
            [row[0], row[1], int(row[2]), int(row[3]), row[4], row[5], row[6]]
            for row in csv_data
        ]
        self._cursor.executemany("insert into rules (netmask, redirect_url, from_weekday, to_weekday, from_time, to_time, tag) values (%s::text::cidr, %s, %s, %s, %s::text::time, %s::text::time, %s::text::text[])", bundle)
        self._db.commit()

    def dump_conf(self):
        """Return (column names, rows) for the whole ``rules`` table."""
        self._cursor.execute("select netmask, redirect_url, from_weekday, to_weekday, from_time, to_time, tag::text from rules")
        return self._field_names(), self._cursor.fetchall()
39b97ced92 2011-06-05 227:
39b97ced92 2011-06-05 228: # abstract class with basic checking functionality
class Checker(object):
    """Reads redirector requests from a queue and answers them on stdout.

    Every request line is matched against the request pattern, looked up in
    the database and answered with either ``<id> <redirect url>`` or just
    ``<id>``.  Lines that do not match the pattern are echoed back.
    """

    __slots__ = frozenset(['_db', '_log', '_queue', '_request'])

    def __init__(self, queue, logger):
        """Open the database and remember the request queue and logger.

        queue  -- object whose ``get()`` returns request lines, then None
        logger -- object with an ``info(message)`` method
        """
        self._db = tagDB()
        self._log = logger
        self._log.info('started\n')
        # groups: 1 id, 2 protocol, 3 site, 4 url path, 5 client address;
        # raw literal, the pattern text itself is unchanged
        self._request = re.compile(r'^([0-9]+)\ (http|ftp):\/\/([-\w.:]+)\/([^ ]*)\ ([0-9.]+)\/(-|[\w\.]+)\ (-|\w+)\ (-|GET|HEAD|POST).*$')
        self._queue = queue

    def process(self, id, site, ip_address, url_path, line = None):
        """Answer one request.

        The first database row that has a redirect URL and either no regexp
        or a regexp matching ``url_path`` wins: ``<id> <url>`` is written,
        with ``url_path`` substituted into the URL through str.format, and
        True is returned.  Otherwise the bare ``<id>`` is written and None
        is returned.  ``line`` is accepted but not used.
        """
        for row in self._db.check(site, ip_address):
            if row is None or row[0] is None:
                continue
            reply = None
            if row[1] is None:
                reply = row[0].format(url_path)
            else:
                self._log.info('trying regexp "{}" versus "{}"\n'.format(row[1], url_path))
                # Best effort: a bad rule is logged and skipped.  Only
                # Exception is caught so that KeyboardInterrupt, SystemExit
                # and greenlet termination are no longer swallowed here.
                try:
                    if re.compile(row[1]).match(url_path):
                        reply = row[0].format(url_path)
                except Exception:
                    self._log.info("can't compile regexp")
            if reply is not None:
                self.writeline('{} {}\n'.format(id, reply))
                return True
        self.writeline('{}\n'.format(id))

    def check(self):
        """Consume request lines from the queue until None arrives."""
        while True:
            line = self._queue.get()
            if line is None:
                break
            request = self._request.match(line)
            if request:
                # group 2 (the protocol) is not needed
                request_id, site, url_path, ip_address = request.group(1, 3, 4, 5)
                self.process(request_id, site, ip_address, url_path, line)
            else:
                self._log.info('bad request\n')
                self.writeline(line + '\n')

    def writeline(self, string):
        """Log ``string`` and write it to stdout, flushing immediately."""
        self._log.info('sending: ' + string)
        sys.stdout.write(string)
        sys.stdout.flush()

    def loop(self):
        """Run check() in a greenlet pool and wait for it to finish."""
        pool = gevent.pool.Pool()
        pool.spawn(self.check)
        pool.join()
39b97ced92 2011-06-05 289:
# Entry point: the maintenance modes dump or load CSV data and exit, anything
# else runs the redirector loop.
if config.options.dump or config.options.load or config.options.dump_conf or config.options.load_conf:
    import csv

    tagdb = tagDB()
    data_fields = ['site', 'tag', 'regexp']
    conf_fields = ['netmask', 'redirect_url', 'from_weekday', 'to_weekday', 'from_time', 'to_time', 'tag']

    if config.options.dump or config.options.dump_conf:
        # --dump wins when both dump options are given
        if config.options.dump:
            header, records = tagdb.dump()
        else:
            header, records = tagdb.dump_conf()

        csv_writer = csv.writer(sys.stdout)
        csv_writer.writerow(header)
        csv_writer.writerows(records)

    else:
        # only the load options are left here; --load wins when both are given
        if config.options.load:
            fields, load = data_fields, tagdb.load
        else:
            fields, load = conf_fields, tagdb.load_conf

        # NOTE(review): sys.stdin has already been put into non-blocking mode
        # by the module-level FReadlineQueue(sys.stdin) -- confirm that csv
        # reading copes with that when the input arrives slowly.
        csv_reader = csv.reader(sys.stdin)
        # None on empty input, which then fails the header check below
        # instead of escaping as a bare StopIteration
        first_row = next(csv_reader, None)

        # Raised explicitly instead of via ``assert`` so that the header
        # check is not stripped when Python runs with -O.
        if first_row != fields:
            raise AssertionError('File must contain csv data with theese columns: ' + repr(fields))
        load(csv_reader)

else:
    # main loop
    Checker(stdin, logger).loop()