Overview
Comment: | the idea of moving array values out to their own tables was good at saving disk space but very bad for speed; reverted all of that |
Downloads: | Tarball | ZIP archive | SQL archive |
Timelines: | family | ancestors | descendants | both | master | trunk |
Files: | files | file ages | folders |
SHA3-256: | bde51dc0c7f885e36dffe60df2cfe597 |
User & Date: | c.kworr@d4daf22a-8aaf-11de-a64d-234b64dd91b4 on 2010-08-26 19:49:47.000 |
Other Links: | branch diff | manifest | tags |
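
The check-in comment is about a schema trade-off: tag and site values return to being stored inline as PostgreSQL arrays rather than in separate per-value tables. A minimal sketch of that contrast, assuming hypothetical names for the abandoned normalized layout (only the array-based definitions actually appear in database.sql below):

    -- Layout this check-in returns to: values kept inline as arrays, so a
    -- rule lookup is a single array-operator comparison (see database.sql below):
    --   urls(site text[], tag text[], regexp text)
    --   rules(netmask cidr, tag text[], ...)
    select redirect_url from rules join urls on (urls.tag && rules.tag);

    -- Reverted layout (names invented for illustration): smaller on disk,
    -- but every lookup pays for additional joins:
    --   url_tag(url_id integer, tag text)
    --   rule_tag(rule_id integer, tag text)
    -- select r.redirect_url from rules r
    --   join rule_tag rt on rt.rule_id = r.id
    --   join url_tag ut on ut.tag = rt.tag;
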
Context
2010-08-30
08:32 | fixed field naming after database upgrade | check-in: 8a8515f5c4 user: c.kworr@d4daf22a-8aaf-11de-a64d-234b64dd91b4 tags: master, trunk

2010-08-26
19:49 | the idea of moving array values out to their own tables was good at saving disk space but very bad for speed; reverted all of that | check-in: bde51dc0c7 user: c.kworr@d4daf22a-8aaf-11de-a64d-234b64dd91b4 tags: master, trunk

2010-08-13
14:53 | new script which can dump squidguard database to compatible format | check-in: 1a367d050d user: c.kworr@d4daf22a-8aaf-11de-a64d-234b64dd91b4 tags: master, trunk
Changes
Modified database.sql
from [c2d74124ad]
to [952aeefe4b].
-- general array sorting and domain processing functions

-- sorts array
CREATE or replace FUNCTION sort(original anyarray) RETURNS anyarray
    LANGUAGE sql IMMUTABLE STRICT
    AS $_$
select array_agg(item) as result from (select unnest($1) as item order by item) a;
$_$;

-- sorts array and removes duplicates
CREATE or replace FUNCTION usort(original anyarray) RETURNS anyarray
    LANGUAGE sql IMMUTABLE STRICT
    AS $_$
select array_agg(item) as result from (select distinct unnest($1) as item order by item) a;
$_$;

-- general database structure

-- table to hold all rules
CREATE TABLE rules (
    netmask cidr NOT NULL,
    redirect_url text DEFAULT 'about::blank'::text NOT NULL,
    from_weekday smallint DEFAULT 0 NOT NULL,
    to_weekday smallint DEFAULT 6 NOT NULL,
    from_time time without time zone DEFAULT '00:00:00'::time without time zone NOT NULL,
    to_time time without time zone DEFAULT '23:59:59'::time without time zone NOT NULL,
    tag text[] NOT NULL
);

-- table to hold tag - site links
CREATE TABLE urls (
    date_added timestamp without time zone DEFAULT ('now'::text)::timestamp(0) without time zone NOT NULL,
    site text[] NOT NULL,
    tag text[] NOT NULL,
    regexp text
);

create unique index urls_rst on urls (regexp, usort(site), usort(tag));

-- rule to join all tables into one to simplify access
-- automaticall uses current day and time data
create view site_rule as
select redirect_url, netmask, site, regexp
from rules join urls on (urls.tag && rules.tag)
where ('now'::text)::time without time zone >= from_time
    and ('now'::text)::time without time zone <= to_time
    and date_part('dow'::text, now()) >= (from_weekday)::double precision
    and date_part('dow'::text, now()) <= (to_weekday)::double precision;

CREATE PROCEDURAL LANGUAGE plpgsql;

-- transforms domain into ordered array for indexing
CREATE or replace FUNCTION tripdomain(url text) RETURNS text[]
    LANGUAGE plpgsql IMMUTABLE STRICT
    AS $_$
declare
    result text[];
    splitted text[];

︙

        pair := string_to_array(site[x], ':');
        splitted[0 - pair[2]::integer] := pair[1];
        x := x - 1;
    end loop;
    return array_to_string(splitted, '.');
end;
$_$;
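
The functions and the site_rule view above are what the filter queries at run time. A short usage sketch, assuming the schema is loaded into PostgreSQL; the hostname, client address, and the tripdomain() output shown in the comment are illustrative, while the final statement mirrors the prepared query used in squid-tagger.py below:

    -- tripdomain() flattens a hostname into an array of "label:position"
    -- elements so it can be matched with array operators; assumed output:
    --   'www.example.com' -> {com:1,example:2,www:3}
    select tripdomain('www.example.com');

    -- Resolving a request against the combined rules, as squid-tagger.py does:
    select redirect_url, regexp
      from site_rule
     where site <@ tripdomain('www.example.com')
       and netmask >> '192.168.0.10'::inet
     order by array_length(site, 1) desc;
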
Modified squid-tagger.py
from [c46e709b29]
to [08a021606d].
︙

    def check(self, site, ip_address):
        if self._check_stmt == None:
            self._check_stmt = self._db.prepare("select redirect_url, regexp from site_rule where site <@ tripdomain($1) and netmask >> $2::text::inet order by array_length(site, 1) desc")
        return(self._check_stmt(site, ip_address))

    def dump(self):
        return(self._db.prepare("copy (select untrip(site) as site, tag, regexp from urls order by site, tag) to stdout csv header")())

    def load(self, data):
        with self._db.xact():
            if config.options.flush_db:
                self._db.execute('delete from urls;')
            insert = self._db.prepare("insert into urls (site, tag, regexp) values (tripdomain($1), $2::text::text[], $3)")
            for row in data:
                if len(row) == 2:
                    insert(row[0], row[1], None)
                else:
                    insert(row[0], row[1], row[2])
            self._db.execute("update urls set regexp = NULL where regexp = ''")
        self._db.execute('vacuum analyze urls;')

    def load_conf(self, csv_data):
        with self._db.xact():
            self._db.execute('delete from rules;')
            insertconf = self._db.prepare("insert into rules (netmask, redirect_url, from_weekday, to_weekday, from_time, to_time, tag) values ($1::text::cidr, $2, $3, $4, $5::text::time, $6::text::time, $7::text::text[])")
            for row in csv_data:
                insertconf(row[0], row[1], int(row[2]), int(row[3]), row[4], row[5], row[6])
        self._db.execute('vacuum analyze rules;')

    def dump_conf(self):
        return(self._db.prepare("copy (select netmask, redirect_url, from_weekday, to_weekday, from_time, to_time, tag from rules) to stdout csv header")())

# abstract class with basic checking functionality
class Checker:
    __slots__ = frozenset(['_db', '_log', '_request'])

    def __init__(self):
        self._db = tagDB()
        self._log = Logger()
        self._log.info('started\n')
        self._request = re.compile('^([0-9]+)\ (http|ftp):\/\/([-\w.:]+)\/([^ ]*)\ ([0-9.]+)\/(-|[\w\.]+)\ (-|\w+)\ (-|GET|HEAD|POST).*$')

    def process(self, id, site, ip_address, url_path, line = None):
        self._log.info('trying {}\n'.format(site))
        result = self._db.check(site, ip_address)
        reply = '-'
        for row in result:
            if row != None and row[0] != None:
︙

                    self._log.info("can't compile regexp")
                else:
                    reply = row[0].format(url_path)
            break
        self.writeline('{} {}\n'.format(id, reply))

    def check(self, line):
        request = self._request.match(line)
        if request:
            id = request.group(1)
            #proto = request.group(2)
            site = request.group(3)
            url_path = request.group(4)
            ip_address = request.group(5)
            self.process(id, site, ip_address, url_path, line)
︙

                    # don't wait for more data, start processing
                    timeout = 0
                    # detect end of stream and exit if possible
                    if kev.flags >> 15 == 1:
                        self._kq.control([self._select.kevent(sys.stdin, self._select.KQ_FILTER_READ, self._select.KQ_EV_DELETE)], 0)
                        eof = True
                        #timeout = 0

            if len(kevs) == 0:
                if len(self._queue) > 0:
                    # get one request and process it
                    req = self._queue.pop(0)
                    Checker.process(self, req[0], req[1], req[2], req[3])
                    if len(self._queue) == 0:
︙

            default = '/usr/local/etc/squid-tagger.conf')
        parser.add_option('-d', '--dump', dest = 'dump',
            help = 'dump database', action = 'store_true',
            metavar = 'bool', default = False)
        parser.add_option('-f', '--flush-database', dest = 'flush_db',
            help = 'flush previous database on load',
            default = False, action = 'store_true', metavar = 'bool')
        parser.add_option('-l', '--load', dest = 'load',
            help = 'load database', action = 'store_true',
            metavar = 'bool', default = False)
        parser.add_option('-D', '--dump-conf', dest = 'dump_conf',
            help = 'dump filtering rules', default = False,
            metavar = 'bool', action = 'store_true')
        parser.add_option('-L', '--load-conf', dest = 'load_conf',
︙

# initializing and reading in config file
config = Config()

if config.options.dump or config.options.load or config.options.dump_conf or config.options.load_conf:
    import csv

    tagdb = tagDB()
    data_fields = ['site', 'tag', 'regexp']
    conf_fields = ['netmask', 'redirect_url', 'from_weekday', 'to_weekday', 'from_time', 'to_time', 'tag']

    if config.options.dump or config.options.dump_conf:
        if config.options.dump:
            dump = tagdb.dump()
        elif config.options.dump_conf:
            dump = tagdb.dump_conf()

        for line in dump:
            sys.stdout.write(line.decode('utf-8'))

    elif config.options.load or config.options.load_conf:
        csv_reader = csv.reader(sys.stdin)
        first_row = next(csv_reader)

        if config.options.load:
            fields = data_fields
            load = tagdb.load
        elif config.options.load_conf:
            fields = conf_fields
            load = tagdb.load_conf

        assert first_row == fields, 'File must contain csv data with theese columns: ' + repr(fields)
        load(csv_reader)

else:
    # main loop
    config.section('reactor')
    if config['reactor'] == 'thread':
        checker = CheckerThread()
    elif config['reactor'] == 'plain':
        checker = Checker()
    elif config['reactor'] == 'kqueue':
        checker = CheckerKqueue()

    checker.loop()