Index: database.sql
==================================================================
--- database.sql
+++ database.sql
@@ -1,85 +1,5 @@
--- general database structure
--- table to hold all rules
-CREATE TABLE rules (
-    netmask cidr NOT NULL,
-    redirect_url text DEFAULT 'about::blank'::text NOT NULL,
-    from_weekday smallint DEFAULT 0 NOT NULL,
-    to_weekday smallint DEFAULT 6 NOT NULL,
-    from_time time without time zone DEFAULT '00:00:00'::time without time zone NOT NULL,
-    to_time time without time zone DEFAULT '23:59:59'::time without time zone NOT NULL,
-    id_tag smallint NOT NULL
-);
-
-ALTER TABLE ONLY rules
-    ADD CONSTRAINT rules_tag_f FOREIGN KEY (id_tag) REFERENCES tag(id_tag) MATCH FULL
-    ON UPDATE RESTRICT ON DELETE RESTRICT DEFERRABLE INITIALLY DEFERRED;
-
--- table to hold site arrays
-CREATE TABLE site (
-    id_site serial,
-    site text[] NOT NULL
-);
-
-ALTER TABLE ONLY site
-    ADD CONSTRAINT site_pkey PRIMARY KEY (id_site);
-
-CREATE UNIQUE INDEX site_u ON site (usort(site));
-
-CREATE INDEX site_g ON site USING gin (site);
-
--- table to hold tag combinations
-CREATE TABLE tag (
-    id_tag serial,
-    tag text[] NOT NULL
-);
-
-ALTER TABLE ONLY tag
-    ADD CONSTRAINT tag_pkey PRIMARY KEY (id_tag);
-
-CREATE UNIQUE INDEX tag_u ON tag (usort(tag));
-
-CREATE INDEX tag_g ON tag USING gin (tag);
-
--- table to hold tag - site links
-CREATE TABLE urls (
-    date_added timestamp without time zone DEFAULT ('now'::text)::timestamp(0) without time zone NOT NULL,
-    id_site integer NOT NULL,
-    id_tag smallint NOT NULL,
-    regexp text
-);
-
-CREATE UNIQUE INDEX urls_pkey ON urls USING btree (id_site, regexp);
-
-CREATE INDEX urls_id_tag ON urls USING btree (id_tag);
-
-ALTER TABLE ONLY urls
-    ADD CONSTRAINT urls_site_f FOREIGN KEY (id_site) REFERENCES site(id_site) MATCH FULL
-    ON UPDATE RESTRICT ON DELETE RESTRICT DEFERRABLE INITIALLY DEFERRED;
-
-ALTER TABLE ONLY urls
-    ADD CONSTRAINT urls_tag_f FOREIGN KEY (id_tag) REFERENCES tag(id_tag) MATCH FULL
-    ON UPDATE RESTRICT ON DELETE RESTRICT DEFERRABLE INITIALLY DEFERRED;
-
--- rule to join all tables into one to simplify access
--- automaticall uses current day and time data
-CREATE VIEW site_rule AS
-SELECT a.redirect_url, a.netmask, b.site, b.regexp
-FROM ((
-    SELECT rules.redirect_url, tag.tag AS rule_tag, rules.netmask
-    FROM rules NATURAL JOIN tag
-    WHERE ('now'::text)::time without time zone >= rules.from_time
-    AND ('now'::text)::time without time zone <= rules.to_time
-    AND date_part('dow'::text, now()) >= (rules.from_weekday)::double precision
-    AND date_part('dow'::text, now()) <= (rules.to_weekday)::double precision
-) a JOIN (
-    SELECT site.site, tag.tag AS url_tag, regexp
-    FROM urls NATURAL JOIN tag NATURAL JOIN site
-) b ON (b.url_tag && a.rule_tag));
-
-CREATE PROCEDURAL LANGUAGE plpgsql;
-
 -- general array sorting and domain processing functions
 -- sorts array
 CREATE or replace FUNCTION sort(original anyarray) RETURNS anyarray
     LANGUAGE sql IMMUTABLE STRICT
     AS $_$
@@ -91,10 +11,45 @@
     LANGUAGE sql IMMUTABLE STRICT
     AS $_$
 select array_agg(item) as result from (select distinct unnest($1) as item order by item) a;
 $_$;
+-- general database structure
+-- table to hold all rules
+CREATE TABLE rules (
+    netmask cidr NOT NULL,
+    redirect_url text DEFAULT 'about:blank'::text NOT NULL,
+    from_weekday smallint DEFAULT 0 NOT NULL,
+    to_weekday smallint DEFAULT 6 NOT NULL,
+    from_time time without time zone DEFAULT '00:00:00'::time without time zone NOT NULL,
+    to_time time without time zone DEFAULT '23:59:59'::time without time zone NOT NULL,
+    tag text[] NOT NULL
+);
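+
+-- an illustrative rule row (values are invented; the checker substitutes
+-- the request path into the {} placeholder of redirect_url):
+--   insert into rules (netmask, redirect_url, from_weekday, to_weekday, from_time, to_time, tag)
+--   values ('192.168.0.0/16', 'http://127.0.0.1/blocked?path={}', 1, 5, '09:00', '18:00', '{work}');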
+
+-- table to hold tag - site links
+CREATE TABLE urls (
+    date_added timestamp without time zone DEFAULT ('now'::text)::timestamp(0) without time zone NOT NULL,
+    site text[] NOT NULL,
+    tag text[] NOT NULL,
+    regexp text
+);
+
+create unique index urls_rst on urls (regexp, usort(site), usort(tag));
+
+-- view to join both tables into one to simplify access
+-- automatically uses current day and time data
+create view site_rule as
+select redirect_url, netmask, site, regexp
+from rules join urls
+on (urls.tag && rules.tag)
+where ('now'::text)::time without time zone >= from_time
+    and ('now'::text)::time without time zone <= to_time
+    and date_part('dow'::text, now()) >= (from_weekday)::double precision
+    and date_part('dow'::text, now()) <= (to_weekday)::double precision;
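+
+-- the daemon's lookup runs against this view; its prepared statement (see
+-- squid-tagger.py below) has this shape, shown with illustrative arguments:
+--   select redirect_url, regexp from site_rule
+--   where site <@ tripdomain('www.example.com')
+--     and netmask >> '192.168.0.15'::inet
+--   order by array_length(site, 1) desc;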
+
+CREATE PROCEDURAL LANGUAGE plpgsql;
+
 -- transforms domain into ordered array for indexing
 CREATE or replace FUNCTION tripdomain(url text) RETURNS text[]
     LANGUAGE plpgsql IMMUTABLE STRICT
     AS $_$
 declare
@@ -131,116 +86,5 @@
     x := x - 1;
 end loop;
 return array_to_string(splitted, '.');
 end;
 $_$;
-
--- functions that works with tables
--- this functions returns id of site
-create or replace function get_site(my_site text[]) returns integer
-    language plpgsql strict
-    as $$
-declare
-    site_id integer;
-begin
-    select id_site from site where my_site = site into site_id;
-    if not found then
-        insert into site (site) values (my_site);
-        select id_site from site where my_site = site into site_id;
-    end if;
-    return site_id;
-end;
-$$;
-
-create or replace function get_site(domain text) returns integer
-    language sql immutable strict
-    as $$
-select get_site(tripdomain($1)) as result;
-$$;
-
--- this function inserts or updates record with tags to site by site id with regexp
-CREATE or replace FUNCTION mark(my_id_site integer, my_id_tag integer, my_regexp text) RETURNS integer
-    LANGUAGE plpgsql STRICT
-    AS $$
-declare
-    -- maybe check should be added to make sure supplied site id really exists
-    my_tag text[];
-begin
-    -- selecting tags site already have and adding new tag to them
-    -- note that tags should be sorted to eliminate permutations
-    select coalesce(tag, '{}'::text[]) from urls natural left join tag
-        where id_site = my_id_site and regexp = my_regexp into my_tag;
-    if not found then
-        -- no records found - creating new tag
-        insert into urls (id_site, id_tag, regexp) values (my_id_site, my_id_tag, my_regexp);
-    else
-        -- joining tags
-        select usort(my_tag || tag) from tag where id_tag = my_id_tag into my_tag;
-        -- updating existing record
-        update urls set id_tag = get_tag(my_tag)
-            where id_site = my_id_site and regexp = my_regexp;
-    end if;
-    return my_id_site;
-end;
-$$;
-
--- this function adds tag to site by site id
-CREATE or replace FUNCTION mark(my_id_site integer, new_tag text) RETURNS integer
-    LANGUAGE sql immutable STRICT
-    AS $$
-select mark($1, get_tag(array[$2]), NULL) as result;
-$$;
-
--- this function adds tag to domain
-CREATE or replace FUNCTION mark(domain text, new_tag text) RETURNS integer
-    LANGUAGE sql immutable STRICT
-    AS $$
-select mark(get_site($1), get_tag(array[$2]), NULL) as result;
-$$;
-
--- this function sets tags for site without regexp
-CREATE or replace FUNCTION set(my_id_site integer, my_id_tag integer) RETURNS integer
-    LANGUAGE sql STRICT
-    AS $$
-delete from urls where $1 = id_site and regexp is NULL;
-insert into urls (id_site, id_tag) values ($1, $2);
-select $1;
-$$;
-
--- this function sets tags for site/regexp pair
-CREATE or replace FUNCTION set(my_id_site integer, my_id_tag integer, my_regexp text) RETURNS integer
-    LANGUAGE sql STRICT
-    AS $$
-delete from urls where $1 = id_site and $3 = regexp;
-insert into urls (id_site, id_tag, regexp) values ($1, $2, $3);
-select $1;
-$$;
-
--- this function stores new data for site/regexp pair
-create or replace function set(domain text, tags text, regexp text) returns integer
-    language sql immutable strict
-    as $$
-select set(get_site($1), get_tag($2::text[]), $3);
-$$;
-
--- this function stores new data for site/regexp pair
-create or replace function set(domain text, tags text) returns integer
-    language sql immutable strict
-    as $$
-select set(get_site($1), get_tag($2::text[]));
-$$;
-
--- this function returns id of tag array
-create or replace function get_tag(my_tag text[]) returns integer
-    language plpgsql strict
-    as $$
-declare
-    tag_id integer;
-begin
-    select id_tag from tag where usort(my_tag) = tag into tag_id;
-    if not found then
-        insert into tag (tag) values (usort(my_tag));
-        select id_tag from tag where usort(my_tag) = tag into tag_id;
-    end if;
-    return tag_id;
-end;
-$$;
Index: squid-tagger.py
==================================================================
--- squid-tagger.py
+++ squid-tagger.py
@@ -42,47 +42,45 @@
         if self._check_stmt == None:
             self._check_stmt = self._db.prepare("select redirect_url, regexp from site_rule where site <@ tripdomain($1) and netmask >> $2::text::inet order by array_length(site, 1) desc")
         return(self._check_stmt(site, ip_address))
 
     def dump(self):
-        return(self._db.prepare("select untrip(site), tag, regexp from urls natural join site natural join tag order by site, tag")())
+        return(self._db.prepare("copy (select untrip(site) as site, tag, regexp from urls order by site, tag) to stdout csv header")())
 
-    def load(self, csv_data):
+    def load(self, data):
         with self._db.xact():
             if config.options.flush_db:
                 self._db.execute('delete from urls;')
-            if config.options.flush_site:
-                self._db.execute('delete from site;');
-            insertreg = self._db.prepare("select set($1, $2, $3)")
-            insert = self._db.prepare("select set($1, $2)")
-            for row in csv_data:
-                if len(row[2]) > 0:
-                    insertreg(row[0], row[1], row[2])
+            insert = self._db.prepare("insert into urls (site, tag, regexp) values (tripdomain($1), $2::text::text[], $3)")
+            for row in data:
+                if len(row) == 2:
+                    insert(row[0], row[1], None)
                 else:
-                    insert(row[0], row[1])
-            self._db.execute('vacuum analyze site;')
+                    insert(row[0], row[1], row[2])
+            self._db.execute("update urls set regexp = NULL where regexp = ''")
         self._db.execute('vacuum analyze urls;')
 
     def load_conf(self, csv_data):
         with self._db.xact():
             self._db.execute('delete from rules;')
-            insertconf = self._db.prepare("insert into rules (netmask, redirect_url, from_weekday, to_weekday, from_time, to_time, id_tag) values ($1::text::cidr, $2, $3, $4, $5::text::time, $6::text::time, get_tag($7::text::text[]))")
+            insertconf = self._db.prepare("insert into rules (netmask, redirect_url, from_weekday, to_weekday, from_time, to_time, tag) values ($1::text::cidr, $2, $3, $4, $5::text::time, $6::text::time, $7::text::text[])")
             for row in csv_data:
                 insertconf(row[0], row[1], int(row[2]), int(row[3]), row[4], row[5], row[6])
         self._db.execute('vacuum analyze rules;')
 
     def dump_conf(self):
-        return(self._db.prepare("select netmask, redirect_url, from_weekday, to_weekday, from_time, to_time, tag from rules natural join tag")())
+        return(self._db.prepare("copy (select netmask, redirect_url, from_weekday, to_weekday, from_time, to_time, tag from rules) to stdout csv header")())
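+
+    # both copy queries above emit csv with a header row; a sketch of the
+    # expected shapes (data rows below are invented examples, not real dumps):
+    #   site,tag,regexp
+    #   www.example.com,"{ads,banners}",
+    #
+    #   netmask,redirect_url,from_weekday,to_weekday,from_time,to_time,tag
+    #   192.168.0.0/16,http://127.0.0.1/blocked?path={},1,5,09:00:00,18:00:00,"{work}"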
 
 # abstract class with basic checking functionality
 class Checker:
-    __slots__ = frozenset(['_db', '_log'])
+    __slots__ = frozenset(['_db', '_log', '_request'])
 
     def __init__(self):
         self._db = tagDB()
         self._log = Logger()
         self._log.info('started\n')
+        self._request = re.compile('^([0-9]+)\ (http|ftp):\/\/([-\w.:]+)\/([^ ]*)\ ([0-9.]+)\/(-|[\w\.]+)\ (-|\w+)\ (-|GET|HEAD|POST).*$')
 
     def process(self, id, site, ip_address, url_path, line = None):
         self._log.info('trying {}\n'.format(site))
         result = self._db.check(site, ip_address)
         reply = '-'
@@ -102,11 +100,11 @@
                 reply = row[0].format(url_path)
                 break
         self.writeline('{} {}\n'.format(id, reply))
 
     def check(self, line):
-        request = re.compile('^([0-9]+)\ (http|ftp):\/\/([-\w.:]+)\/([^ ]*)\ ([0-9.]+)\/(-|[\w\.]+)\ (-|\w+)\ (-|GET|HEAD|POST).*$').match(line)
+        request = self._request.match(line)
         if request:
             id = request.group(1)
             #proto = request.group(2)
             site = request.group(3)
             url_path = request.group(4)
@@ -238,10 +236,11 @@
             # detect end of stream and exit if possible
             if kev.flags >> 15 == 1:
                 self._kq.control([self._select.kevent(sys.stdin, self._select.KQ_FILTER_READ, self._select.KQ_EV_DELETE)], 0)
                 eof = True
+
+            #timeout = 0
 
         if len(kevs) == 0:
             if len(self._queue) > 0:
                 # get one request and process it
                 req = self._queue.pop(0)
@@ -286,13 +285,10 @@
         help = 'dump database', action = 'store_true', metavar = 'bool',
         default = False)
     parser.add_option('-f', '--flush-database', dest = 'flush_db',
         help = 'flush previous database on load', default = False,
         action = 'store_true', metavar = 'bool')
-    parser.add_option('-F', '--flush-site', dest = 'flush_site',
-        help = 'when flushing previous dtabase flush site index too',
-        action = 'store_true', default = False, metavar = 'bool')
     parser.add_option('-l', '--load', dest = 'load',
         help = 'load database', action = 'store_true', metavar = 'bool',
         default = False)
     parser.add_option('-D', '--dump-conf', dest = 'dump_conf',
         help = 'dump filtering rules', default = False, metavar = 'bool',
@@ -332,42 +328,35 @@
 
     if config.options.dump or config.options.load or config.options.dump_conf or config.options.load_conf:
         import csv
 
         tagdb = tagDB()
-        data_fields = ['site', 'tags', 'regexp']
+        data_fields = ['site', 'tag', 'regexp']
         conf_fields = ['netmask', 'redirect_url', 'from_weekday', 'to_weekday', 'from_time', 'to_time', 'tag']
 
         if config.options.dump or config.options.dump_conf:
-            csv_writer = csv.writer(sys.stdout)
-            if config.options.dump:
-                # dumping database
-                csv_writer.writerow(data_fields)
-                for row in tagdb.dump():
-                    csv_writer.writerow([row[0], '{' + ','.join(row[1]) + '}', row[2]])
-
-            elif config.options.dump_conf:
-                # dumping rules
-                csv_writer.writerow(conf_fields)
-                for row in tagdb.dump_conf():
-                    csv_writer.writerow([row[0], row[1], row[2], row[3], row[4], row[5], '{' + ','.join(row[6]) + '}'])
+            if config.options.dump:
+                dump = tagdb.dump()
+            elif config.options.dump_conf:
+                dump = tagdb.dump_conf()
+
+            for line in dump:
+                sys.stdout.write(line.decode('utf-8'))
 
         elif config.options.load or config.options.load_conf:
            csv_reader = csv.reader(sys.stdin)
            first_row = next(csv_reader)
 
            if config.options.load:
-                # loading database
-                assert first_row == data_fields, 'File must contain csv data with theese columns: ' + repr(data_fields)
-
-                tagdb.load(csv_reader)
-
-            elif config.options.load_conf:
-                # loading database
-                assert first_row == conf_fields, 'File must contain csv data with theese columns: ' + repr(conf_fields)
-
-                tagdb.load_conf(csv_reader)
+                fields = data_fields
+                load = tagdb.load
+            elif config.options.load_conf:
+                fields = conf_fields
+                load = tagdb.load_conf
+
+            assert first_row == fields, 'File must contain csv data with these columns: ' + repr(fields)
+            load(csv_reader)
 
     else:
         # main loop
         config.section('reactor')
         if config['reactor'] == 'thread':
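
Example session (all values illustrative). A request line in the format
matched by Checker._request, followed by the reply the daemon writes back
when no rule matches:

    0 http://www.example.com/some/path 192.168.0.15/- - GET
    0 -

A data CSV accepted by --load after this change; the header row must match
data_fields exactly, and an empty regexp column is normalised to NULL on
load:

    site,tag,regexp
    www.example.com,"{ads}",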