Index: database.sql ================================================================== --- database.sql +++ database.sql @@ -75,31 +75,57 @@ create or replace function get_site(domain text) returns integer language sql immutable strict as $$ select get_site(tripdomain($1)) as result; $$; + +-- this function inserts or updates record with tags to site by site id with regexp +CREATE or replace FUNCTION mark(my_id_site integer, my_id_tag integer, my_regexp text) RETURNS integer + LANGUAGE plpgsql STRICT + AS $$ +declare + -- maybe check should be added to make sure supplied site id really exists + my_tag text[]; +begin + -- selecting tags site already have and adding new tag to them + -- note that tags should be sorted to eliminate permutations + select coalesce(tag, '{}'::text[]) from urls natural left join tag + where id_site = my_id_site and regexp = my_regexp into my_tag; + if not found then + -- no records found - creating new tag + insert into urls (id_site, id_tag, regexp) values (my_id_site, my_id_tag, my_regexp); + else + -- joining tags + select usort(my_tag || tag) from tag where id_tag = my_id_tag into my_tag; + -- updating existing record + update urls set id_tag = get_tag(my_tag) + where id_site = my_id_site and regexp = my_regexp; + end if; + return my_id_site; +end; +$$; -- this function adds tag to site by site id CREATE or replace FUNCTION mark(my_id_site integer, new_tag text) RETURNS integer LANGUAGE plpgsql STRICT AS $$ declare -- maybe check should be added to make sure supplied site id really exists my_tag text[]; - my_tag_id integer; begin -- selecting tags site already have and adding new tag to them -- note that tags should be sorted to eliminate permutations - select coalesce(tag, '{}'::text[]) from urls natural left join tag where id_site = my_id_site into my_tag; + select coalesce(tag, '{}'::text[]) from urls natural left join tag + where id_site = my_id_site and regexp is null into my_tag; if not found then -- no records found - creating new tag insert into urls (id_site, id_tag) values (my_id_site, get_tag(array[new_tag])); else -- joining tags select usort(my_tag || array[new_tag]) into my_tag; -- updating existing record - update urls set id_tag = get_tag(my_tag || array[new_tag]) where id_site = my_id_site; + update urls set id_tag = get_tag(my_tag) where id_site = my_id_site and regexp is null; end if; return my_id_site; end; $$; @@ -107,10 +133,24 @@ CREATE or replace FUNCTION mark(domain text, new_tag text) RETURNS integer LANGUAGE sql immutable STRICT AS $$ select mark(get_site($1), $2) as result; $$; + +-- this function adds tag to domain with regexp +CREATE or replace FUNCTION mark(domain text, tags text[], regexp text) RETURNS integer + LANGUAGE sql immutable STRICT + AS $$ +select mark(get_site($1), get_tag($2), $3) as result; +$$; + +-- this function adds tag to domain with regexp +CREATE or replace FUNCTION mark(domain text, tags text[]) RETURNS integer + LANGUAGE sql immutable STRICT + AS $$ +select mark(get_site($1), get_tag($2), NULL) as result; +$$; -- this function returns id of tag array create or replace function get_tag(my_tag text[]) returns integer language plpgsql strict as $$ @@ -138,10 +178,14 @@ ); ALTER TABLE ONLY rules ADD CONSTRAINT rules_pkey PRIMARY KEY (netmask); +ALTER TABLE ONLY rules + ADD CONSTRAINT rules_tag_f FOREIGN KEY (id_tag) REFERENCES tag(id_tag) MATCH FULL + ON UPDATE RESTRICT ON DELETE RESTRICT DEFERRABLE INITIALLY DEFERRED; + -- table to hold site arrays CREATE TABLE site ( id_site serial, site text[] NOT NULL ); @@ -172,16 +216,21 @@ id_site smallint NOT NULL, id_tag smallint NOT NULL, regexp text ); -ALTER TABLE ONLY urls - ADD CONSTRAINT urls_pkey PRIMARY KEY (date_added); +CREATE UNIQUE INDEX urls_pkey ON urls USING btree (id_site, regexp); + +CREATE INDEX urls_id_tag ON urls USING btree (id_tag); -CREATE UNIQUE INDEX urls_id_site ON urls USING btree (id_site); +ALTER TABLE ONLY urls + ADD CONSTRAINT urls_site_f FOREIGN KEY (id_site) REFERENCES site(id_site) MATCH FULL + ON UPDATE RESTRICT ON DELETE RESTRICT DEFERRABLE INITIALLY DEFERRED; -CREATE UNIQUE INDEX urls_id_tag ON urls USING btree (id_tag); +ALTER TABLE ONLY urls + ADD CONSTRAINT urls_tag_f FOREIGN KEY (id_tag) REFERENCES tag(id_tag) MATCH FULL + ON UPDATE RESTRICT ON DELETE RESTRICT DEFERRABLE INITIALLY DEFERRED; -- rule to join all tables into one to simplify access -- automaticall uses current day and time data CREATE VIEW site_rule AS SELECT a.redirect_url, a.netmask, b.site, b.regexp ADDED st-dump.py Index: st-dump.py ================================================================== --- /dev/null +++ st-dump.py @@ -0,0 +1,108 @@ +#!/usr/bin/env python3.1 + +import configparser, csv, optparse, os, postgresql.api, sys + +# wrapper around syslog, can be muted +class Logger: + __slots__ = frozenset(['_syslog']) + + def __init__(self): + config.section('log') + if config['silent'] == 'yes': + self._syslog = None + else: + import syslog + self._syslog = syslog + self._syslog.openlog('squidTag') + + def info(self, message): + if self._syslog: + self._syslog.syslog(self._syslog.LOG_INFO, message) + + def notice(self, message): + if self._syslog: + self._syslog.syslog(self._syslog.LOG_NOTICE, message) + +# wrapper around database +class tagDB: + __slots__ = frozenset(['_prepared', '_dump_stmt', '_db']) + + def __init__(self): + self._prepared = set() + self._db = False + self._dump_stmt = self._curs().prepare("select untrip(site), tag, regexp from urls natural join site natural join tag") + + def _curs(self): + if not self._db: + config.section('database') + self._db = postgresql.open( + 'pq://{}:{}@{}/{}'.format( + config['user'], + config['password'], + config['host'], + config['database'], + ) ) + return(self._db) + + def dump(self): + return(self._dump_stmt()) + +# this classes processes config file and substitutes default values +class Config: + __slots__ = frozenset(['_config', '_default', '_section']) + _default = { + 'reactor': { + 'reactor': 'thread', + }, + 'log': { + 'silent': 'no', + }, + 'database': { + 'host': 'localhost', + 'database': 'squidTag', + },} + + # function to read in config file + def __init__(self): + parser = optparse.OptionParser() + parser.add_option('-c', '--config', dest = 'config', + help = 'config file location', metavar = 'FILE', + default = '/usr/local/etc/squid-tagger.conf') + + (options, args) = parser.parse_args() + + if not os.access(options.config, os.R_OK): + print("Can't read {}: exitting".format(options.config)) + sys.exit(2) + + self._config = configparser.ConfigParser() + self._config.readfp(open(options.config)) + + # function to select config file section or create one + def section(self, section): + if not self._config.has_section(section): + self._config.add_section(section) + self._section = section + + # function to get config parameter, if parameter doesn't exists the default + # value or None is substituted + def __getitem__(self, name): + if not self._config.has_option(self._section, name): + if self._section in self._default: + if name in self._default[self._section]: + self._config.set(self._section, name, self._default[self._section][name]) + else: + self._config.set(self._section, name, None) + else: + self._config.set(self._section, name, None) + return(self._config.get(self._section, name)) + +# initializing and reading in config file +config = Config() + +tagdb = tagDB() + +csv_writer = csv.writer(sys.stdout) +csv_writer.writerow(['site', 'tags', 'regexp']) +for row in tagdb.dump(): + csv_writer.writerow([row[0], '{' + ','.join(row[1]) + '}', row[2]]) ADDED st-load.py Index: st-load.py ================================================================== --- /dev/null +++ st-load.py @@ -0,0 +1,125 @@ +#!/usr/bin/env python3.1 + +import configparser, csv, optparse, os, postgresql.api, re, sys + +# wrapper around syslog, can be muted +class Logger: + __slots__ = frozenset(['_syslog']) + + def __init__(self): + config.section('log') + if config['silent'] == 'yes': + self._syslog = None + else: + import syslog + self._syslog = syslog + self._syslog.openlog('squidTag') + + def info(self, message): + if self._syslog: + self._syslog.syslog(self._syslog.LOG_INFO, message) + + def notice(self, message): + if self._syslog: + self._syslog.syslog(self._syslog.LOG_NOTICE, message) + +# wrapper around database +class tagDB: + __slots__ = frozenset(['_prepared', '_db']) + + def __init__(self): + self._prepared = set() + config.section('database') + self._db = postgresql.open( + 'pq://{}:{}@{}/{}'.format( + config['user'], + config['password'], + config['host'], + config['database'], + ) ) + + def load(self, csv_data): + insert = self._db.prepare("select mark($1, array[$2], $3)") + with self._db.xact(): + config.section('loader') + if config['drop_database']: + self._db.execute('delete from urls; delete from site;'); + #print('dropped', config['drop_database']) + for row in csv_data: + insert(row[0], row[1], row[2]) + self._db.execute('vacuum analyze site;') + self._db.execute('vacuum analyze urls;') + +# this classes processes config file and substitutes default values +class Config: + __slots__ = frozenset(['_config', '_default', '_section']) + _default = { + 'reactor': { + 'reactor': 'thread', + }, + 'log': { + 'silent': 'no', + }, + 'database': { + 'user': 'squidTag', + 'password': 'password', + 'host': 'localhost', + 'database': 'squidTag', + }, + 'loader': { + 'drop_database': False, + },} + + # function to read in config file + def __init__(self): + parser = optparse.OptionParser() + parser.add_option('-c', '--config', dest = 'config', + help = 'config file location', metavar = 'FILE', + default = '/usr/local/etc/squid-tagger.conf') + parser.add_option('-d', '--drop-database', dest = 'drop_database', + help = 'signals loader to drop previous database', + action = 'store_true') + + (options, args) = parser.parse_args() + + if options.drop_database: + self._default['loader']['drop_database'] = True + + if not os.access(options.config, os.R_OK): + print("Can't read {}: exitting".format(options.config)) + sys.exit(2) + + self._config = configparser.ConfigParser() + self._config.readfp(open(options.config)) + + # function to select config file section or create one + def section(self, section): + if not self._config.has_section(section): + self._config.add_section(section) + self._section = section + + # function to get config parameter, if parameter doesn't exists the default + # value or None is substituted + def __getitem__(self, name): + if not self._section in self._default or not name in self._default[self._section]: + return None + if not type(self._default[self._section][name]) == type(True): + if not self._config.has_option(self._section, name): + self._config.set(self._section, name, self._default[self._section][name]) + return(self._config.get(self._section, name)) + else: + if not self._config.has_option(self._section, name): + self._config.set(self._section, name, repr(self._default[self._section][name])) + return(self._config.getboolean(self._section, name)) + +# initializing and reading in config file +config = Config() + +tagdb = tagDB() + +csv_reader = csv.reader(sys.stdin) +first_row = next(csv_reader) +if not first_row == ['site', 'tags', 'regexp']: + print('File must contain csv data with three columns: "site", "tags" and "regexp".') + sys.exit(1) +tagdb.load(csv_reader)