ADDED sg_import.py Index: sg_import.py ================================================================== --- /dev/null +++ sg_import.py @@ -0,0 +1,44 @@ +#!/usr/bin/env python3.1 + +# This script converts SquidGuard database into format that can be imported to +# squid-tagger. It should be run in SquidGuard database directory and it would +# produce csv stream that can be redirected to squid-tagger for imports: + +# cd /var/db/squidGuard ; path/to/sg_import.py | path/to/squid-tagger.py -l -f + +# This one will flush squid-tagger's database and load selected SquidGuard +# database. + +import codecs, csv, os, re, sys + +data = {} + +for (path, names, files) in os.walk('.'): + tag = path.lstrip('./') + for file in files: + if file in ('domains', 'expressions', 'urls'): + with codecs.open(path + os.sep + file, 'r', 'L1') as source: + for full_line in source: + line = full_line.strip() + if not re.compile('^(#|$)').match(line): + regexp = None + if file == 'expressions': + regexp = line + line = None + if file == 'urls': + (line, sep, regexp) = line.partition('/') + regexp = '^' + re.escape(regexp) + if line in data: + if regexp in data[line]: + data[line][regexp].add(tag) + else: + data[line][regexp] = set([tag]) + else: + data[line] = {regexp: set([tag])} + +cw = csv.writer(sys.stdout) +cw.writerow(['site', 'tags', 'regexp']) + +for domain in data: + for regexp in data[domain]: + cw.writerow([domain, '{' + ','.join(data[domain][regexp]) + '}', regexp])