1a367d050d 2010-08-13 1: #!/usr/bin/env python3.1
1a367d050d 2010-08-13 2:
1a367d050d 2010-08-13 3: # This script converts a SquidGuard database into a format that can be imported
1a367d050d 2010-08-13 4: # by squid-tagger. Run it from the SquidGuard database directory; it writes a
1a367d050d 2010-08-13 5: # CSV stream to stdout that can be piped to squid-tagger for import:
1a367d050d 2010-08-13 6:
1a367d050d 2010-08-13 7: # cd /var/db/squidGuard ; path/to/sg_import.py | path/to/squid-tagger.py -l -f
1a367d050d 2010-08-13 8:
1a367d050d 2010-08-13 9: # The command above flushes squid-tagger's database and then loads the
1a367d050d 2010-08-13 10: # selected SquidGuard database into it.
1a367d050d 2010-08-13 11:
1a367d050d 2010-08-13 12: import codecs, csv, os, re, sys
1a367d050d 2010-08-13 13:
# Names of the SquidGuard list files that get imported; every other file found
# while walking the database directory is ignored.
LIST_FILES = ('domains', 'expressions', 'urls')


def parse_entry(kind, line):
    """Split one non-empty, non-comment list line into ``(site, regexp)``.

    kind -- name of the list file the line came from (one of LIST_FILES).
    line -- the stripped line text.

    * 'domains':     the line is the site, there is no regexp (None).
    * 'expressions': the line is the regexp, there is no site (None).
    * 'urls':        text before the first '/' is the site; the remainder is
                     escaped and anchored with '^' to form the regexp. A line
                     without '/' therefore yields the bare regexp '^'.
    """
    if kind == 'expressions':
        return (None, line)
    if kind == 'urls':
        site, _sep, url_path = line.partition('/')
        return (site, '^' + re.escape(url_path))
    return (line, None)


def collect(root='.'):
    """Walk *root* and gather every entry from the SquidGuard list files.

    Returns a nested mapping ``{site: {regexp: set_of_tags}}`` where the tag
    is the path of the directory holding the list file, relative to *root*
    (the empty string for files located directly in *root*).
    """
    data = {}
    for path, _names, files in os.walk(root):
        # relpath() removes exactly the root prefix. The former
        # path.lstrip('./') stripped a *character set* and so also ate the
        # leading dots of directory names such as '.hidden'.
        rel = os.path.relpath(path, root)
        tag = '' if rel == os.curdir else rel
        for name in files:
            if name not in LIST_FILES:
                continue
            # Latin-1 maps every byte to a character, so decoding never fails.
            list_path = os.path.join(path, name)
            with open(list_path, 'r', encoding='latin-1') as source:
                for full_line in source:
                    line = full_line.strip()
                    # Skip blank lines and '#' comments.
                    if not line or line.startswith('#'):
                        continue
                    site, regexp = parse_entry(name, line)
                    data.setdefault(site, {}).setdefault(regexp, set()).add(tag)
    return data


def write_csv(data, out):
    """Write *data* (as returned by collect()) to the file object *out* as CSV.

    The first row is the header ``site,tags,regexp``. Tags are emitted as a
    brace-wrapped, comma-separated list and are sorted so that the output does
    not depend on set iteration order. None values are written by the csv
    module as empty fields.
    """
    cw = csv.writer(out)
    cw.writerow(['site', 'tags', 'regexp'])
    for domain in data:
        for regexp in data[domain]:
            tags = '{' + ','.join(sorted(data[domain][regexp])) + '}'
            cw.writerow([domain, tags, regexp])


def main():
    """Convert the database in the current directory and print it to stdout."""
    write_csv(collect('.'), sys.stdout)


if __name__ == '__main__':
    main()