Squid url redirector

Annotation For sg_import.py
anonymous

Annotation For sg_import.py

Origin for each line in sg_import.py from check-in 67e8b3309d:

67e8b3309d 2012-07-09    1: #!/usr/bin/env python3.2
1a367d050d 2010-08-13    2: 
1a367d050d 2010-08-13    3: # This script converts a SquidGuard database into a format that can be imported
1a367d050d 2010-08-13    4: # into squid-tagger. Run it from the SquidGuard database directory; it produces
1a367d050d 2010-08-13    5: # a CSV stream that can be piped to squid-tagger for import:
1a367d050d 2010-08-13    6: 
1a367d050d 2010-08-13    7: # cd /var/db/squidGuard ; path/to/sg_import.py | path/to/squid-tagger.py -l -f
1a367d050d 2010-08-13    8: 
1a367d050d 2010-08-13    9: # This command flushes squid-tagger's database and loads the selected
1a367d050d 2010-08-13   10: # SquidGuard database.
1a367d050d 2010-08-13   11: 
1a367d050d 2010-08-13   12: import codecs, csv, os, re, sys
1a367d050d 2010-08-13   13: 
# Names of the list files that are imported; any other file found while
# walking the database directory is ignored.
LIST_FILES = ('domains', 'expressions', 'urls')


def parse_entry(kind, line):
	"""Split one significant list-file line into a (site, regexp) pair.

	kind -- name of the file the line came from (one of LIST_FILES).
	line -- the stripped, non-empty, non-comment line.

	'expressions' lines are kept verbatim as the regexp with no site.
	'urls' lines are split at the first '/': the part before it is the site
	and the rest becomes an escaped regexp anchored with '^'.
	Any other kind (i.e. 'domains') yields the whole line as the site with no
	regexp.
	"""
	if kind == 'expressions':
		return (None, line)
	if kind == 'urls':
		site, _sep, rest = line.partition('/')
		# A line without '/' leaves rest empty, giving the bare anchor '^'.
		return (site, '^' + re.escape(rest))
	return (line, None)


def load_database(root='.'):
	"""Walk root and collect every entry of the recognised list files.

	Returns a dict of the form {site: {regexp: set_of_tags}}. The tag of an
	entry is the path of the directory holding the file, relative to root
	(the empty string for files located directly in root).
	"""
	data = {}
	for path, _dirs, files in os.walk(root):
		# Take the relative path instead of lstrip('./'): lstrip removes a
		# *set of characters*, so a directory like './.extra' would lose its
		# leading dot and be tagged 'extra'.
		rel = os.path.relpath(path, root)
		tag = '' if rel == os.curdir else rel
		for name in files:
			if name not in LIST_FILES:
				continue
			# Built-in open() decodes Latin-1 and only breaks lines on real
			# newlines; codecs.open() would also split on characters such
			# as 0x85.
			with open(os.path.join(path, name), 'r', encoding='latin-1') as source:
				for raw_line in source:
					line = raw_line.strip()
					# Skip blank lines and '#' comments.
					if not line or line.startswith('#'):
						continue
					site, regexp = parse_entry(name, line)
					data.setdefault(site, {}).setdefault(regexp, set()).add(tag)
	return data


def write_csv(data, stream):
	"""Write data (as returned by load_database) to stream as CSV.

	The first row is the header 'site,tag,regexp'. Each following row holds
	the site, the tags formatted as '{tag1,tag2,...}' and the regexp. Tags
	are sorted so that the output is the same on every run.
	"""
	cw = csv.writer(stream)
	cw.writerow(['site', 'tag', 'regexp'])
	for site, by_regexp in data.items():
		for regexp, tags in by_regexp.items():
			cw.writerow([site, '{' + ','.join(sorted(tags)) + '}', regexp])


def main():
	"""Convert the database in the current directory to CSV on stdout."""
	write_csv(load_database('.'), sys.stdout)


if __name__ == '__main__':
	main()