Lines of
nntpdup.py
from check-in 3205f8a9ae
that are changed by the sequence of edits moving toward
check-in 973a1d241e:
1: #!/usr/bin/env python3.4
2:
3: import configparser, email.utils, getpass, imaplib, nntplib, re, sqlite3, sys
# Raise the per-line limits of both protocol modules to 1 MiB so that very
# long response lines are not rejected by the stock limits.
imaplib._MAXLINE = 1024 * 1024
nntplib._MAXLINE = 1024 * 1024

# All settings come from nntpdup.conf in the current directory.  Options
# without a value are accepted (allow_no_value), which lets the [groups]
# section be a plain list of names.
config = configparser.ConfigParser(allow_no_value = True)
config.read('nntpdup.conf')
connection = config['connection']

# Both connections are opened over SSL at import time.
server = nntplib.NNTP_SSL(connection['newsserver'])
mserver = imaplib.IMAP4_SSL(connection['mailserver'])

# Captures the <message-id> that closes an IMAP ENVELOPE response.  Written
# as a raw string: the pattern is unchanged, but the backslash escapes are no
# longer invalid string escapes, which newer Python versions warn about.
reMessageId = re.compile(r'(<[-\][a-zA-Z0-9@.%/=_\$+!&~#\?}]+>)"?\)\)(\d+ \(FLAGS\(\)\))?$')

mserver.login(connection['mail_user'], connection['mail_password'])

# Optional per-run limits; the defaults apply when the option is absent.
mailLimit = connection.getint('mail_limit', fallback = 100)
headerLimit = connection.getint('header_limit', fallback = 1000)
22:
# Schema of the tracking database: table name -> statements that create it.
# Folder.__init__ runs the statements of every table missing from the file.
tables = {
    'list': ["create table list (id integer primary key, name text, last integer default 0);"],
    'ids': ["create table ids (id integer, name text, mask integer, date integer);", "create unique index ids__id_name on ids(id, name);"],
}


class Folder:
    """SQLite-backed index of the message ids known for one folder.

    Each row of ``ids`` carries a bit mask: 1 means the message is present
    in the mail folder, 2 means it was seen on the news server, 3 means
    both.  ``count`` mirrors the number of rows that have the mail bit set
    and ``last`` is the position stored for the folder in ``list``.
    """

    def __init__(this, filename):
        """Open the database at filename and create any missing tables."""
        this.db = sqlite3.connect(filename)
        this.id = None
        # Give every attribute a value so the object is usable before select().
        this.name = None
        this.last = 0
        this.count = 0
        this.mask = {}
        found = {row[0] for row in this.db.execute("select name from sqlite_master where type = 'table';")}
        for absent in set(tables).difference(found):
            for query in tables[absent]:
                this.db.execute(query)

    def select(this, folderName):
        """Make folderName the current folder, registering it when unknown."""
        this.name = folderName
        this.id = None
        query = "select id, last from list where name = ?;"
        row = this.db.execute(query, [folderName]).fetchone()
        if row is None:
            this.db.execute("insert into list(name) values (?);", [folderName])
            row = this.db.execute(query, [folderName]).fetchone()
        if row is None:
            print('Id not found.')
            sys.exit(1)
        this.id, this.last = row
        this.mask = {}
        this.get_count()

    def get_count(this):
        """Reload count: the number of records with the mail bit set."""
        row = this.db.execute("select count(*) from ids where id = ? and mask in (3, 1);", [this.id]).fetchone()
        this.count = row[0] if row is not None else 0

    def get_record_count(this, mask):
        """Return how many records of the current folder have exactly mask."""
        row = this.db.execute("select count(*) from ids where id = ? and mask = ?;", [this.id, mask]).fetchone()
        return row[0] if row is not None else None

    def check(this, name):
        """Return the mask recorded for message id name, or None if unknown.

        Masks that were found are cached in this.mask.
        """
        if name in this.mask:
            return this.mask[name]
        row = this.db.execute("select mask from ids where id = ? and name = ?;", [this.id, name]).fetchone()
        if row is None:
            return None
        this.mask[name] = row[0]
        return row[0]

    def addlast(this, count):
        """Advance the stored position of the current folder by count."""
        this.last += count
        this.db.execute("update list set last = ? where id = ?;", [this.last, this.id])

    def droplast(this):
        """Reset the stored position of the current folder to 0."""
        this.last = 0
        this.db.execute("update list set last = ? where id = ?;", [this.last, this.id])

    def addmail(this, mid):
        """Record that message mid is present in the mail folder.

        Calling it again for a message that already has the mail bit is a
        no-op, so count always matches what get_count() would report.
        """
        mask = this.check(mid)
        if mask is None:
            this.db.execute("insert into ids(id, name, mask) values(?, ?, ?);", [this.id, mid, 1])
            this.mask[mid] = 1
        elif mask in (1, 3):
            # Already counted as mail; a second insert would violate the
            # unique index on (id, name).
            return
        else:
            this.db.execute("update ids set mask = 3 where id = ? and name = ?;", [this.id, mid])
            this.mask[mid] = 3
        # Reached only when the mail bit was newly set.
        this.count += 1

    def addnews(this, mid, date = None):
        """Record that message mid was seen on the news server.

        date is stored with the record.  Re-adding a known message updates
        the record instead of inserting a duplicate row.  count is left
        alone because it only tracks records with the mail bit.
        """
        mask = this.check(mid)
        if mask is None:
            this.db.execute("insert into ids(id, name, mask, date) values(?, ?, ?, ?);", [this.id, mid, 2, date])
            this.mask[mid] = 2
        else:
            merged = mask | 2
            this.db.execute("update ids set mask = ?, date = ? where id = ? and name = ?;", [merged, date, this.id, mid])
            this.mask[mid] = merged

    def zeromail(this):
        """Clear the mail bit everywhere: 3 becomes 2, mail-only rows go."""
        this.mask = {}
        this.db.execute("update ids set mask = 2 where id = ? and mask = 3;", [this.id])
        this.db.execute("delete from ids where id = ? and mask = 1;", [this.id])
        this.sync()
        this.get_count()

    def zeronews(this):
        """Clear the news bit everywhere and reset the stored position."""
        this.mask = {}
        this.db.execute("update ids set mask = 1 where id = ? and mask = 3;", [this.id])
        this.db.execute("delete from ids where id = ? and mask = 2;", [this.id])
        this.droplast()
        this.sync()

    def sync(this):
        """Commit pending changes to the database file."""
        this.db.commit()

    def get_unfetched(this):
        """Return a cursor over (name, date) of news-only records, newest first."""
        return this.db.execute("select name, date from ids where id = ? and mask = 2 order by date desc;", [this.id])

    def forget(this, mid):
        """Delete the record of message mid, keeping cache and count in step."""
        mask = this.check(mid)
        this.db.execute("delete from ids where id = ? and name = ?;", [this.id, mid])
        # Drop the cached mask so a later check() does not report a stale value.
        this.mask.pop(mid, None)
        if mask in (1, 3):
            this.count -= 1
123:
def check_folder(mserver, folder, folderName):
    """Rebuild the mail side of the index for folderName, dropping duplicates.

    Clears the mail bit of every record of the selected folder, then walks
    all messages of the IMAP folder that are not flagged as deleted.  The
    first message seen with a given message id is recorded; any further
    message with the same id is flagged \\Deleted.  Finally the index is
    committed and the folder expunged.

    mserver    -- logged-in IMAP connection
    folder     -- Folder index with folderName already selected
    folderName -- name of the IMAP folder to scan

    Exits the process with status 1 when the search fails or when no
    message id can be found in an ENVELOPE response.
    """
    folder.zeromail()
    deleted = 0
    mserver.select(folderName)
    typ, found = mserver.search(None, 'NOT DELETED')
    if typ != 'OK':
        # Without a valid result the error text would be treated as
        # message numbers below.
        print("Can't search folder.")
        sys.exit(1)
    count = 0
    print(' - building imap index', folderName, '[', end='')
    for num in found[0].split():
        typ, parts = mserver.fetch(num, '(ENVELOPE)')
        # A response item is either bytes or a tuple of bytes; flatten all
        # of them into a single string for the regular expression.
        pieces = []
        for rec in parts:
            if isinstance(rec, tuple):
                pieces.append(''.join(i.decode('utf-8', 'ignore') for i in rec))
            else:
                pieces.append(rec.decode('utf-8', 'ignore'))
        envelope = ''.join(pieces)
        isMid = reMessageId.search(envelope)
        if not isMid:
            print('Message id not found.')
            print(repr(envelope))
            sys.exit(1)
        mid = isMid.group(1)
        if folder.check(mid) not in (1, 3):
            folder.addmail(mid)
            count += 1
            # One dot per 1000 recorded messages.  Kept inside this branch
            # so that duplicates following a multiple of 1000 do not each
            # print another dot.
            if (count % 1000) == 0:
                sys.stdout.write('.')
                sys.stdout.flush()
        else:
            # Same message id seen before in this folder: mark the copy.
            mserver.store(num, '+FLAGS', '\\Deleted')
            deleted += 1
            sys.stdout.write('x')
            sys.stdout.flush()
    print('], deleted:', deleted, end = '')
    folder.sync()
    mserver.expunge()
164:
def _header_dates(lines):
    """Pull candidate dates out of the header block of a raw article.

    lines is the list of bytes lines of an article.  Returns None when the
    article has no empty line closing the header block.  Otherwise returns
    (date, backup_date, headers): date comes from the Date header,
    backup_date from the part after the last ';' of an Original-Received
    header when its year is 1970 or later, and headers is the list of
    decoded header lines.  Both dates are tuples as produced by
    email.utils.parsedate, or None.
    """
    if b'' not in lines:
        return None
    headers = [line.decode('ascii', 'ignore') for line in lines[:lines.index(b'')]]
    date = None
    backup_date = None
    # Join with single newlines so the parser sees one contiguous header
    # block; a blank line in between would end the headers early.
    for name, value in email.message_from_string('\n'.join(headers) + '\n').items():
        lowered = name.lower()
        if lowered == 'date':
            date = email.utils.parsedate(value)
        elif lowered == 'original-received':
            tmp_date = email.utils.parsedate(value.split(';')[-1])
            if tmp_date is not None and tmp_date[0] >= 1970:
                backup_date = tmp_date
    return date, backup_date, headers


folder = Folder('nntpdup.sqlite')

groupNames = list(config['groups'].keys())
if not groupNames:
    # The per-group limit steps below would divide by zero.
    print('No groups configured.')
    sys.exit(1)

# limits[0] bounds the number of overview records requested, limits[1] the
# number of articles copied to mail.  Each group adds its share before it is
# processed and whatever it does not use carries over to the next group.
limits = [0, 0]
limitSteps = [headerLimit / len(groupNames), mailLimit / len(groupNames)]

# Number of tab stops needed to line up the statistics column.
maxlength = max(len(name) for name in groupNames)
skew = 1 + maxlength // 8

# Iterating a set keeps the original arbitrary processing order.
for folderName in set(groupNames):
    stats = [0, 0]
    folder.select(folderName)

    resp = mserver.select(folderName)
    if resp[0] != 'OK':
        print("Can't open folder.")
        sys.exit(1)
    # Re-index the IMAP folder when its message count differs from the index.
    if int(resp[1][0]) != folder.count:
        check_folder(mserver, folder, folderName)

    _, count, _, last, _ = server.group(folderName)
    limits[0] += limitSteps[0]
    if last > folder.last:
        count = 0
        # we need to fetch new ids
        # Truncated to an integer so that the position written back to the
        # database is never fractional.
        request = int(min(last, folder.last + limits[0]))
        try:
            for record in server.over((int(folder.last) + 1, request))[1]:
                mid = record[1]['message-id']
                if len(mid) > 0:
                    try:
                        stamp = email.utils.parsedate_to_datetime(record[1]['date']).timestamp()
                    except (OverflowError, TypeError, ValueError):
                        # Unparseable or out-of-range date (newer Python
                        # versions raise ValueError): store no date.
                        stamp = None
                    folder.addnews(mid, stamp)
                    count += 1
        except nntplib.NNTPTemporaryError as err:
            # 423 means the requested range holds no articles.
            if not err.response.startswith('423 '):
                raise
        except nntplib.NNTPPermanentError:
            print(folder.last, request)
            raise
        except sqlite3.IntegrityError:
            print(repr(record))
            print([x for x in map(repr, folder.db.execute("select * from ids where id = ? and name = ?;", [folder.id, record[1]['message-id']]))])
            raise
        stats[0] = count
        limits[0] -= count
        folder.addlast(request - folder.last)
        folder.sync()
    elif folder.get_record_count(1) > 0:
        # Nothing new on the server while mail-only records remain: reset
        # the stored position to 0.
        folder.droplast()

    limits[1] += limitSteps[1]
    if folder.get_record_count(2) > 0:
        count = 0
        # there are extra articles
        unfetched = []
        for item, env_date in folder.get_unfetched():
            if folder.check(item) == 2:
                unfetched.append((item, env_date))
        for item, env_date in unfetched:
            try:
                _, info = server.article(item)
                message = b'\n'.join(info.lines)
                if env_date is None or env_date < 0:
                    # No usable date in the index: take one from the headers.
                    date = None
                    backup_date = None
                    parsed = _header_dates(info.lines)
                    if parsed is not None:
                        date, backup_date, headers = parsed
                        if date is None and backup_date is None:
                            print('Date missed.')
                            print(repr(headers))
                            sys.exit(1)
                        if date is None:
                            date = backup_date
                    try:
                        mserver.append(folderName, None, date, message)
                    except (AttributeError, OverflowError):
                        # The Date header could not be converted; retry with
                        # the date taken from Original-Received.
                        mserver.append(folderName, None, backup_date, message)
                else:
                    mserver.append(folderName, None, env_date, message)
                folder.addmail(item)
                folder.sync()
                count += 1
                if count >= limits[1]:
                    break
            except nntplib.NNTPTemporaryError as err:
                # Match the 430 response code only; the text after it
                # differs between servers.
                if err.response.startswith('430'):
                    folder.forget(item)
                else:
                    print(err.response, item, env_date)
                    raise
        stats[1] = count
        limits[1] -= count

    if stats[0] != 0 or stats[1] != 0:
        print('# ', folderName, '\t'*(skew - (len(folderName) + 2) // 8), '\t'.join(map(str, stats)), sep = '')
folder.sync()