Samesite - proxy that can cache partial transfers

Diff
anonymous

Diff

Differences From Artifact [65464ee62d]:
To Artifact [16e005263b]:


@@ -42,44 +42,47 @@
 		log_line = re.compile('^[^ ]+ - - \[.*] "(GET|HEAD) (.*?)(\?.*)? HTTP/1.1" (\d+) \d+ "(.*)" "(.*)"$')
 		for line in log_file:
 			this_line = log_line.match(line.strip())
 			if this_line:
 				unchecked_files.add(this_line.group(2))
 
 	for url in unchecked_files:
+		reload = False
+		recheck = False
+		print('Checking file:', url)
 
 		# creating empty placeholder in index
 		if not url in index:
+			print('This one is new.')
 			index[url] = {}
-		reload = False
+			reload = True
 
 		# creating file name from url
 		file_name = options.dir + re.compile('%20').sub(' ', url)
-		print('Checking file:', url)
 
 		# forcibly checking file if no file present
-		if not os.access(file_name, os.R_OK):
+		if not reload and not os.access(file_name, os.R_OK):
 			print('File not found or inaccessible.')
 			reload = True
 
 		# forcibly checking file if file size doesn't match with index data
-		elif 'Content-Length' in index[url] and os.stat(file_name).st_size != int(index[url]['Content-Length']):
-			print('File size is', os.stat(file_name).st_size, 'and stored file size is', index[url]['Content-Length'])
+		elif not reload and 'Content-Length' in index[url] and os.stat(file_name).st_size != int(index[url]['Content-Length']):
+			print('File size is ', os.stat(file_name).st_size, ' and stored file size is ', index[url]['Content-Length'], '.', sep='')
 			reload = True
 
 		# forcibly checking file if index hods Pragma header
-		if 'Pragma' in index[url] and index[url]['Pragma'] == 'no-cache':
-			print('Pragma on: recheck iminent.')
-			reload = True
+		if not reload and 'Pragma' in index[url] and index[url]['Pragma'] == 'no-cache':
+			print('Pragma on: recheck imminent.')
+			recheck = True
 
 		# skipping file processing if there's no need to recheck it and we have checked it at least 4 hours ago
-		if not reload and '__time__' in index[url] and (datetime.datetime.now() - datetime.timedelta(hours = 4) - index[url]['__time__']).days < 0:
+		if not recheck and not reload and '__time__' in index[url] and (datetime.datetime.now() - datetime.timedelta(hours = 4) - index[url]['__time__']).days < 0:
 			continue
 
+
 		try:
-			print('Contacting website.')
 			with urllib.request.urlopen(options.root + url) as source:
 				new_headers = {}
 				headers = source.info()
 
 				# stripping unneeded headers (XXX make this inplace?)
 				for header in headers:
					if header in desc_fields:
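
Both revisions share the log-scanning preamble at the top of this hunk, and it is easy to misread: group(2) of log_line is the bare request path, with any query string split off into group(3). A quick demonstration on an invented access-log line (the sample and the raw-string spelling are mine, not part of the artifact):

import re

log_line = re.compile(r'^[^ ]+ - - \[.*] "(GET|HEAD) (.*?)(\?.*)? HTTP/1.1" (\d+) \d+ "(.*)" "(.*)"$')
sample = '203.0.113.7 - - [10/Oct/2010:13:55:36 -0700] "GET /iso/demo%20disk.iso?mirror=1 HTTP/1.1" 200 2326 "-" "curl/7.19.7"'
request = log_line.match(sample)
print(request.group(2))  # /iso/demo%20disk.iso -- '?mirror=1' lands in group(3)

The '%20' left in the path is also why the file_name line above substitutes it back to a literal space before looking in the cache directory.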
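
The substantive change in the hunk is the split of the old catch-all reload flag into two: reload marks entries whose cached body is already known to be bad (new URL, missing file, size mismatch), while recheck only asks for revalidation when the stored headers carry Pragma: no-cache. A condensed sketch of the resulting decision order, with entry, file_exists and file_size as hypothetical stand-ins for index[url] and the os.access/os.stat results:

import datetime

def needs_contact(entry, file_exists, file_size):
	reload = not entry  # a new URL gets an empty placeholder and a forced reload
	if not reload and not file_exists:
		reload = True  # cached file is missing or unreadable
	elif not reload and 'Content-Length' in entry and file_size != int(entry['Content-Length']):
		reload = True  # cached file size disagrees with the index
	recheck = not reload and entry.get('Pragma') == 'no-cache'
	# entries checked within the last 4 hours are otherwise left alone
	fresh = '__time__' in entry and (datetime.datetime.now() - datetime.timedelta(hours = 4) - entry['__time__']).days < 0
	return reload or recheck or not fresh

Guarding the later tests with not reload also spares an os.stat() on files already queued for reload, which appears to be the other point of the change.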
@@ -95,17 +98,15 @@
 				old_keys = set(index[url].keys())
 				old_keys.discard('__time__')
 				old_keys.discard('Pragma')
 				more_keys = set(new_headers.keys()) - old_keys
 				more_keys.discard('Pragma')
 				less_keys = old_keys - set(new_headers.keys())
 				if len(more_keys) > 0:
-					if len(old_keys) == 0:
-						print('No data on that file yet.')
-					else:
+					if not len(old_keys) == 0:
 						print('More headers appear:', more_keys)
 					reload = True
 				elif len(less_keys) > 0:
 					print('Less headers appear:', less_keys)
 					reload = True
 				else:
 					for key in index[url].keys():
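
As for this second hunk: a brand-new URL now reaches the comparison with reload already set, so the 'No data on that file yet.' branch became dead weight and the test is inverted to report only genuinely added headers. The set arithmetic it relies on, sketched here with invented header dicts:

index_entry = {'Content-Length': '2326', 'Last-Modified': 'Mon, 05 Apr 2010 16:15:00 GMT', 'Pragma': 'no-cache', '__time__': None}
new_headers = {'Content-Length': '2326', 'Last-Modified': 'Mon, 05 Apr 2010 16:15:00 GMT', 'ETag': '"abc123"'}

old_keys = set(index_entry.keys())
old_keys.discard('__time__')  # bookkeeping field, not an HTTP header
old_keys.discard('Pragma')    # ignored on both sides of the comparison
more_keys = set(new_headers.keys()) - old_keys
more_keys.discard('Pragma')
less_keys = old_keys - set(new_headers.keys())
print(more_keys, less_keys)   # {'ETag'} set() -> 'More headers appear' and a reload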