1
0
walkingonions-boosted/bwparser.py
2022-03-17 17:05:34 +01:00

213 lines
7.0 KiB
Python
Executable File

#!/usr/bin/env python3
import os
import re
import sys
import math
import numpy as np
import random
import statistics
from itertools import cycle
class BandwidthParser:
    """Load relay bandwidths from a bandwidth file and report statistics.

    Keyword arguments (all optional):
        bw_file: name of a file inside the datasets directory. Every line
            holding a ``bw=<digits>`` entry other than ``bw=1 `` contributes
            one bandwidth value.
        netscale: fraction of the parsed relays to use as the sample size
            (rounded up).
        sample_size: explicit sample size; overrides ``netscale``.
        datasets_dir: directory holding the bandwidth files. Defaults to the
            ``datasets`` directory located next to this module.
    """

    # ``None`` selects the ``datasets`` directory next to this module. Kept
    # at class level so subclasses that skip ``__init__`` still get a value.
    datasets_dir = None

    # First ``bw=<digits>`` entry of a line; group 1 is the number.
    _BW_PATTERN = re.compile(r'bw=(\d+)\b')

    def __init__(self, **kwargs):
        self.samples = []
        # Defaults so the accessors do not fail with AttributeError when no
        # bandwidth file is supplied.
        self.bandwidths = []
        self.total_size = 0
        self.sample_size = None

        if kwargs.get('datasets_dir') is not None:
            self.datasets_dir = kwargs['datasets_dir']

        if kwargs.get('bw_file') is not None:
            self.file_name = kwargs['bw_file']
            self._parse_bandwidth_file()
            self.total_size = self.get_relay_num()

        if kwargs.get('netscale') is not None:
            self.set_scale(kwargs['netscale'])

        # An explicit sample size wins over the one derived from netscale.
        if kwargs.get('sample_size') is not None:
            self.sample_size = int(kwargs['sample_size'])

    def _dataset_path(self, name=""):
        """Return the datasets directory, or the path of *name* inside it."""
        base = self.datasets_dir
        if base is None:
            # abspath() guards against a relative ``__file__`` whose dirname
            # is empty, which would otherwise yield "/datasets/...".
            base = os.path.join(
                os.path.dirname(os.path.abspath(__file__)), "datasets")
        return os.path.join(base, name) if name else base

    @classmethod
    def _relay_bandwidth(cls, line):
        """Return the bandwidth found on *line*, or None if it has none.

        None is returned for lines without ``bw=``, for ``bw=1 `` lines and
        for lines whose ``bw=`` entry is not followed by digits.
        """
        if 'bw=1 ' in line or 'bw=' not in line:
            return None
        match = cls._BW_PATTERN.search(line)
        return int(match.group(1)) if match else None

    def get_distribution(self):
        """Return the sorted list of bandwidths."""
        return self.bandwidths

    def get_average(self):
        """Return the mean of ``get_distribution()``."""
        return statistics.mean(self.get_distribution())

    def get_fastest_relay_bandwidth(self):
        """Return the largest bandwidth (last element of the sorted list)."""
        return self.bandwidths[-1]

    def get_slowest_relay_bandwidth(self):
        """Return the smallest bandwidth (first element of the sorted list)."""
        return self.bandwidths[0]

    def get_relay_num(self):
        """Return the number of bandwidth values held."""
        return len(self.bandwidths)

    def get_sample_size(self):
        """Return the configured sample size."""
        return self.sample_size

    def get_relay_throughput(self):
        """Return the sum of ``get_distribution()`` multiplied by 3600."""
        return sum(self.get_distribution()) * 3600

    def set_scale(self, networkscale):
        """Set the sample size to ``ceil(networkscale * total_size)``."""
        self.sample_size = math.ceil(networkscale * self.total_size)

    def gen_files(self):
        """Split the bandwidth file at the third quartile.

        Writes ``<file>-upper`` (relays with bandwidth >= Q3) and
        ``<file>-lower`` (relays below Q3) next to the source file. Lines
        that carry no usable bandwidth are copied into both files.
        """
        thresh = self.get_quantiles()[2]
        src = self._dataset_path(self.file_name)

        upper_lines = []
        lower_lines = []
        with open(src) as content:
            for line in content:
                bandwidth = self._relay_bandwidth(line)
                if bandwidth is None:
                    # leave the useless nodes and first lines intact
                    upper_lines.append(line)
                    lower_lines.append(line)
                elif bandwidth >= thresh:
                    upper_lines.append(line)
                else:
                    lower_lines.append(line)

        with open(src + "-upper", "w") as new_file:
            new_file.writelines(upper_lines)
        with open(src + "-lower", "w") as new_file:
            new_file.writelines(lower_lines)

    def get_files(self):
        """List dataset file names, excluding generated and CSV files."""
        excluded = ('-upper', '-lower', '.csv')
        return [each for each in os.listdir(self._dataset_path())
                if not any(marker in each for marker in excluded)]

    def _parse_bandwidth_file(self):
        """Fill ``self.bandwidths`` with the sorted values from the file."""
        self.bandwidths = []
        with open(self._dataset_path(self.file_name)) as content:
            for line in content:
                # skip metadata lines and unvotable relays
                bandwidth = self._relay_bandwidth(line)
                if bandwidth is not None:
                    self.bandwidths.append(bandwidth)
        self.bandwidths.sort()

    def get_quantiles(self):
        """Return the (Q1, Q2, Q3) quartiles using midpoint interpolation."""
        probabilities = (.25, .50, .75)
        try:
            quartiles = np.quantile(
                self.bandwidths, probabilities, method='midpoint')
        except TypeError:
            # NumPy releases before 1.22 only know the keyword under its
            # former name.
            quartiles = np.quantile(
                self.bandwidths, probabilities, interpolation='midpoint')
        return tuple(quartiles)
class KomloBandwidthParser(BandwidthParser):
    """Bandwidth source that draws synthetic values instead of reading a file.

    Requires the ``sample_size`` keyword argument; it is the number of
    values generated. The base-class initialiser is not invoked.
    """

    def __init__(self, **kwargs):
        requested = kwargs.get('sample_size')
        if requested is None:
            raise ValueError("KomloBandwidthParser requires sample_size")
        self.sample_size = int(requested)

        # One draw is stored as both the samples and the bandwidth list.
        drawn = self.get_distribution()
        self.samples = drawn
        self.bandwidths = drawn
        self.total_size = self.get_relay_num()

    def get_distribution(self):
        """Return ``sample_size`` freshly drawn bandwidths, sorted ascending.

        Each value is ``int(200000 - (200000 - 25000) / 3 * log10(x))`` for
        ``x`` drawn uniformly from the integers 1..2500. Every call draws a
        new random sample.
        """
        slope = (200000-25000)/3
        return sorted(
            int(200000-slope*math.log10(random.randint(1,2500)))
            for _ in range(self.sample_size)
        )
class JansenBandwidthParser(BandwidthParser):
    """Down-sample the parsed bandwidths to ``sample_size`` bin medians."""

    def get_distribution(self):
        """Return ``sample_size`` bandwidths, sorted ascending.

        The sorted bandwidth list is cut into ``sample_size`` consecutive
        bins and the (integer-truncated) median of each bin is taken. The
        first ``total_size % sample_size`` bins hold one extra relay so that
        the bins cover the whole list. When there are fewer relays than
        requested samples, the medians are repeated cyclically to fill up.

        Returns an empty list when the sample size is zero/unset or when
        there are no bandwidths.
        """
        # https://metrics.torproject.org/collector/archive/relay-descriptors/bandwidths/
        # Algorithm based on https://www.robgjansen.com/publications/tormodel-cset2012.pdf
        if not self.sample_size or self.sample_size < 0 or not self.bandwidths:
            return []

        # base_size relays per bin; `remainder` bins get one more.
        base_size, remainder = divmod(self.total_size, self.sample_size)

        res = []
        start = 0
        for k in range(self.sample_size):
            end = start + base_size + (1 if k < remainder else 0)
            chunk = self.bandwidths[start:end]
            if not chunk:
                # Ran out of relays (sample_size exceeds the relay count).
                break
            res.append(int(statistics.median(chunk)))
            start = end

        sampled = len(res)
        missing = self.sample_size - sampled
        if missing > 0:
            # Pad by cycling through the medians gathered so far.
            res = res + [res[i % sampled] for i in range(missing)]

        res.sort()
        return res
if __name__ == '__main__':
    # Command line: bwparser.py scale [generate]
    # The bandwidth file name is read from the BW_FILE environment variable.
    bandwidth_file = os.getenv('BW_FILE')

    if len(sys.argv) < 2:
        print("Usage: bwparser.py scale [generate]")
        # sys.exit() rather than the site-provided exit() helper, which is
        # not guaranteed to exist.
        sys.exit()

    try:
        networkscale = float(sys.argv[1])
    except ValueError:
        print("Usage: bwparser.py scale [generate]")
        sys.exit(1)

    if bandwidth_file is None:
        # Without a file the parsers have nothing to scale.
        print("BW_FILE environment variable is not set", file=sys.stderr)
        sys.exit(1)

    # Any second argument requests generation of the -upper/-lower files.
    generate = len(sys.argv) > 2

    bw_parser = BandwidthParser(bw_file=bandwidth_file, netscale=networkscale)
    jansen_parser = JansenBandwidthParser(bw_file=bandwidth_file, netscale=networkscale)
    komlo_parser = KomloBandwidthParser(sample_size=bw_parser.get_sample_size())

    res_komlo = komlo_parser.get_distribution()
    res_jansen = jansen_parser.get_distribution()

    # KomloBandwidthParser.get_distribution() draws a new random sample on
    # every call, so average the array that is printed below instead of
    # asking the parser for another draw.
    avg_komlo = statistics.mean(res_komlo)
    avg_jansen = jansen_parser.get_average()

    print("Relays in bandwidth file:", bw_parser.get_relay_num())
    print("Sample size:", bw_parser.get_sample_size())
    print("Smallest bandwidth in file:", bw_parser.get_slowest_relay_bandwidth())
    print("Largest bandwidth in file:", bw_parser.get_fastest_relay_bandwidth())
    print("Quantiles:", bw_parser.get_quantiles())
    print("True average bandwidth:", bw_parser.get_average())
    print("Komlo average bandwidth:", avg_komlo)
    print("Jansen average bandwidth:", avg_jansen)
    print("Komlo array size:", str(len(res_komlo)))
    print("Jansen array size:", str(len(res_jansen)))
    print("Komlo array:", str(res_komlo))
    print("Jansen arary:", str(res_jansen))
    print("Total throughput for bandwidth sample: ", str(bw_parser.get_relay_throughput()))
    print("Total throughput for Jansen array: ", str(jansen_parser.get_relay_throughput()))

    if generate:
        bw_parser.gen_files()