#!/usr/bin/env python3
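"""Parse Tor relay bandwidth files and build scaled-down bandwidth distributions.

Summary of what the code below does: BandwidthParser reads "bw=<N>" entries
from a bandwidth file in the datasets/ directory next to this script;
KomloBandwidthParser and JansenBandwidthParser produce sampled distributions
for a network scaled by a given factor. Run as a script (see the __main__
block), it prints summary statistics comparing the parsed and sampled
distributions.
"""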

import os
import re
import sys
import math
import random
import statistics
from itertools import cycle

import numpy as np


class BandwidthParser:
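    """Sorted list of relay bandwidths parsed from a bandwidth file.

    Keyword arguments bw_file (file name under datasets/), netscale
    (fraction of the network to sample) and sample_size are all optional;
    netscale sets sample_size via set_scale().
    """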

    def __init__(self, **kwargs):
        self.samples = []

        if 'bw_file' in kwargs and kwargs['bw_file'] is not None:
            self.file_name = kwargs['bw_file']
            self._parse_bandwidth_file()

        self.total_size = self.get_relay_num()

        if 'netscale' in kwargs and kwargs['netscale'] is not None:
            self.set_scale(kwargs['netscale'])

        if 'sample_size' in kwargs and kwargs['sample_size'] is not None:
            self.sample_size = int(kwargs['sample_size'])

    def get_distribution(self):
        return self.bandwidths

    def get_average(self):
        return statistics.mean(self.get_distribution())

    def get_fastest_relay_bandwidth(self):
        return self.bandwidths[-1]

    def get_slowest_relay_bandwidth(self):
        return self.bandwidths[0]

    def get_relay_num(self):
        return len(self.bandwidths)

    def get_sample_size(self):
        return self.sample_size

    def get_relay_throughput(self):
        # sum of all bandwidths scaled by 3600 (seconds per hour)
        return sum(self.get_distribution()) * 3600

    def set_scale(self, networkscale):
        # scale the number of relays in the file down to the modelled network size
        self.sample_size = math.ceil(networkscale * self.total_size)

    def gen_files(self):
        # split the source file into two new files around the third quartile
        thresh = self.get_quantiles()[2]

        buf = np.array(self.bandwidths)

        upper_quantile = buf[buf >= thresh]
        lower_quantiles = buf[buf < thresh]

        src = os.path.dirname(__file__) + "/datasets/" + self.file_name
        dest_upper = os.path.dirname(__file__) + "/datasets/" + self.file_name + "-upper"
        dest_lower = os.path.dirname(__file__) + "/datasets/" + self.file_name + "-lower"

        with open(src) as content:
            new_content_upper = ""
            new_content_lower = ""
            data = content.readlines()

            for line in data:
                if 'bw=1 ' in line or 'bw=' not in line:
                    # leave the useless nodes and first lines intact
                    new_content_upper = new_content_upper + line
                    new_content_lower = new_content_lower + line
                    continue
                else:
                    res = re.findall(r'bw=\d+\b', line)
                    if res:
                        if int(res[0][3:]) in upper_quantile:
                            new_content_upper = new_content_upper + line
                        if int(res[0][3:]) in lower_quantiles:
                            new_content_lower = new_content_lower + line

        with open(dest_upper, "w") as new_file:
            new_file.write(new_content_upper)

        with open(dest_lower, "w") as new_file:
            new_file.write(new_content_lower)

    def get_files(self):
        return [each for each in os.listdir(os.path.dirname(__file__) + "/datasets/")
                if '-upper' not in each and '-lower' not in each and '.csv' not in each]

    def _parse_bandwidth_file(self):
        self.bandwidths = []
        with open(os.path.dirname(__file__) + "/datasets/" + self.file_name) as content:
            data = content.readlines()
            for line in data:
                # skip metadata lines and unvotable relays
                if 'bw=1 ' in line or 'bw=' not in line:
                    continue
                else:
                    res = re.findall(r'bw=\d+\b', line)
                    if res:
                        self.bandwidths.append(int(res[0][3:]))

        self.bandwidths.sort()

    def get_quantiles(self):
        # quartiles of the parsed bandwidth list, midpoint-interpolated
        q1 = np.quantile(self.bandwidths, .25, method='midpoint')
        q2 = np.quantile(self.bandwidths, .50, method='midpoint')
        q3 = np.quantile(self.bandwidths, .75, method='midpoint')

        return (q1, q2, q3)


class KomloBandwidthParser(BandwidthParser):
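    """Synthetic bandwidth distribution of a given sample_size.

    Each value is drawn by mapping a uniform random integer onto a
    decreasing log10 curve (see get_distribution); no bandwidth file is
    read.
    """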

    def __init__(self, **kwargs):
        if 'sample_size' in kwargs and kwargs['sample_size'] is not None:
            self.sample_size = int(kwargs['sample_size'])
        else:
            raise ValueError("KomloBandwidthParser requires sample_size")

        self.samples = self.bandwidths = self.get_distribution()
        self.total_size = self.get_relay_num()

    def get_distribution(self):
        res = []
        for k in range(0, self.sample_size):
            # map a uniform draw onto a decreasing log10 curve
            # (x=1 gives 200000, x=1000 gives 25000)
            x = random.randint(1, 2500)
            res.append(int(200000 - (200000 - 25000) / 3 * math.log10(x)))

        res.sort()
        return res


class JansenBandwidthParser(BandwidthParser):
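    """Bandwidth distribution sampled from a parsed file by bucketed medians.

    The sorted bandwidth list is split into sample_size consecutive slices
    and the median of each slice is kept, following the model referenced in
    get_distribution.
    """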

    def get_distribution(self):
        # https://metrics.torproject.org/collector/archive/relay-descriptors/bandwidths/
        # Algorithm based on https://www.robgjansen.com/publications/tormodel-cset2012.pdf
        res = []
        # slice the sorted bandwidth list into sample_size consecutive
        # buckets and keep the median of each bucket
        n = math.floor(self.total_size / self.sample_size)
        r = self.sample_size - n
        i = 0
        for k in range(0, self.sample_size):
            j = i + n
            if k < r:
                j = j + 1
            b = slice(i, j)
            if len(self.bandwidths[b]) != 0:
                res.append(int(statistics.median(self.bandwidths[b])))
            else:
                break
            i = j

        curr_len = len(res)

        # if the slicing ran out of relays early, pad the result by cycling
        # through the medians already collected
        if curr_len != self.sample_size:
            iterat = cycle(res)
            add = [next(iterat) for _ in range(abs(self.sample_size - curr_len))]
            res = res + add

        res.sort()
        return res


if __name__ == '__main__':
    # some default params
    bandwidth_file = os.getenv('BW_FILE')

    if len(sys.argv) < 2:
        print("Usage: bwparser.py scale [generate]")
        sys.exit()

    if bandwidth_file is None:
        print("The BW_FILE environment variable must name a bandwidth file in datasets/")
        sys.exit(1)

    networkscale = float(sys.argv[1])

    generate = False

    if len(sys.argv) > 2:
        generate = True

    bw_parser = BandwidthParser(bw_file=bandwidth_file, netscale=networkscale)

    jansen_parser = JansenBandwidthParser(bw_file=bandwidth_file, netscale=networkscale)

    komlo_parser = KomloBandwidthParser(sample_size=bw_parser.get_sample_size())

    res_komlo = komlo_parser.get_distribution()
    res_jansen = jansen_parser.get_distribution()

    avg_komlo = komlo_parser.get_average()
    avg_jansen = jansen_parser.get_average()

    print("Relays in bandwidth file:", bw_parser.get_relay_num())
    print("Sample size:", bw_parser.get_sample_size())

    print("Smallest bandwidth in file:", bw_parser.get_slowest_relay_bandwidth())
    print("Largest bandwidth in file:", bw_parser.get_fastest_relay_bandwidth())
    print("Quantiles:", bw_parser.get_quantiles())

    print("True average bandwidth:", bw_parser.get_average())
    print("Komlo average bandwidth:", avg_komlo)
    print("Jansen average bandwidth:", avg_jansen)

    print("Komlo array size:", str(len(res_komlo)))
    print("Jansen array size:", str(len(res_jansen)))
print("Komlo array:", str(res_komlo))
|
||
|
print("Jansen arary:", str(res_jansen))
|
||
|
|
||
|
print("Total throughput for bandwidth sample: ", str(bw_parser.get_relay_throughput()))
|
||
|
print("Total throughput for Jansen array: ", str(jansen_parser.get_relay_throughput()))
|
||
|
|
||
|
if generate:
|
||
|
bw_parser.gen_files()
|