#!/usr/bin/env python3
"""Parsers for Tor bandwidth files.

Reads a consensus bandwidth file from the local ``datasets/`` directory,
exposes its relay-weight distribution, and provides two down-sampling
strategies (Komlo-style synthetic sampling and Jansen-style bucketed
medians) for scaled-down network simulations.
"""

import math
import os
import random
import re
import statistics
import sys
from itertools import cycle

import numpy as np

# Matches the consensus-weight entry of a bandwidth-file line, e.g. "bw=12345".
# Compiled once at module level instead of on every line.
_BW_PATTERN = re.compile(r'bw=\d+\b')


def _dataset_path(name):
    """Return the absolute path of *name* inside the datasets/ directory.

    os.path.dirname(__file__) is '' when the script is invoked from its own
    directory, which would turn the old string concatenation into the
    absolute path "/datasets/..."; abspath() guards against that.
    """
    return os.path.join(os.path.dirname(os.path.abspath(__file__)),
                        "datasets", name)


class BandwidthParser:
    """Parse a Tor bandwidth file and expose its weight distribution.

    Keyword args:
        bw_file: name of the bandwidth file inside datasets/ (parsed eagerly).
        netscale: fraction of the real network to sample; sets sample_size.
        sample_size: explicit sample size (overrides nothing if absent).
    """

    def __init__(self, **kwargs):
        self.samples = []
        if kwargs.get('bw_file') is not None:
            self.file_name = kwargs['bw_file']
            self._parse_bandwidth_file()
            self.total_size = self.get_relay_num()
        if kwargs.get('netscale') is not None:
            self.set_scale(kwargs['netscale'])
        if kwargs.get('sample_size') is not None:
            self.sample_size = int(kwargs['sample_size'])

    def get_distribution(self):
        """Return the sorted list of relay bandwidth values."""
        return self.bandwidths

    def get_average(self):
        """Return the mean bandwidth of the distribution."""
        return statistics.mean(self.get_distribution())

    def get_fastest_relay_bandwidth(self):
        """Return the largest bandwidth (distribution is kept sorted)."""
        return self.bandwidths[-1]

    def get_slowest_relay_bandwidth(self):
        """Return the smallest bandwidth (distribution is kept sorted)."""
        return self.bandwidths[0]

    def get_relay_num(self):
        """Return the number of relays in the distribution."""
        return len(self.bandwidths)

    def get_sample_size(self):
        """Return the configured sample size."""
        return self.sample_size

    def get_relay_throughput(self):
        """Return the total hourly throughput of the distribution.

        Bandwidths are per-second values; * 3600 converts to per-hour.
        """
        return sum(self.get_distribution()) * 3600

    def set_scale(self, networkscale):
        """Derive sample_size as the given fraction of the full relay set."""
        self.sample_size = math.ceil(networkscale * self.total_size)

    def gen_files(self):
        """Split the source file into "<name>-upper" and "<name>-lower".

        Relay lines at or above the third quantile go to the upper file,
        the rest to the lower file; metadata lines and unvotable relays
        (bw=1) are copied into both outputs unchanged.
        """
        thresh = self.get_quantiles()[2]
        src = _dataset_path(self.file_name)
        upper_lines = []
        lower_lines = []
        with open(src) as content:
            for line in content:
                if 'bw=1 ' in line or 'bw=' not in line:
                    # leave the useless nodes and first lines intact
                    upper_lines.append(line)
                    lower_lines.append(line)
                    continue
                res = _BW_PATTERN.findall(line)
                # findall returns a (possibly empty) list, never None; the
                # old "!= None" check let an empty match raise IndexError.
                if res:
                    # Threshold comparison is equivalent to the previous
                    # membership test against the quantile arrays, since
                    # those arrays were exactly the values >= / < thresh.
                    if int(res[0][3:]) >= thresh:
                        upper_lines.append(line)
                    else:
                        lower_lines.append(line)
        with open(src + "-upper", "w") as new_file:
            new_file.write("".join(upper_lines))
        with open(src + "-lower", "w") as new_file:
            new_file.write("".join(lower_lines))

    def get_files(self):
        """List dataset files, excluding generated splits and CSVs."""
        dataset_dir = os.path.join(os.path.dirname(os.path.abspath(__file__)),
                                   "datasets")
        return [each for each in os.listdir(dataset_dir)
                if '-upper' not in each
                and '-lower' not in each
                and '.csv' not in each]

    def _parse_bandwidth_file(self):
        """Read datasets/<file_name> into self.bandwidths, sorted ascending."""
        self.bandwidths = []
        with open(_dataset_path(self.file_name)) as content:
            for line in content:
                # skip metadata lines and unvotable relays (bw=1)
                if 'bw=1 ' in line or 'bw=' not in line:
                    continue
                res = _BW_PATTERN.findall(line)
                if res:
                    self.bandwidths.append(int(res[0][3:]))
        self.bandwidths.sort()

    def get_quantiles(self):
        """Return the (Q1, Q2, Q3) midpoint quantiles of the distribution.

        Uses method='midpoint': the 'interpolation' keyword was deprecated
        in NumPy 1.22 and removed in NumPy 2.0.
        """
        q1 = np.quantile(self.bandwidths, .25, method='midpoint')
        q2 = np.quantile(self.bandwidths, .50, method='midpoint')
        q3 = np.quantile(self.bandwidths, .75, method='midpoint')
        return (q1, q2, q3)


class KomloBandwidthParser(BandwidthParser):
    """Synthetic log-shaped bandwidth distribution (no input file needed)."""

    def __init__(self, **kwargs):
        if kwargs.get('sample_size') is not None:
            self.sample_size = int(kwargs['sample_size'])
        else:
            raise ValueError("KomloBandwidthParser requires sample_size")
        # Draw the random sample exactly once; the old implementation
        # re-drew on every get_distribution() call, so get_average() was
        # computed over a different sample than the one returned.
        self.samples = self.bandwidths = self._draw_sample()
        self.total_size = self.get_relay_num()

    def get_distribution(self):
        """Return the sample drawn at construction time (stable)."""
        return self.samples

    def _draw_sample(self):
        """Draw sample_size bandwidths from a log-decay curve, sorted.

        Each value is 200000 - (175000/3) * log10(x) with x uniform in
        [1, 2500], i.e. roughly 1786..200000.
        """
        res = []
        for _ in range(self.sample_size):
            x = random.randint(1, 2500)
            res.append(int(200000 - (200000 - 25000) / 3 * math.log10(x)))
        res.sort()
        return res


class JansenBandwidthParser(BandwidthParser):
    """Down-sample the real distribution to sample_size bucket medians."""

    def get_distribution(self):
        # Data: https://metrics.torproject.org/collector/archive/relay-descriptors/bandwidths/
        # Algorithm based on https://www.robgjansen.com/publications/tormodel-cset2012.pdf
        res = []
        n = math.floor(self.total_size / self.sample_size)
        # NOTE(review): r looks like it was meant to be the remainder
        # (self.total_size % self.sample_size) so the buckets exactly tile
        # the input; as written the first r buckets are one element larger
        # and trailing buckets can run empty, which the cycle-padding below
        # compensates for. Preserved as-is — confirm against the paper
        # before changing.
        r = self.sample_size - n
        i = 0
        for k in range(self.sample_size):
            j = i + n
            if k < r:
                j += 1
            bucket = self.bandwidths[i:j]
            if not bucket:
                # ran out of relays; pad below
                break
            res.append(int(statistics.median(bucket)))
            i = j
        if len(res) != self.sample_size:
            # Pad by cycling over the medians we already have. The pad list
            # is built before extending res so the cycle never iterates a
            # list that is being mutated.
            it = cycle(res)
            pad = [next(it) for _ in range(self.sample_size - len(res))]
            res.extend(pad)
        res.sort()
        return res


if __name__ == '__main__':
    # some default params
    bandwidth_file = os.getenv('BW_FILE')
    if len(sys.argv) < 2:
        print("Usage: bwparser.py scale [generate]")
        sys.exit()
    if bandwidth_file is None:
        # The parsers below require a bandwidth file; without this check the
        # script died with an opaque AttributeError inside set_scale().
        print("BW_FILE environment variable is not set")
        sys.exit(1)
    networkscale = float(sys.argv[1])
    generate = len(sys.argv) > 2

    bw_parser = BandwidthParser(bw_file=bandwidth_file, netscale=networkscale)
    jansen_parser = JansenBandwidthParser(bw_file=bandwidth_file,
                                          netscale=networkscale)
    komlo_parser = KomloBandwidthParser(sample_size=bw_parser.get_sample_size())

    res_komlo = komlo_parser.get_distribution()
    res_jansen = jansen_parser.get_distribution()
    avg_komlo = komlo_parser.get_average()
    avg_jansen = jansen_parser.get_average()

    print("Relays in bandwidth file:", bw_parser.get_relay_num())
    print("Sample size:", bw_parser.get_sample_size())
    print("Smallest bandwidth in file:", bw_parser.get_slowest_relay_bandwidth())
    print("Largest bandwidth in file:", bw_parser.get_fastest_relay_bandwidth())
    print("Quantiles:", bw_parser.get_quantiles())
    print("True average bandwidth:", bw_parser.get_average())
    print("Komlo average bandwidth:", avg_komlo)
    print("Jansen average bandwidth:", avg_jansen)
    print("Komlo array size:", str(len(res_komlo)))
    print("Jansen array size:", str(len(res_jansen)))
    print("Komlo array:", str(res_komlo))
    print("Jansen array:", str(res_jansen))
    print("Total throughput for bandwidth sample: ",
          str(bw_parser.get_relay_throughput()))
    print("Total throughput for Jansen array: ",
          str(jansen_parser.get_relay_throughput()))
    if generate:
        bw_parser.gen_files()