# Source: walkingonions-boosted/bwparser.py
#!/usr/bin/env python3
import os
import re
import sys
import math
import numpy as np
import random
import statistics
from itertools import cycle
class BandwidthParser:
    """Parse a Tor bandwidth file from ./datasets/ and expose the relay
    bandwidth distribution, plus helpers for scaling and quantile splits.

    Keyword args:
        bw_file: name of a bandwidth file under ./datasets/ to parse.
        netscale: fraction of the parsed network to sample; sets sample_size
            via set_scale (requires bw_file to have been parsed first).
        sample_size: explicit sample size, stored as an int.
    """

    def __init__(self, **kwargs):
        self.samples = []
        if 'bw_file' in kwargs and kwargs['bw_file'] is not None:
            self.file_name = kwargs['bw_file']
            self._parse_bandwidth_file()
            self.total_size = self.get_relay_num()
        if 'netscale' in kwargs and kwargs['netscale'] is not None:
            self.set_scale(kwargs['netscale'])
        if 'sample_size' in kwargs and kwargs['sample_size'] is not None:
            self.sample_size = int(kwargs['sample_size'])

    def get_distribution(self):
        """Return the sorted (ascending) list of relay bandwidths."""
        return self.bandwidths

    def get_average(self):
        """Arithmetic mean of the bandwidth distribution."""
        return statistics.mean(self.get_distribution())

    def get_fastest_relay_bandwidth(self):
        """Largest bandwidth; the list is kept sorted ascending."""
        return self.bandwidths[-1]

    def get_slowest_relay_bandwidth(self):
        """Smallest bandwidth; the list is kept sorted ascending."""
        return self.bandwidths[0]

    def get_relay_num(self):
        """Number of relays parsed from the bandwidth file."""
        return len(self.bandwidths)

    def get_sample_size(self):
        """Sample size set via set_scale() or the sample_size kwarg."""
        return self.sample_size

    def get_relay_throughput(self):
        # Per-second bandwidth figures scaled to one hour of traffic.
        return sum(self.get_distribution()) * 3600

    def set_scale(self, networkscale):
        """Set sample_size to ceil(networkscale * total relay count)."""
        self.sample_size = math.ceil(networkscale * self.total_size)

    def gen_files(self):
        """Split the source file at the third quartile into two files:
        "<file>-upper" (bw >= Q3) and "<file>-lower" (bw < Q3).
        Metadata lines and unvotable (bw=1) relays go into both files.
        """
        thresh = self.get_quantiles()[2]
        src = os.path.dirname(__file__) + "/datasets/" + self.file_name
        dest_upper = src + "-upper"
        dest_lower = src + "-lower"
        # Collect lines in lists and join once; repeated string
        # concatenation in the original was quadratic.
        upper_lines = []
        lower_lines = []
        with open(src) as content:
            for line in content:
                if 'bw=1 ' in line or 'bw=' not in line:
                    # leave the useless nodes and first lines intact
                    upper_lines.append(line)
                    lower_lines.append(line)
                    continue
                res = re.findall(r'bw=\d+\b', line)
                # BUG FIX: findall returns a list, never None; the old
                # `res != None` check was always true and res[0] raised
                # IndexError on a malformed bw= line. Guard on emptiness.
                if res:
                    bw = int(res[0][3:])
                    # Every parsed value is in self.bandwidths, so a direct
                    # threshold compare partitions identically to the old
                    # O(n) membership tests against the quantile arrays.
                    if bw >= thresh:
                        upper_lines.append(line)
                    else:
                        lower_lines.append(line)
        with open(dest_upper, "w") as new_file:
            new_file.write("".join(upper_lines))
        with open(dest_lower, "w") as new_file:
            new_file.write("".join(lower_lines))

    def get_files(self):
        """List dataset files, excluding generated -upper/-lower and csv."""
        return [each for each in os.listdir(os.path.dirname(__file__) + "/datasets/")
                if '-upper' not in each and '-lower' not in each and '.csv' not in each]

    def _parse_bandwidth_file(self):
        """Read ./datasets/<file_name> and fill self.bandwidths (sorted)."""
        self.bandwidths = []
        with open(os.path.dirname(__file__) + "/datasets/" + self.file_name) as content:
            for line in content:
                # skip metadata lines and unvotable relays
                if 'bw=1 ' in line or 'bw=' not in line:
                    continue
                res = re.findall(r'bw=\d+\b', line)
                # BUG FIX: guard on an empty match list (findall never
                # returns None), avoiding IndexError on malformed lines.
                if res:
                    self.bandwidths.append(int(res[0][3:]))
        self.bandwidths.sort()

    def get_quantiles(self):
        """Return (Q1, Q2, Q3) of the distribution, midpoint interpolation.

        NOTE: the `interpolation=` keyword was removed in NumPy 2.0;
        `method=` is its replacement (available since NumPy 1.22).
        """
        q1 = np.quantile(self.bandwidths, .25, method='midpoint')
        q2 = np.quantile(self.bandwidths, .50, method='midpoint')
        q3 = np.quantile(self.bandwidths, .75, method='midpoint')
        return (q1, q2, q3)
class KomloBandwidthParser(BandwidthParser):
    """Synthetic bandwidth distribution, no file parsing.

    Generates sample_size bandwidths from a log-shaped curve over a
    uniform draw in [1, 2500]; the result doubles as both samples and
    bandwidths. Deliberately does not call super().__init__, since no
    bandwidth file is involved.
    """

    def __init__(self, **kwargs):
        # `is not None` instead of `!= None` (PEP 8 identity comparison).
        if 'sample_size' in kwargs and kwargs['sample_size'] is not None:
            self.sample_size = int(kwargs['sample_size'])
        else:
            raise ValueError("KomloBandwidthParser requires sample_size")
        self.samples = self.bandwidths = self.get_distribution()
        self.total_size = self.get_relay_num()

    def get_distribution(self):
        """Draw a fresh sorted synthetic distribution of sample_size values.

        Each value is 200000 - (200000-25000)/3 * log10(x) for a uniform
        random x in [1, 2500], giving a decreasing curve from 200000 down
        to roughly 25000. Non-deterministic: uses the global random state.
        """
        res = []
        for _ in range(self.sample_size):
            x = random.randint(1, 2500)
            res.append(int(200000 - (200000 - 25000) / 3 * math.log10(x)))
        res.sort()
        return res
class JansenBandwidthParser(BandwidthParser):
    """Downsample the parsed bandwidth distribution to sample_size values.

    Based on https://www.robgjansen.com/publications/tormodel-cset2012.pdf
    (data: https://metrics.torproject.org/collector/archive/relay-descriptors/bandwidths/).
    The sorted bandwidth list is partitioned into consecutive buckets and
    each bucket is represented by its (truncated) median.
    """

    def get_distribution(self):
        """Return a sorted list of sample_size bucket medians.

        Buckets hold floor(total/sample) relays each; the first
        (sample_size - base) buckets receive one extra relay. If the
        buckets run out of relays early, the medians gathered so far are
        recycled to pad the result up to sample_size.
        """
        base = self.total_size // self.sample_size
        extra = self.sample_size - base
        medians = []
        start = 0
        for idx in range(self.sample_size):
            stop = start + base
            if idx < extra:
                stop += 1
            bucket = self.bandwidths[start:stop]
            if not bucket:
                # ran out of relays; pad below
                break
            medians.append(int(statistics.median(bucket)))
            start = stop
        shortfall = self.sample_size - len(medians)
        if shortfall:
            recycler = cycle(medians)
            medians.extend(next(recycler) for _ in range(shortfall))
        return sorted(medians)
if __name__ == '__main__':
    # CLI: bwparser.py scale [generate]
    # BW_FILE env var names the bandwidth file under ./datasets/.
    bandwidth_file = os.getenv('BW_FILE')
    if len(sys.argv) < 2:
        print("Usage: bwparser.py scale [generate]")
        # sys.exit(1): usage errors should exit nonzero; bare exit() also
        # depends on the site module and is not meant for scripts.
        sys.exit(1)
    if bandwidth_file is None:
        # Without this guard, a missing BW_FILE surfaced later as an
        # opaque AttributeError inside BandwidthParser.
        print("BW_FILE environment variable must be set to a dataset file name")
        sys.exit(1)
    networkscale = float(sys.argv[1])
    # any extra argument enables file generation
    generate = len(sys.argv) > 2
    bw_parser = BandwidthParser(bw_file=bandwidth_file, netscale=networkscale)
    jansen_parser = JansenBandwidthParser(bw_file=bandwidth_file, netscale=networkscale)
    komlo_parser = KomloBandwidthParser(sample_size=bw_parser.get_sample_size())
    res_komlo = komlo_parser.get_distribution()
    res_jansen = jansen_parser.get_distribution()
    avg_komlo = komlo_parser.get_average()
    avg_jansen = jansen_parser.get_average()
    print("Relays in bandwidth file:", bw_parser.get_relay_num())
    print("Sample size:", bw_parser.get_sample_size())
    print("Smallest bandwidth in file:", bw_parser.get_slowest_relay_bandwidth())
    print("Largest bandwidth in file:", bw_parser.get_fastest_relay_bandwidth())
    print("Quantiles:", bw_parser.get_quantiles())
    print("True average bandwidth:", bw_parser.get_average())
    print("Komlo average bandwidth:", avg_komlo)
    print("Jansen average bandwidth:", avg_jansen)
    print("Komlo array size:", str(len(res_komlo)))
    print("Jansen array size:", str(len(res_jansen)))
    print("Komlo array:", str(res_komlo))
    # typo fix: "arary" -> "array"
    print("Jansen array:", str(res_jansen))
    print("Total throughput for bandwidth sample: ", str(bw_parser.get_relay_throughput()))
    print("Total throughput for Jansen array: ", str(jansen_parser.get_relay_throughput()))
    if generate:
        bw_parser.gen_files()