
More changes to make experiments more stable and improve logging

Steven Engler, 4 years ago
commit 2a6d4de2a0
5 changed files with 221 additions and 59 deletions
  1. src/chutney_manager.py (+1 -1)
  2. src/data_helpers.py (+10 -0)
  3. src/experiment.py (+58 -9)
  4. src/log_system_usage.py (+103 -27)
  5. src/relay_working_experiment.py (+49 -22)

+ 1 - 1
src/chutney_manager.py

@@ -89,7 +89,7 @@ class Node:
 		no good way to get the actual value.
 		"""
 		#
-		return 8000+index
+		return 15000+index
 	#
 	def _value_formatter(self, value):
 		if type(value) == str:
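
The guess above assumes Chutney hands out these ports sequentially from a fixed base, which this change moves from 8000 to 15000. A minimal standalone sketch of the scheme (the function name is illustrative; Chutney exposes no API for the real value):

# Hypothetical sketch: node `index` is guessed as base+index, base now 15000.
def guess_port(index, base=15000):
    # No good way to query the actual assignment, hence the guess.
    return base + index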

+ 10 - 0
src/data_helpers.py

@@ -17,6 +17,16 @@ def read_server_results(filename):
 		return results
 	#
 #
+def read_server_results_brief(filename):
+	with open(filename, 'r') as f:
+		return json.load(f)
+	#
+#
+def read_remote_cpu_usage(filename):
+	with gzip.GzipFile(filename, 'rb') as f:
+		return pickle.load(f)
+	#
+#
 def read_client_info(filename):
 	with gzip.GzipFile(filename, 'rb') as f:
 		return pickle.load(f)
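
A hypothetical use of the two new readers (paths are assumptions, and this assumes data_helpers.py imports json alongside gzip and pickle):

# server_results_brief.json is the summary written by experiment.py below
brief = read_server_results_brief('server_results_brief.json')
print(brief[0]['first_byte'], brief[0]['last_byte'])
# remote-cpu-usage.pickle.gz is the stats dict logged by log_system_usage.py
cpu = read_remote_cpu_usage('remote-cpu-usage.pickle.gz')
print(len(cpu['timestamps']))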

+ 58 - 9
src/experiment.py

@@ -16,6 +16,7 @@ import tempfile
 import stem.control
 import stem.descriptor.remote
 import stem.process
+import numpy as np
 #
 import numa
 import log_system_usage
@@ -32,6 +33,9 @@ class DummyEnterExit:
 		pass
 	#
 #
+class RepeatExperimentError(Exception):
+	pass
+#
 class Experiment:
 	def __init__(self, save_data_path, measureme_log_path, num_bytes, num_streams_per_client,
 	             num_clients, num_guards, num_authorities, num_exits, circuit_generator_builder,
@@ -139,8 +143,13 @@ class Experiment:
 						#
 					#
 				#
-				num_lines_to_print = 200
+				num_lines_to_print = 50
 				logging.debug('Last '+str(num_lines_to_print)+' lines of Chutney output:\n'+'\n'.join(chutney_network.startup_output.split('\n')[-num_lines_to_print:]))
+				if self.save_data_path is not None:
+					with open(os.path.join(self.save_data_path, 'chutney-startup.log'), 'w') as f:
+						f.write(chutney_network.startup_output)
+					#
+				#
 				#with chutney_network as net:
 				with chutney_network:
 					nicknames = [self.nodes[x].guess_nickname(x) for x in range(len(self.nodes))]
@@ -193,9 +202,29 @@ class Experiment:
 		p.join()
 		#
 		results = [x['results'] for x in server.results]
+		results_brief = []
+		#
+		for r in results:
+			to_add = {}
+			to_add['first_byte'] = r['deltas']['timestamps'][0]
+			to_add['last_byte'] = r['deltas']['timestamps'][-1]
+			to_add['data_size'] = r['data_size']
+			to_add['measured_data_size'] = int(np.sum(r['deltas']['bytes']))
+			to_add['custom_data'] = json.loads(r['custom_data'].decode('utf-8'))
+			results_brief.append(to_add)
+		#
+		num_expected_results = len(self.proxy_control_ports)*self.num_streams_per_client
+		#
+		threshold = 0.95
+		if len(results)/num_expected_results < threshold:
+			logging.warning('Less than {}% of streams completed: {}/{}'.format(round(threshold*100), len(results), num_expected_results))
+			raise RepeatExperimentError
 		#
 		if self.save_data_path is not None:
 			logging.info('Starting to save server results...')
+			with open(os.path.join(self.save_data_path, 'server_results_brief.json'), 'w') as f:
+				json.dump(results_brief, f)
+			#
 			with gzip.GzipFile(os.path.join(self.save_data_path, 'server_results.pickle.gz'), 'wb') as f:
 				pickle.dump(results, f, protocol=4)
 			#
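
To illustrate the summary records built above, one brief entry derived from a made-up full result (field names follow the code; values are invented):

r = {'deltas': {'timestamps': [1.0, 1.5, 2.0], 'bytes': [100, 200, 300]},
     'data_size': 600, 'custom_data': b'{"stream": 1}'}
to_add = {'first_byte': 1.0,              # r['deltas']['timestamps'][0]
          'last_byte': 2.0,               # r['deltas']['timestamps'][-1]
          'data_size': 600,               # r['data_size']
          'measured_data_size': 600,      # int(np.sum(r['deltas']['bytes']))
          'custom_data': {'stream': 1}}   # json.loads(...decode('utf-8'))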
@@ -207,7 +236,7 @@ class Experiment:
 			time_of_last_byte = max([x['time_of_last_byte'] for x in results])
 			total_transfer_rate = sum([x['data_size'] for x in results])/(time_of_last_byte-time_of_first_byte)
 			#
-			logging.info('Group size: %d/%d', len(results), len(self.proxy_control_ports)*self.num_streams_per_client)
+			logging.info('Group size: %d/%d', len(results), num_expected_results)
 			logging.info('Avg Transferred (MiB): %.4f', avg_data_size/(1024**2))
 			logging.info('Avg Transfer rate (MiB/s): %.4f', avg_transfer_rate/(1024**2))
 			logging.info('Total Transfer rate (MiB/s): %.4f', total_transfer_rate/(1024**2))
@@ -216,7 +245,7 @@ class Experiment:
 	def start_system_logging(self, next_action=None):
 		stop_cpu_logging_event = multiprocessing.Event()
 		p = multiprocessing.Process(target=log_system_usage.log_cpu_stats,
-		                            args=(os.path.join(self.save_data_path, 'cpu_stats.pickle.gz'), 0.1, stop_cpu_logging_event))
+		                            args=(os.path.join(self.save_data_path, 'cpu_stats.pickle.gz'), 0.5, [], stop_cpu_logging_event))
 		p.start()
 		#
 		try:
@@ -229,13 +258,33 @@ class Experiment:
 		p.join()
 	#
 	def start_throughput_clients(self):
-		logging.debug('Getting consensus')
-		try:
-			consensus = stem.descriptor.remote.get_consensus(endpoints=(stem.DirPort('127.0.0.1', 7000),))
-		except Exception as e:
-			raise Exception('Unable to retrieve the consensus') from e
+		circuit_generator = None
+		consensus_attempts_remaining = 10
+		num_expecting_relays = len([x for x in range(len(self.nodes)) if ('client', 1) not in self.nodes[x].options.items()])
+		while circuit_generator is None and consensus_attempts_remaining > 0:
+			logging.debug('Getting consensus')
+			try:
+				consensus = stem.descriptor.remote.get_consensus(endpoints=(stem.DirPort('127.0.0.1', 10000),)).run()
+			except Exception as e:
+				raise Exception('Unable to retrieve the consensus') from e
+			#
+			num_relays = len(consensus)
+			logging.info('Got consensus with {}/{} descriptors'.format(num_relays, num_expecting_relays))
+			#
+			if num_relays != num_expecting_relays:
+				logging.info('Not enough descriptors, trying again in 20 seconds...')
+				time.sleep(20)
+			else:
+				try:
+					circuit_generator = self.circuit_generator_builder(consensus, self.server_address)
+				except AssertionError:
+					logging.exception('Problem with the consensus, trying again in 10 seconds...')
+					time.sleep(10)
+				#
+			#
+			consensus_attempts_remaining -= 1
 		#
-		circuit_generator = self.circuit_generator_builder(consensus, self.server_address)
+		assert circuit_generator is not None, 'Could not build the circuit generator'
 		#
 		proxy_addresses = []
 		for control_port in self.proxy_control_ports:
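
The retry loop above, distilled to a standalone sketch (the DirPort endpoint and delays come from the code; the wrapper function is illustrative):

import time
import stem
import stem.descriptor.remote

def fetch_full_consensus(expected_relays, attempts=10, delay=20):
    for _ in range(attempts):
        # query the local test authority's DirPort for a consensus
        consensus = stem.descriptor.remote.get_consensus(
            endpoints=(stem.DirPort('127.0.0.1', 10000),)).run()
        if len(consensus) == expected_relays:
            return consensus
        time.sleep(delay)
    raise Exception('consensus never listed all {} relays'.format(expected_relays))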

+ 103 - 27
src/log_system_usage.py

@@ -3,11 +3,20 @@
 import time
 import threading
 import subprocess
+import re
 import sys
+import os
 import pickle
 import gzip
 #
 PROC_STAT_HEADERS = ('user', 'nice', 'system', 'idle', 'iowait', 'irq', 'softirq', 'steal', 'guest', 'guest_nice')
+PROC_PID_STAT_HEADERS = ('pid', 'comm', 'state', 'ppid', 'pgrp', 'session', 'tty_nr', 'tpgid', 'flags', 'minflt',
+                         'cminflt', 'majflt', 'cmajflt', 'utime', 'stime', 'cutime', 'cstime', 'priority', 'nice',
+                         'num_threads', 'itrealvalue', 'starttime', 'vsize', 'rss', 'rsslim', 'startcode',
+                         'endcode', 'startstack', 'kstkesp', 'kstkeip', 'signal', 'blocked', 'sigignore',
+                         'sigcatch', 'wchan', 'nswap', 'cnswap', 'exit_signal', 'processor', 'rt_priority',
+                         'policy', 'delayacct_blkio_ticks', 'guest_time', 'cguest_time', 'start_data', 'end_data',
+                         'start_brk', 'arg_start', 'arg_end', 'env_start', 'env_end', 'exit_code')
 #
 def get_cpu_stats(path='/proc/stat'):
 	"""
@@ -42,21 +51,61 @@ def get_cpu_stats(path='/proc/stat'):
 	#
 	return stats
 #
+def parse_stat_file(path):
+	with open(path, 'r') as f:
+		contents = f.read()
+	#
+	raw_stats = re.findall(r"(\(.*\)|\S+)", contents, flags=re.DOTALL)
+	proc_stats = {x[0]: x[1] for x in zip(PROC_PID_STAT_HEADERS, raw_stats)}
+	for k in proc_stats:
+		if k != 'comm' and k != 'state':
+			proc_stats[k] = int(proc_stats[k])
+		#
+	#
+	return proc_stats
+#
+def get_proc_stats(pid):
+	pid = int(pid)
+	path = os.path.join('/proc', str(pid), 'stat')
+	#
+	return parse_stat_file(path)
+#
+def get_thread_stats(tid):
+	tid = int(tid)
+	path = os.path.join('/proc', str(tid), 'task', str(tid), 'stat')
+	#
+	return parse_stat_file(path)
+#
+def calculate_cpu_stats(stats):
+	idle = stats['idle'] + stats['iowait']
+	non_idle = stats['user'] + stats['nice'] + stats['system'] + stats['irq'] + stats['softirq'] + stats['steal']
+	#
+	return (idle, non_idle)
+#
 def calculate_cpu_usage(initial, current):
 	"""
 	Calculation adapted from: https://stackoverflow.com/questions/23367857/accurate-calculation-of-cpu-usage-given-in-percentage-in-linux/
 	"""
 	#
-	initial_idle = initial['idle'] + initial['iowait']
-	current_idle = current['idle'] + current['iowait']
+	(initial_idle, initial_non_idle) = calculate_cpu_stats(initial)
+	initial_total = initial_idle + initial_non_idle
 	#
-	initial_non_idle = initial['user'] + initial['nice'] + initial['system'] + initial['irq'] + initial['softirq'] + initial['steal']
-	current_non_idle = current['user'] + current['nice'] + current['system'] + current['irq'] + current['softirq'] + current['steal']
+	(current_idle, current_non_idle) = calculate_cpu_stats(current)
+	current_total = current_idle + current_non_idle
 	#
+	return (current_non_idle-initial_non_idle)/(current_total-initial_total)
+#
+def calculate_process_cpu_usage(process_initial, process_current, cpu_initial, cpu_current):
+	(initial_idle, initial_non_idle) = calculate_cpu_stats(cpu_initial)
 	initial_total = initial_idle + initial_non_idle
+	#
+	(current_idle, current_non_idle) = calculate_cpu_stats(cpu_current)
 	current_total = current_idle + current_non_idle
 	#
-	return (current_non_idle-initial_non_idle)/(current_total-initial_total)
+	process_initial_non_idle = process_initial['utime'] + process_initial['stime']
+	process_current_non_idle = process_current['utime'] + process_current['stime']
+	#
+	return (process_current_non_idle-process_initial_non_idle)/(current_total-initial_total)
 #
 def calculate_cpu_usage_continuous(stats):
 	cpu_usages = []
@@ -65,18 +114,30 @@ def calculate_cpu_usage_continuous(stats):
 	#
 	return cpu_usages
 #
-def get_running_processes():
-	lines = subprocess.check_output(['ps', '-a', '-x', '-o', 'pid,state,args', '--no-headers']).decode('utf-8').split('\n')
-	lines = [line.strip() for line in lines]
-	lines = [line.split(' ', 2) for line in lines if len(line) != 0]
-	#
-	data = []
-	for line in lines:
-		data.append({'pid':int(line[0]), 'state':line[1], 'args':line[2]})
-	#
-	return data
+def calculate_process_cpu_usage_continuous(process_stats, cpu_stats):
+	process_usages = []
+	assert all([len(process_stats) == len(cpu_stats[i]) for i in cpu_stats])
+	for i in range(len(process_stats)-1):
+		using_core_0 = process_stats[i]['processor']
+		using_core_1 = process_stats[i+1]['processor']
+		usage_0 = calculate_process_cpu_usage(process_stats[i], process_stats[i+1], cpu_stats[using_core_0][i], cpu_stats[using_core_0][i+1])
+		usage_1 = calculate_process_cpu_usage(process_stats[i], process_stats[i+1], cpu_stats[using_core_1][i], cpu_stats[using_core_1][i+1])
+		process_usages.append((usage_0+usage_1)/2)
+	#
+	return process_usages
 #
-def log_cpu_stats(path, interval, stop_event):
+#def get_running_processes():
+#	lines = subprocess.check_output(['ps', '-a', '-x', '-o', 'pid,state,args', '--no-headers']).decode('utf-8').split('\n')
+#	lines = [line.strip() for line in lines]
+#	lines = [line.split(' ', 2) for line in lines if len(line) != 0]
+#	#
+#	data = []
+#	for line in lines:
+#		data.append({'pid':int(line[0]), 'state':line[1], 'args':line[2]})
+#	#
+#	return data
+#
+def log_cpu_stats(path, interval, pids, stop_event):
 	"""
 	Log the cpu stats to a gz compressed JSON file. Storing JSON
 	seems to only use about 10% more disk space than storing
@@ -88,20 +149,31 @@ def log_cpu_stats(path, interval, stop_event):
 	stop_event: a threading.Event which stops the function
 	"""
 	#
-	stats = {'timestamps':[], 'stats':{'system':[], 'cpus':{}}, 'processes':[]}
+	pids = [int(pid) for pid in pids]
+	threads = {pid: [int(tid) for tid in os.listdir('/proc/{}/task'.format(pid))] for pid in pids}
+	stats = {'timestamps':[],
+	         'cpu':{'system':[],
+	                'id':{x: [] for x in get_cpu_stats()['cpus'].keys()}},
+	         'process':{x: {'pid': [],
+	                        'tid': {y: [] for y in threads[x]}} for x in pids}}
+	#
 	while not stop_event.is_set():
-		stats['timestamps'].append(time.time())
-		#stats['stats'].append(get_cpu_stats())
+		current_time = time.time()
+		stats['timestamps'].append(current_time)
+		#
 		current_stats = get_cpu_stats()
-		stats['stats']['system'].append(current_stats['system'])
+		stats['cpu']['system'].append(current_stats['system'])
 		for cpu in current_stats['cpus']:
-			if cpu not in stats['stats']['cpus']:
-				stats['stats']['cpus'][cpu] = []
+			stats['cpu']['id'][cpu].append(current_stats['cpus'][cpu])
+		#
+		for pid in pids:
+			stats['process'][pid]['pid'].append(get_proc_stats(pid))
+			for tid in threads[pid]:
+				stats['process'][pid]['tid'][tid].append(get_thread_stats(tid))
 			#
-			stats['stats']['cpus'][cpu].append(current_stats['cpus'][cpu])
 		#
-		stats['processes'].append(get_running_processes())
-		stop_event.wait(interval)
+		wait_time = max(0, interval-(time.time()-current_time))
+		stop_event.wait(wait_time)
 	#
 	with gzip.GzipFile(path, 'wb') as f:
 		pickle.dump(stats, f, protocol=4)
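
A worked check of the usage math above (all jiffy counts invented):

initial = {'user': 100, 'nice': 0, 'system': 50, 'idle': 800,
           'iowait': 20, 'irq': 5, 'softirq': 5, 'steal': 0}
current = {'user': 160, 'nice': 0, 'system': 70, 'idle': 890,
           'iowait': 25, 'irq': 7, 'softirq': 8, 'steal': 0}
# idle: 820 -> 915, non-idle: 160 -> 245, total: 980 -> 1160
# usage = (245 - 160) / (1160 - 980) = 85/180, roughly 0.47
print(calculate_cpu_usage(initial, current))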
@@ -145,11 +217,15 @@ def log_cpu_stats(path, interval, stop_event):
 if __name__ == '__main__':
 	stop_event = threading.Event()
 	#
-	assert len(sys.argv) == 3
+	assert len(sys.argv) >= 3
 	interval = float(sys.argv[1])
 	file_name = sys.argv[2]
+	if len(sys.argv) > 3:
+		pids = sys.argv[3].split(',')
+	else:
+		pids = []
 	#
-	t = threading.Thread(target=log_cpu_stats, args=(file_name, interval, stop_event))
+	t = threading.Thread(target=log_cpu_stats, args=(file_name, interval, pids, stop_event))
 	t.start()
 	#
 	try:
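
With the optional pid list, the script would be invoked along these lines (pids are illustrative):

# interval in seconds, output path, then comma-separated pids to track
python3 log_system_usage.py 0.5 cpu_stats.pickle.gz 12345,12346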

+ 49 - 22
src/relay_working_experiment.py

@@ -13,6 +13,8 @@ import json
 import gzip
 import pickle
 import tempfile
+import collections
+import gc
 #
 import stem.control
 import stem.descriptor.remote
@@ -59,14 +61,17 @@ class CustomExperiment(experiment.Experiment):
 			#local_ip = '129.97.119.196'
 			#target_ip = '129.97.119.226'
 			target_hostname = 'cluck2'
+			target_dir = '/tmp/chutney-net'
 		elif self.remote_name == 'sengler-rpi':
 			local_ip = '129.97.119.196'
 			target_ip = '129.97.169.9'
 			target_hostname = 'sengler-rpi'
+			target_dir = '/tmp/chutney-net'
 		elif self.remote_name is None:
 			local_ip = None
 			target_ip = None
 			target_hostname = None
+			target_dir = None
 		else:
 			raise Exception('hostname not known')
 		#
@@ -85,9 +90,14 @@ class CustomExperiment(experiment.Experiment):
 			target_optional_args['ip'] = target_ip
 		if target_hostname is not None:
 			target_optional_args['remote_hostname'] = target_hostname
+		if target_dir is not None:
+			target_optional_args['remote_net_dir'] = target_dir
 
-		target_optional_args['num_cpus'] = 4 # make sure it can process onion skins fast enough, and keep it consistent between computers
+		target_optional_args['num_cpus'] = 2 # make sure it can process onion skins fast enough, and keep it consistent between computers
+		# tor actually uses one more worker thread than what you ask for
 		target_optional_args['num_additional_eventloops'] = self.num_additional_eventloops
+		target_optional_args['dircache'] = False
+		# the voting interval is 40 seconds which puts an unrealistic workload on the target, so we disable it
 		target_cpu_prof = False #True
 		target_daemon = False
 		target_log_throughput = True
@@ -144,7 +154,11 @@ class CustomExperiment(experiment.Experiment):
 		remote_script_path = '/tmp/log_system_usage.py'
 		remote_save_path = '/tmp/cpu-usage.pickle.gz'
 		local_save_path = os.path.join(self.save_data_path, 'remote-cpu-usage.pickle.gz')
-		command = 'python3 {} 0.1 {}'.format(remote_script_path, remote_save_path)
+		#
+		tor_pids = subprocess.check_output(['ssh', self.remote_name, 'pgrep tor']).decode('utf-8').split()
+		logging.info('Logging the following pids on {}: {}'.format(self.remote_name, tor_pids))
+		command = 'python3 {} 0.2 {} {}'.format(remote_script_path, remote_save_path, ','.join(tor_pids))
 		#
 		try:
 			subprocess.check_output(['scp', local_script_path, '{}:{}'.format(self.remote_name, remote_script_path)], stderr=subprocess.STDOUT)
@@ -157,6 +171,9 @@ class CustomExperiment(experiment.Experiment):
 			#
 			if next_action is not None:
 				next_action()
+				time.sleep(5)
+				# wait a few seconds so that we have extra data
+				# this may be useful if we need to do averaging
 			#
 			if p.poll() != None:
 				raise Exception('Remote CPU monitoring script exited before it was supposed to')
@@ -196,10 +213,11 @@ class CustomExperiment(experiment.Experiment):
 def build_circuit_generator(consensus, server_address):
 	fingerprints = [desc.nickname for desc in consensus]
 	exit_fingerprints = [desc.nickname for desc in consensus if desc.exit_policy.can_exit_to(*server_address)]
+	authority_fingerprints = [desc.nickname for desc in consensus if desc.nickname.endswith('a')]
 	#
 	target_fingerprints = [desc.nickname for desc in consensus if desc.nickname.endswith('target')]
 	assert len(target_fingerprints) >= 1, 'No target relay in the consensus'
-	non_exit_fingerprints = list(set(fingerprints)-set(exit_fingerprints)-set(target_fingerprints))
+	non_exit_fingerprints = list(set(fingerprints)-set(exit_fingerprints)-set(target_fingerprints)-set(authority_fingerprints))
 	#
 	assert len(exit_fingerprints) >= 1, 'Need at least one exit relay'
 	assert len(non_exit_fingerprints) >= 1, 'Need at least one non-exit relay'
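
The new authority filter keys off Chutney's nickname suffixes; schematically (nicknames invented to match the endswith() checks above):

nicknames = ['test000a', 'test001a', 'test002r', 'test003target']
authorities = [n for n in nicknames if n.endswith('a')]   # ['test000a', 'test001a']
targets = [n for n in nicknames if n.endswith('target')]  # ['test003target']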
@@ -258,12 +276,18 @@ if __name__ == '__main__':
 	#
 	start_time = time.time()
 	#
-	tors = {'working':'/home/sengler/code/working/tor/src/app/tor', 'working-without':'/home/sengler/code/working/tor-without-tcmalloc/src/app/tor', 'dev-without':'/home/sengler/code/dev/tor-throughput-log-0.4.2.6-without-tcmalloc/src/app/tor', 'dev-with':'/home/sengler/code/dev/tor-throughput-log-0.4.2.6-with-tcmalloc/src/app/tor'}
+	#tors = {'working':'/home/sengler/code/working/tor/src/app/tor', 'working-without':'/home/sengler/code/working/tor-without-tcmalloc/src/app/tor', 'dev-without':'/home/sengler/code/dev/tor-throughput-log-0.4.2.6-without-tcmalloc/src/app/tor', 'dev-with':'/home/sengler/code/dev/tor-throughput-log-0.4.2.6-with-tcmalloc/src/app/tor'}
+	tors = collections.OrderedDict()
+	tors['working'] = '/home/sengler/code/working/tor/src/app/tor'
+	tors['working-without'] = '/home/sengler/code/working/tor-without-tcmalloc/src/app/tor'
+	tors['dev-with'] = '/home/sengler/code/dev/tor-throughput-log-0.4.2.6-with-tcmalloc/src/app/tor'
+	tors['dev-without'] = '/home/sengler/code/dev/tor-throughput-log-0.4.2.6-without-tcmalloc/src/app/tor'
 	hosts = ['sengler-rpi', 'cluck2']
 	###hosts = ['cluck2']
 	###hosts = ['sengler-rpi']
 	num_repetitions = 15
 	nums_additional_eventloops_options = [0, 1, 2, 3]
+	#nums_additional_eventloops_options = [3, 2, 1, 0]
 	#
 	#tors = {'working':'/home/sengler/code/working/tor/src/app/tor', 'dev-without':'/home/sengler/code/dev/tor-throughput-log-0.4.2.6-without-tcmalloc/src/app/tor'}
 	#hosts = ['cluck2']
@@ -295,18 +319,18 @@ if __name__ == '__main__':
 					#	num_bytes = 20*(2**20)
 					if host == 'cluck2':
 						num_clients = 150
-						num_guards = 58 # number of relays (including guards)
+						num_guards = 30 # number of relays (including guards)
 						num_authorities = 2 # will also act as a relay or guard
-						num_exits = 60 # will be used only as an exit
+						num_exits = 30 # will be used only as an exit
 						num_streams_per_client = 10
 						num_bytes = 20*(2**20)
 					elif host == 'sengler-rpi':
-						num_clients = 30
-						num_guards = 58 # number of relays (including guards)
-						num_authorities = 2 # will also act as a relay or guard
-						num_exits = 60 # will be used only as an exit
-						num_streams_per_client = 8
-						num_bytes = 10*(2**20)
+						num_clients = 100
+						num_guards = 300 # number of relays (including guards)
+						num_authorities = 3 # will also act as a relay or guard
+						num_exits = 300 # will be used only as an exit
+						num_streams_per_client = 6
+						num_bytes = 5*(2**20)
 					elif host is None:
 						num_clients = 10
 						num_guards = 10 # number of relays (including guards)
@@ -329,11 +353,11 @@ if __name__ == '__main__':
 							os.mkdir(save_data_path)
 							logging.info('Starting on {} using {}-{} ({}), repeat {}, attempt {}'.format(host, tor_name, num_additional_eventloops, tor_path, repeat, attempt))
 							#
-							#experiment = CustomExperiment(args.helgrind, args.target_tor, save_data_path, measureme_log_path, args.num_bytes,
-							experiment = CustomExperiment(args.helgrind, tor_path, num_additional_eventloops, host, save_data_path,
-							                              measureme_log_path, num_bytes,
-							                              num_streams_per_client, num_clients, num_guards, num_authorities, num_exits,
-							                              build_circuit_generator, args.buffer_len, args.wait_range, measureme, test_network=False)
+							#exp = CustomExperiment(args.helgrind, args.target_tor, save_data_path, measureme_log_path, args.num_bytes,
+							exp = CustomExperiment(args.helgrind, tor_path, num_additional_eventloops, host, save_data_path,
+							                       measureme_log_path, num_bytes,
+							                       num_streams_per_client, num_clients, num_guards, num_authorities, num_exits,
+							                       build_circuit_generator, args.buffer_len, args.wait_range, measureme, test_network=False)
 							#
 							def sleep_then_run(duration, func):
 								logging.info('Sleeping for {} seconds before running \'{}\''.format(duration, func.__name__))
@@ -345,9 +369,9 @@ if __name__ == '__main__':
 							#p = subprocess.Popen(['ssh', '-t', 'sengler-rpi', 'python3 /tmp/log_system_usage.py /tmp/usage.gz'])
 							#
 							try:
-								experiment.start_system_logging(lambda: experiment.start_remote_logging(lambda: experiment.start_chutney(lambda: experiment.start_throughput_server(lambda: sleep_then_run(20, experiment.start_throughput_clients)))))
-							except (stem.Timeout, stem.CircuitExtensionFailed):
-								tries = 5
+								exp.start_chutney(lambda: exp.start_throughput_server(lambda: sleep_then_run(120, lambda: exp.start_system_logging(lambda: exp.start_remote_logging(exp.start_throughput_clients)))))
+							except (stem.Timeout, stem.CircuitExtensionFailed, experiment.RepeatExperimentError):
+								tries = 9
 								attempt += 1
 								if attempt < tries:
 									logging.exception('Experiment run failed, trying again ({} tries remaining)'.format(tries-attempt))
@@ -356,11 +380,14 @@ if __name__ == '__main__':
 									raise
 								#
 							#
-							shutil.copytree('/tmp/chutney-net/nodes', os.path.join(save_data_path, 'nodes'))
+							shutil.copytree('/run/user/3271/chutney-net/nodes', os.path.join(save_data_path, 'nodes'))
 							os.system("ps u | grep 'tor'")
-							os.system("rm -rf /tmp/chutney-net/*")
+							os.system("rm -rf /run/user/3271/chutney-net/*")
 							break
 						#
+						exp = None
+						gc.collect()
+						# not sure if this is actually useful, but hopefully it reduces memory usage for when we need to fork
 					#
 				#
 			#
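
For readability, the reordered stage chain from the try block above, with each continuation annotated (same call as in the diff, reformatted only):

exp.start_chutney(                          # 1. boot the Chutney network
    lambda: exp.start_throughput_server(    # 2. start the measurement server
        lambda: sleep_then_run(120,         # 3. wait for the network to settle
            lambda: exp.start_system_logging(      # 4. local CPU logging
                lambda: exp.start_remote_logging(  # 5. remote CPU logging
                    exp.start_throughput_clients)))))  # 6. run the clients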