12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270370470570670770870971071171271371471571671771871972072172272372472572672772872973073173273373473573673773873974074174274374474574674774874975075175275375475575675775875976076176276376476576676776876977077177277377477577677777877978078178278378478578678778878979079179279379479579679779879980080180280380480580680780880981081181281381481581681781881982082182282382482582682782882983083183283383483583683783883984084184284384484584684784884985085185285385485585685785885986086186286386486586686786886987087187287387487587687787887988088188288388488588688788888989089189289389489589689789889990090190290390490590690790890991091191291391491591691791891992092192292392492592692792892993093193293393493593693793893994094194294394494594694794894995095195295395495595695795895996096196296396496596696796896997097197297397497597697797897998098198298398498598698798898999099199299399499599699799899910001001100210031004100510061007100810091010101110121013101410151016101710181019102010211022102310241025102610271028102910301031103210331034103510361037103810391040104110421043104410451046104710481049105010511052105310541055105610571058105910601061106210631064106510661067106810691070107110721073107410751076107710781079108010811082108310841085108610871088108910901091109210931094109510961097109810991100110111021103110411051106110711081109111011111112111311141115111611171118111911201121112211231124112511261127112811291130113111321133113411351136113711381139114011411142114311441145114611471148114911501151115211531154115511561157115811591160116111621163116411651166116711681169117011711172117311741175117611771178117911801181118211831184118511861187118811891190119111921193119411951196119711981199120012011202120312041205120612071208120912101211121212131214121512161217121812191220122112221223122412251226122712281229123012311232123312341235123612371238123912401241124212431244124512461247124812491250125112521253125412551256125712581259126012611262126312641265126612671268126912701271127212731274127512761277127812791280128112821283128412851286128712881289129012911292129312941295129612971298129913001301130213031304130513061307130813091310131113121313131413151316131713181319132013211322132313241325132613271328132913301331133213331334133513361337133813391340134113421343134413451346134713481349135013511352135313541355135613571358135913601361136213631364136513661367136813691370137113721373137413751376137713781379138013811382138313841385138613871388138913901391139213931394139513961397139813991400140114021403140414051406140714081409141014111412141314141415141614171418141914201421142214231424142514261427142814291430143114321433143414351436143714381439144014411442144314441445144614471448144914501451145214531454145514561457145814591460146114621463146414651466146714681469147014711472147314741475147614771478147914801481148214831484148514861487148814891490149114921493149414951496149714981499150015011502150315041505150615071508150915101511151215131514151515161517151815191520152115221523152415251526152715281529153015311532153315341535153615371538153915401541154215431544154515461547154815491550155115521553155415551556155715581559156015611562156315641565156615671568156915701571157215731574157515761577157815791580158115821583158415851586158715881589159015911592159315941595159615971598159916001601160216031604160516061607160816091610161116121613161416151616161716181619162016211622162316241625162616271628162916301631163216331634163516361637163816391640164116421643164416451646164716481649165016511652165316541655165616571658165916601661166216631664166516661667166816691670167116721673167416751676167716781679168016811682168316841685168616871688168916901691169216931694169516961697169816991700170117021703170417051706170717081709171017111712171317141715171617171718171917201721172217231724172517261727172817291730173117321733173417351736173717381739174017411742174317441745174617471748174917501751175217531754175517561757175817591760176117621763176417651766176717681769177017711772177317741775177617771778177917801781178217831784178517861787178817891790179117921793179417951796179717981799180018011802180318041805180618071808180918101811181218131814181518161817181818191820182118221823182418251826182718281829183018311832183318341835183618371838183918401841184218431844184518461847184818491850185118521853185418551856185718581859186018611862186318641865186618671868186918701871187218731874187518761877187818791880188118821883188418851886188718881889189018911892189318941895189618971898189919001901190219031904190519061907190819091910191119121913191419151916191719181919192019211922192319241925192619271928192919301931193219331934193519361937193819391940194119421943194419451946194719481949195019511952195319541955195619571958195919601961196219631964196519661967196819691970197119721973197419751976197719781979198019811982198319841985198619871988198919901991199219931994199519961997199819992000200120022003200420052006200720082009201020112012201320142015201620172018201920202021202220232024202520262027202820292030203120322033203420352036203720382039204020412042204320442045204620472048204920502051205220532054205520562057205820592060206120622063206420652066206720682069207020712072207320742075207620772078207920802081208220832084208520862087208820892090209120922093209420952096209720982099210021012102210321042105210621072108210921102111211221132114211521162117211821192120212121222123212421252126212721282129213021312132213321342135213621372138213921402141214221432144214521462147214821492150215121522153215421552156215721582159216021612162216321642165216621672168216921702171217221732174217521762177217821792180218121822183218421852186218721882189219021912192219321942195219621972198219922002201220222032204220522062207220822092210221122122213221422152216 |
- #!/usr/bin/env python
- # Usage:
- #
- # Regenerate the list:
- # scripts/maint/updateFallbackDirs.py > src/app/config/fallback_dirs.inc 2> fallback_dirs.log
- #
- # Check the existing list:
- # scripts/maint/updateFallbackDirs.py check_existing > fallback_dirs.inc.ok 2> fallback_dirs.log
- # mv fallback_dirs.inc.ok src/app/config/fallback_dirs.inc
- #
- # This script should be run from a stable, reliable network connection,
- # with no other network activity (and not over tor).
- # If this is not possible, please disable:
- # PERFORM_IPV4_DIRPORT_CHECKS and PERFORM_IPV6_DIRPORT_CHECKS
- #
- # Needs dateutil, stem, and potentially other python packages.
- # Optionally uses ipaddress (python 3 builtin) or py2-ipaddress (package)
- # for netblock analysis.
- #
- # Then read the logs to make sure the fallbacks aren't dominated by a single
- # netblock or port.
- # Script by weasel, April 2015
- # Portions by gsathya & karsten, 2013
- # https://trac.torproject.org/projects/tor/attachment/ticket/8374/dir_list.2.py
- # Modifications by teor, 2015
- import StringIO
- import string
- import re
- import datetime
- import gzip
- import os.path
- import json
- import math
- import sys
- import urllib
- import urllib2
- import hashlib
- import dateutil.parser
- # bson_lazy provides bson
- #from bson import json_util
- import copy
- import re
- from stem.descriptor import DocumentHandler
- from stem.descriptor.remote import get_consensus, get_server_descriptors, MAX_FINGERPRINTS
- import logging
- logging.root.name = ''
- HAVE_IPADDRESS = False
- try:
- # python 3 builtin, or install package py2-ipaddress
- # there are several ipaddress implementations for python 2
- # with slightly different semantics with str typed text
- # fortunately, all our IP addresses are in unicode
- import ipaddress
- HAVE_IPADDRESS = True
- except ImportError:
- # if this happens, we avoid doing netblock analysis
- logging.warning('Unable to import ipaddress, please install py2-ipaddress.' +
- ' A fallback list will be created, but optional netblock' +
- ' analysis will not be performed.')
- ## Top-Level Configuration
- # We use semantic versioning: https://semver.org
- # In particular:
- # * major changes include removing a mandatory field, or anything else that
- # would break an appropriately tolerant parser,
- # * minor changes include adding a field,
- # * patch changes include changing header comments or other unstructured
- # content
- FALLBACK_FORMAT_VERSION = '2.0.0'
- SECTION_SEPARATOR_BASE = '====='
- SECTION_SEPARATOR_COMMENT = '/* ' + SECTION_SEPARATOR_BASE + ' */'
- # Output all candidate fallbacks, or only output selected fallbacks?
- OUTPUT_CANDIDATES = False
- # Perform DirPort checks over IPv4?
- # Change this to False if IPv4 doesn't work for you, or if you don't want to
- # download a consensus for each fallback
- # Don't check ~1000 candidates when OUTPUT_CANDIDATES is True
- PERFORM_IPV4_DIRPORT_CHECKS = False if OUTPUT_CANDIDATES else True
- # Perform DirPort checks over IPv6?
- # If you know IPv6 works for you, set this to True
- # This will exclude IPv6 relays without an IPv6 DirPort configured
- # So it's best left at False until #18394 is implemented
- # Don't check ~1000 candidates when OUTPUT_CANDIDATES is True
- PERFORM_IPV6_DIRPORT_CHECKS = False if OUTPUT_CANDIDATES else False
- # Must relays be running now?
- MUST_BE_RUNNING_NOW = (PERFORM_IPV4_DIRPORT_CHECKS
- or PERFORM_IPV6_DIRPORT_CHECKS)
- # Clients have been using microdesc consensuses by default for a while now
- DOWNLOAD_MICRODESC_CONSENSUS = True
- # If a relay delivers an expired consensus, if it expired less than this many
- # seconds ago, we still allow the relay. This should never be less than -90,
- # as all directory mirrors should have downloaded a consensus 90 minutes
- # before it expires. It should never be more than 24 hours, because clients
- # reject consensuses that are older than REASONABLY_LIVE_TIME.
- # For the consensus expiry check to be accurate, the machine running this
- # script needs an accurate clock.
- #
- # Relays on 0.3.0 and later return a 404 when they are about to serve an
- # expired consensus. This makes them fail the download check.
- # We use a tolerance of 0, so that 0.2.x series relays also fail the download
- # check if they serve an expired consensus.
- CONSENSUS_EXPIRY_TOLERANCE = 0
- # Output fallback name, flags, bandwidth, and ContactInfo in a C comment?
- OUTPUT_COMMENTS = True if OUTPUT_CANDIDATES else False
- # Output matching ContactInfo in fallbacks list?
- # Useful if you're trying to contact operators
- CONTACT_COUNT = True if OUTPUT_CANDIDATES else False
- # How the list should be sorted:
- # fingerprint: is useful for stable diffs of fallback lists
- # measured_bandwidth: is useful when pruning the list based on bandwidth
- # contact: is useful for contacting operators once the list has been pruned
- OUTPUT_SORT_FIELD = 'contact' if OUTPUT_CANDIDATES else 'fingerprint'
- ## OnionOO Settings
- ONIONOO = 'https://onionoo.torproject.org/'
- #ONIONOO = 'https://onionoo.thecthulhu.com/'
- # Don't bother going out to the Internet, just use the files available locally,
- # even if they're very old
- LOCAL_FILES_ONLY = False
- ## Whitelist / Blacklist Filter Settings
- # The whitelist contains entries that are included if all attributes match
- # (IPv4, dirport, orport, id, and optionally IPv6 and IPv6 orport)
- # What happens to entries not in whitelist?
- # When True, they are included, when False, they are excluded
- INCLUDE_UNLISTED_ENTRIES = True if OUTPUT_CANDIDATES else False
- WHITELIST_FILE_NAME = 'scripts/maint/fallback.whitelist'
- FALLBACK_FILE_NAME = 'src/app/config/fallback_dirs.inc'
- # The number of bytes we'll read from a filter file before giving up
- MAX_LIST_FILE_SIZE = 1024 * 1024
- ## Eligibility Settings
- # Require fallbacks to have the same address and port for a set amount of time
- # We used to have this at 1 week, but that caused many fallback failures, which
- # meant that we had to rebuild the list more often. We want fallbacks to be
- # stable for 2 years, so we set it to a few months.
- #
- # If a relay changes address or port, that's it, it's not useful any more,
- # because clients can't find it
- ADDRESS_AND_PORT_STABLE_DAYS = 90
- # We ignore relays that have been down for more than this period
- MAX_DOWNTIME_DAYS = 0 if MUST_BE_RUNNING_NOW else 7
- # FallbackDirs must have a time-weighted-fraction that is greater than or
- # equal to:
- # Mirrors that are down half the time are still useful half the time
- CUTOFF_RUNNING = .50
- CUTOFF_V2DIR = .50
- # Guard flags are removed for some time after a relay restarts, so we ignore
- # the guard flag.
- CUTOFF_GUARD = .00
- # FallbackDirs must have a time-weighted-fraction that is less than or equal
- # to:
- # .00 means no bad exits
- PERMITTED_BADEXIT = .00
- # older entries' weights are adjusted with ALPHA^(age in days)
- AGE_ALPHA = 0.99
- # this factor is used to scale OnionOO entries to [0,1]
- ONIONOO_SCALE_ONE = 999.
- ## Fallback Count Limits
- # The target for these parameters is 20% of the guards in the network
- # This is around 200 as of October 2015
- _FB_POG = 0.2
- FALLBACK_PROPORTION_OF_GUARDS = None if OUTPUT_CANDIDATES else _FB_POG
- # Limit the number of fallbacks (eliminating lowest by advertised bandwidth)
- MAX_FALLBACK_COUNT = None if OUTPUT_CANDIDATES else 200
- # Emit a C #error if the number of fallbacks is less than expected
- MIN_FALLBACK_COUNT = 0 if OUTPUT_CANDIDATES else MAX_FALLBACK_COUNT*0.5
- # The maximum number of fallbacks on the same address, contact, or family
- #
- # With 150 fallbacks, this means each operator sees 5% of client bootstraps.
- # For comparison:
- # - We try to limit guard and exit operators to 5% of the network
- # - The directory authorities used to see 11% of client bootstraps each
- #
- # We also don't want too much of the list to go down if a single operator
- # has to move all their relays.
- MAX_FALLBACKS_PER_IP = 1
- MAX_FALLBACKS_PER_IPV4 = MAX_FALLBACKS_PER_IP
- MAX_FALLBACKS_PER_IPV6 = MAX_FALLBACKS_PER_IP
- MAX_FALLBACKS_PER_CONTACT = 7
- MAX_FALLBACKS_PER_FAMILY = 7
- ## Fallback Bandwidth Requirements
- # Any fallback with the Exit flag has its bandwidth multiplied by this fraction
- # to make sure we aren't further overloading exits
- # (Set to 1.0, because we asked that only lightly loaded exits opt-in,
- # and the extra load really isn't that much for large relays.)
- EXIT_BANDWIDTH_FRACTION = 1.0
- # If a single fallback's bandwidth is too low, it's pointless adding it
- # We expect fallbacks to handle an extra 10 kilobytes per second of traffic
- # Make sure they can support fifty times the expected extra load
- #
- # We convert this to a consensus weight before applying the filter,
- # because all the bandwidth amounts are specified by the relay
- MIN_BANDWIDTH = 50.0 * 10.0 * 1024.0
- # Clients will time out after 30 seconds trying to download a consensus
- # So allow fallback directories half that to deliver a consensus
- # The exact download times might change based on the network connection
- # running this script, but only by a few seconds
- # There is also about a second of python overhead
- CONSENSUS_DOWNLOAD_SPEED_MAX = 15.0
- # If the relay fails a consensus check, retry the download
- # This avoids delisting a relay due to transient network conditions
- CONSENSUS_DOWNLOAD_RETRY = True
- ## Parsing Functions
- def parse_ts(t):
- return datetime.datetime.strptime(t, "%Y-%m-%d %H:%M:%S")
- def remove_bad_chars(raw_string, bad_char_list):
- # Remove each character in the bad_char_list
- cleansed_string = raw_string
- for c in bad_char_list:
- cleansed_string = cleansed_string.replace(c, '')
- return cleansed_string
- def cleanse_unprintable(raw_string):
- # Remove all unprintable characters
- cleansed_string = ''
- for c in raw_string:
- if c in string.printable:
- cleansed_string += c
- return cleansed_string
- def cleanse_whitespace(raw_string):
- # Replace all whitespace characters with a space
- cleansed_string = raw_string
- for c in string.whitespace:
- cleansed_string = cleansed_string.replace(c, ' ')
- return cleansed_string
- def cleanse_c_multiline_comment(raw_string):
- cleansed_string = raw_string
- # Embedded newlines should be removed by tor/onionoo, but let's be paranoid
- cleansed_string = cleanse_whitespace(cleansed_string)
- # ContactInfo and Version can be arbitrary binary data
- cleansed_string = cleanse_unprintable(cleansed_string)
- # Prevent a malicious / unanticipated string from breaking out
- # of a C-style multiline comment
- # This removes '/*' and '*/' and '//'
- bad_char_list = '*/'
- # Prevent a malicious string from using C nulls
- bad_char_list += '\0'
- # Avoid confusing parsers by making sure there is only one comma per fallback
- bad_char_list += ','
- # Avoid confusing parsers by making sure there is only one equals per field
- bad_char_list += '='
- # Be safer by removing bad characters entirely
- cleansed_string = remove_bad_chars(cleansed_string, bad_char_list)
- # Some compilers may further process the content of comments
- # There isn't much we can do to cover every possible case
- # But comment-based directives are typically only advisory
- return cleansed_string
- def cleanse_c_string(raw_string):
- cleansed_string = raw_string
- # Embedded newlines should be removed by tor/onionoo, but let's be paranoid
- cleansed_string = cleanse_whitespace(cleansed_string)
- # ContactInfo and Version can be arbitrary binary data
- cleansed_string = cleanse_unprintable(cleansed_string)
- # Prevent a malicious address/fingerprint string from breaking out
- # of a C-style string
- bad_char_list = '"'
- # Prevent a malicious string from using escapes
- bad_char_list += '\\'
- # Prevent a malicious string from using C nulls
- bad_char_list += '\0'
- # Avoid confusing parsers by making sure there is only one comma per fallback
- bad_char_list += ','
- # Avoid confusing parsers by making sure there is only one equals per field
- bad_char_list += '='
- # Be safer by removing bad characters entirely
- cleansed_string = remove_bad_chars(cleansed_string, bad_char_list)
- # Some compilers may further process the content of strings
- # There isn't much we can do to cover every possible case
- # But this typically only results in changes to the string data
- return cleansed_string
- ## OnionOO Source Functions
- # a dictionary of source metadata for each onionoo query we've made
- fetch_source = {}
- # register source metadata for 'what'
- # assumes we only retrieve one document for each 'what'
- def register_fetch_source(what, url, relays_published, version):
- fetch_source[what] = {}
- fetch_source[what]['url'] = url
- fetch_source[what]['relays_published'] = relays_published
- fetch_source[what]['version'] = version
- # list each registered source's 'what'
- def fetch_source_list():
- return sorted(fetch_source.keys())
- # given 'what', provide a multiline C comment describing the source
- def describe_fetch_source(what):
- desc = '/*'
- desc += '\n'
- desc += 'Onionoo Source: '
- desc += cleanse_c_multiline_comment(what)
- desc += ' Date: '
- desc += cleanse_c_multiline_comment(fetch_source[what]['relays_published'])
- desc += ' Version: '
- desc += cleanse_c_multiline_comment(fetch_source[what]['version'])
- desc += '\n'
- desc += 'URL: '
- desc += cleanse_c_multiline_comment(fetch_source[what]['url'])
- desc += '\n'
- desc += '*/'
- return desc
- ## File Processing Functions
- def write_to_file(str, file_name, max_len):
- try:
- with open(file_name, 'w') as f:
- f.write(str[0:max_len])
- except EnvironmentError, error:
- logging.error('Writing file %s failed: %d: %s'%
- (file_name,
- error.errno,
- error.strerror)
- )
- def read_from_file(file_name, max_len):
- try:
- if os.path.isfile(file_name):
- with open(file_name, 'r') as f:
- return f.read(max_len)
- except EnvironmentError, error:
- logging.info('Loading file %s failed: %d: %s'%
- (file_name,
- error.errno,
- error.strerror)
- )
- return None
- def parse_fallback_file(file_name):
- file_data = read_from_file(file_name, MAX_LIST_FILE_SIZE)
- file_data = cleanse_unprintable(file_data)
- file_data = remove_bad_chars(file_data, '\n"\0')
- file_data = re.sub('/\*.*?\*/', '', file_data)
- file_data = file_data.replace(',', '\n')
- file_data = file_data.replace(' weight=10', '')
- return file_data
- def load_possibly_compressed_response_json(response):
- if response.info().get('Content-Encoding') == 'gzip':
- buf = StringIO.StringIO( response.read() )
- f = gzip.GzipFile(fileobj=buf)
- return json.load(f)
- else:
- return json.load(response)
- def load_json_from_file(json_file_name):
- # An exception here may be resolved by deleting the .last_modified
- # and .json files, and re-running the script
- try:
- with open(json_file_name, 'r') as f:
- return json.load(f)
- except EnvironmentError, error:
- raise Exception('Reading not-modified json file %s failed: %d: %s'%
- (json_file_name,
- error.errno,
- error.strerror)
- )
- ## OnionOO Functions
- def datestr_to_datetime(datestr):
- # Parse datetimes like: Fri, 02 Oct 2015 13:34:14 GMT
- if datestr is not None:
- dt = dateutil.parser.parse(datestr)
- else:
- # Never modified - use start of epoch
- dt = datetime.datetime.utcfromtimestamp(0)
- # strip any timezone out (in case they're supported in future)
- dt = dt.replace(tzinfo=None)
- return dt
- def onionoo_fetch(what, **kwargs):
- params = kwargs
- params['type'] = 'relay'
- #params['limit'] = 10
- params['first_seen_days'] = '%d-'%(ADDRESS_AND_PORT_STABLE_DAYS)
- params['last_seen_days'] = '-%d'%(MAX_DOWNTIME_DAYS)
- params['flag'] = 'V2Dir'
- url = ONIONOO + what + '?' + urllib.urlencode(params)
- # Unfortunately, the URL is too long for some OS filenames,
- # but we still don't want to get files from different URLs mixed up
- base_file_name = what + '-' + hashlib.sha1(url).hexdigest()
- full_url_file_name = base_file_name + '.full_url'
- MAX_FULL_URL_LENGTH = 1024
- last_modified_file_name = base_file_name + '.last_modified'
- MAX_LAST_MODIFIED_LENGTH = 64
- json_file_name = base_file_name + '.json'
- if LOCAL_FILES_ONLY:
- # Read from the local file, don't write to anything
- response_json = load_json_from_file(json_file_name)
- else:
- # store the full URL to a file for debugging
- # no need to compare as long as you trust SHA-1
- write_to_file(url, full_url_file_name, MAX_FULL_URL_LENGTH)
- request = urllib2.Request(url)
- request.add_header('Accept-encoding', 'gzip')
- # load the last modified date from the file, if it exists
- last_mod_date = read_from_file(last_modified_file_name,
- MAX_LAST_MODIFIED_LENGTH)
- if last_mod_date is not None:
- request.add_header('If-modified-since', last_mod_date)
- # Parse last modified date
- last_mod = datestr_to_datetime(last_mod_date)
- # Not Modified and still recent enough to be useful
- # Onionoo / Globe used to use 6 hours, but we can afford a day
- required_freshness = datetime.datetime.utcnow()
- # strip any timezone out (to match dateutil.parser)
- required_freshness = required_freshness.replace(tzinfo=None)
- required_freshness -= datetime.timedelta(hours=24)
- # Make the OnionOO request
- response_code = 0
- try:
- response = urllib2.urlopen(request)
- response_code = response.getcode()
- except urllib2.HTTPError, error:
- response_code = error.code
- if response_code == 304: # not modified
- pass
- else:
- raise Exception("Could not get " + url + ": "
- + str(error.code) + ": " + error.reason)
- if response_code == 200: # OK
- last_mod = datestr_to_datetime(response.info().get('Last-Modified'))
- # Check for freshness
- if last_mod < required_freshness:
- if last_mod_date is not None:
- # This check sometimes fails transiently, retry the script if it does
- date_message = "Outdated data: last updated " + last_mod_date
- else:
- date_message = "No data: never downloaded "
- raise Exception(date_message + " from " + url)
- # Process the data
- if response_code == 200: # OK
- response_json = load_possibly_compressed_response_json(response)
- with open(json_file_name, 'w') as f:
- # use the most compact json representation to save space
- json.dump(response_json, f, separators=(',',':'))
- # store the last modified date in its own file
- if response.info().get('Last-modified') is not None:
- write_to_file(response.info().get('Last-Modified'),
- last_modified_file_name,
- MAX_LAST_MODIFIED_LENGTH)
- elif response_code == 304: # Not Modified
- response_json = load_json_from_file(json_file_name)
- else: # Unexpected HTTP response code not covered in the HTTPError above
- raise Exception("Unexpected HTTP response code to " + url + ": "
- + str(response_code))
- register_fetch_source(what,
- url,
- response_json['relays_published'],
- response_json['version'])
- return response_json
- def fetch(what, **kwargs):
- #x = onionoo_fetch(what, **kwargs)
- # don't use sort_keys, as the order of or_addresses is significant
- #print json.dumps(x, indent=4, separators=(',', ': '))
- #sys.exit(0)
- return onionoo_fetch(what, **kwargs)
- ## Fallback Candidate Class
- class Candidate(object):
- CUTOFF_ADDRESS_AND_PORT_STABLE = (datetime.datetime.utcnow()
- - datetime.timedelta(ADDRESS_AND_PORT_STABLE_DAYS))
- def __init__(self, details):
- for f in ['fingerprint', 'nickname', 'last_changed_address_or_port',
- 'consensus_weight', 'or_addresses', 'dir_address']:
- if not f in details: raise Exception("Document has no %s field."%(f,))
- if not 'contact' in details:
- details['contact'] = None
- if not 'flags' in details or details['flags'] is None:
- details['flags'] = []
- if (not 'advertised_bandwidth' in details
- or details['advertised_bandwidth'] is None):
- # relays without advertised bandwidth have it calculated from their
- # consensus weight
- details['advertised_bandwidth'] = 0
- if (not 'effective_family' in details
- or details['effective_family'] is None):
- details['effective_family'] = []
- if not 'platform' in details:
- details['platform'] = None
- details['last_changed_address_or_port'] = parse_ts(
- details['last_changed_address_or_port'])
- self._data = details
- self._stable_sort_or_addresses()
- self._fpr = self._data['fingerprint']
- self._running = self._guard = self._v2dir = 0.
- self._split_dirport()
- self._compute_orport()
- if self.orport is None:
- raise Exception("Failed to get an orport for %s."%(self._fpr,))
- self._compute_ipv6addr()
- if not self.has_ipv6():
- logging.debug("Failed to get an ipv6 address for %s."%(self._fpr,))
- self._compute_version()
- self._extra_info_cache = None
- def _stable_sort_or_addresses(self):
- # replace self._data['or_addresses'] with a stable ordering,
- # sorting the secondary addresses in string order
- # leave the received order in self._data['or_addresses_raw']
- self._data['or_addresses_raw'] = self._data['or_addresses']
- or_address_primary = self._data['or_addresses'][:1]
- # subsequent entries in the or_addresses array are in an arbitrary order
- # so we stabilise the addresses by sorting them in string order
- or_addresses_secondaries_stable = sorted(self._data['or_addresses'][1:])
- or_addresses_stable = or_address_primary + or_addresses_secondaries_stable
- self._data['or_addresses'] = or_addresses_stable
- def get_fingerprint(self):
- return self._fpr
- # is_valid_ipv[46]_address by gsathya, karsten, 2013
- @staticmethod
- def is_valid_ipv4_address(address):
- if not isinstance(address, (str, unicode)):
- return False
- # check if there are four period separated values
- if address.count(".") != 3:
- return False
- # checks that each value in the octet are decimal values between 0-255
- for entry in address.split("."):
- if not entry.isdigit() or int(entry) < 0 or int(entry) > 255:
- return False
- elif entry[0] == "0" and len(entry) > 1:
- return False # leading zeros, for instance in "1.2.3.001"
- return True
- @staticmethod
- def is_valid_ipv6_address(address):
- if not isinstance(address, (str, unicode)):
- return False
- # remove brackets
- address = address[1:-1]
- # addresses are made up of eight colon separated groups of four hex digits
- # with leading zeros being optional
- # https://en.wikipedia.org/wiki/IPv6#Address_format
- colon_count = address.count(":")
- if colon_count > 7:
- return False # too many groups
- elif colon_count != 7 and not "::" in address:
- return False # not enough groups and none are collapsed
- elif address.count("::") > 1 or ":::" in address:
- return False # multiple groupings of zeros can't be collapsed
- found_ipv4_on_previous_entry = False
- for entry in address.split(":"):
- # If an IPv6 address has an embedded IPv4 address,
- # it must be the last entry
- if found_ipv4_on_previous_entry:
- return False
- if not re.match("^[0-9a-fA-f]{0,4}$", entry):
- if not Candidate.is_valid_ipv4_address(entry):
- return False
- else:
- found_ipv4_on_previous_entry = True
- return True
- def _split_dirport(self):
- # Split the dir_address into dirip and dirport
- (self.dirip, _dirport) = self._data['dir_address'].split(':', 2)
- self.dirport = int(_dirport)
- def _compute_orport(self):
- # Choose the first ORPort that's on the same IPv4 address as the DirPort.
- # In rare circumstances, this might not be the primary ORPort address.
- # However, _stable_sort_or_addresses() ensures we choose the same one
- # every time, even if onionoo changes the order of the secondaries.
- self._split_dirport()
- self.orport = None
- for i in self._data['or_addresses']:
- if i != self._data['or_addresses'][0]:
- logging.debug('Secondary IPv4 Address Used for %s: %s'%(self._fpr, i))
- (ipaddr, port) = i.rsplit(':', 1)
- if (ipaddr == self.dirip) and Candidate.is_valid_ipv4_address(ipaddr):
- self.orport = int(port)
- return
- def _compute_ipv6addr(self):
- # Choose the first IPv6 address that uses the same port as the ORPort
- # Or, choose the first IPv6 address in the list
- # _stable_sort_or_addresses() ensures we choose the same IPv6 address
- # every time, even if onionoo changes the order of the secondaries.
- self.ipv6addr = None
- self.ipv6orport = None
- # Choose the first IPv6 address that uses the same port as the ORPort
- for i in self._data['or_addresses']:
- (ipaddr, port) = i.rsplit(':', 1)
- if (port == self.orport) and Candidate.is_valid_ipv6_address(ipaddr):
- self.ipv6addr = ipaddr
- self.ipv6orport = int(port)
- return
- # Choose the first IPv6 address in the list
- for i in self._data['or_addresses']:
- (ipaddr, port) = i.rsplit(':', 1)
- if Candidate.is_valid_ipv6_address(ipaddr):
- self.ipv6addr = ipaddr
- self.ipv6orport = int(port)
- return
- def _compute_version(self):
- # parse the version out of the platform string
- # The platform looks like: "Tor 0.2.7.6 on Linux"
- self._data['version'] = None
- if self._data['platform'] is None:
- return
- # be tolerant of weird whitespacing, use a whitespace split
- tokens = self._data['platform'].split()
- for token in tokens:
- vnums = token.split('.')
- # if it's at least a.b.c.d, with potentially an -alpha-dev, -alpha, -rc
- if (len(vnums) >= 4 and vnums[0].isdigit() and vnums[1].isdigit() and
- vnums[2].isdigit()):
- self._data['version'] = token
- return
- # From #20509
- # bug #20499 affects versions from 0.2.9.1-alpha-dev to 0.2.9.4-alpha-dev
- # and version 0.3.0.0-alpha-dev
- # Exhaustive lists are hard to get wrong
- STALE_CONSENSUS_VERSIONS = ['0.2.9.1-alpha-dev',
- '0.2.9.2-alpha',
- '0.2.9.2-alpha-dev',
- '0.2.9.3-alpha',
- '0.2.9.3-alpha-dev',
- '0.2.9.4-alpha',
- '0.2.9.4-alpha-dev',
- '0.3.0.0-alpha-dev'
- ]
- def is_valid_version(self):
- # call _compute_version before calling this
- # is the version of the relay a version we want as a fallback?
- # checks both recommended versions and bug #20499 / #20509
- #
- # if the relay doesn't have a recommended version field, exclude the relay
- if not self._data.has_key('recommended_version'):
- log_excluded('%s not a candidate: no recommended_version field',
- self._fpr)
- return False
- if not self._data['recommended_version']:
- log_excluded('%s not a candidate: version not recommended', self._fpr)
- return False
- # if the relay doesn't have version field, exclude the relay
- if not self._data.has_key('version'):
- log_excluded('%s not a candidate: no version field', self._fpr)
- return False
- if self._data['version'] in Candidate.STALE_CONSENSUS_VERSIONS:
- logging.warning('%s not a candidate: version delivers stale consensuses',
- self._fpr)
- return False
- return True
- @staticmethod
- def _extract_generic_history(history, which='unknown'):
- # given a tree like this:
- # {
- # "1_month": {
- # "count": 187,
- # "factor": 0.001001001001001001,
- # "first": "2015-02-27 06:00:00",
- # "interval": 14400,
- # "last": "2015-03-30 06:00:00",
- # "values": [
- # 999,
- # 999
- # ]
- # },
- # "1_week": {
- # "count": 169,
- # "factor": 0.001001001001001001,
- # "first": "2015-03-23 07:30:00",
- # "interval": 3600,
- # "last": "2015-03-30 07:30:00",
- # "values": [ ...]
- # },
- # "1_year": {
- # "count": 177,
- # "factor": 0.001001001001001001,
- # "first": "2014-04-11 00:00:00",
- # "interval": 172800,
- # "last": "2015-03-29 00:00:00",
- # "values": [ ...]
- # },
- # "3_months": {
- # "count": 185,
- # "factor": 0.001001001001001001,
- # "first": "2014-12-28 06:00:00",
- # "interval": 43200,
- # "last": "2015-03-30 06:00:00",
- # "values": [ ...]
- # }
- # },
- # extract exactly one piece of data per time interval,
- # using smaller intervals where available.
- #
- # returns list of (age, length, value) dictionaries.
- generic_history = []
- periods = history.keys()
- periods.sort(key = lambda x: history[x]['interval'])
- now = datetime.datetime.utcnow()
- newest = now
- for p in periods:
- h = history[p]
- interval = datetime.timedelta(seconds = h['interval'])
- this_ts = parse_ts(h['last'])
- if (len(h['values']) != h['count']):
- logging.warning('Inconsistent value count in %s document for %s'
- %(p, which))
- for v in reversed(h['values']):
- if (this_ts <= newest):
- agt1 = now - this_ts
- agt2 = interval
- agetmp1 = (agt1.microseconds + (agt1.seconds + agt1.days * 24 * 3600)
- * 10**6) / 10**6
- agetmp2 = (agt2.microseconds + (agt2.seconds + agt2.days * 24 * 3600)
- * 10**6) / 10**6
- generic_history.append(
- { 'age': agetmp1,
- 'length': agetmp2,
- 'value': v
- })
- newest = this_ts
- this_ts -= interval
- if (this_ts + interval != parse_ts(h['first'])):
- logging.warning('Inconsistent time information in %s document for %s'
- %(p, which))
- #print json.dumps(generic_history, sort_keys=True,
- # indent=4, separators=(',', ': '))
- return generic_history
- @staticmethod
- def _avg_generic_history(generic_history):
- a = []
- for i in generic_history:
- if i['age'] > (ADDRESS_AND_PORT_STABLE_DAYS * 24 * 3600):
- continue
- if (i['length'] is not None
- and i['age'] is not None
- and i['value'] is not None):
- w = i['length'] * math.pow(AGE_ALPHA, i['age']/(3600*24))
- a.append( (i['value'] * w, w) )
- sv = math.fsum(map(lambda x: x[0], a))
- sw = math.fsum(map(lambda x: x[1], a))
- if sw == 0.0:
- svw = 0.0
- else:
- svw = sv/sw
- return svw
- def _add_generic_history(self, history):
- periods = r['read_history'].keys()
- periods.sort(key = lambda x: r['read_history'][x]['interval'] )
- print periods
- def add_running_history(self, history):
- pass
- def add_uptime(self, uptime):
- logging.debug('Adding uptime %s.'%(self._fpr,))
- # flags we care about: Running, V2Dir, Guard
- if not 'flags' in uptime:
- logging.debug('No flags in document for %s.'%(self._fpr,))
- return
- for f in ['Running', 'Guard', 'V2Dir']:
- if not f in uptime['flags']:
- logging.debug('No %s in flags for %s.'%(f, self._fpr,))
- return
- running = self._extract_generic_history(uptime['flags']['Running'],
- '%s-Running'%(self._fpr))
- guard = self._extract_generic_history(uptime['flags']['Guard'],
- '%s-Guard'%(self._fpr))
- v2dir = self._extract_generic_history(uptime['flags']['V2Dir'],
- '%s-V2Dir'%(self._fpr))
- if 'BadExit' in uptime['flags']:
- badexit = self._extract_generic_history(uptime['flags']['BadExit'],
- '%s-BadExit'%(self._fpr))
- self._running = self._avg_generic_history(running) / ONIONOO_SCALE_ONE
- self._guard = self._avg_generic_history(guard) / ONIONOO_SCALE_ONE
- self._v2dir = self._avg_generic_history(v2dir) / ONIONOO_SCALE_ONE
- self._badexit = None
- if 'BadExit' in uptime['flags']:
- self._badexit = self._avg_generic_history(badexit) / ONIONOO_SCALE_ONE
- def is_candidate(self):
- try:
- if (MUST_BE_RUNNING_NOW and not self.is_running()):
- log_excluded('%s not a candidate: not running now, unable to check ' +
- 'DirPort consensus download', self._fpr)
- return False
- if (self._data['last_changed_address_or_port'] >
- self.CUTOFF_ADDRESS_AND_PORT_STABLE):
- log_excluded('%s not a candidate: changed address/port recently (%s)',
- self._fpr, self._data['last_changed_address_or_port'])
- return False
- if self._running < CUTOFF_RUNNING:
- log_excluded('%s not a candidate: running avg too low (%lf)',
- self._fpr, self._running)
- return False
- if self._v2dir < CUTOFF_V2DIR:
- log_excluded('%s not a candidate: v2dir avg too low (%lf)',
- self._fpr, self._v2dir)
- return False
- if self._badexit is not None and self._badexit > PERMITTED_BADEXIT:
- log_excluded('%s not a candidate: badexit avg too high (%lf)',
- self._fpr, self._badexit)
- return False
- # this function logs a message depending on which check fails
- if not self.is_valid_version():
- return False
- if self._guard < CUTOFF_GUARD:
- log_excluded('%s not a candidate: guard avg too low (%lf)',
- self._fpr, self._guard)
- return False
- if (not self._data.has_key('consensus_weight')
- or self._data['consensus_weight'] < 1):
- log_excluded('%s not a candidate: consensus weight invalid', self._fpr)
- return False
- except BaseException as e:
- logging.warning("Exception %s when checking if fallback is a candidate",
- str(e))
- return False
- return True
- def is_in_whitelist(self, relaylist):
- """ A fallback matches if each key in the whitelist line matches:
- ipv4
- dirport
- orport
- id
- ipv6 address and port (if present)
- If the fallback has an ipv6 key, the whitelist line must also have
- it, and vice versa, otherwise they don't match. """
- ipv6 = None
- if self.has_ipv6():
- ipv6 = '%s:%d'%(self.ipv6addr, self.ipv6orport)
- for entry in relaylist:
- if entry['id'] != self._fpr:
- # can't log here unless we match an IP and port, because every relay's
- # fingerprint is compared to every entry's fingerprint
- if entry['ipv4'] == self.dirip and int(entry['orport']) == self.orport:
- logging.warning('%s excluded: has OR %s:%d changed fingerprint to ' +
- '%s?', entry['id'], self.dirip, self.orport,
- self._fpr)
- if self.has_ipv6() and entry.has_key('ipv6') and entry['ipv6'] == ipv6:
- logging.warning('%s excluded: has OR %s changed fingerprint to ' +
- '%s?', entry['id'], ipv6, self._fpr)
- continue
- if entry['ipv4'] != self.dirip:
- logging.warning('%s excluded: has it changed IPv4 from %s to %s?',
- self._fpr, entry['ipv4'], self.dirip)
- continue
- if int(entry['dirport']) != self.dirport:
- logging.warning('%s excluded: has it changed DirPort from %s:%d to ' +
- '%s:%d?', self._fpr, self.dirip, int(entry['dirport']),
- self.dirip, self.dirport)
- continue
- if int(entry['orport']) != self.orport:
- logging.warning('%s excluded: has it changed ORPort from %s:%d to ' +
- '%s:%d?', self._fpr, self.dirip, int(entry['orport']),
- self.dirip, self.orport)
- continue
- if entry.has_key('ipv6') and self.has_ipv6():
- # if both entry and fallback have an ipv6 address, compare them
- if entry['ipv6'] != ipv6:
- logging.warning('%s excluded: has it changed IPv6 ORPort from %s ' +
- 'to %s?', self._fpr, entry['ipv6'], ipv6)
- continue
- # if the fallback has an IPv6 address but the whitelist entry
- # doesn't, or vice versa, the whitelist entry doesn't match
- elif entry.has_key('ipv6') and not self.has_ipv6():
- logging.warning('%s excluded: has it lost its former IPv6 address %s?',
- self._fpr, entry['ipv6'])
- continue
- elif not entry.has_key('ipv6') and self.has_ipv6():
- logging.warning('%s excluded: has it gained an IPv6 address %s?',
- self._fpr, ipv6)
- continue
- return True
- return False
- def cw_to_bw_factor(self):
- # any relays with a missing or zero consensus weight are not candidates
- # any relays with a missing advertised bandwidth have it set to zero
- return self._data['advertised_bandwidth'] / self._data['consensus_weight']
- # since advertised_bandwidth is reported by the relay, it can be gamed
- # to avoid this, use the median consensus weight to bandwidth factor to
- # estimate this relay's measured bandwidth, and make that the upper limit
- def measured_bandwidth(self, median_cw_to_bw_factor):
- cw_to_bw= median_cw_to_bw_factor
- # Reduce exit bandwidth to make sure we're not overloading them
- if self.is_exit():
- cw_to_bw *= EXIT_BANDWIDTH_FRACTION
- measured_bandwidth = self._data['consensus_weight'] * cw_to_bw
- if self._data['advertised_bandwidth'] != 0:
- # limit advertised bandwidth (if available) to measured bandwidth
- return min(measured_bandwidth, self._data['advertised_bandwidth'])
- else:
- return measured_bandwidth
- def set_measured_bandwidth(self, median_cw_to_bw_factor):
- self._data['measured_bandwidth'] = self.measured_bandwidth(
- median_cw_to_bw_factor)
- def is_exit(self):
- return 'Exit' in self._data['flags']
- def is_guard(self):
- return 'Guard' in self._data['flags']
- def is_running(self):
- return 'Running' in self._data['flags']
- # does this fallback have an IPv6 address and orport?
- def has_ipv6(self):
- return self.ipv6addr is not None and self.ipv6orport is not None
- # strip leading and trailing brackets from an IPv6 address
- # safe to use on non-bracketed IPv6 and on IPv4 addresses
- # also convert to unicode, and make None appear as ''
- @staticmethod
- def strip_ipv6_brackets(ip):
- if ip is None:
- return unicode('')
- if len(ip) < 2:
- return unicode(ip)
- if ip[0] == '[' and ip[-1] == ']':
- return unicode(ip[1:-1])
- return unicode(ip)
- # are ip_a and ip_b in the same netblock?
- # mask_bits is the size of the netblock
- # takes both IPv4 and IPv6 addresses
- # the versions of ip_a and ip_b must be the same
- # the mask must be valid for the IP version
- @staticmethod
- def netblocks_equal(ip_a, ip_b, mask_bits):
- if ip_a is None or ip_b is None:
- return False
- ip_a = Candidate.strip_ipv6_brackets(ip_a)
- ip_b = Candidate.strip_ipv6_brackets(ip_b)
- a = ipaddress.ip_address(ip_a)
- b = ipaddress.ip_address(ip_b)
- if a.version != b.version:
- raise Exception('Mismatching IP versions in %s and %s'%(ip_a, ip_b))
- if mask_bits > a.max_prefixlen:
- logging.error('Bad IP mask %d for %s and %s'%(mask_bits, ip_a, ip_b))
- mask_bits = a.max_prefixlen
- if mask_bits < 0:
- logging.error('Bad IP mask %d for %s and %s'%(mask_bits, ip_a, ip_b))
- mask_bits = 0
- a_net = ipaddress.ip_network('%s/%d'%(ip_a, mask_bits), strict=False)
- return b in a_net
- # is this fallback's IPv4 address (dirip) in the same netblock as other's
- # IPv4 address?
- # mask_bits is the size of the netblock
- def ipv4_netblocks_equal(self, other, mask_bits):
- return Candidate.netblocks_equal(self.dirip, other.dirip, mask_bits)
- # is this fallback's IPv6 address (ipv6addr) in the same netblock as
- # other's IPv6 address?
- # Returns False if either fallback has no IPv6 address
- # mask_bits is the size of the netblock
- def ipv6_netblocks_equal(self, other, mask_bits):
- if not self.has_ipv6() or not other.has_ipv6():
- return False
- return Candidate.netblocks_equal(self.ipv6addr, other.ipv6addr, mask_bits)
- # is this fallback's IPv4 DirPort the same as other's IPv4 DirPort?
- def dirport_equal(self, other):
- return self.dirport == other.dirport
- # is this fallback's IPv4 ORPort the same as other's IPv4 ORPort?
- def ipv4_orport_equal(self, other):
- return self.orport == other.orport
- # is this fallback's IPv6 ORPort the same as other's IPv6 ORPort?
- # Returns False if either fallback has no IPv6 address
- def ipv6_orport_equal(self, other):
- if not self.has_ipv6() or not other.has_ipv6():
- return False
- return self.ipv6orport == other.ipv6orport
- # does this fallback have the same DirPort, IPv4 ORPort, or
- # IPv6 ORPort as other?
- # Ignores IPv6 ORPort if either fallback has no IPv6 address
- def port_equal(self, other):
- return (self.dirport_equal(other) or self.ipv4_orport_equal(other)
- or self.ipv6_orport_equal(other))
- # return a list containing IPv4 ORPort, DirPort, and IPv6 ORPort (if present)
- def port_list(self):
- ports = [self.dirport, self.orport]
- if self.has_ipv6() and not self.ipv6orport in ports:
- ports.append(self.ipv6orport)
- return ports
- # does this fallback share a port with other, regardless of whether the
- # port types match?
- # For example, if self's IPv4 ORPort is 80 and other's DirPort is 80,
- # return True
- def port_shared(self, other):
- for p in self.port_list():
- if p in other.port_list():
- return True
- return False
- # log how long it takes to download a consensus from dirip:dirport
- # returns True if the download failed, False if it succeeded within max_time
- @staticmethod
- def fallback_consensus_download_speed(dirip, dirport, nickname, fingerprint,
- max_time):
- download_failed = False
- # some directory mirrors respond to requests in ways that hang python
- # sockets, which is why we log this line here
- logging.info('Initiating %sconsensus download from %s (%s:%d) %s.',
- 'microdesc ' if DOWNLOAD_MICRODESC_CONSENSUS else '',
- nickname, dirip, dirport, fingerprint)
- # there appears to be about 1 second of overhead when comparing stem's
- # internal trace time and the elapsed time calculated here
- TIMEOUT_SLOP = 1.0
- start = datetime.datetime.utcnow()
- try:
- consensus = get_consensus(
- endpoints = [(dirip, dirport)],
- timeout = (max_time + TIMEOUT_SLOP),
- validate = True,
- retries = 0,
- fall_back_to_authority = False,
- document_handler = DocumentHandler.BARE_DOCUMENT,
- microdescriptor = DOWNLOAD_MICRODESC_CONSENSUS
- ).run()[0]
- end = datetime.datetime.utcnow()
- time_since_expiry = (end - consensus.valid_until).total_seconds()
- except Exception, stem_error:
- end = datetime.datetime.utcnow()
- log_excluded('Unable to retrieve a consensus from %s: %s', nickname,
- stem_error)
- status = 'error: "%s"' % (stem_error)
- level = logging.WARNING
- download_failed = True
- elapsed = (end - start).total_seconds()
- if download_failed:
- # keep the error failure status, and avoid using the variables
- pass
- elif elapsed > max_time:
- status = 'too slow'
- level = logging.WARNING
- download_failed = True
- elif (time_since_expiry > 0):
- status = 'outdated consensus, expired %ds ago'%(int(time_since_expiry))
- if time_since_expiry <= CONSENSUS_EXPIRY_TOLERANCE:
- status += ', tolerating up to %ds'%(CONSENSUS_EXPIRY_TOLERANCE)
- level = logging.INFO
- else:
- status += ', invalid'
- level = logging.WARNING
- download_failed = True
- else:
- status = 'ok'
- level = logging.DEBUG
- logging.log(level, 'Consensus download: %0.1fs %s from %s (%s:%d) %s, ' +
- 'max download time %0.1fs.', elapsed, status, nickname,
- dirip, dirport, fingerprint, max_time)
- return download_failed
- # does this fallback download the consensus fast enough?
- def check_fallback_download_consensus(self):
- # include the relay if we're not doing a check, or we can't check (IPv6)
- ipv4_failed = False
- ipv6_failed = False
- if PERFORM_IPV4_DIRPORT_CHECKS:
- ipv4_failed = Candidate.fallback_consensus_download_speed(self.dirip,
- self.dirport,
- self._data['nickname'],
- self._fpr,
- CONSENSUS_DOWNLOAD_SPEED_MAX)
- if self.has_ipv6() and PERFORM_IPV6_DIRPORT_CHECKS:
- # Clients assume the IPv6 DirPort is the same as the IPv4 DirPort
- ipv6_failed = Candidate.fallback_consensus_download_speed(self.ipv6addr,
- self.dirport,
- self._data['nickname'],
- self._fpr,
- CONSENSUS_DOWNLOAD_SPEED_MAX)
- return ((not ipv4_failed) and (not ipv6_failed))
- # if this fallback has not passed a download check, try it again,
- # and record the result, available in get_fallback_download_consensus
- def try_fallback_download_consensus(self):
- if not self.get_fallback_download_consensus():
- self._data['download_check'] = self.check_fallback_download_consensus()
- # did this fallback pass the download check?
- def get_fallback_download_consensus(self):
- # if we're not performing checks, return True
- if not PERFORM_IPV4_DIRPORT_CHECKS and not PERFORM_IPV6_DIRPORT_CHECKS:
- return True
- # if we are performing checks, but haven't done one, return False
- if not self._data.has_key('download_check'):
- return False
- return self._data['download_check']
- # output an optional header comment and info for this fallback
- # try_fallback_download_consensus before calling this
- def fallbackdir_line(self, fallbacks, prefilter_fallbacks):
- s = ''
- if OUTPUT_COMMENTS:
- s += self.fallbackdir_comment(fallbacks, prefilter_fallbacks)
- # if the download speed is ok, output a C string
- # if it's not, but we OUTPUT_COMMENTS, output a commented-out C string
- if self.get_fallback_download_consensus() or OUTPUT_COMMENTS:
- s += self.fallbackdir_info(self.get_fallback_download_consensus())
- return s
- # output a header comment for this fallback
- def fallbackdir_comment(self, fallbacks, prefilter_fallbacks):
- # /*
- # nickname
- # flags
- # adjusted bandwidth, consensus weight
- # [contact]
- # [identical contact counts]
- # */
- # Multiline C comment
- s = '/*'
- s += '\n'
- s += cleanse_c_multiline_comment(self._data['nickname'])
- s += '\n'
- s += 'Flags: '
- s += cleanse_c_multiline_comment(' '.join(sorted(self._data['flags'])))
- s += '\n'
- # this is an adjusted bandwidth, see calculate_measured_bandwidth()
- bandwidth = self._data['measured_bandwidth']
- weight = self._data['consensus_weight']
- s += 'Bandwidth: %.1f MByte/s, Consensus Weight: %d'%(
- bandwidth/(1024.0*1024.0),
- weight)
- s += '\n'
- if self._data['contact'] is not None:
- s += cleanse_c_multiline_comment(self._data['contact'])
- if CONTACT_COUNT:
- fallback_count = len([f for f in fallbacks
- if f._data['contact'] == self._data['contact']])
- if fallback_count > 1:
- s += '\n'
- s += '%d identical contacts listed' % (fallback_count)
- # output the fallback info C string for this fallback
- # this is the text that would go after FallbackDir in a torrc
- # if this relay failed the download test and we OUTPUT_COMMENTS,
- # comment-out the returned string
- def fallbackdir_info(self, dl_speed_ok):
- # "address:dirport orport=port id=fingerprint"
- # (insert additional madatory fields here)
- # "[ipv6=addr:orport]"
- # (insert additional optional fields here)
- # /* nickname=name */
- # /* extrainfo={0,1} */
- # (insert additional comment fields here)
- # /* ===== */
- # ,
- #
- # Do we want a C string, or a commented-out string?
- c_string = dl_speed_ok
- comment_string = not dl_speed_ok and OUTPUT_COMMENTS
- # If we don't want either kind of string, bail
- if not c_string and not comment_string:
- return ''
- s = ''
- # Comment out the fallback directory entry if it's too slow
- # See the debug output for which address and port is failing
- if comment_string:
- s += '/* Consensus download failed or was too slow:\n'
- # Multi-Line C string with trailing comma (part of a string list)
- # This makes it easier to diff the file, and remove IPv6 lines using grep
- # Integers don't need escaping
- s += '"%s orport=%d id=%s"'%(
- cleanse_c_string(self._data['dir_address']),
- self.orport,
- cleanse_c_string(self._fpr))
- s += '\n'
- # (insert additional madatory fields here)
- if self.has_ipv6():
- s += '" ipv6=%s:%d"'%(cleanse_c_string(self.ipv6addr), self.ipv6orport)
- s += '\n'
- # (insert additional optional fields here)
- if not comment_string:
- s += '/* '
- s += 'nickname=%s'%(cleanse_c_string(self._data['nickname']))
- if not comment_string:
- s += ' */'
- s += '\n'
- # if we know that the fallback is an extrainfo cache, flag it
- # and if we don't know, assume it is not
- if not comment_string:
- s += '/* '
- s += 'extrainfo=%d'%(1 if self._extra_info_cache else 0)
- if not comment_string:
- s += ' */'
- s += '\n'
- # (insert additional comment fields here)
- # The terminator and comma must be the last line in each fallback entry
- if not comment_string:
- s += '/* '
- s += SECTION_SEPARATOR_BASE
- if not comment_string:
- s += ' */'
- s += '\n'
- s += ','
- if comment_string:
- s += '\n'
- s += '*/'
- return s
- ## Fallback Candidate List Class
- class CandidateList(dict):
- def __init__(self):
- pass
- def _add_relay(self, details):
- if not 'dir_address' in details: return
- c = Candidate(details)
- self[ c.get_fingerprint() ] = c
- def _add_uptime(self, uptime):
- try:
- fpr = uptime['fingerprint']
- except KeyError:
- raise Exception("Document has no fingerprint field.")
- try:
- c = self[fpr]
- except KeyError:
- logging.debug('Got unknown relay %s in uptime document.'%(fpr,))
- return
- c.add_uptime(uptime)
- def _add_details(self):
- logging.debug('Loading details document.')
- d = fetch('details',
- fields=('fingerprint,nickname,contact,last_changed_address_or_port,' +
- 'consensus_weight,advertised_bandwidth,or_addresses,' +
- 'dir_address,recommended_version,flags,effective_family,' +
- 'platform'))
- logging.debug('Loading details document done.')
- if not 'relays' in d: raise Exception("No relays found in document.")
- for r in d['relays']: self._add_relay(r)
- def _add_uptimes(self):
- logging.debug('Loading uptime document.')
- d = fetch('uptime')
- logging.debug('Loading uptime document done.')
- if not 'relays' in d: raise Exception("No relays found in document.")
- for r in d['relays']: self._add_uptime(r)
- def add_relays(self):
- self._add_details()
- self._add_uptimes()
- def count_guards(self):
- guard_count = 0
- for fpr in self.keys():
- if self[fpr].is_guard():
- guard_count += 1
- return guard_count
- # Find fallbacks that fit the uptime, stability, and flags criteria,
- # and make an array of them in self.fallbacks
- def compute_fallbacks(self):
- self.fallbacks = map(lambda x: self[x],
- filter(lambda x: self[x].is_candidate(),
- self.keys()))
- # sort fallbacks by their consensus weight to advertised bandwidth factor,
- # lowest to highest
- # used to find the median cw_to_bw_factor()
- def sort_fallbacks_by_cw_to_bw_factor(self):
- self.fallbacks.sort(key=lambda f: f.cw_to_bw_factor())
- # sort fallbacks by their measured bandwidth, highest to lowest
- # calculate_measured_bandwidth before calling this
- # this is useful for reviewing candidates in priority order
- def sort_fallbacks_by_measured_bandwidth(self):
- self.fallbacks.sort(key=lambda f: f._data['measured_bandwidth'],
- reverse=True)
- # sort fallbacks by the data field data_field, lowest to highest
- def sort_fallbacks_by(self, data_field):
- self.fallbacks.sort(key=lambda f: f._data[data_field])
- @staticmethod
- def load_relaylist(file_obj):
- """ Read each line in the file, and parse it like a FallbackDir line:
- an IPv4 address and optional port:
- <IPv4 address>:<port>
- which are parsed into dictionary entries:
- ipv4=<IPv4 address>
- dirport=<port>
- followed by a series of key=value entries:
- orport=<port>
- id=<fingerprint>
- ipv6=<IPv6 address>:<IPv6 orport>
- each line's key/value pairs are placed in a dictonary,
- (of string -> string key/value pairs),
- and these dictionaries are placed in an array.
- comments start with # and are ignored """
- file_data = file_obj['data']
- file_name = file_obj['name']
- relaylist = []
- if file_data is None:
- return relaylist
- for line in file_data.split('\n'):
- relay_entry = {}
- # ignore comments
- line_comment_split = line.split('#')
- line = line_comment_split[0]
- # cleanup whitespace
- line = cleanse_whitespace(line)
- line = line.strip()
- if len(line) == 0:
- continue
- for item in line.split(' '):
- item = item.strip()
- if len(item) == 0:
- continue
- key_value_split = item.split('=')
- kvl = len(key_value_split)
- if kvl < 1 or kvl > 2:
- print '#error Bad %s item: %s, format is key=value.'%(
- file_name, item)
- if kvl == 1:
- # assume that entries without a key are the ipv4 address,
- # perhaps with a dirport
- ipv4_maybe_dirport = key_value_split[0]
- ipv4_maybe_dirport_split = ipv4_maybe_dirport.split(':')
- dirl = len(ipv4_maybe_dirport_split)
- if dirl < 1 or dirl > 2:
- print '#error Bad %s IPv4 item: %s, format is ipv4:port.'%(
- file_name, item)
- if dirl >= 1:
- relay_entry['ipv4'] = ipv4_maybe_dirport_split[0]
- if dirl == 2:
- relay_entry['dirport'] = ipv4_maybe_dirport_split[1]
- elif kvl == 2:
- relay_entry[key_value_split[0]] = key_value_split[1]
- relaylist.append(relay_entry)
- return relaylist
- # apply the fallback whitelist
- def apply_filter_lists(self, whitelist_obj):
- excluded_count = 0
- logging.debug('Applying whitelist')
- # parse the whitelist
- whitelist = self.load_relaylist(whitelist_obj)
- filtered_fallbacks = []
- for f in self.fallbacks:
- in_whitelist = f.is_in_whitelist(whitelist)
- if in_whitelist:
- # include
- filtered_fallbacks.append(f)
- elif INCLUDE_UNLISTED_ENTRIES:
- # include
- filtered_fallbacks.append(f)
- else:
- # exclude
- excluded_count += 1
- log_excluded('Excluding %s: not in whitelist.',
- f._fpr)
- self.fallbacks = filtered_fallbacks
- return excluded_count
- @staticmethod
- def summarise_filters(initial_count, excluded_count):
- return '/* Whitelist excluded %d of %d candidates. */'%(
- excluded_count, initial_count)
- # calculate each fallback's measured bandwidth based on the median
- # consensus weight to advertised bandwidth ratio
- def calculate_measured_bandwidth(self):
- self.sort_fallbacks_by_cw_to_bw_factor()
- median_fallback = self.fallback_median(True)
- if median_fallback is not None:
- median_cw_to_bw_factor = median_fallback.cw_to_bw_factor()
- else:
- # this will never be used, because there are no fallbacks
- median_cw_to_bw_factor = None
- for f in self.fallbacks:
- f.set_measured_bandwidth(median_cw_to_bw_factor)
- # remove relays with low measured bandwidth from the fallback list
- # calculate_measured_bandwidth for each relay before calling this
- def remove_low_bandwidth_relays(self):
- if MIN_BANDWIDTH is None:
- return
- above_min_bw_fallbacks = []
- for f in self.fallbacks:
- if f._data['measured_bandwidth'] >= MIN_BANDWIDTH:
- above_min_bw_fallbacks.append(f)
- else:
- # the bandwidth we log here is limited by the relay's consensus weight
- # as well as its adverttised bandwidth. See set_measured_bandwidth
- # for details
- log_excluded('%s not a candidate: bandwidth %.1fMByte/s too low, ' +
- 'must be at least %.1fMByte/s', f._fpr,
- f._data['measured_bandwidth']/(1024.0*1024.0),
- MIN_BANDWIDTH/(1024.0*1024.0))
- self.fallbacks = above_min_bw_fallbacks
- # the minimum fallback in the list
- # call one of the sort_fallbacks_* functions before calling this
- def fallback_min(self):
- if len(self.fallbacks) > 0:
- return self.fallbacks[-1]
- else:
- return None
- # the median fallback in the list
- # call one of the sort_fallbacks_* functions before calling this
- def fallback_median(self, require_advertised_bandwidth):
- # use the low-median when there are an evan number of fallbacks,
- # for consistency with the bandwidth authorities
- if len(self.fallbacks) > 0:
- median_position = (len(self.fallbacks) - 1) / 2
- if not require_advertised_bandwidth:
- return self.fallbacks[median_position]
- # if we need advertised_bandwidth but this relay doesn't have it,
- # move to a fallback with greater consensus weight until we find one
- while not self.fallbacks[median_position]._data['advertised_bandwidth']:
- median_position += 1
- if median_position >= len(self.fallbacks):
- return None
- return self.fallbacks[median_position]
- else:
- return None
- # the maximum fallback in the list
- # call one of the sort_fallbacks_* functions before calling this
- def fallback_max(self):
- if len(self.fallbacks) > 0:
- return self.fallbacks[0]
- else:
- return None
- # return a new bag suitable for storing attributes
- @staticmethod
- def attribute_new():
- return dict()
- # get the count of attribute in attribute_bag
- # if attribute is None or the empty string, return 0
- @staticmethod
- def attribute_count(attribute, attribute_bag):
- if attribute is None or attribute == '':
- return 0
- if attribute not in attribute_bag:
- return 0
- return attribute_bag[attribute]
- # does attribute_bag contain more than max_count instances of attribute?
- # if so, return False
- # if not, return True
- # if attribute is None or the empty string, or max_count is invalid,
- # always return True
- @staticmethod
- def attribute_allow(attribute, attribute_bag, max_count=1):
- if attribute is None or attribute == '' or max_count <= 0:
- return True
- elif CandidateList.attribute_count(attribute, attribute_bag) >= max_count:
- return False
- else:
- return True
- # add attribute to attribute_bag, incrementing the count if it is already
- # present
- # if attribute is None or the empty string, or count is invalid,
- # do nothing
- @staticmethod
- def attribute_add(attribute, attribute_bag, count=1):
- if attribute is None or attribute == '' or count <= 0:
- pass
- attribute_bag.setdefault(attribute, 0)
- attribute_bag[attribute] += count
- # make sure there are only MAX_FALLBACKS_PER_IP fallbacks per IPv4 address,
- # and per IPv6 address
- # there is only one IPv4 address on each fallback: the IPv4 DirPort address
- # (we choose the IPv4 ORPort which is on the same IPv4 as the DirPort)
- # there is at most one IPv6 address on each fallback: the IPv6 ORPort address
- # we try to match the IPv4 ORPort, but will use any IPv6 address if needed
- # (clients only use the IPv6 ORPort)
- # if there is no IPv6 address, only the IPv4 address is checked
- # return the number of candidates we excluded
- def limit_fallbacks_same_ip(self):
- ip_limit_fallbacks = []
- ip_list = CandidateList.attribute_new()
- for f in self.fallbacks:
- if (CandidateList.attribute_allow(f.dirip, ip_list,
- MAX_FALLBACKS_PER_IPV4)
- and CandidateList.attribute_allow(f.ipv6addr, ip_list,
- MAX_FALLBACKS_PER_IPV6)):
- ip_limit_fallbacks.append(f)
- CandidateList.attribute_add(f.dirip, ip_list)
- if f.has_ipv6():
- CandidateList.attribute_add(f.ipv6addr, ip_list)
- elif not CandidateList.attribute_allow(f.dirip, ip_list,
- MAX_FALLBACKS_PER_IPV4):
- log_excluded('Eliminated %s: already have %d fallback(s) on IPv4 %s'
- %(f._fpr, CandidateList.attribute_count(f.dirip, ip_list),
- f.dirip))
- elif (f.has_ipv6() and
- not CandidateList.attribute_allow(f.ipv6addr, ip_list,
- MAX_FALLBACKS_PER_IPV6)):
- log_excluded('Eliminated %s: already have %d fallback(s) on IPv6 %s'
- %(f._fpr, CandidateList.attribute_count(f.ipv6addr,
- ip_list),
- f.ipv6addr))
- original_count = len(self.fallbacks)
- self.fallbacks = ip_limit_fallbacks
- return original_count - len(self.fallbacks)
- # make sure there are only MAX_FALLBACKS_PER_CONTACT fallbacks for each
- # ContactInfo
- # if there is no ContactInfo, allow the fallback
- # this check can be gamed by providing no ContactInfo, or by setting the
- # ContactInfo to match another fallback
- # However, given the likelihood that relays with the same ContactInfo will
- # go down at similar times, its usefulness outweighs the risk
- def limit_fallbacks_same_contact(self):
- contact_limit_fallbacks = []
- contact_list = CandidateList.attribute_new()
- for f in self.fallbacks:
- if CandidateList.attribute_allow(f._data['contact'], contact_list,
- MAX_FALLBACKS_PER_CONTACT):
- contact_limit_fallbacks.append(f)
- CandidateList.attribute_add(f._data['contact'], contact_list)
- else:
- log_excluded(
- 'Eliminated %s: already have %d fallback(s) on ContactInfo %s'
- %(f._fpr, CandidateList.attribute_count(f._data['contact'],
- contact_list),
- f._data['contact']))
- original_count = len(self.fallbacks)
- self.fallbacks = contact_limit_fallbacks
- return original_count - len(self.fallbacks)
- # make sure there are only MAX_FALLBACKS_PER_FAMILY fallbacks per effective
- # family
- # if there is no family, allow the fallback
- # we use effective family, which ensures mutual family declarations
- # but the check can be gamed by not declaring a family at all
- # if any indirect families exist, the result depends on the order in which
- # fallbacks are sorted in the list
- def limit_fallbacks_same_family(self):
- family_limit_fallbacks = []
- fingerprint_list = CandidateList.attribute_new()
- for f in self.fallbacks:
- if CandidateList.attribute_allow(f._fpr, fingerprint_list,
- MAX_FALLBACKS_PER_FAMILY):
- family_limit_fallbacks.append(f)
- CandidateList.attribute_add(f._fpr, fingerprint_list)
- for family_fingerprint in f._data['effective_family']:
- CandidateList.attribute_add(family_fingerprint, fingerprint_list)
- else:
- # we already have a fallback with this fallback in its effective
- # family
- log_excluded(
- 'Eliminated %s: already have %d fallback(s) in effective family'
- %(f._fpr, CandidateList.attribute_count(f._fpr, fingerprint_list)))
- original_count = len(self.fallbacks)
- self.fallbacks = family_limit_fallbacks
- return original_count - len(self.fallbacks)
- # try once to get the descriptors for fingerprint_list using stem
- # returns an empty list on exception
- @staticmethod
- def get_fallback_descriptors_once(fingerprint_list):
- desc_list = get_server_descriptors(fingerprints=fingerprint_list).run(suppress=True)
- return desc_list
- # try up to max_retries times to get the descriptors for fingerprint_list
- # using stem. Stops retrying when all descriptors have been retrieved.
- # returns a list containing the descriptors that were retrieved
- @staticmethod
- def get_fallback_descriptors(fingerprint_list, max_retries=5):
- # we can't use stem's retries=, because we want to support more than 96
- # descriptors
- #
- # add an attempt for every MAX_FINGERPRINTS (or part thereof) in the list
- max_retries += (len(fingerprint_list) + MAX_FINGERPRINTS - 1) / MAX_FINGERPRINTS
- remaining_list = fingerprint_list
- desc_list = []
- for _ in xrange(max_retries):
- if len(remaining_list) == 0:
- break
- new_desc_list = CandidateList.get_fallback_descriptors_once(remaining_list[0:MAX_FINGERPRINTS])
- for d in new_desc_list:
- try:
- remaining_list.remove(d.fingerprint)
- except ValueError:
- # warn and ignore if a directory mirror returned a bad descriptor
- logging.warning("Directory mirror returned unwanted descriptor %s, ignoring",
- d.fingerprint)
- continue
- desc_list.append(d)
- return desc_list
- # find the fallbacks that cache extra-info documents
- # Onionoo doesn't know this, so we have to use stem
- def mark_extra_info_caches(self):
- fingerprint_list = [ f._fpr for f in self.fallbacks ]
- logging.info("Downloading fallback descriptors to find extra-info caches")
- desc_list = CandidateList.get_fallback_descriptors(fingerprint_list)
- for d in desc_list:
- self[d.fingerprint]._extra_info_cache = d.extra_info_cache
- missing_descriptor_list = [ f._fpr for f in self.fallbacks
- if f._extra_info_cache is None ]
- for f in missing_descriptor_list:
- logging.warning("No descriptor for {}. Assuming extrainfo=0.".format(f))
- # try a download check on each fallback candidate in order
- # stop after max_count successful downloads
- # but don't remove any candidates from the array
- def try_download_consensus_checks(self, max_count):
- dl_ok_count = 0
- for f in self.fallbacks:
- f.try_fallback_download_consensus()
- if f.get_fallback_download_consensus():
- # this fallback downloaded a consensus ok
- dl_ok_count += 1
- if dl_ok_count >= max_count:
- # we have enough fallbacks
- return
- # put max_count successful candidates in the fallbacks array:
- # - perform download checks on each fallback candidate
- # - retry failed candidates if CONSENSUS_DOWNLOAD_RETRY is set
- # - eliminate failed candidates
- # - if there are more than max_count candidates, eliminate lowest bandwidth
- # - if there are fewer than max_count candidates, leave only successful
- # Return the number of fallbacks that failed the consensus check
- def perform_download_consensus_checks(self, max_count):
- self.sort_fallbacks_by_measured_bandwidth()
- self.try_download_consensus_checks(max_count)
- if CONSENSUS_DOWNLOAD_RETRY:
- # try unsuccessful candidates again
- # we could end up with more than max_count successful candidates here
- self.try_download_consensus_checks(max_count)
- # now we have at least max_count successful candidates,
- # or we've tried them all
- original_count = len(self.fallbacks)
- self.fallbacks = filter(lambda x: x.get_fallback_download_consensus(),
- self.fallbacks)
- # some of these failed the check, others skipped the check,
- # if we already had enough successful downloads
- failed_count = original_count - len(self.fallbacks)
- self.fallbacks = self.fallbacks[:max_count]
- return failed_count
- # return a string that describes a/b as a percentage
- @staticmethod
- def describe_percentage(a, b):
- if b != 0:
- return '%d/%d = %.0f%%'%(a, b, (a*100.0)/b)
- else:
- # technically, 0/0 is undefined, but 0.0% is a sensible result
- return '%d/%d = %.0f%%'%(a, b, 0.0)
- # return a dictionary of lists of fallbacks by IPv4 netblock
- # the dictionary is keyed by the fingerprint of an arbitrary fallback
- # in each netblock
- # mask_bits is the size of the netblock
- def fallbacks_by_ipv4_netblock(self, mask_bits):
- netblocks = {}
- for f in self.fallbacks:
- found_netblock = False
- for b in netblocks.keys():
- # we found an existing netblock containing this fallback
- if f.ipv4_netblocks_equal(self[b], mask_bits):
- # add it to the list
- netblocks[b].append(f)
- found_netblock = True
- break
- # make a new netblock based on this fallback's fingerprint
- if not found_netblock:
- netblocks[f._fpr] = [f]
- return netblocks
- # return a dictionary of lists of fallbacks by IPv6 netblock
- # where mask_bits is the size of the netblock
- def fallbacks_by_ipv6_netblock(self, mask_bits):
- netblocks = {}
- for f in self.fallbacks:
- # skip fallbacks without IPv6 addresses
- if not f.has_ipv6():
- continue
- found_netblock = False
- for b in netblocks.keys():
- # we found an existing netblock containing this fallback
- if f.ipv6_netblocks_equal(self[b], mask_bits):
- # add it to the list
- netblocks[b].append(f)
- found_netblock = True
- break
- # make a new netblock based on this fallback's fingerprint
- if not found_netblock:
- netblocks[f._fpr] = [f]
- return netblocks
- # log a message about the proportion of fallbacks in each IPv4 netblock,
- # where mask_bits is the size of the netblock
- def describe_fallback_ipv4_netblock_mask(self, mask_bits):
- fallback_count = len(self.fallbacks)
- shared_netblock_fallback_count = 0
- most_frequent_netblock = None
- netblocks = self.fallbacks_by_ipv4_netblock(mask_bits)
- for b in netblocks.keys():
- if len(netblocks[b]) > 1:
- # how many fallbacks are in a netblock with other fallbacks?
- shared_netblock_fallback_count += len(netblocks[b])
- # what's the netblock with the most fallbacks?
- if (most_frequent_netblock is None
- or len(netblocks[b]) > len(netblocks[most_frequent_netblock])):
- most_frequent_netblock = b
- logging.debug('Fallback IPv4 addresses in the same /%d:'%(mask_bits))
- for f in netblocks[b]:
- logging.debug('%s - %s', f.dirip, f._fpr)
- if most_frequent_netblock is not None:
- logging.warning('There are %s fallbacks in the IPv4 /%d containing %s'%(
- CandidateList.describe_percentage(
- len(netblocks[most_frequent_netblock]),
- fallback_count),
- mask_bits,
- self[most_frequent_netblock].dirip))
- if shared_netblock_fallback_count > 0:
- logging.warning(('%s of fallbacks are in an IPv4 /%d with other ' +
- 'fallbacks')%(CandidateList.describe_percentage(
- shared_netblock_fallback_count,
- fallback_count),
- mask_bits))
- # log a message about the proportion of fallbacks in each IPv6 netblock,
- # where mask_bits is the size of the netblock
- def describe_fallback_ipv6_netblock_mask(self, mask_bits):
- fallback_count = len(self.fallbacks_with_ipv6())
- shared_netblock_fallback_count = 0
- most_frequent_netblock = None
- netblocks = self.fallbacks_by_ipv6_netblock(mask_bits)
- for b in netblocks.keys():
- if len(netblocks[b]) > 1:
- # how many fallbacks are in a netblock with other fallbacks?
- shared_netblock_fallback_count += len(netblocks[b])
- # what's the netblock with the most fallbacks?
- if (most_frequent_netblock is None
- or len(netblocks[b]) > len(netblocks[most_frequent_netblock])):
- most_frequent_netblock = b
- logging.debug('Fallback IPv6 addresses in the same /%d:'%(mask_bits))
- for f in netblocks[b]:
- logging.debug('%s - %s', f.ipv6addr, f._fpr)
- if most_frequent_netblock is not None:
- logging.warning('There are %s fallbacks in the IPv6 /%d containing %s'%(
- CandidateList.describe_percentage(
- len(netblocks[most_frequent_netblock]),
- fallback_count),
- mask_bits,
- self[most_frequent_netblock].ipv6addr))
- if shared_netblock_fallback_count > 0:
- logging.warning(('%s of fallbacks are in an IPv6 /%d with other ' +
- 'fallbacks')%(CandidateList.describe_percentage(
- shared_netblock_fallback_count,
- fallback_count),
- mask_bits))
- # log a message about the proportion of fallbacks in each IPv4 /8, /16,
- # and /24
- def describe_fallback_ipv4_netblocks(self):
- # this doesn't actually tell us anything useful
- #self.describe_fallback_ipv4_netblock_mask(8)
- self.describe_fallback_ipv4_netblock_mask(16)
- #self.describe_fallback_ipv4_netblock_mask(24)
- # log a message about the proportion of fallbacks in each IPv6 /12 (RIR),
- # /23 (smaller RIR blocks), /32 (LIR), /48 (Customer), and /64 (Host)
- # https://www.iana.org/assignments/ipv6-unicast-address-assignments/
- def describe_fallback_ipv6_netblocks(self):
- # these don't actually tell us anything useful
- #self.describe_fallback_ipv6_netblock_mask(12)
- #self.describe_fallback_ipv6_netblock_mask(23)
- self.describe_fallback_ipv6_netblock_mask(32)
- #self.describe_fallback_ipv6_netblock_mask(48)
- self.describe_fallback_ipv6_netblock_mask(64)
- # log a message about the proportion of fallbacks in each IPv4 and IPv6
- # netblock
- def describe_fallback_netblocks(self):
- self.describe_fallback_ipv4_netblocks()
- self.describe_fallback_ipv6_netblocks()
- # return a list of fallbacks which are on the IPv4 ORPort port
- def fallbacks_on_ipv4_orport(self, port):
- return filter(lambda x: x.orport == port, self.fallbacks)
- # return a list of fallbacks which are on the IPv6 ORPort port
- def fallbacks_on_ipv6_orport(self, port):
- return filter(lambda x: x.ipv6orport == port, self.fallbacks_with_ipv6())
- # return a list of fallbacks which are on the DirPort port
- def fallbacks_on_dirport(self, port):
- return filter(lambda x: x.dirport == port, self.fallbacks)
- # log a message about the proportion of fallbacks on IPv4 ORPort port
- # and return that count
- def describe_fallback_ipv4_orport(self, port):
- port_count = len(self.fallbacks_on_ipv4_orport(port))
- fallback_count = len(self.fallbacks)
- logging.warning('%s of fallbacks are on IPv4 ORPort %d'%(
- CandidateList.describe_percentage(port_count,
- fallback_count),
- port))
- return port_count
- # log a message about the proportion of IPv6 fallbacks on IPv6 ORPort port
- # and return that count
- def describe_fallback_ipv6_orport(self, port):
- port_count = len(self.fallbacks_on_ipv6_orport(port))
- fallback_count = len(self.fallbacks_with_ipv6())
- logging.warning('%s of IPv6 fallbacks are on IPv6 ORPort %d'%(
- CandidateList.describe_percentage(port_count,
- fallback_count),
- port))
- return port_count
- # log a message about the proportion of fallbacks on DirPort port
- # and return that count
- def describe_fallback_dirport(self, port):
- port_count = len(self.fallbacks_on_dirport(port))
- fallback_count = len(self.fallbacks)
- logging.warning('%s of fallbacks are on DirPort %d'%(
- CandidateList.describe_percentage(port_count,
- fallback_count),
- port))
- return port_count
- # log a message about the proportion of fallbacks on each dirport,
- # each IPv4 orport, and each IPv6 orport
- def describe_fallback_ports(self):
- fallback_count = len(self.fallbacks)
- ipv4_or_count = fallback_count
- ipv4_or_count -= self.describe_fallback_ipv4_orport(443)
- ipv4_or_count -= self.describe_fallback_ipv4_orport(9001)
- logging.warning('%s of fallbacks are on other IPv4 ORPorts'%(
- CandidateList.describe_percentage(ipv4_or_count,
- fallback_count)))
- ipv6_fallback_count = len(self.fallbacks_with_ipv6())
- ipv6_or_count = ipv6_fallback_count
- ipv6_or_count -= self.describe_fallback_ipv6_orport(443)
- ipv6_or_count -= self.describe_fallback_ipv6_orport(9001)
- logging.warning('%s of IPv6 fallbacks are on other IPv6 ORPorts'%(
- CandidateList.describe_percentage(ipv6_or_count,
- ipv6_fallback_count)))
- dir_count = fallback_count
- dir_count -= self.describe_fallback_dirport(80)
- dir_count -= self.describe_fallback_dirport(9030)
- logging.warning('%s of fallbacks are on other DirPorts'%(
- CandidateList.describe_percentage(dir_count,
- fallback_count)))
- # return a list of fallbacks which cache extra-info documents
- def fallbacks_with_extra_info_cache(self):
- return filter(lambda x: x._extra_info_cache, self.fallbacks)
- # log a message about the proportion of fallbacks that cache extra-info docs
- def describe_fallback_extra_info_caches(self):
- extra_info_falback_count = len(self.fallbacks_with_extra_info_cache())
- fallback_count = len(self.fallbacks)
- logging.warning('%s of fallbacks cache extra-info documents'%(
- CandidateList.describe_percentage(extra_info_falback_count,
- fallback_count)))
- # return a list of fallbacks which have the Exit flag
- def fallbacks_with_exit(self):
- return filter(lambda x: x.is_exit(), self.fallbacks)
- # log a message about the proportion of fallbacks with an Exit flag
- def describe_fallback_exit_flag(self):
- exit_falback_count = len(self.fallbacks_with_exit())
- fallback_count = len(self.fallbacks)
- logging.warning('%s of fallbacks have the Exit flag'%(
- CandidateList.describe_percentage(exit_falback_count,
- fallback_count)))
- # return a list of fallbacks which have an IPv6 address
- def fallbacks_with_ipv6(self):
- return filter(lambda x: x.has_ipv6(), self.fallbacks)
- # log a message about the proportion of fallbacks on IPv6
- def describe_fallback_ip_family(self):
- ipv6_falback_count = len(self.fallbacks_with_ipv6())
- fallback_count = len(self.fallbacks)
- logging.warning('%s of fallbacks are on IPv6'%(
- CandidateList.describe_percentage(ipv6_falback_count,
- fallback_count)))
- def summarise_fallbacks(self, eligible_count, operator_count, failed_count,
- guard_count, target_count):
- s = ''
- # Report:
- # whether we checked consensus download times
- # the number of fallback directories (and limits/exclusions, if relevant)
- # min & max fallback bandwidths
- # #error if below minimum count
- if PERFORM_IPV4_DIRPORT_CHECKS or PERFORM_IPV6_DIRPORT_CHECKS:
- s += '/* Checked %s%s%s DirPorts served a consensus within %.1fs. */'%(
- 'IPv4' if PERFORM_IPV4_DIRPORT_CHECKS else '',
- ' and ' if (PERFORM_IPV4_DIRPORT_CHECKS
- and PERFORM_IPV6_DIRPORT_CHECKS) else '',
- 'IPv6' if PERFORM_IPV6_DIRPORT_CHECKS else '',
- CONSENSUS_DOWNLOAD_SPEED_MAX)
- else:
- s += '/* Did not check IPv4 or IPv6 DirPort consensus downloads. */'
- s += '\n'
- # Multiline C comment with #error if things go bad
- s += '/*'
- s += '\n'
- # Integers don't need escaping in C comments
- fallback_count = len(self.fallbacks)
- if FALLBACK_PROPORTION_OF_GUARDS is None:
- fallback_proportion = ''
- else:
- fallback_proportion = ', Target %d (%d * %.2f)'%(target_count,
- guard_count,
- FALLBACK_PROPORTION_OF_GUARDS)
- s += 'Final Count: %d (Eligible %d%s'%(fallback_count, eligible_count,
- fallback_proportion)
- if MAX_FALLBACK_COUNT is not None:
- s += ', Max %d'%(MAX_FALLBACK_COUNT)
- s += ')\n'
- if eligible_count != fallback_count:
- removed_count = eligible_count - fallback_count
- excess_to_target_or_max = (eligible_count - operator_count - failed_count
- - fallback_count)
- # some 'Failed' failed the check, others 'Skipped' the check,
- # if we already had enough successful downloads
- s += ('Excluded: %d (Same Operator %d, Failed/Skipped Download %d, ' +
- 'Excess %d)')%(removed_count, operator_count, failed_count,
- excess_to_target_or_max)
- s += '\n'
- min_fb = self.fallback_min()
- min_bw = min_fb._data['measured_bandwidth']
- max_fb = self.fallback_max()
- max_bw = max_fb._data['measured_bandwidth']
- s += 'Bandwidth Range: %.1f - %.1f MByte/s'%(min_bw/(1024.0*1024.0),
- max_bw/(1024.0*1024.0))
- s += '\n'
- s += '*/'
- if fallback_count < MIN_FALLBACK_COUNT:
- # We must have a minimum number of fallbacks so they are always
- # reachable, and are in diverse locations
- s += '\n'
- s += '#error Fallback Count %d is too low. '%(fallback_count)
- s += 'Must be at least %d for diversity. '%(MIN_FALLBACK_COUNT)
- s += 'Try adding entries to the whitelist, '
- s += 'or setting INCLUDE_UNLISTED_ENTRIES = True.'
- return s
- def process_existing():
- logging.basicConfig(level=logging.INFO)
- logging.getLogger('stem').setLevel(logging.INFO)
- whitelist = {'data': parse_fallback_file(FALLBACK_FILE_NAME),
- 'name': FALLBACK_FILE_NAME}
- list_fallbacks(whitelist)
- def process_default():
- logging.basicConfig(level=logging.WARNING)
- logging.getLogger('stem').setLevel(logging.WARNING)
- whitelist = {'data': read_from_file(WHITELIST_FILE_NAME, MAX_LIST_FILE_SIZE),
- 'name': WHITELIST_FILE_NAME}
- list_fallbacks(whitelist)
- ## Main Function
- def main():
- if get_command() == 'check_existing':
- process_existing()
- else:
- process_default()
- def get_command():
- if len(sys.argv) == 2:
- return sys.argv[1]
- else:
- return None
- def log_excluded(msg, *args):
- if get_command() == 'check_existing':
- logging.warning(msg, *args)
- else:
- logging.info(msg, *args)
- def list_fallbacks(whitelist):
- """ Fetches required onionoo documents and evaluates the
- fallback directory criteria for each of the relays """
- print "/* type=fallback */"
- print ("/* version={} */"
- .format(cleanse_c_multiline_comment(FALLBACK_FORMAT_VERSION)))
- now = datetime.datetime.utcnow()
- timestamp = now.strftime('%Y%m%d%H%M%S')
- print ("/* timestamp={} */"
- .format(cleanse_c_multiline_comment(timestamp)))
- # end the header with a separator, to make it easier for parsers
- print SECTION_SEPARATOR_COMMENT
- logging.warning('Downloading and parsing Onionoo data. ' +
- 'This may take some time.')
- # find relays that could be fallbacks
- candidates = CandidateList()
- candidates.add_relays()
- # work out how many fallbacks we want
- guard_count = candidates.count_guards()
- if FALLBACK_PROPORTION_OF_GUARDS is None:
- target_count = guard_count
- else:
- target_count = int(guard_count * FALLBACK_PROPORTION_OF_GUARDS)
- # the maximum number of fallbacks is the least of:
- # - the target fallback count (FALLBACK_PROPORTION_OF_GUARDS * guard count)
- # - the maximum fallback count (MAX_FALLBACK_COUNT)
- if MAX_FALLBACK_COUNT is None:
- max_count = target_count
- else:
- max_count = min(target_count, MAX_FALLBACK_COUNT)
- candidates.compute_fallbacks()
- prefilter_fallbacks = copy.copy(candidates.fallbacks)
- # filter with the whitelist
- # if a relay has changed IPv4 address or ports recently, it will be excluded
- # as ineligible before we call apply_filter_lists, and so there will be no
- # warning that the details have changed from those in the whitelist.
- # instead, there will be an info-level log during the eligibility check.
- initial_count = len(candidates.fallbacks)
- excluded_count = candidates.apply_filter_lists(whitelist)
- print candidates.summarise_filters(initial_count, excluded_count)
- eligible_count = len(candidates.fallbacks)
- # calculate the measured bandwidth of each relay,
- # then remove low-bandwidth relays
- candidates.calculate_measured_bandwidth()
- candidates.remove_low_bandwidth_relays()
- # print the raw fallback list
- #for x in candidates.fallbacks:
- # print x.fallbackdir_line(True)
- # print json.dumps(candidates[x]._data, sort_keys=True, indent=4,
- # separators=(',', ': '), default=json_util.default)
- # impose mandatory conditions here, like one per contact, family, IP
- # in measured bandwidth order
- candidates.sort_fallbacks_by_measured_bandwidth()
- operator_count = 0
- # only impose these limits on the final list - operators can nominate
- # multiple candidate fallbacks, and then we choose the best set
- if not OUTPUT_CANDIDATES:
- operator_count += candidates.limit_fallbacks_same_ip()
- operator_count += candidates.limit_fallbacks_same_contact()
- operator_count += candidates.limit_fallbacks_same_family()
- # check if each candidate can serve a consensus
- # there's a small risk we've eliminated relays from the same operator that
- # can serve a consensus, in favour of one that can't
- # but given it takes up to 15 seconds to check each consensus download,
- # the risk is worth it
- if PERFORM_IPV4_DIRPORT_CHECKS or PERFORM_IPV6_DIRPORT_CHECKS:
- logging.warning('Checking consensus download speeds. ' +
- 'This may take some time.')
- failed_count = candidates.perform_download_consensus_checks(max_count)
- # work out which fallbacks cache extra-infos
- candidates.mark_extra_info_caches()
- # analyse and log interesting diversity metrics
- # like netblock, ports, exit, IPv4-only
- # (we can't easily analyse AS, and it's hard to accurately analyse country)
- candidates.describe_fallback_ip_family()
- # if we can't import the ipaddress module, we can't do netblock analysis
- if HAVE_IPADDRESS:
- candidates.describe_fallback_netblocks()
- candidates.describe_fallback_ports()
- candidates.describe_fallback_extra_info_caches()
- candidates.describe_fallback_exit_flag()
- # output C comments summarising the fallback selection process
- if len(candidates.fallbacks) > 0:
- print candidates.summarise_fallbacks(eligible_count, operator_count,
- failed_count, guard_count,
- target_count)
- else:
- print '/* No Fallbacks met criteria */'
- # output C comments specifying the OnionOO data used to create the list
- for s in fetch_source_list():
- print describe_fetch_source(s)
- # start the list with a separator, to make it easy for parsers
- print SECTION_SEPARATOR_COMMENT
- # sort the list differently depending on why we've created it:
- # if we're outputting the final fallback list, sort by fingerprint
- # this makes diffs much more stable
- # otherwise, if we're trying to find a bandwidth cutoff, or we want to
- # contact operators in priority order, sort by bandwidth (not yet
- # implemented)
- # otherwise, if we're contacting operators, sort by contact
- candidates.sort_fallbacks_by(OUTPUT_SORT_FIELD)
- for x in candidates.fallbacks:
- print x.fallbackdir_line(candidates.fallbacks, prefilter_fallbacks)
- if __name__ == "__main__":
- main()
|