123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466 |
- #!/usr/bin/python3
- # This software has been dedicated to the public domain under the CC0
- # public domain dedication.
- #
- # To the extent possible under law, the person who associated CC0
- # with mmdb-convert.py has waived all copyright and related or
- # neighboring rights to mmdb-convert.py.
- #
- # You should have received a copy of the CC0 legalcode along with this
- # work in doc/cc0.txt. If not, see
- # <http://creativecommons.org/publicdomain/zero/1.0/>.
- # Nick Mathewson is responsible for this kludge, but takes no
- # responsibility for it.
- """This kludge is meant to
- parse mmdb files in sufficient detail to dump out the old format
- that Tor expects. It's also meant to be pure-python.
- When given a simplicity/speed tradeoff, it opts for simplicity.
- You will not understand the code without understanding the MaxMind-DB
- file format. It is specified at:
- https://github.com/maxmind/MaxMind-DB/blob/master/MaxMind-DB-spec.md.
- This isn't so much tested. When it breaks, you get to keep both
- pieces.
- """
- import struct
- import bisect
- import socket
- import binascii
- import sys
- import time
# Byte sequence that introduces the metadata section of an mmdb file.
METADATA_MARKER = b'\xab\xcd\xefMaxMind.com'

# Python 2/3 compatibility: indexing a bytestring gives a length-1 string on
# Python 2 and an integer on Python 3.  Probe once at import time and pick
# the matching converter.
if isinstance(b"1"[0], int):
    def byte_to_int(b):
        "convert a single element of a bytestring to an integer."
        # Already an integer on this interpreter; nothing to convert.
        return b
else:
    byte_to_int = ord
# More Python 2/3 compatibility: Python 2's str() rejects an encoding
# argument with TypeError, which tells us bytestrings are already native
# strings there.
try:
    str(b"a", "utf8")
except TypeError:
    bytesToStr = str
else:
    def bytesToStr(b):
        "convert a bytestring in utf8 to a string."
        decoded = str(b, 'utf8')
        return decoded
def to_int(s):
    """Return the unsigned big-endian integer encoded by bytestring s.

    An empty bytestring decodes to 0.
    """
    value = 0
    # A bytearray iterates as integers on both Python 2 and Python 3.
    for octet in bytearray(s):
        value = (value << 8) | octet
    return value
def to_int24(s):
    """Split the 6-byte bytestring s into two big-endian 24-bit integers.

    Returns a (left, right) tuple; struct.error is raised if s is not
    exactly six bytes long.
    """
    b0, b1, b2, b3, b4, b5 = struct.unpack("!6B", s)
    left = (b0 << 16) | (b1 << 8) | b2
    right = (b3 << 16) | (b4 << 8) | b5
    return left, right
def to_int32(s):
    """Split the 8-byte bytestring s into two big-endian 32-bit integers.

    Returns a (left, right) tuple; struct.error is raised if s is not
    exactly eight bytes long.
    """
    left_right = struct.unpack("!2L", s)
    return left_right
def to_int28(s):
    """Parse a pair of big-endian 28-bit integers from bytestring s.

    s is a 7-byte record.  Bytes 0-2 are the low 24 bits of the left
    value and bytes 4-6 are the low 24 bits of the right value; the
    middle byte carries the top four bits of the left value in its high
    nibble and the top four bits of the right value in its low nibble.

    Returns a (left, right) tuple; struct.error is raised if s is not
    exactly seven bytes long.
    """
    # Pad to eight bytes so the record can be read as two 32-bit words:
    # a = bytes 0-3, b = bytes 4-6 followed by the padding byte.
    a, b = struct.unpack("!LL", s + b'\x00')
    left = ((a & 0xf0) << 20) + (a >> 8)
    right = ((a & 0x0f) << 24) + (b >> 8)
    return left, right
class Tree(object):
    """A single node of the search tree.

    'left' and 'right' hold the raw record values read from the file.
    resolve_tree() later attaches 'left_item' and 'right_item'
    attributes holding the objects those records refer to.
    """

    def __init__(self, left, right):
        self.left, self.right = left, right
def resolve_tree(tree, data):
    """Attach left_item and right_item to every node in 'tree'.

    Each record value is turned into the object it designates: another
    Tree node, a Datum taken from 'data', or None.  'data' must be sorted
    by position, since Datums are located with a binary search.
    """
    node_count = len(tree)
    # Reusable search key; only its 'pos' field matters to the bisect.
    probe = Datum(None, None, None, None)

    def resolve_item(item):
        "Helper: map one record value to the object it refers to."
        if item < node_count:
            return tree[item]
        if item == node_count:
            return None
        # Larger values refer to the data section: subtract the node
        # count and the 16-byte separator to get the record's position.
        probe.pos = item - node_count - 16
        idx = bisect.bisect_left(data, probe)
        assert data[idx].pos == probe.pos
        return data[idx]

    for node in tree:
        node.left_item = resolve_item(node.left)
        node.right_item = resolve_item(node.right)
def parse_search_tree(s, record_size):
    """Decode the search tree held in bytestring s.

    'record_size' is the size in bits of one record; every node is made
    of two records.  Returns the list of Tree nodes in file order.
    Raises NotImplementedError for record sizes other than 24, 28 or 32.
    """
    record_bytes = (record_size * 2) // 8
    decoders = {24: to_int24,
                28: to_int28,
                32: to_int32}
    try:
        to_leftright = decoders[record_size]
    except KeyError:
        raise NotImplementedError("Unsupported record size in bits: %d" %
                                  record_size)

    nodes = []
    for offset in range(0, len(s), record_bytes):
        left, right = to_leftright(s[offset:offset + record_bytes])
        nodes.append(Tree(left, right))
    return nodes
class Datum(object):
    """One decoded entry of a data section.

    Datums order and compare by 'pos' alone, which is what lets a list of
    them be searched with the bisect module.
    """

    def __init__(self, pos, kind, ln, data):
        # Position of this record within the data section.
        self.pos = pos
        # Type of this record; one of the TP_* constants.
        self.kind = kind
        # Length field, which is overloaded for some types.
        self.ln = ln
        # Raw payload bytes.
        self.data = data
        # Child Datums; only populated for arrays and maps.
        self.children = None

    def __repr__(self):
        fields = (self.pos, self.kind, self.ln, self.data)
        return "Datum(%r,%r,%r,%r)" % fields

    # Position-based comparisons, used for binary search.
    def __lt__(self, other):
        return self.pos < other.pos

    def __gt__(self, other):
        return self.pos > other.pos

    def __eq__(self, other):
        return self.pos == other.pos

    def build_maps(self):
        """Build the 'map' dictionary of this Datum and of its descendants.

        Arrays just recurse into their children.  Maps pair up their
        children as key/value (following pointers on both), recurse into
        each value, and store it under the decoded key.  Datums without
        an 'nChildren' attribute (non-containers, or containers already
        processed) are left alone.  Raises ValueError when a map key is
        not a UTF-8 string.
        """
        if not hasattr(self, 'nChildren'):
            return

        if self.kind == TP_ARRAY:
            del self.nChildren
            for child in self.children:
                child.build_maps()
            return

        if self.kind != TP_MAP:
            return

        del self.nChildren
        mapping = self.map = {}
        # Children alternate key, value, key, value, ...
        for key_idx in range(0, len(self.children), 2):
            key = self.children[key_idx].deref()
            value = self.children[key_idx + 1].deref()
            value.build_maps()
            if key.kind != TP_UTF8:
                raise ValueError("Bad dictionary key type %d"% key.kind)
            mapping[bytesToStr(key.data)] = value

    def int_val(self):
        """Return the integer value of this Datum, which must have one
        of the integer types."""
        assert self.kind in (TP_UINT16, TP_UINT32, TP_UINT64,
                             TP_UINT128, TP_SINT32)
        value = to_int(self.data)
        # Signed 32-bit values with the top bit set are negative.
        if self.kind == TP_SINT32 and value & 0x80000000:
            value -= 0x100000000
        return value

    def deref(self):
        """Follow pointers until reaching a Datum that is not a pointer,
        and return it.  A non-pointer Datum returns itself.  Chains of
        100 or more pointers trip an assertion."""
        hops = 0
        target = self
        while target.kind == TP_PTR:
            target = target.ptr
            hops += 1
            assert hops < 100
        return target
def resolve_pointers(data):
    """Give every pointer Datum in 'data' a 'ptr' attribute referring to
    the Datum it points at.

    A pointer's 'ln' field holds the position of its target; 'data' must
    be sorted by position because the target is found by binary search.
    """
    # Reusable search key; only its 'pos' field matters to the bisect.
    probe = Datum(None, None, None, None)
    pointers = (d for d in data if d.kind == TP_PTR)
    for pointer in pointers:
        probe.pos = pointer.ln
        idx = bisect.bisect_left(data, probe)
        assert data[idx].pos == pointer.ln
        pointer.ptr = data[idx]
# Numeric codes for the data-section field types, numbered consecutively
# from 1 (TP_PTR) to 15 (TP_FLOAT).
(TP_PTR,
 TP_UTF8,
 TP_DBL,
 TP_BYTES,
 TP_UINT16,
 TP_UINT32,
 TP_MAP,
 TP_SINT32,
 TP_UINT64,
 TP_UINT128,
 TP_ARRAY,
 TP_DCACHE,
 TP_END,
 TP_BOOL,
 TP_FLOAT) = range(1, 16)
def get_type_and_len(s):
    """Decode the header of the data-section value that starts at s.

    Returns a (type, length, header_size) tuple, where header_size is the
    number of bytes taken up by the type-plus-length encoding.  The
    'length' is heavily overloaded: for pointers it is the decoded
    pointer value rather than a byte count.
    """
    control = byte_to_int(s[0])
    tp = control >> 5          # top three bits: the type
    ln = control & 31          # low five bits: the length field
    skip = 1
    if tp == 0:
        # Type 0 means the real type lives in the next byte, offset by 7.
        tp = byte_to_int(s[1]) + 7
        skip = 2

    if tp == TP_PTR:
        # Bits 3-4 of the length field select how many bytes follow
        # (1 to 4).
        len_len = (ln >> 3) + 1
        if len_len < 4:
            # The low three bits are the most significant pointer bits.
            high_bits = (ln & 7) << (len_len * 8)
        else:
            high_bits = 0
        bias = (0, 0, 2048, 526336, 0)[len_len]
        ln = high_bits + to_int(s[skip:skip+len_len]) + bias
        skip += len_len
    elif ln >= 29:
        # 29, 30 and 31 mean the length is stored in the next 1, 2 or 3
        # bytes, biased by the smallest value that encoding represents.
        len_len = ln - 28
        bias = (0, 29, 285, 65821)[len_len]
        ln = to_int(s[skip:skip+len_len]) + bias
        skip += len_len
    return tp, ln, skip
# Types whose 'length' field is not a count of payload bytes.
IGNORE_LEN_TYPES = {
    TP_MAP,     # Number of key-value pairs that follow.
    TP_ARRAY,   # Number of members that follow.
    TP_PTR,     # Index of the pointed-to data element.
    TP_BOOL,    # The value itself: 0 or 1.
    TP_DCACHE,  # Number of members that follow.
}
def parse_data_section(s):
    """Given a data section encoded in a bytestring, return a list of
    Datum items.

    The list is flat and ordered by position: it contains every item,
    including those nested inside arrays and maps.  Each array or map
    Datum additionally gets a 'children' list holding its direct members
    and an 'nChildren' attribute, which Datum.build_maps() later consumes.
    """
    # get_type_and_len() never reads more than 5 bytes (type byte,
    # optional extended-type byte, up to 3 length bytes; or type byte
    # plus up to 4 pointer bytes).  Handing it a small window instead of
    # the whole remainder avoids re-copying the section for every item.
    header_window = 8

    # Stack of possibly nested containers.  We use the 'nChildren' member
    # of the last one to tell how many more items nest directly inside.
    stack = []
    # List of all items, including nested ones.
    data = []
    # Byte index within the data section.
    pos = 0
    end = len(s)

    while pos < end:
        tp, ln, skip = get_type_and_len(s[pos:pos + header_window])
        real_len = 0 if tp in IGNORE_LEN_TYPES else ln
        payload_start = pos + skip
        d = Datum(pos, tp, ln, s[payload_start:payload_start + real_len])
        data.append(d)
        pos = payload_start + real_len

        if stack:
            parent = stack[-1]
            parent.children.append(d)
            parent.nChildren -= 1
            if parent.nChildren == 0:
                del stack[-1]

        if d.kind == TP_ARRAY:
            expected = d.ln
        elif d.kind == TP_MAP:
            # Every map entry is a key followed by a value.
            expected = d.ln * 2
        else:
            continue

        d.nChildren = expected
        d.children = []
        # An empty container must not be pushed: its counter would go
        # negative on the next item and never reach zero, so it would
        # wrongly adopt every item that follows.
        if expected:
            stack.append(d)

    return data
def parse_mm_file(s):
    """Parse the contents of a MaxMind-DB file held in bytestring s.

    Returns a (metadata, tree, data) tuple: the list of metadata Datums,
    the list of Tree nodes, and the list of data-section Datums, with
    pointers resolved and maps built.  Raises ValueError if the metadata
    marker or the section separator is missing, or if the metadata does
    not start with a map.
    """
    try:
        metadata_ptr = s.rindex(METADATA_MARKER)
    except ValueError:
        raise ValueError("No metadata!")

    metadata_start = metadata_ptr + len(METADATA_MARKER)
    metadata = parse_data_section(s[metadata_start:])
    root = metadata[0]
    if root.kind != TP_MAP:
        raise ValueError("Bad map")
    root.build_maps()
    mm = root.map

    record_size = mm['record_size'].int_val()
    node_count = mm['node_count'].int_val()
    # Each node holds two records of record_size bits.
    tree_size = ((record_size * 2) // 8) * node_count

    # Sixteen zero bytes separate the search tree from the data section.
    separator_end = tree_size + 16
    if s[tree_size:separator_end] != b'\x00'*16:
        raise ValueError("Missing section separator!")

    tree = parse_search_tree(s[:tree_size], record_size)
    data = parse_data_section(s[separator_end:metadata_ptr])
    resolve_pointers(data)
    resolve_tree(tree, data)
    for datum in data:
        datum.build_maps()
    return metadata, tree, data
def format_datum(datum):
    """Return the country code to write for the leaf Datum 'datum', or
    None if it has none.

    country->iso_code is preferred: the two-character ISO 3166-1 code of
    the country where MaxMind believes the end user is located.  When
    that key is absent we fall back to registered_country->iso_code, the
    country in which the ISP has registered the IP address.

    The fallback matters because ranges that MaxMind attributes to
    anonymous proxies carry only registered_country; without it, all of
    those ranges would be left out of the output.
    """
    for section in ('country', 'registered_country'):
        try:
            return bytesToStr(datum.map[section].map['iso_code'].data)
        except KeyError:
            continue
    return None
# IPv4 addresses live under the 96 all-zero leading bits of the tree.
IPV4_PREFIX = "0"*96


def dump_item_ipv4(entries, prefix, val):
    """Append the IPv4 range covered by 'prefix' to entries.

    'prefix' is a string of '0'/'1' characters holding the binary prefix
    of the address, and 'val' is the value to record.  The appended entry
    is a (lo, hi, val) tuple of inclusive 32-bit address bounds.  If the
    prefix is not an IPv4 address (it does not start with 96 bits of 0),
    nothing is appended.
    """
    if not prefix.startswith(IPV4_PREFIX):
        return
    v4_bits = prefix[len(IPV4_PREFIX):]
    # A leaf sitting exactly at the 96-bit boundary leaves no bits to
    # parse; int("", 2) would raise, so treat it as the whole IPv4 space.
    v = int(v4_bits, 2) if v4_bits else 0
    shift = 32 - len(v4_bits)
    lo = v << shift
    hi = ((v + 1) << shift) - 1
    entries.append((lo, hi, val))
def fmt_item_ipv4(entry):
    """Render a (lo, hi, value) IPv4 entry as one output line, with both
    addresses written as decimal integers."""
    lo, hi, code = entry[0], entry[1], entry[2]
    return "%d,%d,%s\n" % (lo, hi, code)
def fmt_ipv6_addr(v):
    """Return the textual form of the IPv6 address whose 128-bit integer
    value is v."""
    # 32 hex digits == 16 bytes, the packed form inet_ntop expects.
    hex_digits = "%032x" % v
    packed = binascii.unhexlify(hex_digits)
    return socket.inet_ntop(socket.AF_INET6, packed)
def fmt_item_ipv6(entry):
    """Render a (lo, hi, value) IPv6 entry as one output line, with both
    addresses written in textual IPv6 form."""
    lo_text = fmt_ipv6_addr(entry[0])
    hi_text = fmt_ipv6_addr(entry[1])
    return "%s,%s,%s\n" % (lo_text, hi_text, entry[2])
# Binary prefixes of IPv6 ranges that merely wrap IPv4 addresses.
IPV4_MAPPED_IPV6_PREFIX = "0"*80 + "1"*16              # ::ffff:0:0/96
IPV6_6TO4_PREFIX = "0010000000000010"                  # 2002::/16
TEREDO_IPV6_PREFIX = "0010000000000001" + "0"*16       # 2001::/32


def dump_item_ipv6(entries, prefix, val):
    """Append the IPv6 range covered by 'prefix' to entries.

    'prefix' is a string of '0'/'1' characters holding the binary prefix
    of the address, and 'val' is the value to record.  The appended entry
    is a (lo, hi, val) tuple of inclusive 128-bit address bounds.

    Nothing is appended when the prefix is an IPv4 address (starts with
    96 bits of 0), an IPv4-mapped IPv6 address (::ffff:0:0/96), or falls
    in the 6to4 (2002::/16) or Teredo (2001::/32) ranges.
    """
    skipped_prefixes = (
        IPV4_PREFIX,
        IPV4_MAPPED_IPV6_PREFIX,
        IPV6_6TO4_PREFIX,
        TEREDO_IPV6_PREFIX,
    )
    if prefix.startswith(skipped_prefixes):
        return
    host_bits = 128 - len(prefix)
    network = int(prefix, 2)
    lo = network << host_bits
    hi = ((network + 1) << host_bits) - 1
    entries.append((lo, hi, val))
def dump_tree(entries, node, dump_item, prefix=""):
    """Walk the tree rooted at 'node' depth-first, left branch first.

    'prefix' accumulates the path taken so far as a string of '0' (left)
    and '1' (right) characters.  For every leaf Datum whose format_datum()
    output is non-empty, dump_item(entries, prefix, code) is called.
    """
    if isinstance(node, Tree):
        branches = (("0", node.left_item), ("1", node.right_item))
        for bit, child in branches:
            dump_tree(entries, child, dump_item, prefix + bit)
        return

    if not isinstance(node, Datum):
        # The only other thing a record may resolve to is "no data".
        assert node == None
        return

    assert node.kind == TP_MAP
    code = format_datum(node)
    if code:
        dump_item(entries, prefix, code)
# Comment block written at the top of every generated file; the single %s
# is replaced by the database build date.
GEOIP_FILE_HEADER = """\
# Last updated based on %s Maxmind GeoLite2 Country
# wget https://geolite.maxmind.com/download/geoip/database/GeoLite2-Country.mmdb.gz
# gunzip GeoLite2-Country.mmdb.gz
# python mmdb-convert.py GeoLite2-Country.mmdb
"""


def write_geoip_file(filename, metadata, the_tree, dump_item, fmt_item):
    """Write the entries in the_tree to filename.

    'metadata' and 'the_tree' are the lists returned by parse_mm_file().
    'dump_item' selects and converts leaves into (lo, hi, value) entries
    (see dump_tree), and 'fmt_item' renders one entry as an output line.
    Consecutive entries that are contiguous and share a value are merged
    into a single line.
    """
    entries = []
    dump_tree(entries, the_tree[0], dump_item)

    build_epoch = metadata[0].map['build_epoch'].int_val()
    built = time.gmtime(build_epoch)
    # Assemble "Month D YYYY" by hand: the '%-d' (unpadded day) directive
    # is a platform-specific strftime extension.
    build_date = "%s %d %s" % (time.strftime('%B', built),
                               built.tm_mday,
                               time.strftime('%Y', built))

    # The with-block guarantees the file is closed even if writing fails.
    with open(filename, 'w') as fobj:
        fobj.write(GEOIP_FILE_HEADER % build_date)
        # Range being accumulated; written once a non-mergeable entry
        # (or the end of the list) is reached.
        unwritten = None
        for entry in entries:
            if unwritten is None:
                unwritten = entry
            elif unwritten[1] + 1 == entry[0] and unwritten[2] == entry[2]:
                unwritten = (unwritten[0], entry[1], unwritten[2])
            else:
                fobj.write(fmt_item(unwritten))
                unwritten = entry
        if unwritten is not None:
            fobj.write(fmt_item(unwritten))
def main(argv):
    """Convert the mmdb file named by argv[1] into 'geoip' and 'geoip6'
    files in the current directory.

    Exits with a usage message when no input file is given.
    """
    if len(argv) < 2:
        sys.exit("Usage: %s GeoLite2-Country.mmdb" % argv[0])
    # Read the whole database up front and close the handle promptly.
    with open(argv[1], 'rb') as mmdb_file:
        content = mmdb_file.read()
    metadata, the_tree, _ = parse_mm_file(content)
    write_geoip_file('geoip', metadata, the_tree,
                     dump_item_ipv4, fmt_item_ipv4)
    write_geoip_file('geoip6', metadata, the_tree,
                     dump_item_ipv6, fmt_item_ipv6)


# Only convert when run as a script, so the module can be imported
# without side effects.
if __name__ == '__main__':
    main(sys.argv)
|