mmdb-convert.py 14 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466
  1. #!/usr/bin/python
  2. # This software has been dedicated to the public domain under the CC0
  3. # public domain dedication.
  4. #
  5. # To the extent possible under law, the person who associated CC0
  6. # with mmdb-convert.py has waived all copyright and related or
  7. # neighboring rights to mmdb-convert.py.
  8. #
  9. # You should have received a copy of the CC0 legalcode along with this
  10. # work in doc/cc0.txt. If not, see
  11. # <http://creativecommons.org/publicdomain/zero/1.0/>.
  12. # Nick Mathewson is responsible for this kludge, but takes no
  13. # responsibility for it.
  14. """This kludge is meant to
  15. parse mmdb files in sufficient detail to dump out the old format
  16. that Tor expects. It's also meant to be pure-python.
  17. When given a simplicity/speed tradeoff, it opts for simplicity.
  18. You will not understand the code without understanding the MaxMind-DB
  19. file format. It is specified at:
  20. https://github.com/maxmind/MaxMind-DB/blob/master/MaxMind-DB-spec.md.
  21. This isn't so much tested. When it breaks, you get to keep both
  22. pieces.
  23. """
  24. import struct
  25. import bisect
  26. import socket
  27. import binascii
  28. import sys
  29. import time
  30. METADATA_MARKER = b'\xab\xcd\xefMaxMind.com'
  31. # Here's some python2/python3 junk. Better solutions wanted.
  32. try:
  33. ord(b"1"[0])
  34. except TypeError:
  35. def byte_to_int(b):
  36. "convert a single element of a bytestring to an integer."
  37. return b
  38. else:
  39. byte_to_int = ord
  40. # Here's some more python2/python3 junk. Better solutions wanted.
  41. try:
  42. str(b"a", "utf8")
  43. except TypeError:
  44. bytesToStr = str
  45. else:
  46. def bytesToStr(b):
  47. "convert a bytestring in utf8 to a string."
  48. return str(b, 'utf8')
  49. def to_int(s):
  50. "Parse a big-endian integer from bytestring s."
  51. result = 0
  52. for c in s:
  53. result *= 256
  54. result += byte_to_int(c)
  55. return result
  56. def to_int24(s):
  57. "Parse a pair of big-endian 24-bit integers from bytestring s."
  58. a, b, c = struct.unpack("!HHH", s)
  59. return ((a <<8)+(b>>8)), (((b&0xff)<<16)+c)
  60. def to_int32(s):
  61. "Parse a pair of big-endian 32-bit integers from bytestring s."
  62. a, b = struct.unpack("!LL", s)
  63. return a, b
  64. def to_int28(s):
  65. "Parse a pair of big-endian 28-bit integers from bytestring s."
  66. a, b = struct.unpack("!LL", s + b'\x00')
  67. return (((a & 0xf0) << 20) + (a >> 8)), ((a & 0x0f) << 24) + (b >> 8)
  68. class Tree(object):
  69. "Holds a node in the tree"
  70. def __init__(self, left, right):
  71. self.left = left
  72. self.right = right
  73. def resolve_tree(tree, data):
  74. """Fill in the left_item and right_item fields for all values in the tree
  75. so that they point to another Tree, or to a Datum, or to None."""
  76. d = Datum(None, None, None, None)
  77. def resolve_item(item):
  78. "Helper: resolve a single index."
  79. if item < len(tree):
  80. return tree[item]
  81. elif item == len(tree):
  82. return None
  83. else:
  84. d.pos = (item - len(tree) - 16)
  85. p = bisect.bisect_left(data, d)
  86. assert data[p].pos == d.pos
  87. return data[p]
  88. for t in tree:
  89. t.left_item = resolve_item(t.left)
  90. t.right_item = resolve_item(t.right)
  91. def parse_search_tree(s, record_size):
  92. """Given a bytestring and a record size in bits, parse the tree.
  93. Return a list of nodes."""
  94. record_bytes = (record_size*2) // 8
  95. nodes = []
  96. p = 0
  97. try:
  98. to_leftright = { 24: to_int24,
  99. 28: to_int28,
  100. 32: to_int32 }[ record_size ]
  101. except KeyError:
  102. raise NotImplementedError("Unsupported record size in bits: %d" %
  103. record_size)
  104. while p < len(s):
  105. left, right = to_leftright(s[p:p+record_bytes])
  106. p += record_bytes
  107. nodes.append( Tree(left, right ) )
  108. return nodes
  109. class Datum(object):
  110. """Holds a single entry from the Data section"""
  111. def __init__(self, pos, kind, ln, data):
  112. self.pos = pos # Position of this record within data section
  113. self.kind = kind # Type of this record. one of TP_*
  114. self.ln = ln # Length field, which might be overloaded.
  115. self.data = data # Raw bytes data.
  116. self.children = None # Used for arrays and maps.
  117. def __repr__(self):
  118. return "Datum(%r,%r,%r,%r)" % (self.pos, self.kind, self.ln, self.data)
  119. # Comparison functions used for bsearch
  120. def __lt__(self, other):
  121. return self.pos < other.pos
  122. def __gt__(self, other):
  123. return self.pos > other.pos
  124. def __eq__(self, other):
  125. return self.pos == other.pos
  126. def build_maps(self):
  127. """If this is a map or array, fill in its 'map' field if it's a map,
  128. and the 'map' field of all its children."""
  129. if not hasattr(self, 'nChildren'):
  130. return
  131. if self.kind == TP_ARRAY:
  132. del self.nChildren
  133. for c in self.children:
  134. c.build_maps()
  135. elif self.kind == TP_MAP:
  136. del self.nChildren
  137. self.map = {}
  138. for i in range(0, len(self.children), 2):
  139. k = self.children[i].deref()
  140. v = self.children[i+1].deref()
  141. v.build_maps()
  142. if k.kind != TP_UTF8:
  143. raise ValueError("Bad dictionary key type %d"% k.kind)
  144. self.map[bytesToStr(k.data)] = v
  145. def int_val(self):
  146. """If this is an integer type, return its value"""
  147. assert self.kind in (TP_UINT16, TP_UINT32, TP_UINT64,
  148. TP_UINT128, TP_SINT32)
  149. i = to_int(self.data)
  150. if self.kind == TP_SINT32:
  151. if i & 0x80000000:
  152. i = i - 0x100000000
  153. return i
  154. def deref(self):
  155. """If this value is a pointer, return its pointed-to-value. Chase
  156. through multiple layers of pointers if need be. If this isn't
  157. a pointer, return it."""
  158. n = 0
  159. s = self
  160. while s.kind == TP_PTR:
  161. s = s.ptr
  162. n += 1
  163. assert n < 100
  164. return s
  165. def resolve_pointers(data):
  166. """Fill in the ptr field of every pointer in data."""
  167. search = Datum(None, None, None, None)
  168. for d in data:
  169. if d.kind == TP_PTR:
  170. search.pos = d.ln
  171. p = bisect.bisect_left(data, search)
  172. assert data[p].pos == d.ln
  173. d.ptr = data[p]
  174. TP_PTR = 1
  175. TP_UTF8 = 2
  176. TP_DBL = 3
  177. TP_BYTES = 4
  178. TP_UINT16 = 5
  179. TP_UINT32 = 6
  180. TP_MAP = 7
  181. TP_SINT32 = 8
  182. TP_UINT64 = 9
  183. TP_UINT128 = 10
  184. TP_ARRAY = 11
  185. TP_DCACHE = 12
  186. TP_END = 13
  187. TP_BOOL = 14
  188. TP_FLOAT = 15
  189. def get_type_and_len(s):
  190. """Data parsing helper: decode the type value and much-overloaded 'length'
  191. field for the value starting at s. Return a 3-tuple of type, length,
  192. and number of bytes used to encode type-plus-length."""
  193. c = byte_to_int(s[0])
  194. tp = c >> 5
  195. skip = 1
  196. if tp == 0:
  197. tp = byte_to_int(s[1])+7
  198. skip = 2
  199. ln = c & 31
  200. # I'm sure I don't know what they were thinking here...
  201. if tp == TP_PTR:
  202. len_len = (ln >> 3) + 1
  203. if len_len < 4:
  204. ln &= 7
  205. ln <<= len_len * 8
  206. else:
  207. ln = 0
  208. ln += to_int(s[skip:skip+len_len])
  209. ln += (0, 0, 2048, 526336, 0)[len_len]
  210. skip += len_len
  211. elif ln >= 29:
  212. len_len = ln - 28
  213. ln = to_int(s[skip:skip+len_len])
  214. ln += (0, 29, 285, 65821)[len_len]
  215. skip += len_len
  216. return tp, ln, skip
  217. # Set of types for which 'length' doesn't mean length.
  218. IGNORE_LEN_TYPES = set([
  219. TP_MAP, # Length is number of key-value pairs that follow.
  220. TP_ARRAY, # Length is number of members that follow.
  221. TP_PTR, # Length is index to pointed-to data element.
  222. TP_BOOL, # Length is 0 or 1.
  223. TP_DCACHE, # Length is number of members that follow
  224. ])
  225. def parse_data_section(s):
  226. """Given a data section encoded in a bytestring, return a list of
  227. Datum items."""
  228. # Stack of possibly nested containers. We use the 'nChildren' member of
  229. # the last one to tell how many more items nest directly inside.
  230. stack = []
  231. # List of all items, including nested ones.
  232. data = []
  233. # Byte index within the data section.
  234. pos = 0
  235. while s:
  236. tp, ln, skip = get_type_and_len(s)
  237. if tp in IGNORE_LEN_TYPES:
  238. real_len = 0
  239. else:
  240. real_len = ln
  241. d = Datum(pos, tp, ln, s[skip:skip+real_len])
  242. data.append(d)
  243. pos += skip+real_len
  244. s = s[skip+real_len:]
  245. if stack:
  246. stack[-1].children.append(d)
  247. stack[-1].nChildren -= 1
  248. if stack[-1].nChildren == 0:
  249. del stack[-1]
  250. if d.kind == TP_ARRAY:
  251. d.nChildren = d.ln
  252. d.children = []
  253. stack.append(d)
  254. elif d.kind == TP_MAP:
  255. d.nChildren = d.ln * 2
  256. d.children = []
  257. stack.append(d)
  258. return data
  259. def parse_mm_file(s):
  260. """Parse a MaxMind-DB file."""
  261. try:
  262. metadata_ptr = s.rindex(METADATA_MARKER)
  263. except ValueError:
  264. raise ValueError("No metadata!")
  265. metadata = parse_data_section(s[metadata_ptr+len(METADATA_MARKER):])
  266. if metadata[0].kind != TP_MAP:
  267. raise ValueError("Bad map")
  268. metadata[0].build_maps()
  269. mm = metadata[0].map
  270. tree_size = (((mm['record_size'].int_val() * 2) // 8 ) *
  271. mm['node_count'].int_val())
  272. if s[tree_size:tree_size+16] != b'\x00'*16:
  273. raise ValueError("Missing section separator!")
  274. tree = parse_search_tree(s[:tree_size], mm['record_size'].int_val())
  275. data = parse_data_section(s[tree_size+16:metadata_ptr])
  276. resolve_pointers(data)
  277. resolve_tree(tree, data)
  278. for d in data:
  279. d.build_maps()
  280. return metadata, tree, data
  281. def format_datum(datum):
  282. """Given a Datum at a leaf of the tree, return the string that we should
  283. write as its value.
  284. We first try country->iso_code which is the two-character ISO 3166-1
  285. country code of the country where MaxMind believes the end user is
  286. located. If there's no such key, we try registered_country->iso_code
  287. which is the country in which the ISP has registered the IP address.
  288. Without falling back to registered_country, we'd leave out all ranges
  289. that MaxMind thinks belong to anonymous proxies, because those ranges
  290. don't contain country but only registered_country. In short: let's
  291. fill all A1 entries with what ARIN et. al think.
  292. """
  293. try:
  294. return bytesToStr(datum.map['country'].map['iso_code'].data)
  295. except KeyError:
  296. pass
  297. try:
  298. return bytesToStr(datum.map['registered_country'].map['iso_code'].data)
  299. except KeyError:
  300. pass
  301. return None
  302. IPV4_PREFIX = "0"*96
  303. def dump_item_ipv4(entries, prefix, val):
  304. """Dump the information for an IPv4 address to entries, where 'prefix'
  305. is a string holding a binary prefix for the address, and 'val' is the
  306. value to dump. If the prefix is not an IPv4 address (it does not start
  307. with 96 bits of 0), then print nothing.
  308. """
  309. if not prefix.startswith(IPV4_PREFIX):
  310. return
  311. prefix = prefix[96:]
  312. v = int(prefix, 2)
  313. shift = 32 - len(prefix)
  314. lo = v << shift
  315. hi = ((v+1) << shift) - 1
  316. entries.append((lo, hi, val))
  317. def fmt_item_ipv4(entry):
  318. """Format an IPv4 range with lo and hi addresses in decimal form."""
  319. return "%d,%d,%s\n"%(entry[0], entry[1], entry[2])
  320. def fmt_ipv6_addr(v):
  321. """Given a 128-bit integer representing an ipv6 address, return a
  322. string for that ipv6 address."""
  323. return socket.inet_ntop(socket.AF_INET6, binascii.unhexlify("%032x"%v))
  324. def fmt_item_ipv6(entry):
  325. """Format an IPv6 range with lo and hi addresses in hex form."""
  326. return "%s,%s,%s\n"%(fmt_ipv6_addr(entry[0]),
  327. fmt_ipv6_addr(entry[1]),
  328. entry[2])
  329. IPV4_MAPPED_IPV6_PREFIX = "0"*80 + "1"*16
  330. IPV6_6TO4_PREFIX = "0010000000000010"
  331. TEREDO_IPV6_PREFIX = "0010000000000001" + "0"*16
  332. def dump_item_ipv6(entries, prefix, val):
  333. """Dump the information for an IPv6 address prefix to entries, where
  334. 'prefix' is a string holding a binary prefix for the address,
  335. and 'val' is the value to dump. If the prefix is an IPv4 address
  336. (starts with 96 bits of 0), is an IPv4-mapped IPv6 address
  337. (::ffff:0:0/96), or is in the 6to4 mapping subnet (2002::/16), then
  338. print nothing.
  339. """
  340. if prefix.startswith(IPV4_PREFIX) or \
  341. prefix.startswith(IPV4_MAPPED_IPV6_PREFIX) or \
  342. prefix.startswith(IPV6_6TO4_PREFIX) or \
  343. prefix.startswith(TEREDO_IPV6_PREFIX):
  344. return
  345. v = int(prefix, 2)
  346. shift = 128 - len(prefix)
  347. lo = v << shift
  348. hi = ((v+1) << shift) - 1
  349. entries.append((lo, hi, val))
  350. def dump_tree(entries, node, dump_item, prefix=""):
  351. """Walk the tree rooted at 'node', and call dump_item on the
  352. format_datum output of every leaf of the tree."""
  353. if isinstance(node, Tree):
  354. dump_tree(entries, node.left_item, dump_item, prefix+"0")
  355. dump_tree(entries, node.right_item, dump_item, prefix+"1")
  356. elif isinstance(node, Datum):
  357. assert node.kind == TP_MAP
  358. code = format_datum(node)
  359. if code:
  360. dump_item(entries, prefix, code)
  361. else:
  362. assert node == None
  363. GEOIP_FILE_HEADER = """\
  364. # Last updated based on %s Maxmind GeoLite2 Country
  365. # wget https://geolite.maxmind.com/download/geoip/database/GeoLite2-Country.mmdb.gz
  366. # gunzip GeoLite2-Country.mmdb.gz
  367. # python mmdb-convert.py GeoLite2-Country.mmdb
  368. """
  369. def write_geoip_file(filename, metadata, the_tree, dump_item, fmt_item):
  370. """Write the entries in the_tree to filename."""
  371. entries = []
  372. dump_tree(entries, the_tree[0], dump_item)
  373. fobj = open(filename, 'w')
  374. build_epoch = metadata[0].map['build_epoch'].int_val()
  375. fobj.write(GEOIP_FILE_HEADER %
  376. time.strftime('%B %-d %Y', time.gmtime(build_epoch)))
  377. unwritten = None
  378. for entry in entries:
  379. if not unwritten:
  380. unwritten = entry
  381. elif unwritten[1] + 1 == entry[0] and unwritten[2] == entry[2]:
  382. unwritten = (unwritten[0], entry[1], unwritten[2])
  383. else:
  384. fobj.write(fmt_item(unwritten))
  385. unwritten = entry
  386. if unwritten:
  387. fobj.write(fmt_item(unwritten))
  388. fobj.close()
  389. content = open(sys.argv[1], 'rb').read()
  390. metadata, the_tree, _ = parse_mm_file(content)
  391. write_geoip_file('geoip', metadata, the_tree, dump_item_ipv4, fmt_item_ipv4)
  392. write_geoip_file('geoip6', metadata, the_tree, dump_item_ipv6, fmt_item_ipv6)