author-identities.py 6.9 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183
  1. import sys
  2. import subprocess
  3. import re
  4. class Author:
  5. # this "banned" list is just to prevent matching,
  6. # full IDs that contain them will still be properly rewritten
  7. banned = [
  8. None, "", "None", "none@none", "unknown", # common null identities
  9. "bugs", "bugzilla", "dev", # common context-specific identities
  10. # mononyms with likely collisions
  11. "Adam", "Alex", "alex", "Ben", "Bob", "Daniel", "Dmitry", "Ian",
  12. "james", "Jason", "jason", "John", "kevin", "Martin", "martin",
  13. "Michael", "Peter", "Petru", "Pranav", "Tyler"
  14. ]
  15. def __init__(self, names=[], emails=[], full_ids=[], main_id=None):
  16. self.names = {name for name in names if name not in Author.banned}
  17. self.emails = {email for email in emails if email not in Author.banned}
  18. # old mozilla commits used % instead of @ a lot
  19. for email in set(self.emails):
  20. if '%' in email:
  21. self.emails.add(email.replace('%', '@'))
  22. self.full_ids = {full_id for full_id in full_ids}
  23. self.main_id = main_id
  24. def match_identities(self, other):
  25. for name in self.names:
  26. if name in other.names:
  27. return True
  28. for email in self.emails:
  29. if email in other.emails:
  30. return True
  31. return False
  32. def merge_identities(self, other):
  33. self.names |= other.names
  34. self.emails |= other.emails
  35. self.full_ids |= other.full_ids
  36. if other.main_id and not self.main_id:
  37. self.main_id = other.main_id
  38. def generate_mailmap(self):
  39. full_ids = list(self.full_ids)
  40. full_ids.sort()
  41. if self.main_id:
  42. main_id = self.main_id
  43. print(main_id)
  44. else:
  45. ids_with_names = [full_id for full_id in full_ids
  46. if full_id[0] != '<']
  47. if ids_with_names:
  48. ids_with_emails = [full_id for full_id in ids_with_names
  49. if '<>' not in full_id]
  50. if ids_with_emails:
  51. main_id = min(ids_with_emails, key=len)
  52. else:
  53. main_id = min(ids_with_names, key=len)
  54. else:
  55. main_id = "none@none" + min(full_ids, key=len)
  56. secondary_ids = [full_id for full_id in full_ids if full_id != main_id]
  57. for full_id in secondary_ids:
  58. print("{} {}".format(main_id, full_id))
  59. class Authors:
  60. def __init__(self, author_set=set()):
  61. self.names = {}
  62. self.emails = {}
  63. self.authors = author_set
  64. for author in author_set:
  65. self.add_author(author)
  66. def add_author(self, author):
  67. existing_names = author.names & self.names.keys()
  68. existing_emails = author.emails & self.emails.keys()
  69. if existing_names:
  70. name = existing_names.pop()
  71. self.names[name].merge_identities(author)
  72. author = self.names[name]
  73. if existing_emails:
  74. email = existing_emails.pop()
  75. self.emails[email].merge_identities(author)
  76. author = self.emails[email]
  77. for name in author.names - self.names.keys():
  78. self.names[name] = author
  79. for email in author.emails - self.emails.keys():
  80. self.emails[email] = author
  81. self.authors.add(author)
  82. self.dedup(author)
  83. return author
  84. def add_edge_cases(self):
  85. # Edge cases that can't be cleanly handled generically
  86. self.names["Jason Orendorff"].full_ids.add("jason <none@none>")
  87. self.names["Glenn Watson"].full_ids.add("dev <dev@devs-MacBook-Pro.local>")
  88. def dedup(self, author):
  89. for name in author.names:
  90. other = self.names[name]
  91. if author != other:
  92. self.names[name] = author
  93. if other in self.authors:
  94. self.authors.remove(other)
  95. for email in author.emails:
  96. other = self.emails[email]
  97. if author != other:
  98. self.emails[email] = author
  99. if other in self.authors:
  100. self.authors.remove(other)
  101. def scan_and_merge(self, author, identities):
  102. for other in identities:
  103. if author != other and author.match_identities(other):
  104. other.merge_identities(author)
  105. self.scan_and_merge(other, identities)
  106. return
  107. identities.add(author)
  108. def build_full_id(name, email):
  109. if name is None:
  110. name = ""
  111. else:
  112. name += " "
  113. if email is None:
  114. email = ""
  115. return "{}<{}>".format(name, email)
  116. authors = Authors()
  117. # first, include the existing .mailmap, if there is one
  118. with open(sys.argv[1] + '/.mailmap') as f:
  119. for line in f.readlines():
  120. line = line.split('#')[0]
  121. if not len(line) > 0:
  122. continue
  123. # In English: capture a string we're calling name1, which does not
  124. # contain the character to start an email, followed by a non-zero
  125. # number of whitespace characters, but let all of this be optional.
  126. name1_regex = r"(?:(?P<name1>[^<]+)\s+)?"
  127. # In English: capture a string we're calling email1, which is preceded
  128. # by a < and followed by a >, containing no >. Non-optional.
  129. email1_regex = r"<(?P<email1>[^>]+)>"
  130. # In English: same as the name1 regex, but with the non-zero whitespace
  131. # preceding instead of following (and called name2).
  132. name2_regex = r"(?:\s+(?P<name2>[^<]*[^<\s]))?"
  133. # In English: same as email1 regex, but with non-zero whitespace
  134. # preceding the matched email, and all optional (and called email2).
  135. email2_regex = r"(?:\s+<(?P<email2>[^>]+)>)?"
  136. d = re.match(name1_regex + email1_regex + name2_regex + email2_regex,
  137. line).groupdict()
  138. d = {k: d[k].strip() for k in d if d[k] is not None}
  139. # someone took the time to add this to the .mailmap, so we should
  140. # respect their preferred name, rather than treat them all the same
  141. main_id = build_full_id(d.get("name1"), d.get("email1"))
  142. if d.get("email2") is not None:
  143. full_ids = {build_full_id(d.get("name2"), d.get("email2"))}
  144. else:
  145. full_ids = set()
  146. author = Author({d.get("name1"), d.get("name2")},
  147. {d.get("email1"), d.get("email2")},
  148. full_ids, main_id)
  149. authors.add_author(author)
  150. command = "git log --full-history --no-merges --use-mailmap "\
  151. "--format='format:%aN %aE' -- {} | sort | uniq".format(sys.argv[1])
  152. log = subprocess.check_output(command, shell=True,
  153. universal_newlines=True).rstrip()
  154. for author in log.splitlines():
  155. name, email = author.split(' ')
  156. full_id = build_full_id(name, email)
  157. author = Author({name}, {email}, {full_id})
  158. authors.add_author(author)
  159. for author in authors.authors:
  160. author.generate_mailmap()