123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183 |
- import sys
- import subprocess
- import re
class Author:
    """All the names, e-mails and full "Name <email>" IDs known for one person.

    Names and e-mails are only used to decide whether two identities belong
    to the same person; the full IDs are what ends up in the mailmap output.
    """

    # this "banned" list is just to prevent matching,
    # full IDs that contain them will still be properly rewritten
    banned = [
        None, "", "None", "none@none", "unknown",  # common null identities
        "bugs", "bugzilla", "dev",  # common context-specific identities
        # mononyms with likely collisions
        "Adam", "Alex", "alex", "Ben", "Bob", "Daniel", "Dmitry", "Ian",
        "james", "Jason", "jason", "John", "kevin", "Martin", "martin",
        "Michael", "Peter", "Petru", "Pranav", "Tyler"
    ]

    def __init__(self, names=(), emails=(), full_ids=(), main_id=None):
        """Build an identity from iterables of names, e-mails and full IDs.

        Entries listed in ``Author.banned`` are dropped from ``names`` and
        ``emails`` so they never cause two identities to match.  ``main_id``
        is the preferred full ID, if one is already known.
        """
        # Immutable defaults: nothing is shared between instances.
        self.names = {name for name in names if name not in Author.banned}
        self.emails = {email for email in emails
                       if email not in Author.banned}
        # old mozilla commits used % instead of @ a lot
        self.emails |= {email.replace('%', '@')
                        for email in self.emails if '%' in email}
        self.full_ids = set(full_ids)
        self.main_id = main_id

    def match_identities(self, other):
        """Return True when ``other`` shares at least one name or e-mail."""
        return (not self.names.isdisjoint(other.names)
                or not self.emails.isdisjoint(other.emails))

    def merge_identities(self, other):
        """Absorb the names, e-mails and full IDs of ``other`` into self.

        An existing ``main_id`` on self wins; ``other.main_id`` is only
        adopted when self has none.
        """
        self.names |= other.names
        self.emails |= other.emails
        self.full_ids |= other.full_ids
        if other.main_id and not self.main_id:
            self.main_id = other.main_id

    def generate_mailmap(self):
        """Print the mailmap lines that map every full ID to the main ID.

        With an explicit ``main_id`` that ID is printed on a line of its
        own first.  Otherwise the main ID is the shortest full ID that has
        a name and an e-mail, falling back to the shortest one with a name,
        and finally to a synthesized "none@none" ID.  Nothing is printed
        for an identity with neither a main ID nor any full ID.
        """
        # Sorted so that ties on length are broken deterministically.
        full_ids = sorted(self.full_ids)
        if self.main_id:
            main_id = self.main_id
            print(main_id)
        elif not full_ids:
            # Nothing to map; min() below would raise on an empty list.
            return
        else:
            ids_with_names = [full_id for full_id in full_ids
                              if not full_id.startswith('<')]
            ids_with_emails = [full_id for full_id in ids_with_names
                               if '<>' not in full_id]
            if ids_with_emails:
                main_id = min(ids_with_emails, key=len)
            elif ids_with_names:
                main_id = min(ids_with_names, key=len)
            else:
                # NOTE(review): no separator is inserted between the
                # placeholder and the ID -- confirm this is intended.
                main_id = "none@none" + min(full_ids, key=len)
        for full_id in full_ids:
            if full_id != main_id:
                print("{} {}".format(main_id, full_id))
class Authors:
    """Registry that folds identities sharing a name or e-mail together.

    Author objects are expected to expose ``names``, ``emails`` and
    ``full_ids`` sets plus ``match_identities`` / ``merge_identities``.
    """

    def __init__(self, author_set=None):
        """Create a registry, optionally seeded with an iterable of authors.

        The seed is only read: the registry keeps its own set, so neither
        the caller's collection nor a shared default is ever mutated.
        """
        # name -> author and e-mail -> author lookup tables
        self.names = {}
        self.emails = {}
        # the distinct authors that remain after merging
        self.authors = set()
        # Snapshot first: add_author() changes the registry while we loop.
        for author in list(author_set or ()):
            self.add_author(author)

    def add_author(self, author):
        """Register ``author`` and return the author that now represents it.

        Every already-registered author sharing a name or an e-mail with
        the newcomer is merged into a single survivor, so no full ID is
        lost when the newcomer links several previously separate authors.
        """
        # Sorted so the merge order does not depend on set iteration order.
        matches = [self.names[name]
                   for name in sorted(author.names & self.names.keys())]
        matches += [self.emails[email]
                    for email in sorted(author.emails & self.emails.keys())]
        for existing in matches:
            if existing is not author:
                # The known author absorbs everything accumulated so far.
                existing.merge_identities(author)
                author = existing
        for name in author.names - self.names.keys():
            self.names[name] = author
        for email in author.emails - self.emails.keys():
            self.emails[email] = author
        self.authors.add(author)
        # Repoint keys still owned by absorbed authors and drop those authors.
        self.dedup(author)
        return author

    def add_edge_cases(self):
        """Attach hard-coded full IDs to two specific, named authors.

        Raises KeyError when either name has not been registered yet.
        """
        # Edge cases that can't be cleanly handled generically
        self.names["Jason Orendorff"].full_ids.add("jason <none@none>")
        self.names["Glenn Watson"].full_ids.add(
            "dev <dev@devs-MacBook-Pro.local>")

    def dedup(self, author):
        """Make ``author`` the sole owner of all of its names and e-mails.

        Any other author currently registered under one of those keys is
        replaced in the lookup table and removed from ``self.authors``.
        """
        for table, keys in ((self.names, author.names),
                            (self.emails, author.emails)):
            for key in keys:
                other = table[key]
                if other is not author:
                    table[key] = author
                    self.authors.discard(other)

    def scan_and_merge(self, author, identities):
        """Merge ``author`` into the ``identities`` set, collapsing matches.

        Each matching identity absorbs the current one and is then itself
        re-checked against the rest.  The absorbing identity is taken out
        of the set while it is re-checked; otherwise two identities that
        have just been merged would keep matching each other forever.
        """
        while True:
            match = next((other for other in identities
                          if other is not author
                          and author.match_identities(other)), None)
            if match is None:
                break
            identities.discard(match)
            match.merge_identities(author)
            author = match
        identities.add(author)
def build_full_id(name, email):
    """Return the full ID string "Name <email>" for a name/e-mail pair.

    A missing or empty name yields "<email>" with no leading space, so the
    result starts with '<' exactly when there is no name.  A missing e-mail
    yields empty angle brackets.
    """
    prefix = name + " " if name else ""
    return "{}<{}>".format(prefix, email or "")
authors = Authors()

# One .mailmap entry, compiled once because it is the same for every line.
mailmap_entry = re.compile(
    # In English: capture a string we're calling name1, which does not
    # contain the character to start an email, followed by a non-zero
    # number of whitespace characters, but let all of this be optional.
    r"(?:(?P<name1>[^<]+)\s+)?"
    # In English: capture a string we're calling email1, which is preceded
    # by a < and followed by a >, containing no >. Non-optional.
    r"<(?P<email1>[^>]+)>"
    # In English: same as the name1 regex, but with the non-zero whitespace
    # preceding instead of following (and called name2).
    r"(?:\s+(?P<name2>[^<]*[^<\s]))?"
    # In English: same as email1 regex, but with non-zero whitespace
    # preceding the matched email, and all optional (and called email2).
    r"(?:\s+<(?P<email2>[^>]+)>)?"
)

# first, include the existing .mailmap, if there is one
try:
    with open(sys.argv[1] + '/.mailmap') as f:
        mailmap_lines = f.readlines()
except FileNotFoundError:
    # No .mailmap yet: start from the git history alone.
    mailmap_lines = []

for line in mailmap_lines:
    # Drop comments and surrounding whitespace (including the newline).
    line = line.split('#')[0].strip()
    entry = mailmap_entry.match(line)
    if entry is None:
        # Blank, comment-only or otherwise unparseable line.
        continue
    d = {k: v.strip() for k, v in entry.groupdict().items() if v is not None}
    # someone took the time to add this to the .mailmap, so we should
    # respect their preferred name, rather than treat them all the same
    main_id = build_full_id(d.get("name1"), d.get("email1"))
    if d.get("email2") is not None:
        full_ids = {build_full_id(d.get("name2"), d.get("email2"))}
    else:
        full_ids = set()
    author = Author({d.get("name1"), d.get("name2")},
                    {d.get("email1"), d.get("email2")},
                    full_ids, main_id)
    authors.add_author(author)

# Argument list instead of a shell string, so a path containing spaces or
# shell metacharacters is passed to git verbatim.
log = subprocess.check_output(
    ["git", "log", "--full-history", "--no-merges", "--use-mailmap",
     "--format=format:%aN %aE", "--", sys.argv[1]],
    universal_newlines=True)
# sorted(set(...)) stands in for the former "| sort | uniq".
for entry in sorted(set(log.splitlines())):
    if not entry.strip():
        continue
    # The e-mail is the last space-separated field; the name itself may
    # contain spaces, so split from the right.
    name, _, email = entry.rpartition(' ')
    # An empty name is passed as None so the full ID gets no leading space.
    name = name or None
    full_id = build_full_id(name, email)
    author = Author({name}, {email}, {full_id})
    authors.add_author(author)

for author in authors.authors:
    author.generate_mailmap()
|