"""Generate .mailmap entries that unify the author identities of a repo.

Usage: run with the repository path as the only argument.  The existing
``<path>/.mailmap`` (if any) is read first, then every ``name email`` pair
reported by ``git log`` for that path is folded in.  Identities sharing a
name or an email are merged, and the resulting mailmap lines are printed
to stdout.
"""
import re
import shlex
import subprocess
import sys


class Author:
    """A person, described by every name, email and full ID seen for them.

    A "full ID" is a string in the ``Name <email>`` form produced by
    ``build_full_id``.
    """

    # this "banned" list is just to prevent matching,
    # full IDs that contain them will still be properly rewritten
    banned = [
        None, "", "None", "none@none", "unknown",  # common null identities
        "bugs", "bugzilla", "dev",  # common context-specific identities
        # mononyms with likely collisions
        "Adam", "Alex", "alex", "Ben", "Bob", "Daniel", "Dmitry", "Ian",
        "james", "Jason", "jason", "John", "kevin", "Martin", "martin",
        "Michael", "Peter", "Petru", "Pranav", "Tyler"
    ]

    def __init__(self, names=(), emails=(), full_ids=(), main_id=None):
        """Build an author from iterables of names, emails and full IDs.

        Names and emails listed in ``Author.banned`` are dropped so they
        are never used for matching.  ``main_id`` is the preferred full ID,
        or None when no preference is known.
        """
        self.names = {name for name in names if name not in Author.banned}
        self.emails = {
            email for email in emails if email not in Author.banned
        }
        # old mozilla commits used % instead of @ a lot
        # (iterate over a copy because the set grows inside the loop)
        for email in set(self.emails):
            if '%' in email:
                self.emails.add(email.replace('%', '@'))
        self.full_ids = set(full_ids)
        self.main_id = main_id

    def match_identities(self, other):
        """Return True when self and other share a name or an email."""
        return (not self.names.isdisjoint(other.names)
                or not self.emails.isdisjoint(other.emails))

    def merge_identities(self, other):
        """Absorb other's names, emails and full IDs into self.

        other's main_id is adopted only when self has none of its own.
        """
        self.names |= other.names
        self.emails |= other.emails
        self.full_ids |= other.full_ids
        if other.main_id and not self.main_id:
            self.main_id = other.main_id

    def generate_mailmap(self):
        """Print this author's mailmap lines to stdout.

        With an explicit main_id, that ID is printed on a line of its own
        first.  Otherwise a main ID is chosen from the full IDs: the
        shortest one that has both a name and an email, else the shortest
        one that has a name, else a placeholder built from the shortest ID.
        Every other full ID is then printed as ``<main id> <other id>``.
        """
        full_ids = sorted(self.full_ids)
        if self.main_id:
            main_id = self.main_id
            print(main_id)
        elif not full_ids:
            # Nothing to choose a main ID from and nothing to map.
            return
        else:
            ids_with_names = [
                full_id for full_id in full_ids
                if not full_id.startswith('<')
            ]
            ids_with_emails = [
                full_id for full_id in ids_with_names
                if '<>' not in full_id
            ]
            if ids_with_emails:
                main_id = min(ids_with_emails, key=len)
            elif ids_with_names:
                main_id = min(ids_with_names, key=len)
            else:
                main_id = "none@none" + min(full_ids, key=len)
        for full_id in full_ids:
            if full_id != main_id:
                print("{} {}".format(main_id, full_id))


class Authors:
    """Registry of Author objects indexed by each of their names/emails."""

    def __init__(self, author_set=None):
        """Create the registry, optionally seeded from an iterable.

        The seed is copied: the registry never aliases (or mutates) the
        caller's collection, and instances never share state through a
        default argument.
        """
        self.names = {}
        self.emails = {}
        self.authors = set()
        for author in tuple(author_set or ()):
            self.add_author(author)

    def add_author(self, author):
        """Register author, merging it into a known author on a match.

        Returns the Author object that represents this identity in the
        registry afterwards (either ``author`` itself or the existing
        author it was merged into).
        """
        existing_names = author.names & self.names.keys()
        existing_emails = author.emails & self.emails.keys()
        if existing_names:
            name = existing_names.pop()
            self.names[name].merge_identities(author)
            author = self.names[name]
        if existing_emails:
            email = existing_emails.pop()
            self.emails[email].merge_identities(author)
            author = self.emails[email]
        for name in author.names - self.names.keys():
            self.names[name] = author
        for email in author.emails - self.emails.keys():
            self.emails[email] = author
        self.authors.add(author)
        self.dedup(author)
        return author

    def add_edge_cases(self):
        """Attach hand-picked full IDs to specific, already known authors.

        Raises KeyError when one of the named authors is not registered.
        """
        # Edge cases that can't be cleanly handled generically
        # NOTE(review): both IDs end in a bare space with no <email> part;
        # they look truncated -- confirm the intended values.
        self.names["Jason Orendorff"].full_ids.add("jason ")
        self.names["Glenn Watson"].full_ids.add("dev ")

    def dedup(self, author):
        """Point all of author's names/emails at author in the indexes.

        Any different Author object previously indexed under one of those
        keys is dropped from ``self.authors``.
        """
        for index, keys in ((self.names, author.names),
                            (self.emails, author.emails)):
            for key in keys:
                other = index[key]
                if author != other:
                    index[key] = author
                    self.authors.discard(other)

    def scan_and_merge(self, author, identities):
        """Merge author into the first matching member of identities.

        After a merge, the enlarged member is re-scanned against the set,
        since it may now match further members.  When nothing matches,
        author is added to identities.
        """
        for other in identities:
            if author != other and author.match_identities(other):
                other.merge_identities(author)
                self.scan_and_merge(other, identities)
                return
        identities.add(author)


def build_full_id(name, email):
    """Return ``name <email>``; a None name or email is left empty."""
    name = "" if name is None else name + " "
    if email is None:
        email = ""
    return "{}<{}>".format(name, email)


# Pattern for one mailmap entry, built from four pieces:
# In English: capture a string we're calling name1, which does not
# contain the character to start an email, followed by a non-zero
# number of whitespace characters, but let all of this be optional.
_NAME1_REGEX = r"(?:(?P<name1>[^<]+)\s+)?"
# In English: capture a string we're calling email1, which is preceded
# by a < and followed by a >, containing no >. Non-optional.
_EMAIL1_REGEX = r"<(?P<email1>[^>]+)>"
# In English: same as the name1 regex, but with the non-zero whitespace
# preceding instead of following (and called name2).
_NAME2_REGEX = r"(?:\s+(?P<name2>[^<]*[^<\s]))?"
# In English: same as email1 regex, but with non-zero whitespace
# preceding the matched email, and all optional (and called email2).
_EMAIL2_REGEX = r"(?:\s+<(?P<email2>[^>]+)>)?"
_MAILMAP_LINE_RE = re.compile(
    _NAME1_REGEX + _EMAIL1_REGEX + _NAME2_REGEX + _EMAIL2_REGEX
)


def parse_mailmap_line(line):
    """Turn one .mailmap line into an Author, or None if it has no entry.

    Everything from the first ``#`` on is treated as a comment.  Blank
    lines and lines that do not match the entry pattern yield None.
    """
    line = line.split('#')[0].strip()
    if not line:
        return None
    match = _MAILMAP_LINE_RE.match(line)
    if match is None:
        return None
    d = {k: v.strip() for k, v in match.groupdict().items() if v is not None}
    # someone took the time to add this to the .mailmap, so we should
    # respect their preferred name, rather than treat them all the same
    main_id = build_full_id(d.get("name1"), d.get("email1"))
    if d.get("email2") is not None:
        full_ids = {build_full_id(d.get("name2"), d.get("email2"))}
    else:
        full_ids = set()
    return Author({d.get("name1"), d.get("name2")},
                  {d.get("email1"), d.get("email2")},
                  full_ids, main_id)


def parse_log_line(line):
    """Turn one ``name email`` line of git-log output into an Author.

    The split is made at the LAST space, so names that themselves contain
    spaces are kept whole.  An empty line yields None.
    """
    if not line:
        return None
    name, _, email = line.rpartition(' ')
    return Author({name}, {email}, {build_full_id(name, email)})


def main(argv=None):
    """Read .mailmap and git history for argv[1], print merged mailmap."""
    argv = sys.argv if argv is None else argv
    if len(argv) < 2:
        sys.exit("usage: {} <repository path>".format(argv[0] if argv else ""))
    repo_path = argv[1]

    authors = Authors()

    # first, include the existing .mailmap, if there is one
    try:
        with open(repo_path + '/.mailmap') as f:
            for line in f:
                author = parse_mailmap_line(line)
                if author is not None:
                    authors.add_author(author)
    except FileNotFoundError:
        pass

    # The path is shell-quoted because the pipeline runs through a shell.
    command = "git log --full-history --no-merges --use-mailmap "\
              "--format='format:%aN %aE' -- {} | sort | uniq".format(
                  shlex.quote(repo_path))
    log = subprocess.check_output(command, shell=True,
                                  universal_newlines=True).rstrip()
    for line in log.splitlines():
        author = parse_log_line(line)
        if author is not None:
            authors.add_author(author)

    for author in authors.authors:
        author.generate_mailmap()


if __name__ == "__main__":
    main()