- """
- Script that runs several docker containers which in turn runs an analysis on
- a git repository.
- """
- __author__ = "Oscar Svensson"
- __copyright__ = "Copyright (c) 2018 Axis Communications AB"
- __license__ = "MIT"

import os
import sys
import shutil
import time
from argparse import ArgumentParser
from distutils.dir_util import copy_tree
from multiprocessing import Process, cpu_count

from git import Repo
from tqdm import tqdm
import docker

def start_container(client, image, name, repo_dir, result_dir):
    """
    Start a docker container, mounting the repository and a result directory
    into it. Any existing container with the same name is removed first.
    """
    for container in client.containers.list(all=True):
        if name == container.name:
            if container.status == "running":
                container.kill()
            container.remove()

    path = os.path.abspath('./')
    container = client.containers.run(
        image,
        name=name,
        stdin_open=True,
        detach=True,
        volumes={
            path + "/scripts": {
                'bind': '/root/scripts',
                'mode': 'rw'
            },
            result_dir: {
                'bind': '/root/results',
                'mode': 'rw'
            },
            os.path.abspath(repo_dir): {
                'bind': '/root/repo',
                'mode': 'rw'
            }
        },
        command="bash")
    return container
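
# A minimal sketch of exercising start_container on its own, assuming a running
# docker daemon and a locally available "code-maat" image (both assumptions,
# nothing is verified here). Wrapped in a function so that importing this
# module stays free of side effects.
def _demo_start_container():
    client = docker.from_env()
    container = start_container(
        client,
        image="code-maat",
        name="analysis_demo",
        repo_dir="./repos/demo0",
        result_dir=os.path.abspath("./results/data0"))
    print(container.name, container.status)
    # Clean up the demo container again.
    container.remove(force=True)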

def run_command(container, command):
    """
    Execute a command inside a container.
    """
    return container.exec_run(
        cmd="bash -c \"{}\"".format(command), tty=True, privileged=True)

def run_analysis(t_id, container, commits):
    """
    Run the analysis script inside a container, once for every commit in the
    given chunk.
    """
    for commit in tqdm(
            commits, desc="Progress process {}".format(t_id), position=t_id):
        run_command(container,
                    "/root/scripts/analyse_commit {}".format(commit))

def copy_repo(src, dest):
    """
    Helper function to copy a repository to another destination.
    """
    try:
        shutil.copytree(src, dest)
    except (shutil.Error, OSError) as exp:
        print("Directory not copied. Error: {}".format(exp))

def partition_commits(commits, partitions):
    """
    Divide the commits into evenly sized partitions.
    """
    quotient, remainder = divmod(len(commits), partitions)
    chunks = [(i * quotient + min(i, remainder),
               (i + 1) * quotient + min(i + 1, remainder))
              for i in range(partitions)]
    return [commits[start:end] for start, end in chunks]
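
# A quick illustration of the partitioning: ten commits split over four
# containers come out as chunks of sizes 3, 3, 2 and 2 (the commit names here
# are placeholders, not real hashes).
def _demo_partition_commits():
    commits = ["commit{}".format(i) for i in range(10)]
    for chunk in partition_commits(commits, 4):
        print(len(chunk), chunk)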

def start_analysis(image, result_dir, commits=None, cpus=cpu_count()):
    """
    Start one docker container per cpu and let each of them analyse its own
    chunk of the repository's commits.
    """
    client = docker.from_env()
    repo = Repo(REPO)

    # The containers work directly on the repository, so each one needs its
    # own copy.
    if not os.path.exists("./repos"):
        os.makedirs("./repos")
    repo_name = os.path.basename(os.path.normpath(REPO))
    for cpu in range(cpus):
        copy_repo(REPO, "./repos/{}{}".format(repo_name, cpu))

    # Split the commits into even parts.
    if not commits:
        commits = [
            str(commit.hexsha) for commit in list(repo.iter_commits('master'))
        ]
    commits = partition_commits(commits, cpus)

    containers = []
    for cpu in range(cpus):
        container = start_container(
            client,
            image=image,
            name="analysis_{}_cpu_{}".format(repo_name, cpu),
            repo_dir="./repos/{}{}".format(repo_name, cpu),
            result_dir=result_dir + "/data{}".format(cpu))
        containers.append(container)

    processes = [
        Process(target=run_analysis, args=(i, containers[i], commits[i]))
        for i in range(cpus)
    ]
    for process in processes:
        process.start()
    for process in processes:
        process.join()

    for container in containers:
        print(container.status)
        print(container.name)
        if container.status not in ("exited", "dead"):
            container.kill()
        container.remove()
    shutil.rmtree("./repos", ignore_errors=True)
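
# A sketch of driving the whole analysis programmatically rather than through
# the CLI below. REPO must point at the repository to analyse; the repository
# path, image name and result directory are assumptions for illustration.
def _demo_start_analysis():
    global REPO
    REPO = os.path.abspath("../../jenkins")
    start_analysis("code-maat", "/tmp/results", cpus=2)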

def parse_commits(commit_file):
    """
    Read the commits from a file and return the content.
    """
    if not os.path.exists(commit_file):
        print("commit_file doesn't exist!", file=sys.stderr)
        sys.exit(1)

    with open(commit_file, 'r') as cfile:
        commits = [line.strip() for line in cfile.readlines()]
    return commits
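
# The commit file is expected to contain one commit hash per line. A hedged
# sketch of generating such a file from the repository itself and reading it
# back (the output path is an illustrative assumption):
def _demo_write_commit_file(repo_path, out_file="./commits_to_analyse.txt"):
    repo = Repo(repo_path)
    with open(out_file, 'w') as cfile:
        for commit in repo.iter_commits('master'):
            cfile.write(commit.hexsha + '\n')
    return parse_commits(out_file)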

def assemble_directories(result_path, cpus=cpu_count()):
    """
    Copy all results into a single directory.
    """
    result_path = os.path.abspath(result_path)
    paths = ["{}/data{}".format(result_path, i) for i in range(cpus)]
    if not all([os.path.exists(p) for p in paths]):
        print("Not all data paths exist!", file=sys.stderr)
        return

    files = []
    for path in paths:
        for item in os.listdir(path):
            commit = os.path.join(path, item)
            # A complete result directory contains exactly two files; anything
            # else is treated as corrupt and skipped.
            if os.path.isdir(commit) and len(os.listdir(commit)) == 2:
                files.append((commit, item))

    print("Saving all analysed commits into a single directory: {}/data_all".
          format(result_path))
    if not os.path.exists("{}/data_all".format(result_path)):
        os.makedirs("{}/data_all".format(result_path))
    for commit_dir, commit in files:
        if not os.path.exists("{}/data_all/{}".format(result_path, commit)):
            copy_tree(commit_dir, "{}/data_all/{}".format(result_path, commit))
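
# For reference, assemble_directories expects the per-container results to be
# laid out as <result_path>/data0/<commit>/..., <result_path>/data1/<commit>/...
# and so on, and merges every complete commit directory into
# <result_path>/data_all/<commit>. A hedged example call, assuming two
# containers were used (the path is an illustrative assumption):
#
#     assemble_directories("/tmp/results", cpus=2)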

def check_for_missing_commits(repo_path, result_path):
    """
    Check whether every commit in the repository has been analysed.
    """
    result_dir = os.path.abspath(result_path)
    if not os.path.exists(result_dir):
        print("Result path doesn't exist!", file=sys.stderr)
        return

    repo = Repo(repo_path)
    current_commits = os.listdir(result_dir)
    all_repo_commits = [c.hexsha for c in list(repo.iter_commits('master'))]

    missing_commits = set(all_repo_commits) - set(current_commits)
    if missing_commits:
        with open("./missing_commits.txt", 'w') as cfile:
            for commit in missing_commits:
                cfile.write(commit)
                cfile.write('\n')
        print("Wrote missing commits to missing_commits.txt")

if __name__ == "__main__":
    PARSER = ArgumentParser(description="Utility to run several docker " +
                            "containers on a git repository. " +
                            "Each container is given a set of " +
                            "commits and is instructed to run " +
                            "an analysis on each one of them.")
    PARSER.add_argument(
        "--analyse", "-a", action="store_true", help="Run an analysis.")
    PARSER.add_argument(
        "--image",
        "-i",
        type=str,
        default="code-maat",
        help="Specification of which image to use.")
    PARSER.add_argument(
        "--repo-dir",
        "-r",
        type=str,
        default="../../jenkins",
        help="Specification of which repo to use.")
    PARSER.add_argument(
        "--result-dir",
        "-rd",
        type=str,
        default="/h/oskars",
        help="Specification of where to store the result.")
    PARSER.add_argument(
        "--commits",
        "-c",
        type=str,
        default=None,
        help="Path to a file containing commits to analyse.")
    PARSER.add_argument(
        "--assemble",
        "-as",
        action="store_true",
        help="Assemble the results into a single directory.")
    PARSER.add_argument(
        "--missing-commits",
        "-mc",
        action="store_true",
        help="Check for non-analysed commits.")
    ARGS = PARSER.parse_args()

    REPO = os.path.abspath(ARGS.repo_dir)
    if ARGS.commits:
        COMMITS = parse_commits(ARGS.commits)
    else:
        COMMITS = []

    if ARGS.analyse:
        print("Starting the analysis using {} cpus...".format(cpu_count()))
        START = time.time()
        if COMMITS:
            start_analysis(ARGS.image, ARGS.result_dir, commits=COMMITS)
        else:
            start_analysis(ARGS.image, ARGS.result_dir)
        STOP = time.time()

        print("Done in {}".format(
            time.strftime('%H:%M:%S', time.gmtime(STOP - START))))
        print("Results can be found in {}".format(
            ARGS.result_dir + "/data{" +
            ','.join(["{}".format(i) for i in range(cpu_count())]) + "}"))

    if ARGS.assemble:
        assemble_directories(ARGS.result_dir)

    if ARGS.missing_commits:
        check_for_missing_commits(ARGS.repo_dir, ARGS.result_dir)
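
# Hedged examples of how this script can be invoked (the file name
# docker_analysis.py is an assumption; substitute the actual script name):
#
#     python docker_analysis.py --analyse --image code-maat \
#         --repo-dir ../../jenkins --result-dir /tmp/results
#     python docker_analysis.py --assemble --result-dir /tmp/results
#     python docker_analysis.py --missing-commits --repo-dir ../../jenkins \
#         --result-dir /tmp/results/data_all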