assemble_features.py 9.0 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292
  1. """
  2. Script that runs several docker containers which in turn run an analysis on
  3. a git repository.
  4. """
  5. __author__ = "Oscar Svensson"
  6. __copyright__ = "Copyright (c) 2018 Axis Communications AB"
  7. __license__ = "MIT"
  8. import os
  9. import sys
  10. import shutil
  11. import time
  12. from argparse import ArgumentParser
  13. from distutils.dir_util import copy_tree
  14. from multiprocessing import Process, cpu_count
  15. from git import Repo
  16. from tqdm import tqdm
  17. import docker
  18. def start_container(client, image, name, repo_dir, result_dir):
  19. """
  20. Function that starts a docker container and links the repo into it and
  21. a directory where the results are stored.
  22. """
  23. for container in client.containers.list(all=True):
  24. if name == container.name:
  25. if container.status == "running":
  26. container.kill()
  27. container.remove()
  28. path = os.path.abspath('./')
  29. container = client.containers.run(
  30. image,
  31. name=name,
  32. stdin_open=True,
  33. detach=True,
  34. volumes={
  35. str(path + "/scripts"): {
  36. 'bind': '/root/scripts',
  37. 'mode': 'rw'
  38. },
  39. result_dir: {
  40. 'bind': '/root/results',
  41. 'mode': 'rw'
  42. },
  43. os.path.abspath(repo_dir): {
  44. 'bind': '/root/repo',
  45. 'mode': 'rw'
  46. }
  47. },
  48. command="bash")
  49. return container
  50. def run_command(container, command):
  51. """
  52. Function that executes a command inside a container.
  53. """
  54. return container.exec_run(
  55. cmd="bash -c \"" + command + "\"", tty=True, privileged=True)
  56. def run_analysis(t_id, container, commits):
  57. """
  58. Function that runs a command inside all docker container.
  59. """
  60. for commit in tqdm(
  61. commits, desc="Progress process {}".format(t_id), position=t_id):
  62. run_command(container,
  63. "/root/scripts/analyse_commit {}".format(commit))
  64. def copy_repo(src, dest):
  65. """
  66. Helper function to copy a repository to another destination.
  67. """
  68. try:
  69. shutil.copytree(src, dest)
  70. except shutil.Error as exp:
  71. print("Directory not copied. Error: {}".format(exp))
  72. except OSError as exp:
  73. print("Directory not copied. Error: {}".format(exp))
  74. def partion_commits(commits, partitions):
  75. """
  76. Function that divides commits into evenly partitions.
  77. """
  78. quote, remainder = divmod(len(commits), partitions)
  79. chunk_commits = [(i * quote + min(i, remainder), (i + 1) * quote + min(i + 1, remainder) - 1)
  80. for i in range(partitions)]
  81. chunk_commits[-1] = (chunk_commits[-1][0], chunk_commits[-1][1] + 1)
  82. commits = [[commit for commit in commits[chunk[0]:chunk[1]]]
  83. for chunk in chunk_commits]
  84. return commits
  85. def start_analysis(image, result_dir, commits=None, cpus=cpu_count()):
  86. """
  87. This function starts a docker container that can analyze a git repository. It starts several
  88. containers if the cpus are more than one.
  89. """
  90. client = docker.from_env()
  91. repo = Repo(REPO)
  92. # Since the script is working directly on the repository, they have
  93. # to have a separately copy.
  94. if not os.path.exists("./repos"):
  95. os.makedirs("./repos")
  96. repo_name = os.path.basename(os.path.normpath(REPO))
  97. for cpu in range(cpus):
  98. copy_repo(REPO, "./repos/{}{}".format(repo_name, cpu))
  99. # Split the commits into even parts.
  100. if not commits:
  101. commits = [
  102. str(commit.hexsha) for commit in list(repo.iter_commits('master'))
  103. ]
  104. commits = partion_commits(commits, cpus)
  105. containers = []
  106. for cpu in range(cpus):
  107. container = start_container(
  108. client,
  109. image=image,
  110. name="analysis_{}_cpu_{}".format(repo_name, cpu),
  111. repo_dir="./repos/{}{}".format(repo_name, cpu),
  112. result_dir=result_dir + "/data{}".format(cpu))
  113. containers.append(container)
  114. processes = [
  115. Process(target=run_analysis, args=(i, containers[i], commits[i]))
  116. for i in range(cpus)
  117. ]
  118. for process in processes:
  119. process.start()
  120. for process in processes:
  121. process.join()
  122. for container in containers:
  123. print(container.status)
  124. print(container.name)
  125. if (container.status != "exited" or container.status != "dead"):
  126. container.kill()
  127. container.remove()
  128. shutil.rmtree("./repos", ignore_errors=True)
  129. def parse_commits(commit_file):
  130. """
  131. Read the commits from a file and reutrn the content.
  132. """
  133. if not os.path.exists(commit_file):
  134. print("commit_file doesn't exist!!", file=sys.stderr)
  135. sys.exit(1)
  136. commits = []
  137. with open(commit_file, 'r') as cfile:
  138. commits = [line.strip() for line in cfile.readlines()]
  139. return commits
  140. def assemble_directories(result_path, cpus=cpu_count()):
  141. """
  142. Copy all results into a single directory.
  143. """
  144. result_path = os.path.abspath(result_path)
  145. paths = ["{}/data{}".format(result_path, i) for i in range(cpus)]
  146. if not all([os.path.exists(p) for p in paths]):
  147. print("data paths doesn't exists!", file=sys.stderr)
  148. return
  149. files = []
  150. for path in paths:
  151. for item in os.listdir(path):
  152. commit = os.path.join(path, item)
  153. corrupt = False if (len(os.listdir(commit)) == 2) else True
  154. if (os.path.isdir(commit) and not corrupt):
  155. files.append((commit, item))
  156. print("Saving all analysed commits into a single directory: {}/data_all".
  157. format(result_path))
  158. if not os.path.exists("{}/data_all".format(result_path)):
  159. os.makedirs("{}/data_all".format(result_path))
  160. for file_tuple in files:
  161. if not os.path.exists("{}/data_all/{}".format(result_path, file_tuple[1])):
  162. copy_tree(file_tuple[0], "{}/data_all/{}".format(result_path, file_tuple[1]))
  163. def check_for_missing_commits(repo_path, result_path):
  164. """
  165. Controller function that checks if all commits has been analyzed.
  166. """
  167. result_dir = os.path.abspath(result_path)
  168. if not os.path.exists(result_path):
  169. print("Result path doesn't exist!", file=sys.stderr)
  170. return
  171. repo = Repo(repo_path)
  172. current_commits = []
  173. for item in os.listdir(result_dir):
  174. current_commits.append(item)
  175. all_repo_commits = [c.hexsha for c in list(repo.iter_commits('master'))]
  176. missing_commits = set(all_repo_commits) - set(current_commits)
  177. if missing_commits:
  178. with open("./missing_commits.txt", 'w') as cfile:
  179. for commit in missing_commits:
  180. cfile.write(commit)
  181. cfile.write('\n')
  182. print("Wrote missing commits to missing_commits.txt")
  183. if __name__ == "__main__":
  184. PARSER = ArgumentParser(description="Utility to run several docker " +
  185. "containers onto a git repository. " +
  186. "Each container is given a set of " +
  187. "commits and is instructed to run " +
  188. "an analysis on each one of them.")
  189. PARSER.add_argument(
  190. "--analyse", "-a", action="store_true", help="Run an analysation.")
  191. PARSER.add_argument(
  192. "--image",
  193. "-i",
  194. type=str,
  195. default="code-maat",
  196. help="Specification of which image to use.")
  197. PARSER.add_argument(
  198. "--repo-dir",
  199. "-r",
  200. type=str,
  201. default="../../jenkins",
  202. help="Specification of which repo to use.")
  203. PARSER.add_argument(
  204. "--result-dir",
  205. "-rd",
  206. type=str,
  207. default="/h/oskars",
  208. help="Specification of where to store the result.")
  209. PARSER.add_argument(
  210. "--commits",
  211. "-c",
  212. type=str,
  213. default=None,
  214. help="Direction to a file containing commits to analyse.")
  215. PARSER.add_argument(
  216. "--assemble",
  217. "-as",
  218. action="store_true",
  219. help="Assemble the results into a single directory.")
  220. PARSER.add_argument(
  221. "--missing-commits",
  222. "-mc",
  223. action="store_true",
  224. help="Check for non analysed commits.")
  225. ARGS = PARSER.parse_args()
  226. global REPO
  227. REPO = os.path.abspath(ARGS.repo_dir)
  228. if ARGS.commits:
  229. COMMITS = parse_commits(ARGS.commits)
  230. else:
  231. COMMITS = []
  232. CLIENT = docker.from_env()
  233. if ARGS.analyse:
  234. print("Starting the analysis using {} cpus...".format(cpu_count()))
  235. START = time.time()
  236. if COMMITS:
  237. start_analysis(ARGS.image, ARGS.result_dir, commits=COMMITS)
  238. else:
  239. start_analysis(ARGS.image, ARGS.result_dir)
  240. STOP = time.time()
  241. print("Done in {}".format(
  242. time.strftime('%H:%M:%S', time.gmtime(STOP - START))))
  243. print("Results can be found in {}".format(
  244. ARGS.result_dir + "/data{" +
  245. ','.join(["{}".format(i) for i in range(cpu_count())]) + "}"))
  246. if ARGS.assemble:
  247. assemble_directories(ARGS.result_dir)
  248. if ARGS.missing_commits:
  249. check_for_missing_commits(ARGS.repo_dir, ARGS.result_dir)