reproduceResults.sh 9.4 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249
  1. #!/bin/bash
  2. ## How to Use
  3. # Takes 0 inputs, and outputs the data from our techniques.
  4. # Can also take optional flags to run subset of commands, see -h output.
  5. ## Fields to Maintain
  6. # git repos of projects we analyze
  7. # N.b: "webrender_bindings", the only Rust code without its own git repo,
  8. # is handled specially in the last step of this script.
  9. declare -A repos
  10. repos["firefox"]='https://github.com/mozilla/gecko-dev.git'
  11. repos["webrender"]='https://github.com/servo/webrender.git'
  12. repos["servo"]='https://github.com/jtracey/servo-mirror.git'
  13. repos["mp4parse-rust"]='https://github.com/mozilla/mp4parse-rust.git'
  14. #repos["qcms"]='https://github.com/FirefoxGraphics/qcms.git'
  15. repos["encoding_rs"]='https://github.com/hsivonen/encoding_rs.git'
  16. repos["mapped_hyph"]='https://github.com/jfkthame/mapped_hyph.git'
  17. #repos["chardetng"]='https://github.com/hsivonen/chardetng.git'
  18. #repos["shift_or_euc"]='https://github.com/hsivonen/shift_or_euc'
  19. repos["cubeb-coreaudio-rs"]='https://github.com/mozilla/cubeb-coreaudio-rs.git'
  20. # git hash of the most recent commit in the tree we consider
  21. declare -A commits
  22. commits["firefox"]=e5d3122984cea27576ad55b9898f2ec46529c5c9
  23. commits["webrender"]=cb2b55394892ef9ea1e89dbe41fd3a8cebd61468
  24. commits["servo"]=b1578947ef369a1810d1a83373f68bfd7fe23fe1
  25. commits["mp4parse-rust"]=4f70fc9ec2b43f17003c476dcc0ad1737ae100dc
  26. #commits["qcms"]=f2fdcde3912967fa06a5fff0957eebc7901c0645
  27. commits["encoding_rs"]=a962ef4f8e569ccf5a22104d19cc10e8a0b458e6
  28. commits["mapped_hyph"]=c7651a0cffff41996ad13c44f689bd9cd2192c01
  29. #commits["chardetng"]=143dadde20e283a46ef33ba960b517a3283a3d22
  30. commits["cubeb-coreaudio-rs"]=3ea3897147fa52ee3586b81d6d48315f0fba2777
  31. # whitespace-separated list of C++ projects to analyze
  32. cprojects="layers css stagefright qcms uconv hyphenation japanese-encoding cubeb-macos"
  33. # whitespace-separated list of Rust projects to analyze
  34. # (webrender_bindings is in gecko, stylo is in the servo git repo)
  35. # you would also need to update the following scripts/files:
  36. # /code/fetch_bugzilla_bugs/fetch_bugs.sh
  37. # /code/fetch_bugzilla_bugs/filter.sh
  38. # /data/hand-annotated/relevant-dirs.csv
  39. rustprojects="webrender webrender_bindings servo mp4parse-rust encoding_rs mapped_hyph cubeb-coreaudio-rs"
  40. ## Project Directory Structure
  41. # (data dirs are made prior to being populated)
  42. # directory this script is in
  43. scriptDir="$(cd -- "$( dirname -- "${BASH_SOURCE[0]}" )" &> /dev/null && pwd)"
  44. # root dir of this project
  45. rootDir="$(dirname $scriptDir)"
  46. # dir that stores all code, scripts, etc. (that we run, not analyze)
  47. codeDir="$rootDir/code/"
  48. # dir where we store all generated results and the few hand-annotated files
  49. dataDir="$rootDir/data/"
  50. # dir to clone repos into
  51. repoDir="$dataDir/repos/"
  52. # dir to store structured representation of issues
  53. issuesDir="$dataDir/issues/"
  54. # dir to store structured representation of fixes
  55. fixesDir="$dataDir/fixes/"
  56. # dir to store the output of SZZ
  57. resultsDir="$dataDir/szz-results/"
  58. # dir to store our best guess of which commits introduced which bugs
  59. bugIntroDir="$dataDir/introducers/"
  60. # dir to store the results of comparison of SZZ with available ground truth
  61. groundDir="$dataDir/ground-truth/"
  62. # dir where we store hand-annotated data
  63. # (i.e., data not generated by this script)
  64. annotatedDir="$dataDir/hand-annotated/"
  65. # dir where we store structured data about issues, their fixes, and inducers
  66. bugzillaDir="$dataDir/bugzilla-data/"
  67. # dir where we store the experience of each contributor for each project
  68. experienceDir="$dataDir/experiences/"
  69. # dir where we store the learning curve plots
  70. plotDir="$dataDir/learning-curve-plots/"
  # Abort on the first failing command from here on.
  set -e

  # Print the flag summary to stdout.
  usage() {
    echo "usage: $0 [flags]"
    echo " -h print this help and exit"
    echo " -a run all steps (this is the default if no args given)"
    echo " -c clone each project git repo into $repoDir, if it doesn't already exist"
    echo " -d download relevant issues"
    echo " -i identify fix commits"
    echo " -s run SZZ"
    echo " -t compare to ground truth"
    echo " -u update experience files"
    echo " -v create and visualize learning curves"
  }

  # parse_flags ARGS...
  # Translate the command-line flags into the global *Flag variables that the
  # steps below test against the string "true".
  #   - no arguments at all  -> allFlag=true
  #   - -h                   -> print usage and exit
  #   - unknown flag         -> warn on stderr and keep going
  parse_flags() {
    # OPTIND is reset locally so the function can be called more than once.
    local opt OPTARG OPTIND=1

    if (($# == 0)); then
      allFlag=true
    fi

    # The leading ':' selects getopts' silent mode. In the default mode bash
    # unsets OPTARG for an invalid option, so the warning below would always
    # print "unknown flag: -" without naming the offending flag.
    while getopts ':hacdistuv' opt; do
      case "$opt" in
        h)
          usage
          exit
          ;;
        a) allFlag=true ;;
        c) cloneFlag=true ;;
        d) downloadFlag=true ;;
        i) identifyFlag=true ;;
        s) szzFlag=true ;;
        t) truthFlag=true ;;
        u) experienceFlag=true ;;
        v) learningFlag=true ;;
        \?) echo "unknown flag: -$OPTARG" >&2 ;;
      esac
    done
  }

  parse_flags "$@"
  # version_at_least MAJOR MINOR REQ_MAJOR REQ_MINOR
  # Succeeds (status 0) when version MAJOR.MINOR is >= REQ_MAJOR.REQ_MINOR.
  # The 10# prefix forces base 10 so a component such as "08" is not parsed
  # as (invalid) octal.
  version_at_least() {
    (( 10#$1 > 10#$3 || ( 10#$1 == 10#$3 && 10#$2 >= 10#$4 ) ))
  }

  # Step -1: check dependencies (only if running everything)
  if [ "$allFlag" = true ] ; then
    missing=false
    # Required executables; report every missing one before aborting.
    for dependency in git jq python3 gradle java javac ; do
      if ! command -v "$dependency" > /dev/null 2>&1 ; then
        echo "Missing dependency: $dependency" >&2
        missing=true
      fi
    done
    # Required python modules; an import failure counts as missing.
    for pyMod in numpy scipy matplotlib.pyplot urllib3 ; do
      if ! python3 -c "import $pyMod" > /dev/null ; then
        echo "Missing python module: $pyMod" >&2
        missing=true
      fi
    done
    # dvipng texlive-latex-extra texlive-fonts-recommended cm-super
    if [ "$missing" = true ] ; then
      echo "Aborting due to missing dependencies." >&2
      exit 1
    fi

    # `git --version` prints e.g. "git version 2.39.2", so the numeric
    # components have to be extracted from the text instead of splitting the
    # whole line on '.' (which would give "git version 2" as the major part).
    git_version="$(git --version)"
    if [[ "$git_version" =~ ([0-9]+)\.([0-9]+) ]] ; then
      git_major="${BASH_REMATCH[1]}"
      git_minor="${BASH_REMATCH[2]}"
    else
      echo "Aborting: could not parse git version from '$git_version'" >&2
      exit 1
    fi
    # Compare major first, and minor only on an equal major, so that a future
    # 3.x release is not rejected for having a minor number below 37.
    if ! version_at_least "$git_major" "$git_minor" 2 37 ; then
      echo "Aborting: git needs --since-as-filter, added in 2.37" >&2
      exit 1
    fi
  fi
  # Step 0: make sure each project repo is available locally.
  # A repo that is not on disk yet is:
  # - cloned
  # - configured with uncapped rename limits
  # - checked out at its pinned commit
  # - given a generated .mailmap file
  for repo in "${!repos[@]}" ; do
    # Nothing to do unless cloning was requested (-c) or we run everything.
    if [[ "$allFlag" != true && "$cloneFlag" != true ]] ; then
      continue
    fi

    # An existing directory is never touched; only an explicit -c gets told.
    if [[ -d "$repoDir/$repo" ]] ; then
      if [[ "$cloneFlag" = true ]] ; then
        echo "You're trying to clone $repo, but it seems to already exist." >&2
        echo "Remove or move $repoDir/$repo if you really want to clone it again." >&2
        echo "Continuing as though this succeeded." >&2
      fi
      continue
    fi

    mkdir -p "$repoDir"
    git clone "${repos[$repo]}" "$repoDir/$repo"
    cd "$repoDir/$repo"
    git config diff.renameLimit 0
    git checkout "${commits[$repo]}"
    python3 "$codeDir/author-identities.py" . | sort > .mailmap
    cd "$rootDir"
  done
  # Step 1: get (filtered) issues
  if [[ "$allFlag" = true || "$downloadFlag" = true ]] ; then
    # Pull and structure the bug data we need, as available from Bugzilla.
    "$codeDir/fetch_bugzilla_bugs/fetch_bugs.sh" "$codeDir" "$issuesDir"
    # Apply additional filters based on conditions that are not visible in
    # the Bugzilla metadata.
    "$codeDir/fetch_bugzilla_bugs/filter.sh" \
      "$codeDir" "$issuesDir" "$repoDir/firefox"
  fi
  # Step 2: identify fix commits
  if [[ "$allFlag" = true || "$identifyFlag" = true ]] ; then
    mkdir -p "$fixesDir"
    # Run from the data dir: issue_list.json is picked up from the current
    # directory after each run and filed under the project's name.
    cd "$dataDir"
    for project in $cprojects ; do
      echo "getting $project fixes"
      python3 "$codeDir/fetch_bugzilla_bugs/find_bugzilla_fixes.py" \
        --git-path="$repoDir/firefox" \
        --issue-list="$issuesDir/$project-issues/"
      mv issue_list.json "$fixesDir/$project.json"
    done
    cd "$rootDir"
  fi
  # Step 3: run SZZ
  if [[ "$allFlag" = true || "$szzFlag" = true ]] ; then
    cd "$codeDir/szz"
    mkdir -p "$resultsDir"
    # Build the jar that is run for every project below.
    gradle fatJar
    for project in $cprojects ; do
      echo "running SZZ on $project"
      # Discard any previous output for this project before the new run.
      rm -rf "$resultsDir/$project"
      java -Xmx5g -jar ./build/libs/szz_find_bug_introducers-0.1.jar \
        -i "$fixesDir/$project.json" \
        -r "$repoDir/firefox" \
        -d 1
      # Keep the "results" dir under the project's name and drop the
      # leftover "issues" dir from the working directory.
      mv results "$resultsDir/$project"
      rm -r issues
    done
    cd "$rootDir"
  fi
  # Step 4: compare to ground truth
  if [[ "$allFlag" = true || "$truthFlag" = true ]] ; then
    #for project in $cprojects ; do
    # echo "pulling approval-reqs for $project"
    # "$codeDir"/fetch_bugzilla_bugs/fetch-all-approval-reqs.sh \
    # "$fixesDir/$project.json" "$repoDir/firefox" "$bugzillaDir" "$codeDir"
    #done
    cd "$resultsDir"
    mkdir -p "$groundDir"
    mkdir -p "$bugIntroDir"
    for project in $cprojects ; do
      echo "comparing results for $project"
      mkdir -p "$bugIntroDir/$project"
      # stdout of the comparison becomes the per-project ground-truth CSV.
      "$codeDir/compare_results-v2.sh" \
        "$project" \
        "$fixesDir/$project.json" \
        "$resultsDir/$project/fix_and_introducers_pairs.json" \
        "$annotatedDir/c++.csv" \
        "$bugIntroDir/$project/" \
        > "$groundDir/$project.csv"
    done
    cd "$rootDir"
  fi
  # Step 5: update/generate experience files
  if [[ "$allFlag" = true || "$experienceFlag" = true ]] ; then
    mkdir -p "$plotDir"
    for repo in "${!repos[@]}" ; do
      # Start every repo from an empty experience dir.
      rm -rf "$experienceDir/$repo"
      mkdir -p "$experienceDir/$repo"
      "$codeDir/learning_curves/genExp.sh" \
        "$repoDir/$repo" \
        "${commits[$repo]}" \
        "$experienceDir/$repo"
    done
  fi
  # Step 6: generate and plot learning curve
  if [[ "$allFlag" = true || "$learningFlag" = true ]] ; then
    # webrender_bindings has no git repo of its own, so its repo and
    # experience entries are recreated as symlinks to the firefox ones.
    rm -rf "$repoDir/webrender_bindings" "$experienceDir/webrender_bindings"
    ln -s "$repoDir/firefox" "$repoDir/webrender_bindings"
    ln -s "$experienceDir/firefox" "$experienceDir/webrender_bindings"

    echo "creating and plotting learning curves..."
    "$codeDir/learning_curves/grid_search.sh" \
      "$codeDir" \
      "$dataDir" \
      "$annotatedDir" \
      "$repoDir" \
      "$experienceDir" \
      "$plotDir" \
      "$rustprojects"
  fi