|
@@ -0,0 +1,114 @@
|
|
|
+""" Generate train and test set. """
|
|
|
+__author__ = "Kristian Berg"
|
|
|
+__copyright__ = "Copyright (c) 2018 Axis Communications AB"
|
|
|
+__license__ = "MIT"
|
|
|
+
|
|
|
+import subprocess
|
|
|
+import re
|
|
|
+import json
|
|
|
+from datetime import datetime, timedelta
|
|
|
+
|
|
|
# TODO: give update parameter as fraction
def build_sets(path, sgap=timedelta(days=200), gap=timedelta(days=150),
               egap=timedelta(days=150), update=timedelta(days=400),
               testdur=timedelta(days=70), traindur=timedelta(days=2000)):
    """Build rolling train/test sets of commit hashes for a git repository.

    The usable period runs from the oldest commit date plus ``sgap`` to the
    newest commit date minus ``egap``. A split point starts ``traindur``
    after the beginning of that period and moves forward by ``update`` for
    as long as ``split + gap + testdur`` lies before the end of the period.
    For every split point:

    * the training set is extended with the commits (scanned oldest to
      newest from where the previous scan stopped) dated before the split,
      so every training set also contains all earlier training sets;
    * the test set holds the commits found by skipping forward to the
      first commit dated at or after ``split + gap`` and then collecting
      while commits are dated before ``split + gap + testdur``.

    Progress is printed to stdout and the resulting lists of lists are
    written as JSON to ``trainsets.json`` and ``testsets.json`` in the
    current working directory.

    Args:
        path: Directory of the git repository to inspect.
        sgap: Time skipped after the oldest commit.
        gap: Time between the end of training data and start of test data.
        egap: Time skipped before the newest commit.
        update: Step by which the split point advances; must be positive.
        testdur: Length of each test window.
        traindur: Length of the first training window.

    Raises:
        ValueError: If ``update`` is not a positive duration (the split
            point would never advance and the loop would not terminate).
    """
    if update <= timedelta(0):
        raise ValueError('update must be a positive duration')

    # Determine dates of the oldest and newest commit, then apply the gaps
    startdate = datetime_of_commit(
        path, command=['git', 'log', '--reverse', '--date=iso']) + sgap
    enddate = datetime_of_commit(
        path, command=['git', 'log', '--date=iso']) - egap

    print('Start: ' + str(startdate))
    print('End: ' + str(enddate))
    print('Duration: ' + str(enddate - startdate))
    print('len(training) len(testing)')

    # Build list of commit hashes from oldest to newest
    res = subprocess.run(['git', 'rev-list', '--reverse', 'HEAD'],
                         cwd=path, stdout=subprocess.PIPE)
    hashes = res.stdout.decode('utf-8').split()

    # Every date lookup spawns a git process and the same commits are
    # probed repeatedly across windows, so remember what was already read.
    dates = {}

    def date_at(index):
        if index not in dates:
            dates[index] = datetime_of_commit(path, hash=hashes[index])
        return dates[index]

    def advance(index, limit, collected=None):
        # Step forward while commits are dated before `limit`, optionally
        # collecting the hashes passed over. Stops at the end of the list
        # instead of indexing past it.
        while index < len(hashes) and date_at(index) < limit:
            if collected is not None:
                collected.append(hashes[index])
            index += 1
        return index

    trainsets = []
    testsets = []
    training = []

    # Adjust start index to correspond to start date
    train_index = advance(0, startdate)
    tsplit = startdate + traindur

    # TODO: Last few commits are not used
    while tsplit + gap + testdur < enddate:
        # The test scan starts from the training position *before* the
        # training set is extended for this round.
        test_index = advance(train_index, tsplit + gap)

        # Extend the cumulative training set up to the split point
        train_index = advance(train_index, tsplit, collected=training)
        trainsets.append(list(training))

        # Build a fresh test set for this window
        testing = []
        advance(test_index, tsplit + gap + testdur, collected=testing)
        testsets.append(testing)

        print(str(len(training)) + ' ' + str(len(testing)))

        tsplit += update

    # Write results to file
    with open('trainsets.json', 'w') as f:
        json.dump(trainsets, f)
    with open('testsets.json', 'w') as f:
        json.dump(testsets, f)
|
|
|
+
|
|
|
def datetime_of_commit(path, hash=None, command=None):
    """Return the date of a commit as a timezone-aware datetime.

    Either inspects one specific commit given its ``hash`` (via
    ``git show``), or runs the given git ``command`` and uses the first
    ``Date:`` line found in its output.

    Args:
        path: Directory in which the git command is executed.
        hash: Commit to look up. Takes precedence over ``command`` when
            truthy. (The name shadows the builtin but is kept because
            callers pass it by keyword.)
        command: Argument list of a git command. Its first element must be
            ``'git'`` and it must contain ``'--date=iso'``.

    Returns:
        The parsed date from the first ``Date:`` line of the git output.

    Raises:
        ValueError: If neither ``hash`` nor ``command`` is given, if
            ``command`` is not an acceptable git command, or if the git
            output contains no ``Date:`` line (for example an unknown
            hash or a path that is not a repository).
    """
    if hash:
        command = ['git', 'show', '--quiet', '--date=iso', hash]
    elif command:
        if command[0] != 'git':
            raise ValueError('Not a git command')
        if '--date=iso' not in command:
            raise ValueError('Command needs to specify --date=iso')
    else:
        raise ValueError('Either hash or command parameter is needed')

    # Get date of commit
    res = subprocess.run(command, cwd=path, stdout=subprocess.PIPE)
    gitlog = res.stdout.decode('utf-8', errors='ignore')

    # Capture only the timestamp itself so the amount of padding git puts
    # after "Date:" does not leak into the value handed to strptime.
    found = re.search(r'^Date:[ \t]*([0-9][0-9\-+: ]*?)[ \t\r]*$',
                      gitlog, re.MULTILINE)
    if found is None:
        raise ValueError('No commit date found in output of: '
                         + ' '.join(command))
    return datetime.strptime(found.group(1), '%Y-%m-%d %H:%M:%S %z')
|
|
|
+
|
|
|
if __name__ == '__main__':
    import sys

    # The repository to analyse may be given as the first command-line
    # argument; without one, fall back to the original default checkout.
    build_sets(sys.argv[1] if len(sys.argv) > 1
               else '/home/kristiab/Git/jenkins')
|