123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114 |
- """ Generate train and test set. """
- __author__ = "Kristian Berg"
- __copyright__ = "Copyright (c) 2018 Axis Communications AB"
- __license__ = "MIT"
- import subprocess
- import re
- import json
- from datetime import datetime, timedelta
def build_sets(path, sgap=timedelta(days=200), gap=timedelta(days=150),
               egap=timedelta(days=150), update=timedelta(days=400),
               testdur=timedelta(days=70), traindur=timedelta(days=2000)):
    """Split a repository's commits into rolling train and test sets.

    The commits listed by ``git rev-list --reverse HEAD`` are walked in
    order.  For every split point a training set and a test set of commit
    hashes are collected; the lists of all sets are written as JSON to
    ``trainsets.json`` and ``testsets.json`` in the current working
    directory.  Progress information is printed to stdout.

    The training list is never reset between split points, so each training
    set contains all commits of the previous one plus the newly covered ones.

    Args:
        path: Directory of the git repository (used as ``cwd`` for git).
        sgap: Offset added to the date of the first commit; commits dated
            before the resulting start date are skipped.
        gap: Time between the end of a training set and the start of the
            corresponding test set.
        egap: Offset subtracted from the date of the latest commit; no test
            window may extend past the resulting end date.
        update: Step by which the split point moves between iterations.
            Must be positive, otherwise the main loop never terminates.
        testdur: Length of each test window.
        traindur: Distance between the start date and the first split point.
    """
    command = ['git', 'log', '--reverse', '--date=iso']
    startdate = datetime_of_commit(path, command=command)

    # Only the newest commit is needed here, so limit the log to one entry
    # instead of streaming the whole history.  (The same trick cannot be used
    # for the oldest commit: git applies the limit before --reverse.)
    command = ['git', 'log', '-1', '--date=iso']
    enddate = datetime_of_commit(path, command=command)

    startdate += sgap
    enddate -= egap

    print('Start: ' + str(startdate))
    print('End: ' + str(enddate))
    print('Duration: ' + str(enddate - startdate))
    print('len(training) len(testing)')

    command = ['git', 'rev-list', '--reverse', 'HEAD']
    res = subprocess.run(command, cwd=path, stdout=subprocess.PIPE)
    hashes = res.stdout.decode('utf-8').split()

    # Every date lookup spawns a git process and the windows below revisit
    # the same commits several times, so remember each date once fetched.
    known_dates = {}

    def commit_date(index):
        """Return the (memoized) date of the commit at ``hashes[index]``."""
        if index not in known_dates:
            known_dates[index] = datetime_of_commit(path, hash=hashes[index])
        return known_dates[index]

    def advance(index, limit, collected=None):
        """Move ``index`` forward while commits are dated before ``limit``.

        Hashes that are passed over are appended to ``collected`` when it is
        given.  The scan stops at the end of ``hashes`` instead of running
        past it.
        """
        while index < len(hashes) and commit_date(index) < limit:
            if collected is not None:
                collected.append(hashes[index])
            index += 1
        return index

    trainsets = []
    testsets = []
    training = []
    tsplit = startdate + traindur

    # Skip everything that lies before the start date.
    train_index = advance(0, startdate)

    while tsplit + gap + testdur < enddate:
        # The test window starts ``gap`` after the split point; locate its
        # first commit before the training index is moved.
        test_index = advance(train_index, tsplit + gap)

        # Extend the cumulative training set up to the split point.
        train_index = advance(train_index, tsplit, training)
        trainsets.append(list(training))

        testing = []
        advance(test_index, tsplit + gap + testdur, testing)
        testsets.append(testing)

        print(str(len(training)) + ' ' + str(len(testing)))

        tsplit += update

    with open('trainsets.json', 'w') as f:
        f.write(json.dumps(trainsets))
    with open('testsets.json', 'w') as f:
        f.write(json.dumps(testsets))
def datetime_of_commit(path, hash=None, command=None):
    """Return the date of a commit as printed by git.

    If ``hash`` is given, ``git show --quiet --date=iso`` is run for it.
    Otherwise ``command`` is executed as-is.  In both cases the first
    ``Date:`` header line found in the output is parsed.

    Args:
        path: Directory in which git is executed (passed as ``cwd``).
        hash: Commit identifier to inspect; takes precedence over
            ``command`` when both are given.
        command: Complete argument list of a git invocation.  Its first
            element must be ``'git'`` and it must contain ``'--date=iso'``.

    Returns:
        A timezone-aware ``datetime`` parsed with the format
        ``'%Y-%m-%d %H:%M:%S %z'``.

    Raises:
        ValueError: If neither ``hash`` nor ``command`` is given, if
            ``command`` is not an acceptable git invocation, or if the
            output of git contains no ``Date:`` header (for example because
            the hash is unknown or ``path`` is not a repository).
    """
    if hash:
        command = ['git', 'show', '--quiet', '--date=iso', hash]
    elif command:
        if command[0] != 'git':
            raise ValueError('Not a git command')
        if '--date=iso' not in command:
            raise ValueError('Command needs to specify --date=iso')
    else:
        raise ValueError('Either hash or command parameter is needed')

    res = subprocess.run(command, cwd=path, stdout=subprocess.PIPE)
    gitlog = res.stdout.decode('utf-8', errors='ignore')

    # Git pads the header ("Date:   2018-..."), so accept any run of blanks
    # after the colon and capture only the value; strptime does not tolerate
    # surrounding whitespace.
    match = re.search(r'\nDate:[ \t]+([0-9+: -]+)\n', gitlog)
    if match is None:
        raise ValueError('No commit date found in output of: '
                         + ' '.join(command))
    return datetime.strptime(match.group(1).strip(), '%Y-%m-%d %H:%M:%S %z')
if __name__ == '__main__':
    import sys

    # The repository to analyse may be passed as the first command-line
    # argument; without one the previously hard-coded path is used.
    build_sets(sys.argv[1] if len(sys.argv) > 1
               else '/home/kristiab/Git/jenkins')
|