123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114 |
- """ Generate train and test set. """
- __author__ = "Kristian Berg"
- __copyright__ = "Copyright (c) 2018 Axis Communications AB"
- __license__ = "MIT"
- import subprocess
- import re
- import json
- from datetime import datetime, timedelta
# TODO: give update parameter as fraction
def build_sets(path, sgap=timedelta(days=200), gap=timedelta(days=150),
               egap=timedelta(days=150), update=timedelta(days=400),
               testdur=timedelta(days=70), traindur=timedelta(days=2000)):
    """Split a repository's commits into successive train and test sets.

    The commits of ``HEAD`` are walked from oldest to newest. A split date
    starts at ``startdate + traindur`` and moves forward by ``update`` on
    every round for as long as the test window still ends before the end
    date. For every round:

    * the training set holds every commit from the start date up to the
      split date (it is cumulative - commits of earlier rounds stay in it);
    * the test set holds the commits dated from ``split + gap`` up to
      ``split + gap + testdur``.

    The lists of hashes are written as JSON to ``trainsets.json`` and
    ``testsets.json`` in the current working directory, and the size of
    every pair is printed.

    Args:
        path: Directory the git commands are run in.
        sgap: Added to the date of the oldest commit to get the start date;
            older commits are skipped.
        gap: Time left out between the end of a training set and the start
            of its test set.
        egap: Subtracted from the date of the newest commit to get the end
            date.
        update: How far the split date moves between two rounds.
        testdur: Length of a test window.
        traindur: Length of the first training window.
    """
    # Determine date of oldest commit in repository
    startdate = datetime_of_commit(
        path, command=['git', 'log', '--reverse', '--date=iso'])
    # Determine date of newest commit in repository
    enddate = datetime_of_commit(path, command=['git', 'log', '--date=iso'])

    # Add start and end gaps
    startdate += sgap
    enddate -= egap

    print('Start: {}'.format(startdate))
    print('End: {}'.format(enddate))
    print('Duration: {}'.format(enddate - startdate))
    print('len(training) len(testing)')

    # Build list of commit hashes from oldest to newest
    res = subprocess.run(['git', 'rev-list', '--reverse', 'HEAD'],
                         cwd=path, stdout=subprocess.PIPE)
    hashes = res.stdout.decode('utf-8').split()
    total = len(hashes)

    # Every date lookup spawns a git process and the scans below revisit the
    # same commits on each round, so remember the dates already fetched.
    known_dates = {}

    def date_at(index):
        if index not in known_dates:
            known_dates[index] = datetime_of_commit(path, hash=hashes[index])
        return known_dates[index]

    def skip_until(index, limit):
        # Return the first position at or after `index` whose commit is not
        # dated before `limit`. Stops at the end of the list instead of
        # indexing past it when no such commit exists.
        while index < total and date_at(index) < limit:
            index += 1
        return index

    trainsets = []
    testsets = []
    training = []
    tsplit = startdate + traindur

    # Adjust start index to correspond to start date
    train_index = skip_until(0, startdate)

    # TODO: Last few commits are not used
    while tsplit + gap + testdur < enddate:
        # The test window is located starting from the commit where the
        # previous training set ended.
        test_start = skip_until(train_index, tsplit + gap)

        # Grow the training set up to the split date
        train_end = skip_until(train_index, tsplit)
        training.extend(hashes[train_index:train_end])
        train_index = train_end
        trainsets.append(list(training))

        # Build test set
        test_end = skip_until(test_start, tsplit + gap + testdur)
        testing = hashes[test_start:test_end]
        testsets.append(testing)

        print('{} {}'.format(len(training), len(testing)))

        # Loop update
        tsplit += update

    # Write results to file
    with open('trainsets.json', 'w') as f:
        json.dump(trainsets, f)
    with open('testsets.json', 'w') as f:
        json.dump(testsets, f)
# Header line carrying the date in git's --date=iso output, e.g.
# "Date:   2018-05-04 10:33:12 +0200". Any amount of padding after the colon
# is accepted and only the timestamp itself is captured.
_DATE_LINE = re.compile(
    r'^Date:[ \t]+(\d{4}-\d{2}-\d{2} \d{2}:\d{2}:\d{2} [+-]\d{4})[ \t]*$',
    re.MULTILINE)


def datetime_of_commit(path, hash=None, command=None):
    """Return the date of a commit as a timezone-aware datetime.

    Either the date of the commit identified by ``hash`` is returned, or the
    date of the first commit in the output of ``command``. When both are
    given, ``hash`` wins and ``command`` is ignored.

    Args:
        path: Directory the git command is run in.
        hash: Commit hash to look up. (The name shadows the builtin; it is
            kept because callers pass it by keyword.)
        command: Git command as an argument list. Its first element must be
            ``'git'`` and it must contain ``'--date=iso'``.

    Returns:
        The first ``Date:`` header found in the command output, parsed with
        its UTC offset.

    Raises:
        ValueError: If neither ``hash`` nor ``command`` is given, if
            ``command`` is not a git command or lacks ``--date=iso``, or if
            the output contains no ``Date:`` header line.
    """
    # Check that either hash or command parameter has a value
    if hash:
        command = ['git', 'show', '--quiet', '--date=iso', hash]
    elif command:
        if command[0] != 'git':
            raise ValueError('Not a git command')
        if '--date=iso' not in command:
            raise ValueError('Command needs to specify --date=iso')
    else:
        raise ValueError('Either hash or command parameter is needed')

    # Get date of commit
    res = subprocess.run(command, cwd=path, stdout=subprocess.PIPE)
    gitlog = res.stdout.decode('utf-8', errors='ignore')

    found = _DATE_LINE.search(gitlog)
    if found is None:
        # Happens when git itself failed (not a repository, unknown hash,
        # empty history) and therefore printed no commit header.
        raise ValueError(
            'No commit date found in output of: ' + ' '.join(command))
    return datetime.strptime(found.group(1), '%Y-%m-%d %H:%M:%S %z')
if __name__ == '__main__':
    import sys

    # The repository to analyse may be given as the first command-line
    # argument. Without one, the previously hard-coded location is used so
    # existing invocations behave as before.
    build_sets(sys.argv[1] if len(sys.argv) > 1
               else '/home/kristiab/Git/jenkins')
|