diff --git a/octograb/__init__.py b/octograb/__init__.py
index 8a2a8df..683d3da 100644
--- a/octograb/__init__.py
+++ b/octograb/__init__.py
@@ -13,11 +13,12 @@ def configure(config_name):
     config = utils.load_json(config_name)
 
     logging.basicConfig(
-        filename = config['log_file'],
-        format = '[%(asctime)s:%(levelname)s:%(name)s] %(message)s',
-        level = logging.DEBUG
+        filename=config['log_file'],
+        format='[%(asctime)s:%(levelname)s:%(name)s] %(message)s',
+        level=logging.DEBUG
     )
 
     utils.make_dir(config['input_dir'])
     utils.make_dir(config['cache_dir'])
-    utils.make_dir(config['cache_dir']+'/'+config['preselection']['archives_dir'])
\ No newline at end of file
+    utils.make_dir(config['cache_dir'] + '/' +
+                   config['preselection']['archives_dir'])
diff --git a/octograb/models.py b/octograb/models.py
index 0c93ff9..64957d2 100644
--- a/octograb/models.py
+++ b/octograb/models.py
@@ -2,6 +2,7 @@
 
 __all__ = ['ArchiveDataset']
 
+
 class ArchiveDataset(dict):
     def __init__(self):
         self.__set = {}
@@ -10,25 +11,32 @@ def update(self, name, stars=0, commits=0, forks=0, pulls=0):
         # if repository isn't registered yet
         if name not in self.__set:
             self.__set[name] = {
-                'stars' : stars,
-                'commits' : commits,
-                'forks' : forks,
-                'pulls' : pulls
+                'stars': stars,
+                'commits': commits,
+                'forks': forks,
+                'pulls': pulls
             }
             return
 
         # if repository is already registered
-        if stars : self.__set[name]['stars'] += stars
-        if commits : self.__set[name]['commits'] += commits
-        if forks : self.__set[name]['forks'] += forks
-        if pulls : self.__set[name]['pulls'] += pulls
+        if stars:
+            self.__set[name]['stars'] += stars
+        if commits:
+            self.__set[name]['commits'] += commits
+        if forks:
+            self.__set[name]['forks'] += forks
+        if pulls:
+            self.__set[name]['pulls'] += pulls
 
     def export(self):
         # convert the dict into a list, so we can sort it
         d = self.__set
-        repos = [(k, v['stars'], v['forks'], v['commits']) for k, v in d.iteritems()]
+        repos = [
+            (k, v['stars'], v['forks'], v['commits'])
+            for k, v in d.iteritems()
+        ]
 
         # sorting by stars, forks and commits, in this order
         repos.sort(key=operator.itemgetter(1, 2, 3), reverse=True)
-
-        return repos
\ No newline at end of file
+
+        return repos
diff --git a/octograb/preselection/__init__.py b/octograb/preselection/__init__.py
index cffdb9f..a92a754 100644
--- a/octograb/preselection/__init__.py
+++ b/octograb/preselection/__init__.py
@@ -1,3 +1,3 @@
 from octograb.preselection.convert_archives import *
 from octograb.preselection.merge_archives import *
-from octograb.preselection.export_inputs import *
\ No newline at end of file
+from octograb.preselection.export_inputs import *
diff --git a/octograb/preselection/convert_archives.py b/octograb/preselection/convert_archives.py
index 8f5bc68..d2a2c0e 100644
--- a/octograb/preselection/convert_archives.py
+++ b/octograb/preselection/convert_archives.py
@@ -14,7 +14,10 @@
 
 logger = octograb.utils.get_logger('PRESELECTION:CONVERT')
 
+
 # MAIN ========================================================================
+
+
 def convert_archives():
     logger.info('Initializing CONVERT_ARCHIVES.')
 
@@ -31,18 +34,21 @@ def convert_archives():
         _save_state(cur_date)
 
     logger.info('CONVERT_ARCHIVES finished.')
+
+
 # =============================================================================
 
 # HELPERS =====================================================================
+
+
 def _load_state():
     state_name = octograb.config['cache_dir'] + '/' + 'preselection.cache'
     date = None
-    
-    logger.info('Trying do load state...')
+
+    logger.info('Trying to load state...')
     if os.path.isfile(state_name):
-        f = open(state_name, 'r')
-        date = cPickle.load(f)
-        f.close()
+        with open(state_name, 'r') as f:
+            date = cPickle.load(f)
 
         logger.info('... state loaded successfully.')
     else:
@@ -56,6 +62,7 @@ def _load_state():
 
     return date
 
+
 def _save_state(date):
     state_name = octograb.config['cache_dir'] + '/' + 'preselection.cache'
 
@@ -64,6 +71,7 @@ def _save_state(date):
     octograb.utils.safe_save(data, state_name)
     logger.info('... state saved.')
 
+
 def _max_date():
     return datetime.datetime(
         octograb.config['preselection']['to_year'],
@@ -72,9 +80,10 @@ def _max_date():
         0
     )
 
+
 def _process_day(date):
     _name = date.strftime('%Y-%m-%d')
-    logger.info('Converting data for archive %s...'%_name)
+    logger.info('Converting data for archive %s...' % _name)
 
     dataset = octograb.models.ArchiveDataset()
     step = datetime.timedelta(hours=1)
@@ -90,12 +99,12 @@ def _process_day(date):
 
     # get all urls
     _base = _c['preselection']['archives_url'] + '/'
-    archive_urls = [_base+n for n in archive_names] 
-    
+    archive_urls = [_base+n for n in archive_names]
+
     # download them all
     for url, path, name in zip(archive_urls, archive_paths, archive_names):
-        logger.info('Downloading "%s"...'%name)
-        
+        logger.info('Downloading "%s"...' % name)
+
         # repeat until download is completed
         _download_complete = False
         while not _download_complete:
@@ -105,19 +114,20 @@ def _process_day(date):
 
             # handle connection error
             except (socket.error, requests.exceptions.ConnectionError) as e:
-                logger.error('Connection error, trying again after 15 seconds.')
+                logger.error('Connection error, trying again after 15'
+                             ' seconds.')
                 time.sleep(15)
 
         logger.info('... download completed.')
 
     # open them all
     for path, name in zip(archive_paths, archive_names):
-        logger.info('Processing "%s"...'%name)
+        logger.info('Processing "%s"...' % name)
         _process_file(path, dataset)
-        logger.info('... "%s" processed'%name)
+        logger.info('... "%s" processed' % name)
 
     # export dataset
-    logger.info('Exporting %s...'%_name)
+    logger.info('Exporting %s...' % _name)
     data = dataset.export()
     s = octograb.utils.archive_to_csv(data)
     octograb.utils.safe_save(s, _path+_name+'.csv', no_bkp=True)
@@ -130,26 +140,27 @@ def _process_day(date):
 
         # ignore if cant remove file
        except WindowsError as e:
-            logger.error('Could not remove "%s": "%s"'%(path, e.message))
+            logger.error('Could not remove "%s": "%s"' % (path, e.message))
+
+    logger.info('... archive %s converted.' % _name)
 
-    logger.info('... archive %s converted.'%_name)
 
 def _process_file(path, dataset):
-    f = gzip.open(path)
-    for line in f:
-        _process_event(line, dataset)
-    f.close()
+    with gzip.open(path) as f:
+        for line in f:
+            _process_event(line, dataset)
+
 
 def _process_event(line, dataset):
     # don't stop processing
     try:
-        data  = json.loads(line)
+        data = json.loads(line)
     except ValueError as e:
         logger.error(e.message)
         return
 
     type_ = data['type']
-    name  = data['repo']['name']
+    name = data['repo']['name']
 
     if type_ == 'WatchEvent':
         dataset.update(name, stars=1)
@@ -163,4 +174,6 @@ def _process_event(line, dataset):
 
     elif type_ == 'PullRequestEvent':
         dataset.update(name, pulls=1)
+
+
 # =============================================================================
diff --git a/octograb/preselection/export_inputs.py b/octograb/preselection/export_inputs.py
index 68e511a..6149cc8 100644
--- a/octograb/preselection/export_inputs.py
+++ b/octograb/preselection/export_inputs.py
@@ -6,6 +6,7 @@
 
 logger = octograb.utils.get_logger('PRESELECTION:EXPORT')
 
+
 def export_inputs():
     logger.info('Initializing EXPORT_INPUTS.')
     _c = octograb.config
@@ -19,15 +20,14 @@ def export_inputs():
     # read all repositories
     logger.debug('Loading dataset with pandas.')
     repos = pandas.read_csv(archives_name)
-    logger.debug('%d repositories loaded.'%len(repos))
+    logger.debug('%d repositories loaded.' % len(repos))
 
     # selecting data
     logger.debug('Selecting dataset.')
-    repos = repos[(repos['stars']>=min_stars) & \
-                  (repos['forks']>=min_forks) & \
-                  (repos['commits']>=min_commits)]
-    logger.debug('%d resulting repositories.'%len(repos))
-    
+    repos = repos[(repos['stars'] >= min_stars) &
+                  (repos['forks'] >= min_forks) &
+                  (repos['commits'] >= min_commits)]
+    logger.debug('%d resulting repositories.' % len(repos))
 
     # split repositories in batches
     step = _c['input_per_file']
@@ -36,27 +36,26 @@ def export_inputs():
     i = 0
     _files = N/step + 1
-    logger.info('Splitting %d repositories into %d archives.'%(N, _files))
+    logger.info('Splitting %d repositories into %d archives.' % (N, _files))
 
     while n < N:
-        name = input_name_t%i
+        name = input_name_t % i
         items = repos[n: n+step]
         # lines = '\n'.join(items)
 
-        logger.debug('Processing file %d of %d.'%(i, _files))
-        logger.info('Saving input "%s"...'%name)
+        logger.debug('Processing file %d of %d.' % (i, _files))
+        logger.info('Saving input "%s"...' % name)
         with codecs.open(name, 'w', 'utf-8') as f:
             f.write('stars,forks,commits,name\n')
             for _, item in items.iterrows():
-                f.write('%05d,%05d,%07d,"%s"\n'%(item[0],
-                                                 item[1],
-                                                 item[2],
-                                                 item[3]))
+                f.write('%05d,%05d,%07d,"%s"\n' % (item[0],
+                                                   item[1],
+                                                   item[2],
+                                                   item[3]))
         logger.info('... input saved.')
 
         n += step
         i += 1
 
-    logger.debug('%d repositories processed.'%n)
+    logger.debug('%d repositories processed.' % n)
 
     logger.info('EXPORT_INPUTS finished.')
-
diff --git a/octograb/preselection/merge_archives.py b/octograb/preselection/merge_archives.py
index 01427f7..ecba1d3 100644
--- a/octograb/preselection/merge_archives.py
+++ b/octograb/preselection/merge_archives.py
@@ -9,15 +9,17 @@
 
 # MAIN ========================================================================
+
+
 def merge_archives():
     logger.info('Initializing MERGE_ARCHIVES.')
 
     dataset = octograb.models.ArchiveDataset()
-    
+
     _c = octograb.config
     pattern = _c['cache_dir']+'/'+_c['preselection']['archives_dir']+'/*.csv'
 
     for path in glob.iglob(pattern):
-        logger.info('Processing "%s".'%path)
+        logger.info('Processing "%s".' % path)
         with codecs.open(path, 'r', 'utf-8') as f:
             for i, line in enumerate(f):
                 try:
@@ -27,12 +29,12 @@ def merge_archives():
                     commits = int(items[2])
                     name = items[3].strip('\n\r"')
                 except Exception as e:
-                    logger.error('Error in line %d of file %s'%(i, path))
-                    logger.error(' erro message: "%s"'%e.message)
+                    logger.error('Error in line %d of file %s' % (i, path))
+                    logger.error(' error message: "%s"' % e.message)
 
                 dataset.update(name, stars=stars, commits=commits, forks=forks)
-        logger.info('... "%s" processed.'%path)
-    
+        logger.info('... "%s" processed.' % path)
+
     logger.info('Saving final dataset "archives.csv"...')
 
     name = _c['cache_dir']+'/archives.csv'
@@ -43,11 +45,13 @@ def merge_archives():
     header = 'stars,forks,commits,name'
     body = octograb.utils.archive_to_csv(data)
-    s = '\n'.join(header, body)
-    
+    s = '\n'.join([header, body])
+
     logger.debug('Saving.')
     octograb.utils.safe_save(s, name, no_bkp=True)
     logger.info('... dataset saved.')
 
-    logger.debug('Total repositories: %d.'%len(data))
+    logger.debug('Total repositories: %d.' % len(data))
     logger.info('MERGE_ARCHIVES finished.')
+
+
 # =============================================================================
diff --git a/octograb/utils.py b/octograb/utils.py
index c194247..2382507 100644
--- a/octograb/utils.py
+++ b/octograb/utils.py
@@ -5,46 +5,53 @@
 import logging
 import codecs
 
-__all__ = ['get_archive_name', 'get_logger', 'safe_save', 'download_file', 
+__all__ = ['get_archive_name', 'get_logger', 'safe_save', 'download_file',
            'load_json', 'make_dir', 'archive_to_csv']
 
+
 def get_archive_name(date):
     '''Returns the archive name given a datetime object.'''
     return date.strftime('%Y-%m-%d-') + str(int(date.strftime('%H')))
 
+
 def archive_to_csv(data):
-    lines = ['%05d,%05d,%07d,"%s"'%(_[1], _[2], _[3], _[0]) for _ in data]
+    lines = ['%05d,%05d,%07d,"%s"' % (_[1], _[2], _[3], _[0]) for _ in data]
     return '\n'.join(lines)
 
+
 def safe_save(data, path, no_bkp=False):
     # save file
-    f = open(path+'-bkp', 'w')
-    f.write(data)
-    f.close()
+    with open(path+'-bkp', 'w') as f:
+        f.write(data)
 
     # erase previous if any
-    if os.path.isfile(path): os.remove(path)
+    if os.path.isfile(path):
+        os.remove(path)
     os.rename(path+'-bkp', path)
 
-    if os.path.isfile(path+'-bkp') and no_bkp: os.remove(path+'-bkp')
+    if os.path.isfile(path+'-bkp') and no_bkp:
+        os.remove(path+'-bkp')
+
 
 def download_file(url, path):
     r = requests.get(url, stream=True)
-    f = open(path, 'wb')
-    for chunk in r.iter_content(chunk_size=1024):
-        if chunk:
-            f.write(chunk)
-    f.close()
+    with open(path, 'wb') as f:
+        for chunk in r.iter_content(chunk_size=1024):
+            if chunk:
+                f.write(chunk)
+
 
 def get_logger(name):
     return logging.getLogger(name)
 
+
 def load_json(path):
     content = codecs.open(path, 'r', 'utf-8').read()
-    content = re.sub(re.compile("/\*.*?\*/", re.DOTALL) , "", content)
-    content = re.sub(re.compile("[\s|\n]//.*?\n"      ) , "", content)
+    content = re.sub(re.compile("/\*.*?\*/", re.DOTALL), "", content)
+    content = re.sub(re.compile("[\s|\n]//.*?\n"), "", content)
    return json.loads(content)
 
+
 def make_dir(path):
     if not os.path.isdir(path):
         os.mkdir(path)
diff --git a/preselection.py b/preselection.py
index ce542e8..5c5b1b7 100644
--- a/preselection.py
+++ b/preselection.py
@@ -9,5 +9,5 @@
 # Merge all CSV files and create a single CSV dataset
 octograb.preselection.merge_archives()
 
-# Grav the large single dataset and split them into small sets to be crawled
-octograb.preselection.export_inputs()
\ No newline at end of file
+# Grab the large single dataset and split it into small sets to be crawled
+octograb.preselection.export_inputs()
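
Usage note (not part of the patch): the three preselection stages touched above are driven exactly as in preselection.py, once octograb.configure() has loaded the JSON config. The sketch below is a minimal driver plus a config stub. The key names are inferred from the config[...] lookups visible in this diff (log_file, input_dir, cache_dir, input_per_file, preselection.archives_dir/archives_url/to_year); the values are placeholders, and the real config likely carries more keys, e.g. the min_stars/min_forks/min_commits thresholds and the input-name template read by export_inputs.

    # config.json -- comments are allowed, since load_json() strips
    # /* ... */ and // ... before parsing; all values below are placeholders
    # {
    #     "log_file": "octograb.log",
    #     "input_dir": "input",
    #     "cache_dir": "cache",
    #     "input_per_file": 1000,
    #     "preselection": {
    #         "archives_dir": "archives",
    #         "archives_url": "http://example.org/archives",
    #         "to_year": 2015
    #     }
    # }

    import octograb

    octograb.configure('config.json')         # set up logging + input/cache dirs
    octograb.preselection.convert_archives()  # download and convert daily dumps
    octograb.preselection.merge_archives()    # merge per-day CSVs into archives.csv
    octograb.preselection.export_inputs()     # split into crawler input batches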