Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account-related emails.

Already on GitHub? Sign in to your account

Fixed: Typos, PEP8, with statement #1

Open
wants to merge 1 commit into
base: master
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
9 changes: 5 additions & 4 deletions octograb/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,11 +13,12 @@ def configure(config_name):
config = utils.load_json(config_name)

logging.basicConfig(
filename = config['log_file'],
format = '[%(asctime)s:%(levelname)s:%(name)s] %(message)s',
level = logging.DEBUG
filename=config['log_file'],
format='[%(asctime)s:%(levelname)s:%(name)s] %(message)s',
level=logging.DEBUG
)

utils.make_dir(config['input_dir'])
utils.make_dir(config['cache_dir'])
utils.make_dir(config['cache_dir']+'/'+config['preselection']['archives_dir'])
utils.make_dir(config['cache_dir'] + '/' +
config['preselection']['archives_dir'])
30 changes: 19 additions & 11 deletions octograb/models.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@

__all__ = ['ArchiveDataset']


class ArchiveDataset(dict):
def __init__(self):
self.__set = {}
Expand All @@ -10,25 +11,32 @@ def update(self, name, stars=0, commits=0, forks=0, pulls=0):
# if repository isn't registered yet
if name not in self.__set:
self.__set[name] = {
'stars' : stars,
'commits' : commits,
'forks' : forks,
'pulls' : pulls
'stars': stars,
'commits': commits,
'forks': forks,
'pulls': pulls
}
return

# if repository is already registered
if stars : self.__set[name]['stars'] += stars
if commits : self.__set[name]['commits'] += commits
if forks : self.__set[name]['forks'] += forks
if pulls : self.__set[name]['pulls'] += pulls
if stars:
self.__set[name]['stars'] += stars
if commits:
self.__set[name]['commits'] += commits
if forks:
self.__set[name]['forks'] += forks
if pulls:
self.__set[name]['pulls'] += pulls

def export(self):
# convert the dict into a list, so we can sort it
d = self.__set
repos = [(k, v['stars'], v['forks'], v['commits']) for k, v in d.iteritems()]
repos = [
(k, v['stars'], v['forks'], v['commits'])
for k, v in d.iteritems()
]

# sorting by stars, forks and commits, in this order
repos.sort(key=operator.itemgetter(1, 2, 3), reverse=True)
return repos

return repos
2 changes: 1 addition & 1 deletion octograb/preselection/__init__.py
Original file line number Diff line number Diff line change
@@ -1,3 +1,3 @@
from octograb.preselection.convert_archives import *
from octograb.preselection.merge_archives import *
from octograb.preselection.export_inputs import *
from octograb.preselection.export_inputs import *
55 changes: 34 additions & 21 deletions octograb/preselection/convert_archives.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,10 @@

logger = octograb.utils.get_logger('PRESELECTION:CONVERT')


# MAIN ========================================================================


def convert_archives():
logger.info('Initializing CONVERT_ARCHIVES.')

Expand All @@ -31,18 +34,21 @@ def convert_archives():
_save_state(cur_date)

logger.info('CONVERT_ARCHIVES finished.')


# =============================================================================

# HELPERS =====================================================================


def _load_state():
state_name = octograb.config['cache_dir'] + '/' + 'preselection.cache'
date = None

logger.info('Trying do load state...')
if os.path.isfile(state_name):
f = open(state_name, 'r')
date = cPickle.load(f)
f.close()
with open(state_name, 'r') as f:
date = cPickle.load(f)
logger.info('... state loaded successfully.')

else:
Expand All @@ -56,6 +62,7 @@ def _load_state():

return date


def _save_state(date):
state_name = octograb.config['cache_dir'] + '/' + 'preselection.cache'

Expand All @@ -64,6 +71,7 @@ def _save_state(date):
octograb.utils.safe_save(data, state_name)
logger.info('... state saved.')


def _max_date():
return datetime.datetime(
octograb.config['preselection']['to_year'],
Expand All @@ -72,9 +80,10 @@ def _max_date():
0
)


def _process_day(date):
_name = date.strftime('%Y-%m-%d')
logger.info('Converting data for archive %s...'%_name)
logger.info('Converting data for archive %s...' % _name)

dataset = octograb.models.ArchiveDataset()
step = datetime.timedelta(hours=1)
Expand All @@ -90,12 +99,12 @@ def _process_day(date):

# get all urls
_base = _c['preselection']['archives_url'] + '/'
archive_urls = [_base+n for n in archive_names]
archive_urls = [_base+n for n in archive_names]

# download them all
for url, path, name in zip(archive_urls, archive_paths, archive_names):
logger.info('Downloading "%s"...'%name)
logger.info('Downloading "%s"...' % name)

# repeat until download is completed
_download_complete = False
while not _download_complete:
Expand All @@ -105,19 +114,20 @@ def _process_day(date):

# handle connection error
except (socket.error, requests.exceptions.ConnectionError) as e:
logger.error('Connection error, trying again after 15 seconds.')
logger.error('Connection error, trying again after 15'
' seconds.')
time.sleep(15)

logger.info('... download completed.')

# open them all
for path, name in zip(archive_paths, archive_names):
logger.info('Processing "%s"...'%name)
logger.info('Processing "%s"...' % name)
_process_file(path, dataset)
logger.info('... "%s" processed'%name)
logger.info('... "%s" processed' % name)

# export dataset
logger.info('Exporting %s...'%_name)
logger.info('Exporting %s...' % _name)
data = dataset.export()
s = octograb.utils.archive_to_csv(data)
octograb.utils.safe_save(s, _path+_name+'.csv', no_bkp=True)
Expand All @@ -130,26 +140,27 @@ def _process_day(date):

# ignore if cant remove file
except WindowsError as e:
logger.error('Could not remove "%s": "%s"'%(path, e.message))
logger.error('Could not remove "%s": "%s"' % (path, e.message))

logger.info('... archive %s converted.' % _name)

logger.info('... archive %s converted.'%_name)

def _process_file(path, dataset):
f = gzip.open(path)
for line in f:
_process_event(line, dataset)
f.close()
with gzip.open(path) as f:
for line in f:
_process_event(line, dataset)


def _process_event(line, dataset):
# don't stop processing
try:
data = json.loads(line)
data = json.loads(line)
except ValueError as e:
logger.error(e.message)
return

type_ = data['type']
name = data['repo']['name']
name = data['repo']['name']

if type_ == 'WatchEvent':
dataset.update(name, stars=1)
Expand All @@ -163,4 +174,6 @@ def _process_event(line, dataset):

elif type_ == 'PullRequestEvent':
dataset.update(name, pulls=1)


# =============================================================================
31 changes: 15 additions & 16 deletions octograb/preselection/export_inputs.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,7 @@

logger = octograb.utils.get_logger('PRESELECTION:EXPORT')


def export_inputs():
logger.info('Initializing EXPORT_INPUTS.')
_c = octograb.config
Expand All @@ -19,15 +20,14 @@ def export_inputs():
# read all repositories
logger.debug('Loading dataset with pandas.')
repos = pandas.read_csv(archives_name)
logger.debug('%d repositories loaded.'%len(repos))
logger.debug('%d repositories loaded.' % len(repos))

# selecting data
logger.debug('Selecting dataset.')
repos = repos[(repos['stars']>=min_stars) & \
(repos['forks']>=min_forks) & \
(repos['commits']>=min_commits)]
logger.debug('%d resulting repositories.'%len(repos))

repos = repos[(repos['stars'] >= min_stars) &
(repos['forks'] >= min_forks) &
(repos['commits'] >= min_commits)]
logger.debug('%d resulting repositories.' % len(repos))

# split repositories in batches
step = _c['input_per_file']
Expand All @@ -36,27 +36,26 @@ def export_inputs():
i = 0

_files = N/step + 1
logger.info('Splitting %d repositories into %d archives.'%(N, _files))
logger.info('Splitting %d repositories into %d archives.' % (N, _files))
while n < N:
name = input_name_t%i
name = input_name_t % i
items = repos[n: n+step]
# lines = '\n'.join(items)

logger.debug('Processing file %d of %d.'%(i, _files))
logger.info('Saving input "%s"...'%name)
logger.debug('Processing file %d of %d.' % (i, _files))
logger.info('Saving input "%s"...' % name)
with codecs.open(name, 'w', 'utf-8') as f:
f.write('stars,forks,commits,name\n')
for _, item in items.iterrows():
f.write('%05d,%05d,%07d,"%s"\n'%(item[0],
item[1],
item[2],
item[3]))
f.write('%05d,%05d,%07d,"%s"\n' % (item[0],
item[1],
item[2],
item[3]))
logger.info('... input saved.')

n += step
i += 1

logger.debug('%d repositories processed.'%n)
logger.debug('%d repositories processed.' % n)

logger.info('EXPORT_INPUTS finished.')

20 changes: 12 additions & 8 deletions octograb/preselection/merge_archives.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,15 +9,17 @@


# MAIN ========================================================================


def merge_archives():
logger.info('Initializing MERGE_ARCHIVES.')

dataset = octograb.models.ArchiveDataset()

_c = octograb.config
pattern = _c['cache_dir']+'/'+_c['preselection']['archives_dir']+'/*.csv'
for path in glob.iglob(pattern):
logger.info('Processing "%s".'%path)
logger.info('Processing "%s".' % path)
with codecs.open(path, 'r', 'utf-8') as f:
for i, line in enumerate(f):
try:
Expand All @@ -27,12 +29,12 @@ def merge_archives():
commits = int(items[2])
name = items[3].strip('\n\r"')
except Exception as e:
logger.error('Error in line %d of file %s'%(i, path))
logger.error(' erro message: "%s"'%e.message)
logger.error('Error in line %d of file %s' % (i, path))
logger.error(' erro message: "%s"' % e.message)

dataset.update(name, stars=stars, commits=commits, forks=forks)
logger.info('... "%s" processed.'%path)
logger.info('... "%s" processed.' % path)

logger.info('Saving final dataset "archives.csv"...')
name = _c['cache_dir']+'/archives.csv'

Expand All @@ -43,11 +45,13 @@ def merge_archives():
header = 'stars,forks,commits,name'
body = octograb.utils.archive_to_csv(data)
s = '\n'.join(header, body)

logger.debug('Saving.')
octograb.utils.safe_save(s, name, no_bkp=True)
logger.info('... dataset saved.')

logger.debug('Total repositories: %d.'%len(data))
logger.debug('Total repositories: %d.' % len(data))
logger.info('MERGE_ARCHIVES finished.')


# =============================================================================
Loading