Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account-related emails.

Already on GitHub? Sign in to your account

Fixed: Typos, PEP8, with statement #1

Open
wants to merge 1 commit into
base: master
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
9 changes: 5 additions & 4 deletions octograb/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,11 +13,12 @@ def configure(config_name):
config = utils.load_json(config_name)

logging.basicConfig(
filename = config['log_file'],
format = '[%(asctime)s:%(levelname)s:%(name)s] %(message)s',
level = logging.DEBUG
filename=config['log_file'],
format='[%(asctime)s:%(levelname)s:%(name)s] %(message)s',
level=logging.DEBUG
)

utils.make_dir(config['input_dir'])
utils.make_dir(config['cache_dir'])
utils.make_dir(config['cache_dir']+'/'+config['preselection']['archives_dir'])
utils.make_dir(config['cache_dir'] + '/' +
config['preselection']['archives_dir'])
30 changes: 19 additions & 11 deletions octograb/models.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@

__all__ = ['ArchiveDataset']


class ArchiveDataset(dict):
def __init__(self):
self.__set = {}
Expand All @@ -10,25 +11,32 @@ def update(self, name, stars=0, commits=0, forks=0, pulls=0):
# if repository isn't registered yet
if name not in self.__set:
self.__set[name] = {
'stars' : stars,
'commits' : commits,
'forks' : forks,
'pulls' : pulls
'stars': stars,
'commits': commits,
'forks': forks,
'pulls': pulls
}
return

# if repository is already registered
if stars : self.__set[name]['stars'] += stars
if commits : self.__set[name]['commits'] += commits
if forks : self.__set[name]['forks'] += forks
if pulls : self.__set[name]['pulls'] += pulls
if stars:
self.__set[name]['stars'] += stars
if commits:
self.__set[name]['commits'] += commits
if forks:
self.__set[name]['forks'] += forks
if pulls:
self.__set[name]['pulls'] += pulls

def export(self):
# convert the dict into a list, so we can sort it
d = self.__set
repos = [(k, v['stars'], v['forks'], v['commits']) for k, v in d.iteritems()]
repos = [
(k, v['stars'], v['forks'], v['commits'])
for k, v in d.iteritems()
]

# sorting by stars, forks and commits, in this order
repos.sort(key=operator.itemgetter(1, 2, 3), reverse=True)
return repos

return repos
2 changes: 1 addition & 1 deletion octograb/preselection/__init__.py
Original file line number Diff line number Diff line change
@@ -1,3 +1,3 @@
from octograb.preselection.convert_archives import *
from octograb.preselection.merge_archives import *
from octograb.preselection.export_inputs import *
from octograb.preselection.export_inputs import *
55 changes: 34 additions & 21 deletions octograb/preselection/convert_archives.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,10 @@

logger = octograb.utils.get_logger('PRESELECTION:CONVERT')


# MAIN ========================================================================


def convert_archives():
logger.info('Initializing CONVERT_ARCHIVES.')

Expand All @@ -31,18 +34,21 @@ def convert_archives():
_save_state(cur_date)

logger.info('CONVERT_ARCHIVES finished.')


# =============================================================================

# HELPERS =====================================================================


def _load_state():
state_name = octograb.config['cache_dir'] + '/' + 'preselection.cache'
date = None

logger.info('Trying do load state...')
if os.path.isfile(state_name):
f = open(state_name, 'r')
date = cPickle.load(f)
f.close()
with open(state_name, 'r') as f:
date = cPickle.load(f)
logger.info('... state loaded successfully.')

else:
Expand All @@ -56,6 +62,7 @@ def _load_state():

return date


def _save_state(date):
state_name = octograb.config['cache_dir'] + '/' + 'preselection.cache'

Expand All @@ -64,6 +71,7 @@ def _save_state(date):
octograb.utils.safe_save(data, state_name)
logger.info('... state saved.')


def _max_date():
return datetime.datetime(
octograb.config['preselection']['to_year'],
Expand All @@ -72,9 +80,10 @@ def _max_date():
0
)


def _process_day(date):
_name = date.strftime('%Y-%m-%d')
logger.info('Converting data for archive %s...'%_name)
logger.info('Converting data for archive %s...' % _name)

dataset = octograb.models.ArchiveDataset()
step = datetime.timedelta(hours=1)
Expand All @@ -90,12 +99,12 @@ def _process_day(date):

# get all urls
_base = _c['preselection']['archives_url'] + '/'
archive_urls = [_base+n for n in archive_names]
archive_urls = [_base+n for n in archive_names]

# download them all
for url, path, name in zip(archive_urls, archive_paths, archive_names):
logger.info('Downloading "%s"...'%name)
logger.info('Downloading "%s"...' % name)

# repeat until download is completed
_download_complete = False
while not _download_complete:
Expand All @@ -105,19 +114,20 @@ def _process_day(date):

# handle connection error
except (socket.error, requests.exceptions.ConnectionError) as e:
logger.error('Connection error, trying again after 15 seconds.')
logger.error('Connection error, trying again after 15'
' seconds.')
time.sleep(15)

logger.info('... download completed.')

# open them all
for path, name in zip(archive_paths, archive_names):
logger.info('Processing "%s"...'%name)
logger.info('Processing "%s"...' % name)
_process_file(path, dataset)
logger.info('... "%s" processed'%name)
logger.info('... "%s" processed' % name)

# export dataset
logger.info('Exporting %s...'%_name)
logger.info('Exporting %s...' % _name)
data = dataset.export()
s = octograb.utils.archive_to_csv(data)
octograb.utils.safe_save(s, _path+_name+'.csv', no_bkp=True)
Expand All @@ -130,26 +140,27 @@ def _process_day(date):

# ignore if cant remove file
except WindowsError as e:
logger.error('Could not remove "%s": "%s"'%(path, e.message))
logger.error('Could not remove "%s": "%s"' % (path, e.message))

logger.info('... archive %s converted.' % _name)

logger.info('... archive %s converted.'%_name)

def _process_file(path, dataset):
f = gzip.open(path)
for line in f:
_process_event(line, dataset)
f.close()
with gzip.open(path) as f:
for line in f:
_process_event(line, dataset)


def _process_event(line, dataset):
# don't stop processing
try:
data = json.loads(line)
data = json.loads(line)
except ValueError as e:
logger.error(e.message)
return

type_ = data['type']
name = data['repo']['name']
name = data['repo']['name']

if type_ == 'WatchEvent':
dataset.update(name, stars=1)
Expand All @@ -163,4 +174,6 @@ def _process_event(line, dataset):

elif type_ == 'PullRequestEvent':
dataset.update(name, pulls=1)


# =============================================================================
31 changes: 15 additions & 16 deletions octograb/preselection/export_inputs.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,7 @@

logger = octograb.utils.get_logger('PRESELECTION:EXPORT')


def export_inputs():
logger.info('Initializing EXPORT_INPUTS.')
_c = octograb.config
Expand All @@ -19,15 +20,14 @@ def export_inputs():
# read all repositories
logger.debug('Loading dataset with pandas.')
repos = pandas.read_csv(archives_name)
logger.debug('%d repositories loaded.'%len(repos))
logger.debug('%d repositories loaded.' % len(repos))

# selecting data
logger.debug('Selecting dataset.')
repos = repos[(repos['stars']>=min_stars) & \
(repos['forks']>=min_forks) & \
(repos['commits']>=min_commits)]
logger.debug('%d resulting repositories.'%len(repos))

repos = repos[(repos['stars'] >= min_stars) &
(repos['forks'] >= min_forks) &
(repos['commits'] >= min_commits)]
logger.debug('%d resulting repositories.' % len(repos))

# split repositories in batches
step = _c['input_per_file']
Expand All @@ -36,27 +36,26 @@ def export_inputs():
i = 0

_files = N/step + 1
logger.info('Splitting %d repositories into %d archives.'%(N, _files))
logger.info('Splitting %d repositories into %d archives.' % (N, _files))
while n < N:
name = input_name_t%i
name = input_name_t % i
items = repos[n: n+step]
# lines = '\n'.join(items)

logger.debug('Processing file %d of %d.'%(i, _files))
logger.info('Saving input "%s"...'%name)
logger.debug('Processing file %d of %d.' % (i, _files))
logger.info('Saving input "%s"...' % name)
with codecs.open(name, 'w', 'utf-8') as f:
f.write('stars,forks,commits,name\n')
for _, item in items.iterrows():
f.write('%05d,%05d,%07d,"%s"\n'%(item[0],
item[1],
item[2],
item[3]))
f.write('%05d,%05d,%07d,"%s"\n' % (item[0],
item[1],
item[2],
item[3]))
logger.info('... input saved.')

n += step
i += 1

logger.debug('%d repositories processed.'%n)
logger.debug('%d repositories processed.' % n)

logger.info('EXPORT_INPUTS finished.')

20 changes: 12 additions & 8 deletions octograb/preselection/merge_archives.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,15 +9,17 @@


# MAIN ========================================================================


def merge_archives():
logger.info('Initializing MERGE_ARCHIVES.')

dataset = octograb.models.ArchiveDataset()

_c = octograb.config
pattern = _c['cache_dir']+'/'+_c['preselection']['archives_dir']+'/*.csv'
for path in glob.iglob(pattern):
logger.info('Processing "%s".'%path)
logger.info('Processing "%s".' % path)
with codecs.open(path, 'r', 'utf-8') as f:
for i, line in enumerate(f):
try:
Expand All @@ -27,12 +29,12 @@ def merge_archives():
commits = int(items[2])
name = items[3].strip('\n\r"')
except Exception as e:
logger.error('Error in line %d of file %s'%(i, path))
logger.error(' erro message: "%s"'%e.message)
logger.error('Error in line %d of file %s' % (i, path))
logger.error(' erro message: "%s"' % e.message)

dataset.update(name, stars=stars, commits=commits, forks=forks)
logger.info('... "%s" processed.'%path)
logger.info('... "%s" processed.' % path)

logger.info('Saving final dataset "archives.csv"...')
name = _c['cache_dir']+'/archives.csv'

Expand All @@ -43,11 +45,13 @@ def merge_archives():
header = 'stars,forks,commits,name'
body = octograb.utils.archive_to_csv(data)
s = '\n'.join(header, body)

logger.debug('Saving.')
octograb.utils.safe_save(s, name, no_bkp=True)
logger.info('... dataset saved.')

logger.debug('Total repositories: %d.'%len(data))
logger.debug('Total repositories: %d.' % len(data))
logger.info('MERGE_ARCHIVES finished.')


# =============================================================================
Loading