-
Notifications
You must be signed in to change notification settings - Fork 200
/
Copy pathbest_practices.py
72 lines (61 loc) · 2.52 KB
/
best_practices.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
#!/usr/bin/python3
# Download JSON data from bestpractices.coreinfrastructure.org and analyze it.
# Since the JSON data is paged, we simply download each page in turn.
# This is written to be simple code; it loads the entire dataset into memory.
# It stores downloaded results in a file; erase the file to force the
# program to re-download the data.
# This example counts projects using the R programming language.
# Modify this example for whatever analysis you want to do.
# Copyright 2018, the Linux Foundation, IDA, and the
# CII Best Practices badge contributors
# SPDX-License-Identifier: MIT
import os, sys, re, json, time, urllib
from urllib.request import urlopen
# File for storing cached value
cached_filename = 'projects.json'
# This is the base URL for project data.
base_url = 'https://bestpractices.coreinfrastructure.org/' + cached_filename
# Retrieve paged JSON data
def retrieve_data():
    """Download every page of the project JSON data and return it as one list.

    Pages are requested in order, starting at page 1, by appending
    ``?page=N`` to ``base_url``. Retrieval stops at the first page whose
    decoded JSON is empty. Each requested page number is printed to stderr
    as a progress indicator.

    Returns:
        list: The decoded contents of all non-empty pages, concatenated in
        page order.

    Raises:
        urllib.error.URLError: If a page cannot be fetched.
        ValueError: If a page body is not valid JSON
            (``json.JSONDecodeError`` is a subclass of ``ValueError``).
    """
    retrieved_dataset = []
    page_number = 1
    while True:
        url = base_url + '?page=' + str(page_number)
        # Use a context manager so each HTTP response is closed as soon as
        # its body has been read, instead of leaking one per page.
        with urlopen(url) as response:
            page_data = json.loads(response.read())
        print(page_number, file=sys.stderr)
        # An empty page marks the end of the paged data.
        if not page_data:
            break
        retrieved_dataset += page_data
        page_number += 1
        time.sleep(1)  # Add delay to avoid hitting rate limits
    return retrieved_dataset
# Load JSON data (from a file if possible, else it's retrieved and saved)
def load_data():
    """Return the project dataset, preferring the on-disk cache.

    If the file named by ``cached_filename`` exists, its JSON contents are
    loaded and returned. Otherwise the data is fetched with
    ``retrieve_data()``, written to ``cached_filename`` as JSON so later
    runs can skip the download, and then returned. Progress messages are
    printed to stderr.

    Returns:
        The decoded JSON dataset (whatever ``json.load`` or
        ``retrieve_data()`` produced).
    """
    if os.path.exists(cached_filename):
        print('Loading existing file: %s' % cached_filename, file=sys.stderr)
        # Context manager guarantees the handle is closed after reading.
        with open(cached_filename, 'r', encoding='utf-8') as f:
            return json.load(f)

    print('Retrieving data', file=sys.stderr)
    retrieved_dataset = retrieve_data()
    # Save data into file so we don't need to retrieve later
    print('Saving data to file: %s' % cached_filename, file=sys.stderr)
    # Closing the file via ``with`` flushes the cache to disk before we
    # return; json.dump escapes non-ASCII by default, so the bytes written
    # are the same regardless of the encoding chosen here.
    with open(cached_filename, 'w', encoding='utf-8') as f:
        json.dump(retrieved_dataset, f)
    return retrieved_dataset
#
# Load the JSON data by calling load_data()
#
json_dataset = load_data()

# The complete dataset is now in memory; report how many projects it holds.
print('Number of projects = ', len(json_dataset))

# Sample analysis: count the projects that say they use the R programming
# language. The 'implementation_languages' value is split on commas (with
# any spaces around them) and checked for an exact 'R' entry.
re_split_list = re.compile(r' *, *')
r_projects = 0
for project in json_dataset:
    languages_field = project['implementation_languages']
    # Skip projects whose language field is empty or missing a value.
    if not languages_field:
        continue
    if 'R' in re_split_list.split(languages_field):
        r_projects += 1
print('R projects =', r_projects)