Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Added multi-threading to the update script #14

Merged
merged 5 commits into from
Dec 16, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 2 additions & 2 deletions scripts/add_new_contribution_to_yaml.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
"""
given properties, add a new contribution to the contributions.yaml database file.
"""
from datetime import datetime
from datetime import datetime, UTC
import json
import pathlib
from sys import argv
Expand Down Expand Up @@ -39,7 +39,7 @@

# append new contribution with next index
# add status, at top
datetime_today = datetime.utcnow().strftime('%Y-%m-%dT%H:%M:%S%z')
datetime_today = datetime.now(UTC).strftime('%Y-%m-%dT%H:%M:%S%z')
contribution = {
'id': max_index + 1,
'status': 'VALID',
Expand Down
35 changes: 24 additions & 11 deletions scripts/fetch_updates.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,15 +2,16 @@
Reads in the contributions.yaml file and updates each entry by fetching its 'source' URL.
"""
import argparse
from datetime import datetime
from datetime import datetime, UTC
import pathlib
from ruamel.yaml import YAML
from multiprocessing import Pool

from parse_and_validate_properties_txt import read_properties_txt, parse_text, validate_existing


def update_contribution(contribution, props):
datetime_today = datetime.utcnow().strftime('%Y-%m-%dT%H:%M:%S%z')
datetime_today = datetime.now(UTC).strftime('%Y-%m-%dT%H:%M:%S%z')
contribution['lastUpdated'] = datetime_today
if 'previousVersions' not in contribution:
contribution['previousVersions'] = []
Expand All @@ -29,6 +30,7 @@ def update_contribution(contribution, props):

if 'download' not in contribution:
contribution['download'] = contribution['source'][:contribution['source'].rfind('.')] + '.zip'


def log_broken(contribution, msg):
if contribution['status'] == 'VALID':
Expand All @@ -37,8 +39,10 @@ def log_broken(contribution, msg):
contribution['log'] = []
contribution['log'].append(msg)

def process_contribution(contribution):
date_today = datetime.utcnow().strftime('%Y-%m-%d')
def process_contribution(item):
index, contribution = item

date_today = datetime.now(UTC).strftime('%Y-%m-%d')
this_version = '0'

if contribution['status'] != 'DEPRECATED':
Expand All @@ -51,16 +55,16 @@ def process_contribution(contribution):
properties_raw = read_properties_txt(contribution['source'])
except FileNotFoundError as e:
log_broken(contribution, f'file not found, {e}, {date_today}')
return
return index, contribution
except Exception:
log_broken(contribution, f'url timeout, {date_today}')
return
return index, contribution

try:
props = validate_existing(parse_text(properties_raw))
except Exception:
log_broken(contribution, f'invalid file, {date_today}')
return
return index, contribution

# some library files have a lastUpdated field. The database has this field too, but there it is
# set by our scripts, so remove this field.
Expand All @@ -71,6 +75,7 @@ def process_contribution(contribution):
if props['version'] != this_version:
# update from online
update_contribution(contribution, props)
return index, contribution


if __name__ == "__main__":
Expand All @@ -92,14 +97,22 @@ def process_contribution(contribution):
contributions_list = data['contributions']

if index == 'all':
# update all contributions
for contribution in contributions_list:
process_contribution(contribution)
total = len(contributions_list)
completed = 0
print(f"Starting processing of {total} contributions...")

with Pool(processes=256) as pool:
for index, contribution in pool.imap_unordered(process_contribution, enumerate(contributions_list)):
contributions_list[index] = contribution
completed += 1
print(f"Progress: {completed}/{total} ({(completed/total*100):.1f}%)")

print("All processing complete")
else:
# update only contribution with id==index
contribution = next((x for x in contributions_list if x['id'] == int(index)), None)
print(contribution)
process_contribution(contribution)
process_contribution((index, contribution))
print(contribution)

# write all contributions to database file
Expand Down
2 changes: 1 addition & 1 deletion scripts/parse_and_validate_properties_txt.py
Original file line number Diff line number Diff line change
Expand Up @@ -64,7 +64,7 @@ def read_properties_txt(properties_url):
'User-Agent': 'Mozilla/5.0',
'Accept': 'text/html',
}
r = requests.get(properties_url, headers=headers)
r = requests.get(properties_url, headers=headers, timeout=30)

if r.status_code != 200:
raise FileNotFoundError(f"status code {r.status_code} returned for url {r.url}")
Expand Down
Loading