From d5a6aadbeeeaaec35d8f1924529fa28a33441b12 Mon Sep 17 00:00:00 2001 From: Stef Tervelde Date: Fri, 13 Dec 2024 07:37:16 +0100 Subject: [PATCH 1/5] Update fetch_updates.py --- scripts/fetch_updates.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/scripts/fetch_updates.py b/scripts/fetch_updates.py index 709448e..2420c95 100644 --- a/scripts/fetch_updates.py +++ b/scripts/fetch_updates.py @@ -5,6 +5,7 @@ from datetime import datetime import pathlib from ruamel.yaml import YAML +from multiprocessing import Pool from parse_and_validate_properties_txt import read_properties_txt, parse_text, validate_existing @@ -93,8 +94,8 @@ def process_contribution(contribution): if index == 'all': # update all contributions - for contribution in contributions_list: - process_contribution(contribution) + with Pool() as pool: + pool.map(process_contribution, contributions_list) else: # update only contribution with id==index contribution = next((x for x in contributions_list if x['id'] == int(index)), None) From e285bec1b063c7ca72b5e9619547f9abb508f148 Mon Sep 17 00:00:00 2001 From: Stef Tervelde Date: Fri, 13 Dec 2024 07:41:05 +0100 Subject: [PATCH 2/5] Update fetch_updates.py --- scripts/fetch_updates.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/scripts/fetch_updates.py b/scripts/fetch_updates.py index 2420c95..9cec694 100644 --- a/scripts/fetch_updates.py +++ b/scripts/fetch_updates.py @@ -30,6 +30,8 @@ def update_contribution(contribution, props): if 'download' not in contribution: contribution['download'] = contribution['source'][:contribution['source'].rfind('.')] + '.zip' + + print(f"Updated {contribution['source']}: {contribution['lastUpdated']}") def log_broken(contribution, msg): if contribution['status'] == 'VALID': From 64fe1e5058ba17746b78d609a487ed129b4b61a1 Mon Sep 17 00:00:00 2001 From: Stef Tervelde Date: Fri, 13 Dec 2024 08:09:17 +0100 Subject: [PATCH 3/5] progress indicator and timeouts --- scripts/fetch_updates.py | 14 ++++++++++---- scripts/parse_and_validate_properties_txt.py | 2 +- 2 files changed, 11 insertions(+), 5 deletions(-) diff --git a/scripts/fetch_updates.py b/scripts/fetch_updates.py index 9cec694..fa4153e 100644 --- a/scripts/fetch_updates.py +++ b/scripts/fetch_updates.py @@ -31,7 +31,6 @@ def update_contribution(contribution, props): if 'download' not in contribution: contribution['download'] = contribution['source'][:contribution['source'].rfind('.')] + '.zip' - print(f"Updated {contribution['source']}: {contribution['lastUpdated']}") def log_broken(contribution, msg): if contribution['status'] == 'VALID': @@ -95,9 +94,16 @@ def process_contribution(contribution): contributions_list = data['contributions'] if index == 'all': - # update all contributions - with Pool() as pool: - pool.map(process_contribution, contributions_list) + total = len(contributions_list) + completed = 0 + print(f"Starting processing of {total} contributions...") + + with Pool(processes=256) as pool: + for _ in pool.imap_unordered(process_contribution, contributions_list): + completed += 1 + print(f"Progress: {completed}/{total} ({(completed/total*100):.1f}%)") + + print("All processing complete") else: # update only contribution with id==index contribution = next((x for x in contributions_list if x['id'] == int(index)), None) diff --git a/scripts/parse_and_validate_properties_txt.py b/scripts/parse_and_validate_properties_txt.py index d4eed27..d439bb3 100644 --- a/scripts/parse_and_validate_properties_txt.py +++ b/scripts/parse_and_validate_properties_txt.py @@ -64,7 +64,7 @@ def read_properties_txt(properties_url): 'User-Agent': 'Mozilla/5.0', 'Accept': 'text/html', } - r = requests.get(properties_url, headers=headers) + r = requests.get(properties_url, headers=headers, timeout=30) if r.status_code != 200: raise FileNotFoundError(f"status code {r.status_code} returned for url {r.url}") From ea70ea5d5ef9e40449e30615232fda515988f22d Mon Sep 17 00:00:00 2001 From: Stef Tervelde Date: Fri, 13 Dec 2024 08:12:38 +0100 Subject: [PATCH 4/5] warning cleanup --- scripts/add_new_contribution_to_yaml.py | 4 ++-- scripts/fetch_updates.py | 6 +++--- 2 files changed, 5 insertions(+), 5 deletions(-) diff --git a/scripts/add_new_contribution_to_yaml.py b/scripts/add_new_contribution_to_yaml.py index 3ed4832..18812a7 100644 --- a/scripts/add_new_contribution_to_yaml.py +++ b/scripts/add_new_contribution_to_yaml.py @@ -1,7 +1,7 @@ """ given properties, add a new contribution to the contributions.yaml database file. """ -from datetime import datetime +from datetime import datetime, UTC import json import pathlib from sys import argv @@ -39,7 +39,7 @@ # append new contribution with next index # add status, at top - datetime_today = datetime.utcnow().strftime('%Y-%m-%dT%H:%M:%S%z') + datetime_today = datetime.now(UTC).strftime('%Y-%m-%dT%H:%M:%S%z') contribution = { 'id': max_index + 1, 'status': 'VALID', diff --git a/scripts/fetch_updates.py b/scripts/fetch_updates.py index fa4153e..eb6a87f 100644 --- a/scripts/fetch_updates.py +++ b/scripts/fetch_updates.py @@ -2,7 +2,7 @@ Reads in the contributions.yaml file, and updates the entries by hitting the 'source' url. """ import argparse -from datetime import datetime +from datetime import datetime, UTC import pathlib from ruamel.yaml import YAML from multiprocessing import Pool @@ -11,7 +11,7 @@ def update_contribution(contribution, props): - datetime_today = datetime.utcnow().strftime('%Y-%m-%dT%H:%M:%S%z') + datetime_today = datetime.now(UTC).strftime('%Y-%m-%dT%H:%M:%S%z') contribution['lastUpdated'] = datetime_today if 'previousVersions' not in contribution: contribution['previousVersions'] = [] @@ -40,7 +40,7 @@ def log_broken(contribution, msg): contribution['log'].append(msg) def process_contribution(contribution): - date_today = datetime.utcnow().strftime('%Y-%m-%d') + date_today = datetime.now(UTC).strftime('%Y-%m-%d') this_version = '0' if contribution['status'] != 'DEPRECATED': From a03e74866e9227d30b1426b212a380ee9bec21b7 Mon Sep 17 00:00:00 2001 From: Stef Tervelde Date: Mon, 16 Dec 2024 17:15:33 +0100 Subject: [PATCH 5/5] Fixed values not being passed by reference --- scripts/fetch_updates.py | 16 ++++++++++------ 1 file changed, 10 insertions(+), 6 deletions(-) diff --git a/scripts/fetch_updates.py b/scripts/fetch_updates.py index eb6a87f..971c346 100644 --- a/scripts/fetch_updates.py +++ b/scripts/fetch_updates.py @@ -39,7 +39,9 @@ def log_broken(contribution, msg): contribution['log'] = [] contribution['log'].append(msg) -def process_contribution(contribution): +def process_contribution(item): + index, contribution = item + date_today = datetime.now(UTC).strftime('%Y-%m-%d') this_version = '0' @@ -53,16 +55,16 @@ def process_contribution(contribution): properties_raw = read_properties_txt(contribution['source']) except FileNotFoundError as e: log_broken(contribution, f'file not found, {e}, {date_today}') - return + return index, contribution except Exception: log_broken(contribution, f'url timeout, {date_today}') - return + return index, contribution try: props = validate_existing(parse_text(properties_raw)) except Exception: log_broken(contribution, f'invalid file, {date_today}') - return + return index, contribution # some library files have field lastUpdated. This also exists in the database, but is defined # by our scripts, so remove this field. @@ -73,6 +75,7 @@ def process_contribution(contribution): if props['version'] != this_version: # update from online update_contribution(contribution, props) + return index, contribution if __name__ == "__main__": @@ -99,7 +102,8 @@ def process_contribution(contribution): print(f"Starting processing of {total} contributions...") with Pool(processes=256) as pool: - for _ in pool.imap_unordered(process_contribution, contributions_list): + for index, contribution in pool.imap_unordered(process_contribution, enumerate(contributions_list)): + contributions_list[index] = contribution completed += 1 print(f"Progress: {completed}/{total} ({(completed/total*100):.1f}%)") @@ -108,7 +112,7 @@ def process_contribution(contribution): # update only contribution with id==index contribution = next((x for x in contributions_list if x['id'] == int(index)), None) print(contribution) - process_contribution(contribution) + process_contribution((index, contribution)) print(contribution) # write all contributions to database file