Commit 9d26e15

Add route and handlers for closing PR mirrors
When a PR is closed by a merge:

* Copy the PR mirror to the graduated binaries mirror
* Prune duplicates that have already been updated in the "develop" mirror
* Reindex the graduated binaries mirror
* Delete the PR mirror

When a PR is closed, but not merged:

* Delete the PR mirror

This change removes the need for sync-script-based cleaning of PR binary mirrors.
1 parent c4956d5 commit 9d26e15
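
The merge path is built as a chain of RQ jobs: the copy runs first, each stack's prune depends on the copy, each reindex depends on its prune, and the final mirror deletion waits only on the copy. A minimal sketch of that dependency pattern, with hypothetical placeholder task functions, queue name, and URLs standing in for the real workers added below:

```python
# Sketch of the RQ dependency chain described in the commit message.
# Task functions, the queue name, and the URLs are hypothetical placeholders.
from redis import Redis
from rq import Queue


def copy_binaries(src, dest):              # stands in for copy_pr_mirror
    print(f"copy {src} -> {dest}")


def prune_duplicates(mirror, published):   # stands in for prune_mirror_duplicates
    print(f"prune {mirror} against {published}")


def reindex(mirror):                       # stands in for update_mirror_index
    print(f"reindex {mirror}")


def delete_mirror(mirror):                 # stands in for delete_pr_mirror
    print(f"delete {mirror}")


queue = Queue("tasks_long", connection=Redis())
timeout = 6 * 60 * 60  # same six-hour ceiling the handler uses

copy_job = queue.enqueue(
    copy_binaries, "s3://pr-mirrors/pr123_feature", "s3://pr-mirrors/shared",
    job_timeout=timeout,
)
prune_job = queue.enqueue(
    prune_duplicates, "s3://pr-mirrors/shared/e4s", "s3://published/e4s/develop",
    depends_on=copy_job, job_timeout=timeout,
)
reindex_job = queue.enqueue(
    reindex, "s3://pr-mirrors/shared/e4s",
    depends_on=prune_job, job_timeout=timeout,
)
# Deleting the per-PR mirror only needs the copy to have finished.
delete_job = queue.enqueue(
    delete_mirror, "s3://pr-mirrors/pr123_feature",
    depends_on=copy_job, job_timeout=timeout,
)
```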

File tree (4 files changed, +311 −0 lines changed)

  spackbot/handlers/__init__.py
  spackbot/handlers/mirrors.py
  spackbot/routes.py
  spackbot/workers.py

spackbot/handlers/__init__.py

+1
@@ -3,3 +3,4 @@
 from .reviewers import add_reviewers, add_issue_maintainers  # noqa
 from .reviewers import add_reviewers  # noqa
 from .style import style_comment, fix_style  # noqa
+from .mirrors import close_pr_mirror  # noqa

spackbot/handlers/mirrors.py

+124
@@ -0,0 +1,124 @@
+# Copyright 2013-2021 Lawrence Livermore National Security, LLC and other
+# Spack Project Developers. See the top-level COPYRIGHT file for details.
+#
+# SPDX-License-Identifier: (Apache-2.0 OR MIT)
+
+import os
+
+from sh.contrib import git
+
+import spackbot.helpers as helpers
+from spackbot.workers import (
+    copy_pr_mirror,
+    prune_mirror_duplicates,
+    update_mirror_index,
+    delete_pr_mirror,
+    get_queue,
+    TASK_QUEUE_LONG,
+    spack_upstream,
+    pr_expected_base,
+    pr_mirror_base_url,
+    pr_shared_mirror,
+    publish_mirror_base_url,
+)
+
+# If we don't provide a timeout, the default in RQ is 180 seconds
+WORKER_JOB_TIMEOUT = 6 * 60 * 60
+
+logger = helpers.getLogger(__name__)
+
+
+def list_ci_stacks(branch):
+    with helpers.temp_dir() as cwd:
+        # Shallow clone of spack used to find the CI stacks available on the
+        # base branch.
+        git.clone("--branch", branch, "--depth", 1, spack_upstream, "spack-develop")
+
+        stacks = []
+        pipeline_root = (
+            f"{cwd}/spack-develop/share/spack/gitlab/cloud_pipelines/stacks/"
+        )
+        for stack in os.listdir(pipeline_root):
+            if os.path.isfile(f"{pipeline_root}/{stack}/spack.yaml"):
+                stacks.append(stack)
+
+        return stacks
+
+
+async def close_pr_mirror(event, gh):
+    payload = event.data
+
+    base_branch = payload["pull_request"]["base"]["ref"]
+    is_merged = payload["pull_request"]["merged"]
+    pr_number = payload["number"]
+    pr_branch = payload["pull_request"]["head"]["ref"]
+    pr_mirror_url = f"{pr_mirror_base_url}/pr{pr_number}_{pr_branch}"
+
+    # Use the "long" running task queue
+    ltask_q = get_queue(TASK_QUEUE_LONG)
+
+    job_metadata = {
+        "type": None,
+        "stack": None,
+    }
+
+    copy_job = None
+
+    if is_merged and base_branch == pr_expected_base:
+        logger.info(
+            f"PR {pr_number}/{pr_branch} merged to develop, graduating binaries"
+        )
+
+        # Copy all of the stack binaries from the PR mirror to the shared PR
+        # mirror.
+        job_metadata.update({"type": "copy"})
+        shared_pr_mirror_url = f"{pr_mirror_base_url}/{pr_shared_mirror}"
+        copy_job = ltask_q.enqueue(
+            copy_pr_mirror,
+            pr_mirror_url,
+            shared_pr_mirror_url,
+            meta=job_metadata,
+            job_timeout=WORKER_JOB_TIMEOUT,
+        )
+        logger.info(f"Copy job queued: {copy_job.id}")
+
+        # Loop over all of the stacks present on the expected base branch
+        for stack in list_ci_stacks(pr_expected_base):
+            job_metadata.update({"stack": stack})
+            # Prune duplicates that have already been published, since the
+            # copy above may have introduced some.
+            job_metadata.update({"type": "prune"})
+            shared_pr_mirror_url = f"{pr_mirror_base_url}/{pr_shared_mirror}/{stack}"
+            publish_mirror_url = f"{publish_mirror_base_url}/{stack}/{pr_expected_base}"
+            prune_job = ltask_q.enqueue(
+                prune_mirror_duplicates,
+                shared_pr_mirror_url,
+                publish_mirror_url,
+                job_timeout=WORKER_JOB_TIMEOUT,
+                depends_on=copy_job,
+                meta=job_metadata,
+            )
+            logger.info(f"Pruning job queued: {prune_job.id}")
+
+            stack_mirror_url = f"{pr_mirror_base_url}/{pr_shared_mirror}/{stack}"
+            # Queue a reindex of the stack mirror to run after the prune.
+            job_metadata.update({"type": "reindex"})
+            update_job = ltask_q.enqueue(
+                update_mirror_index,
+                stack_mirror_url,
+                job_timeout=WORKER_JOB_TIMEOUT,
+                depends_on=prune_job,
+                meta=job_metadata,
+            )
+            logger.info(f"Reindex job queued: {update_job.id}")
+
+    # Delete the per-PR mirror whether or not the PR was merged.
+    job_metadata.update({"type": "delete"})
+    del_job = ltask_q.enqueue(
+        delete_pr_mirror,
+        pr_mirror_url,
+        meta=job_metadata,
+        job_timeout=WORKER_JOB_TIMEOUT,
+        depends_on=copy_job,
+    )
+    logger.info(f"Delete job queued: {del_job.id}")

spackbot/routes.py

+8
@@ -130,3 +130,11 @@ async def label_pull_requests(event, gh, *args, session, **kwargs):
     Add labels to PRs based on which files were modified.
     """
     await handlers.add_labels(event, gh)
+
+
+@router.register("pull_request", action="closed")
+async def on_closed_pull_request(event, gh, *args, session, **kwargs):
+    """
+    Respond to a closed pull request.
+    """
+    await handlers.close_pr_mirror(event, gh)
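
The new route fires on `pull_request` deliveries with action `closed`. The handler reads only a few payload fields; a minimal sketch of those fields follows, with invented values for illustration:

```python
# Hypothetical minimal payload for a "pull_request closed" delivery; only the
# fields read by close_pr_mirror are shown, and the values are invented.
example_payload = {
    "action": "closed",
    "number": 123,                       # names the PR mirror: pr123_<branch>
    "pull_request": {
        "merged": True,                  # merged vs. merely closed
        "base": {"ref": "develop"},      # must equal pr_expected_base to graduate
        "head": {"ref": "my-feature"},
    },
}

# Dispatching such a payload through the gidgethub router would look roughly
# like the following (requires a configured GitHubAPI instance and the keyword
# arguments the spackbot handlers expect):
#
#     event = gidgethub.sansio.Event(example_payload, event="pull_request", delivery_id="1")
#     await router.dispatch(event, gh, session=session)
```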

spackbot/workers.py

+178
@@ -326,3 +326,181 @@ async def fix_style_task(event):
         await gh.post(
             event.data["issue"]["comments_url"], {}, data={"body": message}
         )
+
+
+async def copy_pr_mirror(pr_mirror_url, shared_pr_mirror_url):
+    """Copy binaries from the per-PR mirror to the shared PR mirror.
+
+    Uses an s3 resource to copy objects between the two mirrors.
+    """
+    pr_url = helpers.s3_parse_url(pr_mirror_url)
+    shared_pr_url = helpers.s3_parse_url(shared_pr_mirror_url)
+
+    s3 = boto3.resource("s3")
+    pr_bucket_name = pr_url.get("bucket")
+    pr_bucket = s3.Bucket(pr_bucket_name)
+    pr_mirror_prefix = pr_url.get("prefix")
+
+    shared_pr_bucket = s3.Bucket(shared_pr_url.get("bucket"))
+    shared_pr_mirror_prefix = shared_pr_url.get("prefix")
+
+    # File extensions to copy
+    extensions = (".spack", ".spec.json", ".spec.yaml", ".spec.json.sig")
+
+    for obj in pr_bucket.objects.filter(Prefix=pr_mirror_prefix):
+        if obj.key.endswith(extensions):
+            # Create a new object, replacing the first instance of the
+            # pr_mirror_prefix with the shared_pr_mirror_prefix.
+            new_obj = shared_pr_bucket.Object(
+                obj.key.replace(pr_mirror_prefix, shared_pr_mirror_prefix, 1)
+            )
+            # Copy the PR mirror object to the new object in the shared PR mirror
+            new_obj.copy(
+                {
+                    "Bucket": pr_bucket_name,
+                    "Key": obj.key,
+                }
+            )
+
+
+async def delete_pr_mirror(pr_mirror_url):
+    """Delete every object under the per-PR mirror prefix."""
+    pr_url = helpers.s3_parse_url(pr_mirror_url)
+
+    s3 = boto3.resource("s3")
+    pr_bucket = s3.Bucket(pr_url.get("bucket"))
+    pr_mirror_prefix = pr_url.get("prefix")
+    pr_bucket.objects.filter(Prefix=pr_mirror_prefix).delete()
+
+
+# Update the index of a single stack mirror
+async def update_mirror_index(mirror_url):
+    """Use the spack buildcache command to update the index on a remote mirror"""
+
+    # Stack associated with the current job
+    job = get_current_job()
+    stack = job.meta["stack"]
+
+    # Check whether another reindex for this stack is already queued
+    do_reindex = True
+    ltask_q = get_queue(job.origin)
+
+    for queued_job in ltask_q.jobs:
+        meta = queued_job.meta
+        if meta["type"] == "reindex" and meta["stack"] == stack:
+            do_reindex = False
+            break
+
+    # If no further reindex jobs are queued for this stack, run the reindex
+    # on the graduated PR mirror.
+    if do_reindex:
+        print(f"Updating binary index at {mirror_url}")
+        await helpers.run_in_subprocess(
+            [
+                "spack",
+                "-d",
+                "buildcache",
+                "update-index",
+                "--mirror-url",
+                f"'{mirror_url}'",
+            ]
+        )
+
+
+# This works because we guarantee the hash is in the filename.
+# If this assumption is ever broken, this code will break.
+def hash_from_key(key):
+    h = None
+    # The hash is 32 chars long, between a "-" and a "."
+    # Examples include:
+    # linux-ubuntu18.04-x86_64-gcc-8.4.0-armadillo-10.5.0-gq3ijjrtnzgpm4bvuamjr6wa7hzxkypz.spack
+    # linux-ubuntu18.04-x86_64-gcc-8.4.0-armadillo-10.5.0-gq3ijjrtnzgpm4bvuamjr6wa7hzxkypz.spec.json
+    h = re.findall(r"-([a-zA-Z0-9]{32})\.", key.lower())
+    if len(h) > 1:
+        # Error, multiple matches are ambiguous
+        h = None
+    elif h:
+        h = h[0]
+    return h
+
+
+# Prune published duplicates from a stack's shared PR mirror
+async def prune_mirror_duplicates(pr_mirror_url, publish_mirror_url):
+    s3 = boto3.resource("s3")
+
+    pr_url = helpers.s3_parse_url(pr_mirror_url)
+    pr_bucket_name = pr_url.get("bucket")
+    pr_bucket = s3.Bucket(pr_bucket_name)
+    pr_mirror_prefix = pr_url.get("prefix")
+
+    publish_url = helpers.s3_parse_url(publish_mirror_url)
+    publish_bucket = s3.Bucket(publish_url.get("bucket"))
+    publish_mirror_prefix = publish_url.get("prefix")
+
+    # All of the expected spec file extensions
+    extensions = (".spec.json", ".spec.yaml", ".spec.json.sig")
+
+    # Get the current time for age-based pruning
+    now = datetime.now()
+    pr_specs = set()
+    for obj in pr_bucket.objects.filter(
+        Prefix=pr_mirror_prefix,
+    ):
+        # Need to convert from aware to naive time to get a delta
+        last_modified = obj.last_modified.replace(tzinfo=None)
+        # Prune objects older than the retirement age to avoid storing cached
+        # objects that only existed during development.
+        if (now - last_modified).days >= helpers.pr_mirror_retire_after_days:
+            logger.debug(
+                f"pr mirror pruning {obj.key} from s3://{pr_bucket_name}: "
+                "reason(age)"
+            )
+            # Anything older than the retirement age is pruned indiscriminately
+            obj.delete()
+
+            # Record the hash from the object so all of the files associated
+            # with it are also removed.
+            spec_hash = hash_from_key(obj.key)
+            if spec_hash:
+                pr_specs.add(spec_hash)
+            continue
+
+        if not obj.key.endswith(extensions):
+            continue
+
+        # Collect the hashes present in the shared PR bucket.
+        spec_hash = hash_from_key(obj.key)
+        if spec_hash:
+            pr_specs.add(spec_hash)
+        else:
+            logger.error(f"Encountered spec file without hash in name: {obj.key}")
+
+    # Check the published base branch bucket for duplicates to delete
+    delete_specs = set()
+    for obj in publish_bucket.objects.filter(
+        Prefix=publish_mirror_prefix,
+    ):
+        if not obj.key.endswith(extensions):
+            continue
+
+        spec_hash = hash_from_key(obj.key.lower())
+        if spec_hash in pr_specs:
+            delete_specs.add(spec_hash)
+
+    # Also consider the .spack files for deletion
+    extensions = (".spack", *extensions)
+
+    # Delete all of the objects whose hashes were marked
+    for obj in pr_bucket.objects.filter(
+        Prefix=pr_mirror_prefix,
+    ):
+        if not obj.key.endswith(extensions):
+            continue
+
+        if hash_from_key(obj.key) in delete_specs:
+            logger.debug(
+                f"pr mirror pruning {obj.key} from s3://{pr_bucket_name}: "
+                "reason(published)"
+            )
+            obj.delete()
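
The workers above assume that helpers.s3_parse_url returns a mapping with "bucket" and "prefix" keys; that helper is not part of this diff. A rough sketch of the assumed behavior, not the actual implementation:

```python
# Assumed behavior of helpers.s3_parse_url, for illustration only; the real
# helper lives elsewhere in spackbot and may differ.
from urllib.parse import urlparse


def s3_parse_url(url):
    """Split an s3:// URL into a bucket name and a key prefix."""
    parsed = urlparse(url)
    return {
        "bucket": parsed.netloc,            # e.g. "example-pr-binaries"
        "prefix": parsed.path.lstrip("/"),  # e.g. "shared_pr_mirror/e4s"
    }


# s3_parse_url("s3://example-pr-binaries/shared_pr_mirror/e4s")
# -> {"bucket": "example-pr-binaries", "prefix": "shared_pr_mirror/e4s"}
```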
