Skip to content

Commit

Permalink
fix: sbom performance improvements (#2423)
Browse files Browse the repository at this point in the history
* add: `sbom` source to `oss_directory`

* fix: identify missing `sbom` repositories more efficiently

* fix: avoid calling `resolve_repos` when resolving `sbom`

* fix: use dbt `source` macro

* chore: fix `sqlfluff` style for `missing_sbom` staging model
  • Loading branch information
Jabolol authored Oct 28, 2024
1 parent 64f3165 commit d14dee4
Show file tree
Hide file tree
Showing 4 changed files with 38 additions and 81 deletions.
2 changes: 2 additions & 0 deletions warehouse/dbt/models/oss_directory_source.yml
Original file line number Diff line number Diff line change
Expand Up @@ -10,5 +10,7 @@ sources:
identifier: collections
- name: repositories
identifier: repositories
- name: sbom
identifier: sbom
- name: missing_sbom
identifier: missing_sbom
Original file line number Diff line number Diff line change
Expand Up @@ -2,21 +2,31 @@
materialized = 'view'
) }}

with source as (
with all_repos as (
select *
from {{ source('ossd', 'missing_sbom') }}
from
{{ source('ossd', 'repositories') }}
),

current_dlt_load_id as (
select max(_dlt_load_id) as max_dlt_load_id
from source
),

last_snapshot as (
all_ossd as (
select *
from source
where _dlt_load_id = (select max_dlt_load_id from current_dlt_load_id)
from
{{ source('ossd', 'sbom') }}
where
artifact_source = 'GITHUB'
)

select *
from last_snapshot
select
`owner` as artifact_namespace,
`name` as artifact_name,
'GITHUB' as artifact_source,
`url` as artifact_url,
ingestion_time as snapshot_at
from
all_repos as ar
left join
all_ossd as ao
on
CONCAT(ao.artifact_namespace, '/', ao.artifact_name) = ar.name_with_owner
where
ao.artifact_namespace is null
13 changes: 0 additions & 13 deletions warehouse/oso_dagster/assets/ossd.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,7 +17,6 @@
from oso_dagster.dlt_sources.github_repos import (
oss_directory_github_repositories_resource,
oss_directory_github_sbom_resource,
oss_directory_missing_sbom_repositories_resource,
)
from oso_dagster.factories import dlt_factory
from oso_dagster.factories.common import AssetFactoryResponse
Expand Down Expand Up @@ -162,18 +161,6 @@ def sbom(
yield oss_directory_github_sbom_resource(projects_df, gh_token)


@dlt_factory(
    key_prefix="ossd",
    ins={"projects_df": AssetIn(project_key)},
    tags=common_tags,
)
def missing_sbom(
    projects_df: pl.DataFrame,
    gh_token: str = secret_ref_arg(group_name="ossd", key="github_token"),
):
    """Yield the dlt resource for repositories that are missing an SBOM.

    Args:
        projects_df: Projects frame wired in from the ``project_key`` asset input.
        gh_token: GitHub token; defaults to the ``github_token`` secret
            reference in the ``ossd`` group.

    Yields:
        The resource returned by
        ``oss_directory_missing_sbom_repositories_resource``.
    """
    missing_sbom_resource = oss_directory_missing_sbom_repositories_resource(
        projects_df, gh_token
    )
    yield missing_sbom_resource


@discoverable_jobs(dependencies=[repositories])
def ossd_jobs(dependencies: t.List[AssetFactoryResponse]):
repositories = t.cast(AssetsDefinition, list(dependencies[0].assets)[0])
Expand Down
70 changes: 14 additions & 56 deletions warehouse/oso_dagster/dlt_sources/github_repos/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -285,23 +285,13 @@ def github_urls_from_df(self, projects_df: pl.DataFrame):
logger.debug(f"unnested all github urls and got {len(all_github_urls)} rows")
return all_github_urls

def is_repo_missing_sbom(self, repo: Repository) -> bool:
    """Report whether exporting the SBOM for ``repo`` fails.

    Calls the dependency-graph ``export_sbom`` endpoint with the
    repository's owner and name.

    Args:
        repo: Repository whose ``owner`` and ``name`` identify the target.

    Returns:
        ``False`` when the export call succeeds, ``True`` when it raises
        ``RequestFailed``.
    """
    try:
        self._gh.rest.dependency_graph.export_sbom(repo.owner, repo.name)
    except RequestFailed as err:
        # Only non-404 statuses are logged; a 404 is handled silently.
        # NOTE(review): non-404 failures are still reported as "missing"
        # -- confirm that treating them like a 404 is intended.
        if err.response.status_code != 404:
            logger.warning("Error checking for SBOM: %s", err)
        return True
    else:
        return False

def get_sbom_for_repo(self, repo: Repository) -> List[GithubRepositorySBOMItem]:
def get_sbom_for_repo(
self, owner: str, name: str
) -> List[GithubRepositorySBOMItem]:
try:
sbom = self._gh.rest.dependency_graph.export_sbom(
repo.owner,
repo.name,
owner,
name,
)
graph = sbom.parsed_data.sbom
sbom_list: List[GithubRepositorySBOMItem] = []
Expand All @@ -317,8 +307,8 @@ def get_sbom_for_repo(self, repo: Repository) -> List[GithubRepositorySBOMItem]:

sbom_list.append(
GithubRepositorySBOMItem(
artifact_namespace=repo.owner,
artifact_name=repo.name,
artifact_namespace=owner,
artifact_name=name,
artifact_source="GITHUB",
package=package_name,
package_source=package_source.upper(),
Expand All @@ -330,7 +320,7 @@ def get_sbom_for_repo(self, repo: Repository) -> List[GithubRepositorySBOMItem]:
return sbom_list
except RequestFailed as exception:
if exception.response.status_code == 404:
logger.warning("Skipping %s, no SBOM found", repo.url)
logger.warning("Skipping %s, no SBOM found", f"{owner}/{name}")
else:
logger.warning("Error getting SBOM: %s", exception)
return []
Expand Down Expand Up @@ -398,7 +388,7 @@ def oss_directory_github_sbom_resource(
rate_limit_max_retry: int = 5,
server_error_max_rety: int = 3,
):
"""Based on the oss_directory data we resolve sbom manifests for repositories"""
"""Retrieve SBOM information for GitHub repositories"""

config = GithubClientConfig(
gh_token=gh_token,
Expand All @@ -409,41 +399,9 @@ def oss_directory_github_sbom_resource(
gh = GithubRepositoryResolver.get_github_client(config)
resolver = GithubRepositoryResolver(gh)

for repo in resolver.resolve_repos(projects_df):
yield from resolver.get_sbom_for_repo(repo)


@dlt.resource(
name="missing_sbom",
table_name="missing_sbom",
columns=pydantic_to_dlt_nullable_columns(GitHubRespositoryMissingSBOMItem),
write_disposition="append",
)
def oss_directory_missing_sbom_repositories_resource(
projects_df: pl.DataFrame,
gh_token: str = dlt.secrets.value,
rate_limit_max_retry: int = 5,
server_error_max_rety: int = 3,
):
"""Based on the oss_directory data we resolve repositories"""

config = GithubClientConfig(
gh_token=gh_token,
rate_limit_max_retry=rate_limit_max_retry,
server_error_max_rety=server_error_max_rety,
)

gh = GithubRepositoryResolver.get_github_client(config)
resolver = GithubRepositoryResolver(gh)
all_github_urls = resolver.github_urls_from_df(projects_df)
valid_urls = [resolver.parse_url(url) for url in all_github_urls["url"] if url]

yield (
GitHubRespositoryMissingSBOMItem(
artifact_namespace=repo.owner,
artifact_name=repo.name,
artifact_source="GITHUB",
artifact_url=repo.url,
snapshot_at=repo.ingestion_time or datetime.now(UTC),
)
for repo in resolver.resolve_repos(projects_df)
if resolver.is_repo_missing_sbom(repo)
)
for url in valid_urls:
if url.type == GithubURLType.REPOSITORY and url.repository:
yield from resolver.get_sbom_for_repo(url.owner, url.repository)

0 comments on commit d14dee4

Please sign in to comment.