Skip to content

Commit

Permalink
feat: adding support for iac repos
Browse files Browse the repository at this point in the history
  • Loading branch information
cristian-rincon committed Nov 22, 2023
1 parent 4e0b796 commit 910d08b
Show file tree
Hide file tree
Showing 6 changed files with 159 additions and 13 deletions.
113 changes: 103 additions & 10 deletions extractor/core.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,16 +4,72 @@

import pandas as pd
import requests
from ghapi.all import GhApi
from rich.progress import track

from extractor.checks import StandardCheck
from extractor.logger import logger
from extractor.render import Requirements

URLBASE = "https://pypi.org/pypi"
# Base URL of the PyPI JSON API; requests are sent to "<base>/<project>/json".
PYPI_URL_BASE = "https://pypi.org/pypi"
# Base URL of GitHub. NOTE(review): not referenced in the visible part of this
# module -- confirm whether it is still needed.
GITHUB_URL_BASE = "https://github.com"
# Project names, written as GitHub "owner/repo", that is_iac_project() accepts.
IAC_EXCEPTIONS_LIST = [
"hashicorp/terraform",
"databricks/terraform-provider-databricks",
]


def get_raw_data(project: str) -> Dict[str, str]:
def is_python(project: str) -> bool:
    """
    Check if the project is a repository of the ``python`` GitHub organization.

    Projects recognised here are looked up on GitHub as ``owner/repo`` (for
    example ``python/cpython``), so the owner segment must be exactly
    ``python``. A bare ``"python"`` prefix test would also match PyPI
    distributions such as ``python-dateutil``, which have no ``owner/repo``
    form and must keep going through PyPI.

    Args:
        project: The name of the project.
    Returns:
        True when ``project`` starts with ``python/``, False otherwise.
    """
    return project.startswith("python/")


def is_iac_project(project: str) -> bool:
    """
    Tell whether the project is one of the known IaC repositories.

    Args:
        project: The name of the project.
    Returns:
        True when ``project`` equals an entry of ``IAC_EXCEPTIONS_LIST``,
        False otherwise.
    """
    return any(project == known for known in IAC_EXCEPTIONS_LIST)


def get_raw_data_from_github(project: str, version: str) -> Dict[str, str]:
    """
    Build the metadata record of a GitHub-hosted project from its LICENSE file.

    The ``LICENSE`` file of the repository is requested through the GitHub API
    at the ref given by ``version``. Its ``html_url`` is reported as the
    license and, with ``/LICENSE`` removed, as the version URL.

    Args:
        project: The name of the project, in ``owner/repo`` form.
        version: The version of the project, used as the git ref.
    Returns:
        A dictionary with the keys ``name``, ``version``, ``license``,
        ``pypi_release_url`` (always an empty string) and ``version_url``.
    """
    client = GhApi()
    owner, repo = project.split("/")
    license_file = client.repos.get_content(
        owner=owner, repo=repo, path="LICENSE", ref=version
    )
    license_url = license_file.get("html_url")

    record = {"name": project, "version": version}
    record["license"] = license_url
    record["pypi_release_url"] = ""
    record["version_url"] = license_url.replace("/LICENSE", "")
    return record


def get_raw_data_from_pypi(project: str) -> Dict[str, str]:
"""
Retrieve raw metadata for a project from a given URL.
Expand All @@ -25,7 +81,9 @@ def get_raw_data(project: str) -> Dict[str, str]:
"""
try:
r = requests.get(
f"{URLBASE}/{project}/json", headers={"Accept": "application/json"}
f"{PYPI_URL_BASE}/{project}/json",
headers={"Accept": "application/json"},
timeout=5,
)
except Exception as e:
logger.error(e)
Expand All @@ -36,6 +94,22 @@ def get_raw_data(project: str) -> Dict[str, str]:
return r.json()["info"]


def get_raw_data(project: str, version: str = "") -> Dict[str, str]:
    """
    Retrieve raw metadata for a project from GitHub or from PyPI.

    Projects accepted by ``is_iac_project`` or ``is_python`` are fetched from
    GitHub; every other project is fetched from PyPI.

    Args:
        project: The name of the project.
        version: The version of the project; only passed on to the GitHub
            lookup.
    Returns:
        A dictionary containing the raw metadata of the project.
    """
    hosted_on_github = is_iac_project(project) or is_python(project)
    if not hosted_on_github:
        return get_raw_data_from_pypi(project)
    return get_raw_data_from_github(project, version)


def filter_data(raw_data: Dict[str, str], version: str) -> Dict[str, str]:
"""
Filter relevant metadata from raw data.
Expand Down Expand Up @@ -96,30 +170,49 @@ def extract_data(source_path: Path, format: str) -> None:
result = Requirements().render(source_path, format)
pkgs_raw_metadata = []
for pkg in track(result):
filtered_data = filter_data(
get_raw_data(pkg[0]), pkg[1] if len(pkg) > 1 else None
)
if is_iac_project(pkg[0]) or is_python(pkg[0]):
filtered_data = get_raw_data(pkg[0], pkg[1])
else:
filtered_data = filter_data(
get_raw_data(pkg[0]), pkg[1] if len(pkg) > 1 else None
)
if filtered_data:
pkgs_raw_metadata.append(filtered_data)
output = pd.DataFrame(pkgs_raw_metadata)
output["uppercased_name"] = output["name"].str.upper()
output = output.sort_values(by=["uppercased_name"])
output = output.drop_duplicates(subset=["uppercased_name"], keep="first")
del output["uppercased_name"]
# output["uppercased_name"] = output["name"].str.upper()
# output = output.sort_values(by=["uppercased_name"])
# output = output.drop_duplicates(subset=["uppercased_name"], keep="first")
# del output["uppercased_name"]
output["uppercased_version_url"] = output["version_url"].str.upper()
output = output.sort_values(by=["uppercased_version_url"])
output = output.drop_duplicates(subset=["uppercased_version_url"], keep="first")
del output["uppercased_version_url"]
return output


def save_data(data: pd.DataFrame, output: Path) -> None:
    """
    Store the metadata plus a companion file listing only the version URLs.

    The companion file is written next to ``output`` as
    ``<output without extension>_only_version_urls.<ext>`` and holds a single
    ``Commit ID`` column copied from ``version_url``.

    Args:
        data: The metadata to store; must contain a ``version_url`` column.
        output: Destination path. Only ``.csv`` and ``.xlsx`` are supported;
            for any other extension an error is logged and no file is written.
    """
    output_str = str(output)
    # splitext strips only the final extension, so dots elsewhere in the path
    # (e.g. "./out/report.csv" or "/data/v1.2/report.csv") are kept intact.
    base_name, _ = os.path.splitext(output_str)
    output_directory = os.path.dirname(output_str)
    only_version_url_list_df = data[["version_url"]].rename(
        columns={"version_url": "Commit ID"}
    )

    logger.info(f"Storing into: {output}")
    # A bare file name has an empty dirname, and os.makedirs("") would raise.
    if output_directory:
        os.makedirs(output_directory, exist_ok=True)

    if output_str.endswith(".csv"):
        only_version_url_path = f"{base_name}_only_version_urls.csv"
        data.to_csv(output, index=False)
        only_version_url_list_df.to_csv(only_version_url_path, index=False)
    elif output_str.endswith(".xlsx"):
        only_version_url_path = f"{base_name}_only_version_urls.xlsx"
        data.to_excel(output, index=False)
        only_version_url_list_df.to_excel(only_version_url_path, index=False)
    else:
        logger.error("Not supported format.")
        return

    logger.info(f"Only urls version stored into: {only_version_url_path}")
    logger.info("All done! Have a Great day")
50 changes: 49 additions & 1 deletion poetry.lock

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

1 change: 1 addition & 0 deletions pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,7 @@ pandas = "^2.0.2"
loguru = "^0.7.0"
requests = "^2.31.0"
toml = "^0.10.2"
ghapi = "^1.0.4"


[tool.poetry.group.dev.dependencies]
Expand Down
4 changes: 3 additions & 1 deletion tests/mocks/in/pip_freeze/sample_1.txt
Original file line number Diff line number Diff line change
Expand Up @@ -6,5 +6,7 @@ scikit-learn==1.2.1
great_expectations==0.15.47
pandas==2.0.2
XlsxWriter
hashicorp/terraform==v1.6.3
databricks/terraform-provider-databricks==v1.29.0
# Sphinx
# sphinx-rtd-theme
# sphinx-rtd-theme
2 changes: 1 addition & 1 deletion tests/mocks/in/pip_freeze/sample_2.txt
Original file line number Diff line number Diff line change
Expand Up @@ -11,4 +11,4 @@ delta-spark==2.3.0
mlflow==1.29.0
matplotlib==3.7.1
databricks-feature-store==0.11.0
mlflow-skinny==2.3.2
mlflow-skinny==2.3.2
2 changes: 2 additions & 0 deletions tests/mocks/in/pip_freeze/sample_3.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,2 @@
hashicorp/terraform==v1.6.3
python/cpython==v3.9.18

0 comments on commit 910d08b

Please sign in to comment.