Commit
Merge branch 'dev' into update-zenodo-dois
e-belfer authored Nov 27, 2023
2 parents f9893a7 + 649db02 commit 8e7aac7
Showing 13 changed files with 409 additions and 438 deletions.
2 changes: 1 addition & 1 deletion .github/workflows/update-conda-lockfile.yml
@@ -4,7 +4,7 @@ name: update-conda-lockfile
 on:
   workflow_dispatch:
   schedule:
-    - cron: "0 9 * * 1-5" # Weekdays at 9AM UTC
+    - cron: "0 9 * * 1" # Mondays at 9AM UTC
   push:
     paths:
       - "Makefile"
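As a sanity check on the new schedule, a short snippet using the third-party croniter package (an assumption — it is not a dependency of this workflow) shows when the cron expression fires:

# Enumerate the next few fire times of "0 9 * * 1" (Mondays at 9AM UTC).
# Assumes the third-party croniter package is installed.
from datetime import datetime

from croniter import croniter

schedule = croniter("0 9 * * 1", datetime(2023, 11, 27))
for _ in range(3):
    print(schedule.get_next(datetime))  # consecutive Mondays at 09:00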
69 changes: 69 additions & 0 deletions devtools/sqlite_to_duckdb.py
@@ -0,0 +1,69 @@
"""A naive script for converting SQLite to DuckDB."""
import logging
from pathlib import Path

import click
import duckdb

# Configure logging
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)


@click.command()
@click.argument("sqlite_path", type=click.Path(exists=True, resolve_path=True))
@click.argument(
"duckdb_path", type=click.Path(resolve_path=True, writable=True, allow_dash=False)
)
def convert_sqlite_to_duckdb(sqlite_path, duckdb_path):
"""Convert an SQLite database to DuckDB format.
Args:
sqlite_path (str): Path to the existing SQLite database file.
duckdb_path (str): Path to the new DuckDB database file (should not exist).
Example:
python sqlite_to_duckdb.py sqlite.db duckdb.db
"""
sqlite_path = Path(sqlite_path)
duckdb_path = Path(duckdb_path)

# Check if DuckDB file already exists
if duckdb_path.exists():
click.echo(
f"Error: DuckDB file '{duckdb_path}' already exists. Please provide a new filename."
)
return

# Connect to DuckDB database
duckdb_conn = duckdb.connect(database=str(duckdb_path))
duckdb_cursor = duckdb_conn.cursor()

# Fetch table names from SQLite database using DuckDB
duckdb_cursor.execute(f"ATTACH DATABASE '{sqlite_path}' AS sqlite_db;")
duckdb_cursor.execute("SELECT name FROM main.sqlite_master WHERE type='table';")
table_names = [row[0] for row in duckdb_cursor.fetchall()]

# Copy tables from SQLite to DuckDB
for table_name in table_names:
logger.info(f"Working on table: {table_name}")
# Fetch column names and types from SQLite table using DuckDB
duckdb_cursor.execute(f"PRAGMA table_info(sqlite_db.{table_name});")
columns_info = duckdb_cursor.fetchall()
column_definitions = ", ".join([f"{col[1]} {col[2]}" for col in columns_info])

# Create equivalent table in DuckDB
duckdb_cursor.execute(f"CREATE TABLE {table_name} ({column_definitions});")

# Copy data from SQLite to DuckDB using DuckDB
duckdb_cursor.execute(
f"INSERT INTO {table_name} SELECT * FROM sqlite_db.{table_name};" # noqa: S608
)

# Commit and close connections
duckdb_conn.commit()
duckdb_conn.close()


if __name__ == "__main__":
convert_sqlite_to_duckdb()
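A quick way to spot-check the result (a sketch — the pudl.duckdb path is hypothetical, produced by a run like python sqlite_to_duckdb.py pudl.sqlite pudl.duckdb):

# Verify that the converted DuckDB file contains the expected tables.
import duckdb

conn = duckdb.connect("pudl.duckdb")  # hypothetical output of the script above
tables = [row[0] for row in conn.execute("SHOW TABLES;").fetchall()]
print(f"Copied {len(tables)} tables, e.g.: {tables[:5]}")
conn.close()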
2 changes: 1 addition & 1 deletion docker/Dockerfile
@@ -1,4 +1,4 @@
-FROM mambaorg/micromamba:1.5.1
+FROM mambaorg/micromamba:1.5.3

 USER root

36 changes: 0 additions & 36 deletions docker/docker-compose.yml

This file was deleted.

18 changes: 12 additions & 6 deletions docker/gcp_pudl_etl.sh
@@ -95,17 +95,23 @@ ETL_SUCCESS=${PIPESTATUS[0]}

 # if pipeline is successful, distribute + publish datasette
 if [[ $ETL_SUCCESS == 0 ]]; then
-    # Dump outputs to s3 bucket if branch is dev or build was triggered by a tag
-    if [ $GITHUB_ACTION_TRIGGER = "push" ] || [ $GITHUB_REF = "dev" ]; then
-        copy_outputs_to_distribution_bucket
-        ETL_SUCCESS=${PIPESTATUS[0]}
-    fi
-
     # Deploy the updated data to datasette
     if [ $GITHUB_REF = "dev" ]; then
         python ~/devtools/datasette/publish.py 2>&1 | tee -a $LOGFILE
         ETL_SUCCESS=${PIPESTATUS[0]}
     fi
+
+    # Compress the SQLite DBs for easier distribution
+    # Remove redundant multi-file EPA CEMS outputs prior to distribution
+    gzip --verbose $PUDL_OUTPUT/*.sqlite && \
+    rm -rf $PUDL_OUTPUT/hourly_emissions_epacems/
+    ETL_SUCCESS=${PIPESTATUS[0]}
+
+    # Dump outputs to s3 bucket if branch is dev or build was triggered by a tag
+    if [ $GITHUB_ACTION_TRIGGER = "push" ] || [ $GITHUB_REF = "dev" ]; then
+        copy_outputs_to_distribution_bucket
+        ETL_SUCCESS=${PIPESTATUS[0]}
+    fi
 fi

 # Notify slack about entire pipeline's success or failure;
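For local experimentation, a rough Python equivalent of the new compress-and-prune step might look like this (a sketch only — the build uses the shell commands above, and reading PUDL_OUTPUT from the environment is an assumption):

# Sketch: gzip every SQLite DB in the output dir, then drop the
# multi-file EPA CEMS directory, mirroring the shell step above.
import gzip
import os
import shutil
from pathlib import Path

pudl_output = Path(os.environ["PUDL_OUTPUT"])  # assumes the env var is set

for db in pudl_output.glob("*.sqlite"):
    with db.open("rb") as src, gzip.open(f"{db}.gz", "wb") as dst:
        shutil.copyfileobj(src, dst)
    db.unlink()  # gzip(1) removes the original; match that behavior

shutil.rmtree(pudl_output / "hourly_emissions_epacems", ignore_errors=True)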
33 changes: 0 additions & 33 deletions docker/local_pudl_etl.sh

This file was deleted.

33 changes: 22 additions & 11 deletions docs/data_access.rst
@@ -82,44 +82,55 @@ version of Datasette (see above). These nightly build outputs can be accessed us
 AWS CLI, or programmatically via the S3 API. They can also be downloaded directly over
 HTTPS using the following links:

-* `PUDL SQLite DB <https://s3.us-west-2.amazonaws.com/pudl.catalyst.coop/dev/pudl.sqlite>`__
+* `PUDL SQLite DB <https://s3.us-west-2.amazonaws.com/pudl.catalyst.coop/dev/pudl.sqlite.gz>`__
 * `EPA CEMS Hourly Emissions Parquet (1995-2022) <https://s3.us-west-2.amazonaws.com/pudl.catalyst.coop/dev/hourly_emissions_epacems.parquet>`__
-* `Census DP1 SQLite DB (2010) <https://s3.us-west-2.amazonaws.com/pudl.catalyst.coop/dev/censusdp1tract.sqlite>`__
+* `Census DP1 SQLite DB (2010) <https://s3.us-west-2.amazonaws.com/pudl.catalyst.coop/dev/censusdp1tract.sqlite.gz>`__

 * Raw FERC Form 1:

-  * `FERC-1 SQLite derived from DBF (1994-2020) <https://s3.us-west-2.amazonaws.com/pudl.catalyst.coop/dev/ferc1.sqlite>`__
-  * `FERC-1 SQLite derived from XBRL (2021-2022) <https://s3.us-west-2.amazonaws.com/pudl.catalyst.coop/dev/ferc1_xbrl.sqlite>`__
+  * `FERC-1 SQLite derived from DBF (1994-2020) <https://s3.us-west-2.amazonaws.com/pudl.catalyst.coop/dev/ferc1.sqlite.gz>`__
+  * `FERC-1 SQLite derived from XBRL (2021-2022) <https://s3.us-west-2.amazonaws.com/pudl.catalyst.coop/dev/ferc1_xbrl.sqlite.gz>`__
   * `FERC-1 Datapackage (JSON) describing SQLite derived from XBRL <https://s3.us-west-2.amazonaws.com/pudl.catalyst.coop/dev/ferc1_xbrl_datapackage.json>`__
   * `FERC-1 XBRL Taxonomy Metadata as JSON (2021-2022) <https://s3.us-west-2.amazonaws.com/pudl.catalyst.coop/dev/ferc1_xbrl_taxonomy_metadata.json>`__

 * Raw FERC Form 2:

-  * `FERC-2 SQLite derived from DBF (1996-2020) <https://s3.us-west-2.amazonaws.com/pudl.catalyst.coop/dev/ferc2.sqlite>`__
-  * `FERC-2 SQLite derived from XBRL (2021-2022) <https://s3.us-west-2.amazonaws.com/pudl.catalyst.coop/dev/ferc2_xbrl.sqlite>`__
+  * `FERC-2 SQLite derived from DBF (1996-2020) <https://s3.us-west-2.amazonaws.com/pudl.catalyst.coop/dev/ferc2.sqlite.gz>`__
+  * `FERC-2 SQLite derived from XBRL (2021-2022) <https://s3.us-west-2.amazonaws.com/pudl.catalyst.coop/dev/ferc2_xbrl.sqlite.gz>`__
   * `FERC-2 Datapackage (JSON) describing SQLite derived from XBRL <https://s3.us-west-2.amazonaws.com/pudl.catalyst.coop/dev/ferc2_xbrl_datapackage.json>`__
   * `FERC-2 XBRL Taxonomy Metadata as JSON (2021-2022) <https://s3.us-west-2.amazonaws.com/pudl.catalyst.coop/dev/ferc2_xbrl_taxonomy_metadata.json>`__

 * Raw FERC Form 6:

-  * `FERC-6 SQLite derived from DBF (2000-2020) <https://s3.us-west-2.amazonaws.com/pudl.catalyst.coop/dev/ferc6.sqlite>`__
-  * `FERC-6 SQLite derived from XBRL (2021-2022) <https://s3.us-west-2.amazonaws.com/pudl.catalyst.coop/dev/ferc6_xbrl.sqlite>`__
+  * `FERC-6 SQLite derived from DBF (2000-2020) <https://s3.us-west-2.amazonaws.com/pudl.catalyst.coop/dev/ferc6.sqlite.gz>`__
+  * `FERC-6 SQLite derived from XBRL (2021-2022) <https://s3.us-west-2.amazonaws.com/pudl.catalyst.coop/dev/ferc6_xbrl.sqlite.gz>`__
   * `FERC-6 Datapackage (JSON) describing SQLite derived from XBRL <https://s3.us-west-2.amazonaws.com/pudl.catalyst.coop/dev/ferc6_xbrl_datapackage.json>`__
   * `FERC-6 XBRL Taxonomy Metadata as JSON (2021-2022) <https://s3.us-west-2.amazonaws.com/pudl.catalyst.coop/dev/ferc6_xbrl_taxonomy_metadata.json>`__

 * Raw FERC Form 60:

-  * `FERC-60 SQLite derived from DBF (2006-2020) <https://s3.us-west-2.amazonaws.com/pudl.catalyst.coop/dev/ferc60.sqlite>`__
-  * `FERC-60 SQLite derived from XBRL (2021-2022) <https://s3.us-west-2.amazonaws.com/pudl.catalyst.coop/dev/ferc60_xbrl.sqlite>`__
+  * `FERC-60 SQLite derived from DBF (2006-2020) <https://s3.us-west-2.amazonaws.com/pudl.catalyst.coop/dev/ferc60.sqlite.gz>`__
+  * `FERC-60 SQLite derived from XBRL (2021-2022) <https://s3.us-west-2.amazonaws.com/pudl.catalyst.coop/dev/ferc60_xbrl.sqlite.gz>`__
   * `FERC-60 Datapackage (JSON) describing SQLite derived from XBRL <https://s3.us-west-2.amazonaws.com/pudl.catalyst.coop/dev/ferc60_xbrl_datapackage.json>`__
   * `FERC-60 XBRL Taxonomy Metadata as JSON (2021) <https://s3.us-west-2.amazonaws.com/pudl.catalyst.coop/dev/ferc60_xbrl_taxonomy_metadata.json>`__

 * Raw FERC Form 714:

-  * `FERC-714 SQLite derived from XBRL (2021-2022) <https://s3.us-west-2.amazonaws.com/pudl.catalyst.coop/dev/ferc714_xbrl.sqlite>`__
+  * `FERC-714 SQLite derived from XBRL (2021-2022) <https://s3.us-west-2.amazonaws.com/pudl.catalyst.coop/dev/ferc714_xbrl.sqlite.gz>`__
   * `FERC-714 Datapackage (JSON) describing SQLite derived from XBRL <https://s3.us-west-2.amazonaws.com/pudl.catalyst.coop/dev/ferc714_xbrl_datapackage.json>`__
   * `FERC-714 XBRL Taxonomy Metadata as JSON (2021-2022) <https://s3.us-west-2.amazonaws.com/pudl.catalyst.coop/dev/ferc714_xbrl_taxonomy_metadata.json>`__

+.. note::
+
+   To reduce network transfer times, we ``gzip`` the SQLite database files, which can
+   be quite large when uncompressed. To decompress them locally, you can use the
+   ``gunzip`` command.
+
+   .. code-block:: console
+
+      $ gunzip *.sqlite.gz
+
 .. _access-zenodo:
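The docs above mention programmatic access via the S3 API; a minimal sketch of fetching and decompressing one of these files with the third-party boto3 package (anonymous, unsigned access to the public bucket is an assumption here):

# Sketch: download the gzipped PUDL DB from the public S3 bucket and
# decompress it locally. Assumes boto3 is installed and the bucket
# allows anonymous (unsigned) reads.
import gzip
import shutil

import boto3
from botocore import UNSIGNED
from botocore.config import Config

s3 = boto3.client("s3", config=Config(signature_version=UNSIGNED))
s3.download_file("pudl.catalyst.coop", "dev/pudl.sqlite.gz", "pudl.sqlite.gz")

# Equivalent of `gunzip pudl.sqlite.gz`, but keeps the .gz file around.
with gzip.open("pudl.sqlite.gz", "rb") as src, open("pudl.sqlite", "wb") as dst:
    shutil.copyfileobj(src, dst)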