Skip to content

Commit

Permalink
Merge branch 'sec10k-schema-fixes' into update-bulk-elec-data
Browse files Browse the repository at this point in the history
  • Loading branch information
e-belfer authored Feb 6, 2025
2 parents df8c7aa + a1e0d28 commit 5396397
Show file tree
Hide file tree
Showing 7 changed files with 500 additions and 220 deletions.
553 changes: 341 additions & 212 deletions docs/catalyst_cites.bib

Large diffs are not rendered by default.

12 changes: 12 additions & 0 deletions docs/catalyst_pubs.bib
Original file line number Diff line number Diff line change
@@ -1,3 +1,15 @@
@INPROCEEDINGS{10741747,
author={Lamb, Katherine and Belfer, Ella and Selvans, Zane and Norman, Bennett and Gosnell, Christina and Xia, Dazhong and Sharpe, Austen and Schira, Zach},
booktitle={2024 56th North American Power Symposium (NAPS)},
title={The Public Utility Data Liberation Project: Providing Open Data For a Clean Energy Transition},
year={2024},
volume={},
number={},
pages={1-6},
keywords={Green energy;Publishing;Catalysts;Ecosystems;Buildings;Electricity supply industry;Sustainable development;North America;Open data;open source;open data;EIA;FERC;US energy system;data pipeline;outreach},
doi={10.1109/NAPS61145.2024.10741747}}


@misc{PudlSoftware,
author = {Selvans, Zane and
Gosnell, Christina and
Expand Down
31 changes: 30 additions & 1 deletion docs/release_notes.rst
Original file line number Diff line number Diff line change
Expand Up @@ -3,9 +3,38 @@ PUDL Release Notes
=======================================================================================

---------------------------------------------------------------------------------------
v2024.XX.x (2024-MM-DD)
v2025.XX.x (2025-MM-DD)
---------------------------------------------------------------------------------------

New Data
^^^^^^^^

SEC Form 10-K Parent-Subsidiary Ownership
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

* We have added some new tables describing the parent-subsidiary company ownership
relationships reported in the
`SEC's Form 10-K <https://en.wikipedia.org/wiki/Form_10-K>`__, Exhibit 21
"Subsidiaries of the Registrant". Where possible these tables link the SEC filers or
their subsidiary companies to the corresponding EIA utilities. This work was funded
by
`a grant from the Mozilla Foundation <https://catalyst.coop/2024/02/15/beating-utility-ownership-shell-game/>`__.
Most of the ML models and data preparation took place in the `mozilla-sec-eia
repository <https://github.com/catalyst-cooperative/mozilla-sec-eia>`__ separate from
the main PUDL ETL, as it requires processing hundreds of thousands of PDFs and the
deployment of some ML experiment tracking infrastructure. The new tables are handed
off as nearly finished products to the PUDL ETL pipeline. **Note that these are
preliminary, experimental data products and are known to be incomplete and to contain
errors.** Extracting data tables from unstructured PDFs and the SEC to EIA record
linkage are necessarily probabilistic processes.
* See PRs :pr:`4026,4031,4035,4046,4048,4050` and check out the table descriptions in
the PUDL data dictionary:

* :ref:`out_sec10k__parents_and_subsidiaries`
* :ref:`core_sec10k__quarterly_filings`
* :ref:`core_sec10k__quarterly_exhibit_21_company_ownership`
* :ref:`core_sec10k__quarterly_company_information`

New Data Coverage
^^^^^^^^^^^^^^^^^

Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,94 @@
"""Rename SEC 10K tables to reflect temporal granularity.
Revision ID: 1e2ec7bf2b64
Revises: ac67e04d1383
Create Date: 2025-02-05 10:52:45.161681
"""
from alembic import op
import sqlalchemy as sa


# revision identifiers, used by Alembic.
# ID of this migration; same value as the "Revision ID" line in the module docstring.
revision = '1e2ec7bf2b64'
# ID of the migration this one revises; same value as the "Revises" line in the
# module docstring.
down_revision = 'ac67e04d1383'
# No branch labels are declared for this revision.
branch_labels = None
# No dependencies on other revisions are declared.
depends_on = None


def upgrade() -> None:
    """Replace the SEC 10-K core tables with ``quarterly``-named equivalents.

    The operations run in this order:

    1. Create ``core_sec10k__quarterly_company_information``,
       ``core_sec10k__quarterly_exhibit_21_company_ownership`` and
       ``core_sec10k__quarterly_filings``.
    2. Drop ``core_sec10k__company_information``, ``core_sec10k__filings`` and
       ``core_sec10k__exhibit_21_company_ownership``.
    3. On ``out_sec10k__parents_and_subsidiaries``, drop the existing
       ``..._core_eia860__scd_utilities`` foreign key and add one on
       ``utility_id_eia`` referencing ``core_eia__entity_utilities``.

    Note that nothing here copies rows from the dropped tables into the newly
    created ones.
    """
    # Column comments that are shared verbatim by all three new tables.
    filename_doc = "Name of filing as provided by SEC data portal."
    report_date_doc = "Date reported."

    # Every column except report_date participates in this table's primary key.
    company_info_key = (
        "filename_sec10k",
        "filer_count",
        "company_information_block",
        "company_information_block_count",
        "company_information_fact_name",
        "company_information_fact_value",
    )
    op.create_table(
        "core_sec10k__quarterly_company_information",
        sa.Column("filename_sec10k", sa.Text(), nullable=False, comment=filename_doc),
        sa.Column(
            "filer_count",
            sa.Integer(),
            nullable=False,
            comment="Index company information as some filings contain information for multiple companies.",
        ),
        sa.Column(
            "company_information_block",
            sa.Text(),
            nullable=False,
            comment="Title of block of data.",
        ),
        sa.Column(
            "company_information_block_count",
            sa.Integer(),
            nullable=False,
            comment="Some blocks are repeated, this defines the index of the data block.",
        ),
        sa.Column(
            "company_information_fact_name",
            sa.Text(),
            nullable=False,
            comment="Name of fact within a ``company_information_block``.",
        ),
        sa.Column(
            "company_information_fact_value",
            sa.Text(),
            nullable=False,
            comment="Value corresponding with ``company_information_fact_name``.",
        ),
        sa.Column("report_date", sa.Date(), nullable=True, comment=report_date_doc),
        sa.PrimaryKeyConstraint(
            *company_info_key,
            name=op.f("pk_core_sec10k__quarterly_company_information"),
        ),
    )

    # The Exhibit 21 ownership table is created without a primary key and with
    # every column nullable.
    op.create_table(
        "core_sec10k__quarterly_exhibit_21_company_ownership",
        sa.Column("filename_sec10k", sa.Text(), nullable=True, comment=filename_doc),
        sa.Column(
            "subsidiary_company_name",
            sa.Text(),
            nullable=True,
            comment="Name of subsidiary company.",
        ),
        sa.Column(
            "subsidiary_company_location",
            sa.Text(),
            nullable=True,
            comment="Location of subsidiary company.",
        ),
        sa.Column(
            "fraction_owned",
            sa.Float(),
            nullable=True,
            comment="Proportion of generator ownership attributable to this utility.",
        ),
        sa.Column("report_date", sa.Date(), nullable=True, comment=report_date_doc),
    )

    # Filing metadata: one row per filename_sec10k.
    op.create_table(
        "core_sec10k__quarterly_filings",
        sa.Column("filename_sec10k", sa.Text(), nullable=False, comment=filename_doc),
        sa.Column(
            "central_index_key",
            sa.Text(),
            nullable=True,
            comment="Identifier of the company in SEC database.",
        ),
        sa.Column(
            "company_name",
            sa.Text(),
            nullable=True,
            comment="Name of company submitting SEC 10k filing.",
        ),
        sa.Column(
            "sec10k_version",
            sa.Text(),
            nullable=True,
            comment="Specific version of SEC 10k filed.",
        ),
        sa.Column(
            "filing_date",
            sa.Date(),
            nullable=True,
            comment="Date filing was submitted.",
        ),
        sa.Column(
            "exhibit_21_version",
            sa.Text(),
            nullable=True,
            comment="Version of exhibit 21 submitted (if applicable).",
        ),
        sa.Column("report_date", sa.Date(), nullable=True, comment=report_date_doc),
        sa.PrimaryKeyConstraint(
            "filename_sec10k",
            name=op.f("pk_core_sec10k__quarterly_filings"),
        ),
    )

    # Remove the tables that the quarterly-named ones supersede.
    superseded_tables = (
        "core_sec10k__company_information",
        "core_sec10k__filings",
        "core_sec10k__exhibit_21_company_ownership",
    )
    for table_name in superseded_tables:
        op.drop_table(table_name)

    # Swap the utility foreign key on the output table.
    old_fk_name = "fk_out_sec10k__parents_and_subsidiaries_utility_id_eia_core_eia860__scd_utilities"
    new_fk_name = "fk_out_sec10k__parents_and_subsidiaries_utility_id_eia_core_eia__entity_utilities"
    with op.batch_alter_table(
        "out_sec10k__parents_and_subsidiaries", schema=None
    ) as parents_subs:
        parents_subs.drop_constraint(old_fk_name, type_="foreignkey")
        parents_subs.create_foreign_key(
            parents_subs.f(new_fk_name),
            "core_eia__entity_utilities",
            ["utility_id_eia"],
            ["utility_id_eia"],
        )


def downgrade() -> None:
    """Restore the pre-rename SEC 10-K core tables and the original foreign key.

    The operations run in this order:

    1. On ``out_sec10k__parents_and_subsidiaries``, drop the foreign key to
       ``core_eia__entity_utilities`` and add a composite foreign key on
       (``utility_id_eia``, ``report_date``) referencing
       ``core_eia860__scd_utilities``.
    2. Create ``core_sec10k__exhibit_21_company_ownership``,
       ``core_sec10k__filings`` and ``core_sec10k__company_information``
       (without column comments).
    3. Drop the three ``core_sec10k__quarterly_*`` tables.

    Note that nothing here copies rows from the dropped tables into the newly
    created ones.
    """
    entity_fk_name = "fk_out_sec10k__parents_and_subsidiaries_utility_id_eia_core_eia__entity_utilities"
    scd_fk_name = "fk_out_sec10k__parents_and_subsidiaries_utility_id_eia_core_eia860__scd_utilities"
    # The restored foreign key uses the same two columns on both sides.
    scd_fk_columns = ["utility_id_eia", "report_date"]
    with op.batch_alter_table(
        "out_sec10k__parents_and_subsidiaries", schema=None
    ) as parents_subs:
        parents_subs.drop_constraint(
            parents_subs.f(entity_fk_name), type_="foreignkey"
        )
        parents_subs.create_foreign_key(
            scd_fk_name,
            "core_eia860__scd_utilities",
            scd_fk_columns,
            list(scd_fk_columns),
        )

    # Exhibit 21 ownership table: no primary key, every column nullable.
    op.create_table(
        "core_sec10k__exhibit_21_company_ownership",
        sa.Column("filename_sec10k", sa.TEXT(), nullable=True),
        sa.Column("subsidiary_company_name", sa.TEXT(), nullable=True),
        sa.Column("fraction_owned", sa.FLOAT(), nullable=True),
        sa.Column("report_date", sa.DATE(), nullable=True),
        sa.Column("subsidiary_company_location", sa.TEXT(), nullable=True),
    )

    # Filing metadata: one row per filename_sec10k.
    op.create_table(
        "core_sec10k__filings",
        sa.Column("filename_sec10k", sa.TEXT(), nullable=False),
        sa.Column("central_index_key", sa.TEXT(), nullable=True),
        sa.Column("company_name", sa.TEXT(), nullable=True),
        sa.Column("sec10k_version", sa.TEXT(), nullable=True),
        sa.Column("exhibit_21_version", sa.TEXT(), nullable=True),
        sa.Column("report_date", sa.DATE(), nullable=True),
        sa.Column("filing_date", sa.DATE(), nullable=True),
        sa.PrimaryKeyConstraint("filename_sec10k", name="pk_core_sec10k__filings"),
    )

    # Every column except report_date participates in this table's primary key.
    company_info_key = (
        "filename_sec10k",
        "filer_count",
        "company_information_block",
        "company_information_block_count",
        "company_information_fact_name",
        "company_information_fact_value",
    )
    op.create_table(
        "core_sec10k__company_information",
        sa.Column("filename_sec10k", sa.TEXT(), nullable=False),
        sa.Column("filer_count", sa.INTEGER(), nullable=False),
        sa.Column("company_information_block", sa.TEXT(), nullable=False),
        sa.Column("company_information_block_count", sa.INTEGER(), nullable=False),
        sa.Column("company_information_fact_name", sa.TEXT(), nullable=False),
        sa.Column("company_information_fact_value", sa.TEXT(), nullable=False),
        sa.Column("report_date", sa.DATE(), nullable=True),
        sa.PrimaryKeyConstraint(
            *company_info_key,
            name="pk_core_sec10k__company_information",
        ),
    )

    # Remove the quarterly-named tables that upgrade() created.
    quarterly_tables = (
        "core_sec10k__quarterly_filings",
        "core_sec10k__quarterly_exhibit_21_company_ownership",
        "core_sec10k__quarterly_company_information",
    )
    for table_name in quarterly_tables:
        op.drop_table(table_name)
14 changes: 11 additions & 3 deletions src/pudl/analysis/pudl_models.py
Original file line number Diff line number Diff line change
Expand Up @@ -27,7 +27,7 @@ def _year_quarter_to_date(year_quarter: pd.Series) -> pd.Series:
io_manager_key="pudl_io_manager",
group_name="pudl_models",
)
def core_sec10k__company_information() -> pd.DataFrame:
def core_sec10k__quarterly_company_information() -> pd.DataFrame:
"""Basic company information extracted from SEC10k filings."""
df = _load_table_from_gcs("core_sec10k__company_information")
df = df.rename(
Expand All @@ -50,7 +50,7 @@ def core_sec10k__company_information() -> pd.DataFrame:
io_manager_key="pudl_io_manager",
group_name="pudl_models",
)
def core_sec10k__exhibit_21_company_ownership() -> pd.DataFrame:
def core_sec10k__quarterly_exhibit_21_company_ownership() -> pd.DataFrame:
"""Company ownership information extracted from sec10k exhibit 21 attachments."""
df = _load_table_from_gcs("core_sec10k__exhibit_21_company_ownership")
df = df.rename(
Expand All @@ -74,7 +74,7 @@ def core_sec10k__exhibit_21_company_ownership() -> pd.DataFrame:
io_manager_key="pudl_io_manager",
group_name="pudl_models",
)
def core_sec10k__filings() -> pd.DataFrame:
def core_sec10k__quarterly_filings() -> pd.DataFrame:
"""Metadata on all 10k filings submitted to SEC."""
df = _load_table_from_gcs("core_sec10k__filings")
df = df.rename(
Expand Down Expand Up @@ -120,5 +120,13 @@ def out_sec10k__parents_and_subsidiaries() -> pd.DataFrame:
"standard_industrial_classification"
].str.extract(r"(.+)\[(\d{4})\]")
df["industry_id_sic"] = df["industry_id_sic"].astype("string")
# Some utilities harvested from EIA 861 data don't show up in our entity tables.
# These didn't end up improving coverage, and so will be removed upstream.
# The hack for now is to just drop them so the FK constraint is respected.
# See https://github.com/catalyst-cooperative/pudl/issues/4050
bad_utility_ids = [
3579, # Cirro Group, Inc. in Texas
]
df = df[~df.utility_id_eia.isin(bad_utility_ids)]

return df
4 changes: 4 additions & 0 deletions src/pudl/metadata/resources/eia860.py
Original file line number Diff line number Diff line change
Expand Up @@ -500,6 +500,10 @@
"core_eia861__yearly_utility_data_misc",
"core_eia861__yearly_utility_data_nerc",
"core_eia861__yearly_utility_data_rto",
"core_sec10k__quarterly_filings",
"core_sec10k__quarterly_exhibit_21_company_ownership",
"core_sec10k__quarterly_company_information",
"out_sec10k__parents_and_subsidiaries",
],
},
},
Expand Down
12 changes: 8 additions & 4 deletions src/pudl/metadata/resources/sec10k.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@
from typing import Any

RESOURCE_METADATA: dict[str, dict[str, Any]] = {
"core_sec10k__filings": {
"core_sec10k__quarterly_filings": {
"description": "Metadata describing all submitted SEC 10k filings.",
"schema": {
"fields": [
Expand All @@ -23,7 +23,7 @@
"etl_group": "pudl_models",
"field_namespace": "sec10k",
},
"core_sec10k__exhibit_21_company_ownership": {
"core_sec10k__quarterly_exhibit_21_company_ownership": {
"description": "Company ownership data extracted from Exhibit 21 attachments to SEC 10k filings.",
"schema": {
"fields": [
Expand All @@ -38,7 +38,7 @@
"etl_group": "pudl_models",
"field_namespace": "sec10k",
},
"core_sec10k__company_information": {
"core_sec10k__quarterly_company_information": {
"description": "Company information extracted from SEC 10k filings.",
"schema": {
"fields": [
Expand All @@ -64,7 +64,11 @@
"field_namespace": "sec10k",
},
"out_sec10k__parents_and_subsidiaries": {
"description": "Denormalized table containing SEC 10k company information with mapping between subsidiary and parent companies, as well as a linkage to EIA companies.",
"description": (
"Denormalized table containing SEC 10-K company information with mapping "
"between subsidiary and parent companies, as well as a linkage to EIA "
"utilities."
),
"schema": {
"fields": [
"company_id_sec10k",
Expand Down

0 comments on commit 5396397

Please sign in to comment.