Skip to content

Commit

Permalink
Update AUS, IND, USA, GBR.
Browse files Browse the repository at this point in the history
Update database with partial equivalency to v1.3.0. Missing here are the
additional changes to the generation estimation (to be demonstrated in
an adjacent repo).

Major changes in this update include more hardcoding to match ids and
the odd intersection of official and unofficial data sources.

Include two new years of generation for some countries (2018 and 2019).

Database version: 1.2.4

Changes to be committed:
	modified:   build_databases/build_database_AUS.py
	modified:   build_databases/build_database_IND.py
	modified:   build_databases/build_database_USA.py
	modified:   build_databases/build_global_power_plant_database.py
	modified:   output_database/DATABASE_VERSION
	modified:   output_database/global_power_plant_database.csv
	modified:   output_database/global_power_plant_database_country_summary.csv
	modified:   powerplant_database.py
	new file:   raw_source_files/AUS/NGER_2017-2018.csv
	new file:   raw_source_files/AUS/australia_power_plants.geo.json
	new file:   raw_source_files/IND/database_15.zip
	new file:   raw_source_files/USA/2___Plant_Y2019.xlsx
	new file:   raw_source_files/USA/3_1_Generator_Y2019.xlsx
	new file:   raw_source_files/USA/EIA923_Schedules_2_3_4_5_M_12_2018_Final_Revision.xlsx
	new file:   raw_source_files/USA/EIA923_Schedules_2_3_4_5_M_12_2019_Final_Revision.xlsx
	modified:   raw_source_files/WRI/United Kingdom.csv
	new file:   resources/AUS/AUS_plant_dimension.csv
	modified:   resources/AUS/AUS_plants.csv
	modified:   resources/IND/CEA_plants.csv
	new file:   resources/wiki-solar-exclusion.csv
	modified:   source_databases/AUS-Database.bin
	modified:   source_databases/IND-Database.bin
	modified:   source_databases/USA-Database.bin
	modified:   source_databases/WRI-Database.bin
	modified:   source_databases_csv/database_AUS.csv
	modified:   source_databases_csv/database_IND.csv
	modified:   source_databases_csv/database_USA.csv
	modified:   source_databases_csv/database_WRI.csv
	modified:   utils/database_country_summary.py
  • Loading branch information
loganbyers committed Jan 26, 2022
1 parent 232a666 commit 1cf825b
Show file tree
Hide file tree
Showing 29 changed files with 2,078,702 additions and 1,442,464 deletions.
239 changes: 132 additions & 107 deletions build_databases/build_database_AUS.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,7 @@
"""

import xml.etree.ElementTree as ET
import json
import sys, os
import csv

Expand All @@ -20,11 +21,15 @@
# Country and data-source metadata recorded on every AUS plant entry.
COUNTRY_NAME = u"Australia"
SAVE_CODE = u"AUS"
SOURCE_NAME = u"Australian Renewable Energy Mapping Infrastructure"
# NOTE(review): two consecutive SOURCE_URL assignments (looks like old-vs-new
# diff residue); only the second assignment takes effect at runtime.
SOURCE_URL = u"http://services.ga.gov.au/site_3/rest/services/Electricity_Infrastructure/MapServer"
SOURCE_URL = u"https://www.nationalmap.gov.au/"
# Attribution string used for generation figures taken from the NGER data.
GENERATION_SOURCE = u"Australia Clean Energy Regulator"

# NGER (National Greenhouse and Energy Reporting) facility CSVs, one pair of
# (download URL, local raw-file path) per fiscal year.
NGER_URL_1718 = u"http://www.cleanenergyregulator.gov.au/DocumentAssets/Documents/Greenhouse%20and%20energy%20information%20for%20designated%20generation%20facilities%202017-18.csv"
NGER_FILENAME_1718 = pw.make_file_path(fileType="raw", subFolder=SAVE_CODE, filename="NGER_2017-2018.csv")

NGER_URL_1617 = u"http://www.cleanenergyregulator.gov.au/DocumentAssets/Documents/Greenhouse%20and%20energy%20information%20for%20designated%20generation%20facilities%202016-17.csv"
NGER_FILENAME_1617 = pw.make_file_path(fileType="raw", subFolder=SAVE_CODE, filename="NGER_2016-2017.csv")

NGER_URL_1516 = u"http://www.cleanenergyregulator.gov.au/DocumentAssets/Documents/Greenhouse%20and%20energy%20information%20for%20designated%20generation%20facilities%202015-16.csv"
NGER_FILENAME_1516 = pw.make_file_path(fileType="raw", subFolder=SAVE_CODE, filename="NGER_2015-2016.csv")

Expand All @@ -37,18 +42,20 @@
NGER_URL_1213 = u"http://www.cleanenergyregulator.gov.au/DocumentAssets/Documents/2012-13%20Greenhouse%20and%20energy%20information%20for%20designated%20generation%20facilities.csv"
NGER_FILENAME_1213 = pw.make_file_path(fileType="raw", subFolder=SAVE_CODE, filename="NGER_2012-2013.csv")

# NOTE(review): two consecutive RAW_FILE_NAME assignments (old XML target vs.
# new GeoJSON target — diff residue); the .geo.json path takes effect.
RAW_FILE_NAME = pw.make_file_path(fileType="raw", subFolder=SAVE_CODE, filename="australia_power_plants.xml")
RAW_FILE_NAME = pw.make_file_path(fileType="raw", subFolder=SAVE_CODE, filename="australia_power_plants.geo.json")
# Output CSV and binary locations for the AUS source database.
CSV_FILE_NAME = pw.make_file_path(fileType="src_csv", filename="database_AUS.csv")
SAVE_DIRECTORY = pw.make_file_path(fileType="src_bin")
# Static resources: permanent-ID table and the AREMI-objectid match table.
STATIC_ID_FILENAME = pw.make_file_path(fileType="resource", subFolder='AUS', filename="AUS_plants.csv")
STATIC_MATCH_FILENAME = pw.make_file_path(fileType="resource", subFolder='AUS', filename="AUS_plant_dimension.csv")

# other parameters
# NOTE(review): duplicated API_BASE/API_CALL pairs (old WFS endpoint vs. new
# ArcGIS REST query endpoint — diff residue); the later assignments win.
API_BASE = "http://services.ga.gov.au/site_3/services/Electricity_Infrastructure/MapServer/WFSServer"
API_CALL = "service=WFS&version=1.1.0&request=GetFeature&typeName=National_Major_Power_Stations"
API_BASE = "https://services.ga.gov.au/gis/rest/services/Foundation_Electricity_Infrastructure/MapServer/0/query"
API_CALL = "geometry=-180%2C-90%2C180%2C90&geometryType=esriGeometryEnvelope&inSR=EPSG%3A4326&spatialRel=esriSpatialRelIntersects&outFields=*&returnGeometry=true=&f=geojson"

# optional raw file(s) download
URL = API_BASE + "?" + API_CALL
FILES = {RAW_FILE_NAME: URL,
NGER_FILENAME_1718: NGER_URL_1718,
NGER_FILENAME_1617: NGER_URL_1617,
NGER_FILENAME_1516: NGER_URL_1516,
NGER_FILENAME_1415: NGER_URL_1415,
Expand All @@ -64,7 +71,17 @@
country_thesaurus = pw.make_country_names_thesaurus()

# get permanent IDs for australian plants
# NOTE(review): linking_table (keyed by AREMI 'aremi_oid') and
# generation_linking_table (keyed by 'gppd_idnr') are both built from the
# same static-ID CSV; id_linking_table is keyed by integer 'objectid' from
# the separate plant-dimension CSV. The duplicate reads of STATIC_ID_FILENAME
# appear to be old-vs-new diff residue — confirm which tables are current.
linking_table = {k['aremi_oid']: k for k in csv.DictReader(open(STATIC_ID_FILENAME))}
generation_linking_table = {k['gppd_idnr']: k for k in csv.DictReader(open(STATIC_ID_FILENAME))}

id_linking_table = {int(k['objectid']): k for k in csv.DictReader(open(STATIC_MATCH_FILENAME)) if k['objectid']}

# Hardcoded fuel overrides, applied after fuel standardization, for plants
# whose upstream fuel attribute is known to be wrong.
fuel_type_assurance = {
# gppd_idnr: primary_fuel
'AUS0000619': 'Solar',
'AUS0000526': 'Solar',
'AUS0000581': 'Solar',
'AUS0000620': 'Wind'
}

# create dictionary for power plant objects
plants_dictionary = {}
Expand All @@ -74,115 +91,123 @@
print(u"Reading NGER files to memory...")

# read NGER file into a list, so the facilities can be referenced by their index in the original file
# One list per NGER fiscal year (2012-13 through 2017-18).
nger_1718 = list(csv.DictReader(open(NGER_FILENAME_1718)))
nger_1617 = list(csv.DictReader(open(NGER_FILENAME_1617)))
nger_1516 = list(csv.DictReader(open(NGER_FILENAME_1516)))
nger_1415 = list(csv.DictReader(open(NGER_FILENAME_1415)))
nger_1314 = list(csv.DictReader(open(NGER_FILENAME_1314)))
nger_1213 = list(csv.DictReader(open(NGER_FILENAME_1213)))

# ---------------------------------------------------------------------------
# NOTE(review): legacy XML/WFS-based parser. A GeoJSON-based block later in
# this file performs the same role; this appears to be the removed side of a
# diff (indentation was also lost in this rendering) — confirm which parser
# is the current one before editing.
# ---------------------------------------------------------------------------
# create a dictionary of namespaces
ns = {"gml": "http://www.opengis.net/gml",
"Electricity_Infrastructure": "WFS"}

# read data from XML file and parse
count = 1
with open(RAW_FILE_NAME, "rU") as f:
tree = ET.parse(f)
root = tree.getroot()
for station in tree.findall("gml:featureMember", ns):
plant = station.find("Electricity_Infrastructure:National_Major_Power_Stations", ns)
name = pw.format_string(plant.find("Electricity_Infrastructure:NAME", ns).text)

# get object id from AREMI (variable through time)
plant_oid = plant.find("Electricity_Infrastructure:OBJECTID", ns).text
# check if plant is already known, and skip if there is not a record (includes cases where AREMI has duplicated plants)
if plant_oid not in linking_table:
print(u"Error: Don't have prescribed ID for plant {0}; OID={1}.".format(name, plant_oid))
continue
# get the assigned GPPD IDNR as an int, stripping the 'AUS' prefix
plant_id = int(linking_table[plant_oid]['gppd_idnr'][3:])

try:
owner = pw.format_string(plant.find("Electricity_Infrastructure:OWNER", ns).text)
except:
owner = pw.NO_DATA_UNICODE
primary_fuel = pw.standardize_fuel(plant.find("Electricity_Infrastructure:PRIMARYFUELTYPE", ns).text, fuel_thesaurus)
try:
capacity = plant.find("Electricity_Infrastructure:GENERATIONMW", ns).text
capacity = float(capacity)
except:
print(u"Error: Can't read capacity for plant {0}.".format(name))
capacity = pw.NO_DATA_NUMERIC
# gml:pos holds "longitude latitude" separated by a space
coords = plant.find("Electricity_Infrastructure:SHAPE/gml:Point/gml:pos", ns).text.split(" ")
try:
longitude = float(coords[0])
latitude = float(coords[1])
geolocation_source = SOURCE_NAME
except:
longitude, latitude = pw.NO_DATA_NUMERIC, pw.NO_DATA_NUMERIC
geolocation_source = pw.NO_DATA_UNICODE

# # Additional information for future interest
# operational_status = plant.find('Electricity_Infrastructure:OPERATIONALSTATUS', ns).text)
# technology = plant.find('Electricity_Infrastructure:GENERATIONTYPE', ns).text)
# try:
# subfuel = plant.find('Electricity_Infrastructure:PRIMARYSUBFUELTYPE', ns).text
# except:
# subfuel = fuel

# date_updated format after split: YYYY-MM-DD
try:
year_updated = int(plant.find("Electricity_Infrastructure:REVISED", ns).text.split("T")[0][0:4])
except:
year_updated = pw.NO_DATA_NUMERIC

# get generation data (if any) from the NGER datasets
generation = []
for yr, lookup in zip(
range(2013, 2018),
[nger_1213, nger_1314, nger_1415, nger_1516, nger_1617]
):
index_title = 'nger_{0}-{1}_index'.format(yr-1, yr)
# get the raw form of the nger indices field
nger_indices_raw = linking_table[plant_oid][index_title]
# if blank, continue to next year
if not nger_indices_raw.rstrip():
continue
# get ampersand-separated list of nger indices
nger_indices = nger_indices_raw.split('&')
# convert to real integers usable for list indexing
nger_indices = map(int, nger_indices)
gwh = 0
for idx in nger_indices:
try:
nger_row = lookup[idx]
except:
print("Error with looking up NGER row for {0} (year = {1}; NGER index = {2};)".format(name, yr, idx))
continue
gen_gj = nger_row['Electricity Production (GJ)']
# NGER reports energy in GJ; convert to GWh (1 GWh = 3600 GJ)
try:
gen_gwh = float(gen_gj.replace(",", "")) / 3600.
except:
print("Error with NGER generation for {0} (year = {1}; NGER index = {2}; value={3})".format(name, yr, idx, gen_gj))
pass
else:
gwh += gen_gwh
# TODO: give proper time bounds
generation.append(pw.PlantGenerationObject.create(gwh, yr, source=GENERATION_SOURCE))


# assign ID number
idnr = pw.make_id(SAVE_CODE, plant_id)
new_location = pw.LocationObject(pw.NO_DATA_UNICODE, latitude, longitude)
new_plant = pw.PowerPlant(plant_idnr=idnr, plant_name=name, plant_owner=owner,
plant_country=COUNTRY_NAME,
plant_location=new_location, plant_coord_source=geolocation_source,
plant_primary_fuel=primary_fuel, plant_capacity=capacity,
plant_generation=generation,
plant_source=SOURCE_NAME, plant_cap_year=year_updated,
plant_source_url=SOURCE_URL)
plants_dictionary[idnr] = new_plant
count += 1
# ---------------------------------------------------------------------------
# Parse the AREMI GeoJSON export and build pw.PowerPlant objects.
# For each feature: resolve a permanent GPPD identifier via id_linking_table,
# skip unmatched or non-operational plants, apply hardcoded fuel overrides
# from fuel_type_assurance, and attach NGER generation for fiscal years
# 2012-13 through 2017-18 (recorded under the calendar year the fiscal year
# ends in). Plants accumulate in plants_dictionary keyed by GPPD id.
# Fixes in this revision: corrected garbled warning strings
# ("possible exlucuded" -> "possibly excluded",
#  "unoperational" -> "not operational").
# ---------------------------------------------------------------------------
count = 0
with open(RAW_FILE_NAME, "rU") as fin:
    geojson = json.load(fin)


for plant in geojson['features']:
    plant_properties = plant['properties']
    name_original = pw.format_string(plant_properties['name'])
    plant_oid = plant_properties['objectid']

    # check if plant is already known, and skip if there is not a record
    # (includes cases where AREMI has duplicated plants)
    if plant_oid not in id_linking_table:
        print(u"Error: Don't have prescribed ID for plant {0}; OID={1}.".format(name_original, plant_oid))
        continue

    # get the assigned GPPD identifier; an empty value means the plant was
    # left out of the match table (possibly excluded deliberately)
    plant_idnr = id_linking_table[plant_oid]['gppd_idnr_assigned']
    if not plant_idnr:
        print(u"Warning: plant {0}; OID={1} will not be added, ID not found (possibly excluded on purpose).".format(name_original, plant_oid))
        continue

    # only keep plants AREMI marks as currently operational
    operational_status = plant_properties['operational_status']
    if operational_status != 'Operational':
        print(u"Warning: plant {0}; OID={1} will not be added, considered not operational: {2}".format(name_original, plant_oid, operational_status))
        continue

    # override name with the curated value from the match table
    name_enforced = id_linking_table[plant_oid]['name_enforced']

    try:
        owner = pw.format_string(plant_properties['owner'])
    except:
        owner = pw.NO_DATA_UNICODE

    try:
        primary_fuel = pw.standardize_fuel(plant_properties['primaryfueltype'], fuel_thesaurus)
    except:
        print(u"Error: Can't understand fuel {0} for plant {1}.".format(plant_properties['primaryfueltype'], name_original))
        primary_fuel = pw.NO_DATA_UNICODE

    # hardcoded fuel corrections take precedence over the AREMI attribute
    if plant_idnr in fuel_type_assurance:
        print(u"Warning: overriding fuel for plant {0}.".format(name_original))
        primary_fuel = pw.standardize_fuel(fuel_type_assurance[plant_idnr], fuel_thesaurus)

    try:
        capacity = plant_properties['generationmw']
        capacity = float(capacity)
    except:
        print(u"Error: Can't read capacity for plant {0}.".format(name_original))
        capacity = pw.NO_DATA_NUMERIC

    # GeoJSON point coordinates are ordered [longitude, latitude]
    coords = plant['geometry']['coordinates']
    try:
        longitude = float(coords[0])
        latitude = float(coords[1])
        geolocation_source = SOURCE_NAME
    except:
        longitude, latitude = pw.NO_DATA_NUMERIC, pw.NO_DATA_NUMERIC
        geolocation_source = pw.NO_DATA_UNICODE

    # get generation data (if any) from the NGER datasets
    generation = []
    for yr, lookup in zip(
        range(2013, 2019),
        [nger_1213, nger_1314, nger_1415, nger_1516, nger_1617, nger_1718]
    ):
        # column like 'nger_2012-2013_index' holds '&'-separated row indices
        # into the corresponding NGER list
        index_title = 'nger_{0}-{1}_index'.format(yr-1, yr)
        # get the raw form of the nger indices field
        try:
            nger_indices_raw = generation_linking_table[plant_idnr][index_title]
        except:
            print(u"Warning: gppd idnr {0} not found in generation matching table".format(plant_idnr))
            break
        # if blank, continue to next year
        if not nger_indices_raw.rstrip():
            continue
        # get ampersand-separated list of nger indices
        nger_indices = nger_indices_raw.split('&')
        # convert to real integers usable for list indexing
        nger_indices = map(int, nger_indices)
        gwh = 0
        for idx in nger_indices:
            try:
                nger_row = lookup[idx]
            except:
                print("Error with looking up NGER row for {0} (year = {1}; NGER index = {2};)".format(name_original, yr, idx))
                continue
            gen_gj = nger_row['Electricity Production (GJ)']
            # NGER reports energy in GJ; convert to GWh (1 GWh = 3600 GJ)
            try:
                gen_gwh = float(gen_gj.replace(",", "")) / 3600.
            except:
                print("Error with NGER generation for {0} (year = {1}; NGER index = {2}; value={3})".format(name_original, yr, idx, gen_gj))
                pass
            else:
                gwh += gen_gwh
        # TODO: give proper time bounds
        generation.append(pw.PlantGenerationObject.create(gwh, yr, source=GENERATION_SOURCE))


    new_location = pw.LocationObject(pw.NO_DATA_UNICODE, latitude, longitude)

    # only store the plant when a primary fuel could be determined
    if primary_fuel:
        new_plant = pw.PowerPlant(plant_idnr=plant_idnr, plant_name=name_enforced, plant_owner=owner,
                                  plant_country=COUNTRY_NAME,
                                  plant_location=new_location, plant_coord_source=geolocation_source,
                                  plant_primary_fuel=primary_fuel, plant_capacity=capacity,
                                  plant_generation=generation,
                                  plant_source=SOURCE_NAME, plant_source_url=SOURCE_URL)
        plants_dictionary[plant_idnr] = new_plant
        count += 1

# report on plants read from file
print(u"...read {0} plants.".format(len(plants_dictionary)))
Expand Down
26 changes: 16 additions & 10 deletions build_databases/build_database_IND.py
Original file line number Diff line number Diff line change
Expand Up @@ -43,7 +43,7 @@
# Source metadata and raw/derived file locations for the IND build.
SOURCE_URL2 = u"https://www.recregistryindia.nic.in/"
GEOLOCATION_SOURCE_CEA = u"WRI"
SAVE_CODE = u"IND"
# NOTE(review): two consecutive RAW_FILE_NAME_CEA assignments (old vs. new
# CEA archive — diff residue); the database_15.zip path takes effect.
RAW_FILE_NAME_CEA = pw.make_file_path(fileType="raw", subFolder=SAVE_CODE, filename="database_14.zip")
RAW_FILE_NAME_CEA = pw.make_file_path(fileType="raw", subFolder=SAVE_CODE, filename="database_15.zip")
RAW_FILE_NAME_CEA_UZ = pw.make_file_path(fileType="raw", filename=SAVE_CODE)
RAW_FILE_NAME_REC = pw.make_file_path(fileType="raw", subFolder=SAVE_CODE, filename="accredited_rec_generators.html")
WRI_DATABASE = pw.make_file_path(fileType="src_bin", filename=u"WRI-Database.bin")
Expand All @@ -53,7 +53,7 @@
SAVE_DIRECTORY = pw.make_file_path(fileType="src_bin")
LOCATION_FILE = pw.make_file_path(fileType="resource", subFolder=SAVE_CODE, filename="plant_locations_IND.csv")
TAB_NAME = u"Data"
# NOTE(review): duplicated DATA_YEAR assignment (diff residue); 2019 wins.
DATA_YEAR = 2018 # capacity data from CEA
DATA_YEAR = 2019 # capacity data from CEA

# optional raw files to download
FILES = {
Expand Down Expand Up @@ -88,14 +88,14 @@ def get_CEA_generation(row, col, year, source_name):
# Read curated plant coordinates into plant_locations, keyed by the CEA
# serial id. NOTE(review): indentation was lost in this rendering, and the
# duplicate match_key assignments / duplicate error prints below look like
# old-vs-new diff residue (the 'id_2018-2019' column is the current key).
with open(PLANT_LOCATIONS_FILE, 'rU') as f:
reader = csv.DictReader(f)
for row in reader:
row['match_key'] = int(row['id_2017-2018'])
row['match_key'] = int(row['id_2018-2019'])
# coordinates stay as strings if they cannot be parsed as floats
try:
row['latitude'] = float(row['latitude'])
row['longitude'] = float(row['longitude'])
except:
pass
# warn on duplicated CEA ids rather than silently overwriting
if row['match_key'] in plant_locations:
print(u"-Error: Duplicated ID for 2017-2018: {0}".format(row['match_key']))
print(u"-Error: Duplicated ID for 2018-2019: {0}".format(row['match_key']))
else:
plant_locations[row['match_key']] = row
print("Read location coordinates of {0} CEA-listed plants...".format(len(plant_locations)))
Expand All @@ -106,15 +106,16 @@ def get_CEA_generation(row, col, year, source_name):
'name': u"NAME",
'unit': u"UNIT_NO",
'year': u"DT_ COMM",
'capacity': u"CAPACITY MW AS ON 31/03/2018",
'capacity': u"CAPACITY MW AS ON 31/03/2019",
'type': u"TYPE",
'primary_fuel': u"FUEL 1",
'other_fuel': u"FUEL 2",
'gen_13-14': u"2013-14\n\nNet \nGeneration \nGWh",
#'gen_13-14': u"2013-14\n\nNet \nGeneration \nGWh",
'gen_14-15': u"2014-15\n\nNet \nGeneration \nGWh",
'gen_15-16': u"2015-16\n\nNet \nGeneration \nGWh",
'gen_16-17': u"2016-17\n\nNet \nGeneration \nGWh",
'gen_17-18': u"2017-18\n\nNet \nGeneration \nGWh",
'gen_18-19': u"2018-19\n\nNet \nGeneration \nGWh",
}

# prepare list of units
Expand All @@ -139,11 +140,12 @@ def get_CEA_generation(row, col, year, source_name):
type_col = rv.index(COLNAMES['type'])
primary_fuel_col = rv.index(COLNAMES['primary_fuel'])
other_fuel_col = rv.index(COLNAMES['other_fuel'])
gen_13_14_col = rv.index(COLNAMES['gen_13-14'])
#gen_13_14_col = rv.index(COLNAMES['gen_13-14'])
gen_14_15_col = rv.index(COLNAMES['gen_14-15'])
gen_15_16_col = rv.index(COLNAMES['gen_15-16'])
gen_16_17_col = rv.index(COLNAMES['gen_16-17'])
gen_17_18_col = rv.index(COLNAMES['gen_17-18'])
gen_18_19_col = rv.index(COLNAMES['gen_18-19'])


# parse each row
Expand Down Expand Up @@ -187,17 +189,21 @@ def get_CEA_generation(row, col, year, source_name):
else:
date_number = rv[year_col]
year = pw.excel_date_as_datetime(date_number).year
unit_list[serial_id_val].append({'capacity': capacity, 'year': year})
try:
unit_list[serial_id_val].append({'capacity': capacity, 'year': year})
except:
print("-Error: Attempting to append unit to non-existent plant {0}".format(name))
continue # don't continue reading this line b/c it's not a full plant

# try to load generation data
# TODO: organize this into fiscal year (april through march)
generation_13 = get_CEA_generation(rv, gen_13_14_col, 2013, SOURCE_NAME)
#generation_13 = get_CEA_generation(rv, gen_13_14_col, 2013, SOURCE_NAME)
generation_14 = get_CEA_generation(rv, gen_14_15_col, 2014, SOURCE_NAME)
generation_15 = get_CEA_generation(rv, gen_15_16_col, 2015, SOURCE_NAME)
generation_16 = get_CEA_generation(rv, gen_16_17_col, 2016, SOURCE_NAME)
generation_17 = get_CEA_generation(rv, gen_17_18_col, 2017, SOURCE_NAME)
generation = [generation_13, generation_14, generation_15, generation_16, generation_17]
generation_18 = get_CEA_generation(rv, gen_18_19_col, 2018, SOURCE_NAME)
generation = [generation_14, generation_15, generation_16, generation_17, generation_18]

try:
plant_type = pw.format_string(rv[type_col])
Expand Down
Loading

0 comments on commit 1cf825b

Please sign in to comment.