Skip to content

Commit

Permalink
Update AUS, IND, USA, GBR.
Browse files Browse the repository at this point in the history
Update database with partial equivalency to v1.3.0. Missing here are the
additional changes to the generation estimation (to be demonstrated in
an adjacent repo).

Major changes in this update include more hardcoding to match ids and
the odd intersection of official and unofficial data sources.

Include two new years of generation for some countries (2018 and 2019).

Database version: 1.2.4

Changes to be committed:
	modified:   build_databases/build_database_AUS.py
	modified:   build_databases/build_database_IND.py
	modified:   build_databases/build_database_USA.py
	modified:   build_databases/build_global_power_plant_database.py
	modified:   output_database/DATABASE_VERSION
	modified:   output_database/global_power_plant_database.csv
	modified:   output_database/global_power_plant_database_country_summary.csv
	modified:   powerplant_database.py
	new file:   raw_source_files/AUS/NGER_2017-2018.csv
	new file:   raw_source_files/AUS/australia_power_plants.geo.json
	new file:   raw_source_files/IND/database_15.zip
	new file:   raw_source_files/USA/2___Plant_Y2019.xlsx
	new file:   raw_source_files/USA/3_1_Generator_Y2019.xlsx
	new file:   raw_source_files/USA/EIA923_Schedules_2_3_4_5_M_12_2018_Final_Revision.xlsx
	new file:   raw_source_files/USA/EIA923_Schedules_2_3_4_5_M_12_2019_Final_Revision.xlsx
	modified:   raw_source_files/WRI/United Kingdom.csv
	new file:   resources/AUS/AUS_plant_dimension.csv
	modified:   resources/AUS/AUS_plants.csv
	modified:   resources/IND/CEA_plants.csv
	new file:   resources/wiki-solar-exclusion.csv
	modified:   source_databases/AUS-Database.bin
	modified:   source_databases/IND-Database.bin
	modified:   source_databases/USA-Database.bin
	modified:   source_databases/WRI-Database.bin
	modified:   source_databases_csv/database_AUS.csv
	modified:   source_databases_csv/database_IND.csv
	modified:   source_databases_csv/database_USA.csv
	modified:   source_databases_csv/database_WRI.csv
	modified:   utils/database_country_summary.py
  • Loading branch information
loganbyers committed Jan 26, 2022
1 parent 232a666 commit 1cf825b
Show file tree
Hide file tree
Showing 29 changed files with 2,078,702 additions and 1,442,464 deletions.
239 changes: 132 additions & 107 deletions build_databases/build_database_AUS.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,7 @@
"""

import xml.etree.ElementTree as ET
import json
import sys, os
import csv

Expand All @@ -20,11 +21,15 @@
# Country and data-source metadata recorded on every AUS plant entry.
COUNTRY_NAME = u"Australia"
SAVE_CODE = u"AUS"
SOURCE_NAME = u"Australian Renewable Energy Mapping Infrastructure"
# NOTE(review): two consecutive SOURCE_URL assignments (looks like old-vs-new
# diff residue); only the second assignment takes effect at runtime.
SOURCE_URL = u"http://services.ga.gov.au/site_3/rest/services/Electricity_Infrastructure/MapServer"
SOURCE_URL = u"https://www.nationalmap.gov.au/"
# Attribution string used for generation figures taken from the NGER data.
GENERATION_SOURCE = u"Australia Clean Energy Regulator"

# NGER (National Greenhouse and Energy Reporting) facility CSVs, one pair of
# (download URL, local raw-file path) per fiscal year.
NGER_URL_1718 = u"http://www.cleanenergyregulator.gov.au/DocumentAssets/Documents/Greenhouse%20and%20energy%20information%20for%20designated%20generation%20facilities%202017-18.csv"
NGER_FILENAME_1718 = pw.make_file_path(fileType="raw", subFolder=SAVE_CODE, filename="NGER_2017-2018.csv")

NGER_URL_1617 = u"http://www.cleanenergyregulator.gov.au/DocumentAssets/Documents/Greenhouse%20and%20energy%20information%20for%20designated%20generation%20facilities%202016-17.csv"
NGER_FILENAME_1617 = pw.make_file_path(fileType="raw", subFolder=SAVE_CODE, filename="NGER_2016-2017.csv")

NGER_URL_1516 = u"http://www.cleanenergyregulator.gov.au/DocumentAssets/Documents/Greenhouse%20and%20energy%20information%20for%20designated%20generation%20facilities%202015-16.csv"
NGER_FILENAME_1516 = pw.make_file_path(fileType="raw", subFolder=SAVE_CODE, filename="NGER_2015-2016.csv")

Expand All @@ -37,18 +42,20 @@
NGER_URL_1213 = u"http://www.cleanenergyregulator.gov.au/DocumentAssets/Documents/2012-13%20Greenhouse%20and%20energy%20information%20for%20designated%20generation%20facilities.csv"
NGER_FILENAME_1213 = pw.make_file_path(fileType="raw", subFolder=SAVE_CODE, filename="NGER_2012-2013.csv")

# NOTE(review): two consecutive RAW_FILE_NAME assignments (old XML target vs.
# new GeoJSON target — diff residue); the .geo.json path takes effect.
RAW_FILE_NAME = pw.make_file_path(fileType="raw", subFolder=SAVE_CODE, filename="australia_power_plants.xml")
RAW_FILE_NAME = pw.make_file_path(fileType="raw", subFolder=SAVE_CODE, filename="australia_power_plants.geo.json")
# Output CSV and binary locations for the AUS source database.
CSV_FILE_NAME = pw.make_file_path(fileType="src_csv", filename="database_AUS.csv")
SAVE_DIRECTORY = pw.make_file_path(fileType="src_bin")
# Static resources: permanent-ID table and the AREMI-objectid match table.
STATIC_ID_FILENAME = pw.make_file_path(fileType="resource", subFolder='AUS', filename="AUS_plants.csv")
STATIC_MATCH_FILENAME = pw.make_file_path(fileType="resource", subFolder='AUS', filename="AUS_plant_dimension.csv")

# other parameters
# NOTE(review): duplicated API_BASE/API_CALL pairs (old WFS endpoint vs. new
# ArcGIS REST query endpoint — diff residue); the later assignments win.
API_BASE = "http://services.ga.gov.au/site_3/services/Electricity_Infrastructure/MapServer/WFSServer"
API_CALL = "service=WFS&version=1.1.0&request=GetFeature&typeName=National_Major_Power_Stations"
API_BASE = "https://services.ga.gov.au/gis/rest/services/Foundation_Electricity_Infrastructure/MapServer/0/query"
API_CALL = "geometry=-180%2C-90%2C180%2C90&geometryType=esriGeometryEnvelope&inSR=EPSG%3A4326&spatialRel=esriSpatialRelIntersects&outFields=*&returnGeometry=true=&f=geojson"

# optional raw file(s) download
URL = API_BASE + "?" + API_CALL
FILES = {RAW_FILE_NAME: URL,
NGER_FILENAME_1718: NGER_URL_1718,
NGER_FILENAME_1617: NGER_URL_1617,
NGER_FILENAME_1516: NGER_URL_1516,
NGER_FILENAME_1415: NGER_URL_1415,
Expand All @@ -64,7 +71,17 @@
country_thesaurus = pw.make_country_names_thesaurus()

# get permanent IDs for australian plants
# NOTE(review): linking_table (keyed by AREMI 'aremi_oid') and
# generation_linking_table (keyed by 'gppd_idnr') are both built from the
# same static-ID CSV; id_linking_table is keyed by integer 'objectid' from
# the separate plant-dimension CSV. The duplicate reads of STATIC_ID_FILENAME
# appear to be old-vs-new diff residue — confirm which tables are current.
linking_table = {k['aremi_oid']: k for k in csv.DictReader(open(STATIC_ID_FILENAME))}
generation_linking_table = {k['gppd_idnr']: k for k in csv.DictReader(open(STATIC_ID_FILENAME))}

id_linking_table = {int(k['objectid']): k for k in csv.DictReader(open(STATIC_MATCH_FILENAME)) if k['objectid']}

# Hardcoded fuel overrides, applied after fuel standardization, for plants
# whose upstream fuel attribute is known to be wrong.
fuel_type_assurance = {
# gppd_idnr: primary_fuel
'AUS0000619': 'Solar',
'AUS0000526': 'Solar',
'AUS0000581': 'Solar',
'AUS0000620': 'Wind'
}

# create dictionary for power plant objects
plants_dictionary = {}
Expand All @@ -74,115 +91,123 @@
print(u"Reading NGER files to memory...")

# read NGER file into a list, so the facilities can be referenced by their index in the original file
# One list per NGER fiscal year (2012-13 through 2017-18).
nger_1718 = list(csv.DictReader(open(NGER_FILENAME_1718)))
nger_1617 = list(csv.DictReader(open(NGER_FILENAME_1617)))
nger_1516 = list(csv.DictReader(open(NGER_FILENAME_1516)))
nger_1415 = list(csv.DictReader(open(NGER_FILENAME_1415)))
nger_1314 = list(csv.DictReader(open(NGER_FILENAME_1314)))
nger_1213 = list(csv.DictReader(open(NGER_FILENAME_1213)))

# ---------------------------------------------------------------------------
# NOTE(review): legacy XML/WFS-based parser. A GeoJSON-based block later in
# this file performs the same role; this appears to be the removed side of a
# diff (indentation was also lost in this rendering) — confirm which parser
# is the current one before editing.
# ---------------------------------------------------------------------------
# create a dictionary of namespaces
ns = {"gml": "http://www.opengis.net/gml",
"Electricity_Infrastructure": "WFS"}

# read data from XML file and parse
count = 1
with open(RAW_FILE_NAME, "rU") as f:
tree = ET.parse(f)
root = tree.getroot()
for station in tree.findall("gml:featureMember", ns):
plant = station.find("Electricity_Infrastructure:National_Major_Power_Stations", ns)
name = pw.format_string(plant.find("Electricity_Infrastructure:NAME", ns).text)

# get object id from AREMI (variable through time)
plant_oid = plant.find("Electricity_Infrastructure:OBJECTID", ns).text
# check if plant is already known, and skip if there is not a record (includes cases where AREMI has duplicated plants)
if plant_oid not in linking_table:
print(u"Error: Don't have prescribed ID for plant {0}; OID={1}.".format(name, plant_oid))
continue
# get the assigned GPPD IDNR as an int, stripping the 'AUS' prefix
plant_id = int(linking_table[plant_oid]['gppd_idnr'][3:])

try:
owner = pw.format_string(plant.find("Electricity_Infrastructure:OWNER", ns).text)
except:
owner = pw.NO_DATA_UNICODE
primary_fuel = pw.standardize_fuel(plant.find("Electricity_Infrastructure:PRIMARYFUELTYPE", ns).text, fuel_thesaurus)
try:
capacity = plant.find("Electricity_Infrastructure:GENERATIONMW", ns).text
capacity = float(capacity)
except:
print(u"Error: Can't read capacity for plant {0}.".format(name))
capacity = pw.NO_DATA_NUMERIC
# gml:pos holds "longitude latitude" separated by a space
coords = plant.find("Electricity_Infrastructure:SHAPE/gml:Point/gml:pos", ns).text.split(" ")
try:
longitude = float(coords[0])
latitude = float(coords[1])
geolocation_source = SOURCE_NAME
except:
longitude, latitude = pw.NO_DATA_NUMERIC, pw.NO_DATA_NUMERIC
geolocation_source = pw.NO_DATA_UNICODE

# # Additional information for future interest
# operational_status = plant.find('Electricity_Infrastructure:OPERATIONALSTATUS', ns).text)
# technology = plant.find('Electricity_Infrastructure:GENERATIONTYPE', ns).text)
# try:
# subfuel = plant.find('Electricity_Infrastructure:PRIMARYSUBFUELTYPE', ns).text
# except:
# subfuel = fuel

# date_updated format after split: YYYY-MM-DD
try:
year_updated = int(plant.find("Electricity_Infrastructure:REVISED", ns).text.split("T")[0][0:4])
except:
year_updated = pw.NO_DATA_NUMERIC

# get generation data (if any) from the NGER datasets
generation = []
for yr, lookup in zip(
range(2013, 2018),
[nger_1213, nger_1314, nger_1415, nger_1516, nger_1617]
):
index_title = 'nger_{0}-{1}_index'.format(yr-1, yr)
# get the raw form of the nger indices field
nger_indices_raw = linking_table[plant_oid][index_title]
# if blank, continue to next year
if not nger_indices_raw.rstrip():
continue
# get ampersand-separated list of nger indices
nger_indices = nger_indices_raw.split('&')
# convert to real integers usable for list indexing
nger_indices = map(int, nger_indices)
gwh = 0
for idx in nger_indices:
try:
nger_row = lookup[idx]
except:
print("Error with looking up NGER row for {0} (year = {1}; NGER index = {2};)".format(name, yr, idx))
continue
gen_gj = nger_row['Electricity Production (GJ)']
# NGER reports energy in GJ; convert to GWh (1 GWh = 3600 GJ)
try:
gen_gwh = float(gen_gj.replace(",", "")) / 3600.
except:
print("Error with NGER generation for {0} (year = {1}; NGER index = {2}; value={3})".format(name, yr, idx, gen_gj))
pass
else:
gwh += gen_gwh
# TODO: give proper time bounds
generation.append(pw.PlantGenerationObject.create(gwh, yr, source=GENERATION_SOURCE))


# assign ID number
idnr = pw.make_id(SAVE_CODE, plant_id)
new_location = pw.LocationObject(pw.NO_DATA_UNICODE, latitude, longitude)
new_plant = pw.PowerPlant(plant_idnr=idnr, plant_name=name, plant_owner=owner,
plant_country=COUNTRY_NAME,
plant_location=new_location, plant_coord_source=geolocation_source,
plant_primary_fuel=primary_fuel, plant_capacity=capacity,
plant_generation=generation,
plant_source=SOURCE_NAME, plant_cap_year=year_updated,
plant_source_url=SOURCE_URL)
plants_dictionary[idnr] = new_plant
count += 1
# ---------------------------------------------------------------------------
# Parse the AREMI GeoJSON export and build pw.PowerPlant objects.
# For each feature: resolve a permanent GPPD identifier via id_linking_table,
# skip unmatched or non-operational plants, apply hardcoded fuel overrides
# from fuel_type_assurance, and attach NGER generation for fiscal years
# 2012-13 through 2017-18 (recorded under the calendar year the fiscal year
# ends in). Plants accumulate in plants_dictionary keyed by GPPD id.
# Fixes in this revision: corrected garbled warning strings
# ("possible exlucuded" -> "possibly excluded",
#  "unoperational" -> "not operational").
# ---------------------------------------------------------------------------
count = 0
with open(RAW_FILE_NAME, "rU") as fin:
    geojson = json.load(fin)


for plant in geojson['features']:
    plant_properties = plant['properties']
    name_original = pw.format_string(plant_properties['name'])
    plant_oid = plant_properties['objectid']

    # check if plant is already known, and skip if there is not a record
    # (includes cases where AREMI has duplicated plants)
    if plant_oid not in id_linking_table:
        print(u"Error: Don't have prescribed ID for plant {0}; OID={1}.".format(name_original, plant_oid))
        continue

    # get the assigned GPPD identifier; an empty value means the plant was
    # left out of the match table (possibly excluded deliberately)
    plant_idnr = id_linking_table[plant_oid]['gppd_idnr_assigned']
    if not plant_idnr:
        print(u"Warning: plant {0}; OID={1} will not be added, ID not found (possibly excluded on purpose).".format(name_original, plant_oid))
        continue

    # only keep plants AREMI marks as currently operational
    operational_status = plant_properties['operational_status']
    if operational_status != 'Operational':
        print(u"Warning: plant {0}; OID={1} will not be added, considered not operational: {2}".format(name_original, plant_oid, operational_status))
        continue

    # override name with the curated value from the match table
    name_enforced = id_linking_table[plant_oid]['name_enforced']

    try:
        owner = pw.format_string(plant_properties['owner'])
    except:
        owner = pw.NO_DATA_UNICODE

    try:
        primary_fuel = pw.standardize_fuel(plant_properties['primaryfueltype'], fuel_thesaurus)
    except:
        print(u"Error: Can't understand fuel {0} for plant {1}.".format(plant_properties['primaryfueltype'], name_original))
        primary_fuel = pw.NO_DATA_UNICODE

    # hardcoded fuel corrections take precedence over the AREMI attribute
    if plant_idnr in fuel_type_assurance:
        print(u"Warning: overriding fuel for plant {0}.".format(name_original))
        primary_fuel = pw.standardize_fuel(fuel_type_assurance[plant_idnr], fuel_thesaurus)

    try:
        capacity = plant_properties['generationmw']
        capacity = float(capacity)
    except:
        print(u"Error: Can't read capacity for plant {0}.".format(name_original))
        capacity = pw.NO_DATA_NUMERIC

    # GeoJSON point coordinates are ordered [longitude, latitude]
    coords = plant['geometry']['coordinates']
    try:
        longitude = float(coords[0])
        latitude = float(coords[1])
        geolocation_source = SOURCE_NAME
    except:
        longitude, latitude = pw.NO_DATA_NUMERIC, pw.NO_DATA_NUMERIC
        geolocation_source = pw.NO_DATA_UNICODE

    # get generation data (if any) from the NGER datasets
    generation = []
    for yr, lookup in zip(
        range(2013, 2019),
        [nger_1213, nger_1314, nger_1415, nger_1516, nger_1617, nger_1718]
    ):
        # column like 'nger_2012-2013_index' holds '&'-separated row indices
        # into the corresponding NGER list
        index_title = 'nger_{0}-{1}_index'.format(yr-1, yr)
        # get the raw form of the nger indices field
        try:
            nger_indices_raw = generation_linking_table[plant_idnr][index_title]
        except:
            print(u"Warning: gppd idnr {0} not found in generation matching table".format(plant_idnr))
            break
        # if blank, continue to next year
        if not nger_indices_raw.rstrip():
            continue
        # get ampersand-separated list of nger indices
        nger_indices = nger_indices_raw.split('&')
        # convert to real integers usable for list indexing
        nger_indices = map(int, nger_indices)
        gwh = 0
        for idx in nger_indices:
            try:
                nger_row = lookup[idx]
            except:
                print("Error with looking up NGER row for {0} (year = {1}; NGER index = {2};)".format(name_original, yr, idx))
                continue
            gen_gj = nger_row['Electricity Production (GJ)']
            # NGER reports energy in GJ; convert to GWh (1 GWh = 3600 GJ)
            try:
                gen_gwh = float(gen_gj.replace(",", "")) / 3600.
            except:
                print("Error with NGER generation for {0} (year = {1}; NGER index = {2}; value={3})".format(name_original, yr, idx, gen_gj))
                pass
            else:
                gwh += gen_gwh
        # TODO: give proper time bounds
        generation.append(pw.PlantGenerationObject.create(gwh, yr, source=GENERATION_SOURCE))


    new_location = pw.LocationObject(pw.NO_DATA_UNICODE, latitude, longitude)

    # only store the plant when a primary fuel could be determined
    if primary_fuel:
        new_plant = pw.PowerPlant(plant_idnr=plant_idnr, plant_name=name_enforced, plant_owner=owner,
                                  plant_country=COUNTRY_NAME,
                                  plant_location=new_location, plant_coord_source=geolocation_source,
                                  plant_primary_fuel=primary_fuel, plant_capacity=capacity,
                                  plant_generation=generation,
                                  plant_source=SOURCE_NAME, plant_source_url=SOURCE_URL)
        plants_dictionary[plant_idnr] = new_plant
        count += 1

# report on plants read from file
print(u"...read {0} plants.".format(len(plants_dictionary)))
Expand Down
26 changes: 16 additions & 10 deletions build_databases/build_database_IND.py
Original file line number Diff line number Diff line change
Expand Up @@ -43,7 +43,7 @@
# Source metadata and raw/derived file locations for the IND build.
SOURCE_URL2 = u"https://www.recregistryindia.nic.in/"
GEOLOCATION_SOURCE_CEA = u"WRI"
SAVE_CODE = u"IND"
# NOTE(review): two consecutive RAW_FILE_NAME_CEA assignments (old vs. new
# CEA archive — diff residue); the database_15.zip path takes effect.
RAW_FILE_NAME_CEA = pw.make_file_path(fileType="raw", subFolder=SAVE_CODE, filename="database_14.zip")
RAW_FILE_NAME_CEA = pw.make_file_path(fileType="raw", subFolder=SAVE_CODE, filename="database_15.zip")
RAW_FILE_NAME_CEA_UZ = pw.make_file_path(fileType="raw", filename=SAVE_CODE)
RAW_FILE_NAME_REC = pw.make_file_path(fileType="raw", subFolder=SAVE_CODE, filename="accredited_rec_generators.html")
WRI_DATABASE = pw.make_file_path(fileType="src_bin", filename=u"WRI-Database.bin")
Expand All @@ -53,7 +53,7 @@
SAVE_DIRECTORY = pw.make_file_path(fileType="src_bin")
LOCATION_FILE = pw.make_file_path(fileType="resource", subFolder=SAVE_CODE, filename="plant_locations_IND.csv")
TAB_NAME = u"Data"
# NOTE(review): duplicated DATA_YEAR assignment (diff residue); 2019 wins.
DATA_YEAR = 2018 # capacity data from CEA
DATA_YEAR = 2019 # capacity data from CEA

# optional raw files to download
FILES = {
Expand Down Expand Up @@ -88,14 +88,14 @@ def get_CEA_generation(row, col, year, source_name):
# Read curated plant coordinates into plant_locations, keyed by the CEA
# serial id. NOTE(review): indentation was lost in this rendering, and the
# duplicate match_key assignments / duplicate error prints below look like
# old-vs-new diff residue (the 'id_2018-2019' column is the current key).
with open(PLANT_LOCATIONS_FILE, 'rU') as f:
reader = csv.DictReader(f)
for row in reader:
row['match_key'] = int(row['id_2017-2018'])
row['match_key'] = int(row['id_2018-2019'])
# coordinates stay as strings if they cannot be parsed as floats
try:
row['latitude'] = float(row['latitude'])
row['longitude'] = float(row['longitude'])
except:
pass
# warn on duplicated CEA ids rather than silently overwriting
if row['match_key'] in plant_locations:
print(u"-Error: Duplicated ID for 2017-2018: {0}".format(row['match_key']))
print(u"-Error: Duplicated ID for 2018-2019: {0}".format(row['match_key']))
else:
plant_locations[row['match_key']] = row
print("Read location coordinates of {0} CEA-listed plants...".format(len(plant_locations)))
Expand All @@ -106,15 +106,16 @@ def get_CEA_generation(row, col, year, source_name):
'name': u"NAME",
'unit': u"UNIT_NO",
'year': u"DT_ COMM",
'capacity': u"CAPACITY MW AS ON 31/03/2018",
'capacity': u"CAPACITY MW AS ON 31/03/2019",
'type': u"TYPE",
'primary_fuel': u"FUEL 1",
'other_fuel': u"FUEL 2",
'gen_13-14': u"2013-14\n\nNet \nGeneration \nGWh",
#'gen_13-14': u"2013-14\n\nNet \nGeneration \nGWh",
'gen_14-15': u"2014-15\n\nNet \nGeneration \nGWh",
'gen_15-16': u"2015-16\n\nNet \nGeneration \nGWh",
'gen_16-17': u"2016-17\n\nNet \nGeneration \nGWh",
'gen_17-18': u"2017-18\n\nNet \nGeneration \nGWh",
'gen_18-19': u"2018-19\n\nNet \nGeneration \nGWh",
}

# prepare list of units
Expand All @@ -139,11 +140,12 @@ def get_CEA_generation(row, col, year, source_name):
type_col = rv.index(COLNAMES['type'])
primary_fuel_col = rv.index(COLNAMES['primary_fuel'])
other_fuel_col = rv.index(COLNAMES['other_fuel'])
gen_13_14_col = rv.index(COLNAMES['gen_13-14'])
#gen_13_14_col = rv.index(COLNAMES['gen_13-14'])
gen_14_15_col = rv.index(COLNAMES['gen_14-15'])
gen_15_16_col = rv.index(COLNAMES['gen_15-16'])
gen_16_17_col = rv.index(COLNAMES['gen_16-17'])
gen_17_18_col = rv.index(COLNAMES['gen_17-18'])
gen_18_19_col = rv.index(COLNAMES['gen_18-19'])


# parse each row
Expand Down Expand Up @@ -187,17 +189,21 @@ def get_CEA_generation(row, col, year, source_name):
else:
date_number = rv[year_col]
year = pw.excel_date_as_datetime(date_number).year
unit_list[serial_id_val].append({'capacity': capacity, 'year': year})
try:
unit_list[serial_id_val].append({'capacity': capacity, 'year': year})
except:
print("-Error: Attempting to append unit to non-existent plant {0}".format(name))
continue # don't continue reading this line b/c it's not a full plant

# try to load generation data
# TODO: organize this into fiscal year (april through march)
generation_13 = get_CEA_generation(rv, gen_13_14_col, 2013, SOURCE_NAME)
#generation_13 = get_CEA_generation(rv, gen_13_14_col, 2013, SOURCE_NAME)
generation_14 = get_CEA_generation(rv, gen_14_15_col, 2014, SOURCE_NAME)
generation_15 = get_CEA_generation(rv, gen_15_16_col, 2015, SOURCE_NAME)
generation_16 = get_CEA_generation(rv, gen_16_17_col, 2016, SOURCE_NAME)
generation_17 = get_CEA_generation(rv, gen_17_18_col, 2017, SOURCE_NAME)
generation = [generation_13, generation_14, generation_15, generation_16, generation_17]
generation_18 = get_CEA_generation(rv, gen_18_19_col, 2018, SOURCE_NAME)
generation = [generation_14, generation_15, generation_16, generation_17, generation_18]

try:
plant_type = pw.format_string(rv[type_col])
Expand Down
Loading

0 comments on commit 1cf825b

Please sign in to comment.