Add JRC-PPDB-OPEN for some EU generation.

Add a new data source: JRC-PPDB-OPEN (https://ec.europa.eu/jrc/en/publication/joint-research-centre-power-plant-database-jrc-ppdb). This source, from the Joint Research Centre of the European Union, links several datasets together, including the Global Power Plant Database (GPPD) with ENTSO-E.

JRC-PPDB-OPEN includes a measurement of the coverage of reported data, because data gaps may exist in the ENTSO-E reporting for any generating unit. A decision was therefore made that every generating unit of a particular GPPD plant must have at least 95% time coverage for a year before that information is aggregated and used as the 'reported' generation value for the GPPD plant. This means there are far fewer plants with reported generation than plants that are matched in JRC-PPDB-OPEN. This is a patch for an underlying data representation issue and should be remedied in the future.

Some of the WRI-collected plants (formerly the 'Fusion Tables') have existing generation values, but the source of those values is not well identified. A new column is therefore added to the 'WRI country databases' to track the source of the single generation field in the WRI databases. This field has been partially filled out for the accessible sources and should be given attention in the future.

In total, this data source adds net generation as follows:
- 2015: +257 more plants with generation
- 2016: +400 more plants with generation
- 2017: +457 more plants with generation

This update does not affect the number of plants or the total capacity of the database.

Database version 1.2.2
wri · Feb 5, 2020 · 13b6a3b · 13b6a3b
1 parent df7b0be
commit 13b6a3b
Show file tree

Hide file tree

Showing 145 changed files with 425,186 additions and 399,376 deletions.
diff --git a/build_databases/build_database_WRI.py b/build_databases/build_database_WRI.py
@@ -36,12 +36,6 @@
 # extract powerplant information from file(s)
 print(u"Reading in plants...")
 
-# specify column names used in raw file
-COLNAMES = ["Power Plant ID", "Name", "Fuel", "Secondary Fuel", "Capacity (MW)",
-            "Location", "Operational Status", "Commissioning Date",
-            "Units", "Owner", "Annual Generation (GWh)", "Source", "URL", "Country",
-            "Latitude", "Longitude", "Geolocation Source", "Year of Data"]
-
 # track IDs that are assigned to plants in two different countries (likely an error)
 overlapping_ids = {}
 countries_with_zero_plants = []
@@ -56,26 +50,26 @@
         plant_fuel_capacities = {}
 
         with open(os.path.join(RAW_FILE_DIRECTORY, afile), 'rU') as f:
-            datareader = csv.reader(f)
-            headers = datareader.next()
+            datareader = csv.DictReader(f)
             try:
-                id_col = headers.index(COLNAMES[0])
-                name_col = headers.index(COLNAMES[1])
-                primary_fuel_col = headers.index(COLNAMES[2])
-                other_fuel_col = headers.index(COLNAMES[3])
-                capacity_col = headers.index(COLNAMES[4])
-                location_col = headers.index(COLNAMES[5])
-                status_col = headers.index(COLNAMES[6])
-                commissioning_year_col = headers.index(COLNAMES[7])
-                owner_col = headers.index(COLNAMES[9])
-                generation_col = headers.index(COLNAMES[10])
-                source_col = headers.index(COLNAMES[11])
-                url_col = headers.index(COLNAMES[12])
-                country_col = headers.index(COLNAMES[13])
-                latitude_col = headers.index(COLNAMES[14])
-                longitude_col = headers.index(COLNAMES[15])
-                geolocation_source_col = headers.index(COLNAMES[16])
-                year_of_data_col = headers.index(COLNAMES[17])
+                id_col = "Power Plant ID"
+                name_col = "Name"
+                primary_fuel_col = "Fuel"
+                other_fuel_col = "Secondary Fuel"
+                capacity_col = "Capacity (MW)"
+                location_col = "Location"
+                status_col = "Operational Status"
+                commissioning_year_col = "Commissioning Date"
+                owner_col = "Owner"
+                generation_col = "Annual Generation (GWh)"
+                generation_source_col = "Generation Data Source"
+                source_col = "Source"
+                url_col = "URL"
+                country_col = "Country"
+                latitude_col = "Latitude"
+                longitude_col = "Longitude"
+                geolocation_source_col = "Geolocation Source"
+                year_of_data_col = "Year of Data"
             except:
                 print(u"- ERROR: One or more columns missing in {0}, skipping...".format(afile))
                 continue
@@ -133,9 +127,11 @@
                 try:
                     gen_gwh = float(pw.format_string(row[generation_col].replace(",", "")))
                     gen_year = int(pw.format_string(row[year_of_data_col]))
-                    generation = pw.PlantGenerationObject.create(gen_gwh, year=gen_year)
+                    gen_source = pw.format_string(row[generation_source_col])
                 except:
                     generation = pw.NO_DATA_OTHER
+                else:
+                    generation = pw.PlantGenerationObject.create(gen_gwh, year=gen_year, source=gen_source)
                 try:
                     owner = pw.format_string(row[owner_col])
                 except:

diff --git a/build_databases/build_global_power_plant_database.py b/build_databases/build_global_power_plant_database.py
@@ -237,6 +237,79 @@
 		print("...skipped {0} plants ({1} MW) for {2}.".format(_vals[0], _vals[1], _country))
 
 
+# STEP 3.9: Add in multinational generation datasets (reported generation from JRC-PPDB-OPEN)
+COUNTRY_DATABASE_FILE = pw.make_file_path(fileType="src_bin", filename="COUNTRY-Database.bin")
+JRC_OPEN_PERFORMANCE = pw.make_file_path('raw', 'JRC-PPDB-OPEN', 'JRC_OPEN_PERFORMANCE.csv')
+JRC_OPEN_UNITS = pw.make_file_path('raw', 'JRC-PPDB-OPEN', 'JRC_OPEN_UNITS.csv')
+JRC_OPEN_LINKAGES = pw.make_file_path('raw', 'JRC-PPDB-OPEN', 'JRC_OPEN_LINKAGES.csv')
+JRC_OPEN_TEMPORAL = pw.make_file_path('raw', 'JRC-PPDB-OPEN', 'JRC_OPEN_TEMPORAL.csv')
+JRC_BLACKLIST = set([
+	# generator IDs (eic_g) excluded because their linkage is obviously wrong, judged by country designation
+	# entry format: eic_g,  # WRI plant ID it was wrongly matched to
+	'50WG00000001097W',  # 'BRA0030768'
+	'48W000000SUTB-1P',  # 'USA0060878'
+	'26WUCNTRLDSCND24',  # 'CAN0008429'
+	'26WUCNTRLDSCND16',  # 'CAN0008429'
+	'50WG000000019861',  # 'BRA0029858'
+	'50WG000000019853',  # 'BRA0029858'
+	'50WGI00000019875',  # 'BRA0029858'
+	'48W000000ROOS-1P',  # 'USA0006202'
+])
+
+# map each WRI plant ID to the generator IDs linked to it: {wri_id: [eic_g_1, eic_g_2, ...], ...}
+gppd_ppdb_link = {}
+with open(JRC_OPEN_LINKAGES) as fin:
+	r = csv.DictReader(fin)
+	for row in r:
+		wri_id = row['WRI_id']
+		gen_id = row['eic_g']
+		if gen_id:  # some blank gen_ids, which currently don't have wri_id matches
+			gppd_ppdb_link[wri_id] = gppd_ppdb_link.get(wri_id, []) + [gen_id]
+
+# per-year generator records, values kept as the raw CSV strings: {yr: {eic_g: (gen, time_coverage), ...}, ...}
+ppdb_generation = {str(yr): {} for yr in [2015, 2016, 2017, 2018]}
+with open(JRC_OPEN_TEMPORAL) as fin:
+	r = csv.DictReader(fin)
+	skipped_generation = 0  # NOTE(review): never updated or read within this section
+	for row in r:
+		year_data = ppdb_generation[row['cyear']]  # a 'cyear' outside 2015-2018 raises KeyError here
+		# value is in MWh according to `datapackage.json` in JRC-PPDB-OPEN
+		year_data[row['eic_g']] = (row['Generation'], row['time_coverage'])
+
+# desired lookup structure, keyed by WRI plant ID then integer year: {plant1: {year1: val, year2: val2, ...}, ...}
+agg_gen_by_gppd = {}
+# minimum per-unit time coverage a generating unit needs for its generation to be used
+time_threshold = '0.950'  # a string on purpose: compared lexicographically with the CSV field; NOTE(review): assumes a fixed 'd.ddd' format - confirm
+# (wri_id, year) pairs left without a JRC value because a unit was below the threshold or blacklisted: [(plant1, yearA), ...]
+jrc_skipped_plants = []
+for wri_id, gen_ids in gppd_ppdb_link.items():
+	plant_totals = {}
+	for year in map(str, [2015, 2016, 2017]):  # 2018 is loaded above but not aggregated here
+		year_data = ppdb_generation[year]
+		year_gen_val = 0
+		accepted_gen_ids = []
+		for gen_id in gen_ids:
+			gen, time_coverage = year_data.get(gen_id, (0, '0.000'))  # a unit with no record for the year gets zero coverage
+			if time_coverage < time_threshold or gen_id in JRC_BLACKLIST:
+				jrc_skipped_plants.append((wri_id, int(year)))
+				break
+			year_gen_val += float(gen)
+			accepted_gen_ids.append(gen_id)
+		if set(accepted_gen_ids) == set(gen_ids):
+			# only reached when every linked unit was accepted; convert MWh to GWh and assign value for the year
+			plant_totals[int(year)] = year_gen_val / 1000
+	agg_gen_by_gppd[wri_id] = plant_totals
+
+for pid, pp in core_database.items():
+	if agg_gen_by_gppd.get(pid, {}):
+		new_generation = []
+		for yr, val in agg_gen_by_gppd[pid].items():
+			gen = pw.PlantGenerationObject.create(val, year=yr, source='JRC-PPDB-OPEN')
+			new_generation.append(gen)
+		if new_generation:
+			pp.generation = new_generation
+#print("Added {0} plants ({1} MW) from {2}.".format(data['count'], data['capacity'], dbname))
+
 # STEP 4: Estimate generation for plants without reported generation for target year
 count_plants_with_generation = 0
 #for plant_id,plant in core_database.iteritems():

diff --git a/output_database/DATABASE_VERSION b/output_database/DATABASE_VERSION
@@ -1,2 +1,2 @@
-1.2.1
+1.2.2