Update automatic metadata creation #393

Draft
wants to merge 12 commits into develop
64 changes: 23 additions & 41 deletions open_mastr/soap_api/metadata/create.py
@@ -69,8 +69,9 @@ def datapackag_base(reference_date, publication_date=None, statistik_flag=None):
"id": str(uuid.uuid4()),
"description": f"Raw data download Marktstammdatenregister (MaStR) data using the webservice.\n\n{description_extra}",
"language": ["en-GB", "de-DE"],
"subject": [{"name": None, "path": None}],
"keywords": ["powerplants", "renewables"],
"created": publication_date,
"publicationDate": publication_date,
"version": data_version,
"context": {
"homepage": "https://www.marktstammdatenregister.de/MaStR/",
@@ -85,13 +86,15 @@ def datapackag_base(reference_date, publication_date=None, statistik_flag=None):
"spatial": {"location": None, "extent": "Germany", "resolution": "vector"},
"temporal": {
"referenceDate": reference_date.strftime("%Y-%m-%d %H:%M:%S"),
"timeseries": {
"timeseries": [
{
"start": None,
"end": None,
"resolution": None,
"alignment": None,
"aggregationType": None,
},
}
]
},
"sources": [
{
@@ -106,7 +109,7 @@ def datapackag_base(reference_date, publication_date=None, statistik_flag=None):
"instruction": "You are free: To Share, To Create, To Adapt; As long as you: Attribute",
"attribution": f"© Marktstammdatenregister {datetime.date.today().year} | dl-de/by-2-0",
}
],
]
},
{
"title": "RLI - open_MaStR",
@@ -120,8 +123,8 @@ def datapackag_base(reference_date, publication_date=None, statistik_flag=None):
"instruction": "You are free: To Share, To Create, To Adapt; As long as you: Attribute, Share-Alike, Keep open!",
"attribution": "open_MaStR © Reiner Lemoine Institut | AGPL-3.0",
}
],
},
]
}
],
"licenses": [
{
@@ -134,30 +137,16 @@ def datapackag_base(reference_date, publication_date=None, statistik_flag=None):
],
"contributors": [
{
"title": "Ludee",
"title": None,
"email": None,
"path": "https://github.com/ludee",
"role": "maintainer",
"organization": "Reiner Lemoine Institut gGmbH",
},
{
"title": "Guido Pleßmann",
"email": None,
"path": "https://gplssm.de",
"role": "maintainer",
"organization": "Reiner Lemoine Institut gGmbH",
},
{
"title": "oakca",
"email": None,
"path": "https://github.com/oakca",
"role": "contributor",
"organization": "Reiner Lemoine Institut gGmbH",
},
"date": None,
"object": None,
"comment": None
}
],
"review": {"path": None, "badge": None},
"metaMetadata": {
"metadataVersion": "OEP-1.4.0",
"metadataVersion": "OEP-1.5.2",
"metadataLicense": {
"name": "CC0-1.0",
"title": "Creative Commons Zero v1.0 Universal",
@@ -172,6 +161,7 @@ def datapackag_base(reference_date, publication_date=None, statistik_flag=None):
"licenses": "License name must follow the SPDX License List (https://spdx.org/licenses/)",
"review": "Following the OEP Data Review (https://github.com/OpenEnergyPlatform/data-preprocessing/wiki)",
"null": "If not applicable use (null)",
"todo": "If a value ist not yet available, use: todo"
},
}

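For orientation, a hedged usage sketch of the updated datapackag_base (illustrative arguments; it assumes the module-level names the function references, such as data_version and description_extra, are available alongside it):

import datetime
import json

from open_mastr.soap_api.metadata.create import datapackag_base

# reference_date must be a datetime, since the function formats it with
# strftime("%Y-%m-%d %H:%M:%S"); publication_date is stored as given.
meta = datapackag_base(
    reference_date=datetime.datetime(2022, 12, 1),
    publication_date="2022-12-01",
)

with open("datapackage.json", "w", encoding="utf-8") as f:
    json.dump(meta, f, indent=2, ensure_ascii=False)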
@@ -253,50 +243,44 @@ def create_datapackage_meta_json(
resource = {
"profile": "tabular-data-resource",
"name": f"bnetza_mastr_{tech}_raw",
"title": f"open-MaStR {tech} units (raw)",
"path": filenames["raw"][tech]["joined"],
"scheme": "file",
"format": "csv",
"encoding": "utf-8",
"mediatype": "text/csv",
"schema": {
"fields": raw_fields,
"primaryKey": ["EinheitMastrNummer"],
},
"dialect": {"delimiter": ","},
}

resources_meta["resources"].append(resource)
if "cleaned" in data:
resource = {
"profile": "tabular-data-resource",
"name": f"bnetza_mastr_{tech}_cleaned",
"title": f"open-MaStR {tech} units (cleaned)",
"path": filenames["cleaned"][tech],
"scheme": "file",
"format": "csv",
"encoding": "utf-8",
"mediatype": "text/csv",
"schema": {
"fields": raw_fields,
"primaryKey": ["EinheitMastrNummer"],
},
"dialect": {"delimiter": ","},
}

resources_meta["resources"].append(resource)
if "postprocessed" in data:
processed_fields = [
{
"name": "geom",
"unit": None,
"type": "str",
"desciption": "Standort der Anlage als Punktgeometrie im WKB Format",
"description": "Standort der Anlage als Punktgeometrie im WKB Format",
"examples": "0101000020e610000071fbe59315131c40a2b437f8c20e4a40",
},
{
"name": "comment",
"unit": None,
"type": "str",
"desciption": "Information about data post-processing",
"description": "Information about data post-processing",
"examples": "has_geom; outside_vg250",
},
]
@@ -306,7 +290,7 @@ def create_datapackage_meta_json(
"name": "tags",
"unit": None,
"type": "json",
"desciption": "Data insights and report about post-processing steps",
"description": "Data insights and report about post-processing steps",
"examples": {
"plz_check": False,
"processed": True,
@@ -319,18 +303,16 @@ def create_datapackage_meta_json(
"name": "geom",
"unit": None,
"type": "str",
"desciption": "Standort der Anlage als Punktgeometrie im WKB Format (EPSG 3035)",
"description": "Standort der Anlage als Punktgeometrie im WKB Format (EPSG 3035)",
"examples": "0101000020e610000071fbe59315131c40a2b437f8c20e4a40",
}
)
resource = {
"profile": "tabular-data-resource",
"name": f"bnetza_mastr_{tech}",
"title": f"open-MaStR {tech} units",
"path": filenames["postprocessed"][tech],
"scheme": "file",
"format": "csv",
"encoding": "utf-8",
"mediatype": "text/csv",
"schema": {
"fields": raw_fields + processed_fields,
"primaryKey": ["EinheitMastrNummer"],
70 changes: 46 additions & 24 deletions open_mastr/soap_api/metadata/description.py
@@ -33,19 +33,19 @@ def __init__(self, xml=None):
self.xml = fh.read()
else:
# If no XML file is given, the file is read from a URL
zipurl = 'https://www.marktstammdatenregister.de/MaStRHilfe/files/' \
'webdienst/Dienstbeschreibung_1_2_39_Produktion.zip'
zipurl = "https://www.marktstammdatenregister.de/MaStRHilfe/files/webdienst/" \
"Dienstbeschreibung_Produktion_Version" \
"1.2.87" \ # update version here
".zip"

with urlopen(zipurl) as zipresp:
with ZipFile(BytesIO(zipresp.read())) as zfile:
self.xml = zfile.read('xsd/mastrbasetypes.xsd')


self.xml = zfile.read("xsd/mastrbasetypes.xsd")

# Parse XML and extract relevant data
parsed = xmltodict.parse(self.xml, process_namespaces=False)
self.complex_types = parsed['schema']["complexType"]
self.simple_types = parsed['schema']["simpleType"]
self.complex_types = parsed["schema"]["complexType"]
self.simple_types = parsed["schema"]["simpleType"]

# Prepare parsed data for documentation purposes
abstract_types, parameters, responses, types = self._filter_type_descriptions()
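Because the Dienstbeschreibung version is bumped by hand in the URL above, a quick pre-flight check (an assumed sketch, not code from this PR) can confirm the expected XSD is still shipped in the archive:

from io import BytesIO
from urllib.request import urlopen
from zipfile import ZipFile

zipurl = (
    "https://www.marktstammdatenregister.de/MaStRHilfe/files/webdienst/"
    "Dienstbeschreibung_Produktion_Version1.2.87.zip"
)

# Fail early if a version bump changed the archive layout.
with urlopen(zipurl) as zipresp:
    with ZipFile(BytesIO(zipresp.read())) as zfile:
        assert "xsd/mastrbasetypes.xsd" in zfile.namelist()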
@@ -78,13 +78,17 @@ def _filter_type_descriptions(self):
raise ValueError("Ohh...")
else:
# Filter all functions
if item["@name"].startswith(("Get", "Set", "Erneute", "Verschiebe", "Delete")):
if item["@name"].startswith(
("Get", "Set", "Erneute", "Verschiebe", "Delete")
):
functions.append(item)

# Further split the list of functions into parameters and responses
if item["@name"].endswith("Parameter"):
if "complexContent" in item.keys():
parameters[item["@name"]] = item["complexContent"]["extension"]
parameters[item["@name"]] = item["complexContent"][
"extension"
]
else:
parameters[item["@name"]] = item
elif item["@name"].endswith("Antwort"):
@@ -111,12 +115,14 @@ def prepare_simple_type(self):

for simple_type in self.simple_types:
if "enumeration" in simple_type["restriction"]:
possible_values = [_["@value"] for _ in simple_type["restriction"]["enumeration"]]
possible_values = [
_["@value"] for _ in simple_type["restriction"]["enumeration"]
]
else:
possible_values = []
simple_types_doc[simple_type["@name"]] = {
"type": simple_type["restriction"]["@base"],
"values": possible_values
"values": possible_values,
}
return simple_types_doc

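To make the enumeration handling concrete, a minimal self-contained example of what xmltodict produces for an XSD enumeration (the type name and values are invented for illustration):

import xmltodict

xsd = """
<schema>
  <simpleType name="Beispieltyp">
    <restriction base="string">
      <enumeration value="Wind"/>
      <enumeration value="Solar"/>
    </restriction>
  </simpleType>
</schema>
"""

simple_type = xmltodict.parse(xsd)["schema"]["simpleType"]
values = [e["@value"] for e in simple_type["restriction"]["enumeration"]]
# values == ["Wind", "Solar"]; the base type is simple_type["restriction"]["@base"]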
@@ -140,49 +146,61 @@ def functions_data_documentation(self):
if "annotation" in fcn["sequence"]["element"]:
fcn_data = [fcn["sequence"]["element"]]
else:
fcn_data = self.types[fcn["sequence"]["element"]["@type"].split(":")[1]]["sequence"]["element"]
fcn_data = self.types[
fcn["sequence"]["element"]["@type"].split(":")[1]
]["sequence"]["element"]
else:
print(type(fcn["sequence"]))
print(fcn["sequence"])
raise ValueError

# Add data for inherited columns from base types
if "@base" in fcn:
if not fcn["@base"] == 'mastr:AntwortBasis':
fcn_data = _collect_columns_of_base_type(self.types, fcn["@base"].split(":")[1], fcn_data)
if not fcn["@base"] == "mastr:AntwortBasis":
fcn_data = _collect_columns_of_base_type(
self.types, fcn["@base"].split(":")[1], fcn_data
)
function_docs[fcn_name] = {}
for column in fcn_data:
# Replace MaStR internal types with more general ones
if column["@type"].startswith("mastr:"):
try:
column_type = self.simple_types_prepared[column["@type"].split(":")[1]]["type"]
column_type = self.simple_types_prepared[
column["@type"].split(":")[1]
]["type"]
except KeyError:
column_type = column["@type"]
else:
column_type = column["@type"]

if "annotation" in column.keys():
description = column["annotation"]["documentation"].get("#text", None)
description = column["annotation"]["documentation"].get(
"#text", None
)
if description:
description = re.sub(" +", " ", description.replace("\n", ""))
description = re.sub(
" +", " ", description.replace("\n", "")
)
function_docs[fcn_name][column["@name"]] = {
"type": column_type,
"description": description,
"example": column["annotation"]["documentation"].get("m-ex", None)
"type": column_type,
"description": description,
"example": column["annotation"]["documentation"].get(
"m-ex", None
),
}
else:
function_docs[fcn_name][column["@name"]] = {
"type": column_type,
# TODO: insert information from simple type here
"description": None,
"example": None
"example": None,
}

# Hack in a description for a column that gets created after download while flattening data
function_docs["GetEinheitWind"]["HerstellerId"] = {
"type": "str",
"description": "Id des Herstellers der Einheit",
"example": 923
"example": 923,
}

return function_docs
@@ -193,7 +211,11 @@ def _collect_columns_of_base_type(base_types, base_type_name, fcn_data):
fcn_data += type_description["extension"]["sequence"]["element"]

if "@base" in type_description["extension"]:
if not type_description["extension"]["@base"] == 'mastr:AntwortBasis':
fcn_data = _collect_columns_of_base_type(base_types, type_description["extension"]["@base"].split(":")[1], fcn_data)
if not type_description["extension"]["@base"] == "mastr:AntwortBasis":
fcn_data = _collect_columns_of_base_type(
base_types,
type_description["extension"]["@base"].split(":")[1],
fcn_data,
)

return fcn_data
65 changes: 43 additions & 22 deletions open_mastr/soap_api/metadata/mastr_datapackage.json
@@ -9,15 +9,29 @@
],
"subject": [
{
"name": null,
"path": null
"name": "power plant",
"path": "http://openenergy-platform.org/ontology/oeo/OEO_00000031"
Contributor:
What is the function of ontology references?

chrwm (Member, Author) replied on Dec 8, 2022:
The purpose is that the @context field makes some fields in the oemetadata searchable via SPARQL queries, among others subject, isAbout, and valueReference. It turns the metadata JSON into JSON-LD.
The idea is that in the future, datasets are annotated ontologically for two use cases:

  1. A common vocabulary to search for data in the energy domain in a search engine built on linked open data.
    Here is the prototype of a search engine from the LOD-GEOSS project: http://moss.tools.dbpedia.org/search
    The usability and visualisation are going to be improved.
    When you search for hub height, select the concept from the suggestions, and search for it on the energy databus, you'll find wind power plant datasets that contain information about hub height you might use for your calculations. The datasets are registered in the databus, which is a metadata catalog that points to decentrally hosted databases. So the data is not actually hosted and maintained on the databus but at individual institutions, the idea being to improve data findability in the domain (this only works if sufficiently many datasets with good metadata participate).
  2. Inference of knowledge from annotated datasets.
    A simplistic example: I annotate the MaStR dataset with power plant; you know generally what the concept power plant is, but not what types of power plants exist in the energy domain. You could gain that knowledge from the hierarchical relations stored in the ontology.
    See https://openenergy-platform.org/viewer/oeo/ and search for power plant there.
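A minimal sketch of that idea, assuming rdflib >= 6.0 (which bundles a JSON-LD parser) and that the referenced @context resolves; the file name is illustrative:

from rdflib import Graph

# Parse the annotated oemetadata as JSON-LD; the @context turns fields like
# "subject" into IRIs that can then be queried, e.g. with SPARQL.
g = Graph().parse("mastr_datapackage.json", format="json-ld")
for s, p, o in g:
    print(s, p, o)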

},
{
"name": "renewable",
"path": "http://openenergy-platform.org/ontology/oeo/OEO_00030004"
},
{
"name": "conventional",
"path": "http://openenergy-platform.org/ontology/oeo/OEO_00020147"
},
{
"name": "energy storage object",
"path": "http://openenergy-platform.org/ontology/oeo/OEO_00000159"
}
],
"keywords": [
"powerplants",
"renewables"
"renewables",
"coventional",
"storage"
],
"publicationDate": "2022-05-16",
"publicationDate": "2022-12-01",
"context": {
"homepage": "https://www.marktstammdatenregister.de/MaStR/",
"documentation": "https://www.marktstammdatenregister.de/MaStRHilfe/index.html",
@@ -34,7 +48,7 @@
"resolution": "vector"
},
"temporal": {
"referenceDate": "2022-05-16",
"referenceDate": "2022-12-01",
"timeseries": [
{
"start": null,
@@ -154,13 +168,20 @@
"date": "2022-05-16",
"object": "metadata and data",
"comment": "Update metadata and run download"
},
{
"title": "chrwm",
"email": null,
"date": "2022-12-01",
"object": "metadata and data",
"comment": "Update metadata and run bulk download with open-MaStR v0.12.2"
}
],
"resources": [
{
"profile": "tabular-data-resource",
"name": "bnetza_open_mastr_wind",
"path": "bnetza_open_mastr_wind.csv",
"path": "https://doi.org/10.5281/zenodo.7387843",
"format": "csv",
"encoding": "UTF-8",
"schema": {
@@ -3130,14 +3151,14 @@
]
},
"dialect": {
"delimiter": ";",
"delimiter": ",",
"decimalSeparator": "."
}
},
{
"profile": "tabular-data-resource",
"name": "bnetza_open_mastr_hydro",
"path": "bnetza_open_mastr_hydro.csv",
"path": "https://doi.org/10.5281/zenodo.7387843",
"format": "csv",
"encoding": "UTF-8",
"schema": {
@@ -5651,14 +5672,14 @@
]
},
"dialect": {
"delimiter": ";",
"delimiter": ",",
"decimalSeparator": "."
}
},
{
"profile": "tabular-data-resource",
"name": "bnetza_open_mastr_biomass",
"path": "bnetza_open_mastr_biomass.csv",
"path": "https://doi.org/10.5281/zenodo.7387843",
"format": "csv",
"encoding": "UTF-8",
"schema": {
@@ -8381,14 +8402,14 @@
]
},
"dialect": {
"delimiter": ";",
"delimiter": ",",
"decimalSeparator": "."
}
},
{
"profile": "tabular-data-resource",
"name": "bnetza_open_mastr_solar",
"path": "bnetza_open_mastr_solar.csv",
"path": "https://doi.org/10.5281/zenodo.7387843",
"format": "csv",
"encoding": "UTF-8",
"schema": {
@@ -11187,14 +11208,14 @@
]
},
"dialect": {
"delimiter": ";",
"delimiter": ",",
"decimalSeparator": "."
}
},
{
"profile": "tabular-data-resource",
"name": "bnetza_open_mastr_storage",
"path": "bnetza_open_mastr_storage.csv",
"path": "https://doi.org/10.5281/zenodo.7387843",
"format": "csv",
"encoding": "UTF-8",
"schema": {
@@ -13556,14 +13577,14 @@
]
},
"dialect": {
"delimiter": ";",
"delimiter": ",",
"decimalSeparator": "."
}
},
{
"profile": "tabular-data-resource",
"name": "bnetza_open_mastr_combustion",
"path": "bnetza_open_mastr_combustion.csv",
"path": "https://doi.org/10.5281/zenodo.7387843",
"format": "csv",
"encoding": "UTF-8",
"schema": {
@@ -16034,14 +16055,14 @@
]
},
"dialect": {
"delimiter": ";",
"delimiter": ",",
"decimalSeparator": "."
}
},
{
"profile": "tabular-data-resource",
"name": "bnetza_open_mastr_nuclear",
"path": "bnetza_open_mastr_nuclear.csv",
"path": "https://doi.org/10.5281/zenodo.7387843",
"format": "csv",
"encoding": "UTF-8",
"schema": {
@@ -17961,14 +17982,14 @@
]
},
"dialect": {
"delimiter": ";",
"delimiter": ",",
"decimalSeparator": "."
}
},
{
"profile": "tabular-data-resource",
"name": "bnetza_open_mastr_gsgk",
"path": "bnetza_open_mastr_gsgk.csv",
"path": "https://doi.org/10.5281/zenodo.7387843",
"format": "csv",
"encoding": "UTF-8",
"schema": {
@@ -20268,13 +20289,13 @@
]
},
"dialect": {
"delimiter": ";",
"delimiter": ",",
"decimalSeparator": "."
}
}
],
"@id": null,
"@context": null,
"@context": "https://raw.githubusercontent.com/OpenEnergyPlatform/oemetadata/master/metadata/latest/context.json",
"review": {
"path": null,
"badge": null
6 changes: 5 additions & 1 deletion open_mastr/xml_download/utils_cleansing_bulk.py
@@ -5,10 +5,14 @@
columns_replace_list,
)
from zipfile import ZipFile
from open_mastr.utils.config import setup_logger

# setup logger
log = setup_logger()


def cleanse_bulk_data(df: pd.DataFrame, zipped_xml_file_path: str) -> pd.DataFrame:
print("Data is cleansed.")
log.info("Data is cleansed.")
df = replace_ids_with_names(df, system_catalog)
# Katalogeintraege: int -> string value
df = replace_mastr_katalogeintraege(
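setup_logger lives in open_mastr.utils.config; as a rough mental model only (the real helper may configure handlers and log files differently), it behaves like:

import logging

def setup_logger() -> logging.Logger:
    # Hypothetical stand-in for open_mastr.utils.config.setup_logger, shown
    # only to illustrate the module-level logger pattern this PR adopts.
    logger = logging.getLogger("open-MaStR")
    if not logger.handlers:
        handler = logging.StreamHandler()
        handler.setFormatter(
            logging.Formatter("%(asctime)s %(levelname)s %(message)s")
        )
        logger.addHandler(handler)
        logger.setLevel(logging.INFO)
    return logger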
15 changes: 8 additions & 7 deletions open_mastr/xml_download/utils_write_to_database.py
@@ -13,6 +13,7 @@
from open_mastr.xml_download.utils_cleansing_bulk import cleanse_bulk_data
from open_mastr.utils.config import setup_logger

log = setup_logger()

def write_mastr_xml_to_database(
engine: sqlalchemy.engine.Engine,
@@ -38,11 +39,11 @@ def write_mastr_xml_to_database(

if is_first_file(file_name):
create_database_table(engine=engine, xml_tablename=xml_tablename)
print(
log.info(
f"Table '{sql_tablename}' is filled with data '{xml_tablename}' "
"from the bulk download."
)
print(f"File '{file_name}' is parsed.")
log.info(f"File '{file_name}' is parsed.")

df = preprocess_table_for_writing_to_database(
f=f,
@@ -64,7 +65,7 @@ def write_mastr_xml_to_database(
if_exists="append",
engine=engine,
)
print("Bulk download and data cleansing were successful.")
log.info("Bulk download and data cleansing were successful.")


def is_table_relevant(xml_tablename: str, include_tables: list) -> bool:
@@ -289,7 +290,7 @@ def write_single_entries_until_not_unique_comes_up(
len_df_before = len(df)
df = df.drop(labels=key_list, errors="ignore")
df = df.reset_index()
print(f"{len_df_before-len(df)} entries already existed in the database.")
log.info(f"{len_df_before-len(df)} entries already existed in the database.")

return df

@@ -311,7 +312,7 @@ def add_missing_column_to_table(
-------
"""
log = setup_logger()


if engine.name == "postgresql":
missing_column = err.args[0].split("»")[1].split("«")[0]
@@ -337,7 +338,7 @@ def add_missing_column_to_table(

def delete_wrong_xml_entry(err: Error, df: pd.DataFrame) -> None:
delete_entry = str(err).split("«")[0].split("»")[1]
print(f"The entry {delete_entry} was deleted due to its false data type.")
log.info(f"The entry {delete_entry} was deleted due to its false data type.")
df = df.replace(delete_entry, np.nan)


@@ -376,7 +377,7 @@ def handle_xml_syntax_error(data: bytes, err: Error) -> pd.DataFrame:
else:
decoded_data = decoded_data[:start_char] + decoded_data[start_char + 1 :]
df = pd.read_xml(decoded_data)
print("One invalid xml expression was deleted.")
log.info("One invalid xml expression was deleted.")
return df
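The recovery loop above drops one offending character per error and retries pd.read_xml; a tiny self-contained illustration of the idea (contrived input, not the PR's code):

from io import StringIO

import pandas as pd

# A control character makes the XML invalid; dropping it lets the parse succeed.
broken = "<rows><row><EinheitMastrNummer>SEE\x12123</EinheitMastrNummer></row></rows>"
cleaned = broken.replace("\x12", "")  # the real code cuts by the reported position
df = pd.read_xml(StringIO(cleaned))
print(df)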