diff --git a/open_mastr/soap_api/metadata/create.py b/open_mastr/soap_api/metadata/create.py index fb254339..0dfb8185 100644 --- a/open_mastr/soap_api/metadata/create.py +++ b/open_mastr/soap_api/metadata/create.py @@ -69,8 +69,9 @@ def datapackag_base(reference_date, publication_date=None, statistik_flag=None): "id": str(uuid.uuid4()), "description": f"Raw data download Marktstammdatenregister (MaStR) data using the webservice.\n\n{description_extra}", "language": ["en-GB", "de-DE"], + "subject": [{"name": None, "path": None}], "keywords": ["powerplants", "renewables"], - "created": publication_date, + "publicationDate": publication_date, "version": data_version, "context": { "homepage": "https://www.marktstammdatenregister.de/MaStR/", @@ -85,13 +86,15 @@ def datapackag_base(reference_date, publication_date=None, statistik_flag=None): "spatial": {"location": None, "extent": "Germany", "resolution": "vector"}, "temporal": { "referenceDate": reference_date.strftime("%Y-%m-%d %H:%M:%S"), - "timeseries": { + "timeseries": [ + { "start": None, "end": None, "resolution": None, "alignment": None, "aggregationType": None, - }, + } + ] }, "sources": [ { @@ -106,7 +109,7 @@ def datapackag_base(reference_date, publication_date=None, statistik_flag=None): "instruction": "You are free: To Share, To Create, To Adapt; As long as you: Attribute", "attribution": f"© Marktstammdatenregister {datetime.date.today().year} | dl-de/by-2-0", } - ], + ] }, { "title": "RLI - open_MaStR", @@ -120,8 +123,8 @@ def datapackag_base(reference_date, publication_date=None, statistik_flag=None): "instruction": "You are free: To Share, To Create, To Adapt; As long as you: Attribute, Share-Alike, Keep open!", "attribution": "open_MaStR © Reiner Lemoine Institut | AGPL-3.0", } - ], - }, + ] + } ], "licenses": [ { @@ -134,30 +137,16 @@ def datapackag_base(reference_date, publication_date=None, statistik_flag=None): ], "contributors": [ { - "title": "Ludee", + "title": None, "email": None, - "path": "https://github.com/ludee", - "role": "maintainer", - "organization": "Reiner Lemoine Institut gGmbH", - }, - { - "title": "Guido Pleßmann", - "email": None, - "path": "https://gplssm.de", - "role": "maintainer", - "organization": "Reiner Lemoine Institut gGmbH", - }, - { - "title": "oakca", - "email": None, - "path": "https://github.com/oakca", - "role": "contributor", - "organization": "Reiner Lemoine Institut gGmbH", - }, + "date": None, + "object": None, + "comment": None + } ], "review": {"path": None, "badge": None}, "metaMetadata": { - "metadataVersion": "OEP-1.4.0", + "metadataVersion": "OEP-1.5.2", "metadataLicense": { "name": "CC0-1.0", "title": "Creative Commons Zero v1.0 Universal", @@ -172,6 +161,7 @@ def datapackag_base(reference_date, publication_date=None, statistik_flag=None): "licenses": "License name must follow the SPDX License List (https://spdx.org/licenses/)", "review": "Following the OEP Data Review (https://github.com/OpenEnergyPlatform/data-preprocessing/wiki)", "null": "If not applicable use (null)", + "todo": "If a value ist not yet available, use: todo" }, } @@ -253,35 +243,29 @@ def create_datapackage_meta_json( resource = { "profile": "tabular-data-resource", "name": f"bnetza_mastr_{tech}_raw", - "title": f"open-MaStR {tech} units (raw)", "path": filenames["raw"][tech]["joined"], - "scheme": "file", + "format": "csv", "encoding": "utf-8", - "mediatype": "text/csv", "schema": { "fields": raw_fields, "primaryKey": ["EinheitMastrNummer"], }, "dialect": {"delimiter": ","}, } - resources_meta["resources"].append(resource) if "cleaned" in data: resource = { "profile": "tabular-data-resource", "name": f"bnetza_mastr_{tech}_cleaned", - "title": f"open-MaStR {tech} units (cleaned)", "path": filenames["cleaned"][tech], - "scheme": "file", + "format": "csv", "encoding": "utf-8", - "mediatype": "text/csv", "schema": { "fields": raw_fields, "primaryKey": ["EinheitMastrNummer"], }, "dialect": {"delimiter": ","}, } - resources_meta["resources"].append(resource) if "postprocessed" in data: processed_fields = [ @@ -289,14 +273,14 @@ def create_datapackage_meta_json( "name": "geom", "unit": None, "type": "str", - "desciption": "Standort der Anlage als Punktgeometrie im WKB Format", + "description": "Standort der Anlage als Punktgeometrie im WKB Format", "examples": "0101000020e610000071fbe59315131c40a2b437f8c20e4a40", }, { "name": "comment", "unit": None, "type": "str", - "desciption": "Information about data post-processing", + "description": "Information about data post-processing", "examples": "has_geom; outside_vg250", }, ] @@ -306,7 +290,7 @@ def create_datapackage_meta_json( "name": "tags", "unit": None, "type": "json", - "desciption": "Data insights and report about post-processing steps", + "description": "Data insights and report about post-processing steps", "examples": { "plz_check": False, "processed": True, @@ -319,18 +303,16 @@ def create_datapackage_meta_json( "name": "geom", "unit": None, "type": "str", - "desciption": "Standort der Anlage als Punktgeometrie im WKB Format (EPSG 3035)", + "description": "Standort der Anlage als Punktgeometrie im WKB Format (EPSG 3035)", "examples": "0101000020e610000071fbe59315131c40a2b437f8c20e4a40", } ) resource = { "profile": "tabular-data-resource", "name": f"bnetza_mastr_{tech}", - "title": f"open-MaStR {tech} units", "path": filenames["postprocessed"][tech], - "scheme": "file", + "format": "csv", "encoding": "utf-8", - "mediatype": "text/csv", "schema": { "fields": raw_fields + processed_fields, "primaryKey": ["EinheitMastrNummer"], diff --git a/open_mastr/soap_api/metadata/description.py b/open_mastr/soap_api/metadata/description.py index 8fc55526..e3b52827 100644 --- a/open_mastr/soap_api/metadata/description.py +++ b/open_mastr/soap_api/metadata/description.py @@ -33,19 +33,19 @@ def __init__(self, xml=None): self.xml = fh.read() else: # If no XML file is given, the file is read from an URL - zipurl = 'https://www.marktstammdatenregister.de/MaStRHilfe/files/' \ - 'webdienst/Dienstbeschreibung_1_2_39_Produktion.zip' + zipurl = "https://www.marktstammdatenregister.de/MaStRHilfe/files/webdienst/" \ + "Dienstbeschreibung_Produktion_Version" \ + "1.2.87" \ # update version here + ".zip" with urlopen(zipurl) as zipresp: with ZipFile(BytesIO(zipresp.read())) as zfile: - self.xml = zfile.read('xsd/mastrbasetypes.xsd') - - + self.xml = zfile.read("xsd/mastrbasetypes.xsd") # Parse XML and extract relevant data parsed = xmltodict.parse(self.xml, process_namespaces=False) - self.complex_types = parsed['schema']["complexType"] - self.simple_types = parsed['schema']["simpleType"] + self.complex_types = parsed["schema"]["complexType"] + self.simple_types = parsed["schema"]["simpleType"] # Prepare parsed data for documentational purposes abstract_types, parameters, responses, types = self._filter_type_descriptions() @@ -78,13 +78,17 @@ def _filter_type_descriptions(self): raise ValueError("Ohh...") else: # Filter all functions - if item["@name"].startswith(("Get", "Set", "Erneute", "Verschiebe", "Delete")): + if item["@name"].startswith( + ("Get", "Set", "Erneute", "Verschiebe", "Delete") + ): functions.append(item) # Further split the list of functions into paramters and responses if item["@name"].endswith("Parameter"): if "complexContent" in item.keys(): - parameters[item["@name"]] = item["complexContent"]["extension"] + parameters[item["@name"]] = item["complexContent"][ + "extension" + ] else: parameters[item["@name"]] = item elif item["@name"].endswith("Antwort"): @@ -111,12 +115,14 @@ def prepare_simple_type(self): for simple_type in self.simple_types: if "enumeration" in simple_type["restriction"]: - possible_values = [_["@value"] for _ in simple_type["restriction"]["enumeration"]] + possible_values = [ + _["@value"] for _ in simple_type["restriction"]["enumeration"] + ] else: possible_values = [] simple_types_doc[simple_type["@name"]] = { "type": simple_type["restriction"]["@base"], - "values": possible_values + "values": possible_values, } return simple_types_doc @@ -140,7 +146,9 @@ def functions_data_documentation(self): if "annotation" in fcn["sequence"]["element"]: fcn_data = [fcn["sequence"]["element"]] else: - fcn_data = self.types[fcn["sequence"]["element"]["@type"].split(":")[1]]["sequence"]["element"] + fcn_data = self.types[ + fcn["sequence"]["element"]["@type"].split(":")[1] + ]["sequence"]["element"] else: print(type(fcn["sequence"])) print(fcn["sequence"]) @@ -148,41 +156,51 @@ def functions_data_documentation(self): # Add data for inherited columns from base types if "@base" in fcn: - if not fcn["@base"] == 'mastr:AntwortBasis': - fcn_data = _collect_columns_of_base_type(self.types, fcn["@base"].split(":")[1], fcn_data) + if not fcn["@base"] == "mastr:AntwortBasis": + fcn_data = _collect_columns_of_base_type( + self.types, fcn["@base"].split(":")[1], fcn_data + ) function_docs[fcn_name] = {} for column in fcn_data: # Replace MaStR internal types with more general ones if column["@type"].startswith("mastr:"): try: - column_type = self.simple_types_prepared[column["@type"].split(":")[1]]["type"] + column_type = self.simple_types_prepared[ + column["@type"].split(":")[1] + ]["type"] except KeyError: column_type = column["@type"] else: column_type = column["@type"] if "annotation" in column.keys(): - description = column["annotation"]["documentation"].get("#text", None) + description = column["annotation"]["documentation"].get( + "#text", None + ) if description: - description = re.sub(" +", " ", description.replace("\n", "")) + description = re.sub( + " +", " ", description.replace("\n", "") + ) function_docs[fcn_name][column["@name"]] = { - "type": column_type, - "description": description, - "example": column["annotation"]["documentation"].get("m-ex", None) + "type": column_type, + "description": description, + "example": column["annotation"]["documentation"].get( + "m-ex", None + ), } else: function_docs[fcn_name][column["@name"]] = { "type": column_type, # TODO: insert information from simple type here "description": None, - "example": None + "example": None, } # Hack in a descrition for a column that gets created after download while flattening data function_docs["GetEinheitWind"]["HerstellerId"] = { "type": "str", "description": "Id des Herstellers der Einheit", - "example": 923 + "example": 923, } return function_docs @@ -193,7 +211,11 @@ def _collect_columns_of_base_type(base_types, base_type_name, fcn_data): fcn_data += type_description["extension"]["sequence"]["element"] if "@base" in type_description["extension"]: - if not type_description["extension"]["@base"] == 'mastr:AntwortBasis': - fcn_data = _collect_columns_of_base_type(base_types, type_description["extension"]["@base"].split(":")[1], fcn_data) + if not type_description["extension"]["@base"] == "mastr:AntwortBasis": + fcn_data = _collect_columns_of_base_type( + base_types, + type_description["extension"]["@base"].split(":")[1], + fcn_data, + ) return fcn_data diff --git a/open_mastr/soap_api/metadata/mastr_datapackage.json b/open_mastr/soap_api/metadata/mastr_datapackage.json index d75e32b6..838ee15c 100644 --- a/open_mastr/soap_api/metadata/mastr_datapackage.json +++ b/open_mastr/soap_api/metadata/mastr_datapackage.json @@ -9,15 +9,29 @@ ], "subject": [ { - "name": null, - "path": null + "name": "power plant", + "path": "http://openenergy-platform.org/ontology/oeo/OEO_00000031" + }, + { + "name": "renewable", + "path": "http://openenergy-platform.org/ontology/oeo/OEO_00030004" + }, + { + "name": "conventional", + "path": "http://openenergy-platform.org/ontology/oeo/OEO_00020147" + }, + { + "name": "energy storage object", + "path": "http://openenergy-platform.org/ontology/oeo/OEO_00000159" } ], "keywords": [ "powerplants", - "renewables" + "renewables", + "coventional", + "storage" ], - "publicationDate": "2022-05-16", + "publicationDate": "2022-12-01", "context": { "homepage": "https://www.marktstammdatenregister.de/MaStR/", "documentation": "https://www.marktstammdatenregister.de/MaStRHilfe/index.html", @@ -34,7 +48,7 @@ "resolution": "vector" }, "temporal": { - "referenceDate": "2022-05-16", + "referenceDate": "2022-12-01", "timeseries": [ { "start": null, @@ -154,13 +168,20 @@ "date": "2022-05-16", "object": "metadata and data", "comment": "Update metadata and run download" + }, + { + "title": "chrwm", + "email": null, + "date": "2022-12-01", + "object": "metadata and data", + "comment": "Update metadata and run bulk download with open-MaStR v0.12.2" } ], "resources": [ { "profile": "tabular-data-resource", "name": "bnetza_open_mastr_wind", - "path": "bnetza_open_mastr_wind.csv", + "path": "https://doi.org/10.5281/zenodo.7387843", "format": "csv", "encoding": "UTF-8", "schema": { @@ -3130,14 +3151,14 @@ ] }, "dialect": { - "delimiter": ";", + "delimiter": ",", "decimalSeparator": "." } }, { "profile": "tabular-data-resource", "name": "bnetza_open_mastr_hydro", - "path": "bnetza_open_mastr_hydro.csv", + "path": "https://doi.org/10.5281/zenodo.7387843", "format": "csv", "encoding": "UTF-8", "schema": { @@ -5651,14 +5672,14 @@ ] }, "dialect": { - "delimiter": ";", + "delimiter": ",", "decimalSeparator": "." } }, { "profile": "tabular-data-resource", "name": "bnetza_open_mastr_biomass", - "path": "bnetza_open_mastr_biomass.csv", + "path": "https://doi.org/10.5281/zenodo.7387843", "format": "csv", "encoding": "UTF-8", "schema": { @@ -8381,14 +8402,14 @@ ] }, "dialect": { - "delimiter": ";", + "delimiter": ",", "decimalSeparator": "." } }, { "profile": "tabular-data-resource", "name": "bnetza_open_mastr_solar", - "path": "bnetza_open_mastr_solar.csv", + "path": "https://doi.org/10.5281/zenodo.7387843", "format": "csv", "encoding": "UTF-8", "schema": { @@ -11187,14 +11208,14 @@ ] }, "dialect": { - "delimiter": ";", + "delimiter": ",", "decimalSeparator": "." } }, { "profile": "tabular-data-resource", "name": "bnetza_open_mastr_storage", - "path": "bnetza_open_mastr_storage.csv", + "path": "https://doi.org/10.5281/zenodo.7387843", "format": "csv", "encoding": "UTF-8", "schema": { @@ -13556,14 +13577,14 @@ ] }, "dialect": { - "delimiter": ";", + "delimiter": ",", "decimalSeparator": "." } }, { "profile": "tabular-data-resource", "name": "bnetza_open_mastr_combustion", - "path": "bnetza_open_mastr_combustion.csv", + "path": "https://doi.org/10.5281/zenodo.7387843", "format": "csv", "encoding": "UTF-8", "schema": { @@ -16034,14 +16055,14 @@ ] }, "dialect": { - "delimiter": ";", + "delimiter": ",", "decimalSeparator": "." } }, { "profile": "tabular-data-resource", "name": "bnetza_open_mastr_nuclear", - "path": "bnetza_open_mastr_nuclear.csv", + "path": "https://doi.org/10.5281/zenodo.7387843", "format": "csv", "encoding": "UTF-8", "schema": { @@ -17961,14 +17982,14 @@ ] }, "dialect": { - "delimiter": ";", + "delimiter": ",", "decimalSeparator": "." } }, { "profile": "tabular-data-resource", "name": "bnetza_open_mastr_gsgk", - "path": "bnetza_open_mastr_gsgk.csv", + "path": "https://doi.org/10.5281/zenodo.7387843", "format": "csv", "encoding": "UTF-8", "schema": { @@ -20268,13 +20289,13 @@ ] }, "dialect": { - "delimiter": ";", + "delimiter": ",", "decimalSeparator": "." } } ], "@id": null, - "@context": null, + "@context": "https://raw.githubusercontent.com/OpenEnergyPlatform/oemetadata/master/metadata/latest/context.json", "review": { "path": null, "badge": null diff --git a/open_mastr/xml_download/utils_cleansing_bulk.py b/open_mastr/xml_download/utils_cleansing_bulk.py index c0d857b4..0448adb0 100644 --- a/open_mastr/xml_download/utils_cleansing_bulk.py +++ b/open_mastr/xml_download/utils_cleansing_bulk.py @@ -5,10 +5,14 @@ columns_replace_list, ) from zipfile import ZipFile +from open_mastr.utils.config import setup_logger + +# setup logger +log = setup_logger() def cleanse_bulk_data(df: pd.DataFrame, zipped_xml_file_path: str) -> pd.DataFrame: - print("Data is cleansed.") + log.info("Data is cleansed.") df = replace_ids_with_names(df, system_catalog) # Katalogeintraege: int -> string value df = replace_mastr_katalogeintraege( diff --git a/open_mastr/xml_download/utils_write_to_database.py b/open_mastr/xml_download/utils_write_to_database.py index f29c1b0e..a461725b 100644 --- a/open_mastr/xml_download/utils_write_to_database.py +++ b/open_mastr/xml_download/utils_write_to_database.py @@ -13,6 +13,7 @@ from open_mastr.xml_download.utils_cleansing_bulk import cleanse_bulk_data from open_mastr.utils.config import setup_logger +log = setup_logger() def write_mastr_xml_to_database( engine: sqlalchemy.engine.Engine, @@ -38,11 +39,11 @@ def write_mastr_xml_to_database( if is_first_file(file_name): create_database_table(engine=engine, xml_tablename=xml_tablename) - print( + log.info( f"Table '{sql_tablename}' is filled with data '{xml_tablename}' " "from the bulk download." ) - print(f"File '{file_name}' is parsed.") + log.info(f"File '{file_name}' is parsed.") df = preprocess_table_for_writing_to_database( f=f, @@ -64,7 +65,7 @@ def write_mastr_xml_to_database( if_exists="append", engine=engine, ) - print("Bulk download and data cleansing were successful.") + log.info("Bulk download and data cleansing were successful.") def is_table_relevant(xml_tablename: str, include_tables: list) -> bool: @@ -289,7 +290,7 @@ def write_single_entries_until_not_unique_comes_up( len_df_before = len(df) df = df.drop(labels=key_list, errors="ignore") df = df.reset_index() - print(f"{len_df_before-len(df)} entries already existed in the database.") + log.info(f"{len_df_before-len(df)} entries already existed in the database.") return df @@ -311,7 +312,7 @@ def add_missing_column_to_table( ------- """ - log = setup_logger() + if engine.name == "postgresql": missing_column = err.args[0].split("»")[1].split("«")[0] @@ -337,7 +338,7 @@ def add_missing_column_to_table( def delete_wrong_xml_entry(err: Error, df: pd.DataFrame) -> None: delete_entry = str(err).split("«")[0].split("»")[1] - print(f"The entry {delete_entry} was deleted due to its false data type.") + log.info(f"The entry {delete_entry} was deleted due to its false data type.") df = df.replace(delete_entry, np.nan) @@ -376,7 +377,7 @@ def handle_xml_syntax_error(data: bytes, err: Error) -> pd.DataFrame: else: decoded_data = decoded_data[:start_char] + decoded_data[start_char + 1 :] df = pd.read_xml(decoded_data) - print("One invalid xml expression was deleted.") + log.info("One invalid xml expression was deleted.") return df