-
Notifications
You must be signed in to change notification settings - Fork 7
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
* Adds a new affiliations stream that populates a DB table with ROR API results. This table will be used for curation and normalizing the affiliations found during record migration * closes CERNDocumentServer/cds-rdm#216 Co-authored-by: Anika Churilova <[email protected]>
- Loading branch information
1 parent
2eb68f4
commit 2a4ff40
Showing
20 changed files
with
682 additions
and
46 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,8 @@ | ||
# -*- coding: utf-8 -*- | ||
# | ||
# Copyright (C) 2022 CERN. | ||
# | ||
# CDS-RDM is free software; you can redistribute it and/or modify it under | ||
# the terms of the MIT License; see LICENSE file for more details. | ||
|
||
"""CDS-RDM migration affiliations module.""" |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,77 @@ | ||
# -*- coding: utf-8 -*- | ||
# | ||
# Copyright (C) 2024 CERN. | ||
# | ||
# CDS-RDM is free software; you can redistribute it and/or modify it under | ||
# the terms of the MIT License; see LICENSE file for more details. | ||
|
||
"""CDS-RDM migration load module.""" | ||
import logging | ||
import os | ||
import json | ||
import psycopg2 | ||
|
||
from invenio_db import db | ||
from invenio_rdm_migrator.load.base import Load | ||
from sqlalchemy.exc import IntegrityError | ||
|
||
from cds_rdm.models import CDSMigrationAffiliationMapping | ||
|
||
from .log import AffiliationsLogger | ||
|
||
logger = AffiliationsLogger.get_logger() | ||
|
||
|
||
class CDSAffiliationsLoad(Load):
    """Load step of the affiliations stream.

    Stores the affiliation matches carried by each entry as
    ``CDSMigrationAffiliationMapping`` rows, one row per legacy affiliation
    input.
    """

    def __init__(
        self,
        dry_run=False,
    ):
        """Constructor.

        :param dry_run: flag kept on the instance.
            NOTE(review): nothing in this class reads ``self.dry_run``, so
            rows are committed even when it is set -- confirm whether the
            load is expected to skip database writes in dry-run mode.
        """
        self.dry_run = dry_run

    def _prepare(self, entry):
        """Prepare the record (no preparation is done for affiliations)."""
        pass

    def _save_affiliation(self, affiliations):
        """Create one mapping row per affiliation and commit it.

        Each row is committed on its own so that a failing row does not
        discard the rows saved before it.

        :param affiliations: iterable of dicts. Each dict must hold an
            ``original_input`` key and either a truthy ``ror_exact_match``
            or a ``ror_not_exact_match`` key.
        :raises KeyError: when ``original_input`` is missing, or when neither
            match key is usable.
        """
        for affiliation in affiliations:
            # Read the key instead of popping it: the caller's dict stays
            # intact, so the same dict can safely show up in more than one
            # list (e.g. creators and contributors).
            _original_input = affiliation["original_input"]
            try:
                if affiliation.get("ror_exact_match"):
                    _affiliation_model = CDSMigrationAffiliationMapping(
                        legacy_affiliation_input=_original_input,
                        ror_exact_match=affiliation["ror_exact_match"],
                    )
                else:
                    _affiliation_model = CDSMigrationAffiliationMapping(
                        legacy_affiliation_input=_original_input,
                        ror_not_exact_match=affiliation["ror_not_exact_match"],
                    )
                db.session.add(_affiliation_model)
                db.session.commit()
            except IntegrityError as e:
                # The session is unusable after a failed commit until it is
                # rolled back.
                db.session.rollback()
                # We continue when the legacy affiliation input is already in the db
                if isinstance(e.orig, psycopg2.errors.UniqueViolation):
                    continue
                # Any other integrity problem used to vanish silently; keep
                # going with the remaining affiliations but leave a trace.
                logger.error(
                    "Could not save affiliation mapping for %r: %s",
                    _original_input,
                    e,
                )

    def _load(self, entry):
        """Save the creators and contributors affiliations of an entry.

        Empty entries are ignored. Exceptions raised while saving are logged
        and not propagated.

        :param entry: dict with ``creators_affiliations`` and
            ``contributors_affiliations`` keys.
        """
        if not entry:
            return

        creators_affiliations = entry["creators_affiliations"]
        contributors_affiliations = entry["contributors_affiliations"]
        try:
            self._save_affiliation(creators_affiliations)
            self._save_affiliation(contributors_affiliations)
        except Exception as ex:
            logger.error(ex)

    def _cleanup(self, *args, **kwargs):
        """Cleanup the entries (nothing to clean up)."""
        pass
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,42 @@ | ||
# -*- coding: utf-8 -*- | ||
# | ||
# Copyright (C) 2022 CERN. | ||
# | ||
# CDS-RDM is free software; you can redistribute it and/or modify it under | ||
# the terms of the MIT License; see LICENSE file for more details. | ||
|
||
"""CDS-RDM migration affiliations logger module.""" | ||
|
||
import logging | ||
|
||
|
||
class AffiliationsLogger:
    """Migrator affiliations logger."""

    @classmethod
    def initialize(cls, log_dir):
        """Configure the ``affiliations-migrator`` logger.

        Three handlers are attached: ``matched.log`` (no level or formatter
        of its own), ``unmatched.log`` (ERROR and above) and a stream handler.
        Calling this method again replaces the handlers attached to the
        logger instead of stacking new ones, so messages are not duplicated.

        :param log_dir: existing directory for the log files, given as a
            ``pathlib.Path`` or as a string path.
        """
        # Local import: this module only imports ``logging`` at the top.
        from pathlib import Path

        # Accept plain strings too; ``str / str`` would raise TypeError.
        log_dir = Path(log_dir)

        formatter = logging.Formatter(
            fmt="%(asctime)s %(levelname)-8s %(message)s", datefmt="%Y-%m-%d %H:%M:%S"
        )
        logger = logging.getLogger("affiliations-migrator")

        # Drop (and close) whatever is currently attached to this logger so
        # that repeated initialization neither duplicates every log line nor
        # leaks open file handles.
        for handler in list(logger.handlers):
            logger.removeHandler(handler)
            handler.close()

        # The logger itself filters at WARNING, so records below WARNING
        # never reach any of the handlers below.
        logger.setLevel(logging.WARNING)

        # NOTE(review): this handler has no formatter, so it writes the bare
        # message -- confirm that this is intended.
        fh = logging.FileHandler(log_dir / "matched.log")
        logger.addHandler(fh)

        # errors to file
        fh = logging.FileHandler(log_dir / "unmatched.log")
        fh.setLevel(logging.ERROR)
        fh.setFormatter(formatter)
        logger.addHandler(fh)

        # info to stream/stdout
        sh = logging.StreamHandler()
        sh.setFormatter(formatter)
        sh.setLevel(logging.INFO)
        logger.addHandler(sh)

    @classmethod
    def get_logger(cls):
        """Get migration logger."""
        return logging.getLogger("affiliations-migrator")
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,42 @@ | ||
# -*- coding: utf-8 -*- | ||
# | ||
# Copyright (C) 2022 CERN. | ||
# | ||
# Invenio-RDM-Migrator is free software; you can redistribute it and/or modify | ||
# it under the terms of the MIT License; see LICENSE file for more details. | ||
|
||
"""InvenioRDM migration streams runner.""" | ||
|
||
from pathlib import Path | ||
|
||
from invenio_rdm_migrator.streams import Stream | ||
|
||
from cds_migrator_kit.rdm.migration.affiliations.log import AffiliationsLogger | ||
|
||
|
||
class RecordAffiliationsRunner:
    """ETL stream runner for the affiliations stream."""

    def __init__(self, stream_definition, filepath, log_dir, dry_run):
        """Constructor.

        Creates the log directory, initializes the affiliations logger and
        builds the stream from the given definition.

        :param stream_definition: object exposing ``name``, ``extract_cls``,
            ``transform_cls`` and ``load_cls``.
        :param filepath: passed to ``extract_cls``.
        :param log_dir: directory for the log files; created when missing.
        :param dry_run: passed to ``load_cls`` as ``dry_run``.
        """
        self.log_dir = Path(log_dir)
        self.log_dir.mkdir(parents=True, exist_ok=True)

        # The log files live in ``log_dir``, so it has to exist before the
        # logger is initialized.
        AffiliationsLogger.initialize(self.log_dir)

        self.stream = Stream(
            stream_definition.name,
            extract=stream_definition.extract_cls(filepath),
            transform=stream_definition.transform_cls(),
            load=stream_definition.load_cls(dry_run=dry_run),
        )

    def run(self):
        """Run the affiliations ETL stream.

        A failure of the stream is logged with its traceback and is not
        re-raised.
        """
        try:
            self.stream.run()
        except Exception:
            # ``exception()`` already records the active traceback.
            AffiliationsLogger.get_logger().exception(
                f"Stream {self.stream.name} failed."
            )
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,23 @@ | ||
# -*- coding: utf-8 -*- | ||
# | ||
# Copyright (C) 2022 CERN. | ||
# | ||
# CDS-RDM is free software; you can redistribute it and/or modify it under | ||
# the terms of the MIT License; see LICENSE file for more details. | ||
|
||
"""CDS-RDM migration streams module.""" | ||
from invenio_rdm_migrator.streams import StreamDefinition | ||
from invenio_rdm_migrator.transform import IdentityTransform | ||
|
||
from cds_migrator_kit.rdm.migration.extract import LegacyExtract | ||
|
||
from .load import CDSAffiliationsLoad | ||
from .transform import CDSToRDMAffiliationTransform | ||
|
||
AffiliationsStreamDefinition = StreamDefinition(
    name="affiliations",
    extract_cls=LegacyExtract,
    transform_cls=CDSToRDMAffiliationTransform,
    load_cls=CDSAffiliationsLoad,
)
"""ETL stream for CDS to RDM record affiliations."""
Oops, something went wrong.