Skip to content

Commit

Permalink
affiliations: search via ROR API
Browse files Browse the repository at this point in the history
* Adds a new affiliations stream that populates a DB table with ROR API
  results. This table will be used for curation and normalizing the
  affiliations found during record migration

* closes CERNDocumentServer/cds-rdm#216

Co-authored-by: Anika Churilova <[email protected]>
  • Loading branch information
anikachurilova authored and zzacharo committed Oct 25, 2024
1 parent 2eb68f4 commit 2a4ff40
Show file tree
Hide file tree
Showing 20 changed files with 682 additions and 46 deletions.
68 changes: 68 additions & 0 deletions cds_migrator_kit/rdm/migration/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -164,6 +164,27 @@ python copy_collection_files.py --dump-folder /eos/media/cds/cds-rdm/dev/migrati
5. click connect
6. use eos account dev credentials

### Collect and dump affiliation mapping

In order to collect all affiliations from the collection dump folder run the following
command pointing to the `cds_migrator_kit.rdm.migration.data.summer_student_reports.dump`
folder:

```
invenio migration affiliations run --filepath /path/to/cds_migrator_kit/rdm/migration/data/summer_student_reports/dump
```

This will collect and check each affiliation against the ROR organization API, and store them in the `cds_rdm.models.CDSMigrationAffiliationMapping` table.

The model is then used during record migration to normalize affiliations. Each legacy
affiliation input is mapped to a normalized value according to the following rules, in order of precedence:

1. If a curated affiliation exists (with or without a ROR ID), that value is used.
2. An exact ROR match.
3. A non-exact ROR match with a confidence level of >= 90%. The record is also flagged
   for further curation so that the value can be validated.
4. The legacy affiliation value; the record is flagged as well.

#### Openshift migration pod

```shell
Expand All @@ -184,7 +205,9 @@ When the `invenio migration run` command ends it will produce a `rdm_records_sta
{
"legacy_recid": "2884810",
"parent_recid": "zts3q-6ef46",
"parent_object_uuid": "155be22f-3038-49e0-9f17-9518eaac783a",
"latest_version": "1mae4-skq89",
"latest_version_object_uuid": "155be22f-3038-49e0-9f17-9518eaac783a",
"versions": [
{
"new_recid": "1mae4-skq89",
Expand Down Expand Up @@ -258,3 +281,48 @@ reindex_stats(stats_indices)
```

visit https://migration-cds-rdm-dev.app.cern.ch for report

## Rerun migration from a clean state without setting everything up again

If you want to clean up a previous migration run without setting everything up again
(e.g. without repopulating all vocabularies, which takes a lot of time), run the following
recipe:

- Cleanup db tables from pgadmin

```sql
DELETE FROM rdm_versions_state;
DELETE FROM rdm_records_files;
DELETE FROM rdm_drafts_files;
DELETE FROM rdm_records_metadata;
DELETE FROM rdm_drafts_metadata;
DELETE FROM rdm_parents_metadata;
DELETE FROM communities_metadata;
DELETE FROM files_objecttags;
DELETE FROM files_object;
DELETE FROM files_buckettags;
DELETE FROM files_bucket;
DELETE FROM files_files;
DELETE FROM pidstore_pid WHERE pid_type = 'lrecid';
DELETE FROM pidstore_pid WHERE pid_type = 'recid';
```

- Cleanup indexed documents from opensearch

```
POST /cds-rdm-rdmrecords/_delete_by_query
{
"query": {
"match_all": {}
}
}
POST /cds-rdm-communities/_delete_by_query
{
"query": {
"match_all": {}
}
}
```

- Recreate the community and copy the `community_id` in your `streams.yaml` file
- Rerun `invenio migration run`
8 changes: 8 additions & 0 deletions cds_migrator_kit/rdm/migration/affiliations/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,8 @@
# -*- coding: utf-8 -*-
#
# Copyright (C) 2022 CERN.
#
# CDS-RDM is free software; you can redistribute it and/or modify it under
# the terms of the MIT License; see LICENSE file for more details.

"""CDS-RDM migration affiliations module."""
77 changes: 77 additions & 0 deletions cds_migrator_kit/rdm/migration/affiliations/load.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,77 @@
# -*- coding: utf-8 -*-
#
# Copyright (C) 2024 CERN.
#
# CDS-RDM is free software; you can redistribute it and/or modify it under
# the terms of the MIT License; see LICENSE file for more details.

"""CDS-RDM migration load module."""
import logging
import os
import json
import psycopg2

from invenio_db import db
from invenio_rdm_migrator.load.base import Load
from sqlalchemy.exc import IntegrityError

from cds_rdm.models import CDSMigrationAffiliationMapping

from .log import AffiliationsLogger

logger = AffiliationsLogger.get_logger()


class CDSAffiliationsLoad(Load):
    """Load step that stores affiliation/ROR matches in the mapping table.

    Every entry handed to the loader carries two lists,
    ``creators_affiliations`` and ``contributors_affiliations``.  Each item of
    those lists becomes one ``CDSMigrationAffiliationMapping`` row keyed by
    the legacy affiliation input.
    """

    def __init__(
        self,
        dry_run=False,
    ):
        """Constructor.

        :param dry_run: when true, entries are still processed but nothing is
            written to the database.
        """
        self.dry_run = dry_run

    def _prepare(self, entry):
        """Prepare the record."""
        pass

    def _save_affiliation(self, affiliations):
        """Persist one mapping row per affiliation.

        :param affiliations: iterable of dicts.  Each dict must contain
            ``original_input`` and either a truthy ``ror_exact_match`` or a
            ``ror_not_exact_match`` value.
        :raises KeyError: if ``original_input`` is missing, or if neither
            match key is usable.
        """
        for affiliation in affiliations:
            # Read instead of pop: the caller's dict is left untouched, so the
            # same entry can safely be processed again.
            legacy_input = affiliation["original_input"]
            exact_match = affiliation.get("ror_exact_match")
            if exact_match:
                mapping = CDSMigrationAffiliationMapping(
                    legacy_affiliation_input=legacy_input,
                    ror_exact_match=exact_match,
                )
            else:
                mapping = CDSMigrationAffiliationMapping(
                    legacy_affiliation_input=legacy_input,
                    ror_not_exact_match=affiliation["ror_not_exact_match"],
                )

            if self.dry_run:
                # The entry was validated above; a dry run must not write.
                continue

            try:
                db.session.add(mapping)
                db.session.commit()
            except IntegrityError as e:
                db.session.rollback()
                # We continue when the legacy affiliation input is already in the db
                if isinstance(e.orig, psycopg2.errors.UniqueViolation):
                    continue
                # Any other constraint failure is unexpected: report it
                # instead of dropping the affiliation silently.
                logger.error(
                    "Could not store affiliation %r: %s", legacy_input, e
                )
            except Exception:
                # Leave the session usable for the next entries before the
                # error is reported by the caller.
                db.session.rollback()
                raise

    def _load(self, entry):
        """Store the creators and contributors affiliations of an entry.

        Falsy entries are ignored.  Errors raised while saving are logged and
        not propagated, so one bad entry does not stop the stream.
        """
        if entry:
            creators_affiliations = entry["creators_affiliations"]
            contributors_affiliations = entry["contributors_affiliations"]
            try:
                self._save_affiliation(creators_affiliations)
                self._save_affiliation(contributors_affiliations)
            except Exception as ex:
                logger.error(ex)

    def _cleanup(self, *args, **kwargs):
        """Cleanup the entries."""
        pass
42 changes: 42 additions & 0 deletions cds_migrator_kit/rdm/migration/affiliations/log.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,42 @@
# -*- coding: utf-8 -*-
#
# Copyright (C) 2022 CERN.
#
# CDS-RDM is free software; you can redistribute it and/or modify it under
# the terms of the MIT License; see LICENSE file for more details.

"""CDS-RDM migration affiliations logger module."""

import logging


class AffiliationsLogger:
    """Migrator affiliations logger."""

    @classmethod
    def initialize(cls, log_dir):
        """Attach the file and stream handlers to the affiliations logger.

        Calling this more than once is safe: handlers attached earlier are
        closed and detached first, so messages are never written twice and
        file handles are not leaked.

        :param log_dir: existing directory (``pathlib.Path`` or string) in
            which ``matched.log`` and ``unmatched.log`` are created.
        """
        from pathlib import Path

        log_dir = Path(log_dir)
        formatter = logging.Formatter(
            fmt="%(asctime)s %(levelname)-8s %(message)s", datefmt="%Y-%m-%d %H:%M:%S"
        )
        logger = cls.get_logger()

        # Drop handlers from a previous initialization before adding new ones.
        for stale_handler in list(logger.handlers):
            logger.removeHandler(stale_handler)
            stale_handler.close()

        # Records below WARNING are discarded by the logger itself, before
        # any handler sees them.
        logger.setLevel(logging.WARNING)

        # every emitted record, written as the raw message (no formatter)
        fh = logging.FileHandler(log_dir / "matched.log")
        logger.addHandler(fh)

        # errors to file
        fh = logging.FileHandler(log_dir / "unmatched.log")
        fh.setLevel(logging.ERROR)
        fh.setFormatter(formatter)
        logger.addHandler(fh)

        # stream handler; its INFO threshold only takes effect for records
        # that already passed the logger level set above
        sh = logging.StreamHandler()
        sh.setFormatter(formatter)
        sh.setLevel(logging.INFO)
        logger.addHandler(sh)

    @classmethod
    def get_logger(cls):
        """Get migration logger."""
        return logging.getLogger("affiliations-migrator")
42 changes: 42 additions & 0 deletions cds_migrator_kit/rdm/migration/affiliations/runner.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,42 @@
# -*- coding: utf-8 -*-
#
# Copyright (C) 2022 CERN.
#
# Invenio-RDM-Migrator is free software; you can redistribute it and/or modify
# it under the terms of the MIT License; see LICENSE file for more details.

"""InvenioRDM migration streams runner."""

from pathlib import Path

from invenio_rdm_migrator.streams import Stream

from cds_migrator_kit.rdm.migration.affiliations.log import AffiliationsLogger


class RecordAffiliationsRunner:
    """Builds and runs the affiliations ETL stream."""

    def __init__(self, stream_definition, filepath, log_dir, dry_run):
        """Create the log directory, set up logging and assemble the stream.

        :param stream_definition: object exposing ``name``, ``extract_cls``,
            ``transform_cls`` and ``load_cls``.
        :param filepath: passed to the extract class constructor.
        :param log_dir: directory for the log files; created if missing.
        :param dry_run: forwarded to the load class as ``dry_run``.
        """
        logs_path = Path(log_dir)
        logs_path.mkdir(parents=True, exist_ok=True)
        self.log_dir = logs_path

        AffiliationsLogger.initialize(logs_path)

        stream_name = stream_definition.name
        extract_step = stream_definition.extract_cls(filepath)
        transform_step = stream_definition.transform_cls()
        load_step = stream_definition.load_cls(dry_run=dry_run)
        self.stream = Stream(
            stream_name,
            extract=extract_step,
            transform=transform_step,
            load=load_step,
        )

    def run(self):
        """Run the stream, logging (not raising) any failure with traceback."""
        try:
            self.stream.run()
        except Exception:
            # ``exception`` records the active traceback on its own.
            failure_logger = AffiliationsLogger.get_logger()
            failure_logger.exception(f"Stream {self.stream.name} failed.")
23 changes: 23 additions & 0 deletions cds_migrator_kit/rdm/migration/affiliations/streams.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,23 @@
# -*- coding: utf-8 -*-
#
# Copyright (C) 2022 CERN.
#
# CDS-RDM is free software; you can redistribute it and/or modify it under
# the terms of the MIT License; see LICENSE file for more details.

"""CDS-RDM migration streams module."""
from invenio_rdm_migrator.streams import StreamDefinition
from invenio_rdm_migrator.transform import IdentityTransform

from cds_migrator_kit.rdm.migration.extract import LegacyExtract

from .load import CDSAffiliationsLoad
from .transform import CDSToRDMAffiliationTransform

AffiliationsStreamDefinition = StreamDefinition(
    name="affiliations",
    extract_cls=LegacyExtract,
    transform_cls=CDSToRDMAffiliationTransform,
    load_cls=CDSAffiliationsLoad,
)
"""ETL stream that collects legacy CDS record affiliations for ROR mapping."""
Loading

0 comments on commit 2a4ff40

Please sign in to comment.