Skip to content

Commit

Permalink
users: re-create inactive users for record owner
Browse files Browse the repository at this point in the history
  • Loading branch information
kpsherva committed Oct 2, 2024
1 parent beaae08 commit 17b061d
Show file tree
Hide file tree
Showing 15 changed files with 477 additions and 266 deletions.
4 changes: 3 additions & 1 deletion cds_migrator_kit/rdm/migration/cli.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,8 @@
from flask.cli import with_appcontext

from cds_migrator_kit.rdm.migration.runner import Runner
from cds_migrator_kit.rdm.migration.streams import RecordStreamDefinition
from cds_migrator_kit.rdm.migration.streams import RecordStreamDefinition, \
UserStreamDefinition

cli_logger = logging.getLogger("migrator")

Expand All @@ -36,6 +37,7 @@ def run(dry_run=False):
stream_config = current_app.config["CDS_MIGRATOR_KIT_STREAM_CONFIG"]
runner = Runner(
stream_definitions=[RecordStreamDefinition],
# stream_definitions=[UserStreamDefinition],
config_filepath=Path(stream_config).absolute(),
dry_run=dry_run
)
Expand Down
2 changes: 1 addition & 1 deletion cds_migrator_kit/rdm/migration/extract.py
Original file line number Diff line number Diff line change
Expand Up @@ -46,7 +46,7 @@ def __init__(self, filepath):
"""Constructor."""
self.filepath = Path(filepath).absolute()

def run(self):
def extract(self, email):
"""Run."""
with open(self.filepath, "r") as dump_file:
data = json.load(dump_file)
Expand Down
150 changes: 85 additions & 65 deletions cds_migrator_kit/rdm/migration/load/load.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,7 @@
import arrow
from invenio_access.permissions import system_identity
from invenio_db import db
from invenio_pidstore.errors import PIDAlreadyExists
from invenio_rdm_migrator.load.base import Load
from invenio_rdm_records.proxies import current_rdm_records_service

Expand Down Expand Up @@ -54,59 +55,58 @@ def _load_files(self, draft, entry, version_files):

identity = system_identity # Should we create an identity for the migration?

            # take the first file for the first version
filename = next(iter(version_files))

file = version_files[filename]

try:
current_rdm_records_service.draft_files.init_files(
identity,
draft.id,
data=[
{
"key": file["key"],
"metadata": file["metadata"],
"access": {"hidden": False},
}
],
)
# TODO change to eos move or xrootd command instead of going through the app
# TODO leave the init part to pre-create the destination folder
# TODO update checksum, size, commit (to be checked on how these methods work)
# if current_app.config["XROOTD_ENABLED"]:
# storage = current_files_rest.storage_factory
# current_rdm_records_service.draft_files.set_file_content(
# identity,
# draft.id,
# file["key"],
# BytesIO(b"Placeholder file"),
# )
# obj = None
# for object in draft._record.files.objects:
# if object.key == file["key"]:
# obj = object
# path = obj.file.uri
# else:
# for local development
current_rdm_records_service.draft_files.set_file_content(
identity,
draft.id,
file["key"],
import_legacy_files(file["eos_tmp_path"]),
)
result = current_rdm_records_service.draft_files.commit_file(
identity, draft.id, file["key"]
)
legacy_checksum = f"md5:{file['checksum']}"
new_checksum = result.to_dict()["checksum"]
assert legacy_checksum == new_checksum
except Exception as e:
exc = ManualImportRequired(
message=str(e), field="filename", value=file["key"]
)
migration_logger.add_log(exc, output=entry, key="filename",
value=file["key"])
for filename, file_data in version_files.items():

file_data = version_files[filename]

try:
current_rdm_records_service.draft_files.init_files(
identity,
draft.id,
data=[
{
"key": file_data["key"],
"metadata": file_data["metadata"],
"access": {"hidden": False},
}
],
)
# TODO change to eos move or xrootd command instead of going through the app
# TODO leave the init part to pre-create the destination folder
# TODO update checksum, size, commit (to be checked on how these methods work)
# if current_app.config["XROOTD_ENABLED"]:
# storage = current_files_rest.storage_factory
# current_rdm_records_service.draft_files.set_file_content(
# identity,
# draft.id,
# file["key"],
# BytesIO(b"Placeholder file"),
# )
# obj = None
# for object in draft._record.files.objects:
# if object.key == file["key"]:
# obj = object
# path = obj.file.uri
# else:
# for local development
current_rdm_records_service.draft_files.set_file_content(
identity,
draft.id,
file_data["key"],
import_legacy_files(file_data["eos_tmp_path"]),
)
result = current_rdm_records_service.draft_files.commit_file(
identity, draft.id, file_data["key"]
)
legacy_checksum = f"md5:{file_data['checksum']}"
new_checksum = result.to_dict()["checksum"]
assert legacy_checksum == new_checksum
except Exception as e:
exc = ManualImportRequired(
message=str(e), field="filename", value=file_data["key"]
)
migration_logger.add_log(exc, output=entry, key="filename",
value=file_data["key"])

def _load_access(self, draft, entry):
"""Load access rights."""
Expand All @@ -118,35 +118,52 @@ def _load_access(self, draft, entry):
def _load_versions(self, draft, entry):
"""Load other versions of the record."""
draft_files = entry["draft_files"]

def publish_and_mint_recid(draft, version):
record = current_rdm_records_service.publish(system_identity, draft["id"])
# mint legacy ids for redirections
if version == 1:
# it seems more intuitive if we mint the lrecid for parent
# but then we get a double redirection
legacy_recid_minter(entry["record"]["recid"],
record._record.parent.model.id)

if not draft_files:
# when missing files, just publish
publish_and_mint_recid(draft, 1)

for version in draft_files.keys():
file_dict = draft_files.get(version)
version_files_dict = draft_files.get(version)
if version == 1:
self._load_files(draft, entry, file_dict)
self._load_files(draft, entry, version_files_dict)
else:
draft = current_rdm_records_service.new_version(
system_identity, draft["id"]
)

self._load_files(draft, entry, file_dict)
filename = next(iter(file_dict))
file = file_dict[filename]
self._load_files(draft, entry, version_files_dict)

                # Attention! The metadata of the new version
                # will be taken from the first file
                # in the list. TODO: can we improve this for publication accuracy?
                # Maybe sorting by creation date would be better?
filename = next(iter(version_files_dict))
file = version_files_dict[filename]
# add missing metadata for new version
missing_data = {
"metadata": {
# copy over the previous draft metadata
**draft.to_dict()["metadata"],
# add missing publication date based on the time of creation of the new file version
"publication_date": file["creation_date"],
}
}

draft = current_rdm_records_service.update_draft(
system_identity, draft["id"], data=missing_data
)
record = current_rdm_records_service.publish(system_identity, draft["id"])
# mint legacy ids for redirections
if version == 1:
# it seems more intuitive if we mint the lrecid for parent
# but then we get a double redirection
legacy_recid_minter(entry["record"]["recid"],
record._record.parent.model.id)

publish_and_mint_recid(draft, version)

def _load_model_fields(self, draft, entry):
"""Load model fields of the record."""
Expand Down Expand Up @@ -185,9 +202,12 @@ def _load(self, entry):
self._load_model_fields(draft, entry)

self._load_versions(draft, entry)
except PIDAlreadyExists:
pass
except Exception as e:
exc = ManualImportRequired(message=str(e), field="validation")
migration_logger.add_log(exc, output=entry)
raise e

def _cleanup(self, *args, **kwargs):
"""Cleanup the entries."""
Expand Down
17 changes: 0 additions & 17 deletions cds_migrator_kit/rdm/migration/runner.py
Original file line number Diff line number Diff line change
Expand Up @@ -46,24 +46,7 @@ def __init__(self, stream_definitions, config_filepath, dry_run):
Logger.initialize(self.log_dir)
RDMJsonLogger.initialize(self.log_dir)
FailedTxLogger.initialize(self.log_dir)

self.db_uri = config.get("db_uri")
# self.state = StateDB(
# db_dir=self.state_dir, validators={"parents": ParentModelValidator}
# )
# STATE.initialized_state(
# self.state,
# cache=config.get("state_cache", True),
# search_cache=config.get("state_search_cache", True),
# )
# set up secret keys
# for key in ("old_secret_key", "new_secret_key"):
# stored_value = STATE.VALUES.get(key)
# if stored_value:
# STATE.VALUES.update(key, {"value": bytes(config.get(key), "utf-8")})
# else:
# STATE.VALUES.add(key, {"value": bytes(config.get(key), "utf-8")})

# start parsing streams
self.streams = []
for definition in stream_definitions:
Expand Down
17 changes: 8 additions & 9 deletions cds_migrator_kit/rdm/migration/streams.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,13 +7,12 @@

"""CDS-RDM migration streams module."""
from invenio_rdm_migrator.streams import StreamDefinition
# from invenio_rdm_migrator.streams.users.load import UserCopyLoad

from cds_migrator_kit.rdm.migration.extract import LegacyExtract, LegacyUserExtract
from cds_migrator_kit.rdm.migration.transform.transform import CDSToRDMRecordTransform

from .load import CDSRecordServiceLoad
from .transform.user_transform import CDSUserTransform
from .transform.users import CDSUserIntermediaryLoad, CDSRDMUserTransform

RecordStreamDefinition = StreamDefinition(
name="records",
Expand All @@ -24,10 +23,10 @@
"""ETL stream for CDS to RDM records."""


# UserStreamDefinition = StreamDefinition(
# name="users",
# extract_cls=LegacyUserExtract,
# transform_cls=CDSUserTransform,
# load_cls=UserCopyLoad,
# )
# """ETL stream for CDS to import users."""
# Wires the user-migration ETL pipeline: legacy dumps are read by
# LegacyExtract, converted by CDSRDMUserTransform and written out by
# CDSUserIntermediaryLoad (see .transform.users for the last two).
UserStreamDefinition = StreamDefinition(
    name="users",
    extract_cls=LegacyExtract,
    transform_cls=CDSRDMUserTransform,
    load_cls=CDSUserIntermediaryLoad,
)
"""ETL stream for CDS to import users."""
9 changes: 6 additions & 3 deletions cds_migrator_kit/rdm/migration/streams.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -7,12 +7,15 @@ old_secret_key: CHANGE_ME
new_secret_key: CHANGE_ME
#users:
# extract:
# filepath: cds_migrator_kit/rdm/migration/data/users/active_users.json
# dirpath: cds_migrator_kit/rdm/migration/data/users/dump
# load:
# db_uri: postgresql://cds-rdm:cds-rdm@localhost:5432/cds-rdm
# tmp_dir: site/cds_rdm/migration/data/transform
# filepath: cds_migrator_kit/rdm/migration/data/users/people.csv
records:
extract:
dirpath: cds_migrator_kit/rdm/migration/data/summer_student_reports/dump/
transform:
files_dump_dir: cds_migrator_kit/rdm/migration/data/summer_student_reports/files/
# missing_users: /eos/media/cds/cds-rdm/dev/migration/summer-student-notes/users
missing_users: cds_migrator_kit/rdm/migration/data/users
community_slug: summer-student-reports

55 changes: 55 additions & 0 deletions cds_migrator_kit/rdm/migration/transform/models/people.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,55 @@
# -*- coding: utf-8 -*-
#
# This file is part of CERN Document Server.
# Copyright (C) 2024 CERN.
#
# Invenio is free software; you can redistribute it and/or
# modify it under the terms of the GNU General Public License as
# published by the Free Software Foundation; either version 2 of the
# License, or (at your option) any later version.
#
# Invenio is distributed in the hope that it will be useful, but
# WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
# General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with Invenio; if not, write to the Free Software Foundation, Inc.,
# 59 Temple Place, Suite 330, Boston, MA 02111-1307, USA.

"""CDS-RDM Summer student model."""

from .base import model as base_model
from .overdo import CdsOverdo


class PeopleAuthority(CdsOverdo):
    """Translation index for CDS People authority records.

    Selects records matching ``__query__`` (people authority entries);
    every MARC field listed in ``__ignore_keys__`` is deliberately left
    untranslated during the migration.
    """

    __query__ = "980__:AUTHORITY 980__:PEOPLE"
    # Fields intentionally dropped; the inline notes record why each one
    # is safe to ignore.
    __ignore_keys__ = {
        "001",
        "005",
        "372__0",  # affiliations
        "371__1",  # affiliation abbreviation
        "371__0",  # affiliation name
        "371__j",  # TODO ??? ex 2872846
        "371__k",  # phone number
        "371__f",  # phone number
        "371__l",  # phone number
        "371__h",  # looks like a duplicate
        "371__d",  # duplicate department info
        "371__g",  # group
        "371__v",  # source = CERN LDAP
        "1001_a",  # surname
        "1000_a",  # given names
        "690C_a",  # always CERN
        "595__c",  # internal note
        "595__a",  # internal note
        "980__a",  # collection
    }

# Module-level overdo model instance; translation rules for it are
# registered under the "cds_migrator_kit.migrator.rules.people" entry
# point group and layered on top of the shared base model.
model = PeopleAuthority(
    bases=(base_model,), entry_point_group="cds_migrator_kit.migrator.rules.people"
)
Loading

0 comments on commit 17b061d

Please sign in to comment.