Skip to content

Commit

Permalink
users: re-create inactive users for record owner
Browse files Browse the repository at this point in the history
  • Loading branch information
kpsherva committed Oct 2, 2024
1 parent beaae08 commit 17b061d
Show file tree
Hide file tree
Showing 15 changed files with 477 additions and 266 deletions.
4 changes: 3 additions & 1 deletion cds_migrator_kit/rdm/migration/cli.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,8 @@
from flask.cli import with_appcontext

from cds_migrator_kit.rdm.migration.runner import Runner
from cds_migrator_kit.rdm.migration.streams import RecordStreamDefinition
from cds_migrator_kit.rdm.migration.streams import RecordStreamDefinition, \
UserStreamDefinition

cli_logger = logging.getLogger("migrator")

Expand All @@ -36,6 +37,7 @@ def run(dry_run=False):
stream_config = current_app.config["CDS_MIGRATOR_KIT_STREAM_CONFIG"]
runner = Runner(
stream_definitions=[RecordStreamDefinition],
# stream_definitions=[UserStreamDefinition],
config_filepath=Path(stream_config).absolute(),
dry_run=dry_run
)
Expand Down
2 changes: 1 addition & 1 deletion cds_migrator_kit/rdm/migration/extract.py
Original file line number Diff line number Diff line change
Expand Up @@ -46,7 +46,7 @@ def __init__(self, filepath):
"""Constructor."""
self.filepath = Path(filepath).absolute()

def run(self):
def extract(self, email):
"""Run."""
with open(self.filepath, "r") as dump_file:
data = json.load(dump_file)
Expand Down
150 changes: 85 additions & 65 deletions cds_migrator_kit/rdm/migration/load/load.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,7 @@
import arrow
from invenio_access.permissions import system_identity
from invenio_db import db
from invenio_pidstore.errors import PIDAlreadyExists
from invenio_rdm_migrator.load.base import Load
from invenio_rdm_records.proxies import current_rdm_records_service

Expand Down Expand Up @@ -54,59 +55,58 @@ def _load_files(self, draft, entry, version_files):

identity = system_identity # Should we create an identity for the migration?

            # take the first file for the first version
filename = next(iter(version_files))

file = version_files[filename]

try:
current_rdm_records_service.draft_files.init_files(
identity,
draft.id,
data=[
{
"key": file["key"],
"metadata": file["metadata"],
"access": {"hidden": False},
}
],
)
# TODO change to eos move or xrootd command instead of going through the app
# TODO leave the init part to pre-create the destination folder
# TODO update checksum, size, commit (to be checked on how these methods work)
# if current_app.config["XROOTD_ENABLED"]:
# storage = current_files_rest.storage_factory
# current_rdm_records_service.draft_files.set_file_content(
# identity,
# draft.id,
# file["key"],
# BytesIO(b"Placeholder file"),
# )
# obj = None
# for object in draft._record.files.objects:
# if object.key == file["key"]:
# obj = object
# path = obj.file.uri
# else:
# for local development
current_rdm_records_service.draft_files.set_file_content(
identity,
draft.id,
file["key"],
import_legacy_files(file["eos_tmp_path"]),
)
result = current_rdm_records_service.draft_files.commit_file(
identity, draft.id, file["key"]
)
legacy_checksum = f"md5:{file['checksum']}"
new_checksum = result.to_dict()["checksum"]
assert legacy_checksum == new_checksum
except Exception as e:
exc = ManualImportRequired(
message=str(e), field="filename", value=file["key"]
)
migration_logger.add_log(exc, output=entry, key="filename",
value=file["key"])
for filename, file_data in version_files.items():

file_data = version_files[filename]

try:
current_rdm_records_service.draft_files.init_files(
identity,
draft.id,
data=[
{
"key": file_data["key"],
"metadata": file_data["metadata"],
"access": {"hidden": False},
}
],
)
# TODO change to eos move or xrootd command instead of going through the app
# TODO leave the init part to pre-create the destination folder
# TODO update checksum, size, commit (to be checked on how these methods work)
# if current_app.config["XROOTD_ENABLED"]:
# storage = current_files_rest.storage_factory
# current_rdm_records_service.draft_files.set_file_content(
# identity,
# draft.id,
# file["key"],
# BytesIO(b"Placeholder file"),
# )
# obj = None
# for object in draft._record.files.objects:
# if object.key == file["key"]:
# obj = object
# path = obj.file.uri
# else:
# for local development
current_rdm_records_service.draft_files.set_file_content(
identity,
draft.id,
file_data["key"],
import_legacy_files(file_data["eos_tmp_path"]),
)
result = current_rdm_records_service.draft_files.commit_file(
identity, draft.id, file_data["key"]
)
legacy_checksum = f"md5:{file_data['checksum']}"
new_checksum = result.to_dict()["checksum"]
assert legacy_checksum == new_checksum
except Exception as e:
exc = ManualImportRequired(
message=str(e), field="filename", value=file_data["key"]
)
migration_logger.add_log(exc, output=entry, key="filename",
value=file_data["key"])

def _load_access(self, draft, entry):
"""Load access rights."""
Expand All @@ -118,35 +118,52 @@ def _load_access(self, draft, entry):
def _load_versions(self, draft, entry):
"""Load other versions of the record."""
draft_files = entry["draft_files"]

def publish_and_mint_recid(draft, version):
record = current_rdm_records_service.publish(system_identity, draft["id"])
# mint legacy ids for redirections
if version == 1:
# it seems more intuitive if we mint the lrecid for parent
# but then we get a double redirection
legacy_recid_minter(entry["record"]["recid"],
record._record.parent.model.id)

if not draft_files:
# when missing files, just publish
publish_and_mint_recid(draft, 1)

for version in draft_files.keys():
file_dict = draft_files.get(version)
version_files_dict = draft_files.get(version)
if version == 1:
self._load_files(draft, entry, file_dict)
self._load_files(draft, entry, version_files_dict)
else:
draft = current_rdm_records_service.new_version(
system_identity, draft["id"]
)

self._load_files(draft, entry, file_dict)
filename = next(iter(file_dict))
file = file_dict[filename]
self._load_files(draft, entry, version_files_dict)

                # Attention! The metadata of the new version
                # will be taken from the first file
                # in the list. TODO: can we improve this for publication accuracy?
                # Maybe sorting by creation date would be better?
filename = next(iter(version_files_dict))
file = version_files_dict[filename]
# add missing metadata for new version
missing_data = {
"metadata": {
# copy over the previous draft metadata
**draft.to_dict()["metadata"],
# add missing publication date based on the time of creation of the new file version
"publication_date": file["creation_date"],
}
}

draft = current_rdm_records_service.update_draft(
system_identity, draft["id"], data=missing_data
)
record = current_rdm_records_service.publish(system_identity, draft["id"])
# mint legacy ids for redirections
if version == 1:
# it seems more intuitive if we mint the lrecid for parent
# but then we get a double redirection
legacy_recid_minter(entry["record"]["recid"],
record._record.parent.model.id)

publish_and_mint_recid(draft, version)

def _load_model_fields(self, draft, entry):
"""Load model fields of the record."""
Expand Down Expand Up @@ -185,9 +202,12 @@ def _load(self, entry):
self._load_model_fields(draft, entry)

self._load_versions(draft, entry)
except PIDAlreadyExists:
pass
except Exception as e:
exc = ManualImportRequired(message=str(e), field="validation")
migration_logger.add_log(exc, output=entry)
raise e

def _cleanup(self, *args, **kwargs):
"""Cleanup the entries."""
Expand Down
17 changes: 0 additions & 17 deletions cds_migrator_kit/rdm/migration/runner.py
Original file line number Diff line number Diff line change
Expand Up @@ -46,24 +46,7 @@ def __init__(self, stream_definitions, config_filepath, dry_run):
Logger.initialize(self.log_dir)
RDMJsonLogger.initialize(self.log_dir)
FailedTxLogger.initialize(self.log_dir)

self.db_uri = config.get("db_uri")
# self.state = StateDB(
# db_dir=self.state_dir, validators={"parents": ParentModelValidator}
# )
# STATE.initialized_state(
# self.state,
# cache=config.get("state_cache", True),
# search_cache=config.get("state_search_cache", True),
# )
# set up secret keys
# for key in ("old_secret_key", "new_secret_key"):
# stored_value = STATE.VALUES.get(key)
# if stored_value:
# STATE.VALUES.update(key, {"value": bytes(config.get(key), "utf-8")})
# else:
# STATE.VALUES.add(key, {"value": bytes(config.get(key), "utf-8")})

# start parsing streams
self.streams = []
for definition in stream_definitions:
Expand Down
17 changes: 8 additions & 9 deletions cds_migrator_kit/rdm/migration/streams.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,13 +7,12 @@

"""CDS-RDM migration streams module."""
from invenio_rdm_migrator.streams import StreamDefinition
# from invenio_rdm_migrator.streams.users.load import UserCopyLoad

from cds_migrator_kit.rdm.migration.extract import LegacyExtract, LegacyUserExtract
from cds_migrator_kit.rdm.migration.transform.transform import CDSToRDMRecordTransform

from .load import CDSRecordServiceLoad
from .transform.user_transform import CDSUserTransform
from .transform.users import CDSUserIntermediaryLoad, CDSRDMUserTransform

RecordStreamDefinition = StreamDefinition(
name="records",
Expand All @@ -24,10 +23,10 @@
"""ETL stream for CDS to RDM records."""


# UserStreamDefinition = StreamDefinition(
# name="users",
# extract_cls=LegacyUserExtract,
# transform_cls=CDSUserTransform,
# load_cls=UserCopyLoad,
# )
# """ETL stream for CDS to import users."""
# Wires the user-migration ETL pipeline: legacy dumps are read by
# LegacyExtract, converted by CDSRDMUserTransform and written out by
# CDSUserIntermediaryLoad (see .transform.users for the last two).
UserStreamDefinition = StreamDefinition(
    name="users",
    extract_cls=LegacyExtract,
    transform_cls=CDSRDMUserTransform,
    load_cls=CDSUserIntermediaryLoad,
)
"""ETL stream for CDS to import users."""
9 changes: 6 additions & 3 deletions cds_migrator_kit/rdm/migration/streams.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -7,12 +7,15 @@ old_secret_key: CHANGE_ME
new_secret_key: CHANGE_ME
#users:
# extract:
# filepath: cds_migrator_kit/rdm/migration/data/users/active_users.json
# dirpath: cds_migrator_kit/rdm/migration/data/users/dump
# load:
# db_uri: postgresql://cds-rdm:cds-rdm@localhost:5432/cds-rdm
# tmp_dir: site/cds_rdm/migration/data/transform
# filepath: cds_migrator_kit/rdm/migration/data/users/people.csv
records:
extract:
dirpath: cds_migrator_kit/rdm/migration/data/summer_student_reports/dump/
transform:
files_dump_dir: cds_migrator_kit/rdm/migration/data/summer_student_reports/files/
# missing_users: /eos/media/cds/cds-rdm/dev/migration/summer-student-notes/users
missing_users: cds_migrator_kit/rdm/migration/data/users
community_slug: summer-student-reports

55 changes: 55 additions & 0 deletions cds_migrator_kit/rdm/migration/transform/models/people.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,55 @@
# -*- coding: utf-8 -*-
#
# This file is part of CERN Document Server.
# Copyright (C) 2024 CERN.
#
# Invenio is free software; you can redistribute it and/or
# modify it under the terms of the GNU General Public License as
# published by the Free Software Foundation; either version 2 of the
# License, or (at your option) any later version.
#
# Invenio is distributed in the hope that it will be useful, but
# WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
# General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with Invenio; if not, write to the Free Software Foundation, Inc.,
# 59 Temple Place, Suite 330, Boston, MA 02111-1307, USA.

"""CDS-RDM Summer student model."""

from .base import model as base_model
from .overdo import CdsOverdo


class PeopleAuthority(CdsOverdo):
    """Translation index for CDS People authority records.

    Selects records matching ``__query__`` (people authority entries);
    every MARC field listed in ``__ignore_keys__`` is deliberately left
    untranslated during the migration.
    """

    __query__ = "980__:AUTHORITY 980__:PEOPLE"
    # Fields intentionally dropped; the inline notes record why each one
    # is safe to ignore.
    __ignore_keys__ = {
        "001",
        "005",
        "372__0",  # affiliations
        "371__1",  # affiliation abbreviation
        "371__0",  # affiliation name
        "371__j",  # TODO ??? ex 2872846
        "371__k",  # phone number
        "371__f",  # phone number
        "371__l",  # phone number
        "371__h",  # looks like a duplicate
        "371__d",  # duplicate department info
        "371__g",  # group
        "371__v",  # source = CERN LDAP
        "1001_a",  # surname
        "1000_a",  # given names
        "690C_a",  # always CERN
        "595__c",  # internal note
        "595__a",  # internal note
        "980__a",  # collection
    }

# Module-level overdo model instance; translation rules for it are
# registered under the "cds_migrator_kit.migrator.rules.people" entry
# point group and layered on top of the shared base model.
model = PeopleAuthority(
    bases=(base_model,), entry_point_group="cds_migrator_kit.migrator.rules.people"
)
Loading

0 comments on commit 17b061d

Please sign in to comment.