From ac86d209c88583c3dd7ef1f2723371405ea1489f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Johnny=20Marie=CC=81thoz?= Date: Tue, 8 Nov 2022 17:08:44 +0100 Subject: [PATCH] fix: OAI-PMH crash for document containing control chars MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit * Adds new `safety` exceptions. * Removes control chars when the Dublin Core XML file is produced. * Closes #867. Co-Authored-by: Johnny Mariéthoz --- scripts/test | 5 ++++- sonar/modules/documents/serializers/dc.py | 3 ++- tests/ui/documents/test_dc_schema.py | 9 +++++++++ 3 files changed, 15 insertions(+), 2 deletions(-) diff --git a/scripts/test b/scripts/test index 7be187ed..8d5cafe1 100755 --- a/scripts/test +++ b/scripts/test @@ -34,7 +34,10 @@ fi # | package | installed | affected | ID | # +============================+===========+==========================+==========+ # | wtforms | 2.3.3 | <3.0.0a1 | 42852 | +# | wheel | 0.37.1 | <0.38.0 | 51499 | # | sqlalchemy-utils | 0.35.0 | >=0.27.0 | 42194 | +# | safety | 1.10.3 | <2.2.0 | 51358 | +# | py | 1.11.0 | <=1.11.0 | 51457 | # | nbconvert | 6.4.5 | <6.5.1 | 50792 | # | lxml | 4.7.0 | <4.9.1 | 50748 | # | flask-security | 3.0.0 | <3.1.0 | 45183 | @@ -43,7 +46,7 @@ fi # | celery | 5.1.2 | <5.2.0 | 42498 | # | celery | 5.1.2 | <5.2.2 | 43738 | # +==============================================================================+ -safety check -i 42852 -i 42050 -i 42194 -i 45183 -i 42498 -i 44501 -i 43738 -i 47833 -i 50748 -i 50792 +safety check -i 42852 -i 51499 -i 42194 -i 51358 -i 51457 -i 50792 -i 50748 -i 45183 -i 44501 -i 47833 -i 42498 -i 43738 pydocstyle sonar tests docs isort --check-only --diff "${SCRIPT_PATH}/.." autoflake -c -r --remove-all-unused-imports --ignore-init-module-imports . 
&> /dev/null || { diff --git a/sonar/modules/documents/serializers/dc.py b/sonar/modules/documents/serializers/dc.py index 6a997672..eb9e8466 100644 --- a/sonar/modules/documents/serializers/dc.py +++ b/sonar/modules/documents/serializers/dc.py @@ -18,6 +18,7 @@ """Dublin Core serializer.""" from flask_resources.serializers import SerializerMixin +from invenio_oaiserver.utils import sanitize_unicode from lxml import etree from sonar.modules.documents.serializers.schemas.dc import DublinCoreSchema @@ -102,7 +103,7 @@ def serialize_dict_to_etree(self, data): f'{{http://purl.org/dc/elements/1.1/}}{elements[key]}', attrs ) - field.text = val + field.text = sanitize_unicode(val) return root diff --git a/tests/ui/documents/test_dc_schema.py b/tests/ui/documents/test_dc_schema.py index 2c5e6f92..fe85a22c 100644 --- a/tests/ui/documents/test_dc_schema.py +++ b/tests/ui/documents/test_dc_schema.py @@ -202,6 +202,15 @@ def test_descriptions_attributes(minimal_document): } ] + +def test_descriptions_xml_control_char(minimal_document): + minimal_document['abstracts'] = [{ + 'language': 'fre', + 'value': 'sous\x02évalués' + }] + assert SonarDublinCoreXMLSerializer().serialize_object_xml(dict(_source=minimal_document)) + + def test_formats(minimal_document): result = SonarDublinCoreXMLSerializer().transform_record(minimal_document) assert result['formats'] == []