Skip to content

Commit

Permalink
test(container): Fix container tests for new regex
Browse files Browse the repository at this point in the history
  • Loading branch information
jgoguen committed Dec 10, 2024
1 parent b96e592 commit e30f7a4
Show file tree
Hide file tree
Showing 2 changed files with 32 additions and 27 deletions.
4 changes: 3 additions & 1 deletion container.py
Original file line number Diff line number Diff line change
Expand Up @@ -590,7 +590,7 @@ def _add_kobo_spans_to_node(
def _append_kobo_spans_from_text(
self, node: etree._Element, text: str, name: str
) -> etree._Element:
if not text:
if not text or text == "":
self.log.error(f"[{name}] No text passed, can't add spans")
return False

Expand All @@ -606,6 +606,8 @@ def _append_kobo_spans_from_text(
# append each sentence in its own span
segment_counter = 1
for g, ws in zip(groups[1::2], groups[2::2]):
if g.strip() == "":
continue
span = etree.Element(
f"{{{XHTML_NAMESPACE}}}span",
attrib={
Expand Down
55 changes: 29 additions & 26 deletions tests/test_container.py
Original file line number Diff line number Diff line change
Expand Up @@ -275,19 +275,22 @@ def __run_single_node_test(self, text, text_only=False, number_of_sentences=None

# check number of sentences
if number_of_sentences is not None:
self.assertEqual(len(node.getchildren()), number_of_sentences)
self.assertEqual(
number_of_sentences,
len(node.getchildren()),
)

para_count = 1
text_chunks = [
chunk.strip() for chunk in text.strip().split("\n") if chunk.split() != ""
chunk.rstrip() for chunk in text.split("\n") if chunk.strip() != ""
]
self.assertEqual(len(node.getchildren()), len(text_chunks))
for span, chunk in zip(node.getchildren(), text_chunks):
self.assertEqual(span.text, chunk)
# spans should be non-empty and must not end in whitespace (PR#191)
self.assertFalse(re.match(r'\s', span.text[-1]))
self.assertFalse(re.match(r"\s", span.text[-1]))
# tail of span should *only* be whitespace
self.assertTrue(re.match(r'\s*', span.tail or ''))
self.assertTrue(re.match(r"\s*", span.tail or ""))

# attrib is technically of type lxml.etree._Attrib, but functionally
# it's a dict. Cast it here to make assertDictEqual() happy.
Expand All @@ -297,10 +300,10 @@ def __run_single_node_test(self, text, text_only=False, number_of_sentences=None
para_count += 1

# remaining text should only contain whitespace
self.assertTrue(re.match(r'\s*', node.text or ''))
self.assertTrue(re.match(r"\s*", node.text or ""))

# complete text should remain the same
self.assertEqual(''.join(node.itertext()), text)
self.assertEqual("".join(node.itertext()), "".join(text_chunks))

def test_add_spans_to_text(self):
text_samples = [
Expand All @@ -309,12 +312,14 @@ def test_add_spans_to_text(self):
"Hello, World! ",
" Hello, World! ",
"\n\n GIF is pronounced as it's spelled.\n ",
" \"Yes, but I asked 'Why?'\" "
" \"Yes, but I asked 'Why?'\" ",
]

for text in text_samples:
for text_only in {True, False}:
self.__run_single_node_test(text, text_only=text_only, number_of_sentences=1)
self.__run_single_node_test(
text, text_only=text_only, number_of_sentences=1
)

def __run_multiple_node_test(self, text_nodes): # type: (List[str]) -> None
html = "<div>"
Expand All @@ -334,11 +339,18 @@ def __run_multiple_node_test(self, text_nodes): # type: (List[str]) -> None
self.assertEqual(len(text_nodes), len(children))
for text, node in zip(text_nodes, children):
spans = node.getchildren()
# note: this regexp isn't being tested (it's known to be fallible, but good enough)
sentences = container.TEXT_SPLIT_RE.findall(text)
# note: this regexp isn't being tested (it's known to be fallible, but
# good enough)
sentences = [
txt
for txt in container.TEXT_SPLIT_RE.findall(text)
if txt.strip() != ""
]

# check spans are added correctly for phrase individually
self.__run_single_node_test(text, text_only=False, number_of_sentences=len(sentences))
self.__run_single_node_test(
text, text_only=False, number_of_sentences=len(sentences)
)

# assert span is correctly split into sentences
self.assertEqual(len(spans), len(sentences))
Expand Down Expand Up @@ -366,23 +378,14 @@ def test_gitub_pr_106(self):
self.assertIn(container_name, self.container.name_path_map)

pre_span = self.container.parsed(container_name)
text_chunks = [
g
for g in pre_span.xpath(
"//xhtml:p//text()", namespaces={"xhtml": container.XHTML_NAMESPACE}
)
]

self.container.add_kobo_spans(container_name)

post_span = self.container.parsed(container_name)
post_text_chunks = [
g
for g in post_span.xpath(
"//xhtml:p//text()", namespaces={"xhtml": container.XHTML_NAMESPACE}
)
]
self.assertEqual(''.join(text_chunks), ''.join(post_text_chunks))

self.assertEqual(
len(list(pre_span.itertext())), len(list(post_span.itertext()))
)
for pre, post in zip(pre_span.itertext(), post_span.itertext()):
self.assertEqual(pre, post, f"\npre='{pre}'\npost='{post}'")

def test_github_issue_136(self):
source_file = os.path.join(self.testfile_basedir, "page_github_136.html")
Expand Down

0 comments on commit e30f7a4

Please sign in to comment.