Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Support for math formulas in docx files #295

Open
wants to merge 5 commits into
base: main
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
113 changes: 108 additions & 5 deletions src/markitdown/_markitdown.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,11 +8,13 @@
import os
import re
import shutil
import string
import subprocess
import sys
import tempfile
import traceback
import zipfile
from io import BytesIO
from xml.dom import minidom
from typing import Any, Dict, List, Optional, Union
from pathlib import Path
Expand All @@ -32,6 +34,7 @@
import requests
from bs4 import BeautifulSoup
from charset_normalizer import from_path
from lxml import etree as ET

# Azure imports
from azure.ai.documentintelligence import DocumentIntelligenceClient
Expand Down Expand Up @@ -714,22 +717,122 @@ class DocxConverter(HtmlConverter):
Converts DOCX files to Markdown. Style information (e.g.m headings) and tables are preserved where possible.
"""

def __init__(self):
self._omath_re = re.compile(r"<m:oMath[^<>]*>.+?</m:oMath>", flags=re.S)
self._omath_para_re = re.compile(
r"<m:oMathPara\s*>(.+?)</m:oMathPara>", flags=re.S
)
self._formula_re = re.compile(r"\$formula\$(.+?)\$/formula\$")

self.nsmap = {
"m": "http://schemas.openxmlformats.org/officeDocument/2006/math",
"o": "urn:schemas-microsoft-com:office:office",
"r": "http://schemas.openxmlformats.org/officeDocument/2006/relationships",
"v": "urn:schemas-microsoft-com:vml",
"w": "http://schemas.openxmlformats.org/wordprocessingml/2006/main",
"cx": "http://schemas.microsoft.com/office/drawing/2014/chartex",
"cx1": "http://schemas.microsoft.com/office/drawing/2015/9/8/chartex",
"mc": "http://schemas.openxmlformats.org/markup-compatibility/2006",
"w10": "urn:schemas-microsoft-com:office:word",
"w14": "http://schemas.microsoft.com/office/word/2010/wordml",
"w15": "http://schemas.microsoft.com/office/word/2012/wordml",
"w16se": "http://schemas.microsoft.com/office/word/2015/wordml/symex",
"wpc": "http://schemas.microsoft.com/office/word/2010/wordprocessingCanvas",
"wne": "http://schemas.microsoft.com/office/word/2006/wordml",
"wp": "http://schemas.openxmlformats.org/drawingml/2006/wordprocessingDrawing",
"wp14": "http://schemas.microsoft.com/office/word/2010/wordprocessingDrawing",
"wpg": "http://schemas.microsoft.com/office/word/2010/wordprocessingGroup",
"wpi": "http://schemas.microsoft.com/office/word/2010/wordprocessingInk",
"wps": "http://schemas.microsoft.com/office/word/2010/wordprocessingShape",
}
self._xmlns_str = " ".join(
'xmlns:{}="{}"'.format(key, value) for key, value in self.nsmap.items()
)
self._template = string.Template(
"""<?xml version="1.0" standalone="yes"?>
<w:document mc:Ignorable="w14 w15 w16se wp14" {}>
$formula_xml
</w:document>""".format(
self._xmlns_str
)
)
self._xsl_folder = os.path.join(
os.path.dirname(
os.path.dirname(os.path.abspath(__file__))),
'xsl',
)
self._mml_to_tex_xsl = os.path.join(self._xsl_folder, "mmltex.xsl")
self._omml_to_mml_xsl = os.path.join(self._xsl_folder, "omml2mml.xsl")

def _mml_to_tex(self, mml_xml: str) -> str:
tree = ET.fromstring(mml_xml)
transform = ET.XSLT(ET.parse(self._mml_to_tex_xsl))
return str(transform(tree))

def _omml_to_mml(self, formula_xml: str) -> str:
xml_content = self._template.safe_substitute(formula_xml=formula_xml)
tree = ET.fromstring(xml_content)
transform = ET.XSLT(ET.parse(self._omml_to_mml_xsl))
return str(transform(tree))

def _omml_to_tex(self, omml_xml: str) -> str:
mml_xml = self._omml_to_mml(omml_xml)
return self._mml_to_tex(mml_xml)

def convert(self, local_path, **kwargs) -> Union[None, DocumentConverterResult]:
# Bail if not a DOCX
extension = kwargs.get("file_extension", "")
if extension.lower() != ".docx":
return None

result = None
with open(local_path, "rb") as docx_file:
style_map = kwargs.get("style_map", None)
# preprocess docx equations in docx file
docx_file = self._quote_equations(local_path)
style_map = kwargs.get("style_map", None)

result = mammoth.convert_to_html(docx_file, style_map=style_map)
html_content = result.value
result = self._convert(html_content)
result = mammoth.convert_to_html(docx_file, style_map=style_map)
html_content = self._unquote_omath_to_tex(result.value)
result = self._convert(html_content)

return result

def _quote_omath(self, xml_content: str) -> str:
def replace(match):
quoted_omath = quote(match.group(0))
return "<w:t>$formula$ {} $/formula$</w:t>".format(quoted_omath)

xml_content = self._omath_re.sub(replace, xml_content)
xml_content = self._omath_para_re.sub(lambda m: m.group(1), xml_content)
return xml_content

def _unquote_omath_to_tex(self, html: str) -> str:
def replace(match):
omml_content = unquote(match.group(1))
return self._omml_to_tex(omml_content)

return self._formula_re.sub(replace, html)

def _quote_equations(self, docx_filename: str) -> BytesIO:
"""
Surrounds all OMML equations in the docx file with $formula$ and
$/formula$ tags.
"""
doc_files = ("word/document.xml", "word/footnotes.xml", "word/endnotes.xml")
output_zip = BytesIO()
with zipfile.ZipFile(docx_filename, "r") as z_in:
with zipfile.ZipFile(output_zip, "w", zipfile.ZIP_DEFLATED) as z_out:
z_out.comment = z_in.comment
for item in z_in.infolist():
if item.filename not in doc_files:
z_out.writestr(item, z_in.read(item.filename))
else:
xml_content = self._quote_omath(
z_in.read(item.filename).decode("utf8")
).encode("utf8")
z_out.writestr(item.filename, xml_content)
output_zip.seek(0)
return output_zip


class XlsxConverter(HtmlConverter):
"""
Expand Down
Binary file added tests/test_files/equation.docx
Binary file not shown.
10 changes: 10 additions & 0 deletions tests/test_markitdown.py
Original file line number Diff line number Diff line change
Expand Up @@ -272,6 +272,10 @@ def test_markitdown_local() -> None:
assert "# Test" in result.text_content






@pytest.mark.skipif(
skip_exiftool,
reason="do not run if exiftool is not installed",
Expand Down Expand Up @@ -358,6 +362,12 @@ def test_markitdown_llm() -> None:
for test_string in ["red", "circle", "blue", "square"]:
assert test_string in result.text_content.lower()

def test_equation() -> None:
expected_string = r'${\left(x+a\right)}^{n}={\sum }\_{k=0}^{n}\left(\begin{array}{c}n\\ k\end{array}\right){x}^{k}{a}^{n-k}$'
markitdown = MarkItDown()
result = markitdown.convert( os.path.join(TEST_FILES_DIR, "equation.docx"))
assert expected_string == result.text_content.strip()


if __name__ == "__main__":
"""Runs this file's tests from the command line."""
Expand Down