feat: Add support for Unstructured Parser, improve Table and Image Parsing, and add TOC and Hyperlinks for Docx (#9)
QuivrHQ · Jun 2, 2024 · 4934776 · 4934776
1 parent 08691af
commit 4934776
Show file tree

Hide file tree

Showing 8 changed files with 735 additions and 198 deletions.
diff --git a/.gitignore b/.gitignore
@@ -7,4 +7,6 @@ __pycache__/
 dist/**
 megaparse.egg-info/
 *.pyc
-build/*
+build/*
+
+!megaparse/tests/output_tests/MegaFake_report.md
diff --git a/Pipfile b/Pipfile
@@ -13,6 +13,12 @@ pdf2docx = "*"
 unstructured = {extras = ["pdf"], version = "*"}
 langchain = "*"
 langchain-community = "*"
+llama-index = "*"
+pytesseract = "*"
+poppler-utils = "*"
+markdownify = "*"
+langchain-openai = "*"
+langchain-core = "*"
 
 [dev-packages]
 ipykernel = "*"

diff --git a/Pipfile.lock b/Pipfile.lock
diff --git a/README.md b/README.md
@@ -31,30 +31,40 @@ pip install megaparse
 
 ## Usage
 
-1. Create an account on [Llama Cloud](https://cloud.llamaindex.ai/) and get your API key.
+```python
+from megaparse import MegaParse
 
-2. Create a new file in the root directory of the project and name it `.env`.
+megaparse = MegaParse(file_path="./test.pdf")
+content = megaparse.convert()
+print(content)
+megaparse.save_md(content, "./test.md")
+```
 
-3. Add the following line to the `.env` file and replace `llx-your_api_key` with your actual API key.
+### Use LlamaParse
 
-```bash
-LLAMA_CLOUD_API_KEY=llx-your_api_key
-```
+1. Create an account on [Llama Cloud](https://cloud.llamaindex.ai/) and get your API key.
 
-4. Now you can use the following code to convert a PDF to Markdown and save it to a file.
+2. Call MegaParse with the `llama_parse_api_key` parameter.
 
 ```python
 from megaparse import MegaParse
 
-megaparse = MegaParse(file_path="./test.pdf")
+megaparse = MegaParse(file_path="./test.pdf", llama_parse_api_key="llx-your_api_key")
 content = megaparse.convert()
 print(content)
-megaparse.save_md(content, "./test.md")
 ```
 
+## Benchmark
+
+- **Diff megaparse unstructured:** 120
+- **Diff llama parse:** 31
+- **Diff megaparse llama:** 26
+
+
+*Lower is better*
+
 ## Next Steps
 
-- [ ] Add Unstructured Parser Support
 - [ ] Improve Table Parsing
 - [ ] Improve Image Parsing and description
 - [ ] Add TOC for Docx

diff --git a/megaparse/Converter.py b/megaparse/Converter.py
@@ -17,6 +17,11 @@
 from llama_parse.utils import ResultType, Language
 from llama_index.core.schema import Document as LlamaDocument
 from megaparse.markdown_processor import MarkdownProcessor
+from megaparse.unstructured import UnstructuredParser
+from pathlib import Path
+from llama_index.core import download_loader
+from unstructured.partition.auto import partition
+
 
 import nest_asyncio
 
@@ -201,12 +206,16 @@ def save_md(self, md_content: str, file_path: Path | str) -> None:
 
 class PDFConverter:
     def __init__(
-        self, api_key: str, handle_pagination: bool = True, handle_header: bool = True
+        self,
+        llama_parse_api_key: str,
+        handle_pagination: bool = True,
+        handle_header: bool = True,
     ) -> None:
         self.handle_pagination = handle_pagination
         self.handle_header = handle_header
-        self.api_key = api_key
+        self.llama_parse_api_key = llama_parse_api_key
 
+    def _llama_parse(self, api_key: str, file_path: str):
         parsing_instructions = "Do not take into account the page breaks (no --- between pages), do not repeat the header and the footer so the tables are merged. Keep the same format for similar tables."
         self.parser = LlamaParse(
             api_key=str(api_key),
@@ -216,10 +225,20 @@ def __init__(
             language=Language.FRENCH,
             parsing_instruction=parsing_instructions,  # Optionally you can define a parsing instruction
         )
-
-    def convert(self, file_path: str) -> str:
         documents: List[LlamaDocument] = self.parser.load_data(file_path)
         parsed_md = documents[0].get_content()
+        return parsed_md
+
+    def _unstructured_parse(self, file_path: str):
+        """Run ``file_path`` through a fresh ``UnstructuredParser`` and return its result."""
+        return UnstructuredParser().convert(file_path)
+
+    def convert(self, file_path: str) -> str:
+        parsed_md = ""
+        if self.llama_parse_api_key:
+            parsed_md = self._llama_parse(self.llama_parse_api_key, file_path)
+        else:
+            parsed_md = self._unstructured_parse(file_path)
 
         if not (self.handle_pagination or self.handle_header):
             return parsed_md
@@ -238,9 +257,9 @@ def save_md(self, md_content: str, file_path: Path | str) -> None:
 
 
 class MegaParse:
-    def __init__(self, file_path: str) -> None:
+    def __init__(self, file_path: str, llama_parse_api_key: str | None = None) -> None:
         self.file_path = file_path
-        self.api_key = os.getenv("LLAMA_CLOUD_API_KEY")
+        self.llama_parse_api_key = llama_parse_api_key
 
     def convert(self) -> str:
         file_extension: str = os.path.splitext(self.file_path)[1]
@@ -254,12 +273,13 @@ def convert(self) -> str:
                 file_path=self.file_path, file_extension=file_extension
             )
         elif file_extension == ".pdf":
-            converter = PDFConverter(api_key=self.api_key)
+            converter = PDFConverter(llama_parse_api_key=self.llama_parse_api_key)
         else:
             print(self.file_path, file_extension)
             raise ValueError(f"Unsupported file extension: {file_extension}")
         return converter.convert(self.file_path)
 
     def save_md(self, md_content: str, file_path: Path | str) -> None:
-        with open(file_path, "w") as f:
+        """Write ``md_content`` to ``file_path``, creating parent directories as needed."""
+        # A bare file name has an empty dirname, and os.makedirs("") raises
+        # FileNotFoundError, so only create the directory when there is one.
+        parent_dir = os.path.dirname(file_path)
+        if parent_dir:
+            os.makedirs(parent_dir, exist_ok=True)
+        with open(file_path, "w+") as f:
             f.write(md_content)
diff --git a/megaparse/tests/output_tests/MegaFake_report.md b/megaparse/tests/output_tests/MegaFake_report.md
@@ -0,0 +1,86 @@
+| My Mega fake report | #1756394 | 31/05/2024 |
+|---------------------|----------|------------|
+
+## Why Mega Parse might be the best ?
+
+### Introduction
+
+Mega Parse is a state-of-the-art document parser designed to convert various document formats such as PDF, DOCX, PPTX, and more into Markdown (MD) format, making them ready for Retrieval-Augmented Generation (RAG) ingestion. In today's data-driven world, the ability to efficiently manage and utilize large volumes of information is crucial. This report explores the features, benefits, and comparative performance of Mega Parse, illustrating why it stands out as a superior tool in the realm of document parsing.
+
+### Features of Mega Parse
+
+Mega Parse boasts an impressive array of features tailored to meet the diverse needs of modern enterprises.
+
+**Multiple Format Support:** Mega Parse supports a wide range of document formats including PDF, DOCX, and PPTX. This versatility allows users to handle various document types without needing multiple tools. Whether you are working with text documents, presentations, or scanned PDFs, Mega Parse has you covered.
+
+**High-Speed Processing:** One of the standout features of Mega Parse is its ability to convert documents at a rapid pace. With processing speeds of up to 120 pages per minute, it significantly enhances productivity by reducing the time spent on document conversion.
+
+**Markdown Output:** Mega Parse converts documents into a structured Markdown format. Markdown is a lightweight markup language with plain text formatting syntax, which is widely used because of its simplicity and ease of conversion to other formats. This makes it ideal for RAG ingestion, where structured and easily interpretable data is paramount.
+
+Accuracy: Accuracy in text extraction and formatting is a critical aspect of any document parser. Mega Parse ensures high accuracy, maintaining the integrity and structure of the original documents. This is particularly important for documents that contain complex formatting and embedded elements.
+
+Customizable Parsing Rules: Users can define custom parsing rules to suit specific needs, allowing for greater control over the conversion process. This flexibility ensures that Mega Parse can be adapted to a wide variety of use cases.
+
+Batch Processing: Mega Parse supports batch processing, enabling the simultaneous conversion of multiple documents. This feature is particularly useful for organizations dealing with large volumes of documents, as it streamlines the workflow and saves time.
+
+Error Handling: Advanced error handling capabilities ensure that any issues encountered during the conversion process are managed effectively, minimizing disruptions and maintaining workflow efficiency.
+
+# Benefits of Mega Parse
+
+The implementation of Mega Parse offers numerous benefits that can transform the way organizations manage their documents.
+
+**Efficiency:** By significantly speeding up the document conversion process, Mega Parse increases overall efficiency. This is especially beneficial for industries that handle large volumes of documents on a daily basis, such as legal firms, financial institutions, and research organizations.
+
+**Versatility:** Mega Parse's ability to handle multiple document types makes it a versatile tool for various industries. Whether you need to convert legal documents, technical manuals, or business presentations, Mega Parse is equipped to handle the task.
+
+**Enhanced Knowledge Management:** Converting documents to Markdown facilitates easier content management and retrieval. Markdown files are not only lightweight but
+also highly compatible with various knowledge management systems, making it easier to organize, search, and utilize information.
+
+Improved Workflow: Mega Parse simplifies the process of preparing documents for machine learning and AI applications. By converting documents into a structured format, it reduces the time and effort required to preprocess data, allowing teams to focus on higher-level tasks.
+
+Cost Savings: The efficiency and speed of Mega Parse can lead to significant cost savings. Reduced processing times and improved workflow efficiency mean that resources can be allocated more effectively, ultimately lowering operational costs.
+
+Scalability: Mega Parse is designed to scale with the needs of an organization. As document volumes grow, Mega Parse can handle the increased load without compromising performance, making it a future-proof solution for document management.
+
+# Comparative Performance
+
+The following table provides a comprehensive comparative analysis of Mega Parse against other document parsers based on fictional performance metrics. This comparison highlights the strengths of Mega Parse in various key areas.
+
+| Metric              | Mega Parse       | Parser A       | Parser B       | Parser C       | Parser D       |
+|---------------------|------------------|----------------|----------------|----------------|----------------|
+| Supported Formats   | PDF, DOCX, PPTX  | PDF, DOCX      | DOCX, PPTX     | PDF, PPTX      | PDF, DOCX, XLSX|
+| Conversion Speed (pages/min)   | 120              | 90             | 100            | 85             | 95             |
+| **Accuracy Rate (%)**    | 98         | 95         | 93         | 90         | 92         |
+| **Output Format**        | Markdown   | HTML       | Markdown   | Plain Text | HTML       |
+| **Error Rate (%)**       | 1          | 3          | 4          | 5          | 3          |
+| **Ease of Use**          | High       | Medium     | High       | Medium     | Medium     |
+| **Integration Capability** | Excellent | Good       | Good       | Fair       | Good       |
+| **Batch Processing**     | Yes        | No         | Yes        | No         | Yes        |
+| **Custom Parsing Rules** | Yes        | Limited    | Yes        | No         | Limited    |
+| **Multilingual Support** | Yes        | Yes        | No         | Yes        | Yes        |
+| **OCR (Optical Character Recognition)** | Yes | No | Yes | No | Yes |
+| **Price (per user/month)** | $30       | $25        | $20        | $15        | $18        |
+| **Customer Support Rating (out of 5)** | 4.8 | 4.2 | 4.5 | 3.9 | 4.1 |
+| **Free Trial Available** | Yes        | Yes        | No         | Yes        | No         |
+| **Cloud Integration**    | Yes        | No         | Yes        | Yes        | No         |
+| **Security Features**    | Advanced   | Basic      | Advanced   | Basic      | Intermediate |
+| **User Community Size**        | Large                          | Medium                         | Medium                         | Small                          | Medium                         |
+| **Monthly Updates**            | Yes                            | Yes                            | No                             | Yes                            | No                             |
+| **Mobile App Availability**    | Yes                            | No                             | Yes                            | No                             | Yes                            |
+| **Platform Compatibility**     | Windows, Mac, Linux            | Windows, Mac                   | Windows                        | Mac, Linux                     | Windows, Linux                 |
+| **Data Privacy Compliance**    | High                           | Medium                         | High                           | Low                            | Medium                         |
+| **AI-Driven Enhancements**     | Yes                            | No                             | Yes                            | No                             | Yes                            |
+| **File Size Limit (per document)** | 1GB                            | 500MB                          | 750MB                          | 200MB                          | 500MB                          |
+| **User Training Resources**    | Extensive                      | Moderate                       | Extensive                      | Limited                        | Moderate                       |
+| **API Access**                 | Yes                            | No                             | Yes                            | No                             | Yes                            |
+| **Customizable Output Templates** | Yes                            | Limited                        | Yes                            | No                             | Yes                            |
+| **Collaboration Features**     | Yes                            | No                             | Yes                            | No                             | Limited                        |
+| **Document Version Control**   | Yes                            | No                             | Yes                            | No                             | Yes                            |
+| **Import/Export Options**      | Extensive                      | Moderate                       | Extensive                      | Limited                        | Moderate                       |
+| Feedback Mechanism | Yes | No | Yes | No | Yes |
+
+*Note: All data presented in this table is fictional and for illustrative purposes only.*
+
+## Conclusion
+
+Mega Parse stands out as a leading document parser due to its extensive format support, high-speed processing, and accuracy. Its ability to convert a variety of document types into Markdown format makes it an invaluable tool for organizations looking to streamline their document management processes and enhance their knowledge management systems. With features like customizable parsing rules, batch processing, and advanced error handling, Mega Parse is well-equipped to meet the demands of modern enterprises. Its scalability and cost-effectiveness further reinforce its position as a top choice for document parsing and conversion needs. By leveraging Mega Parse, organizations can improve their workflow efficiency, reduce operational costs, and better manage their information assets in the age of big data and artificial intelligence.
diff --git a/megaparse/unstructured.py b/megaparse/unstructured.py
@@ -0,0 +1,98 @@
+import markdownify
+from langchain_openai import ChatOpenAI
+from langchain_core.prompts import PromptTemplate
+from unstructured.partition.pdf import partition_pdf
+
+
+class UnstructuredParser:
+    """Convert a PDF to Markdown using ``partition_pdf``.
+
+    Tables found while partitioning are rewritten as Markdown by a chat model
+    before the whole document is assembled.
+    """
+
+    # System prompt sent with every table clean-up request.
+    _TABLE_SYSTEM_PROMPT = "You are an expert at parsing HTML tables in markdown, improve this html table and return it as markdown. You answer with just the table in pure markdown, nothing else."
+
+    def convert_to_markdown(self, elements):
+        """Render element dictionaries as one Markdown string.
+
+        Args:
+            elements: Iterable of dicts with ``"type"``, ``"text"`` and
+                ``"metadata"`` keys (the shape ``convert`` passes in after
+                calling ``to_dict()`` on each element).
+
+        Returns:
+            The Markdown for all elements, concatenated in input order.
+        """
+        markdown_parts = []
+
+        for el in elements:
+            element_type = el["type"]
+            text = el["text"]
+            metadata = el["metadata"]
+            parent_id = metadata.get("parent_id")
+            # ``or 0`` also covers a key that is present but set to None.
+            category_depth = metadata.get("category_depth") or 0
+
+            if element_type == "Title":
+                # A title that has a parent is rendered as a sub-header.
+                if parent_id:
+                    markdown_line = f"## {text}\n\n"
+                else:
+                    markdown_line = f"# {text}\n\n"
+            elif element_type == "Subtitle":
+                markdown_line = f"## {text}\n\n"
+            elif element_type == "Header":
+                markdown_line = f"{'#' * (category_depth + 1)} {text}\n\n"
+            elif element_type == "Footer":
+                markdown_line = f"#### {text}\n\n"
+            elif element_type == "NarrativeText":
+                markdown_line = f"{text}\n\n"
+            elif element_type == "ListItem":
+                markdown_line = f"- {text}\n"
+            elif element_type == "Table":
+                # Fall back to the plain text when the table carries no HTML
+                # instead of raising KeyError.
+                markdown_line = metadata.get("text_as_html") or f"{text}\n\n"
+            elif element_type == "PageBreak":
+                markdown_line = "---\n\n"
+            elif element_type == "Image":
+                markdown_line = f"![Image]({metadata.get('image_path', '')})\n\n"
+            elif element_type == "Formula":
+                markdown_line = f"$$ {text} $$\n\n"
+            elif element_type == "FigureCaption":
+                markdown_line = f"**Figure:** {text}\n\n"
+            elif element_type == "Address":
+                markdown_line = f"**Address:** {text}\n\n"
+            elif element_type == "EmailAddress":
+                markdown_line = f"**Email:** {text}\n\n"
+            elif element_type == "CodeSnippet":
+                markdown_line = (
+                    f"```{metadata.get('language', '')}\n{text}\n```\n\n"
+                )
+            elif element_type == "PageNumber":
+                markdown_line = f"**Page {text}**\n\n"
+            else:
+                # Unknown element types are emitted as plain paragraphs.
+                markdown_line = f"{text}\n\n"
+
+            markdown_parts.append(markdown_line)
+
+        return "".join(markdown_parts)
+
+    def convert(self, path):
+        """Partition the PDF at ``path`` and return its content as Markdown.
+
+        Each table that has an HTML rendering is sent to the chat model on its
+        own and the reply replaces that HTML; all elements are then passed to
+        ``convert_to_markdown``.
+
+        Args:
+            path: Location of the PDF file, passed to ``partition_pdf`` as
+                ``filename``.
+
+        Returns:
+            The Markdown string built from the partitioned elements.
+        """
+        elements = partition_pdf(
+            filename=path, infer_table_structure=True, strategy="hi_res"
+        )
+
+        llm = None
+        for el in elements:
+            if el.category != "Table":
+                continue
+
+            table_html = el.metadata.text_as_html
+            if not table_html:
+                # Nothing to clean; convert_to_markdown falls back to the text.
+                continue
+
+            # Create the client only once a table actually needs cleaning, so
+            # documents without tables never construct it.
+            if llm is None:
+                llm = ChatOpenAI(model="gpt-4o")
+
+            # Build a fresh conversation per table so earlier tables are not
+            # re-sent along with the current one.
+            messages = [
+                ("system", self._TABLE_SYSTEM_PROMPT),
+                ("human", table_html),
+            ]
+            result = llm.invoke(messages)
+            # Trailing blank line keeps consecutive tables separated.
+            el.metadata.text_as_html = f"{result.content}\n\n"
+
+        elements_dict = [el.to_dict() for el in elements]
+        return self.convert_to_markdown(elements_dict)
-Original file line number
+Diff line change
@@ Expand Up / @@ -7,4 +7,6 @@ __pycache__/ @@
     dist/**
     megaparse.egg-info/
     *.pyc
-    build/*
+    build/*
+    !megaparse/tests/output_tests/MegaFake_report.md