Skip to content

Commit

Permalink
feat: Add support for Unstructured Parser, improve Table and Image Pa…
Browse files Browse the repository at this point in the history
…rsing, and add TOC and Hyperlinks for Docx (#9)
  • Loading branch information
StanGirard authored Jun 2, 2024
1 parent 08691af commit 4934776
Show file tree
Hide file tree
Showing 8 changed files with 735 additions and 198 deletions.
4 changes: 3 additions & 1 deletion .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -7,4 +7,6 @@ __pycache__/
dist/**
megaparse.egg-info/
*.pyc
build/*
build/*

!megaparse/tests/output_tests/MegaFake_report.md
6 changes: 6 additions & 0 deletions Pipfile
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,12 @@ pdf2docx = "*"
unstructured = {extras = ["pdf"], version = "*"}
langchain = "*"
langchain-community = "*"
llama-index = "*"
pytesseract = "*"
poppler-utils = "*"
markdownify = "*"
langchain-openai = "*"
langchain-core = "*"

[dev-packages]
ipykernel = "*"
Expand Down
460 changes: 310 additions & 150 deletions Pipfile.lock

Large diffs are not rendered by default.

30 changes: 20 additions & 10 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -31,30 +31,40 @@ pip install megaparse

## Usage

1. Create an account on [Llama Cloud](https://cloud.llamaindex.ai/) and get your API key.
```python
from megaparse import MegaParse

2. Create a new file in the root directory of the project and name it `.env`.
megaparse = MegaParse(file_path="./test.pdf")
content = megaparse.convert()
print(content)
megaparse.save_md(content, "./test.md")
```

3. Add the following line to the `.env` file and replace `llx-your_api_key` with your actual API key.
### Use LlamaParse

```bash
LLAMA_CLOUD_API_KEY=llx-your_api_key
```
1. Create an account on [Llama Cloud](https://cloud.llamaindex.ai/) and get your API key.

4. Now you can use the following code to convert a PDF to Markdown and save it to a file.
2. Instantiate `MegaParse` with the `llama_parse_api_key` parameter.

```python
from megaparse import MegaParse

megaparse = MegaParse(file_path="./test.pdf")
megaparse = MegaParse(file_path="./test.pdf", llama_parse_api_key="llx-your_api_key")
content = megaparse.convert()
print(content)
megaparse.save_md(content, "./test.md")
```

## Benchmark

- **Diff megaparse unstructured:** 120
- **Diff llama parse:** 31
- **Diff megaparse llama:** 26


*Lower is better*

## Next Steps

- [ ] Add Unstructured Parser Support
- [ ] Improve Table Parsing
- [ ] Improve Image Parsing and description
- [ ] Add TOC for Docx
Expand Down
36 changes: 28 additions & 8 deletions megaparse/Converter.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,11 @@
from llama_parse.utils import ResultType, Language
from llama_index.core.schema import Document as LlamaDocument
from megaparse.markdown_processor import MarkdownProcessor
from megaparse.unstructured import UnstructuredParser
from pathlib import Path
from llama_index.core import download_loader
from unstructured.partition.auto import partition


import nest_asyncio

Expand Down Expand Up @@ -201,12 +206,16 @@ def save_md(self, md_content: str, file_path: Path | str) -> None:

class PDFConverter:
def __init__(
self, api_key: str, handle_pagination: bool = True, handle_header: bool = True
self,
llama_parse_api_key: str,
handle_pagination: bool = True,
handle_header: bool = True,
) -> None:
self.handle_pagination = handle_pagination
self.handle_header = handle_header
self.api_key = api_key
self.llama_parse_api_key = llama_parse_api_key

def _llama_parse(self, api_key: str, file_path: str):
parsing_instructions = "Do not take into account the page breaks (no --- between pages), do not repeat the header and the footer so the tables are merged. Keep the same format for similar tables."
self.parser = LlamaParse(
api_key=str(api_key),
Expand All @@ -216,10 +225,20 @@ def __init__(
language=Language.FRENCH,
parsing_instruction=parsing_instructions, # Optionally you can define a parsing instruction
)

def convert(self, file_path: str) -> str:
documents: List[LlamaDocument] = self.parser.load_data(file_path)
parsed_md = documents[0].get_content()
return parsed_md

def _unstructured_parse(self, file_path: str):
unstructured_parser = UnstructuredParser()
return unstructured_parser.convert(file_path)

def convert(self, file_path: str) -> str:
parsed_md = ""
if self.llama_parse_api_key:
parsed_md = self._llama_parse(self.llama_parse_api_key, file_path)
else:
parsed_md = self._unstructured_parse(file_path)

if not (self.handle_pagination or self.handle_header):
return parsed_md
Expand All @@ -238,9 +257,9 @@ def save_md(self, md_content: str, file_path: Path | str) -> None:


class MegaParse:
def __init__(self, file_path: str) -> None:
def __init__(self, file_path: str, llama_parse_api_key: str | None = None) -> None:
self.file_path = file_path
self.api_key = os.getenv("LLAMA_CLOUD_API_KEY")
self.llama_parse_api_key = llama_parse_api_key

def convert(self) -> str:
file_extension: str = os.path.splitext(self.file_path)[1]
Expand All @@ -254,12 +273,13 @@ def convert(self) -> str:
file_path=self.file_path, file_extension=file_extension
)
elif file_extension == ".pdf":
converter = PDFConverter(api_key=self.api_key)
converter = PDFConverter(llama_parse_api_key=self.llama_parse_api_key)
else:
print(self.file_path, file_extension)
raise ValueError(f"Unsupported file extension: {file_extension}")
return converter.convert(self.file_path)

def save_md(self, md_content: str, file_path: Path | str) -> None:
with open(file_path, "w") as f:
os.makedirs(os.path.dirname(file_path), exist_ok=True)
with open(file_path, "w+") as f:
f.write(md_content)
86 changes: 86 additions & 0 deletions megaparse/tests/output_tests/MegaFake_report.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,86 @@
| My Mega fake report | #1756394 | 31/05/2024 |
|---------------------|----------|------------|

## Why Mega Parse might be the best ?

### Introduction

Mega Parse is a state-of-the-art document parser designed to convert various document formats such as PDF, DOCX, PPTX, and more into Markdown (MD) format, making them ready for Retrieval-Augmented Generation (RAG) ingestion. In today's data-driven world, the ability to efficiently manage and utilize large volumes of information is crucial. This report explores the features, benefits, and comparative performance of Mega Parse, illustrating why it stands out as a superior tool in the realm of document parsing.

### Features of Mega Parse

Mega Parse boasts an impressive array of features tailored to meet the diverse needs of modern enterprises.

**Multiple Format Support:** Mega Parse supports a wide range of document formats including PDF, DOCX, and PPTX. This versatility allows users to handle various document types without needing multiple tools. Whether you are working with text documents, presentations, or scanned PDFs, Mega Parse has you covered.

**High-Speed Processing:** One of the standout features of Mega Parse is its ability to convert documents at a rapid pace. With processing speeds of up to 120 pages per minute, it significantly enhances productivity by reducing the time spent on document conversion.

**Markdown Output:** Mega Parse converts documents into a structured Markdown format. Markdown is a lightweight markup language with plain text formatting syntax, which is widely used because of its simplicity and ease of conversion to other formats. This makes it ideal for RAG ingestion, where structured and easily interpretable data is paramount.

Accuracy: Accuracy in text extraction and formatting is a critical aspect of any document parser. Mega Parse ensures high accuracy, maintaining the integrity and structure of the original documents. This is particularly important for documents that contain complex formatting and embedded elements.

Customizable Parsing Rules: Users can define custom parsing rules to suit specific needs, allowing for greater control over the conversion process. This flexibility ensures that Mega Parse can be adapted to a wide variety of use cases.

Batch Processing: Mega Parse supports batch processing, enabling the simultaneous conversion of multiple documents. This feature is particularly useful for organizations dealing with large volumes of documents, as it streamlines the workflow and saves time.

Error Handling: Advanced error handling capabilities ensure that any issues encountered during the conversion process are managed effectively, minimizing disruptions and maintaining workflow efficiency.

# Benefits of Mega Parse

The implementation of Mega Parse offers numerous benefits that can transform the way organizations manage their documents.

**Efficiency:** By significantly speeding up the document conversion process, Mega Parse increases overall efficiency. This is especially beneficial for industries that handle large volumes of documents on a daily basis, such as legal firms, financial institutions, and research organizations.

**Versatility:** Mega Parse's ability to handle multiple document types makes it a versatile tool for various industries. Whether you need to convert legal documents, technical manuals, or business presentations, Mega Parse is equipped to handle the task.

**Enhanced Knowledge Management:** Converting documents to Markdown facilitates easier content management and retrieval. Markdown files are not only lightweight but
also highly compatible with various knowledge management systems, making it easier to organize, search, and utilize information.

Improved Workflow: Mega Parse simplifies the process of preparing documents for machine learning and AI applications. By converting documents into a structured format, it reduces the time and effort required to preprocess data, allowing teams to focus on higher-level tasks.

Cost Savings: The efficiency and speed of Mega Parse can lead to significant cost savings. Reduced processing times and improved workflow efficiency mean that resources can be allocated more effectively, ultimately lowering operational costs.

Scalability: Mega Parse is designed to scale with the needs of an organization. As document volumes grow, Mega Parse can handle the increased load without compromising performance, making it a future-proof solution for document management.

# Comparative Performance

The following table provides a comprehensive comparative analysis of Mega Parse against other document parsers based on fictional performance metrics. This comparison highlights the strengths of Mega Parse in various key areas.

| Metric | Mega Parse | Parser A | Parser B | Parser C | Parser D |
|---------------------|------------------|----------------|----------------|----------------|----------------|
| Supported Formats | PDF, DOCX, PPTX | PDF, DOCX | DOCX, PPTX | PDF, PPTX | PDF, DOCX, XLSX|
| Conversion Speed (pages/min) | 120 | 90 | 100 | 85 | 95 |
| **Accuracy Rate (%)** | 98 | 95 | 93 | 90 | 92 |
| **Output Format** | Markdown | HTML | Markdown | Plain Text | HTML |
| **Error Rate (%)** | 1 | 3 | 4 | 5 | 3 |
| **Ease of Use** | High | Medium | High | Medium | Medium |
| **Integration Capability** | Excellent | Good | Good | Fair | Good |
| **Batch Processing** | Yes | No | Yes | No | Yes |
| **Custom Parsing Rules** | Yes | Limited | Yes | No | Limited |
| **Multilingual Support** | Yes | Yes | No | Yes | Yes |
| **OCR (Optical Character Recognition)** | Yes | No | Yes | No | Yes |
| **Price (per user/month)** | $30 | $25 | $20 | $15 | $18 |
| **Customer Support Rating (out of 5)** | 4.8 | 4.2 | 4.5 | 3.9 | 4.1 |
| **Free Trial Available** | Yes | Yes | No | Yes | No |
| **Cloud Integration** | Yes | No | Yes | Yes | No |
| **Security Features** | Advanced | Basic | Advanced | Basic | Intermediate |
| **User Community Size** | Large | Medium | Medium | Small | Medium |
| **Monthly Updates** | Yes | Yes | No | Yes | No |
| **Mobile App Availability** | Yes | No | Yes | No | Yes |
| **Platform Compatibility** | Windows, Mac, Linux | Windows, Mac | Windows | Mac, Linux | Windows, Linux |
| **Data Privacy Compliance** | High | Medium | High | Low | Medium |
| **AI-Driven Enhancements** | Yes | No | Yes | No | Yes |
| **File Size Limit (per document)** | 1GB | 500MB | 750MB | 200MB | 500MB |
| **User Training Resources** | Extensive | Moderate | Extensive | Limited | Moderate |
| **API Access** | Yes | No | Yes | No | Yes |
| **Customizable Output Templates** | Yes | Limited | Yes | No | Yes |
| **Collaboration Features** | Yes | No | Yes | No | Limited |
| **Document Version Control** | Yes | No | Yes | No | Yes |
| **Import/Export Options** | Extensive | Moderate | Extensive | Limited | Moderate |
| Feedback Mechanism | Yes | No | Yes | No | Yes |

*Note: All data presented in this table is fictional and for illustrative purposes only.*

## Conclusion

Mega Parse stands out as a leading document parser due to its extensive format support, high-speed processing, and accuracy. Its ability to convert a variety of document types into Markdown format makes it an invaluable tool for organizations looking to streamline their document management processes and enhance their knowledge management systems. With features like customizable parsing rules, batch processing, and advanced error handling, Mega Parse is well-equipped to meet the demands of modern enterprises. Its scalability and cost-effectiveness further reinforce its position as a top choice for document parsing and conversion needs. By leveraging Mega Parse, organizations can improve their workflow efficiency, reduce operational costs, and better manage their information assets in the age of big data and artificial intelligence.
98 changes: 98 additions & 0 deletions megaparse/unstructured.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,98 @@
import markdownify
from langchain_openai import ChatOpenAI
from langchain_core.prompts import PromptTemplate
from unstructured.partition.pdf import partition_pdf


class UnstructuredParser:
    """Convert a PDF to Markdown via ``unstructured`` partitioning.

    ``convert`` partitions the file, asks an OpenAI chat model to rewrite each
    extracted HTML table as Markdown, and then renders every element with
    ``convert_to_markdown``.
    """

    # Chat model used to clean up extracted tables.
    _TABLE_MODEL = "gpt-4o"

    # System prompt sent with every table clean-up request.
    _TABLE_SYSTEM_PROMPT = (
        "You are an expert at parsing HTML tables in markdown, improve this "
        "html table and return it as markdown. You answer with just the table "
        "in pure markdown, nothing else."
    )

    # Element types whose rendering only depends on the element text.
    _TEXT_TEMPLATES = {
        "Subtitle": "## {text}\n\n",
        "Footer": "#### {text}\n\n",
        "NarrativeText": "{text}\n\n",
        "ListItem": "- {text}\n",
        "PageBreak": "---\n\n",
        "Formula": "$$ {text} $$\n\n",
        "FigureCaption": "**Figure:** {text}\n\n",
        "Address": "**Address:** {text}\n\n",
        "EmailAddress": "**Email:** {text}\n\n",
        "PageNumber": "**Page {text}**\n\n",
    }

    # Fallback used for any element type not handled explicitly.
    _DEFAULT_TEMPLATE = "{text}\n\n"

    def convert_to_markdown(self, elements):
        """Render element dictionaries as a single Markdown string.

        Args:
            elements: Iterable of dicts with ``"type"``, ``"text"`` and
                ``"metadata"`` keys (``convert`` passes the result of
                ``to_dict()`` for each partitioned element).

        Returns:
            The concatenated Markdown; an empty string when ``elements`` is
            empty.
        """
        return "".join(self._element_to_markdown(el) for el in elements)

    def _element_to_markdown(self, el):
        """Return the Markdown fragment for one element dictionary."""
        element_type = el["type"]
        text = el["text"]
        metadata = el["metadata"]

        if element_type == "Title":
            # A title that has a parent is rendered one level deeper.
            prefix = "##" if metadata.get("parent_id") else "#"
            return f"{prefix} {text}\n\n"
        if element_type == "Header":
            # Treat a missing or null depth as the top level.
            depth = metadata.get("category_depth") or 0
            return f"{'#' * (depth + 1)} {text}\n\n"
        if element_type == "Table":
            # Fall back to the plain text when no HTML rendering is available
            # instead of failing on the missing key.
            return metadata.get("text_as_html") or f"{text}\n\n"
        if element_type == "Image":
            return f"![Image]({metadata.get('image_path', '')})\n\n"
        if element_type == "CodeSnippet":
            return f"```{metadata.get('language', '')}\n{text}\n```\n\n"

        template = self._TEXT_TEMPLATES.get(element_type, self._DEFAULT_TEMPLATE)
        return template.format(text=text)

    @classmethod
    def _table_messages(cls, table_html):
        """Build a fresh chat message list for a single table.

        A new list is returned on every call so one table's request never
        carries the HTML of previously processed tables.
        """
        return [("system", cls._TABLE_SYSTEM_PROMPT), ("human", table_html)]

    def convert(self, path):
        """Partition the PDF at ``path`` and return its Markdown rendering.

        Args:
            path: Path of the PDF file, forwarded to ``partition_pdf`` as
                ``filename``.

        Returns:
            The Markdown produced by ``convert_to_markdown`` after every table
            with an HTML rendering has been rewritten by the chat model.
        """
        elements = partition_pdf(
            filename=path, infer_table_structure=True, strategy="hi_res"
        )

        llm = None
        for el in elements:
            if el.category != "Table":
                continue
            table_html = el.metadata.text_as_html
            if not table_html:
                # Nothing to improve; convert_to_markdown falls back to text.
                continue
            if llm is None:
                # Created lazily so the client is only built when a table
                # actually needs it.
                llm = ChatOpenAI(model=self._TABLE_MODEL)
            result = llm.invoke(self._table_messages(table_html))
            # Trailing blank line keeps consecutive tables separated.
            el.metadata.text_as_html = result.content + "\n\n"

        return self.convert_to_markdown([el.to_dict() for el in elements])
Loading

0 comments on commit 4934776

Please sign in to comment.