Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

add pdfium backend and set it as default image conversion backend ✨ #230

Merged
merged 11 commits into from
Nov 1, 2024
12 changes: 7 additions & 5 deletions camelot/backends/image_conversion.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,12 +7,14 @@

from .base import ConversionBackend
from .ghostscript_backend import GhostscriptBackend
from .pdfium_backend import PdfiumBackend
from .poppler_backend import PopplerBackend


BACKENDS: Dict[str, Type[ConversionBackend]] = {
"poppler": PopplerBackend,
"pdfium": PdfiumBackend,
"ghostscript": GhostscriptBackend,
"poppler": PopplerBackend,
}


Expand All @@ -23,13 +25,13 @@ class ImageConversionError(ValueError): # noqa D101
class ImageConversionBackend:
"""Classes the ImageConversionBackend backend."""

def __init__(self, backend: Any = "poppler", use_fallback: bool = True) -> None:
def __init__(self, backend: Any = "pdfium", use_fallback: bool = True) -> None:
"""Initialize the conversion backend .

Parameters
----------
backend : str, optional
Backend for image conversion, by default "poppler"
backend : Any, optional
Backend for image conversion, by default "pdfium"
use_fallback : bool, optional
Fallback to another backend if unavailable, by default True

Expand Down Expand Up @@ -90,7 +92,7 @@ def implements_convert():
if isinstance(backend, str):
if backend not in BACKENDS.keys():
raise NotImplementedError(
f"Unknown backend {backend!r} specified. Please use either 'poppler' or 'ghostscript'."
f"Unknown backend {backend!r} specified. Please use 'pdfium', 'poppler' or 'ghostscript'."
)

return BACKENDS[backend]()
Expand Down
43 changes: 43 additions & 0 deletions camelot/backends/pdfium_backend.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,43 @@
"""Creates a pdfium backend class to convert a pdf to a png file."""

from camelot.backends.base import ConversionBackend


PDFIUM_EXC = None

try:
import pypdfium2 as pdfium

except ModuleNotFoundError as e:
PDFIUM_EXC = e


class PdfiumBackend(ConversionBackend):
    """Image conversion backend that renders a PDF page to png with pypdfium2."""

    def installed(self) -> bool:  # noqa D102
        # PDFIUM_EXC only holds an exception when the module-level import of
        # pypdfium2 failed.
        return not PDFIUM_EXC

    def convert(self, pdf_path: str, png_path: str, resolution: int = 300) -> None:
        """Convert the first page of a PDF to png.

        Parameters
        ----------
        pdf_path : str
            Path where to read the pdf file.
        png_path : str
            Path where to save png file.
        resolution : int, optional
            Target resolution, by default 300. It is divided by 72 to obtain
            the scale factor handed to the pdfium renderer.

        Raises
        ------
        OSError
            Raise an error if pdfium is not installed
        """
        if not self.installed():
            raise OSError(f"pypdfium2 is not available: {PDFIUM_EXC!r}")
        doc = pdfium.PdfDocument(pdf_path)
        try:
            doc.init_forms()
            # Only page index 0 is rendered.
            image = doc[0].render(scale=resolution / 72).to_pil()
            try:
                image.save(png_path)
            finally:
                # Release the image buffer even if writing the png failed.
                image.close()
        finally:
            # Always release the native document handle, including on errors.
            doc.close()
2 changes: 1 addition & 1 deletion camelot/parsers/lattice.py
Original file line number Diff line number Diff line change
Expand Up @@ -98,7 +98,7 @@ def __init__(
iterations=0,
resolution=300,
use_fallback=True,
backend="ghostscript",
backend="pdfium",
**kwargs,
):
super().__init__("lattice")
Expand Down
6 changes: 5 additions & 1 deletion noxfile.py
Original file line number Diff line number Diff line change
Expand Up @@ -157,7 +157,11 @@ def mypy(session: Session) -> None:
session.run("mypy", f"--python-executable={sys.executable}", "noxfile.py")


base_requires = ["ghostscript>=0.7", "opencv-python-headless>=3.4.2.17"]
base_requires = [
"ghostscript>=0.7",
"opencv-python-headless>=3.4.2.17",
"pypdfium2>=4,<5",
]

plot_requires = [
"matplotlib>=2.2.3",
Expand Down
24 changes: 23 additions & 1 deletion poetry.lock

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

1 change: 1 addition & 0 deletions pypdf_table_extraction/backends/pdfium_backend.py
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
from camelot.backends.pdfium_backend import PdfiumBackend # noqa: F401
1 change: 1 addition & 0 deletions pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -77,6 +77,7 @@ pypdf_table_extraction = "pypdf_table_extraction.__main__:main"
[tool.poetry.group.base.dependencies]
ghostscript = "^0.7"
opencv-python-headless = "^4.7.0.68"
pypdfium2 = "^4"


[tool.poetry.group.plot.dependencies]
Expand Down
8 changes: 4 additions & 4 deletions tests/data.py
Original file line number Diff line number Diff line change
Expand Up @@ -2786,8 +2786,8 @@
"",
"",
"",
"Congress-",
"",
"Congress-",
"",
"Senator 36th",
"",
Expand Down Expand Up @@ -2817,20 +2817,20 @@
"",
"",
"1st Dist",
"", # Dist.
"Dist.",
"",
"",
"Dist.",
"", # Deeds
"Deeds",
"",
"Commission",
"",
"District #1",
"",
"ct #2",
"#3",
"Dist #4",
"",
"Dist #4",
"#5",
],
[
Expand Down
38 changes: 38 additions & 0 deletions tests/test_common.py
Original file line number Diff line number Diff line change
Expand Up @@ -35,6 +35,16 @@ def test_password(testdir):
assert_frame_equal(df, tables[0].df)


def test_repr_pdfium(testdir):
    """Check the reprs of a lattice parse of foo.pdf rendered with pdfium."""
    pdf_file = os.path.join(testdir, "foo.pdf")
    read_options = {"flavor": "lattice", "backend": "pdfium", "use_fallback": False}
    tables = camelot.read_pdf(pdf_file, **read_options)

    assert repr(tables) == "<TableList n=1>"
    first_table = tables[0]
    assert repr(first_table) == "<Table shape=(7, 7)>"
    assert repr(first_table.cells[0][0]) == "<Cell x1=121 y1=218 x2=165 y2=234>"


@skip_pdftopng
def test_repr_poppler(testdir):
filename = os.path.join(testdir, "foo.pdf")
Expand Down Expand Up @@ -62,6 +72,16 @@ def test_repr_ghostscript_custom_backend(testdir):
assert repr(tables[0].cells[0][0]) == "<Cell x1=120 y1=218 x2=165 y2=234>"


def test_url_pdfium():
    """Check the reprs of a lattice parse of a remote PDF rendered with pdfium."""
    read_options = {"flavor": "lattice", "backend": "pdfium", "use_fallback": False}
    tables = camelot.read_pdf(
        "https://camelot-py.readthedocs.io/en/master/_static/pdf/foo.pdf",
        **read_options,
    )

    assert repr(tables) == "<TableList n=1>"
    first_table = tables[0]
    assert repr(first_table) == "<Table shape=(7, 7)>"
    assert repr(first_table.cells[0][0]) == "<Cell x1=121 y1=218 x2=165 y2=234>"


@skip_pdftopng
def test_url_poppler():
url = "https://pypdf-table-extraction.readthedocs.io/en/latest/_static/pdf/foo.pdf"
Expand Down Expand Up @@ -89,6 +109,24 @@ def test_url_ghostscript_custom_backend(testdir):
assert repr(tables[0].cells[0][0]) == "<Cell x1=120 y1=218 x2=165 y2=234>"


def test_pages_pdfium():
    """Check that each page selection yields the same table with pdfium."""
    url = "https://camelot-py.readthedocs.io/en/master/_static/pdf/foo.pdf"

    # Same order as before: default pages, then "1-end", then "all".
    page_selections = ({}, {"pages": "1-end"}, {"pages": "all"})
    for selection in page_selections:
        tables = camelot.read_pdf(
            url, backend="pdfium", use_fallback=False, **selection
        )
        assert repr(tables) == "<TableList n=1>"
        assert repr(tables[0]) == "<Table shape=(7, 7)>"
        assert repr(tables[0].cells[0][0]) == "<Cell x1=121 y1=218 x2=165 y2=234>"


@skip_pdftopng
def test_pages_poppler():
url = "https://pypdf-table-extraction.readthedocs.io/en/latest/_static/pdf/foo.pdf"
Expand Down
31 changes: 30 additions & 1 deletion tests/test_errors.py
Original file line number Diff line number Diff line change
Expand Up @@ -123,7 +123,7 @@ def test_lattice_no_tables_on_page(testdir):


def test_lattice_unknown_backend(foo_pdf):
message = "Unknown backend 'mupdf' specified. Please use either 'poppler' or 'ghostscript'."
message = "Unknown backend 'mupdf' specified. Please use 'pdfium', 'poppler' or 'ghostscript'."
with pytest.raises(NotImplementedError, match=message):
tables = camelot.read_pdf(
foo_pdf, flavor="lattice", backend="mupdf", use_fallback=False
Expand All @@ -149,6 +149,35 @@ def test_invalid_url():
assert is_url(url) is False


def test_pdfium_backend_import_error(testdir):
    """Check the error text when read_pdf fails with pypdfium2 patched out."""
    pdf_file = os.path.join(testdir, "table_region.pdf")
    expected_fragment = "pypdfium2 is not available: "

    with mock.patch.dict(sys.modules, {"pypdfium2": None}):
        # NOTE(review): if read_pdf does not raise, nothing is asserted and
        # the test passes anyway -- confirm whether that is intended.
        try:
            camelot.read_pdf(
                pdf_file,
                flavor="lattice",
                backend="pdfium",
                use_fallback=False,
            )
        except Exception as err:
            print(err)
            assert expected_fragment in str(err)


def test_pdfium_backend_import_error_alternative(testdir):
    """Check that read_pdf returns a result with pypdfium2 patched in sys.modules."""
    filename = os.path.join(testdir, "table_region.pdf")
    with mock.patch.dict(sys.modules, {"pypdfium2": None}):
        tables = camelot.read_pdf(
            filename,
            flavor="lattice",
            backend="pdfium",
            use_fallback=False,
        )
        assert tables is not None


def test_ghostscript_backend_import_error(testdir):
filename = os.path.join(testdir, "table_region.pdf")
with mock.patch.dict(sys.modules, {"ghostscript": None}):
Expand Down
4 changes: 3 additions & 1 deletion tests/test_hybrid.py
Original file line number Diff line number Diff line change
Expand Up @@ -45,7 +45,9 @@ def test_hybrid_vertical_header(testdir):
df = pd.DataFrame(data_hybrid_vertical_headers)

filename = os.path.join(testdir, "vertical_header.pdf")
tables = camelot.read_pdf(filename, flavor="hybrid")
tables = camelot.read_pdf(
filename, flavor="hybrid", backend="pdfium", use_fallback=False
)
assert len(tables) == 1
assert_frame_equal(df, tables[0].df)

Expand Down
15 changes: 13 additions & 2 deletions tests/test_image_conversion_backend.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,7 @@ def patch_backends(monkeypatch):
{
"poppler": PopplerBackendError,
"ghostscript": GhostscriptBackendNoError,
"pdfdium": PdfiumBackendError,
},
raising=True,
)
Expand All @@ -30,6 +31,11 @@ def convert(self, pdf_path, png_path):
pass


class PdfiumBackendError:
    """Stand-in pdfium backend whose conversion always fails."""

    def convert(self, pdf_path, png_path):
        """Raise ValueError regardless of the paths given."""
        failure = ValueError("Image conversion failed")
        raise failure


def test_poppler_backend_error_when_no_use_fallback(patch_backends):
backend = ImageConversionBackend(backend="poppler", use_fallback=False)

Expand All @@ -44,17 +50,22 @@ def test_ghostscript_backend_when_use_fallback(patch_backends):


def test_ghostscript_backend_error_when_use_fallback(monkeypatch):
"""Use an image conversion backend and let it fallback to ghostscript.

Then capture the error message of the second backend (the fallback).
"""
backends = {
"pdfium": PdfiumBackendError,
"ghostscript": GhostscriptBackendError,
"poppler": PopplerBackendError,
}

monkeypatch.setattr(
"camelot.backends.image_conversion.BACKENDS", backends, raising=True
)
backend = ImageConversionBackend(backend="ghostscript")
backend = ImageConversionBackend(backend="pdfium")

message = "Image conversion failed with image conversion backend 'poppler'\n error: Image conversion failed"
message = "Image conversion failed with image conversion backend 'ghostscript'\n error: Image conversion failed"
with pytest.raises(ValueError, match=message):
backend.convert("foo", "bar")

Expand Down
1 change: 1 addition & 0 deletions tests/test_rename.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
def test_imports():

from camelot.backends.pdfium_backend import PdfiumBackend # noqa: F401
from camelot.backends.poppler_backend import PopplerBackend # noqa: F401
from camelot.handlers import Hybrid # noqa: F401
from camelot.handlers import Lattice # noqa: F401
Expand Down
Loading