pyjanitor-devs · Sabrina-Hassaim · Jan 25, 2025 · Jan 27, 2025 · Jan 27, 2025 · Jan 27, 2025
diff --git a/AUTHORS.md b/AUTHORS.md
@@ -114,3 +114,4 @@ Contributors
 - [@joranbeasley](https://github.com/joranbeasley) | [contributions](https://github.com/pyjanitor-devs/pyjanitor/issues?q=is%3Aclosed+mentions%joranbeasley)
 -[@kianmeng](https://github.com/kianmeng) | [contributions](https://github.com/pyjanitor-devs/pyjanitor/pull/1290#issue-1906020324)
 - [@lbeltrame](https://github.com/lbeltrame) | [contributions](https://github.com/pyjanitor-devs/pyjanitor/pull/1401)
+- [@Sabrina-Hassaim](https://github.com/Sabrina-Hassaim) | [contributions](https://github.com/pyjanitor-devs/pyjanitor/pull/1440)
diff --git a/janitor/io.py b/janitor/io.py
@@ -3,7 +3,9 @@
 import inspect
 import os
 import subprocess
+import tarfile
 import warnings
+import zipfile
 from collections import defaultdict
 from glob import glob
 from io import StringIO
@@ -689,3 +691,187 @@ def _object_to_dict(obj):
             data[key] = _object_to_dict(value)
         return data
     return obj
+
+
+#################################################################
+
+
+def read_archive(
+    file_path: str | os.PathLike,
+    extract_to_df: bool = True,
+    file_type: str | None = None,
+    selected_files: list[str] | None = None,
+) -> pd.DataFrame | list[pd.DataFrame] | list[str]:
+    """
+    Read an archive file (.zip, .tar, .tar.gz) and either list its
+    compatible files or load selected files into DataFrames.
+
+    Args:
+        file_path: The path to the archive file (a str or an os.PathLike
+            object such as pathlib.Path).
+        extract_to_df: Whether to read the selected files into DataFrames.
+            If False, only the compatible file names are returned.
+            Default is True.
+        file_type: Optional file type hint ('zip', 'tar', 'tar.gz').
+            If None, it will be inferred from the file extension.
+        selected_files: Names of the files to read without user
+            interaction. If None or empty, the user is prompted to choose.
+
+    Returns:
+        - If extract_to_df is True: a single DataFrame when one file is
+          read, or a list of DataFrames when several are read.
+        - If extract_to_df is False: the list of compatible file names
+          found in the archive.
+
+    Raises:
+        ValueError: If the archive format is unsupported or cannot be
+            inferred, or if the archive has no compatible files.
+    """
+    # Normalise path objects to str: the extension checks performed
+    # downstream rely on str.endswith. Anything else is passed through
+    # unchanged.
+    if isinstance(file_path, os.PathLike):
+        file_path = os.fspath(file_path)
+
+    file_type = file_type or _infer_file_type(file_path)
+
+    if file_type == "zip":
+        return _process_zip_archive(file_path, extract_to_df, selected_files)
+    if file_type in {"tar", "tar.gz"}:
+        return _process_tar_archive(file_path, extract_to_df, selected_files)
+    raise ValueError(
+        "Unsupported archive format. Supported formats are .zip, .tar, or .tar.gz."
+    )
+
+
+def _process_zip_archive(
+    file_path: str, extract_to_df: bool, selected_files: list[str] | None
+) -> pd.DataFrame | list[str]:
+    """List the compatible files of a ZIP archive, or load the chosen ones."""
+    with zipfile.ZipFile(file_path) as zf:
+        candidates = _list_compatible_files(zf.namelist())
+
+        if not extract_to_df:
+            # Listing mode: report the names without reading any member.
+            return candidates
+        # Reading happens inside the `with` so the archive is still open.
+        return _select_and_extract_from_zip(zf, candidates, selected_files)
+
+
+def _process_tar_archive(
+    file_path: str, extract_to_df: bool, selected_files: list[str] | None
+) -> pd.DataFrame | list[pd.DataFrame] | list[str]:
+    """
+    Process a TAR archive (plain or compressed).
+
+    Args:
+        file_path: Path to the TAR archive.
+        extract_to_df: If True, read the selected files into DataFrames;
+            if False, only list the compatible file names.
+        selected_files: Names of the members to read, or None.
+
+    Returns:
+        The compatible file names when extract_to_df is False, otherwise
+        whatever `_select_and_extract_from_tar` returns.
+    """
+    # "r:*" lets tarfile detect the compression itself. Guessing the mode
+    # from a ".gz" suffix could contradict an explicit `file_type` hint
+    # given for a path with a non-standard extension.
+    with tarfile.open(file_path, "r:*") as archive:
+        compatible_files = _list_compatible_files(archive.getnames())
+
+        if extract_to_df:
+            return _select_and_extract_from_tar(
+                archive, compatible_files, selected_files
+            )
+        return compatible_files
+
+
+def _select_and_extract_from_zip(
+    archive: zipfile.ZipFile,
+    compatible_files: list[str],
+    selected_files: list[str] | None,
+) -> pd.DataFrame | list[pd.DataFrame]:
+    """
+    Select and read specific files from a ZIP archive.
+
+    Args:
+        archive: The open ZIP archive to read from.
+        compatible_files: Names offered to the user when no selection
+            is supplied.
+        selected_files: Names of the members to read. If None or empty,
+            the user is prompted to choose from `compatible_files`.
+
+    Returns:
+        A single DataFrame when exactly one file was read, otherwise a
+        list with one DataFrame per file read. Selected members that are
+        neither .csv nor .xlsx are skipped.
+
+    Raises:
+        ValueError: If none of the selected files is a .csv or .xlsx file.
+    """
+    if not selected_files:
+        selected_files = _select_files_interactively(compatible_files)
+
+    dfs = []
+    for selected_file in selected_files:
+        with archive.open(selected_file) as file:
+            if selected_file.endswith(".csv"):
+                dfs.append(pd.read_csv(file))
+            elif selected_file.endswith(".xlsx"):
+                dfs.append(pd.read_excel(file))
+
+    if not dfs:
+        # Without this guard the final `dfs[0]` raised a bare IndexError.
+        raise ValueError(
+            "None of the selected files could be read; "
+            "only .csv and .xlsx files are supported."
+        )
+    return dfs if len(dfs) > 1 else dfs[0]
+
+
+def _select_and_extract_from_tar(
+    archive: tarfile.TarFile,
+    compatible_files: list[str],
+    selected_files: list[str] | None,
+) -> pd.DataFrame | list[pd.DataFrame]:
+    """
+    Select and read specific files from a TAR archive.
+
+    Args:
+        archive: The open TAR archive to read from.
+        compatible_files: Names offered to the user when no selection
+            is supplied.
+        selected_files: Names of the members to read. If None or empty,
+            the user is prompted to choose from `compatible_files`.
+
+    Returns:
+        A single DataFrame when exactly one file was read, otherwise a
+        list with one DataFrame per file read. Selected members that are
+        neither .csv nor .xlsx are skipped.
+
+    Raises:
+        ValueError: If a selected member is not a regular file, or if
+            none of the selected files is a .csv or .xlsx file.
+    """
+    if not selected_files:
+        selected_files = _select_files_interactively(compatible_files)
+
+    dfs = []
+    for selected_file in selected_files:
+        member = archive.getmember(selected_file)
+        file = archive.extractfile(member)
+        if file is None:
+            # extractfile() returns None for members that are neither a
+            # regular file nor a link (e.g. a directory entry).
+            raise ValueError(
+                f"{selected_file!r} is not a regular file in the archive."
+            )
+        with file:
+            if selected_file.endswith(".csv"):
+                dfs.append(pd.read_csv(file))
+            elif selected_file.endswith(".xlsx"):
+                dfs.append(pd.read_excel(file))
+
+    if not dfs:
+        # Without this guard the final `dfs[0]` raised a bare IndexError.
+        raise ValueError(
+            "None of the selected files could be read; "
+            "only .csv and .xlsx files are supported."
+        )
+    return dfs if len(dfs) > 1 else dfs[0]
+
+
+def _select_files_interactively(compatible_files: list[str]) -> list[str]:
+    """
+    Allow the user to select files from a list interactively.
+
+    The files are printed as a 1-based numbered menu and the answer is
+    read with `input()`. Entries that are not a number within the menu
+    range are ignored.
+
+    Args:
+        compatible_files: List of compatible file names.
+
+    Returns:
+        List of selected file names, in the order the user entered them.
+
+    Raises:
+        ValueError: If the answer contains no valid menu number.
+    """
+    print("Compatible files found in the archive:")
+    for idx, file_name in enumerate(compatible_files, 1):
+        print(f"{idx}. {file_name}")
+
+    answer = input(
+        "Enter the numbers of the files to read, "
+        "separated by commas (e.g., 1,2,3): "
+    )
+
+    selected_files = []
+    for raw_choice in answer.strip().split(","):
+        choice = raw_choice.strip()
+        # isdecimal() rather than isdigit(): isdigit() also accepts
+        # characters such as "²" that int() cannot parse, which would
+        # crash here instead of being ignored like other bad entries.
+        if choice.isdecimal() and 0 < int(choice) <= len(compatible_files):
+            selected_files.append(compatible_files[int(choice) - 1])
+
+    if not selected_files:
+        raise ValueError("No valid files selected.")
+    return selected_files
+
+
+def _list_compatible_files(file_names: list[str]) -> list[str]:
+    """
+    Keep only the archive members that can be loaded (.csv or .xlsx).
+
+    Args:
+        file_names: Every member name reported by the archive.
+
+    Returns:
+        The names ending in .csv or .xlsx, in their original order.
+
+    Raises:
+        ValueError: If no name has a supported extension.
+    """
+    supported_suffixes = (".csv", ".xlsx")
+    matches = []
+    for name in file_names:
+        if name.endswith(supported_suffixes):
+            matches.append(name)
+
+    # The detected names are echoed before the emptiness check, so an
+    # empty list is printed as well.
+    print("Compatible files detected :", matches)
+    if matches:
+        return matches
+    raise ValueError("No compatible files found in the archive.")
+
+
+def _infer_file_type(file_path: str) -> str:
+    """
+    Infer the type of the archive based on the file extension.
+
+    The comparison is case-insensitive, and '.tgz' is accepted as the
+    short form of '.tar.gz'.
+
+    Args:
+        file_path: Path to the file.
+
+    Returns:
+        A string representing the archive type ('zip', 'tar', 'tar.gz').
+
+    Raises:
+        ValueError: If the file extension is unsupported.
+    """
+    lowered = file_path.lower()
+    if lowered.endswith(".zip"):
+        return "zip"
+    if lowered.endswith((".tar.gz", ".tgz")):
+        return "tar.gz"
+    if lowered.endswith(".tar"):
+        return "tar"
+    raise ValueError(
+        "Cannot infer file type from the file extension. "
+        "Please specify the 'file_type' parameter."
+    )
diff --git a/tests/io/test_read_archive.py b/tests/io/test_read_archive.py
@@ -0,0 +1,151 @@
+import io
+import tarfile
+import zipfile
+from unittest.mock import patch
+
+import pandas as pd
+import pytest
+
+from janitor.io import (
+    _infer_file_type,
+    read_archive,
+)
+
+
+# Fixtures for creating test archives
+@pytest.fixture
+def dummy_zip_file(tmp_path):
+    """Build `dummy.zip` under tmp_path with two small CSV members."""
+    archive_path = tmp_path / "dummy.zip"
+    members = {
+        "file1.csv": "col1,col2\n1,2\n3,4",
+        "file2.csv": "col3,col4\n5,6\n7,8",
+    }
+    with zipfile.ZipFile(archive_path, mode="w") as archive:
+        for member_name, csv_text in members.items():
+            archive.writestr(member_name, csv_text)
+    return archive_path
+
+
+@pytest.fixture
+def dummy_tar_file(tmp_path):
+    """Build a gzip-compressed `dummy.tar.gz` holding two CSV members."""
+    archive_path = tmp_path / "dummy.tar.gz"
+    members = (
+        ("file1.csv", b"col1,col2\n1,2\n3,4"),
+        ("file2.csv", b"col3,col4\n5,6\n7,8"),
+    )
+    with tarfile.open(archive_path, mode="w:gz") as archive:
+        for member_name, payload in members:
+            header = tarfile.TarInfo(name=member_name)
+            # The header has to carry the payload length explicitly.
+            header.size = len(payload)
+            archive.addfile(header, io.BytesIO(payload))
+    return archive_path
+
+
+# Tests for reading archives via `read_archive`
+def test_read_zip_archive(dummy_zip_file):
+    """Loading one named member of a ZIP archive yields its DataFrame."""
+    frame = read_archive(
+        str(dummy_zip_file),
+        extract_to_df=True,
+        selected_files=["file1.csv"],
+    )
+    assert isinstance(frame, pd.DataFrame)
+    assert frame.shape == (2, 2)
+    assert list(frame.columns) == ["col1", "col2"]
+
+
+def test_list_files_in_zip(dummy_zip_file):
+    """With extract_to_df=False the ZIP member names come back as a list."""
+    listing = read_archive(str(dummy_zip_file), extract_to_df=False)
+    assert isinstance(listing, list)
+    for expected_name in ("file1.csv", "file2.csv"):
+        assert expected_name in listing
+
+
+def test_no_compatible_files_in_zip(tmp_path):
+    """A ZIP archive holding only a .txt member is rejected."""
+    archive_path = tmp_path / "empty.zip"
+    with zipfile.ZipFile(archive_path, mode="w") as archive:
+        archive.writestr("file1.txt", "Just some text")
+
+    expected_error = "No compatible files found in the archive"
+    with pytest.raises(ValueError, match=expected_error):
+        read_archive(str(archive_path))
+
+
+def test_read_tar_archive(dummy_tar_file):
+    """Loading one named member of a TAR archive yields its DataFrame."""
+    frame = read_archive(
+        str(dummy_tar_file),
+        extract_to_df=True,
+        selected_files=["file1.csv"],
+    )
+    assert isinstance(frame, pd.DataFrame)
+    assert frame.shape == (2, 2)
+    assert list(frame.columns) == ["col1", "col2"]
+
+
+def test_list_files_in_tar(dummy_tar_file):
+    """With extract_to_df=False the TAR member names come back as a list."""
+    listing = read_archive(str(dummy_tar_file), extract_to_df=False)
+    assert isinstance(listing, list)
+    for expected_name in ("file1.csv", "file2.csv"):
+        assert expected_name in listing
+
+
+def test_no_compatible_files_in_tar(tmp_path):
+    """A TAR archive holding only a .txt member is rejected."""
+    archive_path = tmp_path / "invalid.tar.gz"
+    payload = b"Just some text"
+    with tarfile.open(archive_path, mode="w:gz") as archive:
+        header = tarfile.TarInfo(name="file1.txt")
+        header.size = len(payload)
+        archive.addfile(header, io.BytesIO(payload))
+
+    expected_error = "No compatible files found in the archive"
+    with pytest.raises(ValueError, match=expected_error):
+        read_archive(str(archive_path))
+
+
+# Tests for unsupported file types
+def test_read_archive_unsupported_file():
+    """An unknown extension cannot be mapped to an archive type."""
+    expected_error = (
+        "Cannot infer file type from the file extension. "
+        "Please specify the 'file_type' parameter."
+    )
+    with pytest.raises(ValueError, match=expected_error):
+        read_archive("test.unsupported")
+
+
+def test_read_archive_no_extension():
+    """A path without any extension cannot be mapped to an archive type."""
+    expected_error = (
+        "Cannot infer file type from the file extension. "
+        "Please specify the 'file_type' parameter."
+    )
+    with pytest.raises(ValueError, match=expected_error):
+        read_archive("testfile")
+
+
+# Tests for interactive file selection
+def test_interactive_file_selection_valid(dummy_zip_file):
+    """Test valid input for interactive file selection."""
+    user_input = "1,2"
+    # The prompt is only reached with extract_to_df=True and no
+    # selected_files; with extract_to_df=False input() is never called
+    # and the interactive path would go untested.
+    with patch("builtins.input", return_value=user_input) as mock_input:
+        result = read_archive(str(dummy_zip_file), extract_to_df=True)
+
+    mock_input.assert_called_once()
+    assert isinstance(result, list)
+    assert [list(df.columns) for df in result] == [
+        ["col1", "col2"],
+        ["col3", "col4"],
+    ]
+
+
+# Tests for file type inference
+def test_infer_file_type_valid():
+    """Each supported extension maps to its archive type."""
+    expected_types = {
+        "test.zip": "zip",
+        "test.tar": "tar",
+        "test.tar.gz": "tar.gz",
+    }
+    for path, archive_type in expected_types.items():
+        assert _infer_file_type(path) == archive_type
+
+
+def test_infer_file_type_invalid():
+    """A path with no recognised extension makes inference fail."""
+    expected_error = (
+        "Cannot infer file type from the file extension. "
+        "Please specify the 'file_type' parameter."
+    )
+    with pytest.raises(ValueError, match=expected_error):
+        _infer_file_type("testfile")