Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account-related emails.

Already on GitHub? Sign in to your account

style: enforce Python code style #101

Merged
merged 4 commits into from
Aug 7, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
19 changes: 18 additions & 1 deletion .github/workflows/compliance.yml
Original file line number Diff line number Diff line change
Expand Up @@ -47,5 +47,22 @@ jobs:
- uses: actions/checkout@v4
- name: Check license header
uses: apache/skywalking-eyes/[email protected]
- name: Check code style
- name: Check rust code style
run: cd python && make check-rust
- name: Set up Python ${{ matrix.python-version }}
uses: actions/setup-python@v5
with:
python-version: ${{ matrix.python-version }}
cache: pip
cache-dependency-path: pyproject.toml
- name: Install python linter dependencies
working-directory: ./python
run: |
make setup-env
source venv/bin/activate
pip install ruff mypy
- name: Run python linter
working-directory: ./python
run: |
source venv/bin/activate
make check-python
9 changes: 9 additions & 0 deletions python/Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -48,6 +48,15 @@ check-rust: ## Run check on Rust
$(info --- Check Rust format ---)
cargo fmt --all -- --check

.PHONY: check-python
check-python: ## Run check on Python
$(info --- Check Python format ---)
ruff format --check --diff .
$(info --- Check Python linting ---)
ruff check .
$(info --- Check Python typing ---)
mypy .

.PHONY: test-rust
test-rust: ## Run tests on Rust
$(info --- Run Rust tests ---)
Expand Down
2 changes: 1 addition & 1 deletion python/hudi/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,6 @@
# specific language governing permissions and limitations
# under the License.

from ._internal import __version__ as __version__
from ._internal import HudiFileSlice as HudiFileSlice
from ._internal import HudiTable as HudiTable
from ._internal import __version__ as __version__
21 changes: 6 additions & 15 deletions python/hudi/_internal.pyi
Original file line number Diff line number Diff line change
Expand Up @@ -15,13 +15,12 @@
# specific language governing permissions and limitations
# under the License.
from dataclasses import dataclass
from typing import Optional, Dict, List
from typing import Dict, List, Optional

import pyarrow
import pyarrow # type: ignore

__version__: str


@dataclass(init=False)
class HudiFileSlice:
file_group_id: str
Expand All @@ -33,24 +32,16 @@ class HudiFileSlice:

def base_file_relative_path(self) -> str: ...


@dataclass(init=False)
class HudiTable:

def __init__(
self,
table_uri: str,
options: Optional[Dict[str, str]] = None,
self,
table_uri: str,
options: Optional[Dict[str, str]] = None,
): ...

def get_schema(self) -> "pyarrow.Schema": ...

def split_file_slices(self, n: int) -> List[List[HudiFileSlice]]: ...

def get_file_slices(self) -> List[HudiFileSlice]: ...

def read_file_slice(self, base_file_relative_path) -> pyarrow.RecordBatch: ...

def read_file_slice(self, base_file_relative_path: str) -> pyarrow.RecordBatch: ...
def read_snapshot(self) -> List["pyarrow.RecordBatch"]: ...

def read_snapshot_as_of(self, timestamp: str) -> List["pyarrow.RecordBatch"]: ...
18 changes: 18 additions & 0 deletions python/pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -42,16 +42,34 @@ dependencies = [
optional-dependencies = { devel = [
"pytest",
"coverage",
"ruff==0.5.2",
"mypy==1.10.1",
] }

dynamic = ["version"]

[tool.maturin]
module-name = "hudi._internal"

[tool.ruff]
target-version = 'py38'
# Enable Pyflakes (`F`) and a subset of the pycodestyle (`E`) codes by default.
lint.select = [
"E4",
"E7",
"E9",
"F",
# isort
"I",
]
# don't ignore any rule unless it becomes imperative
lint.ignore = []
lint.isort.known-first-party = ["hudi"]

[tool.mypy]
files = "hudi/*.py"
exclude = "^tests"
strict = true

[tool.pytest.ini_options]
testpaths = [
Expand Down
130 changes: 97 additions & 33 deletions python/tests/test_table_read.py
Original file line number Diff line number Diff line change
Expand Up @@ -20,28 +20,49 @@

from hudi import HudiTable

PYARROW_LE_8_0_0 = tuple(int(s) for s in pa.__version__.split(".") if s.isnumeric()) < (8, 0, 0)
pytestmark = pytest.mark.skipif(PYARROW_LE_8_0_0, reason="hudi only supported if pyarrow >= 8.0.0")
PYARROW_LE_8_0_0 = tuple(int(s) for s in pa.__version__.split(".") if s.isnumeric()) < (
8,
0,
0,
)
pytestmark = pytest.mark.skipif(
PYARROW_LE_8_0_0, reason="hudi only supported if pyarrow >= 8.0.0"
)


def test_sample_table(get_sample_table):
table_path = get_sample_table
table = HudiTable(table_path)

assert table.get_schema().names == ['_hoodie_commit_time', '_hoodie_commit_seqno', '_hoodie_record_key',
'_hoodie_partition_path', '_hoodie_file_name', 'ts', 'uuid', 'rider', 'driver',
'fare', 'city']
assert table.get_schema().names == [
"_hoodie_commit_time",
"_hoodie_commit_seqno",
"_hoodie_record_key",
"_hoodie_partition_path",
"_hoodie_file_name",
"ts",
"uuid",
"rider",
"driver",
"fare",
"city",
]

file_slices = table.get_file_slices()
assert len(file_slices) == 5
assert set(f.commit_time for f in file_slices) == {'20240402123035233', '20240402144910683'}
assert set(f.commit_time for f in file_slices) == {
"20240402123035233",
"20240402144910683",
}
assert all(f.num_records == 1 for f in file_slices)
file_slice_paths = [f.base_file_relative_path() for f in file_slices]
assert set(file_slice_paths) == {'chennai/68d3c349-f621-4cd8-9e8b-c6dd8eb20d08-0_4-12-0_20240402123035233.parquet',
'san_francisco/d9082ffd-2eb1-4394-aefc-deb4a61ecc57-0_1-9-0_20240402123035233.parquet',
'san_francisco/780b8586-3ad0-48ef-a6a1-d2217845ce4a-0_0-8-0_20240402123035233.parquet',
'san_francisco/5a226868-2934-4f84-a16f-55124630c68d-0_0-7-24_20240402144910683.parquet',
'sao_paulo/ee915c68-d7f8-44f6-9759-e691add290d8-0_3-11-0_20240402123035233.parquet'}
assert set(file_slice_paths) == {
"chennai/68d3c349-f621-4cd8-9e8b-c6dd8eb20d08-0_4-12-0_20240402123035233.parquet",
"san_francisco/d9082ffd-2eb1-4394-aefc-deb4a61ecc57-0_1-9-0_20240402123035233.parquet",
"san_francisco/780b8586-3ad0-48ef-a6a1-d2217845ce4a-0_0-8-0_20240402123035233.parquet",
"san_francisco/5a226868-2934-4f84-a16f-55124630c68d-0_0-7-24_20240402144910683.parquet",
"sao_paulo/ee915c68-d7f8-44f6-9759-e691add290d8-0_3-11-0_20240402123035233.parquet",
}

batch = table.read_file_slice(file_slice_paths[0])
t = pa.Table.from_batches([batch])
Expand All @@ -54,28 +75,71 @@ def test_sample_table(get_sample_table):

batches = table.read_snapshot()
t = pa.Table.from_batches(batches).select([0, 5, 6, 9]).sort_by("ts")
assert t.to_pylist() == [{'_hoodie_commit_time': '20240402144910683', 'ts': 1695046462179,
'uuid': '9909a8b1-2d15-4d3d-8ec9-efc48c536a00', 'fare': 339.0},
{'_hoodie_commit_time': '20240402123035233', 'ts': 1695091554788,
'uuid': 'e96c4396-3fad-413a-a942-4cb36106d721', 'fare': 27.7},
{'_hoodie_commit_time': '20240402123035233', 'ts': 1695115999911,
'uuid': 'c8abbe79-8d89-47ea-b4ce-4d224bae5bfa', 'fare': 17.85},
{'_hoodie_commit_time': '20240402123035233', 'ts': 1695159649087,
'uuid': '334e26e9-8355-45cc-97c6-c31daf0df330', 'fare': 19.1},
{'_hoodie_commit_time': '20240402123035233', 'ts': 1695516137016,
'uuid': 'e3cf430c-889d-4015-bc98-59bdce1e530c', 'fare': 34.15}]
assert t.to_pylist() == [
{
"_hoodie_commit_time": "20240402144910683",
"ts": 1695046462179,
"uuid": "9909a8b1-2d15-4d3d-8ec9-efc48c536a00",
"fare": 339.0,
},
{
"_hoodie_commit_time": "20240402123035233",
"ts": 1695091554788,
"uuid": "e96c4396-3fad-413a-a942-4cb36106d721",
"fare": 27.7,
},
{
"_hoodie_commit_time": "20240402123035233",
"ts": 1695115999911,
"uuid": "c8abbe79-8d89-47ea-b4ce-4d224bae5bfa",
"fare": 17.85,
},
{
"_hoodie_commit_time": "20240402123035233",
"ts": 1695159649087,
"uuid": "334e26e9-8355-45cc-97c6-c31daf0df330",
"fare": 19.1,
},
{
"_hoodie_commit_time": "20240402123035233",
"ts": 1695516137016,
"uuid": "e3cf430c-889d-4015-bc98-59bdce1e530c",
"fare": 34.15,
},
]

table = HudiTable(table_path, {
"hoodie.read.as.of.timestamp": "20240402123035233"})
table = HudiTable(table_path, {"hoodie.read.as.of.timestamp": "20240402123035233"})
batches = table.read_snapshot()
t = pa.Table.from_batches(batches).select([0, 5, 6, 9]).sort_by("ts")
assert t.to_pylist() == [{'_hoodie_commit_time': '20240402123035233', 'ts': 1695046462179,
'uuid': '9909a8b1-2d15-4d3d-8ec9-efc48c536a00', 'fare': 33.9},
{'_hoodie_commit_time': '20240402123035233', 'ts': 1695091554788,
'uuid': 'e96c4396-3fad-413a-a942-4cb36106d721', 'fare': 27.7},
{'_hoodie_commit_time': '20240402123035233', 'ts': 1695115999911,
'uuid': 'c8abbe79-8d89-47ea-b4ce-4d224bae5bfa', 'fare': 17.85},
{'_hoodie_commit_time': '20240402123035233', 'ts': 1695159649087,
'uuid': '334e26e9-8355-45cc-97c6-c31daf0df330', 'fare': 19.1},
{'_hoodie_commit_time': '20240402123035233', 'ts': 1695516137016,
'uuid': 'e3cf430c-889d-4015-bc98-59bdce1e530c', 'fare': 34.15}]
assert t.to_pylist() == [
{
"_hoodie_commit_time": "20240402123035233",
"ts": 1695046462179,
"uuid": "9909a8b1-2d15-4d3d-8ec9-efc48c536a00",
"fare": 33.9,
},
{
"_hoodie_commit_time": "20240402123035233",
"ts": 1695091554788,
"uuid": "e96c4396-3fad-413a-a942-4cb36106d721",
"fare": 27.7,
},
{
"_hoodie_commit_time": "20240402123035233",
"ts": 1695115999911,
"uuid": "c8abbe79-8d89-47ea-b4ce-4d224bae5bfa",
"fare": 17.85,
},
{
"_hoodie_commit_time": "20240402123035233",
"ts": 1695159649087,
"uuid": "334e26e9-8355-45cc-97c6-c31daf0df330",
"fare": 19.1,
},
{
"_hoodie_commit_time": "20240402123035233",
"ts": 1695516137016,
"uuid": "e3cf430c-889d-4015-bc98-59bdce1e530c",
"fare": 34.15,
},
]
Loading