Skip to content

Commit

Permalink
test: Add CodSpeed to follow lib performance
Browse files Browse the repository at this point in the history
  • Loading branch information
clemlesne committed Feb 8, 2025
1 parent 2d03ec8 commit c4b51c3
Show file tree
Hide file tree
Showing 13 changed files with 95 additions and 10 deletions.
7 changes: 6 additions & 1 deletion .github/workflows/pipeline.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,8 @@ on:
- feat/*
- hotfix/*
- main
# Allows CodSpeed to trigger backtest performance analysis in order to generate initial data
workflow_dispatch:

jobs:
init:
Expand Down Expand Up @@ -104,7 +106,10 @@ jobs:
run: echo "${{ secrets.DOTENV_UNIT_TESTS }}" > .env

- name: Run tests
run: make test-${{ matrix.step }} version_full=${{ needs.init.outputs.VERSION_FULL }}
uses: CodSpeedHQ/[email protected]
with:
token: ${{ secrets.CODSPEED_TOKEN }}
run: make test-${{ matrix.step }} version_full=${{ needs.init.outputs.VERSION_FULL }}

- name: Upload artifacts
uses: actions/[email protected]
Expand Down
3 changes: 3 additions & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -291,3 +291,6 @@ test-reports/
# Local .env
!.env.example
.env.*

# CodSpeed
.codspeed/
1 change: 1 addition & 0 deletions Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -75,6 +75,7 @@ test-static-server:
test-unit-run:
@echo "➡️ Unit tests (Pytest)..."
uv run pytest \
--codspeed \
--junit-xml=test-reports/$(version_full).xml \
--maxprocesses=4 \
-n=logical \
Expand Down
1 change: 1 addition & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,7 @@ Web scraper made for AI and simplicity in mind. It runs as a CLI that can be par
[![GitHub project license](https://img.shields.io/github/license/clemlesne/scrape-it-now)](https://github.com/clemlesne/scrape-it-now/blob/main/LICENSE)
[![PyPI package version](https://img.shields.io/pypi/v/scrape-it-now)](https://pypi.org/project/scrape-it-now)
[![PyPI supported Python versions](https://img.shields.io/pypi/pyversions/scrape-it-now)](https://pypi.org/project/scrape-it-now)
[![CodSpeed report](https://img.shields.io/endpoint?url=https://codspeed.io/badge.json)](https://codspeed.io/clemlesne/scrape-it-now)

## Features

Expand Down
1 change: 1 addition & 0 deletions pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -72,6 +72,7 @@ dev = [
"pyinstaller~=6.11", # Create standalone executable
"pyright~=1.1", # Static type checker
"pytest-asyncio~=0.23", # Pytest plugin for async tests
"pytest-codspeed~=3.2.0", # Pytest plugin for measuring code speed
"pytest-repeat~=0.9", # Pytest plugin for repeating tests
"pytest-xdist[psutil]~=3.6", # Pytest plugin for parallel testing
"pytest~=8.3", # Testing framework
Expand Down
9 changes: 9 additions & 0 deletions src/scrape_it_now/helpers/__init__.py
Original file line number Diff line number Diff line change
@@ -1,8 +1,17 @@
from os import environ as env

from dotenv import find_dotenv, load_dotenv

# Load environment variables from the nearest .env file before anything else
# imports configuration. The search starts at the current working directory
# (where the command was invoked), not this module's location.
load_dotenv(find_dotenv(usecwd=True))

# Most CI systems (GitHub Actions, GitLab CI, ...) export CI=true.
# See: https://stackoverflow.com/a/75223617
IS_CI = env.get("CI", "").lower() == "true"
if IS_CI:
    print("CI environment detected, be aware configuration may differ")
2 changes: 2 additions & 0 deletions src/scrape_it_now/helpers/identity.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,13 +2,15 @@

from azure.identity.aio import DefaultAzureCredential, get_bearer_token_provider

from scrape_it_now.helpers import IS_CI
from scrape_it_now.helpers.cache import lru_acache
from scrape_it_now.helpers.http import azure_transport


@lru_acache()
async def credential() -> DefaultAzureCredential:
    """Return a cached default Azure credential.

    The credential is created once (memoized by ``lru_acache``) and reuses a
    shared HTTP transport for performance. The subprocess timeout is relaxed
    in CI, where spawning credential helper processes can be slow.
    """
    # 2 mins in CI, 10 secs in production
    process_timeout = 2 * 60 if IS_CI else 10
    transport = await azure_transport()
    return DefaultAzureCredential(
        process_timeout=process_timeout,
        transport=transport,
    )
Expand Down
9 changes: 3 additions & 6 deletions src/scrape_it_now/persistence/local_disk.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,7 @@
from aiofiles.os import makedirs, path, remove, rmdir
from pydantic import BaseModel, Field

from scrape_it_now.helpers import IS_CI
from scrape_it_now.helpers.logging import logger
from scrape_it_now.helpers.resources import file_lock, local_disk_cache_path
from scrape_it_now.models.message import Message
Expand Down Expand Up @@ -257,7 +258,6 @@ async def __aexit__(self, *exc: Any) -> None:
class QueueConfig(BaseModel):
name: str
table: str = "queue"
timeout: int = 30

async def db_path(self) -> str:
return await path.abspath(
Expand Down Expand Up @@ -401,10 +401,7 @@ async def create_queue(
await makedirs(dirname(file_path), exist_ok=True)

# Initialize the database
async with aiosqlite.connect(
database=file_path,
timeout=self._config.timeout, # Wait for 30 secs before giving up
) as connection:
async with self._use_connection() as connection:
# Enable WAL mode to allow multiple readers and one writer
await connection.execute(
"""
Expand Down Expand Up @@ -442,7 +439,7 @@ async def _use_connection(self) -> AsyncGenerator[aiosqlite.Connection, None]:
# Connect and return the connection
async with aiosqlite.connect(
database=await self._config.db_path(),
timeout=self._config.timeout, # Wait for 30 secs before giving up
timeout=2 * 60 if IS_CI else 30, # 2 mins in CI, 30 secs in production
) as connection:
yield connection

Expand Down
4 changes: 1 addition & 3 deletions src/scrape_it_now/scrape.py
Original file line number Diff line number Diff line change
Expand Up @@ -74,7 +74,7 @@

# Browser
BROWSER_NAME = "chromium"
BROWSER_TIMEOUT_MS = 180000 # 3 mins
BROWSER_TIMEOUT_MS = 3 * 60 * 1000 # 3 mins


async def _queue( # noqa: PLR0913
Expand Down Expand Up @@ -1006,7 +1006,6 @@ def _network_used_callback(size_bytes: int) -> None:
# Convert HTML to Markdown
full_markdown = convert_text(
format="html", # Input is HTML
sandbox=True, # Enable sandbox mode, we don't know what we are scraping
source=full_html_minus_resources,
to="markdown-fenced_divs-native_divs-raw_html-bracketed_spans-native_spans-link_attributes-header_attributes-inline_code_attributes",
verify_format=False, # We know the format, don't verify it
Expand Down Expand Up @@ -1367,7 +1366,6 @@ async def _get_broswer(
"""
# Launch the browser
browser = await browser_type.launch(
chromium_sandbox=True, # Enable the sandbox for security, we don't know what we are scraping
timeout=BROWSER_TIMEOUT_MS,
# See: https://github.com/microsoft/playwright/blob/99a36310570617222290c09b96a2026beb8b00f9/packages/playwright-core/src/server/chromium/chromium.ts
args=[
Expand Down
1 change: 1 addition & 0 deletions tests/blob.py
Original file line number Diff line number Diff line change
Expand Up @@ -25,6 +25,7 @@
],
ids=lambda x: x.value,
)
@pytest.mark.benchmark
@pytest.mark.repeat(10) # Catch multi-threading and concurrency issues
async def test_acid(provider: BlobProvider) -> None:
# Init values
Expand Down
1 change: 1 addition & 0 deletions tests/queue.py
Original file line number Diff line number Diff line change
Expand Up @@ -21,6 +21,7 @@
],
ids=lambda x: x.value,
)
@pytest.mark.benchmark
@pytest.mark.repeat(10) # Catch multi-threading and concurrency issues
async def test_acid(provider: QueueProvider) -> None:
# Init values
Expand Down
1 change: 1 addition & 0 deletions tests/scrape.py
Original file line number Diff line number Diff line change
Expand Up @@ -41,6 +41,7 @@
],
ids=lambda x: x,
)
@pytest.mark.benchmark
async def test_scrape_page_website(
website: str,
browser: Browser,
Expand Down
65 changes: 65 additions & 0 deletions uv.lock

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

0 comments on commit c4b51c3

Please sign in to comment.