Skip to content

Commit

Permalink
test: Add CodSpeed to follow lib performance
Browse files Browse the repository at this point in the history
  • Loading branch information
clemlesne committed Feb 9, 2025
1 parent 4733803 commit ac9f5db
Show file tree
Hide file tree
Showing 15 changed files with 235 additions and 58 deletions.
15 changes: 13 additions & 2 deletions .github/workflows/pipeline.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,8 @@ on:
- feat/*
- hotfix/*
- main
# Allows CodSpeed to trigger backtest performance analysis in order to generate initial data
workflow_dispatch:

jobs:
init:
Expand Down Expand Up @@ -52,7 +54,8 @@ jobs:
# Run all test suites
step:
- static
- unit
- unit-simple
- unit-codspeed
# Run on all supported Python versions
python-version:
- "3.11"
Expand Down Expand Up @@ -103,9 +106,17 @@ jobs:
- name: Configure environment variables
run: echo "${{ secrets.DOTENV_UNIT_TESTS }}" > .env

- name: Run tests
- name: Run tests (simple)
if: ${{ !contains(matrix.step, 'codspeed') }}
run: make test-${{ matrix.step }} version_full=${{ needs.init.outputs.VERSION_FULL }}

- name: Run tests (CodSpeed)
if: ${{ contains(matrix.step, 'codspeed') }}
uses: CodSpeedHQ/[email protected]
with:
token: ${{ secrets.CODSPEED_TOKEN }}
run: make test-${{ matrix.step }} version_full=${{ needs.init.outputs.VERSION_FULL }}

- name: Upload artifacts
uses: actions/[email protected]
if: always()
Expand Down
3 changes: 3 additions & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -291,3 +291,6 @@ test-reports/
# Local .env
!.env.example
.env.*

# CodSpeed
.codspeed/
29 changes: 21 additions & 8 deletions Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -53,7 +53,7 @@ upgrade:

test:
$(MAKE) test-static
$(MAKE) test-unit
$(MAKE) test-unit-simple

test-static:
@echo "➡️ Test dependencies issues (deptry)..."
Expand All @@ -65,18 +65,31 @@ test-static:
@echo "➡️ Test types (Pyright)..."
uv run pyright

test-unit:
bash cicd/test-unit-ci.sh
test-unit-simple:
bash cicd/test-unit-ci.sh simple

test-unit-codspeed:
bash cicd/test-unit-ci.sh codspeed

test-static-server:
@echo "➡️ Starting local static server..."
python3 -m http.server -d ./tests/websites 8000

test-unit-run:
@echo "➡️ Unit tests (Pytest)..."
uv run pytest \
--junit-xml=test-reports/$(version_full).xml \
--log-file=test-reports/$(version_full).log \
test-unit-simple-run:
@echo "➡️ Unit tests with no extra (Pytest)..."
CI=true uv run pytest \
--junit-xml=test-reports/$(version_full)-simple.xml \
--log-file=test-reports/$(version_full)-simple.log \
--maxprocesses=4 \
-n=logical \
tests/*.py

test-unit-codspeed-run:
@echo "➡️ Unit tests with CodSpeed (Pytest)..."
CI=true uv run pytest \
--codspeed \
--junit-xml=test-reports/$(version_full)-codspeed.xml \
--log-file=test-reports/$(version_full)-codspeed.log \
--maxprocesses=4 \
-n=logical \
tests/*.py
Expand Down
1 change: 1 addition & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,7 @@ Web scraper made for AI and simplicity in mind. It runs as a CLI that can be par
[![GitHub project license](https://img.shields.io/github/license/clemlesne/scrape-it-now)](https://github.com/clemlesne/scrape-it-now/blob/main/LICENSE)
[![PyPI package version](https://img.shields.io/pypi/v/scrape-it-now)](https://pypi.org/project/scrape-it-now)
[![PyPI supported Python versions](https://img.shields.io/pypi/pyversions/scrape-it-now)](https://pypi.org/project/scrape-it-now)
[![CodSpeed report](https://img.shields.io/endpoint?url=https://codspeed.io/badge.json)](https://codspeed.io/clemlesne/scrape-it-now)

## Features

Expand Down
9 changes: 8 additions & 1 deletion cicd/test-unit-ci.sh
Original file line number Diff line number Diff line change
@@ -1,13 +1,20 @@
#!/bin/bash

mode=$1

if [ -z "$mode" ]; then
echo "Error: mode is required." >&2
exit 1
fi

# Start the first command in the background
make test-static-server 1>/dev/null 2>&1 &

# Capture the PID of the background process
UNIT_RUN_PID=$!

# Run the second command
make test-unit-run
make test-unit-${mode}-run
exit_code=$?

# Once the second command exits, kill the first process
Expand Down
1 change: 1 addition & 0 deletions pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -72,6 +72,7 @@ dev = [
"pyinstaller~=6.11", # Create standalone executable
"pyright~=1.1", # Static type checker
"pytest-asyncio~=0.23", # Pytest plugin for async tests
"pytest-codspeed~=3.2.0", # Pytest plugin for measuring code speed
"pytest-repeat~=0.9", # Pytest plugin for repeating tests
"pytest-xdist[psutil]~=3.6", # Pytest plugin for parallel testing
"pytest~=8.3", # Testing framework
Expand Down
2 changes: 2 additions & 0 deletions src/scrape_it_now/helpers/identity.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,13 +2,15 @@

from azure.identity.aio import DefaultAzureCredential, get_bearer_token_provider

from scrape_it_now.helpers import IS_CI
from scrape_it_now.helpers.cache import lru_acache
from scrape_it_now.helpers.http import azure_transport


@lru_acache()
async def credential() -> DefaultAzureCredential:
return DefaultAzureCredential(
process_timeout=120 if IS_CI else 10, # 2 mins in CI, 10 secs in production
# Performance
transport=await azure_transport(),
)
Expand Down
68 changes: 50 additions & 18 deletions src/scrape_it_now/persistence/azure_blob_storage.py
Original file line number Diff line number Diff line change
Expand Up @@ -182,14 +182,12 @@ async def delete_container(
await self._client.delete_container()
# Wait for it to be deleted, API is eventually consistent
while True:
try:
properties = await self._client.get_container_properties()
if properties.deleted:
break
with suppress(ResourceNotFoundError):
await self._client.get_container_properties()
await asyncio.sleep(2)
continue
# Deleted
except ResourceNotFoundError:
break
break
logger.info('Deleted Blob Storage "%s"', self._config.name)

async def __aenter__(self) -> "AzureBlobStorage":
Expand All @@ -207,21 +205,55 @@ async def __aenter__(self) -> "AzureBlobStorage":
container=self._config.name,
)

# Create if it does not exist
with suppress(ResourceExistsError):
# Create
await self._client.create_container()
# Wait for it to be created, API is eventually consistent
while True:
with suppress(ResourceNotFoundError):
properties = await self._client.get_container_properties()
if not properties.deleted:
break
await asyncio.sleep(2)
logger.debug('Created Blob Storage "%s"', self._config.name)
await self._wait_for_creation()
await self._wait_for_ready()

# Return instance
return self

async def _wait_for_ready(self) -> None:
"""
Wait for the container to be ready.
API is not consistent, so we need to check if the resource is ready to be used.
"""
while True:
# Try using it
try:
# Upload and clean a test blob
await self.upload_blob(
blob="ping",
data=b"ping",
length=4,
overwrite=True,
)
await self._client.delete_blob("ping")
# If no exception, the container is ready
logger.debug('Blob Storage "%s" is ready', self._config.name)
break
# If exception, the container is not ready yet
except Exception:
logger.debug("Blob Storage not ready yet, retrying", exc_info=True)
await asyncio.sleep(2)

async def _wait_for_creation(self) -> None:
"""
Wait for the container to be created.
Loop indefinitely until the container is created; whether it responds to upload/download operations is checked separately by `_wait_for_ready`. API is not consistent, so we need to check if the resource is created.
"""
# Start creation
with suppress(ResourceExistsError):
await self._client.create_container()

# Wait for it to be created, API is eventually consistent
while True:
with suppress(ResourceNotFoundError):
await self._client.get_container_properties()
logger.debug('Created Blob Storage "%s"', self._config.name)
# Created
break
await asyncio.sleep(2)

async def __aexit__(self, *exc: Any) -> None:
await self._service.close()
49 changes: 30 additions & 19 deletions src/scrape_it_now/persistence/azure_queue_storage.py
Original file line number Diff line number Diff line change
Expand Up @@ -120,35 +120,46 @@ async def create_queue(
await self._wait_for_ready()

async def _wait_for_ready(self) -> None:
"""
Wait for the queue to be ready.
Loop indefinitely until the queue responds to send/pull operations. API is not consistent, so we need to check if the resource is ready to be used.
"""
while True:
# Try using it
try:
# Send a test message
# Send and clean a test message
await self.send_message("ping")
# Try to consume the message(s)
async for message in self.receive_messages(
max_messages=1, visibility_timeout=1
):
await self.delete_message(message)
# If no exception, the queue is created
# If no exception, the queue is ready
logger.debug('Queue Storage "%s" is ready', self._config.name)
return
except Exception: # If exception, the queue is not created yet
logger.debug("Queue not created yet, retrying")
break
# If exception, the queue is not ready yet
except Exception:
logger.debug("Queue not ready yet, retrying", exc_info=True)
await asyncio.sleep(2)

async def _wait_for_creation(self) -> None:
# Create if it does not exist
"""
Wait for the queue to be created.
Loop indefinitely until the queue is created. API is not consistent, so we need to check if the resource is created.
"""
# Start creation
with suppress(ResourceExistsError):
# Create
await self._client.create_queue()
# Wait for it to be created, API is eventually consistent
while True:
with suppress(ResourceNotFoundError):
await self._client.get_queue_properties()
logger.debug('Created Queue Storage "%s"', self._config.name)
# Created
return
await asyncio.sleep(2)

# Wait for it to be created, API is eventually consistent
while True:
with suppress(ResourceNotFoundError):
await self._client.get_queue_properties()
logger.debug('Created Queue Storage "%s"', self._config.name)
# Created
break
await asyncio.sleep(2)

@retry(
reraise=True,
Expand All @@ -166,12 +177,12 @@ async def delete_queue(
await self._client.delete_queue()
# Wait for it to be deleted, API is eventually consistent
while True:
try:
with suppress(ResourceNotFoundError):
await self._client.get_queue_properties()
await asyncio.sleep(2)
continue
# Deleted
except ResourceNotFoundError:
break
break
logger.info('Deleted Queue Storage "%s"', self._config.name)

def _escape(self, value: str) -> str:
Expand Down
9 changes: 3 additions & 6 deletions src/scrape_it_now/persistence/local_disk.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,7 @@
from aiofiles.os import makedirs, path, remove, rmdir
from pydantic import BaseModel, Field

from scrape_it_now.helpers import IS_CI
from scrape_it_now.helpers.logging import logger
from scrape_it_now.helpers.resources import file_lock, local_disk_cache_path
from scrape_it_now.models.message import Message
Expand Down Expand Up @@ -254,7 +255,6 @@ async def __aexit__(self, *exc: Any) -> None:
class QueueConfig(BaseModel):
name: str
table: str = "queue"
timeout: int = 30

async def db_path(self) -> str:
return await path.abspath(
Expand Down Expand Up @@ -398,10 +398,7 @@ async def create_queue(
await makedirs(dirname(file_path), exist_ok=True)

# Initialize the database
async with aiosqlite.connect(
database=file_path,
timeout=self._config.timeout, # Wait for 30 secs before giving up
) as connection:
async with self._use_connection() as connection:
# Enable WAL mode to allow multiple readers and one writer
await connection.execute(
"""
Expand Down Expand Up @@ -437,7 +434,7 @@ async def _use_connection(self) -> AsyncGenerator[aiosqlite.Connection, None]:
# Connect and return the connection
async with aiosqlite.connect(
database=await self._config.db_path(),
timeout=self._config.timeout, # Wait for 30 secs before giving up
timeout=2 * 60 if IS_CI else 30, # 2 mins in CI, 30 secs in production
) as connection:
yield connection

Expand Down
3 changes: 1 addition & 2 deletions src/scrape_it_now/scrape.py
Original file line number Diff line number Diff line change
Expand Up @@ -33,6 +33,7 @@
wait_random_exponential,
)

from scrape_it_now.helpers import IS_CI
from scrape_it_now.helpers.logging import logger
from scrape_it_now.helpers.persistence import blob_client, queue_client
from scrape_it_now.helpers.resources import (
Expand Down Expand Up @@ -1005,7 +1006,6 @@ def _network_used_callback(size_bytes: int) -> None:
# Convert HTML to Markdown
full_markdown = convert_text(
format="html", # Input is HTML
sandbox=True, # Enable sandbox mode, we don't know what we are scraping
source=full_html_minus_resources,
to="markdown-fenced_divs-native_divs-raw_html-bracketed_spans-native_spans-link_attributes-header_attributes-inline_code_attributes",
verify_format=False, # We know the format, don't verify it
Expand Down Expand Up @@ -1368,7 +1368,6 @@ async def _get_broswer(
# Launch the browser
browser = await browser_type.launch(
channel="chromium", # Explicitly use the new headless mode (see: https://playwright.dev/python/docs/browsers#chromium-new-headless-mode)
chromium_sandbox=True, # Enable the sandbox for security, we don't know what we are scraping
timeout=BROWSER_TIMEOUT_MS,
args=[
"--disable-gl-drawing-for-tests", # Disable UI rendering, lower CPU usage
Expand Down
Loading

0 comments on commit ac9f5db

Please sign in to comment.