Skip to content

Commit

Permalink
test: Add CodSpeed to follow lib performance
Browse files Browse the repository at this point in the history
  • Loading branch information
clemlesne committed Feb 8, 2025
1 parent 2d03ec8 commit c4b51c3
Show file tree
Hide file tree
Showing 13 changed files with 95 additions and 10 deletions.
7 changes: 6 additions & 1 deletion .github/workflows/pipeline.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,8 @@ on:
- feat/*
- hotfix/*
- main
# Allows CodSpeed to trigger backtest performance analysis in order to generate initial data
workflow_dispatch:

jobs:
init:
Expand Down Expand Up @@ -104,7 +106,10 @@ jobs:
run: echo "${{ secrets.DOTENV_UNIT_TESTS }}" > .env

- name: Run tests
run: make test-${{ matrix.step }} version_full=${{ needs.init.outputs.VERSION_FULL }}
uses: CodSpeedHQ/[email protected]
with:
token: ${{ secrets.CODSPEED_TOKEN }}
run: make test-${{ matrix.step }} version_full=${{ needs.init.outputs.VERSION_FULL }}

- name: Upload artifacts
uses: actions/[email protected]
Expand Down
3 changes: 3 additions & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -291,3 +291,6 @@ test-reports/
# Local .env
!.env.example
.env.*

# CodSpeed
.codspeed/
1 change: 1 addition & 0 deletions Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -75,6 +75,7 @@ test-static-server:
test-unit-run:
@echo "➡️ Unit tests (Pytest)..."
uv run pytest \
--codspeed \
--junit-xml=test-reports/$(version_full).xml \
--maxprocesses=4 \
-n=logical \
Expand Down
1 change: 1 addition & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,7 @@ Web scraper made for AI and simplicity in mind. It runs as a CLI that can be par
[![GitHub project license](https://img.shields.io/github/license/clemlesne/scrape-it-now)](https://github.com/clemlesne/scrape-it-now/blob/main/LICENSE)
[![PyPI package version](https://img.shields.io/pypi/v/scrape-it-now)](https://pypi.org/project/scrape-it-now)
[![PyPI supported Python versions](https://img.shields.io/pypi/pyversions/scrape-it-now)](https://pypi.org/project/scrape-it-now)
[![CodSpeed report](https://img.shields.io/endpoint?url=https://codspeed.io/badge.json)](https://codspeed.io/clemlesne/scrape-it-now)

## Features

Expand Down
1 change: 1 addition & 0 deletions pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -72,6 +72,7 @@ dev = [
"pyinstaller~=6.11", # Create standalone executable
"pyright~=1.1", # Static type checker
"pytest-asyncio~=0.23", # Pytest plugin for async tests
"pytest-codspeed~=3.2.0", # Pytest plugin for measuring code speed
"pytest-repeat~=0.9", # Pytest plugin for repeating tests
"pytest-xdist[psutil]~=3.6", # Pytest plugin for parallel testing
"pytest~=8.3", # Testing framework
Expand Down
9 changes: 9 additions & 0 deletions src/scrape_it_now/helpers/__init__.py
Original file line number Diff line number Diff line change
@@ -1,8 +1,17 @@
from os import environ as env

from dotenv import find_dotenv, load_dotenv

# Load environment variables from the nearest .env file before anything else
# imports configuration. The search starts at the current working directory
# (where the command was invoked), not this module's location.
load_dotenv(find_dotenv(usecwd=True))

# Most CI systems (GitHub Actions, GitLab CI, ...) export CI=true.
# See: https://stackoverflow.com/a/75223617
IS_CI = env.get("CI", "").lower() == "true"
if IS_CI:
    print("CI environment detected, be aware configuration may differ")
2 changes: 2 additions & 0 deletions src/scrape_it_now/helpers/identity.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,13 +2,15 @@

from azure.identity.aio import DefaultAzureCredential, get_bearer_token_provider

from scrape_it_now.helpers import IS_CI
from scrape_it_now.helpers.cache import lru_acache
from scrape_it_now.helpers.http import azure_transport


@lru_acache()
async def credential() -> DefaultAzureCredential:
    """Return a cached default Azure credential.

    The credential is created once (memoized by ``lru_acache``) and reuses a
    shared HTTP transport for performance. The subprocess timeout is relaxed
    in CI, where spawning credential helper processes can be slow.
    """
    # 2 mins in CI, 10 secs in production
    process_timeout = 2 * 60 if IS_CI else 10
    transport = await azure_transport()
    return DefaultAzureCredential(
        process_timeout=process_timeout,
        transport=transport,
    )
Expand Down
9 changes: 3 additions & 6 deletions src/scrape_it_now/persistence/local_disk.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,7 @@
from aiofiles.os import makedirs, path, remove, rmdir
from pydantic import BaseModel, Field

from scrape_it_now.helpers import IS_CI
from scrape_it_now.helpers.logging import logger
from scrape_it_now.helpers.resources import file_lock, local_disk_cache_path
from scrape_it_now.models.message import Message
Expand Down Expand Up @@ -257,7 +258,6 @@ async def __aexit__(self, *exc: Any) -> None:
class QueueConfig(BaseModel):
name: str
table: str = "queue"
timeout: int = 30

async def db_path(self) -> str:
return await path.abspath(
Expand Down Expand Up @@ -401,10 +401,7 @@ async def create_queue(
await makedirs(dirname(file_path), exist_ok=True)

# Initialize the database
async with aiosqlite.connect(
database=file_path,
timeout=self._config.timeout, # Wait for 30 secs before giving up
) as connection:
async with self._use_connection() as connection:
# Enable WAL mode to allow multiple readers and one writer
await connection.execute(
"""
Expand Down Expand Up @@ -442,7 +439,7 @@ async def _use_connection(self) -> AsyncGenerator[aiosqlite.Connection, None]:
# Connect and return the connection
async with aiosqlite.connect(
database=await self._config.db_path(),
timeout=self._config.timeout, # Wait for 30 secs before giving up
timeout=2 * 60 if IS_CI else 30, # 2 mins in CI, 30 secs in production
) as connection:
yield connection

Expand Down
4 changes: 1 addition & 3 deletions src/scrape_it_now/scrape.py
Original file line number Diff line number Diff line change
Expand Up @@ -74,7 +74,7 @@

# Browser
BROWSER_NAME = "chromium"
BROWSER_TIMEOUT_MS = 180000 # 3 mins
BROWSER_TIMEOUT_MS = 3 * 60 * 1000 # 3 mins


async def _queue( # noqa: PLR0913
Expand Down Expand Up @@ -1006,7 +1006,6 @@ def _network_used_callback(size_bytes: int) -> None:
# Convert HTML to Markdown
full_markdown = convert_text(
format="html", # Input is HTML
sandbox=True, # Enable sandbox mode, we don't know what we are scraping
source=full_html_minus_resources,
to="markdown-fenced_divs-native_divs-raw_html-bracketed_spans-native_spans-link_attributes-header_attributes-inline_code_attributes",
verify_format=False, # We know the format, don't verify it
Expand Down Expand Up @@ -1367,7 +1366,6 @@ async def _get_broswer(
"""
# Launch the browser
browser = await browser_type.launch(
chromium_sandbox=True, # Enable the sandbox for security, we don't know what we are scraping
timeout=BROWSER_TIMEOUT_MS,
# See: https://github.com/microsoft/playwright/blob/99a36310570617222290c09b96a2026beb8b00f9/packages/playwright-core/src/server/chromium/chromium.ts
args=[
Expand Down
1 change: 1 addition & 0 deletions tests/blob.py
Original file line number Diff line number Diff line change
Expand Up @@ -25,6 +25,7 @@
],
ids=lambda x: x.value,
)
@pytest.mark.benchmark
@pytest.mark.repeat(10) # Catch multi-threading and concurrency issues
async def test_acid(provider: BlobProvider) -> None:
# Init values
Expand Down
1 change: 1 addition & 0 deletions tests/queue.py
Original file line number Diff line number Diff line change
Expand Up @@ -21,6 +21,7 @@
],
ids=lambda x: x.value,
)
@pytest.mark.benchmark
@pytest.mark.repeat(10) # Catch multi-threading and concurrency issues
async def test_acid(provider: QueueProvider) -> None:
# Init values
Expand Down
1 change: 1 addition & 0 deletions tests/scrape.py
Original file line number Diff line number Diff line change
Expand Up @@ -41,6 +41,7 @@
],
ids=lambda x: x,
)
@pytest.mark.benchmark
async def test_scrape_page_website(
website: str,
browser: Browser,
Expand Down
65 changes: 65 additions & 0 deletions uv.lock

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

0 comments on commit c4b51c3

Please sign in to comment.