Skip to content

Commit

Permalink
DISCO-3308 Adding metrics to icon processing (#824)
Browse files (browse the repository at this point in the history)
  • Loading branch information
gruberb authored Mar 11, 2025
1 parent 9afd74a commit 407d90c
Showing 1 changed file with 46 additions and 28 deletions.
74 changes: 46 additions & 28 deletions merino/utils/icon_processor.py
Original file line number | Diff line number | Diff line change
Expand Up @@ -8,6 +8,7 @@
from merino.utils.gcs.gcs_uploader import GcsUploader
from merino.utils.gcs.models import Image
from merino.utils.http_client import create_http_client
from merino.utils import metrics

logger = logging.getLogger(__name__)

Expand All @@ -25,46 +26,63 @@ def __init__(self, gcs_project: str, gcs_bucket: str, cdn_hostname: str) -> None
# Content hash cache: {content_hash: gcs_url}
self.content_hash_cache = {}

# Metrics
self.metrics_client = metrics.get_metrics_client()

async def process_icon_url(self, url: str) -> str:
"""Process an external icon URL and return a GCS-hosted URL."""
# Skip URLs that are already from our CDN
cdn_hostname = self.uploader.cdn_hostname
if cdn_hostname and url.startswith(f"https://{cdn_hostname}"):
return url
with self.metrics_client.timeit("icon_processor.processing_time"):
self.metrics_client.increment("icon_processor.requests")

try:
# Download favicon
favicon_image = await self._download_favicon(url)
if not favicon_image:
logger.info(f"Failed to download favicon from {url}")
# Skip URLs that are already from our CDN
cdn_hostname = self.uploader.cdn_hostname
if cdn_hostname and url.startswith(f"https://{cdn_hostname}"):
return url

# Check if the image is valid
if not self._is_valid_image(favicon_image):
logger.info(f"Invalid image from {url}")
return url
try:
# Download favicon
favicon_image = await self.metrics_client.timeit_task(
self._download_favicon(url), "icon_processor.download_time"
)

# Generate content hash
content_hash = hashlib.sha256(favicon_image.content).hexdigest()
if not favicon_image:
logger.info(f"Failed to download favicon from {url}")
self.metrics_client.increment("icon_processor.download_failures")
return url

# Check content hash cache - this avoids re-uploading identical content
if content_hash in self.content_hash_cache:
return self.content_hash_cache[content_hash]
# Check if the image is valid
if not self._is_valid_image(favicon_image):
logger.info(f"Invalid image from {url}")
self.metrics_client.increment("icon_processor.invalid_images")
return url

# Generate destination path based on content hash
destination = self._get_destination_path(favicon_image, content_hash)
# Generate content hash
content_hash = hashlib.sha256(favicon_image.content).hexdigest()

# GcsUploader already checks if the file exists before uploading
gcs_url = self.uploader.upload_image(favicon_image, destination, forced_upload=False)
# Check content hash cache - this avoids re-uploading identical content
if content_hash in self.content_hash_cache:
return self.content_hash_cache[content_hash]

# Cache the result
self.content_hash_cache[content_hash] = gcs_url
# Generate destination path based on content hash
destination = self._get_destination_path(favicon_image, content_hash)

return gcs_url
# GcsUploader already checks if the file exists before uploading
with self.metrics_client.timeit("icon_processor.upload_time"):
gcs_url = self.uploader.upload_image(
favicon_image, destination, forced_upload=False
)

except Exception as e:
logger.warning(f"Error processing icon {url}: {e}")
return url
# Cache the result
self.content_hash_cache[content_hash] = gcs_url

# Track successful processing
self.metrics_client.increment("icon_processor.processed")
return gcs_url

except Exception as e:
logger.warning(f"Error processing icon {url}: {e}")
self.metrics_client.increment("icon_processor.errors")
return url

async def _download_favicon(self, url: str) -> Optional[Image]:
"""Download the favicon from the given URL.
Expand Down

0 comments on commit 407d90c

Please sign in to comment.