Skip to content

Commit

Permalink
Make the analyzer optional
Browse files Browse the repository at this point in the history
- Rework the production and dev dockerfiles to make the analyzer
optional
- Tests/coverage/linting skip the analyzer module
- Analyzer dependencies moved to their own requirement file
  • Loading branch information
davidfischer committed Feb 10, 2023
1 parent e7641fe commit 22f9d81
Show file tree
Hide file tree
Showing 12 changed files with 95 additions and 69 deletions.
3 changes: 2 additions & 1 deletion .coveragerc
Original file line number Diff line number Diff line change
Expand Up @@ -28,8 +28,9 @@ omit =
adserver/management/commands/pypi_import.py
adserver/management/commands/adtype-templates/*.html
adserver/router.py
# Analyzer is conditionally skipped in testing
adserver/analyzer/*

machine_learning_experiments/

[report]
# Regexes for lines to exclude from consideration
Expand Down
27 changes: 16 additions & 11 deletions adserver/analyzer/tests.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,17 +10,22 @@
from django.utils import timezone
from django_dynamic_fixture import get

from . import tasks
from ..models import Offer
from ..models import Publisher
from ..tests.common import BaseAdModelsTestCase
from .backends import EthicalAdsTopicsBackend
from .backends import NaiveKeywordAnalyzerBackend
from .backends import TextacyAnalyzerBackend
from .models import AnalyzedUrl
from .utils import get_url_analyzer_backend
from .utils import normalize_url
from .validators import KeywordsValidator
try:
from . import tasks
from ..models import Offer
from ..models import Publisher
from ..tests.common import BaseAdModelsTestCase
from .backends import EthicalAdsTopicsBackend
from .backends import NaiveKeywordAnalyzerBackend
from .backends import TextacyAnalyzerBackend
from .models import AnalyzedUrl
from .utils import get_url_analyzer_backend
from .utils import normalize_url
from .validators import KeywordsValidator
except ImportError:
pytest.skip(
"Skip testing the analyzer due to missing dependencies", allow_module_level=True
)


class TestValidators(TestCase):
Expand Down
4 changes: 4 additions & 0 deletions adserver/decisionengine/backends.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@
import logging
import random

from django.conf import settings
from django.db import models
from user_agents import parse

Expand Down Expand Up @@ -112,6 +113,9 @@ def get_analyzer_keywords(self):
if not self.url:
return None

if "adserver.analyzer" not in settings.INSTALLED_APPS:
return None

normalized_url = normalize_url(self.url)
analyzed_url = AnalyzedUrl.objects.filter(
url=normalized_url, publisher=self.publisher
Expand Down
4 changes: 3 additions & 1 deletion config/settings/base.py
Original file line number Diff line number Diff line change
Expand Up @@ -62,7 +62,6 @@
"rest_framework.authtoken",
"adserver",
"adserver.auth",
"adserver.analyzer",
"simple_history",
"django_slack",
"djstripe",
Expand Down Expand Up @@ -477,10 +476,13 @@

# The backend to be used by the ad server
# for topic and keyword analysis
# Set to `None` to disable the analyzer entirely
ADSERVER_ANALYZER_BACKEND = env(
"ADSERVER_ANALYZER_BACKEND",
default="adserver.analyzer.backends.TextacyAnalyzerBackend",
)
if ADSERVER_ANALYZER_BACKEND:
INSTALLED_APPS.append("adserver.analyzer")

# Whether Do Not Track is enabled for the ad server
ADSERVER_DO_NOT_TRACK = False
Expand Down
14 changes: 8 additions & 6 deletions config/settings/production.py
Original file line number Diff line number Diff line change
Expand Up @@ -162,16 +162,18 @@
"task": "adserver.tasks.run_publisher_importers",
"schedule": crontab(hour="1", minute="0"),
},
# Run analyzer tasks
"every-day-visited-urls": {
}

# Tasks which should only be run if the analyzer is installed
if "adserver.analyzer" in INSTALLED_APPS:
CELERY_BEAT_SCHEDULE["every-day-visited-urls"] = {
"task": "adserver.analyzer.tasks.daily_visited_urls_aggregation",
"schedule": crontab(hour="3", minute="0"),
},
"every-day-analyze-urls": {
}
CELERY_BEAT_SCHEDULE["every-day-analyze-urls"] = {
"task": "adserver.analyzer.tasks.daily_analyze_urls",
"schedule": crontab(hour="4", minute="0"),
},
}
}


# Sentry settings for error monitoring
Expand Down
2 changes: 2 additions & 0 deletions docker-compose/local/django/Dockerfile
Original file line number Diff line number Diff line change
Expand Up @@ -32,6 +32,8 @@ COPY ./requirements /requirements
RUN pip install --upgrade pip
RUN pip install -r /requirements/development.txt
RUN pip install -r /requirements/production.txt
# Uncomment this if you need the page/topic analyzer
# RUN pip install -r /requirements/analyzer.txt

# COPY ./docker-compose/local/django/entrypoint /entrypoint
# RUN chmod +x /entrypoint
Expand Down
70 changes: 41 additions & 29 deletions docker-compose/production/django/Dockerfile
Original file line number Diff line number Diff line change
Expand Up @@ -3,39 +3,52 @@
# This Dockerfile is not currently used or maintained
# but may be useful in setting up your own adserver instance.

FROM python:3.8-alpine
FROM ubuntu:20.04
MAINTAINER Read the Docs, Inc. <[email protected]>

ENV DEBIAN_FRONTEND noninteractive
ENV LANG en_US.UTF-8
ENV LANGUAGE en_US:en
ENV LC_ALL en_US.UTF-8
ENV PYTHONUNBUFFERED 1

RUN apk update \
# psycopg2 dependencies
&& apk add --virtual build-deps gcc python3-dev musl-dev \
&& apk add postgresql-dev \
# Pillow dependencies
&& apk add jpeg-dev zlib-dev freetype-dev lcms2-dev openjpeg-dev tiff-dev tk-dev tcl-dev \
# CFFI dependencies
&& apk add libffi-dev py-cffi \
# Translations dependencies
&& apk add gettext \
# https://docs.djangoproject.com/en/dev/ref/django-admin/#dbshell
&& apk add postgresql-client \
# Add Node dependencies for building static files
&& apk add nodejs npm \
# Ensure git is on the system - some dependencies may rely on it
&& apk add git \
# Needed to build Python Cryptography on Alpine Linux
# https://cryptography.io/en/latest/installation.html#alpine
&& apk add openssl-dev cargo

RUN addgroup -S django \
&& adduser -S -G django django
RUN apt-get -y update
RUN apt-get -y install \
curl \
g++ \
git-core \
libevent-dev \
libpq-dev \
libxml2-dev \
libxslt1-dev \
locales \
build-essential \
python3-pip \
python3-dev \
libmysqlclient-dev \
libfreetype6 \
libjpeg-dev \
sqlite \
netcat \
telnet \
lsb-release

# Install Node v14
RUN curl -sL https://deb.nodesource.com/setup_14.x | bash -
RUN apt-get -y install nodejs

RUN addgroup django && useradd -g django django

# Requirements are installed here to ensure they will be cached.
COPY ./requirements /requirements
# Uncomment this if you need the page/topic analyzer
# RUN pip install --no-cache-dir -r /requirements/analyzer.txt
RUN pip install --no-cache-dir -r /requirements/production.txt \
&& rm -rf /requirements

# Install node dependencies
RUN npm install

COPY ./docker-compose/production/django/start /start
RUN chmod +x /start
RUN chown django /start
Expand All @@ -60,19 +73,18 @@ COPY ./package.json /app
COPY ./package-lock.json /app
COPY ./webpack.config.js /app

# Uncomment if you require Azure's container SSH setup
# Allow Azure to SSH into the running container
# Although the root password is known, port 2222 is inaccessible from the internet
# https://docs.microsoft.com/en-us/azure/app-service/containers/configure-custom-container#enable-ssh
RUN apk add openssh \
&& echo "root:Docker!" | chpasswd
COPY ./docker-compose/production/django/sshd_config /etc/ssh/
# RUN apk add openssh && echo "root:Docker!" | chpasswd
# COPY ./docker-compose/production/django/sshd_config /etc/ssh/

RUN chown -R django:django /app
USER django

WORKDIR /app

# Install node dependencies
RUN npm install

EXPOSE 2222 5000

CMD ["/start"]
2 changes: 1 addition & 1 deletion prospector.yml
Original file line number Diff line number Diff line change
Expand Up @@ -13,7 +13,7 @@ ignore-paths:
- manage.py
- tests.py
- tests/
- machine_learning_experiments/
- adserver/analyzer

ignore-patterns:
- /migrations/
Expand Down
18 changes: 18 additions & 0 deletions requirements/analyzer.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,18 @@
# Used by the keyword/topic analyzer
beautifulsoup4==4.11.1
textacy==0.12.0
spacy==3.4.1
# Spacy transformers is listed in the production requirements
# It installs PyTorch which is hundreds of MB
langdetect==1.0.9
# Our use of textacy has an incompatibility with networkx v3
networkx<3.0
# Has to be downloaded directly like this (~30MB)
https://github.com/explosion/spacy-models/releases/download/en_core_web_md-3.4.0/en_core_web_md-3.4.0-py3-none-any.whl


#######################################################################
# Machine learning production requirements
#######################################################################
# This installs PyTorch which is ~250MB
spacy-transformers==1.1.8
12 changes: 0 additions & 12 deletions requirements/base.txt
Original file line number Diff line number Diff line change
Expand Up @@ -58,15 +58,3 @@ dj-stripe==2.5.1

# JWT used by Metabase embedding
PyJWT==2.4.0

# Used by the keyword/topic analyzer
beautifulsoup4==4.11.1
textacy==0.12.0
spacy==3.4.1
# Spacy transformers is listed in the production requirements
# It installs PyTorch which is hundreds of MB
langdetect==1.0.9
# Our use of textacy has an incompatibility with networkx v3
networkx<3.0
# Has to be downloaded directly like this (~30MB)
https://github.com/explosion/spacy-models/releases/download/en_core_web_md-3.4.0/en_core_web_md-3.4.0-py3-none-any.whl
6 changes: 0 additions & 6 deletions requirements/production.txt
Original file line number Diff line number Diff line change
Expand Up @@ -18,9 +18,3 @@ django-storages[azure]==1.12.3
# Logging and monitoring
newrelic==7.4.0.172
sentry-sdk==1.5.5

#######################################################################
# Machine learning production requirements
#######################################################################
# This installs PyTorch which is ~250MB
spacy-transformers==1.1.8
2 changes: 0 additions & 2 deletions requirements/testing.txt
Original file line number Diff line number Diff line change
@@ -1,4 +1,2 @@
# Tox will setup its own environment and so it doesn't depend on anything


tox<4.0

0 comments on commit 22f9d81

Please sign in to comment.