Skip to content

Commit

Permalink
[Feat] Upgrade yt-whisper bot to use serverless pinecone client
Browse files Browse the repository at this point in the history
  • Loading branch information
Davidnet committed Jan 29, 2024
1 parent 381370b commit 1811bb8
Show file tree
Hide file tree
Showing 5 changed files with 72 additions and 145 deletions.
54 changes: 46 additions & 8 deletions yt-whisper/Dockerfile
Original file line number Diff line number Diff line change
@@ -1,15 +1,53 @@
FROM python:3.11-bullseye
# syntax=docker/dockerfile:1

WORKDIR /usr/src/app
# Comments are provided throughout this file to help you get started.
# If you need more help, visit the Dockerfile reference guide at
# https://docs.docker.com/go/dockerfile-reference/

COPY yt_whisper /usr/src/app/yt_whisper
COPY pyproject.toml /usr/src/app/pyproject.toml
COPY README.md /usr/src/app/README.md
ARG PYTHON_VERSION=3.11
FROM python:${PYTHON_VERSION}-slim as base

RUN pip install -e .
# Prevents Python from writing pyc files.
ENV PYTHONDONTWRITEBYTECODE=1

# Keeps Python from buffering stdout and stderr to avoid situations where
# the application crashes without emitting any logs due to buffering.
ENV PYTHONUNBUFFERED=1

WORKDIR /app

# Create a non-privileged user that the app will run under.
# See https://docs.docker.com/go/dockerfile-user-best-practices/
ARG UID=10001
RUN adduser \
--disabled-password \
--gecos "" \
--home "/nonexistent" \
--shell "/sbin/nologin" \
--no-create-home \
--uid "${UID}" \
appuser

# Download dependencies as a separate step to take advantage of Docker's caching.
# Leverage a cache mount to /root/.cache/pip to speed up subsequent builds.
# Leverage bind mounts to pyproject.toml, poetry.lock, the yt_whisper package,
# and README.md to avoid having to copy them into this layer.
RUN --mount=type=cache,target=/root/.cache/pip \
--mount=type=bind,source=pyproject.toml,target=pyproject.toml \
--mount=type=bind,source=poetry.lock,target=poetry.lock \
--mount=type=bind,source=yt_whisper,target=yt_whisper \
--mount=type=bind,source=README.md,target=README.md \
python -m pip install .

# Switch to the non-privileged user to run the application.
USER appuser

# Copy the source code into the container.
COPY . .

# Expose the port that the application listens on.
EXPOSE 8503

HEALTHCHECK CMD curl --fail http://localhost:8503/_stcore/health

ENTRYPOINT ["streamlit", "run", "yt_whisper/app.py", "--server.port=8503", "--server.address=0.0.0.0"]
# Run the application.
CMD streamlit run yt_whisper/app.py --server.port=8503 --server.address=0.0.0.0
131 changes: 8 additions & 123 deletions yt-whisper/poetry.lock

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

4 changes: 2 additions & 2 deletions yt-whisper/pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -6,12 +6,12 @@ authors = ["David Cardozo <[email protected]>"]
readme = "README.md"

[tool.poetry.dependencies]
python = "^3.11"
python = ">=3.11,<3.13"
streamlit = "^1.29.0"
python-dotenv = "^1.0.0"
pytube = "^15.0.0"
openai = "^1.7.0"
pinecone-client = "^2.2.4"
pinecone-client = "^3.0.2"


[tool.poetry.group.dev.dependencies]
Expand Down
4 changes: 3 additions & 1 deletion yt-whisper/scripts/run_with_docker.sh
Original file line number Diff line number Diff line change
@@ -1,3 +1,5 @@
#!/bin/bash
set -euf -o pipefail
docker run -p 8503:8503 --env-file .env demo
declare -r docker_image_name="yt-whisper:demo"
docker build -t ${docker_image_name} .
docker run -it --rm -p 8503:8503 --env-file .env ${docker_image_name}
24 changes: 13 additions & 11 deletions yt-whisper/yt_whisper/app.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,10 +3,10 @@
from pathlib import Path
from tempfile import mkdtemp

import pinecone
import streamlit as st
from dotenv import load_dotenv
from openai import OpenAI
from pinecone import Pinecone, ServerlessSpec
from pytube import YouTube
from pytube.exceptions import RegexMatchError
from streamlit.logger import get_logger
Expand All @@ -23,14 +23,16 @@
@st.cache_resource
def load_pinecone(index_name="docker-genai"):
# initialize pinecone
pinecone.init(
api_key=os.getenv("PINECONE_TOKEN"),
environment=os.getenv("PINECONE_ENVIRONMENT"),
)
if index_name not in pinecone.list_indexes():
# we create a new index
pinecone.create_index(name=index_name, metric="cosine", dimension=1536)
index = pinecone.Index(index_name)
pc = Pinecone(api_key=os.getenv("PINECONE_TOKEN"))

if index_name not in pc.list_indexes().names():
pc.create_index(
name=index_name,
dimension=1536,
metric="cosine",
spec=ServerlessSpec(cloud="aws", region="us-west-2"),
)
index = pc.Index(index_name)
return index


Expand All @@ -48,7 +50,7 @@ def process_video(video_url: str) -> dict[str, str]:
audio_stream = yt_handler.streams.filter(only_audio=True).first()
audio_file = audio_stream.download(tmp_dir)
file_stats = os.stat(audio_file)
logger.info(f"File size: {file_stats.st_size}")
logger.info(f"File size(bytes): {file_stats.st_size}")
logger.info(f"File name: {audio_file}")
if file_stats.st_size > 24 * 1024 * 1024: # 25 MB Limit check
# TODO(davidnet): Split and process the video in chunks
Expand Down Expand Up @@ -127,7 +129,7 @@ def disable(b):


def main():
logger.info("Rendering app")
logger.debug("Rendering app")
if "tempfolder" not in st.session_state:
st.session_state.tempfolder = Path(mkdtemp(prefix="yt_transcription_"))
if "videos" not in st.session_state:
Expand Down

0 comments on commit 1811bb8

Please sign in to comment.