Skip to content

Commit

Permalink
fix: GPT-3.5 tokenizer
Browse files (browse the repository at this point in the history)
  • Loading branch information
clemlesne committed Aug 17, 2024
1 parent bd50c32 commit 09fab9f
Show file tree
Hide file tree
Showing 3 changed files with 100,264 additions and 0 deletions.
2 changes: 2 additions & 0 deletions Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -87,6 +87,8 @@ build:
pyinstaller \
--add-data resources:resources \
--clean \
--hidden-import=tiktoken_ext \
--hidden-import=tiktoken_ext.openai_public \
--icon resources/logo.ico \
--name scrape-it-now \
--onefile \
Expand Down
6 changes: 6 additions & 0 deletions app/index.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
import asyncio, math
from os import environ as env

import tiktoken
from azure.core.exceptions import (
Expand Down Expand Up @@ -30,6 +31,7 @@
hash_url,
index_index_name,
index_queue_name,
resources_dir,
scrape_container_name,
)
from app.helpers.threading import run_workers
Expand Down Expand Up @@ -355,6 +357,10 @@ async def run(
) -> None:
logger.info("Starting indexing job %s", job)

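# Patch Tiktoken: point its cache directory at the "tiktoken" folder inside
# the application's resources directory.
# See: https://stackoverflow.com/a/76107077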
env["TIKTOKEN_CACHE_DIR"] = resources_dir("tiktoken")

run_workers(
azure_openai_api_key=azure_openai_api_key,
azure_openai_embedding_deployment=azure_openai_embedding_deployment,
Expand Down
Loading

0 comments on commit 09fab9f

Please sign in to comment.