Merge pull request FlorianSchepers#17 from FlorianSchepers/make_meeting_minutes_generation_more_object_oriented

Make meeting minutes generation more object oriented
FlorianSchepers authored Apr 30, 2024
2 parents 7907c00 + 8516170 commit 7367e2e
Showing 8 changed files with 216 additions and 223 deletions.
2 changes: 1 addition & 1 deletion meminto/audio_processing.py
@@ -2,7 +2,7 @@
from pathlib import Path
import torchaudio as torchaudio
from torch import Tensor
from decorators import log_time
from meminto.decorators import log_time
from pyannote.core import Annotation, Segment
from pyannote.core.utils.types import Label

50 changes: 50 additions & 0 deletions meminto/chunking.py
@@ -0,0 +1,50 @@
from meminto.llm.tokenizers import Tokenizer
from meminto.transcriber import TranscriptSection

RATIO_OF_TOKENS_RESERVED_FOR_RESPONSE = 0.3

def chunk_transcript(
    system_prompt: str, transcript: list[TranscriptSection], tokenizer: Tokenizer, max_tokens: int
) -> list[str]:
    number_of_tokens_per_chunk = _number_of_tokens_per_chunk(
        system_prompt=system_prompt, transcript=transcript, tokenizer=tokenizer, max_tokens=max_tokens
    )

    transcript_chunks = []
    current_chunk = ""
    for transcript_section in transcript:
        current_chunk = current_chunk + str(transcript_section)
        if (
            tokenizer.number_of_tokens(current_chunk + str(transcript_section))
            >= number_of_tokens_per_chunk
        ):
            transcript_chunks.append(current_chunk)
            current_chunk = ""
    if current_chunk:
        transcript_chunks.append(current_chunk)
    return transcript_chunks

def _number_of_tokens_per_chunk(
    system_prompt: str, transcript: list[TranscriptSection], tokenizer: Tokenizer, max_tokens: int
) -> int:
    token_count_system_prompt = tokenizer.number_of_tokens(system_prompt)
    token_count_available = int(max_tokens) - token_count_system_prompt
    token_count_reserved_for_response = int(
        token_count_available * RATIO_OF_TOKENS_RESERVED_FOR_RESPONSE
    )
    token_count_per_chunk = token_count_available - token_count_reserved_for_response

    token_count_transcript = tokenizer.number_of_tokens("".join(map(str, transcript)))
    number_of_chunks = token_count_transcript // token_count_per_chunk + 1
    number_of_tokens_per_chunk = token_count_transcript // number_of_chunks + 1

print(f"Spliting transcript in chunks:")
print(f"LLM max. token count: {max_tokens}")
print(f"Token count of system prompt: {token_count_system_prompt}")
print(f"Token count reserved for response: {token_count_reserved_for_response}")
print(f"Token count per transcript chunk: {token_count_per_chunk}")
print(f"Token count of transcript: {token_count_transcript}")
print(f"Number of chunks: {number_of_chunks}")
print(f"Number of tokens per chunk: {number_of_tokens_per_chunk}")

    return number_of_tokens_per_chunk
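
A minimal sketch (not part of this diff) of how the new chunk_transcript helper can be exercised; the word-count tokenizer and the plain-string sections are hypothetical stand-ins for meminto's Tokenizer and TranscriptSection:

from meminto.chunking import chunk_transcript


class WordCountTokenizer:
    """Hypothetical stand-in for meminto.llm.tokenizers.Tokenizer: counts words."""

    def number_of_tokens(self, text: str) -> int:
        return len(text.split())


# Plain strings stand in for TranscriptSection objects; chunk_transcript only
# relies on str() and the tokenizer's number_of_tokens().
sections = [f"SPEAKER_{i % 2}: some transcript text for section {i}.\n" for i in range(40)]
chunks = chunk_transcript(
    system_prompt="You write meeting minutes from transcripts.",
    transcript=sections,
    tokenizer=WordCountTokenizer(),
    max_tokens=200,
)
print(f"Number of chunks: {len(chunks)}")
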
2 changes: 1 addition & 1 deletion meminto/diarizer.py
@@ -1,7 +1,7 @@
from pathlib import Path
from pyannote.audio import Pipeline # type: ignore
from pyannote.core import Annotation # type: ignore
from decorators import log_time
from meminto.decorators import log_time


class Diarizer():
7 changes: 4 additions & 3 deletions meminto/llm/tokenizers.py
@@ -1,15 +1,16 @@
import os
import tiktoken
from sentencepiece import SentencePieceProcessor
from transformers import AutoTokenizer, OpenAIGPTTokenizer
from huggingface_hub import login


class Tokenizer:
    def __init__(self, model: str):
    def __init__(self, model: str, hugging_face_acces_token: str):
        self.model = model
        self.hugging_face_acces_token = hugging_face_acces_token
        self.tokenizer = self._select_tokenizer()

    def _select_tokenizer(self):
        login(token=self.hugging_face_acces_token)
        try:
            tokenizer = AutoTokenizer.from_pretrained(self.model)
        except:
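
A short sketch of how the updated constructor is wired up, mirroring the call in meminto/main.py below; it assumes the LLM_MODEL and HUGGING_FACE_ACCESS_TOKEN environment variables are set:

import os

from meminto.llm.tokenizers import Tokenizer

# Sketch only: the Hugging Face token is passed explicitly to the constructor,
# using the keyword name from the diff above.
tokenizer = Tokenizer(
    model=os.environ["LLM_MODEL"],
    hugging_face_acces_token=os.environ["HUGGING_FACE_ACCESS_TOKEN"],
)
print(tokenizer.number_of_tokens("Hello meeting minutes"))
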
52 changes: 32 additions & 20 deletions meminto/main.py
@@ -1,10 +1,11 @@
import os
from pathlib import Path
import click
from audio_processing import split_audio
from decorators import log_time
from meminto.llm.tokenizers import Tokenizer
from meminto.audio_processing import split_audio
from meminto.decorators import log_time
from meminto.diarizer import Diarizer
from helpers import (
from meminto.helpers import (
    Language,
    load_pkl,
    parse_input_file_path,
@@ -13,9 +14,9 @@
    select_language,
    write_text_to_file,
)
from transcript_to_meeting_minutes import (
    meeting_minutes_chunks_to_text,
    transcript_to_meeting_minutes,
from meminto.llm.llm import LLM
from meminto.meeting_minutes_generator import (
    MeetingMinutesGenerator,
)
from meminto.transcriber import (
    Transcriber,
@@ -28,11 +29,16 @@


@log_time
def create_meeting_minutes(audio_input_file_path: Path, output_folder_path: Path, language :Language):
    diarizer = Diarizer(model="pyannote/[email protected]", hugging_face_token=os.environ["HUGGING_FACE_ACCESS_TOKEN"])
def create_meeting_minutes(
    audio_input_file_path: Path, output_folder_path: Path, language: Language
):
    diarizer = Diarizer(
        model="pyannote/[email protected]",
        hugging_face_token=os.environ["HUGGING_FACE_ACCESS_TOKEN"],
    )
    diarization = diarizer.diarize_audio(audio_input_file_path)
    save_as_pkl(diarization, output_folder_path / "diarization.pkl")

    diarization = load_pkl(output_folder_path / "diarization.pkl")
    audio_sections = split_audio(audio_input_file_path, diarization)

@@ -41,20 +47,26 @@ def create_meeting_minutes(audio_input_file_path: Path, output_folder_path: Path
    save_as_pkl(transcript, output_folder_path / "transcript.pkl")
    save_transcript_as_txt(transcript, output_folder_path / "transcript.txt")

    transcript = load_pkl(
        output_folder_path / "transcript.pkl"
    tokenizer = Tokenizer(
        os.environ["LLM_MODEL"],
        hugging_face_acces_token=os.environ["HUGGING_FACE_ACCESS_TOKEN"],
    )
    merged_meeting_minutes, meeting_minutes_chunks = transcript_to_meeting_minutes(
        transcript, language
    llm = LLM(
        model=os.environ["LLM_MODEL"],
        url=os.environ["LLM_URL"],
        authorization=os.environ["LLM_AUTHORIZATION"],
        temperature=0.5,
        max_tokens=int(os.environ["LLM_MAX_TOKENS"]),
    )
    write_text_to_file(
        meeting_minutes_chunks_to_text(meeting_minutes_chunks),
        output_folder_path / "meeting_minutes_chunks.txt",
    )
    write_text_to_file(
        merged_meeting_minutes, output_folder_path / "meeting_minutes.txt"

    transcript = load_pkl(output_folder_path / "transcript.pkl")
    meeting_minutes_generator = MeetingMinutesGenerator(tokenizer=tokenizer, llm=llm)
    meeting_minutes = meeting_minutes_generator.generate(
        transcript=transcript, language=language
    )

    write_text_to_file(meeting_minutes, output_folder_path / "meeting_minutes.txt")


@click.command()
@click.option(
@@ -78,7 +90,7 @@ def create_meeting_minutes(audio_input_file_path: Path, output_folder_path: Path
default="english",
help="Select the language in which the meeting minutes should be generated. Currently supproted are 'english' and 'german'.",
)
def main(input_file: str, output_folder: str, language :str) -> None:
def main(input_file: str, output_folder: str, language: str) -> None:
    load_dotenv()
    audio_input_file_path = parse_input_file_path(input_file)
    output_folder_path = parse_output_folder_path(output_folder)
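
The refactored create_meeting_minutes pulls its configuration from environment variables (loaded via load_dotenv in main). A hedged pre-flight sketch, with the variable names taken from the calls to os.environ above; the check itself is illustrative and not part of the commit:

import os

REQUIRED_ENV_VARS = [
    "HUGGING_FACE_ACCESS_TOKEN",  # used by Diarizer and Tokenizer
    "LLM_MODEL",                  # used by Tokenizer and LLM
    "LLM_URL",                    # LLM endpoint
    "LLM_AUTHORIZATION",          # LLM authorization value
    "LLM_MAX_TOKENS",             # parsed with int() and used as the chunking budget
]

missing = [name for name in REQUIRED_ENV_VARS if name not in os.environ]
if missing:
    raise SystemExit(f"Missing environment variables: {', '.join(missing)}")
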
126 changes: 126 additions & 0 deletions meminto/meeting_minutes_generator.py
@@ -0,0 +1,126 @@
import os
from typing import Tuple
from meminto.decorators import log_time
from meminto.helpers import Language
from meminto.llm.llm import LLM
from meminto.chunking import chunk_transcript
from meminto.prompts import (
    CONTEXT,
    EXAMPLE_INPUT,
    EXAMPLE_INPUT_INTRO,
    EXAMPLE_OUTPUT,
    EXAMPLE_OUTPUT_INTRO,
    INSTRUCTIONS_CREATE_MEETING_MINUTES,
    INSTRUCTIONS_MERGE_MEETING_MINUTES,
    SELECT_LANGUAGE,
)
from meminto.llm.tokenizers import Tokenizer
from meminto.transcriber import TranscriptSection
from huggingface_hub import login


class MeetingMinutesGenerator:
    def __init__(
        self,
        tokenizer: Tokenizer,
        llm: LLM,
    ):
        self.tokenizer = tokenizer
        self.llm = llm

    @log_time
    def generate(self, transcript: list[TranscriptSection], language: Language) -> str:
        meeting_minutes_chunks = self.generate_meeting_minutes_chunks(
            transcript, language
        )

        merged_meeting_minutes = self.merged_meeting_minutes(
            meeting_minutes_chunks, language
        )

        return merged_meeting_minutes

    def generate_meeting_minutes_chunks(
        self,
        transcript: list[TranscriptSection],
        language: Language,
    ) -> list[str]:

        system_prompt = (
            CONTEXT
            + INSTRUCTIONS_CREATE_MEETING_MINUTES
            + SELECT_LANGUAGE
            + language.value
            + ".\n"
            + EXAMPLE_OUTPUT_INTRO
            + EXAMPLE_OUTPUT
        )

        transcript_chunks = chunk_transcript(
            system_prompt, transcript, self.tokenizer, self.llm.max_tokens
        )

        meeting_minutes_chunks = []
        for chunk in transcript_chunks:
            meeting_minutes_chunk = self.llm.infer(system_prompt, chunk)
            meeting_minutes_chunks.append(meeting_minutes_chunk)

        return meeting_minutes_chunks

    def merged_meeting_minutes(
        self,
        meeting_minutes_chunks: list[str],
        language: Language,
    ) -> str:
        system_prompt = (
            CONTEXT
            + INSTRUCTIONS_MERGE_MEETING_MINUTES
            + SELECT_LANGUAGE
            + language.value
            + ".\n"
            + EXAMPLE_INPUT_INTRO
            + EXAMPLE_INPUT
            + EXAMPLE_OUTPUT_INTRO
            + EXAMPLE_OUTPUT
        )

        while len(meeting_minutes_chunks) > 1:
            merged_meeting_minutes = []
            for i in range(0, len(meeting_minutes_chunks), 2):
                if i + 1 < len(meeting_minutes_chunks):
                    meeting_minutes_chunks_as_text = (
                        self.meeting_minutes_chunks_to_text(
                            meeting_minutes_chunks[i : i + 2]
                        )
                    )
                    token_count_system_prompt = self.tokenizer.number_of_tokens(
                        system_prompt
                    )
                    token_count_meeting_minutes = self.tokenizer.number_of_tokens(
                        meeting_minutes_chunks_as_text
                    )
print("Mergin meeting minutes chunks: ")
print(f"Token count of system prompt: {token_count_system_prompt}")
print(
f"Token count of meeting minutes chunks: {token_count_meeting_minutes}"
)
print(
f"Total token count: {token_count_system_prompt + token_count_meeting_minutes}"
)
merged_minutes = self.llm.infer(
system_prompt, meeting_minutes_chunks_as_text
)
merged_meeting_minutes.append(merged_minutes)
elif i < len(meeting_minutes_chunks):
merged_meeting_minutes.append(meeting_minutes_chunks[i])
meeting_minutes_chunks = merged_meeting_minutes
return meeting_minutes_chunks[0]

@staticmethod
def meeting_minutes_chunks_to_text(meeting_minutes_chunks: list[str]) -> str:
meeting_minutes_chunks_as_text = ""
for idx, chunks in enumerate(meeting_minutes_chunks):
meeting_minutes_chunks_as_text = (
meeting_minutes_chunks_as_text + f"Section {idx+1}\n" + chunks + "\n\n"
)
return meeting_minutes_chunks_as_text
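
A minimal sketch of the pairwise merge pass in isolation, using hypothetical stubs for the tokenizer, the LLM, and the language value (only .value is accessed on the language argument); it assumes the prompt constants imported at the top of the file are available:

from meminto.meeting_minutes_generator import MeetingMinutesGenerator


class StubTokenizer:
    """Hypothetical stand-in: counts whitespace-separated words."""

    def number_of_tokens(self, text: str) -> int:
        return len(text.split())


class StubLLM:
    """Hypothetical stand-in: reports how much text it was asked to merge."""

    max_tokens = 4096  # only needed by generate(); included for completeness

    def infer(self, system_prompt: str, user_prompt: str) -> str:
        return f"<merged minutes from {len(user_prompt)} characters>"


class StubLanguage:
    """Duck-typed stand-in for meminto.helpers.Language."""

    value = "English"


generator = MeetingMinutesGenerator(tokenizer=StubTokenizer(), llm=StubLLM())
# Three chunks collapse to two after the first pass and to one after the second.
minutes = generator.merged_meeting_minutes(
    ["minutes of part 1", "minutes of part 2", "minutes of part 3"],
    language=StubLanguage(),
)
print(minutes)
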
4 changes: 2 additions & 2 deletions meminto/transcriber.py
@@ -7,8 +7,8 @@
    WhisperForConditionalGeneration,
    pipeline,
)
from decorators import log_time
from audio_processing import AudioSection
from meminto.decorators import log_time
from meminto.audio_processing import AudioSection


class WHISPER_MODEL_SIZE(Enum):
