forked from FlorianSchepers/Meminto
-
Notifications
You must be signed in to change notification settings - Fork 0
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Merge pull request FlorianSchepers#17 from FlorianSchepers/make_meeti…
…ng_minutes_generation_more_object_oriented Make meeting minutes generation more object oriented
- Loading branch information
Showing
8 changed files
with
216 additions
and
223 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,50 @@ | ||
from meminto.llm.tokenizers import Tokenizer | ||
from meminto.transcriber import TranscriptSection | ||
|
||
RATIO_OF_TOKENS_RESERVED_FOR_RESPONSE = 0.3 | ||
|
||
def chunk_transcript(
    system_prompt: str,
    transcript: list[TranscriptSection],
    tokenizer: Tokenizer,
    max_tokens: int,
) -> list[str]:
    """Split a transcript into chunks that fit within the LLM token budget.

    Args:
        system_prompt: Prompt that accompanies every chunk; its token count
            is subtracted from the available budget.
        transcript: Ordered transcript sections; each is stringified and
            appended to the current chunk.
        tokenizer: Used to count tokens of the accumulated chunk text.
        max_tokens: The LLM's maximum context size in tokens.

    Returns:
        List of chunk strings, preserving transcript order. May be empty if
        the transcript is empty.
    """
    number_of_tokens_per_chunk = _number_of_tokens_per_chunk(
        system_prompt=system_prompt,
        transcript=transcript,
        tokenizer=tokenizer,
        max_tokens=max_tokens,
    )

    transcript_chunks: list[str] = []
    current_chunk = ""
    for transcript_section in transcript:
        current_chunk = current_chunk + str(transcript_section)
        # BUGFIX: the original measured current_chunk + str(transcript_section)
        # here, counting the just-appended section twice and flushing chunks
        # earlier than the computed budget. Count only what the chunk holds.
        if tokenizer.number_of_tokens(current_chunk) >= number_of_tokens_per_chunk:
            transcript_chunks.append(current_chunk)
            current_chunk = ""
    if current_chunk:
        transcript_chunks.append(current_chunk)
    return transcript_chunks
|
||
def _number_of_tokens_per_chunk(
    system_prompt: str,
    transcript: list[TranscriptSection],
    tokenizer: Tokenizer,
    max_tokens: int,
    ratio_reserved_for_response: "float | None" = None,
) -> int:
    """Compute the token budget for each transcript chunk.

    The budget is what remains of ``max_tokens`` after subtracting the system
    prompt and a fraction reserved for the model's response; the transcript is
    then divided evenly across the minimal number of chunks that fit.

    Args:
        system_prompt: Prompt sent with every chunk.
        transcript: Transcript sections (stringified for token counting).
        tokenizer: Token counter for prompt and transcript text.
        max_tokens: The LLM's maximum context size in tokens.
        ratio_reserved_for_response: Fraction of the remaining budget kept for
            the response; defaults to RATIO_OF_TOKENS_RESERVED_FOR_RESPONSE.

    Returns:
        Number of tokens each chunk may contain (always >= 1 via the +1 terms).
    """
    if ratio_reserved_for_response is None:
        ratio_reserved_for_response = RATIO_OF_TOKENS_RESERVED_FOR_RESPONSE

    token_count_system_prompt = tokenizer.number_of_tokens(system_prompt)
    token_count_available = int(max_tokens) - token_count_system_prompt
    token_count_reserved_for_response = int(
        token_count_available * ratio_reserved_for_response
    )
    token_count_per_chunk = token_count_available - token_count_reserved_for_response

    token_count_transcript = tokenizer.number_of_tokens("".join(map(str, transcript)))
    # +1 rounds the chunk count up, then +1 rounds the per-chunk size up so
    # the chunks together always cover the whole transcript.
    number_of_chunks = token_count_transcript // token_count_per_chunk + 1
    number_of_tokens_per_chunk = token_count_transcript // number_of_chunks + 1

    # Typo fixed ("Spliting") and needless f-prefix dropped on the first line.
    print("Splitting transcript in chunks:")
    print(f"LLM max. token count: {max_tokens}")
    print(f"Token count of system prompt: {token_count_system_prompt}")
    print(f"Token count reserved for response: {token_count_reserved_for_response}")
    print(f"Token count per transcript chunk: {token_count_per_chunk}")
    print(f"Token count of transcript: {token_count_transcript}")
    print(f"Number of chunks: {number_of_chunks}")
    print(f"Number of tokens per chunk: {number_of_tokens_per_chunk}")

    return number_of_tokens_per_chunk
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,10 +1,11 @@ | ||
import os | ||
from pathlib import Path | ||
import click | ||
from audio_processing import split_audio | ||
from decorators import log_time | ||
from meminto.llm.tokenizers import Tokenizer | ||
from meminto.audio_processing import split_audio | ||
from meminto.decorators import log_time | ||
from meminto.diarizer import Diarizer | ||
from helpers import ( | ||
from meminto.helpers import ( | ||
Language, | ||
load_pkl, | ||
parse_input_file_path, | ||
|
@@ -13,9 +14,9 @@ | |
select_language, | ||
write_text_to_file, | ||
) | ||
from transcript_to_meeting_minutes import ( | ||
meeting_minutes_chunks_to_text, | ||
transcript_to_meeting_minutes, | ||
from meminto.llm.llm import LLM | ||
from meminto.meeting_minutes_generator import ( | ||
MeetingMinutesGenerator, | ||
) | ||
from meminto.transcriber import ( | ||
Transcriber, | ||
|
@@ -28,11 +29,16 @@ | |
|
||
|
||
@log_time | ||
def create_meeting_minutes(audio_input_file_path: Path, output_folder_path: Path, language :Language): | ||
diarizer = Diarizer(model="pyannote/[email protected]", hugging_face_token=os.environ["HUGGING_FACE_ACCESS_TOKEN"]) | ||
def create_meeting_minutes( | ||
audio_input_file_path: Path, output_folder_path: Path, language: Language | ||
): | ||
diarizer = Diarizer( | ||
model="pyannote/[email protected]", | ||
hugging_face_token=os.environ["HUGGING_FACE_ACCESS_TOKEN"], | ||
) | ||
diarization = diarizer.diarize_audio(audio_input_file_path) | ||
save_as_pkl(diarization, output_folder_path / "diarization.pkl") | ||
|
||
diarization = load_pkl(output_folder_path / "diarization.pkl") | ||
audio_sections = split_audio(audio_input_file_path, diarization) | ||
|
||
|
@@ -41,20 +47,26 @@ def create_meeting_minutes(audio_input_file_path: Path, output_folder_path: Path | |
save_as_pkl(transcript, output_folder_path / "transcript.pkl") | ||
save_transcript_as_txt(transcript, output_folder_path / "transcript.txt") | ||
|
||
transcript = load_pkl( | ||
output_folder_path / "transcript.pkl" | ||
tokenizer = Tokenizer( | ||
os.environ["LLM_MODEL"], | ||
hugging_face_acces_token=os.environ["HUGGING_FACE_ACCESS_TOKEN"], | ||
) | ||
merged_meeting_minutes, meeting_minutes_chunks = transcript_to_meeting_minutes( | ||
transcript, language | ||
llm = LLM( | ||
model=os.environ["LLM_MODEL"], | ||
url=os.environ["LLM_URL"], | ||
authorization=os.environ["LLM_AUTHORIZATION"], | ||
temperature=0.5, | ||
max_tokens=int(os.environ["LLM_MAX_TOKENS"]), | ||
) | ||
write_text_to_file( | ||
meeting_minutes_chunks_to_text(meeting_minutes_chunks), | ||
output_folder_path / "meeting_minutes_chunks.txt", | ||
) | ||
write_text_to_file( | ||
merged_meeting_minutes, output_folder_path / "meeting_minutes.txt" | ||
|
||
transcript = load_pkl(output_folder_path / "transcript.pkl") | ||
meeting_minutes_generator = MeetingMinutesGenerator(tokenizer=tokenizer, llm=llm) | ||
meeting_minutes = meeting_minutes_generator.generate( | ||
transcript=transcript, language=language | ||
) | ||
|
||
write_text_to_file(meeting_minutes, output_folder_path / "meeting_minutes.txt") | ||
|
||
|
||
@click.command() | ||
@click.option( | ||
|
@@ -78,7 +90,7 @@ def create_meeting_minutes(audio_input_file_path: Path, output_folder_path: Path | |
default="english", | ||
help="Select the language in which the meeting minutes should be generated. Currently supported are 'english' and 'german'.", ||
) | ||
def main(input_file: str, output_folder: str, language :str) -> None: | ||
def main(input_file: str, output_folder: str, language: str) -> None: | ||
load_dotenv() | ||
audio_input_file_path = parse_input_file_path(input_file) | ||
output_folder_path = parse_output_folder_path(output_folder) | ||
|
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,126 @@ | ||
import os | ||
from typing import Tuple | ||
from meminto.decorators import log_time | ||
from meminto.helpers import Language | ||
from meminto.llm.llm import LLM | ||
from meminto.chunking import chunk_transcript | ||
from meminto.prompts import ( | ||
CONTEXT, | ||
EXAMPLE_INPUT, | ||
EXAMPLE_INPUT_INTRO, | ||
EXAMPLE_OUTPUT, | ||
EXAMPLE_OUTPUT_INTRO, | ||
INSTRUCTIONS_CREATE_MEETING_MINUTES, | ||
INSTRUCTIONS_MERGE_MEETING_MINUTES, | ||
SELECT_LANGUAGE, | ||
) | ||
from meminto.llm.tokenizers import Tokenizer | ||
from meminto.transcriber import TranscriptSection | ||
from huggingface_hub import login | ||
|
||
|
||
class MeetingMinutesGenerator:
    """Generates meeting minutes from a transcript via an LLM.

    The transcript is split into chunks that fit the LLM context window,
    minutes are generated per chunk, and the per-chunk minutes are merged
    pairwise until a single document remains.
    """

    def __init__(
        self,
        tokenizer: Tokenizer,
        llm: LLM,
    ):
        # Tokenizer budgets chunk sizes; LLM performs the actual inference.
        self.tokenizer = tokenizer
        self.llm = llm

    @log_time
    def generate(self, transcript: list[TranscriptSection], language: Language) -> str:
        """Return the final, fully merged meeting minutes for *transcript*."""
        meeting_minutes_chunks = self.generate_meeting_minutes_chunks(
            transcript, language
        )

        merged_meeting_minutes = self.merged_meeting_minutes(
            meeting_minutes_chunks, language
        )

        return merged_meeting_minutes

    def generate_meeting_minutes_chunks(
        self,
        transcript: list[TranscriptSection],
        language: Language,
    ) -> list[str]:
        """Generate one meeting-minutes text per transcript chunk.

        Builds the creation prompt (with output language and example output),
        chunks the transcript to fit the LLM's token budget, and runs one
        inference per chunk.
        """
        system_prompt = (
            CONTEXT
            + INSTRUCTIONS_CREATE_MEETING_MINUTES
            + SELECT_LANGUAGE
            + language.value
            + ".\n"
            + EXAMPLE_OUTPUT_INTRO
            + EXAMPLE_OUTPUT
        )

        transcript_chunks = chunk_transcript(
            system_prompt, transcript, self.tokenizer, self.llm.max_tokens
        )

        meeting_minutes_chunks = []
        for chunk in transcript_chunks:
            meeting_minutes_chunk = self.llm.infer(system_prompt, chunk)
            meeting_minutes_chunks.append(meeting_minutes_chunk)

        return meeting_minutes_chunks

    def merged_meeting_minutes(
        self,
        meeting_minutes_chunks: list[str],
        language: Language,
    ) -> str:
        """Merge per-chunk minutes pairwise until a single text remains.

        Each round halves the number of chunks by asking the LLM to merge two
        adjacent chunks; an odd trailing chunk is carried over unchanged.

        Returns an empty string for an empty input (the original raised
        IndexError on ``[0]`` in that case).
        """
        # ROBUSTNESS: an empty transcript yields no chunks; avoid IndexError.
        if not meeting_minutes_chunks:
            return ""

        system_prompt = (
            CONTEXT
            + INSTRUCTIONS_MERGE_MEETING_MINUTES
            + SELECT_LANGUAGE
            + language.value
            + ".\n"
            + EXAMPLE_INPUT_INTRO
            + EXAMPLE_INPUT
            + EXAMPLE_OUTPUT_INTRO
            + EXAMPLE_OUTPUT
        )

        while len(meeting_minutes_chunks) > 1:
            merged_meeting_minutes = []
            for i in range(0, len(meeting_minutes_chunks), 2):
                if i + 1 < len(meeting_minutes_chunks):
                    meeting_minutes_chunks_as_text = (
                        self.meeting_minutes_chunks_to_text(
                            meeting_minutes_chunks[i : i + 2]
                        )
                    )
                    token_count_system_prompt = self.tokenizer.number_of_tokens(
                        system_prompt
                    )
                    token_count_meeting_minutes = self.tokenizer.number_of_tokens(
                        meeting_minutes_chunks_as_text
                    )
                    # Typo fixed: "Mergin" -> "Merging".
                    print("Merging meeting minutes chunks: ")
                    print(f"Token count of system prompt: {token_count_system_prompt}")
                    print(
                        f"Token count of meeting minutes chunks: {token_count_meeting_minutes}"
                    )
                    print(
                        f"Total token count: {token_count_system_prompt + token_count_meeting_minutes}"
                    )
                    merged_minutes = self.llm.infer(
                        system_prompt, meeting_minutes_chunks_as_text
                    )
                    merged_meeting_minutes.append(merged_minutes)
                else:
                    # `i < len(...)` always holds inside range(); the original
                    # `elif` was dead logic. Odd chunk out: carry it forward.
                    merged_meeting_minutes.append(meeting_minutes_chunks[i])
            meeting_minutes_chunks = merged_meeting_minutes
        return meeting_minutes_chunks[0]

    @staticmethod
    def meeting_minutes_chunks_to_text(meeting_minutes_chunks: list[str]) -> str:
        """Concatenate chunks, prefixing each with a 1-based 'Section N' header."""
        meeting_minutes_chunks_as_text = ""
        for idx, chunks in enumerate(meeting_minutes_chunks):
            meeting_minutes_chunks_as_text = (
                meeting_minutes_chunks_as_text + f"Section {idx+1}\n" + chunks + "\n\n"
            )
        return meeting_minutes_chunks_as_text
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Oops, something went wrong.