-
Notifications
You must be signed in to change notification settings - Fork 0
/
3_longest_durations.py
64 lines (48 loc) · 2.56 KB
/
3_longest_durations.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
import os
import pandas as pd
def duration_to_seconds(duration):
    """Convert a colon-separated ``H:M:S`` string into a number of seconds.

    The string must contain exactly three integer fields; anything else
    raises ``ValueError``.
    """
    hours, minutes, seconds = (int(field) for field in duration.split(':'))
    # Horner form of hours * 3600 + minutes * 60 + seconds.
    return (hours * 60 + minutes) * 60 + seconds
def seconds_to_hhmmss(total_seconds):
    """Format a count of seconds as a zero-padded ``HH:MM:SS`` string.

    Each field is padded to at least two digits; the hours field grows
    beyond two digits when needed.
    """
    whole_minutes, secs = divmod(total_seconds, 60)
    hours, mins = divmod(whole_minutes, 60)
    return ":".join(f"{field:02}" for field in (hours, mins, secs))
def process_speaker_data(directory):
    """Aggregate per-speaker speaking time and statement counts from CSV files.

    Every file directly inside *directory* whose name ends in ``.csv`` is
    read with pandas. Files that lack a ``Speaker`` or ``Duration`` column
    are ignored, and rows where either of those two values is missing are
    skipped.

    Args:
        directory: Path of the folder to scan (sub-folders are not visited).

    Returns:
        A ``(speaker_data, total_speaking_time_all)`` tuple. ``speaker_data``
        maps each speaker to ``{'total_time': <seconds>,
        'statement_count': <rows>}``; ``total_speaking_time_all`` is the
        summed duration, in seconds, of every counted row.

    Raises:
        ValueError: If a ``Duration`` value is not in the format expected by
            ``duration_to_seconds``.
    """
    speaker_data = {}
    total_speaking_time_all = 0

    # Sorted so the key order of the result does not depend on the order in
    # which the operating system happens to list the directory.
    for file in sorted(os.listdir(directory)):
        if not file.endswith('.csv'):
            continue
        df = pd.read_csv(os.path.join(directory, file))
        if not {'Speaker', 'Duration'}.issubset(df.columns):
            continue

        # Blank cells are read as NaN: a NaN duration cannot be parsed and a
        # NaN speaker cannot be attributed, so such rows are left out.
        rows = df[['Speaker', 'Duration']].dropna()
        for speaker, raw_duration in zip(rows['Speaker'], rows['Duration']):
            duration = duration_to_seconds(raw_duration)
            total_speaking_time_all += duration
            stats = speaker_data.setdefault(
                speaker, {'total_time': 0, 'statement_count': 0})
            stats['total_time'] += duration
            stats['statement_count'] += 1

    return speaker_data, total_speaking_time_all
def filter_and_calculate_percentage(speaker_data, total_speaking_time_all, min_statements=15):
    """Compute the share of speaking time held by high-frequency speakers.

    A speaker counts as high-frequency when their ``statement_count`` is
    strictly greater than *min_statements*.

    Args:
        speaker_data: Mapping of speaker to a dict holding ``'total_time'``
            (seconds) and ``'statement_count'``.
        total_speaking_time_all: Speaking time of all speakers, in seconds.
        min_statements: Exclusive lower bound on ``statement_count``.

    Returns:
        A ``(percentage_high_freq, total_time_hhmmss)`` tuple: the
        high-frequency speakers' share of *total_speaking_time_all* as a
        percentage, and their combined speaking time formatted by
        ``seconds_to_hhmmss``. The percentage is ``0.0`` when
        *total_speaking_time_all* is zero.
    """
    # Summing directly also covers the case where no speaker qualifies:
    # the total is then simply 0 rather than an error.
    total_high_freq_time = sum(
        data['total_time']
        for data in speaker_data.values()
        if data['statement_count'] > min_statements
    )

    # With no recorded speaking time a ratio is undefined; report 0%.
    if total_speaking_time_all:
        percentage_high_freq = (total_high_freq_time / total_speaking_time_all) * 100
    else:
        percentage_high_freq = 0.0

    # Convert total speaking time from seconds to hh:mm:ss format
    total_time_hhmmss = seconds_to_hhmmss(total_high_freq_time)
    return percentage_high_freq, total_time_hhmmss
def main(directory='./srcs/'):
    """Print the speaking-time share of high-frequency speakers.

    Reads the CSV files in *directory*, then prints the percentage of total
    speaking time attributable to high-frequency speakers and their combined
    speaking time.

    Args:
        directory: Folder containing the CSV files to analyse. Defaults to
            ``'./srcs/'``, so calling ``main()`` with no arguments behaves
            as before.
    """
    speaker_data, total_speaking_time_all = process_speaker_data(directory)
    percentage_high_freq, total_time_hhmmss = filter_and_calculate_percentage(
        speaker_data, total_speaking_time_all)

    print(f"Percentage of total speaking time by high-frequency speakers: {percentage_high_freq:.2f}%")
    print(f"Total speaking time by high-frequency speakers: {total_time_hhmmss}")


if __name__ == "__main__":
    main()