pip install pyannote.audio
Scenario:
An audio recording contains multiple speakers, and the goal is to separate the speech of the different individuals.
Given reference voice embeddings for known individuals, the cosine distance between each extracted segment's embedding and those references is calculated, and the individual with the minimum cosine distance is identified as the speaker.
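The matching rule itself is just a nearest-neighbour search in embedding space. A minimal sketch of that rule, using made-up 3-dimensional vectors in place of real pyannote embeddings (the names and values below are illustrative only, real embeddings have far more dimensions):

import numpy as np
from scipy.spatial.distance import cosine

# Hypothetical reference embeddings for two known speakers
speaker_refs = {
    "mick": [np.array([0.9, 0.1, 0.0]), np.array([0.8, 0.2, 0.1])],
    "moon": [np.array([0.1, 0.9, 0.2])],
}

# Hypothetical embedding extracted from an unknown speech segment
segment_embedding = np.array([0.85, 0.15, 0.05])

# Cosine distance to the closest reference embedding of each known speaker
distances = {
    name: min(cosine(segment_embedding, e) for e in refs)
    for name, refs in speaker_refs.items()
}

# The speaker with the minimum distance is taken as the match
print(min(distances, key=distances.get))  # -> mick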
# _*_ coding: utf-8 _*_
# @Time : 2024/3/16 10:47
# @Author : Michael
# @File : spearker_rec.py
# @desc :
import torch
from pyannote.audio import Model, Pipeline, Inference
from pyannote.core import Segment
from scipy.spatial.distance import cosine
# Extract the embedding of a given speaker from an audio file
def extract_speaker_embedding(pipeline, audio_file, speaker_label):
    diarization = pipeline(audio_file)
    speaker_embedding = None
    for turn, _, label in diarization.itertracks(yield_label=True):
        if label == speaker_label:
            # Crop the embedding of the first segment attributed to this speaker
            # (uses the module-level `inference` defined under __main__)
            segment = Segment(turn.start, turn.end)
            speaker_embedding = inference.crop(audio_file, segment)
            break
    return speaker_embedding
# Extract an embedding for each segment of a given audio file and compare it with the embeddings in the speaker library
def recognize_speaker(pipeline, audio_file):
    diarization = pipeline(audio_file)
    speaker_turns = []
    for turn, _, speaker_label in diarization.itertracks(yield_label=True):
        # Extract the embedding for this segment
        embedding = inference.crop(audio_file, turn)
        distances = {}
        for speaker, embeddings in speaker_embeddings.items():
            # Calculate the cosine distance between the segment embedding and the known speaker embeddings
            distances[speaker] = min([cosine(embedding, e) for e in embeddings])
        # Select the speaker with the minimum distance
        recognized_speaker = min(distances, key=distances.get)
        # Record the time segment and the predicted speaker with the minimum cosine distance
        speaker_turns.append((turn, recognized_speaker))
    return speaker_turns
if __name__ == "__main__":
    token = "hf_***"  # Please replace with your Hugging Face token

    # Load the speaker diarization pipeline
    pipeline = Pipeline.from_pretrained(
        "pyannote/speaker-diarization-3.1",
        use_auth_token=token,  # Accept the terms of use on the model page to obtain a Hugging Face token
        # cache_dir="/home/huggingface/hub/models--pyannote--speaker-diarization-3.1/"
    )

    # Load the speaker embedding model
    embed_model = Model.from_pretrained("pyannote/embedding", use_auth_token=token)
    inference = Inference(embed_model, window="whole")
    # pipeline.to(torch.device("cuda"))

    # Assume you have a collection of audio files of known speakers and their corresponding labels
    audio_files = {
        "mick": "mick.wav",  # Audio of mick
        "moon": "moon.wav",  # Audio of moon
    }

    # Build the library of reference embeddings for the known speakers
    speaker_embeddings = {}
    for speaker, audio_file in audio_files.items():
        diarization = pipeline(audio_file)
        for turn, _, speaker_label in diarization.itertracks(yield_label=True):
            embedding = extract_speaker_embedding(pipeline, audio_file, speaker_label)
            speaker_embeddings.setdefault(speaker, []).append(embedding)

    # Given a new, unknown audio file
    given_audio_file = "2_voice.wav"  # The first half is mick speaking, the second half is moon speaking

    # Recognize the speakers in the given audio
    recognized_speakers = recognize_speaker(pipeline, given_audio_file)
    print("Recognized speakers in the given audio:")
    for turn, speaker in recognized_speakers:
        print(f"Speaker {speaker} spoke between {turn.start:.2f}s and {turn.end:.2f}s")
Output:
Model was trained with pyannote.audio 0.0.1, yours is 3.1.1. Bad things might happen unless you revert pyannote.audio to 0.x.
Model was trained with torch 1.8.1+cu102, yours is 2.2.1+cpu. Bad things might happen unless you revert torch to 1.x.
Recognized speakers in the given audio:
Speaker mick spoke between 0.57s and 1.67s
Speaker moon spoke between 2.47s and 2.81s
Speaker moon spoke between 3.08s and 4.47s