pip install pyannote.audio
Scenario:
An audio recording contains multiple speakers, and the goal is to separate the speech of the different individuals.
Given reference voice embeddings for known individuals, the cosine distance between each extracted segment's embedding and those references is calculated, and the individual with the minimum cosine distance is identified as the speaker.
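The matching rule itself is just a nearest-neighbour search in embedding space. A minimal sketch of that rule, using made-up 3-dimensional vectors in place of real pyannote embeddings (the names and values below are illustrative only, real embeddings have far more dimensions):

import numpy as np
from scipy.spatial.distance import cosine

# Hypothetical reference embeddings for two known speakers
speaker_refs = {
    "mick": [np.array([0.9, 0.1, 0.0]), np.array([0.8, 0.2, 0.1])],
    "moon": [np.array([0.1, 0.9, 0.2])],
}

# Hypothetical embedding extracted from an unknown speech segment
segment_embedding = np.array([0.85, 0.15, 0.05])

# Cosine distance to the closest reference embedding of each known speaker
distances = {
    name: min(cosine(segment_embedding, e) for e in refs)
    for name, refs in speaker_refs.items()
}

# The speaker with the minimum distance is taken as the match
print(min(distances, key=distances.get))  # -> mick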
# _*_ coding: utf-8 _*_
# @Time : 2024/3/16 10:47
# @Author : Michael
# @File : spearker_rec.py
# @desc :
import torch
from pyannote.audio import Model, Pipeline, Inference
from pyannote.core import Segment
from scipy.spatial.distance import cosine
# Extract the embedding of a given speaker from an audio file
def extract_speaker_embedding(pipeline, audio_file, speaker_label):
    diarization = pipeline(audio_file)
    speaker_embedding = None
    for turn, _, label in diarization.itertracks(yield_label=True):
        if label == speaker_label:
            # Crop the embedding of the first segment attributed to this speaker
            # (uses the module-level `inference` defined under __main__)
            segment = Segment(turn.start, turn.end)
            speaker_embedding = inference.crop(audio_file, segment)
            break
    return speaker_embedding
# Extract an embedding for each segment of a given audio file and compare it with the embeddings in the speaker library
def recognize_speaker(pipeline, audio_file):
    diarization = pipeline(audio_file)
    speaker_turns = []
    for turn, _, speaker_label in diarization.itertracks(yield_label=True):
        # Extract the embedding for this segment
        embedding = inference.crop(audio_file, turn)
        distances = {}
        for speaker, embeddings in speaker_embeddings.items():
            # Calculate the cosine distance between the segment embedding and the known speaker embeddings
            distances[speaker] = min([cosine(embedding, e) for e in embeddings])
        # Select the speaker with the minimum distance
        recognized_speaker = min(distances, key=distances.get)
        # Record the time segment and the predicted speaker with the minimum cosine distance
        speaker_turns.append((turn, recognized_speaker))
    return speaker_turns
if __name__ == "__main__":
    token = "hf_***"  # Please replace with your Hugging Face token

    # Load the speaker diarization pipeline
    pipeline = Pipeline.from_pretrained(
        "pyannote/speaker-diarization-3.1",
        use_auth_token=token,  # Accept the terms of use on the model page to obtain a Hugging Face token
        # cache_dir="/home/huggingface/hub/models--pyannote--speaker-diarization-3.1/"
    )

    # Load the speaker embedding model
    embed_model = Model.from_pretrained("pyannote/embedding", use_auth_token=token)
    inference = Inference(embed_model, window="whole")
    # pipeline.to(torch.device("cuda"))

    # Assume you have a collection of audio files of known speakers and their corresponding labels
    audio_files = {
        "mick": "mick.wav",  # Audio of mick
        "moon": "moon.wav",  # Audio of moon
    }

    # Build the library of reference embeddings for the known speakers
    speaker_embeddings = {}
    for speaker, audio_file in audio_files.items():
        diarization = pipeline(audio_file)
        for turn, _, speaker_label in diarization.itertracks(yield_label=True):
            embedding = extract_speaker_embedding(pipeline, audio_file, speaker_label)
            speaker_embeddings.setdefault(speaker, []).append(embedding)

    # Given a new, unknown audio file
    given_audio_file = "2_voice.wav"  # The first half is mick speaking, the second half is moon speaking

    # Recognize the speakers in the given audio
    recognized_speakers = recognize_speaker(pipeline, given_audio_file)
    print("Recognized speakers in the given audio:")
    for turn, speaker in recognized_speakers:
        print(f"Speaker {speaker} spoke between {turn.start:.2f}s and {turn.end:.2f}s")
Output:
Model was trained with pyannote.audio 0.0.1, yours is 3.1.1. Bad things might happen unless you revert pyannote.audio to 0.x.
Model was trained with torch 1.8.1+cu102, yours is 2.2.1+cpu. Bad things might happen unless you revert torch to 1.x.
Recognized speakers in the given audio:
Speaker mick spoke between 0.57s and 1.67s
Speaker moon spoke between 2.47s and 2.81s
Speaker moon spoke between 3.08s and 4.47s