# Source code for speechpy.feature

"""feature module.

This module provides functions for calculating the main speech
features that the package aims to extract, as well as the
elements required to compute them.


Functions:

    filterbanks: Compute the Mel-filterbanks
                 The filterbanks must be created for extracting
                 speech features such as MFCC.

    mfcc: Extracting Mel Frequency Cepstral Coefficient feature.

    mfe: Extracting Mel Energy feature.

    lmfe: Extracting Log Mel Energy feature.

    extract_derivative_feature: Extract the first and second derivative
        features. This function directly uses the ``derivative_extraction``
        function in the ``processing`` module.

"""

from __future__ import division
import numpy as np
from . import processing
from scipy.fftpack import dct
from . import functions


def filterbanks(
        num_filter,
        coefficients,
        sampling_freq,
        low_freq=None,
        high_freq=None):
    """Compute the Mel-filterbanks. Each filter is stored in one row.
    The columns correspond to FFT bins.

    Args:
        num_filter (int): the number of filters in the filterbank.
        coefficients (int): the number of one-sided FFT bins, i.e.
            (fftpoints//2 + 1).
        sampling_freq (float): the samplerate of the signal we are working
            with. It affects mel spacing.
        low_freq (float): lowest band edge of mel filters in Hz. An explicit
            0 is honoured; when left as None, 300 Hz is used.
        high_freq (float): highest band edge of mel filters in Hz,
            default samplerate/2.

    Returns:
           array: A numpy array of size num_filter x (fftpoints//2 + 1)
               which holds the filterbank.

    Raises:
        AssertionError: if ``high_freq`` exceeds half of the sampling
            frequency or ``low_freq`` is negative.
    """
    # A zero upper edge is meaningless, so any falsy value means "unset".
    high_freq = high_freq or sampling_freq / 2
    # Compare against None explicitly: ``low_freq or 300`` would silently
    # turn a requested lower edge of 0 Hz into 300 Hz.
    if low_freq is None:
        low_freq = 300
    s = "High frequency cannot be greater than half of the sampling frequency!"
    assert high_freq <= sampling_freq / 2, s
    assert low_freq >= 0, "low frequency cannot be less than zero!"

    # Computing the Mel filterbank
    # converting the upper and lower frequencies to Mels.
    # num_filter + 2 is because for num_filter filterbanks we need
    # num_filter+2 point.
    mels = np.linspace(
        functions.frequency_to_mel(low_freq),
        functions.frequency_to_mel(high_freq),
        num_filter + 2)

    # we should convert Mels back to Hertz because the start and end-points
    # should be at the desired frequencies.
    hertz = functions.mel_to_frequency(mels)

    # Map every edge frequency onto an FFT bin index.
    # ``coefficients`` counts the one-sided spectrum bins
    # (fft_points//2 + 1), so the FFT size is recovered first (an even FFT
    # length is assumed). Scaling by the bin count itself would place the
    # Nyquist frequency near the middle column and leave the upper half of
    # the spectrum without any filter.
    fft_points = 2 * (coefficients - 1)
    freq_index = np.floor(
        (fft_points + 1) * hertz / sampling_freq).astype(int)

    # Initial definition
    filterbank = np.zeros([num_filter, coefficients])

    # The triangular function for each filter: filter i spans the bins
    # between edge i and edge i + 2 and peaks at edge i + 1.
    for i in range(num_filter):
        left, middle, right = (int(edge) for edge in freq_index[i:i + 3])
        z = np.linspace(left, right, num=right - left + 1)
        filterbank[i, left:right + 1] = functions.triangle(
            z, left=left, middle=middle, right=right)

    return filterbank


def mfcc(
        signal,
        sampling_frequency,
        frame_length=0.020,
        frame_stride=0.01,
        num_cepstral=13,
        num_filters=40,
        fft_length=512,
        low_frequency=0,
        high_frequency=None,
        dc_elimination=True):
    """Compute MFCC features from an audio signal.

    Args:
        signal (array): the audio signal from which to compute features.
            Should be an N x 1 array.
        sampling_frequency (int): the sampling frequency of the signal
            we are working with.
        frame_length (float): the length of each frame in seconds.
            Default is 0.020s.
        frame_stride (float): the step between successive frames in
            seconds. Default is 0.01s.
        num_cepstral (int): number of cepstral coefficients to keep.
            Default is 13.
        num_filters (int): the number of filters in the filterbank,
            default 40.
        fft_length (int): number of FFT points. Default is 512.
        low_frequency (float): lowest band edge of mel filters, in Hz.
            Default is 0.
        high_frequency (float): highest band edge of mel filters, in Hz.
            Default is samplerate/2.
        dc_elimination (bool): if True, the first cepstral coefficient is
            replaced with the log of the frame energy.

    Returns:
        array: A numpy array of size (num_frames x num_cepstral)
            containing mfcc features.
    """
    mel_energies, frame_energies = mfe(
        signal,
        sampling_frequency=sampling_frequency,
        frame_length=frame_length,
        frame_stride=frame_stride,
        num_filters=num_filters,
        fft_length=fft_length,
        low_frequency=low_frequency,
        high_frequency=high_frequency)

    # No frames were produced: hand back an empty matrix of matching width.
    if not len(mel_energies):
        return np.empty((0, num_cepstral))

    # Cepstrum = DCT of the log filterbank energies, truncated to the
    # requested number of coefficients.
    log_mel_energies = np.log(mel_energies)
    cepstra = dct(log_mel_energies, type=2, axis=-1, norm='ortho')
    cepstra = cepstra[:, :num_cepstral]

    # replace first cepstral coefficient with log of frame energy for DC
    # elimination.
    if dc_elimination:
        cepstra[:, 0] = np.log(frame_energies)
    return cepstra
def mfe(signal, sampling_frequency, frame_length=0.020, frame_stride=0.01,
        num_filters=40, fft_length=512, low_frequency=0,
        high_frequency=None):
    """Compute Mel-filterbank energy features from an audio signal.

    Args:
        signal (array): the audio signal from which to compute features.
            Should be an N x 1 array.
        sampling_frequency (int): the sampling frequency of the signal
            we are working with.
        frame_length (float): the length of each frame in seconds.
            Default is 0.020s.
        frame_stride (float): the step between successive frames in
            seconds. Default is 0.01s.
        num_filters (int): the number of filters in the filterbank,
            default 40.
        fft_length (int): number of FFT points. Default is 512.
        low_frequency (float): lowest band edge of mel filters, in Hz,
            forwarded to ``filterbanks``. Default is 0.
        high_frequency (float): highest band edge of mel filters, in Hz.
            Default is samplerate/2.

    Returns:
        tuple: ``(features, frame_energies)`` where ``features`` is the
            filterbank energy of size num_frames x num_filters and
            ``frame_energies`` is the total energy of each frame.
    """
    def rectangular_window(length):
        # All-ones window, i.e. the frames are not tapered.
        return np.ones((length, ))

    # Convert to float
    samples = signal.astype(float)

    # Stack frames
    framed = processing.stack_frames(
        samples,
        sampling_frequency=sampling_frequency,
        frame_length=frame_length,
        frame_stride=frame_stride,
        filter=rectangular_window,
        zero_padding=False)

    # Fall back to the Nyquist frequency when no upper edge is given.
    upper_edge = high_frequency or sampling_frequency / 2

    # Power spectrum of every frame; one column per FFT bin.
    spectrum = processing.power_spectrum(framed, fft_length)
    num_bins = spectrum.shape[1]

    # Total energy of each frame, passed through the zero-handling helper
    # (presumably so a later log never sees an exact zero - confirm in
    # ``functions.zero_handling``).
    energy_per_frame = functions.zero_handling(np.sum(spectrum, 1))

    # Build the filterbank and project the spectrum onto it.
    bank = filterbanks(
        num_filters,
        num_bins,
        sampling_frequency,
        low_frequency,
        upper_edge)
    band_energies = functions.zero_handling(np.dot(spectrum, bank.T))

    return band_energies, energy_per_frame
def lmfe(signal, sampling_frequency, frame_length=0.020, frame_stride=0.01,
         num_filters=40, fft_length=512, low_frequency=0,
         high_frequency=None):
    """Compute log Mel-filterbank energy features from an audio signal.

    Args:
        signal (array): the audio signal from which to compute features.
            Should be an N x 1 array.
        sampling_frequency (int): the sampling frequency of the signal
            we are working with.
        frame_length (float): the length of each frame in seconds.
            Default is 0.020s.
        frame_stride (float): the step between successive frames in
            seconds. Default is 0.01s.
        num_filters (int): the number of filters in the filterbank,
            default 40.
        fft_length (int): number of FFT points. Default is 512.
        low_frequency (float): lowest band edge of mel filters, in Hz.
            Default is 0.
        high_frequency (float): highest band edge of mel filters, in Hz.
            Default is samplerate/2.

    Returns:
        array: The log energy of the filterbank, of size
            num_frames x num_filters. Only this matrix is returned; the
            per-frame energies computed by ``mfe`` are discarded.
    """
    # ``mfe`` also returns the per-frame energies, which are not needed here.
    features, _ = mfe(
        signal,
        sampling_frequency=sampling_frequency,
        frame_length=frame_length,
        frame_stride=frame_stride,
        num_filters=num_filters,
        fft_length=fft_length,
        low_frequency=low_frequency,
        high_frequency=high_frequency)
    return np.log(features)
def extract_derivative_feature(feature):
    """Extract the first and second temporal derivative features.

    Args:
        feature (array): The feature vector, of size N x M.

    Return:
          array: The feature cube which contains the static, first and
              second derivative features, of size N x M x 3.
    """
    delta = processing.derivative_extraction(feature, DeltaWindows=2)
    # The second derivative is the derivative of the first one.
    delta_delta = processing.derivative_extraction(delta, DeltaWindows=2)

    # Creating the feature cube: give every layer a trailing axis and
    # join static, first and second derivative along it.
    layers = tuple(
        layer[:, :, np.newaxis] for layer in (feature, delta, delta_delta))
    return np.concatenate(layers, axis=2)