使用 pyDub 将增益应用于特定频率

Question

我想增加 wav 文件中特定频率的音量，使它们比其余频率更响亮（更容易听到）。

我完全没有音频处理经验，在开发代码时学习基础知识......

到目前为止，我所做的（至少我认为是这样）是找出想要调高的特定频率；但我卡在了增加其增益这一步：处理之后，音频中多出了烦人的嗡嗡声。

这是代码：

import array
from pydub import AudioSegment
from pydub.generators import Sine
import numpy as np
from scipy.fft import fft
import sezone.fmanagement as file_management
import math

def find_frequencies_in_segment(segment: AudioSegment) -> (array, tuple):
    """Compute the discrete Fourier transform of a segment's samples.

    Args:
        segment: Audio whose ``get_array_of_samples()`` values are transformed
            and whose ``frame_rate`` determines the spacing of the bins.

    Returns:
        A ``(freqs, fft_result)`` pair: the frequency of every FFT bin, in the
        layout produced by ``np.fft.fftfreq`` (negative frequencies included),
        and the complex FFT output. Both are index-aligned with each other.
    """
    samples = np.array(segment.get_array_of_samples())
    spectrum = fft(samples)

    # The spacing between two samples is one frame period, so the bin
    # frequencies come out in cycles per second when frame_rate is per second.
    sample_period = 1/segment.frame_rate
    bin_frequencies = np.fft.fftfreq(len(spectrum), d=sample_period)

    return (bin_frequencies, spectrum)

def create_audio_sample() -> AudioSegment:
    """Build the test clip: a low sine tone followed by a shorter, higher one.

    Returns:
        The two tones joined back to back, with 20 subtracted from the result
        (pydub's volume-reduction operator) so that a later boost is easier
        to notice.
    """
    # Tone frequencies are in Hz; durations are passed straight to pydub.
    low_hz, low_duration = 60, 500
    medium_hz, medium_duration = 100, 125

    low_tone: AudioSegment = Sine(low_hz).to_audio_segment(duration=low_duration)
    medium_tone: AudioSegment = Sine(medium_hz).to_audio_segment(duration=medium_duration)

    # Deliberately quiet, so any volume difference after processing stands out.
    quiet_clip: AudioSegment = (low_tone + medium_tone) - 20

    return quiet_clip

def calc_scale_factor(original_sample: AudioSegment, fft_result: array) -> float:
    """Return the ratio between the largest sample and the largest FFT value.

    Args:
        original_sample: Audio whose raw samples supply the numerator.
        fft_result: Values whose maximum supplies the denominator.

    Returns:
        ``max(samples) / max(fft_result)``.

    NOTE(review): the largest *signed* sample is used rather than the largest
    absolute value, and a zero maximum in ``fft_result`` is not guarded
    against -- confirm both are intended.
    """
    peak_sample = np.asarray(original_sample.get_array_of_samples()).max()
    peak_fft = np.asarray(fft_result).max()

    return peak_sample / peak_fft

def find_indexes_with_bandwidth(samples: array, amplitude: int, bandwith: int) -> np.ndarray:
    """Find the positions of samples lying within ``bandwith`` of ``amplitude``.

    Args:
        samples: Sequence of sample values to scan.
        amplitude: Target value each sample is compared against.
        bandwith: Tolerance; a sample matches when
            ``sample - bandwith <= amplitude <= sample + bandwith``.

    Returns:
        The matching positions in ascending order. The value is a plain
        ``list`` even though the annotation says ``np.ndarray``.
    """
    return [
        position
        for position, sample in enumerate(samples)
        if sample - bandwith <= amplitude <= sample + bandwith
    ]

def original_sample_index_for_frequency(audio: AudioSegment, frequency: int) -> array:
    """Pick the sample positions that are treated as carrying ``frequency``.

    The spectral magnitude of the first FFT bin whose frequency floors to
    ``frequency`` is rescaled with ``calc_scale_factor`` and then matched
    (within a tolerance of 2) against the raw sample values.

    Args:
        audio: Audio to analyse.
        frequency: Frequency to look for; compared with each floored bin
            frequency.

    Returns:
        Positions in ``audio.get_array_of_samples()`` whose value is close to
        the rescaled magnitude.

    Raises:
        IndexError: If no FFT bin floors to ``frequency``.

    NOTE(review): an FFT bin summarises the whole clip rather than individual
    samples, so selecting samples by amplitude may not isolate the requested
    frequency -- confirm this is the intended approach.
    """
    bin_frequencies, spectrum = find_frequencies_in_segment(audio)

    # Spectral magnitude of every bin.
    magnitudes = np.abs(spectrum)

    # Bins whose frequency rounds down to the requested value.
    matching_bins = [
        position
        for position, bin_frequency in enumerate(bin_frequencies)
        if math.floor(bin_frequency) == frequency
    ]

    # Magnitude of the first matching bin, mapped onto the sample value scale.
    target_magnitude = magnitudes[matching_bins[0]]
    estimated_amplitude = target_magnitude * calc_scale_factor(audio, magnitudes)

    # Sample positions whose value is within +/-2 of that estimate.
    return find_indexes_with_bandwidth(audio.get_array_of_samples(), estimated_amplitude, 2)

def apply_gain_pydub(audio: AudioSegment, indexes: array) -> AudioSegment:
    """Boost the samples at ``indexes`` by 20 dB using pydub slicing.

    The clip is rebuilt piece by piece: untouched runs are copied as they
    are, and each selected sample is passed through ``apply_gain(20)``.
    The result is also exported to ``../output/new_audio_pydub.wav``.

    Args:
        audio: Source audio; it is not modified.
        indexes: Sample positions to boost. Assumed to be in ascending order
            and to address a mono clip (sample index == frame index) --
            TODO confirm against callers.

    Returns:
        The rebuilt audio, with the same number of samples as ``audio``.

    NOTE(review): boosting isolated samples changes the waveform shape rather
    than one frequency band, which is a likely cause of the buzzing noted in
    ``main`` -- an equaliser/band filter is the usual tool for this.
    """
    total_samples = len(audio.get_array_of_samples())
    original_order = audio.get_sample_slice(0, 0)
    processed_index = -1

    for i in indexes:
        # Untouched run between the previous boosted sample and this one.
        # get_sample_slice excludes its end bound, so the run must end at
        # ``i``; ending at ``i - 1`` silently dropped sample ``i - 1``.
        original_order += audio.get_sample_slice(processed_index + 1, i)

        # The single sample to boost.
        original_order += audio.get_sample_slice(i, i + 1).apply_gain(20)
        processed_index = i

    # Remaining tail; the exclusive end bound means the full length is needed
    # here, otherwise the last sample is lost.
    original_order += audio.get_sample_slice(processed_index + 1, total_samples)

    original_order.export("../output/new_audio_pydub.wav", format="wav")

    return original_order

def apply_gain_manually(audio: AudioSegment, indexes: array) -> AudioSegment:
    """Boost the samples at ``indexes`` by 20 dB by scaling the raw values.

    Boosted values are clamped to the signed range of the clip's sample
    width, so a loud sample clips instead of raising ``OverflowError`` when
    written back. The result is also exported to
    ``../output/new_audio_manually.wav``.

    Args:
        audio: Source audio; it is not modified (a copy of its samples is
            edited).
        indexes: Positions in ``audio.get_array_of_samples()`` to boost.

    Returns:
        A new segment with the same frame rate, sample width and channel
        count as ``audio``.

    NOTE(review): boosting isolated samples changes the waveform shape rather
    than one frequency band, which is a likely cause of the buzzing noted in
    ``main`` -- confirm the intended approach.
    """
    original_samples = audio.get_array_of_samples()

    # 20 dB expressed as an amplitude ratio; constant, so computed once.
    gain = 10**(20 / 20.0)

    # Signed integer limits for this sample width (assumes pydub's signed
    # sample arrays -- TODO confirm for 8-bit input).
    full_scale = 2 ** (8 * audio.sample_width - 1)
    lowest, highest = -full_scale, full_scale - 1

    for i in indexes:
        boosted = int(original_samples[i] * gain)
        original_samples[i] = max(lowest, min(highest, boosted))

    # Reuse the source's own format instead of assuming 16-bit mono.
    reconstructed_audio = AudioSegment(original_samples.tobytes(),
                                       frame_rate=audio.frame_rate,
                                       sample_width=audio.sample_width,
                                       channels=audio.channels)

    reconstructed_audio.export("../output/new_audio_manually.wav", format="wav")

    return reconstructed_audio

def main():
    """Generate the sample clip, boost it both ways and print the peak values.

    Writes ``../output/audio_sample.wav`` itself; the two boosting helpers
    write their own output files.
    """
    sample = create_audio_sample()
    sample.export("../output/audio_sample.wav", format="wav")

    # Sample positions selected for the frequency to boost (60 Hz here).
    target_indexes = original_sample_index_for_frequency(sample, 60)

    # Same boost done two different ways, to compare the results.
    boosted_pydub = apply_gain_pydub(sample, target_indexes)
    boosted_manually = apply_gain_manually(sample, target_indexes)

    # Peak values, to check that nothing has been clipped.
    print("maximum possible value: ", (2 ** 15) - 1)# 16 bits
    for label, segment in (
        ("audio max: ", sample),
        ("audio_pydub max: ", boosted_pydub),
        ("audio_manually max: ", boosted_manually),
    ):
        print(label, np.array(segment.get_array_of_samples()).max())

    # Observed result: both processed clips have a buzzing sound.

if __name__ == "__main__":
    # Run the demo only when executed as a script, padding the console
    # output with blank lines before and after.
    print("\n")
    main()
    print("\n")

我想了解：

我做错了什么？为什么处理后的音频中会出现嗡嗡声？
按照上面的逻辑是否可以达到我想要的效果？
调整特定频率的正确方法是什么？

由于我仍在学习音频处理的基础知识，任何背景或上下文信息都会有所帮助。

提前致谢

Answer 1

您可能需要修改

apply_gain_pydub

和

apply_gain_manually

函数，尤其是增益应用于已识别索引的部分。我建议您可以应用基于频率分量幅度的增益，而不是应用固定增益。您可以这样做：

from pydub.utils import ratio_to_db, db_to_float
gain_adjustment = 20 # Offset added to the computed amplitude_db before gain is applied; apply_gain_manually also divides by it when converting to a multiplier. Tune as needed.

def apply_gain_pydub(audio: AudioSegment, indexes: array) -> AudioSegment:
    """Boost the samples at ``indexes`` by up to ``gain_adjustment`` dB.

    Each selected sample receives ``gain_adjustment`` dB, reduced where
    necessary to the headroom left between that sample's amplitude and full
    scale, so the boost depends on the sample's amplitude and never clips.
    The result is also exported to ``../output/new_audio_pydub.wav``.

    This replaces ``ratio_to_db(db_to_float(max_possible_amplitude))``: that
    round trip is an identity that raises 10 to the power of 32768/20 (a float
    overflow for 16-bit audio), and adding a raw amplitude to a decibel value
    does not yield a usable gain.

    Args:
        audio: Source audio; it is not modified.
        indexes: Sample positions to boost. Assumed to be in ascending order
            and to address a mono clip -- TODO confirm against callers.

    Returns:
        The rebuilt audio, with the same number of samples as ``audio``.
    """
    total_samples = len(audio.get_array_of_samples())
    original_order = audio.get_sample_slice(0, 0)
    processed_index = -1

    for i in indexes:
        # get_sample_slice excludes its end bound, so the untouched run must
        # end at ``i``; ending at ``i - 1`` dropped one sample per iteration.
        original_order += audio.get_sample_slice(processed_index + 1, i)
        audio_to_change = audio.get_sample_slice(i, i + 1)

        # Gain based on the amplitude: never more than the remaining headroom.
        peak = audio_to_change.max
        if peak:
            headroom_db = ratio_to_db(audio_to_change.max_possible_amplitude / peak)
            gain = min(gain_adjustment, headroom_db)
        else:
            # A silent sample has unlimited headroom and stays silent anyway.
            gain = gain_adjustment

        original_order += audio_to_change.apply_gain(gain)
        processed_index = i

    # The exclusive end bound means the full length is needed for the tail.
    original_order += audio.get_sample_slice(processed_index + 1, total_samples)

    original_order.export("../output/new_audio_pydub.wav", format="wav")

    return original_order

def apply_gain_manually(audio: AudioSegment, indexes: array) -> AudioSegment:
    """Boost the samples at ``indexes`` by up to ``gain_adjustment`` dB.

    Each selected sample is multiplied by the amplitude ratio equivalent to
    ``gain_adjustment`` dB, reduced where necessary so the result stays
    inside the signed range of the clip's sample width. The result is also
    exported to ``../output/new_audio_manually.wav``.

    This replaces a computation that added the raw sample amplitude to a
    decibel value and divided by ``gain_adjustment`` instead of the constant
    20 when converting decibels to a ratio; the resulting multiplier
    overflowed the sample array for ordinary input.

    Args:
        audio: Source audio; it is not modified (a copy of its samples is
            edited).
        indexes: Positions in ``audio.get_array_of_samples()`` to boost.

    Returns:
        A new segment with the same frame rate, sample width and channel
        count as ``audio``.
    """
    original_samples = audio.get_array_of_samples()

    # Largest positive value for this sample width (assumes pydub's signed
    # sample arrays -- TODO confirm for 8-bit input).
    highest = 2 ** (8 * audio.sample_width - 1) - 1

    # dB -> amplitude ratio always divides by 20, whatever the gain is.
    requested_multiplier = 10 ** (gain_adjustment / 20.0)

    for i in indexes:
        sample = original_samples[i]
        if sample == 0:
            # Silence stays silent whatever the gain.
            continue

        # Gain based on the amplitude: never more than the remaining headroom.
        headroom = highest / abs(sample)
        gain_multiplier = min(requested_multiplier, headroom)
        original_samples[i] = int(sample * gain_multiplier)

    # Reuse the source's own format instead of assuming 16-bit mono.
    reconstructed_audio = AudioSegment(original_samples.tobytes(), frame_rate=audio.frame_rate, sample_width=audio.sample_width, channels=audio.channels)
    reconstructed_audio.export("../output/new_audio_manually.wav", format="wav")

    return reconstructed_audio

附注：我参照自己的一段代码修改了您的这部分代码，请看看是否适合您。

使用 pyDub 将增益应用于特定频率

问题描述投票：0回答：1

1个回答

最新问题

使用 pyDub 将增益应用于特定频率

问题描述 投票：0回答：1

1个回答

最新问题

问题描述投票：0回答：1