Automatically cut audio books
You can select up to 25 topics. Topics must start with a letter or number, can include dashes ('-'), and be at most 35 characters long.

244 行
6.9KB

  1. import librosa
  2. import librosa.display
  3. import numpy as np
  4. import matplotlib.pyplot as plt
  5. import soundfile as sf
  6. from pydub import AudioSegment
  7. from pydub.silence import split_on_silence, detect_nonsilent
  8. import math
  9. import wave
  10. import contextlib
  11. import webrtcvad
  12. def calc_dtw_sim(y1, y2, sr1, sr2, plot_result=False):
  13. hop_length = 64
  14. l = min(len(y1), len(y2))
  15. to_consider = min(l, max(round(0.2*l), 2048))
  16. bound = round(0.2 * l)
  17. y1 = y1[0:round(0.2*l)]
  18. y2 = y2[0:round(0.2*l)]
  19. if bound < 2048:
  20. n_fft = 512
  21. n_mels = 64
  22. else:
  23. n_fft = 2048
  24. n_mels = 128
  25. mfcc1 = librosa.feature.mfcc(y=y1, sr=sr1, hop_length=hop_length, n_mfcc=42, n_fft=n_fft, n_mels=n_mels)[1:,:]
  26. mfcc2 = librosa.feature.mfcc(y=y2, sr=sr2, hop_length=hop_length, n_mfcc=42, n_fft=n_fft, n_mels=n_mels)[1:,:]
  27. D, wp = librosa.sequence.dtw(mfcc1, mfcc2)
  28. if plot_result:
  29. fig, ax = plt.subplots(nrows=4)
  30. img = librosa.display.specshow(D, x_axis='frames', y_axis='frames',
  31. ax=ax[0])
  32. ax[0].set(title='DTW cost', xlabel='Noisy sequence', ylabel='Target')
  33. ax[0].plot(wp[:, 1], wp[:, 0], label='Optimal path', color='y')
  34. ax[0].legend()
  35. fig.colorbar(img, ax=ax[0])
  36. ax[1].plot(D[-1, :] / wp.shape[0])
  37. ax[1].set(xlim=[0, mfcc1.shape[1]],
  38. title='Matching cost function')
  39. ax[2].imshow(mfcc1)
  40. ax[3].imshow(mfcc2)
  41. plt.show()
  42. total_alignment_cost = D[-1, -1] / wp.shape[0]
  43. return total_alignment_cost
  44. def calc_xcorr_sim(y1, y2, sr1, sr2):
  45. hop_length = 256
  46. y1 = y1[0:round(len(y1)*0.2)]
  47. y2 = y2[0:round(len(y2)*0.2)]
  48. mfcc1 = librosa.feature.mfcc(y=y1, sr=sr1, hop_length=hop_length, n_mfcc=13)[1:,:]
  49. mfcc2 = librosa.feature.mfcc(y=y2, sr=sr2, hop_length=hop_length, n_mfcc=13)[1:,:]
  50. xsim = librosa.segment.cross_similarity(mfcc1, mfcc2, mode='distance')
  51. return xsim
  52. def match_target_amplitude(aChunk, target_dBFS):
  53. ''' Normalize given audio chunk '''
  54. change_in_dBFS = target_dBFS - aChunk.dBFS
  55. return aChunk.apply_gain(change_in_dBFS)
  56. def spl_on_silence():
  57. # Import the AudioSegment class for processing audio and the
  58. # Load your audio.
  59. song = AudioSegment.from_wav("recording.wav")
  60. # Split track where the silence is 2 seconds or more and get chunks using
  61. # the imported function.
  62. chunks = split_on_silence (
  63. # Use the loaded audio.
  64. song,
  65. # Specify that a silent chunk must be at least 2 seconds or 2000 ms long.
  66. min_silence_len = 1000,
  67. # Consider a chunk silent if it's quieter than -16 dBFS.
  68. # (You may want to adjust this parameter.)
  69. silence_thresh = -50,
  70. timestamps=True
  71. )
  72. ## Process each chunk with your parameters
  73. #for i, chunk in enumerate(chunks):
  74. # # Create a silence chunk that's 0.5 seconds (or 500 ms) long for padding.
  75. # silence_chunk = AudioSegment.silent(duration=500)
  76. # # Add the padding chunk to beginning and end of the entire chunk.
  77. # audio_chunk = silence_chunk + chunk + silence_chunk
  78. # # Normalize the entire chunk.
  79. # normalized_chunk = match_target_amplitude(audio_chunk, -20.0)
  80. # # Export the audio chunk with new bitrate.
  81. # print("Exporting chunk{0}.mp3.".format(i))
  82. # normalized_chunk.export(
  83. # ".//chunk{0}.wav".format(i),
  84. # bitrate = "192k",
  85. # format = "wav"
  86. # )
  87. return ([ audiosegment_to_librosawav(c) for c in chunks ], song.frame_rate)
  88. def non_silent_chunks(song):
  89. #song = AudioSegment.from_wav("recording.wav")
  90. return detect_nonsilent(song, min_silence_len=400, silence_thresh=-50)
  91. def audiosegment_to_librosawav(audiosegment):
  92. channel_sounds = audiosegment.split_to_mono()
  93. samples = [s.get_array_of_samples() for s in channel_sounds]
  94. fp_arr = np.array(samples).T.astype(np.float32)
  95. fp_arr /= np.iinfo(samples[0].typecode).max
  96. fp_arr = fp_arr.reshape(-1)
  97. return fp_arr
  98. # sr = samples / second
  99. def millisecond_to_samples(ms, sr):
  100. return round((ms / 1000) * sr)
  101. def ms_to_time(ms):
  102. secs = ms / 1000
  103. return "{0}:{1}".format(math.floor(secs / 60), secs % 60)
  104. def seg_is_speech(seg):
  105. f = lambda x: int(32768 * x)
  106. x = np.vectorize(f)(seg)
  107. pcm_data = x.tobytes()
  108. speeches = 0
  109. total = 0
  110. offset = 0
  111. n = int(sr * (frame_duration_ms / 1000.0) * 2)
  112. duration = (float(n) / sr) / 2.0
  113. while offset + n < len(pcm_data):
  114. frame = pcm_data[offset:(offset+n)]
  115. if vad.is_speech(frame, sr):
  116. speeches += 1
  117. offset = offset + n
  118. total += 1
  119. return speeches / total
if __name__ == '__main__':
    # Experiment driver: find non-silent spans in one recording, then score
    # consecutive spans against each other with DTW (similar openings of
    # repeated takes should produce low alignment cost).
    vad = webrtcvad.Vad()
    # VAD frame length in ms — read as a global by seg_is_speech().
    frame_duration_ms = 10
    fp = "hard_piece_2.wav"
    # 32 kHz mono: one of the sample rates webrtcvad accepts (8/16/32/48 kHz).
    y, sr = librosa.load(fp, mono=True, sr=32000)
    #pcm_data = y.tobytes()
    #n = int(sample_rate * (frame_duration_ms / 1000.0) * 2)
    #duration = (float(n) / sample_rate) / 2.0
    #frame = pcm_data[0:n]
    #y, sr = librosa.load("recording.wav")
    # Load the same file again with pydub for silence-timestamp detection.
    song = AudioSegment.from_wav(fp)
    #print("pydub load done")
    #with contextlib.closing(wave.open(fp, "rb")) as wf:
    #    num_channels = wf.getnchannels()
    #    assert num_channels == 1
    #    sample_width = wf.getsampwidth()
    #    assert sample_width == 2
    #    sample_rate = wf.getframerate()
    #    assert sample_rate in (8000, 16000, 32000, 48000)
    #    pcm_data = wf.readframes(wf.getnframes())
    #    n = int(sample_rate * (frame_duration_ms / 1000.0) * 2)
    #    duration = (float(n) / sample_rate) / 2.0
    #    frame = pcm_data[0:n]
    #    #print(len(pcm_data))
    #    print(vad.is_speech(frame, sample_rate))
    #y2 = audiosegment_to_librosawav(song)
    #print(y)
    #print(y2)
    #segs = librosa.effects.split(y, top_db = 5, hop_length=512, frame_length=4096)
    #segs, sr = spl_on_silence()
    #print("librosa load done")
    segs = []
    # Collect ((start_ms, end_ms), sample_slice) for every non-silent span,
    # converting pydub's millisecond timestamps into librosa sample indices.
    for ts in non_silent_chunks(song):
        start, end = ts[0], ts[1]
        seg = y[ millisecond_to_samples(start, sr) : millisecond_to_samples(end, sr) ]
        segs.append(((start, end), seg))
    # Score each segment against its immediate successor.
    for i in range(len(segs)-1):
        (s1, e1), y1 = segs[i]
        (s2, e2), y2 = segs[i+1]
        # Lower DTW alignment cost means the two openings sound more alike.
        diff = calc_dtw_sim(y1, y2, sr, sr, plot_result=False)
        # NOTE(review): diff and vad_coeff are computed but only consumed by
        # the commented-out diagnostics below.
        vad_coeff = seg_is_speech(y1)
        #if diff < 100:
        #print("{0}\t{1}\tdiff: {2}, vad: {3}".format(s1/1000, e1/1000, diff, vad_coeff))
        #print(ms_to_time(s1), ms_to_time(e1), ms_to_time(s2), ms_to_time(e2), diff)
        #if vad_coeff < 0.9:
        #print("{0}\t{1}\tvad {2}".format(s1/1000, e1/1000, vad_coeff))
    #for n, seg in enumerate(segs):
    #    sf.write('part' + str(n) + '.wav', seg, sr)
    #print(segs)
    #y1, sr1 = librosa.load("out000.wav")
    #y2, sr2 = librosa.load("out004.wav")
    #print("total alignment cost:", calc_dtw_sim(y1, y2, sr1, sr2, plot_result=True))
    #print("xcorr:", np.trace(calc_xcorr_sim(y1, y2, sr1, sr2)))