Automatically cut audio books
You may select up to 25 topics. Topics must start with a letter or number, may contain hyphens (-), and must be no longer than 35 characters.

308 lines
9.1KB

  1. import librosa
  2. import librosa.display
  3. import numpy as np
  4. import matplotlib.pyplot as plt
  5. import soundfile as sf
  6. from pydub import AudioSegment
  7. from pydub.silence import split_on_silence, detect_nonsilent
  8. import math
  9. import wave
  10. import contextlib
  11. import random
  12. from mpl_toolkits.axes_grid1.axes_divider import VBoxDivider
  13. import mpl_toolkits.axes_grid1.axes_size as Size
  14. import webrtcvad
  15. def calc_dtw_sim(y1, y2, sr1, sr2, plot_result=False):
  16. hop_length = 64
  17. assert sr1 == sr2
  18. l = min(len(y1), len(y2))
  19. to_consider = min(l, max(round(0.2*l), 2048))
  20. min_len = millisecond_to_samples(100, sr1)
  21. bound = round(0.5 * l)
  22. if bound < min_len:
  23. bound = min_len
  24. #bound = max(round(0.2 * l), millisecond_to_samples(200, sr1))
  25. y1 = y1[0:bound]
  26. y2 = y2[0:bound]
  27. if bound < 2048:
  28. n_fft = bound
  29. n_mels = 64
  30. else:
  31. n_fft = 2048
  32. n_mels = 128
  33. mfcc1 = librosa.feature.mfcc(y=y1, sr=sr1, hop_length=hop_length, n_mfcc=42, n_fft=n_fft, n_mels=n_mels)[1:,:]
  34. mfcc2 = librosa.feature.mfcc(y=y2, sr=sr2, hop_length=hop_length, n_mfcc=42, n_fft=n_fft, n_mels=n_mels)[1:,:]
  35. D, wp = librosa.sequence.dtw(mfcc1, mfcc2)
  36. if plot_result:
  37. fig, ax = plt.subplots(nrows=4)
  38. img = librosa.display.specshow(D, x_axis='frames', y_axis='frames',
  39. ax=ax[0])
  40. ax[0].set(title='DTW cost', xlabel='Noisy sequence', ylabel='Target')
  41. ax[0].plot(wp[:, 1], wp[:, 0], label='Optimal path', color='y')
  42. ax[0].legend()
  43. fig.colorbar(img, ax=ax[0])
  44. ax[1].plot(D[-1, :] / wp.shape[0])
  45. ax[1].set(xlim=[0, mfcc1.shape[1]],
  46. title='Matching cost function')
  47. ax[2].imshow(mfcc1)
  48. ax[3].imshow(mfcc2)
  49. plt.show()
  50. total_alignment_cost = D[-1, -1] / wp.shape[0]
  51. return total_alignment_cost
  52. def calc_xcorr_sim(y1, y2, sr1, sr2):
  53. hop_length = 256
  54. y1 = y1[0:round(len(y1)*0.2)]
  55. y2 = y2[0:round(len(y2)*0.2)]
  56. mfcc1 = librosa.feature.mfcc(y=y1, sr=sr1, hop_length=hop_length, n_mfcc=13)[1:,:]
  57. mfcc2 = librosa.feature.mfcc(y=y2, sr=sr2, hop_length=hop_length, n_mfcc=13)[1:,:]
  58. xsim = librosa.segment.cross_similarity(mfcc1, mfcc2, mode='distance')
  59. return xsim
  60. def match_target_amplitude(aChunk, target_dBFS):
  61. ''' Normalize given audio chunk '''
  62. change_in_dBFS = target_dBFS - aChunk.dBFS
  63. return aChunk.apply_gain(change_in_dBFS)
  64. def spl_on_silence():
  65. # Import the AudioSegment class for processing audio and the
  66. # Load your audio.
  67. song = AudioSegment.from_wav("recording.wav")
  68. # Split track where the silence is 2 seconds or more and get chunks using
  69. # the imported function.
  70. chunks = split_on_silence (
  71. # Use the loaded audio.
  72. song,
  73. # Specify that a silent chunk must be at least 2 seconds or 2000 ms long.
  74. min_silence_len = 1000,
  75. # Consider a chunk silent if it's quieter than -16 dBFS.
  76. # (You may want to adjust this parameter.)
  77. silence_thresh = -50,
  78. timestamps=True
  79. )
  80. ## Process each chunk with your parameters
  81. #for i, chunk in enumerate(chunks):
  82. # # Create a silence chunk that's 0.5 seconds (or 500 ms) long for padding.
  83. # silence_chunk = AudioSegment.silent(duration=500)
  84. # # Add the padding chunk to beginning and end of the entire chunk.
  85. # audio_chunk = silence_chunk + chunk + silence_chunk
  86. # # Normalize the entire chunk.
  87. # normalized_chunk = match_target_amplitude(audio_chunk, -20.0)
  88. # # Export the audio chunk with new bitrate.
  89. # print("Exporting chunk{0}.mp3.".format(i))
  90. # normalized_chunk.export(
  91. # ".//chunk{0}.wav".format(i),
  92. # bitrate = "192k",
  93. # format = "wav"
  94. # )
  95. return ([ audiosegment_to_librosawav(c) for c in chunks ], song.frame_rate)
  96. def non_silent_chunks(song):
  97. #song = AudioSegment.from_wav("recording.wav")
  98. return detect_nonsilent(song, min_silence_len=10, silence_thresh=-50)
  99. def audiosegment_to_librosawav(audiosegment):
  100. channel_sounds = audiosegment.split_to_mono()
  101. samples = [s.get_array_of_samples() for s in channel_sounds]
  102. fp_arr = np.array(samples).T.astype(np.float32)
  103. fp_arr /= np.iinfo(samples[0].typecode).max
  104. fp_arr = fp_arr.reshape(-1)
  105. return fp_arr
  106. # sr = samples / second
  107. def millisecond_to_samples(ms, sr):
  108. return round((ms / 1000) * sr)
  109. def ms_to_time(ms):
  110. secs = ms / 1000
  111. return "{0}:{1}".format(math.floor(secs / 60), secs % 60)
# NOTE(review): this function reads module-level globals `sr`,
# `frame_duration_ms` and `vad` that are only bound inside the
# __main__ block below — it will NameError if called on import.
def seg_is_speech(seg):
    """Intended to return the fraction of VAD frames judged as speech.

    The computed ratio is commented out, so the function currently
    always returns 1.0 — callers effectively treat every segment as
    speech.
    """
    # Scale float samples toward the 16-bit integer range.
    # NOTE(review): np.vectorize(int) yields a platform integer dtype
    # (typically int64), so .tobytes() emits 8 bytes per sample rather
    # than the 2-byte PCM webrtcvad expects; the frame byte-length stays
    # a valid 16-bit size, but the payload alignment looks wrong —
    # confirm before re-enabling the speeches/total return below.
    f = lambda x: int(32768 * x)
    x = np.vectorize(f)(seg)
    pcm_data = x.tobytes()
    speeches = 0
    total = 0
    offset = 0
    # Frame size in bytes for 16-bit mono PCM at `sr`.
    n = int(sr * (frame_duration_ms / 1000.0) * 2)
    duration = (float(n) / sr) / 2.0  # frame duration in seconds (unused)
    while offset + n < len(pcm_data):
        frame = pcm_data[offset:(offset+n)]
        if vad.is_speech(frame, sr):
            speeches += 1
        offset = offset + n
        total += 1
    #return speeches / total
    return 1.0
  129. def make_widths_equal(fig, rect, ax1, ax2, ax3, pad):
  130. # pad in inches
  131. divider = VBoxDivider(
  132. fig, rect,
  133. horizontal=[Size.AxesX(ax1), Size.Scaled(1), Size.AxesX(ax2), Size.Scaled(1), Size.AxesX(ax3)],
  134. vertical=[Size.AxesY(ax1), Size.Fixed(pad), Size.AxesY(ax2), Size.Fixed(pad), Size.AxesY(ax3)])
  135. ax1.set_axes_locator(divider.new_locator(0))
  136. ax2.set_axes_locator(divider.new_locator(2))
  137. ax3.set_axes_locator(divider.new_locator(4))
if __name__ == '__main__':
    # NOTE: vad, frame_duration_ms and sr become module-level globals
    # that seg_is_speech() above reads.
    vad = webrtcvad.Vad()
    frame_duration_ms = 10
    fp = "hard_piece_7.wav"
    # Force 32 kHz mono; webrtcvad only accepts 8/16/32/48 kHz input.
    y, sr = librosa.load(fp, mono=True, sr=32000)
    #pcm_data = y.tobytes()
    #n = int(sample_rate * (frame_duration_ms / 1000.0) * 2)
    #duration = (float(n) / sample_rate) / 2.0
    #frame = pcm_data[0:n]
    #y, sr = librosa.load("recording.wav")
    # The same file is also loaded with pydub for its silence detection.
    song = AudioSegment.from_wav(fp)
    #print("pydub load done")
    # Earlier wave/webrtcvad experiment, kept for reference:
    #with contextlib.closing(wave.open(fp, "rb")) as wf:
    # num_channels = wf.getnchannels()
    # assert num_channels == 1
    # sample_width = wf.getsampwidth()
    # assert sample_width == 2
    # sample_rate = wf.getframerate()
    # assert sample_rate in (8000, 16000, 32000, 48000)
    # pcm_data = wf.readframes(wf.getnframes())
    # n = int(sample_rate * (frame_duration_ms / 1000.0) * 2)
    # duration = (float(n) / sample_rate) / 2.0
    # frame = pcm_data[0:n]
    # #print(len(pcm_data))
    # print(vad.is_speech(frame, sample_rate))
    #y2 = audiosegment_to_librosawav(song)
    #print(y)
    #print(y2)
    #segs = librosa.effects.split(y, top_db = 5, hop_length=512, frame_length=4096)
    #segs, sr = spl_on_silence()
    #print("librosa load done")
    # Build ((start_ms, end_ms), samples) pairs for every non-silent
    # range pydub detected, slicing the librosa-loaded signal.
    segs = []
    #i = 0
    for ts in non_silent_chunks(song):
        start, end = ts[0], ts[1]
        seg = y[ millisecond_to_samples(start, sr) : millisecond_to_samples(end, sr) ]
        segs.append(((start, end), seg))
        #sf.write("part{0}.wav".format(i), seg, sr, 'PCM_16')
        #i += 1
    #segs = segs[1:]
    n_segs = len(segs)
    #random.shuffle(segs)
    # Pairwise DTW dissimilarity between all segments (symmetric).
    diffs = np.zeros((n_segs, n_segs))
    diffs_penalised = np.zeros((n_segs, n_segs))
    vad_coeffs = np.zeros((n_segs,))
    lengths = np.zeros((n_segs,))
    for i in range(n_segs):
        (s1, e1), y1 = segs[i]
        for j in range(i):
            (s2, e2), y2 = segs[j]
            diffs[i,j] = calc_dtw_sim(y1, y2, sr, sr, plot_result=False)
            diffs[j,i] = diffs[i,j]
            # Penalise matches between segments far apart in time; the
            # exponent shrinks as segments get longer.
            # NOTE(review): intent of this formula is unclear — confirm.
            distance_penalty = abs(i-j)**(100/min((e1-s1), (e2-s2)))
            diffs_penalised[i,j] = diffs[i,j] * distance_penalty
            diffs_penalised[j,i] = diffs_penalised[i,j]
        vad_coeffs[i] = seg_is_speech(y1)
        lengths[i] = e1 - s1
    # Mark near-duplicate segments (DTW cost < 80) for deletion.
    delete_segs = np.zeros((n_segs,), dtype=bool)
    for i in range(n_segs):
        if delete_segs[i]:
            continue
        max_j = i
        for j in range(i, n_segs):
            if diffs[i,j] < 80:
            #if diffs_penalised[i,j] < 80:
                max_j = j
        # NOTE(review): max_j is the *last* j under the threshold (the
        # matched indices may not be contiguous), and the half-open
        # slice below keeps segment max_j itself — confirm this
        # "delete run, keep the last copy" behaviour is intended.
        delete_segs[i:max_j] = True
    # Report each segment's time range and deletion/VAD verdict.
    for i in range(n_segs):
        (s1, e1), y1 = segs[i]
        print("{0}\t{1}\tn: {2} delete: {3}, vad: {4}".format(s1/1000, e1/1000, i, delete_segs[i], vad_coeffs[i]))
        #if diff < 100:
        #print("{0}\t{1}\tdiff: {2}, vad: {3}".format(s1/1000, e1/1000, diff, vad_coeff))
        #print(ms_to_time(s1), ms_to_time(e1), ms_to_time(s2), ms_to_time(e2), diff)
        #if vad_coeff < 0.9:
        #print("{0}\t{1}\tvad {2}".format(s1/1000, e1/1000, vad_coeff))
    # Visualise the pairwise dissimilarity matrix.
    #fig, ax = plt.subplots(nrows=3, sharex=True)
    fig, ax = plt.subplots(nrows=1, sharex=True)
    ax.imshow(diffs)
    #ax[1].imshow(diffs_penalised)
    #ax[1].imshow(np.reshape(vad_coeffs, (1, n_segs)))
    #ax[2].imshow(np.reshape(lengths, (1, n_segs)))
    #make_widths_equal(fig, 111, ax[0], ax[1], ax[2], pad=0.5)
    plt.show()
    #for n, seg in enumerate(segs):
    # sf.write('part' + str(n) + '.wav', seg, sr)
    #print(segs)
    #y1, sr1 = librosa.load("out000.wav")
    #y2, sr2 = librosa.load("out004.wav")
    #print("total alignment cost:", calc_dtw_sim(y1, y2, sr1, sr2, plot_result=True))
    #print("xcorr:", np.trace(calc_xcorr_sim(y1, y2, sr1, sr2)))