Automatically cut audio books
(Page-extraction residue, translated from Turkish:) You cannot select more than 25 topics. Topics must start with a letter or number, can include hyphens ('-'), and can be up to 35 characters long.

351 lines
11KB

  1. import librosa
  2. import librosa.display
  3. import numpy as np
  4. import matplotlib.pyplot as plt
  5. import soundfile as sf
  6. from pydub import AudioSegment
  7. from pydub.silence import split_on_silence, detect_nonsilent
  8. import math
  9. import wave
  10. import contextlib
  11. import random
  12. from mpl_toolkits.axes_grid1.axes_divider import VBoxDivider
  13. import mpl_toolkits.axes_grid1.axes_size as Size
  14. import cv2
  15. import webrtcvad
  16. min_silence_len = 400
  17. def calc_dtw_sim(y1, y2, sr1, sr2, plot_result=False):
  18. hop_length = 64
  19. assert sr1 == sr2
  20. l = min(len(y1), len(y2))
  21. to_consider = min(l, max(round(0.2*l), 2048))
  22. min_len = millisecond_to_samples(100, sr1)
  23. bound = round(0.5 * l)
  24. if bound < min_len:
  25. bound = min_len
  26. #bound = max(round(0.2 * l), millisecond_to_samples(200, sr1))
  27. y1 = y1[0:bound]
  28. y2 = y2[0:bound]
  29. if bound < 2048:
  30. n_fft = bound
  31. n_mels = 64
  32. else:
  33. n_fft = 2048
  34. n_mels = 128
  35. mfcc1 = librosa.feature.mfcc(y=y1, sr=sr1, hop_length=hop_length, n_mfcc=42, n_fft=n_fft, n_mels=n_mels)[1:,:]
  36. mfcc2 = librosa.feature.mfcc(y=y2, sr=sr2, hop_length=hop_length, n_mfcc=42, n_fft=n_fft, n_mels=n_mels)[1:,:]
  37. D, wp = librosa.sequence.dtw(mfcc1, mfcc2)
  38. if plot_result:
  39. fig, ax = plt.subplots(nrows=4)
  40. img = librosa.display.specshow(D, x_axis='frames', y_axis='frames',
  41. ax=ax[0])
  42. ax[0].set(title='DTW cost', xlabel='Noisy sequence', ylabel='Target')
  43. ax[0].plot(wp[:, 1], wp[:, 0], label='Optimal path', color='y')
  44. ax[0].legend()
  45. fig.colorbar(img, ax=ax[0])
  46. ax[1].plot(D[-1, :] / wp.shape[0])
  47. ax[1].set(xlim=[0, mfcc1.shape[1]],
  48. title='Matching cost function')
  49. ax[2].imshow(mfcc1)
  50. ax[3].imshow(mfcc2)
  51. plt.show()
  52. total_alignment_cost = D[-1, -1] / wp.shape[0]
  53. return total_alignment_cost
  54. def calc_xcorr_sim(y1, y2, sr1, sr2):
  55. hop_length = 256
  56. y1 = y1[0:round(len(y1)*0.2)]
  57. y2 = y2[0:round(len(y2)*0.2)]
  58. mfcc1 = librosa.feature.mfcc(y=y1, sr=sr1, hop_length=hop_length, n_mfcc=13)[1:,:]
  59. mfcc2 = librosa.feature.mfcc(y=y2, sr=sr2, hop_length=hop_length, n_mfcc=13)[1:,:]
  60. xsim = librosa.segment.cross_similarity(mfcc1, mfcc2, mode='distance')
  61. return xsim
  62. def match_target_amplitude(aChunk, target_dBFS):
  63. ''' Normalize given audio chunk '''
  64. change_in_dBFS = target_dBFS - aChunk.dBFS
  65. return aChunk.apply_gain(change_in_dBFS)
  66. def spl_on_silence():
  67. # Import the AudioSegment class for processing audio and the
  68. # Load your audio.
  69. song = AudioSegment.from_wav("recording.wav")
  70. # Split track where the silence is 2 seconds or more and get chunks using
  71. # the imported function.
  72. chunks = split_on_silence (
  73. # Use the loaded audio.
  74. song,
  75. # Specify that a silent chunk must be at least 2 seconds or 2000 ms long.
  76. min_silence_len = 1000,
  77. # Consider a chunk silent if it's quieter than -16 dBFS.
  78. # (You may want to adjust this parameter.)
  79. silence_thresh = -50,
  80. timestamps=True
  81. )
  82. ## Process each chunk with your parameters
  83. #for i, chunk in enumerate(chunks):
  84. # # Create a silence chunk that's 0.5 seconds (or 500 ms) long for padding.
  85. # silence_chunk = AudioSegment.silent(duration=500)
  86. # # Add the padding chunk to beginning and end of the entire chunk.
  87. # audio_chunk = silence_chunk + chunk + silence_chunk
  88. # # Normalize the entire chunk.
  89. # normalized_chunk = match_target_amplitude(audio_chunk, -20.0)
  90. # # Export the audio chunk with new bitrate.
  91. # print("Exporting chunk{0}.mp3.".format(i))
  92. # normalized_chunk.export(
  93. # ".//chunk{0}.wav".format(i),
  94. # bitrate = "192k",
  95. # format = "wav"
  96. # )
  97. return ([ audiosegment_to_librosawav(c) for c in chunks ], song.frame_rate)
  98. def non_silent_chunks(song):
  99. #song = AudioSegment.from_wav("recording.wav")
  100. return detect_nonsilent(song, min_silence_len=min_silence_len, silence_thresh=-50)
  101. def audiosegment_to_librosawav(audiosegment):
  102. channel_sounds = audiosegment.split_to_mono()
  103. samples = [s.get_array_of_samples() for s in channel_sounds]
  104. fp_arr = np.array(samples).T.astype(np.float32)
  105. fp_arr /= np.iinfo(samples[0].typecode).max
  106. fp_arr = fp_arr.reshape(-1)
  107. return fp_arr
  108. # sr = samples / second
  109. def millisecond_to_samples(ms, sr):
  110. return round((ms / 1000) * sr)
  111. def samples_to_millisecond(samples, sr):
  112. return (samples / sr) * 1000
  113. def ms_to_time(ms):
  114. secs = ms / 1000
  115. return "{0}:{1}".format(math.floor(secs / 60), secs % 60)
def seg_is_speech(seg):
    """Check whether an audio segment contains speech, via WebRTC VAD.

    NOTE(review): effectively stubbed — the real speech ratio
    (speeches / total) is commented out and the function always
    returns 1.0.
    NOTE(review): reads module globals `sr`, `frame_duration_ms` and
    `vad`, which are only defined (commented out) in the __main__
    block; calling this as-is raises NameError.
    """
    # Scale float samples (presumably in [-1, 1] — confirm) to 16-bit ints.
    f = lambda x: int(32768 * x)
    x = np.vectorize(f)(seg)
    pcm_data = x.tobytes()
    speeches = 0
    total = 0
    offset = 0
    # Bytes per VAD frame: samples in frame_duration_ms * 2 bytes/sample.
    n = int(sr * (frame_duration_ms / 1000.0) * 2)
    # NOTE(review): `duration` is computed but never used.
    duration = (float(n) / sr) / 2.0
    # Walk the PCM buffer frame by frame, counting VAD-positive frames.
    while offset + n < len(pcm_data):
        frame = pcm_data[offset:(offset+n)]
        if vad.is_speech(frame, sr):
            speeches += 1
        offset = offset + n
        total += 1
    #return speeches / total
    return 1.0
  133. def calculate_best_offset(mfcc_ref, mfcc_seg, sr):
  134. return librosa.segment.cross_similarity(mfcc_seg, mfcc_ref, mode='affinity', metric='cosine')
  135. def detect_lines(img, duration_x, duration_y):
  136. #print(img.shape)
  137. #print(np.min(img), np.max(img))
  138. img = cv2.imread('affine_similarity.png')
  139. gray = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)
  140. kernel_size = 5
  141. blur_gray = cv2.GaussianBlur(gray, (kernel_size, kernel_size), 0)
  142. low_threshold = 50
  143. high_threshold = 150
  144. edges = cv2.Canny(blur_gray, low_threshold, high_threshold)
  145. rho = 1 # distance resolution in pixels of the Hough grid
  146. theta = np.pi / 180 # angular resolution in radians of the Hough grid
  147. threshold = 15 # minimum number of votes (intersections in Hough grid cell)
  148. min_line_length = 50 # minimum number of pixels making up a line
  149. max_line_gap = 20 # maximum gap in pixels between connectable line segments
  150. line_image = np.copy(img) * 0 # creating a blank to draw lines on
  151. # Run Hough on edge detected image
  152. # Output "lines" is an array containing endpoints of detected line segments
  153. lines = cv2.HoughLinesP(edges, rho, theta, threshold, np.array([]),
  154. min_line_length, max_line_gap)
  155. width, height = img.shape[1], img.shape[0]
  156. scale_x = duration_x / width
  157. scale_y = duration_y / height
  158. print(img.shape, scale_x, scale_y, duration_x, duration_y)
  159. #slope = duration_y / duration_x
  160. slope = 1
  161. expected_slope = scale_x / scale_y
  162. #expected_slope = 0.101694915
  163. print(expected_slope)
  164. offsets = []
  165. for line in lines:
  166. for x1,y1,x2,y2 in line:
  167. # swapped y1 and y2 since y is measured from the top
  168. slope = (y1-y2)/(x2-x1)
  169. if abs(slope - expected_slope) < 0.03:
  170. cv2.line(line_image,(x1,y1),(x2,y2),(255,0,0),5)
  171. cv2.putText(img, "{:.2f}".format(slope), (x1, y1), fontFace=cv2.FONT_HERSHEY_COMPLEX, fontScale=0.5,color=(0, 0, 255))
  172. if (x1 / width) < 0.15:
  173. print(height-y1)
  174. y = height - y1
  175. y0 = y - x1 * slope
  176. offsets.append(y0 * scale_y)
  177. #actual_lines.append((x1 * scale_x, (height - y1) * scale_y, x2 * scale_x, (height - y2) * scale_y))
  178. #print(max(slopes))
  179. lines_edges = cv2.addWeighted(img, 0.8, line_image, 1, 0)
  180. #cv2.imshow("lines", lines_edges)
  181. #cv2.waitKey(0)
  182. return offsets
  183. def map2d(x, y, f):
  184. n_x = len(x)
  185. n_y = len(y)
  186. res = np.zeros((n_x, n_y))
  187. for i in range(n_x):
  188. for j in range(n_y):
  189. res[i,j] = f(x[i], y[j])
  190. return res
def make_widths_equal(fig, rect, ax1, ax2, ax3, pad):
    """Stack three axes vertically in `rect` of `fig` with equal widths.

    pad: vertical gap between axes, in inches.
    NOTE(review): this passes AxesX sizes in `horizontal` and AxesY sizes in
    `vertical` to a VBoxDivider — matches the matplotlib VBoxDivider example
    for equalizing widths, but confirm against the installed matplotlib.
    """
    # pad in inches
    divider = VBoxDivider(
        fig, rect,
        horizontal=[Size.AxesX(ax1), Size.Scaled(1), Size.AxesX(ax2), Size.Scaled(1), Size.AxesX(ax3)],
        vertical=[Size.AxesY(ax1), Size.Fixed(pad), Size.AxesY(ax2), Size.Fixed(pad), Size.AxesY(ax3)])
    # Locator slots 0/2/4 skip the Scaled/Fixed padding entries at 1 and 3.
    ax1.set_axes_locator(divider.new_locator(0))
    ax2.set_axes_locator(divider.new_locator(2))
    ax3.set_axes_locator(divider.new_locator(4))
  200. if __name__ == '__main__':
  201. #vad = webrtcvad.Vad()
  202. #hop_length = 128
  203. #n_mfcc = 13
  204. #frame_duration_ms = 10
  205. fp = "hard_piece_7.wav"
  206. y, sr = librosa.load(fp, mono=True)
  207. song = AudioSegment.from_wav(fp)
  208. ts_non_sil_ms = non_silent_chunks(song)
  209. #print(y.shape)
  210. #mfcc = librosa.feature.mfcc(y=y, sr=sr, hop_length=hop_length, n_mfcc=n_mfcc)[1:,:]
  211. #print(mfcc.shape)
  212. #seg_duration_ms = 100
  213. #seg_duration_samples = millisecond_to_samples(seg_duration_ms, sr)
  214. ## split complete audio in 10ms segments, only keep those that have voice in it
  215. ##segs = []
  216. ##offset = 0
  217. ###i = 0
  218. ##while offset + seg_duration_samples < len(y):
  219. ## seg = y[ offset : offset + seg_duration_samples ]
  220. ## if seg_is_speech(seg):
  221. ## segs.append((seg, offset))
  222. ## offset += seg_duration_samples
  223. ##segs = segs[1:]
  224. ##n_segs = len(segs)
  225. ##(seg, offset) = segs[0]
  226. fp_segment = "segment.wav"
  227. seg, sr_seg = librosa.load(fp_segment, mono=True)
  228. assert sr==sr_seg
  229. ##for seg in segs:
  230. #mfcc_seg = librosa.feature.mfcc(y=seg, sr=sr_seg, hop_length=hop_length, n_mfcc=n_mfcc)[1:,:]
  231. #xsim = calculate_best_offset(mfcc, mfcc_seg, sr)
  232. #fig, ax = plt.subplots(nrows=1, sharex=True)
  233. #img = librosa.display.specshow(xsim, x_axis='s', y_axis='s', hop_length=hop_length, ax=ax, cmap='magma_r')
  234. #print(detect_lines(xsim))
  235. #ax.imshow(np.transpose(xsim), aspect='auto')
  236. #ax[1].imshow(diffs_penalised)
  237. #ax[1].imshow(np.reshape(vad_coeffs, (1, n_segs)))
  238. #ax[2].imshow(np.reshape(lengths, (1, n_segs)))
  239. #make_widths_equal(fig, 111, ax[0], ax[1], ax[2], pad=0.5)
  240. #plt.show()
  241. found_starts = sorted([ samples_to_millisecond(y0, sr) for y0 in detect_lines(None, len(seg), len(y))])
  242. def f(ts, start):
  243. return abs(ts[0] - start)
  244. closest = map2d(ts_non_sil_ms, found_starts, f)
  245. plt.imshow(closest)
  246. plt.show()
  247. latest = -1
  248. for i, row in enumerate(closest):
  249. # min silence len = 400
  250. if min(row) < min_silence_len / 2:
  251. latest = ts_non_sil_ms[i]
  252. print("delete until:", ms_to_time(latest[0]))
  253. #print("possible starts:", [ ms_to_time(t) for t in found_starts])
  254. #for n, seg in enumerate(segs):
  255. # sf.write('part' + str(n) + '.wav', seg, sr)
  256. #print(segs)
  257. #y1, sr1 = librosa.load("out000.wav")
  258. #y2, sr2 = librosa.load("out004.wav")
  259. #print("total alignment cost:", calc_dtw_sim(y1, y2, sr1, sr2, plot_result=True))
  260. #print("xcorr:", np.trace(calc_xcorr_sim(y1, y2, sr1, sr2)))