# Automatically cut audio books: detect repeated takes in a recording
# and emit deletion suggestions (Audacity label-track format).
  1. import librosa
  2. import librosa.display
  3. import numpy as np
  4. import matplotlib.pyplot as plt
  5. import soundfile as sf
  6. from pydub import AudioSegment
  7. from pydub.silence import split_on_silence, detect_nonsilent
  8. import math
  9. import wave
  10. import contextlib
  11. import random
  12. from mpl_toolkits.axes_grid1.axes_divider import VBoxDivider
  13. import mpl_toolkits.axes_grid1.axes_size as Size
  14. import cv2
  15. import sys
  16. import webrtcvad
# Minimum silence gap (ms) used by pydub's detect_nonsilent and as the
# tolerance when matching detected repetition starts to sentence starts.
min_silence_len = 400
# Length (ms) of each frame fed to webrtcvad in seg_is_speech.
frame_duration_ms = 10
  19. def calc_dtw_sim(y1, y2, sr1, sr2, plot_result=False):
  20. hop_length = 64
  21. assert sr1 == sr2
  22. l = min(len(y1), len(y2))
  23. to_consider = min(l, max(round(0.2*l), 2048))
  24. min_len = millisecond_to_samples(100, sr1)
  25. bound = round(0.5 * l)
  26. if bound < min_len:
  27. bound = min_len
  28. #bound = max(round(0.2 * l), millisecond_to_samples(200, sr1))
  29. y1 = y1[0:bound]
  30. y2 = y2[0:bound]
  31. if bound < 2048:
  32. n_fft = bound
  33. n_mels = 64
  34. else:
  35. n_fft = 2048
  36. n_mels = 128
  37. mfcc1 = librosa.feature.mfcc(y=y1, sr=sr1, hop_length=hop_length, n_mfcc=42, n_fft=n_fft, n_mels=n_mels)[1:,:]
  38. mfcc2 = librosa.feature.mfcc(y=y2, sr=sr2, hop_length=hop_length, n_mfcc=42, n_fft=n_fft, n_mels=n_mels)[1:,:]
  39. D, wp = librosa.sequence.dtw(mfcc1, mfcc2)
  40. if plot_result:
  41. fig, ax = plt.subplots(nrows=4)
  42. img = librosa.display.specshow(D, x_axis='frames', y_axis='frames',
  43. ax=ax[0])
  44. ax[0].set(title='DTW cost', xlabel='Noisy sequence', ylabel='Target')
  45. ax[0].plot(wp[:, 1], wp[:, 0], label='Optimal path', color='y')
  46. ax[0].legend()
  47. fig.colorbar(img, ax=ax[0])
  48. ax[1].plot(D[-1, :] / wp.shape[0])
  49. ax[1].set(xlim=[0, mfcc1.shape[1]],
  50. title='Matching cost function')
  51. ax[2].imshow(mfcc1)
  52. ax[3].imshow(mfcc2)
  53. plt.show()
  54. total_alignment_cost = D[-1, -1] / wp.shape[0]
  55. return total_alignment_cost
  56. def calc_xcorr_sim(y1, y2, sr1, sr2):
  57. hop_length = 256
  58. y1 = y1[0:round(len(y1)*0.2)]
  59. y2 = y2[0:round(len(y2)*0.2)]
  60. mfcc1 = librosa.feature.mfcc(y=y1, sr=sr1, hop_length=hop_length, n_mfcc=13)[1:,:]
  61. mfcc2 = librosa.feature.mfcc(y=y2, sr=sr2, hop_length=hop_length, n_mfcc=13)[1:,:]
  62. xsim = librosa.segment.cross_similarity(mfcc1, mfcc2, mode='distance')
  63. return xsim
  64. def match_target_amplitude(aChunk, target_dBFS):
  65. ''' Normalize given audio chunk '''
  66. change_in_dBFS = target_dBFS - aChunk.dBFS
  67. return aChunk.apply_gain(change_in_dBFS)
def spl_on_silence():
    """Split "recording.wav" at silences and return the pieces as arrays.

    Returns
    -------
    (chunks, frame_rate) : `chunks` is a list of float32 numpy arrays,
    one per non-silent piece; `frame_rate` is the recording's rate.

    NOTE(review): `timestamps=True` is not a keyword of stock
    pydub.silence.split_on_silence -- this presumably relies on a
    patched pydub; confirm before reuse.
    """
    # Load the recording to be split.
    song = AudioSegment.from_wav("recording.wav")
    # Split the track wherever silence lasts long enough.
    chunks = split_on_silence (
        song,
        # A gap counts as a split point when it is 1000 ms or longer.
        min_silence_len = 1000,
        # Anything quieter than -50 dBFS is treated as silence.
        silence_thresh = -50,
        timestamps=True
    )
    # (Disabled) per-chunk padding / normalisation / export:
    #for i, chunk in enumerate(chunks):
    #    # Pad each chunk with 500 ms of silence on both sides.
    #    silence_chunk = AudioSegment.silent(duration=500)
    #    audio_chunk = silence_chunk + chunk + silence_chunk
    #    # Normalize the entire chunk.
    #    normalized_chunk = match_target_amplitude(audio_chunk, -20.0)
    #    print("Exporting chunk{0}.mp3.".format(i))
    #    normalized_chunk.export(
    #        ".//chunk{0}.wav".format(i),
    #        bitrate = "192k",
    #        format = "wav"
    #    )
    return ([ audiosegment_to_librosawav(c) for c in chunks ], song.frame_rate)
  100. def non_silent_chunks(song):
  101. #song = AudioSegment.from_wav("recording.wav")
  102. return detect_nonsilent(song, min_silence_len=min_silence_len, silence_thresh=-50)
  103. def audiosegment_to_librosawav(audiosegment):
  104. channel_sounds = audiosegment.split_to_mono()
  105. samples = [s.get_array_of_samples() for s in channel_sounds]
  106. fp_arr = np.array(samples).T.astype(np.float32)
  107. fp_arr /= np.iinfo(samples[0].typecode).max
  108. fp_arr = fp_arr.reshape(-1)
  109. return fp_arr
  110. # sr = samples / second
  111. def millisecond_to_samples(ms, sr):
  112. return round((ms / 1000) * sr)
  113. def samples_to_millisecond(samples, sr):
  114. return (samples / sr) * 1000
  115. def samples_to_time(samples, sr):
  116. return ms_to_time(samples_to_millisecond(samples, sr))
  117. def ms_to_time(ms):
  118. secs = ms / 1000
  119. return "{0}:{1:.4f}".format(math.floor(secs / 60), secs % 60)
  120. def seg_is_speech(seg):
  121. f = lambda x: int(32768 * x)
  122. x = np.vectorize(f)(seg)
  123. pcm_data = x.tobytes()
  124. speeches = 0
  125. total = 0
  126. offset = 0
  127. n = int(sr * (frame_duration_ms / 1000.0) * 2)
  128. duration = (float(n) / sr) / 2.0
  129. while offset + n < len(pcm_data):
  130. frame = pcm_data[offset:(offset+n)]
  131. if vad.is_speech(frame, sr):
  132. speeches += 1
  133. offset = offset + n
  134. total += 1
  135. return speeches / total
  136. def calculate_best_offset(mfcc_ref, mfcc_seg, sr):
  137. return librosa.segment.cross_similarity(mfcc_seg, mfcc_ref, mode='affinity', metric='cosine')
def detect_lines(img, duration_x, duration_y, plot_result=False):
    """Find diagonal line segments in a similarity matrix via Hough transform.

    Parameters
    ----------
    img : 2-D array with values in [0, 1] (a cross-similarity matrix).
    duration_x, duration_y : durations (in samples) represented by the
        matrix's x and y axes; used to map pixels back to sample offsets.
    plot_result : if True, show the detected lines with OpenCV.

    Returns
    -------
    (x_min * scale_x, offsets): the leftmost accepted line start converted
    to samples along x, and each accepted line's y-position extrapolated
    back to x_min, converted to samples along y.
    """
    # Invert and scale to an 8-bit grayscale image for OpenCV.
    gray = np.vectorize(int)((1-img) * 255).astype('uint8')
    img = cv2.cvtColor(gray, cv2.COLOR_GRAY2BGR)
    # Blur before edge detection to suppress speckle.
    kernel_size = 5
    blur_gray = cv2.GaussianBlur(gray, (kernel_size, kernel_size), 0)
    low_threshold = 50
    high_threshold = 150
    edges = cv2.Canny(blur_gray, low_threshold, high_threshold)
    rho = 1  # distance resolution in pixels of the Hough grid
    theta = np.pi / 180  # angular resolution in radians of the Hough grid
    threshold = 15  # minimum number of votes (intersections in Hough grid cell)
    min_line_length = 50  # minimum number of pixels making up a line
    max_line_gap = 20  # maximum gap in pixels between connectable line segments
    if plot_result:
        line_image = np.copy(img) * 0  # blank image to draw lines on
    # Probabilistic Hough on the edge image; "lines" holds endpoints of
    # the detected line segments.
    lines = cv2.HoughLinesP(edges, rho, theta, threshold, np.array([]),
    min_line_length, max_line_gap)
    width, height = img.shape[1], img.shape[0]
    scale_x = duration_x / width
    scale_y = duration_y / height
    slope = 1
    # A repetition at equal playback speed appears as a line of this
    # slope in pixel coordinates.
    expected_slope = scale_x / scale_y
    ls = []
    offsets = []
    xs = []
    if lines is not None:
        for line in lines:
            for x1,y1,x2,y2 in line:
                # y is measured from the top in image coordinates;
                # 42 is a sentinel slope for vertical segments.
                slope = (y2-y1)/(x2-x1) if x2 != x1 else 42
                # Keep only segments whose slope matches the expected one.
                if abs(slope - expected_slope) < 0.15:
                    y = y1
                    y0 = (y - x1 * slope)
                    if plot_result:
                        cv2.line(line_image,(x1, y1),(x2,y2),(255,0,0),5)
                        cv2.putText(img, "{:.2f}".format(slope), (x1, y1), fontFace=cv2.FONT_HERSHEY_COMPLEX, fontScale=2,color=(0, 0, 255))
                    ls.append((x1, y1, slope))
    # Leftmost accepted line start; 42 is a sentinel (anything > 10)
    # when no lines were accepted.
    x_min = min(ls, key=lambda a: a[0])[0] if len(ls) > 0 else 42
    # Extrapolate every line back to x_min and convert to samples.
    offsets = [ (y1 + (x_min - x1)*slope) * scale_y for x1, y1, slope in ls ]
    if plot_result:
        for x1, y1, slope in ls:
            y = y1 + (x_min -x1)*slope
        lines_edges = cv2.addWeighted(img, 0.8, line_image, 1, 0)
        lines_edges_resized = cv2.resize(lines_edges, (int(1024 * duration_x / duration_y ), 1024))
        cv2.imshow("lines", lines_edges_resized)
        cv2.waitKey(0)
    return (x_min*scale_x, offsets)
  215. def map2d(x, y, f):
  216. n_x = len(x)
  217. n_y = len(y)
  218. res = np.zeros((n_x, n_y))
  219. for i in range(n_x):
  220. for j in range(n_y):
  221. res[i,j] = f(x[i], y[j])
  222. return res
def find_repetition(mfcc_ref, seg, sr, hop_length, sentence_timestamps, plot_result=False):
    """Look for a re-occurrence of *seg* inside the reference MFCC window.

    Parameters
    ----------
    mfcc_ref : MFCC matrix of the reference span.
    seg : audio samples of the candidate window.
    sr : sample rate.
    hop_length : MFCC hop length used for both MFCC computations.
    sentence_timestamps : sentence start offsets in ms, relative to seg.
    plot_result : if True, show intermediate matrices.

    Returns (start_ms_within_seg, matched_sentence_start_ms_or_None).
    NOTE: reads the module-level `n_mfcc` and `min_silence_len`.
    """
    mfcc_seg = librosa.feature.mfcc(y=seg, sr=sr, hop_length=hop_length, n_mfcc=n_mfcc)[1:,:]
    xsim = calculate_best_offset(mfcc_ref, mfcc_seg, sr)
    # x axis of xsim spans the segment; y axis spans the reference window
    # (mfcc frame count * hop_length samples).
    x_min, offsets = detect_lines(xsim, len(seg), mfcc_ref.shape[1] * hop_length, plot_result=plot_result)
    found_starts = sorted([ samples_to_millisecond(y0, sr) for y0 in offsets ])
    def f(ts, start):
        # Distance between a known sentence start and a detected line start.
        return abs(ts - start)
    closest = map2d(sentence_timestamps, found_starts, f)
    if plot_result:
        plt.imshow(closest)
        plt.show()
    latest = None
    # Keep the last sentence start that lies close enough to any
    # detected repetition start (within half the silence threshold).
    for i, row in enumerate(closest):
        if len(row) == 0:
            continue
        if min(row) < min_silence_len / 2:
            latest = sentence_timestamps[i]
    return (samples_to_millisecond(x_min, sr), latest)
  241. def samples_to_hops(samples, hop_length):
  242. return round(samples / hop_length)
  243. def hops_to_samples(hops, hop_length):
  244. return round(hop_length * hops)
  245. def cont_find_repetitions(y, sr, hop_length, sentence_timestamps):
  246. assert sorted(sentence_timestamps, key=lambda t: t[0]) == sentence_timestamps
  247. #print(y.shape)
  248. mfcc = librosa.feature.mfcc(y=y, sr=sr, hop_length=hop_length, n_mfcc=n_mfcc)[1:,:]
  249. step_length_ms = 200
  250. step_length_samples = millisecond_to_samples(step_length_ms, sr)
  251. window_length_ms = 1500
  252. window_length_samples = millisecond_to_samples(window_length_ms, sr)
  253. ref_window_length_ms = 20*1000 # 10 sekunden
  254. ref_window_length_samples = millisecond_to_samples(ref_window_length_ms, sr)
  255. ref_window_length_hops = samples_to_hops(ref_window_length_samples, hop_length)
  256. offset = 0
  257. available_ts = sentence_timestamps
  258. last_sentence_end = 0
  259. deletion_suggestions = []
  260. while offset + step_length_samples < len(y) and len(available_ts) > 0:
  261. offset_ms = samples_to_millisecond(offset, sr)
  262. #print(ms_to_time(offset_ms), file=sys.stderr)
  263. if offset_ms < available_ts[0][0] and offset_ms >= last_sentence_end:
  264. offset += step_length_samples
  265. continue
  266. seg = y[ offset : offset + window_length_samples ]
  267. # no longer needed since skipping based on sentence timestamps?
  268. #if seg_is_speech(seg) < 0.5:
  269. # offset += step_length_samples
  270. # continue
  271. relevant_start = offset_ms
  272. mfcc_window = mfcc[:,samples_to_hops(offset, hop_length) : samples_to_hops(offset, hop_length) + ref_window_length_hops]
  273. x_offset_ms, ts_ms = find_repetition(mfcc_window,
  274. seg,
  275. sr,
  276. hop_length,
  277. [ t[0] - offset_ms for t in available_ts ])
  278. if ts_ms is not None and x_offset_ms < step_length_ms:
  279. print("delete from {0} to {1}".format(samples_to_time(offset + millisecond_to_samples(x_offset_ms, sr), sr), ms_to_time(offset_ms + ts_ms)))
  280. deletion_suggestions.append((offset_ms + x_offset_ms, offset_ms + ts_ms))
  281. #print("window {0} - {1} is repeated at: {2}".format(samples_to_time(offset, sr), samples_to_time(offset + window_length_samples, sr), ms_to_time(ts_ms)))
  282. offset += step_length_samples
  283. if offset_ms + step_length_ms > available_ts[0][0]:
  284. last_sentence_end = available_ts[0][1]
  285. available_ts = available_ts[1:]
  286. #available_ts = [t for t in ts_non_sil_ms if t[0] > offset_ms ]
  287. deletions = []
  288. cur_deletion = None
  289. for sugg in deletion_suggestions:
  290. if cur_deletion is None:
  291. cur_deletion = [sugg]
  292. else:
  293. if sugg[0] - cur_deletion[-1][0] < 250:
  294. cur_deletion.append(sugg)
  295. else:
  296. deletions.append(cur_deletion)
  297. cur_deletion = [sugg]
  298. deletions = [(np.mean([d[0] for d in ds]), np.max([d[1] for d in ds])) for ds in deletions]
  299. for n, d in enumerate(deletions):
  300. offs = [abs(d[0]-ts[0]) for ts in sentence_timestamps]
  301. i = np.argmin(offs)
  302. if offs[i] < 150:
  303. deletions[n] = (sentence_timestamps[i][0], d[1])
  304. else:
  305. deletions[n] = (d[0], d[1])
  306. return deletions
  307. def make_widths_equal(fig, rect, ax1, ax2, ax3, pad):
  308. # pad in inches
  309. divider = VBoxDivider(
  310. fig, rect,
  311. horizontal=[Size.AxesX(ax1), Size.Scaled(1), Size.AxesX(ax2), Size.Scaled(1), Size.AxesX(ax3)],
  312. vertical=[Size.AxesY(ax1), Size.Fixed(pad), Size.AxesY(ax2), Size.Fixed(pad), Size.AxesY(ax3)])
  313. ax1.set_axes_locator(divider.new_locator(0))
  314. ax2.set_axes_locator(divider.new_locator(2))
  315. ax3.set_axes_locator(divider.new_locator(4))
if __name__ == '__main__':
    # Script entry point: load the recording, find repeated takes, and
    # print deletion suggestions as Audacity-style label lines.
    # `vad`, `sr` and `n_mfcc` become the module globals that
    # seg_is_speech / find_repetition / cont_find_repetitions read.
    vad = webrtcvad.Vad()
    hop_length = 128
    n_mfcc = 42
    fp = "hard_pieces.wav"
    print("loading file ...")
    # Mono, resampled to 32 kHz. NOTE(review): presumably chosen to
    # match a webrtcvad-supported rate -- confirm.
    y, sr = librosa.load(fp, mono=True, sr=32000)
    print("calculating mfcc ...")
    mfcc = librosa.feature.mfcc(y=y, sr=sr, hop_length=hop_length, n_mfcc=n_mfcc)[1:,:]
    song = AudioSegment.from_wav(fp)
    mf_w = mfcc.shape[1]
    l = y.shape[0]
    # Sanity check: samples per MFCC frame (should be ~hop_length).
    print(l / mf_w)
    ts_non_sil_ms = non_silent_chunks(song)
    dels = cont_find_repetitions(y, sr, hop_length, ts_non_sil_ms)
    # Audacity label-track format: start<TAB>end<TAB>label, in seconds.
    for d in dels:
        print("{0}\t{1}\tdelete".format(d[0]/1000, d[1]/1000))
    fp_segment = "segment.wav"
    # Loaded for manual experiments below; currently unused by the
    # active code path.
    seg, sr_seg = librosa.load(fp_segment, mono=True, sr=32000)
    print("detecting lines ...")
    # NOTE(review): the remainder of the original script consisted of
    # commented-out experiments (autocorrelation, recurrence/cross-
    # similarity plots, manual find_repetition calls, DTW and xcorr
    # comparisons of out000.wav/out004.wav).