| @@ -8,23 +8,31 @@ from pydub.silence import split_on_silence, detect_nonsilent | |||||
| import math | import math | ||||
| import wave | import wave | ||||
| import contextlib | import contextlib | ||||
| import random | |||||
| from mpl_toolkits.axes_grid1.axes_divider import VBoxDivider | |||||
| import mpl_toolkits.axes_grid1.axes_size as Size | |||||
| import webrtcvad | import webrtcvad | ||||
| def calc_dtw_sim(y1, y2, sr1, sr2, plot_result=False): | def calc_dtw_sim(y1, y2, sr1, sr2, plot_result=False): | ||||
| hop_length = 64 | hop_length = 64 | ||||
| assert sr1 == sr2 | |||||
| l = min(len(y1), len(y2)) | l = min(len(y1), len(y2)) | ||||
| to_consider = min(l, max(round(0.2*l), 2048)) | to_consider = min(l, max(round(0.2*l), 2048)) | ||||
| bound = round(0.2 * l) | |||||
| min_len = millisecond_to_samples(100, sr1) | |||||
| bound = round(0.5 * l) | |||||
| if bound < min_len: | |||||
| bound = min_len | |||||
| #bound = max(round(0.2 * l), millisecond_to_samples(200, sr1)) | |||||
| y1 = y1[0:round(0.2*l)] | |||||
| y2 = y2[0:round(0.2*l)] | |||||
| y1 = y1[0:bound] | |||||
| y2 = y2[0:bound] | |||||
| if bound < 2048: | if bound < 2048: | ||||
| n_fft = 512 | |||||
| n_fft = bound | |||||
| n_mels = 64 | n_mels = 64 | ||||
| else: | else: | ||||
| n_fft = 2048 | n_fft = 2048 | ||||
| @@ -169,11 +177,23 @@ def seg_is_speech(seg): | |||||
| return speeches / total | return speeches / total | ||||
def make_widths_equal(fig, rect, ax1, ax2, ax3, pad):
    """Lay out three axes in a vertical box and attach matching locators.

    A single ``VBoxDivider`` is built over ``rect`` of ``fig``. Its horizontal
    size list alternates the x-extent of each axes with a ``Size.Scaled(1)``
    filler; its vertical size list alternates the y-extent of each axes with
    a fixed gap. Each axes then receives the locator for its own slot.

    Args:
        fig: Figure handed to the divider.
        rect: Position spec forwarded unchanged to ``VBoxDivider``.
        ax1, ax2, ax3: The axes placed in divider slots 0, 2 and 4.
        pad: Gap between consecutive axes, in inches.
    """
    stacked = (ax1, ax2, ax3)

    # Horizontal sizes: axes x-extents separated by scaled fillers.
    horizontal = [Size.AxesX(stacked[0])]
    for axes in stacked[1:]:
        horizontal += [Size.Scaled(1), Size.AxesX(axes)]

    # Vertical sizes: axes y-extents separated by the fixed padding.
    vertical = [Size.AxesY(stacked[0])]
    for axes in stacked[1:]:
        vertical += [Size.Fixed(pad), Size.AxesY(axes)]

    divider = VBoxDivider(fig, rect, horizontal=horizontal, vertical=vertical)

    # Odd slots hold the padding entries, so the axes sit on even indices.
    for slot, axes in zip((0, 2, 4), stacked):
        axes.set_axes_locator(divider.new_locator(slot))
| if __name__ == '__main__': | if __name__ == '__main__': | ||||
| vad = webrtcvad.Vad() | vad = webrtcvad.Vad() | ||||
| frame_duration_ms = 10 | frame_duration_ms = 10 | ||||
| fp = "hard_piece_2.wav" | |||||
| fp = "hard_pieces.wav" | |||||
| y, sr = librosa.load(fp, mono=True, sr=32000) | y, sr = librosa.load(fp, mono=True, sr=32000) | ||||
| #pcm_data = y.tobytes() | #pcm_data = y.tobytes() | ||||
| @@ -213,16 +233,49 @@ if __name__ == '__main__': | |||||
| #print("librosa load done") | #print("librosa load done") | ||||
| segs = [] | segs = [] | ||||
| #i = 0 | |||||
| for ts in non_silent_chunks(song): | for ts in non_silent_chunks(song): | ||||
| start, end = ts[0], ts[1] | start, end = ts[0], ts[1] | ||||
| seg = y[ millisecond_to_samples(start, sr) : millisecond_to_samples(end, sr) ] | seg = y[ millisecond_to_samples(start, sr) : millisecond_to_samples(end, sr) ] | ||||
| segs.append(((start, end), seg)) | segs.append(((start, end), seg)) | ||||
| #sf.write("part{0}.wav".format(i), seg, sr, 'PCM_16') | |||||
| #i += 1 | |||||
| #segs = segs[1:] | |||||
| n_segs = len(segs) | |||||
| #random.shuffle(segs) | |||||
| diffs = np.zeros((n_segs, n_segs)) | |||||
| diffs_penalised = np.zeros((n_segs, n_segs)) | |||||
| for i in range(len(segs)-1): | |||||
| vad_coeffs = np.zeros((n_segs,)) | |||||
| lengths = np.zeros((n_segs,)) | |||||
| for i in range(n_segs): | |||||
| (s1, e1), y1 = segs[i] | |||||
| for j in range(i): | |||||
| (s2, e2), y2 = segs[j] | |||||
| diffs[i,j] = calc_dtw_sim(y1, y2, sr, sr, plot_result=False) | |||||
| diffs[j,i] = diffs[i,j] | |||||
| distance_penalty = abs(i-j)**(100/min((e1-s1), (e2-s2))) | |||||
| diffs_penalised[i,j] = diffs[i,j] * distance_penalty | |||||
| diffs_penalised[j,i] = diffs_penalised[i,j] | |||||
| vad_coeffs[i] = seg_is_speech(y1) | |||||
| lengths[i] = e1 - s1 | |||||
| delete_segs = np.zeros((n_segs,), dtype=bool) | |||||
| for i in range(n_segs): | |||||
| if delete_segs[i]: | |||||
| continue | |||||
| max_j = i | |||||
| for j in range(i, n_segs): | |||||
| if diffs_penalised[i,j] < 80: | |||||
| max_j = j | |||||
| delete_segs[i:max_j] = True | |||||
| for i in range(n_segs): | |||||
| (s1, e1), y1 = segs[i] | (s1, e1), y1 = segs[i] | ||||
| (s2, e2), y2 = segs[i+1] | |||||
| diff = calc_dtw_sim(y1, y2, sr, sr, plot_result=False) | |||||
| vad_coeff = seg_is_speech(y1) | |||||
| print("{0}\t{1}\tn: {2} delete: {3}, vad: {4}".format(s1/1000, e1/1000, i, delete_segs[i], vad_coeffs[i])) | |||||
| #if diff < 100: | #if diff < 100: | ||||
| #print("{0}\t{1}\tdiff: {2}, vad: {3}".format(s1/1000, e1/1000, diff, vad_coeff)) | #print("{0}\t{1}\tdiff: {2}, vad: {3}".format(s1/1000, e1/1000, diff, vad_coeff)) | ||||
| @@ -232,6 +285,14 @@ if __name__ == '__main__': | |||||
| #print("{0}\t{1}\tvad {2}".format(s1/1000, e1/1000, vad_coeff)) | #print("{0}\t{1}\tvad {2}".format(s1/1000, e1/1000, vad_coeff)) | ||||
| fig, ax = plt.subplots(nrows=3, sharex=True) | |||||
| ax[0].imshow(diffs) | |||||
| ax[1].imshow(diffs_penalised) | |||||
| #ax[1].imshow(np.reshape(vad_coeffs, (1, n_segs))) | |||||
| ax[2].imshow(np.reshape(lengths, (1, n_segs))) | |||||
| make_widths_equal(fig, 111, ax[0], ax[1], ax[2], pad=0.5) | |||||
| plt.show() | |||||
| #for n, seg in enumerate(segs): | #for n, seg in enumerate(segs): | ||||
| # sf.write('part' + str(n) + '.wav', seg, sr) | # sf.write('part' + str(n) + '.wav', seg, sr) | ||||
| #print(segs) | #print(segs) | ||||