From cc4f477029d8ce7ea4ed013d880464672415142d Mon Sep 17 00:00:00 2001 From: flavis Date: Fri, 9 Sep 2022 12:19:32 +0200 Subject: [PATCH] some more fine tuning --- analysis.py | 79 +++++++++++++++++++++++++++++++++++++++++++++++------ 1 file changed, 70 insertions(+), 9 deletions(-) diff --git a/analysis.py b/analysis.py index 2f6894b..fbe3a65 100644 --- a/analysis.py +++ b/analysis.py @@ -8,23 +8,31 @@ from pydub.silence import split_on_silence, detect_nonsilent import math import wave import contextlib +import random +from mpl_toolkits.axes_grid1.axes_divider import VBoxDivider +import mpl_toolkits.axes_grid1.axes_size as Size import webrtcvad def calc_dtw_sim(y1, y2, sr1, sr2, plot_result=False): hop_length = 64 + assert sr1 == sr2 l = min(len(y1), len(y2)) to_consider = min(l, max(round(0.2*l), 2048)) - bound = round(0.2 * l) + min_len = millisecond_to_samples(100, sr1) + bound = round(0.5 * l) + if bound < min_len: + bound = min_len + #bound = max(round(0.2 * l), millisecond_to_samples(200, sr1)) - y1 = y1[0:round(0.2*l)] - y2 = y2[0:round(0.2*l)] + y1 = y1[0:bound] + y2 = y2[0:bound] if bound < 2048: - n_fft = 512 + n_fft = bound n_mels = 64 else: n_fft = 2048 @@ -169,11 +177,23 @@ def seg_is_speech(seg): return speeches / total + +def make_widths_equal(fig, rect, ax1, ax2, ax3, pad): + # pad in inches + divider = VBoxDivider( + fig, rect, + horizontal=[Size.AxesX(ax1), Size.Scaled(1), Size.AxesX(ax2), Size.Scaled(1), Size.AxesX(ax3)], + vertical=[Size.AxesY(ax1), Size.Fixed(pad), Size.AxesY(ax2), Size.Fixed(pad), Size.AxesY(ax3)]) + ax1.set_axes_locator(divider.new_locator(0)) + ax2.set_axes_locator(divider.new_locator(2)) + ax3.set_axes_locator(divider.new_locator(4)) + + if __name__ == '__main__': vad = webrtcvad.Vad() frame_duration_ms = 10 - fp = "hard_piece_2.wav" + fp = "hard_pieces.wav" y, sr = librosa.load(fp, mono=True, sr=32000) #pcm_data = y.tobytes() @@ -213,16 +233,49 @@ if __name__ == '__main__': #print("librosa load done") segs = [] + #i = 0 for ts in non_silent_chunks(song): start, end = ts[0], ts[1] seg = y[ millisecond_to_samples(start, sr) : millisecond_to_samples(end, sr) ] segs.append(((start, end), seg)) + #sf.write("part{0}.wav".format(i), seg, sr, 'PCM_16') + #i += 1 + + #segs = segs[1:] + n_segs = len(segs) + #random.shuffle(segs) + diffs = np.zeros((n_segs, n_segs)) + diffs_penalised = np.zeros((n_segs, n_segs)) - for i in range(len(segs)-1): + vad_coeffs = np.zeros((n_segs,)) + lengths = np.zeros((n_segs,)) + + for i in range(n_segs): + (s1, e1), y1 = segs[i] + for j in range(i): + (s2, e2), y2 = segs[j] + diffs[i,j] = calc_dtw_sim(y1, y2, sr, sr, plot_result=False) + diffs[j,i] = diffs[i,j] + distance_penalty = abs(i-j)**(100/min((e1-s1), (e2-s2))) + diffs_penalised[i,j] = diffs[i,j] * distance_penalty + diffs_penalised[j,i] = diffs_penalised[i,j] + vad_coeffs[i] = seg_is_speech(y1) + lengths[i] = e1 - s1 + + delete_segs = np.zeros((n_segs,), dtype=bool) + + for i in range(n_segs): + if delete_segs[i]: + continue + max_j = i + for j in range(i, n_segs): + if diffs_penalised[i,j] < 80: + max_j = j + delete_segs[i:max_j] = True + + for i in range(n_segs): (s1, e1), y1 = segs[i] - (s2, e2), y2 = segs[i+1] - diff = calc_dtw_sim(y1, y2, sr, sr, plot_result=False) - vad_coeff = seg_is_speech(y1) + print("{0}\t{1}\tn: {2} delete: {3}, vad: {4}".format(s1/1000, e1/1000, i, delete_segs[i], vad_coeffs[i])) #if diff < 100: #print("{0}\t{1}\tdiff: {2}, vad: {3}".format(s1/1000, e1/1000, diff, vad_coeff)) @@ -232,6 +285,14 @@ if __name__ == '__main__': #print("{0}\t{1}\tvad {2}".format(s1/1000, e1/1000, vad_coeff)) + fig, ax = plt.subplots(nrows=3, sharex=True) + ax[0].imshow(diffs) + ax[1].imshow(diffs_penalised) + #ax[1].imshow(np.reshape(vad_coeffs, (1, n_segs))) + ax[2].imshow(np.reshape(lengths, (1, n_segs))) + + make_widths_equal(fig, 111, ax[0], ax[1], ax[2], pad=0.5) + plt.show() #for n, seg in enumerate(segs): # sf.write('part' + str(n) + '.wav', seg, sr) #print(segs)