Procházet zdrojové kódy

some more fine tuning

master
flavis před 3 roky
rodič
revize
cc4f477029
Podepsáno: christian <christian@flavigny.de> ID GPG klíče: D953D69721B948B3
1 změnil soubory, kde provedl 70 přidání a 9 odebrání
  1. +70
    -9
      analysis.py

+ 70
- 9
analysis.py Zobrazit soubor

@@ -8,23 +8,31 @@ from pydub.silence import split_on_silence, detect_nonsilent
import math import math
import wave import wave
import contextlib import contextlib
import random
from mpl_toolkits.axes_grid1.axes_divider import VBoxDivider
import mpl_toolkits.axes_grid1.axes_size as Size


import webrtcvad import webrtcvad




def calc_dtw_sim(y1, y2, sr1, sr2, plot_result=False): def calc_dtw_sim(y1, y2, sr1, sr2, plot_result=False):
hop_length = 64 hop_length = 64
assert sr1 == sr2


l = min(len(y1), len(y2)) l = min(len(y1), len(y2))


to_consider = min(l, max(round(0.2*l), 2048)) to_consider = min(l, max(round(0.2*l), 2048))
bound = round(0.2 * l)
min_len = millisecond_to_samples(100, sr1)
bound = round(0.5 * l)
if bound < min_len:
bound = min_len
#bound = max(round(0.2 * l), millisecond_to_samples(200, sr1))


y1 = y1[0:round(0.2*l)]
y2 = y2[0:round(0.2*l)]
y1 = y1[0:bound]
y2 = y2[0:bound]


if bound < 2048: if bound < 2048:
n_fft = 512
n_fft = bound
n_mels = 64 n_mels = 64
else: else:
n_fft = 2048 n_fft = 2048
@@ -169,11 +177,23 @@ def seg_is_speech(seg):


return speeches / total return speeches / total



def make_widths_equal(fig, rect, ax1, ax2, ax3, pad):
    """Lay out three axes through a shared ``VBoxDivider``.

    The divider is built for ``fig`` at ``rect``. Its horizontal size list
    holds the x-extent of each axes separated by ``Size.Scaled(1)`` entries,
    and its vertical size list holds the y-extent of each axes separated by
    fixed gaps of ``pad`` (in inches). Each axes then receives a locator from
    the divider. Judging by the function name, the intent is for the three
    axes to end up equally wide -- NOTE(review): confirm against the
    matplotlib ``VBoxDivider`` documentation.

    Args:
        fig: Figure the divider is attached to.
        rect: Position specification forwarded unchanged to ``VBoxDivider``.
        ax1: Axes given locator index 0.
        ax2: Axes given locator index 2.
        ax3: Axes given locator index 4.
        pad: Gap between consecutive axes, in inches.
    """
    stacked_axes = (ax1, ax2, ax3)

    horizontal_sizes = []
    vertical_sizes = []
    for position, axes in enumerate(stacked_axes):
        if position:
            # Separator entries go between consecutive axes only.
            horizontal_sizes.append(Size.Scaled(1))
            vertical_sizes.append(Size.Fixed(pad))
        horizontal_sizes.append(Size.AxesX(axes))
        vertical_sizes.append(Size.AxesY(axes))

    box = VBoxDivider(
        fig,
        rect,
        horizontal=horizontal_sizes,
        vertical=vertical_sizes,
    )

    # Even indices of the size lists are the axes; odd indices are the gaps.
    for position, axes in enumerate(stacked_axes):
        axes.set_axes_locator(box.new_locator(2 * position))


if __name__ == '__main__': if __name__ == '__main__':
vad = webrtcvad.Vad() vad = webrtcvad.Vad()


frame_duration_ms = 10 frame_duration_ms = 10
fp = "hard_piece_2.wav"
fp = "hard_pieces.wav"
y, sr = librosa.load(fp, mono=True, sr=32000) y, sr = librosa.load(fp, mono=True, sr=32000)


#pcm_data = y.tobytes() #pcm_data = y.tobytes()
@@ -213,16 +233,49 @@ if __name__ == '__main__':
#print("librosa load done") #print("librosa load done")


segs = [] segs = []
#i = 0
for ts in non_silent_chunks(song): for ts in non_silent_chunks(song):
start, end = ts[0], ts[1] start, end = ts[0], ts[1]
seg = y[ millisecond_to_samples(start, sr) : millisecond_to_samples(end, sr) ] seg = y[ millisecond_to_samples(start, sr) : millisecond_to_samples(end, sr) ]
segs.append(((start, end), seg)) segs.append(((start, end), seg))
#sf.write("part{0}.wav".format(i), seg, sr, 'PCM_16')
#i += 1

#segs = segs[1:]
n_segs = len(segs)
#random.shuffle(segs)
diffs = np.zeros((n_segs, n_segs))
diffs_penalised = np.zeros((n_segs, n_segs))


for i in range(len(segs)-1):
vad_coeffs = np.zeros((n_segs,))
lengths = np.zeros((n_segs,))

for i in range(n_segs):
(s1, e1), y1 = segs[i]
for j in range(i):
(s2, e2), y2 = segs[j]
diffs[i,j] = calc_dtw_sim(y1, y2, sr, sr, plot_result=False)
diffs[j,i] = diffs[i,j]
distance_penalty = abs(i-j)**(100/min((e1-s1), (e2-s2)))
diffs_penalised[i,j] = diffs[i,j] * distance_penalty
diffs_penalised[j,i] = diffs_penalised[i,j]
vad_coeffs[i] = seg_is_speech(y1)
lengths[i] = e1 - s1

delete_segs = np.zeros((n_segs,), dtype=bool)

for i in range(n_segs):
if delete_segs[i]:
continue
max_j = i
for j in range(i, n_segs):
if diffs_penalised[i,j] < 80:
max_j = j
delete_segs[i:max_j] = True
for i in range(n_segs):
(s1, e1), y1 = segs[i] (s1, e1), y1 = segs[i]
(s2, e2), y2 = segs[i+1]
diff = calc_dtw_sim(y1, y2, sr, sr, plot_result=False)
vad_coeff = seg_is_speech(y1)
print("{0}\t{1}\tn: {2} delete: {3}, vad: {4}".format(s1/1000, e1/1000, i, delete_segs[i], vad_coeffs[i]))


#if diff < 100: #if diff < 100:
#print("{0}\t{1}\tdiff: {2}, vad: {3}".format(s1/1000, e1/1000, diff, vad_coeff)) #print("{0}\t{1}\tdiff: {2}, vad: {3}".format(s1/1000, e1/1000, diff, vad_coeff))
@@ -232,6 +285,14 @@ if __name__ == '__main__':
#print("{0}\t{1}\tvad {2}".format(s1/1000, e1/1000, vad_coeff)) #print("{0}\t{1}\tvad {2}".format(s1/1000, e1/1000, vad_coeff))




fig, ax = plt.subplots(nrows=3, sharex=True)
ax[0].imshow(diffs)
ax[1].imshow(diffs_penalised)
#ax[1].imshow(np.reshape(vad_coeffs, (1, n_segs)))
ax[2].imshow(np.reshape(lengths, (1, n_segs)))

make_widths_equal(fig, 111, ax[0], ax[1], ax[2], pad=0.5)
plt.show()
#for n, seg in enumerate(segs): #for n, seg in enumerate(segs):
# sf.write('part' + str(n) + '.wav', seg, sr) # sf.write('part' + str(n) + '.wav', seg, sr)
#print(segs) #print(segs)


Načítá se…
Zrušit
Uložit