# Automatically cut audio books: detect repeated takes in a recording
# and emit deletion suggestions (Audacity label-track format).
  1. import librosa
  2. import librosa.display
  3. import numpy as np
  4. import matplotlib.pyplot as plt
  5. import soundfile as sf
  6. from pydub import AudioSegment
  7. from pydub.silence import split_on_silence, detect_nonsilent
  8. import math
  9. import wave
  10. import contextlib
  11. import random
  12. from mpl_toolkits.axes_grid1.axes_divider import VBoxDivider
  13. import mpl_toolkits.axes_grid1.axes_size as Size
  14. import cv2
  15. import sys
  16. import webrtcvad
# Minimum silence gap (ms) used by pydub's detect_nonsilent and as the
# tolerance when matching detected repetition starts to sentence starts.
min_silence_len = 400
# Length (ms) of each frame fed to webrtcvad in seg_is_speech.
frame_duration_ms = 10
  19. def calc_dtw_sim(y1, y2, sr1, sr2, plot_result=False):
  20. hop_length = 64
  21. assert sr1 == sr2
  22. l = min(len(y1), len(y2))
  23. to_consider = min(l, max(round(0.2*l), 2048))
  24. min_len = millisecond_to_samples(100, sr1)
  25. bound = round(0.5 * l)
  26. if bound < min_len:
  27. bound = min_len
  28. #bound = max(round(0.2 * l), millisecond_to_samples(200, sr1))
  29. y1 = y1[0:bound]
  30. y2 = y2[0:bound]
  31. if bound < 2048:
  32. n_fft = bound
  33. n_mels = 64
  34. else:
  35. n_fft = 2048
  36. n_mels = 128
  37. mfcc1 = librosa.feature.mfcc(y=y1, sr=sr1, hop_length=hop_length, n_mfcc=42, n_fft=n_fft, n_mels=n_mels)[1:,:]
  38. mfcc2 = librosa.feature.mfcc(y=y2, sr=sr2, hop_length=hop_length, n_mfcc=42, n_fft=n_fft, n_mels=n_mels)[1:,:]
  39. D, wp = librosa.sequence.dtw(mfcc1, mfcc2)
  40. if plot_result:
  41. fig, ax = plt.subplots(nrows=4)
  42. img = librosa.display.specshow(D, x_axis='frames', y_axis='frames',
  43. ax=ax[0])
  44. ax[0].set(title='DTW cost', xlabel='Noisy sequence', ylabel='Target')
  45. ax[0].plot(wp[:, 1], wp[:, 0], label='Optimal path', color='y')
  46. ax[0].legend()
  47. fig.colorbar(img, ax=ax[0])
  48. ax[1].plot(D[-1, :] / wp.shape[0])
  49. ax[1].set(xlim=[0, mfcc1.shape[1]],
  50. title='Matching cost function')
  51. ax[2].imshow(mfcc1)
  52. ax[3].imshow(mfcc2)
  53. plt.show()
  54. total_alignment_cost = D[-1, -1] / wp.shape[0]
  55. return total_alignment_cost
  56. def calc_xcorr_sim(y1, y2, sr1, sr2):
  57. hop_length = 256
  58. y1 = y1[0:round(len(y1)*0.2)]
  59. y2 = y2[0:round(len(y2)*0.2)]
  60. mfcc1 = librosa.feature.mfcc(y=y1, sr=sr1, hop_length=hop_length, n_mfcc=13)[1:,:]
  61. mfcc2 = librosa.feature.mfcc(y=y2, sr=sr2, hop_length=hop_length, n_mfcc=13)[1:,:]
  62. xsim = librosa.segment.cross_similarity(mfcc1, mfcc2, mode='distance')
  63. return xsim
  64. def match_target_amplitude(aChunk, target_dBFS):
  65. ''' Normalize given audio chunk '''
  66. change_in_dBFS = target_dBFS - aChunk.dBFS
  67. return aChunk.apply_gain(change_in_dBFS)
def spl_on_silence():
    """Split "recording.wav" at silences and return the pieces as arrays.

    Returns
    -------
    (chunks, frame_rate) : `chunks` is a list of float32 numpy arrays,
    one per non-silent piece; `frame_rate` is the recording's rate.

    NOTE(review): `timestamps=True` is not a keyword of stock
    pydub.silence.split_on_silence -- this presumably relies on a
    patched pydub; confirm before reuse.
    """
    # Load the recording to be split.
    song = AudioSegment.from_wav("recording.wav")
    # Split the track wherever silence lasts long enough.
    chunks = split_on_silence (
        song,
        # A gap counts as a split point when it is 1000 ms or longer.
        min_silence_len = 1000,
        # Anything quieter than -50 dBFS is treated as silence.
        silence_thresh = -50,
        timestamps=True
    )
    # (Disabled) per-chunk padding / normalisation / export:
    #for i, chunk in enumerate(chunks):
    #    # Pad each chunk with 500 ms of silence on both sides.
    #    silence_chunk = AudioSegment.silent(duration=500)
    #    audio_chunk = silence_chunk + chunk + silence_chunk
    #    # Normalize the entire chunk.
    #    normalized_chunk = match_target_amplitude(audio_chunk, -20.0)
    #    print("Exporting chunk{0}.mp3.".format(i))
    #    normalized_chunk.export(
    #        ".//chunk{0}.wav".format(i),
    #        bitrate = "192k",
    #        format = "wav"
    #    )
    return ([ audiosegment_to_librosawav(c) for c in chunks ], song.frame_rate)
  100. def non_silent_chunks(song):
  101. #song = AudioSegment.from_wav("recording.wav")
  102. return detect_nonsilent(song, min_silence_len=min_silence_len, silence_thresh=-50)
  103. def audiosegment_to_librosawav(audiosegment):
  104. channel_sounds = audiosegment.split_to_mono()
  105. samples = [s.get_array_of_samples() for s in channel_sounds]
  106. fp_arr = np.array(samples).T.astype(np.float32)
  107. fp_arr /= np.iinfo(samples[0].typecode).max
  108. fp_arr = fp_arr.reshape(-1)
  109. return fp_arr
  110. # sr = samples / second
  111. def millisecond_to_samples(ms, sr):
  112. return round((ms / 1000) * sr)
  113. def samples_to_millisecond(samples, sr):
  114. return (samples / sr) * 1000
  115. def samples_to_time(samples, sr):
  116. return ms_to_time(samples_to_millisecond(samples, sr))
  117. def ms_to_time(ms):
  118. secs = ms / 1000
  119. return "{0}:{1:.4f}".format(math.floor(secs / 60), secs % 60)
  120. def seg_is_speech(seg):
  121. f = lambda x: int(32768 * x)
  122. x = np.vectorize(f)(seg)
  123. pcm_data = x.tobytes()
  124. speeches = 0
  125. total = 0
  126. offset = 0
  127. n = int(sr * (frame_duration_ms / 1000.0) * 2)
  128. duration = (float(n) / sr) / 2.0
  129. while offset + n < len(pcm_data):
  130. frame = pcm_data[offset:(offset+n)]
  131. if vad.is_speech(frame, sr):
  132. speeches += 1
  133. offset = offset + n
  134. total += 1
  135. return speeches / total
  136. def calculate_best_offset(mfcc_ref, mfcc_seg, sr):
  137. return librosa.segment.cross_similarity(mfcc_seg, mfcc_ref, mode='affinity', metric='cosine')
def detect_lines(img, duration_x, duration_y, plot_result=False):
    """Find diagonal line segments in a similarity matrix via Hough transform.

    Parameters
    ----------
    img : 2-D array with values in [0, 1] (a cross-similarity matrix).
    duration_x, duration_y : durations (in samples) represented by the
        matrix's x and y axes; used to map pixels back to sample offsets.
    plot_result : if True, show the detected lines with OpenCV.

    Returns
    -------
    (x_min * scale_x, offsets): the leftmost accepted line start converted
    to samples along x, and each accepted line's y-position extrapolated
    back to x_min, converted to samples along y.
    """
    # Invert and scale to an 8-bit grayscale image for OpenCV.
    gray = np.vectorize(int)((1-img) * 255).astype('uint8')
    img = cv2.cvtColor(gray, cv2.COLOR_GRAY2BGR)
    # Blur before edge detection to suppress speckle.
    kernel_size = 5
    blur_gray = cv2.GaussianBlur(gray, (kernel_size, kernel_size), 0)
    low_threshold = 50
    high_threshold = 150
    edges = cv2.Canny(blur_gray, low_threshold, high_threshold)
    rho = 1  # distance resolution in pixels of the Hough grid
    theta = np.pi / 180  # angular resolution in radians of the Hough grid
    threshold = 15  # minimum number of votes (intersections in Hough grid cell)
    min_line_length = 50  # minimum number of pixels making up a line
    max_line_gap = 20  # maximum gap in pixels between connectable line segments
    if plot_result:
        line_image = np.copy(img) * 0  # blank image to draw lines on
    # Probabilistic Hough on the edge image; "lines" holds endpoints of
    # the detected line segments.
    lines = cv2.HoughLinesP(edges, rho, theta, threshold, np.array([]),
    min_line_length, max_line_gap)
    width, height = img.shape[1], img.shape[0]
    scale_x = duration_x / width
    scale_y = duration_y / height
    slope = 1
    # A repetition at equal playback speed appears as a line of this
    # slope in pixel coordinates.
    expected_slope = scale_x / scale_y
    ls = []
    offsets = []
    xs = []
    if lines is not None:
        for line in lines:
            for x1,y1,x2,y2 in line:
                # y is measured from the top in image coordinates;
                # 42 is a sentinel slope for vertical segments.
                slope = (y2-y1)/(x2-x1) if x2 != x1 else 42
                # Keep only segments whose slope matches the expected one.
                if abs(slope - expected_slope) < 0.15:
                    y = y1
                    y0 = (y - x1 * slope)
                    if plot_result:
                        cv2.line(line_image,(x1, y1),(x2,y2),(255,0,0),5)
                        cv2.putText(img, "{:.2f}".format(slope), (x1, y1), fontFace=cv2.FONT_HERSHEY_COMPLEX, fontScale=2,color=(0, 0, 255))
                    ls.append((x1, y1, slope))
    # Leftmost accepted line start; 42 is a sentinel (anything > 10)
    # when no lines were accepted.
    x_min = min(ls, key=lambda a: a[0])[0] if len(ls) > 0 else 42
    # Extrapolate every line back to x_min and convert to samples.
    offsets = [ (y1 + (x_min - x1)*slope) * scale_y for x1, y1, slope in ls ]
    if plot_result:
        for x1, y1, slope in ls:
            y = y1 + (x_min -x1)*slope
        lines_edges = cv2.addWeighted(img, 0.8, line_image, 1, 0)
        lines_edges_resized = cv2.resize(lines_edges, (int(1024 * duration_x / duration_y ), 1024))
        cv2.imshow("lines", lines_edges_resized)
        cv2.waitKey(0)
    return (x_min*scale_x, offsets)
  215. def map2d(x, y, f):
  216. n_x = len(x)
  217. n_y = len(y)
  218. res = np.zeros((n_x, n_y))
  219. for i in range(n_x):
  220. for j in range(n_y):
  221. res[i,j] = f(x[i], y[j])
  222. return res
def find_repetition(mfcc_ref, seg, sr, hop_length, sentence_timestamps, plot_result=False):
    """Look for a re-occurrence of *seg* inside the reference MFCC window.

    Parameters
    ----------
    mfcc_ref : MFCC matrix of the reference span.
    seg : audio samples of the candidate window.
    sr : sample rate.
    hop_length : MFCC hop length used for both MFCC computations.
    sentence_timestamps : sentence start offsets in ms, relative to seg.
    plot_result : if True, show intermediate matrices.

    Returns (start_ms_within_seg, matched_sentence_start_ms_or_None).
    NOTE: reads the module-level `n_mfcc` and `min_silence_len`.
    """
    mfcc_seg = librosa.feature.mfcc(y=seg, sr=sr, hop_length=hop_length, n_mfcc=n_mfcc)[1:,:]
    xsim = calculate_best_offset(mfcc_ref, mfcc_seg, sr)
    # x axis of xsim spans the segment; y axis spans the reference window
    # (mfcc frame count * hop_length samples).
    x_min, offsets = detect_lines(xsim, len(seg), mfcc_ref.shape[1] * hop_length, plot_result=plot_result)
    found_starts = sorted([ samples_to_millisecond(y0, sr) for y0 in offsets ])
    def f(ts, start):
        # Distance between a known sentence start and a detected line start.
        return abs(ts - start)
    closest = map2d(sentence_timestamps, found_starts, f)
    if plot_result:
        plt.imshow(closest)
        plt.show()
    latest = None
    # Keep the last sentence start that lies close enough to any
    # detected repetition start (within half the silence threshold).
    for i, row in enumerate(closest):
        if len(row) == 0:
            continue
        if min(row) < min_silence_len / 2:
            latest = sentence_timestamps[i]
    return (samples_to_millisecond(x_min, sr), latest)
  241. def samples_to_hops(samples, hop_length):
  242. return round(samples / hop_length)
  243. def hops_to_samples(hops, hop_length):
  244. return round(hop_length * hops)
  245. def cont_find_repetitions(y, sr, hop_length, sentence_timestamps):
  246. assert sorted(sentence_timestamps, key=lambda t: t[0]) == sentence_timestamps
  247. #print(y.shape)
  248. mfcc = librosa.feature.mfcc(y=y, sr=sr, hop_length=hop_length, n_mfcc=n_mfcc)[1:,:]
  249. step_length_ms = 200
  250. step_length_samples = millisecond_to_samples(step_length_ms, sr)
  251. window_length_ms = 1500
  252. window_length_samples = millisecond_to_samples(window_length_ms, sr)
  253. ref_window_length_ms = 20*1000 # 10 sekunden
  254. ref_window_length_samples = millisecond_to_samples(ref_window_length_ms, sr)
  255. ref_window_length_hops = samples_to_hops(ref_window_length_samples, hop_length)
  256. offset = 0
  257. available_ts = sentence_timestamps
  258. last_sentence_end = 0
  259. deletion_suggestions = []
  260. while offset + step_length_samples < len(y) and len(available_ts) > 0:
  261. offset_ms = samples_to_millisecond(offset, sr)
  262. #print(ms_to_time(offset_ms), file=sys.stderr)
  263. if offset_ms < available_ts[0][0] and offset_ms >= last_sentence_end:
  264. offset += step_length_samples
  265. continue
  266. seg = y[ offset : offset + window_length_samples ]
  267. # no longer needed since skipping based on sentence timestamps?
  268. #if seg_is_speech(seg) < 0.5:
  269. # offset += step_length_samples
  270. # continue
  271. relevant_start = offset_ms
  272. mfcc_window = mfcc[:,samples_to_hops(offset, hop_length) : samples_to_hops(offset, hop_length) + ref_window_length_hops]
  273. x_offset_ms, ts_ms = find_repetition(mfcc_window,
  274. seg,
  275. sr,
  276. hop_length,
  277. [ t[0] - offset_ms for t in available_ts ])
  278. if ts_ms is not None and x_offset_ms < step_length_ms:
  279. print("delete from {0} to {1}".format(samples_to_time(offset + millisecond_to_samples(x_offset_ms, sr), sr), ms_to_time(offset_ms + ts_ms)))
  280. deletion_suggestions.append((offset_ms + x_offset_ms, offset_ms + ts_ms))
  281. #print("window {0} - {1} is repeated at: {2}".format(samples_to_time(offset, sr), samples_to_time(offset + window_length_samples, sr), ms_to_time(ts_ms)))
  282. offset += step_length_samples
  283. if offset_ms + step_length_ms > available_ts[0][0]:
  284. last_sentence_end = available_ts[0][1]
  285. available_ts = available_ts[1:]
  286. #available_ts = [t for t in ts_non_sil_ms if t[0] > offset_ms ]
  287. deletions = []
  288. cur_deletion = None
  289. for sugg in deletion_suggestions:
  290. if cur_deletion is None:
  291. cur_deletion = [sugg]
  292. else:
  293. if sugg[0] - cur_deletion[-1][0] < 250:
  294. cur_deletion.append(sugg)
  295. else:
  296. deletions.append(cur_deletion)
  297. cur_deletion = [sugg]
  298. deletions = [(np.mean([d[0] for d in ds]), np.max([d[1] for d in ds])) for ds in deletions]
  299. for n, d in enumerate(deletions):
  300. offs = [abs(d[0]-ts[0]) for ts in sentence_timestamps]
  301. i = np.argmin(offs)
  302. if offs[i] < 150:
  303. deletions[n] = (sentence_timestamps[i][0], d[1])
  304. else:
  305. deletions[n] = (d[0], d[1])
  306. return deletions
  307. def make_widths_equal(fig, rect, ax1, ax2, ax3, pad):
  308. # pad in inches
  309. divider = VBoxDivider(
  310. fig, rect,
  311. horizontal=[Size.AxesX(ax1), Size.Scaled(1), Size.AxesX(ax2), Size.Scaled(1), Size.AxesX(ax3)],
  312. vertical=[Size.AxesY(ax1), Size.Fixed(pad), Size.AxesY(ax2), Size.Fixed(pad), Size.AxesY(ax3)])
  313. ax1.set_axes_locator(divider.new_locator(0))
  314. ax2.set_axes_locator(divider.new_locator(2))
  315. ax3.set_axes_locator(divider.new_locator(4))
if __name__ == '__main__':
    # Script entry point: load the recording, find repeated takes, and
    # print deletion suggestions as Audacity-style label lines.
    # `vad`, `sr` and `n_mfcc` become the module globals that
    # seg_is_speech / find_repetition / cont_find_repetitions read.
    vad = webrtcvad.Vad()
    hop_length = 128
    n_mfcc = 42
    fp = "hard_pieces.wav"
    print("loading file ...")
    # Mono, resampled to 32 kHz. NOTE(review): presumably chosen to
    # match a webrtcvad-supported rate -- confirm.
    y, sr = librosa.load(fp, mono=True, sr=32000)
    print("calculating mfcc ...")
    mfcc = librosa.feature.mfcc(y=y, sr=sr, hop_length=hop_length, n_mfcc=n_mfcc)[1:,:]
    song = AudioSegment.from_wav(fp)
    mf_w = mfcc.shape[1]
    l = y.shape[0]
    # Sanity check: samples per MFCC frame (should be ~hop_length).
    print(l / mf_w)
    ts_non_sil_ms = non_silent_chunks(song)
    dels = cont_find_repetitions(y, sr, hop_length, ts_non_sil_ms)
    # Audacity label-track format: start<TAB>end<TAB>label, in seconds.
    for d in dels:
        print("{0}\t{1}\tdelete".format(d[0]/1000, d[1]/1000))
    fp_segment = "segment.wav"
    # Loaded for manual experiments below; currently unused by the
    # active code path.
    seg, sr_seg = librosa.load(fp_segment, mono=True, sr=32000)
    print("detecting lines ...")
    # NOTE(review): the remainder of the original script consisted of
    # commented-out experiments (autocorrelation, recurrence/cross-
    # similarity plots, manual find_repetition calls, DTW and xcorr
    # comparisons of out000.wav/out004.wav).