# en_de/generate.py
# Generates German/English vocabulary lessons: a PNG card per word,
# espeak TTS audio per language, a concatenated lesson mp3, and an
# mp4 video of the cards timed to the audio.
import sys
import cv2
from PIL import ImageFont, Image, ImageDraw
import csv
import os
from shutil import copyfile
from time import sleep, time
from pydub import AudioSegment

# Pause lengths (ms) inserted between and after spoken words.
SHORT_SILENCE = 500
LONG_SILENCE = 1000

# Video frame rate (fps) used for rendering and audio round-off.
FRAMES = 24

# Output directories (created below if missing).
SOUNDS_DIR = "sounds"
IMAGES_DIR = "images"
AUDIO_LESSONS_DIR = "lessons"
VIDEOS_DIR = "videos"

SOUND_CACHE_FPATH = 'sound_cache.csv'
WORDS_FPATH = 'words.csv'

for d in [SOUNDS_DIR, IMAGES_DIR, AUDIO_LESSONS_DIR, VIDEOS_DIR]:
    os.makedirs(d, exist_ok=True)

# Multiplier applied to the retry sleep in gen_tts after each failure.
EXPONENTIAL_BACKOFF = 1.5
# espeak voice name per language code.
LANG_REGIONS = {
    'en': 'en-US',
    'de': 'de-de',
}
# In-memory cache: (word, lang) -> wordid, mirrored in SOUND_CACHE_FPATH.
SOUND_CACHE = {}
def load_sound_cache():
    """Populate SOUND_CACHE from SOUND_CACHE_FPATH (wordid, lang, word rows).

    Bug fix: the file was previously opened with mode ``'w+'``, which
    truncates it before reading — so the cache was erased on every run.
    Open read-only instead, and tolerate a missing cache file.
    """
    try:
        with open(SOUND_CACHE_FPATH, 'r') as csvFile:
            for row in csv.reader(csvFile):
                if len(row) < 3:
                    continue  # skip blank/short lines left by append writes
                wordid, lang, word = row[0], row[1], row[2]
                SOUND_CACHE[(word, lang)] = wordid
    except FileNotFoundError:
        pass  # first run: no cache yet, start empty
def get_cached_sound(word, lang):
    """Return the cached wordid for (word, lang), or None when not cached."""
    return SOUND_CACHE.get((word, lang))
def gen_speech(phrase, lang, filepath):
    """Synthesize `phrase` to an mp3 at `filepath`.

    espeak writes a wav to stdout, ffmpeg converts it to mp3, and the
    intermediate wav is deleted.
    """
    import shlex  # local import: only needed for shell quoting here

    # os.path.splitext is safe when a directory name contains a dot,
    # unlike the previous filepath.split('.')[0].
    wav_fpath = os.path.splitext(filepath)[0] + '.wav'
    # shlex.quote the phrase so apostrophes or shell metacharacters in a
    # word cannot break out of (or inject into) the shell command.
    # +f1 -k1 -s100
    cmd = 'espeak -v{lang} {phrase} --stdout > {wav_fpath} && ffmpeg -y -i {wav_fpath} {filepath} && rm {wav_fpath}'
    cmd = cmd.format(lang=lang, phrase=shlex.quote(phrase), filepath=filepath, wav_fpath=wav_fpath)
    print(cmd)
    os.system(cmd)
def generate_sound(word, lang, wordid):
    """Ensure SOUNDS_DIR holds <wordid>_<lang>.mp3 for the given word.

    A previously synthesized word is reused from the cache (copying the
    file under the new wordid when needed); otherwise the sound is
    generated and recorded in the cache file.
    """
    lang_region = LANG_REGIONS.get(lang)
    cached_wordid = get_cached_sound(word, lang)

    if cached_wordid:
        print(" Found in cache: {}".format(word))
        if cached_wordid == wordid:
            return
        # TODO: this is duplicating space, but my brain is fried, should be mapping
        src = os.path.join(SOUNDS_DIR, "{}_{}.mp3".format(cached_wordid, lang))
        dst = os.path.join(SOUNDS_DIR, "{}_{}.mp3".format(wordid, lang))
        copyfile(src, dst)
        return

    target = os.path.join(SOUNDS_DIR, "{}_{}.mp3".format(wordid, lang))
    started_at = time()
    gen_speech(word, lang_region, target)
    elapsed = time() - started_at
    print(" Generated ({} - {} s): {}".format(lang, elapsed, word))

    # Record the new sound both in memory and in the on-disk cache CSV.
    SOUND_CACHE[(word, lang)] = wordid
    with open(SOUND_CACHE_FPATH, 'a') as cache_file:
        csv.writer(cache_file).writerow([wordid, lang, word])
# umlauts: ä ö ü Ä Ö
def gen_tts(wordids=None):
    """Generate the image and de/en sounds for each requested wordid.

    Reads WORDS_FPATH rows of (wordid, german, english) and processes only
    rows whose wordid is in `wordids`. TTS failures are retried forever
    with exponential backoff.

    Bug fix: a leftover ``import pdb; pdb.set_trace()`` fired on every
    third failed attempt, halting unattended runs at a debugger prompt;
    it has been removed.
    """
    if not wordids:
        wordids = list()
    wanted = set(wordids)  # O(1) membership test per CSV row
    with open(WORDS_FPATH, 'r') as csvFile:
        reader = csv.reader(csvFile)
        for row in reader:
            wordid, german, english = row[0], row[1], row[2]
            if wordid not in wanted:
                continue
            print("Generating image..")
            image_gen(wordid, german, english)
            print("Generating sound {}: {}, {}".format(wordid, german, english))
            backoff = 1
            while True:
                try:
                    for word, lang in [(german, 'de'), (english, 'en')]:
                        generate_sound(word, lang, wordid)
                except Exception as e:
                    backoff = backoff * EXPONENTIAL_BACKOFF
                    print("Sleeping for {}. Error: {}.".format(backoff, e))
                    sleep(backoff)
                    continue
                break
def filter_words(contains):
    """Return the wordids of every WORDS_FPATH row with `contains` in any column."""
    with open(WORDS_FPATH, 'r') as csv_file:
        return [row[0] for row in csv.reader(csv_file) if contains in row]
def find_roundoff_silence(this_phrase_duration):
    # Return extra silence (in ms) to pad a phrase so its total length lands
    # on a "round" duration for the video track; 0 if no fit is found.
    #
    # NOTE(review): (incr * FRAMES) % FRAMES == 0 only holds when `incr` is a
    # whole number of seconds, so this pads up to the next integer second,
    # not to the next 1/FRAMES frame boundary — confirm that is intended.
    # NOTE(review): 0.1 * i accumulates float error (e.g. 3.3 + 0.7 ->
    # 3.9999...), so the modulo test can miss and fall through to `return 0`.
    length_increment = 0.1
    for i in range(0, 10):
        incr = round(this_phrase_duration, 1) + length_increment * i
        if (incr * FRAMES) % FRAMES == 0:
            return round((incr - this_phrase_duration) * 1000)
    return 0
def concatenate(filename="lesson1", wordids=None):
    """Build one lesson: generate assets, join the audio, render the video.

    For each wordid the audio pattern is: de, pause, en, pause, de, long
    pause, plus round-off silence so the phrase aligns with video timing.

    Consistency fix: sound paths now go through SOUNDS_DIR/os.path.join
    like the rest of the file, instead of a hard-coded "sounds/" prefix.
    """
    if not wordids:
        wordids = list()
    print("Concatenating {} sounds: {}".format(len(wordids), wordids))
    lessons = AudioSegment.silent(duration=1)
    silence = AudioSegment.silent(duration=SHORT_SILENCE)
    long_silence = AudioSegment.silent(duration=LONG_SILENCE)
    gen_tts(wordids=wordids)

    images_durations = []  # (wordid, phrase duration in seconds) per word
    for wordid in wordids:
        start = time()
        sound_de = AudioSegment.from_mp3(os.path.join(SOUNDS_DIR, "{}_de.mp3".format(wordid)))
        sound_en = AudioSegment.from_mp3(os.path.join(SOUNDS_DIR, "{}_en.mp3".format(wordid)))
        this_phrase = sound_de + silence + sound_en + silence + sound_de + long_silence
        # Pad so the phrase length lines up with the video frame timing.
        this_phrase_duration = this_phrase.duration_seconds
        roundoff_silence = find_roundoff_silence(this_phrase_duration)
        this_phrase = this_phrase + AudioSegment.silent(duration=roundoff_silence)
        images_durations.append((wordid, this_phrase.duration_seconds))
        lessons = lessons + this_phrase
        duration = time() - start
        print("Concatenated {} - {}s".format(wordid, duration))

    lessons.export(os.path.join(AUDIO_LESSONS_DIR, "{}.mp3".format(filename)), format="mp3")
    create_video(filename, images_durations)
def image_gen(wordid, de_text, en_text):
    """Render a black card: German text above, English below a red divider.

    Shrinks the font until both texts fit within the margins.

    Bug fix: for very long text the shrink loop could drive font_size to
    zero or below, making ImageFont.truetype raise; a minimum size floor
    now stops the shrinking (the text may then overflow, but rendering
    still succeeds).
    """
    width, height = 800, 450
    margin, initial_font_size, font_step = 50, 120, 4
    min_font_size = font_step  # floor so font_size can never reach <= 0
    filename = os.path.join(IMAGES_DIR, "{}.png".format(wordid))
    image = Image.new(mode="RGB", size=(width, height), color="black")
    draw = ImageDraw.Draw(image)
    font_size = initial_font_size
    while True:
        # NOTE(review): ImageDraw.textsize was removed in Pillow 10; switch
        # to textbbox if the Pillow dependency is ever upgraded.
        fnt = ImageFont.truetype('arial.ttf', font_size)
        de_w, de_h = draw.textsize(de_text, font=fnt)
        en_w, en_h = draw.textsize(en_text, font=fnt)
        too_wide = de_w + 2 * margin > width or en_w + 2 * margin > width
        if too_wide and font_size > min_font_size:
            font_size -= font_step
            continue
        break
    draw.text(((width - de_w) / 2, height / 2 - margin - de_h), de_text, font=fnt, fill=(255, 255, 255))
    draw.text(((width - en_w) / 2, height / 2 + margin), en_text, font=fnt, fill=(255, 255, 255))
    draw.line((0, height / 2, width, height / 2), fill=(255, 0, 0), width=2)
    image.save(filename)
def create_video(lesson_name, images_durations):
    """Render VIDEOS_DIR/<lesson_name>.mp4 from the word cards and lesson audio.

    images_durations: list of (wordid, duration_seconds); each card is shown
    for its audio duration, then ffmpeg muxes in the lesson mp3.
    """
    if not images_durations:
        # Guard: images_durations[0] below would raise IndexError.
        print("No images to render for {}; skipping video.".format(lesson_name))
        return
    tmp_video_name = '{}_tmp.mp4'.format(lesson_name)
    tmp_video_filepath = os.path.join(VIDEOS_DIR, tmp_video_name)
    fourcc = cv2.VideoWriter_fourcc(*'MP4V')  # define the video codec
    # The first card fixes the frame geometry for the whole video.
    frame = cv2.imread(os.path.join(IMAGES_DIR, "{}.png".format(images_durations[0][0])))
    height, width, layers = frame.shape
    video = cv2.VideoWriter(tmp_video_filepath, fourcc, FRAMES, (width, height))
    for image, image_duration in images_durations:
        # Read each still once, not once per frame (it was re-read
        # image_frames times before).
        still = cv2.imread(os.path.join(IMAGES_DIR, "{}.png".format(image)))
        image_frames = round(image_duration * FRAMES)
        for _ in range(image_frames):
            video.write(still)
    cv2.destroyAllWindows()
    video.release()
    # Mux the already-encoded streams without re-encoding (-c copy).
    os.system("ffmpeg -y -i {video_name} -i {audio_name} -c:v copy -c:a copy {video_output}".format(
        video_name=tmp_video_filepath,
        audio_name=os.path.join(AUDIO_LESSONS_DIR, "{}.mp3".format(lesson_name)),
        video_output=os.path.join(VIDEOS_DIR, "{}.mp4".format(lesson_name)),
    ))
    os.remove(tmp_video_filepath)
ERROR_MSG = "First argument needs to be the lesson to be generated"

if __name__ == "__main__":
    load_sound_cache()
    # Expect exactly one argument: a lesson number between 1 and 99.
    if len(sys.argv) != 2 or not sys.argv[1].isdigit():
        print(ERROR_MSG)
        sys.exit(1)  # sys.exit instead of bare exit(): works without the site module
    lesson = int(sys.argv[1])
    if not 0 < lesson < 100:
        print(ERROR_MSG)
        sys.exit(1)
    print(lesson)
    lesson = "lesson{:02d}".format(lesson)
    concatenate(filename=lesson,
                wordids=filter_words(lesson))