# en_de/generate.py

import sys
import cv2
from PIL import ImageFont, Image, ImageDraw
import csv
import os
from shutil import copyfile
from time import sleep, time
from pydub import AudioSegment
SHORT_SILENCE = 500
LONG_SILENCE = 1000
FRAMES = 24
SOUNDS_DIR = "sounds"
IMAGES_DIR = "images"
AUDIO_LESSONS_DIR = "lessons"
VIDEOS_DIR = "videos"
SOUND_CACHE_FPATH = 'sound_cache.csv'
WORDS_FPATH = 'words.csv'
for d in [SOUNDS_DIR, IMAGES_DIR, AUDIO_LESSONS_DIR, VIDEOS_DIR]:
    os.makedirs(d, exist_ok=True)
EXPONENTIAL_BACKOFF = 1.5
LANG_REGIONS = {
    'en': 'en-US',
    'de': 'de-de',
}
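# Expected CSV layouts (an assumption inferred from the readers/writers below;
# no sample files are shown here):
#   sound_cache.csv: wordid, lang, word      e.g. 0001,de,Haus (hypothetical row)
#   words.csv:       wordid, german, english, plus (presumably) a lesson tag
#                    column such as "lesson01" that filter_words() matches on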
SOUND_CACHE = {}
def load_sound_cache():
    # Open read-only: the original 'w+' mode truncated the cache on every run.
    if not os.path.exists(SOUND_CACHE_FPATH):
        return
    with open(SOUND_CACHE_FPATH, 'r') as csvFile:
        reader = csv.reader(csvFile)
        for row in reader:
            wordid, lang, word = row[0], row[1], row[2]
            SOUND_CACHE[(word, lang)] = wordid
def get_cached_sound(word, lang):
    wordid = SOUND_CACHE.get((word, lang))
    return wordid
def gen_speech(phrase, lang, filepath):
    # Synthesise the phrase with espeak into a temporary WAV, convert it to the
    # target format with ffmpeg, then delete the WAV.
    wav_fpath = filepath.split('.')[0] + '.wav'
    # +f1 -k1 -s100
    cmd = 'espeak -v{lang} "{phrase}" --stdout > {wav_fpath} && ffmpeg -y -i {wav_fpath} {filepath} && rm {wav_fpath}'
    cmd = cmd.format(lang=lang, phrase=phrase, filepath=filepath, wav_fpath=wav_fpath)
    print(cmd)
    os.system(cmd)
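# The shell pipeline above assumes `espeak` and `ffmpeg` are on PATH. A minimal
# alternative sketch (not used elsewhere in this script) that sidesteps shell
# quoting by passing argument lists to subprocess:
def gen_speech_subprocess(phrase, lang, filepath):
    import subprocess
    wav_fpath = os.path.splitext(filepath)[0] + '.wav'
    # espeak writes WAV to stdout with --stdout; capture it into a temp file.
    with open(wav_fpath, 'wb') as wav:
        subprocess.run(['espeak', '-v' + lang, '--stdout', phrase], stdout=wav, check=True)
    # Convert the WAV to the requested output format with ffmpeg.
    subprocess.run(['ffmpeg', '-y', '-i', wav_fpath, filepath], check=True)
    os.remove(wav_fpath)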
def generate_sound(word, lang, wordid):
    lang_region = LANG_REGIONS.get(lang)
    cached_wordid = get_cached_sound(word, lang)
    if cached_wordid:
        print(" Found in cache: {}".format(word))
        if cached_wordid != wordid:
            # TODO: this is duplicating space, but my brain is fried, should be mapping
            cached_filepath = os.path.join(SOUNDS_DIR, "{}_{}.mp3".format(cached_wordid, lang))
            word_filepath = os.path.join(SOUNDS_DIR, "{}_{}.mp3".format(wordid, lang))
            copyfile(cached_filepath, word_filepath)
    else:
        filename = "{}_{}.mp3".format(wordid, lang)
        filepath = os.path.join(SOUNDS_DIR, filename)
        start = time()
        gen_speech(word, lang_region, filepath)
        duration = time() - start
        print(" Generated ({} - {} s): {}".format(lang, duration, word))
        SOUND_CACHE[(word, lang)] = wordid
        with open(SOUND_CACHE_FPATH, 'a') as f:
            writer = csv.writer(f)
            writer.writerow([wordid, lang, word])
# umlauts: ä ö ü Ä Ö
def gen_tts(wordids=None):
    if not wordids:
        wordids = list()
    with open(WORDS_FPATH, 'r') as csvFile:
        reader = csv.reader(csvFile)
        for row in reader:
            wordid, german, english = row[0], row[1], row[2]
            if wordid not in wordids:
                continue
            print("Generating image..")
            image_gen(wordid, german, english)
            print("Generating sound {}: {}, {}".format(wordid, german, english))
            backoff, attempt = 1, 0
            while True:
                try:
                    for word, lang in [(german, 'de'), (english, 'en')]:
                        generate_sound(word, lang, wordid)
                except Exception as e:
                    backoff = backoff * EXPONENTIAL_BACKOFF
                    print("Sleeping for {}. Error: {}.".format(backoff, e))
                    if attempt % 3 == 0:
                        import pdb; pdb.set_trace()
                    attempt += 1
                    sleep(backoff)
                    continue
                break
def filter_words(contains):
    wordids = []
    with open(WORDS_FPATH, 'r') as csvFile:
        reader = csv.reader(csvFile)
        for row in reader:
            if contains in row:
                wordids.append(row[0])
    return wordids
def find_roundoff_silence(this_phrase_duration):
    # Return the padding (in ms) needed to stretch the phrase, in 0.1 s steps,
    # to a whole number of seconds, so its length maps onto an integer number
    # of video frames at FRAMES fps.
    length_increment = 0.1
    for i in range(0, 10):
        incr = round(this_phrase_duration, 1) + length_increment * i
        if (incr * FRAMES) % FRAMES == 0:
            return round((incr - this_phrase_duration) * 1000)
    return 0
def concatenate(filename="lesson1", wordids=None):
    if not wordids:
        wordids = list()
    print("Concatenating {} sounds: {}".format(len(wordids), wordids))
    lessons = AudioSegment.silent(duration=1)
    silence = AudioSegment.silent(duration=SHORT_SILENCE)
    long_silence = AudioSegment.silent(duration=LONG_SILENCE)
    gen_tts(wordids=wordids)
    images_durations = []
    for wordid in wordids:
        start = time()
        sound_de = AudioSegment.from_mp3(os.path.join(SOUNDS_DIR, "{}_de.mp3".format(wordid)))
        sound_en = AudioSegment.from_mp3(os.path.join(SOUNDS_DIR, "{}_en.mp3".format(wordid)))
        this_phrase = sound_de + silence + sound_en + silence + sound_de + long_silence
        this_phrase_duration = this_phrase.duration_seconds
        roundoff_silence = find_roundoff_silence(this_phrase_duration)
        this_phrase = this_phrase + AudioSegment.silent(duration=roundoff_silence)
        images_durations.append((wordid, this_phrase.duration_seconds))
        lessons = lessons + this_phrase
        duration = time() - start
        print("Concatenated {} - {}s".format(wordid, duration))
    lessons.export(os.path.join(AUDIO_LESSONS_DIR, "{}.mp3".format(filename)), format="mp3")
    create_video(filename, images_durations)
def image_gen(wordid, de_text, en_text):
    width, height = 800, 450
    margin, initial_font_size, font_step = 50, 120, 4
    filename = os.path.join(IMAGES_DIR, "{}.png".format(wordid))
    image = Image.new(mode="RGB", size=(width, height), color="black")
    draw = ImageDraw.Draw(image)
    # Shrink the font until both lines fit inside the horizontal margins.
    font_size = initial_font_size
    while True:
        fnt = ImageFont.truetype('arial.ttf', font_size)
        de_w, de_h = draw.textsize(de_text, font=fnt)  # textsize() requires Pillow < 10
        en_w, en_h = draw.textsize(en_text, font=fnt)
        if de_w + 2 * margin > width or en_w + 2 * margin > width:
            font_size -= font_step
            continue
        break
    draw.text(((width - de_w) / 2, height / 2 - margin - de_h), de_text, font=fnt, fill=(255, 255, 255))
    draw.text(((width - en_w) / 2, height / 2 + margin), en_text, font=fnt, fill=(255, 255, 255))
    draw.line((0, height / 2, width, height / 2), fill=(255, 0, 0), width=2)
    image.save(filename)
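# Note: ImageDraw.textsize() was removed in Pillow 10. On newer Pillow, a
# drop-in replacement sketch based on textbbox() (not wired into image_gen above):
def text_size(draw, text, font):
    # textbbox() returns (left, top, right, bottom) for text anchored at (0, 0).
    left, top, right, bottom = draw.textbbox((0, 0), text, font=font)
    return right - left, bottom - top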
def create_video(lesson_name, images_durations):
    tmp_video_name = '{}_tmp.mp4'.format(lesson_name)
    tmp_video_filepath = os.path.join(VIDEOS_DIR, tmp_video_name)
    fourcc = cv2.VideoWriter_fourcc(*'MP4V')  # define the video codec
    frame = cv2.imread(os.path.join(IMAGES_DIR, "{}.png".format(images_durations[0][0])))
    height, width, layers = frame.shape
    video = cv2.VideoWriter(tmp_video_filepath, fourcc, FRAMES, (width, height))
    # Hold each word's image on screen for as many frames as its audio lasts.
    for image, image_duration in images_durations:
        image_frames = round(image_duration * FRAMES)
        for _ in range(image_frames):
            video.write(cv2.imread(os.path.join(IMAGES_DIR, "{}.png".format(image))))
    cv2.destroyAllWindows()
    video.release()
    # Mux the silent video with the lesson audio; both streams are copied, not re-encoded.
    os.system("ffmpeg -y -i {video_name} -i {audio_name} -c:v copy -c:a copy {video_output}".format(
        video_name=tmp_video_filepath,
        audio_name=os.path.join(AUDIO_LESSONS_DIR, "{}.mp3".format(lesson_name)),
        video_output=os.path.join(VIDEOS_DIR, "{}.mp4".format(lesson_name)),
    ))
    os.remove(tmp_video_filepath)
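# Usage sketch (assumes words.csv rows carry a lesson tag such as "lesson01"
# that filter_words() can match):
#
#     python generate.py 1
#
# which writes lessons/lesson01.mp3 and videos/lesson01.mp4.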
ERROR_MSG = "First argument needs to be the lesson to be generated"
if __name__ == "__main__":
    load_sound_cache()
    if len(sys.argv) != 2:
        print(ERROR_MSG)
        sys.exit(1)
    lesson = sys.argv[1]
    if not lesson.isdigit():
        print(ERROR_MSG)
        sys.exit(1)
    lesson = int(lesson)
    if not 0 < lesson < 100:
        print(ERROR_MSG)
        sys.exit(1)
    print(lesson)
    lesson = "lesson{:02d}".format(lesson)
    concatenate(filename=lesson, wordids=filter_words(lesson))