# en_de/generate.py
# Generates German/English vocabulary lessons: a PNG card per word,
# espeak TTS audio per language, a concatenated lesson mp3, and an
# mp4 video of the cards timed to the audio.
import sys
import cv2
from PIL import ImageFont, Image, ImageDraw
import csv
import os
from shutil import copyfile
from time import sleep, time
from pydub import AudioSegment

# Pause lengths (ms) inserted between and after spoken words.
SHORT_SILENCE = 500
LONG_SILENCE = 1000

# Video frame rate (fps) used for rendering and audio round-off.
FRAMES = 24

# Output directories (created below if missing).
SOUNDS_DIR = "sounds"
IMAGES_DIR = "images"
AUDIO_LESSONS_DIR = "lessons"
VIDEOS_DIR = "videos"

SOUND_CACHE_FPATH = 'sound_cache.csv'
WORDS_FPATH = 'words.csv'

for d in [SOUNDS_DIR, IMAGES_DIR, AUDIO_LESSONS_DIR, VIDEOS_DIR]:
    os.makedirs(d, exist_ok=True)

# Multiplier applied to the retry sleep in gen_tts after each failure.
EXPONENTIAL_BACKOFF = 1.5
# espeak voice name per language code.
LANG_REGIONS = {
    'en': 'en-US',
    'de': 'de-de',
}
# In-memory cache: (word, lang) -> wordid, mirrored in SOUND_CACHE_FPATH.
SOUND_CACHE = {}
def load_sound_cache():
    """Populate SOUND_CACHE from SOUND_CACHE_FPATH (wordid, lang, word rows).

    Bug fix: the file was previously opened with mode ``'w+'``, which
    truncates it before reading — so the cache was erased on every run.
    Open read-only instead, and tolerate a missing cache file.
    """
    try:
        with open(SOUND_CACHE_FPATH, 'r') as csvFile:
            for row in csv.reader(csvFile):
                if len(row) < 3:
                    continue  # skip blank/short lines left by append writes
                wordid, lang, word = row[0], row[1], row[2]
                SOUND_CACHE[(word, lang)] = wordid
    except FileNotFoundError:
        pass  # first run: no cache yet, start empty
def get_cached_sound(word, lang):
    """Return the cached wordid for (word, lang), or None when not cached."""
    return SOUND_CACHE.get((word, lang))
def gen_speech(phrase, lang, filepath):
    """Synthesize `phrase` to an mp3 at `filepath`.

    espeak writes a wav to stdout, ffmpeg converts it to mp3, and the
    intermediate wav is deleted.
    """
    import shlex  # local import: only needed for shell quoting here

    # os.path.splitext is safe when a directory name contains a dot,
    # unlike the previous filepath.split('.')[0].
    wav_fpath = os.path.splitext(filepath)[0] + '.wav'
    # shlex.quote the phrase so apostrophes or shell metacharacters in a
    # word cannot break out of (or inject into) the shell command.
    # +f1 -k1 -s100
    cmd = 'espeak -v{lang} {phrase} --stdout > {wav_fpath} && ffmpeg -y -i {wav_fpath} {filepath} && rm {wav_fpath}'
    cmd = cmd.format(lang=lang, phrase=shlex.quote(phrase), filepath=filepath, wav_fpath=wav_fpath)
    print(cmd)
    os.system(cmd)
def generate_sound(word, lang, wordid):
    """Ensure SOUNDS_DIR holds <wordid>_<lang>.mp3 for the given word.

    A previously synthesized word is reused from the cache (copying the
    file under the new wordid when needed); otherwise the sound is
    generated and recorded in the cache file.
    """
    lang_region = LANG_REGIONS.get(lang)
    cached_wordid = get_cached_sound(word, lang)

    if cached_wordid:
        print(" Found in cache: {}".format(word))
        if cached_wordid == wordid:
            return
        # TODO: this is duplicating space, but my brain is fried, should be mapping
        src = os.path.join(SOUNDS_DIR, "{}_{}.mp3".format(cached_wordid, lang))
        dst = os.path.join(SOUNDS_DIR, "{}_{}.mp3".format(wordid, lang))
        copyfile(src, dst)
        return

    target = os.path.join(SOUNDS_DIR, "{}_{}.mp3".format(wordid, lang))
    started_at = time()
    gen_speech(word, lang_region, target)
    elapsed = time() - started_at
    print(" Generated ({} - {} s): {}".format(lang, elapsed, word))

    # Record the new sound both in memory and in the on-disk cache CSV.
    SOUND_CACHE[(word, lang)] = wordid
    with open(SOUND_CACHE_FPATH, 'a') as cache_file:
        csv.writer(cache_file).writerow([wordid, lang, word])
# umlauts: ä ö ü Ä Ö
def gen_tts(wordids=None):
    """Generate the image and de/en sounds for each requested wordid.

    Reads WORDS_FPATH rows of (wordid, german, english) and processes only
    rows whose wordid is in `wordids`. TTS failures are retried forever
    with exponential backoff.

    Bug fix: a leftover ``import pdb; pdb.set_trace()`` fired on every
    third failed attempt, halting unattended runs at a debugger prompt;
    it has been removed.
    """
    if not wordids:
        wordids = list()
    wanted = set(wordids)  # O(1) membership test per CSV row
    with open(WORDS_FPATH, 'r') as csvFile:
        reader = csv.reader(csvFile)
        for row in reader:
            wordid, german, english = row[0], row[1], row[2]
            if wordid not in wanted:
                continue
            print("Generating image..")
            image_gen(wordid, german, english)
            print("Generating sound {}: {}, {}".format(wordid, german, english))
            backoff = 1
            while True:
                try:
                    for word, lang in [(german, 'de'), (english, 'en')]:
                        generate_sound(word, lang, wordid)
                except Exception as e:
                    backoff = backoff * EXPONENTIAL_BACKOFF
                    print("Sleeping for {}. Error: {}.".format(backoff, e))
                    sleep(backoff)
                    continue
                break
def filter_words(contains):
    """Return the wordids of every WORDS_FPATH row with `contains` in any column."""
    with open(WORDS_FPATH, 'r') as csv_file:
        return [row[0] for row in csv.reader(csv_file) if contains in row]
def find_roundoff_silence(this_phrase_duration):
    # Return extra silence (in ms) to pad a phrase so its total length lands
    # on a "round" duration for the video track; 0 if no fit is found.
    #
    # NOTE(review): (incr * FRAMES) % FRAMES == 0 only holds when `incr` is a
    # whole number of seconds, so this pads up to the next integer second,
    # not to the next 1/FRAMES frame boundary — confirm that is intended.
    # NOTE(review): 0.1 * i accumulates float error (e.g. 3.3 + 0.7 ->
    # 3.9999...), so the modulo test can miss and fall through to `return 0`.
    length_increment = 0.1
    for i in range(0, 10):
        incr = round(this_phrase_duration, 1) + length_increment * i
        if (incr * FRAMES) % FRAMES == 0:
            return round((incr - this_phrase_duration) * 1000)
    return 0
def concatenate(filename="lesson1", wordids=None):
    """Build one lesson: generate assets, join the audio, render the video.

    For each wordid the audio pattern is: de, pause, en, pause, de, long
    pause, plus round-off silence so the phrase aligns with video timing.

    Consistency fix: sound paths now go through SOUNDS_DIR/os.path.join
    like the rest of the file, instead of a hard-coded "sounds/" prefix.
    """
    if not wordids:
        wordids = list()
    print("Concatenating {} sounds: {}".format(len(wordids), wordids))
    lessons = AudioSegment.silent(duration=1)
    silence = AudioSegment.silent(duration=SHORT_SILENCE)
    long_silence = AudioSegment.silent(duration=LONG_SILENCE)
    gen_tts(wordids=wordids)

    images_durations = []  # (wordid, phrase duration in seconds) per word
    for wordid in wordids:
        start = time()
        sound_de = AudioSegment.from_mp3(os.path.join(SOUNDS_DIR, "{}_de.mp3".format(wordid)))
        sound_en = AudioSegment.from_mp3(os.path.join(SOUNDS_DIR, "{}_en.mp3".format(wordid)))
        this_phrase = sound_de + silence + sound_en + silence + sound_de + long_silence
        # Pad so the phrase length lines up with the video frame timing.
        this_phrase_duration = this_phrase.duration_seconds
        roundoff_silence = find_roundoff_silence(this_phrase_duration)
        this_phrase = this_phrase + AudioSegment.silent(duration=roundoff_silence)
        images_durations.append((wordid, this_phrase.duration_seconds))
        lessons = lessons + this_phrase
        duration = time() - start
        print("Concatenated {} - {}s".format(wordid, duration))

    lessons.export(os.path.join(AUDIO_LESSONS_DIR, "{}.mp3".format(filename)), format="mp3")
    create_video(filename, images_durations)
def image_gen(wordid, de_text, en_text):
    """Render a black card: German text above, English below a red divider.

    Shrinks the font until both texts fit within the margins.

    Bug fix: for very long text the shrink loop could drive font_size to
    zero or below, making ImageFont.truetype raise; a minimum size floor
    now stops the shrinking (the text may then overflow, but rendering
    still succeeds).
    """
    width, height = 800, 450
    margin, initial_font_size, font_step = 50, 120, 4
    min_font_size = font_step  # floor so font_size can never reach <= 0
    filename = os.path.join(IMAGES_DIR, "{}.png".format(wordid))
    image = Image.new(mode="RGB", size=(width, height), color="black")
    draw = ImageDraw.Draw(image)
    font_size = initial_font_size
    while True:
        # NOTE(review): ImageDraw.textsize was removed in Pillow 10; switch
        # to textbbox if the Pillow dependency is ever upgraded.
        fnt = ImageFont.truetype('arial.ttf', font_size)
        de_w, de_h = draw.textsize(de_text, font=fnt)
        en_w, en_h = draw.textsize(en_text, font=fnt)
        too_wide = de_w + 2 * margin > width or en_w + 2 * margin > width
        if too_wide and font_size > min_font_size:
            font_size -= font_step
            continue
        break
    draw.text(((width - de_w) / 2, height / 2 - margin - de_h), de_text, font=fnt, fill=(255, 255, 255))
    draw.text(((width - en_w) / 2, height / 2 + margin), en_text, font=fnt, fill=(255, 255, 255))
    draw.line((0, height / 2, width, height / 2), fill=(255, 0, 0), width=2)
    image.save(filename)
def create_video(lesson_name, images_durations):
    """Render VIDEOS_DIR/<lesson_name>.mp4 from the word cards and lesson audio.

    images_durations: list of (wordid, duration_seconds); each card is shown
    for its audio duration, then ffmpeg muxes in the lesson mp3.
    """
    if not images_durations:
        # Guard: images_durations[0] below would raise IndexError.
        print("No images to render for {}; skipping video.".format(lesson_name))
        return
    tmp_video_name = '{}_tmp.mp4'.format(lesson_name)
    tmp_video_filepath = os.path.join(VIDEOS_DIR, tmp_video_name)
    fourcc = cv2.VideoWriter_fourcc(*'MP4V')  # define the video codec
    # The first card fixes the frame geometry for the whole video.
    frame = cv2.imread(os.path.join(IMAGES_DIR, "{}.png".format(images_durations[0][0])))
    height, width, layers = frame.shape
    video = cv2.VideoWriter(tmp_video_filepath, fourcc, FRAMES, (width, height))
    for image, image_duration in images_durations:
        # Read each still once, not once per frame (it was re-read
        # image_frames times before).
        still = cv2.imread(os.path.join(IMAGES_DIR, "{}.png".format(image)))
        image_frames = round(image_duration * FRAMES)
        for _ in range(image_frames):
            video.write(still)
    cv2.destroyAllWindows()
    video.release()
    # Mux the already-encoded streams without re-encoding (-c copy).
    os.system("ffmpeg -y -i {video_name} -i {audio_name} -c:v copy -c:a copy {video_output}".format(
        video_name=tmp_video_filepath,
        audio_name=os.path.join(AUDIO_LESSONS_DIR, "{}.mp3".format(lesson_name)),
        video_output=os.path.join(VIDEOS_DIR, "{}.mp4".format(lesson_name)),
    ))
    os.remove(tmp_video_filepath)
ERROR_MSG = "First argument needs to be the lesson to be generated"

if __name__ == "__main__":
    load_sound_cache()
    # Expect exactly one argument: a lesson number between 1 and 99.
    if len(sys.argv) != 2 or not sys.argv[1].isdigit():
        print(ERROR_MSG)
        sys.exit(1)  # sys.exit instead of bare exit(): works without the site module
    lesson = int(sys.argv[1])
    if not 0 < lesson < 100:
        print(ERROR_MSG)
        sys.exit(1)
    print(lesson)
    lesson = "lesson{:02d}".format(lesson)
    concatenate(filename=lesson,
                wordids=filter_words(lesson))