220 lines
7.6 KiB
Python
220 lines
7.6 KiB
Python
import csv
import os
import subprocess
import sys
from shutil import copyfile
from time import sleep, time

import cv2
from PIL import ImageFont, Image, ImageDraw
from pydub import AudioSegment
|
|
|
|
# Pause lengths handed to AudioSegment.silent(duration=...) between phrases.
SHORT_SILENCE = 500
LONG_SILENCE = 1000

# Frame rate given to the video writer; also used to turn durations into
# frame counts.
FRAMES = 24

# Output directories and data files, relative to the working directory.
SOUNDS_DIR = "sounds"
IMAGES_DIR = "images"
AUDIO_LESSONS_DIR = "lessons"
VIDEOS_DIR = "videos"
SOUND_CACHE_FPATH = 'sound_cache.csv'
WORDS_FPATH = 'words.csv'

# Create every output directory up front so later writes cannot fail on a
# missing folder.
for d in (SOUNDS_DIR, IMAGES_DIR, AUDIO_LESSONS_DIR, VIDEOS_DIR):
    os.makedirs(d, exist_ok=True)

# Factor by which the retry delay grows after each failed attempt.
EXPONENTIAL_BACKOFF = 1.5

# Short language code -> voice name passed to the speech synthesizer.
LANG_REGIONS = dict(en='en-US', de='de-de')

# In-memory sound cache: (word, lang) -> id of the word owning the sound file.
SOUND_CACHE = {}
|
|
|
|
|
|
def load_sound_cache():
    """Fill ``SOUND_CACHE`` from the CSV file at ``SOUND_CACHE_FPATH``.

    Every row is expected to hold ``wordid, lang, word`` - the layout that
    ``generate_sound`` appends. Each usable row is stored in ``SOUND_CACHE``
    as ``(word, lang) -> wordid``; when a key occurs several times the last
    row wins.

    The file is opened read-only. (It used to be opened with ``'w+'``, which
    truncated it and threw the whole cache away on every start.) A missing
    cache file simply leaves ``SOUND_CACHE`` unchanged.

    Rows are skipped when they have fewer than three columns, or when the
    ``<wordid>_<lang>.mp3`` file they refer to no longer exists in
    ``SOUNDS_DIR`` - a stale entry would otherwise be treated as a cache hit.
    """
    try:
        # newline='' is what the csv module asks for on files it parses.
        csv_file = open(SOUND_CACHE_FPATH, 'r', newline='')
    except FileNotFoundError:
        # First run: nothing has been cached yet.
        return

    with csv_file:
        for row in csv.reader(csv_file):
            if len(row) < 3:
                # Blank or truncated line.
                continue
            wordid, lang, word = row[0], row[1], row[2]
            sound_path = os.path.join(SOUNDS_DIR, "{}_{}.mp3".format(wordid, lang))
            if not os.path.isfile(sound_path):
                # The sound was deleted since it was cached; force regeneration.
                continue
            SOUND_CACHE[(word, lang)] = wordid
|
|
|
|
|
|
def get_cached_sound(word, lang):
    """Look up the word id cached for *word* in language *lang*.

    Returns the value stored in ``SOUND_CACHE`` under the ``(word, lang)``
    key, or ``None`` when there is no such entry.
    """
    cache_key = (word, lang)
    return SOUND_CACHE.get(cache_key)
|
|
|
|
|
|
def gen_speech(phrase, lang, filepath):
    """Synthesize *phrase* with espeak and store the result at *filepath*.

    espeak writes WAV data to a temporary ``.wav`` file that shares
    *filepath*'s name (extension swapped); ffmpeg then converts that file to
    *filepath*, overwriting it if present. The temporary file is removed
    afterwards, whether or not the conversion succeeded.

    Both programs are started with explicit argument lists instead of a shell
    command string, so quotes, ``$`` or backticks inside *phrase* are passed
    through literally rather than being interpreted by the shell.

    Args:
        phrase: Text to speak.
        lang: Voice name, passed to espeak as ``-v<lang>``.
        filepath: Destination audio file; ffmpeg receives it as output path.

    Raises:
        subprocess.CalledProcessError: espeak or ffmpeg exited non-zero.
        OSError: one of the programs could not be started.
    """
    # splitext only strips the final extension, so dots in directory names
    # do not truncate the path.
    wav_fpath = os.path.splitext(filepath)[0] + '.wav'
    # +f1 -k1 -s100
    espeak_cmd = ['espeak', '-v{}'.format(lang), phrase, '--stdout']
    ffmpeg_cmd = ['ffmpeg', '-y', '-i', wav_fpath, filepath]
    print("{} > {}".format(' '.join(espeak_cmd), wav_fpath))
    print(' '.join(ffmpeg_cmd))
    try:
        with open(wav_fpath, 'wb') as wav_file:
            subprocess.check_call(espeak_cmd, stdout=wav_file)
        subprocess.check_call(ffmpeg_cmd)
    finally:
        # Never leave the intermediate WAV behind, even on failure.
        if os.path.exists(wav_fpath):
            os.remove(wav_fpath)
|
|
|
|
|
|
def generate_sound(word, lang, wordid):
    """Make sure ``SOUNDS_DIR/<wordid>_<lang>.mp3`` exists for *word*.

    If ``SOUND_CACHE`` already maps ``(word, lang)`` to a word id whose mp3 is
    still on disk, that file is reused: nothing is done when it belongs to
    *wordid* itself, otherwise it is copied to *wordid*'s file name. A cache
    entry whose file has disappeared is treated as a miss.

    On a miss the sound is produced with ``gen_speech``, recorded in
    ``SOUND_CACHE`` and appended to the CSV at ``SOUND_CACHE_FPATH`` as
    ``wordid, lang, word``.

    Args:
        word: Text to speak.
        lang: Short language code; translated through ``LANG_REGIONS``, and
            used unchanged when it has no entry there.
        wordid: Id of the word; determines the output file name.
    """
    word_filepath = os.path.join(SOUNDS_DIR, "{}_{}.mp3".format(wordid, lang))
    cached_wordid = get_cached_sound(word, lang)

    if cached_wordid:
        cached_filepath = os.path.join(SOUNDS_DIR, "{}_{}.mp3".format(cached_wordid, lang))
        if os.path.isfile(cached_filepath):
            print(" Found in cache: {}".format(word))
            if cached_wordid != wordid:
                # TODO: this is duplicating space, but my brain is fried, should be mapping
                copyfile(cached_filepath, word_filepath)
            return
        # Stale entry: the cached file is gone, so fall through and regenerate.

    # Fall back to the raw code so an unmapped language is not sent as "None".
    lang_region = LANG_REGIONS.get(lang, lang)
    start = time()
    gen_speech(word, lang_region, word_filepath)
    duration = time() - start
    print(" Generated ({} - {} s): {}".format(lang, duration, word))

    SOUND_CACHE[(word, lang)] = wordid
    if cached_wordid != wordid:
        # Skip the append when the file already holds this exact mapping
        # (stale entry regenerated under the same id).
        # newline='' keeps csv.writer from emitting doubled line endings.
        with open(SOUND_CACHE_FPATH, 'a', newline='') as f:
            csv.writer(f).writerow([wordid, lang, word])
|
|
|
|
|
|
# umlauts: ä ö ü Ä Ö
|
|
def gen_tts(wordids=None, max_attempts=5):
    """Generate the image and both sound files for every requested word.

    Reads ``WORDS_FPATH`` as CSV with ``wordid, german, english`` in the first
    three columns. For each row whose id is in *wordids*, ``image_gen`` is
    called and then the German and English sounds are generated.

    Args:
        wordids: Iterable of word ids to process. ``None`` or empty means
            there is nothing to do.
        max_attempts: How many times sound generation is tried per word
            before the last error is re-raised (values below 1 count as 1).

    Raises:
        Exception: whatever ``generate_sound`` raised on the final attempt.
    """
    if not wordids:
        return

    wanted = set(wordids)  # constant-time membership test per CSV row
    with open(WORDS_FPATH, 'r', newline='') as csv_file:
        for row in csv.reader(csv_file):
            if len(row) < 3:
                # Blank or incomplete line; nothing to generate from it.
                continue
            wordid, german, english = row[0], row[1], row[2]
            if wordid not in wanted:
                continue
            print("Generating image..")
            image_gen(wordid, german, english)
            print("Generating sound {}: {}, {}".format(wordid, german, english))
            _generate_sounds_with_retry(wordid, german, english, max_attempts)


def _generate_sounds_with_retry(wordid, german, english, max_attempts):
    """Generate the 'de' and 'en' sounds for one word, retrying with backoff."""
    attempts = max(1, max_attempts)
    backoff = 1
    for attempt in range(1, attempts + 1):
        try:
            for word, lang in [(german, 'de'), (english, 'en')]:
                generate_sound(word, lang, wordid)
            return
        except Exception as e:
            if attempt == attempts:
                # Out of retries: surface the failure instead of looping
                # forever or dropping into a debugger.
                raise
            backoff = backoff * EXPONENTIAL_BACKOFF
            print("Sleeping for {}. Error: {}.".format(backoff, e))
            sleep(backoff)
|
|
|
|
|
|
def filter_words(contains):
    """Return the ids of all words whose CSV row has a field equal to *contains*.

    ``WORDS_FPATH`` is read as CSV; for every row in which one of the fields
    is exactly *contains*, the row's first column is collected. The ids are
    returned in file order.
    """
    with open(WORDS_FPATH, 'r') as words_file:
        return [record[0] for record in csv.reader(words_file) if contains in record]
|
|
|
|
|
|
def find_roundoff_silence(this_phrase_duration):
    """Return the padding, in milliseconds, up to the next whole second.

    A duration of a whole number of seconds always corresponds to a whole
    number of video frames, whatever the (integer) frame rate is, so padding
    each phrase this way keeps audio and video aligned.

    The calculation is done on integer milliseconds. The earlier version
    stepped through tenths of a second with floats, which could return a
    negative value (e.g. 3.04 s -> -40) when rounding to one decimal moved
    the start point below the real duration, or 0 when float error hid the
    whole-second match.

    Args:
        this_phrase_duration: Phrase length in seconds.

    Returns:
        An int in ``0..999``; 0 when the duration is already a whole second
        (to the nearest millisecond).
    """
    total_ms = int(round(this_phrase_duration * 1000))
    # The outer modulo maps "already aligned" (remainder 0) to 0 instead of 1000.
    return (1000 - total_ms % 1000) % 1000
|
|
|
|
|
|
def concatenate(filename="lesson1", wordids=None):
    """Build the audio lesson and its video for the given words.

    Images and sounds are first (re)generated through ``gen_tts``. Then, for
    each word id, the phrase "German, short pause, English, short pause,
    German, long pause" is assembled from the mp3 files in ``SOUNDS_DIR``,
    padded with the silence reported by ``find_roundoff_silence`` and
    appended to the lesson. The lesson is exported as
    ``AUDIO_LESSONS_DIR/<filename>.mp3`` and ``create_video`` is called with
    the per-word durations.

    Args:
        filename: Base name (no extension) of the lesson files to write.
        wordids: Ids of the words to include, in playback order. When this is
            ``None`` or empty nothing is written and the function returns.
    """
    if not wordids:
        # An empty lesson would only produce a 1 ms mp3 and then fail while
        # building the video, so stop early.
        print("No words found for {}; nothing to generate.".format(filename))
        return

    print("Concatenating {} sounds: {}".format(len(wordids), wordids))
    lessons = AudioSegment.silent(duration=1)
    silence = AudioSegment.silent(duration=SHORT_SILENCE)
    long_silence = AudioSegment.silent(duration=LONG_SILENCE)
    gen_tts(wordids=wordids)

    images_durations = []
    for wordid in wordids:
        start = time()
        # Use SOUNDS_DIR so these paths follow the same setting as the writers.
        sound_de = AudioSegment.from_mp3(os.path.join(SOUNDS_DIR, "{}_de.mp3".format(wordid)))
        sound_en = AudioSegment.from_mp3(os.path.join(SOUNDS_DIR, "{}_en.mp3".format(wordid)))
        this_phrase = sound_de + silence + sound_en + silence + sound_de + long_silence
        this_phrase_duration = this_phrase.duration_seconds
        # Guard against a negative pad; a silence cannot have negative length.
        roundoff_silence = max(0, find_roundoff_silence(this_phrase_duration))
        this_phrase = this_phrase + AudioSegment.silent(duration=roundoff_silence)
        images_durations.append((wordid, this_phrase.duration_seconds))
        lessons = lessons + this_phrase
        duration = time() - start
        print("Concatenated {} - {}s".format(wordid, duration))

    lessons.export(os.path.join(AUDIO_LESSONS_DIR, "{}.mp3".format(filename)), format="mp3")
    create_video(filename, images_durations)
|
|
|
|
|
|
def image_gen(wordid, de_text, en_text):
    """Render the slide for one word and save it as ``IMAGES_DIR/<wordid>.png``.

    The slide is an 800x450 black image with *de_text* above and *en_text*
    below a red horizontal line through the middle, both centred and drawn in
    white with the same font size. The size starts at 120 and is reduced in
    steps of 4 until both texts fit between the side margins; it never drops
    to zero or below, so extremely long texts end up at the smallest size
    rather than breaking font loading.

    Args:
        wordid: Id of the word; used as the image file name.
        de_text: Text drawn in the upper half.
        en_text: Text drawn in the lower half.
    """
    width, height = 800, 450
    margin, initial_font_size, font_step = 50, 120, 4
    filename = os.path.join(IMAGES_DIR, "{}.png".format(wordid))
    image = Image.new(mode="RGB", size=(width, height), color="black")
    draw = ImageDraw.Draw(image)

    font_size = initial_font_size
    while True:
        fnt = ImageFont.truetype('arial.ttf', font_size)
        de_w, de_h = _measure_text(draw, de_text, fnt)
        en_w, _ = _measure_text(draw, en_text, fnt)
        too_wide = max(de_w, en_w) + 2 * margin > width
        # Stop once it fits, or when another step would make the size <= 0.
        if not too_wide or font_size - font_step <= 0:
            break
        font_size -= font_step

    draw.text(((width - de_w) / 2, height / 2 - margin - de_h), de_text, font=fnt, fill=(255, 255, 255))
    draw.text(((width - en_w) / 2, height / 2 + margin), en_text, font=fnt, fill=(255, 255, 255))
    draw.line((0, height / 2, width, height / 2), fill=(255, 0, 0), width=2)
    image.save(filename)


def _measure_text(draw, text, font):
    """Return ``(width, height)`` of *text* in *font*, on old and new Pillow."""
    if hasattr(draw, 'textbbox'):
        # ImageDraw.textsize was removed in Pillow 10; textbbox replaces it.
        left, top, right, bottom = draw.textbbox((0, 0), text, font=font)
        return right - left, bottom - top
    return draw.textsize(text, font=font)
|
|
|
|
|
|
def create_video(lesson_name, images_durations):
    """Render the slides to video and mux them with the lesson audio.

    A temporary video ``VIDEOS_DIR/<lesson_name>_tmp.mp4`` is written at
    ``FRAMES`` frames per second; every image is repeated for
    ``round(duration * FRAMES)`` frames. ffmpeg then combines it with
    ``AUDIO_LESSONS_DIR/<lesson_name>.mp3`` into
    ``VIDEOS_DIR/<lesson_name>.mp4`` (both streams copied, output
    overwritten). The temporary video is removed afterwards, also on failure.

    Args:
        lesson_name: Base name shared by the audio input and video output.
        images_durations: Sequence of ``(image_id, seconds)`` pairs in
            playback order; ``image_id`` names a PNG in ``IMAGES_DIR``. When
            empty, nothing is written.

    Raises:
        FileNotFoundError: an image could not be read.
        subprocess.CalledProcessError: ffmpeg exited non-zero.
    """
    if not images_durations:
        print("No images for {}; skipping video.".format(lesson_name))
        return

    def load_frame(image_id):
        path = os.path.join(IMAGES_DIR, "{}.png".format(image_id))
        frame = cv2.imread(path)
        if frame is None:
            # imread signals failure by returning None, not by raising.
            raise FileNotFoundError("Could not read image: {}".format(path))
        return frame

    tmp_video_name = '{}_tmp.mp4'.format(lesson_name)
    tmp_video_filepath = os.path.join(VIDEOS_DIR, tmp_video_name)

    fourcc = cv2.VideoWriter_fourcc(*'MP4V')  # define the video codec

    # The first slide fixes the frame size of the whole video.
    height, width = load_frame(images_durations[0][0]).shape[:2]

    try:
        video = cv2.VideoWriter(tmp_video_filepath, fourcc, FRAMES, (width, height))
        try:
            for image, image_duration in images_durations:
                # Decode each slide once and reuse it for all of its frames.
                frame = load_frame(image)
                for _ in range(round(image_duration * FRAMES)):
                    video.write(frame)
        finally:
            video.release()

        # Argument list (no shell) so unusual characters in paths are safe.
        subprocess.check_call([
            "ffmpeg", "-y",
            "-i", tmp_video_filepath,
            "-i", os.path.join(AUDIO_LESSONS_DIR, "{}.mp3".format(lesson_name)),
            "-c:v", "copy",
            "-c:a", "copy",
            os.path.join(VIDEOS_DIR, "{}.mp4".format(lesson_name)),
        ])
    finally:
        if os.path.exists(tmp_video_filepath):
            os.remove(tmp_video_filepath)
|
|
|
|
|
|
ERROR_MSG = "First argument needs to be the lesson to be generated"


def parse_lesson_number(argv):
    """Return the lesson number requested by *argv*, or ``None`` if invalid.

    *argv* must consist of the program name plus exactly one argument, and
    that argument must be made of digits denoting an integer in ``1..99``.
    """
    if len(argv) != 2:
        return None
    raw = argv[1]
    if not raw.isdigit():
        return None
    try:
        number = int(raw)
    except ValueError:
        # str.isdigit() also accepts characters such as superscript digits
        # that int() cannot convert.
        return None
    if not 0 < number < 100:
        return None
    return number


def main(argv=None):
    """Generate the lesson selected on the command line.

    Exits with status 1 and ``ERROR_MSG`` on stderr when the argument is
    missing or invalid. Otherwise loads the sound cache and builds lesson
    ``lessonNN`` from all words whose CSV row contains that lesson name.
    """
    lesson = parse_lesson_number(sys.argv if argv is None else argv)
    if lesson is None:
        # sys.exit with a string prints it to stderr and exits with status 1.
        sys.exit(ERROR_MSG)

    # Only touch the cache once the arguments are known to be valid.
    load_sound_cache()
    print(lesson)
    lesson = "lesson{:02d}".format(lesson)
    concatenate(filename=lesson,
                wordids=filter_words(lesson))


if __name__ == "__main__":
    main()
|