"""Generate bilingual (German/English) vocabulary lessons as audio and video.

For every word of a lesson listed in ``words.csv`` the script renders a
flash-card image, synthesizes speech for both languages with ``espeak`` (and
converts it with ``ffmpeg``), concatenates the clips into one lesson mp3 and
finally muxes the images and audio into an mp4.

Usage: pass the lesson number (1-99) as the only command-line argument.
"""
import csv
import os
import subprocess
import sys
from shutil import copyfile
from time import sleep, time

import cv2
from PIL import ImageFont, Image, ImageDraw
from pydub import AudioSegment

# Pause lengths handed to AudioSegment.silent(duration=...).
SHORT_SILENCE = 500
LONG_SILENCE = 1000

SOUNDS_DIR = "sounds"
IMAGES_DIR = "images"
AUDIO_LESSONS_DIR = "lessons"
VIDEOS_DIR = "videos"
SOUND_CACHE_FPATH = 'sound_cache.csv'
WORDS_FPATH = 'words.csv'

for d in [SOUNDS_DIR, IMAGES_DIR, AUDIO_LESSONS_DIR, VIDEOS_DIR]:
    os.makedirs(d, exist_ok=True)

# Multiplier applied to the sleep time after each failed sound generation.
EXPONENTIAL_BACKOFF = 1.5
# Give up on a word after this many failed sound-generation attempts.
MAX_SOUND_ATTEMPTS = 5

# Maps the short language code used in file names to the espeak voice name.
LANG_REGIONS = {
    'en': 'en-US',
    'de': 'de-de',
}

# (word, lang) -> id of the word whose mp3 already contains that utterance.
SOUND_CACHE = {}


def load_sound_cache():
    """Populate SOUND_CACHE from the CSV file at SOUND_CACHE_FPATH.

    Each row is ``wordid, lang, word`` (the layout written by
    ``generate_sound``).  A missing file simply leaves the cache empty.
    """
    # The file must be opened for reading: a 'w+' mode would truncate it and
    # throw the persisted cache away on every start.
    try:
        csv_file = open(SOUND_CACHE_FPATH, 'r', newline='')
    except FileNotFoundError:
        return
    with csv_file:
        for row in csv.reader(csv_file):
            if len(row) < 3:
                continue  # blank or malformed line
            wordid, lang, word = row[0], row[1], row[2]
            SOUND_CACHE[(word, lang)] = wordid


def get_cached_sound(word, lang):
    """Return the word id cached for (word, lang), or None if there is none."""
    return SOUND_CACHE.get((word, lang))


def gen_speech(phrase, lang, filepath):
    """Synthesize *phrase* with espeak voice *lang* and store it at *filepath*.

    espeak writes an intermediate ``.wav`` next to *filepath*; ffmpeg then
    converts it to the format implied by *filepath* and the wav is deleted.

    Raises:
        subprocess.CalledProcessError: if espeak or ffmpeg exits non-zero.
        OSError: if a tool is not installed or a file cannot be written.
    """
    wav_fpath = os.path.splitext(filepath)[0] + '.wav'
    # Argument lists (no shell) so quotes, '$' or backticks inside the phrase
    # cannot break or inject into the command line.
    espeak_cmd = ['espeak', '-v{}'.format(lang), phrase, '--stdout']
    ffmpeg_cmd = ['ffmpeg', '-y', '-i', wav_fpath, filepath]
    print(' '.join(espeak_cmd), '>', wav_fpath, '&&', ' '.join(ffmpeg_cmd))
    try:
        with open(wav_fpath, 'wb') as wav_file:
            subprocess.run(espeak_cmd, stdout=wav_file, check=True)
        subprocess.run(ffmpeg_cmd, check=True)
    finally:
        if os.path.exists(wav_fpath):
            os.remove(wav_fpath)


def _sound_path(wordid, lang):
    """Return the mp3 path for a word id / language pair inside SOUNDS_DIR."""
    return os.path.join(SOUNDS_DIR, "{}_{}.mp3".format(wordid, lang))


def generate_sound(word, lang, wordid):
    """Make sure ``<wordid>_<lang>.mp3`` exists in SOUNDS_DIR for *word*.

    If the same (word, lang) was synthesized before under another word id and
    that file still exists, it is copied instead of synthesized again.
    Otherwise the speech is generated and recorded in both SOUND_CACHE and
    the cache file.
    """
    lang_region = LANG_REGIONS.get(lang, lang)
    target_path = _sound_path(wordid, lang)

    cached_wordid = get_cached_sound(word, lang)
    # The cache survives between runs, so the file it points at may be gone.
    if cached_wordid and os.path.exists(_sound_path(cached_wordid, lang)):
        print("  Found in cache: {}".format(word))
        if cached_wordid != wordid:
            # NOTE: this duplicates the audio on disk; mapping word ids to a
            # shared file would avoid the copy.
            copyfile(_sound_path(cached_wordid, lang), target_path)
        return

    start = time()
    gen_speech(word, lang_region, target_path)
    duration = time() - start
    print("  Generated ({} - {} s): {}".format(lang, duration, word))

    SOUND_CACHE[(word, lang)] = wordid
    with open(SOUND_CACHE_FPATH, 'a', newline='') as f:
        csv.writer(f).writerow([wordid, lang, word])


def _generate_sounds_with_retry(wordid, german, english):
    """Generate both language clips for one word, retrying with backoff."""
    backoff = 1
    for attempt in range(1, MAX_SOUND_ATTEMPTS + 1):
        try:
            for word, lang in [(german, 'de'), (english, 'en')]:
                generate_sound(word, lang, wordid)
            return
        except (OSError, subprocess.CalledProcessError) as e:
            if attempt == MAX_SOUND_ATTEMPTS:
                raise  # do not loop forever on a permanent failure
            backoff = backoff * EXPONENTIAL_BACKOFF
            print("Sleeping for {}. Error: {}.".format(backoff, e))
            sleep(backoff)


# umlauts: ä ö ü Ä Ö
def gen_tts(wordids=None):
    """Create the image and both sound files for every requested word.

    Reads WORDS_FPATH, whose first three columns are word id, German text and
    English text, and processes the rows whose id is in *wordids*.  With no
    ids given nothing is generated.
    """
    wanted = set(wordids) if wordids else set()

    with open(WORDS_FPATH, 'r') as csv_file:
        for row in csv.reader(csv_file):
            if len(row) < 3:
                continue  # blank or incomplete line
            wordid, german, english = row[0], row[1], row[2]
            if wordid not in wanted:
                continue

            print("Generating image..")
            image_gen(wordid, german, english)

            print("Generating sound {}: {}, {}".format(wordid, german, english))
            _generate_sounds_with_retry(wordid, german, english)


def filter_words(contains):
    """Return the ids (first column) of all rows having a cell equal to *contains*."""
    with open(WORDS_FPATH, 'r') as csv_file:
        return [row[0] for row in csv.reader(csv_file) if contains in row]


def concatenate(filename="lesson1", wordids=None):
    """Build the lesson audio ``<filename>.mp3`` and its video from *wordids*.

    For each word the sequence is German, short pause, English, short pause,
    German, long pause.  The per-word durations are passed on to
    ``create_video`` so every image is shown for as long as its audio lasts.
    """
    if not wordids:
        wordids = list()
    print("Concatenating {} sounds: {}".format(len(wordids), wordids))
    if not wordids:
        # Nothing to render; create_video needs at least one image.
        print("No words found for {}".format(filename))
        return

    lessons = AudioSegment.silent(duration=1)
    silence = AudioSegment.silent(duration=SHORT_SILENCE)
    long_silence = AudioSegment.silent(duration=LONG_SILENCE)

    gen_tts(wordids=wordids)

    images_durations = []
    for wordid in wordids:
        start = time()
        sound_de = AudioSegment.from_mp3(_sound_path(wordid, 'de'))
        sound_en = AudioSegment.from_mp3(_sound_path(wordid, 'en'))
        this_phrase = sound_de + silence + sound_en + silence + sound_de + long_silence
        images_durations.append((wordid, this_phrase.duration_seconds))
        lessons = lessons + this_phrase
        duration = time() - start
        print("Concatenated {} - {}s".format(wordid, duration))

    lessons.export(os.path.join(AUDIO_LESSONS_DIR, "{}.mp3".format(filename)), format="mp3")

    create_video(filename, images_durations)


def _text_extent(draw, text, font):
    """Return (width, height) of *text* measured from the drawing origin."""
    if hasattr(draw, 'textbbox'):
        # ImageDraw.textsize was removed in Pillow 10; textbbox replaces it.
        _, _, right, bottom = draw.textbbox((0, 0), text, font=font)
        return right, bottom
    return draw.textsize(text, font=font)


def image_gen(wordid, de_text, en_text):
    """Render the flash card ``<wordid>.png`` into IMAGES_DIR.

    The German text is placed above and the English text below a red
    horizontal line on a black 800x450 canvas.  The font starts at 120 and
    shrinks until both texts fit between the side margins.
    """
    width, height = 800, 450
    margin, initial_font_size, font_step = 50, 120, 4
    min_font_size = 8  # never shrink to zero/negative, which truetype rejects
    filename = os.path.join(IMAGES_DIR, "{}.png".format(wordid))

    image = Image.new(mode="RGB", size=(width, height), color="black")
    draw = ImageDraw.Draw(image)

    font_size = initial_font_size
    while True:
        fnt = ImageFont.truetype('arial.ttf', font_size)
        de_w, de_h = _text_extent(draw, de_text, fnt)
        en_w, en_h = _text_extent(draw, en_text, fnt)
        too_wide = de_w + 2 * margin > width or en_w + 2 * margin > width
        if too_wide and font_size - font_step >= min_font_size:
            font_size -= font_step
            continue
        break

    draw.text(((width - de_w) / 2, height / 2 - margin - de_h), de_text, font=fnt, fill=(255, 255, 255))
    draw.text(((width - en_w) / 2, height / 2 + margin), en_text, font=fnt, fill=(255, 255, 255))
    draw.line((0, height / 2, width, height / 2), fill=(255, 0, 0), width=2)

    image.save(filename)


def _read_image(wordid):
    """Load the flash card of *wordid* with OpenCV, failing loudly if absent."""
    path = os.path.join(IMAGES_DIR, "{}.png".format(wordid))
    frame = cv2.imread(path)
    if frame is None:  # cv2.imread signals failure by returning None
        raise FileNotFoundError("Could not read image: {}".format(path))
    return frame


def create_video(lesson_name, images_durations):
    """Write ``<lesson_name>.mp4`` into VIDEOS_DIR.

    Args:
        lesson_name: base name shared by the lesson mp3 and the output video.
        images_durations: sequence of ``(wordid, seconds)`` pairs; each
            word's image is shown for that many seconds at 24 fps.

    Raises:
        ValueError: if *images_durations* is empty.
        FileNotFoundError: if an image cannot be read.
        subprocess.CalledProcessError: if ffmpeg fails to mux audio and video.
    """
    if not images_durations:
        raise ValueError("Cannot create a video without any images")

    tmp_video_name = '{}_tmp.mp4'.format(lesson_name)
    tmp_video_filepath = os.path.join(VIDEOS_DIR, tmp_video_name)

    fourcc = cv2.VideoWriter_fourcc(*'MP4V')  # define the video codec
    # The first image determines the frame size of the whole video.
    height, width = _read_image(images_durations[0][0]).shape[:2]
    frames = 24
    video = cv2.VideoWriter(tmp_video_filepath, fourcc, frames, (width, height))

    try:
        try:
            for image, image_duration in images_durations:
                # Decode each image once instead of once per written frame.
                frame = _read_image(image)
                for _ in range(int(image_duration * frames)):
                    video.write(frame)
        finally:
            video.release()

        subprocess.run(
            [
                'ffmpeg', '-y',
                '-i', tmp_video_filepath,
                '-i', os.path.join(AUDIO_LESSONS_DIR, "{}.mp3".format(lesson_name)),
                '-c:v', 'copy',
                '-c:a', 'copy',
                os.path.join(VIDEOS_DIR, "{}.mp4".format(lesson_name)),
            ],
            check=True,
        )
    finally:
        if os.path.exists(tmp_video_filepath):
            os.remove(tmp_video_filepath)


ERROR_MSG = "First argument needs to be the lesson to be generated"


def main():
    """Validate the lesson number from the command line and build that lesson."""
    load_sound_cache()

    if len(sys.argv) != 2:
        print(ERROR_MSG)
        sys.exit(1)

    lesson = sys.argv[1]
    if not lesson.isdecimal():
        print(ERROR_MSG)
        sys.exit(1)

    lesson = int(lesson)
    if not 0 < lesson < 100:
        print(ERROR_MSG)
        sys.exit(1)

    print(lesson)
    lesson = "lesson{:02d}".format(lesson)
    concatenate(filename=lesson, wordids=filter_words(lesson))


if __name__ == "__main__":
    main()