espeak generation and video

2022-01-15 21:37:57 +01:00 · 2022-01-15 21:37:57 +01:00 · 11e440afcc
commit 11e440afcc
parent 41381e027c
8 changed files with 122 additions and 1265 deletions
--- a/.gitignore
+++ b/.gitignore
@ -3,4 +3,7 @@ venv
 __pycache__
 *secret*
 sounds
-lessons
+lessons
 images
 videos
 sound_cache.csv
--- a/INSTALL.sh
+++ b/INSTALL.sh
@ -1,4 +1,6 @@
 #!/bin/bash
 # System packages: espeak synthesizes the speech, ffmpeg converts and muxes media.
 sudo apt install espeak ffmpeg
 # Create and activate an isolated Python 3 environment, then install the
 # pinned Python dependencies into it.
 virtualenv -p python3 venv
 source venv/bin/activate
 pip install -r requirements.txt
--- a/README.md
+++ b/README.md
@ -1,6 +1,8 @@
 # en_de
-Generate audio from a dictionary.
+Generate audio/video from a dictionary.
 Uses espeak for audio generation, Pillow for rendering images with a text overlay, OpenCV for assembling the images into a video, and ffmpeg for converting audio and muxing the audio and video together.
 ## install
@ -10,6 +12,14 @@ Generate audio from a dictionary.
 # run
 Pass the number of the lesson to generate (1-99) as the first argument:
 ```
 python generate.py <lesson number>
 ```
 # old run
 You first need a service account secret from [Google Cloud Text-To-Speech API](https://cloud.google.com/text-to-speech/docs/quickstart-client-libraries#client-libraries-install-python) (go through the steps of `Before you begin` section, the rest is handled by the app).
 Modify what you need from the `generate.py` file:
--- a/generate.py
+++ b/generate.py
@ -1,16 +1,26 @@
 import sys
 import cv2
 from PIL import ImageFont, Image, ImageDraw
 import csv
 import os
 from shutil import copyfile
 from time import sleep, time
 import tts
 from pydub import AudioSegment
 # Pause lengths passed as `duration` to AudioSegment.silent() when a lesson is
 # concatenated; pydub expresses durations in milliseconds.
 SHORT_SILENCE = 500
 LONG_SILENCE = 1000
 # Output locations, all relative to the current working directory.
 SOUNDS_DIR = "sounds"  # per-word "<wordid>_<lang>.mp3" files
 IMAGES_DIR = "images"  # per-word "<wordid>.png" cards
 AUDIO_LESSONS_DIR = "lessons"  # concatenated "<lesson>.mp3" files
 VIDEOS_DIR = "videos"  # final "<lesson>.mp4" files
 # CSV read by load_sound_cache(); rows are unpacked as (wordid, lang, word).
 SOUND_CACHE_FPATH = 'sound_cache.csv'
 # NOTE(review): presumably the word list read by gen_tts (wordid, german,
 # english) -- the open() call is outside the visible code; confirm.
 WORDS_FPATH = 'words.csv'
 # Create the output directories at import time; exist_ok makes reruns harmless.
 for d in [SOUNDS_DIR, IMAGES_DIR, AUDIO_LESSONS_DIR, VIDEOS_DIR]:
    os.makedirs(d, exist_ok=True)
 # NOTE(review): presumably the multiplier applied to the retry delay in
 # gen_tts -- its use is not visible here; confirm.
 EXPONENTIAL_BACKOFF = 1.5
 LANG_REGIONS = {
    'en': 'en-US',
@ -21,7 +31,7 @@ SOUND_CACHE = {}
 def load_sound_cache():
    """Read the sound-cache CSV at SOUND_CACHE_FPATH row by row.

    Each row is unpacked as (wordid, lang, word). A missing cache file is
    created empty instead of raising.
    """
    # 'a+' creates the file on first run but, unlike 'w+', never truncates an
    # existing cache. Append mode positions the stream at end-of-file, so
    # rewind before reading.
    with open(SOUND_CACHE_FPATH, 'a+', newline='') as csvFile:
        csvFile.seek(0)
        reader = csv.reader(csvFile)
        for line, row in enumerate(reader):
            wordid, lang, word = row[0], row[1], row[2]
@ -33,6 +43,15 @@ def get_cached_sound(word, lang):
    return wordid
 def gen_speech(phrase, lang, filepath):
    """Speak `phrase` with espeak and store the converted audio at `filepath`.

    espeak writes a temporary WAV file next to `filepath` (same name, `.wav`
    extension), ffmpeg converts it to `filepath`, and the WAV file is deleted
    afterwards. Both tools are run without a shell, so quotes or shell
    metacharacters in `phrase` are passed through literally. Failures are
    printed and the function returns without raising.

    Args:
        phrase: Text to synthesize.
        lang: Value appended to espeak's `-v` (voice) option.
        filepath: Destination audio file; ffmpeg picks the format from its
            extension.
    """
    import shlex
    import subprocess

    # splitext drops only the final extension, so a dot in a directory name
    # (e.g. "./sounds/1_de.mp3") does not truncate the path.
    wav_fpath = os.path.splitext(filepath)[0] + '.wav'
    # +f1 -k1 -s100
    espeak_cmd = ['espeak', '-v{}'.format(lang), phrase, '--stdout']
    ffmpeg_cmd = ['ffmpeg', '-y', '-i', wav_fpath, filepath]
    print('{} > {} && {}'.format(
        shlex.join(espeak_cmd), shlex.quote(wav_fpath), shlex.join(ffmpeg_cmd)))
    try:
        with open(wav_fpath, 'wb') as wav_file:
            espeak = subprocess.run(espeak_cmd, stdout=wav_file)
        if espeak.returncode != 0:
            # Skip the conversion when synthesis failed.
            print("espeak failed with exit code {}".format(espeak.returncode))
            return
        ffmpeg = subprocess.run(ffmpeg_cmd)
        if ffmpeg.returncode != 0:
            print("ffmpeg failed with exit code {}".format(ffmpeg.returncode))
    except OSError as err:
        # Raised e.g. when espeak or ffmpeg is not installed.
        print("Could not run speech pipeline: {}".format(err))
    finally:
        # Remove the intermediate WAV whether or not the conversion succeeded.
        if os.path.exists(wav_fpath):
            os.remove(wav_fpath)
 def generate_sound(word, lang, wordid):
    lang_region = LANG_REGIONS.get(lang)
    cached_wordid = get_cached_sound(word, lang)
@ -40,14 +59,14 @@ def generate_sound(word, lang, wordid):
        print("  Found in cache: {}".format(word))
        if cached_wordid != wordid:
            # TODO: this is duplicating space, but my brain is fried, should be mapping
-            cached_filepath = os.path.join("sounds", "{}_{}.mp3".format(cached_wordid, lang))
+            cached_filepath = os.path.join(SOUNDS_DIR, "{}_{}.mp3".format(cached_wordid, lang))
-            word_filepath = os.path.join("sounds", "{}_{}.mp3".format(wordid, lang))
+            word_filepath = os.path.join(SOUNDS_DIR, "{}_{}.mp3".format(wordid, lang))
            copyfile(cached_filepath, word_filepath)
    else:
        filename = "{}_{}.mp3".format(wordid, lang)
-        filepath = os.path.join("sounds", filename)
+        filepath = os.path.join(SOUNDS_DIR, filename)
        start = time()
-        tts.gen_speech(word, lang_region, filepath)
+        gen_speech(word, lang_region, filepath)
        duration = time() - start
        print("  Generated ({} - {} s): {}".format(lang, duration, word))
        SOUND_CACHE[(word, lang)] = wordid
@ -66,7 +85,9 @@ def gen_tts(wordids=None):
            wordid, german, english = row[0], row[1], row[2]
            if wordid not in wordids:
                continue
-            print("Generating {}: {}, {}".format(wordid, german, english))
+            print("Generating image..")
            image_gen(wordid, german, english)
            print("Generating sound {}: {}, {}".format(wordid, german, english))
            backoff, attempt = 1, 0
            while True:
                try:
@ -102,18 +123,87 @@ def concatenate(filename="lesson1", wordids=None):
    silence = AudioSegment.silent(duration=SHORT_SILENCE)
    long_silence = AudioSegment.silent(duration=LONG_SILENCE)
    gen_tts(wordids=wordids)
    images_durations = []
    for wordid in wordids:
        start = time()
        sound_de = AudioSegment.from_mp3("sounds/{}_de.mp3".format(wordid))
        sound_en = AudioSegment.from_mp3("sounds/{}_en.mp3".format(wordid))
-        lessons = lessons + sound_de + silence + sound_en + silence + sound_de + long_silence
+        this_phrase = sound_de + silence + sound_en + silence + sound_de + long_silence
        images_durations.append((wordid, this_phrase.duration_seconds))
        lessons = lessons + this_phrase
        duration = time() - start
        print("Concatenated {} - {}s".format(wordid, duration))
-    lessons.export(os.path.join("lessons", "{}.mp3".format(filename)), format="mp3")
+    lessons.export(os.path.join(AUDIO_LESSONS_DIR, "{}.mp3".format(filename)), format="mp3")
    create_video(filename, images_durations)
 def image_gen(wordid, de_text, en_text):
    """Render the card image for one word pair and save it as a PNG.

    The image is 800x450, black, with `de_text` above and `en_text` below a red
    horizontal line through the middle. Both texts share one font size, which
    starts at 120 and shrinks in steps of 4 until the wider text fits between
    the side margins or the smallest step is reached.

    Args:
        wordid: Identifier used for the output name, IMAGES_DIR/<wordid>.png.
        de_text: Text drawn in the upper half.
        en_text: Text drawn in the lower half.
    """
    width, height = 800, 450
    margin, initial_font_size, font_step = 50, 120, 4
    filename = os.path.join(IMAGES_DIR, "{}.png".format(wordid))
    image = Image.new(mode="RGB", size=(width, height), color="black")
    draw = ImageDraw.Draw(image)
    font_size = initial_font_size
    while True:
        # NOTE(review): 'arial.ttf' must be resolvable by Pillow on this system.
        fnt = ImageFont.truetype('arial.ttf', font_size)
        # NOTE(review): textsize() exists in the pinned Pillow 9.0.0 but was
        # removed in Pillow 10; switch to textbbox() before upgrading.
        de_w, de_h = draw.textsize(de_text, font=fnt)
        en_w, _ = draw.textsize(en_text, font=fnt)
        fits = max(de_w, en_w) + 2 * margin <= width
        # Never shrink to a non-positive size: truetype() rejects it, so a very
        # long phrase is drawn at the smallest size even if it overflows.
        if fits or font_size <= font_step:
            break
        font_size -= font_step
    white = (255, 255, 255)
    draw.text(((width - de_w) / 2, height / 2 - margin - de_h), de_text, font=fnt, fill=white)
    draw.text(((width - en_w) / 2, height / 2 + margin), en_text, font=fnt, fill=white)
    draw.line((0, height / 2, width, height / 2), fill=(255, 0, 0), width=2)
    image.save(filename)
 def create_video(lesson_name, images_durations):
    """Build the lesson video from per-word images and the lesson audio.

    Each image IMAGES_DIR/<wordid>.png is shown for its duration at 24 frames
    per second. The silent video is written to a temporary file in VIDEOS_DIR,
    ffmpeg then combines it with AUDIO_LESSONS_DIR/<lesson_name>.mp3 into
    VIDEOS_DIR/<lesson_name>.mp4, and the temporary file is removed.

    Args:
        lesson_name: Base name (without extension) of the audio input and the
            video output.
        images_durations: Sequence of (wordid, seconds) pairs in playback
            order. Nothing is rendered when it is empty.

    Raises:
        FileNotFoundError: If one of the images cannot be read.
    """
    import subprocess

    if not images_durations:
        print("No images to render for {}".format(lesson_name))
        return

    fps = 24
    tmp_video_filepath = os.path.join(VIDEOS_DIR, '{}_tmp.mp4'.format(lesson_name))
    audio_filepath = os.path.join(AUDIO_LESSONS_DIR, "{}.mp3".format(lesson_name))
    output_filepath = os.path.join(VIDEOS_DIR, "{}.mp4".format(lesson_name))

    def read_frame(wordid):
        # cv2.imread reports failure by returning None rather than raising.
        path = os.path.join(IMAGES_DIR, "{}.png".format(wordid))
        frame = cv2.imread(path)
        if frame is None:
            raise FileNotFoundError("Could not read image {}".format(path))
        return frame

    # The first image fixes the frame size for the whole video.
    height, width = read_frame(images_durations[0][0]).shape[:2]
    fourcc = cv2.VideoWriter_fourcc(*'MP4V')  # define the video codec
    video = cv2.VideoWriter(tmp_video_filepath, fourcc, fps, (width, height))
    try:
        elapsed = 0.0
        frames_written = 0
        for wordid, image_duration in images_durations:
            # Decode each image once and reuse it for all of its frames.
            frame = read_frame(wordid)
            # Derive the frame count from the cumulative time so per-image
            # rounding does not add up to audio/video drift over a lesson.
            elapsed += image_duration
            target = int(round(elapsed * fps))
            for _ in range(target - frames_written):
                video.write(frame)
            frames_written = target
    finally:
        video.release()

    # Argument list instead of a shell string: no quoting issues with paths.
    subprocess.run([
        "ffmpeg", "-y",
        "-i", tmp_video_filepath,
        "-i", audio_filepath,
        "-c:v", "copy", "-c:a", "copy",
        output_filepath,
    ])
    os.remove(tmp_video_filepath)
 ERROR_MSG = "First argument needs to be the lesson to be generated"


 def lesson_number_from_argv(argv):
    """Extract the lesson number from command-line arguments.

    Args:
        argv: Argument list in sys.argv layout (program name first).

    Returns:
        The lesson number as an int in the range 1..99, or None when there is
        not exactly one argument, it is not a decimal number, or it is out of
        range.
    """
    if len(argv) != 2:
        return None
    raw = argv[1]
    # isdecimal() only accepts characters int() can parse; isdigit() also
    # accepts e.g. superscript digits, on which int() raises ValueError.
    if not raw.isdecimal():
        return None
    number = int(raw)
    if not 0 < number < 100:
        return None
    return number


 if __name__ == "__main__":
    load_sound_cache()
    lesson = lesson_number_from_argv(sys.argv)
    if lesson is None:
        print(ERROR_MSG)
        # sys.exit is always available; the bare exit() helper is injected by
        # the site module and is meant for interactive use.
        sys.exit(1)
    print(lesson)
    lesson = "lesson{:02d}".format(lesson)
    concatenate(filename=lesson,
                wordids=filter_words(lesson))
--- a/old_RUN.sh
+++ b/old_RUN.sh
--- a/requirements.txt
+++ b/requirements.txt
@ -1,18 +1,4 @@
-cachetools==3.1.0
+numpy==1.22.1
-certifi==2019.3.9
+opencv-python==4.5.5.62
-chardet==3.0.4
+Pillow==9.0.0
-google-api-core==1.8.0
+pydub==0.25.1
 google-auth==1.6.3
 google-cloud-texttospeech==0.4.0
 googleapis-common-protos==1.5.8
 grpcio==1.19.0
 idna==2.8
 protobuf==3.7.1
 pyasn1==0.4.5
 pyasn1-modules==0.2.4
 pydub==0.23.1
 pytz==2019.1
 requests==2.21.0
 rsa==4.0
 six==1.12.0
 urllib3==1.24.1
--- a/sound_cache.csv
+++ b/sound_cache.csv
--- a/tts.py
+++ b/tts.py
@ -1,34 +0,0 @@
 from google.cloud import texttospeech
 # Module-level Text-to-Speech client, created once at import time and shared
 # by gen_speech() below.
 # NOTE(review): presumably requires the Google Cloud service-account
 # credentials mentioned in the README to be configured -- confirm.
 client = texttospeech.TextToSpeechClient()
 def gen_speech(text, language_code, output_file):
    """Convert plain text to speech and save the MP3 bytes to `output_file`.

    The request is sent through the module-level `client`, asking for a
    gender-neutral voice in `language_code` and MP3-encoded audio.

    Args:
        text: Plain text to synthesize (sent as `text`, not as SSML).
        language_code: Voice language code, e.g. "en-US".
        output_file: Path of the file that receives the audio bytes.
    """
    tts_types = texttospeech.types
    tts_enums = texttospeech.enums
    # Request pieces in the positional order synthesize_speech is called with:
    # what to say, which voice to use, and how to encode the result.
    request_parts = (
        tts_types.SynthesisInput(text=text),
        tts_types.VoiceSelectionParams(
            language_code=language_code,
            ssml_gender=tts_enums.SsmlVoiceGender.NEUTRAL),
        tts_types.AudioConfig(
            audio_encoding=tts_enums.AudioEncoding.MP3),
    )
    response = client.synthesize_speech(*request_parts)
    # audio_content is binary, hence the 'wb' mode.
    with open(output_file, 'wb') as mp3_file:
        mp3_file.write(response.audio_content)