espeak generation and video
parent 41381e027c
commit 11e440afcc
5  .gitignore (vendored)

@@ -3,4 +3,7 @@ venv
 __pycache__
 *secret*
 sounds
 lessons
+images
+videos
+sound_cache.csv
@@ -1,4 +1,6 @@
 #!/bin/bash
+sudo apt install espeak ffmpeg

 virtualenv -p python3 venv
 source venv/bin/activate
+pip install -r requirements.txt
12  README.md
@@ -1,6 +1,8 @@
 # en_de

-Generate audio from a dictionary.
+Generate audio/video from a dictionary.
+
+Uses espeak for audio generation, Pillow for rendering images with text overlay, OpenCV for assembling the images into video, and ffmpeg for muxing the audio and video together.

 ## install

@@ -10,6 +12,14 @@ Generate audio from a dictionary.

 # run

+Pass the number of the lesson to generate:
+
+```
+python generate.py <lesson number>
+```
+
+# old run
+
 You first need a service account secret from [Google Cloud Text-To-Speech API](https://cloud.google.com/text-to-speech/docs/quickstart-client-libraries#client-libraries-install-python) (go through the steps of the `Before you begin` section; the rest is handled by the app).

 Modify what you need from the `generate.py` file:
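For a concrete feel of the espeak step the README describes, here is a minimal self-contained sketch (it assumes `espeak` and `ffmpeg` are on PATH; the word and file names are made up):

```python
import subprocess

# espeak renders a WAV to stdout; capture it to a file,
# then have ffmpeg convert the WAV to mp3.
with open("haus.wav", "wb") as wav:
    subprocess.run(["espeak", "-vde", "--stdout", "Haus"], stdout=wav, check=True)
subprocess.run(["ffmpeg", "-y", "-i", "haus.wav", "haus.mp3"], check=True)
```

Passing the phrase as its own argv element sidesteps the shell-quoting pitfalls of the `os.system` pipeline in `generate.py` below.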
112  generate.py
@@ -1,16 +1,26 @@
+import sys
+
+import cv2
+from PIL import ImageFont, Image, ImageDraw
 import csv
 import os
 from shutil import copyfile
 from time import sleep, time

-import tts
 from pydub import AudioSegment

 SHORT_SILENCE = 500
 LONG_SILENCE = 1000
+SOUNDS_DIR = "sounds"
+IMAGES_DIR = "images"
+AUDIO_LESSONS_DIR = "lessons"
+VIDEOS_DIR = "videos"
 SOUND_CACHE_FPATH = 'sound_cache.csv'
 WORDS_FPATH = 'words.csv'

+for d in [SOUNDS_DIR, IMAGES_DIR, AUDIO_LESSONS_DIR, VIDEOS_DIR]:
+    os.makedirs(d, exist_ok=True)
+
 EXPONENTIAL_BACKOFF = 1.5
 LANG_REGIONS = {
     'en': 'en-US',
@@ -21,7 +31,7 @@ SOUND_CACHE = {}


 def load_sound_cache():
-    with open(SOUND_CACHE_FPATH, 'r') as csvFile:
+    with open(SOUND_CACHE_FPATH, 'w+') as csvFile:
         reader = csv.reader(csvFile)
         for line, row in enumerate(reader):
             wordid, lang, word = row[0], row[1], row[2]
@@ -33,6 +43,15 @@ def get_cached_sound(word, lang):
     return wordid


+def gen_speech(phrase, lang, filepath):
+    wav_fpath = filepath.split('.')[0] + '.wav'
+    # +f1 -k1 -s100
+    cmd = 'espeak -v{lang} "{phrase}" --stdout > {wav_fpath} && ffmpeg -y -i {wav_fpath} {filepath} && rm {wav_fpath}'
+    cmd = cmd.format(lang=lang, phrase=phrase, filepath=filepath, wav_fpath=wav_fpath)
+    print(cmd)
+    os.system(cmd)
+
+
 def generate_sound(word, lang, wordid):
     lang_region = LANG_REGIONS.get(lang)
     cached_wordid = get_cached_sound(word, lang)
@@ -40,14 +59,14 @@ def generate_sound(word, lang, wordid):
         print(" Found in cache: {}".format(word))
         if cached_wordid != wordid:
             # TODO: this is duplicating space, but my brain is fried, should be mapping
-            cached_filepath = os.path.join("sounds", "{}_{}.mp3".format(cached_wordid, lang))
-            word_filepath = os.path.join("sounds", "{}_{}.mp3".format(wordid, lang))
+            cached_filepath = os.path.join(SOUNDS_DIR, "{}_{}.mp3".format(cached_wordid, lang))
+            word_filepath = os.path.join(SOUNDS_DIR, "{}_{}.mp3".format(wordid, lang))
             copyfile(cached_filepath, word_filepath)
     else:
         filename = "{}_{}.mp3".format(wordid, lang)
-        filepath = os.path.join("sounds", filename)
+        filepath = os.path.join(SOUNDS_DIR, filename)
         start = time()
-        tts.gen_speech(word, lang_region, filepath)
+        gen_speech(word, lang_region, filepath)
         duration = time() - start
         print(" Generated ({} - {} s): {}".format(lang, duration, word))
         SOUND_CACHE[(word, lang)] = wordid
@@ -66,7 +85,9 @@ def gen_tts(wordids=None):
         wordid, german, english = row[0], row[1], row[2]
         if wordid not in wordids:
             continue
-        print("Generating {}: {}, {}".format(wordid, german, english))
+        print("Generating image..")
+        image_gen(wordid, german, english)
+        print("Generating sound {}: {}, {}".format(wordid, german, english))
         backoff, attempt = 1, 0
         while True:
             try:
@@ -102,18 +123,87 @@ def concatenate(filename="lesson1", wordids=None):
     silence = AudioSegment.silent(duration=SHORT_SILENCE)
     long_silence = AudioSegment.silent(duration=LONG_SILENCE)
     gen_tts(wordids=wordids)
+    images_durations = []
     for wordid in wordids:
         start = time()
         sound_de = AudioSegment.from_mp3("sounds/{}_de.mp3".format(wordid))
         sound_en = AudioSegment.from_mp3("sounds/{}_en.mp3".format(wordid))
-        lessons = lessons + sound_de + silence + sound_en + silence + sound_de + long_silence
+        this_phrase = sound_de + silence + sound_en + silence + sound_de + long_silence
+        images_durations.append((wordid, this_phrase.duration_seconds))
+        lessons = lessons + this_phrase
         duration = time() - start
         print("Concatenated {} - {}s".format(wordid, duration))

-    lessons.export(os.path.join("lessons", "{}.mp3".format(filename)), format="mp3")
+    lessons.export(os.path.join(AUDIO_LESSONS_DIR, "{}.mp3".format(filename)), format="mp3")
+    create_video(filename, images_durations)


+def image_gen(wordid, de_text, en_text):
+    width, height = 800, 450
+    margin, initial_font_size, font_step = 50, 120, 4
+    filename = os.path.join(IMAGES_DIR, "{}.png".format(wordid))
+    image = Image.new(mode="RGB", size=(width, height), color="black")
+    draw = ImageDraw.Draw(image)
+    font_size = initial_font_size
+    while True:
+        fnt = ImageFont.truetype('arial.ttf', font_size)
+        de_w, de_h = draw.textsize(de_text, font=fnt)
+        en_w, en_h = draw.textsize(en_text, font=fnt)
+        if de_w + 2 * margin > width or en_w + 2 * margin > width:
+            font_size -= font_step
+            continue
+        break
+    draw.text(((width - de_w) / 2, height / 2 - margin - de_h), de_text, font=fnt, fill=(255, 255, 255))
+    draw.text(((width - en_w) / 2, height / 2 + margin), en_text, font=fnt, fill=(255, 255, 255))
+    draw.line((0, height / 2, width, height / 2), fill=(255, 0, 0), width=2)
+    image.save(filename)
+
+
+def create_video(lesson_name, images_durations):
+    tmp_video_name = '{}_tmp.mp4'.format(lesson_name)
+    tmp_video_filepath = os.path.join(VIDEOS_DIR, tmp_video_name)
+
+    fourcc = cv2.VideoWriter_fourcc(*'MP4V')  # define the video codec
+
+    frame = cv2.imread(os.path.join(IMAGES_DIR, "{}.png".format(images_durations[0][0])))
+    height, width, layers = frame.shape
+
+    frames = 24
+
+    video = cv2.VideoWriter(tmp_video_filepath, fourcc, frames, (width, height))
+    for image, image_duration in images_durations:
+        image_frames = int(image_duration * frames)
+        for _ in range(image_frames):
+            video.write(cv2.imread(os.path.join(IMAGES_DIR, "{}.png".format(image))))
+
+    cv2.destroyAllWindows()
+    video.release()
+
+    os.system("ffmpeg -y -i {video_name} -i {audio_name} -c:v copy -c:a copy {video_output}".format(
+        video_name=tmp_video_filepath,
+        audio_name=os.path.join(AUDIO_LESSONS_DIR, "{}.mp3".format(lesson_name)),
+        video_output=os.path.join(VIDEOS_DIR, "{}.mp4".format(lesson_name)),
+    ))
+    os.remove(tmp_video_filepath)
+
+
+ERROR_MSG = "First argument needs to be the lesson to be generated"
+
+
 if __name__ == "__main__":
     load_sound_cache()
-    wordids = filter_words("lesson05")
-    concatenate(filename="lesson05", wordids=wordids)
+    if len(sys.argv) != 2:
+        print(ERROR_MSG)
+        exit(1)
+    lesson = sys.argv[1]
+    if not lesson.isdigit():
+        print(ERROR_MSG)
+        exit(1)
+    lesson = int(lesson)
+    if not 0 < lesson < 100:
+        print(ERROR_MSG)
+        exit(1)
+    print(lesson)
+    lesson = "lesson{:02d}".format(lesson)
+    concatenate(filename=lesson,
+                wordids=filter_words(lesson))
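The per-word audio pattern in `concatenate()` is German, short pause, English, short pause, German again, long pause. A self-contained pydub sketch of that pattern, with sine-tone stand-ins for the generated mp3s:

```python
from pydub import AudioSegment
from pydub.generators import Sine

# 1 s test tones standing in for sounds/<id>_de.mp3 and sounds/<id>_en.mp3.
sound_de = Sine(440).to_audio_segment(duration=1000)
sound_en = Sine(660).to_audio_segment(duration=1000)
silence = AudioSegment.silent(duration=500)        # SHORT_SILENCE
long_silence = AudioSegment.silent(duration=1000)  # LONG_SILENCE

phrase = sound_de + silence + sound_en + silence + sound_de + long_silence
print(phrase.duration_seconds)  # 5.0 -- the value images_durations records per word
```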
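`image_gen` shrinks the font until both phrases fit inside the margins. As committed, the loop has no lower bound, so a phrase that cannot fit at any size would shrink forever. A guarded variant, as a sketch (the font path and the 12 pt floor are assumptions; `draw.textsize` matches the pinned Pillow 9.0.0):

```python
from PIL import Image, ImageDraw, ImageFont

def fit_font(draw, texts, width, margin, start_size=120, step=4, min_size=12,
             font_path='arial.ttf'):
    # Largest size at which every text fits within width - 2 * margin,
    # floored at min_size so the loop always terminates.
    size = start_size
    while size > min_size:
        fnt = ImageFont.truetype(font_path, size)
        if all(draw.textsize(t, font=fnt)[0] + 2 * margin <= width for t in texts):
            return fnt
        size -= step
    return ImageFont.truetype(font_path, min_size)

# Usage sketch, matching image_gen's 800x450 canvas and 50 px margin:
image = Image.new(mode="RGB", size=(800, 450), color="black")
draw = ImageDraw.Draw(image)
fnt = fit_font(draw, ["Haus", "house"], width=800, margin=50)
```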
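`create_video` allocates `int(image_duration * frames)` frames per image, so truncation can drop up to one frame (about 42 ms at 24 fps) per word, and the video slowly drifts ahead of the audio over a long lesson. A sketch of an allocation that keeps the running total aligned (a hypothetical helper, not part of the commit):

```python
def frames_per_image(durations, fps=24):
    # Allocate whole frames so the cumulative count tracks the audio timeline.
    counts, emitted, elapsed = [], 0, 0.0
    for d in durations:
        elapsed += d
        target = round(elapsed * fps)  # total frames the timeline needs so far
        counts.append(target - emitted)
        emitted = target
    return counts

# Three 3.7 s phrases at 24 fps: truncation gives [88, 88, 88] (11.000 s of video
# for 11.1 s of audio); cumulative rounding gives [89, 89, 88] (~11.083 s).
print(frames_per_image([3.7, 3.7, 3.7]))
```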
requirements.txt

@@ -1,18 +1,4 @@
-cachetools==3.1.0
-certifi==2019.3.9
-chardet==3.0.4
-google-api-core==1.8.0
-google-auth==1.6.3
-google-cloud-texttospeech==0.4.0
-googleapis-common-protos==1.5.8
-grpcio==1.19.0
-idna==2.8
-protobuf==3.7.1
-pyasn1==0.4.5
-pyasn1-modules==0.2.4
-pydub==0.23.1
-pytz==2019.1
-requests==2.21.0
-rsa==4.0
-six==1.12.0
-urllib3==1.24.1
+numpy==1.22.1
+opencv-python==4.5.5.62
+Pillow==9.0.0
+pydub==0.25.1
1200  sound_cache.csv

File diff suppressed because it is too large
34  tts.py
@@ -1,34 +0,0 @@
-from google.cloud import texttospeech
-
-# Instantiates a client
-client = texttospeech.TextToSpeechClient()
-
-
-def gen_speech(text, language_code, output_file):
-    """Synthesizes speech from the input string of text or ssml.
-
-    Note: ssml must be well-formed according to:
-        https://www.w3.org/TR/speech-synthesis/
-    """
-
-    # Set the text input to be synthesized
-    synthesis_input = texttospeech.types.SynthesisInput(text=text)
-
-    # Build the voice request, select the language code ("en-US") and the ssml
-    # voice gender ("neutral")
-    voice = texttospeech.types.VoiceSelectionParams(
-        language_code=language_code,
-        ssml_gender=texttospeech.enums.SsmlVoiceGender.NEUTRAL)
-
-    # Select the type of audio file you want returned
-    audio_config = texttospeech.types.AudioConfig(
-        audio_encoding=texttospeech.enums.AudioEncoding.MP3)
-
-    # Perform the text-to-speech request on the text input with the selected
-    # voice parameters and audio file type
-    response = client.synthesize_speech(synthesis_input, voice, audio_config)
-
-    # The response's audio_content is binary.
-    with open(output_file, 'wb') as out:
-        # Write the response to the output file.
-        out.write(response.audio_content)