espeak generation and video

This commit is contained in:
Daniel Tsvetkov 2022-01-15 21:37:57 +01:00
parent 41381e027c
commit 11e440afcc
8 changed files with 122 additions and 1265 deletions

5
.gitignore vendored
View File

@ -3,4 +3,7 @@ venv
__pycache__
*secret*
sounds
lessons
images
videos
sound_cache.csv

View File

@ -1,4 +1,6 @@
#!/bin/bash
sudo apt install espeak ffmpeg
virtualenv -p python3 venv
source venv/bin/activate
pip install -r requirements.txt
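The script expects `espeak` and `ffmpeg` to be on the PATH afterwards. A minimal sketch for failing fast from Python if either did not install cleanly; the `check_toolchain` helper is illustrative, not part of this commit:

```python
import shutil
import sys

def check_toolchain(tools=("espeak", "ffmpeg")):
    """Exit with a clear message if a required command-line tool is missing."""
    missing = [t for t in tools if shutil.which(t) is None]
    if missing:
        sys.exit("Missing required tools: {}".format(", ".join(missing)))

check_toolchain()
```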

README.md
View File

@ -1,6 +1,8 @@
# en_de
Generate audio from a dictionary.
Generate audio/video from a dictionary.
Uses espeak for audio generation, Pillow for images with text overlay, OpenCV for assembling the images into video, and ffmpeg for muxing the audio and video together.
## install
@ -10,6 +12,14 @@ Generate audio from a dictionary.
# run
Pass the number of the lesson to generate (e.g. `5` builds `lesson05`):
```
python generate.py <lesson number>
```
# old run
You first need a service account secret from the [Google Cloud Text-To-Speech API](https://cloud.google.com/text-to-speech/docs/quickstart-client-libraries#client-libraries-install-python) (go through the steps of the `Before you begin` section; the rest is handled by the app).
Modify what you need from the `generate.py` file:

generate.py
View File

@ -1,16 +1,26 @@
import sys
import cv2
from PIL import ImageFont, Image, ImageDraw
import csv
import os
from shutil import copyfile
from time import sleep, time
import tts
from pydub import AudioSegment
SHORT_SILENCE = 500
LONG_SILENCE = 1000
SOUNDS_DIR = "sounds"
IMAGES_DIR = "images"
AUDIO_LESSONS_DIR = "lessons"
VIDEOS_DIR = "videos"
SOUND_CACHE_FPATH = 'sound_cache.csv'
WORDS_FPATH = 'words.csv'
for d in [SOUNDS_DIR, IMAGES_DIR, AUDIO_LESSONS_DIR, VIDEOS_DIR]:
    os.makedirs(d, exist_ok=True)
EXPONENTIAL_BACKOFF = 1.5
LANG_REGIONS = {
    'en': 'en-US',
@ -21,7 +31,7 @@ SOUND_CACHE = {}
def load_sound_cache():
    with open(SOUND_CACHE_FPATH, 'r') as csvFile:
    with open(SOUND_CACHE_FPATH, 'a+') as csvFile:  # 'w+' would truncate the cache on every load; 'a+' creates it if missing
        csvFile.seek(0)  # rewind: 'a+' opens positioned at the end of the file
        reader = csv.reader(csvFile)
        for line, row in enumerate(reader):
            wordid, lang, word = row[0], row[1], row[2]
@ -33,6 +43,15 @@ def get_cached_sound(word, lang):
    return wordid
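The cache file is plain CSV with one `(wordid, lang, word)` row per generated sound, mirrored into the in-memory `SOUND_CACHE` dict keyed by `(word, lang)`. A hypothetical write-side counterpart, sketched only to document the format; no such helper appears in this diff:

```python
import csv

def save_sound_cache(cache, fpath='sound_cache.csv'):
    # Hypothetical: persist SOUND_CACHE as rows of (wordid, lang, word).
    with open(fpath, 'w', newline='') as csvFile:
        writer = csv.writer(csvFile)
        for (word, lang), wordid in cache.items():
            writer.writerow([wordid, lang, word])
```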
def gen_speech(phrase, lang, filepath):
    wav_fpath = filepath.split('.')[0] + '.wav'
    # optional espeak voice tweaks could be appended here, e.g. +f1 -k1 -s100
    cmd = 'espeak -v{lang} "{phrase}" --stdout > {wav_fpath} && ffmpeg -y -i {wav_fpath} {filepath} && rm {wav_fpath}'
    cmd = cmd.format(lang=lang, phrase=phrase, filepath=filepath, wav_fpath=wav_fpath)
    print(cmd)
    os.system(cmd)
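Formatting `phrase` straight into a shell string breaks as soon as a word contains quotes, backticks, or `&&`. A sketch of the same espeak-to-wav-to-mp3 pipeline via `subprocess`, which avoids shell quoting entirely (and uses `rsplit` so a dot in a directory name cannot truncate the path); `gen_speech_safe` is illustrative, not part of this commit:

```python
import os
import subprocess

def gen_speech_safe(phrase, lang, filepath):
    """espeak writes wav to stdout -> ffmpeg converts to mp3 -> wav removed."""
    wav_fpath = filepath.rsplit('.', 1)[0] + '.wav'
    with open(wav_fpath, 'wb') as wav:
        subprocess.run(['espeak', '-v' + lang, phrase, '--stdout'],
                       stdout=wav, check=True)
    subprocess.run(['ffmpeg', '-y', '-i', wav_fpath, filepath], check=True)
    os.remove(wav_fpath)
```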
def generate_sound(word, lang, wordid):
    lang_region = LANG_REGIONS.get(lang)
    cached_wordid = get_cached_sound(word, lang)
@ -40,14 +59,14 @@ def generate_sound(word, lang, wordid):
print(" Found in cache: {}".format(word))
if cached_wordid != wordid:
# TODO: this is duplicating space, but my brain is fried, should be mapping
cached_filepath = os.path.join("sounds", "{}_{}.mp3".format(cached_wordid, lang))
word_filepath = os.path.join("sounds", "{}_{}.mp3".format(wordid, lang))
cached_filepath = os.path.join(SOUNDS_DIR, "{}_{}.mp3".format(cached_wordid, lang))
word_filepath = os.path.join(SOUNDS_DIR, "{}_{}.mp3".format(wordid, lang))
copyfile(cached_filepath, word_filepath)
else:
filename = "{}_{}.mp3".format(wordid, lang)
filepath = os.path.join("sounds", filename)
filepath = os.path.join(SOUNDS_DIR, filename)
start = time()
tts.gen_speech(word, lang_region, filepath)
gen_speech(word, lang_region, filepath)
duration = time() - start
print(" Generated ({} - {} s): {}".format(lang, duration, word))
SOUND_CACHE[(word, lang)] = wordid
@ -66,7 +85,9 @@ def gen_tts(wordids=None):
        wordid, german, english = row[0], row[1], row[2]
        if wordid not in wordids:
            continue
        print("Generating {}: {}, {}".format(wordid, german, english))
        print("Generating image..")
        image_gen(wordid, german, english)
        print("Generating sound {}: {}, {}".format(wordid, german, english))
        backoff, attempt = 1, 0
        while True:
            try:
@ -102,18 +123,87 @@ def concatenate(filename="lesson1", wordids=None):
    silence = AudioSegment.silent(duration=SHORT_SILENCE)
    long_silence = AudioSegment.silent(duration=LONG_SILENCE)
    gen_tts(wordids=wordids)
    images_durations = []
    for wordid in wordids:
        start = time()
        sound_de = AudioSegment.from_mp3("sounds/{}_de.mp3".format(wordid))
        sound_en = AudioSegment.from_mp3("sounds/{}_en.mp3".format(wordid))
        lessons = lessons + sound_de + silence + sound_en + silence + sound_de + long_silence
        this_phrase = sound_de + silence + sound_en + silence + sound_de + long_silence
        images_durations.append((wordid, this_phrase.duration_seconds))
        lessons = lessons + this_phrase
        duration = time() - start
        print("Concatenated {} - {}s".format(wordid, duration))
    lessons.export(os.path.join("lessons", "{}.mp3".format(filename)), format="mp3")
    lessons.export(os.path.join(AUDIO_LESSONS_DIR, "{}.mp3".format(filename)), format="mp3")
    create_video(filename, images_durations)
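`AudioSegment.from_mp3` and `export` both shell out to ffmpeg, so the audio path shares the system dependency of `gen_speech`. For reference, the de/pause/en/pause/de phrase pattern in isolation, assuming the silence durations defined above; `build_phrase` is illustrative:

```python
from pydub import AudioSegment

def build_phrase(de_path, en_path, short_ms=500, long_ms=1000):
    """German, short pause, English, short pause, German again, long pause."""
    de = AudioSegment.from_mp3(de_path)
    en = AudioSegment.from_mp3(en_path)
    gap = AudioSegment.silent(duration=short_ms)
    return de + gap + en + gap + de + AudioSegment.silent(duration=long_ms)
```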
def image_gen(wordid, de_text, en_text):
    width, height = 800, 450
    margin, initial_font_size, font_step = 50, 120, 4
    filename = os.path.join(IMAGES_DIR, "{}.png".format(wordid))
    image = Image.new(mode="RGB", size=(width, height), color="black")
    draw = ImageDraw.Draw(image)
    font_size = initial_font_size
    while True:
        fnt = ImageFont.truetype('arial.ttf', font_size)
        de_w, de_h = draw.textsize(de_text, font=fnt)
        en_w, en_h = draw.textsize(en_text, font=fnt)
        if de_w + 2 * margin > width or en_w + 2 * margin > width:
            font_size -= font_step
            continue
        break
    draw.text(((width - de_w) / 2, height / 2 - margin - de_h), de_text, font=fnt, fill=(255, 255, 255))
    draw.text(((width - en_w) / 2, height / 2 + margin), en_text, font=fnt, fill=(255, 255, 255))
    draw.line((0, height / 2, width, height / 2), fill=(255, 0, 0), width=2)
    image.save(filename)
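`draw.textsize` exists in the pinned Pillow 9.0.0 but was deprecated in 9.2 and removed in 10. Should the pin ever move, the same measurement works via `textbbox`; a sketch, with `text_size` not being part of this commit:

```python
def text_size(draw, text, font):
    """Width and height of rendered text (Pillow >= 8.0)."""
    left, top, right, bottom = draw.textbbox((0, 0), text, font=font)
    return right - left, bottom - top
```

Note also that `arial.ttf` is generally absent on stock Linux installs; `ImageFont.truetype` raises `OSError` unless the font file is present or a full path to an installed font is given.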
def create_video(lesson_name, images_durations):
    tmp_video_name = '{}_tmp.mp4'.format(lesson_name)
    tmp_video_filepath = os.path.join(VIDEOS_DIR, tmp_video_name)
    fourcc = cv2.VideoWriter_fourcc(*'MP4V')  # define the video codec
    frame = cv2.imread(os.path.join(IMAGES_DIR, "{}.png".format(images_durations[0][0])))
    height, width, layers = frame.shape
    frames = 24
    video = cv2.VideoWriter(tmp_video_filepath, fourcc, frames, (width, height))
    for image, image_duration in images_durations:
        image_frames = int(image_duration * frames)
        for _ in range(image_frames):
            video.write(cv2.imread(os.path.join(IMAGES_DIR, "{}.png".format(image))))
    cv2.destroyAllWindows()
    video.release()
    os.system("ffmpeg -y -i {video_name} -i {audio_name} -c:v copy -c:a copy {video_output}".format(
        video_name=tmp_video_filepath,
        audio_name=os.path.join(AUDIO_LESSONS_DIR, "{}.mp3".format(lesson_name)),
        video_output=os.path.join(VIDEOS_DIR, "{}.mp4".format(lesson_name)),
    ))
    os.remove(tmp_video_filepath)
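`int(image_duration * frames)` truncates per image, so each slide gets slightly fewer frames than its audio lasts and the images drift ahead of the sound as the lesson progresses. A sketch that allocates frame counts against the cumulative timeline instead, assuming the same `images_durations` list of `(wordid, seconds)` pairs; `frames_per_image` is illustrative:

```python
def frames_per_image(images_durations, fps=24):
    """Round against the running total so audio and video stay in step."""
    counts, elapsed, written = [], 0.0, 0
    for wordid, duration in images_durations:
        elapsed += duration
        target = round(elapsed * fps)  # frames that should exist by this point
        counts.append((wordid, target - written))
        written = target
    return counts
```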
ERROR_MSG = "First argument needs to be the lesson to be generated"
if __name__ == "__main__":
    load_sound_cache()
    wordids = filter_words("lesson05")
    concatenate(filename="lesson05", wordids=wordids)
    if len(sys.argv) != 2:
        print(ERROR_MSG)
        exit(1)
    lesson = sys.argv[1]
    if not lesson.isdigit():
        print(ERROR_MSG)
        exit(1)
    lesson = int(lesson)
    if not 0 < lesson < 100:
        print(ERROR_MSG)
        exit(1)
    print(lesson)
    lesson = "lesson{:02d}".format(lesson)
    concatenate(filename=lesson,
                wordids=filter_words(lesson))

requirements.txt
View File

@ -1,18 +1,4 @@
cachetools==3.1.0
certifi==2019.3.9
chardet==3.0.4
google-api-core==1.8.0
google-auth==1.6.3
google-cloud-texttospeech==0.4.0
googleapis-common-protos==1.5.8
grpcio==1.19.0
idna==2.8
protobuf==3.7.1
pyasn1==0.4.5
pyasn1-modules==0.2.4
pydub==0.23.1
pytz==2019.1
requests==2.21.0
rsa==4.0
six==1.12.0
urllib3==1.24.1
numpy==1.22.1
opencv-python==4.5.5.62
Pillow==9.0.0
pydub==0.25.1

File diff suppressed because it is too large

34
tts.py
View File

@ -1,34 +0,0 @@
from google.cloud import texttospeech

# Instantiates a client
client = texttospeech.TextToSpeechClient()

def gen_speech(text, language_code, output_file):
    """Synthesizes speech from the input string of text or ssml.

    Note: ssml must be well-formed according to:
        https://www.w3.org/TR/speech-synthesis/
    """
    # Set the text input to be synthesized
    synthesis_input = texttospeech.types.SynthesisInput(text=text)

    # Build the voice request, select the language code ("en-US") and the ssml
    # voice gender ("neutral")
    voice = texttospeech.types.VoiceSelectionParams(
        language_code=language_code,
        ssml_gender=texttospeech.enums.SsmlVoiceGender.NEUTRAL)

    # Select the type of audio file you want returned
    audio_config = texttospeech.types.AudioConfig(
        audio_encoding=texttospeech.enums.AudioEncoding.MP3)

    # Perform the text-to-speech request on the text input with the selected
    # voice parameters and audio file type
    response = client.synthesize_speech(synthesis_input, voice, audio_config)

    # The response's audio_content is binary.
    with open(output_file, 'wb') as out:
        # Write the response to the output file.
        out.write(response.audio_content)