espeak generation and video

Daniel Tsvetkov 2022-01-15 21:37:57 +01:00
parent 41381e027c
commit 11e440afcc
8 changed files with 122 additions and 1265 deletions

.gitignore (3 changes)

@@ -4,3 +4,6 @@ __pycache__
 *secret*
 sounds
 lessons
+images
+videos
+sound_cache.csv


@@ -1,4 +1,6 @@
 #!/bin/bash
+sudo apt install espeak ffmpeg
 virtualenv -p python3 venv
 source venv/bin/activate
 pip install -r requirements.txt

README.md

@@ -1,6 +1,8 @@
 # en_de
-Generate audio from a dictionary.
+Generate audio/video from a dictionary.
+Uses espeak for audio generation, Pillow for rendering the text-overlay images, OpenCV for assembling the images into video, and ffmpeg for muxing the audio and video together.
 
 ## install
@@ -10,6 +12,14 @@ Generate audio from a dictionary.
 # run
+Pass the lesson number:
+```
+python generate.py <lesson number>
+```
+
+# old run
 You first need a service account secret from [Google Cloud Text-To-Speech API](https://cloud.google.com/text-to-speech/docs/quickstart-client-libraries#client-libraries-install-python) (go through the steps of the `Before you begin` section; the rest is handled by the app).
 Modify what you need from the `generate.py` file:
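
For example, with an illustrative lesson number:
```
python generate.py 5
```
The entry point in generate.py zero-pads the argument to lesson05, so the outputs land in lessons/lesson05.mp3 and videos/lesson05.mp4.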

generate.py

@@ -1,16 +1,26 @@
+import sys
+import cv2
+from PIL import ImageFont, Image, ImageDraw
 import csv
 import os
 from shutil import copyfile
 from time import sleep, time
-import tts
 from pydub import AudioSegment
 
 SHORT_SILENCE = 500
 LONG_SILENCE = 1000
+SOUNDS_DIR = "sounds"
+IMAGES_DIR = "images"
+AUDIO_LESSONS_DIR = "lessons"
+VIDEOS_DIR = "videos"
 SOUND_CACHE_FPATH = 'sound_cache.csv'
 WORDS_FPATH = 'words.csv'
+# create the working directories up front so a fresh checkout runs without manual setup
+for d in [SOUNDS_DIR, IMAGES_DIR, AUDIO_LESSONS_DIR, VIDEOS_DIR]:
+    os.makedirs(d, exist_ok=True)
 EXPONENTIAL_BACKOFF = 1.5
 LANG_REGIONS = {
     'en': 'en-US',
@@ -21,7 +31,7 @@ SOUND_CACHE = {}
 def load_sound_cache():
-    with open(SOUND_CACHE_FPATH, 'r') as csvFile:
+    # 'a+' creates the cache file on first run without truncating an existing one
+    with open(SOUND_CACHE_FPATH, 'a+') as csvFile:
+        csvFile.seek(0)  # rewind so the reader sees previously cached rows
         reader = csv.reader(csvFile)
         for line, row in enumerate(reader):
             wordid, lang, word = row[0], row[1], row[2]
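
For reference, sound_cache.csv is a flat CSV whose rows unpack as wordid, lang, word. A minimal sketch of the create-if-missing read pattern used above (row values are illustrative):

```python
import csv

# 'a+' opens for append, creating the file on first use without truncating
# an existing cache; seek(0) rewinds so csv.reader sees the rows already there.
with open('sound_cache.csv', 'a+') as f:
    f.seek(0)
    for row in csv.reader(f):
        wordid, lang, word = row[0], row[1], row[2]  # e.g. '42', 'de', 'Hallo'
```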
@@ -33,6 +43,15 @@ def get_cached_sound(word, lang):
     return wordid
 
 
+def gen_speech(phrase, lang, filepath):
+    # espeak writes WAV to stdout; ffmpeg converts it to the final mp3, then the WAV is removed
+    wav_fpath = filepath.split('.')[0] + '.wav'
+    # other espeak knobs worth trying: +f1 (voice variant), -k1 (emphasize capitals), -s100 (speed)
+    cmd = 'espeak -v{lang} "{phrase}" --stdout > {wav_fpath} && ffmpeg -y -i {wav_fpath} {filepath} && rm {wav_fpath}'
+    cmd = cmd.format(lang=lang, phrase=phrase, filepath=filepath, wav_fpath=wav_fpath)
+    print(cmd)
+    os.system(cmd)
+
+
 def generate_sound(word, lang, wordid):
     lang_region = LANG_REGIONS.get(lang)
     cached_wordid = get_cached_sound(word, lang)
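
To make the shell pipeline concrete, this is the command string gen_speech would print for one hypothetical call (word id, voice, and phrase are illustrative; note that a phrase containing double quotes or other shell metacharacters would break the command, since it is interpolated unescaped):

```python
cmd = 'espeak -v{lang} "{phrase}" --stdout > {wav_fpath} && ffmpeg -y -i {wav_fpath} {filepath} && rm {wav_fpath}'
print(cmd.format(lang='en-US', phrase='hello world',
                 wav_fpath='sounds/42_en.wav', filepath='sounds/42_en.mp3'))
# espeak -ven-US "hello world" --stdout > sounds/42_en.wav && ffmpeg -y -i sounds/42_en.wav sounds/42_en.mp3 && rm sounds/42_en.wav
```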
@@ -40,14 +59,14 @@ def generate_sound(word, lang, wordid):
         print(" Found in cache: {}".format(word))
         if cached_wordid != wordid:
             # TODO: this is duplicating space, but my brain is fried, should be mapping
-            cached_filepath = os.path.join("sounds", "{}_{}.mp3".format(cached_wordid, lang))
-            word_filepath = os.path.join("sounds", "{}_{}.mp3".format(wordid, lang))
+            cached_filepath = os.path.join(SOUNDS_DIR, "{}_{}.mp3".format(cached_wordid, lang))
+            word_filepath = os.path.join(SOUNDS_DIR, "{}_{}.mp3".format(wordid, lang))
             copyfile(cached_filepath, word_filepath)
     else:
         filename = "{}_{}.mp3".format(wordid, lang)
-        filepath = os.path.join("sounds", filename)
+        filepath = os.path.join(SOUNDS_DIR, filename)
         start = time()
-        tts.gen_speech(word, lang_region, filepath)
+        gen_speech(word, lang_region, filepath)
         duration = time() - start
         print(" Generated ({} - {} s): {}".format(lang, duration, word))
     SOUND_CACHE[(word, lang)] = wordid
@@ -66,7 +85,9 @@ def gen_tts(wordids=None):
             wordid, german, english = row[0], row[1], row[2]
             if wordid not in wordids:
                 continue
-            print("Generating {}: {}, {}".format(wordid, german, english))
+            print("Generating image..")
+            image_gen(wordid, german, english)
+            print("Generating sound {}: {}, {}".format(wordid, german, english))
             backoff, attempt = 1, 0
             while True:
                 try:
@@ -102,18 +123,87 @@ def concatenate(filename="lesson1", wordids=None):
     silence = AudioSegment.silent(duration=SHORT_SILENCE)
     long_silence = AudioSegment.silent(duration=LONG_SILENCE)
     gen_tts(wordids=wordids)
+    images_durations = []
     for wordid in wordids:
         start = time()
         sound_de = AudioSegment.from_mp3("sounds/{}_de.mp3".format(wordid))
         sound_en = AudioSegment.from_mp3("sounds/{}_en.mp3".format(wordid))
-        lessons = lessons + sound_de + silence + sound_en + silence + sound_de + long_silence
+        # track each word's audio length so the video can hold its image for the same time
+        this_phrase = sound_de + silence + sound_en + silence + sound_de + long_silence
+        images_durations.append((wordid, this_phrase.duration_seconds))
+        lessons = lessons + this_phrase
         duration = time() - start
         print("Concatenated {} - {}s".format(wordid, duration))
-    lessons.export(os.path.join("lessons", "{}.mp3".format(filename)), format="mp3")
+    lessons.export(os.path.join(AUDIO_LESSONS_DIR, "{}.mp3".format(filename)), format="mp3")
+    create_video(filename, images_durations)
+
+
+def image_gen(wordid, de_text, en_text):
+    width, height = 800, 450
+    margin, initial_font_size, font_step = 50, 120, 4
+    filename = os.path.join(IMAGES_DIR, "{}.png".format(wordid))
+    image = Image.new(mode="RGB", size=(width, height), color="black")
+    draw = ImageDraw.Draw(image)
+    font_size = initial_font_size
+    # shrink the font until both lines fit within the horizontal margins
+    while True:
+        fnt = ImageFont.truetype('arial.ttf', font_size)
+        de_w, de_h = draw.textsize(de_text, font=fnt)
+        en_w, en_h = draw.textsize(en_text, font=fnt)
+        if de_w + 2 * margin > width or en_w + 2 * margin > width:
+            font_size -= font_step
+            continue
+        break
+    # German on top, English below, with a red divider across the middle
+    draw.text(((width - de_w) / 2, height / 2 - margin - de_h), de_text, font=fnt, fill=(255, 255, 255))
+    draw.text(((width - en_w) / 2, height / 2 + margin), en_text, font=fnt, fill=(255, 255, 255))
+    draw.line((0, height / 2, width, height / 2), fill=(255, 0, 0), width=2)
+    image.save(filename)
+
+
+def create_video(lesson_name, images_durations):
+    tmp_video_name = '{}_tmp.mp4'.format(lesson_name)
+    tmp_video_filepath = os.path.join(VIDEOS_DIR, tmp_video_name)
+    fourcc = cv2.VideoWriter_fourcc(*'MP4V')  # define the video codec
+    frame = cv2.imread(os.path.join(IMAGES_DIR, "{}.png".format(images_durations[0][0])))
+    height, width, layers = frame.shape
+    frames = 24  # frames per second
+    video = cv2.VideoWriter(tmp_video_filepath, fourcc, frames, (width, height))
+    for image, image_duration in images_durations:
+        # hold each word's image for as many frames as its audio lasts
+        image_frames = int(image_duration * frames)
+        for _ in range(image_frames):
+            video.write(cv2.imread(os.path.join(IMAGES_DIR, "{}.png".format(image))))
+    cv2.destroyAllWindows()
+    video.release()
+    # mux the silent video together with the lesson mp3 into the final file
+    os.system("ffmpeg -y -i {video_name} -i {audio_name} -c:v copy -c:a copy {video_output}".format(
+        video_name=tmp_video_filepath,
+        audio_name=os.path.join(AUDIO_LESSONS_DIR, "{}.mp3".format(lesson_name)),
+        video_output=os.path.join(VIDEOS_DIR, "{}.mp4".format(lesson_name)),
+    ))
+    os.remove(tmp_video_filepath)
+
+
+ERROR_MSG = "First argument needs to be the lesson to be generated"
+
 if __name__ == "__main__":
     load_sound_cache()
-    wordids = filter_words("lesson05")
-    concatenate(filename="lesson05", wordids=wordids)
+    if len(sys.argv) != 2:
+        print(ERROR_MSG)
+        exit(1)
+    lesson = sys.argv[1]
+    if not lesson.isdigit():
+        print(ERROR_MSG)
+        exit(1)
+    lesson = int(lesson)
+    if not 0 < lesson < 100:
+        print(ERROR_MSG)
+        exit(1)
+    print(lesson)
+    lesson = "lesson{:02d}".format(lesson)
+    concatenate(filename=lesson,
+                wordids=filter_words(lesson))
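
A note on sync: create_video holds each image for int(duration * fps) frames, so every word can lose up to one frame (about 42 ms at 24 fps) to truncation relative to its audio. A small sketch with hypothetical durations:

```python
fps = 24
images_durations = [('42', 3.5), ('43', 2.71)]  # (wordid, audio seconds), illustrative

video_frames = sum(int(seconds * fps) for _, seconds in images_durations)  # 84 + 65
audio_seconds = sum(seconds for _, seconds in images_durations)            # 6.21
print(video_frames / fps, audio_seconds)  # ~6.208 vs 6.21: the video runs slightly short
```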


requirements.txt

@ -1,18 +1,4 @@
cachetools==3.1.0 numpy==1.22.1
certifi==2019.3.9 opencv-python==4.5.5.62
chardet==3.0.4 Pillow==9.0.0
google-api-core==1.8.0 pydub==0.25.1
google-auth==1.6.3
google-cloud-texttospeech==0.4.0
googleapis-common-protos==1.5.8
grpcio==1.19.0
idna==2.8
protobuf==3.7.1
pyasn1==0.4.5
pyasn1-modules==0.2.4
pydub==0.23.1
pytz==2019.1
requests==2.21.0
rsa==4.0
six==1.12.0
urllib3==1.24.1

File diff suppressed because it is too large

tts.py (34 deletions)

@@ -1,34 +0,0 @@
-from google.cloud import texttospeech
-
-# Instantiates a client
-client = texttospeech.TextToSpeechClient()
-
-
-def gen_speech(text, language_code, output_file):
-    """Synthesizes speech from the input string of text or ssml.
-
-    Note: ssml must be well-formed according to:
-        https://www.w3.org/TR/speech-synthesis/
-    """
-    # Set the text input to be synthesized
-    synthesis_input = texttospeech.types.SynthesisInput(text=text)
-
-    # Build the voice request, select the language code ("en-US") and the ssml
-    # voice gender ("neutral")
-    voice = texttospeech.types.VoiceSelectionParams(
-        language_code=language_code,
-        ssml_gender=texttospeech.enums.SsmlVoiceGender.NEUTRAL)
-
-    # Select the type of audio file you want returned
-    audio_config = texttospeech.types.AudioConfig(
-        audio_encoding=texttospeech.enums.AudioEncoding.MP3)
-
-    # Perform the text-to-speech request on the text input with the selected
-    # voice parameters and audio file type
-    response = client.synthesize_speech(synthesis_input, voice, audio_config)
-
-    # The response's audio_content is binary.
-    with open(output_file, 'wb') as out:
-        # Write the response to the output file.
-        out.write(response.audio_content)
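
For reference, generate.py previously consumed this removed module through a single call, e.g. (paths and values illustrative):

```python
import tts

# synthesize one phrase through Google Cloud TTS straight to mp3
tts.gen_speech('hello world', 'en-US', 'sounds/42_en.mp3')
```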