espeak generation and video
This commit is contained in:
parent
41381e027c
commit
11e440afcc
3
.gitignore
vendored
3
.gitignore
vendored
@ -4,3 +4,6 @@ __pycache__
|
|||||||
*secret*
|
*secret*
|
||||||
sounds
|
sounds
|
||||||
lessons
|
lessons
|
||||||
|
images
|
||||||
|
videos
|
||||||
|
sound_cache.csv
|
@ -1,4 +1,6 @@
|
|||||||
#!/bin/bash
|
#!/bin/bash
|
||||||
|
# System packages used by generate.py: espeak synthesizes the speech,
# ffmpeg converts the audio and combines it with the video.
sudo apt install espeak ffmpeg
|
||||||
|
|
||||||
virtualenv -p python3 venv
|
virtualenv -p python3 venv
|
||||||
source venv/bin/activate
|
source venv/bin/activate
|
||||||
pip install -r requirements.txt
|
pip install -r requirements.txt
|
||||||
|
12
README.md
12
README.md
@ -1,6 +1,8 @@
|
|||||||
# en_de
|
# en_de
|
||||||
|
|
||||||
Generate audio from a dictionary.
|
Generate audio/video from a dictionary.
|
||||||
|
|
||||||
|
Uses espeak for audio generation, Pillow for images with text overlay, OpenCV for assembling the images into a video, and ffmpeg for converting the audio and combining it with the video.
|
||||||
|
|
||||||
## install
|
## install
|
||||||
|
|
||||||
@ -10,6 +12,14 @@ Generate audio from a dictionary.
|
|||||||
|
|
||||||
# run
|
# run
|
||||||
|
|
||||||
|
Pass the number of the lesson to generate (1-99) as the first argument:
|
||||||
|
|
||||||
|
```
|
||||||
|
python generate.py <lesson number>
|
||||||
|
```
|
||||||
|
|
||||||
|
# old run
|
||||||
|
|
||||||
You first need a service account secret from [Google Cloud Text-To-Speech API](https://cloud.google.com/text-to-speech/docs/quickstart-client-libraries#client-libraries-install-python) (go through the steps of `Before you begin` section, the rest is handled by the app).
|
You first need a service account secret from [Google Cloud Text-To-Speech API](https://cloud.google.com/text-to-speech/docs/quickstart-client-libraries#client-libraries-install-python) (go through the steps of `Before you begin` section, the rest is handled by the app).
|
||||||
|
|
||||||
Modify what you need from the `generate.py` file:
|
Modify what you need from the `generate.py` file:
|
||||||
|
112
generate.py
112
generate.py
@ -1,16 +1,26 @@
|
|||||||
|
import sys
|
||||||
|
|
||||||
|
import cv2
|
||||||
|
from PIL import ImageFont, Image, ImageDraw
|
||||||
import csv
|
import csv
|
||||||
import os
|
import os
|
||||||
from shutil import copyfile
|
from shutil import copyfile
|
||||||
from time import sleep, time
|
from time import sleep, time
|
||||||
|
|
||||||
import tts
|
|
||||||
from pydub import AudioSegment
|
from pydub import AudioSegment
|
||||||
|
|
||||||
SHORT_SILENCE = 500
|
SHORT_SILENCE = 500
|
||||||
LONG_SILENCE = 1000
|
LONG_SILENCE = 1000
|
||||||
|
SOUNDS_DIR = "sounds"
|
||||||
|
IMAGES_DIR = "images"
|
||||||
|
AUDIO_LESSONS_DIR = "lessons"
|
||||||
|
VIDEOS_DIR = "videos"
|
||||||
SOUND_CACHE_FPATH = 'sound_cache.csv'
|
SOUND_CACHE_FPATH = 'sound_cache.csv'
|
||||||
WORDS_FPATH = 'words.csv'
|
WORDS_FPATH = 'words.csv'
|
||||||
|
|
||||||
|
for d in [SOUNDS_DIR, IMAGES_DIR, AUDIO_LESSONS_DIR, VIDEOS_DIR]:
|
||||||
|
os.makedirs(d, exist_ok=True)
|
||||||
|
|
||||||
EXPONENTIAL_BACKOFF = 1.5
|
EXPONENTIAL_BACKOFF = 1.5
|
||||||
LANG_REGIONS = {
|
LANG_REGIONS = {
|
||||||
'en': 'en-US',
|
'en': 'en-US',
|
||||||
@ -21,7 +31,7 @@ SOUND_CACHE = {}
|
|||||||
|
|
||||||
|
|
||||||
def load_sound_cache():
|
def load_sound_cache():
|
||||||
with open(SOUND_CACHE_FPATH, 'r') as csvFile:
|
with open(SOUND_CACHE_FPATH, 'w+') as csvFile:
|
||||||
reader = csv.reader(csvFile)
|
reader = csv.reader(csvFile)
|
||||||
for line, row in enumerate(reader):
|
for line, row in enumerate(reader):
|
||||||
wordid, lang, word = row[0], row[1], row[2]
|
wordid, lang, word = row[0], row[1], row[2]
|
||||||
@ -33,6 +43,15 @@ def get_cached_sound(word, lang):
|
|||||||
return wordid
|
return wordid
|
||||||
|
|
||||||
|
|
||||||
|
def gen_speech(phrase, lang, filepath):
    """Synthesize ``phrase`` with espeak and convert the result with ffmpeg.

    espeak writes a temporary WAV file next to ``filepath`` (same name with
    a ``.wav`` extension); ffmpeg then converts it into ``filepath`` and the
    temporary WAV is removed.

    Args:
        phrase: Text to speak. It is passed to espeak as a single argument,
            so quotes or shell metacharacters in it are spoken, not executed.
        lang: Voice name handed to espeak's ``-v`` option.
        filepath: Destination audio file (callers pass an ``.mp3`` path).

    Returns:
        None. Failures of espeak/ffmpeg are reported on stdout rather than
        raised, matching the best-effort behaviour of the previous
        ``os.system`` based implementation.
    """
    import shlex
    import subprocess

    # splitext only strips the final extension, so dots in directory names
    # (e.g. "./sounds/x.mp3") no longer truncate the path.
    wav_fpath = os.path.splitext(filepath)[0] + '.wav'
    # +f1 -k1 -s100
    espeak_cmd = ['espeak', '-v{}'.format(lang), phrase, '--stdout']
    ffmpeg_cmd = ['ffmpeg', '-y', '-i', wav_fpath, filepath]
    same_file = wav_fpath == filepath

    try:
        print(" ".join(shlex.quote(arg) for arg in espeak_cmd))
        with open(wav_fpath, 'wb') as wav_file:
            synthesized = subprocess.run(espeak_cmd, stdout=wav_file).returncode == 0
        # Only convert when espeak succeeded (the old command chained with &&).
        if synthesized and not same_file:
            print(" ".join(shlex.quote(arg) for arg in ffmpeg_cmd))
            subprocess.run(ffmpeg_cmd)
    except OSError as err:
        # Raised when espeak/ffmpeg is not installed or the WAV cannot be written.
        print("Could not generate speech for {!r}: {}".format(phrase, err))
    finally:
        # Never leave the intermediate WAV behind, even when a step failed.
        if not same_file and os.path.exists(wav_fpath):
            os.remove(wav_fpath)
|
||||||
|
|
||||||
|
|
||||||
def generate_sound(word, lang, wordid):
|
def generate_sound(word, lang, wordid):
|
||||||
lang_region = LANG_REGIONS.get(lang)
|
lang_region = LANG_REGIONS.get(lang)
|
||||||
cached_wordid = get_cached_sound(word, lang)
|
cached_wordid = get_cached_sound(word, lang)
|
||||||
@ -40,14 +59,14 @@ def generate_sound(word, lang, wordid):
|
|||||||
print(" Found in cache: {}".format(word))
|
print(" Found in cache: {}".format(word))
|
||||||
if cached_wordid != wordid:
|
if cached_wordid != wordid:
|
||||||
# TODO: this is duplicating space, but my brain is fried, should be mapping
|
# TODO: this is duplicating space, but my brain is fried, should be mapping
|
||||||
cached_filepath = os.path.join("sounds", "{}_{}.mp3".format(cached_wordid, lang))
|
cached_filepath = os.path.join(SOUNDS_DIR, "{}_{}.mp3".format(cached_wordid, lang))
|
||||||
word_filepath = os.path.join("sounds", "{}_{}.mp3".format(wordid, lang))
|
word_filepath = os.path.join(SOUNDS_DIR, "{}_{}.mp3".format(wordid, lang))
|
||||||
copyfile(cached_filepath, word_filepath)
|
copyfile(cached_filepath, word_filepath)
|
||||||
else:
|
else:
|
||||||
filename = "{}_{}.mp3".format(wordid, lang)
|
filename = "{}_{}.mp3".format(wordid, lang)
|
||||||
filepath = os.path.join("sounds", filename)
|
filepath = os.path.join(SOUNDS_DIR, filename)
|
||||||
start = time()
|
start = time()
|
||||||
tts.gen_speech(word, lang_region, filepath)
|
gen_speech(word, lang_region, filepath)
|
||||||
duration = time() - start
|
duration = time() - start
|
||||||
print(" Generated ({} - {} s): {}".format(lang, duration, word))
|
print(" Generated ({} - {} s): {}".format(lang, duration, word))
|
||||||
SOUND_CACHE[(word, lang)] = wordid
|
SOUND_CACHE[(word, lang)] = wordid
|
||||||
@ -66,7 +85,9 @@ def gen_tts(wordids=None):
|
|||||||
wordid, german, english = row[0], row[1], row[2]
|
wordid, german, english = row[0], row[1], row[2]
|
||||||
if wordid not in wordids:
|
if wordid not in wordids:
|
||||||
continue
|
continue
|
||||||
print("Generating {}: {}, {}".format(wordid, german, english))
|
print("Generating image..")
|
||||||
|
image_gen(wordid, german, english)
|
||||||
|
print("Generating sound {}: {}, {}".format(wordid, german, english))
|
||||||
backoff, attempt = 1, 0
|
backoff, attempt = 1, 0
|
||||||
while True:
|
while True:
|
||||||
try:
|
try:
|
||||||
@ -102,18 +123,87 @@ def concatenate(filename="lesson1", wordids=None):
|
|||||||
silence = AudioSegment.silent(duration=SHORT_SILENCE)
|
silence = AudioSegment.silent(duration=SHORT_SILENCE)
|
||||||
long_silence = AudioSegment.silent(duration=LONG_SILENCE)
|
long_silence = AudioSegment.silent(duration=LONG_SILENCE)
|
||||||
gen_tts(wordids=wordids)
|
gen_tts(wordids=wordids)
|
||||||
|
images_durations = []
|
||||||
for wordid in wordids:
|
for wordid in wordids:
|
||||||
start = time()
|
start = time()
|
||||||
sound_de = AudioSegment.from_mp3("sounds/{}_de.mp3".format(wordid))
|
sound_de = AudioSegment.from_mp3("sounds/{}_de.mp3".format(wordid))
|
||||||
sound_en = AudioSegment.from_mp3("sounds/{}_en.mp3".format(wordid))
|
sound_en = AudioSegment.from_mp3("sounds/{}_en.mp3".format(wordid))
|
||||||
lessons = lessons + sound_de + silence + sound_en + silence + sound_de + long_silence
|
this_phrase = sound_de + silence + sound_en + silence + sound_de + long_silence
|
||||||
|
images_durations.append((wordid, this_phrase.duration_seconds))
|
||||||
|
lessons = lessons + this_phrase
|
||||||
duration = time() - start
|
duration = time() - start
|
||||||
print("Concatenated {} - {}s".format(wordid, duration))
|
print("Concatenated {} - {}s".format(wordid, duration))
|
||||||
|
|
||||||
lessons.export(os.path.join("lessons", "{}.mp3".format(filename)), format="mp3")
|
lessons.export(os.path.join(AUDIO_LESSONS_DIR, "{}.mp3".format(filename)), format="mp3")
|
||||||
|
create_video(filename, images_durations)
|
||||||
|
|
||||||
|
|
||||||
|
def image_gen(wordid, de_text, en_text):
    """Render the German/English word pair as a PNG slide.

    An 800x450 black image is created with the German text above and the
    English text below a red horizontal line through the middle. Both texts
    share one font size, which is shrunk in steps from 120 until both fit
    between the side margins. The result is saved as
    ``IMAGES_DIR/<wordid>.png``.

    Args:
        wordid: Identifier used for the output file name.
        de_text: German text, drawn in the upper half.
        en_text: English text, drawn in the lower half.
    """
    width, height = 800, 450
    margin, initial_font_size, font_step = 50, 120, 4
    # Smallest size the shrink loop may reach. Without a floor a text that
    # never fits would drive the size to 0 and below.
    min_font_size = font_step
    max_text_width = width - 2 * margin
    filename = os.path.join(IMAGES_DIR, "{}.png".format(wordid))

    image = Image.new(mode="RGB", size=(width, height), color="black")
    draw = ImageDraw.Draw(image)

    font_size = initial_font_size
    while True:
        # NOTE(review): 'arial.ttf' must be resolvable by Pillow on the host - confirm on Linux.
        fnt = ImageFont.truetype('arial.ttf', font_size)
        de_w, de_h = draw.textsize(de_text, font=fnt)
        en_w, _ = draw.textsize(en_text, font=fnt)
        fits = de_w <= max_text_width and en_w <= max_text_width
        if fits or font_size - font_step < min_font_size:
            # Either both lines fit, or we are at the floor and accept overflow.
            break
        font_size -= font_step

    white = (255, 255, 255)
    # German text sits `margin` above the centre line, English `margin` below it.
    draw.text(((width - de_w) / 2, height / 2 - margin - de_h), de_text, font=fnt, fill=white)
    draw.text(((width - en_w) / 2, height / 2 + margin), en_text, font=fnt, fill=white)
    draw.line((0, height / 2, width, height / 2), fill=(255, 0, 0), width=2)
    image.save(filename)
|
||||||
|
|
||||||
|
|
||||||
|
def create_video(lesson_name, images_durations):
    """Build the lesson video from the word images and the lesson audio.

    A silent temporary video is written with OpenCV at 24 frames per second,
    showing each word image for its duration. ffmpeg then combines it with
    ``AUDIO_LESSONS_DIR/<lesson_name>.mp3`` into
    ``VIDEOS_DIR/<lesson_name>.mp4`` and the temporary video is deleted.

    Args:
        lesson_name: Base name of the lesson audio and of the output video.
        images_durations: Sequence of ``(wordid, seconds)`` pairs; the image
            ``IMAGES_DIR/<wordid>.png`` is shown for ``seconds``.
    """
    if not images_durations:
        # The frame size is taken from the first image, so there is nothing
        # to render without at least one entry.
        print("No images to render for {}, skipping video".format(lesson_name))
        return

    tmp_video_name = '{}_tmp.mp4'.format(lesson_name)
    tmp_video_filepath = os.path.join(VIDEOS_DIR, tmp_video_name)

    fourcc = cv2.VideoWriter_fourcc(*'MP4V')  # define the video codec
    fps = 24

    first_frame = cv2.imread(os.path.join(IMAGES_DIR, "{}.png".format(images_durations[0][0])))
    height, width, _ = first_frame.shape

    video = cv2.VideoWriter(tmp_video_filepath, fourcc, fps, (width, height))
    try:
        elapsed_seconds, frames_written = 0.0, 0
        for wordid, image_duration in images_durations:
            # Read each image once instead of once per written frame.
            frame = cv2.imread(os.path.join(IMAGES_DIR, "{}.png".format(wordid)))
            # Derive the frame count from the cumulative timeline so that the
            # per-image rounding error does not add up and drift from the audio.
            elapsed_seconds += image_duration
            target_frames = max(frames_written, int(round(elapsed_seconds * fps)))
            for _ in range(target_frames - frames_written):
                video.write(frame)
            frames_written = target_frames
    finally:
        # Always finalize the file, even if reading or writing a frame failed.
        video.release()

    os.system("ffmpeg -y -i {video_name} -i {audio_name} -c:v copy -c:a copy {video_output}".format(
        video_name=tmp_video_filepath,
        audio_name=os.path.join(AUDIO_LESSONS_DIR, "{}.mp3".format(lesson_name)),
        video_output=os.path.join(VIDEOS_DIR, "{}.mp4".format(lesson_name)),
    ))
    os.remove(tmp_video_filepath)
|
||||||
|
|
||||||
|
|
||||||
|
ERROR_MSG = "First argument needs to be the lesson to be generated"
|
||||||
|
|
||||||
|
|
||||||
if __name__ == "__main__":
|
if __name__ == "__main__":
|
||||||
load_sound_cache()
|
load_sound_cache()
|
||||||
wordids = filter_words("lesson05")
|
if len(sys.argv) != 2:
|
||||||
concatenate(filename="lesson05", wordids=wordids)
|
print(ERROR_MSG)
|
||||||
|
exit(1)
|
||||||
|
lesson = sys.argv[1]
|
||||||
|
if not lesson.isdigit():
|
||||||
|
print(ERROR_MSG)
|
||||||
|
exit(1)
|
||||||
|
lesson = int(lesson)
|
||||||
|
if not 0 < lesson < 100:
|
||||||
|
print(ERROR_MSG)
|
||||||
|
exit(1)
|
||||||
|
print(lesson)
|
||||||
|
lesson = "lesson{:02d}".format(lesson)
|
||||||
|
concatenate(filename=lesson,
|
||||||
|
wordids=filter_words(lesson))
|
||||||
|
@ -1,18 +1,4 @@
|
|||||||
cachetools==3.1.0
|
numpy==1.22.1
|
||||||
certifi==2019.3.9
|
opencv-python==4.5.5.62
|
||||||
chardet==3.0.4
|
Pillow==9.0.0
|
||||||
google-api-core==1.8.0
|
pydub==0.25.1
|
||||||
google-auth==1.6.3
|
|
||||||
google-cloud-texttospeech==0.4.0
|
|
||||||
googleapis-common-protos==1.5.8
|
|
||||||
grpcio==1.19.0
|
|
||||||
idna==2.8
|
|
||||||
protobuf==3.7.1
|
|
||||||
pyasn1==0.4.5
|
|
||||||
pyasn1-modules==0.2.4
|
|
||||||
pydub==0.23.1
|
|
||||||
pytz==2019.1
|
|
||||||
requests==2.21.0
|
|
||||||
rsa==4.0
|
|
||||||
six==1.12.0
|
|
||||||
urllib3==1.24.1
|
|
||||||
|
1200
sound_cache.csv
1200
sound_cache.csv
File diff suppressed because it is too large
Load Diff
34
tts.py
34
tts.py
@ -1,34 +0,0 @@
|
|||||||
from google.cloud import texttospeech
|
|
||||||
|
|
||||||
# Instantiates a client
|
|
||||||
client = texttospeech.TextToSpeechClient()
|
|
||||||
|
|
||||||
|
|
||||||
def gen_speech(text, language_code, output_file):
|
|
||||||
"""Synthesizes speech from the input string of text or ssml.
|
|
||||||
|
|
||||||
Note: ssml must be well-formed according to:
|
|
||||||
https://www.w3.org/TR/speech-synthesis/
|
|
||||||
"""
|
|
||||||
|
|
||||||
# Set the text input to be synthesized
|
|
||||||
synthesis_input = texttospeech.types.SynthesisInput(text=text)
|
|
||||||
|
|
||||||
# Build the voice request, select the language code ("en-US") and the ssml
|
|
||||||
# voice gender ("neutral")
|
|
||||||
voice = texttospeech.types.VoiceSelectionParams(
|
|
||||||
language_code=language_code,
|
|
||||||
ssml_gender=texttospeech.enums.SsmlVoiceGender.NEUTRAL)
|
|
||||||
|
|
||||||
# Select the type of audio file you want returned
|
|
||||||
audio_config = texttospeech.types.AudioConfig(
|
|
||||||
audio_encoding=texttospeech.enums.AudioEncoding.MP3)
|
|
||||||
|
|
||||||
# Perform the text-to-speech request on the text input with the selected
|
|
||||||
# voice parameters and audio file type
|
|
||||||
response = client.synthesize_speech(synthesis_input, voice, audio_config)
|
|
||||||
|
|
||||||
# The response's audio_content is binary.
|
|
||||||
with open(output_file, 'wb') as out:
|
|
||||||
# Write the response to the output file.
|
|
||||||
out.write(response.audio_content)
|
|
Loading…
Reference in New Issue
Block a user