espeak generation and video
parent 41381e027c
commit 11e440afcc
5  .gitignore (vendored)

@@ -3,4 +3,7 @@ venv
 __pycache__
 *secret*
 sounds
 lessons
+images
+videos
+sound_cache.csv
@@ -1,4 +1,6 @@
 #!/bin/bash
+sudo apt install espeak ffmpeg

 virtualenv -p python3 venv
 source venv/bin/activate
+pip install -r requirements.txt
12  README.md
@@ -1,6 +1,8 @@
 # en_de

-Generate audio from a dictionary.
+Generate audio/video from a dictionary.
+
+Uses espeak for audio generation, Pillow for rendering images with text overlay, OpenCV for assembling the images into video, and ffmpeg for muxing the audio and video together.

 ## install

@@ -10,6 +12,14 @@ Generate audio from a dictionary.

 # run

+Pass the number of the lesson to generate:
+
+```
+python generate.py <lesson number>
+```
+
+# old run
+
 You first need a service account secret from [Google Cloud Text-To-Speech API](https://cloud.google.com/text-to-speech/docs/quickstart-client-libraries#client-libraries-install-python) (go through the steps of the `Before you begin` section; the rest is handled by the app).

 Modify what you need from the `generate.py` file:
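For a concrete feel of the espeak step the README describes, here is a minimal self-contained sketch (it assumes `espeak` and `ffmpeg` are on PATH; the word and file names are made up):

```python
import subprocess

# espeak renders a WAV to stdout; capture it to a file,
# then have ffmpeg convert the WAV to mp3.
with open("haus.wav", "wb") as wav:
    subprocess.run(["espeak", "-vde", "--stdout", "Haus"], stdout=wav, check=True)
subprocess.run(["ffmpeg", "-y", "-i", "haus.wav", "haus.mp3"], check=True)
```

Passing the phrase as its own argv element sidesteps the shell-quoting pitfalls of the `os.system` pipeline in `generate.py` below.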
112  generate.py
@@ -1,16 +1,26 @@
+import sys
+
+import cv2
+from PIL import ImageFont, Image, ImageDraw
 import csv
 import os
 from shutil import copyfile
 from time import sleep, time

-import tts
 from pydub import AudioSegment

 SHORT_SILENCE = 500
 LONG_SILENCE = 1000
+SOUNDS_DIR = "sounds"
+IMAGES_DIR = "images"
+AUDIO_LESSONS_DIR = "lessons"
+VIDEOS_DIR = "videos"
 SOUND_CACHE_FPATH = 'sound_cache.csv'
 WORDS_FPATH = 'words.csv'

+for d in [SOUNDS_DIR, IMAGES_DIR, AUDIO_LESSONS_DIR, VIDEOS_DIR]:
+    os.makedirs(d, exist_ok=True)
+
 EXPONENTIAL_BACKOFF = 1.5
 LANG_REGIONS = {
     'en': 'en-US',
@@ -21,7 +31,7 @@ SOUND_CACHE = {}


 def load_sound_cache():
-    with open(SOUND_CACHE_FPATH, 'r') as csvFile:
+    with open(SOUND_CACHE_FPATH, 'w+') as csvFile:
         reader = csv.reader(csvFile)
         for line, row in enumerate(reader):
             wordid, lang, word = row[0], row[1], row[2]
@@ -33,6 +43,15 @@ def get_cached_sound(word, lang):
     return wordid


+def gen_speech(phrase, lang, filepath):
+    wav_fpath = filepath.split('.')[0] + '.wav'
+    # +f1 -k1 -s100
+    cmd = 'espeak -v{lang} "{phrase}" --stdout > {wav_fpath} && ffmpeg -y -i {wav_fpath} {filepath} && rm {wav_fpath}'
+    cmd = cmd.format(lang=lang, phrase=phrase, filepath=filepath, wav_fpath=wav_fpath)
+    print(cmd)
+    os.system(cmd)
+
+
 def generate_sound(word, lang, wordid):
     lang_region = LANG_REGIONS.get(lang)
     cached_wordid = get_cached_sound(word, lang)
@@ -40,14 +59,14 @@ def generate_sound(word, lang, wordid):
         print(" Found in cache: {}".format(word))
         if cached_wordid != wordid:
             # TODO: this is duplicating space, but my brain is fried, should be mapping
-            cached_filepath = os.path.join("sounds", "{}_{}.mp3".format(cached_wordid, lang))
-            word_filepath = os.path.join("sounds", "{}_{}.mp3".format(wordid, lang))
+            cached_filepath = os.path.join(SOUNDS_DIR, "{}_{}.mp3".format(cached_wordid, lang))
+            word_filepath = os.path.join(SOUNDS_DIR, "{}_{}.mp3".format(wordid, lang))
             copyfile(cached_filepath, word_filepath)
     else:
         filename = "{}_{}.mp3".format(wordid, lang)
-        filepath = os.path.join("sounds", filename)
+        filepath = os.path.join(SOUNDS_DIR, filename)
         start = time()
-        tts.gen_speech(word, lang_region, filepath)
+        gen_speech(word, lang_region, filepath)
         duration = time() - start
         print(" Generated ({} - {} s): {}".format(lang, duration, word))
         SOUND_CACHE[(word, lang)] = wordid
@@ -66,7 +85,9 @@ def gen_tts(wordids=None):
         wordid, german, english = row[0], row[1], row[2]
         if wordid not in wordids:
             continue
-        print("Generating {}: {}, {}".format(wordid, german, english))
+        print("Generating image..")
+        image_gen(wordid, german, english)
+        print("Generating sound {}: {}, {}".format(wordid, german, english))
         backoff, attempt = 1, 0
         while True:
             try:
@@ -102,18 +123,87 @@ def concatenate(filename="lesson1", wordids=None):
     silence = AudioSegment.silent(duration=SHORT_SILENCE)
     long_silence = AudioSegment.silent(duration=LONG_SILENCE)
     gen_tts(wordids=wordids)
+    images_durations = []
     for wordid in wordids:
         start = time()
         sound_de = AudioSegment.from_mp3("sounds/{}_de.mp3".format(wordid))
         sound_en = AudioSegment.from_mp3("sounds/{}_en.mp3".format(wordid))
-        lessons = lessons + sound_de + silence + sound_en + silence + sound_de + long_silence
+        this_phrase = sound_de + silence + sound_en + silence + sound_de + long_silence
+        images_durations.append((wordid, this_phrase.duration_seconds))
+        lessons = lessons + this_phrase
         duration = time() - start
         print("Concatenated {} - {}s".format(wordid, duration))

-    lessons.export(os.path.join("lessons", "{}.mp3".format(filename)), format="mp3")
+    lessons.export(os.path.join(AUDIO_LESSONS_DIR, "{}.mp3".format(filename)), format="mp3")
+    create_video(filename, images_durations)


+def image_gen(wordid, de_text, en_text):
+    width, height = 800, 450
+    margin, initial_font_size, font_step = 50, 120, 4
+    filename = os.path.join(IMAGES_DIR, "{}.png".format(wordid))
+    image = Image.new(mode="RGB", size=(width, height), color="black")
+    draw = ImageDraw.Draw(image)
+    font_size = initial_font_size
+    while True:
+        fnt = ImageFont.truetype('arial.ttf', font_size)
+        de_w, de_h = draw.textsize(de_text, font=fnt)
+        en_w, en_h = draw.textsize(en_text, font=fnt)
+        if de_w + 2 * margin > width or en_w + 2 * margin > width:
+            font_size -= font_step
+            continue
+        break
+    draw.text(((width - de_w) / 2, height / 2 - margin - de_h), de_text, font=fnt, fill=(255, 255, 255))
+    draw.text(((width - en_w) / 2, height / 2 + margin), en_text, font=fnt, fill=(255, 255, 255))
+    draw.line((0, height / 2, width, height / 2), fill=(255, 0, 0), width=2)
+    image.save(filename)
+
+
+def create_video(lesson_name, images_durations):
+    tmp_video_name = '{}_tmp.mp4'.format(lesson_name)
+    tmp_video_filepath = os.path.join(VIDEOS_DIR, tmp_video_name)
+
+    fourcc = cv2.VideoWriter_fourcc(*'MP4V')  # define the video codec
+
+    frame = cv2.imread(os.path.join(IMAGES_DIR, "{}.png".format(images_durations[0][0])))
+    height, width, layers = frame.shape
+
+    frames = 24
+
+    video = cv2.VideoWriter(tmp_video_filepath, fourcc, frames, (width, height))
+    for image, image_duration in images_durations:
+        image_frames = int(image_duration * frames)
+        for _ in range(image_frames):
+            video.write(cv2.imread(os.path.join(IMAGES_DIR, "{}.png".format(image))))
+
+    cv2.destroyAllWindows()
+    video.release()
+
+    os.system("ffmpeg -y -i {video_name} -i {audio_name} -c:v copy -c:a copy {video_output}".format(
+        video_name=tmp_video_filepath,
+        audio_name=os.path.join(AUDIO_LESSONS_DIR, "{}.mp3".format(lesson_name)),
+        video_output=os.path.join(VIDEOS_DIR, "{}.mp4".format(lesson_name)),
+    ))
+    os.remove(tmp_video_filepath)
+
+
+ERROR_MSG = "First argument needs to be the lesson to be generated"
+
+
 if __name__ == "__main__":
     load_sound_cache()
-    wordids = filter_words("lesson05")
-    concatenate(filename="lesson05", wordids=wordids)
+    if len(sys.argv) != 2:
+        print(ERROR_MSG)
+        exit(1)
+    lesson = sys.argv[1]
+    if not lesson.isdigit():
+        print(ERROR_MSG)
+        exit(1)
+    lesson = int(lesson)
+    if not 0 < lesson < 100:
+        print(ERROR_MSG)
+        exit(1)
+    print(lesson)
+    lesson = "lesson{:02d}".format(lesson)
+    concatenate(filename=lesson,
+                wordids=filter_words(lesson))
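The per-word audio pattern in `concatenate()` is German, short pause, English, short pause, German again, long pause. A self-contained pydub sketch of that pattern, with sine-tone stand-ins for the generated mp3s:

```python
from pydub import AudioSegment
from pydub.generators import Sine

# 1 s test tones standing in for sounds/<id>_de.mp3 and sounds/<id>_en.mp3.
sound_de = Sine(440).to_audio_segment(duration=1000)
sound_en = Sine(660).to_audio_segment(duration=1000)
silence = AudioSegment.silent(duration=500)        # SHORT_SILENCE
long_silence = AudioSegment.silent(duration=1000)  # LONG_SILENCE

phrase = sound_de + silence + sound_en + silence + sound_de + long_silence
print(phrase.duration_seconds)  # 5.0 -- the value images_durations records per word
```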
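`image_gen` shrinks the font until both phrases fit inside the margins. As committed, the loop has no lower bound, so a phrase that cannot fit at any size would shrink forever. A guarded variant, as a sketch (the font path and the 12 pt floor are assumptions; `draw.textsize` matches the pinned Pillow 9.0.0):

```python
from PIL import Image, ImageDraw, ImageFont

def fit_font(draw, texts, width, margin, start_size=120, step=4, min_size=12,
             font_path='arial.ttf'):
    # Largest size at which every text fits within width - 2 * margin,
    # floored at min_size so the loop always terminates.
    size = start_size
    while size > min_size:
        fnt = ImageFont.truetype(font_path, size)
        if all(draw.textsize(t, font=fnt)[0] + 2 * margin <= width for t in texts):
            return fnt
        size -= step
    return ImageFont.truetype(font_path, min_size)

# Usage sketch, matching image_gen's 800x450 canvas and 50 px margin:
image = Image.new(mode="RGB", size=(800, 450), color="black")
draw = ImageDraw.Draw(image)
fnt = fit_font(draw, ["Haus", "house"], width=800, margin=50)
```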
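`create_video` allocates `int(image_duration * frames)` frames per image, so truncation can drop up to one frame (about 42 ms at 24 fps) per word, and the video slowly drifts ahead of the audio over a long lesson. A sketch of an allocation that keeps the running total aligned (a hypothetical helper, not part of the commit):

```python
def frames_per_image(durations, fps=24):
    # Allocate whole frames so the cumulative count tracks the audio timeline.
    counts, emitted, elapsed = [], 0, 0.0
    for d in durations:
        elapsed += d
        target = round(elapsed * fps)  # total frames the timeline needs so far
        counts.append(target - emitted)
        emitted = target
    return counts

# Three 3.7 s phrases at 24 fps: truncation gives [88, 88, 88] (11.000 s of video
# for 11.1 s of audio); cumulative rounding gives [89, 89, 88] (~11.083 s).
print(frames_per_image([3.7, 3.7, 3.7]))
```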
requirements.txt

@@ -1,18 +1,4 @@
-cachetools==3.1.0
-certifi==2019.3.9
-chardet==3.0.4
-google-api-core==1.8.0
-google-auth==1.6.3
-google-cloud-texttospeech==0.4.0
-googleapis-common-protos==1.5.8
-grpcio==1.19.0
-idna==2.8
-protobuf==3.7.1
-pyasn1==0.4.5
-pyasn1-modules==0.2.4
-pydub==0.23.1
-pytz==2019.1
-requests==2.21.0
-rsa==4.0
-six==1.12.0
-urllib3==1.24.1
+numpy==1.22.1
+opencv-python==4.5.5.62
+Pillow==9.0.0
+pydub==0.25.1
1200  sound_cache.csv

File diff suppressed because it is too large
34  tts.py
@@ -1,34 +0,0 @@
-from google.cloud import texttospeech
-
-# Instantiates a client
-client = texttospeech.TextToSpeechClient()
-
-
-def gen_speech(text, language_code, output_file):
-    """Synthesizes speech from the input string of text or ssml.
-
-    Note: ssml must be well-formed according to:
-        https://www.w3.org/TR/speech-synthesis/
-    """
-
-    # Set the text input to be synthesized
-    synthesis_input = texttospeech.types.SynthesisInput(text=text)
-
-    # Build the voice request, select the language code ("en-US") and the ssml
-    # voice gender ("neutral")
-    voice = texttospeech.types.VoiceSelectionParams(
-        language_code=language_code,
-        ssml_gender=texttospeech.enums.SsmlVoiceGender.NEUTRAL)
-
-    # Select the type of audio file you want returned
-    audio_config = texttospeech.types.AudioConfig(
-        audio_encoding=texttospeech.enums.AudioEncoding.MP3)
-
-    # Perform the text-to-speech request on the text input with the selected
-    # voice parameters and audio file type
-    response = client.synthesize_speech(synthesis_input, voice, audio_config)
-
-    # The response's audio_content is binary.
-    with open(output_file, 'wb') as out:
-        # Write the response to the output file.
-        out.write(response.audio_content)