scrape nimi li and basic data
This commit is contained in:
parent
52dc733cf3
commit
3b25d0c3d5
1
.gitignore
vendored
1
.gitignore
vendored
@ -2,3 +2,4 @@ venv
|
||||
__pycache__
|
||||
.idea
|
||||
tts
|
||||
scrape/data/words
|
||||
|
102
mylist.txt
102
mylist.txt
@ -25,105 +25,3 @@ file 'tts/misc/silence_0.5.mp3'
|
||||
file 'tts/misc/silence_1.mp3'
|
||||
file 'tts/misc/phrases.mp4'
|
||||
file 'tts/misc/silence_0.5.mp3'
|
||||
file 'tts/tok_phrases/tok/mi_en_sina_li_moku.mp4'
|
||||
file 'tts/misc/silence_0.5.mp3'
|
||||
file 'tts/tok_phrases/en/you_and_i_are_eating..mp4'
|
||||
file 'tts/tok_phrases/tok/mi_en_sina_li_moku.mp4'
|
||||
file 'tts/misc/silence_1.mp3'
|
||||
file 'tts/tok_phrases/tok/jan_utala_mute.mp4'
|
||||
file 'tts/misc/silence_0.5.mp3'
|
||||
file 'tts/tok_phrases/en/many_warriors.mp4'
|
||||
file 'tts/tok_phrases/tok/jan_utala_mute.mp4'
|
||||
file 'tts/misc/silence_1.mp3'
|
||||
file 'tts/tok_phrases/tok/jan_lili_mute_li_lape.mp4'
|
||||
file 'tts/misc/silence_0.5.mp3'
|
||||
file 'tts/tok_phrases/en/the_children_are_sleeping.mp4'
|
||||
file 'tts/tok_phrases/tok/jan_lili_mute_li_lape.mp4'
|
||||
file 'tts/misc/silence_1.mp3'
|
||||
file 'tts/tok_phrases/tok/kiwen_suli_li_pakala_e_tomo_lipu.mp4'
|
||||
file 'tts/misc/silence_0.5.mp3'
|
||||
file 'tts/tok_phrases/en/a_big_rock_damaged_the_library.mp4'
|
||||
file 'tts/tok_phrases/tok/kiwen_suli_li_pakala_e_tomo_lipu.mp4'
|
||||
file 'tts/misc/silence_1.mp3'
|
||||
file 'tts/tok_phrases/tok/mi_pakala_lili.mp4'
|
||||
file 'tts/misc/silence_0.5.mp3'
|
||||
file 'tts/tok_phrases/en/i_made_a_little_mistake..mp4'
|
||||
file 'tts/tok_phrases/tok/mi_pakala_lili.mp4'
|
||||
file 'tts/misc/silence_1.mp3'
|
||||
file 'tts/tok_phrases/tok/ilo_sina_li_kalama_mute_ike.mp4'
|
||||
file 'tts/misc/silence_0.5.mp3'
|
||||
file 'tts/tok_phrases/en/your_instrument_is_making_lots_of_bad_noise..mp4'
|
||||
file 'tts/tok_phrases/tok/ilo_sina_li_kalama_mute_ike.mp4'
|
||||
file 'tts/misc/silence_1.mp3'
|
||||
file 'tts/tok_phrases/tok/kulupu_ni_li_pona_mute.mp4'
|
||||
file 'tts/misc/silence_0.5.mp3'
|
||||
file 'tts/tok_phrases/en/this_community_is_very_good..mp4'
|
||||
file 'tts/tok_phrases/tok/kulupu_ni_li_pona_mute.mp4'
|
||||
file 'tts/misc/silence_1.mp3'
|
||||
file 'tts/tok_phrases/tok/jan_lili_li_toki_e_ni:_sina_pona.mp4'
|
||||
file 'tts/misc/silence_0.5.mp3'
|
||||
file 'tts/tok_phrases/en/the_child_said_that_you’re_good..mp4'
|
||||
file 'tts/tok_phrases/tok/jan_lili_li_toki_e_ni:_sina_pona.mp4'
|
||||
file 'tts/misc/silence_1.mp3'
|
||||
file 'tts/tok_phrases/tok/ona_li_toki_e_ni:_“toki!_sina_pona_lukin”.mp4'
|
||||
file 'tts/misc/silence_0.5.mp3'
|
||||
file 'tts/tok_phrases/en/they_said:_“hello!_you_look_good.”.mp4'
|
||||
file 'tts/tok_phrases/tok/ona_li_toki_e_ni:_“toki!_sina_pona_lukin”.mp4'
|
||||
file 'tts/misc/silence_1.mp3'
|
||||
file 'tts/tok_phrases/tok/jan_pali_ni_li_pali_e_tomo_mi:_ona_li_jo_e_kiwen_mute.mp4'
|
||||
file 'tts/misc/silence_0.5.mp3'
|
||||
file 'tts/tok_phrases/en/the_worker_with_lots_of_rocks_built_my_home.mp4'
|
||||
file 'tts/tok_phrases/tok/jan_pali_ni_li_pali_e_tomo_mi:_ona_li_jo_e_kiwen_mute.mp4'
|
||||
file 'tts/misc/silence_1.mp3'
|
||||
file 'tts/tok_phrases/tok/kulupu_sina_li_ante_mute.mp4'
|
||||
file 'tts/misc/silence_0.5.mp3'
|
||||
file 'tts/tok_phrases/en/your_community_is_very_different..mp4'
|
||||
file 'tts/tok_phrases/tok/kulupu_sina_li_ante_mute.mp4'
|
||||
file 'tts/misc/silence_1.mp3'
|
||||
file 'tts/tok_phrases/tok/jan_ike_li_pakala_e_ilo_mi.mp4'
|
||||
file 'tts/misc/silence_0.5.mp3'
|
||||
file 'tts/tok_phrases/en/a_bad_person_broke_my_tools..mp4'
|
||||
file 'tts/tok_phrases/tok/jan_ike_li_pakala_e_ilo_mi.mp4'
|
||||
file 'tts/misc/silence_1.mp3'
|
||||
file 'tts/tok_phrases/tok/mi_pali_e_tomo_ni.mp4'
|
||||
file 'tts/misc/silence_0.5.mp3'
|
||||
file 'tts/tok_phrases/en/i_built_this_house..mp4'
|
||||
file 'tts/tok_phrases/tok/mi_pali_e_tomo_ni.mp4'
|
||||
file 'tts/misc/silence_1.mp3'
|
||||
file 'tts/tok_phrases/tok/jan_utala_pona_mute_li_awen_e_kulupu_ni.mp4'
|
||||
file 'tts/misc/silence_0.5.mp3'
|
||||
file 'tts/tok_phrases/en/the_good_warriors_protect_this_community..mp4'
|
||||
file 'tts/tok_phrases/tok/jan_utala_pona_mute_li_awen_e_kulupu_ni.mp4'
|
||||
file 'tts/misc/silence_1.mp3'
|
||||
file 'tts/tok_phrases/tok/kulupu_suli_li_awen___li_suli_e_ona.mp4'
|
||||
file 'tts/misc/silence_0.5.mp3'
|
||||
file 'tts/tok_phrases/en/the_large_community_endures_and_grows_itself.mp4'
|
||||
file 'tts/tok_phrases/tok/kulupu_suli_li_awen___li_suli_e_ona.mp4'
|
||||
file 'tts/misc/silence_1.mp3'
|
||||
file 'tts/tok_phrases/tok/moku_seli_li_pona_mute.mp4'
|
||||
file 'tts/misc/silence_0.5.mp3'
|
||||
file 'tts/tok_phrases/en/warm_food_is_very_good..mp4'
|
||||
file 'tts/tok_phrases/tok/moku_seli_li_pona_mute.mp4'
|
||||
file 'tts/misc/silence_1.mp3'
|
||||
file 'tts/tok_phrases/tok/jan_lili_lape_li_kalama_ala.mp4'
|
||||
file 'tts/misc/silence_0.5.mp3'
|
||||
file 'tts/tok_phrases/en/sleeping_children_don’t_make_noises..mp4'
|
||||
file 'tts/tok_phrases/tok/jan_lili_lape_li_kalama_ala.mp4'
|
||||
file 'tts/misc/silence_1.mp3'
|
||||
file 'tts/tok_phrases/tok/jan_pali_mute_li_toki_e_ni:_ona_li_wawa___li_kiwen.mp4'
|
||||
file 'tts/misc/silence_0.5.mp3'
|
||||
file 'tts/tok_phrases/en/the_workers_said_that_they_are_strong_and_tough..mp4'
|
||||
file 'tts/tok_phrases/tok/jan_pali_mute_li_toki_e_ni:_ona_li_wawa___li_kiwen.mp4'
|
||||
file 'tts/misc/silence_1.mp3'
|
||||
file 'tts/tok_phrases/tok/sina_ante_lukin.mp4'
|
||||
file 'tts/misc/silence_0.5.mp3'
|
||||
file 'tts/tok_phrases/en/you_look_different..mp4'
|
||||
file 'tts/tok_phrases/tok/sina_ante_lukin.mp4'
|
||||
file 'tts/misc/silence_1.mp3'
|
||||
file 'tts/tok_phrases/tok/tomo_ni_li_awen_e_seli.mp4'
|
||||
file 'tts/misc/silence_0.5.mp3'
|
||||
file 'tts/tok_phrases/en/this_house_preserves_the_heat..mp4'
|
||||
file 'tts/tok_phrases/tok/tomo_ni_li_awen_e_seli.mp4'
|
||||
file 'tts/misc/silence_1.mp3'
|
||||
file 'tts/tok_phrases/lesson_names/end_of_lesson_05.mp4'
|
||||
file 'tts/misc/silence_0.5.mp3'
|
||||
|
134
scrape/app.py
Normal file
134
scrape/app.py
Normal file
@ -0,0 +1,134 @@
|
||||
import argparse
|
||||
import json
|
||||
import logging
|
||||
import os
|
||||
|
||||
import requests
|
||||
from bs4 import BeautifulSoup, NavigableString
|
||||
|
||||
logging.basicConfig()
|
||||
logger = logging.getLogger()
|
||||
|
||||
URL_BASE = 'https://nimi.li/'
|
||||
URL_WORD = URL_BASE + '{word}'
|
||||
URL_WORD_2 = 'https://linku.la/words/{word}'
|
||||
|
||||
|
||||
def setup_logging_level(debug=False):
    """Switch the module logger to DEBUG when *debug* is true, else INFO."""
    logger.setLevel(logging.DEBUG if debug else logging.INFO)
    logger.debug("Debugging enabled")
|
||||
|
||||
|
||||
def parse_args():
    """Parse command-line arguments.

    Returns an argparse.Namespace with:
      query            — list of free-form words (may be empty)
      debug            — enable debug logging
      raise_exceptions — raise instead of logging on HTTP errors
    """
    parser = argparse.ArgumentParser()
    # nargs='*' produces a list when words are supplied; use a list default
    # so args.query has a consistent type either way (was default="").
    parser.add_argument('query', nargs='*', default=[], help="freeform")
    parser.add_argument('--debug', dest='debug', action='store_true')
    parser.add_argument('--raise-exceptions', dest='raise_exceptions', action='store_true')
    return parser.parse_args()
|
||||
|
||||
|
||||
args = parse_args()
|
||||
setup_logging_level(args.debug)
|
||||
|
||||
DATA_DIR = os.path.join(os.path.dirname(__file__), 'data')
|
||||
WORDS_DIR = os.path.join(DATA_DIR, 'words')
|
||||
|
||||
|
||||
def get_cache_or_url(url, cache_file):
    """Return the body of *url*, using *cache_file* as a persistent cache.

    If the cache file exists its contents are returned without any network
    access. On a cache miss the URL is fetched; with --raise-exceptions a
    non-200 response raises, otherwise it is logged and its body returned.

    BUG FIX: failed (non-200) responses are no longer written to the cache.
    The previous version cached error pages, so every later run silently
    reused the failure instead of retrying.
    """
    if os.path.exists(cache_file):
        logger.debug("Getting cached response from {}".format(cache_file))
        with open(cache_file, 'r') as f:
            return f.read()

    os.makedirs(os.path.dirname(cache_file), exist_ok=True)
    logger.debug("Getting response from {}".format(url))
    response = requests.get(url)
    if response.status_code != 200:
        if args.raise_exceptions:
            raise Exception("Error getting response from {}".format(url))
        logger.error("Error getting response from {}".format(url))
        # Best-effort: return the error body but do not poison the cache.
        return response.text
    with open(cache_file, 'w+') as f:
        f.write(response.text)
    return response.text
|
||||
|
||||
def get_word_list():
    """Return the list of Toki Pona words, scraping nimi.li on a cache miss.

    The scraped list is persisted to data/words.json; subsequent runs load
    that JSON directly.

    BUG FIX: the scrape path collected words into a set and then called
    json.dump(words, ...) — a set is not JSON-serializable, so every cache
    miss raised TypeError. It also made the return type inconsistent
    (list when cached, set when scraped). Both paths now return a sorted
    list, and the set is only used internally for de-duplication.
    """
    words_file = os.path.join(DATA_DIR, 'words.json')
    if os.path.exists(words_file):
        logger.debug("Getting cached response from {}".format(words_file))
        with open(words_file, 'r') as f:
            return json.load(f)

    cache_file = os.path.join(DATA_DIR, 'all_words.html')
    response_text = get_cache_or_url(URL_BASE, cache_file)

    soup = BeautifulSoup(response_text, 'html.parser')

    words = set()  # set: the page can list the same word more than once
    word_tags = soup.select('main > div.grid p.font-pona')
    for word_tag in word_tags:
        text = word_tag.text
        # A trailing digit 0-3 is a glyph-variant marker on nimi.li; strip it.
        if text and text[-1] in ('0', '1', '2', '3'):
            text = text[:-1]
        words.add(text)

    word_list = sorted(words)
    with open(words_file, 'w+') as f:
        json.dump(word_list, f, ensure_ascii=False, indent=2)
    return word_list
|
||||
|
||||
|
||||
def get_word_data(word, url):
    """Scrape the pu definitions for *word* from its linku.la page.

    Returns {'pu_definitions': [{'pos': ..., 'definition': ...}, ...]} where
    each <li> on the page reads "POS definition text ...".
    """
    cache_file = os.path.join(WORDS_DIR, f'{word}.html')
    html = get_cache_or_url(url, cache_file)

    soup = BeautifulSoup(html, 'html.parser')

    items = soup.select('body > div:nth-child(1) > div.relative.my-0.mx-auto.p-0.flex.flex-col.min-h-dvh > main > div > div:nth-child(1) > div.p-6.pt-0.flex.flex-col.gap-3 > div:nth-child(3) > ul > li')

    pu_definitions = []
    for item in items:
        # First whitespace-separated token is the part of speech; the rest,
        # re-joined with single spaces, is the definition text.
        tokens = item.text.split()
        pu_definitions.append({
            "pos": tokens[0],
            "definition": ' '.join(tokens[1:]),
        })

    return {
        'pu_definitions': pu_definitions
    }
|
||||
|
||||
def get_words_data(word_list):
    """Return a {word: scraped-data} mapping, cached in data/words_data.json.

    On a cache hit the JSON file is loaded directly; otherwise each word's
    linku.la page is scraped via get_word_data and the result is persisted.
    """
    words_data_file = os.path.join(DATA_DIR, 'words_data.json')
    if os.path.exists(words_data_file):
        logger.debug("Getting cached response from {}".format(words_data_file))
        with open(words_data_file, 'r') as f:
            return json.load(f)

    words_data = {
        word: get_word_data(word, URL_WORD_2.format(word=word))
        for word in word_list
    }
    with open(words_data_file, 'w+') as f:
        json.dump(words_data, f, ensure_ascii=False, indent=2)
    return words_data
|
||||
|
||||
def main():
    """Scrape the word list plus per-word data, then print ADJECTIVE words.

    Output format: words joined by '" | "' (pasteable into a regex/grammar).
    """
    for folder in [WORDS_DIR]:
        os.makedirs(folder, exist_ok=True)
    word_list = get_word_list()
    print(word_list)
    words_data = get_words_data(word_list)
    # BUG FIX: the previous variable was named `nouns` although the filter
    # selects ADJECTIVE definitions; also guard against a missing
    # 'pu_definitions' key (data.get(...) returned None, which would raise
    # TypeError when iterated).
    adjectives = []
    for word, data in words_data.items():
        for pu_def in data.get('pu_definitions', []):
            if pu_def.get('pos') == 'ADJECTIVE':
                adjectives.append(word)
    print('" | "'.join(adjectives))
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
main()
|
133
scrape/data/words.json
Normal file
133
scrape/data/words.json
Normal file
@ -0,0 +1,133 @@
|
||||
[
|
||||
"a",
|
||||
"akesi",
|
||||
"ala",
|
||||
"alasa",
|
||||
"ale",
|
||||
"anpa",
|
||||
"ante",
|
||||
"anu",
|
||||
"awen",
|
||||
"e",
|
||||
"en",
|
||||
"esun",
|
||||
"ijo",
|
||||
"ike",
|
||||
"ilo",
|
||||
"insa",
|
||||
"jaki",
|
||||
"jan",
|
||||
"jelo",
|
||||
"jo",
|
||||
"kala",
|
||||
"kalama",
|
||||
"kama",
|
||||
"kasi",
|
||||
"ken",
|
||||
"kepeken",
|
||||
"kili",
|
||||
"kiwen",
|
||||
"ko",
|
||||
"kon",
|
||||
"kule",
|
||||
"kulupu",
|
||||
"kute",
|
||||
"la",
|
||||
"lape",
|
||||
"laso",
|
||||
"lawa",
|
||||
"len",
|
||||
"lete",
|
||||
"li",
|
||||
"lili",
|
||||
"linja",
|
||||
"lipu",
|
||||
"loje",
|
||||
"lon",
|
||||
"luka",
|
||||
"lukin",
|
||||
"lupa",
|
||||
"ma",
|
||||
"mama",
|
||||
"mani",
|
||||
"mi",
|
||||
"moku",
|
||||
"moli",
|
||||
"monsi",
|
||||
"mu",
|
||||
"mun",
|
||||
"musi",
|
||||
"mute",
|
||||
"nanpa",
|
||||
"nasa",
|
||||
"nasin",
|
||||
"nena",
|
||||
"nimi",
|
||||
"noka",
|
||||
"o",
|
||||
"olin",
|
||||
"ona",
|
||||
"open",
|
||||
"pakala",
|
||||
"pali",
|
||||
"palisa",
|
||||
"pan",
|
||||
"pana",
|
||||
"pi",
|
||||
"pilin",
|
||||
"pimeja",
|
||||
"pini",
|
||||
"pipi",
|
||||
"poka",
|
||||
"poki",
|
||||
"pona",
|
||||
"pu",
|
||||
"sama",
|
||||
"seli",
|
||||
"selo",
|
||||
"seme",
|
||||
"sewi",
|
||||
"sijelo",
|
||||
"sike",
|
||||
"sin",
|
||||
"sina",
|
||||
"sinpin",
|
||||
"sitelen",
|
||||
"sona",
|
||||
"soweli",
|
||||
"suli",
|
||||
"suno",
|
||||
"supa",
|
||||
"suwi",
|
||||
"tan",
|
||||
"taso",
|
||||
"tawa",
|
||||
"telo",
|
||||
"tenpo",
|
||||
"toki",
|
||||
"tomo",
|
||||
"tu",
|
||||
"unpa",
|
||||
"uta",
|
||||
"utala",
|
||||
"walo",
|
||||
"wan",
|
||||
"waso",
|
||||
"wawa",
|
||||
"weka",
|
||||
"wile",
|
||||
"kijetesantakalu",
|
||||
"kin",
|
||||
"kipisi",
|
||||
"ku",
|
||||
"leko",
|
||||
"meli",
|
||||
"mije",
|
||||
"misikeke",
|
||||
"monsuta",
|
||||
"n",
|
||||
"namako",
|
||||
"soko",
|
||||
"tonsi"
|
||||
]
|
1063
scrape/data/words_data.json
Normal file
1063
scrape/data/words_data.json
Normal file
File diff suppressed because it is too large
Load Diff
18
tts_tr.sh
Executable file
18
tts_tr.sh
Executable file
@ -0,0 +1,18 @@
|
||||
#!/bin/bash
# Build a word clip: speak the Toki Pona IPA ($1) with espeak, trim the
# trailing padding, speak the English translation ($2), then splice
# tok + silence + translation + tok into tts/tok_words/${3}_final.wav.
#   $1 — unicode IPA for the Toki Pona word
#   $2 — English translation text
#   $3 — output file basename
set -x
set -e

# generate silence
# sox -n -r 22050 silence_500.wav trim 0.0 0.500

python3 lexconvert.py --phones2phones unicode-ipa espeak "${1}. a." | espeak -g 1 -s 1 -v en+f4 -w "tts/tok_words/${3}.wav"
# BUG FIX: the ffprobe argument was unquoted, breaking on any special
# character in $3; also use $(...) instead of backticks and quote the
# bc input so word-splitting can't mangle it.
duration=$(ffprobe -v error -show_entries format=duration -of csv=p=0 "tts/tok_words/${3}.wav")
# espeak pads the clip; cut ~1.55 s off the end.
duration=$(echo "$duration - 1.55" | bc)
ffmpeg -y -ss 00:00:00 -to "$duration" -i "tts/tok_words/${3}.wav" -c copy "tts/tok_words/${3}_fixed.wav"
rm "tts/tok_words/${3}.wav"

echo "${2}" | espeak -v en+m4 -w "tts/tok_words/${3}_tr.wav"

sox "tts/tok_words/${3}_fixed.wav" "tts/tok_words/silence_500.wav" "tts/tok_words/${3}_tr.wav" "tts/tok_words/${3}_fixed.wav" "tts/tok_words/${3}_final.wav"

rm "tts/tok_words/${3}_fixed.wav" "tts/tok_words/${3}_tr.wav"
|
36
web/app.py
Normal file
36
web/app.py
Normal file
@ -0,0 +1,36 @@
|
||||
from flask import Flask, render_template, request
|
||||
|
||||
app = Flask(__name__)
|
||||
|
||||
WORD_LI = 'li'
|
||||
|
||||
PARTICLES = ['en', WORD_LI, 'e', 'la', 'pi', 'o', 'anu']
|
||||
|
||||
EMOTICLE = ['a']
|
||||
|
||||
PRONOUNS = ['mi', 'sina', 'ona', 'ni']
|
||||
|
||||
PREPOSITIONS = ['lon', 'tawa', 'tan', 'sama', 'kepeken']
|
||||
|
||||
PREVERBS = ['wile', 'sona', 'awen', 'kama', 'ken', 'lukin']
|
||||
|
||||
QUESTIONS = ['seme']
|
||||
|
||||
|
||||
|
||||
|
||||
def create_context(sentence):
    """Build the template context for *sentence*.

    Returns a dict with the raw sentence and its whitespace-split words.

    BUG FIX: the previous version computed `words` but fell off the end of
    the function, returning None — so home() handed ctx=None to the
    template after every POST.
    """
    words = sentence.split()
    return {'sentence': sentence, 'words': words}
|
||||
|
||||
|
||||
@app.route('/', methods=['GET', 'POST'])
def home():
    """Render the home page; a POST parses the submitted sentence into ctx."""
    if request.method != 'POST':
        return render_template('home.html', ctx={})
    sentence = request.form.get('sentence')
    return render_template('home.html', ctx=create_context(sentence))
|
||||
|
||||
|
||||
if __name__ == '__main__':
|
||||
app.run(debug=True)
|
11
web/templates/home.html
Normal file
11
web/templates/home.html
Normal file
@ -0,0 +1,11 @@
|
||||
{# Home page: one text input that POSTs the sentence back to "/" and
   re-displays the submitted value. #}
{% extends "layout.html" %}
{% block body %}

<form action="/" method="post">
    <input type="text" size="50" name="sentence" value="{{ request.form.sentence }}"/>
    <input type="submit" />

</form>


{% endblock %}
|
9
web/templates/layout.html
Normal file
9
web/templates/layout.html
Normal file
@ -0,0 +1,9 @@
|
||||
<!-- Base layout: pages extend this and fill the "body" block. -->
<!DOCTYPE html>
<html lang="en">
<head>
    <meta charset="UTF-8">
    <title>Title</title>
</head>
<body>
{% block body %}{% endblock %}
</body>
Loading…
Reference in New Issue
Block a user