scrape nimi.li and basic data

Dani Tsvetkov 2024-04-10 07:27:09 +02:00
parent 52dc733cf3
commit 3b25d0c3d5
10 changed files with 1405 additions and 102 deletions

1
.gitignore vendored

@@ -2,3 +2,4 @@ venv
__pycache__
.idea
tts
scrape/data/words


@@ -25,105 +25,3 @@ file 'tts/misc/silence_0.5.mp3'
file 'tts/misc/silence_1.mp3'
file 'tts/misc/phrases.mp4'
file 'tts/misc/silence_0.5.mp3'
file 'tts/tok_phrases/tok/mi_en_sina_li_moku.mp4'
file 'tts/misc/silence_0.5.mp3'
file 'tts/tok_phrases/en/you_and_i_are_eating..mp4'
file 'tts/tok_phrases/tok/mi_en_sina_li_moku.mp4'
file 'tts/misc/silence_1.mp3'
file 'tts/tok_phrases/tok/jan_utala_mute.mp4'
file 'tts/misc/silence_0.5.mp3'
file 'tts/tok_phrases/en/many_warriors.mp4'
file 'tts/tok_phrases/tok/jan_utala_mute.mp4'
file 'tts/misc/silence_1.mp3'
file 'tts/tok_phrases/tok/jan_lili_mute_li_lape.mp4'
file 'tts/misc/silence_0.5.mp3'
file 'tts/tok_phrases/en/the_children_are_sleeping.mp4'
file 'tts/tok_phrases/tok/jan_lili_mute_li_lape.mp4'
file 'tts/misc/silence_1.mp3'
file 'tts/tok_phrases/tok/kiwen_suli_li_pakala_e_tomo_lipu.mp4'
file 'tts/misc/silence_0.5.mp3'
file 'tts/tok_phrases/en/a_big_rock_damaged_the_library.mp4'
file 'tts/tok_phrases/tok/kiwen_suli_li_pakala_e_tomo_lipu.mp4'
file 'tts/misc/silence_1.mp3'
file 'tts/tok_phrases/tok/mi_pakala_lili.mp4'
file 'tts/misc/silence_0.5.mp3'
file 'tts/tok_phrases/en/i_made_a_little_mistake..mp4'
file 'tts/tok_phrases/tok/mi_pakala_lili.mp4'
file 'tts/misc/silence_1.mp3'
file 'tts/tok_phrases/tok/ilo_sina_li_kalama_mute_ike.mp4'
file 'tts/misc/silence_0.5.mp3'
file 'tts/tok_phrases/en/your_instrument_is_making_lots_of_bad_noise..mp4'
file 'tts/tok_phrases/tok/ilo_sina_li_kalama_mute_ike.mp4'
file 'tts/misc/silence_1.mp3'
file 'tts/tok_phrases/tok/kulupu_ni_li_pona_mute.mp4'
file 'tts/misc/silence_0.5.mp3'
file 'tts/tok_phrases/en/this_community_is_very_good..mp4'
file 'tts/tok_phrases/tok/kulupu_ni_li_pona_mute.mp4'
file 'tts/misc/silence_1.mp3'
file 'tts/tok_phrases/tok/jan_lili_li_toki_e_ni:_sina_pona.mp4'
file 'tts/misc/silence_0.5.mp3'
file 'tts/tok_phrases/en/the_child_said_that_youre_good..mp4'
file 'tts/tok_phrases/tok/jan_lili_li_toki_e_ni:_sina_pona.mp4'
file 'tts/misc/silence_1.mp3'
file 'tts/tok_phrases/tok/ona_li_toki_e_ni:_“toki!_sina_pona_lukin”.mp4'
file 'tts/misc/silence_0.5.mp3'
file 'tts/tok_phrases/en/they_said:_“hello!_you_look_good.”.mp4'
file 'tts/tok_phrases/tok/ona_li_toki_e_ni:_“toki!_sina_pona_lukin”.mp4'
file 'tts/misc/silence_1.mp3'
file 'tts/tok_phrases/tok/jan_pali_ni_li_pali_e_tomo_mi:_ona_li_jo_e_kiwen_mute.mp4'
file 'tts/misc/silence_0.5.mp3'
file 'tts/tok_phrases/en/the_worker_with_lots_of_rocks_built_my_home.mp4'
file 'tts/tok_phrases/tok/jan_pali_ni_li_pali_e_tomo_mi:_ona_li_jo_e_kiwen_mute.mp4'
file 'tts/misc/silence_1.mp3'
file 'tts/tok_phrases/tok/kulupu_sina_li_ante_mute.mp4'
file 'tts/misc/silence_0.5.mp3'
file 'tts/tok_phrases/en/your_community_is_very_different..mp4'
file 'tts/tok_phrases/tok/kulupu_sina_li_ante_mute.mp4'
file 'tts/misc/silence_1.mp3'
file 'tts/tok_phrases/tok/jan_ike_li_pakala_e_ilo_mi.mp4'
file 'tts/misc/silence_0.5.mp3'
file 'tts/tok_phrases/en/a_bad_person_broke_my_tools..mp4'
file 'tts/tok_phrases/tok/jan_ike_li_pakala_e_ilo_mi.mp4'
file 'tts/misc/silence_1.mp3'
file 'tts/tok_phrases/tok/mi_pali_e_tomo_ni.mp4'
file 'tts/misc/silence_0.5.mp3'
file 'tts/tok_phrases/en/i_built_this_house..mp4'
file 'tts/tok_phrases/tok/mi_pali_e_tomo_ni.mp4'
file 'tts/misc/silence_1.mp3'
file 'tts/tok_phrases/tok/jan_utala_pona_mute_li_awen_e_kulupu_ni.mp4'
file 'tts/misc/silence_0.5.mp3'
file 'tts/tok_phrases/en/the_good_warriors_protect_this_community..mp4'
file 'tts/tok_phrases/tok/jan_utala_pona_mute_li_awen_e_kulupu_ni.mp4'
file 'tts/misc/silence_1.mp3'
file 'tts/tok_phrases/tok/kulupu_suli_li_awen___li_suli_e_ona.mp4'
file 'tts/misc/silence_0.5.mp3'
file 'tts/tok_phrases/en/the_large_community_endures_and_grows_itself.mp4'
file 'tts/tok_phrases/tok/kulupu_suli_li_awen___li_suli_e_ona.mp4'
file 'tts/misc/silence_1.mp3'
file 'tts/tok_phrases/tok/moku_seli_li_pona_mute.mp4'
file 'tts/misc/silence_0.5.mp3'
file 'tts/tok_phrases/en/warm_food_is_very_good..mp4'
file 'tts/tok_phrases/tok/moku_seli_li_pona_mute.mp4'
file 'tts/misc/silence_1.mp3'
file 'tts/tok_phrases/tok/jan_lili_lape_li_kalama_ala.mp4'
file 'tts/misc/silence_0.5.mp3'
file 'tts/tok_phrases/en/sleeping_children_dont_make_noises..mp4'
file 'tts/tok_phrases/tok/jan_lili_lape_li_kalama_ala.mp4'
file 'tts/misc/silence_1.mp3'
file 'tts/tok_phrases/tok/jan_pali_mute_li_toki_e_ni:_ona_li_wawa___li_kiwen.mp4'
file 'tts/misc/silence_0.5.mp3'
file 'tts/tok_phrases/en/the_workers_said_that_they_are_strong_and_tough..mp4'
file 'tts/tok_phrases/tok/jan_pali_mute_li_toki_e_ni:_ona_li_wawa___li_kiwen.mp4'
file 'tts/misc/silence_1.mp3'
file 'tts/tok_phrases/tok/sina_ante_lukin.mp4'
file 'tts/misc/silence_0.5.mp3'
file 'tts/tok_phrases/en/you_look_different..mp4'
file 'tts/tok_phrases/tok/sina_ante_lukin.mp4'
file 'tts/misc/silence_1.mp3'
file 'tts/tok_phrases/tok/tomo_ni_li_awen_e_seli.mp4'
file 'tts/misc/silence_0.5.mp3'
file 'tts/tok_phrases/en/this_house_preserves_the_heat..mp4'
file 'tts/tok_phrases/tok/tomo_ni_li_awen_e_seli.mp4'
file 'tts/misc/silence_1.mp3'
file 'tts/tok_phrases/lesson_names/end_of_lesson_05.mp4'
file 'tts/misc/silence_0.5.mp3'
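
The lines removed above follow ffmpeg's concat demuxer format, and they repeat one pattern per phrase: the toki pona clip, a short silence, the English clip, the toki pona clip again, and a longer silence. As a rough illustration only (not part of this commit), a list with that shape could be generated like this; the phrases argument, slugs, and output path are assumptions:

# Sketch (not part of this commit): write an ffmpeg concat list that interleaves
# toki pona and English clips with silences, matching the removed list's pattern.
def write_concat_list(phrases, out_path):
    lines = []
    for slug_tok, slug_en in phrases:
        lines.append(f"file 'tts/tok_phrases/tok/{slug_tok}.mp4'")
        lines.append("file 'tts/misc/silence_0.5.mp3'")
        lines.append(f"file 'tts/tok_phrases/en/{slug_en}.mp4'")
        lines.append(f"file 'tts/tok_phrases/tok/{slug_tok}.mp4'")
        lines.append("file 'tts/misc/silence_1.mp3'")
    with open(out_path, 'w') as f:
        f.write('\n'.join(lines) + '\n')

write_concat_list([('mi_pali_e_tomo_ni', 'i_built_this_house.')], 'concat_list.txt')

A list like this is then typically passed to ffmpeg's concat demuxer to stitch the clips into one video.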


134
scrape/app.py Normal file

@@ -0,0 +1,134 @@
import argparse
import json
import logging
import os

import requests
from bs4 import BeautifulSoup, NavigableString

logging.basicConfig()
logger = logging.getLogger()

URL_BASE = 'https://nimi.li/'
URL_WORD = URL_BASE + '{word}'
URL_WORD_2 = 'https://linku.la/words/{word}'


def setup_logging_level(debug=False):
    log_level = logging.DEBUG if debug else logging.INFO
    logger.setLevel(log_level)
    logger.debug("Debugging enabled")


def parse_args():
    parser = argparse.ArgumentParser()
    parser.add_argument('query', nargs='*', default="", help="freeform")
    parser.add_argument('--debug', dest='debug', action='store_true')
    parser.add_argument('--raise-exceptions', dest='raise_exceptions', action='store_true')
    return parser.parse_args()


args = parse_args()
setup_logging_level(args.debug)

DATA_DIR = os.path.join(os.path.dirname(__file__), 'data')
WORDS_DIR = os.path.join(DATA_DIR, 'words')


def get_cache_or_url(url, cache_file):
    # Return the cached response body if present; otherwise fetch and cache it.
    if os.path.exists(cache_file):
        logger.debug("Getting cached response from {}".format(cache_file))
        with open(cache_file, 'r') as f:
            response_text = f.read()
    else:
        os.makedirs(os.path.dirname(cache_file), exist_ok=True)
        logger.debug("Getting response from {}".format(url))
        response = requests.get(url)
        if response.status_code != 200:
            if args.raise_exceptions:
                raise Exception("Error getting response from {}".format(url))
            logger.error("Error getting response from {}".format(url))
        # Note: non-200 bodies are still written to the cache.
        response_text = response.text
        with open(cache_file, 'w+') as f:
            f.write(response_text)
    return response_text


def get_word_list():
    # Scrape the word list from nimi.li, or reuse the cached words.json.
    words_file = os.path.join(DATA_DIR, 'words.json')
    if os.path.exists(words_file):
        logger.debug("Getting cached response from {}".format(words_file))
        with open(words_file, 'r') as f:
            words = json.load(f)
    else:
        words = set()
        cache_file = os.path.join(DATA_DIR, 'all_words.html')
        response_text = get_cache_or_url(URL_BASE, cache_file)
        soup = BeautifulSoup(response_text, 'html.parser')
        word_tags = soup.select('main > div.grid p.font-pona')
        for word_tag in word_tags:
            # Some entries carry a trailing digit; strip it to get the bare word.
            if word_tag.text[-1] in ['0', '1', '2', '3']:
                final_word = word_tag.text[:-1]
            else:
                final_word = word_tag.text
            words.add(final_word)
        # A set is not JSON serializable, so dump a sorted list instead.
        words = sorted(words)
        with open(words_file, 'w+') as f:
            json.dump(words, f, ensure_ascii=False, indent=2)
    return words


def get_word_data(word, url):
    # Scrape the pu definitions for a single word from its linku.la page.
    cache_file = os.path.join(WORDS_DIR, f'{word}.html')
    response_text = get_cache_or_url(url, cache_file)
    soup = BeautifulSoup(response_text, 'html.parser')
    tag = soup.select('body > div:nth-child(1) > div.relative.my-0.mx-auto.p-0.flex.flex-col.min-h-dvh > main > div > div:nth-child(1) > div.p-6.pt-0.flex.flex-col.gap-3 > div:nth-child(3) > ul > li')
    pu_defs = [t.text for t in tag]
    pu_definitions = []
    for pu_def in pu_defs:
        # Each list item starts with the part of speech, followed by the definition text.
        pu_def_split = pu_def.split()
        pos = pu_def_split[0]
        definition = ' '.join(pu_def_split[1:])
        pu_definitions.append({
            "pos": pos,
            "definition": definition,
        })
    return {
        'pu_definitions': pu_definitions
    }


def get_words_data(word_list):
    # Build (or reuse) the word -> definitions mapping stored in words_data.json.
    words_data_file = os.path.join(DATA_DIR, 'words_data.json')
    if os.path.exists(words_data_file):
        logger.debug("Getting cached response from {}".format(words_data_file))
        with open(words_data_file, 'r') as f:
            words_data = json.load(f)
    else:
        words_data = {}
        for word in word_list:
            url = URL_WORD_2.format(word=word)
            word_data = get_word_data(word, url)
            words_data[word] = word_data
        with open(words_data_file, 'w+') as f:
            json.dump(words_data, f, ensure_ascii=False, indent=2)
    return words_data


def main():
    for folder in [WORDS_DIR]:
        os.makedirs(folder, exist_ok=True)
    word_list = get_word_list()
    print(word_list)
    words_data = get_words_data(word_list)
    # Collect every word whose pu definitions include an adjective sense.
    adjectives = []
    for word, data in words_data.items():
        for pu_def in data.get('pu_definitions'):
            if pu_def.get('pos') == 'ADJECTIVE':
                adjectives.append(word)
    print('" | "'.join(adjectives))


if __name__ == "__main__":
    main()
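
As a usage sketch (not part of the commit), the words_data.json cache written by get_words_data() can be regrouped by part of speech, generalizing the ADJECTIVE filter in main(); the relative path and the grouping are assumptions for illustration:

# Sketch (not part of the commit): group scraped words by the part of speech
# recorded in their pu definitions, using the words_data.json cache from above.
import json
from collections import defaultdict

with open('scrape/data/words_data.json') as f:
    words_data = json.load(f)

by_pos = defaultdict(list)
for word, data in words_data.items():
    for pu_def in data['pu_definitions']:
        by_pos[pu_def['pos']].append(word)

for pos, words in sorted(by_pos.items()):
    print(pos, len(words))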

133
scrape/data/words.json Normal file

@@ -0,0 +1,133 @@
[
"a",
"akesi",
"ala",
"alasa",
"ale",
"anpa",
"ante",
"anu",
"awen",
"e",
"en",
"esun",
"ijo",
"ike",
"ilo",
"insa",
"jaki",
"jaki",
"jan",
"jelo",
"jo",
"kala",
"kalama",
"kama",
"kasi",
"ken",
"kepeken",
"kili",
"kiwen",
"ko",
"kon",
"kule",
"kulupu",
"kute",
"la",
"lape",
"laso",
"lawa",
"len",
"lete",
"li",
"lili",
"linja",
"lipu",
"loje",
"lon",
"luka",
"lukin",
"lupa",
"ma",
"mama",
"mani",
"mi",
"moku",
"moli",
"monsi",
"mu",
"mun",
"musi",
"mute",
"nanpa",
"nasa",
"nasin",
"nena",
"nimi",
"noka",
"o",
"olin",
"ona",
"open",
"pakala",
"pali",
"palisa",
"pan",
"pana",
"pi",
"pilin",
"pimeja",
"pini",
"pipi",
"poka",
"poki",
"pona",
"pu",
"sama",
"seli",
"selo",
"seme",
"sewi",
"sijelo",
"sike",
"sin",
"sina",
"sinpin",
"sitelen",
"sona",
"soweli",
"suli",
"suno",
"supa",
"suwi",
"tan",
"taso",
"tawa",
"telo",
"tenpo",
"toki",
"tomo",
"tu",
"unpa",
"uta",
"utala",
"walo",
"wan",
"waso",
"wawa",
"weka",
"wile",
"kijetesantakalu",
"kin",
"kipisi",
"ku",
"leko",
"meli",
"mije",
"misikeke",
"monsuta",
"n",
"namako",
"soko",
"tonsi"
]

1063
scrape/data/words_data.json Normal file

File diff suppressed because it is too large

18
tts_tr.sh Executable file

@@ -0,0 +1,18 @@
#!/bin/bash
set -x
set -e

# arguments: $1 = toki pona phrase in unicode IPA, $2 = English translation, $3 = output basename

# generate silence
# sox -n -r 22050 silence_500.wav trim 0.0 0.500

# Speak the toki pona phrase (IPA converted to espeak phonemes), padded with ". a.".
python3 lexconvert.py --phones2phones unicode-ipa espeak "${1}. a." | espeak -g 1 -s 1 -v en+f4 -w "tts/tok_words/${3}.wav"

# Trim roughly 1.55 s from the end of the clip (the appended ". a." padding).
duration=$(ffprobe -v error -show_entries format=duration -of csv=p=0 "tts/tok_words/${3}.wav")
duration=$(echo "$duration - 1.55" | bc)
ffmpeg -y -ss 00:00:00 -to "$duration" -i "tts/tok_words/${3}.wav" -c copy "tts/tok_words/${3}_fixed.wav"
rm "tts/tok_words/${3}.wav"

# Speak the English translation, then stitch: tok clip, silence, translation, tok clip again.
echo "${2}" | espeak -v en+m4 -w "tts/tok_words/${3}_tr.wav"
sox "tts/tok_words/${3}_fixed.wav" "tts/tok_words/silence_500.wav" "tts/tok_words/${3}_tr.wav" "tts/tok_words/${3}_fixed.wav" "tts/tok_words/${3}_final.wav"
rm "tts/tok_words/${3}_fixed.wav" "tts/tok_words/${3}_tr.wav"

36
web/app.py Normal file

@@ -0,0 +1,36 @@
from flask import Flask, render_template, request

app = Flask(__name__)

WORD_LI = 'li'
PARTICLES = ['en', WORD_LI, 'e', 'la', 'pi', 'o', 'anu']
EMOTICLE = ['a']
PRONOUNS = ['mi', 'sina', 'ona', 'ni']
PREPOSITIONS = ['lon', 'tawa', 'tan', 'sama', 'kepeken']
PREVERBS = ['wile', 'sona', 'awen', 'kama', 'ken', 'lukin']
QUESTIONS = ['seme']


def create_context(sentence):
    words = sentence.split()
    # Return the split sentence so the template has something to render.
    return {'sentence': sentence, 'words': words}


@app.route('/', methods=['GET', 'POST'])
def home():
    ctx = {}
    if request.method == 'POST':
        sentence = request.form.get('sentence')
        ctx = create_context(sentence)
    return render_template('home.html', ctx=ctx)


if __name__ == '__main__':
    app.run(debug=True)
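
One possible direction for create_context (a sketch only, not what this commit implements) is to tag each word of the submitted sentence against the category lists defined above; classify_word is a hypothetical helper and 'content word' is an assumed fallback label:

# Sketch (not in this commit): classify each word against the category lists
# defined in web/app.py; anything not listed falls back to 'content word'.
CATEGORIES = {
    'particle': PARTICLES,
    'pronoun': PRONOUNS,
    'preposition': PREPOSITIONS,
    'preverb': PREVERBS,
    'question': QUESTIONS,
}


def classify_word(word):
    for name, members in CATEGORIES.items():
        if word in members:
            return name
    return 'content word'


def create_context(sentence):
    words = sentence.split()
    return {'sentence': sentence,
            'tagged': [(w, classify_word(w)) for w in words]}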

11
web/templates/home.html Normal file

@@ -0,0 +1,11 @@
{% extends "layout.html" %}

{% block body %}
<form action="/" method="post">
    <input type="text" size="50" name="sentence" value="{{ request.form.sentence }}"/>
    <input type="submit" />
</form>
{% endblock %}

9
web/templates/layout.html Normal file

@@ -0,0 +1,9 @@
<!DOCTYPE html>
<html lang="en">
<head>
    <meta charset="UTF-8">
    <title>Title</title>
</head>
<body>
{% block body %}{% endblock %}
</body>