From 3b25d0c3d53ff7fed22589d33ccabcf3daa1f624 Mon Sep 17 00:00:00 2001 From: Dani Tsvetkov Date: Wed, 10 Apr 2024 07:27:09 +0200 Subject: [PATCH] scrape nimi li and basic data --- .gitignore | 1 + mylist.txt | 102 ---- requirements.txt | 0 scrape/app.py | 134 +++++ scrape/data/words.json | 133 +++++ scrape/data/words_data.json | 1063 +++++++++++++++++++++++++++++++++++ tts_tr.sh | 18 + web/app.py | 36 ++ web/templates/home.html | 11 + web/templates/layout.html | 9 + 10 files changed, 1405 insertions(+), 102 deletions(-) delete mode 100644 requirements.txt create mode 100644 scrape/app.py create mode 100644 scrape/data/words.json create mode 100644 scrape/data/words_data.json create mode 100755 tts_tr.sh create mode 100644 web/app.py create mode 100644 web/templates/home.html create mode 100644 web/templates/layout.html diff --git a/.gitignore b/.gitignore index e1d042a..b1d9057 100644 --- a/.gitignore +++ b/.gitignore @@ -2,3 +2,4 @@ venv __pycache__ .idea tts +scrape/data/words diff --git a/mylist.txt b/mylist.txt index ff44180..6a58272 100644 --- a/mylist.txt +++ b/mylist.txt @@ -25,105 +25,3 @@ file 'tts/misc/silence_0.5.mp3' file 'tts/misc/silence_1.mp3' file 'tts/misc/phrases.mp4' file 'tts/misc/silence_0.5.mp3' -file 'tts/tok_phrases/tok/mi_en_sina_li_moku.mp4' -file 'tts/misc/silence_0.5.mp3' -file 'tts/tok_phrases/en/you_and_i_are_eating..mp4' -file 'tts/tok_phrases/tok/mi_en_sina_li_moku.mp4' -file 'tts/misc/silence_1.mp3' -file 'tts/tok_phrases/tok/jan_utala_mute.mp4' -file 'tts/misc/silence_0.5.mp3' -file 'tts/tok_phrases/en/many_warriors.mp4' -file 'tts/tok_phrases/tok/jan_utala_mute.mp4' -file 'tts/misc/silence_1.mp3' -file 'tts/tok_phrases/tok/jan_lili_mute_li_lape.mp4' -file 'tts/misc/silence_0.5.mp3' -file 'tts/tok_phrases/en/the_children_are_sleeping.mp4' -file 'tts/tok_phrases/tok/jan_lili_mute_li_lape.mp4' -file 'tts/misc/silence_1.mp3' -file 'tts/tok_phrases/tok/kiwen_suli_li_pakala_e_tomo_lipu.mp4' -file 'tts/misc/silence_0.5.mp3' -file 
'tts/tok_phrases/en/a_big_rock_damaged_the_library.mp4' -file 'tts/tok_phrases/tok/kiwen_suli_li_pakala_e_tomo_lipu.mp4' -file 'tts/misc/silence_1.mp3' -file 'tts/tok_phrases/tok/mi_pakala_lili.mp4' -file 'tts/misc/silence_0.5.mp3' -file 'tts/tok_phrases/en/i_made_a_little_mistake..mp4' -file 'tts/tok_phrases/tok/mi_pakala_lili.mp4' -file 'tts/misc/silence_1.mp3' -file 'tts/tok_phrases/tok/ilo_sina_li_kalama_mute_ike.mp4' -file 'tts/misc/silence_0.5.mp3' -file 'tts/tok_phrases/en/your_instrument_is_making_lots_of_bad_noise..mp4' -file 'tts/tok_phrases/tok/ilo_sina_li_kalama_mute_ike.mp4' -file 'tts/misc/silence_1.mp3' -file 'tts/tok_phrases/tok/kulupu_ni_li_pona_mute.mp4' -file 'tts/misc/silence_0.5.mp3' -file 'tts/tok_phrases/en/this_community_is_very_good..mp4' -file 'tts/tok_phrases/tok/kulupu_ni_li_pona_mute.mp4' -file 'tts/misc/silence_1.mp3' -file 'tts/tok_phrases/tok/jan_lili_li_toki_e_ni:_sina_pona.mp4' -file 'tts/misc/silence_0.5.mp3' -file 'tts/tok_phrases/en/the_child_said_that_you’re_good..mp4' -file 'tts/tok_phrases/tok/jan_lili_li_toki_e_ni:_sina_pona.mp4' -file 'tts/misc/silence_1.mp3' -file 'tts/tok_phrases/tok/ona_li_toki_e_ni:_“toki!_sina_pona_lukin”.mp4' -file 'tts/misc/silence_0.5.mp3' -file 'tts/tok_phrases/en/they_said:_“hello!_you_look_good.”.mp4' -file 'tts/tok_phrases/tok/ona_li_toki_e_ni:_“toki!_sina_pona_lukin”.mp4' -file 'tts/misc/silence_1.mp3' -file 'tts/tok_phrases/tok/jan_pali_ni_li_pali_e_tomo_mi:_ona_li_jo_e_kiwen_mute.mp4' -file 'tts/misc/silence_0.5.mp3' -file 'tts/tok_phrases/en/the_worker_with_lots_of_rocks_built_my_home.mp4' -file 'tts/tok_phrases/tok/jan_pali_ni_li_pali_e_tomo_mi:_ona_li_jo_e_kiwen_mute.mp4' -file 'tts/misc/silence_1.mp3' -file 'tts/tok_phrases/tok/kulupu_sina_li_ante_mute.mp4' -file 'tts/misc/silence_0.5.mp3' -file 'tts/tok_phrases/en/your_community_is_very_different..mp4' -file 'tts/tok_phrases/tok/kulupu_sina_li_ante_mute.mp4' -file 'tts/misc/silence_1.mp3' -file 
'tts/tok_phrases/tok/jan_ike_li_pakala_e_ilo_mi.mp4' -file 'tts/misc/silence_0.5.mp3' -file 'tts/tok_phrases/en/a_bad_person_broke_my_tools..mp4' -file 'tts/tok_phrases/tok/jan_ike_li_pakala_e_ilo_mi.mp4' -file 'tts/misc/silence_1.mp3' -file 'tts/tok_phrases/tok/mi_pali_e_tomo_ni.mp4' -file 'tts/misc/silence_0.5.mp3' -file 'tts/tok_phrases/en/i_built_this_house..mp4' -file 'tts/tok_phrases/tok/mi_pali_e_tomo_ni.mp4' -file 'tts/misc/silence_1.mp3' -file 'tts/tok_phrases/tok/jan_utala_pona_mute_li_awen_e_kulupu_ni.mp4' -file 'tts/misc/silence_0.5.mp3' -file 'tts/tok_phrases/en/the_good_warriors_protect_this_community..mp4' -file 'tts/tok_phrases/tok/jan_utala_pona_mute_li_awen_e_kulupu_ni.mp4' -file 'tts/misc/silence_1.mp3' -file 'tts/tok_phrases/tok/kulupu_suli_li_awen___li_suli_e_ona.mp4' -file 'tts/misc/silence_0.5.mp3' -file 'tts/tok_phrases/en/the_large_community_endures_and_grows_itself.mp4' -file 'tts/tok_phrases/tok/kulupu_suli_li_awen___li_suli_e_ona.mp4' -file 'tts/misc/silence_1.mp3' -file 'tts/tok_phrases/tok/moku_seli_li_pona_mute.mp4' -file 'tts/misc/silence_0.5.mp3' -file 'tts/tok_phrases/en/warm_food_is_very_good..mp4' -file 'tts/tok_phrases/tok/moku_seli_li_pona_mute.mp4' -file 'tts/misc/silence_1.mp3' -file 'tts/tok_phrases/tok/jan_lili_lape_li_kalama_ala.mp4' -file 'tts/misc/silence_0.5.mp3' -file 'tts/tok_phrases/en/sleeping_children_don’t_make_noises..mp4' -file 'tts/tok_phrases/tok/jan_lili_lape_li_kalama_ala.mp4' -file 'tts/misc/silence_1.mp3' -file 'tts/tok_phrases/tok/jan_pali_mute_li_toki_e_ni:_ona_li_wawa___li_kiwen.mp4' -file 'tts/misc/silence_0.5.mp3' -file 'tts/tok_phrases/en/the_workers_said_that_they_are_strong_and_tough..mp4' -file 'tts/tok_phrases/tok/jan_pali_mute_li_toki_e_ni:_ona_li_wawa___li_kiwen.mp4' -file 'tts/misc/silence_1.mp3' -file 'tts/tok_phrases/tok/sina_ante_lukin.mp4' -file 'tts/misc/silence_0.5.mp3' -file 'tts/tok_phrases/en/you_look_different..mp4' -file 'tts/tok_phrases/tok/sina_ante_lukin.mp4' -file 
'tts/misc/silence_1.mp3' -file 'tts/tok_phrases/tok/tomo_ni_li_awen_e_seli.mp4' -file 'tts/misc/silence_0.5.mp3' -file 'tts/tok_phrases/en/this_house_preserves_the_heat..mp4' -file 'tts/tok_phrases/tok/tomo_ni_li_awen_e_seli.mp4' -file 'tts/misc/silence_1.mp3' -file 'tts/tok_phrases/lesson_names/end_of_lesson_05.mp4' -file 'tts/misc/silence_0.5.mp3' diff --git a/requirements.txt b/requirements.txt deleted file mode 100644 index e69de29..0000000 diff --git a/scrape/app.py b/scrape/app.py new file mode 100644 index 0000000..268664b --- /dev/null +++ b/scrape/app.py @@ -0,0 +1,134 @@
+import argparse
+import json
+import logging
+import os
+
+import requests
+from bs4 import BeautifulSoup, NavigableString
+
+logging.basicConfig()
+logger = logging.getLogger()
+
+URL_BASE = 'https://nimi.li/'
+URL_WORD = URL_BASE + '{word}'
+URL_WORD_2 = 'https://linku.la/words/{word}'
+
+
+def setup_logging_level(debug=False):
+    """Set the root logger to DEBUG when *debug* is true, otherwise to INFO."""
+    log_level = logging.DEBUG if debug else logging.INFO
+    logger.setLevel(log_level)
+    logger.debug("Debugging enabled")
+
+
+def parse_args():
+    """Parse the scraper's command-line arguments and return the namespace."""
+    parser = argparse.ArgumentParser()
+    parser.add_argument('query', nargs='*', default="", help="freeform")
+    parser.add_argument('--debug', dest='debug', action='store_true')
+    parser.add_argument('--raise-exceptions', dest='raise_exceptions', action='store_true')
+    return parser.parse_args()
+
+
+# NOTE: arguments are parsed at import time; get_cache_or_url() reads `args`.
+args = parse_args()
+setup_logging_level(args.debug)
+
+DATA_DIR = os.path.join(os.path.dirname(__file__), 'data')
+WORDS_DIR = os.path.join(DATA_DIR, 'words')
+
+
+def get_cache_or_url(url, cache_file):
+    """Return the body of *url*, using *cache_file* as an on-disk cache.
+
+    If *cache_file* exists, its content is returned and no request is made.
+    Otherwise the URL is fetched and a 200 response is written to
+    *cache_file*. A non-200 response raises ``Exception`` when
+    ``--raise-exceptions`` was given; otherwise it is logged and its body is
+    returned WITHOUT being cached, so that an error page is never mistaken
+    for real data on a later run.
+    """
+    if os.path.exists(cache_file):
+        logger.debug("Getting cached response from %s", cache_file)
+        with open(cache_file, 'r', encoding='utf-8') as f:
+            return f.read()
+
+    logger.debug("Getting response from %s", url)
+    # A timeout keeps the scraper from hanging forever on a stalled server.
+    response = requests.get(url, timeout=30)
+    if response.status_code != 200:
+        if args.raise_exceptions:
+            raise Exception("Error getting response from {}".format(url))
+        logger.error("Error getting response from %s", url)
+        return response.text
+
+    cache_dir = os.path.dirname(cache_file)
+    if cache_dir:
+        os.makedirs(cache_dir, exist_ok=True)
+    with open(cache_file, 'w', encoding='utf-8') as f:
+        f.write(response.text)
+    return response.text
+
+
+def get_word_list():
+    """Return the list of words, scraping URL_BASE when no cache exists.
+
+    The list is loaded from ``data/words.json`` when that file is present.
+    Otherwise the words are extracted from the page at URL_BASE, written to
+    that file, and returned in page order without duplicates.
+    """
+    words_file = os.path.join(DATA_DIR, 'words.json')
+    if os.path.exists(words_file):
+        logger.debug("Getting cached response from %s", words_file)
+        with open(words_file, 'r', encoding='utf-8') as f:
+            return json.load(f)
+
+    cache_file = os.path.join(DATA_DIR, 'all_words.html')
+    response_text = get_cache_or_url(URL_BASE, cache_file)
+    soup = BeautifulSoup(response_text, 'html.parser')
+
+    # dict keys keep insertion order and drop duplicates. A set must not be
+    # used here: json.dump() cannot serialize one and raises TypeError.
+    unique_words = {}
+    for word_tag in soup.select('main > div.grid p.font-pona'):
+        text = word_tag.text
+        # Drop a single trailing digit 0-3 (presumably a marker rendered
+        # next to the word -- TODO confirm against the page markup).
+        if text[-1:] in ('0', '1', '2', '3'):
+            text = text[:-1]
+        if text:
+            unique_words[text] = None
+
+    words = list(unique_words)
+    with open(words_file, 'w', encoding='utf-8') as f:
+        json.dump(words, f, ensure_ascii=False, indent=2)
+    return words
+
+
+def get_word_data(word, url):
+    """Fetch (or load from cache) the page for *word* and extract definitions.
+
+    Returns ``{'pu_definitions': [{'pos': ..., 'definition': ...}, ...]}``.
+    For every matched list item the first whitespace-separated token is
+    taken as the part of speech and the remaining tokens, re-joined with
+    single spaces, as the definition. Items without text are skipped.
+    """
+    cache_file = os.path.join(WORDS_DIR, f'{word}.html')
+    response_text = get_cache_or_url(url, cache_file)
+
+    soup = BeautifulSoup(response_text, 'html.parser')
+
+    # Positional selector tied to the current page layout; it matches nothing
+    # (and yields an empty list) if the markup differs.
+    selector = 'body > div:nth-child(1) > div.relative.my-0.mx-auto.p-0.flex.flex-col.min-h-dvh > main > div > div:nth-child(1) > div.p-6.pt-0.flex.flex-col.gap-3 > div:nth-child(3) > ul > li'
+    pu_definitions = []
+    for item in soup.select(selector):
+        tokens = item.text.split()
+        if not tokens:
+            continue
+        pu_definitions.append({
+            "pos": tokens[0],
+            "definition": ' '.join(tokens[1:]),
+        })
+
+    return {
+        'pu_definitions': pu_definitions
+    }
+
+
+def get_words_data(word_list):
+    """Return a ``{word: word_data}`` mapping for every word in *word_list*.
+
+    The mapping is loaded from ``data/words_data.json`` when that file
+    exists; otherwise each word is looked up via URL_WORD_2 and the result
+    is written to that file.
+    """
+    words_data_file = os.path.join(DATA_DIR, 'words_data.json')
+    if os.path.exists(words_data_file):
+        logger.debug("Getting cached response from %s", words_data_file)
+        with open(words_data_file, 'r', encoding='utf-8') as f:
+            return json.load(f)
+
+    words_data = {
+        word: get_word_data(word, URL_WORD_2.format(word=word))
+        for word in word_list
+    }
+    with open(words_data_file, 'w', encoding='utf-8') as f:
+        json.dump(words_data, f, ensure_ascii=False, indent=2)
+    return words_data
+
+
+def main():
+    """Build the word list and per-word data, then print the adjectives."""
+    for folder in [WORDS_DIR]:
+        os.makedirs(folder, exist_ok=True)
+    word_list = get_word_list()
+    print(word_list)
+    words_data = get_words_data(word_list)
+    # Each word is listed once, even if several of its definitions are
+    # adjectives; a missing 'pu_definitions' key is treated as "none".
+    adjectives = [
+        word
+        for word, data in words_data.items()
+        if any(
+            pu_def.get('pos') == 'ADJECTIVE'
+            for pu_def in data.get('pu_definitions') or []
+        )
+    ]
+    print('" | "'.join(adjectives))
+
+
+if __name__ == "__main__":
+    main()
\ No newline at end of file
diff --git a/scrape/data/words.json b/scrape/data/words.json new file mode 100644 index 0000000..5614463 --- /dev/null +++ b/scrape/data/words.json @@ -0,0 +1,133 @@ +[ + "a", + "akesi", + "ala", + "alasa", + "ale", + "anpa", + "ante", + "anu", + "awen", + "e", + "en", + "esun", + "ijo", + "ike", + "ilo", + "insa", + "jaki", + "jaki", + "jan", + "jelo", + "jo", + "kala", + "kalama", + "kama", + "kasi", + "ken", + "kepeken", + "kili", + "kiwen", + "ko", + "kon", + "kule", + "kulupu", + "kute", + "la", + "lape", + "laso", + "lawa", + "len", + "lete", + "li", + "lili", + "linja", + "lipu", + "loje", + "lon", + "luka", + "lukin", + "lupa", + "ma", + "mama", + "mani", + "mi", + "moku", + "moli", + "monsi", + "mu", + "mun", + "musi", + "mute", + "nanpa", + "nasa", + "nasin", + "nena", + "nimi", + "noka", + "o", + "olin", + "ona", + "open", + "pakala", + "pali", + "palisa", + "pan", + "pana", + "pi", + "pilin", + "pimeja", + "pini", + "pipi", + "poka", + "poki", + "pona", + "pu", + "sama", + "seli", + "selo", + "seme", + "sewi", + "sijelo", + "sike", + "sin", + "sina", + "sinpin", + "sitelen", + "sona", + "soweli", + "suli", + "suno", + "supa", + "suwi", + "tan", + "taso", + "tawa", + "telo", + "tenpo", + "toki", + "tomo", + "tu", + "unpa", + "uta", + "utala", + "walo", + "wan", + "waso", + "wawa", + "weka", + "wile", + "kijetesantakalu", +
"kin", + "kipisi", + "ku", + "leko", + "meli", + "mije", + "misikeke", + "monsuta", + "n", + "namako", + "soko", + "tonsi" +] \ No newline at end of file diff --git a/scrape/data/words_data.json b/scrape/data/words_data.json new file mode 100644 index 0000000..a1088ac --- /dev/null +++ b/scrape/data/words_data.json @@ -0,0 +1,1063 @@ +{ + "a": { + "pu_definitions": [ + { + "pos": "PARTICLE", + "definition": "(emphasis, emotion or confirmation)" + } + ] + }, + "akesi": { + "pu_definitions": [ + { + "pos": "NOUN", + "definition": "non-cute animal; reptile, amphibian" + } + ] + }, + "ala": { + "pu_definitions": [ + { + "pos": "ADJECTIVE", + "definition": "no, not, zero" + } + ] + }, + "alasa": { + "pu_definitions": [ + { + "pos": "VERB", + "definition": "to hunt, forage" + } + ] + }, + "ale": { + "pu_definitions": [ + { + "pos": "ADJECTIVE", + "definition": "all; abundant, countless, bountiful, every, plentiful" + }, + { + "pos": "NOUN", + "definition": "abundance, everything, life, universe" + }, + { + "pos": "NUMBER", + "definition": "100" + } + ] + }, + "anpa": { + "pu_definitions": [ + { + "pos": "ADJECTIVE", + "definition": "bowing down, downward, humble, lowly, dependent" + } + ] + }, + "ante": { + "pu_definitions": [ + { + "pos": "ADJECTIVE", + "definition": "different, altered, changed, other" + } + ] + }, + "anu": { + "pu_definitions": [ + { + "pos": "PARTICLE", + "definition": "or" + } + ] + }, + "awen": { + "pu_definitions": [ + { + "pos": "ADJECTIVE", + "definition": "enduring, kept, protected, safe, waiting, staying" + }, + { + "pos": "PRE-VERB", + "definition": "to continue to" + } + ] + }, + "e": { + "pu_definitions": [] + }, + "en": { + "pu_definitions": [ + { + "pos": "PARTICLE", + "definition": "(between multiple subjects)" + } + ] + }, + "esun": { + "pu_definitions": [ + { + "pos": "NOUN", + "definition": "market, shop, fair, bazaar, business transaction" + } + ] + }, + "ijo": { + "pu_definitions": [ + { + "pos": "NOUN", + "definition": "thing, 
phenomenon, object, matter" + } + ] + }, + "ike": { + "pu_definitions": [ + { + "pos": "ADJECTIVE", + "definition": "bad, negative; non-essential, irrelevant" + } + ] + }, + "ilo": { + "pu_definitions": [ + { + "pos": "NOUN", + "definition": "tool, implement, machine, device" + } + ] + }, + "insa": { + "pu_definitions": [ + { + "pos": "NOUN", + "definition": "centre, content, inside, between; internal organ, stomach" + } + ] + }, + "jaki": { + "pu_definitions": [ + { + "pos": "ADJECTIVE", + "definition": "disgusting, obscene, sickly, toxic, unclean, unsanitary" + } + ] + }, + "jan": { + "pu_definitions": [ + { + "pos": "NOUN", + "definition": "human being, person, somebody" + } + ] + }, + "jelo": { + "pu_definitions": [ + { + "pos": "ADJECTIVE", + "definition": "yellow, yellowish" + } + ] + }, + "jo": { + "pu_definitions": [ + { + "pos": "VERB", + "definition": "to have, carry, contain, hold" + } + ] + }, + "kala": { + "pu_definitions": [ + { + "pos": "NOUN", + "definition": "fish, marine animal, sea creature" + } + ] + }, + "kalama": { + "pu_definitions": [ + { + "pos": "VERB", + "definition": "to produce a sound; recite, utter aloud" + } + ] + }, + "kama": { + "pu_definitions": [ + { + "pos": "ADJECTIVE", + "definition": "arriving, coming, future, summoned" + }, + { + "pos": "PRE-VERB", + "definition": "to become, manage to, succeed in" + } + ] + }, + "kasi": { + "pu_definitions": [ + { + "pos": "NOUN", + "definition": "plant, vegetation; herb, leaf" + } + ] + }, + "ken": { + "pu_definitions": [ + { + "pos": "PRE-VERB", + "definition": "to be able to, be allowed to, can, may" + }, + { + "pos": "ADJECTIVE", + "definition": "possible" + } + ] + }, + "kepeken": { + "pu_definitions": [ + { + "pos": "PREPOSITION", + "definition": "to use, with, by means of" + } + ] + }, + "kili": { + "pu_definitions": [ + { + "pos": "NOUN", + "definition": "fruit, vegetable, mushroom" + } + ] + }, + "kiwen": { + "pu_definitions": [ + { + "pos": "NOUN", + "definition": "hard object, 
metal, rock, stone" + } + ] + }, + "ko": { + "pu_definitions": [ + { + "pos": "NOUN", + "definition": "clay, clinging form, dough, semi-solid, paste, powder" + } + ] + }, + "kon": { + "pu_definitions": [ + { + "pos": "NOUN", + "definition": "air, breath; essence, spirit, hidden reality, unseen agent" + } + ] + }, + "kule": { + "pu_definitions": [ + { + "pos": "ADJECTIVE", + "definition": "colourful, pigmented, painted" + } + ] + }, + "kulupu": { + "pu_definitions": [ + { + "pos": "NOUN", + "definition": "community, company, group, nation, society, tribe" + } + ] + }, + "kute": { + "pu_definitions": [ + { + "pos": "NOUN", + "definition": "ear" + }, + { + "pos": "VERB", + "definition": "to hear, listen; pay attention to, obey" + } + ] + }, + "la": { + "pu_definitions": [ + { + "pos": "PARTICLE", + "definition": "(between the context phrase and the main sentence)" + } + ] + }, + "lape": { + "pu_definitions": [ + { + "pos": "ADJECTIVE", + "definition": "sleeping, resting" + } + ] + }, + "laso": { + "pu_definitions": [ + { + "pos": "ADJECTIVE", + "definition": "blue, green" + } + ] + }, + "lawa": { + "pu_definitions": [ + { + "pos": "NOUN", + "definition": "head, mind VERB to control, direct, guide, lead, own, plan, regulate, rule" + } + ] + }, + "len": { + "pu_definitions": [ + { + "pos": "NOUN", + "definition": "cloth, clothing, fabric, textile; cover, layer of privacy" + } + ] + }, + "lete": { + "pu_definitions": [ + { + "pos": "ADJECTIVE", + "definition": "cold, cool; uncooked, raw" + } + ] + }, + "li": { + "pu_definitions": [] + }, + "lili": { + "pu_definitions": [ + { + "pos": "ADJECTIVE", + "definition": "little, small, short; few; a bit; young" + } + ] + }, + "linja": { + "pu_definitions": [ + { + "pos": "NOUN", + "definition": "long and flexible thing; cord, hair, rope, thread, yarn" + } + ] + }, + "lipu": { + "pu_definitions": [ + { + "pos": "NOUN", + "definition": "flat object; book, document, card, paper, record, website" + } + ] + }, + "loje": { + 
"pu_definitions": [ + { + "pos": "ADJECTIVE", + "definition": "red, reddish" + } + ] + }, + "lon": { + "pu_definitions": [ + { + "pos": "PREPOSITION", + "definition": "located at, present at, real, true, existing" + } + ] + }, + "luka": { + "pu_definitions": [ + { + "pos": "NOUN", + "definition": "arm, hand, tactile organ" + }, + { + "pos": "NUMBER", + "definition": "five" + } + ] + }, + "lukin": { + "pu_definitions": [ + { + "pos": "NOUN", + "definition": "eye" + }, + { + "pos": "VERB", + "definition": "to look at, see, examine, observe, read, watch" + }, + { + "pos": "PRE-VERB", + "definition": "to seek, look for, try to" + } + ] + }, + "lupa": { + "pu_definitions": [ + { + "pos": "NOUN", + "definition": "door, hole, orifice, window" + } + ] + }, + "ma": { + "pu_definitions": [ + { + "pos": "NOUN", + "definition": "earth, land; outdoors, world; country, territory; soil" + } + ] + }, + "mama": { + "pu_definitions": [ + { + "pos": "NOUN", + "definition": "parent, ancestor; creator, originator; caretaker, sustainer" + } + ] + }, + "mani": { + "pu_definitions": [ + { + "pos": "NOUN", + "definition": "money, cash, savings, wealth; large domesticated animal" + } + ] + }, + "mi": { + "pu_definitions": [ + { + "pos": "NOUN", + "definition": "I, me, we, us" + } + ] + }, + "moku": { + "pu_definitions": [ + { + "pos": "VERB", + "definition": "to eat, drink, consume, swallow, ingest" + } + ] + }, + "moli": { + "pu_definitions": [ + { + "pos": "ADJECTIVE", + "definition": "dead, dying" + } + ] + }, + "monsi": { + "pu_definitions": [ + { + "pos": "NOUN", + "definition": "back, behind, rear" + } + ] + }, + "mu": { + "pu_definitions": [ + { + "pos": "PARTICLE", + "definition": "(animal noise or communication)" + } + ] + }, + "mun": { + "pu_definitions": [ + { + "pos": "NOUN", + "definition": "moon, night sky object, star" + } + ] + }, + "musi": { + "pu_definitions": [ + { + "pos": "ADJECTIVE", + "definition": "artistic, entertaining, frivolous, playful, recreational" + } + ] + 
}, + "mute": { + "pu_definitions": [ + { + "pos": "ADJECTIVE", + "definition": "many, a lot, more, much, several, very" + }, + { + "pos": "NOUN", + "definition": "quantity" + } + ] + }, + "nanpa": { + "pu_definitions": [ + { + "pos": "PARTICLE", + "definition": "-th (ordinal number)" + }, + { + "pos": "NOUN", + "definition": "numbers" + } + ] + }, + "nasa": { + "pu_definitions": [ + { + "pos": "ADJECTIVE", + "definition": "unusual, strange; foolish, crazy; drunk, intoxicated" + } + ] + }, + "nasin": { + "pu_definitions": [ + { + "pos": "NOUN", + "definition": "way, custom, doctrine, method, path, road" + } + ] + }, + "nena": { + "pu_definitions": [ + { + "pos": "NOUN", + "definition": "bump, button, hill, mountain, nose, protuberance" + } + ] + }, + "nimi": { + "pu_definitions": [ + { + "pos": "NOUN", + "definition": "name, word" + } + ] + }, + "noka": { + "pu_definitions": [ + { + "pos": "NOUN", + "definition": "foot, leg, organ of locomotion; bottom, lower part" + } + ] + }, + "o": { + "pu_definitions": [ + { + "pos": "PARTICLE", + "definition": "hey! O! 
(vocative or imperative)" + } + ] + }, + "olin": { + "pu_definitions": [ + { + "pos": "VERB", + "definition": "to love, have compassion for, respect, show affection to" + } + ] + }, + "ona": { + "pu_definitions": [ + { + "pos": "NOUN", + "definition": "he, she, it, they" + } + ] + }, + "open": { + "pu_definitions": [ + { + "pos": "VERB", + "definition": "to begin, start; open; turn on" + } + ] + }, + "pakala": { + "pu_definitions": [ + { + "pos": "ADJECTIVE", + "definition": "botched, broken, damaged, harmed, messed up" + } + ] + }, + "pali": { + "pu_definitions": [ + { + "pos": "VERB", + "definition": "to do, take action on, work on; build, make, prepare" + } + ] + }, + "palisa": { + "pu_definitions": [ + { + "pos": "NOUN", + "definition": "long hard thing; branch, rod, stick" + } + ] + }, + "pan": { + "pu_definitions": [ + { + "pos": "NOUN", + "definition": "cereal, grain; barley, corn, oat, rice, wheat; bread, pasta" + } + ] + }, + "pana": { + "pu_definitions": [ + { + "pos": "VERB", + "definition": "to give, send, emit, provide, put, release" + } + ] + }, + "pi": { + "pu_definitions": [ + { + "pos": "PARTICLE", + "definition": "of" + } + ] + }, + "pilin": { + "pu_definitions": [ + { + "pos": "NOUN", + "definition": "heart (physical or emotional)" + }, + { + "pos": "ADJECTIVE", + "definition": "feeling (an emotion, a direct experience)" + } + ] + }, + "pimeja": { + "pu_definitions": [ + { + "pos": "ADJECTIVE", + "definition": "black, dark, unlit" + } + ] + }, + "pini": { + "pu_definitions": [ + { + "pos": "ADJECTIVE", + "definition": "ago, completed, ended, finished, past" + } + ] + }, + "pipi": { + "pu_definitions": [ + { + "pos": "NOUN", + "definition": "bug, insect, ant, spider" + } + ] + }, + "poka": { + "pu_definitions": [ + { + "pos": "NOUN", + "definition": "hip, side; next to, nearby, vicinity" + } + ] + }, + "poki": { + "pu_definitions": [ + { + "pos": "NOUN", + "definition": "container, bag, bowl, box, cup, cupboard, drawer, vessel" + } + ] + }, + 
"pona": { + "pu_definitions": [ + { + "pos": "ADJECTIVE", + "definition": "good, positive, useful; friendly, peaceful; simple" + } + ] + }, + "pu": { + "pu_definitions": [ + { + "pos": "ADJECTIVE", + "definition": "interacting with the official Toki Pona book" + } + ] + }, + "sama": { + "pu_definitions": [ + { + "pos": "ADJECTIVE", + "definition": "same, similar; each other; sibling, peer, fellow" + }, + { + "pos": "PREPOSITION", + "definition": "as, like" + } + ] + }, + "seli": { + "pu_definitions": [ + { + "pos": "ADJECTIVE", + "definition": "fire; cooking element, chemical reaction, heat source" + } + ] + }, + "selo": { + "pu_definitions": [ + { + "pos": "NOUN", + "definition": "outer form, outer layer; bark, peel, shell, skin; boundary" + } + ] + }, + "seme": { + "pu_definitions": [ + { + "pos": "PARTICLE", + "definition": "what? which?" + } + ] + }, + "sewi": { + "pu_definitions": [ + { + "pos": "NOUN", + "definition": "area above, highest part, something elevated" + }, + { + "pos": "ADJECTIVE", + "definition": "awe-inspiring, divine, sacred, supernatural" + } + ] + }, + "sijelo": { + "pu_definitions": [ + { + "pos": "NOUN", + "definition": "body (of person or animal), physical state, torso" + } + ] + }, + "sike": { + "pu_definitions": [ + { + "pos": "NOUN", + "definition": "round or circular thing; ball, circle, cycle, sphere, wheel" + }, + { + "pos": "ADJECTIVE", + "definition": "of one year" + } + ] + }, + "sin": { + "pu_definitions": [ + { + "pos": "ADJECTIVE", + "definition": "new, fresh; additional, another, extra" + } + ] + }, + "sina": { + "pu_definitions": [ + { + "pos": "NOUN", + "definition": "you" + } + ] + }, + "sinpin": { + "pu_definitions": [ + { + "pos": "NOUN", + "definition": "face, foremost, front, wall" + } + ] + }, + "sitelen": { + "pu_definitions": [ + { + "pos": "NOUN", + "definition": "image, picture, representation, symbol, mark, writing" + } + ] + }, + "sona": { + "pu_definitions": [ + { + "pos": "VERB", + "definition": "to know, be 
skilled in, be wise about, have information on" + }, + { + "pos": "PRE-VERB", + "definition": "to know how to" + } + ] + }, + "soweli": { + "pu_definitions": [ + { + "pos": "NOUN", + "definition": "animal, beast, land mammal" + } + ] + }, + "suli": { + "pu_definitions": [ + { + "pos": "ADJECTIVE", + "definition": "big, heavy, large, long, tall; important; adult" + } + ] + }, + "suno": { + "pu_definitions": [ + { + "pos": "NOUN", + "definition": "sun; light brightness, glow, radiance, shine; light source" + } + ] + }, + "supa": { + "pu_definitions": [ + { + "pos": "NOUN", + "definition": "horizontal surface, thing to put or rest something on" + } + ] + }, + "suwi": { + "pu_definitions": [ + { + "pos": "ADJECTIVE", + "definition": "sweet, fragrant; cute, innocent, adorable" + } + ] + }, + "tan": { + "pu_definitions": [ + { + "pos": "PREPOSITION", + "definition": "by, from, because of" + } + ] + }, + "taso": { + "pu_definitions": [ + { + "pos": "PARTICLE", + "definition": "but, however" + }, + { + "pos": "ADJECTIVE", + "definition": "only" + } + ] + }, + "tawa": { + "pu_definitions": [ + { + "pos": "PREPOSITION", + "definition": "going to, toward; for; from the perspective of" + }, + { + "pos": "ADJECTIVE", + "definition": "moving" + } + ] + }, + "telo": { + "pu_definitions": [ + { + "pos": "NOUN", + "definition": "water, liquid, fluid, wet substance; beverage" + } + ] + }, + "tenpo": { + "pu_definitions": [ + { + "pos": "NOUN", + "definition": "time, duration, moment, occasion, period, situation" + } + ] + }, + "toki": { + "pu_definitions": [ + { + "pos": "VERB", + "definition": "to communicate, say, speak, say, talk, use language, think" + } + ] + }, + "tomo": { + "pu_definitions": [ + { + "pos": "NOUN", + "definition": "indoor space; building, home, house, room" + } + ] + }, + "tu": { + "pu_definitions": [ + { + "pos": "NUMBER", + "definition": "two" + } + ] + }, + "unpa": { + "pu_definitions": [ + { + "pos": "VERB", + "definition": "to have sexual or marital 
relations with" + } + ] + }, + "uta": { + "pu_definitions": [ + { + "pos": "NOUN", + "definition": "mouth,lips, oral cavity, jaw" + } + ] + }, + "utala": { + "pu_definitions": [ + { + "pos": "VERB", + "definition": "to battle, challenge, compete against struggle against" + } + ] + }, + "walo": { + "pu_definitions": [ + { + "pos": "ADJECTIVE", + "definition": "white, whitish; light-coloured, pale" + } + ] + }, + "wan": { + "pu_definitions": [ + { + "pos": "ADJECTIVE", + "definition": "unique, united" + }, + { + "pos": "NUMBER", + "definition": "one" + } + ] + }, + "waso": { + "pu_definitions": [ + { + "pos": "NOUN", + "definition": "bird, flying creature, winged animal" + } + ] + }, + "wawa": { + "pu_definitions": [ + { + "pos": "ADJECTIVE", + "definition": "strong, powerful; confident, sure; energetic, intense" + } + ] + }, + "weka": { + "pu_definitions": [ + { + "pos": "ADJECTIVE", + "definition": "absent, away, ignored" + } + ] + }, + "wile": { + "pu_definitions": [ + { + "pos": "PRE-VERB", + "definition": "must, need, require, should, want, wish" + } + ] + }, + "kijetesantakalu": { + "pu_definitions": [] + }, + "kin": { + "pu_definitions": [ + { + "pos": "PARTICLE", + "definition": "(emphasis, emotion or confirmation)" + } + ] + }, + "kipisi": { + "pu_definitions": [] + }, + "ku": { + "pu_definitions": [] + }, + "leko": { + "pu_definitions": [] + }, + "meli": { + "pu_definitions": [ + { + "pos": "NOUN", + "definition": "woman, female, feminine person; wife" + } + ] + }, + "mije": { + "pu_definitions": [ + { + "pos": "NOUN", + "definition": "man, male, masculine person; husband" + } + ] + }, + "misikeke": { + "pu_definitions": [] + }, + "monsuta": { + "pu_definitions": [] + }, + "n": { + "pu_definitions": [] + }, + "namako": { + "pu_definitions": [ + { + "pos": "ADJECTIVE", + "definition": "new, fresh; additional, another, extra" + } + ] + }, + "soko": { + "pu_definitions": [] + }, + "tonsi": { + "pu_definitions": [] + } +} \ No newline at end of file diff --git 
a/tts_tr.sh b/tts_tr.sh new file mode 100755 index 0000000..08dd978 --- /dev/null +++ b/tts_tr.sh @@ -0,0 +1,18 @@
+#!/bin/bash
+# Usage: tts_tr.sh <phones> <translation> <name>
+#   $1  text handed to lexconvert (unicode-ipa -> espeak) and spoken with voice en+f4
+#   $2  text spoken with voice en+m4
+#   $3  base name of the files written under tts/tok_words/
+# Writes tts/tok_words/<name>_final.wav: first clip, silence_500.wav,
+# second clip, first clip again. Intermediate files are removed.
+set -x
+set -e
+
+# generate silence
+# sox -n -r 22050 silence_500.wav trim 0.0 0.500
+
+python3 lexconvert.py --phones2phones unicode-ipa espeak "${1}. a." | espeak -g 1 -s 1 -v en+f4 -w "tts/tok_words/${3}.wav"
+# Cut the last 1.55 s off the clip -- presumably the audio of the ". a."
+# suffix appended above; TODO confirm.
+duration=$(ffprobe -v error -show_entries format=duration -of csv=p=0 "tts/tok_words/${3}.wav")
+duration=$(echo "$duration - 1.55" | bc)
+ffmpeg -y -ss 00:00:00 -to "$duration" -i "tts/tok_words/${3}.wav" -c copy "tts/tok_words/${3}_fixed.wav"
+rm "tts/tok_words/${3}.wav"
+
+echo "${2}" | espeak -v en+m4 -w "tts/tok_words/${3}_tr.wav"
+
+sox "tts/tok_words/${3}_fixed.wav" "tts/tok_words/silence_500.wav" "tts/tok_words/${3}_tr.wav" "tts/tok_words/${3}_fixed.wav" "tts/tok_words/${3}_final.wav"
+
+rm "tts/tok_words/${3}_fixed.wav" "tts/tok_words/${3}_tr.wav"
\ No newline at end of file
diff --git a/web/app.py b/web/app.py new file mode 100644 index 0000000..b423a49 --- /dev/null +++ b/web/app.py @@ -0,0 +1,36 @@
+from flask import Flask, render_template, request
+
+app = Flask(__name__)
+
+WORD_LI = 'li'
+
+PARTICLES = ['en', WORD_LI, 'e', 'la', 'pi', 'o', 'anu']
+
+EMOTICLE = ['a']
+
+PRONOUNS = ['mi', 'sina', 'ona', 'ni']
+
+PREPOSITIONS = ['lon', 'tawa', 'tan', 'sama', 'kepeken']
+
+PREVERBS = ['wile', 'sona', 'awen', 'kama', 'ken', 'lukin']
+
+QUESTIONS = ['seme']
+
+
+def create_context(sentence):
+    """Build the template context for *sentence*.
+
+    Returns a dict holding the submitted sentence and its
+    whitespace-separated words. A missing sentence (``None``) yields an
+    empty word list instead of raising, and a dict is always returned so
+    the template never receives ``None`` as its context.
+    """
+    words = (sentence or '').split()
+    return {'sentence': sentence, 'words': words}
+
+
+@app.route('/', methods=['GET', 'POST'])
+def home():
+    """Render the home page; on POST, add the context for the sent sentence."""
+    ctx = {}
+    if request.method == 'POST':
+        sentence = request.form.get('sentence')
+        ctx = create_context(sentence)
+    return render_template('home.html', ctx=ctx)
+
+
+if __name__ == '__main__':
+    # Development server only: debug=True enables the interactive debugger.
+    app.run(debug=True)
diff --git a/web/templates/home.html b/web/templates/home.html new file mode 100644 index 0000000..4d26f55 --- /dev/null +++ b/web/templates/home.html @@ -0,0 +1,11 @@ +{% extends "layout.html" %} +{% block body %} + +
+ + + +
+ + +{% endblock %} \ No newline at end of file diff --git a/web/templates/layout.html b/web/templates/layout.html new file mode 100644 index 0000000..ae785ac --- /dev/null +++ b/web/templates/layout.html @@ -0,0 +1,9 @@ + + + + + Title + + +{% block body %}{% endblock %} +