# Recovered from the git patch "added grammar"
# (commit 411f26bb7be249104044d21fa895570d4f055e2d, Dani Tsvetkov,
# Wed, 10 Apr 2024 07:32:10 +0200), which created scrape/grammar_parser.py
# (new file, mode 100644, 60 lines). The patch envelope and the "+" line
# prefixes were stripped so that this file is plain, runnable Python.
"""Parse one sample toki pona sentence with a Lark grammar and print the tree.

Running (or importing) this module builds a Lark parser from ``grammar``,
parses ``sentence`` with it, and prints the resulting tree twice: first in
its default string form, then in the indented form produced by
``Tree.pretty()``.
"""
from lark import Lark, tree


# Sample input that is parsed and printed below.
sentence = 'tenpo lon la mi sitelen e lipu sona'

# from https://github.com/kilipan/nasin-toki
#
# Shape of the grammar, as written below:
#   sentence   zero or more groups closed by LA (each a phrase, a PREPOSITION
#              plus phrase, or a nested sentence); then one or more of either
#              PRONOUN_MI_SINA or EN-joined phrases closed by LI; then zero
#              or more groups of PREVERBs, a phrase, E-introduced phrases and
#              PREPOSITION-introduced phrases.
#   phrase     a head, an optional PI, then any number of modifiers.
#   head / modifier
#              a CONTENT_WORD or a loan_word.
#   loan_word  one capitalised syllable followed by any number of lower-case
#              syllables.
# Whitespace between tokens is skipped via `%ignore WS`.
#
# NOTE(review): "jaki" is listed twice in CONTENT_WORD; the duplicate looks
#   redundant -- confirm and drop it.
# NOTE(review): PARTICLE, EMOTICLE, PRONOUN and QUESTION (and O, ANU and A,
#   which only those terminals reference) are not used by any rule, so e.g.
#   "ni" (present only in PRONOUN) presumably cannot appear in a parsed
#   sentence -- confirm whether that is intended.
# NOTE(review): loan_word is a rule, not a terminal, so with `%ignore WS`
#   its syllables can presumably be separated by spaces in the input --
#   confirm whether a single terminal was intended.
grammar = """
    sentence: ((phrase|PREPOSITION phrase|sentence) LA)* (PRONOUN_MI_SINA | phrase (EN phrase)* LI)+ (PREVERB* phrase (E phrase)* (PREPOSITION phrase)*)*

    phrase: head PI? modifier*
    head: CONTENT_WORD | loan_word
    modifier: CONTENT_WORD | loan_word

    EN: "en"
    LI: "li"
    E: "e"
    LA: "la"
    PI: "pi"
    O: "o"
    ANU: "anu"

    PARTICLE: EN | LI | E | LA | PI | O | ANU

    A: "a"

    EMOTICLE: A

    CONTENT_WORD: "akesi" | "ala" | "alasa" | "ale" | "anpa" | "ante" | "awen" | "esun" | "ijo" | "ike" | "ilo" | "insa" | "jaki" | "jaki" | "jan" | "jelo" | "jo" | "kala" | "kalama" | "kama" | "kasi" | "ken" | "kepeken" | "kili" | "kiwen" | "ko" | "kon" | "kule" | "kulupu" | "kute" | "lape" | "laso" | "lawa" | "len" | "lete" | "lili" | "linja" | "lipu" | "loje" | "lon" | "luka" | "lukin" | "lupa" | "ma" | "mama" | "mani" | "mi" | "moku" | "moli" | "monsi" | "mu" | "mun" | "musi" | "mute" | "nanpa" | "nasa" | "nasin" | "nena" | "nimi" | "noka" | "olin" | "ona" | "open" | "pakala" | "pali" | "palisa" | "pan" | "pana" | "pilin" | "pimeja" | "pini" | "pipi" | "poka" | "poki" | "pona" | "pu" | "sama" | "seli" | "selo" | "seme" | "sewi" | "sijelo" | "sike" | "sin" | "sina" | "sinpin" | "sitelen" | "sona" | "soweli" | "suli" | "suno" | "supa" | "suwi" | "tan" | "taso" | "tawa" | "telo" | "tenpo" | "toki" | "tomo" | "tu" | "unpa" | "uta" | "utala" | "walo" | "wan" | "waso" | "wawa" | "weka" | "wile" | "kijetesantakalu" | "kin" | "kipisi" | "ku" | "leko" | "meli" | "mije" | "misikeke" | "monsuta" | "namako" | "soko" | "tonsi"

    PRONOUN_MI_SINA: "mi" | "sina"

    PRONOUN: PRONOUN_MI_SINA | "ona" | "ni"

    PREPOSITION: "lon" | "tawa" | "tan" | "sama" | "kepeken"

    PREVERB: "wile" | "sona" | "awen" | "kama" | "ken" | "lukin"

    QUESTION: "seme"

    VOWEL: "a" | "e" | "i" | "o" | "u"
    CONSONANT: "p" | "t" | "k" | "s" | "m" | "n" | "l" | "j" | "w"
    CONSONANT_N: "n"
    VOWEL_CAPITAL: "A" | "E" | "I" | "O" | "U"
    CONSONANT_CAPITAL: "P" | "T" | "K" | "S" | "M" | "N" | "L" | "J" | "W"

    SYLLABLE: CONSONANT? VOWEL CONSONANT_N?
    SYLLABLE_CAPITAL_V: VOWEL_CAPITAL CONSONANT_N?
    SYLLABLE_CAPITAL_CV: CONSONANT_CAPITAL VOWEL CONSONANT_N?

    loan_word: (SYLLABLE_CAPITAL_V | SYLLABLE_CAPITAL_CV) SYLLABLE*

    %import common.WS
    %ignore WS
"""

# `sentence` is the start rule. ambiguity='explicit' makes Lark keep every
# derivation of an ambiguous input in the tree (under `_ambig` nodes)
# instead of silently choosing one.
parser = Lark(grammar, start='sentence', ambiguity='explicit')

# Parse once and reuse the tree for both printouts; the original parsed the
# same sentence twice, repeating the Earley parse for no benefit.
parse_tree = parser.parse(sentence)

# Default string form of the tree.
print(parse_tree)

# Indented, human-readable rendering of the same tree.
print(parse_tree.pretty())