general solution based tokenizer

This commit is contained in:
Daniel Tsvetkov 2020-02-11 11:30:11 +01:00
parent 559bd39e9a
commit abbba47bb5
2 changed files with 152 additions and 49 deletions

View File

@ -1,10 +1,12 @@
import json
import re import re
import sys from datetime import datetime
from datetime import timedelta, datetime
from pprint import pprint, pformat
from pygments import highlight, lexers, formatters
from tww import ISO_FORMAT, time_to_emoji
from tww import resolve_timezone, dateparser_parse_dt, get_utcnow, get_s_since_epoch, get_ms_since_epoch, \ from tww import resolve_timezone, dateparser_parse_dt, get_utcnow, get_s_since_epoch, get_ms_since_epoch, \
dt_tz_translation, DEFAULT_FORMAT, get_local_now dt_tz_translation, DEFAULT_FORMAT, get_local_now, query_to_format_result
r_generic = re.compile('(.*)', flags=re.IGNORECASE) r_generic = re.compile('(.*)', flags=re.IGNORECASE)
r_time_in_epoch_s_now = re.compile('(?:time since epoch|seconds since epoch)', flags=re.IGNORECASE) r_time_in_epoch_s_now = re.compile('(?:time since epoch|seconds since epoch)', flags=re.IGNORECASE)
@ -13,10 +15,13 @@ r_time_in_epoch_s3 = re.compile('(?:seconds)?\s*since\s*(.*)', flags=re.IGNORECA
# Query-matching patterns. Raw string literals are used so that regex escapes
# such as ``\s`` reach the regex engine untouched instead of being treated as
# (invalid) Python string escapes. All patterns are case-insensitive.

# Milliseconds-since-epoch queries.
r_time_in_epoch_ms_now = re.compile(r'(?:milliseconds since epoch)', flags=re.IGNORECASE)
r_time_in_epoch_ms2 = re.compile(r'(.*)?\s*(?:in|to)\s*(?:ms|milliseconds|miliseconds)', flags=re.IGNORECASE)
r_time_in_epoch_ms3 = re.compile(r'(?:ms|milliseconds|miliseconds)?\s*since\s*(.*)', flags=re.IGNORECASE)

# "time in <place>", "time since/until <datetime>", "time between <a> and <b>".
r_time_in = re.compile(r'(?:time)?\s*in\s*(.*)', flags=re.IGNORECASE)
r_time_since = re.compile(r'(?:time)?\s*since\s*(.*)', flags=re.IGNORECASE)
r_time_until = re.compile(r'(?:time)?\s*until\s*(.*)', flags=re.IGNORECASE)
r_time_between = re.compile(r'(?:time)?\s*between\s*(.*)\s*and\s*(.*)', flags=re.IGNORECASE)

# "<datetime> in|to <timezone>" and plain timezone lookups.
r_timezone_translation = re.compile(r'(.*)?\s(?:in|to)\s(.*)', flags=re.IGNORECASE)
r_timezone = re.compile(r'(.*)?\s(?:timezone|timezones|tz)', flags=re.IGNORECASE)
r_timezone_2 = re.compile(r'(?:timezone in|timezones in|tz in|timezone|timezones|tz)\s(.*)?', flags=re.IGNORECASE)
def handler_time_s(dt_s: str) -> int: def handler_time_s(dt_s: str) -> int:
@ -74,23 +79,44 @@ def handler_timezone_translation(dt_s: str, timezone_like_s: str) -> dict:
def handler_generic_parser(dt_s: str) -> datetime:
    """Solve the whole query with ``query_to_format_result``.

    ``None`` is passed as the format so the unformatted result is returned
    rather than a formatted string.
    """
    unformatted = query_to_format_result(dt_s, None)
    return unformatted
def handler_dateparser(dt_s: str) -> datetime:
    """Hand the raw query text to ``dateparser_parse_dt`` and return its result."""
    parsed = dateparser_parse_dt(dt_s)
    return parsed
def handler_time_in_parser(dt_s: str) -> datetime:
    """Resolve "time in <place>" by rewriting it as the query "now to <place>".

    :param dt_s: the timezone/location text captured after "in".
    :return: the unformatted result of ``query_to_format_result``.
    """
    # Pass fmt=None explicitly: with the default format the helper returns a
    # formatted string, but this handler is registered as a datetime-producing
    # handler and its result is later fed to strftime-based formatting.
    return query_to_format_result("now to {}".format(dt_s), None)
def handler_timezone(timezone_s: str):
    """Look up the timezone-like text with ``resolve_timezone`` and return its result."""
    resolved = resolve_timezone(timezone_s)
    return resolved
# Tags attached to each regex/handler pair; resolve_query uses the tag to decide
# how a handler's result is rendered.
QUERY_TYPE_DT_TR = "datetime_translation"  # rendered as "src_dt" / "dst_dt"
QUERY_TYPE_DT = "datetime_details"  # rendered as "dt"
QUERY_TYPE_TZ = "timezone"  # rendered as "tz"
QUERY_TYPE_TD = "timedelta"  # rendered as "timedelta"
# Dispatch table of (pattern, handler, query type) triples. tokenize() walks the
# whole table and keeps every handler that matches and returns a result, so one
# query may yield several solutions.
# NOTE(review): handler_time_s is annotated "-> int" but is tagged QUERY_TYPE_DT,
# whose results are rendered through dt_pretty (strftime) - confirm the tag.
regex_handlers = [
    # seconds since epoch
    (r_time_in_epoch_s_now, handler_time_s_now_local, QUERY_TYPE_DT),
    (r_time_in_epoch_s_now, handler_time_s_now_utc, QUERY_TYPE_DT),
    (r_time_in_epoch_s2, handler_time_s, QUERY_TYPE_DT),
    (r_time_in_epoch_s3, handler_time_s, QUERY_TYPE_DT),
    # milliseconds since epoch
    (r_time_in_epoch_ms_now, handler_time_ms_now_local, QUERY_TYPE_DT),
    (r_time_in_epoch_ms_now, handler_time_ms_now_utc, QUERY_TYPE_DT),
    (r_time_in_epoch_ms2, handler_time_ms, QUERY_TYPE_DT),
    (r_time_in_epoch_ms3, handler_time_ms, QUERY_TYPE_DT),
    # datetime translated into another timezone
    (r_timezone_translation, handler_timezone_translation, QUERY_TYPE_DT_TR),
    # time differences
    (r_time_since, handler_time_since, QUERY_TYPE_TD),
    (r_time_until, handler_time_until, QUERY_TYPE_TD),
    (r_time_between, handler_time_diff, QUERY_TYPE_TD),
    # current time somewhere
    (r_time_in, handler_time_in_parser, QUERY_TYPE_DT),
    # timezone lookups
    (r_timezone, handler_timezone, QUERY_TYPE_TZ),
    (r_timezone_2, handler_timezone, QUERY_TYPE_TZ),
    # catch-all parsers
    (r_generic, handler_dateparser, QUERY_TYPE_DT),
    (r_generic, handler_generic_parser, QUERY_TYPE_DT),
]
@ -104,16 +130,17 @@ def try_regex(r, s):
return groups return groups
def tokenize(s):
    """Run every registered pattern against ``s`` and collect the solutions.

    For each (pattern, handler, query type) entry of ``regex_handlers`` whose
    pattern matches, the handler is called with the captured groups. Handlers
    that raise or return ``None`` are skipped. Each kept solution is a tuple
    ``(handler name, (result,), query type)``.
    """
    solutions = []
    for pattern, handler, query_type in regex_handlers:
        groups = try_regex(pattern, s)
        if groups is None:
            continue
        try:
            result = handler(*groups)
        except Exception:
            # A failing handler simply does not contribute a solution.
            continue
        if result is None:
            continue
        solutions.append((handler.__name__, (result, ), query_type))
    return solutions
@ -139,20 +166,71 @@ def test():
"now in dublin", "now in dublin",
] ]
for s in test_strings: for s in test_strings:
print("{} -> {}".format(s, parse(s))) print("{} -> {}".format(s, tokenize(s)))
def pretty_print_dict(obj):
    """Print ``obj`` as 2-space indented JSON, highlighted by pygments for the terminal."""
    print(
        highlight(
            json.dumps(obj, indent=2),
            lexers.JsonLexer(),
            formatters.TerminalFormatter(),
        )
    )
def dt_pretty(dt):
    """Build a dict of string/number renderings of ``dt``.

    Most entries are ``dt.strftime`` outputs (ISO, locale-dependent and
    timezone fields); the last three come from ``time_to_emoji``,
    ``get_s_since_epoch`` and ``get_ms_since_epoch``.
    """
    return {
        "iso8601_full": dt.strftime(ISO_FORMAT),
        "iso8601_date": dt.strftime('%Y-%m-%d'),
        "iso8601_time": dt.strftime('%H:%M:%S'),
        "locale_dt": dt.strftime("%c"),
        "locale_day_of_week": dt.strftime("%A"),
        "locale_day_of_week_short": dt.strftime("%a"),
        "day_of_week_number": dt.strftime("%w"),
        "locale_month": dt.strftime("%B"),
        "locale_month_short": dt.strftime("%b"),
        "tz_name": dt.strftime("%Z"),
        "tz_offset": dt.strftime("%z"),
        "hh:mm": dt.strftime("%H:%M"),
        "locale_time": dt.strftime("%X"),
        "locale_date": dt.strftime("%x"),
        "emoji_time": time_to_emoji(dt),
        "unix_s": get_s_since_epoch(dt),
        "unix_ms": get_ms_since_epoch(dt),
    }
def resolve_query_type(query):
    """Return the solutions ``tokenize`` finds for ``query``.

    When nothing matches, fall back to a single "now" solution holding the
    current local time, tagged as ``QUERY_TYPE_DT``.
    """
    found = tokenize(query)
    if found:
        return found
    return [["now", (get_local_now(),), QUERY_TYPE_DT]]
def resolve_query(query):
    """Resolve ``query`` into a dict with the query and its rendered solutions.

    Each solution becomes a dict with the handler name, the query type and a
    type-specific payload ("dt", "src_dt"/"dst_dt", "tz" or "timedelta").
    """
    rendered = []
    for handler, results, query_type in resolve_query_type(query):
        element = {"handler": handler, "query_type": query_type}
        if query_type == QUERY_TYPE_DT:
            element["dt"] = dt_pretty(results[0])
        elif query_type == QUERY_TYPE_DT_TR:
            # NOTE(review): tokenize() stores results as a one-element tuple,
            # so results[1] looks out of range here - confirm the intended
            # shape of a translation result.
            element["src_dt"] = dt_pretty(results[0])
            element["dst_dt"] = dt_pretty(results[1])
        elif query_type == QUERY_TYPE_TZ:
            element["tz"] = results[0]
        elif query_type == QUERY_TYPE_TD:
            element["timedelta"] = results[0]
        rendered.append(element)
    return {
        "query": query,
        "solutions": rendered,
    }
if __name__ == "__main__":
    # Imported here because the module no longer imports sys at the top.
    import sys

    # The query is the command-line words joined by spaces; with no arguments
    # fall back to the demo query.
    query = ' '.join(sys.argv[1:]) or "now in sofia"
    result = resolve_query(query)
    pretty_print_dict(result)

View File

@ -213,7 +213,7 @@ def parse_query(query):
""" """
Parses the user query to the datetime, tz/loc parts Parses the user query to the datetime, tz/loc parts
""" """
query = ' '.join(query) # query = ' '.join(query)
query = query.strip() query = query.strip()
if not query: if not query:
logger.critical("Use a query like <datetime-like> ['to' <timezone or location>]") logger.critical("Use a query like <datetime-like> ['to' <timezone or location>]")
@ -246,6 +246,32 @@ def serialize_location(location):
} }
def find_from_offset(query):
    """Resolve an offset-style query such as "+2", "-05:30", "utc+1" or "gmt-3".

    The query must start with "gmt", "utc", "+" or "-". The rest is read as
    ``HH`` or ``HH:MM`` and normalized to a "+HHMM" / "-HHMM" key that is
    passed to ``tzinfo_from_offset``.

    :param query: normalized (lower-cased, stripped) query text.
    :return: the result of ``tzinfo_from_offset`` for the normalized key, or
        ``(None, [])`` when the query is not a usable offset expression.
    """
    for universal_alias in ("gmt", "utc", "+", "-"):
        if not query.startswith(universal_alias):
            continue
        splitted_query = query.split(universal_alias)
        if len(splitted_query) != 2:
            continue
        offset = splitted_query[1].strip()

        # Work out the sign. For "gmt"/"utc" it is carried by the remainder
        # ("utc-5"); previously a negative remainder was formatted as "+-500"
        # and could never match.
        if universal_alias in ("+", "-"):
            sign = universal_alias
        elif offset[:1] in ("+", "-"):
            sign, offset = offset[0], offset[1:]
        else:
            sign = "+"

        hhs, separator, mms = offset.partition(':')
        if not separator:
            mms = "00"
        try:
            hours, minutes = int(hhs), int(mms)
        except ValueError:
            # Not numeric (or more than one ':'), so not an offset.
            continue
        if hours < 0 or minutes < 0:
            # A second sign inside the number, e.g. "+-3".
            continue

        try:
            return tzinfo_from_offset("{}{:02d}{:02d}".format(sign, hours, minutes))
        except Exception:
            # Best-effort lookup: any failure means "no offset timezone".
            continue
    return None, []
def resolve_timezone(query): def resolve_timezone(query):
if not query: if not query:
query = "utc" query = "utc"
@ -254,10 +280,7 @@ def resolve_timezone(query):
normal_query = query.lower().strip() normal_query = query.lower().strip()
found_from_iana_tz = NORMALIZED_TZ_DICT.get(normal_query, "") found_from_iana_tz = NORMALIZED_TZ_DICT.get(normal_query, "")
found_from_abbr_tzs = list(NORMALIZED_TZ_ABBR.get(normal_query, set())) found_from_abbr_tzs = list(NORMALIZED_TZ_ABBR.get(normal_query, set()))
found_from_offset_tz = None found_from_offset_tz, offset_tzs = find_from_offset(normal_query)
try:
found_from_offset_tz = tzinfo_from_offset(normal_query)
except: ...
normal_tz = found_from_iana_tz normal_tz = found_from_iana_tz
if not normal_tz: if not normal_tz:
if found_from_abbr_tzs: if found_from_abbr_tzs:
@ -304,7 +327,7 @@ def resolve_timezone(query):
"normal_query": normal_query, "normal_query": normal_query,
"found_from_iana_tz": found_from_iana_tz, "found_from_iana_tz": found_from_iana_tz,
"found_from_abbr_tzs": found_from_abbr_tzs, "found_from_abbr_tzs": found_from_abbr_tzs,
"found_from_offset_tzs": found_from_offset_tz, "found_from_offset_tzs": offset_tzs,
"local_location": local_location, "local_location": local_location,
"remote_location": remote_location, "remote_location": remote_location,
"search_pytz": normal_tz, "search_pytz": normal_tz,
@ -335,8 +358,7 @@ def solve_query(human_dt, human_tz_loc):
def format_result(result, fmt): def format_result(result, fmt):
if result is None: if result is None:
logger.critical("Could not solve query") logger.error("Could not solve query")
exit(1)
logger.debug("Format: {}".format(fmt)) logger.debug("Format: {}".format(fmt))
format_result = result.strftime(fmt) format_result = result.strftime(fmt)
logger.debug("Formated result: {} -> {}".format(result, format_result)) logger.debug("Formated result: {} -> {}".format(result, format_result))
@ -346,8 +368,9 @@ def format_result(result, fmt):
def query_to_format_result(query, fmt=DEFAULT_FORMAT):
    """Parse and solve ``query``; format the result only when ``fmt`` is truthy.

    With a falsy ``fmt`` (e.g. ``None``) the unformatted result of
    ``solve_query`` is returned as-is.
    """
    human_dt, human_tz_loc = parse_query(query)
    solved = solve_query(human_dt, human_tz_loc)
    return format_result(solved, fmt) if fmt else solved
def main(args): def main(args):
@ -442,8 +465,10 @@ def tzinfo_from_offset(offset: str) -> pytz.timezone:
tznames = TZ_OFFSETS.get(offset, []) tznames = TZ_OFFSETS.get(offset, [])
for tzname in tznames: for tzname in tznames:
if tzname.startswith('Etc/GMT'): if tzname.startswith('Etc/GMT'):
return pytz.timezone(tzname) return pytz.timezone(tzname), tznames
return pytz.timezone(tznames[0]) if tznames:
return pytz.timezone(tznames[0]), tznames
return None, []
def dateparser_parse_dt(s: str): def dateparser_parse_dt(s: str):
@ -467,7 +492,7 @@ def get_utcnow(tzaware: bool = True):
def get_local_now(tzaware: bool = True):
    """Return the current local time.

    :param tzaware: when true, attach the tzinfo resolved from the local UTC
        offset (first element of ``tzinfo_from_offset``'s result); when false,
        return a naive datetime.
    :return: the current local ``datetime``.
    """
    now = datetime.now()
    if not tzaware:
        # The naive result must still be *local* wall-clock time; the previous
        # datetime.utcnow() returned UTC instead.
        return now
    return now.replace(tzinfo=tzinfo_from_offset(get_local_tz_offset())[0])