updated tokenizer

This commit is contained in:
Daniel Tsvetkov 2021-08-03 10:59:17 +02:00
parent 5a2ba4a60b
commit 9f1790c201
2 changed files with 74 additions and 42 deletions

20
tww/main.py Normal file
View File

@ -0,0 +1,20 @@
# Message text for a time that does not exist.
# NOTE(review): nothing visible in this module reads it yet.
E_NO_SUCH_TIME = "Time does not exist"
# https://dateparser.readthedocs.io/en/latest/settings.html#settings
# String tags, one per kind of parser. Each value is the constant's own name
# without the PARSER_TYPE_ prefix.
# NOTE(review): presumably these are the values intended for the `type`
# argument of parser() below -- confirm once parser() is implemented.
PARSER_TYPE_TIMESTAMP = "TIMESTAMP"
PARSER_TYPE_ABSOLUTE_TIME = "ABSOLUTE_TIME"
PARSER_TYPE_RELATIVE_TIME = "RELATIVE_TIME"
PARSER_TYPE_TIMEZONE_TRANSLATION = "TIMEZONE_TRANSLATION"
PARSER_TYPE_TIME_DELTA = "TIME_DELTA"
PARSER_TYPE_TIMEZONE_DIFFERENCE = "TIMEZONE_DIFFERENCE"
PARSER_TYPE_TIME_IN_LOCATION = "TIME_IN_LOCATION"
PARSER_TYPE_TIME_DIFFERENCE = "TIME_DIFFERENCE"
PARSER_TYPE_WORKDAYS = "WORKDAYS"
PARSER_TYPE_DAY_OF_WEEK = "DAY_OF_WEEK"
PARSER_TYPE_TIMEZONE = "TIMEZONE"
PARSER_TYPE_CALENDAR = "CALENDAR"
def parser(type, language, output_fmt):
    """Placeholder parser entry point; currently always returns ``None``.

    Args:
        type: Kind of parser requested. Presumably one of the
            ``PARSER_TYPE_*`` string constants -- TODO confirm. The name
            shadows the builtin ``type`` inside this function; it is kept
            unchanged so that keyword callers keep working.
        language: Accepted but not used yet.
        output_fmt: Accepted but not used yet.

    Returns:
        None, unconditionally.
    """
    return None

View File

@ -49,6 +49,8 @@ r_timezone = re.compile('(.*)\s(?:timezone|timezones|tz)', flags=re.IGNORECASE)
# Query-matching patterns (case-insensitive). Each one recognises a fixed
# keyword prefix and captures the remainder of the query in group 1.
# Raw string literals are used so that `\s` reaches the regex engine as
# written instead of being an invalid string escape sequence.
# NOTE(review): alternatives are tried left to right, so in
# r_calendar_month "cal"/"calendar" win over "cal month"/"calendar month"
# and the word "month" ends up inside group 1 -- confirm this is intended.
r_calendar_year = re.compile(r'(?:cal year|calendar year)\s*(.*)', flags=re.IGNORECASE)
r_calendar_month = re.compile(r'(?:calendar|cal|month|cal month|calendar month)\s*(.*)', flags=re.IGNORECASE)
r_timezone_2 = re.compile(r'(?:timezone in|timezones in|tz in|timezone|timezones|tz)\s(.*)?', flags=re.IGNORECASE)
r_timezone_cities = re.compile(r'(?:cities with timezones|countries with timezones|cities with tz|countries with tz)\s(.*)?', flags=re.IGNORECASE)
r_weeknum = re.compile(r'(?:week number|week num|weeknum|week)\s(.*)?', flags=re.IGNORECASE)
def handler_time_now_local():
@ -213,7 +215,10 @@ QUERY_TYPE_CAL = "calendar"
# Output selectors used as the last element of the regex_handlers entries.
# Each non-empty value has the form '<prefix>->key'.
# NOTE(review): presumably the '->' path is resolved against a solution by
# show_magic_results, which builds Cut(solution, sep='->') -- confirm.
h_default = ''
h_unix_s = 'dt->unix_s'
h_unix_ms = 'dt->unix_ms'
# 'week_number' is the key that dt_pretty fills with dt.strftime("%W").
h_week_num = 'dt->week_number'
h_tz_offset = 'tz->tz_offset'
h_tz_iana = 'tz->found_from_offset_tzs'
h_tz_abbr = 'tz->found_from_abbr_tzs'
h_time_in = 'dt->hh:mm'
# h_translation and h_default_dt hold the same selector string.
h_translation = 'dt->iso8601_full'
h_default_dt = 'dt->iso8601_full'
@ -223,39 +228,41 @@ h_cal_year = 'cal->year'
h_cal_month = 'cal->month'
# Ordered dispatch table consumed by tokenize(): each entry is
# (compiled regex, handler callable, query type, output selector(s)).
# NOTE(review): the first run of entries passes the selector as a bare string,
# while the second run wraps it in a list and adds the r_timezone_cities and
# r_weeknum entries. This looks like the pre- and post-change versions of the
# same table shown together -- confirm against the real file. tokenize()
# iterates over the fourth element (`for hi in his`), so a bare string there
# would be walked character by character.
regex_handlers = [
(r_time_in_epoch_s_now, handler_time_now_local, QUERY_TYPE_DT, h_unix_s),
(r_time_in_epoch_s_now, handler_time_now_utc, QUERY_TYPE_DT, h_unix_s),
(r_time_in_epoch_s2, handler_generic_parser, QUERY_TYPE_DT, h_unix_s),
(r_time_in_epoch_s3, handler_generic_parser, QUERY_TYPE_DT, h_unix_s),
(r_time_in_epoch_ms_now, handler_time_now_local, QUERY_TYPE_DT, h_unix_ms),
(r_time_in_epoch_ms_now, handler_time_now_utc, QUERY_TYPE_DT, h_unix_ms),
(r_time_in_epoch_ms2, handler_generic_parser, QUERY_TYPE_DT, h_unix_ms),
(r_time_in_epoch_ms3, handler_generic_parser, QUERY_TYPE_DT, h_unix_ms),
(r_timezone_translation, handler_timezone_translation, QUERY_TYPE_DT_TR, h_translation),
(r_timezone_translation_in_to, handler_timezone_translation_in_to, QUERY_TYPE_DT_TR, h_translation),
(r_time_since, handler_time_since_until, QUERY_TYPE_TD, h_default_td),
(r_time_until, handler_time_since_until, QUERY_TYPE_TD, h_default_td),
(r_time_between, handler_time_diff, QUERY_TYPE_TD, h_default_td),
(r_tz_between, handler_tz_diff, QUERY_TYPE_TD, h_default_td),
(r_time_plus, handler_time_plus, QUERY_TYPE_DT, h_default_dt),
(r_time_plus, handler_time_plus_rev, QUERY_TYPE_DT, h_default_dt),
(r_time_minus, handler_time_minus, QUERY_TYPE_DT, h_default_dt),
(r_time_before, handler_time_before, QUERY_TYPE_DT, h_default_dt),
(r_weekday_pre, handler_generic_parser, QUERY_TYPE_DT, h_day_of_week),
(r_weekday_post, handler_generic_parser, QUERY_TYPE_DT, h_day_of_week),
(r_workdays_since, handler_workdays_since_until, QUERY_TYPE_TD, h_default_td),
(r_workdays_until, handler_workdays_since_until, QUERY_TYPE_TD, h_default_td),
(r_workdays_between, handler_workdays_diff, QUERY_TYPE_TD, h_default_td),
(r_workhours_since, handler_workhours_since_until, QUERY_TYPE_TD, h_default_td),
(r_workhours_until, handler_workhours_since_until, QUERY_TYPE_TD, h_default_td),
(r_workhours_between, handler_workhours_diff, QUERY_TYPE_TD, h_default_td),
(r_time_in, handler_time_in_parser, QUERY_TYPE_DT, h_time_in),
(r_timezone, handler_timezone, QUERY_TYPE_TZ, h_tz_offset),
(r_timezone_2, handler_timezone, QUERY_TYPE_TZ, h_tz_offset),
(r_hour_minute_timezone, handler_timezone_creation, QUERY_TYPE_DT_TR, h_translation),
(r_calendar_year, handler_calendar, QUERY_TYPE_CAL, h_cal_year),
(r_calendar_month, handler_calendar, QUERY_TYPE_CAL, h_cal_month),
(r_generic, handler_generic_parser, QUERY_TYPE_DT, h_default_dt),
(r_time_in_epoch_s_now, handler_time_now_local, QUERY_TYPE_DT, [h_unix_s]),
(r_time_in_epoch_s_now, handler_time_now_utc, QUERY_TYPE_DT, [h_unix_s]),
(r_time_in_epoch_s2, handler_generic_parser, QUERY_TYPE_DT, [h_unix_s]),
(r_time_in_epoch_s3, handler_generic_parser, QUERY_TYPE_DT, [h_unix_s]),
(r_time_in_epoch_ms_now, handler_time_now_local, QUERY_TYPE_DT, [h_unix_ms]),
(r_time_in_epoch_ms_now, handler_time_now_utc, QUERY_TYPE_DT, [h_unix_ms]),
(r_time_in_epoch_ms2, handler_generic_parser, QUERY_TYPE_DT, [h_unix_ms]),
(r_time_in_epoch_ms3, handler_generic_parser, QUERY_TYPE_DT, [h_unix_ms]),
(r_timezone_translation, handler_timezone_translation, QUERY_TYPE_DT_TR, [h_translation]),
(r_timezone_translation_in_to, handler_timezone_translation_in_to, QUERY_TYPE_DT_TR, [h_translation]),
(r_time_since, handler_time_since_until, QUERY_TYPE_TD, [h_default_td]),
(r_time_until, handler_time_since_until, QUERY_TYPE_TD, [h_default_td]),
(r_time_between, handler_time_diff, QUERY_TYPE_TD, [h_default_td]),
(r_tz_between, handler_tz_diff, QUERY_TYPE_TD, [h_default_td]),
(r_time_plus, handler_time_plus, QUERY_TYPE_DT, [h_default_dt]),
(r_time_plus, handler_time_plus_rev, QUERY_TYPE_DT, [h_default_dt]),
(r_time_minus, handler_time_minus, QUERY_TYPE_DT, [h_default_dt]),
(r_time_before, handler_time_before, QUERY_TYPE_DT, [h_default_dt]),
(r_weekday_pre, handler_generic_parser, QUERY_TYPE_DT, [h_day_of_week]),
(r_weekday_post, handler_generic_parser, QUERY_TYPE_DT, [h_day_of_week]),
(r_workdays_since, handler_workdays_since_until, QUERY_TYPE_TD, [h_default_td]),
(r_workdays_until, handler_workdays_since_until, QUERY_TYPE_TD, [h_default_td]),
(r_workdays_between, handler_workdays_diff, QUERY_TYPE_TD, [h_default_td]),
(r_workhours_since, handler_workhours_since_until, QUERY_TYPE_TD, [h_default_td]),
(r_workhours_until, handler_workhours_since_until, QUERY_TYPE_TD, [h_default_td]),
(r_workhours_between, handler_workhours_diff, QUERY_TYPE_TD, [h_default_td]),
(r_time_in, handler_time_in_parser, QUERY_TYPE_DT, [h_time_in]),
(r_timezone, handler_timezone, QUERY_TYPE_TZ, [h_tz_offset]),
(r_timezone_cities, handler_timezone, QUERY_TYPE_TZ, [h_tz_iana, h_tz_abbr]),
(r_timezone_2, handler_timezone, QUERY_TYPE_TZ, [h_tz_offset]),
(r_hour_minute_timezone, handler_timezone_creation, QUERY_TYPE_DT_TR, [h_translation]),
(r_calendar_year, handler_calendar, QUERY_TYPE_CAL, [h_cal_year]),
(r_calendar_month, handler_calendar, QUERY_TYPE_CAL, [h_cal_month]),
(r_weeknum, handler_generic_parser, QUERY_TYPE_DT, [h_week_num]),
(r_generic, handler_generic_parser, QUERY_TYPE_DT, [h_default_dt]),
]
@ -271,20 +278,23 @@ def try_regex(r, s):
# Try every (regex, handler, query type, output selectors) entry of
# regex_handlers against the query string `s` and return the list of
# solution tuples: (handler name, handler result, query type, output selector).
# NOTE(review): several statements appear twice back to back (the `for`
# header, the "Running handler" log call, and the append). Presumably the `hi`
# lines are the pre-change form and the `his` lines the post-change form of
# this commit -- confirm against the real file.
def tokenize(s):
solutions = []
for r, h, t, hi in regex_handlers:
for r, h, t, his in regex_handlers:
logger.debug("Trying regex: {}".format(r))
# A None return from try_regex is treated as "no match"; any other value is
# unpacked as the handler's positional arguments.
g = try_regex(r, s)
if g is not None:
try:
logger.debug("Matched regex: {}".format(r))
logger.debug("Running handler: {} | query_type: {} | output_type: {}".format(h.__name__, t, hi))
logger.debug("Running handler: {} | query_type: {} | output_type: {}".format(h.__name__, t, his))
result = h(*g)
except Exception as e:
# Any exception from the handler is logged at debug level and this entry is
# skipped; it is not re-raised.
logger.debug("Exception from handler: {} -> {}".format(h.__name__, e))
continue
if result is not None:
logger.debug("Matched regex: {}".format(r))
solutions.append((h.__name__, result, t, hi))
# One solution is recorded per output selector in `his`, all sharing the
# same handler result.
for hi in his:
solution = h.__name__, result, t, hi
# NOTE(review): `solution` is a 4-tuple, so this test is always true.
if solution:
solutions.append(solution)
return solutions
@ -295,7 +305,7 @@ def pretty_print_dict(obj):
return colorful_json
def show_magic_results(obj, args, results=1):
def show_magic_results(obj, args):
rv = []
for solution in obj['solutions']:
entry_proxy = Cut(solution, sep='->')
@ -310,14 +320,14 @@ def show_magic_results(obj, args, results=1):
continue
if args.handlers:
# TODO: not working yet:
user_handlers = args.handlers.split(',')
for user_handler in user_handlers:
to_print = "{} -> {}".format(user_handler, highlight_result)
# user_handlers = args.handlers.split(',')
# for user_handler in user_handlers:
to_print = "{} -> {}".format(solution['handler'], highlight_result)
else:
to_print = highlight_result
rv.append(to_print)
print(to_print)
if len(rv) >= results:
if len(rv) >= args.results:
break
return rv
@ -331,6 +341,7 @@ def dt_pretty(dt):
rv["iso8601_date"] = dt.strftime('%Y-%m-%d')
rv["iso8601_time"] = dt.strftime('%H:%M:%S')
rv["day_of_week_number"] = dt.strftime("%w")
rv["week_number"] = dt.strftime("%W")
rv["locale"] = custom_locale
with setlocale(locale.LC_TIME, custom_locale.get("lc_time")):
rv["locale_month"] = dt.strftime("%B")
@ -441,7 +452,8 @@ def parse_args():
parser.add_argument('query', nargs='*', default="", help="freeform")
parser.add_argument('--locale', dest='locale')
parser.add_argument('--tz', dest='tz', default='local')
parser.add_argument('--handlers', dest='handlers')
parser.add_argument('--handlers', dest='handlers', action='store_true')
parser.add_argument('--results', dest='results', type=int, default=10)
parser.add_argument('--show', dest='show')
parser.add_argument('--full', dest='full', action='store_true')
parser.add_argument('--debug', dest='debug', action='store_true')