tokenizer

This commit is contained in:
Daniel Tsvetkov 2019-12-12 22:29:15 +01:00
parent 3a39204d26
commit 3109540c08
2 changed files with 193 additions and 45 deletions

View File

@ -11,7 +11,8 @@ from tww import parse_query, solve_query, TZ_OFFSETS
def time_ago(date=None, diff=None): def time_ago(date=None, diff=None):
""" """
Get a datetime object, timedelta object or a int() Epoch timestamp and return a Get a datetime object, timedelta object or a int() Epoch timestamp and
return a
pretty string like 'an hour ago', 'Yesterday', '3 months ago', pretty string like 'an hour ago', 'Yesterday', '3 months ago',
'just now', etc 'just now', etc
Modified from: http://stackoverflow.com/a/1551394/141084 Modified from: http://stackoverflow.com/a/1551394/141084
@ -56,19 +57,25 @@ def time_ago(date=None, diff=None):
minutes = second_diff % 3600 // 60 minutes = second_diff % 3600 // 60
if second_diff < 86400: if second_diff < 86400:
hrs_diff = second_diff // 3600 hrs_diff = second_diff // 3600
return str("{}{:02d}:{:02d}:{:02d}".format(sign, hrs_diff, minutes, seconds)) return str("{}{:02d}:{:02d}:{:02d}".format(sign, hrs_diff, minutes,
seconds))
seconds = second_diff % 60 seconds = second_diff % 60
minutes = second_diff % 3600 // 60 minutes = second_diff % 3600 // 60
hours = second_diff // 3600 hours = second_diff // 3600
if day_diff < 365: if day_diff < 365:
if day_diff < 30: if day_diff < 30:
return str("{}{}d{:02d}:{:02d}:{:02d}".format(sign, day_diff, hours, minutes, seconds)) return str("{}{}d{:02d}:{:02d}:{:02d}".format(sign, day_diff, hours,
minutes, seconds))
months, days = day_diff // 30, day_diff % 30 months, days = day_diff // 30, day_diff % 30
return str("{}{}m{}d{:02d}:{:02d}:{:02d}".format(sign, months, days, hours, minutes, seconds)) return str(
"{}{}m{}d{:02d}:{:02d}:{:02d}".format(sign, months, days, hours,
minutes, seconds))
years = day_diff // 365 years = day_diff // 365
days = day_diff % 365 days = day_diff % 365
months, days = days // 30, days % 30 months, days = days // 30, days % 30
return str("{}{}y{}m{}d{:02d}:{:02d}:{:02d}".format(sign, years, months, days, hours, minutes, seconds)) return str(
"{}{}y{}m{}d{:02d}:{:02d}:{:02d}".format(sign, years, months, days,
hours, minutes, seconds))
def query_to_dt(query): def query_to_dt(query):
@ -90,9 +97,13 @@ def tzinfo_from_offset(offset: str) -> pytz.timezone:
def dateparser_parse_dt(s: str): def dateparser_parse_dt(s: str):
print("Dateparser query: {}".format(s)) # print("Dateparser query: {}".format(s))
parsed = parse_dt(s) parsed = parse_dt(s)
print("Dateparser parsed query: {}".format(parsed)) # print("Dateparser parsed query: {}".format(parsed))
if not parsed:
return None
if parsed.tzinfo is None:
parsed = parsed.replace(tzinfo=pytz.timezone("utc"))
return parsed return parsed
@ -102,7 +113,8 @@ def get_utcnow(tzaware: bool = True):
return datetime.utcnow() return datetime.utcnow()
def dt_tz_translation(dt: datetime, to_tz_offset: str, from_tz_offset: str = "+00:00") -> datetime: def dt_tz_translation(dt: datetime, to_tz_offset: str,
from_tz_offset: str = "+00:00") -> datetime:
if ':' in to_tz_offset: if ':' in to_tz_offset:
to_shh, to_mm = to_tz_offset.split(':') to_shh, to_mm = to_tz_offset.split(':')
else: else:
@ -114,7 +126,8 @@ def dt_tz_translation(dt: datetime, to_tz_offset: str, from_tz_offset: str = "+0
tzinfo = tzinfo_from_offset(to_tz_offset) tzinfo = tzinfo_from_offset(to_tz_offset)
if dt.tzinfo: if dt.tzinfo:
return dt.astimezone(tzinfo) return dt.astimezone(tzinfo)
r_dt = dt + timedelta(hours=int(to_shh), minutes=int(to_mm)) - timedelta(hours=int(from_shh), minutes=int(from_mm)) r_dt = dt + timedelta(hours=int(to_shh), minutes=int(to_mm)) - timedelta(
hours=int(from_shh), minutes=int(from_mm))
r_dt = tzinfo.localize(r_dt) r_dt = tzinfo.localize(r_dt)
return r_dt return r_dt
@ -131,11 +144,13 @@ def get_local_tzname_iana():
def get_local_tz_offset(): def get_local_tz_offset():
return format_offset_from_timedelta(datetime.now(tzlocal()).tzinfo._std_offset) return format_offset_from_timedelta(
datetime.now(tzlocal()).tzinfo._std_offset)
def tzname_to_tz_offset(tzname_iana: str): def tzname_to_tz_offset(tzname_iana: str):
return format_offset_from_timedelta(pytz.timezone(tzname_iana).utcoffset(get_utcnow(False))) return format_offset_from_timedelta(
pytz.timezone(tzname_iana).utcoffset(get_utcnow(False)))
def get_dt_tz_offset(dt: datetime) -> timedelta: def get_dt_tz_offset(dt: datetime) -> timedelta:
@ -148,20 +163,28 @@ def get_dt_tz_offset(dt: datetime) -> timedelta:
return timedelta(0) return timedelta(0)
def get_seconds_since_epoch(dt): def get_us_since_epoch(dt: datetime):
utc_seconds = int(dt.timestamp()) utc_seconds = int(dt.timestamp() * 1e6)
if dt.tzinfo is None: if dt.tzinfo is None:
return utc_seconds return utc_seconds
local_seconds = get_dt_tz_offset(dt).seconds local_seconds = get_dt_tz_offset(dt).seconds
return utc_seconds + local_seconds return utc_seconds + local_seconds
def get_ms_since_epoch(dt):
    """Return ``get_us_since_epoch(dt)`` scaled down to milliseconds.

    The microsecond value is divided by 1e3 and truncated with ``int()``.
    """
    microseconds = get_us_since_epoch(dt)
    return int(microseconds / 1e3)
def get_s_since_epoch(dt):
    """Return ``get_us_since_epoch(dt)`` scaled down to seconds.

    The microsecond value is divided by 1e6 and truncated with ``int()``.
    """
    microseconds = get_us_since_epoch(dt)
    return int(microseconds / 1e6)
def epoch_to_dt(seconds): def epoch_to_dt(seconds):
return datetime.fromtimestamp(seconds) return datetime.fromtimestamp(seconds)
def time_to_emoji(dt): def time_to_emoji(dt):
seconds = get_seconds_since_epoch(dt) seconds = get_s_since_epoch(dt)
a = int((seconds / 900 - 3) / 2 % 24) a = int((seconds / 900 - 3) / 2 % 24)
return chr(128336 + a // 2 + a % 2 * 12) return chr(128336 + a // 2 + a % 2 * 12)
@ -187,7 +210,7 @@ if __name__ == "__main__":
) )
)) ))
print(get_seconds_since_epoch( print(get_s_since_epoch(
dateparser_parse_dt("2019-12-11 15:53:40+0000") dateparser_parse_dt("2019-12-11 15:53:40+0000")
)) ))
@ -204,21 +227,30 @@ if __name__ == "__main__":
"Now in New York" # Location:City "Now in New York" # Location:City
"Now in Bulgaria" # Location:Country "Now in Bulgaria" # Location:Country
"Now in USA" # Location:Country - multiple timezones "Now in USA" # Location:Country - multiple timezones
"Now in CET" # Timezone:Abbreviation - https://en.wikipedia.org/wiki/List_of_time_zone_abbreviations "Now in CET" # Timezone:Abbreviation -
"Now in Europe/Zurich" # Timezone:IANA - tz db, https://en.wikipedia.org/wiki/List_of_tz_database_time_zones, timezone database # https://en.wikipedia.org/wiki/List_of_time_zone_abbreviations
"Now in Alfa" # Timezone:Military - https://en.wikipedia.org/wiki/List_of_military_time_zones "Now in Europe/Zurich" # Timezone:IANA - tz db,
"Now in +02:00" # Timezone:Offset - https://en.wikipedia.org/wiki/List_of_UTC_time_offsets # https://en.wikipedia.org/wiki/List_of_tz_database_time_zones, timezone
# database
"Now in Alfa" # Timezone:Military -
# https://en.wikipedia.org/wiki/List_of_military_time_zones
"Now in +02:00" # Timezone:Offset -
# https://en.wikipedia.org/wiki/List_of_UTC_time_offsets
"<TIME> IN <WHEREVER> TO <WHEREVER>" "<TIME> IN <WHEREVER> TO <WHEREVER>"
"==================================" "=================================="
"Now in Bulgaria to UTC" # == "Now in UTC" "Now in Bulgaria to UTC" # == "Now in UTC"
"2019-12-12 03:14:15+03:00 to New York" # Something in +03:00 to Location:City "2019-12-12 03:14:15+03:00 to New York" # Something in +03:00 to
# Location:City
"<TIMEDELTA>" "<TIMEDELTA>"
"=================" "================="
"[Time] since <TIME>" # now-<TIME> -> timedelta "[Time] since <TIME>" # now-<TIME> ->
"[Time] until <TIME>" # <TIME>-now -> timedelta # timedelta
"[Time] between 27-01-1992 and 09 May 1997" # == end_dt - start_dt -> timedelta "[Time] until <TIME>" # <TIME>-now ->
# timedelta
"[Time] between 27-01-1992 and 09 May 1997" # == end_dt - start_dt ->
# timedelta
"<TIME CALCULATION>" "<TIME CALCULATION>"
"=================" "================="
@ -227,7 +259,6 @@ if __name__ == "__main__":
"12-12-2019 + 2 weeks" # == dt + timedelta -> datetime "12-12-2019 + 2 weeks" # == dt + timedelta -> datetime
"05:23 - 150 minutes" # == dt - timedelta -> datetime "05:23 - 150 minutes" # == dt - timedelta -> datetime
# strptime: str -> datetime # strptime: str -> datetime
# strftime: datetime -> str # strftime: datetime -> str
@ -248,32 +279,54 @@ if __name__ == "__main__":
# https://docs.python.org/3/library/datetime.html # https://docs.python.org/3/library/datetime.html
# #
# %a Weekday as locales abbreviated name. "Sun, Mon, …, Sat (en_US); So, Mo, …, Sa (de_DE) # %a Weekday as locales abbreviated name.
# %A Weekday as locales full name. Sunday, Monday, …, Saturday (en_US); Sonntag, Montag, …, Samstag (de_DE) # "Sun, Mon, …, Sat (en_US); So, Mo, …, Sa (de_DE)
# %w Weekday as a decimal number, where 0 is Sunday and 6 is Saturday. 0, 1, …, 6 # %A Weekday as locales full name.
# %d Day of the month as a zero-padded decimal number. 01, 02, …, 31 # Sunday, Monday, …, Saturday (en_US); Sonntag, Montag, …, Samstag (de_DE)
# %b Month as locales abbreviated name. Jan, Feb, …, Dec (en_US); Jan, Feb, …, Dez (de_DE) # %w Weekday as a decimal number, where 0 is Sunday and 6 is
# %B Month as locales full name. January, February, …, December (en_US); Januar, Februar, …, Dezember (de_DE) # Saturday. 0, 1, …, 6
# %m Month as a zero-padded decimal number. 01, 02, …, 12 # %d Day of the month as a zero-padded decimal number.
# %y Year without century as a zero-padded decimal number. 00, 01, …, 99 # 01, 02, …, 31
# %Y Year with century as a decimal number. 0001, 0002, …, 2013, 2014, …, 9998, 9999 # %b Month as locales abbreviated name.
# %H Hour (24-hour clock) as a zero-padded decimal number. 00, 01, …, 23 # Jan, Feb, …, Dec (en_US); Jan, Feb, …, Dez (de_DE)
# %I Hour (12-hour clock) as a zero-padded decimal number. 01, 02, …, 12 # %B Month as locales full name.
# %p Locales equivalent of either AM or PM. AM, PM (en_US); am, pm (de_DE) # January, February, …, December (en_US); Januar, Februar, …, Dezember (
# %M Minute as a zero-padded decimal number. 00, 01, …, 59 # de_DE)
# %S Second as a zero-padded decimal number. 00, 01, …, 59 # %m Month as a zero-padded decimal number.
# %f Microsecond as a decimal number, zero-padded on the left. 000000, 000001, …, 999999 # 01, 02, …, 12
# %y Year without century as a zero-padded decimal number.
# 00, 01, …, 99
# %Y Year with century as a decimal number.
# 0001, 0002, …, 2013, 2014, …, 9998, 9999
# %H Hour (24-hour clock) as a zero-padded decimal number.
# 00, 01, …, 23
# %I Hour (12-hour clock) as a zero-padded decimal number.
# 01, 02, …, 12
# %p Locales equivalent of either AM or PM.
# AM, PM (en_US); am, pm (de_DE)
# %M Minute as a zero-padded decimal number.
# 00, 01, …, 59
# %S Second as a zero-padded decimal number.
# 00, 01, …, 59
# %f Microsecond as a decimal number, zero-padded on the left.
# 000000, 000001, …, 999999
# %z UTC offset in the form ±HHMM[SS[.ffffff]] # %z UTC offset in the form ±HHMM[SS[.ffffff]]
# (empty string if the object is naive). (empty), +0000, -0400, +1030, +063415, -030712.345216 # (empty string if the object is naive).
# %Z Time zone name (empty string if the object is naive). (empty), UTC, EST, CST # (empty), +0000, -0400, +1030, +063415, -030712.345216
# %j Day of the year as a zero-padded decimal number. 001, 002, …, 366 # %Z Time zone name (empty string if the object is naive).
# (empty), UTC, EST, CST
# %j Day of the year as a zero-padded decimal number.
# 001, 002, …, 366
# %U Week number of the year (Sunday as the first day of the week) # %U Week number of the year (Sunday as the first day of the week)
# as a zero padded decimal number. All days in a new year # as a zero padded decimal number. All days in a new year
# preceding the first Sunday are considered to be in week 0. 00, 01, …, 53 # preceding the first Sunday are considered to be in week
# 0. 00, 01, …, 53
# %W Week number of the year (Monday as the first day of the week) # %W Week number of the year (Monday as the first day of the week)
# as a decimal number. All days in a new year preceding the # as a decimal number. All days in a new year preceding the
# first Monday are considered to be in week 0. 00, 01, …, 53 # first Monday are considered to be in week 0.
# %c Locales appropriate date and time representation. Tue Aug 16 21:30:00 1988 (en_US); Di 16 Aug 21:30:00 1988 (de_DE) # 00, 01, …, 53
# %c Locales appropriate date and time representation.
# Tue Aug 16 21:30:00 1988 (en_US); Di 16 Aug 21:30:00 1988 (de_DE)
# %x Locales appropriate date representation. 08/16/1988 (en_US); 16.08.1988 (de_DE) # %x Locales appropriate date representation. 08/16/1988 (en_US); 16.08.1988 (de_DE)
# %X Locales appropriate time representation. 21:30:00 (en_US); 21:30:00 (de_DE) # %X Locales appropriate time representation. 21:30:00 (en_US); 21:30:00 (de_DE)
# %% A literal '%' character. % # %% A literal '%' character. %

95
src/tww/tokenizer.py Normal file
View File

@ -0,0 +1,95 @@
import re
from datetime import timedelta
from src.tww.time_lib import dateparser_parse_dt, get_utcnow, \
get_s_since_epoch, \
get_ms_since_epoch
# Query patterns. All are case-insensitive and written as raw strings so that
# ``\s`` reaches the regex engine as-is instead of being an invalid string
# escape. Each capture group holds the free-form date/time text that is handed
# to the matching handler; captured text may keep surrounding whitespace.

# "<time> in|to epoch", "<time> in|to seconds [since epoch]"
r_time_in_epoch_s = re.compile(
    r'\s*(.*)?\s*(?:in|to)\s*(?:epoch|seconds since epoch|seconds)\s*',
    flags=re.IGNORECASE)

# "[seconds] since <time>" -- the unit keyword is optional.
r_time_in_epoch_s2 = re.compile(
    r'\s*(?:seconds)?\s*since\s*(.*)\s*',
    flags=re.IGNORECASE)

# "<time> in|to ms|milliseconds" (the misspelling "miliseconds" is accepted).
r_time_in_epoch_ms = re.compile(
    r'\s*(.*)?\s*(?:in|to)\s*(?:ms|milliseconds|miliseconds)\s*',
    flags=re.IGNORECASE)

# "[ms|milliseconds] since <time>" -- the unit keyword is optional.
r_time_in_epoch_ms2 = re.compile(
    r'\s*(?:ms|milliseconds|miliseconds)?\s*since\s*(.*)\s*',
    flags=re.IGNORECASE)

# "[time] since <time>"
r_time_since = re.compile(
    r'\s*(?:time)?\s*since\s*(.*)\s*',
    flags=re.IGNORECASE)

# "[time] until <time>"
r_time_until = re.compile(
    r'\s*(?:time)?\s*until\s*(.*)\s*',
    flags=re.IGNORECASE)

# "[time] between <start> and <end>" -- two groups: start, then end.
r_time_between = re.compile(
    r'\s*(?:time)?\s*between\s*(.*)\s*and\s*(.*)\s*',
    flags=re.IGNORECASE)
# Sample queries run through parse() by the __main__ block at the end of this
# module: a mix of non-string, empty and unmatched inputs plus examples of the
# supported phrasings.
test_strings = [
    None,  # not a string: re.match raises and try_regex reports no match
    "",
    "s",
    " ",
    "Time since 2019-05-12",
    "Since yesterday",
    "time between yesterday and tomorrow",
    "time until 25 december",
    "time sinc",  # truncated keyword: none of the patterns match
    "now in milliseconds",
    "seconds since epoch",
    "1992-01-27 to epoch",
    "milliseconds since 1992-01-27",
]
def handler_time_s(dt_s: str) -> int:
    """Parse ``dt_s`` and return it as seconds since the epoch.

    The text is parsed with ``dateparser_parse_dt`` and the result is passed
    straight to ``get_s_since_epoch``; nothing is validated in between.
    """
    parsed_dt = dateparser_parse_dt(dt_s)
    return get_s_since_epoch(parsed_dt)
def handler_time_ms(dt_s: str) -> int:
    """Parse ``dt_s`` and return it as milliseconds since the epoch.

    The text is parsed with ``dateparser_parse_dt`` and the result is passed
    straight to ``get_ms_since_epoch``; nothing is validated in between.
    """
    parsed_dt = dateparser_parse_dt(dt_s)
    return get_ms_since_epoch(parsed_dt)
def handler_time_diff(start_dt: str, end_dt: str) -> timedelta:
    """Return the parsed ``end_dt`` minus the parsed ``start_dt``.

    Both arguments are free-form date/time strings handed to
    ``dateparser_parse_dt``; the end value is parsed first, then the start.
    """
    end_parsed = dateparser_parse_dt(end_dt)
    start_parsed = dateparser_parse_dt(start_dt)
    return end_parsed - start_parsed
def handler_time_since(start_dt_s: str) -> timedelta:
    """Return the time elapsed from ``start_dt_s`` up to now.

    "Now" is ``get_utcnow()`` rendered with ``str()`` so that it goes through
    the same string parsing as the start value in ``handler_time_diff``.
    """
    now_text = str(get_utcnow())
    return handler_time_diff(start_dt_s, now_text)
def handler_time_until(end_dt_s: str) -> timedelta:
    """Return the time remaining from now until ``end_dt_s``.

    "Now" is ``get_utcnow()`` rendered with ``str()`` so that it goes through
    the same string parsing as the end value in ``handler_time_diff``.
    """
    now_text = str(get_utcnow())
    return handler_time_diff(now_text, end_dt_s)
# Ordered (pattern, handler) dispatch table. parse() walks it top to bottom
# and returns the result of the first handler that matches and does not raise,
# so the order of the entries is part of the behavior. Each handler receives
# the pattern's capture groups as positional arguments.
# NOTE(review): the two epoch "since" patterns come before r_time_since and
# their unit keyword is optional, so a query that starts with a bare
# "since ..." reaches handler_time_s first -- confirm this is intended.
regex_handlers = [
    (r_time_in_epoch_s, handler_time_s),
    (r_time_in_epoch_s2, handler_time_s),
    (r_time_in_epoch_ms, handler_time_ms),
    (r_time_in_epoch_ms2, handler_time_ms),
    (r_time_since, handler_time_since),
    (r_time_until, handler_time_until),
    (r_time_between, handler_time_diff),
]
def try_regex(r, s):
    """Match ``s`` against ``r`` from the start and return the capture groups.

    Args:
        r: A compiled pattern or a pattern string, as accepted by ``re.match``.
        s: The candidate text. Non-string values such as ``None`` are
            tolerated and treated as "no match".

    Returns:
        The tuple from ``Match.groups()`` when ``r`` matches at the start of
        ``s`` (an empty tuple if the pattern has no groups), otherwise
        ``None``.
    """
    try:
        m = re.match(r, s)
    except (TypeError, re.error):
        # TypeError: ``s`` is not str/bytes. re.error: ``r`` is an invalid
        # pattern string. Anything else (e.g. KeyboardInterrupt) propagates.
        return None
    if m is None:
        return None
    return m.groups()
def parse(s):
    """Return the result of the first handler that accepts the query ``s``.

    Walks ``regex_handlers`` in order. For every pattern that matches, the
    captured groups are passed positionally to the paired handler; the first
    handler that returns without raising wins.

    Args:
        s: The query text. Non-string values never match.

    Returns:
        Whatever the winning handler returns, or ``None`` when no pattern
        matches or every matching handler fails.
    """
    for pattern, handler in regex_handlers:
        groups = try_regex(pattern, s)
        if groups is None:
            continue
        try:
            return handler(*groups)
        except Exception:
            # Best-effort dispatch: a handler that cannot cope with the
            # captured text just means "try the next pattern". Only Exception
            # is caught so KeyboardInterrupt/SystemExit still propagate.
            continue
    return None
if __name__ == "__main__":
    # Smoke run: print every sample query next to whatever parse() returns.
    for sample in test_strings:
        outcome = parse(sample)
        print("{} -> {}".format(sample, outcome))