tokenizer next

This commit is contained in:
Daniel Tsvetkov 2019-12-13 15:16:42 +01:00
parent 3109540c08
commit f13d40f2ee
3 changed files with 292 additions and 233 deletions

View File

@ -1,193 +1,6 @@
import datetime
import os
from datetime import datetime, timedelta
import pytz
from dateparser import parse as parse_dt
from dateparser.timezone_parser import StaticTzInfo
from dateutil.tz import tzlocal, gettz
from tww import parse_query, solve_query, TZ_OFFSETS
def time_ago(date=None, diff=None):
    """
    Format an elapsed time span as a compact, signed duration string.

    The span comes from ``date`` when it is truthy, otherwise from ``diff``:

    * ``str``       - parsed with ``parse_dt``; span is "UTC now minus it".
    * ``timedelta`` - used as the span directly.
    * ``int``       - Unix timestamp in seconds; span is "UTC now minus it".
    * ``datetime``  - span is "UTC now minus it"; a naive value is compared
      against a naive UTC "now".

    When ``date`` is falsy, ``diff`` is a number of seconds (a falsy ``diff``
    means a zero span).

    Returns strings shaped like ``'05'``, ``'01:05'``, ``'02:03:04'``,
    ``'3d00:00:10'``, ``'1m15d00:00:00'`` or ``'1y1m5d00:00:00'``, prefixed
    with ``'-'`` for negative spans. Months count as 30 days and years as
    365 days.

    Raises ``ValueError`` when ``date`` has an unsupported type or is a
    string that cannot be parsed.

    Modified from: http://stackoverflow.com/a/1551394/141084
    """
    if not date:
        span = timedelta(seconds=diff) if diff else timedelta(0)
    elif isinstance(date, timedelta):
        span = date
    else:
        if isinstance(date, str):
            moment = parse_dt(date)
            if moment is None:
                raise ValueError(
                    'invalid date %s of type %s' % (date, type(date)))
        elif isinstance(date, int) and not isinstance(date, bool):
            # Build an aware UTC datetime so it can be subtracted from the
            # aware "now" (the naive local value used before raised TypeError).
            moment = datetime.fromtimestamp(date, tz=pytz.UTC)
        elif isinstance(date, datetime):
            moment = date
        else:
            raise ValueError(
                'invalid date %s of type %s' % (date, type(date)))
        # Naive and aware datetimes cannot be mixed in a subtraction, so
        # match the awareness of "now" to the value being compared.
        now = get_utcnow(tzaware=moment.tzinfo is not None)
        span = now - moment

    sign = ''
    if span.days < 0:
        span = -span
        sign = '-'

    total_days = span.days
    total_minutes, seconds = divmod(span.seconds, 60)
    hours, minutes = divmod(total_minutes, 60)
    clock = "{:02d}:{:02d}:{:02d}".format(hours, minutes, seconds)

    if total_days == 0:
        if span.seconds < 60:
            return "{}{:02d}".format(sign, seconds)
        if span.seconds < 3600:
            return "{}{:02d}:{:02d}".format(sign, minutes, seconds)
        return "{}{}".format(sign, clock)
    if total_days < 30:
        return "{}{}d{}".format(sign, total_days, clock)
    if total_days < 365:
        months, days = divmod(total_days, 30)
        return "{}{}m{}d{}".format(sign, months, days, clock)
    years, rest = divmod(total_days, 365)
    months, days = divmod(rest, 30)
    return "{}{}y{}m{}d{}".format(sign, years, months, days, clock)
def query_to_dt(query):
    """
    Resolve a free-form query by chaining ``parse_query`` and ``solve_query``.

    ``parse_query`` must yield exactly two values; both are passed to
    ``solve_query`` unchanged and its result is returned as is.
    """
    dt_part, tz_or_location_part = parse_query(query)
    return solve_query(dt_part, tz_or_location_part)
# ==================
def tzinfo_from_offset(offset: str) -> pytz.timezone:
    """
    Map a UTC offset string to a pytz timezone object.

    Colons are stripped from ``offset`` before it is looked up in
    ``TZ_OFFSETS``. When the lookup finds nothing, the UTC zone is returned.
    Otherwise the first candidate whose name starts with ``'Etc/GMT'`` wins,
    falling back to the first candidate in the list.
    """
    compact = offset.replace(':', '') if ':' in offset else offset
    candidates = TZ_OFFSETS.get(compact)
    if not candidates:
        return pytz.timezone("utc")
    fixed_name = next(
        (name for name in candidates if name.startswith('Etc/GMT')), None)
    chosen = candidates[0] if fixed_name is None else fixed_name
    return pytz.timezone(chosen)
def dateparser_parse_dt(s: str):
    """
    Parse ``s`` with ``parse_dt`` and make sure the result is tz-aware.

    Returns ``None`` when ``parse_dt`` yields a falsy result. A naive result
    is tagged with the pytz ``"utc"`` zone; an aware result is returned
    untouched.
    """
    result = parse_dt(s)
    if not result:
        return None
    if result.tzinfo is not None:
        return result
    return result.replace(tzinfo=pytz.timezone("utc"))
def get_utcnow(tzaware: bool = True):
    """
    Return the current UTC time.

    With ``tzaware`` true (the default) the value carries ``pytz.UTC`` as its
    tzinfo; otherwise the naive ``datetime.utcnow()`` value is returned.
    """
    naive_now = datetime.utcnow()
    if not tzaware:
        return naive_now
    return naive_now.replace(tzinfo=pytz.UTC)
def _offset_to_timedelta(offset: str) -> timedelta:
    """Convert an offset string such as '+03:00' or '-0330' to a timedelta."""
    if ':' in offset:
        signed_hours, minutes_text = offset.split(':')
    else:
        signed_hours, minutes_text = offset[:-2], offset[-2:]
    hours = int(signed_hours)
    minutes = int(minutes_text)
    # The sign is only written on the hour part; apply it to the minutes too
    # (checked on the text so that '-00:30' is handled as well).
    if signed_hours.strip().startswith('-'):
        minutes = -minutes
    return timedelta(hours=hours, minutes=minutes)


def dt_tz_translation(dt: datetime, to_tz_offset: str,
                      from_tz_offset: str = "+00:00") -> datetime:
    """
    Express ``dt`` in the timezone identified by ``to_tz_offset``.

    :param dt: datetime to translate.
    :param to_tz_offset: target UTC offset, ``'+HH:MM'`` or ``'+HHMM'``.
    :param from_tz_offset: offset a naive ``dt`` is assumed to be in.
    :return: an aware datetime. An aware ``dt`` is converted with
        ``astimezone``; a naive ``dt`` is shifted by the difference between
        the two offsets and then localized to the target zone.
    :raises ValueError: if an offset string cannot be parsed.
    """
    to_delta = _offset_to_timedelta(to_tz_offset)
    # The minutes of the source offset used to be sliced from
    # ``to_tz_offset``; both parts now come from ``from_tz_offset``.
    from_delta = _offset_to_timedelta(from_tz_offset)
    tzinfo = tzinfo_from_offset(to_tz_offset)
    if dt.tzinfo:
        return dt.astimezone(tzinfo)
    return tzinfo.localize(dt + to_delta - from_delta)
def format_offset_from_timedelta(tdelta: timedelta):
    """
    Format a UTC offset given as a timedelta into ``'+HH:MM'`` / ``'-HH:MM'``.

    The sign is taken from the total duration. ``timedelta.seconds`` is never
    negative (a negative delta is stored as negative days plus positive
    seconds), so it cannot be used to detect the sign.
    """
    total_seconds = int(tdelta.total_seconds())
    sign = "-" if total_seconds < 0 else "+"
    hours, remainder = divmod(abs(total_seconds), 3600)
    minutes = remainder // 60
    return "{}{:02d}:{:02d}".format(sign, hours, minutes)
def get_local_tzname_iana():
    """
    Derive the local timezone name from the zone file used by ``gettz()``.

    The zone file path is resolved through symlinks. If the resolved path
    contains a ``zoneinfo`` directory, everything below it is the name, which
    also covers one-part (``UTC``) and three-part
    (``America/Argentina/Buenos_Aires``) names. Otherwise the last two path
    components are used, as before.
    """
    # NOTE(review): relies on the private ``_filename`` attribute of the
    # object returned by gettz() - confirm against the dateutil version used.
    zone_path = os.path.realpath(gettz()._filename)
    parts = zone_path.split('/')
    if 'zoneinfo' in parts:
        last_zoneinfo = len(parts) - 1 - parts[::-1].index('zoneinfo')
        name_parts = parts[last_zoneinfo + 1:]
        if name_parts:
            return '/'.join(name_parts)
    return '/'.join(parts[-2:])
def get_local_tz_offset():
    """
    Return the local timezone offset formatted by
    ``format_offset_from_timedelta``.
    """
    local_zone = datetime.now(tzlocal()).tzinfo
    # NOTE(review): reads the private ``_std_offset`` attribute, which looks
    # like the standard (non-DST) offset - confirm this is intended.
    return format_offset_from_timedelta(local_zone._std_offset)
def tzname_to_tz_offset(tzname_iana: str):
    """
    Return the current UTC offset of the named pytz zone as a formatted
    string (see ``format_offset_from_timedelta``).
    """
    zone = pytz.timezone(tzname_iana)
    naive_utc_now = get_utcnow(False)
    current_offset = zone.utcoffset(naive_utc_now)
    return format_offset_from_timedelta(current_offset)
def get_dt_tz_offset(dt: datetime) -> timedelta:
    """
    Return the UTC offset of ``dt``, or a zero timedelta when it has none.

    Uses the public ``datetime.utcoffset()`` API instead of reading private
    attributes of specific tzinfo classes, so any tzinfo implementation
    (pytz, dateparser, dateutil, stdlib ``timezone``) is supported.
    """
    offset = dt.utcoffset()
    return timedelta(0) if offset is None else offset
def get_us_since_epoch(dt: datetime):
    """
    Return the number of microseconds between the Unix epoch and ``dt``.

    A naive ``dt`` is interpreted as local time, matching
    ``datetime.timestamp()``. The value is computed with integer timedelta
    arithmetic, so no precision is lost to floating point.

    Previously the UTC offset in *seconds* was added to this microsecond
    count for aware datetimes. That mixed units, and an epoch timestamp
    already identifies an absolute instant, so the addition was removed.
    """
    # Local import: only ``datetime`` and ``timedelta`` are imported at
    # module level.
    from datetime import timezone

    if dt.utcoffset() is None:
        dt = dt.astimezone()  # naive -> aware in the system local zone
    epoch = datetime(1970, 1, 1, tzinfo=timezone.utc)
    return (dt - epoch) // timedelta(microseconds=1)
def get_ms_since_epoch(dt):
    """Return ``get_us_since_epoch(dt)`` scaled down to whole milliseconds."""
    microseconds = get_us_since_epoch(dt)
    return int(microseconds / 1e3)
def get_s_since_epoch(dt):
    """Return ``get_us_since_epoch(dt)`` scaled down to whole seconds."""
    microseconds = get_us_since_epoch(dt)
    return int(microseconds / 1e6)
def epoch_to_dt(seconds):
    """
    Convert epoch seconds to the naive datetime produced by
    ``datetime.fromtimestamp`` (local time, no tzinfo).
    """
    local_dt = datetime.fromtimestamp(seconds)
    return local_dt
def time_to_emoji(dt):
    """
    Pick a single character for ``dt`` from the 24 code points starting at
    128336 (U+1F550).

    The epoch seconds of ``dt`` are reduced to a half-hour slot in 0..23;
    even slots select one of the first twelve code points and odd slots one
    of the following twelve.
    """
    # NOTE(review): U+1F550 onwards are presumably the clock-face emoji
    # (twelve "o'clock" faces, then twelve "thirty" faces) - confirm.
    epoch_seconds = get_s_since_epoch(dt)
    quarter_hours = epoch_seconds / 900
    half_hour_slot = int((quarter_hours - 3) / 2 % 24)
    hour_index, is_half_past = divmod(half_hour_slot, 2)
    return chr(128336 + hour_index + is_half_past * 12)
from tww import dateparser_parse_dt, get_utcnow, dt_tz_translation, get_local_tzname_iana, get_local_tz_offset, \
tzname_to_tz_offset, get_s_since_epoch, time_to_emoji
# When executed as a script, print the local timezone name reported by
# get_local_tzname_iana().
if __name__ == "__main__":
    print(get_local_tzname_iana())

View File

@ -1,42 +1,21 @@
import re
from datetime import timedelta
import sys
from datetime import timedelta, datetime
from src.tww.time_lib import dateparser_parse_dt, get_utcnow, \
get_s_since_epoch, \
get_ms_since_epoch
from tww import resolve_timezone, dateparser_parse_dt, get_utcnow, get_s_since_epoch, get_ms_since_epoch, \
dt_tz_translation, DEFAULT_FORMAT, get_local_now
# Query patterns. Raw string literals are used so that ``\s`` reaches the
# regex engine without relying on Python's deprecated handling of invalid
# string escapes; the pattern text itself is unchanged.

# "<datetime> in|to epoch / seconds since epoch / seconds"
r_time_in_epoch_s = re.compile(
    r'\s*(.*)?\s*(?:in|to)\s*(?:epoch|seconds since epoch|seconds)\s*',
    flags=re.IGNORECASE)
# "[seconds] since <datetime>"
r_time_in_epoch_s2 = re.compile(r'\s*(?:seconds)?\s*since\s*(.*)\s*',
                                flags=re.IGNORECASE)
# "<datetime> in|to ms / milliseconds" (the misspelling is accepted too)
r_time_in_epoch_ms = re.compile(
    r'\s*(.*)?\s*(?:in|to)\s*(?:ms|milliseconds|miliseconds)\s*',
    flags=re.IGNORECASE)
# "[ms|milliseconds] since <datetime>"
r_time_in_epoch_ms2 = re.compile(
    r'\s*(?:ms|milliseconds|miliseconds)?\s*since\s*(.*)\s*',
    flags=re.IGNORECASE)
# "[time] since <datetime>"
r_time_since = re.compile(r'\s*(?:time)?\s*since\s*(.*)\s*',
                          flags=re.IGNORECASE)
# "[time] until <datetime>"
r_time_until = re.compile(r'\s*(?:time)?\s*until\s*(.*)\s*',
                          flags=re.IGNORECASE)
# "[time] between <datetime> and <datetime>"
r_time_between = re.compile(r'\s*(?:time)?\s*between\s*(.*)\s*and\s*(.*)\s*',
                            flags=re.IGNORECASE)
# Sample inputs for manual checks: edge cases (None, empty, whitespace-only,
# a truncated keyword) followed by representative queries.
test_strings = [
    None,
    "",
    "s",
    " ",
    "Time since 2019-05-12",
    "Since yesterday",
    "time between yesterday and tomorrow",
    "time until 25 december",
    "time sinc",
    "now in milliseconds",
    "seconds since epoch",
    "1992-01-27 to epoch",
    "milliseconds since 1992-01-27",
]
# Query patterns. Raw string literals are used so that ``\s`` reaches the
# regex engine without relying on Python's deprecated handling of invalid
# string escapes; the pattern text itself is unchanged.

# Catch-all: the whole query is captured as one group.
r_generic = re.compile(r'(.*)', flags=re.IGNORECASE)

# Epoch in seconds.
r_time_in_epoch_s_now = re.compile(
    r'(?:time since epoch|seconds since epoch)', flags=re.IGNORECASE)
r_time_in_epoch_s2 = re.compile(
    r'(.*)?\s*(?:in|to)\s*(?:epoch|seconds since epoch|seconds)',
    flags=re.IGNORECASE)
r_time_in_epoch_s3 = re.compile(
    r'(?:seconds)?\s*since\s*(.*)', flags=re.IGNORECASE)

# Epoch in milliseconds (the "miliseconds" misspelling is accepted too).
r_time_in_epoch_ms_now = re.compile(
    r'(?:milliseconds since epoch)', flags=re.IGNORECASE)
r_time_in_epoch_ms2 = re.compile(
    r'(.*)?\s*(?:in|to)\s*(?:ms|milliseconds|miliseconds)',
    flags=re.IGNORECASE)
r_time_in_epoch_ms3 = re.compile(
    r'(?:ms|milliseconds|miliseconds)?\s*since\s*(.*)', flags=re.IGNORECASE)

# Durations relative to now, or between two datetimes.
r_time_since = re.compile(r'(?:time)?\s*since\s*(.*)', flags=re.IGNORECASE)
r_time_until = re.compile(r'(?:time)?\s*until\s*(.*)', flags=re.IGNORECASE)
r_time_between = re.compile(
    r'(?:time)?\s*between\s*(.*)\s*and\s*(.*)', flags=re.IGNORECASE)

# "<datetime> in|to <timezone-like text>"
r_timezone_translation = re.compile(
    r'(.*)?\s(?:in|to)\s(.*)', flags=re.IGNORECASE)
def handler_time_s(dt_s: str) -> int:
@ -47,6 +26,22 @@ def handler_time_ms(dt_s: str) -> int:
return get_ms_since_epoch(dateparser_parse_dt(dt_s))
def handler_time_s_now_local() -> int:
    """Return ``get_s_since_epoch`` applied to ``get_local_now()``."""
    local_now = get_local_now()
    return get_s_since_epoch(local_now)
def handler_time_ms_now_local() -> int:
    """Return ``get_ms_since_epoch`` applied to ``get_local_now()``."""
    local_now = get_local_now()
    return get_ms_since_epoch(local_now)
def handler_time_s_now_utc() -> int:
    """Return ``get_s_since_epoch`` applied to ``get_utcnow()``."""
    utc_now = get_utcnow()
    return get_s_since_epoch(utc_now)
def handler_time_ms_now_utc() -> int:
    """Return ``get_ms_since_epoch`` applied to ``get_utcnow()``."""
    utc_now = get_utcnow()
    return get_ms_since_epoch(utc_now)
def handler_time_diff(start_dt: str, end_dt: str) -> timedelta:
    """
    Parse both strings with ``dateparser_parse_dt`` and return end minus
    start.
    """
    end_moment = dateparser_parse_dt(end_dt)
    start_moment = dateparser_parse_dt(start_dt)
    return end_moment - start_moment
@ -59,14 +54,32 @@ def handler_time_until(end_dt_s: str) -> timedelta:
return handler_time_diff(str(get_utcnow()), end_dt_s)
def handler_timezone_translation(dt_s: str, timezone_like_s: str) -> datetime:
    """
    Parse ``dt_s`` and translate it into the timezone that
    ``resolve_timezone`` finds for ``timezone_like_s``.

    Returns ``None`` when ``resolve_timezone`` yields a falsy result;
    otherwise returns ``dt_tz_translation`` of the parsed datetime and the
    resolved ``'tz_offset'`` entry.
    """
    moment = dateparser_parse_dt(dt_s)
    resolved = resolve_timezone(timezone_like_s)
    if not resolved:
        return None
    target_offset = resolved.get('tz_offset')
    return dt_tz_translation(moment, target_offset)
def handler_generic_parser(dt_s: str) -> datetime:
    """Fallback handler: hand the whole text to ``dateparser_parse_dt``."""
    parsed = dateparser_parse_dt(dt_s)
    return parsed
# (pattern, handler) pairs. parse() walks this list in order; the groups
# captured by a pattern are passed to its handler as positional arguments.
regex_handlers = [
    (r_time_in_epoch_s, handler_time_s),
    (r_time_in_epoch_s_now, handler_time_s_now_local),
    (r_time_in_epoch_s_now, handler_time_s_now_utc),
    (r_time_in_epoch_s2, handler_time_s),
    (r_time_in_epoch_ms, handler_time_ms),
    (r_time_in_epoch_s3, handler_time_s),
    (r_time_in_epoch_ms_now, handler_time_ms_now_local),
    (r_time_in_epoch_ms_now, handler_time_ms_now_utc),
    (r_time_in_epoch_ms2, handler_time_ms),
    (r_time_in_epoch_ms3, handler_time_ms),
    (r_timezone_translation, handler_timezone_translation),
    (r_time_since, handler_time_since),
    (r_time_until, handler_time_until),
    (r_time_between, handler_time_diff),
    # Catch-all pattern; kept last.
    (r_generic, handler_generic_parser),
]
@ -81,15 +94,52 @@ def try_regex(r, s):
def parse(s):
    """
    Run ``s`` through every (pattern, handler) pair in ``regex_handlers``.

    For each pattern whose ``try_regex`` result is not ``None``, the handler
    is called with the captured groups. A handler that raises counts as
    "could not solve" and contributes ``None``.

    :return: list of ``(handler_name, result)`` tuples, in
        ``regex_handlers`` order, one per matching pattern.
    """
    solutions = []
    for pattern, handler in regex_handlers:
        groups = try_regex(pattern, s)
        if groups is None:
            continue
        try:
            result = handler(*groups)
        except Exception:
            # Best effort by design: one failing handler must not stop the
            # remaining handlers from being tried.
            result = None
        solutions.append((handler.__name__, result))
    return solutions
def test():
    """Print ``parse()`` output for a fixed set of sample queries."""
    samples = (
        # Edge cases.
        None,
        "",
        "s",
        " ",
        # Durations and epoch conversions.
        "Time since 2019-05-12",
        "Since yesterday",
        "time between yesterday and tomorrow",
        "time until 25 december",
        "time sinc",
        "now in milliseconds",
        "seconds since epoch",
        "1992-01-27 to epoch",
        "milliseconds since 1992-01-27",
        # Timezone translations.
        "now in sofia",
        "now in PST",
        "2 hours ago to Sydney",
        "now in +03:00",
        "now in dublin",
    )
    for sample in samples:
        outcome = parse(sample)
        print("{} -> {}".format(sample, outcome))
if __name__ == "__main__":
    # Print the parse results for the sample queries.
    for s in test_strings:
        print("{} -> {}".format(s, parse(s)))

    # Solve the query given on the command line and print one line per
    # handler that matched it.
    query = ' '.join(sys.argv[1:])
    results = parse(query)
    for handler, result in results:
        if result is None:
            # ``type(result) is None`` was never true, so this branch used to
            # be unreachable and unsolved queries printed "None".
            print(" {} -> Couldn't solve query".format(handler))
        elif isinstance(result, datetime):
            print(" {} -> {}".format(handler, result.strftime(DEFAULT_FORMAT)))
        else:
            # timedelta, int and any other result print via str().
            print(" {} -> {}".format(handler, result))

View File

@ -4,14 +4,19 @@ Find time now, in the past or future in any timezone or location.
"""
import argparse
import datetime
import logging
import os
from collections import defaultdict
import dateparser
from datetime import datetime
from datetime import datetime, timedelta
import pytz
from dateparser import parse as parse_dt
from dateutil.parser import parse as dutil_parse
from dateparser.timezone_parser import StaticTzInfo
from dateutil.tz import gettz, tzlocal
from pytz import timezone
from pytz.exceptions import UnknownTimeZoneError
@ -249,10 +254,16 @@ def resolve_timezone(query):
normal_query = query.lower().strip()
found_from_iana_tz = NORMALIZED_TZ_DICT.get(normal_query, "")
found_from_abbr_tzs = list(NORMALIZED_TZ_ABBR.get(normal_query, set()))
found_from_offset_tz = None
try:
found_from_offset_tz = tzinfo_from_offset(normal_query)
except: ...
normal_tz = found_from_iana_tz
if not normal_tz:
if found_from_abbr_tzs:
normal_tz = list(found_from_abbr_tzs)[0]
elif found_from_offset_tz:
normal_tz = found_from_offset_tz.zone
tz_abbrs = list(TZ_ABBRS_REVERSE.get(normal_tz, set()))
logger.debug("Normalized timezone: {} -> {}".format(query, normal_tz))
local_location, remote_location = {}, {}
@ -293,6 +304,7 @@ def resolve_timezone(query):
"normal_query": normal_query,
"found_from_iana_tz": found_from_iana_tz,
"found_from_abbr_tzs": found_from_abbr_tzs,
"found_from_offset_tzs": found_from_offset_tz,
"local_location": local_location,
"remote_location": remote_location,
"search_pytz": normal_tz,
@ -348,3 +360,187 @@ if __name__ == "__main__":
args = parse_args()
setup_logging_level(args.debug)
main(args)
def time_ago(date=None, diff=None):
    """
    Format an elapsed time span as a compact, signed duration string.

    The span comes from ``date`` when it is truthy, otherwise from ``diff``:

    * ``str``       - parsed with ``parse_dt``; span is "UTC now minus it".
    * ``timedelta`` - used as the span directly.
    * ``int``       - Unix timestamp in seconds; span is "UTC now minus it".
    * ``datetime``  - span is "UTC now minus it"; a naive value is compared
      against a naive UTC "now".

    When ``date`` is falsy, ``diff`` is a number of seconds (a falsy ``diff``
    means a zero span).

    Returns strings shaped like ``'05'``, ``'01:05'``, ``'02:03:04'``,
    ``'3d00:00:10'``, ``'1m15d00:00:00'`` or ``'1y1m5d00:00:00'``, prefixed
    with ``'-'`` for negative spans. Months count as 30 days and years as
    365 days.

    Raises ``ValueError`` when ``date`` has an unsupported type or is a
    string that cannot be parsed.

    Modified from: http://stackoverflow.com/a/1551394/141084
    """
    if not date:
        span = timedelta(seconds=diff) if diff else timedelta(0)
    elif isinstance(date, timedelta):
        span = date
    else:
        if isinstance(date, str):
            moment = parse_dt(date)
            if moment is None:
                raise ValueError(
                    'invalid date %s of type %s' % (date, type(date)))
        elif isinstance(date, int) and not isinstance(date, bool):
            # Build an aware UTC datetime so it can be subtracted from the
            # aware "now" (the naive local value used before raised TypeError).
            moment = datetime.fromtimestamp(date, tz=pytz.UTC)
        elif isinstance(date, datetime):
            moment = date
        else:
            raise ValueError(
                'invalid date %s of type %s' % (date, type(date)))
        # Naive and aware datetimes cannot be mixed in a subtraction, so
        # match the awareness of "now" to the value being compared.
        now = get_utcnow(tzaware=moment.tzinfo is not None)
        span = now - moment

    sign = ''
    if span.days < 0:
        span = -span
        sign = '-'

    total_days = span.days
    total_minutes, seconds = divmod(span.seconds, 60)
    hours, minutes = divmod(total_minutes, 60)
    clock = "{:02d}:{:02d}:{:02d}".format(hours, minutes, seconds)

    if total_days == 0:
        if span.seconds < 60:
            return "{}{:02d}".format(sign, seconds)
        if span.seconds < 3600:
            return "{}{:02d}:{:02d}".format(sign, minutes, seconds)
        return "{}{}".format(sign, clock)
    if total_days < 30:
        return "{}{}d{}".format(sign, total_days, clock)
    if total_days < 365:
        months, days = divmod(total_days, 30)
        return "{}{}m{}d{}".format(sign, months, days, clock)
    years, rest = divmod(total_days, 365)
    months, days = divmod(rest, 30)
    return "{}{}y{}m{}d{}".format(sign, years, months, days, clock)
def query_to_dt(query):
    """
    Resolve a free-form query by chaining ``parse_query`` and ``solve_query``.

    ``parse_query`` must yield exactly two values; both are passed to
    ``solve_query`` unchanged and its result is returned as is.
    """
    dt_part, tz_or_location_part = parse_query(query)
    return solve_query(dt_part, tz_or_location_part)
def tzinfo_from_offset(offset: str) -> pytz.timezone:
    """
    Map a UTC offset string to a pytz timezone object.

    Colons are stripped from ``offset`` before it is looked up in
    ``TZ_OFFSETS``. Among the candidate zone names, the first one starting
    with ``'Etc/GMT'`` is preferred; otherwise the first candidate is used.

    :raises ValueError: if ``TZ_OFFSETS`` has no zone for ``offset``.
        Previously this case failed with a TypeError from iterating ``None``.
    """
    if ':' in offset:
        offset = offset.replace(':', '')
    tznames = TZ_OFFSETS.get(offset)
    if not tznames:
        raise ValueError("unknown UTC offset: {!r}".format(offset))
    for tzname in tznames:
        if tzname.startswith('Etc/GMT'):
            return pytz.timezone(tzname)
    return pytz.timezone(tznames[0])
def dateparser_parse_dt(s: str):
    """
    Parse ``s`` into a timezone-aware datetime.

    ``parse_dt`` (dateparser) is tried first, with ``dutil_parse`` (dateutil)
    as a fallback. A naive result is tagged with the pytz ``"utc"`` zone.

    :return: an aware datetime, or ``None`` when neither parser can handle
        ``s``.
    """
    parsed = parse_dt(s)
    if not parsed:
        try:
            parsed = dutil_parse(s)
        except (ValueError, OverflowError):
            # dateutil signals an unparseable string by raising rather than
            # returning None; translate that into this function's None result.
            return None
    if parsed.tzinfo is None:
        parsed = parsed.replace(tzinfo=pytz.timezone("utc"))
    return parsed
def get_utcnow(tzaware: bool = True):
    """
    Return the current UTC time.

    With ``tzaware`` true (the default) the value carries ``pytz.UTC`` as its
    tzinfo; otherwise the naive ``datetime.utcnow()`` value is returned.
    """
    naive_now = datetime.utcnow()
    if not tzaware:
        return naive_now
    return naive_now.replace(tzinfo=pytz.UTC)
def get_local_now(tzaware: bool = True):
    """
    Return the current time for the local timezone.

    :param tzaware: when true (the default), return an aware datetime in the
        zone that ``tzinfo_from_offset`` gives for ``get_local_tz_offset()``;
        when false, return the naive local wall-clock time.
    """
    if tzaware:
        # datetime.now(tz) converts the current instant into the zone.
        # Attaching a pytz zone with replace(tzinfo=...) can pick the wrong
        # offset for zones that are not fixed-offset.
        return datetime.now(tzinfo_from_offset(get_local_tz_offset()))
    # Previously returned datetime.utcnow(), which is UTC, not local time.
    return datetime.now()
def _offset_to_timedelta(offset: str) -> timedelta:
    """Convert an offset string such as '+03:00' or '-0330' to a timedelta."""
    if ':' in offset:
        signed_hours, minutes_text = offset.split(':')
    else:
        signed_hours, minutes_text = offset[:-2], offset[-2:]
    hours = int(signed_hours)
    minutes = int(minutes_text)
    # The sign is only written on the hour part; apply it to the minutes too
    # (checked on the text so that '-00:30' is handled as well).
    if signed_hours.strip().startswith('-'):
        minutes = -minutes
    return timedelta(hours=hours, minutes=minutes)


def dt_tz_translation(dt: datetime, to_tz_offset: str, from_tz_offset: str = "+00:00") -> datetime:
    """
    Express ``dt`` in the timezone identified by ``to_tz_offset``.

    :param dt: datetime to translate.
    :param to_tz_offset: target UTC offset, ``'+HH:MM'`` or ``'+HHMM'``.
    :param from_tz_offset: offset a naive ``dt`` is assumed to be in.
    :return: an aware datetime. An aware ``dt`` is converted with
        ``astimezone``; a naive ``dt`` is shifted by the difference between
        the two offsets and then localized to the target zone.
    :raises ValueError: if an offset string cannot be parsed.
    """
    to_delta = _offset_to_timedelta(to_tz_offset)
    # The minutes of the source offset used to be sliced from
    # ``to_tz_offset``; both parts now come from ``from_tz_offset``.
    from_delta = _offset_to_timedelta(from_tz_offset)
    tzinfo = tzinfo_from_offset(to_tz_offset)
    if dt.tzinfo:
        return dt.astimezone(tzinfo)
    return tzinfo.localize(dt + to_delta - from_delta)
def format_offset_from_timedelta(tdelta: timedelta):
    """
    Format a UTC offset given as a timedelta into ``'+HH:MM'`` / ``'-HH:MM'``.

    The sign is taken from the total duration. ``timedelta.seconds`` is never
    negative (a negative delta is stored as negative days plus positive
    seconds), so it cannot be used to detect the sign.
    """
    total_seconds = int(tdelta.total_seconds())
    sign = "-" if total_seconds < 0 else "+"
    hours, remainder = divmod(abs(total_seconds), 3600)
    minutes = remainder // 60
    return "{}{:02d}:{:02d}".format(sign, hours, minutes)
def get_local_tzname_iana():
    """
    Derive the local timezone name from the zone file used by ``gettz()``.

    The zone file path is resolved through symlinks. If the resolved path
    contains a ``zoneinfo`` directory, everything below it is the name, which
    also covers one-part (``UTC``) and three-part
    (``America/Argentina/Buenos_Aires``) names. Otherwise the last two path
    components are used, as before.
    """
    # NOTE(review): relies on the private ``_filename`` attribute of the
    # object returned by gettz() - confirm against the dateutil version used.
    zone_path = os.path.realpath(gettz()._filename)
    parts = zone_path.split('/')
    if 'zoneinfo' in parts:
        last_zoneinfo = len(parts) - 1 - parts[::-1].index('zoneinfo')
        name_parts = parts[last_zoneinfo + 1:]
        if name_parts:
            return '/'.join(name_parts)
    return '/'.join(parts[-2:])
def get_local_tz_offset():
    """
    Return the local timezone offset formatted by
    ``format_offset_from_timedelta``.
    """
    local_zone = datetime.now(tzlocal()).tzinfo
    # NOTE(review): reads the private ``_std_offset`` attribute, which looks
    # like the standard (non-DST) offset - confirm this is intended.
    return format_offset_from_timedelta(local_zone._std_offset)
def tzname_to_tz_offset(tzname_iana: str):
    """
    Return the current UTC offset of the named pytz zone as a formatted
    string (see ``format_offset_from_timedelta``).
    """
    zone = pytz.timezone(tzname_iana)
    naive_utc_now = get_utcnow(False)
    current_offset = zone.utcoffset(naive_utc_now)
    return format_offset_from_timedelta(current_offset)
def get_dt_tz_offset(dt: datetime) -> timedelta:
    """
    Return the UTC offset of ``dt``, or a zero timedelta when it has none.

    Uses the public ``datetime.utcoffset()`` API instead of reading private
    attributes of specific tzinfo classes, so any tzinfo implementation
    (pytz, dateparser, dateutil, stdlib ``timezone``) is supported.
    """
    offset = dt.utcoffset()
    return timedelta(0) if offset is None else offset
def get_us_since_epoch(dt: datetime):
    """
    Return the number of microseconds between the Unix epoch and ``dt``.

    A naive ``dt`` is interpreted as local time, matching
    ``datetime.timestamp()``. The value is computed with integer timedelta
    arithmetic, so no precision is lost to floating point.

    Previously the UTC offset in *seconds* was added to this microsecond
    count for aware datetimes. That mixed units, and an epoch timestamp
    already identifies an absolute instant, so the addition was removed.
    """
    # Local import: only ``datetime`` and ``timedelta`` are imported at
    # module level.
    from datetime import timezone

    if dt.utcoffset() is None:
        dt = dt.astimezone()  # naive -> aware in the system local zone
    epoch = datetime(1970, 1, 1, tzinfo=timezone.utc)
    return (dt - epoch) // timedelta(microseconds=1)
def get_ms_since_epoch(dt):
    """Return ``get_us_since_epoch(dt)`` scaled down to whole milliseconds."""
    microseconds = get_us_since_epoch(dt)
    return int(microseconds / 1e3)
def get_s_since_epoch(dt):
    """Return ``get_us_since_epoch(dt)`` scaled down to whole seconds."""
    microseconds = get_us_since_epoch(dt)
    return int(microseconds / 1e6)
def epoch_to_dt(seconds):
    """
    Convert epoch seconds to the naive datetime produced by
    ``datetime.fromtimestamp`` (local time, no tzinfo).
    """
    local_dt = datetime.fromtimestamp(seconds)
    return local_dt
def time_to_emoji(dt):
    """
    Pick a single character for ``dt`` from the 24 code points starting at
    128336 (U+1F550).

    The epoch seconds of ``dt`` are reduced to a half-hour slot in 0..23;
    even slots select one of the first twelve code points and odd slots one
    of the following twelve.
    """
    # NOTE(review): U+1F550 onwards are presumably the clock-face emoji
    # (twelve "o'clock" faces, then twelve "thirty" faces) - confirm.
    epoch_seconds = get_s_since_epoch(dt)
    quarter_hours = epoch_seconds / 900
    half_hour_slot = int((quarter_hours - 3) / 2 % 24)
    hour_index, is_half_past = divmod(half_hour_slot, 2)
    return chr(128336 + hour_index + is_half_past * 12)