"""Provides the :class:`Arrow ` class, a better way to parse datetime strings."""
import re import sys from datetime import datetime, timedelta from datetime import tzinfo as dt_tzinfo from functools import lru_cache from typing import (
Any,
ClassVar,
Dict,
Iterable,
List,
Match,
Optional,
Pattern,
SupportsFloat,
SupportsInt,
Tuple,
Union,
cast,
overload,
)
from dateutil import tz
from arrow import locales from arrow.constants import DEFAULT_LOCALE from arrow.util import next_weekday, normalize_timestamp
if sys.version_info < (3, 8): # pragma: no cover from typing_extensions import Literal, TypedDict else: from typing import Literal, TypedDict # pragma: no cover
class ParserError(ValueError):
    """Error raised when a datetime string or format string cannot be parsed.

    Subclasses :class:`ValueError`, so callers that already catch
    ``ValueError`` also catch parser failures.
    """
class ParserMatchError(ParserError):
    """Parser error that signals a failed match against one format.

    Having a dedicated subclass allows plain ``ParserError`` instances to
    propagate from ``_build_datetime()`` when day_of_year errors occur.
    Previously those errors were caught by the try/except in
    ``_parse_multiformat()`` and the appropriate error message never
    reached the user.
    """
class _Parts(TypedDict, total=False):
    """Datetime components gathered while parsing a string.

    ``total=False`` makes every key optional, so a mapping only carries the
    components that were actually present in the parsed input.
    """

    # Calendar date components.
    year: int
    month: int
    day: int
    day_of_year: int
    day_of_week: int
    # Week-date triple; the last element may be None.
    weekdate: Tuple[_WEEKDATE_ELEMENT, _WEEKDATE_ELEMENT, Optional[_WEEKDATE_ELEMENT]]

    # Time-of-day components.
    hour: int
    minute: int
    second: int
    microsecond: int
    am_pm: Literal["am", "pm"]

    # Timestamp values and timezone.
    timestamp: float
    expanded_timestamp: int
    tzinfo: dt_tzinfo
# TODO: since we support more than ISO 8601, we should rename this function # IDEA: break into multiple functions def parse_iso(
self, datetime_string: str, normalize_whitespace: bool = False
) -> datetime: if normalize_whitespace:
datetime_string = re.sub(r"\s+", " ", datetime_string.strip())
has_space_divider = " "in datetime_string
has_t_divider = "T"in datetime_string
num_spaces = datetime_string.count(" ") if has_space_divider and num_spaces != 1 or has_t_divider and num_spaces > 0: raise ParserError(
f"Expected an ISO 8601-like string, but was given {datetime_string!r}. " "Try passing in a format string to resolve this."
)
has_time = has_space_divider or has_t_divider
has_tz = False
# date formats (ISO 8601 and others) to test against # NOTE: YYYYMM is omitted to avoid confusion with YYMMDD (no longer part of ISO 8601, but is still often used)
formats = [ "YYYY-MM-DD", "YYYY-M-DD", "YYYY-M-D", "YYYY/MM/DD", "YYYY/M/DD", "YYYY/M/D", "YYYY.MM.DD", "YYYY.M.DD", "YYYY.M.D", "YYYYMMDD", "YYYY-DDDD", "YYYYDDDD", "YYYY-MM", "YYYY/MM", "YYYY.MM", "YYYY", "W",
]
if has_time: if has_space_divider:
date_string, time_string = datetime_string.split(" ", 1) else:
date_string, time_string = datetime_string.split("T", 1)
if time_components isNone: raise ParserError( "Invalid time component provided. " "Please specify a format or provide a valid time component in the basic or extended ISO 8601 time format."
)
if has_space_divider:
formats = [f"{f} {time_string}"for f in formats] else:
formats = [f"{f}T{time_string}"for f in formats]
if has_time and has_tz: # Add "Z" or "ZZ" to the format strings to indicate to # _parse_token() that a timezone needs to be parsed
formats = [f"{f}{tz_format}"for f in formats]
def _generate_pattern_re(self, fmt: str) -> Tuple[List[_FORMAT_TYPE], Pattern[str]]: # fmt is a string of tokens like 'YYYY-MM-DD' # we construct a new string by replacing each # token by its pattern: # 'YYYY-MM-DD' -> '(?P<YYYY>\d{4})-(?P<MM>\d{2})-(?P<DD>\d{2})'
tokens: List[_FORMAT_TYPE] = []
offset = 0
# Escape all special RegEx chars
escaped_fmt = re.escape(fmt)
# Extract the bracketed expressions to be reinserted later.
escaped_fmt = re.sub(self._ESCAPE_RE, "#", escaped_fmt)
# Any number of S is the same as one. # TODO: allow users to specify the number of digits to parse
escaped_fmt = re.sub(r"S+", "S", escaped_fmt)
escaped_data = re.findall(self._ESCAPE_RE, fmt)
fmt_pattern = escaped_fmt
for m in self._FORMAT_RE.finditer(escaped_fmt):
token: _FORMAT_TYPE = cast(_FORMAT_TYPE, m.group(0)) try:
input_re = self._input_re_map[token] except KeyError: raise ParserError(f"Unrecognized token {token!r}.")
input_pattern = f"(?P<{token}>{input_re.pattern})"
tokens.append(token) # a pattern doesn't have the same length as the token # it replaces! We keep the difference in the offset variable. # This works because the string is scanned left-to-right and matches # are returned in the order found by finditer.
fmt_pattern = (
fmt_pattern[: m.start() + offset]
+ input_pattern
+ fmt_pattern[m.end() + offset :]
)
offset += len(input_pattern) - (m.end() - m.start())
# Due to the way Python splits, 'split_fmt' will always be longer for i in range(len(split_fmt)):
final_fmt_pattern += split_fmt[i] if i < len(escaped_data):
final_fmt_pattern += escaped_data[i][1:-1]
# Wrap final_fmt_pattern in a custom word boundary to strictly # match the formatting pattern and filter out date and time formats # that include junk such as: blah1998-09-12 blah, blah 1998-09-12blah, # blah1998-09-12blah. The custom word boundary matches every character # that is not a whitespace character to allow for searching for a date # and time string in a natural language sentence. Therefore, searching # for a string of the form YYYY-MM-DD in "blah 1998-09-12 blah" will # work properly. # Certain punctuation before or after the target pattern such as # "1998-09-12," is permitted. For the full list of valid punctuation, # see the documentation.
starting_word_boundary = (
r"(? # Don't have two consecutive non-whitespace characters. This ensures that we allow cases # like .11.25.2019 but not 1.11.25.2019 (for pattern MM.DD.YYYY)
r"(?\'\`\[\]\{\}\(\)<>\s])" # This is the list of punctuation that is ok before the # pattern (i.e. "It can't not be these characters before the pattern")
r"(\b|^)" # The \b is to block cases like 1201912 but allow 201912 for pattern YYYYMM. The ^ was necessary to allow a # negative number through i.e. before epoch numbers
)
ending_word_boundary = (
r"(?=[\,\.\;\:\?\!\"\'\`\[\]\{\}\(\)\<\>]?" # Positive lookahead stating that these punctuation marks # can appear after the pattern at most 1 time
r"(?!\S))"# Don't allow any non-whitespace character after the punctuation
)
bounded_fmt_pattern = r"{}{}{}".format(
starting_word_boundary, final_fmt_pattern, ending_word_boundary
)
elif token == "YY":
value = int(value)
parts["year"] = 1900 + value if value > 68 else 2000 + value
elif token in ["MMMM", "MMM"]: # FIXME: month_number() is nullable
parts["month"] = self.locale.month_number(value.lower()) # type: ignore[typeddict-item]
elif token in ["MM", "M"]:
parts["month"] = int(value)
elif token in ["DDDD", "DDD"]:
parts["day_of_year"] = int(value)
elif token in ["DD", "D"]:
parts["day"] = int(value)
elif token == "Do":
parts["day"] = int(value)
elif token == "dddd": # locale day names are 1-indexed
day_of_week = [x.lower() for x in self.locale.day_names].index(
value.lower()
)
parts["day_of_week"] = day_of_week - 1
elif token == "ddd": # locale day abbreviations are 1-indexed
day_of_week = [x.lower() for x in self.locale.day_abbreviations].index(
value.lower()
)
parts["day_of_week"] = day_of_week - 1
elif token.upper() in ["HH", "H"]:
parts["hour"] = int(value)
elif token in ["mm", "m"]:
parts["minute"] = int(value)
elif token in ["ss", "s"]:
parts["second"] = int(value)
elif token == "S": # We have the *most significant* digits of an arbitrary-precision integer. # We want the six most significant digits as an integer, rounded. # IDEA: add nanosecond support somehow? Need datetime support for it first.
value = value.ljust(7, "0")
elif token in ["ZZZ", "ZZ", "Z"]:
parts["tzinfo"] = TzinfoParser.parse(value)
elif token in ["a", "A"]: if value in (self.locale.meridians["am"], self.locale.meridians["AM"]):
parts["am_pm"] = "am" if"hour"in parts andnot 0 <= parts["hour"] <= 12: raise ParserMatchError(
f"Hour token value must be between 0 and 12 inclusive for token {token!r}."
) elif value in (self.locale.meridians["pm"], self.locale.meridians["PM"]):
parts["am_pm"] = "pm" elif token == "W":
parts["weekdate"] = value
if expanded_timestamp isnotNone: return datetime.fromtimestamp(
normalize_timestamp(expanded_timestamp),
tz=tz.tzutc(),
)
day_of_year = parts.get("day_of_year")
if day_of_year isnotNone:
_year = parts.get("year")
month = parts.get("month") if _year isNone: raise ParserError( "Year component is required with the DDD and DDDD tokens."
)
if month isnotNone: raise ParserError( "Month component is not allowed with the DDD and DDDD tokens."
)
date_string = f"{_year}-{day_of_year}" try:
dt = datetime.strptime(date_string, "%Y-%j") except ValueError: raise ParserError(
f"The provided day of year {day_of_year!r} is invalid."
)
day_of_week: Optional[int] = parts.get("day_of_week")
day = parts.get("day")
# If day is passed, ignore day of week if day_of_week isnotNoneand day isNone:
year = parts.get("year", 1970)
month = parts.get("month", 1)
day = 1
# dddd => first day of week after epoch # dddd YYYY => first day of week in specified year # dddd MM YYYY => first day of week in specified year and month # dddd MM => first day after epoch in specified month
next_weekday_dt = next_weekday(datetime(year, month, day), day_of_week)
parts["year"] = next_weekday_dt.year
parts["month"] = next_weekday_dt.month
parts["day"] = next_weekday_dt.day
# Support for midnight at the end of day if hour == 24: if parts.get("minute", 0) != 0: raise ParserError("Midnight at the end of day must not contain minutes") if parts.get("second", 0) != 0: raise ParserError("Midnight at the end of day must not contain seconds") if parts.get("microsecond", 0) != 0: raise ParserError( "Midnight at the end of day must not contain microseconds"
)
hour = 0
day_increment = 1 else:
day_increment = 0
# account for rounding up to 1000000
microsecond = parts.get("microsecond", 0) if microsecond == 1000000:
microsecond = 0
second_increment = 1 else:
second_increment = 0
for fmt in formats: try:
_datetime = self.parse(string, fmt) break except ParserMatchError: pass
if _datetime isNone:
supported_formats = ", ".join(formats) raise ParserError(
f"Could not match input {string!r} to any of the following formats: {supported_formats}."
)
return _datetime
# generates a capture group of choices separated by an OR operator
@staticmethod def _generate_choice_re(
choices: Iterable[str], flags: Union[int, re.RegexFlag] = 0
) -> Pattern[str]: return re.compile(r"({})".format("|".join(choices)), flags=flags)
class TzinfoParser:
_TZINFO_RE: ClassVar[Pattern[str]] = re.compile(
r"^(?:\(UTC)*([\+\-])?(\d{2})(?:\:?(\d{2}))?"
)
Die Informationen auf dieser Webseite wurden
nach bestem Wissen sorgfältig zusammengestellt. Es wird jedoch weder Vollständigkeit noch Richtigkeit
noch Qualität der bereitgestellten Informationen zugesichert.
Bemerkung:
Die farbliche Syntaxdarstellung ist noch experimentell.