我正在寻找路径与给定格式匹配的所有文件(例如
%Y/%j/data_%d.txt
)并检索关联的日期。
一个简单的解决方案是使用 strptime,但如果“格式代码”在格式中出现两次(例如
%Y/%j/data_%Y%m%d.txt
),则此方法不起作用。这是与 re 使用相关的 strptime 限制,该限制早已记录在案 (https://github.com/python/cpython/issues/48680)。
你知道我该如何处理这个问题吗?
我长期以来一直使用下面这段(有点复杂的)代码来处理同一个“格式代码”出现两次、但分别位于目录树的两个不同层级(文件夹、文件...)中的情况:
def get_path(path_format: List[str], path: str = "") -> List[str]:
    """Return every existing path that matches a strptime-style path pattern.

    The pattern is consumed one component at a time. A component without
    ``%`` must exist literally below ``path``; a component containing ``%``
    is matched against every entry of the current directory with
    ``datetime.datetime.strptime``.

    :param path_format: Path components to match, ex:
        ['test', '%Y', '%Y%m', 'data_%Y%m%d.txt']
    :param path: Directory to explore; an empty string means the current
        working directory.
    :return: List of all matching paths, in ``os.listdir`` order.
    """
    if not path_format:
        return [path]
    current_format, remaining = path_format[0], path_format[1:]

    if '%' not in current_format:
        candidate = os.path.join(path, current_format)
        if os.path.exists(candidate):
            return get_path(remaining, candidate)
        return []

    # os.listdir("") raises FileNotFoundError, so an empty path has to be
    # spelled as the current directory explicitly.
    directory = path or os.curdir
    # A plain file can match an intermediate component (e.g. a file named
    # "2017" for "%Y"); it has no children, so it cannot match the rest.
    if not os.path.isdir(directory):
        return []

    out = []
    for content in os.listdir(directory):
        try:
            datetime.datetime.strptime(content, current_format)
        except ValueError:
            continue
        out += get_path(remaining, os.path.join(path, content))
    return out
$ tree data/2017/
data/2017/
├── 001
│ └── data_20170101.txt
└── 100
└── data_20170407.txt
>>> p = 'data/%Y/%j/data_%Y%m%d.txt'
>>> p = p.split('/')
>>> get_path(p)
['data/2017/100/data_20170407.txt', 'data/2017/001/data_20170101.txt']
但是,我现在需要处理一批新数据,其中同一个“格式代码”会在同一个文件夹名或文件名中出现两次(例如:
data/%Y/%j/data_%Y%m%dT0000_%Y%m%dT9000.txt
)。
编辑:我在周末找到了一种解决方法,该解决方案并不理想,但会完成工作,直到我找到更好的解决方案。我只是使用分隔符将文件名分成多个部分(在我的例子中为
_
)。这是代码:
def merge_two_dates(d1, d2):
    """Combine two datetimes by keeping, for each field, the larger value.

    The result is built with ``d1.replace``, so any attribute of ``d1`` that
    is not listed below is carried over unchanged.

    :param d1: First datetime; also the object ``replace`` is called on.
    :param d2: Second datetime.
    :return: A datetime whose year, month, day, hour, minute, second and
        microsecond are each the maximum of the two inputs.
    """
    field_names = ("year", "month", "day", "hour", "minute", "second", "microsecond")
    largest = {name: max(getattr(d1, name), getattr(d2, name)) for name in field_names}
    return d1.replace(**largest)
def merge_dates(dates):
    """Fold an iterable of datetimes into one with ``merge_two_dates``.

    Starts from ``datetime.datetime.min`` and merges each element in turn.

    :param dates: Iterable of datetimes to merge.
    :return: The merged datetime; ``datetime.datetime.min`` when ``dates``
        is empty.
    """
    merged = datetime.datetime.min
    for current in dates:
        merged = merge_two_dates(current, merged)
    return merged
def dates_from_path_rec(path_format: List[str], split_c: str = "", path: str = "") -> List[datetime.datetime]:
    """Return the date of every existing path matching a strptime-style pattern.

    The pattern is consumed one component at a time. A component without
    ``%`` must exist literally below ``path``. A component containing ``%``
    is matched against every entry of the current directory; when
    ``split_c`` is given, both the component and the entry are split on it
    and each piece is parsed separately, so the same format code may appear
    once per piece. The dates parsed from all pieces and all levels are
    combined with ``merge_dates`` / ``merge_two_dates``.

    :param path_format: Path components to match, ex:
        ['test', '%Y', '%Y%m', 'data_%Y%m%d.txt']
    :param split_c: If two dates are in the same directory / file name,
        split_c will be used to split this name.
    :param path: Directory to explore; an empty string means the current
        working directory.
    :return: List of dates, one per matching path.
    """
    if not path_format:
        # Every field at its minimum, so merging it with another date
        # leaves that date's fields in place.
        return [datetime.datetime.min]
    current_format, remaining = path_format[0], path_format[1:]

    if '%' not in current_format:
        candidate = os.path.join(path, current_format)
        if os.path.exists(candidate):
            return dates_from_path_rec(remaining, split_c, candidate)
        return []

    # os.listdir("") raises FileNotFoundError, so an empty path has to be
    # spelled as the current directory explicitly.
    directory = path or os.curdir
    # A plain file that matched the previous component has no children.
    if not os.path.isdir(directory):
        return []

    split_format = current_format.split(split_c) if split_c else [current_format]
    out = []
    for content in os.listdir(directory):
        split_content = content.split(split_c) if split_c else [content]
        if len(split_format) != len(split_content):
            continue
        try:
            dates = [datetime.datetime.strptime(sc, sf)
                     for sc, sf in zip(split_content, split_format) if '%' in sf]
            # Merging field-wise can produce an impossible date (e.g.
            # month 2 with day 31); treat that as "no match" as well.
            date = merge_dates(dates)
        except ValueError:
            continue
        # Round-trip check: rejects entries whose pieces disagree with each
        # other or whose literal text differs from the format.
        if date.strftime(current_format) != content:
            continue
        for d in dates_from_path_rec(remaining, split_c, os.path.join(path, content)):
            try:
                out.append(merge_two_dates(date, d))
            except ValueError:
                # Fields from different levels do not form a valid date.
                continue
    return out
我终于找到了解决问题的方法。第三方 regex 包扩展了标准库 re 的功能,特别是能够处理同一个
group name
被重复定义的情况。通过复制 Python 标准库中的 _strptime.py 文件并加以修改,我得到了一个能够接受含重复组名的格式的函数。
我的自定义
strptime.py
文件(_strptime.py的副本):
from regex import compile as regex_compile
...
class TimeRE(dict):
    # Excerpt only: the rest of the class body is elided here.
    # NOTE(review): presumably identical to TimeRE in CPython's _strptime.py,
    # which this file is described as a copy of -- confirm.
    ...
    def compile(self, format):
        """Return a compiled ``regex`` pattern object for the format string.

        The pattern built by ``self.pattern(format)`` is compiled
        case-insensitively with ``regex_compile`` (the third-party ``regex``
        package's ``compile``) rather than ``re.compile``.
        """
        return regex_compile(self.pattern(format), IGNORECASE)
...
def str_to_time(data_string, format="%a %b %d %H:%M:%S %Y"):
    """Parse ``data_string`` with ``format`` and return a ``time.struct_time``.

    Only the first element of the ``_strptime`` result (the tuple of time
    fields) is used, truncated to the number of items a ``struct_time``
    holds.
    """
    parsed = _strptime(data_string, format)
    time_fields = parsed[0]
    item_count = time._STRUCT_TM_ITEMS
    return time.struct_time(time_fields[:item_count])
def str_to_cls(cls, data_string, format="%a %b %d %H:%M:%S %Y"):
    """Parse ``data_string`` with ``format`` and build a ``cls`` instance.

    ``cls`` is called with the first six time fields followed by the
    fractional part; when the parsed string carried a UTC offset, a
    timezone built from that offset (and the zone name, if any) is passed
    as one more positional argument.
    """
    fields, fraction, gmtoff_fraction = _strptime(data_string, format)
    tzname, gmtoff = fields[-2:]
    ctor_args = fields[:6] + (fraction,)

    # No offset parsed: build a naive instance.
    if gmtoff is None:
        return cls(*ctor_args)

    offset = datetime_timedelta(seconds=gmtoff, microseconds=gmtoff_fraction)
    tz = datetime_timezone(offset, tzname) if tzname else datetime_timezone(offset)
    return cls(*ctor_args, tz)
然后我可以使用这些函数来重写我的函数
dates_from_path()
:
import strptime
def matching_paths(path_format: List[str], path: str = "") -> List[str]:
    """Return every existing path that matches a strptime-style path pattern.

    The pattern is consumed one component at a time. A component without
    ``%`` must exist literally below ``path``; a component containing ``%``
    is matched against every entry of the current directory with
    ``strptime.str_to_time``.

    :param path_format: Path components to match, ex:
        ['test', '%Y', '%Y%m', 'data_%Y%m%d.txt']
    :param path: Directory to explore; an empty string means the current
        working directory.
    :return: List of all matching paths, in ``os.listdir`` order.
    """
    if not path_format:
        return [path]
    current_format, remaining = path_format[0], path_format[1:]

    if '%' not in current_format:
        candidate = os.path.join(path, current_format)
        if os.path.exists(candidate):
            return matching_paths(remaining, candidate)
        return []

    # os.listdir("") raises FileNotFoundError, so an empty path has to be
    # spelled as the current directory explicitly.
    directory = path or os.curdir
    # A plain file can match an intermediate component (e.g. a file named
    # "2017" for "%Y"); it has no children, so it cannot match the rest.
    if not os.path.isdir(directory):
        return []

    out = []
    for content in os.listdir(directory):
        try:
            strptime.str_to_time(content, current_format)
        except ValueError:
            continue
        out += matching_paths(remaining, os.path.join(path, content))
    return out
def dates_from_path(fmt: str) -> Set[datetime.datetime]:
    """Return the dates of all existing files whose path matches ``fmt``.

    :param fmt: Path pattern to explore, relative or absolute; can contain
        %Y, %m, %d ...
    :return: Set of distinct dates parsed from the matching paths.
    """
    fmt = os.path.normpath(fmt)

    # Peel the drive/root off before splitting: splitting an absolute path
    # on os.sep yields an empty leading component, which matching_paths
    # joins to "" and then rejects as non-existent, so absolute patterns
    # would never match anything.
    drive, tail = os.path.splitdrive(fmt)
    relative = tail.lstrip(os.sep)
    root = drive + tail[:len(tail) - len(relative)]
    paths = matching_paths(relative.split(os.sep), root)

    # A set removes the redundancy when several paths yield the same date.
    parsed = {strptime.str_to_cls(datetime.datetime, p, fmt) for p in paths}

    # Keep only dates that format back to an existing path, guarding against
    # inconsistencies introduced while matching component by component.
    return {date for date in parsed if os.path.exists(date.strftime(fmt))}