我是一个比较新的开发人员,遇到了一个问题:我想在字符串中搜索一种特定的模式,该模式在文本文件某一行的特定位置上可能包含一个数字。
我目前正在编写的方法会从跟踪文件中读取一行,并需要根据一个已知模式的字典来查找匹配项。
我听一些真正从事 Python 开发的朋友说,正则表达式(regex)执行这类操作其实相对缓慢、低效。但鉴于我这个问题的性质,我想不出不用正则表达式的办法。
我正在开发的程序将运行近10000个文本文件,每个文件有200k+行。
有谁能想到更快的方法吗?
def firm_parser(line):
    """Print a description for each known request pattern found in ``line``.

    The lookup table pairs a regular-expression search result with a
    human-readable description. For every entry whose pattern occurs anywhere
    in ``line``, the description followed by the line itself is printed.
    Each kind of request is reported at most once per line, in table order.

    Args:
        line: One line of text to scan.

    Returns:
        None. Matches are reported on standard output.
    """
    RgxFrm_PX = {
        "CycleCount": {
            "prtn": re.search(r"P\d{1}RV", line),
            "desc": "Request cycle counter",
            "deet": "",
        },
        "LastAdjustDate": {
            "prtn": re.search(r"P\d{1}RJ", line),
            "desc": "Request adjustment date and status",
            "deet": "",
        },
        "ChanTemp": {
            "prtn": re.search(r"P\d{1}RM", line),
            "desc": "Request pipetting channel temperature",
            "deet": "",
        },
    }
    # Iterate the entries, not the dict itself: iterating a dict yields only
    # its keys, which cannot be unpacked into (key, value).
    for value in RgxFrm_PX.values():
        if value["prtn"]:
            # Single quotes inside the f-string keep this valid on Python
            # versions older than 3.12.
            print(f"{value['desc']} {line}")
哇,我试了一下,没想到regex和no-regex的方法有很大的区别,当真是让我大吃一惊!
这是我的函数版本,没有regex。
def firm_parserV2(line):
    """Report known requests in ``line`` without using regular expressions.

    Scans for every ``P`` that is immediately followed by one decimal digit
    and then a two-letter code. For each occurrence whose code is ``RV``,
    ``RJ`` or ``RM``, the matching description and the line are printed.
    Occurrences with any other code are ignored.

    Args:
        line: One line of text to scan.

    Returns:
        None. Matches are reported on standard output.
    """
    messages = {
        'RV': "Request cycle counter",
        'RJ': "Request adjustment date and status",
        'RM': "Request pipetting channel temperature",
    }
    # Advance with find(start) instead of re-slicing the line, so no copy of
    # the remaining text is made for every 'P' encountered.
    pos = line.find('P')
    while pos >= 0:
        # Slicing (rather than indexing) yields '' past the end of the line,
        # so a trailing 'P' is skipped instead of raising IndexError.
        # isdecimal() matches what the regex class \d accepts; isnumeric()
        # also accepts characters such as superscripts.
        if line[pos + 1:pos + 2].isdecimal():
            message = messages.get(line[pos + 2:pos + 4])
            if message is not None:
                print(message, line)
        pos = line.find('P', pos + 1)
我用一个很短的行(P4RJasd)比较了三种实现的运行时间,下面是结果。
+------------------------------+------------------------+
| Function | Time |
+------------------------------+------------------------+
| Original | .003547472953796386 ms |
+------------------------------+------------------------+
| Original with rx compilation | .002606389522552490 ms |
| outside the function | |
+------------------------------+------------------------+
| New version | .000612576007843017 ms |
+------------------------------+------------------------+
这是我用来比较这三个函数的完整代码。
import re
import re
import time
import random
def firm_parser(line):
    """Search ``line`` for the three request patterns, compiling on each call.

    This is the variant that builds the whole lookup table, including the
    ``re.compile`` calls, every time it is invoked. Matches are detected but
    nothing is printed; the print is disabled so that only the search work
    remains.

    Args:
        line: One line of text to scan.

    Returns:
        None.
    """
    RgxFrm_PX = {
        "CycleCount": {
            # Compiling inside the function is deliberate here: it is the
            # cost this variant exists to show.
            "prtn": re.search(re.compile(r"P\d{1}RV"), line),
            "desc": "Request cycle counter",
            "deet": "",
        },
        "LastAdjustDate": {
            "prtn": re.search(re.compile(r"P\d{1}RJ"), line),
            "desc": "Request adjustment date and status",
            "deet": "",
        },
        "ChanTemp": {
            "prtn": re.search(re.compile(r"P\d{1}RM"), line),
            "desc": "Request pipetting channel temperature",
            "deet": "",
        },
    }
    # Only the entries are needed, so iterate values() rather than items().
    for value in RgxFrm_PX.values():
        if value["prtn"]:
            # Output intentionally suppressed.
            pass
# Patterns compiled once at import time, so each call to firm_parser_no_rx
# pays only for the searches themselves.
rx_rv = re.compile(r"P\dRV")
rx_rj = re.compile(r"P\dRJ")
rx_rm = re.compile(r"P\dRM")


def firm_parser_no_rx(line):
    """Search ``line`` for the three request patterns using precompiled regexes.

    Same lookup table as the per-call-compiling variant, except that the
    module-level compiled patterns are searched directly. Matches are
    detected but nothing is printed.

    Args:
        line: One line of text to scan.

    Returns:
        None.
    """
    RgxFrm_PX = {
        "CycleCount": {
            # Calling the compiled pattern's own search() avoids the extra
            # dispatch through the module-level re.search().
            "prtn": rx_rv.search(line),
            "desc": "Request cycle counter",
            "deet": "",
        },
        "LastAdjustDate": {
            "prtn": rx_rj.search(line),
            "desc": "Request adjustment date and status",
            "deet": "",
        },
        "ChanTemp": {
            "prtn": rx_rm.search(line),
            "desc": "Request pipetting channel temperature",
            "deet": "",
        },
    }
    # Only the entries are needed, so iterate values() rather than items().
    for value in RgxFrm_PX.values():
        if value["prtn"]:
            # Output intentionally suppressed.
            pass
def firm_parserV2(line):
    """Look up known requests in ``line`` without using regular expressions.

    Scans for every ``P`` that is immediately followed by one decimal digit
    and then a two-letter code, and resolves the code (``RV``, ``RJ`` or
    ``RM``) to its description. Nothing is printed; the print is disabled so
    that only the search work remains.

    Args:
        line: One line of text to scan.

    Returns:
        None.
    """
    messages = {
        'RV': "Request cycle counter",
        'RJ': "Request adjustment date and status",
        'RM': "Request pipetting channel temperature",
    }
    # Advance with find(start) instead of re-slicing the line, so no copy of
    # the remaining text is made for every 'P' encountered.
    pos = line.find('P')
    while pos >= 0:
        # Slicing yields '' past the end of the line, so a trailing 'P' is
        # skipped instead of raising IndexError. isdecimal() matches what the
        # regex class \d accepts; isnumeric() also accepts characters such as
        # superscripts, which int() cannot convert.
        if line[pos + 1:pos + 2].isdecimal():
            message = messages.get(line[pos + 2:pos + 4])
            if message is not None:
                # Output intentionally suppressed.
                pass
        pos = line.find('P', pos + 1)
# Micro-benchmark driver: call each parser implementation loop_nb times on
# the same short line and print the mean wall-clock time per call.
loop_nb = 100000
test_string = 'P4RJasd'
funcs = [
    firm_parser,
    firm_parser_no_rx,
    firm_parserV2,
]
# Accumulated elapsed seconds, keyed by function name.
times = {func.__name__: 0 for func in funcs}
for _ in range(loop_nb):
    # Shuffle the call order every round (presumably so that no
    # implementation is always measured first or last).
    random.shuffle(funcs)
    for func in funcs:
        # perf_counter() is the clock intended for measuring short intervals;
        # time.time() can have coarse resolution and may jump.
        start = time.perf_counter()
        func(test_string)
        end = time.perf_counter()
        times[func.__name__] += end - start
# Loop variables are named so that the `time` module is not rebound.
for name, total in times.items():
    print(name + '\t', total / loop_nb)