import os
import re
from collections import Counter
from collections import OrderedDict
# Module-level scratch names kept from the original script.
fileNames = []
textInfo = []
# Client IP -> status code of the most recent matched line for that client.
d = {}

# Matches one access-log line and captures the client IP, the date part of the
# timestamp, the HTTP method and the status code.  Every fragment is a raw
# string so that `\d` and `\s` reach the regex engine unchanged.
# NOTE(review): `{3,4}` truncates methods longer than four letters (e.g.
# DELETE, OPTIONS) to their first four characters.
regexp = re.compile(
    r'(?P<clientIP>\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3}).+\['
    r'(?P<timestamp>\d{2}/[A-Z][a-z]{2}/\d\d\d\d).+"'
    r'(?P<action>[A-Z]{3,4}).+"'
    r'\s*(?P<statuscode>[1-5][0-9][0-9])'
)


def top_clients_for_status(clients_by_status, status_code, limit):
    """Return the most frequent clients for one status code.

    Args:
        clients_by_status: Mapping of status code (string) to a mapping of
            client IP -> number of matched lines with that status code.
        status_code: Status code to look up, as a string such as "404".
        limit: Maximum number of (clientIP, count) pairs to return.

    Returns:
        A list of (clientIP, count) pairs, most frequent first.  The list is
        empty when the status code never occurred.
    """
    return Counter(clients_by_status.get(status_code, {})).most_common(limit)


# The guard sits at module level (not inside a function) so that every name
# assigned below is still a global when the file is run as a script or pasted
# into a notebook cell.
if __name__ == "__main__":
    currentDirectoryPath = os.getcwd()
    print(currentDirectoryPath)

    os.chdir("/content/drive/log")
    currentDirectoryPath = os.getcwd()
    listOfFileNames = os.listdir(currentDirectoryPath)

    matched = 0
    failed = 0
    cnt_clientIPs = Counter()
    cnt_clientAction = Counter()
    cnt_clientTimeStamp = Counter()
    cnt_clientStatusCode = Counter()
    # Status code -> Counter of client IPs, filled in the same pass so the
    # per-status report does not need to read the file a second time.
    cnt_clientIPsByStatus = {}

    # A file object can only be iterated once, so everything that depends on
    # the log lines is collected in this single loop.
    with open('access_1.log', 'r') as f:
        for line in f:
            m = regexp.match(line)
            if not m:
                failed += 1
                continue

            client_ip = m.group('clientIP')
            timestamp = m.group('timestamp')
            action = m.group('action')
            statuscode = m.group('statuscode')

            cnt_clientIPs[client_ip] += 1
            cnt_clientAction[action] += 1
            cnt_clientTimeStamp[timestamp] += 1
            cnt_clientStatusCode[statuscode] += 1
            cnt_clientIPsByStatus.setdefault(statuscode, Counter())[client_ip] += 1
            d[client_ip] = statuscode
            matched += 1

            print("""\
client .........: %s
timestamp ......: %s
action .........: %s
statuscode.........: %s
""" % (client_ip, timestamp, action, statuscode))

    print(d)

    userInputIP = input("Enter how many of the top clients you want to see. ")
    print('[*] %d lines matched the regular expression' % (matched))
    print('[*] %d lines failed to match the regular expression' % (failed), end='\n\n')
    print('[*] ============================================')
    print('[*] '+ userInputIP +' Most Frequently Occurring Clients Queried')
    print('[*] ============================================')
    for clientIP, count in cnt_clientIPs.most_common(int(userInputIP)):
        print('[*] %30s: %d' % (clientIP, count))
    print('[*] ============================================')

    userInputAction = input("Enter how many of the top actions you want to see. ")
    print('[*] '+ userInputAction +' Most Frequently Occurring Clients Actions')
    print('[*] ============================================')
    for action, count in cnt_clientAction.most_common(int(userInputAction)):
        print('[*] %30s: %d' % (action, count))
    print('[*] ============================================')

    userInputIpPlusStatus = input("Enter how many of the top clients you want to see. and there status code ")
    # Status codes are captured as strings, so the answer is compared as text.
    userStatuscodeInput = input("Enter the status code you are interested in. ").strip()
    print('[*] '+ userInputIpPlusStatus +' Most Frequently Occurring Clients IP and Status Code')
    print('[*] ============================================')
    for clientIP, count in top_clients_for_status(
            cnt_clientIPsByStatus, userStatuscodeInput, int(userInputIpPlusStatus)):
        print('[*] %30s: %d: %5s:' % (clientIP, count, userStatuscodeInput))
    print('[*] ============================================')
Enter how many of the top clients you want to see.5
[*] 49997 lines matched the regular expression
[*] 3 lines failed to match the regular expression
[*] ============================================
[*] 5 Most Frequently Occurring Clients Queried
[*] ============================================
[*] 205.167.170.15: 15695
[*] 79.142.95.122: 3207
[*] 52.22.118.215: 734
[*] 84.112.161.41: 712
[*] 37.1.206.196: 371
[*] ============================================
Enter how many of the top actions you want to see.5
[*] 5 Most Frequently Occurring Clients Actions
[*] ============================================
[*] GET: 44048
[*] POST: 5921
[*] HEAD: 25
[*] PUT: 3
[*] ============================================
{}
我想打印出状态码为 404 的前 5 个客户端 IP,或者由用户指定任意状态码。下面附了一些测试行供参考。另外,能否限定在某个时间范围内统计?例如,打印一月到二月期间状态码为 404 的最常见客户端 IP。
# Excerpt of the final report loop of the script: walks the most frequent
# client IPs and is meant to print only those with a chosen status code.
# NOTE(review): `stuatuscode` and `userStatuscodeInput` are not assigned
# anywhere in the visible script; as written this comparison would raise
# NameError.
for clientIP, count in cnt_clientIPs.most_common(int(userInputIpPlusStatus)):
if (stuatuscode == userStatuscodeInput):
# NOTE(review): `m` is not rebound inside this loop, so the status printed
# here comes from whatever match object `m` last referred to, not from
# `clientIP`.
print('[*] %30s: %d: %5s:' % (clientIP, count, m.group('statuscode')))
print('[*] ============================================')
我说的就是这里的这部分。试图在这里添加条件。
80.110.186.51 - - [21/Dec/2015:17:20:12 +0100] "GET /images/stories/raith/oststeiermark.png HTTP/1.1" 200 65225 "http://www.almhuette-raith.at/" "Mozilla/5.0 (iPad; CPU OS 9_1 like Mac OS X) AppleWebKit/601.1.46 (KHTML, like Gecko) Version/9.0 Mobile/13B143 Safari/601.1" "-"
80.110.186.51 - - [21/Dec/2015:17:20:12 +0100] "GET /images/stories/raith/garage.jpg HTTP/1.1" 200 57339 "http://www.almhuette-raith.at/" "Mozilla/5.0 (iPad; CPU OS 9_1 like Mac OS X) AppleWebKit/601.1.46 (KHTML, like Gecko) Version/9.0 Mobile/13B143 Safari/601.1" "-"
80.110.186.51 - - [21/Dec/2015:17:20:12 +0100] "GET /images/stories/slideshow/almhuette_raith_03.jpg HTTP/1.1" 200 87782 "http://www.almhuette-raith.at/" "Mozilla/5.0 (iPad; CPU OS 9_1 like Mac OS X) AppleWebKit/601.1.46 (KHTML, like Gecko) Version/9.0 Mobile/13B143 Safari/601.1" "-"
80.110.186.51 - - [21/Dec/2015:17:20:12 +0100] "GET /images/stories/raith/steiermark_herz.png HTTP/1.1" 200 39683 "http://www.almhuette-raith.at/" "Mozilla/5.0 (iPad; CPU OS 9_1 like Mac OS X) AppleWebKit/601.1.46 (KHTML, like Gecko) Version/9.0 Mobile/13B143 Safari/601.1" "-"
上面这些行是一些测试行,以帮助你们,并显示我在文本文件中处理的情况。
你的 cnt_clientStatusCode 计数器应该统计由 (IP, 状态码) 对组成的元组。
# Single pass over the log: tally clients, actions and
# (client IP, status code) pairs for every line the pattern recognises.
for line in f:
    m = re.match(regexp, line)
    if m is None:
        # Not an access-log entry the pattern understands.
        failed += 1
        continue
    client_ip = m.group('clientIP')
    statuscode = m.group('statuscode')
    # The tuple is the Counter key, so each ip / status code combination is
    # counted separately.
    client_statuscode = (client_ip, statuscode)
    cnt_clientIPs[client_ip] += 1
    cnt_clientAction[m.group('action')] += 1
    cnt_clientStatusCode[client_statuscode] += 1
    matched += 1
然后你可以列出最常见的 n 个组合,其中 n = int(userInputIpPlusStatus):
因此,您可以提出一个额外的问题,询问用户感兴趣的状态码是什么,并只打印带有该状态码的项目。
# Print the most frequent (client IP, status code) pairs; how many is taken
# from the user's earlier answer.
top_n = int(userInputIpPlusStatus)
for pair, count in cnt_clientStatusCode.most_common(top_n):
    clientIP, statusCode = pair
    print('[*] %30s: %d: %5s:' % (clientIP, count, statusCode))
print('[*] ============================================')
当然,你也可以额外询问用户对哪个状态码感兴趣,然后只打印带有该状态码的项目。实现逻辑如下:
# Ask for one status code and print the n clients that produced it most often.
wanted_status_code = input("What status code are you interested in: ")
userInputIpPlusStatus = input("Enter how many of the top clients do you want to see for this status code: ")
n = int(userInputIpPlusStatus)

# `shown` tracks how many rows were printed.  It must be a different name from
# the loop's `count`, which holds the number of hits for the current pair.
shown = 0
# most_common() yields pairs in descending hit order, so the first n pairs
# with the wanted status code are the top n clients for it.
for (clientIP, statusCode), count in cnt_clientStatusCode.most_common():
    if shown >= n:
        break
    if statusCode == wanted_status_code:
        print('[*] %30s: %d: %5s:' % (clientIP, count, statusCode))
        shown += 1
print('[*] ============================================')
更新
如果你想让按状态码查询的效率更高,可以使用一个字典:键是状态码,值是统计各客户端 IP 出现次数的 Counter。
from collections import defaultdict

# Status code -> Counter of client IPs.  A missing status code gets an empty
# Counter on first access, so no explicit initialisation is needed.
status_dict = defaultdict(Counter)

# Iterate the open file object directly; it yields one log line at a time.
for line in f:
    m = re.match(regexp, line)
    if not m:
        failed += 1
        continue
    client_ip = m.group('clientIP')
    statuscode = m.group('statuscode')
    client_statuscode = (client_ip, statuscode)  # ip / status code combination
    cnt_clientIPs.update([client_ip])
    cnt_clientAction.update([m.group('action')])
    cnt_clientStatusCode.update([client_statuscode])
    status_dict[statuscode].update([client_ip])
    matched += 1
然后:
# Ask which status code to report on and how many clients to list for it.
wanted_status_code = input("What status code are you interested in: ")
userInputIpPlusStatus = input("Enter how many of the top clients do you want to see for this status code: ")

# .get() with an empty Counter as fallback keeps an unseen status code from
# being inserted into status_dict; the loop below then prints nothing.
clients_for_status = status_dict.get(wanted_status_code, Counter())
for clientIP, count in clients_for_status.most_common(int(userInputIpPlusStatus)):
    print('[*] %30s: %d: %5s:' % (clientIP, count, wanted_status_code))
print('[*] ============================================')