2016-12-14 8 views
1

pyparsing 모듈을 사용하여 여러 형식의 로그가 섞여 있는 로그 파일을 구문 분석하려면 어떻게 해야 합니까? 다음은 제가 작업하고 있는 코드입니다. (질문 제목: PyParsing을 사용하여 Syslog 구문 분석)

# -*- coding: utf-8 -*- 
""" 

""" 

import pandas as pd 

from pyparsing import Word, alphas, Suppress, Combine, nums, string, Regex 

from time import strftime 

class Parser(object):
    """Parse BSD-style syslog lines into flat dictionaries.

    Three header shapes are accepted after the timestamp and host name:
    ``app[pid]: message``, ``app: message`` and ``: message`` (no
    application name at all, e.g. "last message repeated 11 times").
    """

    def __init__(self):
        """Build the pyparsing grammar used by :meth:`parse`."""
        # Imported locally because the module-level pyparsing import does
        # not bring ``Optional`` into scope.
        from pyparsing import Optional

        ints = Word(nums)

        # priority (not present in these logs, kept for reference)
        # priority = Suppress("<") + ints + Suppress(">")

        # timestamp: three-letter month, day of month, HH:MM:SS
        month = Word(string.ascii_uppercase, string.ascii_lowercase, exact=3)
        day = ints
        hour = Combine(ints + ":" + ints + ":" + ints)
        timestamp = month + day + hour

        # hostname
        hostname = Word(alphas + nums + "_-.")

        # appname, optionally followed by a bracketed pid. The trailing ':'
        # is deliberately NOT part of appname so that it can be consumed
        # uniformly whether or not an appname/pid is present.
        pid = Suppress("[") + ints("pid") + Suppress("]")
        appname = Word(alphas + "/-_.()")("appname") + Optional(pid)

        # message: everything up to the end of the line
        message = Regex(".*")

        # pattern build -- results names replace fragile positional indexes,
        # which shifted whenever the optional pid was missing.
        self.__pattern = (
            timestamp
            + hostname("hostname")
            + Optional(appname)
            + Suppress(":")
            + message("message")
        )

    def parse(self, line):
        """Parse one syslog line.

        Returns a dict with the keys ``timestamp``, ``hostname``,
        ``appname``, ``pid`` and ``message``. ``appname`` and ``pid`` are
        empty strings when the line does not contain them.

        Raises pyparsing.ParseException when the line does not match the
        grammar at all.
        """
        parsed = self.__pattern.parseString(line)

        return {
            # NOTE: as in the original code, this is the time at which the
            # line is parsed, not the time recorded in the log line; the
            # parsed month/day/time tokens are not used here.
            "timestamp": strftime("%Y-%m-%d %H:%M:%S"),
            "hostname": parsed["hostname"],
            "appname": parsed.get("appname", ""),
            "pid": parsed.get("pid", ""),
            "message": parsed.get("message", ""),
        }


def main():
    """Parse every line of ``./messages.log`` and return the results.

    Returns a list holding the value of ``Parser.parse`` for each line of
    the file, in file order.
    """
    syslog_parser = Parser()

    with open('./messages.log') as log_handle:
        return [syslog_parser.parse(entry) for entry in log_handle]


# Script entry point: run main() only when this file is executed directly,
# not when it is imported. The list returned by main() is discarded here.
if __name__ == "__main__": 

    main() 

구문 분석해야 하는 여러 형식의 로그 샘플은 다음과 같습니다:

Mar 7 04:02:16 avas clamd[11165]: /var/amavis/amavis-20040307T033734-10329/parts/part-00003: Worm.Mydoom.F FOUND 
Mar 7 04:05:55 avas clamd[11240]: /var/amavis/amavis-20040307T035901-10615/parts/part-00002: Worm.SomeFool.Gen-1 FOUND 
Mar 7 09:00:51 avas clamd[27173]: SelfCheck: Database status OK. 
Mar 7 05:59:02 avas clamd[27173]: Database correctly reloaded (20400 viruses) 
Mar 7 11:14:35 avas dccd[13284]: 21 requests/sec are too many from anonymous 205.201.1.56,2246 
Mar 8 00:22:57 avas dccifd[9933]: write(MTA socket,4): Broken pipe 
Mar 7 21:23:22 avas dccifd[6191]: missing message body 
Mar 9 16:05:17 avas named[12045]: zone PLNet/IN: refresh: non-authoritative answer from master 10.0.0.253#53 
Mar 10 00:38:16 avas dccifd[23069]: continue not asking DCC 17 seconds after failure 
Mar 10 09:42:11 avas named: client 127.0.0.1#55524: query: 23.68.27.142.sa-trusted.bondedsender.org IN TXT 
Mar 9 03:48:07 avas dccd[145]: automatic dbclean; starting `dbclean -DPq -i 1189 -L info,local5.notice -L error,local5.err` 
Mar 9 11:58:18 avas kernel: i810_audio: Connection 0 with codec id 2 
Mar 9 19:41:13 avas dccd[3004]: "packet length 44 too small for REPORT" sent to client 1 at 194.63.250.215,47577 
Mar 8 09:01:07 avas sshd(pam_unix)[21839]: session opened for user tom by (uid=35567) 
Mar 8 03:52:04 avas dccd[13284]: 1.2.32 database /home/dcc/dcc_db reopened with 997 MByte window 
Mar 8 16:05:26 avas arpwatch: listening on eth0 
Mar 10 10:00:06 avas named[6986]: zone PLNet/IN: refresh: non-authoritative answer from master 192.75.26.21#53 
Mar 10 10:00:10 avas named[6986]: client 127.0.0.1#55867: query: mail.canfor.ca IN MX 
Mar 8 15:18:40 avas: last message repeated 11 times 

어떻게 해야 할지 제안해 주시겠습니까?

+0

다음과 같은 의미입니까? https://gist.github.com/leandrosilva/3651640 – nir0s

+0

네, 그런데 제 로그 파일은 로그 형식이 항상 같지 않습니다. 다음 줄을 구문 분석할 때 리스트 인덱스가 범위를 벗어났다는 오류(IndexError)가 발생합니다. "Mar 8 15:18:40 avas: last message repeated 11 times" – RRK

+0

언제든지 try/except로 IndexError를 처리해 볼 수 있습니다. – nir0s

답변

0

이 새로운 형식의 줄을 처리하기 위해 pyparsing의 Optional 클래스로 appname 부분을 선택 사항으로 표시하고, 뒤따르는 ':'을 appname에서 분리했습니다. 아래 코드에는 몇 가지 수정도 더했습니다. 구문 분석 시점에 데이터를 변환하는 파싱 액션(parse action)과, parse() 메서드에서 결과 dict를 간단히 만들 수 있게 해 주는 결과 이름(results name)을 추가했습니다.

from pyparsing import Word, alphas, Suppress, Combine, nums, string, Regex, Optional 

from datetime import datetime 

class Parser(object):
    """Parse BSD-style syslog lines with pyparsing.

    A line is expected to consist of a timestamp (``Mon D HH:MM:SS``), a
    host name, an optional application name with an optional ``[pid]``,
    a ':' separator and the free-form message.
    """

    # log lines don't include the year, but if we don't provide one, datetime.strptime will assume 1900
    ASSUMED_YEAR = '2016'

    def __init__(self, assumed_year=None):
        """Build the grammar.

        assumed_year -- year (int or str) to attach to every parsed
            timestamp. When None (the default), ``Parser.ASSUMED_YEAR`` is
            looked up each time a timestamp is converted, exactly as
            before this argument existed.
        """
        self._assumed_year = assumed_year

        ints = Word(nums)

        # priority
        # priority = Suppress("<") + ints + Suppress(">")

        # timestamp
        month = Word(string.ascii_uppercase, string.ascii_lowercase, exact=3)
        day = ints
        hour = Combine(ints + ":" + ints + ":" + ints)
        timestamp = month + day + hour

        def to_datetime(tokens):
            # Parse action: join the month/day/time tokens, prepend the
            # assumed year and convert the result to a datetime.
            if self._assumed_year is None:
                year = Parser.ASSUMED_YEAR
            else:
                year = str(self._assumed_year)
            return datetime.strptime(year + ' ' + ' '.join(tokens),
                                     '%Y %b %d %H:%M:%S')

        timestamp.setParseAction(to_datetime)

        # hostname
        hostname = Word(alphas + nums + "_-.")

        # appname: either "name[pid]" (name may contain parentheses) or a
        # bare name; '+' binds tighter than '|', the parentheses below only
        # make that grouping explicit.
        app_with_pid = (Word(alphas + "/-_.()")("appname")
                        + (Suppress("[") + ints("pid") + Suppress("]")))
        app_only = Word(alphas + "/-_.")("appname")
        appname = app_with_pid | app_only
        appname.setName("appname")

        # message
        message = Regex(".*")

        # pattern build
        # (add results names to make it easier to access parsed fields)
        self._pattern = (timestamp("timestamp")
                         + hostname("hostname")
                         + Optional(appname)
                         + Suppress(':')
                         + message("message"))

    def parse(self, line):
        """Parse one log line and return its fields as a dict.

        The dict always contains the keys ``appname`` and ``pid``; they are
        empty strings when the line did not include them. Raises
        pyparsing.ParseException when the line does not match the grammar.
        """
        fields = self._pattern.parseString(line).asDict()
        # fill in keys that might not have been found in the input string
        for key in ('appname', 'pid'):
            fields.setdefault(key, '')
        return fields

특정 테스트 입력에 대해 파서를 테스트하려면 runTests()를 사용하십시오:

# Grab the pyparsing expression that Parser.__init__ stores in _pattern so it
# can be exercised directly, without going through Parser.parse().
pattern = Parser()._pattern 

# Sample log lines, one test case per line: with pid, without pid, and (last
# line) without any application name.
tests = """\ 
Mar 7 04:02:16 avas clamd[11165]: /var/amavis/amavis-20040307T033734-10329/parts/part-00003: Worm.Mydoom.F FOUND 
Mar 7 04:05:55 avas clamd[11240]: /var/amavis/amavis-20040307T035901-10615/parts/part-00002: Worm.SomeFool.Gen-1 FOUND 
Mar 7 09:00:51 avas clamd[27173]: SelfCheck: Database status OK. 
Mar 7 05:59:02 avas clamd[27173]: Database correctly reloaded (20400 viruses) 
Mar 7 11:14:35 avas dccd[13284]: 21 requests/sec are too many from anonymous 205.201.1.56,2246 
Mar 8 00:22:57 avas dccifd[9933]: write(MTA socket,4): Broken pipe 
Mar 7 21:23:22 avas dccifd[6191]: missing message body 
Mar 9 16:05:17 avas named[12045]: zone PLNet/IN: refresh: non-authoritative answer from master 10.0.0.253#53 
Mar 10 00:38:16 avas dccifd[23069]: continue not asking DCC 17 seconds after failure 
Mar 10 09:42:11 avas named: client 127.0.0.1#55524: query: 23.68.27.142.sa-trusted.bondedsender.org IN TXT 
Mar 9 03:48:07 avas dccd[145]: automatic dbclean; starting `dbclean -DPq -i 1189 -L info,local5.notice -L error,local5.err` 
Mar 9 11:58:18 avas kernel: i810_audio: Connection 0 with codec id 2 
Mar 9 19:41:13 avas dccd[3004]: "packet length 44 too small for REPORT" sent to client 1 at 194.63.250.215,47577 
Mar 8 09:01:07 avas sshd(pam_unix)[21839]: session opened for user tom by (uid=35567) 
Mar 8 03:52:04 avas dccd[13284]: 1.2.32 database /home/dcc/dcc_db reopened with 997 MByte window 
Mar 8 16:05:26 avas arpwatch: listening on eth0 
Mar 10 10:00:06 avas named[6986]: zone PLNet/IN: refresh: non-authoritative answer from master 192.75.26.21#53 
Mar 10 10:00:10 avas named[6986]: client 127.0.0.1#55867: query: mail.canfor.ca IN MX 
Mar 8 15:18:40 avas: last message repeated 11 times""" 

# Run the expression against the sample lines; the printed report for each
# line (token list plus named results) is shown below.
pattern.runTests(tests) 

다음과 같은 결과를 출력합니다:

Mar 7 04:02:16 avas clamd[11165]: /var/amavis/amavis-20040307T033734-10329/parts/part-00003: Worm.Mydoom.F FOUND 
[datetime.datetime(2016, 3, 7, 4, 2, 16), 'avas', 'clamd', '11165', '/var/amavis/amavis-20040307T033734-10329/parts/part-00003: Worm.Mydoom.F FOUND'] 
- appname: 'clamd' 
- hostname: 'avas' 
- message: '/var/amavis/amavis-20040307T033734-10329/parts/part-00003: Worm.Mydoom.F FOUND' 
- pid: '11165' 
- timestamp: datetime.datetime(2016, 3, 7, 4, 2, 16) 


Mar 7 04:05:55 avas clamd[11240]: /var/amavis/amavis-20040307T035901-10615/parts/part-00002: Worm.SomeFool.Gen-1 FOUND 
[datetime.datetime(2016, 3, 7, 4, 5, 55), 'avas', 'clamd', '11240', '/var/amavis/amavis-20040307T035901-10615/parts/part-00002: Worm.SomeFool.Gen-1 FOUND'] 
- appname: 'clamd' 
- hostname: 'avas' 
- message: '/var/amavis/amavis-20040307T035901-10615/parts/part-00002: Worm.SomeFool.Gen-1 FOUND' 
- pid: '11240' 
- timestamp: datetime.datetime(2016, 3, 7, 4, 5, 55) 


Mar 7 09:00:51 avas clamd[27173]: SelfCheck: Database status OK. 
[datetime.datetime(2016, 3, 7, 9, 0, 51), 'avas', 'clamd', '27173', 'SelfCheck: Database status OK.'] 
- appname: 'clamd' 
- hostname: 'avas' 
- message: 'SelfCheck: Database status OK.' 
- pid: '27173' 
- timestamp: datetime.datetime(2016, 3, 7, 9, 0, 51) 


Mar 7 05:59:02 avas clamd[27173]: Database correctly reloaded (20400 viruses) 
[datetime.datetime(2016, 3, 7, 5, 59, 2), 'avas', 'clamd', '27173', 'Database correctly reloaded (20400 viruses)'] 
- appname: 'clamd' 
- hostname: 'avas' 
- message: 'Database correctly reloaded (20400 viruses)' 
- pid: '27173' 
- timestamp: datetime.datetime(2016, 3, 7, 5, 59, 2) 


Mar 7 11:14:35 avas dccd[13284]: 21 requests/sec are too many from anonymous 205.201.1.56,2246 
[datetime.datetime(2016, 3, 7, 11, 14, 35), 'avas', 'dccd', '13284', '21 requests/sec are too many from anonymous 205.201.1.56,2246'] 
- appname: 'dccd' 
- hostname: 'avas' 
- message: '21 requests/sec are too many from anonymous 205.201.1.56,2246' 
- pid: '13284' 
- timestamp: datetime.datetime(2016, 3, 7, 11, 14, 35) 


Mar 8 00:22:57 avas dccifd[9933]: write(MTA socket,4): Broken pipe 
[datetime.datetime(2016, 3, 8, 0, 22, 57), 'avas', 'dccifd', '9933', 'write(MTA socket,4): Broken pipe'] 
- appname: 'dccifd' 
- hostname: 'avas' 
- message: 'write(MTA socket,4): Broken pipe' 
- pid: '9933' 
- timestamp: datetime.datetime(2016, 3, 8, 0, 22, 57) 


Mar 7 21:23:22 avas dccifd[6191]: missing message body 
[datetime.datetime(2016, 3, 7, 21, 23, 22), 'avas', 'dccifd', '6191', 'missing message body'] 
- appname: 'dccifd' 
- hostname: 'avas' 
- message: 'missing message body' 
- pid: '6191' 
- timestamp: datetime.datetime(2016, 3, 7, 21, 23, 22) 


Mar 9 16:05:17 avas named[12045]: zone PLNet/IN: refresh: non-authoritative answer from master 10.0.0.253#53 
[datetime.datetime(2016, 3, 9, 16, 5, 17), 'avas', 'named', '12045', 'zone PLNet/IN: refresh: non-authoritative answer from master 10.0.0.253#53'] 
- appname: 'named' 
- hostname: 'avas' 
- message: 'zone PLNet/IN: refresh: non-authoritative answer from master 10.0.0.253#53' 
- pid: '12045' 
- timestamp: datetime.datetime(2016, 3, 9, 16, 5, 17) 


Mar 10 00:38:16 avas dccifd[23069]: continue not asking DCC 17 seconds after failure 
[datetime.datetime(2016, 3, 10, 0, 38, 16), 'avas', 'dccifd', '23069', 'continue not asking DCC 17 seconds after failure'] 
- appname: 'dccifd' 
- hostname: 'avas' 
- message: 'continue not asking DCC 17 seconds after failure' 
- pid: '23069' 
- timestamp: datetime.datetime(2016, 3, 10, 0, 38, 16) 


Mar 10 09:42:11 avas named: client 127.0.0.1#55524: query: 23.68.27.142.sa-trusted.bondedsender.org IN TXT 
[datetime.datetime(2016, 3, 10, 9, 42, 11), 'avas', 'named', 'client 127.0.0.1#55524: query: 23.68.27.142.sa-trusted.bondedsender.org IN TXT'] 
- appname: 'named' 
- hostname: 'avas' 
- message: 'client 127.0.0.1#55524: query: 23.68.27.142.sa-trusted.bondedsender.org IN TXT' 
- timestamp: datetime.datetime(2016, 3, 10, 9, 42, 11) 


Mar 9 03:48:07 avas dccd[145]: automatic dbclean; starting `dbclean -DPq -i 1189 -L info,local5.notice -L error,local5.err` 
[datetime.datetime(2016, 3, 9, 3, 48, 7), 'avas', 'dccd', '145', 'automatic dbclean; starting `dbclean -DPq -i 1189 -L info,local5.notice -L error,local5.err`'] 
- appname: 'dccd' 
- hostname: 'avas' 
- message: 'automatic dbclean; starting `dbclean -DPq -i 1189 -L info,local5.notice -L error,local5.err`' 
- pid: '145' 
- timestamp: datetime.datetime(2016, 3, 9, 3, 48, 7) 


Mar 9 11:58:18 avas kernel: i810_audio: Connection 0 with codec id 2 
[datetime.datetime(2016, 3, 9, 11, 58, 18), 'avas', 'kernel', 'i810_audio: Connection 0 with codec id 2'] 
- appname: 'kernel' 
- hostname: 'avas' 
- message: 'i810_audio: Connection 0 with codec id 2' 
- timestamp: datetime.datetime(2016, 3, 9, 11, 58, 18) 


Mar 9 19:41:13 avas dccd[3004]: "packet length 44 too small for REPORT" sent to client 1 at 194.63.250.215,47577 
[datetime.datetime(2016, 3, 9, 19, 41, 13), 'avas', 'dccd', '3004', '"packet length 44 too small for REPORT" sent to client 1 at 194.63.250.215,47577'] 
- appname: 'dccd' 
- hostname: 'avas' 
- message: '"packet length 44 too small for REPORT" sent to client 1 at 194.63.250.215,47577' 
- pid: '3004' 
- timestamp: datetime.datetime(2016, 3, 9, 19, 41, 13) 


Mar 8 09:01:07 avas sshd(pam_unix)[21839]: session opened for user tom by (uid=35567) 
[datetime.datetime(2016, 3, 8, 9, 1, 7), 'avas', 'sshd(pam_unix)', '21839', 'session opened for user tom by (uid=35567)'] 
- appname: 'sshd(pam_unix)' 
- hostname: 'avas' 
- message: 'session opened for user tom by (uid=35567)' 
- pid: '21839' 
- timestamp: datetime.datetime(2016, 3, 8, 9, 1, 7) 


Mar 8 03:52:04 avas dccd[13284]: 1.2.32 database /home/dcc/dcc_db reopened with 997 MByte window 
[datetime.datetime(2016, 3, 8, 3, 52, 4), 'avas', 'dccd', '13284', '1.2.32 database /home/dcc/dcc_db reopened with 997 MByte window'] 
- appname: 'dccd' 
- hostname: 'avas' 
- message: '1.2.32 database /home/dcc/dcc_db reopened with 997 MByte window' 
- pid: '13284' 
- timestamp: datetime.datetime(2016, 3, 8, 3, 52, 4) 


Mar 8 16:05:26 avas arpwatch: listening on eth0 
[datetime.datetime(2016, 3, 8, 16, 5, 26), 'avas', 'arpwatch', 'listening on eth0'] 
- appname: 'arpwatch' 
- hostname: 'avas' 
- message: 'listening on eth0' 
- timestamp: datetime.datetime(2016, 3, 8, 16, 5, 26) 


Mar 10 10:00:06 avas named[6986]: zone PLNet/IN: refresh: non-authoritative answer from master 192.75.26.21#53 
[datetime.datetime(2016, 3, 10, 10, 0, 6), 'avas', 'named', '6986', 'zone PLNet/IN: refresh: non-authoritative answer from master 192.75.26.21#53'] 
- appname: 'named' 
- hostname: 'avas' 
- message: 'zone PLNet/IN: refresh: non-authoritative answer from master 192.75.26.21#53' 
- pid: '6986' 
- timestamp: datetime.datetime(2016, 3, 10, 10, 0, 6) 


Mar 10 10:00:10 avas named[6986]: client 127.0.0.1#55867: query: mail.canfor.ca IN MX 
[datetime.datetime(2016, 3, 10, 10, 0, 10), 'avas', 'named', '6986', 'client 127.0.0.1#55867: query: mail.canfor.ca IN MX'] 
- appname: 'named' 
- hostname: 'avas' 
- message: 'client 127.0.0.1#55867: query: mail.canfor.ca IN MX' 
- pid: '6986' 
- timestamp: datetime.datetime(2016, 3, 10, 10, 0, 10) 

Mar 8 15:18:40 avas: last message repeated 11 times 
[datetime.datetime(2016, 3, 8, 15, 18, 40), 'avas', 'last message repeated 11 times'] 
- hostname: 'avas' 
- message: 'last message repeated 11 times' 
- timestamp: datetime.datetime(2016, 3, 8, 15, 18, 40) 

또는 Parser 클래스의 parse() 메서드를 사용할 수도 있습니다:

from pprint import pprint

# Build the parser once: constructing the pyparsing grammar is loop-invariant,
# so there is no need to create a new Parser for every line.
syslog_parser = Parser()

# Pretty-print the dict returned by parse() for each sample line, followed by
# a blank line as a separator.
for t in tests.splitlines():
    pprint(syslog_parser.parse(t))
    print()

출력 결과:

{'appname': 'clamd', 
'hostname': 'avas', 
'message': '/var/amavis/amavis-20040307T033734-10329/parts/part-00003: ' 
     'Worm.Mydoom.F FOUND ', 
'pid': '11165', 
'timestamp': datetime.datetime(2016, 3, 7, 4, 2, 16)} 

{'appname': 'clamd', 
'hostname': 'avas', 
'message': '/var/amavis/amavis-20040307T035901-10615/parts/part-00002: ' 
     'Worm.SomeFool.Gen-1 FOUND ', 
'pid': '11240', 
'timestamp': datetime.datetime(2016, 3, 7, 4, 5, 55)} 

{'appname': 'clamd', 
'hostname': 'avas', 
'message': 'SelfCheck: Database status OK.', 
'pid': '27173', 
'timestamp': datetime.datetime(2016, 3, 7, 9, 0, 51)} 

{'appname': 'clamd', 
'hostname': 'avas', 
'message': 'Database correctly reloaded (20400 viruses) ', 
'pid': '27173', 
'timestamp': datetime.datetime(2016, 3, 7, 5, 59, 2)} 

{'appname': 'dccd', 
'hostname': 'avas', 
'message': '21 requests/sec are too many from anonymous 205.201.1.56,2246', 
'pid': '13284', 
'timestamp': datetime.datetime(2016, 3, 7, 11, 14, 35)} 

{'appname': 'dccifd', 
'hostname': 'avas', 
'message': 'write(MTA socket,4): Broken pipe', 
'pid': '9933', 
'timestamp': datetime.datetime(2016, 3, 8, 0, 22, 57)} 

{'appname': 'dccifd', 
'hostname': 'avas', 
'message': 'missing message body', 
'pid': '6191', 
'timestamp': datetime.datetime(2016, 3, 7, 21, 23, 22)} 

{'appname': 'named', 
'hostname': 'avas', 
'message': 'zone PLNet/IN: refresh: non-authoritative answer from master ' 
     '10.0.0.253#53', 
'pid': '12045', 
'timestamp': datetime.datetime(2016, 3, 9, 16, 5, 17)} 

{'appname': 'dccifd', 
'hostname': 'avas', 
'message': 'continue not asking DCC 17 seconds after failure', 
'pid': '23069', 
'timestamp': datetime.datetime(2016, 3, 10, 0, 38, 16)} 

{'appname': 'named', 
'hostname': 'avas', 
'message': 'client 127.0.0.1#55524: query: ' 
     '23.68.27.142.sa-trusted.bondedsender.org IN TXT', 
'pid': '', 
'timestamp': datetime.datetime(2016, 3, 10, 9, 42, 11)} 

{'appname': 'dccd', 
'hostname': 'avas', 
'message': 'automatic dbclean; starting `dbclean -DPq -i 1189 -L ' 
     'info,local5.notice -L error,local5.err`', 
'pid': '145', 
'timestamp': datetime.datetime(2016, 3, 9, 3, 48, 7)} 

{'appname': 'kernel', 
'hostname': 'avas', 
'message': 'i810_audio: Connection 0 with codec id 2', 
'pid': '', 
'timestamp': datetime.datetime(2016, 3, 9, 11, 58, 18)} 

{'appname': 'dccd', 
'hostname': 'avas', 
'message': '"packet length 44 too small for REPORT" sent to client 1 at ' 
     '194.63.250.215,47577', 
'pid': '3004', 
'timestamp': datetime.datetime(2016, 3, 9, 19, 41, 13)} 

{'appname': 'sshd(pam_unix)', 
'hostname': 'avas', 
'message': 'session opened for user tom by (uid=35567)', 
'pid': '21839', 
'timestamp': datetime.datetime(2016, 3, 8, 9, 1, 7)} 

{'appname': 'dccd', 
'hostname': 'avas', 
'message': '1.2.32 database /home/dcc/dcc_db reopened with 997 MByte window', 
'pid': '13284', 
'timestamp': datetime.datetime(2016, 3, 8, 3, 52, 4)} 

{'appname': 'arpwatch', 
'hostname': 'avas', 
'message': 'listening on eth0', 
'pid': '', 
'timestamp': datetime.datetime(2016, 3, 8, 16, 5, 26)} 

{'appname': 'named', 
'hostname': 'avas', 
'message': 'zone PLNet/IN: refresh: non-authoritative answer from master ' 
     '192.75.26.21#53', 
'pid': '6986', 
'timestamp': datetime.datetime(2016, 3, 10, 10, 0, 6)} 

{'appname': 'named', 
'hostname': 'avas', 
'message': 'client 127.0.0.1#55867: query: mail.canfor.ca IN MX', 
'pid': '6986', 
'timestamp': datetime.datetime(2016, 3, 10, 10, 0, 10)} 

{'appname': '', 
'hostname': 'avas', 
'message': 'last message repeated 11 times', 
'pid': '', 
'timestamp': datetime.datetime(2016, 3, 8, 15, 18, 40)} 
+0

에서 예외를 시도해 볼 수 있습니다. 고맙습니다 @ 폴 맥과이어 :) – RRK