1100 lines
36 KiB
Python
1100 lines
36 KiB
Python
#!/usr/bin/python
|
|
# -*- coding: utf-8 -*-
|
|
#
|
|
# Copyright 2012 The Plaso Project Authors.
|
|
# Please see the AUTHORS file for details on individual authors.
|
|
#
|
|
# Licensed under the Apache License, Version 2.0 (the "License");
|
|
# you may not use this file except in compliance with the License.
|
|
# You may obtain a copy of the License at
|
|
#
|
|
# http://www.apache.org/licenses/LICENSE-2.0
|
|
#
|
|
# Unless required by applicable law or agreed to in writing, software
|
|
# distributed under the License is distributed on an "AS IS" BASIS,
|
|
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
# See the License for the specific language governing permissions and
|
|
# limitations under the License.
|
|
"""This file contains a class to provide a parsing framework to plaso.
|
|
|
|
This class contains a base framework class for parsing fileobjects, and
|
|
also some implementations that extend it to provide a more comprehensive
|
|
parser.
|
|
"""
|
|
|
|
import abc
|
|
import csv
|
|
import logging
|
|
import os
|
|
|
|
from dfvfs.helpers import text_file
|
|
import pyparsing
|
|
|
|
from plaso.events import text_events
|
|
from plaso.lib import errors
|
|
from plaso.lib import event
|
|
from plaso.lib import lexer
|
|
from plaso.lib import timelib
|
|
from plaso.lib import utils
|
|
from plaso.parsers import interface
|
|
|
|
import pytz
|
|
|
|
# Pylint complains about some functions not being implemented that shouldn't
|
|
# be since they need to be implemented by children.
|
|
# pylint: disable=abstract-method
|
|
|
|
|
|
class SlowLexicalTextParser(interface.BaseParser, lexer.SelfFeederMixIn):
  """Generic text based parser that uses lexer to assist with parsing.

  This text parser is based on a rather slow lexer, which makes the
  use of this interface highly discouraged. Parsers that already
  implement it will most likely all be rewritten to support faster
  text parsing implementations.

  This text based parser needs to be extended to provide an accurate
  list of tokens that define the structure of the log file that the
  parser is designed for.
  """

  # Define the max number of lines before we determine this is
  # not the correct parser.
  MAX_LINES = 15

  # List of tokens that describe the structure of the log file.
  tokens = [
      lexer.Token('INITIAL', '(.+)\n', 'ParseString', ''),
  ]

  def __init__(self, local_zone=True):
    """Constructor for the SlowLexicalTextParser.

    Args:
      local_zone: A boolean value that determines if the entries
                  in the log file are stored in the local time
                  zone of the computer that stored it or in a fixed
                  timezone, like UTC.
    """
    # TODO: remove the multiple inheritance.
    lexer.SelfFeederMixIn.__init__(self)
    interface.BaseParser.__init__(self)
    self.line_ready = False
    # Keys prefixed with the letter 'i' hold integer values, all other
    # keys hold string values (see ClearValues).
    self.attributes = {
        'body': '',
        'iyear': 0,
        'imonth': 0,
        'iday': 0,
        'time': '',
        'hostname': '',
        'username': '',
    }
    self.local_zone = local_zone
    self.file_entry = None

  def ClearValues(self):
    """Clears all the values inside the attributes dict.

    All values that start with the letter 'i' are considered
    to be an integer, otherwise string value is assumed.
    """
    self.line_ready = False
    for attr in self.attributes:
      if attr[0] == 'i':
        self.attributes[attr] = 0
      else:
        self.attributes[attr] = ''

  def ParseIncomplete(self, match=None, **unused_kwargs):
    """Indication that we've got a partial line to match against.

    Args:
      match: The regular expression match object.
    """
    self.attributes['body'] += match.group(0)
    self.line_ready = True

  def ParseMessage(self, **unused_kwargs):
    """Signal that a line is ready to be parsed."""
    self.line_ready = True

  def SetMonth(self, match=None, **unused_kwargs):
    """Parses the month.

    This is a callback function for the text parser (lexer) and is
    called by the corresponding lexer state.

    Args:
      match: The regular expression match object.
    """
    # Unrecognized month names default to 1 (January).
    self.attributes['imonth'] = int(
        timelib.MONTH_DICT.get(match.group(1).lower(), 1))

  def SetDay(self, match=None, **unused_kwargs):
    """Parses the day of the month.

    This is a callback function for the text parser (lexer) and is
    called by the corresponding lexer state.

    Args:
      match: The regular expression match object.
    """
    self.attributes['iday'] = int(match.group(1))

  def SetTime(self, match=None, **unused_kwargs):
    """Set the time attribute.

    Args:
      match: The regular expression match object.
    """
    self.attributes['time'] = match.group(1)

  def SetYear(self, match=None, **unused_kwargs):
    """Parses the year.

    This is a callback function for the text parser (lexer) and is
    called by the corresponding lexer state.

    Args:
      match: The regular expression match object.
    """
    self.attributes['iyear'] = int(match.group(1))

  def Parse(self, parser_context, file_entry, parser_chain=None):
    """Extract data from a text file.

    Args:
      parser_context: A parser context object (instance of ParserContext).
      file_entry: A file entry object (instance of dfvfs.FileEntry).
      parser_chain: Optional string containing the parsing chain up to this
                    point. The default is None.

    Raises:
      UnableToParseFile: when the file cannot be parsed.
    """
    path_spec_printable = u'{0:s}:{1:s}'.format(
        file_entry.path_spec.type_indicator, file_entry.name)
    file_object = file_entry.GetFileObject()

    self.file_entry = file_entry
    # TODO: this is necessary since we inherit from lexer.SelfFeederMixIn.
    self.file_object = file_object

    # Start by checking, is this a text file or not? Before we proceed
    # any further.
    file_object.seek(0, os.SEEK_SET)
    if not utils.IsText(file_object.read(40)):
      raise errors.UnableToParseFile(u'Not a text file, unable to proceed.')

    file_object.seek(0, os.SEEK_SET)

    error_count = 0
    file_verified = False
    # We need to clear out few values in the Lexer before continuing.
    # There might be some leftovers from previous run.
    self.error = 0
    self.buffer = ''

    # Add ourselves to the parser chain, which will be used in all subsequent
    # event creation in this parser.
    parser_chain = self._BuildParserChain(parser_chain)

    while True:
      _ = self.NextToken()

      if self.state == 'INITIAL':
        # Track the file offset of the entry being parsed, one token behind,
        # so events can be attributed to the start of their line.
        self.entry_offset = getattr(self, 'next_entry_offset', 0)
        self.next_entry_offset = file_object.tell() - len(self.buffer)

      if not file_verified and self.error >= self.MAX_LINES * 2:
        logging.debug(
            u'Lexer error count: {0:d} and current state {1:s}'.format(
                self.error, self.state))
        file_object.close()
        raise errors.UnableToParseFile(
            u'[{0:s}] unsupported file: {1:s}.'.format(
                self.NAME, path_spec_printable))

      if self.line_ready:
        try:
          event_object = self.ParseLine(parser_context)
          parser_context.ProduceEvent(
              event_object, parser_chain=parser_chain, file_entry=file_entry)

          file_verified = True

        except errors.TimestampNotCorrectlyFormed as exception:
          error_count += 1
          if file_verified:
            logging.debug(
                u'[{0:s} VERIFIED] Error count: {1:d} and ERROR: {2:d}'.format(
                    path_spec_printable, error_count, self.error))
            logging.warning(
                u'[{0:s}] Unable to parse timestamp with error: {1:s}'.format(
                    self.NAME, exception))

          else:
            logging.debug((
                u'[{0:s} EVALUATING] Error count: {1:d} and ERROR: '
                u'{2:d})').format(path_spec_printable, error_count, self.error))

            if error_count >= self.MAX_LINES:
              file_object.close()
              raise errors.UnableToParseFile(
                  u'[{0:s}] unsupported file: {1:s}.'.format(
                      self.NAME, path_spec_printable))

        finally:
          self.ClearValues()

      if self.Empty():
        # Try to fill the buffer to prevent the parser from ending prematurely.
        self.Feed()

      if self.Empty():
        break

    if not file_verified:
      file_object.close()
      raise errors.UnableToParseFile(
          u'[{0:s}] unable to parser file: {1:s}.'.format(
              self.NAME, path_spec_printable))

    file_offset = file_object.get_offset()
    if file_offset < file_object.get_size():
      logging.error((
          u'{0:s} prematurely terminated parsing: {1:s} at offset: '
          u'0x{2:08x}.').format(
              self.NAME, path_spec_printable, file_offset))
    file_object.close()

  def ParseString(self, match=None, **unused_kwargs):
    """Add a matched string to the body attribute.

    Args:
      match: The regular expression match object.
    """
    try:
      self.attributes['body'] += match.group(1).strip('\n')
    except IndexError:
      # The match had no capture group, use the whole match instead.
      self.attributes['body'] += match.group(0).strip('\n')

  def PrintLine(self):
    """Return a string with combined values from the lexer."""
    # NOTE: self.attributes is a dict, so dict access must be used here.
    # The previous implementation used getattr() on the dict, which always
    # returned the fallback value and never the parsed attributes.
    year = self.attributes.get('iyear', None)
    month = self.attributes.get('imonth', None)
    day = self.attributes.get('iday', None)

    if None in [year, month, day]:
      date_string = u'[DATE NOT SET]'
    else:
      try:
        # The date values are stored as integers but cast defensively in
        # case a subclass stored digit strings.
        date_string = u'{0:04d}-{1:02d}-{2:02d}'.format(
            int(year), int(month), int(day))
      except (TypeError, ValueError):
        date_string = u'[DATE INVALID]'

    time_string = self.attributes.get('time', u'[TIME NOT SET]')
    hostname_string = self.attributes.get('hostname', u'HOSTNAME NOT SET')
    reporter_string = self.attributes.get('reporter', u'[REPORTER NOT SET]')
    body_string = self.attributes.get('body', u'[BODY NOT SET]')

    # TODO: this is a work in progress. The reason for the try-catch is that
    # the text parser is handed a non-text file and must deal with converting
    # arbitrary binary data.
    try:
      line = u'{0:s} {1:s} [{2:s}] {3:s} => {4:s}'.format(
          date_string, time_string, hostname_string, reporter_string,
          body_string)
    except UnicodeError:
      line = 'Unable to print line - due to encoding error.'

    return line

  def ParseLine(self, parser_context):
    """Return an event object extracted from the current line.

    Args:
      parser_context: A parser context object (instance of ParserContext).

    Returns:
      An event object (instance of TextEvent).

    Raises:
      TimestampNotCorrectlyFormed: if the timestamp attributes are missing
                                   or cannot be converted to a timestamp.
    """
    if not self.attributes['time']:
      raise errors.TimestampNotCorrectlyFormed(
          u'Unable to parse timestamp, time not set.')

    if not self.attributes['iyear']:
      raise errors.TimestampNotCorrectlyFormed(
          u'Unable to parse timestamp, year not set.')

    times = self.attributes['time'].split(':')
    if self.local_zone:
      timezone = parser_context.timezone
    else:
      timezone = pytz.UTC

    if len(times) < 3:
      raise errors.TimestampNotCorrectlyFormed((
          u'Unable to parse timestamp, not of the format HH:MM:SS '
          u'[{0:s}]').format(self.PrintLine()))
    try:
      # The seconds field may carry a fractional part, e.g. "12.345678".
      secs = times[2].split('.')
      if len(secs) == 2:
        sec, us = secs
      else:
        sec = times[2]
        us = 0

      timestamp = timelib.Timestamp.FromTimeParts(
          int(self.attributes['iyear']), self.attributes['imonth'],
          self.attributes['iday'], int(times[0]), int(times[1]),
          int(sec), microseconds=int(us), timezone=timezone)

    except ValueError as exception:
      raise errors.TimestampNotCorrectlyFormed(
          u'Unable to parse: {0:s} with error: {1:s}'.format(
              self.PrintLine(), exception))

    return self.CreateEvent(
        timestamp, getattr(self, 'entry_offset', 0), self.attributes)

  # TODO: this is a rough initial implementation to get this working.
  def CreateEvent(self, timestamp, offset, attributes):
    """Creates an event.

    This function should be overwritten by text parsers that require
    to generate specific event object type, the default is TextEvent.

    Args:
      timestamp: The timestamp time value. The timestamp contains the
                 number of microseconds since Jan 1, 1970 00:00:00 UTC.
      offset: The offset of the event.
      attributes: A dict that contains the events attributes.

    Returns:
      An event object (instance of TextEvent).
    """
    return text_events.TextEvent(timestamp, offset, attributes)
|
|
class TextCSVParser(interface.BaseParser):
  """An implementation of a simple CSV line-per-entry log files."""

  # A list that contains the names of all the fields in the log file.
  COLUMNS = []

  # A CSV file is comma separated, but this can be overwritten to include
  # tab, pipe or other character separation.
  VALUE_SEPARATOR = ','

  # If there is a header before the lines start it can be defined here, and
  # the number of header lines that need to be skipped before the parsing
  # starts.
  NUMBER_OF_HEADER_LINES = 0

  # If there is a special quote character used inside the structured text
  # it can be defined here.
  QUOTE_CHAR = '"'

  # Value that should not appear inside the file, made to test the actual
  # file to see if it confirms to standards.
  MAGIC_TEST_STRING = 'RegnThvotturMeistarans'

  def VerifyRow(self, unused_parser_context, unused_row):
    """Return a bool indicating whether or not this is the correct parser.

    Subclasses must override this method; the base implementation returns
    None, which is treated as a verification failure by Parse.

    Args:
      unused_parser_context: A parser context object (instance of
                             ParserContext).
      unused_row: A single row from the CSV file.

    Returns:
      True if this is the correct parser, False otherwise.
    """
    pass

  def ParseRow(
      self, parser_context, row_offset, row, file_entry=None,
      parser_chain=None):
    """Parse a line of the log file and extract event objects.

    Args:
      parser_context: A parser context object (instance of ParserContext).
      row_offset: The offset of the row.
      row: A dictionary containing all the fields as denoted in the
           COLUMNS class list.
      file_entry: optional file entry object (instance of dfvfs.FileEntry).
                  The default is None.
      parser_chain: Optional string containing the parsing chain up to this
                    point. The default is None.
    """
    event_object = event.EventObject()
    if row_offset is not None:
      event_object.offset = row_offset
    event_object.row_dict = row
    parser_context.ProduceEvent(
        event_object, parser_chain=parser_chain, file_entry=file_entry)

  def Parse(self, parser_context, file_entry, parser_chain=None):
    """Extract data from a CSV file.

    Args:
      parser_context: A parser context object (instance of ParserContext).
      file_entry: A file entry object (instance of dfvfs.FileEntry).
      parser_chain: Optional string containing the parsing chain up to this
                    point. The default is None.

    Raises:
      UnableToParseFile: when the file cannot be parsed.
    """
    path_spec_printable = file_entry.path_spec.comparable.replace(u'\n', u';')
    file_object = file_entry.GetFileObject()
    file_object.seek(0, os.SEEK_SET)

    text_file_object = text_file.TextFile(file_object)

    # If we specifically define a number of lines we should skip do that here.
    for _ in range(0, self.NUMBER_OF_HEADER_LINES):
      _ = text_file_object.readline()

    reader = csv.DictReader(
        text_file_object, fieldnames=self.COLUMNS,
        restkey=self.MAGIC_TEST_STRING, restval=self.MAGIC_TEST_STRING,
        delimiter=self.VALUE_SEPARATOR, quotechar=self.QUOTE_CHAR)

    try:
      # Use the next() built-in instead of reader.next(), which only exists
      # on Python 2 iterators.
      row = next(reader)
    except (csv.Error, StopIteration):
      file_object.close()
      raise errors.UnableToParseFile(
          u'[{0:s}] Unable to parse CSV file: {1:s}.'.format(
              self.NAME, path_spec_printable))

    number_of_columns = len(self.COLUMNS)
    number_of_records = len(row)

    if number_of_records != number_of_columns:
      file_object.close()
      raise errors.UnableToParseFile((
          u'[{0:s}] Unable to parse CSV file: {1:s}. Wrong number of '
          u'records (expected: {2:d}, got: {3:d})').format(
              self.NAME, path_spec_printable, number_of_columns,
              number_of_records))

    # The magic test string in any key or value indicates the delimiters
    # did not line up with the expected columns.
    for key, value in row.items():
      if key == self.MAGIC_TEST_STRING or value == self.MAGIC_TEST_STRING:
        file_object.close()
        raise errors.UnableToParseFile((
            u'[{0:s}] Unable to parse CSV file: {1:s}. Signature '
            u'mismatch.').format(self.NAME, path_spec_printable))

    if not self.VerifyRow(parser_context, row):
      file_object.close()
      raise errors.UnableToParseFile((
          u'[{0:s}] Unable to parse CSV file: {1:s}. Verification '
          u'failed.').format(self.NAME, path_spec_printable))

    # Add ourselves to the parser chain, which will be used in all subsequent
    # event creation in this parser.
    parser_chain = self._BuildParserChain(parser_chain)

    self.ParseRow(
        parser_context, text_file_object.tell(), row, file_entry=file_entry,
        parser_chain=parser_chain)

    for row in reader:
      self.ParseRow(
          parser_context, text_file_object.tell(), row, file_entry=file_entry,
          parser_chain=parser_chain)

    file_object.close()
|
|
def PyParseRangeCheck(lower_bound, upper_bound):
  """Verify that a number is within a defined range.

  This is a callback method for pyparsing setParseAction
  that verifies that a read number is within a certain range.

  To use this method it needs to be defined as a callback method
  in setParseAction with the upper and lower bound set as parameters.

  Args:
    lower_bound: An integer representing the lower bound of the range.
    upper_bound: An integer representing the upper bound of the range.

  Returns:
    A callback method that can be used by pyparsing setParseAction.
  """
  def CheckRange(unused_string, unused_location, tokens):
    """Check that the first token lies within the configured bounds."""
    # An empty token list is handled by falling back to the sentinel -1,
    # which will trip the lower bound check for non-negative ranges.
    try:
      value = tokens[0]
    except IndexError:
      value = -1

    if value < lower_bound:
      raise pyparsing.ParseException(
          u'Value: {0:d} precedes lower bound: {1:d}'.format(
              value, lower_bound))

    if value > upper_bound:
      raise pyparsing.ParseException(
          u'Value: {0:d} exceeds upper bound: {1:d}'.format(
              value, upper_bound))

  # Since callback methods for pyparsing need to accept certain parameters
  # and there is no way to define conditions, like upper and lower bounds
  # we need to return here a method that accepts those pyparsing parameters.
  return CheckRange
|
|
|
def PyParseIntCast(unused_string, unused_location, tokens):
  """Return an integer from a string.

  This is a pyparsing callback method that converts the matched
  string into an integer.

  The method modifies the content of the tokens list and converts
  them all to an integer value.

  Args:
    unused_string: The original parsed string.
    unused_location: The location within the string where the match was made.
    tokens: A list of extracted tokens (where the string to be converted is
            stored).
  """
  # Cast the regular tokens.
  for index, token in enumerate(tokens):
    try:
      tokens[index] = int(token)
    except ValueError:
      logging.error(u'Unable to cast [{0:s}] to an int, setting to 0'.format(
          token))
      tokens[index] = 0

  # We also need to cast the dictionary built tokens.
  for key in tokens.keys():
    try:
      tokens[key] = int(tokens[key], 10)
    except ValueError:
      # The value failed the integer cast, so it is not an integer; use the
      # "!s" conversion here. The previous "{1:d}" format specifier raised
      # a ValueError when formatting the non-integer value, crashing the
      # error handler itself.
      logging.error(
          u'Unable to cast [{0:s} = {1!s}] to an int, setting to 0'.format(
              key, tokens[key]))
      tokens[key] = 0
|
|
def PyParseJoinList(unused_string, unused_location, tokens):
  """Return a joined token from a list of tokens.

  This is a callback method for pyparsing setParseAction that modifies
  the returned token list to join all the elements in the list to a single
  token.

  Args:
    unused_string: The original parsed string.
    unused_location: The location within the string where the match was made.
    tokens: A list of extracted tokens. This is the list that should be joined
            together and stored as a single token.
  """
  parts = []
  for token in tokens:
    try:
      parts.append(str(token))
    except UnicodeDecodeError:
      # Fall back to the representation when the token cannot be converted
      # to a plain string.
      parts.append(repr(token))

  # Collapse the token list into a single joined token.
  tokens[0] = u''.join(parts)
  del tokens[1:]
|
|
|
class PyparsingConstants(object):
  """A class that maintains constants for pyparsing.

  These grammar fragments are shared building blocks for pyparsing based
  text parsers: numbers, IP addresses and common date/time structures.
  """

  # Numbers.
  INTEGER = pyparsing.Word(pyparsing.nums).setParseAction(PyParseIntCast)
  # A single IPv4 octet: 1 to 3 digits, cast to int and bounded to 0-255.
  IPV4_OCTET = pyparsing.Word(pyparsing.nums, min=1, max=3).setParseAction(
      PyParseIntCast, PyParseRangeCheck(0, 255))
  # Four octets joined by dots, collapsed back into a single string token.
  IPV4_ADDRESS = (IPV4_OCTET + ('.' + IPV4_OCTET) * 3).setParseAction(
      PyParseJoinList)

  # TODO: Fix the IPv6 address specification to be more accurate (8 :, correct
  # size, etc).
  IPV6_ADDRESS = pyparsing.Word(':' + pyparsing.hexnums).setParseAction(
      PyParseJoinList)

  # Common words.
  # Three-letter month abbreviation, e.g. "Jan": one uppercase letter
  # followed by two lowercase letters.
  # NOTE(review): pyparsing.string.uppercase/lowercase are Python 2 names
  # (string.ascii_uppercase/ascii_lowercase on Python 3) — confirm the
  # supported runtime.
  MONTH = pyparsing.Word(
      pyparsing.string.uppercase, pyparsing.string.lowercase,
      exact=3)

  # Define date structures.
  HYPHEN = pyparsing.Literal('-').suppress()
  # Date/time fields are cast to int by PyParseIntCast.
  YEAR = pyparsing.Word(pyparsing.nums, exact=4).setParseAction(
      PyParseIntCast)
  TWO_DIGITS = pyparsing.Word(pyparsing.nums, exact=2).setParseAction(
      PyParseIntCast)
  ONE_OR_TWO_DIGITS = pyparsing.Word(
      pyparsing.nums, min=1, max=2).setParseAction(PyParseIntCast)
  # YYYY-MM-DD, grouped as [year, month, day].
  DATE = pyparsing.Group(
      YEAR + pyparsing.Suppress('-') + TWO_DIGITS +
      pyparsing.Suppress('-') + TWO_DIGITS)
  # DD-MM-YYYY (day first), grouped as [day, month, year].
  DATE_REV = pyparsing.Group(
      TWO_DIGITS + pyparsing.Suppress('-') + TWO_DIGITS +
      pyparsing.Suppress('-') + YEAR)
  # HH:MM:SS, grouped as [hours, minutes, seconds].
  TIME = pyparsing.Group(
      TWO_DIGITS + pyparsing.Suppress(':') + TWO_DIGITS +
      pyparsing.Suppress(':') + TWO_DIGITS)
  # HH:MM:SS.fff with the fractional part as a separate integer token.
  TIME_MSEC = TIME + pyparsing.Suppress('.') + INTEGER
  DATE_TIME = DATE + TIME
  DATE_TIME_MSEC = DATE + TIME_MSEC

  # A '#'-prefixed comment running to the end of the line.
  COMMENT_LINE_HASH = pyparsing.Literal('#') + pyparsing.SkipTo(
      pyparsing.LineEnd())
  # TODO: Add more commonly used structs that can be used by parsers.
  # A process identifier: 1 to 5 digits, cast to int.
  PID = pyparsing.Word(
      pyparsing.nums, min=1, max=5).setParseAction(PyParseIntCast)
|
|
|
class PyparsingSingleLineTextParser(interface.BaseParser):
  """Single line text parser based on the pyparsing library."""

  # The actual structure, this needs to be defined by each parser.
  # This is defined as a list of tuples so that more then a single line
  # structure can be defined. That way the parser can support more than a
  # single type of log entry, despite them all having in common the constraint
  # that each log entry is a single line.
  # The tuple should have two entries, a key and a structure. This is done to
  # keep the structures in an order of priority/preference.
  # The key is a comment or an identification that is passed to the ParseRecord
  # function so that the developer can identify which structure got parsed.
  # The value is the actual pyparsing structure.
  LINE_STRUCTURES = []

  # In order for the tool to not read too much data into a buffer to evaluate
  # whether or not the parser is the right one for this file or not we
  # specifically define a maximum amount of bytes a single line can occupy. This
  # constant can be overwritten by implementations if their format might have a
  # longer line than 400 bytes.
  MAX_LINE_LENGTH = 400

  # Define an encoding. If a file is encoded using specific encoding it is
  # advised to include it here. If this class constant is set all lines wil be
  # decoded prior to being sent to parsing by pyparsing, if not properly set it
  # could negatively affect parsing of the file.
  # If this value needs to be calculated on the fly (not a fixed constant for
  # this particular file type) it can be done by modifying the self.encoding
  # attribute.
  ENCODING = ''

  def __init__(self):
    """Initializes the pyparsing single-line text parser object."""
    super(PyparsingSingleLineTextParser, self).__init__()
    self.encoding = self.ENCODING
    self._current_offset = 0
    # TODO: self._line_structures is a work-around and this needs
    # a structural fix.
    self._line_structures = self.LINE_STRUCTURES

  def _ReadLine(
      self, parser_context, file_entry, text_file_object, max_len=0,
      quiet=False, depth=0):
    """Read a single line from a text file and return it back.

    Args:
      parser_context: A parser context object (instance of ParserContext).
      file_entry: A file entry object (instance of dfvfs.FileEntry).
      text_file_object: A text file object (instance of dfvfs.TextFile).
      max_len: If defined determines the maximum number of bytes a single line
               can take.
      quiet: If True then a decode warning is not displayed.
      depth: A threshold of how many newlines we can encounter before bailing
             out.

    Returns:
      A single line read from the file-like object, or the maximum number of
      characters (if max_len defined and line longer than the defined size).
    """
    if max_len:
      line = text_file_object.readline(max_len)
    else:
      line = text_file_object.readline()

    if not line:
      return

    # If line is empty, skip it and go on.
    if line == '\n' or line == '\r\n':
      # Max 40 new lines in a row before we bail out.
      if depth == 40:
        return ''

      # Propagate quiet so decode warnings stay suppressed across blank
      # lines; the previous implementation dropped it on recursion.
      return self._ReadLine(
          parser_context, file_entry, text_file_object, max_len=max_len,
          quiet=quiet, depth=depth + 1)

    if not self.encoding:
      return line.strip()

    try:
      decoded_line = line.decode(self.encoding)
      return decoded_line.strip()
    except UnicodeDecodeError:
      if not quiet:
        logging.warning((
            u'Unable to decode line [{0:s}...] with encoding: {1:s} in '
            u'file: {2:s}').format(
                repr(line[1:30]), self.encoding,
                parser_context.GetDisplayName(file_entry)))
      return line.strip()

  def Parse(self, parser_context, file_entry, parser_chain=None):
    """Extract data from a text file using a pyparsing definition.

    Args:
      parser_context: A parser context object (instance of ParserContext).
      file_entry: A file entry object (instance of dfvfs.FileEntry).
      parser_chain: Optional string containing the parsing chain up to this
                    point. The default is None.

    Raises:
      UnableToParseFile: when the file cannot be parsed.
    """
    # TODO: find a more elegant way for this; currently the mac_wifi and
    # syslog parser seem to rely on this member.
    self.file_entry = file_entry

    file_object = file_entry.GetFileObject()

    # TODO: self._line_structures is a work-around and this needs
    # a structural fix.
    if not self._line_structures:
      raise errors.UnableToParseFile(
          u'Line structure undeclared, unable to proceed.')

    file_object.seek(0, os.SEEK_SET)
    text_file_object = text_file.TextFile(file_object)

    line = self._ReadLine(
        parser_context, file_entry, text_file_object,
        max_len=self.MAX_LINE_LENGTH, quiet=True)
    if not line:
      raise errors.UnableToParseFile(u'Not a text file.')

    if len(line) == self.MAX_LINE_LENGTH or len(
        line) == self.MAX_LINE_LENGTH - 1:
      logging.debug((
          u'Trying to read a line and reached the maximum allowed length of '
          u'{0:d}. The last few bytes of the line are: {1:s} [parser '
          u'{2:s}]').format(
              self.MAX_LINE_LENGTH, repr(line[-10:]), self.NAME))

    if not utils.IsText(line):
      raise errors.UnableToParseFile(u'Not a text file, unable to proceed.')

    if not self.VerifyStructure(parser_context, line):
      raise errors.UnableToParseFile('Wrong file structure.')

    # Add ourselves to the parser chain, which will be used in all subsequent
    # event creation in this parser.
    parser_chain = self._BuildParserChain(parser_chain)

    # Set the offset to the beginning of the file.
    self._current_offset = 0
    # Read every line in the text file.
    while line:
      parsed_structure = None
      use_key = None
      # Try to parse the line using all the line structures.
      # Iterate self._line_structures (not LINE_STRUCTURES) for consistency
      # with the guard above and so instance-level adjustments take effect.
      for key, structure in self._line_structures:
        try:
          parsed_structure = structure.parseString(line)
        except pyparsing.ParseException:
          pass
        if parsed_structure:
          use_key = key
          break

      if parsed_structure:
        parsed_event = self.ParseRecord(
            parser_context, use_key, parsed_structure)
        if parsed_event:
          parsed_event.offset = self._current_offset
          parser_context.ProduceEvent(
              parsed_event, parser_chain=parser_chain, file_entry=file_entry)
      else:
        logging.warning(u'Unable to parse log line: {0:s}'.format(line))

      self._current_offset = text_file_object.get_offset()
      line = self._ReadLine(parser_context, file_entry, text_file_object)

    file_object.close()

  @abc.abstractmethod
  def ParseRecord(self, parser_context, key, structure):
    """Parse a single extracted pyparsing structure.

    This function takes as an input a parsed pyparsing structure
    and produces an EventObject if possible from that structure.

    Args:
      parser_context: A parser context object (instance of ParserContext).
      key: An identification string indicating the name of the parsed
           structure.
      structure: A pyparsing.ParseResults object from a line in the
                 log file.

    Returns:
      An event object (instance of EventObject) or None.
    """

  @abc.abstractmethod
  def VerifyStructure(self, parser_context, line):
    """Verify the structure of the file and return boolean based on that check.

    This function should read enough text from the text file to confirm
    that the file is the correct one for this particular parser.

    Args:
      parser_context: A parser context object (instance of ParserContext).
      line: A single line from the text file.

    Returns:
      True if this is the correct parser, False otherwise.
    """
|
|
|
class EncodedTextReader(object):
  """Class to read simple encoded text.

  Lines are accumulated into the public ``lines`` buffer and handed out one
  at a time; an optional encoding is applied when reading from the file.
  """

  def __init__(self, buffer_size=2048, encoding=None):
    """Initializes the encoded test reader object.

    Args:
      buffer_size: optional buffer size. The default is 2048.
      encoding: optional encoding. The default is None.
    """
    super(EncodedTextReader, self).__init__()
    self._buffer = ''
    self._buffer_size = buffer_size
    self._current_offset = 0
    self._encoding = encoding

    # When an encoding is set the raw stream contains encoded bytes, so the
    # line terminators are matched in their encoded form as well.
    if self._encoding:
      self._new_line = u'\n'.encode(self._encoding)
      self._carriage_return = u'\r'.encode(self._encoding)
    else:
      self._new_line = '\n'
      self._carriage_return = '\r'

    self._new_line_length = len(self._new_line)
    self._carriage_return_length = len(self._carriage_return)

    self.lines = u''

  def _ReadLine(self, file_object):
    """Reads a line from the file object.

    Args:
      file_object: the file-like object.

    Returns:
      A string containing the line.
    """
    # Top up the internal buffer when it runs low.
    if len(self._buffer) < self._buffer_size:
      self._buffer += file_object.read(self._buffer_size)

    line, separator, self._buffer = self._buffer.partition(self._new_line)
    if not line and not separator:
      # No data and no terminator: drain whatever remains in the buffer.
      line = self._buffer
      self._buffer = ''

    self._current_offset += len(line)

    # Strip carriage returns from the text.
    if line.endswith(self._carriage_return):
      line = line[:-self._carriage_return_length]

    if separator:
      line += self._new_line
      self._current_offset += self._new_line_length

    # If a parser specifically indicates specific encoding we need
    # to handle the buffer as it is an encoded string.
    # If it fails we fail back to the original raw string.
    if self._encoding:
      try:
        line = line.decode(self._encoding)
      except UnicodeDecodeError:
        # TODO: it might be better to raise here.
        pass

    return line

  def ReadLine(self, file_object):
    """Reads a line.

    Args:
      file_object: the file-like object.

    Returns:
      A single line read from the lines buffer.
    """
    line, _, self.lines = self.lines.partition('\n')
    if line:
      return line

    # The buffer was empty, refill it and try once more.
    self.ReadLines(file_object)
    line, _, self.lines = self.lines.partition('\n')
    return line

  def ReadLines(self, file_object):
    """Reads lines into the lines buffer.

    Args:
      file_object: the file-like object.
    """
    fill_size = len(self.lines)
    if fill_size < self._buffer_size:
      fill_size = self._buffer_size - fill_size
    while fill_size > 0:
      next_line = self._ReadLine(file_object)
      if not next_line:
        break

      self.lines += next_line
      fill_size -= len(next_line)

  def Reset(self):
    """Resets the encoded text reader."""
    self._buffer = ''
    self._current_offset = 0

    self.lines = u''

  def SkipAhead(self, file_object, number_of_characters):
    """Skips ahead a number of characters.

    Args:
      file_object: the file-like object.
      number_of_characters: the number of characters.
    """
    available = len(self.lines)
    while number_of_characters >= available:
      number_of_characters -= available

      self.lines = u''
      self.ReadLines(file_object)
      available = len(self.lines)
      if available == 0:
        return

    self.lines = self.lines[number_of_characters:]
|
|
|
class PyparsingMultiLineTextParser(PyparsingSingleLineTextParser):
  """Multi line text parser based on the pyparsing library."""

  # Size in number of characters of the lines buffer, also used as the
  # read chunk size of the encoded text reader.
  BUFFER_SIZE = 2048
|
def __init__(self):
|
|
"""Initializes the pyparsing multi-line text parser object."""
|
|
super(PyparsingMultiLineTextParser, self).__init__()
|
|
self._buffer_size = self.BUFFER_SIZE
|
|
self._text_reader = EncodedTextReader(
|
|
buffer_size=self.BUFFER_SIZE, encoding=self.ENCODING)
|
|
|
|
def Parse(self, parser_context, file_entry, parser_chain=None):
|
|
"""Parse a text file using a pyparsing definition.
|
|
|
|
Args:
|
|
parser_context: A parser context object (instance of ParserContext).
|
|
file_entry: A file entry object (instance of dfvfs.FileEntry).
|
|
parser_chain: Optional string containing the parsing chain up to this
|
|
point. The default is None.
|
|
|
|
Raises:
|
|
UnableToParseFile: if the line structures are missing.
|
|
"""
|
|
if not self.LINE_STRUCTURES:
|
|
raise errors.UnableToParseFile(u'Missing line structures.')
|
|
|
|
self._text_reader.Reset()
|
|
|
|
file_object = file_entry.GetFileObject()
|
|
file_object.seek(0, os.SEEK_SET)
|
|
|
|
try:
|
|
self._text_reader.ReadLines(file_object)
|
|
except UnicodeDecodeError as exception:
|
|
raise errors.UnableToParseFile(
|
|
u'Not a text file, with error: {0:s}'.format(exception))
|
|
|
|
if not utils.IsText(self._text_reader.lines):
|
|
raise errors.UnableToParseFile(u'Not a text file, unable to proceed.')
|
|
|
|
if not self.VerifyStructure(parser_context, self._text_reader.lines):
|
|
raise errors.UnableToParseFile(u'Wrong file structure.')
|
|
|
|
# Add ourselves to the parser chain, which will be used in all subsequent
|
|
# event creation in this parser.
|
|
parser_chain = self._BuildParserChain(parser_chain)
|
|
|
|
# Read every line in the text file.
|
|
while self._text_reader.lines:
|
|
# Initialize pyparsing objects.
|
|
tokens = None
|
|
start = 0
|
|
end = 0
|
|
|
|
key = None
|
|
|
|
# Try to parse the line using all the line structures.
|
|
for key, structure in self.LINE_STRUCTURES:
|
|
try:
|
|
parsed_structure = next(
|
|
structure.scanString(self._text_reader.lines, maxMatches=1), None)
|
|
except pyparsing.ParseException:
|
|
continue
|
|
|
|
if not parsed_structure:
|
|
continue
|
|
|
|
tokens, start, end = parsed_structure
|
|
|
|
# Only want to parse the structure if it starts
|
|
# at the beginning of the buffer.
|
|
if start == 0:
|
|
break
|
|
|
|
if tokens and start == 0:
|
|
parsed_event = self.ParseRecord(parser_context, key, tokens)
|
|
if parsed_event:
|
|
# TODO: need a reliable way to handle this.
|
|
# parsed_event.offset = self._text_reader.line_offset
|
|
parser_context.ProduceEvent(
|
|
parsed_event, parser_chain=parser_chain, file_entry=file_entry)
|
|
|
|
self._text_reader.SkipAhead(file_object, end)
|
|
|
|
else:
|
|
odd_line = self._text_reader.ReadLine(file_object)
|
|
if odd_line:
|
|
logging.warning(
|
|
u'Unable to parse log line: {0:s}'.format(repr(odd_line)))
|
|
|
|
try:
|
|
self._text_reader.ReadLines(file_object)
|
|
except UnicodeDecodeError as exception:
|
|
logging.error(
|
|
u'[{0:s}] Unable to read lines from file: {1:s} with error: '
|
|
u'{2:s}'.format(
|
|
parser_chain,
|
|
file_entry.path_spec.comparable.replace(u'\n', u';'),
|
|
exception))
|