plaso-rubanetra/plaso/parsers/text_parser.py
2020-04-06 18:48:34 +02:00

1100 lines
36 KiB
Python

#!/usr/bin/python
# -*- coding: utf-8 -*-
#
# Copyright 2012 The Plaso Project Authors.
# Please see the AUTHORS file for details on individual authors.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""This file contains a class to provide a parsing framework to plaso.
This module contains a base framework class for parsing file objects, and
also some implementations that extend it to provide more comprehensive
parsers.
"""
import abc
import csv
import logging
import os
from dfvfs.helpers import text_file
import pyparsing
from plaso.events import text_events
from plaso.lib import errors
from plaso.lib import event
from plaso.lib import lexer
from plaso.lib import timelib
from plaso.lib import utils
from plaso.parsers import interface
import pytz
# Pylint complains about some functions not being implemented that shouldn't
# be since they need to be implemented by children.
# pylint: disable=abstract-method
class SlowLexicalTextParser(interface.BaseParser, lexer.SelfFeederMixIn):
  """Generic text based parser that uses lexer to assist with parsing.

  This text parser is based on a rather slow lexer, which makes the
  use of this interface highly discouraged. Parsers that already
  implement it will most likely all be rewritten to support faster
  text parsing implementations.

  This text based parser needs to be extended to provide an accurate
  list of tokens that define the structure of the log file that the
  parser is designed for.
  """

  # Define the max number of lines before we determine this is
  # not the correct parser.
  MAX_LINES = 15

  # List of tokens that describe the structure of the log file.
  tokens = [
      lexer.Token('INITIAL', '(.+)\n', 'ParseString', ''),
  ]

  def __init__(self, local_zone=True):
    """Constructor for the SlowLexicalTextParser.

    Args:
      local_zone: A boolean value that determines if the entries
                  in the log file are stored in the local time
                  zone of the computer that stored it or in a fixed
                  timezone, like UTC.
    """
    # TODO: remove the multiple inheritance.
    lexer.SelfFeederMixIn.__init__(self)
    interface.BaseParser.__init__(self)
    self.line_ready = False
    self.attributes = {
        'body': '',
        'iyear': 0,
        'imonth': 0,
        'iday': 0,
        'time': '',
        'hostname': '',
        'username': '',
    }
    self.local_zone = local_zone
    self.file_entry = None

  def ClearValues(self):
    """Clears all the values inside the attributes dict.

    All values that start with the letter 'i' are considered
    to be an integer, otherwise string value is assumed.
    """
    self.line_ready = False
    for attr in self.attributes:
      if attr[0] == 'i':
        self.attributes[attr] = 0
      else:
        self.attributes[attr] = ''

  def ParseIncomplete(self, match=None, **unused_kwargs):
    """Indication that we've got a partial line to match against.

    The full match is appended to the body attribute and the line is
    flagged as ready.

    Args:
      match: The regular expression match object.
    """
    self.attributes['body'] += match.group(0)
    self.line_ready = True

  def ParseMessage(self, **unused_kwargs):
    """Signal that a line is ready to be parsed."""
    self.line_ready = True

  def SetMonth(self, match=None, **unused_kwargs):
    """Parses the month.

    This is a callback function for the text parser (lexer) and is
    called by the corresponding lexer state.

    Args:
      match: The regular expression match object.
    """
    # Month names missing from timelib.MONTH_DICT fall back to 1.
    self.attributes['imonth'] = int(
        timelib.MONTH_DICT.get(match.group(1).lower(), 1))

  def SetDay(self, match=None, **unused_kwargs):
    """Parses the day of the month.

    This is a callback function for the text parser (lexer) and is
    called by the corresponding lexer state.

    Args:
      match: The regular expression match object.
    """
    self.attributes['iday'] = int(match.group(1))

  def SetTime(self, match=None, **unused_kwargs):
    """Set the time attribute.

    Args:
      match: The regular expression match object.
    """
    self.attributes['time'] = match.group(1)

  def SetYear(self, match=None, **unused_kwargs):
    """Parses the year.

    This is a callback function for the text parser (lexer) and is
    called by the corresponding lexer state.

    Args:
      match: The regular expression match object.
    """
    self.attributes['iyear'] = int(match.group(1))

  def Parse(self, parser_context, file_entry, parser_chain=None):
    """Extract data from a text file.

    Args:
      parser_context: A parser context object (instance of ParserContext).
      file_entry: A file entry object (instance of dfvfs.FileEntry).
      parser_chain: Optional string containing the parsing chain up to this
                    point. The default is None.

    Raises:
      UnableToParseFile: when the file cannot be parsed.
    """
    path_spec_printable = u'{0:s}:{1:s}'.format(
        file_entry.path_spec.type_indicator, file_entry.name)
    file_object = file_entry.GetFileObject()

    self.file_entry = file_entry
    # TODO: this is necessary since we inherit from lexer.SelfFeederMixIn.
    self.file_object = file_object

    # Start by checking, is this a text file or not? Before we proceed
    # any further.
    file_object.seek(0, os.SEEK_SET)
    if not utils.IsText(file_object.read(40)):
      # Close the file object here as well, like every other failure path.
      file_object.close()
      raise errors.UnableToParseFile(u'Not a text file, unable to proceed.')

    file_object.seek(0, os.SEEK_SET)

    error_count = 0
    file_verified = False

    # We need to clear out few values in the Lexer before continuing.
    # There might be some leftovers from previous run.
    self.error = 0
    self.buffer = ''

    # Add ourselves to the parser chain, which will be used in all subsequent
    # event creation in this parser.
    parser_chain = self._BuildParserChain(parser_chain)

    while True:
      _ = self.NextToken()

      if self.state == 'INITIAL':
        # Back in the initial state: remember where the entry that was just
        # read started and where the next one is going to start.
        self.entry_offset = getattr(self, 'next_entry_offset', 0)
        self.next_entry_offset = file_object.tell() - len(self.buffer)

      if not file_verified and self.error >= self.MAX_LINES * 2:
        logging.debug(
            u'Lexer error count: {0:d} and current state {1:s}'.format(
                self.error, self.state))
        file_object.close()
        raise errors.UnableToParseFile(
            u'[{0:s}] unsupported file: {1:s}.'.format(
                self.NAME, path_spec_printable))

      if self.line_ready:
        try:
          event_object = self.ParseLine(parser_context)
          parser_context.ProduceEvent(
              event_object, parser_chain=parser_chain, file_entry=file_entry)
          file_verified = True

        except errors.TimestampNotCorrectlyFormed as exception:
          error_count += 1
          if file_verified:
            logging.debug(
                u'[{0:s} VERIFIED] Error count: {1:d} and ERROR: {2:d}'.format(
                    path_spec_printable, error_count, self.error))
            logging.warning(
                u'[{0:s}] Unable to parse timestamp with error: {1!s}'.format(
                    self.NAME, exception))

          else:
            logging.debug((
                u'[{0:s} EVALUATING] Error count: {1:d} and ERROR: '
                u'{2:d})').format(path_spec_printable, error_count, self.error))

            # NOTE(review): this check is assumed to apply only while the
            # file has not been verified yet - confirm the intended nesting.
            if error_count >= self.MAX_LINES:
              file_object.close()
              raise errors.UnableToParseFile(
                  u'[{0:s}] unsupported file: {1:s}.'.format(
                      self.NAME, path_spec_printable))

        finally:
          self.ClearValues()

      if self.Empty():
        # Try to fill the buffer to prevent the parser from ending prematurely.
        self.Feed()

        if self.Empty():
          break

    if not file_verified:
      file_object.close()
      raise errors.UnableToParseFile(
          u'[{0:s}] unable to parse file: {1:s}.'.format(
              self.NAME, path_spec_printable))

    file_offset = file_object.get_offset()
    if file_offset < file_object.get_size():
      logging.error((
          u'{0:s} prematurely terminated parsing: {1:s} at offset: '
          u'0x{2:08x}.').format(
              self.NAME, path_spec_printable, file_offset))

    file_object.close()

  def ParseString(self, match=None, **unused_kwargs):
    """Append the matched text to the body attribute.

    The first match group is used when the expression defines one,
    otherwise the full match is used. Newline characters are stripped
    from both ends of the text before it is appended.

    Args:
      match: The regular expression match object.
    """
    try:
      self.attributes['body'] += match.group(1).strip('\n')
    except IndexError:
      self.attributes['body'] += match.group(0).strip('\n')

  def PrintLine(self):
    """Return a string with combined values from the lexer.

    Returns:
      A string describing the date, time, hostname, reporter and body
      that are currently stored in the attributes dict.
    """
    # The attributes are stored in a dict, hence they need to be read with
    # dict.get(); getattr() on a dict always returns the default value.
    year = self.attributes.get('iyear', None)
    month = self.attributes.get('imonth', None)
    day = self.attributes.get('iday', None)

    if None in [year, month, day]:
      date_string = u'[DATE NOT SET]'
    else:
      try:
        # int() without a base accepts both integer and string values.
        date_string = u'{0:04d}-{1:02d}-{2:02d}'.format(
            int(year), int(month), int(day))
      except (TypeError, ValueError):
        date_string = u'[DATE INVALID]'

    time_string = self.attributes.get('time', u'[TIME NOT SET]')
    hostname_string = self.attributes.get('hostname', u'HOSTNAME NOT SET')
    reporter_string = self.attributes.get(
        'reporter', u'[REPORTER NOT SET]')
    body_string = self.attributes.get('body', u'[BODY NOT SET]')

    # TODO: this is a work in progress. The reason for the try-catch is that
    # the text parser is handed a non-text file and must deal with converting
    # arbitrary binary data.
    try:
      # The !s conversion keeps non-string attribute values from raising a
      # ValueError while the line is being formatted.
      line = u'{0!s} {1!s} [{2!s}] {3!s} => {4!s}'.format(
          date_string, time_string, hostname_string, reporter_string,
          body_string)
    except UnicodeError:
      line = 'Unable to print line - due to encoding error.'

    return line

  def ParseLine(self, parser_context):
    """Return an event object extracted from the current line.

    Args:
      parser_context: A parser context object (instance of ParserContext).

    Returns:
      An event object (instance of TextEvent).

    Raises:
      TimestampNotCorrectlyFormed: when the time or year is not set, or
                                   when the time value cannot be parsed.
    """
    if not self.attributes['time']:
      raise errors.TimestampNotCorrectlyFormed(
          u'Unable to parse timestamp, time not set.')

    if not self.attributes['iyear']:
      raise errors.TimestampNotCorrectlyFormed(
          u'Unable to parse timestamp, year not set.')

    times = self.attributes['time'].split(':')
    if self.local_zone:
      timezone = parser_context.timezone
    else:
      timezone = pytz.UTC

    if len(times) < 3:
      raise errors.TimestampNotCorrectlyFormed((
          u'Unable to parse timestamp, not of the format HH:MM:SS '
          u'[{0:s}]').format(self.PrintLine()))

    try:
      # The seconds can have a fraction, e.g. "12.345678".
      secs = times[2].split('.')
      if len(secs) == 2:
        sec, us = secs
      else:
        sec = times[2]
        us = 0

      timestamp = timelib.Timestamp.FromTimeParts(
          int(self.attributes['iyear']), self.attributes['imonth'],
          self.attributes['iday'], int(times[0]), int(times[1]),
          int(sec), microseconds=int(us), timezone=timezone)

    except ValueError as exception:
      raise errors.TimestampNotCorrectlyFormed(
          u'Unable to parse: {0:s} with error: {1!s}'.format(
              self.PrintLine(), exception))

    return self.CreateEvent(
        timestamp, getattr(self, 'entry_offset', 0), self.attributes)

  # TODO: this is a rough initial implementation to get this working.
  def CreateEvent(self, timestamp, offset, attributes):
    """Creates an event.

    This function should be overwritten by text parsers that require
    to generate specific event object type, the default is TextEvent.

    Args:
      timestamp: The timestamp time value. The timestamp contains the
                 number of microseconds since Jan 1, 1970 00:00:00 UTC.
      offset: The offset of the event.
      attributes: A dict that contains the events attributes.

    Returns:
      An event object (instance of TextEvent).
    """
    return text_events.TextEvent(timestamp, offset, attributes)
class TextCSVParser(interface.BaseParser):
  """An implementation of a simple CSV line-per-entry log files."""

  # A list that contains the names of all the fields in the log file.
  COLUMNS = []

  # A CSV file is comma separated, but this can be overwritten to include
  # tab, pipe or other character separation.
  VALUE_SEPARATOR = ','

  # If there is a header before the lines start it can be defined here, and
  # the number of header lines that need to be skipped before the parsing
  # starts.
  NUMBER_OF_HEADER_LINES = 0

  # If there is a special quote character used inside the structured text
  # it can be defined here.
  QUOTE_CHAR = '"'

  # Value that should not appear inside the file, made to test the actual
  # file to see if it confirms to standards.
  MAGIC_TEST_STRING = 'RegnThvotturMeistarans'

  def VerifyRow(self, unused_parser_context, unused_row):
    """Return a bool indicating whether or not this is the correct parser.

    This default implementation returns None, which Parse treats as a
    failed verification, hence parsers need to override this function.

    Args:
      parser_context: A parser context object (instance of ParserContext).
      row: A single row from the CSV file.

    Returns:
      True if this is the correct parser, False otherwise.
    """
    pass

  def ParseRow(
      self, parser_context, row_offset, row, file_entry=None,
      parser_chain=None):
    """Parse a line of the log file and extract event objects.

    Args:
      parser_context: A parser context object (instance of ParserContext).
      row_offset: The offset of the row.
      row: A dictionary containing all the fields as denoted in the
           COLUMNS class list.
      file_entry: optional file entry object (instance of dfvfs.FileEntry).
                  The default is None.
      parser_chain: Optional string containing the parsing chain up to this
                    point. The default is None.
    """
    event_object = event.EventObject()
    if row_offset is not None:
      event_object.offset = row_offset
    event_object.row_dict = row
    parser_context.ProduceEvent(
        event_object, parser_chain=parser_chain, file_entry=file_entry)

  def Parse(self, parser_context, file_entry, parser_chain=None):
    """Extract data from a CSV file.

    Args:
      parser_context: A parser context object (instance of ParserContext).
      file_entry: A file entry object (instance of dfvfs.FileEntry).
      parser_chain: Optional string containing the parsing chain up to this
                    point. The default is None.

    Raises:
      UnableToParseFile: when the file cannot be parsed.
    """
    path_spec_printable = file_entry.path_spec.comparable.replace(u'\n', u';')
    file_object = file_entry.GetFileObject()

    # The file object is closed in a single place, also when VerifyRow or
    # ParseRow raise an exception.
    try:
      file_object.seek(0, os.SEEK_SET)
      text_file_object = text_file.TextFile(file_object)

      # If we specifically define a number of lines we should skip do that
      # here.
      for _ in range(0, self.NUMBER_OF_HEADER_LINES):
        _ = text_file_object.readline()

      # Missing and surplus values are marked with the magic test string so
      # that rows with a wrong number of values can be detected below.
      reader = csv.DictReader(
          text_file_object, fieldnames=self.COLUMNS,
          restkey=self.MAGIC_TEST_STRING, restval=self.MAGIC_TEST_STRING,
          delimiter=self.VALUE_SEPARATOR, quotechar=self.QUOTE_CHAR)

      try:
        row = next(reader)
      except (csv.Error, StopIteration):
        raise errors.UnableToParseFile(
            u'[{0:s}] Unable to parse CSV file: {1:s}.'.format(
                self.NAME, path_spec_printable))

      number_of_columns = len(self.COLUMNS)
      number_of_records = len(row)

      if number_of_records != number_of_columns:
        raise errors.UnableToParseFile((
            u'[{0:s}] Unable to parse CSV file: {1:s}. Wrong number of '
            u'records (expected: {2:d}, got: {3:d})').format(
                self.NAME, path_spec_printable, number_of_columns,
                number_of_records))

      for key, value in row.items():
        if key == self.MAGIC_TEST_STRING or value == self.MAGIC_TEST_STRING:
          raise errors.UnableToParseFile((
              u'[{0:s}] Unable to parse CSV file: {1:s}. Signature '
              u'mismatch.').format(self.NAME, path_spec_printable))

      if not self.VerifyRow(parser_context, row):
        raise errors.UnableToParseFile((
            u'[{0:s}] Unable to parse CSV file: {1:s}. Verification '
            u'failed.').format(self.NAME, path_spec_printable))

      # Add ourselves to the parser chain, which will be used in all
      # subsequent event creation in this parser.
      parser_chain = self._BuildParserChain(parser_chain)

      # The first row was already read for the verification above. Note that
      # the offset passed on is the position after the row was read.
      self.ParseRow(
          parser_context, text_file_object.tell(), row, file_entry=file_entry,
          parser_chain=parser_chain)

      for row in reader:
        self.ParseRow(
            parser_context, text_file_object.tell(), row,
            file_entry=file_entry, parser_chain=parser_chain)

    finally:
      file_object.close()
def PyParseRangeCheck(lower_bound, upper_bound):
  """Build a pyparsing parse action that checks a number against a range.

  pyparsing parse actions have a fixed signature, so the bounds cannot be
  handed over as arguments when the action is invoked. Instead this function
  returns a closure that has the bounds bound to it, which can be passed
  to setParseAction.

  Args:
    lower_bound: An integer representing the lower bound of the range.
    upper_bound: An integer representing the upper bound of the range.

  Returns:
    A callback method that can be used by pyparsing setParseAction.
  """
  def CheckRange(unused_string, unused_location, tokens):
    """Raise a ParseException if the first token is outside of the range."""
    # A missing first token is treated as the value -1.
    try:
      value = tokens[0]
    except IndexError:
      value = -1

    if value < lower_bound:
      message = u'Value: {0:d} precedes lower bound: {1:d}'.format(
          value, lower_bound)
      raise pyparsing.ParseException(message)

    if value > upper_bound:
      message = u'Value: {0:d} exceeds upper bound: {1:d}'.format(
          value, upper_bound)
      raise pyparsing.ParseException(message)

  return CheckRange
def PyParseIntCast(unused_string, unused_location, tokens):
  """Cast the matched tokens to integer values.

  This is a pyparsing callback method that converts the matched
  string into an integer.

  The method modifies the content of the tokens list in place and converts
  both the positional and the named tokens to an integer value. Values
  that cannot be converted are logged and set to 0.

  Args:
    unused_string: The original parsed string.
    unused_location: The location within the string where the match was made.
    tokens: A list of extracted tokens (where the string to be converted is
            stored).
  """
  # Cast the regular tokens.
  for index, token in enumerate(tokens):
    try:
      tokens[index] = int(token)
    except ValueError:
      logging.error(u'Unable to cast [{0!s}] to an int, setting to 0'.format(
          token))
      tokens[index] = 0

  # We also need to cast the dictionary built tokens.
  for key in tokens.keys():
    value = tokens[key]

    # int() with an explicit base only accepts strings, hence a value
    # that already is an integer is left untouched.
    if isinstance(value, int):
      continue

    try:
      tokens[key] = int(value, 10)
    except ValueError:
      # The value is the string that failed to convert, hence it cannot
      # be formatted as a decimal number.
      logging.error(
          u'Unable to cast [{0!s} = {1!s}] to an int, setting to 0'.format(
              key, value))
      tokens[key] = 0
def PyParseJoinList(unused_string, unused_location, tokens):
  """Join a list of tokens into a single token.

  This is a callback method for pyparsing setParseAction that modifies
  the returned token list to join all the elements in the list to a single
  token. The token list is changed in place; an empty token list is left
  untouched.

  Args:
    unused_string: The original parsed string.
    unused_location: The location within the string where the match was made.
    tokens: A list of extracted tokens. This is the list that should be joined
            together and stored as a single token.
  """
  # There is nothing to join and tokens[0] cannot be assigned to.
  if not len(tokens):
    return

  join_list = []
  for token in tokens:
    try:
      join_list.append(str(token))
    except UnicodeError:
      # str() can fail with an encode as well as a decode error, fall back
      # to the printable representation in both cases.
      join_list.append(repr(token))

  tokens[0] = u''.join(join_list)
  del tokens[1:]
class PyparsingConstants(object):
  """A class that maintains constants for pyparsing.

  The constants are pyparsing expressions that are evaluated once, when
  the class body is executed.
  """

  # Numbers.
  # A word of digits that is cast to an integer by PyParseIntCast.
  INTEGER = pyparsing.Word(pyparsing.nums).setParseAction(PyParseIntCast)
  # One to three digits, cast to an integer and checked to be within 0-255.
  IPV4_OCTET = pyparsing.Word(pyparsing.nums, min=1, max=3).setParseAction(
      PyParseIntCast, PyParseRangeCheck(0, 255))
  # Four octets separated by dots, joined into a single token by
  # PyParseJoinList.
  IPV4_ADDRESS = (IPV4_OCTET + ('.' + IPV4_OCTET) * 3).setParseAction(
      PyParseJoinList)

  # TODO: Fix the IPv6 address specification to be more accurate (8 :, correct
  # size, etc).
  # Any word that consists of colons and hexadecimal digits.
  IPV6_ADDRESS = pyparsing.Word(':' + pyparsing.hexnums).setParseAction(
      PyParseJoinList)

  # Common words.
  # A word of exactly three characters: an uppercase letter followed by
  # lowercase letters.
  MONTH = pyparsing.Word(
      pyparsing.string.uppercase, pyparsing.string.lowercase,
      exact=3)

  # Define date structures.
  # NOTE(review): HYPHEN is not referenced by the other constants in this
  # class, which use pyparsing.Suppress('-') directly.
  HYPHEN = pyparsing.Literal('-').suppress()
  # Exactly four digits, cast to an integer.
  YEAR = pyparsing.Word(pyparsing.nums, exact=4).setParseAction(
      PyParseIntCast)
  # Exactly two digits, cast to an integer.
  TWO_DIGITS = pyparsing.Word(pyparsing.nums, exact=2).setParseAction(
      PyParseIntCast)
  # One or two digits, cast to an integer.
  ONE_OR_TWO_DIGITS = pyparsing.Word(
      pyparsing.nums, min=1, max=2).setParseAction(PyParseIntCast)
  # Group of YEAR-TWO_DIGITS-TWO_DIGITS, the hyphens are suppressed.
  DATE = pyparsing.Group(
      YEAR + pyparsing.Suppress('-') + TWO_DIGITS +
      pyparsing.Suppress('-') + TWO_DIGITS)
  # Group of TWO_DIGITS-TWO_DIGITS-YEAR, the hyphens are suppressed.
  DATE_REV = pyparsing.Group(
      TWO_DIGITS + pyparsing.Suppress('-') + TWO_DIGITS +
      pyparsing.Suppress('-') + YEAR)
  # Group of TWO_DIGITS:TWO_DIGITS:TWO_DIGITS, the colons are suppressed.
  TIME = pyparsing.Group(
      TWO_DIGITS + pyparsing.Suppress(':') + TWO_DIGITS +
      pyparsing.Suppress(':') + TWO_DIGITS)
  # TIME followed by a suppressed dot and an integer value.
  TIME_MSEC = TIME + pyparsing.Suppress('.') + INTEGER
  DATE_TIME = DATE + TIME
  DATE_TIME_MSEC = DATE + TIME_MSEC

  # A hash character followed by everything up to the end of the line.
  COMMENT_LINE_HASH = pyparsing.Literal('#') + pyparsing.SkipTo(
      pyparsing.LineEnd())
  # TODO: Add more commonly used structs that can be used by parsers.
  # One to five digits, cast to an integer.
  PID = pyparsing.Word(
      pyparsing.nums, min=1, max=5).setParseAction(PyParseIntCast)
class PyparsingSingleLineTextParser(interface.BaseParser):
  """Single line text parser based on the pyparsing library."""

  # The actual structure, this needs to be defined by each parser.
  # This is defined as a list of tuples so that more then a single line
  # structure can be defined. That way the parser can support more than a
  # single type of log entry, despite them all having in common the constraint
  # that each log entry is a single line.
  # The tuple should have two entries, a key and a structure. This is done to
  # keep the structures in an order of priority/preference.
  # The key is a comment or an identification that is passed to the ParseRecord
  # function so that the developer can identify which structure got parsed.
  # The value is the actual pyparsing structure.
  LINE_STRUCTURES = []

  # In order for the tool to not read too much data into a buffer to evaluate
  # whether or not the parser is the right one for this file or not we
  # specifically define a maximum amount of bytes a single line can occupy. This
  # constant can be overwritten by implementations if their format might have a
  # longer line than 400 bytes.
  MAX_LINE_LENGTH = 400

  # Define an encoding. If a file is encoded using specific encoding it is
  # advised to include it here. If this class constant is set all lines wil be
  # decoded prior to being sent to parsing by pyparsing, if not properly set it
  # could negatively affect parsing of the file.
  # If this value needs to be calculated on the fly (not a fixed constant for
  # this particular file type) it can be done by modifying the self.encoding
  # attribute.
  ENCODING = ''

  def __init__(self):
    """Initializes the pyparsing single-line text parser object."""
    super(PyparsingSingleLineTextParser, self).__init__()
    self.encoding = self.ENCODING
    self._current_offset = 0
    # TODO: self._line_structures is a work-around and this needs
    # a structural fix.
    self._line_structures = self.LINE_STRUCTURES

  def _ReadLine(
      self, parser_context, file_entry, text_file_object, max_len=0,
      quiet=False, depth=0):
    """Read a single line from a text file and return it back.

    Lines that only consist of a newline are skipped.

    Args:
      parser_context: A parser context object (instance of ParserContext).
      file_entry: A file entry object (instance of dfvfs.FileEntry).
      text_file_object: A text file object (instance of dfvfs.TextFile).
      max_len: If defined determines the maximum number of bytes a single line
               can take.
      quiet: If True then a decode warning is not displayed.
      depth: A threshold of how many newlines we can encounter before bailing
             out.

    Returns:
      A single line read from the file-like object, or the maximum number of
      characters (if max_len defined and line longer than the defined size).
      None is returned if no more data could be read and an empty string if
      too many consecutive empty lines were encountered.
    """
    if max_len:
      line = text_file_object.readline(max_len)
    else:
      line = text_file_object.readline()

    if not line:
      return

    # If line is empty, skip it and go on.
    if line == '\n' or line == '\r\n':
      # Max 40 new lines in a row before we bail out.
      if depth == 40:
        return ''

      # Pass on the quiet flag as well, otherwise a decode warning would be
      # displayed for a line that follows an empty line.
      return self._ReadLine(
          parser_context, file_entry, text_file_object, max_len=max_len,
          quiet=quiet, depth=depth + 1)

    if not self.encoding:
      return line.strip()

    try:
      decoded_line = line.decode(self.encoding)
      return decoded_line.strip()
    except UnicodeDecodeError:
      if not quiet:
        logging.warning((
            u'Unable to decode line [{0:s}...] with encoding: {1:s} in '
            u'file: {2:s}').format(
                repr(line[1:30]), self.encoding,
                parser_context.GetDisplayName(file_entry)))
      # Fall back to the line as it was read.
      return line.strip()

  def Parse(self, parser_context, file_entry, parser_chain=None):
    """Extract data from a text file using a pyparsing definition.

    Args:
      parser_context: A parser context object (instance of ParserContext).
      file_entry: A file entry object (instance of dfvfs.FileEntry).
      parser_chain: Optional string containing the parsing chain up to this
                    point. The default is None.

    Raises:
      UnableToParseFile: when the file cannot be parsed.
    """
    # TODO: find a more elegant way for this; currently the mac_wifi and
    # syslog parser seem to rely on this member.
    self.file_entry = file_entry

    # TODO: self._line_structures is a work-around and this needs
    # a structural fix.
    # This is checked before the file object is opened, so that there is
    # nothing to close when the check fails.
    if not self._line_structures:
      raise errors.UnableToParseFile(
          u'Line structure undeclared, unable to proceed.')

    file_object = file_entry.GetFileObject()

    # The file object is closed on every path out of this function,
    # including the ones where the file turns out to be unsupported.
    try:
      file_object.seek(0, os.SEEK_SET)
      text_file_object = text_file.TextFile(file_object)

      line = self._ReadLine(
          parser_context, file_entry, text_file_object,
          max_len=self.MAX_LINE_LENGTH, quiet=True)
      if not line:
        raise errors.UnableToParseFile(u'Not a text file.')

      if len(line) == self.MAX_LINE_LENGTH or len(
          line) == self.MAX_LINE_LENGTH - 1:
        logging.debug((
            u'Trying to read a line and reached the maximum allowed length of '
            u'{0:d}. The last few bytes of the line are: {1:s} [parser '
            u'{2:s}]').format(
                self.MAX_LINE_LENGTH, repr(line[-10:]), self.NAME))

      if not utils.IsText(line):
        raise errors.UnableToParseFile(u'Not a text file, unable to proceed.')

      if not self.VerifyStructure(parser_context, line):
        raise errors.UnableToParseFile('Wrong file structure.')

      # Add ourselves to the parser chain, which will be used in all
      # subsequent event creation in this parser.
      parser_chain = self._BuildParserChain(parser_chain)

      # Set the offset to the beginning of the file.
      self._current_offset = 0

      # Read every line in the text file.
      while line:
        parsed_structure = None
        use_key = None

        # Try to parse the line using all the line structures.
        # NOTE(review): the check above uses self._line_structures while
        # this loop uses self.LINE_STRUCTURES - confirm this is intended.
        for key, structure in self.LINE_STRUCTURES:
          try:
            parsed_structure = structure.parseString(line)
          except pyparsing.ParseException:
            pass

          # The first structure that yields a result is used.
          if parsed_structure:
            use_key = key
            break

        if parsed_structure:
          parsed_event = self.ParseRecord(
              parser_context, use_key, parsed_structure)
          if parsed_event:
            parsed_event.offset = self._current_offset
            parser_context.ProduceEvent(
                parsed_event, parser_chain=parser_chain,
                file_entry=file_entry)
        else:
          logging.warning(u'Unable to parse log line: {0:s}'.format(line))

        self._current_offset = text_file_object.get_offset()
        line = self._ReadLine(parser_context, file_entry, text_file_object)

    finally:
      file_object.close()

  @abc.abstractmethod
  def ParseRecord(self, parser_context, key, structure):
    """Parse a single extracted pyparsing structure.

    This function takes as an input a parsed pyparsing structure
    and produces an EventObject if possible from that structure.

    Args:
      parser_context: A parser context object (instance of ParserContext).
      key: An identification string indicating the name of the parsed
           structure.
      structure: A pyparsing.ParseResults object from a line in the
                 log file.

    Returns:
      An event object (instance of EventObject) or None.
    """

  @abc.abstractmethod
  def VerifyStructure(self, parser_context, line):
    """Verify the structure of the file and return boolean based on that check.

    This function should read enough text from the text file to confirm
    that the file is the correct one for this particular parser.

    Args:
      parser_context: A parser context object (instance of ParserContext).
      line: A single line from the text file.

    Returns:
      True if this is the correct parser, False otherwise.
    """
class EncodedTextReader(object):
  """Class to read simple encoded text."""

  def __init__(self, buffer_size=2048, encoding=None):
    """Initializes the encoded text reader object.

    Args:
      buffer_size: optional buffer size. The default is 2048.
      encoding: optional encoding. The default is None.
    """
    super(EncodedTextReader, self).__init__()
    self._buffer = ''
    self._buffer_size = buffer_size
    self._current_offset = 0
    self._encoding = encoding

    # The line separators are matched against the raw data, hence they need
    # to be in the same encoding as the data.
    new_line = '\n'
    carriage_return = '\r'
    if self._encoding:
      new_line = u'\n'.encode(self._encoding)
      carriage_return = u'\r'.encode(self._encoding)

    self._new_line = new_line
    self._carriage_return = carriage_return
    self._new_line_length = len(new_line)
    self._carriage_return_length = len(carriage_return)

    # Buffer of lines that have been read and, if applicable, decoded.
    self.lines = u''

  def _ReadLine(self, file_object):
    """Reads a line from the file object.

    A carriage return directly in front of the new line is removed
    from the line.

    Args:
      file_object: the file-like object.

    Returns:
      A string containing the line, including the new line if one was read.
    """
    # Top up the raw buffer when it runs low.
    if len(self._buffer) < self._buffer_size:
      self._buffer += file_object.read(self._buffer_size)

    line, separator, remainder = self._buffer.partition(self._new_line)
    self._buffer = remainder

    if not (line or separator):
      line = self._buffer
      self._buffer = ''

    # The offset is based on the raw data, before the carriage return is
    # removed.
    self._current_offset += len(line)

    # Strip carriage returns from the text.
    if line.endswith(self._carriage_return):
      line = line[:-self._carriage_return_length]

    if separator:
      line = line + self._new_line
      self._current_offset += self._new_line_length

    if not self._encoding:
      return line

    # If a parser specifically indicates specific encoding we need
    # to handle the buffer as it is an encoded string.
    # If it fails we fail back to the original raw string.
    try:
      return line.decode(self._encoding)
    except UnicodeDecodeError:
      # TODO: it might be better to raise here.
      return line

  def ReadLine(self, file_object):
    """Reads a line.

    If the lines buffer does not yield a line, more lines are read from
    the file object and the lines buffer is tried once more.

    Args:
      file_object: the file-like object.

    Returns:
      A single line read from the lines buffer.
    """
    line, _, self.lines = self.lines.partition('\n')
    if line:
      return line

    self.ReadLines(file_object)
    line, _, self.lines = self.lines.partition('\n')
    return line

  def ReadLines(self, file_object):
    """Reads lines into the lines buffer.

    Lines are read until the lines buffer holds at least the buffer size
    or until no more data can be read.

    Args:
      file_object: the file-like object.
    """
    # NOTE(review): assumes the read loop is only meant to run while the
    # lines buffer is smaller than the buffer size - confirm.
    remaining_size = self._buffer_size - len(self.lines)
    while remaining_size > 0:
      line = self._ReadLine(file_object)
      if not line:
        break

      self.lines = u''.join((self.lines, line))
      remaining_size -= len(line)

  def Reset(self):
    """Resets the encoded text reader."""
    self._buffer = ''
    self._current_offset = 0
    self.lines = u''

  def SkipAhead(self, file_object, number_of_characters):
    """Skips ahead a number of characters.

    Args:
      file_object: the file-like object.
      number_of_characters: the number of characters.
    """
    buffered_size = len(self.lines)

    # Drop the lines buffer and refill it for as long as the number of
    # characters to skip covers all of it.
    while number_of_characters >= buffered_size:
      number_of_characters -= buffered_size
      self.lines = u''
      self.ReadLines(file_object)
      buffered_size = len(self.lines)
      if not buffered_size:
        # No more data could be read.
        return

    self.lines = self.lines[number_of_characters:]
class PyparsingMultiLineTextParser(PyparsingSingleLineTextParser):
  """Multi line text parser based on the pyparsing library."""

  # The size of the buffer of lines that the structures are matched against.
  BUFFER_SIZE = 2048

  def __init__(self):
    """Initializes the pyparsing multi-line text parser object."""
    super(PyparsingMultiLineTextParser, self).__init__()
    self._buffer_size = self.BUFFER_SIZE
    self._text_reader = EncodedTextReader(
        buffer_size=self.BUFFER_SIZE, encoding=self.ENCODING)

  def Parse(self, parser_context, file_entry, parser_chain=None):
    """Parse a text file using a pyparsing definition.

    Args:
      parser_context: A parser context object (instance of ParserContext).
      file_entry: A file entry object (instance of dfvfs.FileEntry).
      parser_chain: Optional string containing the parsing chain up to this
                    point. The default is None.

    Raises:
      UnableToParseFile: if the line structures are missing or when the file
                         cannot be parsed.
    """
    if not self.LINE_STRUCTURES:
      raise errors.UnableToParseFile(u'Missing line structures.')

    self._text_reader.Reset()

    file_object = file_entry.GetFileObject()

    # The file object is closed on every path out of this function, like
    # the other parsers in this file do.
    try:
      file_object.seek(0, os.SEEK_SET)

      try:
        self._text_reader.ReadLines(file_object)
      except UnicodeDecodeError as exception:
        raise errors.UnableToParseFile(
            u'Not a text file, with error: {0!s}'.format(exception))

      if not utils.IsText(self._text_reader.lines):
        raise errors.UnableToParseFile(u'Not a text file, unable to proceed.')

      if not self.VerifyStructure(parser_context, self._text_reader.lines):
        raise errors.UnableToParseFile(u'Wrong file structure.')

      # Add ourselves to the parser chain, which will be used in all
      # subsequent event creation in this parser.
      parser_chain = self._BuildParserChain(parser_chain)

      # Read every line in the text file.
      while self._text_reader.lines:
        # Initialize pyparsing objects.
        tokens = None
        start = 0
        end = 0

        key = None

        # Try to parse the line using all the line structures.
        for key, structure in self.LINE_STRUCTURES:
          try:
            parsed_structure = next(
                structure.scanString(self._text_reader.lines, maxMatches=1),
                None)
          except pyparsing.ParseException:
            continue

          if not parsed_structure:
            continue

          tokens, start, end = parsed_structure

          # Only want to parse the structure if it starts
          # at the beginning of the buffer.
          if start == 0:
            break

        if tokens and start == 0:
          parsed_event = self.ParseRecord(parser_context, key, tokens)
          if parsed_event:
            # TODO: need a reliable way to handle this.
            # parsed_event.offset = self._text_reader.line_offset
            parser_context.ProduceEvent(
                parsed_event, parser_chain=parser_chain,
                file_entry=file_entry)

          # Continue after the text that was matched by the structure.
          self._text_reader.SkipAhead(file_object, end)

        else:
          # None of the structures matched at the start of the buffer, drop
          # a single line and try again.
          odd_line = self._text_reader.ReadLine(file_object)
          if odd_line:
            logging.warning(
                u'Unable to parse log line: {0:s}'.format(repr(odd_line)))

        try:
          self._text_reader.ReadLines(file_object)
        except UnicodeDecodeError as exception:
          logging.error(
              u'[{0:s}] Unable to read lines from file: {1:s} with error: '
              u'{2!s}'.format(
                  parser_chain,
                  file_entry.path_spec.comparable.replace(u'\n', u';'),
                  exception))

    finally:
      file_object.close()