Import from old repository

Stefan
2020-04-06 18:48:34 +02:00
commit 0da6783a45
762 changed files with 103065 additions and 0 deletions
@@ -0,0 +1,17 @@
#!/usr/bin/python
# -*- coding: utf-8 -*-
#
# Copyright 2014 The Plaso Project Authors.
# Please see the AUTHORS file for details on individual authors.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
@@ -0,0 +1,202 @@
#!/usr/bin/python
# -*- coding: utf-8 -*-
#
# Copyright 2014 The Plaso Project Authors.
# Please see the AUTHORS file for details on individual authors.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""The file format classifier."""
# TODO: rewrite most of the classifier in C and integrate with the code in:
# plaso/classifier
import gzip
import logging
import os
import tarfile
import zipfile
import zlib
from dfvfs.lib import definitions
from dfvfs.path import factory as path_spec_factory
from dfvfs.resolver import resolver as path_spec_resolver
from plaso.lib import errors
class Classifier(object):
"""Class that defines the file format classifier."""
_MAGIC_VALUES = {
'ZIP': {'length': 4, 'offset': 0, 'values': ['P', 'K', '\x03', '\x04']},
'TAR': {'length': 5, 'offset': 257, 'values': ['u', 's', 't', 'a', 'r']},
'GZ': {'length': 2, 'offset': 0, 'values': ['\x1f', '\x8b']},
}
# TODO: Remove this logic when the classifier is ready.
# This is only used temporarily until files can be classified.
magic_max_length = 0
# Defines the maximum depth into a file (for SmartOpenFiles).
MAX_FILE_DEPTH = 3
@classmethod
def _SmartOpenFile(cls, file_entry):
"""Return a generator for all pathspec protobufs extracted from a file.
If the file is compressed then extract all members and include
them into the processing queue.
Args:
file_entry: The file entry object.
Yields:
A path specification (instance of dfvfs.PathSpec) of embedded file
entries.
"""
file_object = file_entry.GetFileObject()
# TODO: Remove when the classifier gets deployed. Then we
# call the classifier here and use that for the definition (and
# then we forward the classifier definition in the pathspec
# protobuf).
file_object.seek(0, os.SEEK_SET)
if not cls.magic_max_length:
for magic_value in cls._MAGIC_VALUES.values():
cls.magic_max_length = max(
cls.magic_max_length,
magic_value['length'] + magic_value['offset'])
header = file_object.read(cls.magic_max_length)
file_classification = ''
# Go over each magic value defined and compare the bytes read
# at the corresponding offset. If all bytes match we have identified
# the file format and can move on.
for m_value, m_dict in cls._MAGIC_VALUES.items():
length = m_dict['length'] + m_dict['offset']
if len(header) < length:
continue
offset = m_dict['offset']
magic = m_dict['values']
if header[offset:offset + len(magic)] == ''.join(magic):
file_classification = m_value
break
# TODO: refactor the file type specific code into sub functions.
if file_classification == 'ZIP':
try:
file_object.seek(0, os.SEEK_SET)
zip_file = zipfile.ZipFile(file_object, 'r')
# TODO: Make this a more "sane" check, and perhaps
# not entirely skip the file if it has this particular
# ending, but for now, this both slows the tool down
# considerably and also makes it more unstable.
_, _, filename_extension = file_entry.name.rpartition(u'.')
if filename_extension in [u'jar', u'sym', u'xpi']:
file_object.close()
logging.debug(
u'Unsupported ZIP sub type: {0:s} detected in file: {1:s}'.format(
filename_extension, file_entry.path_spec.comparable))
return
for info in zip_file.infolist():
if info.file_size > 0:
logging.debug(
u'Including: {0:s} from ZIP into process queue.'.format(
info.filename))
yield path_spec_factory.Factory.NewPathSpec(
definitions.TYPE_INDICATOR_ZIP, location=info.filename,
parent=file_entry.path_spec)
except zipfile.BadZipfile:
pass
elif file_classification == 'GZ':
try:
type_indicator = file_entry.path_spec.type_indicator
if type_indicator == definitions.TYPE_INDICATOR_GZIP:
raise errors.SameFileType
file_object.seek(0, os.SEEK_SET)
gzip_file = gzip.GzipFile(fileobj=file_object, mode='rb')
_ = gzip_file.read(4)
gzip_file.close()
logging.debug((
u'Including: {0:s} as GZIP compressed stream into process '
u'queue.').format(file_entry.name))
yield path_spec_factory.Factory.NewPathSpec(
definitions.TYPE_INDICATOR_GZIP, parent=file_entry.path_spec)
except (IOError, zlib.error, errors.SameFileType):
pass
# TODO: Add BZ2 support.
elif file_classification == 'TAR':
try:
file_object.seek(0, os.SEEK_SET)
tar_file = tarfile.open(fileobj=file_object, mode='r')
for name_info in tar_file.getmembers():
if not name_info.isfile():
continue
name = name_info.path
logging.debug(
u'Including: {0:s} from TAR into process queue.'.format(name))
yield path_spec_factory.Factory.NewPathSpec(
definitions.TYPE_INDICATOR_TAR, location=name,
parent=file_entry.path_spec)
except tarfile.ReadError:
pass
file_object.close()
@classmethod
def SmartOpenFiles(cls, file_entry, depth=0):
"""Generate a list of all available PathSpecs extracted from a file.
Args:
file_entry: A file entry object.
depth: Incrementing number that defines the current depth into
a file (file inside a ZIP file is depth 1, file inside a tar.gz
would be of depth 2).
Yields:
A file entry object (instance of dfvfs.FileEntry).
"""
if depth >= cls.MAX_FILE_DEPTH:
return
for path_spec in cls._SmartOpenFile(file_entry):
sub_file_entry = path_spec_resolver.Resolver.OpenFileEntry(path_spec)
if sub_file_entry is None:
logging.debug(
u'Unable to open file: {0:s}'.format(path_spec.comparable))
continue
yield sub_file_entry
depth += 1
for sub_file_entry in cls.SmartOpenFiles(sub_file_entry, depth=depth):
yield sub_file_entry
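A short usage sketch of the classifier above, not part of the commit: SmartOpenFiles takes a dfvfs file entry and yields sub file entries for the members of supported containers. The local archive path is hypothetical.
from dfvfs.lib import definitions
from dfvfs.path import factory as path_spec_factory
from dfvfs.resolver import resolver as path_spec_resolver
# Hypothetical local archive; any resolvable OS path specification works.
os_path_spec = path_spec_factory.Factory.NewPathSpec(
    definitions.TYPE_INDICATOR_OS, location=u'/tmp/sample.zip')
file_entry = path_spec_resolver.Resolver.OpenFileEntry(os_path_spec)
# Yields dfvfs file entries for embedded files, recursing up to
# Classifier.MAX_FILE_DEPTH levels (a file inside a tar.gz is depth 2).
for sub_file_entry in Classifier.SmartOpenFiles(file_entry):
  print sub_file_entry.path_spec.comparable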
@@ -0,0 +1,421 @@
#!/usr/bin/python
# -*- coding: utf-8 -*-
#
# Copyright 2013 The Plaso Project Authors.
# Please see the AUTHORS file for details on individual authors.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Generic collector that supports both file system and image files."""
import hashlib
import logging
import os
from dfvfs.helpers import file_system_searcher
from dfvfs.lib import definitions as dfvfs_definitions
from dfvfs.lib import errors as dfvfs_errors
from dfvfs.path import factory as path_spec_factory
from dfvfs.resolver import resolver as path_spec_resolver
from plaso.engine import queue
from plaso.lib import errors
class Collector(queue.ItemQueueProducer):
"""Class that implements a collector object."""
def __init__(
self, process_queue, source_path, source_path_spec,
resolver_context=None):
"""Initializes the collector object.
The collector discovers all the files that need to be processed by
the workers. Once a file is discovered it is added to the process queue
as a path specification (instance of dfvfs.PathSpec).
Args:
process_queue: The process queue (instance of Queue). This queue contains
the file entries that need to be processed.
source_path: Path of the source file or directory.
source_path_spec: The source path specification (instance of
dfvfs.PathSpec) as determined by the file system
scanner. The default is None.
resolver_context: Optional resolver context (instance of dfvfs.Context).
The default is None.
"""
super(Collector, self).__init__(process_queue)
self._filter_find_specs = None
self._fs_collector = FileSystemCollector(process_queue)
self._resolver_context = resolver_context
# TODO: remove the need to pass source_path
self._source_path = os.path.abspath(source_path)
self._source_path_spec = source_path_spec
self._vss_stores = None
def __enter__(self):
"""Enters a with statement."""
return self
def __exit__(self, unused_type, unused_value, unused_traceback):
"""Exits a with statement."""
return
def _ProcessImage(self, volume_path_spec, find_specs=None):
"""Processes a volume within a storage media image.
Args:
volume_path_spec: The path specification of the volume containing
the file system.
find_specs: Optional list of find specifications (instances of
dfvfs.FindSpec). The default is None.
"""
if find_specs:
logging.debug(u'Collecting from image file: {0:s} with filter'.format(
self._source_path))
else:
logging.debug(u'Collecting from image file: {0:s}'.format(
self._source_path))
path_spec = path_spec_factory.Factory.NewPathSpec(
dfvfs_definitions.TYPE_INDICATOR_TSK, location=u'/',
parent=volume_path_spec)
try:
file_system = path_spec_resolver.Resolver.OpenFileSystem(
path_spec, resolver_context=self._resolver_context)
except IOError as exception:
logging.error(
u'Unable to open file system with error: {0:s}'.format(exception))
return
try:
self._fs_collector.Collect(
file_system, path_spec, find_specs=find_specs)
except (dfvfs_errors.AccessError, dfvfs_errors.BackEndError) as exception:
logging.warning(u'{0:s}'.format(exception))
if find_specs:
logging.debug(u'Collection from image with filter FAILED.')
else:
logging.debug(u'Collection from image FAILED.')
return
if self._abort:
return
if self._vss_stores:
self._ProcessVSS(volume_path_spec, find_specs=find_specs)
if find_specs:
logging.debug(u'Collection from image with filter COMPLETED.')
else:
logging.debug(u'Collection from image COMPLETED.')
def _ProcessVSS(self, volume_path_spec, find_specs=None):
"""Processes a VSS volume within a storage media image.
Args:
volume_path_spec: The path specification of the volume containing
the file system.
find_specs: Optional list of find specifications (instances of
dfvfs.FindSpec). The default is None.
"""
logging.info(u'Processing VSS.')
vss_path_spec = path_spec_factory.Factory.NewPathSpec(
dfvfs_definitions.TYPE_INDICATOR_VSHADOW, location=u'/',
parent=volume_path_spec)
vss_file_entry = path_spec_resolver.Resolver.OpenFileEntry(
vss_path_spec, resolver_context=self._resolver_context)
number_of_vss = vss_file_entry.number_of_sub_file_entries
# In plaso 1 represents the first store index, while in dfvfs and
# pyvshadow 0 represents the first store index, so 1 is subtracted.
vss_store_range = [store_nr - 1 for store_nr in self._vss_stores]
for store_index in vss_store_range:
if self._abort:
return
if find_specs:
logging.info((
u'Collecting from VSS volume: {0:d} out of: {1:d} '
u'with filter').format(store_index + 1, number_of_vss))
else:
logging.info(u'Collecting from VSS volume: {0:d} out of: {1:d}'.format(
store_index + 1, number_of_vss))
vss_path_spec = path_spec_factory.Factory.NewPathSpec(
dfvfs_definitions.TYPE_INDICATOR_VSHADOW, store_index=store_index,
parent=volume_path_spec)
path_spec = path_spec_factory.Factory.NewPathSpec(
dfvfs_definitions.TYPE_INDICATOR_TSK, location=u'/',
parent=vss_path_spec)
file_system = path_spec_resolver.Resolver.OpenFileSystem(
path_spec, resolver_context=self._resolver_context)
try:
self._fs_collector.Collect(
file_system, path_spec, find_specs=find_specs)
except (dfvfs_errors.AccessError, dfvfs_errors.BackEndError) as exception:
logging.warning(u'{0:s}'.format(exception))
if find_specs:
logging.debug(
u'Collection from VSS store: {0:d} with filter FAILED.'.format(
store_index + 1))
else:
logging.debug(u'Collection from VSS store: {0:d} FAILED.'.format(
store_index + 1))
return
if find_specs:
logging.debug(
u'Collection from VSS store: {0:d} with filter COMPLETED.'.format(
store_index + 1))
else:
logging.debug(u'Collection from VSS store: {0:d} COMPLETED.'.format(
store_index + 1))
def Collect(self):
"""Collects files from the source."""
source_file_entry = path_spec_resolver.Resolver.OpenFileEntry(
self._source_path_spec, resolver_context=self._resolver_context)
if not source_file_entry:
logging.warning(u'No files to collect.')
self.SignalEndOfInput()
return
if (not source_file_entry.IsDirectory() and
not source_file_entry.IsFile() and
not source_file_entry.IsDevice()):
raise errors.CollectorError(
u'Source path: {0:s} not a device, file or directory.'.format(
self._source_path))
type_indicator = self._source_path_spec.type_indicator
if type_indicator == dfvfs_definitions.TYPE_INDICATOR_OS:
if source_file_entry.IsFile():
self.ProduceItem(self._source_path_spec)
else:
file_system = path_spec_resolver.Resolver.OpenFileSystem(
self._source_path_spec, resolver_context=self._resolver_context)
try:
self._fs_collector.Collect(
file_system, self._source_path_spec,
find_specs=self._filter_find_specs)
except (dfvfs_errors.AccessError,
dfvfs_errors.BackEndError) as exception:
logging.warning(u'{0:s}'.format(exception))
else:
self._ProcessImage(
self._source_path_spec.parent, find_specs=self._filter_find_specs)
self.SignalEndOfInput()
def SetCollectDirectoryMetadata(self, collect_directory_metadata):
"""Sets the collect directory metadata flag.
Args:
collect_directory_metadata: Boolean value to indicate to collect
directory metadata.
"""
self._fs_collector.SetCollectDirectoryMetadata(collect_directory_metadata)
def SetFilter(self, filter_find_specs):
"""Sets the collection filter find specifications.
Args:
filter_find_specs: List of filter find specifications (instances of
dfvfs.FindSpec).
"""
self._filter_find_specs = filter_find_specs
def SetVssInformation(self, vss_stores):
"""Sets the Volume Shadow Snapshots (VSS) information.
This function will enable VSS collection.
Args:
vss_stores: The range of VSS stores to include in the collection,
where 1 represents the first store.
"""
self._vss_stores = vss_stores
def SignalAbort(self):
"""Signals the producer to abort."""
super(Collector, self).SignalAbort()
self._fs_collector.SignalAbort()
class FileSystemCollector(queue.ItemQueueProducer):
"""Class that implements a file system collector object."""
def __init__(self, process_queue):
"""Initializes the collector object.
The collector discovers all the files that need to be processed by
the workers. Once a file is discovered it is added to the process queue
as a path specification (instance of dfvfs.PathSpec).
Args:
process_queue: The process queue (instance of Queue). This queue contains
the file entries that need to be processed.
"""
super(FileSystemCollector, self).__init__(process_queue)
self._collect_directory_metadata = True
self._duplicate_file_check = False
self._hashlist = {}
self.number_of_file_entries = 0
def __enter__(self):
"""Enters a with statement."""
return self
def __exit__(self, unused_type, unused_value, unused_traceback):
"""Exits a with statement."""
return
def _CalculateNTFSTimeHash(self, file_entry):
"""Return a hash value calculated from a NTFS file's metadata.
Args:
file_entry: The file entry (instance of TSKFileEntry).
Returns:
A hash value (string) that can be used to determine if a file's timestamp
value has changed.
"""
stat_object = file_entry.GetStat()
ret_hash = hashlib.md5()
ret_hash.update('atime:{0:d}.{1:d}'.format(
getattr(stat_object, 'atime', 0),
getattr(stat_object, 'atime_nano', 0)))
ret_hash.update('crtime:{0:d}.{1:d}'.format(
getattr(stat_object, 'crtime', 0),
getattr(stat_object, 'crtime_nano', 0)))
ret_hash.update('mtime:{0:d}.{1:d}'.format(
getattr(stat_object, 'mtime', 0),
getattr(stat_object, 'mtime_nano', 0)))
ret_hash.update('ctime:{0:d}.{1:d}'.format(
getattr(stat_object, 'ctime', 0),
getattr(stat_object, 'ctime_nano', 0)))
return ret_hash.hexdigest()
def _ProcessDirectory(self, file_entry):
"""Processes a directory and extract its metadata if necessary."""
# Need to do a breadth-first search otherwise we'll hit the Python
# maximum recursion depth.
sub_directories = []
for sub_file_entry in file_entry.sub_file_entries:
if self._abort:
return
try:
if not sub_file_entry.IsAllocated() or sub_file_entry.IsLink():
continue
except dfvfs_errors.BackEndError as exception:
logging.warning(
u'Unable to process file: {0:s} with error: {1:s}'.format(
sub_file_entry.path_spec.comparable.replace(
u'\n', u';'), exception))
continue
# For TSK-based file entries only, ignore the virtual /$OrphanFiles
# directory.
if sub_file_entry.type_indicator == dfvfs_definitions.TYPE_INDICATOR_TSK:
if file_entry.IsRoot() and sub_file_entry.name == u'$OrphanFiles':
continue
if sub_file_entry.IsDirectory():
# This check is here to improve performance by not producing
# path specifications that don't get processed.
if self._collect_directory_metadata:
self.ProduceItem(sub_file_entry.path_spec)
self.number_of_file_entries += 1
sub_directories.append(sub_file_entry)
elif sub_file_entry.IsFile():
# If we are dealing with a VSS we want to calculate a hash
# value based on available timestamps, compare that to previously
# calculated hash values, and only include the file in the queue if
# the hash does not match.
if self._duplicate_file_check:
hash_value = self._CalculateNTFSTimeHash(sub_file_entry)
inode = getattr(sub_file_entry.path_spec, 'inode', 0)
if inode in self._hashlist:
if hash_value in self._hashlist[inode]:
continue
self._hashlist.setdefault(inode, []).append(hash_value)
self.ProduceItem(sub_file_entry.path_spec)
self.number_of_file_entries += 1
for sub_file_entry in sub_directories:
if self._abort:
return
try:
self._ProcessDirectory(sub_file_entry)
except (dfvfs_errors.AccessError, dfvfs_errors.BackEndError) as exception:
logging.warning(u'{0:s}'.format(exception))
def Collect(self, file_system, path_spec, find_specs=None):
"""Collects files from the file system.
Args:
file_system: The file system (instance of dfvfs.FileSystem).
path_spec: The path specification (instance of dfvfs.PathSpec).
find_specs: Optional list of find specifications (instances of
dfvfs.FindSpec). The default is None.
"""
if find_specs:
searcher = file_system_searcher.FileSystemSearcher(file_system, path_spec)
for path_spec in searcher.Find(find_specs=find_specs):
if self._abort:
return
self.ProduceItem(path_spec)
self.number_of_file_entries += 1
else:
file_entry = file_system.GetFileEntryByPathSpec(path_spec)
self._ProcessDirectory(file_entry)
def SetCollectDirectoryMetadata(self, collect_directory_metadata):
"""Sets the collect directory metadata flag.
Args:
collect_directory_metadata: Boolean value to indicate to collect
directory metadata.
"""
self._collect_directory_metadata = collect_directory_metadata
@@ -0,0 +1,354 @@
#!/usr/bin/python
# -*- coding: utf-8 -*-
#
# Copyright 2013 The Plaso Project Authors.
# Please see the AUTHORS file for details on individual authors.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""The unit tests for the generic collector object."""
import logging
import os
import shutil
import tempfile
import unittest
from dfvfs.helpers import file_system_searcher
from dfvfs.lib import definitions as dfvfs_definitions
from dfvfs.path import factory as path_spec_factory
from dfvfs.resolver import context
from dfvfs.resolver import resolver as path_spec_resolver
from plaso.engine import collector
from plaso.engine import queue
from plaso.engine import single_process
from plaso.engine import utils as engine_utils
class TempDirectory(object):
"""A self cleaning temporary directory."""
def __init__(self):
"""Initializes the temporary directory."""
super(TempDirectory, self).__init__()
self.name = u''
def __enter__(self):
"""Make this work with the 'with' statement."""
self.name = tempfile.mkdtemp()
return self.name
def __exit__(self, unused_type, unused_value, unused_traceback):
"""Make this work with the 'with' statement."""
shutil.rmtree(self.name, True)
class TestCollectorQueueConsumer(queue.ItemQueueConsumer):
"""Class that implements a test collector queue consumer."""
def __init__(self, queue_object):
"""Initializes the queue consumer.
Args:
queue_object: the queue object (instance of Queue).
"""
super(TestCollectorQueueConsumer, self).__init__(queue_object)
self.path_specs = []
def _ConsumeItem(self, path_spec):
"""Consumes an item callback for ConsumeItems.
Args:
path_spec: a path specification (instance of dfvfs.PathSpec).
"""
self.path_specs.append(path_spec)
@property
def number_of_path_specs(self):
"""The number of path specifications."""
return len(self.path_specs)
def GetFilePaths(self):
"""Retrieves a list of file paths from the path specifications."""
file_paths = []
for path_spec in self.path_specs:
location = getattr(path_spec, 'location', None)
if location is not None:
file_paths.append(location)
return file_paths
class CollectorTestCase(unittest.TestCase):
"""The collector test case."""
_TEST_DATA_PATH = os.path.join(os.getcwd(), u'test_data')
# Show full diff results; maxDiff is part of TestCase so it does not
# follow our naming conventions.
maxDiff = None
def _GetTestFilePath(self, path_segments):
"""Retrieves the path of a test file relative to the test data directory.
Args:
path_segments: the path segments inside the test data directory.
Returns:
A path of the test file.
"""
# Note that we need to pass the individual path segments to os.path.join
# and not a list.
return os.path.join(self._TEST_DATA_PATH, *path_segments)
class CollectorTest(CollectorTestCase):
"""Tests for the collector."""
def testFileSystemCollection(self):
"""Test collection on the file system."""
test_files = [
self._GetTestFilePath([u'syslog.tgz']),
self._GetTestFilePath([u'syslog.zip']),
self._GetTestFilePath([u'syslog.bz2']),
self._GetTestFilePath([u'wtmp.1'])]
with TempDirectory() as dirname:
for a_file in test_files:
shutil.copy(a_file, dirname)
path_spec = path_spec_factory.Factory.NewPathSpec(
dfvfs_definitions.TYPE_INDICATOR_OS, location=dirname)
test_collection_queue = single_process.SingleProcessQueue()
resolver_context = context.Context()
test_collector = collector.Collector(
test_collection_queue, dirname, path_spec,
resolver_context=resolver_context)
test_collector.Collect()
test_collector_queue_consumer = TestCollectorQueueConsumer(
test_collection_queue)
test_collector_queue_consumer.ConsumeItems()
self.assertEquals(test_collector_queue_consumer.number_of_path_specs, 4)
def testFileSystemWithFilterCollection(self):
"""Test collection on the file system with a filter."""
dirname = u'.'
path_spec = path_spec_factory.Factory.NewPathSpec(
dfvfs_definitions.TYPE_INDICATOR_OS, location=dirname)
filter_name = ''
with tempfile.NamedTemporaryFile(delete=False) as temp_file:
filter_name = temp_file.name
temp_file.write('/test_data/testdir/filter_.+.txt\n')
temp_file.write('/test_data/.+evtx\n')
temp_file.write('/AUTHORS\n')
temp_file.write('/does_not_exist/some_file_[0-9]+txt\n')
test_collection_queue = single_process.SingleProcessQueue()
resolver_context = context.Context()
test_collector = collector.Collector(
test_collection_queue, dirname, path_spec,
resolver_context=resolver_context)
find_specs = engine_utils.BuildFindSpecsFromFile(filter_name)
test_collector.SetFilter(find_specs)
test_collector.Collect()
test_collector_queue_consumer = TestCollectorQueueConsumer(
test_collection_queue)
test_collector_queue_consumer.ConsumeItems()
try:
os.remove(filter_name)
except (OSError, IOError) as exception:
logging.warning((
u'Unable to remove temporary file: {0:s} with error: {1:s}').format(
filter_name, exception))
# Two files matching test_data/testdir/filter_*.txt, AUTHORS
# and test_data/System.evtx: 4 files in total.
self.assertEquals(test_collector_queue_consumer.number_of_path_specs, 4)
paths = test_collector_queue_consumer.GetFilePaths()
current_directory = os.getcwd()
expected_path = os.path.join(
current_directory, u'test_data', u'testdir', u'filter_1.txt')
self.assertTrue(expected_path in paths)
expected_path = os.path.join(
current_directory, u'test_data', u'testdir', u'filter_2.txt')
self.assertFalse(expected_path in paths)
expected_path = os.path.join(
current_directory, u'test_data', u'testdir', u'filter_3.txt')
self.assertTrue(expected_path in paths)
expected_path = os.path.join(
current_directory, u'AUTHORS')
self.assertTrue(expected_path in paths)
def testImageCollection(self):
"""Test collection on a storage media image file.
This image has two files:
+ logs/hidden.zip
+ logs/sys.tgz
The hidden.zip file contains one file, syslog, which is the same
file that is stored in sys.tgz.
The end result should therefore be:
+ logs/hidden.zip (unchanged)
+ logs/hidden.zip:syslog (the text file extracted out)
+ logs/sys.tgz (unchanged)
+ logs/sys.tgz (read as a GZIP file, so not compressed)
+ logs/sys.tgz:syslog.gz (A GZIP file from the TAR container)
+ logs/sys.tgz:syslog.gz:syslog (the extracted syslog file)
This means that the collection script should collect 6 files in total.
"""
test_file = self._GetTestFilePath([u'syslog_image.dd'])
volume_path_spec = path_spec_factory.Factory.NewPathSpec(
dfvfs_definitions.TYPE_INDICATOR_OS, location=test_file)
path_spec = path_spec_factory.Factory.NewPathSpec(
dfvfs_definitions.TYPE_INDICATOR_TSK, location=u'/',
parent=volume_path_spec)
test_collection_queue = single_process.SingleProcessQueue()
resolver_context = context.Context()
test_collector = collector.Collector(
test_collection_queue, test_file, path_spec,
resolver_context=resolver_context)
test_collector.Collect()
test_collector_queue_consumer = TestCollectorQueueConsumer(
test_collection_queue)
test_collector_queue_consumer.ConsumeItems()
self.assertEquals(test_collector_queue_consumer.number_of_path_specs, 3)
def testImageWithFilterCollection(self):
"""Test collection on a storage media image file with a filter."""
test_file = self._GetTestFilePath([u'ímynd.dd'])
volume_path_spec = path_spec_factory.Factory.NewPathSpec(
dfvfs_definitions.TYPE_INDICATOR_OS, location=test_file)
path_spec = path_spec_factory.Factory.NewPathSpec(
dfvfs_definitions.TYPE_INDICATOR_TSK, location=u'/',
parent=volume_path_spec)
filter_name = ''
with tempfile.NamedTemporaryFile(delete=False) as temp_file:
filter_name = temp_file.name
temp_file.write('/a_directory/.+zip\n')
temp_file.write('/a_directory/another.+\n')
temp_file.write('/passwords.txt\n')
test_collection_queue = single_process.SingleProcessQueue()
resolver_context = context.Context()
test_collector = collector.Collector(
test_collection_queue, test_file, path_spec,
resolver_context=resolver_context)
find_specs = engine_utils.BuildFindSpecsFromFile(filter_name)
test_collector.SetFilter(find_specs)
test_collector.Collect()
test_collector_queue_consumer = TestCollectorQueueConsumer(
test_collection_queue)
test_collector_queue_consumer.ConsumeItems()
try:
os.remove(filter_name)
except (OSError, IOError) as exception:
logging.warning((
u'Unable to remove temporary file: {0:s} with error: {1:s}').format(
filter_name, exception))
self.assertEquals(test_collector_queue_consumer.number_of_path_specs, 2)
paths = test_collector_queue_consumer.GetFilePaths()
# path_specs[0]
# type: TSK
# file_path: '/a_directory/another_file'
# container_path: 'test_data/ímynd.dd'
# image_offset: 0
self.assertEquals(paths[0], u'/a_directory/another_file')
# path_specs[1]
# type: TSK
# file_path: '/passwords.txt'
# container_path: 'test_data/ímynd.dd'
# image_offset: 0
self.assertEquals(paths[1], u'/passwords.txt')
class BuildFindSpecsFromFileTest(unittest.TestCase):
"""Tests for the BuildFindSpecsFromFile function."""
def testBuildFindSpecsFromFile(self):
"""Tests the BuildFindSpecsFromFile function."""
filter_name = ''
with tempfile.NamedTemporaryFile(delete=False) as temp_file:
filter_name = temp_file.name
# 2 hits.
temp_file.write('/test_data/testdir/filter_.+.txt\n')
# A single hit.
temp_file.write('/test_data/.+evtx\n')
# A single hit.
temp_file.write('/AUTHORS\n')
temp_file.write('/does_not_exist/some_file_[0-9]+txt\n')
# This should not compile properly since it is missing file information.
temp_file.write('failing/\n')
# This should not fail during initial loading, but fail later on.
temp_file.write('bad re (no close on that parenthesis/file\n')
find_specs = engine_utils.BuildFindSpecsFromFile(filter_name)
try:
os.remove(filter_name)
except (OSError, IOError) as exception:
logging.warning(
u'Unable to remove temporary file: {0:s} with error: {1:s}'.format(
filter_name, exception))
self.assertEquals(len(find_specs), 4)
dirname = u'.'
path_spec = path_spec_factory.Factory.NewPathSpec(
dfvfs_definitions.TYPE_INDICATOR_OS, location=dirname)
file_system = path_spec_resolver.Resolver.OpenFileSystem(path_spec)
searcher = file_system_searcher.FileSystemSearcher(
file_system, path_spec)
path_spec_generator = searcher.Find(find_specs=find_specs)
self.assertNotEquals(path_spec_generator, None)
path_specs = list(path_spec_generator)
# One evtx, one AUTHORS, two filter_*.txt files, total 4 files.
self.assertEquals(len(path_specs), 4)
with self.assertRaises(IOError):
_ = engine_utils.BuildFindSpecsFromFile('thisfiledoesnotexist')
if __name__ == '__main__':
unittest.main()
@@ -0,0 +1,319 @@
#!/usr/bin/python
# -*- coding: utf-8 -*-
#
# Copyright 2012 The Plaso Project Authors.
# Please see the AUTHORS file for details on individual authors.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""The processing engine."""
import abc
import logging
from dfvfs.helpers import file_system_searcher
from dfvfs.lib import definitions as dfvfs_definitions
from dfvfs.resolver import resolver as path_spec_resolver
from plaso.artifacts import knowledge_base
from plaso.engine import collector
from plaso.engine import queue
from plaso.lib import errors
from plaso.preprocessors import interface as preprocess_interface
from plaso.preprocessors import manager as preprocess_manager
class BaseEngine(object):
"""Class that defines the processing engine base."""
def __init__(self, collection_queue, storage_queue, parse_error_queue):
"""Initialize the engine object.
Args:
collection_queue: the collection queue object (instance of Queue).
storage_queue: the storage queue object (instance of Queue).
parse_error_queue: the parser error queue object (instance of Queue).
"""
self._collection_queue = collection_queue
self._enable_debug_output = False
self._enable_profiling = False
self._event_queue_producer = queue.ItemQueueProducer(storage_queue)
self._filter_object = None
self._mount_path = None
self._open_files = False
self._parse_error_queue = parse_error_queue
self._parse_error_queue_producer = queue.ItemQueueProducer(
parse_error_queue)
self._profiling_sample_rate = 1000
self._source = None
self._source_path_spec = None
self._source_file_entry = None
self._text_prepend = None
self.knowledge_base = knowledge_base.KnowledgeBase()
self.storage_queue = storage_queue
def CreateCollector(
self, include_directory_stat, vss_stores=None, filter_find_specs=None,
resolver_context=None):
"""Creates a collector object.
The collector discovers all the files that need to be processed by
the workers. Once a file is discovered it is added to the process queue
as a path specification (instance of dfvfs.PathSpec).
Args:
include_directory_stat: Boolean value to indicate whether directory
stat information should be collected.
vss_stores: Optional list of VSS stores to include in the collection,
where 1 represents the first store. Set to None if no
VSS stores should be processed. The default is None.
filter_find_specs: Optional list of filter find specifications (instances
of dfvfs.FindSpec). The default is None.
resolver_context: Optional resolver context (instance of dfvfs.Context).
The default is None. Note that every thread or process
must have its own resolver context.
Returns:
A collector object (instance of Collector).
Raises:
RuntimeError: if source path specification is not set.
"""
if not self._source_path_spec:
raise RuntimeError(u'Missing source.')
collector_object = collector.Collector(
self._collection_queue, self._source, self._source_path_spec,
resolver_context=resolver_context)
collector_object.SetCollectDirectoryMetadata(include_directory_stat)
if vss_stores:
collector_object.SetVssInformation(vss_stores)
if filter_find_specs:
collector_object.SetFilter(filter_find_specs)
return collector_object
@abc.abstractmethod
def CreateExtractionWorker(self, worker_number):
"""Creates an extraction worker object.
Args:
worker_number: A number that identifies the worker.
Returns:
An extraction worker (instance of worker.ExtractionWorker).
"""
def GetSourceFileSystemSearcher(self, resolver_context=None):
"""Retrieves the file system searcher of the source.
Args:
resolver_context: Optional resolver context (instance of dfvfs.Context).
The default is None. Note that every thread or process
must have its own resolver context.
Returns:
The file system searcher object (instance of dfvfs.FileSystemSearcher).
Raises:
RuntimeError: if source path specification is not set.
"""
if not self._source_path_spec:
raise RuntimeError(u'Missing source.')
file_system = path_spec_resolver.Resolver.OpenFileSystem(
self._source_path_spec, resolver_context=resolver_context)
type_indicator = self._source_path_spec.type_indicator
if type_indicator == dfvfs_definitions.TYPE_INDICATOR_OS:
mount_point = self._source_path_spec
else:
mount_point = self._source_path_spec.parent
return file_system_searcher.FileSystemSearcher(file_system, mount_point)
def PreprocessSource(self, platform, resolver_context=None):
"""Preprocesses the source and fills the preprocessing object.
Args:
platform: string that indicates the platform (operating system).
resolver_context: Optional resolver context (instance of dfvfs.Context).
The default is None. Note that every thread or process
must have its own resolver context.
"""
searcher = self.GetSourceFileSystemSearcher(
resolver_context=resolver_context)
if not platform:
platform = preprocess_interface.GuessOS(searcher)
self.knowledge_base.platform = platform
preprocess_manager.PreprocessPluginsManager.RunPlugins(
platform, searcher, self.knowledge_base)
def SetEnableDebugOutput(self, enable_debug_output):
"""Enables or disables debug output.
Args:
enable_debug_output: boolean value to indicate if the debug output
should be enabled.
"""
self._enable_debug_output = enable_debug_output
def SetEnableProfiling(self, enable_profiling, profiling_sample_rate=1000):
"""Enables or disables profiling.
Args:
enable_profiling: boolean value to indicate if the profiling
should be enabled.
profiling_sample_rate: optional integer indicating the profiling sample
rate. The value contains the number of files
processed. The default value is 1000.
"""
self._enable_profiling = enable_profiling
self._profiling_sample_rate = profiling_sample_rate
def SetFilterObject(self, filter_object):
"""Sets the filter object.
Args:
filter_object: the filter object (instance of objectfilter.Filter).
"""
self._filter_object = filter_object
def SetMountPath(self, mount_path):
"""Sets the mount path.
Args:
mount_path: string containing the mount path.
"""
self._mount_path = mount_path
# TODO: rename this mode.
def SetOpenFiles(self, open_files):
"""Sets the open files mode.
Args:
open_files: boolean value to indicate if the worker should scan for
file entries inside files.
"""
self._open_files = open_files
def SetSource(self, source_path_spec, resolver_context=None):
"""Sets the source.
Args:
source_path_spec: The source path specification (instance of
dfvfs.PathSpec) as determined by the file system
scanner. The default is None.
resolver_context: Optional resolver context (instance of dfvfs.Context).
The default is None. Note that every thread or process
must have its own resolver context.
Raises:
BadConfigOption: if source cannot be set.
"""
path_spec = source_path_spec
while path_spec.parent:
path_spec = path_spec.parent
# Note that source should be used for output purposes only.
self._source = getattr(path_spec, 'location', u'')
self._source_path_spec = source_path_spec
self._source_file_entry = path_spec_resolver.Resolver.OpenFileEntry(
self._source_path_spec, resolver_context=resolver_context)
if not self._source_file_entry:
raise errors.BadConfigOption(
u'No such device, file or directory: {0:s}.'.format(self._source))
if (not self._source_file_entry.IsDirectory() and
not self._source_file_entry.IsFile() and
not self._source_file_entry.IsDevice()):
raise errors.CollectorError(
u'Source path: {0:s} not a device, file or directory.'.format(
self._source))
if self._source_path_spec.type_indicator in [
dfvfs_definitions.TYPE_INDICATOR_OS,
dfvfs_definitions.TYPE_INDICATOR_FAKE]:
if self._source_file_entry.IsFile():
logging.debug(u'Starting a collection on a single file.')
# No need for multiple workers when parsing a single file.
elif not self._source_file_entry.IsDirectory():
raise errors.BadConfigOption(
u'Source: {0:s} has to be a file or directory.'.format(
self._source))
# TODO: remove this functionality.
def SetTextPrepend(self, text_prepend):
"""Sets the text prepend.
Args:
text_prepend: string that contains the text to prepend to every
event object.
"""
self._text_prepend = text_prepend
def SignalAbort(self):
"""Signals the engine to abort."""
logging.warning(u'Signalled abort.')
self._event_queue_producer.SignalEndOfInput()
self._parse_error_queue_producer.SignalEndOfInput()
def SignalEndOfInputStorageQueue(self):
"""Signals the storage queue no input remains."""
self._event_queue_producer.SignalEndOfInput()
self._parse_error_queue_producer.SignalEndOfInput()
def SourceIsDirectory(self):
"""Determines if the source is a directory.
Raises:
RuntimeError: if source path specification is not set.
"""
if not self._source_file_entry:
raise RuntimeError(u'Missing source.')
return (not self.SourceIsStorageMediaImage() and
self._source_file_entry.IsDirectory())
def SourceIsFile(self):
"""Determines if the source is a file.
Raises:
RuntimeError: if source path specification is not set.
"""
if not self._source_file_entry:
raise RuntimeError(u'Missing source.')
return (not self.SourceIsStorageMediaImage() and
self._source_file_entry.IsFile())
def SourceIsStorageMediaImage(self):
"""Determines if the source is storage media image file or device.
Raises:
RuntimeError: if source path specification is not set.
"""
if not self._source_path_spec:
raise RuntimeError(u'Missing source.')
return self._source_path_spec.type_indicator not in [
dfvfs_definitions.TYPE_INDICATOR_OS,
dfvfs_definitions.TYPE_INDICATOR_FAKE]
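The docstrings above imply a call order: SetSource must succeed before CreateCollector or GetSourceFileSystemSearcher, both of which raise RuntimeError otherwise. A minimal setup sketch, not part of the commit, using the SingleProcessEngine defined in a later file of this commit; the image path is hypothetical.
from dfvfs.lib import definitions as dfvfs_definitions
from dfvfs.path import factory as path_spec_factory
from dfvfs.resolver import context
from plaso.engine import single_process
resolver_context = context.Context()
engine_object = single_process.SingleProcessEngine()
# Hypothetical storage media image; a TSK path specification chained
# onto an OS path specification marks the source as an image.
os_path_spec = path_spec_factory.Factory.NewPathSpec(
    dfvfs_definitions.TYPE_INDICATOR_OS, location=u'/cases/image.dd')
source_path_spec = path_spec_factory.Factory.NewPathSpec(
    dfvfs_definitions.TYPE_INDICATOR_TSK, location=u'/',
    parent=os_path_spec)
engine_object.SetSource(
    source_path_spec, resolver_context=resolver_context)
# SourceIsStorageMediaImage() is now True, so the collector will
# process the image (and optionally its VSS stores).
collector_object = engine_object.CreateCollector(
    False, resolver_context=resolver_context)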
@@ -0,0 +1,204 @@
#!/usr/bin/python
# -*- coding: utf-8 -*-
#
# Copyright 2012 The Plaso Project Authors.
# Please see the AUTHORS file for details on individual authors.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Queue management implementation for Plaso.
This file contains an implementation of a queue used by plaso for
queue management.
The queue has been abstracted in order to provide support for different
implementations of the queueing mechanism, to support multi processing and
scalability.
"""
import abc
from plaso.lib import errors
class QueueEndOfInput(object):
"""Class that implements a queue end of input."""
class Queue(object):
"""Class that implements the queue interface."""
@abc.abstractmethod
def __len__(self):
"""Returns the estimated current number of items in the queue."""
@abc.abstractmethod
def IsEmpty(self):
"""Determines if the queue is empty."""
@abc.abstractmethod
def PushItem(self, item):
"""Pushes an item onto the queue."""
@abc.abstractmethod
def PopItem(self):
"""Pops an item off the queue."""
def SignalEndOfInput(self):
"""Signals the queue no input remains."""
self.PushItem(QueueEndOfInput())
class QueueConsumer(object):
"""Class that implements the queue consumer interface.
The consumer subscribes to updates on the queue.
"""
def __init__(self, queue_object):
"""Initializes the queue consumer.
Args:
queue_object: the queue object (instance of Queue).
"""
super(QueueConsumer, self).__init__()
self._abort = False
self._queue = queue_object
def SignalAbort(self):
"""Signals the consumer to abort."""
self._abort = True
class QueueProducer(object):
"""Class that implements the queue producer interface.
The producer generates updates on the queue.
"""
def __init__(self, queue_object):
"""Initializes the queue producer.
Args:
queue_object: the queue object (instance of Queue).
"""
super(QueueProducer, self).__init__()
self._abort = False
self._queue = queue_object
def SignalAbort(self):
"""Signals the producer to abort."""
self._abort = True
def SignalEndOfInput(self):
"""Signals the queue no input remains."""
self._queue.SignalEndOfInput()
class EventObjectQueueConsumer(QueueConsumer):
"""Class that implements the event object queue consumer.
The consumer subscribes to updates on the queue.
"""
@abc.abstractmethod
def _ConsumeEventObject(self, event_object, **kwargs):
"""Consumes an event object callback for ConsumeEventObjects."""
def ConsumeEventObjects(self, **kwargs):
"""Consumes the event object that are pushed on the queue.
This function will issue a callback to _ConsumeEventObject for every
event object (instance of EventObject) consumed from the queue.
Args:
kwargs: keyword arguments to pass to the _ConsumeEventObject callback.
"""
while not self._abort:
try:
item = self._queue.PopItem()
except errors.QueueEmpty:
break
if isinstance(item, QueueEndOfInput):
# Push the item back onto the queue to make sure all
# queue consumers are stopped.
self._queue.PushItem(item)
break
self._ConsumeEventObject(item, **kwargs)
self._abort = False
class ItemQueueConsumer(QueueConsumer):
"""Class that implements an item queue consumer.
The consumer subscribes to updates on the queue.
"""
@abc.abstractmethod
def _ConsumeItem(self, item):
"""Consumes an item callback for ConsumeItems.
Args:
item: the item object.
"""
def ConsumeItems(self):
"""Consumes the items that are pushed on the queue."""
while not self._abort:
try:
item = self._queue.PopItem()
except errors.QueueEmpty:
break
if isinstance(item, QueueEndOfInput):
# Push the item back onto the queue to make sure all
# queue consumers are stopped.
self._queue.PushItem(item)
break
self._ConsumeItem(item)
self._abort = False
class ItemQueueProducer(QueueProducer):
"""Class that implements an item queue producer.
The producer generates updates on the queue.
"""
def _FlushQueue(self):
"""Flushes the queue callback for the QueueFull exception."""
return
def ProduceItem(self, item):
"""Produces an item onto the queue.
Args:
item: the item object.
"""
try:
self._queue.PushItem(item)
except errors.QueueFull:
self._FlushQueue()
def ProduceItems(self, items):
"""Produces items onto the queue.
Args:
items: a list or generator of item objects.
"""
for item in items:
self.ProduceItem(item)
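A minimal sketch of the producer/consumer protocol defined above, not part of the commit, using the SingleProcessQueue from a later file of this commit; ListConsumer is a hypothetical subclass.
from plaso.engine import queue
from plaso.engine import single_process
class ListConsumer(queue.ItemQueueConsumer):
  """Hypothetical consumer that collects consumed items in a list."""
  def __init__(self, queue_object):
    super(ListConsumer, self).__init__(queue_object)
    self.items = []
  def _ConsumeItem(self, item):
    self.items.append(item)
test_queue = single_process.SingleProcessQueue()
producer = queue.ItemQueueProducer(test_queue)
producer.ProduceItems([u'item1', u'item2'])
producer.SignalEndOfInput()  # pushes the QueueEndOfInput sentinel
consumer = ListConsumer(test_queue)
# ConsumeItems stops at the sentinel and pushes it back onto the queue
# so that any other consumers on the same queue also stop.
consumer.ConsumeItems()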
@@ -0,0 +1,366 @@
#!/usr/bin/python
# -*- coding: utf-8 -*-
#
# Copyright 2014 The Plaso Project Authors.
# Please see the AUTHORS file for details on individual authors.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""The single process processing engine."""
import collections
import logging
import pdb
from plaso.engine import collector
from plaso.engine import engine
from plaso.engine import queue
from plaso.engine import worker
from plaso.lib import errors
from plaso.parsers import context as parsers_context
class SingleProcessCollector(collector.Collector):
"""Class that implements a single process collector object."""
def __init__(
self, process_queue, source_path, source_path_spec,
resolver_context=None):
"""Initializes the collector object.
The collector discovers all the files that need to be processed by
the workers. Once a file is discovered it is added to the process queue
as a path specification (instance of dfvfs.PathSpec).
Args:
process_queue: The process queue (instance of Queue). This queue contains
the file entries that need to be processed.
source_path: Path of the source file or directory.
source_path_spec: The source path specification (instance of
dfvfs.PathSpec) as determined by the file system
scanner. The default is None.
resolver_context: Optional resolver context (instance of dfvfs.Context).
The default is None.
"""
super(SingleProcessCollector, self).__init__(
process_queue, source_path, source_path_spec,
resolver_context=resolver_context)
self._extraction_worker = None
self._fs_collector = SingleProcessFileSystemCollector(process_queue)
def _FlushQueue(self):
"""Flushes the queue callback for the QueueFull exception."""
while not self._queue.IsEmpty():
logging.debug(u'Extraction worker started.')
self._extraction_worker.Run()
logging.debug(u'Extraction worker stopped.')
def SetExtractionWorker(self, extraction_worker):
"""Sets the extraction worker.
Args:
extraction_worker: the extraction worker object (instance of
EventExtractionWorker).
"""
self._extraction_worker = extraction_worker
self._fs_collector.SetExtractionWorker(extraction_worker)
class SingleProcessEngine(engine.BaseEngine):
"""Class that defines the single process engine."""
def __init__(self, maximum_number_of_queued_items=0):
"""Initialize the single process engine object.
Args:
maximum_number_of_queued_items: The maximum number of queued items.
The default is 0, which represents
no limit.
"""
collection_queue = SingleProcessQueue(
maximum_number_of_queued_items=maximum_number_of_queued_items)
storage_queue = SingleProcessQueue(
maximum_number_of_queued_items=maximum_number_of_queued_items)
parse_error_queue = SingleProcessQueue(
maximum_number_of_queued_items=maximum_number_of_queued_items)
super(SingleProcessEngine, self).__init__(
collection_queue, storage_queue, parse_error_queue)
self._event_queue_producer = SingleProcessItemQueueProducer(storage_queue)
self._parse_error_queue_producer = SingleProcessItemQueueProducer(
parse_error_queue)
def CreateCollector(
self, include_directory_stat, vss_stores=None, filter_find_specs=None,
resolver_context=None):
"""Creates a collector object.
The collector discovers all the files that need to be processed by
the workers. Once a file is discovered it is added to the process queue
as a path specification (instance of dfvfs.PathSpec).
Args:
include_directory_stat: Boolean value to indicate whether directory
stat information should be collected.
vss_stores: Optional list of VSS stores to include in the collection,
where 1 represents the first store. Set to None if no
VSS stores should be processed. The default is None.
filter_find_specs: Optional list of filter find specifications (instances
of dfvfs.FindSpec). The default is None.
resolver_context: Optional resolver context (instance of dfvfs.Context).
The default is None. Note that every thread or process
must have its own resolver context.
Returns:
A collector object (instance of Collector).
Raises:
RuntimeError: if source path specification is not set.
"""
if not self._source_path_spec:
raise RuntimeError(u'Missing source.')
collector_object = SingleProcessCollector(
self._collection_queue, self._source, self._source_path_spec,
resolver_context=resolver_context)
collector_object.SetCollectDirectoryMetadata(include_directory_stat)
if vss_stores:
collector_object.SetVssInformation(vss_stores)
if filter_find_specs:
collector_object.SetFilter(filter_find_specs)
return collector_object
def CreateExtractionWorker(self, worker_number):
"""Creates an extraction worker object.
Args:
worker_number: A number that identifies the worker.
Returns:
An extraction worker (instance of worker.ExtractionWorker).
"""
parser_context = parsers_context.ParserContext(
self._event_queue_producer, self._parse_error_queue_producer,
self.knowledge_base)
extraction_worker = SingleProcessEventExtractionWorker(
worker_number, self._collection_queue, self._event_queue_producer,
self._parse_error_queue_producer, parser_context)
extraction_worker.SetEnableDebugOutput(self._enable_debug_output)
# TODO: move profiler in separate object.
extraction_worker.SetEnableProfiling(
self._enable_profiling,
profiling_sample_rate=self._profiling_sample_rate)
if self._open_files:
extraction_worker.SetOpenFiles(self._open_files)
if self._filter_object:
extraction_worker.SetFilterObject(self._filter_object)
if self._mount_path:
extraction_worker.SetMountPath(self._mount_path)
if self._text_prepend:
extraction_worker.SetTextPrepend(self._text_prepend)
return extraction_worker
def ProcessSource(
self, collector_object, storage_writer, parser_filter_string=None):
"""Processes the source and extracts event objects.
Args:
collector_object: A collector object (instance of Collector).
storage_writer: A storage writer object (instance of BaseStorageWriter).
parser_filter_string: Optional parser filter string. The default is None.
"""
extraction_worker = self.CreateExtractionWorker(0)
extraction_worker.InitalizeParserObjects(
parser_filter_string=parser_filter_string)
# Set the extraction worker and storage writer values so that they
# can be accessed if the QueueFull exception is raised. This is
# needed in single process mode to prevent the queue consuming too
# much memory.
collector_object.SetExtractionWorker(extraction_worker)
self._event_queue_producer.SetStorageWriter(storage_writer)
self._parse_error_queue_producer.SetStorageWriter(storage_writer)
logging.debug(u'Processing started.')
logging.debug(u'Collection started.')
collector_object.Collect()
logging.debug(u'Collection stopped.')
logging.debug(u'Extraction worker started.')
extraction_worker.Run()
logging.debug(u'Extraction worker stopped.')
self._event_queue_producer.SignalEndOfInput()
logging.debug(u'Storage writer started.')
storage_writer.WriteEventObjects()
logging.debug(u'Storage writer stopped.')
# Reset the extraction worker and storage writer values to return
# the objects in their original state. This will prevent access
# to the extraction worker outside this function and allow it
# to be garbage collected.
self._event_queue_producer.SetStorageWriter(None)
self._parse_error_queue_producer.SetStorageWriter(None)
collector_object.SetExtractionWorker(None)
logging.debug(u'Processing completed.')
class SingleProcessEventExtractionWorker(worker.BaseEventExtractionWorker):
"""Class that defines the single process event extraction worker."""
def _DebugParseFileEntry(self):
"""Callback for debugging file entry parsing failures."""
pdb.post_mortem()
class SingleProcessFileSystemCollector(collector.FileSystemCollector):
"""Class that implements a single process file system collector object."""
def __init__(self, process_queue):
"""Initializes the collector object.
The collector discovers all the files that need to be processed by
the workers. Once a file is discovered it is added to the process queue
as a path specification (instance of dfvfs.PathSpec).
Args:
process_queue: The process queue (instance of Queue). This queue contains
the file entries that need to be processed.
"""
super(SingleProcessFileSystemCollector, self).__init__(process_queue)
self._extraction_worker = None
def _FlushQueue(self):
"""Flushes the queue callback for the QueueFull exception."""
while not self._queue.IsEmpty():
logging.debug(u'Extraction worker started.')
self._extraction_worker.Run()
logging.debug(u'Extraction worker stopped.')
def SetExtractionWorker(self, extraction_worker):
"""Sets the extraction worker.
Args:
extraction_worker: the extraction worker object (instance of
EventExtractionWorker).
"""
self._extraction_worker = extraction_worker
class SingleProcessItemQueueProducer(queue.ItemQueueProducer):
"""Class that implements a single process item queue producer."""
def __init__(self, queue_object):
"""Initializes the queue producer.
Args:
queue_object: the queue object (instance of Queue).
"""
super(SingleProcessItemQueueProducer, self).__init__(queue_object)
self._storage_writer = None
def _FlushQueue(self):
"""Flushes the queue callback for the QueueFull exception."""
logging.debug(u'Storage writer started.')
self._storage_writer.WriteEventObjects()
logging.debug(u'Storage writer stopped.')
def SetStorageWriter(self, storage_writer):
"""Sets the storage writer.
Args:
storage_writer: the storage writer object (instance of
BaseStorageWriter).
"""
self._storage_writer = storage_writer
class SingleProcessQueue(queue.Queue):
"""Single process queue."""
def __init__(self, maximum_number_of_queued_items=0):
"""Initializes a single process queue object.
Args:
maximum_number_of_queued_items: The maximum number of queued items.
The default is 0, which represents
no limit.
"""
super(SingleProcessQueue, self).__init__()
# The Queue interface defines the maximum number of queued items to be
# 0 if unlimited as does the multi processing queue, but deque uses
# None to indicate no limit.
if maximum_number_of_queued_items == 0:
maximum_number_of_queued_items = None
# maxlen contains the maximum number of items allowed to be queued,
# where None represents unlimited.
self._queue = collections.deque(
maxlen=maximum_number_of_queued_items)
def __len__(self):
"""Returns the estimated current number of items in the queue."""
return len(self._queue)
def IsEmpty(self):
"""Determines if the queue is empty."""
return len(self._queue) == 0
def PushItem(self, item):
"""Pushes an item onto the queue.
Raises:
QueueFull: when the queue is full.
"""
number_of_items = len(self._queue)
# Deque will drop the first item in the queue when maxlen is exceeded.
if not self._queue.maxlen or number_of_items < self._queue.maxlen:
self._queue.append(item)
number_of_items += 1
if self._queue.maxlen and number_of_items == self._queue.maxlen:
raise errors.QueueFull
def PopItem(self):
"""Pops an item off the queue.
Raises:
QueueEmpty: when the queue is empty.
"""
try:
# Using popleft to have FIFO behavior.
return self._queue.popleft()
except IndexError:
raise errors.QueueEmpty
@@ -0,0 +1,133 @@
#!/usr/bin/python
# -*- coding: utf-8 -*-
#
# Copyright 2014 The Plaso Project Authors.
# Please see the AUTHORS file for details on individual authors.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Tests the single process processing engine."""
import os
import unittest
from dfvfs.lib import definitions as dfvfs_definitions
from dfvfs.helpers import file_system_searcher
from dfvfs.path import factory as path_spec_factory
from dfvfs.resolver import context
from plaso.engine import single_process
from plaso.engine import test_lib
from plaso.lib import errors
class SingleProcessQueueTest(unittest.TestCase):
"""Tests the single process queue."""
_ITEMS = frozenset(['item1', 'item2', 'item3', 'item4'])
def testPushPopItem(self):
"""Tests the PushItem and PopItem functions."""
test_queue = single_process.SingleProcessQueue()
for item in self._ITEMS:
test_queue.PushItem(item)
    self.assertEqual(len(test_queue), len(self._ITEMS))
test_queue.SignalEndOfInput()
test_queue_consumer = test_lib.TestQueueConsumer(test_queue)
test_queue_consumer.ConsumeItems()
expected_number_of_items = len(self._ITEMS)
    self.assertEqual(
        test_queue_consumer.number_of_items, expected_number_of_items)
def testQueueEmpty(self):
"""Tests the queue raises the QueueEmpty exception."""
test_queue = single_process.SingleProcessQueue()
with self.assertRaises(errors.QueueEmpty):
test_queue.PopItem()
def testQueueFull(self):
"""Tests the queue raises the QueueFull exception."""
test_queue = single_process.SingleProcessQueue(
maximum_number_of_queued_items=5)
for item in self._ITEMS:
test_queue.PushItem(item)
with self.assertRaises(errors.QueueFull):
test_queue.PushItem('item5')
with self.assertRaises(errors.QueueFull):
test_queue.PushItem('item6')
test_queue_consumer = test_lib.TestQueueConsumer(test_queue)
test_queue_consumer.ConsumeItems()
expected_number_of_items = len(self._ITEMS)
    self.assertEqual(
        test_queue_consumer.number_of_items, expected_number_of_items + 1)
class SingleProcessEngineTest(unittest.TestCase):
"""Tests for the engine object."""
_TEST_DATA_PATH = os.path.join(os.getcwd(), u'test_data')
def testEngine(self):
"""Test the engine functionality."""
resolver_context = context.Context()
test_engine = single_process.SingleProcessEngine(
maximum_number_of_queued_items=25000)
    self.assertNotEqual(test_engine, None)
source_path = os.path.join(self._TEST_DATA_PATH, u'ímynd.dd')
os_path_spec = path_spec_factory.Factory.NewPathSpec(
dfvfs_definitions.TYPE_INDICATOR_OS, location=source_path)
source_path_spec = path_spec_factory.Factory.NewPathSpec(
dfvfs_definitions.TYPE_INDICATOR_TSK, location=u'/',
parent=os_path_spec)
test_engine.SetSource(source_path_spec, resolver_context=resolver_context)
self.assertFalse(test_engine.SourceIsDirectory())
self.assertFalse(test_engine.SourceIsFile())
self.assertTrue(test_engine.SourceIsStorageMediaImage())
test_searcher = test_engine.GetSourceFileSystemSearcher(
resolver_context=resolver_context)
    self.assertNotEqual(test_searcher, None)
self.assertIsInstance(
test_searcher, file_system_searcher.FileSystemSearcher)
test_engine.PreprocessSource('Windows')
test_collector = test_engine.CreateCollector(
False, vss_stores=None, filter_find_specs=None,
resolver_context=resolver_context)
    self.assertNotEqual(test_collector, None)
self.assertIsInstance(
test_collector, single_process.SingleProcessCollector)
test_extraction_worker = test_engine.CreateExtractionWorker(0)
    self.assertNotEqual(test_extraction_worker, None)
self.assertIsInstance(
test_extraction_worker,
single_process.SingleProcessEventExtractionWorker)
if __name__ == '__main__':
unittest.main()
+71
@@ -0,0 +1,71 @@
#!/usr/bin/python
# -*- coding: utf-8 -*-
#
# Copyright 2014 The Plaso Project Authors.
# Please see the AUTHORS file for details on individual authors.
#
# Licensed under the Apache License, Version 2.0 (the 'License');
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Engine related functions and classes for testing."""
import os
import unittest
from plaso.engine import queue
class TestQueueConsumer(queue.ItemQueueConsumer):
"""Class that implements the test queue consumer.
The queue consumer subscribes to updates on the queue.
"""
def __init__(self, test_queue):
"""Initializes the queue consumer.
Args:
test_queue: the test queue (instance of Queue).
"""
super(TestQueueConsumer, self).__init__(test_queue)
self.items = []
def _ConsumeItem(self, item):
"""Consumes an item callback for ConsumeItems."""
self.items.append(item)
@property
def number_of_items(self):
"""The number of items."""
return len(self.items)
class EngineTestCase(unittest.TestCase):
"""The unit test case for a front-end."""
_TEST_DATA_PATH = os.path.join(os.getcwd(), 'test_data')
# Show full diff results, part of TestCase so does not follow our naming
# conventions.
maxDiff = None
def _GetTestFilePath(self, path_segments):
"""Retrieves the path of a test file relative to the test data directory.
Args:
path_segments: the path segments inside the test data directory.
Returns:
      The path of the test file.
"""
# Note that we need to pass the individual path segments to os.path.join
# and not a list.
return os.path.join(self._TEST_DATA_PATH, *path_segments)
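# For example (using a test file name that appears elsewhere in the test
# suite), _GetTestFilePath([u'ímynd.dd']) returns the path of the file
# 'ímynd.dd' inside the test data directory.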
+75
@@ -0,0 +1,75 @@
#!/usr/bin/python
# -*- coding: utf-8 -*-
#
# Copyright 2014 The Plaso Project Authors.
# Please see the AUTHORS file for details on individual authors.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Engine utility functions."""
import logging
from dfvfs.helpers import file_system_searcher
from plaso.winreg import path_expander
def BuildFindSpecsFromFile(filter_file_path, pre_obj=None):
"""Returns a list of find specification from a filter file.
Args:
filter_file_path: A path to a file that contains find specifications.
pre_obj: A preprocessing object (instance of PreprocessObject). This is
optional but when provided takes care of expanding each segment.
"""
find_specs = []
if pre_obj:
expander = path_expander.WinRegistryKeyPathExpander()
with open(filter_file_path, 'rb') as file_object:
for line in file_object:
line = line.strip()
if line.startswith(u'#'):
continue
if pre_obj:
try:
line = expander.ExpandPath(line, pre_obj=pre_obj)
except KeyError as exception:
logging.error((
u'Unable to use collection filter line: {0:s} with error: '
u'{1:s}').format(line, exception))
continue
if not line.startswith(u'/'):
logging.warning((
            u'The filter string must be defined as an absolute path: '
u'{0:s}').format(line))
continue
_, _, file_path = line.rstrip().rpartition(u'/')
if not file_path:
logging.warning(
u'Unable to parse the filter string: {0:s}'.format(line))
continue
# Convert the filter paths into a list of path segments and strip
# the root path segment.
path_segments = line.split(u'/')
path_segments.pop(0)
find_specs.append(file_system_searcher.FindSpec(
location_regex=path_segments, case_sensitive=False))
return find_specs
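# A hypothetical filter file, for illustration only (the paths below are
# made up):
#
#   # Lines starting with # are ignored.
#   /Windows/System32/config/SAM
#   /Documents And Settings/.+/NTUSER.DAT
#
# Each non-comment line must be an absolute path whose segments are used
# as a case insensitive location regular expression, e.g.:
#
#   find_specs = BuildFindSpecsFromFile(u'windows_filter.txt')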
+352
@@ -0,0 +1,352 @@
#!/usr/bin/python
# -*- coding: utf-8 -*-
#
# Copyright 2012 The Plaso Project Authors.
# Please see the AUTHORS file for details on individual authors.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""The event extraction worker."""
import logging
import os
from dfvfs.resolver import context
from dfvfs.resolver import resolver as path_spec_resolver
try:
from guppy import hpy
except ImportError:
hpy = None
from plaso.engine import classifier
from plaso.engine import queue
from plaso.lib import errors
from plaso.parsers import manager as parsers_manager
class BaseEventExtractionWorker(queue.ItemQueueConsumer):
"""Class that defines the event extraction worker base.
This class is designed to watch a queue for path specifications of files
and directories (file entries) for which events need to be extracted.
The event extraction worker needs to determine if a parser suitable
for parsing a particular file is available. All extracted event objects
are pushed on a storage queue for further processing.
"""
def __init__(
self, identifier, process_queue, event_queue_producer,
parse_error_queue_producer, parser_context):
"""Initializes the event extraction worker object.
Args:
identifier: The identifier, usually an incrementing integer.
process_queue: The process queue (instance of Queue). This queue contains
the file entries that need to be processed.
event_queue_producer: The event object queue producer (instance of
ItemQueueProducer).
parse_error_queue_producer: The parse error queue producer (instance of
ItemQueueProducer).
parser_context: A parser context object (instance of ParserContext).
"""
super(BaseEventExtractionWorker, self).__init__(process_queue)
self._enable_debug_output = False
self._identifier = identifier
self._open_files = False
self._parser_context = parser_context
self._filestat_parser_object = None
self._parser_objects = None
# We need a resolver context per process to prevent multi processing
# issues with file objects stored in images.
self._resolver_context = context.Context()
self._event_queue_producer = event_queue_producer
self._parse_error_queue_producer = parse_error_queue_producer
# Attributes that contain the current status of the worker.
self._current_working_file = u''
self._is_running = False
# Attributes for profiling.
self._enable_profiling = False
self._heapy = None
self._profiling_sample = 0
self._profiling_sample_rate = 1000
self._profiling_sample_file = u'{0!s}.hpy'.format(self._identifier)
def _ConsumeItem(self, path_spec):
"""Consumes an item callback for ConsumeItems.
Args:
path_spec: a path specification (instance of dfvfs.PathSpec).
"""
file_entry = path_spec_resolver.Resolver.OpenFileEntry(
path_spec, resolver_context=self._resolver_context)
if file_entry is None:
logging.warning(u'Unable to open file entry: {0:s}'.format(
path_spec.comparable))
return
try:
self.ParseFileEntry(file_entry)
except IOError as exception:
logging.warning(u'Unable to parse file: {0:s} with error: {1:s}'.format(
path_spec.comparable, exception))
def _DebugParseFileEntry(self):
"""Callback for debugging file entry parsing failures."""
return
def _ParseFileEntryWithParser(self, parser_object, file_entry):
"""Parses a file entry with a specific parser.
Args:
parser_object: A parser object (instance of BaseParser).
file_entry: A file entry object (instance of dfvfs.FileEntry).
Raises:
QueueFull: If a queue is full.
"""
try:
parser_object.Parse(self._parser_context, file_entry)
except errors.UnableToParseFile as exception:
logging.debug(u'Not a {0:s} file ({1:s}) - {2:s}'.format(
parser_object.NAME, file_entry.name, exception))
except errors.QueueFull:
raise
except IOError as exception:
logging.debug(
u'[{0:s}] Unable to parse: {1:s} with error: {2:s}'.format(
parser_object.NAME, file_entry.path_spec.comparable,
exception))
# Casting a wide net, catching all exceptions. Done to keep the worker
# running, despite the parser hitting errors, so the worker doesn't die
# if a single file is corrupted or there is a bug in a parser.
except Exception as exception:
logging.warning(
u'[{0:s}] Unable to process file: {1:s} with error: {2:s}.'.format(
parser_object.NAME, file_entry.path_spec.comparable,
exception))
logging.debug(
u'The path specification that caused the error: {0:s}'.format(
file_entry.path_spec.comparable))
logging.exception(exception)
if self._enable_debug_output:
self._DebugParseFileEntry()
def _ProfilingStart(self):
"""Starts the profiling."""
self._heapy.setrelheap()
self._profiling_sample = 0
try:
os.remove(self._profiling_sample_file)
except OSError:
pass
def _ProfilingStop(self):
"""Stops the profiling."""
self._ProfilingWriteSample()
def _ProfilingUpdate(self):
"""Updates the profiling."""
self._profiling_sample += 1
if self._profiling_sample >= self._profiling_sample_rate:
self._ProfilingWriteSample()
self._profiling_sample = 0
def _ProfilingWriteSample(self):
"""Writes a profiling sample to the sample file."""
heap = self._heapy.heap()
heap.dump(self._profiling_sample_file)
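  # With the default sample rate of 1000, _ProfilingUpdate dumps a guppy
  # heap snapshot to '<identifier>.hpy' after every 1000 file entries
  # that have been parsed.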
def GetStatus(self):
"""Returns a status dictionary."""
return {
'is_running': self._is_running,
'identifier': u'Worker_{0:d}'.format(self._identifier),
'current_file': self._current_working_file,
'counter': self._parser_context.number_of_events}
def InitalizeParserObjects(self, parser_filter_string=None):
"""Initializes the parser objects.
    The parser_filter_string is a simple comma separated value string that
    denotes a list of parser names to include and/or exclude. Each entry
    can have the value of:
    + Exact match of a list of parsers, or a preset (see
      plaso/frontend/presets.py for a full list of available presets).
    + The name of a single parser (case insensitive), e.g. msiecfparser.
    + A glob name for a single parser, e.g. '*msie*' (case insensitive).
Args:
parser_filter_string: Optional parser filter string. The default is None.
"""
self._parser_objects = parsers_manager.ParsersManager.GetParserObjects(
parser_filter_string=parser_filter_string)
for parser_object in self._parser_objects:
if parser_object.NAME == 'filestat':
self._filestat_parser_object = parser_object
break
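  # For illustration (hypothetical filter values): a string such as
  # u'msiecfparser' selects a single parser by name, while u'*msie*'
  # selects every parser whose name matches the glob, e.g.:
  #
  #   worker.InitalizeParserObjects(parser_filter_string=u'*msie*')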
def ParseFileEntry(self, file_entry):
"""Parses a file entry.
Args:
file_entry: A file entry object (instance of dfvfs.FileEntry).
"""
logging.debug(u'[ParseFileEntry] Parsing: {0:s}'.format(
file_entry.path_spec.comparable))
self._current_working_file = getattr(
file_entry.path_spec, u'location', file_entry.name)
if file_entry.IsDirectory() and self._filestat_parser_object:
self._ParseFileEntryWithParser(self._filestat_parser_object, file_entry)
elif file_entry.IsFile():
      # TODO: Do not run all parsers, only those that the classifier
      # identifies as applicable to the file.
for parser_object in self._parser_objects:
logging.debug(u'Trying to parse: {0:s} with parser: {1:s}'.format(
file_entry.name, parser_object.NAME))
self._ParseFileEntryWithParser(parser_object, file_entry)
logging.debug(u'[ParseFileEntry] Done parsing: {0:s}'.format(
file_entry.path_spec.comparable))
if self._enable_profiling:
self._ProfilingUpdate()
if self._open_files:
try:
for sub_file_entry in classifier.Classifier.SmartOpenFiles(file_entry):
if self._abort:
break
self.ParseFileEntry(sub_file_entry)
except IOError as exception:
logging.warning(
u'Unable to parse file: {0:s} with error: {1:s}'.format(
file_entry.path_spec.comparable, exception))
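  # Note: when the open files mode is set (see SetOpenFiles), ParseFileEntry
  # calls itself recursively for every embedded file entry that
  # classifier.Classifier.SmartOpenFiles yields.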
def Run(self):
"""Extracts event objects from file entries."""
self._parser_context.ResetCounters()
if self._enable_profiling:
self._ProfilingStart()
self._is_running = True
logging.info(
u'Worker {0:d} (PID: {1:d}) started monitoring process queue.'.format(
self._identifier, os.getpid()))
self.ConsumeItems()
logging.info(
u'Worker {0:d} (PID: {1:d}) stopped monitoring process queue.'.format(
self._identifier, os.getpid()))
self._current_working_file = u''
self._is_running = False
if self._enable_profiling:
self._ProfilingStop()
self._resolver_context.Empty()
def SetEnableDebugOutput(self, enable_debug_output):
"""Enables or disables debug output.
Args:
enable_debug_output: boolean value to indicate if the debug output
should be enabled.
"""
self._enable_debug_output = enable_debug_output
def SetEnableProfiling(self, enable_profiling, profiling_sample_rate=1000):
"""Enables or disables profiling.
Args:
      enable_profiling: boolean value to indicate if the profiling
                        should be enabled.
profiling_sample_rate: optional integer indicating the profiling sample
rate. The value contains the number of files
processed. The default value is 1000.
"""
if hpy:
self._enable_profiling = enable_profiling
self._profiling_sample_rate = profiling_sample_rate
if self._enable_profiling and not self._heapy:
self._heapy = hpy()
def SetFilterObject(self, filter_object):
"""Sets the filter object.
Args:
filter_object: the filter object (instance of objectfilter.Filter).
"""
self._parser_context.SetFilterObject(filter_object)
def SetMountPath(self, mount_path):
"""Sets the mount path.
Args:
mount_path: string containing the mount path.
"""
self._parser_context.SetMountPath(mount_path)
# TODO: rename this mode.
def SetOpenFiles(self, open_files):
"""Sets the open files mode.
Args:
open_files: boolean value to indicate if the worker should scan for
file entries inside files.
"""
self._open_files = open_files
def SetTextPrepend(self, text_prepend):
"""Sets the text prepend.
Args:
text_prepend: string that contains the text to prepend to every
event object.
"""
self._parser_context.SetTextPrepend(text_prepend)
def SignalAbort(self):
"""Signals the worker to abort."""
super(BaseEventExtractionWorker, self).SignalAbort()
self._parser_context.SignalAbort()
@classmethod
def SupportsProfiling(cls):
"""Returns a boolean value to indicate if profiling is supported."""
return hpy is not None
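# A typical single worker lifecycle, sketched for illustration (the queue,
# producer and parser context objects are assumed to be created elsewhere):
#
#   worker = BaseEventExtractionWorker(
#       0, process_queue, event_queue_producer, parse_error_queue_producer,
#       parser_context)
#   worker.InitalizeParserObjects(parser_filter_string=None)
#   worker.Run()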