#!/usr/bin/python
# -*- coding: utf-8 -*-
#
# Copyright 2013 The Plaso Project Authors.
# Please see the AUTHORS file for details on individual authors.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""The image export front-end."""

import argparse
import collections
import hashlib
import logging
import os
import sys

from dfvfs.helpers import file_system_searcher
from dfvfs.lib import definitions as dfvfs_definitions
from dfvfs.path import factory as path_spec_factory
from dfvfs.resolver import resolver as path_spec_resolver

from plaso.artifacts import knowledge_base
from plaso.engine import collector
from plaso.engine import queue
from plaso.engine import single_process
from plaso.engine import utils as engine_utils
from plaso.frontend import frontend
from plaso.frontend import utils as frontend_utils
from plaso.lib import errors
from plaso.lib import timelib
from plaso.preprocessors import interface as preprocess_interface
from plaso.preprocessors import manager as preprocess_manager


def CalculateHash(file_object):
  """Return a hash for a given file object."""
  md5 = hashlib.md5()
  file_object.seek(0)

  data = file_object.read(4098)
  while data:
    md5.update(data)
    data = file_object.read(4098)

  return md5.hexdigest()
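

# A minimal usage sketch of CalculateHash (the file path is hypothetical).
# Any file-like object that supports seek() and read() works, including the
# file objects returned by dfvfs file entries:
#
#   file_object = open(u'/tmp/sample.bin', 'rb')
#   print CalculateHash(file_object)
#   file_object.close()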


class DateFilter(object):
  """Class that implements a date filter for file entries."""

  DATE_FILTER_INSTANCE = collections.namedtuple(
      'date_filter_instance', 'type start end')

  DATE_FILTER_TYPES = frozenset([
      u'atime', u'bkup', u'ctime', u'crtime', u'dtime', u'mtime'])

  def __init__(self):
    """Initialize the date filter object."""
    super(DateFilter, self).__init__()
    self._filters = []

  @property
  def number_of_filters(self):
    """Return the number of defined date filters."""
    return len(self._filters)

  def Add(self, filter_type, filter_start=None, filter_end=None):
    """Add a date filter.

    Args:
      filter_type: String that defines what timestamp is affected by the
                   date filter; valid values are atime, ctime, crtime,
                   dtime, bkup and mtime.
      filter_start: Optional start date of the filter. This is a string
                    in the form of "YYYY-MM-DD HH:MM:SS" or "YYYY-MM-DD".
                    If not supplied there will be no lower bound to the
                    timeframe.
      filter_end: Optional end date of the filter. This is a string
                  in the form of "YYYY-MM-DD HH:MM:SS" or "YYYY-MM-DD".
                  If not supplied there will be no upper bound to the
                  timeframe.

    Raises:
      errors.WrongFilterOption: If the filter is badly formed.
    """
    if not isinstance(filter_type, basestring):
      raise errors.WrongFilterOption(u'Filter type must be a string.')

    if filter_start is None and filter_end is None:
      raise errors.WrongFilterOption(
          u'A date filter has to have either a start or an end date.')

    filter_type_lower = filter_type.lower()
    if filter_type_lower not in self.DATE_FILTER_TYPES:
      raise errors.WrongFilterOption(u'Unknown filter type: {0:s}.'.format(
          filter_type))

    date_filter_type = filter_type_lower
    date_filter_start = None
    date_filter_end = None

    if filter_start is not None:
      # If the date string is invalid the timestamp will be set to zero,
      # which is also a valid date. Hence all invalid timestamp strings
      # end up filtering from the POSIX epoch time. The actual value of the
      # filter is printed out so that the user may catch this potentially
      # unwanted behavior.
      date_filter_start = timelib.Timestamp.FromTimeString(filter_start)
      logging.info(
          u'Date filter for start date configured: [{0:s}] {1:s}'.format(
              date_filter_type,
              timelib.Timestamp.CopyToIsoFormat(date_filter_start)))

    if filter_end is not None:
      date_filter_end = timelib.Timestamp.FromTimeString(filter_end)
      logging.info(
          u'Date filter for end date configured: [{0:s}] {1:s}'.format(
              date_filter_type,
              timelib.Timestamp.CopyToIsoFormat(date_filter_end)))

    # Make sure that the end timestamp occurs after the beginning.
    # If not then we need to reverse the time range.
    if (date_filter_start is not None and date_filter_end is not None and
        date_filter_start > date_filter_end):
      date_filter_start, date_filter_end = date_filter_end, date_filter_start

    self._filters.append(self.DATE_FILTER_INSTANCE(
        date_filter_type, date_filter_start, date_filter_end))

  def CompareFileEntry(self, file_entry):
    """Compare the set date filters against timestamps of a file entry.

    Args:
      file_entry: The file entry (instance of dfvfs.FileEntry).

    Returns:
      True if there are no date filters set, or if every filtered timestamp
      falls within its time range. False if any filtered timestamp falls
      outside of its time range.

    Raises:
      errors.WrongFilterOption: If an attempt is made to filter against
                                a date type that is not stored in the stat
                                object.
    """
    if not self._filters:
      return True

    # Compare timestamps of the file entry.
    stat = file_entry.GetStat()

    # Go over each filter.
    for date_filter in self._filters:
      posix_time = getattr(stat, date_filter.type, None)

      if posix_time is None:
        # Trying to filter against a date type that is not saved in the stat
        # object.
        raise errors.WrongFilterOption(
            u'Date type: {0:s} is not stored in the file entry'.format(
                date_filter.type))

      timestamp = timelib.Timestamp.FromPosixTime(posix_time)

      if date_filter.start is not None and timestamp < date_filter.start:
        logging.debug((
            u'[skipping] Not saving file: {0:s}, timestamp out of '
            u'range.').format(file_entry.path_spec.location))
        return False

      if date_filter.end is not None and timestamp > date_filter.end:
        logging.debug((
            u'[skipping] Not saving file: {0:s}, timestamp out of '
            u'range.').format(file_entry.path_spec.location))
        return False

    return True

  def Remove(self, filter_type, filter_start=None, filter_end=None):
    """Remove a date filter from the set of defined date filters.

    Args:
      filter_type: String that defines what timestamp is affected by the
                   date filter; valid values are atime, ctime, crtime,
                   dtime, bkup and mtime.
      filter_start: Optional start date of the filter. This is a string
                    in the form of "YYYY-MM-DD HH:MM:SS" or "YYYY-MM-DD".
      filter_end: Optional end date of the filter. This is a string
                  in the form of "YYYY-MM-DD HH:MM:SS" or "YYYY-MM-DD".
    """
    if not self._filters:
      return

    if filter_start is None:
      date_filter_start = None
    else:
      date_filter_start = timelib.Timestamp.FromTimeString(filter_start)

    if filter_end is None:
      date_filter_end = None
    else:
      date_filter_end = timelib.Timestamp.FromTimeString(filter_end)

    # TODO: Instead of doing it this way calculate a hash for every filter
    # that is stored and use that for removals.
    for date_filter_index, date_filter in enumerate(self._filters):
      if (date_filter.type == filter_type and
          date_filter.start == date_filter_start and
          date_filter.end == date_filter_end):
        del self._filters[date_filter_index]
        return

  def Reset(self):
    """Resets the date filter."""
    self._filters = []
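

# A minimal usage sketch of DateFilter (the dates are hypothetical and
# file_entry stands for any dfvfs.FileEntry):
#
#   date_filter = DateFilter()
#   date_filter.Add(u'ctime', filter_start=u'2013-01-01')
#   date_filter.Add(
#       u'atime', filter_start=u'2013-01-01 23:12:14',
#       filter_end=u'2013-02-23')
#   # CompareFileEntry returns False when a filtered timestamp falls
#   # outside the configured range.
#   if date_filter.CompareFileEntry(file_entry):
#     pass  # save the file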


class FileSaver(object):
  """A simple class that is used to save files."""

  md5_dict = {}
  calc_md5 = False

  # TODO: Move this functionality into the frontend as a state attribute.
  _date_filter = None

  @classmethod
  def SetDateFilter(cls, date_filter):
    """Set a date filter for the file saver.

    If a date filter is set files will not be saved unless they are within
    the time boundaries.

    Args:
      date_filter: A date filter object (instance of DateFilter).
    """
    cls._date_filter = date_filter

  @classmethod
  def WriteFile(cls, source_path_spec, destination_path, filename_prefix=''):
    """Writes the contents of the source to the destination file.

    Args:
      source_path_spec: the path specification of the source file.
      destination_path: the path of the destination file.
      filename_prefix: optional prefix for the filename. The default is an
                       empty string.
    """
    file_entry = path_spec_resolver.Resolver.OpenFileEntry(source_path_spec)
    directory = u''
    filename = getattr(source_path_spec, 'location', None)
    if not filename:
      filename = source_path_spec.file_path

    # There will be issues on systems that use a different separator than a
    # forward slash. However a forward slash is always used in the pathspec.
    if os.path.sep != u'/':
      filename = filename.replace(u'/', os.path.sep)

    if os.path.sep in filename:
      directory_string, _, filename = filename.rpartition(os.path.sep)
      if directory_string:
        directory = os.path.join(
            destination_path, *directory_string.split(os.path.sep))

    if filename_prefix:
      extracted_filename = u'{0:s}_{1:s}'.format(filename_prefix, filename)
    else:
      extracted_filename = filename

    while extracted_filename.startswith(os.path.sep):
      extracted_filename = extracted_filename[1:]

    if directory:
      if not os.path.isdir(directory):
        os.makedirs(directory)
    else:
      directory = destination_path

    if cls.calc_md5:
      stat = file_entry.GetStat()
      inode = getattr(stat, 'ino', 0)
      file_object = file_entry.GetFileObject()
      md5sum = CalculateHash(file_object)
      if inode in cls.md5_dict:
        if md5sum in cls.md5_dict[inode]:
          return
        cls.md5_dict[inode].append(md5sum)
      else:
        cls.md5_dict[inode] = [md5sum]

    # Check if we do not want to save the file.
    if cls._date_filter and not cls._date_filter.CompareFileEntry(file_entry):
      return

    try:
      file_object = file_entry.GetFileObject()
      frontend_utils.OutputWriter.WriteFile(
          file_object, os.path.join(directory, extracted_filename))
    except IOError as exception:
      logging.error(
          u'[skipping] unable to save file: {0:s} with error: {1:s}'.format(
              filename, exception))
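

# A minimal usage sketch of FileSaver (the paths are hypothetical). An OS
# path specification is used here for simplicity; inside an image a TSK or
# VSHADOW path specification would be used instead, as in the frontend code
# below:
#
#   os_path_spec = path_spec_factory.Factory.NewPathSpec(
#       dfvfs_definitions.TYPE_INDICATOR_OS, location=u'/tmp/sample.bin')
#   FileSaver.calc_md5 = True
#   FileSaver.WriteFile(os_path_spec, u'/tmp/export')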


class ImageExtractorQueueConsumer(queue.ItemQueueConsumer):
  """Class that implements an image extractor queue consumer."""

  def __init__(self, process_queue, extensions, destination_path):
    """Initializes the image extractor queue consumer.

    Args:
      process_queue: the process queue (instance of Queue).
      extensions: a list of extensions.
      destination_path: the path where the extracted files should be stored.
    """
    super(ImageExtractorQueueConsumer, self).__init__(process_queue)
    self._destination_path = destination_path
    self._extensions = extensions

  def _ConsumeItem(self, path_spec):
    """Consumes an item callback for ConsumeItems.

    Args:
      path_spec: a path specification (instance of dfvfs.PathSpec).
    """
    # TODO: move this into a function of path spec e.g. GetExtension().
    location = getattr(path_spec, 'location', None)
    if not location:
      location = path_spec.file_path

    _, _, extension = location.rpartition('.')
    if extension.lower() in self._extensions:
      vss_store_number = getattr(path_spec, 'vss_store_number', None)
      if vss_store_number is not None:
        filename_prefix = 'vss_{0:d}'.format(vss_store_number + 1)
      else:
        filename_prefix = ''

      FileSaver.WriteFile(
          path_spec, self._destination_path, filename_prefix=filename_prefix)
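

# A minimal wiring sketch for the queue consumer (this mirrors
# _ExtractWithExtensions below; the destination path is hypothetical):
#
#   input_queue = single_process.SingleProcessQueue()
#   # ... a collector.Collector produces path specs onto input_queue ...
#   consumer = ImageExtractorQueueConsumer(
#       input_queue, [u'docx', u'pdf'], u'/tmp/export')
#   consumer.ConsumeItems()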


class ImageExportFrontend(frontend.StorageMediaFrontend):
  """Class that implements the image export front-end."""

  def __init__(self):
    """Initializes the front-end object."""
    input_reader = frontend.StdinFrontendInputReader()
    output_writer = frontend.StdoutFrontendOutputWriter()

    super(ImageExportFrontend, self).__init__(input_reader, output_writer)
    self._knowledge_base = None
    self._remove_duplicates = True
    self._source_path_spec = None

  # TODO: merge with collector and/or engine.
  def _ExtractWithExtensions(self, extensions, destination_path):
    """Extracts files using extensions.

    Args:
      extensions: a list of extensions.
      destination_path: the path where the extracted files should be stored.
    """
    logging.info(u'Finding files with extensions: {0:s}'.format(extensions))

    if not os.path.isdir(destination_path):
      os.makedirs(destination_path)

    input_queue = single_process.SingleProcessQueue()

    # TODO: add support to handle multiple partitions.
    self._source_path_spec = self.GetSourcePathSpec()

    image_collector = collector.Collector(
        input_queue, self._source_path, self._source_path_spec)
    image_collector.Collect()

    FileSaver.calc_md5 = self._remove_duplicates

    input_queue_consumer = ImageExtractorQueueConsumer(
        input_queue, extensions, destination_path)
    input_queue_consumer.ConsumeItems()

  # TODO: merge with collector and/or engine.
  def _ExtractWithFilter(self, filter_file_path, destination_path):
    """Extracts files using a filter expression.

    This method runs the file extraction process on the image and,
    if requested, on every VSS store.

    Args:
      filter_file_path: The path of the file that contains the filter
                        expressions.
      destination_path: The path where the extracted files should be stored.
    """
    # TODO: add support to handle multiple partitions.
    self._source_path_spec = self.GetSourcePathSpec()

    searcher = self._GetSourceFileSystemSearcher(
        resolver_context=self._resolver_context)

    if self._knowledge_base is None:
      self._Preprocess(searcher)

    if not os.path.isdir(destination_path):
      os.makedirs(destination_path)

    find_specs = engine_utils.BuildFindSpecsFromFile(
        filter_file_path, pre_obj=self._knowledge_base.pre_obj)

    # Save the regular files.
    FileSaver.calc_md5 = self._remove_duplicates

    for path_spec in searcher.Find(find_specs=find_specs):
      FileSaver.WriteFile(path_spec, destination_path)

    if self._process_vss and self._vss_stores:
      volume_path_spec = self._source_path_spec.parent

      logging.info(u'Extracting files from VSS.')
      vss_path_spec = path_spec_factory.Factory.NewPathSpec(
          dfvfs_definitions.TYPE_INDICATOR_VSHADOW, location=u'/',
          parent=volume_path_spec)

      vss_file_entry = path_spec_resolver.Resolver.OpenFileEntry(vss_path_spec)

      number_of_vss = vss_file_entry.number_of_sub_file_entries

      # In plaso 1 represents the first VSS store, whereas in dfvfs and
      # pyvshadow 0 represents the first store, hence 1 is subtracted.
      vss_store_range = [store_nr - 1 for store_nr in self._vss_stores]

      for store_index in vss_store_range:
        logging.info(u'Extracting files from VSS {0:d} out of {1:d}'.format(
            store_index + 1, number_of_vss))

        vss_path_spec = path_spec_factory.Factory.NewPathSpec(
            dfvfs_definitions.TYPE_INDICATOR_VSHADOW, store_index=store_index,
            parent=volume_path_spec)
        path_spec = path_spec_factory.Factory.NewPathSpec(
            dfvfs_definitions.TYPE_INDICATOR_TSK, location=u'/',
            parent=vss_path_spec)

        filename_prefix = 'vss_{0:d}'.format(store_index)

        file_system = path_spec_resolver.Resolver.OpenFileSystem(
            path_spec, resolver_context=self._resolver_context)
        searcher = file_system_searcher.FileSystemSearcher(
            file_system, vss_path_spec)

        for path_spec in searcher.Find(find_specs=find_specs):
          FileSaver.WriteFile(
              path_spec, destination_path, filename_prefix=filename_prefix)

  # TODO: refactor, this is a duplicate of the function in engine.
  def _GetSourceFileSystemSearcher(self, resolver_context=None):
    """Retrieves the file system searcher of the source.

    Args:
      resolver_context: Optional resolver context (instance of dfvfs.Context).
                        The default is None. Note that every thread or process
                        must have its own resolver context.

    Returns:
      The file system searcher object (instance of
      dfvfs.FileSystemSearcher).

    Raises:
      RuntimeError: if source path specification is not set.
    """
    if not self._source_path_spec:
      raise RuntimeError(u'Missing source.')

    file_system = path_spec_resolver.Resolver.OpenFileSystem(
        self._source_path_spec, resolver_context=resolver_context)

    type_indicator = self._source_path_spec.type_indicator
    if type_indicator == dfvfs_definitions.TYPE_INDICATOR_OS:
      mount_point = self._source_path_spec
    else:
      mount_point = self._source_path_spec.parent

    return file_system_searcher.FileSystemSearcher(file_system, mount_point)

  def _Preprocess(self, searcher):
    """Preprocesses the image.

    Args:
      searcher: The file system searcher object (instance of
                dfvfs.FileSystemSearcher).
    """
    if self._knowledge_base is not None:
      return

    self._knowledge_base = knowledge_base.KnowledgeBase()

    logging.info(u'Guessing OS')

    platform = preprocess_interface.GuessOS(searcher)
    logging.info(u'OS: {0:s}'.format(platform))

    logging.info(u'Running preprocess.')
    preprocess_manager.PreprocessPluginsManager.RunPlugins(
        platform, searcher, self._knowledge_base)

    logging.info(u'Preprocess done, saving files from image.')

  def ParseOptions(self, options, source_option='source'):
    """Parses the options and initializes the front-end.

    Args:
      options: the command line arguments (instance of argparse.Namespace).
      source_option: optional name of the source option. The default is
                     source.

    Raises:
      BadConfigOption: if the options are invalid.
    """
    super(ImageExportFrontend, self).ParseOptions(
        options, source_option=source_option)

    filter_file = getattr(options, 'filter', None)
    if not filter_file and not getattr(options, 'extension_string', None):
      raise errors.BadConfigOption(
          u'Neither an extension string nor a filter is defined.')

    if filter_file and not os.path.isfile(filter_file):
      raise errors.BadConfigOption(
          u'Unable to proceed, filter file: {0:s} does not exist.'.format(
              filter_file))

    if (getattr(options, 'no_vss', False) or
        getattr(options, 'include_duplicates', False)):
      self._remove_duplicates = False

    # Process date filter.
    date_filters = getattr(options, 'date_filters', [])
    if date_filters:
      date_filter_object = DateFilter()

      for date_filter in date_filters:
        date_filter_pieces = date_filter.split(',')
        if len(date_filter_pieces) != 3:
          raise errors.BadConfigOption(
              u'Date filter badly formed: {0:s}'.format(date_filter))

        filter_type, filter_start, filter_end = date_filter_pieces

        # Per the --date-filter help text a "-" means the boundary is not
        # set, so it is translated to None here.
        filter_start = filter_start.strip()
        if filter_start == u'-':
          filter_start = None
        filter_end = filter_end.strip()
        if filter_end == u'-':
          filter_end = None

        date_filter_object.Add(
            filter_type=filter_type.strip(), filter_start=filter_start,
            filter_end=filter_end)

      # TODO: Move the date filter to the front-end as an attribute.
      FileSaver.SetDateFilter(date_filter_object)

  def ProcessSource(self, options):
    """Processes the source.

    Args:
      options: the command line arguments (instance of argparse.Namespace).

    Raises:
      SourceScannerError: if the source scanner could not find a supported
                          file system.
      UserAbort: if the user initiated an abort.
    """
    self.ScanSource(options)

    filter_file = getattr(options, 'filter', None)
    if filter_file:
      self._ExtractWithFilter(filter_file, options.path)

    extension_string = getattr(options, 'extension_string', None)
    if extension_string:
      extensions = [x.strip() for x in extension_string.split(',')]

      self._ExtractWithExtensions(extensions, options.path)
      logging.info(u'Files based on extension extracted.')


def Main():
  """The main function, running the show."""
  front_end = ImageExportFrontend()

  arg_parser = argparse.ArgumentParser(
      description=(
          'This is a simple collector designed to export files inside an '
          'image, both within a regular RAW image as well as inside a VSS. '
          'The tool uses a collection filter that uses the same syntax as a '
          'targeted plaso filter.'),
      epilog='And that\'s how you export files, plaso style.')

  arg_parser.add_argument(
      '-d', '--debug', dest='debug', action='store_true', default=False,
      help='Turn on debugging information.')

  arg_parser.add_argument(
      '-w', '--write', dest='path', action='store', default='.', type=str,
      help='The directory in which extracted files should be stored.')

  arg_parser.add_argument(
      '-x', '--extensions', dest='extension_string', action='store',
      type=str, metavar='EXTENSION_STRING', help=(
          'If the purpose is to find all files given a certain extension '
          'this option should be used. This option accepts a comma '
          'separated string denoting all file extensions, eg: -x '
          '"csv,docx,pst".'))

  arg_parser.add_argument(
      '-f', '--filter', action='store', dest='filter', metavar='FILTER_FILE',
      type=str, help=(
          'Full path to the file that contains the collection filter, '
          'the file can use variables that are defined in preprocessing, '
          'just like any other log2timeline/plaso collection filter.'))

  arg_parser.add_argument(
      '--date-filter', '--date_filter', action='append', type=str,
      dest='date_filters', metavar='TYPE_START_END', default=None, help=(
          'Add a date based filter to the export criteria. If a date based '
          'filter is set no file is saved unless it\'s within the date '
          'boundary. This parameter should be in the form of '
          '"TYPE,START,END" where TYPE defines which timestamp this date '
          'filter affects, eg: atime, ctime, crtime, bkup, etc. START '
          'defines the start date and time of the boundary and END defines '
          'the end time. Both timestamps are optional and should be set as '
          '- if not needed. The correct form of the timestamp value is '
          '"YYYY-MM-DD HH:MM:SS" or "YYYY-MM-DD", eg: "atime, 2013-01-01 '
          '23:12:14, 2013-02-23". This parameter can be repeated as needed '
          'to add additional date boundaries, eg: once for atime, once for '
          'crtime, etc.'))

  arg_parser.add_argument(
      '--include_duplicates', dest='include_duplicates', action='store_true',
      default=False, help=(
          'By default, if VSS is turned on, all files saved will have their '
          'MD5 sum calculated and compared to other files already saved '
          'with the same inode value. If the MD5 sum is the same the file '
          'does not get saved again. This option turns off that behavior '
          'so that all files will get stored, even if they are duplicates.'))

  front_end.AddImageOptions(arg_parser)
  front_end.AddVssProcessingOptions(arg_parser)

  arg_parser.add_argument(
      'image', action='store', metavar='IMAGE', default=None, type=str, help=(
          'The full path to the image file that we are about to extract '
          'files from, it should be a raw image or another image that '
          'plaso supports.'))

  options = arg_parser.parse_args()

  format_str = u'%(asctime)s [%(levelname)s] %(message)s'
  if options.debug:
    logging.basicConfig(level=logging.DEBUG, format=format_str)
  else:
    logging.basicConfig(level=logging.INFO, format=format_str)

  try:
    front_end.ParseOptions(options, source_option='image')
  except errors.BadConfigOption as exception:
    arg_parser.print_help()
    print u''
    logging.error(u'{0:s}'.format(exception))
    return False

  try:
    front_end.ProcessSource(options)
    logging.info(u'Processing completed.')

  except (KeyboardInterrupt, errors.UserAbort):
    logging.warning(u'Aborted by user.')
    return False

  except errors.SourceScannerError as exception:
    logging.warning((
        u'Unable to scan for a supported filesystem with error: {0:s}\n'
        u'Most likely the image format is not supported by the '
        u'tool.').format(exception))
    return False

  return True


if __name__ == '__main__':
  if not Main():
    sys.exit(1)
  else:
    sys.exit(0)