plaso-rubanetra/plaso/classifier/classifier.py

#!/usr/bin/python
# -*- coding: utf-8 -*-
#
# Copyright 2013 The Plaso Project Authors.
# Please see the AUTHORS file for details on individual authors.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#    http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""This file contains the format classifier classes.

Plaso is a tool that extracts events from files on a file system.
For this it either reads files from a mounted file system or from an image.
It uses an exhaustive approach to determine how to parse events from a file,
meaning that it passes the file first to parser A and, if that fails, it
continues with parser B.

The classifier is designed to determine the format of a file more quickly and
to limit the number of parsers that are part of the exhaustive approach.

The current version of the classifier uses signatures to identify file formats.
Some signatures must always be defined at a specific offset; this is referred to
as an offset-bound signature or bound for short. Other signatures are commonly
found at a specific offset but not necessarily. The last form of signatures is
unbound, meaning that they don't have a fixed or common location where they can
be found.

A specification is a collection of signatures with additional metadata that
defines a specific file format. These specifications are grouped into a store
for ease of use, e.g. so that they can be read from a configuration file all
at once.

The classifier requires a scanner to analyze the data in a file. The scanner
uses the specifications in a store to scan for the signatures or a certain
format.

The classifier allows for multiple methods of scanning a file:
* full:      the entire file is scanned. This is the default scanning method.
* head-tail: only the beginning (head) and the end (tail) of the file are
             scanned. This approach is more efficient for larger files.
             The buffer size is used as the size of the data that is scanned.
             Smaller files are scanned entirely.

The classifier returns zero or more classifications which point to a format
specification and the scan results for the signatures defined by
the specification.
"""

import logging


class Classification(object):
  """Pairs a format specification with the scan matches found for it.

  The identifier, magic types and mime types are read through from the
  format specification; the scan matches are stored as given.
  """

  def __init__(self, specification, scan_matches):
    """Initializes the classification.

    Both arguments are stored as-is; no type checking is performed here.

    Args:
      specification: the format specification (instance of Specification).
      scan_matches: the list of scan matches (instances of _ScanMatch).
    """
    self.scan_matches = scan_matches
    self._specification = specification

  @property
  def identifier(self):
    """The identifier of the underlying format specification."""
    specification = self._specification
    return specification.identifier

  @property
  def magic_types(self):
    """The magic types as defined by the format specification."""
    specification = self._specification
    return specification.magic_types

  @property
  def mime_types(self):
    """The mime types as defined by the format specification."""
    specification = self._specification
    return specification.mime_types


class Classifier(object):
  """Class for classifying formats in raw data.

  The classifier is initialized with a signature scanner, after which it can
  be used to classify data in buffers, files or file-like objects.

  The actual scanning of the data is done by the scanner, these are separate
  to allow for the scanner to easily be replaced for a more efficient
  alternative if necessary.

  For an example of how the classifier is to be used see: classify.py.
  """
  # 16 MiB. Not referenced by the methods of this class itself; kept as part
  # of the public class interface.
  BUFFER_SIZE = 16 * 1024 * 1024

  def __init__(self, scanner):
    """Initializes the classifier and sets up the scanning related structures.

    Args:
      scanner: an instance of the signature scanner.
    """
    self._scanner = scanner

  def _GetClassifications(self, scan_results):
    """Retrieves the classifications based on the scan results.

    One classification is created per specification identifier. Every scan
    match is logged at debug level.

    Args:
      scan_results: a list containing instances of _ScanResult.

    Returns:
      a list of instances of Classification, or an empty list if there are
      no scan results.
    """
    classifications = {}

    for scan_result in scan_results:
      for scan_match in scan_result.scan_matches:
        logging.debug(
            u'scan match at offset: 0x{0:08x} specification: {1:s}'.format(
                scan_match.total_data_offset, scan_result.identifier))

      # Only the first scan result seen for an identifier is turned into
      # a classification; later results with the same identifier are logged
      # above but otherwise ignored.
      # NOTE(review): confirm that dropping their scan matches is intended.
      if scan_result.identifier not in classifications:
        classifications[scan_result.identifier] = Classification(
            scan_result.specification, scan_result.scan_matches)

    # dict.values() is a view object on Python 3; materialize it so callers
    # always get the documented list.
    return list(classifications.values())

  def ClassifyBuffer(self, data, data_size):
    """Classifies the data in a buffer, assumes all necessary data is available.

    Args:
      data: a buffer containing raw data.
      data_size: the size of the raw data in the buffer.

    Returns:
      a list of classifications or an empty list.
    """
    # The buffer is scanned in a single start/scan/stop sequence, after which
    # the results are read from the scan state.
    scan_state = self._scanner.StartScan()
    self._scanner.ScanBuffer(scan_state, data, data_size)
    self._scanner.StopScan(scan_state)

    return self._GetClassifications(scan_state.GetResults())

  def ClassifyFileObject(self, file_object):
    """Classifies the data in a file-like object.

    Args:
      file_object: a file-like object.

    Returns:
      a list of classifier classifications or an empty list.
    """
    scan_results = self._scanner.ScanFileObject(file_object)

    return self._GetClassifications(scan_results)

  def ClassifyFile(self, filename):
    """Classifies the data in a file.

    The file is opened in binary mode and closed again before returning.

    Args:
      filename: the name of the file.

    Returns:
      a list of classifier classifications or an empty list.
    """
    with open(filename, 'rb') as file_object:
      return self.ClassifyFileObject(file_object)