#!/usr/bin/python # -*- coding: utf-8 -*- # # Copyright 2013 The Plaso Project Authors. # Please see the AUTHORS file for details on individual authors. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. """This file contains the format classifier classes. Plaso is a tool that extracts events from files on a file system. For this it either reads files from a mounted file system or from an image. It uses an exhaustive approach to determine parse events from a file, meaning that it passes the file first to parser A and if that fails it continues with parser B. The classifier is designed to be able to more quickly determine the format of a file and limit the number of parsers part of the exhaustive approach. The current version of the classifier uses signatures to identify file formats. Some signatures must always be defined at a specific offset, this is referred to as an offset-bound signature or bound for short. Other signatures are commonly found at a specific offset but not necessarily. The last form of signatures is unbound, meaning that they don't have a fixed or common location where they can be found. A specification is a collection of signatures with additional metadata that defines a specific file format. These specifications are grouped into a store for ease of use, e.g. so that they can be read from a configuration file all at once. The classifier requires a scanner to analyze the data in a file. The scanner uses the specifications in a store to scan for the signatures or a certain format. The classifier allows for multiple methods of scanning a file: * full: the entire file is scanned. This is the default scanning method. * head-tail: only the beginning (head) and the end (tail) of the file is scanned. This approach is more efficient for larger files. The buffer size is used as the size of the data that is scanned. Smaller files are scanned entirely. The classifier returns zero or more classifications which point to a format specification and the scan results for the signatures defined by the specification. """ import logging class Classification(object): """This class represents a format classification. The format classification consists of a format specification and scan results. """ def __init__(self, specification, scan_matches): """Initializes the classification. Args: specification: the format specification (instance of Specification). scan_matches: the list of scan matches (instances of _ScanMatch). Raises: TypeError: if the specification is not of type Specification. """ self._specification = specification self.scan_matches = scan_matches @property def identifier(self): """The classification type.""" return self._specification.identifier @property def magic_types(self): """The magic types or an empty list if none.""" return self._specification.magic_types @property def mime_types(self): """The mime type or an empty list if none.""" return self._specification.mime_types class Classifier(object): """Class for classifying formats in raw data. The classifier is initialized with one or more specifications. After which it can be used to classify data in files or file-like objects. The actual scanning of the data is done by the scanner, these are separate to allow for the scanner to easily be replaced for a more efficient alternative if necessary. For an example of how the classifier is to be used see: classify.py. """ BUFFER_SIZE = 16 * 1024 * 1024 def __init__(self, scanner): """Initializes the classifier and sets up the scanning related structures. Args: scanner: an instance of the signature scanner. """ self._scanner = scanner def _GetClassifications(self, scan_results): """Retrieves the classifications based on the scan results. Multiple scan results are combined into a single classification. Args: scan_results: a list containing instances of _ScanResult. Returns: a list of instances of Classification. """ classifications = {} for scan_result in scan_results: for scan_match in scan_result.scan_matches: logging.debug( u'scan match at offset: 0x{0:08x} specification: {1:s}'.format( scan_match.total_data_offset, scan_result.identifier)) if scan_result.identifier not in classifications: classifications[scan_result.identifier] = Classification( scan_result.specification, scan_result.scan_matches) return classifications.values() def ClassifyBuffer(self, data, data_size): """Classifies the data in a buffer, assumes all necessary data is available. Args: data: a buffer containing raw data. data_size: the size of the raw data in the buffer. Returns: a list of classifications or an empty list. """ scan_state = self._scanner.StartScan() self._scanner.ScanBuffer(scan_state, data, data_size) self._scanner.StopScan(scan_state) return self._GetClassifications(scan_state.GetResults()) def ClassifyFileObject(self, file_object): """Classifies the data in a file-like object. Args: file_object: a file-like object. Returns: a list of classifier classifications or an empty list. """ scan_results = self._scanner.ScanFileObject(file_object) return self._GetClassifications(scan_results) def ClassifyFile(self, filename): """Classifies the data in a file. Args: filename: the name of the file. Returns: a list of classifier classifications or an empty list. """ classifications = [] with open(filename, 'rb') as file_object: classifications = self.ClassifyFileObject(file_object) return classifications