#!/usr/bin/python
# -*- coding: utf-8 -*-
#
# Copyright 2013 The Plaso Project Authors.
# Please see the AUTHORS file for details on individual authors.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#    http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""This file contains a parser for OXML files (i.e. MS Office 2007+)."""

import logging
import re
import struct
import zipfile

from xml.etree import ElementTree

from plaso.events import time_events
from plaso.lib import errors
from plaso.lib import eventdata
from plaso.lib import timelib
from plaso.parsers import interface
from plaso.parsers import manager


__author__ = 'David Nides (david.nides@gmail.com)'


class OpenXMLParserEvent(time_events.TimestampEvent):
  """Process timestamps from MS Office XML Events."""

  DATA_TYPE = 'metadata:openxml'

  def __init__(self, timestamp_string, usage, metadata):
    """Initializes the event object.

    Every key/value pair in metadata is copied onto the event object as an
    attribute of the same name.

    Args:
      timestamp_string: An ISO 8601 representation of a timestamp.
      usage: The description of the usage of the time value.
      metadata: A dict object containing extracted metadata.
    """
    timestamp = timelib.Timestamp.FromTimeString(timestamp_string)
    super(OpenXMLParserEvent, self).__init__(timestamp, usage, self.DATA_TYPE)
    for key, value in metadata.iteritems():
      setattr(self, key, value)


class OpenXMLParser(interface.BaseParser):
  """Parse metadata from OXML files."""

  NAME = 'openxml'
  DESCRIPTION = u'Parser for OpenXML (OXML) files.'

  # Maps a property tag name (namespace stripped) onto the attribute name used
  # on the event object. Tags not listed here are converted by _FixString().
  # NOTE(review): the lookup is done with the raw tag name, so the keys that
  # contain underscores ('Total_Time', 'Characters_with_spaces', 'Shared_Doc')
  # only match if a tag is literally spelled that way. Presumably the real
  # tags are CamelCase (e.g. 'TotalTime') and these entries never apply -
  # confirm against the event formatter before changing, since it would
  # rename attributes on the produced events.
  _METAKEY_TRANSLATE = {
      'creator': 'author',
      'lastModifiedBy': 'last_saved_by',
      'Total_Time': 'total_edit_time',
      'Pages': 'num_pages',
      'Characters_with_spaces': 'num_chars_w_spaces',
      'Paragraphs': 'num_paragraphs',
      'Characters': 'num_chars',
      'Lines': 'num_lines',
      'revision': 'revision_num',
      'Words': 'num_words',
      'Application': 'creating_app',
      'Shared_Doc': 'shared',
  }

  # Archive members that must all be present for the file to be treated as
  # OXML.
  _FILES_REQUIRED = frozenset([
      '[Content_Types].xml', '_rels/.rels', 'docProps/core.xml'])

  # Property tags whose text is a timestamp rather than plain metadata.
  _TIMESTAMP_TAGS = frozenset(['created', 'modified', 'lastPrinted'])

  # Patterns used by _FixString(), compiled once instead of on every call.
  _CAMEL_WORD_RE = re.compile('(.)([A-Z][a-z]+)')
  _CAMEL_BOUNDARY_RE = re.compile('([a-z0-9])([A-Z])')

  def _FixString(self, key):
    """Convert CamelCase to lower_with_underscore.

    Args:
      key: The CamelCase string to convert.

    Returns:
      The converted string, e.g. 'AppVersion' becomes 'app_version'.
    """
    # TODO: Add unicode support.
    fix_key = self._CAMEL_WORD_RE.sub(r'\1_\2', key)
    return self._CAMEL_BOUNDARY_RE.sub(r'\1_\2', fix_key).lower()

  def _ReadRelationships(self, zip_container, file_entry):
    """Reads and parses the package relationships part (_rels/.rels).

    Args:
      zip_container: The opened zip archive (instance of zipfile.ZipFile).
      file_entry: A file entry object, only used to name the file in the
                  error message.

    Returns:
      The root XML element of the relationships part or None if the part
      could not be read or is not well-formed XML.
    """
    try:
      rels_xml = zip_container.read('_rels/.rels')
      return ElementTree.fromstring(rels_xml)
    # ParseError derives from SyntaxError, so it has to be named explicitly;
    # a malformed relationships part is treated like an unreadable one.
    except (zipfile.BadZipfile, ElementTree.ParseError) as exception:
      logging.error(
          u'Unable to parse file {0:s} with error: {1:s}'.format(
              file_entry.name, exception))
      return None

  def _ExtractProperties(self, zip_container, rels_root):
    """Collects metadata and timestamps from the referenced property parts.

    Every relationship whose Type contains 'properties' is followed to its
    Target part; the text of each element in that part is stored either as a
    timestamp or as a metadata value.

    Args:
      zip_container: The opened zip archive (instance of zipfile.ZipFile).
      rels_root: The root XML element of the relationships part.

    Returns:
      A tuple of two dicts: metadata (attribute name to text) and timestamps
      (tag name to timestamp string).
    """
    metadata = {}
    timestamps = {}

    for relationship in rels_root.iter():
      relationship_type = relationship.get('Type')
      if not relationship_type or 'properties' not in relationship_type:
        continue

      target = relationship.get('Target')
      if target:
        # A target may be written as an absolute part name ('/docProps/...')
        # while zip member names carry no leading slash.
        target = target.lstrip('/')

      try:
        xml = zip_container.read(target)
        root = ElementTree.fromstring(xml)
      except (
          OverflowError, IndexError, KeyError, ValueError,
          zipfile.BadZipfile, ElementTree.ParseError) as exception:
        # One broken property part should not discard the others.
        logging.warning(
            u'[{0:s}] unable to read property with error: {1:s}.'.format(
                self.NAME, exception))
        continue

      for element in root.iter():
        if not element.text:
          continue

        # Strip the '{namespace}' prefix; rpartition keeps the full tag when
        # the element has no namespace.
        tag = element.tag.rpartition('}')[2]

        # Not including the 'lpstr' attribute because it is
        # very verbose.
        if tag == 'lpstr':
          continue

        if tag in self._TIMESTAMP_TAGS:
          timestamps[tag] = element.text
          continue

        tag_name = self._METAKEY_TRANSLATE.get(tag)
        if tag_name is None:
          tag_name = self._FixString(tag)
        metadata[tag_name] = element.text

    return metadata, timestamps

  def _ParseZipContainer(
      self, parser_context, file_entry, zip_container, parser_chain):
    """Validates the archive layout and produces the events.

    Args:
      parser_context: A parser context object (instance of ParserContext).
      file_entry: A file entry object (instance of dfvfs.FileEntry).
      zip_container: The opened zip archive (instance of zipfile.ZipFile).
      parser_chain: Optional string containing the parsing chain up to this
                    point.

    Raises:
      UnableToParseFile: when a required OXML archive member is missing.
    """
    zip_name_list = set(zip_container.namelist())

    if not self._FILES_REQUIRED.issubset(zip_name_list):
      raise errors.UnableToParseFile(
          u'[{0:s}] unable to parse file: {1:s} with error: {2:s}'.format(
              self.NAME, file_entry.name, 'OXML element(s) missing.'))

    # Add ourselves to the parser chain, which will be used in all subsequent
    # event creation in this parser.
    parser_chain = self._BuildParserChain(parser_chain)

    rels_root = self._ReadRelationships(zip_container, file_entry)
    # Compare against None: an element without children is falsy.
    if rels_root is None:
      return

    metadata, timestamps = self._ExtractProperties(zip_container, rels_root)

    # One event per timestamp that is present; all of them share the same
    # metadata dict.
    for tag, usage in (
        ('created', eventdata.EventTimestamp.CREATION_TIME),
        ('modified', eventdata.EventTimestamp.MODIFICATION_TIME),
        ('lastPrinted', eventdata.EventTimestamp.LAST_PRINTED)):
      timestamp_string = timestamps.get(tag)
      if not timestamp_string:
        continue

      event_object = OpenXMLParserEvent(timestamp_string, usage, metadata)
      parser_context.ProduceEvent(
          event_object, parser_chain=parser_chain, file_entry=file_entry)

  def Parse(self, parser_context, file_entry, parser_chain=None):
    """Extract data from an OXML file.

    Args:
      parser_context: A parser context object (instance of ParserContext).
      file_entry: A file entry object (instance of dfvfs.FileEntry).
      parser_chain: Optional string containing the parsing chain up to this
                    point. The default is None.

    Raises:
      UnableToParseFile: when the file is not a zip file, cannot be opened as
                         one or lacks a required OXML archive member.
    """
    # NOTE(review): file_object is never closed here; confirm whether the
    # caller or the file entry owns its lifetime.
    file_object = file_entry.GetFileObject()

    if not zipfile.is_zipfile(file_object):
      raise errors.UnableToParseFile(
          u'[{0:s}] unable to parse file: {1:s} with error: {2:s}'.format(
              self.NAME, file_entry.name, 'Not a Zip file.'))

    try:
      zip_container = zipfile.ZipFile(file_object, 'r')
    except (zipfile.BadZipfile, struct.error, zipfile.LargeZipFile):
      raise errors.UnableToParseFile(
          u'[{0:s}] unable to parse file: {1:s} with error: {2:s}'.format(
              self.NAME, file_entry.name, 'Bad Zip file.'))

    try:
      self._ParseZipContainer(
          parser_context, file_entry, zip_container, parser_chain)
    finally:
      # Close the archive on every exit path, including the raise above.
      zip_container.close()


manager.ParsersManager.RegisterParser(OpenXMLParser)